Diffstat (limited to 'mm')
47 files changed, 2907 insertions, 2239 deletions
diff --git a/mm/Kconfig b/mm/Kconfig index e338407f1225..82fed4eb2b6f 100644 --- a/mm/Kconfig +++ b/mm/Kconfig | |||
@@ -198,7 +198,7 @@ config COMPACTION | |||
198 | config MIGRATION | 198 | config MIGRATION |
199 | bool "Page migration" | 199 | bool "Page migration" |
200 | def_bool y | 200 | def_bool y |
201 | depends on NUMA || ARCH_ENABLE_MEMORY_HOTREMOVE || COMPACTION | 201 | depends on NUMA || ARCH_ENABLE_MEMORY_HOTREMOVE || COMPACTION || CMA |
202 | help | 202 | help |
203 | Allows the migration of the physical location of pages of processes | 203 | Allows the migration of the physical location of pages of processes |
204 | while the virtual addresses are not changed. This is useful in | 204 | while the virtual addresses are not changed. This is useful in |
@@ -349,6 +349,16 @@ choice | |||
349 | benefit. | 349 | benefit. |
350 | endchoice | 350 | endchoice |
351 | 351 | ||
352 | config CROSS_MEMORY_ATTACH | ||
353 | bool "Cross Memory Support" | ||
354 | depends on MMU | ||
355 | default y | ||
356 | help | ||
357 | Enabling this option adds the system calls process_vm_readv and | ||
358 | process_vm_writev which allow a process with the correct privileges | ||
359 | to directly read from or write to another process's address space. ||
360 | See the man page for more details. | ||
361 | |||
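For reference, a minimal userspace sketch of the two syscalls this option adds; the target pid and remote address below are placeholders, not values taken from this patch (see the man page referenced above for the full semantics):

#define _GNU_SOURCE
#include <stdio.h>
#include <sys/types.h>
#include <sys/uio.h>

int main(void)
{
	char buf[64];
	struct iovec local  = { .iov_base = buf, .iov_len = sizeof(buf) };
	struct iovec remote = { .iov_base = (void *)0x400000,	/* placeholder remote address */
				.iov_len  = sizeof(buf) };
	pid_t pid = 1234;					/* placeholder target pid */
	ssize_t n;

	/* copy sizeof(buf) bytes from the target's address space into buf */
	n = process_vm_readv(pid, &local, 1, &remote, 1, 0);
	if (n < 0)
		perror("process_vm_readv");
	else
		printf("copied %zd bytes from pid %d\n", n, (int)pid);
	return 0;
}

process_vm_writev takes the same arguments with the copy direction reversed; in both cases the caller needs the same permission it would need to ptrace-attach to the target.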
352 | # | 362 | # |
353 | # UP and nommu archs use km based percpu allocator | 363 | # UP and nommu archs use km based percpu allocator |
354 | # | 364 | # |
@@ -379,3 +389,20 @@ config CLEANCACHE | |||
379 | in a negligible performance hit. | 389 | in a negligible performance hit. |
380 | 390 | ||
381 | If unsure, say Y to enable cleancache | 391 | If unsure, say Y to enable cleancache |
392 | |||
393 | config FRONTSWAP | ||
394 | bool "Enable frontswap to cache swap pages if tmem is present" | ||
395 | depends on SWAP | ||
396 | default n | ||
397 | help | ||
398 | Frontswap is so named because it can be thought of as the opposite | ||
399 | of a "backing" store for a swap device. The data is stored into | ||
400 | "transcendent memory", memory that is not directly accessible or | ||
401 | addressable by the kernel and is of unknown and possibly | ||
402 | time-varying size. When space in transcendent memory is available, | ||
403 | a significant swap I/O reduction may be achieved. When none is | ||
404 | available, all frontswap calls are reduced to a single pointer- | ||
405 | compare-against-NULL resulting in a negligible performance hit | ||
406 | and swap data is stored as normal on the matching swap device. | ||
407 | |||
408 | If unsure, say Y to enable frontswap. | ||
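How the swap path consults frontswap, per the help text above; this is a conceptual sketch only, not the actual swap writeout code, and swap_out_one_page()/write_to_swap_device() are made-up names:

/* conceptual sketch, not code from this patch */
int swap_out_one_page(struct page *page)
{
	/* offer the page to the frontswap backend first */
	if (__frontswap_store(page) == 0)
		return 0;	/* stored in transcendent memory, no disk I/O */

	/*
	 * The backend refused the page (or writethrough mode forced a
	 * failure return), so fall back to the ordinary swap device write.
	 */
	return write_to_swap_device(page);	/* made-up helper */
}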
diff --git a/mm/Makefile b/mm/Makefile index 50ec00ef2a0e..2e2fbbefb99f 100644 --- a/mm/Makefile +++ b/mm/Makefile | |||
@@ -5,15 +5,18 @@ | |||
5 | mmu-y := nommu.o | 5 | mmu-y := nommu.o |
6 | mmu-$(CONFIG_MMU) := fremap.o highmem.o madvise.o memory.o mincore.o \ | 6 | mmu-$(CONFIG_MMU) := fremap.o highmem.o madvise.o memory.o mincore.o \ |
7 | mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \ | 7 | mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \ |
8 | vmalloc.o pagewalk.o pgtable-generic.o \ | 8 | vmalloc.o pagewalk.o pgtable-generic.o |
9 | process_vm_access.o | 9 | |
10 | ifdef CONFIG_CROSS_MEMORY_ATTACH | ||
11 | mmu-$(CONFIG_MMU) += process_vm_access.o | ||
12 | endif | ||
10 | 13 | ||
11 | obj-y := filemap.o mempool.o oom_kill.o fadvise.o \ | 14 | obj-y := filemap.o mempool.o oom_kill.o fadvise.o \ |
12 | maccess.o page_alloc.o page-writeback.o \ | 15 | maccess.o page_alloc.o page-writeback.o \ |
13 | readahead.o swap.o truncate.o vmscan.o shmem.o \ | 16 | readahead.o swap.o truncate.o vmscan.o shmem.o \ |
14 | prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \ | 17 | prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \ |
15 | page_isolation.o mm_init.o mmu_context.o percpu.o \ | 18 | page_isolation.o mm_init.o mmu_context.o percpu.o \ |
16 | $(mmu-y) | 19 | compaction.o $(mmu-y) |
17 | obj-y += init-mm.o | 20 | obj-y += init-mm.o |
18 | 21 | ||
19 | ifdef CONFIG_NO_BOOTMEM | 22 | ifdef CONFIG_NO_BOOTMEM |
@@ -25,14 +28,14 @@ endif | |||
25 | obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o | 28 | obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o |
26 | 29 | ||
27 | obj-$(CONFIG_BOUNCE) += bounce.o | 30 | obj-$(CONFIG_BOUNCE) += bounce.o |
28 | obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o thrash.o | 31 | obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o |
32 | obj-$(CONFIG_FRONTSWAP) += frontswap.o | ||
29 | obj-$(CONFIG_HAS_DMA) += dmapool.o | 33 | obj-$(CONFIG_HAS_DMA) += dmapool.o |
30 | obj-$(CONFIG_HUGETLBFS) += hugetlb.o | 34 | obj-$(CONFIG_HUGETLBFS) += hugetlb.o |
31 | obj-$(CONFIG_NUMA) += mempolicy.o | 35 | obj-$(CONFIG_NUMA) += mempolicy.o |
32 | obj-$(CONFIG_SPARSEMEM) += sparse.o | 36 | obj-$(CONFIG_SPARSEMEM) += sparse.o |
33 | obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o | 37 | obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o |
34 | obj-$(CONFIG_SLOB) += slob.o | 38 | obj-$(CONFIG_SLOB) += slob.o |
35 | obj-$(CONFIG_COMPACTION) += compaction.o | ||
36 | obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o | 39 | obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o |
37 | obj-$(CONFIG_KSM) += ksm.o | 40 | obj-$(CONFIG_KSM) += ksm.o |
38 | obj-$(CONFIG_PAGE_POISONING) += debug-pagealloc.o | 41 | obj-$(CONFIG_PAGE_POISONING) += debug-pagealloc.o |
diff --git a/mm/bootmem.c b/mm/bootmem.c index 0131170c9d54..bcb63ac48cc5 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c | |||
@@ -77,16 +77,16 @@ unsigned long __init bootmem_bootmap_pages(unsigned long pages) | |||
77 | */ | 77 | */ |
78 | static void __init link_bootmem(bootmem_data_t *bdata) | 78 | static void __init link_bootmem(bootmem_data_t *bdata) |
79 | { | 79 | { |
80 | struct list_head *iter; | 80 | bootmem_data_t *ent; |
81 | 81 | ||
82 | list_for_each(iter, &bdata_list) { | 82 | list_for_each_entry(ent, &bdata_list, list) { |
83 | bootmem_data_t *ent; | 83 | if (bdata->node_min_pfn < ent->node_min_pfn) { |
84 | 84 | list_add_tail(&bdata->list, &ent->list); | |
85 | ent = list_entry(iter, bootmem_data_t, list); | 85 | return; |
86 | if (bdata->node_min_pfn < ent->node_min_pfn) | 86 | } |
87 | break; | ||
88 | } | 87 | } |
89 | list_add_tail(&bdata->list, iter); | 88 | |
89 | list_add_tail(&bdata->list, &bdata_list); | ||
90 | } | 90 | } |
91 | 91 | ||
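The rewritten link_bootmem() above is the usual sorted-insert idiom with list_for_each_entry(); a standalone sketch of the same pattern, assuming <linux/list.h> and a made-up struct item:

#include <linux/list.h>

struct item {
	unsigned long key;
	struct list_head list;
};

static LIST_HEAD(item_list);

/* keep item_list ordered by ascending key */
static void insert_sorted(struct item *new)
{
	struct item *ent;

	list_for_each_entry(ent, &item_list, list) {
		if (new->key < ent->key) {
			/* list_add_tail() before "ent" places "new" in front of it */
			list_add_tail(&new->list, &ent->list);
			return;
		}
	}
	/* no larger element found: append at the end */
	list_add_tail(&new->list, &item_list);
}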
92 | /* | 92 | /* |
@@ -203,7 +203,8 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) | |||
203 | } else { | 203 | } else { |
204 | unsigned long off = 0; | 204 | unsigned long off = 0; |
205 | 205 | ||
206 | while (vec && off < BITS_PER_LONG) { | 206 | vec >>= start & (BITS_PER_LONG - 1); |
207 | while (vec) { | ||
207 | if (vec & 1) { | 208 | if (vec & 1) { |
208 | page = pfn_to_page(start + off); | 209 | page = pfn_to_page(start + off); |
209 | __free_pages_bootmem(page, 0); | 210 | __free_pages_bootmem(page, 0); |
@@ -467,7 +468,7 @@ static unsigned long __init align_off(struct bootmem_data *bdata, | |||
467 | return ALIGN(base + off, align) - base; | 468 | return ALIGN(base + off, align) - base; |
468 | } | 469 | } |
469 | 470 | ||
470 | static void * __init alloc_bootmem_core(struct bootmem_data *bdata, | 471 | static void * __init alloc_bootmem_bdata(struct bootmem_data *bdata, |
471 | unsigned long size, unsigned long align, | 472 | unsigned long size, unsigned long align, |
472 | unsigned long goal, unsigned long limit) | 473 | unsigned long goal, unsigned long limit) |
473 | { | 474 | { |
@@ -588,14 +589,14 @@ static void * __init alloc_arch_preferred_bootmem(bootmem_data_t *bdata, | |||
588 | p_bdata = bootmem_arch_preferred_node(bdata, size, align, | 589 | p_bdata = bootmem_arch_preferred_node(bdata, size, align, |
589 | goal, limit); | 590 | goal, limit); |
590 | if (p_bdata) | 591 | if (p_bdata) |
591 | return alloc_bootmem_core(p_bdata, size, align, | 592 | return alloc_bootmem_bdata(p_bdata, size, align, |
592 | goal, limit); | 593 | goal, limit); |
593 | } | 594 | } |
594 | #endif | 595 | #endif |
595 | return NULL; | 596 | return NULL; |
596 | } | 597 | } |
597 | 598 | ||
598 | static void * __init ___alloc_bootmem_nopanic(unsigned long size, | 599 | static void * __init alloc_bootmem_core(unsigned long size, |
599 | unsigned long align, | 600 | unsigned long align, |
600 | unsigned long goal, | 601 | unsigned long goal, |
601 | unsigned long limit) | 602 | unsigned long limit) |
@@ -603,7 +604,6 @@ static void * __init ___alloc_bootmem_nopanic(unsigned long size, | |||
603 | bootmem_data_t *bdata; | 604 | bootmem_data_t *bdata; |
604 | void *region; | 605 | void *region; |
605 | 606 | ||
606 | restart: | ||
607 | region = alloc_arch_preferred_bootmem(NULL, size, align, goal, limit); | 607 | region = alloc_arch_preferred_bootmem(NULL, size, align, goal, limit); |
608 | if (region) | 608 | if (region) |
609 | return region; | 609 | return region; |
@@ -614,11 +614,25 @@ restart: | |||
614 | if (limit && bdata->node_min_pfn >= PFN_DOWN(limit)) | 614 | if (limit && bdata->node_min_pfn >= PFN_DOWN(limit)) |
615 | break; | 615 | break; |
616 | 616 | ||
617 | region = alloc_bootmem_core(bdata, size, align, goal, limit); | 617 | region = alloc_bootmem_bdata(bdata, size, align, goal, limit); |
618 | if (region) | 618 | if (region) |
619 | return region; | 619 | return region; |
620 | } | 620 | } |
621 | 621 | ||
622 | return NULL; | ||
623 | } | ||
624 | |||
625 | static void * __init ___alloc_bootmem_nopanic(unsigned long size, | ||
626 | unsigned long align, | ||
627 | unsigned long goal, | ||
628 | unsigned long limit) | ||
629 | { | ||
630 | void *ptr; | ||
631 | |||
632 | restart: | ||
633 | ptr = alloc_bootmem_core(size, align, goal, limit); | ||
634 | if (ptr) | ||
635 | return ptr; | ||
622 | if (goal) { | 636 | if (goal) { |
623 | goal = 0; | 637 | goal = 0; |
624 | goto restart; | 638 | goto restart; |
@@ -684,21 +698,60 @@ void * __init __alloc_bootmem(unsigned long size, unsigned long align, | |||
684 | return ___alloc_bootmem(size, align, goal, limit); | 698 | return ___alloc_bootmem(size, align, goal, limit); |
685 | } | 699 | } |
686 | 700 | ||
687 | static void * __init ___alloc_bootmem_node(bootmem_data_t *bdata, | 701 | void * __init ___alloc_bootmem_node_nopanic(pg_data_t *pgdat, |
688 | unsigned long size, unsigned long align, | 702 | unsigned long size, unsigned long align, |
689 | unsigned long goal, unsigned long limit) | 703 | unsigned long goal, unsigned long limit) |
690 | { | 704 | { |
691 | void *ptr; | 705 | void *ptr; |
692 | 706 | ||
693 | ptr = alloc_arch_preferred_bootmem(bdata, size, align, goal, limit); | 707 | again: |
708 | ptr = alloc_arch_preferred_bootmem(pgdat->bdata, size, | ||
709 | align, goal, limit); | ||
694 | if (ptr) | 710 | if (ptr) |
695 | return ptr; | 711 | return ptr; |
696 | 712 | ||
697 | ptr = alloc_bootmem_core(bdata, size, align, goal, limit); | 713 | /* do not panic in alloc_bootmem_bdata() */ |
714 | if (limit && goal + size > limit) | ||
715 | limit = 0; | ||
716 | |||
717 | ptr = alloc_bootmem_bdata(pgdat->bdata, size, align, goal, limit); | ||
698 | if (ptr) | 718 | if (ptr) |
699 | return ptr; | 719 | return ptr; |
700 | 720 | ||
701 | return ___alloc_bootmem(size, align, goal, limit); | 721 | ptr = alloc_bootmem_core(size, align, goal, limit); |
722 | if (ptr) | ||
723 | return ptr; | ||
724 | |||
725 | if (goal) { | ||
726 | goal = 0; | ||
727 | goto again; | ||
728 | } | ||
729 | |||
730 | return NULL; | ||
731 | } | ||
732 | |||
733 | void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size, | ||
734 | unsigned long align, unsigned long goal) | ||
735 | { | ||
736 | if (WARN_ON_ONCE(slab_is_available())) | ||
737 | return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); | ||
738 | |||
739 | return ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0); | ||
740 | } | ||
741 | |||
742 | void * __init ___alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, | ||
743 | unsigned long align, unsigned long goal, | ||
744 | unsigned long limit) | ||
745 | { | ||
746 | void *ptr; | ||
747 | |||
748 | ptr = ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0); | ||
749 | if (ptr) | ||
750 | return ptr; | ||
751 | |||
752 | printk(KERN_ALERT "bootmem alloc of %lu bytes failed!\n", size); | ||
753 | panic("Out of memory"); | ||
754 | return NULL; | ||
702 | } | 755 | } |
703 | 756 | ||
704 | /** | 757 | /** |
@@ -722,7 +775,7 @@ void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, | |||
722 | if (WARN_ON_ONCE(slab_is_available())) | 775 | if (WARN_ON_ONCE(slab_is_available())) |
723 | return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); | 776 | return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); |
724 | 777 | ||
725 | return ___alloc_bootmem_node(pgdat->bdata, size, align, goal, 0); | 778 | return ___alloc_bootmem_node(pgdat, size, align, goal, 0); |
726 | } | 779 | } |
727 | 780 | ||
728 | void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size, | 781 | void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size, |
@@ -743,7 +796,7 @@ void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size, | |||
743 | unsigned long new_goal; | 796 | unsigned long new_goal; |
744 | 797 | ||
745 | new_goal = MAX_DMA32_PFN << PAGE_SHIFT; | 798 | new_goal = MAX_DMA32_PFN << PAGE_SHIFT; |
746 | ptr = alloc_bootmem_core(pgdat->bdata, size, align, | 799 | ptr = alloc_bootmem_bdata(pgdat->bdata, size, align, |
747 | new_goal, 0); | 800 | new_goal, 0); |
748 | if (ptr) | 801 | if (ptr) |
749 | return ptr; | 802 | return ptr; |
@@ -754,47 +807,6 @@ void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size, | |||
754 | 807 | ||
755 | } | 808 | } |
756 | 809 | ||
757 | #ifdef CONFIG_SPARSEMEM | ||
758 | /** | ||
759 | * alloc_bootmem_section - allocate boot memory from a specific section | ||
760 | * @size: size of the request in bytes | ||
761 | * @section_nr: sparse map section to allocate from | ||
762 | * | ||
763 | * Return NULL on failure. | ||
764 | */ | ||
765 | void * __init alloc_bootmem_section(unsigned long size, | ||
766 | unsigned long section_nr) | ||
767 | { | ||
768 | bootmem_data_t *bdata; | ||
769 | unsigned long pfn, goal; | ||
770 | |||
771 | pfn = section_nr_to_pfn(section_nr); | ||
772 | goal = pfn << PAGE_SHIFT; | ||
773 | bdata = &bootmem_node_data[early_pfn_to_nid(pfn)]; | ||
774 | |||
775 | return alloc_bootmem_core(bdata, size, SMP_CACHE_BYTES, goal, 0); | ||
776 | } | ||
777 | #endif | ||
778 | |||
779 | void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size, | ||
780 | unsigned long align, unsigned long goal) | ||
781 | { | ||
782 | void *ptr; | ||
783 | |||
784 | if (WARN_ON_ONCE(slab_is_available())) | ||
785 | return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); | ||
786 | |||
787 | ptr = alloc_arch_preferred_bootmem(pgdat->bdata, size, align, goal, 0); | ||
788 | if (ptr) | ||
789 | return ptr; | ||
790 | |||
791 | ptr = alloc_bootmem_core(pgdat->bdata, size, align, goal, 0); | ||
792 | if (ptr) | ||
793 | return ptr; | ||
794 | |||
795 | return __alloc_bootmem_nopanic(size, align, goal); | ||
796 | } | ||
797 | |||
798 | #ifndef ARCH_LOW_ADDRESS_LIMIT | 810 | #ifndef ARCH_LOW_ADDRESS_LIMIT |
799 | #define ARCH_LOW_ADDRESS_LIMIT 0xffffffffUL | 811 | #define ARCH_LOW_ADDRESS_LIMIT 0xffffffffUL |
800 | #endif | 812 | #endif |
@@ -839,6 +851,6 @@ void * __init __alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size, | |||
839 | if (WARN_ON_ONCE(slab_is_available())) | 851 | if (WARN_ON_ONCE(slab_is_available())) |
840 | return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); | 852 | return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); |
841 | 853 | ||
842 | return ___alloc_bootmem_node(pgdat->bdata, size, align, | 854 | return ___alloc_bootmem_node(pgdat, size, align, |
843 | goal, ARCH_LOW_ADDRESS_LIMIT); | 855 | goal, ARCH_LOW_ADDRESS_LIMIT); |
844 | } | 856 | } |
diff --git a/mm/cleancache.c b/mm/cleancache.c index 5646c740f613..32e6f4136fa2 100644 --- a/mm/cleancache.c +++ b/mm/cleancache.c | |||
@@ -80,7 +80,7 @@ EXPORT_SYMBOL(__cleancache_init_shared_fs); | |||
80 | static int cleancache_get_key(struct inode *inode, | 80 | static int cleancache_get_key(struct inode *inode, |
81 | struct cleancache_filekey *key) | 81 | struct cleancache_filekey *key) |
82 | { | 82 | { |
83 | int (*fhfn)(struct dentry *, __u32 *fh, int *, int); | 83 | int (*fhfn)(struct inode *, __u32 *fh, int *, struct inode *); |
84 | int len = 0, maxlen = CLEANCACHE_KEY_MAX; | 84 | int len = 0, maxlen = CLEANCACHE_KEY_MAX; |
85 | struct super_block *sb = inode->i_sb; | 85 | struct super_block *sb = inode->i_sb; |
86 | 86 | ||
@@ -88,9 +88,7 @@ static int cleancache_get_key(struct inode *inode, | |||
88 | if (sb->s_export_op != NULL) { | 88 | if (sb->s_export_op != NULL) { |
89 | fhfn = sb->s_export_op->encode_fh; | 89 | fhfn = sb->s_export_op->encode_fh; |
90 | if (fhfn) { | 90 | if (fhfn) { |
91 | struct dentry d; | 91 | len = (*fhfn)(inode, &key->u.fh[0], &maxlen, NULL); |
92 | d.d_inode = inode; | ||
93 | len = (*fhfn)(&d, &key->u.fh[0], &maxlen, 0); | ||
94 | if (len <= 0 || len == 255) | 92 | if (len <= 0 || len == 255) |
95 | return -1; | 93 | return -1; |
96 | if (maxlen > CLEANCACHE_KEY_MAX) | 94 | if (maxlen > CLEANCACHE_KEY_MAX) |
diff --git a/mm/compaction.c b/mm/compaction.c index 74a8c825ff28..2f42d9528539 100644 --- a/mm/compaction.c +++ b/mm/compaction.c | |||
@@ -16,30 +16,11 @@ | |||
16 | #include <linux/sysfs.h> | 16 | #include <linux/sysfs.h> |
17 | #include "internal.h" | 17 | #include "internal.h" |
18 | 18 | ||
19 | #if defined CONFIG_COMPACTION || defined CONFIG_CMA | ||
20 | |||
19 | #define CREATE_TRACE_POINTS | 21 | #define CREATE_TRACE_POINTS |
20 | #include <trace/events/compaction.h> | 22 | #include <trace/events/compaction.h> |
21 | 23 | ||
22 | /* | ||
23 | * compact_control is used to track pages being migrated and the free pages | ||
24 | * they are being migrated to during memory compaction. The free_pfn starts | ||
25 | * at the end of a zone and migrate_pfn begins at the start. Movable pages | ||
26 | * are moved to the end of a zone during a compaction run and the run | ||
27 | * completes when free_pfn <= migrate_pfn | ||
28 | */ | ||
29 | struct compact_control { | ||
30 | struct list_head freepages; /* List of free pages to migrate to */ | ||
31 | struct list_head migratepages; /* List of pages being migrated */ | ||
32 | unsigned long nr_freepages; /* Number of isolated free pages */ | ||
33 | unsigned long nr_migratepages; /* Number of pages to migrate */ | ||
34 | unsigned long free_pfn; /* isolate_freepages search base */ | ||
35 | unsigned long migrate_pfn; /* isolate_migratepages search base */ | ||
36 | bool sync; /* Synchronous migration */ | ||
37 | |||
38 | int order; /* order a direct compactor needs */ | ||
39 | int migratetype; /* MOVABLE, RECLAIMABLE etc */ | ||
40 | struct zone *zone; | ||
41 | }; | ||
42 | |||
43 | static unsigned long release_freepages(struct list_head *freelist) | 24 | static unsigned long release_freepages(struct list_head *freelist) |
44 | { | 25 | { |
45 | struct page *page, *next; | 26 | struct page *page, *next; |
@@ -54,24 +35,35 @@ static unsigned long release_freepages(struct list_head *freelist) | |||
54 | return count; | 35 | return count; |
55 | } | 36 | } |
56 | 37 | ||
57 | /* Isolate free pages onto a private freelist. Must hold zone->lock */ | 38 | static void map_pages(struct list_head *list) |
58 | static unsigned long isolate_freepages_block(struct zone *zone, | 39 | { |
59 | unsigned long blockpfn, | 40 | struct page *page; |
60 | struct list_head *freelist) | 41 | |
42 | list_for_each_entry(page, list, lru) { | ||
43 | arch_alloc_page(page, 0); | ||
44 | kernel_map_pages(page, 1, 1); | ||
45 | } | ||
46 | } | ||
47 | |||
48 | static inline bool migrate_async_suitable(int migratetype) | ||
49 | { | ||
50 | return is_migrate_cma(migratetype) || migratetype == MIGRATE_MOVABLE; | ||
51 | } | ||
52 | |||
53 | /* | ||
54 | * Isolate free pages onto a private freelist. Caller must hold zone->lock. | ||
55 | * If @strict is true, will abort returning 0 on any invalid PFNs or non-free | ||
56 | * pages inside of the pageblock (even though it may still end up isolating | ||
57 | * some pages). | ||
58 | */ | ||
59 | static unsigned long isolate_freepages_block(unsigned long blockpfn, | ||
60 | unsigned long end_pfn, | ||
61 | struct list_head *freelist, | ||
62 | bool strict) | ||
61 | { | 63 | { |
62 | unsigned long zone_end_pfn, end_pfn; | ||
63 | int nr_scanned = 0, total_isolated = 0; | 64 | int nr_scanned = 0, total_isolated = 0; |
64 | struct page *cursor; | 65 | struct page *cursor; |
65 | 66 | ||
66 | /* Get the last PFN we should scan for free pages at */ | ||
67 | zone_end_pfn = zone->zone_start_pfn + zone->spanned_pages; | ||
68 | end_pfn = min(blockpfn + pageblock_nr_pages, zone_end_pfn); | ||
69 | |||
70 | /* Find the first usable PFN in the block to initialse page cursor */ | ||
71 | for (; blockpfn < end_pfn; blockpfn++) { | ||
72 | if (pfn_valid_within(blockpfn)) | ||
73 | break; | ||
74 | } | ||
75 | cursor = pfn_to_page(blockpfn); | 67 | cursor = pfn_to_page(blockpfn); |
76 | 68 | ||
77 | /* Isolate free pages. This assumes the block is valid */ | 69 | /* Isolate free pages. This assumes the block is valid */ |
@@ -79,15 +71,23 @@ static unsigned long isolate_freepages_block(struct zone *zone, | |||
79 | int isolated, i; | 71 | int isolated, i; |
80 | struct page *page = cursor; | 72 | struct page *page = cursor; |
81 | 73 | ||
82 | if (!pfn_valid_within(blockpfn)) | 74 | if (!pfn_valid_within(blockpfn)) { |
75 | if (strict) | ||
76 | return 0; | ||
83 | continue; | 77 | continue; |
78 | } | ||
84 | nr_scanned++; | 79 | nr_scanned++; |
85 | 80 | ||
86 | if (!PageBuddy(page)) | 81 | if (!PageBuddy(page)) { |
82 | if (strict) | ||
83 | return 0; | ||
87 | continue; | 84 | continue; |
85 | } | ||
88 | 86 | ||
89 | /* Found a free page, break it into order-0 pages */ | 87 | /* Found a free page, break it into order-0 pages */ |
90 | isolated = split_free_page(page); | 88 | isolated = split_free_page(page); |
89 | if (!isolated && strict) | ||
90 | return 0; | ||
91 | total_isolated += isolated; | 91 | total_isolated += isolated; |
92 | for (i = 0; i < isolated; i++) { | 92 | for (i = 0; i < isolated; i++) { |
93 | list_add(&page->lru, freelist); | 93 | list_add(&page->lru, freelist); |
@@ -105,114 +105,71 @@ static unsigned long isolate_freepages_block(struct zone *zone, | |||
105 | return total_isolated; | 105 | return total_isolated; |
106 | } | 106 | } |
107 | 107 | ||
108 | /* Returns true if the page is within a block suitable for migration to */ | 108 | /** |
109 | static bool suitable_migration_target(struct page *page) | 109 | * isolate_freepages_range() - isolate free pages. |
110 | { | 110 | * @start_pfn: The first PFN to start isolating. |
111 | 111 | * @end_pfn: The one-past-last PFN. | |
112 | int migratetype = get_pageblock_migratetype(page); | 112 | * |
113 | 113 | * Non-free pages, invalid PFNs, or zone boundaries within the | |
114 | /* Don't interfere with memory hot-remove or the min_free_kbytes blocks */ | 114 | * [start_pfn, end_pfn) range are considered errors, and cause the function to |
115 | if (migratetype == MIGRATE_ISOLATE || migratetype == MIGRATE_RESERVE) | 115 | * undo its actions and return zero. |
116 | return false; | 116 | * |
117 | 117 | * Otherwise, function returns one-past-the-last PFN of isolated page | |
118 | /* If the page is a large free page, then allow migration */ | 118 | * (which may be greater than end_pfn if end fell in the middle of |
119 | if (PageBuddy(page) && page_order(page) >= pageblock_order) | 119 | * a free page). |
120 | return true; | ||
121 | |||
122 | /* If the block is MIGRATE_MOVABLE, allow migration */ | ||
123 | if (migratetype == MIGRATE_MOVABLE) | ||
124 | return true; | ||
125 | |||
126 | /* Otherwise skip the block */ | ||
127 | return false; | ||
128 | } | ||
129 | |||
130 | /* | ||
131 | * Based on information in the current compact_control, find blocks | ||
132 | * suitable for isolating free pages from and then isolate them. | ||
133 | */ | 120 | */ |
134 | static void isolate_freepages(struct zone *zone, | 121 | unsigned long |
135 | struct compact_control *cc) | 122 | isolate_freepages_range(unsigned long start_pfn, unsigned long end_pfn) |
136 | { | 123 | { |
137 | struct page *page; | 124 | unsigned long isolated, pfn, block_end_pfn, flags; |
138 | unsigned long high_pfn, low_pfn, pfn; | 125 | struct zone *zone = NULL; |
139 | unsigned long flags; | 126 | LIST_HEAD(freelist); |
140 | int nr_freepages = cc->nr_freepages; | ||
141 | struct list_head *freelist = &cc->freepages; | ||
142 | |||
143 | /* | ||
144 | * Initialise the free scanner. The starting point is where we last | ||
145 | * scanned from (or the end of the zone if starting). The low point | ||
146 | * is the end of the pageblock the migration scanner is using. | ||
147 | */ | ||
148 | pfn = cc->free_pfn; | ||
149 | low_pfn = cc->migrate_pfn + pageblock_nr_pages; | ||
150 | 127 | ||
151 | /* | 128 | if (pfn_valid(start_pfn)) |
152 | * Take care that if the migration scanner is at the end of the zone | 129 | zone = page_zone(pfn_to_page(start_pfn)); |
153 | * that the free scanner does not accidentally move to the next zone | ||
154 | * in the next isolation cycle. | ||
155 | */ | ||
156 | high_pfn = min(low_pfn, pfn); | ||
157 | 130 | ||
158 | /* | 131 | for (pfn = start_pfn; pfn < end_pfn; pfn += isolated) { |
159 | * Isolate free pages until enough are available to migrate the | 132 | if (!pfn_valid(pfn) || zone != page_zone(pfn_to_page(pfn))) |
160 | * pages on cc->migratepages. We stop searching if the migrate | 133 | break; |
161 | * and free page scanners meet or enough free pages are isolated. | ||
162 | */ | ||
163 | for (; pfn > low_pfn && cc->nr_migratepages > nr_freepages; | ||
164 | pfn -= pageblock_nr_pages) { | ||
165 | unsigned long isolated; | ||
166 | |||
167 | if (!pfn_valid(pfn)) | ||
168 | continue; | ||
169 | 134 | ||
170 | /* | 135 | /* |
171 | * Check for overlapping nodes/zones. It's possible on some | 136 | * On subsequent iterations ALIGN() is actually not needed, |
172 | * configurations to have a setup like | 137 | * but we keep it so as not to complicate the code. |
173 | * node0 node1 node0 | ||
174 | * i.e. it's possible that all pages within a zones range of | ||
175 | * pages do not belong to a single zone. | ||
176 | */ | 138 | */ |
177 | page = pfn_to_page(pfn); | 139 | block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages); |
178 | if (page_zone(page) != zone) | 140 | block_end_pfn = min(block_end_pfn, end_pfn); |
179 | continue; | ||
180 | 141 | ||
181 | /* Check the block is suitable for migration */ | 142 | spin_lock_irqsave(&zone->lock, flags); |
182 | if (!suitable_migration_target(page)) | 143 | isolated = isolate_freepages_block(pfn, block_end_pfn, |
183 | continue; | 144 | &freelist, true); |
145 | spin_unlock_irqrestore(&zone->lock, flags); | ||
184 | 146 | ||
185 | /* | 147 | /* |
186 | * Found a block suitable for isolating free pages from. Now | 148 | * In strict mode, isolate_freepages_block() returns 0 if |
187 | * we disabled interrupts, double check things are ok and | 149 | * there are any holes in the block (ie. invalid PFNs or |
188 | * isolate the pages. This is to minimise the time IRQs | 150 | * non-free pages). |
189 | * are disabled | ||
190 | */ | 151 | */ |
191 | isolated = 0; | 152 | if (!isolated) |
192 | spin_lock_irqsave(&zone->lock, flags); | 153 | break; |
193 | if (suitable_migration_target(page)) { | ||
194 | isolated = isolate_freepages_block(zone, pfn, freelist); | ||
195 | nr_freepages += isolated; | ||
196 | } | ||
197 | spin_unlock_irqrestore(&zone->lock, flags); | ||
198 | 154 | ||
199 | /* | 155 | /* |
200 | * Record the highest PFN we isolated pages from. When next | 156 | * If we managed to isolate pages, it is always (1 << n) * |
201 | * looking for free pages, the search will restart here as | 157 | * pageblock_nr_pages for some non-negative n. (Max order |
202 | * page migration may have returned some pages to the allocator | 158 | * page may span two pageblocks). |
203 | */ | 159 | */ |
204 | if (isolated) | ||
205 | high_pfn = max(high_pfn, pfn); | ||
206 | } | 160 | } |
207 | 161 | ||
208 | /* split_free_page does not map the pages */ | 162 | /* split_free_page does not map the pages */ |
209 | list_for_each_entry(page, freelist, lru) { | 163 | map_pages(&freelist); |
210 | arch_alloc_page(page, 0); | 164 | |
211 | kernel_map_pages(page, 1, 1); | 165 | if (pfn < end_pfn) { |
166 | /* Loop terminated early, cleanup. */ | ||
167 | release_freepages(&freelist); | ||
168 | return 0; | ||
212 | } | 169 | } |
213 | 170 | ||
214 | cc->free_pfn = high_pfn; | 171 | /* We don't use freelists for anything. */ |
215 | cc->nr_freepages = nr_freepages; | 172 | return pfn; |
216 | } | 173 | } |
217 | 174 | ||
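A hypothetical caller sketch for the new isolate_freepages_range() (the starting PFN is a placeholder; the intended caller is the contiguous-allocation/CMA path this series enables elsewhere, which passes a range it controls):

/* hypothetical caller sketch, not from this patch */
static int try_isolate_one_pageblock(unsigned long some_pfn)	/* some_pfn is a placeholder */
{
	unsigned long start = ALIGN(some_pfn, pageblock_nr_pages);
	unsigned long end = start + pageblock_nr_pages;
	unsigned long ret;

	ret = isolate_freepages_range(start, end);
	if (!ret)
		return -EBUSY;	/* a hole or a non-free page was found */

	/* on success, ret is one past the last isolated PFN (may exceed "end") */
	return 0;
}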
218 | /* Update the number of anon and file isolated pages in the zone */ | 175 | /* Update the number of anon and file isolated pages in the zone */ |
@@ -243,37 +200,34 @@ static bool too_many_isolated(struct zone *zone) | |||
243 | return isolated > (inactive + active) / 2; | 200 | return isolated > (inactive + active) / 2; |
244 | } | 201 | } |
245 | 202 | ||
246 | /* possible outcome of isolate_migratepages */ | 203 | /** |
247 | typedef enum { | 204 | * isolate_migratepages_range() - isolate all migrate-able pages in range. |
248 | ISOLATE_ABORT, /* Abort compaction now */ | 205 | * @zone: Zone pages are in. |
249 | ISOLATE_NONE, /* No pages isolated, continue scanning */ | 206 | * @cc: Compaction control structure. |
250 | ISOLATE_SUCCESS, /* Pages isolated, migrate */ | 207 | * @low_pfn: The first PFN of the range. |
251 | } isolate_migrate_t; | 208 | * @end_pfn: The one-past-the-last PFN of the range. |
252 | 209 | * | |
253 | /* | 210 | * Isolate all pages that can be migrated from the range specified by |
254 | * Isolate all pages that can be migrated from the block pointed to by | 211 | * [low_pfn, end_pfn). Returns zero if there is a fatal signal |
255 | * the migrate scanner within compact_control. | 212 | * pending, otherwise the PFN of the first page that was not scanned |
213 | * (which may be less than, equal to or greater than end_pfn). | ||
214 | * | ||
215 | * Assumes that cc->migratepages is empty and cc->nr_migratepages is | ||
216 | * zero. | ||
217 | * | ||
218 | * Apart from cc->migratepages and cc->nr_migratetypes this function | ||
219 | * does not modify any cc's fields, in particular it does not modify | ||
220 | * (or read for that matter) cc->migrate_pfn. | ||
256 | */ | 221 | */ |
257 | static isolate_migrate_t isolate_migratepages(struct zone *zone, | 222 | unsigned long |
258 | struct compact_control *cc) | 223 | isolate_migratepages_range(struct zone *zone, struct compact_control *cc, |
224 | unsigned long low_pfn, unsigned long end_pfn) | ||
259 | { | 225 | { |
260 | unsigned long low_pfn, end_pfn; | ||
261 | unsigned long last_pageblock_nr = 0, pageblock_nr; | 226 | unsigned long last_pageblock_nr = 0, pageblock_nr; |
262 | unsigned long nr_scanned = 0, nr_isolated = 0; | 227 | unsigned long nr_scanned = 0, nr_isolated = 0; |
263 | struct list_head *migratelist = &cc->migratepages; | 228 | struct list_head *migratelist = &cc->migratepages; |
264 | isolate_mode_t mode = ISOLATE_ACTIVE|ISOLATE_INACTIVE; | 229 | isolate_mode_t mode = 0; |
265 | 230 | struct lruvec *lruvec; | |
266 | /* Do not scan outside zone boundaries */ | ||
267 | low_pfn = max(cc->migrate_pfn, zone->zone_start_pfn); | ||
268 | |||
269 | /* Only scan within a pageblock boundary */ | ||
270 | end_pfn = ALIGN(low_pfn + pageblock_nr_pages, pageblock_nr_pages); | ||
271 | |||
272 | /* Do not cross the free scanner or scan within a memory hole */ | ||
273 | if (end_pfn > cc->free_pfn || !pfn_valid(low_pfn)) { | ||
274 | cc->migrate_pfn = end_pfn; | ||
275 | return ISOLATE_NONE; | ||
276 | } | ||
277 | 231 | ||
278 | /* | 232 | /* |
279 | * Ensure that there are not too many pages isolated from the LRU | 233 | * Ensure that there are not too many pages isolated from the LRU |
@@ -283,12 +237,12 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, | |||
283 | while (unlikely(too_many_isolated(zone))) { | 237 | while (unlikely(too_many_isolated(zone))) { |
284 | /* async migration should just abort */ | 238 | /* async migration should just abort */ |
285 | if (!cc->sync) | 239 | if (!cc->sync) |
286 | return ISOLATE_ABORT; | 240 | return 0; |
287 | 241 | ||
288 | congestion_wait(BLK_RW_ASYNC, HZ/10); | 242 | congestion_wait(BLK_RW_ASYNC, HZ/10); |
289 | 243 | ||
290 | if (fatal_signal_pending(current)) | 244 | if (fatal_signal_pending(current)) |
291 | return ISOLATE_ABORT; | 245 | return 0; |
292 | } | 246 | } |
293 | 247 | ||
294 | /* Time to isolate some pages for migration */ | 248 | /* Time to isolate some pages for migration */ |
@@ -351,7 +305,7 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, | |||
351 | */ | 305 | */ |
352 | pageblock_nr = low_pfn >> pageblock_order; | 306 | pageblock_nr = low_pfn >> pageblock_order; |
353 | if (!cc->sync && last_pageblock_nr != pageblock_nr && | 307 | if (!cc->sync && last_pageblock_nr != pageblock_nr && |
354 | get_pageblock_migratetype(page) != MIGRATE_MOVABLE) { | 308 | !migrate_async_suitable(get_pageblock_migratetype(page))) { |
355 | low_pfn += pageblock_nr_pages; | 309 | low_pfn += pageblock_nr_pages; |
356 | low_pfn = ALIGN(low_pfn, pageblock_nr_pages) - 1; | 310 | low_pfn = ALIGN(low_pfn, pageblock_nr_pages) - 1; |
357 | last_pageblock_nr = pageblock_nr; | 311 | last_pageblock_nr = pageblock_nr; |
@@ -374,14 +328,16 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, | |||
374 | if (!cc->sync) | 328 | if (!cc->sync) |
375 | mode |= ISOLATE_ASYNC_MIGRATE; | 329 | mode |= ISOLATE_ASYNC_MIGRATE; |
376 | 330 | ||
331 | lruvec = mem_cgroup_page_lruvec(page, zone); | ||
332 | |||
377 | /* Try isolate the page */ | 333 | /* Try isolate the page */ |
378 | if (__isolate_lru_page(page, mode, 0) != 0) | 334 | if (__isolate_lru_page(page, mode) != 0) |
379 | continue; | 335 | continue; |
380 | 336 | ||
381 | VM_BUG_ON(PageTransCompound(page)); | 337 | VM_BUG_ON(PageTransCompound(page)); |
382 | 338 | ||
383 | /* Successfully isolated */ | 339 | /* Successfully isolated */ |
384 | del_page_from_lru_list(zone, page, page_lru(page)); | 340 | del_page_from_lru_list(page, lruvec, page_lru(page)); |
385 | list_add(&page->lru, migratelist); | 341 | list_add(&page->lru, migratelist); |
386 | cc->nr_migratepages++; | 342 | cc->nr_migratepages++; |
387 | nr_isolated++; | 343 | nr_isolated++; |
@@ -396,11 +352,124 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, | |||
396 | acct_isolated(zone, cc); | 352 | acct_isolated(zone, cc); |
397 | 353 | ||
398 | spin_unlock_irq(&zone->lru_lock); | 354 | spin_unlock_irq(&zone->lru_lock); |
399 | cc->migrate_pfn = low_pfn; | ||
400 | 355 | ||
401 | trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated); | 356 | trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated); |
402 | 357 | ||
403 | return ISOLATE_SUCCESS; | 358 | return low_pfn; |
359 | } | ||
360 | |||
361 | #endif /* CONFIG_COMPACTION || CONFIG_CMA */ | ||
362 | #ifdef CONFIG_COMPACTION | ||
363 | |||
364 | /* Returns true if the page is within a block suitable for migration to */ | ||
365 | static bool suitable_migration_target(struct page *page) | ||
366 | { | ||
367 | |||
368 | int migratetype = get_pageblock_migratetype(page); | ||
369 | |||
370 | /* Don't interfere with memory hot-remove or the min_free_kbytes blocks */ | ||
371 | if (migratetype == MIGRATE_ISOLATE || migratetype == MIGRATE_RESERVE) | ||
372 | return false; | ||
373 | |||
374 | /* If the page is a large free page, then allow migration */ | ||
375 | if (PageBuddy(page) && page_order(page) >= pageblock_order) | ||
376 | return true; | ||
377 | |||
378 | /* If the block is MIGRATE_MOVABLE or MIGRATE_CMA, allow migration */ | ||
379 | if (migrate_async_suitable(migratetype)) | ||
380 | return true; | ||
381 | |||
382 | /* Otherwise skip the block */ | ||
383 | return false; | ||
384 | } | ||
385 | |||
386 | /* | ||
387 | * Based on information in the current compact_control, find blocks | ||
388 | * suitable for isolating free pages from and then isolate them. | ||
389 | */ | ||
390 | static void isolate_freepages(struct zone *zone, | ||
391 | struct compact_control *cc) | ||
392 | { | ||
393 | struct page *page; | ||
394 | unsigned long high_pfn, low_pfn, pfn, zone_end_pfn, end_pfn; | ||
395 | unsigned long flags; | ||
396 | int nr_freepages = cc->nr_freepages; | ||
397 | struct list_head *freelist = &cc->freepages; | ||
398 | |||
399 | /* | ||
400 | * Initialise the free scanner. The starting point is where we last | ||
401 | * scanned from (or the end of the zone if starting). The low point | ||
402 | * is the end of the pageblock the migration scanner is using. | ||
403 | */ | ||
404 | pfn = cc->free_pfn; | ||
405 | low_pfn = cc->migrate_pfn + pageblock_nr_pages; | ||
406 | |||
407 | /* | ||
408 | * Take care that if the migration scanner is at the end of the zone | ||
409 | * that the free scanner does not accidentally move to the next zone | ||
410 | * in the next isolation cycle. | ||
411 | */ | ||
412 | high_pfn = min(low_pfn, pfn); | ||
413 | |||
414 | zone_end_pfn = zone->zone_start_pfn + zone->spanned_pages; | ||
415 | |||
416 | /* | ||
417 | * Isolate free pages until enough are available to migrate the | ||
418 | * pages on cc->migratepages. We stop searching if the migrate | ||
419 | * and free page scanners meet or enough free pages are isolated. | ||
420 | */ | ||
421 | for (; pfn > low_pfn && cc->nr_migratepages > nr_freepages; | ||
422 | pfn -= pageblock_nr_pages) { | ||
423 | unsigned long isolated; | ||
424 | |||
425 | if (!pfn_valid(pfn)) | ||
426 | continue; | ||
427 | |||
428 | /* | ||
429 | * Check for overlapping nodes/zones. It's possible on some | ||
430 | * configurations to have a setup like | ||
431 | * node0 node1 node0 | ||
432 | * i.e. it's possible that all pages within a zones range of | ||
433 | * pages do not belong to a single zone. | ||
434 | */ | ||
435 | page = pfn_to_page(pfn); | ||
436 | if (page_zone(page) != zone) | ||
437 | continue; | ||
438 | |||
439 | /* Check the block is suitable for migration */ | ||
440 | if (!suitable_migration_target(page)) | ||
441 | continue; | ||
442 | |||
443 | /* | ||
444 | * Found a block suitable for isolating free pages from. Now | ||
445 | * we disabled interrupts, double check things are ok and | ||
446 | * isolate the pages. This is to minimise the time IRQs | ||
447 | * are disabled | ||
448 | */ | ||
449 | isolated = 0; | ||
450 | spin_lock_irqsave(&zone->lock, flags); | ||
451 | if (suitable_migration_target(page)) { | ||
452 | end_pfn = min(pfn + pageblock_nr_pages, zone_end_pfn); | ||
453 | isolated = isolate_freepages_block(pfn, end_pfn, | ||
454 | freelist, false); | ||
455 | nr_freepages += isolated; | ||
456 | } | ||
457 | spin_unlock_irqrestore(&zone->lock, flags); | ||
458 | |||
459 | /* | ||
460 | * Record the highest PFN we isolated pages from. When next | ||
461 | * looking for free pages, the search will restart here as | ||
462 | * page migration may have returned some pages to the allocator | ||
463 | */ | ||
464 | if (isolated) | ||
465 | high_pfn = max(high_pfn, pfn); | ||
466 | } | ||
467 | |||
468 | /* split_free_page does not map the pages */ | ||
469 | map_pages(freelist); | ||
470 | |||
471 | cc->free_pfn = high_pfn; | ||
472 | cc->nr_freepages = nr_freepages; | ||
404 | } | 473 | } |
405 | 474 | ||
406 | /* | 475 | /* |
@@ -449,6 +518,44 @@ static void update_nr_listpages(struct compact_control *cc) | |||
449 | cc->nr_freepages = nr_freepages; | 518 | cc->nr_freepages = nr_freepages; |
450 | } | 519 | } |
451 | 520 | ||
521 | /* possible outcome of isolate_migratepages */ | ||
522 | typedef enum { | ||
523 | ISOLATE_ABORT, /* Abort compaction now */ | ||
524 | ISOLATE_NONE, /* No pages isolated, continue scanning */ | ||
525 | ISOLATE_SUCCESS, /* Pages isolated, migrate */ | ||
526 | } isolate_migrate_t; | ||
527 | |||
528 | /* | ||
529 | * Isolate all pages that can be migrated from the block pointed to by | ||
530 | * the migrate scanner within compact_control. | ||
531 | */ | ||
532 | static isolate_migrate_t isolate_migratepages(struct zone *zone, | ||
533 | struct compact_control *cc) | ||
534 | { | ||
535 | unsigned long low_pfn, end_pfn; | ||
536 | |||
537 | /* Do not scan outside zone boundaries */ | ||
538 | low_pfn = max(cc->migrate_pfn, zone->zone_start_pfn); | ||
539 | |||
540 | /* Only scan within a pageblock boundary */ | ||
541 | end_pfn = ALIGN(low_pfn + pageblock_nr_pages, pageblock_nr_pages); | ||
542 | |||
543 | /* Do not cross the free scanner or scan within a memory hole */ | ||
544 | if (end_pfn > cc->free_pfn || !pfn_valid(low_pfn)) { | ||
545 | cc->migrate_pfn = end_pfn; | ||
546 | return ISOLATE_NONE; | ||
547 | } | ||
548 | |||
549 | /* Perform the isolation */ | ||
550 | low_pfn = isolate_migratepages_range(zone, cc, low_pfn, end_pfn); | ||
551 | if (!low_pfn) | ||
552 | return ISOLATE_ABORT; | ||
553 | |||
554 | cc->migrate_pfn = low_pfn; | ||
555 | |||
556 | return ISOLATE_SUCCESS; | ||
557 | } | ||
558 | |||
452 | static int compact_finished(struct zone *zone, | 559 | static int compact_finished(struct zone *zone, |
453 | struct compact_control *cc) | 560 | struct compact_control *cc) |
454 | { | 561 | { |
@@ -594,8 +701,11 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) | |||
594 | if (err) { | 701 | if (err) { |
595 | putback_lru_pages(&cc->migratepages); | 702 | putback_lru_pages(&cc->migratepages); |
596 | cc->nr_migratepages = 0; | 703 | cc->nr_migratepages = 0; |
704 | if (err == -ENOMEM) { | ||
705 | ret = COMPACT_PARTIAL; | ||
706 | goto out; | ||
707 | } | ||
597 | } | 708 | } |
598 | |||
599 | } | 709 | } |
600 | 710 | ||
601 | out: | 711 | out: |
@@ -795,3 +905,5 @@ void compaction_unregister_node(struct node *node) | |||
795 | return device_remove_file(&node->dev, &dev_attr_compact); | 905 | return device_remove_file(&node->dev, &dev_attr_compact); |
796 | } | 906 | } |
797 | #endif /* CONFIG_SYSFS && CONFIG_NUMA */ | 907 | #endif /* CONFIG_SYSFS && CONFIG_NUMA */ |
908 | |||
909 | #endif /* CONFIG_COMPACTION */ | ||
diff --git a/mm/filemap.c b/mm/filemap.c index 79c4b2b0b14e..a4a5260b0279 100644 --- a/mm/filemap.c +++ b/mm/filemap.c | |||
@@ -29,7 +29,6 @@ | |||
29 | #include <linux/pagevec.h> | 29 | #include <linux/pagevec.h> |
30 | #include <linux/blkdev.h> | 30 | #include <linux/blkdev.h> |
31 | #include <linux/security.h> | 31 | #include <linux/security.h> |
32 | #include <linux/syscalls.h> | ||
33 | #include <linux/cpuset.h> | 32 | #include <linux/cpuset.h> |
34 | #include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */ | 33 | #include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */ |
35 | #include <linux/memcontrol.h> | 34 | #include <linux/memcontrol.h> |
@@ -1478,44 +1477,6 @@ out: | |||
1478 | } | 1477 | } |
1479 | EXPORT_SYMBOL(generic_file_aio_read); | 1478 | EXPORT_SYMBOL(generic_file_aio_read); |
1480 | 1479 | ||
1481 | static ssize_t | ||
1482 | do_readahead(struct address_space *mapping, struct file *filp, | ||
1483 | pgoff_t index, unsigned long nr) | ||
1484 | { | ||
1485 | if (!mapping || !mapping->a_ops || !mapping->a_ops->readpage) | ||
1486 | return -EINVAL; | ||
1487 | |||
1488 | force_page_cache_readahead(mapping, filp, index, nr); | ||
1489 | return 0; | ||
1490 | } | ||
1491 | |||
1492 | SYSCALL_DEFINE(readahead)(int fd, loff_t offset, size_t count) | ||
1493 | { | ||
1494 | ssize_t ret; | ||
1495 | struct file *file; | ||
1496 | |||
1497 | ret = -EBADF; | ||
1498 | file = fget(fd); | ||
1499 | if (file) { | ||
1500 | if (file->f_mode & FMODE_READ) { | ||
1501 | struct address_space *mapping = file->f_mapping; | ||
1502 | pgoff_t start = offset >> PAGE_CACHE_SHIFT; | ||
1503 | pgoff_t end = (offset + count - 1) >> PAGE_CACHE_SHIFT; | ||
1504 | unsigned long len = end - start + 1; | ||
1505 | ret = do_readahead(mapping, file, start, len); | ||
1506 | } | ||
1507 | fput(file); | ||
1508 | } | ||
1509 | return ret; | ||
1510 | } | ||
1511 | #ifdef CONFIG_HAVE_SYSCALL_WRAPPERS | ||
1512 | asmlinkage long SyS_readahead(long fd, loff_t offset, long count) | ||
1513 | { | ||
1514 | return SYSC_readahead((int) fd, offset, (size_t) count); | ||
1515 | } | ||
1516 | SYSCALL_ALIAS(sys_readahead, SyS_readahead); | ||
1517 | #endif | ||
1518 | |||
1519 | #ifdef CONFIG_MMU | 1480 | #ifdef CONFIG_MMU |
1520 | /** | 1481 | /** |
1521 | * page_cache_read - adds requested page to the page cache if not already there | 1482 | * page_cache_read - adds requested page to the page cache if not already there |
@@ -1938,71 +1899,6 @@ struct page *read_cache_page(struct address_space *mapping, | |||
1938 | } | 1899 | } |
1939 | EXPORT_SYMBOL(read_cache_page); | 1900 | EXPORT_SYMBOL(read_cache_page); |
1940 | 1901 | ||
1941 | /* | ||
1942 | * The logic we want is | ||
1943 | * | ||
1944 | * if suid or (sgid and xgrp) | ||
1945 | * remove privs | ||
1946 | */ | ||
1947 | int should_remove_suid(struct dentry *dentry) | ||
1948 | { | ||
1949 | umode_t mode = dentry->d_inode->i_mode; | ||
1950 | int kill = 0; | ||
1951 | |||
1952 | /* suid always must be killed */ | ||
1953 | if (unlikely(mode & S_ISUID)) | ||
1954 | kill = ATTR_KILL_SUID; | ||
1955 | |||
1956 | /* | ||
1957 | * sgid without any exec bits is just a mandatory locking mark; leave | ||
1958 | * it alone. If some exec bits are set, it's a real sgid; kill it. | ||
1959 | */ | ||
1960 | if (unlikely((mode & S_ISGID) && (mode & S_IXGRP))) | ||
1961 | kill |= ATTR_KILL_SGID; | ||
1962 | |||
1963 | if (unlikely(kill && !capable(CAP_FSETID) && S_ISREG(mode))) | ||
1964 | return kill; | ||
1965 | |||
1966 | return 0; | ||
1967 | } | ||
1968 | EXPORT_SYMBOL(should_remove_suid); | ||
1969 | |||
1970 | static int __remove_suid(struct dentry *dentry, int kill) | ||
1971 | { | ||
1972 | struct iattr newattrs; | ||
1973 | |||
1974 | newattrs.ia_valid = ATTR_FORCE | kill; | ||
1975 | return notify_change(dentry, &newattrs); | ||
1976 | } | ||
1977 | |||
1978 | int file_remove_suid(struct file *file) | ||
1979 | { | ||
1980 | struct dentry *dentry = file->f_path.dentry; | ||
1981 | struct inode *inode = dentry->d_inode; | ||
1982 | int killsuid; | ||
1983 | int killpriv; | ||
1984 | int error = 0; | ||
1985 | |||
1986 | /* Fast path for nothing security related */ | ||
1987 | if (IS_NOSEC(inode)) | ||
1988 | return 0; | ||
1989 | |||
1990 | killsuid = should_remove_suid(dentry); | ||
1991 | killpriv = security_inode_need_killpriv(dentry); | ||
1992 | |||
1993 | if (killpriv < 0) | ||
1994 | return killpriv; | ||
1995 | if (killpriv) | ||
1996 | error = security_inode_killpriv(dentry); | ||
1997 | if (!error && killsuid) | ||
1998 | error = __remove_suid(dentry, killsuid); | ||
1999 | if (!error && (inode->i_sb->s_flags & MS_NOSEC)) | ||
2000 | inode->i_flags |= S_NOSEC; | ||
2001 | |||
2002 | return error; | ||
2003 | } | ||
2004 | EXPORT_SYMBOL(file_remove_suid); | ||
2005 | |||
2006 | static size_t __iovec_copy_from_user_inatomic(char *vaddr, | 1902 | static size_t __iovec_copy_from_user_inatomic(char *vaddr, |
2007 | const struct iovec *iov, size_t base, size_t bytes) | 1903 | const struct iovec *iov, size_t base, size_t bytes) |
2008 | { | 1904 | { |
@@ -2528,7 +2424,9 @@ ssize_t __generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov, | |||
2528 | if (err) | 2424 | if (err) |
2529 | goto out; | 2425 | goto out; |
2530 | 2426 | ||
2531 | file_update_time(file); | 2427 | err = file_update_time(file); |
2428 | if (err) | ||
2429 | goto out; | ||
2532 | 2430 | ||
2533 | /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */ | 2431 | /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */ |
2534 | if (unlikely(file->f_flags & O_DIRECT)) { | 2432 | if (unlikely(file->f_flags & O_DIRECT)) { |
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c index a4eb31132229..213ca1f53409 100644 --- a/mm/filemap_xip.c +++ b/mm/filemap_xip.c | |||
@@ -426,7 +426,9 @@ xip_file_write(struct file *filp, const char __user *buf, size_t len, | |||
426 | if (ret) | 426 | if (ret) |
427 | goto out_backing; | 427 | goto out_backing; |
428 | 428 | ||
429 | file_update_time(filp); | 429 | ret = file_update_time(filp); |
430 | if (ret) | ||
431 | goto out_backing; | ||
430 | 432 | ||
431 | ret = __xip_file_write (filp, buf, count, pos, ppos); | 433 | ret = __xip_file_write (filp, buf, count, pos, ppos); |
432 | 434 | ||
diff --git a/mm/frontswap.c b/mm/frontswap.c new file mode 100644 index 000000000000..e25025574a02 --- /dev/null +++ b/mm/frontswap.c | |||
@@ -0,0 +1,314 @@ | |||
1 | /* | ||
2 | * Frontswap frontend | ||
3 | * | ||
4 | * This code provides the generic "frontend" layer to call a matching | ||
5 | * "backend" driver implementation of frontswap. See | ||
6 | * Documentation/vm/frontswap.txt for more information. | ||
7 | * | ||
8 | * Copyright (C) 2009-2012 Oracle Corp. All rights reserved. | ||
9 | * Author: Dan Magenheimer | ||
10 | * | ||
11 | * This work is licensed under the terms of the GNU GPL, version 2. | ||
12 | */ | ||
13 | |||
14 | #include <linux/mm.h> | ||
15 | #include <linux/mman.h> | ||
16 | #include <linux/swap.h> | ||
17 | #include <linux/swapops.h> | ||
18 | #include <linux/proc_fs.h> | ||
19 | #include <linux/security.h> | ||
20 | #include <linux/capability.h> | ||
21 | #include <linux/module.h> | ||
22 | #include <linux/uaccess.h> | ||
23 | #include <linux/debugfs.h> | ||
24 | #include <linux/frontswap.h> | ||
25 | #include <linux/swapfile.h> | ||
26 | |||
27 | /* | ||
28 | * frontswap_ops is set by frontswap_register_ops to contain the pointers | ||
29 | * to the frontswap "backend" implementation functions. | ||
30 | */ | ||
31 | static struct frontswap_ops frontswap_ops __read_mostly; | ||
32 | |||
33 | /* | ||
34 | * This global enablement flag reduces overhead on systems where frontswap_ops | ||
35 | * has not been registered, so is preferred to the slower alternative: a | ||
36 | * function call that checks a non-global. | ||
37 | */ | ||
38 | bool frontswap_enabled __read_mostly; | ||
39 | EXPORT_SYMBOL(frontswap_enabled); | ||
40 | |||
41 | /* | ||
42 | * If enabled, frontswap_store will return failure even on success. As | ||
43 | * a result, the swap subsystem will always write the page to swap, in | ||
44 | * effect converting frontswap into a writethrough cache. In this mode, | ||
45 | * there is no direct reduction in swap writes, but a frontswap backend | ||
46 | * can unilaterally "reclaim" any pages in use with no data loss, thus | ||
47 | * providing increased control over maximum memory usage due to frontswap. ||
48 | */ | ||
49 | static bool frontswap_writethrough_enabled __read_mostly; | ||
50 | |||
51 | #ifdef CONFIG_DEBUG_FS | ||
52 | /* | ||
53 | * Counters available via /sys/kernel/debug/frontswap (if debugfs is | ||
54 | * properly configured). These are for information only so are not protected | ||
55 | * against increment races. | ||
56 | */ | ||
57 | static u64 frontswap_loads; | ||
58 | static u64 frontswap_succ_stores; | ||
59 | static u64 frontswap_failed_stores; | ||
60 | static u64 frontswap_invalidates; | ||
61 | |||
62 | static inline void inc_frontswap_loads(void) { | ||
63 | frontswap_loads++; | ||
64 | } | ||
65 | static inline void inc_frontswap_succ_stores(void) { | ||
66 | frontswap_succ_stores++; | ||
67 | } | ||
68 | static inline void inc_frontswap_failed_stores(void) { | ||
69 | frontswap_failed_stores++; | ||
70 | } | ||
71 | static inline void inc_frontswap_invalidates(void) { | ||
72 | frontswap_invalidates++; | ||
73 | } | ||
74 | #else | ||
75 | static inline void inc_frontswap_loads(void) { } | ||
76 | static inline void inc_frontswap_succ_stores(void) { } | ||
77 | static inline void inc_frontswap_failed_stores(void) { } | ||
78 | static inline void inc_frontswap_invalidates(void) { } | ||
79 | #endif | ||
80 | /* | ||
81 | * Register operations for frontswap, returning previous thus allowing | ||
82 | * detection of multiple backends and possible nesting. | ||
83 | */ | ||
84 | struct frontswap_ops frontswap_register_ops(struct frontswap_ops *ops) | ||
85 | { | ||
86 | struct frontswap_ops old = frontswap_ops; | ||
87 | |||
88 | frontswap_ops = *ops; | ||
89 | frontswap_enabled = true; | ||
90 | return old; | ||
91 | } | ||
92 | EXPORT_SYMBOL(frontswap_register_ops); | ||
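For illustration, a minimal hypothetical backend registering its callbacks through frontswap_register_ops(). The dumb_* names are invented, and the frontswap_ops member signatures are inferred from how they are invoked in __frontswap_store()/__frontswap_load() below; the authoritative definition lives in include/linux/frontswap.h, outside this diff:

#include <linux/module.h>
#include <linux/frontswap.h>

static void dumb_init(unsigned type) { }

static int dumb_store(unsigned type, pgoff_t offset, struct page *page)
{
	return -1;	/* pretend the backend is full, so swap writes to disk */
}

static int dumb_load(unsigned type, pgoff_t offset, struct page *page)
{
	return -1;	/* nothing cached */
}

static void dumb_invalidate_page(unsigned type, pgoff_t offset) { }
static void dumb_invalidate_area(unsigned type) { }

static struct frontswap_ops dumb_ops = {
	.init			= dumb_init,
	.store			= dumb_store,
	.load			= dumb_load,
	.invalidate_page	= dumb_invalidate_page,
	.invalidate_area	= dumb_invalidate_area,
};

static int __init dumb_frontswap_init(void)
{
	struct frontswap_ops old = frontswap_register_ops(&dumb_ops);

	/* a real backend would inspect "old" for a previously registered backend */
	(void)old;
	return 0;
}
module_init(dumb_frontswap_init);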
93 | |||
94 | /* | ||
95 | * Enable/disable frontswap writethrough (see above). | ||
96 | */ | ||
97 | void frontswap_writethrough(bool enable) | ||
98 | { | ||
99 | frontswap_writethrough_enabled = enable; | ||
100 | } | ||
101 | EXPORT_SYMBOL(frontswap_writethrough); | ||
102 | |||
103 | /* | ||
104 | * Called when a swap device is swapon'd. | ||
105 | */ | ||
106 | void __frontswap_init(unsigned type) | ||
107 | { | ||
108 | struct swap_info_struct *sis = swap_info[type]; | ||
109 | |||
110 | BUG_ON(sis == NULL); | ||
111 | if (sis->frontswap_map == NULL) | ||
112 | return; | ||
113 | if (frontswap_enabled) | ||
114 | (*frontswap_ops.init)(type); | ||
115 | } | ||
116 | EXPORT_SYMBOL(__frontswap_init); | ||
117 | |||
118 | /* | ||
119 | * "Store" data from a page to frontswap and associate it with the page's | ||
120 | * swaptype and offset. Page must be locked and in the swap cache. | ||
121 | * If frontswap already contains a page with matching swaptype and | ||
122 | * offset, the frontswap implementation may either overwrite the data and ||
123 | * return success or invalidate the page from frontswap and return failure. | ||
124 | */ | ||
125 | int __frontswap_store(struct page *page) | ||
126 | { | ||
127 | int ret = -1, dup = 0; | ||
128 | swp_entry_t entry = { .val = page_private(page), }; | ||
129 | int type = swp_type(entry); | ||
130 | struct swap_info_struct *sis = swap_info[type]; | ||
131 | pgoff_t offset = swp_offset(entry); | ||
132 | |||
133 | BUG_ON(!PageLocked(page)); | ||
134 | BUG_ON(sis == NULL); | ||
135 | if (frontswap_test(sis, offset)) | ||
136 | dup = 1; | ||
137 | ret = (*frontswap_ops.store)(type, offset, page); | ||
138 | if (ret == 0) { | ||
139 | frontswap_set(sis, offset); | ||
140 | inc_frontswap_succ_stores(); | ||
141 | if (!dup) | ||
142 | atomic_inc(&sis->frontswap_pages); | ||
143 | } else if (dup) { | ||
144 | /* | ||
145 | failed dup always results in automatic invalidate of | ||
146 | the (older) page from frontswap | ||
147 | */ | ||
148 | frontswap_clear(sis, offset); | ||
149 | atomic_dec(&sis->frontswap_pages); | ||
150 | inc_frontswap_failed_stores(); | ||
151 | } else | ||
152 | inc_frontswap_failed_stores(); | ||
153 | if (frontswap_writethrough_enabled) | ||
154 | /* report failure so swap also writes to swap device */ | ||
155 | ret = -1; | ||
156 | return ret; | ||
157 | } | ||
158 | EXPORT_SYMBOL(__frontswap_store); | ||
159 | |||
160 | /* | ||
161 | * "Get" data from frontswap associated with swaptype and offset that were | ||
162 | * specified when the data was put to frontswap and use it to fill the | ||
163 | * specified page with data. Page must be locked and in the swap cache. | ||
164 | */ | ||
165 | int __frontswap_load(struct page *page) | ||
166 | { | ||
167 | int ret = -1; | ||
168 | swp_entry_t entry = { .val = page_private(page), }; | ||
169 | int type = swp_type(entry); | ||
170 | struct swap_info_struct *sis = swap_info[type]; | ||
171 | pgoff_t offset = swp_offset(entry); | ||
172 | |||
173 | BUG_ON(!PageLocked(page)); | ||
174 | BUG_ON(sis == NULL); | ||
175 | if (frontswap_test(sis, offset)) | ||
176 | ret = (*frontswap_ops.load)(type, offset, page); | ||
177 | if (ret == 0) | ||
178 | inc_frontswap_loads(); | ||
179 | return ret; | ||
180 | } | ||
181 | EXPORT_SYMBOL(__frontswap_load); | ||
182 | |||
183 | /* | ||
184 | * Invalidate any data from frontswap associated with the specified swaptype | ||
185 | * and offset so that a subsequent "get" will fail. | ||
186 | */ | ||
187 | void __frontswap_invalidate_page(unsigned type, pgoff_t offset) | ||
188 | { | ||
189 | struct swap_info_struct *sis = swap_info[type]; | ||
190 | |||
191 | BUG_ON(sis == NULL); | ||
192 | if (frontswap_test(sis, offset)) { | ||
193 | (*frontswap_ops.invalidate_page)(type, offset); | ||
194 | atomic_dec(&sis->frontswap_pages); | ||
195 | frontswap_clear(sis, offset); | ||
196 | inc_frontswap_invalidates(); | ||
197 | } | ||
198 | } | ||
199 | EXPORT_SYMBOL(__frontswap_invalidate_page); | ||
200 | |||
201 | /* | ||
202 | * Invalidate all data from frontswap associated with all offsets for the | ||
203 | * specified swaptype. | ||
204 | */ | ||
205 | void __frontswap_invalidate_area(unsigned type) | ||
206 | { | ||
207 | struct swap_info_struct *sis = swap_info[type]; | ||
208 | |||
209 | BUG_ON(sis == NULL); | ||
210 | if (sis->frontswap_map == NULL) | ||
211 | return; | ||
212 | (*frontswap_ops.invalidate_area)(type); | ||
213 | atomic_set(&sis->frontswap_pages, 0); | ||
214 | memset(sis->frontswap_map, 0, sis->max / sizeof(long)); | ||
215 | } | ||
216 | EXPORT_SYMBOL(__frontswap_invalidate_area); | ||
217 | |||
218 | /* | ||
219 | * Frontswap, like a true swap device, may unnecessarily retain pages | ||
220 | * under certain circumstances; "shrink" frontswap is essentially a | ||
221 | * "partial swapoff" and works by calling try_to_unuse to attempt to | ||
222 | * unuse enough frontswap pages to attempt to -- subject to memory | ||
223 | * constraints -- reduce the number of pages in frontswap to the | ||
224 | * number given in the parameter target_pages. | ||
225 | */ | ||
226 | void frontswap_shrink(unsigned long target_pages) | ||
227 | { | ||
228 | struct swap_info_struct *si = NULL; | ||
229 | int si_frontswap_pages; | ||
230 | unsigned long total_pages = 0, total_pages_to_unuse; | ||
231 | unsigned long pages = 0, pages_to_unuse = 0; | ||
232 | int type; | ||
233 | bool locked = false; | ||
234 | |||
235 | /* | ||
236 | * we don't want to hold swap_lock while doing a very | ||
237 | * lengthy try_to_unuse, but swap_list may change | ||
238 | * so restart scan from swap_list.head each time | ||
239 | */ | ||
240 | spin_lock(&swap_lock); | ||
241 | locked = true; | ||
242 | total_pages = 0; | ||
243 | for (type = swap_list.head; type >= 0; type = si->next) { | ||
244 | si = swap_info[type]; | ||
245 | total_pages += atomic_read(&si->frontswap_pages); | ||
246 | } | ||
247 | if (total_pages <= target_pages) | ||
248 | goto out; | ||
249 | total_pages_to_unuse = total_pages - target_pages; | ||
250 | for (type = swap_list.head; type >= 0; type = si->next) { | ||
251 | si = swap_info[type]; | ||
252 | si_frontswap_pages = atomic_read(&si->frontswap_pages); | ||
253 | if (total_pages_to_unuse < si_frontswap_pages) | ||
254 | pages = pages_to_unuse = total_pages_to_unuse; | ||
255 | else { | ||
256 | pages = si_frontswap_pages; | ||
257 | pages_to_unuse = 0; /* unuse all */ | ||
258 | } | ||
259 | /* ensure there is enough RAM to fetch pages from frontswap */ | ||
260 | if (security_vm_enough_memory_mm(current->mm, pages)) | ||
261 | continue; | ||
262 | vm_unacct_memory(pages); | ||
263 | break; | ||
264 | } | ||
265 | if (type < 0) | ||
266 | goto out; | ||
267 | locked = false; | ||
268 | spin_unlock(&swap_lock); | ||
269 | try_to_unuse(type, true, pages_to_unuse); | ||
270 | out: | ||
271 | if (locked) | ||
272 | spin_unlock(&swap_lock); | ||
273 | return; | ||
274 | } | ||
275 | EXPORT_SYMBOL(frontswap_shrink); | ||
276 | |||
277 | /* | ||
278 | * Count and return the number of frontswap pages across all | ||
279 | * swap devices. This is exported so that backend drivers can | ||
280 | * determine current usage without reading debugfs. | ||
281 | */ | ||
282 | unsigned long frontswap_curr_pages(void) | ||
283 | { | ||
284 | int type; | ||
285 | unsigned long totalpages = 0; | ||
286 | struct swap_info_struct *si = NULL; | ||
287 | |||
288 | spin_lock(&swap_lock); | ||
289 | for (type = swap_list.head; type >= 0; type = si->next) { | ||
290 | si = swap_info[type]; | ||
291 | totalpages += atomic_read(&si->frontswap_pages); | ||
292 | } | ||
293 | spin_unlock(&swap_lock); | ||
294 | return totalpages; | ||
295 | } | ||
296 | EXPORT_SYMBOL(frontswap_curr_pages); | ||
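Together, frontswap_curr_pages() and frontswap_shrink() let a backend shed cached pages when its own memory becomes tight. A hedged sketch of that pattern; the halving policy and the function name are illustrative, not taken from this patch.

/* Illustrative only: halve frontswap usage when the backend is under pressure. */
static void example_backend_relieve_pressure(void)
{
	unsigned long cur = frontswap_curr_pages();

	if (cur)
		frontswap_shrink(cur / 2);	/* "partial swapoff" down to the target */
}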
297 | |||
298 | static int __init init_frontswap(void) | ||
299 | { | ||
300 | #ifdef CONFIG_DEBUG_FS | ||
301 | struct dentry *root = debugfs_create_dir("frontswap", NULL); | ||
302 | if (root == NULL) | ||
303 | return -ENXIO; | ||
304 | debugfs_create_u64("loads", S_IRUGO, root, &frontswap_loads); | ||
305 | debugfs_create_u64("succ_stores", S_IRUGO, root, &frontswap_succ_stores); | ||
306 | debugfs_create_u64("failed_stores", S_IRUGO, root, | ||
307 | &frontswap_failed_stores); | ||
308 | debugfs_create_u64("invalidates", S_IRUGO, | ||
309 | root, &frontswap_invalidates); | ||
310 | #endif | ||
311 | return 0; | ||
312 | } | ||
313 | |||
314 | module_init(init_frontswap); | ||
diff --git a/mm/huge_memory.c b/mm/huge_memory.c index f0e5306eeb55..57c4b9309015 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c | |||
@@ -636,16 +636,12 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, | |||
636 | unsigned long haddr, pmd_t *pmd, | 636 | unsigned long haddr, pmd_t *pmd, |
637 | struct page *page) | 637 | struct page *page) |
638 | { | 638 | { |
639 | int ret = 0; | ||
640 | pgtable_t pgtable; | 639 | pgtable_t pgtable; |
641 | 640 | ||
642 | VM_BUG_ON(!PageCompound(page)); | 641 | VM_BUG_ON(!PageCompound(page)); |
643 | pgtable = pte_alloc_one(mm, haddr); | 642 | pgtable = pte_alloc_one(mm, haddr); |
644 | if (unlikely(!pgtable)) { | 643 | if (unlikely(!pgtable)) |
645 | mem_cgroup_uncharge_page(page); | ||
646 | put_page(page); | ||
647 | return VM_FAULT_OOM; | 644 | return VM_FAULT_OOM; |
648 | } | ||
649 | 645 | ||
650 | clear_huge_page(page, haddr, HPAGE_PMD_NR); | 646 | clear_huge_page(page, haddr, HPAGE_PMD_NR); |
651 | __SetPageUptodate(page); | 647 | __SetPageUptodate(page); |
@@ -675,7 +671,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, | |||
675 | spin_unlock(&mm->page_table_lock); | 671 | spin_unlock(&mm->page_table_lock); |
676 | } | 672 | } |
677 | 673 | ||
678 | return ret; | 674 | return 0; |
679 | } | 675 | } |
680 | 676 | ||
681 | static inline gfp_t alloc_hugepage_gfpmask(int defrag, gfp_t extra_gfp) | 677 | static inline gfp_t alloc_hugepage_gfpmask(int defrag, gfp_t extra_gfp) |
@@ -724,8 +720,14 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
724 | put_page(page); | 720 | put_page(page); |
725 | goto out; | 721 | goto out; |
726 | } | 722 | } |
723 | if (unlikely(__do_huge_pmd_anonymous_page(mm, vma, haddr, pmd, | ||
724 | page))) { | ||
725 | mem_cgroup_uncharge_page(page); | ||
726 | put_page(page); | ||
727 | goto out; | ||
728 | } | ||
727 | 729 | ||
728 | return __do_huge_pmd_anonymous_page(mm, vma, haddr, pmd, page); | 730 | return 0; |
729 | } | 731 | } |
730 | out: | 732 | out: |
731 | /* | 733 | /* |
@@ -950,6 +952,8 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
950 | count_vm_event(THP_FAULT_FALLBACK); | 952 | count_vm_event(THP_FAULT_FALLBACK); |
951 | ret = do_huge_pmd_wp_page_fallback(mm, vma, address, | 953 | ret = do_huge_pmd_wp_page_fallback(mm, vma, address, |
952 | pmd, orig_pmd, page, haddr); | 954 | pmd, orig_pmd, page, haddr); |
955 | if (ret & VM_FAULT_OOM) | ||
956 | split_huge_page(page); | ||
953 | put_page(page); | 957 | put_page(page); |
954 | goto out; | 958 | goto out; |
955 | } | 959 | } |
@@ -957,6 +961,7 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
957 | 961 | ||
958 | if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) { | 962 | if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) { |
959 | put_page(new_page); | 963 | put_page(new_page); |
964 | split_huge_page(page); | ||
960 | put_page(page); | 965 | put_page(page); |
961 | ret |= VM_FAULT_OOM; | 966 | ret |= VM_FAULT_OOM; |
962 | goto out; | 967 | goto out; |
@@ -968,8 +973,10 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
968 | spin_lock(&mm->page_table_lock); | 973 | spin_lock(&mm->page_table_lock); |
969 | put_page(page); | 974 | put_page(page); |
970 | if (unlikely(!pmd_same(*pmd, orig_pmd))) { | 975 | if (unlikely(!pmd_same(*pmd, orig_pmd))) { |
976 | spin_unlock(&mm->page_table_lock); | ||
971 | mem_cgroup_uncharge_page(new_page); | 977 | mem_cgroup_uncharge_page(new_page); |
972 | put_page(new_page); | 978 | put_page(new_page); |
979 | goto out; | ||
973 | } else { | 980 | } else { |
974 | pmd_t entry; | 981 | pmd_t entry; |
975 | VM_BUG_ON(!PageHead(page)); | 982 | VM_BUG_ON(!PageHead(page)); |
@@ -1224,10 +1231,13 @@ static void __split_huge_page_refcount(struct page *page) | |||
1224 | { | 1231 | { |
1225 | int i; | 1232 | int i; |
1226 | struct zone *zone = page_zone(page); | 1233 | struct zone *zone = page_zone(page); |
1234 | struct lruvec *lruvec; | ||
1227 | int tail_count = 0; | 1235 | int tail_count = 0; |
1228 | 1236 | ||
1229 | /* prevent PageLRU to go away from under us, and freeze lru stats */ | 1237 | /* prevent PageLRU to go away from under us, and freeze lru stats */ |
1230 | spin_lock_irq(&zone->lru_lock); | 1238 | spin_lock_irq(&zone->lru_lock); |
1239 | lruvec = mem_cgroup_page_lruvec(page, zone); | ||
1240 | |||
1231 | compound_lock(page); | 1241 | compound_lock(page); |
1232 | /* complete memcg works before add pages to LRU */ | 1242 | /* complete memcg works before add pages to LRU */ |
1233 | mem_cgroup_split_huge_fixup(page); | 1243 | mem_cgroup_split_huge_fixup(page); |
@@ -1302,13 +1312,12 @@ static void __split_huge_page_refcount(struct page *page) | |||
1302 | BUG_ON(!PageDirty(page_tail)); | 1312 | BUG_ON(!PageDirty(page_tail)); |
1303 | BUG_ON(!PageSwapBacked(page_tail)); | 1313 | BUG_ON(!PageSwapBacked(page_tail)); |
1304 | 1314 | ||
1305 | 1315 | lru_add_page_tail(page, page_tail, lruvec); | |
1306 | lru_add_page_tail(zone, page, page_tail); | ||
1307 | } | 1316 | } |
1308 | atomic_sub(tail_count, &page->_count); | 1317 | atomic_sub(tail_count, &page->_count); |
1309 | BUG_ON(atomic_read(&page->_count) <= 0); | 1318 | BUG_ON(atomic_read(&page->_count) <= 0); |
1310 | 1319 | ||
1311 | __dec_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES); | 1320 | __mod_zone_page_state(zone, NR_ANON_TRANSPARENT_HUGEPAGES, -1); |
1312 | __mod_zone_page_state(zone, NR_ANON_PAGES, HPAGE_PMD_NR); | 1321 | __mod_zone_page_state(zone, NR_ANON_PAGES, HPAGE_PMD_NR); |
1313 | 1322 | ||
1314 | ClearPageCompound(page); | 1323 | ClearPageCompound(page); |
diff --git a/mm/hugetlb.c b/mm/hugetlb.c index ae8f708e3d75..e198831276a3 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c | |||
@@ -273,8 +273,8 @@ static long region_count(struct list_head *head, long f, long t) | |||
273 | 273 | ||
274 | /* Locate each segment we overlap with, and count that overlap. */ | 274 | /* Locate each segment we overlap with, and count that overlap. */ |
275 | list_for_each_entry(rg, head, link) { | 275 | list_for_each_entry(rg, head, link) { |
276 | int seg_from; | 276 | long seg_from; |
277 | int seg_to; | 277 | long seg_to; |
278 | 278 | ||
279 | if (rg->to <= f) | 279 | if (rg->to <= f) |
280 | continue; | 280 | continue; |
@@ -2157,6 +2157,15 @@ static void hugetlb_vm_op_open(struct vm_area_struct *vma) | |||
2157 | kref_get(&reservations->refs); | 2157 | kref_get(&reservations->refs); |
2158 | } | 2158 | } |
2159 | 2159 | ||
2160 | static void resv_map_put(struct vm_area_struct *vma) | ||
2161 | { | ||
2162 | struct resv_map *reservations = vma_resv_map(vma); | ||
2163 | |||
2164 | if (!reservations) | ||
2165 | return; | ||
2166 | kref_put(&reservations->refs, resv_map_release); | ||
2167 | } | ||
2168 | |||
2160 | static void hugetlb_vm_op_close(struct vm_area_struct *vma) | 2169 | static void hugetlb_vm_op_close(struct vm_area_struct *vma) |
2161 | { | 2170 | { |
2162 | struct hstate *h = hstate_vma(vma); | 2171 | struct hstate *h = hstate_vma(vma); |
@@ -2173,7 +2182,7 @@ static void hugetlb_vm_op_close(struct vm_area_struct *vma) | |||
2173 | reserve = (end - start) - | 2182 | reserve = (end - start) - |
2174 | region_count(&reservations->regions, start, end); | 2183 | region_count(&reservations->regions, start, end); |
2175 | 2184 | ||
2176 | kref_put(&reservations->refs, resv_map_release); | 2185 | resv_map_put(vma); |
2177 | 2186 | ||
2178 | if (reserve) { | 2187 | if (reserve) { |
2179 | hugetlb_acct_memory(h, -reserve); | 2188 | hugetlb_acct_memory(h, -reserve); |
@@ -2213,6 +2222,7 @@ static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page, | |||
2213 | } | 2222 | } |
2214 | entry = pte_mkyoung(entry); | 2223 | entry = pte_mkyoung(entry); |
2215 | entry = pte_mkhuge(entry); | 2224 | entry = pte_mkhuge(entry); |
2225 | entry = arch_make_huge_pte(entry, vma, page, writable); | ||
2216 | 2226 | ||
2217 | return entry; | 2227 | return entry; |
2218 | } | 2228 | } |
@@ -2990,12 +3000,16 @@ int hugetlb_reserve_pages(struct inode *inode, | |||
2990 | set_vma_resv_flags(vma, HPAGE_RESV_OWNER); | 3000 | set_vma_resv_flags(vma, HPAGE_RESV_OWNER); |
2991 | } | 3001 | } |
2992 | 3002 | ||
2993 | if (chg < 0) | 3003 | if (chg < 0) { |
2994 | return chg; | 3004 | ret = chg; |
3005 | goto out_err; | ||
3006 | } | ||
2995 | 3007 | ||
2996 | /* There must be enough pages in the subpool for the mapping */ | 3008 | /* There must be enough pages in the subpool for the mapping */ |
2997 | if (hugepage_subpool_get_pages(spool, chg)) | 3009 | if (hugepage_subpool_get_pages(spool, chg)) { |
2998 | return -ENOSPC; | 3010 | ret = -ENOSPC; |
3011 | goto out_err; | ||
3012 | } | ||
2999 | 3013 | ||
3000 | /* | 3014 | /* |
3001 | * Check enough hugepages are available for the reservation. | 3015 | * Check enough hugepages are available for the reservation. |
@@ -3004,7 +3018,7 @@ int hugetlb_reserve_pages(struct inode *inode, | |||
3004 | ret = hugetlb_acct_memory(h, chg); | 3018 | ret = hugetlb_acct_memory(h, chg); |
3005 | if (ret < 0) { | 3019 | if (ret < 0) { |
3006 | hugepage_subpool_put_pages(spool, chg); | 3020 | hugepage_subpool_put_pages(spool, chg); |
3007 | return ret; | 3021 | goto out_err; |
3008 | } | 3022 | } |
3009 | 3023 | ||
3010 | /* | 3024 | /* |
@@ -3021,6 +3035,10 @@ int hugetlb_reserve_pages(struct inode *inode, | |||
3021 | if (!vma || vma->vm_flags & VM_MAYSHARE) | 3035 | if (!vma || vma->vm_flags & VM_MAYSHARE) |
3022 | region_add(&inode->i_mapping->private_list, from, to); | 3036 | region_add(&inode->i_mapping->private_list, from, to); |
3023 | return 0; | 3037 | return 0; |
3038 | out_err: | ||
3039 | if (vma) | ||
3040 | resv_map_put(vma); | ||
3041 | return ret; | ||
3024 | } | 3042 | } |
3025 | 3043 | ||
3026 | void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed) | 3044 | void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed) |
diff --git a/mm/internal.h b/mm/internal.h index 2189af491783..2ba87fbfb75b 100644 --- a/mm/internal.h +++ b/mm/internal.h | |||
@@ -100,6 +100,39 @@ extern void prep_compound_page(struct page *page, unsigned long order); | |||
100 | extern bool is_free_buddy_page(struct page *page); | 100 | extern bool is_free_buddy_page(struct page *page); |
101 | #endif | 101 | #endif |
102 | 102 | ||
103 | #if defined CONFIG_COMPACTION || defined CONFIG_CMA | ||
104 | |||
105 | /* | ||
106 | * in mm/compaction.c | ||
107 | */ | ||
108 | /* | ||
109 | * compact_control is used to track pages being migrated and the free pages | ||
110 | * they are being migrated to during memory compaction. The free_pfn starts | ||
111 | * at the end of a zone and migrate_pfn begins at the start. Movable pages | ||
112 | * are moved to the end of a zone during a compaction run and the run | ||
113 | * completes when free_pfn <= migrate_pfn | ||
114 | */ | ||
115 | struct compact_control { | ||
116 | struct list_head freepages; /* List of free pages to migrate to */ | ||
117 | struct list_head migratepages; /* List of pages being migrated */ | ||
118 | unsigned long nr_freepages; /* Number of isolated free pages */ | ||
119 | unsigned long nr_migratepages; /* Number of pages to migrate */ | ||
120 | unsigned long free_pfn; /* isolate_freepages search base */ | ||
121 | unsigned long migrate_pfn; /* isolate_migratepages search base */ | ||
122 | bool sync; /* Synchronous migration */ | ||
123 | |||
124 | int order; /* order a direct compactor needs */ | ||
125 | int migratetype; /* MOVABLE, RECLAIMABLE etc */ | ||
126 | struct zone *zone; | ||
127 | }; | ||
128 | |||
129 | unsigned long | ||
130 | isolate_freepages_range(unsigned long start_pfn, unsigned long end_pfn); | ||
131 | unsigned long | ||
132 | isolate_migratepages_range(struct zone *zone, struct compact_control *cc, | ||
133 | unsigned long low_pfn, unsigned long end_pfn); | ||
134 | |||
135 | #endif | ||
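The comment above says the free scanner starts at the end of a zone and the migrate scanner at the start, and that a run finishes when they meet. A minimal sketch of how a caller might seed a compact_control to match that description; the helper name is hypothetical and the real setup lives in mm/compaction.c, which this hunk does not show.

/* Sketch: seed the two scanners at opposite ends of the zone. */
static void example_init_compact_control(struct zone *zone,
					 struct compact_control *cc,
					 int order, bool sync)
{
	cc->zone = zone;
	cc->order = order;
	cc->sync = sync;
	cc->nr_freepages = 0;
	cc->nr_migratepages = 0;
	INIT_LIST_HEAD(&cc->freepages);
	INIT_LIST_HEAD(&cc->migratepages);
	/* migrate scanner walks up from the zone start ... */
	cc->migrate_pfn = zone->zone_start_pfn;
	/* ... free scanner walks down from the zone end */
	cc->free_pfn = zone->zone_start_pfn + zone->spanned_pages;
}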
103 | 136 | ||
104 | /* | 137 | /* |
105 | * function for dealing with page's order in buddy system. | 138 | * function for dealing with page's order in buddy system. |
@@ -131,7 +164,8 @@ static inline void munlock_vma_pages_all(struct vm_area_struct *vma) | |||
131 | * to determine if it's being mapped into a LOCKED vma. | 164 | * to determine if it's being mapped into a LOCKED vma. |
132 | * If so, mark page as mlocked. | 165 | * If so, mark page as mlocked. |
133 | */ | 166 | */ |
134 | static inline int is_mlocked_vma(struct vm_area_struct *vma, struct page *page) | 167 | static inline int mlocked_vma_newpage(struct vm_area_struct *vma, |
168 | struct page *page) | ||
135 | { | 169 | { |
136 | VM_BUG_ON(PageLRU(page)); | 170 | VM_BUG_ON(PageLRU(page)); |
137 | 171 | ||
@@ -189,7 +223,7 @@ extern unsigned long vma_address(struct page *page, | |||
189 | struct vm_area_struct *vma); | 223 | struct vm_area_struct *vma); |
190 | #endif | 224 | #endif |
191 | #else /* !CONFIG_MMU */ | 225 | #else /* !CONFIG_MMU */ |
192 | static inline int is_mlocked_vma(struct vm_area_struct *v, struct page *p) | 226 | static inline int mlocked_vma_newpage(struct vm_area_struct *v, struct page *p) |
193 | { | 227 | { |
194 | return 0; | 228 | return 0; |
195 | } | 229 | } |
@@ -309,3 +343,7 @@ extern u64 hwpoison_filter_flags_mask; | |||
309 | extern u64 hwpoison_filter_flags_value; | 343 | extern u64 hwpoison_filter_flags_value; |
310 | extern u64 hwpoison_filter_memcg; | 344 | extern u64 hwpoison_filter_memcg; |
311 | extern u32 hwpoison_filter_enable; | 345 | extern u32 hwpoison_filter_enable; |
346 | |||
347 | extern unsigned long vm_mmap_pgoff(struct file *, unsigned long, | ||
348 | unsigned long, unsigned long, | ||
349 | unsigned long, unsigned long); | ||
diff --git a/mm/madvise.c b/mm/madvise.c index 1ccbba5b6674..14d260fa0d17 100644 --- a/mm/madvise.c +++ b/mm/madvise.c | |||
@@ -11,8 +11,11 @@ | |||
11 | #include <linux/mempolicy.h> | 11 | #include <linux/mempolicy.h> |
12 | #include <linux/page-isolation.h> | 12 | #include <linux/page-isolation.h> |
13 | #include <linux/hugetlb.h> | 13 | #include <linux/hugetlb.h> |
14 | #include <linux/falloc.h> | ||
14 | #include <linux/sched.h> | 15 | #include <linux/sched.h> |
15 | #include <linux/ksm.h> | 16 | #include <linux/ksm.h> |
17 | #include <linux/fs.h> | ||
18 | #include <linux/file.h> | ||
16 | 19 | ||
17 | /* | 20 | /* |
18 | * Any behaviour which results in changes to the vma->vm_flags needs to | 21 | * Any behaviour which results in changes to the vma->vm_flags needs to |
@@ -200,33 +203,39 @@ static long madvise_remove(struct vm_area_struct *vma, | |||
200 | struct vm_area_struct **prev, | 203 | struct vm_area_struct **prev, |
201 | unsigned long start, unsigned long end) | 204 | unsigned long start, unsigned long end) |
202 | { | 205 | { |
203 | struct address_space *mapping; | 206 | loff_t offset; |
204 | loff_t offset, endoff; | ||
205 | int error; | 207 | int error; |
208 | struct file *f; | ||
206 | 209 | ||
207 | *prev = NULL; /* tell sys_madvise we drop mmap_sem */ | 210 | *prev = NULL; /* tell sys_madvise we drop mmap_sem */ |
208 | 211 | ||
209 | if (vma->vm_flags & (VM_LOCKED|VM_NONLINEAR|VM_HUGETLB)) | 212 | if (vma->vm_flags & (VM_LOCKED|VM_NONLINEAR|VM_HUGETLB)) |
210 | return -EINVAL; | 213 | return -EINVAL; |
211 | 214 | ||
212 | if (!vma->vm_file || !vma->vm_file->f_mapping | 215 | f = vma->vm_file; |
213 | || !vma->vm_file->f_mapping->host) { | 216 | |
217 | if (!f || !f->f_mapping || !f->f_mapping->host) { | ||
214 | return -EINVAL; | 218 | return -EINVAL; |
215 | } | 219 | } |
216 | 220 | ||
217 | if ((vma->vm_flags & (VM_SHARED|VM_WRITE)) != (VM_SHARED|VM_WRITE)) | 221 | if ((vma->vm_flags & (VM_SHARED|VM_WRITE)) != (VM_SHARED|VM_WRITE)) |
218 | return -EACCES; | 222 | return -EACCES; |
219 | 223 | ||
220 | mapping = vma->vm_file->f_mapping; | ||
221 | |||
222 | offset = (loff_t)(start - vma->vm_start) | 224 | offset = (loff_t)(start - vma->vm_start) |
223 | + ((loff_t)vma->vm_pgoff << PAGE_SHIFT); | 225 | + ((loff_t)vma->vm_pgoff << PAGE_SHIFT); |
224 | endoff = (loff_t)(end - vma->vm_start - 1) | ||
225 | + ((loff_t)vma->vm_pgoff << PAGE_SHIFT); | ||
226 | 226 | ||
227 | /* vmtruncate_range needs to take i_mutex */ | 227 | /* |
228 | * Filesystem's fallocate may need to take i_mutex. We need to | ||
229 | * explicitly grab a reference because the vma (and hence the | ||
230 | * vma's reference to the file) can go away as soon as we drop | ||
231 | * mmap_sem. | ||
232 | */ | ||
233 | get_file(f); | ||
228 | up_read(¤t->mm->mmap_sem); | 234 | up_read(¤t->mm->mmap_sem); |
229 | error = vmtruncate_range(mapping->host, offset, endoff); | 235 | error = do_fallocate(f, |
236 | FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, | ||
237 | offset, end - start); | ||
238 | fput(f); | ||
230 | down_read(¤t->mm->mmap_sem); | 239 | down_read(¤t->mm->mmap_sem); |
231 | return error; | 240 | return error; |
232 | } | 241 | } |
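With this change MADV_REMOVE punches a hole in the backing file via do_fallocate() instead of vmtruncate_range(), so it now works on any filesystem whose fallocate supports FALLOC_FL_PUNCH_HOLE. The userspace call is unchanged; a small illustrative sketch on a shared file-backed mapping (the mapping itself is assumed to exist):

/* Illustrative userspace use of MADV_REMOVE on a shared file-backed mapping. */
#include <sys/mman.h>
#include <stdio.h>

static int punch_range(void *addr, size_t len)
{
	/* frees the backing blocks; later reads of the range return zeroes */
	if (madvise(addr, len, MADV_REMOVE) != 0) {
		perror("madvise(MADV_REMOVE)");
		return -1;
	}
	return 0;
}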
diff --git a/mm/memblock.c b/mm/memblock.c index a44eab3157f8..5cc6731b00cc 100644 --- a/mm/memblock.c +++ b/mm/memblock.c | |||
@@ -37,6 +37,8 @@ struct memblock memblock __initdata_memblock = { | |||
37 | 37 | ||
38 | int memblock_debug __initdata_memblock; | 38 | int memblock_debug __initdata_memblock; |
39 | static int memblock_can_resize __initdata_memblock; | 39 | static int memblock_can_resize __initdata_memblock; |
40 | static int memblock_memory_in_slab __initdata_memblock = 0; | ||
41 | static int memblock_reserved_in_slab __initdata_memblock = 0; | ||
40 | 42 | ||
41 | /* inline so we don't get a warning when pr_debug is compiled out */ | 43 | /* inline so we don't get a warning when pr_debug is compiled out */ |
42 | static inline const char *memblock_type_name(struct memblock_type *type) | 44 | static inline const char *memblock_type_name(struct memblock_type *type) |
@@ -141,30 +143,6 @@ phys_addr_t __init_memblock memblock_find_in_range(phys_addr_t start, | |||
141 | MAX_NUMNODES); | 143 | MAX_NUMNODES); |
142 | } | 144 | } |
143 | 145 | ||
144 | /* | ||
145 | * Free memblock.reserved.regions | ||
146 | */ | ||
147 | int __init_memblock memblock_free_reserved_regions(void) | ||
148 | { | ||
149 | if (memblock.reserved.regions == memblock_reserved_init_regions) | ||
150 | return 0; | ||
151 | |||
152 | return memblock_free(__pa(memblock.reserved.regions), | ||
153 | sizeof(struct memblock_region) * memblock.reserved.max); | ||
154 | } | ||
155 | |||
156 | /* | ||
157 | * Reserve memblock.reserved.regions | ||
158 | */ | ||
159 | int __init_memblock memblock_reserve_reserved_regions(void) | ||
160 | { | ||
161 | if (memblock.reserved.regions == memblock_reserved_init_regions) | ||
162 | return 0; | ||
163 | |||
164 | return memblock_reserve(__pa(memblock.reserved.regions), | ||
165 | sizeof(struct memblock_region) * memblock.reserved.max); | ||
166 | } | ||
167 | |||
168 | static void __init_memblock memblock_remove_region(struct memblock_type *type, unsigned long r) | 146 | static void __init_memblock memblock_remove_region(struct memblock_type *type, unsigned long r) |
169 | { | 147 | { |
170 | type->total_size -= type->regions[r].size; | 148 | type->total_size -= type->regions[r].size; |
@@ -182,11 +160,42 @@ static void __init_memblock memblock_remove_region(struct memblock_type *type, u | |||
182 | } | 160 | } |
183 | } | 161 | } |
184 | 162 | ||
185 | static int __init_memblock memblock_double_array(struct memblock_type *type) | 163 | phys_addr_t __init_memblock get_allocated_memblock_reserved_regions_info( |
164 | phys_addr_t *addr) | ||
165 | { | ||
166 | if (memblock.reserved.regions == memblock_reserved_init_regions) | ||
167 | return 0; | ||
168 | |||
169 | *addr = __pa(memblock.reserved.regions); | ||
170 | |||
171 | return PAGE_ALIGN(sizeof(struct memblock_region) * | ||
172 | memblock.reserved.max); | ||
173 | } | ||
174 | |||
175 | /** | ||
176 | * memblock_double_array - double the size of the memblock regions array | ||
177 | * @type: memblock type of the regions array being doubled | ||
178 | * @new_area_start: starting address of memory range to avoid overlap with | ||
179 | * @new_area_size: size of memory range to avoid overlap with | ||
180 | * | ||
181 | * Double the size of the @type regions array. If memblock is being used to | ||
182 | * allocate memory for a new reserved regions array and there is a previously | ||
183 | * allocated memory range [@new_area_start,@new_area_start+@new_area_size] | ||
184 | * waiting to be reserved, ensure the memory used by the new array does | ||
185 | * not overlap. | ||
186 | * | ||
187 | * RETURNS: | ||
188 | * 0 on success, -1 on failure. | ||
189 | */ | ||
190 | static int __init_memblock memblock_double_array(struct memblock_type *type, | ||
191 | phys_addr_t new_area_start, | ||
192 | phys_addr_t new_area_size) | ||
186 | { | 193 | { |
187 | struct memblock_region *new_array, *old_array; | 194 | struct memblock_region *new_array, *old_array; |
195 | phys_addr_t old_alloc_size, new_alloc_size; | ||
188 | phys_addr_t old_size, new_size, addr; | 196 | phys_addr_t old_size, new_size, addr; |
189 | int use_slab = slab_is_available(); | 197 | int use_slab = slab_is_available(); |
198 | int *in_slab; | ||
190 | 199 | ||
191 | /* We don't allow resizing until we know about the reserved regions | 200 | /* We don't allow resizing until we know about the reserved regions |
192 | * of memory that aren't suitable for allocation | 201 | * of memory that aren't suitable for allocation |
@@ -197,6 +206,18 @@ static int __init_memblock memblock_double_array(struct memblock_type *type) | |||
197 | /* Calculate new doubled size */ | 206 | /* Calculate new doubled size */ |
198 | old_size = type->max * sizeof(struct memblock_region); | 207 | old_size = type->max * sizeof(struct memblock_region); |
199 | new_size = old_size << 1; | 208 | new_size = old_size << 1; |
209 | /* | ||
210 | * We need to allocate the new array aligned to PAGE_SIZE, | ||
211 | * so we can free it completely later. | ||
212 | */ | ||
213 | old_alloc_size = PAGE_ALIGN(old_size); | ||
214 | new_alloc_size = PAGE_ALIGN(new_size); | ||
215 | |||
216 | /* Retrieve the slab flag */ | ||
217 | if (type == &memblock.memory) | ||
218 | in_slab = &memblock_memory_in_slab; | ||
219 | else | ||
220 | in_slab = &memblock_reserved_in_slab; | ||
200 | 221 | ||
201 | /* Try to find some space for it. | 222 | /* Try to find some space for it. |
202 | * | 223 | * |
@@ -212,14 +233,26 @@ static int __init_memblock memblock_double_array(struct memblock_type *type) | |||
212 | if (use_slab) { | 233 | if (use_slab) { |
213 | new_array = kmalloc(new_size, GFP_KERNEL); | 234 | new_array = kmalloc(new_size, GFP_KERNEL); |
214 | addr = new_array ? __pa(new_array) : 0; | 235 | addr = new_array ? __pa(new_array) : 0; |
215 | } else | 236 | } else { |
216 | addr = memblock_find_in_range(0, MEMBLOCK_ALLOC_ACCESSIBLE, new_size, sizeof(phys_addr_t)); | 237 | /* only exclude range when trying to double reserved.regions */ |
238 | if (type != &memblock.reserved) | ||
239 | new_area_start = new_area_size = 0; | ||
240 | |||
241 | addr = memblock_find_in_range(new_area_start + new_area_size, | ||
242 | memblock.current_limit, | ||
243 | new_alloc_size, PAGE_SIZE); | ||
244 | if (!addr && new_area_size) | ||
245 | addr = memblock_find_in_range(0, | ||
246 | min(new_area_start, memblock.current_limit), | ||
247 | new_alloc_size, PAGE_SIZE); | ||
248 | |||
249 | new_array = addr ? __va(addr) : 0; | ||
250 | } | ||
217 | if (!addr) { | 251 | if (!addr) { |
218 | pr_err("memblock: Failed to double %s array from %ld to %ld entries !\n", | 252 | pr_err("memblock: Failed to double %s array from %ld to %ld entries !\n", |
219 | memblock_type_name(type), type->max, type->max * 2); | 253 | memblock_type_name(type), type->max, type->max * 2); |
220 | return -1; | 254 | return -1; |
221 | } | 255 | } |
222 | new_array = __va(addr); | ||
223 | 256 | ||
224 | memblock_dbg("memblock: %s array is doubled to %ld at [%#010llx-%#010llx]", | 257 | memblock_dbg("memblock: %s array is doubled to %ld at [%#010llx-%#010llx]", |
225 | memblock_type_name(type), type->max * 2, (u64)addr, (u64)addr + new_size - 1); | 258 | memblock_type_name(type), type->max * 2, (u64)addr, (u64)addr + new_size - 1); |
@@ -234,21 +267,23 @@ static int __init_memblock memblock_double_array(struct memblock_type *type) | |||
234 | type->regions = new_array; | 267 | type->regions = new_array; |
235 | type->max <<= 1; | 268 | type->max <<= 1; |
236 | 269 | ||
237 | /* If we use SLAB that's it, we are done */ | 270 | /* Free old array. We needn't free it if the array is the |
238 | if (use_slab) | 271 | * static one |
239 | return 0; | ||
240 | |||
241 | /* Add the new reserved region now. Should not fail ! */ | ||
242 | BUG_ON(memblock_reserve(addr, new_size)); | ||
243 | |||
244 | /* If the array wasn't our static init one, then free it. We only do | ||
245 | * that before SLAB is available as later on, we don't know whether | ||
246 | * to use kfree or free_bootmem_pages(). Shouldn't be a big deal | ||
247 | * anyways | ||
248 | */ | 272 | */ |
249 | if (old_array != memblock_memory_init_regions && | 273 | if (*in_slab) |
250 | old_array != memblock_reserved_init_regions) | 274 | kfree(old_array); |
251 | memblock_free(__pa(old_array), old_size); | 275 | else if (old_array != memblock_memory_init_regions && |
276 | old_array != memblock_reserved_init_regions) | ||
277 | memblock_free(__pa(old_array), old_alloc_size); | ||
278 | |||
279 | /* Reserve the new array if it came from memblock. | ||
280 | * Otherwise, we needn't do it. | ||
281 | */ | ||
282 | if (!use_slab) | ||
283 | BUG_ON(memblock_reserve(addr, new_alloc_size)); | ||
284 | |||
285 | /* Update slab flag */ | ||
286 | *in_slab = use_slab; | ||
252 | 287 | ||
253 | return 0; | 288 | return 0; |
254 | } | 289 | } |
@@ -387,7 +422,7 @@ repeat: | |||
387 | */ | 422 | */ |
388 | if (!insert) { | 423 | if (!insert) { |
389 | while (type->cnt + nr_new > type->max) | 424 | while (type->cnt + nr_new > type->max) |
390 | if (memblock_double_array(type) < 0) | 425 | if (memblock_double_array(type, obase, size) < 0) |
391 | return -ENOMEM; | 426 | return -ENOMEM; |
392 | insert = true; | 427 | insert = true; |
393 | goto repeat; | 428 | goto repeat; |
@@ -438,7 +473,7 @@ static int __init_memblock memblock_isolate_range(struct memblock_type *type, | |||
438 | 473 | ||
439 | /* we'll create at most two more regions */ | 474 | /* we'll create at most two more regions */ |
440 | while (type->cnt + 2 > type->max) | 475 | while (type->cnt + 2 > type->max) |
441 | if (memblock_double_array(type) < 0) | 476 | if (memblock_double_array(type, base, size) < 0) |
442 | return -ENOMEM; | 477 | return -ENOMEM; |
443 | 478 | ||
444 | for (i = 0; i < type->cnt; i++) { | 479 | for (i = 0; i < type->cnt; i++) { |
@@ -528,9 +563,9 @@ int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size) | |||
528 | * __next_free_mem_range - next function for for_each_free_mem_range() | 563 | * __next_free_mem_range - next function for for_each_free_mem_range() |
529 | * @idx: pointer to u64 loop variable | 564 | * @idx: pointer to u64 loop variable |
530 | * @nid: nid: node selector, %MAX_NUMNODES for all nodes | 565 | * @nid: nid: node selector, %MAX_NUMNODES for all nodes |
531 | * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL | 566 | * @out_start: ptr to phys_addr_t for start address of the range, can be %NULL |
532 | * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL | 567 | * @out_end: ptr to phys_addr_t for end address of the range, can be %NULL |
533 | * @p_nid: ptr to int for nid of the range, can be %NULL | 568 | * @out_nid: ptr to int for nid of the range, can be %NULL |
534 | * | 569 | * |
535 | * Find the first free area from *@idx which matches @nid, fill the out | 570 | * Find the first free area from *@idx which matches @nid, fill the out |
536 | * parameters, and update *@idx for the next iteration. The lower 32bit of | 571 | * parameters, and update *@idx for the next iteration. The lower 32bit of |
@@ -604,9 +639,9 @@ void __init_memblock __next_free_mem_range(u64 *idx, int nid, | |||
604 | * __next_free_mem_range_rev - next function for for_each_free_mem_range_reverse() | 639 | * __next_free_mem_range_rev - next function for for_each_free_mem_range_reverse() |
605 | * @idx: pointer to u64 loop variable | 640 | * @idx: pointer to u64 loop variable |
606 | * @nid: nid: node selector, %MAX_NUMNODES for all nodes | 641 | * @nid: nid: node selector, %MAX_NUMNODES for all nodes |
607 | * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL | 642 | * @out_start: ptr to phys_addr_t for start address of the range, can be %NULL |
608 | * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL | 643 | * @out_end: ptr to phys_addr_t for end address of the range, can be %NULL |
609 | * @p_nid: ptr to int for nid of the range, can be %NULL | 644 | * @out_nid: ptr to int for nid of the range, can be %NULL |
610 | * | 645 | * |
611 | * Reverse of __next_free_mem_range(). | 646 | * Reverse of __next_free_mem_range(). |
612 | */ | 647 | */ |
@@ -855,6 +890,16 @@ int __init_memblock memblock_is_memory(phys_addr_t addr) | |||
855 | return memblock_search(&memblock.memory, addr) != -1; | 890 | return memblock_search(&memblock.memory, addr) != -1; |
856 | } | 891 | } |
857 | 892 | ||
893 | /** | ||
894 | * memblock_is_region_memory - check if a region is a subset of memory | ||
895 | * @base: base of region to check | ||
896 | * @size: size of region to check | ||
897 | * | ||
898 | * Check if the region [@base, @base+@size) is a subset of a memory block. | ||
899 | * | ||
900 | * RETURNS: | ||
901 | * 0 if false, non-zero if true | ||
902 | */ | ||
858 | int __init_memblock memblock_is_region_memory(phys_addr_t base, phys_addr_t size) | 903 | int __init_memblock memblock_is_region_memory(phys_addr_t base, phys_addr_t size) |
859 | { | 904 | { |
860 | int idx = memblock_search(&memblock.memory, base); | 905 | int idx = memblock_search(&memblock.memory, base); |
@@ -867,6 +912,16 @@ int __init_memblock memblock_is_region_memory(phys_addr_t base, phys_addr_t size | |||
867 | memblock.memory.regions[idx].size) >= end; | 912 | memblock.memory.regions[idx].size) >= end; |
868 | } | 913 | } |
869 | 914 | ||
915 | /** | ||
916 | * memblock_is_region_reserved - check if a region intersects reserved memory | ||
917 | * @base: base of region to check | ||
918 | * @size: size of region to check | ||
919 | * | ||
920 | * Check if the region [@base, @base+@size) intersects a reserved memory block. | ||
921 | * | ||
922 | * RETURNS: | ||
923 | * 0 if false, non-zero if true | ||
924 | */ | ||
870 | int __init_memblock memblock_is_region_reserved(phys_addr_t base, phys_addr_t size) | 925 | int __init_memblock memblock_is_region_reserved(phys_addr_t base, phys_addr_t size) |
871 | { | 926 | { |
872 | memblock_cap_size(base, &size); | 927 | memblock_cap_size(base, &size); |
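The two kernel-doc blocks added above pin down the difference between the helpers: subset-of-memory versus intersects-reserved. A hedged sketch of the usual early-boot pattern built on them; the wrapper function and its error codes are illustrative, not from this patch.

/* Sketch: reserve a firmware-described range only if it is sane. */
static int __init example_reserve_range(phys_addr_t base, phys_addr_t size)
{
	if (!memblock_is_region_memory(base, size))	/* must be a subset of memory */
		return -EINVAL;
	if (memblock_is_region_reserved(base, size))	/* must not intersect reserved */
		return -EBUSY;
	return memblock_reserve(base, size);
}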
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 7685d4a0b3ce..f72b5e52451a 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c | |||
@@ -59,7 +59,7 @@ | |||
59 | 59 | ||
60 | struct cgroup_subsys mem_cgroup_subsys __read_mostly; | 60 | struct cgroup_subsys mem_cgroup_subsys __read_mostly; |
61 | #define MEM_CGROUP_RECLAIM_RETRIES 5 | 61 | #define MEM_CGROUP_RECLAIM_RETRIES 5 |
62 | struct mem_cgroup *root_mem_cgroup __read_mostly; | 62 | static struct mem_cgroup *root_mem_cgroup __read_mostly; |
63 | 63 | ||
64 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP | 64 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP |
65 | /* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */ | 65 | /* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */ |
@@ -73,7 +73,7 @@ static int really_do_swap_account __initdata = 0; | |||
73 | #endif | 73 | #endif |
74 | 74 | ||
75 | #else | 75 | #else |
76 | #define do_swap_account (0) | 76 | #define do_swap_account 0 |
77 | #endif | 77 | #endif |
78 | 78 | ||
79 | 79 | ||
@@ -88,18 +88,31 @@ enum mem_cgroup_stat_index { | |||
88 | MEM_CGROUP_STAT_RSS, /* # of pages charged as anon rss */ | 88 | MEM_CGROUP_STAT_RSS, /* # of pages charged as anon rss */ |
89 | MEM_CGROUP_STAT_FILE_MAPPED, /* # of pages charged as file rss */ | 89 | MEM_CGROUP_STAT_FILE_MAPPED, /* # of pages charged as file rss */ |
90 | MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */ | 90 | MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */ |
91 | MEM_CGROUP_STAT_DATA, /* end of data requires synchronization */ | ||
92 | MEM_CGROUP_STAT_NSTATS, | 91 | MEM_CGROUP_STAT_NSTATS, |
93 | }; | 92 | }; |
94 | 93 | ||
94 | static const char * const mem_cgroup_stat_names[] = { | ||
95 | "cache", | ||
96 | "rss", | ||
97 | "mapped_file", | ||
98 | "swap", | ||
99 | }; | ||
100 | |||
95 | enum mem_cgroup_events_index { | 101 | enum mem_cgroup_events_index { |
96 | MEM_CGROUP_EVENTS_PGPGIN, /* # of pages paged in */ | 102 | MEM_CGROUP_EVENTS_PGPGIN, /* # of pages paged in */ |
97 | MEM_CGROUP_EVENTS_PGPGOUT, /* # of pages paged out */ | 103 | MEM_CGROUP_EVENTS_PGPGOUT, /* # of pages paged out */ |
98 | MEM_CGROUP_EVENTS_COUNT, /* # of pages paged in/out */ | ||
99 | MEM_CGROUP_EVENTS_PGFAULT, /* # of page-faults */ | 104 | MEM_CGROUP_EVENTS_PGFAULT, /* # of page-faults */ |
100 | MEM_CGROUP_EVENTS_PGMAJFAULT, /* # of major page-faults */ | 105 | MEM_CGROUP_EVENTS_PGMAJFAULT, /* # of major page-faults */ |
101 | MEM_CGROUP_EVENTS_NSTATS, | 106 | MEM_CGROUP_EVENTS_NSTATS, |
102 | }; | 107 | }; |
108 | |||
109 | static const char * const mem_cgroup_events_names[] = { | ||
110 | "pgpgin", | ||
111 | "pgpgout", | ||
112 | "pgfault", | ||
113 | "pgmajfault", | ||
114 | }; | ||
115 | |||
103 | /* | 116 | /* |
104 | * Per memcg event counter is incremented at every pagein/pageout. With THP, | 117 | * Per memcg event counter is incremented at every pagein/pageout. With THP, |
105 | * it will be incremented by the number of pages. This counter is used for | 118 | * it will be incremented by the number of pages. This counter is used for |
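The two name arrays introduced above pair index-for-index with their enums, so a stats dump only needs one table per counter class. A minimal sketch of that pairing, assuming mem_cgroup_read_stat()/mem_cgroup_read_events()-style accessors for the per-CPU counters (the accessors are not part of this hunk):

/* Sketch: the names line up with the enum indices they describe. */
static void example_dump_stats(struct seq_file *m, struct mem_cgroup *memcg)
{
	int i;

	for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++)
		seq_printf(m, "%s %ld\n", mem_cgroup_stat_names[i],
			   mem_cgroup_read_stat(memcg, i));
	for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++)
		seq_printf(m, "%s %lu\n", mem_cgroup_events_names[i],
			   mem_cgroup_read_events(memcg, i));
}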
@@ -112,13 +125,14 @@ enum mem_cgroup_events_target { | |||
112 | MEM_CGROUP_TARGET_NUMAINFO, | 125 | MEM_CGROUP_TARGET_NUMAINFO, |
113 | MEM_CGROUP_NTARGETS, | 126 | MEM_CGROUP_NTARGETS, |
114 | }; | 127 | }; |
115 | #define THRESHOLDS_EVENTS_TARGET (128) | 128 | #define THRESHOLDS_EVENTS_TARGET 128 |
116 | #define SOFTLIMIT_EVENTS_TARGET (1024) | 129 | #define SOFTLIMIT_EVENTS_TARGET 1024 |
117 | #define NUMAINFO_EVENTS_TARGET (1024) | 130 | #define NUMAINFO_EVENTS_TARGET 1024 |
118 | 131 | ||
119 | struct mem_cgroup_stat_cpu { | 132 | struct mem_cgroup_stat_cpu { |
120 | long count[MEM_CGROUP_STAT_NSTATS]; | 133 | long count[MEM_CGROUP_STAT_NSTATS]; |
121 | unsigned long events[MEM_CGROUP_EVENTS_NSTATS]; | 134 | unsigned long events[MEM_CGROUP_EVENTS_NSTATS]; |
135 | unsigned long nr_page_events; | ||
122 | unsigned long targets[MEM_CGROUP_NTARGETS]; | 136 | unsigned long targets[MEM_CGROUP_NTARGETS]; |
123 | }; | 137 | }; |
124 | 138 | ||
@@ -138,7 +152,6 @@ struct mem_cgroup_per_zone { | |||
138 | 152 | ||
139 | struct mem_cgroup_reclaim_iter reclaim_iter[DEF_PRIORITY + 1]; | 153 | struct mem_cgroup_reclaim_iter reclaim_iter[DEF_PRIORITY + 1]; |
140 | 154 | ||
141 | struct zone_reclaim_stat reclaim_stat; | ||
142 | struct rb_node tree_node; /* RB tree node */ | 155 | struct rb_node tree_node; /* RB tree node */ |
143 | unsigned long long usage_in_excess;/* Set to the value by which */ | 156 | unsigned long long usage_in_excess;/* Set to the value by which */ |
144 | /* the soft limit is exceeded*/ | 157 | /* the soft limit is exceeded*/ |
@@ -182,7 +195,7 @@ struct mem_cgroup_threshold { | |||
182 | 195 | ||
183 | /* For threshold */ | 196 | /* For threshold */ |
184 | struct mem_cgroup_threshold_ary { | 197 | struct mem_cgroup_threshold_ary { |
185 | /* An array index points to threshold just below usage. */ | 198 | /* An array index points to threshold just below or equal to usage. */ |
186 | int current_threshold; | 199 | int current_threshold; |
187 | /* Size of entries[] */ | 200 | /* Size of entries[] */ |
188 | unsigned int size; | 201 | unsigned int size; |
@@ -245,8 +258,8 @@ struct mem_cgroup { | |||
245 | */ | 258 | */ |
246 | struct rcu_head rcu_freeing; | 259 | struct rcu_head rcu_freeing; |
247 | /* | 260 | /* |
248 | * But when using vfree(), that cannot be done at | 261 | * We also need some space for a worker in deferred freeing. |
249 | * interrupt time, so we must then queue the work. | 262 | * By the time we call it, rcu_freeing is no longer in use. |
250 | */ | 263 | */ |
251 | struct work_struct work_freeing; | 264 | struct work_struct work_freeing; |
252 | }; | 265 | }; |
@@ -305,7 +318,7 @@ struct mem_cgroup { | |||
305 | /* | 318 | /* |
306 | * percpu counter. | 319 | * percpu counter. |
307 | */ | 320 | */ |
308 | struct mem_cgroup_stat_cpu *stat; | 321 | struct mem_cgroup_stat_cpu __percpu *stat; |
309 | /* | 322 | /* |
310 | * used when a cpu is offlined or other synchronizations | 323 | * used when a cpu is offlined or other synchronizations |
311 | * See mem_cgroup_read_stat(). | 324 | * See mem_cgroup_read_stat(). |
@@ -360,8 +373,8 @@ static bool move_file(void) | |||
360 | * Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft | 373 | * Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft |
361 | * limit reclaim to prevent infinite loops, if they ever occur. | 374 | * limit reclaim to prevent infinite loops, if they ever occur. |
362 | */ | 375 | */ |
363 | #define MEM_CGROUP_MAX_RECLAIM_LOOPS (100) | 376 | #define MEM_CGROUP_MAX_RECLAIM_LOOPS 100 |
364 | #define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS (2) | 377 | #define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS 2 |
365 | 378 | ||
366 | enum charge_type { | 379 | enum charge_type { |
367 | MEM_CGROUP_CHARGE_TYPE_CACHE = 0, | 380 | MEM_CGROUP_CHARGE_TYPE_CACHE = 0, |
@@ -377,8 +390,8 @@ enum charge_type { | |||
377 | #define _MEM (0) | 390 | #define _MEM (0) |
378 | #define _MEMSWAP (1) | 391 | #define _MEMSWAP (1) |
379 | #define _OOM_TYPE (2) | 392 | #define _OOM_TYPE (2) |
380 | #define MEMFILE_PRIVATE(x, val) (((x) << 16) | (val)) | 393 | #define MEMFILE_PRIVATE(x, val) ((x) << 16 | (val)) |
381 | #define MEMFILE_TYPE(val) (((val) >> 16) & 0xffff) | 394 | #define MEMFILE_TYPE(val) ((val) >> 16 & 0xffff) |
382 | #define MEMFILE_ATTR(val) ((val) & 0xffff) | 395 | #define MEMFILE_ATTR(val) ((val) & 0xffff) |
383 | /* Used for OOM notifier */ | 396 | /* Used for OOM notifier */ |
384 | #define OOM_CONTROL (0) | 397 | #define OOM_CONTROL (0) |
@@ -404,6 +417,7 @@ void sock_update_memcg(struct sock *sk) | |||
404 | { | 417 | { |
405 | if (mem_cgroup_sockets_enabled) { | 418 | if (mem_cgroup_sockets_enabled) { |
406 | struct mem_cgroup *memcg; | 419 | struct mem_cgroup *memcg; |
420 | struct cg_proto *cg_proto; | ||
407 | 421 | ||
408 | BUG_ON(!sk->sk_prot->proto_cgroup); | 422 | BUG_ON(!sk->sk_prot->proto_cgroup); |
409 | 423 | ||
@@ -423,9 +437,10 @@ void sock_update_memcg(struct sock *sk) | |||
423 | 437 | ||
424 | rcu_read_lock(); | 438 | rcu_read_lock(); |
425 | memcg = mem_cgroup_from_task(current); | 439 | memcg = mem_cgroup_from_task(current); |
426 | if (!mem_cgroup_is_root(memcg)) { | 440 | cg_proto = sk->sk_prot->proto_cgroup(memcg); |
441 | if (!mem_cgroup_is_root(memcg) && memcg_proto_active(cg_proto)) { | ||
427 | mem_cgroup_get(memcg); | 442 | mem_cgroup_get(memcg); |
428 | sk->sk_cgrp = sk->sk_prot->proto_cgroup(memcg); | 443 | sk->sk_cgrp = cg_proto; |
429 | } | 444 | } |
430 | rcu_read_unlock(); | 445 | rcu_read_unlock(); |
431 | } | 446 | } |
@@ -454,6 +469,19 @@ EXPORT_SYMBOL(tcp_proto_cgroup); | |||
454 | #endif /* CONFIG_INET */ | 469 | #endif /* CONFIG_INET */ |
455 | #endif /* CONFIG_CGROUP_MEM_RES_CTLR_KMEM */ | 470 | #endif /* CONFIG_CGROUP_MEM_RES_CTLR_KMEM */ |
456 | 471 | ||
472 | #if defined(CONFIG_INET) && defined(CONFIG_CGROUP_MEM_RES_CTLR_KMEM) | ||
473 | static void disarm_sock_keys(struct mem_cgroup *memcg) | ||
474 | { | ||
475 | if (!memcg_proto_activated(&memcg->tcp_mem.cg_proto)) | ||
476 | return; | ||
477 | static_key_slow_dec(&memcg_socket_limit_enabled); | ||
478 | } | ||
479 | #else | ||
480 | static void disarm_sock_keys(struct mem_cgroup *memcg) | ||
481 | { | ||
482 | } | ||
483 | #endif | ||
484 | |||
457 | static void drain_all_stock_async(struct mem_cgroup *memcg); | 485 | static void drain_all_stock_async(struct mem_cgroup *memcg); |
458 | 486 | ||
459 | static struct mem_cgroup_per_zone * | 487 | static struct mem_cgroup_per_zone * |
@@ -718,12 +746,21 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg, | |||
718 | nr_pages = -nr_pages; /* for event */ | 746 | nr_pages = -nr_pages; /* for event */ |
719 | } | 747 | } |
720 | 748 | ||
721 | __this_cpu_add(memcg->stat->events[MEM_CGROUP_EVENTS_COUNT], nr_pages); | 749 | __this_cpu_add(memcg->stat->nr_page_events, nr_pages); |
722 | 750 | ||
723 | preempt_enable(); | 751 | preempt_enable(); |
724 | } | 752 | } |
725 | 753 | ||
726 | unsigned long | 754 | unsigned long |
755 | mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru) | ||
756 | { | ||
757 | struct mem_cgroup_per_zone *mz; | ||
758 | |||
759 | mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec); | ||
760 | return mz->lru_size[lru]; | ||
761 | } | ||
762 | |||
763 | static unsigned long | ||
727 | mem_cgroup_zone_nr_lru_pages(struct mem_cgroup *memcg, int nid, int zid, | 764 | mem_cgroup_zone_nr_lru_pages(struct mem_cgroup *memcg, int nid, int zid, |
728 | unsigned int lru_mask) | 765 | unsigned int lru_mask) |
729 | { | 766 | { |
@@ -770,7 +807,7 @@ static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg, | |||
770 | { | 807 | { |
771 | unsigned long val, next; | 808 | unsigned long val, next; |
772 | 809 | ||
773 | val = __this_cpu_read(memcg->stat->events[MEM_CGROUP_EVENTS_COUNT]); | 810 | val = __this_cpu_read(memcg->stat->nr_page_events); |
774 | next = __this_cpu_read(memcg->stat->targets[target]); | 811 | next = __this_cpu_read(memcg->stat->targets[target]); |
775 | /* from time_after() in jiffies.h */ | 812 | /* from time_after() in jiffies.h */ |
776 | if ((long)next - (long)val < 0) { | 813 | if ((long)next - (long)val < 0) { |
@@ -1013,7 +1050,7 @@ EXPORT_SYMBOL(mem_cgroup_count_vm_event); | |||
1013 | /** | 1050 | /** |
1014 | * mem_cgroup_zone_lruvec - get the lru list vector for a zone and memcg | 1051 | * mem_cgroup_zone_lruvec - get the lru list vector for a zone and memcg |
1015 | * @zone: zone of the wanted lruvec | 1052 | * @zone: zone of the wanted lruvec |
1016 | * @mem: memcg of the wanted lruvec | 1053 | * @memcg: memcg of the wanted lruvec |
1017 | * | 1054 | * |
1018 | * Returns the lru list vector holding pages for the given @zone and | 1055 | * Returns the lru list vector holding pages for the given @zone and |
1019 | * @mem. This can be the global zone lruvec, if the memory controller | 1056 | * @mem. This can be the global zone lruvec, if the memory controller |
@@ -1046,19 +1083,11 @@ struct lruvec *mem_cgroup_zone_lruvec(struct zone *zone, | |||
1046 | */ | 1083 | */ |
1047 | 1084 | ||
1048 | /** | 1085 | /** |
1049 | * mem_cgroup_lru_add_list - account for adding an lru page and return lruvec | 1086 | * mem_cgroup_page_lruvec - return lruvec for adding an lru page |
1050 | * @zone: zone of the page | ||
1051 | * @page: the page | 1087 | * @page: the page |
1052 | * @lru: current lru | 1088 | * @zone: zone of the page |
1053 | * | ||
1054 | * This function accounts for @page being added to @lru, and returns | ||
1055 | * the lruvec for the given @zone and the memcg @page is charged to. | ||
1056 | * | ||
1057 | * The callsite is then responsible for physically linking the page to | ||
1058 | * the returned lruvec->lists[@lru]. | ||
1059 | */ | 1089 | */ |
1060 | struct lruvec *mem_cgroup_lru_add_list(struct zone *zone, struct page *page, | 1090 | struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct zone *zone) |
1061 | enum lru_list lru) | ||
1062 | { | 1091 | { |
1063 | struct mem_cgroup_per_zone *mz; | 1092 | struct mem_cgroup_per_zone *mz; |
1064 | struct mem_cgroup *memcg; | 1093 | struct mem_cgroup *memcg; |
@@ -1071,7 +1100,7 @@ struct lruvec *mem_cgroup_lru_add_list(struct zone *zone, struct page *page, | |||
1071 | memcg = pc->mem_cgroup; | 1100 | memcg = pc->mem_cgroup; |
1072 | 1101 | ||
1073 | /* | 1102 | /* |
1074 | * Surreptitiously switch any uncharged page to root: | 1103 | * Surreptitiously switch any uncharged offlist page to root: |
1075 | * an uncharged page off lru does nothing to secure | 1104 | * an uncharged page off lru does nothing to secure |
1076 | * its former mem_cgroup from sudden removal. | 1105 | * its former mem_cgroup from sudden removal. |
1077 | * | 1106 | * |
@@ -1079,85 +1108,60 @@ struct lruvec *mem_cgroup_lru_add_list(struct zone *zone, struct page *page, | |||
1079 | * under page_cgroup lock: between them, they make all uses | 1108 | * under page_cgroup lock: between them, they make all uses |
1080 | * of pc->mem_cgroup safe. | 1109 | * of pc->mem_cgroup safe. |
1081 | */ | 1110 | */ |
1082 | if (!PageCgroupUsed(pc) && memcg != root_mem_cgroup) | 1111 | if (!PageLRU(page) && !PageCgroupUsed(pc) && memcg != root_mem_cgroup) |
1083 | pc->mem_cgroup = memcg = root_mem_cgroup; | 1112 | pc->mem_cgroup = memcg = root_mem_cgroup; |
1084 | 1113 | ||
1085 | mz = page_cgroup_zoneinfo(memcg, page); | 1114 | mz = page_cgroup_zoneinfo(memcg, page); |
1086 | /* compound_order() is stabilized through lru_lock */ | ||
1087 | mz->lru_size[lru] += 1 << compound_order(page); | ||
1088 | return &mz->lruvec; | 1115 | return &mz->lruvec; |
1089 | } | 1116 | } |
1090 | 1117 | ||
1091 | /** | 1118 | /** |
1092 | * mem_cgroup_lru_del_list - account for removing an lru page | 1119 | * mem_cgroup_update_lru_size - account for adding or removing an lru page |
1093 | * @page: the page | 1120 | * @lruvec: mem_cgroup per zone lru vector |
1094 | * @lru: target lru | 1121 | * @lru: index of lru list the page is sitting on |
1095 | * | 1122 | * @nr_pages: positive when adding or negative when removing |
1096 | * This function accounts for @page being removed from @lru. | ||
1097 | * | 1123 | * |
1098 | * The callsite is then responsible for physically unlinking | 1124 | * This function must be called when a page is added to or removed from an |
1099 | * @page->lru. | 1125 | * lru list. |
1100 | */ | 1126 | */ |
1101 | void mem_cgroup_lru_del_list(struct page *page, enum lru_list lru) | 1127 | void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru, |
1128 | int nr_pages) | ||
1102 | { | 1129 | { |
1103 | struct mem_cgroup_per_zone *mz; | 1130 | struct mem_cgroup_per_zone *mz; |
1104 | struct mem_cgroup *memcg; | 1131 | unsigned long *lru_size; |
1105 | struct page_cgroup *pc; | ||
1106 | 1132 | ||
1107 | if (mem_cgroup_disabled()) | 1133 | if (mem_cgroup_disabled()) |
1108 | return; | 1134 | return; |
1109 | 1135 | ||
1110 | pc = lookup_page_cgroup(page); | 1136 | mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec); |
1111 | memcg = pc->mem_cgroup; | 1137 | lru_size = mz->lru_size + lru; |
1112 | VM_BUG_ON(!memcg); | 1138 | *lru_size += nr_pages; |
1113 | mz = page_cgroup_zoneinfo(memcg, page); | 1139 | VM_BUG_ON((long)(*lru_size) < 0); |
1114 | /* huge page split is done under lru_lock. so, we have no races. */ | ||
1115 | VM_BUG_ON(mz->lru_size[lru] < (1 << compound_order(page))); | ||
1116 | mz->lru_size[lru] -= 1 << compound_order(page); | ||
1117 | } | ||
1118 | |||
1119 | void mem_cgroup_lru_del(struct page *page) | ||
1120 | { | ||
1121 | mem_cgroup_lru_del_list(page, page_lru(page)); | ||
1122 | } | ||
1123 | |||
1124 | /** | ||
1125 | * mem_cgroup_lru_move_lists - account for moving a page between lrus | ||
1126 | * @zone: zone of the page | ||
1127 | * @page: the page | ||
1128 | * @from: current lru | ||
1129 | * @to: target lru | ||
1130 | * | ||
1131 | * This function accounts for @page being moved between the lrus @from | ||
1132 | * and @to, and returns the lruvec for the given @zone and the memcg | ||
1133 | * @page is charged to. | ||
1134 | * | ||
1135 | * The callsite is then responsible for physically relinking | ||
1136 | * @page->lru to the returned lruvec->lists[@to]. | ||
1137 | */ | ||
1138 | struct lruvec *mem_cgroup_lru_move_lists(struct zone *zone, | ||
1139 | struct page *page, | ||
1140 | enum lru_list from, | ||
1141 | enum lru_list to) | ||
1142 | { | ||
1143 | /* XXX: Optimize this, especially for @from == @to */ | ||
1144 | mem_cgroup_lru_del_list(page, from); | ||
1145 | return mem_cgroup_lru_add_list(zone, page, to); | ||
1146 | } | 1140 | } |
1147 | 1141 | ||
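mem_cgroup_page_lruvec() and mem_cgroup_update_lru_size() replace the old lru_add_list/lru_del_list/lru_move_lists helpers: the caller now looks up the lruvec once and adjusts the per-lru size itself. A sketch of the add-to-LRU sequence these two calls support, modelled on an add_page_to_lru_list()-style caller; holding zone->lru_lock and the helper name are assumptions here.

/* Sketch: caller holds zone->lru_lock while linking the page. */
static void example_add_page_to_lru_list(struct page *page, struct zone *zone,
					 enum lru_list lru)
{
	struct lruvec *lruvec = mem_cgroup_page_lruvec(page, zone);
	int nr_pages = hpage_nr_pages(page);

	mem_cgroup_update_lru_size(lruvec, lru, nr_pages);	/* account first */
	list_add(&page->lru, &lruvec->lists[lru]);		/* then link the page */
}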
1148 | /* | 1142 | /* |
1149 | * Checks whether given mem is same or in the root_mem_cgroup's | 1143 | * Checks whether given mem is same or in the root_mem_cgroup's |
1150 | * hierarchy subtree | 1144 | * hierarchy subtree |
1151 | */ | 1145 | */ |
1146 | bool __mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg, | ||
1147 | struct mem_cgroup *memcg) | ||
1148 | { | ||
1149 | if (root_memcg == memcg) | ||
1150 | return true; | ||
1151 | if (!root_memcg->use_hierarchy || !memcg) | ||
1152 | return false; | ||
1153 | return css_is_ancestor(&memcg->css, &root_memcg->css); | ||
1154 | } | ||
1155 | |||
1152 | static bool mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg, | 1156 | static bool mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg, |
1153 | struct mem_cgroup *memcg) | 1157 | struct mem_cgroup *memcg) |
1154 | { | 1158 | { |
1155 | if (root_memcg != memcg) { | 1159 | bool ret; |
1156 | return (root_memcg->use_hierarchy && | ||
1157 | css_is_ancestor(&memcg->css, &root_memcg->css)); | ||
1158 | } | ||
1159 | 1160 | ||
1160 | return true; | 1161 | rcu_read_lock(); |
1162 | ret = __mem_cgroup_same_or_subtree(root_memcg, memcg); | ||
1163 | rcu_read_unlock(); | ||
1164 | return ret; | ||
1161 | } | 1165 | } |
1162 | 1166 | ||
1163 | int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *memcg) | 1167 | int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *memcg) |
@@ -1195,19 +1199,15 @@ int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *memcg) | |||
1195 | return ret; | 1199 | return ret; |
1196 | } | 1200 | } |
1197 | 1201 | ||
1198 | int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg, struct zone *zone) | 1202 | int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec) |
1199 | { | 1203 | { |
1200 | unsigned long inactive_ratio; | 1204 | unsigned long inactive_ratio; |
1201 | int nid = zone_to_nid(zone); | ||
1202 | int zid = zone_idx(zone); | ||
1203 | unsigned long inactive; | 1205 | unsigned long inactive; |
1204 | unsigned long active; | 1206 | unsigned long active; |
1205 | unsigned long gb; | 1207 | unsigned long gb; |
1206 | 1208 | ||
1207 | inactive = mem_cgroup_zone_nr_lru_pages(memcg, nid, zid, | 1209 | inactive = mem_cgroup_get_lru_size(lruvec, LRU_INACTIVE_ANON); |
1208 | BIT(LRU_INACTIVE_ANON)); | 1210 | active = mem_cgroup_get_lru_size(lruvec, LRU_ACTIVE_ANON); |
1209 | active = mem_cgroup_zone_nr_lru_pages(memcg, nid, zid, | ||
1210 | BIT(LRU_ACTIVE_ANON)); | ||
1211 | 1211 | ||
1212 | gb = (inactive + active) >> (30 - PAGE_SHIFT); | 1212 | gb = (inactive + active) >> (30 - PAGE_SHIFT); |
1213 | if (gb) | 1213 | if (gb) |
@@ -1218,55 +1218,23 @@ int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg, struct zone *zone) | |||
1218 | return inactive * inactive_ratio < active; | 1218 | return inactive * inactive_ratio < active; |
1219 | } | 1219 | } |
1220 | 1220 | ||
1221 | int mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg, struct zone *zone) | 1221 | int mem_cgroup_inactive_file_is_low(struct lruvec *lruvec) |
1222 | { | 1222 | { |
1223 | unsigned long active; | 1223 | unsigned long active; |
1224 | unsigned long inactive; | 1224 | unsigned long inactive; |
1225 | int zid = zone_idx(zone); | ||
1226 | int nid = zone_to_nid(zone); | ||
1227 | 1225 | ||
1228 | inactive = mem_cgroup_zone_nr_lru_pages(memcg, nid, zid, | 1226 | inactive = mem_cgroup_get_lru_size(lruvec, LRU_INACTIVE_FILE); |
1229 | BIT(LRU_INACTIVE_FILE)); | 1227 | active = mem_cgroup_get_lru_size(lruvec, LRU_ACTIVE_FILE); |
1230 | active = mem_cgroup_zone_nr_lru_pages(memcg, nid, zid, | ||
1231 | BIT(LRU_ACTIVE_FILE)); | ||
1232 | 1228 | ||
1233 | return (active > inactive); | 1229 | return (active > inactive); |
1234 | } | 1230 | } |
1235 | 1231 | ||
1236 | struct zone_reclaim_stat *mem_cgroup_get_reclaim_stat(struct mem_cgroup *memcg, | ||
1237 | struct zone *zone) | ||
1238 | { | ||
1239 | int nid = zone_to_nid(zone); | ||
1240 | int zid = zone_idx(zone); | ||
1241 | struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid); | ||
1242 | |||
1243 | return &mz->reclaim_stat; | ||
1244 | } | ||
1245 | |||
1246 | struct zone_reclaim_stat * | ||
1247 | mem_cgroup_get_reclaim_stat_from_page(struct page *page) | ||
1248 | { | ||
1249 | struct page_cgroup *pc; | ||
1250 | struct mem_cgroup_per_zone *mz; | ||
1251 | |||
1252 | if (mem_cgroup_disabled()) | ||
1253 | return NULL; | ||
1254 | |||
1255 | pc = lookup_page_cgroup(page); | ||
1256 | if (!PageCgroupUsed(pc)) | ||
1257 | return NULL; | ||
1258 | /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */ | ||
1259 | smp_rmb(); | ||
1260 | mz = page_cgroup_zoneinfo(pc->mem_cgroup, page); | ||
1261 | return &mz->reclaim_stat; | ||
1262 | } | ||
1263 | |||
1264 | #define mem_cgroup_from_res_counter(counter, member) \ | 1232 | #define mem_cgroup_from_res_counter(counter, member) \ |
1265 | container_of(counter, struct mem_cgroup, member) | 1233 | container_of(counter, struct mem_cgroup, member) |
1266 | 1234 | ||
1267 | /** | 1235 | /** |
1268 | * mem_cgroup_margin - calculate chargeable space of a memory cgroup | 1236 | * mem_cgroup_margin - calculate chargeable space of a memory cgroup |
1269 | * @mem: the memory cgroup | 1237 | * @memcg: the memory cgroup |
1270 | * | 1238 | * |
1271 | * Returns the maximum amount of memory @mem can be charged with, in | 1239 | * Returns the maximum amount of memory @mem can be charged with, in |
1272 | * pages. | 1240 | * pages. |
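The two reclaim-stat accessors deleted above get no direct replacement in this hunk; the statistics now live in the lruvec embedded in each mem_cgroup_per_zone (see the later mz->lruvec.reclaim_stat use in the memory.stat hunk). A sketch of the equivalent lookup, shown here only as an illustration rather than as an API added by this patch:

	static struct zone_reclaim_stat *
	memcg_reclaim_stat(struct mem_cgroup *memcg, int nid, int zid)
	{
		struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid);

		return &mz->lruvec.reclaim_stat;	/* was &mz->reclaim_stat */
	}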
@@ -1540,7 +1508,7 @@ static unsigned long mem_cgroup_reclaim(struct mem_cgroup *memcg, | |||
1540 | 1508 | ||
1541 | /** | 1509 | /** |
1542 | * test_mem_cgroup_node_reclaimable | 1510 | * test_mem_cgroup_node_reclaimable |
1543 | * @mem: the target memcg | 1511 | * @memcg: the target memcg |
1544 | * @nid: the node ID to be checked. | 1512 | * @nid: the node ID to be checked. |
1545 | * @noswap : specify true here if the user wants file only information. | 1513 | * @noswap : specify true here if the user wants file only information. |
1546 | * | 1514 | * |
@@ -1634,7 +1602,7 @@ int mem_cgroup_select_victim_node(struct mem_cgroup *memcg) | |||
1634 | * unused nodes. But scan_nodes is lazily updated and may not contain | 1602 | * unused nodes. But scan_nodes is lazily updated and may not contain |
1635 | * enough new information. We need to do double check. | 1603 | * enough new information. We need to do double check. |
1636 | */ | 1604 | */ |
1637 | bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap) | 1605 | static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap) |
1638 | { | 1606 | { |
1639 | int nid; | 1607 | int nid; |
1640 | 1608 | ||
@@ -1669,7 +1637,7 @@ int mem_cgroup_select_victim_node(struct mem_cgroup *memcg) | |||
1669 | return 0; | 1637 | return 0; |
1670 | } | 1638 | } |
1671 | 1639 | ||
1672 | bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap) | 1640 | static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap) |
1673 | { | 1641 | { |
1674 | return test_mem_cgroup_node_reclaimable(memcg, 0, noswap); | 1642 | return test_mem_cgroup_node_reclaimable(memcg, 0, noswap); |
1675 | } | 1643 | } |
@@ -1843,7 +1811,8 @@ static void memcg_oom_recover(struct mem_cgroup *memcg) | |||
1843 | /* | 1811 | /* |
1844 | * try to call OOM killer. returns false if we should exit memory-reclaim loop. | 1812 | * try to call OOM killer. returns false if we should exit memory-reclaim loop. |
1845 | */ | 1813 | */ |
1846 | bool mem_cgroup_handle_oom(struct mem_cgroup *memcg, gfp_t mask, int order) | 1814 | static bool mem_cgroup_handle_oom(struct mem_cgroup *memcg, gfp_t mask, |
1815 | int order) | ||
1847 | { | 1816 | { |
1848 | struct oom_wait_info owait; | 1817 | struct oom_wait_info owait; |
1849 | bool locked, need_to_kill; | 1818 | bool locked, need_to_kill; |
@@ -1992,7 +1961,7 @@ struct memcg_stock_pcp { | |||
1992 | unsigned int nr_pages; | 1961 | unsigned int nr_pages; |
1993 | struct work_struct work; | 1962 | struct work_struct work; |
1994 | unsigned long flags; | 1963 | unsigned long flags; |
1995 | #define FLUSHING_CACHED_CHARGE (0) | 1964 | #define FLUSHING_CACHED_CHARGE 0 |
1996 | }; | 1965 | }; |
1997 | static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock); | 1966 | static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock); |
1998 | static DEFINE_MUTEX(percpu_charge_mutex); | 1967 | static DEFINE_MUTEX(percpu_charge_mutex); |
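FLUSHING_CACHED_CHARGE is a bit number rather than a mask, so dropping the parentheses is purely cosmetic. A hedged sketch of how such a flag is typically consumed with the atomic bitops; the draining code itself is outside this hunk, so treat the helper below as an assumed usage, not a quote of it:

	static bool claim_stock_flush(struct memcg_stock_pcp *stock)
	{
		/* old bit value is returned: non-zero means a flush is already in flight */
		return !test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags);
	}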
@@ -2139,7 +2108,7 @@ static void mem_cgroup_drain_pcp_counter(struct mem_cgroup *memcg, int cpu) | |||
2139 | int i; | 2108 | int i; |
2140 | 2109 | ||
2141 | spin_lock(&memcg->pcp_counter_lock); | 2110 | spin_lock(&memcg->pcp_counter_lock); |
2142 | for (i = 0; i < MEM_CGROUP_STAT_DATA; i++) { | 2111 | for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) { |
2143 | long x = per_cpu(memcg->stat->count[i], cpu); | 2112 | long x = per_cpu(memcg->stat->count[i], cpu); |
2144 | 2113 | ||
2145 | per_cpu(memcg->stat->count[i], cpu) = 0; | 2114 | per_cpu(memcg->stat->count[i], cpu) = 0; |
@@ -2427,6 +2396,24 @@ static void __mem_cgroup_cancel_charge(struct mem_cgroup *memcg, | |||
2427 | } | 2396 | } |
2428 | 2397 | ||
2429 | /* | 2398 | /* |
2399 | * Cancel charges in this cgroup; it doesn't propagate to the parent cgroup. | ||
2400 | * This is useful when moving usage to parent cgroup. | ||
2401 | */ | ||
2402 | static void __mem_cgroup_cancel_local_charge(struct mem_cgroup *memcg, | ||
2403 | unsigned int nr_pages) | ||
2404 | { | ||
2405 | unsigned long bytes = nr_pages * PAGE_SIZE; | ||
2406 | |||
2407 | if (mem_cgroup_is_root(memcg)) | ||
2408 | return; | ||
2409 | |||
2410 | res_counter_uncharge_until(&memcg->res, memcg->res.parent, bytes); | ||
2411 | if (do_swap_account) | ||
2412 | res_counter_uncharge_until(&memcg->memsw, | ||
2413 | memcg->memsw.parent, bytes); | ||
2414 | } | ||
2415 | |||
2416 | /* | ||
2430 | * A helper function to get mem_cgroup from ID. must be called under | 2417 | * A helper function to get mem_cgroup from ID. must be called under |
2431 | * rcu_read_lock(). The caller must check css_is_removed() or some if | 2418 | * rcu_read_lock(). The caller must check css_is_removed() or some if |
2432 | * it's concern. (dropping refcnt from swap can be called against removed | 2419 | * it's concern. (dropping refcnt from swap can be called against removed |
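The new __mem_cgroup_cancel_local_charge() above depends on res_counter_uncharge_until() stopping at the parent counter, instead of walking the whole chain the way a plain uncharge does. A worked sketch with invented numbers (the counter internals are outside this hunk, so this is an assumption about intent, not quoted behaviour):

	/*
	 * Two-level hierarchy: root <- parent <- child, each level charged
	 * 4096 bytes for the same page.
	 *
	 *   res_counter_uncharge(&child->res, 4096)
	 *       would drop child, parent and root usage by 4096 each;
	 *
	 *   res_counter_uncharge_until(&child->res, child->res.parent, 4096)
	 *       drops only the child's usage; parent and root keep the
	 *       charge, which is exactly what moving a page to the parent
	 *       needs.
	 */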
@@ -2481,6 +2468,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg, | |||
2481 | { | 2468 | { |
2482 | struct page_cgroup *pc = lookup_page_cgroup(page); | 2469 | struct page_cgroup *pc = lookup_page_cgroup(page); |
2483 | struct zone *uninitialized_var(zone); | 2470 | struct zone *uninitialized_var(zone); |
2471 | struct lruvec *lruvec; | ||
2484 | bool was_on_lru = false; | 2472 | bool was_on_lru = false; |
2485 | bool anon; | 2473 | bool anon; |
2486 | 2474 | ||
@@ -2503,8 +2491,9 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg, | |||
2503 | zone = page_zone(page); | 2491 | zone = page_zone(page); |
2504 | spin_lock_irq(&zone->lru_lock); | 2492 | spin_lock_irq(&zone->lru_lock); |
2505 | if (PageLRU(page)) { | 2493 | if (PageLRU(page)) { |
2494 | lruvec = mem_cgroup_zone_lruvec(zone, pc->mem_cgroup); | ||
2506 | ClearPageLRU(page); | 2495 | ClearPageLRU(page); |
2507 | del_page_from_lru_list(zone, page, page_lru(page)); | 2496 | del_page_from_lru_list(page, lruvec, page_lru(page)); |
2508 | was_on_lru = true; | 2497 | was_on_lru = true; |
2509 | } | 2498 | } |
2510 | } | 2499 | } |
@@ -2522,9 +2511,10 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg, | |||
2522 | 2511 | ||
2523 | if (lrucare) { | 2512 | if (lrucare) { |
2524 | if (was_on_lru) { | 2513 | if (was_on_lru) { |
2514 | lruvec = mem_cgroup_zone_lruvec(zone, pc->mem_cgroup); | ||
2525 | VM_BUG_ON(PageLRU(page)); | 2515 | VM_BUG_ON(PageLRU(page)); |
2526 | SetPageLRU(page); | 2516 | SetPageLRU(page); |
2527 | add_page_to_lru_list(zone, page, page_lru(page)); | 2517 | add_page_to_lru_list(page, lruvec, page_lru(page)); |
2528 | } | 2518 | } |
2529 | spin_unlock_irq(&zone->lru_lock); | 2519 | spin_unlock_irq(&zone->lru_lock); |
2530 | } | 2520 | } |
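Both halves of the lrucare path look up the lruvec separately because the page's owning memcg changes in between; caching the first lookup would re-add the page to the old owner's list. A condensed, hedged sketch of the pattern (the page_cgroup flag updates that sit between the two halves are elided):

	static void commit_charge_lrucare_sketch(struct zone *zone, struct page *page,
						 struct page_cgroup *pc,
						 struct mem_cgroup *memcg)
	{
		struct lruvec *lruvec;
		bool was_on_lru = false;

		spin_lock_irq(&zone->lru_lock);
		if (PageLRU(page)) {
			/* remove from the old owner's per-memcg LRU */
			lruvec = mem_cgroup_zone_lruvec(zone, pc->mem_cgroup);
			ClearPageLRU(page);
			del_page_from_lru_list(page, lruvec, page_lru(page));
			was_on_lru = true;
		}

		pc->mem_cgroup = memcg;		/* ownership changes here */

		if (was_on_lru) {
			/* second lookup: re-add to the new owner's lruvec */
			lruvec = mem_cgroup_zone_lruvec(zone, pc->mem_cgroup);
			SetPageLRU(page);
			add_page_to_lru_list(page, lruvec, page_lru(page));
		}
		spin_unlock_irq(&zone->lru_lock);
	}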
@@ -2547,7 +2537,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg, | |||
2547 | 2537 | ||
2548 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | 2538 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE |
2549 | 2539 | ||
2550 | #define PCGF_NOCOPY_AT_SPLIT ((1 << PCG_LOCK) | (1 << PCG_MIGRATION)) | 2540 | #define PCGF_NOCOPY_AT_SPLIT (1 << PCG_LOCK | 1 << PCG_MIGRATION) |
2551 | /* | 2541 | /* |
2552 | * Because tail pages are not marked as "used", set it. We're under | 2542 | * Because tail pages are not marked as "used", set it. We're under |
2553 | * zone->lru_lock, 'splitting on pmd' and compound_lock. | 2543 | * zone->lru_lock, 'splitting on pmd' and compound_lock. |
@@ -2578,23 +2568,19 @@ void mem_cgroup_split_huge_fixup(struct page *head) | |||
2578 | * @pc: page_cgroup of the page. | 2568 | * @pc: page_cgroup of the page. |
2579 | * @from: mem_cgroup which the page is moved from. | 2569 | * @from: mem_cgroup which the page is moved from. |
2580 | * @to: mem_cgroup which the page is moved to. @from != @to. | 2570 | * @to: mem_cgroup which the page is moved to. @from != @to. |
2581 | * @uncharge: whether we should call uncharge and css_put against @from. | ||
2582 | * | 2571 | * |
2583 | * The caller must confirm the following. | 2572 | * The caller must confirm the following. |
2584 | * - page is not on LRU (isolate_page() is useful.) | 2573 | * - page is not on LRU (isolate_page() is useful.) |
2585 | * - compound_lock is held when nr_pages > 1 | 2574 | * - compound_lock is held when nr_pages > 1 |
2586 | * | 2575 | * |
2587 | * This function doesn't do "charge" nor css_get to new cgroup. It should be | 2576 | * This function doesn't do "charge" to new cgroup and doesn't do "uncharge" |
2588 | * done by a caller(__mem_cgroup_try_charge would be useful). If @uncharge is | 2577 | * from old cgroup. |
2589 | * true, this function does "uncharge" from old cgroup, but it doesn't if | ||
2590 | * @uncharge is false, so a caller should do "uncharge". | ||
2591 | */ | 2578 | */ |
2592 | static int mem_cgroup_move_account(struct page *page, | 2579 | static int mem_cgroup_move_account(struct page *page, |
2593 | unsigned int nr_pages, | 2580 | unsigned int nr_pages, |
2594 | struct page_cgroup *pc, | 2581 | struct page_cgroup *pc, |
2595 | struct mem_cgroup *from, | 2582 | struct mem_cgroup *from, |
2596 | struct mem_cgroup *to, | 2583 | struct mem_cgroup *to) |
2597 | bool uncharge) | ||
2598 | { | 2584 | { |
2599 | unsigned long flags; | 2585 | unsigned long flags; |
2600 | int ret; | 2586 | int ret; |
@@ -2628,9 +2614,6 @@ static int mem_cgroup_move_account(struct page *page, | |||
2628 | preempt_enable(); | 2614 | preempt_enable(); |
2629 | } | 2615 | } |
2630 | mem_cgroup_charge_statistics(from, anon, -nr_pages); | 2616 | mem_cgroup_charge_statistics(from, anon, -nr_pages); |
2631 | if (uncharge) | ||
2632 | /* This is not "cancel", but cancel_charge does all we need. */ | ||
2633 | __mem_cgroup_cancel_charge(from, nr_pages); | ||
2634 | 2617 | ||
2635 | /* caller should have done css_get */ | 2618 | /* caller should have done css_get */ |
2636 | pc->mem_cgroup = to; | 2619 | pc->mem_cgroup = to; |
@@ -2664,15 +2647,13 @@ static int mem_cgroup_move_parent(struct page *page, | |||
2664 | struct mem_cgroup *child, | 2647 | struct mem_cgroup *child, |
2665 | gfp_t gfp_mask) | 2648 | gfp_t gfp_mask) |
2666 | { | 2649 | { |
2667 | struct cgroup *cg = child->css.cgroup; | ||
2668 | struct cgroup *pcg = cg->parent; | ||
2669 | struct mem_cgroup *parent; | 2650 | struct mem_cgroup *parent; |
2670 | unsigned int nr_pages; | 2651 | unsigned int nr_pages; |
2671 | unsigned long uninitialized_var(flags); | 2652 | unsigned long uninitialized_var(flags); |
2672 | int ret; | 2653 | int ret; |
2673 | 2654 | ||
2674 | /* Is ROOT ? */ | 2655 | /* Is ROOT ? */ |
2675 | if (!pcg) | 2656 | if (mem_cgroup_is_root(child)) |
2676 | return -EINVAL; | 2657 | return -EINVAL; |
2677 | 2658 | ||
2678 | ret = -EBUSY; | 2659 | ret = -EBUSY; |
@@ -2683,21 +2664,23 @@ static int mem_cgroup_move_parent(struct page *page, | |||
2683 | 2664 | ||
2684 | nr_pages = hpage_nr_pages(page); | 2665 | nr_pages = hpage_nr_pages(page); |
2685 | 2666 | ||
2686 | parent = mem_cgroup_from_cont(pcg); | 2667 | parent = parent_mem_cgroup(child); |
2687 | ret = __mem_cgroup_try_charge(NULL, gfp_mask, nr_pages, &parent, false); | 2668 | /* |
2688 | if (ret) | 2669 | * If no parent, move charges to root cgroup. |
2689 | goto put_back; | 2670 | */ |
2671 | if (!parent) | ||
2672 | parent = root_mem_cgroup; | ||
2690 | 2673 | ||
2691 | if (nr_pages > 1) | 2674 | if (nr_pages > 1) |
2692 | flags = compound_lock_irqsave(page); | 2675 | flags = compound_lock_irqsave(page); |
2693 | 2676 | ||
2694 | ret = mem_cgroup_move_account(page, nr_pages, pc, child, parent, true); | 2677 | ret = mem_cgroup_move_account(page, nr_pages, |
2695 | if (ret) | 2678 | pc, child, parent); |
2696 | __mem_cgroup_cancel_charge(parent, nr_pages); | 2679 | if (!ret) |
2680 | __mem_cgroup_cancel_local_charge(child, nr_pages); | ||
2697 | 2681 | ||
2698 | if (nr_pages > 1) | 2682 | if (nr_pages > 1) |
2699 | compound_unlock_irqrestore(page, flags); | 2683 | compound_unlock_irqrestore(page, flags); |
2700 | put_back: | ||
2701 | putback_lru_page(page); | 2684 | putback_lru_page(page); |
2702 | put: | 2685 | put: |
2703 | put_page(page); | 2686 | put_page(page); |
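Putting the pieces of this hunk together: reparenting no longer pre-charges the parent, because the hierarchical charge already covers every ancestor; on success the child merely drops its local share. A condensed sketch with error handling, compound locking and LRU isolation trimmed (helper name invented):

	static void move_to_parent_sketch(struct page *page, struct page_cgroup *pc,
					  struct mem_cgroup *child,
					  unsigned int nr_pages)
	{
		struct mem_cgroup *parent = parent_mem_cgroup(child);

		if (!parent)
			parent = root_mem_cgroup;	/* child sits at the top */

		if (!mem_cgroup_move_account(page, nr_pages, pc, child, parent))
			/* the charge already lives in every ancestor, so only
			 * the child's local share has to go away */
			__mem_cgroup_cancel_local_charge(child, nr_pages);
	}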
@@ -2845,24 +2828,7 @@ __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *memcg, | |||
2845 | */ | 2828 | */ |
2846 | if (do_swap_account && PageSwapCache(page)) { | 2829 | if (do_swap_account && PageSwapCache(page)) { |
2847 | swp_entry_t ent = {.val = page_private(page)}; | 2830 | swp_entry_t ent = {.val = page_private(page)}; |
2848 | struct mem_cgroup *swap_memcg; | 2831 | mem_cgroup_uncharge_swap(ent); |
2849 | unsigned short id; | ||
2850 | |||
2851 | id = swap_cgroup_record(ent, 0); | ||
2852 | rcu_read_lock(); | ||
2853 | swap_memcg = mem_cgroup_lookup(id); | ||
2854 | if (swap_memcg) { | ||
2855 | /* | ||
2856 | * This recorded memcg can be obsolete one. So, avoid | ||
2857 | * calling css_tryget | ||
2858 | */ | ||
2859 | if (!mem_cgroup_is_root(swap_memcg)) | ||
2860 | res_counter_uncharge(&swap_memcg->memsw, | ||
2861 | PAGE_SIZE); | ||
2862 | mem_cgroup_swap_statistics(swap_memcg, false); | ||
2863 | mem_cgroup_put(swap_memcg); | ||
2864 | } | ||
2865 | rcu_read_unlock(); | ||
2866 | } | 2832 | } |
2867 | /* | 2833 | /* |
2868 | * At swapin, we may charge account against cgroup which has no tasks. | 2834 | * At swapin, we may charge account against cgroup which has no tasks. |
@@ -3155,7 +3121,6 @@ void mem_cgroup_uncharge_swap(swp_entry_t ent) | |||
3155 | * @entry: swap entry to be moved | 3121 | * @entry: swap entry to be moved |
3156 | * @from: mem_cgroup which the entry is moved from | 3122 | * @from: mem_cgroup which the entry is moved from |
3157 | * @to: mem_cgroup which the entry is moved to | 3123 | * @to: mem_cgroup which the entry is moved to |
3158 | * @need_fixup: whether we should fixup res_counters and refcounts. | ||
3159 | * | 3124 | * |
3160 | * It succeeds only when the swap_cgroup's record for this entry is the same | 3125 | * It succeeds only when the swap_cgroup's record for this entry is the same |
3161 | * as the mem_cgroup's id of @from. | 3126 | * as the mem_cgroup's id of @from. |
@@ -3166,7 +3131,7 @@ void mem_cgroup_uncharge_swap(swp_entry_t ent) | |||
3166 | * both res and memsw, and called css_get(). | 3131 | * both res and memsw, and called css_get(). |
3167 | */ | 3132 | */ |
3168 | static int mem_cgroup_move_swap_account(swp_entry_t entry, | 3133 | static int mem_cgroup_move_swap_account(swp_entry_t entry, |
3169 | struct mem_cgroup *from, struct mem_cgroup *to, bool need_fixup) | 3134 | struct mem_cgroup *from, struct mem_cgroup *to) |
3170 | { | 3135 | { |
3171 | unsigned short old_id, new_id; | 3136 | unsigned short old_id, new_id; |
3172 | 3137 | ||
@@ -3185,24 +3150,13 @@ static int mem_cgroup_move_swap_account(swp_entry_t entry, | |||
3185 | * swap-in, the refcount of @to might be decreased to 0. | 3150 | * swap-in, the refcount of @to might be decreased to 0. |
3186 | */ | 3151 | */ |
3187 | mem_cgroup_get(to); | 3152 | mem_cgroup_get(to); |
3188 | if (need_fixup) { | ||
3189 | if (!mem_cgroup_is_root(from)) | ||
3190 | res_counter_uncharge(&from->memsw, PAGE_SIZE); | ||
3191 | mem_cgroup_put(from); | ||
3192 | /* | ||
3193 | * we charged both to->res and to->memsw, so we should | ||
3194 | * uncharge to->res. | ||
3195 | */ | ||
3196 | if (!mem_cgroup_is_root(to)) | ||
3197 | res_counter_uncharge(&to->res, PAGE_SIZE); | ||
3198 | } | ||
3199 | return 0; | 3153 | return 0; |
3200 | } | 3154 | } |
3201 | return -EINVAL; | 3155 | return -EINVAL; |
3202 | } | 3156 | } |
3203 | #else | 3157 | #else |
3204 | static inline int mem_cgroup_move_swap_account(swp_entry_t entry, | 3158 | static inline int mem_cgroup_move_swap_account(swp_entry_t entry, |
3205 | struct mem_cgroup *from, struct mem_cgroup *to, bool need_fixup) | 3159 | struct mem_cgroup *from, struct mem_cgroup *to) |
3206 | { | 3160 | { |
3207 | return -EINVAL; | 3161 | return -EINVAL; |
3208 | } | 3162 | } |
@@ -3363,7 +3317,7 @@ void mem_cgroup_end_migration(struct mem_cgroup *memcg, | |||
3363 | void mem_cgroup_replace_page_cache(struct page *oldpage, | 3317 | void mem_cgroup_replace_page_cache(struct page *oldpage, |
3364 | struct page *newpage) | 3318 | struct page *newpage) |
3365 | { | 3319 | { |
3366 | struct mem_cgroup *memcg; | 3320 | struct mem_cgroup *memcg = NULL; |
3367 | struct page_cgroup *pc; | 3321 | struct page_cgroup *pc; |
3368 | enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE; | 3322 | enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE; |
3369 | 3323 | ||
@@ -3373,11 +3327,20 @@ void mem_cgroup_replace_page_cache(struct page *oldpage, | |||
3373 | pc = lookup_page_cgroup(oldpage); | 3327 | pc = lookup_page_cgroup(oldpage); |
3374 | /* fix accounting on old pages */ | 3328 | /* fix accounting on old pages */ |
3375 | lock_page_cgroup(pc); | 3329 | lock_page_cgroup(pc); |
3376 | memcg = pc->mem_cgroup; | 3330 | if (PageCgroupUsed(pc)) { |
3377 | mem_cgroup_charge_statistics(memcg, false, -1); | 3331 | memcg = pc->mem_cgroup; |
3378 | ClearPageCgroupUsed(pc); | 3332 | mem_cgroup_charge_statistics(memcg, false, -1); |
3333 | ClearPageCgroupUsed(pc); | ||
3334 | } | ||
3379 | unlock_page_cgroup(pc); | 3335 | unlock_page_cgroup(pc); |
3380 | 3336 | ||
3337 | /* | ||
3338 | * When called from shmem_replace_page(), in some cases the | ||
3339 | * oldpage has already been charged, and in some cases not. | ||
3340 | */ | ||
3341 | if (!memcg) | ||
3342 | return; | ||
3343 | |||
3381 | if (PageSwapBacked(oldpage)) | 3344 | if (PageSwapBacked(oldpage)) |
3382 | type = MEM_CGROUP_CHARGE_TYPE_SHMEM; | 3345 | type = MEM_CGROUP_CHARGE_TYPE_SHMEM; |
3383 | 3346 | ||
@@ -3793,7 +3756,7 @@ try_to_free: | |||
3793 | goto move_account; | 3756 | goto move_account; |
3794 | } | 3757 | } |
3795 | 3758 | ||
3796 | int mem_cgroup_force_empty_write(struct cgroup *cont, unsigned int event) | 3759 | static int mem_cgroup_force_empty_write(struct cgroup *cont, unsigned int event) |
3797 | { | 3760 | { |
3798 | return mem_cgroup_force_empty(mem_cgroup_from_cont(cont), true); | 3761 | return mem_cgroup_force_empty(mem_cgroup_from_cont(cont), true); |
3799 | } | 3762 | } |
@@ -3873,14 +3836,21 @@ static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap) | |||
3873 | return val << PAGE_SHIFT; | 3836 | return val << PAGE_SHIFT; |
3874 | } | 3837 | } |
3875 | 3838 | ||
3876 | static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft) | 3839 | static ssize_t mem_cgroup_read(struct cgroup *cont, struct cftype *cft, |
3840 | struct file *file, char __user *buf, | ||
3841 | size_t nbytes, loff_t *ppos) | ||
3877 | { | 3842 | { |
3878 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); | 3843 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); |
3844 | char str[64]; | ||
3879 | u64 val; | 3845 | u64 val; |
3880 | int type, name; | 3846 | int type, name, len; |
3881 | 3847 | ||
3882 | type = MEMFILE_TYPE(cft->private); | 3848 | type = MEMFILE_TYPE(cft->private); |
3883 | name = MEMFILE_ATTR(cft->private); | 3849 | name = MEMFILE_ATTR(cft->private); |
3850 | |||
3851 | if (!do_swap_account && type == _MEMSWAP) | ||
3852 | return -EOPNOTSUPP; | ||
3853 | |||
3884 | switch (type) { | 3854 | switch (type) { |
3885 | case _MEM: | 3855 | case _MEM: |
3886 | if (name == RES_USAGE) | 3856 | if (name == RES_USAGE) |
@@ -3897,7 +3867,9 @@ static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft) | |||
3897 | default: | 3867 | default: |
3898 | BUG(); | 3868 | BUG(); |
3899 | } | 3869 | } |
3900 | return val; | 3870 | |
3871 | len = scnprintf(str, sizeof(str), "%llu\n", (unsigned long long)val); | ||
3872 | return simple_read_from_buffer(buf, nbytes, ppos, str, len); | ||
3901 | } | 3873 | } |
3902 | /* | 3874 | /* |
3903 | * The user of this function is... | 3875 | * The user of this function is... |
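The switch of mem_cgroup_read() from .read_u64 to a raw .read handler follows a common pattern: format the value into a small stack buffer and let simple_read_from_buffer() deal with nbytes, *ppos and the user copy. A stripped-down sketch of that pattern with a generic helper name (not taken from the patch):

	static ssize_t show_u64(u64 val, char __user *buf, size_t nbytes, loff_t *ppos)
	{
		char str[64];
		int len;

		len = scnprintf(str, sizeof(str), "%llu\n", (unsigned long long)val);
		return simple_read_from_buffer(buf, nbytes, ppos, str, len);
	}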
@@ -3913,6 +3885,10 @@ static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft, | |||
3913 | 3885 | ||
3914 | type = MEMFILE_TYPE(cft->private); | 3886 | type = MEMFILE_TYPE(cft->private); |
3915 | name = MEMFILE_ATTR(cft->private); | 3887 | name = MEMFILE_ATTR(cft->private); |
3888 | |||
3889 | if (!do_swap_account && type == _MEMSWAP) | ||
3890 | return -EOPNOTSUPP; | ||
3891 | |||
3916 | switch (name) { | 3892 | switch (name) { |
3917 | case RES_LIMIT: | 3893 | case RES_LIMIT: |
3918 | if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */ | 3894 | if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */ |
@@ -3978,12 +3954,15 @@ out: | |||
3978 | 3954 | ||
3979 | static int mem_cgroup_reset(struct cgroup *cont, unsigned int event) | 3955 | static int mem_cgroup_reset(struct cgroup *cont, unsigned int event) |
3980 | { | 3956 | { |
3981 | struct mem_cgroup *memcg; | 3957 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); |
3982 | int type, name; | 3958 | int type, name; |
3983 | 3959 | ||
3984 | memcg = mem_cgroup_from_cont(cont); | ||
3985 | type = MEMFILE_TYPE(event); | 3960 | type = MEMFILE_TYPE(event); |
3986 | name = MEMFILE_ATTR(event); | 3961 | name = MEMFILE_ATTR(event); |
3962 | |||
3963 | if (!do_swap_account && type == _MEMSWAP) | ||
3964 | return -EOPNOTSUPP; | ||
3965 | |||
3987 | switch (name) { | 3966 | switch (name) { |
3988 | case RES_MAX_USAGE: | 3967 | case RES_MAX_USAGE: |
3989 | if (type == _MEM) | 3968 | if (type == _MEM) |
@@ -4035,103 +4014,13 @@ static int mem_cgroup_move_charge_write(struct cgroup *cgrp, | |||
4035 | } | 4014 | } |
4036 | #endif | 4015 | #endif |
4037 | 4016 | ||
4038 | |||
4039 | /* For read statistics */ | ||
4040 | enum { | ||
4041 | MCS_CACHE, | ||
4042 | MCS_RSS, | ||
4043 | MCS_FILE_MAPPED, | ||
4044 | MCS_PGPGIN, | ||
4045 | MCS_PGPGOUT, | ||
4046 | MCS_SWAP, | ||
4047 | MCS_PGFAULT, | ||
4048 | MCS_PGMAJFAULT, | ||
4049 | MCS_INACTIVE_ANON, | ||
4050 | MCS_ACTIVE_ANON, | ||
4051 | MCS_INACTIVE_FILE, | ||
4052 | MCS_ACTIVE_FILE, | ||
4053 | MCS_UNEVICTABLE, | ||
4054 | NR_MCS_STAT, | ||
4055 | }; | ||
4056 | |||
4057 | struct mcs_total_stat { | ||
4058 | s64 stat[NR_MCS_STAT]; | ||
4059 | }; | ||
4060 | |||
4061 | struct { | ||
4062 | char *local_name; | ||
4063 | char *total_name; | ||
4064 | } memcg_stat_strings[NR_MCS_STAT] = { | ||
4065 | {"cache", "total_cache"}, | ||
4066 | {"rss", "total_rss"}, | ||
4067 | {"mapped_file", "total_mapped_file"}, | ||
4068 | {"pgpgin", "total_pgpgin"}, | ||
4069 | {"pgpgout", "total_pgpgout"}, | ||
4070 | {"swap", "total_swap"}, | ||
4071 | {"pgfault", "total_pgfault"}, | ||
4072 | {"pgmajfault", "total_pgmajfault"}, | ||
4073 | {"inactive_anon", "total_inactive_anon"}, | ||
4074 | {"active_anon", "total_active_anon"}, | ||
4075 | {"inactive_file", "total_inactive_file"}, | ||
4076 | {"active_file", "total_active_file"}, | ||
4077 | {"unevictable", "total_unevictable"} | ||
4078 | }; | ||
4079 | |||
4080 | |||
4081 | static void | ||
4082 | mem_cgroup_get_local_stat(struct mem_cgroup *memcg, struct mcs_total_stat *s) | ||
4083 | { | ||
4084 | s64 val; | ||
4085 | |||
4086 | /* per cpu stat */ | ||
4087 | val = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_CACHE); | ||
4088 | s->stat[MCS_CACHE] += val * PAGE_SIZE; | ||
4089 | val = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_RSS); | ||
4090 | s->stat[MCS_RSS] += val * PAGE_SIZE; | ||
4091 | val = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_FILE_MAPPED); | ||
4092 | s->stat[MCS_FILE_MAPPED] += val * PAGE_SIZE; | ||
4093 | val = mem_cgroup_read_events(memcg, MEM_CGROUP_EVENTS_PGPGIN); | ||
4094 | s->stat[MCS_PGPGIN] += val; | ||
4095 | val = mem_cgroup_read_events(memcg, MEM_CGROUP_EVENTS_PGPGOUT); | ||
4096 | s->stat[MCS_PGPGOUT] += val; | ||
4097 | if (do_swap_account) { | ||
4098 | val = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_SWAPOUT); | ||
4099 | s->stat[MCS_SWAP] += val * PAGE_SIZE; | ||
4100 | } | ||
4101 | val = mem_cgroup_read_events(memcg, MEM_CGROUP_EVENTS_PGFAULT); | ||
4102 | s->stat[MCS_PGFAULT] += val; | ||
4103 | val = mem_cgroup_read_events(memcg, MEM_CGROUP_EVENTS_PGMAJFAULT); | ||
4104 | s->stat[MCS_PGMAJFAULT] += val; | ||
4105 | |||
4106 | /* per zone stat */ | ||
4107 | val = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_INACTIVE_ANON)); | ||
4108 | s->stat[MCS_INACTIVE_ANON] += val * PAGE_SIZE; | ||
4109 | val = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_ACTIVE_ANON)); | ||
4110 | s->stat[MCS_ACTIVE_ANON] += val * PAGE_SIZE; | ||
4111 | val = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_INACTIVE_FILE)); | ||
4112 | s->stat[MCS_INACTIVE_FILE] += val * PAGE_SIZE; | ||
4113 | val = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_ACTIVE_FILE)); | ||
4114 | s->stat[MCS_ACTIVE_FILE] += val * PAGE_SIZE; | ||
4115 | val = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_UNEVICTABLE)); | ||
4116 | s->stat[MCS_UNEVICTABLE] += val * PAGE_SIZE; | ||
4117 | } | ||
4118 | |||
4119 | static void | ||
4120 | mem_cgroup_get_total_stat(struct mem_cgroup *memcg, struct mcs_total_stat *s) | ||
4121 | { | ||
4122 | struct mem_cgroup *iter; | ||
4123 | |||
4124 | for_each_mem_cgroup_tree(iter, memcg) | ||
4125 | mem_cgroup_get_local_stat(iter, s); | ||
4126 | } | ||
4127 | |||
4128 | #ifdef CONFIG_NUMA | 4017 | #ifdef CONFIG_NUMA |
4129 | static int mem_control_numa_stat_show(struct seq_file *m, void *arg) | 4018 | static int mem_control_numa_stat_show(struct cgroup *cont, struct cftype *cft, |
4019 | struct seq_file *m) | ||
4130 | { | 4020 | { |
4131 | int nid; | 4021 | int nid; |
4132 | unsigned long total_nr, file_nr, anon_nr, unevictable_nr; | 4022 | unsigned long total_nr, file_nr, anon_nr, unevictable_nr; |
4133 | unsigned long node_nr; | 4023 | unsigned long node_nr; |
4134 | struct cgroup *cont = m->private; | ||
4135 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); | 4024 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); |
4136 | 4025 | ||
4137 | total_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL); | 4026 | total_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL); |
@@ -4172,64 +4061,100 @@ static int mem_control_numa_stat_show(struct seq_file *m, void *arg) | |||
4172 | } | 4061 | } |
4173 | #endif /* CONFIG_NUMA */ | 4062 | #endif /* CONFIG_NUMA */ |
4174 | 4063 | ||
4064 | static const char * const mem_cgroup_lru_names[] = { | ||
4065 | "inactive_anon", | ||
4066 | "active_anon", | ||
4067 | "inactive_file", | ||
4068 | "active_file", | ||
4069 | "unevictable", | ||
4070 | }; | ||
4071 | |||
4072 | static inline void mem_cgroup_lru_names_not_uptodate(void) | ||
4073 | { | ||
4074 | BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS); | ||
4075 | } | ||
4076 | |||
4175 | static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft, | 4077 | static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft, |
4176 | struct cgroup_map_cb *cb) | 4078 | struct seq_file *m) |
4177 | { | 4079 | { |
4178 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); | 4080 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); |
4179 | struct mcs_total_stat mystat; | 4081 | struct mem_cgroup *mi; |
4180 | int i; | 4082 | unsigned int i; |
4181 | |||
4182 | memset(&mystat, 0, sizeof(mystat)); | ||
4183 | mem_cgroup_get_local_stat(memcg, &mystat); | ||
4184 | 4083 | ||
4185 | 4084 | for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) { | |
4186 | for (i = 0; i < NR_MCS_STAT; i++) { | 4085 | if (i == MEM_CGROUP_STAT_SWAPOUT && !do_swap_account) |
4187 | if (i == MCS_SWAP && !do_swap_account) | ||
4188 | continue; | 4086 | continue; |
4189 | cb->fill(cb, memcg_stat_strings[i].local_name, mystat.stat[i]); | 4087 | seq_printf(m, "%s %ld\n", mem_cgroup_stat_names[i], |
4088 | mem_cgroup_read_stat(memcg, i) * PAGE_SIZE); | ||
4190 | } | 4089 | } |
4191 | 4090 | ||
4091 | for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) | ||
4092 | seq_printf(m, "%s %lu\n", mem_cgroup_events_names[i], | ||
4093 | mem_cgroup_read_events(memcg, i)); | ||
4094 | |||
4095 | for (i = 0; i < NR_LRU_LISTS; i++) | ||
4096 | seq_printf(m, "%s %lu\n", mem_cgroup_lru_names[i], | ||
4097 | mem_cgroup_nr_lru_pages(memcg, BIT(i)) * PAGE_SIZE); | ||
4098 | |||
4192 | /* Hierarchical information */ | 4099 | /* Hierarchical information */ |
4193 | { | 4100 | { |
4194 | unsigned long long limit, memsw_limit; | 4101 | unsigned long long limit, memsw_limit; |
4195 | memcg_get_hierarchical_limit(memcg, &limit, &memsw_limit); | 4102 | memcg_get_hierarchical_limit(memcg, &limit, &memsw_limit); |
4196 | cb->fill(cb, "hierarchical_memory_limit", limit); | 4103 | seq_printf(m, "hierarchical_memory_limit %llu\n", limit); |
4197 | if (do_swap_account) | 4104 | if (do_swap_account) |
4198 | cb->fill(cb, "hierarchical_memsw_limit", memsw_limit); | 4105 | seq_printf(m, "hierarchical_memsw_limit %llu\n", |
4106 | memsw_limit); | ||
4199 | } | 4107 | } |
4200 | 4108 | ||
4201 | memset(&mystat, 0, sizeof(mystat)); | 4109 | for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) { |
4202 | mem_cgroup_get_total_stat(memcg, &mystat); | 4110 | long long val = 0; |
4203 | for (i = 0; i < NR_MCS_STAT; i++) { | 4111 | |
4204 | if (i == MCS_SWAP && !do_swap_account) | 4112 | if (i == MEM_CGROUP_STAT_SWAPOUT && !do_swap_account) |
4205 | continue; | 4113 | continue; |
4206 | cb->fill(cb, memcg_stat_strings[i].total_name, mystat.stat[i]); | 4114 | for_each_mem_cgroup_tree(mi, memcg) |
4115 | val += mem_cgroup_read_stat(mi, i) * PAGE_SIZE; | ||
4116 | seq_printf(m, "total_%s %lld\n", mem_cgroup_stat_names[i], val); | ||
4117 | } | ||
4118 | |||
4119 | for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) { | ||
4120 | unsigned long long val = 0; | ||
4121 | |||
4122 | for_each_mem_cgroup_tree(mi, memcg) | ||
4123 | val += mem_cgroup_read_events(mi, i); | ||
4124 | seq_printf(m, "total_%s %llu\n", | ||
4125 | mem_cgroup_events_names[i], val); | ||
4126 | } | ||
4127 | |||
4128 | for (i = 0; i < NR_LRU_LISTS; i++) { | ||
4129 | unsigned long long val = 0; | ||
4130 | |||
4131 | for_each_mem_cgroup_tree(mi, memcg) | ||
4132 | val += mem_cgroup_nr_lru_pages(mi, BIT(i)) * PAGE_SIZE; | ||
4133 | seq_printf(m, "total_%s %llu\n", mem_cgroup_lru_names[i], val); | ||
4207 | } | 4134 | } |
4208 | 4135 | ||
4209 | #ifdef CONFIG_DEBUG_VM | 4136 | #ifdef CONFIG_DEBUG_VM |
4210 | { | 4137 | { |
4211 | int nid, zid; | 4138 | int nid, zid; |
4212 | struct mem_cgroup_per_zone *mz; | 4139 | struct mem_cgroup_per_zone *mz; |
4140 | struct zone_reclaim_stat *rstat; | ||
4213 | unsigned long recent_rotated[2] = {0, 0}; | 4141 | unsigned long recent_rotated[2] = {0, 0}; |
4214 | unsigned long recent_scanned[2] = {0, 0}; | 4142 | unsigned long recent_scanned[2] = {0, 0}; |
4215 | 4143 | ||
4216 | for_each_online_node(nid) | 4144 | for_each_online_node(nid) |
4217 | for (zid = 0; zid < MAX_NR_ZONES; zid++) { | 4145 | for (zid = 0; zid < MAX_NR_ZONES; zid++) { |
4218 | mz = mem_cgroup_zoneinfo(memcg, nid, zid); | 4146 | mz = mem_cgroup_zoneinfo(memcg, nid, zid); |
4147 | rstat = &mz->lruvec.reclaim_stat; | ||
4219 | 4148 | ||
4220 | recent_rotated[0] += | 4149 | recent_rotated[0] += rstat->recent_rotated[0]; |
4221 | mz->reclaim_stat.recent_rotated[0]; | 4150 | recent_rotated[1] += rstat->recent_rotated[1]; |
4222 | recent_rotated[1] += | 4151 | recent_scanned[0] += rstat->recent_scanned[0]; |
4223 | mz->reclaim_stat.recent_rotated[1]; | 4152 | recent_scanned[1] += rstat->recent_scanned[1]; |
4224 | recent_scanned[0] += | ||
4225 | mz->reclaim_stat.recent_scanned[0]; | ||
4226 | recent_scanned[1] += | ||
4227 | mz->reclaim_stat.recent_scanned[1]; | ||
4228 | } | 4153 | } |
4229 | cb->fill(cb, "recent_rotated_anon", recent_rotated[0]); | 4154 | seq_printf(m, "recent_rotated_anon %lu\n", recent_rotated[0]); |
4230 | cb->fill(cb, "recent_rotated_file", recent_rotated[1]); | 4155 | seq_printf(m, "recent_rotated_file %lu\n", recent_rotated[1]); |
4231 | cb->fill(cb, "recent_scanned_anon", recent_scanned[0]); | 4156 | seq_printf(m, "recent_scanned_anon %lu\n", recent_scanned[0]); |
4232 | cb->fill(cb, "recent_scanned_file", recent_scanned[1]); | 4157 | seq_printf(m, "recent_scanned_file %lu\n", recent_scanned[1]); |
4233 | } | 4158 | } |
4234 | #endif | 4159 | #endif |
4235 | 4160 | ||
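The seq_file conversion keeps memory.stat line-oriented: local counters come first, then the hierarchical limits, then the total_* aggregates and, under CONFIG_DEBUG_VM, the recent_* reclaim statistics. A rough shape of the output, using only names visible in this hunk and with invented values:

	cache 122880
	rss 3145728
	pgpgin 2048
	inactive_anon 262144
	hierarchical_memory_limit 536870912
	total_cache 122880
	total_pgpgin 2048
	recent_rotated_anon 7
	recent_scanned_file 19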
@@ -4291,7 +4216,7 @@ static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap) | |||
4291 | usage = mem_cgroup_usage(memcg, swap); | 4216 | usage = mem_cgroup_usage(memcg, swap); |
4292 | 4217 | ||
4293 | /* | 4218 | /* |
4294 | * current_threshold points to threshold just below usage. | 4219 | * current_threshold points to threshold just below or equal to usage. |
4295 | * If it's not true, a threshold was crossed after last | 4220 | * If it's not true, a threshold was crossed after last |
4296 | * call of __mem_cgroup_threshold(). | 4221 | * call of __mem_cgroup_threshold(). |
4297 | */ | 4222 | */ |
@@ -4417,14 +4342,15 @@ static int mem_cgroup_usage_register_event(struct cgroup *cgrp, | |||
4417 | /* Find current threshold */ | 4342 | /* Find current threshold */ |
4418 | new->current_threshold = -1; | 4343 | new->current_threshold = -1; |
4419 | for (i = 0; i < size; i++) { | 4344 | for (i = 0; i < size; i++) { |
4420 | if (new->entries[i].threshold < usage) { | 4345 | if (new->entries[i].threshold <= usage) { |
4421 | /* | 4346 | /* |
4422 | * new->current_threshold will not be used until | 4347 | * new->current_threshold will not be used until |
4423 | * rcu_assign_pointer(), so it's safe to increment | 4348 | * rcu_assign_pointer(), so it's safe to increment |
4424 | * it here. | 4349 | * it here. |
4425 | */ | 4350 | */ |
4426 | ++new->current_threshold; | 4351 | ++new->current_threshold; |
4427 | } | 4352 | } else |
4353 | break; | ||
4428 | } | 4354 | } |
4429 | 4355 | ||
4430 | /* Free old spare buffer and save old primary buffer as spare */ | 4356 | /* Free old spare buffer and save old primary buffer as spare */ |
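The change from '<' to '<=' means a threshold equal to the current usage now counts as already crossed, and the added break stops the scan at the first entry above usage. A worked sketch of the updated search; it assumes the threshold array is sorted ascending before this loop runs, which the surrounding registration code is responsible for but which is not shown in this hunk:

	static int find_current_threshold(const u64 *thresholds, int size, u64 usage)
	{
		int i, cur = -1;

		for (i = 0; i < size; i++) {
			if (thresholds[i] <= usage)	/* '<=': usage on the boundary counts */
				cur++;
			else
				break;			/* sorted: nothing later can match */
		}
		/* e.g. {4M, 8M, 16M} with usage exactly 8M now yields index 1, not 0 */
		return cur;
	}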
@@ -4493,7 +4419,7 @@ static void mem_cgroup_usage_unregister_event(struct cgroup *cgrp, | |||
4493 | continue; | 4419 | continue; |
4494 | 4420 | ||
4495 | new->entries[j] = thresholds->primary->entries[i]; | 4421 | new->entries[j] = thresholds->primary->entries[i]; |
4496 | if (new->entries[j].threshold < usage) { | 4422 | if (new->entries[j].threshold <= usage) { |
4497 | /* | 4423 | /* |
4498 | * new->current_threshold will not be used | 4424 | * new->current_threshold will not be used |
4499 | * until rcu_assign_pointer(), so it's safe to increment | 4425 | * until rcu_assign_pointer(), so it's safe to increment |
@@ -4607,46 +4533,23 @@ static int mem_cgroup_oom_control_write(struct cgroup *cgrp, | |||
4607 | return 0; | 4533 | return 0; |
4608 | } | 4534 | } |
4609 | 4535 | ||
4610 | #ifdef CONFIG_NUMA | ||
4611 | static const struct file_operations mem_control_numa_stat_file_operations = { | ||
4612 | .read = seq_read, | ||
4613 | .llseek = seq_lseek, | ||
4614 | .release = single_release, | ||
4615 | }; | ||
4616 | |||
4617 | static int mem_control_numa_stat_open(struct inode *unused, struct file *file) | ||
4618 | { | ||
4619 | struct cgroup *cont = file->f_dentry->d_parent->d_fsdata; | ||
4620 | |||
4621 | file->f_op = &mem_control_numa_stat_file_operations; | ||
4622 | return single_open(file, mem_control_numa_stat_show, cont); | ||
4623 | } | ||
4624 | #endif /* CONFIG_NUMA */ | ||
4625 | |||
4626 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM | 4536 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM |
4627 | static int register_kmem_files(struct cgroup *cont, struct cgroup_subsys *ss) | 4537 | static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss) |
4628 | { | 4538 | { |
4629 | /* | 4539 | return mem_cgroup_sockets_init(memcg, ss); |
4630 | * Part of this would be better living in a separate allocation | ||
4631 | * function, leaving us with just the cgroup tree population work. | ||
4632 | * We, however, depend on state such as network's proto_list that | ||
4633 | * is only initialized after cgroup creation. I found the less | ||
4634 | * cumbersome way to deal with it to defer it all to populate time | ||
4635 | */ | ||
4636 | return mem_cgroup_sockets_init(cont, ss); | ||
4637 | }; | 4540 | }; |
4638 | 4541 | ||
4639 | static void kmem_cgroup_destroy(struct cgroup *cont) | 4542 | static void kmem_cgroup_destroy(struct mem_cgroup *memcg) |
4640 | { | 4543 | { |
4641 | mem_cgroup_sockets_destroy(cont); | 4544 | mem_cgroup_sockets_destroy(memcg); |
4642 | } | 4545 | } |
4643 | #else | 4546 | #else |
4644 | static int register_kmem_files(struct cgroup *cont, struct cgroup_subsys *ss) | 4547 | static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss) |
4645 | { | 4548 | { |
4646 | return 0; | 4549 | return 0; |
4647 | } | 4550 | } |
4648 | 4551 | ||
4649 | static void kmem_cgroup_destroy(struct cgroup *cont) | 4552 | static void kmem_cgroup_destroy(struct mem_cgroup *memcg) |
4650 | { | 4553 | { |
4651 | } | 4554 | } |
4652 | #endif | 4555 | #endif |
@@ -4655,7 +4558,7 @@ static struct cftype mem_cgroup_files[] = { | |||
4655 | { | 4558 | { |
4656 | .name = "usage_in_bytes", | 4559 | .name = "usage_in_bytes", |
4657 | .private = MEMFILE_PRIVATE(_MEM, RES_USAGE), | 4560 | .private = MEMFILE_PRIVATE(_MEM, RES_USAGE), |
4658 | .read_u64 = mem_cgroup_read, | 4561 | .read = mem_cgroup_read, |
4659 | .register_event = mem_cgroup_usage_register_event, | 4562 | .register_event = mem_cgroup_usage_register_event, |
4660 | .unregister_event = mem_cgroup_usage_unregister_event, | 4563 | .unregister_event = mem_cgroup_usage_unregister_event, |
4661 | }, | 4564 | }, |
@@ -4663,29 +4566,29 @@ static struct cftype mem_cgroup_files[] = { | |||
4663 | .name = "max_usage_in_bytes", | 4566 | .name = "max_usage_in_bytes", |
4664 | .private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE), | 4567 | .private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE), |
4665 | .trigger = mem_cgroup_reset, | 4568 | .trigger = mem_cgroup_reset, |
4666 | .read_u64 = mem_cgroup_read, | 4569 | .read = mem_cgroup_read, |
4667 | }, | 4570 | }, |
4668 | { | 4571 | { |
4669 | .name = "limit_in_bytes", | 4572 | .name = "limit_in_bytes", |
4670 | .private = MEMFILE_PRIVATE(_MEM, RES_LIMIT), | 4573 | .private = MEMFILE_PRIVATE(_MEM, RES_LIMIT), |
4671 | .write_string = mem_cgroup_write, | 4574 | .write_string = mem_cgroup_write, |
4672 | .read_u64 = mem_cgroup_read, | 4575 | .read = mem_cgroup_read, |
4673 | }, | 4576 | }, |
4674 | { | 4577 | { |
4675 | .name = "soft_limit_in_bytes", | 4578 | .name = "soft_limit_in_bytes", |
4676 | .private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT), | 4579 | .private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT), |
4677 | .write_string = mem_cgroup_write, | 4580 | .write_string = mem_cgroup_write, |
4678 | .read_u64 = mem_cgroup_read, | 4581 | .read = mem_cgroup_read, |
4679 | }, | 4582 | }, |
4680 | { | 4583 | { |
4681 | .name = "failcnt", | 4584 | .name = "failcnt", |
4682 | .private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT), | 4585 | .private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT), |
4683 | .trigger = mem_cgroup_reset, | 4586 | .trigger = mem_cgroup_reset, |
4684 | .read_u64 = mem_cgroup_read, | 4587 | .read = mem_cgroup_read, |
4685 | }, | 4588 | }, |
4686 | { | 4589 | { |
4687 | .name = "stat", | 4590 | .name = "stat", |
4688 | .read_map = mem_control_stat_show, | 4591 | .read_seq_string = mem_control_stat_show, |
4689 | }, | 4592 | }, |
4690 | { | 4593 | { |
4691 | .name = "force_empty", | 4594 | .name = "force_empty", |
@@ -4717,18 +4620,14 @@ static struct cftype mem_cgroup_files[] = { | |||
4717 | #ifdef CONFIG_NUMA | 4620 | #ifdef CONFIG_NUMA |
4718 | { | 4621 | { |
4719 | .name = "numa_stat", | 4622 | .name = "numa_stat", |
4720 | .open = mem_control_numa_stat_open, | 4623 | .read_seq_string = mem_control_numa_stat_show, |
4721 | .mode = S_IRUGO, | ||
4722 | }, | 4624 | }, |
4723 | #endif | 4625 | #endif |
4724 | }; | ||
4725 | |||
4726 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP | 4626 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP |
4727 | static struct cftype memsw_cgroup_files[] = { | ||
4728 | { | 4627 | { |
4729 | .name = "memsw.usage_in_bytes", | 4628 | .name = "memsw.usage_in_bytes", |
4730 | .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE), | 4629 | .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE), |
4731 | .read_u64 = mem_cgroup_read, | 4630 | .read = mem_cgroup_read, |
4732 | .register_event = mem_cgroup_usage_register_event, | 4631 | .register_event = mem_cgroup_usage_register_event, |
4733 | .unregister_event = mem_cgroup_usage_unregister_event, | 4632 | .unregister_event = mem_cgroup_usage_unregister_event, |
4734 | }, | 4633 | }, |
@@ -4736,41 +4635,28 @@ static struct cftype memsw_cgroup_files[] = { | |||
4736 | .name = "memsw.max_usage_in_bytes", | 4635 | .name = "memsw.max_usage_in_bytes", |
4737 | .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE), | 4636 | .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE), |
4738 | .trigger = mem_cgroup_reset, | 4637 | .trigger = mem_cgroup_reset, |
4739 | .read_u64 = mem_cgroup_read, | 4638 | .read = mem_cgroup_read, |
4740 | }, | 4639 | }, |
4741 | { | 4640 | { |
4742 | .name = "memsw.limit_in_bytes", | 4641 | .name = "memsw.limit_in_bytes", |
4743 | .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT), | 4642 | .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT), |
4744 | .write_string = mem_cgroup_write, | 4643 | .write_string = mem_cgroup_write, |
4745 | .read_u64 = mem_cgroup_read, | 4644 | .read = mem_cgroup_read, |
4746 | }, | 4645 | }, |
4747 | { | 4646 | { |
4748 | .name = "memsw.failcnt", | 4647 | .name = "memsw.failcnt", |
4749 | .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT), | 4648 | .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT), |
4750 | .trigger = mem_cgroup_reset, | 4649 | .trigger = mem_cgroup_reset, |
4751 | .read_u64 = mem_cgroup_read, | 4650 | .read = mem_cgroup_read, |
4752 | }, | 4651 | }, |
4753 | }; | ||
4754 | |||
4755 | static int register_memsw_files(struct cgroup *cont, struct cgroup_subsys *ss) | ||
4756 | { | ||
4757 | if (!do_swap_account) | ||
4758 | return 0; | ||
4759 | return cgroup_add_files(cont, ss, memsw_cgroup_files, | ||
4760 | ARRAY_SIZE(memsw_cgroup_files)); | ||
4761 | }; | ||
4762 | #else | ||
4763 | static int register_memsw_files(struct cgroup *cont, struct cgroup_subsys *ss) | ||
4764 | { | ||
4765 | return 0; | ||
4766 | } | ||
4767 | #endif | 4652 | #endif |
4653 | { }, /* terminate */ | ||
4654 | }; | ||
4768 | 4655 | ||
4769 | static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node) | 4656 | static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node) |
4770 | { | 4657 | { |
4771 | struct mem_cgroup_per_node *pn; | 4658 | struct mem_cgroup_per_node *pn; |
4772 | struct mem_cgroup_per_zone *mz; | 4659 | struct mem_cgroup_per_zone *mz; |
4773 | enum lru_list lru; | ||
4774 | int zone, tmp = node; | 4660 | int zone, tmp = node; |
4775 | /* | 4661 | /* |
4776 | * This routine is called against possible nodes. | 4662 | * This routine is called against possible nodes. |
@@ -4788,8 +4674,7 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node) | |||
4788 | 4674 | ||
4789 | for (zone = 0; zone < MAX_NR_ZONES; zone++) { | 4675 | for (zone = 0; zone < MAX_NR_ZONES; zone++) { |
4790 | mz = &pn->zoneinfo[zone]; | 4676 | mz = &pn->zoneinfo[zone]; |
4791 | for_each_lru(lru) | 4677 | lruvec_init(&mz->lruvec, &NODE_DATA(node)->node_zones[zone]); |
4792 | INIT_LIST_HEAD(&mz->lruvec.lists[lru]); | ||
4793 | mz->usage_in_excess = 0; | 4678 | mz->usage_in_excess = 0; |
4794 | mz->on_tree = false; | 4679 | mz->on_tree = false; |
4795 | mz->memcg = memcg; | 4680 | mz->memcg = memcg; |
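lruvec_init() replaces the open-coded list initialization removed above. A sketch of what that loop used to do; the new helper may also record the owning zone, which would explain the extra zone argument, but treat that detail as an assumption since the helper body is not part of this diff:

	static void init_lruvec_lists(struct lruvec *lruvec)
	{
		enum lru_list lru;

		for_each_lru(lru)
			INIT_LIST_HEAD(&lruvec->lists[lru]);
	}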
@@ -4832,23 +4717,40 @@ out_free: | |||
4832 | } | 4717 | } |
4833 | 4718 | ||
4834 | /* | 4719 | /* |
4835 | * Helpers for freeing a vzalloc()ed mem_cgroup by RCU, | 4720 | * Helpers for freeing a kmalloc()ed/vzalloc()ed mem_cgroup by RCU, |
4836 | * but in process context. The work_freeing structure is overlaid | 4721 | * but in process context. The work_freeing structure is overlaid |
4837 | * on the rcu_freeing structure, which itself is overlaid on memsw. | 4722 | * on the rcu_freeing structure, which itself is overlaid on memsw. |
4838 | */ | 4723 | */ |
4839 | static void vfree_work(struct work_struct *work) | 4724 | static void free_work(struct work_struct *work) |
4840 | { | 4725 | { |
4841 | struct mem_cgroup *memcg; | 4726 | struct mem_cgroup *memcg; |
4727 | int size = sizeof(struct mem_cgroup); | ||
4842 | 4728 | ||
4843 | memcg = container_of(work, struct mem_cgroup, work_freeing); | 4729 | memcg = container_of(work, struct mem_cgroup, work_freeing); |
4844 | vfree(memcg); | 4730 | /* |
4731 | * We need to make sure that (at least for now), the jump label | ||
4732 | * destruction code runs outside of the cgroup lock. This is because | ||
4733 | * get_online_cpus(), which is called from the static_branch update, | ||
4734 | * can't be called inside the cgroup_lock. cpusets are the ones | ||
4735 | * enforcing this dependency, so if they ever change, we might as well. | ||
4736 | * | ||
4737 | * schedule_work() will guarantee this happens. Be careful if you need | ||
4738 | * to move this code around, and make sure it is outside | ||
4739 | * the cgroup_lock. | ||
4740 | */ | ||
4741 | disarm_sock_keys(memcg); | ||
4742 | if (size < PAGE_SIZE) | ||
4743 | kfree(memcg); | ||
4744 | else | ||
4745 | vfree(memcg); | ||
4845 | } | 4746 | } |
4846 | static void vfree_rcu(struct rcu_head *rcu_head) | 4747 | |
4748 | static void free_rcu(struct rcu_head *rcu_head) | ||
4847 | { | 4749 | { |
4848 | struct mem_cgroup *memcg; | 4750 | struct mem_cgroup *memcg; |
4849 | 4751 | ||
4850 | memcg = container_of(rcu_head, struct mem_cgroup, rcu_freeing); | 4752 | memcg = container_of(rcu_head, struct mem_cgroup, rcu_freeing); |
4851 | INIT_WORK(&memcg->work_freeing, vfree_work); | 4753 | INIT_WORK(&memcg->work_freeing, free_work); |
4852 | schedule_work(&memcg->work_freeing); | 4754 | schedule_work(&memcg->work_freeing); |
4853 | } | 4755 | } |
4854 | 4756 | ||
@@ -4874,10 +4776,7 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg) | |||
4874 | free_mem_cgroup_per_zone_info(memcg, node); | 4776 | free_mem_cgroup_per_zone_info(memcg, node); |
4875 | 4777 | ||
4876 | free_percpu(memcg->stat); | 4778 | free_percpu(memcg->stat); |
4877 | if (sizeof(struct mem_cgroup) < PAGE_SIZE) | 4779 | call_rcu(&memcg->rcu_freeing, free_rcu); |
4878 | kfree_rcu(memcg, rcu_freeing); | ||
4879 | else | ||
4880 | call_rcu(&memcg->rcu_freeing, vfree_rcu); | ||
4881 | } | 4780 | } |
4882 | 4781 | ||
4883 | static void mem_cgroup_get(struct mem_cgroup *memcg) | 4782 | static void mem_cgroup_get(struct mem_cgroup *memcg) |
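free_work() now has to mirror the allocation policy. The matching allocator is not part of this hunk, but the updated comment ("kmalloc()ed/vzalloc()ed") implies a size test like the sketch below on the allocation side, which is why the same sizeof comparison picks kfree() versus vfree() here:

	/* Assumed allocation-side counterpart (not shown in this diff). */
	static struct mem_cgroup *mem_cgroup_alloc_sketch(void)
	{
		int size = sizeof(struct mem_cgroup);

		if (size < PAGE_SIZE)
			return kzalloc(size, GFP_KERNEL);
		return vzalloc(size);
	}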
@@ -5016,6 +4915,17 @@ mem_cgroup_create(struct cgroup *cont) | |||
5016 | memcg->move_charge_at_immigrate = 0; | 4915 | memcg->move_charge_at_immigrate = 0; |
5017 | mutex_init(&memcg->thresholds_lock); | 4916 | mutex_init(&memcg->thresholds_lock); |
5018 | spin_lock_init(&memcg->move_lock); | 4917 | spin_lock_init(&memcg->move_lock); |
4918 | |||
4919 | error = memcg_init_kmem(memcg, &mem_cgroup_subsys); | ||
4920 | if (error) { | ||
4921 | /* | ||
4922 | * We call put now because our (and parent's) refcnts | ||
4923 | * are already in place. mem_cgroup_put() will internally | ||
4924 | * call __mem_cgroup_free, so return directly | ||
4925 | */ | ||
4926 | mem_cgroup_put(memcg); | ||
4927 | return ERR_PTR(error); | ||
4928 | } | ||
5019 | return &memcg->css; | 4929 | return &memcg->css; |
5020 | free_out: | 4930 | free_out: |
5021 | __mem_cgroup_free(memcg); | 4931 | __mem_cgroup_free(memcg); |
@@ -5033,28 +4943,11 @@ static void mem_cgroup_destroy(struct cgroup *cont) | |||
5033 | { | 4943 | { |
5034 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); | 4944 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); |
5035 | 4945 | ||
5036 | kmem_cgroup_destroy(cont); | 4946 | kmem_cgroup_destroy(memcg); |
5037 | 4947 | ||
5038 | mem_cgroup_put(memcg); | 4948 | mem_cgroup_put(memcg); |
5039 | } | 4949 | } |
5040 | 4950 | ||
5041 | static int mem_cgroup_populate(struct cgroup_subsys *ss, | ||
5042 | struct cgroup *cont) | ||
5043 | { | ||
5044 | int ret; | ||
5045 | |||
5046 | ret = cgroup_add_files(cont, ss, mem_cgroup_files, | ||
5047 | ARRAY_SIZE(mem_cgroup_files)); | ||
5048 | |||
5049 | if (!ret) | ||
5050 | ret = register_memsw_files(cont, ss); | ||
5051 | |||
5052 | if (!ret) | ||
5053 | ret = register_kmem_files(cont, ss); | ||
5054 | |||
5055 | return ret; | ||
5056 | } | ||
5057 | |||
5058 | #ifdef CONFIG_MMU | 4951 | #ifdef CONFIG_MMU |
5059 | /* Handlers for move charge at task migration. */ | 4952 | /* Handlers for move charge at task migration. */ |
5060 | #define PRECHARGE_COUNT_AT_ONCE 256 | 4953 | #define PRECHARGE_COUNT_AT_ONCE 256 |
@@ -5147,7 +5040,7 @@ static struct page *mc_handle_present_pte(struct vm_area_struct *vma, | |||
5147 | return NULL; | 5040 | return NULL; |
5148 | if (PageAnon(page)) { | 5041 | if (PageAnon(page)) { |
5149 | /* we don't move shared anon */ | 5042 | /* we don't move shared anon */ |
5150 | if (!move_anon() || page_mapcount(page) > 2) | 5043 | if (!move_anon()) |
5151 | return NULL; | 5044 | return NULL; |
5152 | } else if (!move_file()) | 5045 | } else if (!move_file()) |
5153 | /* we ignore mapcount for file pages */ | 5046 | /* we ignore mapcount for file pages */ |
@@ -5158,32 +5051,37 @@ static struct page *mc_handle_present_pte(struct vm_area_struct *vma, | |||
5158 | return page; | 5051 | return page; |
5159 | } | 5052 | } |
5160 | 5053 | ||
5054 | #ifdef CONFIG_SWAP | ||
5161 | static struct page *mc_handle_swap_pte(struct vm_area_struct *vma, | 5055 | static struct page *mc_handle_swap_pte(struct vm_area_struct *vma, |
5162 | unsigned long addr, pte_t ptent, swp_entry_t *entry) | 5056 | unsigned long addr, pte_t ptent, swp_entry_t *entry) |
5163 | { | 5057 | { |
5164 | int usage_count; | ||
5165 | struct page *page = NULL; | 5058 | struct page *page = NULL; |
5166 | swp_entry_t ent = pte_to_swp_entry(ptent); | 5059 | swp_entry_t ent = pte_to_swp_entry(ptent); |
5167 | 5060 | ||
5168 | if (!move_anon() || non_swap_entry(ent)) | 5061 | if (!move_anon() || non_swap_entry(ent)) |
5169 | return NULL; | 5062 | return NULL; |
5170 | usage_count = mem_cgroup_count_swap_user(ent, &page); | 5063 | /* |
5171 | if (usage_count > 1) { /* we don't move shared anon */ | 5064 | * Because lookup_swap_cache() updates some statistics counter, |
5172 | if (page) | 5065 | * we call find_get_page() with swapper_space directly. |
5173 | put_page(page); | 5066 | */ |
5174 | return NULL; | 5067 | page = find_get_page(&swapper_space, ent.val); |
5175 | } | ||
5176 | if (do_swap_account) | 5068 | if (do_swap_account) |
5177 | entry->val = ent.val; | 5069 | entry->val = ent.val; |
5178 | 5070 | ||
5179 | return page; | 5071 | return page; |
5180 | } | 5072 | } |
5073 | #else | ||
5074 | static struct page *mc_handle_swap_pte(struct vm_area_struct *vma, | ||
5075 | unsigned long addr, pte_t ptent, swp_entry_t *entry) | ||
5076 | { | ||
5077 | return NULL; | ||
5078 | } | ||
5079 | #endif | ||
5181 | 5080 | ||
5182 | static struct page *mc_handle_file_pte(struct vm_area_struct *vma, | 5081 | static struct page *mc_handle_file_pte(struct vm_area_struct *vma, |
5183 | unsigned long addr, pte_t ptent, swp_entry_t *entry) | 5082 | unsigned long addr, pte_t ptent, swp_entry_t *entry) |
5184 | { | 5083 | { |
5185 | struct page *page = NULL; | 5084 | struct page *page = NULL; |
5186 | struct inode *inode; | ||
5187 | struct address_space *mapping; | 5085 | struct address_space *mapping; |
5188 | pgoff_t pgoff; | 5086 | pgoff_t pgoff; |
5189 | 5087 | ||
@@ -5192,7 +5090,6 @@ static struct page *mc_handle_file_pte(struct vm_area_struct *vma, | |||
5192 | if (!move_file()) | 5090 | if (!move_file()) |
5193 | return NULL; | 5091 | return NULL; |
5194 | 5092 | ||
5195 | inode = vma->vm_file->f_path.dentry->d_inode; | ||
5196 | mapping = vma->vm_file->f_mapping; | 5093 | mapping = vma->vm_file->f_mapping; |
5197 | if (pte_none(ptent)) | 5094 | if (pte_none(ptent)) |
5198 | pgoff = linear_page_index(vma, addr); | 5095 | pgoff = linear_page_index(vma, addr); |
@@ -5491,8 +5388,7 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd, | |||
5491 | if (!isolate_lru_page(page)) { | 5388 | if (!isolate_lru_page(page)) { |
5492 | pc = lookup_page_cgroup(page); | 5389 | pc = lookup_page_cgroup(page); |
5493 | if (!mem_cgroup_move_account(page, HPAGE_PMD_NR, | 5390 | if (!mem_cgroup_move_account(page, HPAGE_PMD_NR, |
5494 | pc, mc.from, mc.to, | 5391 | pc, mc.from, mc.to)) { |
5495 | false)) { | ||
5496 | mc.precharge -= HPAGE_PMD_NR; | 5392 | mc.precharge -= HPAGE_PMD_NR; |
5497 | mc.moved_charge += HPAGE_PMD_NR; | 5393 | mc.moved_charge += HPAGE_PMD_NR; |
5498 | } | 5394 | } |
@@ -5522,7 +5418,7 @@ retry: | |||
5522 | goto put; | 5418 | goto put; |
5523 | pc = lookup_page_cgroup(page); | 5419 | pc = lookup_page_cgroup(page); |
5524 | if (!mem_cgroup_move_account(page, 1, pc, | 5420 | if (!mem_cgroup_move_account(page, 1, pc, |
5525 | mc.from, mc.to, false)) { | 5421 | mc.from, mc.to)) { |
5526 | mc.precharge--; | 5422 | mc.precharge--; |
5527 | /* we uncharge from mc.from later. */ | 5423 | /* we uncharge from mc.from later. */ |
5528 | mc.moved_charge++; | 5424 | mc.moved_charge++; |
@@ -5533,8 +5429,7 @@ put: /* get_mctgt_type() gets the page */ | |||
5533 | break; | 5429 | break; |
5534 | case MC_TARGET_SWAP: | 5430 | case MC_TARGET_SWAP: |
5535 | ent = target.ent; | 5431 | ent = target.ent; |
5536 | if (!mem_cgroup_move_swap_account(ent, | 5432 | if (!mem_cgroup_move_swap_account(ent, mc.from, mc.to)) { |
5537 | mc.from, mc.to, false)) { | ||
5538 | mc.precharge--; | 5433 | mc.precharge--; |
5539 | /* we fixup refcnts and charges later. */ | 5434 | /* we fixup refcnts and charges later. */ |
5540 | mc.moved_swap++; | 5435 | mc.moved_swap++; |
@@ -5610,7 +5505,6 @@ static void mem_cgroup_move_task(struct cgroup *cont, | |||
5610 | if (mm) { | 5505 | if (mm) { |
5611 | if (mc.to) | 5506 | if (mc.to) |
5612 | mem_cgroup_move_charge(mm); | 5507 | mem_cgroup_move_charge(mm); |
5613 | put_swap_token(mm); | ||
5614 | mmput(mm); | 5508 | mmput(mm); |
5615 | } | 5509 | } |
5616 | if (mc.to) | 5510 | if (mc.to) |
@@ -5638,12 +5532,13 @@ struct cgroup_subsys mem_cgroup_subsys = { | |||
5638 | .create = mem_cgroup_create, | 5532 | .create = mem_cgroup_create, |
5639 | .pre_destroy = mem_cgroup_pre_destroy, | 5533 | .pre_destroy = mem_cgroup_pre_destroy, |
5640 | .destroy = mem_cgroup_destroy, | 5534 | .destroy = mem_cgroup_destroy, |
5641 | .populate = mem_cgroup_populate, | ||
5642 | .can_attach = mem_cgroup_can_attach, | 5535 | .can_attach = mem_cgroup_can_attach, |
5643 | .cancel_attach = mem_cgroup_cancel_attach, | 5536 | .cancel_attach = mem_cgroup_cancel_attach, |
5644 | .attach = mem_cgroup_move_task, | 5537 | .attach = mem_cgroup_move_task, |
5538 | .base_cftypes = mem_cgroup_files, | ||
5645 | .early_init = 0, | 5539 | .early_init = 0, |
5646 | .use_id = 1, | 5540 | .use_id = 1, |
5541 | .__DEPRECATED_clear_css_refs = true, | ||
5647 | }; | 5542 | }; |
5648 | 5543 | ||
5649 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP | 5544 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP |
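With .populate gone, control files are registered declaratively: the cftype array is terminated by the empty "{ }, /* terminate */" entry and handed to the cgroup core through .base_cftypes, which also lets the memsw files sit in the same array behind an #ifdef instead of needing a separate registration helper. A minimal sketch of the resulting shape; the names example_files and example_subsys are invented, only the fields mirror the hunks above:

	static struct cftype example_files[] = {
		{
			.name = "usage_in_bytes",
			.private = MEMFILE_PRIVATE(_MEM, RES_USAGE),
			.read = mem_cgroup_read,
		},
		{ },				/* empty entry terminates the array */
	};

	struct cgroup_subsys example_subsys = {
		.name = "example",
		/* ... create/destroy/attach handlers ... */
		.base_cftypes = example_files,	/* replaces .populate + cgroup_add_files() */
	};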
diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 97cc2733551a..ab1e7145e290 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c | |||
@@ -1388,23 +1388,23 @@ static int get_any_page(struct page *p, unsigned long pfn, int flags) | |||
1388 | */ | 1388 | */ |
1389 | if (!get_page_unless_zero(compound_head(p))) { | 1389 | if (!get_page_unless_zero(compound_head(p))) { |
1390 | if (PageHuge(p)) { | 1390 | if (PageHuge(p)) { |
1391 | pr_info("get_any_page: %#lx free huge page\n", pfn); | 1391 | pr_info("%s: %#lx free huge page\n", __func__, pfn); |
1392 | ret = dequeue_hwpoisoned_huge_page(compound_head(p)); | 1392 | ret = dequeue_hwpoisoned_huge_page(compound_head(p)); |
1393 | } else if (is_free_buddy_page(p)) { | 1393 | } else if (is_free_buddy_page(p)) { |
1394 | pr_info("get_any_page: %#lx free buddy page\n", pfn); | 1394 | pr_info("%s: %#lx free buddy page\n", __func__, pfn); |
1395 | /* Set hwpoison bit while page is still isolated */ | 1395 | /* Set hwpoison bit while page is still isolated */ |
1396 | SetPageHWPoison(p); | 1396 | SetPageHWPoison(p); |
1397 | ret = 0; | 1397 | ret = 0; |
1398 | } else { | 1398 | } else { |
1399 | pr_info("get_any_page: %#lx: unknown zero refcount page type %lx\n", | 1399 | pr_info("%s: %#lx: unknown zero refcount page type %lx\n", |
1400 | pfn, p->flags); | 1400 | __func__, pfn, p->flags); |
1401 | ret = -EIO; | 1401 | ret = -EIO; |
1402 | } | 1402 | } |
1403 | } else { | 1403 | } else { |
1404 | /* Not a free page */ | 1404 | /* Not a free page */ |
1405 | ret = 1; | 1405 | ret = 1; |
1406 | } | 1406 | } |
1407 | unset_migratetype_isolate(p); | 1407 | unset_migratetype_isolate(p, MIGRATE_MOVABLE); |
1408 | unlock_memory_hotplug(); | 1408 | unlock_memory_hotplug(); |
1409 | return ret; | 1409 | return ret; |
1410 | } | 1410 | } |
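The memory-failure.c change is purely message hygiene: the function name is no longer hard-coded into each pr_info() string but supplied through __func__. A minimal userspace analogue of the pattern, with printf() standing in for pr_info():

    #include <stdio.h>

    /* __func__ expands to the enclosing function's name, so renaming the
     * function can never leave a stale name in the log text. */
    static void get_any_page_demo(unsigned long pfn)
    {
            printf("%s: %#lx free buddy page\n", __func__, pfn);
    }

    int main(void)
    {
            get_any_page_demo(0x12345UL);   /* prints "get_any_page_demo: 0x12345 free buddy page" */
            return 0;
    }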
diff --git a/mm/memory.c b/mm/memory.c index 6105f475fa86..2466d1250231 100644 --- a/mm/memory.c +++ b/mm/memory.c | |||
@@ -1225,7 +1225,15 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb, | |||
1225 | next = pmd_addr_end(addr, end); | 1225 | next = pmd_addr_end(addr, end); |
1226 | if (pmd_trans_huge(*pmd)) { | 1226 | if (pmd_trans_huge(*pmd)) { |
1227 | if (next - addr != HPAGE_PMD_SIZE) { | 1227 | if (next - addr != HPAGE_PMD_SIZE) { |
1228 | VM_BUG_ON(!rwsem_is_locked(&tlb->mm->mmap_sem)); | 1228 | #ifdef CONFIG_DEBUG_VM |
1229 | if (!rwsem_is_locked(&tlb->mm->mmap_sem)) { | ||
1230 | pr_err("%s: mmap_sem is unlocked! addr=0x%lx end=0x%lx vma->vm_start=0x%lx vma->vm_end=0x%lx\n", | ||
1231 | __func__, addr, end, | ||
1232 | vma->vm_start, | ||
1233 | vma->vm_end); | ||
1234 | BUG(); | ||
1235 | } | ||
1236 | #endif | ||
1229 | split_huge_page_pmd(vma->vm_mm, pmd); | 1237 | split_huge_page_pmd(vma->vm_mm, pmd); |
1230 | } else if (zap_huge_pmd(tlb, vma, pmd, addr)) | 1238 | } else if (zap_huge_pmd(tlb, vma, pmd, addr)) |
1231 | goto next; | 1239 | goto next; |
@@ -1295,7 +1303,7 @@ static void unmap_page_range(struct mmu_gather *tlb, | |||
1295 | 1303 | ||
1296 | static void unmap_single_vma(struct mmu_gather *tlb, | 1304 | static void unmap_single_vma(struct mmu_gather *tlb, |
1297 | struct vm_area_struct *vma, unsigned long start_addr, | 1305 | struct vm_area_struct *vma, unsigned long start_addr, |
1298 | unsigned long end_addr, unsigned long *nr_accounted, | 1306 | unsigned long end_addr, |
1299 | struct zap_details *details) | 1307 | struct zap_details *details) |
1300 | { | 1308 | { |
1301 | unsigned long start = max(vma->vm_start, start_addr); | 1309 | unsigned long start = max(vma->vm_start, start_addr); |
@@ -1307,8 +1315,8 @@ static void unmap_single_vma(struct mmu_gather *tlb, | |||
1307 | if (end <= vma->vm_start) | 1315 | if (end <= vma->vm_start) |
1308 | return; | 1316 | return; |
1309 | 1317 | ||
1310 | if (vma->vm_flags & VM_ACCOUNT) | 1318 | if (vma->vm_file) |
1311 | *nr_accounted += (end - start) >> PAGE_SHIFT; | 1319 | uprobe_munmap(vma, start, end); |
1312 | 1320 | ||
1313 | if (unlikely(is_pfn_mapping(vma))) | 1321 | if (unlikely(is_pfn_mapping(vma))) |
1314 | untrack_pfn_vma(vma, 0, 0); | 1322 | untrack_pfn_vma(vma, 0, 0); |
@@ -1339,8 +1347,6 @@ static void unmap_single_vma(struct mmu_gather *tlb, | |||
1339 | * @vma: the starting vma | 1347 | * @vma: the starting vma |
1340 | * @start_addr: virtual address at which to start unmapping | 1348 | * @start_addr: virtual address at which to start unmapping |
1341 | * @end_addr: virtual address at which to end unmapping | 1349 | * @end_addr: virtual address at which to end unmapping |
1342 | * @nr_accounted: Place number of unmapped pages in vm-accountable vma's here | ||
1343 | * @details: details of nonlinear truncation or shared cache invalidation | ||
1344 | * | 1350 | * |
1345 | * Unmap all pages in the vma list. | 1351 | * Unmap all pages in the vma list. |
1346 | * | 1352 | * |
@@ -1355,40 +1361,40 @@ static void unmap_single_vma(struct mmu_gather *tlb, | |||
1355 | */ | 1361 | */ |
1356 | void unmap_vmas(struct mmu_gather *tlb, | 1362 | void unmap_vmas(struct mmu_gather *tlb, |
1357 | struct vm_area_struct *vma, unsigned long start_addr, | 1363 | struct vm_area_struct *vma, unsigned long start_addr, |
1358 | unsigned long end_addr, unsigned long *nr_accounted, | 1364 | unsigned long end_addr) |
1359 | struct zap_details *details) | ||
1360 | { | 1365 | { |
1361 | struct mm_struct *mm = vma->vm_mm; | 1366 | struct mm_struct *mm = vma->vm_mm; |
1362 | 1367 | ||
1363 | mmu_notifier_invalidate_range_start(mm, start_addr, end_addr); | 1368 | mmu_notifier_invalidate_range_start(mm, start_addr, end_addr); |
1364 | for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next) | 1369 | for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next) |
1365 | unmap_single_vma(tlb, vma, start_addr, end_addr, nr_accounted, | 1370 | unmap_single_vma(tlb, vma, start_addr, end_addr, NULL); |
1366 | details); | ||
1367 | mmu_notifier_invalidate_range_end(mm, start_addr, end_addr); | 1371 | mmu_notifier_invalidate_range_end(mm, start_addr, end_addr); |
1368 | } | 1372 | } |
1369 | 1373 | ||
1370 | /** | 1374 | /** |
1371 | * zap_page_range - remove user pages in a given range | 1375 | * zap_page_range - remove user pages in a given range |
1372 | * @vma: vm_area_struct holding the applicable pages | 1376 | * @vma: vm_area_struct holding the applicable pages |
1373 | * @address: starting address of pages to zap | 1377 | * @start: starting address of pages to zap |
1374 | * @size: number of bytes to zap | 1378 | * @size: number of bytes to zap |
1375 | * @details: details of nonlinear truncation or shared cache invalidation | 1379 | * @details: details of nonlinear truncation or shared cache invalidation |
1376 | * | 1380 | * |
1377 | * Caller must protect the VMA list | 1381 | * Caller must protect the VMA list |
1378 | */ | 1382 | */ |
1379 | void zap_page_range(struct vm_area_struct *vma, unsigned long address, | 1383 | void zap_page_range(struct vm_area_struct *vma, unsigned long start, |
1380 | unsigned long size, struct zap_details *details) | 1384 | unsigned long size, struct zap_details *details) |
1381 | { | 1385 | { |
1382 | struct mm_struct *mm = vma->vm_mm; | 1386 | struct mm_struct *mm = vma->vm_mm; |
1383 | struct mmu_gather tlb; | 1387 | struct mmu_gather tlb; |
1384 | unsigned long end = address + size; | 1388 | unsigned long end = start + size; |
1385 | unsigned long nr_accounted = 0; | ||
1386 | 1389 | ||
1387 | lru_add_drain(); | 1390 | lru_add_drain(); |
1388 | tlb_gather_mmu(&tlb, mm, 0); | 1391 | tlb_gather_mmu(&tlb, mm, 0); |
1389 | update_hiwater_rss(mm); | 1392 | update_hiwater_rss(mm); |
1390 | unmap_vmas(&tlb, vma, address, end, &nr_accounted, details); | 1393 | mmu_notifier_invalidate_range_start(mm, start, end); |
1391 | tlb_finish_mmu(&tlb, address, end); | 1394 | for ( ; vma && vma->vm_start < end; vma = vma->vm_next) |
1395 | unmap_single_vma(&tlb, vma, start, end, details); | ||
1396 | mmu_notifier_invalidate_range_end(mm, start, end); | ||
1397 | tlb_finish_mmu(&tlb, start, end); | ||
1392 | } | 1398 | } |
1393 | 1399 | ||
1394 | /** | 1400 | /** |
@@ -1406,13 +1412,12 @@ static void zap_page_range_single(struct vm_area_struct *vma, unsigned long addr | |||
1406 | struct mm_struct *mm = vma->vm_mm; | 1412 | struct mm_struct *mm = vma->vm_mm; |
1407 | struct mmu_gather tlb; | 1413 | struct mmu_gather tlb; |
1408 | unsigned long end = address + size; | 1414 | unsigned long end = address + size; |
1409 | unsigned long nr_accounted = 0; | ||
1410 | 1415 | ||
1411 | lru_add_drain(); | 1416 | lru_add_drain(); |
1412 | tlb_gather_mmu(&tlb, mm, 0); | 1417 | tlb_gather_mmu(&tlb, mm, 0); |
1413 | update_hiwater_rss(mm); | 1418 | update_hiwater_rss(mm); |
1414 | mmu_notifier_invalidate_range_start(mm, address, end); | 1419 | mmu_notifier_invalidate_range_start(mm, address, end); |
1415 | unmap_single_vma(&tlb, vma, address, end, &nr_accounted, details); | 1420 | unmap_single_vma(&tlb, vma, address, end, details); |
1416 | mmu_notifier_invalidate_range_end(mm, address, end); | 1421 | mmu_notifier_invalidate_range_end(mm, address, end); |
1417 | tlb_finish_mmu(&tlb, address, end); | 1422 | tlb_finish_mmu(&tlb, address, end); |
1418 | } | 1423 | } |
@@ -2911,7 +2916,6 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2911 | delayacct_set_flag(DELAYACCT_PF_SWAPIN); | 2916 | delayacct_set_flag(DELAYACCT_PF_SWAPIN); |
2912 | page = lookup_swap_cache(entry); | 2917 | page = lookup_swap_cache(entry); |
2913 | if (!page) { | 2918 | if (!page) { |
2914 | grab_swap_token(mm); /* Contend for token _before_ read-in */ | ||
2915 | page = swapin_readahead(entry, | 2919 | page = swapin_readahead(entry, |
2916 | GFP_HIGHUSER_MOVABLE, vma, address); | 2920 | GFP_HIGHUSER_MOVABLE, vma, address); |
2917 | if (!page) { | 2921 | if (!page) { |
@@ -2941,6 +2945,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2941 | } | 2945 | } |
2942 | 2946 | ||
2943 | locked = lock_page_or_retry(page, mm, flags); | 2947 | locked = lock_page_or_retry(page, mm, flags); |
2948 | |||
2944 | delayacct_clear_flag(DELAYACCT_PF_SWAPIN); | 2949 | delayacct_clear_flag(DELAYACCT_PF_SWAPIN); |
2945 | if (!locked) { | 2950 | if (!locked) { |
2946 | ret |= VM_FAULT_RETRY; | 2951 | ret |= VM_FAULT_RETRY; |
@@ -3489,6 +3494,7 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
3489 | if (unlikely(is_vm_hugetlb_page(vma))) | 3494 | if (unlikely(is_vm_hugetlb_page(vma))) |
3490 | return hugetlb_fault(mm, vma, address, flags); | 3495 | return hugetlb_fault(mm, vma, address, flags); |
3491 | 3496 | ||
3497 | retry: | ||
3492 | pgd = pgd_offset(mm, address); | 3498 | pgd = pgd_offset(mm, address); |
3493 | pud = pud_alloc(mm, pgd, address); | 3499 | pud = pud_alloc(mm, pgd, address); |
3494 | if (!pud) | 3500 | if (!pud) |
@@ -3502,13 +3508,24 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
3502 | pmd, flags); | 3508 | pmd, flags); |
3503 | } else { | 3509 | } else { |
3504 | pmd_t orig_pmd = *pmd; | 3510 | pmd_t orig_pmd = *pmd; |
3511 | int ret; | ||
3512 | |||
3505 | barrier(); | 3513 | barrier(); |
3506 | if (pmd_trans_huge(orig_pmd)) { | 3514 | if (pmd_trans_huge(orig_pmd)) { |
3507 | if (flags & FAULT_FLAG_WRITE && | 3515 | if (flags & FAULT_FLAG_WRITE && |
3508 | !pmd_write(orig_pmd) && | 3516 | !pmd_write(orig_pmd) && |
3509 | !pmd_trans_splitting(orig_pmd)) | 3517 | !pmd_trans_splitting(orig_pmd)) { |
3510 | return do_huge_pmd_wp_page(mm, vma, address, | 3518 | ret = do_huge_pmd_wp_page(mm, vma, address, pmd, |
3511 | pmd, orig_pmd); | 3519 | orig_pmd); |
3520 | /* | ||
3521 | * If COW results in an oom, the huge pmd will | ||
3522 | * have been split, so retry the fault on the | ||
3523 | * pte for a smaller charge. | ||
3524 | */ | ||
3525 | if (unlikely(ret & VM_FAULT_OOM)) | ||
3526 | goto retry; | ||
3527 | return ret; | ||
3528 | } | ||
3512 | return 0; | 3529 | return 0; |
3513 | } | 3530 | } |
3514 | } | 3531 | } |
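The memory.c hunks slim down the teardown path: unmap_vmas() loses both the nr_accounted and zap_details arguments, and zap_page_range() now walks the vma list itself, with its second parameter renamed to start. A sketch of how callers look after this change (function names below are invented; the accounting that used to ride through nr_accounted now happens in the mmap.c callers shown further down):

    #include <linux/mm.h>

    /* Exit-style teardown: unmap every vma on the list.  VM_ACCOUNT
     * bookkeeping is now the caller's job (see the exit_mmap() hunk). */
    static void teardown_sketch(struct mmu_gather *tlb,
                                struct vm_area_struct *vma)
    {
            unmap_vmas(tlb, vma, 0, -1);
    }

    /* The external zap_page_range() signature is otherwise unchanged. */
    static void drop_first_page_sketch(struct vm_area_struct *vma)
    {
            zap_page_range(vma, vma->vm_start, PAGE_SIZE, NULL);
    }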
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 6629fafd6ce4..427bb291dd0f 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c | |||
@@ -74,8 +74,7 @@ static struct resource *register_memory_resource(u64 start, u64 size) | |||
74 | res->end = start + size - 1; | 74 | res->end = start + size - 1; |
75 | res->flags = IORESOURCE_MEM | IORESOURCE_BUSY; | 75 | res->flags = IORESOURCE_MEM | IORESOURCE_BUSY; |
76 | if (request_resource(&iomem_resource, res) < 0) { | 76 | if (request_resource(&iomem_resource, res) < 0) { |
77 | printk("System RAM resource %llx - %llx cannot be added\n", | 77 | printk("System RAM resource %pR cannot be added\n", res); |
78 | (unsigned long long)res->start, (unsigned long long)res->end); | ||
79 | kfree(res); | 78 | kfree(res); |
80 | res = NULL; | 79 | res = NULL; |
81 | } | 80 | } |
@@ -502,8 +501,10 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages) | |||
502 | online_pages_range); | 501 | online_pages_range); |
503 | if (ret) { | 502 | if (ret) { |
504 | mutex_unlock(&zonelists_mutex); | 503 | mutex_unlock(&zonelists_mutex); |
505 | printk(KERN_DEBUG "online_pages %lx at %lx failed\n", | 504 | printk(KERN_DEBUG "online_pages [mem %#010llx-%#010llx] failed\n", |
506 | nr_pages, pfn); | 505 | (unsigned long long) pfn << PAGE_SHIFT, |
506 | (((unsigned long long) pfn + nr_pages) | ||
507 | << PAGE_SHIFT) - 1); | ||
507 | memory_notify(MEM_CANCEL_ONLINE, &arg); | 508 | memory_notify(MEM_CANCEL_ONLINE, &arg); |
508 | unlock_memory_hotplug(); | 509 | unlock_memory_hotplug(); |
509 | return ret; | 510 | return ret; |
@@ -617,7 +618,7 @@ int __ref add_memory(int nid, u64 start, u64 size) | |||
617 | pgdat = hotadd_new_pgdat(nid, start); | 618 | pgdat = hotadd_new_pgdat(nid, start); |
618 | ret = -ENOMEM; | 619 | ret = -ENOMEM; |
619 | if (!pgdat) | 620 | if (!pgdat) |
620 | goto out; | 621 | goto error; |
621 | new_pgdat = 1; | 622 | new_pgdat = 1; |
622 | } | 623 | } |
623 | 624 | ||
@@ -891,7 +892,7 @@ static int __ref offline_pages(unsigned long start_pfn, | |||
891 | nr_pages = end_pfn - start_pfn; | 892 | nr_pages = end_pfn - start_pfn; |
892 | 893 | ||
893 | /* set above range as isolated */ | 894 | /* set above range as isolated */ |
894 | ret = start_isolate_page_range(start_pfn, end_pfn); | 895 | ret = start_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE); |
895 | if (ret) | 896 | if (ret) |
896 | goto out; | 897 | goto out; |
897 | 898 | ||
@@ -956,7 +957,7 @@ repeat: | |||
956 | We cannot do rollback at this point. */ | 957 | We cannot do rollback at this point. */ |
957 | offline_isolated_pages(start_pfn, end_pfn); | 958 | offline_isolated_pages(start_pfn, end_pfn); |
958 | /* reset pagetype flags and makes migrate type to be MOVABLE */ | 959 | /* reset pagetype flags and makes migrate type to be MOVABLE */ |
959 | undo_isolate_page_range(start_pfn, end_pfn); | 960 | undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE); |
960 | /* removal success */ | 961 | /* removal success */ |
961 | zone->present_pages -= offlined_pages; | 962 | zone->present_pages -= offlined_pages; |
962 | zone->zone_pgdat->node_present_pages -= offlined_pages; | 963 | zone->zone_pgdat->node_present_pages -= offlined_pages; |
@@ -977,11 +978,12 @@ repeat: | |||
977 | return 0; | 978 | return 0; |
978 | 979 | ||
979 | failed_removal: | 980 | failed_removal: |
980 | printk(KERN_INFO "memory offlining %lx to %lx failed\n", | 981 | printk(KERN_INFO "memory offlining [mem %#010llx-%#010llx] failed\n", |
981 | start_pfn, end_pfn); | 982 | (unsigned long long) start_pfn << PAGE_SHIFT, |
983 | ((unsigned long long) end_pfn << PAGE_SHIFT) - 1); | ||
982 | memory_notify(MEM_CANCEL_OFFLINE, &arg); | 984 | memory_notify(MEM_CANCEL_OFFLINE, &arg); |
983 | /* pushback to free area */ | 985 | /* pushback to free area */ |
984 | undo_isolate_page_range(start_pfn, end_pfn); | 986 | undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE); |
985 | 987 | ||
986 | out: | 988 | out: |
987 | unlock_memory_hotplug(); | 989 | unlock_memory_hotplug(); |
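The memory_hotplug.c printk changes adopt %pR for struct resource and the "[mem %#010llx-%#010llx]" physical-range style, where the printed end address is inclusive. A small userspace model of the pfn-to-byte-range arithmetic, assuming 4 KiB pages for the demo:

    #include <stdio.h>

    #define DEMO_PAGE_SHIFT 12              /* assumed 4 KiB pages */

    int main(void)
    {
            unsigned long long start_pfn = 0x10000ULL;
            unsigned long long nr_pages  = 0x8000ULL;
            unsigned long long end_pfn   = start_pfn + nr_pages;

            /* same formula as the failed-offline message: shift pfns to
             * bytes, then subtract one so the end is inclusive */
            printf("memory offlining [mem %#010llx-%#010llx] failed\n",
                   start_pfn << DEMO_PAGE_SHIFT,
                   (end_pfn << DEMO_PAGE_SHIFT) - 1);
            return 0;
    }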
diff --git a/mm/mempolicy.c b/mm/mempolicy.c index b19569137529..1d771e4200d2 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c | |||
@@ -390,7 +390,7 @@ static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask, | |||
390 | { | 390 | { |
391 | if (!pol) | 391 | if (!pol) |
392 | return; | 392 | return; |
393 | if (!mpol_store_user_nodemask(pol) && step == 0 && | 393 | if (!mpol_store_user_nodemask(pol) && step == MPOL_REBIND_ONCE && |
394 | nodes_equal(pol->w.cpuset_mems_allowed, *newmask)) | 394 | nodes_equal(pol->w.cpuset_mems_allowed, *newmask)) |
395 | return; | 395 | return; |
396 | 396 | ||
@@ -607,27 +607,6 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end, | |||
607 | return first; | 607 | return first; |
608 | } | 608 | } |
609 | 609 | ||
610 | /* Apply policy to a single VMA */ | ||
611 | static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new) | ||
612 | { | ||
613 | int err = 0; | ||
614 | struct mempolicy *old = vma->vm_policy; | ||
615 | |||
616 | pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n", | ||
617 | vma->vm_start, vma->vm_end, vma->vm_pgoff, | ||
618 | vma->vm_ops, vma->vm_file, | ||
619 | vma->vm_ops ? vma->vm_ops->set_policy : NULL); | ||
620 | |||
621 | if (vma->vm_ops && vma->vm_ops->set_policy) | ||
622 | err = vma->vm_ops->set_policy(vma, new); | ||
623 | if (!err) { | ||
624 | mpol_get(new); | ||
625 | vma->vm_policy = new; | ||
626 | mpol_put(old); | ||
627 | } | ||
628 | return err; | ||
629 | } | ||
630 | |||
631 | /* Step 2: apply policy to a range and do splits. */ | 610 | /* Step 2: apply policy to a range and do splits. */ |
632 | static int mbind_range(struct mm_struct *mm, unsigned long start, | 611 | static int mbind_range(struct mm_struct *mm, unsigned long start, |
633 | unsigned long end, struct mempolicy *new_pol) | 612 | unsigned long end, struct mempolicy *new_pol) |
@@ -676,9 +655,23 @@ static int mbind_range(struct mm_struct *mm, unsigned long start, | |||
676 | if (err) | 655 | if (err) |
677 | goto out; | 656 | goto out; |
678 | } | 657 | } |
679 | err = policy_vma(vma, new_pol); | 658 | |
680 | if (err) | 659 | /* |
681 | goto out; | 660 | * Apply policy to a single VMA. The reference counting of |
661 | * policy for vma_policy linkages has already been handled by | ||
662 | * vma_merge and split_vma as necessary. If this is a shared | ||
663 | * policy then ->set_policy will increment the reference count | ||
664 | * for an sp node. | ||
665 | */ | ||
666 | pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n", | ||
667 | vma->vm_start, vma->vm_end, vma->vm_pgoff, | ||
668 | vma->vm_ops, vma->vm_file, | ||
669 | vma->vm_ops ? vma->vm_ops->set_policy : NULL); | ||
670 | if (vma->vm_ops && vma->vm_ops->set_policy) { | ||
671 | err = vma->vm_ops->set_policy(vma, new_pol); | ||
672 | if (err) | ||
673 | goto out; | ||
674 | } | ||
682 | } | 675 | } |
683 | 676 | ||
684 | out: | 677 | out: |
@@ -957,8 +950,8 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest, | |||
957 | * | 950 | * |
958 | * Returns the number of page that could not be moved. | 951 | * Returns the number of page that could not be moved. |
959 | */ | 952 | */ |
960 | int do_migrate_pages(struct mm_struct *mm, | 953 | int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from, |
961 | const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags) | 954 | const nodemask_t *to, int flags) |
962 | { | 955 | { |
963 | int busy = 0; | 956 | int busy = 0; |
964 | int err; | 957 | int err; |
@@ -970,7 +963,7 @@ int do_migrate_pages(struct mm_struct *mm, | |||
970 | 963 | ||
971 | down_read(&mm->mmap_sem); | 964 | down_read(&mm->mmap_sem); |
972 | 965 | ||
973 | err = migrate_vmas(mm, from_nodes, to_nodes, flags); | 966 | err = migrate_vmas(mm, from, to, flags); |
974 | if (err) | 967 | if (err) |
975 | goto out; | 968 | goto out; |
976 | 969 | ||
@@ -1005,14 +998,34 @@ int do_migrate_pages(struct mm_struct *mm, | |||
1005 | * moved to an empty node, then there is nothing left worth migrating. | 998 | * moved to an empty node, then there is nothing left worth migrating. |
1006 | */ | 999 | */ |
1007 | 1000 | ||
1008 | tmp = *from_nodes; | 1001 | tmp = *from; |
1009 | while (!nodes_empty(tmp)) { | 1002 | while (!nodes_empty(tmp)) { |
1010 | int s,d; | 1003 | int s,d; |
1011 | int source = -1; | 1004 | int source = -1; |
1012 | int dest = 0; | 1005 | int dest = 0; |
1013 | 1006 | ||
1014 | for_each_node_mask(s, tmp) { | 1007 | for_each_node_mask(s, tmp) { |
1015 | d = node_remap(s, *from_nodes, *to_nodes); | 1008 | |
1009 | /* | ||
1010 | * do_migrate_pages() tries to maintain the relative | ||
1011 | * node relationship of the pages established between | ||
1012 | * threads and memory areas. | ||
1013 | * | ||
1014 | * However if the number of source nodes is not equal to | ||
1015 | * the number of destination nodes we can not preserve | ||
1016 | * this node relative relationship. In that case, skip | ||
1017 | * copying memory from a node that is in the destination | ||
1018 | * mask. | ||
1019 | * | ||
1020 | * Example: [2,3,4] -> [3,4,5] moves everything. | ||
1021 | * [0-7] - > [3,4,5] moves only 0,1,2,6,7. | ||
1022 | */ | ||
1023 | |||
1024 | if ((nodes_weight(*from) != nodes_weight(*to)) && | ||
1025 | (node_isset(s, *to))) | ||
1026 | continue; | ||
1027 | |||
1028 | d = node_remap(s, *from, *to); | ||
1016 | if (s == d) | 1029 | if (s == d) |
1017 | continue; | 1030 | continue; |
1018 | 1031 | ||
@@ -1072,8 +1085,8 @@ static void migrate_page_add(struct page *page, struct list_head *pagelist, | |||
1072 | { | 1085 | { |
1073 | } | 1086 | } |
1074 | 1087 | ||
1075 | int do_migrate_pages(struct mm_struct *mm, | 1088 | int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from, |
1076 | const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags) | 1089 | const nodemask_t *to, int flags) |
1077 | { | 1090 | { |
1078 | return -ENOSYS; | 1091 | return -ENOSYS; |
1079 | } | 1092 | } |
@@ -1164,7 +1177,7 @@ static long do_mbind(unsigned long start, unsigned long len, | |||
1164 | if (!list_empty(&pagelist)) { | 1177 | if (!list_empty(&pagelist)) { |
1165 | nr_failed = migrate_pages(&pagelist, new_vma_page, | 1178 | nr_failed = migrate_pages(&pagelist, new_vma_page, |
1166 | (unsigned long)vma, | 1179 | (unsigned long)vma, |
1167 | false, true); | 1180 | false, MIGRATE_SYNC); |
1168 | if (nr_failed) | 1181 | if (nr_failed) |
1169 | putback_lru_pages(&pagelist); | 1182 | putback_lru_pages(&pagelist); |
1170 | } | 1183 | } |
@@ -1334,8 +1347,8 @@ SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode, | |||
1334 | * userid as the target process. | 1347 | * userid as the target process. |
1335 | */ | 1348 | */ |
1336 | tcred = __task_cred(task); | 1349 | tcred = __task_cred(task); |
1337 | if (cred->euid != tcred->suid && cred->euid != tcred->uid && | 1350 | if (!uid_eq(cred->euid, tcred->suid) && !uid_eq(cred->euid, tcred->uid) && |
1338 | cred->uid != tcred->suid && cred->uid != tcred->uid && | 1351 | !uid_eq(cred->uid, tcred->suid) && !uid_eq(cred->uid, tcred->uid) && |
1339 | !capable(CAP_SYS_NICE)) { | 1352 | !capable(CAP_SYS_NICE)) { |
1340 | rcu_read_unlock(); | 1353 | rcu_read_unlock(); |
1341 | err = -EPERM; | 1354 | err = -EPERM; |
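The new loop in do_migrate_pages() skips any source node that already appears in the destination mask whenever the two masks have different weights, as the in-diff comment's "[0-7] -> [3,4,5] moves only 0,1,2,6,7" example describes. A small userspace model of just that skip rule (plain arrays stand in for nodemasks):

    #include <stdio.h>

    int main(void)
    {
            int from[] = { 0, 1, 2, 3, 4, 5, 6, 7 };
            int to[]   = { 3, 4, 5 };
            int nfrom = 8, nto = 3;

            for (int i = 0; i < nfrom; i++) {
                    int s = from[i], skip = 0;

                    if (nfrom != nto)               /* weights differ */
                            for (int j = 0; j < nto; j++)
                                    if (to[j] == s)
                                            skip = 1;
                    if (!skip)
                            printf("migrate from node %d\n", s);
            }
            return 0;       /* prints nodes 0, 1, 2, 6 and 7 */
    }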
diff --git a/mm/migrate.c b/mm/migrate.c index 11072383ae12..be26d5cbe56b 100644 --- a/mm/migrate.c +++ b/mm/migrate.c | |||
@@ -436,7 +436,10 @@ void migrate_page_copy(struct page *newpage, struct page *page) | |||
436 | * is actually a signal that all of the page has become dirty. | 436 | * is actually a signal that all of the page has become dirty. |
437 | * Whereas only part of our page may be dirty. | 437 | * Whereas only part of our page may be dirty. |
438 | */ | 438 | */ |
439 | __set_page_dirty_nobuffers(newpage); | 439 | if (PageSwapBacked(page)) |
440 | SetPageDirty(newpage); | ||
441 | else | ||
442 | __set_page_dirty_nobuffers(newpage); | ||
440 | } | 443 | } |
441 | 444 | ||
442 | mlock_migrate_page(newpage, page); | 445 | mlock_migrate_page(newpage, page); |
@@ -1371,8 +1374,8 @@ SYSCALL_DEFINE6(move_pages, pid_t, pid, unsigned long, nr_pages, | |||
1371 | * userid as the target process. | 1374 | * userid as the target process. |
1372 | */ | 1375 | */ |
1373 | tcred = __task_cred(task); | 1376 | tcred = __task_cred(task); |
1374 | if (cred->euid != tcred->suid && cred->euid != tcred->uid && | 1377 | if (!uid_eq(cred->euid, tcred->suid) && !uid_eq(cred->euid, tcred->uid) && |
1375 | cred->uid != tcred->suid && cred->uid != tcred->uid && | 1378 | !uid_eq(cred->uid, tcred->suid) && !uid_eq(cred->uid, tcred->uid) && |
1376 | !capable(CAP_SYS_NICE)) { | 1379 | !capable(CAP_SYS_NICE)) { |
1377 | rcu_read_unlock(); | 1380 | rcu_read_unlock(); |
1378 | err = -EPERM; | 1381 | err = -EPERM; |
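Both the mempolicy.c and migrate.c permission checks move from raw integer comparisons on the credential uids to uid_eq(), as the kuid_t conversion requires. A condensed sketch of the comparison only; the helper name is invented, and the capable(CAP_SYS_NICE) fallback and RCU locking around __task_cred() are omitted:

    #include <linux/cred.h>
    #include <linux/uidgid.h>

    /* true if "cred" matches the target's real or saved uid with either
     * its real or effective uid -- the same four comparisons as above */
    static bool same_user_sketch(const struct cred *cred,
                                 const struct cred *tcred)
    {
            return uid_eq(cred->euid, tcred->suid) ||
                   uid_eq(cred->euid, tcred->uid)  ||
                   uid_eq(cred->uid,  tcred->suid) ||
                   uid_eq(cred->uid,  tcred->uid);
    }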
diff --git a/mm/mmap.c b/mm/mmap.c --- a/mm/mmap.c +++ b/mm/mmap.c | |||
@@ -30,6 +30,7 @@ | |||
30 | #include <linux/perf_event.h> | 30 | #include <linux/perf_event.h> |
31 | #include <linux/audit.h> | 31 | #include <linux/audit.h> |
32 | #include <linux/khugepaged.h> | 32 | #include <linux/khugepaged.h> |
33 | #include <linux/uprobes.h> | ||
33 | 34 | ||
34 | #include <asm/uaccess.h> | 35 | #include <asm/uaccess.h> |
35 | #include <asm/cacheflush.h> | 36 | #include <asm/cacheflush.h> |
@@ -546,8 +547,15 @@ again: remove_next = 1 + (end > next->vm_end); | |||
546 | 547 | ||
547 | if (file) { | 548 | if (file) { |
548 | mapping = file->f_mapping; | 549 | mapping = file->f_mapping; |
549 | if (!(vma->vm_flags & VM_NONLINEAR)) | 550 | if (!(vma->vm_flags & VM_NONLINEAR)) { |
550 | root = &mapping->i_mmap; | 551 | root = &mapping->i_mmap; |
552 | uprobe_munmap(vma, vma->vm_start, vma->vm_end); | ||
553 | |||
554 | if (adjust_next) | ||
555 | uprobe_munmap(next, next->vm_start, | ||
556 | next->vm_end); | ||
557 | } | ||
558 | |||
551 | mutex_lock(&mapping->i_mmap_mutex); | 559 | mutex_lock(&mapping->i_mmap_mutex); |
552 | if (insert) { | 560 | if (insert) { |
553 | /* | 561 | /* |
@@ -617,8 +625,16 @@ again: remove_next = 1 + (end > next->vm_end); | |||
617 | if (mapping) | 625 | if (mapping) |
618 | mutex_unlock(&mapping->i_mmap_mutex); | 626 | mutex_unlock(&mapping->i_mmap_mutex); |
619 | 627 | ||
628 | if (root) { | ||
629 | uprobe_mmap(vma); | ||
630 | |||
631 | if (adjust_next) | ||
632 | uprobe_mmap(next); | ||
633 | } | ||
634 | |||
620 | if (remove_next) { | 635 | if (remove_next) { |
621 | if (file) { | 636 | if (file) { |
637 | uprobe_munmap(next, next->vm_start, next->vm_end); | ||
622 | fput(file); | 638 | fput(file); |
623 | if (next->vm_flags & VM_EXECUTABLE) | 639 | if (next->vm_flags & VM_EXECUTABLE) |
624 | removed_exe_file_vma(mm); | 640 | removed_exe_file_vma(mm); |
@@ -638,6 +654,8 @@ again: remove_next = 1 + (end > next->vm_end); | |||
638 | goto again; | 654 | goto again; |
639 | } | 655 | } |
640 | } | 656 | } |
657 | if (insert && file) | ||
658 | uprobe_mmap(insert); | ||
641 | 659 | ||
642 | validate_mm(mm); | 660 | validate_mm(mm); |
643 | 661 | ||
@@ -953,15 +971,13 @@ static inline unsigned long round_hint_to_min(unsigned long hint) | |||
953 | * The caller must hold down_write(&current->mm->mmap_sem). | 971 | ||
954 | */ | 972 | */ |
955 | 973 | ||
956 | static unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, | 974 | unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, |
957 | unsigned long len, unsigned long prot, | 975 | unsigned long len, unsigned long prot, |
958 | unsigned long flags, unsigned long pgoff) | 976 | unsigned long flags, unsigned long pgoff) |
959 | { | 977 | { |
960 | struct mm_struct * mm = current->mm; | 978 | struct mm_struct * mm = current->mm; |
961 | struct inode *inode; | 979 | struct inode *inode; |
962 | vm_flags_t vm_flags; | 980 | vm_flags_t vm_flags; |
963 | int error; | ||
964 | unsigned long reqprot = prot; | ||
965 | 981 | ||
966 | /* | 982 | /* |
967 | * Does the application expect PROT_READ to imply PROT_EXEC? | 983 | * Does the application expect PROT_READ to imply PROT_EXEC? |
@@ -1083,39 +1099,9 @@ static unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, | |||
1083 | } | 1099 | } |
1084 | } | 1100 | } |
1085 | 1101 | ||
1086 | error = security_file_mmap(file, reqprot, prot, flags, addr, 0); | ||
1087 | if (error) | ||
1088 | return error; | ||
1089 | |||
1090 | return mmap_region(file, addr, len, flags, vm_flags, pgoff); | 1102 | return mmap_region(file, addr, len, flags, vm_flags, pgoff); |
1091 | } | 1103 | } |
1092 | 1104 | ||
1093 | unsigned long do_mmap(struct file *file, unsigned long addr, | ||
1094 | unsigned long len, unsigned long prot, | ||
1095 | unsigned long flag, unsigned long offset) | ||
1096 | { | ||
1097 | if (unlikely(offset + PAGE_ALIGN(len) < offset)) | ||
1098 | return -EINVAL; | ||
1099 | if (unlikely(offset & ~PAGE_MASK)) | ||
1100 | return -EINVAL; | ||
1101 | return do_mmap_pgoff(file, addr, len, prot, flag, offset >> PAGE_SHIFT); | ||
1102 | } | ||
1103 | EXPORT_SYMBOL(do_mmap); | ||
1104 | |||
1105 | unsigned long vm_mmap(struct file *file, unsigned long addr, | ||
1106 | unsigned long len, unsigned long prot, | ||
1107 | unsigned long flag, unsigned long offset) | ||
1108 | { | ||
1109 | unsigned long ret; | ||
1110 | struct mm_struct *mm = current->mm; | ||
1111 | |||
1112 | down_write(&mm->mmap_sem); | ||
1113 | ret = do_mmap(file, addr, len, prot, flag, offset); | ||
1114 | up_write(&mm->mmap_sem); | ||
1115 | return ret; | ||
1116 | } | ||
1117 | EXPORT_SYMBOL(vm_mmap); | ||
1118 | |||
1119 | SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len, | 1105 | SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len, |
1120 | unsigned long, prot, unsigned long, flags, | 1106 | unsigned long, prot, unsigned long, flags, |
1121 | unsigned long, fd, unsigned long, pgoff) | 1107 | unsigned long, fd, unsigned long, pgoff) |
@@ -1147,10 +1133,7 @@ SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len, | |||
1147 | 1133 | ||
1148 | flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); | 1134 | flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); |
1149 | 1135 | ||
1150 | down_write(&current->mm->mmap_sem); | 1136 | retval = vm_mmap_pgoff(file, addr, len, prot, flags, pgoff); |
1151 | retval = do_mmap_pgoff(file, addr, len, prot, flags, pgoff); | ||
1152 | up_write(&current->mm->mmap_sem); | ||
1153 | |||
1154 | if (file) | 1137 | if (file) |
1155 | fput(file); | 1138 | fput(file); |
1156 | out: | 1139 | out: |
@@ -1371,6 +1354,11 @@ out: | |||
1371 | mm->locked_vm += (len >> PAGE_SHIFT); | 1354 | mm->locked_vm += (len >> PAGE_SHIFT); |
1372 | } else if ((flags & MAP_POPULATE) && !(flags & MAP_NONBLOCK)) | 1355 | } else if ((flags & MAP_POPULATE) && !(flags & MAP_NONBLOCK)) |
1373 | make_pages_present(addr, addr + len); | 1356 | make_pages_present(addr, addr + len); |
1357 | |||
1358 | if (file && uprobe_mmap(vma)) | ||
1359 | /* matching probes but cannot insert */ | ||
1360 | goto unmap_and_free_vma; | ||
1361 | |||
1374 | return addr; | 1362 | return addr; |
1375 | 1363 | ||
1376 | unmap_and_free_vma: | 1364 | unmap_and_free_vma: |
@@ -1606,7 +1594,9 @@ get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, | |||
1606 | if (addr & ~PAGE_MASK) | 1594 | if (addr & ~PAGE_MASK) |
1607 | return -EINVAL; | 1595 | return -EINVAL; |
1608 | 1596 | ||
1609 | return arch_rebalance_pgtables(addr, len); | 1597 | addr = arch_rebalance_pgtables(addr, len); |
1598 | error = security_mmap_addr(addr); | ||
1599 | return error ? error : addr; | ||
1610 | } | 1600 | } |
1611 | 1601 | ||
1612 | EXPORT_SYMBOL(get_unmapped_area); | 1602 | EXPORT_SYMBOL(get_unmapped_area); |
@@ -1616,33 +1606,34 @@ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) | |||
1616 | { | 1606 | { |
1617 | struct vm_area_struct *vma = NULL; | 1607 | struct vm_area_struct *vma = NULL; |
1618 | 1608 | ||
1619 | if (mm) { | 1609 | if (WARN_ON_ONCE(!mm)) /* Remove this in linux-3.6 */ |
1620 | /* Check the cache first. */ | 1610 | return NULL; |
1621 | /* (Cache hit rate is typically around 35%.) */ | 1611 | |
1622 | vma = mm->mmap_cache; | 1612 | /* Check the cache first. */ |
1623 | if (!(vma && vma->vm_end > addr && vma->vm_start <= addr)) { | 1613 | /* (Cache hit rate is typically around 35%.) */ |
1624 | struct rb_node * rb_node; | 1614 | vma = mm->mmap_cache; |
1625 | 1615 | if (!(vma && vma->vm_end > addr && vma->vm_start <= addr)) { | |
1626 | rb_node = mm->mm_rb.rb_node; | 1616 | struct rb_node *rb_node; |
1627 | vma = NULL; | 1617 | |
1628 | 1618 | rb_node = mm->mm_rb.rb_node; | |
1629 | while (rb_node) { | 1619 | vma = NULL; |
1630 | struct vm_area_struct * vma_tmp; | 1620 | |
1631 | 1621 | while (rb_node) { | |
1632 | vma_tmp = rb_entry(rb_node, | 1622 | struct vm_area_struct *vma_tmp; |
1633 | struct vm_area_struct, vm_rb); | 1623 | |
1634 | 1624 | vma_tmp = rb_entry(rb_node, | |
1635 | if (vma_tmp->vm_end > addr) { | 1625 | struct vm_area_struct, vm_rb); |
1636 | vma = vma_tmp; | 1626 | |
1637 | if (vma_tmp->vm_start <= addr) | 1627 | if (vma_tmp->vm_end > addr) { |
1638 | break; | 1628 | vma = vma_tmp; |
1639 | rb_node = rb_node->rb_left; | 1629 | if (vma_tmp->vm_start <= addr) |
1640 | } else | 1630 | break; |
1641 | rb_node = rb_node->rb_right; | 1631 | rb_node = rb_node->rb_left; |
1642 | } | 1632 | } else |
1643 | if (vma) | 1633 | rb_node = rb_node->rb_right; |
1644 | mm->mmap_cache = vma; | ||
1645 | } | 1634 | } |
1635 | if (vma) | ||
1636 | mm->mmap_cache = vma; | ||
1646 | } | 1637 | } |
1647 | return vma; | 1638 | return vma; |
1648 | } | 1639 | } |
@@ -1795,7 +1786,7 @@ int expand_downwards(struct vm_area_struct *vma, | |||
1795 | return -ENOMEM; | 1786 | return -ENOMEM; |
1796 | 1787 | ||
1797 | address &= PAGE_MASK; | 1788 | address &= PAGE_MASK; |
1798 | error = security_file_mmap(NULL, 0, 0, 0, address, 1); | 1789 | error = security_mmap_addr(address); |
1799 | if (error) | 1790 | if (error) |
1800 | return error; | 1791 | return error; |
1801 | 1792 | ||
@@ -1889,15 +1880,20 @@ find_extend_vma(struct mm_struct * mm, unsigned long addr) | |||
1889 | */ | 1880 | */ |
1890 | static void remove_vma_list(struct mm_struct *mm, struct vm_area_struct *vma) | 1881 | static void remove_vma_list(struct mm_struct *mm, struct vm_area_struct *vma) |
1891 | { | 1882 | { |
1883 | unsigned long nr_accounted = 0; | ||
1884 | |||
1892 | /* Update high watermark before we lower total_vm */ | 1885 | /* Update high watermark before we lower total_vm */ |
1893 | update_hiwater_vm(mm); | 1886 | update_hiwater_vm(mm); |
1894 | do { | 1887 | do { |
1895 | long nrpages = vma_pages(vma); | 1888 | long nrpages = vma_pages(vma); |
1896 | 1889 | ||
1890 | if (vma->vm_flags & VM_ACCOUNT) | ||
1891 | nr_accounted += nrpages; | ||
1897 | mm->total_vm -= nrpages; | 1892 | mm->total_vm -= nrpages; |
1898 | vm_stat_account(mm, vma->vm_flags, vma->vm_file, -nrpages); | 1893 | vm_stat_account(mm, vma->vm_flags, vma->vm_file, -nrpages); |
1899 | vma = remove_vma(vma); | 1894 | vma = remove_vma(vma); |
1900 | } while (vma); | 1895 | } while (vma); |
1896 | vm_unacct_memory(nr_accounted); | ||
1901 | validate_mm(mm); | 1897 | validate_mm(mm); |
1902 | } | 1898 | } |
1903 | 1899 | ||
@@ -1912,13 +1908,11 @@ static void unmap_region(struct mm_struct *mm, | |||
1912 | { | 1908 | { |
1913 | struct vm_area_struct *next = prev? prev->vm_next: mm->mmap; | 1909 | struct vm_area_struct *next = prev? prev->vm_next: mm->mmap; |
1914 | struct mmu_gather tlb; | 1910 | struct mmu_gather tlb; |
1915 | unsigned long nr_accounted = 0; | ||
1916 | 1911 | ||
1917 | lru_add_drain(); | 1912 | lru_add_drain(); |
1918 | tlb_gather_mmu(&tlb, mm, 0); | 1913 | tlb_gather_mmu(&tlb, mm, 0); |
1919 | update_hiwater_rss(mm); | 1914 | update_hiwater_rss(mm); |
1920 | unmap_vmas(&tlb, vma, start, end, &nr_accounted, NULL); | 1915 | unmap_vmas(&tlb, vma, start, end); |
1921 | vm_unacct_memory(nr_accounted); | ||
1922 | free_pgtables(&tlb, vma, prev ? prev->vm_end : FIRST_USER_ADDRESS, | 1916 | free_pgtables(&tlb, vma, prev ? prev->vm_end : FIRST_USER_ADDRESS, |
1923 | next ? next->vm_start : 0); | 1917 | next ? next->vm_start : 0); |
1924 | tlb_finish_mmu(&tlb, start, end); | 1918 | tlb_finish_mmu(&tlb, start, end); |
@@ -2132,7 +2126,6 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len) | |||
2132 | 2126 | ||
2133 | return 0; | 2127 | return 0; |
2134 | } | 2128 | } |
2135 | EXPORT_SYMBOL(do_munmap); | ||
2136 | 2129 | ||
2137 | int vm_munmap(unsigned long start, size_t len) | 2130 | int vm_munmap(unsigned long start, size_t len) |
2138 | { | 2131 | { |
@@ -2180,10 +2173,6 @@ static unsigned long do_brk(unsigned long addr, unsigned long len) | |||
2180 | if (!len) | 2173 | if (!len) |
2181 | return addr; | 2174 | return addr; |
2182 | 2175 | ||
2183 | error = security_file_mmap(NULL, 0, 0, 0, addr, 1); | ||
2184 | if (error) | ||
2185 | return error; | ||
2186 | |||
2187 | flags = VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags; | 2176 | flags = VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags; |
2188 | 2177 | ||
2189 | error = get_unmapped_area(NULL, addr, len, 0, MAP_FIXED); | 2178 | error = get_unmapped_area(NULL, addr, len, 0, MAP_FIXED); |
@@ -2305,8 +2294,7 @@ void exit_mmap(struct mm_struct *mm) | |||
2305 | tlb_gather_mmu(&tlb, mm, 1); | 2294 | tlb_gather_mmu(&tlb, mm, 1); |
2306 | /* update_hiwater_rss(mm) here? but nobody should be looking */ | 2295 | /* update_hiwater_rss(mm) here? but nobody should be looking */ |
2307 | /* Use -1 here to ensure all VMAs in the mm are unmapped */ | 2296 | /* Use -1 here to ensure all VMAs in the mm are unmapped */ |
2308 | unmap_vmas(&tlb, vma, 0, -1, &nr_accounted, NULL); | 2297 | unmap_vmas(&tlb, vma, 0, -1); |
2309 | vm_unacct_memory(nr_accounted); | ||
2310 | 2298 | ||
2311 | free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, 0); | 2299 | free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, 0); |
2312 | tlb_finish_mmu(&tlb, 0, -1); | 2300 | tlb_finish_mmu(&tlb, 0, -1); |
@@ -2315,8 +2303,12 @@ void exit_mmap(struct mm_struct *mm) | |||
2315 | * Walk the list again, actually closing and freeing it, | 2303 | * Walk the list again, actually closing and freeing it, |
2316 | * with preemption enabled, without holding any MM locks. | 2304 | * with preemption enabled, without holding any MM locks. |
2317 | */ | 2305 | */ |
2318 | while (vma) | 2306 | while (vma) { |
2307 | if (vma->vm_flags & VM_ACCOUNT) | ||
2308 | nr_accounted += vma_pages(vma); | ||
2319 | vma = remove_vma(vma); | 2309 | vma = remove_vma(vma); |
2310 | } | ||
2311 | vm_unacct_memory(nr_accounted); | ||
2320 | 2312 | ||
2321 | BUG_ON(mm->nr_ptes > (FIRST_USER_ADDRESS+PMD_SIZE-1)>>PMD_SHIFT); | 2313 | BUG_ON(mm->nr_ptes > (FIRST_USER_ADDRESS+PMD_SIZE-1)>>PMD_SHIFT); |
2322 | } | 2314 | } |
@@ -2352,6 +2344,10 @@ int insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma) | |||
2352 | if ((vma->vm_flags & VM_ACCOUNT) && | 2344 | if ((vma->vm_flags & VM_ACCOUNT) && |
2353 | security_vm_enough_memory_mm(mm, vma_pages(vma))) | 2345 | security_vm_enough_memory_mm(mm, vma_pages(vma))) |
2354 | return -ENOMEM; | 2346 | return -ENOMEM; |
2347 | |||
2348 | if (vma->vm_file && uprobe_mmap(vma)) | ||
2349 | return -EINVAL; | ||
2350 | |||
2355 | vma_link(mm, vma, prev, rb_link, rb_parent); | 2351 | vma_link(mm, vma, prev, rb_link, rb_parent); |
2356 | return 0; | 2352 | return 0; |
2357 | } | 2353 | } |
@@ -2421,6 +2417,10 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, | |||
2421 | new_vma->vm_pgoff = pgoff; | 2417 | new_vma->vm_pgoff = pgoff; |
2422 | if (new_vma->vm_file) { | 2418 | if (new_vma->vm_file) { |
2423 | get_file(new_vma->vm_file); | 2419 | get_file(new_vma->vm_file); |
2420 | |||
2421 | if (uprobe_mmap(new_vma)) | ||
2422 | goto out_free_mempol; | ||
2423 | |||
2424 | if (vma->vm_flags & VM_EXECUTABLE) | 2424 | if (vma->vm_flags & VM_EXECUTABLE) |
2425 | added_exe_file_vma(mm); | 2425 | added_exe_file_vma(mm); |
2426 | } | 2426 | } |
@@ -2525,10 +2525,6 @@ int install_special_mapping(struct mm_struct *mm, | |||
2525 | vma->vm_ops = &special_mapping_vmops; | 2525 | vma->vm_ops = &special_mapping_vmops; |
2526 | vma->vm_private_data = pages; | 2526 | vma->vm_private_data = pages; |
2527 | 2527 | ||
2528 | ret = security_file_mmap(NULL, 0, 0, 0, vma->vm_start, 1); | ||
2529 | if (ret) | ||
2530 | goto out; | ||
2531 | |||
2532 | ret = insert_vm_struct(mm, vma); | 2528 | ret = insert_vm_struct(mm, vma); |
2533 | if (ret) | 2529 | if (ret) |
2534 | goto out; | 2530 | goto out; |
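Taken together, the mmap.c hunks stop exporting do_munmap() and do_mmap(), fold the address check into get_unmapped_area() as security_mmap_addr(), and route sys_mmap_pgoff() through vm_mmap_pgoff(), which takes mmap_sem itself. For in-kernel users the surviving entry point is vm_mmap(); a driver-style caller would look roughly like the sketch below (the wrapper name is made up, and vm_mmap() is assumed to live elsewhere in the tree, presumably mm/util.c, after its removal from this file):

    #include <linux/mm.h>
    #include <linux/mman.h>

    /* Map "len" bytes of "filp" read/write into the current task; the
     * helper takes current->mm->mmap_sem internally, so the caller must
     * not already hold it. */
    static unsigned long map_file_sketch(struct file *filp, unsigned long len)
    {
            return vm_mmap(filp, 0, len, PROT_READ | PROT_WRITE,
                           MAP_SHARED, 0);
    }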
diff --git a/mm/mmzone.c b/mm/mmzone.c index 7cf7b7ddc7c5..6830eab5bf09 100644 --- a/mm/mmzone.c +++ b/mm/mmzone.c | |||
@@ -86,3 +86,17 @@ int memmap_valid_within(unsigned long pfn, | |||
86 | return 1; | 86 | return 1; |
87 | } | 87 | } |
88 | #endif /* CONFIG_ARCH_HAS_HOLES_MEMORYMODEL */ | 88 | #endif /* CONFIG_ARCH_HAS_HOLES_MEMORYMODEL */ |
89 | |||
90 | void lruvec_init(struct lruvec *lruvec, struct zone *zone) | ||
91 | { | ||
92 | enum lru_list lru; | ||
93 | |||
94 | memset(lruvec, 0, sizeof(struct lruvec)); | ||
95 | |||
96 | for_each_lru(lru) | ||
97 | INIT_LIST_HEAD(&lruvec->lists[lru]); | ||
98 | |||
99 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR | ||
100 | lruvec->zone = zone; | ||
101 | #endif | ||
102 | } | ||
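The new lruvec_init() helper zeroes a lruvec and initialises one list head per LRU list, recording the owning zone only when the memory controller is built in. A hedged sketch of a caller; the zone->lruvec field is assumed from this kernel's struct zone layout:

    #include <linux/mmzone.h>

    static void zone_lru_init_sketch(struct zone *zone)
    {
            /* one call replaces the open-coded for_each_lru() loop that
             * zone and memcg setup code previously carried themselves */
            lruvec_init(&zone->lruvec, zone);
    }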
diff --git a/mm/mremap.c b/mm/mremap.c index db8d983b5a7d..21fed202ddad 100644 --- a/mm/mremap.c +++ b/mm/mremap.c | |||
@@ -371,10 +371,6 @@ static unsigned long mremap_to(unsigned long addr, | |||
371 | if ((addr <= new_addr) && (addr+old_len) > new_addr) | 371 | if ((addr <= new_addr) && (addr+old_len) > new_addr) |
372 | goto out; | 372 | goto out; |
373 | 373 | ||
374 | ret = security_file_mmap(NULL, 0, 0, 0, new_addr, 1); | ||
375 | if (ret) | ||
376 | goto out; | ||
377 | |||
378 | ret = do_munmap(mm, new_addr, new_len); | 374 | ret = do_munmap(mm, new_addr, new_len); |
379 | if (ret) | 375 | if (ret) |
380 | goto out; | 376 | goto out; |
@@ -432,15 +428,17 @@ static int vma_expandable(struct vm_area_struct *vma, unsigned long delta) | |||
432 | * MREMAP_FIXED option added 5-Dec-1999 by Benjamin LaHaise | 428 | * MREMAP_FIXED option added 5-Dec-1999 by Benjamin LaHaise |
433 | * This option implies MREMAP_MAYMOVE. | 429 | * This option implies MREMAP_MAYMOVE. |
434 | */ | 430 | */ |
435 | unsigned long do_mremap(unsigned long addr, | 431 | SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, |
436 | unsigned long old_len, unsigned long new_len, | 432 | unsigned long, new_len, unsigned long, flags, |
437 | unsigned long flags, unsigned long new_addr) | 433 | unsigned long, new_addr) |
438 | { | 434 | { |
439 | struct mm_struct *mm = current->mm; | 435 | struct mm_struct *mm = current->mm; |
440 | struct vm_area_struct *vma; | 436 | struct vm_area_struct *vma; |
441 | unsigned long ret = -EINVAL; | 437 | unsigned long ret = -EINVAL; |
442 | unsigned long charged = 0; | 438 | unsigned long charged = 0; |
443 | 439 | ||
440 | down_write(&current->mm->mmap_sem); | ||
441 | |||
444 | if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE)) | 442 | if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE)) |
445 | goto out; | 443 | goto out; |
446 | 444 | ||
@@ -530,25 +528,11 @@ unsigned long do_mremap(unsigned long addr, | |||
530 | goto out; | 528 | goto out; |
531 | } | 529 | } |
532 | 530 | ||
533 | ret = security_file_mmap(NULL, 0, 0, 0, new_addr, 1); | ||
534 | if (ret) | ||
535 | goto out; | ||
536 | ret = move_vma(vma, addr, old_len, new_len, new_addr); | 531 | ret = move_vma(vma, addr, old_len, new_len, new_addr); |
537 | } | 532 | } |
538 | out: | 533 | out: |
539 | if (ret & ~PAGE_MASK) | 534 | if (ret & ~PAGE_MASK) |
540 | vm_unacct_memory(charged); | 535 | vm_unacct_memory(charged); |
541 | return ret; | ||
542 | } | ||
543 | |||
544 | SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, | ||
545 | unsigned long, new_len, unsigned long, flags, | ||
546 | unsigned long, new_addr) | ||
547 | { | ||
548 | unsigned long ret; | ||
549 | |||
550 | down_write(&current->mm->mmap_sem); | ||
551 | ret = do_mremap(addr, old_len, new_len, flags, new_addr); | ||
552 | up_write(&current->mm->mmap_sem); | 536 | up_write(&current->mm->mmap_sem); |
553 | return ret; | 537 | return ret; |
554 | } | 538 | } |
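do_mremap() disappears as a separate helper: sys_mremap() now takes mmap_sem and does the work directly, and the security_file_mmap() calls are dropped along the way. The user-visible contract is unchanged, as in this ordinary userspace use of mremap(2):

    #define _GNU_SOURCE
    #include <stdio.h>
    #include <string.h>
    #include <sys/mman.h>

    int main(void)
    {
            size_t old_len = 4096, new_len = 8192;
            char *p = mmap(NULL, old_len, PROT_READ | PROT_WRITE,
                           MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
            if (p == MAP_FAILED)
                    return 1;
            strcpy(p, "hello");

            /* grow in place if possible, otherwise let the kernel move it */
            char *q = mremap(p, old_len, new_len, MREMAP_MAYMOVE);
            if (q == MAP_FAILED)
                    return 1;
            printf("%s (moved: %s)\n", q, q == p ? "no" : "yes");
            return munmap(q, new_len);
    }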
diff --git a/mm/nobootmem.c b/mm/nobootmem.c index 1983fb1c7026..405573010f99 100644 --- a/mm/nobootmem.c +++ b/mm/nobootmem.c | |||
@@ -105,27 +105,35 @@ static void __init __free_pages_memory(unsigned long start, unsigned long end) | |||
105 | __free_pages_bootmem(pfn_to_page(i), 0); | 105 | __free_pages_bootmem(pfn_to_page(i), 0); |
106 | } | 106 | } |
107 | 107 | ||
108 | static unsigned long __init __free_memory_core(phys_addr_t start, | ||
109 | phys_addr_t end) | ||
110 | { | ||
111 | unsigned long start_pfn = PFN_UP(start); | ||
112 | unsigned long end_pfn = min_t(unsigned long, | ||
113 | PFN_DOWN(end), max_low_pfn); | ||
114 | |||
115 | if (start_pfn > end_pfn) | ||
116 | return 0; | ||
117 | |||
118 | __free_pages_memory(start_pfn, end_pfn); | ||
119 | |||
120 | return end_pfn - start_pfn; | ||
121 | } | ||
122 | |||
108 | unsigned long __init free_low_memory_core_early(int nodeid) | 123 | unsigned long __init free_low_memory_core_early(int nodeid) |
109 | { | 124 | { |
110 | unsigned long count = 0; | 125 | unsigned long count = 0; |
111 | phys_addr_t start, end; | 126 | phys_addr_t start, end, size; |
112 | u64 i; | 127 | u64 i; |
113 | 128 | ||
114 | /* free reserved array temporarily so that it's treated as free area */ | 129 | for_each_free_mem_range(i, MAX_NUMNODES, &start, &end, NULL) |
115 | memblock_free_reserved_regions(); | 130 | count += __free_memory_core(start, end); |
116 | 131 | ||
117 | for_each_free_mem_range(i, MAX_NUMNODES, &start, &end, NULL) { | 132 | /* free range that is used for reserved array if we allocate it */ |
118 | unsigned long start_pfn = PFN_UP(start); | 133 | size = get_allocated_memblock_reserved_regions_info(&start); |
119 | unsigned long end_pfn = min_t(unsigned long, | 134 | if (size) |
120 | PFN_DOWN(end), max_low_pfn); | 135 | count += __free_memory_core(start, start + size); |
121 | if (start_pfn < end_pfn) { | ||
122 | __free_pages_memory(start_pfn, end_pfn); | ||
123 | count += end_pfn - start_pfn; | ||
124 | } | ||
125 | } | ||
126 | 136 | ||
127 | /* put region array back? */ | ||
128 | memblock_reserve_reserved_regions(); | ||
129 | return count; | 137 | return count; |
130 | } | 138 | } |
131 | 139 | ||
@@ -274,86 +282,85 @@ void * __init __alloc_bootmem(unsigned long size, unsigned long align, | |||
274 | return ___alloc_bootmem(size, align, goal, limit); | 282 | return ___alloc_bootmem(size, align, goal, limit); |
275 | } | 283 | } |
276 | 284 | ||
277 | /** | 285 | void * __init ___alloc_bootmem_node_nopanic(pg_data_t *pgdat, |
278 | * __alloc_bootmem_node - allocate boot memory from a specific node | 286 | unsigned long size, |
279 | * @pgdat: node to allocate from | 287 | unsigned long align, |
280 | * @size: size of the request in bytes | 288 | unsigned long goal, |
281 | * @align: alignment of the region | 289 | unsigned long limit) |
282 | * @goal: preferred starting address of the region | ||
283 | * | ||
284 | * The goal is dropped if it can not be satisfied and the allocation will | ||
285 | * fall back to memory below @goal. | ||
286 | * | ||
287 | * Allocation may fall back to any node in the system if the specified node | ||
288 | * can not hold the requested memory. | ||
289 | * | ||
290 | * The function panics if the request can not be satisfied. | ||
291 | */ | ||
292 | void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, | ||
293 | unsigned long align, unsigned long goal) | ||
294 | { | 290 | { |
295 | void *ptr; | 291 | void *ptr; |
296 | 292 | ||
297 | if (WARN_ON_ONCE(slab_is_available())) | ||
298 | return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); | ||
299 | |||
300 | again: | 293 | again: |
301 | ptr = __alloc_memory_core_early(pgdat->node_id, size, align, | 294 | ptr = __alloc_memory_core_early(pgdat->node_id, size, align, |
302 | goal, -1ULL); | 295 | goal, limit); |
303 | if (ptr) | 296 | if (ptr) |
304 | return ptr; | 297 | return ptr; |
305 | 298 | ||
306 | ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align, | 299 | ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align, |
307 | goal, -1ULL); | 300 | goal, limit); |
308 | if (!ptr && goal) { | 301 | if (ptr) |
302 | return ptr; | ||
303 | |||
304 | if (goal) { | ||
309 | goal = 0; | 305 | goal = 0; |
310 | goto again; | 306 | goto again; |
311 | } | 307 | } |
312 | return ptr; | 308 | |
309 | return NULL; | ||
313 | } | 310 | } |
314 | 311 | ||
315 | void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size, | 312 | void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size, |
316 | unsigned long align, unsigned long goal) | 313 | unsigned long align, unsigned long goal) |
317 | { | 314 | { |
318 | return __alloc_bootmem_node(pgdat, size, align, goal); | 315 | if (WARN_ON_ONCE(slab_is_available())) |
316 | return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); | ||
317 | |||
318 | return ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0); | ||
319 | } | 319 | } |
320 | 320 | ||
321 | #ifdef CONFIG_SPARSEMEM | 321 | void * __init ___alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, |
322 | /** | 322 | unsigned long align, unsigned long goal, |
323 | * alloc_bootmem_section - allocate boot memory from a specific section | 323 | unsigned long limit) |
324 | * @size: size of the request in bytes | ||
325 | * @section_nr: sparse map section to allocate from | ||
326 | * | ||
327 | * Return NULL on failure. | ||
328 | */ | ||
329 | void * __init alloc_bootmem_section(unsigned long size, | ||
330 | unsigned long section_nr) | ||
331 | { | 324 | { |
332 | unsigned long pfn, goal, limit; | 325 | void *ptr; |
333 | 326 | ||
334 | pfn = section_nr_to_pfn(section_nr); | 327 | ptr = ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, limit); |
335 | goal = pfn << PAGE_SHIFT; | 328 | if (ptr) |
336 | limit = section_nr_to_pfn(section_nr + 1) << PAGE_SHIFT; | 329 | return ptr; |
337 | 330 | ||
338 | return __alloc_memory_core_early(early_pfn_to_nid(pfn), size, | 331 | printk(KERN_ALERT "bootmem alloc of %lu bytes failed!\n", size); |
339 | SMP_CACHE_BYTES, goal, limit); | 332 | panic("Out of memory"); |
333 | return NULL; | ||
340 | } | 334 | } |
341 | #endif | ||
342 | 335 | ||
343 | void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size, | 336 | /** |
337 | * __alloc_bootmem_node - allocate boot memory from a specific node | ||
338 | * @pgdat: node to allocate from | ||
339 | * @size: size of the request in bytes | ||
340 | * @align: alignment of the region | ||
341 | * @goal: preferred starting address of the region | ||
342 | * | ||
343 | * The goal is dropped if it can not be satisfied and the allocation will | ||
344 | * fall back to memory below @goal. | ||
345 | * | ||
346 | * Allocation may fall back to any node in the system if the specified node | ||
347 | * can not hold the requested memory. | ||
348 | * | ||
349 | * The function panics if the request can not be satisfied. | ||
350 | */ | ||
351 | void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, | ||
344 | unsigned long align, unsigned long goal) | 352 | unsigned long align, unsigned long goal) |
345 | { | 353 | { |
346 | void *ptr; | ||
347 | |||
348 | if (WARN_ON_ONCE(slab_is_available())) | 354 | if (WARN_ON_ONCE(slab_is_available())) |
349 | return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); | 355 | return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); |
350 | 356 | ||
351 | ptr = __alloc_memory_core_early(pgdat->node_id, size, align, | 357 | return ___alloc_bootmem_node(pgdat, size, align, goal, 0); |
352 | goal, -1ULL); | 358 | } |
353 | if (ptr) | ||
354 | return ptr; | ||
355 | 359 | ||
356 | return __alloc_bootmem_nopanic(size, align, goal); | 360 | void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size, |
361 | unsigned long align, unsigned long goal) | ||
362 | { | ||
363 | return __alloc_bootmem_node(pgdat, size, align, goal); | ||
357 | } | 364 | } |
358 | 365 | ||
359 | #ifndef ARCH_LOW_ADDRESS_LIMIT | 366 | #ifndef ARCH_LOW_ADDRESS_LIMIT |
@@ -397,16 +404,9 @@ void * __init __alloc_bootmem_low(unsigned long size, unsigned long align, | |||
397 | void * __init __alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size, | 404 | void * __init __alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size, |
398 | unsigned long align, unsigned long goal) | 405 | unsigned long align, unsigned long goal) |
399 | { | 406 | { |
400 | void *ptr; | ||
401 | |||
402 | if (WARN_ON_ONCE(slab_is_available())) | 407 | if (WARN_ON_ONCE(slab_is_available())) |
403 | return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); | 408 | return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); |
404 | 409 | ||
405 | ptr = __alloc_memory_core_early(pgdat->node_id, size, align, | 410 | return ___alloc_bootmem_node(pgdat, size, align, goal, |
406 | goal, ARCH_LOW_ADDRESS_LIMIT); | 411 | ARCH_LOW_ADDRESS_LIMIT); |
407 | if (ptr) | ||
408 | return ptr; | ||
409 | |||
410 | return __alloc_memory_core_early(MAX_NUMNODES, size, align, | ||
411 | goal, ARCH_LOW_ADDRESS_LIMIT); | ||
412 | } | 412 | } |
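The new __free_memory_core() helper clamps a physical byte range to the whole pages it fully contains before passing it to __free_pages_memory(). The rounding is the usual PFN_UP/PFN_DOWN pair, modelled here in userspace with 4 KiB pages assumed and the macros mirroring include/linux/pfn.h:

    #include <stdio.h>

    #define DEMO_PAGE_SHIFT 12
    #define DEMO_PAGE_SIZE  (1ULL << DEMO_PAGE_SHIFT)
    #define PFN_UP(x)       (((x) + DEMO_PAGE_SIZE - 1) >> DEMO_PAGE_SHIFT)
    #define PFN_DOWN(x)     ((x) >> DEMO_PAGE_SHIFT)

    int main(void)
    {
            unsigned long long start = 0x1234, end = 0x5678;

            /* partial pages at either end are excluded, so nothing is
             * freed that the byte range does not completely cover */
            printf("free pfns %llu..%llu\n", PFN_UP(start), PFN_DOWN(end));
            return 0;       /* prints "free pfns 2..5" */
    }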
diff --git a/mm/nommu.c b/mm/nommu.c index bb8f4f004a82..d4b0c10872de 100644 --- a/mm/nommu.c +++ b/mm/nommu.c | |||
@@ -889,7 +889,6 @@ static int validate_mmap_request(struct file *file, | |||
889 | unsigned long *_capabilities) | 889 | unsigned long *_capabilities) |
890 | { | 890 | { |
891 | unsigned long capabilities, rlen; | 891 | unsigned long capabilities, rlen; |
892 | unsigned long reqprot = prot; | ||
893 | int ret; | 892 | int ret; |
894 | 893 | ||
895 | /* do the simple checks first */ | 894 | /* do the simple checks first */ |
@@ -1047,7 +1046,7 @@ static int validate_mmap_request(struct file *file, | |||
1047 | } | 1046 | } |
1048 | 1047 | ||
1049 | /* allow the security API to have its say */ | 1048 | /* allow the security API to have its say */ |
1050 | ret = security_file_mmap(file, reqprot, prot, flags, addr, 0); | 1049 | ret = security_mmap_addr(addr); |
1051 | if (ret < 0) | 1050 | if (ret < 0) |
1052 | return ret; | 1051 | return ret; |
1053 | 1052 | ||
@@ -1233,7 +1232,7 @@ enomem: | |||
1233 | /* | 1232 | /* |
1234 | * handle mapping creation for uClinux | 1233 | * handle mapping creation for uClinux |
1235 | */ | 1234 | */ |
1236 | static unsigned long do_mmap_pgoff(struct file *file, | 1235 | unsigned long do_mmap_pgoff(struct file *file, |
1237 | unsigned long addr, | 1236 | unsigned long addr, |
1238 | unsigned long len, | 1237 | unsigned long len, |
1239 | unsigned long prot, | 1238 | unsigned long prot, |
@@ -1471,32 +1470,6 @@ error_getting_region: | |||
1471 | return -ENOMEM; | 1470 | return -ENOMEM; |
1472 | } | 1471 | } |
1473 | 1472 | ||
1474 | unsigned long do_mmap(struct file *file, unsigned long addr, | ||
1475 | unsigned long len, unsigned long prot, | ||
1476 | unsigned long flag, unsigned long offset) | ||
1477 | { | ||
1478 | if (unlikely(offset + PAGE_ALIGN(len) < offset)) | ||
1479 | return -EINVAL; | ||
1480 | if (unlikely(offset & ~PAGE_MASK)) | ||
1481 | return -EINVAL; | ||
1482 | return do_mmap_pgoff(file, addr, len, prot, flag, offset >> PAGE_SHIFT); | ||
1483 | } | ||
1484 | EXPORT_SYMBOL(do_mmap); | ||
1485 | |||
1486 | unsigned long vm_mmap(struct file *file, unsigned long addr, | ||
1487 | unsigned long len, unsigned long prot, | ||
1488 | unsigned long flag, unsigned long offset) | ||
1489 | { | ||
1490 | unsigned long ret; | ||
1491 | struct mm_struct *mm = current->mm; | ||
1492 | |||
1493 | down_write(&mm->mmap_sem); | ||
1494 | ret = do_mmap(file, addr, len, prot, flag, offset); | ||
1495 | up_write(&mm->mmap_sem); | ||
1496 | return ret; | ||
1497 | } | ||
1498 | EXPORT_SYMBOL(vm_mmap); | ||
1499 | |||
1500 | SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len, | 1473 | SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len, |
1501 | unsigned long, prot, unsigned long, flags, | 1474 | unsigned long, prot, unsigned long, flags, |
1502 | unsigned long, fd, unsigned long, pgoff) | 1475 | unsigned long, fd, unsigned long, pgoff) |
@@ -1513,9 +1486,7 @@ SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len, | |||
1513 | 1486 | ||
1514 | flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); | 1487 | flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); |
1515 | 1488 | ||
1516 | down_write(&current->mm->mmap_sem); | 1489 | retval = vm_mmap_pgoff(file, addr, len, prot, flags, pgoff); |
1517 | retval = do_mmap_pgoff(file, addr, len, prot, flags, pgoff); | ||
1518 | up_write(&current->mm->mmap_sem); | ||
1519 | 1490 | ||
1520 | if (file) | 1491 | if (file) |
1521 | fput(file); | 1492 | fput(file); |
diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 46bf2ed5594c..ac300c99baf6 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c | |||
@@ -180,10 +180,11 @@ static bool oom_unkillable_task(struct task_struct *p, | |||
180 | * predictable as possible. The goal is to return the highest value for the | 180 | * predictable as possible. The goal is to return the highest value for the |
181 | * task consuming the most memory to avoid subsequent oom failures. | 181 | * task consuming the most memory to avoid subsequent oom failures. |
182 | */ | 182 | */ |
183 | unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *memcg, | 183 | unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg, |
184 | const nodemask_t *nodemask, unsigned long totalpages) | 184 | const nodemask_t *nodemask, unsigned long totalpages) |
185 | { | 185 | { |
186 | long points; | 186 | long points; |
187 | long adj; | ||
187 | 188 | ||
188 | if (oom_unkillable_task(p, memcg, nodemask)) | 189 | if (oom_unkillable_task(p, memcg, nodemask)) |
189 | return 0; | 190 | return 0; |
@@ -192,27 +193,18 @@ unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *memcg, | |||
192 | if (!p) | 193 | if (!p) |
193 | return 0; | 194 | return 0; |
194 | 195 | ||
195 | if (p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) { | 196 | adj = p->signal->oom_score_adj; |
197 | if (adj == OOM_SCORE_ADJ_MIN) { | ||
196 | task_unlock(p); | 198 | task_unlock(p); |
197 | return 0; | 199 | return 0; |
198 | } | 200 | } |
199 | 201 | ||
200 | /* | 202 | /* |
201 | * The memory controller may have a limit of 0 bytes, so avoid a divide | ||
202 | * by zero, if necessary. | ||
203 | */ | ||
204 | if (!totalpages) | ||
205 | totalpages = 1; | ||
206 | |||
207 | /* | ||
208 | * The baseline for the badness score is the proportion of RAM that each | 203 | * The baseline for the badness score is the proportion of RAM that each |
209 | * task's rss, pagetable and swap space use. | 204 | * task's rss, pagetable and swap space use. |
210 | */ | 205 | */ |
211 | points = get_mm_rss(p->mm) + p->mm->nr_ptes; | 206 | points = get_mm_rss(p->mm) + p->mm->nr_ptes + |
212 | points += get_mm_counter(p->mm, MM_SWAPENTS); | 207 | get_mm_counter(p->mm, MM_SWAPENTS); |
213 | |||
214 | points *= 1000; | ||
215 | points /= totalpages; | ||
216 | task_unlock(p); | 208 | task_unlock(p); |
217 | 209 | ||
218 | /* | 210 | /* |
@@ -220,23 +212,17 @@ unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *memcg, | |||
220 | * implementation used by LSMs. | 212 | * implementation used by LSMs. |
221 | */ | 213 | */ |
222 | if (has_capability_noaudit(p, CAP_SYS_ADMIN)) | 214 | if (has_capability_noaudit(p, CAP_SYS_ADMIN)) |
223 | points -= 30; | 215 | adj -= 30; |
224 | 216 | ||
225 | /* | 217 | /* Normalize to oom_score_adj units */ |
226 | * /proc/pid/oom_score_adj ranges from -1000 to +1000 such that it may | 218 | adj *= totalpages / 1000; |
227 | * either completely disable oom killing or always prefer a certain | 219 | points += adj; |
228 | * task. | ||
229 | */ | ||
230 | points += p->signal->oom_score_adj; | ||
231 | 220 | ||
232 | /* | 221 | /* |
233 | * Never return 0 for an eligible task that may be killed since it's | 222 | * Never return 0 for an eligible task regardless of the root bonus and |
234 | * possible that no single user task uses more than 0.1% of memory and | 223 | * oom_score_adj (oom_score_adj can't be OOM_SCORE_ADJ_MIN here). |
235 | * no single admin tasks uses more than 3.0%. | ||
236 | */ | 224 | */ |
237 | if (points <= 0) | 225 | return points > 0 ? points : 1; |
238 | return 1; | ||
239 | return (points < 1000) ? points : 1000; | ||
240 | } | 226 | } |
241 | 227 | ||
242 | /* | 228 | /* |
@@ -314,7 +300,7 @@ static struct task_struct *select_bad_process(unsigned int *ppoints, | |||
314 | { | 300 | { |
315 | struct task_struct *g, *p; | 301 | struct task_struct *g, *p; |
316 | struct task_struct *chosen = NULL; | 302 | struct task_struct *chosen = NULL; |
317 | *ppoints = 0; | 303 | unsigned long chosen_points = 0; |
318 | 304 | ||
319 | do_each_thread(g, p) { | 305 | do_each_thread(g, p) { |
320 | unsigned int points; | 306 | unsigned int points; |
@@ -354,7 +340,7 @@ static struct task_struct *select_bad_process(unsigned int *ppoints, | |||
354 | */ | 340 | */ |
355 | if (p == current) { | 341 | if (p == current) { |
356 | chosen = p; | 342 | chosen = p; |
357 | *ppoints = 1000; | 343 | chosen_points = ULONG_MAX; |
358 | } else if (!force_kill) { | 344 | } else if (!force_kill) { |
359 | /* | 345 | /* |
360 | * If this task is not being ptraced on exit, | 346 | * If this task is not being ptraced on exit, |
@@ -367,18 +353,19 @@ static struct task_struct *select_bad_process(unsigned int *ppoints, | |||
367 | } | 353 | } |
368 | 354 | ||
369 | points = oom_badness(p, memcg, nodemask, totalpages); | 355 | points = oom_badness(p, memcg, nodemask, totalpages); |
370 | if (points > *ppoints) { | 356 | if (points > chosen_points) { |
371 | chosen = p; | 357 | chosen = p; |
372 | *ppoints = points; | 358 | chosen_points = points; |
373 | } | 359 | } |
374 | } while_each_thread(g, p); | 360 | } while_each_thread(g, p); |
375 | 361 | ||
362 | *ppoints = chosen_points * 1000 / totalpages; | ||
376 | return chosen; | 363 | return chosen; |
377 | } | 364 | } |
378 | 365 | ||
379 | /** | 366 | /** |
380 | * dump_tasks - dump current memory state of all system tasks | 367 | * dump_tasks - dump current memory state of all system tasks |
381 | * @mem: current's memory controller, if constrained | 368 | * @memcg: current's memory controller, if constrained |
382 | * @nodemask: nodemask passed to page allocator for mempolicy ooms | 369 | * @nodemask: nodemask passed to page allocator for mempolicy ooms |
383 | * | 370 | * |
384 | * Dumps the current memory state of all eligible tasks. Tasks not in the same | 371 | * Dumps the current memory state of all eligible tasks. Tasks not in the same |
@@ -410,8 +397,8 @@ static void dump_tasks(const struct mem_cgroup *memcg, const nodemask_t *nodemas | |||
410 | } | 397 | } |
411 | 398 | ||
412 | pr_info("[%5d] %5d %5d %8lu %8lu %3u %3d %5d %s\n", | 399 | pr_info("[%5d] %5d %5d %8lu %8lu %3u %3d %5d %s\n", |
413 | task->pid, task_uid(task), task->tgid, | 400 | task->pid, from_kuid(&init_user_ns, task_uid(task)), |
414 | task->mm->total_vm, get_mm_rss(task->mm), | 401 | task->tgid, task->mm->total_vm, get_mm_rss(task->mm), |
415 | task_cpu(task), task->signal->oom_adj, | 402 | task_cpu(task), task->signal->oom_adj, |
416 | task->signal->oom_score_adj, task->comm); | 403 | task->signal->oom_score_adj, task->comm); |
417 | task_unlock(task); | 404 | task_unlock(task); |
@@ -572,7 +559,7 @@ void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, | |||
572 | } | 559 | } |
573 | 560 | ||
574 | check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL); | 561 | check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL); |
575 | limit = mem_cgroup_get_limit(memcg) >> PAGE_SHIFT; | 562 | limit = mem_cgroup_get_limit(memcg) >> PAGE_SHIFT ? : 1; |
576 | read_lock(&tasklist_lock); | 563 | read_lock(&tasklist_lock); |
577 | p = select_bad_process(&points, limit, memcg, NULL, false); | 564 | p = select_bad_process(&points, limit, memcg, NULL, false); |
578 | if (p && PTR_ERR(p) != -1UL) | 565 | if (p && PTR_ERR(p) != -1UL) |
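
The oom_kill.c rework above stops clamping the badness score to 0..1000 inside oom_badness(): the score is now kept as a raw page count (rss + page tables + swap entries), oom_score_adj is rescaled so that one unit still corresponds to roughly 0.1% of the allowed memory, and only select_bad_process() converts back to the old 0..1000 range when it fills *ppoints. A worked example with made-up numbers:

/* Illustrative numbers only, not taken from the patch:
 * totalpages = 1,000,000 pages (about 4 GB with 4 KiB pages)
 * task usage = 250,000 pages (rss + nr_ptes + swap entries)
 * oom_score_adj = +100
 *
 *   points = 250000                            raw score, kept in pages
 *   adj    = 100 * (1000000 / 1000) = 100000   one adj unit = 0.1% of RAM
 *   points = 250000 + 100000 = 350000
 *
 * select_bad_process() then reports 350000 * 1000 / 1000000 = 350,
 * the same value the old 0..1000 arithmetic would have produced.
 */
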
diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 26adea8ca2e7..93d8d2f7108c 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c | |||
@@ -204,7 +204,7 @@ static unsigned long highmem_dirtyable_memory(unsigned long total) | |||
204 | * Returns the global number of pages potentially available for dirty | 204 | * Returns the global number of pages potentially available for dirty |
205 | * page cache. This is the base value for the global dirty limits. | 205 | * page cache. This is the base value for the global dirty limits. |
206 | */ | 206 | */ |
207 | unsigned long global_dirtyable_memory(void) | 207 | static unsigned long global_dirtyable_memory(void) |
208 | { | 208 | { |
209 | unsigned long x; | 209 | unsigned long x; |
210 | 210 | ||
@@ -1568,6 +1568,7 @@ void writeback_set_ratelimit(void) | |||
1568 | unsigned long background_thresh; | 1568 | unsigned long background_thresh; |
1569 | unsigned long dirty_thresh; | 1569 | unsigned long dirty_thresh; |
1570 | global_dirty_limits(&background_thresh, &dirty_thresh); | 1570 | global_dirty_limits(&background_thresh, &dirty_thresh); |
1571 | global_dirty_limit = dirty_thresh; | ||
1571 | ratelimit_pages = dirty_thresh / (num_online_cpus() * 32); | 1572 | ratelimit_pages = dirty_thresh / (num_online_cpus() * 32); |
1572 | if (ratelimit_pages < 16) | 1573 | if (ratelimit_pages < 16) |
1573 | ratelimit_pages = 16; | 1574 | ratelimit_pages = 16; |
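
For writeback_set_ratelimit() above, the per-CPU ratelimit is still derived from the global dirty threshold; the hunk only refreshes global_dirty_limit at the same point. A quick numeric illustration (the values are made up):

/* dirty_thresh = 100000 pages, num_online_cpus() = 4:
 *   ratelimit_pages = 100000 / (4 * 32) = 781
 * dirty_thresh = 1000 pages, 64 CPUs:
 *   1000 / (64 * 32) = 0, clamped to the minimum of 16.
 */
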
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 918330f71dba..4a4f9219683f 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c | |||
@@ -57,6 +57,7 @@ | |||
57 | #include <linux/ftrace_event.h> | 57 | #include <linux/ftrace_event.h> |
58 | #include <linux/memcontrol.h> | 58 | #include <linux/memcontrol.h> |
59 | #include <linux/prefetch.h> | 59 | #include <linux/prefetch.h> |
60 | #include <linux/migrate.h> | ||
60 | #include <linux/page-debug-flags.h> | 61 | #include <linux/page-debug-flags.h> |
61 | 62 | ||
62 | #include <asm/tlbflush.h> | 63 | #include <asm/tlbflush.h> |
@@ -513,10 +514,10 @@ static inline int page_is_buddy(struct page *page, struct page *buddy, | |||
513 | * free pages of length of (1 << order) and marked with _mapcount -2. Page's | 514 | * free pages of length of (1 << order) and marked with _mapcount -2. Page's |
514 | * order is recorded in page_private(page) field. | 515 | * order is recorded in page_private(page) field. |
515 | * So when we are allocating or freeing one, we can derive the state of the | 516 | * So when we are allocating or freeing one, we can derive the state of the |
516 | * other. That is, if we allocate a small block, and both were | 517 | * other. That is, if we allocate a small block, and both were |
517 | * free, the remainder of the region must be split into blocks. | 518 | * free, the remainder of the region must be split into blocks. |
518 | * If a block is freed, and its buddy is also free, then this | 519 | * If a block is freed, and its buddy is also free, then this |
519 | * triggers coalescing into a block of larger size. | 520 | * triggers coalescing into a block of larger size. |
520 | * | 521 | * |
521 | * -- wli | 522 | * -- wli |
522 | */ | 523 | */ |
@@ -749,6 +750,24 @@ void __meminit __free_pages_bootmem(struct page *page, unsigned int order) | |||
749 | __free_pages(page, order); | 750 | __free_pages(page, order); |
750 | } | 751 | } |
751 | 752 | ||
753 | #ifdef CONFIG_CMA | ||
754 | /* Free whole pageblock and set it's migration type to MIGRATE_CMA. */ | ||
755 | void __init init_cma_reserved_pageblock(struct page *page) | ||
756 | { | ||
757 | unsigned i = pageblock_nr_pages; | ||
758 | struct page *p = page; | ||
759 | |||
760 | do { | ||
761 | __ClearPageReserved(p); | ||
762 | set_page_count(p, 0); | ||
763 | } while (++p, --i); | ||
764 | |||
765 | set_page_refcounted(page); | ||
766 | set_pageblock_migratetype(page, MIGRATE_CMA); | ||
767 | __free_pages(page, pageblock_order); | ||
768 | totalram_pages += pageblock_nr_pages; | ||
769 | } | ||
770 | #endif | ||
752 | 771 | ||
753 | /* | 772 | /* |
754 | * The order of subdivision here is critical for the IO subsystem. | 773 | * The order of subdivision here is critical for the IO subsystem. |
@@ -874,11 +893,17 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order, | |||
874 | * This array describes the order lists are fallen back to when | 893 | * This array describes the order lists are fallen back to when |
875 | * the free lists for the desirable migrate type are depleted | 894 | * the free lists for the desirable migrate type are depleted |
876 | */ | 895 | */ |
877 | static int fallbacks[MIGRATE_TYPES][MIGRATE_TYPES-1] = { | 896 | static int fallbacks[MIGRATE_TYPES][4] = { |
878 | [MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE }, | 897 | [MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE }, |
879 | [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE }, | 898 | [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE }, |
880 | [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE }, | 899 | #ifdef CONFIG_CMA |
881 | [MIGRATE_RESERVE] = { MIGRATE_RESERVE, MIGRATE_RESERVE, MIGRATE_RESERVE }, /* Never used */ | 900 | [MIGRATE_MOVABLE] = { MIGRATE_CMA, MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE }, |
901 | [MIGRATE_CMA] = { MIGRATE_RESERVE }, /* Never used */ | ||
902 | #else | ||
903 | [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE }, | ||
904 | #endif | ||
905 | [MIGRATE_RESERVE] = { MIGRATE_RESERVE }, /* Never used */ | ||
906 | [MIGRATE_ISOLATE] = { MIGRATE_RESERVE }, /* Never used */ | ||
882 | }; | 907 | }; |
883 | 908 | ||
884 | /* | 909 | /* |
@@ -973,12 +998,12 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype) | |||
973 | /* Find the largest possible block of pages in the other list */ | 998 | /* Find the largest possible block of pages in the other list */ |
974 | for (current_order = MAX_ORDER-1; current_order >= order; | 999 | for (current_order = MAX_ORDER-1; current_order >= order; |
975 | --current_order) { | 1000 | --current_order) { |
976 | for (i = 0; i < MIGRATE_TYPES - 1; i++) { | 1001 | for (i = 0;; i++) { |
977 | migratetype = fallbacks[start_migratetype][i]; | 1002 | migratetype = fallbacks[start_migratetype][i]; |
978 | 1003 | ||
979 | /* MIGRATE_RESERVE handled later if necessary */ | 1004 | /* MIGRATE_RESERVE handled later if necessary */ |
980 | if (migratetype == MIGRATE_RESERVE) | 1005 | if (migratetype == MIGRATE_RESERVE) |
981 | continue; | 1006 | break; |
982 | 1007 | ||
983 | area = &(zone->free_area[current_order]); | 1008 | area = &(zone->free_area[current_order]); |
984 | if (list_empty(&area->free_list[migratetype])) | 1009 | if (list_empty(&area->free_list[migratetype])) |
@@ -993,11 +1018,18 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype) | |||
993 | * pages to the preferred allocation list. If falling | 1018 | * pages to the preferred allocation list. If falling |
994 | * back for a reclaimable kernel allocation, be more | 1019 | * back for a reclaimable kernel allocation, be more |
995 | * aggressive about taking ownership of free pages | 1020 | * aggressive about taking ownership of free pages |
1021 | * | ||
1022 | * On the other hand, never change migration | ||
1023 | * type of MIGRATE_CMA pageblocks nor move CMA | ||
1024 | * pages on different free lists. We don't | ||
1025 | * want unmovable pages to be allocated from | ||
1026 | * MIGRATE_CMA areas. | ||
996 | */ | 1027 | */ |
997 | if (unlikely(current_order >= (pageblock_order >> 1)) || | 1028 | if (!is_migrate_cma(migratetype) && |
998 | start_migratetype == MIGRATE_RECLAIMABLE || | 1029 | (unlikely(current_order >= pageblock_order / 2) || |
999 | page_group_by_mobility_disabled) { | 1030 | start_migratetype == MIGRATE_RECLAIMABLE || |
1000 | unsigned long pages; | 1031 | page_group_by_mobility_disabled)) { |
1032 | int pages; | ||
1001 | pages = move_freepages_block(zone, page, | 1033 | pages = move_freepages_block(zone, page, |
1002 | start_migratetype); | 1034 | start_migratetype); |
1003 | 1035 | ||
@@ -1015,11 +1047,14 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype) | |||
1015 | rmv_page_order(page); | 1047 | rmv_page_order(page); |
1016 | 1048 | ||
1017 | /* Take ownership for orders >= pageblock_order */ | 1049 | /* Take ownership for orders >= pageblock_order */ |
1018 | if (current_order >= pageblock_order) | 1050 | if (current_order >= pageblock_order && |
1051 | !is_migrate_cma(migratetype)) | ||
1019 | change_pageblock_range(page, current_order, | 1052 | change_pageblock_range(page, current_order, |
1020 | start_migratetype); | 1053 | start_migratetype); |
1021 | 1054 | ||
1022 | expand(zone, page, order, current_order, area, migratetype); | 1055 | expand(zone, page, order, current_order, area, |
1056 | is_migrate_cma(migratetype) | ||
1057 | ? migratetype : start_migratetype); | ||
1023 | 1058 | ||
1024 | trace_mm_page_alloc_extfrag(page, order, current_order, | 1059 | trace_mm_page_alloc_extfrag(page, order, current_order, |
1025 | start_migratetype, migratetype); | 1060 | start_migratetype, migratetype); |
@@ -1061,17 +1096,17 @@ retry_reserve: | |||
1061 | return page; | 1096 | return page; |
1062 | } | 1097 | } |
1063 | 1098 | ||
1064 | /* | 1099 | /* |
1065 | * Obtain a specified number of elements from the buddy allocator, all under | 1100 | * Obtain a specified number of elements from the buddy allocator, all under |
1066 | * a single hold of the lock, for efficiency. Add them to the supplied list. | 1101 | * a single hold of the lock, for efficiency. Add them to the supplied list. |
1067 | * Returns the number of new pages which were placed at *list. | 1102 | * Returns the number of new pages which were placed at *list. |
1068 | */ | 1103 | */ |
1069 | static int rmqueue_bulk(struct zone *zone, unsigned int order, | 1104 | static int rmqueue_bulk(struct zone *zone, unsigned int order, |
1070 | unsigned long count, struct list_head *list, | 1105 | unsigned long count, struct list_head *list, |
1071 | int migratetype, int cold) | 1106 | int migratetype, int cold) |
1072 | { | 1107 | { |
1073 | int i; | 1108 | int mt = migratetype, i; |
1074 | 1109 | ||
1075 | spin_lock(&zone->lock); | 1110 | spin_lock(&zone->lock); |
1076 | for (i = 0; i < count; ++i) { | 1111 | for (i = 0; i < count; ++i) { |
1077 | struct page *page = __rmqueue(zone, order, migratetype); | 1112 | struct page *page = __rmqueue(zone, order, migratetype); |
@@ -1091,7 +1126,12 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, | |||
1091 | list_add(&page->lru, list); | 1126 | list_add(&page->lru, list); |
1092 | else | 1127 | else |
1093 | list_add_tail(&page->lru, list); | 1128 | list_add_tail(&page->lru, list); |
1094 | set_page_private(page, migratetype); | 1129 | if (IS_ENABLED(CONFIG_CMA)) { |
1130 | mt = get_pageblock_migratetype(page); | ||
1131 | if (!is_migrate_cma(mt) && mt != MIGRATE_ISOLATE) | ||
1132 | mt = migratetype; | ||
1133 | } | ||
1134 | set_page_private(page, mt); | ||
1095 | list = &page->lru; | 1135 | list = &page->lru; |
1096 | } | 1136 | } |
1097 | __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order)); | 1137 | __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order)); |
@@ -1371,8 +1411,12 @@ int split_free_page(struct page *page) | |||
1371 | 1411 | ||
1372 | if (order >= pageblock_order - 1) { | 1412 | if (order >= pageblock_order - 1) { |
1373 | struct page *endpage = page + (1 << order) - 1; | 1413 | struct page *endpage = page + (1 << order) - 1; |
1374 | for (; page < endpage; page += pageblock_nr_pages) | 1414 | for (; page < endpage; page += pageblock_nr_pages) { |
1375 | set_pageblock_migratetype(page, MIGRATE_MOVABLE); | 1415 | int mt = get_pageblock_migratetype(page); |
1416 | if (mt != MIGRATE_ISOLATE && !is_migrate_cma(mt)) | ||
1417 | set_pageblock_migratetype(page, | ||
1418 | MIGRATE_MOVABLE); | ||
1419 | } | ||
1376 | } | 1420 | } |
1377 | 1421 | ||
1378 | return 1 << order; | 1422 | return 1 << order; |
@@ -2086,16 +2130,13 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, | |||
2086 | } | 2130 | } |
2087 | #endif /* CONFIG_COMPACTION */ | 2131 | #endif /* CONFIG_COMPACTION */ |
2088 | 2132 | ||
2089 | /* The really slow allocator path where we enter direct reclaim */ | 2133 | /* Perform direct synchronous page reclaim */ |
2090 | static inline struct page * | 2134 | static int |
2091 | __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, | 2135 | __perform_reclaim(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, |
2092 | struct zonelist *zonelist, enum zone_type high_zoneidx, | 2136 | nodemask_t *nodemask) |
2093 | nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, | ||
2094 | int migratetype, unsigned long *did_some_progress) | ||
2095 | { | 2137 | { |
2096 | struct page *page = NULL; | ||
2097 | struct reclaim_state reclaim_state; | 2138 | struct reclaim_state reclaim_state; |
2098 | bool drained = false; | 2139 | int progress; |
2099 | 2140 | ||
2100 | cond_resched(); | 2141 | cond_resched(); |
2101 | 2142 | ||
@@ -2106,7 +2147,7 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, | |||
2106 | reclaim_state.reclaimed_slab = 0; | 2147 | reclaim_state.reclaimed_slab = 0; |
2107 | current->reclaim_state = &reclaim_state; | 2148 | current->reclaim_state = &reclaim_state; |
2108 | 2149 | ||
2109 | *did_some_progress = try_to_free_pages(zonelist, order, gfp_mask, nodemask); | 2150 | progress = try_to_free_pages(zonelist, order, gfp_mask, nodemask); |
2110 | 2151 | ||
2111 | current->reclaim_state = NULL; | 2152 | current->reclaim_state = NULL; |
2112 | lockdep_clear_current_reclaim_state(); | 2153 | lockdep_clear_current_reclaim_state(); |
@@ -2114,6 +2155,21 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, | |||
2114 | 2155 | ||
2115 | cond_resched(); | 2156 | cond_resched(); |
2116 | 2157 | ||
2158 | return progress; | ||
2159 | } | ||
2160 | |||
2161 | /* The really slow allocator path where we enter direct reclaim */ | ||
2162 | static inline struct page * | ||
2163 | __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, | ||
2164 | struct zonelist *zonelist, enum zone_type high_zoneidx, | ||
2165 | nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, | ||
2166 | int migratetype, unsigned long *did_some_progress) | ||
2167 | { | ||
2168 | struct page *page = NULL; | ||
2169 | bool drained = false; | ||
2170 | |||
2171 | *did_some_progress = __perform_reclaim(gfp_mask, order, zonelist, | ||
2172 | nodemask); | ||
2117 | if (unlikely(!(*did_some_progress))) | 2173 | if (unlikely(!(*did_some_progress))) |
2118 | return NULL; | 2174 | return NULL; |
2119 | 2175 | ||
@@ -4244,25 +4300,24 @@ static inline void setup_usemap(struct pglist_data *pgdat, | |||
4244 | 4300 | ||
4245 | #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE | 4301 | #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE |
4246 | 4302 | ||
4247 | /* Return a sensible default order for the pageblock size. */ | ||
4248 | static inline int pageblock_default_order(void) | ||
4249 | { | ||
4250 | if (HPAGE_SHIFT > PAGE_SHIFT) | ||
4251 | return HUGETLB_PAGE_ORDER; | ||
4252 | |||
4253 | return MAX_ORDER-1; | ||
4254 | } | ||
4255 | |||
4256 | /* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */ | 4303 | /* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */ |
4257 | static inline void __init set_pageblock_order(unsigned int order) | 4304 | static inline void __init set_pageblock_order(void) |
4258 | { | 4305 | { |
4306 | unsigned int order; | ||
4307 | |||
4259 | /* Check that pageblock_nr_pages has not already been setup */ | 4308 | /* Check that pageblock_nr_pages has not already been setup */ |
4260 | if (pageblock_order) | 4309 | if (pageblock_order) |
4261 | return; | 4310 | return; |
4262 | 4311 | ||
4312 | if (HPAGE_SHIFT > PAGE_SHIFT) | ||
4313 | order = HUGETLB_PAGE_ORDER; | ||
4314 | else | ||
4315 | order = MAX_ORDER - 1; | ||
4316 | |||
4263 | /* | 4317 | /* |
4264 | * Assume the largest contiguous order of interest is a huge page. | 4318 | * Assume the largest contiguous order of interest is a huge page. |
4265 | * This value may be variable depending on boot parameters on IA64 | 4319 | * This value may be variable depending on boot parameters on IA64 and |
4320 | * powerpc. | ||
4266 | */ | 4321 | */ |
4267 | pageblock_order = order; | 4322 | pageblock_order = order; |
4268 | } | 4323 | } |
@@ -4270,15 +4325,13 @@ static inline void __init set_pageblock_order(unsigned int order) | |||
4270 | 4325 | ||
4271 | /* | 4326 | /* |
4272 | * When CONFIG_HUGETLB_PAGE_SIZE_VARIABLE is not set, set_pageblock_order() | 4327 | * When CONFIG_HUGETLB_PAGE_SIZE_VARIABLE is not set, set_pageblock_order() |
4273 | * and pageblock_default_order() are unused as pageblock_order is set | 4328 | * is unused as pageblock_order is set at compile-time. See |
4274 | * at compile-time. See include/linux/pageblock-flags.h for the values of | 4329 | * include/linux/pageblock-flags.h for the values of pageblock_order based on |
4275 | * pageblock_order based on the kernel config | 4330 | * the kernel config |
4276 | */ | 4331 | */ |
4277 | static inline int pageblock_default_order(unsigned int order) | 4332 | static inline void set_pageblock_order(void) |
4278 | { | 4333 | { |
4279 | return MAX_ORDER-1; | ||
4280 | } | 4334 | } |
4281 | #define set_pageblock_order(x) do {} while (0) | ||
4282 | 4335 | ||
4283 | #endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */ | 4336 | #endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */ |
4284 | 4337 | ||
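
The two set_pageblock_order() hunks above fold the old pageblock_default_order() helper into the function itself; the order that gets chosen is unchanged. As an illustration, assuming a configuration with 4 KiB base pages and 16 MiB huge pages (such setups exist on the architectures that select HUGETLB_PAGE_SIZE_VARIABLE; the numbers are an example, not taken from this patch):

/* HPAGE_SHIFT = 24, PAGE_SHIFT = 12, so HPAGE_SHIFT > PAGE_SHIFT:
 *   order = HUGETLB_PAGE_ORDER = 24 - 12 = 12
 *   pageblock_nr_pages = 1 << 12 = 4096 pages = 16 MiB
 * Without a larger huge page size the fallback is MAX_ORDER - 1,
 * i.e. 10 with the default MAX_ORDER of 11.
 */
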
@@ -4301,11 +4354,10 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, | |||
4301 | init_waitqueue_head(&pgdat->kswapd_wait); | 4354 | init_waitqueue_head(&pgdat->kswapd_wait); |
4302 | pgdat->kswapd_max_order = 0; | 4355 | pgdat->kswapd_max_order = 0; |
4303 | pgdat_page_cgroup_init(pgdat); | 4356 | pgdat_page_cgroup_init(pgdat); |
4304 | 4357 | ||
4305 | for (j = 0; j < MAX_NR_ZONES; j++) { | 4358 | for (j = 0; j < MAX_NR_ZONES; j++) { |
4306 | struct zone *zone = pgdat->node_zones + j; | 4359 | struct zone *zone = pgdat->node_zones + j; |
4307 | unsigned long size, realsize, memmap_pages; | 4360 | unsigned long size, realsize, memmap_pages; |
4308 | enum lru_list lru; | ||
4309 | 4361 | ||
4310 | size = zone_spanned_pages_in_node(nid, j, zones_size); | 4362 | size = zone_spanned_pages_in_node(nid, j, zones_size); |
4311 | realsize = size - zone_absent_pages_in_node(nid, j, | 4363 | realsize = size - zone_absent_pages_in_node(nid, j, |
@@ -4355,18 +4407,13 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, | |||
4355 | zone->zone_pgdat = pgdat; | 4407 | zone->zone_pgdat = pgdat; |
4356 | 4408 | ||
4357 | zone_pcp_init(zone); | 4409 | zone_pcp_init(zone); |
4358 | for_each_lru(lru) | 4410 | lruvec_init(&zone->lruvec, zone); |
4359 | INIT_LIST_HEAD(&zone->lruvec.lists[lru]); | ||
4360 | zone->reclaim_stat.recent_rotated[0] = 0; | ||
4361 | zone->reclaim_stat.recent_rotated[1] = 0; | ||
4362 | zone->reclaim_stat.recent_scanned[0] = 0; | ||
4363 | zone->reclaim_stat.recent_scanned[1] = 0; | ||
4364 | zap_zone_vm_stats(zone); | 4411 | zap_zone_vm_stats(zone); |
4365 | zone->flags = 0; | 4412 | zone->flags = 0; |
4366 | if (!size) | 4413 | if (!size) |
4367 | continue; | 4414 | continue; |
4368 | 4415 | ||
4369 | set_pageblock_order(pageblock_default_order()); | 4416 | set_pageblock_order(); |
4370 | setup_usemap(pgdat, zone, size); | 4417 | setup_usemap(pgdat, zone, size); |
4371 | ret = init_currently_empty_zone(zone, zone_start_pfn, | 4418 | ret = init_currently_empty_zone(zone, zone_start_pfn, |
4372 | size, MEMMAP_EARLY); | 4419 | size, MEMMAP_EARLY); |
@@ -4759,31 +4806,34 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn) | |||
4759 | find_zone_movable_pfns_for_nodes(); | 4806 | find_zone_movable_pfns_for_nodes(); |
4760 | 4807 | ||
4761 | /* Print out the zone ranges */ | 4808 | /* Print out the zone ranges */ |
4762 | printk("Zone PFN ranges:\n"); | 4809 | printk("Zone ranges:\n"); |
4763 | for (i = 0; i < MAX_NR_ZONES; i++) { | 4810 | for (i = 0; i < MAX_NR_ZONES; i++) { |
4764 | if (i == ZONE_MOVABLE) | 4811 | if (i == ZONE_MOVABLE) |
4765 | continue; | 4812 | continue; |
4766 | printk(" %-8s ", zone_names[i]); | 4813 | printk(KERN_CONT " %-8s ", zone_names[i]); |
4767 | if (arch_zone_lowest_possible_pfn[i] == | 4814 | if (arch_zone_lowest_possible_pfn[i] == |
4768 | arch_zone_highest_possible_pfn[i]) | 4815 | arch_zone_highest_possible_pfn[i]) |
4769 | printk("empty\n"); | 4816 | printk(KERN_CONT "empty\n"); |
4770 | else | 4817 | else |
4771 | printk("%0#10lx -> %0#10lx\n", | 4818 | printk(KERN_CONT "[mem %0#10lx-%0#10lx]\n", |
4772 | arch_zone_lowest_possible_pfn[i], | 4819 | arch_zone_lowest_possible_pfn[i] << PAGE_SHIFT, |
4773 | arch_zone_highest_possible_pfn[i]); | 4820 | (arch_zone_highest_possible_pfn[i] |
4821 | << PAGE_SHIFT) - 1); | ||
4774 | } | 4822 | } |
4775 | 4823 | ||
4776 | /* Print out the PFNs ZONE_MOVABLE begins at in each node */ | 4824 | /* Print out the PFNs ZONE_MOVABLE begins at in each node */ |
4777 | printk("Movable zone start PFN for each node\n"); | 4825 | printk("Movable zone start for each node\n"); |
4778 | for (i = 0; i < MAX_NUMNODES; i++) { | 4826 | for (i = 0; i < MAX_NUMNODES; i++) { |
4779 | if (zone_movable_pfn[i]) | 4827 | if (zone_movable_pfn[i]) |
4780 | printk(" Node %d: %lu\n", i, zone_movable_pfn[i]); | 4828 | printk(" Node %d: %#010lx\n", i, |
4829 | zone_movable_pfn[i] << PAGE_SHIFT); | ||
4781 | } | 4830 | } |
4782 | 4831 | ||
4783 | /* Print out the early_node_map[] */ | 4832 | /* Print out the early_node_map[] */ |
4784 | printk("Early memory PFN ranges\n"); | 4833 | printk("Early memory node ranges\n"); |
4785 | for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) | 4834 | for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) |
4786 | printk(" %3d: %0#10lx -> %0#10lx\n", nid, start_pfn, end_pfn); | 4835 | printk(" node %3d: [mem %#010lx-%#010lx]\n", nid, |
4836 | start_pfn << PAGE_SHIFT, (end_pfn << PAGE_SHIFT) - 1); | ||
4787 | 4837 | ||
4788 | /* Initialise every node */ | 4838 | /* Initialise every node */ |
4789 | mminit_verify_pageflags_layout(); | 4839 | mminit_verify_pageflags_layout(); |
@@ -4976,14 +5026,7 @@ static void setup_per_zone_lowmem_reserve(void) | |||
4976 | calculate_totalreserve_pages(); | 5026 | calculate_totalreserve_pages(); |
4977 | } | 5027 | } |
4978 | 5028 | ||
4979 | /** | 5029 | static void __setup_per_zone_wmarks(void) |
4980 | * setup_per_zone_wmarks - called when min_free_kbytes changes | ||
4981 | * or when memory is hot-{added|removed} | ||
4982 | * | ||
4983 | * Ensures that the watermark[min,low,high] values for each zone are set | ||
4984 | * correctly with respect to min_free_kbytes. | ||
4985 | */ | ||
4986 | void setup_per_zone_wmarks(void) | ||
4987 | { | 5030 | { |
4988 | unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10); | 5031 | unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10); |
4989 | unsigned long lowmem_pages = 0; | 5032 | unsigned long lowmem_pages = 0; |
@@ -5030,6 +5073,11 @@ void setup_per_zone_wmarks(void) | |||
5030 | 5073 | ||
5031 | zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + (tmp >> 2); | 5074 | zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + (tmp >> 2); |
5032 | zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1); | 5075 | zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1); |
5076 | |||
5077 | zone->watermark[WMARK_MIN] += cma_wmark_pages(zone); | ||
5078 | zone->watermark[WMARK_LOW] += cma_wmark_pages(zone); | ||
5079 | zone->watermark[WMARK_HIGH] += cma_wmark_pages(zone); | ||
5080 | |||
5033 | setup_zone_migrate_reserve(zone); | 5081 | setup_zone_migrate_reserve(zone); |
5034 | spin_unlock_irqrestore(&zone->lock, flags); | 5082 | spin_unlock_irqrestore(&zone->lock, flags); |
5035 | } | 5083 | } |
@@ -5038,6 +5086,20 @@ void setup_per_zone_wmarks(void) | |||
5038 | calculate_totalreserve_pages(); | 5086 | calculate_totalreserve_pages(); |
5039 | } | 5087 | } |
5040 | 5088 | ||
5089 | /** | ||
5090 | * setup_per_zone_wmarks - called when min_free_kbytes changes | ||
5091 | * or when memory is hot-{added|removed} | ||
5092 | * | ||
5093 | * Ensures that the watermark[min,low,high] values for each zone are set | ||
5094 | * correctly with respect to min_free_kbytes. | ||
5095 | */ | ||
5096 | void setup_per_zone_wmarks(void) | ||
5097 | { | ||
5098 | mutex_lock(&zonelists_mutex); | ||
5099 | __setup_per_zone_wmarks(); | ||
5100 | mutex_unlock(&zonelists_mutex); | ||
5101 | } | ||
5102 | |||
5041 | /* | 5103 | /* |
5042 | * The inactive anon list should be small enough that the VM never has to | 5104 | * The inactive anon list should be small enough that the VM never has to |
5043 | * do too much work, but large enough that each inactive page has a chance | 5105 | * do too much work, but large enough that each inactive page has a chance |
@@ -5242,9 +5304,10 @@ void *__init alloc_large_system_hash(const char *tablename, | |||
5242 | int flags, | 5304 | int flags, |
5243 | unsigned int *_hash_shift, | 5305 | unsigned int *_hash_shift, |
5244 | unsigned int *_hash_mask, | 5306 | unsigned int *_hash_mask, |
5245 | unsigned long limit) | 5307 | unsigned long low_limit, |
5308 | unsigned long high_limit) | ||
5246 | { | 5309 | { |
5247 | unsigned long long max = limit; | 5310 | unsigned long long max = high_limit; |
5248 | unsigned long log2qty, size; | 5311 | unsigned long log2qty, size; |
5249 | void *table = NULL; | 5312 | void *table = NULL; |
5250 | 5313 | ||
@@ -5282,6 +5345,8 @@ void *__init alloc_large_system_hash(const char *tablename, | |||
5282 | } | 5345 | } |
5283 | max = min(max, 0x80000000ULL); | 5346 | max = min(max, 0x80000000ULL); |
5284 | 5347 | ||
5348 | if (numentries < low_limit) | ||
5349 | numentries = low_limit; | ||
5285 | if (numentries > max) | 5350 | if (numentries > max) |
5286 | numentries = max; | 5351 | numentries = max; |
5287 | 5352 | ||
@@ -5412,14 +5477,16 @@ static int | |||
5412 | __count_immobile_pages(struct zone *zone, struct page *page, int count) | 5477 | __count_immobile_pages(struct zone *zone, struct page *page, int count) |
5413 | { | 5478 | { |
5414 | unsigned long pfn, iter, found; | 5479 | unsigned long pfn, iter, found; |
5480 | int mt; | ||
5481 | |||
5415 | /* | 5482 | /* |
5416 | * For avoiding noise data, lru_add_drain_all() should be called | 5483 | * For avoiding noise data, lru_add_drain_all() should be called |
5417 | * If ZONE_MOVABLE, the zone never contains immobile pages | 5484 | * If ZONE_MOVABLE, the zone never contains immobile pages |
5418 | */ | 5485 | */ |
5419 | if (zone_idx(zone) == ZONE_MOVABLE) | 5486 | if (zone_idx(zone) == ZONE_MOVABLE) |
5420 | return true; | 5487 | return true; |
5421 | 5488 | mt = get_pageblock_migratetype(page); | |
5422 | if (get_pageblock_migratetype(page) == MIGRATE_MOVABLE) | 5489 | if (mt == MIGRATE_MOVABLE || is_migrate_cma(mt)) |
5423 | return true; | 5490 | return true; |
5424 | 5491 | ||
5425 | pfn = page_to_pfn(page); | 5492 | pfn = page_to_pfn(page); |
@@ -5536,7 +5603,7 @@ out: | |||
5536 | return ret; | 5603 | return ret; |
5537 | } | 5604 | } |
5538 | 5605 | ||
5539 | void unset_migratetype_isolate(struct page *page) | 5606 | void unset_migratetype_isolate(struct page *page, unsigned migratetype) |
5540 | { | 5607 | { |
5541 | struct zone *zone; | 5608 | struct zone *zone; |
5542 | unsigned long flags; | 5609 | unsigned long flags; |
@@ -5544,12 +5611,264 @@ void unset_migratetype_isolate(struct page *page) | |||
5544 | spin_lock_irqsave(&zone->lock, flags); | 5611 | spin_lock_irqsave(&zone->lock, flags); |
5545 | if (get_pageblock_migratetype(page) != MIGRATE_ISOLATE) | 5612 | if (get_pageblock_migratetype(page) != MIGRATE_ISOLATE) |
5546 | goto out; | 5613 | goto out; |
5547 | set_pageblock_migratetype(page, MIGRATE_MOVABLE); | 5614 | set_pageblock_migratetype(page, migratetype); |
5548 | move_freepages_block(zone, page, MIGRATE_MOVABLE); | 5615 | move_freepages_block(zone, page, migratetype); |
5549 | out: | 5616 | out: |
5550 | spin_unlock_irqrestore(&zone->lock, flags); | 5617 | spin_unlock_irqrestore(&zone->lock, flags); |
5551 | } | 5618 | } |
5552 | 5619 | ||
5620 | #ifdef CONFIG_CMA | ||
5621 | |||
5622 | static unsigned long pfn_max_align_down(unsigned long pfn) | ||
5623 | { | ||
5624 | return pfn & ~(max_t(unsigned long, MAX_ORDER_NR_PAGES, | ||
5625 | pageblock_nr_pages) - 1); | ||
5626 | } | ||
5627 | |||
5628 | static unsigned long pfn_max_align_up(unsigned long pfn) | ||
5629 | { | ||
5630 | return ALIGN(pfn, max_t(unsigned long, MAX_ORDER_NR_PAGES, | ||
5631 | pageblock_nr_pages)); | ||
5632 | } | ||
5633 | |||
5634 | static struct page * | ||
5635 | __alloc_contig_migrate_alloc(struct page *page, unsigned long private, | ||
5636 | int **resultp) | ||
5637 | { | ||
5638 | gfp_t gfp_mask = GFP_USER | __GFP_MOVABLE; | ||
5639 | |||
5640 | if (PageHighMem(page)) | ||
5641 | gfp_mask |= __GFP_HIGHMEM; | ||
5642 | |||
5643 | return alloc_page(gfp_mask); | ||
5644 | } | ||
5645 | |||
5646 | /* [start, end) must belong to a single zone. */ | ||
5647 | static int __alloc_contig_migrate_range(unsigned long start, unsigned long end) | ||
5648 | { | ||
5649 | /* This function is based on compact_zone() from compaction.c. */ | ||
5650 | |||
5651 | unsigned long pfn = start; | ||
5652 | unsigned int tries = 0; | ||
5653 | int ret = 0; | ||
5654 | |||
5655 | struct compact_control cc = { | ||
5656 | .nr_migratepages = 0, | ||
5657 | .order = -1, | ||
5658 | .zone = page_zone(pfn_to_page(start)), | ||
5659 | .sync = true, | ||
5660 | }; | ||
5661 | INIT_LIST_HEAD(&cc.migratepages); | ||
5662 | |||
5663 | migrate_prep_local(); | ||
5664 | |||
5665 | while (pfn < end || !list_empty(&cc.migratepages)) { | ||
5666 | if (fatal_signal_pending(current)) { | ||
5667 | ret = -EINTR; | ||
5668 | break; | ||
5669 | } | ||
5670 | |||
5671 | if (list_empty(&cc.migratepages)) { | ||
5672 | cc.nr_migratepages = 0; | ||
5673 | pfn = isolate_migratepages_range(cc.zone, &cc, | ||
5674 | pfn, end); | ||
5675 | if (!pfn) { | ||
5676 | ret = -EINTR; | ||
5677 | break; | ||
5678 | } | ||
5679 | tries = 0; | ||
5680 | } else if (++tries == 5) { | ||
5681 | ret = ret < 0 ? ret : -EBUSY; | ||
5682 | break; | ||
5683 | } | ||
5684 | |||
5685 | ret = migrate_pages(&cc.migratepages, | ||
5686 | __alloc_contig_migrate_alloc, | ||
5687 | 0, false, MIGRATE_SYNC); | ||
5688 | } | ||
5689 | |||
5690 | putback_lru_pages(&cc.migratepages); | ||
5691 | return ret > 0 ? 0 : ret; | ||
5692 | } | ||
5693 | |||
5694 | /* | ||
5695 | * Update zone's cma pages counter used for watermark level calculation. | ||
5696 | */ | ||
5697 | static inline void __update_cma_watermarks(struct zone *zone, int count) | ||
5698 | { | ||
5699 | unsigned long flags; | ||
5700 | spin_lock_irqsave(&zone->lock, flags); | ||
5701 | zone->min_cma_pages += count; | ||
5702 | spin_unlock_irqrestore(&zone->lock, flags); | ||
5703 | setup_per_zone_wmarks(); | ||
5704 | } | ||
5705 | |||
5706 | /* | ||
5707 | * Trigger memory pressure bump to reclaim some pages in order to be able to | ||
5708 | * allocate 'count' pages in single page units. Does similar work as | ||
5709 | *__alloc_pages_slowpath() function. | ||
5710 | */ | ||
5711 | static int __reclaim_pages(struct zone *zone, gfp_t gfp_mask, int count) | ||
5712 | { | ||
5713 | enum zone_type high_zoneidx = gfp_zone(gfp_mask); | ||
5714 | struct zonelist *zonelist = node_zonelist(0, gfp_mask); | ||
5715 | int did_some_progress = 0; | ||
5716 | int order = 1; | ||
5717 | |||
5718 | /* | ||
5719 | * Increase level of watermarks to force kswapd do his job | ||
5720 | * to stabilise at new watermark level. | ||
5721 | */ | ||
5722 | __update_cma_watermarks(zone, count); | ||
5723 | |||
5724 | /* Obey watermarks as if the page was being allocated */ | ||
5725 | while (!zone_watermark_ok(zone, 0, low_wmark_pages(zone), 0, 0)) { | ||
5726 | wake_all_kswapd(order, zonelist, high_zoneidx, zone_idx(zone)); | ||
5727 | |||
5728 | did_some_progress = __perform_reclaim(gfp_mask, order, zonelist, | ||
5729 | NULL); | ||
5730 | if (!did_some_progress) { | ||
5731 | /* Exhausted what can be done so it's blamo time */ | ||
5732 | out_of_memory(zonelist, gfp_mask, order, NULL, false); | ||
5733 | } | ||
5734 | } | ||
5735 | |||
5736 | /* Restore original watermark levels. */ | ||
5737 | __update_cma_watermarks(zone, -count); | ||
5738 | |||
5739 | return count; | ||
5740 | } | ||
5741 | |||
5742 | /** | ||
5743 | * alloc_contig_range() -- tries to allocate given range of pages | ||
5744 | * @start: start PFN to allocate | ||
5745 | * @end: one-past-the-last PFN to allocate | ||
5746 | * @migratetype: migratetype of the underlaying pageblocks (either | ||
5747 | * #MIGRATE_MOVABLE or #MIGRATE_CMA). All pageblocks | ||
5748 | * in range must have the same migratetype and it must | ||
5749 | * be either of the two. | ||
5750 | * | ||
5751 | * The PFN range does not have to be pageblock or MAX_ORDER_NR_PAGES | ||
5752 | * aligned, however it's the caller's responsibility to guarantee that | ||
5753 | * we are the only thread that changes migrate type of pageblocks the | ||
5754 | * pages fall in. | ||
5755 | * | ||
5756 | * The PFN range must belong to a single zone. | ||
5757 | * | ||
5758 | * Returns zero on success or negative error code. On success all | ||
5759 | * pages which PFN is in [start, end) are allocated for the caller and | ||
5760 | * need to be freed with free_contig_range(). | ||
5761 | */ | ||
5762 | int alloc_contig_range(unsigned long start, unsigned long end, | ||
5763 | unsigned migratetype) | ||
5764 | { | ||
5765 | struct zone *zone = page_zone(pfn_to_page(start)); | ||
5766 | unsigned long outer_start, outer_end; | ||
5767 | int ret = 0, order; | ||
5768 | |||
5769 | /* | ||
5770 | * What we do here is we mark all pageblocks in range as | ||
5771 | * MIGRATE_ISOLATE. Because pageblock and max order pages may | ||
5772 | * have different sizes, and due to the way page allocator | ||
5773 | * work, we align the range to biggest of the two pages so | ||
5774 | * that page allocator won't try to merge buddies from | ||
5775 | * different pageblocks and change MIGRATE_ISOLATE to some | ||
5776 | * other migration type. | ||
5777 | * | ||
5778 | * Once the pageblocks are marked as MIGRATE_ISOLATE, we | ||
5779 | * migrate the pages from an unaligned range (ie. pages that | ||
5780 | * we are interested in). This will put all the pages in | ||
5781 | * range back to page allocator as MIGRATE_ISOLATE. | ||
5782 | * | ||
5783 | * When this is done, we take the pages in range from page | ||
5784 | * allocator removing them from the buddy system. This way | ||
5785 | * page allocator will never consider using them. | ||
5786 | * | ||
5787 | * This lets us mark the pageblocks back as | ||
5788 | * MIGRATE_CMA/MIGRATE_MOVABLE so that free pages in the | ||
5789 | * aligned range but not in the unaligned, original range are | ||
5790 | * put back to page allocator so that buddy can use them. | ||
5791 | */ | ||
5792 | |||
5793 | ret = start_isolate_page_range(pfn_max_align_down(start), | ||
5794 | pfn_max_align_up(end), migratetype); | ||
5795 | if (ret) | ||
5796 | goto done; | ||
5797 | |||
5798 | ret = __alloc_contig_migrate_range(start, end); | ||
5799 | if (ret) | ||
5800 | goto done; | ||
5801 | |||
5802 | /* | ||
5803 | * Pages from [start, end) are within a MAX_ORDER_NR_PAGES | ||
5804 | * aligned blocks that are marked as MIGRATE_ISOLATE. What's | ||
5805 | * more, all pages in [start, end) are free in page allocator. | ||
5806 | * What we are going to do is to allocate all pages from | ||
5807 | * [start, end) (that is remove them from page allocator). | ||
5808 | * | ||
5809 | * The only problem is that pages at the beginning and at the | ||
5810 | * end of interesting range may be not aligned with pages that | ||
5811 | * page allocator holds, ie. they can be part of higher order | ||
5812 | * pages. Because of this, we reserve the bigger range and | ||
5813 | * once this is done free the pages we are not interested in. | ||
5814 | * | ||
5815 | * We don't have to hold zone->lock here because the pages are | ||
5816 | * isolated thus they won't get removed from buddy. | ||
5817 | */ | ||
5818 | |||
5819 | lru_add_drain_all(); | ||
5820 | drain_all_pages(); | ||
5821 | |||
5822 | order = 0; | ||
5823 | outer_start = start; | ||
5824 | while (!PageBuddy(pfn_to_page(outer_start))) { | ||
5825 | if (++order >= MAX_ORDER) { | ||
5826 | ret = -EBUSY; | ||
5827 | goto done; | ||
5828 | } | ||
5829 | outer_start &= ~0UL << order; | ||
5830 | } | ||
5831 | |||
5832 | /* Make sure the range is really isolated. */ | ||
5833 | if (test_pages_isolated(outer_start, end)) { | ||
5834 | pr_warn("alloc_contig_range test_pages_isolated(%lx, %lx) failed\n", | ||
5835 | outer_start, end); | ||
5836 | ret = -EBUSY; | ||
5837 | goto done; | ||
5838 | } | ||
5839 | |||
5840 | /* | ||
5841 | * Reclaim enough pages to make sure that contiguous allocation | ||
5842 | * will not starve the system. | ||
5843 | */ | ||
5844 | __reclaim_pages(zone, GFP_HIGHUSER_MOVABLE, end-start); | ||
5845 | |||
5846 | /* Grab isolated pages from freelists. */ | ||
5847 | outer_end = isolate_freepages_range(outer_start, end); | ||
5848 | if (!outer_end) { | ||
5849 | ret = -EBUSY; | ||
5850 | goto done; | ||
5851 | } | ||
5852 | |||
5853 | /* Free head and tail (if any) */ | ||
5854 | if (start != outer_start) | ||
5855 | free_contig_range(outer_start, start - outer_start); | ||
5856 | if (end != outer_end) | ||
5857 | free_contig_range(end, outer_end - end); | ||
5858 | |||
5859 | done: | ||
5860 | undo_isolate_page_range(pfn_max_align_down(start), | ||
5861 | pfn_max_align_up(end), migratetype); | ||
5862 | return ret; | ||
5863 | } | ||
5864 | |||
5865 | void free_contig_range(unsigned long pfn, unsigned nr_pages) | ||
5866 | { | ||
5867 | for (; nr_pages--; ++pfn) | ||
5868 | __free_page(pfn_to_page(pfn)); | ||
5869 | } | ||
5870 | #endif | ||
5871 | |||
5553 | #ifdef CONFIG_MEMORY_HOTREMOVE | 5872 | #ifdef CONFIG_MEMORY_HOTREMOVE |
5554 | /* | 5873 | /* |
5555 | * All pages in the range must be isolated before calling this. | 5874 | * All pages in the range must be isolated before calling this. |
@@ -5618,7 +5937,7 @@ bool is_free_buddy_page(struct page *page) | |||
5618 | } | 5937 | } |
5619 | #endif | 5938 | #endif |
5620 | 5939 | ||
5621 | static struct trace_print_flags pageflag_names[] = { | 5940 | static const struct trace_print_flags pageflag_names[] = { |
5622 | {1UL << PG_locked, "locked" }, | 5941 | {1UL << PG_locked, "locked" }, |
5623 | {1UL << PG_error, "error" }, | 5942 | {1UL << PG_error, "error" }, |
5624 | {1UL << PG_referenced, "referenced" }, | 5943 | {1UL << PG_referenced, "referenced" }, |
@@ -5653,7 +5972,9 @@ static struct trace_print_flags pageflag_names[] = { | |||
5653 | #ifdef CONFIG_MEMORY_FAILURE | 5972 | #ifdef CONFIG_MEMORY_FAILURE |
5654 | {1UL << PG_hwpoison, "hwpoison" }, | 5973 | {1UL << PG_hwpoison, "hwpoison" }, |
5655 | #endif | 5974 | #endif |
5656 | {-1UL, NULL }, | 5975 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE |
5976 | {1UL << PG_compound_lock, "compound_lock" }, | ||
5977 | #endif | ||
5657 | }; | 5978 | }; |
5658 | 5979 | ||
5659 | static void dump_page_flags(unsigned long flags) | 5980 | static void dump_page_flags(unsigned long flags) |
@@ -5662,12 +5983,14 @@ static void dump_page_flags(unsigned long flags) | |||
5662 | unsigned long mask; | 5983 | unsigned long mask; |
5663 | int i; | 5984 | int i; |
5664 | 5985 | ||
5986 | BUILD_BUG_ON(ARRAY_SIZE(pageflag_names) != __NR_PAGEFLAGS); | ||
5987 | |||
5665 | printk(KERN_ALERT "page flags: %#lx(", flags); | 5988 | printk(KERN_ALERT "page flags: %#lx(", flags); |
5666 | 5989 | ||
5667 | /* remove zone id */ | 5990 | /* remove zone id */ |
5668 | flags &= (1UL << NR_PAGEFLAGS) - 1; | 5991 | flags &= (1UL << NR_PAGEFLAGS) - 1; |
5669 | 5992 | ||
5670 | for (i = 0; pageflag_names[i].name && flags; i++) { | 5993 | for (i = 0; i < ARRAY_SIZE(pageflag_names) && flags; i++) { |
5671 | 5994 | ||
5672 | mask = pageflag_names[i].mask; | 5995 | mask = pageflag_names[i].mask; |
5673 | if ((flags & mask) != mask) | 5996 | if ((flags & mask) != mask) |
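
Taken together, the page_alloc.c changes above give CMA its allocation primitive: alloc_contig_range() isolates, migrates and then claims a PFN range, and free_contig_range() hands the pages back to the buddy allocator. A minimal caller sketch, assuming a range that was reserved as MIGRATE_CMA pageblocks inside a single zone (the helper names below are made up for illustration):

/* Hypothetical wrappers showing only the call contract. */
static struct page *grab_cma_pages(unsigned long base_pfn, unsigned long count)
{
        int ret;

        /* [base_pfn, base_pfn + count) must lie in one zone, in pageblocks
         * that were set up as MIGRATE_CMA (see init_cma_reserved_pageblock). */
        ret = alloc_contig_range(base_pfn, base_pfn + count, MIGRATE_CMA);
        if (ret)
                return NULL;    /* -EINTR or -EBUSY as in the implementation above */
        return pfn_to_page(base_pfn);
}

static void release_cma_pages(unsigned long base_pfn, unsigned long count)
{
        free_contig_range(base_pfn, count);
}
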
diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c index 1ccbd714059c..eb750f851395 100644 --- a/mm/page_cgroup.c +++ b/mm/page_cgroup.c | |||
@@ -392,7 +392,7 @@ static struct swap_cgroup *lookup_swap_cgroup(swp_entry_t ent, | |||
392 | 392 | ||
393 | /** | 393 | /** |
394 | * swap_cgroup_cmpxchg - cmpxchg mem_cgroup's id for this swp_entry. | 394 | * swap_cgroup_cmpxchg - cmpxchg mem_cgroup's id for this swp_entry. |
395 | * @end: swap entry to be cmpxchged | 395 | * @ent: swap entry to be cmpxchged |
396 | * @old: old id | 396 | * @old: old id |
397 | * @new: new id | 397 | * @new: new id |
398 | * | 398 | * |
@@ -422,7 +422,7 @@ unsigned short swap_cgroup_cmpxchg(swp_entry_t ent, | |||
422 | /** | 422 | /** |
423 | * swap_cgroup_record - record mem_cgroup for this swp_entry. | 423 | * swap_cgroup_record - record mem_cgroup for this swp_entry. |
424 | * @ent: swap entry to be recorded into | 424 | * @ent: swap entry to be recorded into |
425 | * @mem: mem_cgroup to be recorded | 425 | * @id: mem_cgroup to be recorded |
426 | * | 426 | * |
427 | * Returns old value at success, 0 at failure. | 427 | * Returns old value at success, 0 at failure. |
428 | * (Of course, old value can be 0.) | 428 | * (Of course, old value can be 0.) |
diff --git a/mm/page_io.c b/mm/page_io.c index dc76b4d0611e..34f02923744c 100644 --- a/mm/page_io.c +++ b/mm/page_io.c | |||
@@ -18,6 +18,7 @@ | |||
18 | #include <linux/bio.h> | 18 | #include <linux/bio.h> |
19 | #include <linux/swapops.h> | 19 | #include <linux/swapops.h> |
20 | #include <linux/writeback.h> | 20 | #include <linux/writeback.h> |
21 | #include <linux/frontswap.h> | ||
21 | #include <asm/pgtable.h> | 22 | #include <asm/pgtable.h> |
22 | 23 | ||
23 | static struct bio *get_swap_bio(gfp_t gfp_flags, | 24 | static struct bio *get_swap_bio(gfp_t gfp_flags, |
@@ -98,6 +99,12 @@ int swap_writepage(struct page *page, struct writeback_control *wbc) | |||
98 | unlock_page(page); | 99 | unlock_page(page); |
99 | goto out; | 100 | goto out; |
100 | } | 101 | } |
102 | if (frontswap_store(page) == 0) { | ||
103 | set_page_writeback(page); | ||
104 | unlock_page(page); | ||
105 | end_page_writeback(page); | ||
106 | goto out; | ||
107 | } | ||
101 | bio = get_swap_bio(GFP_NOIO, page, end_swap_bio_write); | 108 | bio = get_swap_bio(GFP_NOIO, page, end_swap_bio_write); |
102 | if (bio == NULL) { | 109 | if (bio == NULL) { |
103 | set_page_dirty(page); | 110 | set_page_dirty(page); |
@@ -122,6 +129,11 @@ int swap_readpage(struct page *page) | |||
122 | 129 | ||
123 | VM_BUG_ON(!PageLocked(page)); | 130 | VM_BUG_ON(!PageLocked(page)); |
124 | VM_BUG_ON(PageUptodate(page)); | 131 | VM_BUG_ON(PageUptodate(page)); |
132 | if (frontswap_load(page) == 0) { | ||
133 | SetPageUptodate(page); | ||
134 | unlock_page(page); | ||
135 | goto out; | ||
136 | } | ||
125 | bio = get_swap_bio(GFP_KERNEL, page, end_swap_bio_read); | 137 | bio = get_swap_bio(GFP_KERNEL, page, end_swap_bio_read); |
126 | if (bio == NULL) { | 138 | if (bio == NULL) { |
127 | unlock_page(page); | 139 | unlock_page(page); |
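
The two page_io.c hooks above share one contract: a return value of 0 from frontswap_store() or frontswap_load() means transcendent memory handled the page, so the bio-based swap I/O is skipped entirely; any other value falls through to the normal path. The write side, restated as a small sketch (the helper name is invented; everything else mirrors the hunk):

static bool frontswap_writeout_fastpath(struct page *page)
{
        if (frontswap_store(page) != 0)
                return false;   /* no tmem space: fall back to get_swap_bio() */

        /* The data is already safe in transcendent memory, so complete the
         * writeback bookkeeping without issuing any block I/O. */
        set_page_writeback(page);
        unlock_page(page);
        end_page_writeback(page);
        return true;
}
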
diff --git a/mm/page_isolation.c b/mm/page_isolation.c index 4ae42bb40892..c9f04774f2b8 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c | |||
@@ -24,6 +24,7 @@ __first_valid_page(unsigned long pfn, unsigned long nr_pages) | |||
24 | * to be MIGRATE_ISOLATE. | 24 | * to be MIGRATE_ISOLATE. |
25 | * @start_pfn: The lower PFN of the range to be isolated. | 25 | * @start_pfn: The lower PFN of the range to be isolated. |
26 | * @end_pfn: The upper PFN of the range to be isolated. | 26 | * @end_pfn: The upper PFN of the range to be isolated. |
27 | * @migratetype: migrate type to set in error recovery. | ||
27 | * | 28 | * |
28 | * Making page-allocation-type to be MIGRATE_ISOLATE means free pages in | 29 | * Making page-allocation-type to be MIGRATE_ISOLATE means free pages in |
29 | * the range will never be allocated. Any free pages and pages freed in the | 30 | * the range will never be allocated. Any free pages and pages freed in the |
@@ -32,8 +33,8 @@ __first_valid_page(unsigned long pfn, unsigned long nr_pages) | |||
32 | * start_pfn/end_pfn must be aligned to pageblock_order. | 33 | * start_pfn/end_pfn must be aligned to pageblock_order. |
33 | * Returns 0 on success and -EBUSY if any part of range cannot be isolated. | 34 | * Returns 0 on success and -EBUSY if any part of range cannot be isolated. |
34 | */ | 35 | */ |
35 | int | 36 | int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, |
36 | start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn) | 37 | unsigned migratetype) |
37 | { | 38 | { |
38 | unsigned long pfn; | 39 | unsigned long pfn; |
39 | unsigned long undo_pfn; | 40 | unsigned long undo_pfn; |
@@ -56,7 +57,7 @@ undo: | |||
56 | for (pfn = start_pfn; | 57 | for (pfn = start_pfn; |
57 | pfn < undo_pfn; | 58 | pfn < undo_pfn; |
58 | pfn += pageblock_nr_pages) | 59 | pfn += pageblock_nr_pages) |
59 | unset_migratetype_isolate(pfn_to_page(pfn)); | 60 | unset_migratetype_isolate(pfn_to_page(pfn), migratetype); |
60 | 61 | ||
61 | return -EBUSY; | 62 | return -EBUSY; |
62 | } | 63 | } |
@@ -64,8 +65,8 @@ undo: | |||
64 | /* | 65 | /* |
65 | * Make isolated pages available again. | 66 | * Make isolated pages available again. |
66 | */ | 67 | */ |
67 | int | 68 | int undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, |
68 | undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn) | 69 | unsigned migratetype) |
69 | { | 70 | { |
70 | unsigned long pfn; | 71 | unsigned long pfn; |
71 | struct page *page; | 72 | struct page *page; |
@@ -77,7 +78,7 @@ undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn) | |||
77 | page = __first_valid_page(pfn, pageblock_nr_pages); | 78 | page = __first_valid_page(pfn, pageblock_nr_pages); |
78 | if (!page || get_pageblock_migratetype(page) != MIGRATE_ISOLATE) | 79 | if (!page || get_pageblock_migratetype(page) != MIGRATE_ISOLATE) |
79 | continue; | 80 | continue; |
80 | unset_migratetype_isolate(page); | 81 | unset_migratetype_isolate(page, migratetype); |
81 | } | 82 | } |
82 | return 0; | 83 | return 0; |
83 | } | 84 | } |
@@ -86,7 +87,7 @@ undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn) | |||
86 | * all pages in [start_pfn...end_pfn) must be in the same zone. | 87 | * all pages in [start_pfn...end_pfn) must be in the same zone. |
87 | * zone->lock must be held before call this. | 88 | * zone->lock must be held before call this. |
88 | * | 89 | * |
89 | * Returns 1 if all pages in the range is isolated. | 90 | * Returns 1 if all pages in the range are isolated. |
90 | */ | 91 | */ |
91 | static int | 92 | static int |
92 | __test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn) | 93 | __test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn) |
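
The page_isolation.c change threads the caller's migratetype through the isolate/undo pair so that CMA pageblocks are restored as MIGRATE_CMA instead of being silently converted to MIGRATE_MOVABLE. The expected round trip, mirroring what alloc_contig_range() does earlier in this diff (the wrapper is illustrative only):

static int isolate_then_restore(unsigned long start_pfn, unsigned long end_pfn,
                                unsigned migratetype)
{
        int ret;

        ret = start_isolate_page_range(start_pfn, end_pfn, migratetype);
        if (ret)
                return ret;     /* -EBUSY: some pageblock could not be isolated */

        /* ... the caller migrates away or claims the isolated pages here ... */

        return undo_isolate_page_range(start_pfn, end_pfn, migratetype);
}
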
diff --git a/mm/pagewalk.c b/mm/pagewalk.c index aa9701e12714..6c118d012bb5 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c | |||
@@ -162,7 +162,6 @@ static int walk_hugetlb_range(struct vm_area_struct *vma, | |||
162 | 162 | ||
163 | /** | 163 | /** |
164 | * walk_page_range - walk a memory map's page tables with a callback | 164 | * walk_page_range - walk a memory map's page tables with a callback |
165 | * @mm: memory map to walk | ||
166 | * @addr: starting address | 165 | * @addr: starting address |
167 | * @end: ending address | 166 | * @end: ending address |
168 | * @walk: set of callbacks to invoke for each level of the tree | 167 | * @walk: set of callbacks to invoke for each level of the tree |
diff --git a/mm/percpu-vm.c b/mm/percpu-vm.c index 405d331804c3..3707c71ae4cd 100644 --- a/mm/percpu-vm.c +++ b/mm/percpu-vm.c | |||
@@ -360,7 +360,6 @@ err_free: | |||
360 | * @chunk: chunk to depopulate | 360 | * @chunk: chunk to depopulate |
361 | * @off: offset to the area to depopulate | 361 | * @off: offset to the area to depopulate |
362 | * @size: size of the area to depopulate in bytes | 362 | * @size: size of the area to depopulate in bytes |
363 | * @flush: whether to flush cache and tlb or not | ||
364 | * | 363 | * |
365 | * For each cpu, depopulate and unmap pages [@page_start,@page_end) | 364 | * For each cpu, depopulate and unmap pages [@page_start,@page_end) |
366 | * from @chunk. If @flush is true, vcache is flushed before unmapping | 365 | * from @chunk. If @flush is true, vcache is flushed before unmapping |
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c index 5a74fea182f1..74c0ddaa6fa0 100644 --- a/mm/pgtable-generic.c +++ b/mm/pgtable-generic.c | |||
@@ -109,8 +109,8 @@ pmd_t pmdp_clear_flush(struct vm_area_struct *vma, unsigned long address, | |||
109 | 109 | ||
110 | #ifndef __HAVE_ARCH_PMDP_SPLITTING_FLUSH | 110 | #ifndef __HAVE_ARCH_PMDP_SPLITTING_FLUSH |
111 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | 111 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE |
112 | pmd_t pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address, | 112 | void pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address, |
113 | pmd_t *pmdp) | 113 | pmd_t *pmdp) |
114 | { | 114 | { |
115 | pmd_t pmd = pmd_mksplitting(*pmdp); | 115 | pmd_t pmd = pmd_mksplitting(*pmdp); |
116 | VM_BUG_ON(address & ~HPAGE_PMD_MASK); | 116 | VM_BUG_ON(address & ~HPAGE_PMD_MASK); |
diff --git a/mm/process_vm_access.c b/mm/process_vm_access.c index c20ff48994c2..926b46649749 100644 --- a/mm/process_vm_access.c +++ b/mm/process_vm_access.c | |||
@@ -371,15 +371,15 @@ static ssize_t process_vm_rw(pid_t pid, | |||
371 | /* Check iovecs */ | 371 | /* Check iovecs */ |
372 | if (vm_write) | 372 | if (vm_write) |
373 | rc = rw_copy_check_uvector(WRITE, lvec, liovcnt, UIO_FASTIOV, | 373 | rc = rw_copy_check_uvector(WRITE, lvec, liovcnt, UIO_FASTIOV, |
374 | iovstack_l, &iov_l, 1); | 374 | iovstack_l, &iov_l); |
375 | else | 375 | else |
376 | rc = rw_copy_check_uvector(READ, lvec, liovcnt, UIO_FASTIOV, | 376 | rc = rw_copy_check_uvector(READ, lvec, liovcnt, UIO_FASTIOV, |
377 | iovstack_l, &iov_l, 1); | 377 | iovstack_l, &iov_l); |
378 | if (rc <= 0) | 378 | if (rc <= 0) |
379 | goto free_iovecs; | 379 | goto free_iovecs; |
380 | 380 | ||
381 | rc = rw_copy_check_uvector(READ, rvec, riovcnt, UIO_FASTIOV, | 381 | rc = rw_copy_check_uvector(CHECK_IOVEC_ONLY, rvec, riovcnt, UIO_FASTIOV, |
382 | iovstack_r, &iov_r, 0); | 382 | iovstack_r, &iov_r); |
383 | if (rc <= 0) | 383 | if (rc <= 0) |
384 | goto free_iovecs; | 384 | goto free_iovecs; |
385 | 385 | ||
@@ -438,16 +438,16 @@ compat_process_vm_rw(compat_pid_t pid, | |||
438 | if (vm_write) | 438 | if (vm_write) |
439 | rc = compat_rw_copy_check_uvector(WRITE, lvec, liovcnt, | 439 | rc = compat_rw_copy_check_uvector(WRITE, lvec, liovcnt, |
440 | UIO_FASTIOV, iovstack_l, | 440 | UIO_FASTIOV, iovstack_l, |
441 | &iov_l, 1); | 441 | &iov_l); |
442 | else | 442 | else |
443 | rc = compat_rw_copy_check_uvector(READ, lvec, liovcnt, | 443 | rc = compat_rw_copy_check_uvector(READ, lvec, liovcnt, |
444 | UIO_FASTIOV, iovstack_l, | 444 | UIO_FASTIOV, iovstack_l, |
445 | &iov_l, 1); | 445 | &iov_l); |
446 | if (rc <= 0) | 446 | if (rc <= 0) |
447 | goto free_iovecs; | 447 | goto free_iovecs; |
448 | rc = compat_rw_copy_check_uvector(READ, rvec, riovcnt, | 448 | rc = compat_rw_copy_check_uvector(CHECK_IOVEC_ONLY, rvec, riovcnt, |
449 | UIO_FASTIOV, iovstack_r, | 449 | UIO_FASTIOV, iovstack_r, |
450 | &iov_r, 0); | 450 | &iov_r); |
451 | if (rc <= 0) | 451 | if (rc <= 0) |
452 | goto free_iovecs; | 452 | goto free_iovecs; |
453 | 453 | ||
diff --git a/mm/readahead.c b/mm/readahead.c index cbcbb02f3e28..ea8f8fa21649 100644 --- a/mm/readahead.c +++ b/mm/readahead.c | |||
@@ -17,6 +17,8 @@ | |||
17 | #include <linux/task_io_accounting_ops.h> | 17 | #include <linux/task_io_accounting_ops.h> |
18 | #include <linux/pagevec.h> | 18 | #include <linux/pagevec.h> |
19 | #include <linux/pagemap.h> | 19 | #include <linux/pagemap.h> |
20 | #include <linux/syscalls.h> | ||
21 | #include <linux/file.h> | ||
20 | 22 | ||
21 | /* | 23 | /* |
22 | * Initialise a struct file's readahead state. Assumes that the caller has | 24 | * Initialise a struct file's readahead state. Assumes that the caller has |
@@ -562,3 +564,41 @@ page_cache_async_readahead(struct address_space *mapping, | |||
562 | ondemand_readahead(mapping, ra, filp, true, offset, req_size); | 564 | ondemand_readahead(mapping, ra, filp, true, offset, req_size); |
563 | } | 565 | } |
564 | EXPORT_SYMBOL_GPL(page_cache_async_readahead); | 566 | EXPORT_SYMBOL_GPL(page_cache_async_readahead); |
567 | |||
568 | static ssize_t | ||
569 | do_readahead(struct address_space *mapping, struct file *filp, | ||
570 | pgoff_t index, unsigned long nr) | ||
571 | { | ||
572 | if (!mapping || !mapping->a_ops || !mapping->a_ops->readpage) | ||
573 | return -EINVAL; | ||
574 | |||
575 | force_page_cache_readahead(mapping, filp, index, nr); | ||
576 | return 0; | ||
577 | } | ||
578 | |||
579 | SYSCALL_DEFINE(readahead)(int fd, loff_t offset, size_t count) | ||
580 | { | ||
581 | ssize_t ret; | ||
582 | struct file *file; | ||
583 | |||
584 | ret = -EBADF; | ||
585 | file = fget(fd); | ||
586 | if (file) { | ||
587 | if (file->f_mode & FMODE_READ) { | ||
588 | struct address_space *mapping = file->f_mapping; | ||
589 | pgoff_t start = offset >> PAGE_CACHE_SHIFT; | ||
590 | pgoff_t end = (offset + count - 1) >> PAGE_CACHE_SHIFT; | ||
591 | unsigned long len = end - start + 1; | ||
592 | ret = do_readahead(mapping, file, start, len); | ||
593 | } | ||
594 | fput(file); | ||
595 | } | ||
596 | return ret; | ||
597 | } | ||
598 | #ifdef CONFIG_HAVE_SYSCALL_WRAPPERS | ||
599 | asmlinkage long SyS_readahead(long fd, loff_t offset, long count) | ||
600 | { | ||
601 | return SYSC_readahead((int) fd, offset, (size_t) count); | ||
602 | } | ||
603 | SYSCALL_ALIAS(sys_readahead, SyS_readahead); | ||
604 | #endif | ||
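The block above moves the sys_readahead() definition into mm/readahead.c; the user-visible readahead(2) interface is unchanged. A small userspace usage sketch (glibc exposes the wrapper under _GNU_SOURCE; error handling kept minimal):

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	int fd;

	if (argc < 2)
		return 1;
	fd = open(argv[1], O_RDONLY);
	if (fd < 0) {
		perror("open");
		return 1;
	}
	/* Ask the kernel to populate the page cache with the first 1 MiB;
	 * returns 0 on success, -1 with EBADF if fd is not open for reading. */
	if (readahead(fd, 0, 1 << 20) != 0)
		perror("readahead");
	close(fd);
	return 0;
}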
diff --git a/mm/rmap.c b/mm/rmap.c --- a/mm/rmap.c +++ b/mm/rmap.c | |||
@@ -755,12 +755,6 @@ int page_referenced_one(struct page *page, struct vm_area_struct *vma, | |||
755 | pte_unmap_unlock(pte, ptl); | 755 | pte_unmap_unlock(pte, ptl); |
756 | } | 756 | } |
757 | 757 | ||
758 | /* Pretend the page is referenced if the task has the | ||
759 | swap token and is in the middle of a page fault. */ | ||
760 | if (mm != current->mm && has_swap_token(mm) && | ||
761 | rwsem_is_locked(&mm->mmap_sem)) | ||
762 | referenced++; | ||
763 | |||
764 | (*mapcount)--; | 758 | (*mapcount)--; |
765 | 759 | ||
766 | if (referenced) | 760 | if (referenced) |
diff --git a/mm/shmem.c b/mm/shmem.c index f99ff3e50bd6..bd106361be4b 100644 --- a/mm/shmem.c +++ b/mm/shmem.c | |||
@@ -53,6 +53,7 @@ static struct vfsmount *shm_mnt; | |||
53 | #include <linux/blkdev.h> | 53 | #include <linux/blkdev.h> |
54 | #include <linux/pagevec.h> | 54 | #include <linux/pagevec.h> |
55 | #include <linux/percpu_counter.h> | 55 | #include <linux/percpu_counter.h> |
56 | #include <linux/falloc.h> | ||
56 | #include <linux/splice.h> | 57 | #include <linux/splice.h> |
57 | #include <linux/security.h> | 58 | #include <linux/security.h> |
58 | #include <linux/swapops.h> | 59 | #include <linux/swapops.h> |
@@ -83,12 +84,25 @@ struct shmem_xattr { | |||
83 | char value[0]; | 84 | char value[0]; |
84 | }; | 85 | }; |
85 | 86 | ||
87 | /* | ||
88 | * shmem_fallocate and shmem_writepage communicate via inode->i_private | ||
89 | * (with i_mutex making sure that it has only one user at a time): | ||
90 | * we would prefer not to enlarge the shmem inode just for that. | ||
91 | */ | ||
92 | struct shmem_falloc { | ||
93 | pgoff_t start; /* start of range currently being fallocated */ | ||
94 | pgoff_t next; /* the next page offset to be fallocated */ | ||
95 | pgoff_t nr_falloced; /* how many new pages have been fallocated */ | ||
96 | pgoff_t nr_unswapped; /* how often writepage refused to swap out */ | ||
97 | }; | ||
98 | |||
86 | /* Flag allocation requirements to shmem_getpage */ | 99 | /* Flag allocation requirements to shmem_getpage */ |
87 | enum sgp_type { | 100 | enum sgp_type { |
88 | SGP_READ, /* don't exceed i_size, don't allocate page */ | 101 | SGP_READ, /* don't exceed i_size, don't allocate page */ |
89 | SGP_CACHE, /* don't exceed i_size, may allocate page */ | 102 | SGP_CACHE, /* don't exceed i_size, may allocate page */ |
90 | SGP_DIRTY, /* like SGP_CACHE, but set new page dirty */ | 103 | SGP_DIRTY, /* like SGP_CACHE, but set new page dirty */ |
91 | SGP_WRITE, /* may exceed i_size, may allocate page */ | 104 | SGP_WRITE, /* may exceed i_size, may allocate !Uptodate page */ |
105 | SGP_FALLOC, /* like SGP_WRITE, but make existing page Uptodate */ | ||
92 | }; | 106 | }; |
93 | 107 | ||
94 | #ifdef CONFIG_TMPFS | 108 | #ifdef CONFIG_TMPFS |
@@ -103,6 +117,9 @@ static unsigned long shmem_default_max_inodes(void) | |||
103 | } | 117 | } |
104 | #endif | 118 | #endif |
105 | 119 | ||
120 | static bool shmem_should_replace_page(struct page *page, gfp_t gfp); | ||
121 | static int shmem_replace_page(struct page **pagep, gfp_t gfp, | ||
122 | struct shmem_inode_info *info, pgoff_t index); | ||
106 | static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, | 123 | static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, |
107 | struct page **pagep, enum sgp_type sgp, gfp_t gfp, int *fault_type); | 124 | struct page **pagep, enum sgp_type sgp, gfp_t gfp, int *fault_type); |
108 | 125 | ||
@@ -247,46 +264,55 @@ static int shmem_radix_tree_replace(struct address_space *mapping, | |||
247 | } | 264 | } |
248 | 265 | ||
249 | /* | 266 | /* |
267 | * Sometimes, before we decide whether to proceed or to fail, we must check | ||
268 | * that an entry was not already brought back from swap by a racing thread. | ||
269 | * | ||
270 | * Checking page is not enough: by the time a SwapCache page is locked, it | ||
271 | * might be reused, and again be SwapCache, using the same swap as before. | ||
272 | */ | ||
273 | static bool shmem_confirm_swap(struct address_space *mapping, | ||
274 | pgoff_t index, swp_entry_t swap) | ||
275 | { | ||
276 | void *item; | ||
277 | |||
278 | rcu_read_lock(); | ||
279 | item = radix_tree_lookup(&mapping->page_tree, index); | ||
280 | rcu_read_unlock(); | ||
281 | return item == swp_to_radix_entry(swap); | ||
282 | } | ||
283 | |||
284 | /* | ||
250 | * Like add_to_page_cache_locked, but error if expected item has gone. | 285 | * Like add_to_page_cache_locked, but error if expected item has gone. |
251 | */ | 286 | */ |
252 | static int shmem_add_to_page_cache(struct page *page, | 287 | static int shmem_add_to_page_cache(struct page *page, |
253 | struct address_space *mapping, | 288 | struct address_space *mapping, |
254 | pgoff_t index, gfp_t gfp, void *expected) | 289 | pgoff_t index, gfp_t gfp, void *expected) |
255 | { | 290 | { |
256 | int error = 0; | 291 | int error; |
257 | 292 | ||
258 | VM_BUG_ON(!PageLocked(page)); | 293 | VM_BUG_ON(!PageLocked(page)); |
259 | VM_BUG_ON(!PageSwapBacked(page)); | 294 | VM_BUG_ON(!PageSwapBacked(page)); |
260 | 295 | ||
296 | page_cache_get(page); | ||
297 | page->mapping = mapping; | ||
298 | page->index = index; | ||
299 | |||
300 | spin_lock_irq(&mapping->tree_lock); | ||
261 | if (!expected) | 301 | if (!expected) |
262 | error = radix_tree_preload(gfp & GFP_RECLAIM_MASK); | 302 | error = radix_tree_insert(&mapping->page_tree, index, page); |
303 | else | ||
304 | error = shmem_radix_tree_replace(mapping, index, expected, | ||
305 | page); | ||
263 | if (!error) { | 306 | if (!error) { |
264 | page_cache_get(page); | 307 | mapping->nrpages++; |
265 | page->mapping = mapping; | 308 | __inc_zone_page_state(page, NR_FILE_PAGES); |
266 | page->index = index; | 309 | __inc_zone_page_state(page, NR_SHMEM); |
267 | 310 | spin_unlock_irq(&mapping->tree_lock); | |
268 | spin_lock_irq(&mapping->tree_lock); | 311 | } else { |
269 | if (!expected) | 312 | page->mapping = NULL; |
270 | error = radix_tree_insert(&mapping->page_tree, | 313 | spin_unlock_irq(&mapping->tree_lock); |
271 | index, page); | 314 | page_cache_release(page); |
272 | else | ||
273 | error = shmem_radix_tree_replace(mapping, index, | ||
274 | expected, page); | ||
275 | if (!error) { | ||
276 | mapping->nrpages++; | ||
277 | __inc_zone_page_state(page, NR_FILE_PAGES); | ||
278 | __inc_zone_page_state(page, NR_SHMEM); | ||
279 | spin_unlock_irq(&mapping->tree_lock); | ||
280 | } else { | ||
281 | page->mapping = NULL; | ||
282 | spin_unlock_irq(&mapping->tree_lock); | ||
283 | page_cache_release(page); | ||
284 | } | ||
285 | if (!expected) | ||
286 | radix_tree_preload_end(); | ||
287 | } | 315 | } |
288 | if (error) | ||
289 | mem_cgroup_uncharge_cache_page(page); | ||
290 | return error; | 316 | return error; |
291 | } | 317 | } |
292 | 318 | ||
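With this rewrite shmem_add_to_page_cache() no longer preloads the radix tree itself: it takes its page reference, inserts (or replaces the expected swap entry) under tree_lock, and unwinds on failure. The one caller that may sleep is now expected to preload around the call, as the shmem_getpage_gfp() hunk further down shows; condensed:

	error = radix_tree_preload(gfp & GFP_RECLAIM_MASK);
	if (!error) {
		error = shmem_add_to_page_cache(page, mapping, index,
						gfp, NULL);
		radix_tree_preload_end();
	}
	if (error)
		mem_cgroup_uncharge_cache_page(page);

The swapoff path (shmem_unuse_inode(), GFP_NOWAIT) skips the preload and relies on the expected-entry replace, which makes no allocation.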
@@ -423,27 +449,31 @@ void shmem_unlock_mapping(struct address_space *mapping) | |||
423 | 449 | ||
424 | /* | 450 | /* |
425 | * Remove range of pages and swap entries from radix tree, and free them. | 451 | * Remove range of pages and swap entries from radix tree, and free them. |
452 | * If !unfalloc, truncate or punch hole; if unfalloc, undo failed fallocate. | ||
426 | */ | 453 | */ |
427 | void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend) | 454 | static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, |
455 | bool unfalloc) | ||
428 | { | 456 | { |
429 | struct address_space *mapping = inode->i_mapping; | 457 | struct address_space *mapping = inode->i_mapping; |
430 | struct shmem_inode_info *info = SHMEM_I(inode); | 458 | struct shmem_inode_info *info = SHMEM_I(inode); |
431 | pgoff_t start = (lstart + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; | 459 | pgoff_t start = (lstart + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; |
432 | unsigned partial = lstart & (PAGE_CACHE_SIZE - 1); | 460 | pgoff_t end = (lend + 1) >> PAGE_CACHE_SHIFT; |
433 | pgoff_t end = (lend >> PAGE_CACHE_SHIFT); | 461 | unsigned int partial_start = lstart & (PAGE_CACHE_SIZE - 1); |
462 | unsigned int partial_end = (lend + 1) & (PAGE_CACHE_SIZE - 1); | ||
434 | struct pagevec pvec; | 463 | struct pagevec pvec; |
435 | pgoff_t indices[PAGEVEC_SIZE]; | 464 | pgoff_t indices[PAGEVEC_SIZE]; |
436 | long nr_swaps_freed = 0; | 465 | long nr_swaps_freed = 0; |
437 | pgoff_t index; | 466 | pgoff_t index; |
438 | int i; | 467 | int i; |
439 | 468 | ||
440 | BUG_ON((lend & (PAGE_CACHE_SIZE - 1)) != (PAGE_CACHE_SIZE - 1)); | 469 | if (lend == -1) |
470 | end = -1; /* unsigned, so actually very big */ | ||
441 | 471 | ||
442 | pagevec_init(&pvec, 0); | 472 | pagevec_init(&pvec, 0); |
443 | index = start; | 473 | index = start; |
444 | while (index <= end) { | 474 | while (index < end) { |
445 | pvec.nr = shmem_find_get_pages_and_swap(mapping, index, | 475 | pvec.nr = shmem_find_get_pages_and_swap(mapping, index, |
446 | min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1, | 476 | min(end - index, (pgoff_t)PAGEVEC_SIZE), |
447 | pvec.pages, indices); | 477 | pvec.pages, indices); |
448 | if (!pvec.nr) | 478 | if (!pvec.nr) |
449 | break; | 479 | break; |
@@ -452,10 +482,12 @@ void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend) | |||
452 | struct page *page = pvec.pages[i]; | 482 | struct page *page = pvec.pages[i]; |
453 | 483 | ||
454 | index = indices[i]; | 484 | index = indices[i]; |
455 | if (index > end) | 485 | if (index >= end) |
456 | break; | 486 | break; |
457 | 487 | ||
458 | if (radix_tree_exceptional_entry(page)) { | 488 | if (radix_tree_exceptional_entry(page)) { |
489 | if (unfalloc) | ||
490 | continue; | ||
459 | nr_swaps_freed += !shmem_free_swap(mapping, | 491 | nr_swaps_freed += !shmem_free_swap(mapping, |
460 | index, page); | 492 | index, page); |
461 | continue; | 493 | continue; |
@@ -463,9 +495,11 @@ void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend) | |||
463 | 495 | ||
464 | if (!trylock_page(page)) | 496 | if (!trylock_page(page)) |
465 | continue; | 497 | continue; |
466 | if (page->mapping == mapping) { | 498 | if (!unfalloc || !PageUptodate(page)) { |
467 | VM_BUG_ON(PageWriteback(page)); | 499 | if (page->mapping == mapping) { |
468 | truncate_inode_page(mapping, page); | 500 | VM_BUG_ON(PageWriteback(page)); |
501 | truncate_inode_page(mapping, page); | ||
502 | } | ||
469 | } | 503 | } |
470 | unlock_page(page); | 504 | unlock_page(page); |
471 | } | 505 | } |
@@ -476,30 +510,47 @@ void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend) | |||
476 | index++; | 510 | index++; |
477 | } | 511 | } |
478 | 512 | ||
479 | if (partial) { | 513 | if (partial_start) { |
480 | struct page *page = NULL; | 514 | struct page *page = NULL; |
481 | shmem_getpage(inode, start - 1, &page, SGP_READ, NULL); | 515 | shmem_getpage(inode, start - 1, &page, SGP_READ, NULL); |
482 | if (page) { | 516 | if (page) { |
483 | zero_user_segment(page, partial, PAGE_CACHE_SIZE); | 517 | unsigned int top = PAGE_CACHE_SIZE; |
518 | if (start > end) { | ||
519 | top = partial_end; | ||
520 | partial_end = 0; | ||
521 | } | ||
522 | zero_user_segment(page, partial_start, top); | ||
484 | set_page_dirty(page); | 523 | set_page_dirty(page); |
485 | unlock_page(page); | 524 | unlock_page(page); |
486 | page_cache_release(page); | 525 | page_cache_release(page); |
487 | } | 526 | } |
488 | } | 527 | } |
528 | if (partial_end) { | ||
529 | struct page *page = NULL; | ||
530 | shmem_getpage(inode, end, &page, SGP_READ, NULL); | ||
531 | if (page) { | ||
532 | zero_user_segment(page, 0, partial_end); | ||
533 | set_page_dirty(page); | ||
534 | unlock_page(page); | ||
535 | page_cache_release(page); | ||
536 | } | ||
537 | } | ||
538 | if (start >= end) | ||
539 | return; | ||
489 | 540 | ||
490 | index = start; | 541 | index = start; |
491 | for ( ; ; ) { | 542 | for ( ; ; ) { |
492 | cond_resched(); | 543 | cond_resched(); |
493 | pvec.nr = shmem_find_get_pages_and_swap(mapping, index, | 544 | pvec.nr = shmem_find_get_pages_and_swap(mapping, index, |
494 | min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1, | 545 | min(end - index, (pgoff_t)PAGEVEC_SIZE), |
495 | pvec.pages, indices); | 546 | pvec.pages, indices); |
496 | if (!pvec.nr) { | 547 | if (!pvec.nr) { |
497 | if (index == start) | 548 | if (index == start || unfalloc) |
498 | break; | 549 | break; |
499 | index = start; | 550 | index = start; |
500 | continue; | 551 | continue; |
501 | } | 552 | } |
502 | if (index == start && indices[0] > end) { | 553 | if ((index == start || unfalloc) && indices[0] >= end) { |
503 | shmem_deswap_pagevec(&pvec); | 554 | shmem_deswap_pagevec(&pvec); |
504 | pagevec_release(&pvec); | 555 | pagevec_release(&pvec); |
505 | break; | 556 | break; |
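A worked example of the new partial-page bookkeeping: punching a hole at offset 1000, length 10000, with 4096-byte pages gives start = 1, end = 2, partial_start = 1000 and partial_end = 2808. Page 1 is the only page truncated outright; bytes [1000, 4096) of page 0 and [0, 2808) of page 2 are zeroed in place. If the whole range falls inside one page (say offset 100, length 50), start > end, so the partial_start pass zeroes just [100, 150) and the function returns before the second loop.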
@@ -509,19 +560,23 @@ void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend) | |||
509 | struct page *page = pvec.pages[i]; | 560 | struct page *page = pvec.pages[i]; |
510 | 561 | ||
511 | index = indices[i]; | 562 | index = indices[i]; |
512 | if (index > end) | 563 | if (index >= end) |
513 | break; | 564 | break; |
514 | 565 | ||
515 | if (radix_tree_exceptional_entry(page)) { | 566 | if (radix_tree_exceptional_entry(page)) { |
567 | if (unfalloc) | ||
568 | continue; | ||
516 | nr_swaps_freed += !shmem_free_swap(mapping, | 569 | nr_swaps_freed += !shmem_free_swap(mapping, |
517 | index, page); | 570 | index, page); |
518 | continue; | 571 | continue; |
519 | } | 572 | } |
520 | 573 | ||
521 | lock_page(page); | 574 | lock_page(page); |
522 | if (page->mapping == mapping) { | 575 | if (!unfalloc || !PageUptodate(page)) { |
523 | VM_BUG_ON(PageWriteback(page)); | 576 | if (page->mapping == mapping) { |
524 | truncate_inode_page(mapping, page); | 577 | VM_BUG_ON(PageWriteback(page)); |
578 | truncate_inode_page(mapping, page); | ||
579 | } | ||
525 | } | 580 | } |
526 | unlock_page(page); | 581 | unlock_page(page); |
527 | } | 582 | } |
@@ -535,7 +590,11 @@ void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend) | |||
535 | info->swapped -= nr_swaps_freed; | 590 | info->swapped -= nr_swaps_freed; |
536 | shmem_recalc_inode(inode); | 591 | shmem_recalc_inode(inode); |
537 | spin_unlock(&info->lock); | 592 | spin_unlock(&info->lock); |
593 | } | ||
538 | 594 | ||
595 | void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend) | ||
596 | { | ||
597 | shmem_undo_range(inode, lstart, lend, false); | ||
539 | inode->i_ctime = inode->i_mtime = CURRENT_TIME; | 598 | inode->i_ctime = inode->i_mtime = CURRENT_TIME; |
540 | } | 599 | } |
541 | EXPORT_SYMBOL_GPL(shmem_truncate_range); | 600 | EXPORT_SYMBOL_GPL(shmem_truncate_range); |
@@ -597,19 +656,20 @@ static void shmem_evict_inode(struct inode *inode) | |||
597 | } | 656 | } |
598 | BUG_ON(inode->i_blocks); | 657 | BUG_ON(inode->i_blocks); |
599 | shmem_free_inode(inode->i_sb); | 658 | shmem_free_inode(inode->i_sb); |
600 | end_writeback(inode); | 659 | clear_inode(inode); |
601 | } | 660 | } |
602 | 661 | ||
603 | /* | 662 | /* |
604 | * If swap found in inode, free it and move page from swapcache to filecache. | 663 | * If swap found in inode, free it and move page from swapcache to filecache. |
605 | */ | 664 | */ |
606 | static int shmem_unuse_inode(struct shmem_inode_info *info, | 665 | static int shmem_unuse_inode(struct shmem_inode_info *info, |
607 | swp_entry_t swap, struct page *page) | 666 | swp_entry_t swap, struct page **pagep) |
608 | { | 667 | { |
609 | struct address_space *mapping = info->vfs_inode.i_mapping; | 668 | struct address_space *mapping = info->vfs_inode.i_mapping; |
610 | void *radswap; | 669 | void *radswap; |
611 | pgoff_t index; | 670 | pgoff_t index; |
612 | int error; | 671 | gfp_t gfp; |
672 | int error = 0; | ||
613 | 673 | ||
614 | radswap = swp_to_radix_entry(swap); | 674 | radswap = swp_to_radix_entry(swap); |
615 | index = radix_tree_locate_item(&mapping->page_tree, radswap); | 675 | index = radix_tree_locate_item(&mapping->page_tree, radswap); |
@@ -625,22 +685,48 @@ static int shmem_unuse_inode(struct shmem_inode_info *info, | |||
625 | if (shmem_swaplist.next != &info->swaplist) | 685 | if (shmem_swaplist.next != &info->swaplist) |
626 | list_move_tail(&shmem_swaplist, &info->swaplist); | 686 | list_move_tail(&shmem_swaplist, &info->swaplist); |
627 | 687 | ||
688 | gfp = mapping_gfp_mask(mapping); | ||
689 | if (shmem_should_replace_page(*pagep, gfp)) { | ||
690 | mutex_unlock(&shmem_swaplist_mutex); | ||
691 | error = shmem_replace_page(pagep, gfp, info, index); | ||
692 | mutex_lock(&shmem_swaplist_mutex); | ||
693 | /* | ||
694 | * We needed to drop mutex to make that restrictive page | ||
695 | * allocation, but the inode might have been freed while we | ||
696 | * dropped it: although a racing shmem_evict_inode() cannot | ||
697 | * complete without emptying the radix_tree, our page lock | ||
698 | * on this swapcache page is not enough to prevent that - | ||
699 | * free_swap_and_cache() of our swap entry will only | ||
700 | * trylock_page(), removing swap from radix_tree whatever. | ||
701 | * | ||
702 | * We must not proceed to shmem_add_to_page_cache() if the | ||
703 | * inode has been freed, but of course we cannot rely on | ||
704 | * inode or mapping or info to check that. However, we can | ||
705 | * safely check if our swap entry is still in use (and here | ||
706 | * it can't have got reused for another page): if it's still | ||
707 | * in use, then the inode cannot have been freed yet, and we | ||
708 | * can safely proceed (if it's no longer in use, that tells | ||
709 | * nothing about the inode, but we don't need to unuse swap). | ||
710 | */ | ||
711 | if (!page_swapcount(*pagep)) | ||
712 | error = -ENOENT; | ||
713 | } | ||
714 | |||
628 | /* | 715 | /* |
629 | * We rely on shmem_swaplist_mutex, not only to protect the swaplist, | 716 | * We rely on shmem_swaplist_mutex, not only to protect the swaplist, |
630 | * but also to hold up shmem_evict_inode(): so inode cannot be freed | 717 | * but also to hold up shmem_evict_inode(): so inode cannot be freed |
631 | * beneath us (pagelock doesn't help until the page is in pagecache). | 718 | * beneath us (pagelock doesn't help until the page is in pagecache). |
632 | */ | 719 | */ |
633 | error = shmem_add_to_page_cache(page, mapping, index, | 720 | if (!error) |
721 | error = shmem_add_to_page_cache(*pagep, mapping, index, | ||
634 | GFP_NOWAIT, radswap); | 722 | GFP_NOWAIT, radswap); |
635 | /* which does mem_cgroup_uncharge_cache_page on error */ | ||
636 | |||
637 | if (error != -ENOMEM) { | 723 | if (error != -ENOMEM) { |
638 | /* | 724 | /* |
639 | * Truncation and eviction use free_swap_and_cache(), which | 725 | * Truncation and eviction use free_swap_and_cache(), which |
640 | * only does trylock page: if we raced, best clean up here. | 726 | * only does trylock page: if we raced, best clean up here. |
641 | */ | 727 | */ |
642 | delete_from_swap_cache(page); | 728 | delete_from_swap_cache(*pagep); |
643 | set_page_dirty(page); | 729 | set_page_dirty(*pagep); |
644 | if (!error) { | 730 | if (!error) { |
645 | spin_lock(&info->lock); | 731 | spin_lock(&info->lock); |
646 | info->swapped--; | 732 | info->swapped--; |
@@ -660,7 +746,14 @@ int shmem_unuse(swp_entry_t swap, struct page *page) | |||
660 | struct list_head *this, *next; | 746 | struct list_head *this, *next; |
661 | struct shmem_inode_info *info; | 747 | struct shmem_inode_info *info; |
662 | int found = 0; | 748 | int found = 0; |
663 | int error; | 749 | int error = 0; |
750 | |||
751 | /* | ||
752 | * There's a faint possibility that swap page was replaced before | ||
753 | * caller locked it: caller will come back later with the right page. | ||
754 | */ | ||
755 | if (unlikely(!PageSwapCache(page) || page_private(page) != swap.val)) | ||
756 | goto out; | ||
664 | 757 | ||
665 | /* | 758 | /* |
666 | * Charge page using GFP_KERNEL while we can wait, before taking | 759 | * Charge page using GFP_KERNEL while we can wait, before taking |
@@ -676,7 +769,7 @@ int shmem_unuse(swp_entry_t swap, struct page *page) | |||
676 | list_for_each_safe(this, next, &shmem_swaplist) { | 769 | list_for_each_safe(this, next, &shmem_swaplist) { |
677 | info = list_entry(this, struct shmem_inode_info, swaplist); | 770 | info = list_entry(this, struct shmem_inode_info, swaplist); |
678 | if (info->swapped) | 771 | if (info->swapped) |
679 | found = shmem_unuse_inode(info, swap, page); | 772 | found = shmem_unuse_inode(info, swap, &page); |
680 | else | 773 | else |
681 | list_del_init(&info->swaplist); | 774 | list_del_init(&info->swaplist); |
682 | cond_resched(); | 775 | cond_resched(); |
@@ -685,8 +778,6 @@ int shmem_unuse(swp_entry_t swap, struct page *page) | |||
685 | } | 778 | } |
686 | mutex_unlock(&shmem_swaplist_mutex); | 779 | mutex_unlock(&shmem_swaplist_mutex); |
687 | 780 | ||
688 | if (!found) | ||
689 | mem_cgroup_uncharge_cache_page(page); | ||
690 | if (found < 0) | 781 | if (found < 0) |
691 | error = found; | 782 | error = found; |
692 | out: | 783 | out: |
@@ -727,6 +818,38 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) | |||
727 | WARN_ON_ONCE(1); /* Still happens? Tell us about it! */ | 818 | WARN_ON_ONCE(1); /* Still happens? Tell us about it! */ |
728 | goto redirty; | 819 | goto redirty; |
729 | } | 820 | } |
821 | |||
822 | /* | ||
823 | * This is somewhat ridiculous, but without plumbing a SWAP_MAP_FALLOC | ||
824 | * value into swapfile.c, the only way we can correctly account for a | ||
825 | * fallocated page arriving here is now to initialize it and write it. | ||
826 | * | ||
827 | * That's okay for a page already fallocated earlier, but if we have | ||
828 | * not yet completed the fallocation, then (a) we want to keep track | ||
829 | * of this page in case we have to undo it, and (b) it may not be a | ||
830 | * good idea to continue anyway, once we're pushing into swap. So | ||
831 | * reactivate the page, and let shmem_fallocate() quit when too many. | ||
832 | */ | ||
833 | if (!PageUptodate(page)) { | ||
834 | if (inode->i_private) { | ||
835 | struct shmem_falloc *shmem_falloc; | ||
836 | spin_lock(&inode->i_lock); | ||
837 | shmem_falloc = inode->i_private; | ||
838 | if (shmem_falloc && | ||
839 | index >= shmem_falloc->start && | ||
840 | index < shmem_falloc->next) | ||
841 | shmem_falloc->nr_unswapped++; | ||
842 | else | ||
843 | shmem_falloc = NULL; | ||
844 | spin_unlock(&inode->i_lock); | ||
845 | if (shmem_falloc) | ||
846 | goto redirty; | ||
847 | } | ||
848 | clear_highpage(page); | ||
849 | flush_dcache_page(page); | ||
850 | SetPageUptodate(page); | ||
851 | } | ||
852 | |||
730 | swap = get_swap_page(); | 853 | swap = get_swap_page(); |
731 | if (!swap.val) | 854 | if (!swap.val) |
732 | goto redirty; | 855 | goto redirty; |
@@ -856,6 +979,89 @@ static inline struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo) | |||
856 | #endif | 979 | #endif |
857 | 980 | ||
858 | /* | 981 | /* |
982 | * When a page is moved from swapcache to shmem filecache (either by the | ||
983 | * usual swapin of shmem_getpage_gfp(), or by the less common swapoff of | ||
984 | * shmem_unuse_inode()), it may have been read in earlier from swap, in | ||
985 | * ignorance of the mapping it belongs to. If that mapping has special | ||
986 | * constraints (like the gma500 GEM driver, which requires RAM below 4GB), | ||
987 | * we may need to copy to a suitable page before moving to filecache. | ||
988 | * | ||
989 | * In a future release, this may well be extended to respect cpuset and | ||
990 | * NUMA mempolicy, and applied also to anonymous pages in do_swap_page(); | ||
991 | * but for now it is a simple matter of zone. | ||
992 | */ | ||
993 | static bool shmem_should_replace_page(struct page *page, gfp_t gfp) | ||
994 | { | ||
995 | return page_zonenum(page) > gfp_zone(gfp); | ||
996 | } | ||
997 | |||
998 | static int shmem_replace_page(struct page **pagep, gfp_t gfp, | ||
999 | struct shmem_inode_info *info, pgoff_t index) | ||
1000 | { | ||
1001 | struct page *oldpage, *newpage; | ||
1002 | struct address_space *swap_mapping; | ||
1003 | pgoff_t swap_index; | ||
1004 | int error; | ||
1005 | |||
1006 | oldpage = *pagep; | ||
1007 | swap_index = page_private(oldpage); | ||
1008 | swap_mapping = page_mapping(oldpage); | ||
1009 | |||
1010 | /* | ||
1011 | * We have arrived here because our zones are constrained, so don't | ||
1012 | * limit chance of success by further cpuset and node constraints. | ||
1013 | */ | ||
1014 | gfp &= ~GFP_CONSTRAINT_MASK; | ||
1015 | newpage = shmem_alloc_page(gfp, info, index); | ||
1016 | if (!newpage) | ||
1017 | return -ENOMEM; | ||
1018 | |||
1019 | page_cache_get(newpage); | ||
1020 | copy_highpage(newpage, oldpage); | ||
1021 | flush_dcache_page(newpage); | ||
1022 | |||
1023 | __set_page_locked(newpage); | ||
1024 | SetPageUptodate(newpage); | ||
1025 | SetPageSwapBacked(newpage); | ||
1026 | set_page_private(newpage, swap_index); | ||
1027 | SetPageSwapCache(newpage); | ||
1028 | |||
1029 | /* | ||
1030 | * Our caller will very soon move newpage out of swapcache, but it's | ||
1031 | * a nice clean interface for us to replace oldpage by newpage there. | ||
1032 | */ | ||
1033 | spin_lock_irq(&swap_mapping->tree_lock); | ||
1034 | error = shmem_radix_tree_replace(swap_mapping, swap_index, oldpage, | ||
1035 | newpage); | ||
1036 | if (!error) { | ||
1037 | __inc_zone_page_state(newpage, NR_FILE_PAGES); | ||
1038 | __dec_zone_page_state(oldpage, NR_FILE_PAGES); | ||
1039 | } | ||
1040 | spin_unlock_irq(&swap_mapping->tree_lock); | ||
1041 | |||
1042 | if (unlikely(error)) { | ||
1043 | /* | ||
1044 | * Is this possible? I think not, now that our callers check | ||
1045 | * both PageSwapCache and page_private after getting page lock; | ||
1046 | * but be defensive. Reverse old to newpage for clear and free. | ||
1047 | */ | ||
1048 | oldpage = newpage; | ||
1049 | } else { | ||
1050 | mem_cgroup_replace_page_cache(oldpage, newpage); | ||
1051 | lru_cache_add_anon(newpage); | ||
1052 | *pagep = newpage; | ||
1053 | } | ||
1054 | |||
1055 | ClearPageSwapCache(oldpage); | ||
1056 | set_page_private(oldpage, 0); | ||
1057 | |||
1058 | unlock_page(oldpage); | ||
1059 | page_cache_release(oldpage); | ||
1060 | page_cache_release(oldpage); | ||
1061 | return error; | ||
1062 | } | ||
1063 | |||
1064 | /* | ||
859 | * shmem_getpage_gfp - find page in cache, or get from swap, or allocate | 1065 | * shmem_getpage_gfp - find page in cache, or get from swap, or allocate |
860 | * | 1066 | * |
861 | * If we allocate a new one we do not mark it dirty. That's up to the | 1067 | * If we allocate a new one we do not mark it dirty. That's up to the |
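shmem_should_replace_page() fires only when the page read back from swap sits in a higher zone than the gfp mask in force for the mapping allows, which is how a constrained user such as the gma500 GEM driver mentioned above gets its below-4GB guarantee back. A hypothetical driver would set that constraint on the shmem mapping itself; mapping_set_gfp_mask() is the existing pagemap helper, and the exact flags here are illustrative, not quoted from gma500:

	/* Illustrative only: confine future page-cache allocations for this
	 * shmem-backed object to ZONE_DMA32, so a swapin that lands in a
	 * higher zone is copied down by shmem_replace_page() before use. */
	mapping_set_gfp_mask(file->f_mapping, GFP_KERNEL | __GFP_DMA32);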
@@ -872,6 +1078,7 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, | |||
872 | swp_entry_t swap; | 1078 | swp_entry_t swap; |
873 | int error; | 1079 | int error; |
874 | int once = 0; | 1080 | int once = 0; |
1081 | int alloced = 0; | ||
875 | 1082 | ||
876 | if (index > (MAX_LFS_FILESIZE >> PAGE_CACHE_SHIFT)) | 1083 | if (index > (MAX_LFS_FILESIZE >> PAGE_CACHE_SHIFT)) |
877 | return -EFBIG; | 1084 | return -EFBIG; |
@@ -883,19 +1090,21 @@ repeat: | |||
883 | page = NULL; | 1090 | page = NULL; |
884 | } | 1091 | } |
885 | 1092 | ||
886 | if (sgp != SGP_WRITE && | 1093 | if (sgp != SGP_WRITE && sgp != SGP_FALLOC && |
887 | ((loff_t)index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) { | 1094 | ((loff_t)index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) { |
888 | error = -EINVAL; | 1095 | error = -EINVAL; |
889 | goto failed; | 1096 | goto failed; |
890 | } | 1097 | } |
891 | 1098 | ||
1099 | /* fallocated page? */ | ||
1100 | if (page && !PageUptodate(page)) { | ||
1101 | if (sgp != SGP_READ) | ||
1102 | goto clear; | ||
1103 | unlock_page(page); | ||
1104 | page_cache_release(page); | ||
1105 | page = NULL; | ||
1106 | } | ||
892 | if (page || (sgp == SGP_READ && !swap.val)) { | 1107 | if (page || (sgp == SGP_READ && !swap.val)) { |
893 | /* | ||
894 | * Once we can get the page lock, it must be uptodate: | ||
895 | * if there were an error in reading back from swap, | ||
896 | * the page would not be inserted into the filecache. | ||
897 | */ | ||
898 | BUG_ON(page && !PageUptodate(page)); | ||
899 | *pagep = page; | 1108 | *pagep = page; |
900 | return 0; | 1109 | return 0; |
901 | } | 1110 | } |
@@ -923,26 +1132,31 @@ repeat: | |||
923 | 1132 | ||
924 | /* We have to do this with page locked to prevent races */ | 1133 | /* We have to do this with page locked to prevent races */ |
925 | lock_page(page); | 1134 | lock_page(page); |
1135 | if (!PageSwapCache(page) || page_private(page) != swap.val || | ||
1136 | !shmem_confirm_swap(mapping, index, swap)) { | ||
1137 | error = -EEXIST; /* try again */ | ||
1138 | goto unlock; | ||
1139 | } | ||
926 | if (!PageUptodate(page)) { | 1140 | if (!PageUptodate(page)) { |
927 | error = -EIO; | 1141 | error = -EIO; |
928 | goto failed; | 1142 | goto failed; |
929 | } | 1143 | } |
930 | wait_on_page_writeback(page); | 1144 | wait_on_page_writeback(page); |
931 | 1145 | ||
932 | /* Someone may have already done it for us */ | 1146 | if (shmem_should_replace_page(page, gfp)) { |
933 | if (page->mapping) { | 1147 | error = shmem_replace_page(&page, gfp, info, index); |
934 | if (page->mapping == mapping && | 1148 | if (error) |
935 | page->index == index) | 1149 | goto failed; |
936 | goto done; | ||
937 | error = -EEXIST; | ||
938 | goto failed; | ||
939 | } | 1150 | } |
940 | 1151 | ||
941 | error = mem_cgroup_cache_charge(page, current->mm, | 1152 | error = mem_cgroup_cache_charge(page, current->mm, |
942 | gfp & GFP_RECLAIM_MASK); | 1153 | gfp & GFP_RECLAIM_MASK); |
943 | if (!error) | 1154 | if (!error) { |
944 | error = shmem_add_to_page_cache(page, mapping, index, | 1155 | error = shmem_add_to_page_cache(page, mapping, index, |
945 | gfp, swp_to_radix_entry(swap)); | 1156 | gfp, swp_to_radix_entry(swap)); |
1157 | /* We already confirmed swap, and make no allocation */ | ||
1158 | VM_BUG_ON(error); | ||
1159 | } | ||
946 | if (error) | 1160 | if (error) |
947 | goto failed; | 1161 | goto failed; |
948 | 1162 | ||
@@ -979,11 +1193,18 @@ repeat: | |||
979 | __set_page_locked(page); | 1193 | __set_page_locked(page); |
980 | error = mem_cgroup_cache_charge(page, current->mm, | 1194 | error = mem_cgroup_cache_charge(page, current->mm, |
981 | gfp & GFP_RECLAIM_MASK); | 1195 | gfp & GFP_RECLAIM_MASK); |
982 | if (!error) | ||
983 | error = shmem_add_to_page_cache(page, mapping, index, | ||
984 | gfp, NULL); | ||
985 | if (error) | 1196 | if (error) |
986 | goto decused; | 1197 | goto decused; |
1198 | error = radix_tree_preload(gfp & GFP_RECLAIM_MASK); | ||
1199 | if (!error) { | ||
1200 | error = shmem_add_to_page_cache(page, mapping, index, | ||
1201 | gfp, NULL); | ||
1202 | radix_tree_preload_end(); | ||
1203 | } | ||
1204 | if (error) { | ||
1205 | mem_cgroup_uncharge_cache_page(page); | ||
1206 | goto decused; | ||
1207 | } | ||
987 | lru_cache_add_anon(page); | 1208 | lru_cache_add_anon(page); |
988 | 1209 | ||
989 | spin_lock(&info->lock); | 1210 | spin_lock(&info->lock); |
@@ -991,19 +1212,36 @@ repeat: | |||
991 | inode->i_blocks += BLOCKS_PER_PAGE; | 1212 | inode->i_blocks += BLOCKS_PER_PAGE; |
992 | shmem_recalc_inode(inode); | 1213 | shmem_recalc_inode(inode); |
993 | spin_unlock(&info->lock); | 1214 | spin_unlock(&info->lock); |
1215 | alloced = true; | ||
994 | 1216 | ||
995 | clear_highpage(page); | 1217 | /* |
996 | flush_dcache_page(page); | 1218 | * Let SGP_FALLOC use the SGP_WRITE optimization on a new page. |
997 | SetPageUptodate(page); | 1219 | */ |
1220 | if (sgp == SGP_FALLOC) | ||
1221 | sgp = SGP_WRITE; | ||
1222 | clear: | ||
1223 | /* | ||
1224 | * Let SGP_WRITE caller clear ends if write does not fill page; | ||
1225 | * but SGP_FALLOC on a page fallocated earlier must initialize | ||
1226 | * it now, lest undo on failure cancel our earlier guarantee. | ||
1227 | */ | ||
1228 | if (sgp != SGP_WRITE) { | ||
1229 | clear_highpage(page); | ||
1230 | flush_dcache_page(page); | ||
1231 | SetPageUptodate(page); | ||
1232 | } | ||
998 | if (sgp == SGP_DIRTY) | 1233 | if (sgp == SGP_DIRTY) |
999 | set_page_dirty(page); | 1234 | set_page_dirty(page); |
1000 | } | 1235 | } |
1001 | done: | 1236 | |
1002 | /* Perhaps the file has been truncated since we checked */ | 1237 | /* Perhaps the file has been truncated since we checked */ |
1003 | if (sgp != SGP_WRITE && | 1238 | if (sgp != SGP_WRITE && sgp != SGP_FALLOC && |
1004 | ((loff_t)index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) { | 1239 | ((loff_t)index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) { |
1005 | error = -EINVAL; | 1240 | error = -EINVAL; |
1006 | goto trunc; | 1241 | if (alloced) |
1242 | goto trunc; | ||
1243 | else | ||
1244 | goto failed; | ||
1007 | } | 1245 | } |
1008 | *pagep = page; | 1246 | *pagep = page; |
1009 | return 0; | 1247 | return 0; |
@@ -1012,6 +1250,7 @@ done: | |||
1012 | * Error recovery. | 1250 | * Error recovery. |
1013 | */ | 1251 | */ |
1014 | trunc: | 1252 | trunc: |
1253 | info = SHMEM_I(inode); | ||
1015 | ClearPageDirty(page); | 1254 | ClearPageDirty(page); |
1016 | delete_from_page_cache(page); | 1255 | delete_from_page_cache(page); |
1017 | spin_lock(&info->lock); | 1256 | spin_lock(&info->lock); |
@@ -1019,19 +1258,16 @@ trunc: | |||
1019 | inode->i_blocks -= BLOCKS_PER_PAGE; | 1258 | inode->i_blocks -= BLOCKS_PER_PAGE; |
1020 | spin_unlock(&info->lock); | 1259 | spin_unlock(&info->lock); |
1021 | decused: | 1260 | decused: |
1261 | sbinfo = SHMEM_SB(inode->i_sb); | ||
1022 | if (sbinfo->max_blocks) | 1262 | if (sbinfo->max_blocks) |
1023 | percpu_counter_add(&sbinfo->used_blocks, -1); | 1263 | percpu_counter_add(&sbinfo->used_blocks, -1); |
1024 | unacct: | 1264 | unacct: |
1025 | shmem_unacct_blocks(info->flags, 1); | 1265 | shmem_unacct_blocks(info->flags, 1); |
1026 | failed: | 1266 | failed: |
1027 | if (swap.val && error != -EINVAL) { | 1267 | if (swap.val && error != -EINVAL && |
1028 | struct page *test = find_get_page(mapping, index); | 1268 | !shmem_confirm_swap(mapping, index, swap)) |
1029 | if (test && !radix_tree_exceptional_entry(test)) | 1269 | error = -EEXIST; |
1030 | page_cache_release(test); | 1270 | unlock: |
1031 | /* Have another try if the entry has changed */ | ||
1032 | if (test != swp_to_radix_entry(swap)) | ||
1033 | error = -EEXIST; | ||
1034 | } | ||
1035 | if (page) { | 1271 | if (page) { |
1036 | unlock_page(page); | 1272 | unlock_page(page); |
1037 | page_cache_release(page); | 1273 | page_cache_release(page); |
@@ -1043,7 +1279,7 @@ failed: | |||
1043 | spin_unlock(&info->lock); | 1279 | spin_unlock(&info->lock); |
1044 | goto repeat; | 1280 | goto repeat; |
1045 | } | 1281 | } |
1046 | if (error == -EEXIST) | 1282 | if (error == -EEXIST) /* from above or from radix_tree_insert */ |
1047 | goto repeat; | 1283 | goto repeat; |
1048 | return error; | 1284 | return error; |
1049 | } | 1285 | } |
@@ -1204,6 +1440,14 @@ shmem_write_end(struct file *file, struct address_space *mapping, | |||
1204 | if (pos + copied > inode->i_size) | 1440 | if (pos + copied > inode->i_size) |
1205 | i_size_write(inode, pos + copied); | 1441 | i_size_write(inode, pos + copied); |
1206 | 1442 | ||
1443 | if (!PageUptodate(page)) { | ||
1444 | if (copied < PAGE_CACHE_SIZE) { | ||
1445 | unsigned from = pos & (PAGE_CACHE_SIZE - 1); | ||
1446 | zero_user_segments(page, 0, from, | ||
1447 | from + copied, PAGE_CACHE_SIZE); | ||
1448 | } | ||
1449 | SetPageUptodate(page); | ||
1450 | } | ||
1207 | set_page_dirty(page); | 1451 | set_page_dirty(page); |
1208 | unlock_page(page); | 1452 | unlock_page(page); |
1209 | page_cache_release(page); | 1453 | page_cache_release(page); |
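This zeroing is needed because shmem_getpage_gfp() above no longer clears pages handed out for SGP_WRITE, so a short copy must sanitize whatever the write did not cover before the page can be marked Uptodate. For example, with 4096-byte pages, pos = 5000 and copied = 100 give from = 5000 & 4095 = 904, so bytes [0, 904) and [1004, 4096) are zeroed; a copy that fills the whole page skips the zeroing entirely.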
@@ -1365,6 +1609,7 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos, | |||
1365 | struct splice_pipe_desc spd = { | 1609 | struct splice_pipe_desc spd = { |
1366 | .pages = pages, | 1610 | .pages = pages, |
1367 | .partial = partial, | 1611 | .partial = partial, |
1612 | .nr_pages_max = PIPE_DEF_BUFFERS, | ||
1368 | .flags = flags, | 1613 | .flags = flags, |
1369 | .ops = &page_cache_pipe_buf_ops, | 1614 | .ops = &page_cache_pipe_buf_ops, |
1370 | .spd_release = spd_release_page, | 1615 | .spd_release = spd_release_page, |
@@ -1453,7 +1698,7 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos, | |||
1453 | if (spd.nr_pages) | 1698 | if (spd.nr_pages) |
1454 | error = splice_to_pipe(pipe, &spd); | 1699 | error = splice_to_pipe(pipe, &spd); |
1455 | 1700 | ||
1456 | splice_shrink_spd(pipe, &spd); | 1701 | splice_shrink_spd(&spd); |
1457 | 1702 | ||
1458 | if (error > 0) { | 1703 | if (error > 0) { |
1459 | *ppos += error; | 1704 | *ppos += error; |
@@ -1462,6 +1707,107 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos, | |||
1462 | return error; | 1707 | return error; |
1463 | } | 1708 | } |
1464 | 1709 | ||
1710 | static long shmem_fallocate(struct file *file, int mode, loff_t offset, | ||
1711 | loff_t len) | ||
1712 | { | ||
1713 | struct inode *inode = file->f_path.dentry->d_inode; | ||
1714 | struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); | ||
1715 | struct shmem_falloc shmem_falloc; | ||
1716 | pgoff_t start, index, end; | ||
1717 | int error; | ||
1718 | |||
1719 | mutex_lock(&inode->i_mutex); | ||
1720 | |||
1721 | if (mode & FALLOC_FL_PUNCH_HOLE) { | ||
1722 | struct address_space *mapping = file->f_mapping; | ||
1723 | loff_t unmap_start = round_up(offset, PAGE_SIZE); | ||
1724 | loff_t unmap_end = round_down(offset + len, PAGE_SIZE) - 1; | ||
1725 | |||
1726 | if ((u64)unmap_end > (u64)unmap_start) | ||
1727 | unmap_mapping_range(mapping, unmap_start, | ||
1728 | 1 + unmap_end - unmap_start, 0); | ||
1729 | shmem_truncate_range(inode, offset, offset + len - 1); | ||
1730 | /* No need to unmap again: hole-punching leaves COWed pages */ | ||
1731 | error = 0; | ||
1732 | goto out; | ||
1733 | } | ||
1734 | |||
1735 | /* We need to check rlimit even when FALLOC_FL_KEEP_SIZE */ | ||
1736 | error = inode_newsize_ok(inode, offset + len); | ||
1737 | if (error) | ||
1738 | goto out; | ||
1739 | |||
1740 | start = offset >> PAGE_CACHE_SHIFT; | ||
1741 | end = (offset + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; | ||
1742 | /* Try to avoid a swapstorm if len is impossible to satisfy */ | ||
1743 | if (sbinfo->max_blocks && end - start > sbinfo->max_blocks) { | ||
1744 | error = -ENOSPC; | ||
1745 | goto out; | ||
1746 | } | ||
1747 | |||
1748 | shmem_falloc.start = start; | ||
1749 | shmem_falloc.next = start; | ||
1750 | shmem_falloc.nr_falloced = 0; | ||
1751 | shmem_falloc.nr_unswapped = 0; | ||
1752 | spin_lock(&inode->i_lock); | ||
1753 | inode->i_private = &shmem_falloc; | ||
1754 | spin_unlock(&inode->i_lock); | ||
1755 | |||
1756 | for (index = start; index < end; index++) { | ||
1757 | struct page *page; | ||
1758 | |||
1759 | /* | ||
1760 | * Good, the fallocate(2) manpage permits EINTR: we may have | ||
1761 | * been interrupted because we are using up too much memory. | ||
1762 | */ | ||
1763 | if (signal_pending(current)) | ||
1764 | error = -EINTR; | ||
1765 | else if (shmem_falloc.nr_unswapped > shmem_falloc.nr_falloced) | ||
1766 | error = -ENOMEM; | ||
1767 | else | ||
1768 | error = shmem_getpage(inode, index, &page, SGP_FALLOC, | ||
1769 | NULL); | ||
1770 | if (error) { | ||
1771 | /* Remove the !PageUptodate pages we added */ | ||
1772 | shmem_undo_range(inode, | ||
1773 | (loff_t)start << PAGE_CACHE_SHIFT, | ||
1774 | (loff_t)index << PAGE_CACHE_SHIFT, true); | ||
1775 | goto undone; | ||
1776 | } | ||
1777 | |||
1778 | /* | ||
1779 | * Inform shmem_writepage() how far we have reached. | ||
1780 | * No need for lock or barrier: we have the page lock. | ||
1781 | */ | ||
1782 | shmem_falloc.next++; | ||
1783 | if (!PageUptodate(page)) | ||
1784 | shmem_falloc.nr_falloced++; | ||
1785 | |||
1786 | /* | ||
1787 | * If !PageUptodate, leave it that way so that freeable pages | ||
1788 | * can be recognized if we need to rollback on error later. | ||
1789 | * But set_page_dirty so that memory pressure will swap rather | ||
1790 | * than free the pages we are allocating (and SGP_CACHE pages | ||
1791 | * might still be clean: we now need to mark those dirty too). | ||
1792 | */ | ||
1793 | set_page_dirty(page); | ||
1794 | unlock_page(page); | ||
1795 | page_cache_release(page); | ||
1796 | cond_resched(); | ||
1797 | } | ||
1798 | |||
1799 | if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size) | ||
1800 | i_size_write(inode, offset + len); | ||
1801 | inode->i_ctime = CURRENT_TIME; | ||
1802 | undone: | ||
1803 | spin_lock(&inode->i_lock); | ||
1804 | inode->i_private = NULL; | ||
1805 | spin_unlock(&inode->i_lock); | ||
1806 | out: | ||
1807 | mutex_unlock(&inode->i_mutex); | ||
1808 | return error; | ||
1809 | } | ||
1810 | |||
1465 | static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf) | 1811 | static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf) |
1466 | { | 1812 | { |
1467 | struct shmem_sb_info *sbinfo = SHMEM_SB(dentry->d_sb); | 1813 | struct shmem_sb_info *sbinfo = SHMEM_SB(dentry->d_sb); |
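With .fallocate wired up in shmem_file_operations (see the hunk below), tmpfs files gain both preallocation and hole punching. A minimal userspace sketch, assuming a typical tmpfs mount at /dev/shm (FALLOC_FL_PUNCH_HOLE must be paired with FALLOC_FL_KEEP_SIZE; error handling kept minimal):

#define _GNU_SOURCE
#include <fcntl.h>
#include <linux/falloc.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/dev/shm/falloc-demo", O_RDWR | O_CREAT, 0600);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	/* Preallocate 1 MiB: pages are allocated up front but left !Uptodate
	 * until first used. */
	if (fallocate(fd, 0, 0, 1 << 20) != 0)
		perror("fallocate");
	/* Punch a 64 KiB hole at offset 128 KiB without changing i_size. */
	if (fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
		      128 << 10, 64 << 10) != 0)
		perror("punch hole");
	close(fd);
	return 0;
}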
@@ -1665,6 +2011,7 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s | |||
1665 | kaddr = kmap_atomic(page); | 2011 | kaddr = kmap_atomic(page); |
1666 | memcpy(kaddr, symname, len); | 2012 | memcpy(kaddr, symname, len); |
1667 | kunmap_atomic(kaddr); | 2013 | kunmap_atomic(kaddr); |
2014 | SetPageUptodate(page); | ||
1668 | set_page_dirty(page); | 2015 | set_page_dirty(page); |
1669 | unlock_page(page); | 2016 | unlock_page(page); |
1670 | page_cache_release(page); | 2017 | page_cache_release(page); |
@@ -2033,11 +2380,9 @@ static struct dentry *shmem_fh_to_dentry(struct super_block *sb, | |||
2033 | return dentry; | 2380 | return dentry; |
2034 | } | 2381 | } |
2035 | 2382 | ||
2036 | static int shmem_encode_fh(struct dentry *dentry, __u32 *fh, int *len, | 2383 | static int shmem_encode_fh(struct inode *inode, __u32 *fh, int *len, |
2037 | int connectable) | 2384 | struct inode *parent) |
2038 | { | 2385 | { |
2039 | struct inode *inode = dentry->d_inode; | ||
2040 | |||
2041 | if (*len < 3) { | 2386 | if (*len < 3) { |
2042 | *len = 3; | 2387 | *len = 3; |
2043 | return 255; | 2388 | return 255; |
@@ -2075,6 +2420,8 @@ static int shmem_parse_options(char *options, struct shmem_sb_info *sbinfo, | |||
2075 | bool remount) | 2420 | bool remount) |
2076 | { | 2421 | { |
2077 | char *this_char, *value, *rest; | 2422 | char *this_char, *value, *rest; |
2423 | uid_t uid; | ||
2424 | gid_t gid; | ||
2078 | 2425 | ||
2079 | while (options != NULL) { | 2426 | while (options != NULL) { |
2080 | this_char = options; | 2427 | this_char = options; |
@@ -2134,15 +2481,21 @@ static int shmem_parse_options(char *options, struct shmem_sb_info *sbinfo, | |||
2134 | } else if (!strcmp(this_char,"uid")) { | 2481 | } else if (!strcmp(this_char,"uid")) { |
2135 | if (remount) | 2482 | if (remount) |
2136 | continue; | 2483 | continue; |
2137 | sbinfo->uid = simple_strtoul(value, &rest, 0); | 2484 | uid = simple_strtoul(value, &rest, 0); |
2138 | if (*rest) | 2485 | if (*rest) |
2139 | goto bad_val; | 2486 | goto bad_val; |
2487 | sbinfo->uid = make_kuid(current_user_ns(), uid); | ||
2488 | if (!uid_valid(sbinfo->uid)) | ||
2489 | goto bad_val; | ||
2140 | } else if (!strcmp(this_char,"gid")) { | 2490 | } else if (!strcmp(this_char,"gid")) { |
2141 | if (remount) | 2491 | if (remount) |
2142 | continue; | 2492 | continue; |
2143 | sbinfo->gid = simple_strtoul(value, &rest, 0); | 2493 | gid = simple_strtoul(value, &rest, 0); |
2144 | if (*rest) | 2494 | if (*rest) |
2145 | goto bad_val; | 2495 | goto bad_val; |
2496 | sbinfo->gid = make_kgid(current_user_ns(), gid); | ||
2497 | if (!gid_valid(sbinfo->gid)) | ||
2498 | goto bad_val; | ||
2146 | } else if (!strcmp(this_char,"mpol")) { | 2499 | } else if (!strcmp(this_char,"mpol")) { |
2147 | if (mpol_parse_str(value, &sbinfo->mpol, 1)) | 2500 | if (mpol_parse_str(value, &sbinfo->mpol, 1)) |
2148 | goto bad_val; | 2501 | goto bad_val; |
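The mount syntax is unchanged (e.g. "mount -t tmpfs -o uid=1000,gid=1000 tmpfs /mnt"); the difference is that the numeric values are now converted through make_kuid()/make_kgid() in the mounting process's user namespace, and the mount fails up front if they do not map to valid kernel ids.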
@@ -2210,10 +2563,12 @@ static int shmem_show_options(struct seq_file *seq, struct dentry *root) | |||
2210 | seq_printf(seq, ",nr_inodes=%lu", sbinfo->max_inodes); | 2563 | seq_printf(seq, ",nr_inodes=%lu", sbinfo->max_inodes); |
2211 | if (sbinfo->mode != (S_IRWXUGO | S_ISVTX)) | 2564 | if (sbinfo->mode != (S_IRWXUGO | S_ISVTX)) |
2212 | seq_printf(seq, ",mode=%03ho", sbinfo->mode); | 2565 | seq_printf(seq, ",mode=%03ho", sbinfo->mode); |
2213 | if (sbinfo->uid != 0) | 2566 | if (!uid_eq(sbinfo->uid, GLOBAL_ROOT_UID)) |
2214 | seq_printf(seq, ",uid=%u", sbinfo->uid); | 2567 | seq_printf(seq, ",uid=%u", |
2215 | if (sbinfo->gid != 0) | 2568 | from_kuid_munged(&init_user_ns, sbinfo->uid)); |
2216 | seq_printf(seq, ",gid=%u", sbinfo->gid); | 2569 | if (!gid_eq(sbinfo->gid, GLOBAL_ROOT_GID)) |
2570 | seq_printf(seq, ",gid=%u", | ||
2571 | from_kgid_munged(&init_user_ns, sbinfo->gid)); | ||
2217 | shmem_show_mpol(seq, sbinfo->mpol); | 2572 | shmem_show_mpol(seq, sbinfo->mpol); |
2218 | return 0; | 2573 | return 0; |
2219 | } | 2574 | } |
@@ -2260,6 +2615,7 @@ int shmem_fill_super(struct super_block *sb, void *data, int silent) | |||
2260 | } | 2615 | } |
2261 | } | 2616 | } |
2262 | sb->s_export_op = &shmem_export_ops; | 2617 | sb->s_export_op = &shmem_export_ops; |
2618 | sb->s_flags |= MS_NOSEC; | ||
2263 | #else | 2619 | #else |
2264 | sb->s_flags |= MS_NOUSER; | 2620 | sb->s_flags |= MS_NOUSER; |
2265 | #endif | 2621 | #endif |
@@ -2362,12 +2718,12 @@ static const struct file_operations shmem_file_operations = { | |||
2362 | .fsync = noop_fsync, | 2718 | .fsync = noop_fsync, |
2363 | .splice_read = shmem_file_splice_read, | 2719 | .splice_read = shmem_file_splice_read, |
2364 | .splice_write = generic_file_splice_write, | 2720 | .splice_write = generic_file_splice_write, |
2721 | .fallocate = shmem_fallocate, | ||
2365 | #endif | 2722 | #endif |
2366 | }; | 2723 | }; |
2367 | 2724 | ||
2368 | static const struct inode_operations shmem_inode_operations = { | 2725 | static const struct inode_operations shmem_inode_operations = { |
2369 | .setattr = shmem_setattr, | 2726 | .setattr = shmem_setattr, |
2370 | .truncate_range = shmem_truncate_range, | ||
2371 | #ifdef CONFIG_TMPFS_XATTR | 2727 | #ifdef CONFIG_TMPFS_XATTR |
2372 | .setxattr = shmem_setxattr, | 2728 | .setxattr = shmem_setxattr, |
2373 | .getxattr = shmem_getxattr, | 2729 | .getxattr = shmem_getxattr, |
diff --git a/mm/slub.c b/mm/slub.c --- a/mm/slub.c +++ b/mm/slub.c | |||
@@ -1369,7 +1369,7 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) | |||
1369 | 1369 | ||
1370 | inc_slabs_node(s, page_to_nid(page), page->objects); | 1370 | inc_slabs_node(s, page_to_nid(page), page->objects); |
1371 | page->slab = s; | 1371 | page->slab = s; |
1372 | page->flags |= 1 << PG_slab; | 1372 | __SetPageSlab(page); |
1373 | 1373 | ||
1374 | start = page_address(page); | 1374 | start = page_address(page); |
1375 | 1375 | ||
@@ -1514,15 +1514,19 @@ static inline void *acquire_slab(struct kmem_cache *s, | |||
1514 | freelist = page->freelist; | 1514 | freelist = page->freelist; |
1515 | counters = page->counters; | 1515 | counters = page->counters; |
1516 | new.counters = counters; | 1516 | new.counters = counters; |
1517 | if (mode) | 1517 | if (mode) { |
1518 | new.inuse = page->objects; | 1518 | new.inuse = page->objects; |
1519 | new.freelist = NULL; | ||
1520 | } else { | ||
1521 | new.freelist = freelist; | ||
1522 | } | ||
1519 | 1523 | ||
1520 | VM_BUG_ON(new.frozen); | 1524 | VM_BUG_ON(new.frozen); |
1521 | new.frozen = 1; | 1525 | new.frozen = 1; |
1522 | 1526 | ||
1523 | } while (!__cmpxchg_double_slab(s, page, | 1527 | } while (!__cmpxchg_double_slab(s, page, |
1524 | freelist, counters, | 1528 | freelist, counters, |
1525 | NULL, new.counters, | 1529 | new.freelist, new.counters, |
1526 | "lock and freeze")); | 1530 | "lock and freeze")); |
1527 | 1531 | ||
1528 | remove_partial(n, page); | 1532 | remove_partial(n, page); |
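acquire_slab() now chooses the new freelist itself: NULL when the slab is being taken wholesale as the cpu slab (mode set, inuse raised to page->objects), or the existing freelist when the page is headed for the per-cpu partial list; the next hunk drops the page->freelist fixup that get_partial_node() used to do afterwards. As a generic illustration of the optimistic retry shape around __cmpxchg_double_slab() — read the current values, compute the desired ones, retry if something changed in between — a C11 single-word sketch (not SLUB code):

#include <stdatomic.h>

/* Hypothetical example of the read/compute/compare-exchange retry loop;
 * SLUB does the same on the (freelist, counters) pair at once via
 * __cmpxchg_double_slab(). */
static unsigned long take_all(_Atomic unsigned long *counter)
{
	unsigned long old = atomic_load(counter);
	unsigned long newval;

	do {
		newval = 0;	/* desired value, derived from 'old' */
	} while (!atomic_compare_exchange_weak(counter, &old, newval));

	return old;	/* the value we atomically claimed */
}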
@@ -1564,7 +1568,6 @@ static void *get_partial_node(struct kmem_cache *s, | |||
1564 | object = t; | 1568 | object = t; |
1565 | available = page->objects - page->inuse; | 1569 | available = page->objects - page->inuse; |
1566 | } else { | 1570 | } else { |
1567 | page->freelist = t; | ||
1568 | available = put_cpu_partial(s, page, 0); | 1571 | available = put_cpu_partial(s, page, 0); |
1569 | stat(s, CPU_PARTIAL_NODE); | 1572 | stat(s, CPU_PARTIAL_NODE); |
1570 | } | 1573 | } |
@@ -1579,7 +1582,7 @@ static void *get_partial_node(struct kmem_cache *s, | |||
1579 | /* | 1582 | /* |
1580 | * Get a page from somewhere. Search in increasing NUMA distances. | 1583 | * Get a page from somewhere. Search in increasing NUMA distances. |
1581 | */ | 1584 | */ |
1582 | static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags, | 1585 | static void *get_any_partial(struct kmem_cache *s, gfp_t flags, |
1583 | struct kmem_cache_cpu *c) | 1586 | struct kmem_cache_cpu *c) |
1584 | { | 1587 | { |
1585 | #ifdef CONFIG_NUMA | 1588 | #ifdef CONFIG_NUMA |
@@ -2766,7 +2769,7 @@ static unsigned long calculate_alignment(unsigned long flags, | |||
2766 | } | 2769 | } |
2767 | 2770 | ||
2768 | static void | 2771 | static void |
2769 | init_kmem_cache_node(struct kmem_cache_node *n, struct kmem_cache *s) | 2772 | init_kmem_cache_node(struct kmem_cache_node *n) |
2770 | { | 2773 | { |
2771 | n->nr_partial = 0; | 2774 | n->nr_partial = 0; |
2772 | spin_lock_init(&n->list_lock); | 2775 | spin_lock_init(&n->list_lock); |
@@ -2836,7 +2839,7 @@ static void early_kmem_cache_node_alloc(int node) | |||
2836 | init_object(kmem_cache_node, n, SLUB_RED_ACTIVE); | 2839 | init_object(kmem_cache_node, n, SLUB_RED_ACTIVE); |
2837 | init_tracking(kmem_cache_node, n); | 2840 | init_tracking(kmem_cache_node, n); |
2838 | #endif | 2841 | #endif |
2839 | init_kmem_cache_node(n, kmem_cache_node); | 2842 | init_kmem_cache_node(n); |
2840 | inc_slabs_node(kmem_cache_node, node, page->objects); | 2843 | inc_slabs_node(kmem_cache_node, node, page->objects); |
2841 | 2844 | ||
2842 | add_partial(n, page, DEACTIVATE_TO_HEAD); | 2845 | add_partial(n, page, DEACTIVATE_TO_HEAD); |
@@ -2876,7 +2879,7 @@ static int init_kmem_cache_nodes(struct kmem_cache *s) | |||
2876 | } | 2879 | } |
2877 | 2880 | ||
2878 | s->node[node] = n; | 2881 | s->node[node] = n; |
2879 | init_kmem_cache_node(n, s); | 2882 | init_kmem_cache_node(n); |
2880 | } | 2883 | } |
2881 | return 1; | 2884 | return 1; |
2882 | } | 2885 | } |
@@ -3625,7 +3628,7 @@ static int slab_mem_going_online_callback(void *arg) | |||
3625 | ret = -ENOMEM; | 3628 | ret = -ENOMEM; |
3626 | goto out; | 3629 | goto out; |
3627 | } | 3630 | } |
3628 | init_kmem_cache_node(n, s); | 3631 | init_kmem_cache_node(n); |
3629 | s->node[nid] = n; | 3632 | s->node[nid] = n; |
3630 | } | 3633 | } |
3631 | out: | 3634 | out: |
@@ -3968,9 +3971,9 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size, | |||
3968 | } | 3971 | } |
3969 | return s; | 3972 | return s; |
3970 | } | 3973 | } |
3971 | kfree(n); | ||
3972 | kfree(s); | 3974 | kfree(s); |
3973 | } | 3975 | } |
3976 | kfree(n); | ||
3974 | err: | 3977 | err: |
3975 | up_write(&slub_lock); | 3978 | up_write(&slub_lock); |
3976 | 3979 | ||
diff --git a/mm/sparse.c b/mm/sparse.c index a8bc7d364deb..c7bb952400c8 100644 --- a/mm/sparse.c +++ b/mm/sparse.c | |||
@@ -273,10 +273,11 @@ static unsigned long *__kmalloc_section_usemap(void) | |||
273 | #ifdef CONFIG_MEMORY_HOTREMOVE | 273 | #ifdef CONFIG_MEMORY_HOTREMOVE |
274 | static unsigned long * __init | 274 | static unsigned long * __init |
275 | sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat, | 275 | sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat, |
276 | unsigned long count) | 276 | unsigned long size) |
277 | { | 277 | { |
278 | unsigned long section_nr; | 278 | unsigned long goal, limit; |
279 | 279 | unsigned long *p; | |
280 | int nid; | ||
280 | /* | 281 | /* |
281 | * A page may contain usemaps for other sections preventing the | 282 | * A page may contain usemaps for other sections preventing the |
282 | * page being freed and making a section unremovable while | 283 | * page being freed and making a section unremovable while |
@@ -287,8 +288,17 @@ sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat, | |||
287 | * from the same section as the pgdat where possible to avoid | 288 | * from the same section as the pgdat where possible to avoid |
288 | * this problem. | 289 | * this problem. |
289 | */ | 290 | */ |
290 | section_nr = pfn_to_section_nr(__pa(pgdat) >> PAGE_SHIFT); | 291 | goal = __pa(pgdat) & (PAGE_SECTION_MASK << PAGE_SHIFT); |
291 | return alloc_bootmem_section(usemap_size() * count, section_nr); | 292 | limit = goal + (1UL << PA_SECTION_SHIFT); |
293 | nid = early_pfn_to_nid(goal >> PAGE_SHIFT); | ||
294 | again: | ||
295 | p = ___alloc_bootmem_node_nopanic(NODE_DATA(nid), size, | ||
296 | SMP_CACHE_BYTES, goal, limit); | ||
297 | if (!p && limit) { | ||
298 | limit = 0; | ||
299 | goto again; | ||
300 | } | ||
301 | return p; | ||
292 | } | 302 | } |
293 | 303 | ||
294 | static void __init check_usemap_section_nr(int nid, unsigned long *usemap) | 304 | static void __init check_usemap_section_nr(int nid, unsigned long *usemap) |
@@ -332,9 +342,9 @@ static void __init check_usemap_section_nr(int nid, unsigned long *usemap) | |||
332 | #else | 342 | #else |
333 | static unsigned long * __init | 343 | static unsigned long * __init |
334 | sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat, | 344 | sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat, |
335 | unsigned long count) | 345 | unsigned long size) |
336 | { | 346 | { |
337 | return NULL; | 347 | return alloc_bootmem_node_nopanic(pgdat, size); |
338 | } | 348 | } |
339 | 349 | ||
340 | static void __init check_usemap_section_nr(int nid, unsigned long *usemap) | 350 | static void __init check_usemap_section_nr(int nid, unsigned long *usemap) |
@@ -352,13 +362,10 @@ static void __init sparse_early_usemaps_alloc_node(unsigned long**usemap_map, | |||
352 | int size = usemap_size(); | 362 | int size = usemap_size(); |
353 | 363 | ||
354 | usemap = sparse_early_usemaps_alloc_pgdat_section(NODE_DATA(nodeid), | 364 | usemap = sparse_early_usemaps_alloc_pgdat_section(NODE_DATA(nodeid), |
355 | usemap_count); | 365 | size * usemap_count); |
356 | if (!usemap) { | 366 | if (!usemap) { |
357 | usemap = alloc_bootmem_node(NODE_DATA(nodeid), size * usemap_count); | 367 | printk(KERN_WARNING "%s: allocation failed\n", __func__); |
358 | if (!usemap) { | 368 | return; |
359 | printk(KERN_WARNING "%s: allocation failed\n", __func__); | ||
360 | return; | ||
361 | } | ||
362 | } | 369 | } |
363 | 370 | ||
364 | for (pnum = pnum_begin; pnum < pnum_end; pnum++) { | 371 | for (pnum = pnum_begin; pnum < pnum_end; pnum++) { |
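sparse_early_usemaps_alloc_pgdat_section() now builds its own placement constraint: the goal is the start of the section holding the pgdat, the limit is the end of that section, and a failed constrained allocation is retried with limit 0 so the usemap is still allocated somewhere on the node. A small user-space sketch of that retry pattern follows; the 128MB section size, the alloc_in_range() helper and its always-fail behaviour for constrained requests are invented for illustration.

#include <stdio.h>
#include <stdlib.h>

#define SECTION_SHIFT 27UL                /* assumed 128MB sections; illustrative only */
#define SECTION_SIZE  (1UL << SECTION_SHIFT)

/* Stand-in bootmem allocator: pretend the constrained window has no room,
 * so the caller's fallback path is exercised. */
static void *alloc_in_range(size_t size, unsigned long goal, unsigned long limit)
{
	(void)goal;
	if (limit)
		return NULL;
	return malloc(size);
}

/* Mirrors the new allocation flow: try inside the section that holds the
 * pgdat first, then retry with no upper limit. */
static void *alloc_usemap(unsigned long pgdat_pa, size_t size)
{
	unsigned long goal  = pgdat_pa & ~(SECTION_SIZE - 1);   /* section start */
	unsigned long limit = goal + SECTION_SIZE;              /* section end   */
	void *p;

again:
	p = alloc_in_range(size, goal, limit);
	if (!p && limit) {
		limit = 0;      /* constrained attempt failed: allow any address */
		goto again;
	}
	return p;
}

int main(void)
{
	void *p = alloc_usemap(0x48000123UL, 64);

	printf("usemap allocated: %s\n", p ? "yes (via fallback)" : "no");
	free(p);
	return 0;
}
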
diff --git a/mm/swap.c b/mm/swap.c --- a/mm/swap.c +++ b/mm/swap.c | |||
@@ -47,13 +47,15 @@ static DEFINE_PER_CPU(struct pagevec, lru_deactivate_pvecs); | |||
47 | static void __page_cache_release(struct page *page) | 47 | static void __page_cache_release(struct page *page) |
48 | { | 48 | { |
49 | if (PageLRU(page)) { | 49 | if (PageLRU(page)) { |
50 | unsigned long flags; | ||
51 | struct zone *zone = page_zone(page); | 50 | struct zone *zone = page_zone(page); |
51 | struct lruvec *lruvec; | ||
52 | unsigned long flags; | ||
52 | 53 | ||
53 | spin_lock_irqsave(&zone->lru_lock, flags); | 54 | spin_lock_irqsave(&zone->lru_lock, flags); |
55 | lruvec = mem_cgroup_page_lruvec(page, zone); | ||
54 | VM_BUG_ON(!PageLRU(page)); | 56 | VM_BUG_ON(!PageLRU(page)); |
55 | __ClearPageLRU(page); | 57 | __ClearPageLRU(page); |
56 | del_page_from_lru_list(zone, page, page_off_lru(page)); | 58 | del_page_from_lru_list(page, lruvec, page_off_lru(page)); |
57 | spin_unlock_irqrestore(&zone->lru_lock, flags); | 59 | spin_unlock_irqrestore(&zone->lru_lock, flags); |
58 | } | 60 | } |
59 | } | 61 | } |
@@ -82,6 +84,25 @@ static void put_compound_page(struct page *page) | |||
82 | if (likely(page != page_head && | 84 | if (likely(page != page_head && |
83 | get_page_unless_zero(page_head))) { | 85 | get_page_unless_zero(page_head))) { |
84 | unsigned long flags; | 86 | unsigned long flags; |
87 | |||
88 | /* | ||
89 | * THP can not break up slab pages so avoid taking | ||
90 | * compound_lock(). Slab performs non-atomic bit ops | ||
91 | * on page->flags for better performance. In particular | ||
92 | * slab_unlock() in slub used to be a hot path. It is | ||
93 | * still hot on arches that do not support | ||
94 | * this_cpu_cmpxchg_double(). | ||
95 | */ | ||
96 | if (PageSlab(page_head)) { | ||
97 | if (PageTail(page)) { | ||
98 | if (put_page_testzero(page_head)) | ||
99 | VM_BUG_ON(1); | ||
100 | |||
101 | atomic_dec(&page->_mapcount); | ||
102 | goto skip_lock_tail; | ||
103 | } else | ||
104 | goto skip_lock; | ||
105 | } | ||
85 | /* | 106 | /* |
86 | * page_head wasn't a dangling pointer but it | 107 | * page_head wasn't a dangling pointer but it |
87 | * may not be a head page anymore by the time | 108 | * may not be a head page anymore by the time |
@@ -92,10 +113,10 @@ static void put_compound_page(struct page *page) | |||
92 | if (unlikely(!PageTail(page))) { | 113 | if (unlikely(!PageTail(page))) { |
93 | /* __split_huge_page_refcount run before us */ | 114 | /* __split_huge_page_refcount run before us */ |
94 | compound_unlock_irqrestore(page_head, flags); | 115 | compound_unlock_irqrestore(page_head, flags); |
95 | VM_BUG_ON(PageHead(page_head)); | 116 | skip_lock: |
96 | if (put_page_testzero(page_head)) | 117 | if (put_page_testzero(page_head)) |
97 | __put_single_page(page_head); | 118 | __put_single_page(page_head); |
98 | out_put_single: | 119 | out_put_single: |
99 | if (put_page_testzero(page)) | 120 | if (put_page_testzero(page)) |
100 | __put_single_page(page); | 121 | __put_single_page(page); |
101 | return; | 122 | return; |
@@ -115,6 +136,8 @@ static void put_compound_page(struct page *page) | |||
115 | VM_BUG_ON(atomic_read(&page_head->_count) <= 0); | 136 | VM_BUG_ON(atomic_read(&page_head->_count) <= 0); |
116 | VM_BUG_ON(atomic_read(&page->_count) != 0); | 137 | VM_BUG_ON(atomic_read(&page->_count) != 0); |
117 | compound_unlock_irqrestore(page_head, flags); | 138 | compound_unlock_irqrestore(page_head, flags); |
139 | |||
140 | skip_lock_tail: | ||
118 | if (put_page_testzero(page_head)) { | 141 | if (put_page_testzero(page_head)) { |
119 | if (PageHead(page_head)) | 142 | if (PageHead(page_head)) |
120 | __put_compound_page(page_head); | 143 | __put_compound_page(page_head); |
@@ -162,6 +185,18 @@ bool __get_page_tail(struct page *page) | |||
162 | struct page *page_head = compound_trans_head(page); | 185 | struct page *page_head = compound_trans_head(page); |
163 | 186 | ||
164 | if (likely(page != page_head && get_page_unless_zero(page_head))) { | 187 | if (likely(page != page_head && get_page_unless_zero(page_head))) { |
188 | |||
189 | /* Ref to put_compound_page() comment. */ | ||
190 | if (PageSlab(page_head)) { | ||
191 | if (likely(PageTail(page))) { | ||
192 | __get_page_tail_foll(page, false); | ||
193 | return true; | ||
194 | } else { | ||
195 | put_page(page_head); | ||
196 | return false; | ||
197 | } | ||
198 | } | ||
199 | |||
165 | /* | 200 | /* |
166 | * page_head wasn't a dangling pointer but it | 201 | * page_head wasn't a dangling pointer but it |
167 | * may not be a head page anymore by the time | 202 | * may not be a head page anymore by the time |
@@ -202,11 +237,12 @@ void put_pages_list(struct list_head *pages) | |||
202 | EXPORT_SYMBOL(put_pages_list); | 237 | EXPORT_SYMBOL(put_pages_list); |
203 | 238 | ||
204 | static void pagevec_lru_move_fn(struct pagevec *pvec, | 239 | static void pagevec_lru_move_fn(struct pagevec *pvec, |
205 | void (*move_fn)(struct page *page, void *arg), | 240 | void (*move_fn)(struct page *page, struct lruvec *lruvec, void *arg), |
206 | void *arg) | 241 | void *arg) |
207 | { | 242 | { |
208 | int i; | 243 | int i; |
209 | struct zone *zone = NULL; | 244 | struct zone *zone = NULL; |
245 | struct lruvec *lruvec; | ||
210 | unsigned long flags = 0; | 246 | unsigned long flags = 0; |
211 | 247 | ||
212 | for (i = 0; i < pagevec_count(pvec); i++) { | 248 | for (i = 0; i < pagevec_count(pvec); i++) { |
@@ -220,7 +256,8 @@ static void pagevec_lru_move_fn(struct pagevec *pvec, | |||
220 | spin_lock_irqsave(&zone->lru_lock, flags); | 256 | spin_lock_irqsave(&zone->lru_lock, flags); |
221 | } | 257 | } |
222 | 258 | ||
223 | (*move_fn)(page, arg); | 259 | lruvec = mem_cgroup_page_lruvec(page, zone); |
260 | (*move_fn)(page, lruvec, arg); | ||
224 | } | 261 | } |
225 | if (zone) | 262 | if (zone) |
226 | spin_unlock_irqrestore(&zone->lru_lock, flags); | 263 | spin_unlock_irqrestore(&zone->lru_lock, flags); |
@@ -228,16 +265,13 @@ static void pagevec_lru_move_fn(struct pagevec *pvec, | |||
228 | pagevec_reinit(pvec); | 265 | pagevec_reinit(pvec); |
229 | } | 266 | } |
230 | 267 | ||
231 | static void pagevec_move_tail_fn(struct page *page, void *arg) | 268 | static void pagevec_move_tail_fn(struct page *page, struct lruvec *lruvec, |
269 | void *arg) | ||
232 | { | 270 | { |
233 | int *pgmoved = arg; | 271 | int *pgmoved = arg; |
234 | 272 | ||
235 | if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) { | 273 | if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) { |
236 | enum lru_list lru = page_lru_base_type(page); | 274 | enum lru_list lru = page_lru_base_type(page); |
237 | struct lruvec *lruvec; | ||
238 | |||
239 | lruvec = mem_cgroup_lru_move_lists(page_zone(page), | ||
240 | page, lru, lru); | ||
241 | list_move_tail(&page->lru, &lruvec->lists[lru]); | 275 | list_move_tail(&page->lru, &lruvec->lists[lru]); |
242 | (*pgmoved)++; | 276 | (*pgmoved)++; |
243 | } | 277 | } |
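pagevec_lru_move_fn() now looks up the lruvec once, while zone->lru_lock is held, and passes it to each move callback, so helpers like pagevec_move_tail_fn() no longer call back into memcg code themselves. A user-space model of that callback shape is sketched below; the page and lruvec structures, the two-zone array and the printf stand-ins for locking are all invented for the example.

#include <stdio.h>

struct page   { int zone; const char *name; };
struct lruvec { int zone; };                    /* stand-in: one lruvec per zone */

static struct lruvec lruvecs[2] = { { 0 }, { 1 } };

/* Resolve a page's lruvec; the kernel does this via mem_cgroup_page_lruvec(). */
static struct lruvec *page_lruvec(struct page *p)
{
	return &lruvecs[p->zone];
}

static void move_fn(struct page *p, struct lruvec *lruvec, void *arg)
{
	int *moved = arg;

	printf("page %s handled on lruvec of zone %d\n", p->name, lruvec->zone);
	(*moved)++;
}

/* Batched walk: "lock" each zone once per run of same-zone pages, resolve the
 * lruvec while the lock is held, and hand both page and lruvec to the callback. */
static void pagevec_walk(struct page *pages, int n,
			 void (*fn)(struct page *, struct lruvec *, void *),
			 void *arg)
{
	int locked_zone = -1;

	for (int i = 0; i < n; i++) {
		if (pages[i].zone != locked_zone) {
			if (locked_zone >= 0)
				printf("unlock zone %d\n", locked_zone);
			locked_zone = pages[i].zone;
			printf("lock zone %d\n", locked_zone);
		}
		fn(&pages[i], page_lruvec(&pages[i]), arg);
	}
	if (locked_zone >= 0)
		printf("unlock zone %d\n", locked_zone);
}

int main(void)
{
	struct page batch[] = { { 0, "A" }, { 0, "B" }, { 1, "C" } };
	int moved = 0;

	pagevec_walk(batch, 3, move_fn, &moved);
	printf("moved %d pages\n", moved);
	return 0;
}
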
@@ -276,41 +310,30 @@ void rotate_reclaimable_page(struct page *page) | |||
276 | } | 310 | } |
277 | } | 311 | } |
278 | 312 | ||
279 | static void update_page_reclaim_stat(struct zone *zone, struct page *page, | 313 | static void update_page_reclaim_stat(struct lruvec *lruvec, |
280 | int file, int rotated) | 314 | int file, int rotated) |
281 | { | 315 | { |
282 | struct zone_reclaim_stat *reclaim_stat = &zone->reclaim_stat; | 316 | struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat; |
283 | struct zone_reclaim_stat *memcg_reclaim_stat; | ||
284 | |||
285 | memcg_reclaim_stat = mem_cgroup_get_reclaim_stat_from_page(page); | ||
286 | 317 | ||
287 | reclaim_stat->recent_scanned[file]++; | 318 | reclaim_stat->recent_scanned[file]++; |
288 | if (rotated) | 319 | if (rotated) |
289 | reclaim_stat->recent_rotated[file]++; | 320 | reclaim_stat->recent_rotated[file]++; |
290 | |||
291 | if (!memcg_reclaim_stat) | ||
292 | return; | ||
293 | |||
294 | memcg_reclaim_stat->recent_scanned[file]++; | ||
295 | if (rotated) | ||
296 | memcg_reclaim_stat->recent_rotated[file]++; | ||
297 | } | 321 | } |
298 | 322 | ||
299 | static void __activate_page(struct page *page, void *arg) | 323 | static void __activate_page(struct page *page, struct lruvec *lruvec, |
324 | void *arg) | ||
300 | { | 325 | { |
301 | struct zone *zone = page_zone(page); | ||
302 | |||
303 | if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) { | 326 | if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) { |
304 | int file = page_is_file_cache(page); | 327 | int file = page_is_file_cache(page); |
305 | int lru = page_lru_base_type(page); | 328 | int lru = page_lru_base_type(page); |
306 | del_page_from_lru_list(zone, page, lru); | ||
307 | 329 | ||
330 | del_page_from_lru_list(page, lruvec, lru); | ||
308 | SetPageActive(page); | 331 | SetPageActive(page); |
309 | lru += LRU_ACTIVE; | 332 | lru += LRU_ACTIVE; |
310 | add_page_to_lru_list(zone, page, lru); | 333 | add_page_to_lru_list(page, lruvec, lru); |
311 | __count_vm_event(PGACTIVATE); | ||
312 | 334 | ||
313 | update_page_reclaim_stat(zone, page, file, 1); | 335 | __count_vm_event(PGACTIVATE); |
336 | update_page_reclaim_stat(lruvec, file, 1); | ||
314 | } | 337 | } |
315 | } | 338 | } |
316 | 339 | ||
@@ -347,7 +370,7 @@ void activate_page(struct page *page) | |||
347 | struct zone *zone = page_zone(page); | 370 | struct zone *zone = page_zone(page); |
348 | 371 | ||
349 | spin_lock_irq(&zone->lru_lock); | 372 | spin_lock_irq(&zone->lru_lock); |
350 | __activate_page(page, NULL); | 373 | __activate_page(page, mem_cgroup_page_lruvec(page, zone), NULL); |
351 | spin_unlock_irq(&zone->lru_lock); | 374 | spin_unlock_irq(&zone->lru_lock); |
352 | } | 375 | } |
353 | #endif | 376 | #endif |
@@ -414,11 +437,13 @@ void lru_cache_add_lru(struct page *page, enum lru_list lru) | |||
414 | void add_page_to_unevictable_list(struct page *page) | 437 | void add_page_to_unevictable_list(struct page *page) |
415 | { | 438 | { |
416 | struct zone *zone = page_zone(page); | 439 | struct zone *zone = page_zone(page); |
440 | struct lruvec *lruvec; | ||
417 | 441 | ||
418 | spin_lock_irq(&zone->lru_lock); | 442 | spin_lock_irq(&zone->lru_lock); |
443 | lruvec = mem_cgroup_page_lruvec(page, zone); | ||
419 | SetPageUnevictable(page); | 444 | SetPageUnevictable(page); |
420 | SetPageLRU(page); | 445 | SetPageLRU(page); |
421 | add_page_to_lru_list(zone, page, LRU_UNEVICTABLE); | 446 | add_page_to_lru_list(page, lruvec, LRU_UNEVICTABLE); |
422 | spin_unlock_irq(&zone->lru_lock); | 447 | spin_unlock_irq(&zone->lru_lock); |
423 | } | 448 | } |
424 | 449 | ||
@@ -443,11 +468,11 @@ void add_page_to_unevictable_list(struct page *page) | |||
443 | * be written out by flusher threads as this is much more effective | 468 | * be written out by flusher threads as this is much more effective |
444 | * than the single-page writeout from reclaim. | 469 | * than the single-page writeout from reclaim. |
445 | */ | 470 | */ |
446 | static void lru_deactivate_fn(struct page *page, void *arg) | 471 | static void lru_deactivate_fn(struct page *page, struct lruvec *lruvec, |
472 | void *arg) | ||
447 | { | 473 | { |
448 | int lru, file; | 474 | int lru, file; |
449 | bool active; | 475 | bool active; |
450 | struct zone *zone = page_zone(page); | ||
451 | 476 | ||
452 | if (!PageLRU(page)) | 477 | if (!PageLRU(page)) |
453 | return; | 478 | return; |
@@ -460,13 +485,13 @@ static void lru_deactivate_fn(struct page *page, void *arg) | |||
460 | return; | 485 | return; |
461 | 486 | ||
462 | active = PageActive(page); | 487 | active = PageActive(page); |
463 | |||
464 | file = page_is_file_cache(page); | 488 | file = page_is_file_cache(page); |
465 | lru = page_lru_base_type(page); | 489 | lru = page_lru_base_type(page); |
466 | del_page_from_lru_list(zone, page, lru + active); | 490 | |
491 | del_page_from_lru_list(page, lruvec, lru + active); | ||
467 | ClearPageActive(page); | 492 | ClearPageActive(page); |
468 | ClearPageReferenced(page); | 493 | ClearPageReferenced(page); |
469 | add_page_to_lru_list(zone, page, lru); | 494 | add_page_to_lru_list(page, lruvec, lru); |
470 | 495 | ||
471 | if (PageWriteback(page) || PageDirty(page)) { | 496 | if (PageWriteback(page) || PageDirty(page)) { |
472 | /* | 497 | /* |
@@ -476,19 +501,17 @@ static void lru_deactivate_fn(struct page *page, void *arg) | |||
476 | */ | 501 | */ |
477 | SetPageReclaim(page); | 502 | SetPageReclaim(page); |
478 | } else { | 503 | } else { |
479 | struct lruvec *lruvec; | ||
480 | /* | 504 | /* |
481 | * The page's writeback completed while it was on the pagevec. | 505 | * The page's writeback completed while it was on the pagevec. |
482 | * Move the page to the tail of the inactive list. | 506 | * Move the page to the tail of the inactive list. |
483 | */ | 507 | */ |
484 | lruvec = mem_cgroup_lru_move_lists(zone, page, lru, lru); | ||
485 | list_move_tail(&page->lru, &lruvec->lists[lru]); | 508 | list_move_tail(&page->lru, &lruvec->lists[lru]); |
486 | __count_vm_event(PGROTATED); | 509 | __count_vm_event(PGROTATED); |
487 | } | 510 | } |
488 | 511 | ||
489 | if (active) | 512 | if (active) |
490 | __count_vm_event(PGDEACTIVATE); | 513 | __count_vm_event(PGDEACTIVATE); |
491 | update_page_reclaim_stat(zone, page, file, 0); | 514 | update_page_reclaim_stat(lruvec, file, 0); |
492 | } | 515 | } |
493 | 516 | ||
494 | /* | 517 | /* |
@@ -588,6 +611,7 @@ void release_pages(struct page **pages, int nr, int cold) | |||
588 | int i; | 611 | int i; |
589 | LIST_HEAD(pages_to_free); | 612 | LIST_HEAD(pages_to_free); |
590 | struct zone *zone = NULL; | 613 | struct zone *zone = NULL; |
614 | struct lruvec *lruvec; | ||
591 | unsigned long uninitialized_var(flags); | 615 | unsigned long uninitialized_var(flags); |
592 | 616 | ||
593 | for (i = 0; i < nr; i++) { | 617 | for (i = 0; i < nr; i++) { |
@@ -615,9 +639,11 @@ void release_pages(struct page **pages, int nr, int cold) | |||
615 | zone = pagezone; | 639 | zone = pagezone; |
616 | spin_lock_irqsave(&zone->lru_lock, flags); | 640 | spin_lock_irqsave(&zone->lru_lock, flags); |
617 | } | 641 | } |
642 | |||
643 | lruvec = mem_cgroup_page_lruvec(page, zone); | ||
618 | VM_BUG_ON(!PageLRU(page)); | 644 | VM_BUG_ON(!PageLRU(page)); |
619 | __ClearPageLRU(page); | 645 | __ClearPageLRU(page); |
620 | del_page_from_lru_list(zone, page, page_off_lru(page)); | 646 | del_page_from_lru_list(page, lruvec, page_off_lru(page)); |
621 | } | 647 | } |
622 | 648 | ||
623 | list_add(&page->lru, &pages_to_free); | 649 | list_add(&page->lru, &pages_to_free); |
@@ -649,8 +675,8 @@ EXPORT_SYMBOL(__pagevec_release); | |||
649 | 675 | ||
650 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | 676 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE |
651 | /* used by __split_huge_page_refcount() */ | 677 | /* used by __split_huge_page_refcount() */ |
652 | void lru_add_page_tail(struct zone* zone, | 678 | void lru_add_page_tail(struct page *page, struct page *page_tail, |
653 | struct page *page, struct page *page_tail) | 679 | struct lruvec *lruvec) |
654 | { | 680 | { |
655 | int uninitialized_var(active); | 681 | int uninitialized_var(active); |
656 | enum lru_list lru; | 682 | enum lru_list lru; |
@@ -659,7 +685,8 @@ void lru_add_page_tail(struct zone* zone, | |||
659 | VM_BUG_ON(!PageHead(page)); | 685 | VM_BUG_ON(!PageHead(page)); |
660 | VM_BUG_ON(PageCompound(page_tail)); | 686 | VM_BUG_ON(PageCompound(page_tail)); |
661 | VM_BUG_ON(PageLRU(page_tail)); | 687 | VM_BUG_ON(PageLRU(page_tail)); |
662 | VM_BUG_ON(NR_CPUS != 1 && !spin_is_locked(&zone->lru_lock)); | 688 | VM_BUG_ON(NR_CPUS != 1 && |
689 | !spin_is_locked(&lruvec_zone(lruvec)->lru_lock)); | ||
663 | 690 | ||
664 | SetPageLRU(page_tail); | 691 | SetPageLRU(page_tail); |
665 | 692 | ||
@@ -688,20 +715,20 @@ void lru_add_page_tail(struct zone* zone, | |||
688 | * Use the standard add function to put page_tail on the list, | 715 | * Use the standard add function to put page_tail on the list, |
689 | * but then correct its position so they all end up in order. | 716 | * but then correct its position so they all end up in order. |
690 | */ | 717 | */ |
691 | add_page_to_lru_list(zone, page_tail, lru); | 718 | add_page_to_lru_list(page_tail, lruvec, lru); |
692 | list_head = page_tail->lru.prev; | 719 | list_head = page_tail->lru.prev; |
693 | list_move_tail(&page_tail->lru, list_head); | 720 | list_move_tail(&page_tail->lru, list_head); |
694 | } | 721 | } |
695 | 722 | ||
696 | if (!PageUnevictable(page)) | 723 | if (!PageUnevictable(page)) |
697 | update_page_reclaim_stat(zone, page_tail, file, active); | 724 | update_page_reclaim_stat(lruvec, file, active); |
698 | } | 725 | } |
699 | #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ | 726 | #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ |
700 | 727 | ||
701 | static void __pagevec_lru_add_fn(struct page *page, void *arg) | 728 | static void __pagevec_lru_add_fn(struct page *page, struct lruvec *lruvec, |
729 | void *arg) | ||
702 | { | 730 | { |
703 | enum lru_list lru = (enum lru_list)arg; | 731 | enum lru_list lru = (enum lru_list)arg; |
704 | struct zone *zone = page_zone(page); | ||
705 | int file = is_file_lru(lru); | 732 | int file = is_file_lru(lru); |
706 | int active = is_active_lru(lru); | 733 | int active = is_active_lru(lru); |
707 | 734 | ||
@@ -712,8 +739,8 @@ static void __pagevec_lru_add_fn(struct page *page, void *arg) | |||
712 | SetPageLRU(page); | 739 | SetPageLRU(page); |
713 | if (active) | 740 | if (active) |
714 | SetPageActive(page); | 741 | SetPageActive(page); |
715 | add_page_to_lru_list(zone, page, lru); | 742 | add_page_to_lru_list(page, lruvec, lru); |
716 | update_page_reclaim_stat(zone, page, file, active); | 743 | update_page_reclaim_stat(lruvec, file, active); |
717 | } | 744 | } |
718 | 745 | ||
719 | /* | 746 | /* |
diff --git a/mm/swapfile.c b/mm/swapfile.c index fafc26d1b1dc..71373d03fcee 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c | |||
@@ -31,6 +31,8 @@ | |||
31 | #include <linux/memcontrol.h> | 31 | #include <linux/memcontrol.h> |
32 | #include <linux/poll.h> | 32 | #include <linux/poll.h> |
33 | #include <linux/oom.h> | 33 | #include <linux/oom.h> |
34 | #include <linux/frontswap.h> | ||
35 | #include <linux/swapfile.h> | ||
34 | 36 | ||
35 | #include <asm/pgtable.h> | 37 | #include <asm/pgtable.h> |
36 | #include <asm/tlbflush.h> | 38 | #include <asm/tlbflush.h> |
@@ -42,7 +44,7 @@ static bool swap_count_continued(struct swap_info_struct *, pgoff_t, | |||
42 | static void free_swap_count_continuations(struct swap_info_struct *); | 44 | static void free_swap_count_continuations(struct swap_info_struct *); |
43 | static sector_t map_swap_entry(swp_entry_t, struct block_device**); | 45 | static sector_t map_swap_entry(swp_entry_t, struct block_device**); |
44 | 46 | ||
45 | static DEFINE_SPINLOCK(swap_lock); | 47 | DEFINE_SPINLOCK(swap_lock); |
46 | static unsigned int nr_swapfiles; | 48 | static unsigned int nr_swapfiles; |
47 | long nr_swap_pages; | 49 | long nr_swap_pages; |
48 | long total_swap_pages; | 50 | long total_swap_pages; |
@@ -53,9 +55,9 @@ static const char Unused_file[] = "Unused swap file entry "; | |||
53 | static const char Bad_offset[] = "Bad swap offset entry "; | 55 | static const char Bad_offset[] = "Bad swap offset entry "; |
54 | static const char Unused_offset[] = "Unused swap offset entry "; | 56 | static const char Unused_offset[] = "Unused swap offset entry "; |
55 | 57 | ||
56 | static struct swap_list_t swap_list = {-1, -1}; | 58 | struct swap_list_t swap_list = {-1, -1}; |
57 | 59 | ||
58 | static struct swap_info_struct *swap_info[MAX_SWAPFILES]; | 60 | struct swap_info_struct *swap_info[MAX_SWAPFILES]; |
59 | 61 | ||
60 | static DEFINE_MUTEX(swapon_mutex); | 62 | static DEFINE_MUTEX(swapon_mutex); |
61 | 63 | ||
@@ -556,6 +558,7 @@ static unsigned char swap_entry_free(struct swap_info_struct *p, | |||
556 | swap_list.next = p->type; | 558 | swap_list.next = p->type; |
557 | nr_swap_pages++; | 559 | nr_swap_pages++; |
558 | p->inuse_pages--; | 560 | p->inuse_pages--; |
561 | frontswap_invalidate_page(p->type, offset); | ||
559 | if ((p->flags & SWP_BLKDEV) && | 562 | if ((p->flags & SWP_BLKDEV) && |
560 | disk->fops->swap_slot_free_notify) | 563 | disk->fops->swap_slot_free_notify) |
561 | disk->fops->swap_slot_free_notify(p->bdev, offset); | 564 | disk->fops->swap_slot_free_notify(p->bdev, offset); |
@@ -601,7 +604,7 @@ void swapcache_free(swp_entry_t entry, struct page *page) | |||
601 | * This does not give an exact answer when swap count is continued, | 604 | * This does not give an exact answer when swap count is continued, |
602 | * but does include the high COUNT_CONTINUED flag to allow for that. | 605 | * but does include the high COUNT_CONTINUED flag to allow for that. |
603 | */ | 606 | */ |
604 | static inline int page_swapcount(struct page *page) | 607 | int page_swapcount(struct page *page) |
605 | { | 608 | { |
606 | int count = 0; | 609 | int count = 0; |
607 | struct swap_info_struct *p; | 610 | struct swap_info_struct *p; |
@@ -717,37 +720,6 @@ int free_swap_and_cache(swp_entry_t entry) | |||
717 | return p != NULL; | 720 | return p != NULL; |
718 | } | 721 | } |
719 | 722 | ||
720 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR | ||
721 | /** | ||
722 | * mem_cgroup_count_swap_user - count the user of a swap entry | ||
723 | * @ent: the swap entry to be checked | ||
724 | * @pagep: the pointer for the swap cache page of the entry to be stored | ||
725 | * | ||
726 | * Returns the number of the user of the swap entry. The number is valid only | ||
727 | * for swaps of anonymous pages. | ||
728 | * If the entry is found on swap cache, the page is stored to pagep with | ||
729 | * refcount of it being incremented. | ||
730 | */ | ||
731 | int mem_cgroup_count_swap_user(swp_entry_t ent, struct page **pagep) | ||
732 | { | ||
733 | struct page *page; | ||
734 | struct swap_info_struct *p; | ||
735 | int count = 0; | ||
736 | |||
737 | page = find_get_page(&swapper_space, ent.val); | ||
738 | if (page) | ||
739 | count += page_mapcount(page); | ||
740 | p = swap_info_get(ent); | ||
741 | if (p) { | ||
742 | count += swap_count(p->swap_map[swp_offset(ent)]); | ||
743 | spin_unlock(&swap_lock); | ||
744 | } | ||
745 | |||
746 | *pagep = page; | ||
747 | return count; | ||
748 | } | ||
749 | #endif | ||
750 | |||
751 | #ifdef CONFIG_HIBERNATION | 723 | #ifdef CONFIG_HIBERNATION |
752 | /* | 724 | /* |
753 | * Find the swap type that corresponds to given device (if any). | 725 | * Find the swap type that corresponds to given device (if any). |
@@ -1016,11 +988,12 @@ static int unuse_mm(struct mm_struct *mm, | |||
1016 | } | 988 | } |
1017 | 989 | ||
1018 | /* | 990 | /* |
1019 | * Scan swap_map from current position to next entry still in use. | 991 | * Scan swap_map (or frontswap_map if frontswap parameter is true) |
992 | * from current position to next entry still in use. | ||
1020 | * Recycle to start on reaching the end, returning 0 when empty. | 993 | * Recycle to start on reaching the end, returning 0 when empty. |
1021 | */ | 994 | */ |
1022 | static unsigned int find_next_to_unuse(struct swap_info_struct *si, | 995 | static unsigned int find_next_to_unuse(struct swap_info_struct *si, |
1023 | unsigned int prev) | 996 | unsigned int prev, bool frontswap) |
1024 | { | 997 | { |
1025 | unsigned int max = si->max; | 998 | unsigned int max = si->max; |
1026 | unsigned int i = prev; | 999 | unsigned int i = prev; |
@@ -1046,6 +1019,12 @@ static unsigned int find_next_to_unuse(struct swap_info_struct *si, | |||
1046 | prev = 0; | 1019 | prev = 0; |
1047 | i = 1; | 1020 | i = 1; |
1048 | } | 1021 | } |
1022 | if (frontswap) { | ||
1023 | if (frontswap_test(si, i)) | ||
1024 | break; | ||
1025 | else | ||
1026 | continue; | ||
1027 | } | ||
1049 | count = si->swap_map[i]; | 1028 | count = si->swap_map[i]; |
1050 | if (count && swap_count(count) != SWAP_MAP_BAD) | 1029 | if (count && swap_count(count) != SWAP_MAP_BAD) |
1051 | break; | 1030 | break; |
@@ -1057,8 +1036,12 @@ static unsigned int find_next_to_unuse(struct swap_info_struct *si, | |||
1057 | * We completely avoid races by reading each swap page in advance, | 1036 | * We completely avoid races by reading each swap page in advance, |
1058 | * and then search for the process using it. All the necessary | 1037 | * and then search for the process using it. All the necessary |
1059 | * page table adjustments can then be made atomically. | 1038 | * page table adjustments can then be made atomically. |
1039 | * | ||
1040 | * if the boolean frontswap is true, only unuse pages_to_unuse pages; | ||
1041 | * pages_to_unuse==0 means all pages; ignored if frontswap is false | ||
1060 | */ | 1042 | */ |
1061 | static int try_to_unuse(unsigned int type) | 1043 | int try_to_unuse(unsigned int type, bool frontswap, |
1044 | unsigned long pages_to_unuse) | ||
1062 | { | 1045 | { |
1063 | struct swap_info_struct *si = swap_info[type]; | 1046 | struct swap_info_struct *si = swap_info[type]; |
1064 | struct mm_struct *start_mm; | 1047 | struct mm_struct *start_mm; |
@@ -1091,7 +1074,7 @@ static int try_to_unuse(unsigned int type) | |||
1091 | * one pass through swap_map is enough, but not necessarily: | 1074 | * one pass through swap_map is enough, but not necessarily: |
1092 | * there are races when an instance of an entry might be missed. | 1075 | * there are races when an instance of an entry might be missed. |
1093 | */ | 1076 | */ |
1094 | while ((i = find_next_to_unuse(si, i)) != 0) { | 1077 | while ((i = find_next_to_unuse(si, i, frontswap)) != 0) { |
1095 | if (signal_pending(current)) { | 1078 | if (signal_pending(current)) { |
1096 | retval = -EINTR; | 1079 | retval = -EINTR; |
1097 | break; | 1080 | break; |
@@ -1258,6 +1241,10 @@ static int try_to_unuse(unsigned int type) | |||
1258 | * interactive performance. | 1241 | * interactive performance. |
1259 | */ | 1242 | */ |
1260 | cond_resched(); | 1243 | cond_resched(); |
1244 | if (frontswap && pages_to_unuse > 0) { | ||
1245 | if (!--pages_to_unuse) | ||
1246 | break; | ||
1247 | } | ||
1261 | } | 1248 | } |
1262 | 1249 | ||
1263 | mmput(start_mm); | 1250 | mmput(start_mm); |
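try_to_unuse() now doubles as frontswap's shrink path: with the frontswap flag set, find_next_to_unuse() only returns slots whose bit is set in the frontswap map, and the caller stops after pages_to_unuse pages (0 meaning all). The user-space model below keeps just that scan-and-limit logic; the sample maps are made up, and the real helper additionally wraps around to the start and skips SWAP_MAP_BAD slots.

#include <stdbool.h>
#include <stdio.h>

#define NSLOTS 16

/* Invented sample data: nonzero swap_map = slot in use,
 * frontswap_map bit set = that slot currently lives in frontswap. */
static const int  swap_map[NSLOTS]      = { 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0 };
static const bool frontswap_map[NSLOTS] = { 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0 };

/* Next slot to unuse after prev; with frontswap, only slots in the frontswap map. */
static int find_next_to_unuse(int prev, bool frontswap)
{
	for (int i = prev + 1; i < NSLOTS; i++) {
		if (frontswap) {
			if (frontswap_map[i])
				return i;
			continue;
		}
		if (swap_map[i])
			return i;
	}
	return 0;                       /* 0 means "nothing left" */
}

static void try_to_unuse(bool frontswap, unsigned long pages_to_unuse)
{
	int i = 0;

	while ((i = find_next_to_unuse(i, frontswap)) != 0) {
		printf("unusing slot %d\n", i);
		if (frontswap && pages_to_unuse > 0 && !--pages_to_unuse)
			break;          /* partial unuse: stop after the requested count */
	}
}

int main(void)
{
	try_to_unuse(true, 3);          /* e.g. shrink frontswap by three pages */
	return 0;
}
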
@@ -1517,7 +1504,8 @@ bad_bmap: | |||
1517 | } | 1504 | } |
1518 | 1505 | ||
1519 | static void enable_swap_info(struct swap_info_struct *p, int prio, | 1506 | static void enable_swap_info(struct swap_info_struct *p, int prio, |
1520 | unsigned char *swap_map) | 1507 | unsigned char *swap_map, |
1508 | unsigned long *frontswap_map) | ||
1521 | { | 1509 | { |
1522 | int i, prev; | 1510 | int i, prev; |
1523 | 1511 | ||
@@ -1527,6 +1515,7 @@ static void enable_swap_info(struct swap_info_struct *p, int prio, | |||
1527 | else | 1515 | else |
1528 | p->prio = --least_priority; | 1516 | p->prio = --least_priority; |
1529 | p->swap_map = swap_map; | 1517 | p->swap_map = swap_map; |
1518 | frontswap_map_set(p, frontswap_map); | ||
1530 | p->flags |= SWP_WRITEOK; | 1519 | p->flags |= SWP_WRITEOK; |
1531 | nr_swap_pages += p->pages; | 1520 | nr_swap_pages += p->pages; |
1532 | total_swap_pages += p->pages; | 1521 | total_swap_pages += p->pages; |
@@ -1543,6 +1532,7 @@ static void enable_swap_info(struct swap_info_struct *p, int prio, | |||
1543 | swap_list.head = swap_list.next = p->type; | 1532 | swap_list.head = swap_list.next = p->type; |
1544 | else | 1533 | else |
1545 | swap_info[prev]->next = p->type; | 1534 | swap_info[prev]->next = p->type; |
1535 | frontswap_init(p->type); | ||
1546 | spin_unlock(&swap_lock); | 1536 | spin_unlock(&swap_lock); |
1547 | } | 1537 | } |
1548 | 1538 | ||
@@ -1616,7 +1606,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) | |||
1616 | spin_unlock(&swap_lock); | 1606 | spin_unlock(&swap_lock); |
1617 | 1607 | ||
1618 | oom_score_adj = test_set_oom_score_adj(OOM_SCORE_ADJ_MAX); | 1608 | oom_score_adj = test_set_oom_score_adj(OOM_SCORE_ADJ_MAX); |
1619 | err = try_to_unuse(type); | 1609 | err = try_to_unuse(type, false, 0); /* force all pages to be unused */ |
1620 | compare_swap_oom_score_adj(OOM_SCORE_ADJ_MAX, oom_score_adj); | 1610 | compare_swap_oom_score_adj(OOM_SCORE_ADJ_MAX, oom_score_adj); |
1621 | 1611 | ||
1622 | if (err) { | 1612 | if (err) { |
@@ -1627,7 +1617,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) | |||
1627 | * sys_swapoff for this swap_info_struct at this point. | 1617 | * sys_swapoff for this swap_info_struct at this point. |
1628 | */ | 1618 | */ |
1629 | /* re-insert swap space back into swap_list */ | 1619 | /* re-insert swap space back into swap_list */ |
1630 | enable_swap_info(p, p->prio, p->swap_map); | 1620 | enable_swap_info(p, p->prio, p->swap_map, frontswap_map_get(p)); |
1631 | goto out_dput; | 1621 | goto out_dput; |
1632 | } | 1622 | } |
1633 | 1623 | ||
@@ -1653,9 +1643,11 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) | |||
1653 | swap_map = p->swap_map; | 1643 | swap_map = p->swap_map; |
1654 | p->swap_map = NULL; | 1644 | p->swap_map = NULL; |
1655 | p->flags = 0; | 1645 | p->flags = 0; |
1646 | frontswap_invalidate_area(type); | ||
1656 | spin_unlock(&swap_lock); | 1647 | spin_unlock(&swap_lock); |
1657 | mutex_unlock(&swapon_mutex); | 1648 | mutex_unlock(&swapon_mutex); |
1658 | vfree(swap_map); | 1649 | vfree(swap_map); |
1650 | vfree(frontswap_map_get(p)); | ||
1659 | /* Destroy swap account information */ | 1651 | /* Destroy swap account information */ |
1660 | swap_cgroup_swapoff(type); | 1652 | swap_cgroup_swapoff(type); |
1661 | 1653 | ||
@@ -1924,24 +1916,20 @@ static unsigned long read_swap_header(struct swap_info_struct *p, | |||
1924 | 1916 | ||
1925 | /* | 1917 | /* |
1926 | * Find out how many pages are allowed for a single swap | 1918 | * Find out how many pages are allowed for a single swap |
1927 | * device. There are three limiting factors: 1) the number | 1919 | * device. There are two limiting factors: 1) the number |
1928 | * of bits for the swap offset in the swp_entry_t type, and | 1920 | * of bits for the swap offset in the swp_entry_t type, and |
1929 | * 2) the number of bits in the swap pte as defined by the | 1921 | * 2) the number of bits in the swap pte as defined by the |
1930 | * the different architectures, and 3) the number of free bits | 1922 | * different architectures. In order to find the |
1931 | * in an exceptional radix_tree entry. In order to find the | ||
1932 | * largest possible bit mask, a swap entry with swap type 0 | 1923 | * largest possible bit mask, a swap entry with swap type 0 |
1933 | * and swap offset ~0UL is created, encoded to a swap pte, | 1924 | * and swap offset ~0UL is created, encoded to a swap pte, |
1934 | * decoded to a swp_entry_t again, and finally the swap | 1925 | * decoded to a swp_entry_t again, and finally the swap |
1935 | * offset is extracted. This will mask all the bits from | 1926 | * offset is extracted. This will mask all the bits from |
1936 | * the initial ~0UL mask that can't be encoded in either | 1927 | * the initial ~0UL mask that can't be encoded in either |
1937 | * the swp_entry_t or the architecture definition of a | 1928 | * the swp_entry_t or the architecture definition of a |
1938 | * swap pte. Then the same is done for a radix_tree entry. | 1929 | * swap pte. |
1939 | */ | 1930 | */ |
1940 | maxpages = swp_offset(pte_to_swp_entry( | 1931 | maxpages = swp_offset(pte_to_swp_entry( |
1941 | swp_entry_to_pte(swp_entry(0, ~0UL)))); | 1932 | swp_entry_to_pte(swp_entry(0, ~0UL)))) + 1; |
1942 | maxpages = swp_offset(radix_to_swp_entry( | ||
1943 | swp_to_radix_entry(swp_entry(0, maxpages)))) + 1; | ||
1944 | |||
1945 | if (maxpages > swap_header->info.last_page) { | 1933 | if (maxpages > swap_header->info.last_page) { |
1946 | maxpages = swap_header->info.last_page + 1; | 1934 | maxpages = swap_header->info.last_page + 1; |
1947 | /* p->max is an unsigned int: don't overflow it */ | 1935 | /* p->max is an unsigned int: don't overflow it */ |
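With the radix-tree clamp gone, the per-device limit comes from a single round trip: an entry with offset ~0UL is packed into a swap pte and unpacked again, so any offset bit the pte format cannot carry is dropped, and the surviving offset plus one is maxpages. Modelling the encode/decode as plain bit masking makes that concrete; the 5-bit type / 50-bit offset split below is an invented layout (and assumes a 64-bit unsigned long), not any architecture's real swap pte.

#include <stdio.h>

/* Invented swap-pte layout: low 5 bits carry the type, the next 50 the offset. */
#define SWP_TYPE_BITS   5
#define SWP_OFFSET_BITS 50
#define SWP_OFFSET_MASK ((1UL << SWP_OFFSET_BITS) - 1)

static unsigned long swp_entry_to_pte(unsigned int type, unsigned long offset)
{
	/* Offset bits that do not fit in the pte are silently dropped here. */
	return (unsigned long)type | ((offset & SWP_OFFSET_MASK) << SWP_TYPE_BITS);
}

static unsigned long pte_to_swp_offset(unsigned long pte)
{
	return (pte >> SWP_TYPE_BITS) & SWP_OFFSET_MASK;
}

int main(void)
{
	/* Same round trip as read_swap_header(): offset ~0UL in, masked offset out. */
	unsigned long maxpages = pte_to_swp_offset(swp_entry_to_pte(0, ~0UL)) + 1;

	printf("maxpages = %lu (2^%d pages)\n", maxpages, SWP_OFFSET_BITS);
	return 0;
}
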
@@ -2019,6 +2007,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) | |||
2019 | sector_t span; | 2007 | sector_t span; |
2020 | unsigned long maxpages; | 2008 | unsigned long maxpages; |
2021 | unsigned char *swap_map = NULL; | 2009 | unsigned char *swap_map = NULL; |
2010 | unsigned long *frontswap_map = NULL; | ||
2022 | struct page *page = NULL; | 2011 | struct page *page = NULL; |
2023 | struct inode *inode = NULL; | 2012 | struct inode *inode = NULL; |
2024 | 2013 | ||
@@ -2102,6 +2091,9 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) | |||
2102 | error = nr_extents; | 2091 | error = nr_extents; |
2103 | goto bad_swap; | 2092 | goto bad_swap; |
2104 | } | 2093 | } |
2094 | /* frontswap enabled? set up bit-per-page map for frontswap */ | ||
2095 | if (frontswap_enabled) | ||
2096 | frontswap_map = vzalloc(maxpages / sizeof(long)); | ||
2105 | 2097 | ||
2106 | if (p->bdev) { | 2098 | if (p->bdev) { |
2107 | if (blk_queue_nonrot(bdev_get_queue(p->bdev))) { | 2099 | if (blk_queue_nonrot(bdev_get_queue(p->bdev))) { |
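The frontswap map is sized in bytes as maxpages / sizeof(long); since a byte holds 8 bits, that is exactly maxpages bits where sizeof(long) is 8 and 2 * maxpages bits where it is 4, so the bit-per-page map is always large enough. The snippet below only spells out that arithmetic for an example device size.

#include <stdio.h>

int main(void)
{
	unsigned long maxpages  = 1UL << 20;                 /* example: 4GB of 4KB swap pages */
	unsigned long map_bytes = maxpages / sizeof(long);   /* what the swapon path vzalloc()s */
	unsigned long map_bits  = map_bytes * 8;

	printf("maxpages=%lu map_bytes=%lu map_bits=%lu (covers maxpages: %s)\n",
	       maxpages, map_bytes, map_bits,
	       map_bits >= maxpages ? "yes" : "no");
	return 0;
}
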
@@ -2117,14 +2109,15 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) | |||
2117 | if (swap_flags & SWAP_FLAG_PREFER) | 2109 | if (swap_flags & SWAP_FLAG_PREFER) |
2118 | prio = | 2110 | prio = |
2119 | (swap_flags & SWAP_FLAG_PRIO_MASK) >> SWAP_FLAG_PRIO_SHIFT; | 2111 | (swap_flags & SWAP_FLAG_PRIO_MASK) >> SWAP_FLAG_PRIO_SHIFT; |
2120 | enable_swap_info(p, prio, swap_map); | 2112 | enable_swap_info(p, prio, swap_map, frontswap_map); |
2121 | 2113 | ||
2122 | printk(KERN_INFO "Adding %uk swap on %s. " | 2114 | printk(KERN_INFO "Adding %uk swap on %s. " |
2123 | "Priority:%d extents:%d across:%lluk %s%s\n", | 2115 | "Priority:%d extents:%d across:%lluk %s%s%s\n", |
2124 | p->pages<<(PAGE_SHIFT-10), name, p->prio, | 2116 | p->pages<<(PAGE_SHIFT-10), name, p->prio, |
2125 | nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10), | 2117 | nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10), |
2126 | (p->flags & SWP_SOLIDSTATE) ? "SS" : "", | 2118 | (p->flags & SWP_SOLIDSTATE) ? "SS" : "", |
2127 | (p->flags & SWP_DISCARDABLE) ? "D" : ""); | 2119 | (p->flags & SWP_DISCARDABLE) ? "D" : "", |
2120 | (frontswap_map) ? "FS" : ""); | ||
2128 | 2121 | ||
2129 | mutex_unlock(&swapon_mutex); | 2122 | mutex_unlock(&swapon_mutex); |
2130 | atomic_inc(&proc_poll_event); | 2123 | atomic_inc(&proc_poll_event); |
diff --git a/mm/thrash.c b/mm/thrash.c deleted file mode 100644 index 57ad495dbd54..000000000000 --- a/mm/thrash.c +++ /dev/null | |||
@@ -1,155 +0,0 @@ | |||
1 | /* | ||
2 | * mm/thrash.c | ||
3 | * | ||
4 | * Copyright (C) 2004, Red Hat, Inc. | ||
5 | * Copyright (C) 2004, Rik van Riel <riel@redhat.com> | ||
6 | * Released under the GPL, see the file COPYING for details. | ||
7 | * | ||
8 | * Simple token based thrashing protection, using the algorithm | ||
9 | * described in: http://www.cse.ohio-state.edu/hpcs/WWW/HTML/publications/abs05-1.html | ||
10 | * | ||
11 | * Sep 2006, Ashwin Chaugule <ashwin.chaugule@celunite.com> | ||
12 | * Improved algorithm to pass token: | ||
13 | * Each task has a priority which is incremented if it contended | ||
14 | * for the token in an interval less than its previous attempt. | ||
15 | * If the token is acquired, that task's priority is boosted to prevent | ||
16 | * the token from bouncing around too often and to let the task make | ||
17 | * some progress in its execution. | ||
18 | */ | ||
19 | |||
20 | #include <linux/jiffies.h> | ||
21 | #include <linux/mm.h> | ||
22 | #include <linux/sched.h> | ||
23 | #include <linux/swap.h> | ||
24 | #include <linux/memcontrol.h> | ||
25 | |||
26 | #include <trace/events/vmscan.h> | ||
27 | |||
28 | #define TOKEN_AGING_INTERVAL (0xFF) | ||
29 | |||
30 | static DEFINE_SPINLOCK(swap_token_lock); | ||
31 | struct mm_struct *swap_token_mm; | ||
32 | static struct mem_cgroup *swap_token_memcg; | ||
33 | |||
34 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR | ||
35 | static struct mem_cgroup *swap_token_memcg_from_mm(struct mm_struct *mm) | ||
36 | { | ||
37 | struct mem_cgroup *memcg; | ||
38 | |||
39 | memcg = try_get_mem_cgroup_from_mm(mm); | ||
40 | if (memcg) | ||
41 | css_put(mem_cgroup_css(memcg)); | ||
42 | |||
43 | return memcg; | ||
44 | } | ||
45 | #else | ||
46 | static struct mem_cgroup *swap_token_memcg_from_mm(struct mm_struct *mm) | ||
47 | { | ||
48 | return NULL; | ||
49 | } | ||
50 | #endif | ||
51 | |||
52 | void grab_swap_token(struct mm_struct *mm) | ||
53 | { | ||
54 | int current_interval; | ||
55 | unsigned int old_prio = mm->token_priority; | ||
56 | static unsigned int global_faults; | ||
57 | static unsigned int last_aging; | ||
58 | |||
59 | global_faults++; | ||
60 | |||
61 | current_interval = global_faults - mm->faultstamp; | ||
62 | |||
63 | if (!spin_trylock(&swap_token_lock)) | ||
64 | return; | ||
65 | |||
66 | /* First come first served */ | ||
67 | if (!swap_token_mm) | ||
68 | goto replace_token; | ||
69 | |||
70 | /* | ||
71 | * Usually, we don't need priority aging because long interval faults | ||
72 | * makes priority decrease quickly. But there is one exception. If the | ||
73 | * token owner task is sleeping, it never make long interval faults. | ||
74 | * Thus, we need a priority aging mechanism instead. The requirements | ||
75 | * of priority aging are | ||
76 | * 1) An aging interval is reasonable enough long. Too short aging | ||
77 | * interval makes quick swap token lost and decrease performance. | ||
78 | * 2) The swap token owner task have to get priority aging even if | ||
79 | * it's under sleep. | ||
80 | */ | ||
81 | if ((global_faults - last_aging) > TOKEN_AGING_INTERVAL) { | ||
82 | swap_token_mm->token_priority /= 2; | ||
83 | last_aging = global_faults; | ||
84 | } | ||
85 | |||
86 | if (mm == swap_token_mm) { | ||
87 | mm->token_priority += 2; | ||
88 | goto update_priority; | ||
89 | } | ||
90 | |||
91 | if (current_interval < mm->last_interval) | ||
92 | mm->token_priority++; | ||
93 | else { | ||
94 | if (likely(mm->token_priority > 0)) | ||
95 | mm->token_priority--; | ||
96 | } | ||
97 | |||
98 | /* Check if we deserve the token */ | ||
99 | if (mm->token_priority > swap_token_mm->token_priority) | ||
100 | goto replace_token; | ||
101 | |||
102 | update_priority: | ||
103 | trace_update_swap_token_priority(mm, old_prio, swap_token_mm); | ||
104 | |||
105 | out: | ||
106 | mm->faultstamp = global_faults; | ||
107 | mm->last_interval = current_interval; | ||
108 | spin_unlock(&swap_token_lock); | ||
109 | return; | ||
110 | |||
111 | replace_token: | ||
112 | mm->token_priority += 2; | ||
113 | trace_replace_swap_token(swap_token_mm, mm); | ||
114 | swap_token_mm = mm; | ||
115 | swap_token_memcg = swap_token_memcg_from_mm(mm); | ||
116 | last_aging = global_faults; | ||
117 | goto out; | ||
118 | } | ||
119 | |||
120 | /* Called on process exit. */ | ||
121 | void __put_swap_token(struct mm_struct *mm) | ||
122 | { | ||
123 | spin_lock(&swap_token_lock); | ||
124 | if (likely(mm == swap_token_mm)) { | ||
125 | trace_put_swap_token(swap_token_mm); | ||
126 | swap_token_mm = NULL; | ||
127 | swap_token_memcg = NULL; | ||
128 | } | ||
129 | spin_unlock(&swap_token_lock); | ||
130 | } | ||
131 | |||
132 | static bool match_memcg(struct mem_cgroup *a, struct mem_cgroup *b) | ||
133 | { | ||
134 | if (!a) | ||
135 | return true; | ||
136 | if (!b) | ||
137 | return true; | ||
138 | if (a == b) | ||
139 | return true; | ||
140 | return false; | ||
141 | } | ||
142 | |||
143 | void disable_swap_token(struct mem_cgroup *memcg) | ||
144 | { | ||
145 | /* memcg reclaim don't disable unrelated mm token. */ | ||
146 | if (match_memcg(memcg, swap_token_memcg)) { | ||
147 | spin_lock(&swap_token_lock); | ||
148 | if (match_memcg(memcg, swap_token_memcg)) { | ||
149 | trace_disable_swap_token(swap_token_mm); | ||
150 | swap_token_mm = NULL; | ||
151 | swap_token_memcg = NULL; | ||
152 | } | ||
153 | spin_unlock(&swap_token_lock); | ||
154 | } | ||
155 | } | ||
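The deleted mm/thrash.c implemented the heuristic its header describes: every mm tracks a token priority that rises when it faults again after a shorter interval than last time, the current holder is boosted on each of its own faults and aged periodically so a sleeping holder eventually loses ground, and a contender takes the token once its priority exceeds the holder's. The user-space toy below reproduces only that bookkeeping; the struct mm here and the demo in main() are illustrative, not kernel code.

#include <stdio.h>

#define TOKEN_AGING_INTERVAL 0xFF

struct mm {
	unsigned int prio, faultstamp, last_interval;
	const char *name;
};

static struct mm *token_holder;
static unsigned int global_faults, last_aging;

static void grab_swap_token(struct mm *mm)
{
	unsigned int interval = ++global_faults - mm->faultstamp;

	if (!token_holder)
		goto take;                              /* first come, first served */

	/* Age a holder that stopped faulting so it cannot keep the token forever. */
	if (global_faults - last_aging > TOKEN_AGING_INTERVAL) {
		token_holder->prio /= 2;
		last_aging = global_faults;
	}

	if (mm == token_holder)
		mm->prio += 2;                          /* holder keeps getting boosted */
	else if (interval < mm->last_interval)
		mm->prio++;                             /* faulting harder than last time */
	else if (mm->prio > 0)
		mm->prio--;

	if (mm != token_holder && mm->prio > token_holder->prio)
		goto take;
	goto out;

take:
	mm->prio += 2;
	token_holder = mm;
	last_aging = global_faults;
out:
	mm->faultstamp = global_faults;
	mm->last_interval = interval;
}

int main(void)
{
	struct mm a = { .name = "a" }, b = { .name = "b" };

	grab_swap_token(&a);            /* a takes the free token */
	grab_swap_token(&a);            /* the holder is boosted on its own faults */
	grab_swap_token(&b);            /* b only gains priority with shrinking intervals */
	printf("holder=%s a.prio=%u b.prio=%u\n", token_holder->name, a.prio, b.prio);
	return 0;
}
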
diff --git a/mm/truncate.c b/mm/truncate.c index 61a183b89df6..75801acdaac7 100644 --- a/mm/truncate.c +++ b/mm/truncate.c | |||
@@ -602,31 +602,6 @@ int vmtruncate(struct inode *inode, loff_t newsize) | |||
602 | } | 602 | } |
603 | EXPORT_SYMBOL(vmtruncate); | 603 | EXPORT_SYMBOL(vmtruncate); |
604 | 604 | ||
605 | int vmtruncate_range(struct inode *inode, loff_t lstart, loff_t lend) | ||
606 | { | ||
607 | struct address_space *mapping = inode->i_mapping; | ||
608 | loff_t holebegin = round_up(lstart, PAGE_SIZE); | ||
609 | loff_t holelen = 1 + lend - holebegin; | ||
610 | |||
611 | /* | ||
612 | * If the underlying filesystem is not going to provide | ||
613 | * a way to truncate a range of blocks (punch a hole) - | ||
614 | * we should return failure right now. | ||
615 | */ | ||
616 | if (!inode->i_op->truncate_range) | ||
617 | return -ENOSYS; | ||
618 | |||
619 | mutex_lock(&inode->i_mutex); | ||
620 | inode_dio_wait(inode); | ||
621 | unmap_mapping_range(mapping, holebegin, holelen, 1); | ||
622 | inode->i_op->truncate_range(inode, lstart, lend); | ||
623 | /* unmap again to remove racily COWed private pages */ | ||
624 | unmap_mapping_range(mapping, holebegin, holelen, 1); | ||
625 | mutex_unlock(&inode->i_mutex); | ||
626 | |||
627 | return 0; | ||
628 | } | ||
629 | |||
630 | /** | 605 | /** |
631 | * truncate_pagecache_range - unmap and remove pagecache that is hole-punched | 606 | * truncate_pagecache_range - unmap and remove pagecache that is hole-punched |
632 | * @inode: inode | 607 | * @inode: inode |
diff --git a/mm/util.c b/mm/util.c --- a/mm/util.c +++ b/mm/util.c | |||
@@ -4,6 +4,7 @@ | |||
4 | #include <linux/export.h> | 4 | #include <linux/export.h> |
5 | #include <linux/err.h> | 5 | #include <linux/err.h> |
6 | #include <linux/sched.h> | 6 | #include <linux/sched.h> |
7 | #include <linux/security.h> | ||
7 | #include <asm/uaccess.h> | 8 | #include <asm/uaccess.h> |
8 | 9 | ||
9 | #include "internal.h" | 10 | #include "internal.h" |
@@ -341,6 +342,35 @@ int __attribute__((weak)) get_user_pages_fast(unsigned long start, | |||
341 | } | 342 | } |
342 | EXPORT_SYMBOL_GPL(get_user_pages_fast); | 343 | EXPORT_SYMBOL_GPL(get_user_pages_fast); |
343 | 344 | ||
345 | unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr, | ||
346 | unsigned long len, unsigned long prot, | ||
347 | unsigned long flag, unsigned long pgoff) | ||
348 | { | ||
349 | unsigned long ret; | ||
350 | struct mm_struct *mm = current->mm; | ||
351 | |||
352 | ret = security_mmap_file(file, prot, flag); | ||
353 | if (!ret) { | ||
354 | down_write(&mm->mmap_sem); | ||
355 | ret = do_mmap_pgoff(file, addr, len, prot, flag, pgoff); | ||
356 | up_write(&mm->mmap_sem); | ||
357 | } | ||
358 | return ret; | ||
359 | } | ||
360 | |||
361 | unsigned long vm_mmap(struct file *file, unsigned long addr, | ||
362 | unsigned long len, unsigned long prot, | ||
363 | unsigned long flag, unsigned long offset) | ||
364 | { | ||
365 | if (unlikely(offset + PAGE_ALIGN(len) < offset)) | ||
366 | return -EINVAL; | ||
367 | if (unlikely(offset & ~PAGE_MASK)) | ||
368 | return -EINVAL; | ||
369 | |||
370 | return vm_mmap_pgoff(file, addr, len, prot, flag, offset >> PAGE_SHIFT); | ||
371 | } | ||
372 | EXPORT_SYMBOL(vm_mmap); | ||
373 | |||
344 | /* Tracepoints definitions. */ | 374 | /* Tracepoints definitions. */ |
345 | EXPORT_TRACEPOINT_SYMBOL(kmalloc); | 375 | EXPORT_TRACEPOINT_SYMBOL(kmalloc); |
346 | EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc); | 376 | EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc); |
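vm_mmap() gives in-kernel callers a helper that rejects a non-page-aligned or overflowing offset, runs the security_mmap_file() hook and takes mmap_sem itself before calling do_mmap_pgoff(). The same two argument checks, transplanted into a stand-alone user-space wrapper around mmap(2), look like this; checked_mmap() is an invented name and sysconf() replaces the kernel's PAGE_SIZE.

#include <errno.h>
#include <stdint.h>
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

/* User-space analogue of vm_mmap()'s argument checks (invented helper name). */
static void *checked_mmap(int fd, size_t len, int prot, int flags, uint64_t offset)
{
	uint64_t page_size = (uint64_t)sysconf(_SC_PAGESIZE);
	uint64_t page_mask = page_size - 1;
	uint64_t aligned_len = (len + page_mask) & ~page_mask;

	if (offset + aligned_len < offset) {    /* offset + PAGE_ALIGN(len) overflowed */
		errno = EINVAL;
		return MAP_FAILED;
	}
	if (offset & page_mask) {               /* offset must be page aligned */
		errno = EINVAL;
		return MAP_FAILED;
	}
	return mmap(NULL, len, prot, flags, fd, (off_t)offset);
}

int main(void)
{
	/* A deliberately misaligned offset is refused before mmap() is ever called. */
	void *p = checked_mmap(-1, 4096, PROT_READ, MAP_PRIVATE | MAP_ANONYMOUS, 123);

	printf("misaligned offset -> %s\n", p == MAP_FAILED ? "rejected (EINVAL)" : "mapped");
	return 0;
}
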
diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 94dff883b449..2aad49981b57 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c | |||
@@ -1185,9 +1185,10 @@ void __init vmalloc_init(void) | |||
1185 | /* Import existing vmlist entries. */ | 1185 | /* Import existing vmlist entries. */ |
1186 | for (tmp = vmlist; tmp; tmp = tmp->next) { | 1186 | for (tmp = vmlist; tmp; tmp = tmp->next) { |
1187 | va = kzalloc(sizeof(struct vmap_area), GFP_NOWAIT); | 1187 | va = kzalloc(sizeof(struct vmap_area), GFP_NOWAIT); |
1188 | va->flags = tmp->flags | VM_VM_AREA; | 1188 | va->flags = VM_VM_AREA; |
1189 | va->va_start = (unsigned long)tmp->addr; | 1189 | va->va_start = (unsigned long)tmp->addr; |
1190 | va->va_end = va->va_start + tmp->size; | 1190 | va->va_end = va->va_start + tmp->size; |
1191 | va->vm = tmp; | ||
1191 | __insert_vmap_area(va); | 1192 | __insert_vmap_area(va); |
1192 | } | 1193 | } |
1193 | 1194 | ||
@@ -2375,8 +2376,8 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets, | |||
2375 | return NULL; | 2376 | return NULL; |
2376 | } | 2377 | } |
2377 | 2378 | ||
2378 | vms = kzalloc(sizeof(vms[0]) * nr_vms, GFP_KERNEL); | 2379 | vms = kcalloc(nr_vms, sizeof(vms[0]), GFP_KERNEL); |
2379 | vas = kzalloc(sizeof(vas[0]) * nr_vms, GFP_KERNEL); | 2380 | vas = kcalloc(nr_vms, sizeof(vas[0]), GFP_KERNEL); |
2380 | if (!vas || !vms) | 2381 | if (!vas || !vms) |
2381 | goto err_free2; | 2382 | goto err_free2; |
2382 | 2383 | ||
diff --git a/mm/vmscan.c b/mm/vmscan.c index 33dc256033b5..66e431060c05 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c | |||
@@ -53,24 +53,6 @@ | |||
53 | #define CREATE_TRACE_POINTS | 53 | #define CREATE_TRACE_POINTS |
54 | #include <trace/events/vmscan.h> | 54 | #include <trace/events/vmscan.h> |
55 | 55 | ||
56 | /* | ||
57 | * reclaim_mode determines how the inactive list is shrunk | ||
58 | * RECLAIM_MODE_SINGLE: Reclaim only order-0 pages | ||
59 | * RECLAIM_MODE_ASYNC: Do not block | ||
60 | * RECLAIM_MODE_SYNC: Allow blocking e.g. call wait_on_page_writeback | ||
61 | * RECLAIM_MODE_LUMPYRECLAIM: For high-order allocations, take a reference | ||
62 | * page from the LRU and reclaim all pages within a | ||
63 | * naturally aligned range | ||
64 | * RECLAIM_MODE_COMPACTION: For high-order allocations, reclaim a number of | ||
65 | * order-0 pages and then compact the zone | ||
66 | */ | ||
67 | typedef unsigned __bitwise__ reclaim_mode_t; | ||
68 | #define RECLAIM_MODE_SINGLE ((__force reclaim_mode_t)0x01u) | ||
69 | #define RECLAIM_MODE_ASYNC ((__force reclaim_mode_t)0x02u) | ||
70 | #define RECLAIM_MODE_SYNC ((__force reclaim_mode_t)0x04u) | ||
71 | #define RECLAIM_MODE_LUMPYRECLAIM ((__force reclaim_mode_t)0x08u) | ||
72 | #define RECLAIM_MODE_COMPACTION ((__force reclaim_mode_t)0x10u) | ||
73 | |||
74 | struct scan_control { | 56 | struct scan_control { |
75 | /* Incremented by the number of inactive pages that were scanned */ | 57 | /* Incremented by the number of inactive pages that were scanned */ |
76 | unsigned long nr_scanned; | 58 | unsigned long nr_scanned; |
@@ -96,11 +78,8 @@ struct scan_control { | |||
96 | 78 | ||
97 | int order; | 79 | int order; |
98 | 80 | ||
99 | /* | 81 | /* Scan (total_size >> priority) pages at once */ |
100 | * Intend to reclaim enough continuous memory rather than reclaim | 82 | int priority; |
101 | * enough amount of memory. i.e, mode for high order allocation. | ||
102 | */ | ||
103 | reclaim_mode_t reclaim_mode; | ||
104 | 83 | ||
105 | /* | 84 | /* |
106 | * The memory cgroup that hit its limit and as a result is the | 85 | * The memory cgroup that hit its limit and as a result is the |
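scan_control now carries the reclaim priority directly, and each pass scans total_size >> priority pages: with DEF_PRIORITY at its usual value of 12, a 4-million-page LRU is scanned 1024 pages at a time on the first pass, with the window doubling as priority drops until the whole list is eligible at priority 0. The loop below just prints that series for an example LRU size.

#include <stdio.h>

int main(void)
{
	unsigned long lru_pages = 4UL << 20;    /* example LRU: 4M pages = 16GB at 4KB */
	int def_priority = 12;                  /* DEF_PRIORITY */

	/* Each pass scans (total_size >> priority) pages, doubling as priority drops. */
	for (int priority = def_priority; priority >= 0; priority--)
		printf("priority %2d -> scan %lu pages\n", priority, lru_pages >> priority);
	return 0;
}
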
@@ -115,11 +94,6 @@ struct scan_control { | |||
115 | nodemask_t *nodemask; | 94 | nodemask_t *nodemask; |
116 | }; | 95 | }; |
117 | 96 | ||
118 | struct mem_cgroup_zone { | ||
119 | struct mem_cgroup *mem_cgroup; | ||
120 | struct zone *zone; | ||
121 | }; | ||
122 | |||
123 | #define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru)) | 97 | #define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru)) |
124 | 98 | ||
125 | #ifdef ARCH_HAS_PREFETCH | 99 | #ifdef ARCH_HAS_PREFETCH |
@@ -164,44 +138,21 @@ static bool global_reclaim(struct scan_control *sc) | |||
164 | { | 138 | { |
165 | return !sc->target_mem_cgroup; | 139 | return !sc->target_mem_cgroup; |
166 | } | 140 | } |
167 | |||
168 | static bool scanning_global_lru(struct mem_cgroup_zone *mz) | ||
169 | { | ||
170 | return !mz->mem_cgroup; | ||
171 | } | ||
172 | #else | 141 | #else |
173 | static bool global_reclaim(struct scan_control *sc) | 142 | static bool global_reclaim(struct scan_control *sc) |
174 | { | 143 | { |
175 | return true; | 144 | return true; |
176 | } | 145 | } |
177 | |||
178 | static bool scanning_global_lru(struct mem_cgroup_zone *mz) | ||
179 | { | ||
180 | return true; | ||
181 | } | ||
182 | #endif | 146 | #endif |
183 | 147 | ||
184 | static struct zone_reclaim_stat *get_reclaim_stat(struct mem_cgroup_zone *mz) | 148 | static unsigned long get_lru_size(struct lruvec *lruvec, enum lru_list lru) |
185 | { | 149 | { |
186 | if (!scanning_global_lru(mz)) | 150 | if (!mem_cgroup_disabled()) |
187 | return mem_cgroup_get_reclaim_stat(mz->mem_cgroup, mz->zone); | 151 | return mem_cgroup_get_lru_size(lruvec, lru); |
188 | 152 | ||
189 | return &mz->zone->reclaim_stat; | 153 | return zone_page_state(lruvec_zone(lruvec), NR_LRU_BASE + lru); |
190 | } | 154 | } |
191 | 155 | ||
192 | static unsigned long zone_nr_lru_pages(struct mem_cgroup_zone *mz, | ||
193 | enum lru_list lru) | ||
194 | { | ||
195 | if (!scanning_global_lru(mz)) | ||
196 | return mem_cgroup_zone_nr_lru_pages(mz->mem_cgroup, | ||
197 | zone_to_nid(mz->zone), | ||
198 | zone_idx(mz->zone), | ||
199 | BIT(lru)); | ||
200 | |||
201 | return zone_page_state(mz->zone, NR_LRU_BASE + lru); | ||
202 | } | ||
203 | |||
204 | |||
205 | /* | 156 | /* |
206 | * Add a shrinker callback to be called from the vm | 157 | * Add a shrinker callback to be called from the vm |
207 | */ | 158 | */ |
@@ -364,39 +315,6 @@ out: | |||
364 | return ret; | 315 | return ret; |
365 | } | 316 | } |
366 | 317 | ||
367 | static void set_reclaim_mode(int priority, struct scan_control *sc, | ||
368 | bool sync) | ||
369 | { | ||
370 | reclaim_mode_t syncmode = sync ? RECLAIM_MODE_SYNC : RECLAIM_MODE_ASYNC; | ||
371 | |||
372 | /* | ||
373 | * Initially assume we are entering either lumpy reclaim or | ||
374 | * reclaim/compaction.Depending on the order, we will either set the | ||
375 | * sync mode or just reclaim order-0 pages later. | ||
376 | */ | ||
377 | if (COMPACTION_BUILD) | ||
378 | sc->reclaim_mode = RECLAIM_MODE_COMPACTION; | ||
379 | else | ||
380 | sc->reclaim_mode = RECLAIM_MODE_LUMPYRECLAIM; | ||
381 | |||
382 | /* | ||
383 | * Avoid using lumpy reclaim or reclaim/compaction if possible by | ||
384 | * restricting when its set to either costly allocations or when | ||
385 | * under memory pressure | ||
386 | */ | ||
387 | if (sc->order > PAGE_ALLOC_COSTLY_ORDER) | ||
388 | sc->reclaim_mode |= syncmode; | ||
389 | else if (sc->order && priority < DEF_PRIORITY - 2) | ||
390 | sc->reclaim_mode |= syncmode; | ||
391 | else | ||
392 | sc->reclaim_mode = RECLAIM_MODE_SINGLE | RECLAIM_MODE_ASYNC; | ||
393 | } | ||
394 | |||
395 | static void reset_reclaim_mode(struct scan_control *sc) | ||
396 | { | ||
397 | sc->reclaim_mode = RECLAIM_MODE_SINGLE | RECLAIM_MODE_ASYNC; | ||
398 | } | ||
399 | |||
400 | static inline int is_page_cache_freeable(struct page *page) | 318 | static inline int is_page_cache_freeable(struct page *page) |
401 | { | 319 | { |
402 | /* | 320 | /* |
@@ -416,10 +334,6 @@ static int may_write_to_queue(struct backing_dev_info *bdi, | |||
416 | return 1; | 334 | return 1; |
417 | if (bdi == current->backing_dev_info) | 335 | if (bdi == current->backing_dev_info) |
418 | return 1; | 336 | return 1; |
419 | |||
420 | /* lumpy reclaim for hugepage often need a lot of write */ | ||
421 | if (sc->order > PAGE_ALLOC_COSTLY_ORDER) | ||
422 | return 1; | ||
423 | return 0; | 337 | return 0; |
424 | } | 338 | } |
425 | 339 | ||
@@ -523,8 +437,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping, | |||
523 | /* synchronous write or broken a_ops? */ | 437 | /* synchronous write or broken a_ops? */ |
524 | ClearPageReclaim(page); | 438 | ClearPageReclaim(page); |
525 | } | 439 | } |
526 | trace_mm_vmscan_writepage(page, | 440 | trace_mm_vmscan_writepage(page, trace_reclaim_flags(page)); |
527 | trace_reclaim_flags(page, sc->reclaim_mode)); | ||
528 | inc_zone_page_state(page, NR_VMSCAN_WRITE); | 441 | inc_zone_page_state(page, NR_VMSCAN_WRITE); |
529 | return PAGE_SUCCESS; | 442 | return PAGE_SUCCESS; |
530 | } | 443 | } |
@@ -701,19 +614,15 @@ enum page_references { | |||
701 | }; | 614 | }; |
702 | 615 | ||
703 | static enum page_references page_check_references(struct page *page, | 616 | static enum page_references page_check_references(struct page *page, |
704 | struct mem_cgroup_zone *mz, | ||
705 | struct scan_control *sc) | 617 | struct scan_control *sc) |
706 | { | 618 | { |
707 | int referenced_ptes, referenced_page; | 619 | int referenced_ptes, referenced_page; |
708 | unsigned long vm_flags; | 620 | unsigned long vm_flags; |
709 | 621 | ||
710 | referenced_ptes = page_referenced(page, 1, mz->mem_cgroup, &vm_flags); | 622 | referenced_ptes = page_referenced(page, 1, sc->target_mem_cgroup, |
623 | &vm_flags); | ||
711 | referenced_page = TestClearPageReferenced(page); | 624 | referenced_page = TestClearPageReferenced(page); |
712 | 625 | ||
713 | /* Lumpy reclaim - ignore references */ | ||
714 | if (sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM) | ||
715 | return PAGEREF_RECLAIM; | ||
716 | |||
717 | /* | 626 | /* |
718 | * Mlock lost the isolation race with us. Let try_to_unmap() | 627 | * Mlock lost the isolation race with us. Let try_to_unmap() |
719 | * move the page to the unevictable list. | 628 | * move the page to the unevictable list. |
@@ -722,7 +631,7 @@ static enum page_references page_check_references(struct page *page, | |||
722 | return PAGEREF_RECLAIM; | 631 | return PAGEREF_RECLAIM; |
723 | 632 | ||
724 | if (referenced_ptes) { | 633 | if (referenced_ptes) { |
725 | if (PageAnon(page)) | 634 | if (PageSwapBacked(page)) |
726 | return PAGEREF_ACTIVATE; | 635 | return PAGEREF_ACTIVATE; |
727 | /* | 636 | /* |
728 | * All mapped pages start out with page table | 637 | * All mapped pages start out with page table |
@@ -763,9 +672,8 @@ static enum page_references page_check_references(struct page *page, | |||
763 | * shrink_page_list() returns the number of reclaimed pages | 672 | * shrink_page_list() returns the number of reclaimed pages |
764 | */ | 673 | */ |
765 | static unsigned long shrink_page_list(struct list_head *page_list, | 674 | static unsigned long shrink_page_list(struct list_head *page_list, |
766 | struct mem_cgroup_zone *mz, | 675 | struct zone *zone, |
767 | struct scan_control *sc, | 676 | struct scan_control *sc, |
768 | int priority, | ||
769 | unsigned long *ret_nr_dirty, | 677 | unsigned long *ret_nr_dirty, |
770 | unsigned long *ret_nr_writeback) | 678 | unsigned long *ret_nr_writeback) |
771 | { | 679 | { |
@@ -794,7 +702,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, | |||
794 | goto keep; | 702 | goto keep; |
795 | 703 | ||
796 | VM_BUG_ON(PageActive(page)); | 704 | VM_BUG_ON(PageActive(page)); |
797 | VM_BUG_ON(page_zone(page) != mz->zone); | 705 | VM_BUG_ON(page_zone(page) != zone); |
798 | 706 | ||
799 | sc->nr_scanned++; | 707 | sc->nr_scanned++; |
800 | 708 | ||
@@ -813,22 +721,11 @@ static unsigned long shrink_page_list(struct list_head *page_list, | |||
813 | 721 | ||
814 | if (PageWriteback(page)) { | 722 | if (PageWriteback(page)) { |
815 | nr_writeback++; | 723 | nr_writeback++; |
816 | /* | 724 | unlock_page(page); |
817 | * Synchronous reclaim cannot queue pages for | 725 | goto keep; |
818 | * writeback due to the possibility of stack overflow | ||
819 | * but if it encounters a page under writeback, wait | ||
820 | * for the IO to complete. | ||
821 | */ | ||
822 | if ((sc->reclaim_mode & RECLAIM_MODE_SYNC) && | ||
823 | may_enter_fs) | ||
824 | wait_on_page_writeback(page); | ||
825 | else { | ||
826 | unlock_page(page); | ||
827 | goto keep_lumpy; | ||
828 | } | ||
829 | } | 726 | } |
830 | 727 | ||
831 | references = page_check_references(page, mz, sc); | 728 | references = page_check_references(page, sc); |
832 | switch (references) { | 729 | switch (references) { |
833 | case PAGEREF_ACTIVATE: | 730 | case PAGEREF_ACTIVATE: |
834 | goto activate_locked; | 731 | goto activate_locked; |
@@ -879,7 +776,8 @@ static unsigned long shrink_page_list(struct list_head *page_list, | |||
879 | * unless under significant pressure. | 776 | * unless under significant pressure. |
880 | */ | 777 | */ |
881 | if (page_is_file_cache(page) && | 778 | if (page_is_file_cache(page) && |
882 | (!current_is_kswapd() || priority >= DEF_PRIORITY - 2)) { | 779 | (!current_is_kswapd() || |
780 | sc->priority >= DEF_PRIORITY - 2)) { | ||
883 | /* | 781 | /* |
884 | * Immediately reclaim when written back. | 782 | * Immediately reclaim when written back. |
885 | * Similar in principal to deactivate_page() | 783 | * Similar in principal to deactivate_page() |
@@ -908,7 +806,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, | |||
908 | goto activate_locked; | 806 | goto activate_locked; |
909 | case PAGE_SUCCESS: | 807 | case PAGE_SUCCESS: |
910 | if (PageWriteback(page)) | 808 | if (PageWriteback(page)) |
911 | goto keep_lumpy; | 809 | goto keep; |
912 | if (PageDirty(page)) | 810 | if (PageDirty(page)) |
913 | goto keep; | 811 | goto keep; |
914 | 812 | ||
@@ -994,7 +892,6 @@ cull_mlocked: | |||
994 | try_to_free_swap(page); | 892 | try_to_free_swap(page); |
995 | unlock_page(page); | 893 | unlock_page(page); |
996 | putback_lru_page(page); | 894 | putback_lru_page(page); |
997 | reset_reclaim_mode(sc); | ||
998 | continue; | 895 | continue; |
999 | 896 | ||
1000 | activate_locked: | 897 | activate_locked: |
@@ -1007,8 +904,6 @@ activate_locked: | |||
1007 | keep_locked: | 904 | keep_locked: |
1008 | unlock_page(page); | 905 | unlock_page(page); |
1009 | keep: | 906 | keep: |
1010 | reset_reclaim_mode(sc); | ||
1011 | keep_lumpy: | ||
1012 | list_add(&page->lru, &ret_pages); | 907 | list_add(&page->lru, &ret_pages); |
1013 | VM_BUG_ON(PageLRU(page) || PageUnevictable(page)); | 908 | VM_BUG_ON(PageLRU(page) || PageUnevictable(page)); |
1014 | } | 909 | } |
@@ -1020,7 +915,7 @@ keep_lumpy: | |||
1020 | * will encounter the same problem | 915 | * will encounter the same problem |
1021 | */ | 916 | */ |
1022 | if (nr_dirty && nr_dirty == nr_congested && global_reclaim(sc)) | 917 | if (nr_dirty && nr_dirty == nr_congested && global_reclaim(sc)) |
1023 | zone_set_flag(mz->zone, ZONE_CONGESTED); | 918 | zone_set_flag(zone, ZONE_CONGESTED); |
1024 | 919 | ||
1025 | free_hot_cold_page_list(&free_pages, 1); | 920 | free_hot_cold_page_list(&free_pages, 1); |
1026 | 921 | ||
@@ -1041,34 +936,15 @@ keep_lumpy: | |||
1041 | * | 936 | * |
1042 | * returns 0 on success, -ve errno on failure. | 937 | * returns 0 on success, -ve errno on failure. |
1043 | */ | 938 | */ |
1044 | int __isolate_lru_page(struct page *page, isolate_mode_t mode, int file) | 939 | int __isolate_lru_page(struct page *page, isolate_mode_t mode) |
1045 | { | 940 | { |
1046 | bool all_lru_mode; | ||
1047 | int ret = -EINVAL; | 941 | int ret = -EINVAL; |
1048 | 942 | ||
1049 | /* Only take pages on the LRU. */ | 943 | /* Only take pages on the LRU. */ |
1050 | if (!PageLRU(page)) | 944 | if (!PageLRU(page)) |
1051 | return ret; | 945 | return ret; |
1052 | 946 | ||
1053 | all_lru_mode = (mode & (ISOLATE_ACTIVE|ISOLATE_INACTIVE)) == | 947 | /* Do not give back unevictable pages for compaction */ |
1054 | (ISOLATE_ACTIVE|ISOLATE_INACTIVE); | ||
1055 | |||
1056 | /* | ||
1057 | * When checking the active state, we need to be sure we are | ||
1058 | * dealing with comparible boolean values. Take the logical not | ||
1059 | * of each. | ||
1060 | */ | ||
1061 | if (!all_lru_mode && !PageActive(page) != !(mode & ISOLATE_ACTIVE)) | ||
1062 | return ret; | ||
1063 | |||
1064 | if (!all_lru_mode && !!page_is_file_cache(page) != file) | ||
1065 | return ret; | ||
1066 | |||
1067 | /* | ||
1068 | * When this function is being called for lumpy reclaim, we | ||
1069 | * initially look into all LRU pages, active, inactive and | ||
1070 | * unevictable; only give shrink_page_list evictable pages. | ||
1071 | */ | ||
1072 | if (PageUnevictable(page)) | 948 | if (PageUnevictable(page)) |
1073 | return ret; | 949 | return ret; |
1074 | 950 | ||
@@ -1135,54 +1011,39 @@ int __isolate_lru_page(struct page *page, isolate_mode_t mode, int file) | |||
1135 | * Appropriate locks must be held before calling this function. | 1011 | * Appropriate locks must be held before calling this function. |
1136 | * | 1012 | * |
1137 | * @nr_to_scan: The number of pages to look through on the list. | 1013 | * @nr_to_scan: The number of pages to look through on the list. |
1138 | * @mz: The mem_cgroup_zone to pull pages from. | 1014 | * @lruvec: The LRU vector to pull pages from. |
1139 | * @dst: The temp list to put pages on to. | 1015 | * @dst: The temp list to put pages on to. |
1140 | * @nr_scanned: The number of pages that were scanned. | 1016 | * @nr_scanned: The number of pages that were scanned. |
1141 | * @sc: The scan_control struct for this reclaim session | 1017 | * @sc: The scan_control struct for this reclaim session |
1142 | * @mode: One of the LRU isolation modes | 1018 | * @mode: One of the LRU isolation modes |
1143 | * @active: True [1] if isolating active pages | 1019 | * @lru: LRU list id for isolating |
1144 | * @file: True [1] if isolating file [!anon] pages | ||
1145 | * | 1020 | * |
1146 | * returns how many pages were moved onto *@dst. | 1021 | * returns how many pages were moved onto *@dst. |
1147 | */ | 1022 | */ |
1148 | static unsigned long isolate_lru_pages(unsigned long nr_to_scan, | 1023 | static unsigned long isolate_lru_pages(unsigned long nr_to_scan, |
1149 | struct mem_cgroup_zone *mz, struct list_head *dst, | 1024 | struct lruvec *lruvec, struct list_head *dst, |
1150 | unsigned long *nr_scanned, struct scan_control *sc, | 1025 | unsigned long *nr_scanned, struct scan_control *sc, |
1151 | isolate_mode_t mode, int active, int file) | 1026 | isolate_mode_t mode, enum lru_list lru) |
1152 | { | 1027 | { |
1153 | struct lruvec *lruvec; | 1028 | struct list_head *src = &lruvec->lists[lru]; |
1154 | struct list_head *src; | ||
1155 | unsigned long nr_taken = 0; | 1029 | unsigned long nr_taken = 0; |
1156 | unsigned long nr_lumpy_taken = 0; | ||
1157 | unsigned long nr_lumpy_dirty = 0; | ||
1158 | unsigned long nr_lumpy_failed = 0; | ||
1159 | unsigned long scan; | 1030 | unsigned long scan; |
1160 | int lru = LRU_BASE; | ||
1161 | |||
1162 | lruvec = mem_cgroup_zone_lruvec(mz->zone, mz->mem_cgroup); | ||
1163 | if (active) | ||
1164 | lru += LRU_ACTIVE; | ||
1165 | if (file) | ||
1166 | lru += LRU_FILE; | ||
1167 | src = &lruvec->lists[lru]; | ||
1168 | 1031 | ||
1169 | for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) { | 1032 | for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) { |
1170 | struct page *page; | 1033 | struct page *page; |
1171 | unsigned long pfn; | 1034 | int nr_pages; |
1172 | unsigned long end_pfn; | ||
1173 | unsigned long page_pfn; | ||
1174 | int zone_id; | ||
1175 | 1035 | ||
1176 | page = lru_to_page(src); | 1036 | page = lru_to_page(src); |
1177 | prefetchw_prev_lru_page(page, src, flags); | 1037 | prefetchw_prev_lru_page(page, src, flags); |
1178 | 1038 | ||
1179 | VM_BUG_ON(!PageLRU(page)); | 1039 | VM_BUG_ON(!PageLRU(page)); |
1180 | 1040 | ||
1181 | switch (__isolate_lru_page(page, mode, file)) { | 1041 | switch (__isolate_lru_page(page, mode)) { |
1182 | case 0: | 1042 | case 0: |
1183 | mem_cgroup_lru_del(page); | 1043 | nr_pages = hpage_nr_pages(page); |
1044 | mem_cgroup_update_lru_size(lruvec, lru, -nr_pages); | ||
1184 | list_move(&page->lru, dst); | 1045 | list_move(&page->lru, dst); |
1185 | nr_taken += hpage_nr_pages(page); | 1046 | nr_taken += nr_pages; |
1186 | break; | 1047 | break; |
1187 | 1048 | ||
1188 | case -EBUSY: | 1049 | case -EBUSY: |
@@ -1193,93 +1054,11 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, | |||
1193 | default: | 1054 | default: |
1194 | BUG(); | 1055 | BUG(); |
1195 | } | 1056 | } |
1196 | |||
1197 | if (!sc->order || !(sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM)) | ||
1198 | continue; | ||
1199 | |||
1200 | /* | ||
1201 | * Attempt to take all pages in the order aligned region | ||
1202 | * surrounding the tag page. Only take those pages of | ||
1203 | * the same active state as that tag page. We may safely | ||
1204 | * round the target page pfn down to the requested order | ||
1205 | * as the mem_map is guaranteed valid out to MAX_ORDER, | ||
1206 | * where that page is in a different zone we will detect | ||
1207 | * it from its zone id and abort this block scan. | ||
1208 | */ | ||
1209 | zone_id = page_zone_id(page); | ||
1210 | page_pfn = page_to_pfn(page); | ||
1211 | pfn = page_pfn & ~((1 << sc->order) - 1); | ||
1212 | end_pfn = pfn + (1 << sc->order); | ||
1213 | for (; pfn < end_pfn; pfn++) { | ||
1214 | struct page *cursor_page; | ||
1215 | |||
1216 | /* The target page is in the block, ignore it. */ | ||
1217 | if (unlikely(pfn == page_pfn)) | ||
1218 | continue; | ||
1219 | |||
1220 | /* Avoid holes within the zone. */ | ||
1221 | if (unlikely(!pfn_valid_within(pfn))) | ||
1222 | break; | ||
1223 | |||
1224 | cursor_page = pfn_to_page(pfn); | ||
1225 | |||
1226 | /* Check that we have not crossed a zone boundary. */ | ||
1227 | if (unlikely(page_zone_id(cursor_page) != zone_id)) | ||
1228 | break; | ||
1229 | |||
1230 | /* | ||
1231 | * If we don't have enough swap space, reclaiming of | ||
1232 | * anon page which don't already have a swap slot is | ||
1233 | * pointless. | ||
1234 | */ | ||
1235 | if (nr_swap_pages <= 0 && PageSwapBacked(cursor_page) && | ||
1236 | !PageSwapCache(cursor_page)) | ||
1237 | break; | ||
1238 | |||
1239 | if (__isolate_lru_page(cursor_page, mode, file) == 0) { | ||
1240 | unsigned int isolated_pages; | ||
1241 | |||
1242 | mem_cgroup_lru_del(cursor_page); | ||
1243 | list_move(&cursor_page->lru, dst); | ||
1244 | isolated_pages = hpage_nr_pages(cursor_page); | ||
1245 | nr_taken += isolated_pages; | ||
1246 | nr_lumpy_taken += isolated_pages; | ||
1247 | if (PageDirty(cursor_page)) | ||
1248 | nr_lumpy_dirty += isolated_pages; | ||
1249 | scan++; | ||
1250 | pfn += isolated_pages - 1; | ||
1251 | } else { | ||
1252 | /* | ||
1253 | * Check if the page is freed already. | ||
1254 | * | ||
1255 | * We can't use page_count() as that | ||
1256 | * requires compound_head and we don't | ||
1257 | * have a pin on the page here. If a | ||
1258 | * page is tail, we may or may not | ||
1259 | * have isolated the head, so assume | ||
1260 | * it's not free, it'd be tricky to | ||
1261 | * track the head status without a | ||
1262 | * page pin. | ||
1263 | */ | ||
1264 | if (!PageTail(cursor_page) && | ||
1265 | !atomic_read(&cursor_page->_count)) | ||
1266 | continue; | ||
1267 | break; | ||
1268 | } | ||
1269 | } | ||
1270 | |||
1271 | /* If we break out of the loop above, lumpy reclaim failed */ | ||
1272 | if (pfn < end_pfn) | ||
1273 | nr_lumpy_failed++; | ||
1274 | } | 1057 | } |
1275 | 1058 | ||
1276 | *nr_scanned = scan; | 1059 | *nr_scanned = scan; |
1277 | 1060 | trace_mm_vmscan_lru_isolate(sc->order, nr_to_scan, scan, | |
1278 | trace_mm_vmscan_lru_isolate(sc->order, | 1061 | nr_taken, mode, is_file_lru(lru)); |
1279 | nr_to_scan, scan, | ||
1280 | nr_taken, | ||
1281 | nr_lumpy_taken, nr_lumpy_dirty, nr_lumpy_failed, | ||
1282 | mode, file); | ||
1283 | return nr_taken; | 1062 | return nr_taken; |
1284 | } | 1063 | } |
1285 | 1064 | ||
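The hunks above fold the old (active, file) argument pair into a single enum lru_list index: __isolate_lru_page() loses its file parameter and isolate_lru_pages() takes the source list straight from lruvec->lists[lru]. The sketch below only illustrates that index arithmetic — is_file_lru(), and the lru - LRU_ACTIVE step a later hunk uses to reach the matching inactive list. The enum layout mirrors the kernel's, but this is an illustrative userspace program, not kernel code.

/* Userspace model of the lru_list indexing; illustrative only. */
#include <stdio.h>

enum lru_list {
	LRU_INACTIVE_ANON,	/* LRU_BASE */
	LRU_ACTIVE_ANON,	/* LRU_BASE + LRU_ACTIVE */
	LRU_INACTIVE_FILE,	/* LRU_BASE + LRU_FILE */
	LRU_ACTIVE_FILE,	/* LRU_BASE + LRU_FILE + LRU_ACTIVE */
	LRU_UNEVICTABLE,
	NR_LRU_LISTS
};
#define LRU_ACTIVE	1
#define LRU_FILE	2

static int is_file_lru(enum lru_list lru)
{
	return lru == LRU_INACTIVE_FILE || lru == LRU_ACTIVE_FILE;
}

static int is_active_lru(enum lru_list lru)
{
	return lru == LRU_ACTIVE_ANON || lru == LRU_ACTIVE_FILE;
}

int main(void)
{
	/* One enum value now carries what the (active, file) pair used to. */
	enum lru_list lru = LRU_ACTIVE_FILE;

	printf("file=%d active=%d inactive counterpart=%d (LRU_INACTIVE_FILE)\n",
	       is_file_lru(lru), is_active_lru(lru), lru - LRU_ACTIVE);
	return 0;
}

That single index is also why shrink_inactive_list() and shrink_active_list() below take an enum lru_list and derive file with is_file_lru(lru) instead of receiving both flags.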
@@ -1316,15 +1095,16 @@ int isolate_lru_page(struct page *page) | |||
1316 | 1095 | ||
1317 | if (PageLRU(page)) { | 1096 | if (PageLRU(page)) { |
1318 | struct zone *zone = page_zone(page); | 1097 | struct zone *zone = page_zone(page); |
1098 | struct lruvec *lruvec; | ||
1319 | 1099 | ||
1320 | spin_lock_irq(&zone->lru_lock); | 1100 | spin_lock_irq(&zone->lru_lock); |
1101 | lruvec = mem_cgroup_page_lruvec(page, zone); | ||
1321 | if (PageLRU(page)) { | 1102 | if (PageLRU(page)) { |
1322 | int lru = page_lru(page); | 1103 | int lru = page_lru(page); |
1323 | ret = 0; | ||
1324 | get_page(page); | 1104 | get_page(page); |
1325 | ClearPageLRU(page); | 1105 | ClearPageLRU(page); |
1326 | 1106 | del_page_from_lru_list(page, lruvec, lru); | |
1327 | del_page_from_lru_list(zone, page, lru); | 1107 | ret = 0; |
1328 | } | 1108 | } |
1329 | spin_unlock_irq(&zone->lru_lock); | 1109 | spin_unlock_irq(&zone->lru_lock); |
1330 | } | 1110 | } |
@@ -1357,11 +1137,10 @@ static int too_many_isolated(struct zone *zone, int file, | |||
1357 | } | 1137 | } |
1358 | 1138 | ||
1359 | static noinline_for_stack void | 1139 | static noinline_for_stack void |
1360 | putback_inactive_pages(struct mem_cgroup_zone *mz, | 1140 | putback_inactive_pages(struct lruvec *lruvec, struct list_head *page_list) |
1361 | struct list_head *page_list) | ||
1362 | { | 1141 | { |
1363 | struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(mz); | 1142 | struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat; |
1364 | struct zone *zone = mz->zone; | 1143 | struct zone *zone = lruvec_zone(lruvec); |
1365 | LIST_HEAD(pages_to_free); | 1144 | LIST_HEAD(pages_to_free); |
1366 | 1145 | ||
1367 | /* | 1146 | /* |
@@ -1379,9 +1158,13 @@ putback_inactive_pages(struct mem_cgroup_zone *mz, | |||
1379 | spin_lock_irq(&zone->lru_lock); | 1158 | spin_lock_irq(&zone->lru_lock); |
1380 | continue; | 1159 | continue; |
1381 | } | 1160 | } |
1161 | |||
1162 | lruvec = mem_cgroup_page_lruvec(page, zone); | ||
1163 | |||
1382 | SetPageLRU(page); | 1164 | SetPageLRU(page); |
1383 | lru = page_lru(page); | 1165 | lru = page_lru(page); |
1384 | add_page_to_lru_list(zone, page, lru); | 1166 | add_page_to_lru_list(page, lruvec, lru); |
1167 | |||
1385 | if (is_active_lru(lru)) { | 1168 | if (is_active_lru(lru)) { |
1386 | int file = is_file_lru(lru); | 1169 | int file = is_file_lru(lru); |
1387 | int numpages = hpage_nr_pages(page); | 1170 | int numpages = hpage_nr_pages(page); |
@@ -1390,7 +1173,7 @@ putback_inactive_pages(struct mem_cgroup_zone *mz, | |||
1390 | if (put_page_testzero(page)) { | 1173 | if (put_page_testzero(page)) { |
1391 | __ClearPageLRU(page); | 1174 | __ClearPageLRU(page); |
1392 | __ClearPageActive(page); | 1175 | __ClearPageActive(page); |
1393 | del_page_from_lru_list(zone, page, lru); | 1176 | del_page_from_lru_list(page, lruvec, lru); |
1394 | 1177 | ||
1395 | if (unlikely(PageCompound(page))) { | 1178 | if (unlikely(PageCompound(page))) { |
1396 | spin_unlock_irq(&zone->lru_lock); | 1179 | spin_unlock_irq(&zone->lru_lock); |
@@ -1407,112 +1190,24 @@ putback_inactive_pages(struct mem_cgroup_zone *mz, | |||
1407 | list_splice(&pages_to_free, page_list); | 1190 | list_splice(&pages_to_free, page_list); |
1408 | } | 1191 | } |
1409 | 1192 | ||
1410 | static noinline_for_stack void | ||
1411 | update_isolated_counts(struct mem_cgroup_zone *mz, | ||
1412 | struct list_head *page_list, | ||
1413 | unsigned long *nr_anon, | ||
1414 | unsigned long *nr_file) | ||
1415 | { | ||
1416 | struct zone *zone = mz->zone; | ||
1417 | unsigned int count[NR_LRU_LISTS] = { 0, }; | ||
1418 | unsigned long nr_active = 0; | ||
1419 | struct page *page; | ||
1420 | int lru; | ||
1421 | |||
1422 | /* | ||
1423 | * Count pages and clear active flags | ||
1424 | */ | ||
1425 | list_for_each_entry(page, page_list, lru) { | ||
1426 | int numpages = hpage_nr_pages(page); | ||
1427 | lru = page_lru_base_type(page); | ||
1428 | if (PageActive(page)) { | ||
1429 | lru += LRU_ACTIVE; | ||
1430 | ClearPageActive(page); | ||
1431 | nr_active += numpages; | ||
1432 | } | ||
1433 | count[lru] += numpages; | ||
1434 | } | ||
1435 | |||
1436 | preempt_disable(); | ||
1437 | __count_vm_events(PGDEACTIVATE, nr_active); | ||
1438 | |||
1439 | __mod_zone_page_state(zone, NR_ACTIVE_FILE, | ||
1440 | -count[LRU_ACTIVE_FILE]); | ||
1441 | __mod_zone_page_state(zone, NR_INACTIVE_FILE, | ||
1442 | -count[LRU_INACTIVE_FILE]); | ||
1443 | __mod_zone_page_state(zone, NR_ACTIVE_ANON, | ||
1444 | -count[LRU_ACTIVE_ANON]); | ||
1445 | __mod_zone_page_state(zone, NR_INACTIVE_ANON, | ||
1446 | -count[LRU_INACTIVE_ANON]); | ||
1447 | |||
1448 | *nr_anon = count[LRU_ACTIVE_ANON] + count[LRU_INACTIVE_ANON]; | ||
1449 | *nr_file = count[LRU_ACTIVE_FILE] + count[LRU_INACTIVE_FILE]; | ||
1450 | |||
1451 | __mod_zone_page_state(zone, NR_ISOLATED_ANON, *nr_anon); | ||
1452 | __mod_zone_page_state(zone, NR_ISOLATED_FILE, *nr_file); | ||
1453 | preempt_enable(); | ||
1454 | } | ||
1455 | |||
1456 | /* | ||
1457 | * Returns true if a direct reclaim should wait on pages under writeback. | ||
1458 | * | ||
1459 | * If we are direct reclaiming for contiguous pages and we do not reclaim | ||
1460 | * everything in the list, try again and wait for writeback IO to complete. | ||
1461 | * This will stall high-order allocations noticeably. Only do that when really | ||
1462 | * need to free the pages under high memory pressure. | ||
1463 | */ | ||
1464 | static inline bool should_reclaim_stall(unsigned long nr_taken, | ||
1465 | unsigned long nr_freed, | ||
1466 | int priority, | ||
1467 | struct scan_control *sc) | ||
1468 | { | ||
1469 | int lumpy_stall_priority; | ||
1470 | |||
1471 | /* kswapd should not stall on sync IO */ | ||
1472 | if (current_is_kswapd()) | ||
1473 | return false; | ||
1474 | |||
1475 | /* Only stall on lumpy reclaim */ | ||
1476 | if (sc->reclaim_mode & RECLAIM_MODE_SINGLE) | ||
1477 | return false; | ||
1478 | |||
1479 | /* If we have reclaimed everything on the isolated list, no stall */ | ||
1480 | if (nr_freed == nr_taken) | ||
1481 | return false; | ||
1482 | |||
1483 | /* | ||
1484 | * For high-order allocations, there are two stall thresholds. | ||
1485 | * High-cost allocations stall immediately where as lower | ||
1486 | * order allocations such as stacks require the scanning | ||
1487 | * priority to be much higher before stalling. | ||
1488 | */ | ||
1489 | if (sc->order > PAGE_ALLOC_COSTLY_ORDER) | ||
1490 | lumpy_stall_priority = DEF_PRIORITY; | ||
1491 | else | ||
1492 | lumpy_stall_priority = DEF_PRIORITY / 3; | ||
1493 | |||
1494 | return priority <= lumpy_stall_priority; | ||
1495 | } | ||
1496 | |||
1497 | /* | 1193 | /* |
1498 | * shrink_inactive_list() is a helper for shrink_zone(). It returns the number | 1194 | * shrink_inactive_list() is a helper for shrink_zone(). It returns the number |
1499 | * of reclaimed pages | 1195 | * of reclaimed pages |
1500 | */ | 1196 | */ |
1501 | static noinline_for_stack unsigned long | 1197 | static noinline_for_stack unsigned long |
1502 | shrink_inactive_list(unsigned long nr_to_scan, struct mem_cgroup_zone *mz, | 1198 | shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, |
1503 | struct scan_control *sc, int priority, int file) | 1199 | struct scan_control *sc, enum lru_list lru) |
1504 | { | 1200 | { |
1505 | LIST_HEAD(page_list); | 1201 | LIST_HEAD(page_list); |
1506 | unsigned long nr_scanned; | 1202 | unsigned long nr_scanned; |
1507 | unsigned long nr_reclaimed = 0; | 1203 | unsigned long nr_reclaimed = 0; |
1508 | unsigned long nr_taken; | 1204 | unsigned long nr_taken; |
1509 | unsigned long nr_anon; | ||
1510 | unsigned long nr_file; | ||
1511 | unsigned long nr_dirty = 0; | 1205 | unsigned long nr_dirty = 0; |
1512 | unsigned long nr_writeback = 0; | 1206 | unsigned long nr_writeback = 0; |
1513 | isolate_mode_t isolate_mode = ISOLATE_INACTIVE; | 1207 | isolate_mode_t isolate_mode = 0; |
1514 | struct zone *zone = mz->zone; | 1208 | int file = is_file_lru(lru); |
1515 | struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(mz); | 1209 | struct zone *zone = lruvec_zone(lruvec); |
1210 | struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat; | ||
1516 | 1211 | ||
1517 | while (unlikely(too_many_isolated(zone, file, sc))) { | 1212 | while (unlikely(too_many_isolated(zone, file, sc))) { |
1518 | congestion_wait(BLK_RW_ASYNC, HZ/10); | 1213 | congestion_wait(BLK_RW_ASYNC, HZ/10); |
@@ -1522,10 +1217,6 @@ shrink_inactive_list(unsigned long nr_to_scan, struct mem_cgroup_zone *mz, | |||
1522 | return SWAP_CLUSTER_MAX; | 1217 | return SWAP_CLUSTER_MAX; |
1523 | } | 1218 | } |
1524 | 1219 | ||
1525 | set_reclaim_mode(priority, sc, false); | ||
1526 | if (sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM) | ||
1527 | isolate_mode |= ISOLATE_ACTIVE; | ||
1528 | |||
1529 | lru_add_drain(); | 1220 | lru_add_drain(); |
1530 | 1221 | ||
1531 | if (!sc->may_unmap) | 1222 | if (!sc->may_unmap) |
@@ -1535,38 +1226,30 @@ shrink_inactive_list(unsigned long nr_to_scan, struct mem_cgroup_zone *mz, | |||
1535 | 1226 | ||
1536 | spin_lock_irq(&zone->lru_lock); | 1227 | spin_lock_irq(&zone->lru_lock); |
1537 | 1228 | ||
1538 | nr_taken = isolate_lru_pages(nr_to_scan, mz, &page_list, &nr_scanned, | 1229 | nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &page_list, |
1539 | sc, isolate_mode, 0, file); | 1230 | &nr_scanned, sc, isolate_mode, lru); |
1231 | |||
1232 | __mod_zone_page_state(zone, NR_LRU_BASE + lru, -nr_taken); | ||
1233 | __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, nr_taken); | ||
1234 | |||
1540 | if (global_reclaim(sc)) { | 1235 | if (global_reclaim(sc)) { |
1541 | zone->pages_scanned += nr_scanned; | 1236 | zone->pages_scanned += nr_scanned; |
1542 | if (current_is_kswapd()) | 1237 | if (current_is_kswapd()) |
1543 | __count_zone_vm_events(PGSCAN_KSWAPD, zone, | 1238 | __count_zone_vm_events(PGSCAN_KSWAPD, zone, nr_scanned); |
1544 | nr_scanned); | ||
1545 | else | 1239 | else |
1546 | __count_zone_vm_events(PGSCAN_DIRECT, zone, | 1240 | __count_zone_vm_events(PGSCAN_DIRECT, zone, nr_scanned); |
1547 | nr_scanned); | ||
1548 | } | 1241 | } |
1549 | spin_unlock_irq(&zone->lru_lock); | 1242 | spin_unlock_irq(&zone->lru_lock); |
1550 | 1243 | ||
1551 | if (nr_taken == 0) | 1244 | if (nr_taken == 0) |
1552 | return 0; | 1245 | return 0; |
1553 | 1246 | ||
1554 | update_isolated_counts(mz, &page_list, &nr_anon, &nr_file); | 1247 | nr_reclaimed = shrink_page_list(&page_list, zone, sc, |
1555 | |||
1556 | nr_reclaimed = shrink_page_list(&page_list, mz, sc, priority, | ||
1557 | &nr_dirty, &nr_writeback); | 1248 | &nr_dirty, &nr_writeback); |
1558 | 1249 | ||
1559 | /* Check if we should syncronously wait for writeback */ | ||
1560 | if (should_reclaim_stall(nr_taken, nr_reclaimed, priority, sc)) { | ||
1561 | set_reclaim_mode(priority, sc, true); | ||
1562 | nr_reclaimed += shrink_page_list(&page_list, mz, sc, | ||
1563 | priority, &nr_dirty, &nr_writeback); | ||
1564 | } | ||
1565 | |||
1566 | spin_lock_irq(&zone->lru_lock); | 1250 | spin_lock_irq(&zone->lru_lock); |
1567 | 1251 | ||
1568 | reclaim_stat->recent_scanned[0] += nr_anon; | 1252 | reclaim_stat->recent_scanned[file] += nr_taken; |
1569 | reclaim_stat->recent_scanned[1] += nr_file; | ||
1570 | 1253 | ||
1571 | if (global_reclaim(sc)) { | 1254 | if (global_reclaim(sc)) { |
1572 | if (current_is_kswapd()) | 1255 | if (current_is_kswapd()) |
@@ -1577,10 +1260,9 @@ shrink_inactive_list(unsigned long nr_to_scan, struct mem_cgroup_zone *mz, | |||
1577 | nr_reclaimed); | 1260 | nr_reclaimed); |
1578 | } | 1261 | } |
1579 | 1262 | ||
1580 | putback_inactive_pages(mz, &page_list); | 1263 | putback_inactive_pages(lruvec, &page_list); |
1581 | 1264 | ||
1582 | __mod_zone_page_state(zone, NR_ISOLATED_ANON, -nr_anon); | 1265 | __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken); |
1583 | __mod_zone_page_state(zone, NR_ISOLATED_FILE, -nr_file); | ||
1584 | 1266 | ||
1585 | spin_unlock_irq(&zone->lru_lock); | 1267 | spin_unlock_irq(&zone->lru_lock); |
1586 | 1268 | ||
@@ -1609,14 +1291,15 @@ shrink_inactive_list(unsigned long nr_to_scan, struct mem_cgroup_zone *mz, | |||
1609 | * DEF_PRIORITY-6 For SWAP_CLUSTER_MAX isolated pages, throttle if any | 1291 | * DEF_PRIORITY-6 For SWAP_CLUSTER_MAX isolated pages, throttle if any |
1610 | * isolated page is PageWriteback | 1292 | * isolated page is PageWriteback |
1611 | */ | 1293 | */ |
1612 | if (nr_writeback && nr_writeback >= (nr_taken >> (DEF_PRIORITY-priority))) | 1294 | if (nr_writeback && nr_writeback >= |
1295 | (nr_taken >> (DEF_PRIORITY - sc->priority))) | ||
1613 | wait_iff_congested(zone, BLK_RW_ASYNC, HZ/10); | 1296 | wait_iff_congested(zone, BLK_RW_ASYNC, HZ/10); |
1614 | 1297 | ||
1615 | trace_mm_vmscan_lru_shrink_inactive(zone->zone_pgdat->node_id, | 1298 | trace_mm_vmscan_lru_shrink_inactive(zone->zone_pgdat->node_id, |
1616 | zone_idx(zone), | 1299 | zone_idx(zone), |
1617 | nr_scanned, nr_reclaimed, | 1300 | nr_scanned, nr_reclaimed, |
1618 | priority, | 1301 | sc->priority, |
1619 | trace_shrink_flags(file, sc->reclaim_mode)); | 1302 | trace_shrink_flags(file)); |
1620 | return nr_reclaimed; | 1303 | return nr_reclaimed; |
1621 | } | 1304 | } |
1622 | 1305 | ||
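The throttle at the end of shrink_inactive_list() keeps its shape but now reads the priority from scan_control: the further the priority has dropped, the smaller the share of the isolated pages that must be under writeback before wait_iff_congested() fires. A small sketch of that threshold arithmetic, assuming the usual DEF_PRIORITY of 12 and SWAP_CLUSTER_MAX of 32:

/* Threshold halves with every priority step; illustrative only. */
#include <stdio.h>

#define DEF_PRIORITY		12
#define SWAP_CLUSTER_MAX	32UL

static int should_throttle(unsigned long nr_writeback,
			   unsigned long nr_taken, int priority)
{
	return nr_writeback &&
	       nr_writeback >= (nr_taken >> (DEF_PRIORITY - priority));
}

int main(void)
{
	unsigned long nr_taken = SWAP_CLUSTER_MAX;
	unsigned long nr_writeback = 8;		/* pages found under writeback */
	int priority;

	for (priority = DEF_PRIORITY; priority >= DEF_PRIORITY - 6; priority--)
		printf("priority %2d: threshold %2lu -> throttle=%d\n",
		       priority, nr_taken >> (DEF_PRIORITY - priority),
		       should_throttle(nr_writeback, nr_taken, priority));
	return 0;
}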
@@ -1638,30 +1321,32 @@ shrink_inactive_list(unsigned long nr_to_scan, struct mem_cgroup_zone *mz, | |||
1638 | * But we had to alter page->flags anyway. | 1321 | * But we had to alter page->flags anyway. |
1639 | */ | 1322 | */ |
1640 | 1323 | ||
1641 | static void move_active_pages_to_lru(struct zone *zone, | 1324 | static void move_active_pages_to_lru(struct lruvec *lruvec, |
1642 | struct list_head *list, | 1325 | struct list_head *list, |
1643 | struct list_head *pages_to_free, | 1326 | struct list_head *pages_to_free, |
1644 | enum lru_list lru) | 1327 | enum lru_list lru) |
1645 | { | 1328 | { |
1329 | struct zone *zone = lruvec_zone(lruvec); | ||
1646 | unsigned long pgmoved = 0; | 1330 | unsigned long pgmoved = 0; |
1647 | struct page *page; | 1331 | struct page *page; |
1332 | int nr_pages; | ||
1648 | 1333 | ||
1649 | while (!list_empty(list)) { | 1334 | while (!list_empty(list)) { |
1650 | struct lruvec *lruvec; | ||
1651 | |||
1652 | page = lru_to_page(list); | 1335 | page = lru_to_page(list); |
1336 | lruvec = mem_cgroup_page_lruvec(page, zone); | ||
1653 | 1337 | ||
1654 | VM_BUG_ON(PageLRU(page)); | 1338 | VM_BUG_ON(PageLRU(page)); |
1655 | SetPageLRU(page); | 1339 | SetPageLRU(page); |
1656 | 1340 | ||
1657 | lruvec = mem_cgroup_lru_add_list(zone, page, lru); | 1341 | nr_pages = hpage_nr_pages(page); |
1342 | mem_cgroup_update_lru_size(lruvec, lru, nr_pages); | ||
1658 | list_move(&page->lru, &lruvec->lists[lru]); | 1343 | list_move(&page->lru, &lruvec->lists[lru]); |
1659 | pgmoved += hpage_nr_pages(page); | 1344 | pgmoved += nr_pages; |
1660 | 1345 | ||
1661 | if (put_page_testzero(page)) { | 1346 | if (put_page_testzero(page)) { |
1662 | __ClearPageLRU(page); | 1347 | __ClearPageLRU(page); |
1663 | __ClearPageActive(page); | 1348 | __ClearPageActive(page); |
1664 | del_page_from_lru_list(zone, page, lru); | 1349 | del_page_from_lru_list(page, lruvec, lru); |
1665 | 1350 | ||
1666 | if (unlikely(PageCompound(page))) { | 1351 | if (unlikely(PageCompound(page))) { |
1667 | spin_unlock_irq(&zone->lru_lock); | 1352 | spin_unlock_irq(&zone->lru_lock); |
@@ -1677,9 +1362,9 @@ static void move_active_pages_to_lru(struct zone *zone, | |||
1677 | } | 1362 | } |
1678 | 1363 | ||
1679 | static void shrink_active_list(unsigned long nr_to_scan, | 1364 | static void shrink_active_list(unsigned long nr_to_scan, |
1680 | struct mem_cgroup_zone *mz, | 1365 | struct lruvec *lruvec, |
1681 | struct scan_control *sc, | 1366 | struct scan_control *sc, |
1682 | int priority, int file) | 1367 | enum lru_list lru) |
1683 | { | 1368 | { |
1684 | unsigned long nr_taken; | 1369 | unsigned long nr_taken; |
1685 | unsigned long nr_scanned; | 1370 | unsigned long nr_scanned; |
@@ -1688,15 +1373,14 @@ static void shrink_active_list(unsigned long nr_to_scan, | |||
1688 | LIST_HEAD(l_active); | 1373 | LIST_HEAD(l_active); |
1689 | LIST_HEAD(l_inactive); | 1374 | LIST_HEAD(l_inactive); |
1690 | struct page *page; | 1375 | struct page *page; |
1691 | struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(mz); | 1376 | struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat; |
1692 | unsigned long nr_rotated = 0; | 1377 | unsigned long nr_rotated = 0; |
1693 | isolate_mode_t isolate_mode = ISOLATE_ACTIVE; | 1378 | isolate_mode_t isolate_mode = 0; |
1694 | struct zone *zone = mz->zone; | 1379 | int file = is_file_lru(lru); |
1380 | struct zone *zone = lruvec_zone(lruvec); | ||
1695 | 1381 | ||
1696 | lru_add_drain(); | 1382 | lru_add_drain(); |
1697 | 1383 | ||
1698 | reset_reclaim_mode(sc); | ||
1699 | |||
1700 | if (!sc->may_unmap) | 1384 | if (!sc->may_unmap) |
1701 | isolate_mode |= ISOLATE_UNMAPPED; | 1385 | isolate_mode |= ISOLATE_UNMAPPED; |
1702 | if (!sc->may_writepage) | 1386 | if (!sc->may_writepage) |
@@ -1704,18 +1388,15 @@ static void shrink_active_list(unsigned long nr_to_scan, | |||
1704 | 1388 | ||
1705 | spin_lock_irq(&zone->lru_lock); | 1389 | spin_lock_irq(&zone->lru_lock); |
1706 | 1390 | ||
1707 | nr_taken = isolate_lru_pages(nr_to_scan, mz, &l_hold, &nr_scanned, sc, | 1391 | nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &l_hold, |
1708 | isolate_mode, 1, file); | 1392 | &nr_scanned, sc, isolate_mode, lru); |
1709 | if (global_reclaim(sc)) | 1393 | if (global_reclaim(sc)) |
1710 | zone->pages_scanned += nr_scanned; | 1394 | zone->pages_scanned += nr_scanned; |
1711 | 1395 | ||
1712 | reclaim_stat->recent_scanned[file] += nr_taken; | 1396 | reclaim_stat->recent_scanned[file] += nr_taken; |
1713 | 1397 | ||
1714 | __count_zone_vm_events(PGREFILL, zone, nr_scanned); | 1398 | __count_zone_vm_events(PGREFILL, zone, nr_scanned); |
1715 | if (file) | 1399 | __mod_zone_page_state(zone, NR_LRU_BASE + lru, -nr_taken); |
1716 | __mod_zone_page_state(zone, NR_ACTIVE_FILE, -nr_taken); | ||
1717 | else | ||
1718 | __mod_zone_page_state(zone, NR_ACTIVE_ANON, -nr_taken); | ||
1719 | __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, nr_taken); | 1400 | __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, nr_taken); |
1720 | spin_unlock_irq(&zone->lru_lock); | 1401 | spin_unlock_irq(&zone->lru_lock); |
1721 | 1402 | ||
@@ -1737,7 +1418,8 @@ static void shrink_active_list(unsigned long nr_to_scan, | |||
1737 | } | 1418 | } |
1738 | } | 1419 | } |
1739 | 1420 | ||
1740 | if (page_referenced(page, 0, mz->mem_cgroup, &vm_flags)) { | 1421 | if (page_referenced(page, 0, sc->target_mem_cgroup, |
1422 | &vm_flags)) { | ||
1741 | nr_rotated += hpage_nr_pages(page); | 1423 | nr_rotated += hpage_nr_pages(page); |
1742 | /* | 1424 | /* |
1743 | * Identify referenced, file-backed active pages and | 1425 | * Identify referenced, file-backed active pages and |
@@ -1770,10 +1452,8 @@ static void shrink_active_list(unsigned long nr_to_scan, | |||
1770 | */ | 1452 | */ |
1771 | reclaim_stat->recent_rotated[file] += nr_rotated; | 1453 | reclaim_stat->recent_rotated[file] += nr_rotated; |
1772 | 1454 | ||
1773 | move_active_pages_to_lru(zone, &l_active, &l_hold, | 1455 | move_active_pages_to_lru(lruvec, &l_active, &l_hold, lru); |
1774 | LRU_ACTIVE + file * LRU_FILE); | 1456 | move_active_pages_to_lru(lruvec, &l_inactive, &l_hold, lru - LRU_ACTIVE); |
1775 | move_active_pages_to_lru(zone, &l_inactive, &l_hold, | ||
1776 | LRU_BASE + file * LRU_FILE); | ||
1777 | __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken); | 1457 | __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken); |
1778 | spin_unlock_irq(&zone->lru_lock); | 1458 | spin_unlock_irq(&zone->lru_lock); |
1779 | 1459 | ||
@@ -1796,13 +1476,12 @@ static int inactive_anon_is_low_global(struct zone *zone) | |||
1796 | 1476 | ||
1797 | /** | 1477 | /** |
1798 | * inactive_anon_is_low - check if anonymous pages need to be deactivated | 1478 | * inactive_anon_is_low - check if anonymous pages need to be deactivated |
1799 | * @zone: zone to check | 1479 | * @lruvec: LRU vector to check |
1800 | * @sc: scan control of this context | ||
1801 | * | 1480 | * |
1802 | * Returns true if the zone does not have enough inactive anon pages, | 1481 | * Returns true if the zone does not have enough inactive anon pages, |
1803 | * meaning some active anon pages need to be deactivated. | 1482 | * meaning some active anon pages need to be deactivated. |
1804 | */ | 1483 | */ |
1805 | static int inactive_anon_is_low(struct mem_cgroup_zone *mz) | 1484 | static int inactive_anon_is_low(struct lruvec *lruvec) |
1806 | { | 1485 | { |
1807 | /* | 1486 | /* |
1808 | * If we don't have swap space, anonymous page deactivation | 1487 | * If we don't have swap space, anonymous page deactivation |
@@ -1811,14 +1490,13 @@ static int inactive_anon_is_low(struct mem_cgroup_zone *mz) | |||
1811 | if (!total_swap_pages) | 1490 | if (!total_swap_pages) |
1812 | return 0; | 1491 | return 0; |
1813 | 1492 | ||
1814 | if (!scanning_global_lru(mz)) | 1493 | if (!mem_cgroup_disabled()) |
1815 | return mem_cgroup_inactive_anon_is_low(mz->mem_cgroup, | 1494 | return mem_cgroup_inactive_anon_is_low(lruvec); |
1816 | mz->zone); | ||
1817 | 1495 | ||
1818 | return inactive_anon_is_low_global(mz->zone); | 1496 | return inactive_anon_is_low_global(lruvec_zone(lruvec)); |
1819 | } | 1497 | } |
1820 | #else | 1498 | #else |
1821 | static inline int inactive_anon_is_low(struct mem_cgroup_zone *mz) | 1499 | static inline int inactive_anon_is_low(struct lruvec *lruvec) |
1822 | { | 1500 | { |
1823 | return 0; | 1501 | return 0; |
1824 | } | 1502 | } |
@@ -1836,7 +1514,7 @@ static int inactive_file_is_low_global(struct zone *zone) | |||
1836 | 1514 | ||
1837 | /** | 1515 | /** |
1838 | * inactive_file_is_low - check if file pages need to be deactivated | 1516 | * inactive_file_is_low - check if file pages need to be deactivated |
1839 | * @mz: memory cgroup and zone to check | 1517 | * @lruvec: LRU vector to check |
1840 | * | 1518 | * |
1841 | * When the system is doing streaming IO, memory pressure here | 1519 | * When the system is doing streaming IO, memory pressure here |
1842 | * ensures that active file pages get deactivated, until more | 1520 | * ensures that active file pages get deactivated, until more |
@@ -1848,44 +1526,39 @@ static int inactive_file_is_low_global(struct zone *zone) | |||
1848 | * This uses a different ratio than the anonymous pages, because | 1526 | * This uses a different ratio than the anonymous pages, because |
1849 | * the page cache uses a use-once replacement algorithm. | 1527 | * the page cache uses a use-once replacement algorithm. |
1850 | */ | 1528 | */ |
1851 | static int inactive_file_is_low(struct mem_cgroup_zone *mz) | 1529 | static int inactive_file_is_low(struct lruvec *lruvec) |
1852 | { | 1530 | { |
1853 | if (!scanning_global_lru(mz)) | 1531 | if (!mem_cgroup_disabled()) |
1854 | return mem_cgroup_inactive_file_is_low(mz->mem_cgroup, | 1532 | return mem_cgroup_inactive_file_is_low(lruvec); |
1855 | mz->zone); | ||
1856 | 1533 | ||
1857 | return inactive_file_is_low_global(mz->zone); | 1534 | return inactive_file_is_low_global(lruvec_zone(lruvec)); |
1858 | } | 1535 | } |
1859 | 1536 | ||
1860 | static int inactive_list_is_low(struct mem_cgroup_zone *mz, int file) | 1537 | static int inactive_list_is_low(struct lruvec *lruvec, enum lru_list lru) |
1861 | { | 1538 | { |
1862 | if (file) | 1539 | if (is_file_lru(lru)) |
1863 | return inactive_file_is_low(mz); | 1540 | return inactive_file_is_low(lruvec); |
1864 | else | 1541 | else |
1865 | return inactive_anon_is_low(mz); | 1542 | return inactive_anon_is_low(lruvec); |
1866 | } | 1543 | } |
1867 | 1544 | ||
1868 | static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan, | 1545 | static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan, |
1869 | struct mem_cgroup_zone *mz, | 1546 | struct lruvec *lruvec, struct scan_control *sc) |
1870 | struct scan_control *sc, int priority) | ||
1871 | { | 1547 | { |
1872 | int file = is_file_lru(lru); | ||
1873 | |||
1874 | if (is_active_lru(lru)) { | 1548 | if (is_active_lru(lru)) { |
1875 | if (inactive_list_is_low(mz, file)) | 1549 | if (inactive_list_is_low(lruvec, lru)) |
1876 | shrink_active_list(nr_to_scan, mz, sc, priority, file); | 1550 | shrink_active_list(nr_to_scan, lruvec, sc, lru); |
1877 | return 0; | 1551 | return 0; |
1878 | } | 1552 | } |
1879 | 1553 | ||
1880 | return shrink_inactive_list(nr_to_scan, mz, sc, priority, file); | 1554 | return shrink_inactive_list(nr_to_scan, lruvec, sc, lru); |
1881 | } | 1555 | } |
1882 | 1556 | ||
1883 | static int vmscan_swappiness(struct mem_cgroup_zone *mz, | 1557 | static int vmscan_swappiness(struct scan_control *sc) |
1884 | struct scan_control *sc) | ||
1885 | { | 1558 | { |
1886 | if (global_reclaim(sc)) | 1559 | if (global_reclaim(sc)) |
1887 | return vm_swappiness; | 1560 | return vm_swappiness; |
1888 | return mem_cgroup_swappiness(mz->mem_cgroup); | 1561 | return mem_cgroup_swappiness(sc->target_mem_cgroup); |
1889 | } | 1562 | } |
1890 | 1563 | ||
1891 | /* | 1564 | /* |
@@ -1896,17 +1569,18 @@ static int vmscan_swappiness(struct mem_cgroup_zone *mz, | |||
1896 | * | 1569 | * |
1897 | * nr[0] = anon pages to scan; nr[1] = file pages to scan | 1570 | * nr[0] = anon pages to scan; nr[1] = file pages to scan |
1898 | */ | 1571 | */ |
1899 | static void get_scan_count(struct mem_cgroup_zone *mz, struct scan_control *sc, | 1572 | static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, |
1900 | unsigned long *nr, int priority) | 1573 | unsigned long *nr) |
1901 | { | 1574 | { |
1902 | unsigned long anon, file, free; | 1575 | unsigned long anon, file, free; |
1903 | unsigned long anon_prio, file_prio; | 1576 | unsigned long anon_prio, file_prio; |
1904 | unsigned long ap, fp; | 1577 | unsigned long ap, fp; |
1905 | struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(mz); | 1578 | struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat; |
1906 | u64 fraction[2], denominator; | 1579 | u64 fraction[2], denominator; |
1907 | enum lru_list lru; | 1580 | enum lru_list lru; |
1908 | int noswap = 0; | 1581 | int noswap = 0; |
1909 | bool force_scan = false; | 1582 | bool force_scan = false; |
1583 | struct zone *zone = lruvec_zone(lruvec); | ||
1910 | 1584 | ||
1911 | /* | 1585 | /* |
1912 | * If the zone or memcg is small, nr[l] can be 0. This | 1586 | * If the zone or memcg is small, nr[l] can be 0. This |
@@ -1918,7 +1592,7 @@ static void get_scan_count(struct mem_cgroup_zone *mz, struct scan_control *sc, | |||
1918 | * latencies, so it's better to scan a minimum amount there as | 1592 | * latencies, so it's better to scan a minimum amount there as |
1919 | * well. | 1593 | * well. |
1920 | */ | 1594 | */ |
1921 | if (current_is_kswapd() && mz->zone->all_unreclaimable) | 1595 | if (current_is_kswapd() && zone->all_unreclaimable) |
1922 | force_scan = true; | 1596 | force_scan = true; |
1923 | if (!global_reclaim(sc)) | 1597 | if (!global_reclaim(sc)) |
1924 | force_scan = true; | 1598 | force_scan = true; |
@@ -1932,16 +1606,16 @@ static void get_scan_count(struct mem_cgroup_zone *mz, struct scan_control *sc, | |||
1932 | goto out; | 1606 | goto out; |
1933 | } | 1607 | } |
1934 | 1608 | ||
1935 | anon = zone_nr_lru_pages(mz, LRU_ACTIVE_ANON) + | 1609 | anon = get_lru_size(lruvec, LRU_ACTIVE_ANON) + |
1936 | zone_nr_lru_pages(mz, LRU_INACTIVE_ANON); | 1610 | get_lru_size(lruvec, LRU_INACTIVE_ANON); |
1937 | file = zone_nr_lru_pages(mz, LRU_ACTIVE_FILE) + | 1611 | file = get_lru_size(lruvec, LRU_ACTIVE_FILE) + |
1938 | zone_nr_lru_pages(mz, LRU_INACTIVE_FILE); | 1612 | get_lru_size(lruvec, LRU_INACTIVE_FILE); |
1939 | 1613 | ||
1940 | if (global_reclaim(sc)) { | 1614 | if (global_reclaim(sc)) { |
1941 | free = zone_page_state(mz->zone, NR_FREE_PAGES); | 1615 | free = zone_page_state(zone, NR_FREE_PAGES); |
1942 | /* If we have very few page cache pages, | 1616 | /* If we have very few page cache pages, |
1943 | force-scan anon pages. */ | 1617 | force-scan anon pages. */ |
1944 | if (unlikely(file + free <= high_wmark_pages(mz->zone))) { | 1618 | if (unlikely(file + free <= high_wmark_pages(zone))) { |
1945 | fraction[0] = 1; | 1619 | fraction[0] = 1; |
1946 | fraction[1] = 0; | 1620 | fraction[1] = 0; |
1947 | denominator = 1; | 1621 | denominator = 1; |
@@ -1953,8 +1627,8 @@ static void get_scan_count(struct mem_cgroup_zone *mz, struct scan_control *sc, | |||
1953 | * With swappiness at 100, anonymous and file have the same priority. | 1627 | * With swappiness at 100, anonymous and file have the same priority. |
1954 | * This scanning priority is essentially the inverse of IO cost. | 1628 | * This scanning priority is essentially the inverse of IO cost. |
1955 | */ | 1629 | */ |
1956 | anon_prio = vmscan_swappiness(mz, sc); | 1630 | anon_prio = vmscan_swappiness(sc); |
1957 | file_prio = 200 - vmscan_swappiness(mz, sc); | 1631 | file_prio = 200 - anon_prio; |
1958 | 1632 | ||
1959 | /* | 1633 | /* |
1960 | * OK, so we have swap space and a fair amount of page cache | 1634 | * OK, so we have swap space and a fair amount of page cache |
@@ -1967,7 +1641,7 @@ static void get_scan_count(struct mem_cgroup_zone *mz, struct scan_control *sc, | |||
1967 | * | 1641 | * |
1968 | * anon in [0], file in [1] | 1642 | * anon in [0], file in [1] |
1969 | */ | 1643 | */ |
1970 | spin_lock_irq(&mz->zone->lru_lock); | 1644 | spin_lock_irq(&zone->lru_lock); |
1971 | if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) { | 1645 | if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) { |
1972 | reclaim_stat->recent_scanned[0] /= 2; | 1646 | reclaim_stat->recent_scanned[0] /= 2; |
1973 | reclaim_stat->recent_rotated[0] /= 2; | 1647 | reclaim_stat->recent_rotated[0] /= 2; |
@@ -1983,12 +1657,12 @@ static void get_scan_count(struct mem_cgroup_zone *mz, struct scan_control *sc, | |||
1983 | * proportional to the fraction of recently scanned pages on | 1657 | * proportional to the fraction of recently scanned pages on |
1984 | * each list that were recently referenced and in active use. | 1658 | * each list that were recently referenced and in active use. |
1985 | */ | 1659 | */ |
1986 | ap = (anon_prio + 1) * (reclaim_stat->recent_scanned[0] + 1); | 1660 | ap = anon_prio * (reclaim_stat->recent_scanned[0] + 1); |
1987 | ap /= reclaim_stat->recent_rotated[0] + 1; | 1661 | ap /= reclaim_stat->recent_rotated[0] + 1; |
1988 | 1662 | ||
1989 | fp = (file_prio + 1) * (reclaim_stat->recent_scanned[1] + 1); | 1663 | fp = file_prio * (reclaim_stat->recent_scanned[1] + 1); |
1990 | fp /= reclaim_stat->recent_rotated[1] + 1; | 1664 | fp /= reclaim_stat->recent_rotated[1] + 1; |
1991 | spin_unlock_irq(&mz->zone->lru_lock); | 1665 | spin_unlock_irq(&zone->lru_lock); |
1992 | 1666 | ||
1993 | fraction[0] = ap; | 1667 | fraction[0] = ap; |
1994 | fraction[1] = fp; | 1668 | fraction[1] = fp; |
@@ -1998,9 +1672,9 @@ out: | |||
1998 | int file = is_file_lru(lru); | 1672 | int file = is_file_lru(lru); |
1999 | unsigned long scan; | 1673 | unsigned long scan; |
2000 | 1674 | ||
2001 | scan = zone_nr_lru_pages(mz, lru); | 1675 | scan = get_lru_size(lruvec, lru); |
2002 | if (priority || noswap) { | 1676 | if (sc->priority || noswap || !vmscan_swappiness(sc)) { |
2003 | scan >>= priority; | 1677 | scan >>= sc->priority; |
2004 | if (!scan && force_scan) | 1678 | if (!scan && force_scan) |
2005 | scan = SWAP_CLUSTER_MAX; | 1679 | scan = SWAP_CLUSTER_MAX; |
2006 | scan = div64_u64(scan * fraction[file], denominator); | 1680 | scan = div64_u64(scan * fraction[file], denominator); |
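get_scan_count() now reads the lruvec's embedded reclaim_stat and drops the +1 on anon_prio/file_prio when forming the ap/fp fractions. The rough model below walks that proportional split with invented reclaim_stat and list sizes; only the arithmetic — the fractions, the per-priority shift, the force-scan fallback and the final divide — mirrors the code above.

/* Rough model of the anon/file scan split; the numbers are made up. */
#include <stdint.h>
#include <stdio.h>

#define SWAP_CLUSTER_MAX 32UL

int main(void)
{
	int swappiness = 60;			/* vm_swappiness default */
	unsigned long anon_prio = swappiness;
	unsigned long file_prio = 200 - swappiness;

	/* reclaim_stat, [0] = anon, [1] = file (invented values) */
	unsigned long recent_scanned[2] = { 4000, 12000 };
	unsigned long recent_rotated[2] = { 3000, 1000 };

	uint64_t ap = anon_prio * (recent_scanned[0] + 1) / (recent_rotated[0] + 1);
	uint64_t fp = file_prio * (recent_scanned[1] + 1) / (recent_rotated[1] + 1);
	uint64_t denominator = ap + fp + 1;
	uint64_t fraction[2] = { ap, fp };

	unsigned long lru_size[2] = { 200000, 800000 };	/* inactive anon/file */
	int priority = 10;				/* sc->priority */
	int file;

	for (file = 0; file <= 1; file++) {
		uint64_t scan = lru_size[file] >> priority;

		if (!scan)
			scan = SWAP_CLUSTER_MAX;	/* force_scan fallback */
		scan = scan * fraction[file] / denominator;
		printf("%s: scan %llu pages this pass\n",
		       file ? "file" : "anon", (unsigned long long)scan);
	}
	return 0;
}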
@@ -2009,14 +1683,25 @@ out: | |||
2009 | } | 1683 | } |
2010 | } | 1684 | } |
2011 | 1685 | ||
1686 | /* Use reclaim/compaction for costly allocs or under memory pressure */ | ||
1687 | static bool in_reclaim_compaction(struct scan_control *sc) | ||
1688 | { | ||
1689 | if (COMPACTION_BUILD && sc->order && | ||
1690 | (sc->order > PAGE_ALLOC_COSTLY_ORDER || | ||
1691 | sc->priority < DEF_PRIORITY - 2)) | ||
1692 | return true; | ||
1693 | |||
1694 | return false; | ||
1695 | } | ||
1696 | |||
2012 | /* | 1697 | /* |
2013 | * Reclaim/compaction depends on a number of pages being freed. To avoid | 1698 | * Reclaim/compaction is used for high-order allocation requests. It reclaims |
2014 | * disruption to the system, a small number of order-0 pages continue to be | 1699 | * order-0 pages before compacting the zone. should_continue_reclaim() returns |
2015 | * rotated and reclaimed in the normal fashion. However, by the time we get | 1700 | * true if more pages should be reclaimed such that when the page allocator |
2016 | * back to the allocator and call try_to_compact_zone(), we ensure that | 1701 | * calls try_to_compact_zone() that it will have enough free pages to succeed. |
2017 | * there are enough free pages for it to be likely successful | 1702 | * It will give up earlier than that if there is difficulty reclaiming pages. |
2018 | */ | 1703 | */ |
2019 | static inline bool should_continue_reclaim(struct mem_cgroup_zone *mz, | 1704 | static inline bool should_continue_reclaim(struct lruvec *lruvec, |
2020 | unsigned long nr_reclaimed, | 1705 | unsigned long nr_reclaimed, |
2021 | unsigned long nr_scanned, | 1706 | unsigned long nr_scanned, |
2022 | struct scan_control *sc) | 1707 | struct scan_control *sc) |
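The RECLAIM_MODE_COMPACTION bit is gone; should_continue_reclaim() now asks the new in_reclaim_compaction() helper directly. A sketch of that predicate with the usual kernel constants substituted (DEF_PRIORITY = 12, PAGE_ALLOC_COSTLY_ORDER = 3) and compaction assumed built in:

/* Stand-alone sketch of the in_reclaim_compaction() test added above. */
#include <stdbool.h>
#include <stdio.h>

#define COMPACTION_BUILD		1
#define DEF_PRIORITY			12
#define PAGE_ALLOC_COSTLY_ORDER		3

struct scan_control {
	int order;
	int priority;
};

static bool in_reclaim_compaction(const struct scan_control *sc)
{
	/* High-order requests only; cheap orders wait for some pressure. */
	return COMPACTION_BUILD && sc->order &&
	       (sc->order > PAGE_ALLOC_COSTLY_ORDER ||
		sc->priority < DEF_PRIORITY - 2);
}

int main(void)
{
	struct scan_control samples[] = {
		{ .order = 0, .priority = 12 },	/* order-0: never */
		{ .order = 2, .priority = 12 },	/* cheap order, little pressure */
		{ .order = 2, .priority = 9 },	/* cheap order, real pressure */
		{ .order = 4, .priority = 12 },	/* costly order: immediately */
	};
	unsigned int i;

	for (i = 0; i < sizeof(samples) / sizeof(samples[0]); i++)
		printf("order=%d priority=%d -> %s\n",
		       samples[i].order, samples[i].priority,
		       in_reclaim_compaction(&samples[i]) ?
				"reclaim/compaction" : "plain reclaim");
	return 0;
}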
@@ -2025,7 +1710,7 @@ static inline bool should_continue_reclaim(struct mem_cgroup_zone *mz, | |||
2025 | unsigned long inactive_lru_pages; | 1710 | unsigned long inactive_lru_pages; |
2026 | 1711 | ||
2027 | /* If not in reclaim/compaction mode, stop */ | 1712 | /* If not in reclaim/compaction mode, stop */ |
2028 | if (!(sc->reclaim_mode & RECLAIM_MODE_COMPACTION)) | 1713 | if (!in_reclaim_compaction(sc)) |
2029 | return false; | 1714 | return false; |
2030 | 1715 | ||
2031 | /* Consider stopping depending on scan and reclaim activity */ | 1716 | /* Consider stopping depending on scan and reclaim activity */ |
@@ -2056,15 +1741,15 @@ static inline bool should_continue_reclaim(struct mem_cgroup_zone *mz, | |||
2056 | * inactive lists are large enough, continue reclaiming | 1741 | * inactive lists are large enough, continue reclaiming |
2057 | */ | 1742 | */ |
2058 | pages_for_compaction = (2UL << sc->order); | 1743 | pages_for_compaction = (2UL << sc->order); |
2059 | inactive_lru_pages = zone_nr_lru_pages(mz, LRU_INACTIVE_FILE); | 1744 | inactive_lru_pages = get_lru_size(lruvec, LRU_INACTIVE_FILE); |
2060 | if (nr_swap_pages > 0) | 1745 | if (nr_swap_pages > 0) |
2061 | inactive_lru_pages += zone_nr_lru_pages(mz, LRU_INACTIVE_ANON); | 1746 | inactive_lru_pages += get_lru_size(lruvec, LRU_INACTIVE_ANON); |
2062 | if (sc->nr_reclaimed < pages_for_compaction && | 1747 | if (sc->nr_reclaimed < pages_for_compaction && |
2063 | inactive_lru_pages > pages_for_compaction) | 1748 | inactive_lru_pages > pages_for_compaction) |
2064 | return true; | 1749 | return true; |
2065 | 1750 | ||
2066 | /* If compaction would go ahead or the allocation would succeed, stop */ | 1751 | /* If compaction would go ahead or the allocation would succeed, stop */ |
2067 | switch (compaction_suitable(mz->zone, sc->order)) { | 1752 | switch (compaction_suitable(lruvec_zone(lruvec), sc->order)) { |
2068 | case COMPACT_PARTIAL: | 1753 | case COMPACT_PARTIAL: |
2069 | case COMPACT_CONTINUE: | 1754 | case COMPACT_CONTINUE: |
2070 | return false; | 1755 | return false; |
@@ -2076,8 +1761,7 @@ static inline bool should_continue_reclaim(struct mem_cgroup_zone *mz, | |||
2076 | /* | 1761 | /* |
2077 | * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. | 1762 | * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. |
2078 | */ | 1763 | */ |
2079 | static void shrink_mem_cgroup_zone(int priority, struct mem_cgroup_zone *mz, | 1764 | static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) |
2080 | struct scan_control *sc) | ||
2081 | { | 1765 | { |
2082 | unsigned long nr[NR_LRU_LISTS]; | 1766 | unsigned long nr[NR_LRU_LISTS]; |
2083 | unsigned long nr_to_scan; | 1767 | unsigned long nr_to_scan; |
@@ -2089,7 +1773,7 @@ static void shrink_mem_cgroup_zone(int priority, struct mem_cgroup_zone *mz, | |||
2089 | restart: | 1773 | restart: |
2090 | nr_reclaimed = 0; | 1774 | nr_reclaimed = 0; |
2091 | nr_scanned = sc->nr_scanned; | 1775 | nr_scanned = sc->nr_scanned; |
2092 | get_scan_count(mz, sc, nr, priority); | 1776 | get_scan_count(lruvec, sc, nr); |
2093 | 1777 | ||
2094 | blk_start_plug(&plug); | 1778 | blk_start_plug(&plug); |
2095 | while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || | 1779 | while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || |
@@ -2101,7 +1785,7 @@ restart: | |||
2101 | nr[lru] -= nr_to_scan; | 1785 | nr[lru] -= nr_to_scan; |
2102 | 1786 | ||
2103 | nr_reclaimed += shrink_list(lru, nr_to_scan, | 1787 | nr_reclaimed += shrink_list(lru, nr_to_scan, |
2104 | mz, sc, priority); | 1788 | lruvec, sc); |
2105 | } | 1789 | } |
2106 | } | 1790 | } |
2107 | /* | 1791 | /* |
@@ -2112,7 +1796,8 @@ restart: | |||
2112 | * with multiple processes reclaiming pages, the total | 1796 | * with multiple processes reclaiming pages, the total |
2113 | * freeing target can get unreasonably large. | 1797 | * freeing target can get unreasonably large. |
2114 | */ | 1798 | */ |
2115 | if (nr_reclaimed >= nr_to_reclaim && priority < DEF_PRIORITY) | 1799 | if (nr_reclaimed >= nr_to_reclaim && |
1800 | sc->priority < DEF_PRIORITY) | ||
2116 | break; | 1801 | break; |
2117 | } | 1802 | } |
2118 | blk_finish_plug(&plug); | 1803 | blk_finish_plug(&plug); |
@@ -2122,35 +1807,33 @@ restart: | |||
2122 | * Even if we did not try to evict anon pages at all, we want to | 1807 | * Even if we did not try to evict anon pages at all, we want to |
2123 | * rebalance the anon lru active/inactive ratio. | 1808 | * rebalance the anon lru active/inactive ratio. |
2124 | */ | 1809 | */ |
2125 | if (inactive_anon_is_low(mz)) | 1810 | if (inactive_anon_is_low(lruvec)) |
2126 | shrink_active_list(SWAP_CLUSTER_MAX, mz, sc, priority, 0); | 1811 | shrink_active_list(SWAP_CLUSTER_MAX, lruvec, |
1812 | sc, LRU_ACTIVE_ANON); | ||
2127 | 1813 | ||
2128 | /* reclaim/compaction might need reclaim to continue */ | 1814 | /* reclaim/compaction might need reclaim to continue */ |
2129 | if (should_continue_reclaim(mz, nr_reclaimed, | 1815 | if (should_continue_reclaim(lruvec, nr_reclaimed, |
2130 | sc->nr_scanned - nr_scanned, sc)) | 1816 | sc->nr_scanned - nr_scanned, sc)) |
2131 | goto restart; | 1817 | goto restart; |
2132 | 1818 | ||
2133 | throttle_vm_writeout(sc->gfp_mask); | 1819 | throttle_vm_writeout(sc->gfp_mask); |
2134 | } | 1820 | } |
2135 | 1821 | ||
2136 | static void shrink_zone(int priority, struct zone *zone, | 1822 | static void shrink_zone(struct zone *zone, struct scan_control *sc) |
2137 | struct scan_control *sc) | ||
2138 | { | 1823 | { |
2139 | struct mem_cgroup *root = sc->target_mem_cgroup; | 1824 | struct mem_cgroup *root = sc->target_mem_cgroup; |
2140 | struct mem_cgroup_reclaim_cookie reclaim = { | 1825 | struct mem_cgroup_reclaim_cookie reclaim = { |
2141 | .zone = zone, | 1826 | .zone = zone, |
2142 | .priority = priority, | 1827 | .priority = sc->priority, |
2143 | }; | 1828 | }; |
2144 | struct mem_cgroup *memcg; | 1829 | struct mem_cgroup *memcg; |
2145 | 1830 | ||
2146 | memcg = mem_cgroup_iter(root, NULL, &reclaim); | 1831 | memcg = mem_cgroup_iter(root, NULL, &reclaim); |
2147 | do { | 1832 | do { |
2148 | struct mem_cgroup_zone mz = { | 1833 | struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg); |
2149 | .mem_cgroup = memcg, | 1834 | |
2150 | .zone = zone, | 1835 | shrink_lruvec(lruvec, sc); |
2151 | }; | ||
2152 | 1836 | ||
2153 | shrink_mem_cgroup_zone(priority, &mz, sc); | ||
2154 | /* | 1837 | /* |
2155 | * Limit reclaim has historically picked one memcg and | 1838 | * Limit reclaim has historically picked one memcg and |
2156 | * scanned it with decreasing priority levels until | 1839 | * scanned it with decreasing priority levels until |
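shrink_zone() no longer builds a mem_cgroup_zone on the stack; it walks the memcg hierarchy with mem_cgroup_iter() and hands shrink_lruvec() the per-(zone, memcg) lruvec from mem_cgroup_zone_lruvec(). The loose userspace analogy below only shows that shape: a flat array stands in for the hierarchy walk, the reclaim step is faked, and none of the types are the kernel's.

/* Loose analogy of "one lruvec per (zone, memcg), shrunk in turn". */
#include <stdio.h>

#define NR_LRU_LISTS 5

struct lruvec {
	unsigned long nr_pages[NR_LRU_LISTS];	/* stand-in for lists[] */
};

struct memcg {
	const char *name;
	struct lruvec zone_lruvec;		/* one zone only, for brevity */
};

static unsigned long shrink_lruvec(struct lruvec *lruvec, int priority)
{
	unsigned long reclaimed = 0;
	int lru;

	/* Pretend each list gives up a priority-scaled share of its pages. */
	for (lru = 0; lru < NR_LRU_LISTS; lru++) {
		unsigned long scan = lruvec->nr_pages[lru] >> priority;

		lruvec->nr_pages[lru] -= scan;
		reclaimed += scan;
	}
	return reclaimed;
}

int main(void)
{
	struct memcg hierarchy[] = {
		{ "root",   { { 40000, 10000, 90000, 20000, 0 } } },
		{ "webapp", { { 12000,  3000, 30000,  8000, 0 } } },
	};
	int priority = 10;			/* sc->priority */
	unsigned int i;

	for (i = 0; i < sizeof(hierarchy) / sizeof(hierarchy[0]); i++)
		printf("%s: reclaimed %lu pages\n", hierarchy[i].name,
		       shrink_lruvec(&hierarchy[i].zone_lruvec, priority));
	return 0;
}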
@@ -2226,8 +1909,7 @@ static inline bool compaction_ready(struct zone *zone, struct scan_control *sc) | |||
2226 | * the caller that it should consider retrying the allocation instead of | 1909 | * the caller that it should consider retrying the allocation instead of |
2227 | * further reclaim. | 1910 | * further reclaim. |
2228 | */ | 1911 | */ |
2229 | static bool shrink_zones(int priority, struct zonelist *zonelist, | 1912 | static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc) |
2230 | struct scan_control *sc) | ||
2231 | { | 1913 | { |
2232 | struct zoneref *z; | 1914 | struct zoneref *z; |
2233 | struct zone *zone; | 1915 | struct zone *zone; |
@@ -2254,7 +1936,8 @@ static bool shrink_zones(int priority, struct zonelist *zonelist, | |||
2254 | if (global_reclaim(sc)) { | 1936 | if (global_reclaim(sc)) { |
2255 | if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) | 1937 | if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) |
2256 | continue; | 1938 | continue; |
2257 | if (zone->all_unreclaimable && priority != DEF_PRIORITY) | 1939 | if (zone->all_unreclaimable && |
1940 | sc->priority != DEF_PRIORITY) | ||
2258 | continue; /* Let kswapd poll it */ | 1941 | continue; /* Let kswapd poll it */ |
2259 | if (COMPACTION_BUILD) { | 1942 | if (COMPACTION_BUILD) { |
2260 | /* | 1943 | /* |
@@ -2286,7 +1969,7 @@ static bool shrink_zones(int priority, struct zonelist *zonelist, | |||
2286 | /* need some check for avoid more shrink_zone() */ | 1969 | /* need some check for avoid more shrink_zone() */ |
2287 | } | 1970 | } |
2288 | 1971 | ||
2289 | shrink_zone(priority, zone, sc); | 1972 | shrink_zone(zone, sc); |
2290 | } | 1973 | } |
2291 | 1974 | ||
2292 | return aborted_reclaim; | 1975 | return aborted_reclaim; |
@@ -2337,7 +2020,6 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, | |||
2337 | struct scan_control *sc, | 2020 | struct scan_control *sc, |
2338 | struct shrink_control *shrink) | 2021 | struct shrink_control *shrink) |
2339 | { | 2022 | { |
2340 | int priority; | ||
2341 | unsigned long total_scanned = 0; | 2023 | unsigned long total_scanned = 0; |
2342 | struct reclaim_state *reclaim_state = current->reclaim_state; | 2024 | struct reclaim_state *reclaim_state = current->reclaim_state; |
2343 | struct zoneref *z; | 2025 | struct zoneref *z; |
@@ -2350,11 +2032,9 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, | |||
2350 | if (global_reclaim(sc)) | 2032 | if (global_reclaim(sc)) |
2351 | count_vm_event(ALLOCSTALL); | 2033 | count_vm_event(ALLOCSTALL); |
2352 | 2034 | ||
2353 | for (priority = DEF_PRIORITY; priority >= 0; priority--) { | 2035 | do { |
2354 | sc->nr_scanned = 0; | 2036 | sc->nr_scanned = 0; |
2355 | if (!priority) | 2037 | aborted_reclaim = shrink_zones(zonelist, sc); |
2356 | disable_swap_token(sc->target_mem_cgroup); | ||
2357 | aborted_reclaim = shrink_zones(priority, zonelist, sc); | ||
2358 | 2038 | ||
2359 | /* | 2039 | /* |
2360 | * Don't shrink slabs when reclaiming memory from | 2040 | * Don't shrink slabs when reclaiming memory from |
@@ -2396,7 +2076,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, | |||
2396 | 2076 | ||
2397 | /* Take a nap, wait for some writeback to complete */ | 2077 | /* Take a nap, wait for some writeback to complete */ |
2398 | if (!sc->hibernation_mode && sc->nr_scanned && | 2078 | if (!sc->hibernation_mode && sc->nr_scanned && |
2399 | priority < DEF_PRIORITY - 2) { | 2079 | sc->priority < DEF_PRIORITY - 2) { |
2400 | struct zone *preferred_zone; | 2080 | struct zone *preferred_zone; |
2401 | 2081 | ||
2402 | first_zones_zonelist(zonelist, gfp_zone(sc->gfp_mask), | 2082 | first_zones_zonelist(zonelist, gfp_zone(sc->gfp_mask), |
@@ -2404,7 +2084,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, | |||
2404 | &preferred_zone); | 2084 | &preferred_zone); |
2405 | wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/10); | 2085 | wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/10); |
2406 | } | 2086 | } |
2407 | } | 2087 | } while (--sc->priority >= 0); |
2408 | 2088 | ||
2409 | out: | 2089 | out: |
2410 | delayacct_freepages_end(); | 2090 | delayacct_freepages_end(); |
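
Editor's note: taken together, the hunks above move the reclaim priority out of a local loop variable and into scan_control, so callers seed .priority once (DEF_PRIORITY, 0, or ZONE_RECLAIM_PRIORITY in the later hunks) and the main loop counts it down in place. A minimal, standalone sketch of that control-flow pattern — the struct and function names here are illustrative placeholders, not the kernel's types:

#include <stdio.h>

#define DEF_PRIORITY 12	/* same constant the kernel uses for full-strength scans */

/* Stand-in for struct scan_control: the priority now travels with the
 * rest of the reclaim parameters instead of living in a local variable. */
struct scan_ctl {
	int priority;
	unsigned long nr_reclaimed;
};

/* Stand-in for shrink_zones()/shrink_zone(): reads sc->priority directly. */
static void scan_one_pass(struct scan_ctl *sc)
{
	/* a lower priority value means a more aggressive pass in the real code */
	sc->nr_reclaimed += (unsigned long)(DEF_PRIORITY - sc->priority + 1);
}

int main(void)
{
	struct scan_ctl sc = { .priority = DEF_PRIORITY };	/* callers seed it once */

	do {		/* was: for (priority = DEF_PRIORITY; priority >= 0; priority--) */
		scan_one_pass(&sc);
	} while (--sc.priority >= 0);

	printf("passes ran down to priority %d, reclaimed %lu\n",
	       sc.priority + 1, sc.nr_reclaimed);
	return 0;
}

The do/while shape also guarantees at least one pass even when a caller seeds the priority at 0.
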
@@ -2442,6 +2122,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order, | |||
2442 | .may_unmap = 1, | 2122 | .may_unmap = 1, |
2443 | .may_swap = 1, | 2123 | .may_swap = 1, |
2444 | .order = order, | 2124 | .order = order, |
2125 | .priority = DEF_PRIORITY, | ||
2445 | .target_mem_cgroup = NULL, | 2126 | .target_mem_cgroup = NULL, |
2446 | .nodemask = nodemask, | 2127 | .nodemask = nodemask, |
2447 | }; | 2128 | }; |
@@ -2474,17 +2155,15 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg, | |||
2474 | .may_unmap = 1, | 2155 | .may_unmap = 1, |
2475 | .may_swap = !noswap, | 2156 | .may_swap = !noswap, |
2476 | .order = 0, | 2157 | .order = 0, |
2158 | .priority = 0, | ||
2477 | .target_mem_cgroup = memcg, | 2159 | .target_mem_cgroup = memcg, |
2478 | }; | 2160 | }; |
2479 | struct mem_cgroup_zone mz = { | 2161 | struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg); |
2480 | .mem_cgroup = memcg, | ||
2481 | .zone = zone, | ||
2482 | }; | ||
2483 | 2162 | ||
2484 | sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | | 2163 | sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | |
2485 | (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK); | 2164 | (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK); |
2486 | 2165 | ||
2487 | trace_mm_vmscan_memcg_softlimit_reclaim_begin(0, | 2166 | trace_mm_vmscan_memcg_softlimit_reclaim_begin(sc.order, |
2488 | sc.may_writepage, | 2167 | sc.may_writepage, |
2489 | sc.gfp_mask); | 2168 | sc.gfp_mask); |
2490 | 2169 | ||
@@ -2495,7 +2174,7 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg, | |||
2495 | * will pick up pages from other mem cgroup's as well. We hack | 2174 | * will pick up pages from other mem cgroup's as well. We hack |
2496 | * the priority and make it zero. | 2175 | * the priority and make it zero. |
2497 | */ | 2176 | */ |
2498 | shrink_mem_cgroup_zone(0, &mz, &sc); | 2177 | shrink_lruvec(lruvec, &sc); |
2499 | 2178 | ||
2500 | trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed); | 2179 | trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed); |
2501 | 2180 | ||
@@ -2516,6 +2195,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, | |||
2516 | .may_swap = !noswap, | 2195 | .may_swap = !noswap, |
2517 | .nr_to_reclaim = SWAP_CLUSTER_MAX, | 2196 | .nr_to_reclaim = SWAP_CLUSTER_MAX, |
2518 | .order = 0, | 2197 | .order = 0, |
2198 | .priority = DEF_PRIORITY, | ||
2519 | .target_mem_cgroup = memcg, | 2199 | .target_mem_cgroup = memcg, |
2520 | .nodemask = NULL, /* we don't care the placement */ | 2200 | .nodemask = NULL, /* we don't care the placement */ |
2521 | .gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | | 2201 | .gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | |
@@ -2546,8 +2226,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, | |||
2546 | } | 2226 | } |
2547 | #endif | 2227 | #endif |
2548 | 2228 | ||
2549 | static void age_active_anon(struct zone *zone, struct scan_control *sc, | 2229 | static void age_active_anon(struct zone *zone, struct scan_control *sc) |
2550 | int priority) | ||
2551 | { | 2230 | { |
2552 | struct mem_cgroup *memcg; | 2231 | struct mem_cgroup *memcg; |
2553 | 2232 | ||
@@ -2556,14 +2235,11 @@ static void age_active_anon(struct zone *zone, struct scan_control *sc, | |||
2556 | 2235 | ||
2557 | memcg = mem_cgroup_iter(NULL, NULL, NULL); | 2236 | memcg = mem_cgroup_iter(NULL, NULL, NULL); |
2558 | do { | 2237 | do { |
2559 | struct mem_cgroup_zone mz = { | 2238 | struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg); |
2560 | .mem_cgroup = memcg, | ||
2561 | .zone = zone, | ||
2562 | }; | ||
2563 | 2239 | ||
2564 | if (inactive_anon_is_low(&mz)) | 2240 | if (inactive_anon_is_low(lruvec)) |
2565 | shrink_active_list(SWAP_CLUSTER_MAX, &mz, | 2241 | shrink_active_list(SWAP_CLUSTER_MAX, lruvec, |
2566 | sc, priority, 0); | 2242 | sc, LRU_ACTIVE_ANON); |
2567 | 2243 | ||
2568 | memcg = mem_cgroup_iter(NULL, memcg, NULL); | 2244 | memcg = mem_cgroup_iter(NULL, memcg, NULL); |
2569 | } while (memcg); | 2245 | } while (memcg); |
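
Editor's note: the age_active_anon() hunk keeps the usual mem_cgroup_iter() walk but asks for the per-zone lruvec instead of building a mem_cgroup_zone on the stack. The iterator idiom itself — prime with NULL, process, then feed the previous element back in until NULL comes out — looks roughly like this standalone sketch (toy types, not kernel APIs):

#include <stdio.h>
#include <stddef.h>

/* Toy stand-ins: a flat "hierarchy" of three groups. */
struct group { const char *name; };

static struct group groups[] = { { "root" }, { "child-a" }, { "child-b" } };

/* Stand-in for mem_cgroup_iter(): NULL starts the walk, the previous
 * element resumes it, and NULL is returned once the walk is complete. */
static struct group *group_iter(struct group *prev)
{
	size_t next = prev ? (size_t)(prev - groups) + 1 : 0;

	return next < sizeof(groups) / sizeof(groups[0]) ? &groups[next] : NULL;
}

int main(void)
{
	struct group *g = group_iter(NULL);	/* prime the iterator */

	do {
		printf("aging anon LRU of %s\n", g->name);	/* work on this group's lruvec */
		g = group_iter(g);				/* advance */
	} while (g);

	return 0;
}
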
@@ -2672,7 +2348,6 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order, | |||
2672 | { | 2348 | { |
2673 | int all_zones_ok; | 2349 | int all_zones_ok; |
2674 | unsigned long balanced; | 2350 | unsigned long balanced; |
2675 | int priority; | ||
2676 | int i; | 2351 | int i; |
2677 | int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */ | 2352 | int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */ |
2678 | unsigned long total_scanned; | 2353 | unsigned long total_scanned; |
@@ -2696,18 +2371,15 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order, | |||
2696 | }; | 2371 | }; |
2697 | loop_again: | 2372 | loop_again: |
2698 | total_scanned = 0; | 2373 | total_scanned = 0; |
2374 | sc.priority = DEF_PRIORITY; | ||
2699 | sc.nr_reclaimed = 0; | 2375 | sc.nr_reclaimed = 0; |
2700 | sc.may_writepage = !laptop_mode; | 2376 | sc.may_writepage = !laptop_mode; |
2701 | count_vm_event(PAGEOUTRUN); | 2377 | count_vm_event(PAGEOUTRUN); |
2702 | 2378 | ||
2703 | for (priority = DEF_PRIORITY; priority >= 0; priority--) { | 2379 | do { |
2704 | unsigned long lru_pages = 0; | 2380 | unsigned long lru_pages = 0; |
2705 | int has_under_min_watermark_zone = 0; | 2381 | int has_under_min_watermark_zone = 0; |
2706 | 2382 | ||
2707 | /* The swap token gets in the way of swapout... */ | ||
2708 | if (!priority) | ||
2709 | disable_swap_token(NULL); | ||
2710 | |||
2711 | all_zones_ok = 1; | 2383 | all_zones_ok = 1; |
2712 | balanced = 0; | 2384 | balanced = 0; |
2713 | 2385 | ||
@@ -2721,14 +2393,15 @@ loop_again: | |||
2721 | if (!populated_zone(zone)) | 2393 | if (!populated_zone(zone)) |
2722 | continue; | 2394 | continue; |
2723 | 2395 | ||
2724 | if (zone->all_unreclaimable && priority != DEF_PRIORITY) | 2396 | if (zone->all_unreclaimable && |
2397 | sc.priority != DEF_PRIORITY) | ||
2725 | continue; | 2398 | continue; |
2726 | 2399 | ||
2727 | /* | 2400 | /* |
2728 | * Do some background aging of the anon list, to give | 2401 | * Do some background aging of the anon list, to give |
2729 | * pages a chance to be referenced before reclaiming. | 2402 | * pages a chance to be referenced before reclaiming. |
2730 | */ | 2403 | */ |
2731 | age_active_anon(zone, &sc, priority); | 2404 | age_active_anon(zone, &sc); |
2732 | 2405 | ||
2733 | /* | 2406 | /* |
2734 | * If the number of buffer_heads in the machine | 2407 | * If the number of buffer_heads in the machine |
@@ -2776,7 +2449,8 @@ loop_again: | |||
2776 | if (!populated_zone(zone)) | 2449 | if (!populated_zone(zone)) |
2777 | continue; | 2450 | continue; |
2778 | 2451 | ||
2779 | if (zone->all_unreclaimable && priority != DEF_PRIORITY) | 2452 | if (zone->all_unreclaimable && |
2453 | sc.priority != DEF_PRIORITY) | ||
2780 | continue; | 2454 | continue; |
2781 | 2455 | ||
2782 | sc.nr_scanned = 0; | 2456 | sc.nr_scanned = 0; |
@@ -2820,7 +2494,7 @@ loop_again: | |||
2820 | !zone_watermark_ok_safe(zone, testorder, | 2494 | !zone_watermark_ok_safe(zone, testorder, |
2821 | high_wmark_pages(zone) + balance_gap, | 2495 | high_wmark_pages(zone) + balance_gap, |
2822 | end_zone, 0)) { | 2496 | end_zone, 0)) { |
2823 | shrink_zone(priority, zone, &sc); | 2497 | shrink_zone(zone, &sc); |
2824 | 2498 | ||
2825 | reclaim_state->reclaimed_slab = 0; | 2499 | reclaim_state->reclaimed_slab = 0; |
2826 | nr_slab = shrink_slab(&shrink, sc.nr_scanned, lru_pages); | 2500 | nr_slab = shrink_slab(&shrink, sc.nr_scanned, lru_pages); |
@@ -2877,7 +2551,7 @@ loop_again: | |||
2877 | * OK, kswapd is getting into trouble. Take a nap, then take | 2551 | * OK, kswapd is getting into trouble. Take a nap, then take |
2878 | * another pass across the zones. | 2552 | * another pass across the zones. |
2879 | */ | 2553 | */ |
2880 | if (total_scanned && (priority < DEF_PRIORITY - 2)) { | 2554 | if (total_scanned && (sc.priority < DEF_PRIORITY - 2)) { |
2881 | if (has_under_min_watermark_zone) | 2555 | if (has_under_min_watermark_zone) |
2882 | count_vm_event(KSWAPD_SKIP_CONGESTION_WAIT); | 2556 | count_vm_event(KSWAPD_SKIP_CONGESTION_WAIT); |
2883 | else | 2557 | else |
@@ -2892,7 +2566,7 @@ loop_again: | |||
2892 | */ | 2566 | */ |
2893 | if (sc.nr_reclaimed >= SWAP_CLUSTER_MAX) | 2567 | if (sc.nr_reclaimed >= SWAP_CLUSTER_MAX) |
2894 | break; | 2568 | break; |
2895 | } | 2569 | } while (--sc.priority >= 0); |
2896 | out: | 2570 | out: |
2897 | 2571 | ||
2898 | /* | 2572 | /* |
@@ -2942,7 +2616,8 @@ out: | |||
2942 | if (!populated_zone(zone)) | 2616 | if (!populated_zone(zone)) |
2943 | continue; | 2617 | continue; |
2944 | 2618 | ||
2945 | if (zone->all_unreclaimable && priority != DEF_PRIORITY) | 2619 | if (zone->all_unreclaimable && |
2620 | sc.priority != DEF_PRIORITY) | ||
2946 | continue; | 2621 | continue; |
2947 | 2622 | ||
2948 | /* Would compaction fail due to lack of free memory? */ | 2623 | /* Would compaction fail due to lack of free memory? */ |
@@ -3013,7 +2688,10 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx) | |||
3013 | * them before going back to sleep. | 2688 | * them before going back to sleep. |
3014 | */ | 2689 | */ |
3015 | set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold); | 2690 | set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold); |
3016 | schedule(); | 2691 | |
2692 | if (!kthread_should_stop()) | ||
2693 | schedule(); | ||
2694 | |||
3017 | set_pgdat_percpu_threshold(pgdat, calculate_pressure_threshold); | 2695 | set_pgdat_percpu_threshold(pgdat, calculate_pressure_threshold); |
3018 | } else { | 2696 | } else { |
3019 | if (remaining) | 2697 | if (remaining) |
@@ -3209,6 +2887,7 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim) | |||
3209 | .nr_to_reclaim = nr_to_reclaim, | 2887 | .nr_to_reclaim = nr_to_reclaim, |
3210 | .hibernation_mode = 1, | 2888 | .hibernation_mode = 1, |
3211 | .order = 0, | 2889 | .order = 0, |
2890 | .priority = DEF_PRIORITY, | ||
3212 | }; | 2891 | }; |
3213 | struct shrink_control shrink = { | 2892 | struct shrink_control shrink = { |
3214 | .gfp_mask = sc.gfp_mask, | 2893 | .gfp_mask = sc.gfp_mask, |
@@ -3279,14 +2958,17 @@ int kswapd_run(int nid) | |||
3279 | } | 2958 | } |
3280 | 2959 | ||
3281 | /* | 2960 | /* |
3282 | * Called by memory hotplug when all memory in a node is offlined. | 2961 | * Called by memory hotplug when all memory in a node is offlined. Caller must |
2962 | * hold lock_memory_hotplug(). | ||
3283 | */ | 2963 | */ |
3284 | void kswapd_stop(int nid) | 2964 | void kswapd_stop(int nid) |
3285 | { | 2965 | { |
3286 | struct task_struct *kswapd = NODE_DATA(nid)->kswapd; | 2966 | struct task_struct *kswapd = NODE_DATA(nid)->kswapd; |
3287 | 2967 | ||
3288 | if (kswapd) | 2968 | if (kswapd) { |
3289 | kthread_stop(kswapd); | 2969 | kthread_stop(kswapd); |
2970 | NODE_DATA(nid)->kswapd = NULL; | ||
2971 | } | ||
3290 | } | 2972 | } |
3291 | 2973 | ||
3292 | static int __init kswapd_init(void) | 2974 | static int __init kswapd_init(void) |
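
Editor's note: the kswapd_stop() hunk both stops the thread and clears the cached task pointer, so a later kswapd_run() for the same node sees no stale handle and starts a fresh thread; the paired kswapd_try_to_sleep() hunk likewise avoids scheduling once kthread_should_stop() fires. A standalone sketch of that stop-and-clear pattern with a placeholder worker handle — not the kernel's kthread API:

#include <stdio.h>
#include <stdlib.h>

struct worker { int nid; };

/* Toy per-node state holding a cached worker handle, standing in for
 * NODE_DATA(nid)->kswapd. */
struct node_data {
	struct worker *kswapd;
};

static struct node_data nodes[2];

static void worker_run(int nid)
{
	if (nodes[nid].kswapd)		/* already running: nothing to do */
		return;
	nodes[nid].kswapd = malloc(sizeof(struct worker));
	if (!nodes[nid].kswapd)
		return;
	nodes[nid].kswapd->nid = nid;
	printf("started worker for node %d\n", nid);
}

static void worker_stop(int nid)
{
	struct worker *w = nodes[nid].kswapd;

	if (w) {
		free(w);			/* stand-in for kthread_stop() */
		nodes[nid].kswapd = NULL;	/* clear the stale handle */
		printf("stopped worker for node %d\n", nid);
	}
}

int main(void)
{
	worker_run(0);
	worker_stop(0);		/* offline: stop and forget the old thread */
	worker_run(0);		/* re-online: starts cleanly because the handle is NULL */
	worker_stop(0);
	return 0;
}
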
@@ -3386,7 +3068,6 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) | |||
3386 | const unsigned long nr_pages = 1 << order; | 3068 | const unsigned long nr_pages = 1 << order; |
3387 | struct task_struct *p = current; | 3069 | struct task_struct *p = current; |
3388 | struct reclaim_state reclaim_state; | 3070 | struct reclaim_state reclaim_state; |
3389 | int priority; | ||
3390 | struct scan_control sc = { | 3071 | struct scan_control sc = { |
3391 | .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE), | 3072 | .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE), |
3392 | .may_unmap = !!(zone_reclaim_mode & RECLAIM_SWAP), | 3073 | .may_unmap = !!(zone_reclaim_mode & RECLAIM_SWAP), |
@@ -3395,6 +3076,7 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) | |||
3395 | SWAP_CLUSTER_MAX), | 3076 | SWAP_CLUSTER_MAX), |
3396 | .gfp_mask = gfp_mask, | 3077 | .gfp_mask = gfp_mask, |
3397 | .order = order, | 3078 | .order = order, |
3079 | .priority = ZONE_RECLAIM_PRIORITY, | ||
3398 | }; | 3080 | }; |
3399 | struct shrink_control shrink = { | 3081 | struct shrink_control shrink = { |
3400 | .gfp_mask = sc.gfp_mask, | 3082 | .gfp_mask = sc.gfp_mask, |
@@ -3417,11 +3099,9 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) | |||
3417 | * Free memory by calling shrink zone with increasing | 3099 | * Free memory by calling shrink zone with increasing |
3418 | * priorities until we have enough memory freed. | 3100 | * priorities until we have enough memory freed. |
3419 | */ | 3101 | */ |
3420 | priority = ZONE_RECLAIM_PRIORITY; | ||
3421 | do { | 3102 | do { |
3422 | shrink_zone(priority, zone, &sc); | 3103 | shrink_zone(zone, &sc); |
3423 | priority--; | 3104 | } while (sc.nr_reclaimed < nr_pages && --sc.priority >= 0); |
3424 | } while (priority >= 0 && sc.nr_reclaimed < nr_pages); | ||
3425 | } | 3105 | } |
3426 | 3106 | ||
3427 | nr_slab_pages0 = zone_page_state(zone, NR_SLAB_RECLAIMABLE); | 3107 | nr_slab_pages0 = zone_page_state(zone, NR_SLAB_RECLAIMABLE); |
@@ -3536,7 +3216,7 @@ int page_evictable(struct page *page, struct vm_area_struct *vma) | |||
3536 | if (mapping_unevictable(page_mapping(page))) | 3216 | if (mapping_unevictable(page_mapping(page))) |
3537 | return 0; | 3217 | return 0; |
3538 | 3218 | ||
3539 | if (PageMlocked(page) || (vma && is_mlocked_vma(vma, page))) | 3219 | if (PageMlocked(page) || (vma && mlocked_vma_newpage(vma, page))) |
3540 | return 0; | 3220 | return 0; |
3541 | 3221 | ||
3542 | return 1; | 3222 | return 1; |
@@ -3572,6 +3252,7 @@ void check_move_unevictable_pages(struct page **pages, int nr_pages) | |||
3572 | zone = pagezone; | 3252 | zone = pagezone; |
3573 | spin_lock_irq(&zone->lru_lock); | 3253 | spin_lock_irq(&zone->lru_lock); |
3574 | } | 3254 | } |
3255 | lruvec = mem_cgroup_page_lruvec(page, zone); | ||
3575 | 3256 | ||
3576 | if (!PageLRU(page) || !PageUnevictable(page)) | 3257 | if (!PageLRU(page) || !PageUnevictable(page)) |
3577 | continue; | 3258 | continue; |
@@ -3581,11 +3262,8 @@ void check_move_unevictable_pages(struct page **pages, int nr_pages) | |||
3581 | 3262 | ||
3582 | VM_BUG_ON(PageActive(page)); | 3263 | VM_BUG_ON(PageActive(page)); |
3583 | ClearPageUnevictable(page); | 3264 | ClearPageUnevictable(page); |
3584 | __dec_zone_state(zone, NR_UNEVICTABLE); | 3265 | del_page_from_lru_list(page, lruvec, LRU_UNEVICTABLE); |
3585 | lruvec = mem_cgroup_lru_move_lists(zone, page, | 3266 | add_page_to_lru_list(page, lruvec, lru); |
3586 | LRU_UNEVICTABLE, lru); | ||
3587 | list_move(&page->lru, &lruvec->lists[lru]); | ||
3588 | __inc_zone_state(zone, NR_INACTIVE_ANON + lru); | ||
3589 | pgrescued++; | 3267 | pgrescued++; |
3590 | } | 3268 | } |
3591 | } | 3269 | } |
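
Editor's note: the check_move_unevictable_pages() hunk replaces the open-coded counter updates and list_move() with the paired lruvec helpers: look the lruvec up once per page, delete from the unevictable list, add to the target list, and let the helpers keep the per-LRU statistics consistent. A standalone sketch of that delete-then-add move on counted lists (toy types, not the kernel's list or lruvec helpers):

#include <stdio.h>

enum lru_list { LRU_INACTIVE, LRU_UNEVICTABLE, NR_LRU_LISTS };

/* Toy "lruvec": just the per-list page counts the real helpers keep in sync. */
struct toy_lruvec {
	unsigned long count[NR_LRU_LISTS];
};

struct toy_page {
	enum lru_list lru;
};

/* Stand-ins for del_page_from_lru_list()/add_page_to_lru_list(): the
 * counter bookkeeping lives inside the helpers, not at the call site. */
static void del_page(struct toy_page *page, struct toy_lruvec *lruvec)
{
	lruvec->count[page->lru]--;
}

static void add_page(struct toy_page *page, struct toy_lruvec *lruvec,
		     enum lru_list lru)
{
	page->lru = lru;
	lruvec->count[lru]++;
}

int main(void)
{
	struct toy_lruvec lruvec = { .count = { 0, 1 } };
	struct toy_page page = { .lru = LRU_UNEVICTABLE };

	/* "rescue" the page: move it from the unevictable to the inactive list */
	del_page(&page, &lruvec);
	add_page(&page, &lruvec, LRU_INACTIVE);

	printf("inactive=%lu unevictable=%lu\n",
	       lruvec.count[LRU_INACTIVE], lruvec.count[LRU_UNEVICTABLE]);
	return 0;
}
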
diff --git a/mm/vmstat.c b/mm/vmstat.c index 7db1b9bab492..1bbbbd9776ad 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c | |||
@@ -613,6 +613,9 @@ static char * const migratetype_names[MIGRATE_TYPES] = { | |||
613 | "Reclaimable", | 613 | "Reclaimable", |
614 | "Movable", | 614 | "Movable", |
615 | "Reserve", | 615 | "Reserve", |
616 | #ifdef CONFIG_CMA | ||
617 | "CMA", | ||
618 | #endif | ||
616 | "Isolate", | 619 | "Isolate", |
617 | }; | 620 | }; |
618 | 621 | ||
@@ -1220,7 +1223,6 @@ module_init(setup_vmstat) | |||
1220 | #if defined(CONFIG_DEBUG_FS) && defined(CONFIG_COMPACTION) | 1223 | #if defined(CONFIG_DEBUG_FS) && defined(CONFIG_COMPACTION) |
1221 | #include <linux/debugfs.h> | 1224 | #include <linux/debugfs.h> |
1222 | 1225 | ||
1223 | static struct dentry *extfrag_debug_root; | ||
1224 | 1226 | ||
1225 | /* | 1227 | /* |
1226 | * Return an index indicating how much of the available free memory is | 1228 | * Return an index indicating how much of the available free memory is |
@@ -1358,19 +1360,24 @@ static const struct file_operations extfrag_file_ops = { | |||
1358 | 1360 | ||
1359 | static int __init extfrag_debug_init(void) | 1361 | static int __init extfrag_debug_init(void) |
1360 | { | 1362 | { |
1363 | struct dentry *extfrag_debug_root; | ||
1364 | |||
1361 | extfrag_debug_root = debugfs_create_dir("extfrag", NULL); | 1365 | extfrag_debug_root = debugfs_create_dir("extfrag", NULL); |
1362 | if (!extfrag_debug_root) | 1366 | if (!extfrag_debug_root) |
1363 | return -ENOMEM; | 1367 | return -ENOMEM; |
1364 | 1368 | ||
1365 | if (!debugfs_create_file("unusable_index", 0444, | 1369 | if (!debugfs_create_file("unusable_index", 0444, |
1366 | extfrag_debug_root, NULL, &unusable_file_ops)) | 1370 | extfrag_debug_root, NULL, &unusable_file_ops)) |
1367 | return -ENOMEM; | 1371 | goto fail; |
1368 | 1372 | ||
1369 | if (!debugfs_create_file("extfrag_index", 0444, | 1373 | if (!debugfs_create_file("extfrag_index", 0444, |
1370 | extfrag_debug_root, NULL, &extfrag_file_ops)) | 1374 | extfrag_debug_root, NULL, &extfrag_file_ops)) |
1371 | return -ENOMEM; | 1375 | goto fail; |
1372 | 1376 | ||
1373 | return 0; | 1377 | return 0; |
1378 | fail: | ||
1379 | debugfs_remove_recursive(extfrag_debug_root); | ||
1380 | return -ENOMEM; | ||
1374 | } | 1381 | } |
1375 | 1382 | ||
1376 | module_init(extfrag_debug_init); | 1383 | module_init(extfrag_debug_init); |
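
Editor's note: the extfrag_debug_init() hunk replaces the early -ENOMEM returns (which left the just-created directory behind) with a single fail: label that calls debugfs_remove_recursive() on the whole directory, and it demotes the root dentry to a local since nothing else references it. The general shape — create a container, bail to one cleanup point that tears down the container and everything created inside it — in a standalone sketch with placeholder resources (not the debugfs API):

#include <stdio.h>
#include <stdlib.h>

/* Placeholder "resources": a container plus two members that must all
 * succeed, or everything is torn down through one cleanup path. */
struct container { void *a, *b; };

static struct container *container_create(void)
{
	return calloc(1, sizeof(struct container));
}

static void container_destroy(struct container *c)
{
	/* recursive teardown: frees whichever members exist, then the container */
	free(c->a);
	free(c->b);
	free(c);
}

static int subsystem_init(void)
{
	struct container *c = container_create();

	if (!c)
		return -1;

	c->a = malloc(16);
	if (!c->a)
		goto fail;

	c->b = malloc(16);
	if (!c->b)
		goto fail;

	/* on success the container stays around for the program's lifetime,
	 * much like the debugfs directory does for the module's lifetime */
	printf("init ok\n");
	return 0;
fail:
	container_destroy(c);	/* one exit path undoes whatever was created */
	return -1;
}

int main(void)
{
	return subsystem_init() ? EXIT_FAILURE : EXIT_SUCCESS;
}
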