Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig                |  29
-rw-r--r--  mm/Makefile               |  13
-rw-r--r--  mm/bootmem.c              | 138
-rw-r--r--  mm/cleancache.c           |   6
-rw-r--r--  mm/compaction.c           | 432
-rw-r--r--  mm/filemap.c              | 108
-rw-r--r--  mm/filemap_xip.c          |   4
-rw-r--r--  mm/frontswap.c            | 314
-rw-r--r--  mm/huge_memory.c          |  29
-rw-r--r--  mm/hugetlb.c              |  34
-rw-r--r--  mm/internal.h             |  42
-rw-r--r--  mm/madvise.c              |  29
-rw-r--r--  mm/memblock.c             | 155
-rw-r--r--  mm/memcontrol.c           | 761
-rw-r--r--  mm/memory-failure.c       |  10
-rw-r--r--  mm/memory.c               |  61
-rw-r--r--  mm/memory_hotplug.c       |  22
-rw-r--r--  mm/mempolicy.c            |  83
-rw-r--r--  mm/migrate.c              |   9
-rw-r--r--  mm/mmap.c                 | 158
-rw-r--r--  mm/mmzone.c               |  14
-rw-r--r--  mm/mremap.c               |  26
-rw-r--r--  mm/nobootmem.c            | 150
-rw-r--r--  mm/nommu.c                |  35
-rw-r--r--  mm/oom_kill.c             |  59
-rw-r--r--  mm/page-writeback.c       |   3
-rw-r--r--  mm/page_alloc.c           | 495
-rw-r--r--  mm/page_cgroup.c          |   4
-rw-r--r--  mm/page_io.c              |  12
-rw-r--r--  mm/page_isolation.c       |  15
-rw-r--r--  mm/pagewalk.c             |   1
-rw-r--r--  mm/percpu-vm.c            |   1
-rw-r--r--  mm/pgtable-generic.c      |   4
-rw-r--r--  mm/process_vm_access.c    |  16
-rw-r--r--  mm/readahead.c            |  40
-rw-r--r--  mm/rmap.c                 |   6
-rw-r--r--  mm/shmem.c                | 564
-rw-r--r--  mm/slub.c                 |  23
-rw-r--r--  mm/sparse.c               |  33
-rw-r--r--  mm/swap.c                 | 129
-rw-r--r--  mm/swapfile.c             |  99
-rw-r--r--  mm/thrash.c               | 155
-rw-r--r--  mm/truncate.c             |  25
-rw-r--r--  mm/util.c                 |  30
-rw-r--r--  mm/vmalloc.c              |   7
-rw-r--r--  mm/vmscan.c               | 750
-rw-r--r--  mm/vmstat.c               |  13
47 files changed, 2907 insertions(+), 2239 deletions(-)
diff --git a/mm/Kconfig b/mm/Kconfig
index e338407f1225..82fed4eb2b6f 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -198,7 +198,7 @@ config COMPACTION
198config MIGRATION 198config MIGRATION
199 bool "Page migration" 199 bool "Page migration"
200 def_bool y 200 def_bool y
201 depends on NUMA || ARCH_ENABLE_MEMORY_HOTREMOVE || COMPACTION 201 depends on NUMA || ARCH_ENABLE_MEMORY_HOTREMOVE || COMPACTION || CMA
202 help 202 help
203 Allows the migration of the physical location of pages of processes 203 Allows the migration of the physical location of pages of processes
204 while the virtual addresses are not changed. This is useful in 204 while the virtual addresses are not changed. This is useful in
@@ -349,6 +349,16 @@ choice
349 benefit. 349 benefit.
350endchoice 350endchoice
351 351
352config CROSS_MEMORY_ATTACH
353 bool "Cross Memory Support"
354 depends on MMU
355 default y
356 help
357 Enabling this option adds the system calls process_vm_readv and
358 process_vm_writev which allow a process with the correct privileges
 359 to directly read from or write to another process's address space.
360 See the man page for more details.
361
352# 362#
353# UP and nommu archs use km based percpu allocator 363# UP and nommu archs use km based percpu allocator
354# 364#
@@ -379,3 +389,20 @@ config CLEANCACHE
379 in a negligible performance hit. 389 in a negligible performance hit.
380 390
381 If unsure, say Y to enable cleancache 391 If unsure, say Y to enable cleancache
392
393config FRONTSWAP
394 bool "Enable frontswap to cache swap pages if tmem is present"
395 depends on SWAP
396 default n
397 help
398 Frontswap is so named because it can be thought of as the opposite
399 of a "backing" store for a swap device. The data is stored into
400 "transcendent memory", memory that is not directly accessible or
401 addressable by the kernel and is of unknown and possibly
402 time-varying size. When space in transcendent memory is available,
403 a significant swap I/O reduction may be achieved. When none is
404 available, all frontswap calls are reduced to a single pointer-
405 compare-against-NULL resulting in a negligible performance hit
406 and swap data is stored as normal on the matching swap device.
407
408 If unsure, say Y to enable frontswap.
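As a concrete illustration of the interface that CROSS_MEMORY_ATTACH enables, the sketch below reads a buffer out of another process's address space with process_vm_readv(). It is not part of this patch: the pid, remote address, and length are assumed to have been obtained elsewhere (for example over a pipe handshake or via ptrace), and the helper name is made up for the example.

#define _GNU_SOURCE
#include <sys/types.h>
#include <sys/uio.h>

/* Copy len bytes starting at remote_addr in process pid into buf.
 * Returns 0 on success, -1 on a short or failed read (errno is set). */
static int peek_remote(pid_t pid, void *remote_addr, void *buf, size_t len)
{
        struct iovec local  = { .iov_base = buf,         .iov_len = len };
        struct iovec remote = { .iov_base = remote_addr, .iov_len = len };

        return process_vm_readv(pid, &local, 1, &remote, 1, 0) == (ssize_t)len ? 0 : -1;
}

process_vm_writev() takes the same arguments with the copy direction reversed; both calls are subject to the ptrace-style permission checks described in the man page.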
diff --git a/mm/Makefile b/mm/Makefile
index 50ec00ef2a0e..2e2fbbefb99f 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -5,15 +5,18 @@
5mmu-y := nommu.o 5mmu-y := nommu.o
6mmu-$(CONFIG_MMU) := fremap.o highmem.o madvise.o memory.o mincore.o \ 6mmu-$(CONFIG_MMU) := fremap.o highmem.o madvise.o memory.o mincore.o \
7 mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \ 7 mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \
8 vmalloc.o pagewalk.o pgtable-generic.o \ 8 vmalloc.o pagewalk.o pgtable-generic.o
9 process_vm_access.o 9
10ifdef CONFIG_CROSS_MEMORY_ATTACH
11mmu-$(CONFIG_MMU) += process_vm_access.o
12endif
10 13
11obj-y := filemap.o mempool.o oom_kill.o fadvise.o \ 14obj-y := filemap.o mempool.o oom_kill.o fadvise.o \
12 maccess.o page_alloc.o page-writeback.o \ 15 maccess.o page_alloc.o page-writeback.o \
13 readahead.o swap.o truncate.o vmscan.o shmem.o \ 16 readahead.o swap.o truncate.o vmscan.o shmem.o \
14 prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \ 17 prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \
15 page_isolation.o mm_init.o mmu_context.o percpu.o \ 18 page_isolation.o mm_init.o mmu_context.o percpu.o \
16 $(mmu-y) 19 compaction.o $(mmu-y)
17obj-y += init-mm.o 20obj-y += init-mm.o
18 21
19ifdef CONFIG_NO_BOOTMEM 22ifdef CONFIG_NO_BOOTMEM
@@ -25,14 +28,14 @@ endif
25obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o 28obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o
26 29
27obj-$(CONFIG_BOUNCE) += bounce.o 30obj-$(CONFIG_BOUNCE) += bounce.o
28obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o thrash.o 31obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o
32obj-$(CONFIG_FRONTSWAP) += frontswap.o
29obj-$(CONFIG_HAS_DMA) += dmapool.o 33obj-$(CONFIG_HAS_DMA) += dmapool.o
30obj-$(CONFIG_HUGETLBFS) += hugetlb.o 34obj-$(CONFIG_HUGETLBFS) += hugetlb.o
31obj-$(CONFIG_NUMA) += mempolicy.o 35obj-$(CONFIG_NUMA) += mempolicy.o
32obj-$(CONFIG_SPARSEMEM) += sparse.o 36obj-$(CONFIG_SPARSEMEM) += sparse.o
33obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o 37obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o
34obj-$(CONFIG_SLOB) += slob.o 38obj-$(CONFIG_SLOB) += slob.o
35obj-$(CONFIG_COMPACTION) += compaction.o
36obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o 39obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o
37obj-$(CONFIG_KSM) += ksm.o 40obj-$(CONFIG_KSM) += ksm.o
38obj-$(CONFIG_PAGE_POISONING) += debug-pagealloc.o 41obj-$(CONFIG_PAGE_POISONING) += debug-pagealloc.o
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 0131170c9d54..bcb63ac48cc5 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -77,16 +77,16 @@ unsigned long __init bootmem_bootmap_pages(unsigned long pages)
77 */ 77 */
78static void __init link_bootmem(bootmem_data_t *bdata) 78static void __init link_bootmem(bootmem_data_t *bdata)
79{ 79{
80 struct list_head *iter; 80 bootmem_data_t *ent;
81 81
82 list_for_each(iter, &bdata_list) { 82 list_for_each_entry(ent, &bdata_list, list) {
83 bootmem_data_t *ent; 83 if (bdata->node_min_pfn < ent->node_min_pfn) {
84 84 list_add_tail(&bdata->list, &ent->list);
85 ent = list_entry(iter, bootmem_data_t, list); 85 return;
86 if (bdata->node_min_pfn < ent->node_min_pfn) 86 }
87 break;
88 } 87 }
89 list_add_tail(&bdata->list, iter); 88
89 list_add_tail(&bdata->list, &bdata_list);
90} 90}
91 91
92/* 92/*
@@ -203,7 +203,8 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
203 } else { 203 } else {
204 unsigned long off = 0; 204 unsigned long off = 0;
205 205
206 while (vec && off < BITS_PER_LONG) { 206 vec >>= start & (BITS_PER_LONG - 1);
207 while (vec) {
207 if (vec & 1) { 208 if (vec & 1) {
208 page = pfn_to_page(start + off); 209 page = pfn_to_page(start + off);
209 __free_pages_bootmem(page, 0); 210 __free_pages_bootmem(page, 0);
@@ -467,7 +468,7 @@ static unsigned long __init align_off(struct bootmem_data *bdata,
467 return ALIGN(base + off, align) - base; 468 return ALIGN(base + off, align) - base;
468} 469}
469 470
470static void * __init alloc_bootmem_core(struct bootmem_data *bdata, 471static void * __init alloc_bootmem_bdata(struct bootmem_data *bdata,
471 unsigned long size, unsigned long align, 472 unsigned long size, unsigned long align,
472 unsigned long goal, unsigned long limit) 473 unsigned long goal, unsigned long limit)
473{ 474{
@@ -588,14 +589,14 @@ static void * __init alloc_arch_preferred_bootmem(bootmem_data_t *bdata,
588 p_bdata = bootmem_arch_preferred_node(bdata, size, align, 589 p_bdata = bootmem_arch_preferred_node(bdata, size, align,
589 goal, limit); 590 goal, limit);
590 if (p_bdata) 591 if (p_bdata)
591 return alloc_bootmem_core(p_bdata, size, align, 592 return alloc_bootmem_bdata(p_bdata, size, align,
592 goal, limit); 593 goal, limit);
593 } 594 }
594#endif 595#endif
595 return NULL; 596 return NULL;
596} 597}
597 598
598static void * __init ___alloc_bootmem_nopanic(unsigned long size, 599static void * __init alloc_bootmem_core(unsigned long size,
599 unsigned long align, 600 unsigned long align,
600 unsigned long goal, 601 unsigned long goal,
601 unsigned long limit) 602 unsigned long limit)
@@ -603,7 +604,6 @@ static void * __init ___alloc_bootmem_nopanic(unsigned long size,
603 bootmem_data_t *bdata; 604 bootmem_data_t *bdata;
604 void *region; 605 void *region;
605 606
606restart:
607 region = alloc_arch_preferred_bootmem(NULL, size, align, goal, limit); 607 region = alloc_arch_preferred_bootmem(NULL, size, align, goal, limit);
608 if (region) 608 if (region)
609 return region; 609 return region;
@@ -614,11 +614,25 @@ restart:
614 if (limit && bdata->node_min_pfn >= PFN_DOWN(limit)) 614 if (limit && bdata->node_min_pfn >= PFN_DOWN(limit))
615 break; 615 break;
616 616
617 region = alloc_bootmem_core(bdata, size, align, goal, limit); 617 region = alloc_bootmem_bdata(bdata, size, align, goal, limit);
618 if (region) 618 if (region)
619 return region; 619 return region;
620 } 620 }
621 621
622 return NULL;
623}
624
625static void * __init ___alloc_bootmem_nopanic(unsigned long size,
626 unsigned long align,
627 unsigned long goal,
628 unsigned long limit)
629{
630 void *ptr;
631
632restart:
633 ptr = alloc_bootmem_core(size, align, goal, limit);
634 if (ptr)
635 return ptr;
622 if (goal) { 636 if (goal) {
623 goal = 0; 637 goal = 0;
624 goto restart; 638 goto restart;
@@ -684,21 +698,60 @@ void * __init __alloc_bootmem(unsigned long size, unsigned long align,
684 return ___alloc_bootmem(size, align, goal, limit); 698 return ___alloc_bootmem(size, align, goal, limit);
685} 699}
686 700
687static void * __init ___alloc_bootmem_node(bootmem_data_t *bdata, 701void * __init ___alloc_bootmem_node_nopanic(pg_data_t *pgdat,
688 unsigned long size, unsigned long align, 702 unsigned long size, unsigned long align,
689 unsigned long goal, unsigned long limit) 703 unsigned long goal, unsigned long limit)
690{ 704{
691 void *ptr; 705 void *ptr;
692 706
693 ptr = alloc_arch_preferred_bootmem(bdata, size, align, goal, limit); 707again:
708 ptr = alloc_arch_preferred_bootmem(pgdat->bdata, size,
709 align, goal, limit);
694 if (ptr) 710 if (ptr)
695 return ptr; 711 return ptr;
696 712
697 ptr = alloc_bootmem_core(bdata, size, align, goal, limit); 713 /* do not panic in alloc_bootmem_bdata() */
714 if (limit && goal + size > limit)
715 limit = 0;
716
717 ptr = alloc_bootmem_bdata(pgdat->bdata, size, align, goal, limit);
698 if (ptr) 718 if (ptr)
699 return ptr; 719 return ptr;
700 720
701 return ___alloc_bootmem(size, align, goal, limit); 721 ptr = alloc_bootmem_core(size, align, goal, limit);
722 if (ptr)
723 return ptr;
724
725 if (goal) {
726 goal = 0;
727 goto again;
728 }
729
730 return NULL;
731}
732
733void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size,
734 unsigned long align, unsigned long goal)
735{
736 if (WARN_ON_ONCE(slab_is_available()))
737 return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
738
739 return ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0);
740}
741
742void * __init ___alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
743 unsigned long align, unsigned long goal,
744 unsigned long limit)
745{
746 void *ptr;
747
748 ptr = ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0);
749 if (ptr)
750 return ptr;
751
752 printk(KERN_ALERT "bootmem alloc of %lu bytes failed!\n", size);
753 panic("Out of memory");
754 return NULL;
702} 755}
703 756
704/** 757/**
@@ -722,7 +775,7 @@ void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
722 if (WARN_ON_ONCE(slab_is_available())) 775 if (WARN_ON_ONCE(slab_is_available()))
723 return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); 776 return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
724 777
725 return ___alloc_bootmem_node(pgdat->bdata, size, align, goal, 0); 778 return ___alloc_bootmem_node(pgdat, size, align, goal, 0);
726} 779}
727 780
728void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size, 781void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size,
@@ -743,7 +796,7 @@ void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size,
743 unsigned long new_goal; 796 unsigned long new_goal;
744 797
745 new_goal = MAX_DMA32_PFN << PAGE_SHIFT; 798 new_goal = MAX_DMA32_PFN << PAGE_SHIFT;
746 ptr = alloc_bootmem_core(pgdat->bdata, size, align, 799 ptr = alloc_bootmem_bdata(pgdat->bdata, size, align,
747 new_goal, 0); 800 new_goal, 0);
748 if (ptr) 801 if (ptr)
749 return ptr; 802 return ptr;
@@ -754,47 +807,6 @@ void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size,
754 807
755} 808}
756 809
757#ifdef CONFIG_SPARSEMEM
758/**
759 * alloc_bootmem_section - allocate boot memory from a specific section
760 * @size: size of the request in bytes
761 * @section_nr: sparse map section to allocate from
762 *
763 * Return NULL on failure.
764 */
765void * __init alloc_bootmem_section(unsigned long size,
766 unsigned long section_nr)
767{
768 bootmem_data_t *bdata;
769 unsigned long pfn, goal;
770
771 pfn = section_nr_to_pfn(section_nr);
772 goal = pfn << PAGE_SHIFT;
773 bdata = &bootmem_node_data[early_pfn_to_nid(pfn)];
774
775 return alloc_bootmem_core(bdata, size, SMP_CACHE_BYTES, goal, 0);
776}
777#endif
778
779void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size,
780 unsigned long align, unsigned long goal)
781{
782 void *ptr;
783
784 if (WARN_ON_ONCE(slab_is_available()))
785 return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
786
787 ptr = alloc_arch_preferred_bootmem(pgdat->bdata, size, align, goal, 0);
788 if (ptr)
789 return ptr;
790
791 ptr = alloc_bootmem_core(pgdat->bdata, size, align, goal, 0);
792 if (ptr)
793 return ptr;
794
795 return __alloc_bootmem_nopanic(size, align, goal);
796}
797
798#ifndef ARCH_LOW_ADDRESS_LIMIT 810#ifndef ARCH_LOW_ADDRESS_LIMIT
799#define ARCH_LOW_ADDRESS_LIMIT 0xffffffffUL 811#define ARCH_LOW_ADDRESS_LIMIT 0xffffffffUL
800#endif 812#endif
@@ -839,6 +851,6 @@ void * __init __alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size,
839 if (WARN_ON_ONCE(slab_is_available())) 851 if (WARN_ON_ONCE(slab_is_available()))
840 return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); 852 return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
841 853
842 return ___alloc_bootmem_node(pgdat->bdata, size, align, 854 return ___alloc_bootmem_node(pgdat, size, align,
843 goal, ARCH_LOW_ADDRESS_LIMIT); 855 goal, ARCH_LOW_ADDRESS_LIMIT);
844} 856}
diff --git a/mm/cleancache.c b/mm/cleancache.c
index 5646c740f613..32e6f4136fa2 100644
--- a/mm/cleancache.c
+++ b/mm/cleancache.c
@@ -80,7 +80,7 @@ EXPORT_SYMBOL(__cleancache_init_shared_fs);
80static int cleancache_get_key(struct inode *inode, 80static int cleancache_get_key(struct inode *inode,
81 struct cleancache_filekey *key) 81 struct cleancache_filekey *key)
82{ 82{
83 int (*fhfn)(struct dentry *, __u32 *fh, int *, int); 83 int (*fhfn)(struct inode *, __u32 *fh, int *, struct inode *);
84 int len = 0, maxlen = CLEANCACHE_KEY_MAX; 84 int len = 0, maxlen = CLEANCACHE_KEY_MAX;
85 struct super_block *sb = inode->i_sb; 85 struct super_block *sb = inode->i_sb;
86 86
@@ -88,9 +88,7 @@ static int cleancache_get_key(struct inode *inode,
88 if (sb->s_export_op != NULL) { 88 if (sb->s_export_op != NULL) {
89 fhfn = sb->s_export_op->encode_fh; 89 fhfn = sb->s_export_op->encode_fh;
90 if (fhfn) { 90 if (fhfn) {
91 struct dentry d; 91 len = (*fhfn)(inode, &key->u.fh[0], &maxlen, NULL);
92 d.d_inode = inode;
93 len = (*fhfn)(&d, &key->u.fh[0], &maxlen, 0);
94 if (len <= 0 || len == 255) 92 if (len <= 0 || len == 255)
95 return -1; 93 return -1;
96 if (maxlen > CLEANCACHE_KEY_MAX) 94 if (maxlen > CLEANCACHE_KEY_MAX)
diff --git a/mm/compaction.c b/mm/compaction.c
index 74a8c825ff28..2f42d9528539 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -16,30 +16,11 @@
16#include <linux/sysfs.h> 16#include <linux/sysfs.h>
17#include "internal.h" 17#include "internal.h"
18 18
19#if defined CONFIG_COMPACTION || defined CONFIG_CMA
20
19#define CREATE_TRACE_POINTS 21#define CREATE_TRACE_POINTS
20#include <trace/events/compaction.h> 22#include <trace/events/compaction.h>
21 23
22/*
23 * compact_control is used to track pages being migrated and the free pages
24 * they are being migrated to during memory compaction. The free_pfn starts
25 * at the end of a zone and migrate_pfn begins at the start. Movable pages
26 * are moved to the end of a zone during a compaction run and the run
27 * completes when free_pfn <= migrate_pfn
28 */
29struct compact_control {
30 struct list_head freepages; /* List of free pages to migrate to */
31 struct list_head migratepages; /* List of pages being migrated */
32 unsigned long nr_freepages; /* Number of isolated free pages */
33 unsigned long nr_migratepages; /* Number of pages to migrate */
34 unsigned long free_pfn; /* isolate_freepages search base */
35 unsigned long migrate_pfn; /* isolate_migratepages search base */
36 bool sync; /* Synchronous migration */
37
38 int order; /* order a direct compactor needs */
39 int migratetype; /* MOVABLE, RECLAIMABLE etc */
40 struct zone *zone;
41};
42
43static unsigned long release_freepages(struct list_head *freelist) 24static unsigned long release_freepages(struct list_head *freelist)
44{ 25{
45 struct page *page, *next; 26 struct page *page, *next;
@@ -54,24 +35,35 @@ static unsigned long release_freepages(struct list_head *freelist)
54 return count; 35 return count;
55} 36}
56 37
57/* Isolate free pages onto a private freelist. Must hold zone->lock */ 38static void map_pages(struct list_head *list)
58static unsigned long isolate_freepages_block(struct zone *zone, 39{
59 unsigned long blockpfn, 40 struct page *page;
60 struct list_head *freelist) 41
42 list_for_each_entry(page, list, lru) {
43 arch_alloc_page(page, 0);
44 kernel_map_pages(page, 1, 1);
45 }
46}
47
48static inline bool migrate_async_suitable(int migratetype)
49{
50 return is_migrate_cma(migratetype) || migratetype == MIGRATE_MOVABLE;
51}
52
53/*
54 * Isolate free pages onto a private freelist. Caller must hold zone->lock.
55 * If @strict is true, will abort returning 0 on any invalid PFNs or non-free
56 * pages inside of the pageblock (even though it may still end up isolating
57 * some pages).
58 */
59static unsigned long isolate_freepages_block(unsigned long blockpfn,
60 unsigned long end_pfn,
61 struct list_head *freelist,
62 bool strict)
61{ 63{
62 unsigned long zone_end_pfn, end_pfn;
63 int nr_scanned = 0, total_isolated = 0; 64 int nr_scanned = 0, total_isolated = 0;
64 struct page *cursor; 65 struct page *cursor;
65 66
66 /* Get the last PFN we should scan for free pages at */
67 zone_end_pfn = zone->zone_start_pfn + zone->spanned_pages;
68 end_pfn = min(blockpfn + pageblock_nr_pages, zone_end_pfn);
69
70 /* Find the first usable PFN in the block to initialse page cursor */
71 for (; blockpfn < end_pfn; blockpfn++) {
72 if (pfn_valid_within(blockpfn))
73 break;
74 }
75 cursor = pfn_to_page(blockpfn); 67 cursor = pfn_to_page(blockpfn);
76 68
77 /* Isolate free pages. This assumes the block is valid */ 69 /* Isolate free pages. This assumes the block is valid */
@@ -79,15 +71,23 @@ static unsigned long isolate_freepages_block(struct zone *zone,
79 int isolated, i; 71 int isolated, i;
80 struct page *page = cursor; 72 struct page *page = cursor;
81 73
82 if (!pfn_valid_within(blockpfn)) 74 if (!pfn_valid_within(blockpfn)) {
75 if (strict)
76 return 0;
83 continue; 77 continue;
78 }
84 nr_scanned++; 79 nr_scanned++;
85 80
86 if (!PageBuddy(page)) 81 if (!PageBuddy(page)) {
82 if (strict)
83 return 0;
87 continue; 84 continue;
85 }
88 86
89 /* Found a free page, break it into order-0 pages */ 87 /* Found a free page, break it into order-0 pages */
90 isolated = split_free_page(page); 88 isolated = split_free_page(page);
89 if (!isolated && strict)
90 return 0;
91 total_isolated += isolated; 91 total_isolated += isolated;
92 for (i = 0; i < isolated; i++) { 92 for (i = 0; i < isolated; i++) {
93 list_add(&page->lru, freelist); 93 list_add(&page->lru, freelist);
@@ -105,114 +105,71 @@ static unsigned long isolate_freepages_block(struct zone *zone,
105 return total_isolated; 105 return total_isolated;
106} 106}
107 107
108/* Returns true if the page is within a block suitable for migration to */ 108/**
109static bool suitable_migration_target(struct page *page) 109 * isolate_freepages_range() - isolate free pages.
110{ 110 * @start_pfn: The first PFN to start isolating.
111 111 * @end_pfn: The one-past-last PFN.
112 int migratetype = get_pageblock_migratetype(page); 112 *
113 113 * Non-free pages, invalid PFNs, or zone boundaries within the
 114 /* Don't interfere with memory hot-remove or the min_free_kbytes blocks */ 114 * [start_pfn, end_pfn) range are considered errors and cause the function to
115 if (migratetype == MIGRATE_ISOLATE || migratetype == MIGRATE_RESERVE) 115 * undo its actions and return zero.
116 return false; 116 *
 117 117 * Otherwise, the function returns the one-past-the-last PFN of the isolated page
 118 /* If the page is a large free page, then allow migration */ 118 * (which may be greater than end_pfn if the end fell in the middle of
119 if (PageBuddy(page) && page_order(page) >= pageblock_order) 119 * a free page).
120 return true;
121
122 /* If the block is MIGRATE_MOVABLE, allow migration */
123 if (migratetype == MIGRATE_MOVABLE)
124 return true;
125
126 /* Otherwise skip the block */
127 return false;
128}
129
130/*
131 * Based on information in the current compact_control, find blocks
132 * suitable for isolating free pages from and then isolate them.
133 */ 120 */
134static void isolate_freepages(struct zone *zone, 121unsigned long
135 struct compact_control *cc) 122isolate_freepages_range(unsigned long start_pfn, unsigned long end_pfn)
136{ 123{
137 struct page *page; 124 unsigned long isolated, pfn, block_end_pfn, flags;
138 unsigned long high_pfn, low_pfn, pfn; 125 struct zone *zone = NULL;
139 unsigned long flags; 126 LIST_HEAD(freelist);
140 int nr_freepages = cc->nr_freepages;
141 struct list_head *freelist = &cc->freepages;
142
143 /*
144 * Initialise the free scanner. The starting point is where we last
145 * scanned from (or the end of the zone if starting). The low point
146 * is the end of the pageblock the migration scanner is using.
147 */
148 pfn = cc->free_pfn;
149 low_pfn = cc->migrate_pfn + pageblock_nr_pages;
150 127
151 /* 128 if (pfn_valid(start_pfn))
152 * Take care that if the migration scanner is at the end of the zone 129 zone = page_zone(pfn_to_page(start_pfn));
153 * that the free scanner does not accidentally move to the next zone
154 * in the next isolation cycle.
155 */
156 high_pfn = min(low_pfn, pfn);
157 130
158 /* 131 for (pfn = start_pfn; pfn < end_pfn; pfn += isolated) {
159 * Isolate free pages until enough are available to migrate the 132 if (!pfn_valid(pfn) || zone != page_zone(pfn_to_page(pfn)))
160 * pages on cc->migratepages. We stop searching if the migrate 133 break;
161 * and free page scanners meet or enough free pages are isolated.
162 */
163 for (; pfn > low_pfn && cc->nr_migratepages > nr_freepages;
164 pfn -= pageblock_nr_pages) {
165 unsigned long isolated;
166
167 if (!pfn_valid(pfn))
168 continue;
169 134
170 /* 135 /*
171 * Check for overlapping nodes/zones. It's possible on some 136 * On subsequent iterations ALIGN() is actually not needed,
172 * configurations to have a setup like 137 * but we keep it that we not to complicate the code.
173 * node0 node1 node0
174 * i.e. it's possible that all pages within a zones range of
175 * pages do not belong to a single zone.
176 */ 138 */
177 page = pfn_to_page(pfn); 139 block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages);
178 if (page_zone(page) != zone) 140 block_end_pfn = min(block_end_pfn, end_pfn);
179 continue;
180 141
181 /* Check the block is suitable for migration */ 142 spin_lock_irqsave(&zone->lock, flags);
182 if (!suitable_migration_target(page)) 143 isolated = isolate_freepages_block(pfn, block_end_pfn,
183 continue; 144 &freelist, true);
145 spin_unlock_irqrestore(&zone->lock, flags);
184 146
185 /* 147 /*
186 * Found a block suitable for isolating free pages from. Now 148 * In strict mode, isolate_freepages_block() returns 0 if
187 * we disabled interrupts, double check things are ok and 149 * there are any holes in the block (ie. invalid PFNs or
188 * isolate the pages. This is to minimise the time IRQs 150 * non-free pages).
189 * are disabled
190 */ 151 */
191 isolated = 0; 152 if (!isolated)
192 spin_lock_irqsave(&zone->lock, flags); 153 break;
193 if (suitable_migration_target(page)) {
194 isolated = isolate_freepages_block(zone, pfn, freelist);
195 nr_freepages += isolated;
196 }
197 spin_unlock_irqrestore(&zone->lock, flags);
198 154
199 /* 155 /*
200 * Record the highest PFN we isolated pages from. When next 156 * If we managed to isolate pages, it is always (1 << n) *
201 * looking for free pages, the search will restart here as 157 * pageblock_nr_pages for some non-negative n. (Max order
202 * page migration may have returned some pages to the allocator 158 * page may span two pageblocks).
203 */ 159 */
204 if (isolated)
205 high_pfn = max(high_pfn, pfn);
206 } 160 }
207 161
208 /* split_free_page does not map the pages */ 162 /* split_free_page does not map the pages */
209 list_for_each_entry(page, freelist, lru) { 163 map_pages(&freelist);
210 arch_alloc_page(page, 0); 164
211 kernel_map_pages(page, 1, 1); 165 if (pfn < end_pfn) {
166 /* Loop terminated early, cleanup. */
167 release_freepages(&freelist);
168 return 0;
212 } 169 }
213 170
214 cc->free_pfn = high_pfn; 171 /* We don't use freelists for anything. */
215 cc->nr_freepages = nr_freepages; 172 return pfn;
216} 173}
217 174
218/* Update the number of anon and file isolated pages in the zone */ 175/* Update the number of anon and file isolated pages in the zone */
@@ -243,37 +200,34 @@ static bool too_many_isolated(struct zone *zone)
243 return isolated > (inactive + active) / 2; 200 return isolated > (inactive + active) / 2;
244} 201}
245 202
246/* possible outcome of isolate_migratepages */ 203/**
247typedef enum { 204 * isolate_migratepages_range() - isolate all migrate-able pages in range.
248 ISOLATE_ABORT, /* Abort compaction now */ 205 * @zone: Zone pages are in.
249 ISOLATE_NONE, /* No pages isolated, continue scanning */ 206 * @cc: Compaction control structure.
250 ISOLATE_SUCCESS, /* Pages isolated, migrate */ 207 * @low_pfn: The first PFN of the range.
251} isolate_migrate_t; 208 * @end_pfn: The one-past-the-last PFN of the range.
252 209 *
253/* 210 * Isolate all pages that can be migrated from the range specified by
254 * Isolate all pages that can be migrated from the block pointed to by 211 * [low_pfn, end_pfn). Returns zero if there is a fatal signal
255 * the migrate scanner within compact_control. 212 * pending), otherwise PFN of the first page that was not scanned
213 * (which may be both less, equal to or more then end_pfn).
214 *
215 * Assumes that cc->migratepages is empty and cc->nr_migratepages is
216 * zero.
217 *
 218 * Apart from cc->migratepages and cc->nr_migratepages this function
 219 * does not modify any of cc's fields; in particular it does not modify
220 * (or read for that matter) cc->migrate_pfn.
256 */ 221 */
257static isolate_migrate_t isolate_migratepages(struct zone *zone, 222unsigned long
258 struct compact_control *cc) 223isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
224 unsigned long low_pfn, unsigned long end_pfn)
259{ 225{
260 unsigned long low_pfn, end_pfn;
261 unsigned long last_pageblock_nr = 0, pageblock_nr; 226 unsigned long last_pageblock_nr = 0, pageblock_nr;
262 unsigned long nr_scanned = 0, nr_isolated = 0; 227 unsigned long nr_scanned = 0, nr_isolated = 0;
263 struct list_head *migratelist = &cc->migratepages; 228 struct list_head *migratelist = &cc->migratepages;
264 isolate_mode_t mode = ISOLATE_ACTIVE|ISOLATE_INACTIVE; 229 isolate_mode_t mode = 0;
265 230 struct lruvec *lruvec;
266 /* Do not scan outside zone boundaries */
267 low_pfn = max(cc->migrate_pfn, zone->zone_start_pfn);
268
269 /* Only scan within a pageblock boundary */
270 end_pfn = ALIGN(low_pfn + pageblock_nr_pages, pageblock_nr_pages);
271
272 /* Do not cross the free scanner or scan within a memory hole */
273 if (end_pfn > cc->free_pfn || !pfn_valid(low_pfn)) {
274 cc->migrate_pfn = end_pfn;
275 return ISOLATE_NONE;
276 }
277 231
278 /* 232 /*
279 * Ensure that there are not too many pages isolated from the LRU 233 * Ensure that there are not too many pages isolated from the LRU
@@ -283,12 +237,12 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
283 while (unlikely(too_many_isolated(zone))) { 237 while (unlikely(too_many_isolated(zone))) {
284 /* async migration should just abort */ 238 /* async migration should just abort */
285 if (!cc->sync) 239 if (!cc->sync)
286 return ISOLATE_ABORT; 240 return 0;
287 241
288 congestion_wait(BLK_RW_ASYNC, HZ/10); 242 congestion_wait(BLK_RW_ASYNC, HZ/10);
289 243
290 if (fatal_signal_pending(current)) 244 if (fatal_signal_pending(current))
291 return ISOLATE_ABORT; 245 return 0;
292 } 246 }
293 247
294 /* Time to isolate some pages for migration */ 248 /* Time to isolate some pages for migration */
@@ -351,7 +305,7 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
351 */ 305 */
352 pageblock_nr = low_pfn >> pageblock_order; 306 pageblock_nr = low_pfn >> pageblock_order;
353 if (!cc->sync && last_pageblock_nr != pageblock_nr && 307 if (!cc->sync && last_pageblock_nr != pageblock_nr &&
354 get_pageblock_migratetype(page) != MIGRATE_MOVABLE) { 308 !migrate_async_suitable(get_pageblock_migratetype(page))) {
355 low_pfn += pageblock_nr_pages; 309 low_pfn += pageblock_nr_pages;
356 low_pfn = ALIGN(low_pfn, pageblock_nr_pages) - 1; 310 low_pfn = ALIGN(low_pfn, pageblock_nr_pages) - 1;
357 last_pageblock_nr = pageblock_nr; 311 last_pageblock_nr = pageblock_nr;
@@ -374,14 +328,16 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
374 if (!cc->sync) 328 if (!cc->sync)
375 mode |= ISOLATE_ASYNC_MIGRATE; 329 mode |= ISOLATE_ASYNC_MIGRATE;
376 330
331 lruvec = mem_cgroup_page_lruvec(page, zone);
332
377 /* Try isolate the page */ 333 /* Try isolate the page */
378 if (__isolate_lru_page(page, mode, 0) != 0) 334 if (__isolate_lru_page(page, mode) != 0)
379 continue; 335 continue;
380 336
381 VM_BUG_ON(PageTransCompound(page)); 337 VM_BUG_ON(PageTransCompound(page));
382 338
383 /* Successfully isolated */ 339 /* Successfully isolated */
384 del_page_from_lru_list(zone, page, page_lru(page)); 340 del_page_from_lru_list(page, lruvec, page_lru(page));
385 list_add(&page->lru, migratelist); 341 list_add(&page->lru, migratelist);
386 cc->nr_migratepages++; 342 cc->nr_migratepages++;
387 nr_isolated++; 343 nr_isolated++;
@@ -396,11 +352,124 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
396 acct_isolated(zone, cc); 352 acct_isolated(zone, cc);
397 353
398 spin_unlock_irq(&zone->lru_lock); 354 spin_unlock_irq(&zone->lru_lock);
399 cc->migrate_pfn = low_pfn;
400 355
401 trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated); 356 trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated);
402 357
403 return ISOLATE_SUCCESS; 358 return low_pfn;
359}
360
361#endif /* CONFIG_COMPACTION || CONFIG_CMA */
362#ifdef CONFIG_COMPACTION
363
364/* Returns true if the page is within a block suitable for migration to */
365static bool suitable_migration_target(struct page *page)
366{
367
368 int migratetype = get_pageblock_migratetype(page);
369
370 /* Don't interfere with memory hot-remove or the min_free_kbytes blocks */
371 if (migratetype == MIGRATE_ISOLATE || migratetype == MIGRATE_RESERVE)
372 return false;
373
374 /* If the page is a large free page, then allow migration */
375 if (PageBuddy(page) && page_order(page) >= pageblock_order)
376 return true;
377
378 /* If the block is MIGRATE_MOVABLE or MIGRATE_CMA, allow migration */
379 if (migrate_async_suitable(migratetype))
380 return true;
381
382 /* Otherwise skip the block */
383 return false;
384}
385
386/*
387 * Based on information in the current compact_control, find blocks
388 * suitable for isolating free pages from and then isolate them.
389 */
390static void isolate_freepages(struct zone *zone,
391 struct compact_control *cc)
392{
393 struct page *page;
394 unsigned long high_pfn, low_pfn, pfn, zone_end_pfn, end_pfn;
395 unsigned long flags;
396 int nr_freepages = cc->nr_freepages;
397 struct list_head *freelist = &cc->freepages;
398
399 /*
400 * Initialise the free scanner. The starting point is where we last
401 * scanned from (or the end of the zone if starting). The low point
402 * is the end of the pageblock the migration scanner is using.
403 */
404 pfn = cc->free_pfn;
405 low_pfn = cc->migrate_pfn + pageblock_nr_pages;
406
407 /*
408 * Take care that if the migration scanner is at the end of the zone
409 * that the free scanner does not accidentally move to the next zone
410 * in the next isolation cycle.
411 */
412 high_pfn = min(low_pfn, pfn);
413
414 zone_end_pfn = zone->zone_start_pfn + zone->spanned_pages;
415
416 /*
417 * Isolate free pages until enough are available to migrate the
418 * pages on cc->migratepages. We stop searching if the migrate
419 * and free page scanners meet or enough free pages are isolated.
420 */
421 for (; pfn > low_pfn && cc->nr_migratepages > nr_freepages;
422 pfn -= pageblock_nr_pages) {
423 unsigned long isolated;
424
425 if (!pfn_valid(pfn))
426 continue;
427
428 /*
429 * Check for overlapping nodes/zones. It's possible on some
430 * configurations to have a setup like
431 * node0 node1 node0
432 * i.e. it's possible that all pages within a zones range of
433 * pages do not belong to a single zone.
434 */
435 page = pfn_to_page(pfn);
436 if (page_zone(page) != zone)
437 continue;
438
439 /* Check the block is suitable for migration */
440 if (!suitable_migration_target(page))
441 continue;
442
443 /*
444 * Found a block suitable for isolating free pages from. Now
445 * we disabled interrupts, double check things are ok and
446 * isolate the pages. This is to minimise the time IRQs
447 * are disabled
448 */
449 isolated = 0;
450 spin_lock_irqsave(&zone->lock, flags);
451 if (suitable_migration_target(page)) {
452 end_pfn = min(pfn + pageblock_nr_pages, zone_end_pfn);
453 isolated = isolate_freepages_block(pfn, end_pfn,
454 freelist, false);
455 nr_freepages += isolated;
456 }
457 spin_unlock_irqrestore(&zone->lock, flags);
458
459 /*
460 * Record the highest PFN we isolated pages from. When next
461 * looking for free pages, the search will restart here as
462 * page migration may have returned some pages to the allocator
463 */
464 if (isolated)
465 high_pfn = max(high_pfn, pfn);
466 }
467
468 /* split_free_page does not map the pages */
469 map_pages(freelist);
470
471 cc->free_pfn = high_pfn;
472 cc->nr_freepages = nr_freepages;
404} 473}
405 474
406/* 475/*
@@ -449,6 +518,44 @@ static void update_nr_listpages(struct compact_control *cc)
449 cc->nr_freepages = nr_freepages; 518 cc->nr_freepages = nr_freepages;
450} 519}
451 520
521/* possible outcome of isolate_migratepages */
522typedef enum {
523 ISOLATE_ABORT, /* Abort compaction now */
524 ISOLATE_NONE, /* No pages isolated, continue scanning */
525 ISOLATE_SUCCESS, /* Pages isolated, migrate */
526} isolate_migrate_t;
527
528/*
529 * Isolate all pages that can be migrated from the block pointed to by
530 * the migrate scanner within compact_control.
531 */
532static isolate_migrate_t isolate_migratepages(struct zone *zone,
533 struct compact_control *cc)
534{
535 unsigned long low_pfn, end_pfn;
536
537 /* Do not scan outside zone boundaries */
538 low_pfn = max(cc->migrate_pfn, zone->zone_start_pfn);
539
540 /* Only scan within a pageblock boundary */
541 end_pfn = ALIGN(low_pfn + pageblock_nr_pages, pageblock_nr_pages);
542
543 /* Do not cross the free scanner or scan within a memory hole */
544 if (end_pfn > cc->free_pfn || !pfn_valid(low_pfn)) {
545 cc->migrate_pfn = end_pfn;
546 return ISOLATE_NONE;
547 }
548
549 /* Perform the isolation */
550 low_pfn = isolate_migratepages_range(zone, cc, low_pfn, end_pfn);
551 if (!low_pfn)
552 return ISOLATE_ABORT;
553
554 cc->migrate_pfn = low_pfn;
555
556 return ISOLATE_SUCCESS;
557}
558
452static int compact_finished(struct zone *zone, 559static int compact_finished(struct zone *zone,
453 struct compact_control *cc) 560 struct compact_control *cc)
454{ 561{
@@ -594,8 +701,11 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
594 if (err) { 701 if (err) {
595 putback_lru_pages(&cc->migratepages); 702 putback_lru_pages(&cc->migratepages);
596 cc->nr_migratepages = 0; 703 cc->nr_migratepages = 0;
704 if (err == -ENOMEM) {
705 ret = COMPACT_PARTIAL;
706 goto out;
707 }
597 } 708 }
598
599 } 709 }
600 710
601out: 711out:
@@ -795,3 +905,5 @@ void compaction_unregister_node(struct node *node)
795 return device_remove_file(&node->dev, &dev_attr_compact); 905 return device_remove_file(&node->dev, &dev_attr_compact);
796} 906}
797#endif /* CONFIG_SYSFS && CONFIG_NUMA */ 907#endif /* CONFIG_SYSFS && CONFIG_NUMA */
908
909#endif /* CONFIG_COMPACTION */
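Because isolate_migratepages_range() and isolate_freepages_range() are now built for CONFIG_CMA as well as CONFIG_COMPACTION, a range allocator can call them directly. The fragment below is only a sketch of such a caller, not code from this series: it assumes compact_control is now visible via mm/internal.h, uses the field names from the comment removed above, and leaves out the migration and accounting that a real user such as CMA's alloc_contig_range() performs between the two steps.

static int isolate_range_sketch(struct zone *zone,
                                unsigned long start_pfn, unsigned long end_pfn)
{
        struct compact_control cc = {
                .nr_migratepages = 0,
                .zone = zone,
                .sync = true,
        };
        unsigned long pfn;

        INIT_LIST_HEAD(&cc.migratepages);

        /* Pull every movable page in the range onto cc.migratepages. */
        pfn = isolate_migratepages_range(zone, &cc, start_pfn, end_pfn);
        if (!pfn)
                return -EINTR;          /* a fatal signal was pending */

        /* ... migrate and free the isolated pages here (omitted) ... */

        /* Then claim the free pages; 0 means the range still contained
         * holes or pages that were not free. */
        if (!isolate_freepages_range(start_pfn, end_pfn))
                return -EBUSY;

        return 0;
}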
diff --git a/mm/filemap.c b/mm/filemap.c
index 79c4b2b0b14e..a4a5260b0279 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -29,7 +29,6 @@
29#include <linux/pagevec.h> 29#include <linux/pagevec.h>
30#include <linux/blkdev.h> 30#include <linux/blkdev.h>
31#include <linux/security.h> 31#include <linux/security.h>
32#include <linux/syscalls.h>
33#include <linux/cpuset.h> 32#include <linux/cpuset.h>
34#include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */ 33#include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */
35#include <linux/memcontrol.h> 34#include <linux/memcontrol.h>
@@ -1478,44 +1477,6 @@ out:
1478} 1477}
1479EXPORT_SYMBOL(generic_file_aio_read); 1478EXPORT_SYMBOL(generic_file_aio_read);
1480 1479
1481static ssize_t
1482do_readahead(struct address_space *mapping, struct file *filp,
1483 pgoff_t index, unsigned long nr)
1484{
1485 if (!mapping || !mapping->a_ops || !mapping->a_ops->readpage)
1486 return -EINVAL;
1487
1488 force_page_cache_readahead(mapping, filp, index, nr);
1489 return 0;
1490}
1491
1492SYSCALL_DEFINE(readahead)(int fd, loff_t offset, size_t count)
1493{
1494 ssize_t ret;
1495 struct file *file;
1496
1497 ret = -EBADF;
1498 file = fget(fd);
1499 if (file) {
1500 if (file->f_mode & FMODE_READ) {
1501 struct address_space *mapping = file->f_mapping;
1502 pgoff_t start = offset >> PAGE_CACHE_SHIFT;
1503 pgoff_t end = (offset + count - 1) >> PAGE_CACHE_SHIFT;
1504 unsigned long len = end - start + 1;
1505 ret = do_readahead(mapping, file, start, len);
1506 }
1507 fput(file);
1508 }
1509 return ret;
1510}
1511#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS
1512asmlinkage long SyS_readahead(long fd, loff_t offset, long count)
1513{
1514 return SYSC_readahead((int) fd, offset, (size_t) count);
1515}
1516SYSCALL_ALIAS(sys_readahead, SyS_readahead);
1517#endif
1518
1519#ifdef CONFIG_MMU 1480#ifdef CONFIG_MMU
1520/** 1481/**
1521 * page_cache_read - adds requested page to the page cache if not already there 1482 * page_cache_read - adds requested page to the page cache if not already there
@@ -1938,71 +1899,6 @@ struct page *read_cache_page(struct address_space *mapping,
1938} 1899}
1939EXPORT_SYMBOL(read_cache_page); 1900EXPORT_SYMBOL(read_cache_page);
1940 1901
1941/*
1942 * The logic we want is
1943 *
1944 * if suid or (sgid and xgrp)
1945 * remove privs
1946 */
1947int should_remove_suid(struct dentry *dentry)
1948{
1949 umode_t mode = dentry->d_inode->i_mode;
1950 int kill = 0;
1951
1952 /* suid always must be killed */
1953 if (unlikely(mode & S_ISUID))
1954 kill = ATTR_KILL_SUID;
1955
1956 /*
1957 * sgid without any exec bits is just a mandatory locking mark; leave
1958 * it alone. If some exec bits are set, it's a real sgid; kill it.
1959 */
1960 if (unlikely((mode & S_ISGID) && (mode & S_IXGRP)))
1961 kill |= ATTR_KILL_SGID;
1962
1963 if (unlikely(kill && !capable(CAP_FSETID) && S_ISREG(mode)))
1964 return kill;
1965
1966 return 0;
1967}
1968EXPORT_SYMBOL(should_remove_suid);
1969
1970static int __remove_suid(struct dentry *dentry, int kill)
1971{
1972 struct iattr newattrs;
1973
1974 newattrs.ia_valid = ATTR_FORCE | kill;
1975 return notify_change(dentry, &newattrs);
1976}
1977
1978int file_remove_suid(struct file *file)
1979{
1980 struct dentry *dentry = file->f_path.dentry;
1981 struct inode *inode = dentry->d_inode;
1982 int killsuid;
1983 int killpriv;
1984 int error = 0;
1985
1986 /* Fast path for nothing security related */
1987 if (IS_NOSEC(inode))
1988 return 0;
1989
1990 killsuid = should_remove_suid(dentry);
1991 killpriv = security_inode_need_killpriv(dentry);
1992
1993 if (killpriv < 0)
1994 return killpriv;
1995 if (killpriv)
1996 error = security_inode_killpriv(dentry);
1997 if (!error && killsuid)
1998 error = __remove_suid(dentry, killsuid);
1999 if (!error && (inode->i_sb->s_flags & MS_NOSEC))
2000 inode->i_flags |= S_NOSEC;
2001
2002 return error;
2003}
2004EXPORT_SYMBOL(file_remove_suid);
2005
2006static size_t __iovec_copy_from_user_inatomic(char *vaddr, 1902static size_t __iovec_copy_from_user_inatomic(char *vaddr,
2007 const struct iovec *iov, size_t base, size_t bytes) 1903 const struct iovec *iov, size_t base, size_t bytes)
2008{ 1904{
@@ -2528,7 +2424,9 @@ ssize_t __generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
2528 if (err) 2424 if (err)
2529 goto out; 2425 goto out;
2530 2426
2531 file_update_time(file); 2427 err = file_update_time(file);
2428 if (err)
2429 goto out;
2532 2430
2533 /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */ 2431 /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */
2534 if (unlikely(file->f_flags & O_DIRECT)) { 2432 if (unlikely(file->f_flags & O_DIRECT)) {
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
index a4eb31132229..213ca1f53409 100644
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
@@ -426,7 +426,9 @@ xip_file_write(struct file *filp, const char __user *buf, size_t len,
426 if (ret) 426 if (ret)
427 goto out_backing; 427 goto out_backing;
428 428
429 file_update_time(filp); 429 ret = file_update_time(filp);
430 if (ret)
431 goto out_backing;
430 432
431 ret = __xip_file_write (filp, buf, count, pos, ppos); 433 ret = __xip_file_write (filp, buf, count, pos, ppos);
432 434
diff --git a/mm/frontswap.c b/mm/frontswap.c
new file mode 100644
index 000000000000..e25025574a02
--- /dev/null
+++ b/mm/frontswap.c
@@ -0,0 +1,314 @@
1/*
2 * Frontswap frontend
3 *
4 * This code provides the generic "frontend" layer to call a matching
5 * "backend" driver implementation of frontswap. See
6 * Documentation/vm/frontswap.txt for more information.
7 *
8 * Copyright (C) 2009-2012 Oracle Corp. All rights reserved.
9 * Author: Dan Magenheimer
10 *
11 * This work is licensed under the terms of the GNU GPL, version 2.
12 */
13
14#include <linux/mm.h>
15#include <linux/mman.h>
16#include <linux/swap.h>
17#include <linux/swapops.h>
18#include <linux/proc_fs.h>
19#include <linux/security.h>
20#include <linux/capability.h>
21#include <linux/module.h>
22#include <linux/uaccess.h>
23#include <linux/debugfs.h>
24#include <linux/frontswap.h>
25#include <linux/swapfile.h>
26
27/*
28 * frontswap_ops is set by frontswap_register_ops to contain the pointers
29 * to the frontswap "backend" implementation functions.
30 */
31static struct frontswap_ops frontswap_ops __read_mostly;
32
33/*
34 * This global enablement flag reduces overhead on systems where frontswap_ops
35 * has not been registered, so is preferred to the slower alternative: a
36 * function call that checks a non-global.
37 */
38bool frontswap_enabled __read_mostly;
39EXPORT_SYMBOL(frontswap_enabled);
40
41/*
42 * If enabled, frontswap_store will return failure even on success. As
43 * a result, the swap subsystem will always write the page to swap, in
44 * effect converting frontswap into a writethrough cache. In this mode,
45 * there is no direct reduction in swap writes, but a frontswap backend
46 * can unilaterally "reclaim" any pages in use with no data loss, thus
 47 * providing increased control over maximum memory usage due to frontswap.
48 */
49static bool frontswap_writethrough_enabled __read_mostly;
50
51#ifdef CONFIG_DEBUG_FS
52/*
53 * Counters available via /sys/kernel/debug/frontswap (if debugfs is
54 * properly configured). These are for information only so are not protected
55 * against increment races.
56 */
57static u64 frontswap_loads;
58static u64 frontswap_succ_stores;
59static u64 frontswap_failed_stores;
60static u64 frontswap_invalidates;
61
62static inline void inc_frontswap_loads(void) {
63 frontswap_loads++;
64}
65static inline void inc_frontswap_succ_stores(void) {
66 frontswap_succ_stores++;
67}
68static inline void inc_frontswap_failed_stores(void) {
69 frontswap_failed_stores++;
70}
71static inline void inc_frontswap_invalidates(void) {
72 frontswap_invalidates++;
73}
74#else
75static inline void inc_frontswap_loads(void) { }
76static inline void inc_frontswap_succ_stores(void) { }
77static inline void inc_frontswap_failed_stores(void) { }
78static inline void inc_frontswap_invalidates(void) { }
79#endif
80/*
81 * Register operations for frontswap, returning previous thus allowing
82 * detection of multiple backends and possible nesting.
83 */
84struct frontswap_ops frontswap_register_ops(struct frontswap_ops *ops)
85{
86 struct frontswap_ops old = frontswap_ops;
87
88 frontswap_ops = *ops;
89 frontswap_enabled = true;
90 return old;
91}
92EXPORT_SYMBOL(frontswap_register_ops);
93
94/*
95 * Enable/disable frontswap writethrough (see above).
96 */
97void frontswap_writethrough(bool enable)
98{
99 frontswap_writethrough_enabled = enable;
100}
101EXPORT_SYMBOL(frontswap_writethrough);
102
103/*
104 * Called when a swap device is swapon'd.
105 */
106void __frontswap_init(unsigned type)
107{
108 struct swap_info_struct *sis = swap_info[type];
109
110 BUG_ON(sis == NULL);
111 if (sis->frontswap_map == NULL)
112 return;
113 if (frontswap_enabled)
114 (*frontswap_ops.init)(type);
115}
116EXPORT_SYMBOL(__frontswap_init);
117
118/*
119 * "Store" data from a page to frontswap and associate it with the page's
120 * swaptype and offset. Page must be locked and in the swap cache.
121 * If frontswap already contains a page with matching swaptype and
 122 * offset, the frontswap implementation may either overwrite the data and
123 * return success or invalidate the page from frontswap and return failure.
124 */
125int __frontswap_store(struct page *page)
126{
127 int ret = -1, dup = 0;
128 swp_entry_t entry = { .val = page_private(page), };
129 int type = swp_type(entry);
130 struct swap_info_struct *sis = swap_info[type];
131 pgoff_t offset = swp_offset(entry);
132
133 BUG_ON(!PageLocked(page));
134 BUG_ON(sis == NULL);
135 if (frontswap_test(sis, offset))
136 dup = 1;
137 ret = (*frontswap_ops.store)(type, offset, page);
138 if (ret == 0) {
139 frontswap_set(sis, offset);
140 inc_frontswap_succ_stores();
141 if (!dup)
142 atomic_inc(&sis->frontswap_pages);
143 } else if (dup) {
144 /*
145 failed dup always results in automatic invalidate of
146 the (older) page from frontswap
147 */
148 frontswap_clear(sis, offset);
149 atomic_dec(&sis->frontswap_pages);
150 inc_frontswap_failed_stores();
151 } else
152 inc_frontswap_failed_stores();
153 if (frontswap_writethrough_enabled)
154 /* report failure so swap also writes to swap device */
155 ret = -1;
156 return ret;
157}
158EXPORT_SYMBOL(__frontswap_store);
159
160/*
161 * "Get" data from frontswap associated with swaptype and offset that were
162 * specified when the data was put to frontswap and use it to fill the
163 * specified page with data. Page must be locked and in the swap cache.
164 */
165int __frontswap_load(struct page *page)
166{
167 int ret = -1;
168 swp_entry_t entry = { .val = page_private(page), };
169 int type = swp_type(entry);
170 struct swap_info_struct *sis = swap_info[type];
171 pgoff_t offset = swp_offset(entry);
172
173 BUG_ON(!PageLocked(page));
174 BUG_ON(sis == NULL);
175 if (frontswap_test(sis, offset))
176 ret = (*frontswap_ops.load)(type, offset, page);
177 if (ret == 0)
178 inc_frontswap_loads();
179 return ret;
180}
181EXPORT_SYMBOL(__frontswap_load);
182
183/*
184 * Invalidate any data from frontswap associated with the specified swaptype
185 * and offset so that a subsequent "get" will fail.
186 */
187void __frontswap_invalidate_page(unsigned type, pgoff_t offset)
188{
189 struct swap_info_struct *sis = swap_info[type];
190
191 BUG_ON(sis == NULL);
192 if (frontswap_test(sis, offset)) {
193 (*frontswap_ops.invalidate_page)(type, offset);
194 atomic_dec(&sis->frontswap_pages);
195 frontswap_clear(sis, offset);
196 inc_frontswap_invalidates();
197 }
198}
199EXPORT_SYMBOL(__frontswap_invalidate_page);
200
201/*
202 * Invalidate all data from frontswap associated with all offsets for the
203 * specified swaptype.
204 */
205void __frontswap_invalidate_area(unsigned type)
206{
207 struct swap_info_struct *sis = swap_info[type];
208
209 BUG_ON(sis == NULL);
210 if (sis->frontswap_map == NULL)
211 return;
212 (*frontswap_ops.invalidate_area)(type);
213 atomic_set(&sis->frontswap_pages, 0);
214 memset(sis->frontswap_map, 0, sis->max / sizeof(long));
215}
216EXPORT_SYMBOL(__frontswap_invalidate_area);
217
218/*
219 * Frontswap, like a true swap device, may unnecessarily retain pages
220 * under certain circumstances; "shrink" frontswap is essentially a
221 * "partial swapoff" and works by calling try_to_unuse to attempt to
222 * unuse enough frontswap pages to attempt to -- subject to memory
223 * constraints -- reduce the number of pages in frontswap to the
224 * number given in the parameter target_pages.
225 */
226void frontswap_shrink(unsigned long target_pages)
227{
228 struct swap_info_struct *si = NULL;
229 int si_frontswap_pages;
230 unsigned long total_pages = 0, total_pages_to_unuse;
231 unsigned long pages = 0, pages_to_unuse = 0;
232 int type;
233 bool locked = false;
234
235 /*
236 * we don't want to hold swap_lock while doing a very
237 * lengthy try_to_unuse, but swap_list may change
238 * so restart scan from swap_list.head each time
239 */
240 spin_lock(&swap_lock);
241 locked = true;
242 total_pages = 0;
243 for (type = swap_list.head; type >= 0; type = si->next) {
244 si = swap_info[type];
245 total_pages += atomic_read(&si->frontswap_pages);
246 }
247 if (total_pages <= target_pages)
248 goto out;
249 total_pages_to_unuse = total_pages - target_pages;
250 for (type = swap_list.head; type >= 0; type = si->next) {
251 si = swap_info[type];
252 si_frontswap_pages = atomic_read(&si->frontswap_pages);
253 if (total_pages_to_unuse < si_frontswap_pages)
254 pages = pages_to_unuse = total_pages_to_unuse;
255 else {
256 pages = si_frontswap_pages;
257 pages_to_unuse = 0; /* unuse all */
258 }
259 /* ensure there is enough RAM to fetch pages from frontswap */
260 if (security_vm_enough_memory_mm(current->mm, pages))
261 continue;
262 vm_unacct_memory(pages);
263 break;
264 }
265 if (type < 0)
266 goto out;
267 locked = false;
268 spin_unlock(&swap_lock);
269 try_to_unuse(type, true, pages_to_unuse);
270out:
271 if (locked)
272 spin_unlock(&swap_lock);
273 return;
274}
275EXPORT_SYMBOL(frontswap_shrink);
276
277/*
278 * Count and return the number of frontswap pages across all
279 * swap devices. This is exported so that backend drivers can
280 * determine current usage without reading debugfs.
281 */
282unsigned long frontswap_curr_pages(void)
283{
284 int type;
285 unsigned long totalpages = 0;
286 struct swap_info_struct *si = NULL;
287
288 spin_lock(&swap_lock);
289 for (type = swap_list.head; type >= 0; type = si->next) {
290 si = swap_info[type];
291 totalpages += atomic_read(&si->frontswap_pages);
292 }
293 spin_unlock(&swap_lock);
294 return totalpages;
295}
296EXPORT_SYMBOL(frontswap_curr_pages);
297
298static int __init init_frontswap(void)
299{
300#ifdef CONFIG_DEBUG_FS
301 struct dentry *root = debugfs_create_dir("frontswap", NULL);
302 if (root == NULL)
303 return -ENXIO;
304 debugfs_create_u64("loads", S_IRUGO, root, &frontswap_loads);
305 debugfs_create_u64("succ_stores", S_IRUGO, root, &frontswap_succ_stores);
306 debugfs_create_u64("failed_stores", S_IRUGO, root,
307 &frontswap_failed_stores);
308 debugfs_create_u64("invalidates", S_IRUGO,
309 root, &frontswap_invalidates);
310#endif
311 return 0;
312}
313
314module_init(init_frontswap);
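To make the frontend/backend split concrete, here is a hedged sketch of the smallest possible backend, with member signatures inferred from the calls made in this file; it is not part of the patch. It refuses every store, so the swap path simply falls back to the real swap device, whereas a real backend (zcache, or the Xen tmem driver) would move the page contents into transcendent memory instead.

#include <linux/frontswap.h>
#include <linux/mm.h>
#include <linux/module.h>

static void null_init(unsigned type) { }

static int null_store(unsigned type, pgoff_t offset, struct page *page)
{
        return -1;      /* refuse: the page is written to the swap device as usual */
}

static int null_load(unsigned type, pgoff_t offset, struct page *page)
{
        return -1;      /* nothing was ever stored, so nothing to load */
}

static void null_invalidate_page(unsigned type, pgoff_t offset) { }
static void null_invalidate_area(unsigned type) { }

static struct frontswap_ops null_ops = {
        .init            = null_init,
        .store           = null_store,
        .load            = null_load,
        .invalidate_page = null_invalidate_page,
        .invalidate_area = null_invalidate_area,
};

static struct frontswap_ops prev_ops;

static int __init null_frontswap_init(void)
{
        /* The returned ops let a backend detect, and chain to, a predecessor. */
        prev_ops = frontswap_register_ops(&null_ops);
        return 0;
}
module_init(null_frontswap_init);

The net effect is the same as what frontswap_writethrough(true) forces even for stores that succeed: every page still reaches the backing swap device.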
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index f0e5306eeb55..57c4b9309015 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -636,16 +636,12 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
636 unsigned long haddr, pmd_t *pmd, 636 unsigned long haddr, pmd_t *pmd,
637 struct page *page) 637 struct page *page)
638{ 638{
639 int ret = 0;
640 pgtable_t pgtable; 639 pgtable_t pgtable;
641 640
642 VM_BUG_ON(!PageCompound(page)); 641 VM_BUG_ON(!PageCompound(page));
643 pgtable = pte_alloc_one(mm, haddr); 642 pgtable = pte_alloc_one(mm, haddr);
644 if (unlikely(!pgtable)) { 643 if (unlikely(!pgtable))
645 mem_cgroup_uncharge_page(page);
646 put_page(page);
647 return VM_FAULT_OOM; 644 return VM_FAULT_OOM;
648 }
649 645
650 clear_huge_page(page, haddr, HPAGE_PMD_NR); 646 clear_huge_page(page, haddr, HPAGE_PMD_NR);
651 __SetPageUptodate(page); 647 __SetPageUptodate(page);
@@ -675,7 +671,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
675 spin_unlock(&mm->page_table_lock); 671 spin_unlock(&mm->page_table_lock);
676 } 672 }
677 673
678 return ret; 674 return 0;
679} 675}
680 676
681static inline gfp_t alloc_hugepage_gfpmask(int defrag, gfp_t extra_gfp) 677static inline gfp_t alloc_hugepage_gfpmask(int defrag, gfp_t extra_gfp)
@@ -724,8 +720,14 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
724 put_page(page); 720 put_page(page);
725 goto out; 721 goto out;
726 } 722 }
723 if (unlikely(__do_huge_pmd_anonymous_page(mm, vma, haddr, pmd,
724 page))) {
725 mem_cgroup_uncharge_page(page);
726 put_page(page);
727 goto out;
728 }
727 729
728 return __do_huge_pmd_anonymous_page(mm, vma, haddr, pmd, page); 730 return 0;
729 } 731 }
730out: 732out:
731 /* 733 /*
@@ -950,6 +952,8 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
950 count_vm_event(THP_FAULT_FALLBACK); 952 count_vm_event(THP_FAULT_FALLBACK);
951 ret = do_huge_pmd_wp_page_fallback(mm, vma, address, 953 ret = do_huge_pmd_wp_page_fallback(mm, vma, address,
952 pmd, orig_pmd, page, haddr); 954 pmd, orig_pmd, page, haddr);
955 if (ret & VM_FAULT_OOM)
956 split_huge_page(page);
953 put_page(page); 957 put_page(page);
954 goto out; 958 goto out;
955 } 959 }
@@ -957,6 +961,7 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
957 961
958 if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) { 962 if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) {
959 put_page(new_page); 963 put_page(new_page);
964 split_huge_page(page);
960 put_page(page); 965 put_page(page);
961 ret |= VM_FAULT_OOM; 966 ret |= VM_FAULT_OOM;
962 goto out; 967 goto out;
@@ -968,8 +973,10 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
968 spin_lock(&mm->page_table_lock); 973 spin_lock(&mm->page_table_lock);
969 put_page(page); 974 put_page(page);
970 if (unlikely(!pmd_same(*pmd, orig_pmd))) { 975 if (unlikely(!pmd_same(*pmd, orig_pmd))) {
976 spin_unlock(&mm->page_table_lock);
971 mem_cgroup_uncharge_page(new_page); 977 mem_cgroup_uncharge_page(new_page);
972 put_page(new_page); 978 put_page(new_page);
979 goto out;
973 } else { 980 } else {
974 pmd_t entry; 981 pmd_t entry;
975 VM_BUG_ON(!PageHead(page)); 982 VM_BUG_ON(!PageHead(page));
@@ -1224,10 +1231,13 @@ static void __split_huge_page_refcount(struct page *page)
1224{ 1231{
1225 int i; 1232 int i;
1226 struct zone *zone = page_zone(page); 1233 struct zone *zone = page_zone(page);
1234 struct lruvec *lruvec;
1227 int tail_count = 0; 1235 int tail_count = 0;
1228 1236
1229 /* prevent PageLRU to go away from under us, and freeze lru stats */ 1237 /* prevent PageLRU to go away from under us, and freeze lru stats */
1230 spin_lock_irq(&zone->lru_lock); 1238 spin_lock_irq(&zone->lru_lock);
1239 lruvec = mem_cgroup_page_lruvec(page, zone);
1240
1231 compound_lock(page); 1241 compound_lock(page);
1232 /* complete memcg works before add pages to LRU */ 1242 /* complete memcg works before add pages to LRU */
1233 mem_cgroup_split_huge_fixup(page); 1243 mem_cgroup_split_huge_fixup(page);
@@ -1302,13 +1312,12 @@ static void __split_huge_page_refcount(struct page *page)
1302 BUG_ON(!PageDirty(page_tail)); 1312 BUG_ON(!PageDirty(page_tail));
1303 BUG_ON(!PageSwapBacked(page_tail)); 1313 BUG_ON(!PageSwapBacked(page_tail));
1304 1314
1305 1315 lru_add_page_tail(page, page_tail, lruvec);
1306 lru_add_page_tail(zone, page, page_tail);
1307 } 1316 }
1308 atomic_sub(tail_count, &page->_count); 1317 atomic_sub(tail_count, &page->_count);
1309 BUG_ON(atomic_read(&page->_count) <= 0); 1318 BUG_ON(atomic_read(&page->_count) <= 0);
1310 1319
1311 __dec_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES); 1320 __mod_zone_page_state(zone, NR_ANON_TRANSPARENT_HUGEPAGES, -1);
1312 __mod_zone_page_state(zone, NR_ANON_PAGES, HPAGE_PMD_NR); 1321 __mod_zone_page_state(zone, NR_ANON_PAGES, HPAGE_PMD_NR);
1313 1322
1314 ClearPageCompound(page); 1323 ClearPageCompound(page);
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index ae8f708e3d75..e198831276a3 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -273,8 +273,8 @@ static long region_count(struct list_head *head, long f, long t)
273 273
274 /* Locate each segment we overlap with, and count that overlap. */ 274 /* Locate each segment we overlap with, and count that overlap. */
275 list_for_each_entry(rg, head, link) { 275 list_for_each_entry(rg, head, link) {
276 int seg_from; 276 long seg_from;
277 int seg_to; 277 long seg_to;
278 278
279 if (rg->to <= f) 279 if (rg->to <= f)
280 continue; 280 continue;
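
The int to long change above is a truncation fix: the region bounds rg->from and rg->to are longs, and narrowing them into int silently drops the high bits once an offset exceeds the int range, which corrupts the overlap count. A minimal standalone illustration of that hazard (plain userspace C, not kernel code; the concrete value is only an example):

#include <stdio.h>

int main(void)
{
	/* Assumes an LP64 target (64-bit long), as in the hugetlb case. */
	long rg_to = 0x100000001L;	/* region bound above the int range */
	int seg_to_int = rg_to;		/* old code: high bits silently dropped
					 * (implementation-defined, typically 1) */
	long seg_to_long = rg_to;	/* new code: value preserved */

	printf("stored in an int:  %d\n", seg_to_int);
	printf("stored in a long:  %ld\n", seg_to_long);
	return 0;
}
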
@@ -2157,6 +2157,15 @@ static void hugetlb_vm_op_open(struct vm_area_struct *vma)
2157 kref_get(&reservations->refs); 2157 kref_get(&reservations->refs);
2158} 2158}
2159 2159
2160static void resv_map_put(struct vm_area_struct *vma)
2161{
2162 struct resv_map *reservations = vma_resv_map(vma);
2163
2164 if (!reservations)
2165 return;
2166 kref_put(&reservations->refs, resv_map_release);
2167}
2168
2160static void hugetlb_vm_op_close(struct vm_area_struct *vma) 2169static void hugetlb_vm_op_close(struct vm_area_struct *vma)
2161{ 2170{
2162 struct hstate *h = hstate_vma(vma); 2171 struct hstate *h = hstate_vma(vma);
@@ -2173,7 +2182,7 @@ static void hugetlb_vm_op_close(struct vm_area_struct *vma)
2173 reserve = (end - start) - 2182 reserve = (end - start) -
2174 region_count(&reservations->regions, start, end); 2183 region_count(&reservations->regions, start, end);
2175 2184
2176 kref_put(&reservations->refs, resv_map_release); 2185 resv_map_put(vma);
2177 2186
2178 if (reserve) { 2187 if (reserve) {
2179 hugetlb_acct_memory(h, -reserve); 2188 hugetlb_acct_memory(h, -reserve);
@@ -2213,6 +2222,7 @@ static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page,
2213 } 2222 }
2214 entry = pte_mkyoung(entry); 2223 entry = pte_mkyoung(entry);
2215 entry = pte_mkhuge(entry); 2224 entry = pte_mkhuge(entry);
2225 entry = arch_make_huge_pte(entry, vma, page, writable);
2216 2226
2217 return entry; 2227 return entry;
2218} 2228}
@@ -2990,12 +3000,16 @@ int hugetlb_reserve_pages(struct inode *inode,
2990 set_vma_resv_flags(vma, HPAGE_RESV_OWNER); 3000 set_vma_resv_flags(vma, HPAGE_RESV_OWNER);
2991 } 3001 }
2992 3002
2993 if (chg < 0) 3003 if (chg < 0) {
2994 return chg; 3004 ret = chg;
3005 goto out_err;
3006 }
2995 3007
2996 /* There must be enough pages in the subpool for the mapping */ 3008 /* There must be enough pages in the subpool for the mapping */
2997 if (hugepage_subpool_get_pages(spool, chg)) 3009 if (hugepage_subpool_get_pages(spool, chg)) {
2998 return -ENOSPC; 3010 ret = -ENOSPC;
3011 goto out_err;
3012 }
2999 3013
3000 /* 3014 /*
3001 * Check enough hugepages are available for the reservation. 3015 * Check enough hugepages are available for the reservation.
@@ -3004,7 +3018,7 @@ int hugetlb_reserve_pages(struct inode *inode,
3004 ret = hugetlb_acct_memory(h, chg); 3018 ret = hugetlb_acct_memory(h, chg);
3005 if (ret < 0) { 3019 if (ret < 0) {
3006 hugepage_subpool_put_pages(spool, chg); 3020 hugepage_subpool_put_pages(spool, chg);
3007 return ret; 3021 goto out_err;
3008 } 3022 }
3009 3023
3010 /* 3024 /*
@@ -3021,6 +3035,10 @@ int hugetlb_reserve_pages(struct inode *inode,
3021 if (!vma || vma->vm_flags & VM_MAYSHARE) 3035 if (!vma || vma->vm_flags & VM_MAYSHARE)
3022 region_add(&inode->i_mapping->private_list, from, to); 3036 region_add(&inode->i_mapping->private_list, from, to);
3023 return 0; 3037 return 0;
3038out_err:
3039 if (vma)
3040 resv_map_put(vma);
3041 return ret;
3024} 3042}
3025 3043
3026void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed) 3044void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
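
resv_map_put() above simply centralises the kref_put() so that hugetlb_vm_op_close() and the new out_err path in hugetlb_reserve_pages() drop the reservation map the same way instead of leaking it on error. A rough userspace model of that reference-counting contract (hypothetical names, not the kernel kref API):

#include <stdio.h>
#include <stdlib.h>

/* Stand-in for the kref pattern: the last put runs the release
 * function, so every exit path, success or the out_err label,
 * must drop exactly one reference. */
struct resv_map_model {
	int refs;
};

static void resv_map_model_release(struct resv_map_model *map)
{
	printf("releasing reservation map\n");
	free(map);
}

static void resv_map_model_put(struct resv_map_model *map)
{
	if (map && --map->refs == 0)
		resv_map_model_release(map);
}

int main(void)
{
	struct resv_map_model *map = malloc(sizeof(*map));

	map->refs = 1;
	/* ... reservation setup fails ... */
	resv_map_model_put(map);	/* the out_err path: no leak */
	return 0;
}
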
diff --git a/mm/internal.h b/mm/internal.h
index 2189af491783..2ba87fbfb75b 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -100,6 +100,39 @@ extern void prep_compound_page(struct page *page, unsigned long order);
100extern bool is_free_buddy_page(struct page *page); 100extern bool is_free_buddy_page(struct page *page);
101#endif 101#endif
102 102
103#if defined CONFIG_COMPACTION || defined CONFIG_CMA
104
105/*
106 * in mm/compaction.c
107 */
108/*
109 * compact_control is used to track pages being migrated and the free pages
110 * they are being migrated to during memory compaction. The free_pfn starts
111 * at the end of a zone and migrate_pfn begins at the start. Movable pages
112 * are moved to the end of a zone during a compaction run and the run
113 * completes when free_pfn <= migrate_pfn
114 */
115struct compact_control {
116 struct list_head freepages; /* List of free pages to migrate to */
117 struct list_head migratepages; /* List of pages being migrated */
118 unsigned long nr_freepages; /* Number of isolated free pages */
119 unsigned long nr_migratepages; /* Number of pages to migrate */
120 unsigned long free_pfn; /* isolate_freepages search base */
121 unsigned long migrate_pfn; /* isolate_migratepages search base */
122 bool sync; /* Synchronous migration */
123
124 int order; /* order a direct compactor needs */
125 int migratetype; /* MOVABLE, RECLAIMABLE etc */
126 struct zone *zone;
127};
128
129unsigned long
130isolate_freepages_range(unsigned long start_pfn, unsigned long end_pfn);
131unsigned long
132isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
133 unsigned long low_pfn, unsigned long end_pfn);
134
135#endif
103 136
104/* 137/*
105 * function for dealing with page's order in buddy system. 138 * function for dealing with page's order in buddy system.
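
The compact_control comment above describes two scanners converging: migrate_pfn walks up from the start of the zone collecting movable pages, free_pfn walks down from the end collecting free targets, and the run completes once free_pfn <= migrate_pfn. A small userspace simulation of that convergence rule (the step size is arbitrary and purely illustrative):

#include <stdio.h>

#define ZONE_START_PFN	0UL
#define ZONE_END_PFN	1024UL

int main(void)
{
	unsigned long migrate_pfn = ZONE_START_PFN;	/* scans upward   */
	unsigned long free_pfn    = ZONE_END_PFN;	/* scans downward */
	unsigned long rounds = 0;

	/* Each round the migrate scanner advances over a block of pages
	 * and the free scanner retreats over another block, mirroring
	 * the free_pfn <= migrate_pfn termination rule. */
	while (free_pfn > migrate_pfn) {
		migrate_pfn += 32;	/* block of migrate scanning */
		free_pfn    -= 32;	/* block of free scanning    */
		rounds++;
	}
	printf("scanners met after %lu rounds at pfn %lu\n", rounds, migrate_pfn);
	return 0;
}
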
@@ -131,7 +164,8 @@ static inline void munlock_vma_pages_all(struct vm_area_struct *vma)
131 * to determine if it's being mapped into a LOCKED vma. 164 * to determine if it's being mapped into a LOCKED vma.
132 * If so, mark page as mlocked. 165 * If so, mark page as mlocked.
133 */ 166 */
134static inline int is_mlocked_vma(struct vm_area_struct *vma, struct page *page) 167static inline int mlocked_vma_newpage(struct vm_area_struct *vma,
168 struct page *page)
135{ 169{
136 VM_BUG_ON(PageLRU(page)); 170 VM_BUG_ON(PageLRU(page));
137 171
@@ -189,7 +223,7 @@ extern unsigned long vma_address(struct page *page,
189 struct vm_area_struct *vma); 223 struct vm_area_struct *vma);
190#endif 224#endif
191#else /* !CONFIG_MMU */ 225#else /* !CONFIG_MMU */
192static inline int is_mlocked_vma(struct vm_area_struct *v, struct page *p) 226static inline int mlocked_vma_newpage(struct vm_area_struct *v, struct page *p)
193{ 227{
194 return 0; 228 return 0;
195} 229}
@@ -309,3 +343,7 @@ extern u64 hwpoison_filter_flags_mask;
309extern u64 hwpoison_filter_flags_value; 343extern u64 hwpoison_filter_flags_value;
310extern u64 hwpoison_filter_memcg; 344extern u64 hwpoison_filter_memcg;
311extern u32 hwpoison_filter_enable; 345extern u32 hwpoison_filter_enable;
346
347extern unsigned long vm_mmap_pgoff(struct file *, unsigned long,
348 unsigned long, unsigned long,
349 unsigned long, unsigned long);
diff --git a/mm/madvise.c b/mm/madvise.c
index 1ccbba5b6674..14d260fa0d17 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -11,8 +11,11 @@
11#include <linux/mempolicy.h> 11#include <linux/mempolicy.h>
12#include <linux/page-isolation.h> 12#include <linux/page-isolation.h>
13#include <linux/hugetlb.h> 13#include <linux/hugetlb.h>
14#include <linux/falloc.h>
14#include <linux/sched.h> 15#include <linux/sched.h>
15#include <linux/ksm.h> 16#include <linux/ksm.h>
17#include <linux/fs.h>
18#include <linux/file.h>
16 19
17/* 20/*
18 * Any behaviour which results in changes to the vma->vm_flags needs to 21 * Any behaviour which results in changes to the vma->vm_flags needs to
@@ -200,33 +203,39 @@ static long madvise_remove(struct vm_area_struct *vma,
200 struct vm_area_struct **prev, 203 struct vm_area_struct **prev,
201 unsigned long start, unsigned long end) 204 unsigned long start, unsigned long end)
202{ 205{
203 struct address_space *mapping; 206 loff_t offset;
204 loff_t offset, endoff;
205 int error; 207 int error;
208 struct file *f;
206 209
207 *prev = NULL; /* tell sys_madvise we drop mmap_sem */ 210 *prev = NULL; /* tell sys_madvise we drop mmap_sem */
208 211
209 if (vma->vm_flags & (VM_LOCKED|VM_NONLINEAR|VM_HUGETLB)) 212 if (vma->vm_flags & (VM_LOCKED|VM_NONLINEAR|VM_HUGETLB))
210 return -EINVAL; 213 return -EINVAL;
211 214
212 if (!vma->vm_file || !vma->vm_file->f_mapping 215 f = vma->vm_file;
213 || !vma->vm_file->f_mapping->host) { 216
217 if (!f || !f->f_mapping || !f->f_mapping->host) {
214 return -EINVAL; 218 return -EINVAL;
215 } 219 }
216 220
217 if ((vma->vm_flags & (VM_SHARED|VM_WRITE)) != (VM_SHARED|VM_WRITE)) 221 if ((vma->vm_flags & (VM_SHARED|VM_WRITE)) != (VM_SHARED|VM_WRITE))
218 return -EACCES; 222 return -EACCES;
219 223
220 mapping = vma->vm_file->f_mapping;
221
222 offset = (loff_t)(start - vma->vm_start) 224 offset = (loff_t)(start - vma->vm_start)
223 + ((loff_t)vma->vm_pgoff << PAGE_SHIFT); 225 + ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
224 endoff = (loff_t)(end - vma->vm_start - 1)
225 + ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
226 226
227 /* vmtruncate_range needs to take i_mutex */ 227 /*
228 * Filesystem's fallocate may need to take i_mutex. We need to
229 * explicitly grab a reference because the vma (and hence the
230 * vma's reference to the file) can go away as soon as we drop
231 * mmap_sem.
232 */
233 get_file(f);
228 up_read(&current->mm->mmap_sem); 234 up_read(&current->mm->mmap_sem);
229 error = vmtruncate_range(mapping->host, offset, endoff); 235 error = do_fallocate(f,
236 FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
237 offset, end - start);
238 fput(f);
230 down_read(&current->mm->mmap_sem); 239 down_read(&current->mm->mmap_sem);
231 return error; 240 return error;
232} 241}
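
After the madvise_remove() rewrite above, MADV_REMOVE on a shared writable file mapping is forwarded to the filesystem's hole-punching fallocate rather than vmtruncate_range(). A minimal userspace demonstration, assuming a 4 KiB page size and a filesystem that implements FALLOC_FL_PUNCH_HOLE (tmpfs, mounted at /dev/shm here, does):

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	const size_t len = 4096 * 4;
	int fd = open("/dev/shm/madv_remove_demo", O_RDWR | O_CREAT | O_TRUNC, 0600);

	if (fd < 0 || ftruncate(fd, len))
		return 1;
	char *map = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (map == MAP_FAILED)
		return 1;
	memset(map, 'x', len);

	/* Punch out the second page; the kernel services this via
	 * do_fallocate(FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE). */
	if (madvise(map + 4096, 4096, MADV_REMOVE))
		perror("madvise(MADV_REMOVE)");
	else
		printf("hole punched, page now reads back as %d\n", map[4096]);

	munmap(map, len);
	close(fd);
	unlink("/dev/shm/madv_remove_demo");
	return 0;
}
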
diff --git a/mm/memblock.c b/mm/memblock.c
index a44eab3157f8..5cc6731b00cc 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -37,6 +37,8 @@ struct memblock memblock __initdata_memblock = {
37 37
38int memblock_debug __initdata_memblock; 38int memblock_debug __initdata_memblock;
39static int memblock_can_resize __initdata_memblock; 39static int memblock_can_resize __initdata_memblock;
40static int memblock_memory_in_slab __initdata_memblock = 0;
41static int memblock_reserved_in_slab __initdata_memblock = 0;
40 42
41/* inline so we don't get a warning when pr_debug is compiled out */ 43/* inline so we don't get a warning when pr_debug is compiled out */
42static inline const char *memblock_type_name(struct memblock_type *type) 44static inline const char *memblock_type_name(struct memblock_type *type)
@@ -141,30 +143,6 @@ phys_addr_t __init_memblock memblock_find_in_range(phys_addr_t start,
141 MAX_NUMNODES); 143 MAX_NUMNODES);
142} 144}
143 145
144/*
145 * Free memblock.reserved.regions
146 */
147int __init_memblock memblock_free_reserved_regions(void)
148{
149 if (memblock.reserved.regions == memblock_reserved_init_regions)
150 return 0;
151
152 return memblock_free(__pa(memblock.reserved.regions),
153 sizeof(struct memblock_region) * memblock.reserved.max);
154}
155
156/*
157 * Reserve memblock.reserved.regions
158 */
159int __init_memblock memblock_reserve_reserved_regions(void)
160{
161 if (memblock.reserved.regions == memblock_reserved_init_regions)
162 return 0;
163
164 return memblock_reserve(__pa(memblock.reserved.regions),
165 sizeof(struct memblock_region) * memblock.reserved.max);
166}
167
168static void __init_memblock memblock_remove_region(struct memblock_type *type, unsigned long r) 146static void __init_memblock memblock_remove_region(struct memblock_type *type, unsigned long r)
169{ 147{
170 type->total_size -= type->regions[r].size; 148 type->total_size -= type->regions[r].size;
@@ -182,11 +160,42 @@ static void __init_memblock memblock_remove_region(struct memblock_type *type, u
182 } 160 }
183} 161}
184 162
185static int __init_memblock memblock_double_array(struct memblock_type *type) 163phys_addr_t __init_memblock get_allocated_memblock_reserved_regions_info(
164 phys_addr_t *addr)
165{
166 if (memblock.reserved.regions == memblock_reserved_init_regions)
167 return 0;
168
169 *addr = __pa(memblock.reserved.regions);
170
171 return PAGE_ALIGN(sizeof(struct memblock_region) *
172 memblock.reserved.max);
173}
174
175/**
176 * memblock_double_array - double the size of the memblock regions array
177 * @type: memblock type of the regions array being doubled
178 * @new_area_start: starting address of memory range to avoid overlap with
179 * @new_area_size: size of memory range to avoid overlap with
180 *
181 * Double the size of the @type regions array. If memblock is being used to
182 * allocate memory for a new reserved regions array and there is a previously
183 * allocated memory range [@new_area_start,@new_area_start+@new_area_size]
184 * waiting to be reserved, ensure the memory used by the new array does
185 * not overlap.
186 *
187 * RETURNS:
188 * 0 on success, -1 on failure.
189 */
190static int __init_memblock memblock_double_array(struct memblock_type *type,
191 phys_addr_t new_area_start,
192 phys_addr_t new_area_size)
186{ 193{
187 struct memblock_region *new_array, *old_array; 194 struct memblock_region *new_array, *old_array;
195 phys_addr_t old_alloc_size, new_alloc_size;
188 phys_addr_t old_size, new_size, addr; 196 phys_addr_t old_size, new_size, addr;
189 int use_slab = slab_is_available(); 197 int use_slab = slab_is_available();
198 int *in_slab;
190 199
191 /* We don't allow resizing until we know about the reserved regions 200 /* We don't allow resizing until we know about the reserved regions
192 * of memory that aren't suitable for allocation 201 * of memory that aren't suitable for allocation
@@ -197,6 +206,18 @@ static int __init_memblock memblock_double_array(struct memblock_type *type)
197 /* Calculate new doubled size */ 206 /* Calculate new doubled size */
198 old_size = type->max * sizeof(struct memblock_region); 207 old_size = type->max * sizeof(struct memblock_region);
199 new_size = old_size << 1; 208 new_size = old_size << 1;
209 /*
210 * We need to allocate the new array aligned to PAGE_SIZE,
211 * so we can free it completely later.
212 */
213 old_alloc_size = PAGE_ALIGN(old_size);
214 new_alloc_size = PAGE_ALIGN(new_size);
215
216 /* Retrieve the slab flag */
217 if (type == &memblock.memory)
218 in_slab = &memblock_memory_in_slab;
219 else
220 in_slab = &memblock_reserved_in_slab;
200 221
201 /* Try to find some space for it. 222 /* Try to find some space for it.
202 * 223 *
@@ -212,14 +233,26 @@ static int __init_memblock memblock_double_array(struct memblock_type *type)
212 if (use_slab) { 233 if (use_slab) {
213 new_array = kmalloc(new_size, GFP_KERNEL); 234 new_array = kmalloc(new_size, GFP_KERNEL);
214 addr = new_array ? __pa(new_array) : 0; 235 addr = new_array ? __pa(new_array) : 0;
215 } else 236 } else {
216 addr = memblock_find_in_range(0, MEMBLOCK_ALLOC_ACCESSIBLE, new_size, sizeof(phys_addr_t)); 237 /* only exclude range when trying to double reserved.regions */
238 if (type != &memblock.reserved)
239 new_area_start = new_area_size = 0;
240
241 addr = memblock_find_in_range(new_area_start + new_area_size,
242 memblock.current_limit,
243 new_alloc_size, PAGE_SIZE);
244 if (!addr && new_area_size)
245 addr = memblock_find_in_range(0,
246 min(new_area_start, memblock.current_limit),
247 new_alloc_size, PAGE_SIZE);
248
249 new_array = addr ? __va(addr) : 0;
250 }
217 if (!addr) { 251 if (!addr) {
218 pr_err("memblock: Failed to double %s array from %ld to %ld entries !\n", 252 pr_err("memblock: Failed to double %s array from %ld to %ld entries !\n",
219 memblock_type_name(type), type->max, type->max * 2); 253 memblock_type_name(type), type->max, type->max * 2);
220 return -1; 254 return -1;
221 } 255 }
222 new_array = __va(addr);
223 256
224 memblock_dbg("memblock: %s array is doubled to %ld at [%#010llx-%#010llx]", 257 memblock_dbg("memblock: %s array is doubled to %ld at [%#010llx-%#010llx]",
225 memblock_type_name(type), type->max * 2, (u64)addr, (u64)addr + new_size - 1); 258 memblock_type_name(type), type->max * 2, (u64)addr, (u64)addr + new_size - 1);
@@ -234,21 +267,23 @@ static int __init_memblock memblock_double_array(struct memblock_type *type)
234 type->regions = new_array; 267 type->regions = new_array;
235 type->max <<= 1; 268 type->max <<= 1;
236 269
237 /* If we use SLAB that's it, we are done */ 270 /* Free old array. We needn't free it if the array is the
238 if (use_slab) 271 * static one
239 return 0;
240
241 /* Add the new reserved region now. Should not fail ! */
242 BUG_ON(memblock_reserve(addr, new_size));
243
244 /* If the array wasn't our static init one, then free it. We only do
245 * that before SLAB is available as later on, we don't know whether
246 * to use kfree or free_bootmem_pages(). Shouldn't be a big deal
247 * anyways
248 */ 272 */
249 if (old_array != memblock_memory_init_regions && 273 if (*in_slab)
250 old_array != memblock_reserved_init_regions) 274 kfree(old_array);
251 memblock_free(__pa(old_array), old_size); 275 else if (old_array != memblock_memory_init_regions &&
276 old_array != memblock_reserved_init_regions)
277 memblock_free(__pa(old_array), old_alloc_size);
278
279 /* Reserve the new array if that comes from the memblock.
280 * Otherwise, we needn't do it
281 */
282 if (!use_slab)
283 BUG_ON(memblock_reserve(addr, new_alloc_size));
284
285 /* Update slab flag */
286 *in_slab = use_slab;
252 287
253 return 0; 288 return 0;
254} 289}
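
The new old_alloc_size/new_alloc_size above round both arrays up to whole pages so the replaced array can later be handed back to the page allocator in one piece. A quick worked example of that arithmetic (userspace C; PAGE_ALIGN and the region struct are re-declared locally just for the calculation, and 128 entries is only an illustrative starting size):

#include <stdio.h>

#define PAGE_SIZE	4096UL
#define PAGE_ALIGN(x)	(((x) + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1))

/* Stand-in sized like a three-word region descriptor on 64-bit. */
struct memblock_region_model {
	unsigned long base, size, flags;
};

int main(void)
{
	unsigned long max = 128;	/* e.g. the static init array size */
	unsigned long old_size = max * sizeof(struct memblock_region_model);
	unsigned long new_size = old_size << 1;

	printf("old_size=%lu -> old_alloc_size=%lu\n", old_size, PAGE_ALIGN(old_size));
	printf("new_size=%lu -> new_alloc_size=%lu\n", new_size, PAGE_ALIGN(new_size));
	return 0;
}
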
@@ -387,7 +422,7 @@ repeat:
387 */ 422 */
388 if (!insert) { 423 if (!insert) {
389 while (type->cnt + nr_new > type->max) 424 while (type->cnt + nr_new > type->max)
390 if (memblock_double_array(type) < 0) 425 if (memblock_double_array(type, obase, size) < 0)
391 return -ENOMEM; 426 return -ENOMEM;
392 insert = true; 427 insert = true;
393 goto repeat; 428 goto repeat;
@@ -438,7 +473,7 @@ static int __init_memblock memblock_isolate_range(struct memblock_type *type,
438 473
439 /* we'll create at most two more regions */ 474 /* we'll create at most two more regions */
440 while (type->cnt + 2 > type->max) 475 while (type->cnt + 2 > type->max)
441 if (memblock_double_array(type) < 0) 476 if (memblock_double_array(type, base, size) < 0)
442 return -ENOMEM; 477 return -ENOMEM;
443 478
444 for (i = 0; i < type->cnt; i++) { 479 for (i = 0; i < type->cnt; i++) {
@@ -528,9 +563,9 @@ int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size)
528 * __next_free_mem_range - next function for for_each_free_mem_range() 563 * __next_free_mem_range - next function for for_each_free_mem_range()
529 * @idx: pointer to u64 loop variable 564 * @idx: pointer to u64 loop variable
530 * @nid: nid: node selector, %MAX_NUMNODES for all nodes 565 * @nid: nid: node selector, %MAX_NUMNODES for all nodes
531 * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL 566 * @out_start: ptr to phys_addr_t for start address of the range, can be %NULL
532 * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL 567 * @out_end: ptr to phys_addr_t for end address of the range, can be %NULL
533 * @p_nid: ptr to int for nid of the range, can be %NULL 568 * @out_nid: ptr to int for nid of the range, can be %NULL
534 * 569 *
535 * Find the first free area from *@idx which matches @nid, fill the out 570 * Find the first free area from *@idx which matches @nid, fill the out
536 * parameters, and update *@idx for the next iteration. The lower 32bit of 571 * parameters, and update *@idx for the next iteration. The lower 32bit of
@@ -604,9 +639,9 @@ void __init_memblock __next_free_mem_range(u64 *idx, int nid,
604 * __next_free_mem_range_rev - next function for for_each_free_mem_range_reverse() 639 * __next_free_mem_range_rev - next function for for_each_free_mem_range_reverse()
605 * @idx: pointer to u64 loop variable 640 * @idx: pointer to u64 loop variable
606 * @nid: nid: node selector, %MAX_NUMNODES for all nodes 641 * @nid: nid: node selector, %MAX_NUMNODES for all nodes
607 * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL 642 * @out_start: ptr to phys_addr_t for start address of the range, can be %NULL
608 * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL 643 * @out_end: ptr to phys_addr_t for end address of the range, can be %NULL
609 * @p_nid: ptr to int for nid of the range, can be %NULL 644 * @out_nid: ptr to int for nid of the range, can be %NULL
610 * 645 *
611 * Reverse of __next_free_mem_range(). 646 * Reverse of __next_free_mem_range().
612 */ 647 */
@@ -855,6 +890,16 @@ int __init_memblock memblock_is_memory(phys_addr_t addr)
855 return memblock_search(&memblock.memory, addr) != -1; 890 return memblock_search(&memblock.memory, addr) != -1;
856} 891}
857 892
893/**
894 * memblock_is_region_memory - check if a region is a subset of memory
895 * @base: base of region to check
896 * @size: size of region to check
897 *
898 * Check if the region [@base, @base+@size) is a subset of a memory block.
899 *
900 * RETURNS:
901 * 0 if false, non-zero if true
902 */
858int __init_memblock memblock_is_region_memory(phys_addr_t base, phys_addr_t size) 903int __init_memblock memblock_is_region_memory(phys_addr_t base, phys_addr_t size)
859{ 904{
860 int idx = memblock_search(&memblock.memory, base); 905 int idx = memblock_search(&memblock.memory, base);
@@ -867,6 +912,16 @@ int __init_memblock memblock_is_region_memory(phys_addr_t base, phys_addr_t size
867 memblock.memory.regions[idx].size) >= end; 912 memblock.memory.regions[idx].size) >= end;
868} 913}
869 914
915/**
916 * memblock_is_region_reserved - check if a region intersects reserved memory
917 * @base: base of region to check
918 * @size: size of region to check
919 *
920 * Check if the region [@base, @base+@size) intersects a reserved memory block.
921 *
922 * RETURNS:
923 * 0 if false, non-zero if true
924 */
870int __init_memblock memblock_is_region_reserved(phys_addr_t base, phys_addr_t size) 925int __init_memblock memblock_is_region_reserved(phys_addr_t base, phys_addr_t size)
871{ 926{
872 memblock_cap_size(base, &size); 927 memblock_cap_size(base, &size);
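
The two kernel-doc blocks added above encode an easy-to-miss distinction: memblock_is_region_memory() requires [base, base+size) to be a subset of a single memory block, while memblock_is_region_reserved() only asks whether the region intersects any reserved block. A toy interval check expressing the same semantics (plain C, not the memblock implementation):

#include <stdbool.h>
#include <stdio.h>

struct range { unsigned long start, end; };	/* [start, end) */

static bool is_subset(struct range outer, struct range r)
{
	return r.start >= outer.start && r.end <= outer.end;
}

static bool intersects(struct range a, struct range b)
{
	return a.start < b.end && b.start < a.end;
}

int main(void)
{
	struct range memory   = { 0x1000, 0x9000 };
	struct range reserved = { 0x4000, 0x5000 };
	struct range probe    = { 0x4800, 0x9800 };

	printf("subset of memory?    %d\n", is_subset(memory, probe));		/* 0: runs past the block end */
	printf("intersects reserved? %d\n", intersects(probe, reserved));	/* 1: overlaps the reserved range */
	return 0;
}
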
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 7685d4a0b3ce..f72b5e52451a 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -59,7 +59,7 @@
59 59
60struct cgroup_subsys mem_cgroup_subsys __read_mostly; 60struct cgroup_subsys mem_cgroup_subsys __read_mostly;
61#define MEM_CGROUP_RECLAIM_RETRIES 5 61#define MEM_CGROUP_RECLAIM_RETRIES 5
62struct mem_cgroup *root_mem_cgroup __read_mostly; 62static struct mem_cgroup *root_mem_cgroup __read_mostly;
63 63
64#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 64#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
65/* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */ 65/* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */
@@ -73,7 +73,7 @@ static int really_do_swap_account __initdata = 0;
73#endif 73#endif
74 74
75#else 75#else
76#define do_swap_account (0) 76#define do_swap_account 0
77#endif 77#endif
78 78
79 79
@@ -88,18 +88,31 @@ enum mem_cgroup_stat_index {
88 MEM_CGROUP_STAT_RSS, /* # of pages charged as anon rss */ 88 MEM_CGROUP_STAT_RSS, /* # of pages charged as anon rss */
89 MEM_CGROUP_STAT_FILE_MAPPED, /* # of pages charged as file rss */ 89 MEM_CGROUP_STAT_FILE_MAPPED, /* # of pages charged as file rss */
90 MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */ 90 MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */
91 MEM_CGROUP_STAT_DATA, /* end of data requires synchronization */
92 MEM_CGROUP_STAT_NSTATS, 91 MEM_CGROUP_STAT_NSTATS,
93}; 92};
94 93
94static const char * const mem_cgroup_stat_names[] = {
95 "cache",
96 "rss",
97 "mapped_file",
98 "swap",
99};
100
95enum mem_cgroup_events_index { 101enum mem_cgroup_events_index {
96 MEM_CGROUP_EVENTS_PGPGIN, /* # of pages paged in */ 102 MEM_CGROUP_EVENTS_PGPGIN, /* # of pages paged in */
97 MEM_CGROUP_EVENTS_PGPGOUT, /* # of pages paged out */ 103 MEM_CGROUP_EVENTS_PGPGOUT, /* # of pages paged out */
98 MEM_CGROUP_EVENTS_COUNT, /* # of pages paged in/out */
99 MEM_CGROUP_EVENTS_PGFAULT, /* # of page-faults */ 104 MEM_CGROUP_EVENTS_PGFAULT, /* # of page-faults */
100 MEM_CGROUP_EVENTS_PGMAJFAULT, /* # of major page-faults */ 105 MEM_CGROUP_EVENTS_PGMAJFAULT, /* # of major page-faults */
101 MEM_CGROUP_EVENTS_NSTATS, 106 MEM_CGROUP_EVENTS_NSTATS,
102}; 107};
108
109static const char * const mem_cgroup_events_names[] = {
110 "pgpgin",
111 "pgpgout",
112 "pgfault",
113 "pgmajfault",
114};
115
103/* 116/*
104 * Per memcg event counter is incremented at every pagein/pageout. With THP, 117 * Per memcg event counter is incremented at every pagein/pageout. With THP,
105 * it will be incremented by the number of pages. This counter is used for 118
@@ -112,13 +125,14 @@ enum mem_cgroup_events_target {
112 MEM_CGROUP_TARGET_NUMAINFO, 125 MEM_CGROUP_TARGET_NUMAINFO,
113 MEM_CGROUP_NTARGETS, 126 MEM_CGROUP_NTARGETS,
114}; 127};
115#define THRESHOLDS_EVENTS_TARGET (128) 128#define THRESHOLDS_EVENTS_TARGET 128
116#define SOFTLIMIT_EVENTS_TARGET (1024) 129#define SOFTLIMIT_EVENTS_TARGET 1024
117#define NUMAINFO_EVENTS_TARGET (1024) 130#define NUMAINFO_EVENTS_TARGET 1024
118 131
119struct mem_cgroup_stat_cpu { 132struct mem_cgroup_stat_cpu {
120 long count[MEM_CGROUP_STAT_NSTATS]; 133 long count[MEM_CGROUP_STAT_NSTATS];
121 unsigned long events[MEM_CGROUP_EVENTS_NSTATS]; 134 unsigned long events[MEM_CGROUP_EVENTS_NSTATS];
135 unsigned long nr_page_events;
122 unsigned long targets[MEM_CGROUP_NTARGETS]; 136 unsigned long targets[MEM_CGROUP_NTARGETS];
123}; 137};
124 138
@@ -138,7 +152,6 @@ struct mem_cgroup_per_zone {
138 152
139 struct mem_cgroup_reclaim_iter reclaim_iter[DEF_PRIORITY + 1]; 153 struct mem_cgroup_reclaim_iter reclaim_iter[DEF_PRIORITY + 1];
140 154
141 struct zone_reclaim_stat reclaim_stat;
142 struct rb_node tree_node; /* RB tree node */ 155 struct rb_node tree_node; /* RB tree node */
143 unsigned long long usage_in_excess;/* Set to the value by which */ 156 unsigned long long usage_in_excess;/* Set to the value by which */
144 /* the soft limit is exceeded*/ 157 /* the soft limit is exceeded*/
@@ -182,7 +195,7 @@ struct mem_cgroup_threshold {
182 195
183/* For threshold */ 196/* For threshold */
184struct mem_cgroup_threshold_ary { 197struct mem_cgroup_threshold_ary {
185 /* An array index points to threshold just below usage. */ 198 /* An array index points to threshold just below or equal to usage. */
186 int current_threshold; 199 int current_threshold;
187 /* Size of entries[] */ 200 /* Size of entries[] */
188 unsigned int size; 201 unsigned int size;
@@ -245,8 +258,8 @@ struct mem_cgroup {
245 */ 258 */
246 struct rcu_head rcu_freeing; 259 struct rcu_head rcu_freeing;
247 /* 260 /*
248 * But when using vfree(), that cannot be done at 261 * We also need some space for a worker in deferred freeing.
249 * interrupt time, so we must then queue the work. 262 * By the time we call it, rcu_freeing is no longer in use.
250 */ 263 */
251 struct work_struct work_freeing; 264 struct work_struct work_freeing;
252 }; 265 };
@@ -305,7 +318,7 @@ struct mem_cgroup {
305 /* 318 /*
306 * percpu counter. 319 * percpu counter.
307 */ 320 */
308 struct mem_cgroup_stat_cpu *stat; 321 struct mem_cgroup_stat_cpu __percpu *stat;
309 /* 322 /*
310 * used when a cpu is offlined or other synchronizations 323 * used when a cpu is offlined or other synchronizations
311 * See mem_cgroup_read_stat(). 324 * See mem_cgroup_read_stat().
@@ -360,8 +373,8 @@ static bool move_file(void)
360 * Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft 373 * Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft
361 * limit reclaim to prevent infinite loops, if they ever occur. 374 * limit reclaim to prevent infinite loops, if they ever occur.
362 */ 375 */
363#define MEM_CGROUP_MAX_RECLAIM_LOOPS (100) 376#define MEM_CGROUP_MAX_RECLAIM_LOOPS 100
364#define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS (2) 377#define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS 2
365 378
366enum charge_type { 379enum charge_type {
367 MEM_CGROUP_CHARGE_TYPE_CACHE = 0, 380 MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
@@ -377,8 +390,8 @@ enum charge_type {
377#define _MEM (0) 390#define _MEM (0)
378#define _MEMSWAP (1) 391#define _MEMSWAP (1)
379#define _OOM_TYPE (2) 392#define _OOM_TYPE (2)
380#define MEMFILE_PRIVATE(x, val) (((x) << 16) | (val)) 393#define MEMFILE_PRIVATE(x, val) ((x) << 16 | (val))
381#define MEMFILE_TYPE(val) (((val) >> 16) & 0xffff) 394#define MEMFILE_TYPE(val) ((val) >> 16 & 0xffff)
382#define MEMFILE_ATTR(val) ((val) & 0xffff) 395#define MEMFILE_ATTR(val) ((val) & 0xffff)
383/* Used for OOM notifier */ 396/* Used for OOM notifier */
384#define OOM_CONTROL (0) 397#define OOM_CONTROL (0)
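
The simplified MEMFILE_* macros above still pack a resource type into the high 16 bits of cft->private and an attribute index into the low 16. A small encode/decode round trip using the macro bodies from this hunk (_MEMSWAP is 1 as defined above; the RES_LIMIT value is only an assumed stand-in for the real enum):

#include <stdio.h>

/* Macro bodies copied from the hunk above. */
#define MEMFILE_PRIVATE(x, val)	((x) << 16 | (val))
#define MEMFILE_TYPE(val)	((val) >> 16 & 0xffff)
#define MEMFILE_ATTR(val)	((val) & 0xffff)

#define _MEMSWAP	1	/* as in the hunk */
#define RES_LIMIT	2	/* illustrative value only */

int main(void)
{
	int private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT);

	printf("private=0x%x type=%d attr=%d\n",
	       private, MEMFILE_TYPE(private), MEMFILE_ATTR(private));
	return 0;
}
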
@@ -404,6 +417,7 @@ void sock_update_memcg(struct sock *sk)
404{ 417{
405 if (mem_cgroup_sockets_enabled) { 418 if (mem_cgroup_sockets_enabled) {
406 struct mem_cgroup *memcg; 419 struct mem_cgroup *memcg;
420 struct cg_proto *cg_proto;
407 421
408 BUG_ON(!sk->sk_prot->proto_cgroup); 422 BUG_ON(!sk->sk_prot->proto_cgroup);
409 423
@@ -423,9 +437,10 @@ void sock_update_memcg(struct sock *sk)
423 437
424 rcu_read_lock(); 438 rcu_read_lock();
425 memcg = mem_cgroup_from_task(current); 439 memcg = mem_cgroup_from_task(current);
426 if (!mem_cgroup_is_root(memcg)) { 440 cg_proto = sk->sk_prot->proto_cgroup(memcg);
441 if (!mem_cgroup_is_root(memcg) && memcg_proto_active(cg_proto)) {
427 mem_cgroup_get(memcg); 442 mem_cgroup_get(memcg);
428 sk->sk_cgrp = sk->sk_prot->proto_cgroup(memcg); 443 sk->sk_cgrp = cg_proto;
429 } 444 }
430 rcu_read_unlock(); 445 rcu_read_unlock();
431 } 446 }
@@ -454,6 +469,19 @@ EXPORT_SYMBOL(tcp_proto_cgroup);
454#endif /* CONFIG_INET */ 469#endif /* CONFIG_INET */
455#endif /* CONFIG_CGROUP_MEM_RES_CTLR_KMEM */ 470#endif /* CONFIG_CGROUP_MEM_RES_CTLR_KMEM */
456 471
472#if defined(CONFIG_INET) && defined(CONFIG_CGROUP_MEM_RES_CTLR_KMEM)
473static void disarm_sock_keys(struct mem_cgroup *memcg)
474{
475 if (!memcg_proto_activated(&memcg->tcp_mem.cg_proto))
476 return;
477 static_key_slow_dec(&memcg_socket_limit_enabled);
478}
479#else
480static void disarm_sock_keys(struct mem_cgroup *memcg)
481{
482}
483#endif
484
457static void drain_all_stock_async(struct mem_cgroup *memcg); 485static void drain_all_stock_async(struct mem_cgroup *memcg);
458 486
459static struct mem_cgroup_per_zone * 487static struct mem_cgroup_per_zone *
@@ -718,12 +746,21 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
718 nr_pages = -nr_pages; /* for event */ 746 nr_pages = -nr_pages; /* for event */
719 } 747 }
720 748
721 __this_cpu_add(memcg->stat->events[MEM_CGROUP_EVENTS_COUNT], nr_pages); 749 __this_cpu_add(memcg->stat->nr_page_events, nr_pages);
722 750
723 preempt_enable(); 751 preempt_enable();
724} 752}
725 753
726unsigned long 754unsigned long
755mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru)
756{
757 struct mem_cgroup_per_zone *mz;
758
759 mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec);
760 return mz->lru_size[lru];
761}
762
763static unsigned long
727mem_cgroup_zone_nr_lru_pages(struct mem_cgroup *memcg, int nid, int zid, 764mem_cgroup_zone_nr_lru_pages(struct mem_cgroup *memcg, int nid, int zid,
728 unsigned int lru_mask) 765 unsigned int lru_mask)
729{ 766{
@@ -770,7 +807,7 @@ static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
770{ 807{
771 unsigned long val, next; 808 unsigned long val, next;
772 809
773 val = __this_cpu_read(memcg->stat->events[MEM_CGROUP_EVENTS_COUNT]); 810 val = __this_cpu_read(memcg->stat->nr_page_events);
774 next = __this_cpu_read(memcg->stat->targets[target]); 811 next = __this_cpu_read(memcg->stat->targets[target]);
775 /* from time_after() in jiffies.h */ 812 /* from time_after() in jiffies.h */
776 if ((long)next - (long)val < 0) { 813 if ((long)next - (long)val < 0) {
@@ -1013,7 +1050,7 @@ EXPORT_SYMBOL(mem_cgroup_count_vm_event);
1013/** 1050/**
1014 * mem_cgroup_zone_lruvec - get the lru list vector for a zone and memcg 1051 * mem_cgroup_zone_lruvec - get the lru list vector for a zone and memcg
1015 * @zone: zone of the wanted lruvec 1052 * @zone: zone of the wanted lruvec
1016 * @mem: memcg of the wanted lruvec 1053 * @memcg: memcg of the wanted lruvec
1017 * 1054 *
1018 * Returns the lru list vector holding pages for the given @zone and 1055 * Returns the lru list vector holding pages for the given @zone and
1019 * @mem. This can be the global zone lruvec, if the memory controller 1056 * @mem. This can be the global zone lruvec, if the memory controller
@@ -1046,19 +1083,11 @@ struct lruvec *mem_cgroup_zone_lruvec(struct zone *zone,
1046 */ 1083 */
1047 1084
1048/** 1085/**
1049 * mem_cgroup_lru_add_list - account for adding an lru page and return lruvec 1086 * mem_cgroup_page_lruvec - return lruvec for adding an lru page
1050 * @zone: zone of the page
1051 * @page: the page 1087 * @page: the page
1052 * @lru: current lru 1088 * @zone: zone of the page
1053 *
1054 * This function accounts for @page being added to @lru, and returns
1055 * the lruvec for the given @zone and the memcg @page is charged to.
1056 *
1057 * The callsite is then responsible for physically linking the page to
1058 * the returned lruvec->lists[@lru].
1059 */ 1089 */
1060struct lruvec *mem_cgroup_lru_add_list(struct zone *zone, struct page *page, 1090struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct zone *zone)
1061 enum lru_list lru)
1062{ 1091{
1063 struct mem_cgroup_per_zone *mz; 1092 struct mem_cgroup_per_zone *mz;
1064 struct mem_cgroup *memcg; 1093 struct mem_cgroup *memcg;
@@ -1071,7 +1100,7 @@ struct lruvec *mem_cgroup_lru_add_list(struct zone *zone, struct page *page,
1071 memcg = pc->mem_cgroup; 1100 memcg = pc->mem_cgroup;
1072 1101
1073 /* 1102 /*
1074 * Surreptitiously switch any uncharged page to root: 1103 * Surreptitiously switch any uncharged offlist page to root:
1075 * an uncharged page off lru does nothing to secure 1104 * an uncharged page off lru does nothing to secure
1076 * its former mem_cgroup from sudden removal. 1105 * its former mem_cgroup from sudden removal.
1077 * 1106 *
@@ -1079,85 +1108,60 @@ struct lruvec *mem_cgroup_lru_add_list(struct zone *zone, struct page *page,
1079 * under page_cgroup lock: between them, they make all uses 1108 * under page_cgroup lock: between them, they make all uses
1080 * of pc->mem_cgroup safe. 1109 * of pc->mem_cgroup safe.
1081 */ 1110 */
1082 if (!PageCgroupUsed(pc) && memcg != root_mem_cgroup) 1111 if (!PageLRU(page) && !PageCgroupUsed(pc) && memcg != root_mem_cgroup)
1083 pc->mem_cgroup = memcg = root_mem_cgroup; 1112 pc->mem_cgroup = memcg = root_mem_cgroup;
1084 1113
1085 mz = page_cgroup_zoneinfo(memcg, page); 1114 mz = page_cgroup_zoneinfo(memcg, page);
1086 /* compound_order() is stabilized through lru_lock */
1087 mz->lru_size[lru] += 1 << compound_order(page);
1088 return &mz->lruvec; 1115 return &mz->lruvec;
1089} 1116}
1090 1117
1091/** 1118/**
1092 * mem_cgroup_lru_del_list - account for removing an lru page 1119 * mem_cgroup_update_lru_size - account for adding or removing an lru page
1093 * @page: the page 1120 * @lruvec: mem_cgroup per zone lru vector
1094 * @lru: target lru 1121 * @lru: index of lru list the page is sitting on
1095 * 1122 * @nr_pages: positive when adding or negative when removing
1096 * This function accounts for @page being removed from @lru.
1097 * 1123 *
1098 * The callsite is then responsible for physically unlinking 1124 * This function must be called when a page is added to or removed from an
1099 * @page->lru. 1125 * lru list.
1100 */ 1126 */
1101void mem_cgroup_lru_del_list(struct page *page, enum lru_list lru) 1127void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
1128 int nr_pages)
1102{ 1129{
1103 struct mem_cgroup_per_zone *mz; 1130 struct mem_cgroup_per_zone *mz;
1104 struct mem_cgroup *memcg; 1131 unsigned long *lru_size;
1105 struct page_cgroup *pc;
1106 1132
1107 if (mem_cgroup_disabled()) 1133 if (mem_cgroup_disabled())
1108 return; 1134 return;
1109 1135
1110 pc = lookup_page_cgroup(page); 1136 mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec);
1111 memcg = pc->mem_cgroup; 1137 lru_size = mz->lru_size + lru;
1112 VM_BUG_ON(!memcg); 1138 *lru_size += nr_pages;
1113 mz = page_cgroup_zoneinfo(memcg, page); 1139 VM_BUG_ON((long)(*lru_size) < 0);
1114 /* huge page split is done under lru_lock. so, we have no races. */
1115 VM_BUG_ON(mz->lru_size[lru] < (1 << compound_order(page)));
1116 mz->lru_size[lru] -= 1 << compound_order(page);
1117}
1118
1119void mem_cgroup_lru_del(struct page *page)
1120{
1121 mem_cgroup_lru_del_list(page, page_lru(page));
1122}
1123
1124/**
1125 * mem_cgroup_lru_move_lists - account for moving a page between lrus
1126 * @zone: zone of the page
1127 * @page: the page
1128 * @from: current lru
1129 * @to: target lru
1130 *
1131 * This function accounts for @page being moved between the lrus @from
1132 * and @to, and returns the lruvec for the given @zone and the memcg
1133 * @page is charged to.
1134 *
1135 * The callsite is then responsible for physically relinking
1136 * @page->lru to the returned lruvec->lists[@to].
1137 */
1138struct lruvec *mem_cgroup_lru_move_lists(struct zone *zone,
1139 struct page *page,
1140 enum lru_list from,
1141 enum lru_list to)
1142{
1143 /* XXX: Optimize this, especially for @from == @to */
1144 mem_cgroup_lru_del_list(page, from);
1145 return mem_cgroup_lru_add_list(zone, page, to);
1146} 1140}
1147 1141
1148/* 1142/*
1149 * Checks whether given mem is same or in the root_mem_cgroup's 1143 * Checks whether given mem is same or in the root_mem_cgroup's
1150 * hierarchy subtree 1144 * hierarchy subtree
1151 */ 1145 */
1146bool __mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg,
1147 struct mem_cgroup *memcg)
1148{
1149 if (root_memcg == memcg)
1150 return true;
1151 if (!root_memcg->use_hierarchy || !memcg)
1152 return false;
1153 return css_is_ancestor(&memcg->css, &root_memcg->css);
1154}
1155
1152static bool mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg, 1156static bool mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg,
1153 struct mem_cgroup *memcg) 1157 struct mem_cgroup *memcg)
1154{ 1158{
1155 if (root_memcg != memcg) { 1159 bool ret;
1156 return (root_memcg->use_hierarchy &&
1157 css_is_ancestor(&memcg->css, &root_memcg->css));
1158 }
1159 1160
1160 return true; 1161 rcu_read_lock();
1162 ret = __mem_cgroup_same_or_subtree(root_memcg, memcg);
1163 rcu_read_unlock();
1164 return ret;
1161} 1165}
1162 1166
1163int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *memcg) 1167int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *memcg)
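
The replacement API above splits the old add/del helpers in two: mem_cgroup_page_lruvec() looks up the lruvec for a page, and mem_cgroup_update_lru_size() adjusts the per-lru page count, while the caller does the actual list linkage under zone->lru_lock. A userspace model of that accounting contract (hypothetical types, not the kernel interface):

#include <assert.h>
#include <stdio.h>

enum lru_list { LRU_INACTIVE_ANON, LRU_ACTIVE_ANON, NR_LRU_LISTS };

struct lruvec_model {
	long lru_size[NR_LRU_LISTS];
};

/* Positive nr_pages when linking pages onto the list, negative when
 * unlinking; the size must never go negative (the VM_BUG_ON above). */
static void update_lru_size(struct lruvec_model *lruvec, enum lru_list lru,
			    long nr_pages)
{
	lruvec->lru_size[lru] += nr_pages;
	assert(lruvec->lru_size[lru] >= 0);
}

int main(void)
{
	struct lruvec_model lruvec = { { 0 } };

	update_lru_size(&lruvec, LRU_INACTIVE_ANON, 512);	/* THP added   */
	update_lru_size(&lruvec, LRU_INACTIVE_ANON, -512);	/* THP removed */
	printf("inactive anon pages: %ld\n", lruvec.lru_size[LRU_INACTIVE_ANON]);
	return 0;
}
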
@@ -1195,19 +1199,15 @@ int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *memcg)
1195 return ret; 1199 return ret;
1196} 1200}
1197 1201
1198int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg, struct zone *zone) 1202int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec)
1199{ 1203{
1200 unsigned long inactive_ratio; 1204 unsigned long inactive_ratio;
1201 int nid = zone_to_nid(zone);
1202 int zid = zone_idx(zone);
1203 unsigned long inactive; 1205 unsigned long inactive;
1204 unsigned long active; 1206 unsigned long active;
1205 unsigned long gb; 1207 unsigned long gb;
1206 1208
1207 inactive = mem_cgroup_zone_nr_lru_pages(memcg, nid, zid, 1209 inactive = mem_cgroup_get_lru_size(lruvec, LRU_INACTIVE_ANON);
1208 BIT(LRU_INACTIVE_ANON)); 1210 active = mem_cgroup_get_lru_size(lruvec, LRU_ACTIVE_ANON);
1209 active = mem_cgroup_zone_nr_lru_pages(memcg, nid, zid,
1210 BIT(LRU_ACTIVE_ANON));
1211 1211
1212 gb = (inactive + active) >> (30 - PAGE_SHIFT); 1212 gb = (inactive + active) >> (30 - PAGE_SHIFT);
1213 if (gb) 1213 if (gb)
@@ -1218,55 +1218,23 @@ int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg, struct zone *zone)
1218 return inactive * inactive_ratio < active; 1218 return inactive * inactive_ratio < active;
1219} 1219}
1220 1220
1221int mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg, struct zone *zone) 1221int mem_cgroup_inactive_file_is_low(struct lruvec *lruvec)
1222{ 1222{
1223 unsigned long active; 1223 unsigned long active;
1224 unsigned long inactive; 1224 unsigned long inactive;
1225 int zid = zone_idx(zone);
1226 int nid = zone_to_nid(zone);
1227 1225
1228 inactive = mem_cgroup_zone_nr_lru_pages(memcg, nid, zid, 1226 inactive = mem_cgroup_get_lru_size(lruvec, LRU_INACTIVE_FILE);
1229 BIT(LRU_INACTIVE_FILE)); 1227 active = mem_cgroup_get_lru_size(lruvec, LRU_ACTIVE_FILE);
1230 active = mem_cgroup_zone_nr_lru_pages(memcg, nid, zid,
1231 BIT(LRU_ACTIVE_FILE));
1232 1228
1233 return (active > inactive); 1229 return (active > inactive);
1234} 1230}
1235 1231
1236struct zone_reclaim_stat *mem_cgroup_get_reclaim_stat(struct mem_cgroup *memcg,
1237 struct zone *zone)
1238{
1239 int nid = zone_to_nid(zone);
1240 int zid = zone_idx(zone);
1241 struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid);
1242
1243 return &mz->reclaim_stat;
1244}
1245
1246struct zone_reclaim_stat *
1247mem_cgroup_get_reclaim_stat_from_page(struct page *page)
1248{
1249 struct page_cgroup *pc;
1250 struct mem_cgroup_per_zone *mz;
1251
1252 if (mem_cgroup_disabled())
1253 return NULL;
1254
1255 pc = lookup_page_cgroup(page);
1256 if (!PageCgroupUsed(pc))
1257 return NULL;
1258 /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */
1259 smp_rmb();
1260 mz = page_cgroup_zoneinfo(pc->mem_cgroup, page);
1261 return &mz->reclaim_stat;
1262}
1263
1264#define mem_cgroup_from_res_counter(counter, member) \ 1232#define mem_cgroup_from_res_counter(counter, member) \
1265 container_of(counter, struct mem_cgroup, member) 1233 container_of(counter, struct mem_cgroup, member)
1266 1234
1267/** 1235/**
1268 * mem_cgroup_margin - calculate chargeable space of a memory cgroup 1236 * mem_cgroup_margin - calculate chargeable space of a memory cgroup
1269 * @mem: the memory cgroup 1237 * @memcg: the memory cgroup
1270 * 1238 *
1271 * Returns the maximum amount of memory @mem can be charged with, in 1239 * Returns the maximum amount of memory @mem can be charged with, in
1272 * pages. 1240 * pages.
@@ -1540,7 +1508,7 @@ static unsigned long mem_cgroup_reclaim(struct mem_cgroup *memcg,
1540 1508
1541/** 1509/**
1542 * test_mem_cgroup_node_reclaimable 1510 * test_mem_cgroup_node_reclaimable
1543 * @mem: the target memcg 1511 * @memcg: the target memcg
1544 * @nid: the node ID to be checked. 1512 * @nid: the node ID to be checked.
1545 * @noswap : specify true here if the user wants file only information. 1513
1546 * 1514 *
@@ -1634,7 +1602,7 @@ int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
1634 * unused nodes. But scan_nodes is lazily updated and may not contain 1602
1635 * enough new information. We need to do double check. 1603 * enough new information. We need to do double check.
1636 */ 1604 */
1637bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap) 1605static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap)
1638{ 1606{
1639 int nid; 1607 int nid;
1640 1608
@@ -1669,7 +1637,7 @@ int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
1669 return 0; 1637 return 0;
1670} 1638}
1671 1639
1672bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap) 1640static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap)
1673{ 1641{
1674 return test_mem_cgroup_node_reclaimable(memcg, 0, noswap); 1642 return test_mem_cgroup_node_reclaimable(memcg, 0, noswap);
1675} 1643}
@@ -1843,7 +1811,8 @@ static void memcg_oom_recover(struct mem_cgroup *memcg)
1843/* 1811/*
1844 * try to call OOM killer. returns false if we should exit memory-reclaim loop. 1812 * try to call OOM killer. returns false if we should exit memory-reclaim loop.
1845 */ 1813 */
1846bool mem_cgroup_handle_oom(struct mem_cgroup *memcg, gfp_t mask, int order) 1814static bool mem_cgroup_handle_oom(struct mem_cgroup *memcg, gfp_t mask,
1815 int order)
1847{ 1816{
1848 struct oom_wait_info owait; 1817 struct oom_wait_info owait;
1849 bool locked, need_to_kill; 1818 bool locked, need_to_kill;
@@ -1992,7 +1961,7 @@ struct memcg_stock_pcp {
1992 unsigned int nr_pages; 1961 unsigned int nr_pages;
1993 struct work_struct work; 1962 struct work_struct work;
1994 unsigned long flags; 1963 unsigned long flags;
1995#define FLUSHING_CACHED_CHARGE (0) 1964#define FLUSHING_CACHED_CHARGE 0
1996}; 1965};
1997static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock); 1966static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock);
1998static DEFINE_MUTEX(percpu_charge_mutex); 1967static DEFINE_MUTEX(percpu_charge_mutex);
@@ -2139,7 +2108,7 @@ static void mem_cgroup_drain_pcp_counter(struct mem_cgroup *memcg, int cpu)
2139 int i; 2108 int i;
2140 2109
2141 spin_lock(&memcg->pcp_counter_lock); 2110 spin_lock(&memcg->pcp_counter_lock);
2142 for (i = 0; i < MEM_CGROUP_STAT_DATA; i++) { 2111 for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
2143 long x = per_cpu(memcg->stat->count[i], cpu); 2112 long x = per_cpu(memcg->stat->count[i], cpu);
2144 2113
2145 per_cpu(memcg->stat->count[i], cpu) = 0; 2114 per_cpu(memcg->stat->count[i], cpu) = 0;
@@ -2427,6 +2396,24 @@ static void __mem_cgroup_cancel_charge(struct mem_cgroup *memcg,
2427} 2396}
2428 2397
2429/* 2398/*
2399 * Cancel charges in this cgroup; this doesn't propagate to the parent cgroup.
2400 * This is useful when moving usage to parent cgroup.
2401 */
2402static void __mem_cgroup_cancel_local_charge(struct mem_cgroup *memcg,
2403 unsigned int nr_pages)
2404{
2405 unsigned long bytes = nr_pages * PAGE_SIZE;
2406
2407 if (mem_cgroup_is_root(memcg))
2408 return;
2409
2410 res_counter_uncharge_until(&memcg->res, memcg->res.parent, bytes);
2411 if (do_swap_account)
2412 res_counter_uncharge_until(&memcg->memsw,
2413 memcg->memsw.parent, bytes);
2414}
2415
2416/*
2430 * A helper function to get mem_cgroup from ID. must be called under 2417 * A helper function to get mem_cgroup from ID. must be called under
2431 * rcu_read_lock(). The caller must check css_is_removed() or some if 2418 * rcu_read_lock(). The caller must check css_is_removed() or some if
2432 * it's concern. (dropping refcnt from swap can be called against removed 2419 * it's concern. (dropping refcnt from swap can be called against removed
@@ -2481,6 +2468,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
2481{ 2468{
2482 struct page_cgroup *pc = lookup_page_cgroup(page); 2469 struct page_cgroup *pc = lookup_page_cgroup(page);
2483 struct zone *uninitialized_var(zone); 2470 struct zone *uninitialized_var(zone);
2471 struct lruvec *lruvec;
2484 bool was_on_lru = false; 2472 bool was_on_lru = false;
2485 bool anon; 2473 bool anon;
2486 2474
@@ -2503,8 +2491,9 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
2503 zone = page_zone(page); 2491 zone = page_zone(page);
2504 spin_lock_irq(&zone->lru_lock); 2492 spin_lock_irq(&zone->lru_lock);
2505 if (PageLRU(page)) { 2493 if (PageLRU(page)) {
2494 lruvec = mem_cgroup_zone_lruvec(zone, pc->mem_cgroup);
2506 ClearPageLRU(page); 2495 ClearPageLRU(page);
2507 del_page_from_lru_list(zone, page, page_lru(page)); 2496 del_page_from_lru_list(page, lruvec, page_lru(page));
2508 was_on_lru = true; 2497 was_on_lru = true;
2509 } 2498 }
2510 } 2499 }
@@ -2522,9 +2511,10 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
2522 2511
2523 if (lrucare) { 2512 if (lrucare) {
2524 if (was_on_lru) { 2513 if (was_on_lru) {
2514 lruvec = mem_cgroup_zone_lruvec(zone, pc->mem_cgroup);
2525 VM_BUG_ON(PageLRU(page)); 2515 VM_BUG_ON(PageLRU(page));
2526 SetPageLRU(page); 2516 SetPageLRU(page);
2527 add_page_to_lru_list(zone, page, page_lru(page)); 2517 add_page_to_lru_list(page, lruvec, page_lru(page));
2528 } 2518 }
2529 spin_unlock_irq(&zone->lru_lock); 2519 spin_unlock_irq(&zone->lru_lock);
2530 } 2520 }
@@ -2547,7 +2537,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
2547 2537
2548#ifdef CONFIG_TRANSPARENT_HUGEPAGE 2538#ifdef CONFIG_TRANSPARENT_HUGEPAGE
2549 2539
2550#define PCGF_NOCOPY_AT_SPLIT ((1 << PCG_LOCK) | (1 << PCG_MIGRATION)) 2540#define PCGF_NOCOPY_AT_SPLIT (1 << PCG_LOCK | 1 << PCG_MIGRATION)
2551/* 2541/*
2552 * Because tail pages are not marked as "used", set it. We're under 2542 * Because tail pages are not marked as "used", set it. We're under
2553 * zone->lru_lock, 'splitting on pmd' and compound_lock. 2543 * zone->lru_lock, 'splitting on pmd' and compound_lock.
@@ -2578,23 +2568,19 @@ void mem_cgroup_split_huge_fixup(struct page *head)
2578 * @pc: page_cgroup of the page. 2568 * @pc: page_cgroup of the page.
2579 * @from: mem_cgroup which the page is moved from. 2569 * @from: mem_cgroup which the page is moved from.
2580 * @to: mem_cgroup which the page is moved to. @from != @to. 2570 * @to: mem_cgroup which the page is moved to. @from != @to.
2581 * @uncharge: whether we should call uncharge and css_put against @from.
2582 * 2571 *
2583 * The caller must confirm following. 2572 * The caller must confirm following.
2584 * - page is not on LRU (isolate_page() is useful.) 2573 * - page is not on LRU (isolate_page() is useful.)
2585 * - compound_lock is held when nr_pages > 1 2574 * - compound_lock is held when nr_pages > 1
2586 * 2575 *
2587 * This function doesn't do "charge" nor css_get to new cgroup. It should be 2576 * This function doesn't do "charge" to new cgroup and doesn't do "uncharge"
2588 * done by a caller(__mem_cgroup_try_charge would be useful). If @uncharge is 2577 * from old cgroup.
2589 * true, this function does "uncharge" from old cgroup, but it doesn't if
2590 * @uncharge is false, so a caller should do "uncharge".
2591 */ 2578 */
2592static int mem_cgroup_move_account(struct page *page, 2579static int mem_cgroup_move_account(struct page *page,
2593 unsigned int nr_pages, 2580 unsigned int nr_pages,
2594 struct page_cgroup *pc, 2581 struct page_cgroup *pc,
2595 struct mem_cgroup *from, 2582 struct mem_cgroup *from,
2596 struct mem_cgroup *to, 2583 struct mem_cgroup *to)
2597 bool uncharge)
2598{ 2584{
2599 unsigned long flags; 2585 unsigned long flags;
2600 int ret; 2586 int ret;
@@ -2628,9 +2614,6 @@ static int mem_cgroup_move_account(struct page *page,
2628 preempt_enable(); 2614 preempt_enable();
2629 } 2615 }
2630 mem_cgroup_charge_statistics(from, anon, -nr_pages); 2616 mem_cgroup_charge_statistics(from, anon, -nr_pages);
2631 if (uncharge)
2632 /* This is not "cancel", but cancel_charge does all we need. */
2633 __mem_cgroup_cancel_charge(from, nr_pages);
2634 2617
2635 /* caller should have done css_get */ 2618 /* caller should have done css_get */
2636 pc->mem_cgroup = to; 2619 pc->mem_cgroup = to;
@@ -2664,15 +2647,13 @@ static int mem_cgroup_move_parent(struct page *page,
2664 struct mem_cgroup *child, 2647 struct mem_cgroup *child,
2665 gfp_t gfp_mask) 2648 gfp_t gfp_mask)
2666{ 2649{
2667 struct cgroup *cg = child->css.cgroup;
2668 struct cgroup *pcg = cg->parent;
2669 struct mem_cgroup *parent; 2650 struct mem_cgroup *parent;
2670 unsigned int nr_pages; 2651 unsigned int nr_pages;
2671 unsigned long uninitialized_var(flags); 2652 unsigned long uninitialized_var(flags);
2672 int ret; 2653 int ret;
2673 2654
2674 /* Is ROOT ? */ 2655 /* Is ROOT ? */
2675 if (!pcg) 2656 if (mem_cgroup_is_root(child))
2676 return -EINVAL; 2657 return -EINVAL;
2677 2658
2678 ret = -EBUSY; 2659 ret = -EBUSY;
@@ -2683,21 +2664,23 @@ static int mem_cgroup_move_parent(struct page *page,
2683 2664
2684 nr_pages = hpage_nr_pages(page); 2665 nr_pages = hpage_nr_pages(page);
2685 2666
2686 parent = mem_cgroup_from_cont(pcg); 2667 parent = parent_mem_cgroup(child);
2687 ret = __mem_cgroup_try_charge(NULL, gfp_mask, nr_pages, &parent, false); 2668 /*
2688 if (ret) 2669 * If no parent, move charges to root cgroup.
2689 goto put_back; 2670 */
2671 if (!parent)
2672 parent = root_mem_cgroup;
2690 2673
2691 if (nr_pages > 1) 2674 if (nr_pages > 1)
2692 flags = compound_lock_irqsave(page); 2675 flags = compound_lock_irqsave(page);
2693 2676
2694 ret = mem_cgroup_move_account(page, nr_pages, pc, child, parent, true); 2677 ret = mem_cgroup_move_account(page, nr_pages,
2695 if (ret) 2678 pc, child, parent);
2696 __mem_cgroup_cancel_charge(parent, nr_pages); 2679 if (!ret)
2680 __mem_cgroup_cancel_local_charge(child, nr_pages);
2697 2681
2698 if (nr_pages > 1) 2682 if (nr_pages > 1)
2699 compound_unlock_irqrestore(page, flags); 2683 compound_unlock_irqrestore(page, flags);
2700put_back:
2701 putback_lru_page(page); 2684 putback_lru_page(page);
2702put: 2685put:
2703 put_page(page); 2686 put_page(page);
@@ -2845,24 +2828,7 @@ __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *memcg,
2845 */ 2828 */
2846 if (do_swap_account && PageSwapCache(page)) { 2829 if (do_swap_account && PageSwapCache(page)) {
2847 swp_entry_t ent = {.val = page_private(page)}; 2830 swp_entry_t ent = {.val = page_private(page)};
2848 struct mem_cgroup *swap_memcg; 2831 mem_cgroup_uncharge_swap(ent);
2849 unsigned short id;
2850
2851 id = swap_cgroup_record(ent, 0);
2852 rcu_read_lock();
2853 swap_memcg = mem_cgroup_lookup(id);
2854 if (swap_memcg) {
2855 /*
2856 * This recorded memcg can be obsolete one. So, avoid
2857 * calling css_tryget
2858 */
2859 if (!mem_cgroup_is_root(swap_memcg))
2860 res_counter_uncharge(&swap_memcg->memsw,
2861 PAGE_SIZE);
2862 mem_cgroup_swap_statistics(swap_memcg, false);
2863 mem_cgroup_put(swap_memcg);
2864 }
2865 rcu_read_unlock();
2866 } 2832 }
2867 /* 2833 /*
2868 * At swapin, we may charge account against cgroup which has no tasks. 2834 * At swapin, we may charge account against cgroup which has no tasks.
@@ -3155,7 +3121,6 @@ void mem_cgroup_uncharge_swap(swp_entry_t ent)
3155 * @entry: swap entry to be moved 3121 * @entry: swap entry to be moved
3156 * @from: mem_cgroup which the entry is moved from 3122 * @from: mem_cgroup which the entry is moved from
3157 * @to: mem_cgroup which the entry is moved to 3123 * @to: mem_cgroup which the entry is moved to
3158 * @need_fixup: whether we should fixup res_counters and refcounts.
3159 * 3124 *
3160 * It succeeds only when the swap_cgroup's record for this entry is the same 3125 * It succeeds only when the swap_cgroup's record for this entry is the same
3161 * as the mem_cgroup's id of @from. 3126 * as the mem_cgroup's id of @from.
@@ -3166,7 +3131,7 @@ void mem_cgroup_uncharge_swap(swp_entry_t ent)
3166 * both res and memsw, and called css_get(). 3131 * both res and memsw, and called css_get().
3167 */ 3132 */
3168static int mem_cgroup_move_swap_account(swp_entry_t entry, 3133static int mem_cgroup_move_swap_account(swp_entry_t entry,
3169 struct mem_cgroup *from, struct mem_cgroup *to, bool need_fixup) 3134 struct mem_cgroup *from, struct mem_cgroup *to)
3170{ 3135{
3171 unsigned short old_id, new_id; 3136 unsigned short old_id, new_id;
3172 3137
@@ -3185,24 +3150,13 @@ static int mem_cgroup_move_swap_account(swp_entry_t entry,
3185 * swap-in, the refcount of @to might be decreased to 0. 3150 * swap-in, the refcount of @to might be decreased to 0.
3186 */ 3151 */
3187 mem_cgroup_get(to); 3152 mem_cgroup_get(to);
3188 if (need_fixup) {
3189 if (!mem_cgroup_is_root(from))
3190 res_counter_uncharge(&from->memsw, PAGE_SIZE);
3191 mem_cgroup_put(from);
3192 /*
3193 * we charged both to->res and to->memsw, so we should
3194 * uncharge to->res.
3195 */
3196 if (!mem_cgroup_is_root(to))
3197 res_counter_uncharge(&to->res, PAGE_SIZE);
3198 }
3199 return 0; 3153 return 0;
3200 } 3154 }
3201 return -EINVAL; 3155 return -EINVAL;
3202} 3156}
3203#else 3157#else
3204static inline int mem_cgroup_move_swap_account(swp_entry_t entry, 3158static inline int mem_cgroup_move_swap_account(swp_entry_t entry,
3205 struct mem_cgroup *from, struct mem_cgroup *to, bool need_fixup) 3159 struct mem_cgroup *from, struct mem_cgroup *to)
3206{ 3160{
3207 return -EINVAL; 3161 return -EINVAL;
3208} 3162}
@@ -3363,7 +3317,7 @@ void mem_cgroup_end_migration(struct mem_cgroup *memcg,
3363void mem_cgroup_replace_page_cache(struct page *oldpage, 3317void mem_cgroup_replace_page_cache(struct page *oldpage,
3364 struct page *newpage) 3318 struct page *newpage)
3365{ 3319{
3366 struct mem_cgroup *memcg; 3320 struct mem_cgroup *memcg = NULL;
3367 struct page_cgroup *pc; 3321 struct page_cgroup *pc;
3368 enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE; 3322 enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE;
3369 3323
@@ -3373,11 +3327,20 @@ void mem_cgroup_replace_page_cache(struct page *oldpage,
3373 pc = lookup_page_cgroup(oldpage); 3327 pc = lookup_page_cgroup(oldpage);
3374 /* fix accounting on old pages */ 3328 /* fix accounting on old pages */
3375 lock_page_cgroup(pc); 3329 lock_page_cgroup(pc);
3376 memcg = pc->mem_cgroup; 3330 if (PageCgroupUsed(pc)) {
3377 mem_cgroup_charge_statistics(memcg, false, -1); 3331 memcg = pc->mem_cgroup;
3378 ClearPageCgroupUsed(pc); 3332 mem_cgroup_charge_statistics(memcg, false, -1);
3333 ClearPageCgroupUsed(pc);
3334 }
3379 unlock_page_cgroup(pc); 3335 unlock_page_cgroup(pc);
3380 3336
3337 /*
3338 * When called from shmem_replace_page(), in some cases the
3339 * oldpage has already been charged, and in some cases not.
3340 */
3341 if (!memcg)
3342 return;
3343
3381 if (PageSwapBacked(oldpage)) 3344 if (PageSwapBacked(oldpage))
3382 type = MEM_CGROUP_CHARGE_TYPE_SHMEM; 3345 type = MEM_CGROUP_CHARGE_TYPE_SHMEM;
3383 3346
@@ -3793,7 +3756,7 @@ try_to_free:
3793 goto move_account; 3756 goto move_account;
3794} 3757}
3795 3758
3796int mem_cgroup_force_empty_write(struct cgroup *cont, unsigned int event) 3759static int mem_cgroup_force_empty_write(struct cgroup *cont, unsigned int event)
3797{ 3760{
3798 return mem_cgroup_force_empty(mem_cgroup_from_cont(cont), true); 3761 return mem_cgroup_force_empty(mem_cgroup_from_cont(cont), true);
3799} 3762}
@@ -3873,14 +3836,21 @@ static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
3873 return val << PAGE_SHIFT; 3836 return val << PAGE_SHIFT;
3874} 3837}
3875 3838
3876static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft) 3839static ssize_t mem_cgroup_read(struct cgroup *cont, struct cftype *cft,
3840 struct file *file, char __user *buf,
3841 size_t nbytes, loff_t *ppos)
3877{ 3842{
3878 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); 3843 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
3844 char str[64];
3879 u64 val; 3845 u64 val;
3880 int type, name; 3846 int type, name, len;
3881 3847
3882 type = MEMFILE_TYPE(cft->private); 3848 type = MEMFILE_TYPE(cft->private);
3883 name = MEMFILE_ATTR(cft->private); 3849 name = MEMFILE_ATTR(cft->private);
3850
3851 if (!do_swap_account && type == _MEMSWAP)
3852 return -EOPNOTSUPP;
3853
3884 switch (type) { 3854 switch (type) {
3885 case _MEM: 3855 case _MEM:
3886 if (name == RES_USAGE) 3856 if (name == RES_USAGE)
@@ -3897,7 +3867,9 @@ static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft)
3897 default: 3867 default:
3898 BUG(); 3868 BUG();
3899 } 3869 }
3900 return val; 3870
3871 len = scnprintf(str, sizeof(str), "%llu\n", (unsigned long long)val);
3872 return simple_read_from_buffer(buf, nbytes, ppos, str, len);
3901} 3873}
3902/* 3874/*
3903 * The user of this function is... 3875 * The user of this function is...
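
A minimal sketch of the read pattern the hunk above converts mem_cgroup_read() to: format the counter into a small stack buffer with scnprintf() and let simple_read_from_buffer() handle the offset and short-read bookkeeping. demo_read() and demo_value() are invented names; only the shape mirrors the diff.

#include <linux/fs.h>
#include <linux/kernel.h>

static u64 demo_value(void);	/* assumed accessor for the counter being exposed */

static ssize_t demo_read(struct file *file, char __user *buf,
			 size_t nbytes, loff_t *ppos)
{
	char str[64];
	int len;

	len = scnprintf(str, sizeof(str), "%llu\n",
			(unsigned long long)demo_value());
	return simple_read_from_buffer(buf, nbytes, ppos, str, len);
}
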
@@ -3913,6 +3885,10 @@ static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft,
3913 3885
3914 type = MEMFILE_TYPE(cft->private); 3886 type = MEMFILE_TYPE(cft->private);
3915 name = MEMFILE_ATTR(cft->private); 3887 name = MEMFILE_ATTR(cft->private);
3888
3889 if (!do_swap_account && type == _MEMSWAP)
3890 return -EOPNOTSUPP;
3891
3916 switch (name) { 3892 switch (name) {
3917 case RES_LIMIT: 3893 case RES_LIMIT:
3918 if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */ 3894 if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */
@@ -3978,12 +3954,15 @@ out:
3978 3954
3979static int mem_cgroup_reset(struct cgroup *cont, unsigned int event) 3955static int mem_cgroup_reset(struct cgroup *cont, unsigned int event)
3980{ 3956{
3981 struct mem_cgroup *memcg; 3957 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
3982 int type, name; 3958 int type, name;
3983 3959
3984 memcg = mem_cgroup_from_cont(cont);
3985 type = MEMFILE_TYPE(event); 3960 type = MEMFILE_TYPE(event);
3986 name = MEMFILE_ATTR(event); 3961 name = MEMFILE_ATTR(event);
3962
3963 if (!do_swap_account && type == _MEMSWAP)
3964 return -EOPNOTSUPP;
3965
3987 switch (name) { 3966 switch (name) {
3988 case RES_MAX_USAGE: 3967 case RES_MAX_USAGE:
3989 if (type == _MEM) 3968 if (type == _MEM)
@@ -4035,103 +4014,13 @@ static int mem_cgroup_move_charge_write(struct cgroup *cgrp,
4035} 4014}
4036#endif 4015#endif
4037 4016
4038
4039/* For read statistics */
4040enum {
4041 MCS_CACHE,
4042 MCS_RSS,
4043 MCS_FILE_MAPPED,
4044 MCS_PGPGIN,
4045 MCS_PGPGOUT,
4046 MCS_SWAP,
4047 MCS_PGFAULT,
4048 MCS_PGMAJFAULT,
4049 MCS_INACTIVE_ANON,
4050 MCS_ACTIVE_ANON,
4051 MCS_INACTIVE_FILE,
4052 MCS_ACTIVE_FILE,
4053 MCS_UNEVICTABLE,
4054 NR_MCS_STAT,
4055};
4056
4057struct mcs_total_stat {
4058 s64 stat[NR_MCS_STAT];
4059};
4060
4061struct {
4062 char *local_name;
4063 char *total_name;
4064} memcg_stat_strings[NR_MCS_STAT] = {
4065 {"cache", "total_cache"},
4066 {"rss", "total_rss"},
4067 {"mapped_file", "total_mapped_file"},
4068 {"pgpgin", "total_pgpgin"},
4069 {"pgpgout", "total_pgpgout"},
4070 {"swap", "total_swap"},
4071 {"pgfault", "total_pgfault"},
4072 {"pgmajfault", "total_pgmajfault"},
4073 {"inactive_anon", "total_inactive_anon"},
4074 {"active_anon", "total_active_anon"},
4075 {"inactive_file", "total_inactive_file"},
4076 {"active_file", "total_active_file"},
4077 {"unevictable", "total_unevictable"}
4078};
4079
4080
4081static void
4082mem_cgroup_get_local_stat(struct mem_cgroup *memcg, struct mcs_total_stat *s)
4083{
4084 s64 val;
4085
4086 /* per cpu stat */
4087 val = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_CACHE);
4088 s->stat[MCS_CACHE] += val * PAGE_SIZE;
4089 val = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_RSS);
4090 s->stat[MCS_RSS] += val * PAGE_SIZE;
4091 val = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_FILE_MAPPED);
4092 s->stat[MCS_FILE_MAPPED] += val * PAGE_SIZE;
4093 val = mem_cgroup_read_events(memcg, MEM_CGROUP_EVENTS_PGPGIN);
4094 s->stat[MCS_PGPGIN] += val;
4095 val = mem_cgroup_read_events(memcg, MEM_CGROUP_EVENTS_PGPGOUT);
4096 s->stat[MCS_PGPGOUT] += val;
4097 if (do_swap_account) {
4098 val = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_SWAPOUT);
4099 s->stat[MCS_SWAP] += val * PAGE_SIZE;
4100 }
4101 val = mem_cgroup_read_events(memcg, MEM_CGROUP_EVENTS_PGFAULT);
4102 s->stat[MCS_PGFAULT] += val;
4103 val = mem_cgroup_read_events(memcg, MEM_CGROUP_EVENTS_PGMAJFAULT);
4104 s->stat[MCS_PGMAJFAULT] += val;
4105
4106 /* per zone stat */
4107 val = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_INACTIVE_ANON));
4108 s->stat[MCS_INACTIVE_ANON] += val * PAGE_SIZE;
4109 val = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_ACTIVE_ANON));
4110 s->stat[MCS_ACTIVE_ANON] += val * PAGE_SIZE;
4111 val = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_INACTIVE_FILE));
4112 s->stat[MCS_INACTIVE_FILE] += val * PAGE_SIZE;
4113 val = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_ACTIVE_FILE));
4114 s->stat[MCS_ACTIVE_FILE] += val * PAGE_SIZE;
4115 val = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_UNEVICTABLE));
4116 s->stat[MCS_UNEVICTABLE] += val * PAGE_SIZE;
4117}
4118
4119static void
4120mem_cgroup_get_total_stat(struct mem_cgroup *memcg, struct mcs_total_stat *s)
4121{
4122 struct mem_cgroup *iter;
4123
4124 for_each_mem_cgroup_tree(iter, memcg)
4125 mem_cgroup_get_local_stat(iter, s);
4126}
4127
4128#ifdef CONFIG_NUMA 4017#ifdef CONFIG_NUMA
4129static int mem_control_numa_stat_show(struct seq_file *m, void *arg) 4018static int mem_control_numa_stat_show(struct cgroup *cont, struct cftype *cft,
4019 struct seq_file *m)
4130{ 4020{
4131 int nid; 4021 int nid;
4132 unsigned long total_nr, file_nr, anon_nr, unevictable_nr; 4022 unsigned long total_nr, file_nr, anon_nr, unevictable_nr;
4133 unsigned long node_nr; 4023 unsigned long node_nr;
4134 struct cgroup *cont = m->private;
4135 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); 4024 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
4136 4025
4137 total_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL); 4026 total_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL);
@@ -4172,64 +4061,100 @@ static int mem_control_numa_stat_show(struct seq_file *m, void *arg)
4172} 4061}
4173#endif /* CONFIG_NUMA */ 4062#endif /* CONFIG_NUMA */
4174 4063
4064static const char * const mem_cgroup_lru_names[] = {
4065 "inactive_anon",
4066 "active_anon",
4067 "inactive_file",
4068 "active_file",
4069 "unevictable",
4070};
4071
4072static inline void mem_cgroup_lru_names_not_uptodate(void)
4073{
4074 BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS);
4075}
4076
4175static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft, 4077static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft,
4176 struct cgroup_map_cb *cb) 4078 struct seq_file *m)
4177{ 4079{
4178 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); 4080 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
4179 struct mcs_total_stat mystat; 4081 struct mem_cgroup *mi;
4180 int i; 4082 unsigned int i;
4181
4182 memset(&mystat, 0, sizeof(mystat));
4183 mem_cgroup_get_local_stat(memcg, &mystat);
4184 4083
4185 4084 for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
4186 for (i = 0; i < NR_MCS_STAT; i++) { 4085 if (i == MEM_CGROUP_STAT_SWAPOUT && !do_swap_account)
4187 if (i == MCS_SWAP && !do_swap_account)
4188 continue; 4086 continue;
4189 cb->fill(cb, memcg_stat_strings[i].local_name, mystat.stat[i]); 4087 seq_printf(m, "%s %ld\n", mem_cgroup_stat_names[i],
4088 mem_cgroup_read_stat(memcg, i) * PAGE_SIZE);
4190 } 4089 }
4191 4090
4091 for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++)
4092 seq_printf(m, "%s %lu\n", mem_cgroup_events_names[i],
4093 mem_cgroup_read_events(memcg, i));
4094
4095 for (i = 0; i < NR_LRU_LISTS; i++)
4096 seq_printf(m, "%s %lu\n", mem_cgroup_lru_names[i],
4097 mem_cgroup_nr_lru_pages(memcg, BIT(i)) * PAGE_SIZE);
4098
4192 /* Hierarchical information */ 4099 /* Hierarchical information */
4193 { 4100 {
4194 unsigned long long limit, memsw_limit; 4101 unsigned long long limit, memsw_limit;
4195 memcg_get_hierarchical_limit(memcg, &limit, &memsw_limit); 4102 memcg_get_hierarchical_limit(memcg, &limit, &memsw_limit);
4196 cb->fill(cb, "hierarchical_memory_limit", limit); 4103 seq_printf(m, "hierarchical_memory_limit %llu\n", limit);
4197 if (do_swap_account) 4104 if (do_swap_account)
4198 cb->fill(cb, "hierarchical_memsw_limit", memsw_limit); 4105 seq_printf(m, "hierarchical_memsw_limit %llu\n",
4106 memsw_limit);
4199 } 4107 }
4200 4108
4201 memset(&mystat, 0, sizeof(mystat)); 4109 for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
4202 mem_cgroup_get_total_stat(memcg, &mystat); 4110 long long val = 0;
4203 for (i = 0; i < NR_MCS_STAT; i++) { 4111
4204 if (i == MCS_SWAP && !do_swap_account) 4112 if (i == MEM_CGROUP_STAT_SWAPOUT && !do_swap_account)
4205 continue; 4113 continue;
4206 cb->fill(cb, memcg_stat_strings[i].total_name, mystat.stat[i]); 4114 for_each_mem_cgroup_tree(mi, memcg)
4115 val += mem_cgroup_read_stat(mi, i) * PAGE_SIZE;
4116 seq_printf(m, "total_%s %lld\n", mem_cgroup_stat_names[i], val);
4117 }
4118
4119 for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) {
4120 unsigned long long val = 0;
4121
4122 for_each_mem_cgroup_tree(mi, memcg)
4123 val += mem_cgroup_read_events(mi, i);
4124 seq_printf(m, "total_%s %llu\n",
4125 mem_cgroup_events_names[i], val);
4126 }
4127
4128 for (i = 0; i < NR_LRU_LISTS; i++) {
4129 unsigned long long val = 0;
4130
4131 for_each_mem_cgroup_tree(mi, memcg)
4132 val += mem_cgroup_nr_lru_pages(mi, BIT(i)) * PAGE_SIZE;
4133 seq_printf(m, "total_%s %llu\n", mem_cgroup_lru_names[i], val);
4207 } 4134 }
4208 4135
4209#ifdef CONFIG_DEBUG_VM 4136#ifdef CONFIG_DEBUG_VM
4210 { 4137 {
4211 int nid, zid; 4138 int nid, zid;
4212 struct mem_cgroup_per_zone *mz; 4139 struct mem_cgroup_per_zone *mz;
4140 struct zone_reclaim_stat *rstat;
4213 unsigned long recent_rotated[2] = {0, 0}; 4141 unsigned long recent_rotated[2] = {0, 0};
4214 unsigned long recent_scanned[2] = {0, 0}; 4142 unsigned long recent_scanned[2] = {0, 0};
4215 4143
4216 for_each_online_node(nid) 4144 for_each_online_node(nid)
4217 for (zid = 0; zid < MAX_NR_ZONES; zid++) { 4145 for (zid = 0; zid < MAX_NR_ZONES; zid++) {
4218 mz = mem_cgroup_zoneinfo(memcg, nid, zid); 4146 mz = mem_cgroup_zoneinfo(memcg, nid, zid);
4147 rstat = &mz->lruvec.reclaim_stat;
4219 4148
4220 recent_rotated[0] += 4149 recent_rotated[0] += rstat->recent_rotated[0];
4221 mz->reclaim_stat.recent_rotated[0]; 4150 recent_rotated[1] += rstat->recent_rotated[1];
4222 recent_rotated[1] += 4151 recent_scanned[0] += rstat->recent_scanned[0];
4223 mz->reclaim_stat.recent_rotated[1]; 4152 recent_scanned[1] += rstat->recent_scanned[1];
4224 recent_scanned[0] +=
4225 mz->reclaim_stat.recent_scanned[0];
4226 recent_scanned[1] +=
4227 mz->reclaim_stat.recent_scanned[1];
4228 } 4153 }
4229 cb->fill(cb, "recent_rotated_anon", recent_rotated[0]); 4154 seq_printf(m, "recent_rotated_anon %lu\n", recent_rotated[0]);
4230 cb->fill(cb, "recent_rotated_file", recent_rotated[1]); 4155 seq_printf(m, "recent_rotated_file %lu\n", recent_rotated[1]);
4231 cb->fill(cb, "recent_scanned_anon", recent_scanned[0]); 4156 seq_printf(m, "recent_scanned_anon %lu\n", recent_scanned[0]);
4232 cb->fill(cb, "recent_scanned_file", recent_scanned[1]); 4157 seq_printf(m, "recent_scanned_file %lu\n", recent_scanned[1]);
4233 } 4158 }
4234#endif 4159#endif
4235 4160
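
The stat hunk above drops the cgroup_map_cb fill callbacks in favour of direct seq_printf() calls over name arrays indexed by the same enums as the counters, guarded by a BUILD_BUG_ON so names and counters cannot drift apart. A hedged sketch of that layout, with demo_* names standing in for the real mem_cgroup arrays and accessors:

#include <linux/kernel.h>
#include <linux/bug.h>
#include <linux/seq_file.h>

enum { DEMO_CACHE, DEMO_RSS, DEMO_NSTATS };

static const char * const demo_stat_names[] = { "cache", "rss" };

static unsigned long demo_read_stat(int idx);	/* assumed per-counter accessor */

static int demo_stat_show(struct seq_file *m, void *v)
{
	int i;

	/* keeps the name table and the counter enum in sync at build time */
	BUILD_BUG_ON(ARRAY_SIZE(demo_stat_names) != DEMO_NSTATS);

	for (i = 0; i < DEMO_NSTATS; i++)
		seq_printf(m, "%s %lu\n", demo_stat_names[i], demo_read_stat(i));
	return 0;
}
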
@@ -4291,7 +4216,7 @@ static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap)
4291 usage = mem_cgroup_usage(memcg, swap); 4216 usage = mem_cgroup_usage(memcg, swap);
4292 4217
4293 /* 4218 /*
4294 * current_threshold points to threshold just below usage. 4219 * current_threshold points to threshold just below or equal to usage.
4295 * If it's not true, a threshold was crossed after last 4220 * If it's not true, a threshold was crossed after last
4296 * call of __mem_cgroup_threshold(). 4221 * call of __mem_cgroup_threshold().
4297 */ 4222 */
@@ -4417,14 +4342,15 @@ static int mem_cgroup_usage_register_event(struct cgroup *cgrp,
4417 /* Find current threshold */ 4342 /* Find current threshold */
4418 new->current_threshold = -1; 4343 new->current_threshold = -1;
4419 for (i = 0; i < size; i++) { 4344 for (i = 0; i < size; i++) {
4420 if (new->entries[i].threshold < usage) { 4345 if (new->entries[i].threshold <= usage) {
4421 /* 4346 /*
4422 * new->current_threshold will not be used until 4347 * new->current_threshold will not be used until
4423 * rcu_assign_pointer(), so it's safe to increment 4348 * rcu_assign_pointer(), so it's safe to increment
4424 * it here. 4349 * it here.
4425 */ 4350 */
4426 ++new->current_threshold; 4351 ++new->current_threshold;
4427 } 4352 } else
4353 break;
4428 } 4354 }
4429 4355
4430 /* Free old spare buffer and save old primary buffer as spare */ 4356 /* Free old spare buffer and save old primary buffer as spare */
@@ -4493,7 +4419,7 @@ static void mem_cgroup_usage_unregister_event(struct cgroup *cgrp,
4493 continue; 4419 continue;
4494 4420
4495 new->entries[j] = thresholds->primary->entries[i]; 4421 new->entries[j] = thresholds->primary->entries[i];
4496 if (new->entries[j].threshold < usage) { 4422 if (new->entries[j].threshold <= usage) {
4497 /* 4423 /*
4498 * new->current_threshold will not be used 4424 * new->current_threshold will not be used
4499 * until rcu_assign_pointer(), so it's safe to increment 4425 * until rcu_assign_pointer(), so it's safe to increment
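
Both threshold hunks above change the comparison from < to <= and stop scanning at the first larger entry. A small runnable illustration in plain userspace C (not kernel code) of the resulting "index of the last entry whose threshold is <= usage, or -1" semantics over the sorted threshold array:

#include <stdio.h>

static int find_current_threshold(const unsigned long long *entries,
				  int size, unsigned long long usage)
{
	int i, current_threshold = -1;

	for (i = 0; i < size; i++) {
		if (entries[i] <= usage)
			++current_threshold;
		else
			break;	/* array is sorted, nothing further can match */
	}
	return current_threshold;
}

int main(void)
{
	unsigned long long thresholds[] = { 4096, 8192, 16384 };

	/* usage equal to a threshold now counts as crossed: prints 1 */
	printf("%d\n", find_current_threshold(thresholds, 3, 8192));
	return 0;
}
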
@@ -4607,46 +4533,23 @@ static int mem_cgroup_oom_control_write(struct cgroup *cgrp,
4607 return 0; 4533 return 0;
4608} 4534}
4609 4535
4610#ifdef CONFIG_NUMA
4611static const struct file_operations mem_control_numa_stat_file_operations = {
4612 .read = seq_read,
4613 .llseek = seq_lseek,
4614 .release = single_release,
4615};
4616
4617static int mem_control_numa_stat_open(struct inode *unused, struct file *file)
4618{
4619 struct cgroup *cont = file->f_dentry->d_parent->d_fsdata;
4620
4621 file->f_op = &mem_control_numa_stat_file_operations;
4622 return single_open(file, mem_control_numa_stat_show, cont);
4623}
4624#endif /* CONFIG_NUMA */
4625
4626#ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM 4536#ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM
4627static int register_kmem_files(struct cgroup *cont, struct cgroup_subsys *ss) 4537static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
4628{ 4538{
4629 /* 4539 return mem_cgroup_sockets_init(memcg, ss);
4630 * Part of this would be better living in a separate allocation
4631 * function, leaving us with just the cgroup tree population work.
4632 * We, however, depend on state such as network's proto_list that
4633 * is only initialized after cgroup creation. I found the less
4634 * cumbersome way to deal with it to defer it all to populate time
4635 */
4636 return mem_cgroup_sockets_init(cont, ss);
4637}; 4540};
4638 4541
4639static void kmem_cgroup_destroy(struct cgroup *cont) 4542static void kmem_cgroup_destroy(struct mem_cgroup *memcg)
4640{ 4543{
4641 mem_cgroup_sockets_destroy(cont); 4544 mem_cgroup_sockets_destroy(memcg);
4642} 4545}
4643#else 4546#else
4644static int register_kmem_files(struct cgroup *cont, struct cgroup_subsys *ss) 4547static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
4645{ 4548{
4646 return 0; 4549 return 0;
4647} 4550}
4648 4551
4649static void kmem_cgroup_destroy(struct cgroup *cont) 4552static void kmem_cgroup_destroy(struct mem_cgroup *memcg)
4650{ 4553{
4651} 4554}
4652#endif 4555#endif
@@ -4655,7 +4558,7 @@ static struct cftype mem_cgroup_files[] = {
4655 { 4558 {
4656 .name = "usage_in_bytes", 4559 .name = "usage_in_bytes",
4657 .private = MEMFILE_PRIVATE(_MEM, RES_USAGE), 4560 .private = MEMFILE_PRIVATE(_MEM, RES_USAGE),
4658 .read_u64 = mem_cgroup_read, 4561 .read = mem_cgroup_read,
4659 .register_event = mem_cgroup_usage_register_event, 4562 .register_event = mem_cgroup_usage_register_event,
4660 .unregister_event = mem_cgroup_usage_unregister_event, 4563 .unregister_event = mem_cgroup_usage_unregister_event,
4661 }, 4564 },
@@ -4663,29 +4566,29 @@ static struct cftype mem_cgroup_files[] = {
4663 .name = "max_usage_in_bytes", 4566 .name = "max_usage_in_bytes",
4664 .private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE), 4567 .private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE),
4665 .trigger = mem_cgroup_reset, 4568 .trigger = mem_cgroup_reset,
4666 .read_u64 = mem_cgroup_read, 4569 .read = mem_cgroup_read,
4667 }, 4570 },
4668 { 4571 {
4669 .name = "limit_in_bytes", 4572 .name = "limit_in_bytes",
4670 .private = MEMFILE_PRIVATE(_MEM, RES_LIMIT), 4573 .private = MEMFILE_PRIVATE(_MEM, RES_LIMIT),
4671 .write_string = mem_cgroup_write, 4574 .write_string = mem_cgroup_write,
4672 .read_u64 = mem_cgroup_read, 4575 .read = mem_cgroup_read,
4673 }, 4576 },
4674 { 4577 {
4675 .name = "soft_limit_in_bytes", 4578 .name = "soft_limit_in_bytes",
4676 .private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT), 4579 .private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT),
4677 .write_string = mem_cgroup_write, 4580 .write_string = mem_cgroup_write,
4678 .read_u64 = mem_cgroup_read, 4581 .read = mem_cgroup_read,
4679 }, 4582 },
4680 { 4583 {
4681 .name = "failcnt", 4584 .name = "failcnt",
4682 .private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT), 4585 .private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT),
4683 .trigger = mem_cgroup_reset, 4586 .trigger = mem_cgroup_reset,
4684 .read_u64 = mem_cgroup_read, 4587 .read = mem_cgroup_read,
4685 }, 4588 },
4686 { 4589 {
4687 .name = "stat", 4590 .name = "stat",
4688 .read_map = mem_control_stat_show, 4591 .read_seq_string = mem_control_stat_show,
4689 }, 4592 },
4690 { 4593 {
4691 .name = "force_empty", 4594 .name = "force_empty",
@@ -4717,18 +4620,14 @@ static struct cftype mem_cgroup_files[] = {
4717#ifdef CONFIG_NUMA 4620#ifdef CONFIG_NUMA
4718 { 4621 {
4719 .name = "numa_stat", 4622 .name = "numa_stat",
4720 .open = mem_control_numa_stat_open, 4623 .read_seq_string = mem_control_numa_stat_show,
4721 .mode = S_IRUGO,
4722 }, 4624 },
4723#endif 4625#endif
4724};
4725
4726#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 4626#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
4727static struct cftype memsw_cgroup_files[] = {
4728 { 4627 {
4729 .name = "memsw.usage_in_bytes", 4628 .name = "memsw.usage_in_bytes",
4730 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE), 4629 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
4731 .read_u64 = mem_cgroup_read, 4630 .read = mem_cgroup_read,
4732 .register_event = mem_cgroup_usage_register_event, 4631 .register_event = mem_cgroup_usage_register_event,
4733 .unregister_event = mem_cgroup_usage_unregister_event, 4632 .unregister_event = mem_cgroup_usage_unregister_event,
4734 }, 4633 },
@@ -4736,41 +4635,28 @@ static struct cftype memsw_cgroup_files[] = {
4736 .name = "memsw.max_usage_in_bytes", 4635 .name = "memsw.max_usage_in_bytes",
4737 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE), 4636 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE),
4738 .trigger = mem_cgroup_reset, 4637 .trigger = mem_cgroup_reset,
4739 .read_u64 = mem_cgroup_read, 4638 .read = mem_cgroup_read,
4740 }, 4639 },
4741 { 4640 {
4742 .name = "memsw.limit_in_bytes", 4641 .name = "memsw.limit_in_bytes",
4743 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT), 4642 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT),
4744 .write_string = mem_cgroup_write, 4643 .write_string = mem_cgroup_write,
4745 .read_u64 = mem_cgroup_read, 4644 .read = mem_cgroup_read,
4746 }, 4645 },
4747 { 4646 {
4748 .name = "memsw.failcnt", 4647 .name = "memsw.failcnt",
4749 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT), 4648 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT),
4750 .trigger = mem_cgroup_reset, 4649 .trigger = mem_cgroup_reset,
4751 .read_u64 = mem_cgroup_read, 4650 .read = mem_cgroup_read,
4752 }, 4651 },
4753};
4754
4755static int register_memsw_files(struct cgroup *cont, struct cgroup_subsys *ss)
4756{
4757 if (!do_swap_account)
4758 return 0;
4759 return cgroup_add_files(cont, ss, memsw_cgroup_files,
4760 ARRAY_SIZE(memsw_cgroup_files));
4761};
4762#else
4763static int register_memsw_files(struct cgroup *cont, struct cgroup_subsys *ss)
4764{
4765 return 0;
4766}
4767#endif 4652#endif
4653 { }, /* terminate */
4654};
4768 4655
4769static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node) 4656static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
4770{ 4657{
4771 struct mem_cgroup_per_node *pn; 4658 struct mem_cgroup_per_node *pn;
4772 struct mem_cgroup_per_zone *mz; 4659 struct mem_cgroup_per_zone *mz;
4773 enum lru_list lru;
4774 int zone, tmp = node; 4660 int zone, tmp = node;
4775 /* 4661 /*
4776 * This routine is called against possible nodes. 4662 * This routine is called against possible nodes.
@@ -4788,8 +4674,7 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
4788 4674
4789 for (zone = 0; zone < MAX_NR_ZONES; zone++) { 4675 for (zone = 0; zone < MAX_NR_ZONES; zone++) {
4790 mz = &pn->zoneinfo[zone]; 4676 mz = &pn->zoneinfo[zone];
4791 for_each_lru(lru) 4677 lruvec_init(&mz->lruvec, &NODE_DATA(node)->node_zones[zone]);
4792 INIT_LIST_HEAD(&mz->lruvec.lists[lru]);
4793 mz->usage_in_excess = 0; 4678 mz->usage_in_excess = 0;
4794 mz->on_tree = false; 4679 mz->on_tree = false;
4795 mz->memcg = memcg; 4680 mz->memcg = memcg;
@@ -4832,23 +4717,40 @@ out_free:
4832} 4717}
4833 4718
4834/* 4719/*
4835 * Helpers for freeing a vzalloc()ed mem_cgroup by RCU, 4720 * Helpers for freeing a kmalloc()ed/vzalloc()ed mem_cgroup by RCU,
4836 * but in process context. The work_freeing structure is overlaid 4721 * but in process context. The work_freeing structure is overlaid
4837 * on the rcu_freeing structure, which itself is overlaid on memsw. 4722 * on the rcu_freeing structure, which itself is overlaid on memsw.
4838 */ 4723 */
4839static void vfree_work(struct work_struct *work) 4724static void free_work(struct work_struct *work)
4840{ 4725{
4841 struct mem_cgroup *memcg; 4726 struct mem_cgroup *memcg;
4727 int size = sizeof(struct mem_cgroup);
4842 4728
4843 memcg = container_of(work, struct mem_cgroup, work_freeing); 4729 memcg = container_of(work, struct mem_cgroup, work_freeing);
4844 vfree(memcg); 4730 /*
4731 * We need to make sure that (at least for now), the jump label
4732 * destruction code runs outside of the cgroup lock. This is because
4733 * get_online_cpus(), which is called from the static_branch update,
4734 * can't be called inside the cgroup_lock. cpusets are the ones
4735 * enforcing this dependency, so if they ever change, we might as well.
4736 *
4737 * schedule_work() will guarantee this happens. Be careful if you need
4738 * to move this code around, and make sure it is outside
4739 * the cgroup_lock.
4740 */
4741 disarm_sock_keys(memcg);
4742 if (size < PAGE_SIZE)
4743 kfree(memcg);
4744 else
4745 vfree(memcg);
4845} 4746}
4846static void vfree_rcu(struct rcu_head *rcu_head) 4747
4748static void free_rcu(struct rcu_head *rcu_head)
4847{ 4749{
4848 struct mem_cgroup *memcg; 4750 struct mem_cgroup *memcg;
4849 4751
4850 memcg = container_of(rcu_head, struct mem_cgroup, rcu_freeing); 4752 memcg = container_of(rcu_head, struct mem_cgroup, rcu_freeing);
4851 INIT_WORK(&memcg->work_freeing, vfree_work); 4753 INIT_WORK(&memcg->work_freeing, free_work);
4852 schedule_work(&memcg->work_freeing); 4754 schedule_work(&memcg->work_freeing);
4853} 4755}
4854 4756
@@ -4874,10 +4776,7 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)
4874 free_mem_cgroup_per_zone_info(memcg, node); 4776 free_mem_cgroup_per_zone_info(memcg, node);
4875 4777
4876 free_percpu(memcg->stat); 4778 free_percpu(memcg->stat);
4877 if (sizeof(struct mem_cgroup) < PAGE_SIZE) 4779 call_rcu(&memcg->rcu_freeing, free_rcu);
4878 kfree_rcu(memcg, rcu_freeing);
4879 else
4880 call_rcu(&memcg->rcu_freeing, vfree_rcu);
4881} 4780}
4882 4781
4883static void mem_cgroup_get(struct mem_cgroup *memcg) 4782static void mem_cgroup_get(struct mem_cgroup *memcg)
@@ -5016,6 +4915,17 @@ mem_cgroup_create(struct cgroup *cont)
5016 memcg->move_charge_at_immigrate = 0; 4915 memcg->move_charge_at_immigrate = 0;
5017 mutex_init(&memcg->thresholds_lock); 4916 mutex_init(&memcg->thresholds_lock);
5018 spin_lock_init(&memcg->move_lock); 4917 spin_lock_init(&memcg->move_lock);
4918
4919 error = memcg_init_kmem(memcg, &mem_cgroup_subsys);
4920 if (error) {
4921 /*
4922 * We call put now because our (and parent's) refcnts
4923 * are already in place. mem_cgroup_put() will internally
4924 * call __mem_cgroup_free, so return directly
4925 */
4926 mem_cgroup_put(memcg);
4927 return ERR_PTR(error);
4928 }
5019 return &memcg->css; 4929 return &memcg->css;
5020free_out: 4930free_out:
5021 __mem_cgroup_free(memcg); 4931 __mem_cgroup_free(memcg);
@@ -5033,28 +4943,11 @@ static void mem_cgroup_destroy(struct cgroup *cont)
5033{ 4943{
5034 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); 4944 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
5035 4945
5036 kmem_cgroup_destroy(cont); 4946 kmem_cgroup_destroy(memcg);
5037 4947
5038 mem_cgroup_put(memcg); 4948 mem_cgroup_put(memcg);
5039} 4949}
5040 4950
5041static int mem_cgroup_populate(struct cgroup_subsys *ss,
5042 struct cgroup *cont)
5043{
5044 int ret;
5045
5046 ret = cgroup_add_files(cont, ss, mem_cgroup_files,
5047 ARRAY_SIZE(mem_cgroup_files));
5048
5049 if (!ret)
5050 ret = register_memsw_files(cont, ss);
5051
5052 if (!ret)
5053 ret = register_kmem_files(cont, ss);
5054
5055 return ret;
5056}
5057
5058#ifdef CONFIG_MMU 4951#ifdef CONFIG_MMU
5059/* Handlers for move charge at task migration. */ 4952/* Handlers for move charge at task migration. */
5060#define PRECHARGE_COUNT_AT_ONCE 256 4953#define PRECHARGE_COUNT_AT_ONCE 256
@@ -5147,7 +5040,7 @@ static struct page *mc_handle_present_pte(struct vm_area_struct *vma,
5147 return NULL; 5040 return NULL;
5148 if (PageAnon(page)) { 5041 if (PageAnon(page)) {
5149 /* we don't move shared anon */ 5042 /* we don't move shared anon */
5150 if (!move_anon() || page_mapcount(page) > 2) 5043 if (!move_anon())
5151 return NULL; 5044 return NULL;
5152 } else if (!move_file()) 5045 } else if (!move_file())
5153 /* we ignore mapcount for file pages */ 5046 /* we ignore mapcount for file pages */
@@ -5158,32 +5051,37 @@ static struct page *mc_handle_present_pte(struct vm_area_struct *vma,
5158 return page; 5051 return page;
5159} 5052}
5160 5053
5054#ifdef CONFIG_SWAP
5161static struct page *mc_handle_swap_pte(struct vm_area_struct *vma, 5055static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
5162 unsigned long addr, pte_t ptent, swp_entry_t *entry) 5056 unsigned long addr, pte_t ptent, swp_entry_t *entry)
5163{ 5057{
5164 int usage_count;
5165 struct page *page = NULL; 5058 struct page *page = NULL;
5166 swp_entry_t ent = pte_to_swp_entry(ptent); 5059 swp_entry_t ent = pte_to_swp_entry(ptent);
5167 5060
5168 if (!move_anon() || non_swap_entry(ent)) 5061 if (!move_anon() || non_swap_entry(ent))
5169 return NULL; 5062 return NULL;
5170 usage_count = mem_cgroup_count_swap_user(ent, &page); 5063 /*
5171 if (usage_count > 1) { /* we don't move shared anon */ 5064 * Because lookup_swap_cache() updates some statistics counter,
5172 if (page) 5065 * we call find_get_page() with swapper_space directly.
5173 put_page(page); 5066 */
5174 return NULL; 5067 page = find_get_page(&swapper_space, ent.val);
5175 }
5176 if (do_swap_account) 5068 if (do_swap_account)
5177 entry->val = ent.val; 5069 entry->val = ent.val;
5178 5070
5179 return page; 5071 return page;
5180} 5072}
5073#else
5074static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
5075 unsigned long addr, pte_t ptent, swp_entry_t *entry)
5076{
5077 return NULL;
5078}
5079#endif
5181 5080
5182static struct page *mc_handle_file_pte(struct vm_area_struct *vma, 5081static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
5183 unsigned long addr, pte_t ptent, swp_entry_t *entry) 5082 unsigned long addr, pte_t ptent, swp_entry_t *entry)
5184{ 5083{
5185 struct page *page = NULL; 5084 struct page *page = NULL;
5186 struct inode *inode;
5187 struct address_space *mapping; 5085 struct address_space *mapping;
5188 pgoff_t pgoff; 5086 pgoff_t pgoff;
5189 5087
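
The mc_handle_swap_pte() hunk above swaps lookup_swap_cache() for a direct find_get_page() on swapper_space so that probing the swap cache does not bump the swap-cache statistics. A hedged one-liner of that lookup wrapped in an invented helper; no error handling, and it assumes the single swapper_space of this kernel series:

#include <linux/swap.h>
#include <linux/pagemap.h>

static struct page *demo_peek_swap_cache(swp_entry_t ent)
{
	/* returns the cached page with an elevated refcount, or NULL */
	return find_get_page(&swapper_space, ent.val);
}
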
@@ -5192,7 +5090,6 @@ static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
5192 if (!move_file()) 5090 if (!move_file())
5193 return NULL; 5091 return NULL;
5194 5092
5195 inode = vma->vm_file->f_path.dentry->d_inode;
5196 mapping = vma->vm_file->f_mapping; 5093 mapping = vma->vm_file->f_mapping;
5197 if (pte_none(ptent)) 5094 if (pte_none(ptent))
5198 pgoff = linear_page_index(vma, addr); 5095 pgoff = linear_page_index(vma, addr);
@@ -5491,8 +5388,7 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
5491 if (!isolate_lru_page(page)) { 5388 if (!isolate_lru_page(page)) {
5492 pc = lookup_page_cgroup(page); 5389 pc = lookup_page_cgroup(page);
5493 if (!mem_cgroup_move_account(page, HPAGE_PMD_NR, 5390 if (!mem_cgroup_move_account(page, HPAGE_PMD_NR,
5494 pc, mc.from, mc.to, 5391 pc, mc.from, mc.to)) {
5495 false)) {
5496 mc.precharge -= HPAGE_PMD_NR; 5392 mc.precharge -= HPAGE_PMD_NR;
5497 mc.moved_charge += HPAGE_PMD_NR; 5393 mc.moved_charge += HPAGE_PMD_NR;
5498 } 5394 }
@@ -5522,7 +5418,7 @@ retry:
5522 goto put; 5418 goto put;
5523 pc = lookup_page_cgroup(page); 5419 pc = lookup_page_cgroup(page);
5524 if (!mem_cgroup_move_account(page, 1, pc, 5420 if (!mem_cgroup_move_account(page, 1, pc,
5525 mc.from, mc.to, false)) { 5421 mc.from, mc.to)) {
5526 mc.precharge--; 5422 mc.precharge--;
5527 /* we uncharge from mc.from later. */ 5423 /* we uncharge from mc.from later. */
5528 mc.moved_charge++; 5424 mc.moved_charge++;
@@ -5533,8 +5429,7 @@ put: /* get_mctgt_type() gets the page */
5533 break; 5429 break;
5534 case MC_TARGET_SWAP: 5430 case MC_TARGET_SWAP:
5535 ent = target.ent; 5431 ent = target.ent;
5536 if (!mem_cgroup_move_swap_account(ent, 5432 if (!mem_cgroup_move_swap_account(ent, mc.from, mc.to)) {
5537 mc.from, mc.to, false)) {
5538 mc.precharge--; 5433 mc.precharge--;
5539 /* we fixup refcnts and charges later. */ 5434 /* we fixup refcnts and charges later. */
5540 mc.moved_swap++; 5435 mc.moved_swap++;
@@ -5610,7 +5505,6 @@ static void mem_cgroup_move_task(struct cgroup *cont,
5610 if (mm) { 5505 if (mm) {
5611 if (mc.to) 5506 if (mc.to)
5612 mem_cgroup_move_charge(mm); 5507 mem_cgroup_move_charge(mm);
5613 put_swap_token(mm);
5614 mmput(mm); 5508 mmput(mm);
5615 } 5509 }
5616 if (mc.to) 5510 if (mc.to)
@@ -5638,12 +5532,13 @@ struct cgroup_subsys mem_cgroup_subsys = {
5638 .create = mem_cgroup_create, 5532 .create = mem_cgroup_create,
5639 .pre_destroy = mem_cgroup_pre_destroy, 5533 .pre_destroy = mem_cgroup_pre_destroy,
5640 .destroy = mem_cgroup_destroy, 5534 .destroy = mem_cgroup_destroy,
5641 .populate = mem_cgroup_populate,
5642 .can_attach = mem_cgroup_can_attach, 5535 .can_attach = mem_cgroup_can_attach,
5643 .cancel_attach = mem_cgroup_cancel_attach, 5536 .cancel_attach = mem_cgroup_cancel_attach,
5644 .attach = mem_cgroup_move_task, 5537 .attach = mem_cgroup_move_task,
5538 .base_cftypes = mem_cgroup_files,
5645 .early_init = 0, 5539 .early_init = 0,
5646 .use_id = 1, 5540 .use_id = 1,
5541 .__DEPRECATED_clear_css_refs = true,
5647}; 5542};
5648 5543
5649#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 5544#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
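
Taken together, the memcontrol.c hunks above replace the .populate callback and its cgroup_add_files() calls with one NULL-terminated cftype array registered through .base_cftypes. A hedged sketch of that shape, with demo_* handlers whose signatures follow the ones visible in the diff:

#include <linux/cgroup.h>
#include <linux/seq_file.h>
#include <linux/fs.h>

/* handler signatures follow the ones visible in the diff */
static ssize_t demo_read(struct cgroup *cont, struct cftype *cft,
			 struct file *file, char __user *buf,
			 size_t nbytes, loff_t *ppos);
static int demo_stat_show(struct cgroup *cont, struct cftype *cft,
			  struct seq_file *m);

static struct cftype demo_files[] = {
	{
		.name = "usage_in_bytes",
		.read = demo_read,		   /* raw read: formats the value itself */
	},
	{
		.name = "stat",
		.read_seq_string = demo_stat_show, /* seq_file based dump */
	},
	{ },	/* terminator required by .base_cftypes registration */
};

The array is then hooked up with .base_cftypes = demo_files in the subsystem declaration, which is where the diff adds .base_cftypes = mem_cgroup_files; the memsw entries can stay inside their CONFIG_CGROUP_MEM_RES_CTLR_SWAP block as long as the terminator entry sits after the #endif, exactly as the hunk arranges it.
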
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 97cc2733551a..ab1e7145e290 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1388,23 +1388,23 @@ static int get_any_page(struct page *p, unsigned long pfn, int flags)
1388 */ 1388 */
1389 if (!get_page_unless_zero(compound_head(p))) { 1389 if (!get_page_unless_zero(compound_head(p))) {
1390 if (PageHuge(p)) { 1390 if (PageHuge(p)) {
1391 pr_info("get_any_page: %#lx free huge page\n", pfn); 1391 pr_info("%s: %#lx free huge page\n", __func__, pfn);
1392 ret = dequeue_hwpoisoned_huge_page(compound_head(p)); 1392 ret = dequeue_hwpoisoned_huge_page(compound_head(p));
1393 } else if (is_free_buddy_page(p)) { 1393 } else if (is_free_buddy_page(p)) {
1394 pr_info("get_any_page: %#lx free buddy page\n", pfn); 1394 pr_info("%s: %#lx free buddy page\n", __func__, pfn);
1395 /* Set hwpoison bit while page is still isolated */ 1395 /* Set hwpoison bit while page is still isolated */
1396 SetPageHWPoison(p); 1396 SetPageHWPoison(p);
1397 ret = 0; 1397 ret = 0;
1398 } else { 1398 } else {
1399 pr_info("get_any_page: %#lx: unknown zero refcount page type %lx\n", 1399 pr_info("%s: %#lx: unknown zero refcount page type %lx\n",
1400 pfn, p->flags); 1400 __func__, pfn, p->flags);
1401 ret = -EIO; 1401 ret = -EIO;
1402 } 1402 }
1403 } else { 1403 } else {
1404 /* Not a free page */ 1404 /* Not a free page */
1405 ret = 1; 1405 ret = 1;
1406 } 1406 }
1407 unset_migratetype_isolate(p); 1407 unset_migratetype_isolate(p, MIGRATE_MOVABLE);
1408 unlock_memory_hotplug(); 1408 unlock_memory_hotplug();
1409 return ret; 1409 return ret;
1410} 1410}
diff --git a/mm/memory.c b/mm/memory.c
index 6105f475fa86..2466d1250231 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1225,7 +1225,15 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
1225 next = pmd_addr_end(addr, end); 1225 next = pmd_addr_end(addr, end);
1226 if (pmd_trans_huge(*pmd)) { 1226 if (pmd_trans_huge(*pmd)) {
1227 if (next - addr != HPAGE_PMD_SIZE) { 1227 if (next - addr != HPAGE_PMD_SIZE) {
1228 VM_BUG_ON(!rwsem_is_locked(&tlb->mm->mmap_sem)); 1228#ifdef CONFIG_DEBUG_VM
1229 if (!rwsem_is_locked(&tlb->mm->mmap_sem)) {
1230 pr_err("%s: mmap_sem is unlocked! addr=0x%lx end=0x%lx vma->vm_start=0x%lx vma->vm_end=0x%lx\n",
1231 __func__, addr, end,
1232 vma->vm_start,
1233 vma->vm_end);
1234 BUG();
1235 }
1236#endif
1229 split_huge_page_pmd(vma->vm_mm, pmd); 1237 split_huge_page_pmd(vma->vm_mm, pmd);
1230 } else if (zap_huge_pmd(tlb, vma, pmd, addr)) 1238 } else if (zap_huge_pmd(tlb, vma, pmd, addr))
1231 goto next; 1239 goto next;
@@ -1295,7 +1303,7 @@ static void unmap_page_range(struct mmu_gather *tlb,
1295 1303
1296static void unmap_single_vma(struct mmu_gather *tlb, 1304static void unmap_single_vma(struct mmu_gather *tlb,
1297 struct vm_area_struct *vma, unsigned long start_addr, 1305 struct vm_area_struct *vma, unsigned long start_addr,
1298 unsigned long end_addr, unsigned long *nr_accounted, 1306 unsigned long end_addr,
1299 struct zap_details *details) 1307 struct zap_details *details)
1300{ 1308{
1301 unsigned long start = max(vma->vm_start, start_addr); 1309 unsigned long start = max(vma->vm_start, start_addr);
@@ -1307,8 +1315,8 @@ static void unmap_single_vma(struct mmu_gather *tlb,
1307 if (end <= vma->vm_start) 1315 if (end <= vma->vm_start)
1308 return; 1316 return;
1309 1317
1310 if (vma->vm_flags & VM_ACCOUNT) 1318 if (vma->vm_file)
1311 *nr_accounted += (end - start) >> PAGE_SHIFT; 1319 uprobe_munmap(vma, start, end);
1312 1320
1313 if (unlikely(is_pfn_mapping(vma))) 1321 if (unlikely(is_pfn_mapping(vma)))
1314 untrack_pfn_vma(vma, 0, 0); 1322 untrack_pfn_vma(vma, 0, 0);
@@ -1339,8 +1347,6 @@ static void unmap_single_vma(struct mmu_gather *tlb,
1339 * @vma: the starting vma 1347 * @vma: the starting vma
1340 * @start_addr: virtual address at which to start unmapping 1348 * @start_addr: virtual address at which to start unmapping
1341 * @end_addr: virtual address at which to end unmapping 1349 * @end_addr: virtual address at which to end unmapping
1342 * @nr_accounted: Place number of unmapped pages in vm-accountable vma's here
1343 * @details: details of nonlinear truncation or shared cache invalidation
1344 * 1350 *
1345 * Unmap all pages in the vma list. 1351 * Unmap all pages in the vma list.
1346 * 1352 *
@@ -1355,40 +1361,40 @@ static void unmap_single_vma(struct mmu_gather *tlb,
1355 */ 1361 */
1356void unmap_vmas(struct mmu_gather *tlb, 1362void unmap_vmas(struct mmu_gather *tlb,
1357 struct vm_area_struct *vma, unsigned long start_addr, 1363 struct vm_area_struct *vma, unsigned long start_addr,
1358 unsigned long end_addr, unsigned long *nr_accounted, 1364 unsigned long end_addr)
1359 struct zap_details *details)
1360{ 1365{
1361 struct mm_struct *mm = vma->vm_mm; 1366 struct mm_struct *mm = vma->vm_mm;
1362 1367
1363 mmu_notifier_invalidate_range_start(mm, start_addr, end_addr); 1368 mmu_notifier_invalidate_range_start(mm, start_addr, end_addr);
1364 for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next) 1369 for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next)
1365 unmap_single_vma(tlb, vma, start_addr, end_addr, nr_accounted, 1370 unmap_single_vma(tlb, vma, start_addr, end_addr, NULL);
1366 details);
1367 mmu_notifier_invalidate_range_end(mm, start_addr, end_addr); 1371 mmu_notifier_invalidate_range_end(mm, start_addr, end_addr);
1368} 1372}
1369 1373
1370/** 1374/**
1371 * zap_page_range - remove user pages in a given range 1375 * zap_page_range - remove user pages in a given range
1372 * @vma: vm_area_struct holding the applicable pages 1376 * @vma: vm_area_struct holding the applicable pages
1373 * @address: starting address of pages to zap 1377 * @start: starting address of pages to zap
1374 * @size: number of bytes to zap 1378 * @size: number of bytes to zap
1375 * @details: details of nonlinear truncation or shared cache invalidation 1379 * @details: details of nonlinear truncation or shared cache invalidation
1376 * 1380 *
1377 * Caller must protect the VMA list 1381 * Caller must protect the VMA list
1378 */ 1382 */
1379void zap_page_range(struct vm_area_struct *vma, unsigned long address, 1383void zap_page_range(struct vm_area_struct *vma, unsigned long start,
1380 unsigned long size, struct zap_details *details) 1384 unsigned long size, struct zap_details *details)
1381{ 1385{
1382 struct mm_struct *mm = vma->vm_mm; 1386 struct mm_struct *mm = vma->vm_mm;
1383 struct mmu_gather tlb; 1387 struct mmu_gather tlb;
1384 unsigned long end = address + size; 1388 unsigned long end = start + size;
1385 unsigned long nr_accounted = 0;
1386 1389
1387 lru_add_drain(); 1390 lru_add_drain();
1388 tlb_gather_mmu(&tlb, mm, 0); 1391 tlb_gather_mmu(&tlb, mm, 0);
1389 update_hiwater_rss(mm); 1392 update_hiwater_rss(mm);
1390 unmap_vmas(&tlb, vma, address, end, &nr_accounted, details); 1393 mmu_notifier_invalidate_range_start(mm, start, end);
1391 tlb_finish_mmu(&tlb, address, end); 1394 for ( ; vma && vma->vm_start < end; vma = vma->vm_next)
1395 unmap_single_vma(&tlb, vma, start, end, details);
1396 mmu_notifier_invalidate_range_end(mm, start, end);
1397 tlb_finish_mmu(&tlb, start, end);
1392} 1398}
1393 1399
1394/** 1400/**
@@ -1406,13 +1412,12 @@ static void zap_page_range_single(struct vm_area_struct *vma, unsigned long addr
1406 struct mm_struct *mm = vma->vm_mm; 1412 struct mm_struct *mm = vma->vm_mm;
1407 struct mmu_gather tlb; 1413 struct mmu_gather tlb;
1408 unsigned long end = address + size; 1414 unsigned long end = address + size;
1409 unsigned long nr_accounted = 0;
1410 1415
1411 lru_add_drain(); 1416 lru_add_drain();
1412 tlb_gather_mmu(&tlb, mm, 0); 1417 tlb_gather_mmu(&tlb, mm, 0);
1413 update_hiwater_rss(mm); 1418 update_hiwater_rss(mm);
1414 mmu_notifier_invalidate_range_start(mm, address, end); 1419 mmu_notifier_invalidate_range_start(mm, address, end);
1415 unmap_single_vma(&tlb, vma, address, end, &nr_accounted, details); 1420 unmap_single_vma(&tlb, vma, address, end, details);
1416 mmu_notifier_invalidate_range_end(mm, address, end); 1421 mmu_notifier_invalidate_range_end(mm, address, end);
1417 tlb_finish_mmu(&tlb, address, end); 1422 tlb_finish_mmu(&tlb, address, end);
1418} 1423}
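
Both zap paths in the hunks above converge on the same bracket: a per-range mmu_gather plus mmu-notifier start/end calls around the per-VMA unmap, with the nr_accounted plumbing removed. A hedged sketch of that shape; demo_unmap_vma() stands in for unmap_single_vma(), and the three-argument tlb_gather_mmu() call matches the one shown in the diff, not later kernels.

#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/mmu_notifier.h>
#include <asm/tlb.h>

static void demo_unmap_vma(struct mmu_gather *tlb, struct vm_area_struct *vma,
			   unsigned long start, unsigned long end);

static void demo_zap_range(struct vm_area_struct *vma, unsigned long start,
			   unsigned long end)
{
	struct mm_struct *mm = vma->vm_mm;
	struct mmu_gather tlb;

	lru_add_drain();
	tlb_gather_mmu(&tlb, mm, 0);
	update_hiwater_rss(mm);
	mmu_notifier_invalidate_range_start(mm, start, end);
	for ( ; vma && vma->vm_start < end; vma = vma->vm_next)
		demo_unmap_vma(&tlb, vma, start, end);
	mmu_notifier_invalidate_range_end(mm, start, end);
	tlb_finish_mmu(&tlb, start, end);
}
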
@@ -2911,7 +2916,6 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
2911 delayacct_set_flag(DELAYACCT_PF_SWAPIN); 2916 delayacct_set_flag(DELAYACCT_PF_SWAPIN);
2912 page = lookup_swap_cache(entry); 2917 page = lookup_swap_cache(entry);
2913 if (!page) { 2918 if (!page) {
2914 grab_swap_token(mm); /* Contend for token _before_ read-in */
2915 page = swapin_readahead(entry, 2919 page = swapin_readahead(entry,
2916 GFP_HIGHUSER_MOVABLE, vma, address); 2920 GFP_HIGHUSER_MOVABLE, vma, address);
2917 if (!page) { 2921 if (!page) {
@@ -2941,6 +2945,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
2941 } 2945 }
2942 2946
2943 locked = lock_page_or_retry(page, mm, flags); 2947 locked = lock_page_or_retry(page, mm, flags);
2948
2944 delayacct_clear_flag(DELAYACCT_PF_SWAPIN); 2949 delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
2945 if (!locked) { 2950 if (!locked) {
2946 ret |= VM_FAULT_RETRY; 2951 ret |= VM_FAULT_RETRY;
@@ -3489,6 +3494,7 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3489 if (unlikely(is_vm_hugetlb_page(vma))) 3494 if (unlikely(is_vm_hugetlb_page(vma)))
3490 return hugetlb_fault(mm, vma, address, flags); 3495 return hugetlb_fault(mm, vma, address, flags);
3491 3496
3497retry:
3492 pgd = pgd_offset(mm, address); 3498 pgd = pgd_offset(mm, address);
3493 pud = pud_alloc(mm, pgd, address); 3499 pud = pud_alloc(mm, pgd, address);
3494 if (!pud) 3500 if (!pud)
@@ -3502,13 +3508,24 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3502 pmd, flags); 3508 pmd, flags);
3503 } else { 3509 } else {
3504 pmd_t orig_pmd = *pmd; 3510 pmd_t orig_pmd = *pmd;
3511 int ret;
3512
3505 barrier(); 3513 barrier();
3506 if (pmd_trans_huge(orig_pmd)) { 3514 if (pmd_trans_huge(orig_pmd)) {
3507 if (flags & FAULT_FLAG_WRITE && 3515 if (flags & FAULT_FLAG_WRITE &&
3508 !pmd_write(orig_pmd) && 3516 !pmd_write(orig_pmd) &&
3509 !pmd_trans_splitting(orig_pmd)) 3517 !pmd_trans_splitting(orig_pmd)) {
3510 return do_huge_pmd_wp_page(mm, vma, address, 3518 ret = do_huge_pmd_wp_page(mm, vma, address, pmd,
3511 pmd, orig_pmd); 3519 orig_pmd);
3520 /*
3521 * If COW results in an oom, the huge pmd will
3522 * have been split, so retry the fault on the
3523 * pte for a smaller charge.
3524 */
3525 if (unlikely(ret & VM_FAULT_OOM))
3526 goto retry;
3527 return ret;
3528 }
3512 return 0; 3529 return 0;
3513 } 3530 }
3514 } 3531 }
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 6629fafd6ce4..427bb291dd0f 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -74,8 +74,7 @@ static struct resource *register_memory_resource(u64 start, u64 size)
74 res->end = start + size - 1; 74 res->end = start + size - 1;
75 res->flags = IORESOURCE_MEM | IORESOURCE_BUSY; 75 res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
76 if (request_resource(&iomem_resource, res) < 0) { 76 if (request_resource(&iomem_resource, res) < 0) {
77 printk("System RAM resource %llx - %llx cannot be added\n", 77 printk("System RAM resource %pR cannot be added\n", res);
78 (unsigned long long)res->start, (unsigned long long)res->end);
79 kfree(res); 78 kfree(res);
80 res = NULL; 79 res = NULL;
81 } 80 }
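
The hunk above replaces the hand-rolled pair of %llx arguments with the %pR printk extension, which prints a struct resource range together with its flags. A hedged sketch with made-up resource values:

#include <linux/ioport.h>
#include <linux/printk.h>

static void demo_report(void)
{
	struct resource res = {
		.start = 0x80000000,
		.end   = 0xbfffffff,
		.flags = IORESOURCE_MEM | IORESOURCE_BUSY,
	};

	printk(KERN_INFO "System RAM resource %pR cannot be added\n", &res);
}
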
@@ -502,8 +501,10 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages)
502 online_pages_range); 501 online_pages_range);
503 if (ret) { 502 if (ret) {
504 mutex_unlock(&zonelists_mutex); 503 mutex_unlock(&zonelists_mutex);
505 printk(KERN_DEBUG "online_pages %lx at %lx failed\n", 504 printk(KERN_DEBUG "online_pages [mem %#010llx-%#010llx] failed\n",
506 nr_pages, pfn); 505 (unsigned long long) pfn << PAGE_SHIFT,
506 (((unsigned long long) pfn + nr_pages)
507 << PAGE_SHIFT) - 1);
507 memory_notify(MEM_CANCEL_ONLINE, &arg); 508 memory_notify(MEM_CANCEL_ONLINE, &arg);
508 unlock_memory_hotplug(); 509 unlock_memory_hotplug();
509 return ret; 510 return ret;
@@ -617,7 +618,7 @@ int __ref add_memory(int nid, u64 start, u64 size)
617 pgdat = hotadd_new_pgdat(nid, start); 618 pgdat = hotadd_new_pgdat(nid, start);
618 ret = -ENOMEM; 619 ret = -ENOMEM;
619 if (!pgdat) 620 if (!pgdat)
620 goto out; 621 goto error;
621 new_pgdat = 1; 622 new_pgdat = 1;
622 } 623 }
623 624
@@ -891,7 +892,7 @@ static int __ref offline_pages(unsigned long start_pfn,
891 nr_pages = end_pfn - start_pfn; 892 nr_pages = end_pfn - start_pfn;
892 893
893 /* set above range as isolated */ 894 /* set above range as isolated */
894 ret = start_isolate_page_range(start_pfn, end_pfn); 895 ret = start_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE);
895 if (ret) 896 if (ret)
896 goto out; 897 goto out;
897 898
@@ -956,7 +957,7 @@ repeat:
956 We cannot do rollback at this point. */ 957 We cannot do rollback at this point. */
957 offline_isolated_pages(start_pfn, end_pfn); 958 offline_isolated_pages(start_pfn, end_pfn);
958 /* reset pagetype flags and makes migrate type to be MOVABLE */ 959 /* reset pagetype flags and makes migrate type to be MOVABLE */
959 undo_isolate_page_range(start_pfn, end_pfn); 960 undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE);
960 /* removal success */ 961 /* removal success */
961 zone->present_pages -= offlined_pages; 962 zone->present_pages -= offlined_pages;
962 zone->zone_pgdat->node_present_pages -= offlined_pages; 963 zone->zone_pgdat->node_present_pages -= offlined_pages;
@@ -977,11 +978,12 @@ repeat:
977 return 0; 978 return 0;
978 979
979failed_removal: 980failed_removal:
980 printk(KERN_INFO "memory offlining %lx to %lx failed\n", 981 printk(KERN_INFO "memory offlining [mem %#010llx-%#010llx] failed\n",
981 start_pfn, end_pfn); 982 (unsigned long long) start_pfn << PAGE_SHIFT,
983 ((unsigned long long) end_pfn << PAGE_SHIFT) - 1);
982 memory_notify(MEM_CANCEL_OFFLINE, &arg); 984 memory_notify(MEM_CANCEL_OFFLINE, &arg);
983 /* pushback to free area */ 985 /* pushback to free area */
984 undo_isolate_page_range(start_pfn, end_pfn); 986 undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE);
985 987
986out: 988out:
987 unlock_memory_hotplug(); 989 unlock_memory_hotplug();
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index b19569137529..1d771e4200d2 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -390,7 +390,7 @@ static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask,
390{ 390{
391 if (!pol) 391 if (!pol)
392 return; 392 return;
393 if (!mpol_store_user_nodemask(pol) && step == 0 && 393 if (!mpol_store_user_nodemask(pol) && step == MPOL_REBIND_ONCE &&
394 nodes_equal(pol->w.cpuset_mems_allowed, *newmask)) 394 nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
395 return; 395 return;
396 396
@@ -607,27 +607,6 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
607 return first; 607 return first;
608} 608}
609 609
610/* Apply policy to a single VMA */
611static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new)
612{
613 int err = 0;
614 struct mempolicy *old = vma->vm_policy;
615
616 pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
617 vma->vm_start, vma->vm_end, vma->vm_pgoff,
618 vma->vm_ops, vma->vm_file,
619 vma->vm_ops ? vma->vm_ops->set_policy : NULL);
620
621 if (vma->vm_ops && vma->vm_ops->set_policy)
622 err = vma->vm_ops->set_policy(vma, new);
623 if (!err) {
624 mpol_get(new);
625 vma->vm_policy = new;
626 mpol_put(old);
627 }
628 return err;
629}
630
631/* Step 2: apply policy to a range and do splits. */ 610/* Step 2: apply policy to a range and do splits. */
632static int mbind_range(struct mm_struct *mm, unsigned long start, 611static int mbind_range(struct mm_struct *mm, unsigned long start,
633 unsigned long end, struct mempolicy *new_pol) 612 unsigned long end, struct mempolicy *new_pol)
@@ -676,9 +655,23 @@ static int mbind_range(struct mm_struct *mm, unsigned long start,
676 if (err) 655 if (err)
677 goto out; 656 goto out;
678 } 657 }
679 err = policy_vma(vma, new_pol); 658
680 if (err) 659 /*
681 goto out; 660 * Apply policy to a single VMA. The reference counting of
661 * policy for vma_policy linkages has already been handled by
662 * vma_merge and split_vma as necessary. If this is a shared
663 * policy then ->set_policy will increment the reference count
664 * for an sp node.
665 */
666 pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
667 vma->vm_start, vma->vm_end, vma->vm_pgoff,
668 vma->vm_ops, vma->vm_file,
669 vma->vm_ops ? vma->vm_ops->set_policy : NULL);
670 if (vma->vm_ops && vma->vm_ops->set_policy) {
671 err = vma->vm_ops->set_policy(vma, new_pol);
672 if (err)
673 goto out;
674 }
682 } 675 }
683 676
684 out: 677 out:
@@ -957,8 +950,8 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest,
957 * 950 *
958 * Returns the number of page that could not be moved. 951 * Returns the number of page that could not be moved.
959 */ 952 */
960int do_migrate_pages(struct mm_struct *mm, 953int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
961 const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags) 954 const nodemask_t *to, int flags)
962{ 955{
963 int busy = 0; 956 int busy = 0;
964 int err; 957 int err;
@@ -970,7 +963,7 @@ int do_migrate_pages(struct mm_struct *mm,
970 963
971 down_read(&mm->mmap_sem); 964 down_read(&mm->mmap_sem);
972 965
973 err = migrate_vmas(mm, from_nodes, to_nodes, flags); 966 err = migrate_vmas(mm, from, to, flags);
974 if (err) 967 if (err)
975 goto out; 968 goto out;
976 969
@@ -1005,14 +998,34 @@ int do_migrate_pages(struct mm_struct *mm,
1005 * moved to an empty node, then there is nothing left worth migrating. 998 * moved to an empty node, then there is nothing left worth migrating.
1006 */ 999 */
1007 1000
1008 tmp = *from_nodes; 1001 tmp = *from;
1009 while (!nodes_empty(tmp)) { 1002 while (!nodes_empty(tmp)) {
1010 int s,d; 1003 int s,d;
1011 int source = -1; 1004 int source = -1;
1012 int dest = 0; 1005 int dest = 0;
1013 1006
1014 for_each_node_mask(s, tmp) { 1007 for_each_node_mask(s, tmp) {
1015 d = node_remap(s, *from_nodes, *to_nodes); 1008
1009 /*
1010 * do_migrate_pages() tries to maintain the relative
1011 * node relationship of the pages established between
1012 * threads and memory areas.
1013 *
1014 * However if the number of source nodes is not equal to
1015 * the number of destination nodes we can not preserve
1016 * this node relative relationship. In that case, skip
1017 * copying memory from a node that is in the destination
1018 * mask.
1019 *
1020 * Example: [2,3,4] -> [3,4,5] moves everything.
1021 * [0-7] - > [3,4,5] moves only 0,1,2,6,7.
1022 */
1023
1024 if ((nodes_weight(*from) != nodes_weight(*to)) &&
1025 (node_isset(s, *to)))
1026 continue;
1027
1028 d = node_remap(s, *from, *to);
1016 if (s == d) 1029 if (s == d)
1017 continue; 1030 continue;
1018 1031
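
The new comment in the hunk above describes the skip rule: when the source and destination node sets have different sizes, a source node that is already in the destination set is left alone, so [0-7] -> [3,4,5] moves only 0,1,2,6,7. A runnable userspace illustration of that rule using plain bitmasks instead of nodemask_t, purely to reproduce the worked example:

#include <stdio.h>

static int weight(unsigned long mask)
{
	return __builtin_popcountl(mask);
}

int main(void)
{
	unsigned long from = 0xffUL;				   /* nodes 0-7 */
	unsigned long to   = (1UL << 3) | (1UL << 4) | (1UL << 5); /* nodes 3,4,5 */
	int s;

	for (s = 0; s < 8; s++) {
		if (!(from & (1UL << s)))
			continue;
		if (weight(from) != weight(to) && (to & (1UL << s)))
			continue;	/* already in the destination set: skip */
		printf("would migrate pages off node %d\n", s);
	}
	return 0;	/* prints nodes 0, 1, 2, 6, 7 */
}
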
@@ -1072,8 +1085,8 @@ static void migrate_page_add(struct page *page, struct list_head *pagelist,
1072{ 1085{
1073} 1086}
1074 1087
1075int do_migrate_pages(struct mm_struct *mm, 1088int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
1076 const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags) 1089 const nodemask_t *to, int flags)
1077{ 1090{
1078 return -ENOSYS; 1091 return -ENOSYS;
1079} 1092}
@@ -1164,7 +1177,7 @@ static long do_mbind(unsigned long start, unsigned long len,
1164 if (!list_empty(&pagelist)) { 1177 if (!list_empty(&pagelist)) {
1165 nr_failed = migrate_pages(&pagelist, new_vma_page, 1178 nr_failed = migrate_pages(&pagelist, new_vma_page,
1166 (unsigned long)vma, 1179 (unsigned long)vma,
1167 false, true); 1180 false, MIGRATE_SYNC);
1168 if (nr_failed) 1181 if (nr_failed)
1169 putback_lru_pages(&pagelist); 1182 putback_lru_pages(&pagelist);
1170 } 1183 }
@@ -1334,8 +1347,8 @@ SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
1334 * userid as the target process. 1347 * userid as the target process.
1335 */ 1348 */
1336 tcred = __task_cred(task); 1349 tcred = __task_cred(task);
1337 if (cred->euid != tcred->suid && cred->euid != tcred->uid && 1350 if (!uid_eq(cred->euid, tcred->suid) && !uid_eq(cred->euid, tcred->uid) &&
1338 cred->uid != tcred->suid && cred->uid != tcred->uid && 1351 !uid_eq(cred->uid, tcred->suid) && !uid_eq(cred->uid, tcred->uid) &&
1339 !capable(CAP_SYS_NICE)) { 1352 !capable(CAP_SYS_NICE)) {
1340 rcu_read_unlock(); 1353 rcu_read_unlock();
1341 err = -EPERM; 1354 err = -EPERM;
diff --git a/mm/migrate.c b/mm/migrate.c
index 11072383ae12..be26d5cbe56b 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -436,7 +436,10 @@ void migrate_page_copy(struct page *newpage, struct page *page)
436 * is actually a signal that all of the page has become dirty. 436 * is actually a signal that all of the page has become dirty.
437 * Whereas only part of our page may be dirty. 437 * Whereas only part of our page may be dirty.
438 */ 438 */
439 __set_page_dirty_nobuffers(newpage); 439 if (PageSwapBacked(page))
440 SetPageDirty(newpage);
441 else
442 __set_page_dirty_nobuffers(newpage);
440 } 443 }
441 444
442 mlock_migrate_page(newpage, page); 445 mlock_migrate_page(newpage, page);
@@ -1371,8 +1374,8 @@ SYSCALL_DEFINE6(move_pages, pid_t, pid, unsigned long, nr_pages,
1371 * userid as the target process. 1374 * userid as the target process.
1372 */ 1375 */
1373 tcred = __task_cred(task); 1376 tcred = __task_cred(task);
1374 if (cred->euid != tcred->suid && cred->euid != tcred->uid && 1377 if (!uid_eq(cred->euid, tcred->suid) && !uid_eq(cred->euid, tcred->uid) &&
1375 cred->uid != tcred->suid && cred->uid != tcred->uid && 1378 !uid_eq(cred->uid, tcred->suid) && !uid_eq(cred->uid, tcred->uid) &&
1376 !capable(CAP_SYS_NICE)) { 1379 !capable(CAP_SYS_NICE)) {
1377 rcu_read_unlock(); 1380 rcu_read_unlock();
1378 err = -EPERM; 1381 err = -EPERM;
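
The credential hunks in mempolicy.c and migrate.c above make the same conversion: raw uid comparisons become uid_eq() calls, since the cred uids are typed kuid_t values in this series. A hedged sketch of the combined permission check as an invented boolean helper; the real call sites return -EPERM when this fails.

#include <linux/cred.h>
#include <linux/uidgid.h>
#include <linux/capability.h>

static bool demo_may_act_on(const struct cred *cred, const struct cred *tcred)
{
	if (uid_eq(cred->euid, tcred->suid) || uid_eq(cred->euid, tcred->uid) ||
	    uid_eq(cred->uid, tcred->suid) || uid_eq(cred->uid, tcred->uid))
		return true;
	return capable(CAP_SYS_NICE);
}
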
diff --git a/mm/mmap.c b/mm/mmap.c
index 848ef52d9603..3edfcdfa42d9 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -30,6 +30,7 @@
30#include <linux/perf_event.h> 30#include <linux/perf_event.h>
31#include <linux/audit.h> 31#include <linux/audit.h>
32#include <linux/khugepaged.h> 32#include <linux/khugepaged.h>
33#include <linux/uprobes.h>
33 34
34#include <asm/uaccess.h> 35#include <asm/uaccess.h>
35#include <asm/cacheflush.h> 36#include <asm/cacheflush.h>
@@ -546,8 +547,15 @@ again: remove_next = 1 + (end > next->vm_end);
546 547
547 if (file) { 548 if (file) {
548 mapping = file->f_mapping; 549 mapping = file->f_mapping;
549 if (!(vma->vm_flags & VM_NONLINEAR)) 550 if (!(vma->vm_flags & VM_NONLINEAR)) {
550 root = &mapping->i_mmap; 551 root = &mapping->i_mmap;
552 uprobe_munmap(vma, vma->vm_start, vma->vm_end);
553
554 if (adjust_next)
555 uprobe_munmap(next, next->vm_start,
556 next->vm_end);
557 }
558
551 mutex_lock(&mapping->i_mmap_mutex); 559 mutex_lock(&mapping->i_mmap_mutex);
552 if (insert) { 560 if (insert) {
553 /* 561 /*
@@ -617,8 +625,16 @@ again: remove_next = 1 + (end > next->vm_end);
617 if (mapping) 625 if (mapping)
618 mutex_unlock(&mapping->i_mmap_mutex); 626 mutex_unlock(&mapping->i_mmap_mutex);
619 627
628 if (root) {
629 uprobe_mmap(vma);
630
631 if (adjust_next)
632 uprobe_mmap(next);
633 }
634
620 if (remove_next) { 635 if (remove_next) {
621 if (file) { 636 if (file) {
637 uprobe_munmap(next, next->vm_start, next->vm_end);
622 fput(file); 638 fput(file);
623 if (next->vm_flags & VM_EXECUTABLE) 639 if (next->vm_flags & VM_EXECUTABLE)
624 removed_exe_file_vma(mm); 640 removed_exe_file_vma(mm);
@@ -638,6 +654,8 @@ again: remove_next = 1 + (end > next->vm_end);
638 goto again; 654 goto again;
639 } 655 }
640 } 656 }
657 if (insert && file)
658 uprobe_mmap(insert);
641 659
642 validate_mm(mm); 660 validate_mm(mm);
643 661
@@ -953,15 +971,13 @@ static inline unsigned long round_hint_to_min(unsigned long hint)
953 * The caller must hold down_write(&current->mm->mmap_sem). 971 * The caller must hold down_write(&current->mm->mmap_sem).
954 */ 972 */
955 973
956static unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, 974unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
957 unsigned long len, unsigned long prot, 975 unsigned long len, unsigned long prot,
958 unsigned long flags, unsigned long pgoff) 976 unsigned long flags, unsigned long pgoff)
959{ 977{
960 struct mm_struct * mm = current->mm; 978 struct mm_struct * mm = current->mm;
961 struct inode *inode; 979 struct inode *inode;
962 vm_flags_t vm_flags; 980 vm_flags_t vm_flags;
963 int error;
964 unsigned long reqprot = prot;
965 981
966 /* 982 /*
967 * Does the application expect PROT_READ to imply PROT_EXEC? 983 * Does the application expect PROT_READ to imply PROT_EXEC?
@@ -1083,39 +1099,9 @@ static unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
1083 } 1099 }
1084 } 1100 }
1085 1101
1086 error = security_file_mmap(file, reqprot, prot, flags, addr, 0);
1087 if (error)
1088 return error;
1089
1090 return mmap_region(file, addr, len, flags, vm_flags, pgoff); 1102 return mmap_region(file, addr, len, flags, vm_flags, pgoff);
1091} 1103}
1092 1104
1093unsigned long do_mmap(struct file *file, unsigned long addr,
1094 unsigned long len, unsigned long prot,
1095 unsigned long flag, unsigned long offset)
1096{
1097 if (unlikely(offset + PAGE_ALIGN(len) < offset))
1098 return -EINVAL;
1099 if (unlikely(offset & ~PAGE_MASK))
1100 return -EINVAL;
1101 return do_mmap_pgoff(file, addr, len, prot, flag, offset >> PAGE_SHIFT);
1102}
1103EXPORT_SYMBOL(do_mmap);
1104
1105unsigned long vm_mmap(struct file *file, unsigned long addr,
1106 unsigned long len, unsigned long prot,
1107 unsigned long flag, unsigned long offset)
1108{
1109 unsigned long ret;
1110 struct mm_struct *mm = current->mm;
1111
1112 down_write(&mm->mmap_sem);
1113 ret = do_mmap(file, addr, len, prot, flag, offset);
1114 up_write(&mm->mmap_sem);
1115 return ret;
1116}
1117EXPORT_SYMBOL(vm_mmap);
1118
1119SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len, 1105SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
1120 unsigned long, prot, unsigned long, flags, 1106 unsigned long, prot, unsigned long, flags,
1121 unsigned long, fd, unsigned long, pgoff) 1107 unsigned long, fd, unsigned long, pgoff)
@@ -1147,10 +1133,7 @@ SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
1147 1133
1148 flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); 1134 flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE);
1149 1135
1150 down_write(&current->mm->mmap_sem); 1136 retval = vm_mmap_pgoff(file, addr, len, prot, flags, pgoff);
1151 retval = do_mmap_pgoff(file, addr, len, prot, flags, pgoff);
1152 up_write(&current->mm->mmap_sem);
1153
1154 if (file) 1137 if (file)
1155 fput(file); 1138 fput(file);
1156out: 1139out:
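
With do_mmap() and vm_mmap() removed here, mmap_pgoff() calls vm_mmap_pgoff(), which takes mmap_sem itself; the byte-offset sanity checks the deleted wrapper used to perform (page-aligned offset, no wrap when adding the page-rounded length) presumably live on in the surviving wrapper (the diffstat shows mm/util.c changing as well). A small sketch of just that validation arithmetic, assuming 4 KiB pages:

    #include <stdio.h>

    #define PAGE_SHIFT 12
    #define PAGE_SIZE  (1UL << PAGE_SHIFT)
    #define PAGE_MASK  (~(PAGE_SIZE - 1))
    #define PAGE_ALIGN(x) (((x) + PAGE_SIZE - 1) & PAGE_MASK)

    /* Return the page offset to pass on, or -1 on invalid input, mirroring
     * the checks the removed do_mmap() wrapper used to make. */
    static long offset_to_pgoff(unsigned long offset, unsigned long len)
    {
        if (offset + PAGE_ALIGN(len) < offset)   /* wrap-around */
            return -1;
        if (offset & ~PAGE_MASK)                 /* not page aligned */
            return -1;
        return (long)(offset >> PAGE_SHIFT);
    }

    int main(void)
    {
        printf("%ld\n", offset_to_pgoff(0x3000, 100));   /* 3 */
        printf("%ld\n", offset_to_pgoff(0x3001, 100));   /* -1: unaligned */
        printf("%ld\n", offset_to_pgoff(~0UL, 0x2000));  /* -1: overflow */
        return 0;
    }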
@@ -1371,6 +1354,11 @@ out:
1371 mm->locked_vm += (len >> PAGE_SHIFT); 1354 mm->locked_vm += (len >> PAGE_SHIFT);
1372 } else if ((flags & MAP_POPULATE) && !(flags & MAP_NONBLOCK)) 1355 } else if ((flags & MAP_POPULATE) && !(flags & MAP_NONBLOCK))
1373 make_pages_present(addr, addr + len); 1356 make_pages_present(addr, addr + len);
1357
1358 if (file && uprobe_mmap(vma))
1359 /* matching probes but cannot insert */
1360 goto unmap_and_free_vma;
1361
1374 return addr; 1362 return addr;
1375 1363
1376unmap_and_free_vma: 1364unmap_and_free_vma:
@@ -1606,7 +1594,9 @@ get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
1606 if (addr & ~PAGE_MASK) 1594 if (addr & ~PAGE_MASK)
1607 return -EINVAL; 1595 return -EINVAL;
1608 1596
1609 return arch_rebalance_pgtables(addr, len); 1597 addr = arch_rebalance_pgtables(addr, len);
1598 error = security_mmap_addr(addr);
1599 return error ? error : addr;
1610} 1600}
1611 1601
1612EXPORT_SYMBOL(get_unmapped_area); 1602EXPORT_SYMBOL(get_unmapped_area);
@@ -1616,33 +1606,34 @@ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
1616{ 1606{
1617 struct vm_area_struct *vma = NULL; 1607 struct vm_area_struct *vma = NULL;
1618 1608
1619 if (mm) { 1609 if (WARN_ON_ONCE(!mm)) /* Remove this in linux-3.6 */
1620 /* Check the cache first. */ 1610 return NULL;
1621 /* (Cache hit rate is typically around 35%.) */ 1611
1622 vma = mm->mmap_cache; 1612 /* Check the cache first. */
1623 if (!(vma && vma->vm_end > addr && vma->vm_start <= addr)) { 1613 /* (Cache hit rate is typically around 35%.) */
1624 struct rb_node * rb_node; 1614 vma = mm->mmap_cache;
1625 1615 if (!(vma && vma->vm_end > addr && vma->vm_start <= addr)) {
1626 rb_node = mm->mm_rb.rb_node; 1616 struct rb_node *rb_node;
1627 vma = NULL; 1617
1628 1618 rb_node = mm->mm_rb.rb_node;
1629 while (rb_node) { 1619 vma = NULL;
1630 struct vm_area_struct * vma_tmp; 1620
1631 1621 while (rb_node) {
1632 vma_tmp = rb_entry(rb_node, 1622 struct vm_area_struct *vma_tmp;
1633 struct vm_area_struct, vm_rb); 1623
1634 1624 vma_tmp = rb_entry(rb_node,
1635 if (vma_tmp->vm_end > addr) { 1625 struct vm_area_struct, vm_rb);
1636 vma = vma_tmp; 1626
1637 if (vma_tmp->vm_start <= addr) 1627 if (vma_tmp->vm_end > addr) {
1638 break; 1628 vma = vma_tmp;
1639 rb_node = rb_node->rb_left; 1629 if (vma_tmp->vm_start <= addr)
1640 } else 1630 break;
1641 rb_node = rb_node->rb_right; 1631 rb_node = rb_node->rb_left;
1642 } 1632 } else
1643 if (vma) 1633 rb_node = rb_node->rb_right;
1644 mm->mmap_cache = vma;
1645 } 1634 }
1635 if (vma)
1636 mm->mmap_cache = vma;
1646 } 1637 }
1647 return vma; 1638 return vma;
1648} 1639}
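
The find_vma() rewrite above is mostly a re-indent once the old `if (mm)` becomes an early-return WARN_ON_ONCE(); the lookup itself is unchanged: try the one-entry mmap_cache, otherwise descend the rbtree for the lowest VMA whose vm_end lies above the address. A simplified model of that contract using a sorted array and binary search instead of an rbtree; find_first_above() is purely illustrative:

    #include <stdio.h>
    #include <stddef.h>

    struct vma { unsigned long start, end; };    /* [start, end) */

    static const struct vma *cache;              /* models mm->mmap_cache */

    /* Return the first region with end > addr (it may or may not contain
     * addr), the same contract as find_vma(). */
    static const struct vma *find_first_above(const struct vma *v, size_t n,
                                              unsigned long addr)
    {
        const struct vma *found = NULL;
        size_t lo = 0, hi = n;

        if (cache && cache->end > addr && cache->start <= addr)
            return cache;                        /* cache hit */

        while (lo < hi) {                        /* like the rbtree walk */
            size_t mid = lo + (hi - lo) / 2;
            if (v[mid].end > addr) {
                found = &v[mid];
                hi = mid;
            } else {
                lo = mid + 1;
            }
        }
        if (found)
            cache = found;
        return found;
    }

    int main(void)
    {
        static const struct vma map[] = { {0x1000, 0x3000}, {0x8000, 0x9000} };
        const struct vma *v = find_first_above(map, 2, 0x4000);
        printf("%#lx-%#lx\n", v->start, v->end);   /* 0x8000-0x9000 */
        return 0;
    }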
@@ -1795,7 +1786,7 @@ int expand_downwards(struct vm_area_struct *vma,
1795 return -ENOMEM; 1786 return -ENOMEM;
1796 1787
1797 address &= PAGE_MASK; 1788 address &= PAGE_MASK;
1798 error = security_file_mmap(NULL, 0, 0, 0, address, 1); 1789 error = security_mmap_addr(address);
1799 if (error) 1790 if (error)
1800 return error; 1791 return error;
1801 1792
@@ -1889,15 +1880,20 @@ find_extend_vma(struct mm_struct * mm, unsigned long addr)
1889 */ 1880 */
1890static void remove_vma_list(struct mm_struct *mm, struct vm_area_struct *vma) 1881static void remove_vma_list(struct mm_struct *mm, struct vm_area_struct *vma)
1891{ 1882{
1883 unsigned long nr_accounted = 0;
1884
1892 /* Update high watermark before we lower total_vm */ 1885 /* Update high watermark before we lower total_vm */
1893 update_hiwater_vm(mm); 1886 update_hiwater_vm(mm);
1894 do { 1887 do {
1895 long nrpages = vma_pages(vma); 1888 long nrpages = vma_pages(vma);
1896 1889
1890 if (vma->vm_flags & VM_ACCOUNT)
1891 nr_accounted += nrpages;
1897 mm->total_vm -= nrpages; 1892 mm->total_vm -= nrpages;
1898 vm_stat_account(mm, vma->vm_flags, vma->vm_file, -nrpages); 1893 vm_stat_account(mm, vma->vm_flags, vma->vm_file, -nrpages);
1899 vma = remove_vma(vma); 1894 vma = remove_vma(vma);
1900 } while (vma); 1895 } while (vma);
1896 vm_unacct_memory(nr_accounted);
1901 validate_mm(mm); 1897 validate_mm(mm);
1902} 1898}
1903 1899
@@ -1912,13 +1908,11 @@ static void unmap_region(struct mm_struct *mm,
1912{ 1908{
1913 struct vm_area_struct *next = prev? prev->vm_next: mm->mmap; 1909 struct vm_area_struct *next = prev? prev->vm_next: mm->mmap;
1914 struct mmu_gather tlb; 1910 struct mmu_gather tlb;
1915 unsigned long nr_accounted = 0;
1916 1911
1917 lru_add_drain(); 1912 lru_add_drain();
1918 tlb_gather_mmu(&tlb, mm, 0); 1913 tlb_gather_mmu(&tlb, mm, 0);
1919 update_hiwater_rss(mm); 1914 update_hiwater_rss(mm);
1920 unmap_vmas(&tlb, vma, start, end, &nr_accounted, NULL); 1915 unmap_vmas(&tlb, vma, start, end);
1921 vm_unacct_memory(nr_accounted);
1922 free_pgtables(&tlb, vma, prev ? prev->vm_end : FIRST_USER_ADDRESS, 1916 free_pgtables(&tlb, vma, prev ? prev->vm_end : FIRST_USER_ADDRESS,
1923 next ? next->vm_start : 0); 1917 next ? next->vm_start : 0);
1924 tlb_finish_mmu(&tlb, start, end); 1918 tlb_finish_mmu(&tlb, start, end);
@@ -2132,7 +2126,6 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len)
2132 2126
2133 return 0; 2127 return 0;
2134} 2128}
2135EXPORT_SYMBOL(do_munmap);
2136 2129
2137int vm_munmap(unsigned long start, size_t len) 2130int vm_munmap(unsigned long start, size_t len)
2138{ 2131{
@@ -2180,10 +2173,6 @@ static unsigned long do_brk(unsigned long addr, unsigned long len)
2180 if (!len) 2173 if (!len)
2181 return addr; 2174 return addr;
2182 2175
2183 error = security_file_mmap(NULL, 0, 0, 0, addr, 1);
2184 if (error)
2185 return error;
2186
2187 flags = VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags; 2176 flags = VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags;
2188 2177
2189 error = get_unmapped_area(NULL, addr, len, 0, MAP_FIXED); 2178 error = get_unmapped_area(NULL, addr, len, 0, MAP_FIXED);
@@ -2305,8 +2294,7 @@ void exit_mmap(struct mm_struct *mm)
2305 tlb_gather_mmu(&tlb, mm, 1); 2294 tlb_gather_mmu(&tlb, mm, 1);
2306 /* update_hiwater_rss(mm) here? but nobody should be looking */ 2295 /* update_hiwater_rss(mm) here? but nobody should be looking */
2307 /* Use -1 here to ensure all VMAs in the mm are unmapped */ 2296 /* Use -1 here to ensure all VMAs in the mm are unmapped */
2308 unmap_vmas(&tlb, vma, 0, -1, &nr_accounted, NULL); 2297 unmap_vmas(&tlb, vma, 0, -1);
2309 vm_unacct_memory(nr_accounted);
2310 2298
2311 free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, 0); 2299 free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, 0);
2312 tlb_finish_mmu(&tlb, 0, -1); 2300 tlb_finish_mmu(&tlb, 0, -1);
@@ -2315,8 +2303,12 @@ void exit_mmap(struct mm_struct *mm)
2315 * Walk the list again, actually closing and freeing it, 2303 * Walk the list again, actually closing and freeing it,
2316 * with preemption enabled, without holding any MM locks. 2304 * with preemption enabled, without holding any MM locks.
2317 */ 2305 */
2318 while (vma) 2306 while (vma) {
2307 if (vma->vm_flags & VM_ACCOUNT)
2308 nr_accounted += vma_pages(vma);
2319 vma = remove_vma(vma); 2309 vma = remove_vma(vma);
2310 }
2311 vm_unacct_memory(nr_accounted);
2320 2312
2321 BUG_ON(mm->nr_ptes > (FIRST_USER_ADDRESS+PMD_SIZE-1)>>PMD_SHIFT); 2313 BUG_ON(mm->nr_ptes > (FIRST_USER_ADDRESS+PMD_SIZE-1)>>PMD_SHIFT);
2322} 2314}
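
remove_vma_list() and exit_mmap() now sum the pages of VM_ACCOUNT VMAs while tearing the address space down and call vm_unacct_memory() once at the end, instead of having unmap_vmas() return the count. A toy model of that batching, with a plain counter standing in for the global committed-memory accounting:

    #include <stdio.h>

    #define VM_ACCOUNT 0x1u

    struct vma { unsigned long pages; unsigned int flags; };

    static long committed_pages = 1000;   /* stand-in for the commit counter */

    static void vm_unacct(long pages)
    {
        committed_pages -= pages;
    }

    static void teardown(const struct vma *v, int n)
    {
        long nr_accounted = 0;
        for (int i = 0; i < n; i++) {
            if (v[i].flags & VM_ACCOUNT)
                nr_accounted += v[i].pages;
            /* ...the mapping itself would be freed here... */
        }
        vm_unacct(nr_accounted);          /* one global update, not per VMA */
    }

    int main(void)
    {
        struct vma vmas[] = { {16, VM_ACCOUNT}, {8, 0}, {4, VM_ACCOUNT} };
        teardown(vmas, 3);
        printf("committed after exit: %ld\n", committed_pages);  /* 980 */
        return 0;
    }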
@@ -2352,6 +2344,10 @@ int insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma)
2352 if ((vma->vm_flags & VM_ACCOUNT) && 2344 if ((vma->vm_flags & VM_ACCOUNT) &&
2353 security_vm_enough_memory_mm(mm, vma_pages(vma))) 2345 security_vm_enough_memory_mm(mm, vma_pages(vma)))
2354 return -ENOMEM; 2346 return -ENOMEM;
2347
2348 if (vma->vm_file && uprobe_mmap(vma))
2349 return -EINVAL;
2350
2355 vma_link(mm, vma, prev, rb_link, rb_parent); 2351 vma_link(mm, vma, prev, rb_link, rb_parent);
2356 return 0; 2352 return 0;
2357} 2353}
@@ -2421,6 +2417,10 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
2421 new_vma->vm_pgoff = pgoff; 2417 new_vma->vm_pgoff = pgoff;
2422 if (new_vma->vm_file) { 2418 if (new_vma->vm_file) {
2423 get_file(new_vma->vm_file); 2419 get_file(new_vma->vm_file);
2420
2421 if (uprobe_mmap(new_vma))
2422 goto out_free_mempol;
2423
2424 if (vma->vm_flags & VM_EXECUTABLE) 2424 if (vma->vm_flags & VM_EXECUTABLE)
2425 added_exe_file_vma(mm); 2425 added_exe_file_vma(mm);
2426 } 2426 }
@@ -2525,10 +2525,6 @@ int install_special_mapping(struct mm_struct *mm,
2525 vma->vm_ops = &special_mapping_vmops; 2525 vma->vm_ops = &special_mapping_vmops;
2526 vma->vm_private_data = pages; 2526 vma->vm_private_data = pages;
2527 2527
2528 ret = security_file_mmap(NULL, 0, 0, 0, vma->vm_start, 1);
2529 if (ret)
2530 goto out;
2531
2532 ret = insert_vm_struct(mm, vma); 2528 ret = insert_vm_struct(mm, vma);
2533 if (ret) 2529 if (ret)
2534 goto out; 2530 goto out;
diff --git a/mm/mmzone.c b/mm/mmzone.c
index 7cf7b7ddc7c5..6830eab5bf09 100644
--- a/mm/mmzone.c
+++ b/mm/mmzone.c
@@ -86,3 +86,17 @@ int memmap_valid_within(unsigned long pfn,
86 return 1; 86 return 1;
87} 87}
88#endif /* CONFIG_ARCH_HAS_HOLES_MEMORYMODEL */ 88#endif /* CONFIG_ARCH_HAS_HOLES_MEMORYMODEL */
89
90void lruvec_init(struct lruvec *lruvec, struct zone *zone)
91{
92 enum lru_list lru;
93
94 memset(lruvec, 0, sizeof(struct lruvec));
95
96 for_each_lru(lru)
97 INIT_LIST_HEAD(&lruvec->lists[lru]);
98
99#ifdef CONFIG_CGROUP_MEM_RES_CTLR
100 lruvec->zone = zone;
101#endif
102}
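
The new lruvec_init() centralizes what free_area_init_core() (see the page_alloc.c hunk further down) and the memcg code used to open-code: zero the lruvec and give every LRU list an empty head. A compact userspace model; the list_head, INIT_LIST_HEAD and lru enum here are re-implemented for the sketch rather than taken from kernel headers:

    #include <stdio.h>
    #include <string.h>

    struct list_head { struct list_head *next, *prev; };

    static void INIT_LIST_HEAD(struct list_head *h)
    {
        h->next = h->prev = h;        /* empty circular list points at itself */
    }

    enum lru_list { LRU_INACTIVE_ANON, LRU_ACTIVE_ANON,
                    LRU_INACTIVE_FILE, LRU_ACTIVE_FILE,
                    LRU_UNEVICTABLE, NR_LRU_LISTS };

    struct lruvec { struct list_head lists[NR_LRU_LISTS]; };

    static void lruvec_init(struct lruvec *lruvec)
    {
        memset(lruvec, 0, sizeof(*lruvec));
        for (int lru = 0; lru < NR_LRU_LISTS; lru++)
            INIT_LIST_HEAD(&lruvec->lists[lru]);
    }

    int main(void)
    {
        struct lruvec lv;
        lruvec_init(&lv);
        printf("empty: %d\n", lv.lists[LRU_ACTIVE_FILE].next ==
                              &lv.lists[LRU_ACTIVE_FILE]);   /* 1 */
        return 0;
    }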
diff --git a/mm/mremap.c b/mm/mremap.c
index db8d983b5a7d..21fed202ddad 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -371,10 +371,6 @@ static unsigned long mremap_to(unsigned long addr,
371 if ((addr <= new_addr) && (addr+old_len) > new_addr) 371 if ((addr <= new_addr) && (addr+old_len) > new_addr)
372 goto out; 372 goto out;
373 373
374 ret = security_file_mmap(NULL, 0, 0, 0, new_addr, 1);
375 if (ret)
376 goto out;
377
378 ret = do_munmap(mm, new_addr, new_len); 374 ret = do_munmap(mm, new_addr, new_len);
379 if (ret) 375 if (ret)
380 goto out; 376 goto out;
@@ -432,15 +428,17 @@ static int vma_expandable(struct vm_area_struct *vma, unsigned long delta)
432 * MREMAP_FIXED option added 5-Dec-1999 by Benjamin LaHaise 428 * MREMAP_FIXED option added 5-Dec-1999 by Benjamin LaHaise
433 * This option implies MREMAP_MAYMOVE. 429 * This option implies MREMAP_MAYMOVE.
434 */ 430 */
435unsigned long do_mremap(unsigned long addr, 431SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
436 unsigned long old_len, unsigned long new_len, 432 unsigned long, new_len, unsigned long, flags,
437 unsigned long flags, unsigned long new_addr) 433 unsigned long, new_addr)
438{ 434{
439 struct mm_struct *mm = current->mm; 435 struct mm_struct *mm = current->mm;
440 struct vm_area_struct *vma; 436 struct vm_area_struct *vma;
441 unsigned long ret = -EINVAL; 437 unsigned long ret = -EINVAL;
442 unsigned long charged = 0; 438 unsigned long charged = 0;
443 439
440 down_write(&current->mm->mmap_sem);
441
444 if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE)) 442 if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE))
445 goto out; 443 goto out;
446 444
@@ -530,25 +528,11 @@ unsigned long do_mremap(unsigned long addr,
530 goto out; 528 goto out;
531 } 529 }
532 530
533 ret = security_file_mmap(NULL, 0, 0, 0, new_addr, 1);
534 if (ret)
535 goto out;
536 ret = move_vma(vma, addr, old_len, new_len, new_addr); 531 ret = move_vma(vma, addr, old_len, new_len, new_addr);
537 } 532 }
538out: 533out:
539 if (ret & ~PAGE_MASK) 534 if (ret & ~PAGE_MASK)
540 vm_unacct_memory(charged); 535 vm_unacct_memory(charged);
541 return ret;
542}
543
544SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
545 unsigned long, new_len, unsigned long, flags,
546 unsigned long, new_addr)
547{
548 unsigned long ret;
549
550 down_write(&current->mm->mmap_sem);
551 ret = do_mremap(addr, old_len, new_len, flags, new_addr);
552 up_write(&current->mm->mmap_sem); 536 up_write(&current->mm->mmap_sem);
553 return ret; 537 return ret;
554} 538}
diff --git a/mm/nobootmem.c b/mm/nobootmem.c
index 1983fb1c7026..405573010f99 100644
--- a/mm/nobootmem.c
+++ b/mm/nobootmem.c
@@ -105,27 +105,35 @@ static void __init __free_pages_memory(unsigned long start, unsigned long end)
105 __free_pages_bootmem(pfn_to_page(i), 0); 105 __free_pages_bootmem(pfn_to_page(i), 0);
106} 106}
107 107
108static unsigned long __init __free_memory_core(phys_addr_t start,
109 phys_addr_t end)
110{
111 unsigned long start_pfn = PFN_UP(start);
112 unsigned long end_pfn = min_t(unsigned long,
113 PFN_DOWN(end), max_low_pfn);
114
115 if (start_pfn > end_pfn)
116 return 0;
117
118 __free_pages_memory(start_pfn, end_pfn);
119
120 return end_pfn - start_pfn;
121}
122
108unsigned long __init free_low_memory_core_early(int nodeid) 123unsigned long __init free_low_memory_core_early(int nodeid)
109{ 124{
110 unsigned long count = 0; 125 unsigned long count = 0;
111 phys_addr_t start, end; 126 phys_addr_t start, end, size;
112 u64 i; 127 u64 i;
113 128
114 /* free reserved array temporarily so that it's treated as free area */ 129 for_each_free_mem_range(i, MAX_NUMNODES, &start, &end, NULL)
115 memblock_free_reserved_regions(); 130 count += __free_memory_core(start, end);
116 131
117 for_each_free_mem_range(i, MAX_NUMNODES, &start, &end, NULL) { 132 /* free range that is used for reserved array if we allocate it */
118 unsigned long start_pfn = PFN_UP(start); 133 size = get_allocated_memblock_reserved_regions_info(&start);
119 unsigned long end_pfn = min_t(unsigned long, 134 if (size)
120 PFN_DOWN(end), max_low_pfn); 135 count += __free_memory_core(start, start + size);
121 if (start_pfn < end_pfn) {
122 __free_pages_memory(start_pfn, end_pfn);
123 count += end_pfn - start_pfn;
124 }
125 }
126 136
127 /* put region array back? */
128 memblock_reserve_reserved_regions();
129 return count; 137 return count;
130} 138}
131 139
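
The new __free_memory_core() helper isolates the PFN arithmetic: round the start of a physical range up to a page boundary, round the end down, clamp to max_low_pfn, and report how many pages were handed to the buddy allocator. The same arithmetic in isolation, assuming 4 KiB pages and an example max_low_pfn:

    #include <stdio.h>

    #define PAGE_SHIFT 12
    #define PAGE_SIZE  (1UL << PAGE_SHIFT)

    #define PFN_UP(x)   (((x) + PAGE_SIZE - 1) >> PAGE_SHIFT)
    #define PFN_DOWN(x) ((x) >> PAGE_SHIFT)

    static unsigned long max_low_pfn = 0x100000;   /* 4 GiB of lowmem */

    /* Number of whole pages inside [start, end) below max_low_pfn. */
    static unsigned long free_memory_core(unsigned long long start,
                                          unsigned long long end)
    {
        unsigned long start_pfn = PFN_UP(start);
        unsigned long end_pfn = PFN_DOWN(end);

        if (end_pfn > max_low_pfn)
            end_pfn = max_low_pfn;
        if (start_pfn > end_pfn)
            return 0;
        /* __free_pages_memory(start_pfn, end_pfn) would run here */
        return end_pfn - start_pfn;
    }

    int main(void)
    {
        printf("%lu\n", free_memory_core(0x1800, 0x5000));  /* pfns 2..4 -> 3 */
        printf("%lu\n", free_memory_core(0x1800, 0x1f00));  /* 0: no whole page */
        return 0;
    }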
@@ -274,86 +282,85 @@ void * __init __alloc_bootmem(unsigned long size, unsigned long align,
274 return ___alloc_bootmem(size, align, goal, limit); 282 return ___alloc_bootmem(size, align, goal, limit);
275} 283}
276 284
277/** 285void * __init ___alloc_bootmem_node_nopanic(pg_data_t *pgdat,
278 * __alloc_bootmem_node - allocate boot memory from a specific node 286 unsigned long size,
279 * @pgdat: node to allocate from 287 unsigned long align,
280 * @size: size of the request in bytes 288 unsigned long goal,
281 * @align: alignment of the region 289 unsigned long limit)
282 * @goal: preferred starting address of the region
283 *
284 * The goal is dropped if it can not be satisfied and the allocation will
285 * fall back to memory below @goal.
286 *
287 * Allocation may fall back to any node in the system if the specified node
288 * can not hold the requested memory.
289 *
290 * The function panics if the request can not be satisfied.
291 */
292void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
293 unsigned long align, unsigned long goal)
294{ 290{
295 void *ptr; 291 void *ptr;
296 292
297 if (WARN_ON_ONCE(slab_is_available()))
298 return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
299
300again: 293again:
301 ptr = __alloc_memory_core_early(pgdat->node_id, size, align, 294 ptr = __alloc_memory_core_early(pgdat->node_id, size, align,
302 goal, -1ULL); 295 goal, limit);
303 if (ptr) 296 if (ptr)
304 return ptr; 297 return ptr;
305 298
306 ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align, 299 ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align,
307 goal, -1ULL); 300 goal, limit);
308 if (!ptr && goal) { 301 if (ptr)
302 return ptr;
303
304 if (goal) {
309 goal = 0; 305 goal = 0;
310 goto again; 306 goto again;
311 } 307 }
312 return ptr; 308
309 return NULL;
313} 310}
314 311
315void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size, 312void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size,
316 unsigned long align, unsigned long goal) 313 unsigned long align, unsigned long goal)
317{ 314{
318 return __alloc_bootmem_node(pgdat, size, align, goal); 315 if (WARN_ON_ONCE(slab_is_available()))
316 return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
317
318 return ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0);
319} 319}
320 320
321#ifdef CONFIG_SPARSEMEM 321void * __init ___alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
322/** 322 unsigned long align, unsigned long goal,
323 * alloc_bootmem_section - allocate boot memory from a specific section 323 unsigned long limit)
324 * @size: size of the request in bytes
325 * @section_nr: sparse map section to allocate from
326 *
327 * Return NULL on failure.
328 */
329void * __init alloc_bootmem_section(unsigned long size,
330 unsigned long section_nr)
331{ 324{
332 unsigned long pfn, goal, limit; 325 void *ptr;
333 326
334 pfn = section_nr_to_pfn(section_nr); 327 ptr = ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, limit);
335 goal = pfn << PAGE_SHIFT; 328 if (ptr)
336 limit = section_nr_to_pfn(section_nr + 1) << PAGE_SHIFT; 329 return ptr;
337 330
338 return __alloc_memory_core_early(early_pfn_to_nid(pfn), size, 331 printk(KERN_ALERT "bootmem alloc of %lu bytes failed!\n", size);
339 SMP_CACHE_BYTES, goal, limit); 332 panic("Out of memory");
333 return NULL;
340} 334}
341#endif
342 335
343void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size, 336/**
337 * __alloc_bootmem_node - allocate boot memory from a specific node
338 * @pgdat: node to allocate from
339 * @size: size of the request in bytes
340 * @align: alignment of the region
341 * @goal: preferred starting address of the region
342 *
343 * The goal is dropped if it can not be satisfied and the allocation will
344 * fall back to memory below @goal.
345 *
346 * Allocation may fall back to any node in the system if the specified node
347 * can not hold the requested memory.
348 *
349 * The function panics if the request can not be satisfied.
350 */
351void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
344 unsigned long align, unsigned long goal) 352 unsigned long align, unsigned long goal)
345{ 353{
346 void *ptr;
347
348 if (WARN_ON_ONCE(slab_is_available())) 354 if (WARN_ON_ONCE(slab_is_available()))
349 return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); 355 return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
350 356
351 ptr = __alloc_memory_core_early(pgdat->node_id, size, align, 357 return ___alloc_bootmem_node(pgdat, size, align, goal, 0);
352 goal, -1ULL); 358}
353 if (ptr)
354 return ptr;
355 359
356 return __alloc_bootmem_nopanic(size, align, goal); 360void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size,
361 unsigned long align, unsigned long goal)
362{
363 return __alloc_bootmem_node(pgdat, size, align, goal);
357} 364}
358 365
359#ifndef ARCH_LOW_ADDRESS_LIMIT 366#ifndef ARCH_LOW_ADDRESS_LIMIT
@@ -397,16 +404,9 @@ void * __init __alloc_bootmem_low(unsigned long size, unsigned long align,
397void * __init __alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size, 404void * __init __alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size,
398 unsigned long align, unsigned long goal) 405 unsigned long align, unsigned long goal)
399{ 406{
400 void *ptr;
401
402 if (WARN_ON_ONCE(slab_is_available())) 407 if (WARN_ON_ONCE(slab_is_available()))
403 return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); 408 return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
404 409
405 ptr = __alloc_memory_core_early(pgdat->node_id, size, align, 410 return ___alloc_bootmem_node(pgdat, size, align, goal,
406 goal, ARCH_LOW_ADDRESS_LIMIT); 411 ARCH_LOW_ADDRESS_LIMIT);
407 if (ptr)
408 return ptr;
409
410 return __alloc_memory_core_early(MAX_NUMNODES, size, align,
411 goal, ARCH_LOW_ADDRESS_LIMIT);
412} 412}
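
The nobootmem rework above folds several nearly identical allocators into ___alloc_bootmem_node_nopanic(): try the requested node, fall back to any node, drop the goal address and retry, and finally return NULL; ___alloc_bootmem_node() layers the panic on top. A rough sketch of that retry ladder; try_alloc_on_node() is an invented placeholder for __alloc_memory_core_early(), and the behaviour it fakes is only for the example:

    #include <stdio.h>
    #include <stdlib.h>

    /* Placeholder: pretend only goal == 0 requests on node 0 succeed. */
    static void *try_alloc_on_node(int nid, size_t size, unsigned long goal)
    {
        if (nid == 0 && goal == 0)
            return malloc(size);
        return NULL;
    }

    static void *alloc_node_nopanic(int nid, size_t size, unsigned long goal)
    {
        void *p;
    again:
        p = try_alloc_on_node(nid, size, goal);   /* preferred node */
        if (p)
            return p;
        p = try_alloc_on_node(0, size, goal);     /* any node (node 0 here) */
        if (p)
            return p;
        if (goal) {                               /* drop the goal, retry */
            goal = 0;
            goto again;
        }
        return NULL;
    }

    static void *alloc_node(int nid, size_t size, unsigned long goal)
    {
        void *p = alloc_node_nopanic(nid, size, goal);
        if (!p) {
            fprintf(stderr, "bootmem alloc of %zu bytes failed!\n", size);
            abort();                              /* panic() in the kernel */
        }
        return p;
    }

    int main(void)
    {
        void *p = alloc_node(1, 64, 0x100000);
        printf("got %p after falling back\n", p);
        free(p);
        return 0;
    }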
diff --git a/mm/nommu.c b/mm/nommu.c
index bb8f4f004a82..d4b0c10872de 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -889,7 +889,6 @@ static int validate_mmap_request(struct file *file,
889 unsigned long *_capabilities) 889 unsigned long *_capabilities)
890{ 890{
891 unsigned long capabilities, rlen; 891 unsigned long capabilities, rlen;
892 unsigned long reqprot = prot;
893 int ret; 892 int ret;
894 893
895 /* do the simple checks first */ 894 /* do the simple checks first */
@@ -1047,7 +1046,7 @@ static int validate_mmap_request(struct file *file,
1047 } 1046 }
1048 1047
1049 /* allow the security API to have its say */ 1048 /* allow the security API to have its say */
1050 ret = security_file_mmap(file, reqprot, prot, flags, addr, 0); 1049 ret = security_mmap_addr(addr);
1051 if (ret < 0) 1050 if (ret < 0)
1052 return ret; 1051 return ret;
1053 1052
@@ -1233,7 +1232,7 @@ enomem:
1233/* 1232/*
1234 * handle mapping creation for uClinux 1233 * handle mapping creation for uClinux
1235 */ 1234 */
1236static unsigned long do_mmap_pgoff(struct file *file, 1235unsigned long do_mmap_pgoff(struct file *file,
1237 unsigned long addr, 1236 unsigned long addr,
1238 unsigned long len, 1237 unsigned long len,
1239 unsigned long prot, 1238 unsigned long prot,
@@ -1471,32 +1470,6 @@ error_getting_region:
1471 return -ENOMEM; 1470 return -ENOMEM;
1472} 1471}
1473 1472
1474unsigned long do_mmap(struct file *file, unsigned long addr,
1475 unsigned long len, unsigned long prot,
1476 unsigned long flag, unsigned long offset)
1477{
1478 if (unlikely(offset + PAGE_ALIGN(len) < offset))
1479 return -EINVAL;
1480 if (unlikely(offset & ~PAGE_MASK))
1481 return -EINVAL;
1482 return do_mmap_pgoff(file, addr, len, prot, flag, offset >> PAGE_SHIFT);
1483}
1484EXPORT_SYMBOL(do_mmap);
1485
1486unsigned long vm_mmap(struct file *file, unsigned long addr,
1487 unsigned long len, unsigned long prot,
1488 unsigned long flag, unsigned long offset)
1489{
1490 unsigned long ret;
1491 struct mm_struct *mm = current->mm;
1492
1493 down_write(&mm->mmap_sem);
1494 ret = do_mmap(file, addr, len, prot, flag, offset);
1495 up_write(&mm->mmap_sem);
1496 return ret;
1497}
1498EXPORT_SYMBOL(vm_mmap);
1499
1500SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len, 1473SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
1501 unsigned long, prot, unsigned long, flags, 1474 unsigned long, prot, unsigned long, flags,
1502 unsigned long, fd, unsigned long, pgoff) 1475 unsigned long, fd, unsigned long, pgoff)
@@ -1513,9 +1486,7 @@ SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
1513 1486
1514 flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); 1487 flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE);
1515 1488
1516 down_write(&current->mm->mmap_sem); 1489 retval = vm_mmap_pgoff(file, addr, len, prot, flags, pgoff);
1517 retval = do_mmap_pgoff(file, addr, len, prot, flags, pgoff);
1518 up_write(&current->mm->mmap_sem);
1519 1490
1520 if (file) 1491 if (file)
1521 fput(file); 1492 fput(file);
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 46bf2ed5594c..ac300c99baf6 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -180,10 +180,11 @@ static bool oom_unkillable_task(struct task_struct *p,
180 * predictable as possible. The goal is to return the highest value for the 180 * predictable as possible. The goal is to return the highest value for the
181 * task consuming the most memory to avoid subsequent oom failures. 181 * task consuming the most memory to avoid subsequent oom failures.
182 */ 182 */
183unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *memcg, 183unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
184 const nodemask_t *nodemask, unsigned long totalpages) 184 const nodemask_t *nodemask, unsigned long totalpages)
185{ 185{
186 long points; 186 long points;
187 long adj;
187 188
188 if (oom_unkillable_task(p, memcg, nodemask)) 189 if (oom_unkillable_task(p, memcg, nodemask))
189 return 0; 190 return 0;
@@ -192,27 +193,18 @@ unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
192 if (!p) 193 if (!p)
193 return 0; 194 return 0;
194 195
195 if (p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) { 196 adj = p->signal->oom_score_adj;
197 if (adj == OOM_SCORE_ADJ_MIN) {
196 task_unlock(p); 198 task_unlock(p);
197 return 0; 199 return 0;
198 } 200 }
199 201
200 /* 202 /*
201 * The memory controller may have a limit of 0 bytes, so avoid a divide
202 * by zero, if necessary.
203 */
204 if (!totalpages)
205 totalpages = 1;
206
207 /*
208 * The baseline for the badness score is the proportion of RAM that each 203 * The baseline for the badness score is the proportion of RAM that each
209 * task's rss, pagetable and swap space use. 204 * task's rss, pagetable and swap space use.
210 */ 205 */
211 points = get_mm_rss(p->mm) + p->mm->nr_ptes; 206 points = get_mm_rss(p->mm) + p->mm->nr_ptes +
212 points += get_mm_counter(p->mm, MM_SWAPENTS); 207 get_mm_counter(p->mm, MM_SWAPENTS);
213
214 points *= 1000;
215 points /= totalpages;
216 task_unlock(p); 208 task_unlock(p);
217 209
218 /* 210 /*
@@ -220,23 +212,17 @@ unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
220 * implementation used by LSMs. 212 * implementation used by LSMs.
221 */ 213 */
222 if (has_capability_noaudit(p, CAP_SYS_ADMIN)) 214 if (has_capability_noaudit(p, CAP_SYS_ADMIN))
223 points -= 30; 215 adj -= 30;
224 216
225 /* 217 /* Normalize to oom_score_adj units */
226 * /proc/pid/oom_score_adj ranges from -1000 to +1000 such that it may 218 adj *= totalpages / 1000;
227 * either completely disable oom killing or always prefer a certain 219 points += adj;
228 * task.
229 */
230 points += p->signal->oom_score_adj;
231 220
232 /* 221 /*
233 * Never return 0 for an eligible task that may be killed since it's 222 * Never return 0 for an eligible task regardless of the root bonus and
234 * possible that no single user task uses more than 0.1% of memory and 223 * oom_score_adj (oom_score_adj can't be OOM_SCORE_ADJ_MIN here).
235 * no single admin tasks uses more than 3.0%.
236 */ 224 */
237 if (points <= 0) 225 return points > 0 ? points : 1;
238 return 1;
239 return (points < 1000) ? points : 1000;
240} 226}
241 227
242/* 228/*
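
The rewritten oom_badness() keeps the score in page units: the base is rss plus page-table pages plus swap entries, the CAP_SYS_ADMIN bonus knocks 30 points off oom_score_adj, and oom_score_adj is scaled so one unit is worth roughly 0.1% of totalpages; an eligible task never scores 0. The arithmetic on its own, with made-up task sizes:

    #include <stdio.h>

    /* Page-unit badness, mirroring the hunk above. oom_score_adj is
     * -1000..1000; OOM_SCORE_ADJ_MIN is handled by the caller and skipped. */
    static unsigned long badness(unsigned long rss, unsigned long nr_ptes,
                                 unsigned long swapents, long oom_score_adj,
                                 int has_cap_sys_admin, unsigned long totalpages)
    {
        long points = rss + nr_ptes + swapents;
        long adj = oom_score_adj;

        if (has_cap_sys_admin)
            adj -= 30;                           /* root bonus */
        adj *= (long)(totalpages / 1000);        /* normalize to page units */
        points += adj;

        return points > 0 ? points : 1;          /* never 0 when eligible */
    }

    int main(void)
    {
        unsigned long totalpages = 1000000;      /* ~4 GiB of 4 KiB pages */
        printf("%lu\n", badness(50000, 200, 0,    0, 0, totalpages)); /* 50200 */
        printf("%lu\n", badness(50000, 200, 0,    0, 1, totalpages)); /* 20200 */
        printf("%lu\n", badness(50000, 200, 0, -100, 0, totalpages)); /* 1 */
        return 0;
    }

The select_bad_process() hunk below then rescales the winning score back into the 0..1000 range reported via *ppoints (chosen_points * 1000 / totalpages).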
@@ -314,7 +300,7 @@ static struct task_struct *select_bad_process(unsigned int *ppoints,
314{ 300{
315 struct task_struct *g, *p; 301 struct task_struct *g, *p;
316 struct task_struct *chosen = NULL; 302 struct task_struct *chosen = NULL;
317 *ppoints = 0; 303 unsigned long chosen_points = 0;
318 304
319 do_each_thread(g, p) { 305 do_each_thread(g, p) {
320 unsigned int points; 306 unsigned int points;
@@ -354,7 +340,7 @@ static struct task_struct *select_bad_process(unsigned int *ppoints,
354 */ 340 */
355 if (p == current) { 341 if (p == current) {
356 chosen = p; 342 chosen = p;
357 *ppoints = 1000; 343 chosen_points = ULONG_MAX;
358 } else if (!force_kill) { 344 } else if (!force_kill) {
359 /* 345 /*
360 * If this task is not being ptraced on exit, 346 * If this task is not being ptraced on exit,
@@ -367,18 +353,19 @@ static struct task_struct *select_bad_process(unsigned int *ppoints,
367 } 353 }
368 354
369 points = oom_badness(p, memcg, nodemask, totalpages); 355 points = oom_badness(p, memcg, nodemask, totalpages);
370 if (points > *ppoints) { 356 if (points > chosen_points) {
371 chosen = p; 357 chosen = p;
372 *ppoints = points; 358 chosen_points = points;
373 } 359 }
374 } while_each_thread(g, p); 360 } while_each_thread(g, p);
375 361
362 *ppoints = chosen_points * 1000 / totalpages;
376 return chosen; 363 return chosen;
377} 364}
378 365
379/** 366/**
380 * dump_tasks - dump current memory state of all system tasks 367 * dump_tasks - dump current memory state of all system tasks
381 * @mem: current's memory controller, if constrained 368 * @memcg: current's memory controller, if constrained
382 * @nodemask: nodemask passed to page allocator for mempolicy ooms 369 * @nodemask: nodemask passed to page allocator for mempolicy ooms
383 * 370 *
384 * Dumps the current memory state of all eligible tasks. Tasks not in the same 371 * Dumps the current memory state of all eligible tasks. Tasks not in the same
@@ -410,8 +397,8 @@ static void dump_tasks(const struct mem_cgroup *memcg, const nodemask_t *nodemas
410 } 397 }
411 398
412 pr_info("[%5d] %5d %5d %8lu %8lu %3u %3d %5d %s\n", 399 pr_info("[%5d] %5d %5d %8lu %8lu %3u %3d %5d %s\n",
413 task->pid, task_uid(task), task->tgid, 400 task->pid, from_kuid(&init_user_ns, task_uid(task)),
414 task->mm->total_vm, get_mm_rss(task->mm), 401 task->tgid, task->mm->total_vm, get_mm_rss(task->mm),
415 task_cpu(task), task->signal->oom_adj, 402 task_cpu(task), task->signal->oom_adj,
416 task->signal->oom_score_adj, task->comm); 403 task->signal->oom_score_adj, task->comm);
417 task_unlock(task); 404 task_unlock(task);
@@ -572,7 +559,7 @@ void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
572 } 559 }
573 560
574 check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL); 561 check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL);
575 limit = mem_cgroup_get_limit(memcg) >> PAGE_SHIFT; 562 limit = mem_cgroup_get_limit(memcg) >> PAGE_SHIFT ? : 1;
576 read_lock(&tasklist_lock); 563 read_lock(&tasklist_lock);
577 p = select_bad_process(&points, limit, memcg, NULL, false); 564 p = select_bad_process(&points, limit, memcg, NULL, false);
578 if (p && PTR_ERR(p) != -1UL) 565 if (p && PTR_ERR(p) != -1UL)
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 26adea8ca2e7..93d8d2f7108c 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -204,7 +204,7 @@ static unsigned long highmem_dirtyable_memory(unsigned long total)
204 * Returns the global number of pages potentially available for dirty 204 * Returns the global number of pages potentially available for dirty
205 * page cache. This is the base value for the global dirty limits. 205 * page cache. This is the base value for the global dirty limits.
206 */ 206 */
207unsigned long global_dirtyable_memory(void) 207static unsigned long global_dirtyable_memory(void)
208{ 208{
209 unsigned long x; 209 unsigned long x;
210 210
@@ -1568,6 +1568,7 @@ void writeback_set_ratelimit(void)
1568 unsigned long background_thresh; 1568 unsigned long background_thresh;
1569 unsigned long dirty_thresh; 1569 unsigned long dirty_thresh;
1570 global_dirty_limits(&background_thresh, &dirty_thresh); 1570 global_dirty_limits(&background_thresh, &dirty_thresh);
1571 global_dirty_limit = dirty_thresh;
1571 ratelimit_pages = dirty_thresh / (num_online_cpus() * 32); 1572 ratelimit_pages = dirty_thresh / (num_online_cpus() * 32);
1572 if (ratelimit_pages < 16) 1573 if (ratelimit_pages < 16)
1573 ratelimit_pages = 16; 1574 ratelimit_pages = 16;
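
writeback_set_ratelimit() now also refreshes global_dirty_limit before recomputing ratelimit_pages, which remains the dirty threshold divided across online CPUs with a floor of 16 pages (it bounds how many pages a task may dirty between trips into balance_dirty_pages()). The sizing rule by itself:

    #include <stdio.h>

    /* dirty_thresh is in pages; divide across CPUs, never go below 16. */
    static unsigned long ratelimit_pages(unsigned long dirty_thresh,
                                         unsigned int online_cpus)
    {
        unsigned long r = dirty_thresh / (online_cpus * 32);
        return r < 16 ? 16 : r;
    }

    int main(void)
    {
        printf("%lu\n", ratelimit_pages(100000, 8));   /* 390 */
        printf("%lu\n", ratelimit_pages(2000, 8));     /* floor: 16 */
        return 0;
    }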
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 918330f71dba..4a4f9219683f 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -57,6 +57,7 @@
57#include <linux/ftrace_event.h> 57#include <linux/ftrace_event.h>
58#include <linux/memcontrol.h> 58#include <linux/memcontrol.h>
59#include <linux/prefetch.h> 59#include <linux/prefetch.h>
60#include <linux/migrate.h>
60#include <linux/page-debug-flags.h> 61#include <linux/page-debug-flags.h>
61 62
62#include <asm/tlbflush.h> 63#include <asm/tlbflush.h>
@@ -513,10 +514,10 @@ static inline int page_is_buddy(struct page *page, struct page *buddy,
513 * free pages of length of (1 << order) and marked with _mapcount -2. Page's 514 * free pages of length of (1 << order) and marked with _mapcount -2. Page's
514 * order is recorded in page_private(page) field. 515 * order is recorded in page_private(page) field.
515 * So when we are allocating or freeing one, we can derive the state of the 516 * So when we are allocating or freeing one, we can derive the state of the
516 * other. That is, if we allocate a small block, and both were 517 * other. That is, if we allocate a small block, and both were
517 * free, the remainder of the region must be split into blocks. 518 * free, the remainder of the region must be split into blocks.
518 * If a block is freed, and its buddy is also free, then this 519 * If a block is freed, and its buddy is also free, then this
519 * triggers coalescing into a block of larger size. 520 * triggers coalescing into a block of larger size.
520 * 521 *
521 * -- wli 522 * -- wli
522 */ 523 */
@@ -749,6 +750,24 @@ void __meminit __free_pages_bootmem(struct page *page, unsigned int order)
749 __free_pages(page, order); 750 __free_pages(page, order);
750} 751}
751 752
753#ifdef CONFIG_CMA
754/* Free whole pageblock and set it's migration type to MIGRATE_CMA. */
755void __init init_cma_reserved_pageblock(struct page *page)
756{
757 unsigned i = pageblock_nr_pages;
758 struct page *p = page;
759
760 do {
761 __ClearPageReserved(p);
762 set_page_count(p, 0);
763 } while (++p, --i);
764
765 set_page_refcounted(page);
766 set_pageblock_migratetype(page, MIGRATE_CMA);
767 __free_pages(page, pageblock_order);
768 totalram_pages += pageblock_nr_pages;
769}
770#endif
752 771
753/* 772/*
754 * The order of subdivision here is critical for the IO subsystem. 773 * The order of subdivision here is critical for the IO subsystem.
@@ -874,11 +893,17 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
874 * This array describes the order lists are fallen back to when 893 * This array describes the order lists are fallen back to when
875 * the free lists for the desirable migrate type are depleted 894 * the free lists for the desirable migrate type are depleted
876 */ 895 */
877static int fallbacks[MIGRATE_TYPES][MIGRATE_TYPES-1] = { 896static int fallbacks[MIGRATE_TYPES][4] = {
878 [MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE }, 897 [MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE },
879 [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE }, 898 [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE },
880 [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE }, 899#ifdef CONFIG_CMA
881 [MIGRATE_RESERVE] = { MIGRATE_RESERVE, MIGRATE_RESERVE, MIGRATE_RESERVE }, /* Never used */ 900 [MIGRATE_MOVABLE] = { MIGRATE_CMA, MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE },
901 [MIGRATE_CMA] = { MIGRATE_RESERVE }, /* Never used */
902#else
903 [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE },
904#endif
905 [MIGRATE_RESERVE] = { MIGRATE_RESERVE }, /* Never used */
906 [MIGRATE_ISOLATE] = { MIGRATE_RESERVE }, /* Never used */
882}; 907};
883 908
884/* 909/*
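
With MIGRATE_CMA the per-type fallback lists are no longer all the same length, so __rmqueue_fallback() (next hunk) stops iterating a fixed MIGRATE_TYPES-1 entries and walks each row until it reaches the MIGRATE_RESERVE sentinel. A standalone sketch of that table walk; the enum values and the have[] availability array are simplifications invented for the example:

    #include <stdio.h>

    enum mt { UNMOVABLE, RECLAIMABLE, MOVABLE, CMA, RESERVE, ISOLATE, NR_MT };

    /* Every row ends with RESERVE, which the walker treats as "stop here". */
    static const enum mt fallbacks[NR_MT][4] = {
        [UNMOVABLE]   = { RECLAIMABLE, MOVABLE, RESERVE },
        [RECLAIMABLE] = { UNMOVABLE, MOVABLE, RESERVE },
        [MOVABLE]     = { CMA, RECLAIMABLE, UNMOVABLE, RESERVE },
        [CMA]         = { RESERVE },
        [RESERVE]     = { RESERVE },
        [ISOLATE]     = { RESERVE },
    };

    /* have[] says which free lists are non-empty; return the migratetype we
     * would steal from, or RESERVE if nothing suitable exists. */
    static enum mt pick_fallback(enum mt start, const int have[NR_MT])
    {
        for (int i = 0; ; i++) {
            enum mt mt = fallbacks[start][i];
            if (mt == RESERVE)
                return RESERVE;
            if (have[mt])
                return mt;
        }
    }

    int main(void)
    {
        int have[NR_MT] = { [CMA] = 1, [UNMOVABLE] = 1 };
        printf("%d\n", pick_fallback(MOVABLE, have));     /* 3: CMA first */
        printf("%d\n", pick_fallback(RECLAIMABLE, have)); /* 0: UNMOVABLE */
        return 0;
    }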
@@ -973,12 +998,12 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype)
973 /* Find the largest possible block of pages in the other list */ 998 /* Find the largest possible block of pages in the other list */
974 for (current_order = MAX_ORDER-1; current_order >= order; 999 for (current_order = MAX_ORDER-1; current_order >= order;
975 --current_order) { 1000 --current_order) {
976 for (i = 0; i < MIGRATE_TYPES - 1; i++) { 1001 for (i = 0;; i++) {
977 migratetype = fallbacks[start_migratetype][i]; 1002 migratetype = fallbacks[start_migratetype][i];
978 1003
979 /* MIGRATE_RESERVE handled later if necessary */ 1004 /* MIGRATE_RESERVE handled later if necessary */
980 if (migratetype == MIGRATE_RESERVE) 1005 if (migratetype == MIGRATE_RESERVE)
981 continue; 1006 break;
982 1007
983 area = &(zone->free_area[current_order]); 1008 area = &(zone->free_area[current_order]);
984 if (list_empty(&area->free_list[migratetype])) 1009 if (list_empty(&area->free_list[migratetype]))
@@ -993,11 +1018,18 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype)
993 * pages to the preferred allocation list. If falling 1018 * pages to the preferred allocation list. If falling
994 * back for a reclaimable kernel allocation, be more 1019 * back for a reclaimable kernel allocation, be more
995 * aggressive about taking ownership of free pages 1020 * aggressive about taking ownership of free pages
1021 *
1022 * On the other hand, never change migration
1023 * type of MIGRATE_CMA pageblocks nor move CMA
1024 * pages on different free lists. We don't
1025 * want unmovable pages to be allocated from
1026 * MIGRATE_CMA areas.
996 */ 1027 */
997 if (unlikely(current_order >= (pageblock_order >> 1)) || 1028 if (!is_migrate_cma(migratetype) &&
998 start_migratetype == MIGRATE_RECLAIMABLE || 1029 (unlikely(current_order >= pageblock_order / 2) ||
999 page_group_by_mobility_disabled) { 1030 start_migratetype == MIGRATE_RECLAIMABLE ||
1000 unsigned long pages; 1031 page_group_by_mobility_disabled)) {
1032 int pages;
1001 pages = move_freepages_block(zone, page, 1033 pages = move_freepages_block(zone, page,
1002 start_migratetype); 1034 start_migratetype);
1003 1035
@@ -1015,11 +1047,14 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype)
1015 rmv_page_order(page); 1047 rmv_page_order(page);
1016 1048
1017 /* Take ownership for orders >= pageblock_order */ 1049 /* Take ownership for orders >= pageblock_order */
1018 if (current_order >= pageblock_order) 1050 if (current_order >= pageblock_order &&
1051 !is_migrate_cma(migratetype))
1019 change_pageblock_range(page, current_order, 1052 change_pageblock_range(page, current_order,
1020 start_migratetype); 1053 start_migratetype);
1021 1054
1022 expand(zone, page, order, current_order, area, migratetype); 1055 expand(zone, page, order, current_order, area,
1056 is_migrate_cma(migratetype)
1057 ? migratetype : start_migratetype);
1023 1058
1024 trace_mm_page_alloc_extfrag(page, order, current_order, 1059 trace_mm_page_alloc_extfrag(page, order, current_order,
1025 start_migratetype, migratetype); 1060 start_migratetype, migratetype);
@@ -1061,17 +1096,17 @@ retry_reserve:
1061 return page; 1096 return page;
1062} 1097}
1063 1098
1064/* 1099/*
1065 * Obtain a specified number of elements from the buddy allocator, all under 1100 * Obtain a specified number of elements from the buddy allocator, all under
1066 * a single hold of the lock, for efficiency. Add them to the supplied list. 1101 * a single hold of the lock, for efficiency. Add them to the supplied list.
1067 * Returns the number of new pages which were placed at *list. 1102 * Returns the number of new pages which were placed at *list.
1068 */ 1103 */
1069static int rmqueue_bulk(struct zone *zone, unsigned int order, 1104static int rmqueue_bulk(struct zone *zone, unsigned int order,
1070 unsigned long count, struct list_head *list, 1105 unsigned long count, struct list_head *list,
1071 int migratetype, int cold) 1106 int migratetype, int cold)
1072{ 1107{
1073 int i; 1108 int mt = migratetype, i;
1074 1109
1075 spin_lock(&zone->lock); 1110 spin_lock(&zone->lock);
1076 for (i = 0; i < count; ++i) { 1111 for (i = 0; i < count; ++i) {
1077 struct page *page = __rmqueue(zone, order, migratetype); 1112 struct page *page = __rmqueue(zone, order, migratetype);
@@ -1091,7 +1126,12 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
1091 list_add(&page->lru, list); 1126 list_add(&page->lru, list);
1092 else 1127 else
1093 list_add_tail(&page->lru, list); 1128 list_add_tail(&page->lru, list);
1094 set_page_private(page, migratetype); 1129 if (IS_ENABLED(CONFIG_CMA)) {
1130 mt = get_pageblock_migratetype(page);
1131 if (!is_migrate_cma(mt) && mt != MIGRATE_ISOLATE)
1132 mt = migratetype;
1133 }
1134 set_page_private(page, mt);
1095 list = &page->lru; 1135 list = &page->lru;
1096 } 1136 }
1097 __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order)); 1137 __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order));
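
The rmqueue_bulk() hunk above records, in page_private, the migratetype that will be used when the page is later freed from the per-cpu list; with CMA compiled in, pages from MIGRATE_CMA or MIGRATE_ISOLATE pageblocks keep their real type so they go back to the right free list. The decision in isolation; private_type() and the enum are stand-ins for the kernel helpers, not their actual names:

    #include <stdio.h>
    #include <stdbool.h>

    enum mt { UNMOVABLE, RECLAIMABLE, MOVABLE, CMA, RESERVE, ISOLATE };

    static bool cma_enabled = true;   /* models IS_ENABLED(CONFIG_CMA) */

    /* What should be remembered for a page taken from a block of type
     * 'block_mt' while allocating for 'requested_mt'? */
    static enum mt private_type(enum mt requested_mt, enum mt block_mt)
    {
        if (!cma_enabled)
            return requested_mt;
        if (block_mt == CMA || block_mt == ISOLATE)
            return block_mt;          /* preserve the special type */
        return requested_mt;
    }

    int main(void)
    {
        printf("%d\n", private_type(MOVABLE, CMA));       /* 3: stays CMA */
        printf("%d\n", private_type(MOVABLE, UNMOVABLE)); /* 2: requested type */
        return 0;
    }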
@@ -1371,8 +1411,12 @@ int split_free_page(struct page *page)
1371 1411
1372 if (order >= pageblock_order - 1) { 1412 if (order >= pageblock_order - 1) {
1373 struct page *endpage = page + (1 << order) - 1; 1413 struct page *endpage = page + (1 << order) - 1;
1374 for (; page < endpage; page += pageblock_nr_pages) 1414 for (; page < endpage; page += pageblock_nr_pages) {
1375 set_pageblock_migratetype(page, MIGRATE_MOVABLE); 1415 int mt = get_pageblock_migratetype(page);
1416 if (mt != MIGRATE_ISOLATE && !is_migrate_cma(mt))
1417 set_pageblock_migratetype(page,
1418 MIGRATE_MOVABLE);
1419 }
1376 } 1420 }
1377 1421
1378 return 1 << order; 1422 return 1 << order;
@@ -2086,16 +2130,13 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
2086} 2130}
2087#endif /* CONFIG_COMPACTION */ 2131#endif /* CONFIG_COMPACTION */
2088 2132
2089/* The really slow allocator path where we enter direct reclaim */ 2133/* Perform direct synchronous page reclaim */
2090static inline struct page * 2134static int
2091__alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, 2135__perform_reclaim(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist,
2092 struct zonelist *zonelist, enum zone_type high_zoneidx, 2136 nodemask_t *nodemask)
2093 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
2094 int migratetype, unsigned long *did_some_progress)
2095{ 2137{
2096 struct page *page = NULL;
2097 struct reclaim_state reclaim_state; 2138 struct reclaim_state reclaim_state;
2098 bool drained = false; 2139 int progress;
2099 2140
2100 cond_resched(); 2141 cond_resched();
2101 2142
@@ -2106,7 +2147,7 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
2106 reclaim_state.reclaimed_slab = 0; 2147 reclaim_state.reclaimed_slab = 0;
2107 current->reclaim_state = &reclaim_state; 2148 current->reclaim_state = &reclaim_state;
2108 2149
2109 *did_some_progress = try_to_free_pages(zonelist, order, gfp_mask, nodemask); 2150 progress = try_to_free_pages(zonelist, order, gfp_mask, nodemask);
2110 2151
2111 current->reclaim_state = NULL; 2152 current->reclaim_state = NULL;
2112 lockdep_clear_current_reclaim_state(); 2153 lockdep_clear_current_reclaim_state();
@@ -2114,6 +2155,21 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
2114 2155
2115 cond_resched(); 2156 cond_resched();
2116 2157
2158 return progress;
2159}
2160
2161/* The really slow allocator path where we enter direct reclaim */
2162static inline struct page *
2163__alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
2164 struct zonelist *zonelist, enum zone_type high_zoneidx,
2165 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
2166 int migratetype, unsigned long *did_some_progress)
2167{
2168 struct page *page = NULL;
2169 bool drained = false;
2170
2171 *did_some_progress = __perform_reclaim(gfp_mask, order, zonelist,
2172 nodemask);
2117 if (unlikely(!(*did_some_progress))) 2173 if (unlikely(!(*did_some_progress)))
2118 return NULL; 2174 return NULL;
2119 2175
@@ -4244,25 +4300,24 @@ static inline void setup_usemap(struct pglist_data *pgdat,
4244 4300
4245#ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE 4301#ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
4246 4302
4247/* Return a sensible default order for the pageblock size. */
4248static inline int pageblock_default_order(void)
4249{
4250 if (HPAGE_SHIFT > PAGE_SHIFT)
4251 return HUGETLB_PAGE_ORDER;
4252
4253 return MAX_ORDER-1;
4254}
4255
4256/* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */ 4303/* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */
4257static inline void __init set_pageblock_order(unsigned int order) 4304static inline void __init set_pageblock_order(void)
4258{ 4305{
4306 unsigned int order;
4307
4259 /* Check that pageblock_nr_pages has not already been setup */ 4308 /* Check that pageblock_nr_pages has not already been setup */
4260 if (pageblock_order) 4309 if (pageblock_order)
4261 return; 4310 return;
4262 4311
4312 if (HPAGE_SHIFT > PAGE_SHIFT)
4313 order = HUGETLB_PAGE_ORDER;
4314 else
4315 order = MAX_ORDER - 1;
4316
4263 /* 4317 /*
4264 * Assume the largest contiguous order of interest is a huge page. 4318 * Assume the largest contiguous order of interest is a huge page.
4265 * This value may be variable depending on boot parameters on IA64 4319 * This value may be variable depending on boot parameters on IA64 and
4320 * powerpc.
4266 */ 4321 */
4267 pageblock_order = order; 4322 pageblock_order = order;
4268} 4323}
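
Under CONFIG_HUGETLB_PAGE_SIZE_VARIABLE, set_pageblock_order() now chooses the order itself instead of taking it from the removed pageblock_default_order(): the huge-page order when huge pages are larger than base pages, otherwise MAX_ORDER-1, and only on the first call. The selection rule with example shift values (2 MiB huge pages, 4 KiB base pages, chosen for the sketch):

    #include <stdio.h>

    #define PAGE_SHIFT   12
    #define HPAGE_SHIFT  21                     /* e.g. 2 MiB huge pages */
    #define HUGETLB_PAGE_ORDER (HPAGE_SHIFT - PAGE_SHIFT)
    #define MAX_ORDER    11

    static unsigned int pageblock_order;        /* 0 until first call */

    static void set_pageblock_order(void)
    {
        unsigned int order;

        if (pageblock_order)                    /* already set up */
            return;
        if (HPAGE_SHIFT > PAGE_SHIFT)
            order = HUGETLB_PAGE_ORDER;
        else
            order = MAX_ORDER - 1;
        pageblock_order = order;
    }

    int main(void)
    {
        set_pageblock_order();
        printf("pageblock_order = %u (%lu KiB blocks)\n", pageblock_order,
               (1UL << (pageblock_order + PAGE_SHIFT)) >> 10);  /* 9, 2048 KiB */
        return 0;
    }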
@@ -4270,15 +4325,13 @@ static inline void __init set_pageblock_order(unsigned int order)
4270 4325
4271/* 4326/*
4272 * When CONFIG_HUGETLB_PAGE_SIZE_VARIABLE is not set, set_pageblock_order() 4327 * When CONFIG_HUGETLB_PAGE_SIZE_VARIABLE is not set, set_pageblock_order()
4273 * and pageblock_default_order() are unused as pageblock_order is set 4328 * is unused as pageblock_order is set at compile-time. See
4274 * at compile-time. See include/linux/pageblock-flags.h for the values of 4329 * include/linux/pageblock-flags.h for the values of pageblock_order based on
4275 * pageblock_order based on the kernel config 4330 * the kernel config
4276 */ 4331 */
4277static inline int pageblock_default_order(unsigned int order) 4332static inline void set_pageblock_order(void)
4278{ 4333{
4279 return MAX_ORDER-1;
4280} 4334}
4281#define set_pageblock_order(x) do {} while (0)
4282 4335
4283#endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */ 4336#endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */
4284 4337
@@ -4301,11 +4354,10 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
4301 init_waitqueue_head(&pgdat->kswapd_wait); 4354 init_waitqueue_head(&pgdat->kswapd_wait);
4302 pgdat->kswapd_max_order = 0; 4355 pgdat->kswapd_max_order = 0;
4303 pgdat_page_cgroup_init(pgdat); 4356 pgdat_page_cgroup_init(pgdat);
4304 4357
4305 for (j = 0; j < MAX_NR_ZONES; j++) { 4358 for (j = 0; j < MAX_NR_ZONES; j++) {
4306 struct zone *zone = pgdat->node_zones + j; 4359 struct zone *zone = pgdat->node_zones + j;
4307 unsigned long size, realsize, memmap_pages; 4360 unsigned long size, realsize, memmap_pages;
4308 enum lru_list lru;
4309 4361
4310 size = zone_spanned_pages_in_node(nid, j, zones_size); 4362 size = zone_spanned_pages_in_node(nid, j, zones_size);
4311 realsize = size - zone_absent_pages_in_node(nid, j, 4363 realsize = size - zone_absent_pages_in_node(nid, j,
@@ -4355,18 +4407,13 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
4355 zone->zone_pgdat = pgdat; 4407 zone->zone_pgdat = pgdat;
4356 4408
4357 zone_pcp_init(zone); 4409 zone_pcp_init(zone);
4358 for_each_lru(lru) 4410 lruvec_init(&zone->lruvec, zone);
4359 INIT_LIST_HEAD(&zone->lruvec.lists[lru]);
4360 zone->reclaim_stat.recent_rotated[0] = 0;
4361 zone->reclaim_stat.recent_rotated[1] = 0;
4362 zone->reclaim_stat.recent_scanned[0] = 0;
4363 zone->reclaim_stat.recent_scanned[1] = 0;
4364 zap_zone_vm_stats(zone); 4411 zap_zone_vm_stats(zone);
4365 zone->flags = 0; 4412 zone->flags = 0;
4366 if (!size) 4413 if (!size)
4367 continue; 4414 continue;
4368 4415
4369 set_pageblock_order(pageblock_default_order()); 4416 set_pageblock_order();
4370 setup_usemap(pgdat, zone, size); 4417 setup_usemap(pgdat, zone, size);
4371 ret = init_currently_empty_zone(zone, zone_start_pfn, 4418 ret = init_currently_empty_zone(zone, zone_start_pfn,
4372 size, MEMMAP_EARLY); 4419 size, MEMMAP_EARLY);
@@ -4759,31 +4806,34 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)
4759 find_zone_movable_pfns_for_nodes(); 4806 find_zone_movable_pfns_for_nodes();
4760 4807
4761 /* Print out the zone ranges */ 4808 /* Print out the zone ranges */
4762 printk("Zone PFN ranges:\n"); 4809 printk("Zone ranges:\n");
4763 for (i = 0; i < MAX_NR_ZONES; i++) { 4810 for (i = 0; i < MAX_NR_ZONES; i++) {
4764 if (i == ZONE_MOVABLE) 4811 if (i == ZONE_MOVABLE)
4765 continue; 4812 continue;
4766 printk(" %-8s ", zone_names[i]); 4813 printk(KERN_CONT " %-8s ", zone_names[i]);
4767 if (arch_zone_lowest_possible_pfn[i] == 4814 if (arch_zone_lowest_possible_pfn[i] ==
4768 arch_zone_highest_possible_pfn[i]) 4815 arch_zone_highest_possible_pfn[i])
4769 printk("empty\n"); 4816 printk(KERN_CONT "empty\n");
4770 else 4817 else
4771 printk("%0#10lx -> %0#10lx\n", 4818 printk(KERN_CONT "[mem %0#10lx-%0#10lx]\n",
4772 arch_zone_lowest_possible_pfn[i], 4819 arch_zone_lowest_possible_pfn[i] << PAGE_SHIFT,
4773 arch_zone_highest_possible_pfn[i]); 4820 (arch_zone_highest_possible_pfn[i]
4821 << PAGE_SHIFT) - 1);
4774 } 4822 }
4775 4823
4776 /* Print out the PFNs ZONE_MOVABLE begins at in each node */ 4824 /* Print out the PFNs ZONE_MOVABLE begins at in each node */
4777 printk("Movable zone start PFN for each node\n"); 4825 printk("Movable zone start for each node\n");
4778 for (i = 0; i < MAX_NUMNODES; i++) { 4826 for (i = 0; i < MAX_NUMNODES; i++) {
4779 if (zone_movable_pfn[i]) 4827 if (zone_movable_pfn[i])
4780 printk(" Node %d: %lu\n", i, zone_movable_pfn[i]); 4828 printk(" Node %d: %#010lx\n", i,
4829 zone_movable_pfn[i] << PAGE_SHIFT);
4781 } 4830 }
4782 4831
4783 /* Print out the early_node_map[] */ 4832 /* Print out the early_node_map[] */
4784 printk("Early memory PFN ranges\n"); 4833 printk("Early memory node ranges\n");
4785 for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) 4834 for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid)
4786 printk(" %3d: %0#10lx -> %0#10lx\n", nid, start_pfn, end_pfn); 4835 printk(" node %3d: [mem %#010lx-%#010lx]\n", nid,
4836 start_pfn << PAGE_SHIFT, (end_pfn << PAGE_SHIFT) - 1);
4787 4837
4788 /* Initialise every node */ 4838 /* Initialise every node */
4789 mminit_verify_pageflags_layout(); 4839 mminit_verify_pageflags_layout();
@@ -4976,14 +5026,7 @@ static void setup_per_zone_lowmem_reserve(void)
4976 calculate_totalreserve_pages(); 5026 calculate_totalreserve_pages();
4977} 5027}
4978 5028
4979/** 5029static void __setup_per_zone_wmarks(void)
4980 * setup_per_zone_wmarks - called when min_free_kbytes changes
4981 * or when memory is hot-{added|removed}
4982 *
4983 * Ensures that the watermark[min,low,high] values for each zone are set
4984 * correctly with respect to min_free_kbytes.
4985 */
4986void setup_per_zone_wmarks(void)
4987{ 5030{
4988 unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10); 5031 unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10);
4989 unsigned long lowmem_pages = 0; 5032 unsigned long lowmem_pages = 0;
@@ -5030,6 +5073,11 @@ void setup_per_zone_wmarks(void)
5030 5073
5031 zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + (tmp >> 2); 5074 zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + (tmp >> 2);
5032 zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1); 5075 zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1);
5076
5077 zone->watermark[WMARK_MIN] += cma_wmark_pages(zone);
5078 zone->watermark[WMARK_LOW] += cma_wmark_pages(zone);
5079 zone->watermark[WMARK_HIGH] += cma_wmark_pages(zone);
5080
5033 setup_zone_migrate_reserve(zone); 5081 setup_zone_migrate_reserve(zone);
5034 spin_unlock_irqrestore(&zone->lock, flags); 5082 spin_unlock_irqrestore(&zone->lock, flags);
5035 } 5083 }
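
__setup_per_zone_wmarks() still derives the low and high marks from the zone's share of min_free_kbytes (min plus a quarter, min plus a half) and now adds the zone's CMA reserve on top of all three, so ordinary allocations do not eat into the CMA area; setup_per_zone_wmarks() becomes a thin wrapper that takes zonelists_mutex (next hunk). Just the per-zone arithmetic, with invented numbers:

    #include <stdio.h>

    struct zone_marks { unsigned long min, low, high; };

    /* min_pages: the zone's share of min_free_kbytes in pages;
     * cma_pages: the zone's CMA bump (cma_wmark_pages() in the kernel). */
    static struct zone_marks setup_wmarks(unsigned long min_pages,
                                          unsigned long cma_pages)
    {
        struct zone_marks m;

        m.min  = min_pages;
        m.low  = m.min + (min_pages >> 2);   /* +25% */
        m.high = m.min + (min_pages >> 1);   /* +50% */

        m.min  += cma_pages;                 /* keep CMA above every mark */
        m.low  += cma_pages;
        m.high += cma_pages;
        return m;
    }

    int main(void)
    {
        struct zone_marks m = setup_wmarks(4000, 512);
        printf("min=%lu low=%lu high=%lu\n", m.min, m.low, m.high);
        /* min=4512 low=5512 high=6512 */
        return 0;
    }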
@@ -5038,6 +5086,20 @@ void setup_per_zone_wmarks(void)
5038 calculate_totalreserve_pages(); 5086 calculate_totalreserve_pages();
5039} 5087}
5040 5088
5089/**
5090 * setup_per_zone_wmarks - called when min_free_kbytes changes
5091 * or when memory is hot-{added|removed}
5092 *
5093 * Ensures that the watermark[min,low,high] values for each zone are set
5094 * correctly with respect to min_free_kbytes.
5095 */
5096void setup_per_zone_wmarks(void)
5097{
5098 mutex_lock(&zonelists_mutex);
5099 __setup_per_zone_wmarks();
5100 mutex_unlock(&zonelists_mutex);
5101}
5102
5041/* 5103/*
5042 * The inactive anon list should be small enough that the VM never has to 5104 * The inactive anon list should be small enough that the VM never has to
5043 * do too much work, but large enough that each inactive page has a chance 5105 * do too much work, but large enough that each inactive page has a chance
@@ -5242,9 +5304,10 @@ void *__init alloc_large_system_hash(const char *tablename,
5242 int flags, 5304 int flags,
5243 unsigned int *_hash_shift, 5305 unsigned int *_hash_shift,
5244 unsigned int *_hash_mask, 5306 unsigned int *_hash_mask,
5245 unsigned long limit) 5307 unsigned long low_limit,
5308 unsigned long high_limit)
5246{ 5309{
5247 unsigned long long max = limit; 5310 unsigned long long max = high_limit;
5248 unsigned long log2qty, size; 5311 unsigned long log2qty, size;
5249 void *table = NULL; 5312 void *table = NULL;
5250 5313
@@ -5282,6 +5345,8 @@ void *__init alloc_large_system_hash(const char *tablename,
5282 } 5345 }
5283 max = min(max, 0x80000000ULL); 5346 max = min(max, 0x80000000ULL);
5284 5347
5348 if (numentries < low_limit)
5349 numentries = low_limit;
5285 if (numentries > max) 5350 if (numentries > max)
5286 numentries = max; 5351 numentries = max;
5287 5352
@@ -5412,14 +5477,16 @@ static int
5412__count_immobile_pages(struct zone *zone, struct page *page, int count) 5477__count_immobile_pages(struct zone *zone, struct page *page, int count)
5413{ 5478{
5414 unsigned long pfn, iter, found; 5479 unsigned long pfn, iter, found;
5480 int mt;
5481
5415 /* 5482 /*
5416 * For avoiding noise data, lru_add_drain_all() should be called 5483 * For avoiding noise data, lru_add_drain_all() should be called
5417 * If ZONE_MOVABLE, the zone never contains immobile pages 5484 * If ZONE_MOVABLE, the zone never contains immobile pages
5418 */ 5485 */
5419 if (zone_idx(zone) == ZONE_MOVABLE) 5486 if (zone_idx(zone) == ZONE_MOVABLE)
5420 return true; 5487 return true;
5421 5488 mt = get_pageblock_migratetype(page);
5422 if (get_pageblock_migratetype(page) == MIGRATE_MOVABLE) 5489 if (mt == MIGRATE_MOVABLE || is_migrate_cma(mt))
5423 return true; 5490 return true;
5424 5491
5425 pfn = page_to_pfn(page); 5492 pfn = page_to_pfn(page);
@@ -5536,7 +5603,7 @@ out:
5536 return ret; 5603 return ret;
5537} 5604}
5538 5605
5539void unset_migratetype_isolate(struct page *page) 5606void unset_migratetype_isolate(struct page *page, unsigned migratetype)
5540{ 5607{
5541 struct zone *zone; 5608 struct zone *zone;
5542 unsigned long flags; 5609 unsigned long flags;
@@ -5544,12 +5611,264 @@ void unset_migratetype_isolate(struct page *page)
5544 spin_lock_irqsave(&zone->lock, flags); 5611 spin_lock_irqsave(&zone->lock, flags);
5545 if (get_pageblock_migratetype(page) != MIGRATE_ISOLATE) 5612 if (get_pageblock_migratetype(page) != MIGRATE_ISOLATE)
5546 goto out; 5613 goto out;
5547 set_pageblock_migratetype(page, MIGRATE_MOVABLE); 5614 set_pageblock_migratetype(page, migratetype);
5548 move_freepages_block(zone, page, MIGRATE_MOVABLE); 5615 move_freepages_block(zone, page, migratetype);
5549out: 5616out:
5550 spin_unlock_irqrestore(&zone->lock, flags); 5617 spin_unlock_irqrestore(&zone->lock, flags);
5551} 5618}
5552 5619
5620#ifdef CONFIG_CMA
5621
5622static unsigned long pfn_max_align_down(unsigned long pfn)
5623{
5624 return pfn & ~(max_t(unsigned long, MAX_ORDER_NR_PAGES,
5625 pageblock_nr_pages) - 1);
5626}
5627
5628static unsigned long pfn_max_align_up(unsigned long pfn)
5629{
5630 return ALIGN(pfn, max_t(unsigned long, MAX_ORDER_NR_PAGES,
5631 pageblock_nr_pages));
5632}
5633
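A stand-alone illustration of the rounding these helpers perform, assuming the typical x86_64 values MAX_ORDER_NR_PAGES = 1024 and pageblock_nr_pages = 512 (the values and the program itself are illustrative, not part of the patch):

#include <stdio.h>

#define MAX_ORDER_NR_PAGES	1024UL	/* assumed typical value */
#define PAGEBLOCK_NR_PAGES	 512UL	/* assumed typical value */

static unsigned long align_down(unsigned long pfn, unsigned long step)
{
	return pfn & ~(step - 1);
}

static unsigned long align_up(unsigned long pfn, unsigned long step)
{
	return (pfn + step - 1) & ~(step - 1);
}

int main(void)
{
	unsigned long step = MAX_ORDER_NR_PAGES > PAGEBLOCK_NR_PAGES ?
			     MAX_ORDER_NR_PAGES : PAGEBLOCK_NR_PAGES;

	/* With step 1024, pfn 0x12345 rounds down to 0x12000, up to 0x12400 */
	printf("%#lx %#lx\n", align_down(0x12345, step),
	       align_up(0x12345, step));
	return 0;
}
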
5634static struct page *
5635__alloc_contig_migrate_alloc(struct page *page, unsigned long private,
5636 int **resultp)
5637{
5638 gfp_t gfp_mask = GFP_USER | __GFP_MOVABLE;
5639
5640 if (PageHighMem(page))
5641 gfp_mask |= __GFP_HIGHMEM;
5642
5643 return alloc_page(gfp_mask);
5644}
5645
5646/* [start, end) must belong to a single zone. */
5647static int __alloc_contig_migrate_range(unsigned long start, unsigned long end)
5648{
5649 /* This function is based on compact_zone() from compaction.c. */
5650
5651 unsigned long pfn = start;
5652 unsigned int tries = 0;
5653 int ret = 0;
5654
5655 struct compact_control cc = {
5656 .nr_migratepages = 0,
5657 .order = -1,
5658 .zone = page_zone(pfn_to_page(start)),
5659 .sync = true,
5660 };
5661 INIT_LIST_HEAD(&cc.migratepages);
5662
5663 migrate_prep_local();
5664
5665 while (pfn < end || !list_empty(&cc.migratepages)) {
5666 if (fatal_signal_pending(current)) {
5667 ret = -EINTR;
5668 break;
5669 }
5670
5671 if (list_empty(&cc.migratepages)) {
5672 cc.nr_migratepages = 0;
5673 pfn = isolate_migratepages_range(cc.zone, &cc,
5674 pfn, end);
5675 if (!pfn) {
5676 ret = -EINTR;
5677 break;
5678 }
5679 tries = 0;
5680 } else if (++tries == 5) {
5681 ret = ret < 0 ? ret : -EBUSY;
5682 break;
5683 }
5684
5685 ret = migrate_pages(&cc.migratepages,
5686 __alloc_contig_migrate_alloc,
5687 0, false, MIGRATE_SYNC);
5688 }
5689
5690 putback_lru_pages(&cc.migratepages);
5691 return ret > 0 ? 0 : ret;
5692}
5693
5694/*
5695 * Update zone's cma pages counter used for watermark level calculation.
5696 */
5697static inline void __update_cma_watermarks(struct zone *zone, int count)
5698{
5699 unsigned long flags;
5700 spin_lock_irqsave(&zone->lock, flags);
5701 zone->min_cma_pages += count;
5702 spin_unlock_irqrestore(&zone->lock, flags);
5703 setup_per_zone_wmarks();
5704}
5705
5706/*
5707 * Trigger memory pressure bump to reclaim some pages in order to be able to
5708 * allocate 'count' pages in single page units. Does work similar to
5709 * the __alloc_pages_slowpath() function.
5710 */
5711static int __reclaim_pages(struct zone *zone, gfp_t gfp_mask, int count)
5712{
5713 enum zone_type high_zoneidx = gfp_zone(gfp_mask);
5714 struct zonelist *zonelist = node_zonelist(0, gfp_mask);
5715 int did_some_progress = 0;
5716 int order = 1;
5717
5718 /*
5719 * Increase the watermark level to force kswapd to do its job and
5720 * stabilise at the new watermark level.
5721 */
5722 __update_cma_watermarks(zone, count);
5723
5724 /* Obey watermarks as if the page was being allocated */
5725 while (!zone_watermark_ok(zone, 0, low_wmark_pages(zone), 0, 0)) {
5726 wake_all_kswapd(order, zonelist, high_zoneidx, zone_idx(zone));
5727
5728 did_some_progress = __perform_reclaim(gfp_mask, order, zonelist,
5729 NULL);
5730 if (!did_some_progress) {
5731 /* Exhausted what can be done so it's blamo time */
5732 out_of_memory(zonelist, gfp_mask, order, NULL, false);
5733 }
5734 }
5735
5736 /* Restore original watermark levels. */
5737 __update_cma_watermarks(zone, -count);
5738
5739 return count;
5740}
5741
5742/**
5743 * alloc_contig_range() -- tries to allocate given range of pages
5744 * @start: start PFN to allocate
5745 * @end: one-past-the-last PFN to allocate
5746 * @migratetype: migratetype of the underlying pageblocks (either
5747 * #MIGRATE_MOVABLE or #MIGRATE_CMA). All pageblocks
5748 * in range must have the same migratetype and it must
5749 * be either of the two.
5750 *
5751 * The PFN range does not have to be pageblock or MAX_ORDER_NR_PAGES
5752 * aligned, however it's the caller's responsibility to guarantee that
5753 * we are the only thread that changes migrate type of pageblocks the
5754 * pages fall in.
5755 *
5756 * The PFN range must belong to a single zone.
5757 *
5758 * Returns zero on success or a negative error code. On success all
5759 * pages whose PFN is in [start, end) are allocated for the caller and
5760 * need to be freed with free_contig_range().
5761 */
5762int alloc_contig_range(unsigned long start, unsigned long end,
5763 unsigned migratetype)
5764{
5765 struct zone *zone = page_zone(pfn_to_page(start));
5766 unsigned long outer_start, outer_end;
5767 int ret = 0, order;
5768
5769 /*
5770 * What we do here is we mark all pageblocks in range as
5771 * MIGRATE_ISOLATE. Because pageblock and max order pages may
5772 * have different sizes, and due to the way the page allocator
5773 * works, we align the range to the bigger of the two so
5774 * that the page allocator won't try to merge buddies from
5775 * different pageblocks and change MIGRATE_ISOLATE to some
5776 * other migration type.
5777 *
5778 * Once the pageblocks are marked as MIGRATE_ISOLATE, we
5779 * migrate the pages from an unaligned range (i.e. pages that
5780 * we are interested in). This will put all the pages in
5781 * range back to page allocator as MIGRATE_ISOLATE.
5782 *
5783 * When this is done, we take the pages in range from page
5784 * allocator removing them from the buddy system. This way
5785 * page allocator will never consider using them.
5786 *
5787 * This lets us mark the pageblocks back as
5788 * MIGRATE_CMA/MIGRATE_MOVABLE so that free pages in the
5789 * aligned range but not in the unaligned, original range are
5790 * put back to page allocator so that buddy can use them.
5791 */
5792
5793 ret = start_isolate_page_range(pfn_max_align_down(start),
5794 pfn_max_align_up(end), migratetype);
5795 if (ret)
5796 goto done;
5797
5798 ret = __alloc_contig_migrate_range(start, end);
5799 if (ret)
5800 goto done;
5801
5802 /*
5803 * Pages from [start, end) are within a MAX_ORDER_NR_PAGES
5804 * aligned blocks that are marked as MIGRATE_ISOLATE. What's
5805 * more, all pages in [start, end) are free in page allocator.
5806 * What we are going to do is to allocate all pages from
5807 * [start, end) (that is remove them from page allocator).
5808 *
5809 * The only problem is that pages at the beginning and at the
5810 * end of the interesting range may not be aligned with pages that
5811 * the page allocator holds, i.e. they can be part of higher order
5812 * pages. Because of this, we reserve the bigger range and
5813 * once this is done free the pages we are not interested in.
5814 *
5815 * We don't have to hold zone->lock here because the pages are
5816 * isolated thus they won't get removed from buddy.
5817 */
5818
5819 lru_add_drain_all();
5820 drain_all_pages();
5821
5822 order = 0;
5823 outer_start = start;
5824 while (!PageBuddy(pfn_to_page(outer_start))) {
5825 if (++order >= MAX_ORDER) {
5826 ret = -EBUSY;
5827 goto done;
5828 }
5829 outer_start &= ~0UL << order;
5830 }
5831
5832 /* Make sure the range is really isolated. */
5833 if (test_pages_isolated(outer_start, end)) {
5834 pr_warn("alloc_contig_range test_pages_isolated(%lx, %lx) failed\n",
5835 outer_start, end);
5836 ret = -EBUSY;
5837 goto done;
5838 }
5839
5840 /*
5841 * Reclaim enough pages to make sure that contiguous allocation
5842 * will not starve the system.
5843 */
5844 __reclaim_pages(zone, GFP_HIGHUSER_MOVABLE, end-start);
5845
5846 /* Grab isolated pages from freelists. */
5847 outer_end = isolate_freepages_range(outer_start, end);
5848 if (!outer_end) {
5849 ret = -EBUSY;
5850 goto done;
5851 }
5852
5853 /* Free head and tail (if any) */
5854 if (start != outer_start)
5855 free_contig_range(outer_start, start - outer_start);
5856 if (end != outer_end)
5857 free_contig_range(end, outer_end - end);
5858
5859done:
5860 undo_isolate_page_range(pfn_max_align_down(start),
5861 pfn_max_align_up(end), migratetype);
5862 return ret;
5863}
5864
5865void free_contig_range(unsigned long pfn, unsigned nr_pages)
5866{
5867 for (; nr_pages--; ++pfn)
5868 __free_page(pfn_to_page(pfn));
5869}
5870#endif
5871
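A hedged sketch of how a CMA-style caller might drive the two new entry points; example_cma_alloc/example_cma_release and the assumption that the covered pageblocks were already marked MIGRATE_CMA at reservation time are illustrative, not code from this series:

/* Hypothetical caller: grab 'count' physically contiguous pages. */
static struct page *example_cma_alloc(unsigned long base_pfn,
				      unsigned long count)
{
	int ret;

	/* Pageblocks covering [base_pfn, base_pfn + count) are assumed
	 * to have been marked MIGRATE_CMA when the area was reserved. */
	ret = alloc_contig_range(base_pfn, base_pfn + count, MIGRATE_CMA);
	if (ret)
		return NULL;
	return pfn_to_page(base_pfn);
}

static void example_cma_release(struct page *page, unsigned long count)
{
	free_contig_range(page_to_pfn(page), count);
}
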
5553#ifdef CONFIG_MEMORY_HOTREMOVE 5872#ifdef CONFIG_MEMORY_HOTREMOVE
5554/* 5873/*
5555 * All pages in the range must be isolated before calling this. 5874 * All pages in the range must be isolated before calling this.
@@ -5618,7 +5937,7 @@ bool is_free_buddy_page(struct page *page)
5618} 5937}
5619#endif 5938#endif
5620 5939
5621static struct trace_print_flags pageflag_names[] = { 5940static const struct trace_print_flags pageflag_names[] = {
5622 {1UL << PG_locked, "locked" }, 5941 {1UL << PG_locked, "locked" },
5623 {1UL << PG_error, "error" }, 5942 {1UL << PG_error, "error" },
5624 {1UL << PG_referenced, "referenced" }, 5943 {1UL << PG_referenced, "referenced" },
@@ -5653,7 +5972,9 @@ static struct trace_print_flags pageflag_names[] = {
5653#ifdef CONFIG_MEMORY_FAILURE 5972#ifdef CONFIG_MEMORY_FAILURE
5654 {1UL << PG_hwpoison, "hwpoison" }, 5973 {1UL << PG_hwpoison, "hwpoison" },
5655#endif 5974#endif
5656 {-1UL, NULL }, 5975#ifdef CONFIG_TRANSPARENT_HUGEPAGE
5976 {1UL << PG_compound_lock, "compound_lock" },
5977#endif
5657}; 5978};
5658 5979
5659static void dump_page_flags(unsigned long flags) 5980static void dump_page_flags(unsigned long flags)
@@ -5662,12 +5983,14 @@ static void dump_page_flags(unsigned long flags)
5662 unsigned long mask; 5983 unsigned long mask;
5663 int i; 5984 int i;
5664 5985
5986 BUILD_BUG_ON(ARRAY_SIZE(pageflag_names) != __NR_PAGEFLAGS);
5987
5665 printk(KERN_ALERT "page flags: %#lx(", flags); 5988 printk(KERN_ALERT "page flags: %#lx(", flags);
5666 5989
5667 /* remove zone id */ 5990 /* remove zone id */
5668 flags &= (1UL << NR_PAGEFLAGS) - 1; 5991 flags &= (1UL << NR_PAGEFLAGS) - 1;
5669 5992
5670 for (i = 0; pageflag_names[i].name && flags; i++) { 5993 for (i = 0; i < ARRAY_SIZE(pageflag_names) && flags; i++) {
5671 5994
5672 mask = pageflag_names[i].mask; 5995 mask = pageflag_names[i].mask;
5673 if ((flags & mask) != mask) 5996 if ((flags & mask) != mask)
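The BUILD_BUG_ON() added above is the usual compile-time guard for keeping a name table in lock-step with the enum it describes; a stand-alone sketch of the idiom with hypothetical names (example_state, example_state_names):

#include <linux/bug.h>
#include <linux/kernel.h>

enum example_state { EX_NEW, EX_RUNNING, EX_DONE, __NR_EX_STATES };

static const char *const example_state_names[] = {
	"new", "running", "done",
};

static const char *example_state_name(enum example_state s)
{
	/* Refuses to compile if a state is added without a name. */
	BUILD_BUG_ON(ARRAY_SIZE(example_state_names) != __NR_EX_STATES);
	return example_state_names[s];
}
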
diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c
index 1ccbd714059c..eb750f851395 100644
--- a/mm/page_cgroup.c
+++ b/mm/page_cgroup.c
@@ -392,7 +392,7 @@ static struct swap_cgroup *lookup_swap_cgroup(swp_entry_t ent,
392 392
393/** 393/**
394 * swap_cgroup_cmpxchg - cmpxchg mem_cgroup's id for this swp_entry. 394 * swap_cgroup_cmpxchg - cmpxchg mem_cgroup's id for this swp_entry.
395 * @end: swap entry to be cmpxchged 395 * @ent: swap entry to be cmpxchged
396 * @old: old id 396 * @old: old id
397 * @new: new id 397 * @new: new id
398 * 398 *
@@ -422,7 +422,7 @@ unsigned short swap_cgroup_cmpxchg(swp_entry_t ent,
422/** 422/**
423 * swap_cgroup_record - record mem_cgroup for this swp_entry. 423 * swap_cgroup_record - record mem_cgroup for this swp_entry.
424 * @ent: swap entry to be recorded into 424 * @ent: swap entry to be recorded into
425 * @mem: mem_cgroup to be recorded 425 * @id: mem_cgroup to be recorded
426 * 426 *
427 * Returns old value at success, 0 at failure. 427 * Returns old value at success, 0 at failure.
428 * (Of course, old value can be 0.) 428 * (Of course, old value can be 0.)
diff --git a/mm/page_io.c b/mm/page_io.c
index dc76b4d0611e..34f02923744c 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -18,6 +18,7 @@
18#include <linux/bio.h> 18#include <linux/bio.h>
19#include <linux/swapops.h> 19#include <linux/swapops.h>
20#include <linux/writeback.h> 20#include <linux/writeback.h>
21#include <linux/frontswap.h>
21#include <asm/pgtable.h> 22#include <asm/pgtable.h>
22 23
23static struct bio *get_swap_bio(gfp_t gfp_flags, 24static struct bio *get_swap_bio(gfp_t gfp_flags,
@@ -98,6 +99,12 @@ int swap_writepage(struct page *page, struct writeback_control *wbc)
98 unlock_page(page); 99 unlock_page(page);
99 goto out; 100 goto out;
100 } 101 }
102 if (frontswap_store(page) == 0) {
103 set_page_writeback(page);
104 unlock_page(page);
105 end_page_writeback(page);
106 goto out;
107 }
101 bio = get_swap_bio(GFP_NOIO, page, end_swap_bio_write); 108 bio = get_swap_bio(GFP_NOIO, page, end_swap_bio_write);
102 if (bio == NULL) { 109 if (bio == NULL) {
103 set_page_dirty(page); 110 set_page_dirty(page);
@@ -122,6 +129,11 @@ int swap_readpage(struct page *page)
122 129
123 VM_BUG_ON(!PageLocked(page)); 130 VM_BUG_ON(!PageLocked(page));
124 VM_BUG_ON(PageUptodate(page)); 131 VM_BUG_ON(PageUptodate(page));
132 if (frontswap_load(page) == 0) {
133 SetPageUptodate(page);
134 unlock_page(page);
135 goto out;
136 }
125 bio = get_swap_bio(GFP_KERNEL, page, end_swap_bio_read); 137 bio = get_swap_bio(GFP_KERNEL, page, end_swap_bio_read);
126 if (bio == NULL) { 138 if (bio == NULL) {
127 unlock_page(page); 139 unlock_page(page);
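For context, a hedged sketch of the backend side these hooks call into: frontswap_store()/frontswap_load() return 0 only when a registered backend accepted or supplied the page, otherwise the normal bio path runs. The frontswap_ops layout and frontswap_register_ops() call below are recalled from the API as merged around this series and should be treated as assumptions; the no-op backend itself is purely illustrative.

#include <linux/frontswap.h>

/* Hypothetical backend that never caches anything, so every call
 * falls through to the regular swap I/O path. */
static void ex_init(unsigned type) { }

static int ex_store(unsigned type, pgoff_t offset, struct page *page)
{
	return -1;	/* non-zero: page not taken, write it to disk */
}

static int ex_load(unsigned type, pgoff_t offset, struct page *page)
{
	return -1;	/* non-zero: miss, read the page from disk */
}

static void ex_invalidate_page(unsigned type, pgoff_t offset) { }
static void ex_invalidate_area(unsigned type) { }

static struct frontswap_ops ex_ops = {
	.init		 = ex_init,
	.store		 = ex_store,
	.load		 = ex_load,
	.invalidate_page = ex_invalidate_page,
	.invalidate_area = ex_invalidate_area,
};

/* In module init: frontswap_register_ops(&ex_ops); */
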
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index 4ae42bb40892..c9f04774f2b8 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -24,6 +24,7 @@ __first_valid_page(unsigned long pfn, unsigned long nr_pages)
24 * to be MIGRATE_ISOLATE. 24 * to be MIGRATE_ISOLATE.
25 * @start_pfn: The lower PFN of the range to be isolated. 25 * @start_pfn: The lower PFN of the range to be isolated.
26 * @end_pfn: The upper PFN of the range to be isolated. 26 * @end_pfn: The upper PFN of the range to be isolated.
27 * @migratetype: migrate type to set in error recovery.
27 * 28 *
28 * Making page-allocation-type to be MIGRATE_ISOLATE means free pages in 29 * Making page-allocation-type to be MIGRATE_ISOLATE means free pages in
29 * the range will never be allocated. Any free pages and pages freed in the 30 * the range will never be allocated. Any free pages and pages freed in the
@@ -32,8 +33,8 @@ __first_valid_page(unsigned long pfn, unsigned long nr_pages)
32 * start_pfn/end_pfn must be aligned to pageblock_order. 33 * start_pfn/end_pfn must be aligned to pageblock_order.
33 * Returns 0 on success and -EBUSY if any part of range cannot be isolated. 34 * Returns 0 on success and -EBUSY if any part of range cannot be isolated.
34 */ 35 */
35int 36int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
36start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn) 37 unsigned migratetype)
37{ 38{
38 unsigned long pfn; 39 unsigned long pfn;
39 unsigned long undo_pfn; 40 unsigned long undo_pfn;
@@ -56,7 +57,7 @@ undo:
56 for (pfn = start_pfn; 57 for (pfn = start_pfn;
57 pfn < undo_pfn; 58 pfn < undo_pfn;
58 pfn += pageblock_nr_pages) 59 pfn += pageblock_nr_pages)
59 unset_migratetype_isolate(pfn_to_page(pfn)); 60 unset_migratetype_isolate(pfn_to_page(pfn), migratetype);
60 61
61 return -EBUSY; 62 return -EBUSY;
62} 63}
@@ -64,8 +65,8 @@ undo:
64/* 65/*
65 * Make isolated pages available again. 66 * Make isolated pages available again.
66 */ 67 */
67int 68int undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
68undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn) 69 unsigned migratetype)
69{ 70{
70 unsigned long pfn; 71 unsigned long pfn;
71 struct page *page; 72 struct page *page;
@@ -77,7 +78,7 @@ undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn)
77 page = __first_valid_page(pfn, pageblock_nr_pages); 78 page = __first_valid_page(pfn, pageblock_nr_pages);
78 if (!page || get_pageblock_migratetype(page) != MIGRATE_ISOLATE) 79 if (!page || get_pageblock_migratetype(page) != MIGRATE_ISOLATE)
79 continue; 80 continue;
80 unset_migratetype_isolate(page); 81 unset_migratetype_isolate(page, migratetype);
81 } 82 }
82 return 0; 83 return 0;
83} 84}
@@ -86,7 +87,7 @@ undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn)
86 * all pages in [start_pfn...end_pfn) must be in the same zone. 87 * all pages in [start_pfn...end_pfn) must be in the same zone.
87 * zone->lock must be held before call this. 88 * zone->lock must be held before call this.
88 * 89 *
89 * Returns 1 if all pages in the range is isolated. 90 * Returns 1 if all pages in the range are isolated.
90 */ 91 */
91static int 92static int
92__test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn) 93__test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn)
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index aa9701e12714..6c118d012bb5 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -162,7 +162,6 @@ static int walk_hugetlb_range(struct vm_area_struct *vma,
162 162
163/** 163/**
164 * walk_page_range - walk a memory map's page tables with a callback 164 * walk_page_range - walk a memory map's page tables with a callback
165 * @mm: memory map to walk
166 * @addr: starting address 165 * @addr: starting address
167 * @end: ending address 166 * @end: ending address
168 * @walk: set of callbacks to invoke for each level of the tree 167 * @walk: set of callbacks to invoke for each level of the tree
diff --git a/mm/percpu-vm.c b/mm/percpu-vm.c
index 405d331804c3..3707c71ae4cd 100644
--- a/mm/percpu-vm.c
+++ b/mm/percpu-vm.c
@@ -360,7 +360,6 @@ err_free:
360 * @chunk: chunk to depopulate 360 * @chunk: chunk to depopulate
361 * @off: offset to the area to depopulate 361 * @off: offset to the area to depopulate
362 * @size: size of the area to depopulate in bytes 362 * @size: size of the area to depopulate in bytes
363 * @flush: whether to flush cache and tlb or not
364 * 363 *
365 * For each cpu, depopulate and unmap pages [@page_start,@page_end) 364 * For each cpu, depopulate and unmap pages [@page_start,@page_end)
366 * from @chunk. If @flush is true, vcache is flushed before unmapping 365 * from @chunk. If @flush is true, vcache is flushed before unmapping
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
index 5a74fea182f1..74c0ddaa6fa0 100644
--- a/mm/pgtable-generic.c
+++ b/mm/pgtable-generic.c
@@ -109,8 +109,8 @@ pmd_t pmdp_clear_flush(struct vm_area_struct *vma, unsigned long address,
109 109
110#ifndef __HAVE_ARCH_PMDP_SPLITTING_FLUSH 110#ifndef __HAVE_ARCH_PMDP_SPLITTING_FLUSH
111#ifdef CONFIG_TRANSPARENT_HUGEPAGE 111#ifdef CONFIG_TRANSPARENT_HUGEPAGE
112pmd_t pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address, 112void pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address,
113 pmd_t *pmdp) 113 pmd_t *pmdp)
114{ 114{
115 pmd_t pmd = pmd_mksplitting(*pmdp); 115 pmd_t pmd = pmd_mksplitting(*pmdp);
116 VM_BUG_ON(address & ~HPAGE_PMD_MASK); 116 VM_BUG_ON(address & ~HPAGE_PMD_MASK);
diff --git a/mm/process_vm_access.c b/mm/process_vm_access.c
index c20ff48994c2..926b46649749 100644
--- a/mm/process_vm_access.c
+++ b/mm/process_vm_access.c
@@ -371,15 +371,15 @@ static ssize_t process_vm_rw(pid_t pid,
371 /* Check iovecs */ 371 /* Check iovecs */
372 if (vm_write) 372 if (vm_write)
373 rc = rw_copy_check_uvector(WRITE, lvec, liovcnt, UIO_FASTIOV, 373 rc = rw_copy_check_uvector(WRITE, lvec, liovcnt, UIO_FASTIOV,
374 iovstack_l, &iov_l, 1); 374 iovstack_l, &iov_l);
375 else 375 else
376 rc = rw_copy_check_uvector(READ, lvec, liovcnt, UIO_FASTIOV, 376 rc = rw_copy_check_uvector(READ, lvec, liovcnt, UIO_FASTIOV,
377 iovstack_l, &iov_l, 1); 377 iovstack_l, &iov_l);
378 if (rc <= 0) 378 if (rc <= 0)
379 goto free_iovecs; 379 goto free_iovecs;
380 380
381 rc = rw_copy_check_uvector(READ, rvec, riovcnt, UIO_FASTIOV, 381 rc = rw_copy_check_uvector(CHECK_IOVEC_ONLY, rvec, riovcnt, UIO_FASTIOV,
382 iovstack_r, &iov_r, 0); 382 iovstack_r, &iov_r);
383 if (rc <= 0) 383 if (rc <= 0)
384 goto free_iovecs; 384 goto free_iovecs;
385 385
@@ -438,16 +438,16 @@ compat_process_vm_rw(compat_pid_t pid,
438 if (vm_write) 438 if (vm_write)
439 rc = compat_rw_copy_check_uvector(WRITE, lvec, liovcnt, 439 rc = compat_rw_copy_check_uvector(WRITE, lvec, liovcnt,
440 UIO_FASTIOV, iovstack_l, 440 UIO_FASTIOV, iovstack_l,
441 &iov_l, 1); 441 &iov_l);
442 else 442 else
443 rc = compat_rw_copy_check_uvector(READ, lvec, liovcnt, 443 rc = compat_rw_copy_check_uvector(READ, lvec, liovcnt,
444 UIO_FASTIOV, iovstack_l, 444 UIO_FASTIOV, iovstack_l,
445 &iov_l, 1); 445 &iov_l);
446 if (rc <= 0) 446 if (rc <= 0)
447 goto free_iovecs; 447 goto free_iovecs;
448 rc = compat_rw_copy_check_uvector(READ, rvec, riovcnt, 448 rc = compat_rw_copy_check_uvector(CHECK_IOVEC_ONLY, rvec, riovcnt,
449 UIO_FASTIOV, iovstack_r, 449 UIO_FASTIOV, iovstack_r,
450 &iov_r, 0); 450 &iov_r);
451 if (rc <= 0) 451 if (rc <= 0)
452 goto free_iovecs; 452 goto free_iovecs;
453 453
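The kernel-side plumbing above backs the userspace process_vm_readv()/process_vm_writev() calls; a minimal usage sketch (the pid and remote address come from the command line, and the 4 KiB buffer size is an arbitrary choice):

#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <sys/types.h>
#include <sys/uio.h>

int main(int argc, char **argv)
{
	if (argc < 3) {
		fprintf(stderr, "usage: %s <pid> <remote-hex-addr>\n", argv[0]);
		return 1;
	}

	pid_t pid = (pid_t)atoi(argv[1]);
	void *remote_addr = (void *)strtoull(argv[2], NULL, 0);
	size_t len = 4096;
	char *buf = malloc(len);

	struct iovec local  = { .iov_base = buf,         .iov_len = len };
	struct iovec remote = { .iov_base = remote_addr, .iov_len = len };

	/* Copy len bytes out of the other process's address space. */
	ssize_t n = process_vm_readv(pid, &local, 1, &remote, 1, 0);
	if (n < 0)
		perror("process_vm_readv");
	else
		printf("copied %zd bytes\n", n);

	free(buf);
	return n < 0 ? 1 : 0;
}
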
diff --git a/mm/readahead.c b/mm/readahead.c
index cbcbb02f3e28..ea8f8fa21649 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -17,6 +17,8 @@
17#include <linux/task_io_accounting_ops.h> 17#include <linux/task_io_accounting_ops.h>
18#include <linux/pagevec.h> 18#include <linux/pagevec.h>
19#include <linux/pagemap.h> 19#include <linux/pagemap.h>
20#include <linux/syscalls.h>
21#include <linux/file.h>
20 22
21/* 23/*
22 * Initialise a struct file's readahead state. Assumes that the caller has 24 * Initialise a struct file's readahead state. Assumes that the caller has
@@ -562,3 +564,41 @@ page_cache_async_readahead(struct address_space *mapping,
562 ondemand_readahead(mapping, ra, filp, true, offset, req_size); 564 ondemand_readahead(mapping, ra, filp, true, offset, req_size);
563} 565}
564EXPORT_SYMBOL_GPL(page_cache_async_readahead); 566EXPORT_SYMBOL_GPL(page_cache_async_readahead);
567
568static ssize_t
569do_readahead(struct address_space *mapping, struct file *filp,
570 pgoff_t index, unsigned long nr)
571{
572 if (!mapping || !mapping->a_ops || !mapping->a_ops->readpage)
573 return -EINVAL;
574
575 force_page_cache_readahead(mapping, filp, index, nr);
576 return 0;
577}
578
579SYSCALL_DEFINE(readahead)(int fd, loff_t offset, size_t count)
580{
581 ssize_t ret;
582 struct file *file;
583
584 ret = -EBADF;
585 file = fget(fd);
586 if (file) {
587 if (file->f_mode & FMODE_READ) {
588 struct address_space *mapping = file->f_mapping;
589 pgoff_t start = offset >> PAGE_CACHE_SHIFT;
590 pgoff_t end = (offset + count - 1) >> PAGE_CACHE_SHIFT;
591 unsigned long len = end - start + 1;
592 ret = do_readahead(mapping, file, start, len);
593 }
594 fput(file);
595 }
596 return ret;
597}
598#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS
599asmlinkage long SyS_readahead(long fd, loff_t offset, long count)
600{
601 return SYSC_readahead((int) fd, offset, (size_t) count);
602}
603SYSCALL_ALIAS(sys_readahead, SyS_readahead);
604#endif
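The wrapper above implements the readahead(2) system call; a minimal userspace usage sketch (the file path comes from the command line, and the 1 MiB window is an arbitrary choice):

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	if (argc < 2) {
		fprintf(stderr, "usage: %s <file>\n", argv[0]);
		return 1;
	}

	int fd = open(argv[1], O_RDONLY);
	if (fd < 0) {
		perror("open");
		return 1;
	}

	/* Ask the kernel to populate the first 1 MiB of page cache. */
	if (readahead(fd, 0, 1 << 20) != 0)
		perror("readahead");

	close(fd);
	return 0;
}
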
diff --git a/mm/rmap.c b/mm/rmap.c
index 5b5ad584ffb7..0f3b7cda2a24 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -755,12 +755,6 @@ int page_referenced_one(struct page *page, struct vm_area_struct *vma,
755 pte_unmap_unlock(pte, ptl); 755 pte_unmap_unlock(pte, ptl);
756 } 756 }
757 757
758 /* Pretend the page is referenced if the task has the
759 swap token and is in the middle of a page fault. */
760 if (mm != current->mm && has_swap_token(mm) &&
761 rwsem_is_locked(&mm->mmap_sem))
762 referenced++;
763
764 (*mapcount)--; 758 (*mapcount)--;
765 759
766 if (referenced) 760 if (referenced)
diff --git a/mm/shmem.c b/mm/shmem.c
index f99ff3e50bd6..bd106361be4b 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -53,6 +53,7 @@ static struct vfsmount *shm_mnt;
53#include <linux/blkdev.h> 53#include <linux/blkdev.h>
54#include <linux/pagevec.h> 54#include <linux/pagevec.h>
55#include <linux/percpu_counter.h> 55#include <linux/percpu_counter.h>
56#include <linux/falloc.h>
56#include <linux/splice.h> 57#include <linux/splice.h>
57#include <linux/security.h> 58#include <linux/security.h>
58#include <linux/swapops.h> 59#include <linux/swapops.h>
@@ -83,12 +84,25 @@ struct shmem_xattr {
83 char value[0]; 84 char value[0];
84}; 85};
85 86
87/*
88 * shmem_fallocate and shmem_writepage communicate via inode->i_private
89 * (with i_mutex making sure that it has only one user at a time):
90 * we would prefer not to enlarge the shmem inode just for that.
91 */
92struct shmem_falloc {
93 pgoff_t start; /* start of range currently being fallocated */
94 pgoff_t next; /* the next page offset to be fallocated */
95 pgoff_t nr_falloced; /* how many new pages have been fallocated */
96 pgoff_t nr_unswapped; /* how often writepage refused to swap out */
97};
98
86/* Flag allocation requirements to shmem_getpage */ 99/* Flag allocation requirements to shmem_getpage */
87enum sgp_type { 100enum sgp_type {
88 SGP_READ, /* don't exceed i_size, don't allocate page */ 101 SGP_READ, /* don't exceed i_size, don't allocate page */
89 SGP_CACHE, /* don't exceed i_size, may allocate page */ 102 SGP_CACHE, /* don't exceed i_size, may allocate page */
90 SGP_DIRTY, /* like SGP_CACHE, but set new page dirty */ 103 SGP_DIRTY, /* like SGP_CACHE, but set new page dirty */
91 SGP_WRITE, /* may exceed i_size, may allocate page */ 104 SGP_WRITE, /* may exceed i_size, may allocate !Uptodate page */
105 SGP_FALLOC, /* like SGP_WRITE, but make existing page Uptodate */
92}; 106};
93 107
94#ifdef CONFIG_TMPFS 108#ifdef CONFIG_TMPFS
@@ -103,6 +117,9 @@ static unsigned long shmem_default_max_inodes(void)
103} 117}
104#endif 118#endif
105 119
120static bool shmem_should_replace_page(struct page *page, gfp_t gfp);
121static int shmem_replace_page(struct page **pagep, gfp_t gfp,
122 struct shmem_inode_info *info, pgoff_t index);
106static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, 123static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
107 struct page **pagep, enum sgp_type sgp, gfp_t gfp, int *fault_type); 124 struct page **pagep, enum sgp_type sgp, gfp_t gfp, int *fault_type);
108 125
@@ -247,46 +264,55 @@ static int shmem_radix_tree_replace(struct address_space *mapping,
247} 264}
248 265
249/* 266/*
267 * Sometimes, before we decide whether to proceed or to fail, we must check
268 * that an entry was not already brought back from swap by a racing thread.
269 *
270 * Checking page is not enough: by the time a SwapCache page is locked, it
271 * might be reused, and again be SwapCache, using the same swap as before.
272 */
273static bool shmem_confirm_swap(struct address_space *mapping,
274 pgoff_t index, swp_entry_t swap)
275{
276 void *item;
277
278 rcu_read_lock();
279 item = radix_tree_lookup(&mapping->page_tree, index);
280 rcu_read_unlock();
281 return item == swp_to_radix_entry(swap);
282}
283
284/*
250 * Like add_to_page_cache_locked, but error if expected item has gone. 285 * Like add_to_page_cache_locked, but error if expected item has gone.
251 */ 286 */
252static int shmem_add_to_page_cache(struct page *page, 287static int shmem_add_to_page_cache(struct page *page,
253 struct address_space *mapping, 288 struct address_space *mapping,
254 pgoff_t index, gfp_t gfp, void *expected) 289 pgoff_t index, gfp_t gfp, void *expected)
255{ 290{
256 int error = 0; 291 int error;
257 292
258 VM_BUG_ON(!PageLocked(page)); 293 VM_BUG_ON(!PageLocked(page));
259 VM_BUG_ON(!PageSwapBacked(page)); 294 VM_BUG_ON(!PageSwapBacked(page));
260 295
296 page_cache_get(page);
297 page->mapping = mapping;
298 page->index = index;
299
300 spin_lock_irq(&mapping->tree_lock);
261 if (!expected) 301 if (!expected)
262 error = radix_tree_preload(gfp & GFP_RECLAIM_MASK); 302 error = radix_tree_insert(&mapping->page_tree, index, page);
303 else
304 error = shmem_radix_tree_replace(mapping, index, expected,
305 page);
263 if (!error) { 306 if (!error) {
264 page_cache_get(page); 307 mapping->nrpages++;
265 page->mapping = mapping; 308 __inc_zone_page_state(page, NR_FILE_PAGES);
266 page->index = index; 309 __inc_zone_page_state(page, NR_SHMEM);
267 310 spin_unlock_irq(&mapping->tree_lock);
268 spin_lock_irq(&mapping->tree_lock); 311 } else {
269 if (!expected) 312 page->mapping = NULL;
270 error = radix_tree_insert(&mapping->page_tree, 313 spin_unlock_irq(&mapping->tree_lock);
271 index, page); 314 page_cache_release(page);
272 else
273 error = shmem_radix_tree_replace(mapping, index,
274 expected, page);
275 if (!error) {
276 mapping->nrpages++;
277 __inc_zone_page_state(page, NR_FILE_PAGES);
278 __inc_zone_page_state(page, NR_SHMEM);
279 spin_unlock_irq(&mapping->tree_lock);
280 } else {
281 page->mapping = NULL;
282 spin_unlock_irq(&mapping->tree_lock);
283 page_cache_release(page);
284 }
285 if (!expected)
286 radix_tree_preload_end();
287 } 315 }
288 if (error)
289 mem_cgroup_uncharge_cache_page(page);
290 return error; 316 return error;
291} 317}
292 318
@@ -423,27 +449,31 @@ void shmem_unlock_mapping(struct address_space *mapping)
423 449
424/* 450/*
425 * Remove range of pages and swap entries from radix tree, and free them. 451 * Remove range of pages and swap entries from radix tree, and free them.
452 * If !unfalloc, truncate or punch hole; if unfalloc, undo failed fallocate.
426 */ 453 */
427void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend) 454static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
455 bool unfalloc)
428{ 456{
429 struct address_space *mapping = inode->i_mapping; 457 struct address_space *mapping = inode->i_mapping;
430 struct shmem_inode_info *info = SHMEM_I(inode); 458 struct shmem_inode_info *info = SHMEM_I(inode);
431 pgoff_t start = (lstart + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 459 pgoff_t start = (lstart + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
432 unsigned partial = lstart & (PAGE_CACHE_SIZE - 1); 460 pgoff_t end = (lend + 1) >> PAGE_CACHE_SHIFT;
433 pgoff_t end = (lend >> PAGE_CACHE_SHIFT); 461 unsigned int partial_start = lstart & (PAGE_CACHE_SIZE - 1);
462 unsigned int partial_end = (lend + 1) & (PAGE_CACHE_SIZE - 1);
434 struct pagevec pvec; 463 struct pagevec pvec;
435 pgoff_t indices[PAGEVEC_SIZE]; 464 pgoff_t indices[PAGEVEC_SIZE];
436 long nr_swaps_freed = 0; 465 long nr_swaps_freed = 0;
437 pgoff_t index; 466 pgoff_t index;
438 int i; 467 int i;
439 468
440 BUG_ON((lend & (PAGE_CACHE_SIZE - 1)) != (PAGE_CACHE_SIZE - 1)); 469 if (lend == -1)
470 end = -1; /* unsigned, so actually very big */
441 471
442 pagevec_init(&pvec, 0); 472 pagevec_init(&pvec, 0);
443 index = start; 473 index = start;
444 while (index <= end) { 474 while (index < end) {
445 pvec.nr = shmem_find_get_pages_and_swap(mapping, index, 475 pvec.nr = shmem_find_get_pages_and_swap(mapping, index,
446 min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1, 476 min(end - index, (pgoff_t)PAGEVEC_SIZE),
447 pvec.pages, indices); 477 pvec.pages, indices);
448 if (!pvec.nr) 478 if (!pvec.nr)
449 break; 479 break;
@@ -452,10 +482,12 @@ void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
452 struct page *page = pvec.pages[i]; 482 struct page *page = pvec.pages[i];
453 483
454 index = indices[i]; 484 index = indices[i];
455 if (index > end) 485 if (index >= end)
456 break; 486 break;
457 487
458 if (radix_tree_exceptional_entry(page)) { 488 if (radix_tree_exceptional_entry(page)) {
489 if (unfalloc)
490 continue;
459 nr_swaps_freed += !shmem_free_swap(mapping, 491 nr_swaps_freed += !shmem_free_swap(mapping,
460 index, page); 492 index, page);
461 continue; 493 continue;
@@ -463,9 +495,11 @@ void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
463 495
464 if (!trylock_page(page)) 496 if (!trylock_page(page))
465 continue; 497 continue;
466 if (page->mapping == mapping) { 498 if (!unfalloc || !PageUptodate(page)) {
467 VM_BUG_ON(PageWriteback(page)); 499 if (page->mapping == mapping) {
468 truncate_inode_page(mapping, page); 500 VM_BUG_ON(PageWriteback(page));
501 truncate_inode_page(mapping, page);
502 }
469 } 503 }
470 unlock_page(page); 504 unlock_page(page);
471 } 505 }
@@ -476,30 +510,47 @@ void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
476 index++; 510 index++;
477 } 511 }
478 512
479 if (partial) { 513 if (partial_start) {
480 struct page *page = NULL; 514 struct page *page = NULL;
481 shmem_getpage(inode, start - 1, &page, SGP_READ, NULL); 515 shmem_getpage(inode, start - 1, &page, SGP_READ, NULL);
482 if (page) { 516 if (page) {
483 zero_user_segment(page, partial, PAGE_CACHE_SIZE); 517 unsigned int top = PAGE_CACHE_SIZE;
518 if (start > end) {
519 top = partial_end;
520 partial_end = 0;
521 }
522 zero_user_segment(page, partial_start, top);
484 set_page_dirty(page); 523 set_page_dirty(page);
485 unlock_page(page); 524 unlock_page(page);
486 page_cache_release(page); 525 page_cache_release(page);
487 } 526 }
488 } 527 }
528 if (partial_end) {
529 struct page *page = NULL;
530 shmem_getpage(inode, end, &page, SGP_READ, NULL);
531 if (page) {
532 zero_user_segment(page, 0, partial_end);
533 set_page_dirty(page);
534 unlock_page(page);
535 page_cache_release(page);
536 }
537 }
538 if (start >= end)
539 return;
489 540
490 index = start; 541 index = start;
491 for ( ; ; ) { 542 for ( ; ; ) {
492 cond_resched(); 543 cond_resched();
493 pvec.nr = shmem_find_get_pages_and_swap(mapping, index, 544 pvec.nr = shmem_find_get_pages_and_swap(mapping, index,
494 min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1, 545 min(end - index, (pgoff_t)PAGEVEC_SIZE),
495 pvec.pages, indices); 546 pvec.pages, indices);
496 if (!pvec.nr) { 547 if (!pvec.nr) {
497 if (index == start) 548 if (index == start || unfalloc)
498 break; 549 break;
499 index = start; 550 index = start;
500 continue; 551 continue;
501 } 552 }
502 if (index == start && indices[0] > end) { 553 if ((index == start || unfalloc) && indices[0] >= end) {
503 shmem_deswap_pagevec(&pvec); 554 shmem_deswap_pagevec(&pvec);
504 pagevec_release(&pvec); 555 pagevec_release(&pvec);
505 break; 556 break;
@@ -509,19 +560,23 @@ void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
509 struct page *page = pvec.pages[i]; 560 struct page *page = pvec.pages[i];
510 561
511 index = indices[i]; 562 index = indices[i];
512 if (index > end) 563 if (index >= end)
513 break; 564 break;
514 565
515 if (radix_tree_exceptional_entry(page)) { 566 if (radix_tree_exceptional_entry(page)) {
567 if (unfalloc)
568 continue;
516 nr_swaps_freed += !shmem_free_swap(mapping, 569 nr_swaps_freed += !shmem_free_swap(mapping,
517 index, page); 570 index, page);
518 continue; 571 continue;
519 } 572 }
520 573
521 lock_page(page); 574 lock_page(page);
522 if (page->mapping == mapping) { 575 if (!unfalloc || !PageUptodate(page)) {
523 VM_BUG_ON(PageWriteback(page)); 576 if (page->mapping == mapping) {
524 truncate_inode_page(mapping, page); 577 VM_BUG_ON(PageWriteback(page));
578 truncate_inode_page(mapping, page);
579 }
525 } 580 }
526 unlock_page(page); 581 unlock_page(page);
527 } 582 }
@@ -535,7 +590,11 @@ void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
535 info->swapped -= nr_swaps_freed; 590 info->swapped -= nr_swaps_freed;
536 shmem_recalc_inode(inode); 591 shmem_recalc_inode(inode);
537 spin_unlock(&info->lock); 592 spin_unlock(&info->lock);
593}
538 594
595void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
596{
597 shmem_undo_range(inode, lstart, lend, false);
539 inode->i_ctime = inode->i_mtime = CURRENT_TIME; 598 inode->i_ctime = inode->i_mtime = CURRENT_TIME;
540} 599}
541EXPORT_SYMBOL_GPL(shmem_truncate_range); 600EXPORT_SYMBOL_GPL(shmem_truncate_range);
@@ -597,19 +656,20 @@ static void shmem_evict_inode(struct inode *inode)
597 } 656 }
598 BUG_ON(inode->i_blocks); 657 BUG_ON(inode->i_blocks);
599 shmem_free_inode(inode->i_sb); 658 shmem_free_inode(inode->i_sb);
600 end_writeback(inode); 659 clear_inode(inode);
601} 660}
602 661
603/* 662/*
604 * If swap found in inode, free it and move page from swapcache to filecache. 663 * If swap found in inode, free it and move page from swapcache to filecache.
605 */ 664 */
606static int shmem_unuse_inode(struct shmem_inode_info *info, 665static int shmem_unuse_inode(struct shmem_inode_info *info,
607 swp_entry_t swap, struct page *page) 666 swp_entry_t swap, struct page **pagep)
608{ 667{
609 struct address_space *mapping = info->vfs_inode.i_mapping; 668 struct address_space *mapping = info->vfs_inode.i_mapping;
610 void *radswap; 669 void *radswap;
611 pgoff_t index; 670 pgoff_t index;
612 int error; 671 gfp_t gfp;
672 int error = 0;
613 673
614 radswap = swp_to_radix_entry(swap); 674 radswap = swp_to_radix_entry(swap);
615 index = radix_tree_locate_item(&mapping->page_tree, radswap); 675 index = radix_tree_locate_item(&mapping->page_tree, radswap);
@@ -625,22 +685,48 @@ static int shmem_unuse_inode(struct shmem_inode_info *info,
625 if (shmem_swaplist.next != &info->swaplist) 685 if (shmem_swaplist.next != &info->swaplist)
626 list_move_tail(&shmem_swaplist, &info->swaplist); 686 list_move_tail(&shmem_swaplist, &info->swaplist);
627 687
688 gfp = mapping_gfp_mask(mapping);
689 if (shmem_should_replace_page(*pagep, gfp)) {
690 mutex_unlock(&shmem_swaplist_mutex);
691 error = shmem_replace_page(pagep, gfp, info, index);
692 mutex_lock(&shmem_swaplist_mutex);
693 /*
694 * We needed to drop mutex to make that restrictive page
695 * allocation, but the inode might have been freed while we
696 * dropped it: although a racing shmem_evict_inode() cannot
697 * complete without emptying the radix_tree, our page lock
698 * on this swapcache page is not enough to prevent that -
699 * free_swap_and_cache() of our swap entry will only
700 * trylock_page(), removing swap from radix_tree whatever.
701 *
702 * We must not proceed to shmem_add_to_page_cache() if the
703 * inode has been freed, but of course we cannot rely on
704 * inode or mapping or info to check that. However, we can
705 * safely check if our swap entry is still in use (and here
706 * it can't have got reused for another page): if it's still
707 * in use, then the inode cannot have been freed yet, and we
708 * can safely proceed (if it's no longer in use, that tells
709 * nothing about the inode, but we don't need to unuse swap).
710 */
711 if (!page_swapcount(*pagep))
712 error = -ENOENT;
713 }
714
628 /* 715 /*
629 * We rely on shmem_swaplist_mutex, not only to protect the swaplist, 716 * We rely on shmem_swaplist_mutex, not only to protect the swaplist,
630 * but also to hold up shmem_evict_inode(): so inode cannot be freed 717 * but also to hold up shmem_evict_inode(): so inode cannot be freed
631 * beneath us (pagelock doesn't help until the page is in pagecache). 718 * beneath us (pagelock doesn't help until the page is in pagecache).
632 */ 719 */
633 error = shmem_add_to_page_cache(page, mapping, index, 720 if (!error)
721 error = shmem_add_to_page_cache(*pagep, mapping, index,
634 GFP_NOWAIT, radswap); 722 GFP_NOWAIT, radswap);
635 /* which does mem_cgroup_uncharge_cache_page on error */
636
637 if (error != -ENOMEM) { 723 if (error != -ENOMEM) {
638 /* 724 /*
639 * Truncation and eviction use free_swap_and_cache(), which 725 * Truncation and eviction use free_swap_and_cache(), which
640 * only does trylock page: if we raced, best clean up here. 726 * only does trylock page: if we raced, best clean up here.
641 */ 727 */
642 delete_from_swap_cache(page); 728 delete_from_swap_cache(*pagep);
643 set_page_dirty(page); 729 set_page_dirty(*pagep);
644 if (!error) { 730 if (!error) {
645 spin_lock(&info->lock); 731 spin_lock(&info->lock);
646 info->swapped--; 732 info->swapped--;
@@ -660,7 +746,14 @@ int shmem_unuse(swp_entry_t swap, struct page *page)
660 struct list_head *this, *next; 746 struct list_head *this, *next;
661 struct shmem_inode_info *info; 747 struct shmem_inode_info *info;
662 int found = 0; 748 int found = 0;
663 int error; 749 int error = 0;
750
751 /*
752 * There's a faint possibility that swap page was replaced before
753 * caller locked it: caller will come back later with the right page.
754 */
755 if (unlikely(!PageSwapCache(page) || page_private(page) != swap.val))
756 goto out;
664 757
665 /* 758 /*
666 * Charge page using GFP_KERNEL while we can wait, before taking 759 * Charge page using GFP_KERNEL while we can wait, before taking
@@ -676,7 +769,7 @@ int shmem_unuse(swp_entry_t swap, struct page *page)
676 list_for_each_safe(this, next, &shmem_swaplist) { 769 list_for_each_safe(this, next, &shmem_swaplist) {
677 info = list_entry(this, struct shmem_inode_info, swaplist); 770 info = list_entry(this, struct shmem_inode_info, swaplist);
678 if (info->swapped) 771 if (info->swapped)
679 found = shmem_unuse_inode(info, swap, page); 772 found = shmem_unuse_inode(info, swap, &page);
680 else 773 else
681 list_del_init(&info->swaplist); 774 list_del_init(&info->swaplist);
682 cond_resched(); 775 cond_resched();
@@ -685,8 +778,6 @@ int shmem_unuse(swp_entry_t swap, struct page *page)
685 } 778 }
686 mutex_unlock(&shmem_swaplist_mutex); 779 mutex_unlock(&shmem_swaplist_mutex);
687 780
688 if (!found)
689 mem_cgroup_uncharge_cache_page(page);
690 if (found < 0) 781 if (found < 0)
691 error = found; 782 error = found;
692out: 783out:
@@ -727,6 +818,38 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
727 WARN_ON_ONCE(1); /* Still happens? Tell us about it! */ 818 WARN_ON_ONCE(1); /* Still happens? Tell us about it! */
728 goto redirty; 819 goto redirty;
729 } 820 }
821
822 /*
823 * This is somewhat ridiculous, but without plumbing a SWAP_MAP_FALLOC
824 * value into swapfile.c, the only way we can correctly account for a
825 * fallocated page arriving here is now to initialize it and write it.
826 *
827 * That's okay for a page already fallocated earlier, but if we have
828 * not yet completed the fallocation, then (a) we want to keep track
829 * of this page in case we have to undo it, and (b) it may not be a
830 * good idea to continue anyway, once we're pushing into swap. So
831 * reactivate the page, and let shmem_fallocate() quit when too many.
832 */
833 if (!PageUptodate(page)) {
834 if (inode->i_private) {
835 struct shmem_falloc *shmem_falloc;
836 spin_lock(&inode->i_lock);
837 shmem_falloc = inode->i_private;
838 if (shmem_falloc &&
839 index >= shmem_falloc->start &&
840 index < shmem_falloc->next)
841 shmem_falloc->nr_unswapped++;
842 else
843 shmem_falloc = NULL;
844 spin_unlock(&inode->i_lock);
845 if (shmem_falloc)
846 goto redirty;
847 }
848 clear_highpage(page);
849 flush_dcache_page(page);
850 SetPageUptodate(page);
851 }
852
730 swap = get_swap_page(); 853 swap = get_swap_page();
731 if (!swap.val) 854 if (!swap.val)
732 goto redirty; 855 goto redirty;
@@ -856,6 +979,89 @@ static inline struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo)
856#endif 979#endif
857 980
858/* 981/*
982 * When a page is moved from swapcache to shmem filecache (either by the
983 * usual swapin of shmem_getpage_gfp(), or by the less common swapoff of
984 * shmem_unuse_inode()), it may have been read in earlier from swap, in
985 * ignorance of the mapping it belongs to. If that mapping has special
986 * constraints (like the gma500 GEM driver, which requires RAM below 4GB),
987 * we may need to copy to a suitable page before moving to filecache.
988 *
989 * In a future release, this may well be extended to respect cpuset and
990 * NUMA mempolicy, and applied also to anonymous pages in do_swap_page();
991 * but for now it is a simple matter of zone.
992 */
993static bool shmem_should_replace_page(struct page *page, gfp_t gfp)
994{
995 return page_zonenum(page) > gfp_zone(gfp);
996}
997
998static int shmem_replace_page(struct page **pagep, gfp_t gfp,
999 struct shmem_inode_info *info, pgoff_t index)
1000{
1001 struct page *oldpage, *newpage;
1002 struct address_space *swap_mapping;
1003 pgoff_t swap_index;
1004 int error;
1005
1006 oldpage = *pagep;
1007 swap_index = page_private(oldpage);
1008 swap_mapping = page_mapping(oldpage);
1009
1010 /*
1011 * We have arrived here because our zones are constrained, so don't
1012 * limit chance of success by further cpuset and node constraints.
1013 */
1014 gfp &= ~GFP_CONSTRAINT_MASK;
1015 newpage = shmem_alloc_page(gfp, info, index);
1016 if (!newpage)
1017 return -ENOMEM;
1018
1019 page_cache_get(newpage);
1020 copy_highpage(newpage, oldpage);
1021 flush_dcache_page(newpage);
1022
1023 __set_page_locked(newpage);
1024 SetPageUptodate(newpage);
1025 SetPageSwapBacked(newpage);
1026 set_page_private(newpage, swap_index);
1027 SetPageSwapCache(newpage);
1028
1029 /*
1030 * Our caller will very soon move newpage out of swapcache, but it's
1031 * a nice clean interface for us to replace oldpage by newpage there.
1032 */
1033 spin_lock_irq(&swap_mapping->tree_lock);
1034 error = shmem_radix_tree_replace(swap_mapping, swap_index, oldpage,
1035 newpage);
1036 if (!error) {
1037 __inc_zone_page_state(newpage, NR_FILE_PAGES);
1038 __dec_zone_page_state(oldpage, NR_FILE_PAGES);
1039 }
1040 spin_unlock_irq(&swap_mapping->tree_lock);
1041
1042 if (unlikely(error)) {
1043 /*
1044 * Is this possible? I think not, now that our callers check
1045 * both PageSwapCache and page_private after getting page lock;
1046 * but be defensive. Reverse old to newpage for clear and free.
1047 */
1048 oldpage = newpage;
1049 } else {
1050 mem_cgroup_replace_page_cache(oldpage, newpage);
1051 lru_cache_add_anon(newpage);
1052 *pagep = newpage;
1053 }
1054
1055 ClearPageSwapCache(oldpage);
1056 set_page_private(oldpage, 0);
1057
1058 unlock_page(oldpage);
1059 page_cache_release(oldpage);
1060 page_cache_release(oldpage);
1061 return error;
1062}
1063
1064/*
859 * shmem_getpage_gfp - find page in cache, or get from swap, or allocate 1065 * shmem_getpage_gfp - find page in cache, or get from swap, or allocate
860 * 1066 *
861 * If we allocate a new one we do not mark it dirty. That's up to the 1067 * If we allocate a new one we do not mark it dirty. That's up to the
@@ -872,6 +1078,7 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
872 swp_entry_t swap; 1078 swp_entry_t swap;
873 int error; 1079 int error;
874 int once = 0; 1080 int once = 0;
1081 int alloced = 0;
875 1082
876 if (index > (MAX_LFS_FILESIZE >> PAGE_CACHE_SHIFT)) 1083 if (index > (MAX_LFS_FILESIZE >> PAGE_CACHE_SHIFT))
877 return -EFBIG; 1084 return -EFBIG;
@@ -883,19 +1090,21 @@ repeat:
883 page = NULL; 1090 page = NULL;
884 } 1091 }
885 1092
886 if (sgp != SGP_WRITE && 1093 if (sgp != SGP_WRITE && sgp != SGP_FALLOC &&
887 ((loff_t)index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) { 1094 ((loff_t)index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) {
888 error = -EINVAL; 1095 error = -EINVAL;
889 goto failed; 1096 goto failed;
890 } 1097 }
891 1098
1099 /* fallocated page? */
1100 if (page && !PageUptodate(page)) {
1101 if (sgp != SGP_READ)
1102 goto clear;
1103 unlock_page(page);
1104 page_cache_release(page);
1105 page = NULL;
1106 }
892 if (page || (sgp == SGP_READ && !swap.val)) { 1107 if (page || (sgp == SGP_READ && !swap.val)) {
893 /*
894 * Once we can get the page lock, it must be uptodate:
895 * if there were an error in reading back from swap,
896 * the page would not be inserted into the filecache.
897 */
898 BUG_ON(page && !PageUptodate(page));
899 *pagep = page; 1108 *pagep = page;
900 return 0; 1109 return 0;
901 } 1110 }
@@ -923,26 +1132,31 @@ repeat:
923 1132
924 /* We have to do this with page locked to prevent races */ 1133 /* We have to do this with page locked to prevent races */
925 lock_page(page); 1134 lock_page(page);
1135 if (!PageSwapCache(page) || page_private(page) != swap.val ||
1136 !shmem_confirm_swap(mapping, index, swap)) {
1137 error = -EEXIST; /* try again */
1138 goto unlock;
1139 }
926 if (!PageUptodate(page)) { 1140 if (!PageUptodate(page)) {
927 error = -EIO; 1141 error = -EIO;
928 goto failed; 1142 goto failed;
929 } 1143 }
930 wait_on_page_writeback(page); 1144 wait_on_page_writeback(page);
931 1145
932 /* Someone may have already done it for us */ 1146 if (shmem_should_replace_page(page, gfp)) {
933 if (page->mapping) { 1147 error = shmem_replace_page(&page, gfp, info, index);
934 if (page->mapping == mapping && 1148 if (error)
935 page->index == index) 1149 goto failed;
936 goto done;
937 error = -EEXIST;
938 goto failed;
939 } 1150 }
940 1151
941 error = mem_cgroup_cache_charge(page, current->mm, 1152 error = mem_cgroup_cache_charge(page, current->mm,
942 gfp & GFP_RECLAIM_MASK); 1153 gfp & GFP_RECLAIM_MASK);
943 if (!error) 1154 if (!error) {
944 error = shmem_add_to_page_cache(page, mapping, index, 1155 error = shmem_add_to_page_cache(page, mapping, index,
945 gfp, swp_to_radix_entry(swap)); 1156 gfp, swp_to_radix_entry(swap));
1157 /* We already confirmed swap, and make no allocation */
1158 VM_BUG_ON(error);
1159 }
946 if (error) 1160 if (error)
947 goto failed; 1161 goto failed;
948 1162
@@ -979,11 +1193,18 @@ repeat:
979 __set_page_locked(page); 1193 __set_page_locked(page);
980 error = mem_cgroup_cache_charge(page, current->mm, 1194 error = mem_cgroup_cache_charge(page, current->mm,
981 gfp & GFP_RECLAIM_MASK); 1195 gfp & GFP_RECLAIM_MASK);
982 if (!error)
983 error = shmem_add_to_page_cache(page, mapping, index,
984 gfp, NULL);
985 if (error) 1196 if (error)
986 goto decused; 1197 goto decused;
1198 error = radix_tree_preload(gfp & GFP_RECLAIM_MASK);
1199 if (!error) {
1200 error = shmem_add_to_page_cache(page, mapping, index,
1201 gfp, NULL);
1202 radix_tree_preload_end();
1203 }
1204 if (error) {
1205 mem_cgroup_uncharge_cache_page(page);
1206 goto decused;
1207 }
987 lru_cache_add_anon(page); 1208 lru_cache_add_anon(page);
988 1209
989 spin_lock(&info->lock); 1210 spin_lock(&info->lock);
@@ -991,19 +1212,36 @@ repeat:
991 inode->i_blocks += BLOCKS_PER_PAGE; 1212 inode->i_blocks += BLOCKS_PER_PAGE;
992 shmem_recalc_inode(inode); 1213 shmem_recalc_inode(inode);
993 spin_unlock(&info->lock); 1214 spin_unlock(&info->lock);
1215 alloced = true;
994 1216
995 clear_highpage(page); 1217 /*
996 flush_dcache_page(page); 1218 * Let SGP_FALLOC use the SGP_WRITE optimization on a new page.
997 SetPageUptodate(page); 1219 */
1220 if (sgp == SGP_FALLOC)
1221 sgp = SGP_WRITE;
1222clear:
1223 /*
1224 * Let SGP_WRITE caller clear ends if write does not fill page;
1225 * but SGP_FALLOC on a page fallocated earlier must initialize
1226 * it now, lest undo on failure cancel our earlier guarantee.
1227 */
1228 if (sgp != SGP_WRITE) {
1229 clear_highpage(page);
1230 flush_dcache_page(page);
1231 SetPageUptodate(page);
1232 }
998 if (sgp == SGP_DIRTY) 1233 if (sgp == SGP_DIRTY)
999 set_page_dirty(page); 1234 set_page_dirty(page);
1000 } 1235 }
1001done: 1236
1002 /* Perhaps the file has been truncated since we checked */ 1237 /* Perhaps the file has been truncated since we checked */
1003 if (sgp != SGP_WRITE && 1238 if (sgp != SGP_WRITE && sgp != SGP_FALLOC &&
1004 ((loff_t)index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) { 1239 ((loff_t)index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) {
1005 error = -EINVAL; 1240 error = -EINVAL;
1006 goto trunc; 1241 if (alloced)
1242 goto trunc;
1243 else
1244 goto failed;
1007 } 1245 }
1008 *pagep = page; 1246 *pagep = page;
1009 return 0; 1247 return 0;
@@ -1012,6 +1250,7 @@ done:
1012 * Error recovery. 1250 * Error recovery.
1013 */ 1251 */
1014trunc: 1252trunc:
1253 info = SHMEM_I(inode);
1015 ClearPageDirty(page); 1254 ClearPageDirty(page);
1016 delete_from_page_cache(page); 1255 delete_from_page_cache(page);
1017 spin_lock(&info->lock); 1256 spin_lock(&info->lock);
@@ -1019,19 +1258,16 @@ trunc:
1019 inode->i_blocks -= BLOCKS_PER_PAGE; 1258 inode->i_blocks -= BLOCKS_PER_PAGE;
1020 spin_unlock(&info->lock); 1259 spin_unlock(&info->lock);
1021decused: 1260decused:
1261 sbinfo = SHMEM_SB(inode->i_sb);
1022 if (sbinfo->max_blocks) 1262 if (sbinfo->max_blocks)
1023 percpu_counter_add(&sbinfo->used_blocks, -1); 1263 percpu_counter_add(&sbinfo->used_blocks, -1);
1024unacct: 1264unacct:
1025 shmem_unacct_blocks(info->flags, 1); 1265 shmem_unacct_blocks(info->flags, 1);
1026failed: 1266failed:
1027 if (swap.val && error != -EINVAL) { 1267 if (swap.val && error != -EINVAL &&
1028 struct page *test = find_get_page(mapping, index); 1268 !shmem_confirm_swap(mapping, index, swap))
1029 if (test && !radix_tree_exceptional_entry(test)) 1269 error = -EEXIST;
1030 page_cache_release(test); 1270unlock:
1031 /* Have another try if the entry has changed */
1032 if (test != swp_to_radix_entry(swap))
1033 error = -EEXIST;
1034 }
1035 if (page) { 1271 if (page) {
1036 unlock_page(page); 1272 unlock_page(page);
1037 page_cache_release(page); 1273 page_cache_release(page);
@@ -1043,7 +1279,7 @@ failed:
1043 spin_unlock(&info->lock); 1279 spin_unlock(&info->lock);
1044 goto repeat; 1280 goto repeat;
1045 } 1281 }
1046 if (error == -EEXIST) 1282 if (error == -EEXIST) /* from above or from radix_tree_insert */
1047 goto repeat; 1283 goto repeat;
1048 return error; 1284 return error;
1049} 1285}
@@ -1204,6 +1440,14 @@ shmem_write_end(struct file *file, struct address_space *mapping,
1204 if (pos + copied > inode->i_size) 1440 if (pos + copied > inode->i_size)
1205 i_size_write(inode, pos + copied); 1441 i_size_write(inode, pos + copied);
1206 1442
1443 if (!PageUptodate(page)) {
1444 if (copied < PAGE_CACHE_SIZE) {
1445 unsigned from = pos & (PAGE_CACHE_SIZE - 1);
1446 zero_user_segments(page, 0, from,
1447 from + copied, PAGE_CACHE_SIZE);
1448 }
1449 SetPageUptodate(page);
1450 }
1207 set_page_dirty(page); 1451 set_page_dirty(page);
1208 unlock_page(page); 1452 unlock_page(page);
1209 page_cache_release(page); 1453 page_cache_release(page);
@@ -1365,6 +1609,7 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos,
1365 struct splice_pipe_desc spd = { 1609 struct splice_pipe_desc spd = {
1366 .pages = pages, 1610 .pages = pages,
1367 .partial = partial, 1611 .partial = partial,
1612 .nr_pages_max = PIPE_DEF_BUFFERS,
1368 .flags = flags, 1613 .flags = flags,
1369 .ops = &page_cache_pipe_buf_ops, 1614 .ops = &page_cache_pipe_buf_ops,
1370 .spd_release = spd_release_page, 1615 .spd_release = spd_release_page,
@@ -1453,7 +1698,7 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos,
1453 if (spd.nr_pages) 1698 if (spd.nr_pages)
1454 error = splice_to_pipe(pipe, &spd); 1699 error = splice_to_pipe(pipe, &spd);
1455 1700
1456 splice_shrink_spd(pipe, &spd); 1701 splice_shrink_spd(&spd);
1457 1702
1458 if (error > 0) { 1703 if (error > 0) {
1459 *ppos += error; 1704 *ppos += error;
@@ -1462,6 +1707,107 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos,
1462 return error; 1707 return error;
1463} 1708}
1464 1709
1710static long shmem_fallocate(struct file *file, int mode, loff_t offset,
1711 loff_t len)
1712{
1713 struct inode *inode = file->f_path.dentry->d_inode;
1714 struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
1715 struct shmem_falloc shmem_falloc;
1716 pgoff_t start, index, end;
1717 int error;
1718
1719 mutex_lock(&inode->i_mutex);
1720
1721 if (mode & FALLOC_FL_PUNCH_HOLE) {
1722 struct address_space *mapping = file->f_mapping;
1723 loff_t unmap_start = round_up(offset, PAGE_SIZE);
1724 loff_t unmap_end = round_down(offset + len, PAGE_SIZE) - 1;
1725
1726 if ((u64)unmap_end > (u64)unmap_start)
1727 unmap_mapping_range(mapping, unmap_start,
1728 1 + unmap_end - unmap_start, 0);
1729 shmem_truncate_range(inode, offset, offset + len - 1);
1730 /* No need to unmap again: hole-punching leaves COWed pages */
1731 error = 0;
1732 goto out;
1733 }
1734
1735 /* We need to check rlimit even when FALLOC_FL_KEEP_SIZE */
1736 error = inode_newsize_ok(inode, offset + len);
1737 if (error)
1738 goto out;
1739
1740 start = offset >> PAGE_CACHE_SHIFT;
1741 end = (offset + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
1742 /* Try to avoid a swapstorm if len is impossible to satisfy */
1743 if (sbinfo->max_blocks && end - start > sbinfo->max_blocks) {
1744 error = -ENOSPC;
1745 goto out;
1746 }
1747
1748 shmem_falloc.start = start;
1749 shmem_falloc.next = start;
1750 shmem_falloc.nr_falloced = 0;
1751 shmem_falloc.nr_unswapped = 0;
1752 spin_lock(&inode->i_lock);
1753 inode->i_private = &shmem_falloc;
1754 spin_unlock(&inode->i_lock);
1755
1756 for (index = start; index < end; index++) {
1757 struct page *page;
1758
1759 /*
1760 * Good, the fallocate(2) manpage permits EINTR: we may have
1761 * been interrupted because we are using up too much memory.
1762 */
1763 if (signal_pending(current))
1764 error = -EINTR;
1765 else if (shmem_falloc.nr_unswapped > shmem_falloc.nr_falloced)
1766 error = -ENOMEM;
1767 else
1768 error = shmem_getpage(inode, index, &page, SGP_FALLOC,
1769 NULL);
1770 if (error) {
1771 /* Remove the !PageUptodate pages we added */
1772 shmem_undo_range(inode,
1773 (loff_t)start << PAGE_CACHE_SHIFT,
1774 (loff_t)index << PAGE_CACHE_SHIFT, true);
1775 goto undone;
1776 }
1777
1778 /*
1779 * Inform shmem_writepage() how far we have reached.
1780 * No need for lock or barrier: we have the page lock.
1781 */
1782 shmem_falloc.next++;
1783 if (!PageUptodate(page))
1784 shmem_falloc.nr_falloced++;
1785
1786 /*
1787 * If !PageUptodate, leave it that way so that freeable pages
1788 * can be recognized if we need to rollback on error later.
1789 * But set_page_dirty so that memory pressure will swap rather
1790 * than free the pages we are allocating (and SGP_CACHE pages
1791 * might still be clean: we now need to mark those dirty too).
1792 */
1793 set_page_dirty(page);
1794 unlock_page(page);
1795 page_cache_release(page);
1796 cond_resched();
1797 }
1798
1799 if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size)
1800 i_size_write(inode, offset + len);
1801 inode->i_ctime = CURRENT_TIME;
1802undone:
1803 spin_lock(&inode->i_lock);
1804 inode->i_private = NULL;
1805 spin_unlock(&inode->i_lock);
1806out:
1807 mutex_unlock(&inode->i_mutex);
1808 return error;
1809}
1810
1465static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf) 1811static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf)
1466{ 1812{
1467 struct shmem_sb_info *sbinfo = SHMEM_SB(dentry->d_sb); 1813 struct shmem_sb_info *sbinfo = SHMEM_SB(dentry->d_sb);
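
With the new shmem_fallocate() wired into shmem_file_operations below, tmpfs gains both preallocation and FALLOC_FL_PUNCH_HOLE. A minimal userspace sketch exercising the two modes through fallocate(2); the /dev/shm path is only an assumption (any file on a tmpfs mount works), and error handling is kept to the bare minimum:

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>

int main(void)
{
        int fd = open("/dev/shm/fallocate-demo", O_RDWR | O_CREAT, 0600);

        /* Preallocate 1 MiB: this walks the SGP_FALLOC loop above. */
        if (fd < 0 || fallocate(fd, 0, 0, 1 << 20) < 0) {
                perror("fallocate");
                return 1;
        }

        /* Punch a 64 KiB hole at offset 64 KiB.  PUNCH_HOLE must be paired
         * with KEEP_SIZE; it frees the backing pages without changing
         * i_size, matching the FALLOC_FL_PUNCH_HOLE branch above. */
        if (fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
                      64 << 10, 64 << 10) < 0)
                perror("fallocate punch");
        return 0;
}
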
@@ -1665,6 +2011,7 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s
1665 kaddr = kmap_atomic(page); 2011 kaddr = kmap_atomic(page);
1666 memcpy(kaddr, symname, len); 2012 memcpy(kaddr, symname, len);
1667 kunmap_atomic(kaddr); 2013 kunmap_atomic(kaddr);
2014 SetPageUptodate(page);
1668 set_page_dirty(page); 2015 set_page_dirty(page);
1669 unlock_page(page); 2016 unlock_page(page);
1670 page_cache_release(page); 2017 page_cache_release(page);
@@ -2033,11 +2380,9 @@ static struct dentry *shmem_fh_to_dentry(struct super_block *sb,
2033 return dentry; 2380 return dentry;
2034} 2381}
2035 2382
2036static int shmem_encode_fh(struct dentry *dentry, __u32 *fh, int *len, 2383static int shmem_encode_fh(struct inode *inode, __u32 *fh, int *len,
2037 int connectable) 2384 struct inode *parent)
2038{ 2385{
2039 struct inode *inode = dentry->d_inode;
2040
2041 if (*len < 3) { 2386 if (*len < 3) {
2042 *len = 3; 2387 *len = 3;
2043 return 255; 2388 return 255;
@@ -2075,6 +2420,8 @@ static int shmem_parse_options(char *options, struct shmem_sb_info *sbinfo,
2075 bool remount) 2420 bool remount)
2076{ 2421{
2077 char *this_char, *value, *rest; 2422 char *this_char, *value, *rest;
2423 uid_t uid;
2424 gid_t gid;
2078 2425
2079 while (options != NULL) { 2426 while (options != NULL) {
2080 this_char = options; 2427 this_char = options;
@@ -2134,15 +2481,21 @@ static int shmem_parse_options(char *options, struct shmem_sb_info *sbinfo,
2134 } else if (!strcmp(this_char,"uid")) { 2481 } else if (!strcmp(this_char,"uid")) {
2135 if (remount) 2482 if (remount)
2136 continue; 2483 continue;
2137 sbinfo->uid = simple_strtoul(value, &rest, 0); 2484 uid = simple_strtoul(value, &rest, 0);
2138 if (*rest) 2485 if (*rest)
2139 goto bad_val; 2486 goto bad_val;
2487 sbinfo->uid = make_kuid(current_user_ns(), uid);
2488 if (!uid_valid(sbinfo->uid))
2489 goto bad_val;
2140 } else if (!strcmp(this_char,"gid")) { 2490 } else if (!strcmp(this_char,"gid")) {
2141 if (remount) 2491 if (remount)
2142 continue; 2492 continue;
2143 sbinfo->gid = simple_strtoul(value, &rest, 0); 2493 gid = simple_strtoul(value, &rest, 0);
2144 if (*rest) 2494 if (*rest)
2145 goto bad_val; 2495 goto bad_val;
2496 sbinfo->gid = make_kgid(current_user_ns(), gid);
2497 if (!gid_valid(sbinfo->gid))
2498 goto bad_val;
2146 } else if (!strcmp(this_char,"mpol")) { 2499 } else if (!strcmp(this_char,"mpol")) {
2147 if (mpol_parse_str(value, &sbinfo->mpol, 1)) 2500 if (mpol_parse_str(value, &sbinfo->mpol, 1))
2148 goto bad_val; 2501 goto bad_val;
@@ -2210,10 +2563,12 @@ static int shmem_show_options(struct seq_file *seq, struct dentry *root)
2210 seq_printf(seq, ",nr_inodes=%lu", sbinfo->max_inodes); 2563 seq_printf(seq, ",nr_inodes=%lu", sbinfo->max_inodes);
2211 if (sbinfo->mode != (S_IRWXUGO | S_ISVTX)) 2564 if (sbinfo->mode != (S_IRWXUGO | S_ISVTX))
2212 seq_printf(seq, ",mode=%03ho", sbinfo->mode); 2565 seq_printf(seq, ",mode=%03ho", sbinfo->mode);
2213 if (sbinfo->uid != 0) 2566 if (!uid_eq(sbinfo->uid, GLOBAL_ROOT_UID))
2214 seq_printf(seq, ",uid=%u", sbinfo->uid); 2567 seq_printf(seq, ",uid=%u",
2215 if (sbinfo->gid != 0) 2568 from_kuid_munged(&init_user_ns, sbinfo->uid));
2216 seq_printf(seq, ",gid=%u", sbinfo->gid); 2569 if (!gid_eq(sbinfo->gid, GLOBAL_ROOT_GID))
2570 seq_printf(seq, ",gid=%u",
2571 from_kgid_munged(&init_user_ns, sbinfo->gid));
2217 shmem_show_mpol(seq, sbinfo->mpol); 2572 shmem_show_mpol(seq, sbinfo->mpol);
2218 return 0; 2573 return 0;
2219} 2574}
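
The uid/gid mount options are now parsed into kuid_t/kgid_t in the mounter's user namespace and converted back only for display, so the stored values stay meaningful when user namespaces remap IDs. A minimal kernel-style sketch of that round trip; the helper name is hypothetical, while the APIs (make_kuid, uid_valid, from_kuid_munged) are the ones used above:

#include <linux/cred.h>
#include <linux/uidgid.h>
#include <linux/user_namespace.h>

/* Hypothetical helper: map a raw uid into the caller's namespace,
 * reject values with no mapping, and map back for /proc/mounts-style
 * display -- the same pattern shmem_parse_options() and
 * shmem_show_options() now follow. */
static int example_uid_round_trip(uid_t raw, uid_t *shown)
{
        kuid_t kuid = make_kuid(current_user_ns(), raw);

        if (!uid_valid(kuid))
                return -EINVAL;
        *shown = from_kuid_munged(&init_user_ns, kuid);
        return 0;
}
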
@@ -2260,6 +2615,7 @@ int shmem_fill_super(struct super_block *sb, void *data, int silent)
2260 } 2615 }
2261 } 2616 }
2262 sb->s_export_op = &shmem_export_ops; 2617 sb->s_export_op = &shmem_export_ops;
2618 sb->s_flags |= MS_NOSEC;
2263#else 2619#else
2264 sb->s_flags |= MS_NOUSER; 2620 sb->s_flags |= MS_NOUSER;
2265#endif 2621#endif
@@ -2362,12 +2718,12 @@ static const struct file_operations shmem_file_operations = {
2362 .fsync = noop_fsync, 2718 .fsync = noop_fsync,
2363 .splice_read = shmem_file_splice_read, 2719 .splice_read = shmem_file_splice_read,
2364 .splice_write = generic_file_splice_write, 2720 .splice_write = generic_file_splice_write,
2721 .fallocate = shmem_fallocate,
2365#endif 2722#endif
2366}; 2723};
2367 2724
2368static const struct inode_operations shmem_inode_operations = { 2725static const struct inode_operations shmem_inode_operations = {
2369 .setattr = shmem_setattr, 2726 .setattr = shmem_setattr,
2370 .truncate_range = shmem_truncate_range,
2371#ifdef CONFIG_TMPFS_XATTR 2727#ifdef CONFIG_TMPFS_XATTR
2372 .setxattr = shmem_setxattr, 2728 .setxattr = shmem_setxattr,
2373 .getxattr = shmem_getxattr, 2729 .getxattr = shmem_getxattr,
diff --git a/mm/slub.c b/mm/slub.c
index 80848cd3901c..8c691fa1cf3c 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1369,7 +1369,7 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
1369 1369
1370 inc_slabs_node(s, page_to_nid(page), page->objects); 1370 inc_slabs_node(s, page_to_nid(page), page->objects);
1371 page->slab = s; 1371 page->slab = s;
1372 page->flags |= 1 << PG_slab; 1372 __SetPageSlab(page);
1373 1373
1374 start = page_address(page); 1374 start = page_address(page);
1375 1375
@@ -1514,15 +1514,19 @@ static inline void *acquire_slab(struct kmem_cache *s,
1514 freelist = page->freelist; 1514 freelist = page->freelist;
1515 counters = page->counters; 1515 counters = page->counters;
1516 new.counters = counters; 1516 new.counters = counters;
1517 if (mode) 1517 if (mode) {
1518 new.inuse = page->objects; 1518 new.inuse = page->objects;
1519 new.freelist = NULL;
1520 } else {
1521 new.freelist = freelist;
1522 }
1519 1523
1520 VM_BUG_ON(new.frozen); 1524 VM_BUG_ON(new.frozen);
1521 new.frozen = 1; 1525 new.frozen = 1;
1522 1526
1523 } while (!__cmpxchg_double_slab(s, page, 1527 } while (!__cmpxchg_double_slab(s, page,
1524 freelist, counters, 1528 freelist, counters,
1525 NULL, new.counters, 1529 new.freelist, new.counters,
1526 "lock and freeze")); 1530 "lock and freeze"));
1527 1531
1528 remove_partial(n, page); 1532 remove_partial(n, page);
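
acquire_slab() now publishes the new freelist through the same cmpxchg that freezes the slab, instead of poking page->freelist afterwards (which is why the later assignment in get_partial_node() goes away below). The shape is the usual read, build-new-value, compare-and-swap, retry loop; a small userspace C11 sketch of that shape, explicitly not the kernel's 128-bit __cmpxchg_double_slab():

#include <stdatomic.h>

struct slot {
        void *freelist;
        unsigned long counters;         /* stands in for inuse/frozen bookkeeping */
};

/* Sketch of the retry pattern only.  C11 may implement the struct-wide
 * compare-exchange with a lock if no double-word cmpxchg exists. */
static void acquire(_Atomic struct slot *s)
{
        struct slot old = atomic_load(s);
        struct slot new;

        do {
                new = old;
                new.freelist = NULL;    /* take the whole list, as mode != 0 does */
        } while (!atomic_compare_exchange_weak(s, &old, new));
}
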
@@ -1564,7 +1568,6 @@ static void *get_partial_node(struct kmem_cache *s,
1564 object = t; 1568 object = t;
1565 available = page->objects - page->inuse; 1569 available = page->objects - page->inuse;
1566 } else { 1570 } else {
1567 page->freelist = t;
1568 available = put_cpu_partial(s, page, 0); 1571 available = put_cpu_partial(s, page, 0);
1569 stat(s, CPU_PARTIAL_NODE); 1572 stat(s, CPU_PARTIAL_NODE);
1570 } 1573 }
@@ -1579,7 +1582,7 @@ static void *get_partial_node(struct kmem_cache *s,
1579/* 1582/*
1580 * Get a page from somewhere. Search in increasing NUMA distances. 1583 * Get a page from somewhere. Search in increasing NUMA distances.
1581 */ 1584 */
1582static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags, 1585static void *get_any_partial(struct kmem_cache *s, gfp_t flags,
1583 struct kmem_cache_cpu *c) 1586 struct kmem_cache_cpu *c)
1584{ 1587{
1585#ifdef CONFIG_NUMA 1588#ifdef CONFIG_NUMA
@@ -2766,7 +2769,7 @@ static unsigned long calculate_alignment(unsigned long flags,
2766} 2769}
2767 2770
2768static void 2771static void
2769init_kmem_cache_node(struct kmem_cache_node *n, struct kmem_cache *s) 2772init_kmem_cache_node(struct kmem_cache_node *n)
2770{ 2773{
2771 n->nr_partial = 0; 2774 n->nr_partial = 0;
2772 spin_lock_init(&n->list_lock); 2775 spin_lock_init(&n->list_lock);
@@ -2836,7 +2839,7 @@ static void early_kmem_cache_node_alloc(int node)
2836 init_object(kmem_cache_node, n, SLUB_RED_ACTIVE); 2839 init_object(kmem_cache_node, n, SLUB_RED_ACTIVE);
2837 init_tracking(kmem_cache_node, n); 2840 init_tracking(kmem_cache_node, n);
2838#endif 2841#endif
2839 init_kmem_cache_node(n, kmem_cache_node); 2842 init_kmem_cache_node(n);
2840 inc_slabs_node(kmem_cache_node, node, page->objects); 2843 inc_slabs_node(kmem_cache_node, node, page->objects);
2841 2844
2842 add_partial(n, page, DEACTIVATE_TO_HEAD); 2845 add_partial(n, page, DEACTIVATE_TO_HEAD);
@@ -2876,7 +2879,7 @@ static int init_kmem_cache_nodes(struct kmem_cache *s)
2876 } 2879 }
2877 2880
2878 s->node[node] = n; 2881 s->node[node] = n;
2879 init_kmem_cache_node(n, s); 2882 init_kmem_cache_node(n);
2880 } 2883 }
2881 return 1; 2884 return 1;
2882} 2885}
@@ -3625,7 +3628,7 @@ static int slab_mem_going_online_callback(void *arg)
3625 ret = -ENOMEM; 3628 ret = -ENOMEM;
3626 goto out; 3629 goto out;
3627 } 3630 }
3628 init_kmem_cache_node(n, s); 3631 init_kmem_cache_node(n);
3629 s->node[nid] = n; 3632 s->node[nid] = n;
3630 } 3633 }
3631out: 3634out:
@@ -3968,9 +3971,9 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size,
3968 } 3971 }
3969 return s; 3972 return s;
3970 } 3973 }
3971 kfree(n);
3972 kfree(s); 3974 kfree(s);
3973 } 3975 }
3976 kfree(n);
3974err: 3977err:
3975 up_write(&slub_lock); 3978 up_write(&slub_lock);
3976 3979
diff --git a/mm/sparse.c b/mm/sparse.c
index a8bc7d364deb..c7bb952400c8 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -273,10 +273,11 @@ static unsigned long *__kmalloc_section_usemap(void)
273#ifdef CONFIG_MEMORY_HOTREMOVE 273#ifdef CONFIG_MEMORY_HOTREMOVE
274static unsigned long * __init 274static unsigned long * __init
275sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat, 275sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat,
276 unsigned long count) 276 unsigned long size)
277{ 277{
278 unsigned long section_nr; 278 unsigned long goal, limit;
279 279 unsigned long *p;
280 int nid;
280 /* 281 /*
281 * A page may contain usemaps for other sections preventing the 282 * A page may contain usemaps for other sections preventing the
282 * page being freed and making a section unremovable while 283 * page being freed and making a section unremovable while
@@ -287,8 +288,17 @@ sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat,
287 * from the same section as the pgdat where possible to avoid 288 * from the same section as the pgdat where possible to avoid
288 * this problem. 289 * this problem.
289 */ 290 */
290 section_nr = pfn_to_section_nr(__pa(pgdat) >> PAGE_SHIFT); 291 goal = __pa(pgdat) & (PAGE_SECTION_MASK << PAGE_SHIFT);
291 return alloc_bootmem_section(usemap_size() * count, section_nr); 292 limit = goal + (1UL << PA_SECTION_SHIFT);
293 nid = early_pfn_to_nid(goal >> PAGE_SHIFT);
294again:
295 p = ___alloc_bootmem_node_nopanic(NODE_DATA(nid), size,
296 SMP_CACHE_BYTES, goal, limit);
297 if (!p && limit) {
298 limit = 0;
299 goto again;
300 }
301 return p;
292} 302}
293 303
294static void __init check_usemap_section_nr(int nid, unsigned long *usemap) 304static void __init check_usemap_section_nr(int nid, unsigned long *usemap)
@@ -332,9 +342,9 @@ static void __init check_usemap_section_nr(int nid, unsigned long *usemap)
332#else 342#else
333static unsigned long * __init 343static unsigned long * __init
334sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat, 344sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat,
335 unsigned long count) 345 unsigned long size)
336{ 346{
337 return NULL; 347 return alloc_bootmem_node_nopanic(pgdat, size);
338} 348}
339 349
340static void __init check_usemap_section_nr(int nid, unsigned long *usemap) 350static void __init check_usemap_section_nr(int nid, unsigned long *usemap)
@@ -352,13 +362,10 @@ static void __init sparse_early_usemaps_alloc_node(unsigned long**usemap_map,
352 int size = usemap_size(); 362 int size = usemap_size();
353 363
354 usemap = sparse_early_usemaps_alloc_pgdat_section(NODE_DATA(nodeid), 364 usemap = sparse_early_usemaps_alloc_pgdat_section(NODE_DATA(nodeid),
355 usemap_count); 365 size * usemap_count);
356 if (!usemap) { 366 if (!usemap) {
357 usemap = alloc_bootmem_node(NODE_DATA(nodeid), size * usemap_count); 367 printk(KERN_WARNING "%s: allocation failed\n", __func__);
358 if (!usemap) { 368 return;
359 printk(KERN_WARNING "%s: allocation failed\n", __func__);
360 return;
361 }
362 } 369 }
363 370
364 for (pnum = pnum_begin; pnum < pnum_end; pnum++) { 371 for (pnum = pnum_begin; pnum < pnum_end; pnum++) {
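
The usemap allocation now asks for one block per node, first constrained to the section that holds the pgdat (goal..limit) and then, if that fails, anywhere (limit = 0), replacing the removed alloc_bootmem_section()/alloc_bootmem_node() pair. A sketch of that prefer-then-relax retry, where alloc_in_range() is a hypothetical stand-in for ___alloc_bootmem_node_nopanic():

/* Hypothetical allocator taking (size, goal, limit); limit == 0 means
 * "no upper bound", mirroring the bootmem convention used above. */
extern void *alloc_in_range(unsigned long size, unsigned long goal,
                            unsigned long limit);

static void *alloc_near(unsigned long size, unsigned long goal,
                        unsigned long limit)
{
        void *p = alloc_in_range(size, goal, limit);    /* same section first */

        if (!p)
                p = alloc_in_range(size, goal, 0);      /* then anywhere */
        return p;
}
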
diff --git a/mm/swap.c b/mm/swap.c
index 5c13f1338972..4e7e2ec67078 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -47,13 +47,15 @@ static DEFINE_PER_CPU(struct pagevec, lru_deactivate_pvecs);
47static void __page_cache_release(struct page *page) 47static void __page_cache_release(struct page *page)
48{ 48{
49 if (PageLRU(page)) { 49 if (PageLRU(page)) {
50 unsigned long flags;
51 struct zone *zone = page_zone(page); 50 struct zone *zone = page_zone(page);
51 struct lruvec *lruvec;
52 unsigned long flags;
52 53
53 spin_lock_irqsave(&zone->lru_lock, flags); 54 spin_lock_irqsave(&zone->lru_lock, flags);
55 lruvec = mem_cgroup_page_lruvec(page, zone);
54 VM_BUG_ON(!PageLRU(page)); 56 VM_BUG_ON(!PageLRU(page));
55 __ClearPageLRU(page); 57 __ClearPageLRU(page);
56 del_page_from_lru_list(zone, page, page_off_lru(page)); 58 del_page_from_lru_list(page, lruvec, page_off_lru(page));
57 spin_unlock_irqrestore(&zone->lru_lock, flags); 59 spin_unlock_irqrestore(&zone->lru_lock, flags);
58 } 60 }
59} 61}
@@ -82,6 +84,25 @@ static void put_compound_page(struct page *page)
82 if (likely(page != page_head && 84 if (likely(page != page_head &&
83 get_page_unless_zero(page_head))) { 85 get_page_unless_zero(page_head))) {
84 unsigned long flags; 86 unsigned long flags;
87
88 /*
89 * THP can not break up slab pages so avoid taking
90 * compound_lock(). Slab performs non-atomic bit ops
91 * on page->flags for better performance. In particular
92 * slab_unlock() in slub used to be a hot path. It is
93 * still hot on arches that do not support
94 * this_cpu_cmpxchg_double().
95 */
96 if (PageSlab(page_head)) {
97 if (PageTail(page)) {
98 if (put_page_testzero(page_head))
99 VM_BUG_ON(1);
100
101 atomic_dec(&page->_mapcount);
102 goto skip_lock_tail;
103 } else
104 goto skip_lock;
105 }
85 /* 106 /*
86 * page_head wasn't a dangling pointer but it 107 * page_head wasn't a dangling pointer but it
87 * may not be a head page anymore by the time 108 * may not be a head page anymore by the time
@@ -92,10 +113,10 @@ static void put_compound_page(struct page *page)
92 if (unlikely(!PageTail(page))) { 113 if (unlikely(!PageTail(page))) {
93 /* __split_huge_page_refcount run before us */ 114 /* __split_huge_page_refcount run before us */
94 compound_unlock_irqrestore(page_head, flags); 115 compound_unlock_irqrestore(page_head, flags);
95 VM_BUG_ON(PageHead(page_head)); 116skip_lock:
96 if (put_page_testzero(page_head)) 117 if (put_page_testzero(page_head))
97 __put_single_page(page_head); 118 __put_single_page(page_head);
98 out_put_single: 119out_put_single:
99 if (put_page_testzero(page)) 120 if (put_page_testzero(page))
100 __put_single_page(page); 121 __put_single_page(page);
101 return; 122 return;
@@ -115,6 +136,8 @@ static void put_compound_page(struct page *page)
115 VM_BUG_ON(atomic_read(&page_head->_count) <= 0); 136 VM_BUG_ON(atomic_read(&page_head->_count) <= 0);
116 VM_BUG_ON(atomic_read(&page->_count) != 0); 137 VM_BUG_ON(atomic_read(&page->_count) != 0);
117 compound_unlock_irqrestore(page_head, flags); 138 compound_unlock_irqrestore(page_head, flags);
139
140skip_lock_tail:
118 if (put_page_testzero(page_head)) { 141 if (put_page_testzero(page_head)) {
119 if (PageHead(page_head)) 142 if (PageHead(page_head))
120 __put_compound_page(page_head); 143 __put_compound_page(page_head);
@@ -162,6 +185,18 @@ bool __get_page_tail(struct page *page)
162 struct page *page_head = compound_trans_head(page); 185 struct page *page_head = compound_trans_head(page);
163 186
164 if (likely(page != page_head && get_page_unless_zero(page_head))) { 187 if (likely(page != page_head && get_page_unless_zero(page_head))) {
188
189 /* Ref to put_compound_page() comment. */
190 if (PageSlab(page_head)) {
191 if (likely(PageTail(page))) {
192 __get_page_tail_foll(page, false);
193 return true;
194 } else {
195 put_page(page_head);
196 return false;
197 }
198 }
199
165 /* 200 /*
166 * page_head wasn't a dangling pointer but it 201 * page_head wasn't a dangling pointer but it
167 * may not be a head page anymore by the time 202 * may not be a head page anymore by the time
@@ -202,11 +237,12 @@ void put_pages_list(struct list_head *pages)
202EXPORT_SYMBOL(put_pages_list); 237EXPORT_SYMBOL(put_pages_list);
203 238
204static void pagevec_lru_move_fn(struct pagevec *pvec, 239static void pagevec_lru_move_fn(struct pagevec *pvec,
205 void (*move_fn)(struct page *page, void *arg), 240 void (*move_fn)(struct page *page, struct lruvec *lruvec, void *arg),
206 void *arg) 241 void *arg)
207{ 242{
208 int i; 243 int i;
209 struct zone *zone = NULL; 244 struct zone *zone = NULL;
245 struct lruvec *lruvec;
210 unsigned long flags = 0; 246 unsigned long flags = 0;
211 247
212 for (i = 0; i < pagevec_count(pvec); i++) { 248 for (i = 0; i < pagevec_count(pvec); i++) {
@@ -220,7 +256,8 @@ static void pagevec_lru_move_fn(struct pagevec *pvec,
220 spin_lock_irqsave(&zone->lru_lock, flags); 256 spin_lock_irqsave(&zone->lru_lock, flags);
221 } 257 }
222 258
223 (*move_fn)(page, arg); 259 lruvec = mem_cgroup_page_lruvec(page, zone);
260 (*move_fn)(page, lruvec, arg);
224 } 261 }
225 if (zone) 262 if (zone)
226 spin_unlock_irqrestore(&zone->lru_lock, flags); 263 spin_unlock_irqrestore(&zone->lru_lock, flags);
@@ -228,16 +265,13 @@ static void pagevec_lru_move_fn(struct pagevec *pvec,
228 pagevec_reinit(pvec); 265 pagevec_reinit(pvec);
229} 266}
230 267
231static void pagevec_move_tail_fn(struct page *page, void *arg) 268static void pagevec_move_tail_fn(struct page *page, struct lruvec *lruvec,
269 void *arg)
232{ 270{
233 int *pgmoved = arg; 271 int *pgmoved = arg;
234 272
235 if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) { 273 if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
236 enum lru_list lru = page_lru_base_type(page); 274 enum lru_list lru = page_lru_base_type(page);
237 struct lruvec *lruvec;
238
239 lruvec = mem_cgroup_lru_move_lists(page_zone(page),
240 page, lru, lru);
241 list_move_tail(&page->lru, &lruvec->lists[lru]); 275 list_move_tail(&page->lru, &lruvec->lists[lru]);
242 (*pgmoved)++; 276 (*pgmoved)++;
243 } 277 }
@@ -276,41 +310,30 @@ void rotate_reclaimable_page(struct page *page)
276 } 310 }
277} 311}
278 312
279static void update_page_reclaim_stat(struct zone *zone, struct page *page, 313static void update_page_reclaim_stat(struct lruvec *lruvec,
280 int file, int rotated) 314 int file, int rotated)
281{ 315{
282 struct zone_reclaim_stat *reclaim_stat = &zone->reclaim_stat; 316 struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
283 struct zone_reclaim_stat *memcg_reclaim_stat;
284
285 memcg_reclaim_stat = mem_cgroup_get_reclaim_stat_from_page(page);
286 317
287 reclaim_stat->recent_scanned[file]++; 318 reclaim_stat->recent_scanned[file]++;
288 if (rotated) 319 if (rotated)
289 reclaim_stat->recent_rotated[file]++; 320 reclaim_stat->recent_rotated[file]++;
290
291 if (!memcg_reclaim_stat)
292 return;
293
294 memcg_reclaim_stat->recent_scanned[file]++;
295 if (rotated)
296 memcg_reclaim_stat->recent_rotated[file]++;
297} 321}
298 322
299static void __activate_page(struct page *page, void *arg) 323static void __activate_page(struct page *page, struct lruvec *lruvec,
324 void *arg)
300{ 325{
301 struct zone *zone = page_zone(page);
302
303 if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) { 326 if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
304 int file = page_is_file_cache(page); 327 int file = page_is_file_cache(page);
305 int lru = page_lru_base_type(page); 328 int lru = page_lru_base_type(page);
306 del_page_from_lru_list(zone, page, lru);
307 329
330 del_page_from_lru_list(page, lruvec, lru);
308 SetPageActive(page); 331 SetPageActive(page);
309 lru += LRU_ACTIVE; 332 lru += LRU_ACTIVE;
310 add_page_to_lru_list(zone, page, lru); 333 add_page_to_lru_list(page, lruvec, lru);
311 __count_vm_event(PGACTIVATE);
312 334
313 update_page_reclaim_stat(zone, page, file, 1); 335 __count_vm_event(PGACTIVATE);
336 update_page_reclaim_stat(lruvec, file, 1);
314 } 337 }
315} 338}
316 339
@@ -347,7 +370,7 @@ void activate_page(struct page *page)
347 struct zone *zone = page_zone(page); 370 struct zone *zone = page_zone(page);
348 371
349 spin_lock_irq(&zone->lru_lock); 372 spin_lock_irq(&zone->lru_lock);
350 __activate_page(page, NULL); 373 __activate_page(page, mem_cgroup_page_lruvec(page, zone), NULL);
351 spin_unlock_irq(&zone->lru_lock); 374 spin_unlock_irq(&zone->lru_lock);
352} 375}
353#endif 376#endif
@@ -414,11 +437,13 @@ void lru_cache_add_lru(struct page *page, enum lru_list lru)
414void add_page_to_unevictable_list(struct page *page) 437void add_page_to_unevictable_list(struct page *page)
415{ 438{
416 struct zone *zone = page_zone(page); 439 struct zone *zone = page_zone(page);
440 struct lruvec *lruvec;
417 441
418 spin_lock_irq(&zone->lru_lock); 442 spin_lock_irq(&zone->lru_lock);
443 lruvec = mem_cgroup_page_lruvec(page, zone);
419 SetPageUnevictable(page); 444 SetPageUnevictable(page);
420 SetPageLRU(page); 445 SetPageLRU(page);
421 add_page_to_lru_list(zone, page, LRU_UNEVICTABLE); 446 add_page_to_lru_list(page, lruvec, LRU_UNEVICTABLE);
422 spin_unlock_irq(&zone->lru_lock); 447 spin_unlock_irq(&zone->lru_lock);
423} 448}
424 449
@@ -443,11 +468,11 @@ void add_page_to_unevictable_list(struct page *page)
443 * be written out by flusher threads, as this is much more effective 468
444 * than the single-page writeout from reclaim. 469 * than the single-page writeout from reclaim.
445 */ 470 */
446static void lru_deactivate_fn(struct page *page, void *arg) 471static void lru_deactivate_fn(struct page *page, struct lruvec *lruvec,
472 void *arg)
447{ 473{
448 int lru, file; 474 int lru, file;
449 bool active; 475 bool active;
450 struct zone *zone = page_zone(page);
451 476
452 if (!PageLRU(page)) 477 if (!PageLRU(page))
453 return; 478 return;
@@ -460,13 +485,13 @@ static void lru_deactivate_fn(struct page *page, void *arg)
460 return; 485 return;
461 486
462 active = PageActive(page); 487 active = PageActive(page);
463
464 file = page_is_file_cache(page); 488 file = page_is_file_cache(page);
465 lru = page_lru_base_type(page); 489 lru = page_lru_base_type(page);
466 del_page_from_lru_list(zone, page, lru + active); 490
491 del_page_from_lru_list(page, lruvec, lru + active);
467 ClearPageActive(page); 492 ClearPageActive(page);
468 ClearPageReferenced(page); 493 ClearPageReferenced(page);
469 add_page_to_lru_list(zone, page, lru); 494 add_page_to_lru_list(page, lruvec, lru);
470 495
471 if (PageWriteback(page) || PageDirty(page)) { 496 if (PageWriteback(page) || PageDirty(page)) {
472 /* 497 /*
@@ -476,19 +501,17 @@ static void lru_deactivate_fn(struct page *page, void *arg)
476 */ 501 */
477 SetPageReclaim(page); 502 SetPageReclaim(page);
478 } else { 503 } else {
479 struct lruvec *lruvec;
480 /* 504 /*
481 * The page's writeback finished while the page sat in a pagevec: 505
482 * move it to the tail of the inactive list. 506
483 */ 507 */
484 lruvec = mem_cgroup_lru_move_lists(zone, page, lru, lru);
485 list_move_tail(&page->lru, &lruvec->lists[lru]); 508 list_move_tail(&page->lru, &lruvec->lists[lru]);
486 __count_vm_event(PGROTATED); 509 __count_vm_event(PGROTATED);
487 } 510 }
488 511
489 if (active) 512 if (active)
490 __count_vm_event(PGDEACTIVATE); 513 __count_vm_event(PGDEACTIVATE);
491 update_page_reclaim_stat(zone, page, file, 0); 514 update_page_reclaim_stat(lruvec, file, 0);
492} 515}
493 516
494/* 517/*
@@ -588,6 +611,7 @@ void release_pages(struct page **pages, int nr, int cold)
588 int i; 611 int i;
589 LIST_HEAD(pages_to_free); 612 LIST_HEAD(pages_to_free);
590 struct zone *zone = NULL; 613 struct zone *zone = NULL;
614 struct lruvec *lruvec;
591 unsigned long uninitialized_var(flags); 615 unsigned long uninitialized_var(flags);
592 616
593 for (i = 0; i < nr; i++) { 617 for (i = 0; i < nr; i++) {
@@ -615,9 +639,11 @@ void release_pages(struct page **pages, int nr, int cold)
615 zone = pagezone; 639 zone = pagezone;
616 spin_lock_irqsave(&zone->lru_lock, flags); 640 spin_lock_irqsave(&zone->lru_lock, flags);
617 } 641 }
642
643 lruvec = mem_cgroup_page_lruvec(page, zone);
618 VM_BUG_ON(!PageLRU(page)); 644 VM_BUG_ON(!PageLRU(page));
619 __ClearPageLRU(page); 645 __ClearPageLRU(page);
620 del_page_from_lru_list(zone, page, page_off_lru(page)); 646 del_page_from_lru_list(page, lruvec, page_off_lru(page));
621 } 647 }
622 648
623 list_add(&page->lru, &pages_to_free); 649 list_add(&page->lru, &pages_to_free);
@@ -649,8 +675,8 @@ EXPORT_SYMBOL(__pagevec_release);
649 675
650#ifdef CONFIG_TRANSPARENT_HUGEPAGE 676#ifdef CONFIG_TRANSPARENT_HUGEPAGE
651/* used by __split_huge_page_refcount() */ 677/* used by __split_huge_page_refcount() */
652void lru_add_page_tail(struct zone* zone, 678void lru_add_page_tail(struct page *page, struct page *page_tail,
653 struct page *page, struct page *page_tail) 679 struct lruvec *lruvec)
654{ 680{
655 int uninitialized_var(active); 681 int uninitialized_var(active);
656 enum lru_list lru; 682 enum lru_list lru;
@@ -659,7 +685,8 @@ void lru_add_page_tail(struct zone* zone,
659 VM_BUG_ON(!PageHead(page)); 685 VM_BUG_ON(!PageHead(page));
660 VM_BUG_ON(PageCompound(page_tail)); 686 VM_BUG_ON(PageCompound(page_tail));
661 VM_BUG_ON(PageLRU(page_tail)); 687 VM_BUG_ON(PageLRU(page_tail));
662 VM_BUG_ON(NR_CPUS != 1 && !spin_is_locked(&zone->lru_lock)); 688 VM_BUG_ON(NR_CPUS != 1 &&
689 !spin_is_locked(&lruvec_zone(lruvec)->lru_lock));
663 690
664 SetPageLRU(page_tail); 691 SetPageLRU(page_tail);
665 692
@@ -688,20 +715,20 @@ void lru_add_page_tail(struct zone* zone,
688 * Use the standard add function to put page_tail on the list, 715 * Use the standard add function to put page_tail on the list,
689 * but then correct its position so they all end up in order. 716 * but then correct its position so they all end up in order.
690 */ 717 */
691 add_page_to_lru_list(zone, page_tail, lru); 718 add_page_to_lru_list(page_tail, lruvec, lru);
692 list_head = page_tail->lru.prev; 719 list_head = page_tail->lru.prev;
693 list_move_tail(&page_tail->lru, list_head); 720 list_move_tail(&page_tail->lru, list_head);
694 } 721 }
695 722
696 if (!PageUnevictable(page)) 723 if (!PageUnevictable(page))
697 update_page_reclaim_stat(zone, page_tail, file, active); 724 update_page_reclaim_stat(lruvec, file, active);
698} 725}
699#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ 726#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
700 727
701static void __pagevec_lru_add_fn(struct page *page, void *arg) 728static void __pagevec_lru_add_fn(struct page *page, struct lruvec *lruvec,
729 void *arg)
702{ 730{
703 enum lru_list lru = (enum lru_list)arg; 731 enum lru_list lru = (enum lru_list)arg;
704 struct zone *zone = page_zone(page);
705 int file = is_file_lru(lru); 732 int file = is_file_lru(lru);
706 int active = is_active_lru(lru); 733 int active = is_active_lru(lru);
707 734
@@ -712,8 +739,8 @@ static void __pagevec_lru_add_fn(struct page *page, void *arg)
712 SetPageLRU(page); 739 SetPageLRU(page);
713 if (active) 740 if (active)
714 SetPageActive(page); 741 SetPageActive(page);
715 add_page_to_lru_list(zone, page, lru); 742 add_page_to_lru_list(page, lruvec, lru);
716 update_page_reclaim_stat(zone, page, file, active); 743 update_page_reclaim_stat(lruvec, file, active);
717} 744}
718 745
719/* 746/*
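
The pagevec walkers above now look up each page's lruvec under zone->lru_lock and hand it to the per-page callback, so the callbacks stop deriving zone and memcg state themselves. A hypothetical callback written against the new prototype, using the usual helpers from linux/mm_inline.h:

#include <linux/mm.h>
#include <linux/mm_inline.h>

/* Hypothetical move_fn: the caller already holds zone->lru_lock and has
 * resolved the lruvec, so the callback only touches the LRU lists. */
static void example_move_fn(struct page *page, struct lruvec *lruvec,
                            void *arg)
{
        if (PageLRU(page) && !PageUnevictable(page))
                list_move(&page->lru, &lruvec->lists[page_lru(page)]);
}
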
diff --git a/mm/swapfile.c b/mm/swapfile.c
index fafc26d1b1dc..71373d03fcee 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -31,6 +31,8 @@
31#include <linux/memcontrol.h> 31#include <linux/memcontrol.h>
32#include <linux/poll.h> 32#include <linux/poll.h>
33#include <linux/oom.h> 33#include <linux/oom.h>
34#include <linux/frontswap.h>
35#include <linux/swapfile.h>
34 36
35#include <asm/pgtable.h> 37#include <asm/pgtable.h>
36#include <asm/tlbflush.h> 38#include <asm/tlbflush.h>
@@ -42,7 +44,7 @@ static bool swap_count_continued(struct swap_info_struct *, pgoff_t,
42static void free_swap_count_continuations(struct swap_info_struct *); 44static void free_swap_count_continuations(struct swap_info_struct *);
43static sector_t map_swap_entry(swp_entry_t, struct block_device**); 45static sector_t map_swap_entry(swp_entry_t, struct block_device**);
44 46
45static DEFINE_SPINLOCK(swap_lock); 47DEFINE_SPINLOCK(swap_lock);
46static unsigned int nr_swapfiles; 48static unsigned int nr_swapfiles;
47long nr_swap_pages; 49long nr_swap_pages;
48long total_swap_pages; 50long total_swap_pages;
@@ -53,9 +55,9 @@ static const char Unused_file[] = "Unused swap file entry ";
53static const char Bad_offset[] = "Bad swap offset entry "; 55static const char Bad_offset[] = "Bad swap offset entry ";
54static const char Unused_offset[] = "Unused swap offset entry "; 56static const char Unused_offset[] = "Unused swap offset entry ";
55 57
56static struct swap_list_t swap_list = {-1, -1}; 58struct swap_list_t swap_list = {-1, -1};
57 59
58static struct swap_info_struct *swap_info[MAX_SWAPFILES]; 60struct swap_info_struct *swap_info[MAX_SWAPFILES];
59 61
60static DEFINE_MUTEX(swapon_mutex); 62static DEFINE_MUTEX(swapon_mutex);
61 63
@@ -556,6 +558,7 @@ static unsigned char swap_entry_free(struct swap_info_struct *p,
556 swap_list.next = p->type; 558 swap_list.next = p->type;
557 nr_swap_pages++; 559 nr_swap_pages++;
558 p->inuse_pages--; 560 p->inuse_pages--;
561 frontswap_invalidate_page(p->type, offset);
559 if ((p->flags & SWP_BLKDEV) && 562 if ((p->flags & SWP_BLKDEV) &&
560 disk->fops->swap_slot_free_notify) 563 disk->fops->swap_slot_free_notify)
561 disk->fops->swap_slot_free_notify(p->bdev, offset); 564 disk->fops->swap_slot_free_notify(p->bdev, offset);
@@ -601,7 +604,7 @@ void swapcache_free(swp_entry_t entry, struct page *page)
601 * This does not give an exact answer when swap count is continued, 604 * This does not give an exact answer when swap count is continued,
602 * but does include the high COUNT_CONTINUED flag to allow for that. 605 * but does include the high COUNT_CONTINUED flag to allow for that.
603 */ 606 */
604static inline int page_swapcount(struct page *page) 607int page_swapcount(struct page *page)
605{ 608{
606 int count = 0; 609 int count = 0;
607 struct swap_info_struct *p; 610 struct swap_info_struct *p;
@@ -717,37 +720,6 @@ int free_swap_and_cache(swp_entry_t entry)
717 return p != NULL; 720 return p != NULL;
718} 721}
719 722
720#ifdef CONFIG_CGROUP_MEM_RES_CTLR
721/**
722 * mem_cgroup_count_swap_user - count the user of a swap entry
723 * @ent: the swap entry to be checked
724 * @pagep: the pointer for the swap cache page of the entry to be stored
725 *
726 * Returns the number of the user of the swap entry. The number is valid only
727 * for swaps of anonymous pages.
728 * If the entry is found on swap cache, the page is stored to pagep with
729 * refcount of it being incremented.
730 */
731int mem_cgroup_count_swap_user(swp_entry_t ent, struct page **pagep)
732{
733 struct page *page;
734 struct swap_info_struct *p;
735 int count = 0;
736
737 page = find_get_page(&swapper_space, ent.val);
738 if (page)
739 count += page_mapcount(page);
740 p = swap_info_get(ent);
741 if (p) {
742 count += swap_count(p->swap_map[swp_offset(ent)]);
743 spin_unlock(&swap_lock);
744 }
745
746 *pagep = page;
747 return count;
748}
749#endif
750
751#ifdef CONFIG_HIBERNATION 723#ifdef CONFIG_HIBERNATION
752/* 724/*
753 * Find the swap type that corresponds to given device (if any). 725 * Find the swap type that corresponds to given device (if any).
@@ -1016,11 +988,12 @@ static int unuse_mm(struct mm_struct *mm,
1016} 988}
1017 989
1018/* 990/*
1019 * Scan swap_map from current position to next entry still in use. 991 * Scan swap_map (or frontswap_map if frontswap parameter is true)
992 * from current position to next entry still in use.
1020 * Recycle to start on reaching the end, returning 0 when empty. 993 * Recycle to start on reaching the end, returning 0 when empty.
1021 */ 994 */
1022static unsigned int find_next_to_unuse(struct swap_info_struct *si, 995static unsigned int find_next_to_unuse(struct swap_info_struct *si,
1023 unsigned int prev) 996 unsigned int prev, bool frontswap)
1024{ 997{
1025 unsigned int max = si->max; 998 unsigned int max = si->max;
1026 unsigned int i = prev; 999 unsigned int i = prev;
@@ -1046,6 +1019,12 @@ static unsigned int find_next_to_unuse(struct swap_info_struct *si,
1046 prev = 0; 1019 prev = 0;
1047 i = 1; 1020 i = 1;
1048 } 1021 }
1022 if (frontswap) {
1023 if (frontswap_test(si, i))
1024 break;
1025 else
1026 continue;
1027 }
1049 count = si->swap_map[i]; 1028 count = si->swap_map[i];
1050 if (count && swap_count(count) != SWAP_MAP_BAD) 1029 if (count && swap_count(count) != SWAP_MAP_BAD)
1051 break; 1030 break;
@@ -1057,8 +1036,12 @@ static unsigned int find_next_to_unuse(struct swap_info_struct *si,
1057 * We completely avoid races by reading each swap page in advance, 1036 * We completely avoid races by reading each swap page in advance,
1058 * and then search for the process using it. All the necessary 1037 * and then search for the process using it. All the necessary
1059 * page table adjustments can then be made atomically. 1038 * page table adjustments can then be made atomically.
1039 *
1040 * if the boolean frontswap is true, only unuse pages_to_unuse pages;
1041 * pages_to_unuse==0 means all pages; ignored if frontswap is false
1060 */ 1042 */
1061static int try_to_unuse(unsigned int type) 1043int try_to_unuse(unsigned int type, bool frontswap,
1044 unsigned long pages_to_unuse)
1062{ 1045{
1063 struct swap_info_struct *si = swap_info[type]; 1046 struct swap_info_struct *si = swap_info[type];
1064 struct mm_struct *start_mm; 1047 struct mm_struct *start_mm;
@@ -1091,7 +1074,7 @@ static int try_to_unuse(unsigned int type)
1091 * one pass through swap_map is enough, but not necessarily: 1074 * one pass through swap_map is enough, but not necessarily:
1092 * there are races when an instance of an entry might be missed. 1075 * there are races when an instance of an entry might be missed.
1093 */ 1076 */
1094 while ((i = find_next_to_unuse(si, i)) != 0) { 1077 while ((i = find_next_to_unuse(si, i, frontswap)) != 0) {
1095 if (signal_pending(current)) { 1078 if (signal_pending(current)) {
1096 retval = -EINTR; 1079 retval = -EINTR;
1097 break; 1080 break;
@@ -1258,6 +1241,10 @@ static int try_to_unuse(unsigned int type)
1258 * interactive performance. 1241 * interactive performance.
1259 */ 1242 */
1260 cond_resched(); 1243 cond_resched();
1244 if (frontswap && pages_to_unuse > 0) {
1245 if (!--pages_to_unuse)
1246 break;
1247 }
1261 } 1248 }
1262 1249
1263 mmput(start_mm); 1250 mmput(start_mm);
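
try_to_unuse() is no longer static: a frontswap flag plus a pages_to_unuse budget let frontswap pull a bounded number of its own pages back in without tearing down the whole swap area. A sketch of the two calling styles; which header carries the new extern declaration is glossed over here, and the 128-page budget is an arbitrary assumption:

#include <linux/types.h>

/* Declaration assumed to come from the headers added in this series. */
extern int try_to_unuse(unsigned int type, bool frontswap,
                        unsigned long pages_to_unuse);

static void unuse_examples(unsigned int type)
{
        try_to_unuse(type, false, 0);   /* swapoff: bring every page back */
        try_to_unuse(type, true, 128);  /* frontswap: at most 128 of its pages */
}
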
@@ -1517,7 +1504,8 @@ bad_bmap:
1517} 1504}
1518 1505
1519static void enable_swap_info(struct swap_info_struct *p, int prio, 1506static void enable_swap_info(struct swap_info_struct *p, int prio,
1520 unsigned char *swap_map) 1507 unsigned char *swap_map,
1508 unsigned long *frontswap_map)
1521{ 1509{
1522 int i, prev; 1510 int i, prev;
1523 1511
@@ -1527,6 +1515,7 @@ static void enable_swap_info(struct swap_info_struct *p, int prio,
1527 else 1515 else
1528 p->prio = --least_priority; 1516 p->prio = --least_priority;
1529 p->swap_map = swap_map; 1517 p->swap_map = swap_map;
1518 frontswap_map_set(p, frontswap_map);
1530 p->flags |= SWP_WRITEOK; 1519 p->flags |= SWP_WRITEOK;
1531 nr_swap_pages += p->pages; 1520 nr_swap_pages += p->pages;
1532 total_swap_pages += p->pages; 1521 total_swap_pages += p->pages;
@@ -1543,6 +1532,7 @@ static void enable_swap_info(struct swap_info_struct *p, int prio,
1543 swap_list.head = swap_list.next = p->type; 1532 swap_list.head = swap_list.next = p->type;
1544 else 1533 else
1545 swap_info[prev]->next = p->type; 1534 swap_info[prev]->next = p->type;
1535 frontswap_init(p->type);
1546 spin_unlock(&swap_lock); 1536 spin_unlock(&swap_lock);
1547} 1537}
1548 1538
@@ -1616,7 +1606,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
1616 spin_unlock(&swap_lock); 1606 spin_unlock(&swap_lock);
1617 1607
1618 oom_score_adj = test_set_oom_score_adj(OOM_SCORE_ADJ_MAX); 1608 oom_score_adj = test_set_oom_score_adj(OOM_SCORE_ADJ_MAX);
1619 err = try_to_unuse(type); 1609 err = try_to_unuse(type, false, 0); /* force all pages to be unused */
1620 compare_swap_oom_score_adj(OOM_SCORE_ADJ_MAX, oom_score_adj); 1610 compare_swap_oom_score_adj(OOM_SCORE_ADJ_MAX, oom_score_adj);
1621 1611
1622 if (err) { 1612 if (err) {
@@ -1627,7 +1617,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
1627 * sys_swapoff for this swap_info_struct at this point. 1617 * sys_swapoff for this swap_info_struct at this point.
1628 */ 1618 */
1629 /* re-insert swap space back into swap_list */ 1619 /* re-insert swap space back into swap_list */
1630 enable_swap_info(p, p->prio, p->swap_map); 1620 enable_swap_info(p, p->prio, p->swap_map, frontswap_map_get(p));
1631 goto out_dput; 1621 goto out_dput;
1632 } 1622 }
1633 1623
@@ -1653,9 +1643,11 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
1653 swap_map = p->swap_map; 1643 swap_map = p->swap_map;
1654 p->swap_map = NULL; 1644 p->swap_map = NULL;
1655 p->flags = 0; 1645 p->flags = 0;
1646 frontswap_invalidate_area(type);
1656 spin_unlock(&swap_lock); 1647 spin_unlock(&swap_lock);
1657 mutex_unlock(&swapon_mutex); 1648 mutex_unlock(&swapon_mutex);
1658 vfree(swap_map); 1649 vfree(swap_map);
1650 vfree(frontswap_map_get(p));
1659 /* Destroy swap account information */ 1651 /* Destroy swap account information */
1660 swap_cgroup_swapoff(type); 1652 swap_cgroup_swapoff(type);
1661 1653
@@ -1924,24 +1916,20 @@ static unsigned long read_swap_header(struct swap_info_struct *p,
1924 1916
1925 /* 1917 /*
1926 * Find out how many pages are allowed for a single swap 1918 * Find out how many pages are allowed for a single swap
1927 * device. There are three limiting factors: 1) the number 1919 * device. There are two limiting factors: 1) the number
1928 * of bits for the swap offset in the swp_entry_t type, and 1920 * of bits for the swap offset in the swp_entry_t type, and
1929 * 2) the number of bits in the swap pte as defined by the 1921 * 2) the number of bits in the swap pte as defined by the
1930 * the different architectures, and 3) the number of free bits 1922 * different architectures. In order to find the
1931 * in an exceptional radix_tree entry. In order to find the
1932 * largest possible bit mask, a swap entry with swap type 0 1923 * largest possible bit mask, a swap entry with swap type 0
1933 * and swap offset ~0UL is created, encoded to a swap pte, 1924 * and swap offset ~0UL is created, encoded to a swap pte,
1934 * decoded to a swp_entry_t again, and finally the swap 1925 * decoded to a swp_entry_t again, and finally the swap
1935 * offset is extracted. This will mask all the bits from 1926 * offset is extracted. This will mask all the bits from
1936 * the initial ~0UL mask that can't be encoded in either 1927 * the initial ~0UL mask that can't be encoded in either
1937 * the swp_entry_t or the architecture definition of a 1928 * the swp_entry_t or the architecture definition of a
1938 * swap pte. Then the same is done for a radix_tree entry. 1929 * swap pte.
1939 */ 1930 */
1940 maxpages = swp_offset(pte_to_swp_entry( 1931 maxpages = swp_offset(pte_to_swp_entry(
1941 swp_entry_to_pte(swp_entry(0, ~0UL)))); 1932 swp_entry_to_pte(swp_entry(0, ~0UL)))) + 1;
1942 maxpages = swp_offset(radix_to_swp_entry(
1943 swp_to_radix_entry(swp_entry(0, maxpages)))) + 1;
1944
1945 if (maxpages > swap_header->info.last_page) { 1933 if (maxpages > swap_header->info.last_page) {
1946 maxpages = swap_header->info.last_page + 1; 1934 maxpages = swap_header->info.last_page + 1;
1947 /* p->max is an unsigned int: don't overflow it */ 1935 /* p->max is an unsigned int: don't overflow it */
@@ -2019,6 +2007,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
2019 sector_t span; 2007 sector_t span;
2020 unsigned long maxpages; 2008 unsigned long maxpages;
2021 unsigned char *swap_map = NULL; 2009 unsigned char *swap_map = NULL;
2010 unsigned long *frontswap_map = NULL;
2022 struct page *page = NULL; 2011 struct page *page = NULL;
2023 struct inode *inode = NULL; 2012 struct inode *inode = NULL;
2024 2013
@@ -2102,6 +2091,9 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
2102 error = nr_extents; 2091 error = nr_extents;
2103 goto bad_swap; 2092 goto bad_swap;
2104 } 2093 }
2094 /* frontswap enabled? set up bit-per-page map for frontswap */
2095 if (frontswap_enabled)
2096 frontswap_map = vzalloc(maxpages / sizeof(long));
2105 2097
2106 if (p->bdev) { 2098 if (p->bdev) {
2107 if (blk_queue_nonrot(bdev_get_queue(p->bdev))) { 2099 if (blk_queue_nonrot(bdev_get_queue(p->bdev))) {
@@ -2117,14 +2109,15 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
2117 if (swap_flags & SWAP_FLAG_PREFER) 2109 if (swap_flags & SWAP_FLAG_PREFER)
2118 prio = 2110 prio =
2119 (swap_flags & SWAP_FLAG_PRIO_MASK) >> SWAP_FLAG_PRIO_SHIFT; 2111 (swap_flags & SWAP_FLAG_PRIO_MASK) >> SWAP_FLAG_PRIO_SHIFT;
2120 enable_swap_info(p, prio, swap_map); 2112 enable_swap_info(p, prio, swap_map, frontswap_map);
2121 2113
2122 printk(KERN_INFO "Adding %uk swap on %s. " 2114 printk(KERN_INFO "Adding %uk swap on %s. "
2123 "Priority:%d extents:%d across:%lluk %s%s\n", 2115 "Priority:%d extents:%d across:%lluk %s%s%s\n",
2124 p->pages<<(PAGE_SHIFT-10), name, p->prio, 2116 p->pages<<(PAGE_SHIFT-10), name, p->prio,
2125 nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10), 2117 nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10),
2126 (p->flags & SWP_SOLIDSTATE) ? "SS" : "", 2118 (p->flags & SWP_SOLIDSTATE) ? "SS" : "",
2127 (p->flags & SWP_DISCARDABLE) ? "D" : ""); 2119 (p->flags & SWP_DISCARDABLE) ? "D" : "",
2120 (frontswap_map) ? "FS" : "");
2128 2121
2129 mutex_unlock(&swapon_mutex); 2122 mutex_unlock(&swapon_mutex);
2130 atomic_inc(&proc_poll_event); 2123 atomic_inc(&proc_poll_event);
diff --git a/mm/thrash.c b/mm/thrash.c
deleted file mode 100644
index 57ad495dbd54..000000000000
--- a/mm/thrash.c
+++ /dev/null
@@ -1,155 +0,0 @@
1/*
2 * mm/thrash.c
3 *
4 * Copyright (C) 2004, Red Hat, Inc.
5 * Copyright (C) 2004, Rik van Riel <riel@redhat.com>
6 * Released under the GPL, see the file COPYING for details.
7 *
8 * Simple token based thrashing protection, using the algorithm
9 * described in: http://www.cse.ohio-state.edu/hpcs/WWW/HTML/publications/abs05-1.html
10 *
11 * Sep 2006, Ashwin Chaugule <ashwin.chaugule@celunite.com>
12 * Improved algorithm to pass token:
13 * Each task has a priority which is incremented if it contended
14 * for the token in an interval less than its previous attempt.
15 * If the token is acquired, that task's priority is boosted to prevent
16 * the token from bouncing around too often and to let the task make
17 * some progress in its execution.
18 */
19
20#include <linux/jiffies.h>
21#include <linux/mm.h>
22#include <linux/sched.h>
23#include <linux/swap.h>
24#include <linux/memcontrol.h>
25
26#include <trace/events/vmscan.h>
27
28#define TOKEN_AGING_INTERVAL (0xFF)
29
30static DEFINE_SPINLOCK(swap_token_lock);
31struct mm_struct *swap_token_mm;
32static struct mem_cgroup *swap_token_memcg;
33
34#ifdef CONFIG_CGROUP_MEM_RES_CTLR
35static struct mem_cgroup *swap_token_memcg_from_mm(struct mm_struct *mm)
36{
37 struct mem_cgroup *memcg;
38
39 memcg = try_get_mem_cgroup_from_mm(mm);
40 if (memcg)
41 css_put(mem_cgroup_css(memcg));
42
43 return memcg;
44}
45#else
46static struct mem_cgroup *swap_token_memcg_from_mm(struct mm_struct *mm)
47{
48 return NULL;
49}
50#endif
51
52void grab_swap_token(struct mm_struct *mm)
53{
54 int current_interval;
55 unsigned int old_prio = mm->token_priority;
56 static unsigned int global_faults;
57 static unsigned int last_aging;
58
59 global_faults++;
60
61 current_interval = global_faults - mm->faultstamp;
62
63 if (!spin_trylock(&swap_token_lock))
64 return;
65
66 /* First come first served */
67 if (!swap_token_mm)
68 goto replace_token;
69
70 /*
71 * Usually, we don't need priority aging because long interval faults
72 * makes priority decrease quickly. But there is one exception. If the
73 * token owner task is sleeping, it never make long interval faults.
74 * Thus, we need a priority aging mechanism instead. The requirements
75 * of priority aging are
76 * 1) An aging interval is reasonable enough long. Too short aging
77 * interval makes quick swap token lost and decrease performance.
78 * 2) The swap token owner task have to get priority aging even if
79 * it's under sleep.
80 */
81 if ((global_faults - last_aging) > TOKEN_AGING_INTERVAL) {
82 swap_token_mm->token_priority /= 2;
83 last_aging = global_faults;
84 }
85
86 if (mm == swap_token_mm) {
87 mm->token_priority += 2;
88 goto update_priority;
89 }
90
91 if (current_interval < mm->last_interval)
92 mm->token_priority++;
93 else {
94 if (likely(mm->token_priority > 0))
95 mm->token_priority--;
96 }
97
98 /* Check if we deserve the token */
99 if (mm->token_priority > swap_token_mm->token_priority)
100 goto replace_token;
101
102update_priority:
103 trace_update_swap_token_priority(mm, old_prio, swap_token_mm);
104
105out:
106 mm->faultstamp = global_faults;
107 mm->last_interval = current_interval;
108 spin_unlock(&swap_token_lock);
109 return;
110
111replace_token:
112 mm->token_priority += 2;
113 trace_replace_swap_token(swap_token_mm, mm);
114 swap_token_mm = mm;
115 swap_token_memcg = swap_token_memcg_from_mm(mm);
116 last_aging = global_faults;
117 goto out;
118}
119
120/* Called on process exit. */
121void __put_swap_token(struct mm_struct *mm)
122{
123 spin_lock(&swap_token_lock);
124 if (likely(mm == swap_token_mm)) {
125 trace_put_swap_token(swap_token_mm);
126 swap_token_mm = NULL;
127 swap_token_memcg = NULL;
128 }
129 spin_unlock(&swap_token_lock);
130}
131
132static bool match_memcg(struct mem_cgroup *a, struct mem_cgroup *b)
133{
134 if (!a)
135 return true;
136 if (!b)
137 return true;
138 if (a == b)
139 return true;
140 return false;
141}
142
143void disable_swap_token(struct mem_cgroup *memcg)
144{
145 /* memcg reclaim don't disable unrelated mm token. */
146 if (match_memcg(memcg, swap_token_memcg)) {
147 spin_lock(&swap_token_lock);
148 if (match_memcg(memcg, swap_token_memcg)) {
149 trace_disable_swap_token(swap_token_mm);
150 swap_token_mm = NULL;
151 swap_token_memcg = NULL;
152 }
153 spin_unlock(&swap_token_lock);
154 }
155}
diff --git a/mm/truncate.c b/mm/truncate.c
index 61a183b89df6..75801acdaac7 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -602,31 +602,6 @@ int vmtruncate(struct inode *inode, loff_t newsize)
602} 602}
603EXPORT_SYMBOL(vmtruncate); 603EXPORT_SYMBOL(vmtruncate);
604 604
605int vmtruncate_range(struct inode *inode, loff_t lstart, loff_t lend)
606{
607 struct address_space *mapping = inode->i_mapping;
608 loff_t holebegin = round_up(lstart, PAGE_SIZE);
609 loff_t holelen = 1 + lend - holebegin;
610
611 /*
612 * If the underlying filesystem is not going to provide
613 * a way to truncate a range of blocks (punch a hole) -
614 * we should return failure right now.
615 */
616 if (!inode->i_op->truncate_range)
617 return -ENOSYS;
618
619 mutex_lock(&inode->i_mutex);
620 inode_dio_wait(inode);
621 unmap_mapping_range(mapping, holebegin, holelen, 1);
622 inode->i_op->truncate_range(inode, lstart, lend);
623 /* unmap again to remove racily COWed private pages */
624 unmap_mapping_range(mapping, holebegin, holelen, 1);
625 mutex_unlock(&inode->i_mutex);
626
627 return 0;
628}
629
630/** 605/**
631 * truncate_pagecache_range - unmap and remove pagecache that is hole-punched 606 * truncate_pagecache_range - unmap and remove pagecache that is hole-punched
632 * @inode: inode 607 * @inode: inode
diff --git a/mm/util.c b/mm/util.c
index ae962b31de88..8c7265afa29f 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -4,6 +4,7 @@
4#include <linux/export.h> 4#include <linux/export.h>
5#include <linux/err.h> 5#include <linux/err.h>
6#include <linux/sched.h> 6#include <linux/sched.h>
7#include <linux/security.h>
7#include <asm/uaccess.h> 8#include <asm/uaccess.h>
8 9
9#include "internal.h" 10#include "internal.h"
@@ -341,6 +342,35 @@ int __attribute__((weak)) get_user_pages_fast(unsigned long start,
341} 342}
342EXPORT_SYMBOL_GPL(get_user_pages_fast); 343EXPORT_SYMBOL_GPL(get_user_pages_fast);
343 344
345unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr,
346 unsigned long len, unsigned long prot,
347 unsigned long flag, unsigned long pgoff)
348{
349 unsigned long ret;
350 struct mm_struct *mm = current->mm;
351
352 ret = security_mmap_file(file, prot, flag);
353 if (!ret) {
354 down_write(&mm->mmap_sem);
355 ret = do_mmap_pgoff(file, addr, len, prot, flag, pgoff);
356 up_write(&mm->mmap_sem);
357 }
358 return ret;
359}
360
361unsigned long vm_mmap(struct file *file, unsigned long addr,
362 unsigned long len, unsigned long prot,
363 unsigned long flag, unsigned long offset)
364{
365 if (unlikely(offset + PAGE_ALIGN(len) < offset))
366 return -EINVAL;
367 if (unlikely(offset & ~PAGE_MASK))
368 return -EINVAL;
369
370 return vm_mmap_pgoff(file, addr, len, prot, flag, offset >> PAGE_SHIFT);
371}
372EXPORT_SYMBOL(vm_mmap);
373
344/* Tracepoints definitions. */ 374/* Tracepoints definitions. */
345EXPORT_TRACEPOINT_SYMBOL(kmalloc); 375EXPORT_TRACEPOINT_SYMBOL(kmalloc);
346EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc); 376EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc);
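
The new vm_mmap()/vm_mmap_pgoff() helpers fold the security_mmap_file() check and the mmap_sem write lock into one call, so in-kernel users stop open-coding down_write(&mm->mmap_sem) around do_mmap_pgoff(). A hypothetical driver-side caller:

#include <linux/err.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/mman.h>

/* Hypothetical caller: map the first len bytes of filp into the current
 * task.  vm_mmap() returns either the mapped address or a negative errno
 * encoded in the unsigned long. */
static unsigned long map_file_for_current(struct file *filp, unsigned long len)
{
        unsigned long addr;

        addr = vm_mmap(filp, 0, len, PROT_READ | PROT_WRITE, MAP_SHARED, 0);
        if (IS_ERR_VALUE(addr))
                return 0;
        return addr;
}
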
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 94dff883b449..2aad49981b57 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -1185,9 +1185,10 @@ void __init vmalloc_init(void)
1185 /* Import existing vmlist entries. */ 1185 /* Import existing vmlist entries. */
1186 for (tmp = vmlist; tmp; tmp = tmp->next) { 1186 for (tmp = vmlist; tmp; tmp = tmp->next) {
1187 va = kzalloc(sizeof(struct vmap_area), GFP_NOWAIT); 1187 va = kzalloc(sizeof(struct vmap_area), GFP_NOWAIT);
1188 va->flags = tmp->flags | VM_VM_AREA; 1188 va->flags = VM_VM_AREA;
1189 va->va_start = (unsigned long)tmp->addr; 1189 va->va_start = (unsigned long)tmp->addr;
1190 va->va_end = va->va_start + tmp->size; 1190 va->va_end = va->va_start + tmp->size;
1191 va->vm = tmp;
1191 __insert_vmap_area(va); 1192 __insert_vmap_area(va);
1192 } 1193 }
1193 1194
@@ -2375,8 +2376,8 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
2375 return NULL; 2376 return NULL;
2376 } 2377 }
2377 2378
2378 vms = kzalloc(sizeof(vms[0]) * nr_vms, GFP_KERNEL); 2379 vms = kcalloc(nr_vms, sizeof(vms[0]), GFP_KERNEL);
2379 vas = kzalloc(sizeof(vas[0]) * nr_vms, GFP_KERNEL); 2380 vas = kcalloc(nr_vms, sizeof(vas[0]), GFP_KERNEL);
2380 if (!vas || !vms) 2381 if (!vas || !vms)
2381 goto err_free2; 2382 goto err_free2;
2382 2383
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 33dc256033b5..66e431060c05 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -53,24 +53,6 @@
53#define CREATE_TRACE_POINTS 53#define CREATE_TRACE_POINTS
54#include <trace/events/vmscan.h> 54#include <trace/events/vmscan.h>
55 55
56/*
57 * reclaim_mode determines how the inactive list is shrunk
58 * RECLAIM_MODE_SINGLE: Reclaim only order-0 pages
59 * RECLAIM_MODE_ASYNC: Do not block
60 * RECLAIM_MODE_SYNC: Allow blocking e.g. call wait_on_page_writeback
61 * RECLAIM_MODE_LUMPYRECLAIM: For high-order allocations, take a reference
62 * page from the LRU and reclaim all pages within a
63 * naturally aligned range
64 * RECLAIM_MODE_COMPACTION: For high-order allocations, reclaim a number of
65 * order-0 pages and then compact the zone
66 */
67typedef unsigned __bitwise__ reclaim_mode_t;
68#define RECLAIM_MODE_SINGLE ((__force reclaim_mode_t)0x01u)
69#define RECLAIM_MODE_ASYNC ((__force reclaim_mode_t)0x02u)
70#define RECLAIM_MODE_SYNC ((__force reclaim_mode_t)0x04u)
71#define RECLAIM_MODE_LUMPYRECLAIM ((__force reclaim_mode_t)0x08u)
72#define RECLAIM_MODE_COMPACTION ((__force reclaim_mode_t)0x10u)
73
74struct scan_control { 56struct scan_control {
75 /* Incremented by the number of inactive pages that were scanned */ 57 /* Incremented by the number of inactive pages that were scanned */
76 unsigned long nr_scanned; 58 unsigned long nr_scanned;
@@ -96,11 +78,8 @@ struct scan_control {
96 78
97 int order; 79 int order;
98 80
99 /* 81 /* Scan (total_size >> priority) pages at once */
100 * Intend to reclaim enough continuous memory rather than reclaim 82 int priority;
101 * enough amount of memory. i.e, mode for high order allocation.
102 */
103 reclaim_mode_t reclaim_mode;
104 83
105 /* 84 /*
106 * The memory cgroup that hit its limit and as a result is the 85 * The memory cgroup that hit its limit and as a result is the
@@ -115,11 +94,6 @@ struct scan_control {
115 nodemask_t *nodemask; 94 nodemask_t *nodemask;
116}; 95};
117 96
118struct mem_cgroup_zone {
119 struct mem_cgroup *mem_cgroup;
120 struct zone *zone;
121};
122
123#define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru)) 97#define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))
124 98
125#ifdef ARCH_HAS_PREFETCH 99#ifdef ARCH_HAS_PREFETCH
@@ -164,44 +138,21 @@ static bool global_reclaim(struct scan_control *sc)
164{ 138{
165 return !sc->target_mem_cgroup; 139 return !sc->target_mem_cgroup;
166} 140}
167
168static bool scanning_global_lru(struct mem_cgroup_zone *mz)
169{
170 return !mz->mem_cgroup;
171}
172#else 141#else
173static bool global_reclaim(struct scan_control *sc) 142static bool global_reclaim(struct scan_control *sc)
174{ 143{
175 return true; 144 return true;
176} 145}
177
178static bool scanning_global_lru(struct mem_cgroup_zone *mz)
179{
180 return true;
181}
182#endif 146#endif
183 147
184static struct zone_reclaim_stat *get_reclaim_stat(struct mem_cgroup_zone *mz) 148static unsigned long get_lru_size(struct lruvec *lruvec, enum lru_list lru)
185{ 149{
186 if (!scanning_global_lru(mz)) 150 if (!mem_cgroup_disabled())
187 return mem_cgroup_get_reclaim_stat(mz->mem_cgroup, mz->zone); 151 return mem_cgroup_get_lru_size(lruvec, lru);
188 152
189 return &mz->zone->reclaim_stat; 153 return zone_page_state(lruvec_zone(lruvec), NR_LRU_BASE + lru);
190} 154}
191 155
192static unsigned long zone_nr_lru_pages(struct mem_cgroup_zone *mz,
193 enum lru_list lru)
194{
195 if (!scanning_global_lru(mz))
196 return mem_cgroup_zone_nr_lru_pages(mz->mem_cgroup,
197 zone_to_nid(mz->zone),
198 zone_idx(mz->zone),
199 BIT(lru));
200
201 return zone_page_state(mz->zone, NR_LRU_BASE + lru);
202}
203
204
205/* 156/*
206 * Add a shrinker callback to be called from the vm 157 * Add a shrinker callback to be called from the vm
207 */ 158 */
@@ -364,39 +315,6 @@ out:
364 return ret; 315 return ret;
365} 316}
366 317
367static void set_reclaim_mode(int priority, struct scan_control *sc,
368 bool sync)
369{
370 reclaim_mode_t syncmode = sync ? RECLAIM_MODE_SYNC : RECLAIM_MODE_ASYNC;
371
372 /*
373 * Initially assume we are entering either lumpy reclaim or
374 * reclaim/compaction.Depending on the order, we will either set the
375 * sync mode or just reclaim order-0 pages later.
376 */
377 if (COMPACTION_BUILD)
378 sc->reclaim_mode = RECLAIM_MODE_COMPACTION;
379 else
380 sc->reclaim_mode = RECLAIM_MODE_LUMPYRECLAIM;
381
382 /*
383 * Avoid using lumpy reclaim or reclaim/compaction if possible by
385 * restricting when it's set to either costly allocations or when
385 * under memory pressure
386 */
387 if (sc->order > PAGE_ALLOC_COSTLY_ORDER)
388 sc->reclaim_mode |= syncmode;
389 else if (sc->order && priority < DEF_PRIORITY - 2)
390 sc->reclaim_mode |= syncmode;
391 else
392 sc->reclaim_mode = RECLAIM_MODE_SINGLE | RECLAIM_MODE_ASYNC;
393}
394
395static void reset_reclaim_mode(struct scan_control *sc)
396{
397 sc->reclaim_mode = RECLAIM_MODE_SINGLE | RECLAIM_MODE_ASYNC;
398}
399
400static inline int is_page_cache_freeable(struct page *page) 318static inline int is_page_cache_freeable(struct page *page)
401{ 319{
402 /* 320 /*
@@ -416,10 +334,6 @@ static int may_write_to_queue(struct backing_dev_info *bdi,
416 return 1; 334 return 1;
417 if (bdi == current->backing_dev_info) 335 if (bdi == current->backing_dev_info)
418 return 1; 336 return 1;
419
420 /* lumpy reclaim for hugepage often need a lot of write */
421 if (sc->order > PAGE_ALLOC_COSTLY_ORDER)
422 return 1;
423 return 0; 337 return 0;
424} 338}
425 339
@@ -523,8 +437,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,
523 /* synchronous write or broken a_ops? */ 437 /* synchronous write or broken a_ops? */
524 ClearPageReclaim(page); 438 ClearPageReclaim(page);
525 } 439 }
526 trace_mm_vmscan_writepage(page, 440 trace_mm_vmscan_writepage(page, trace_reclaim_flags(page));
527 trace_reclaim_flags(page, sc->reclaim_mode));
528 inc_zone_page_state(page, NR_VMSCAN_WRITE); 441 inc_zone_page_state(page, NR_VMSCAN_WRITE);
529 return PAGE_SUCCESS; 442 return PAGE_SUCCESS;
530 } 443 }
@@ -701,19 +614,15 @@ enum page_references {
701}; 614};
702 615
703static enum page_references page_check_references(struct page *page, 616static enum page_references page_check_references(struct page *page,
704 struct mem_cgroup_zone *mz,
705 struct scan_control *sc) 617 struct scan_control *sc)
706{ 618{
707 int referenced_ptes, referenced_page; 619 int referenced_ptes, referenced_page;
708 unsigned long vm_flags; 620 unsigned long vm_flags;
709 621
710 referenced_ptes = page_referenced(page, 1, mz->mem_cgroup, &vm_flags); 622 referenced_ptes = page_referenced(page, 1, sc->target_mem_cgroup,
623 &vm_flags);
711 referenced_page = TestClearPageReferenced(page); 624 referenced_page = TestClearPageReferenced(page);
712 625
713 /* Lumpy reclaim - ignore references */
714 if (sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM)
715 return PAGEREF_RECLAIM;
716
717 /* 626 /*
718 * Mlock lost the isolation race with us. Let try_to_unmap() 627 * Mlock lost the isolation race with us. Let try_to_unmap()
719 * move the page to the unevictable list. 628 * move the page to the unevictable list.
@@ -722,7 +631,7 @@ static enum page_references page_check_references(struct page *page,
722 return PAGEREF_RECLAIM; 631 return PAGEREF_RECLAIM;
723 632
724 if (referenced_ptes) { 633 if (referenced_ptes) {
725 if (PageAnon(page)) 634 if (PageSwapBacked(page))
726 return PAGEREF_ACTIVATE; 635 return PAGEREF_ACTIVATE;
727 /* 636 /*
728 * All mapped pages start out with page table 637 * All mapped pages start out with page table
@@ -763,9 +672,8 @@ static enum page_references page_check_references(struct page *page,
763 * shrink_page_list() returns the number of reclaimed pages 672 * shrink_page_list() returns the number of reclaimed pages
764 */ 673 */
765static unsigned long shrink_page_list(struct list_head *page_list, 674static unsigned long shrink_page_list(struct list_head *page_list,
766 struct mem_cgroup_zone *mz, 675 struct zone *zone,
767 struct scan_control *sc, 676 struct scan_control *sc,
768 int priority,
769 unsigned long *ret_nr_dirty, 677 unsigned long *ret_nr_dirty,
770 unsigned long *ret_nr_writeback) 678 unsigned long *ret_nr_writeback)
771{ 679{
@@ -794,7 +702,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
794 goto keep; 702 goto keep;
795 703
796 VM_BUG_ON(PageActive(page)); 704 VM_BUG_ON(PageActive(page));
797 VM_BUG_ON(page_zone(page) != mz->zone); 705 VM_BUG_ON(page_zone(page) != zone);
798 706
799 sc->nr_scanned++; 707 sc->nr_scanned++;
800 708
@@ -813,22 +721,11 @@ static unsigned long shrink_page_list(struct list_head *page_list,
813 721
814 if (PageWriteback(page)) { 722 if (PageWriteback(page)) {
815 nr_writeback++; 723 nr_writeback++;
816 /* 724 unlock_page(page);
817 * Synchronous reclaim cannot queue pages for 725 goto keep;
818 * writeback due to the possibility of stack overflow
819 * but if it encounters a page under writeback, wait
820 * for the IO to complete.
821 */
822 if ((sc->reclaim_mode & RECLAIM_MODE_SYNC) &&
823 may_enter_fs)
824 wait_on_page_writeback(page);
825 else {
826 unlock_page(page);
827 goto keep_lumpy;
828 }
829 } 726 }
830 727
831 references = page_check_references(page, mz, sc); 728 references = page_check_references(page, sc);
832 switch (references) { 729 switch (references) {
833 case PAGEREF_ACTIVATE: 730 case PAGEREF_ACTIVATE:
834 goto activate_locked; 731 goto activate_locked;
@@ -879,7 +776,8 @@ static unsigned long shrink_page_list(struct list_head *page_list,
879 * unless under significant pressure. 776 * unless under significant pressure.
880 */ 777 */
881 if (page_is_file_cache(page) && 778 if (page_is_file_cache(page) &&
882 (!current_is_kswapd() || priority >= DEF_PRIORITY - 2)) { 779 (!current_is_kswapd() ||
780 sc->priority >= DEF_PRIORITY - 2)) {
883 /* 781 /*
884 * Immediately reclaim when written back. 782 * Immediately reclaim when written back.
885 * Similar in principle to deactivate_page() 783
@@ -908,7 +806,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
908 goto activate_locked; 806 goto activate_locked;
909 case PAGE_SUCCESS: 807 case PAGE_SUCCESS:
910 if (PageWriteback(page)) 808 if (PageWriteback(page))
911 goto keep_lumpy; 809 goto keep;
912 if (PageDirty(page)) 810 if (PageDirty(page))
913 goto keep; 811 goto keep;
914 812
@@ -994,7 +892,6 @@ cull_mlocked:
994 try_to_free_swap(page); 892 try_to_free_swap(page);
995 unlock_page(page); 893 unlock_page(page);
996 putback_lru_page(page); 894 putback_lru_page(page);
997 reset_reclaim_mode(sc);
998 continue; 895 continue;
999 896
1000activate_locked: 897activate_locked:
@@ -1007,8 +904,6 @@ activate_locked:
1007keep_locked: 904keep_locked:
1008 unlock_page(page); 905 unlock_page(page);
1009keep: 906keep:
1010 reset_reclaim_mode(sc);
1011keep_lumpy:
1012 list_add(&page->lru, &ret_pages); 907 list_add(&page->lru, &ret_pages);
1013 VM_BUG_ON(PageLRU(page) || PageUnevictable(page)); 908 VM_BUG_ON(PageLRU(page) || PageUnevictable(page));
1014 } 909 }
@@ -1020,7 +915,7 @@ keep_lumpy:
1020 * will encounter the same problem 915 * will encounter the same problem
1021 */ 916 */
1022 if (nr_dirty && nr_dirty == nr_congested && global_reclaim(sc)) 917 if (nr_dirty && nr_dirty == nr_congested && global_reclaim(sc))
1023 zone_set_flag(mz->zone, ZONE_CONGESTED); 918 zone_set_flag(zone, ZONE_CONGESTED);
1024 919
1025 free_hot_cold_page_list(&free_pages, 1); 920 free_hot_cold_page_list(&free_pages, 1);
1026 921
@@ -1041,34 +936,15 @@ keep_lumpy:
1041 * 936 *
1042 * returns 0 on success, -ve errno on failure. 937 * returns 0 on success, -ve errno on failure.
1043 */ 938 */
1044int __isolate_lru_page(struct page *page, isolate_mode_t mode, int file) 939int __isolate_lru_page(struct page *page, isolate_mode_t mode)
1045{ 940{
1046 bool all_lru_mode;
1047 int ret = -EINVAL; 941 int ret = -EINVAL;
1048 942
1049 /* Only take pages on the LRU. */ 943 /* Only take pages on the LRU. */
1050 if (!PageLRU(page)) 944 if (!PageLRU(page))
1051 return ret; 945 return ret;
1052 946
1053 all_lru_mode = (mode & (ISOLATE_ACTIVE|ISOLATE_INACTIVE)) == 947 /* Do not give back unevictable pages for compaction */
1054 (ISOLATE_ACTIVE|ISOLATE_INACTIVE);
1055
1056 /*
1057 * When checking the active state, we need to be sure we are
1058 * dealing with comparable boolean values. Take the logical not
1059 * of each.
1060 */
1061 if (!all_lru_mode && !PageActive(page) != !(mode & ISOLATE_ACTIVE))
1062 return ret;
1063
1064 if (!all_lru_mode && !!page_is_file_cache(page) != file)
1065 return ret;
1066
1067 /*
1068 * When this function is being called for lumpy reclaim, we
1069 * initially look into all LRU pages, active, inactive and
1070 * unevictable; only give shrink_page_list evictable pages.
1071 */
1072 if (PageUnevictable(page)) 948 if (PageUnevictable(page))
1073 return ret; 949 return ret;
1074 950
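
Editor's note: with the lumpy-specific ISOLATE_ACTIVE/ISOLATE_INACTIVE handling removed, __isolate_lru_page() reduces to a small predicate: the page must be on an LRU, must not be unevictable, and must satisfy whatever mode restrictions the caller OR-ed in (later hunks show shrink_inactive_list()/shrink_active_list() adding ISOLATE_UNMAPPED and ISOLATE_CLEAN). Below is a hedged sketch of that predicate shape; the flag values and the page-state struct are simplified assumptions.

        #include <stdio.h>
        #include <stdbool.h>

        /* Simplified isolation modes and page state; names mirror the diff, layout is assumed. */
        #define ISOLATE_CLEAN     0x1   /* only pages that need no writeback */
        #define ISOLATE_UNMAPPED  0x2   /* only pages with no mapped ptes */

        struct page_model {
                bool on_lru, unevictable, dirty, writeback, mapped;
        };

        /* Returns 0 if the page may be isolated, negative otherwise (kernel-style convention). */
        static int can_isolate(const struct page_model *page, unsigned int mode)
        {
                if (!page->on_lru)
                        return -1;      /* only take pages on the LRU */
                if (page->unevictable)
                        return -1;      /* never hand unevictable pages to reclaim/compaction */
                if ((mode & ISOLATE_CLEAN) && (page->dirty || page->writeback))
                        return -1;
                if ((mode & ISOLATE_UNMAPPED) && page->mapped)
                        return -1;
                return 0;
        }

        int main(void)
        {
                struct page_model dirty_mapped = { true, false, true, false, true };

                printf("default mode: %s\n", can_isolate(&dirty_mapped, 0) ? "skip" : "isolate");
                printf("clean-only  : %s\n",
                       can_isolate(&dirty_mapped, ISOLATE_CLEAN) ? "skip" : "isolate");
                return 0;
        }
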
@@ -1135,54 +1011,39 @@ int __isolate_lru_page(struct page *page, isolate_mode_t mode, int file)
1135 * Appropriate locks must be held before calling this function. 1011 * Appropriate locks must be held before calling this function.
1136 * 1012 *
1137 * @nr_to_scan: The number of pages to look through on the list. 1013 * @nr_to_scan: The number of pages to look through on the list.
1138 * @mz: The mem_cgroup_zone to pull pages from. 1014 * @lruvec: The LRU vector to pull pages from.
1139 * @dst: The temp list to put pages on to. 1015 * @dst: The temp list to put pages on to.
1140 * @nr_scanned: The number of pages that were scanned. 1016 * @nr_scanned: The number of pages that were scanned.
1141 * @sc: The scan_control struct for this reclaim session 1017 * @sc: The scan_control struct for this reclaim session
1142 * @mode: One of the LRU isolation modes 1018 * @mode: One of the LRU isolation modes
1143 * @active: True [1] if isolating active pages 1019 * @lru: LRU list id for isolating
1144 * @file: True [1] if isolating file [!anon] pages
1145 * 1020 *
1146 * returns how many pages were moved onto *@dst. 1021 * returns how many pages were moved onto *@dst.
1147 */ 1022 */
1148static unsigned long isolate_lru_pages(unsigned long nr_to_scan, 1023static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
1149 struct mem_cgroup_zone *mz, struct list_head *dst, 1024 struct lruvec *lruvec, struct list_head *dst,
1150 unsigned long *nr_scanned, struct scan_control *sc, 1025 unsigned long *nr_scanned, struct scan_control *sc,
1151 isolate_mode_t mode, int active, int file) 1026 isolate_mode_t mode, enum lru_list lru)
1152{ 1027{
1153 struct lruvec *lruvec; 1028 struct list_head *src = &lruvec->lists[lru];
1154 struct list_head *src;
1155 unsigned long nr_taken = 0; 1029 unsigned long nr_taken = 0;
1156 unsigned long nr_lumpy_taken = 0;
1157 unsigned long nr_lumpy_dirty = 0;
1158 unsigned long nr_lumpy_failed = 0;
1159 unsigned long scan; 1030 unsigned long scan;
1160 int lru = LRU_BASE;
1161
1162 lruvec = mem_cgroup_zone_lruvec(mz->zone, mz->mem_cgroup);
1163 if (active)
1164 lru += LRU_ACTIVE;
1165 if (file)
1166 lru += LRU_FILE;
1167 src = &lruvec->lists[lru];
1168 1031
1169 for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) { 1032 for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) {
1170 struct page *page; 1033 struct page *page;
1171 unsigned long pfn; 1034 int nr_pages;
1172 unsigned long end_pfn;
1173 unsigned long page_pfn;
1174 int zone_id;
1175 1035
1176 page = lru_to_page(src); 1036 page = lru_to_page(src);
1177 prefetchw_prev_lru_page(page, src, flags); 1037 prefetchw_prev_lru_page(page, src, flags);
1178 1038
1179 VM_BUG_ON(!PageLRU(page)); 1039 VM_BUG_ON(!PageLRU(page));
1180 1040
1181 switch (__isolate_lru_page(page, mode, file)) { 1041 switch (__isolate_lru_page(page, mode)) {
1182 case 0: 1042 case 0:
1183 mem_cgroup_lru_del(page); 1043 nr_pages = hpage_nr_pages(page);
1044 mem_cgroup_update_lru_size(lruvec, lru, -nr_pages);
1184 list_move(&page->lru, dst); 1045 list_move(&page->lru, dst);
1185 nr_taken += hpage_nr_pages(page); 1046 nr_taken += nr_pages;
1186 break; 1047 break;
1187 1048
1188 case -EBUSY: 1049 case -EBUSY:
@@ -1193,93 +1054,11 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
1193 default: 1054 default:
1194 BUG(); 1055 BUG();
1195 } 1056 }
1196
1197 if (!sc->order || !(sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM))
1198 continue;
1199
1200 /*
1201 * Attempt to take all pages in the order aligned region
1202 * surrounding the tag page. Only take those pages of
1203 * the same active state as that tag page. We may safely
1204 * round the target page pfn down to the requested order
1205 * as the mem_map is guaranteed valid out to MAX_ORDER,
1206 * where that page is in a different zone we will detect
1207 * it from its zone id and abort this block scan.
1208 */
1209 zone_id = page_zone_id(page);
1210 page_pfn = page_to_pfn(page);
1211 pfn = page_pfn & ~((1 << sc->order) - 1);
1212 end_pfn = pfn + (1 << sc->order);
1213 for (; pfn < end_pfn; pfn++) {
1214 struct page *cursor_page;
1215
1216 /* The target page is in the block, ignore it. */
1217 if (unlikely(pfn == page_pfn))
1218 continue;
1219
1220 /* Avoid holes within the zone. */
1221 if (unlikely(!pfn_valid_within(pfn)))
1222 break;
1223
1224 cursor_page = pfn_to_page(pfn);
1225
1226 /* Check that we have not crossed a zone boundary. */
1227 if (unlikely(page_zone_id(cursor_page) != zone_id))
1228 break;
1229
1230 /*
1231 * If we don't have enough swap space, reclaiming of
1232 * anon page which don't already have a swap slot is
1233 * pointless.
1234 */
1235 if (nr_swap_pages <= 0 && PageSwapBacked(cursor_page) &&
1236 !PageSwapCache(cursor_page))
1237 break;
1238
1239 if (__isolate_lru_page(cursor_page, mode, file) == 0) {
1240 unsigned int isolated_pages;
1241
1242 mem_cgroup_lru_del(cursor_page);
1243 list_move(&cursor_page->lru, dst);
1244 isolated_pages = hpage_nr_pages(cursor_page);
1245 nr_taken += isolated_pages;
1246 nr_lumpy_taken += isolated_pages;
1247 if (PageDirty(cursor_page))
1248 nr_lumpy_dirty += isolated_pages;
1249 scan++;
1250 pfn += isolated_pages - 1;
1251 } else {
1252 /*
1253 * Check if the page is freed already.
1254 *
1255 * We can't use page_count() as that
1256 * requires compound_head and we don't
1257 * have a pin on the page here. If a
1258 * page is tail, we may or may not
1259 * have isolated the head, so assume
1260 * it's not free, it'd be tricky to
1261 * track the head status without a
1262 * page pin.
1263 */
1264 if (!PageTail(cursor_page) &&
1265 !atomic_read(&cursor_page->_count))
1266 continue;
1267 break;
1268 }
1269 }
1270
1271 /* If we break out of the loop above, lumpy reclaim failed */
1272 if (pfn < end_pfn)
1273 nr_lumpy_failed++;
1274 } 1057 }
1275 1058
1276 *nr_scanned = scan; 1059 *nr_scanned = scan;
1277 1060 trace_mm_vmscan_lru_isolate(sc->order, nr_to_scan, scan,
1278 trace_mm_vmscan_lru_isolate(sc->order, 1061 nr_taken, mode, is_file_lru(lru));
1279 nr_to_scan, scan,
1280 nr_taken,
1281 nr_lumpy_taken, nr_lumpy_dirty, nr_lumpy_failed,
1282 mode, file);
1283 return nr_taken; 1062 return nr_taken;
1284} 1063}
1285 1064
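
Editor's note: once the order-aligned block scan is gone, isolate_lru_pages() is just a bounded walk from the tail of one LRU list: pull up to nr_to_scan entries that pass __isolate_lru_page(), move them to a private list, and account compound pages at their real size. The toy model below shows the distinction the new code keeps between entries scanned and base pages taken; the array-as-list and the page-size values are stand-ins, not the kernel API.

        #include <stdio.h>

        /* Toy model: the LRU "list" is an array of page sizes (1 = base page, 512 = a huge page). */
        static unsigned long isolate(const unsigned int *lru, unsigned int lru_len,
                                     unsigned long nr_to_scan, unsigned long *nr_scanned)
        {
                unsigned long nr_taken = 0, scan;

                for (scan = 0; scan < nr_to_scan && scan < lru_len; scan++) {
                        unsigned int nr_pages = lru[scan];      /* hpage_nr_pages() stand-in */

                        /* a real page could also be skipped (-EBUSY) and left on the list */
                        nr_taken += nr_pages;
                }
                *nr_scanned = scan;
                return nr_taken;
        }

        int main(void)
        {
                unsigned int lru[] = { 1, 1, 512, 1, 1, 1 };    /* one THP among base pages */
                unsigned long scanned;
                unsigned long taken = isolate(lru, 6, 32, &scanned);

                /* scanned counts list entries, taken counts base pages - as in the new code */
                printf("scanned %lu entries, isolated %lu base pages\n", scanned, taken);
                return 0;
        }
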
@@ -1316,15 +1095,16 @@ int isolate_lru_page(struct page *page)
1316 1095
1317 if (PageLRU(page)) { 1096 if (PageLRU(page)) {
1318 struct zone *zone = page_zone(page); 1097 struct zone *zone = page_zone(page);
1098 struct lruvec *lruvec;
1319 1099
1320 spin_lock_irq(&zone->lru_lock); 1100 spin_lock_irq(&zone->lru_lock);
1101 lruvec = mem_cgroup_page_lruvec(page, zone);
1321 if (PageLRU(page)) { 1102 if (PageLRU(page)) {
1322 int lru = page_lru(page); 1103 int lru = page_lru(page);
1323 ret = 0;
1324 get_page(page); 1104 get_page(page);
1325 ClearPageLRU(page); 1105 ClearPageLRU(page);
1326 1106 del_page_from_lru_list(page, lruvec, lru);
1327 del_page_from_lru_list(zone, page, lru); 1107 ret = 0;
1328 } 1108 }
1329 spin_unlock_irq(&zone->lru_lock); 1109 spin_unlock_irq(&zone->lru_lock);
1330 } 1110 }
@@ -1357,11 +1137,10 @@ static int too_many_isolated(struct zone *zone, int file,
1357} 1137}
1358 1138
1359static noinline_for_stack void 1139static noinline_for_stack void
1360putback_inactive_pages(struct mem_cgroup_zone *mz, 1140putback_inactive_pages(struct lruvec *lruvec, struct list_head *page_list)
1361 struct list_head *page_list)
1362{ 1141{
1363 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(mz); 1142 struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
1364 struct zone *zone = mz->zone; 1143 struct zone *zone = lruvec_zone(lruvec);
1365 LIST_HEAD(pages_to_free); 1144 LIST_HEAD(pages_to_free);
1366 1145
1367 /* 1146 /*
@@ -1379,9 +1158,13 @@ putback_inactive_pages(struct mem_cgroup_zone *mz,
1379 spin_lock_irq(&zone->lru_lock); 1158 spin_lock_irq(&zone->lru_lock);
1380 continue; 1159 continue;
1381 } 1160 }
1161
1162 lruvec = mem_cgroup_page_lruvec(page, zone);
1163
1382 SetPageLRU(page); 1164 SetPageLRU(page);
1383 lru = page_lru(page); 1165 lru = page_lru(page);
1384 add_page_to_lru_list(zone, page, lru); 1166 add_page_to_lru_list(page, lruvec, lru);
1167
1385 if (is_active_lru(lru)) { 1168 if (is_active_lru(lru)) {
1386 int file = is_file_lru(lru); 1169 int file = is_file_lru(lru);
1387 int numpages = hpage_nr_pages(page); 1170 int numpages = hpage_nr_pages(page);
@@ -1390,7 +1173,7 @@ putback_inactive_pages(struct mem_cgroup_zone *mz,
1390 if (put_page_testzero(page)) { 1173 if (put_page_testzero(page)) {
1391 __ClearPageLRU(page); 1174 __ClearPageLRU(page);
1392 __ClearPageActive(page); 1175 __ClearPageActive(page);
1393 del_page_from_lru_list(zone, page, lru); 1176 del_page_from_lru_list(page, lruvec, lru);
1394 1177
1395 if (unlikely(PageCompound(page))) { 1178 if (unlikely(PageCompound(page))) {
1396 spin_unlock_irq(&zone->lru_lock); 1179 spin_unlock_irq(&zone->lru_lock);
@@ -1407,112 +1190,24 @@ putback_inactive_pages(struct mem_cgroup_zone *mz,
1407 list_splice(&pages_to_free, page_list); 1190 list_splice(&pages_to_free, page_list);
1408} 1191}
1409 1192
1410static noinline_for_stack void
1411update_isolated_counts(struct mem_cgroup_zone *mz,
1412 struct list_head *page_list,
1413 unsigned long *nr_anon,
1414 unsigned long *nr_file)
1415{
1416 struct zone *zone = mz->zone;
1417 unsigned int count[NR_LRU_LISTS] = { 0, };
1418 unsigned long nr_active = 0;
1419 struct page *page;
1420 int lru;
1421
1422 /*
1423 * Count pages and clear active flags
1424 */
1425 list_for_each_entry(page, page_list, lru) {
1426 int numpages = hpage_nr_pages(page);
1427 lru = page_lru_base_type(page);
1428 if (PageActive(page)) {
1429 lru += LRU_ACTIVE;
1430 ClearPageActive(page);
1431 nr_active += numpages;
1432 }
1433 count[lru] += numpages;
1434 }
1435
1436 preempt_disable();
1437 __count_vm_events(PGDEACTIVATE, nr_active);
1438
1439 __mod_zone_page_state(zone, NR_ACTIVE_FILE,
1440 -count[LRU_ACTIVE_FILE]);
1441 __mod_zone_page_state(zone, NR_INACTIVE_FILE,
1442 -count[LRU_INACTIVE_FILE]);
1443 __mod_zone_page_state(zone, NR_ACTIVE_ANON,
1444 -count[LRU_ACTIVE_ANON]);
1445 __mod_zone_page_state(zone, NR_INACTIVE_ANON,
1446 -count[LRU_INACTIVE_ANON]);
1447
1448 *nr_anon = count[LRU_ACTIVE_ANON] + count[LRU_INACTIVE_ANON];
1449 *nr_file = count[LRU_ACTIVE_FILE] + count[LRU_INACTIVE_FILE];
1450
1451 __mod_zone_page_state(zone, NR_ISOLATED_ANON, *nr_anon);
1452 __mod_zone_page_state(zone, NR_ISOLATED_FILE, *nr_file);
1453 preempt_enable();
1454}
1455
1456/*
1457 * Returns true if a direct reclaim should wait on pages under writeback.
1458 *
1459 * If we are direct reclaiming for contiguous pages and we do not reclaim
1460 * everything in the list, try again and wait for writeback IO to complete.
1461 * This will stall high-order allocations noticeably. Only do that when we really
1462 * need to free the pages under high memory pressure.
1463 */
1464static inline bool should_reclaim_stall(unsigned long nr_taken,
1465 unsigned long nr_freed,
1466 int priority,
1467 struct scan_control *sc)
1468{
1469 int lumpy_stall_priority;
1470
1471 /* kswapd should not stall on sync IO */
1472 if (current_is_kswapd())
1473 return false;
1474
1475 /* Only stall on lumpy reclaim */
1476 if (sc->reclaim_mode & RECLAIM_MODE_SINGLE)
1477 return false;
1478
1479 /* If we have reclaimed everything on the isolated list, no stall */
1480 if (nr_freed == nr_taken)
1481 return false;
1482
1483 /*
1484 * For high-order allocations, there are two stall thresholds.
1485 * High-cost allocations stall immediately whereas lower
1486 * order allocations such as stacks require the scanning
1487 * priority to be much higher before stalling.
1488 */
1489 if (sc->order > PAGE_ALLOC_COSTLY_ORDER)
1490 lumpy_stall_priority = DEF_PRIORITY;
1491 else
1492 lumpy_stall_priority = DEF_PRIORITY / 3;
1493
1494 return priority <= lumpy_stall_priority;
1495}
1496
1497/* 1193/*
1498 * shrink_inactive_list() is a helper for shrink_zone(). It returns the number 1194 * shrink_inactive_list() is a helper for shrink_zone(). It returns the number
1499 * of reclaimed pages 1195 * of reclaimed pages
1500 */ 1196 */
1501static noinline_for_stack unsigned long 1197static noinline_for_stack unsigned long
1502shrink_inactive_list(unsigned long nr_to_scan, struct mem_cgroup_zone *mz, 1198shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
1503 struct scan_control *sc, int priority, int file) 1199 struct scan_control *sc, enum lru_list lru)
1504{ 1200{
1505 LIST_HEAD(page_list); 1201 LIST_HEAD(page_list);
1506 unsigned long nr_scanned; 1202 unsigned long nr_scanned;
1507 unsigned long nr_reclaimed = 0; 1203 unsigned long nr_reclaimed = 0;
1508 unsigned long nr_taken; 1204 unsigned long nr_taken;
1509 unsigned long nr_anon;
1510 unsigned long nr_file;
1511 unsigned long nr_dirty = 0; 1205 unsigned long nr_dirty = 0;
1512 unsigned long nr_writeback = 0; 1206 unsigned long nr_writeback = 0;
1513 isolate_mode_t isolate_mode = ISOLATE_INACTIVE; 1207 isolate_mode_t isolate_mode = 0;
1514 struct zone *zone = mz->zone; 1208 int file = is_file_lru(lru);
1515 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(mz); 1209 struct zone *zone = lruvec_zone(lruvec);
1210 struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
1516 1211
1517 while (unlikely(too_many_isolated(zone, file, sc))) { 1212 while (unlikely(too_many_isolated(zone, file, sc))) {
1518 congestion_wait(BLK_RW_ASYNC, HZ/10); 1213 congestion_wait(BLK_RW_ASYNC, HZ/10);
@@ -1522,10 +1217,6 @@ shrink_inactive_list(unsigned long nr_to_scan, struct mem_cgroup_zone *mz,
1522 return SWAP_CLUSTER_MAX; 1217 return SWAP_CLUSTER_MAX;
1523 } 1218 }
1524 1219
1525 set_reclaim_mode(priority, sc, false);
1526 if (sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM)
1527 isolate_mode |= ISOLATE_ACTIVE;
1528
1529 lru_add_drain(); 1220 lru_add_drain();
1530 1221
1531 if (!sc->may_unmap) 1222 if (!sc->may_unmap)
@@ -1535,38 +1226,30 @@ shrink_inactive_list(unsigned long nr_to_scan, struct mem_cgroup_zone *mz,
1535 1226
1536 spin_lock_irq(&zone->lru_lock); 1227 spin_lock_irq(&zone->lru_lock);
1537 1228
1538 nr_taken = isolate_lru_pages(nr_to_scan, mz, &page_list, &nr_scanned, 1229 nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &page_list,
1539 sc, isolate_mode, 0, file); 1230 &nr_scanned, sc, isolate_mode, lru);
1231
1232 __mod_zone_page_state(zone, NR_LRU_BASE + lru, -nr_taken);
1233 __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, nr_taken);
1234
1540 if (global_reclaim(sc)) { 1235 if (global_reclaim(sc)) {
1541 zone->pages_scanned += nr_scanned; 1236 zone->pages_scanned += nr_scanned;
1542 if (current_is_kswapd()) 1237 if (current_is_kswapd())
1543 __count_zone_vm_events(PGSCAN_KSWAPD, zone, 1238 __count_zone_vm_events(PGSCAN_KSWAPD, zone, nr_scanned);
1544 nr_scanned);
1545 else 1239 else
1546 __count_zone_vm_events(PGSCAN_DIRECT, zone, 1240 __count_zone_vm_events(PGSCAN_DIRECT, zone, nr_scanned);
1547 nr_scanned);
1548 } 1241 }
1549 spin_unlock_irq(&zone->lru_lock); 1242 spin_unlock_irq(&zone->lru_lock);
1550 1243
1551 if (nr_taken == 0) 1244 if (nr_taken == 0)
1552 return 0; 1245 return 0;
1553 1246
1554 update_isolated_counts(mz, &page_list, &nr_anon, &nr_file); 1247 nr_reclaimed = shrink_page_list(&page_list, zone, sc,
1555
1556 nr_reclaimed = shrink_page_list(&page_list, mz, sc, priority,
1557 &nr_dirty, &nr_writeback); 1248 &nr_dirty, &nr_writeback);
1558 1249
1559 /* Check if we should synchronously wait for writeback */
1560 if (should_reclaim_stall(nr_taken, nr_reclaimed, priority, sc)) {
1561 set_reclaim_mode(priority, sc, true);
1562 nr_reclaimed += shrink_page_list(&page_list, mz, sc,
1563 priority, &nr_dirty, &nr_writeback);
1564 }
1565
1566 spin_lock_irq(&zone->lru_lock); 1250 spin_lock_irq(&zone->lru_lock);
1567 1251
1568 reclaim_stat->recent_scanned[0] += nr_anon; 1252 reclaim_stat->recent_scanned[file] += nr_taken;
1569 reclaim_stat->recent_scanned[1] += nr_file;
1570 1253
1571 if (global_reclaim(sc)) { 1254 if (global_reclaim(sc)) {
1572 if (current_is_kswapd()) 1255 if (current_is_kswapd())
@@ -1577,10 +1260,9 @@ shrink_inactive_list(unsigned long nr_to_scan, struct mem_cgroup_zone *mz,
1577 nr_reclaimed); 1260 nr_reclaimed);
1578 } 1261 }
1579 1262
1580 putback_inactive_pages(mz, &page_list); 1263 putback_inactive_pages(lruvec, &page_list);
1581 1264
1582 __mod_zone_page_state(zone, NR_ISOLATED_ANON, -nr_anon); 1265 __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken);
1583 __mod_zone_page_state(zone, NR_ISOLATED_FILE, -nr_file);
1584 1266
1585 spin_unlock_irq(&zone->lru_lock); 1267 spin_unlock_irq(&zone->lru_lock);
1586 1268
@@ -1609,14 +1291,15 @@ shrink_inactive_list(unsigned long nr_to_scan, struct mem_cgroup_zone *mz,
1609 * DEF_PRIORITY-6 For SWAP_CLUSTER_MAX isolated pages, throttle if any 1291 * DEF_PRIORITY-6 For SWAP_CLUSTER_MAX isolated pages, throttle if any
1610 * isolated page is PageWriteback 1292 * isolated page is PageWriteback
1611 */ 1293 */
1612 if (nr_writeback && nr_writeback >= (nr_taken >> (DEF_PRIORITY-priority))) 1294 if (nr_writeback && nr_writeback >=
1295 (nr_taken >> (DEF_PRIORITY - sc->priority)))
1613 wait_iff_congested(zone, BLK_RW_ASYNC, HZ/10); 1296 wait_iff_congested(zone, BLK_RW_ASYNC, HZ/10);
1614 1297
1615 trace_mm_vmscan_lru_shrink_inactive(zone->zone_pgdat->node_id, 1298 trace_mm_vmscan_lru_shrink_inactive(zone->zone_pgdat->node_id,
1616 zone_idx(zone), 1299 zone_idx(zone),
1617 nr_scanned, nr_reclaimed, 1300 nr_scanned, nr_reclaimed,
1618 priority, 1301 sc->priority,
1619 trace_shrink_flags(file, sc->reclaim_mode)); 1302 trace_shrink_flags(file));
1620 return nr_reclaimed; 1303 return nr_reclaimed;
1621} 1304}
1622 1305
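
Editor's note: the rewritten shrink_inactive_list() no longer waits on individual pages under writeback. Instead it throttles once per isolated batch when nr_writeback >= nr_taken >> (DEF_PRIORITY - sc->priority): at DEF_PRIORITY the whole batch must be under writeback, and the bar halves with every priority step until, at DEF_PRIORITY-6, a single PageWriteback page in a SWAP_CLUSTER_MAX batch is enough (matching the comment retained in the hunk above). A small worked model of that threshold; DEF_PRIORITY is assumed to be 12, as in this era's mm code.

        #include <stdio.h>

        #define DEF_PRIORITY 12         /* assumed value, matching mm of this era */

        /* Returns 1 if reclaim should briefly wait for congestion to clear. */
        static int should_throttle(unsigned long nr_taken, unsigned long nr_writeback, int priority)
        {
                return nr_writeback && nr_writeback >= (nr_taken >> (DEF_PRIORITY - priority));
        }

        int main(void)
        {
                unsigned long nr_taken = 32;    /* a SWAP_CLUSTER_MAX-sized batch */
                int priority;

                /* Threshold: 32 pages at priority 12, 16 at 11, ..., any writeback page by 6. */
                for (priority = DEF_PRIORITY; priority >= DEF_PRIORITY - 6; priority--)
                        printf("priority %2d: threshold %2lu pages, one writeback page %s\n",
                               priority, nr_taken >> (DEF_PRIORITY - priority),
                               should_throttle(nr_taken, 1, priority) ? "throttles"
                                                                      : "does not throttle");
                return 0;
        }
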
@@ -1638,30 +1321,32 @@ shrink_inactive_list(unsigned long nr_to_scan, struct mem_cgroup_zone *mz,
1638 * But we had to alter page->flags anyway. 1321 * But we had to alter page->flags anyway.
1639 */ 1322 */
1640 1323
1641static void move_active_pages_to_lru(struct zone *zone, 1324static void move_active_pages_to_lru(struct lruvec *lruvec,
1642 struct list_head *list, 1325 struct list_head *list,
1643 struct list_head *pages_to_free, 1326 struct list_head *pages_to_free,
1644 enum lru_list lru) 1327 enum lru_list lru)
1645{ 1328{
1329 struct zone *zone = lruvec_zone(lruvec);
1646 unsigned long pgmoved = 0; 1330 unsigned long pgmoved = 0;
1647 struct page *page; 1331 struct page *page;
1332 int nr_pages;
1648 1333
1649 while (!list_empty(list)) { 1334 while (!list_empty(list)) {
1650 struct lruvec *lruvec;
1651
1652 page = lru_to_page(list); 1335 page = lru_to_page(list);
1336 lruvec = mem_cgroup_page_lruvec(page, zone);
1653 1337
1654 VM_BUG_ON(PageLRU(page)); 1338 VM_BUG_ON(PageLRU(page));
1655 SetPageLRU(page); 1339 SetPageLRU(page);
1656 1340
1657 lruvec = mem_cgroup_lru_add_list(zone, page, lru); 1341 nr_pages = hpage_nr_pages(page);
1342 mem_cgroup_update_lru_size(lruvec, lru, nr_pages);
1658 list_move(&page->lru, &lruvec->lists[lru]); 1343 list_move(&page->lru, &lruvec->lists[lru]);
1659 pgmoved += hpage_nr_pages(page); 1344 pgmoved += nr_pages;
1660 1345
1661 if (put_page_testzero(page)) { 1346 if (put_page_testzero(page)) {
1662 __ClearPageLRU(page); 1347 __ClearPageLRU(page);
1663 __ClearPageActive(page); 1348 __ClearPageActive(page);
1664 del_page_from_lru_list(zone, page, lru); 1349 del_page_from_lru_list(page, lruvec, lru);
1665 1350
1666 if (unlikely(PageCompound(page))) { 1351 if (unlikely(PageCompound(page))) {
1667 spin_unlock_irq(&zone->lru_lock); 1352 spin_unlock_irq(&zone->lru_lock);
@@ -1677,9 +1362,9 @@ static void move_active_pages_to_lru(struct zone *zone,
1677} 1362}
1678 1363
1679static void shrink_active_list(unsigned long nr_to_scan, 1364static void shrink_active_list(unsigned long nr_to_scan,
1680 struct mem_cgroup_zone *mz, 1365 struct lruvec *lruvec,
1681 struct scan_control *sc, 1366 struct scan_control *sc,
1682 int priority, int file) 1367 enum lru_list lru)
1683{ 1368{
1684 unsigned long nr_taken; 1369 unsigned long nr_taken;
1685 unsigned long nr_scanned; 1370 unsigned long nr_scanned;
@@ -1688,15 +1373,14 @@ static void shrink_active_list(unsigned long nr_to_scan,
1688 LIST_HEAD(l_active); 1373 LIST_HEAD(l_active);
1689 LIST_HEAD(l_inactive); 1374 LIST_HEAD(l_inactive);
1690 struct page *page; 1375 struct page *page;
1691 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(mz); 1376 struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
1692 unsigned long nr_rotated = 0; 1377 unsigned long nr_rotated = 0;
1693 isolate_mode_t isolate_mode = ISOLATE_ACTIVE; 1378 isolate_mode_t isolate_mode = 0;
1694 struct zone *zone = mz->zone; 1379 int file = is_file_lru(lru);
1380 struct zone *zone = lruvec_zone(lruvec);
1695 1381
1696 lru_add_drain(); 1382 lru_add_drain();
1697 1383
1698 reset_reclaim_mode(sc);
1699
1700 if (!sc->may_unmap) 1384 if (!sc->may_unmap)
1701 isolate_mode |= ISOLATE_UNMAPPED; 1385 isolate_mode |= ISOLATE_UNMAPPED;
1702 if (!sc->may_writepage) 1386 if (!sc->may_writepage)
@@ -1704,18 +1388,15 @@ static void shrink_active_list(unsigned long nr_to_scan,
1704 1388
1705 spin_lock_irq(&zone->lru_lock); 1389 spin_lock_irq(&zone->lru_lock);
1706 1390
1707 nr_taken = isolate_lru_pages(nr_to_scan, mz, &l_hold, &nr_scanned, sc, 1391 nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &l_hold,
1708 isolate_mode, 1, file); 1392 &nr_scanned, sc, isolate_mode, lru);
1709 if (global_reclaim(sc)) 1393 if (global_reclaim(sc))
1710 zone->pages_scanned += nr_scanned; 1394 zone->pages_scanned += nr_scanned;
1711 1395
1712 reclaim_stat->recent_scanned[file] += nr_taken; 1396 reclaim_stat->recent_scanned[file] += nr_taken;
1713 1397
1714 __count_zone_vm_events(PGREFILL, zone, nr_scanned); 1398 __count_zone_vm_events(PGREFILL, zone, nr_scanned);
1715 if (file) 1399 __mod_zone_page_state(zone, NR_LRU_BASE + lru, -nr_taken);
1716 __mod_zone_page_state(zone, NR_ACTIVE_FILE, -nr_taken);
1717 else
1718 __mod_zone_page_state(zone, NR_ACTIVE_ANON, -nr_taken);
1719 __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, nr_taken); 1400 __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, nr_taken);
1720 spin_unlock_irq(&zone->lru_lock); 1401 spin_unlock_irq(&zone->lru_lock);
1721 1402
@@ -1737,7 +1418,8 @@ static void shrink_active_list(unsigned long nr_to_scan,
1737 } 1418 }
1738 } 1419 }
1739 1420
1740 if (page_referenced(page, 0, mz->mem_cgroup, &vm_flags)) { 1421 if (page_referenced(page, 0, sc->target_mem_cgroup,
1422 &vm_flags)) {
1741 nr_rotated += hpage_nr_pages(page); 1423 nr_rotated += hpage_nr_pages(page);
1742 /* 1424 /*
1743 * Identify referenced, file-backed active pages and 1425 * Identify referenced, file-backed active pages and
@@ -1770,10 +1452,8 @@ static void shrink_active_list(unsigned long nr_to_scan,
1770 */ 1452 */
1771 reclaim_stat->recent_rotated[file] += nr_rotated; 1453 reclaim_stat->recent_rotated[file] += nr_rotated;
1772 1454
1773 move_active_pages_to_lru(zone, &l_active, &l_hold, 1455 move_active_pages_to_lru(lruvec, &l_active, &l_hold, lru);
1774 LRU_ACTIVE + file * LRU_FILE); 1456 move_active_pages_to_lru(lruvec, &l_inactive, &l_hold, lru - LRU_ACTIVE);
1775 move_active_pages_to_lru(zone, &l_inactive, &l_hold,
1776 LRU_BASE + file * LRU_FILE);
1777 __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken); 1457 __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken);
1778 spin_unlock_irq(&zone->lru_lock); 1458 spin_unlock_irq(&zone->lru_lock);
1779 1459
@@ -1796,13 +1476,12 @@ static int inactive_anon_is_low_global(struct zone *zone)
1796 1476
1797/** 1477/**
1798 * inactive_anon_is_low - check if anonymous pages need to be deactivated 1478 * inactive_anon_is_low - check if anonymous pages need to be deactivated
1799 * @zone: zone to check 1479 * @lruvec: LRU vector to check
1800 * @sc: scan control of this context
1801 * 1480 *
1802 * Returns true if the zone does not have enough inactive anon pages, 1481 * Returns true if the zone does not have enough inactive anon pages,
1803 * meaning some active anon pages need to be deactivated. 1482 * meaning some active anon pages need to be deactivated.
1804 */ 1483 */
1805static int inactive_anon_is_low(struct mem_cgroup_zone *mz) 1484static int inactive_anon_is_low(struct lruvec *lruvec)
1806{ 1485{
1807 /* 1486 /*
1808 * If we don't have swap space, anonymous page deactivation 1487 * If we don't have swap space, anonymous page deactivation
@@ -1811,14 +1490,13 @@ static int inactive_anon_is_low(struct mem_cgroup_zone *mz)
1811 if (!total_swap_pages) 1490 if (!total_swap_pages)
1812 return 0; 1491 return 0;
1813 1492
1814 if (!scanning_global_lru(mz)) 1493 if (!mem_cgroup_disabled())
1815 return mem_cgroup_inactive_anon_is_low(mz->mem_cgroup, 1494 return mem_cgroup_inactive_anon_is_low(lruvec);
1816 mz->zone);
1817 1495
1818 return inactive_anon_is_low_global(mz->zone); 1496 return inactive_anon_is_low_global(lruvec_zone(lruvec));
1819} 1497}
1820#else 1498#else
1821static inline int inactive_anon_is_low(struct mem_cgroup_zone *mz) 1499static inline int inactive_anon_is_low(struct lruvec *lruvec)
1822{ 1500{
1823 return 0; 1501 return 0;
1824} 1502}
@@ -1836,7 +1514,7 @@ static int inactive_file_is_low_global(struct zone *zone)
1836 1514
1837/** 1515/**
1838 * inactive_file_is_low - check if file pages need to be deactivated 1516 * inactive_file_is_low - check if file pages need to be deactivated
1839 * @mz: memory cgroup and zone to check 1517 * @lruvec: LRU vector to check
1840 * 1518 *
1841 * When the system is doing streaming IO, memory pressure here 1519 * When the system is doing streaming IO, memory pressure here
1842 * ensures that active file pages get deactivated, until more 1520 * ensures that active file pages get deactivated, until more
@@ -1848,44 +1526,39 @@ static int inactive_file_is_low_global(struct zone *zone)
1848 * This uses a different ratio than the anonymous pages, because 1526 * This uses a different ratio than the anonymous pages, because
1849 * the page cache uses a use-once replacement algorithm. 1527 * the page cache uses a use-once replacement algorithm.
1850 */ 1528 */
1851static int inactive_file_is_low(struct mem_cgroup_zone *mz) 1529static int inactive_file_is_low(struct lruvec *lruvec)
1852{ 1530{
1853 if (!scanning_global_lru(mz)) 1531 if (!mem_cgroup_disabled())
1854 return mem_cgroup_inactive_file_is_low(mz->mem_cgroup, 1532 return mem_cgroup_inactive_file_is_low(lruvec);
1855 mz->zone);
1856 1533
1857 return inactive_file_is_low_global(mz->zone); 1534 return inactive_file_is_low_global(lruvec_zone(lruvec));
1858} 1535}
1859 1536
1860static int inactive_list_is_low(struct mem_cgroup_zone *mz, int file) 1537static int inactive_list_is_low(struct lruvec *lruvec, enum lru_list lru)
1861{ 1538{
1862 if (file) 1539 if (is_file_lru(lru))
1863 return inactive_file_is_low(mz); 1540 return inactive_file_is_low(lruvec);
1864 else 1541 else
1865 return inactive_anon_is_low(mz); 1542 return inactive_anon_is_low(lruvec);
1866} 1543}
1867 1544
1868static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan, 1545static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
1869 struct mem_cgroup_zone *mz, 1546 struct lruvec *lruvec, struct scan_control *sc)
1870 struct scan_control *sc, int priority)
1871{ 1547{
1872 int file = is_file_lru(lru);
1873
1874 if (is_active_lru(lru)) { 1548 if (is_active_lru(lru)) {
1875 if (inactive_list_is_low(mz, file)) 1549 if (inactive_list_is_low(lruvec, lru))
1876 shrink_active_list(nr_to_scan, mz, sc, priority, file); 1550 shrink_active_list(nr_to_scan, lruvec, sc, lru);
1877 return 0; 1551 return 0;
1878 } 1552 }
1879 1553
1880 return shrink_inactive_list(nr_to_scan, mz, sc, priority, file); 1554 return shrink_inactive_list(nr_to_scan, lruvec, sc, lru);
1881} 1555}
1882 1556
1883static int vmscan_swappiness(struct mem_cgroup_zone *mz, 1557static int vmscan_swappiness(struct scan_control *sc)
1884 struct scan_control *sc)
1885{ 1558{
1886 if (global_reclaim(sc)) 1559 if (global_reclaim(sc))
1887 return vm_swappiness; 1560 return vm_swappiness;
1888 return mem_cgroup_swappiness(mz->mem_cgroup); 1561 return mem_cgroup_swappiness(sc->target_mem_cgroup);
1889} 1562}
1890 1563
1891/* 1564/*
@@ -1896,17 +1569,18 @@ static int vmscan_swappiness(struct mem_cgroup_zone *mz,
1896 * 1569 *
1897 * nr[0] = anon pages to scan; nr[1] = file pages to scan 1570 * nr[0] = anon pages to scan; nr[1] = file pages to scan
1898 */ 1571 */
1899static void get_scan_count(struct mem_cgroup_zone *mz, struct scan_control *sc, 1572static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
1900 unsigned long *nr, int priority) 1573 unsigned long *nr)
1901{ 1574{
1902 unsigned long anon, file, free; 1575 unsigned long anon, file, free;
1903 unsigned long anon_prio, file_prio; 1576 unsigned long anon_prio, file_prio;
1904 unsigned long ap, fp; 1577 unsigned long ap, fp;
1905 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(mz); 1578 struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
1906 u64 fraction[2], denominator; 1579 u64 fraction[2], denominator;
1907 enum lru_list lru; 1580 enum lru_list lru;
1908 int noswap = 0; 1581 int noswap = 0;
1909 bool force_scan = false; 1582 bool force_scan = false;
1583 struct zone *zone = lruvec_zone(lruvec);
1910 1584
1911 /* 1585 /*
1912 * If the zone or memcg is small, nr[l] can be 0. This 1586 * If the zone or memcg is small, nr[l] can be 0. This
@@ -1918,7 +1592,7 @@ static void get_scan_count(struct mem_cgroup_zone *mz, struct scan_control *sc,
1918 * latencies, so it's better to scan a minimum amount there as 1592 * latencies, so it's better to scan a minimum amount there as
1919 * well. 1593 * well.
1920 */ 1594 */
1921 if (current_is_kswapd() && mz->zone->all_unreclaimable) 1595 if (current_is_kswapd() && zone->all_unreclaimable)
1922 force_scan = true; 1596 force_scan = true;
1923 if (!global_reclaim(sc)) 1597 if (!global_reclaim(sc))
1924 force_scan = true; 1598 force_scan = true;
@@ -1932,16 +1606,16 @@ static void get_scan_count(struct mem_cgroup_zone *mz, struct scan_control *sc,
1932 goto out; 1606 goto out;
1933 } 1607 }
1934 1608
1935 anon = zone_nr_lru_pages(mz, LRU_ACTIVE_ANON) + 1609 anon = get_lru_size(lruvec, LRU_ACTIVE_ANON) +
1936 zone_nr_lru_pages(mz, LRU_INACTIVE_ANON); 1610 get_lru_size(lruvec, LRU_INACTIVE_ANON);
1937 file = zone_nr_lru_pages(mz, LRU_ACTIVE_FILE) + 1611 file = get_lru_size(lruvec, LRU_ACTIVE_FILE) +
1938 zone_nr_lru_pages(mz, LRU_INACTIVE_FILE); 1612 get_lru_size(lruvec, LRU_INACTIVE_FILE);
1939 1613
1940 if (global_reclaim(sc)) { 1614 if (global_reclaim(sc)) {
1941 free = zone_page_state(mz->zone, NR_FREE_PAGES); 1615 free = zone_page_state(zone, NR_FREE_PAGES);
1942 /* If we have very few page cache pages, 1616 /* If we have very few page cache pages,
1943 force-scan anon pages. */ 1617 force-scan anon pages. */
1944 if (unlikely(file + free <= high_wmark_pages(mz->zone))) { 1618 if (unlikely(file + free <= high_wmark_pages(zone))) {
1945 fraction[0] = 1; 1619 fraction[0] = 1;
1946 fraction[1] = 0; 1620 fraction[1] = 0;
1947 denominator = 1; 1621 denominator = 1;
@@ -1953,8 +1627,8 @@ static void get_scan_count(struct mem_cgroup_zone *mz, struct scan_control *sc,
1953 * With swappiness at 100, anonymous and file have the same priority. 1627 * With swappiness at 100, anonymous and file have the same priority.
1954 * This scanning priority is essentially the inverse of IO cost. 1628 * This scanning priority is essentially the inverse of IO cost.
1955 */ 1629 */
1956 anon_prio = vmscan_swappiness(mz, sc); 1630 anon_prio = vmscan_swappiness(sc);
1957 file_prio = 200 - vmscan_swappiness(mz, sc); 1631 file_prio = 200 - anon_prio;
1958 1632
1959 /* 1633 /*
1960 * OK, so we have swap space and a fair amount of page cache 1634 * OK, so we have swap space and a fair amount of page cache
@@ -1967,7 +1641,7 @@ static void get_scan_count(struct mem_cgroup_zone *mz, struct scan_control *sc,
1967 * 1641 *
1968 * anon in [0], file in [1] 1642 * anon in [0], file in [1]
1969 */ 1643 */
1970 spin_lock_irq(&mz->zone->lru_lock); 1644 spin_lock_irq(&zone->lru_lock);
1971 if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) { 1645 if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) {
1972 reclaim_stat->recent_scanned[0] /= 2; 1646 reclaim_stat->recent_scanned[0] /= 2;
1973 reclaim_stat->recent_rotated[0] /= 2; 1647 reclaim_stat->recent_rotated[0] /= 2;
@@ -1983,12 +1657,12 @@ static void get_scan_count(struct mem_cgroup_zone *mz, struct scan_control *sc,
1983 * proportional to the fraction of recently scanned pages on 1657 * proportional to the fraction of recently scanned pages on
1984 * each list that were recently referenced and in active use. 1658 * each list that were recently referenced and in active use.
1985 */ 1659 */
1986 ap = (anon_prio + 1) * (reclaim_stat->recent_scanned[0] + 1); 1660 ap = anon_prio * (reclaim_stat->recent_scanned[0] + 1);
1987 ap /= reclaim_stat->recent_rotated[0] + 1; 1661 ap /= reclaim_stat->recent_rotated[0] + 1;
1988 1662
1989 fp = (file_prio + 1) * (reclaim_stat->recent_scanned[1] + 1); 1663 fp = file_prio * (reclaim_stat->recent_scanned[1] + 1);
1990 fp /= reclaim_stat->recent_rotated[1] + 1; 1664 fp /= reclaim_stat->recent_rotated[1] + 1;
1991 spin_unlock_irq(&mz->zone->lru_lock); 1665 spin_unlock_irq(&zone->lru_lock);
1992 1666
1993 fraction[0] = ap; 1667 fraction[0] = ap;
1994 fraction[1] = fp; 1668 fraction[1] = fp;
@@ -1998,9 +1672,9 @@ out:
1998 int file = is_file_lru(lru); 1672 int file = is_file_lru(lru);
1999 unsigned long scan; 1673 unsigned long scan;
2000 1674
2001 scan = zone_nr_lru_pages(mz, lru); 1675 scan = get_lru_size(lruvec, lru);
2002 if (priority || noswap) { 1676 if (sc->priority || noswap || !vmscan_swappiness(sc)) {
2003 scan >>= priority; 1677 scan >>= sc->priority;
2004 if (!scan && force_scan) 1678 if (!scan && force_scan)
2005 scan = SWAP_CLUSTER_MAX; 1679 scan = SWAP_CLUSTER_MAX;
2006 scan = div64_u64(scan * fraction[file], denominator); 1680 scan = div64_u64(scan * fraction[file], denominator);
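
Editor's note: get_scan_count() now takes everything from the lruvec and the embedded sc->priority. The anon/file split itself is unchanged in spirit: swappiness sets the base weights (anon_prio + file_prio == 200), the recent_scanned/recent_rotated ratios turn them into fraction[]/denominator, and each list's size is shifted right by sc->priority before being scaled by its fraction. The self-contained model below reproduces that arithmetic; the sample numbers are made up, the force_scan/noswap special cases are omitted, and the denominator is taken as ap + fp + 1 on the assumption that this function keeps its usual form outside the hunks shown.

        #include <stdio.h>
        #include <stdint.h>

        /* Made-up reclaim state for one lruvec; index 0 = anon, 1 = file, as in the kernel. */
        struct recent { uint64_t scanned[2], rotated[2]; };

        static void scan_targets(const uint64_t lru_size[2], const struct recent *rs,
                                 int swappiness, int priority, uint64_t out[2])
        {
                uint64_t anon_prio = swappiness;                /* vmscan_swappiness(sc) */
                uint64_t file_prio = 200 - swappiness;
                uint64_t ap = anon_prio * (rs->scanned[0] + 1) / (rs->rotated[0] + 1);
                uint64_t fp = file_prio * (rs->scanned[1] + 1) / (rs->rotated[1] + 1);
                uint64_t fraction[2] = { ap, fp };
                uint64_t denominator = ap + fp + 1;
                int type;

                for (type = 0; type < 2; type++) {
                        uint64_t scan = lru_size[type] >> priority;     /* "total_size >> priority" */
                        out[type] = scan * fraction[type] / denominator;
                }
        }

        int main(void)
        {
                uint64_t lru_size[2] = { 200000, 800000 };      /* anon, file pages */
                struct recent rs = { .scanned = { 4000, 16000 }, .rotated = { 3000, 1000 } };
                uint64_t nr[2];

                scan_targets(lru_size, &rs, 60, 12, nr);        /* vm_swappiness=60, DEF_PRIORITY */
                printf("scan %llu anon and %llu file pages this pass\n",
                       (unsigned long long)nr[0], (unsigned long long)nr[1]);
                return 0;
        }
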
@@ -2009,14 +1683,25 @@ out:
2009 } 1683 }
2010} 1684}
2011 1685
1686/* Use reclaim/compaction for costly allocs or under memory pressure */
1687static bool in_reclaim_compaction(struct scan_control *sc)
1688{
1689 if (COMPACTION_BUILD && sc->order &&
1690 (sc->order > PAGE_ALLOC_COSTLY_ORDER ||
1691 sc->priority < DEF_PRIORITY - 2))
1692 return true;
1693
1694 return false;
1695}
1696
2012/* 1697/*
2013 * Reclaim/compaction depends on a number of pages being freed. To avoid 1698 * Reclaim/compaction is used for high-order allocation requests. It reclaims
2014 * disruption to the system, a small number of order-0 pages continue to be 1699 * order-0 pages before compacting the zone. should_continue_reclaim() returns
2015 * rotated and reclaimed in the normal fashion. However, by the time we get 1700 * true if more pages should be reclaimed such that when the page allocator
2016 * back to the allocator and call try_to_compact_zone(), we ensure that 1701 * calls try_to_compact_zone() that it will have enough free pages to succeed.
2017 * there are enough free pages for it to be likely successful 1702 * It will give up earlier than that if there is difficulty reclaiming pages.
2018 */ 1703 */
2019static inline bool should_continue_reclaim(struct mem_cgroup_zone *mz, 1704static inline bool should_continue_reclaim(struct lruvec *lruvec,
2020 unsigned long nr_reclaimed, 1705 unsigned long nr_reclaimed,
2021 unsigned long nr_scanned, 1706 unsigned long nr_scanned,
2022 struct scan_control *sc) 1707 struct scan_control *sc)
@@ -2025,7 +1710,7 @@ static inline bool should_continue_reclaim(struct mem_cgroup_zone *mz,
2025 unsigned long inactive_lru_pages; 1710 unsigned long inactive_lru_pages;
2026 1711
2027 /* If not in reclaim/compaction mode, stop */ 1712 /* If not in reclaim/compaction mode, stop */
2028 if (!(sc->reclaim_mode & RECLAIM_MODE_COMPACTION)) 1713 if (!in_reclaim_compaction(sc))
2029 return false; 1714 return false;
2030 1715
2031 /* Consider stopping depending on scan and reclaim activity */ 1716 /* Consider stopping depending on scan and reclaim activity */
@@ -2056,15 +1741,15 @@ static inline bool should_continue_reclaim(struct mem_cgroup_zone *mz,
2056 * inactive lists are large enough, continue reclaiming 1741 * inactive lists are large enough, continue reclaiming
2057 */ 1742 */
2058 pages_for_compaction = (2UL << sc->order); 1743 pages_for_compaction = (2UL << sc->order);
2059 inactive_lru_pages = zone_nr_lru_pages(mz, LRU_INACTIVE_FILE); 1744 inactive_lru_pages = get_lru_size(lruvec, LRU_INACTIVE_FILE);
2060 if (nr_swap_pages > 0) 1745 if (nr_swap_pages > 0)
2061 inactive_lru_pages += zone_nr_lru_pages(mz, LRU_INACTIVE_ANON); 1746 inactive_lru_pages += get_lru_size(lruvec, LRU_INACTIVE_ANON);
2062 if (sc->nr_reclaimed < pages_for_compaction && 1747 if (sc->nr_reclaimed < pages_for_compaction &&
2063 inactive_lru_pages > pages_for_compaction) 1748 inactive_lru_pages > pages_for_compaction)
2064 return true; 1749 return true;
2065 1750
2066 /* If compaction would go ahead or the allocation would succeed, stop */ 1751 /* If compaction would go ahead or the allocation would succeed, stop */
2067 switch (compaction_suitable(mz->zone, sc->order)) { 1752 switch (compaction_suitable(lruvec_zone(lruvec), sc->order)) {
2068 case COMPACT_PARTIAL: 1753 case COMPACT_PARTIAL:
2069 case COMPACT_CONTINUE: 1754 case COMPACT_CONTINUE:
2070 return false; 1755 return false;
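
Editor's note: set_reclaim_mode() is replaced by the stateless in_reclaim_compaction() check above: reclaim/compaction applies only to order > 0 requests, and only when the allocation is costly or the scan priority has already dropped below DEF_PRIORITY - 2. should_continue_reclaim() then keeps reclaiming while fewer than 2UL << order pages have been freed and the inactive lists still hold at least that much, stopping early if compaction would already succeed. A compact model of those two decisions follows; COMPACTION_BUILD is assumed enabled, the compaction_suitable() result is a plain parameter, and the scan-activity/__GFP_REPEAT handling is left out.

        #include <stdio.h>
        #include <stdbool.h>

        #define DEF_PRIORITY            12
        #define PAGE_ALLOC_COSTLY_ORDER 3

        /* Mirrors in_reclaim_compaction(): assumes a kernel built with compaction support. */
        static bool in_reclaim_compaction(int order, int priority)
        {
                return order &&
                       (order > PAGE_ALLOC_COSTLY_ORDER || priority < DEF_PRIORITY - 2);
        }

        /* Simplified should_continue_reclaim(): keep going while compaction would still be short. */
        static bool should_continue(int order, int priority,
                                    unsigned long nr_reclaimed, unsigned long inactive_lru_pages,
                                    bool compaction_would_succeed)
        {
                unsigned long pages_for_compaction = 2UL << order;

                if (!in_reclaim_compaction(order, priority))
                        return false;
                if (compaction_would_succeed)
                        return false;
                return nr_reclaimed < pages_for_compaction &&
                       inactive_lru_pages > pages_for_compaction;
        }

        int main(void)
        {
                /* An order-9 (THP-sized) request is costly, so this mode applies at any priority. */
                printf("order-9, priority 12, 100 pages freed: %s\n",
                       should_continue(9, 12, 100, 100000, false) ? "keep reclaiming" : "stop");
                /* An order-2 request only enters this mode once priority has dropped below 10. */
                printf("order-2, priority 11: %s\n",
                       should_continue(2, 11, 0, 100000, false) ? "keep reclaiming" : "stop");
                return 0;
        }
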
@@ -2076,8 +1761,7 @@ static inline bool should_continue_reclaim(struct mem_cgroup_zone *mz,
2076/* 1761/*
2077 * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. 1762 * This is a basic per-zone page freer. Used by both kswapd and direct reclaim.
2078 */ 1763 */
2079static void shrink_mem_cgroup_zone(int priority, struct mem_cgroup_zone *mz, 1764static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
2080 struct scan_control *sc)
2081{ 1765{
2082 unsigned long nr[NR_LRU_LISTS]; 1766 unsigned long nr[NR_LRU_LISTS];
2083 unsigned long nr_to_scan; 1767 unsigned long nr_to_scan;
@@ -2089,7 +1773,7 @@ static void shrink_mem_cgroup_zone(int priority, struct mem_cgroup_zone *mz,
2089restart: 1773restart:
2090 nr_reclaimed = 0; 1774 nr_reclaimed = 0;
2091 nr_scanned = sc->nr_scanned; 1775 nr_scanned = sc->nr_scanned;
2092 get_scan_count(mz, sc, nr, priority); 1776 get_scan_count(lruvec, sc, nr);
2093 1777
2094 blk_start_plug(&plug); 1778 blk_start_plug(&plug);
2095 while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || 1779 while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
@@ -2101,7 +1785,7 @@ restart:
2101 nr[lru] -= nr_to_scan; 1785 nr[lru] -= nr_to_scan;
2102 1786
2103 nr_reclaimed += shrink_list(lru, nr_to_scan, 1787 nr_reclaimed += shrink_list(lru, nr_to_scan,
2104 mz, sc, priority); 1788 lruvec, sc);
2105 } 1789 }
2106 } 1790 }
2107 /* 1791 /*
@@ -2112,7 +1796,8 @@ restart:
2112 * with multiple processes reclaiming pages, the total 1796 * with multiple processes reclaiming pages, the total
2113 * freeing target can get unreasonably large. 1797 * freeing target can get unreasonably large.
2114 */ 1798 */
2115 if (nr_reclaimed >= nr_to_reclaim && priority < DEF_PRIORITY) 1799 if (nr_reclaimed >= nr_to_reclaim &&
1800 sc->priority < DEF_PRIORITY)
2116 break; 1801 break;
2117 } 1802 }
2118 blk_finish_plug(&plug); 1803 blk_finish_plug(&plug);
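
Editor's note: shrink_lruvec() (formerly shrink_mem_cgroup_zone) works through the per-LRU targets from get_scan_count() in round-robin batches of at most SWAP_CLUSTER_MAX pages, and once the reclaim target is met it may stop early unless this is still the lowest-pressure DEF_PRIORITY pass. A stripped-down model of that batching loop; the active lists, the anon-aging tail and the reclaim/compaction restart are left out, and shrink_list() is a stand-in that pretends every scanned page is reclaimed.

        #include <stdio.h>

        #define SWAP_CLUSTER_MAX 32
        #define DEF_PRIORITY     12

        /* Stand-in for shrink_list(): pretend every scanned inactive page is reclaimed. */
        static unsigned long shrink_list(unsigned long nr_to_scan) { return nr_to_scan; }

        static unsigned long shrink_lruvec_model(unsigned long nr[2],   /* inactive anon, file */
                                                 unsigned long nr_to_reclaim, int priority)
        {
                unsigned long nr_reclaimed = 0;
                int lru;

                while (nr[0] || nr[1]) {
                        for (lru = 0; lru < 2; lru++) {
                                unsigned long batch = nr[lru] < SWAP_CLUSTER_MAX ? nr[lru]
                                                                                 : SWAP_CLUSTER_MAX;
                                nr[lru] -= batch;
                                nr_reclaimed += shrink_list(batch);
                        }
                        /* Stop early once the target is met, but only under real pressure. */
                        if (nr_reclaimed >= nr_to_reclaim && priority < DEF_PRIORITY)
                                break;
                }
                return nr_reclaimed;
        }

        int main(void)
        {
                unsigned long nr[2] = { 96, 256 };

                printf("reclaimed %lu pages\n", shrink_lruvec_model(nr, 64, DEF_PRIORITY - 2));
                return 0;
        }
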
@@ -2122,35 +1807,33 @@ restart:
2122 * Even if we did not try to evict anon pages at all, we want to 1807 * Even if we did not try to evict anon pages at all, we want to
2123 * rebalance the anon lru active/inactive ratio. 1808 * rebalance the anon lru active/inactive ratio.
2124 */ 1809 */
2125 if (inactive_anon_is_low(mz)) 1810 if (inactive_anon_is_low(lruvec))
2126 shrink_active_list(SWAP_CLUSTER_MAX, mz, sc, priority, 0); 1811 shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
1812 sc, LRU_ACTIVE_ANON);
2127 1813
2128 /* reclaim/compaction might need reclaim to continue */ 1814 /* reclaim/compaction might need reclaim to continue */
2129 if (should_continue_reclaim(mz, nr_reclaimed, 1815 if (should_continue_reclaim(lruvec, nr_reclaimed,
2130 sc->nr_scanned - nr_scanned, sc)) 1816 sc->nr_scanned - nr_scanned, sc))
2131 goto restart; 1817 goto restart;
2132 1818
2133 throttle_vm_writeout(sc->gfp_mask); 1819 throttle_vm_writeout(sc->gfp_mask);
2134} 1820}
2135 1821
2136static void shrink_zone(int priority, struct zone *zone, 1822static void shrink_zone(struct zone *zone, struct scan_control *sc)
2137 struct scan_control *sc)
2138{ 1823{
2139 struct mem_cgroup *root = sc->target_mem_cgroup; 1824 struct mem_cgroup *root = sc->target_mem_cgroup;
2140 struct mem_cgroup_reclaim_cookie reclaim = { 1825 struct mem_cgroup_reclaim_cookie reclaim = {
2141 .zone = zone, 1826 .zone = zone,
2142 .priority = priority, 1827 .priority = sc->priority,
2143 }; 1828 };
2144 struct mem_cgroup *memcg; 1829 struct mem_cgroup *memcg;
2145 1830
2146 memcg = mem_cgroup_iter(root, NULL, &reclaim); 1831 memcg = mem_cgroup_iter(root, NULL, &reclaim);
2147 do { 1832 do {
2148 struct mem_cgroup_zone mz = { 1833 struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg);
2149 .mem_cgroup = memcg, 1834
2150 .zone = zone, 1835 shrink_lruvec(lruvec, sc);
2151 };
2152 1836
2153 shrink_mem_cgroup_zone(priority, &mz, sc);
2154 /* 1837 /*
2155 * Limit reclaim has historically picked one memcg and 1838 * Limit reclaim has historically picked one memcg and
2156 * scanned it with decreasing priority levels until 1839 * scanned it with decreasing priority levels until
@@ -2226,8 +1909,7 @@ static inline bool compaction_ready(struct zone *zone, struct scan_control *sc)
2226 * the caller that it should consider retrying the allocation instead of 1909 * the caller that it should consider retrying the allocation instead of
2227 * further reclaim. 1910 * further reclaim.
2228 */ 1911 */
2229static bool shrink_zones(int priority, struct zonelist *zonelist, 1912static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
2230 struct scan_control *sc)
2231{ 1913{
2232 struct zoneref *z; 1914 struct zoneref *z;
2233 struct zone *zone; 1915 struct zone *zone;
@@ -2254,7 +1936,8 @@ static bool shrink_zones(int priority, struct zonelist *zonelist,
2254 if (global_reclaim(sc)) { 1936 if (global_reclaim(sc)) {
2255 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) 1937 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
2256 continue; 1938 continue;
2257 if (zone->all_unreclaimable && priority != DEF_PRIORITY) 1939 if (zone->all_unreclaimable &&
1940 sc->priority != DEF_PRIORITY)
2258 continue; /* Let kswapd poll it */ 1941 continue; /* Let kswapd poll it */
2259 if (COMPACTION_BUILD) { 1942 if (COMPACTION_BUILD) {
2260 /* 1943 /*
@@ -2286,7 +1969,7 @@ static bool shrink_zones(int priority, struct zonelist *zonelist,
2286 /* need some check to avoid more shrink_zone() */ 1969
2287 } 1970 }
2288 1971
2289 shrink_zone(priority, zone, sc); 1972 shrink_zone(zone, sc);
2290 } 1973 }
2291 1974
2292 return aborted_reclaim; 1975 return aborted_reclaim;
@@ -2337,7 +2020,6 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
2337 struct scan_control *sc, 2020 struct scan_control *sc,
2338 struct shrink_control *shrink) 2021 struct shrink_control *shrink)
2339{ 2022{
2340 int priority;
2341 unsigned long total_scanned = 0; 2023 unsigned long total_scanned = 0;
2342 struct reclaim_state *reclaim_state = current->reclaim_state; 2024 struct reclaim_state *reclaim_state = current->reclaim_state;
2343 struct zoneref *z; 2025 struct zoneref *z;
@@ -2350,11 +2032,9 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
2350 if (global_reclaim(sc)) 2032 if (global_reclaim(sc))
2351 count_vm_event(ALLOCSTALL); 2033 count_vm_event(ALLOCSTALL);
2352 2034
2353 for (priority = DEF_PRIORITY; priority >= 0; priority--) { 2035 do {
2354 sc->nr_scanned = 0; 2036 sc->nr_scanned = 0;
2355 if (!priority) 2037 aborted_reclaim = shrink_zones(zonelist, sc);
2356 disable_swap_token(sc->target_mem_cgroup);
2357 aborted_reclaim = shrink_zones(priority, zonelist, sc);
2358 2038
2359 /* 2039 /*
2360 * Don't shrink slabs when reclaiming memory from 2040 * Don't shrink slabs when reclaiming memory from
@@ -2396,7 +2076,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
2396 2076
2397 /* Take a nap, wait for some writeback to complete */ 2077 /* Take a nap, wait for some writeback to complete */
2398 if (!sc->hibernation_mode && sc->nr_scanned && 2078 if (!sc->hibernation_mode && sc->nr_scanned &&
2399 priority < DEF_PRIORITY - 2) { 2079 sc->priority < DEF_PRIORITY - 2) {
2400 struct zone *preferred_zone; 2080 struct zone *preferred_zone;
2401 2081
2402 first_zones_zonelist(zonelist, gfp_zone(sc->gfp_mask), 2082 first_zones_zonelist(zonelist, gfp_zone(sc->gfp_mask),
@@ -2404,7 +2084,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
2404 &preferred_zone); 2084 &preferred_zone);
2405 wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/10); 2085 wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/10);
2406 } 2086 }
2407 } 2087 } while (--sc->priority >= 0);
2408 2088
2409out: 2089out:
2410 delayacct_freepages_end(); 2090 delayacct_freepages_end();
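
Editor's note: the explicit "for (priority = DEF_PRIORITY; ...)" loops are gone; priority now lives in scan_control, is initialized to DEF_PRIORITY by the callers, and is decremented by the do/while in do_try_to_free_pages() (and, analogously, in balance_pgdat()) until either enough pages have been reclaimed or priority reaches zero. The swap-token disabling that used to happen at priority 0 is dropped along with the swap token. A minimal model of the new control flow; the shrink_zones() stand-in and its numbers are invented purely to exercise the loop.

        #include <stdio.h>

        #define DEF_PRIORITY 12

        struct scan_control_model {
                int priority;                   /* scan (size >> priority) pages per pass */
                unsigned long nr_to_reclaim;
                unsigned long nr_reclaimed;
        };

        /* Stand-in for shrink_zones(): reclaims more as pressure (a lower priority value) rises. */
        static unsigned long shrink_zones_model(int priority)
        {
                return (unsigned long)(DEF_PRIORITY - priority) * 4;
        }

        static unsigned long do_try_to_free_pages_model(struct scan_control_model *sc)
        {
                do {
                        sc->nr_reclaimed += shrink_zones_model(sc->priority);
                        if (sc->nr_reclaimed >= sc->nr_to_reclaim)
                                break;
                } while (--sc->priority >= 0);
                return sc->nr_reclaimed;
        }

        int main(void)
        {
                struct scan_control_model sc = { .priority = DEF_PRIORITY, .nr_to_reclaim = 32 };
                unsigned long freed = do_try_to_free_pages_model(&sc);

                printf("freed %lu pages, stopped at priority %d\n", freed, sc.priority);
                return 0;
        }
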
@@ -2442,6 +2122,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
2442 .may_unmap = 1, 2122 .may_unmap = 1,
2443 .may_swap = 1, 2123 .may_swap = 1,
2444 .order = order, 2124 .order = order,
2125 .priority = DEF_PRIORITY,
2445 .target_mem_cgroup = NULL, 2126 .target_mem_cgroup = NULL,
2446 .nodemask = nodemask, 2127 .nodemask = nodemask,
2447 }; 2128 };
@@ -2474,17 +2155,15 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg,
2474 .may_unmap = 1, 2155 .may_unmap = 1,
2475 .may_swap = !noswap, 2156 .may_swap = !noswap,
2476 .order = 0, 2157 .order = 0,
2158 .priority = 0,
2477 .target_mem_cgroup = memcg, 2159 .target_mem_cgroup = memcg,
2478 }; 2160 };
2479 struct mem_cgroup_zone mz = { 2161 struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg);
2480 .mem_cgroup = memcg,
2481 .zone = zone,
2482 };
2483 2162
2484 sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | 2163 sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
2485 (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK); 2164 (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
2486 2165
2487 trace_mm_vmscan_memcg_softlimit_reclaim_begin(0, 2166 trace_mm_vmscan_memcg_softlimit_reclaim_begin(sc.order,
2488 sc.may_writepage, 2167 sc.may_writepage,
2489 sc.gfp_mask); 2168 sc.gfp_mask);
2490 2169
@@ -2495,7 +2174,7 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg,
2495 * will pick up pages from other mem cgroups as well. We hack 2174
2496 * the priority and make it zero. 2175 * the priority and make it zero.
2497 */ 2176 */
2498 shrink_mem_cgroup_zone(0, &mz, &sc); 2177 shrink_lruvec(lruvec, &sc);
2499 2178
2500 trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed); 2179 trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed);
2501 2180
@@ -2516,6 +2195,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
2516 .may_swap = !noswap, 2195 .may_swap = !noswap,
2517 .nr_to_reclaim = SWAP_CLUSTER_MAX, 2196 .nr_to_reclaim = SWAP_CLUSTER_MAX,
2518 .order = 0, 2197 .order = 0,
2198 .priority = DEF_PRIORITY,
2519 .target_mem_cgroup = memcg, 2199 .target_mem_cgroup = memcg,
2520 .nodemask = NULL, /* we don't care the placement */ 2200 .nodemask = NULL, /* we don't care the placement */
2521 .gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | 2201 .gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
@@ -2546,8 +2226,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
2546} 2226}
2547#endif 2227#endif
2548 2228
2549static void age_active_anon(struct zone *zone, struct scan_control *sc, 2229static void age_active_anon(struct zone *zone, struct scan_control *sc)
2550 int priority)
2551{ 2230{
2552 struct mem_cgroup *memcg; 2231 struct mem_cgroup *memcg;
2553 2232
@@ -2556,14 +2235,11 @@ static void age_active_anon(struct zone *zone, struct scan_control *sc,
2556 2235
2557 memcg = mem_cgroup_iter(NULL, NULL, NULL); 2236 memcg = mem_cgroup_iter(NULL, NULL, NULL);
2558 do { 2237 do {
2559 struct mem_cgroup_zone mz = { 2238 struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg);
2560 .mem_cgroup = memcg,
2561 .zone = zone,
2562 };
2563 2239
2564 if (inactive_anon_is_low(&mz)) 2240 if (inactive_anon_is_low(lruvec))
2565 shrink_active_list(SWAP_CLUSTER_MAX, &mz, 2241 shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
2566 sc, priority, 0); 2242 sc, LRU_ACTIVE_ANON);
2567 2243
2568 memcg = mem_cgroup_iter(NULL, memcg, NULL); 2244 memcg = mem_cgroup_iter(NULL, memcg, NULL);
2569 } while (memcg); 2245 } while (memcg);
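
The age_active_anon() hunk shows the other recurring conversion: the ad-hoc struct mem_cgroup_zone { memcg, zone } pair is replaced by the lruvec that mem_cgroup_zone_lruvec() returns for that (zone, memcg) combination, the memcg hierarchy is still walked with mem_cgroup_iter(), and shrink_active_list() now takes the lruvec plus an explicit LRU index. A toy model of that lookup and walk; toy_memcg, toy_zone_lruvec(), TOY_NR_ZONES and the LRU numbering are made up for the example:

#include <stdio.h>

#define NR_LRU_LISTS    5      /* inactive/active anon, inactive/active file, unevictable */
#define TOY_NR_ZONES    3

struct lruvec {
        unsigned long nr_pages[NR_LRU_LISTS];
};

/* Toy memcg: one lruvec per zone, children chained in a flat list. */
struct toy_memcg {
        struct lruvec zone_lruvec[TOY_NR_ZONES];
        struct toy_memcg *next;                /* stand-in for mem_cgroup_iter() */
};

/* Rough analogue of mem_cgroup_zone_lruvec(zone, memcg). */
static struct lruvec *toy_zone_lruvec(struct toy_memcg *memcg, int zone_idx)
{
        return &memcg->zone_lruvec[zone_idx];
}

/* Rough analogue of the age_active_anon() walk: one lruvec per memcg for this zone. */
static void toy_age_zone(struct toy_memcg *root, int zone_idx)
{
        struct toy_memcg *memcg;

        for (memcg = root; memcg; memcg = memcg->next) {
                struct lruvec *lruvec = toy_zone_lruvec(memcg, zone_idx);

                /* the kernel would call shrink_active_list(..., lruvec, ...) here */
                printf("memcg %p, zone %d: %lu active anon pages\n",
                       (void *)memcg, zone_idx, lruvec->nr_pages[1]);
        }
}

int main(void)
{
        struct toy_memcg child = { .zone_lruvec[0].nr_pages[1] = 7 };
        struct toy_memcg root  = { .zone_lruvec[0].nr_pages[1] = 3, .next = &child };

        toy_age_zone(&root, 0);
        return 0;
}

The point of the conversion is that the per-LRU state for a (zone, memcg) pair lives in exactly one structure, so the reclaim helpers can be written against a lruvec whether or not memcgs are involved.
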
@@ -2672,7 +2348,6 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
2672{ 2348{
2673 int all_zones_ok; 2349 int all_zones_ok;
2674 unsigned long balanced; 2350 unsigned long balanced;
2675 int priority;
2676 int i; 2351 int i;
2677 int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */ 2352 int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */
2678 unsigned long total_scanned; 2353 unsigned long total_scanned;
@@ -2696,18 +2371,15 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
2696 }; 2371 };
2697loop_again: 2372loop_again:
2698 total_scanned = 0; 2373 total_scanned = 0;
2374 sc.priority = DEF_PRIORITY;
2699 sc.nr_reclaimed = 0; 2375 sc.nr_reclaimed = 0;
2700 sc.may_writepage = !laptop_mode; 2376 sc.may_writepage = !laptop_mode;
2701 count_vm_event(PAGEOUTRUN); 2377 count_vm_event(PAGEOUTRUN);
2702 2378
2703 for (priority = DEF_PRIORITY; priority >= 0; priority--) { 2379 do {
2704 unsigned long lru_pages = 0; 2380 unsigned long lru_pages = 0;
2705 int has_under_min_watermark_zone = 0; 2381 int has_under_min_watermark_zone = 0;
2706 2382
2707 /* The swap token gets in the way of swapout... */
2708 if (!priority)
2709 disable_swap_token(NULL);
2710
2711 all_zones_ok = 1; 2383 all_zones_ok = 1;
2712 balanced = 0; 2384 balanced = 0;
2713 2385
@@ -2721,14 +2393,15 @@ loop_again:
2721 if (!populated_zone(zone)) 2393 if (!populated_zone(zone))
2722 continue; 2394 continue;
2723 2395
2724 if (zone->all_unreclaimable && priority != DEF_PRIORITY) 2396 if (zone->all_unreclaimable &&
2397 sc.priority != DEF_PRIORITY)
2725 continue; 2398 continue;
2726 2399
2727 /* 2400 /*
2728 * Do some background aging of the anon list, to give 2401 * Do some background aging of the anon list, to give
2729 * pages a chance to be referenced before reclaiming. 2402 * pages a chance to be referenced before reclaiming.
2730 */ 2403 */
2731 age_active_anon(zone, &sc, priority); 2404 age_active_anon(zone, &sc);
2732 2405
2733 /* 2406 /*
2734 * If the number of buffer_heads in the machine 2407 * If the number of buffer_heads in the machine
@@ -2776,7 +2449,8 @@ loop_again:
2776 if (!populated_zone(zone)) 2449 if (!populated_zone(zone))
2777 continue; 2450 continue;
2778 2451
2779 if (zone->all_unreclaimable && priority != DEF_PRIORITY) 2452 if (zone->all_unreclaimable &&
2453 sc.priority != DEF_PRIORITY)
2780 continue; 2454 continue;
2781 2455
2782 sc.nr_scanned = 0; 2456 sc.nr_scanned = 0;
@@ -2820,7 +2494,7 @@ loop_again:
2820 !zone_watermark_ok_safe(zone, testorder, 2494 !zone_watermark_ok_safe(zone, testorder,
2821 high_wmark_pages(zone) + balance_gap, 2495 high_wmark_pages(zone) + balance_gap,
2822 end_zone, 0)) { 2496 end_zone, 0)) {
2823 shrink_zone(priority, zone, &sc); 2497 shrink_zone(zone, &sc);
2824 2498
2825 reclaim_state->reclaimed_slab = 0; 2499 reclaim_state->reclaimed_slab = 0;
2826 nr_slab = shrink_slab(&shrink, sc.nr_scanned, lru_pages); 2500 nr_slab = shrink_slab(&shrink, sc.nr_scanned, lru_pages);
@@ -2877,7 +2551,7 @@ loop_again:
2877 * OK, kswapd is getting into trouble. Take a nap, then take 2551 * OK, kswapd is getting into trouble. Take a nap, then take
2878 * another pass across the zones. 2552 * another pass across the zones.
2879 */ 2553 */
2880 if (total_scanned && (priority < DEF_PRIORITY - 2)) { 2554 if (total_scanned && (sc.priority < DEF_PRIORITY - 2)) {
2881 if (has_under_min_watermark_zone) 2555 if (has_under_min_watermark_zone)
2882 count_vm_event(KSWAPD_SKIP_CONGESTION_WAIT); 2556 count_vm_event(KSWAPD_SKIP_CONGESTION_WAIT);
2883 else 2557 else
@@ -2892,7 +2566,7 @@ loop_again:
2892 */ 2566 */
2893 if (sc.nr_reclaimed >= SWAP_CLUSTER_MAX) 2567 if (sc.nr_reclaimed >= SWAP_CLUSTER_MAX)
2894 break; 2568 break;
2895 } 2569 } while (--sc.priority >= 0);
2896out: 2570out:
2897 2571
2898 /* 2572 /*
@@ -2942,7 +2616,8 @@ out:
2942 if (!populated_zone(zone)) 2616 if (!populated_zone(zone))
2943 continue; 2617 continue;
2944 2618
2945 if (zone->all_unreclaimable && priority != DEF_PRIORITY) 2619 if (zone->all_unreclaimable &&
2620 sc.priority != DEF_PRIORITY)
2946 continue; 2621 continue;
2947 2622
2948 /* Would compaction fail due to lack of free memory? */ 2623 /* Would compaction fail due to lack of free memory? */
@@ -3013,7 +2688,10 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
3013 * them before going back to sleep. 2688 * them before going back to sleep.
3014 */ 2689 */
3015 set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold); 2690 set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold);
3016 schedule(); 2691
2692 if (!kthread_should_stop())
2693 schedule();
2694
3017 set_pgdat_percpu_threshold(pgdat, calculate_pressure_threshold); 2695 set_pgdat_percpu_threshold(pgdat, calculate_pressure_threshold);
3018 } else { 2696 } else {
3019 if (remaining) 2697 if (remaining)
@@ -3209,6 +2887,7 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
3209 .nr_to_reclaim = nr_to_reclaim, 2887 .nr_to_reclaim = nr_to_reclaim,
3210 .hibernation_mode = 1, 2888 .hibernation_mode = 1,
3211 .order = 0, 2889 .order = 0,
2890 .priority = DEF_PRIORITY,
3212 }; 2891 };
3213 struct shrink_control shrink = { 2892 struct shrink_control shrink = {
3214 .gfp_mask = sc.gfp_mask, 2893 .gfp_mask = sc.gfp_mask,
@@ -3279,14 +2958,17 @@ int kswapd_run(int nid)
3279} 2958}
3280 2959
3281/* 2960/*
3282 * Called by memory hotplug when all memory in a node is offlined. 2961 * Called by memory hotplug when all memory in a node is offlined. Caller must
2962 * hold lock_memory_hotplug().
3283 */ 2963 */
3284void kswapd_stop(int nid) 2964void kswapd_stop(int nid)
3285{ 2965{
3286 struct task_struct *kswapd = NODE_DATA(nid)->kswapd; 2966 struct task_struct *kswapd = NODE_DATA(nid)->kswapd;
3287 2967
3288 if (kswapd) 2968 if (kswapd) {
3289 kthread_stop(kswapd); 2969 kthread_stop(kswapd);
2970 NODE_DATA(nid)->kswapd = NULL;
2971 }
3290} 2972}
3291 2973
3292static int __init kswapd_init(void) 2974static int __init kswapd_init(void)
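
The two kswapd changes above are about shutting the thread down cleanly when a node is offlined: kswapd_try_to_sleep() re-checks kthread_should_stop() before calling schedule(), so a concurrent kthread_stop() cannot put kswapd to sleep just as it is being asked to exit, and kswapd_stop() clears NODE_DATA(nid)->kswapd so a later kswapd_run() for the same node starts from a clean slate. A rough pthread analogue of the same pattern (stop flag checked under the lock before blocking, handle cleared after the join); every name here is a userspace stand-in, not a kernel API:

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t wake = PTHREAD_COND_INITIALIZER;
static bool should_stop;               /* analogue of kthread_should_stop() */
static pthread_t *worker;              /* analogue of NODE_DATA(nid)->kswapd */

static void *worker_fn(void *arg)
{
        (void)arg;
        pthread_mutex_lock(&lock);
        while (!should_stop)           /* re-check the stop flag before every sleep */
                pthread_cond_wait(&wake, &lock);
        pthread_mutex_unlock(&lock);
        return NULL;
}

/* Analogue of the patched kswapd_stop(): stop the thread, then clear the stale pointer. */
static void worker_stop(void)
{
        if (worker) {
                pthread_mutex_lock(&lock);
                should_stop = true;
                pthread_cond_broadcast(&wake);
                pthread_mutex_unlock(&lock);
                pthread_join(*worker, NULL);
                worker = NULL;
        }
}

int main(void)
{
        pthread_t tid;

        worker = &tid;
        pthread_create(&tid, NULL, worker_fn, NULL);
        worker_stop();
        puts("worker stopped, pointer cleared; a new worker could now be started");
        return 0;
}

Build with cc -pthread; the essential detail is that the stop flag and the wait share one lock, so the stop request can never slip in between the check and the sleep.
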
@@ -3386,7 +3068,6 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
3386 const unsigned long nr_pages = 1 << order; 3068 const unsigned long nr_pages = 1 << order;
3387 struct task_struct *p = current; 3069 struct task_struct *p = current;
3388 struct reclaim_state reclaim_state; 3070 struct reclaim_state reclaim_state;
3389 int priority;
3390 struct scan_control sc = { 3071 struct scan_control sc = {
3391 .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE), 3072 .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE),
3392 .may_unmap = !!(zone_reclaim_mode & RECLAIM_SWAP), 3073 .may_unmap = !!(zone_reclaim_mode & RECLAIM_SWAP),
@@ -3395,6 +3076,7 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
3395 SWAP_CLUSTER_MAX), 3076 SWAP_CLUSTER_MAX),
3396 .gfp_mask = gfp_mask, 3077 .gfp_mask = gfp_mask,
3397 .order = order, 3078 .order = order,
3079 .priority = ZONE_RECLAIM_PRIORITY,
3398 }; 3080 };
3399 struct shrink_control shrink = { 3081 struct shrink_control shrink = {
3400 .gfp_mask = sc.gfp_mask, 3082 .gfp_mask = sc.gfp_mask,
@@ -3417,11 +3099,9 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
3417 * Free memory by calling shrink zone with increasing 3099 * Free memory by calling shrink zone with increasing
3418 * priorities until we have enough memory freed. 3100 * priorities until we have enough memory freed.
3419 */ 3101 */
3420 priority = ZONE_RECLAIM_PRIORITY;
3421 do { 3102 do {
3422 shrink_zone(priority, zone, &sc); 3103 shrink_zone(zone, &sc);
3423 priority--; 3104 } while (sc.nr_reclaimed < nr_pages && --sc.priority >= 0);
3424 } while (priority >= 0 && sc.nr_reclaimed < nr_pages);
3425 } 3105 }
3426 3106
3427 nr_slab_pages0 = zone_page_state(zone, NR_SLAB_RECLAIMABLE); 3107 nr_slab_pages0 = zone_page_state(zone, NR_SLAB_RECLAIMABLE);
@@ -3536,7 +3216,7 @@ int page_evictable(struct page *page, struct vm_area_struct *vma)
3536 if (mapping_unevictable(page_mapping(page))) 3216 if (mapping_unevictable(page_mapping(page)))
3537 return 0; 3217 return 0;
3538 3218
3539 if (PageMlocked(page) || (vma && is_mlocked_vma(vma, page))) 3219 if (PageMlocked(page) || (vma && mlocked_vma_newpage(vma, page)))
3540 return 0; 3220 return 0;
3541 3221
3542 return 1; 3222 return 1;
@@ -3572,6 +3252,7 @@ void check_move_unevictable_pages(struct page **pages, int nr_pages)
3572 zone = pagezone; 3252 zone = pagezone;
3573 spin_lock_irq(&zone->lru_lock); 3253 spin_lock_irq(&zone->lru_lock);
3574 } 3254 }
3255 lruvec = mem_cgroup_page_lruvec(page, zone);
3575 3256
3576 if (!PageLRU(page) || !PageUnevictable(page)) 3257 if (!PageLRU(page) || !PageUnevictable(page))
3577 continue; 3258 continue;
@@ -3581,11 +3262,8 @@ void check_move_unevictable_pages(struct page **pages, int nr_pages)
3581 3262
3582 VM_BUG_ON(PageActive(page)); 3263 VM_BUG_ON(PageActive(page));
3583 ClearPageUnevictable(page); 3264 ClearPageUnevictable(page);
3584 __dec_zone_state(zone, NR_UNEVICTABLE); 3265 del_page_from_lru_list(page, lruvec, LRU_UNEVICTABLE);
3585 lruvec = mem_cgroup_lru_move_lists(zone, page, 3266 add_page_to_lru_list(page, lruvec, lru);
3586 LRU_UNEVICTABLE, lru);
3587 list_move(&page->lru, &lruvec->lists[lru]);
3588 __inc_zone_state(zone, NR_INACTIVE_ANON + lru);
3589 pgrescued++; 3267 pgrescued++;
3590 } 3268 }
3591 } 3269 }
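
The final vmscan.c hunks convert check_move_unevictable_pages() to the lruvec list helpers: instead of open-coding __dec_zone_state()/__inc_zone_state() around a list_move() plus mem_cgroup_lru_move_lists(), the page is taken off and put back with del_page_from_lru_list()/add_page_to_lru_list(), which keep the per-LRU counters inside the helpers. A toy version of that encapsulation; the list layout, type names and the rescue scenario are invented for the example:

#include <stdio.h>

enum toy_lru { LRU_INACTIVE, LRU_UNEVICTABLE, NR_TOY_LRUS };

struct toy_page {
        struct toy_page *next;         /* toy singly linked LRU chain */
        enum toy_lru lru;
};

struct toy_lruvec {
        struct toy_page *lists[NR_TOY_LRUS];
        unsigned long nr_pages[NR_TOY_LRUS];
};

/* Counterpart of add_page_to_lru_list(): the counter update cannot be forgotten. */
static void toy_add_page(struct toy_lruvec *lruvec, struct toy_page *page, enum toy_lru lru)
{
        page->lru = lru;
        page->next = lruvec->lists[lru];
        lruvec->lists[lru] = page;
        lruvec->nr_pages[lru]++;
}

/* Counterpart of del_page_from_lru_list(). */
static void toy_del_page(struct toy_lruvec *lruvec, struct toy_page *page)
{
        struct toy_page **pp = &lruvec->lists[page->lru];

        while (*pp && *pp != page)
                pp = &(*pp)->next;
        if (*pp) {
                *pp = page->next;
                lruvec->nr_pages[page->lru]--;
        }
}

int main(void)
{
        struct toy_lruvec lruvec = { 0 };
        struct toy_page page = { 0 };

        toy_add_page(&lruvec, &page, LRU_UNEVICTABLE);
        /* "rescue" the page the way check_move_unevictable_pages() does */
        toy_del_page(&lruvec, &page);
        toy_add_page(&lruvec, &page, LRU_INACTIVE);

        printf("unevictable=%lu inactive=%lu\n",
               lruvec.nr_pages[LRU_UNEVICTABLE], lruvec.nr_pages[LRU_INACTIVE]);
        return 0;
}

Because the counter update lives inside toy_add_page()/toy_del_page(), a caller can no longer move a page and forget the bookkeeping, which is exactly the class of mistake the open-coded version invited.
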
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 7db1b9bab492..1bbbbd9776ad 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -613,6 +613,9 @@ static char * const migratetype_names[MIGRATE_TYPES] = {
613 "Reclaimable", 613 "Reclaimable",
614 "Movable", 614 "Movable",
615 "Reserve", 615 "Reserve",
616#ifdef CONFIG_CMA
617 "CMA",
618#endif
616 "Isolate", 619 "Isolate",
617}; 620};
618 621
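
The vmstat.c hunk above adds a "CMA" entry to migratetype_names[] under the same CONFIG_CMA conditional that guards the MIGRATE_CMA migrate type, so the name table and the enum stay index-for-index in sync. A small standalone illustration of that pattern with a compile-time length check; the enum values below are illustrative, not copied from the kernel headers:

#include <stdio.h>

#define CONFIG_CMA             /* comment out and the CMA entries vanish together */

enum toy_migratetype {
        MIGRATE_UNMOVABLE,
        MIGRATE_RECLAIMABLE,
        MIGRATE_MOVABLE,
        MIGRATE_RESERVE,
#ifdef CONFIG_CMA
        MIGRATE_CMA,
#endif
        MIGRATE_ISOLATE,
        MIGRATE_TYPES
};

static const char * const migratetype_names[] = {
        "Unmovable",
        "Reclaimable",
        "Movable",
        "Reserve",
#ifdef CONFIG_CMA
        "CMA",
#endif
        "Isolate",
};

/* Build fails if a migrate type is added without a matching name (or vice versa). */
_Static_assert(sizeof(migratetype_names) / sizeof(migratetype_names[0]) == MIGRATE_TYPES,
               "migratetype_names out of sync with enum toy_migratetype");

int main(void)
{
        int i;

        for (i = 0; i < MIGRATE_TYPES; i++)
                printf("%d: %s\n", i, migratetype_names[i]);
        return 0;
}

In this sketch the table is declared without an explicit size so the _Static_assert can catch a mismatch at build time rather than leaving a hole in the name output.
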
@@ -1220,7 +1223,6 @@ module_init(setup_vmstat)
1220#if defined(CONFIG_DEBUG_FS) && defined(CONFIG_COMPACTION) 1223#if defined(CONFIG_DEBUG_FS) && defined(CONFIG_COMPACTION)
1221#include <linux/debugfs.h> 1224#include <linux/debugfs.h>
1222 1225
1223static struct dentry *extfrag_debug_root;
1224 1226
1225/* 1227/*
1226 * Return an index indicating how much of the available free memory is 1228 * Return an index indicating how much of the available free memory is
@@ -1358,19 +1360,24 @@ static const struct file_operations extfrag_file_ops = {
1358 1360
1359static int __init extfrag_debug_init(void) 1361static int __init extfrag_debug_init(void)
1360{ 1362{
1363 struct dentry *extfrag_debug_root;
1364
1361 extfrag_debug_root = debugfs_create_dir("extfrag", NULL); 1365 extfrag_debug_root = debugfs_create_dir("extfrag", NULL);
1362 if (!extfrag_debug_root) 1366 if (!extfrag_debug_root)
1363 return -ENOMEM; 1367 return -ENOMEM;
1364 1368
1365 if (!debugfs_create_file("unusable_index", 0444, 1369 if (!debugfs_create_file("unusable_index", 0444,
1366 extfrag_debug_root, NULL, &unusable_file_ops)) 1370 extfrag_debug_root, NULL, &unusable_file_ops))
1367 return -ENOMEM; 1371 goto fail;
1368 1372
1369 if (!debugfs_create_file("extfrag_index", 0444, 1373 if (!debugfs_create_file("extfrag_index", 0444,
1370 extfrag_debug_root, NULL, &extfrag_file_ops)) 1374 extfrag_debug_root, NULL, &extfrag_file_ops))
1371 return -ENOMEM; 1375 goto fail;
1372 1376
1373 return 0; 1377 return 0;
1378fail:
1379 debugfs_remove_recursive(extfrag_debug_root);
1380 return -ENOMEM;
1374} 1381}
1375 1382
1376module_init(extfrag_debug_init); 1383module_init(extfrag_debug_init);
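
The extfrag_debug_init() change does two things: extfrag_debug_root is only used during init, so it becomes a local variable, and a failure to create either debugfs file now jumps to a fail label that tears the whole directory down with debugfs_remove_recursive() instead of returning -ENOMEM and leaking whatever was already created. A plain userspace sketch of the same goto-fail cleanup shape; the directory and file names are made up and real debugfs is not involved:

#include <errno.h>
#include <stdio.h>
#include <sys/stat.h>
#include <unistd.h>

/* Create dir/name; 0 on success, -1 on failure. */
static int create_file(const char *dir, const char *name)
{
        char path[512];
        FILE *f;

        snprintf(path, sizeof(path), "%s/%s", dir, name);
        f = fopen(path, "w");
        if (!f)
                return -1;
        fclose(f);
        return 0;
}

static void remove_file(const char *dir, const char *name)
{
        char path[512];

        snprintf(path, sizeof(path), "%s/%s", dir, name);
        unlink(path);
}

static int extfrag_like_init(const char *root)
{
        if (mkdir(root, 0755) && errno != EEXIST)
                return -1;

        if (create_file(root, "unusable_index"))
                goto fail;
        if (create_file(root, "extfrag_index"))
                goto fail;

        return 0;

fail:
        /* one cleanup path for every partial failure, like debugfs_remove_recursive() */
        remove_file(root, "unusable_index");
        remove_file(root, "extfrag_index");
        rmdir(root);
        return -1;
}

int main(void)
{
        if (extfrag_like_init("/tmp/extfrag_demo") == 0)
                puts("created /tmp/extfrag_demo with both files");
        else
                fprintf(stderr, "init failed and was cleaned up\n");
        return 0;
}

The single fail label is enough because it only runs once the directory exists and the cleanup tolerates partially created state, which is also what lets the kernel version use one debugfs_remove_recursive() call for every failure point.
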