author	Linus Torvalds <torvalds@linux-foundation.org>	2016-07-28 19:36:48 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2016-07-28 19:36:48 -0400
commit	1c88e19b0f6a8471ee50d5062721ba30b8fd4ba9 (patch)
tree	6d227487ca2cf391589c73af1c40ec7b7126feec
parent	6039b80eb50a893476fea7d56e86ed2d19290054 (diff)
parent	c3486f5376696034d0fcbef8ba70c70cfcb26f51 (diff)
Merge branch 'akpm' (patches from Andrew)
Merge more updates from Andrew Morton:
 "The rest of MM"

* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (101 commits)
  mm, compaction: simplify contended compaction handling
  mm, compaction: introduce direct compaction priority
  mm, thp: remove __GFP_NORETRY from khugepaged and madvised allocations
  mm, page_alloc: make THP-specific decisions more generic
  mm, page_alloc: restructure direct compaction handling in slowpath
  mm, page_alloc: don't retry initial attempt in slowpath
  mm, page_alloc: set alloc_flags only once in slowpath
  lib/stackdepot.c: use __GFP_NOWARN for stack allocations
  mm, kasan: switch SLUB to stackdepot, enable memory quarantine for SLUB
  mm, kasan: account for object redzone in SLUB's nearest_obj()
  mm: fix use-after-free if memory allocation failed in vma_adjust()
  zsmalloc: Delete an unnecessary check before the function call "iput"
  mm/memblock.c: fix index adjustment error in __next_mem_range_rev()
  mem-hotplug: alloc new page from a nearest neighbor node when mem-offline
  mm: optimize copy_page_to/from_iter_iovec
  mm: add cond_resched() to generic_swapfile_activate()
  Revert "mm, mempool: only set __GFP_NOMEMALLOC if there are free elements"
  mm, compaction: don't isolate PageWriteback pages in MIGRATE_SYNC_LIGHT mode
  mm: hwpoison: remove incorrect comments
  make __section_nr() more efficient
  ...
-rw-r--r--Documentation/cgroup-v1/memcg_test.txt4
-rw-r--r--Documentation/cgroup-v1/memory.txt4
-rw-r--r--arch/arm64/mm/init.c2
-rw-r--r--arch/s390/appldata/appldata_mem.c2
-rw-r--r--arch/tile/mm/pgtable.c18
-rw-r--r--drivers/base/node.c78
-rw-r--r--drivers/staging/android/lowmemorykiller.c12
-rw-r--r--drivers/staging/lustre/lustre/osc/osc_cache.c6
-rw-r--r--fs/fs-writeback.c4
-rw-r--r--fs/fuse/file.c8
-rw-r--r--fs/nfs/internal.h2
-rw-r--r--fs/nfs/write.c2
-rw-r--r--fs/proc/base.c185
-rw-r--r--fs/proc/meminfo.c22
-rw-r--r--include/linux/backing-dev.h2
-rw-r--r--include/linux/compaction.h33
-rw-r--r--include/linux/gfp.h14
-rw-r--r--include/linux/huge_mm.h2
-rw-r--r--include/linux/kasan.h2
-rw-r--r--include/linux/kdb.h2
-rw-r--r--include/linux/memblock.h1
-rw-r--r--include/linux/memcontrol.h70
-rw-r--r--include/linux/memremap.h2
-rw-r--r--include/linux/mm.h17
-rw-r--r--include/linux/mm_inline.h19
-rw-r--r--include/linux/mm_types.h2
-rw-r--r--include/linux/mmzone.h170
-rw-r--r--include/linux/oom.h26
-rw-r--r--include/linux/sched.h27
-rw-r--r--include/linux/slab_def.h3
-rw-r--r--include/linux/slub_def.h14
-rw-r--r--include/linux/swap.h23
-rw-r--r--include/linux/topology.h2
-rw-r--r--include/linux/vm_event_item.h14
-rw-r--r--include/linux/vmstat.h111
-rw-r--r--include/linux/writeback.h2
-rw-r--r--include/trace/events/compaction.h12
-rw-r--r--include/trace/events/mmflags.h1
-rw-r--r--include/trace/events/vmscan.h63
-rw-r--r--include/trace/events/writeback.h10
-rw-r--r--kernel/cpuset.c9
-rw-r--r--kernel/fork.c20
-rw-r--r--kernel/freezer.c2
-rw-r--r--kernel/memremap.c8
-rw-r--r--kernel/power/snapshot.c10
-rw-r--r--kernel/printk/printk.c5
-rw-r--r--kernel/sysctl.c4
-rw-r--r--lib/Kconfig.kasan4
-rw-r--r--lib/iov_iter.c8
-rw-r--r--lib/stackdepot.c1
-rw-r--r--mm/Kconfig2
-rw-r--r--mm/backing-dev.c15
-rw-r--r--mm/compaction.c113
-rw-r--r--mm/filemap.c16
-rw-r--r--mm/huge_memory.c56
-rw-r--r--mm/hugetlb.c1
-rw-r--r--mm/internal.h16
-rw-r--r--mm/kasan/Makefile3
-rw-r--r--mm/kasan/kasan.c63
-rw-r--r--mm/kasan/kasan.h3
-rw-r--r--mm/kasan/report.c8
-rw-r--r--mm/khugepaged.c16
-rw-r--r--mm/kmemleak.c4
-rw-r--r--mm/memblock.c61
-rw-r--r--mm/memcontrol.c260
-rw-r--r--mm/memory-failure.c6
-rw-r--r--mm/memory_hotplug.c45
-rw-r--r--mm/mempolicy.c2
-rw-r--r--mm/mempool.c18
-rw-r--r--mm/migrate.c39
-rw-r--r--mm/mlock.c12
-rw-r--r--mm/mmap.c20
-rw-r--r--mm/oom_kill.c187
-rw-r--r--mm/page-writeback.c128
-rw-r--r--mm/page_alloc.c557
-rw-r--r--mm/page_idle.c4
-rw-r--r--mm/page_io.c2
-rw-r--r--mm/rmap.c26
-rw-r--r--mm/shmem.c14
-rw-r--r--mm/slab.h2
-rw-r--r--mm/slub.c59
-rw-r--r--mm/sparse.c12
-rw-r--r--mm/swap.c76
-rw-r--r--mm/swap_state.c4
-rw-r--r--mm/util.c4
-rw-r--r--mm/vmscan.c1023
-rw-r--r--mm/vmstat.c417
-rw-r--r--mm/workingset.c62
-rw-r--r--mm/zsmalloc.c74
-rw-r--r--tools/perf/builtin-kmem.c1
90 files changed, 2517 insertions, 1978 deletions
diff --git a/Documentation/cgroup-v1/memcg_test.txt b/Documentation/cgroup-v1/memcg_test.txt
index 8870b0212150..78a8c2963b38 100644
--- a/Documentation/cgroup-v1/memcg_test.txt
+++ b/Documentation/cgroup-v1/memcg_test.txt
@@ -107,9 +107,9 @@ Under below explanation, we assume CONFIG_MEM_RES_CTRL_SWAP=y.
107 107
1088. LRU 1088. LRU
109 Each memcg has its own private LRU. Now, its handling is under global 109 Each memcg has its own private LRU. Now, its handling is under global
110 VM's control (means that it's handled under global zone->lru_lock). 110 VM's control (means that it's handled under global zone_lru_lock).
111 Almost all routines around memcg's LRU is called by global LRU's 111 Almost all routines around memcg's LRU is called by global LRU's
112 list management functions under zone->lru_lock(). 112 list management functions under zone_lru_lock().
113 113
114 A special function is mem_cgroup_isolate_pages(). This scans 114 A special function is mem_cgroup_isolate_pages(). This scans
115 memcg's private LRU and call __isolate_lru_page() to extract a page 115 memcg's private LRU and call __isolate_lru_page() to extract a page
diff --git a/Documentation/cgroup-v1/memory.txt b/Documentation/cgroup-v1/memory.txt
index b14abf217239..946e69103cdd 100644
--- a/Documentation/cgroup-v1/memory.txt
+++ b/Documentation/cgroup-v1/memory.txt
@@ -267,11 +267,11 @@ When oom event notifier is registered, event will be delivered.
267 Other lock order is following: 267 Other lock order is following:
268 PG_locked. 268 PG_locked.
269 mm->page_table_lock 269 mm->page_table_lock
270 zone->lru_lock 270 zone_lru_lock
271 lock_page_cgroup. 271 lock_page_cgroup.
272 In many cases, just lock_page_cgroup() is called. 272 In many cases, just lock_page_cgroup() is called.
273 per-zone-per-cgroup LRU (cgroup's private LRU) is just guarded by 273 per-zone-per-cgroup LRU (cgroup's private LRU) is just guarded by
274 zone->lru_lock, it has no lock of its own. 274 zone_lru_lock, it has no lock of its own.
275 275
2762.7 Kernel Memory Extension (CONFIG_MEMCG_KMEM) 2762.7 Kernel Memory Extension (CONFIG_MEMCG_KMEM)
277 277
diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c
index 2ade7a6a10a7..bbb7ee76e319 100644
--- a/arch/arm64/mm/init.c
+++ b/arch/arm64/mm/init.c
@@ -224,7 +224,7 @@ void __init arm64_memblock_init(void)
224 * via the linear mapping. 224 * via the linear mapping.
225 */ 225 */
226 if (memory_limit != (phys_addr_t)ULLONG_MAX) { 226 if (memory_limit != (phys_addr_t)ULLONG_MAX) {
227 memblock_enforce_memory_limit(memory_limit); 227 memblock_mem_limit_remove_map(memory_limit);
228 memblock_add(__pa(_text), (u64)(_end - _text)); 228 memblock_add(__pa(_text), (u64)(_end - _text));
229 } 229 }
230 230
diff --git a/arch/s390/appldata/appldata_mem.c b/arch/s390/appldata/appldata_mem.c
index edcf2a706942..598df5708501 100644
--- a/arch/s390/appldata/appldata_mem.c
+++ b/arch/s390/appldata/appldata_mem.c
@@ -102,7 +102,7 @@ static void appldata_get_mem_data(void *data)
102 mem_data->totalhigh = P2K(val.totalhigh); 102 mem_data->totalhigh = P2K(val.totalhigh);
103 mem_data->freehigh = P2K(val.freehigh); 103 mem_data->freehigh = P2K(val.freehigh);
104 mem_data->bufferram = P2K(val.bufferram); 104 mem_data->bufferram = P2K(val.bufferram);
105 mem_data->cached = P2K(global_page_state(NR_FILE_PAGES) 105 mem_data->cached = P2K(global_node_page_state(NR_FILE_PAGES)
106 - val.bufferram); 106 - val.bufferram);
107 107
108 si_swapinfo(&val); 108 si_swapinfo(&val);
diff --git a/arch/tile/mm/pgtable.c b/arch/tile/mm/pgtable.c
index c4d5bf841a7f..7cc6ee7f1a58 100644
--- a/arch/tile/mm/pgtable.c
+++ b/arch/tile/mm/pgtable.c
@@ -45,20 +45,20 @@ void show_mem(unsigned int filter)
45 struct zone *zone; 45 struct zone *zone;
46 46
47 pr_err("Active:%lu inactive:%lu dirty:%lu writeback:%lu unstable:%lu free:%lu\n slab:%lu mapped:%lu pagetables:%lu bounce:%lu pagecache:%lu swap:%lu\n", 47 pr_err("Active:%lu inactive:%lu dirty:%lu writeback:%lu unstable:%lu free:%lu\n slab:%lu mapped:%lu pagetables:%lu bounce:%lu pagecache:%lu swap:%lu\n",
48 (global_page_state(NR_ACTIVE_ANON) + 48 (global_node_page_state(NR_ACTIVE_ANON) +
49 global_page_state(NR_ACTIVE_FILE)), 49 global_node_page_state(NR_ACTIVE_FILE)),
50 (global_page_state(NR_INACTIVE_ANON) + 50 (global_node_page_state(NR_INACTIVE_ANON) +
51 global_page_state(NR_INACTIVE_FILE)), 51 global_node_page_state(NR_INACTIVE_FILE)),
52 global_page_state(NR_FILE_DIRTY), 52 global_node_page_state(NR_FILE_DIRTY),
53 global_page_state(NR_WRITEBACK), 53 global_node_page_state(NR_WRITEBACK),
54 global_page_state(NR_UNSTABLE_NFS), 54 global_node_page_state(NR_UNSTABLE_NFS),
55 global_page_state(NR_FREE_PAGES), 55 global_page_state(NR_FREE_PAGES),
56 (global_page_state(NR_SLAB_RECLAIMABLE) + 56 (global_page_state(NR_SLAB_RECLAIMABLE) +
57 global_page_state(NR_SLAB_UNRECLAIMABLE)), 57 global_page_state(NR_SLAB_UNRECLAIMABLE)),
58 global_page_state(NR_FILE_MAPPED), 58 global_node_page_state(NR_FILE_MAPPED),
59 global_page_state(NR_PAGETABLE), 59 global_page_state(NR_PAGETABLE),
60 global_page_state(NR_BOUNCE), 60 global_page_state(NR_BOUNCE),
61 global_page_state(NR_FILE_PAGES), 61 global_node_page_state(NR_FILE_PAGES),
62 get_nr_swap_pages()); 62 get_nr_swap_pages());
63 63
64 for_each_zone(zone) { 64 for_each_zone(zone) {
diff --git a/drivers/base/node.c b/drivers/base/node.c
index 51c7db2c4ee2..29cd96661b30 100644
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -56,6 +56,7 @@ static ssize_t node_read_meminfo(struct device *dev,
56{ 56{
57 int n; 57 int n;
58 int nid = dev->id; 58 int nid = dev->id;
59 struct pglist_data *pgdat = NODE_DATA(nid);
59 struct sysinfo i; 60 struct sysinfo i;
60 61
61 si_meminfo_node(&i, nid); 62 si_meminfo_node(&i, nid);
@@ -74,16 +75,16 @@ static ssize_t node_read_meminfo(struct device *dev,
74 nid, K(i.totalram), 75 nid, K(i.totalram),
75 nid, K(i.freeram), 76 nid, K(i.freeram),
76 nid, K(i.totalram - i.freeram), 77 nid, K(i.totalram - i.freeram),
77 nid, K(node_page_state(nid, NR_ACTIVE_ANON) + 78 nid, K(node_page_state(pgdat, NR_ACTIVE_ANON) +
78 node_page_state(nid, NR_ACTIVE_FILE)), 79 node_page_state(pgdat, NR_ACTIVE_FILE)),
79 nid, K(node_page_state(nid, NR_INACTIVE_ANON) + 80 nid, K(node_page_state(pgdat, NR_INACTIVE_ANON) +
80 node_page_state(nid, NR_INACTIVE_FILE)), 81 node_page_state(pgdat, NR_INACTIVE_FILE)),
81 nid, K(node_page_state(nid, NR_ACTIVE_ANON)), 82 nid, K(node_page_state(pgdat, NR_ACTIVE_ANON)),
82 nid, K(node_page_state(nid, NR_INACTIVE_ANON)), 83 nid, K(node_page_state(pgdat, NR_INACTIVE_ANON)),
83 nid, K(node_page_state(nid, NR_ACTIVE_FILE)), 84 nid, K(node_page_state(pgdat, NR_ACTIVE_FILE)),
84 nid, K(node_page_state(nid, NR_INACTIVE_FILE)), 85 nid, K(node_page_state(pgdat, NR_INACTIVE_FILE)),
85 nid, K(node_page_state(nid, NR_UNEVICTABLE)), 86 nid, K(node_page_state(pgdat, NR_UNEVICTABLE)),
86 nid, K(node_page_state(nid, NR_MLOCK))); 87 nid, K(sum_zone_node_page_state(nid, NR_MLOCK)));
87 88
88#ifdef CONFIG_HIGHMEM 89#ifdef CONFIG_HIGHMEM
89 n += sprintf(buf + n, 90 n += sprintf(buf + n,
@@ -117,31 +118,30 @@ static ssize_t node_read_meminfo(struct device *dev,
117 "Node %d ShmemPmdMapped: %8lu kB\n" 118 "Node %d ShmemPmdMapped: %8lu kB\n"
118#endif 119#endif
119 , 120 ,
120 nid, K(node_page_state(nid, NR_FILE_DIRTY)), 121 nid, K(node_page_state(pgdat, NR_FILE_DIRTY)),
121 nid, K(node_page_state(nid, NR_WRITEBACK)), 122 nid, K(node_page_state(pgdat, NR_WRITEBACK)),
122 nid, K(node_page_state(nid, NR_FILE_PAGES)), 123 nid, K(node_page_state(pgdat, NR_FILE_PAGES)),
123 nid, K(node_page_state(nid, NR_FILE_MAPPED)), 124 nid, K(node_page_state(pgdat, NR_FILE_MAPPED)),
124 nid, K(node_page_state(nid, NR_ANON_PAGES)), 125 nid, K(node_page_state(pgdat, NR_ANON_MAPPED)),
125 nid, K(i.sharedram), 126 nid, K(i.sharedram),
126 nid, node_page_state(nid, NR_KERNEL_STACK) * 127 nid, sum_zone_node_page_state(nid, NR_KERNEL_STACK_KB),
127 THREAD_SIZE / 1024, 128 nid, K(sum_zone_node_page_state(nid, NR_PAGETABLE)),
128 nid, K(node_page_state(nid, NR_PAGETABLE)), 129 nid, K(node_page_state(pgdat, NR_UNSTABLE_NFS)),
129 nid, K(node_page_state(nid, NR_UNSTABLE_NFS)), 130 nid, K(sum_zone_node_page_state(nid, NR_BOUNCE)),
130 nid, K(node_page_state(nid, NR_BOUNCE)), 131 nid, K(node_page_state(pgdat, NR_WRITEBACK_TEMP)),
131 nid, K(node_page_state(nid, NR_WRITEBACK_TEMP)), 132 nid, K(sum_zone_node_page_state(nid, NR_SLAB_RECLAIMABLE) +
132 nid, K(node_page_state(nid, NR_SLAB_RECLAIMABLE) + 133 sum_zone_node_page_state(nid, NR_SLAB_UNRECLAIMABLE)),
133 node_page_state(nid, NR_SLAB_UNRECLAIMABLE)), 134 nid, K(sum_zone_node_page_state(nid, NR_SLAB_RECLAIMABLE)),
134 nid, K(node_page_state(nid, NR_SLAB_RECLAIMABLE)),
135#ifdef CONFIG_TRANSPARENT_HUGEPAGE 135#ifdef CONFIG_TRANSPARENT_HUGEPAGE
136 nid, K(node_page_state(nid, NR_SLAB_UNRECLAIMABLE)), 136 nid, K(sum_zone_node_page_state(nid, NR_SLAB_UNRECLAIMABLE)),
137 nid, K(node_page_state(nid, NR_ANON_THPS) * 137 nid, K(node_page_state(pgdat, NR_ANON_THPS) *
138 HPAGE_PMD_NR), 138 HPAGE_PMD_NR),
139 nid, K(node_page_state(nid, NR_SHMEM_THPS) * 139 nid, K(node_page_state(pgdat, NR_SHMEM_THPS) *
140 HPAGE_PMD_NR), 140 HPAGE_PMD_NR),
141 nid, K(node_page_state(nid, NR_SHMEM_PMDMAPPED) * 141 nid, K(node_page_state(pgdat, NR_SHMEM_PMDMAPPED) *
142 HPAGE_PMD_NR)); 142 HPAGE_PMD_NR));
143#else 143#else
144 nid, K(node_page_state(nid, NR_SLAB_UNRECLAIMABLE))); 144 nid, K(sum_zone_node_page_state(nid, NR_SLAB_UNRECLAIMABLE)));
145#endif 145#endif
146 n += hugetlb_report_node_meminfo(nid, buf + n); 146 n += hugetlb_report_node_meminfo(nid, buf + n);
147 return n; 147 return n;
@@ -160,12 +160,12 @@ static ssize_t node_read_numastat(struct device *dev,
160 "interleave_hit %lu\n" 160 "interleave_hit %lu\n"
161 "local_node %lu\n" 161 "local_node %lu\n"
162 "other_node %lu\n", 162 "other_node %lu\n",
163 node_page_state(dev->id, NUMA_HIT), 163 sum_zone_node_page_state(dev->id, NUMA_HIT),
164 node_page_state(dev->id, NUMA_MISS), 164 sum_zone_node_page_state(dev->id, NUMA_MISS),
165 node_page_state(dev->id, NUMA_FOREIGN), 165 sum_zone_node_page_state(dev->id, NUMA_FOREIGN),
166 node_page_state(dev->id, NUMA_INTERLEAVE_HIT), 166 sum_zone_node_page_state(dev->id, NUMA_INTERLEAVE_HIT),
167 node_page_state(dev->id, NUMA_LOCAL), 167 sum_zone_node_page_state(dev->id, NUMA_LOCAL),
168 node_page_state(dev->id, NUMA_OTHER)); 168 sum_zone_node_page_state(dev->id, NUMA_OTHER));
169} 169}
170static DEVICE_ATTR(numastat, S_IRUGO, node_read_numastat, NULL); 170static DEVICE_ATTR(numastat, S_IRUGO, node_read_numastat, NULL);
171 171
@@ -173,12 +173,18 @@ static ssize_t node_read_vmstat(struct device *dev,
173 struct device_attribute *attr, char *buf) 173 struct device_attribute *attr, char *buf)
174{ 174{
175 int nid = dev->id; 175 int nid = dev->id;
176 struct pglist_data *pgdat = NODE_DATA(nid);
176 int i; 177 int i;
177 int n = 0; 178 int n = 0;
178 179
179 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) 180 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
180 n += sprintf(buf+n, "%s %lu\n", vmstat_text[i], 181 n += sprintf(buf+n, "%s %lu\n", vmstat_text[i],
181 node_page_state(nid, i)); 182 sum_zone_node_page_state(nid, i));
183
184 for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
185 n += sprintf(buf+n, "%s %lu\n",
186 vmstat_text[i + NR_VM_ZONE_STAT_ITEMS],
187 node_page_state(pgdat, i));
182 188
183 return n; 189 return n;
184} 190}
diff --git a/drivers/staging/android/lowmemorykiller.c b/drivers/staging/android/lowmemorykiller.c
index 24d2745e9437..45a1b4ec4ca3 100644
--- a/drivers/staging/android/lowmemorykiller.c
+++ b/drivers/staging/android/lowmemorykiller.c
@@ -72,10 +72,10 @@ static unsigned long lowmem_deathpending_timeout;
72static unsigned long lowmem_count(struct shrinker *s, 72static unsigned long lowmem_count(struct shrinker *s,
73 struct shrink_control *sc) 73 struct shrink_control *sc)
74{ 74{
75 return global_page_state(NR_ACTIVE_ANON) + 75 return global_node_page_state(NR_ACTIVE_ANON) +
76 global_page_state(NR_ACTIVE_FILE) + 76 global_node_page_state(NR_ACTIVE_FILE) +
77 global_page_state(NR_INACTIVE_ANON) + 77 global_node_page_state(NR_INACTIVE_ANON) +
78 global_page_state(NR_INACTIVE_FILE); 78 global_node_page_state(NR_INACTIVE_FILE);
79} 79}
80 80
81static unsigned long lowmem_scan(struct shrinker *s, struct shrink_control *sc) 81static unsigned long lowmem_scan(struct shrinker *s, struct shrink_control *sc)
@@ -91,8 +91,8 @@ static unsigned long lowmem_scan(struct shrinker *s, struct shrink_control *sc)
91 short selected_oom_score_adj; 91 short selected_oom_score_adj;
92 int array_size = ARRAY_SIZE(lowmem_adj); 92 int array_size = ARRAY_SIZE(lowmem_adj);
93 int other_free = global_page_state(NR_FREE_PAGES) - totalreserve_pages; 93 int other_free = global_page_state(NR_FREE_PAGES) - totalreserve_pages;
94 int other_file = global_page_state(NR_FILE_PAGES) - 94 int other_file = global_node_page_state(NR_FILE_PAGES) -
95 global_page_state(NR_SHMEM) - 95 global_node_page_state(NR_SHMEM) -
96 total_swapcache_pages(); 96 total_swapcache_pages();
97 97
98 if (lowmem_adj_size < array_size) 98 if (lowmem_adj_size < array_size)
diff --git a/drivers/staging/lustre/lustre/osc/osc_cache.c b/drivers/staging/lustre/lustre/osc/osc_cache.c
index d1a7d6beee60..d011135802d5 100644
--- a/drivers/staging/lustre/lustre/osc/osc_cache.c
+++ b/drivers/staging/lustre/lustre/osc/osc_cache.c
@@ -1864,7 +1864,8 @@ void osc_dec_unstable_pages(struct ptlrpc_request *req)
1864 LASSERT(page_count >= 0); 1864 LASSERT(page_count >= 0);
1865 1865
1866 for (i = 0; i < page_count; i++) 1866 for (i = 0; i < page_count; i++)
1867 dec_zone_page_state(desc->bd_iov[i].kiov_page, NR_UNSTABLE_NFS); 1867 dec_node_page_state(desc->bd_iov[i].kiov_page,
1868 NR_UNSTABLE_NFS);
1868 1869
1869 atomic_sub(page_count, &cli->cl_cache->ccc_unstable_nr); 1870 atomic_sub(page_count, &cli->cl_cache->ccc_unstable_nr);
1870 LASSERT(atomic_read(&cli->cl_cache->ccc_unstable_nr) >= 0); 1871 LASSERT(atomic_read(&cli->cl_cache->ccc_unstable_nr) >= 0);
@@ -1898,7 +1899,8 @@ void osc_inc_unstable_pages(struct ptlrpc_request *req)
1898 LASSERT(page_count >= 0); 1899 LASSERT(page_count >= 0);
1899 1900
1900 for (i = 0; i < page_count; i++) 1901 for (i = 0; i < page_count; i++)
1901 inc_zone_page_state(desc->bd_iov[i].kiov_page, NR_UNSTABLE_NFS); 1902 inc_node_page_state(desc->bd_iov[i].kiov_page,
1903 NR_UNSTABLE_NFS);
1902 1904
1903 LASSERT(atomic_read(&cli->cl_cache->ccc_unstable_nr) >= 0); 1905 LASSERT(atomic_read(&cli->cl_cache->ccc_unstable_nr) >= 0);
1904 atomic_add(page_count, &cli->cl_cache->ccc_unstable_nr); 1906 atomic_add(page_count, &cli->cl_cache->ccc_unstable_nr);
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 6f9c9f6f5157..56c8fda436c0 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -1807,8 +1807,8 @@ static struct wb_writeback_work *get_next_work_item(struct bdi_writeback *wb)
1807 */ 1807 */
1808static unsigned long get_nr_dirty_pages(void) 1808static unsigned long get_nr_dirty_pages(void)
1809{ 1809{
1810 return global_page_state(NR_FILE_DIRTY) + 1810 return global_node_page_state(NR_FILE_DIRTY) +
1811 global_page_state(NR_UNSTABLE_NFS) + 1811 global_node_page_state(NR_UNSTABLE_NFS) +
1812 get_nr_dirty_inodes(); 1812 get_nr_dirty_inodes();
1813} 1813}
1814 1814
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 9154f8679024..2382f22a2a8b 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -1452,7 +1452,7 @@ static void fuse_writepage_finish(struct fuse_conn *fc, struct fuse_req *req)
1452 list_del(&req->writepages_entry); 1452 list_del(&req->writepages_entry);
1453 for (i = 0; i < req->num_pages; i++) { 1453 for (i = 0; i < req->num_pages; i++) {
1454 dec_wb_stat(&bdi->wb, WB_WRITEBACK); 1454 dec_wb_stat(&bdi->wb, WB_WRITEBACK);
1455 dec_zone_page_state(req->pages[i], NR_WRITEBACK_TEMP); 1455 dec_node_page_state(req->pages[i], NR_WRITEBACK_TEMP);
1456 wb_writeout_inc(&bdi->wb); 1456 wb_writeout_inc(&bdi->wb);
1457 } 1457 }
1458 wake_up(&fi->page_waitq); 1458 wake_up(&fi->page_waitq);
@@ -1642,7 +1642,7 @@ static int fuse_writepage_locked(struct page *page)
1642 req->inode = inode; 1642 req->inode = inode;
1643 1643
1644 inc_wb_stat(&inode_to_bdi(inode)->wb, WB_WRITEBACK); 1644 inc_wb_stat(&inode_to_bdi(inode)->wb, WB_WRITEBACK);
1645 inc_zone_page_state(tmp_page, NR_WRITEBACK_TEMP); 1645 inc_node_page_state(tmp_page, NR_WRITEBACK_TEMP);
1646 1646
1647 spin_lock(&fc->lock); 1647 spin_lock(&fc->lock);
1648 list_add(&req->writepages_entry, &fi->writepages); 1648 list_add(&req->writepages_entry, &fi->writepages);
@@ -1756,7 +1756,7 @@ static bool fuse_writepage_in_flight(struct fuse_req *new_req,
1756 spin_unlock(&fc->lock); 1756 spin_unlock(&fc->lock);
1757 1757
1758 dec_wb_stat(&bdi->wb, WB_WRITEBACK); 1758 dec_wb_stat(&bdi->wb, WB_WRITEBACK);
1759 dec_zone_page_state(page, NR_WRITEBACK_TEMP); 1759 dec_node_page_state(page, NR_WRITEBACK_TEMP);
1760 wb_writeout_inc(&bdi->wb); 1760 wb_writeout_inc(&bdi->wb);
1761 fuse_writepage_free(fc, new_req); 1761 fuse_writepage_free(fc, new_req);
1762 fuse_request_free(new_req); 1762 fuse_request_free(new_req);
@@ -1855,7 +1855,7 @@ static int fuse_writepages_fill(struct page *page,
1855 req->page_descs[req->num_pages].length = PAGE_SIZE; 1855 req->page_descs[req->num_pages].length = PAGE_SIZE;
1856 1856
1857 inc_wb_stat(&inode_to_bdi(inode)->wb, WB_WRITEBACK); 1857 inc_wb_stat(&inode_to_bdi(inode)->wb, WB_WRITEBACK);
1858 inc_zone_page_state(tmp_page, NR_WRITEBACK_TEMP); 1858 inc_node_page_state(tmp_page, NR_WRITEBACK_TEMP);
1859 1859
1860 err = 0; 1860 err = 0;
1861 if (is_writeback && fuse_writepage_in_flight(req, page)) { 1861 if (is_writeback && fuse_writepage_in_flight(req, page)) {
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 5154fa65a2f2..5ea04d87fc65 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -623,7 +623,7 @@ void nfs_mark_page_unstable(struct page *page, struct nfs_commit_info *cinfo)
623 if (!cinfo->dreq) { 623 if (!cinfo->dreq) {
624 struct inode *inode = page_file_mapping(page)->host; 624 struct inode *inode = page_file_mapping(page)->host;
625 625
626 inc_zone_page_state(page, NR_UNSTABLE_NFS); 626 inc_node_page_state(page, NR_UNSTABLE_NFS);
627 inc_wb_stat(&inode_to_bdi(inode)->wb, WB_RECLAIMABLE); 627 inc_wb_stat(&inode_to_bdi(inode)->wb, WB_RECLAIMABLE);
628 __mark_inode_dirty(inode, I_DIRTY_DATASYNC); 628 __mark_inode_dirty(inode, I_DIRTY_DATASYNC);
629 } 629 }
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index e1c74d3db64d..593fa21a02c0 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -898,7 +898,7 @@ nfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg,
898static void 898static void
899nfs_clear_page_commit(struct page *page) 899nfs_clear_page_commit(struct page *page)
900{ 900{
901 dec_zone_page_state(page, NR_UNSTABLE_NFS); 901 dec_node_page_state(page, NR_UNSTABLE_NFS);
902 dec_wb_stat(&inode_to_bdi(page_file_mapping(page)->host)->wb, 902 dec_wb_stat(&inode_to_bdi(page_file_mapping(page)->host)->wb,
903 WB_RECLAIMABLE); 903 WB_RECLAIMABLE);
904} 904}
diff --git a/fs/proc/base.c b/fs/proc/base.c
index a11eb7196ec8..31370da2ee7c 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1024,23 +1024,107 @@ static ssize_t oom_adj_read(struct file *file, char __user *buf, size_t count,
1024 char buffer[PROC_NUMBUF]; 1024 char buffer[PROC_NUMBUF];
1025 int oom_adj = OOM_ADJUST_MIN; 1025 int oom_adj = OOM_ADJUST_MIN;
1026 size_t len; 1026 size_t len;
1027 unsigned long flags;
1028 1027
1029 if (!task) 1028 if (!task)
1030 return -ESRCH; 1029 return -ESRCH;
1031 if (lock_task_sighand(task, &flags)) { 1030 if (task->signal->oom_score_adj == OOM_SCORE_ADJ_MAX)
1032 if (task->signal->oom_score_adj == OOM_SCORE_ADJ_MAX) 1031 oom_adj = OOM_ADJUST_MAX;
1033 oom_adj = OOM_ADJUST_MAX; 1032 else
1034 else 1033 oom_adj = (task->signal->oom_score_adj * -OOM_DISABLE) /
1035 oom_adj = (task->signal->oom_score_adj * -OOM_DISABLE) / 1034 OOM_SCORE_ADJ_MAX;
1036 OOM_SCORE_ADJ_MAX;
1037 unlock_task_sighand(task, &flags);
1038 }
1039 put_task_struct(task); 1035 put_task_struct(task);
1040 len = snprintf(buffer, sizeof(buffer), "%d\n", oom_adj); 1036 len = snprintf(buffer, sizeof(buffer), "%d\n", oom_adj);
1041 return simple_read_from_buffer(buf, count, ppos, buffer, len); 1037 return simple_read_from_buffer(buf, count, ppos, buffer, len);
1042} 1038}
1043 1039
1040static int __set_oom_adj(struct file *file, int oom_adj, bool legacy)
1041{
1042 static DEFINE_MUTEX(oom_adj_mutex);
1043 struct mm_struct *mm = NULL;
1044 struct task_struct *task;
1045 int err = 0;
1046
1047 task = get_proc_task(file_inode(file));
1048 if (!task)
1049 return -ESRCH;
1050
1051 mutex_lock(&oom_adj_mutex);
1052 if (legacy) {
1053 if (oom_adj < task->signal->oom_score_adj &&
1054 !capable(CAP_SYS_RESOURCE)) {
1055 err = -EACCES;
1056 goto err_unlock;
1057 }
1058 /*
1059 * /proc/pid/oom_adj is provided for legacy purposes, ask users to use
1060 * /proc/pid/oom_score_adj instead.
1061 */
1062 pr_warn_once("%s (%d): /proc/%d/oom_adj is deprecated, please use /proc/%d/oom_score_adj instead.\n",
1063 current->comm, task_pid_nr(current), task_pid_nr(task),
1064 task_pid_nr(task));
1065 } else {
1066 if ((short)oom_adj < task->signal->oom_score_adj_min &&
1067 !capable(CAP_SYS_RESOURCE)) {
1068 err = -EACCES;
1069 goto err_unlock;
1070 }
1071 }
1072
1073 /*
1074 * Make sure we will check other processes sharing the mm if this is
1075 * not vfrok which wants its own oom_score_adj.
1076 * pin the mm so it doesn't go away and get reused after task_unlock
1077 */
1078 if (!task->vfork_done) {
1079 struct task_struct *p = find_lock_task_mm(task);
1080
1081 if (p) {
1082 if (atomic_read(&p->mm->mm_users) > 1) {
1083 mm = p->mm;
1084 atomic_inc(&mm->mm_count);
1085 }
1086 task_unlock(p);
1087 }
1088 }
1089
1090 task->signal->oom_score_adj = oom_adj;
1091 if (!legacy && has_capability_noaudit(current, CAP_SYS_RESOURCE))
1092 task->signal->oom_score_adj_min = (short)oom_adj;
1093 trace_oom_score_adj_update(task);
1094
1095 if (mm) {
1096 struct task_struct *p;
1097
1098 rcu_read_lock();
1099 for_each_process(p) {
1100 if (same_thread_group(task, p))
1101 continue;
1102
1103 /* do not touch kernel threads or the global init */
1104 if (p->flags & PF_KTHREAD || is_global_init(p))
1105 continue;
1106
1107 task_lock(p);
1108 if (!p->vfork_done && process_shares_mm(p, mm)) {
1109 pr_info("updating oom_score_adj for %d (%s) from %d to %d because it shares mm with %d (%s). Report if this is unexpected.\n",
1110 task_pid_nr(p), p->comm,
1111 p->signal->oom_score_adj, oom_adj,
1112 task_pid_nr(task), task->comm);
1113 p->signal->oom_score_adj = oom_adj;
1114 if (!legacy && has_capability_noaudit(current, CAP_SYS_RESOURCE))
1115 p->signal->oom_score_adj_min = (short)oom_adj;
1116 }
1117 task_unlock(p);
1118 }
1119 rcu_read_unlock();
1120 mmdrop(mm);
1121 }
1122err_unlock:
1123 mutex_unlock(&oom_adj_mutex);
1124 put_task_struct(task);
1125 return err;
1126}
1127
1044/* 1128/*
1045 * /proc/pid/oom_adj exists solely for backwards compatibility with previous 1129 * /proc/pid/oom_adj exists solely for backwards compatibility with previous
1046 * kernels. The effective policy is defined by oom_score_adj, which has a 1130 * kernels. The effective policy is defined by oom_score_adj, which has a
@@ -1054,10 +1138,8 @@ static ssize_t oom_adj_read(struct file *file, char __user *buf, size_t count,
1054static ssize_t oom_adj_write(struct file *file, const char __user *buf, 1138static ssize_t oom_adj_write(struct file *file, const char __user *buf,
1055 size_t count, loff_t *ppos) 1139 size_t count, loff_t *ppos)
1056{ 1140{
1057 struct task_struct *task;
1058 char buffer[PROC_NUMBUF]; 1141 char buffer[PROC_NUMBUF];
1059 int oom_adj; 1142 int oom_adj;
1060 unsigned long flags;
1061 int err; 1143 int err;
1062 1144
1063 memset(buffer, 0, sizeof(buffer)); 1145 memset(buffer, 0, sizeof(buffer));
@@ -1077,23 +1159,6 @@ static ssize_t oom_adj_write(struct file *file, const char __user *buf,
1077 goto out; 1159 goto out;
1078 } 1160 }
1079 1161
1080 task = get_proc_task(file_inode(file));
1081 if (!task) {
1082 err = -ESRCH;
1083 goto out;
1084 }
1085
1086 task_lock(task);
1087 if (!task->mm) {
1088 err = -EINVAL;
1089 goto err_task_lock;
1090 }
1091
1092 if (!lock_task_sighand(task, &flags)) {
1093 err = -ESRCH;
1094 goto err_task_lock;
1095 }
1096
1097 /* 1162 /*
1098 * Scale /proc/pid/oom_score_adj appropriately ensuring that a maximum 1163 * Scale /proc/pid/oom_score_adj appropriately ensuring that a maximum
1099 * value is always attainable. 1164 * value is always attainable.
@@ -1103,27 +1168,7 @@ static ssize_t oom_adj_write(struct file *file, const char __user *buf,
1103 else 1168 else
1104 oom_adj = (oom_adj * OOM_SCORE_ADJ_MAX) / -OOM_DISABLE; 1169 oom_adj = (oom_adj * OOM_SCORE_ADJ_MAX) / -OOM_DISABLE;
1105 1170
1106 if (oom_adj < task->signal->oom_score_adj && 1171 err = __set_oom_adj(file, oom_adj, true);
1107 !capable(CAP_SYS_RESOURCE)) {
1108 err = -EACCES;
1109 goto err_sighand;
1110 }
1111
1112 /*
1113 * /proc/pid/oom_adj is provided for legacy purposes, ask users to use
1114 * /proc/pid/oom_score_adj instead.
1115 */
1116 pr_warn_once("%s (%d): /proc/%d/oom_adj is deprecated, please use /proc/%d/oom_score_adj instead.\n",
1117 current->comm, task_pid_nr(current), task_pid_nr(task),
1118 task_pid_nr(task));
1119
1120 task->signal->oom_score_adj = oom_adj;
1121 trace_oom_score_adj_update(task);
1122err_sighand:
1123 unlock_task_sighand(task, &flags);
1124err_task_lock:
1125 task_unlock(task);
1126 put_task_struct(task);
1127out: 1172out:
1128 return err < 0 ? err : count; 1173 return err < 0 ? err : count;
1129} 1174}
@@ -1140,15 +1185,11 @@ static ssize_t oom_score_adj_read(struct file *file, char __user *buf,
1140 struct task_struct *task = get_proc_task(file_inode(file)); 1185 struct task_struct *task = get_proc_task(file_inode(file));
1141 char buffer[PROC_NUMBUF]; 1186 char buffer[PROC_NUMBUF];
1142 short oom_score_adj = OOM_SCORE_ADJ_MIN; 1187 short oom_score_adj = OOM_SCORE_ADJ_MIN;
1143 unsigned long flags;
1144 size_t len; 1188 size_t len;
1145 1189
1146 if (!task) 1190 if (!task)
1147 return -ESRCH; 1191 return -ESRCH;
1148 if (lock_task_sighand(task, &flags)) { 1192 oom_score_adj = task->signal->oom_score_adj;
1149 oom_score_adj = task->signal->oom_score_adj;
1150 unlock_task_sighand(task, &flags);
1151 }
1152 put_task_struct(task); 1193 put_task_struct(task);
1153 len = snprintf(buffer, sizeof(buffer), "%hd\n", oom_score_adj); 1194 len = snprintf(buffer, sizeof(buffer), "%hd\n", oom_score_adj);
1154 return simple_read_from_buffer(buf, count, ppos, buffer, len); 1195 return simple_read_from_buffer(buf, count, ppos, buffer, len);
@@ -1157,9 +1198,7 @@ static ssize_t oom_score_adj_read(struct file *file, char __user *buf,
1157static ssize_t oom_score_adj_write(struct file *file, const char __user *buf, 1198static ssize_t oom_score_adj_write(struct file *file, const char __user *buf,
1158 size_t count, loff_t *ppos) 1199 size_t count, loff_t *ppos)
1159{ 1200{
1160 struct task_struct *task;
1161 char buffer[PROC_NUMBUF]; 1201 char buffer[PROC_NUMBUF];
1162 unsigned long flags;
1163 int oom_score_adj; 1202 int oom_score_adj;
1164 int err; 1203 int err;
1165 1204
@@ -1180,39 +1219,7 @@ static ssize_t oom_score_adj_write(struct file *file, const char __user *buf,
1180 goto out; 1219 goto out;
1181 } 1220 }
1182 1221
1183 task = get_proc_task(file_inode(file)); 1222 err = __set_oom_adj(file, oom_score_adj, false);
1184 if (!task) {
1185 err = -ESRCH;
1186 goto out;
1187 }
1188
1189 task_lock(task);
1190 if (!task->mm) {
1191 err = -EINVAL;
1192 goto err_task_lock;
1193 }
1194
1195 if (!lock_task_sighand(task, &flags)) {
1196 err = -ESRCH;
1197 goto err_task_lock;
1198 }
1199
1200 if ((short)oom_score_adj < task->signal->oom_score_adj_min &&
1201 !capable(CAP_SYS_RESOURCE)) {
1202 err = -EACCES;
1203 goto err_sighand;
1204 }
1205
1206 task->signal->oom_score_adj = (short)oom_score_adj;
1207 if (has_capability_noaudit(current, CAP_SYS_RESOURCE))
1208 task->signal->oom_score_adj_min = (short)oom_score_adj;
1209 trace_oom_score_adj_update(task);
1210
1211err_sighand:
1212 unlock_task_sighand(task, &flags);
1213err_task_lock:
1214 task_unlock(task);
1215 put_task_struct(task);
1216out: 1223out:
1217 return err < 0 ? err : count; 1224 return err < 0 ? err : count;
1218} 1225}
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index cf301a9ef512..09e18fdf61e5 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -40,7 +40,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
40 si_swapinfo(&i); 40 si_swapinfo(&i);
41 committed = percpu_counter_read_positive(&vm_committed_as); 41 committed = percpu_counter_read_positive(&vm_committed_as);
42 42
43 cached = global_page_state(NR_FILE_PAGES) - 43 cached = global_node_page_state(NR_FILE_PAGES) -
44 total_swapcache_pages() - i.bufferram; 44 total_swapcache_pages() - i.bufferram;
45 if (cached < 0) 45 if (cached < 0)
46 cached = 0; 46 cached = 0;
@@ -138,23 +138,23 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
138#endif 138#endif
139 K(i.totalswap), 139 K(i.totalswap),
140 K(i.freeswap), 140 K(i.freeswap),
141 K(global_page_state(NR_FILE_DIRTY)), 141 K(global_node_page_state(NR_FILE_DIRTY)),
142 K(global_page_state(NR_WRITEBACK)), 142 K(global_node_page_state(NR_WRITEBACK)),
143 K(global_page_state(NR_ANON_PAGES)), 143 K(global_node_page_state(NR_ANON_MAPPED)),
144 K(global_page_state(NR_FILE_MAPPED)), 144 K(global_node_page_state(NR_FILE_MAPPED)),
145 K(i.sharedram), 145 K(i.sharedram),
146 K(global_page_state(NR_SLAB_RECLAIMABLE) + 146 K(global_page_state(NR_SLAB_RECLAIMABLE) +
147 global_page_state(NR_SLAB_UNRECLAIMABLE)), 147 global_page_state(NR_SLAB_UNRECLAIMABLE)),
148 K(global_page_state(NR_SLAB_RECLAIMABLE)), 148 K(global_page_state(NR_SLAB_RECLAIMABLE)),
149 K(global_page_state(NR_SLAB_UNRECLAIMABLE)), 149 K(global_page_state(NR_SLAB_UNRECLAIMABLE)),
150 global_page_state(NR_KERNEL_STACK) * THREAD_SIZE / 1024, 150 global_page_state(NR_KERNEL_STACK_KB),
151 K(global_page_state(NR_PAGETABLE)), 151 K(global_page_state(NR_PAGETABLE)),
152#ifdef CONFIG_QUICKLIST 152#ifdef CONFIG_QUICKLIST
153 K(quicklist_total_size()), 153 K(quicklist_total_size()),
154#endif 154#endif
155 K(global_page_state(NR_UNSTABLE_NFS)), 155 K(global_node_page_state(NR_UNSTABLE_NFS)),
156 K(global_page_state(NR_BOUNCE)), 156 K(global_page_state(NR_BOUNCE)),
157 K(global_page_state(NR_WRITEBACK_TEMP)), 157 K(global_node_page_state(NR_WRITEBACK_TEMP)),
158 K(vm_commit_limit()), 158 K(vm_commit_limit()),
159 K(committed), 159 K(committed),
160 (unsigned long)VMALLOC_TOTAL >> 10, 160 (unsigned long)VMALLOC_TOTAL >> 10,
@@ -164,9 +164,9 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
164 , atomic_long_read(&num_poisoned_pages) << (PAGE_SHIFT - 10) 164 , atomic_long_read(&num_poisoned_pages) << (PAGE_SHIFT - 10)
165#endif 165#endif
166#ifdef CONFIG_TRANSPARENT_HUGEPAGE 166#ifdef CONFIG_TRANSPARENT_HUGEPAGE
167 , K(global_page_state(NR_ANON_THPS) * HPAGE_PMD_NR) 167 , K(global_node_page_state(NR_ANON_THPS) * HPAGE_PMD_NR)
168 , K(global_page_state(NR_SHMEM_THPS) * HPAGE_PMD_NR) 168 , K(global_node_page_state(NR_SHMEM_THPS) * HPAGE_PMD_NR)
169 , K(global_page_state(NR_SHMEM_PMDMAPPED) * HPAGE_PMD_NR) 169 , K(global_node_page_state(NR_SHMEM_PMDMAPPED) * HPAGE_PMD_NR)
170#endif 170#endif
171#ifdef CONFIG_CMA 171#ifdef CONFIG_CMA
172 , K(totalcma_pages) 172 , K(totalcma_pages)
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index c82794f20110..491a91717788 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -197,7 +197,7 @@ static inline int wb_congested(struct bdi_writeback *wb, int cong_bits)
197} 197}
198 198
199long congestion_wait(int sync, long timeout); 199long congestion_wait(int sync, long timeout);
200long wait_iff_congested(struct zone *zone, int sync, long timeout); 200long wait_iff_congested(struct pglist_data *pgdat, int sync, long timeout);
201int pdflush_proc_obsolete(struct ctl_table *table, int write, 201int pdflush_proc_obsolete(struct ctl_table *table, int write,
202 void __user *buffer, size_t *lenp, loff_t *ppos); 202 void __user *buffer, size_t *lenp, loff_t *ppos);
203 203
diff --git a/include/linux/compaction.h b/include/linux/compaction.h
index 1a02dab16646..d4e106b5dc27 100644
--- a/include/linux/compaction.h
+++ b/include/linux/compaction.h
@@ -1,6 +1,18 @@
1#ifndef _LINUX_COMPACTION_H 1#ifndef _LINUX_COMPACTION_H
2#define _LINUX_COMPACTION_H 2#define _LINUX_COMPACTION_H
3 3
4/*
5 * Determines how hard direct compaction should try to succeed.
6 * Lower value means higher priority, analogically to reclaim priority.
7 */
8enum compact_priority {
9 COMPACT_PRIO_SYNC_LIGHT,
10 MIN_COMPACT_PRIORITY = COMPACT_PRIO_SYNC_LIGHT,
11 DEF_COMPACT_PRIORITY = COMPACT_PRIO_SYNC_LIGHT,
12 COMPACT_PRIO_ASYNC,
13 INIT_COMPACT_PRIORITY = COMPACT_PRIO_ASYNC
14};
15
4/* Return values for compact_zone() and try_to_compact_pages() */ 16/* Return values for compact_zone() and try_to_compact_pages() */
5/* When adding new states, please adjust include/trace/events/compaction.h */ 17/* When adding new states, please adjust include/trace/events/compaction.h */
6enum compact_result { 18enum compact_result {
@@ -43,14 +55,6 @@ enum compact_result {
43 COMPACT_PARTIAL, 55 COMPACT_PARTIAL,
44}; 56};
45 57
46/* Used to signal whether compaction detected need_sched() or lock contention */
47/* No contention detected */
48#define COMPACT_CONTENDED_NONE 0
49/* Either need_sched() was true or fatal signal pending */
50#define COMPACT_CONTENDED_SCHED 1
51/* Zone lock or lru_lock was contended in async compaction */
52#define COMPACT_CONTENDED_LOCK 2
53
54struct alloc_context; /* in mm/internal.h */ 58struct alloc_context; /* in mm/internal.h */
55 59
56#ifdef CONFIG_COMPACTION 60#ifdef CONFIG_COMPACTION
@@ -64,9 +68,8 @@ extern int sysctl_compact_unevictable_allowed;
64 68
65extern int fragmentation_index(struct zone *zone, unsigned int order); 69extern int fragmentation_index(struct zone *zone, unsigned int order);
66extern enum compact_result try_to_compact_pages(gfp_t gfp_mask, 70extern enum compact_result try_to_compact_pages(gfp_t gfp_mask,
67 unsigned int order, 71 unsigned int order, unsigned int alloc_flags,
68 unsigned int alloc_flags, const struct alloc_context *ac, 72 const struct alloc_context *ac, enum compact_priority prio);
69 enum migrate_mode mode, int *contended);
70extern void compact_pgdat(pg_data_t *pgdat, int order); 73extern void compact_pgdat(pg_data_t *pgdat, int order);
71extern void reset_isolation_suitable(pg_data_t *pgdat); 74extern void reset_isolation_suitable(pg_data_t *pgdat);
72extern enum compact_result compaction_suitable(struct zone *zone, int order, 75extern enum compact_result compaction_suitable(struct zone *zone, int order,
@@ -151,14 +154,6 @@ extern void kcompactd_stop(int nid);
151extern void wakeup_kcompactd(pg_data_t *pgdat, int order, int classzone_idx); 154extern void wakeup_kcompactd(pg_data_t *pgdat, int order, int classzone_idx);
152 155
153#else 156#else
154static inline enum compact_result try_to_compact_pages(gfp_t gfp_mask,
155 unsigned int order, int alloc_flags,
156 const struct alloc_context *ac,
157 enum migrate_mode mode, int *contended)
158{
159 return COMPACT_CONTINUE;
160}
161
162static inline void compact_pgdat(pg_data_t *pgdat, int order) 157static inline void compact_pgdat(pg_data_t *pgdat, int order)
163{ 158{
164} 159}
diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index c29e9d347bc6..f8041f9de31e 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -237,9 +237,11 @@ struct vm_area_struct;
237 * are expected to be movable via page reclaim or page migration. Typically, 237 * are expected to be movable via page reclaim or page migration. Typically,
238 * pages on the LRU would also be allocated with GFP_HIGHUSER_MOVABLE. 238 * pages on the LRU would also be allocated with GFP_HIGHUSER_MOVABLE.
239 * 239 *
240 * GFP_TRANSHUGE is used for THP allocations. They are compound allocations 240 * GFP_TRANSHUGE and GFP_TRANSHUGE_LIGHT are used for THP allocations. They are
241 * that will fail quickly if memory is not available and will not wake 241 * compound allocations that will generally fail quickly if memory is not
242 * kswapd on failure. 242 * available and will not wake kswapd/kcompactd on failure. The _LIGHT
243 * version does not attempt reclaim/compaction at all and is by default used
244 * in page fault path, while the non-light is used by khugepaged.
243 */ 245 */
244#define GFP_ATOMIC (__GFP_HIGH|__GFP_ATOMIC|__GFP_KSWAPD_RECLAIM) 246#define GFP_ATOMIC (__GFP_HIGH|__GFP_ATOMIC|__GFP_KSWAPD_RECLAIM)
245#define GFP_KERNEL (__GFP_RECLAIM | __GFP_IO | __GFP_FS) 247#define GFP_KERNEL (__GFP_RECLAIM | __GFP_IO | __GFP_FS)
@@ -254,9 +256,9 @@ struct vm_area_struct;
254#define GFP_DMA32 __GFP_DMA32 256#define GFP_DMA32 __GFP_DMA32
255#define GFP_HIGHUSER (GFP_USER | __GFP_HIGHMEM) 257#define GFP_HIGHUSER (GFP_USER | __GFP_HIGHMEM)
256#define GFP_HIGHUSER_MOVABLE (GFP_HIGHUSER | __GFP_MOVABLE) 258#define GFP_HIGHUSER_MOVABLE (GFP_HIGHUSER | __GFP_MOVABLE)
257#define GFP_TRANSHUGE ((GFP_HIGHUSER_MOVABLE | __GFP_COMP | \ 259#define GFP_TRANSHUGE_LIGHT ((GFP_HIGHUSER_MOVABLE | __GFP_COMP | \
258 __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN) & \ 260 __GFP_NOMEMALLOC | __GFP_NOWARN) & ~__GFP_RECLAIM)
259 ~__GFP_RECLAIM) 261#define GFP_TRANSHUGE (GFP_TRANSHUGE_LIGHT | __GFP_DIRECT_RECLAIM)
260 262
261/* Convert GFP flags to their corresponding migrate type */ 263/* Convert GFP flags to their corresponding migrate type */
262#define GFP_MOVABLE_MASK (__GFP_RECLAIMABLE|__GFP_MOVABLE) 264#define GFP_MOVABLE_MASK (__GFP_RECLAIMABLE|__GFP_MOVABLE)
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 92ce91c03cd0..6f14de45b5ce 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -11,7 +11,7 @@ extern struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
11 unsigned long addr, 11 unsigned long addr,
12 pmd_t *pmd, 12 pmd_t *pmd,
13 unsigned int flags); 13 unsigned int flags);
14extern int madvise_free_huge_pmd(struct mmu_gather *tlb, 14extern bool madvise_free_huge_pmd(struct mmu_gather *tlb,
15 struct vm_area_struct *vma, 15 struct vm_area_struct *vma,
16 pmd_t *pmd, unsigned long addr, unsigned long next); 16 pmd_t *pmd, unsigned long addr, unsigned long next);
17extern int zap_huge_pmd(struct mmu_gather *tlb, 17extern int zap_huge_pmd(struct mmu_gather *tlb,
diff --git a/include/linux/kasan.h b/include/linux/kasan.h
index ac4b3c46a84d..c9cf374445d8 100644
--- a/include/linux/kasan.h
+++ b/include/linux/kasan.h
@@ -77,6 +77,7 @@ void kasan_free_shadow(const struct vm_struct *vm);
77 77
78size_t ksize(const void *); 78size_t ksize(const void *);
79static inline void kasan_unpoison_slab(const void *ptr) { ksize(ptr); } 79static inline void kasan_unpoison_slab(const void *ptr) { ksize(ptr); }
80size_t kasan_metadata_size(struct kmem_cache *cache);
80 81
81#else /* CONFIG_KASAN */ 82#else /* CONFIG_KASAN */
82 83
@@ -121,6 +122,7 @@ static inline int kasan_module_alloc(void *addr, size_t size) { return 0; }
121static inline void kasan_free_shadow(const struct vm_struct *vm) {} 122static inline void kasan_free_shadow(const struct vm_struct *vm) {}
122 123
123static inline void kasan_unpoison_slab(const void *ptr) { } 124static inline void kasan_unpoison_slab(const void *ptr) { }
125static inline size_t kasan_metadata_size(struct kmem_cache *cache) { return 0; }
124 126
125#endif /* CONFIG_KASAN */ 127#endif /* CONFIG_KASAN */
126 128
diff --git a/include/linux/kdb.h b/include/linux/kdb.h
index a19bcf9e762e..410decacff8f 100644
--- a/include/linux/kdb.h
+++ b/include/linux/kdb.h
@@ -177,7 +177,7 @@ extern int kdb_get_kbd_char(void);
177static inline 177static inline
178int kdb_process_cpu(const struct task_struct *p) 178int kdb_process_cpu(const struct task_struct *p)
179{ 179{
180 unsigned int cpu = task_thread_info(p)->cpu; 180 unsigned int cpu = task_cpu(p);
181 if (cpu > num_possible_cpus()) 181 if (cpu > num_possible_cpus())
182 cpu = 0; 182 cpu = 0;
183 return cpu; 183 return cpu;
diff --git a/include/linux/memblock.h b/include/linux/memblock.h
index 6c14b6179727..2925da23505d 100644
--- a/include/linux/memblock.h
+++ b/include/linux/memblock.h
@@ -332,6 +332,7 @@ phys_addr_t memblock_mem_size(unsigned long limit_pfn);
332phys_addr_t memblock_start_of_DRAM(void); 332phys_addr_t memblock_start_of_DRAM(void);
333phys_addr_t memblock_end_of_DRAM(void); 333phys_addr_t memblock_end_of_DRAM(void);
334void memblock_enforce_memory_limit(phys_addr_t memory_limit); 334void memblock_enforce_memory_limit(phys_addr_t memory_limit);
335void memblock_mem_limit_remove_map(phys_addr_t limit);
335bool memblock_is_memory(phys_addr_t addr); 336bool memblock_is_memory(phys_addr_t addr);
336int memblock_is_map_memory(phys_addr_t addr); 337int memblock_is_map_memory(phys_addr_t addr);
337int memblock_is_region_memory(phys_addr_t base, phys_addr_t size); 338int memblock_is_region_memory(phys_addr_t base, phys_addr_t size);
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 71aff733a497..5d8ca6e02e39 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -52,7 +52,7 @@ enum mem_cgroup_stat_index {
52 MEM_CGROUP_STAT_SWAP, /* # of pages, swapped out */ 52 MEM_CGROUP_STAT_SWAP, /* # of pages, swapped out */
53 MEM_CGROUP_STAT_NSTATS, 53 MEM_CGROUP_STAT_NSTATS,
54 /* default hierarchy stats */ 54 /* default hierarchy stats */
55 MEMCG_KERNEL_STACK = MEM_CGROUP_STAT_NSTATS, 55 MEMCG_KERNEL_STACK_KB = MEM_CGROUP_STAT_NSTATS,
56 MEMCG_SLAB_RECLAIMABLE, 56 MEMCG_SLAB_RECLAIMABLE,
57 MEMCG_SLAB_UNRECLAIMABLE, 57 MEMCG_SLAB_UNRECLAIMABLE,
58 MEMCG_SOCK, 58 MEMCG_SOCK,
@@ -60,7 +60,7 @@ enum mem_cgroup_stat_index {
60}; 60};
61 61
62struct mem_cgroup_reclaim_cookie { 62struct mem_cgroup_reclaim_cookie {
63 struct zone *zone; 63 pg_data_t *pgdat;
64 int priority; 64 int priority;
65 unsigned int generation; 65 unsigned int generation;
66}; 66};
@@ -118,7 +118,7 @@ struct mem_cgroup_reclaim_iter {
118/* 118/*
119 * per-zone information in memory controller. 119 * per-zone information in memory controller.
120 */ 120 */
121struct mem_cgroup_per_zone { 121struct mem_cgroup_per_node {
122 struct lruvec lruvec; 122 struct lruvec lruvec;
123 unsigned long lru_size[NR_LRU_LISTS]; 123 unsigned long lru_size[NR_LRU_LISTS];
124 124
@@ -132,10 +132,6 @@ struct mem_cgroup_per_zone {
132 /* use container_of */ 132 /* use container_of */
133}; 133};
134 134
135struct mem_cgroup_per_node {
136 struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES];
137};
138
139struct mem_cgroup_threshold { 135struct mem_cgroup_threshold {
140 struct eventfd_ctx *eventfd; 136 struct eventfd_ctx *eventfd;
141 unsigned long threshold; 137 unsigned long threshold;
@@ -314,8 +310,46 @@ void mem_cgroup_uncharge_list(struct list_head *page_list);
314 310
315void mem_cgroup_migrate(struct page *oldpage, struct page *newpage); 311void mem_cgroup_migrate(struct page *oldpage, struct page *newpage);
316 312
317struct lruvec *mem_cgroup_zone_lruvec(struct zone *, struct mem_cgroup *); 313static struct mem_cgroup_per_node *
318struct lruvec *mem_cgroup_page_lruvec(struct page *, struct zone *); 314mem_cgroup_nodeinfo(struct mem_cgroup *memcg, int nid)
315{
316 return memcg->nodeinfo[nid];
317}
318
319/**
320 * mem_cgroup_lruvec - get the lru list vector for a node or a memcg zone
321 * @node: node of the wanted lruvec
322 * @memcg: memcg of the wanted lruvec
323 *
324 * Returns the lru list vector holding pages for a given @node or a given
325 * @memcg and @zone. This can be the node lruvec, if the memory controller
326 * is disabled.
327 */
328static inline struct lruvec *mem_cgroup_lruvec(struct pglist_data *pgdat,
329 struct mem_cgroup *memcg)
330{
331 struct mem_cgroup_per_node *mz;
332 struct lruvec *lruvec;
333
334 if (mem_cgroup_disabled()) {
335 lruvec = node_lruvec(pgdat);
336 goto out;
337 }
338
339 mz = mem_cgroup_nodeinfo(memcg, pgdat->node_id);
340 lruvec = &mz->lruvec;
341out:
342 /*
343 * Since a node can be onlined after the mem_cgroup was created,
344 * we have to be prepared to initialize lruvec->pgdat here;
345 * and if offlined then reonlined, we need to reinitialize it.
346 */
347 if (unlikely(lruvec->pgdat != pgdat))
348 lruvec->pgdat = pgdat;
349 return lruvec;
350}
351
352struct lruvec *mem_cgroup_page_lruvec(struct page *, struct pglist_data *);
319 353
320bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg); 354bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg);
321struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p); 355struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p);
@@ -404,9 +438,9 @@ unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
404static inline 438static inline
405unsigned long mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru) 439unsigned long mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru)
406{ 440{
407 struct mem_cgroup_per_zone *mz; 441 struct mem_cgroup_per_node *mz;
408 442
409 mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec); 443 mz = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
410 return mz->lru_size[lru]; 444 return mz->lru_size[lru];
411} 445}
412 446
@@ -477,7 +511,7 @@ static inline void mem_cgroup_dec_page_stat(struct page *page,
477 mem_cgroup_update_page_stat(page, idx, -1); 511 mem_cgroup_update_page_stat(page, idx, -1);
478} 512}
479 513
480unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, 514unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
481 gfp_t gfp_mask, 515 gfp_t gfp_mask,
482 unsigned long *total_scanned); 516 unsigned long *total_scanned);
483 517
@@ -568,16 +602,16 @@ static inline void mem_cgroup_migrate(struct page *old, struct page *new)
568{ 602{
569} 603}
570 604
571static inline struct lruvec *mem_cgroup_zone_lruvec(struct zone *zone, 605static inline struct lruvec *mem_cgroup_lruvec(struct pglist_data *pgdat,
572 struct mem_cgroup *memcg) 606 struct mem_cgroup *memcg)
573{ 607{
574 return &zone->lruvec; 608 return node_lruvec(pgdat);
575} 609}
576 610
577static inline struct lruvec *mem_cgroup_page_lruvec(struct page *page, 611static inline struct lruvec *mem_cgroup_page_lruvec(struct page *page,
578 struct zone *zone) 612 struct pglist_data *pgdat)
579{ 613{
580 return &zone->lruvec; 614 return &pgdat->lruvec;
581} 615}
582 616
583static inline bool mm_match_cgroup(struct mm_struct *mm, 617static inline bool mm_match_cgroup(struct mm_struct *mm,
@@ -681,7 +715,7 @@ static inline void mem_cgroup_dec_page_stat(struct page *page,
681} 715}
682 716
683static inline 717static inline
684unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, 718unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
685 gfp_t gfp_mask, 719 gfp_t gfp_mask,
686 unsigned long *total_scanned) 720 unsigned long *total_scanned)
687{ 721{
diff --git a/include/linux/memremap.h b/include/linux/memremap.h
index bcaa634139a9..93416196ba64 100644
--- a/include/linux/memremap.h
+++ b/include/linux/memremap.h
@@ -26,7 +26,7 @@ struct vmem_altmap {
26unsigned long vmem_altmap_offset(struct vmem_altmap *altmap); 26unsigned long vmem_altmap_offset(struct vmem_altmap *altmap);
27void vmem_altmap_free(struct vmem_altmap *altmap, unsigned long nr_pfns); 27void vmem_altmap_free(struct vmem_altmap *altmap, unsigned long nr_pfns);
28 28
29#if defined(CONFIG_SPARSEMEM_VMEMMAP) && defined(CONFIG_ZONE_DEVICE) 29#ifdef CONFIG_ZONE_DEVICE
30struct vmem_altmap *to_vmem_altmap(unsigned long memmap_start); 30struct vmem_altmap *to_vmem_altmap(unsigned long memmap_start);
31#else 31#else
32static inline struct vmem_altmap *to_vmem_altmap(unsigned long memmap_start) 32static inline struct vmem_altmap *to_vmem_altmap(unsigned long memmap_start)
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 192c1bbe5fcd..08ed53eeedd5 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -933,6 +933,11 @@ static inline struct zone *page_zone(const struct page *page)
933 return &NODE_DATA(page_to_nid(page))->node_zones[page_zonenum(page)]; 933 return &NODE_DATA(page_to_nid(page))->node_zones[page_zonenum(page)];
934} 934}
935 935
936static inline pg_data_t *page_pgdat(const struct page *page)
937{
938 return NODE_DATA(page_to_nid(page));
939}
940
936#ifdef SECTION_IN_PAGE_FLAGS 941#ifdef SECTION_IN_PAGE_FLAGS
937static inline void set_page_section(struct page *page, unsigned long section) 942static inline void set_page_section(struct page *page, unsigned long section)
938{ 943{
@@ -973,11 +978,21 @@ static inline struct mem_cgroup *page_memcg(struct page *page)
973{ 978{
974 return page->mem_cgroup; 979 return page->mem_cgroup;
975} 980}
981static inline struct mem_cgroup *page_memcg_rcu(struct page *page)
982{
983 WARN_ON_ONCE(!rcu_read_lock_held());
984 return READ_ONCE(page->mem_cgroup);
985}
976#else 986#else
977static inline struct mem_cgroup *page_memcg(struct page *page) 987static inline struct mem_cgroup *page_memcg(struct page *page)
978{ 988{
979 return NULL; 989 return NULL;
980} 990}
991static inline struct mem_cgroup *page_memcg_rcu(struct page *page)
992{
993 WARN_ON_ONCE(!rcu_read_lock_held());
994 return NULL;
995}
981#endif 996#endif
982 997
983/* 998/*
@@ -2284,6 +2299,8 @@ static inline int in_gate_area(struct mm_struct *mm, unsigned long addr)
2284} 2299}
2285#endif /* __HAVE_ARCH_GATE_AREA */ 2300#endif /* __HAVE_ARCH_GATE_AREA */
2286 2301
2302extern bool process_shares_mm(struct task_struct *p, struct mm_struct *mm);
2303
2287#ifdef CONFIG_SYSCTL 2304#ifdef CONFIG_SYSCTL
2288extern int sysctl_drop_caches; 2305extern int sysctl_drop_caches;
2289int drop_caches_sysctl_handler(struct ctl_table *, int, 2306int drop_caches_sysctl_handler(struct ctl_table *, int,
diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h
index 5bd29ba4f174..71613e8a720f 100644
--- a/include/linux/mm_inline.h
+++ b/include/linux/mm_inline.h
@@ -23,25 +23,30 @@ static inline int page_is_file_cache(struct page *page)
23} 23}
24 24
25static __always_inline void __update_lru_size(struct lruvec *lruvec, 25static __always_inline void __update_lru_size(struct lruvec *lruvec,
26 enum lru_list lru, int nr_pages) 26 enum lru_list lru, enum zone_type zid,
27 int nr_pages)
27{ 28{
28 __mod_zone_page_state(lruvec_zone(lruvec), NR_LRU_BASE + lru, nr_pages); 29 struct pglist_data *pgdat = lruvec_pgdat(lruvec);
30
31 __mod_node_page_state(pgdat, NR_LRU_BASE + lru, nr_pages);
32 __mod_zone_page_state(&pgdat->node_zones[zid],
33 NR_ZONE_LRU_BASE + lru, nr_pages);
29} 34}
30 35
31static __always_inline void update_lru_size(struct lruvec *lruvec, 36static __always_inline void update_lru_size(struct lruvec *lruvec,
32 enum lru_list lru, int nr_pages) 37 enum lru_list lru, enum zone_type zid,
38 int nr_pages)
33{ 39{
40 __update_lru_size(lruvec, lru, zid, nr_pages);
34#ifdef CONFIG_MEMCG 41#ifdef CONFIG_MEMCG
35 mem_cgroup_update_lru_size(lruvec, lru, nr_pages); 42 mem_cgroup_update_lru_size(lruvec, lru, nr_pages);
36#else
37 __update_lru_size(lruvec, lru, nr_pages);
38#endif 43#endif
39} 44}
40 45
41static __always_inline void add_page_to_lru_list(struct page *page, 46static __always_inline void add_page_to_lru_list(struct page *page,
42 struct lruvec *lruvec, enum lru_list lru) 47 struct lruvec *lruvec, enum lru_list lru)
43{ 48{
44 update_lru_size(lruvec, lru, hpage_nr_pages(page)); 49 update_lru_size(lruvec, lru, page_zonenum(page), hpage_nr_pages(page));
45 list_add(&page->lru, &lruvec->lists[lru]); 50 list_add(&page->lru, &lruvec->lists[lru]);
46} 51}
47 52
@@ -49,7 +54,7 @@ static __always_inline void del_page_from_lru_list(struct page *page,
49 struct lruvec *lruvec, enum lru_list lru) 54 struct lruvec *lruvec, enum lru_list lru)
50{ 55{
51 list_del(&page->lru); 56 list_del(&page->lru);
52 update_lru_size(lruvec, lru, -hpage_nr_pages(page)); 57 update_lru_size(lruvec, lru, page_zonenum(page), -hpage_nr_pages(page));
53} 58}
54 59
55/** 60/**
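
The mm_inline.h change is the core of the node-LRU conversion: __update_lru_size() now charges every LRU delta twice, once to the node-wide NR_LRU_BASE counter and once to the NR_ZONE_LRU_BASE counter of the page's zone, and it does so regardless of CONFIG_MEMCG. A compact sketch of that double bookkeeping, with simplified stand-ins for the kernel structures:

#include <stdio.h>

enum lru_list { LRU_INACTIVE_ANON, LRU_ACTIVE_ANON, LRU_INACTIVE_FILE,
                LRU_ACTIVE_FILE, LRU_UNEVICTABLE, NR_LRU_LISTS };
enum zone_type { ZONE_DMA, ZONE_NORMAL, ZONE_MOVABLE, MAX_NR_ZONES };

struct zone { long nr_zone_lru[NR_LRU_LISTS]; };

struct pglist_data {
        long nr_lru[NR_LRU_LISTS];             /* node-wide totals */
        struct zone node_zones[MAX_NR_ZONES];  /* per-zone slices  */
};

/* Mirrors __update_lru_size(): one node counter, one zone counter. */
static void update_lru_size(struct pglist_data *pgdat, enum lru_list lru,
                            enum zone_type zid, int nr_pages)
{
        pgdat->nr_lru[lru] += nr_pages;
        pgdat->node_zones[zid].nr_zone_lru[lru] += nr_pages;
}

int main(void)
{
        static struct pglist_data node;

        update_lru_size(&node, LRU_ACTIVE_FILE, ZONE_NORMAL,  512);
        update_lru_size(&node, LRU_ACTIVE_FILE, ZONE_MOVABLE, 128);
        update_lru_size(&node, LRU_ACTIVE_FILE, ZONE_NORMAL,  -64);

        printf("node active file:   %ld\n", node.nr_lru[LRU_ACTIVE_FILE]);
        printf("ZONE_NORMAL slice:  %ld\n",
               node.node_zones[ZONE_NORMAL].nr_zone_lru[LRU_ACTIVE_FILE]);
        printf("ZONE_MOVABLE slice: %ld\n",
               node.node_zones[ZONE_MOVABLE].nr_zone_lru[LRU_ACTIVE_FILE]);
        return 0;
}

The node-wide total (576 here) always equals the sum of the per-zone slices (448 + 128), which is what lets compaction and the reclaim-retry logic keep using zone-granular numbers while reclaim itself works per node.
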
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 79472b22d23f..903200f4ec41 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -118,7 +118,7 @@ struct page {
118 */ 118 */
119 union { 119 union {
120 struct list_head lru; /* Pageout list, eg. active_list 120 struct list_head lru; /* Pageout list, eg. active_list
121 * protected by zone->lru_lock ! 121 * protected by zone_lru_lock !
122 * Can be used as a generic list 122 * Can be used as a generic list
123 * by the page owner. 123 * by the page owner.
124 */ 124 */
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 19425e988bdc..f2e4e90621ec 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -93,7 +93,7 @@ struct free_area {
93struct pglist_data; 93struct pglist_data;
94 94
95/* 95/*
96 * zone->lock and zone->lru_lock are two of the hottest locks in the kernel. 96 * zone->lock and the zone lru_lock are two of the hottest locks in the kernel.
97 * So add a wild amount of padding here to ensure that they fall into separate 97 * So add a wild amount of padding here to ensure that they fall into separate
98 * cachelines. There are very few zone structures in the machine, so space 98 * cachelines. There are very few zone structures in the machine, so space
99 * consumption is not a concern here. 99 * consumption is not a concern here.
@@ -110,36 +110,20 @@ struct zone_padding {
110enum zone_stat_item { 110enum zone_stat_item {
111 /* First 128 byte cacheline (assuming 64 bit words) */ 111 /* First 128 byte cacheline (assuming 64 bit words) */
112 NR_FREE_PAGES, 112 NR_FREE_PAGES,
113 NR_ALLOC_BATCH, 113 NR_ZONE_LRU_BASE, /* Used only for compaction and reclaim retry */
114 NR_LRU_BASE, 114 NR_ZONE_INACTIVE_ANON = NR_ZONE_LRU_BASE,
115 NR_INACTIVE_ANON = NR_LRU_BASE, /* must match order of LRU_[IN]ACTIVE */ 115 NR_ZONE_ACTIVE_ANON,
116 NR_ACTIVE_ANON, /* " " " " " */ 116 NR_ZONE_INACTIVE_FILE,
117 NR_INACTIVE_FILE, /* " " " " " */ 117 NR_ZONE_ACTIVE_FILE,
118 NR_ACTIVE_FILE, /* " " " " " */ 118 NR_ZONE_UNEVICTABLE,
119 NR_UNEVICTABLE, /* " " " " " */ 119 NR_ZONE_WRITE_PENDING, /* Count of dirty, writeback and unstable pages */
120 NR_MLOCK, /* mlock()ed pages found and moved off LRU */ 120 NR_MLOCK, /* mlock()ed pages found and moved off LRU */
121 NR_ANON_PAGES, /* Mapped anonymous pages */
122 NR_FILE_MAPPED, /* pagecache pages mapped into pagetables.
123 only modified from process context */
124 NR_FILE_PAGES,
125 NR_FILE_DIRTY,
126 NR_WRITEBACK,
127 NR_SLAB_RECLAIMABLE, 121 NR_SLAB_RECLAIMABLE,
128 NR_SLAB_UNRECLAIMABLE, 122 NR_SLAB_UNRECLAIMABLE,
129 NR_PAGETABLE, /* used for pagetables */ 123 NR_PAGETABLE, /* used for pagetables */
130 NR_KERNEL_STACK, 124 NR_KERNEL_STACK_KB, /* measured in KiB */
131 /* Second 128 byte cacheline */ 125 /* Second 128 byte cacheline */
132 NR_UNSTABLE_NFS, /* NFS unstable pages */
133 NR_BOUNCE, 126 NR_BOUNCE,
134 NR_VMSCAN_WRITE,
135 NR_VMSCAN_IMMEDIATE, /* Prioritise for reclaim when writeback ends */
136 NR_WRITEBACK_TEMP, /* Writeback using temporary buffers */
137 NR_ISOLATED_ANON, /* Temporary isolated pages from anon lru */
138 NR_ISOLATED_FILE, /* Temporary isolated pages from file lru */
139 NR_SHMEM, /* shmem pages (included tmpfs/GEM pages) */
140 NR_DIRTIED, /* page dirtyings since bootup */
141 NR_WRITTEN, /* page writings since bootup */
142 NR_PAGES_SCANNED, /* pages scanned since last reclaim */
143#if IS_ENABLED(CONFIG_ZSMALLOC) 127#if IS_ENABLED(CONFIG_ZSMALLOC)
144 NR_ZSPAGES, /* allocated in zsmalloc */ 128 NR_ZSPAGES, /* allocated in zsmalloc */
145#endif 129#endif
@@ -151,14 +135,40 @@ enum zone_stat_item {
151 NUMA_LOCAL, /* allocation from local node */ 135 NUMA_LOCAL, /* allocation from local node */
152 NUMA_OTHER, /* allocation from other node */ 136 NUMA_OTHER, /* allocation from other node */
153#endif 137#endif
138 NR_FREE_CMA_PAGES,
139 NR_VM_ZONE_STAT_ITEMS };
140
141enum node_stat_item {
142 NR_LRU_BASE,
143 NR_INACTIVE_ANON = NR_LRU_BASE, /* must match order of LRU_[IN]ACTIVE */
144 NR_ACTIVE_ANON, /* " " " " " */
145 NR_INACTIVE_FILE, /* " " " " " */
146 NR_ACTIVE_FILE, /* " " " " " */
147 NR_UNEVICTABLE, /* " " " " " */
148 NR_ISOLATED_ANON, /* Temporary isolated pages from anon lru */
149 NR_ISOLATED_FILE, /* Temporary isolated pages from file lru */
150 NR_PAGES_SCANNED, /* pages scanned since last reclaim */
154 WORKINGSET_REFAULT, 151 WORKINGSET_REFAULT,
155 WORKINGSET_ACTIVATE, 152 WORKINGSET_ACTIVATE,
156 WORKINGSET_NODERECLAIM, 153 WORKINGSET_NODERECLAIM,
157 NR_ANON_THPS, 154 NR_ANON_MAPPED, /* Mapped anonymous pages */
155 NR_FILE_MAPPED, /* pagecache pages mapped into pagetables.
156 only modified from process context */
157 NR_FILE_PAGES,
158 NR_FILE_DIRTY,
159 NR_WRITEBACK,
160 NR_WRITEBACK_TEMP, /* Writeback using temporary buffers */
161 NR_SHMEM, /* shmem pages (included tmpfs/GEM pages) */
158 NR_SHMEM_THPS, 162 NR_SHMEM_THPS,
159 NR_SHMEM_PMDMAPPED, 163 NR_SHMEM_PMDMAPPED,
160 NR_FREE_CMA_PAGES, 164 NR_ANON_THPS,
161 NR_VM_ZONE_STAT_ITEMS }; 165 NR_UNSTABLE_NFS, /* NFS unstable pages */
166 NR_VMSCAN_WRITE,
167 NR_VMSCAN_IMMEDIATE, /* Prioritise for reclaim when writeback ends */
168 NR_DIRTIED, /* page dirtyings since bootup */
169 NR_WRITTEN, /* page writings since bootup */
170 NR_VM_NODE_STAT_ITEMS
171};
162 172
163/* 173/*
164 * We do arithmetic on the LRU lists in various places in the code, 174 * We do arithmetic on the LRU lists in various places in the code,
@@ -215,7 +225,7 @@ struct lruvec {
215 /* Evictions & activations on the inactive file list */ 225 /* Evictions & activations on the inactive file list */
216 atomic_long_t inactive_age; 226 atomic_long_t inactive_age;
217#ifdef CONFIG_MEMCG 227#ifdef CONFIG_MEMCG
218 struct zone *zone; 228 struct pglist_data *pgdat;
219#endif 229#endif
220}; 230};
221 231
@@ -267,6 +277,11 @@ struct per_cpu_pageset {
267#endif 277#endif
268}; 278};
269 279
280struct per_cpu_nodestat {
281 s8 stat_threshold;
282 s8 vm_node_stat_diff[NR_VM_NODE_STAT_ITEMS];
283};
284
270#endif /* !__GENERATING_BOUNDS.H */ 285#endif /* !__GENERATING_BOUNDS.H */
271 286
272enum zone_type { 287enum zone_type {
@@ -348,22 +363,9 @@ struct zone {
348#ifdef CONFIG_NUMA 363#ifdef CONFIG_NUMA
349 int node; 364 int node;
350#endif 365#endif
351
352 /*
353 * The target ratio of ACTIVE_ANON to INACTIVE_ANON pages on
354 * this zone's LRU. Maintained by the pageout code.
355 */
356 unsigned int inactive_ratio;
357
358 struct pglist_data *zone_pgdat; 366 struct pglist_data *zone_pgdat;
359 struct per_cpu_pageset __percpu *pageset; 367 struct per_cpu_pageset __percpu *pageset;
360 368
361 /*
362 * This is a per-zone reserve of pages that are not available
363 * to userspace allocations.
364 */
365 unsigned long totalreserve_pages;
366
367#ifndef CONFIG_SPARSEMEM 369#ifndef CONFIG_SPARSEMEM
368 /* 370 /*
369 * Flags for a pageblock_nr_pages block. See pageblock-flags.h. 371 * Flags for a pageblock_nr_pages block. See pageblock-flags.h.
@@ -372,14 +374,6 @@ struct zone {
372 unsigned long *pageblock_flags; 374 unsigned long *pageblock_flags;
373#endif /* CONFIG_SPARSEMEM */ 375#endif /* CONFIG_SPARSEMEM */
374 376
375#ifdef CONFIG_NUMA
376 /*
377 * zone reclaim becomes active if more unmapped pages exist.
378 */
379 unsigned long min_unmapped_pages;
380 unsigned long min_slab_pages;
381#endif /* CONFIG_NUMA */
382
383 /* zone_start_pfn == zone_start_paddr >> PAGE_SHIFT */ 377 /* zone_start_pfn == zone_start_paddr >> PAGE_SHIFT */
384 unsigned long zone_start_pfn; 378 unsigned long zone_start_pfn;
385 379
@@ -472,24 +466,21 @@ struct zone {
472 unsigned long wait_table_hash_nr_entries; 466 unsigned long wait_table_hash_nr_entries;
473 unsigned long wait_table_bits; 467 unsigned long wait_table_bits;
474 468
469 /* Write-intensive fields used from the page allocator */
475 ZONE_PADDING(_pad1_) 470 ZONE_PADDING(_pad1_)
471
476 /* free areas of different sizes */ 472 /* free areas of different sizes */
477 struct free_area free_area[MAX_ORDER]; 473 struct free_area free_area[MAX_ORDER];
478 474
479 /* zone flags, see below */ 475 /* zone flags, see below */
480 unsigned long flags; 476 unsigned long flags;
481 477
482 /* Write-intensive fields used from the page allocator */ 478 /* Primarily protects free_area */
483 spinlock_t lock; 479 spinlock_t lock;
484 480
481 /* Write-intensive fields used by compaction and vmstats. */
485 ZONE_PADDING(_pad2_) 482 ZONE_PADDING(_pad2_)
486 483
487 /* Write-intensive fields used by page reclaim */
488
489 /* Fields commonly accessed by the page reclaim scanner */
490 spinlock_t lru_lock;
491 struct lruvec lruvec;
492
493 /* 484 /*
494 * When free pages are below this point, additional steps are taken 485 * When free pages are below this point, additional steps are taken
495 * when reading the number of free pages to avoid per-cpu counter 486 * when reading the number of free pages to avoid per-cpu counter
@@ -527,19 +518,18 @@ struct zone {
527 atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS]; 518 atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS];
528} ____cacheline_internodealigned_in_smp; 519} ____cacheline_internodealigned_in_smp;
529 520
530enum zone_flags { 521enum pgdat_flags {
531 ZONE_RECLAIM_LOCKED, /* prevents concurrent reclaim */ 522 PGDAT_CONGESTED, /* pgdat has many dirty pages backed by
532 ZONE_CONGESTED, /* zone has many dirty pages backed by
533 * a congested BDI 523 * a congested BDI
534 */ 524 */
535 ZONE_DIRTY, /* reclaim scanning has recently found 525 PGDAT_DIRTY, /* reclaim scanning has recently found
536 * many dirty file pages at the tail 526 * many dirty file pages at the tail
537 * of the LRU. 527 * of the LRU.
538 */ 528 */
539 ZONE_WRITEBACK, /* reclaim scanning has recently found 529 PGDAT_WRITEBACK, /* reclaim scanning has recently found
540 * many pages under writeback 530 * many pages under writeback
541 */ 531 */
542 ZONE_FAIR_DEPLETED, /* fair zone policy batch depleted */ 532 PGDAT_RECLAIM_LOCKED, /* prevents concurrent reclaim */
543}; 533};
544 534
545static inline unsigned long zone_end_pfn(const struct zone *zone) 535static inline unsigned long zone_end_pfn(const struct zone *zone)
@@ -663,8 +653,9 @@ typedef struct pglist_data {
663 wait_queue_head_t pfmemalloc_wait; 653 wait_queue_head_t pfmemalloc_wait;
664 struct task_struct *kswapd; /* Protected by 654 struct task_struct *kswapd; /* Protected by
665 mem_hotplug_begin/end() */ 655 mem_hotplug_begin/end() */
666 int kswapd_max_order; 656 int kswapd_order;
667 enum zone_type classzone_idx; 657 enum zone_type kswapd_classzone_idx;
658
668#ifdef CONFIG_COMPACTION 659#ifdef CONFIG_COMPACTION
669 int kcompactd_max_order; 660 int kcompactd_max_order;
670 enum zone_type kcompactd_classzone_idx; 661 enum zone_type kcompactd_classzone_idx;
@@ -681,6 +672,23 @@ typedef struct pglist_data {
681 /* Number of pages migrated during the rate limiting time interval */ 672 /* Number of pages migrated during the rate limiting time interval */
682 unsigned long numabalancing_migrate_nr_pages; 673 unsigned long numabalancing_migrate_nr_pages;
683#endif 674#endif
675 /*
676 * This is a per-node reserve of pages that are not available
677 * to userspace allocations.
678 */
679 unsigned long totalreserve_pages;
680
681#ifdef CONFIG_NUMA
682 /*
683 * zone reclaim becomes active if more unmapped pages exist.
684 */
685 unsigned long min_unmapped_pages;
686 unsigned long min_slab_pages;
687#endif /* CONFIG_NUMA */
688
689 /* Write-intensive fields used by page reclaim */
690 ZONE_PADDING(_pad1_)
691 spinlock_t lru_lock;
684 692
685#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT 693#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
686 /* 694 /*
@@ -695,6 +703,23 @@ typedef struct pglist_data {
695 struct list_head split_queue; 703 struct list_head split_queue;
696 unsigned long split_queue_len; 704 unsigned long split_queue_len;
697#endif 705#endif
706
707 /* Fields commonly accessed by the page reclaim scanner */
708 struct lruvec lruvec;
709
710 /*
711 * The target ratio of ACTIVE_ANON to INACTIVE_ANON pages on
712 * this node's LRU. Maintained by the pageout code.
713 */
714 unsigned int inactive_ratio;
715
716 unsigned long flags;
717
718 ZONE_PADDING(_pad2_)
719
720 /* Per-node vmstats */
721 struct per_cpu_nodestat __percpu *per_cpu_nodestats;
722 atomic_long_t vm_stat[NR_VM_NODE_STAT_ITEMS];
698} pg_data_t; 723} pg_data_t;
699 724
700#define node_present_pages(nid) (NODE_DATA(nid)->node_present_pages) 725#define node_present_pages(nid) (NODE_DATA(nid)->node_present_pages)
@@ -708,6 +733,15 @@ typedef struct pglist_data {
708 733
709#define node_start_pfn(nid) (NODE_DATA(nid)->node_start_pfn) 734#define node_start_pfn(nid) (NODE_DATA(nid)->node_start_pfn)
710#define node_end_pfn(nid) pgdat_end_pfn(NODE_DATA(nid)) 735#define node_end_pfn(nid) pgdat_end_pfn(NODE_DATA(nid))
736static inline spinlock_t *zone_lru_lock(struct zone *zone)
737{
738 return &zone->zone_pgdat->lru_lock;
739}
740
741static inline struct lruvec *node_lruvec(struct pglist_data *pgdat)
742{
743 return &pgdat->lruvec;
744}
711 745
712static inline unsigned long pgdat_end_pfn(pg_data_t *pgdat) 746static inline unsigned long pgdat_end_pfn(pg_data_t *pgdat)
713{ 747{
@@ -760,12 +794,12 @@ extern int init_currently_empty_zone(struct zone *zone, unsigned long start_pfn,
760 794
761extern void lruvec_init(struct lruvec *lruvec); 795extern void lruvec_init(struct lruvec *lruvec);
762 796
763static inline struct zone *lruvec_zone(struct lruvec *lruvec) 797static inline struct pglist_data *lruvec_pgdat(struct lruvec *lruvec)
764{ 798{
765#ifdef CONFIG_MEMCG 799#ifdef CONFIG_MEMCG
766 return lruvec->zone; 800 return lruvec->pgdat;
767#else 801#else
768 return container_of(lruvec, struct zone, lruvec); 802 return container_of(lruvec, struct pglist_data, lruvec);
769#endif 803#endif
770} 804}
771 805
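
The mmzone.h hunks move lru_lock, the lruvec and inactive_ratio out of struct zone and into struct pglist_data, and add zone_lru_lock(), node_lruvec() and lruvec_pgdat() so zone-based callers keep compiling during the transition. A small pthread sketch of the resulting shape, one LRU lock per node shared by all of its zones (struct layouts and field names are simplified, not the kernel's):

#include <pthread.h>
#include <stdio.h>

struct lruvec { long nr_pages; };

struct pglist_data {
        pthread_mutex_t lru_lock;   /* was zone->lru_lock */
        struct lruvec lruvec;       /* was zone->lruvec   */
};

struct zone {
        struct pglist_data *zone_pgdat;
};

/* Mirrors zone_lru_lock(): every zone of a node shares one lock. */
static pthread_mutex_t *zone_lru_lock(struct zone *zone)
{
        return &zone->zone_pgdat->lru_lock;
}

static struct lruvec *node_lruvec(struct pglist_data *pgdat)
{
        return &pgdat->lruvec;
}

int main(void)
{
        struct pglist_data node = { .lru_lock = PTHREAD_MUTEX_INITIALIZER };
        struct zone normal  = { .zone_pgdat = &node };
        struct zone movable = { .zone_pgdat = &node };

        pthread_mutex_lock(zone_lru_lock(&normal));
        node_lruvec(&node)->nr_pages += 32;
        pthread_mutex_unlock(zone_lru_lock(&normal));

        printf("zones share one lru_lock: %s\n",
               zone_lru_lock(&normal) == zone_lru_lock(&movable) ? "yes" : "no");
        printf("node lru pages: %ld\n", node_lruvec(&node)->nr_pages);
        return 0;
}

The output makes the point of the helper visible: both zones resolve to the same lock instance, so LRU manipulation now serialises per node rather than per zone.
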
diff --git a/include/linux/oom.h b/include/linux/oom.h
index 606137b3b778..5bc0457ee3a8 100644
--- a/include/linux/oom.h
+++ b/include/linux/oom.h
@@ -73,9 +73,9 @@ static inline bool oom_task_origin(const struct task_struct *p)
73extern void mark_oom_victim(struct task_struct *tsk); 73extern void mark_oom_victim(struct task_struct *tsk);
74 74
75#ifdef CONFIG_MMU 75#ifdef CONFIG_MMU
76extern void try_oom_reaper(struct task_struct *tsk); 76extern void wake_oom_reaper(struct task_struct *tsk);
77#else 77#else
78static inline void try_oom_reaper(struct task_struct *tsk) 78static inline void wake_oom_reaper(struct task_struct *tsk)
79{ 79{
80} 80}
81#endif 81#endif
@@ -107,27 +107,7 @@ extern void oom_killer_enable(void);
107 107
108extern struct task_struct *find_lock_task_mm(struct task_struct *p); 108extern struct task_struct *find_lock_task_mm(struct task_struct *p);
109 109
110static inline bool task_will_free_mem(struct task_struct *task) 110bool task_will_free_mem(struct task_struct *task);
111{
112 struct signal_struct *sig = task->signal;
113
114 /*
115 * A coredumping process may sleep for an extended period in exit_mm(),
116 * so the oom killer cannot assume that the process will promptly exit
117 * and release memory.
118 */
119 if (sig->flags & SIGNAL_GROUP_COREDUMP)
120 return false;
121
122 if (!(task->flags & PF_EXITING))
123 return false;
124
125 /* Make sure that the whole thread group is going down */
126 if (!thread_group_empty(task) && !(sig->flags & SIGNAL_GROUP_EXIT))
127 return false;
128
129 return true;
130}
131 111
132/* sysctls */ 112/* sysctls */
133extern int sysctl_oom_dump_tasks; 113extern int sysctl_oom_dump_tasks;
diff --git a/include/linux/sched.h b/include/linux/sched.h
index d99218a1e043..553af2923824 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -523,6 +523,7 @@ static inline int get_dumpable(struct mm_struct *mm)
523#define MMF_HAS_UPROBES 19 /* has uprobes */ 523#define MMF_HAS_UPROBES 19 /* has uprobes */
524#define MMF_RECALC_UPROBES 20 /* MMF_HAS_UPROBES can be wrong */ 524#define MMF_RECALC_UPROBES 20 /* MMF_HAS_UPROBES can be wrong */
525#define MMF_OOM_REAPED 21 /* mm has been already reaped */ 525#define MMF_OOM_REAPED 21 /* mm has been already reaped */
526#define MMF_OOM_NOT_REAPABLE 22 /* mm couldn't be reaped */
526 527
527#define MMF_INIT_MASK (MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK) 528#define MMF_INIT_MASK (MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK)
528 529
@@ -1949,6 +1950,32 @@ static inline int tsk_nr_cpus_allowed(struct task_struct *p)
1949#define TNF_FAULT_LOCAL 0x08 1950#define TNF_FAULT_LOCAL 0x08
1950#define TNF_MIGRATE_FAIL 0x10 1951#define TNF_MIGRATE_FAIL 0x10
1951 1952
1953static inline bool in_vfork(struct task_struct *tsk)
1954{
1955 bool ret;
1956
1957 /*
1958 * need RCU to access ->real_parent if CLONE_VM was used along with
1959 * CLONE_PARENT.
1960 *
1961 * We check real_parent->mm == tsk->mm because CLONE_VFORK does not
1962 * imply CLONE_VM
1963 *
1964 * CLONE_VFORK can be used with CLONE_PARENT/CLONE_THREAD and thus
1965 * ->real_parent is not necessarily the task doing vfork(), so in
1966 * theory we can't rely on task_lock() if we want to dereference it.
1967 *
1968 * And in this case we can't trust the real_parent->mm == tsk->mm
1969 * check, it can be false negative. But we do not care, if init or
1970 * another oom-unkillable task does this it should blame itself.
1971 */
1972 rcu_read_lock();
1973 ret = tsk->vfork_done && tsk->real_parent->mm == tsk->mm;
1974 rcu_read_unlock();
1975
1976 return ret;
1977}
1978
1952#ifdef CONFIG_NUMA_BALANCING 1979#ifdef CONFIG_NUMA_BALANCING
1953extern void task_numa_fault(int last_node, int node, int pages, int flags); 1980extern void task_numa_fault(int last_node, int node, int pages, int flags);
1954extern pid_t task_numa_group_id(struct task_struct *p); 1981extern pid_t task_numa_group_id(struct task_struct *p);
diff --git a/include/linux/slab_def.h b/include/linux/slab_def.h
index 339ba027ade9..4ad2c5a26399 100644
--- a/include/linux/slab_def.h
+++ b/include/linux/slab_def.h
@@ -88,7 +88,8 @@ struct kmem_cache {
88}; 88};
89 89
90static inline void *nearest_obj(struct kmem_cache *cache, struct page *page, 90static inline void *nearest_obj(struct kmem_cache *cache, struct page *page,
91 void *x) { 91 void *x)
92{
92 void *object = x - (x - page->s_mem) % cache->size; 93 void *object = x - (x - page->s_mem) % cache->size;
93 void *last_object = page->s_mem + (cache->num - 1) * cache->size; 94 void *last_object = page->s_mem + (cache->num - 1) * cache->size;
94 95
diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h
index 5624c1f3eb0a..75f56c2ef2d4 100644
--- a/include/linux/slub_def.h
+++ b/include/linux/slub_def.h
@@ -104,6 +104,10 @@ struct kmem_cache {
104 unsigned int *random_seq; 104 unsigned int *random_seq;
105#endif 105#endif
106 106
107#ifdef CONFIG_KASAN
108 struct kasan_cache kasan_info;
109#endif
110
107 struct kmem_cache_node *node[MAX_NUMNODES]; 111 struct kmem_cache_node *node[MAX_NUMNODES];
108}; 112};
109 113
@@ -119,15 +123,17 @@ static inline void sysfs_slab_remove(struct kmem_cache *s)
119void object_err(struct kmem_cache *s, struct page *page, 123void object_err(struct kmem_cache *s, struct page *page,
120 u8 *object, char *reason); 124 u8 *object, char *reason);
121 125
126void *fixup_red_left(struct kmem_cache *s, void *p);
127
122static inline void *nearest_obj(struct kmem_cache *cache, struct page *page, 128static inline void *nearest_obj(struct kmem_cache *cache, struct page *page,
123 void *x) { 129 void *x) {
124 void *object = x - (x - page_address(page)) % cache->size; 130 void *object = x - (x - page_address(page)) % cache->size;
125 void *last_object = page_address(page) + 131 void *last_object = page_address(page) +
126 (page->objects - 1) * cache->size; 132 (page->objects - 1) * cache->size;
127 if (unlikely(object > last_object)) 133 void *result = (unlikely(object > last_object)) ? last_object : object;
128 return last_object; 134
129 else 135 result = fixup_red_left(cache, result);
130 return object; 136 return result;
131} 137}
132 138
133#endif /* _LINUX_SLUB_DEF_H */ 139#endif /* _LINUX_SLUB_DEF_H */
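
The SLUB nearest_obj() rewrite rounds an interior pointer down to the start of its object, clamps it to the last object on the page, and then calls the new fixup_red_left() so KASAN gets a pointer to the object payload rather than into its left red zone. A standalone sketch of that pointer arithmetic follows; red_left_pad is an invented constant standing in for whatever offset fixup_red_left() applies in a debug cache:

#include <stddef.h>
#include <stdio.h>

struct fake_cache {
        size_t size;          /* stride between objects, padding included */
        size_t red_left_pad;  /* stand-in for the fixup_red_left() offset */
        unsigned int objects; /* objects stored in one "page"             */
};

/*
 * Round an interior pointer down to its object, clamp to the last object,
 * then step over the left red zone -- the shape of the new nearest_obj().
 */
static char *nearest_obj(const struct fake_cache *c, char *base, char *x)
{
        char *object = x - (size_t)(x - base) % c->size;
        char *last_object = base + (c->objects - 1) * c->size;
        char *result = object > last_object ? last_object : object;

        return result + c->red_left_pad;
}

int main(void)
{
        static char page[4096];
        struct fake_cache c = { .size = 256, .red_left_pad = 16, .objects = 15 };

        char *inside = page + 3 * 256 + 100;  /* somewhere inside object 3 */
        printf("object 3 payload at offset %td\n",
               nearest_obj(&c, page, inside) - page);

        char *slack = page + 15 * 256 + 40;   /* past the last real object */
        printf("clamped to offset %td\n", nearest_obj(&c, page, slack) - page);
        return 0;
}

With a 256-byte stride the pointer at offset 868 resolves to the object at 768 (payload at 784), and a pointer landing in the trailing slack clamps to the final object at 3584.
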
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 0af2bb2028fd..b17cc4830fa6 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -157,15 +157,6 @@ enum {
157#define SWAP_CLUSTER_MAX 32UL 157#define SWAP_CLUSTER_MAX 32UL
158#define COMPACT_CLUSTER_MAX SWAP_CLUSTER_MAX 158#define COMPACT_CLUSTER_MAX SWAP_CLUSTER_MAX
159 159
160/*
161 * Ratio between zone->managed_pages and the "gap" that above the per-zone
162 * "high_wmark". While balancing nodes, We allow kswapd to shrink zones that
163 * do not meet the (high_wmark + gap) watermark, even which already met the
164 * high_wmark, in order to provide better per-zone lru behavior. We are ok to
165 * spend not more than 1% of the memory for this zone balancing "gap".
166 */
167#define KSWAPD_ZONE_BALANCE_GAP_RATIO 100
168
169#define SWAP_MAP_MAX 0x3e /* Max duplication count, in first swap_map */ 160#define SWAP_MAP_MAX 0x3e /* Max duplication count, in first swap_map */
170#define SWAP_MAP_BAD 0x3f /* Note pageblock is bad, in first swap_map */ 161#define SWAP_MAP_BAD 0x3f /* Note pageblock is bad, in first swap_map */
171#define SWAP_HAS_CACHE 0x40 /* Flag page is cached, in first swap_map */ 162#define SWAP_HAS_CACHE 0x40 /* Flag page is cached, in first swap_map */
@@ -317,6 +308,7 @@ extern void lru_cache_add_active_or_unevictable(struct page *page,
317 308
318/* linux/mm/vmscan.c */ 309/* linux/mm/vmscan.c */
319extern unsigned long zone_reclaimable_pages(struct zone *zone); 310extern unsigned long zone_reclaimable_pages(struct zone *zone);
311extern unsigned long pgdat_reclaimable_pages(struct pglist_data *pgdat);
320extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order, 312extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
321 gfp_t gfp_mask, nodemask_t *mask); 313 gfp_t gfp_mask, nodemask_t *mask);
322extern int __isolate_lru_page(struct page *page, isolate_mode_t mode); 314extern int __isolate_lru_page(struct page *page, isolate_mode_t mode);
@@ -324,9 +316,9 @@ extern unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
324 unsigned long nr_pages, 316 unsigned long nr_pages,
325 gfp_t gfp_mask, 317 gfp_t gfp_mask,
326 bool may_swap); 318 bool may_swap);
327extern unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem, 319extern unsigned long mem_cgroup_shrink_node(struct mem_cgroup *mem,
328 gfp_t gfp_mask, bool noswap, 320 gfp_t gfp_mask, bool noswap,
329 struct zone *zone, 321 pg_data_t *pgdat,
330 unsigned long *nr_scanned); 322 unsigned long *nr_scanned);
331extern unsigned long shrink_all_memory(unsigned long nr_pages); 323extern unsigned long shrink_all_memory(unsigned long nr_pages);
332extern int vm_swappiness; 324extern int vm_swappiness;
@@ -334,13 +326,14 @@ extern int remove_mapping(struct address_space *mapping, struct page *page);
334extern unsigned long vm_total_pages; 326extern unsigned long vm_total_pages;
335 327
336#ifdef CONFIG_NUMA 328#ifdef CONFIG_NUMA
337extern int zone_reclaim_mode; 329extern int node_reclaim_mode;
338extern int sysctl_min_unmapped_ratio; 330extern int sysctl_min_unmapped_ratio;
339extern int sysctl_min_slab_ratio; 331extern int sysctl_min_slab_ratio;
340extern int zone_reclaim(struct zone *, gfp_t, unsigned int); 332extern int node_reclaim(struct pglist_data *, gfp_t, unsigned int);
341#else 333#else
342#define zone_reclaim_mode 0 334#define node_reclaim_mode 0
343static inline int zone_reclaim(struct zone *z, gfp_t mask, unsigned int order) 335static inline int node_reclaim(struct pglist_data *pgdat, gfp_t mask,
336 unsigned int order)
344{ 337{
345 return 0; 338 return 0;
346} 339}
diff --git a/include/linux/topology.h b/include/linux/topology.h
index afce69296ac0..cb0775e1ee4b 100644
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -54,7 +54,7 @@ int arch_update_cpu_topology(void);
54/* 54/*
55 * If the distance between nodes in a system is larger than RECLAIM_DISTANCE 55 * If the distance between nodes in a system is larger than RECLAIM_DISTANCE
56 * (in whatever arch specific measurement units returned by node_distance()) 56 * (in whatever arch specific measurement units returned by node_distance())
57 * and zone_reclaim_mode is enabled then the VM will only call zone_reclaim() 57 * and node_reclaim_mode is enabled then the VM will only call node_reclaim()
58 * on nodes within this distance. 58 * on nodes within this distance.
59 */ 59 */
60#define RECLAIM_DISTANCE 30 60#define RECLAIM_DISTANCE 30
diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h
index 42604173f122..4d6ec58a8d45 100644
--- a/include/linux/vm_event_item.h
+++ b/include/linux/vm_event_item.h
@@ -23,21 +23,23 @@
23 23
24enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, 24enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
25 FOR_ALL_ZONES(PGALLOC), 25 FOR_ALL_ZONES(PGALLOC),
26 FOR_ALL_ZONES(ALLOCSTALL),
27 FOR_ALL_ZONES(PGSCAN_SKIP),
26 PGFREE, PGACTIVATE, PGDEACTIVATE, 28 PGFREE, PGACTIVATE, PGDEACTIVATE,
27 PGFAULT, PGMAJFAULT, 29 PGFAULT, PGMAJFAULT,
28 PGLAZYFREED, 30 PGLAZYFREED,
29 FOR_ALL_ZONES(PGREFILL), 31 PGREFILL,
30 FOR_ALL_ZONES(PGSTEAL_KSWAPD), 32 PGSTEAL_KSWAPD,
31 FOR_ALL_ZONES(PGSTEAL_DIRECT), 33 PGSTEAL_DIRECT,
32 FOR_ALL_ZONES(PGSCAN_KSWAPD), 34 PGSCAN_KSWAPD,
33 FOR_ALL_ZONES(PGSCAN_DIRECT), 35 PGSCAN_DIRECT,
34 PGSCAN_DIRECT_THROTTLE, 36 PGSCAN_DIRECT_THROTTLE,
35#ifdef CONFIG_NUMA 37#ifdef CONFIG_NUMA
36 PGSCAN_ZONE_RECLAIM_FAILED, 38 PGSCAN_ZONE_RECLAIM_FAILED,
37#endif 39#endif
38 PGINODESTEAL, SLABS_SCANNED, KSWAPD_INODESTEAL, 40 PGINODESTEAL, SLABS_SCANNED, KSWAPD_INODESTEAL,
39 KSWAPD_LOW_WMARK_HIT_QUICKLY, KSWAPD_HIGH_WMARK_HIT_QUICKLY, 41 KSWAPD_LOW_WMARK_HIT_QUICKLY, KSWAPD_HIGH_WMARK_HIT_QUICKLY,
40 PAGEOUTRUN, ALLOCSTALL, PGROTATED, 42 PAGEOUTRUN, PGROTATED,
41 DROP_PAGECACHE, DROP_SLAB, 43 DROP_PAGECACHE, DROP_SLAB,
42#ifdef CONFIG_NUMA_BALANCING 44#ifdef CONFIG_NUMA_BALANCING
43 NUMA_PTE_UPDATES, 45 NUMA_PTE_UPDATES,
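
FOR_ALL_ZONES() is a token-pasting macro that stamps out one counter per zone, which is how ALLOCSTALL and PGSCAN_SKIP gain per-zone variants here while PGREFILL, PGSTEAL_* and PGSCAN_* collapse into single node-wide events. An abbreviated stand-in for the macro that shows the expansion pattern (the real zone list varies with kernel configuration, so the names below are illustrative):

#include <stdio.h>

/* Abbreviated stand-in for the kernel's FOR_ALL_ZONES() macro. */
#define FOR_ALL_ZONES(xx) xx##_DMA, xx##_NORMAL, xx##_MOVABLE

enum vm_event_item {
        PGPGIN, PGPGOUT,
        FOR_ALL_ZONES(PGALLOC),    /* expands to PGALLOC_DMA, PGALLOC_NORMAL, ... */
        FOR_ALL_ZONES(ALLOCSTALL), /* per-zone after this series                  */
        PGREFILL,                  /* now a single node-wide counter              */
        PGSTEAL_KSWAPD,
        NR_VM_EVENT_ITEMS
};

int main(void)
{
        printf("PGALLOC_NORMAL     = %d\n", PGALLOC_NORMAL);
        printf("ALLOCSTALL_MOVABLE = %d\n", ALLOCSTALL_MOVABLE);
        printf("total event items  = %d\n", NR_VM_EVENT_ITEMS);
        return 0;
}

With this three-zone stand-in the enum ends up with ten entries; the kernel's list grows or shrinks with options such as CONFIG_ZONE_DMA and CONFIG_HIGHMEM.
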
diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
index d2da8e053210..613771909b6e 100644
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -101,25 +101,42 @@ static inline void vm_events_fold_cpu(int cpu)
101#define count_vm_vmacache_event(x) do {} while (0) 101#define count_vm_vmacache_event(x) do {} while (0)
102#endif 102#endif
103 103
104#define __count_zone_vm_events(item, zone, delta) \ 104#define __count_zid_vm_events(item, zid, delta) \
105 __count_vm_events(item##_NORMAL - ZONE_NORMAL + \ 105 __count_vm_events(item##_NORMAL - ZONE_NORMAL + zid, delta)
106 zone_idx(zone), delta)
107 106
108/* 107/*
109 * Zone based page accounting with per cpu differentials. 108 * Zone and node-based page accounting with per cpu differentials.
110 */ 109 */
111extern atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS]; 110extern atomic_long_t vm_zone_stat[NR_VM_ZONE_STAT_ITEMS];
111extern atomic_long_t vm_node_stat[NR_VM_NODE_STAT_ITEMS];
112 112
113static inline void zone_page_state_add(long x, struct zone *zone, 113static inline void zone_page_state_add(long x, struct zone *zone,
114 enum zone_stat_item item) 114 enum zone_stat_item item)
115{ 115{
116 atomic_long_add(x, &zone->vm_stat[item]); 116 atomic_long_add(x, &zone->vm_stat[item]);
117 atomic_long_add(x, &vm_stat[item]); 117 atomic_long_add(x, &vm_zone_stat[item]);
118}
119
120static inline void node_page_state_add(long x, struct pglist_data *pgdat,
121 enum node_stat_item item)
122{
123 atomic_long_add(x, &pgdat->vm_stat[item]);
124 atomic_long_add(x, &vm_node_stat[item]);
118} 125}
119 126
120static inline unsigned long global_page_state(enum zone_stat_item item) 127static inline unsigned long global_page_state(enum zone_stat_item item)
121{ 128{
122 long x = atomic_long_read(&vm_stat[item]); 129 long x = atomic_long_read(&vm_zone_stat[item]);
130#ifdef CONFIG_SMP
131 if (x < 0)
132 x = 0;
133#endif
134 return x;
135}
136
137static inline unsigned long global_node_page_state(enum node_stat_item item)
138{
139 long x = atomic_long_read(&vm_node_stat[item]);
123#ifdef CONFIG_SMP 140#ifdef CONFIG_SMP
124 if (x < 0) 141 if (x < 0)
125 x = 0; 142 x = 0;
@@ -160,32 +177,61 @@ static inline unsigned long zone_page_state_snapshot(struct zone *zone,
160 return x; 177 return x;
161} 178}
162 179
163#ifdef CONFIG_NUMA 180static inline unsigned long node_page_state_snapshot(pg_data_t *pgdat,
181 enum node_stat_item item)
182{
183 long x = atomic_long_read(&pgdat->vm_stat[item]);
164 184
165extern unsigned long node_page_state(int node, enum zone_stat_item item); 185#ifdef CONFIG_SMP
186 int cpu;
187 for_each_online_cpu(cpu)
188 x += per_cpu_ptr(pgdat->per_cpu_nodestats, cpu)->vm_node_stat_diff[item];
166 189
167#else 190 if (x < 0)
191 x = 0;
192#endif
193 return x;
194}
168 195
169#define node_page_state(node, item) global_page_state(item)
170 196
197#ifdef CONFIG_NUMA
198extern unsigned long sum_zone_node_page_state(int node,
199 enum zone_stat_item item);
200extern unsigned long node_page_state(struct pglist_data *pgdat,
201 enum node_stat_item item);
202#else
203#define sum_zone_node_page_state(node, item) global_page_state(item)
204#define node_page_state(node, item) global_node_page_state(item)
171#endif /* CONFIG_NUMA */ 205#endif /* CONFIG_NUMA */
172 206
173#define add_zone_page_state(__z, __i, __d) mod_zone_page_state(__z, __i, __d) 207#define add_zone_page_state(__z, __i, __d) mod_zone_page_state(__z, __i, __d)
174#define sub_zone_page_state(__z, __i, __d) mod_zone_page_state(__z, __i, -(__d)) 208#define sub_zone_page_state(__z, __i, __d) mod_zone_page_state(__z, __i, -(__d))
209#define add_node_page_state(__p, __i, __d) mod_node_page_state(__p, __i, __d)
210#define sub_node_page_state(__p, __i, __d) mod_node_page_state(__p, __i, -(__d))
175 211
176#ifdef CONFIG_SMP 212#ifdef CONFIG_SMP
177void __mod_zone_page_state(struct zone *, enum zone_stat_item item, long); 213void __mod_zone_page_state(struct zone *, enum zone_stat_item item, long);
178void __inc_zone_page_state(struct page *, enum zone_stat_item); 214void __inc_zone_page_state(struct page *, enum zone_stat_item);
179void __dec_zone_page_state(struct page *, enum zone_stat_item); 215void __dec_zone_page_state(struct page *, enum zone_stat_item);
180 216
217void __mod_node_page_state(struct pglist_data *, enum node_stat_item item, long);
218void __inc_node_page_state(struct page *, enum node_stat_item);
219void __dec_node_page_state(struct page *, enum node_stat_item);
220
181void mod_zone_page_state(struct zone *, enum zone_stat_item, long); 221void mod_zone_page_state(struct zone *, enum zone_stat_item, long);
182void inc_zone_page_state(struct page *, enum zone_stat_item); 222void inc_zone_page_state(struct page *, enum zone_stat_item);
183void dec_zone_page_state(struct page *, enum zone_stat_item); 223void dec_zone_page_state(struct page *, enum zone_stat_item);
184 224
185extern void inc_zone_state(struct zone *, enum zone_stat_item); 225void mod_node_page_state(struct pglist_data *, enum node_stat_item, long);
226void inc_node_page_state(struct page *, enum node_stat_item);
227void dec_node_page_state(struct page *, enum node_stat_item);
228
229extern void inc_node_state(struct pglist_data *, enum node_stat_item);
186extern void __inc_zone_state(struct zone *, enum zone_stat_item); 230extern void __inc_zone_state(struct zone *, enum zone_stat_item);
231extern void __inc_node_state(struct pglist_data *, enum node_stat_item);
187extern void dec_zone_state(struct zone *, enum zone_stat_item); 232extern void dec_zone_state(struct zone *, enum zone_stat_item);
188extern void __dec_zone_state(struct zone *, enum zone_stat_item); 233extern void __dec_zone_state(struct zone *, enum zone_stat_item);
234extern void __dec_node_state(struct pglist_data *, enum node_stat_item);
189 235
190void quiet_vmstat(void); 236void quiet_vmstat(void);
191void cpu_vm_stats_fold(int cpu); 237void cpu_vm_stats_fold(int cpu);
@@ -213,16 +259,34 @@ static inline void __mod_zone_page_state(struct zone *zone,
213 zone_page_state_add(delta, zone, item); 259 zone_page_state_add(delta, zone, item);
214} 260}
215 261
262static inline void __mod_node_page_state(struct pglist_data *pgdat,
263 enum node_stat_item item, int delta)
264{
265 node_page_state_add(delta, pgdat, item);
266}
267
216static inline void __inc_zone_state(struct zone *zone, enum zone_stat_item item) 268static inline void __inc_zone_state(struct zone *zone, enum zone_stat_item item)
217{ 269{
218 atomic_long_inc(&zone->vm_stat[item]); 270 atomic_long_inc(&zone->vm_stat[item]);
219 atomic_long_inc(&vm_stat[item]); 271 atomic_long_inc(&vm_zone_stat[item]);
272}
273
274static inline void __inc_node_state(struct pglist_data *pgdat, enum node_stat_item item)
275{
276 atomic_long_inc(&pgdat->vm_stat[item]);
277 atomic_long_inc(&vm_node_stat[item]);
220} 278}
221 279
222static inline void __dec_zone_state(struct zone *zone, enum zone_stat_item item) 280static inline void __dec_zone_state(struct zone *zone, enum zone_stat_item item)
223{ 281{
224 atomic_long_dec(&zone->vm_stat[item]); 282 atomic_long_dec(&zone->vm_stat[item]);
225 atomic_long_dec(&vm_stat[item]); 283 atomic_long_dec(&vm_zone_stat[item]);
284}
285
286static inline void __dec_node_state(struct pglist_data *pgdat, enum node_stat_item item)
287{
288 atomic_long_dec(&pgdat->vm_stat[item]);
289 atomic_long_dec(&vm_node_stat[item]);
226} 290}
227 291
228static inline void __inc_zone_page_state(struct page *page, 292static inline void __inc_zone_page_state(struct page *page,
@@ -231,12 +295,26 @@ static inline void __inc_zone_page_state(struct page *page,
231 __inc_zone_state(page_zone(page), item); 295 __inc_zone_state(page_zone(page), item);
232} 296}
233 297
298static inline void __inc_node_page_state(struct page *page,
299 enum node_stat_item item)
300{
301 __inc_node_state(page_pgdat(page), item);
302}
303
304
234static inline void __dec_zone_page_state(struct page *page, 305static inline void __dec_zone_page_state(struct page *page,
235 enum zone_stat_item item) 306 enum zone_stat_item item)
236{ 307{
237 __dec_zone_state(page_zone(page), item); 308 __dec_zone_state(page_zone(page), item);
238} 309}
239 310
311static inline void __dec_node_page_state(struct page *page,
312 enum node_stat_item item)
313{
314 __dec_node_state(page_pgdat(page), item);
315}
316
317
240/* 318/*
241 * We only use atomic operations to update counters. So there is no need to 319 * We only use atomic operations to update counters. So there is no need to
242 * disable interrupts. 320 * disable interrupts.
@@ -245,7 +323,12 @@ static inline void __dec_zone_page_state(struct page *page,
245#define dec_zone_page_state __dec_zone_page_state 323#define dec_zone_page_state __dec_zone_page_state
246#define mod_zone_page_state __mod_zone_page_state 324#define mod_zone_page_state __mod_zone_page_state
247 325
326#define inc_node_page_state __inc_node_page_state
327#define dec_node_page_state __dec_node_page_state
328#define mod_node_page_state __mod_node_page_state
329
248#define inc_zone_state __inc_zone_state 330#define inc_zone_state __inc_zone_state
331#define inc_node_state __inc_node_state
249#define dec_zone_state __dec_zone_state 332#define dec_zone_state __dec_zone_state
250 333
251#define set_pgdat_percpu_threshold(pgdat, callback) { } 334#define set_pgdat_percpu_threshold(pgdat, callback) { }
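
The vmstat.h hunk doubles the accounting machinery: node counters get their own global array (vm_node_stat), their own per-pgdat atomics plus per-CPU diffs (per_cpu_nodestat), and node_page_state_snapshot() folds the unflushed per-CPU deltas into the atomic total and clamps the result at zero. A userspace model of that per-CPU differential scheme using C11 atomics; the CPU count and threshold are invented and nothing below is kernel API:

#include <stdatomic.h>
#include <stdio.h>

#define NR_FAKE_CPUS 4

enum node_stat_item { NR_FILE_DIRTY, NR_WRITEBACK, NR_VM_NODE_STAT_ITEMS };

struct fake_pgdat {
        atomic_long vm_stat[NR_VM_NODE_STAT_ITEMS];
        /* models per_cpu_nodestat::vm_node_stat_diff */
        signed char vm_node_stat_diff[NR_FAKE_CPUS][NR_VM_NODE_STAT_ITEMS];
};

/* Buffer small deltas per CPU; fold them into the atomic once they grow. */
static void mod_node_page_state(struct fake_pgdat *pgdat, int cpu,
                                enum node_stat_item item, int delta)
{
        signed char *diff = &pgdat->vm_node_stat_diff[cpu][item];
        int v = *diff + delta;

        if (v > 32 || v < -32) {      /* crude stand-in for stat_threshold */
                atomic_fetch_add(&pgdat->vm_stat[item], v);
                *diff = 0;
        } else {
                *diff = (signed char)v;
        }
}

/* Mirrors node_page_state_snapshot(): atomic total + unflushed diffs. */
static long node_page_state_snapshot(struct fake_pgdat *pgdat,
                                     enum node_stat_item item)
{
        long x = atomic_load(&pgdat->vm_stat[item]);

        for (int cpu = 0; cpu < NR_FAKE_CPUS; cpu++)
                x += pgdat->vm_node_stat_diff[cpu][item];

        return x < 0 ? 0 : x;         /* clamp, as the kernel variants do */
}

int main(void)
{
        static struct fake_pgdat node;

        mod_node_page_state(&node, 0, NR_FILE_DIRTY, 20);  /* stays buffered    */
        mod_node_page_state(&node, 1, NR_FILE_DIRTY, 40);  /* folds into atomic */
        mod_node_page_state(&node, 2, NR_FILE_DIRTY, -5);  /* stays buffered    */

        printf("dirty, snapshot:    %ld\n",
               node_page_state_snapshot(&node, NR_FILE_DIRTY));
        printf("dirty, atomic only: %ld\n",
               atomic_load(&node.vm_stat[NR_FILE_DIRTY]));
        return 0;
}

The snapshot (55) differs from the raw atomic (40) by exactly the deltas still buffered per CPU, which is what the _snapshot variants are for: paying for a per-CPU walk only where an accurate value matters.
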
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index 717e6149e753..fc1e16c25a29 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -320,7 +320,7 @@ void laptop_mode_timer_fn(unsigned long data);
320static inline void laptop_sync_completion(void) { } 320static inline void laptop_sync_completion(void) { }
321#endif 321#endif
322void throttle_vm_writeout(gfp_t gfp_mask); 322void throttle_vm_writeout(gfp_t gfp_mask);
323bool zone_dirty_ok(struct zone *zone); 323bool node_dirty_ok(struct pglist_data *pgdat);
324int wb_domain_init(struct wb_domain *dom, gfp_t gfp); 324int wb_domain_init(struct wb_domain *dom, gfp_t gfp);
325#ifdef CONFIG_CGROUP_WRITEBACK 325#ifdef CONFIG_CGROUP_WRITEBACK
326void wb_domain_exit(struct wb_domain *dom); 326void wb_domain_exit(struct wb_domain *dom);
diff --git a/include/trace/events/compaction.h b/include/trace/events/compaction.h
index 36e2d6fb1360..c2ba402ab256 100644
--- a/include/trace/events/compaction.h
+++ b/include/trace/events/compaction.h
@@ -226,26 +226,26 @@ TRACE_EVENT(mm_compaction_try_to_compact_pages,
226 TP_PROTO( 226 TP_PROTO(
227 int order, 227 int order,
228 gfp_t gfp_mask, 228 gfp_t gfp_mask,
229 enum migrate_mode mode), 229 int prio),
230 230
231 TP_ARGS(order, gfp_mask, mode), 231 TP_ARGS(order, gfp_mask, prio),
232 232
233 TP_STRUCT__entry( 233 TP_STRUCT__entry(
234 __field(int, order) 234 __field(int, order)
235 __field(gfp_t, gfp_mask) 235 __field(gfp_t, gfp_mask)
236 __field(enum migrate_mode, mode) 236 __field(int, prio)
237 ), 237 ),
238 238
239 TP_fast_assign( 239 TP_fast_assign(
240 __entry->order = order; 240 __entry->order = order;
241 __entry->gfp_mask = gfp_mask; 241 __entry->gfp_mask = gfp_mask;
242 __entry->mode = mode; 242 __entry->prio = prio;
243 ), 243 ),
244 244
245 TP_printk("order=%d gfp_mask=0x%x mode=%d", 245 TP_printk("order=%d gfp_mask=0x%x priority=%d",
246 __entry->order, 246 __entry->order,
247 __entry->gfp_mask, 247 __entry->gfp_mask,
248 (int)__entry->mode) 248 __entry->prio)
249); 249);
250 250
251DECLARE_EVENT_CLASS(mm_compaction_suitable_template, 251DECLARE_EVENT_CLASS(mm_compaction_suitable_template,
diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h
index 43cedbf0c759..5a81ab48a2fb 100644
--- a/include/trace/events/mmflags.h
+++ b/include/trace/events/mmflags.h
@@ -11,6 +11,7 @@
11 11
12#define __def_gfpflag_names \ 12#define __def_gfpflag_names \
13 {(unsigned long)GFP_TRANSHUGE, "GFP_TRANSHUGE"}, \ 13 {(unsigned long)GFP_TRANSHUGE, "GFP_TRANSHUGE"}, \
14 {(unsigned long)GFP_TRANSHUGE_LIGHT, "GFP_TRANSHUGE_LIGHT"}, \
14 {(unsigned long)GFP_HIGHUSER_MOVABLE, "GFP_HIGHUSER_MOVABLE"},\ 15 {(unsigned long)GFP_HIGHUSER_MOVABLE, "GFP_HIGHUSER_MOVABLE"},\
15 {(unsigned long)GFP_HIGHUSER, "GFP_HIGHUSER"}, \ 16 {(unsigned long)GFP_HIGHUSER, "GFP_HIGHUSER"}, \
16 {(unsigned long)GFP_USER, "GFP_USER"}, \ 17 {(unsigned long)GFP_USER, "GFP_USER"}, \
diff --git a/include/trace/events/vmscan.h b/include/trace/events/vmscan.h
index 0101ef37f1ee..c88fd0934e7e 100644
--- a/include/trace/events/vmscan.h
+++ b/include/trace/events/vmscan.h
@@ -55,21 +55,23 @@ TRACE_EVENT(mm_vmscan_kswapd_sleep,
55 55
56TRACE_EVENT(mm_vmscan_kswapd_wake, 56TRACE_EVENT(mm_vmscan_kswapd_wake,
57 57
58 TP_PROTO(int nid, int order), 58 TP_PROTO(int nid, int zid, int order),
59 59
60 TP_ARGS(nid, order), 60 TP_ARGS(nid, zid, order),
61 61
62 TP_STRUCT__entry( 62 TP_STRUCT__entry(
63 __field( int, nid ) 63 __field( int, nid )
64 __field( int, zid )
64 __field( int, order ) 65 __field( int, order )
65 ), 66 ),
66 67
67 TP_fast_assign( 68 TP_fast_assign(
68 __entry->nid = nid; 69 __entry->nid = nid;
70 __entry->zid = zid;
69 __entry->order = order; 71 __entry->order = order;
70 ), 72 ),
71 73
72 TP_printk("nid=%d order=%d", __entry->nid, __entry->order) 74 TP_printk("nid=%d zid=%d order=%d", __entry->nid, __entry->zid, __entry->order)
73); 75);
74 76
75TRACE_EVENT(mm_vmscan_wakeup_kswapd, 77TRACE_EVENT(mm_vmscan_wakeup_kswapd,
@@ -98,47 +100,50 @@ TRACE_EVENT(mm_vmscan_wakeup_kswapd,
98 100
99DECLARE_EVENT_CLASS(mm_vmscan_direct_reclaim_begin_template, 101DECLARE_EVENT_CLASS(mm_vmscan_direct_reclaim_begin_template,
100 102
101 TP_PROTO(int order, int may_writepage, gfp_t gfp_flags), 103 TP_PROTO(int order, int may_writepage, gfp_t gfp_flags, int classzone_idx),
102 104
103 TP_ARGS(order, may_writepage, gfp_flags), 105 TP_ARGS(order, may_writepage, gfp_flags, classzone_idx),
104 106
105 TP_STRUCT__entry( 107 TP_STRUCT__entry(
106 __field( int, order ) 108 __field( int, order )
107 __field( int, may_writepage ) 109 __field( int, may_writepage )
108 __field( gfp_t, gfp_flags ) 110 __field( gfp_t, gfp_flags )
111 __field( int, classzone_idx )
109 ), 112 ),
110 113
111 TP_fast_assign( 114 TP_fast_assign(
112 __entry->order = order; 115 __entry->order = order;
113 __entry->may_writepage = may_writepage; 116 __entry->may_writepage = may_writepage;
114 __entry->gfp_flags = gfp_flags; 117 __entry->gfp_flags = gfp_flags;
118 __entry->classzone_idx = classzone_idx;
115 ), 119 ),
116 120
117 TP_printk("order=%d may_writepage=%d gfp_flags=%s", 121 TP_printk("order=%d may_writepage=%d gfp_flags=%s classzone_idx=%d",
118 __entry->order, 122 __entry->order,
119 __entry->may_writepage, 123 __entry->may_writepage,
120 show_gfp_flags(__entry->gfp_flags)) 124 show_gfp_flags(__entry->gfp_flags),
125 __entry->classzone_idx)
121); 126);
122 127
123DEFINE_EVENT(mm_vmscan_direct_reclaim_begin_template, mm_vmscan_direct_reclaim_begin, 128DEFINE_EVENT(mm_vmscan_direct_reclaim_begin_template, mm_vmscan_direct_reclaim_begin,
124 129
125 TP_PROTO(int order, int may_writepage, gfp_t gfp_flags), 130 TP_PROTO(int order, int may_writepage, gfp_t gfp_flags, int classzone_idx),
126 131
127 TP_ARGS(order, may_writepage, gfp_flags) 132 TP_ARGS(order, may_writepage, gfp_flags, classzone_idx)
128); 133);
129 134
130DEFINE_EVENT(mm_vmscan_direct_reclaim_begin_template, mm_vmscan_memcg_reclaim_begin, 135DEFINE_EVENT(mm_vmscan_direct_reclaim_begin_template, mm_vmscan_memcg_reclaim_begin,
131 136
132 TP_PROTO(int order, int may_writepage, gfp_t gfp_flags), 137 TP_PROTO(int order, int may_writepage, gfp_t gfp_flags, int classzone_idx),
133 138
134 TP_ARGS(order, may_writepage, gfp_flags) 139 TP_ARGS(order, may_writepage, gfp_flags, classzone_idx)
135); 140);
136 141
137DEFINE_EVENT(mm_vmscan_direct_reclaim_begin_template, mm_vmscan_memcg_softlimit_reclaim_begin, 142DEFINE_EVENT(mm_vmscan_direct_reclaim_begin_template, mm_vmscan_memcg_softlimit_reclaim_begin,
138 143
139 TP_PROTO(int order, int may_writepage, gfp_t gfp_flags), 144 TP_PROTO(int order, int may_writepage, gfp_t gfp_flags, int classzone_idx),
140 145
141 TP_ARGS(order, may_writepage, gfp_flags) 146 TP_ARGS(order, may_writepage, gfp_flags, classzone_idx)
142); 147);
143 148
144DECLARE_EVENT_CLASS(mm_vmscan_direct_reclaim_end_template, 149DECLARE_EVENT_CLASS(mm_vmscan_direct_reclaim_end_template,
@@ -266,16 +271,18 @@ TRACE_EVENT(mm_shrink_slab_end,
266 271
267DECLARE_EVENT_CLASS(mm_vmscan_lru_isolate_template, 272DECLARE_EVENT_CLASS(mm_vmscan_lru_isolate_template,
268 273
269 TP_PROTO(int order, 274 TP_PROTO(int classzone_idx,
275 int order,
270 unsigned long nr_requested, 276 unsigned long nr_requested,
271 unsigned long nr_scanned, 277 unsigned long nr_scanned,
272 unsigned long nr_taken, 278 unsigned long nr_taken,
273 isolate_mode_t isolate_mode, 279 isolate_mode_t isolate_mode,
274 int file), 280 int file),
275 281
276 TP_ARGS(order, nr_requested, nr_scanned, nr_taken, isolate_mode, file), 282 TP_ARGS(classzone_idx, order, nr_requested, nr_scanned, nr_taken, isolate_mode, file),
277 283
278 TP_STRUCT__entry( 284 TP_STRUCT__entry(
285 __field(int, classzone_idx)
279 __field(int, order) 286 __field(int, order)
280 __field(unsigned long, nr_requested) 287 __field(unsigned long, nr_requested)
281 __field(unsigned long, nr_scanned) 288 __field(unsigned long, nr_scanned)
@@ -285,6 +292,7 @@ DECLARE_EVENT_CLASS(mm_vmscan_lru_isolate_template,
285 ), 292 ),
286 293
287 TP_fast_assign( 294 TP_fast_assign(
295 __entry->classzone_idx = classzone_idx;
288 __entry->order = order; 296 __entry->order = order;
289 __entry->nr_requested = nr_requested; 297 __entry->nr_requested = nr_requested;
290 __entry->nr_scanned = nr_scanned; 298 __entry->nr_scanned = nr_scanned;
@@ -293,8 +301,9 @@ DECLARE_EVENT_CLASS(mm_vmscan_lru_isolate_template,
293 __entry->file = file; 301 __entry->file = file;
294 ), 302 ),
295 303
296 TP_printk("isolate_mode=%d order=%d nr_requested=%lu nr_scanned=%lu nr_taken=%lu file=%d", 304 TP_printk("isolate_mode=%d classzone=%d order=%d nr_requested=%lu nr_scanned=%lu nr_taken=%lu file=%d",
297 __entry->isolate_mode, 305 __entry->isolate_mode,
306 __entry->classzone_idx,
298 __entry->order, 307 __entry->order,
299 __entry->nr_requested, 308 __entry->nr_requested,
300 __entry->nr_scanned, 309 __entry->nr_scanned,
@@ -304,27 +313,29 @@ DECLARE_EVENT_CLASS(mm_vmscan_lru_isolate_template,
304 313
305DEFINE_EVENT(mm_vmscan_lru_isolate_template, mm_vmscan_lru_isolate, 314DEFINE_EVENT(mm_vmscan_lru_isolate_template, mm_vmscan_lru_isolate,
306 315
307 TP_PROTO(int order, 316 TP_PROTO(int classzone_idx,
317 int order,
308 unsigned long nr_requested, 318 unsigned long nr_requested,
309 unsigned long nr_scanned, 319 unsigned long nr_scanned,
310 unsigned long nr_taken, 320 unsigned long nr_taken,
311 isolate_mode_t isolate_mode, 321 isolate_mode_t isolate_mode,
312 int file), 322 int file),
313 323
314 TP_ARGS(order, nr_requested, nr_scanned, nr_taken, isolate_mode, file) 324 TP_ARGS(classzone_idx, order, nr_requested, nr_scanned, nr_taken, isolate_mode, file)
315 325
316); 326);
317 327
318DEFINE_EVENT(mm_vmscan_lru_isolate_template, mm_vmscan_memcg_isolate, 328DEFINE_EVENT(mm_vmscan_lru_isolate_template, mm_vmscan_memcg_isolate,
319 329
320 TP_PROTO(int order, 330 TP_PROTO(int classzone_idx,
331 int order,
321 unsigned long nr_requested, 332 unsigned long nr_requested,
322 unsigned long nr_scanned, 333 unsigned long nr_scanned,
323 unsigned long nr_taken, 334 unsigned long nr_taken,
324 isolate_mode_t isolate_mode, 335 isolate_mode_t isolate_mode,
325 int file), 336 int file),
326 337
327 TP_ARGS(order, nr_requested, nr_scanned, nr_taken, isolate_mode, file) 338 TP_ARGS(classzone_idx, order, nr_requested, nr_scanned, nr_taken, isolate_mode, file)
328 339
329); 340);
330 341
@@ -352,15 +363,14 @@ TRACE_EVENT(mm_vmscan_writepage,
352 363
353TRACE_EVENT(mm_vmscan_lru_shrink_inactive, 364TRACE_EVENT(mm_vmscan_lru_shrink_inactive,
354 365
355 TP_PROTO(struct zone *zone, 366 TP_PROTO(int nid,
356 unsigned long nr_scanned, unsigned long nr_reclaimed, 367 unsigned long nr_scanned, unsigned long nr_reclaimed,
357 int priority, int file), 368 int priority, int file),
358 369
359 TP_ARGS(zone, nr_scanned, nr_reclaimed, priority, file), 370 TP_ARGS(nid, nr_scanned, nr_reclaimed, priority, file),
360 371
361 TP_STRUCT__entry( 372 TP_STRUCT__entry(
362 __field(int, nid) 373 __field(int, nid)
363 __field(int, zid)
364 __field(unsigned long, nr_scanned) 374 __field(unsigned long, nr_scanned)
365 __field(unsigned long, nr_reclaimed) 375 __field(unsigned long, nr_reclaimed)
366 __field(int, priority) 376 __field(int, priority)
@@ -368,16 +378,15 @@ TRACE_EVENT(mm_vmscan_lru_shrink_inactive,
368 ), 378 ),
369 379
370 TP_fast_assign( 380 TP_fast_assign(
371 __entry->nid = zone_to_nid(zone); 381 __entry->nid = nid;
372 __entry->zid = zone_idx(zone);
373 __entry->nr_scanned = nr_scanned; 382 __entry->nr_scanned = nr_scanned;
374 __entry->nr_reclaimed = nr_reclaimed; 383 __entry->nr_reclaimed = nr_reclaimed;
375 __entry->priority = priority; 384 __entry->priority = priority;
376 __entry->reclaim_flags = trace_shrink_flags(file); 385 __entry->reclaim_flags = trace_shrink_flags(file);
377 ), 386 ),
378 387
379 TP_printk("nid=%d zid=%d nr_scanned=%ld nr_reclaimed=%ld priority=%d flags=%s", 388 TP_printk("nid=%d nr_scanned=%ld nr_reclaimed=%ld priority=%d flags=%s",
380 __entry->nid, __entry->zid, 389 __entry->nid,
381 __entry->nr_scanned, __entry->nr_reclaimed, 390 __entry->nr_scanned, __entry->nr_reclaimed,
382 __entry->priority, 391 __entry->priority,
383 show_reclaim_flags(__entry->reclaim_flags)) 392 show_reclaim_flags(__entry->reclaim_flags))
diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h
index 531f5811ff6b..2ccd9ccbf9ef 100644
--- a/include/trace/events/writeback.h
+++ b/include/trace/events/writeback.h
@@ -412,11 +412,11 @@ TRACE_EVENT(global_dirty_state,
412 ), 412 ),
413 413
414 TP_fast_assign( 414 TP_fast_assign(
415 __entry->nr_dirty = global_page_state(NR_FILE_DIRTY); 415 __entry->nr_dirty = global_node_page_state(NR_FILE_DIRTY);
416 __entry->nr_writeback = global_page_state(NR_WRITEBACK); 416 __entry->nr_writeback = global_node_page_state(NR_WRITEBACK);
417 __entry->nr_unstable = global_page_state(NR_UNSTABLE_NFS); 417 __entry->nr_unstable = global_node_page_state(NR_UNSTABLE_NFS);
418 __entry->nr_dirtied = global_page_state(NR_DIRTIED); 418 __entry->nr_dirtied = global_node_page_state(NR_DIRTIED);
419 __entry->nr_written = global_page_state(NR_WRITTEN); 419 __entry->nr_written = global_node_page_state(NR_WRITTEN);
420 __entry->background_thresh = background_thresh; 420 __entry->background_thresh = background_thresh;
421 __entry->dirty_thresh = dirty_thresh; 421 __entry->dirty_thresh = dirty_thresh;
422 __entry->dirty_limit = global_wb_domain.dirty_limit; 422 __entry->dirty_limit = global_wb_domain.dirty_limit;
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 73e93e53884d..c7fd2778ed50 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1034,15 +1034,6 @@ static void cpuset_change_task_nodemask(struct task_struct *tsk,
1034{ 1034{
1035 bool need_loop; 1035 bool need_loop;
1036 1036
1037 /*
1038 * Allow tasks that have access to memory reserves because they have
1039 * been OOM killed to get memory anywhere.
1040 */
1041 if (unlikely(test_thread_flag(TIF_MEMDIE)))
1042 return;
1043 if (current->flags & PF_EXITING) /* Let dying task have memory */
1044 return;
1045
1046 task_lock(tsk); 1037 task_lock(tsk);
1047 /* 1038 /*
1048 * Determine if a loop is necessary if another thread is doing 1039 * Determine if a loop is necessary if another thread is doing
diff --git a/kernel/fork.c b/kernel/fork.c
index de21f25e0d2c..52e725d4a866 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -165,20 +165,12 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk,
165 struct page *page = alloc_pages_node(node, THREADINFO_GFP, 165 struct page *page = alloc_pages_node(node, THREADINFO_GFP,
166 THREAD_SIZE_ORDER); 166 THREAD_SIZE_ORDER);
167 167
168 if (page)
169 memcg_kmem_update_page_stat(page, MEMCG_KERNEL_STACK,
170 1 << THREAD_SIZE_ORDER);
171
172 return page ? page_address(page) : NULL; 168 return page ? page_address(page) : NULL;
173} 169}
174 170
175static inline void free_thread_stack(unsigned long *stack) 171static inline void free_thread_stack(unsigned long *stack)
176{ 172{
177 struct page *page = virt_to_page(stack); 173 __free_pages(virt_to_page(stack), THREAD_SIZE_ORDER);
178
179 memcg_kmem_update_page_stat(page, MEMCG_KERNEL_STACK,
180 -(1 << THREAD_SIZE_ORDER));
181 __free_pages(page, THREAD_SIZE_ORDER);
182} 174}
183# else 175# else
184static struct kmem_cache *thread_stack_cache; 176static struct kmem_cache *thread_stack_cache;
@@ -223,9 +215,15 @@ static struct kmem_cache *mm_cachep;
223 215
224static void account_kernel_stack(unsigned long *stack, int account) 216static void account_kernel_stack(unsigned long *stack, int account)
225{ 217{
226 struct zone *zone = page_zone(virt_to_page(stack)); 218 /* All stack pages are in the same zone and belong to the same memcg. */
219 struct page *first_page = virt_to_page(stack);
220
221 mod_zone_page_state(page_zone(first_page), NR_KERNEL_STACK_KB,
222 THREAD_SIZE / 1024 * account);
227 223
228 mod_zone_page_state(zone, NR_KERNEL_STACK, account); 224 memcg_kmem_update_page_stat(
225 first_page, MEMCG_KERNEL_STACK_KB,
226 account * (THREAD_SIZE / 1024));
229} 227}
230 228
231void free_task(struct task_struct *tsk) 229void free_task(struct task_struct *tsk)
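
account_kernel_stack() now charges stacks in kilobytes (NR_KERNEL_STACK_KB, mirrored by MEMCG_KERNEL_STACK_KB on the memcg side) instead of counting whole stacks, so the delta is simply THREAD_SIZE / 1024 scaled by +1 or -1. A tiny worked example; the 16 KiB THREAD_SIZE is only an illustrative value:

#include <stdio.h>

#define THREAD_SIZE (16 * 1024)   /* e.g. 16 KiB kernel stacks */

/* account is +1 when a task's stack is allocated, -1 when it is freed. */
static long nr_kernel_stack_kb;

static void account_kernel_stack(int account)
{
        nr_kernel_stack_kb += (THREAD_SIZE / 1024) * account;
}

int main(void)
{
        account_kernel_stack(+1);   /* fork: +16 KiB */
        account_kernel_stack(+1);   /* fork: +16 KiB */
        account_kernel_stack(-1);   /* exit: -16 KiB */

        printf("NR_KERNEL_STACK_KB = %ld (one task still alive)\n",
               nr_kernel_stack_kb);
        return 0;
}
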
diff --git a/kernel/freezer.c b/kernel/freezer.c
index a8900a3bc27a..6f56a9e219fa 100644
--- a/kernel/freezer.c
+++ b/kernel/freezer.c
@@ -42,7 +42,7 @@ bool freezing_slow_path(struct task_struct *p)
42 if (p->flags & (PF_NOFREEZE | PF_SUSPEND_TASK)) 42 if (p->flags & (PF_NOFREEZE | PF_SUSPEND_TASK))
43 return false; 43 return false;
44 44
45 if (test_thread_flag(TIF_MEMDIE)) 45 if (test_tsk_thread_flag(p, TIF_MEMDIE))
46 return false; 46 return false;
47 47
48 if (pm_nosig_freezing || cgroup_freezing(p)) 48 if (pm_nosig_freezing || cgroup_freezing(p))
diff --git a/kernel/memremap.c b/kernel/memremap.c
index 017532193fb1..ddb3247a872a 100644
--- a/kernel/memremap.c
+++ b/kernel/memremap.c
@@ -308,12 +308,6 @@ void *devm_memremap_pages(struct device *dev, struct resource *res,
308 if (is_ram == REGION_INTERSECTS) 308 if (is_ram == REGION_INTERSECTS)
309 return __va(res->start); 309 return __va(res->start);
310 310
311 if (altmap && !IS_ENABLED(CONFIG_SPARSEMEM_VMEMMAP)) {
312 dev_err(dev, "%s: altmap requires CONFIG_SPARSEMEM_VMEMMAP=y\n",
313 __func__);
314 return ERR_PTR(-ENXIO);
315 }
316
317 if (!ref) 311 if (!ref)
318 return ERR_PTR(-EINVAL); 312 return ERR_PTR(-EINVAL);
319 313
@@ -401,7 +395,6 @@ void vmem_altmap_free(struct vmem_altmap *altmap, unsigned long nr_pfns)
401 altmap->alloc -= nr_pfns; 395 altmap->alloc -= nr_pfns;
402} 396}
403 397
404#ifdef CONFIG_SPARSEMEM_VMEMMAP
405struct vmem_altmap *to_vmem_altmap(unsigned long memmap_start) 398struct vmem_altmap *to_vmem_altmap(unsigned long memmap_start)
406{ 399{
407 /* 400 /*
@@ -427,5 +420,4 @@ struct vmem_altmap *to_vmem_altmap(unsigned long memmap_start)
427 420
428 return pgmap ? pgmap->altmap : NULL; 421 return pgmap ? pgmap->altmap : NULL;
429} 422}
430#endif /* CONFIG_SPARSEMEM_VMEMMAP */
431#endif /* CONFIG_ZONE_DEVICE */ 423#endif /* CONFIG_ZONE_DEVICE */
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index d90df926b59f..9a0178c2ac1d 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -1627,11 +1627,11 @@ static unsigned long minimum_image_size(unsigned long saveable)
1627 unsigned long size; 1627 unsigned long size;
1628 1628
1629 size = global_page_state(NR_SLAB_RECLAIMABLE) 1629 size = global_page_state(NR_SLAB_RECLAIMABLE)
1630 + global_page_state(NR_ACTIVE_ANON) 1630 + global_node_page_state(NR_ACTIVE_ANON)
1631 + global_page_state(NR_INACTIVE_ANON) 1631 + global_node_page_state(NR_INACTIVE_ANON)
1632 + global_page_state(NR_ACTIVE_FILE) 1632 + global_node_page_state(NR_ACTIVE_FILE)
1633 + global_page_state(NR_INACTIVE_FILE) 1633 + global_node_page_state(NR_INACTIVE_FILE)
1634 - global_page_state(NR_FILE_MAPPED); 1634 - global_node_page_state(NR_FILE_MAPPED);
1635 1635
1636 return saveable <= size ? 0 : saveable - size; 1636 return saveable <= size ? 0 : saveable - size;
1637} 1637}
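
After the conversion, minimum_image_size() mixes the one remaining zone-based counter (reclaimable slab) with node-based counters for the LRU lists and mapped file pages; the formula itself is unchanged, and anything a hibernation image needs beyond that sum must stay free. A worked example with invented counter values:

#include <stdio.h>

/* All figures are page counts; every number here is invented. */
int main(void)
{
        unsigned long slab_reclaimable = 30000;
        unsigned long active_anon = 120000, inactive_anon = 40000;
        unsigned long active_file = 200000, inactive_file = 150000;
        unsigned long file_mapped = 50000;
        unsigned long saveable = 600000;   /* pages worth saving */

        /* Same formula as minimum_image_size(), with node counters. */
        unsigned long size = slab_reclaimable
                           + active_anon + inactive_anon
                           + active_file + inactive_file
                           - file_mapped;

        printf("freeable estimate:  %lu pages\n", size);
        printf("minimum_image_size: %lu pages\n",
               saveable <= size ? 0UL : saveable - size);
        return 0;
}

Here roughly 490,000 pages count as easily freeable, so saving 600,000 pages requires keeping about 110,000 pages free for the image.
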
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 60cdf6386763..d4de33934dac 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -3177,9 +3177,8 @@ void show_regs_print_info(const char *log_lvl)
3177{ 3177{
3178 dump_stack_print_info(log_lvl); 3178 dump_stack_print_info(log_lvl);
3179 3179
3180 printk("%stask: %p ti: %p task.ti: %p\n", 3180 printk("%stask: %p task.stack: %p\n",
3181 log_lvl, current, current_thread_info(), 3181 log_lvl, current, task_stack_page(current));
3182 task_thread_info(current));
3183} 3182}
3184 3183
3185#endif 3184#endif
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 35f0dcb1cb4f..53954631a4e1 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1508,8 +1508,8 @@ static struct ctl_table vm_table[] = {
1508#ifdef CONFIG_NUMA 1508#ifdef CONFIG_NUMA
1509 { 1509 {
1510 .procname = "zone_reclaim_mode", 1510 .procname = "zone_reclaim_mode",
1511 .data = &zone_reclaim_mode, 1511 .data = &node_reclaim_mode,
1512 .maxlen = sizeof(zone_reclaim_mode), 1512 .maxlen = sizeof(node_reclaim_mode),
1513 .mode = 0644, 1513 .mode = 0644,
1514 .proc_handler = proc_dointvec, 1514 .proc_handler = proc_dointvec,
1515 .extra1 = &zero, 1515 .extra1 = &zero,
diff --git a/lib/Kconfig.kasan b/lib/Kconfig.kasan
index 67d8c6838ba9..bd38aab05929 100644
--- a/lib/Kconfig.kasan
+++ b/lib/Kconfig.kasan
@@ -5,9 +5,9 @@ if HAVE_ARCH_KASAN
5 5
6config KASAN 6config KASAN
7 bool "KASan: runtime memory debugger" 7 bool "KASan: runtime memory debugger"
8 depends on SLUB_DEBUG || (SLAB && !DEBUG_SLAB) 8 depends on SLUB || (SLAB && !DEBUG_SLAB)
9 select CONSTRUCTORS 9 select CONSTRUCTORS
10 select STACKDEPOT if SLAB 10 select STACKDEPOT
11 help 11 help
12 Enables kernel address sanitizer - runtime memory debugger, 12 Enables kernel address sanitizer - runtime memory debugger,
13 designed to find out-of-bounds accesses and use-after-free bugs. 13 designed to find out-of-bounds accesses and use-after-free bugs.
diff --git a/lib/iov_iter.c b/lib/iov_iter.c
index d67c8288d95d..9e8c7386b3a0 100644
--- a/lib/iov_iter.c
+++ b/lib/iov_iter.c
@@ -144,7 +144,7 @@ static size_t copy_page_to_iter_iovec(struct page *page, size_t offset, size_t b
144 buf = iov->iov_base + skip; 144 buf = iov->iov_base + skip;
145 copy = min(bytes, iov->iov_len - skip); 145 copy = min(bytes, iov->iov_len - skip);
146 146
147 if (!fault_in_pages_writeable(buf, copy)) { 147 if (IS_ENABLED(CONFIG_HIGHMEM) && !fault_in_pages_writeable(buf, copy)) {
148 kaddr = kmap_atomic(page); 148 kaddr = kmap_atomic(page);
149 from = kaddr + offset; 149 from = kaddr + offset;
150 150
@@ -175,6 +175,7 @@ static size_t copy_page_to_iter_iovec(struct page *page, size_t offset, size_t b
175 copy = min(bytes, iov->iov_len - skip); 175 copy = min(bytes, iov->iov_len - skip);
176 } 176 }
177 /* Too bad - revert to non-atomic kmap */ 177 /* Too bad - revert to non-atomic kmap */
178
178 kaddr = kmap(page); 179 kaddr = kmap(page);
179 from = kaddr + offset; 180 from = kaddr + offset;
180 left = __copy_to_user(buf, from, copy); 181 left = __copy_to_user(buf, from, copy);
@@ -193,6 +194,7 @@ static size_t copy_page_to_iter_iovec(struct page *page, size_t offset, size_t b
193 bytes -= copy; 194 bytes -= copy;
194 } 195 }
195 kunmap(page); 196 kunmap(page);
197
196done: 198done:
197 if (skip == iov->iov_len) { 199 if (skip == iov->iov_len) {
198 iov++; 200 iov++;
@@ -225,7 +227,7 @@ static size_t copy_page_from_iter_iovec(struct page *page, size_t offset, size_t
225 buf = iov->iov_base + skip; 227 buf = iov->iov_base + skip;
226 copy = min(bytes, iov->iov_len - skip); 228 copy = min(bytes, iov->iov_len - skip);
227 229
228 if (!fault_in_pages_readable(buf, copy)) { 230 if (IS_ENABLED(CONFIG_HIGHMEM) && !fault_in_pages_readable(buf, copy)) {
229 kaddr = kmap_atomic(page); 231 kaddr = kmap_atomic(page);
230 to = kaddr + offset; 232 to = kaddr + offset;
231 233
@@ -256,6 +258,7 @@ static size_t copy_page_from_iter_iovec(struct page *page, size_t offset, size_t
256 copy = min(bytes, iov->iov_len - skip); 258 copy = min(bytes, iov->iov_len - skip);
257 } 259 }
258 /* Too bad - revert to non-atomic kmap */ 260 /* Too bad - revert to non-atomic kmap */
261
259 kaddr = kmap(page); 262 kaddr = kmap(page);
260 to = kaddr + offset; 263 to = kaddr + offset;
261 left = __copy_from_user(to, buf, copy); 264 left = __copy_from_user(to, buf, copy);
@@ -274,6 +277,7 @@ static size_t copy_page_from_iter_iovec(struct page *page, size_t offset, size_t
274 bytes -= copy; 277 bytes -= copy;
275 } 278 }
276 kunmap(page); 279 kunmap(page);
280
277done: 281done:
278 if (skip == iov->iov_len) { 282 if (skip == iov->iov_len) {
279 iov++; 283 iov++;
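
Editor's note on the iov_iter hunks: in both copy helpers the pre-fault fast path is now gated on IS_ENABLED(CONFIG_HIGHMEM), so on builds without highmem the condition is a compile-time zero, the kmap_atomic branch is discarded by the compiler, and only the plain kmap path remains. A minimal userspace sketch of that constant-folding pattern (the IS_ENABLED stand-in below is an assumption for illustration, not the kernel's kconfig.h machinery):

#include <stdio.h>

#define CONFIG_HIGHMEM 0		/* pretend this build has no highmem */
#define IS_ENABLED(option) (option)	/* stand-in for the kconfig helper */

static int fault_in_writeable_ok(void)
{
	return 1;	/* stands in for fault_in_pages_writeable() returning 0 */
}

int main(void)
{
	if (IS_ENABLED(CONFIG_HIGHMEM) && fault_in_writeable_ok()) {
		/* only reachable (and only emitted) when highmem is enabled */
		puts("atomic kmap fast path");
	} else {
		/* the constant-false condition above lets the compiler keep
		 * just this branch on !HIGHMEM builds */
		puts("plain kmap path");
	}
	return 0;
}
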
diff --git a/lib/stackdepot.c b/lib/stackdepot.c
index 53ad6c0831ae..60f77f1d470a 100644
--- a/lib/stackdepot.c
+++ b/lib/stackdepot.c
@@ -242,6 +242,7 @@ depot_stack_handle_t depot_save_stack(struct stack_trace *trace,
242 */ 242 */
243 alloc_flags &= ~GFP_ZONEMASK; 243 alloc_flags &= ~GFP_ZONEMASK;
244 alloc_flags &= (GFP_ATOMIC | GFP_KERNEL); 244 alloc_flags &= (GFP_ATOMIC | GFP_KERNEL);
245 alloc_flags |= __GFP_NOWARN;
245 page = alloc_pages(alloc_flags, STACK_ALLOC_ORDER); 246 page = alloc_pages(alloc_flags, STACK_ALLOC_ORDER);
246 if (page) 247 if (page)
247 prealloc = page_address(page); 248 prealloc = page_address(page);
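
Editor's note on the stackdepot hunk: it is a plain GFP mask tweak — strip the zone bits, keep only the context bits the caller passed, then OR in __GFP_NOWARN so a failed preallocation stays silent. The same keep/strip/add pattern on an integer bitmask, with hypothetical flag values standing in for the real GFP constants:

#include <stdio.h>

/* Hypothetical flag bits, only for illustrating the masking pattern. */
#define F_ZONEMASK	0x0f
#define F_ATOMIC	0x10
#define F_KERNEL	0x20
#define F_NOWARN	0x40

int main(void)
{
	unsigned int flags = 0x03 | F_KERNEL;	/* caller-supplied flags */

	flags &= ~F_ZONEMASK;			/* strip zone selection */
	flags &= (F_ATOMIC | F_KERNEL);		/* keep only context bits */
	flags |= F_NOWARN;			/* never warn on failure */

	printf("0x%02x\n", flags);		/* prints 0x60 */
	return 0;
}
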
diff --git a/mm/Kconfig b/mm/Kconfig
index 3c81803b00a3..c0837845c17c 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -681,7 +681,7 @@ config IDLE_PAGE_TRACKING
681 See Documentation/vm/idle_page_tracking.txt for more details. 681 See Documentation/vm/idle_page_tracking.txt for more details.
682 682
683config ZONE_DEVICE 683config ZONE_DEVICE
684 bool "Device memory (pmem, etc...) hotplug support" if EXPERT 684 bool "Device memory (pmem, etc...) hotplug support"
685 depends on MEMORY_HOTPLUG 685 depends on MEMORY_HOTPLUG
686 depends on MEMORY_HOTREMOVE 686 depends on MEMORY_HOTREMOVE
687 depends on SPARSEMEM_VMEMMAP 687 depends on SPARSEMEM_VMEMMAP
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index ed173b8ae8f2..efe237742074 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -947,24 +947,24 @@ long congestion_wait(int sync, long timeout)
947EXPORT_SYMBOL(congestion_wait); 947EXPORT_SYMBOL(congestion_wait);
948 948
949/** 949/**
950 * wait_iff_congested - Conditionally wait for a backing_dev to become uncongested or a zone to complete writes 950 * wait_iff_congested - Conditionally wait for a backing_dev to become uncongested or a pgdat to complete writes
951 * @zone: A zone to check if it is heavily congested 951 * @pgdat: A pgdat to check if it is heavily congested
952 * @sync: SYNC or ASYNC IO 952 * @sync: SYNC or ASYNC IO
953 * @timeout: timeout in jiffies 953 * @timeout: timeout in jiffies
954 * 954 *
955 * In the event of a congested backing_dev (any backing_dev) and the given 955 * In the event of a congested backing_dev (any backing_dev) and the given
956 * @zone has experienced recent congestion, this waits for up to @timeout 956 * @pgdat has experienced recent congestion, this waits for up to @timeout
957 * jiffies for either a BDI to exit congestion of the given @sync queue 957 * jiffies for either a BDI to exit congestion of the given @sync queue
958 * or a write to complete. 958 * or a write to complete.
959 * 959 *
960 * In the absence of zone congestion, cond_resched() is called to yield 960 * In the absence of pgdat congestion, cond_resched() is called to yield
961 * the processor if necessary but otherwise does not sleep. 961 * the processor if necessary but otherwise does not sleep.
962 * 962 *
963 * The return value is 0 if the sleep is for the full timeout. Otherwise, 963 * The return value is 0 if the sleep is for the full timeout. Otherwise,
964 * it is the number of jiffies that were still remaining when the function 964 * it is the number of jiffies that were still remaining when the function
965 * returned. return_value == timeout implies the function did not sleep. 965 * returned. return_value == timeout implies the function did not sleep.
966 */ 966 */
967long wait_iff_congested(struct zone *zone, int sync, long timeout) 967long wait_iff_congested(struct pglist_data *pgdat, int sync, long timeout)
968{ 968{
969 long ret; 969 long ret;
970 unsigned long start = jiffies; 970 unsigned long start = jiffies;
@@ -973,12 +973,13 @@ long wait_iff_congested(struct zone *zone, int sync, long timeout)
973 973
974 /* 974 /*
975 * If there is no congestion, or heavy congestion is not being 975 * If there is no congestion, or heavy congestion is not being
976 * encountered in the current zone, yield if necessary instead 976 * encountered in the current pgdat, yield if necessary instead
977 * of sleeping on the congestion queue 977 * of sleeping on the congestion queue
978 */ 978 */
979 if (atomic_read(&nr_wb_congested[sync]) == 0 || 979 if (atomic_read(&nr_wb_congested[sync]) == 0 ||
980 !test_bit(ZONE_CONGESTED, &zone->flags)) { 980 !test_bit(PGDAT_CONGESTED, &pgdat->flags)) {
981 cond_resched(); 981 cond_resched();
982
982 /* In case we scheduled, work out time remaining */ 983 /* In case we scheduled, work out time remaining */
983 ret = timeout - (jiffies - start); 984 ret = timeout - (jiffies - start);
984 if (ret < 0) 985 if (ret < 0)
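
Editor's note on the backing-dev hunk: apart from moving the congestion test from ZONE_CONGESTED on the zone to PGDAT_CONGESTED on the node, the documented contract is unchanged — when there is no congestion the function only yields, then reports the remaining timeout, so a return value equal to the timeout means it never slept. A tiny sketch of that return-value convention, using a fake elapsed-jiffies input instead of reading jiffies:

#include <stdio.h>

/* Sketch: remaining budget after an optional yield or sleep; 'elapsed'
 * is a made-up stand-in for (jiffies - start). */
static long remaining_timeout(long timeout, long elapsed)
{
	long ret = timeout - elapsed;

	return ret < 0 ? 0 : ret;
}

int main(void)
{
	printf("%ld\n", remaining_timeout(100, 0));	/* 100: did not sleep */
	printf("%ld\n", remaining_timeout(100, 30));	/* 70: woke early */
	printf("%ld\n", remaining_timeout(100, 120));	/* 0: full timeout used */
	return 0;
}
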
diff --git a/mm/compaction.c b/mm/compaction.c
index 64df5fe052db..9affb2908304 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -331,7 +331,7 @@ static bool compact_trylock_irqsave(spinlock_t *lock, unsigned long *flags,
331{ 331{
332 if (cc->mode == MIGRATE_ASYNC) { 332 if (cc->mode == MIGRATE_ASYNC) {
333 if (!spin_trylock_irqsave(lock, *flags)) { 333 if (!spin_trylock_irqsave(lock, *flags)) {
334 cc->contended = COMPACT_CONTENDED_LOCK; 334 cc->contended = true;
335 return false; 335 return false;
336 } 336 }
337 } else { 337 } else {
@@ -365,13 +365,13 @@ static bool compact_unlock_should_abort(spinlock_t *lock,
365 } 365 }
366 366
367 if (fatal_signal_pending(current)) { 367 if (fatal_signal_pending(current)) {
368 cc->contended = COMPACT_CONTENDED_SCHED; 368 cc->contended = true;
369 return true; 369 return true;
370 } 370 }
371 371
372 if (need_resched()) { 372 if (need_resched()) {
373 if (cc->mode == MIGRATE_ASYNC) { 373 if (cc->mode == MIGRATE_ASYNC) {
374 cc->contended = COMPACT_CONTENDED_SCHED; 374 cc->contended = true;
375 return true; 375 return true;
376 } 376 }
377 cond_resched(); 377 cond_resched();
@@ -394,7 +394,7 @@ static inline bool compact_should_abort(struct compact_control *cc)
394 /* async compaction aborts if contended */ 394 /* async compaction aborts if contended */
395 if (need_resched()) { 395 if (need_resched()) {
396 if (cc->mode == MIGRATE_ASYNC) { 396 if (cc->mode == MIGRATE_ASYNC) {
397 cc->contended = COMPACT_CONTENDED_SCHED; 397 cc->contended = true;
398 return true; 398 return true;
399 } 399 }
400 400
@@ -646,8 +646,8 @@ static void acct_isolated(struct zone *zone, struct compact_control *cc)
646 list_for_each_entry(page, &cc->migratepages, lru) 646 list_for_each_entry(page, &cc->migratepages, lru)
647 count[!!page_is_file_cache(page)]++; 647 count[!!page_is_file_cache(page)]++;
648 648
649 mod_zone_page_state(zone, NR_ISOLATED_ANON, count[0]); 649 mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_ANON, count[0]);
650 mod_zone_page_state(zone, NR_ISOLATED_FILE, count[1]); 650 mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE, count[1]);
651} 651}
652 652
653/* Similar to reclaim, but different enough that they don't share logic */ 653/* Similar to reclaim, but different enough that they don't share logic */
@@ -655,12 +655,12 @@ static bool too_many_isolated(struct zone *zone)
655{ 655{
656 unsigned long active, inactive, isolated; 656 unsigned long active, inactive, isolated;
657 657
658 inactive = zone_page_state(zone, NR_INACTIVE_FILE) + 658 inactive = node_page_state(zone->zone_pgdat, NR_INACTIVE_FILE) +
659 zone_page_state(zone, NR_INACTIVE_ANON); 659 node_page_state(zone->zone_pgdat, NR_INACTIVE_ANON);
660 active = zone_page_state(zone, NR_ACTIVE_FILE) + 660 active = node_page_state(zone->zone_pgdat, NR_ACTIVE_FILE) +
661 zone_page_state(zone, NR_ACTIVE_ANON); 661 node_page_state(zone->zone_pgdat, NR_ACTIVE_ANON);
662 isolated = zone_page_state(zone, NR_ISOLATED_FILE) + 662 isolated = node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE) +
663 zone_page_state(zone, NR_ISOLATED_ANON); 663 node_page_state(zone->zone_pgdat, NR_ISOLATED_ANON);
664 664
665 return isolated > (inactive + active) / 2; 665 return isolated > (inactive + active) / 2;
666} 666}
@@ -752,7 +752,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
752 * if contended. 752 * if contended.
753 */ 753 */
754 if (!(low_pfn % SWAP_CLUSTER_MAX) 754 if (!(low_pfn % SWAP_CLUSTER_MAX)
755 && compact_unlock_should_abort(&zone->lru_lock, flags, 755 && compact_unlock_should_abort(zone_lru_lock(zone), flags,
756 &locked, cc)) 756 &locked, cc))
757 break; 757 break;
758 758
@@ -813,7 +813,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
813 if (unlikely(__PageMovable(page)) && 813 if (unlikely(__PageMovable(page)) &&
814 !PageIsolated(page)) { 814 !PageIsolated(page)) {
815 if (locked) { 815 if (locked) {
816 spin_unlock_irqrestore(&zone->lru_lock, 816 spin_unlock_irqrestore(zone_lru_lock(zone),
817 flags); 817 flags);
818 locked = false; 818 locked = false;
819 } 819 }
@@ -836,7 +836,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
836 836
837 /* If we already hold the lock, we can skip some rechecking */ 837 /* If we already hold the lock, we can skip some rechecking */
838 if (!locked) { 838 if (!locked) {
839 locked = compact_trylock_irqsave(&zone->lru_lock, 839 locked = compact_trylock_irqsave(zone_lru_lock(zone),
840 &flags, cc); 840 &flags, cc);
841 if (!locked) 841 if (!locked)
842 break; 842 break;
@@ -856,7 +856,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
856 } 856 }
857 } 857 }
858 858
859 lruvec = mem_cgroup_page_lruvec(page, zone); 859 lruvec = mem_cgroup_page_lruvec(page, zone->zone_pgdat);
860 860
861 /* Try isolate the page */ 861 /* Try isolate the page */
862 if (__isolate_lru_page(page, isolate_mode) != 0) 862 if (__isolate_lru_page(page, isolate_mode) != 0)
@@ -899,7 +899,7 @@ isolate_fail:
899 */ 899 */
900 if (nr_isolated) { 900 if (nr_isolated) {
901 if (locked) { 901 if (locked) {
902 spin_unlock_irqrestore(&zone->lru_lock, flags); 902 spin_unlock_irqrestore(zone_lru_lock(zone), flags);
903 locked = false; 903 locked = false;
904 } 904 }
905 acct_isolated(zone, cc); 905 acct_isolated(zone, cc);
@@ -927,7 +927,7 @@ isolate_fail:
927 low_pfn = end_pfn; 927 low_pfn = end_pfn;
928 928
929 if (locked) 929 if (locked)
930 spin_unlock_irqrestore(&zone->lru_lock, flags); 930 spin_unlock_irqrestore(zone_lru_lock(zone), flags);
931 931
932 /* 932 /*
933 * Update the pageblock-skip information and cached scanner pfn, 933 * Update the pageblock-skip information and cached scanner pfn,
@@ -1200,7 +1200,7 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
1200 struct page *page; 1200 struct page *page;
1201 const isolate_mode_t isolate_mode = 1201 const isolate_mode_t isolate_mode =
1202 (sysctl_compact_unevictable_allowed ? ISOLATE_UNEVICTABLE : 0) | 1202 (sysctl_compact_unevictable_allowed ? ISOLATE_UNEVICTABLE : 0) |
1203 (cc->mode == MIGRATE_ASYNC ? ISOLATE_ASYNC_MIGRATE : 0); 1203 (cc->mode != MIGRATE_SYNC ? ISOLATE_ASYNC_MIGRATE : 0);
1204 1204
1205 /* 1205 /*
1206 * Start at where we last stopped, or beginning of the zone as 1206 * Start at where we last stopped, or beginning of the zone as
@@ -1619,14 +1619,11 @@ out:
1619 trace_mm_compaction_end(start_pfn, cc->migrate_pfn, 1619 trace_mm_compaction_end(start_pfn, cc->migrate_pfn,
1620 cc->free_pfn, end_pfn, sync, ret); 1620 cc->free_pfn, end_pfn, sync, ret);
1621 1621
1622 if (ret == COMPACT_CONTENDED)
1623 ret = COMPACT_PARTIAL;
1624
1625 return ret; 1622 return ret;
1626} 1623}
1627 1624
1628static enum compact_result compact_zone_order(struct zone *zone, int order, 1625static enum compact_result compact_zone_order(struct zone *zone, int order,
1629 gfp_t gfp_mask, enum migrate_mode mode, int *contended, 1626 gfp_t gfp_mask, enum compact_priority prio,
1630 unsigned int alloc_flags, int classzone_idx) 1627 unsigned int alloc_flags, int classzone_idx)
1631{ 1628{
1632 enum compact_result ret; 1629 enum compact_result ret;
@@ -1636,7 +1633,8 @@ static enum compact_result compact_zone_order(struct zone *zone, int order,
1636 .order = order, 1633 .order = order,
1637 .gfp_mask = gfp_mask, 1634 .gfp_mask = gfp_mask,
1638 .zone = zone, 1635 .zone = zone,
1639 .mode = mode, 1636 .mode = (prio == COMPACT_PRIO_ASYNC) ?
1637 MIGRATE_ASYNC : MIGRATE_SYNC_LIGHT,
1640 .alloc_flags = alloc_flags, 1638 .alloc_flags = alloc_flags,
1641 .classzone_idx = classzone_idx, 1639 .classzone_idx = classzone_idx,
1642 .direct_compaction = true, 1640 .direct_compaction = true,
@@ -1649,7 +1647,6 @@ static enum compact_result compact_zone_order(struct zone *zone, int order,
1649 VM_BUG_ON(!list_empty(&cc.freepages)); 1647 VM_BUG_ON(!list_empty(&cc.freepages));
1650 VM_BUG_ON(!list_empty(&cc.migratepages)); 1648 VM_BUG_ON(!list_empty(&cc.migratepages));
1651 1649
1652 *contended = cc.contended;
1653 return ret; 1650 return ret;
1654} 1651}
1655 1652
@@ -1662,50 +1659,38 @@ int sysctl_extfrag_threshold = 500;
1662 * @alloc_flags: The allocation flags of the current allocation 1659 * @alloc_flags: The allocation flags of the current allocation
1663 * @ac: The context of current allocation 1660 * @ac: The context of current allocation
1664 * @mode: The migration mode for async, sync light, or sync migration 1661 * @mode: The migration mode for async, sync light, or sync migration
1665 * @contended: Return value that determines if compaction was aborted due to
1666 * need_resched() or lock contention
1667 * 1662 *
1668 * This is the main entry point for direct page compaction. 1663 * This is the main entry point for direct page compaction.
1669 */ 1664 */
1670enum compact_result try_to_compact_pages(gfp_t gfp_mask, unsigned int order, 1665enum compact_result try_to_compact_pages(gfp_t gfp_mask, unsigned int order,
1671 unsigned int alloc_flags, const struct alloc_context *ac, 1666 unsigned int alloc_flags, const struct alloc_context *ac,
1672 enum migrate_mode mode, int *contended) 1667 enum compact_priority prio)
1673{ 1668{
1674 int may_enter_fs = gfp_mask & __GFP_FS; 1669 int may_enter_fs = gfp_mask & __GFP_FS;
1675 int may_perform_io = gfp_mask & __GFP_IO; 1670 int may_perform_io = gfp_mask & __GFP_IO;
1676 struct zoneref *z; 1671 struct zoneref *z;
1677 struct zone *zone; 1672 struct zone *zone;
1678 enum compact_result rc = COMPACT_SKIPPED; 1673 enum compact_result rc = COMPACT_SKIPPED;
1679 int all_zones_contended = COMPACT_CONTENDED_LOCK; /* init for &= op */
1680
1681 *contended = COMPACT_CONTENDED_NONE;
1682 1674
1683 /* Check if the GFP flags allow compaction */ 1675 /* Check if the GFP flags allow compaction */
1684 if (!order || !may_enter_fs || !may_perform_io) 1676 if (!may_enter_fs || !may_perform_io)
1685 return COMPACT_SKIPPED; 1677 return COMPACT_SKIPPED;
1686 1678
1687 trace_mm_compaction_try_to_compact_pages(order, gfp_mask, mode); 1679 trace_mm_compaction_try_to_compact_pages(order, gfp_mask, prio);
1688 1680
1689 /* Compact each zone in the list */ 1681 /* Compact each zone in the list */
1690 for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx, 1682 for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx,
1691 ac->nodemask) { 1683 ac->nodemask) {
1692 enum compact_result status; 1684 enum compact_result status;
1693 int zone_contended;
1694 1685
1695 if (compaction_deferred(zone, order)) { 1686 if (compaction_deferred(zone, order)) {
1696 rc = max_t(enum compact_result, COMPACT_DEFERRED, rc); 1687 rc = max_t(enum compact_result, COMPACT_DEFERRED, rc);
1697 continue; 1688 continue;
1698 } 1689 }
1699 1690
1700 status = compact_zone_order(zone, order, gfp_mask, mode, 1691 status = compact_zone_order(zone, order, gfp_mask, prio,
1701 &zone_contended, alloc_flags, 1692 alloc_flags, ac_classzone_idx(ac));
1702 ac_classzone_idx(ac));
1703 rc = max(status, rc); 1693 rc = max(status, rc);
1704 /*
1705 * It takes at least one zone that wasn't lock contended
1706 * to clear all_zones_contended.
1707 */
1708 all_zones_contended &= zone_contended;
1709 1694
1710 /* If a normal allocation would succeed, stop compacting */ 1695 /* If a normal allocation would succeed, stop compacting */
1711 if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 1696 if (zone_watermark_ok(zone, order, low_wmark_pages(zone),
@@ -1717,59 +1702,29 @@ enum compact_result try_to_compact_pages(gfp_t gfp_mask, unsigned int order,
1717 * succeeds in this zone. 1702 * succeeds in this zone.
1718 */ 1703 */
1719 compaction_defer_reset(zone, order, false); 1704 compaction_defer_reset(zone, order, false);
1720 /*
1721 * It is possible that async compaction aborted due to
1722 * need_resched() and the watermarks were ok thanks to
1723 * somebody else freeing memory. The allocation can
1724 * however still fail so we better signal the
1725 * need_resched() contention anyway (this will not
1726 * prevent the allocation attempt).
1727 */
1728 if (zone_contended == COMPACT_CONTENDED_SCHED)
1729 *contended = COMPACT_CONTENDED_SCHED;
1730 1705
1731 goto break_loop; 1706 break;
1732 } 1707 }
1733 1708
1734 if (mode != MIGRATE_ASYNC && (status == COMPACT_COMPLETE || 1709 if (prio != COMPACT_PRIO_ASYNC && (status == COMPACT_COMPLETE ||
1735 status == COMPACT_PARTIAL_SKIPPED)) { 1710 status == COMPACT_PARTIAL_SKIPPED))
1736 /* 1711 /*
1737 * We think that allocation won't succeed in this zone 1712 * We think that allocation won't succeed in this zone
1738 * so we defer compaction there. If it ends up 1713 * so we defer compaction there. If it ends up
1739 * succeeding after all, it will be reset. 1714 * succeeding after all, it will be reset.
1740 */ 1715 */
1741 defer_compaction(zone, order); 1716 defer_compaction(zone, order);
1742 }
1743 1717
1744 /* 1718 /*
1745 * We might have stopped compacting due to need_resched() in 1719 * We might have stopped compacting due to need_resched() in
1746 * async compaction, or due to a fatal signal detected. In that 1720 * async compaction, or due to a fatal signal detected. In that
1747 * case do not try further zones and signal need_resched() 1721 * case do not try further zones
1748 * contention.
1749 */
1750 if ((zone_contended == COMPACT_CONTENDED_SCHED)
1751 || fatal_signal_pending(current)) {
1752 *contended = COMPACT_CONTENDED_SCHED;
1753 goto break_loop;
1754 }
1755
1756 continue;
1757break_loop:
1758 /*
1759 * We might not have tried all the zones, so be conservative
1760 * and assume they are not all lock contended.
1761 */ 1722 */
1762 all_zones_contended = 0; 1723 if ((prio == COMPACT_PRIO_ASYNC && need_resched())
1763 break; 1724 || fatal_signal_pending(current))
1725 break;
1764 } 1726 }
1765 1727
1766 /*
1767 * If at least one zone wasn't deferred or skipped, we report if all
1768 * zones that were tried were lock contended.
1769 */
1770 if (rc > COMPACT_INACTIVE && all_zones_contended)
1771 *contended = COMPACT_CONTENDED_LOCK;
1772
1773 return rc; 1728 return rc;
1774} 1729}
1775 1730
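
Editor's note on the compaction hunks: two simplifications run through them — the tri-state COMPACT_CONTENDED_* signal collapses into a plain bool (and try_to_compact_pages() loses its *contended out-parameter), and callers now pass an enum compact_priority that compact_zone_order() maps onto a migrate mode itself. A minimal sketch of that mapping, using stand-in enums rather than the kernel's definitions:

#include <stdio.h>

/* Stand-in enums; the names mirror the patch, the values are arbitrary. */
enum compact_priority { COMPACT_PRIO_SYNC_LIGHT, COMPACT_PRIO_ASYNC };
enum migrate_mode { MIGRATE_ASYNC, MIGRATE_SYNC_LIGHT, MIGRATE_SYNC };

static enum migrate_mode mode_for_priority(enum compact_priority prio)
{
	/* async priority -> async migration, anything stronger -> sync light */
	return prio == COMPACT_PRIO_ASYNC ? MIGRATE_ASYNC : MIGRATE_SYNC_LIGHT;
}

int main(void)
{
	printf("%d %d\n", mode_for_priority(COMPACT_PRIO_ASYNC),
	       mode_for_priority(COMPACT_PRIO_SYNC_LIGHT));
	return 0;
}
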
diff --git a/mm/filemap.c b/mm/filemap.c
index e90c1543ec2d..c5f5e46c6f7f 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -95,8 +95,8 @@
95 * ->swap_lock (try_to_unmap_one) 95 * ->swap_lock (try_to_unmap_one)
96 * ->private_lock (try_to_unmap_one) 96 * ->private_lock (try_to_unmap_one)
97 * ->tree_lock (try_to_unmap_one) 97 * ->tree_lock (try_to_unmap_one)
98 * ->zone.lru_lock (follow_page->mark_page_accessed) 98 * ->zone_lru_lock(zone) (follow_page->mark_page_accessed)
99 * ->zone.lru_lock (check_pte_range->isolate_lru_page) 99 * ->zone_lru_lock(zone) (check_pte_range->isolate_lru_page)
100 * ->private_lock (page_remove_rmap->set_page_dirty) 100 * ->private_lock (page_remove_rmap->set_page_dirty)
101 * ->tree_lock (page_remove_rmap->set_page_dirty) 101 * ->tree_lock (page_remove_rmap->set_page_dirty)
102 * bdi.wb->list_lock (page_remove_rmap->set_page_dirty) 102 * bdi.wb->list_lock (page_remove_rmap->set_page_dirty)
@@ -218,11 +218,11 @@ void __delete_from_page_cache(struct page *page, void *shadow)
218 218
219 /* hugetlb pages do not participate in page cache accounting. */ 219 /* hugetlb pages do not participate in page cache accounting. */
220 if (!PageHuge(page)) 220 if (!PageHuge(page))
221 __mod_zone_page_state(page_zone(page), NR_FILE_PAGES, -nr); 221 __mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, -nr);
222 if (PageSwapBacked(page)) { 222 if (PageSwapBacked(page)) {
223 __mod_zone_page_state(page_zone(page), NR_SHMEM, -nr); 223 __mod_node_page_state(page_pgdat(page), NR_SHMEM, -nr);
224 if (PageTransHuge(page)) 224 if (PageTransHuge(page))
225 __dec_zone_page_state(page, NR_SHMEM_THPS); 225 __dec_node_page_state(page, NR_SHMEM_THPS);
226 } else { 226 } else {
227 VM_BUG_ON_PAGE(PageTransHuge(page) && !PageHuge(page), page); 227 VM_BUG_ON_PAGE(PageTransHuge(page) && !PageHuge(page), page);
228 } 228 }
@@ -568,9 +568,9 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask)
568 * hugetlb pages do not participate in page cache accounting. 568 * hugetlb pages do not participate in page cache accounting.
569 */ 569 */
570 if (!PageHuge(new)) 570 if (!PageHuge(new))
571 __inc_zone_page_state(new, NR_FILE_PAGES); 571 __inc_node_page_state(new, NR_FILE_PAGES);
572 if (PageSwapBacked(new)) 572 if (PageSwapBacked(new))
573 __inc_zone_page_state(new, NR_SHMEM); 573 __inc_node_page_state(new, NR_SHMEM);
574 spin_unlock_irqrestore(&mapping->tree_lock, flags); 574 spin_unlock_irqrestore(&mapping->tree_lock, flags);
575 mem_cgroup_migrate(old, new); 575 mem_cgroup_migrate(old, new);
576 radix_tree_preload_end(); 576 radix_tree_preload_end();
@@ -677,7 +677,7 @@ static int __add_to_page_cache_locked(struct page *page,
677 677
678 /* hugetlb pages do not participate in page cache accounting. */ 678 /* hugetlb pages do not participate in page cache accounting. */
679 if (!huge) 679 if (!huge)
680 __inc_zone_page_state(page, NR_FILE_PAGES); 680 __inc_node_page_state(page, NR_FILE_PAGES);
681 spin_unlock_irq(&mapping->tree_lock); 681 spin_unlock_irq(&mapping->tree_lock);
682 if (!huge) 682 if (!huge)
683 mem_cgroup_commit_charge(page, memcg, false, false); 683 mem_cgroup_commit_charge(page, memcg, false, false);
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 3647334c2ef9..2373f0a7d340 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -539,23 +539,26 @@ static int __do_huge_pmd_anonymous_page(struct fault_env *fe, struct page *page,
539} 539}
540 540
541/* 541/*
542 * If THP is set to always then directly reclaim/compact as necessary 542 * If THP defrag is set to always then directly reclaim/compact as necessary
543 * If set to defer then do no reclaim and defer to khugepaged 543 * If set to defer then do only background reclaim/compact and defer to khugepaged
544 * If set to madvise and the VMA is flagged then directly reclaim/compact 544 * If set to madvise and the VMA is flagged then directly reclaim/compact
545 * When direct reclaim/compact is allowed, don't retry except for flagged VMA's
545 */ 546 */
546static inline gfp_t alloc_hugepage_direct_gfpmask(struct vm_area_struct *vma) 547static inline gfp_t alloc_hugepage_direct_gfpmask(struct vm_area_struct *vma)
547{ 548{
548 gfp_t reclaim_flags = 0; 549 bool vma_madvised = !!(vma->vm_flags & VM_HUGEPAGE);
549 550
550 if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags) && 551 if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG,
551 (vma->vm_flags & VM_HUGEPAGE)) 552 &transparent_hugepage_flags) && vma_madvised)
552 reclaim_flags = __GFP_DIRECT_RECLAIM; 553 return GFP_TRANSHUGE;
553 else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags)) 554 else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG,
554 reclaim_flags = __GFP_KSWAPD_RECLAIM; 555 &transparent_hugepage_flags))
555 else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags)) 556 return GFP_TRANSHUGE_LIGHT | __GFP_KSWAPD_RECLAIM;
556 reclaim_flags = __GFP_DIRECT_RECLAIM; 557 else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG,
557 558 &transparent_hugepage_flags))
558 return GFP_TRANSHUGE | reclaim_flags; 559 return GFP_TRANSHUGE | (vma_madvised ? 0 : __GFP_NORETRY);
560
561 return GFP_TRANSHUGE_LIGHT;
559} 562}
560 563
561/* Caller must hold page table lock. */ 564/* Caller must hold page table lock. */
@@ -1249,25 +1252,26 @@ out:
1249 return 0; 1252 return 0;
1250} 1253}
1251 1254
1252int madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, 1255/*
1256 * Return true if we do MADV_FREE successfully on entire pmd page.
1257 * Otherwise, return false.
1258 */
1259bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
1253 pmd_t *pmd, unsigned long addr, unsigned long next) 1260 pmd_t *pmd, unsigned long addr, unsigned long next)
1254
1255{ 1261{
1256 spinlock_t *ptl; 1262 spinlock_t *ptl;
1257 pmd_t orig_pmd; 1263 pmd_t orig_pmd;
1258 struct page *page; 1264 struct page *page;
1259 struct mm_struct *mm = tlb->mm; 1265 struct mm_struct *mm = tlb->mm;
1260 int ret = 0; 1266 bool ret = false;
1261 1267
1262 ptl = pmd_trans_huge_lock(pmd, vma); 1268 ptl = pmd_trans_huge_lock(pmd, vma);
1263 if (!ptl) 1269 if (!ptl)
1264 goto out_unlocked; 1270 goto out_unlocked;
1265 1271
1266 orig_pmd = *pmd; 1272 orig_pmd = *pmd;
1267 if (is_huge_zero_pmd(orig_pmd)) { 1273 if (is_huge_zero_pmd(orig_pmd))
1268 ret = 1;
1269 goto out; 1274 goto out;
1270 }
1271 1275
1272 page = pmd_page(orig_pmd); 1276 page = pmd_page(orig_pmd);
1273 /* 1277 /*
@@ -1309,7 +1313,7 @@ int madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
1309 set_pmd_at(mm, addr, pmd, orig_pmd); 1313 set_pmd_at(mm, addr, pmd, orig_pmd);
1310 tlb_remove_pmd_tlb_entry(tlb, pmd, addr); 1314 tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
1311 } 1315 }
1312 ret = 1; 1316 ret = true;
1313out: 1317out:
1314 spin_unlock(ptl); 1318 spin_unlock(ptl);
1315out_unlocked: 1319out_unlocked:
@@ -1586,7 +1590,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
1586 1590
1587 if (atomic_add_negative(-1, compound_mapcount_ptr(page))) { 1591 if (atomic_add_negative(-1, compound_mapcount_ptr(page))) {
1588 /* Last compound_mapcount is gone. */ 1592 /* Last compound_mapcount is gone. */
1589 __dec_zone_page_state(page, NR_ANON_THPS); 1593 __dec_node_page_state(page, NR_ANON_THPS);
1590 if (TestClearPageDoubleMap(page)) { 1594 if (TestClearPageDoubleMap(page)) {
1591 /* No need in mapcount reference anymore */ 1595 /* No need in mapcount reference anymore */
1592 for (i = 0; i < HPAGE_PMD_NR; i++) 1596 for (i = 0; i < HPAGE_PMD_NR; i++)
@@ -1818,7 +1822,7 @@ static void __split_huge_page(struct page *page, struct list_head *list,
1818 pgoff_t end = -1; 1822 pgoff_t end = -1;
1819 int i; 1823 int i;
1820 1824
1821 lruvec = mem_cgroup_page_lruvec(head, zone); 1825 lruvec = mem_cgroup_page_lruvec(head, zone->zone_pgdat);
1822 1826
1823 /* complete memcg works before add pages to LRU */ 1827 /* complete memcg works before add pages to LRU */
1824 mem_cgroup_split_huge_fixup(head); 1828 mem_cgroup_split_huge_fixup(head);
@@ -1848,7 +1852,7 @@ static void __split_huge_page(struct page *page, struct list_head *list,
1848 spin_unlock(&head->mapping->tree_lock); 1852 spin_unlock(&head->mapping->tree_lock);
1849 } 1853 }
1850 1854
1851 spin_unlock_irqrestore(&page_zone(head)->lru_lock, flags); 1855 spin_unlock_irqrestore(zone_lru_lock(page_zone(head)), flags);
1852 1856
1853 unfreeze_page(head); 1857 unfreeze_page(head);
1854 1858
@@ -2034,7 +2038,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
2034 lru_add_drain(); 2038 lru_add_drain();
2035 2039
2036 /* prevent PageLRU to go away from under us, and freeze lru stats */ 2040 /* prevent PageLRU to go away from under us, and freeze lru stats */
2037 spin_lock_irqsave(&page_zone(head)->lru_lock, flags); 2041 spin_lock_irqsave(zone_lru_lock(page_zone(head)), flags);
2038 2042
2039 if (mapping) { 2043 if (mapping) {
2040 void **pslot; 2044 void **pslot;
@@ -2061,7 +2065,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
2061 list_del(page_deferred_list(head)); 2065 list_del(page_deferred_list(head));
2062 } 2066 }
2063 if (mapping) 2067 if (mapping)
2064 __dec_zone_page_state(page, NR_SHMEM_THPS); 2068 __dec_node_page_state(page, NR_SHMEM_THPS);
2065 spin_unlock(&pgdata->split_queue_lock); 2069 spin_unlock(&pgdata->split_queue_lock);
2066 __split_huge_page(page, list, flags); 2070 __split_huge_page(page, list, flags);
2067 ret = 0; 2071 ret = 0;
@@ -2077,7 +2081,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
2077 spin_unlock(&pgdata->split_queue_lock); 2081 spin_unlock(&pgdata->split_queue_lock);
2078fail: if (mapping) 2082fail: if (mapping)
2079 spin_unlock(&mapping->tree_lock); 2083 spin_unlock(&mapping->tree_lock);
2080 spin_unlock_irqrestore(&page_zone(head)->lru_lock, flags); 2084 spin_unlock_irqrestore(zone_lru_lock(page_zone(head)), flags);
2081 unfreeze_page(head); 2085 unfreeze_page(head);
2082 ret = -EBUSY; 2086 ret = -EBUSY;
2083 } 2087 }
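
Editor's note on the huge_memory hunks: the rewritten alloc_hugepage_direct_gfpmask() is now a small decision table over the THP defrag setting and whether the VMA was madvised — "madvise" only goes heavy for flagged VMAs, "defer" asks for background (kswapd) reclaim, "always" allows direct reclaim but adds __GFP_NORETRY unless the VMA opted in, and everything else stays light. A sketch of the same table with stand-in flag values (the names follow the patch, the numeric bits are invented):

#include <stdio.h>

#define GFP_TRANSHUGE_LIGHT	0x1
#define GFP_TRANSHUGE		0x3	/* light + direct reclaim, for the sketch */
#define __GFP_KSWAPD_RECLAIM	0x4
#define __GFP_NORETRY		0x8

enum thp_defrag { DEFRAG_MADVISE, DEFRAG_DEFER, DEFRAG_ALWAYS, DEFRAG_NEVER };

static unsigned int hugepage_gfpmask(enum thp_defrag defrag, int vma_madvised)
{
	if (defrag == DEFRAG_MADVISE && vma_madvised)
		return GFP_TRANSHUGE;
	if (defrag == DEFRAG_DEFER)
		return GFP_TRANSHUGE_LIGHT | __GFP_KSWAPD_RECLAIM;
	if (defrag == DEFRAG_ALWAYS)
		return GFP_TRANSHUGE | (vma_madvised ? 0 : __GFP_NORETRY);
	return GFP_TRANSHUGE_LIGHT;	/* madvise without the VMA flag, or never */
}

int main(void)
{
	printf("always, not madvised: 0x%x\n", hugepage_gfpmask(DEFRAG_ALWAYS, 0));
	printf("madvise, madvised:    0x%x\n", hugepage_gfpmask(DEFRAG_MADVISE, 1));
	return 0;
}
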
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 51a04e5e9373..f904246a8fd5 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -4391,7 +4391,6 @@ follow_huge_pud(struct mm_struct *mm, unsigned long address,
4391 4391
4392/* 4392/*
4393 * This function is called from memory failure code. 4393 * This function is called from memory failure code.
4394 * Assume the caller holds page lock of the head page.
4395 */ 4394 */
4396int dequeue_hwpoisoned_huge_page(struct page *hpage) 4395int dequeue_hwpoisoned_huge_page(struct page *hpage)
4397{ 4396{
diff --git a/mm/internal.h b/mm/internal.h
index 9b6a6c43ac39..1501304f87a4 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -78,7 +78,7 @@ extern unsigned long highest_memmap_pfn;
78 */ 78 */
79extern int isolate_lru_page(struct page *page); 79extern int isolate_lru_page(struct page *page);
80extern void putback_lru_page(struct page *page); 80extern void putback_lru_page(struct page *page);
81extern bool zone_reclaimable(struct zone *zone); 81extern bool pgdat_reclaimable(struct pglist_data *pgdat);
82 82
83/* 83/*
84 * in mm/rmap.c: 84 * in mm/rmap.c:
@@ -185,10 +185,7 @@ struct compact_control {
185 const unsigned int alloc_flags; /* alloc flags of a direct compactor */ 185 const unsigned int alloc_flags; /* alloc flags of a direct compactor */
186 const int classzone_idx; /* zone index of a direct compactor */ 186 const int classzone_idx; /* zone index of a direct compactor */
187 struct zone *zone; 187 struct zone *zone;
188 int contended; /* Signal need_sched() or lock 188 bool contended; /* Signal lock or sched contention */
189 * contention detected during
190 * compaction
191 */
192}; 189};
193 190
194unsigned long 191unsigned long
@@ -433,10 +430,10 @@ static inline void mminit_validate_memmodel_limits(unsigned long *start_pfn,
433} 430}
434#endif /* CONFIG_SPARSEMEM */ 431#endif /* CONFIG_SPARSEMEM */
435 432
436#define ZONE_RECLAIM_NOSCAN -2 433#define NODE_RECLAIM_NOSCAN -2
437#define ZONE_RECLAIM_FULL -1 434#define NODE_RECLAIM_FULL -1
438#define ZONE_RECLAIM_SOME 0 435#define NODE_RECLAIM_SOME 0
439#define ZONE_RECLAIM_SUCCESS 1 436#define NODE_RECLAIM_SUCCESS 1
440 437
441extern int hwpoison_filter(struct page *p); 438extern int hwpoison_filter(struct page *p);
442 439
@@ -467,7 +464,6 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone,
467#define ALLOC_HIGH 0x20 /* __GFP_HIGH set */ 464#define ALLOC_HIGH 0x20 /* __GFP_HIGH set */
468#define ALLOC_CPUSET 0x40 /* check for correct cpuset */ 465#define ALLOC_CPUSET 0x40 /* check for correct cpuset */
469#define ALLOC_CMA 0x80 /* allow allocations from CMA areas */ 466#define ALLOC_CMA 0x80 /* allow allocations from CMA areas */
470#define ALLOC_FAIR 0x100 /* fair zone allocation */
471 467
472enum ttu_flags; 468enum ttu_flags;
473struct tlbflush_unmap_batch; 469struct tlbflush_unmap_batch;
diff --git a/mm/kasan/Makefile b/mm/kasan/Makefile
index 1548749a3d45..2976a9ee104f 100644
--- a/mm/kasan/Makefile
+++ b/mm/kasan/Makefile
@@ -7,5 +7,4 @@ CFLAGS_REMOVE_kasan.o = -pg
7# see: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=63533 7# see: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=63533
8CFLAGS_kasan.o := $(call cc-option, -fno-conserve-stack -fno-stack-protector) 8CFLAGS_kasan.o := $(call cc-option, -fno-conserve-stack -fno-stack-protector)
9 9
10obj-y := kasan.o report.o kasan_init.o 10obj-y := kasan.o report.o kasan_init.o quarantine.o
11obj-$(CONFIG_SLAB) += quarantine.o
diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c
index 6845f9294696..b6f99e81bfeb 100644
--- a/mm/kasan/kasan.c
+++ b/mm/kasan/kasan.c
@@ -351,7 +351,6 @@ void kasan_free_pages(struct page *page, unsigned int order)
351 KASAN_FREE_PAGE); 351 KASAN_FREE_PAGE);
352} 352}
353 353
354#ifdef CONFIG_SLAB
355/* 354/*
356 * Adaptive redzone policy taken from the userspace AddressSanitizer runtime. 355 * Adaptive redzone policy taken from the userspace AddressSanitizer runtime.
357 * For larger allocations larger redzones are used. 356 * For larger allocations larger redzones are used.
@@ -373,16 +372,8 @@ void kasan_cache_create(struct kmem_cache *cache, size_t *size,
373 unsigned long *flags) 372 unsigned long *flags)
374{ 373{
375 int redzone_adjust; 374 int redzone_adjust;
376 /* Make sure the adjusted size is still less than 375 int orig_size = *size;
377 * KMALLOC_MAX_CACHE_SIZE. 376
378 * TODO: this check is only useful for SLAB, but not SLUB. We'll need
379 * to skip it for SLUB when it starts using kasan_cache_create().
380 */
381 if (*size > KMALLOC_MAX_CACHE_SIZE -
382 sizeof(struct kasan_alloc_meta) -
383 sizeof(struct kasan_free_meta))
384 return;
385 *flags |= SLAB_KASAN;
386 /* Add alloc meta. */ 377 /* Add alloc meta. */
387 cache->kasan_info.alloc_meta_offset = *size; 378 cache->kasan_info.alloc_meta_offset = *size;
388 *size += sizeof(struct kasan_alloc_meta); 379 *size += sizeof(struct kasan_alloc_meta);
@@ -395,14 +386,26 @@ void kasan_cache_create(struct kmem_cache *cache, size_t *size,
395 } 386 }
396 redzone_adjust = optimal_redzone(cache->object_size) - 387 redzone_adjust = optimal_redzone(cache->object_size) -
397 (*size - cache->object_size); 388 (*size - cache->object_size);
389
398 if (redzone_adjust > 0) 390 if (redzone_adjust > 0)
399 *size += redzone_adjust; 391 *size += redzone_adjust;
400 *size = min(KMALLOC_MAX_CACHE_SIZE, 392
401 max(*size, 393 *size = min(KMALLOC_MAX_SIZE, max(*size, cache->object_size +
402 cache->object_size + 394 optimal_redzone(cache->object_size)));
403 optimal_redzone(cache->object_size))); 395
396 /*
397 * If the metadata doesn't fit, don't enable KASAN at all.
398 */
399 if (*size <= cache->kasan_info.alloc_meta_offset ||
400 *size <= cache->kasan_info.free_meta_offset) {
401 cache->kasan_info.alloc_meta_offset = 0;
402 cache->kasan_info.free_meta_offset = 0;
403 *size = orig_size;
404 return;
405 }
406
407 *flags |= SLAB_KASAN;
404} 408}
405#endif
406 409
407void kasan_cache_shrink(struct kmem_cache *cache) 410void kasan_cache_shrink(struct kmem_cache *cache)
408{ 411{
@@ -414,6 +417,14 @@ void kasan_cache_destroy(struct kmem_cache *cache)
414 quarantine_remove_cache(cache); 417 quarantine_remove_cache(cache);
415} 418}
416 419
420size_t kasan_metadata_size(struct kmem_cache *cache)
421{
422 return (cache->kasan_info.alloc_meta_offset ?
423 sizeof(struct kasan_alloc_meta) : 0) +
424 (cache->kasan_info.free_meta_offset ?
425 sizeof(struct kasan_free_meta) : 0);
426}
427
417void kasan_poison_slab(struct page *page) 428void kasan_poison_slab(struct page *page)
418{ 429{
419 kasan_poison_shadow(page_address(page), 430 kasan_poison_shadow(page_address(page),
@@ -431,16 +442,13 @@ void kasan_poison_object_data(struct kmem_cache *cache, void *object)
431 kasan_poison_shadow(object, 442 kasan_poison_shadow(object,
432 round_up(cache->object_size, KASAN_SHADOW_SCALE_SIZE), 443 round_up(cache->object_size, KASAN_SHADOW_SCALE_SIZE),
433 KASAN_KMALLOC_REDZONE); 444 KASAN_KMALLOC_REDZONE);
434#ifdef CONFIG_SLAB
435 if (cache->flags & SLAB_KASAN) { 445 if (cache->flags & SLAB_KASAN) {
436 struct kasan_alloc_meta *alloc_info = 446 struct kasan_alloc_meta *alloc_info =
437 get_alloc_info(cache, object); 447 get_alloc_info(cache, object);
438 alloc_info->state = KASAN_STATE_INIT; 448 alloc_info->state = KASAN_STATE_INIT;
439 } 449 }
440#endif
441} 450}
442 451
443#ifdef CONFIG_SLAB
444static inline int in_irqentry_text(unsigned long ptr) 452static inline int in_irqentry_text(unsigned long ptr)
445{ 453{
446 return (ptr >= (unsigned long)&__irqentry_text_start && 454 return (ptr >= (unsigned long)&__irqentry_text_start &&
@@ -501,7 +509,6 @@ struct kasan_free_meta *get_free_info(struct kmem_cache *cache,
501 BUILD_BUG_ON(sizeof(struct kasan_free_meta) > 32); 509 BUILD_BUG_ON(sizeof(struct kasan_free_meta) > 32);
502 return (void *)object + cache->kasan_info.free_meta_offset; 510 return (void *)object + cache->kasan_info.free_meta_offset;
503} 511}
504#endif
505 512
506void kasan_slab_alloc(struct kmem_cache *cache, void *object, gfp_t flags) 513void kasan_slab_alloc(struct kmem_cache *cache, void *object, gfp_t flags)
507{ 514{
@@ -522,16 +529,16 @@ static void kasan_poison_slab_free(struct kmem_cache *cache, void *object)
522 529
523bool kasan_slab_free(struct kmem_cache *cache, void *object) 530bool kasan_slab_free(struct kmem_cache *cache, void *object)
524{ 531{
525#ifdef CONFIG_SLAB
526 /* RCU slabs could be legally used after free within the RCU period */ 532 /* RCU slabs could be legally used after free within the RCU period */
527 if (unlikely(cache->flags & SLAB_DESTROY_BY_RCU)) 533 if (unlikely(cache->flags & SLAB_DESTROY_BY_RCU))
528 return false; 534 return false;
529 535
530 if (likely(cache->flags & SLAB_KASAN)) { 536 if (likely(cache->flags & SLAB_KASAN)) {
531 struct kasan_alloc_meta *alloc_info = 537 struct kasan_alloc_meta *alloc_info;
532 get_alloc_info(cache, object); 538 struct kasan_free_meta *free_info;
533 struct kasan_free_meta *free_info = 539
534 get_free_info(cache, object); 540 alloc_info = get_alloc_info(cache, object);
541 free_info = get_free_info(cache, object);
535 542
536 switch (alloc_info->state) { 543 switch (alloc_info->state) {
537 case KASAN_STATE_ALLOC: 544 case KASAN_STATE_ALLOC:
@@ -550,10 +557,6 @@ bool kasan_slab_free(struct kmem_cache *cache, void *object)
550 } 557 }
551 } 558 }
552 return false; 559 return false;
553#else
554 kasan_poison_slab_free(cache, object);
555 return false;
556#endif
557} 560}
558 561
559void kasan_kmalloc(struct kmem_cache *cache, const void *object, size_t size, 562void kasan_kmalloc(struct kmem_cache *cache, const void *object, size_t size,
@@ -576,7 +579,6 @@ void kasan_kmalloc(struct kmem_cache *cache, const void *object, size_t size,
576 kasan_unpoison_shadow(object, size); 579 kasan_unpoison_shadow(object, size);
577 kasan_poison_shadow((void *)redzone_start, redzone_end - redzone_start, 580 kasan_poison_shadow((void *)redzone_start, redzone_end - redzone_start,
578 KASAN_KMALLOC_REDZONE); 581 KASAN_KMALLOC_REDZONE);
579#ifdef CONFIG_SLAB
580 if (cache->flags & SLAB_KASAN) { 582 if (cache->flags & SLAB_KASAN) {
581 struct kasan_alloc_meta *alloc_info = 583 struct kasan_alloc_meta *alloc_info =
582 get_alloc_info(cache, object); 584 get_alloc_info(cache, object);
@@ -585,7 +587,6 @@ void kasan_kmalloc(struct kmem_cache *cache, const void *object, size_t size,
585 alloc_info->alloc_size = size; 587 alloc_info->alloc_size = size;
586 set_track(&alloc_info->track, flags); 588 set_track(&alloc_info->track, flags);
587 } 589 }
588#endif
589} 590}
590EXPORT_SYMBOL(kasan_kmalloc); 591EXPORT_SYMBOL(kasan_kmalloc);
591 592
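
Editor's note on the kasan.c hunks: with SLUB now going through kasan_cache_create() too, the function lays the alloc/free metadata out after the object, caps the total at the kmalloc maximum, and backs out (offsets reset, original size restored) if the cap leaves no room for the metadata; kasan_metadata_size() then reports how much of each slab object is KASAN bookkeeping. A simplified standalone sketch of that layout logic — struct sizes and the size cap are invented stand-ins, not the kernel's values:

#include <stdio.h>

#define SIZE_CAP	(1u << 12)	/* stand-in for the kmalloc maximum */
#define ALLOC_META_SZ	32u		/* pretend sizeof(kasan_alloc_meta) */
#define FREE_META_SZ	32u		/* pretend sizeof(kasan_free_meta) */

struct cache_layout {
	unsigned int size;		/* total slab object size */
	unsigned int alloc_meta_off;
	unsigned int free_meta_off;
};

static void layout_cache(struct cache_layout *c, unsigned int object_size,
			 unsigned int redzone)
{
	unsigned int orig = object_size;

	c->alloc_meta_off = object_size;	/* alloc meta right after object */
	c->size = object_size + ALLOC_META_SZ;
	c->free_meta_off = c->size;		/* free meta after alloc meta */
	c->size += FREE_META_SZ;

	if (c->size < object_size + redzone)	/* honour the wanted redzone */
		c->size = object_size + redzone;
	if (c->size > SIZE_CAP)
		c->size = SIZE_CAP;

	/* If the capped size cannot hold the metadata, disable it entirely. */
	if (c->size <= c->alloc_meta_off || c->size <= c->free_meta_off) {
		c->alloc_meta_off = 0;
		c->free_meta_off = 0;
		c->size = orig;
	}
}

static unsigned int metadata_size(const struct cache_layout *c)
{
	return (c->alloc_meta_off ? ALLOC_META_SZ : 0) +
	       (c->free_meta_off ? FREE_META_SZ : 0);
}

int main(void)
{
	struct cache_layout c;

	layout_cache(&c, 256, 128);
	printf("size=%u metadata=%u\n", c.size, metadata_size(&c));
	return 0;
}
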
diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h
index fb87923552ef..31972cdba433 100644
--- a/mm/kasan/kasan.h
+++ b/mm/kasan/kasan.h
@@ -95,7 +95,6 @@ struct kasan_alloc_meta *get_alloc_info(struct kmem_cache *cache,
95struct kasan_free_meta *get_free_info(struct kmem_cache *cache, 95struct kasan_free_meta *get_free_info(struct kmem_cache *cache,
96 const void *object); 96 const void *object);
97 97
98
99static inline const void *kasan_shadow_to_mem(const void *shadow_addr) 98static inline const void *kasan_shadow_to_mem(const void *shadow_addr)
100{ 99{
101 return (void *)(((unsigned long)shadow_addr - KASAN_SHADOW_OFFSET) 100 return (void *)(((unsigned long)shadow_addr - KASAN_SHADOW_OFFSET)
@@ -110,7 +109,7 @@ static inline bool kasan_report_enabled(void)
110void kasan_report(unsigned long addr, size_t size, 109void kasan_report(unsigned long addr, size_t size,
111 bool is_write, unsigned long ip); 110 bool is_write, unsigned long ip);
112 111
113#ifdef CONFIG_SLAB 112#if defined(CONFIG_SLAB) || defined(CONFIG_SLUB)
114void quarantine_put(struct kasan_free_meta *info, struct kmem_cache *cache); 113void quarantine_put(struct kasan_free_meta *info, struct kmem_cache *cache);
115void quarantine_reduce(void); 114void quarantine_reduce(void);
116void quarantine_remove_cache(struct kmem_cache *cache); 115void quarantine_remove_cache(struct kmem_cache *cache);
diff --git a/mm/kasan/report.c b/mm/kasan/report.c
index b3c122ddd454..861b9776841a 100644
--- a/mm/kasan/report.c
+++ b/mm/kasan/report.c
@@ -116,7 +116,6 @@ static inline bool init_task_stack_addr(const void *addr)
116 sizeof(init_thread_union.stack)); 116 sizeof(init_thread_union.stack));
117} 117}
118 118
119#ifdef CONFIG_SLAB
120static void print_track(struct kasan_track *track) 119static void print_track(struct kasan_track *track)
121{ 120{
122 pr_err("PID = %u\n", track->pid); 121 pr_err("PID = %u\n", track->pid);
@@ -130,8 +129,8 @@ static void print_track(struct kasan_track *track)
130 } 129 }
131} 130}
132 131
133static void object_err(struct kmem_cache *cache, struct page *page, 132static void kasan_object_err(struct kmem_cache *cache, struct page *page,
134 void *object, char *unused_reason) 133 void *object, char *unused_reason)
135{ 134{
136 struct kasan_alloc_meta *alloc_info = get_alloc_info(cache, object); 135 struct kasan_alloc_meta *alloc_info = get_alloc_info(cache, object);
137 struct kasan_free_meta *free_info; 136 struct kasan_free_meta *free_info;
@@ -162,7 +161,6 @@ static void object_err(struct kmem_cache *cache, struct page *page,
162 break; 161 break;
163 } 162 }
164} 163}
165#endif
166 164
167static void print_address_description(struct kasan_access_info *info) 165static void print_address_description(struct kasan_access_info *info)
168{ 166{
@@ -177,7 +175,7 @@ static void print_address_description(struct kasan_access_info *info)
177 struct kmem_cache *cache = page->slab_cache; 175 struct kmem_cache *cache = page->slab_cache;
178 object = nearest_obj(cache, page, 176 object = nearest_obj(cache, page,
179 (void *)info->access_addr); 177 (void *)info->access_addr);
180 object_err(cache, page, object, 178 kasan_object_err(cache, page, object,
181 "kasan: bad access detected"); 179 "kasan: bad access detected");
182 return; 180 return;
183 } 181 }
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 7dbee698d6aa..79c52d0061af 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -480,7 +480,7 @@ void __khugepaged_exit(struct mm_struct *mm)
480static void release_pte_page(struct page *page) 480static void release_pte_page(struct page *page)
481{ 481{
482 /* 0 stands for page_is_file_cache(page) == false */ 482 /* 0 stands for page_is_file_cache(page) == false */
483 dec_zone_page_state(page, NR_ISOLATED_ANON + 0); 483 dec_node_page_state(page, NR_ISOLATED_ANON + 0);
484 unlock_page(page); 484 unlock_page(page);
485 putback_lru_page(page); 485 putback_lru_page(page);
486} 486}
@@ -576,7 +576,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
576 goto out; 576 goto out;
577 } 577 }
578 /* 0 stands for page_is_file_cache(page) == false */ 578 /* 0 stands for page_is_file_cache(page) == false */
579 inc_zone_page_state(page, NR_ISOLATED_ANON + 0); 579 inc_node_page_state(page, NR_ISOLATED_ANON + 0);
580 VM_BUG_ON_PAGE(!PageLocked(page), page); 580 VM_BUG_ON_PAGE(!PageLocked(page), page);
581 VM_BUG_ON_PAGE(PageLRU(page), page); 581 VM_BUG_ON_PAGE(PageLRU(page), page);
582 582
@@ -672,10 +672,10 @@ static bool khugepaged_scan_abort(int nid)
672 int i; 672 int i;
673 673
674 /* 674 /*
675 * If zone_reclaim_mode is disabled, then no extra effort is made to 675 * If node_reclaim_mode is disabled, then no extra effort is made to
676 * allocate memory locally. 676 * allocate memory locally.
677 */ 677 */
678 if (!zone_reclaim_mode) 678 if (!node_reclaim_mode)
679 return false; 679 return false;
680 680
681 /* If there is a count for this node already, it must be acceptable */ 681 /* If there is a count for this node already, it must be acceptable */
@@ -694,7 +694,7 @@ static bool khugepaged_scan_abort(int nid)
694/* Defrag for khugepaged will enter direct reclaim/compaction if necessary */ 694/* Defrag for khugepaged will enter direct reclaim/compaction if necessary */
695static inline gfp_t alloc_hugepage_khugepaged_gfpmask(void) 695static inline gfp_t alloc_hugepage_khugepaged_gfpmask(void)
696{ 696{
697 return GFP_TRANSHUGE | (khugepaged_defrag() ? __GFP_DIRECT_RECLAIM : 0); 697 return khugepaged_defrag() ? GFP_TRANSHUGE : GFP_TRANSHUGE_LIGHT;
698} 698}
699 699
700#ifdef CONFIG_NUMA 700#ifdef CONFIG_NUMA
@@ -1483,10 +1483,10 @@ tree_unlocked:
1483 } 1483 }
1484 1484
1485 local_irq_save(flags); 1485 local_irq_save(flags);
1486 __inc_zone_page_state(new_page, NR_SHMEM_THPS); 1486 __inc_node_page_state(new_page, NR_SHMEM_THPS);
1487 if (nr_none) { 1487 if (nr_none) {
1488 __mod_zone_page_state(zone, NR_FILE_PAGES, nr_none); 1488 __mod_node_page_state(zone->zone_pgdat, NR_FILE_PAGES, nr_none);
1489 __mod_zone_page_state(zone, NR_SHMEM, nr_none); 1489 __mod_node_page_state(zone->zone_pgdat, NR_SHMEM, nr_none);
1490 } 1490 }
1491 local_irq_restore(flags); 1491 local_irq_restore(flags);
1492 1492
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index 04320d3adbef..086292f7c59d 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -1485,8 +1485,10 @@ static int kmemleak_scan_thread(void *arg)
1485 * Wait before the first scan to allow the system to fully initialize. 1485 * Wait before the first scan to allow the system to fully initialize.
1486 */ 1486 */
1487 if (first_run) { 1487 if (first_run) {
1488 signed long timeout = msecs_to_jiffies(SECS_FIRST_SCAN * 1000);
1488 first_run = 0; 1489 first_run = 0;
1489 ssleep(SECS_FIRST_SCAN); 1490 while (timeout && !kthread_should_stop())
1491 timeout = schedule_timeout_interruptible(timeout);
1490 } 1492 }
1491 1493
1492 while (!kthread_should_stop()) { 1494 while (!kthread_should_stop()) {
diff --git a/mm/memblock.c b/mm/memblock.c
index ca099159b45a..ff5ff3b5f1ea 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -20,7 +20,7 @@
20#include <linux/seq_file.h> 20#include <linux/seq_file.h>
21#include <linux/memblock.h> 21#include <linux/memblock.h>
22 22
23#include <asm-generic/sections.h> 23#include <asm/sections.h>
24#include <linux/io.h> 24#include <linux/io.h>
25 25
26#include "internal.h" 26#include "internal.h"
@@ -1027,7 +1027,7 @@ void __init_memblock __next_mem_range_rev(u64 *idx, int nid, ulong flags,
1027 *out_end = m_end; 1027 *out_end = m_end;
1028 if (out_nid) 1028 if (out_nid)
1029 *out_nid = m_nid; 1029 *out_nid = m_nid;
1030 idx_a++; 1030 idx_a--;
1031 *idx = (u32)idx_a | (u64)idx_b << 32; 1031 *idx = (u32)idx_a | (u64)idx_b << 32;
1032 return; 1032 return;
1033 } 1033 }
@@ -1465,15 +1465,16 @@ phys_addr_t __init_memblock memblock_end_of_DRAM(void)
1465 return (memblock.memory.regions[idx].base + memblock.memory.regions[idx].size); 1465 return (memblock.memory.regions[idx].base + memblock.memory.regions[idx].size);
1466} 1466}
1467 1467
1468void __init memblock_enforce_memory_limit(phys_addr_t limit) 1468static phys_addr_t __init_memblock __find_max_addr(phys_addr_t limit)
1469{ 1469{
1470 phys_addr_t max_addr = (phys_addr_t)ULLONG_MAX; 1470 phys_addr_t max_addr = (phys_addr_t)ULLONG_MAX;
1471 struct memblock_region *r; 1471 struct memblock_region *r;
1472 1472
1473 if (!limit) 1473 /*
1474 return; 1474 * translate the memory @limit size into the max address within one of
1475 1475 * the memory memblock regions, if the @limit exceeds the total size
1476 /* find out max address */ 1476 * of those regions, max_addr will keep original value ULLONG_MAX
1477 */
1477 for_each_memblock(memory, r) { 1478 for_each_memblock(memory, r) {
1478 if (limit <= r->size) { 1479 if (limit <= r->size) {
1479 max_addr = r->base + limit; 1480 max_addr = r->base + limit;
@@ -1482,6 +1483,22 @@ void __init memblock_enforce_memory_limit(phys_addr_t limit)
1482 limit -= r->size; 1483 limit -= r->size;
1483 } 1484 }
1484 1485
1486 return max_addr;
1487}
1488
1489void __init memblock_enforce_memory_limit(phys_addr_t limit)
1490{
1491 phys_addr_t max_addr = (phys_addr_t)ULLONG_MAX;
1492
1493 if (!limit)
1494 return;
1495
1496 max_addr = __find_max_addr(limit);
1497
1498 /* @limit exceeds the total size of the memory, do nothing */
1499 if (max_addr == (phys_addr_t)ULLONG_MAX)
1500 return;
1501
1485 /* truncate both memory and reserved regions */ 1502 /* truncate both memory and reserved regions */
1486 memblock_remove_range(&memblock.memory, max_addr, 1503 memblock_remove_range(&memblock.memory, max_addr,
1487 (phys_addr_t)ULLONG_MAX); 1504 (phys_addr_t)ULLONG_MAX);
@@ -1489,6 +1506,36 @@ void __init memblock_enforce_memory_limit(phys_addr_t limit)
1489 (phys_addr_t)ULLONG_MAX); 1506 (phys_addr_t)ULLONG_MAX);
1490} 1507}
1491 1508
1509void __init memblock_mem_limit_remove_map(phys_addr_t limit)
1510{
1511 struct memblock_type *type = &memblock.memory;
1512 phys_addr_t max_addr;
1513 int i, ret, start_rgn, end_rgn;
1514
1515 if (!limit)
1516 return;
1517
1518 max_addr = __find_max_addr(limit);
1519
1520 /* @limit exceeds the total size of the memory, do nothing */
1521 if (max_addr == (phys_addr_t)ULLONG_MAX)
1522 return;
1523
1524 ret = memblock_isolate_range(type, max_addr, (phys_addr_t)ULLONG_MAX,
1525 &start_rgn, &end_rgn);
1526 if (ret)
1527 return;
1528
1529 /* remove all the MAP regions above the limit */
1530 for (i = end_rgn - 1; i >= start_rgn; i--) {
1531 if (!memblock_is_nomap(&type->regions[i]))
1532 memblock_remove_region(type, i);
1533 }
1534 /* truncate the reserved regions */
1535 memblock_remove_range(&memblock.reserved, max_addr,
1536 (phys_addr_t)ULLONG_MAX);
1537}
1538
1492static int __init_memblock memblock_search(struct memblock_type *type, phys_addr_t addr) 1539static int __init_memblock memblock_search(struct memblock_type *type, phys_addr_t addr)
1493{ 1540{
1494 unsigned int left = 0, right = type->cnt; 1541 unsigned int left = 0, right = type->cnt;
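
Editor's note on the memblock hunks: the refactor splits the "translate a size limit into a cut-off address" walk into __find_max_addr() — step through the memory regions, and the first region big enough to absorb the remaining limit yields base + limit; if the regions run out, the ULLONG_MAX sentinel tells both callers to do nothing. A standalone sketch of that walk over a made-up region table:

#include <stdio.h>

struct region {
	unsigned long long base;
	unsigned long long size;
};

#define NO_LIMIT 0xffffffffffffffffULL	/* sentinel: limit exceeds all memory */

static unsigned long long find_max_addr(const struct region *r, int nr,
					unsigned long long limit)
{
	int i;

	for (i = 0; i < nr; i++) {
		if (limit <= r[i].size)
			return r[i].base + limit;
		limit -= r[i].size;	/* spill the rest into the next region */
	}
	return NO_LIMIT;
}

int main(void)
{
	/* hypothetical layout: 1G at 0x80000000, 1G at 0x100000000 */
	struct region mem[] = {
		{ 0x80000000ULL,  0x40000000ULL },
		{ 0x100000000ULL, 0x40000000ULL },
	};

	printf("%#llx\n", find_max_addr(mem, 2, 0x60000000ULL)); /* lands in 2nd region */
	printf("%#llx\n", find_max_addr(mem, 2, 0x90000000ULL)); /* exceeds total -> sentinel */
	return 0;
}
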
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index f3a84c64f35c..c265212bec8c 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -132,15 +132,11 @@ static const char * const mem_cgroup_lru_names[] = {
132 * their hierarchy representation 132 * their hierarchy representation
133 */ 133 */
134 134
135struct mem_cgroup_tree_per_zone { 135struct mem_cgroup_tree_per_node {
136 struct rb_root rb_root; 136 struct rb_root rb_root;
137 spinlock_t lock; 137 spinlock_t lock;
138}; 138};
139 139
140struct mem_cgroup_tree_per_node {
141 struct mem_cgroup_tree_per_zone rb_tree_per_zone[MAX_NR_ZONES];
142};
143
144struct mem_cgroup_tree { 140struct mem_cgroup_tree {
145 struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES]; 141 struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES];
146}; 142};
@@ -323,15 +319,6 @@ EXPORT_SYMBOL(memcg_kmem_enabled_key);
323 319
324#endif /* !CONFIG_SLOB */ 320#endif /* !CONFIG_SLOB */
325 321
326static struct mem_cgroup_per_zone *
327mem_cgroup_zone_zoneinfo(struct mem_cgroup *memcg, struct zone *zone)
328{
329 int nid = zone_to_nid(zone);
330 int zid = zone_idx(zone);
331
332 return &memcg->nodeinfo[nid]->zoneinfo[zid];
333}
334
335/** 322/**
336 * mem_cgroup_css_from_page - css of the memcg associated with a page 323 * mem_cgroup_css_from_page - css of the memcg associated with a page
337 * @page: page of interest 324 * @page: page of interest
@@ -383,37 +370,35 @@ ino_t page_cgroup_ino(struct page *page)
383 return ino; 370 return ino;
384} 371}
385 372
386static struct mem_cgroup_per_zone * 373static struct mem_cgroup_per_node *
387mem_cgroup_page_zoneinfo(struct mem_cgroup *memcg, struct page *page) 374mem_cgroup_page_nodeinfo(struct mem_cgroup *memcg, struct page *page)
388{ 375{
389 int nid = page_to_nid(page); 376 int nid = page_to_nid(page);
390 int zid = page_zonenum(page);
391 377
392 return &memcg->nodeinfo[nid]->zoneinfo[zid]; 378 return memcg->nodeinfo[nid];
393} 379}
394 380
395static struct mem_cgroup_tree_per_zone * 381static struct mem_cgroup_tree_per_node *
396soft_limit_tree_node_zone(int nid, int zid) 382soft_limit_tree_node(int nid)
397{ 383{
398 return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid]; 384 return soft_limit_tree.rb_tree_per_node[nid];
399} 385}
400 386
401static struct mem_cgroup_tree_per_zone * 387static struct mem_cgroup_tree_per_node *
402soft_limit_tree_from_page(struct page *page) 388soft_limit_tree_from_page(struct page *page)
403{ 389{
404 int nid = page_to_nid(page); 390 int nid = page_to_nid(page);
405 int zid = page_zonenum(page);
406 391
407 return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid]; 392 return soft_limit_tree.rb_tree_per_node[nid];
408} 393}
409 394
410static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_zone *mz, 395static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_node *mz,
411 struct mem_cgroup_tree_per_zone *mctz, 396 struct mem_cgroup_tree_per_node *mctz,
412 unsigned long new_usage_in_excess) 397 unsigned long new_usage_in_excess)
413{ 398{
414 struct rb_node **p = &mctz->rb_root.rb_node; 399 struct rb_node **p = &mctz->rb_root.rb_node;
415 struct rb_node *parent = NULL; 400 struct rb_node *parent = NULL;
416 struct mem_cgroup_per_zone *mz_node; 401 struct mem_cgroup_per_node *mz_node;
417 402
418 if (mz->on_tree) 403 if (mz->on_tree)
419 return; 404 return;
@@ -423,7 +408,7 @@ static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_zone *mz,
423 return; 408 return;
424 while (*p) { 409 while (*p) {
425 parent = *p; 410 parent = *p;
426 mz_node = rb_entry(parent, struct mem_cgroup_per_zone, 411 mz_node = rb_entry(parent, struct mem_cgroup_per_node,
427 tree_node); 412 tree_node);
428 if (mz->usage_in_excess < mz_node->usage_in_excess) 413 if (mz->usage_in_excess < mz_node->usage_in_excess)
429 p = &(*p)->rb_left; 414 p = &(*p)->rb_left;
@@ -439,8 +424,8 @@ static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_zone *mz,
439 mz->on_tree = true; 424 mz->on_tree = true;
440} 425}
441 426
442static void __mem_cgroup_remove_exceeded(struct mem_cgroup_per_zone *mz, 427static void __mem_cgroup_remove_exceeded(struct mem_cgroup_per_node *mz,
443 struct mem_cgroup_tree_per_zone *mctz) 428 struct mem_cgroup_tree_per_node *mctz)
444{ 429{
445 if (!mz->on_tree) 430 if (!mz->on_tree)
446 return; 431 return;
@@ -448,8 +433,8 @@ static void __mem_cgroup_remove_exceeded(struct mem_cgroup_per_zone *mz,
448 mz->on_tree = false; 433 mz->on_tree = false;
449} 434}
450 435
451static void mem_cgroup_remove_exceeded(struct mem_cgroup_per_zone *mz, 436static void mem_cgroup_remove_exceeded(struct mem_cgroup_per_node *mz,
452 struct mem_cgroup_tree_per_zone *mctz) 437 struct mem_cgroup_tree_per_node *mctz)
453{ 438{
454 unsigned long flags; 439 unsigned long flags;
455 440
@@ -473,8 +458,8 @@ static unsigned long soft_limit_excess(struct mem_cgroup *memcg)
473static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page) 458static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page)
474{ 459{
475 unsigned long excess; 460 unsigned long excess;
476 struct mem_cgroup_per_zone *mz; 461 struct mem_cgroup_per_node *mz;
477 struct mem_cgroup_tree_per_zone *mctz; 462 struct mem_cgroup_tree_per_node *mctz;
478 463
479 mctz = soft_limit_tree_from_page(page); 464 mctz = soft_limit_tree_from_page(page);
480 /* 465 /*
@@ -482,7 +467,7 @@ static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page)
482 * because their event counter is not touched. 467 * because their event counter is not touched.
483 */ 468 */
484 for (; memcg; memcg = parent_mem_cgroup(memcg)) { 469 for (; memcg; memcg = parent_mem_cgroup(memcg)) {
485 mz = mem_cgroup_page_zoneinfo(memcg, page); 470 mz = mem_cgroup_page_nodeinfo(memcg, page);
486 excess = soft_limit_excess(memcg); 471 excess = soft_limit_excess(memcg);
487 /* 472 /*
488 * We have to update the tree if mz is on RB-tree or 473 * We have to update the tree if mz is on RB-tree or
@@ -507,24 +492,22 @@ static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page)
507 492
508static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg) 493static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg)
509{ 494{
510 struct mem_cgroup_tree_per_zone *mctz; 495 struct mem_cgroup_tree_per_node *mctz;
511 struct mem_cgroup_per_zone *mz; 496 struct mem_cgroup_per_node *mz;
512 int nid, zid; 497 int nid;
513 498
514 for_each_node(nid) { 499 for_each_node(nid) {
515 for (zid = 0; zid < MAX_NR_ZONES; zid++) { 500 mz = mem_cgroup_nodeinfo(memcg, nid);
516 mz = &memcg->nodeinfo[nid]->zoneinfo[zid]; 501 mctz = soft_limit_tree_node(nid);
517 mctz = soft_limit_tree_node_zone(nid, zid); 502 mem_cgroup_remove_exceeded(mz, mctz);
518 mem_cgroup_remove_exceeded(mz, mctz);
519 }
520 } 503 }
521} 504}
522 505
523static struct mem_cgroup_per_zone * 506static struct mem_cgroup_per_node *
524__mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz) 507__mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
525{ 508{
526 struct rb_node *rightmost = NULL; 509 struct rb_node *rightmost = NULL;
527 struct mem_cgroup_per_zone *mz; 510 struct mem_cgroup_per_node *mz;
528 511
529retry: 512retry:
530 mz = NULL; 513 mz = NULL;
@@ -532,7 +515,7 @@ retry:
532 if (!rightmost) 515 if (!rightmost)
533 goto done; /* Nothing to reclaim from */ 516 goto done; /* Nothing to reclaim from */
534 517
535 mz = rb_entry(rightmost, struct mem_cgroup_per_zone, tree_node); 518 mz = rb_entry(rightmost, struct mem_cgroup_per_node, tree_node);
536 /* 519 /*
537 * Remove the node now but someone else can add it back, 520 * Remove the node now but someone else can add it back,
538 * we will add it back at the end of reclaim to its correct 521
@@ -546,10 +529,10 @@ done:
546 return mz; 529 return mz;
547} 530}
548 531
549static struct mem_cgroup_per_zone * 532static struct mem_cgroup_per_node *
550mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz) 533mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
551{ 534{
552 struct mem_cgroup_per_zone *mz; 535 struct mem_cgroup_per_node *mz;
553 536
554 spin_lock_irq(&mctz->lock); 537 spin_lock_irq(&mctz->lock);
555 mz = __mem_cgroup_largest_soft_limit_node(mctz); 538 mz = __mem_cgroup_largest_soft_limit_node(mctz);
@@ -643,20 +626,16 @@ unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
643 int nid, unsigned int lru_mask) 626 int nid, unsigned int lru_mask)
644{ 627{
645 unsigned long nr = 0; 628 unsigned long nr = 0;
646 int zid; 629 struct mem_cgroup_per_node *mz;
630 enum lru_list lru;
647 631
648 VM_BUG_ON((unsigned)nid >= nr_node_ids); 632 VM_BUG_ON((unsigned)nid >= nr_node_ids);
649 633
650 for (zid = 0; zid < MAX_NR_ZONES; zid++) { 634 for_each_lru(lru) {
651 struct mem_cgroup_per_zone *mz; 635 if (!(BIT(lru) & lru_mask))
652 enum lru_list lru; 636 continue;
653 637 mz = mem_cgroup_nodeinfo(memcg, nid);
654 for_each_lru(lru) { 638 nr += mz->lru_size[lru];
655 if (!(BIT(lru) & lru_mask))
656 continue;
657 mz = &memcg->nodeinfo[nid]->zoneinfo[zid];
658 nr += mz->lru_size[lru];
659 }
660 } 639 }
661 return nr; 640 return nr;
662} 641}
@@ -809,9 +788,9 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
809 rcu_read_lock(); 788 rcu_read_lock();
810 789
811 if (reclaim) { 790 if (reclaim) {
812 struct mem_cgroup_per_zone *mz; 791 struct mem_cgroup_per_node *mz;
813 792
814 mz = mem_cgroup_zone_zoneinfo(root, reclaim->zone); 793 mz = mem_cgroup_nodeinfo(root, reclaim->pgdat->node_id);
815 iter = &mz->iter[reclaim->priority]; 794 iter = &mz->iter[reclaim->priority];
816 795
817 if (prev && reclaim->generation != iter->generation) 796 if (prev && reclaim->generation != iter->generation)
@@ -910,19 +889,17 @@ static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg)
910{ 889{
911 struct mem_cgroup *memcg = dead_memcg; 890 struct mem_cgroup *memcg = dead_memcg;
912 struct mem_cgroup_reclaim_iter *iter; 891 struct mem_cgroup_reclaim_iter *iter;
913 struct mem_cgroup_per_zone *mz; 892 struct mem_cgroup_per_node *mz;
914 int nid, zid; 893 int nid;
915 int i; 894 int i;
916 895
917 while ((memcg = parent_mem_cgroup(memcg))) { 896 while ((memcg = parent_mem_cgroup(memcg))) {
918 for_each_node(nid) { 897 for_each_node(nid) {
919 for (zid = 0; zid < MAX_NR_ZONES; zid++) { 898 mz = mem_cgroup_nodeinfo(memcg, nid);
920 mz = &memcg->nodeinfo[nid]->zoneinfo[zid]; 899 for (i = 0; i <= DEF_PRIORITY; i++) {
921 for (i = 0; i <= DEF_PRIORITY; i++) { 900 iter = &mz->iter[i];
922 iter = &mz->iter[i]; 901 cmpxchg(&iter->position,
923 cmpxchg(&iter->position, 902 dead_memcg, NULL);
924 dead_memcg, NULL);
925 }
926 } 903 }
927 } 904 }
928 } 905 }
@@ -944,39 +921,6 @@ static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg)
944 iter = mem_cgroup_iter(NULL, iter, NULL)) 921 iter = mem_cgroup_iter(NULL, iter, NULL))
945 922
946/** 923/**
947 * mem_cgroup_zone_lruvec - get the lru list vector for a zone and memcg
948 * @zone: zone of the wanted lruvec
949 * @memcg: memcg of the wanted lruvec
950 *
951 * Returns the lru list vector holding pages for the given @zone and
952 * @mem. This can be the global zone lruvec, if the memory controller
953 * is disabled.
954 */
955struct lruvec *mem_cgroup_zone_lruvec(struct zone *zone,
956 struct mem_cgroup *memcg)
957{
958 struct mem_cgroup_per_zone *mz;
959 struct lruvec *lruvec;
960
961 if (mem_cgroup_disabled()) {
962 lruvec = &zone->lruvec;
963 goto out;
964 }
965
966 mz = mem_cgroup_zone_zoneinfo(memcg, zone);
967 lruvec = &mz->lruvec;
968out:
969 /*
970 * Since a node can be onlined after the mem_cgroup was created,
971 * we have to be prepared to initialize lruvec->zone here;
972 * and if offlined then reonlined, we need to reinitialize it.
973 */
974 if (unlikely(lruvec->zone != zone))
975 lruvec->zone = zone;
976 return lruvec;
977}
978
979/**
980 * mem_cgroup_page_lruvec - return lruvec for isolating/putting an LRU page 924 * mem_cgroup_page_lruvec - return lruvec for isolating/putting an LRU page
981 * @page: the page 925 * @page: the page
982 * @zone: zone of the page 926 * @zone: zone of the page
@@ -985,14 +929,14 @@ out:
985 * and putback protocol: the LRU lock must be held, and the page must 929 * and putback protocol: the LRU lock must be held, and the page must
986 * either be PageLRU() or the caller must have isolated/allocated it. 930 * either be PageLRU() or the caller must have isolated/allocated it.
987 */ 931 */
988struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct zone *zone) 932struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct pglist_data *pgdat)
989{ 933{
990 struct mem_cgroup_per_zone *mz; 934 struct mem_cgroup_per_node *mz;
991 struct mem_cgroup *memcg; 935 struct mem_cgroup *memcg;
992 struct lruvec *lruvec; 936 struct lruvec *lruvec;
993 937
994 if (mem_cgroup_disabled()) { 938 if (mem_cgroup_disabled()) {
995 lruvec = &zone->lruvec; 939 lruvec = &pgdat->lruvec;
996 goto out; 940 goto out;
997 } 941 }
998 942
@@ -1004,7 +948,7 @@ struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct zone *zone)
1004 if (!memcg) 948 if (!memcg)
1005 memcg = root_mem_cgroup; 949 memcg = root_mem_cgroup;
1006 950
1007 mz = mem_cgroup_page_zoneinfo(memcg, page); 951 mz = mem_cgroup_page_nodeinfo(memcg, page);
1008 lruvec = &mz->lruvec; 952 lruvec = &mz->lruvec;
1009out: 953out:
1010 /* 954 /*
@@ -1012,8 +956,8 @@ out:
1012 * we have to be prepared to initialize lruvec->zone here; 956 * we have to be prepared to initialize lruvec->zone here;
1013 * and if offlined then reonlined, we need to reinitialize it. 957 * and if offlined then reonlined, we need to reinitialize it.
1014 */ 958 */
1015 if (unlikely(lruvec->zone != zone)) 959 if (unlikely(lruvec->pgdat != pgdat))
1016 lruvec->zone = zone; 960 lruvec->pgdat = pgdat;
1017 return lruvec; 961 return lruvec;
1018} 962}
1019 963
@@ -1030,17 +974,15 @@ out:
1030void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru, 974void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
1031 int nr_pages) 975 int nr_pages)
1032{ 976{
1033 struct mem_cgroup_per_zone *mz; 977 struct mem_cgroup_per_node *mz;
1034 unsigned long *lru_size; 978 unsigned long *lru_size;
1035 long size; 979 long size;
1036 bool empty; 980 bool empty;
1037 981
1038 __update_lru_size(lruvec, lru, nr_pages);
1039
1040 if (mem_cgroup_disabled()) 982 if (mem_cgroup_disabled())
1041 return; 983 return;
1042 984
1043 mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec); 985 mz = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
1044 lru_size = mz->lru_size + lru; 986 lru_size = mz->lru_size + lru;
1045 empty = list_empty(lruvec->lists + lru); 987 empty = list_empty(lruvec->lists + lru);
1046 988
@@ -1276,9 +1218,9 @@ static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
1276 * select it. The goal is to allow it to allocate so that it may 1218 * select it. The goal is to allow it to allocate so that it may
1277 * quickly exit and free its memory. 1219 * quickly exit and free its memory.
1278 */ 1220 */
1279 if (fatal_signal_pending(current) || task_will_free_mem(current)) { 1221 if (task_will_free_mem(current)) {
1280 mark_oom_victim(current); 1222 mark_oom_victim(current);
1281 try_oom_reaper(current); 1223 wake_oom_reaper(current);
1282 goto unlock; 1224 goto unlock;
1283 } 1225 }
1284 1226
@@ -1433,7 +1375,7 @@ int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
1433#endif 1375#endif
1434 1376
1435static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg, 1377static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg,
1436 struct zone *zone, 1378 pg_data_t *pgdat,
1437 gfp_t gfp_mask, 1379 gfp_t gfp_mask,
1438 unsigned long *total_scanned) 1380 unsigned long *total_scanned)
1439{ 1381{
@@ -1443,7 +1385,7 @@ static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg,
1443 unsigned long excess; 1385 unsigned long excess;
1444 unsigned long nr_scanned; 1386 unsigned long nr_scanned;
1445 struct mem_cgroup_reclaim_cookie reclaim = { 1387 struct mem_cgroup_reclaim_cookie reclaim = {
1446 .zone = zone, 1388 .pgdat = pgdat,
1447 .priority = 0, 1389 .priority = 0,
1448 }; 1390 };
1449 1391
@@ -1473,8 +1415,8 @@ static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg,
1473 } 1415 }
1474 continue; 1416 continue;
1475 } 1417 }
1476 total += mem_cgroup_shrink_node_zone(victim, gfp_mask, false, 1418 total += mem_cgroup_shrink_node(victim, gfp_mask, false,
1477 zone, &nr_scanned); 1419 pgdat, &nr_scanned);
1478 *total_scanned += nr_scanned; 1420 *total_scanned += nr_scanned;
1479 if (!soft_limit_excess(root_memcg)) 1421 if (!soft_limit_excess(root_memcg))
1480 break; 1422 break;
@@ -2107,11 +2049,11 @@ static void lock_page_lru(struct page *page, int *isolated)
2107{ 2049{
2108 struct zone *zone = page_zone(page); 2050 struct zone *zone = page_zone(page);
2109 2051
2110 spin_lock_irq(&zone->lru_lock); 2052 spin_lock_irq(zone_lru_lock(zone));
2111 if (PageLRU(page)) { 2053 if (PageLRU(page)) {
2112 struct lruvec *lruvec; 2054 struct lruvec *lruvec;
2113 2055
2114 lruvec = mem_cgroup_page_lruvec(page, zone); 2056 lruvec = mem_cgroup_page_lruvec(page, zone->zone_pgdat);
2115 ClearPageLRU(page); 2057 ClearPageLRU(page);
2116 del_page_from_lru_list(page, lruvec, page_lru(page)); 2058 del_page_from_lru_list(page, lruvec, page_lru(page));
2117 *isolated = 1; 2059 *isolated = 1;
@@ -2126,12 +2068,12 @@ static void unlock_page_lru(struct page *page, int isolated)
2126 if (isolated) { 2068 if (isolated) {
2127 struct lruvec *lruvec; 2069 struct lruvec *lruvec;
2128 2070
2129 lruvec = mem_cgroup_page_lruvec(page, zone); 2071 lruvec = mem_cgroup_page_lruvec(page, zone->zone_pgdat);
2130 VM_BUG_ON_PAGE(PageLRU(page), page); 2072 VM_BUG_ON_PAGE(PageLRU(page), page);
2131 SetPageLRU(page); 2073 SetPageLRU(page);
2132 add_page_to_lru_list(page, lruvec, page_lru(page)); 2074 add_page_to_lru_list(page, lruvec, page_lru(page));
2133 } 2075 }
2134 spin_unlock_irq(&zone->lru_lock); 2076 spin_unlock_irq(zone_lru_lock(zone));
2135} 2077}
2136 2078
2137static void commit_charge(struct page *page, struct mem_cgroup *memcg, 2079static void commit_charge(struct page *page, struct mem_cgroup *memcg,
@@ -2431,7 +2373,7 @@ void memcg_kmem_uncharge(struct page *page, int order)
2431 2373
2432/* 2374/*
2433 * Because tail pages are not marked as "used", set it. We're under 2375 * Because tail pages are not marked as "used", set it. We're under
2434 * zone->lru_lock and migration entries setup in all page mappings. 2376 * zone_lru_lock and migration entries setup in all page mappings.
2435 */ 2377 */
2436void mem_cgroup_split_huge_fixup(struct page *head) 2378void mem_cgroup_split_huge_fixup(struct page *head)
2437{ 2379{
@@ -2601,22 +2543,22 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
2601 return ret; 2543 return ret;
2602} 2544}
2603 2545
2604unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, 2546unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
2605 gfp_t gfp_mask, 2547 gfp_t gfp_mask,
2606 unsigned long *total_scanned) 2548 unsigned long *total_scanned)
2607{ 2549{
2608 unsigned long nr_reclaimed = 0; 2550 unsigned long nr_reclaimed = 0;
2609 struct mem_cgroup_per_zone *mz, *next_mz = NULL; 2551 struct mem_cgroup_per_node *mz, *next_mz = NULL;
2610 unsigned long reclaimed; 2552 unsigned long reclaimed;
2611 int loop = 0; 2553 int loop = 0;
2612 struct mem_cgroup_tree_per_zone *mctz; 2554 struct mem_cgroup_tree_per_node *mctz;
2613 unsigned long excess; 2555 unsigned long excess;
2614 unsigned long nr_scanned; 2556 unsigned long nr_scanned;
2615 2557
2616 if (order > 0) 2558 if (order > 0)
2617 return 0; 2559 return 0;
2618 2560
2619 mctz = soft_limit_tree_node_zone(zone_to_nid(zone), zone_idx(zone)); 2561 mctz = soft_limit_tree_node(pgdat->node_id);
2620 /* 2562 /*
2621 * This loop can run a while, especially if mem_cgroups continuously 2563
2622 * keep exceeding their soft limit and putting the system under 2564 * keep exceeding their soft limit and putting the system under
@@ -2631,7 +2573,7 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
2631 break; 2573 break;
2632 2574
2633 nr_scanned = 0; 2575 nr_scanned = 0;
2634 reclaimed = mem_cgroup_soft_reclaim(mz->memcg, zone, 2576 reclaimed = mem_cgroup_soft_reclaim(mz->memcg, pgdat,
2635 gfp_mask, &nr_scanned); 2577 gfp_mask, &nr_scanned);
2636 nr_reclaimed += reclaimed; 2578 nr_reclaimed += reclaimed;
2637 *total_scanned += nr_scanned; 2579 *total_scanned += nr_scanned;
@@ -3252,22 +3194,21 @@ static int memcg_stat_show(struct seq_file *m, void *v)
3252 3194
3253#ifdef CONFIG_DEBUG_VM 3195#ifdef CONFIG_DEBUG_VM
3254 { 3196 {
3255 int nid, zid; 3197 pg_data_t *pgdat;
3256 struct mem_cgroup_per_zone *mz; 3198 struct mem_cgroup_per_node *mz;
3257 struct zone_reclaim_stat *rstat; 3199 struct zone_reclaim_stat *rstat;
3258 unsigned long recent_rotated[2] = {0, 0}; 3200 unsigned long recent_rotated[2] = {0, 0};
3259 unsigned long recent_scanned[2] = {0, 0}; 3201 unsigned long recent_scanned[2] = {0, 0};
3260 3202
3261 for_each_online_node(nid) 3203 for_each_online_pgdat(pgdat) {
3262 for (zid = 0; zid < MAX_NR_ZONES; zid++) { 3204 mz = mem_cgroup_nodeinfo(memcg, pgdat->node_id);
3263 mz = &memcg->nodeinfo[nid]->zoneinfo[zid]; 3205 rstat = &mz->lruvec.reclaim_stat;
3264 rstat = &mz->lruvec.reclaim_stat;
3265 3206
3266 recent_rotated[0] += rstat->recent_rotated[0]; 3207 recent_rotated[0] += rstat->recent_rotated[0];
3267 recent_rotated[1] += rstat->recent_rotated[1]; 3208 recent_rotated[1] += rstat->recent_rotated[1];
3268 recent_scanned[0] += rstat->recent_scanned[0]; 3209 recent_scanned[0] += rstat->recent_scanned[0];
3269 recent_scanned[1] += rstat->recent_scanned[1]; 3210 recent_scanned[1] += rstat->recent_scanned[1];
3270 } 3211 }
3271 seq_printf(m, "recent_rotated_anon %lu\n", recent_rotated[0]); 3212 seq_printf(m, "recent_rotated_anon %lu\n", recent_rotated[0]);
3272 seq_printf(m, "recent_rotated_file %lu\n", recent_rotated[1]); 3213 seq_printf(m, "recent_rotated_file %lu\n", recent_rotated[1]);
3273 seq_printf(m, "recent_scanned_anon %lu\n", recent_scanned[0]); 3214 seq_printf(m, "recent_scanned_anon %lu\n", recent_scanned[0]);
@@ -4147,11 +4088,10 @@ struct mem_cgroup *mem_cgroup_from_id(unsigned short id)
4147 return idr_find(&mem_cgroup_idr, id); 4088 return idr_find(&mem_cgroup_idr, id);
4148} 4089}
4149 4090
4150static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node) 4091static int alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
4151{ 4092{
4152 struct mem_cgroup_per_node *pn; 4093 struct mem_cgroup_per_node *pn;
4153 struct mem_cgroup_per_zone *mz; 4094 int tmp = node;
4154 int zone, tmp = node;
4155 /* 4095 /*
4156 * This routine is called against possible nodes. 4096 * This routine is called against possible nodes.
4157 * But it's BUG to call kmalloc() against offline node. 4097 * But it's BUG to call kmalloc() against offline node.
@@ -4166,18 +4106,16 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
4166 if (!pn) 4106 if (!pn)
4167 return 1; 4107 return 1;
4168 4108
4169 for (zone = 0; zone < MAX_NR_ZONES; zone++) { 4109 lruvec_init(&pn->lruvec);
4170 mz = &pn->zoneinfo[zone]; 4110 pn->usage_in_excess = 0;
4171 lruvec_init(&mz->lruvec); 4111 pn->on_tree = false;
4172 mz->usage_in_excess = 0; 4112 pn->memcg = memcg;
4173 mz->on_tree = false; 4113
4174 mz->memcg = memcg;
4175 }
4176 memcg->nodeinfo[node] = pn; 4114 memcg->nodeinfo[node] = pn;
4177 return 0; 4115 return 0;
4178} 4116}
4179 4117
4180static void free_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node) 4118static void free_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
4181{ 4119{
4182 kfree(memcg->nodeinfo[node]); 4120 kfree(memcg->nodeinfo[node]);
4183} 4121}
@@ -4188,7 +4126,7 @@ static void mem_cgroup_free(struct mem_cgroup *memcg)
4188 4126
4189 memcg_wb_domain_exit(memcg); 4127 memcg_wb_domain_exit(memcg);
4190 for_each_node(node) 4128 for_each_node(node)
4191 free_mem_cgroup_per_zone_info(memcg, node); 4129 free_mem_cgroup_per_node_info(memcg, node);
4192 free_percpu(memcg->stat); 4130 free_percpu(memcg->stat);
4193 kfree(memcg); 4131 kfree(memcg);
4194} 4132}
@@ -4217,7 +4155,7 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
4217 goto fail; 4155 goto fail;
4218 4156
4219 for_each_node(node) 4157 for_each_node(node)
4220 if (alloc_mem_cgroup_per_zone_info(memcg, node)) 4158 if (alloc_mem_cgroup_per_node_info(memcg, node))
4221 goto fail; 4159 goto fail;
4222 4160
4223 if (memcg_wb_domain_init(memcg, GFP_KERNEL)) 4161 if (memcg_wb_domain_init(memcg, GFP_KERNEL))
@@ -5233,7 +5171,7 @@ static int memory_stat_show(struct seq_file *m, void *v)
5233 seq_printf(m, "file %llu\n", 5171 seq_printf(m, "file %llu\n",
5234 (u64)stat[MEM_CGROUP_STAT_CACHE] * PAGE_SIZE); 5172 (u64)stat[MEM_CGROUP_STAT_CACHE] * PAGE_SIZE);
5235 seq_printf(m, "kernel_stack %llu\n", 5173 seq_printf(m, "kernel_stack %llu\n",
5236 (u64)stat[MEMCG_KERNEL_STACK] * PAGE_SIZE); 5174 (u64)stat[MEMCG_KERNEL_STACK_KB] * 1024);
5237 seq_printf(m, "slab %llu\n", 5175 seq_printf(m, "slab %llu\n",
5238 (u64)(stat[MEMCG_SLAB_RECLAIMABLE] + 5176 (u64)(stat[MEMCG_SLAB_RECLAIMABLE] +
5239 stat[MEMCG_SLAB_UNRECLAIMABLE]) * PAGE_SIZE); 5177 stat[MEMCG_SLAB_UNRECLAIMABLE]) * PAGE_SIZE);
@@ -5820,18 +5758,12 @@ static int __init mem_cgroup_init(void)
5820 5758
5821 for_each_node(node) { 5759 for_each_node(node) {
5822 struct mem_cgroup_tree_per_node *rtpn; 5760 struct mem_cgroup_tree_per_node *rtpn;
5823 int zone;
5824 5761
5825 rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, 5762 rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL,
5826 node_online(node) ? node : NUMA_NO_NODE); 5763 node_online(node) ? node : NUMA_NO_NODE);
5827 5764
5828 for (zone = 0; zone < MAX_NR_ZONES; zone++) { 5765 rtpn->rb_root = RB_ROOT;
5829 struct mem_cgroup_tree_per_zone *rtpz; 5766 spin_lock_init(&rtpn->lock);
5830
5831 rtpz = &rtpn->rb_tree_per_zone[zone];
5832 rtpz->rb_root = RB_ROOT;
5833 spin_lock_init(&rtpz->lock);
5834 }
5835 soft_limit_tree.rb_tree_per_node[node] = rtpn; 5767 soft_limit_tree.rb_tree_per_node[node] = rtpn;
5836 } 5768 }
5837 5769
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 2fcca6b0e005..de88f33519c0 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -741,8 +741,6 @@ static int me_huge_page(struct page *p, unsigned long pfn)
741 * page->lru because it can be used in other hugepage operations, 741 * page->lru because it can be used in other hugepage operations,
742 * such as __unmap_hugepage_range() and gather_surplus_pages(). 742 * such as __unmap_hugepage_range() and gather_surplus_pages().
743 * So instead we use page_mapping() and PageAnon(). 743 * So instead we use page_mapping() and PageAnon().
744 * We assume that this function is called with page lock held,
745 * so there is no race between isolation and mapping/unmapping.
746 */ 744 */
747 if (!(page_mapping(hpage) || PageAnon(hpage))) { 745 if (!(page_mapping(hpage) || PageAnon(hpage))) {
748 res = dequeue_hwpoisoned_huge_page(hpage); 746 res = dequeue_hwpoisoned_huge_page(hpage);
@@ -1663,7 +1661,7 @@ static int __soft_offline_page(struct page *page, int flags)
1663 put_hwpoison_page(page); 1661 put_hwpoison_page(page);
1664 if (!ret) { 1662 if (!ret) {
1665 LIST_HEAD(pagelist); 1663 LIST_HEAD(pagelist);
1666 inc_zone_page_state(page, NR_ISOLATED_ANON + 1664 inc_node_page_state(page, NR_ISOLATED_ANON +
1667 page_is_file_cache(page)); 1665 page_is_file_cache(page));
1668 list_add(&page->lru, &pagelist); 1666 list_add(&page->lru, &pagelist);
1669 ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL, 1667 ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL,
@@ -1671,7 +1669,7 @@ static int __soft_offline_page(struct page *page, int flags)
1671 if (ret) { 1669 if (ret) {
1672 if (!list_empty(&pagelist)) { 1670 if (!list_empty(&pagelist)) {
1673 list_del(&page->lru); 1671 list_del(&page->lru);
1674 dec_zone_page_state(page, NR_ISOLATED_ANON + 1672 dec_node_page_state(page, NR_ISOLATED_ANON +
1675 page_is_file_cache(page)); 1673 page_is_file_cache(page));
1676 putback_lru_page(page); 1674 putback_lru_page(page);
1677 } 1675 }
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 82d0b98d27f8..3894b65b1555 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1209,9 +1209,10 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start)
1209 1209
1210 arch_refresh_nodedata(nid, pgdat); 1210 arch_refresh_nodedata(nid, pgdat);
1211 } else { 1211 } else {
1212 /* Reset the nr_zones and classzone_idx to 0 before reuse */ 1212 /* Reset the nr_zones, order and classzone_idx before reuse */
1213 pgdat->nr_zones = 0; 1213 pgdat->nr_zones = 0;
1214 pgdat->classzone_idx = 0; 1214 pgdat->kswapd_order = 0;
1215 pgdat->kswapd_classzone_idx = 0;
1215 } 1216 }
1216 1217
1217 /* we can use NODE_DATA(nid) from here */ 1218 /* we can use NODE_DATA(nid) from here */
@@ -1547,6 +1548,37 @@ static unsigned long scan_movable_pages(unsigned long start, unsigned long end)
1547 return 0; 1548 return 0;
1548} 1549}
1549 1550
1551static struct page *new_node_page(struct page *page, unsigned long private,
1552 int **result)
1553{
1554 gfp_t gfp_mask = GFP_USER | __GFP_MOVABLE;
1555 int nid = page_to_nid(page);
1556 nodemask_t nmask = node_online_map;
1557 struct page *new_page;
1558
1559 /*
1560 * TODO: allocate a destination hugepage from a nearest neighbor node,
1561 * in accordance with the memory policy of the user process if possible. For
1562 * now as a simple work-around, we use the next node for destination.
1563 */
1564 if (PageHuge(page))
1565 return alloc_huge_page_node(page_hstate(compound_head(page)),
1566 next_node_in(nid, nmask));
1567
1568 node_clear(nid, nmask);
1569 if (PageHighMem(page)
1570 || (zone_idx(page_zone(page)) == ZONE_MOVABLE))
1571 gfp_mask |= __GFP_HIGHMEM;
1572
1573 new_page = __alloc_pages_nodemask(gfp_mask, 0,
1574 node_zonelist(nid, gfp_mask), &nmask);
1575 if (!new_page)
1576 new_page = __alloc_pages(gfp_mask, 0,
1577 node_zonelist(nid, gfp_mask));
1578
1579 return new_page;
1580}
1581
1550#define NR_OFFLINE_AT_ONCE_PAGES (256) 1582#define NR_OFFLINE_AT_ONCE_PAGES (256)
1551static int 1583static int
1552do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) 1584do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
@@ -1586,7 +1618,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
1586 put_page(page); 1618 put_page(page);
1587 list_add_tail(&page->lru, &source); 1619 list_add_tail(&page->lru, &source);
1588 move_pages--; 1620 move_pages--;
1589 inc_zone_page_state(page, NR_ISOLATED_ANON + 1621 inc_node_page_state(page, NR_ISOLATED_ANON +
1590 page_is_file_cache(page)); 1622 page_is_file_cache(page));
1591 1623
1592 } else { 1624 } else {
@@ -1610,11 +1642,8 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
1610 goto out; 1642 goto out;
1611 } 1643 }
1612 1644
1613 /* 1645 /* Allocate a new page from the nearest neighbor node */
1614 * alloc_migrate_target should be improooooved!! 1646 ret = migrate_pages(&source, new_node_page, NULL, 0,
1615 * migrate_pages returns # of failed pages.
1616 */
1617 ret = migrate_pages(&source, alloc_migrate_target, NULL, 0,
1618 MIGRATE_SYNC, MR_MEMORY_HOTPLUG); 1647 MIGRATE_SYNC, MR_MEMORY_HOTPLUG);
1619 if (ret) 1648 if (ret)
1620 putback_movable_pages(&source); 1649 putback_movable_pages(&source);
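The memory_hotplug.c change above replaces the generic alloc_migrate_target() with new_node_page(), which drops the node being offlined from the candidate mask and prefers the next online node. A small stand-alone C sketch of that node-selection idea follows; the bitmask type and helper are invented stand-ins for nodemask_t and next_node_in(), not kernel code.

/*
 * Stand-alone model of "exclude the offlining node, pick its nearest
 * online neighbor, wrapping around".
 */
#include <stdio.h>

#define MAX_NODES 8

/* Next set bit strictly after 'nid', wrapping; -1 if the mask is empty. */
static int next_node_in_mask(int nid, unsigned int mask)
{
	int i;

	for (i = 1; i <= MAX_NODES; i++) {
		int cand = (nid + i) % MAX_NODES;
		if (mask & (1u << cand))
			return cand;
	}
	return -1;
}

int main(void)
{
	unsigned int online = 0x0b;	/* nodes 0, 1 and 3 online */
	int src = 1;			/* node being offlined */

	/* Exclude the source node, then pick the nearest remaining one. */
	unsigned int candidates = online & ~(1u << src);

	printf("migrate pages from node %d to node %d\n",
	       src, next_node_in_mask(src, candidates));
	return 0;
}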
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 53e40d3f3933..d8c4e38fb5f4 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -962,7 +962,7 @@ static void migrate_page_add(struct page *page, struct list_head *pagelist,
962 if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) { 962 if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) {
963 if (!isolate_lru_page(page)) { 963 if (!isolate_lru_page(page)) {
964 list_add_tail(&page->lru, pagelist); 964 list_add_tail(&page->lru, pagelist);
965 inc_zone_page_state(page, NR_ISOLATED_ANON + 965 inc_node_page_state(page, NR_ISOLATED_ANON +
966 page_is_file_cache(page)); 966 page_is_file_cache(page));
967 } 967 }
968 } 968 }
diff --git a/mm/mempool.c b/mm/mempool.c
index 8f65464da5de..47a659dedd44 100644
--- a/mm/mempool.c
+++ b/mm/mempool.c
@@ -306,7 +306,7 @@ EXPORT_SYMBOL(mempool_resize);
306 * returns NULL. Note that due to preallocation, this function 306 * returns NULL. Note that due to preallocation, this function
307 * *never* fails when called from process contexts. (it might 307 * *never* fails when called from process contexts. (it might
308 * fail if called from an IRQ context.) 308 * fail if called from an IRQ context.)
309 * Note: neither __GFP_NOMEMALLOC nor __GFP_ZERO are supported. 309 * Note: using __GFP_ZERO is not supported.
310 */ 310 */
311void *mempool_alloc(mempool_t *pool, gfp_t gfp_mask) 311void *mempool_alloc(mempool_t *pool, gfp_t gfp_mask)
312{ 312{
@@ -315,27 +315,16 @@ void *mempool_alloc(mempool_t *pool, gfp_t gfp_mask)
315 wait_queue_t wait; 315 wait_queue_t wait;
316 gfp_t gfp_temp; 316 gfp_t gfp_temp;
317 317
318 /* If oom killed, memory reserves are essential to prevent livelock */
319 VM_WARN_ON_ONCE(gfp_mask & __GFP_NOMEMALLOC);
320 /* No element size to zero on allocation */
321 VM_WARN_ON_ONCE(gfp_mask & __GFP_ZERO); 318 VM_WARN_ON_ONCE(gfp_mask & __GFP_ZERO);
322
323 might_sleep_if(gfp_mask & __GFP_DIRECT_RECLAIM); 319 might_sleep_if(gfp_mask & __GFP_DIRECT_RECLAIM);
324 320
321 gfp_mask |= __GFP_NOMEMALLOC; /* don't allocate emergency reserves */
325 gfp_mask |= __GFP_NORETRY; /* don't loop in __alloc_pages */ 322 gfp_mask |= __GFP_NORETRY; /* don't loop in __alloc_pages */
326 gfp_mask |= __GFP_NOWARN; /* failures are OK */ 323 gfp_mask |= __GFP_NOWARN; /* failures are OK */
327 324
328 gfp_temp = gfp_mask & ~(__GFP_DIRECT_RECLAIM|__GFP_IO); 325 gfp_temp = gfp_mask & ~(__GFP_DIRECT_RECLAIM|__GFP_IO);
329 326
330repeat_alloc: 327repeat_alloc:
331 if (likely(pool->curr_nr)) {
332 /*
333 * Don't allocate from emergency reserves if there are
334 * elements available. This check is racy, but it will
335 * be rechecked each loop.
336 */
337 gfp_temp |= __GFP_NOMEMALLOC;
338 }
339 328
340 element = pool->alloc(gfp_temp, pool->pool_data); 329 element = pool->alloc(gfp_temp, pool->pool_data);
341 if (likely(element != NULL)) 330 if (likely(element != NULL))
@@ -359,12 +348,11 @@ repeat_alloc:
359 * We use gfp mask w/o direct reclaim or IO for the first round. If 348 * We use gfp mask w/o direct reclaim or IO for the first round. If
360 * alloc failed with that and @pool was empty, retry immediately. 349 * alloc failed with that and @pool was empty, retry immediately.
361 */ 350 */
362 if ((gfp_temp & ~__GFP_NOMEMALLOC) != gfp_mask) { 351 if (gfp_temp != gfp_mask) {
363 spin_unlock_irqrestore(&pool->lock, flags); 352 spin_unlock_irqrestore(&pool->lock, flags);
364 gfp_temp = gfp_mask; 353 gfp_temp = gfp_mask;
365 goto repeat_alloc; 354 goto repeat_alloc;
366 } 355 }
367 gfp_temp = gfp_mask;
368 356
369 /* We must not sleep if !__GFP_DIRECT_RECLAIM */ 357 /* We must not sleep if !__GFP_DIRECT_RECLAIM */
370 if (!(gfp_mask & __GFP_DIRECT_RECLAIM)) { 358 if (!(gfp_mask & __GFP_DIRECT_RECLAIM)) {
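The mempool.c revert above restores the simpler gfp handling: __GFP_NOMEMALLOC is added unconditionally (rather than only when free elements exist), the first allocation attempt drops direct reclaim and IO, and only an empty pool triggers a retry with the caller's full mask. Here is a tiny stand-alone C model of that mask juggling; the flag values are arbitrary stand-ins, not the kernel's gfp bits.

/*
 * Stand-alone model of the mempool_alloc() gfp mask handling after the
 * revert.  Not kernel code.
 */
#include <stdio.h>

#define GFP_DIRECT_RECLAIM	0x01u
#define GFP_IO			0x02u
#define GFP_NOMEMALLOC		0x04u
#define GFP_NORETRY		0x08u
#define GFP_NOWARN		0x10u

int main(void)
{
	unsigned int gfp_mask = GFP_DIRECT_RECLAIM | GFP_IO; /* caller's mask */
	unsigned int gfp_temp;

	/* Always applied, independent of how full the pool is. */
	gfp_mask |= GFP_NOMEMALLOC | GFP_NORETRY | GFP_NOWARN;

	/* First attempt: no direct reclaim, no IO. */
	gfp_temp = gfp_mask & ~(GFP_DIRECT_RECLAIM | GFP_IO);
	printf("first attempt mask: 0x%02x\n", gfp_temp);

	/* If that failed and the pool was empty, retry with the full mask. */
	if (gfp_temp != gfp_mask) {
		gfp_temp = gfp_mask;
		printf("retry attempt mask: 0x%02x\n", gfp_temp);
	}
	return 0;
}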
diff --git a/mm/migrate.c b/mm/migrate.c
index 2232f6923cc7..f7ee04a5ae27 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -168,7 +168,7 @@ void putback_movable_pages(struct list_head *l)
168 continue; 168 continue;
169 } 169 }
170 list_del(&page->lru); 170 list_del(&page->lru);
171 dec_zone_page_state(page, NR_ISOLATED_ANON + 171 dec_node_page_state(page, NR_ISOLATED_ANON +
172 page_is_file_cache(page)); 172 page_is_file_cache(page));
173 /* 173 /*
174 * We isolated non-lru movable page so here we can use 174 * We isolated non-lru movable page so here we can use
@@ -501,19 +501,21 @@ int migrate_page_move_mapping(struct address_space *mapping,
501 * new page and drop references to the old page. 501 * new page and drop references to the old page.
502 * 502 *
503 * Note that anonymous pages are accounted for 503 * Note that anonymous pages are accounted for
504 * via NR_FILE_PAGES and NR_ANON_PAGES if they 504 * via NR_FILE_PAGES and NR_ANON_MAPPED if they
505 * are mapped to swap space. 505 * are mapped to swap space.
506 */ 506 */
507 if (newzone != oldzone) { 507 if (newzone != oldzone) {
508 __dec_zone_state(oldzone, NR_FILE_PAGES); 508 __dec_node_state(oldzone->zone_pgdat, NR_FILE_PAGES);
509 __inc_zone_state(newzone, NR_FILE_PAGES); 509 __inc_node_state(newzone->zone_pgdat, NR_FILE_PAGES);
510 if (PageSwapBacked(page) && !PageSwapCache(page)) { 510 if (PageSwapBacked(page) && !PageSwapCache(page)) {
511 __dec_zone_state(oldzone, NR_SHMEM); 511 __dec_node_state(oldzone->zone_pgdat, NR_SHMEM);
512 __inc_zone_state(newzone, NR_SHMEM); 512 __inc_node_state(newzone->zone_pgdat, NR_SHMEM);
513 } 513 }
514 if (dirty && mapping_cap_account_dirty(mapping)) { 514 if (dirty && mapping_cap_account_dirty(mapping)) {
515 __dec_zone_state(oldzone, NR_FILE_DIRTY); 515 __dec_node_state(oldzone->zone_pgdat, NR_FILE_DIRTY);
516 __inc_zone_state(newzone, NR_FILE_DIRTY); 516 __dec_zone_state(oldzone, NR_ZONE_WRITE_PENDING);
517 __inc_node_state(newzone->zone_pgdat, NR_FILE_DIRTY);
518 __inc_zone_state(newzone, NR_ZONE_WRITE_PENDING);
517 } 519 }
518 } 520 }
519 local_irq_enable(); 521 local_irq_enable();
@@ -1119,7 +1121,7 @@ out:
1119 * restored. 1121 * restored.
1120 */ 1122 */
1121 list_del(&page->lru); 1123 list_del(&page->lru);
1122 dec_zone_page_state(page, NR_ISOLATED_ANON + 1124 dec_node_page_state(page, NR_ISOLATED_ANON +
1123 page_is_file_cache(page)); 1125 page_is_file_cache(page));
1124 } 1126 }
1125 1127
@@ -1460,7 +1462,7 @@ static int do_move_page_to_node_array(struct mm_struct *mm,
1460 err = isolate_lru_page(page); 1462 err = isolate_lru_page(page);
1461 if (!err) { 1463 if (!err) {
1462 list_add_tail(&page->lru, &pagelist); 1464 list_add_tail(&page->lru, &pagelist);
1463 inc_zone_page_state(page, NR_ISOLATED_ANON + 1465 inc_node_page_state(page, NR_ISOLATED_ANON +
1464 page_is_file_cache(page)); 1466 page_is_file_cache(page));
1465 } 1467 }
1466put_and_set: 1468put_and_set:
@@ -1726,15 +1728,16 @@ static bool migrate_balanced_pgdat(struct pglist_data *pgdat,
1726 unsigned long nr_migrate_pages) 1728 unsigned long nr_migrate_pages)
1727{ 1729{
1728 int z; 1730 int z;
1731
1732 if (!pgdat_reclaimable(pgdat))
1733 return false;
1734
1729 for (z = pgdat->nr_zones - 1; z >= 0; z--) { 1735 for (z = pgdat->nr_zones - 1; z >= 0; z--) {
1730 struct zone *zone = pgdat->node_zones + z; 1736 struct zone *zone = pgdat->node_zones + z;
1731 1737
1732 if (!populated_zone(zone)) 1738 if (!populated_zone(zone))
1733 continue; 1739 continue;
1734 1740
1735 if (!zone_reclaimable(zone))
1736 continue;
1737
1738 /* Avoid waking kswapd by allocating pages_to_migrate pages. */ 1741 /* Avoid waking kswapd by allocating pages_to_migrate pages. */
1739 if (!zone_watermark_ok(zone, 0, 1742 if (!zone_watermark_ok(zone, 0,
1740 high_wmark_pages(zone) + 1743 high_wmark_pages(zone) +
@@ -1828,7 +1831,7 @@ static int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page)
1828 } 1831 }
1829 1832
1830 page_lru = page_is_file_cache(page); 1833 page_lru = page_is_file_cache(page);
1831 mod_zone_page_state(page_zone(page), NR_ISOLATED_ANON + page_lru, 1834 mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON + page_lru,
1832 hpage_nr_pages(page)); 1835 hpage_nr_pages(page));
1833 1836
1834 /* 1837 /*
@@ -1886,7 +1889,7 @@ int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma,
1886 if (nr_remaining) { 1889 if (nr_remaining) {
1887 if (!list_empty(&migratepages)) { 1890 if (!list_empty(&migratepages)) {
1888 list_del(&page->lru); 1891 list_del(&page->lru);
1889 dec_zone_page_state(page, NR_ISOLATED_ANON + 1892 dec_node_page_state(page, NR_ISOLATED_ANON +
1890 page_is_file_cache(page)); 1893 page_is_file_cache(page));
1891 putback_lru_page(page); 1894 putback_lru_page(page);
1892 } 1895 }
@@ -1931,7 +1934,7 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
1931 goto out_dropref; 1934 goto out_dropref;
1932 1935
1933 new_page = alloc_pages_node(node, 1936 new_page = alloc_pages_node(node,
1934 (GFP_TRANSHUGE | __GFP_THISNODE) & ~__GFP_RECLAIM, 1937 (GFP_TRANSHUGE_LIGHT | __GFP_THISNODE),
1935 HPAGE_PMD_ORDER); 1938 HPAGE_PMD_ORDER);
1936 if (!new_page) 1939 if (!new_page)
1937 goto out_fail; 1940 goto out_fail;
@@ -1979,7 +1982,7 @@ fail_putback:
1979 /* Retake the callers reference and putback on LRU */ 1982 /* Retake the callers reference and putback on LRU */
1980 get_page(page); 1983 get_page(page);
1981 putback_lru_page(page); 1984 putback_lru_page(page);
1982 mod_zone_page_state(page_zone(page), 1985 mod_node_page_state(page_pgdat(page),
1983 NR_ISOLATED_ANON + page_lru, -HPAGE_PMD_NR); 1986 NR_ISOLATED_ANON + page_lru, -HPAGE_PMD_NR);
1984 1987
1985 goto out_unlock; 1988 goto out_unlock;
@@ -2030,7 +2033,7 @@ fail_putback:
2030 count_vm_events(PGMIGRATE_SUCCESS, HPAGE_PMD_NR); 2033 count_vm_events(PGMIGRATE_SUCCESS, HPAGE_PMD_NR);
2031 count_vm_numa_events(NUMA_PAGE_MIGRATE, HPAGE_PMD_NR); 2034 count_vm_numa_events(NUMA_PAGE_MIGRATE, HPAGE_PMD_NR);
2032 2035
2033 mod_zone_page_state(page_zone(page), 2036 mod_node_page_state(page_pgdat(page),
2034 NR_ISOLATED_ANON + page_lru, 2037 NR_ISOLATED_ANON + page_lru,
2035 -HPAGE_PMD_NR); 2038 -HPAGE_PMD_NR);
2036 return isolated; 2039 return isolated;
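The migrate.c hunks above switch the NR_ISOLATED_* and related counters from per-zone to per-node accounting, so updates reach the counter via page -> zone -> owning node. The following stand-alone C sketch models that indirection; the types and helper are invented for illustration and are not kernel structures.

/*
 * Stand-alone model of per-node accounting reached through the zone.
 */
#include <stdio.h>

struct node_model {
	long nr_isolated;
};

struct zone_model {
	struct node_model *node;	/* plays the role of zone->zone_pgdat */
};

struct page_model {
	struct zone_model *zone;
};

static void mod_node_isolated(struct page_model *page, long delta)
{
	page->zone->node->nr_isolated += delta;
}

int main(void)
{
	struct node_model node = { 0 };
	struct zone_model dma = { &node }, normal = { &node };
	struct page_model a = { &dma }, b = { &normal };

	/* Pages from different zones of one node hit the same counter. */
	mod_node_isolated(&a, 1);
	mod_node_isolated(&b, 1);
	printf("node nr_isolated = %ld\n", node.nr_isolated);
	return 0;
}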
diff --git a/mm/mlock.c b/mm/mlock.c
index ef8dc9f395c4..14645be06e30 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -103,7 +103,7 @@ static bool __munlock_isolate_lru_page(struct page *page, bool getpage)
103 if (PageLRU(page)) { 103 if (PageLRU(page)) {
104 struct lruvec *lruvec; 104 struct lruvec *lruvec;
105 105
106 lruvec = mem_cgroup_page_lruvec(page, page_zone(page)); 106 lruvec = mem_cgroup_page_lruvec(page, page_pgdat(page));
107 if (getpage) 107 if (getpage)
108 get_page(page); 108 get_page(page);
109 ClearPageLRU(page); 109 ClearPageLRU(page);
@@ -188,7 +188,7 @@ unsigned int munlock_vma_page(struct page *page)
188 * might otherwise copy PageMlocked to part of the tail pages before 188 * might otherwise copy PageMlocked to part of the tail pages before
189 * we clear it in the head page. It also stabilizes hpage_nr_pages(). 189 * we clear it in the head page. It also stabilizes hpage_nr_pages().
190 */ 190 */
191 spin_lock_irq(&zone->lru_lock); 191 spin_lock_irq(zone_lru_lock(zone));
192 192
193 nr_pages = hpage_nr_pages(page); 193 nr_pages = hpage_nr_pages(page);
194 if (!TestClearPageMlocked(page)) 194 if (!TestClearPageMlocked(page))
@@ -197,14 +197,14 @@ unsigned int munlock_vma_page(struct page *page)
197 __mod_zone_page_state(zone, NR_MLOCK, -nr_pages); 197 __mod_zone_page_state(zone, NR_MLOCK, -nr_pages);
198 198
199 if (__munlock_isolate_lru_page(page, true)) { 199 if (__munlock_isolate_lru_page(page, true)) {
200 spin_unlock_irq(&zone->lru_lock); 200 spin_unlock_irq(zone_lru_lock(zone));
201 __munlock_isolated_page(page); 201 __munlock_isolated_page(page);
202 goto out; 202 goto out;
203 } 203 }
204 __munlock_isolation_failed(page); 204 __munlock_isolation_failed(page);
205 205
206unlock_out: 206unlock_out:
207 spin_unlock_irq(&zone->lru_lock); 207 spin_unlock_irq(zone_lru_lock(zone));
208 208
209out: 209out:
210 return nr_pages - 1; 210 return nr_pages - 1;
@@ -289,7 +289,7 @@ static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone)
289 pagevec_init(&pvec_putback, 0); 289 pagevec_init(&pvec_putback, 0);
290 290
291 /* Phase 1: page isolation */ 291 /* Phase 1: page isolation */
292 spin_lock_irq(&zone->lru_lock); 292 spin_lock_irq(zone_lru_lock(zone));
293 for (i = 0; i < nr; i++) { 293 for (i = 0; i < nr; i++) {
294 struct page *page = pvec->pages[i]; 294 struct page *page = pvec->pages[i];
295 295
@@ -315,7 +315,7 @@ static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone)
315 } 315 }
316 delta_munlocked = -nr + pagevec_count(&pvec_putback); 316 delta_munlocked = -nr + pagevec_count(&pvec_putback);
317 __mod_zone_page_state(zone, NR_MLOCK, delta_munlocked); 317 __mod_zone_page_state(zone, NR_MLOCK, delta_munlocked);
318 spin_unlock_irq(&zone->lru_lock); 318 spin_unlock_irq(zone_lru_lock(zone));
319 319
320 /* Now we can release pins of pages that we are not munlocking */ 320 /* Now we can release pins of pages that we are not munlocking */
321 pagevec_release(&pvec_putback); 321 pagevec_release(&pvec_putback);
diff --git a/mm/mmap.c b/mm/mmap.c
index 86b18f334f4f..d44bee96a5fe 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -621,7 +621,6 @@ int vma_adjust(struct vm_area_struct *vma, unsigned long start,
621{ 621{
622 struct mm_struct *mm = vma->vm_mm; 622 struct mm_struct *mm = vma->vm_mm;
623 struct vm_area_struct *next = vma->vm_next; 623 struct vm_area_struct *next = vma->vm_next;
624 struct vm_area_struct *importer = NULL;
625 struct address_space *mapping = NULL; 624 struct address_space *mapping = NULL;
626 struct rb_root *root = NULL; 625 struct rb_root *root = NULL;
627 struct anon_vma *anon_vma = NULL; 626 struct anon_vma *anon_vma = NULL;
@@ -631,17 +630,25 @@ int vma_adjust(struct vm_area_struct *vma, unsigned long start,
631 int remove_next = 0; 630 int remove_next = 0;
632 631
633 if (next && !insert) { 632 if (next && !insert) {
634 struct vm_area_struct *exporter = NULL; 633 struct vm_area_struct *exporter = NULL, *importer = NULL;
635 634
636 if (end >= next->vm_end) { 635 if (end >= next->vm_end) {
637 /* 636 /*
638 * vma expands, overlapping all the next, and 637 * vma expands, overlapping all the next, and
639 * perhaps the one after too (mprotect case 6). 638 * perhaps the one after too (mprotect case 6).
640 */ 639 */
641again: remove_next = 1 + (end > next->vm_end); 640 remove_next = 1 + (end > next->vm_end);
642 end = next->vm_end; 641 end = next->vm_end;
643 exporter = next; 642 exporter = next;
644 importer = vma; 643 importer = vma;
644
645 /*
646 * If next doesn't have anon_vma, import from vma after
647 * next, if the vma overlaps with it.
648 */
649 if (remove_next == 2 && next && !next->anon_vma)
650 exporter = next->vm_next;
651
645 } else if (end > next->vm_start) { 652 } else if (end > next->vm_start) {
646 /* 653 /*
647 * vma expands, overlapping part of the next: 654 * vma expands, overlapping part of the next:
@@ -675,7 +682,7 @@ again: remove_next = 1 + (end > next->vm_end);
675 return error; 682 return error;
676 } 683 }
677 } 684 }
678 685again:
679 vma_adjust_trans_huge(vma, start, end, adjust_next); 686 vma_adjust_trans_huge(vma, start, end, adjust_next);
680 687
681 if (file) { 688 if (file) {
@@ -796,8 +803,11 @@ again: remove_next = 1 + (end > next->vm_end);
796 * up the code too much to do both in one go. 803 * up the code too much to do both in one go.
797 */ 804 */
798 next = vma->vm_next; 805 next = vma->vm_next;
799 if (remove_next == 2) 806 if (remove_next == 2) {
807 remove_next = 1;
808 end = next->vm_end;
800 goto again; 809 goto again;
810 }
801 else if (next) 811 else if (next)
802 vma_gap_update(next); 812 vma_gap_update(next);
803 else 813 else
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index d4a929d79470..7d0a275df822 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -176,11 +176,13 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
176 176
177 /* 177 /*
178 * Do not even consider tasks which are explicitly marked oom 178 * Do not even consider tasks which are explicitly marked oom
179 * unkillable or have been already oom reaped. 179 * unkillable or have been already oom reaped or they are in
180 * the middle of vfork
180 */ 181 */
181 adj = (long)p->signal->oom_score_adj; 182 adj = (long)p->signal->oom_score_adj;
182 if (adj == OOM_SCORE_ADJ_MIN || 183 if (adj == OOM_SCORE_ADJ_MIN ||
183 test_bit(MMF_OOM_REAPED, &p->mm->flags)) { 184 test_bit(MMF_OOM_REAPED, &p->mm->flags) ||
185 in_vfork(p)) {
184 task_unlock(p); 186 task_unlock(p);
185 return 0; 187 return 0;
186 } 188 }
@@ -281,10 +283,22 @@ enum oom_scan_t oom_scan_process_thread(struct oom_control *oc,
281 283
282 /* 284 /*
283 * This task already has access to memory reserves and is being killed. 285 * This task already has access to memory reserves and is being killed.
284 * Don't allow any other task to have access to the reserves. 286 * Don't allow any other task to have access to the reserves unless
287 * the task has MMF_OOM_REAPED because chances that it would release
288 * any memory is quite low.
285 */ 289 */
286 if (!is_sysrq_oom(oc) && atomic_read(&task->signal->oom_victims)) 290 if (!is_sysrq_oom(oc) && atomic_read(&task->signal->oom_victims)) {
287 return OOM_SCAN_ABORT; 291 struct task_struct *p = find_lock_task_mm(task);
292 enum oom_scan_t ret = OOM_SCAN_ABORT;
293
294 if (p) {
295 if (test_bit(MMF_OOM_REAPED, &p->mm->flags))
296 ret = OOM_SCAN_CONTINUE;
297 task_unlock(p);
298 }
299
300 return ret;
301 }
288 302
289 /* 303 /*
290 * If task is allocating a lot of memory and has been marked to be 304 * If task is allocating a lot of memory and has been marked to be
@@ -415,7 +429,7 @@ bool oom_killer_disabled __read_mostly;
415 * task's threads: if one of those is using this mm then this task was also 429 * task's threads: if one of those is using this mm then this task was also
416 * using it. 430 * using it.
417 */ 431 */
418static bool process_shares_mm(struct task_struct *p, struct mm_struct *mm) 432bool process_shares_mm(struct task_struct *p, struct mm_struct *mm)
419{ 433{
420 struct task_struct *t; 434 struct task_struct *t;
421 435
@@ -554,8 +568,27 @@ static void oom_reap_task(struct task_struct *tsk)
554 schedule_timeout_idle(HZ/10); 568 schedule_timeout_idle(HZ/10);
555 569
556 if (attempts > MAX_OOM_REAP_RETRIES) { 570 if (attempts > MAX_OOM_REAP_RETRIES) {
571 struct task_struct *p;
572
557 pr_info("oom_reaper: unable to reap pid:%d (%s)\n", 573 pr_info("oom_reaper: unable to reap pid:%d (%s)\n",
558 task_pid_nr(tsk), tsk->comm); 574 task_pid_nr(tsk), tsk->comm);
575
576 /*
577 * If we've already tried to reap this task in the past and
578 * failed it probably doesn't make much sense to try yet again
579 * so hide the mm from the oom killer so that it can move on
580 * to another task with a different mm struct.
581 */
582 p = find_lock_task_mm(tsk);
583 if (p) {
584 if (test_and_set_bit(MMF_OOM_NOT_REAPABLE, &p->mm->flags)) {
585 pr_info("oom_reaper: giving up pid:%d (%s)\n",
586 task_pid_nr(tsk), tsk->comm);
587 set_bit(MMF_OOM_REAPED, &p->mm->flags);
588 }
589 task_unlock(p);
590 }
591
559 debug_show_all_locks(); 592 debug_show_all_locks();
560 } 593 }
561 594
@@ -594,7 +627,7 @@ static int oom_reaper(void *unused)
594 return 0; 627 return 0;
595} 628}
596 629
597static void wake_oom_reaper(struct task_struct *tsk) 630void wake_oom_reaper(struct task_struct *tsk)
598{ 631{
599 if (!oom_reaper_th) 632 if (!oom_reaper_th)
600 return; 633 return;
@@ -612,46 +645,6 @@ static void wake_oom_reaper(struct task_struct *tsk)
612 wake_up(&oom_reaper_wait); 645 wake_up(&oom_reaper_wait);
613} 646}
614 647
615/* Check if we can reap the given task. This has to be called with stable
616 * tsk->mm
617 */
618void try_oom_reaper(struct task_struct *tsk)
619{
620 struct mm_struct *mm = tsk->mm;
621 struct task_struct *p;
622
623 if (!mm)
624 return;
625
626 /*
627 * There might be other threads/processes which are either not
628 * dying or even not killable.
629 */
630 if (atomic_read(&mm->mm_users) > 1) {
631 rcu_read_lock();
632 for_each_process(p) {
633 if (!process_shares_mm(p, mm))
634 continue;
635 if (fatal_signal_pending(p))
636 continue;
637
638 /*
639 * If the task is exiting make sure the whole thread group
640 * is exiting and cannot acces mm anymore.
641 */
642 if (signal_group_exit(p->signal))
643 continue;
644
645 /* Give up */
646 rcu_read_unlock();
647 return;
648 }
649 rcu_read_unlock();
650 }
651
652 wake_oom_reaper(tsk);
653}
654
655static int __init oom_init(void) 648static int __init oom_init(void)
656{ 649{
657 oom_reaper_th = kthread_run(oom_reaper, NULL, "oom_reaper"); 650 oom_reaper_th = kthread_run(oom_reaper, NULL, "oom_reaper");
@@ -663,10 +656,6 @@ static int __init oom_init(void)
663 return 0; 656 return 0;
664} 657}
665subsys_initcall(oom_init) 658subsys_initcall(oom_init)
666#else
667static void wake_oom_reaper(struct task_struct *tsk)
668{
669}
670#endif 659#endif
671 660
672/** 661/**
@@ -743,6 +732,80 @@ void oom_killer_enable(void)
743 oom_killer_disabled = false; 732 oom_killer_disabled = false;
744} 733}
745 734
735static inline bool __task_will_free_mem(struct task_struct *task)
736{
737 struct signal_struct *sig = task->signal;
738
739 /*
740 * A coredumping process may sleep for an extended period in exit_mm(),
741 * so the oom killer cannot assume that the process will promptly exit
742 * and release memory.
743 */
744 if (sig->flags & SIGNAL_GROUP_COREDUMP)
745 return false;
746
747 if (sig->flags & SIGNAL_GROUP_EXIT)
748 return true;
749
750 if (thread_group_empty(task) && (task->flags & PF_EXITING))
751 return true;
752
753 return false;
754}
755
756/*
757 * Checks whether the given task is dying or exiting and likely to
758 * release its address space. This means that all threads and processes
759 * sharing the same mm have to be killed or exiting.
760 * Caller has to make sure that task->mm is stable (hold task_lock or
761 * it operates on the current).
762 */
763bool task_will_free_mem(struct task_struct *task)
764{
765 struct mm_struct *mm = task->mm;
766 struct task_struct *p;
767 bool ret;
768
769 /*
770 * Skip tasks without mm because it might have passed its exit_mm and
771 * exit_oom_victim. oom_reaper could have rescued that but do not rely
772 * on that for now. We can consider find_lock_task_mm in future.
773 */
774 if (!mm)
775 return false;
776
777 if (!__task_will_free_mem(task))
778 return false;
779
780 /*
781 * This task has already been drained by the oom reaper so there are
782 * only small chances it will free some more
783 */
784 if (test_bit(MMF_OOM_REAPED, &mm->flags))
785 return false;
786
787 if (atomic_read(&mm->mm_users) <= 1)
788 return true;
789
790 /*
791 * This is really pessimistic but we do not have any reliable way
792 * to check that external processes share with our mm
793 */
794 rcu_read_lock();
795 for_each_process(p) {
796 if (!process_shares_mm(p, mm))
797 continue;
798 if (same_thread_group(task, p))
799 continue;
800 ret = __task_will_free_mem(p);
801 if (!ret)
802 break;
803 }
804 rcu_read_unlock();
805
806 return ret;
807}
808
746/* 809/*
747 * Must be called while holding a reference to p, which will be released upon 810 * Must be called while holding a reference to p, which will be released upon
748 * returning. 811 * returning.
@@ -765,9 +828,9 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p,
765 * its children or threads, just set TIF_MEMDIE so it can die quickly 828 * its children or threads, just set TIF_MEMDIE so it can die quickly
766 */ 829 */
767 task_lock(p); 830 task_lock(p);
768 if (p->mm && task_will_free_mem(p)) { 831 if (task_will_free_mem(p)) {
769 mark_oom_victim(p); 832 mark_oom_victim(p);
770 try_oom_reaper(p); 833 wake_oom_reaper(p);
771 task_unlock(p); 834 task_unlock(p);
772 put_task_struct(p); 835 put_task_struct(p);
773 return; 836 return;
@@ -850,14 +913,18 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p,
850 continue; 913 continue;
851 if (same_thread_group(p, victim)) 914 if (same_thread_group(p, victim))
852 continue; 915 continue;
853 if (unlikely(p->flags & PF_KTHREAD) || is_global_init(p) || 916 if (unlikely(p->flags & PF_KTHREAD) || is_global_init(p)) {
854 p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) {
855 /* 917 /*
856 * We cannot use oom_reaper for the mm shared by this 918 * We cannot use oom_reaper for the mm shared by this
857 * process because it wouldn't get killed and so the 919 * process because it wouldn't get killed and so the
858 * memory might be still used. 920 * memory might be still used. Hide the mm from the oom
921 * killer to guarantee OOM forward progress.
859 */ 922 */
860 can_oom_reap = false; 923 can_oom_reap = false;
924 set_bit(MMF_OOM_REAPED, &mm->flags);
925 pr_info("oom killer %d (%s) has mm pinned by %d (%s)\n",
926 task_pid_nr(victim), victim->comm,
927 task_pid_nr(p), p->comm);
861 continue; 928 continue;
862 } 929 }
863 do_send_sig_info(SIGKILL, SEND_SIG_FORCED, p, true); 930 do_send_sig_info(SIGKILL, SEND_SIG_FORCED, p, true);
@@ -939,14 +1006,10 @@ bool out_of_memory(struct oom_control *oc)
939 * If current has a pending SIGKILL or is exiting, then automatically 1006 * If current has a pending SIGKILL or is exiting, then automatically
940 * select it. The goal is to allow it to allocate so that it may 1007 * select it. The goal is to allow it to allocate so that it may
941 * quickly exit and free its memory. 1008 * quickly exit and free its memory.
942 *
943 * But don't select if current has already released its mm and cleared
944 * TIF_MEMDIE flag at exit_mm(), otherwise an OOM livelock may occur.
945 */ 1009 */
946 if (current->mm && 1010 if (task_will_free_mem(current)) {
947 (fatal_signal_pending(current) || task_will_free_mem(current))) {
948 mark_oom_victim(current); 1011 mark_oom_victim(current);
949 try_oom_reaper(current); 1012 wake_oom_reaper(current);
950 return true; 1013 return true;
951 } 1014 }
952 1015
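The oom_kill.c changes above replace the old "p->mm && (fatal_signal_pending || ...)" shortcut with the new task_will_free_mem(), whose key rule is that a victim only counts as about to free its memory if every other process sharing the same mm is also exiting. The following stand-alone C sketch models just that rule with toy structures; it is not the kernel implementation and the names are invented.

/*
 * Stand-alone model of "all sharers of the mm must be exiting".
 */
#include <stdbool.h>
#include <stdio.h>

struct proc_model {
	int mm_id;		/* which address space this process uses */
	bool exiting;
};

static bool will_free_mem(const struct proc_model *victim,
			  const struct proc_model *tasks, int nr)
{
	int i;

	if (!victim->exiting)
		return false;

	for (i = 0; i < nr; i++) {
		if (tasks[i].mm_id != victim->mm_id)
			continue;		/* different mm, irrelevant */
		if (!tasks[i].exiting)
			return false;	/* a live sharer keeps the mm in use */
	}
	return true;
}

int main(void)
{
	struct proc_model tasks[] = {
		{ .mm_id = 1, .exiting = true },	/* the victim */
		{ .mm_id = 1, .exiting = false },	/* sharer still running */
		{ .mm_id = 2, .exiting = false },	/* unrelated process */
	};

	printf("will free mem: %s\n",
	       will_free_mem(&tasks[0], tasks, 3) ? "yes" : "no");
	return 0;
}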
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index d578d2a56b19..f4cd7d8005c9 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -267,26 +267,35 @@ static void wb_min_max_ratio(struct bdi_writeback *wb,
267 */ 267 */
268 268
269/** 269/**
270 * zone_dirtyable_memory - number of dirtyable pages in a zone 270 * node_dirtyable_memory - number of dirtyable pages in a node
271 * @zone: the zone 271 * @pgdat: the node
272 * 272 *
273 * Returns the zone's number of pages potentially available for dirty 273 * Returns the node's number of pages potentially available for dirty
274 * page cache. This is the base value for the per-zone dirty limits. 274 * page cache. This is the base value for the per-node dirty limits.
275 */ 275 */
276static unsigned long zone_dirtyable_memory(struct zone *zone) 276static unsigned long node_dirtyable_memory(struct pglist_data *pgdat)
277{ 277{
278 unsigned long nr_pages; 278 unsigned long nr_pages = 0;
279 int z;
280
281 for (z = 0; z < MAX_NR_ZONES; z++) {
282 struct zone *zone = pgdat->node_zones + z;
283
284 if (!populated_zone(zone))
285 continue;
286
287 nr_pages += zone_page_state(zone, NR_FREE_PAGES);
288 }
279 289
280 nr_pages = zone_page_state(zone, NR_FREE_PAGES);
281 /* 290 /*
282 * Pages reserved for the kernel should not be considered 291 * Pages reserved for the kernel should not be considered
283 * dirtyable, to prevent a situation where reclaim has to 292 * dirtyable, to prevent a situation where reclaim has to
284 * clean pages in order to balance the zones. 293 * clean pages in order to balance the zones.
285 */ 294 */
286 nr_pages -= min(nr_pages, zone->totalreserve_pages); 295 nr_pages -= min(nr_pages, pgdat->totalreserve_pages);
287 296
288 nr_pages += zone_page_state(zone, NR_INACTIVE_FILE); 297 nr_pages += node_page_state(pgdat, NR_INACTIVE_FILE);
289 nr_pages += zone_page_state(zone, NR_ACTIVE_FILE); 298 nr_pages += node_page_state(pgdat, NR_ACTIVE_FILE);
290 299
291 return nr_pages; 300 return nr_pages;
292} 301}
@@ -299,13 +308,26 @@ static unsigned long highmem_dirtyable_memory(unsigned long total)
299 int i; 308 int i;
300 309
301 for_each_node_state(node, N_HIGH_MEMORY) { 310 for_each_node_state(node, N_HIGH_MEMORY) {
302 for (i = 0; i < MAX_NR_ZONES; i++) { 311 for (i = ZONE_NORMAL + 1; i < MAX_NR_ZONES; i++) {
303 struct zone *z = &NODE_DATA(node)->node_zones[i]; 312 struct zone *z;
313 unsigned long nr_pages;
314
315 if (!is_highmem_idx(i))
316 continue;
317
318 z = &NODE_DATA(node)->node_zones[i];
319 if (!populated_zone(z))
320 continue;
304 321
305 if (is_highmem(z)) 322 nr_pages = zone_page_state(z, NR_FREE_PAGES);
306 x += zone_dirtyable_memory(z); 323 /* watch for underflows */
324 nr_pages -= min(nr_pages, high_wmark_pages(z));
325 nr_pages += zone_page_state(z, NR_ZONE_INACTIVE_FILE);
326 nr_pages += zone_page_state(z, NR_ZONE_ACTIVE_FILE);
327 x += nr_pages;
307 } 328 }
308 } 329 }
330
309 /* 331 /*
310 * Unreclaimable memory (kernel memory or anonymous memory 332 * Unreclaimable memory (kernel memory or anonymous memory
311 * without swap) can bring down the dirtyable pages below 333 * without swap) can bring down the dirtyable pages below
@@ -348,8 +370,8 @@ static unsigned long global_dirtyable_memory(void)
348 */ 370 */
349 x -= min(x, totalreserve_pages); 371 x -= min(x, totalreserve_pages);
350 372
351 x += global_page_state(NR_INACTIVE_FILE); 373 x += global_node_page_state(NR_INACTIVE_FILE);
352 x += global_page_state(NR_ACTIVE_FILE); 374 x += global_node_page_state(NR_ACTIVE_FILE);
353 375
354 if (!vm_highmem_is_dirtyable) 376 if (!vm_highmem_is_dirtyable)
355 x -= highmem_dirtyable_memory(x); 377 x -= highmem_dirtyable_memory(x);
@@ -445,23 +467,23 @@ void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty)
445} 467}
446 468
447/** 469/**
448 * zone_dirty_limit - maximum number of dirty pages allowed in a zone 470 * node_dirty_limit - maximum number of dirty pages allowed in a node
449 * @zone: the zone 471 * @pgdat: the node
450 * 472 *
451 * Returns the maximum number of dirty pages allowed in a zone, based 473 * Returns the maximum number of dirty pages allowed in a node, based
452 * on the zone's dirtyable memory. 474 * on the node's dirtyable memory.
453 */ 475 */
454static unsigned long zone_dirty_limit(struct zone *zone) 476static unsigned long node_dirty_limit(struct pglist_data *pgdat)
455{ 477{
456 unsigned long zone_memory = zone_dirtyable_memory(zone); 478 unsigned long node_memory = node_dirtyable_memory(pgdat);
457 struct task_struct *tsk = current; 479 struct task_struct *tsk = current;
458 unsigned long dirty; 480 unsigned long dirty;
459 481
460 if (vm_dirty_bytes) 482 if (vm_dirty_bytes)
461 dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE) * 483 dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE) *
462 zone_memory / global_dirtyable_memory(); 484 node_memory / global_dirtyable_memory();
463 else 485 else
464 dirty = vm_dirty_ratio * zone_memory / 100; 486 dirty = vm_dirty_ratio * node_memory / 100;
465 487
466 if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk)) 488 if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk))
467 dirty += dirty / 4; 489 dirty += dirty / 4;
@@ -470,19 +492,22 @@ static unsigned long zone_dirty_limit(struct zone *zone)
470} 492}
471 493
472/** 494/**
473 * zone_dirty_ok - tells whether a zone is within its dirty limits 495 * node_dirty_ok - tells whether a node is within its dirty limits
474 * @zone: the zone to check 496 * @pgdat: the node to check
475 * 497 *
476 * Returns %true when the dirty pages in @zone are within the zone's 498 * Returns %true when the dirty pages in @pgdat are within the node's
477 * dirty limit, %false if the limit is exceeded. 499 * dirty limit, %false if the limit is exceeded.
478 */ 500 */
479bool zone_dirty_ok(struct zone *zone) 501bool node_dirty_ok(struct pglist_data *pgdat)
480{ 502{
481 unsigned long limit = zone_dirty_limit(zone); 503 unsigned long limit = node_dirty_limit(pgdat);
504 unsigned long nr_pages = 0;
505
506 nr_pages += node_page_state(pgdat, NR_FILE_DIRTY);
507 nr_pages += node_page_state(pgdat, NR_UNSTABLE_NFS);
508 nr_pages += node_page_state(pgdat, NR_WRITEBACK);
482 509
483 return zone_page_state(zone, NR_FILE_DIRTY) + 510 return nr_pages <= limit;
484 zone_page_state(zone, NR_UNSTABLE_NFS) +
485 zone_page_state(zone, NR_WRITEBACK) <= limit;
486} 511}
487 512
488int dirty_background_ratio_handler(struct ctl_table *table, int write, 513int dirty_background_ratio_handler(struct ctl_table *table, int write,
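node_dirty_ok() now sums three node counters against the limit instead of three zone counters. As a sketch (struct and values are illustrative only, not kernel identifiers):

#include <stdbool.h>
#include <stdio.h>

/* Minimal stand-in for the three node counters the check sums up. */
struct toy_node_dirty_state {
    unsigned long nr_file_dirty;
    unsigned long nr_unstable_nfs;
    unsigned long nr_writeback;
};

/* Same shape as node_dirty_ok(): total pending I/O pages vs. the limit. */
static bool toy_node_dirty_ok(const struct toy_node_dirty_state *s,
                              unsigned long limit)
{
    unsigned long nr_pages = s->nr_file_dirty +
                             s->nr_unstable_nfs +
                             s->nr_writeback;

    return nr_pages <= limit;
}

int main(void)
{
    struct toy_node_dirty_state s = { 4000, 100, 900 };

    printf("within limit: %s\n",
           toy_node_dirty_ok(&s, 6000) ? "yes" : "no");
    return 0;
}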
@@ -1570,10 +1595,10 @@ static void balance_dirty_pages(struct address_space *mapping,
1570 * written to the server's write cache, but has not yet 1595 * written to the server's write cache, but has not yet
1571 * been flushed to permanent storage. 1596 * been flushed to permanent storage.
1572 */ 1597 */
1573 nr_reclaimable = global_page_state(NR_FILE_DIRTY) + 1598 nr_reclaimable = global_node_page_state(NR_FILE_DIRTY) +
1574 global_page_state(NR_UNSTABLE_NFS); 1599 global_node_page_state(NR_UNSTABLE_NFS);
1575 gdtc->avail = global_dirtyable_memory(); 1600 gdtc->avail = global_dirtyable_memory();
1576 gdtc->dirty = nr_reclaimable + global_page_state(NR_WRITEBACK); 1601 gdtc->dirty = nr_reclaimable + global_node_page_state(NR_WRITEBACK);
1577 1602
1578 domain_dirty_limits(gdtc); 1603 domain_dirty_limits(gdtc);
1579 1604
@@ -1910,8 +1935,8 @@ bool wb_over_bg_thresh(struct bdi_writeback *wb)
1910 * as we're trying to decide whether to put more under writeback. 1935 * as we're trying to decide whether to put more under writeback.
1911 */ 1936 */
1912 gdtc->avail = global_dirtyable_memory(); 1937 gdtc->avail = global_dirtyable_memory();
1913 gdtc->dirty = global_page_state(NR_FILE_DIRTY) + 1938 gdtc->dirty = global_node_page_state(NR_FILE_DIRTY) +
1914 global_page_state(NR_UNSTABLE_NFS); 1939 global_node_page_state(NR_UNSTABLE_NFS);
1915 domain_dirty_limits(gdtc); 1940 domain_dirty_limits(gdtc);
1916 1941
1917 if (gdtc->dirty > gdtc->bg_thresh) 1942 if (gdtc->dirty > gdtc->bg_thresh)
@@ -1955,8 +1980,8 @@ void throttle_vm_writeout(gfp_t gfp_mask)
1955 */ 1980 */
1956 dirty_thresh += dirty_thresh / 10; /* wheeee... */ 1981 dirty_thresh += dirty_thresh / 10; /* wheeee... */
1957 1982
1958 if (global_page_state(NR_UNSTABLE_NFS) + 1983 if (global_node_page_state(NR_UNSTABLE_NFS) +
1959 global_page_state(NR_WRITEBACK) <= dirty_thresh) 1984 global_node_page_state(NR_WRITEBACK) <= dirty_thresh)
1960 break; 1985 break;
1961 congestion_wait(BLK_RW_ASYNC, HZ/10); 1986 congestion_wait(BLK_RW_ASYNC, HZ/10);
1962 1987
@@ -1984,8 +2009,8 @@ int dirty_writeback_centisecs_handler(struct ctl_table *table, int write,
1984void laptop_mode_timer_fn(unsigned long data) 2009void laptop_mode_timer_fn(unsigned long data)
1985{ 2010{
1986 struct request_queue *q = (struct request_queue *)data; 2011 struct request_queue *q = (struct request_queue *)data;
1987 int nr_pages = global_page_state(NR_FILE_DIRTY) + 2012 int nr_pages = global_node_page_state(NR_FILE_DIRTY) +
1988 global_page_state(NR_UNSTABLE_NFS); 2013 global_node_page_state(NR_UNSTABLE_NFS);
1989 struct bdi_writeback *wb; 2014 struct bdi_writeback *wb;
1990 2015
1991 /* 2016 /*
@@ -2436,8 +2461,9 @@ void account_page_dirtied(struct page *page, struct address_space *mapping)
2436 wb = inode_to_wb(inode); 2461 wb = inode_to_wb(inode);
2437 2462
2438 mem_cgroup_inc_page_stat(page, MEM_CGROUP_STAT_DIRTY); 2463 mem_cgroup_inc_page_stat(page, MEM_CGROUP_STAT_DIRTY);
2439 __inc_zone_page_state(page, NR_FILE_DIRTY); 2464 __inc_node_page_state(page, NR_FILE_DIRTY);
2440 __inc_zone_page_state(page, NR_DIRTIED); 2465 __inc_zone_page_state(page, NR_ZONE_WRITE_PENDING);
2466 __inc_node_page_state(page, NR_DIRTIED);
2441 __inc_wb_stat(wb, WB_RECLAIMABLE); 2467 __inc_wb_stat(wb, WB_RECLAIMABLE);
2442 __inc_wb_stat(wb, WB_DIRTIED); 2468 __inc_wb_stat(wb, WB_DIRTIED);
2443 task_io_account_write(PAGE_SIZE); 2469 task_io_account_write(PAGE_SIZE);
@@ -2457,7 +2483,8 @@ void account_page_cleaned(struct page *page, struct address_space *mapping,
2457{ 2483{
2458 if (mapping_cap_account_dirty(mapping)) { 2484 if (mapping_cap_account_dirty(mapping)) {
2459 mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_DIRTY); 2485 mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_DIRTY);
2460 dec_zone_page_state(page, NR_FILE_DIRTY); 2486 dec_node_page_state(page, NR_FILE_DIRTY);
2487 dec_zone_page_state(page, NR_ZONE_WRITE_PENDING);
2461 dec_wb_stat(wb, WB_RECLAIMABLE); 2488 dec_wb_stat(wb, WB_RECLAIMABLE);
2462 task_io_account_cancelled_write(PAGE_SIZE); 2489 task_io_account_cancelled_write(PAGE_SIZE);
2463 } 2490 }
@@ -2525,7 +2552,7 @@ void account_page_redirty(struct page *page)
2525 2552
2526 wb = unlocked_inode_to_wb_begin(inode, &locked); 2553 wb = unlocked_inode_to_wb_begin(inode, &locked);
2527 current->nr_dirtied--; 2554 current->nr_dirtied--;
2528 dec_zone_page_state(page, NR_DIRTIED); 2555 dec_node_page_state(page, NR_DIRTIED);
2529 dec_wb_stat(wb, WB_DIRTIED); 2556 dec_wb_stat(wb, WB_DIRTIED);
2530 unlocked_inode_to_wb_end(inode, locked); 2557 unlocked_inode_to_wb_end(inode, locked);
2531 } 2558 }
@@ -2713,7 +2740,8 @@ int clear_page_dirty_for_io(struct page *page)
2713 wb = unlocked_inode_to_wb_begin(inode, &locked); 2740 wb = unlocked_inode_to_wb_begin(inode, &locked);
2714 if (TestClearPageDirty(page)) { 2741 if (TestClearPageDirty(page)) {
2715 mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_DIRTY); 2742 mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_DIRTY);
2716 dec_zone_page_state(page, NR_FILE_DIRTY); 2743 dec_node_page_state(page, NR_FILE_DIRTY);
2744 dec_zone_page_state(page, NR_ZONE_WRITE_PENDING);
2717 dec_wb_stat(wb, WB_RECLAIMABLE); 2745 dec_wb_stat(wb, WB_RECLAIMABLE);
2718 ret = 1; 2746 ret = 1;
2719 } 2747 }
@@ -2759,8 +2787,9 @@ int test_clear_page_writeback(struct page *page)
2759 } 2787 }
2760 if (ret) { 2788 if (ret) {
2761 mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_WRITEBACK); 2789 mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_WRITEBACK);
2762 dec_zone_page_state(page, NR_WRITEBACK); 2790 dec_node_page_state(page, NR_WRITEBACK);
2763 inc_zone_page_state(page, NR_WRITTEN); 2791 dec_zone_page_state(page, NR_ZONE_WRITE_PENDING);
2792 inc_node_page_state(page, NR_WRITTEN);
2764 } 2793 }
2765 unlock_page_memcg(page); 2794 unlock_page_memcg(page);
2766 return ret; 2795 return ret;
@@ -2813,7 +2842,8 @@ int __test_set_page_writeback(struct page *page, bool keep_write)
2813 } 2842 }
2814 if (!ret) { 2843 if (!ret) {
2815 mem_cgroup_inc_page_stat(page, MEM_CGROUP_STAT_WRITEBACK); 2844 mem_cgroup_inc_page_stat(page, MEM_CGROUP_STAT_WRITEBACK);
2816 inc_zone_page_state(page, NR_WRITEBACK); 2845 inc_node_page_state(page, NR_WRITEBACK);
2846 inc_zone_page_state(page, NR_ZONE_WRITE_PENDING);
2817 } 2847 }
2818 unlock_page_memcg(page); 2848 unlock_page_memcg(page);
2819 return ret; 2849 return ret;
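Taken together, the accounting hunks above keep two views in step: the node-level NR_FILE_DIRTY / NR_WRITEBACK / NR_WRITTEN counters and the new zone-level NR_ZONE_WRITE_PENDING counter. Below is a simplified model of those transitions; it deliberately leaves out the memcg and per-writeback (wb) statistics the real functions also update, and the toy_* names are invented.

#include <stdio.h>

/* One node-wide and one zone-wide counter set; illustrative only. */
struct toy_node { unsigned long nr_file_dirty, nr_writeback, nr_written; };
struct toy_zone { unsigned long nr_zone_write_pending; };

/* account_page_dirtied(): node dirty count and zone write-pending rise. */
static void toy_account_dirtied(struct toy_node *n, struct toy_zone *z)
{
    n->nr_file_dirty++;
    z->nr_zone_write_pending++;
}

/* clear_page_dirty_for_io(): both sides drop before the I/O is issued. */
static void toy_clear_dirty_for_io(struct toy_node *n, struct toy_zone *z)
{
    n->nr_file_dirty--;
    z->nr_zone_write_pending--;
}

/* __test_set_page_writeback(): node writeback and zone pending rise. */
static void toy_set_writeback(struct toy_node *n, struct toy_zone *z)
{
    n->nr_writeback++;
    z->nr_zone_write_pending++;
}

/* test_clear_page_writeback(): I/O done, both sides drop again. */
static void toy_end_writeback(struct toy_node *n, struct toy_zone *z)
{
    n->nr_writeback--;
    n->nr_written++;
    z->nr_zone_write_pending--;
}

int main(void)
{
    struct toy_node n = { 0, 0, 0 };
    struct toy_zone z = { 0 };

    toy_account_dirtied(&n, &z);
    toy_clear_dirty_for_io(&n, &z);
    toy_set_writeback(&n, &z);
    toy_end_writeback(&n, &z);
    printf("dirty=%lu writeback=%lu written=%lu pending=%lu\n",
           n.nr_file_dirty, n.nr_writeback, n.nr_written,
           z.nr_zone_write_pending);
    return 0;
}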
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 452513bf02ce..ea759b935360 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -295,14 +295,6 @@ static inline bool __meminit early_page_uninitialised(unsigned long pfn)
295 return false; 295 return false;
296} 296}
297 297
298static inline bool early_page_nid_uninitialised(unsigned long pfn, int nid)
299{
300 if (pfn >= NODE_DATA(nid)->first_deferred_pfn)
301 return true;
302
303 return false;
304}
305
306/* 298/*
307 * Returns false when the remaining initialisation should be deferred until 299 * Returns false when the remaining initialisation should be deferred until
308 * later in the boot cycle when it can be parallelised. 300 * later in the boot cycle when it can be parallelised.
@@ -342,11 +334,6 @@ static inline bool early_page_uninitialised(unsigned long pfn)
342 return false; 334 return false;
343} 335}
344 336
345static inline bool early_page_nid_uninitialised(unsigned long pfn, int nid)
346{
347 return false;
348}
349
350static inline bool update_defer_init(pg_data_t *pgdat, 337static inline bool update_defer_init(pg_data_t *pgdat,
351 unsigned long pfn, unsigned long zone_end, 338 unsigned long pfn, unsigned long zone_end,
352 unsigned long *nr_initialised) 339 unsigned long *nr_initialised)
@@ -1091,9 +1078,9 @@ static void free_pcppages_bulk(struct zone *zone, int count,
1091 1078
1092 spin_lock(&zone->lock); 1079 spin_lock(&zone->lock);
1093 isolated_pageblocks = has_isolate_pageblock(zone); 1080 isolated_pageblocks = has_isolate_pageblock(zone);
1094 nr_scanned = zone_page_state(zone, NR_PAGES_SCANNED); 1081 nr_scanned = node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED);
1095 if (nr_scanned) 1082 if (nr_scanned)
1096 __mod_zone_page_state(zone, NR_PAGES_SCANNED, -nr_scanned); 1083 __mod_node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED, -nr_scanned);
1097 1084
1098 while (count) { 1085 while (count) {
1099 struct page *page; 1086 struct page *page;
@@ -1148,9 +1135,9 @@ static void free_one_page(struct zone *zone,
1148{ 1135{
1149 unsigned long nr_scanned; 1136 unsigned long nr_scanned;
1150 spin_lock(&zone->lock); 1137 spin_lock(&zone->lock);
1151 nr_scanned = zone_page_state(zone, NR_PAGES_SCANNED); 1138 nr_scanned = node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED);
1152 if (nr_scanned) 1139 if (nr_scanned)
1153 __mod_zone_page_state(zone, NR_PAGES_SCANNED, -nr_scanned); 1140 __mod_node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED, -nr_scanned);
1154 1141
1155 if (unlikely(has_isolate_pageblock(zone) || 1142 if (unlikely(has_isolate_pageblock(zone) ||
1156 is_migrate_isolate(migratetype))) { 1143 is_migrate_isolate(migratetype))) {
@@ -2517,7 +2504,10 @@ int __isolate_free_page(struct page *page, unsigned int order)
2517 zone->free_area[order].nr_free--; 2504 zone->free_area[order].nr_free--;
2518 rmv_page_order(page); 2505 rmv_page_order(page);
2519 2506
2520 /* Set the pageblock if the isolated page is at least a pageblock */ 2507 /*
2508 * Set the pageblock if the isolated page is at least half of a
2509 * pageblock
2510 */
2521 if (order >= pageblock_order - 1) { 2511 if (order >= pageblock_order - 1) {
2522 struct page *endpage = page + (1 << order) - 1; 2512 struct page *endpage = page + (1 << order) - 1;
2523 for (; page < endpage; page += pageblock_nr_pages) { 2513 for (; page < endpage; page += pageblock_nr_pages) {
@@ -2597,7 +2587,6 @@ struct page *buffered_rmqueue(struct zone *preferred_zone,
2597 else 2587 else
2598 page = list_first_entry(list, struct page, lru); 2588 page = list_first_entry(list, struct page, lru);
2599 2589
2600 __dec_zone_state(zone, NR_ALLOC_BATCH);
2601 list_del(&page->lru); 2590 list_del(&page->lru);
2602 pcp->count--; 2591 pcp->count--;
2603 2592
@@ -2623,16 +2612,11 @@ struct page *buffered_rmqueue(struct zone *preferred_zone,
2623 spin_unlock(&zone->lock); 2612 spin_unlock(&zone->lock);
2624 if (!page) 2613 if (!page)
2625 goto failed; 2614 goto failed;
2626 __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order));
2627 __mod_zone_freepage_state(zone, -(1 << order), 2615 __mod_zone_freepage_state(zone, -(1 << order),
2628 get_pcppage_migratetype(page)); 2616 get_pcppage_migratetype(page));
2629 } 2617 }
2630 2618
2631 if (atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH]) <= 0 && 2619 __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
2632 !test_bit(ZONE_FAIR_DEPLETED, &zone->flags))
2633 set_bit(ZONE_FAIR_DEPLETED, &zone->flags);
2634
2635 __count_zone_vm_events(PGALLOC, zone, 1 << order);
2636 zone_statistics(preferred_zone, zone, gfp_flags); 2620 zone_statistics(preferred_zone, zone, gfp_flags);
2637 local_irq_restore(flags); 2621 local_irq_restore(flags);
2638 2622
@@ -2842,40 +2826,18 @@ bool zone_watermark_ok_safe(struct zone *z, unsigned int order,
2842} 2826}
2843 2827
2844#ifdef CONFIG_NUMA 2828#ifdef CONFIG_NUMA
2845static bool zone_local(struct zone *local_zone, struct zone *zone)
2846{
2847 return local_zone->node == zone->node;
2848}
2849
2850static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone) 2829static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
2851{ 2830{
2852 return node_distance(zone_to_nid(local_zone), zone_to_nid(zone)) < 2831 return node_distance(zone_to_nid(local_zone), zone_to_nid(zone)) <
2853 RECLAIM_DISTANCE; 2832 RECLAIM_DISTANCE;
2854} 2833}
2855#else /* CONFIG_NUMA */ 2834#else /* CONFIG_NUMA */
2856static bool zone_local(struct zone *local_zone, struct zone *zone)
2857{
2858 return true;
2859}
2860
2861static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone) 2835static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
2862{ 2836{
2863 return true; 2837 return true;
2864} 2838}
2865#endif /* CONFIG_NUMA */ 2839#endif /* CONFIG_NUMA */
2866 2840
2867static void reset_alloc_batches(struct zone *preferred_zone)
2868{
2869 struct zone *zone = preferred_zone->zone_pgdat->node_zones;
2870
2871 do {
2872 mod_zone_page_state(zone, NR_ALLOC_BATCH,
2873 high_wmark_pages(zone) - low_wmark_pages(zone) -
2874 atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH]));
2875 clear_bit(ZONE_FAIR_DEPLETED, &zone->flags);
2876 } while (zone++ != preferred_zone);
2877}
2878
2879/* 2841/*
2880 * get_page_from_freelist goes through the zonelist trying to allocate 2842 * get_page_from_freelist goes through the zonelist trying to allocate
2881 * a page. 2843 * a page.
@@ -2886,10 +2848,8 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
2886{ 2848{
2887 struct zoneref *z = ac->preferred_zoneref; 2849 struct zoneref *z = ac->preferred_zoneref;
2888 struct zone *zone; 2850 struct zone *zone;
2889 bool fair_skipped = false; 2851 struct pglist_data *last_pgdat_dirty_limit = NULL;
2890 bool apply_fair = (alloc_flags & ALLOC_FAIR);
2891 2852
2892zonelist_scan:
2893 /* 2853 /*
2894 * Scan zonelist, looking for a zone with enough free. 2854 * Scan zonelist, looking for a zone with enough free.
2895 * See also __cpuset_node_allowed() comment in kernel/cpuset.c. 2855 * See also __cpuset_node_allowed() comment in kernel/cpuset.c.
@@ -2904,50 +2864,33 @@ zonelist_scan:
2904 !__cpuset_zone_allowed(zone, gfp_mask)) 2864 !__cpuset_zone_allowed(zone, gfp_mask))
2905 continue; 2865 continue;
2906 /* 2866 /*
2907 * Distribute pages in proportion to the individual
2908 * zone size to ensure fair page aging. The zone a
2909 * page was allocated in should have no effect on the
2910 * time the page has in memory before being reclaimed.
2911 */
2912 if (apply_fair) {
2913 if (test_bit(ZONE_FAIR_DEPLETED, &zone->flags)) {
2914 fair_skipped = true;
2915 continue;
2916 }
2917 if (!zone_local(ac->preferred_zoneref->zone, zone)) {
2918 if (fair_skipped)
2919 goto reset_fair;
2920 apply_fair = false;
2921 }
2922 }
2923 /*
2924 * When allocating a page cache page for writing, we 2867 * When allocating a page cache page for writing, we
2925 * want to get it from a zone that is within its dirty 2868 * want to get it from a node that is within its dirty
2926 * limit, such that no single zone holds more than its 2869 * limit, such that no single node holds more than its
2927 * proportional share of globally allowed dirty pages. 2870 * proportional share of globally allowed dirty pages.
2928 * The dirty limits take into account the zone's 2871 * The dirty limits take into account the node's
2929 * lowmem reserves and high watermark so that kswapd 2872 * lowmem reserves and high watermark so that kswapd
2930 * should be able to balance it without having to 2873 * should be able to balance it without having to
2931 * write pages from its LRU list. 2874 * write pages from its LRU list.
2932 * 2875 *
2933 * This may look like it could increase pressure on
2934 * lower zones by failing allocations in higher zones
2935 * before they are full. But the pages that do spill
2936 * over are limited as the lower zones are protected
2937 * by this very same mechanism. It should not become
2938 * a practical burden to them.
2939 *
2940 * XXX: For now, allow allocations to potentially 2876 * XXX: For now, allow allocations to potentially
2941 * exceed the per-zone dirty limit in the slowpath 2877 * exceed the per-node dirty limit in the slowpath
2942 * (spread_dirty_pages unset) before going into reclaim, 2878 * (spread_dirty_pages unset) before going into reclaim,
2943 * which is important when on a NUMA setup the allowed 2879 * which is important when on a NUMA setup the allowed
2944 * zones are together not big enough to reach the 2880 * nodes are together not big enough to reach the
2945 * global limit. The proper fix for these situations 2881 * global limit. The proper fix for these situations
2946 * will require awareness of zones in the 2882 * will require awareness of nodes in the
2947 * dirty-throttling and the flusher threads. 2883 * dirty-throttling and the flusher threads.
2948 */ 2884 */
2949 if (ac->spread_dirty_pages && !zone_dirty_ok(zone)) 2885 if (ac->spread_dirty_pages) {
2950 continue; 2886 if (last_pgdat_dirty_limit == zone->zone_pgdat)
2887 continue;
2888
2889 if (!node_dirty_ok(zone->zone_pgdat)) {
2890 last_pgdat_dirty_limit = zone->zone_pgdat;
2891 continue;
2892 }
2893 }
2951 2894
2952 mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK]; 2895 mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK];
2953 if (!zone_watermark_fast(zone, order, mark, 2896 if (!zone_watermark_fast(zone, order, mark,
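The reworked zonelist walk caches the last node that failed node_dirty_ok() so the remaining zones of that node are skipped without repeating the check. A toy walk showing the caching pattern; the topology and limits are made up.

#include <stdbool.h>
#include <stdio.h>

#define NR_TOY_ZONES 6

/* Each "zone" just records its owning node and whether that node is
 * currently over its dirty limit (fabricated data). */
struct toy_zone {
    int node_id;
    bool node_over_dirty_limit;
};

int main(void)
{
    struct toy_zone zonelist[NR_TOY_ZONES] = {
        { 0, true }, { 0, true }, { 0, true },
        { 1, false }, { 1, false }, { 2, true },
    };
    int last_node_over_limit = -1;
    int i;

    for (i = 0; i < NR_TOY_ZONES; i++) {
        struct toy_zone *z = &zonelist[i];

        if (z->node_id == last_node_over_limit) {
            printf("zone %d: skipped (node %d cached)\n", i, z->node_id);
            continue;
        }
        if (z->node_over_dirty_limit) {
            last_node_over_limit = z->node_id;
            printf("zone %d: node %d over dirty limit\n", i, z->node_id);
            continue;
        }
        printf("zone %d: candidate on node %d\n", i, z->node_id);
    }
    return 0;
}

The same last-pgdat pattern appears a few hunks later in wake_all_kswapds(), which now wakes each node's kswapd only once even though the zonelist visits every zone.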
@@ -2959,16 +2902,16 @@ zonelist_scan:
2959 if (alloc_flags & ALLOC_NO_WATERMARKS) 2902 if (alloc_flags & ALLOC_NO_WATERMARKS)
2960 goto try_this_zone; 2903 goto try_this_zone;
2961 2904
2962 if (zone_reclaim_mode == 0 || 2905 if (node_reclaim_mode == 0 ||
2963 !zone_allows_reclaim(ac->preferred_zoneref->zone, zone)) 2906 !zone_allows_reclaim(ac->preferred_zoneref->zone, zone))
2964 continue; 2907 continue;
2965 2908
2966 ret = zone_reclaim(zone, gfp_mask, order); 2909 ret = node_reclaim(zone->zone_pgdat, gfp_mask, order);
2967 switch (ret) { 2910 switch (ret) {
2968 case ZONE_RECLAIM_NOSCAN: 2911 case NODE_RECLAIM_NOSCAN:
2969 /* did not scan */ 2912 /* did not scan */
2970 continue; 2913 continue;
2971 case ZONE_RECLAIM_FULL: 2914 case NODE_RECLAIM_FULL:
2972 /* scanned but unreclaimable */ 2915 /* scanned but unreclaimable */
2973 continue; 2916 continue;
2974 default: 2917 default:
@@ -2998,23 +2941,6 @@ try_this_zone:
2998 } 2941 }
2999 } 2942 }
3000 2943
3001 /*
3002 * The first pass makes sure allocations are spread fairly within the
3003 * local node. However, the local node might have free pages left
3004 * after the fairness batches are exhausted, and remote zones haven't
3005 * even been considered yet. Try once more without fairness, and
3006 * include remote zones now, before entering the slowpath and waking
3007 * kswapd: prefer spilling to a remote zone over swapping locally.
3008 */
3009 if (fair_skipped) {
3010reset_fair:
3011 apply_fair = false;
3012 fair_skipped = false;
3013 reset_alloc_batches(ac->preferred_zoneref->zone);
3014 z = ac->preferred_zoneref;
3015 goto zonelist_scan;
3016 }
3017
3018 return NULL; 2944 return NULL;
3019} 2945}
3020 2946
@@ -3159,7 +3085,6 @@ out:
3159 return page; 3085 return page;
3160} 3086}
3161 3087
3162
3163/* 3088/*
3164 * Maximum number of compaction retries wit a progress before OOM 3089 * Maximum number of compaction retries wit a progress before OOM
3165 * killer is consider as the only way to move forward. 3090 * killer is consider as the only way to move forward.
@@ -3171,17 +3096,16 @@ out:
3171static struct page * 3096static struct page *
3172__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, 3097__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
3173 unsigned int alloc_flags, const struct alloc_context *ac, 3098 unsigned int alloc_flags, const struct alloc_context *ac,
3174 enum migrate_mode mode, enum compact_result *compact_result) 3099 enum compact_priority prio, enum compact_result *compact_result)
3175{ 3100{
3176 struct page *page; 3101 struct page *page;
3177 int contended_compaction;
3178 3102
3179 if (!order) 3103 if (!order)
3180 return NULL; 3104 return NULL;
3181 3105
3182 current->flags |= PF_MEMALLOC; 3106 current->flags |= PF_MEMALLOC;
3183 *compact_result = try_to_compact_pages(gfp_mask, order, alloc_flags, ac, 3107 *compact_result = try_to_compact_pages(gfp_mask, order, alloc_flags, ac,
3184 mode, &contended_compaction); 3108 prio);
3185 current->flags &= ~PF_MEMALLOC; 3109 current->flags &= ~PF_MEMALLOC;
3186 3110
3187 if (*compact_result <= COMPACT_INACTIVE) 3111 if (*compact_result <= COMPACT_INACTIVE)
@@ -3193,8 +3117,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
3193 */ 3117 */
3194 count_vm_event(COMPACTSTALL); 3118 count_vm_event(COMPACTSTALL);
3195 3119
3196 page = get_page_from_freelist(gfp_mask, order, 3120 page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
3197 alloc_flags & ~ALLOC_NO_WATERMARKS, ac);
3198 3121
3199 if (page) { 3122 if (page) {
3200 struct zone *zone = page_zone(page); 3123 struct zone *zone = page_zone(page);
@@ -3211,24 +3134,6 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
3211 */ 3134 */
3212 count_vm_event(COMPACTFAIL); 3135 count_vm_event(COMPACTFAIL);
3213 3136
3214 /*
3215 * In all zones where compaction was attempted (and not
3216 * deferred or skipped), lock contention has been detected.
3217 * For THP allocation we do not want to disrupt the others
3218 * so we fallback to base pages instead.
3219 */
3220 if (contended_compaction == COMPACT_CONTENDED_LOCK)
3221 *compact_result = COMPACT_CONTENDED;
3222
3223 /*
3224 * If compaction was aborted due to need_resched(), we do not
3225 * want to further increase allocation latency, unless it is
3226 * khugepaged trying to collapse.
3227 */
3228 if (contended_compaction == COMPACT_CONTENDED_SCHED
3229 && !(current->flags & PF_KTHREAD))
3230 *compact_result = COMPACT_CONTENDED;
3231
3232 cond_resched(); 3137 cond_resched();
3233 3138
3234 return NULL; 3139 return NULL;
@@ -3236,7 +3141,8 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
3236 3141
3237static inline bool 3142static inline bool
3238should_compact_retry(struct alloc_context *ac, int order, int alloc_flags, 3143should_compact_retry(struct alloc_context *ac, int order, int alloc_flags,
3239 enum compact_result compact_result, enum migrate_mode *migrate_mode, 3144 enum compact_result compact_result,
3145 enum compact_priority *compact_priority,
3240 int compaction_retries) 3146 int compaction_retries)
3241{ 3147{
3242 int max_retries = MAX_COMPACT_RETRIES; 3148 int max_retries = MAX_COMPACT_RETRIES;
@@ -3247,11 +3153,11 @@ should_compact_retry(struct alloc_context *ac, int order, int alloc_flags,
3247 /* 3153 /*
3248 * compaction considers all the zones as desperately out of memory 3154 * compaction considers all the zones as desperately out of memory
3249 * so it doesn't really make much sense to retry except when the 3155 * so it doesn't really make much sense to retry except when the
3250 * failure could be caused by weak migration mode. 3156 * failure could be caused by insufficient priority
3251 */ 3157 */
3252 if (compaction_failed(compact_result)) { 3158 if (compaction_failed(compact_result)) {
3253 if (*migrate_mode == MIGRATE_ASYNC) { 3159 if (*compact_priority > MIN_COMPACT_PRIORITY) {
3254 *migrate_mode = MIGRATE_SYNC_LIGHT; 3160 (*compact_priority)--;
3255 return true; 3161 return true;
3256 } 3162 }
3257 return false; 3163 return false;
@@ -3285,7 +3191,7 @@ should_compact_retry(struct alloc_context *ac, int order, int alloc_flags,
3285static inline struct page * 3191static inline struct page *
3286__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, 3192__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
3287 unsigned int alloc_flags, const struct alloc_context *ac, 3193 unsigned int alloc_flags, const struct alloc_context *ac,
3288 enum migrate_mode mode, enum compact_result *compact_result) 3194 enum compact_priority prio, enum compact_result *compact_result)
3289{ 3195{
3290 *compact_result = COMPACT_SKIPPED; 3196 *compact_result = COMPACT_SKIPPED;
3291 return NULL; 3197 return NULL;
@@ -3294,7 +3200,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
3294static inline bool 3200static inline bool
3295should_compact_retry(struct alloc_context *ac, unsigned int order, int alloc_flags, 3201should_compact_retry(struct alloc_context *ac, unsigned int order, int alloc_flags,
3296 enum compact_result compact_result, 3202 enum compact_result compact_result,
3297 enum migrate_mode *migrate_mode, 3203 enum compact_priority *compact_priority,
3298 int compaction_retries) 3204 int compaction_retries)
3299{ 3205{
3300 struct zone *zone; 3206 struct zone *zone;
@@ -3362,8 +3268,7 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
3362 return NULL; 3268 return NULL;
3363 3269
3364retry: 3270retry:
3365 page = get_page_from_freelist(gfp_mask, order, 3271 page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
3366 alloc_flags & ~ALLOC_NO_WATERMARKS, ac);
3367 3272
3368 /* 3273 /*
3369 * If an allocation failed after direct reclaim, it could be because 3274 * If an allocation failed after direct reclaim, it could be because
@@ -3384,10 +3289,14 @@ static void wake_all_kswapds(unsigned int order, const struct alloc_context *ac)
3384{ 3289{
3385 struct zoneref *z; 3290 struct zoneref *z;
3386 struct zone *zone; 3291 struct zone *zone;
3292 pg_data_t *last_pgdat = NULL;
3387 3293
3388 for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, 3294 for_each_zone_zonelist_nodemask(zone, z, ac->zonelist,
3389 ac->high_zoneidx, ac->nodemask) 3295 ac->high_zoneidx, ac->nodemask) {
3390 wakeup_kswapd(zone, order, ac_classzone_idx(ac)); 3296 if (last_pgdat != zone->zone_pgdat)
3297 wakeup_kswapd(zone, order, ac->high_zoneidx);
3298 last_pgdat = zone->zone_pgdat;
3299 }
3391} 3300}
3392 3301
3393static inline unsigned int 3302static inline unsigned int
@@ -3421,16 +3330,6 @@ gfp_to_alloc_flags(gfp_t gfp_mask)
3421 } else if (unlikely(rt_task(current)) && !in_interrupt()) 3330 } else if (unlikely(rt_task(current)) && !in_interrupt())
3422 alloc_flags |= ALLOC_HARDER; 3331 alloc_flags |= ALLOC_HARDER;
3423 3332
3424 if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) {
3425 if (gfp_mask & __GFP_MEMALLOC)
3426 alloc_flags |= ALLOC_NO_WATERMARKS;
3427 else if (in_serving_softirq() && (current->flags & PF_MEMALLOC))
3428 alloc_flags |= ALLOC_NO_WATERMARKS;
3429 else if (!in_interrupt() &&
3430 ((current->flags & PF_MEMALLOC) ||
3431 unlikely(test_thread_flag(TIF_MEMDIE))))
3432 alloc_flags |= ALLOC_NO_WATERMARKS;
3433 }
3434#ifdef CONFIG_CMA 3333#ifdef CONFIG_CMA
3435 if (gfpflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE) 3334 if (gfpflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
3436 alloc_flags |= ALLOC_CMA; 3335 alloc_flags |= ALLOC_CMA;
@@ -3440,12 +3339,19 @@ gfp_to_alloc_flags(gfp_t gfp_mask)
3440 3339
3441bool gfp_pfmemalloc_allowed(gfp_t gfp_mask) 3340bool gfp_pfmemalloc_allowed(gfp_t gfp_mask)
3442{ 3341{
3443 return !!(gfp_to_alloc_flags(gfp_mask) & ALLOC_NO_WATERMARKS); 3342 if (unlikely(gfp_mask & __GFP_NOMEMALLOC))
3444} 3343 return false;
3445 3344
3446static inline bool is_thp_gfp_mask(gfp_t gfp_mask) 3345 if (gfp_mask & __GFP_MEMALLOC)
3447{ 3346 return true;
3448 return (gfp_mask & (GFP_TRANSHUGE | __GFP_KSWAPD_RECLAIM)) == GFP_TRANSHUGE; 3347 if (in_serving_softirq() && (current->flags & PF_MEMALLOC))
3348 return true;
3349 if (!in_interrupt() &&
3350 ((current->flags & PF_MEMALLOC) ||
3351 unlikely(test_thread_flag(TIF_MEMDIE))))
3352 return true;
3353
3354 return false;
3449} 3355}
3450 3356
3451/* 3357/*
@@ -3481,10 +3387,10 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order,
3481 return false; 3387 return false;
3482 3388
3483 /* 3389 /*
3484 * Keep reclaiming pages while there is a chance this will lead somewhere. 3390 * Keep reclaiming pages while there is a chance this will lead
3485 * If none of the target zones can satisfy our allocation request even 3391 * somewhere. If none of the target zones can satisfy our allocation
3486 * if all reclaimable pages are considered then we are screwed and have 3392 * request even if all reclaimable pages are considered then we are
3487 * to go OOM. 3393 * screwed and have to go OOM.
3488 */ 3394 */
3489 for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx, 3395 for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx,
3490 ac->nodemask) { 3396 ac->nodemask) {
@@ -3509,14 +3415,12 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order,
3509 * prevent premature OOM 3415 * prevent premature OOM
3510 */ 3416 */
3511 if (!did_some_progress) { 3417 if (!did_some_progress) {
3512 unsigned long writeback; 3418 unsigned long write_pending;
3513 unsigned long dirty;
3514 3419
3515 writeback = zone_page_state_snapshot(zone, 3420 write_pending = zone_page_state_snapshot(zone,
3516 NR_WRITEBACK); 3421 NR_ZONE_WRITE_PENDING);
3517 dirty = zone_page_state_snapshot(zone, NR_FILE_DIRTY);
3518 3422
3519 if (2*(writeback + dirty) > reclaimable) { 3423 if (2 * write_pending > reclaimable) {
3520 congestion_wait(BLK_RW_ASYNC, HZ/10); 3424 congestion_wait(BLK_RW_ASYNC, HZ/10);
3521 return true; 3425 return true;
3522 } 3426 }
@@ -3551,7 +3455,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
3551 struct page *page = NULL; 3455 struct page *page = NULL;
3552 unsigned int alloc_flags; 3456 unsigned int alloc_flags;
3553 unsigned long did_some_progress; 3457 unsigned long did_some_progress;
3554 enum migrate_mode migration_mode = MIGRATE_ASYNC; 3458 enum compact_priority compact_priority = DEF_COMPACT_PRIORITY;
3555 enum compact_result compact_result; 3459 enum compact_result compact_result;
3556 int compaction_retries = 0; 3460 int compaction_retries = 0;
3557 int no_progress_loops = 0; 3461 int no_progress_loops = 0;
@@ -3575,42 +3479,88 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
3575 (__GFP_ATOMIC|__GFP_DIRECT_RECLAIM))) 3479 (__GFP_ATOMIC|__GFP_DIRECT_RECLAIM)))
3576 gfp_mask &= ~__GFP_ATOMIC; 3480 gfp_mask &= ~__GFP_ATOMIC;
3577 3481
3578retry: 3482 /*
3483 * The fast path uses conservative alloc_flags to succeed only until
3484 * kswapd needs to be woken up, and to avoid the cost of setting up
3485 * alloc_flags precisely. So we do that now.
3486 */
3487 alloc_flags = gfp_to_alloc_flags(gfp_mask);
3488
3579 if (gfp_mask & __GFP_KSWAPD_RECLAIM) 3489 if (gfp_mask & __GFP_KSWAPD_RECLAIM)
3580 wake_all_kswapds(order, ac); 3490 wake_all_kswapds(order, ac);
3581 3491
3582 /* 3492 /*
3583 * OK, we're below the kswapd watermark and have kicked background 3493 * The adjusted alloc_flags might result in immediate success, so try
3584 * reclaim. Now things get more complex, so set up alloc_flags according 3494 * that first
3585 * to how we want to proceed.
3586 */ 3495 */
3587 alloc_flags = gfp_to_alloc_flags(gfp_mask); 3496 page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
3497 if (page)
3498 goto got_pg;
3499
3500 /*
3501 * For costly allocations, try direct compaction first, as it's likely
3502 * that we have enough base pages and don't need to reclaim. Don't try
3503 * that for allocations that are allowed to ignore watermarks, as the
3504 * ALLOC_NO_WATERMARKS attempt didn't yet happen.
3505 */
3506 if (can_direct_reclaim && order > PAGE_ALLOC_COSTLY_ORDER &&
3507 !gfp_pfmemalloc_allowed(gfp_mask)) {
3508 page = __alloc_pages_direct_compact(gfp_mask, order,
3509 alloc_flags, ac,
3510 INIT_COMPACT_PRIORITY,
3511 &compact_result);
3512 if (page)
3513 goto got_pg;
3514
3515 /*
3516 * Checks for costly allocations with __GFP_NORETRY, which
3517 * includes THP page fault allocations
3518 */
3519 if (gfp_mask & __GFP_NORETRY) {
3520 /*
3521 * If compaction is deferred for high-order allocations,
3522 * it is because sync compaction recently failed. If
3523 * this is the case and the caller requested a THP
3524 * allocation, we do not want to heavily disrupt the
3525 * system, so we fail the allocation instead of entering
3526 * direct reclaim.
3527 */
3528 if (compact_result == COMPACT_DEFERRED)
3529 goto nopage;
3530
3531 /*
3532 * Looks like reclaim/compaction is worth trying, but
3533 * sync compaction could be very expensive, so keep
3534 * using async compaction.
3535 */
3536 compact_priority = INIT_COMPACT_PRIORITY;
3537 }
3538 }
3539
3540retry:
3541 /* Ensure kswapd doesn't accidentally go to sleep as long as we loop */
3542 if (gfp_mask & __GFP_KSWAPD_RECLAIM)
3543 wake_all_kswapds(order, ac);
3544
3545 if (gfp_pfmemalloc_allowed(gfp_mask))
3546 alloc_flags = ALLOC_NO_WATERMARKS;
3588 3547
3589 /* 3548 /*
3590 * Reset the zonelist iterators if memory policies can be ignored. 3549 * Reset the zonelist iterators if memory policies can be ignored.
3591 * These allocations are high priority and system rather than user 3550 * These allocations are high priority and system rather than user
3592 * orientated. 3551 * orientated.
3593 */ 3552 */
3594 if ((alloc_flags & ALLOC_NO_WATERMARKS) || !(alloc_flags & ALLOC_CPUSET)) { 3553 if (!(alloc_flags & ALLOC_CPUSET) || (alloc_flags & ALLOC_NO_WATERMARKS)) {
3595 ac->zonelist = node_zonelist(numa_node_id(), gfp_mask); 3554 ac->zonelist = node_zonelist(numa_node_id(), gfp_mask);
3596 ac->preferred_zoneref = first_zones_zonelist(ac->zonelist, 3555 ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
3597 ac->high_zoneidx, ac->nodemask); 3556 ac->high_zoneidx, ac->nodemask);
3598 } 3557 }
3599 3558
3600 /* This is the last chance, in general, before the goto nopage. */ 3559 /* Attempt with potentially adjusted zonelist and alloc_flags */
3601 page = get_page_from_freelist(gfp_mask, order, 3560 page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
3602 alloc_flags & ~ALLOC_NO_WATERMARKS, ac);
3603 if (page) 3561 if (page)
3604 goto got_pg; 3562 goto got_pg;
3605 3563
3606 /* Allocate without watermarks if the context allows */
3607 if (alloc_flags & ALLOC_NO_WATERMARKS) {
3608 page = get_page_from_freelist(gfp_mask, order,
3609 ALLOC_NO_WATERMARKS, ac);
3610 if (page)
3611 goto got_pg;
3612 }
3613
3614 /* Caller is not willing to reclaim, we can't balance anything */ 3564 /* Caller is not willing to reclaim, we can't balance anything */
3615 if (!can_direct_reclaim) { 3565 if (!can_direct_reclaim) {
3616 /* 3566 /*
@@ -3640,38 +3590,6 @@ retry:
3640 if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL)) 3590 if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL))
3641 goto nopage; 3591 goto nopage;
3642 3592
3643 /*
3644 * Try direct compaction. The first pass is asynchronous. Subsequent
3645 * attempts after direct reclaim are synchronous
3646 */
3647 page = __alloc_pages_direct_compact(gfp_mask, order, alloc_flags, ac,
3648 migration_mode,
3649 &compact_result);
3650 if (page)
3651 goto got_pg;
3652
3653 /* Checks for THP-specific high-order allocations */
3654 if (is_thp_gfp_mask(gfp_mask)) {
3655 /*
3656 * If compaction is deferred for high-order allocations, it is
3657 * because sync compaction recently failed. If this is the case
3658 * and the caller requested a THP allocation, we do not want
3659 * to heavily disrupt the system, so we fail the allocation
3660 * instead of entering direct reclaim.
3661 */
3662 if (compact_result == COMPACT_DEFERRED)
3663 goto nopage;
3664
3665 /*
3666 * Compaction is contended so rather back off than cause
3667 * excessive stalls.
3668 */
3669 if(compact_result == COMPACT_CONTENDED)
3670 goto nopage;
3671 }
3672
3673 if (order && compaction_made_progress(compact_result))
3674 compaction_retries++;
3675 3593
3676 /* Try direct reclaim and then allocating */ 3594 /* Try direct reclaim and then allocating */
3677 page = __alloc_pages_direct_reclaim(gfp_mask, order, alloc_flags, ac, 3595 page = __alloc_pages_direct_reclaim(gfp_mask, order, alloc_flags, ac,
@@ -3679,16 +3597,25 @@ retry:
3679 if (page) 3597 if (page)
3680 goto got_pg; 3598 goto got_pg;
3681 3599
3600 /* Try direct compaction and then allocating */
3601 page = __alloc_pages_direct_compact(gfp_mask, order, alloc_flags, ac,
3602 compact_priority, &compact_result);
3603 if (page)
3604 goto got_pg;
3605
3606 if (order && compaction_made_progress(compact_result))
3607 compaction_retries++;
3608
3682 /* Do not loop if specifically requested */ 3609 /* Do not loop if specifically requested */
3683 if (gfp_mask & __GFP_NORETRY) 3610 if (gfp_mask & __GFP_NORETRY)
3684 goto noretry; 3611 goto nopage;
3685 3612
3686 /* 3613 /*
3687 * Do not retry costly high order allocations unless they are 3614 * Do not retry costly high order allocations unless they are
3688 * __GFP_REPEAT 3615 * __GFP_REPEAT
3689 */ 3616 */
3690 if (order > PAGE_ALLOC_COSTLY_ORDER && !(gfp_mask & __GFP_REPEAT)) 3617 if (order > PAGE_ALLOC_COSTLY_ORDER && !(gfp_mask & __GFP_REPEAT))
3691 goto noretry; 3618 goto nopage;
3692 3619
3693 /* 3620 /*
3694 * Costly allocations might have made a progress but this doesn't mean 3621 * Costly allocations might have made a progress but this doesn't mean
@@ -3712,7 +3639,7 @@ retry:
3712 */ 3639 */
3713 if (did_some_progress > 0 && 3640 if (did_some_progress > 0 &&
3714 should_compact_retry(ac, order, alloc_flags, 3641 should_compact_retry(ac, order, alloc_flags,
3715 compact_result, &migration_mode, 3642 compact_result, &compact_priority,
3716 compaction_retries)) 3643 compaction_retries))
3717 goto retry; 3644 goto retry;
3718 3645
@@ -3727,25 +3654,6 @@ retry:
3727 goto retry; 3654 goto retry;
3728 } 3655 }
3729 3656
3730noretry:
3731 /*
3732 * High-order allocations do not necessarily loop after direct reclaim
3733 * and reclaim/compaction depends on compaction being called after
3734 * reclaim so call directly if necessary.
3735 * It can become very expensive to allocate transparent hugepages at
3736 * fault, so use asynchronous memory compaction for THP unless it is
3737 * khugepaged trying to collapse. All other requests should tolerate
3738 * at least light sync migration.
3739 */
3740 if (is_thp_gfp_mask(gfp_mask) && !(current->flags & PF_KTHREAD))
3741 migration_mode = MIGRATE_ASYNC;
3742 else
3743 migration_mode = MIGRATE_SYNC_LIGHT;
3744 page = __alloc_pages_direct_compact(gfp_mask, order, alloc_flags,
3745 ac, migration_mode,
3746 &compact_result);
3747 if (page)
3748 goto got_pg;
3749nopage: 3657nopage:
3750 warn_alloc_failed(gfp_mask, order, NULL); 3658 warn_alloc_failed(gfp_mask, order, NULL);
3751got_pg: 3659got_pg:
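The restructured slowpath sets up alloc_flags once, makes an early attempt (plus an initial lower-priority compaction pass for costly orders), and only then enters the retry loop, which now reclaims before it compacts. Below is a compressed control-flow sketch with all outcomes faked and the OOM/nofail handling omitted; it is an outline of the ordering, not a reimplementation. The real function additionally re-derives the zonelist when watermarks or cpusets are ignored.

#include <stdbool.h>
#include <stdio.h>

#define TOY_COSTLY_ORDER 3

/* Fake outcomes so the control flow can be followed end to end. */
static bool try_freelist(const char *why) { printf("freelist (%s)\n", why); return false; }
static bool try_compaction(const char *prio) { printf("compaction (%s)\n", prio); return false; }
static bool try_reclaim(void) { printf("direct reclaim\n"); return false; }

static void toy_slowpath(unsigned int order, bool noretry)
{
    int loops = 0;

    printf("alloc_flags = gfp_to_alloc_flags()\n");
    printf("wake kswapd\n");
    if (try_freelist("adjusted flags"))
        return;
    if (order > TOY_COSTLY_ORDER &&
        try_compaction("initial, async-like priority"))
        return;

    while (loops++ < 2) {   /* bounded here; the kernel uses heuristics */
        printf("wake kswapd again\n");
        if (try_freelist("retry"))
            return;
        if (try_reclaim())
            return;
        if (try_compaction("current priority"))
            return;
        if (noretry)
            break;
    }
    printf("give up (nopage)\n");
}

int main(void)
{
    toy_slowpath(4, true);
    return 0;
}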
@@ -3761,7 +3669,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
3761{ 3669{
3762 struct page *page; 3670 struct page *page;
3763 unsigned int cpuset_mems_cookie; 3671 unsigned int cpuset_mems_cookie;
3764 unsigned int alloc_flags = ALLOC_WMARK_LOW|ALLOC_FAIR; 3672 unsigned int alloc_flags = ALLOC_WMARK_LOW;
3765 gfp_t alloc_mask = gfp_mask; /* The gfp_t that was actually used for allocation */ 3673 gfp_t alloc_mask = gfp_mask; /* The gfp_t that was actually used for allocation */
3766 struct alloc_context ac = { 3674 struct alloc_context ac = {
3767 .high_zoneidx = gfp_zone(gfp_mask), 3675 .high_zoneidx = gfp_zone(gfp_mask),
@@ -4192,7 +4100,7 @@ EXPORT_SYMBOL_GPL(si_mem_available);
4192void si_meminfo(struct sysinfo *val) 4100void si_meminfo(struct sysinfo *val)
4193{ 4101{
4194 val->totalram = totalram_pages; 4102 val->totalram = totalram_pages;
4195 val->sharedram = global_page_state(NR_SHMEM); 4103 val->sharedram = global_node_page_state(NR_SHMEM);
4196 val->freeram = global_page_state(NR_FREE_PAGES); 4104 val->freeram = global_page_state(NR_FREE_PAGES);
4197 val->bufferram = nr_blockdev_pages(); 4105 val->bufferram = nr_blockdev_pages();
4198 val->totalhigh = totalhigh_pages; 4106 val->totalhigh = totalhigh_pages;
@@ -4214,8 +4122,8 @@ void si_meminfo_node(struct sysinfo *val, int nid)
4214 for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) 4122 for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++)
4215 managed_pages += pgdat->node_zones[zone_type].managed_pages; 4123 managed_pages += pgdat->node_zones[zone_type].managed_pages;
4216 val->totalram = managed_pages; 4124 val->totalram = managed_pages;
4217 val->sharedram = node_page_state(nid, NR_SHMEM); 4125 val->sharedram = node_page_state(pgdat, NR_SHMEM);
4218 val->freeram = node_page_state(nid, NR_FREE_PAGES); 4126 val->freeram = sum_zone_node_page_state(nid, NR_FREE_PAGES);
4219#ifdef CONFIG_HIGHMEM 4127#ifdef CONFIG_HIGHMEM
4220 for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) { 4128 for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) {
4221 struct zone *zone = &pgdat->node_zones[zone_type]; 4129 struct zone *zone = &pgdat->node_zones[zone_type];
@@ -4298,6 +4206,7 @@ void show_free_areas(unsigned int filter)
4298 unsigned long free_pcp = 0; 4206 unsigned long free_pcp = 0;
4299 int cpu; 4207 int cpu;
4300 struct zone *zone; 4208 struct zone *zone;
4209 pg_data_t *pgdat;
4301 4210
4302 for_each_populated_zone(zone) { 4211 for_each_populated_zone(zone) {
4303 if (skip_free_areas_node(filter, zone_to_nid(zone))) 4212 if (skip_free_areas_node(filter, zone_to_nid(zone)))
@@ -4312,35 +4221,74 @@ void show_free_areas(unsigned int filter)
4312 " unevictable:%lu dirty:%lu writeback:%lu unstable:%lu\n" 4221 " unevictable:%lu dirty:%lu writeback:%lu unstable:%lu\n"
4313 " slab_reclaimable:%lu slab_unreclaimable:%lu\n" 4222 " slab_reclaimable:%lu slab_unreclaimable:%lu\n"
4314 " mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n" 4223 " mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n"
4315#ifdef CONFIG_TRANSPARENT_HUGEPAGE
4316 " anon_thp: %lu shmem_thp: %lu shmem_pmdmapped: %lu\n"
4317#endif
4318 " free:%lu free_pcp:%lu free_cma:%lu\n", 4224 " free:%lu free_pcp:%lu free_cma:%lu\n",
4319 global_page_state(NR_ACTIVE_ANON), 4225 global_node_page_state(NR_ACTIVE_ANON),
4320 global_page_state(NR_INACTIVE_ANON), 4226 global_node_page_state(NR_INACTIVE_ANON),
4321 global_page_state(NR_ISOLATED_ANON), 4227 global_node_page_state(NR_ISOLATED_ANON),
4322 global_page_state(NR_ACTIVE_FILE), 4228 global_node_page_state(NR_ACTIVE_FILE),
4323 global_page_state(NR_INACTIVE_FILE), 4229 global_node_page_state(NR_INACTIVE_FILE),
4324 global_page_state(NR_ISOLATED_FILE), 4230 global_node_page_state(NR_ISOLATED_FILE),
4325 global_page_state(NR_UNEVICTABLE), 4231 global_node_page_state(NR_UNEVICTABLE),
4326 global_page_state(NR_FILE_DIRTY), 4232 global_node_page_state(NR_FILE_DIRTY),
4327 global_page_state(NR_WRITEBACK), 4233 global_node_page_state(NR_WRITEBACK),
4328 global_page_state(NR_UNSTABLE_NFS), 4234 global_node_page_state(NR_UNSTABLE_NFS),
4329 global_page_state(NR_SLAB_RECLAIMABLE), 4235 global_page_state(NR_SLAB_RECLAIMABLE),
4330 global_page_state(NR_SLAB_UNRECLAIMABLE), 4236 global_page_state(NR_SLAB_UNRECLAIMABLE),
4331 global_page_state(NR_FILE_MAPPED), 4237 global_node_page_state(NR_FILE_MAPPED),
4332 global_page_state(NR_SHMEM), 4238 global_node_page_state(NR_SHMEM),
4333 global_page_state(NR_PAGETABLE), 4239 global_page_state(NR_PAGETABLE),
4334 global_page_state(NR_BOUNCE), 4240 global_page_state(NR_BOUNCE),
4335#ifdef CONFIG_TRANSPARENT_HUGEPAGE
4336 global_page_state(NR_ANON_THPS) * HPAGE_PMD_NR,
4337 global_page_state(NR_SHMEM_THPS) * HPAGE_PMD_NR,
4338 global_page_state(NR_SHMEM_PMDMAPPED) * HPAGE_PMD_NR,
4339#endif
4340 global_page_state(NR_FREE_PAGES), 4241 global_page_state(NR_FREE_PAGES),
4341 free_pcp, 4242 free_pcp,
4342 global_page_state(NR_FREE_CMA_PAGES)); 4243 global_page_state(NR_FREE_CMA_PAGES));
4343 4244
4245 for_each_online_pgdat(pgdat) {
4246 printk("Node %d"
4247 " active_anon:%lukB"
4248 " inactive_anon:%lukB"
4249 " active_file:%lukB"
4250 " inactive_file:%lukB"
4251 " unevictable:%lukB"
4252 " isolated(anon):%lukB"
4253 " isolated(file):%lukB"
4254 " mapped:%lukB"
4255 " dirty:%lukB"
4256 " writeback:%lukB"
4257 " shmem:%lukB"
4258#ifdef CONFIG_TRANSPARENT_HUGEPAGE
4259 " shmem_thp: %lukB"
4260 " shmem_pmdmapped: %lukB"
4261 " anon_thp: %lukB"
4262#endif
4263 " writeback_tmp:%lukB"
4264 " unstable:%lukB"
4265 " pages_scanned:%lu"
4266 " all_unreclaimable? %s"
4267 "\n",
4268 pgdat->node_id,
4269 K(node_page_state(pgdat, NR_ACTIVE_ANON)),
4270 K(node_page_state(pgdat, NR_INACTIVE_ANON)),
4271 K(node_page_state(pgdat, NR_ACTIVE_FILE)),
4272 K(node_page_state(pgdat, NR_INACTIVE_FILE)),
4273 K(node_page_state(pgdat, NR_UNEVICTABLE)),
4274 K(node_page_state(pgdat, NR_ISOLATED_ANON)),
4275 K(node_page_state(pgdat, NR_ISOLATED_FILE)),
4276 K(node_page_state(pgdat, NR_FILE_MAPPED)),
4277 K(node_page_state(pgdat, NR_FILE_DIRTY)),
4278 K(node_page_state(pgdat, NR_WRITEBACK)),
4279#ifdef CONFIG_TRANSPARENT_HUGEPAGE
4280 K(node_page_state(pgdat, NR_SHMEM_THPS) * HPAGE_PMD_NR),
4281 K(node_page_state(pgdat, NR_SHMEM_PMDMAPPED)
4282 * HPAGE_PMD_NR),
4283 K(node_page_state(pgdat, NR_ANON_THPS) * HPAGE_PMD_NR),
4284#endif
4285 K(node_page_state(pgdat, NR_SHMEM)),
4286 K(node_page_state(pgdat, NR_WRITEBACK_TEMP)),
4287 K(node_page_state(pgdat, NR_UNSTABLE_NFS)),
4288 node_page_state(pgdat, NR_PAGES_SCANNED),
4289 !pgdat_reclaimable(pgdat) ? "yes" : "no");
4290 }
4291
4344 for_each_populated_zone(zone) { 4292 for_each_populated_zone(zone) {
4345 int i; 4293 int i;
4346 4294
@@ -4362,72 +4310,41 @@ void show_free_areas(unsigned int filter)
4362 " active_file:%lukB" 4310 " active_file:%lukB"
4363 " inactive_file:%lukB" 4311 " inactive_file:%lukB"
4364 " unevictable:%lukB" 4312 " unevictable:%lukB"
4365 " isolated(anon):%lukB" 4313 " writepending:%lukB"
4366 " isolated(file):%lukB"
4367 " present:%lukB" 4314 " present:%lukB"
4368 " managed:%lukB" 4315 " managed:%lukB"
4369 " mlocked:%lukB" 4316 " mlocked:%lukB"
4370 " dirty:%lukB"
4371 " writeback:%lukB"
4372 " mapped:%lukB"
4373 " shmem:%lukB"
4374#ifdef CONFIG_TRANSPARENT_HUGEPAGE
4375 " shmem_thp: %lukB"
4376 " shmem_pmdmapped: %lukB"
4377 " anon_thp: %lukB"
4378#endif
4379 " slab_reclaimable:%lukB" 4317 " slab_reclaimable:%lukB"
4380 " slab_unreclaimable:%lukB" 4318 " slab_unreclaimable:%lukB"
4381 " kernel_stack:%lukB" 4319 " kernel_stack:%lukB"
4382 " pagetables:%lukB" 4320 " pagetables:%lukB"
4383 " unstable:%lukB"
4384 " bounce:%lukB" 4321 " bounce:%lukB"
4385 " free_pcp:%lukB" 4322 " free_pcp:%lukB"
4386 " local_pcp:%ukB" 4323 " local_pcp:%ukB"
4387 " free_cma:%lukB" 4324 " free_cma:%lukB"
4388 " writeback_tmp:%lukB"
4389 " pages_scanned:%lu"
4390 " all_unreclaimable? %s"
4391 "\n", 4325 "\n",
4392 zone->name, 4326 zone->name,
4393 K(zone_page_state(zone, NR_FREE_PAGES)), 4327 K(zone_page_state(zone, NR_FREE_PAGES)),
4394 K(min_wmark_pages(zone)), 4328 K(min_wmark_pages(zone)),
4395 K(low_wmark_pages(zone)), 4329 K(low_wmark_pages(zone)),
4396 K(high_wmark_pages(zone)), 4330 K(high_wmark_pages(zone)),
4397 K(zone_page_state(zone, NR_ACTIVE_ANON)), 4331 K(zone_page_state(zone, NR_ZONE_ACTIVE_ANON)),
4398 K(zone_page_state(zone, NR_INACTIVE_ANON)), 4332 K(zone_page_state(zone, NR_ZONE_INACTIVE_ANON)),
4399 K(zone_page_state(zone, NR_ACTIVE_FILE)), 4333 K(zone_page_state(zone, NR_ZONE_ACTIVE_FILE)),
4400 K(zone_page_state(zone, NR_INACTIVE_FILE)), 4334 K(zone_page_state(zone, NR_ZONE_INACTIVE_FILE)),
4401 K(zone_page_state(zone, NR_UNEVICTABLE)), 4335 K(zone_page_state(zone, NR_ZONE_UNEVICTABLE)),
4402 K(zone_page_state(zone, NR_ISOLATED_ANON)), 4336 K(zone_page_state(zone, NR_ZONE_WRITE_PENDING)),
4403 K(zone_page_state(zone, NR_ISOLATED_FILE)),
4404 K(zone->present_pages), 4337 K(zone->present_pages),
4405 K(zone->managed_pages), 4338 K(zone->managed_pages),
4406 K(zone_page_state(zone, NR_MLOCK)), 4339 K(zone_page_state(zone, NR_MLOCK)),
4407 K(zone_page_state(zone, NR_FILE_DIRTY)),
4408 K(zone_page_state(zone, NR_WRITEBACK)),
4409 K(zone_page_state(zone, NR_FILE_MAPPED)),
4410 K(zone_page_state(zone, NR_SHMEM)),
4411#ifdef CONFIG_TRANSPARENT_HUGEPAGE
4412 K(zone_page_state(zone, NR_SHMEM_THPS) * HPAGE_PMD_NR),
4413 K(zone_page_state(zone, NR_SHMEM_PMDMAPPED)
4414 * HPAGE_PMD_NR),
4415 K(zone_page_state(zone, NR_ANON_THPS) * HPAGE_PMD_NR),
4416#endif
4417 K(zone_page_state(zone, NR_SLAB_RECLAIMABLE)), 4340 K(zone_page_state(zone, NR_SLAB_RECLAIMABLE)),
4418 K(zone_page_state(zone, NR_SLAB_UNRECLAIMABLE)), 4341 K(zone_page_state(zone, NR_SLAB_UNRECLAIMABLE)),
4419 zone_page_state(zone, NR_KERNEL_STACK) * 4342 zone_page_state(zone, NR_KERNEL_STACK_KB),
4420 THREAD_SIZE / 1024,
4421 K(zone_page_state(zone, NR_PAGETABLE)), 4343 K(zone_page_state(zone, NR_PAGETABLE)),
4422 K(zone_page_state(zone, NR_UNSTABLE_NFS)),
4423 K(zone_page_state(zone, NR_BOUNCE)), 4344 K(zone_page_state(zone, NR_BOUNCE)),
4424 K(free_pcp), 4345 K(free_pcp),
4425 K(this_cpu_read(zone->pageset->pcp.count)), 4346 K(this_cpu_read(zone->pageset->pcp.count)),
4426 K(zone_page_state(zone, NR_FREE_CMA_PAGES)), 4347 K(zone_page_state(zone, NR_FREE_CMA_PAGES)));
4427 K(zone_page_state(zone, NR_WRITEBACK_TEMP)),
4428 K(zone_page_state(zone, NR_PAGES_SCANNED)),
4429 (!zone_reclaimable(zone) ? "yes" : "no")
4430 );
4431 printk("lowmem_reserve[]:"); 4348 printk("lowmem_reserve[]:");
4432 for (i = 0; i < MAX_NR_ZONES; i++) 4349 for (i = 0; i < MAX_NR_ZONES; i++)
4433 printk(" %ld", zone->lowmem_reserve[i]); 4350 printk(" %ld", zone->lowmem_reserve[i]);
@@ -4469,7 +4386,7 @@ void show_free_areas(unsigned int filter)
4469 4386
4470 hugetlb_show_meminfo(); 4387 hugetlb_show_meminfo();
4471 4388
4472 printk("%ld total pagecache pages\n", global_page_state(NR_FILE_PAGES)); 4389 printk("%ld total pagecache pages\n", global_node_page_state(NR_FILE_PAGES));
4473 4390
4474 show_swap_cache_info(); 4391 show_swap_cache_info();
4475} 4392}
@@ -5340,6 +5257,11 @@ static void __meminit setup_zone_pageset(struct zone *zone)
5340 zone->pageset = alloc_percpu(struct per_cpu_pageset); 5257 zone->pageset = alloc_percpu(struct per_cpu_pageset);
5341 for_each_possible_cpu(cpu) 5258 for_each_possible_cpu(cpu)
5342 zone_pageset_init(zone, cpu); 5259 zone_pageset_init(zone, cpu);
5260
5261 if (!zone->zone_pgdat->per_cpu_nodestats) {
5262 zone->zone_pgdat->per_cpu_nodestats =
5263 alloc_percpu(struct per_cpu_nodestat);
5264 }
5343} 5265}
5344 5266
5345/* 5267/*
@@ -5909,6 +5831,8 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat)
5909 init_waitqueue_head(&pgdat->kcompactd_wait); 5831 init_waitqueue_head(&pgdat->kcompactd_wait);
5910#endif 5832#endif
5911 pgdat_page_ext_init(pgdat); 5833 pgdat_page_ext_init(pgdat);
5834 spin_lock_init(&pgdat->lru_lock);
5835 lruvec_init(node_lruvec(pgdat));
5912 5836
5913 for (j = 0; j < MAX_NR_ZONES; j++) { 5837 for (j = 0; j < MAX_NR_ZONES; j++) {
5914 struct zone *zone = pgdat->node_zones + j; 5838 struct zone *zone = pgdat->node_zones + j;
@@ -5958,21 +5882,16 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat)
5958 zone->managed_pages = is_highmem_idx(j) ? realsize : freesize; 5882 zone->managed_pages = is_highmem_idx(j) ? realsize : freesize;
5959#ifdef CONFIG_NUMA 5883#ifdef CONFIG_NUMA
5960 zone->node = nid; 5884 zone->node = nid;
5961 zone->min_unmapped_pages = (freesize*sysctl_min_unmapped_ratio) 5885 pgdat->min_unmapped_pages += (freesize*sysctl_min_unmapped_ratio)
5962 / 100; 5886 / 100;
5963 zone->min_slab_pages = (freesize * sysctl_min_slab_ratio) / 100; 5887 pgdat->min_slab_pages += (freesize * sysctl_min_slab_ratio) / 100;
5964#endif 5888#endif
5965 zone->name = zone_names[j]; 5889 zone->name = zone_names[j];
5890 zone->zone_pgdat = pgdat;
5966 spin_lock_init(&zone->lock); 5891 spin_lock_init(&zone->lock);
5967 spin_lock_init(&zone->lru_lock);
5968 zone_seqlock_init(zone); 5892 zone_seqlock_init(zone);
5969 zone->zone_pgdat = pgdat;
5970 zone_pcp_init(zone); 5893 zone_pcp_init(zone);
5971 5894
5972 /* For bootup, initialized properly in watermark setup */
5973 mod_zone_page_state(zone, NR_ALLOC_BATCH, zone->managed_pages);
5974
5975 lruvec_init(&zone->lruvec);
5976 if (!size) 5895 if (!size)
5977 continue; 5896 continue;
5978 5897
@@ -6038,11 +5957,12 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
6038 unsigned long end_pfn = 0; 5957 unsigned long end_pfn = 0;
6039 5958
6040 /* pg_data_t should be reset to zero when it's allocated */ 5959 /* pg_data_t should be reset to zero when it's allocated */
6041 WARN_ON(pgdat->nr_zones || pgdat->classzone_idx); 5960 WARN_ON(pgdat->nr_zones || pgdat->kswapd_classzone_idx);
6042 5961
6043 reset_deferred_meminit(pgdat); 5962 reset_deferred_meminit(pgdat);
6044 pgdat->node_id = nid; 5963 pgdat->node_id = nid;
6045 pgdat->node_start_pfn = node_start_pfn; 5964 pgdat->node_start_pfn = node_start_pfn;
5965 pgdat->per_cpu_nodestats = NULL;
6046#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP 5966#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
6047 get_pfn_range_for_nid(nid, &start_pfn, &end_pfn); 5967 get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
6048 pr_info("Initmem setup node %d [mem %#018Lx-%#018Lx]\n", nid, 5968 pr_info("Initmem setup node %d [mem %#018Lx-%#018Lx]\n", nid,
@@ -6699,6 +6619,9 @@ static void calculate_totalreserve_pages(void)
6699 enum zone_type i, j; 6619 enum zone_type i, j;
6700 6620
6701 for_each_online_pgdat(pgdat) { 6621 for_each_online_pgdat(pgdat) {
6622
6623 pgdat->totalreserve_pages = 0;
6624
6702 for (i = 0; i < MAX_NR_ZONES; i++) { 6625 for (i = 0; i < MAX_NR_ZONES; i++) {
6703 struct zone *zone = pgdat->node_zones + i; 6626 struct zone *zone = pgdat->node_zones + i;
6704 long max = 0; 6627 long max = 0;
@@ -6715,7 +6638,7 @@ static void calculate_totalreserve_pages(void)
6715 if (max > zone->managed_pages) 6638 if (max > zone->managed_pages)
6716 max = zone->managed_pages; 6639 max = zone->managed_pages;
6717 6640
6718 zone->totalreserve_pages = max; 6641 pgdat->totalreserve_pages += max;
6719 6642
6720 reserve_pages += max; 6643 reserve_pages += max;
6721 } 6644 }
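calculate_totalreserve_pages() still derives each zone's contribution from the largest lowmem_reserve entry plus the high watermark, capped at the zone's managed pages, but the sum now lands in a node-wide totalreserve_pages. A sketch with invented numbers (toy_* names are not kernel identifiers):

#include <stdio.h>

#define MAX_NR_ZONES 3

struct toy_zone {
    unsigned long managed_pages;
    unsigned long lowmem_reserve[MAX_NR_ZONES];
    unsigned long high_wmark;
};

/* Per-node accumulation of each zone's reserve contribution. */
static unsigned long toy_node_totalreserve(const struct toy_zone *zones, int nr)
{
    unsigned long total = 0;
    int i, j;

    for (i = 0; i < nr; i++) {
        unsigned long max = 0;

        for (j = 0; j < MAX_NR_ZONES; j++)
            if (zones[i].lowmem_reserve[j] > max)
                max = zones[i].lowmem_reserve[j];

        max += zones[i].high_wmark;
        if (max > zones[i].managed_pages)
            max = zones[i].managed_pages;

        total += max;
    }
    return total;
}

int main(void)
{
    struct toy_zone zones[2] = {
        { 100000, { 0, 2000, 4000 }, 1500 },
        {  50000, { 0,    0, 1000 },  800 },
    };

    printf("node totalreserve: %lu pages\n",
           toy_node_totalreserve(zones, 2));
    return 0;
}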
@@ -6816,10 +6739,6 @@ static void __setup_per_zone_wmarks(void)
6816 zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + tmp; 6739 zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + tmp;
6817 zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + tmp * 2; 6740 zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + tmp * 2;
6818 6741
6819 __mod_zone_page_state(zone, NR_ALLOC_BATCH,
6820 high_wmark_pages(zone) - low_wmark_pages(zone) -
6821 atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH]));
6822
6823 spin_unlock_irqrestore(&zone->lock, flags); 6742 spin_unlock_irqrestore(&zone->lock, flags);
6824 } 6743 }
6825 6744
@@ -6930,6 +6849,7 @@ int watermark_scale_factor_sysctl_handler(struct ctl_table *table, int write,
6930int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *table, int write, 6849int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *table, int write,
6931 void __user *buffer, size_t *length, loff_t *ppos) 6850 void __user *buffer, size_t *length, loff_t *ppos)
6932{ 6851{
6852 struct pglist_data *pgdat;
6933 struct zone *zone; 6853 struct zone *zone;
6934 int rc; 6854 int rc;
6935 6855
@@ -6937,8 +6857,11 @@ int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *table, int write,
6937 if (rc) 6857 if (rc)
6938 return rc; 6858 return rc;
6939 6859
6860 for_each_online_pgdat(pgdat)
6861		pgdat->min_unmapped_pages = 0;
6862
6940 for_each_zone(zone) 6863 for_each_zone(zone)
6941 zone->min_unmapped_pages = (zone->managed_pages * 6864 zone->zone_pgdat->min_unmapped_pages += (zone->managed_pages *
6942 sysctl_min_unmapped_ratio) / 100; 6865 sysctl_min_unmapped_ratio) / 100;
6943 return 0; 6866 return 0;
6944} 6867}
@@ -6946,6 +6869,7 @@ int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *table, int write,
6946int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *table, int write, 6869int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *table, int write,
6947 void __user *buffer, size_t *length, loff_t *ppos) 6870 void __user *buffer, size_t *length, loff_t *ppos)
6948{ 6871{
6872 struct pglist_data *pgdat;
6949 struct zone *zone; 6873 struct zone *zone;
6950 int rc; 6874 int rc;
6951 6875
@@ -6953,8 +6877,11 @@ int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *table, int write,
6953 if (rc) 6877 if (rc)
6954 return rc; 6878 return rc;
6955 6879
6880 for_each_online_pgdat(pgdat)
6881 pgdat->min_slab_pages = 0;
6882
6956 for_each_zone(zone) 6883 for_each_zone(zone)
6957 zone->min_slab_pages = (zone->managed_pages * 6884 zone->zone_pgdat->min_slab_pages += (zone->managed_pages *
6958 sysctl_min_slab_ratio) / 100; 6885 sysctl_min_slab_ratio) / 100;
6959 return 0; 6886 return 0;
6960} 6887}
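
The two sysctl handlers above show the zone-to-node move in miniature: the threshold now lives on the pgdat, so the handler zeroes every node's total and then adds each zone's share back in, just as free_area_init_core() does at boot in the first hunk of this file. A minimal userspace sketch of that reset-then-accumulate pattern; the struct names and the 1% ratio are illustrative, not the kernel's API.

#include <stdio.h>

/* Illustrative miniatures of pgdat/zone; not the kernel structures. */
struct node_info { unsigned long min_unmapped_pages; };
struct zone_info { unsigned long managed_pages; struct node_info *node; };

/* Recompute a per-node threshold the way the handlers above do:
 * zero the node totals first, then accumulate each zone's contribution. */
static void recompute_threshold(struct node_info *nodes, int nr_nodes,
				struct zone_info *zones, int nr_zones,
				unsigned int ratio_percent)
{
	int i;

	for (i = 0; i < nr_nodes; i++)
		nodes[i].min_unmapped_pages = 0;

	for (i = 0; i < nr_zones; i++)
		zones[i].node->min_unmapped_pages +=
			zones[i].managed_pages * ratio_percent / 100;
}

int main(void)
{
	struct node_info node = { 0 };
	struct zone_info zones[] = {
		{ .managed_pages = 262144,  .node = &node },	/* smaller zone */
		{ .managed_pages = 1048576, .node = &node },	/* larger zone  */
	};

	recompute_threshold(&node, 1, zones, 2, 1);
	printf("min_unmapped_pages = %lu\n", node.min_unmapped_pages);
	return 0;
}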
diff --git a/mm/page_idle.c b/mm/page_idle.c
index 4ea9c4ef5146..ae11aa914e55 100644
--- a/mm/page_idle.c
+++ b/mm/page_idle.c
@@ -41,12 +41,12 @@ static struct page *page_idle_get_page(unsigned long pfn)
41 return NULL; 41 return NULL;
42 42
43 zone = page_zone(page); 43 zone = page_zone(page);
44 spin_lock_irq(&zone->lru_lock); 44 spin_lock_irq(zone_lru_lock(zone));
45 if (unlikely(!PageLRU(page))) { 45 if (unlikely(!PageLRU(page))) {
46 put_page(page); 46 put_page(page);
47 page = NULL; 47 page = NULL;
48 } 48 }
49 spin_unlock_irq(&zone->lru_lock); 49 spin_unlock_irq(zone_lru_lock(zone));
50 return page; 50 return page;
51} 51}
52 52
diff --git a/mm/page_io.c b/mm/page_io.c
index dcc5d3769608..fb1fa269d3a0 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -166,6 +166,8 @@ int generic_swapfile_activate(struct swap_info_struct *sis,
166 unsigned block_in_page; 166 unsigned block_in_page;
167 sector_t first_block; 167 sector_t first_block;
168 168
169 cond_resched();
170
169 first_block = bmap(inode, probe_block); 171 first_block = bmap(inode, probe_block);
170 if (first_block == 0) 172 if (first_block == 0)
171 goto bad_bmap; 173 goto bad_bmap;
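
The single change above drops a cond_resched() into the per-block loop so that activating a very large swapfile yields the CPU between bmap() probes instead of monopolising it. A userspace analogue of the same courtesy using sched_yield(); the probe function and block count are made up for illustration.

#include <sched.h>
#include <stdio.h>

/* Stand-in for the per-block bmap() probe in the loop above. */
static unsigned long probe_block(unsigned long block)
{
	return block * 8 + 1;	/* pretend mapping, never 0 here */
}

int main(void)
{
	unsigned long block, mapped = 0;
	const unsigned long nr_blocks = 1UL << 20;

	for (block = 0; block < nr_blocks; block++) {
		/* Yield periodically so a long scan does not hog the CPU,
		 * mirroring the cond_resched() added in the hunk above. */
		if ((block & 0xffff) == 0)
			sched_yield();

		if (probe_block(block) == 0)
			break;
		mapped++;
	}

	printf("mapped %lu blocks\n", mapped);
	return 0;
}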
diff --git a/mm/rmap.c b/mm/rmap.c
index 8a13d9f7b566..709bc83703b1 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -27,7 +27,7 @@
27 * mapping->i_mmap_rwsem 27 * mapping->i_mmap_rwsem
28 * anon_vma->rwsem 28 * anon_vma->rwsem
29 * mm->page_table_lock or pte_lock 29 * mm->page_table_lock or pte_lock
30 * zone->lru_lock (in mark_page_accessed, isolate_lru_page) 30 * zone_lru_lock (in mark_page_accessed, isolate_lru_page)
31 * swap_lock (in swap_duplicate, swap_info_get) 31 * swap_lock (in swap_duplicate, swap_info_get)
32 * mmlist_lock (in mmput, drain_mmlist and others) 32 * mmlist_lock (in mmput, drain_mmlist and others)
33 * mapping->private_lock (in __set_page_dirty_buffers) 33 * mapping->private_lock (in __set_page_dirty_buffers)
@@ -1213,8 +1213,8 @@ void do_page_add_anon_rmap(struct page *page,
1213 * disabled. 1213 * disabled.
1214 */ 1214 */
1215 if (compound) 1215 if (compound)
1216 __inc_zone_page_state(page, NR_ANON_THPS); 1216 __inc_node_page_state(page, NR_ANON_THPS);
1217 __mod_zone_page_state(page_zone(page), NR_ANON_PAGES, nr); 1217 __mod_node_page_state(page_pgdat(page), NR_ANON_MAPPED, nr);
1218 } 1218 }
1219 if (unlikely(PageKsm(page))) 1219 if (unlikely(PageKsm(page)))
1220 return; 1220 return;
@@ -1251,14 +1251,14 @@ void page_add_new_anon_rmap(struct page *page,
1251 VM_BUG_ON_PAGE(!PageTransHuge(page), page); 1251 VM_BUG_ON_PAGE(!PageTransHuge(page), page);
1252 /* increment count (starts at -1) */ 1252 /* increment count (starts at -1) */
1253 atomic_set(compound_mapcount_ptr(page), 0); 1253 atomic_set(compound_mapcount_ptr(page), 0);
1254 __inc_zone_page_state(page, NR_ANON_THPS); 1254 __inc_node_page_state(page, NR_ANON_THPS);
1255 } else { 1255 } else {
1256 /* Anon THP always mapped first with PMD */ 1256 /* Anon THP always mapped first with PMD */
1257 VM_BUG_ON_PAGE(PageTransCompound(page), page); 1257 VM_BUG_ON_PAGE(PageTransCompound(page), page);
1258 /* increment count (starts at -1) */ 1258 /* increment count (starts at -1) */
1259 atomic_set(&page->_mapcount, 0); 1259 atomic_set(&page->_mapcount, 0);
1260 } 1260 }
1261 __mod_zone_page_state(page_zone(page), NR_ANON_PAGES, nr); 1261 __mod_node_page_state(page_pgdat(page), NR_ANON_MAPPED, nr);
1262 __page_set_anon_rmap(page, vma, address, 1); 1262 __page_set_anon_rmap(page, vma, address, 1);
1263} 1263}
1264 1264
@@ -1282,7 +1282,7 @@ void page_add_file_rmap(struct page *page, bool compound)
1282 if (!atomic_inc_and_test(compound_mapcount_ptr(page))) 1282 if (!atomic_inc_and_test(compound_mapcount_ptr(page)))
1283 goto out; 1283 goto out;
1284 VM_BUG_ON_PAGE(!PageSwapBacked(page), page); 1284 VM_BUG_ON_PAGE(!PageSwapBacked(page), page);
1285 __inc_zone_page_state(page, NR_SHMEM_PMDMAPPED); 1285 __inc_node_page_state(page, NR_SHMEM_PMDMAPPED);
1286 } else { 1286 } else {
1287 if (PageTransCompound(page)) { 1287 if (PageTransCompound(page)) {
1288 VM_BUG_ON_PAGE(!PageLocked(page), page); 1288 VM_BUG_ON_PAGE(!PageLocked(page), page);
@@ -1293,7 +1293,7 @@ void page_add_file_rmap(struct page *page, bool compound)
1293 if (!atomic_inc_and_test(&page->_mapcount)) 1293 if (!atomic_inc_and_test(&page->_mapcount))
1294 goto out; 1294 goto out;
1295 } 1295 }
1296 __mod_zone_page_state(page_zone(page), NR_FILE_MAPPED, nr); 1296 __mod_node_page_state(page_pgdat(page), NR_FILE_MAPPED, nr);
1297 mem_cgroup_inc_page_stat(page, MEM_CGROUP_STAT_FILE_MAPPED); 1297 mem_cgroup_inc_page_stat(page, MEM_CGROUP_STAT_FILE_MAPPED);
1298out: 1298out:
1299 unlock_page_memcg(page); 1299 unlock_page_memcg(page);
@@ -1322,18 +1322,18 @@ static void page_remove_file_rmap(struct page *page, bool compound)
1322 if (!atomic_add_negative(-1, compound_mapcount_ptr(page))) 1322 if (!atomic_add_negative(-1, compound_mapcount_ptr(page)))
1323 goto out; 1323 goto out;
1324 VM_BUG_ON_PAGE(!PageSwapBacked(page), page); 1324 VM_BUG_ON_PAGE(!PageSwapBacked(page), page);
1325 __dec_zone_page_state(page, NR_SHMEM_PMDMAPPED); 1325 __dec_node_page_state(page, NR_SHMEM_PMDMAPPED);
1326 } else { 1326 } else {
1327 if (!atomic_add_negative(-1, &page->_mapcount)) 1327 if (!atomic_add_negative(-1, &page->_mapcount))
1328 goto out; 1328 goto out;
1329 } 1329 }
1330 1330
1331 /* 1331 /*
1332 * We use the irq-unsafe __{inc|mod}_zone_page_state because 1332 * We use the irq-unsafe __{inc|mod}_zone_page_state because
1333 * these counters are not modified in interrupt context, and 1333 * these counters are not modified in interrupt context, and
1334 * pte lock(a spinlock) is held, which implies preemption disabled. 1334 * pte lock(a spinlock) is held, which implies preemption disabled.
1335 */ 1335 */
1336 __mod_zone_page_state(page_zone(page), NR_FILE_MAPPED, -nr); 1336 __mod_node_page_state(page_pgdat(page), NR_FILE_MAPPED, -nr);
1337 mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_FILE_MAPPED); 1337 mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_FILE_MAPPED);
1338 1338
1339 if (unlikely(PageMlocked(page))) 1339 if (unlikely(PageMlocked(page)))
@@ -1356,7 +1356,7 @@ static void page_remove_anon_compound_rmap(struct page *page)
1356 if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) 1356 if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
1357 return; 1357 return;
1358 1358
1359 __dec_zone_page_state(page, NR_ANON_THPS); 1359 __dec_node_page_state(page, NR_ANON_THPS);
1360 1360
1361 if (TestClearPageDoubleMap(page)) { 1361 if (TestClearPageDoubleMap(page)) {
1362 /* 1362 /*
@@ -1375,7 +1375,7 @@ static void page_remove_anon_compound_rmap(struct page *page)
1375 clear_page_mlock(page); 1375 clear_page_mlock(page);
1376 1376
1377 if (nr) { 1377 if (nr) {
1378 __mod_zone_page_state(page_zone(page), NR_ANON_PAGES, -nr); 1378 __mod_node_page_state(page_pgdat(page), NR_ANON_MAPPED, -nr);
1379 deferred_split_huge_page(page); 1379 deferred_split_huge_page(page);
1380 } 1380 }
1381} 1381}
@@ -1404,7 +1404,7 @@ void page_remove_rmap(struct page *page, bool compound)
1404 * these counters are not modified in interrupt context, and 1404 * these counters are not modified in interrupt context, and
1405 * pte lock(a spinlock) is held, which implies preemption disabled. 1405 * pte lock(a spinlock) is held, which implies preemption disabled.
1406 */ 1406 */
1407 __dec_zone_page_state(page, NR_ANON_PAGES); 1407 __dec_node_page_state(page, NR_ANON_MAPPED);
1408 1408
1409 if (unlikely(PageMlocked(page))) 1409 if (unlikely(PageMlocked(page)))
1410 clear_page_mlock(page); 1410 clear_page_mlock(page);
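
Every __mod_zone_page_state(page_zone(page), ...) call in this file now becomes __mod_node_page_state(page_pgdat(page), ...): the mapped and anon counters turn into per-node quantities shared by all zones on that node. A toy sketch of keeping such counters per node rather than per zone; the node ids and the counter enum are invented for the example.

#include <stdio.h>

enum node_stat_item { NR_FILE_MAPPED_DEMO, NR_ANON_MAPPED_DEMO, NR_STATS_DEMO };

#define MAX_NODES_DEMO 4

/* One counter array per node; zones on the same node share these. */
static long node_stats[MAX_NODES_DEMO][NR_STATS_DEMO];

static void mod_node_stat(int nid, enum node_stat_item item, long delta)
{
	node_stats[nid][item] += delta;
}

int main(void)
{
	/* Two zones on node 0 account into the same node counter,
	 * which is the point of the page_zone() -> page_pgdat() switch. */
	mod_node_stat(0, NR_FILE_MAPPED_DEMO, 32);	/* e.g. from zone Normal */
	mod_node_stat(0, NR_FILE_MAPPED_DEMO, 16);	/* e.g. from zone DMA32  */
	mod_node_stat(0, NR_FILE_MAPPED_DEMO, -8);	/* unmap some pages      */

	printf("node 0 NR_FILE_MAPPED = %ld\n", node_stats[0][NR_FILE_MAPPED_DEMO]);
	return 0;
}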
diff --git a/mm/shmem.c b/mm/shmem.c
index 62e42c7d544c..2ac19a61d565 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -575,9 +575,9 @@ static int shmem_add_to_page_cache(struct page *page,
575 if (!error) { 575 if (!error) {
576 mapping->nrpages += nr; 576 mapping->nrpages += nr;
577 if (PageTransHuge(page)) 577 if (PageTransHuge(page))
578 __inc_zone_page_state(page, NR_SHMEM_THPS); 578 __inc_node_page_state(page, NR_SHMEM_THPS);
579 __mod_zone_page_state(page_zone(page), NR_FILE_PAGES, nr); 579 __mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, nr);
580 __mod_zone_page_state(page_zone(page), NR_SHMEM, nr); 580 __mod_node_page_state(page_pgdat(page), NR_SHMEM, nr);
581 spin_unlock_irq(&mapping->tree_lock); 581 spin_unlock_irq(&mapping->tree_lock);
582 } else { 582 } else {
583 page->mapping = NULL; 583 page->mapping = NULL;
@@ -601,8 +601,8 @@ static void shmem_delete_from_page_cache(struct page *page, void *radswap)
601 error = shmem_radix_tree_replace(mapping, page->index, page, radswap); 601 error = shmem_radix_tree_replace(mapping, page->index, page, radswap);
602 page->mapping = NULL; 602 page->mapping = NULL;
603 mapping->nrpages--; 603 mapping->nrpages--;
604 __dec_zone_page_state(page, NR_FILE_PAGES); 604 __dec_node_page_state(page, NR_FILE_PAGES);
605 __dec_zone_page_state(page, NR_SHMEM); 605 __dec_node_page_state(page, NR_SHMEM);
606 spin_unlock_irq(&mapping->tree_lock); 606 spin_unlock_irq(&mapping->tree_lock);
607 put_page(page); 607 put_page(page);
608 BUG_ON(error); 608 BUG_ON(error);
@@ -1493,8 +1493,8 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp,
1493 error = shmem_radix_tree_replace(swap_mapping, swap_index, oldpage, 1493 error = shmem_radix_tree_replace(swap_mapping, swap_index, oldpage,
1494 newpage); 1494 newpage);
1495 if (!error) { 1495 if (!error) {
1496 __inc_zone_page_state(newpage, NR_FILE_PAGES); 1496 __inc_node_page_state(newpage, NR_FILE_PAGES);
1497 __dec_zone_page_state(oldpage, NR_FILE_PAGES); 1497 __dec_node_page_state(oldpage, NR_FILE_PAGES);
1498 } 1498 }
1499 spin_unlock_irq(&swap_mapping->tree_lock); 1499 spin_unlock_irq(&swap_mapping->tree_lock);
1500 1500
diff --git a/mm/slab.h b/mm/slab.h
index f33980ab0406..9653f2e2591a 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -369,6 +369,8 @@ static inline size_t slab_ksize(const struct kmem_cache *s)
369 if (s->flags & (SLAB_RED_ZONE | SLAB_POISON)) 369 if (s->flags & (SLAB_RED_ZONE | SLAB_POISON))
370 return s->object_size; 370 return s->object_size;
371# endif 371# endif
372 if (s->flags & SLAB_KASAN)
373 return s->object_size;
372 /* 374 /*
373 * If we have the need to store the freelist pointer 375 * If we have the need to store the freelist pointer
374 * back there or track user information then we can 376 * back there or track user information then we can
diff --git a/mm/slub.c b/mm/slub.c
index f9da8716b8b3..74e7c8c30db8 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -124,7 +124,7 @@ static inline int kmem_cache_debug(struct kmem_cache *s)
124#endif 124#endif
125} 125}
126 126
127static inline void *fixup_red_left(struct kmem_cache *s, void *p) 127inline void *fixup_red_left(struct kmem_cache *s, void *p)
128{ 128{
129 if (kmem_cache_debug(s) && s->flags & SLAB_RED_ZONE) 129 if (kmem_cache_debug(s) && s->flags & SLAB_RED_ZONE)
130 p += s->red_left_pad; 130 p += s->red_left_pad;
@@ -454,8 +454,6 @@ static inline void *restore_red_left(struct kmem_cache *s, void *p)
454 */ 454 */
455#if defined(CONFIG_SLUB_DEBUG_ON) 455#if defined(CONFIG_SLUB_DEBUG_ON)
456static int slub_debug = DEBUG_DEFAULT_FLAGS; 456static int slub_debug = DEBUG_DEFAULT_FLAGS;
457#elif defined(CONFIG_KASAN)
458static int slub_debug = SLAB_STORE_USER;
459#else 457#else
460static int slub_debug; 458static int slub_debug;
461#endif 459#endif
@@ -660,6 +658,8 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p)
660 if (s->flags & SLAB_STORE_USER) 658 if (s->flags & SLAB_STORE_USER)
661 off += 2 * sizeof(struct track); 659 off += 2 * sizeof(struct track);
662 660
661 off += kasan_metadata_size(s);
662
663 if (off != size_from_object(s)) 663 if (off != size_from_object(s))
664 /* Beginning of the filler is the free pointer */ 664 /* Beginning of the filler is the free pointer */
665 print_section("Padding ", p + off, size_from_object(s) - off); 665 print_section("Padding ", p + off, size_from_object(s) - off);
@@ -787,6 +787,8 @@ static int check_pad_bytes(struct kmem_cache *s, struct page *page, u8 *p)
787 /* We also have user information there */ 787 /* We also have user information there */
788 off += 2 * sizeof(struct track); 788 off += 2 * sizeof(struct track);
789 789
790 off += kasan_metadata_size(s);
791
790 if (size_from_object(s) == off) 792 if (size_from_object(s) == off)
791 return 1; 793 return 1;
792 794
@@ -1322,8 +1324,10 @@ static inline void kfree_hook(const void *x)
1322 kasan_kfree_large(x); 1324 kasan_kfree_large(x);
1323} 1325}
1324 1326
1325static inline void slab_free_hook(struct kmem_cache *s, void *x) 1327static inline void *slab_free_hook(struct kmem_cache *s, void *x)
1326{ 1328{
1329 void *freeptr;
1330
1327 kmemleak_free_recursive(x, s->flags); 1331 kmemleak_free_recursive(x, s->flags);
1328 1332
1329 /* 1333 /*
@@ -1344,7 +1348,13 @@ static inline void slab_free_hook(struct kmem_cache *s, void *x)
1344 if (!(s->flags & SLAB_DEBUG_OBJECTS)) 1348 if (!(s->flags & SLAB_DEBUG_OBJECTS))
1345 debug_check_no_obj_freed(x, s->object_size); 1349 debug_check_no_obj_freed(x, s->object_size);
1346 1350
1351 freeptr = get_freepointer(s, x);
1352 /*
1353 * kasan_slab_free() may put x into memory quarantine, delaying its
1354 * reuse. In this case the object's freelist pointer is changed.
1355 */
1347 kasan_slab_free(s, x); 1356 kasan_slab_free(s, x);
1357 return freeptr;
1348} 1358}
1349 1359
1350static inline void slab_free_freelist_hook(struct kmem_cache *s, 1360static inline void slab_free_freelist_hook(struct kmem_cache *s,
@@ -1362,11 +1372,11 @@ static inline void slab_free_freelist_hook(struct kmem_cache *s,
1362 1372
1363 void *object = head; 1373 void *object = head;
1364 void *tail_obj = tail ? : head; 1374 void *tail_obj = tail ? : head;
1375 void *freeptr;
1365 1376
1366 do { 1377 do {
1367 slab_free_hook(s, object); 1378 freeptr = slab_free_hook(s, object);
1368 } while ((object != tail_obj) && 1379 } while ((object != tail_obj) && (object = freeptr));
1369 (object = get_freepointer(s, object)));
1370#endif 1380#endif
1371} 1381}
1372 1382
@@ -2878,16 +2888,13 @@ slab_empty:
2878 * same page) possible by specifying head and tail ptr, plus objects 2888 * same page) possible by specifying head and tail ptr, plus objects
2879 * count (cnt). Bulk free indicated by tail pointer being set. 2889 * count (cnt). Bulk free indicated by tail pointer being set.
2880 */ 2890 */
2881static __always_inline void slab_free(struct kmem_cache *s, struct page *page, 2891static __always_inline void do_slab_free(struct kmem_cache *s,
2882 void *head, void *tail, int cnt, 2892 struct page *page, void *head, void *tail,
2883 unsigned long addr) 2893 int cnt, unsigned long addr)
2884{ 2894{
2885 void *tail_obj = tail ? : head; 2895 void *tail_obj = tail ? : head;
2886 struct kmem_cache_cpu *c; 2896 struct kmem_cache_cpu *c;
2887 unsigned long tid; 2897 unsigned long tid;
2888
2889 slab_free_freelist_hook(s, head, tail);
2890
2891redo: 2898redo:
2892 /* 2899 /*
2893 * Determine the currently cpus per cpu slab. 2900 * Determine the currently cpus per cpu slab.
@@ -2921,6 +2928,27 @@ redo:
2921 2928
2922} 2929}
2923 2930
2931static __always_inline void slab_free(struct kmem_cache *s, struct page *page,
2932 void *head, void *tail, int cnt,
2933 unsigned long addr)
2934{
2935 slab_free_freelist_hook(s, head, tail);
2936 /*
2937 * slab_free_freelist_hook() could have put the items into quarantine.
2938 * If so, no need to free them.
2939 */
2940 if (s->flags & SLAB_KASAN && !(s->flags & SLAB_DESTROY_BY_RCU))
2941 return;
2942 do_slab_free(s, page, head, tail, cnt, addr);
2943}
2944
2945#ifdef CONFIG_KASAN
2946void ___cache_free(struct kmem_cache *cache, void *x, unsigned long addr)
2947{
2948 do_slab_free(cache, virt_to_head_page(x), x, NULL, 1, addr);
2949}
2950#endif
2951
2924void kmem_cache_free(struct kmem_cache *s, void *x) 2952void kmem_cache_free(struct kmem_cache *s, void *x)
2925{ 2953{
2926 s = cache_from_obj(s, x); 2954 s = cache_from_obj(s, x);
@@ -3363,7 +3391,7 @@ static void set_min_partial(struct kmem_cache *s, unsigned long min)
3363static int calculate_sizes(struct kmem_cache *s, int forced_order) 3391static int calculate_sizes(struct kmem_cache *s, int forced_order)
3364{ 3392{
3365 unsigned long flags = s->flags; 3393 unsigned long flags = s->flags;
3366 unsigned long size = s->object_size; 3394 size_t size = s->object_size;
3367 int order; 3395 int order;
3368 3396
3369 /* 3397 /*
@@ -3422,7 +3450,10 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order)
3422 * the object. 3450 * the object.
3423 */ 3451 */
3424 size += 2 * sizeof(struct track); 3452 size += 2 * sizeof(struct track);
3453#endif
3425 3454
3455 kasan_cache_create(s, &size, &s->flags);
3456#ifdef CONFIG_SLUB_DEBUG
3426 if (flags & SLAB_RED_ZONE) { 3457 if (flags & SLAB_RED_ZONE) {
3427 /* 3458 /*
3428 * Add some empty padding so that we can catch 3459 * Add some empty padding so that we can catch
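
The slab_free()/do_slab_free() split above exists because kasan_slab_free() may park the object in a quarantine instead of returning it to the freelist: slab_free_hook() caches the free pointer before that call, slab_free() skips the real free for SLAB_KASAN caches, and ___cache_free() gives KASAN a way to perform the actual free once an object leaves quarantine. A rough userspace sketch of the defer-then-drain idea with a fixed-size ring and plain free(); this is not the kernel's quarantine implementation.

#include <stdlib.h>
#include <stdio.h>

#define QUARANTINE_SLOTS 8

/* A tiny FIFO quarantine: objects sit here for a while before the
 * real free, giving use-after-free bugs a window to be detected. */
static void *quarantine[QUARANTINE_SLOTS];
static unsigned int q_head;

static void quarantine_put(void *obj)
{
	void **slot = &quarantine[q_head++ % QUARANTINE_SLOTS];

	if (*slot)
		free(*slot);	/* drain the oldest entry to make room */
	*slot = obj;
}

/* Analogue of slab_free(): defer to the quarantine instead of freeing. */
static void demo_cache_free(void *obj)
{
	quarantine_put(obj);
}

int main(void)
{
	int i;

	for (i = 0; i < 32; i++)
		demo_cache_free(malloc(64));

	/* Final drain, loosely what happens when a cache is destroyed. */
	for (i = 0; i < QUARANTINE_SLOTS; i++)
		free(quarantine[i]);

	printf("drained quarantine\n");
	return 0;
}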
diff --git a/mm/sparse.c b/mm/sparse.c
index 5d0cf4540364..36d7bbb80e49 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -100,11 +100,7 @@ static inline int sparse_index_init(unsigned long section_nr, int nid)
100} 100}
101#endif 101#endif
102 102
103/* 103#ifdef CONFIG_SPARSEMEM_EXTREME
104 * Although written for the SPARSEMEM_EXTREME case, this happens
105 * to also work for the flat array case because
106 * NR_SECTION_ROOTS==NR_MEM_SECTIONS.
107 */
108int __section_nr(struct mem_section* ms) 104int __section_nr(struct mem_section* ms)
109{ 105{
110 unsigned long root_nr; 106 unsigned long root_nr;
@@ -123,6 +119,12 @@ int __section_nr(struct mem_section* ms)
123 119
124 return (root_nr * SECTIONS_PER_ROOT) + (ms - root); 120 return (root_nr * SECTIONS_PER_ROOT) + (ms - root);
125} 121}
122#else
123int __section_nr(struct mem_section* ms)
124{
125 return (int)(ms - mem_section[0]);
126}
127#endif
126 128
127/* 129/*
128 * During early boot, before section_mem_map is used for an actual 130 * During early boot, before section_mem_map is used for an actual
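
With SPARSEMEM_EXTREME disabled, mem_section[] is a flat static array, so the new !EXTREME branch of __section_nr() reduces to pointer subtraction against the first element. The same arithmetic in standalone form; the array and element type below are placeholders, and the kernel's array is two-dimensional where this sketch collapses it to one.

#include <stdio.h>
#include <stddef.h>

struct demo_section { unsigned long section_mem_map; };

static struct demo_section demo_sections[64];

/* Index of a section in a flat array: pointer difference from the base,
 * which is what __section_nr() becomes without SPARSEMEM_EXTREME. */
static ptrdiff_t demo_section_nr(const struct demo_section *ms)
{
	return ms - &demo_sections[0];
}

int main(void)
{
	printf("section 17 -> nr %td\n", demo_section_nr(&demo_sections[17]));
	return 0;
}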
diff --git a/mm/swap.c b/mm/swap.c
index 616df4ddd870..75c63bb2a1da 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -62,12 +62,12 @@ static void __page_cache_release(struct page *page)
62 struct lruvec *lruvec; 62 struct lruvec *lruvec;
63 unsigned long flags; 63 unsigned long flags;
64 64
65 spin_lock_irqsave(&zone->lru_lock, flags); 65 spin_lock_irqsave(zone_lru_lock(zone), flags);
66 lruvec = mem_cgroup_page_lruvec(page, zone); 66 lruvec = mem_cgroup_page_lruvec(page, zone->zone_pgdat);
67 VM_BUG_ON_PAGE(!PageLRU(page), page); 67 VM_BUG_ON_PAGE(!PageLRU(page), page);
68 __ClearPageLRU(page); 68 __ClearPageLRU(page);
69 del_page_from_lru_list(page, lruvec, page_off_lru(page)); 69 del_page_from_lru_list(page, lruvec, page_off_lru(page));
70 spin_unlock_irqrestore(&zone->lru_lock, flags); 70 spin_unlock_irqrestore(zone_lru_lock(zone), flags);
71 } 71 }
72 mem_cgroup_uncharge(page); 72 mem_cgroup_uncharge(page);
73} 73}
@@ -179,26 +179,26 @@ static void pagevec_lru_move_fn(struct pagevec *pvec,
179 void *arg) 179 void *arg)
180{ 180{
181 int i; 181 int i;
182 struct zone *zone = NULL; 182 struct pglist_data *pgdat = NULL;
183 struct lruvec *lruvec; 183 struct lruvec *lruvec;
184 unsigned long flags = 0; 184 unsigned long flags = 0;
185 185
186 for (i = 0; i < pagevec_count(pvec); i++) { 186 for (i = 0; i < pagevec_count(pvec); i++) {
187 struct page *page = pvec->pages[i]; 187 struct page *page = pvec->pages[i];
188 struct zone *pagezone = page_zone(page); 188 struct pglist_data *pagepgdat = page_pgdat(page);
189 189
190 if (pagezone != zone) { 190 if (pagepgdat != pgdat) {
191 if (zone) 191 if (pgdat)
192 spin_unlock_irqrestore(&zone->lru_lock, flags); 192 spin_unlock_irqrestore(&pgdat->lru_lock, flags);
193 zone = pagezone; 193 pgdat = pagepgdat;
194 spin_lock_irqsave(&zone->lru_lock, flags); 194 spin_lock_irqsave(&pgdat->lru_lock, flags);
195 } 195 }
196 196
197 lruvec = mem_cgroup_page_lruvec(page, zone); 197 lruvec = mem_cgroup_page_lruvec(page, pgdat);
198 (*move_fn)(page, lruvec, arg); 198 (*move_fn)(page, lruvec, arg);
199 } 199 }
200 if (zone) 200 if (pgdat)
201 spin_unlock_irqrestore(&zone->lru_lock, flags); 201 spin_unlock_irqrestore(&pgdat->lru_lock, flags);
202 release_pages(pvec->pages, pvec->nr, pvec->cold); 202 release_pages(pvec->pages, pvec->nr, pvec->cold);
203 pagevec_reinit(pvec); 203 pagevec_reinit(pvec);
204} 204}
@@ -318,9 +318,9 @@ void activate_page(struct page *page)
318 struct zone *zone = page_zone(page); 318 struct zone *zone = page_zone(page);
319 319
320 page = compound_head(page); 320 page = compound_head(page);
321 spin_lock_irq(&zone->lru_lock); 321 spin_lock_irq(zone_lru_lock(zone));
322 __activate_page(page, mem_cgroup_page_lruvec(page, zone), NULL); 322 __activate_page(page, mem_cgroup_page_lruvec(page, zone->zone_pgdat), NULL);
323 spin_unlock_irq(&zone->lru_lock); 323 spin_unlock_irq(zone_lru_lock(zone));
324} 324}
325#endif 325#endif
326 326
@@ -445,16 +445,16 @@ void lru_cache_add(struct page *page)
445 */ 445 */
446void add_page_to_unevictable_list(struct page *page) 446void add_page_to_unevictable_list(struct page *page)
447{ 447{
448 struct zone *zone = page_zone(page); 448 struct pglist_data *pgdat = page_pgdat(page);
449 struct lruvec *lruvec; 449 struct lruvec *lruvec;
450 450
451 spin_lock_irq(&zone->lru_lock); 451 spin_lock_irq(&pgdat->lru_lock);
452 lruvec = mem_cgroup_page_lruvec(page, zone); 452 lruvec = mem_cgroup_page_lruvec(page, pgdat);
453 ClearPageActive(page); 453 ClearPageActive(page);
454 SetPageUnevictable(page); 454 SetPageUnevictable(page);
455 SetPageLRU(page); 455 SetPageLRU(page);
456 add_page_to_lru_list(page, lruvec, LRU_UNEVICTABLE); 456 add_page_to_lru_list(page, lruvec, LRU_UNEVICTABLE);
457 spin_unlock_irq(&zone->lru_lock); 457 spin_unlock_irq(&pgdat->lru_lock);
458} 458}
459 459
460/** 460/**
@@ -730,7 +730,7 @@ void release_pages(struct page **pages, int nr, bool cold)
730{ 730{
731 int i; 731 int i;
732 LIST_HEAD(pages_to_free); 732 LIST_HEAD(pages_to_free);
733 struct zone *zone = NULL; 733 struct pglist_data *locked_pgdat = NULL;
734 struct lruvec *lruvec; 734 struct lruvec *lruvec;
735 unsigned long uninitialized_var(flags); 735 unsigned long uninitialized_var(flags);
736 unsigned int uninitialized_var(lock_batch); 736 unsigned int uninitialized_var(lock_batch);
@@ -741,11 +741,11 @@ void release_pages(struct page **pages, int nr, bool cold)
741 /* 741 /*
742 * Make sure the IRQ-safe lock-holding time does not get 742 * Make sure the IRQ-safe lock-holding time does not get
743 * excessive with a continuous string of pages from the 743 * excessive with a continuous string of pages from the
744 * same zone. The lock is held only if zone != NULL. 744 * same pgdat. The lock is held only if pgdat != NULL.
745 */ 745 */
746 if (zone && ++lock_batch == SWAP_CLUSTER_MAX) { 746 if (locked_pgdat && ++lock_batch == SWAP_CLUSTER_MAX) {
747 spin_unlock_irqrestore(&zone->lru_lock, flags); 747 spin_unlock_irqrestore(&locked_pgdat->lru_lock, flags);
748 zone = NULL; 748 locked_pgdat = NULL;
749 } 749 }
750 750
751 if (is_huge_zero_page(page)) { 751 if (is_huge_zero_page(page)) {
@@ -758,27 +758,27 @@ void release_pages(struct page **pages, int nr, bool cold)
758 continue; 758 continue;
759 759
760 if (PageCompound(page)) { 760 if (PageCompound(page)) {
761 if (zone) { 761 if (locked_pgdat) {
762 spin_unlock_irqrestore(&zone->lru_lock, flags); 762 spin_unlock_irqrestore(&locked_pgdat->lru_lock, flags);
763 zone = NULL; 763 locked_pgdat = NULL;
764 } 764 }
765 __put_compound_page(page); 765 __put_compound_page(page);
766 continue; 766 continue;
767 } 767 }
768 768
769 if (PageLRU(page)) { 769 if (PageLRU(page)) {
770 struct zone *pagezone = page_zone(page); 770 struct pglist_data *pgdat = page_pgdat(page);
771 771
772 if (pagezone != zone) { 772 if (pgdat != locked_pgdat) {
773 if (zone) 773 if (locked_pgdat)
774 spin_unlock_irqrestore(&zone->lru_lock, 774 spin_unlock_irqrestore(&locked_pgdat->lru_lock,
775 flags); 775 flags);
776 lock_batch = 0; 776 lock_batch = 0;
777 zone = pagezone; 777 locked_pgdat = pgdat;
778 spin_lock_irqsave(&zone->lru_lock, flags); 778 spin_lock_irqsave(&locked_pgdat->lru_lock, flags);
779 } 779 }
780 780
781 lruvec = mem_cgroup_page_lruvec(page, zone); 781 lruvec = mem_cgroup_page_lruvec(page, locked_pgdat);
782 VM_BUG_ON_PAGE(!PageLRU(page), page); 782 VM_BUG_ON_PAGE(!PageLRU(page), page);
783 __ClearPageLRU(page); 783 __ClearPageLRU(page);
784 del_page_from_lru_list(page, lruvec, page_off_lru(page)); 784 del_page_from_lru_list(page, lruvec, page_off_lru(page));
@@ -789,8 +789,8 @@ void release_pages(struct page **pages, int nr, bool cold)
789 789
790 list_add(&page->lru, &pages_to_free); 790 list_add(&page->lru, &pages_to_free);
791 } 791 }
792 if (zone) 792 if (locked_pgdat)
793 spin_unlock_irqrestore(&zone->lru_lock, flags); 793 spin_unlock_irqrestore(&locked_pgdat->lru_lock, flags);
794 794
795 mem_cgroup_uncharge_list(&pages_to_free); 795 mem_cgroup_uncharge_list(&pages_to_free);
796 free_hot_cold_page_list(&pages_to_free, cold); 796 free_hot_cold_page_list(&pages_to_free, cold);
@@ -826,7 +826,7 @@ void lru_add_page_tail(struct page *page, struct page *page_tail,
826 VM_BUG_ON_PAGE(PageCompound(page_tail), page); 826 VM_BUG_ON_PAGE(PageCompound(page_tail), page);
827 VM_BUG_ON_PAGE(PageLRU(page_tail), page); 827 VM_BUG_ON_PAGE(PageLRU(page_tail), page);
828 VM_BUG_ON(NR_CPUS != 1 && 828 VM_BUG_ON(NR_CPUS != 1 &&
829 !spin_is_locked(&lruvec_zone(lruvec)->lru_lock)); 829 !spin_is_locked(&lruvec_pgdat(lruvec)->lru_lock));
830 830
831 if (!list) 831 if (!list)
832 SetPageLRU(page_tail); 832 SetPageLRU(page_tail);
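
pagevec_lru_move_fn() and release_pages() above keep their usual batching trick, now keyed on the owning node: hold that node's lru_lock across consecutive pages from the same node and only drop and retake it when the node changes. A simplified sketch of the pattern with pthread mutexes standing in for the per-node lru_lock; the page/node layout is invented.

#include <pthread.h>
#include <stdio.h>

#define NR_NODES 2

struct demo_node {
	pthread_mutex_t lru_lock;
	unsigned long nr_moved;
};

struct demo_page { int nid; };

static struct demo_node nodes[NR_NODES] = {
	{ PTHREAD_MUTEX_INITIALIZER, 0 },
	{ PTHREAD_MUTEX_INITIALIZER, 0 },
};

/* Process a batch of pages, re-taking the lock only when the owning
 * node changes -- the same shape as pagevec_lru_move_fn() above. */
static void move_batch(struct demo_page *pages, int nr)
{
	struct demo_node *locked = NULL;
	int i;

	for (i = 0; i < nr; i++) {
		struct demo_node *node = &nodes[pages[i].nid];

		if (node != locked) {
			if (locked)
				pthread_mutex_unlock(&locked->lru_lock);
			locked = node;
			pthread_mutex_lock(&locked->lru_lock);
		}
		locked->nr_moved++;
	}
	if (locked)
		pthread_mutex_unlock(&locked->lru_lock);
}

int main(void)
{
	struct demo_page batch[] = { {0}, {0}, {1}, {1}, {0} };

	move_batch(batch, 5);
	printf("node0 moved %lu, node1 moved %lu\n",
	       nodes[0].nr_moved, nodes[1].nr_moved);
	return 0;
}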
diff --git a/mm/swap_state.c b/mm/swap_state.c
index c99463ac02fb..c8310a37be3a 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -95,7 +95,7 @@ int __add_to_swap_cache(struct page *page, swp_entry_t entry)
95 entry.val, page); 95 entry.val, page);
96 if (likely(!error)) { 96 if (likely(!error)) {
97 address_space->nrpages++; 97 address_space->nrpages++;
98 __inc_zone_page_state(page, NR_FILE_PAGES); 98 __inc_node_page_state(page, NR_FILE_PAGES);
99 INC_CACHE_INFO(add_total); 99 INC_CACHE_INFO(add_total);
100 } 100 }
101 spin_unlock_irq(&address_space->tree_lock); 101 spin_unlock_irq(&address_space->tree_lock);
@@ -147,7 +147,7 @@ void __delete_from_swap_cache(struct page *page)
147 set_page_private(page, 0); 147 set_page_private(page, 0);
148 ClearPageSwapCache(page); 148 ClearPageSwapCache(page);
149 address_space->nrpages--; 149 address_space->nrpages--;
150 __dec_zone_page_state(page, NR_FILE_PAGES); 150 __dec_node_page_state(page, NR_FILE_PAGES);
151 INC_CACHE_INFO(del_total); 151 INC_CACHE_INFO(del_total);
152} 152}
153 153
diff --git a/mm/util.c b/mm/util.c
index 8d010ef2ce1c..662cddf914af 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -528,7 +528,7 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
528 528
529 if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) { 529 if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) {
530 free = global_page_state(NR_FREE_PAGES); 530 free = global_page_state(NR_FREE_PAGES);
531 free += global_page_state(NR_FILE_PAGES); 531 free += global_node_page_state(NR_FILE_PAGES);
532 532
533 /* 533 /*
534 * shmem pages shouldn't be counted as free in this 534 * shmem pages shouldn't be counted as free in this
@@ -536,7 +536,7 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
536 * that won't affect the overall amount of available 536 * that won't affect the overall amount of available
537 * memory in the system. 537 * memory in the system.
538 */ 538 */
539 free -= global_page_state(NR_SHMEM); 539 free -= global_node_page_state(NR_SHMEM);
540 540
541 free += get_nr_swap_pages(); 541 free += get_nr_swap_pages();
542 542
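
The __vm_enough_memory() hunk only changes which counter namespace the OVERCOMMIT_GUESS estimate reads from; the part of the arithmetic visible here stays free pages + file pages - shmem + free swap. A worked example of that portion of the estimate, with arbitrary counter values passed in directly rather than read from vmstat.

#include <stdio.h>

/* The visible slice of the OVERCOMMIT_GUESS estimate used above, with
 * the page counts passed in explicitly instead of read from counters. */
static long guess_available_pages(long free_pages, long file_pages,
				  long shmem_pages, long free_swap_pages)
{
	long free = free_pages;

	free += file_pages;	/* page cache is reclaimable...      */
	free -= shmem_pages;	/* ...except shmem, which needs swap */
	free += free_swap_pages;
	return free;
}

int main(void)
{
	/* e.g. 1 GiB free, 2 GiB cache, 256 MiB shmem, 512 MiB swap (4K pages) */
	printf("guess: %ld pages\n",
	       guess_available_pages(262144, 524288, 65536, 131072));
	return 0;
}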
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 21d417ccff69..650d26832569 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -84,6 +84,9 @@ struct scan_control {
84 /* Scan (total_size >> priority) pages at once */ 84 /* Scan (total_size >> priority) pages at once */
85 int priority; 85 int priority;
86 86
87 /* The highest zone to isolate pages for reclaim from */
88 enum zone_type reclaim_idx;
89
87 unsigned int may_writepage:1; 90 unsigned int may_writepage:1;
88 91
89 /* Can mapped pages be reclaimed? */ 92 /* Can mapped pages be reclaimed? */
@@ -191,26 +194,44 @@ static bool sane_reclaim(struct scan_control *sc)
191} 194}
192#endif 195#endif
193 196
197/*
198 * This misses isolated pages which are not accounted for to save counters.
199 * As the data only determines if reclaim or compaction continues, it is
200 * not expected that isolated pages will be a dominating factor.
201 */
194unsigned long zone_reclaimable_pages(struct zone *zone) 202unsigned long zone_reclaimable_pages(struct zone *zone)
195{ 203{
196 unsigned long nr; 204 unsigned long nr;
197 205
198 nr = zone_page_state_snapshot(zone, NR_ACTIVE_FILE) + 206 nr = zone_page_state_snapshot(zone, NR_ZONE_INACTIVE_FILE) +
199 zone_page_state_snapshot(zone, NR_INACTIVE_FILE) + 207 zone_page_state_snapshot(zone, NR_ZONE_ACTIVE_FILE);
200 zone_page_state_snapshot(zone, NR_ISOLATED_FILE); 208 if (get_nr_swap_pages() > 0)
209 nr += zone_page_state_snapshot(zone, NR_ZONE_INACTIVE_ANON) +
210 zone_page_state_snapshot(zone, NR_ZONE_ACTIVE_ANON);
211
212 return nr;
213}
214
215unsigned long pgdat_reclaimable_pages(struct pglist_data *pgdat)
216{
217 unsigned long nr;
218
219 nr = node_page_state_snapshot(pgdat, NR_ACTIVE_FILE) +
220 node_page_state_snapshot(pgdat, NR_INACTIVE_FILE) +
221 node_page_state_snapshot(pgdat, NR_ISOLATED_FILE);
201 222
202 if (get_nr_swap_pages() > 0) 223 if (get_nr_swap_pages() > 0)
203 nr += zone_page_state_snapshot(zone, NR_ACTIVE_ANON) + 224 nr += node_page_state_snapshot(pgdat, NR_ACTIVE_ANON) +
204 zone_page_state_snapshot(zone, NR_INACTIVE_ANON) + 225 node_page_state_snapshot(pgdat, NR_INACTIVE_ANON) +
205 zone_page_state_snapshot(zone, NR_ISOLATED_ANON); 226 node_page_state_snapshot(pgdat, NR_ISOLATED_ANON);
206 227
207 return nr; 228 return nr;
208} 229}
209 230
210bool zone_reclaimable(struct zone *zone) 231bool pgdat_reclaimable(struct pglist_data *pgdat)
211{ 232{
212 return zone_page_state_snapshot(zone, NR_PAGES_SCANNED) < 233 return node_page_state_snapshot(pgdat, NR_PAGES_SCANNED) <
213 zone_reclaimable_pages(zone) * 6; 234 pgdat_reclaimable_pages(pgdat) * 6;
214} 235}
215 236
216unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru) 237unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru)
@@ -218,7 +239,7 @@ unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru)
218 if (!mem_cgroup_disabled()) 239 if (!mem_cgroup_disabled())
219 return mem_cgroup_get_lru_size(lruvec, lru); 240 return mem_cgroup_get_lru_size(lruvec, lru);
220 241
221 return zone_page_state(lruvec_zone(lruvec), NR_LRU_BASE + lru); 242 return node_page_state(lruvec_pgdat(lruvec), NR_LRU_BASE + lru);
222} 243}
223 244
224/* 245/*
@@ -593,7 +614,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,
593 ClearPageReclaim(page); 614 ClearPageReclaim(page);
594 } 615 }
595 trace_mm_vmscan_writepage(page); 616 trace_mm_vmscan_writepage(page);
596 inc_zone_page_state(page, NR_VMSCAN_WRITE); 617 inc_node_page_state(page, NR_VMSCAN_WRITE);
597 return PAGE_SUCCESS; 618 return PAGE_SUCCESS;
598 } 619 }
599 620
@@ -877,7 +898,7 @@ static void page_check_dirty_writeback(struct page *page,
877 * shrink_page_list() returns the number of reclaimed pages 898 * shrink_page_list() returns the number of reclaimed pages
878 */ 899 */
879static unsigned long shrink_page_list(struct list_head *page_list, 900static unsigned long shrink_page_list(struct list_head *page_list,
880 struct zone *zone, 901 struct pglist_data *pgdat,
881 struct scan_control *sc, 902 struct scan_control *sc,
882 enum ttu_flags ttu_flags, 903 enum ttu_flags ttu_flags,
883 unsigned long *ret_nr_dirty, 904 unsigned long *ret_nr_dirty,
@@ -917,7 +938,6 @@ static unsigned long shrink_page_list(struct list_head *page_list,
917 goto keep; 938 goto keep;
918 939
919 VM_BUG_ON_PAGE(PageActive(page), page); 940 VM_BUG_ON_PAGE(PageActive(page), page);
920 VM_BUG_ON_PAGE(page_zone(page) != zone, page);
921 941
922 sc->nr_scanned++; 942 sc->nr_scanned++;
923 943
@@ -996,7 +1016,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
996 /* Case 1 above */ 1016 /* Case 1 above */
997 if (current_is_kswapd() && 1017 if (current_is_kswapd() &&
998 PageReclaim(page) && 1018 PageReclaim(page) &&
999 test_bit(ZONE_WRITEBACK, &zone->flags)) { 1019 test_bit(PGDAT_WRITEBACK, &pgdat->flags)) {
1000 nr_immediate++; 1020 nr_immediate++;
1001 goto keep_locked; 1021 goto keep_locked;
1002 1022
@@ -1092,14 +1112,14 @@ static unsigned long shrink_page_list(struct list_head *page_list,
1092 */ 1112 */
1093 if (page_is_file_cache(page) && 1113 if (page_is_file_cache(page) &&
1094 (!current_is_kswapd() || 1114 (!current_is_kswapd() ||
1095 !test_bit(ZONE_DIRTY, &zone->flags))) { 1115 !test_bit(PGDAT_DIRTY, &pgdat->flags))) {
1096 /* 1116 /*
1097 * Immediately reclaim when written back. 1117 * Immediately reclaim when written back.
1098 * Similar in principle to deactivate_page() 1118 * Similar in principle to deactivate_page()
1099 * except we already have the page isolated 1119 * except we already have the page isolated
1100 * and know it's dirty 1120 * and know it's dirty
1101 */ 1121 */
1102 inc_zone_page_state(page, NR_VMSCAN_IMMEDIATE); 1122 inc_node_page_state(page, NR_VMSCAN_IMMEDIATE);
1103 SetPageReclaim(page); 1123 SetPageReclaim(page);
1104 1124
1105 goto keep_locked; 1125 goto keep_locked;
@@ -1266,11 +1286,11 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone,
1266 } 1286 }
1267 } 1287 }
1268 1288
1269 ret = shrink_page_list(&clean_pages, zone, &sc, 1289 ret = shrink_page_list(&clean_pages, zone->zone_pgdat, &sc,
1270 TTU_UNMAP|TTU_IGNORE_ACCESS, 1290 TTU_UNMAP|TTU_IGNORE_ACCESS,
1271 &dummy1, &dummy2, &dummy3, &dummy4, &dummy5, true); 1291 &dummy1, &dummy2, &dummy3, &dummy4, &dummy5, true);
1272 list_splice(&clean_pages, page_list); 1292 list_splice(&clean_pages, page_list);
1273 mod_zone_page_state(zone, NR_ISOLATED_FILE, -ret); 1293 mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE, -ret);
1274 return ret; 1294 return ret;
1275} 1295}
1276 1296
@@ -1348,8 +1368,31 @@ int __isolate_lru_page(struct page *page, isolate_mode_t mode)
1348 return ret; 1368 return ret;
1349} 1369}
1350 1370
1371
1351/* 1372/*
1352 * zone->lru_lock is heavily contended. Some of the functions that 1373 * Update LRU sizes after isolating pages. The LRU size updates must
1374 * be complete before mem_cgroup_update_lru_size due to a sanity check.
1375 */
1376static __always_inline void update_lru_sizes(struct lruvec *lruvec,
1377 enum lru_list lru, unsigned long *nr_zone_taken,
1378 unsigned long nr_taken)
1379{
1380 int zid;
1381
1382 for (zid = 0; zid < MAX_NR_ZONES; zid++) {
1383 if (!nr_zone_taken[zid])
1384 continue;
1385
1386 __update_lru_size(lruvec, lru, zid, -nr_zone_taken[zid]);
1387 }
1388
1389#ifdef CONFIG_MEMCG
1390 mem_cgroup_update_lru_size(lruvec, lru, -nr_taken);
1391#endif
1392}
1393
1394/*
1395 * zone_lru_lock is heavily contended. Some of the functions that
1353 * shrink the lists perform better by taking out a batch of pages 1396 * shrink the lists perform better by taking out a batch of pages
1354 * and working on them outside the LRU lock. 1397 * and working on them outside the LRU lock.
1355 * 1398 *
@@ -1375,10 +1418,13 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
1375{ 1418{
1376 struct list_head *src = &lruvec->lists[lru]; 1419 struct list_head *src = &lruvec->lists[lru];
1377 unsigned long nr_taken = 0; 1420 unsigned long nr_taken = 0;
1378 unsigned long scan; 1421 unsigned long nr_zone_taken[MAX_NR_ZONES] = { 0 };
1422 unsigned long nr_skipped[MAX_NR_ZONES] = { 0, };
1423 unsigned long scan, nr_pages;
1424 LIST_HEAD(pages_skipped);
1379 1425
1380 for (scan = 0; scan < nr_to_scan && nr_taken < nr_to_scan && 1426 for (scan = 0; scan < nr_to_scan && nr_taken < nr_to_scan &&
1381 !list_empty(src); scan++) { 1427 !list_empty(src);) {
1382 struct page *page; 1428 struct page *page;
1383 1429
1384 page = lru_to_page(src); 1430 page = lru_to_page(src);
@@ -1386,9 +1432,23 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
1386 1432
1387 VM_BUG_ON_PAGE(!PageLRU(page), page); 1433 VM_BUG_ON_PAGE(!PageLRU(page), page);
1388 1434
1435 if (page_zonenum(page) > sc->reclaim_idx) {
1436 list_move(&page->lru, &pages_skipped);
1437 nr_skipped[page_zonenum(page)]++;
1438 continue;
1439 }
1440
1441 /*
1442 * Account for scanned and skipped separately to avoid the pgdat
1443 * being prematurely marked unreclaimable by pgdat_reclaimable.
1444 */
1445 scan++;
1446
1389 switch (__isolate_lru_page(page, mode)) { 1447 switch (__isolate_lru_page(page, mode)) {
1390 case 0: 1448 case 0:
1391 nr_taken += hpage_nr_pages(page); 1449 nr_pages = hpage_nr_pages(page);
1450 nr_taken += nr_pages;
1451 nr_zone_taken[page_zonenum(page)] += nr_pages;
1392 list_move(&page->lru, dst); 1452 list_move(&page->lru, dst);
1393 break; 1453 break;
1394 1454
@@ -1402,9 +1462,38 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
1402 } 1462 }
1403 } 1463 }
1404 1464
1465 /*
1466 * Splice any skipped pages to the start of the LRU list. Note that
1467 * this disrupts the LRU order when reclaiming for lower zones but
1468 * we cannot splice to the tail. If we did then the SWAP_CLUSTER_MAX
1469 * scanning would soon rescan the same pages to skip and put the
1470 * system at risk of premature OOM.
1471 */
1472 if (!list_empty(&pages_skipped)) {
1473 int zid;
1474 unsigned long total_skipped = 0;
1475
1476 for (zid = 0; zid < MAX_NR_ZONES; zid++) {
1477 if (!nr_skipped[zid])
1478 continue;
1479
1480 __count_zid_vm_events(PGSCAN_SKIP, zid, nr_skipped[zid]);
1481 total_skipped += nr_skipped[zid];
1482 }
1483
1484 /*
1485 * Account skipped pages as a partial scan as the pgdat may be
1486 * close to unreclaimable. If the LRU list is empty, account
1487 * skipped pages as a full scan.
1488 */
1489 scan += list_empty(src) ? total_skipped : total_skipped >> 2;
1490
1491 list_splice(&pages_skipped, src);
1492 }
1405 *nr_scanned = scan; 1493 *nr_scanned = scan;
1406 trace_mm_vmscan_lru_isolate(sc->order, nr_to_scan, scan, 1494 trace_mm_vmscan_lru_isolate(sc->reclaim_idx, sc->order, nr_to_scan, scan,
1407 nr_taken, mode, is_file_lru(lru)); 1495 nr_taken, mode, is_file_lru(lru));
1496 update_lru_sizes(lruvec, lru, nr_zone_taken, nr_taken);
1408 return nr_taken; 1497 return nr_taken;
1409} 1498}
1410 1499
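
isolate_lru_pages() now refuses pages whose zone lies above sc->reclaim_idx: it parks them on pages_skipped, splices them back to the head of the LRU afterwards, and charges the skipped pages as a quarter of a scan (or a full scan if the list ran dry) so the node is not prematurely marked unreclaimable. A self-contained sketch of that bookkeeping using an array in place of the LRU list; zone ids and counts are invented, and the real splice back onto the LRU and the per-zone LRU-size updates are omitted.

#include <stdio.h>
#include <stdbool.h>

#define DEMO_MAX_ZONES 4

struct demo_page { int zid; };

/* Walk a batch of LRU pages, taking only those at or below reclaim_idx.
 * Skipped pages are counted per zone and charged as a partial scan,
 * mirroring the accounting in the isolate_lru_pages() hunk above. */
static unsigned long demo_isolate(struct demo_page *lru, int nr_pages,
				  int reclaim_idx, bool list_now_empty,
				  unsigned long *nr_scanned)
{
	unsigned long nr_taken = 0, total_skipped = 0;
	unsigned long nr_skipped[DEMO_MAX_ZONES] = { 0 };
	unsigned long scan = 0;
	int i;

	for (i = 0; i < nr_pages; i++) {
		if (lru[i].zid > reclaim_idx) {
			nr_skipped[lru[i].zid]++;	/* parked on pages_skipped */
			continue;
		}
		scan++;
		nr_taken++;
	}

	for (i = 0; i < DEMO_MAX_ZONES; i++)
		total_skipped += nr_skipped[i];

	/* Skipped pages count as a quarter scan, or a full scan if the
	 * LRU emptied, so the node is not marked unreclaimable too early. */
	scan += list_now_empty ? total_skipped : total_skipped >> 2;

	*nr_scanned = scan;
	return nr_taken;
}

int main(void)
{
	struct demo_page lru[] = { {0}, {3}, {3}, {3}, {3}, {1}, {3}, {0} };
	unsigned long scanned;
	unsigned long taken = demo_isolate(lru, 8, 1, false, &scanned);

	printf("taken %lu, scanned %lu\n", taken, scanned);
	return 0;
}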
@@ -1444,8 +1533,8 @@ int isolate_lru_page(struct page *page)
1444 struct zone *zone = page_zone(page); 1533 struct zone *zone = page_zone(page);
1445 struct lruvec *lruvec; 1534 struct lruvec *lruvec;
1446 1535
1447 spin_lock_irq(&zone->lru_lock); 1536 spin_lock_irq(zone_lru_lock(zone));
1448 lruvec = mem_cgroup_page_lruvec(page, zone); 1537 lruvec = mem_cgroup_page_lruvec(page, zone->zone_pgdat);
1449 if (PageLRU(page)) { 1538 if (PageLRU(page)) {
1450 int lru = page_lru(page); 1539 int lru = page_lru(page);
1451 get_page(page); 1540 get_page(page);
@@ -1453,7 +1542,7 @@ int isolate_lru_page(struct page *page)
1453 del_page_from_lru_list(page, lruvec, lru); 1542 del_page_from_lru_list(page, lruvec, lru);
1454 ret = 0; 1543 ret = 0;
1455 } 1544 }
1456 spin_unlock_irq(&zone->lru_lock); 1545 spin_unlock_irq(zone_lru_lock(zone));
1457 } 1546 }
1458 return ret; 1547 return ret;
1459} 1548}
@@ -1465,7 +1554,7 @@ int isolate_lru_page(struct page *page)
1465 * the LRU list will go small and be scanned faster than necessary, leading to 1554 * the LRU list will go small and be scanned faster than necessary, leading to
1466 * unnecessary swapping, thrashing and OOM. 1555 * unnecessary swapping, thrashing and OOM.
1467 */ 1556 */
1468static int too_many_isolated(struct zone *zone, int file, 1557static int too_many_isolated(struct pglist_data *pgdat, int file,
1469 struct scan_control *sc) 1558 struct scan_control *sc)
1470{ 1559{
1471 unsigned long inactive, isolated; 1560 unsigned long inactive, isolated;
@@ -1477,11 +1566,11 @@ static int too_many_isolated(struct zone *zone, int file,
1477 return 0; 1566 return 0;
1478 1567
1479 if (file) { 1568 if (file) {
1480 inactive = zone_page_state(zone, NR_INACTIVE_FILE); 1569 inactive = node_page_state(pgdat, NR_INACTIVE_FILE);
1481 isolated = zone_page_state(zone, NR_ISOLATED_FILE); 1570 isolated = node_page_state(pgdat, NR_ISOLATED_FILE);
1482 } else { 1571 } else {
1483 inactive = zone_page_state(zone, NR_INACTIVE_ANON); 1572 inactive = node_page_state(pgdat, NR_INACTIVE_ANON);
1484 isolated = zone_page_state(zone, NR_ISOLATED_ANON); 1573 isolated = node_page_state(pgdat, NR_ISOLATED_ANON);
1485 } 1574 }
1486 1575
1487 /* 1576 /*
@@ -1499,7 +1588,7 @@ static noinline_for_stack void
1499putback_inactive_pages(struct lruvec *lruvec, struct list_head *page_list) 1588putback_inactive_pages(struct lruvec *lruvec, struct list_head *page_list)
1500{ 1589{
1501 struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat; 1590 struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
1502 struct zone *zone = lruvec_zone(lruvec); 1591 struct pglist_data *pgdat = lruvec_pgdat(lruvec);
1503 LIST_HEAD(pages_to_free); 1592 LIST_HEAD(pages_to_free);
1504 1593
1505 /* 1594 /*
@@ -1512,13 +1601,13 @@ putback_inactive_pages(struct lruvec *lruvec, struct list_head *page_list)
1512 VM_BUG_ON_PAGE(PageLRU(page), page); 1601 VM_BUG_ON_PAGE(PageLRU(page), page);
1513 list_del(&page->lru); 1602 list_del(&page->lru);
1514 if (unlikely(!page_evictable(page))) { 1603 if (unlikely(!page_evictable(page))) {
1515 spin_unlock_irq(&zone->lru_lock); 1604 spin_unlock_irq(&pgdat->lru_lock);
1516 putback_lru_page(page); 1605 putback_lru_page(page);
1517 spin_lock_irq(&zone->lru_lock); 1606 spin_lock_irq(&pgdat->lru_lock);
1518 continue; 1607 continue;
1519 } 1608 }
1520 1609
1521 lruvec = mem_cgroup_page_lruvec(page, zone); 1610 lruvec = mem_cgroup_page_lruvec(page, pgdat);
1522 1611
1523 SetPageLRU(page); 1612 SetPageLRU(page);
1524 lru = page_lru(page); 1613 lru = page_lru(page);
@@ -1535,10 +1624,10 @@ putback_inactive_pages(struct lruvec *lruvec, struct list_head *page_list)
1535 del_page_from_lru_list(page, lruvec, lru); 1624 del_page_from_lru_list(page, lruvec, lru);
1536 1625
1537 if (unlikely(PageCompound(page))) { 1626 if (unlikely(PageCompound(page))) {
1538 spin_unlock_irq(&zone->lru_lock); 1627 spin_unlock_irq(&pgdat->lru_lock);
1539 mem_cgroup_uncharge(page); 1628 mem_cgroup_uncharge(page);
1540 (*get_compound_page_dtor(page))(page); 1629 (*get_compound_page_dtor(page))(page);
1541 spin_lock_irq(&zone->lru_lock); 1630 spin_lock_irq(&pgdat->lru_lock);
1542 } else 1631 } else
1543 list_add(&page->lru, &pages_to_free); 1632 list_add(&page->lru, &pages_to_free);
1544 } 1633 }
@@ -1563,8 +1652,32 @@ static int current_may_throttle(void)
1563 bdi_write_congested(current->backing_dev_info); 1652 bdi_write_congested(current->backing_dev_info);
1564} 1653}
1565 1654
1655static bool inactive_reclaimable_pages(struct lruvec *lruvec,
1656 struct scan_control *sc, enum lru_list lru)
1657{
1658 int zid;
1659 struct zone *zone;
1660 int file = is_file_lru(lru);
1661 struct pglist_data *pgdat = lruvec_pgdat(lruvec);
1662
1663 if (!global_reclaim(sc))
1664 return true;
1665
1666 for (zid = sc->reclaim_idx; zid >= 0; zid--) {
1667 zone = &pgdat->node_zones[zid];
1668 if (!populated_zone(zone))
1669 continue;
1670
1671 if (zone_page_state_snapshot(zone, NR_ZONE_LRU_BASE +
1672 LRU_FILE * file) >= SWAP_CLUSTER_MAX)
1673 return true;
1674 }
1675
1676 return false;
1677}
1678
1566/* 1679/*
1567 * shrink_inactive_list() is a helper for shrink_zone(). It returns the number 1680 * shrink_inactive_list() is a helper for shrink_node(). It returns the number
1568 * of reclaimed pages 1681 * of reclaimed pages
1569 */ 1682 */
1570static noinline_for_stack unsigned long 1683static noinline_for_stack unsigned long
@@ -1582,10 +1695,13 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
1582 unsigned long nr_immediate = 0; 1695 unsigned long nr_immediate = 0;
1583 isolate_mode_t isolate_mode = 0; 1696 isolate_mode_t isolate_mode = 0;
1584 int file = is_file_lru(lru); 1697 int file = is_file_lru(lru);
1585 struct zone *zone = lruvec_zone(lruvec); 1698 struct pglist_data *pgdat = lruvec_pgdat(lruvec);
1586 struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat; 1699 struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
1587 1700
1588 while (unlikely(too_many_isolated(zone, file, sc))) { 1701 if (!inactive_reclaimable_pages(lruvec, sc, lru))
1702 return 0;
1703
1704 while (unlikely(too_many_isolated(pgdat, file, sc))) {
1589 congestion_wait(BLK_RW_ASYNC, HZ/10); 1705 congestion_wait(BLK_RW_ASYNC, HZ/10);
1590 1706
1591 /* We are about to die and free our memory. Return now. */ 1707 /* We are about to die and free our memory. Return now. */
@@ -1600,48 +1716,45 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
1600 if (!sc->may_writepage) 1716 if (!sc->may_writepage)
1601 isolate_mode |= ISOLATE_CLEAN; 1717 isolate_mode |= ISOLATE_CLEAN;
1602 1718
1603 spin_lock_irq(&zone->lru_lock); 1719 spin_lock_irq(&pgdat->lru_lock);
1604 1720
1605 nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &page_list, 1721 nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &page_list,
1606 &nr_scanned, sc, isolate_mode, lru); 1722 &nr_scanned, sc, isolate_mode, lru);
1607 1723
1608 update_lru_size(lruvec, lru, -nr_taken); 1724 __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken);
1609 __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, nr_taken);
1610 reclaim_stat->recent_scanned[file] += nr_taken; 1725 reclaim_stat->recent_scanned[file] += nr_taken;
1611 1726
1612 if (global_reclaim(sc)) { 1727 if (global_reclaim(sc)) {
1613 __mod_zone_page_state(zone, NR_PAGES_SCANNED, nr_scanned); 1728 __mod_node_page_state(pgdat, NR_PAGES_SCANNED, nr_scanned);
1614 if (current_is_kswapd()) 1729 if (current_is_kswapd())
1615 __count_zone_vm_events(PGSCAN_KSWAPD, zone, nr_scanned); 1730 __count_vm_events(PGSCAN_KSWAPD, nr_scanned);
1616 else 1731 else
1617 __count_zone_vm_events(PGSCAN_DIRECT, zone, nr_scanned); 1732 __count_vm_events(PGSCAN_DIRECT, nr_scanned);
1618 } 1733 }
1619 spin_unlock_irq(&zone->lru_lock); 1734 spin_unlock_irq(&pgdat->lru_lock);
1620 1735
1621 if (nr_taken == 0) 1736 if (nr_taken == 0)
1622 return 0; 1737 return 0;
1623 1738
1624 nr_reclaimed = shrink_page_list(&page_list, zone, sc, TTU_UNMAP, 1739 nr_reclaimed = shrink_page_list(&page_list, pgdat, sc, TTU_UNMAP,
1625 &nr_dirty, &nr_unqueued_dirty, &nr_congested, 1740 &nr_dirty, &nr_unqueued_dirty, &nr_congested,
1626 &nr_writeback, &nr_immediate, 1741 &nr_writeback, &nr_immediate,
1627 false); 1742 false);
1628 1743
1629 spin_lock_irq(&zone->lru_lock); 1744 spin_lock_irq(&pgdat->lru_lock);
1630 1745
1631 if (global_reclaim(sc)) { 1746 if (global_reclaim(sc)) {
1632 if (current_is_kswapd()) 1747 if (current_is_kswapd())
1633 __count_zone_vm_events(PGSTEAL_KSWAPD, zone, 1748 __count_vm_events(PGSTEAL_KSWAPD, nr_reclaimed);
1634 nr_reclaimed);
1635 else 1749 else
1636 __count_zone_vm_events(PGSTEAL_DIRECT, zone, 1750 __count_vm_events(PGSTEAL_DIRECT, nr_reclaimed);
1637 nr_reclaimed);
1638 } 1751 }
1639 1752
1640 putback_inactive_pages(lruvec, &page_list); 1753 putback_inactive_pages(lruvec, &page_list);
1641 1754
1642 __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken); 1755 __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken);
1643 1756
1644 spin_unlock_irq(&zone->lru_lock); 1757 spin_unlock_irq(&pgdat->lru_lock);
1645 1758
1646 mem_cgroup_uncharge_list(&page_list); 1759 mem_cgroup_uncharge_list(&page_list);
1647 free_hot_cold_page_list(&page_list, true); 1760 free_hot_cold_page_list(&page_list, true);
@@ -1661,7 +1774,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
1661 * are encountered in the nr_immediate check below. 1774 * are encountered in the nr_immediate check below.
1662 */ 1775 */
1663 if (nr_writeback && nr_writeback == nr_taken) 1776 if (nr_writeback && nr_writeback == nr_taken)
1664 set_bit(ZONE_WRITEBACK, &zone->flags); 1777 set_bit(PGDAT_WRITEBACK, &pgdat->flags);
1665 1778
1666 /* 1779 /*
1667 * Legacy memcg will stall in page writeback so avoid forcibly 1780 * Legacy memcg will stall in page writeback so avoid forcibly
@@ -1673,16 +1786,16 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
1673 * backed by a congested BDI and wait_iff_congested will stall. 1786 * backed by a congested BDI and wait_iff_congested will stall.
1674 */ 1787 */
1675 if (nr_dirty && nr_dirty == nr_congested) 1788 if (nr_dirty && nr_dirty == nr_congested)
1676 set_bit(ZONE_CONGESTED, &zone->flags); 1789 set_bit(PGDAT_CONGESTED, &pgdat->flags);
1677 1790
1678 /* 1791 /*
1679 * If dirty pages are scanned that are not queued for IO, it 1792 * If dirty pages are scanned that are not queued for IO, it
1680 * implies that flushers are not keeping up. In this case, flag 1793 * implies that flushers are not keeping up. In this case, flag
1681 * the zone ZONE_DIRTY and kswapd will start writing pages from 1794 * the pgdat PGDAT_DIRTY and kswapd will start writing pages from
1682 * reclaim context. 1795 * reclaim context.
1683 */ 1796 */
1684 if (nr_unqueued_dirty == nr_taken) 1797 if (nr_unqueued_dirty == nr_taken)
1685 set_bit(ZONE_DIRTY, &zone->flags); 1798 set_bit(PGDAT_DIRTY, &pgdat->flags);
1686 1799
1687 /* 1800 /*
1688 * If kswapd scans pages marked for immediate 1801 * If kswapd scans pages marked for immediate
@@ -1701,9 +1814,10 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
1701 */ 1814 */
1702 if (!sc->hibernation_mode && !current_is_kswapd() && 1815 if (!sc->hibernation_mode && !current_is_kswapd() &&
1703 current_may_throttle()) 1816 current_may_throttle())
1704 wait_iff_congested(zone, BLK_RW_ASYNC, HZ/10); 1817 wait_iff_congested(pgdat, BLK_RW_ASYNC, HZ/10);
1705 1818
1706 trace_mm_vmscan_lru_shrink_inactive(zone, nr_scanned, nr_reclaimed, 1819 trace_mm_vmscan_lru_shrink_inactive(pgdat->node_id,
1820 nr_scanned, nr_reclaimed,
1707 sc->priority, file); 1821 sc->priority, file);
1708 return nr_reclaimed; 1822 return nr_reclaimed;
1709} 1823}
@@ -1715,9 +1829,9 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
1715 * processes, from rmap. 1829 * processes, from rmap.
1716 * 1830 *
1717 * If the pages are mostly unmapped, the processing is fast and it is 1831 * If the pages are mostly unmapped, the processing is fast and it is
1718 * appropriate to hold zone->lru_lock across the whole operation. But if 1832 * appropriate to hold zone_lru_lock across the whole operation. But if
1719 * the pages are mapped, the processing is slow (page_referenced()) so we 1833 * the pages are mapped, the processing is slow (page_referenced()) so we
1720 * should drop zone->lru_lock around each page. It's impossible to balance 1834 * should drop zone_lru_lock around each page. It's impossible to balance
1721 * this, so instead we remove the pages from the LRU while processing them. 1835 * this, so instead we remove the pages from the LRU while processing them.
1722 * It is safe to rely on PG_active against the non-LRU pages in here because 1836 * It is safe to rely on PG_active against the non-LRU pages in here because
1723 * nobody will play with that bit on a non-LRU page. 1837 * nobody will play with that bit on a non-LRU page.
@@ -1731,20 +1845,20 @@ static void move_active_pages_to_lru(struct lruvec *lruvec,
1731 struct list_head *pages_to_free, 1845 struct list_head *pages_to_free,
1732 enum lru_list lru) 1846 enum lru_list lru)
1733{ 1847{
1734 struct zone *zone = lruvec_zone(lruvec); 1848 struct pglist_data *pgdat = lruvec_pgdat(lruvec);
1735 unsigned long pgmoved = 0; 1849 unsigned long pgmoved = 0;
1736 struct page *page; 1850 struct page *page;
1737 int nr_pages; 1851 int nr_pages;
1738 1852
1739 while (!list_empty(list)) { 1853 while (!list_empty(list)) {
1740 page = lru_to_page(list); 1854 page = lru_to_page(list);
1741 lruvec = mem_cgroup_page_lruvec(page, zone); 1855 lruvec = mem_cgroup_page_lruvec(page, pgdat);
1742 1856
1743 VM_BUG_ON_PAGE(PageLRU(page), page); 1857 VM_BUG_ON_PAGE(PageLRU(page), page);
1744 SetPageLRU(page); 1858 SetPageLRU(page);
1745 1859
1746 nr_pages = hpage_nr_pages(page); 1860 nr_pages = hpage_nr_pages(page);
1747 update_lru_size(lruvec, lru, nr_pages); 1861 update_lru_size(lruvec, lru, page_zonenum(page), nr_pages);
1748 list_move(&page->lru, &lruvec->lists[lru]); 1862 list_move(&page->lru, &lruvec->lists[lru]);
1749 pgmoved += nr_pages; 1863 pgmoved += nr_pages;
1750 1864
@@ -1754,10 +1868,10 @@ static void move_active_pages_to_lru(struct lruvec *lruvec,
1754 del_page_from_lru_list(page, lruvec, lru); 1868 del_page_from_lru_list(page, lruvec, lru);
1755 1869
1756 if (unlikely(PageCompound(page))) { 1870 if (unlikely(PageCompound(page))) {
1757 spin_unlock_irq(&zone->lru_lock); 1871 spin_unlock_irq(&pgdat->lru_lock);
1758 mem_cgroup_uncharge(page); 1872 mem_cgroup_uncharge(page);
1759 (*get_compound_page_dtor(page))(page); 1873 (*get_compound_page_dtor(page))(page);
1760 spin_lock_irq(&zone->lru_lock); 1874 spin_lock_irq(&pgdat->lru_lock);
1761 } else 1875 } else
1762 list_add(&page->lru, pages_to_free); 1876 list_add(&page->lru, pages_to_free);
1763 } 1877 }
@@ -1783,7 +1897,7 @@ static void shrink_active_list(unsigned long nr_to_scan,
1783 unsigned long nr_rotated = 0; 1897 unsigned long nr_rotated = 0;
1784 isolate_mode_t isolate_mode = 0; 1898 isolate_mode_t isolate_mode = 0;
1785 int file = is_file_lru(lru); 1899 int file = is_file_lru(lru);
1786 struct zone *zone = lruvec_zone(lruvec); 1900 struct pglist_data *pgdat = lruvec_pgdat(lruvec);
1787 1901
1788 lru_add_drain(); 1902 lru_add_drain();
1789 1903
@@ -1792,20 +1906,19 @@ static void shrink_active_list(unsigned long nr_to_scan,
1792 if (!sc->may_writepage) 1906 if (!sc->may_writepage)
1793 isolate_mode |= ISOLATE_CLEAN; 1907 isolate_mode |= ISOLATE_CLEAN;
1794 1908
1795 spin_lock_irq(&zone->lru_lock); 1909 spin_lock_irq(&pgdat->lru_lock);
1796 1910
1797 nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &l_hold, 1911 nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &l_hold,
1798 &nr_scanned, sc, isolate_mode, lru); 1912 &nr_scanned, sc, isolate_mode, lru);
1799 1913
1800 update_lru_size(lruvec, lru, -nr_taken); 1914 __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken);
1801 __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, nr_taken);
1802 reclaim_stat->recent_scanned[file] += nr_taken; 1915 reclaim_stat->recent_scanned[file] += nr_taken;
1803 1916
1804 if (global_reclaim(sc)) 1917 if (global_reclaim(sc))
1805 __mod_zone_page_state(zone, NR_PAGES_SCANNED, nr_scanned); 1918 __mod_node_page_state(pgdat, NR_PAGES_SCANNED, nr_scanned);
1806 __count_zone_vm_events(PGREFILL, zone, nr_scanned); 1919 __count_vm_events(PGREFILL, nr_scanned);
1807 1920
1808 spin_unlock_irq(&zone->lru_lock); 1921 spin_unlock_irq(&pgdat->lru_lock);
1809 1922
1810 while (!list_empty(&l_hold)) { 1923 while (!list_empty(&l_hold)) {
1811 cond_resched(); 1924 cond_resched();
@@ -1850,7 +1963,7 @@ static void shrink_active_list(unsigned long nr_to_scan,
1850 /* 1963 /*
1851 * Move pages back to the lru list. 1964 * Move pages back to the lru list.
1852 */ 1965 */
1853 spin_lock_irq(&zone->lru_lock); 1966 spin_lock_irq(&pgdat->lru_lock);
1854 /* 1967 /*
1855 * Count referenced pages from currently used mappings as rotated, 1968 * Count referenced pages from currently used mappings as rotated,
1856 * even though only some of them are actually re-activated. This 1969 * even though only some of them are actually re-activated. This
@@ -1861,8 +1974,8 @@ static void shrink_active_list(unsigned long nr_to_scan,
1861 1974
1862 move_active_pages_to_lru(lruvec, &l_active, &l_hold, lru); 1975 move_active_pages_to_lru(lruvec, &l_active, &l_hold, lru);
1863 move_active_pages_to_lru(lruvec, &l_inactive, &l_hold, lru - LRU_ACTIVE); 1976 move_active_pages_to_lru(lruvec, &l_inactive, &l_hold, lru - LRU_ACTIVE);
1864 __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken); 1977 __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken);
1865 spin_unlock_irq(&zone->lru_lock); 1978 spin_unlock_irq(&pgdat->lru_lock);
1866 1979
1867 mem_cgroup_uncharge_list(&l_hold); 1980 mem_cgroup_uncharge_list(&l_hold);
1868 free_hot_cold_page_list(&l_hold, true); 1981 free_hot_cold_page_list(&l_hold, true);
@@ -1894,12 +2007,15 @@ static void shrink_active_list(unsigned long nr_to_scan,
1894 * 1TB 101 10GB 2007 * 1TB 101 10GB
1895 * 10TB 320 32GB 2008 * 10TB 320 32GB
1896 */ 2009 */
1897static bool inactive_list_is_low(struct lruvec *lruvec, bool file) 2010static bool inactive_list_is_low(struct lruvec *lruvec, bool file,
2011 struct scan_control *sc)
1898{ 2012{
1899 unsigned long inactive_ratio; 2013 unsigned long inactive_ratio;
1900 unsigned long inactive; 2014 unsigned long inactive;
1901 unsigned long active; 2015 unsigned long active;
1902 unsigned long gb; 2016 unsigned long gb;
2017 struct pglist_data *pgdat = lruvec_pgdat(lruvec);
2018 int zid;
1903 2019
1904 /* 2020 /*
1905 * If we don't have swap space, anonymous page deactivation 2021 * If we don't have swap space, anonymous page deactivation
@@ -1911,6 +2027,27 @@ static bool inactive_list_is_low(struct lruvec *lruvec, bool file)
1911 inactive = lruvec_lru_size(lruvec, file * LRU_FILE); 2027 inactive = lruvec_lru_size(lruvec, file * LRU_FILE);
1912 active = lruvec_lru_size(lruvec, file * LRU_FILE + LRU_ACTIVE); 2028 active = lruvec_lru_size(lruvec, file * LRU_FILE + LRU_ACTIVE);
1913 2029
2030 /*
2031 * For zone-constrained allocations, it is necessary to check if
2032 * deactivations are required for lowmem to be reclaimed. This
2033 * calculates the inactive/active pages available in eligible zones.
2034 */
2035 for (zid = sc->reclaim_idx + 1; zid < MAX_NR_ZONES; zid++) {
2036 struct zone *zone = &pgdat->node_zones[zid];
2037 unsigned long inactive_zone, active_zone;
2038
2039 if (!populated_zone(zone))
2040 continue;
2041
2042 inactive_zone = zone_page_state(zone,
2043 NR_ZONE_LRU_BASE + (file * LRU_FILE));
2044 active_zone = zone_page_state(zone,
2045 NR_ZONE_LRU_BASE + (file * LRU_FILE) + LRU_ACTIVE);
2046
2047 inactive -= min(inactive, inactive_zone);
2048 active -= min(active, active_zone);
2049 }
2050
1914 gb = (inactive + active) >> (30 - PAGE_SHIFT); 2051 gb = (inactive + active) >> (30 - PAGE_SHIFT);
1915 if (gb) 2052 if (gb)
1916 inactive_ratio = int_sqrt(10 * gb); 2053 inactive_ratio = int_sqrt(10 * gb);
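
The ratio used here is derived only from the size of the LRU lists: the combined inactive+active size is converted to gigabytes and the target inactive:active ratio becomes int_sqrt(10 * gb), with a floor of 1. A standalone userspace sketch of just that arithmetic, assuming 4KiB pages and a naive integer square root (neither helper below is the kernel's):

/*
 * Model of the inactive_ratio heuristic, assuming PAGE_SHIFT == 12.
 * Not kernel code; it only reproduces the arithmetic so the table in
 * the comment above can be checked.
 */
#include <stdio.h>

static unsigned long int_sqrt_model(unsigned long x)
{
	unsigned long r = 0;

	/* crude integer square root, adequate for a model */
	while ((r + 1) * (r + 1) <= x)
		r++;
	return r;
}

static unsigned long inactive_ratio(unsigned long total_lru_pages)
{
	unsigned long gb = total_lru_pages >> (30 - 12);	/* pages -> GiB */

	return gb ? int_sqrt_model(10 * gb) : 1;
}

int main(void)
{
	/* 1GB, 10GB, 100GB, 1TB, 10TB of LRU pages */
	unsigned long sizes_gb[] = { 1, 10, 100, 1024, 10240 };

	for (int i = 0; i < 5; i++) {
		unsigned long pages = sizes_gb[i] << (30 - 12);

		printf("%6luGB -> inactive_ratio %lu\n",
		       sizes_gb[i], inactive_ratio(pages));
	}
	return 0;
}

Compiled and run, this reproduces the ratios quoted in the comment block above (3, 10, 31, 101 and 320 for 1GB through 10TB). The new per-zone loop only adjusts the inputs by subtracting LRU pages that sit in zones above sc->reclaim_idx.
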
@@ -1924,7 +2061,7 @@ static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
1924 struct lruvec *lruvec, struct scan_control *sc) 2061 struct lruvec *lruvec, struct scan_control *sc)
1925{ 2062{
1926 if (is_active_lru(lru)) { 2063 if (is_active_lru(lru)) {
1927 if (inactive_list_is_low(lruvec, is_file_lru(lru))) 2064 if (inactive_list_is_low(lruvec, is_file_lru(lru), sc))
1928 shrink_active_list(nr_to_scan, lruvec, sc, lru); 2065 shrink_active_list(nr_to_scan, lruvec, sc, lru);
1929 return 0; 2066 return 0;
1930 } 2067 }
@@ -1956,7 +2093,7 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg,
1956 struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat; 2093 struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
1957 u64 fraction[2]; 2094 u64 fraction[2];
1958 u64 denominator = 0; /* gcc */ 2095 u64 denominator = 0; /* gcc */
1959 struct zone *zone = lruvec_zone(lruvec); 2096 struct pglist_data *pgdat = lruvec_pgdat(lruvec);
1960 unsigned long anon_prio, file_prio; 2097 unsigned long anon_prio, file_prio;
1961 enum scan_balance scan_balance; 2098 enum scan_balance scan_balance;
1962 unsigned long anon, file; 2099 unsigned long anon, file;
@@ -1977,7 +2114,7 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg,
1977 * well. 2114 * well.
1978 */ 2115 */
1979 if (current_is_kswapd()) { 2116 if (current_is_kswapd()) {
1980 if (!zone_reclaimable(zone)) 2117 if (!pgdat_reclaimable(pgdat))
1981 force_scan = true; 2118 force_scan = true;
1982 if (!mem_cgroup_online(memcg)) 2119 if (!mem_cgroup_online(memcg))
1983 force_scan = true; 2120 force_scan = true;
@@ -2023,14 +2160,24 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg,
2023 * anon pages. Try to detect this based on file LRU size. 2160 * anon pages. Try to detect this based on file LRU size.
2024 */ 2161 */
2025 if (global_reclaim(sc)) { 2162 if (global_reclaim(sc)) {
2026 unsigned long zonefile; 2163 unsigned long pgdatfile;
2027 unsigned long zonefree; 2164 unsigned long pgdatfree;
2165 int z;
2166 unsigned long total_high_wmark = 0;
2028 2167
2029 zonefree = zone_page_state(zone, NR_FREE_PAGES); 2168 pgdatfree = sum_zone_node_page_state(pgdat->node_id, NR_FREE_PAGES);
2030 zonefile = zone_page_state(zone, NR_ACTIVE_FILE) + 2169 pgdatfile = node_page_state(pgdat, NR_ACTIVE_FILE) +
2031 zone_page_state(zone, NR_INACTIVE_FILE); 2170 node_page_state(pgdat, NR_INACTIVE_FILE);
2032 2171
2033 if (unlikely(zonefile + zonefree <= high_wmark_pages(zone))) { 2172 for (z = 0; z < MAX_NR_ZONES; z++) {
2173 struct zone *zone = &pgdat->node_zones[z];
2174 if (!populated_zone(zone))
2175 continue;
2176
2177 total_high_wmark += high_wmark_pages(zone);
2178 }
2179
2180 if (unlikely(pgdatfile + pgdatfree <= total_high_wmark)) {
2034 scan_balance = SCAN_ANON; 2181 scan_balance = SCAN_ANON;
2035 goto out; 2182 goto out;
2036 } 2183 }
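
The reworked check sums the high watermarks of every populated zone on the node and forces anonymous scanning when file plus free pages cannot even cover that sum. A minimal userspace model of the decision, with made-up zone values standing in for the real pgdat and zone structures:

/*
 * Model of the "force SCAN_ANON" check.  The zone array, field names
 * and numbers are illustrative stand-ins, not kernel structures.
 */
#include <stdbool.h>
#include <stdio.h>

struct zone_model {
	bool populated;
	unsigned long high_wmark;	/* high watermark in pages */
};

static bool should_force_anon_scan(unsigned long node_free,
				   unsigned long node_file,
				   const struct zone_model *zones, int nr)
{
	unsigned long total_high_wmark = 0;

	for (int z = 0; z < nr; z++) {
		if (!zones[z].populated)
			continue;
		total_high_wmark += zones[z].high_wmark;
	}

	/*
	 * File pages plus free pages cannot refill the watermarks: the
	 * page cache is effectively exhausted, so scan anon instead.
	 */
	return node_file + node_free <= total_high_wmark;
}

int main(void)
{
	struct zone_model zones[] = {
		{ true, 128 }, { true, 4096 }, { false, 0 }, { true, 32768 },
	};

	printf("force anon: %d\n",		/* 1: 30000 <= 36992 */
	       should_force_anon_scan(20000, 10000, zones, 4));
	printf("force anon: %d\n",		/* 0: 600000 > 36992 */
	       should_force_anon_scan(200000, 400000, zones, 4));
	return 0;
}
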
@@ -2045,7 +2192,7 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg,
2045 * lruvec even if it has plenty of old anonymous pages unless the 2192 * lruvec even if it has plenty of old anonymous pages unless the
2046 * system is under heavy pressure. 2193 * system is under heavy pressure.
2047 */ 2194 */
2048 if (!inactive_list_is_low(lruvec, true) && 2195 if (!inactive_list_is_low(lruvec, true, sc) &&
2049 lruvec_lru_size(lruvec, LRU_INACTIVE_FILE) >> sc->priority) { 2196 lruvec_lru_size(lruvec, LRU_INACTIVE_FILE) >> sc->priority) {
2050 scan_balance = SCAN_FILE; 2197 scan_balance = SCAN_FILE;
2051 goto out; 2198 goto out;
@@ -2077,7 +2224,7 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg,
2077 file = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE) + 2224 file = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE) +
2078 lruvec_lru_size(lruvec, LRU_INACTIVE_FILE); 2225 lruvec_lru_size(lruvec, LRU_INACTIVE_FILE);
2079 2226
2080 spin_lock_irq(&zone->lru_lock); 2227 spin_lock_irq(&pgdat->lru_lock);
2081 if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) { 2228 if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) {
2082 reclaim_stat->recent_scanned[0] /= 2; 2229 reclaim_stat->recent_scanned[0] /= 2;
2083 reclaim_stat->recent_rotated[0] /= 2; 2230 reclaim_stat->recent_rotated[0] /= 2;
@@ -2098,7 +2245,7 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg,
2098 2245
2099 fp = file_prio * (reclaim_stat->recent_scanned[1] + 1); 2246 fp = file_prio * (reclaim_stat->recent_scanned[1] + 1);
2100 fp /= reclaim_stat->recent_rotated[1] + 1; 2247 fp /= reclaim_stat->recent_rotated[1] + 1;
2101 spin_unlock_irq(&zone->lru_lock); 2248 spin_unlock_irq(&pgdat->lru_lock);
2102 2249
2103 fraction[0] = ap; 2250 fraction[0] = ap;
2104 fraction[1] = fp; 2251 fraction[1] = fp;
@@ -2174,12 +2321,12 @@ static inline void init_tlb_ubc(void)
2174#endif /* CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH */ 2321#endif /* CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH */
2175 2322
2176/* 2323/*
2177 * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. 2324 * This is a basic per-node page freer. Used by both kswapd and direct reclaim.
2178 */ 2325 */
2179static void shrink_zone_memcg(struct zone *zone, struct mem_cgroup *memcg, 2326static void shrink_node_memcg(struct pglist_data *pgdat, struct mem_cgroup *memcg,
2180 struct scan_control *sc, unsigned long *lru_pages) 2327 struct scan_control *sc, unsigned long *lru_pages)
2181{ 2328{
2182 struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg); 2329 struct lruvec *lruvec = mem_cgroup_lruvec(pgdat, memcg);
2183 unsigned long nr[NR_LRU_LISTS]; 2330 unsigned long nr[NR_LRU_LISTS];
2184 unsigned long targets[NR_LRU_LISTS]; 2331 unsigned long targets[NR_LRU_LISTS];
2185 unsigned long nr_to_scan; 2332 unsigned long nr_to_scan;
@@ -2287,7 +2434,7 @@ static void shrink_zone_memcg(struct zone *zone, struct mem_cgroup *memcg,
2287 * Even if we did not try to evict anon pages at all, we want to 2434 * Even if we did not try to evict anon pages at all, we want to
2288 * rebalance the anon lru active/inactive ratio. 2435 * rebalance the anon lru active/inactive ratio.
2289 */ 2436 */
2290 if (inactive_list_is_low(lruvec, false)) 2437 if (inactive_list_is_low(lruvec, false, sc))
2291 shrink_active_list(SWAP_CLUSTER_MAX, lruvec, 2438 shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
2292 sc, LRU_ACTIVE_ANON); 2439 sc, LRU_ACTIVE_ANON);
2293 2440
@@ -2312,13 +2459,14 @@ static bool in_reclaim_compaction(struct scan_control *sc)
2312 * calls try_to_compact_zone() that it will have enough free pages to succeed. 2459 * calls try_to_compact_zone() that it will have enough free pages to succeed.
2313 * It will give up earlier than that if there is difficulty reclaiming pages. 2460 * It will give up earlier than that if there is difficulty reclaiming pages.
2314 */ 2461 */
2315static inline bool should_continue_reclaim(struct zone *zone, 2462static inline bool should_continue_reclaim(struct pglist_data *pgdat,
2316 unsigned long nr_reclaimed, 2463 unsigned long nr_reclaimed,
2317 unsigned long nr_scanned, 2464 unsigned long nr_scanned,
2318 struct scan_control *sc) 2465 struct scan_control *sc)
2319{ 2466{
2320 unsigned long pages_for_compaction; 2467 unsigned long pages_for_compaction;
2321 unsigned long inactive_lru_pages; 2468 unsigned long inactive_lru_pages;
2469 int z;
2322 2470
2323 /* If not in reclaim/compaction mode, stop */ 2471 /* If not in reclaim/compaction mode, stop */
2324 if (!in_reclaim_compaction(sc)) 2472 if (!in_reclaim_compaction(sc))
@@ -2352,25 +2500,32 @@ static inline bool should_continue_reclaim(struct zone *zone,
2352 * inactive lists are large enough, continue reclaiming 2500 * inactive lists are large enough, continue reclaiming
2353 */ 2501 */
2354 pages_for_compaction = (2UL << sc->order); 2502 pages_for_compaction = (2UL << sc->order);
2355 inactive_lru_pages = zone_page_state(zone, NR_INACTIVE_FILE); 2503 inactive_lru_pages = node_page_state(pgdat, NR_INACTIVE_FILE);
2356 if (get_nr_swap_pages() > 0) 2504 if (get_nr_swap_pages() > 0)
2357 inactive_lru_pages += zone_page_state(zone, NR_INACTIVE_ANON); 2505 inactive_lru_pages += node_page_state(pgdat, NR_INACTIVE_ANON);
2358 if (sc->nr_reclaimed < pages_for_compaction && 2506 if (sc->nr_reclaimed < pages_for_compaction &&
2359 inactive_lru_pages > pages_for_compaction) 2507 inactive_lru_pages > pages_for_compaction)
2360 return true; 2508 return true;
2361 2509
2362 /* If compaction would go ahead or the allocation would succeed, stop */ 2510 /* If compaction would go ahead or the allocation would succeed, stop */
2363 switch (compaction_suitable(zone, sc->order, 0, 0)) { 2511 for (z = 0; z <= sc->reclaim_idx; z++) {
2364 case COMPACT_PARTIAL: 2512 struct zone *zone = &pgdat->node_zones[z];
2365 case COMPACT_CONTINUE: 2513 if (!populated_zone(zone))
2366 return false; 2514 continue;
2367 default: 2515
2368 return true; 2516 switch (compaction_suitable(zone, sc->order, 0, sc->reclaim_idx)) {
2517 case COMPACT_PARTIAL:
2518 case COMPACT_CONTINUE:
2519 return false;
2520 default:
2521 /* check next zone */
2522 ;
2523 }
2369 } 2524 }
2525 return true;
2370} 2526}
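
The single compaction_suitable() call is replaced by a walk over the eligible zones: reclaim/compaction continues only if no populated zone at or below sc->reclaim_idx is already suitable for compaction. A simplified, self-contained model of that termination rule (the enum values and per-zone results are placeholders, not the kernel's compaction code):

/*
 * Model of the per-zone termination check in should_continue_reclaim().
 */
#include <stdbool.h>
#include <stdio.h>

enum compact_result { COMPACT_SKIPPED, COMPACT_CONTINUE, COMPACT_PARTIAL };

struct zone_model {
	bool populated;
	enum compact_result suitability;	/* pretend per-zone result */
};

static bool should_continue_reclaim(const struct zone_model *zones,
				    int reclaim_idx)
{
	for (int z = 0; z <= reclaim_idx; z++) {
		if (!zones[z].populated)
			continue;

		switch (zones[z].suitability) {
		case COMPACT_PARTIAL:
		case COMPACT_CONTINUE:
			return false;	/* compaction can take over */
		default:
			break;		/* check next zone */
		}
	}
	return true;			/* no zone ready: keep reclaiming */
}

int main(void)
{
	struct zone_model zones[] = {
		{ true, COMPACT_SKIPPED },
		{ false, COMPACT_SKIPPED },
		{ true, COMPACT_CONTINUE },
	};

	printf("continue reclaim: %d\n", should_continue_reclaim(zones, 2));	/* 0 */
	printf("continue reclaim: %d\n", should_continue_reclaim(zones, 1));	/* 1 */
	return 0;
}
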
2371 2527
2372static bool shrink_zone(struct zone *zone, struct scan_control *sc, 2528static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
2373 bool is_classzone)
2374{ 2529{
2375 struct reclaim_state *reclaim_state = current->reclaim_state; 2530 struct reclaim_state *reclaim_state = current->reclaim_state;
2376 unsigned long nr_reclaimed, nr_scanned; 2531 unsigned long nr_reclaimed, nr_scanned;
@@ -2379,10 +2534,10 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc,
2379 do { 2534 do {
2380 struct mem_cgroup *root = sc->target_mem_cgroup; 2535 struct mem_cgroup *root = sc->target_mem_cgroup;
2381 struct mem_cgroup_reclaim_cookie reclaim = { 2536 struct mem_cgroup_reclaim_cookie reclaim = {
2382 .zone = zone, 2537 .pgdat = pgdat,
2383 .priority = sc->priority, 2538 .priority = sc->priority,
2384 }; 2539 };
2385 unsigned long zone_lru_pages = 0; 2540 unsigned long node_lru_pages = 0;
2386 struct mem_cgroup *memcg; 2541 struct mem_cgroup *memcg;
2387 2542
2388 nr_reclaimed = sc->nr_reclaimed; 2543 nr_reclaimed = sc->nr_reclaimed;
@@ -2403,11 +2558,11 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc,
2403 reclaimed = sc->nr_reclaimed; 2558 reclaimed = sc->nr_reclaimed;
2404 scanned = sc->nr_scanned; 2559 scanned = sc->nr_scanned;
2405 2560
2406 shrink_zone_memcg(zone, memcg, sc, &lru_pages); 2561 shrink_node_memcg(pgdat, memcg, sc, &lru_pages);
2407 zone_lru_pages += lru_pages; 2562 node_lru_pages += lru_pages;
2408 2563
2409 if (memcg && is_classzone) 2564 if (!global_reclaim(sc))
2410 shrink_slab(sc->gfp_mask, zone_to_nid(zone), 2565 shrink_slab(sc->gfp_mask, pgdat->node_id,
2411 memcg, sc->nr_scanned - scanned, 2566 memcg, sc->nr_scanned - scanned,
2412 lru_pages); 2567 lru_pages);
2413 2568
@@ -2419,7 +2574,7 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc,
2419 /* 2574 /*
2420 * Direct reclaim and kswapd have to scan all memory 2575 * Direct reclaim and kswapd have to scan all memory
2421 * cgroups to fulfill the overall scan target for the 2576 * cgroups to fulfill the overall scan target for the
2422 * zone. 2577 * node.
2423 * 2578 *
2424 * Limit reclaim, on the other hand, only cares about 2579 * Limit reclaim, on the other hand, only cares about
2425 * nr_to_reclaim pages to be reclaimed and it will 2580 * nr_to_reclaim pages to be reclaimed and it will
@@ -2437,10 +2592,10 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc,
2437 * Shrink the slab caches in the same proportion that 2592 * Shrink the slab caches in the same proportion that
2438 * the eligible LRU pages were scanned. 2593 * the eligible LRU pages were scanned.
2439 */ 2594 */
2440 if (global_reclaim(sc) && is_classzone) 2595 if (global_reclaim(sc))
2441 shrink_slab(sc->gfp_mask, zone_to_nid(zone), NULL, 2596 shrink_slab(sc->gfp_mask, pgdat->node_id, NULL,
2442 sc->nr_scanned - nr_scanned, 2597 sc->nr_scanned - nr_scanned,
2443 zone_lru_pages); 2598 node_lru_pages);
2444 2599
2445 if (reclaim_state) { 2600 if (reclaim_state) {
2446 sc->nr_reclaimed += reclaim_state->reclaimed_slab; 2601 sc->nr_reclaimed += reclaim_state->reclaimed_slab;
@@ -2455,7 +2610,7 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc,
2455 if (sc->nr_reclaimed - nr_reclaimed) 2610 if (sc->nr_reclaimed - nr_reclaimed)
2456 reclaimable = true; 2611 reclaimable = true;
2457 2612
2458 } while (should_continue_reclaim(zone, sc->nr_reclaimed - nr_reclaimed, 2613 } while (should_continue_reclaim(pgdat, sc->nr_reclaimed - nr_reclaimed,
2459 sc->nr_scanned - nr_scanned, sc)); 2614 sc->nr_scanned - nr_scanned, sc));
2460 2615
2461 return reclaimable; 2616 return reclaimable;
@@ -2465,9 +2620,9 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc,
2465 * Returns true if compaction should go ahead for a high-order request, or 2620 * Returns true if compaction should go ahead for a high-order request, or
2466 * the high-order allocation would succeed without compaction. 2621 * the high-order allocation would succeed without compaction.
2467 */ 2622 */
2468static inline bool compaction_ready(struct zone *zone, int order, int classzone_idx) 2623static inline bool compaction_ready(struct zone *zone, struct scan_control *sc)
2469{ 2624{
2470 unsigned long balance_gap, watermark; 2625 unsigned long watermark;
2471 bool watermark_ok; 2626 bool watermark_ok;
2472 2627
2473 /* 2628 /*
@@ -2476,23 +2631,21 @@ static inline bool compaction_ready(struct zone *zone, int order, int classzone_
2476 * there is a buffer of free pages available to give compaction 2631 * there is a buffer of free pages available to give compaction
2477 * a reasonable chance of completing and allocating the page 2632 * a reasonable chance of completing and allocating the page
2478 */ 2633 */
2479 balance_gap = min(low_wmark_pages(zone), DIV_ROUND_UP( 2634 watermark = high_wmark_pages(zone) + (2UL << sc->order);
2480 zone->managed_pages, KSWAPD_ZONE_BALANCE_GAP_RATIO)); 2635 watermark_ok = zone_watermark_ok_safe(zone, 0, watermark, sc->reclaim_idx);
2481 watermark = high_wmark_pages(zone) + balance_gap + (2UL << order);
2482 watermark_ok = zone_watermark_ok_safe(zone, 0, watermark, classzone_idx);
2483 2636
2484 /* 2637 /*
2485 * If compaction is deferred, reclaim up to a point where 2638 * If compaction is deferred, reclaim up to a point where
2486 * compaction will have a chance of success when re-enabled 2639 * compaction will have a chance of success when re-enabled
2487 */ 2640 */
2488 if (compaction_deferred(zone, order)) 2641 if (compaction_deferred(zone, sc->order))
2489 return watermark_ok; 2642 return watermark_ok;
2490 2643
2491 /* 2644 /*
2492 * If compaction is not ready to start and allocation is not likely 2645 * If compaction is not ready to start and allocation is not likely
2493 * to succeed without it, then keep reclaiming. 2646 * to succeed without it, then keep reclaiming.
2494 */ 2647 */
2495 if (compaction_suitable(zone, order, 0, classzone_idx) == COMPACT_SKIPPED) 2648 if (compaction_suitable(zone, sc->order, 0, sc->reclaim_idx) == COMPACT_SKIPPED)
2496 return false; 2649 return false;
2497 2650
2498 return watermark_ok; 2651 return watermark_ok;
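
With the balance gap gone, the reclaim target for compaction readiness is simply the high watermark plus twice the requested allocation size. A small sketch of that threshold, using invented numbers in place of zone_watermark_ok_safe():

/*
 * Sketch of the reclaim target used by compaction_ready(): keep
 * reclaiming until free pages exceed the high watermark plus twice
 * the requested allocation.  The values are assumptions.
 */
#include <stdbool.h>
#include <stdio.h>

static bool watermark_ok(unsigned long free_pages, unsigned long high_wmark,
			 unsigned int order)
{
	unsigned long target = high_wmark + (2UL << order);

	return free_pages > target;
}

int main(void)
{
	unsigned long high_wmark = 1024;

	/* order-9 (2MiB with 4KiB pages) asks for 2 << 9 = 1024 extra pages */
	printf("free=1500: %d\n", watermark_ok(1500, high_wmark, 9));	/* 0 */
	printf("free=2500: %d\n", watermark_ok(2500, high_wmark, 9));	/* 1 */
	return 0;
}
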
@@ -2503,14 +2656,6 @@ static inline bool compaction_ready(struct zone *zone, int order, int classzone_
2503 * try to reclaim pages from zones which will satisfy the caller's allocation 2656 * try to reclaim pages from zones which will satisfy the caller's allocation
2504 * request. 2657 * request.
2505 * 2658 *
2506 * We reclaim from a zone even if that zone is over high_wmark_pages(zone).
2507 * Because:
2508 * a) The caller may be trying to free *extra* pages to satisfy a higher-order
2509 * allocation or
2510 * b) The target zone may be at high_wmark_pages(zone) but the lower zones
2511 * must go *over* high_wmark_pages(zone) to satisfy the `incremental min'
2512 * zone defense algorithm.
2513 *
2514 * If a zone is deemed to be full of pinned pages then just give it a light 2659 * If a zone is deemed to be full of pinned pages then just give it a light
2515 * scan then give up on it. 2660 * scan then give up on it.
2516 */ 2661 */
@@ -2521,7 +2666,7 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
2521 unsigned long nr_soft_reclaimed; 2666 unsigned long nr_soft_reclaimed;
2522 unsigned long nr_soft_scanned; 2667 unsigned long nr_soft_scanned;
2523 gfp_t orig_mask; 2668 gfp_t orig_mask;
2524 enum zone_type requested_highidx = gfp_zone(sc->gfp_mask); 2669 pg_data_t *last_pgdat = NULL;
2525 2670
2526 /* 2671 /*
2527 * If the number of buffer_heads in the machine exceeds the maximum 2672 * If the number of buffer_heads in the machine exceeds the maximum
@@ -2529,21 +2674,13 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
2529 * highmem pages could be pinning lowmem pages storing buffer_heads 2674 * highmem pages could be pinning lowmem pages storing buffer_heads
2530 */ 2675 */
2531 orig_mask = sc->gfp_mask; 2676 orig_mask = sc->gfp_mask;
2532 if (buffer_heads_over_limit) 2677 if (buffer_heads_over_limit) {
2533 sc->gfp_mask |= __GFP_HIGHMEM; 2678 sc->gfp_mask |= __GFP_HIGHMEM;
2679 sc->reclaim_idx = gfp_zone(sc->gfp_mask);
2680 }
2534 2681
2535 for_each_zone_zonelist_nodemask(zone, z, zonelist, 2682 for_each_zone_zonelist_nodemask(zone, z, zonelist,
2536 gfp_zone(sc->gfp_mask), sc->nodemask) { 2683 sc->reclaim_idx, sc->nodemask) {
2537 enum zone_type classzone_idx;
2538
2539 if (!populated_zone(zone))
2540 continue;
2541
2542 classzone_idx = requested_highidx;
2543 while (!populated_zone(zone->zone_pgdat->node_zones +
2544 classzone_idx))
2545 classzone_idx--;
2546
2547 /* 2684 /*
2548 * Take care memory controller reclaiming has small influence 2685 * Take care memory controller reclaiming has small influence
2549 * to global LRU. 2686 * to global LRU.
@@ -2554,7 +2691,7 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
2554 continue; 2691 continue;
2555 2692
2556 if (sc->priority != DEF_PRIORITY && 2693 if (sc->priority != DEF_PRIORITY &&
2557 !zone_reclaimable(zone)) 2694 !pgdat_reclaimable(zone->zone_pgdat))
2558 continue; /* Let kswapd poll it */ 2695 continue; /* Let kswapd poll it */
2559 2696
2560 /* 2697 /*
@@ -2568,20 +2705,28 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
2568 */ 2705 */
2569 if (IS_ENABLED(CONFIG_COMPACTION) && 2706 if (IS_ENABLED(CONFIG_COMPACTION) &&
2570 sc->order > PAGE_ALLOC_COSTLY_ORDER && 2707 sc->order > PAGE_ALLOC_COSTLY_ORDER &&
2571 zonelist_zone_idx(z) <= requested_highidx && 2708 compaction_ready(zone, sc)) {
2572 compaction_ready(zone, sc->order, requested_highidx)) {
2573 sc->compaction_ready = true; 2709 sc->compaction_ready = true;
2574 continue; 2710 continue;
2575 } 2711 }
2576 2712
2577 /* 2713 /*
2714 * Shrink each node in the zonelist once. If the
2715 * zonelist is ordered by zone (not the default) then a
2716 * node may be shrunk multiple times but in that case
2717 * the user prefers lower zones being preserved.
2718 */
2719 if (zone->zone_pgdat == last_pgdat)
2720 continue;
2721
2722 /*
2578 * This steals pages from memory cgroups over softlimit 2723 * This steals pages from memory cgroups over softlimit
2579 * and returns the number of reclaimed pages and 2724 * and returns the number of reclaimed pages and
2580 * scanned pages. This works for global memory pressure 2725 * scanned pages. This works for global memory pressure
2581 * and balancing, not for a memcg's limit. 2726 * and balancing, not for a memcg's limit.
2582 */ 2727 */
2583 nr_soft_scanned = 0; 2728 nr_soft_scanned = 0;
2584 nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone, 2729 nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone->zone_pgdat,
2585 sc->order, sc->gfp_mask, 2730 sc->order, sc->gfp_mask,
2586 &nr_soft_scanned); 2731 &nr_soft_scanned);
2587 sc->nr_reclaimed += nr_soft_reclaimed; 2732 sc->nr_reclaimed += nr_soft_reclaimed;
@@ -2589,7 +2734,11 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
2589 /* need some check for avoid more shrink_zone() */ 2734 /* need some check for avoid more shrink_zone() */
2590 } 2735 }
2591 2736
2592 shrink_zone(zone, sc, zone_idx(zone) == classzone_idx); 2737 /* See comment about same check for global reclaim above */
2738 if (zone->zone_pgdat == last_pgdat)
2739 continue;
2740 last_pgdat = zone->zone_pgdat;
2741 shrink_node(zone->zone_pgdat, sc);
2593 } 2742 }
2594 2743
2595 /* 2744 /*
@@ -2625,7 +2774,7 @@ retry:
2625 delayacct_freepages_start(); 2774 delayacct_freepages_start();
2626 2775
2627 if (global_reclaim(sc)) 2776 if (global_reclaim(sc))
2628 count_vm_event(ALLOCSTALL); 2777 __count_zid_vm_events(ALLOCSTALL, sc->reclaim_idx, 1);
2629 2778
2630 do { 2779 do {
2631 vmpressure_prio(sc->gfp_mask, sc->target_mem_cgroup, 2780 vmpressure_prio(sc->gfp_mask, sc->target_mem_cgroup,
@@ -2692,7 +2841,7 @@ static bool pfmemalloc_watermark_ok(pg_data_t *pgdat)
2692 for (i = 0; i <= ZONE_NORMAL; i++) { 2841 for (i = 0; i <= ZONE_NORMAL; i++) {
2693 zone = &pgdat->node_zones[i]; 2842 zone = &pgdat->node_zones[i];
2694 if (!populated_zone(zone) || 2843 if (!populated_zone(zone) ||
2695 zone_reclaimable_pages(zone) == 0) 2844 pgdat_reclaimable_pages(pgdat) == 0)
2696 continue; 2845 continue;
2697 2846
2698 pfmemalloc_reserve += min_wmark_pages(zone); 2847 pfmemalloc_reserve += min_wmark_pages(zone);
@@ -2707,7 +2856,7 @@ static bool pfmemalloc_watermark_ok(pg_data_t *pgdat)
2707 2856
2708 /* kswapd must be awake if processes are being throttled */ 2857 /* kswapd must be awake if processes are being throttled */
2709 if (!wmark_ok && waitqueue_active(&pgdat->kswapd_wait)) { 2858 if (!wmark_ok && waitqueue_active(&pgdat->kswapd_wait)) {
2710 pgdat->classzone_idx = min(pgdat->classzone_idx, 2859 pgdat->kswapd_classzone_idx = min(pgdat->kswapd_classzone_idx,
2711 (enum zone_type)ZONE_NORMAL); 2860 (enum zone_type)ZONE_NORMAL);
2712 wake_up_interruptible(&pgdat->kswapd_wait); 2861 wake_up_interruptible(&pgdat->kswapd_wait);
2713 } 2862 }
@@ -2815,6 +2964,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
2815 struct scan_control sc = { 2964 struct scan_control sc = {
2816 .nr_to_reclaim = SWAP_CLUSTER_MAX, 2965 .nr_to_reclaim = SWAP_CLUSTER_MAX,
2817 .gfp_mask = (gfp_mask = memalloc_noio_flags(gfp_mask)), 2966 .gfp_mask = (gfp_mask = memalloc_noio_flags(gfp_mask)),
2967 .reclaim_idx = gfp_zone(gfp_mask),
2818 .order = order, 2968 .order = order,
2819 .nodemask = nodemask, 2969 .nodemask = nodemask,
2820 .priority = DEF_PRIORITY, 2970 .priority = DEF_PRIORITY,
@@ -2833,7 +2983,8 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
2833 2983
2834 trace_mm_vmscan_direct_reclaim_begin(order, 2984 trace_mm_vmscan_direct_reclaim_begin(order,
2835 sc.may_writepage, 2985 sc.may_writepage,
2836 gfp_mask); 2986 gfp_mask,
2987 sc.reclaim_idx);
2837 2988
2838 nr_reclaimed = do_try_to_free_pages(zonelist, &sc); 2989 nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
2839 2990
@@ -2844,9 +2995,9 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
2844 2995
2845#ifdef CONFIG_MEMCG 2996#ifdef CONFIG_MEMCG
2846 2997
2847unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg, 2998unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg,
2848 gfp_t gfp_mask, bool noswap, 2999 gfp_t gfp_mask, bool noswap,
2849 struct zone *zone, 3000 pg_data_t *pgdat,
2850 unsigned long *nr_scanned) 3001 unsigned long *nr_scanned)
2851{ 3002{
2852 struct scan_control sc = { 3003 struct scan_control sc = {
@@ -2854,6 +3005,7 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg,
2854 .target_mem_cgroup = memcg, 3005 .target_mem_cgroup = memcg,
2855 .may_writepage = !laptop_mode, 3006 .may_writepage = !laptop_mode,
2856 .may_unmap = 1, 3007 .may_unmap = 1,
3008 .reclaim_idx = MAX_NR_ZONES - 1,
2857 .may_swap = !noswap, 3009 .may_swap = !noswap,
2858 }; 3010 };
2859 unsigned long lru_pages; 3011 unsigned long lru_pages;
@@ -2863,16 +3015,17 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg,
2863 3015
2864 trace_mm_vmscan_memcg_softlimit_reclaim_begin(sc.order, 3016 trace_mm_vmscan_memcg_softlimit_reclaim_begin(sc.order,
2865 sc.may_writepage, 3017 sc.may_writepage,
2866 sc.gfp_mask); 3018 sc.gfp_mask,
3019 sc.reclaim_idx);
2867 3020
2868 /* 3021 /*
2869 * NOTE: Although we can get the priority field, using it 3022 * NOTE: Although we can get the priority field, using it
2870 * here is not a good idea, since it limits the pages we can scan. 3023 * here is not a good idea, since it limits the pages we can scan.
2871 * if we don't reclaim here, the shrink_zone from balance_pgdat 3024 * if we don't reclaim here, the shrink_node from balance_pgdat
2872 * will pick up pages from other mem cgroup's as well. We hack 3025 * will pick up pages from other mem cgroup's as well. We hack
2873 * the priority and make it zero. 3026 * the priority and make it zero.
2874 */ 3027 */
2875 shrink_zone_memcg(zone, memcg, &sc, &lru_pages); 3028 shrink_node_memcg(pgdat, memcg, &sc, &lru_pages);
2876 3029
2877 trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed); 3030 trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed);
2878 3031
@@ -2892,6 +3045,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
2892 .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX), 3045 .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX),
2893 .gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | 3046 .gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
2894 (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK), 3047 (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK),
3048 .reclaim_idx = MAX_NR_ZONES - 1,
2895 .target_mem_cgroup = memcg, 3049 .target_mem_cgroup = memcg,
2896 .priority = DEF_PRIORITY, 3050 .priority = DEF_PRIORITY,
2897 .may_writepage = !laptop_mode, 3051 .may_writepage = !laptop_mode,
@@ -2910,7 +3064,8 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
2910 3064
2911 trace_mm_vmscan_memcg_reclaim_begin(0, 3065 trace_mm_vmscan_memcg_reclaim_begin(0,
2912 sc.may_writepage, 3066 sc.may_writepage,
2913 sc.gfp_mask); 3067 sc.gfp_mask,
3068 sc.reclaim_idx);
2914 3069
2915 nr_reclaimed = do_try_to_free_pages(zonelist, &sc); 3070 nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
2916 3071
@@ -2920,7 +3075,8 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
2920} 3075}
2921#endif 3076#endif
2922 3077
2923static void age_active_anon(struct zone *zone, struct scan_control *sc) 3078static void age_active_anon(struct pglist_data *pgdat,
3079 struct scan_control *sc)
2924{ 3080{
2925 struct mem_cgroup *memcg; 3081 struct mem_cgroup *memcg;
2926 3082
@@ -2929,9 +3085,9 @@ static void age_active_anon(struct zone *zone, struct scan_control *sc)
2929 3085
2930 memcg = mem_cgroup_iter(NULL, NULL, NULL); 3086 memcg = mem_cgroup_iter(NULL, NULL, NULL);
2931 do { 3087 do {
2932 struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg); 3088 struct lruvec *lruvec = mem_cgroup_lruvec(pgdat, memcg);
2933 3089
2934 if (inactive_list_is_low(lruvec, false)) 3090 if (inactive_list_is_low(lruvec, false, sc))
2935 shrink_active_list(SWAP_CLUSTER_MAX, lruvec, 3091 shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
2936 sc, LRU_ACTIVE_ANON); 3092 sc, LRU_ACTIVE_ANON);
2937 3093
@@ -2939,82 +3095,21 @@ static void age_active_anon(struct zone *zone, struct scan_control *sc)
2939 } while (memcg); 3095 } while (memcg);
2940} 3096}
2941 3097
2942static bool zone_balanced(struct zone *zone, int order, bool highorder, 3098static bool zone_balanced(struct zone *zone, int order, int classzone_idx)
2943 unsigned long balance_gap, int classzone_idx)
2944{ 3099{
2945 unsigned long mark = high_wmark_pages(zone) + balance_gap; 3100 unsigned long mark = high_wmark_pages(zone);
3101
3102 if (!zone_watermark_ok_safe(zone, order, mark, classzone_idx))
3103 return false;
2946 3104
2947 /* 3105 /*
2948 * When checking from pgdat_balanced(), kswapd should stop and sleep 3106 * If any eligible zone is balanced then the node is not considered
2949 * when it reaches the high order-0 watermark and let kcompactd take 3107 * to be congested or dirty
2950 * over. Other callers such as wakeup_kswapd() want to determine the
2951 * true high-order watermark.
2952 */ 3108 */
2953 if (IS_ENABLED(CONFIG_COMPACTION) && !highorder) { 3109 clear_bit(PGDAT_CONGESTED, &zone->zone_pgdat->flags);
2954 mark += (1UL << order); 3110 clear_bit(PGDAT_DIRTY, &zone->zone_pgdat->flags);
2955 order = 0;
2956 }
2957
2958 return zone_watermark_ok_safe(zone, order, mark, classzone_idx);
2959}
2960
2961/*
2962 * pgdat_balanced() is used when checking if a node is balanced.
2963 *
2964 * For order-0, all zones must be balanced!
2965 *
2966 * For high-order allocations only zones that meet watermarks and are in a
2967 * zone allowed by the callers classzone_idx are added to balanced_pages. The
2968 * total of balanced pages must be at least 25% of the zones allowed by
2969 * classzone_idx for the node to be considered balanced. Forcing all zones to
2970 * be balanced for high orders can cause excessive reclaim when there are
2971 * imbalanced zones.
2972 * The choice of 25% is due to
2973 * o a 16M DMA zone that is balanced will not balance a zone on any
2974 * reasonable sized machine
2975 * o On all other machines, the top zone must be at least a reasonable
2976 * percentage of the middle zones. For example, on 32-bit x86, highmem
2977 * would need to be at least 256M for it to be balance a whole node.
2978 * Similarly, on x86-64 the Normal zone would need to be at least 1G
2979 * to balance a node on its own. These seemed like reasonable ratios.
2980 */
2981static bool pgdat_balanced(pg_data_t *pgdat, int order, int classzone_idx)
2982{
2983 unsigned long managed_pages = 0;
2984 unsigned long balanced_pages = 0;
2985 int i;
2986
2987 /* Check the watermark levels */
2988 for (i = 0; i <= classzone_idx; i++) {
2989 struct zone *zone = pgdat->node_zones + i;
2990
2991 if (!populated_zone(zone))
2992 continue;
2993
2994 managed_pages += zone->managed_pages;
2995
2996 /*
2997 * A special case here:
2998 *
2999 * balance_pgdat() skips over all_unreclaimable after
3000 * DEF_PRIORITY. Effectively, it considers them balanced so
3001 * they must be considered balanced here as well!
3002 */
3003 if (!zone_reclaimable(zone)) {
3004 balanced_pages += zone->managed_pages;
3005 continue;
3006 }
3007 3111
3008 if (zone_balanced(zone, order, false, 0, i)) 3112 return true;
3009 balanced_pages += zone->managed_pages;
3010 else if (!order)
3011 return false;
3012 }
3013
3014 if (order)
3015 return balanced_pages >= (managed_pages >> 2);
3016 else
3017 return true;
3018} 3113}
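
zone_balanced() now reduces to a plain high-watermark test, and a single balanced eligible zone is enough to clear the node-wide congested/dirty hints; the old 25% pgdat_balanced() rule is gone. A minimal model of that behaviour, ignoring the order/classzone handling of zone_watermark_ok_safe() and using simplified flag bits:

/*
 * Model of the new zone_balanced() semantics.  The flag bits and the
 * watermark test are simplified placeholders, not kernel definitions.
 */
#include <stdbool.h>
#include <stdio.h>

#define PGDAT_CONGESTED	(1UL << 0)
#define PGDAT_DIRTY	(1UL << 1)

struct pgdat_model {
	unsigned long flags;
};

static bool zone_balanced(unsigned long free_pages, unsigned long high_wmark,
			  struct pgdat_model *pgdat)
{
	if (free_pages < high_wmark)
		return false;

	/* One balanced eligible zone clears the node-wide hints. */
	pgdat->flags &= ~(PGDAT_CONGESTED | PGDAT_DIRTY);
	return true;
}

int main(void)
{
	struct pgdat_model pgdat = { .flags = PGDAT_CONGESTED | PGDAT_DIRTY };
	bool ok;

	ok = zone_balanced(100, 512, &pgdat);
	printf("balanced: %d flags: %lx\n", ok, pgdat.flags);	/* 0, 3 */

	ok = zone_balanced(1000, 512, &pgdat);
	printf("balanced: %d flags: %lx\n", ok, pgdat.flags);	/* 1, 0 */
	return 0;
}
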
3019 3114
3020/* 3115/*
@@ -3023,12 +3118,9 @@ static bool pgdat_balanced(pg_data_t *pgdat, int order, int classzone_idx)
3023 * 3118 *
3024 * Returns true if kswapd is ready to sleep 3119 * Returns true if kswapd is ready to sleep
3025 */ 3120 */
3026static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining, 3121static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, int classzone_idx)
3027 int classzone_idx)
3028{ 3122{
3029 /* If a direct reclaimer woke kswapd within HZ/10, it's premature */ 3123 int i;
3030 if (remaining)
3031 return false;
3032 3124
3033 /* 3125 /*
3034 * The throttled processes are normally woken up in balance_pgdat() as 3126 * The throttled processes are normally woken up in balance_pgdat() as
@@ -3046,91 +3138,81 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining,
3046 if (waitqueue_active(&pgdat->pfmemalloc_wait)) 3138 if (waitqueue_active(&pgdat->pfmemalloc_wait))
3047 wake_up_all(&pgdat->pfmemalloc_wait); 3139 wake_up_all(&pgdat->pfmemalloc_wait);
3048 3140
3049 return pgdat_balanced(pgdat, order, classzone_idx); 3141 for (i = 0; i <= classzone_idx; i++) {
3142 struct zone *zone = pgdat->node_zones + i;
3143
3144 if (!populated_zone(zone))
3145 continue;
3146
3147 if (!zone_balanced(zone, order, classzone_idx))
3148 return false;
3149 }
3150
3151 return true;
3050} 3152}
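
Instead of delegating to pgdat_balanced(), kswapd may now sleep only when every populated zone up to and including classzone_idx passes zone_balanced(). A small standalone model of the sleep check (the zone array and balanced flags are illustrative, not pgdat state):

/*
 * Model of the all-eligible-zones-balanced sleep check.
 */
#include <stdbool.h>
#include <stdio.h>

struct zone_model {
	bool populated;
	bool balanced;		/* stand-in for zone_balanced() */
};

static bool prepare_kswapd_sleep(const struct zone_model *zones,
				 int classzone_idx)
{
	for (int i = 0; i <= classzone_idx; i++) {
		if (!zones[i].populated)
			continue;
		if (!zones[i].balanced)
			return false;
	}
	return true;
}

int main(void)
{
	struct zone_model zones[] = {
		{ true, true }, { false, false }, { true, false }, { true, true },
	};

	printf("can sleep up to zone 0: %d\n", prepare_kswapd_sleep(zones, 0));	/* 1 */
	printf("can sleep up to zone 2: %d\n", prepare_kswapd_sleep(zones, 2));	/* 0 */
	return 0;
}
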
3051 3153
3052/* 3154/*
3053 * kswapd shrinks the zone by the number of pages required to reach 3155 * kswapd shrinks a node of pages that are at or below the highest usable
3054 * the high watermark. 3156 * zone that is currently unbalanced.
3055 * 3157 *
3056 * Returns true if kswapd scanned at least the requested number of pages to 3158 * Returns true if kswapd scanned at least the requested number of pages to
3057 * reclaim or if the lack of progress was due to pages under writeback. 3159 * reclaim or if the lack of progress was due to pages under writeback.
3058 * This is used to determine if the scanning priority needs to be raised. 3160 * This is used to determine if the scanning priority needs to be raised.
3059 */ 3161 */
3060static bool kswapd_shrink_zone(struct zone *zone, 3162static bool kswapd_shrink_node(pg_data_t *pgdat,
3061 int classzone_idx,
3062 struct scan_control *sc) 3163 struct scan_control *sc)
3063{ 3164{
3064 unsigned long balance_gap; 3165 struct zone *zone;
3065 bool lowmem_pressure; 3166 int z;
3066 3167
3067 /* Reclaim above the high watermark. */ 3168 /* Reclaim a number of pages proportional to the number of zones */
3068 sc->nr_to_reclaim = max(SWAP_CLUSTER_MAX, high_wmark_pages(zone)); 3169 sc->nr_to_reclaim = 0;
3170 for (z = 0; z <= sc->reclaim_idx; z++) {
3171 zone = pgdat->node_zones + z;
3172 if (!populated_zone(zone))
3173 continue;
3069 3174
3070 /* 3175 sc->nr_to_reclaim += max(high_wmark_pages(zone), SWAP_CLUSTER_MAX);
3071 * We put equal pressure on every zone, unless one zone has way too 3176 }
3072 * many pages free already. The "too many pages" is defined as the
3073 * high wmark plus a "gap" where the gap is either the low
3074 * watermark or 1% of the zone, whichever is smaller.
3075 */
3076 balance_gap = min(low_wmark_pages(zone), DIV_ROUND_UP(
3077 zone->managed_pages, KSWAPD_ZONE_BALANCE_GAP_RATIO));
3078 3177
3079 /* 3178 /*
3080 * If there is no low memory pressure or the zone is balanced then no 3179 * Historically care was taken to put equal pressure on all zones but
3081 * reclaim is necessary 3180 * now pressure is applied based on node LRU order.
3082 */ 3181 */
3083 lowmem_pressure = (buffer_heads_over_limit && is_highmem(zone)); 3182 shrink_node(pgdat, sc);
3084 if (!lowmem_pressure && zone_balanced(zone, sc->order, false,
3085 balance_gap, classzone_idx))
3086 return true;
3087
3088 shrink_zone(zone, sc, zone_idx(zone) == classzone_idx);
3089
3090 clear_bit(ZONE_WRITEBACK, &zone->flags);
3091 3183
3092 /* 3184 /*
3093 * If a zone reaches its high watermark, consider it to be no longer 3185 * Fragmentation may mean that the system cannot be rebalanced for
3094 * congested. It's possible there are dirty pages backed by congested 3186 * high-order allocations. If twice the allocation size has been
 3095 * BDIs but as pressure is relieved, speculatively avoid congestion 3187 * excessive reclaim. Assume that a process that requested a high-order
 3096 * waits. 3188 * allocation can direct reclaim/compact.
 3095 * BDIs but as pressure is relieved, speculatively avoid congestion 3187 * excessive reclaim. Assume that a process that requested a high-order
 3096 * waits. 3188 * allocation can direct reclaim/compact.
3189 * can direct reclaim/compact.
3097 */ 3190 */
3098 if (zone_reclaimable(zone) && 3191 if (sc->order && sc->nr_reclaimed >= 2UL << sc->order)
3099 zone_balanced(zone, sc->order, false, 0, classzone_idx)) { 3192 sc->order = 0;
3100 clear_bit(ZONE_CONGESTED, &zone->flags);
3101 clear_bit(ZONE_DIRTY, &zone->flags);
3102 }
3103 3193
3104 return sc->nr_scanned >= sc->nr_to_reclaim; 3194 return sc->nr_scanned >= sc->nr_to_reclaim;
3105} 3195}
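
kswapd_shrink_node() sizes its reclaim target from the eligible zones' high watermarks and, once twice the requested allocation has been reclaimed, falls back to order-0 to avoid over-reclaim for fragmented high-order requests. Both decisions are modelled below with assumed values; SWAP_CLUSTER_MAX and the zone numbers are stand-ins:

/*
 * Model of the two decisions in kswapd_shrink_node().
 */
#include <stdbool.h>
#include <stdio.h>

#define SWAP_CLUSTER_MAX 32UL

struct zone_model {
	bool populated;
	unsigned long high_wmark;
};

static unsigned long max_ul(unsigned long a, unsigned long b)
{
	return a > b ? a : b;
}

int main(void)
{
	struct zone_model zones[] = { { true, 16 }, { true, 4096 }, { false, 0 } };
	int reclaim_idx = 2;
	unsigned long nr_to_reclaim = 0;

	/* Reclaim a number of pages proportional to the eligible zones. */
	for (int z = 0; z <= reclaim_idx; z++) {
		if (!zones[z].populated)
			continue;
		nr_to_reclaim += max_ul(zones[z].high_wmark, SWAP_CLUSTER_MAX);
	}
	printf("nr_to_reclaim = %lu\n", nr_to_reclaim);	/* 32 + 4096 = 4128 */

	/*
	 * High-order fallback: once 2 << order pages are reclaimed,
	 * further balancing only needs to satisfy order-0 watermarks.
	 */
	unsigned int order = 4;
	unsigned long nr_reclaimed = 40;

	if (order && nr_reclaimed >= (2UL << order))
		order = 0;
	printf("order after check = %u\n", order);	/* 32 <= 40, so 0 */
	return 0;
}
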
3106 3196
3107/* 3197/*
3108 * For kswapd, balance_pgdat() will work across all this node's zones until 3198 * For kswapd, balance_pgdat() will reclaim pages across a node from zones
3109 * they are all at high_wmark_pages(zone). 3199 * that are eligible for use by the caller until at least one zone is
3200 * balanced.
3110 * 3201 *
3111 * Returns the highest zone idx kswapd was reclaiming at 3202 * Returns the order kswapd finished reclaiming at.
3112 *
3113 * There is special handling here for zones which are full of pinned pages.
3114 * This can happen if the pages are all mlocked, or if they are all used by
3115 * device drivers (say, ZONE_DMA). Or if they are all in use by hugetlb.
3116 * What we do is to detect the case where all pages in the zone have been
3117 * scanned twice and there has been zero successful reclaim. Mark the zone as
3118 * dead and from now on, only perform a short scan. Basically we're polling
3119 * the zone for when the problem goes away.
3120 * 3203 *
3121 * kswapd scans the zones in the highmem->normal->dma direction. It skips 3204 * kswapd scans the zones in the highmem->normal->dma direction. It skips
 3122 * zones which have free_pages > high_wmark_pages(zone), but once a zone is 3205 * found to have free_pages <= high_wmark_pages(zone), any page in that zone
3123 * found to have free_pages <= high_wmark_pages(zone), we scan that zone and the 3206 * found to have free_pages <= high_wmark_pages(zone), any page is that zone
3124 * lower zones regardless of the number of free pages in the lower zones. This 3207 * or lower is eligible for reclaim until at least one usable zone is
3125 * interoperates with the page allocator fallback scheme to ensure that aging 3208 * balanced.
3126 * of pages is balanced across the zones.
3127 */ 3209 */
3128static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx) 3210static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
3129{ 3211{
3130 int i; 3212 int i;
3131 int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */
3132 unsigned long nr_soft_reclaimed; 3213 unsigned long nr_soft_reclaimed;
3133 unsigned long nr_soft_scanned; 3214 unsigned long nr_soft_scanned;
3215 struct zone *zone;
3134 struct scan_control sc = { 3216 struct scan_control sc = {
3135 .gfp_mask = GFP_KERNEL, 3217 .gfp_mask = GFP_KERNEL,
3136 .order = order, 3218 .order = order,
@@ -3145,100 +3227,77 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
3145 bool raise_priority = true; 3227 bool raise_priority = true;
3146 3228
3147 sc.nr_reclaimed = 0; 3229 sc.nr_reclaimed = 0;
3230 sc.reclaim_idx = classzone_idx;
3148 3231
3149 /* 3232 /*
3150 * Scan in the highmem->dma direction for the highest 3233 * If the number of buffer_heads exceeds the maximum allowed
3151 * zone which needs scanning 3234 * then consider reclaiming from all zones. This has a dual
3235 * purpose -- on 64-bit systems it is expected that
3236 * buffer_heads are stripped during active rotation. On 32-bit
3237 * systems, highmem pages can pin lowmem memory and shrinking
3238 * buffers can relieve lowmem pressure. Reclaim may still not
3239 * go ahead if all eligible zones for the original allocation
3240 * request are balanced to avoid excessive reclaim from kswapd.
3152 */ 3241 */
3153 for (i = pgdat->nr_zones - 1; i >= 0; i--) { 3242 if (buffer_heads_over_limit) {
3154 struct zone *zone = pgdat->node_zones + i; 3243 for (i = MAX_NR_ZONES - 1; i >= 0; i--) {
3155 3244 zone = pgdat->node_zones + i;
3156 if (!populated_zone(zone)) 3245 if (!populated_zone(zone))
3157 continue; 3246 continue;
3158
3159 if (sc.priority != DEF_PRIORITY &&
3160 !zone_reclaimable(zone))
3161 continue;
3162
3163 /*
3164 * Do some background aging of the anon list, to give
3165 * pages a chance to be referenced before reclaiming.
3166 */
3167 age_active_anon(zone, &sc);
3168 3247
3169 /* 3248 sc.reclaim_idx = i;
3170 * If the number of buffer_heads in the machine
3171 * exceeds the maximum allowed level and this node
3172 * has a highmem zone, force kswapd to reclaim from
3173 * it to relieve lowmem pressure.
3174 */
3175 if (buffer_heads_over_limit && is_highmem_idx(i)) {
3176 end_zone = i;
3177 break; 3249 break;
3178 } 3250 }
3251 }
3179 3252
3180 if (!zone_balanced(zone, order, false, 0, 0)) { 3253 /*
3181 end_zone = i; 3254 * Only reclaim if there are no eligible zones. Check from
3182 break; 3255 * high to low zone as allocations prefer higher zones.
3183 } else { 3256 * Scanning from low to high zone would allow congestion to be
3184 /* 3257 * cleared during a very small window when a small low
3185 * If balanced, clear the dirty and congested 3258 * zone was balanced even under extreme pressure when the
3186 * flags 3259 * overall node may be congested. Note that sc.reclaim_idx
3187 */ 3260 * is not used as buffer_heads_over_limit may have adjusted
3188 clear_bit(ZONE_CONGESTED, &zone->flags); 3261 * it.
3189 clear_bit(ZONE_DIRTY, &zone->flags); 3262 */
3190 } 3263 for (i = classzone_idx; i >= 0; i--) {
3264 zone = pgdat->node_zones + i;
3265 if (!populated_zone(zone))
3266 continue;
3267
3268 if (zone_balanced(zone, sc.order, classzone_idx))
3269 goto out;
3191 } 3270 }
3192 3271
3193 if (i < 0) 3272 /*
3194 goto out; 3273 * Do some background aging of the anon list, to give
3274 * pages a chance to be referenced before reclaiming. All
3275 * pages are rotated regardless of classzone as this is
3276 * about consistent aging.
3277 */
3278 age_active_anon(pgdat, &sc);
3195 3279
3196 /* 3280 /*
3197 * If we're getting trouble reclaiming, start doing writepage 3281 * If we're getting trouble reclaiming, start doing writepage
3198 * even in laptop mode. 3282 * even in laptop mode.
3199 */ 3283 */
3200 if (sc.priority < DEF_PRIORITY - 2) 3284 if (sc.priority < DEF_PRIORITY - 2 || !pgdat_reclaimable(pgdat))
3201 sc.may_writepage = 1; 3285 sc.may_writepage = 1;
3202 3286
3287 /* Call soft limit reclaim before calling shrink_node. */
3288 sc.nr_scanned = 0;
3289 nr_soft_scanned = 0;
3290 nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(pgdat, sc.order,
3291 sc.gfp_mask, &nr_soft_scanned);
3292 sc.nr_reclaimed += nr_soft_reclaimed;
3293
3203 /* 3294 /*
3204 * Now scan the zone in the dma->highmem direction, stopping 3295 * There should be no need to raise the scanning priority if
3205 * at the last zone which needs scanning. 3296 * enough pages are already being scanned that that high
3206 * 3297 * watermark would be met at 100% efficiency.
3207 * We do this because the page allocator works in the opposite
3208 * direction. This prevents the page allocator from allocating
3209 * pages behind kswapd's direction of progress, which would
3210 * cause too much scanning of the lower zones.
3211 */ 3298 */
3212 for (i = 0; i <= end_zone; i++) { 3299 if (kswapd_shrink_node(pgdat, &sc))
3213 struct zone *zone = pgdat->node_zones + i; 3300 raise_priority = false;
3214
3215 if (!populated_zone(zone))
3216 continue;
3217
3218 if (sc.priority != DEF_PRIORITY &&
3219 !zone_reclaimable(zone))
3220 continue;
3221
3222 sc.nr_scanned = 0;
3223
3224 nr_soft_scanned = 0;
3225 /*
3226 * Call soft limit reclaim before calling shrink_zone.
3227 */
3228 nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone,
3229 order, sc.gfp_mask,
3230 &nr_soft_scanned);
3231 sc.nr_reclaimed += nr_soft_reclaimed;
3232
3233 /*
3234 * There should be no need to raise the scanning
3235 * priority if enough pages are already being scanned
3236 * that that high watermark would be met at 100%
3237 * efficiency.
3238 */
3239 if (kswapd_shrink_zone(zone, end_zone, &sc))
3240 raise_priority = false;
3241 }
3242 3301
3243 /* 3302 /*
3244 * If the low watermark is met there is no need for processes 3303 * If the low watermark is met there is no need for processes
@@ -3259,19 +3318,20 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
3259 */ 3318 */
3260 if (raise_priority || !sc.nr_reclaimed) 3319 if (raise_priority || !sc.nr_reclaimed)
3261 sc.priority--; 3320 sc.priority--;
3262 } while (sc.priority >= 1 && 3321 } while (sc.priority >= 1);
3263 !pgdat_balanced(pgdat, order, classzone_idx));
3264 3322
3265out: 3323out:
3266 /* 3324 /*
3267 * Return the highest zone idx we were reclaiming at so 3325 * Return the order kswapd stopped reclaiming at as
3268 * prepare_kswapd_sleep() makes the same decisions as here. 3326 * prepare_kswapd_sleep() takes it into account. If another caller
3327 * entered the allocator slow path while kswapd was awake, order will
3328 * remain at the higher level.
3269 */ 3329 */
3270 return end_zone; 3330 return sc.order;
3271} 3331}
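
Note the asymmetry with the sleep check earlier: balance_pgdat() scans eligible zones from highest to lowest and stops reclaiming as soon as any one of them is balanced, whereas going to sleep requires all of them to be. A compact model of the early-exit test, with illustrative zone data:

/*
 * Model of the early-exit check in balance_pgdat().
 */
#include <stdbool.h>
#include <stdio.h>

struct zone_model {
	bool populated;
	bool balanced;
};

static bool node_needs_reclaim(const struct zone_model *zones,
			       int classzone_idx)
{
	/* Check from high to low zone, as allocations prefer higher zones. */
	for (int i = classzone_idx; i >= 0; i--) {
		if (!zones[i].populated)
			continue;
		if (zones[i].balanced)
			return false;	/* one balanced zone is enough */
	}
	return true;
}

int main(void)
{
	struct zone_model zones[] = { { true, false }, { true, true }, { true, false } };

	printf("needs reclaim (classzone 2): %d\n", node_needs_reclaim(zones, 2));	/* 0 */
	printf("needs reclaim (classzone 0): %d\n", node_needs_reclaim(zones, 0));	/* 1 */
	return 0;
}
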
3272 3332
3273static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, 3333static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_order,
3274 int classzone_idx, int balanced_classzone_idx) 3334 unsigned int classzone_idx)
3275{ 3335{
3276 long remaining = 0; 3336 long remaining = 0;
3277 DEFINE_WAIT(wait); 3337 DEFINE_WAIT(wait);
@@ -3282,8 +3342,7 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order,
3282 prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); 3342 prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
3283 3343
3284 /* Try to sleep for a short interval */ 3344 /* Try to sleep for a short interval */
3285 if (prepare_kswapd_sleep(pgdat, order, remaining, 3345 if (prepare_kswapd_sleep(pgdat, reclaim_order, classzone_idx)) {
3286 balanced_classzone_idx)) {
3287 /* 3346 /*
3288 * Compaction records what page blocks it recently failed to 3347 * Compaction records what page blocks it recently failed to
3289 * isolate pages from and skips them in the future scanning. 3348 * isolate pages from and skips them in the future scanning.
@@ -3296,9 +3355,20 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order,
3296 * We have freed the memory, now we should compact it to make 3355 * We have freed the memory, now we should compact it to make
3297 * allocation of the requested order possible. 3356 * allocation of the requested order possible.
3298 */ 3357 */
3299 wakeup_kcompactd(pgdat, order, classzone_idx); 3358 wakeup_kcompactd(pgdat, alloc_order, classzone_idx);
3300 3359
3301 remaining = schedule_timeout(HZ/10); 3360 remaining = schedule_timeout(HZ/10);
3361
3362 /*
3363 * If woken prematurely then reset kswapd_classzone_idx and
3364 * order. The values will either be from a wakeup request or
3365 * the previous request that slept prematurely.
3366 */
3367 if (remaining) {
3368 pgdat->kswapd_classzone_idx = max(pgdat->kswapd_classzone_idx, classzone_idx);
3369 pgdat->kswapd_order = max(pgdat->kswapd_order, reclaim_order);
3370 }
3371
3302 finish_wait(&pgdat->kswapd_wait, &wait); 3372 finish_wait(&pgdat->kswapd_wait, &wait);
3303 prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); 3373 prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
3304 } 3374 }
@@ -3307,8 +3377,8 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order,
3307 * After a short sleep, check if it was a premature sleep. If not, then 3377 * After a short sleep, check if it was a premature sleep. If not, then
3308 * go fully to sleep until explicitly woken up. 3378 * go fully to sleep until explicitly woken up.
3309 */ 3379 */
3310 if (prepare_kswapd_sleep(pgdat, order, remaining, 3380 if (!remaining &&
3311 balanced_classzone_idx)) { 3381 prepare_kswapd_sleep(pgdat, reclaim_order, classzone_idx)) {
3312 trace_mm_vmscan_kswapd_sleep(pgdat->node_id); 3382 trace_mm_vmscan_kswapd_sleep(pgdat->node_id);
3313 3383
3314 /* 3384 /*
@@ -3349,9 +3419,7 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order,
3349 */ 3419 */
3350static int kswapd(void *p) 3420static int kswapd(void *p)
3351{ 3421{
3352 unsigned long order, new_order; 3422 unsigned int alloc_order, reclaim_order, classzone_idx;
3353 int classzone_idx, new_classzone_idx;
3354 int balanced_classzone_idx;
3355 pg_data_t *pgdat = (pg_data_t*)p; 3423 pg_data_t *pgdat = (pg_data_t*)p;
3356 struct task_struct *tsk = current; 3424 struct task_struct *tsk = current;
3357 3425
@@ -3381,38 +3449,20 @@ static int kswapd(void *p)
3381 tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD; 3449 tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD;
3382 set_freezable(); 3450 set_freezable();
3383 3451
3384 order = new_order = 0; 3452 pgdat->kswapd_order = alloc_order = reclaim_order = 0;
3385 classzone_idx = new_classzone_idx = pgdat->nr_zones - 1; 3453 pgdat->kswapd_classzone_idx = classzone_idx = 0;
3386 balanced_classzone_idx = classzone_idx;
3387 for ( ; ; ) { 3454 for ( ; ; ) {
3388 bool ret; 3455 bool ret;
3389 3456
3390 /* 3457kswapd_try_sleep:
3391 * While we were reclaiming, there might have been another 3458 kswapd_try_to_sleep(pgdat, alloc_order, reclaim_order,
3392 * wakeup, so check the values. 3459 classzone_idx);
3393 */
3394 new_order = pgdat->kswapd_max_order;
3395 new_classzone_idx = pgdat->classzone_idx;
3396 pgdat->kswapd_max_order = 0;
3397 pgdat->classzone_idx = pgdat->nr_zones - 1;
3398 3460
3399 if (order < new_order || classzone_idx > new_classzone_idx) { 3461 /* Read the new order and classzone_idx */
3400 /* 3462 alloc_order = reclaim_order = pgdat->kswapd_order;
3401 * Don't sleep if someone wants a larger 'order' 3463 classzone_idx = pgdat->kswapd_classzone_idx;
3402 * allocation or has tigher zone constraints 3464 pgdat->kswapd_order = 0;
3403 */ 3465 pgdat->kswapd_classzone_idx = 0;
3404 order = new_order;
3405 classzone_idx = new_classzone_idx;
3406 } else {
3407 kswapd_try_to_sleep(pgdat, order, classzone_idx,
3408 balanced_classzone_idx);
3409 order = pgdat->kswapd_max_order;
3410 classzone_idx = pgdat->classzone_idx;
3411 new_order = order;
3412 new_classzone_idx = classzone_idx;
3413 pgdat->kswapd_max_order = 0;
3414 pgdat->classzone_idx = pgdat->nr_zones - 1;
3415 }
3416 3466
3417 ret = try_to_freeze(); 3467 ret = try_to_freeze();
3418 if (kthread_should_stop()) 3468 if (kthread_should_stop())
@@ -3422,11 +3472,25 @@ static int kswapd(void *p)
3422 * We can speed up thawing tasks if we don't call balance_pgdat 3472 * We can speed up thawing tasks if we don't call balance_pgdat
3423 * after returning from the refrigerator 3473 * after returning from the refrigerator
3424 */ 3474 */
3425 if (!ret) { 3475 if (ret)
3426 trace_mm_vmscan_kswapd_wake(pgdat->node_id, order); 3476 continue;
3427 balanced_classzone_idx = balance_pgdat(pgdat, order, 3477
3428 classzone_idx); 3478 /*
3429 } 3479 * Reclaim begins at the requested order but if a high-order
3480 * reclaim fails then kswapd falls back to reclaiming for
3481 * order-0. If that happens, kswapd will consider sleeping
3482 * for the order it finished reclaiming at (reclaim_order)
3483 * but kcompactd is woken to compact for the original
3484 * request (alloc_order).
3485 */
3486 trace_mm_vmscan_kswapd_wake(pgdat->node_id, classzone_idx,
3487 alloc_order);
3488 reclaim_order = balance_pgdat(pgdat, alloc_order, classzone_idx);
3489 if (reclaim_order < alloc_order)
3490 goto kswapd_try_sleep;
3491
3492 alloc_order = reclaim_order = pgdat->kswapd_order;
3493 classzone_idx = pgdat->kswapd_classzone_idx;
3430 } 3494 }
3431 3495
3432 tsk->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD); 3496 tsk->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD);
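
The main loop now tracks two orders: the order the waker asked for (alloc_order) and the order reclaim actually finished at (reclaim_order). If high-order reclaim falls back to order-0, kswapd considers sleeping based on the lower order while kcompactd is woken for the original request. A toy model of that bookkeeping, where balance() is a stub standing in for balance_pgdat():

/*
 * Model of the alloc_order/reclaim_order bookkeeping in the kswapd loop.
 */
#include <stdio.h>

/* Pretend reclaim: high-order requests fail and fall back to order-0. */
static unsigned int balance(unsigned int alloc_order)
{
	return alloc_order > 3 ? 0 : alloc_order;
}

int main(void)
{
	unsigned int requests[] = { 0, 2, 9 };

	for (int i = 0; i < 3; i++) {
		unsigned int alloc_order = requests[i];
		unsigned int reclaim_order = balance(alloc_order);

		if (reclaim_order < alloc_order)
			printf("order %u: sleep at order %u, kcompactd woken for order %u\n",
			       alloc_order, reclaim_order, alloc_order);
		else
			printf("order %u: balanced at requested order\n",
			       alloc_order);
	}
	return 0;
}
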
@@ -3442,6 +3506,7 @@ static int kswapd(void *p)
3442void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx) 3506void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx)
3443{ 3507{
3444 pg_data_t *pgdat; 3508 pg_data_t *pgdat;
3509 int z;
3445 3510
3446 if (!populated_zone(zone)) 3511 if (!populated_zone(zone))
3447 return; 3512 return;
@@ -3449,14 +3514,20 @@ void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx)
3449 if (!cpuset_zone_allowed(zone, GFP_KERNEL | __GFP_HARDWALL)) 3514 if (!cpuset_zone_allowed(zone, GFP_KERNEL | __GFP_HARDWALL))
3450 return; 3515 return;
3451 pgdat = zone->zone_pgdat; 3516 pgdat = zone->zone_pgdat;
3452 if (pgdat->kswapd_max_order < order) { 3517 pgdat->kswapd_classzone_idx = max(pgdat->kswapd_classzone_idx, classzone_idx);
3453 pgdat->kswapd_max_order = order; 3518 pgdat->kswapd_order = max(pgdat->kswapd_order, order);
3454 pgdat->classzone_idx = min(pgdat->classzone_idx, classzone_idx);
3455 }
3456 if (!waitqueue_active(&pgdat->kswapd_wait)) 3519 if (!waitqueue_active(&pgdat->kswapd_wait))
3457 return; 3520 return;
3458 if (zone_balanced(zone, order, true, 0, 0)) 3521
3459 return; 3522 /* Only wake kswapd if all zones are unbalanced */
3523 for (z = 0; z <= classzone_idx; z++) {
3524 zone = pgdat->node_zones + z;
3525 if (!populated_zone(zone))
3526 continue;
3527
3528 if (zone_balanced(zone, order, classzone_idx))
3529 return;
3530 }
3460 3531
3461 trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order); 3532 trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order);
3462 wake_up_interruptible(&pgdat->kswapd_wait); 3533 wake_up_interruptible(&pgdat->kswapd_wait);
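
wakeup_kswapd() now records the largest pending order and classzone_idx on the node but only issues the wakeup when no eligible zone is already balanced. A simplified model with stand-in structures:

/*
 * Model of the wakeup filter: record the pending request, then skip
 * the wakeup if any eligible zone is still balanced.
 */
#include <stdbool.h>
#include <stdio.h>

struct zone_model {
	bool populated;
	bool balanced;
};

struct pgdat_model {
	unsigned int kswapd_order;
	int kswapd_classzone_idx;
};

static bool wakeup_kswapd(struct pgdat_model *pgdat,
			  const struct zone_model *zones,
			  unsigned int order, int classzone_idx)
{
	if (classzone_idx > pgdat->kswapd_classzone_idx)
		pgdat->kswapd_classzone_idx = classzone_idx;
	if (order > pgdat->kswapd_order)
		pgdat->kswapd_order = order;

	for (int z = 0; z <= classzone_idx; z++) {
		if (!zones[z].populated)
			continue;
		if (zones[z].balanced)
			return false;	/* something is still usable: skip wakeup */
	}
	return true;
}

int main(void)
{
	struct zone_model zones[] = { { true, false }, { true, true }, { true, false } };
	struct pgdat_model pgdat = { 0, 0 };
	bool wake;

	wake = wakeup_kswapd(&pgdat, zones, 3, 1);
	printf("wake: %d (order=%u idx=%d)\n", wake,
	       pgdat.kswapd_order, pgdat.kswapd_classzone_idx);	/* 0, 3, 1 */

	wake = wakeup_kswapd(&pgdat, zones, 1, 0);
	printf("wake: %d (order=%u idx=%d)\n", wake,
	       pgdat.kswapd_order, pgdat.kswapd_classzone_idx);	/* 1, 3, 1 */
	return 0;
}
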
@@ -3477,6 +3548,7 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
3477 struct scan_control sc = { 3548 struct scan_control sc = {
3478 .nr_to_reclaim = nr_to_reclaim, 3549 .nr_to_reclaim = nr_to_reclaim,
3479 .gfp_mask = GFP_HIGHUSER_MOVABLE, 3550 .gfp_mask = GFP_HIGHUSER_MOVABLE,
3551 .reclaim_idx = MAX_NR_ZONES - 1,
3480 .priority = DEF_PRIORITY, 3552 .priority = DEF_PRIORITY,
3481 .may_writepage = 1, 3553 .may_writepage = 1,
3482 .may_unmap = 1, 3554 .may_unmap = 1,
@@ -3578,12 +3650,12 @@ module_init(kswapd_init)
3578 3650
3579#ifdef CONFIG_NUMA 3651#ifdef CONFIG_NUMA
3580/* 3652/*
3581 * Zone reclaim mode 3653 * Node reclaim mode
3582 * 3654 *
3583 * If non-zero call zone_reclaim when the number of free pages falls below 3655 * If non-zero call node_reclaim when the number of free pages falls below
3584 * the watermarks. 3656 * the watermarks.
3585 */ 3657 */
3586int zone_reclaim_mode __read_mostly; 3658int node_reclaim_mode __read_mostly;
3587 3659
3588#define RECLAIM_OFF 0 3660#define RECLAIM_OFF 0
3589#define RECLAIM_ZONE (1<<0) /* Run shrink_inactive_list on the zone */ 3661#define RECLAIM_ZONE (1<<0) /* Run shrink_inactive_list on the zone */
@@ -3591,14 +3663,14 @@ int zone_reclaim_mode __read_mostly;
3591#define RECLAIM_UNMAP (1<<2) /* Unmap pages during reclaim */ 3663#define RECLAIM_UNMAP (1<<2) /* Unmap pages during reclaim */
3592 3664
3593/* 3665/*
3594 * Priority for ZONE_RECLAIM. This determines the fraction of pages 3666 * Priority for NODE_RECLAIM. This determines the fraction of pages
3595 * of a node considered for each zone_reclaim. 4 scans 1/16th of 3667 * of a node considered for each zone_reclaim. 4 scans 1/16th of
3596 * a zone. 3668 * a zone.
3597 */ 3669 */
3598#define ZONE_RECLAIM_PRIORITY 4 3670#define NODE_RECLAIM_PRIORITY 4
3599 3671
3600/* 3672/*
3601 * Percentage of pages in a zone that must be unmapped for zone_reclaim to 3673 * Percentage of pages in a zone that must be unmapped for node_reclaim to
3602 * occur. 3674 * occur.
3603 */ 3675 */
3604int sysctl_min_unmapped_ratio = 1; 3676int sysctl_min_unmapped_ratio = 1;
@@ -3609,11 +3681,11 @@ int sysctl_min_unmapped_ratio = 1;
3609 */ 3681 */
3610int sysctl_min_slab_ratio = 5; 3682int sysctl_min_slab_ratio = 5;
3611 3683
3612static inline unsigned long zone_unmapped_file_pages(struct zone *zone) 3684static inline unsigned long node_unmapped_file_pages(struct pglist_data *pgdat)
3613{ 3685{
3614 unsigned long file_mapped = zone_page_state(zone, NR_FILE_MAPPED); 3686 unsigned long file_mapped = node_page_state(pgdat, NR_FILE_MAPPED);
3615 unsigned long file_lru = zone_page_state(zone, NR_INACTIVE_FILE) + 3687 unsigned long file_lru = node_page_state(pgdat, NR_INACTIVE_FILE) +
3616 zone_page_state(zone, NR_ACTIVE_FILE); 3688 node_page_state(pgdat, NR_ACTIVE_FILE);
3617 3689
3618 /* 3690 /*
3619 * It's possible for there to be more file mapped pages than 3691 * It's possible for there to be more file mapped pages than
@@ -3624,7 +3696,7 @@ static inline unsigned long zone_unmapped_file_pages(struct zone *zone)
3624} 3696}
3625 3697
3626/* Work out how many page cache pages we can reclaim in this reclaim_mode */ 3698/* Work out how many page cache pages we can reclaim in this reclaim_mode */
3627static unsigned long zone_pagecache_reclaimable(struct zone *zone) 3699static unsigned long node_pagecache_reclaimable(struct pglist_data *pgdat)
3628{ 3700{
3629 unsigned long nr_pagecache_reclaimable; 3701 unsigned long nr_pagecache_reclaimable;
3630 unsigned long delta = 0; 3702 unsigned long delta = 0;
@@ -3632,17 +3704,17 @@ static unsigned long zone_pagecache_reclaimable(struct zone *zone)
3632 /* 3704 /*
3633 * If RECLAIM_UNMAP is set, then all file pages are considered 3705 * If RECLAIM_UNMAP is set, then all file pages are considered
3634 * potentially reclaimable. Otherwise, we have to worry about 3706 * potentially reclaimable. Otherwise, we have to worry about
3635 * pages like swapcache and zone_unmapped_file_pages() provides 3707 * pages like swapcache and node_unmapped_file_pages() provides
3636 * a better estimate 3708 * a better estimate
3637 */ 3709 */
3638 if (zone_reclaim_mode & RECLAIM_UNMAP) 3710 if (node_reclaim_mode & RECLAIM_UNMAP)
3639 nr_pagecache_reclaimable = zone_page_state(zone, NR_FILE_PAGES); 3711 nr_pagecache_reclaimable = node_page_state(pgdat, NR_FILE_PAGES);
3640 else 3712 else
3641 nr_pagecache_reclaimable = zone_unmapped_file_pages(zone); 3713 nr_pagecache_reclaimable = node_unmapped_file_pages(pgdat);
3642 3714
3643 /* If we can't clean pages, remove dirty pages from consideration */ 3715 /* If we can't clean pages, remove dirty pages from consideration */
3644 if (!(zone_reclaim_mode & RECLAIM_WRITE)) 3716 if (!(node_reclaim_mode & RECLAIM_WRITE))
3645 delta += zone_page_state(zone, NR_FILE_DIRTY); 3717 delta += node_page_state(pgdat, NR_FILE_DIRTY);
3646 3718
3647 /* Watch for any possible underflows due to delta */ 3719 /* Watch for any possible underflows due to delta */
3648 if (unlikely(delta > nr_pagecache_reclaimable)) 3720 if (unlikely(delta > nr_pagecache_reclaimable))
@@ -3652,22 +3724,24 @@ static unsigned long zone_pagecache_reclaimable(struct zone *zone)
3652} 3724}
3653 3725
3654/* 3726/*
3655 * Try to free up some pages from this zone through reclaim. 3727 * Try to free up some pages from this node through reclaim.
3656 */ 3728 */
3657static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) 3729static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order)
3658{ 3730{
3659 /* Minimum pages needed in order to stay on node */ 3731 /* Minimum pages needed in order to stay on node */
3660 const unsigned long nr_pages = 1 << order; 3732 const unsigned long nr_pages = 1 << order;
3661 struct task_struct *p = current; 3733 struct task_struct *p = current;
3662 struct reclaim_state reclaim_state; 3734 struct reclaim_state reclaim_state;
3735 int classzone_idx = gfp_zone(gfp_mask);
3663 struct scan_control sc = { 3736 struct scan_control sc = {
3664 .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX), 3737 .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX),
3665 .gfp_mask = (gfp_mask = memalloc_noio_flags(gfp_mask)), 3738 .gfp_mask = (gfp_mask = memalloc_noio_flags(gfp_mask)),
3666 .order = order, 3739 .order = order,
3667 .priority = ZONE_RECLAIM_PRIORITY, 3740 .priority = NODE_RECLAIM_PRIORITY,
3668 .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE), 3741 .may_writepage = !!(node_reclaim_mode & RECLAIM_WRITE),
3669 .may_unmap = !!(zone_reclaim_mode & RECLAIM_UNMAP), 3742 .may_unmap = !!(node_reclaim_mode & RECLAIM_UNMAP),
3670 .may_swap = 1, 3743 .may_swap = 1,
3744 .reclaim_idx = classzone_idx,
3671 }; 3745 };
3672 3746
3673 cond_resched(); 3747 cond_resched();
@@ -3681,13 +3755,13 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
3681 reclaim_state.reclaimed_slab = 0; 3755 reclaim_state.reclaimed_slab = 0;
3682 p->reclaim_state = &reclaim_state; 3756 p->reclaim_state = &reclaim_state;
3683 3757
3684 if (zone_pagecache_reclaimable(zone) > zone->min_unmapped_pages) { 3758 if (node_pagecache_reclaimable(pgdat) > pgdat->min_unmapped_pages) {
3685 /* 3759 /*
3686 * Free memory by calling shrink zone with increasing 3760 * Free memory by calling shrink zone with increasing
3687 * priorities until we have enough memory freed. 3761 * priorities until we have enough memory freed.
3688 */ 3762 */
3689 do { 3763 do {
3690 shrink_zone(zone, &sc, true); 3764 shrink_node(pgdat, &sc);
3691 } while (sc.nr_reclaimed < nr_pages && --sc.priority >= 0); 3765 } while (sc.nr_reclaimed < nr_pages && --sc.priority >= 0);
3692 } 3766 }
3693 3767
@@ -3697,49 +3771,47 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
3697 return sc.nr_reclaimed >= nr_pages; 3771 return sc.nr_reclaimed >= nr_pages;
3698} 3772}
3699 3773
3700int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) 3774int node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order)
3701{ 3775{
3702 int node_id;
3703 int ret; 3776 int ret;
3704 3777
3705 /* 3778 /*
3706 * Zone reclaim reclaims unmapped file backed pages and 3779 * Node reclaim reclaims unmapped file backed pages and
3707 * slab pages if we are over the defined limits. 3780 * slab pages if we are over the defined limits.
3708 * 3781 *
3709 * A small portion of unmapped file backed pages is needed for 3782 * A small portion of unmapped file backed pages is needed for
3710 * file I/O otherwise pages read by file I/O will be immediately 3783 * file I/O otherwise pages read by file I/O will be immediately
3711 * thrown out if the zone is overallocated. So we do not reclaim 3784 * thrown out if the node is overallocated. So we do not reclaim
3712 * if less than a specified percentage of the zone is used by 3785 * if less than a specified percentage of the node is used by
3713 * unmapped file backed pages. 3786 * unmapped file backed pages.
3714 */ 3787 */
3715 if (zone_pagecache_reclaimable(zone) <= zone->min_unmapped_pages && 3788 if (node_pagecache_reclaimable(pgdat) <= pgdat->min_unmapped_pages &&
3716 zone_page_state(zone, NR_SLAB_RECLAIMABLE) <= zone->min_slab_pages) 3789 sum_zone_node_page_state(pgdat->node_id, NR_SLAB_RECLAIMABLE) <= pgdat->min_slab_pages)
3717 return ZONE_RECLAIM_FULL; 3790 return NODE_RECLAIM_FULL;
3718 3791
3719 if (!zone_reclaimable(zone)) 3792 if (!pgdat_reclaimable(pgdat))
3720 return ZONE_RECLAIM_FULL; 3793 return NODE_RECLAIM_FULL;
3721 3794
3722 /* 3795 /*
3723 * Do not scan if the allocation should not be delayed. 3796 * Do not scan if the allocation should not be delayed.
3724 */ 3797 */
3725 if (!gfpflags_allow_blocking(gfp_mask) || (current->flags & PF_MEMALLOC)) 3798 if (!gfpflags_allow_blocking(gfp_mask) || (current->flags & PF_MEMALLOC))
3726 return ZONE_RECLAIM_NOSCAN; 3799 return NODE_RECLAIM_NOSCAN;
3727 3800
3728 /* 3801 /*
3729 * Only run zone reclaim on the local zone or on zones that do not 3802 * Only run node reclaim on the local node or on nodes that do not
3730 * have associated processors. This will favor the local processor 3803 * have associated processors. This will favor the local processor
3731 * over remote processors and spread off node memory allocations 3804 * over remote processors and spread off node memory allocations
3732 * as wide as possible. 3805 * as wide as possible.
3733 */ 3806 */
3734 node_id = zone_to_nid(zone); 3807 if (node_state(pgdat->node_id, N_CPU) && pgdat->node_id != numa_node_id())
3735 if (node_state(node_id, N_CPU) && node_id != numa_node_id()) 3808 return NODE_RECLAIM_NOSCAN;
3736 return ZONE_RECLAIM_NOSCAN;
3737 3809
3738 if (test_and_set_bit(ZONE_RECLAIM_LOCKED, &zone->flags)) 3810 if (test_and_set_bit(PGDAT_RECLAIM_LOCKED, &pgdat->flags))
3739 return ZONE_RECLAIM_NOSCAN; 3811 return NODE_RECLAIM_NOSCAN;
3740 3812
3741 ret = __zone_reclaim(zone, gfp_mask, order); 3813 ret = __node_reclaim(pgdat, gfp_mask, order);
3742 clear_bit(ZONE_RECLAIM_LOCKED, &zone->flags); 3814 clear_bit(PGDAT_RECLAIM_LOCKED, &pgdat->flags);
3743 3815
3744 if (!ret) 3816 if (!ret)
3745 count_vm_event(PGSCAN_ZONE_RECLAIM_FAILED); 3817 count_vm_event(PGSCAN_ZONE_RECLAIM_FAILED);
@@ -3778,24 +3850,23 @@ int page_evictable(struct page *page)
3778void check_move_unevictable_pages(struct page **pages, int nr_pages) 3850void check_move_unevictable_pages(struct page **pages, int nr_pages)
3779{ 3851{
3780 struct lruvec *lruvec; 3852 struct lruvec *lruvec;
3781 struct zone *zone = NULL; 3853 struct pglist_data *pgdat = NULL;
3782 int pgscanned = 0; 3854 int pgscanned = 0;
3783 int pgrescued = 0; 3855 int pgrescued = 0;
3784 int i; 3856 int i;
3785 3857
3786 for (i = 0; i < nr_pages; i++) { 3858 for (i = 0; i < nr_pages; i++) {
3787 struct page *page = pages[i]; 3859 struct page *page = pages[i];
3788 struct zone *pagezone; 3860 struct pglist_data *pagepgdat = page_pgdat(page);
3789 3861
3790 pgscanned++; 3862 pgscanned++;
3791 pagezone = page_zone(page); 3863 if (pagepgdat != pgdat) {
3792 if (pagezone != zone) { 3864 if (pgdat)
3793 if (zone) 3865 spin_unlock_irq(&pgdat->lru_lock);
3794 spin_unlock_irq(&zone->lru_lock); 3866 pgdat = pagepgdat;
3795 zone = pagezone; 3867 spin_lock_irq(&pgdat->lru_lock);
3796 spin_lock_irq(&zone->lru_lock);
3797 } 3868 }
3798 lruvec = mem_cgroup_page_lruvec(page, zone); 3869 lruvec = mem_cgroup_page_lruvec(page, pgdat);
3799 3870
3800 if (!PageLRU(page) || !PageUnevictable(page)) 3871 if (!PageLRU(page) || !PageUnevictable(page))
3801 continue; 3872 continue;
@@ -3811,10 +3882,10 @@ void check_move_unevictable_pages(struct page **pages, int nr_pages)
3811 } 3882 }
3812 } 3883 }
3813 3884
3814 if (zone) { 3885 if (pgdat) {
3815 __count_vm_events(UNEVICTABLE_PGRESCUED, pgrescued); 3886 __count_vm_events(UNEVICTABLE_PGRESCUED, pgrescued);
3816 __count_vm_events(UNEVICTABLE_PGSCANNED, pgscanned); 3887 __count_vm_events(UNEVICTABLE_PGSCANNED, pgscanned);
3817 spin_unlock_irq(&zone->lru_lock); 3888 spin_unlock_irq(&pgdat->lru_lock);
3818 } 3889 }
3819} 3890}
3820#endif /* CONFIG_SHMEM */ 3891#endif /* CONFIG_SHMEM */
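
Before moving on to the vmstat.c changes, it is worth spelling out the gate that node_reclaim() now applies per pgdat: reclaim is skipped entirely unless unmapped page cache or reclaimable slab exceeds the node's configured minimums, the node still looks reclaimable, and the caller is allowed to block. A stand-alone sketch of just that gating decision (struct, field names and thresholds are illustrative, not the kernel's; the local-node and PGDAT_RECLAIM_LOCKED checks are omitted):

    #include <stdbool.h>
    #include <stdio.h>

    #define NODE_RECLAIM_FULL   (-1)   /* nothing usable left on this node */
    #define NODE_RECLAIM_NOSCAN (-2)   /* reclaim was not attempted */

    struct fake_node {
        unsigned long pagecache_reclaimable;  /* node_pagecache_reclaimable() */
        unsigned long min_unmapped_pages;
        unsigned long slab_reclaimable;       /* per-node NR_SLAB_RECLAIMABLE */
        unsigned long min_slab_pages;
        bool reclaimable;                     /* pgdat_reclaimable() */
    };

    /* Mirrors the early exits in node_reclaim() above. */
    static int node_reclaim_gate(const struct fake_node *n, bool may_block)
    {
        if (n->pagecache_reclaimable <= n->min_unmapped_pages &&
            n->slab_reclaimable <= n->min_slab_pages)
            return NODE_RECLAIM_FULL;

        if (!n->reclaimable)
            return NODE_RECLAIM_FULL;

        if (!may_block)
            return NODE_RECLAIM_NOSCAN;

        return 0;   /* would go on to __node_reclaim() */
    }

    int main(void)
    {
        struct fake_node n = {
            .pagecache_reclaimable = 4096,
            .min_unmapped_pages    = 1024,
            .slab_reclaimable      = 0,
            .min_slab_pages        = 512,
            .reclaimable           = true,
        };

        printf("gate: %d\n", node_reclaim_gate(&n, true));   /* 0  */
        printf("gate: %d\n", node_reclaim_gate(&n, false));  /* -2 */
        return 0;
    }
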
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 7997f52935c9..89cec42d19ff 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -86,8 +86,10 @@ void vm_events_fold_cpu(int cpu)
86 * 86 *
87 * vm_stat contains the global counters 87 * vm_stat contains the global counters
88 */ 88 */
89atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS] __cacheline_aligned_in_smp; 89atomic_long_t vm_zone_stat[NR_VM_ZONE_STAT_ITEMS] __cacheline_aligned_in_smp;
90EXPORT_SYMBOL(vm_stat); 90atomic_long_t vm_node_stat[NR_VM_NODE_STAT_ITEMS] __cacheline_aligned_in_smp;
91EXPORT_SYMBOL(vm_zone_stat);
92EXPORT_SYMBOL(vm_node_stat);
91 93
92#ifdef CONFIG_SMP 94#ifdef CONFIG_SMP
93 95
@@ -167,19 +169,36 @@ int calculate_normal_threshold(struct zone *zone)
167 */ 169 */
168void refresh_zone_stat_thresholds(void) 170void refresh_zone_stat_thresholds(void)
169{ 171{
172 struct pglist_data *pgdat;
170 struct zone *zone; 173 struct zone *zone;
171 int cpu; 174 int cpu;
172 int threshold; 175 int threshold;
173 176
177 /* Zero current pgdat thresholds */
178 for_each_online_pgdat(pgdat) {
179 for_each_online_cpu(cpu) {
180 per_cpu_ptr(pgdat->per_cpu_nodestats, cpu)->stat_threshold = 0;
181 }
182 }
183
174 for_each_populated_zone(zone) { 184 for_each_populated_zone(zone) {
185 struct pglist_data *pgdat = zone->zone_pgdat;
175 unsigned long max_drift, tolerate_drift; 186 unsigned long max_drift, tolerate_drift;
176 187
177 threshold = calculate_normal_threshold(zone); 188 threshold = calculate_normal_threshold(zone);
178 189
179 for_each_online_cpu(cpu) 190 for_each_online_cpu(cpu) {
191 int pgdat_threshold;
192
180 per_cpu_ptr(zone->pageset, cpu)->stat_threshold 193 per_cpu_ptr(zone->pageset, cpu)->stat_threshold
181 = threshold; 194 = threshold;
182 195
196 /* Base nodestat threshold on the largest populated zone. */
197 pgdat_threshold = per_cpu_ptr(pgdat->per_cpu_nodestats, cpu)->stat_threshold;
198 per_cpu_ptr(pgdat->per_cpu_nodestats, cpu)->stat_threshold
199 = max(threshold, pgdat_threshold);
200 }
201
183 /* 202 /*
184 * Only set percpu_drift_mark if there is a danger that 203 * Only set percpu_drift_mark if there is a danger that
185 * NR_FREE_PAGES reports the low watermark is ok when in fact 204 * NR_FREE_PAGES reports the low watermark is ok when in fact
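
The loop above encodes a rule that is easy to miss in the diff noise: a node's per-cpu stat_threshold ends up as the maximum threshold of its populated zones, so node counter updates are batched at least as coarsely as those of the node's largest zone. In isolation (the zone thresholds below are invented numbers):

    #include <stdio.h>

    int main(void)
    {
        int zone_threshold[] = { 12, 60, 125 };   /* e.g. DMA, DMA32, Normal */
        int pgdat_threshold = 0;

        /* "Base nodestat threshold on the largest populated zone." */
        for (int i = 0; i < 3; i++)
            if (zone_threshold[i] > pgdat_threshold)
                pgdat_threshold = zone_threshold[i];

        printf("pgdat stat_threshold = %d\n", pgdat_threshold);   /* 125 */
        return 0;
    }
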
@@ -238,6 +257,26 @@ void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
238} 257}
239EXPORT_SYMBOL(__mod_zone_page_state); 258EXPORT_SYMBOL(__mod_zone_page_state);
240 259
260void __mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item,
261 long delta)
262{
263 struct per_cpu_nodestat __percpu *pcp = pgdat->per_cpu_nodestats;
264 s8 __percpu *p = pcp->vm_node_stat_diff + item;
265 long x;
266 long t;
267
268 x = delta + __this_cpu_read(*p);
269
270 t = __this_cpu_read(pcp->stat_threshold);
271
272 if (unlikely(x > t || x < -t)) {
273 node_page_state_add(x, pgdat, item);
274 x = 0;
275 }
276 __this_cpu_write(*p, x);
277}
278EXPORT_SYMBOL(__mod_node_page_state);
279
241/* 280/*
242 * Optimized increment and decrement functions. 281 * Optimized increment and decrement functions.
243 * 282 *
@@ -277,12 +316,34 @@ void __inc_zone_state(struct zone *zone, enum zone_stat_item item)
277 } 316 }
278} 317}
279 318
319void __inc_node_state(struct pglist_data *pgdat, enum node_stat_item item)
320{
321 struct per_cpu_nodestat __percpu *pcp = pgdat->per_cpu_nodestats;
322 s8 __percpu *p = pcp->vm_node_stat_diff + item;
323 s8 v, t;
324
325 v = __this_cpu_inc_return(*p);
326 t = __this_cpu_read(pcp->stat_threshold);
327 if (unlikely(v > t)) {
328 s8 overstep = t >> 1;
329
330 node_page_state_add(v + overstep, pgdat, item);
331 __this_cpu_write(*p, -overstep);
332 }
333}
334
280void __inc_zone_page_state(struct page *page, enum zone_stat_item item) 335void __inc_zone_page_state(struct page *page, enum zone_stat_item item)
281{ 336{
282 __inc_zone_state(page_zone(page), item); 337 __inc_zone_state(page_zone(page), item);
283} 338}
284EXPORT_SYMBOL(__inc_zone_page_state); 339EXPORT_SYMBOL(__inc_zone_page_state);
285 340
341void __inc_node_page_state(struct page *page, enum node_stat_item item)
342{
343 __inc_node_state(page_pgdat(page), item);
344}
345EXPORT_SYMBOL(__inc_node_page_state);
346
286void __dec_zone_state(struct zone *zone, enum zone_stat_item item) 347void __dec_zone_state(struct zone *zone, enum zone_stat_item item)
287{ 348{
288 struct per_cpu_pageset __percpu *pcp = zone->pageset; 349 struct per_cpu_pageset __percpu *pcp = zone->pageset;
@@ -299,12 +360,34 @@ void __dec_zone_state(struct zone *zone, enum zone_stat_item item)
299 } 360 }
300} 361}
301 362
363void __dec_node_state(struct pglist_data *pgdat, enum node_stat_item item)
364{
365 struct per_cpu_nodestat __percpu *pcp = pgdat->per_cpu_nodestats;
366 s8 __percpu *p = pcp->vm_node_stat_diff + item;
367 s8 v, t;
368
369 v = __this_cpu_dec_return(*p);
370 t = __this_cpu_read(pcp->stat_threshold);
371 if (unlikely(v < - t)) {
372 s8 overstep = t >> 1;
373
374 node_page_state_add(v - overstep, pgdat, item);
375 __this_cpu_write(*p, overstep);
376 }
377}
378
302void __dec_zone_page_state(struct page *page, enum zone_stat_item item) 379void __dec_zone_page_state(struct page *page, enum zone_stat_item item)
303{ 380{
304 __dec_zone_state(page_zone(page), item); 381 __dec_zone_state(page_zone(page), item);
305} 382}
306EXPORT_SYMBOL(__dec_zone_page_state); 383EXPORT_SYMBOL(__dec_zone_page_state);
307 384
385void __dec_node_page_state(struct page *page, enum node_stat_item item)
386{
387 __dec_node_state(page_pgdat(page), item);
388}
389EXPORT_SYMBOL(__dec_node_page_state);
390
308#ifdef CONFIG_HAVE_CMPXCHG_LOCAL 391#ifdef CONFIG_HAVE_CMPXCHG_LOCAL
309/* 392/*
310 * If we have cmpxchg_local support then we do not need to incur the overhead 393 * If we have cmpxchg_local support then we do not need to incur the overhead
@@ -318,8 +401,8 @@ EXPORT_SYMBOL(__dec_zone_page_state);
318 * 1 Overstepping half of threshold 401 * 1 Overstepping half of threshold
319 * -1 Overstepping minus half of threshold 402 * -1 Overstepping minus half of threshold
320*/ 403*/
321static inline void mod_state(struct zone *zone, enum zone_stat_item item, 404static inline void mod_zone_state(struct zone *zone,
322 long delta, int overstep_mode) 405 enum zone_stat_item item, long delta, int overstep_mode)
323{ 406{
324 struct per_cpu_pageset __percpu *pcp = zone->pageset; 407 struct per_cpu_pageset __percpu *pcp = zone->pageset;
325 s8 __percpu *p = pcp->vm_stat_diff + item; 408 s8 __percpu *p = pcp->vm_stat_diff + item;
@@ -359,26 +442,83 @@ static inline void mod_state(struct zone *zone, enum zone_stat_item item,
359void mod_zone_page_state(struct zone *zone, enum zone_stat_item item, 442void mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
360 long delta) 443 long delta)
361{ 444{
362 mod_state(zone, item, delta, 0); 445 mod_zone_state(zone, item, delta, 0);
363} 446}
364EXPORT_SYMBOL(mod_zone_page_state); 447EXPORT_SYMBOL(mod_zone_page_state);
365 448
366void inc_zone_state(struct zone *zone, enum zone_stat_item item)
367{
368 mod_state(zone, item, 1, 1);
369}
370
371void inc_zone_page_state(struct page *page, enum zone_stat_item item) 449void inc_zone_page_state(struct page *page, enum zone_stat_item item)
372{ 450{
373 mod_state(page_zone(page), item, 1, 1); 451 mod_zone_state(page_zone(page), item, 1, 1);
374} 452}
375EXPORT_SYMBOL(inc_zone_page_state); 453EXPORT_SYMBOL(inc_zone_page_state);
376 454
377void dec_zone_page_state(struct page *page, enum zone_stat_item item) 455void dec_zone_page_state(struct page *page, enum zone_stat_item item)
378{ 456{
379 mod_state(page_zone(page), item, -1, -1); 457 mod_zone_state(page_zone(page), item, -1, -1);
380} 458}
381EXPORT_SYMBOL(dec_zone_page_state); 459EXPORT_SYMBOL(dec_zone_page_state);
460
461static inline void mod_node_state(struct pglist_data *pgdat,
462 enum node_stat_item item, int delta, int overstep_mode)
463{
464 struct per_cpu_nodestat __percpu *pcp = pgdat->per_cpu_nodestats;
465 s8 __percpu *p = pcp->vm_node_stat_diff + item;
466 long o, n, t, z;
467
468 do {
469 z = 0; /* overflow to node counters */
470
471 /*
472 * The fetching of the stat_threshold is racy. We may apply
473 * a counter threshold to the wrong the cpu if we get
474 * rescheduled while executing here. However, the next
475 * counter update will apply the threshold again and
476 * therefore bring the counter under the threshold again.
477 *
478 * Most of the time the thresholds are the same anyways
479 * for all cpus in a node.
480 */
481 t = this_cpu_read(pcp->stat_threshold);
482
483 o = this_cpu_read(*p);
484 n = delta + o;
485
486 if (n > t || n < -t) {
487 int os = overstep_mode * (t >> 1) ;
488
489 /* Overflow must be added to node counters */
490 z = n + os;
491 n = -os;
492 }
493 } while (this_cpu_cmpxchg(*p, o, n) != o);
494
495 if (z)
496 node_page_state_add(z, pgdat, item);
497}
498
499void mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item,
500 long delta)
501{
502 mod_node_state(pgdat, item, delta, 0);
503}
504EXPORT_SYMBOL(mod_node_page_state);
505
506void inc_node_state(struct pglist_data *pgdat, enum node_stat_item item)
507{
508 mod_node_state(pgdat, item, 1, 1);
509}
510
511void inc_node_page_state(struct page *page, enum node_stat_item item)
512{
513 mod_node_state(page_pgdat(page), item, 1, 1);
514}
515EXPORT_SYMBOL(inc_node_page_state);
516
517void dec_node_page_state(struct page *page, enum node_stat_item item)
518{
519 mod_node_state(page_pgdat(page), item, -1, -1);
520}
521EXPORT_SYMBOL(dec_node_page_state);
382#else 522#else
383/* 523/*
384 * Use interrupt disable to serialize counter updates 524 * Use interrupt disable to serialize counter updates
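
All of the new node counter helpers above share one scheme: updates accumulate in a per-cpu differential and only spill into the global atomic once the per-cpu threshold is crossed, overstepping by half a threshold so the very next update does not spill again. A user-space model of that batching (single-threaded, so the cmpxchg loop is dropped; the threshold is an invented value and the types are simplified):

    #include <stdio.h>

    static long global_counter;   /* stands in for pgdat->vm_stat[item]     */
    static int  pcpu_diff;        /* stands in for vm_node_stat_diff[item]  */
    static const int threshold = 32;

    static void mod_state(int delta, int overstep_mode)
    {
        int n = pcpu_diff + delta;

        if (n > threshold || n < -threshold) {
            int os = overstep_mode * (threshold >> 1);

            global_counter += n + os;   /* spill the batch plus overstep */
            n = -os;                    /* leave slack before next spill */
        }
        pcpu_diff = n;
    }

    int main(void)
    {
        for (int i = 0; i < 100; i++)
            mod_state(1, 1);            /* 100 increments */

        /* global + pcpu always equals the true total (here 100) */
        printf("global=%ld pcpu=%d total=%ld\n",
               global_counter, pcpu_diff, global_counter + pcpu_diff);
        return 0;
    }
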
@@ -394,15 +534,6 @@ void mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
394} 534}
395EXPORT_SYMBOL(mod_zone_page_state); 535EXPORT_SYMBOL(mod_zone_page_state);
396 536
397void inc_zone_state(struct zone *zone, enum zone_stat_item item)
398{
399 unsigned long flags;
400
401 local_irq_save(flags);
402 __inc_zone_state(zone, item);
403 local_irq_restore(flags);
404}
405
406void inc_zone_page_state(struct page *page, enum zone_stat_item item) 537void inc_zone_page_state(struct page *page, enum zone_stat_item item)
407{ 538{
408 unsigned long flags; 539 unsigned long flags;
@@ -424,21 +555,69 @@ void dec_zone_page_state(struct page *page, enum zone_stat_item item)
424 local_irq_restore(flags); 555 local_irq_restore(flags);
425} 556}
426EXPORT_SYMBOL(dec_zone_page_state); 557EXPORT_SYMBOL(dec_zone_page_state);
427#endif
428 558
559void inc_node_state(struct pglist_data *pgdat, enum node_stat_item item)
560{
561 unsigned long flags;
562
563 local_irq_save(flags);
564 __inc_node_state(pgdat, item);
565 local_irq_restore(flags);
566}
567EXPORT_SYMBOL(inc_node_state);
568
569void mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item,
570 long delta)
571{
572 unsigned long flags;
573
574 local_irq_save(flags);
575 __mod_node_page_state(pgdat, item, delta);
576 local_irq_restore(flags);
577}
578EXPORT_SYMBOL(mod_node_page_state);
579
580void inc_node_page_state(struct page *page, enum node_stat_item item)
581{
582 unsigned long flags;
583 struct pglist_data *pgdat;
584
585 pgdat = page_pgdat(page);
586 local_irq_save(flags);
587 __inc_node_state(pgdat, item);
588 local_irq_restore(flags);
589}
590EXPORT_SYMBOL(inc_node_page_state);
591
592void dec_node_page_state(struct page *page, enum node_stat_item item)
593{
594 unsigned long flags;
595
596 local_irq_save(flags);
597 __dec_node_page_state(page, item);
598 local_irq_restore(flags);
599}
600EXPORT_SYMBOL(dec_node_page_state);
601#endif
429 602
430/* 603/*
431 * Fold a differential into the global counters. 604 * Fold a differential into the global counters.
432 * Returns the number of counters updated. 605 * Returns the number of counters updated.
433 */ 606 */
434static int fold_diff(int *diff) 607static int fold_diff(int *zone_diff, int *node_diff)
435{ 608{
436 int i; 609 int i;
437 int changes = 0; 610 int changes = 0;
438 611
439 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) 612 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
440 if (diff[i]) { 613 if (zone_diff[i]) {
441 atomic_long_add(diff[i], &vm_stat[i]); 614 atomic_long_add(zone_diff[i], &vm_zone_stat[i]);
615 changes++;
616 }
617
618 for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
619 if (node_diff[i]) {
620 atomic_long_add(node_diff[i], &vm_node_stat[i]);
442 changes++; 621 changes++;
443 } 622 }
444 return changes; 623 return changes;
@@ -462,9 +641,11 @@ static int fold_diff(int *diff)
462 */ 641 */
463static int refresh_cpu_vm_stats(bool do_pagesets) 642static int refresh_cpu_vm_stats(bool do_pagesets)
464{ 643{
644 struct pglist_data *pgdat;
465 struct zone *zone; 645 struct zone *zone;
466 int i; 646 int i;
467 int global_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, }; 647 int global_zone_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, };
648 int global_node_diff[NR_VM_NODE_STAT_ITEMS] = { 0, };
468 int changes = 0; 649 int changes = 0;
469 650
470 for_each_populated_zone(zone) { 651 for_each_populated_zone(zone) {
@@ -477,7 +658,7 @@ static int refresh_cpu_vm_stats(bool do_pagesets)
477 if (v) { 658 if (v) {
478 659
479 atomic_long_add(v, &zone->vm_stat[i]); 660 atomic_long_add(v, &zone->vm_stat[i]);
480 global_diff[i] += v; 661 global_zone_diff[i] += v;
481#ifdef CONFIG_NUMA 662#ifdef CONFIG_NUMA
482 /* 3 seconds idle till flush */ 663 /* 3 seconds idle till flush */
483 __this_cpu_write(p->expire, 3); 664 __this_cpu_write(p->expire, 3);
@@ -516,7 +697,22 @@ static int refresh_cpu_vm_stats(bool do_pagesets)
516 } 697 }
517#endif 698#endif
518 } 699 }
519 changes += fold_diff(global_diff); 700
701 for_each_online_pgdat(pgdat) {
702 struct per_cpu_nodestat __percpu *p = pgdat->per_cpu_nodestats;
703
704 for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) {
705 int v;
706
707 v = this_cpu_xchg(p->vm_node_stat_diff[i], 0);
708 if (v) {
709 atomic_long_add(v, &pgdat->vm_stat[i]);
710 global_node_diff[i] += v;
711 }
712 }
713 }
714
715 changes += fold_diff(global_zone_diff, global_node_diff);
520 return changes; 716 return changes;
521} 717}
522 718
@@ -527,9 +723,11 @@ static int refresh_cpu_vm_stats(bool do_pagesets)
527 */ 723 */
528void cpu_vm_stats_fold(int cpu) 724void cpu_vm_stats_fold(int cpu)
529{ 725{
726 struct pglist_data *pgdat;
530 struct zone *zone; 727 struct zone *zone;
531 int i; 728 int i;
532 int global_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, }; 729 int global_zone_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, };
730 int global_node_diff[NR_VM_NODE_STAT_ITEMS] = { 0, };
533 731
534 for_each_populated_zone(zone) { 732 for_each_populated_zone(zone) {
535 struct per_cpu_pageset *p; 733 struct per_cpu_pageset *p;
@@ -543,11 +741,27 @@ void cpu_vm_stats_fold(int cpu)
543 v = p->vm_stat_diff[i]; 741 v = p->vm_stat_diff[i];
544 p->vm_stat_diff[i] = 0; 742 p->vm_stat_diff[i] = 0;
545 atomic_long_add(v, &zone->vm_stat[i]); 743 atomic_long_add(v, &zone->vm_stat[i]);
546 global_diff[i] += v; 744 global_zone_diff[i] += v;
547 } 745 }
548 } 746 }
549 747
550 fold_diff(global_diff); 748 for_each_online_pgdat(pgdat) {
749 struct per_cpu_nodestat *p;
750
751 p = per_cpu_ptr(pgdat->per_cpu_nodestats, cpu);
752
753 for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
754 if (p->vm_node_stat_diff[i]) {
755 int v;
756
757 v = p->vm_node_stat_diff[i];
758 p->vm_node_stat_diff[i] = 0;
759 atomic_long_add(v, &pgdat->vm_stat[i]);
760 global_node_diff[i] += v;
761 }
762 }
763
764 fold_diff(global_zone_diff, global_node_diff);
551} 765}
552 766
553/* 767/*
@@ -563,16 +777,19 @@ void drain_zonestat(struct zone *zone, struct per_cpu_pageset *pset)
563 int v = pset->vm_stat_diff[i]; 777 int v = pset->vm_stat_diff[i];
564 pset->vm_stat_diff[i] = 0; 778 pset->vm_stat_diff[i] = 0;
565 atomic_long_add(v, &zone->vm_stat[i]); 779 atomic_long_add(v, &zone->vm_stat[i]);
566 atomic_long_add(v, &vm_stat[i]); 780 atomic_long_add(v, &vm_zone_stat[i]);
567 } 781 }
568} 782}
569#endif 783#endif
570 784
571#ifdef CONFIG_NUMA 785#ifdef CONFIG_NUMA
572/* 786/*
573 * Determine the per node value of a stat item. 787 * Determine the per node value of a stat item. This function
788 * is called frequently in a NUMA machine, so try to be as
789 * frugal as possible.
574 */ 790 */
575unsigned long node_page_state(int node, enum zone_stat_item item) 791unsigned long sum_zone_node_page_state(int node,
792 enum zone_stat_item item)
576{ 793{
577 struct zone *zones = NODE_DATA(node)->node_zones; 794 struct zone *zones = NODE_DATA(node)->node_zones;
578 int i; 795 int i;
@@ -584,6 +801,19 @@ unsigned long node_page_state(int node, enum zone_stat_item item)
584 return count; 801 return count;
585} 802}
586 803
804/*
805 * Determine the per node value of a stat item.
806 */
807unsigned long node_page_state(struct pglist_data *pgdat,
808 enum node_stat_item item)
809{
810 long x = atomic_long_read(&pgdat->vm_stat[item]);
811#ifdef CONFIG_SMP
812 if (x < 0)
813 x = 0;
814#endif
815 return x;
816}
587#endif 817#endif
588 818
589#ifdef CONFIG_COMPACTION 819#ifdef CONFIG_COMPACTION
@@ -691,33 +921,18 @@ int fragmentation_index(struct zone *zone, unsigned int order)
691const char * const vmstat_text[] = { 921const char * const vmstat_text[] = {
692 /* enum zone_stat_item countes */ 922 /* enum zone_stat_item countes */
693 "nr_free_pages", 923 "nr_free_pages",
694 "nr_alloc_batch", 924 "nr_zone_inactive_anon",
695 "nr_inactive_anon", 925 "nr_zone_active_anon",
696 "nr_active_anon", 926 "nr_zone_inactive_file",
697 "nr_inactive_file", 927 "nr_zone_active_file",
698 "nr_active_file", 928 "nr_zone_unevictable",
699 "nr_unevictable", 929 "nr_zone_write_pending",
700 "nr_mlock", 930 "nr_mlock",
701 "nr_anon_pages",
702 "nr_mapped",
703 "nr_file_pages",
704 "nr_dirty",
705 "nr_writeback",
706 "nr_slab_reclaimable", 931 "nr_slab_reclaimable",
707 "nr_slab_unreclaimable", 932 "nr_slab_unreclaimable",
708 "nr_page_table_pages", 933 "nr_page_table_pages",
709 "nr_kernel_stack", 934 "nr_kernel_stack",
710 "nr_unstable",
711 "nr_bounce", 935 "nr_bounce",
712 "nr_vmscan_write",
713 "nr_vmscan_immediate_reclaim",
714 "nr_writeback_temp",
715 "nr_isolated_anon",
716 "nr_isolated_file",
717 "nr_shmem",
718 "nr_dirtied",
719 "nr_written",
720 "nr_pages_scanned",
721#if IS_ENABLED(CONFIG_ZSMALLOC) 936#if IS_ENABLED(CONFIG_ZSMALLOC)
722 "nr_zspages", 937 "nr_zspages",
723#endif 938#endif
@@ -729,13 +944,35 @@ const char * const vmstat_text[] = {
729 "numa_local", 944 "numa_local",
730 "numa_other", 945 "numa_other",
731#endif 946#endif
947 "nr_free_cma",
948
949 /* Node-based counters */
950 "nr_inactive_anon",
951 "nr_active_anon",
952 "nr_inactive_file",
953 "nr_active_file",
954 "nr_unevictable",
955 "nr_isolated_anon",
956 "nr_isolated_file",
957 "nr_pages_scanned",
732 "workingset_refault", 958 "workingset_refault",
733 "workingset_activate", 959 "workingset_activate",
734 "workingset_nodereclaim", 960 "workingset_nodereclaim",
735 "nr_anon_transparent_hugepages", 961 "nr_anon_pages",
962 "nr_mapped",
963 "nr_file_pages",
964 "nr_dirty",
965 "nr_writeback",
966 "nr_writeback_temp",
967 "nr_shmem",
736 "nr_shmem_hugepages", 968 "nr_shmem_hugepages",
737 "nr_shmem_pmdmapped", 969 "nr_shmem_pmdmapped",
738 "nr_free_cma", 970 "nr_anon_transparent_hugepages",
971 "nr_unstable",
972 "nr_vmscan_write",
973 "nr_vmscan_immediate_reclaim",
974 "nr_dirtied",
975 "nr_written",
739 976
740 /* enum writeback_stat_item counters */ 977 /* enum writeback_stat_item counters */
741 "nr_dirty_threshold", 978 "nr_dirty_threshold",
@@ -749,6 +986,8 @@ const char * const vmstat_text[] = {
749 "pswpout", 986 "pswpout",
750 987
751 TEXTS_FOR_ZONES("pgalloc") 988 TEXTS_FOR_ZONES("pgalloc")
989 TEXTS_FOR_ZONES("allocstall")
990 TEXTS_FOR_ZONES("pgskip")
752 991
753 "pgfree", 992 "pgfree",
754 "pgactivate", 993 "pgactivate",
@@ -758,11 +997,11 @@ const char * const vmstat_text[] = {
758 "pgmajfault", 997 "pgmajfault",
759 "pglazyfreed", 998 "pglazyfreed",
760 999
761 TEXTS_FOR_ZONES("pgrefill") 1000 "pgrefill",
762 TEXTS_FOR_ZONES("pgsteal_kswapd") 1001 "pgsteal_kswapd",
763 TEXTS_FOR_ZONES("pgsteal_direct") 1002 "pgsteal_direct",
764 TEXTS_FOR_ZONES("pgscan_kswapd") 1003 "pgscan_kswapd",
765 TEXTS_FOR_ZONES("pgscan_direct") 1004 "pgscan_direct",
766 "pgscan_direct_throttle", 1005 "pgscan_direct_throttle",
767 1006
768#ifdef CONFIG_NUMA 1007#ifdef CONFIG_NUMA
@@ -774,7 +1013,6 @@ const char * const vmstat_text[] = {
774 "kswapd_low_wmark_hit_quickly", 1013 "kswapd_low_wmark_hit_quickly",
775 "kswapd_high_wmark_hit_quickly", 1014 "kswapd_high_wmark_hit_quickly",
776 "pageoutrun", 1015 "pageoutrun",
777 "allocstall",
778 1016
779 "pgrotated", 1017 "pgrotated",
780 1018
@@ -1180,17 +1418,41 @@ static const struct file_operations pagetypeinfo_file_ops = {
1180 .release = seq_release, 1418 .release = seq_release,
1181}; 1419};
1182 1420
1421static bool is_zone_first_populated(pg_data_t *pgdat, struct zone *zone)
1422{
1423 int zid;
1424
1425 for (zid = 0; zid < MAX_NR_ZONES; zid++) {
1426 struct zone *compare = &pgdat->node_zones[zid];
1427
1428 if (populated_zone(compare))
1429 return zone == compare;
1430 }
1431
1432 /* The zone must be somewhere! */
1433 WARN_ON_ONCE(1);
1434 return false;
1435}
1436
1183static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, 1437static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
1184 struct zone *zone) 1438 struct zone *zone)
1185{ 1439{
1186 int i; 1440 int i;
1187 seq_printf(m, "Node %d, zone %8s", pgdat->node_id, zone->name); 1441 seq_printf(m, "Node %d, zone %8s", pgdat->node_id, zone->name);
1442 if (is_zone_first_populated(pgdat, zone)) {
1443 seq_printf(m, "\n per-node stats");
1444 for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) {
1445 seq_printf(m, "\n %-12s %lu",
1446 vmstat_text[i + NR_VM_ZONE_STAT_ITEMS],
1447 node_page_state(pgdat, i));
1448 }
1449 }
1188 seq_printf(m, 1450 seq_printf(m,
1189 "\n pages free %lu" 1451 "\n pages free %lu"
1190 "\n min %lu" 1452 "\n min %lu"
1191 "\n low %lu" 1453 "\n low %lu"
1192 "\n high %lu" 1454 "\n high %lu"
1193 "\n scanned %lu" 1455 "\n node_scanned %lu"
1194 "\n spanned %lu" 1456 "\n spanned %lu"
1195 "\n present %lu" 1457 "\n present %lu"
1196 "\n managed %lu", 1458 "\n managed %lu",
@@ -1198,13 +1460,13 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
1198 min_wmark_pages(zone), 1460 min_wmark_pages(zone),
1199 low_wmark_pages(zone), 1461 low_wmark_pages(zone),
1200 high_wmark_pages(zone), 1462 high_wmark_pages(zone),
1201 zone_page_state(zone, NR_PAGES_SCANNED), 1463 node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED),
1202 zone->spanned_pages, 1464 zone->spanned_pages,
1203 zone->present_pages, 1465 zone->present_pages,
1204 zone->managed_pages); 1466 zone->managed_pages);
1205 1467
1206 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) 1468 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
1207 seq_printf(m, "\n %-12s %lu", vmstat_text[i], 1469 seq_printf(m, "\n %-12s %lu", vmstat_text[i],
1208 zone_page_state(zone, i)); 1470 zone_page_state(zone, i));
1209 1471
1210 seq_printf(m, 1472 seq_printf(m,
@@ -1234,12 +1496,12 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
1234#endif 1496#endif
1235 } 1497 }
1236 seq_printf(m, 1498 seq_printf(m,
1237 "\n all_unreclaimable: %u" 1499 "\n node_unreclaimable: %u"
1238 "\n start_pfn: %lu" 1500 "\n start_pfn: %lu"
1239 "\n inactive_ratio: %u", 1501 "\n node_inactive_ratio: %u",
1240 !zone_reclaimable(zone), 1502 !pgdat_reclaimable(zone->zone_pgdat),
1241 zone->zone_start_pfn, 1503 zone->zone_start_pfn,
1242 zone->inactive_ratio); 1504 zone->zone_pgdat->inactive_ratio);
1243 seq_putc(m, '\n'); 1505 seq_putc(m, '\n');
1244} 1506}
1245 1507
@@ -1287,6 +1549,7 @@ static void *vmstat_start(struct seq_file *m, loff_t *pos)
1287 if (*pos >= ARRAY_SIZE(vmstat_text)) 1549 if (*pos >= ARRAY_SIZE(vmstat_text))
1288 return NULL; 1550 return NULL;
1289 stat_items_size = NR_VM_ZONE_STAT_ITEMS * sizeof(unsigned long) + 1551 stat_items_size = NR_VM_ZONE_STAT_ITEMS * sizeof(unsigned long) +
1552 NR_VM_NODE_STAT_ITEMS * sizeof(unsigned long) +
1290 NR_VM_WRITEBACK_STAT_ITEMS * sizeof(unsigned long); 1553 NR_VM_WRITEBACK_STAT_ITEMS * sizeof(unsigned long);
1291 1554
1292#ifdef CONFIG_VM_EVENT_COUNTERS 1555#ifdef CONFIG_VM_EVENT_COUNTERS
@@ -1301,6 +1564,10 @@ static void *vmstat_start(struct seq_file *m, loff_t *pos)
1301 v[i] = global_page_state(i); 1564 v[i] = global_page_state(i);
1302 v += NR_VM_ZONE_STAT_ITEMS; 1565 v += NR_VM_ZONE_STAT_ITEMS;
1303 1566
1567 for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
1568 v[i] = global_node_page_state(i);
1569 v += NR_VM_NODE_STAT_ITEMS;
1570
1304 global_dirty_limits(v + NR_DIRTY_BG_THRESHOLD, 1571 global_dirty_limits(v + NR_DIRTY_BG_THRESHOLD,
1305 v + NR_DIRTY_THRESHOLD); 1572 v + NR_DIRTY_THRESHOLD);
1306 v += NR_VM_WRITEBACK_STAT_ITEMS; 1573 v += NR_VM_WRITEBACK_STAT_ITEMS;
@@ -1325,7 +1592,6 @@ static int vmstat_show(struct seq_file *m, void *arg)
1325{ 1592{
1326 unsigned long *l = arg; 1593 unsigned long *l = arg;
1327 unsigned long off = l - (unsigned long *)m->private; 1594 unsigned long off = l - (unsigned long *)m->private;
1328
1329 seq_printf(m, "%s %lu\n", vmstat_text[off], *l); 1595 seq_printf(m, "%s %lu\n", vmstat_text[off], *l);
1330 return 0; 1596 return 0;
1331} 1597}
@@ -1390,13 +1656,12 @@ int vmstat_refresh(struct ctl_table *table, int write,
1390 if (err) 1656 if (err)
1391 return err; 1657 return err;
1392 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) { 1658 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) {
1393 val = atomic_long_read(&vm_stat[i]); 1659 val = atomic_long_read(&vm_zone_stat[i]);
1394 if (val < 0) { 1660 if (val < 0) {
1395 switch (i) { 1661 switch (i) {
1396 case NR_ALLOC_BATCH:
1397 case NR_PAGES_SCANNED: 1662 case NR_PAGES_SCANNED:
1398 /* 1663 /*
1399 * These are often seen to go negative in 1664 * This is often seen to go negative in
1400 * recent kernels, but not to go permanently 1665 * recent kernels, but not to go permanently
1401 * negative. Whilst it would be nicer not to 1666 * negative. Whilst it would be nicer not to
1402 * have exceptions, rooting them out would be 1667 * have exceptions, rooting them out would be
diff --git a/mm/workingset.c b/mm/workingset.c
index 577277546d98..69551cfae97b 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -16,7 +16,7 @@
16/* 16/*
17 * Double CLOCK lists 17 * Double CLOCK lists
18 * 18 *
19 * Per zone, two clock lists are maintained for file pages: the 19 * Per node, two clock lists are maintained for file pages: the
20 * inactive and the active list. Freshly faulted pages start out at 20 * inactive and the active list. Freshly faulted pages start out at
21 * the head of the inactive list and page reclaim scans pages from the 21 * the head of the inactive list and page reclaim scans pages from the
22 * tail. Pages that are accessed multiple times on the inactive list 22 * tail. Pages that are accessed multiple times on the inactive list
@@ -141,11 +141,11 @@
141 * 141 *
142 * Implementation 142 * Implementation
143 * 143 *
144 * For each zone's file LRU lists, a counter for inactive evictions 144 * For each node's file LRU lists, a counter for inactive evictions
145 * and activations is maintained (zone->inactive_age). 145 * and activations is maintained (node->inactive_age).
146 * 146 *
147 * On eviction, a snapshot of this counter (along with some bits to 147 * On eviction, a snapshot of this counter (along with some bits to
148 * identify the zone) is stored in the now empty page cache radix tree 148 * identify the node) is stored in the now empty page cache radix tree
149 * slot of the evicted page. This is called a shadow entry. 149 * slot of the evicted page. This is called a shadow entry.
150 * 150 *
151 * On cache misses for which there are shadow entries, an eligible 151 * On cache misses for which there are shadow entries, an eligible
@@ -153,7 +153,7 @@
153 */ 153 */
154 154
155#define EVICTION_SHIFT (RADIX_TREE_EXCEPTIONAL_ENTRY + \ 155#define EVICTION_SHIFT (RADIX_TREE_EXCEPTIONAL_ENTRY + \
156 ZONES_SHIFT + NODES_SHIFT + \ 156 NODES_SHIFT + \
157 MEM_CGROUP_ID_SHIFT) 157 MEM_CGROUP_ID_SHIFT)
158#define EVICTION_MASK (~0UL >> EVICTION_SHIFT) 158#define EVICTION_MASK (~0UL >> EVICTION_SHIFT)
159 159
@@ -167,33 +167,30 @@
167 */ 167 */
168static unsigned int bucket_order __read_mostly; 168static unsigned int bucket_order __read_mostly;
169 169
170static void *pack_shadow(int memcgid, struct zone *zone, unsigned long eviction) 170static void *pack_shadow(int memcgid, pg_data_t *pgdat, unsigned long eviction)
171{ 171{
172 eviction >>= bucket_order; 172 eviction >>= bucket_order;
173 eviction = (eviction << MEM_CGROUP_ID_SHIFT) | memcgid; 173 eviction = (eviction << MEM_CGROUP_ID_SHIFT) | memcgid;
174 eviction = (eviction << NODES_SHIFT) | zone_to_nid(zone); 174 eviction = (eviction << NODES_SHIFT) | pgdat->node_id;
175 eviction = (eviction << ZONES_SHIFT) | zone_idx(zone);
176 eviction = (eviction << RADIX_TREE_EXCEPTIONAL_SHIFT); 175 eviction = (eviction << RADIX_TREE_EXCEPTIONAL_SHIFT);
177 176
178 return (void *)(eviction | RADIX_TREE_EXCEPTIONAL_ENTRY); 177 return (void *)(eviction | RADIX_TREE_EXCEPTIONAL_ENTRY);
179} 178}
180 179
181static void unpack_shadow(void *shadow, int *memcgidp, struct zone **zonep, 180static void unpack_shadow(void *shadow, int *memcgidp, pg_data_t **pgdat,
182 unsigned long *evictionp) 181 unsigned long *evictionp)
183{ 182{
184 unsigned long entry = (unsigned long)shadow; 183 unsigned long entry = (unsigned long)shadow;
185 int memcgid, nid, zid; 184 int memcgid, nid;
186 185
187 entry >>= RADIX_TREE_EXCEPTIONAL_SHIFT; 186 entry >>= RADIX_TREE_EXCEPTIONAL_SHIFT;
188 zid = entry & ((1UL << ZONES_SHIFT) - 1);
189 entry >>= ZONES_SHIFT;
190 nid = entry & ((1UL << NODES_SHIFT) - 1); 187 nid = entry & ((1UL << NODES_SHIFT) - 1);
191 entry >>= NODES_SHIFT; 188 entry >>= NODES_SHIFT;
192 memcgid = entry & ((1UL << MEM_CGROUP_ID_SHIFT) - 1); 189 memcgid = entry & ((1UL << MEM_CGROUP_ID_SHIFT) - 1);
193 entry >>= MEM_CGROUP_ID_SHIFT; 190 entry >>= MEM_CGROUP_ID_SHIFT;
194 191
195 *memcgidp = memcgid; 192 *memcgidp = memcgid;
196 *zonep = NODE_DATA(nid)->node_zones + zid; 193 *pgdat = NODE_DATA(nid);
197 *evictionp = entry << bucket_order; 194 *evictionp = entry << bucket_order;
198} 195}
199 196
@@ -208,7 +205,7 @@ static void unpack_shadow(void *shadow, int *memcgidp, struct zone **zonep,
208void *workingset_eviction(struct address_space *mapping, struct page *page) 205void *workingset_eviction(struct address_space *mapping, struct page *page)
209{ 206{
210 struct mem_cgroup *memcg = page_memcg(page); 207 struct mem_cgroup *memcg = page_memcg(page);
211 struct zone *zone = page_zone(page); 208 struct pglist_data *pgdat = page_pgdat(page);
212 int memcgid = mem_cgroup_id(memcg); 209 int memcgid = mem_cgroup_id(memcg);
213 unsigned long eviction; 210 unsigned long eviction;
214 struct lruvec *lruvec; 211 struct lruvec *lruvec;
@@ -218,9 +215,9 @@ void *workingset_eviction(struct address_space *mapping, struct page *page)
218 VM_BUG_ON_PAGE(page_count(page), page); 215 VM_BUG_ON_PAGE(page_count(page), page);
219 VM_BUG_ON_PAGE(!PageLocked(page), page); 216 VM_BUG_ON_PAGE(!PageLocked(page), page);
220 217
221 lruvec = mem_cgroup_zone_lruvec(zone, memcg); 218 lruvec = mem_cgroup_lruvec(pgdat, memcg);
222 eviction = atomic_long_inc_return(&lruvec->inactive_age); 219 eviction = atomic_long_inc_return(&lruvec->inactive_age);
223 return pack_shadow(memcgid, zone, eviction); 220 return pack_shadow(memcgid, pgdat, eviction);
224} 221}
225 222
226/** 223/**
@@ -228,7 +225,7 @@ void *workingset_eviction(struct address_space *mapping, struct page *page)
228 * @shadow: shadow entry of the evicted page 225 * @shadow: shadow entry of the evicted page
229 * 226 *
230 * Calculates and evaluates the refault distance of the previously 227 * Calculates and evaluates the refault distance of the previously
231 * evicted page in the context of the zone it was allocated in. 228 * evicted page in the context of the node it was allocated in.
232 * 229 *
233 * Returns %true if the page should be activated, %false otherwise. 230 * Returns %true if the page should be activated, %false otherwise.
234 */ 231 */
@@ -240,10 +237,10 @@ bool workingset_refault(void *shadow)
240 unsigned long eviction; 237 unsigned long eviction;
241 struct lruvec *lruvec; 238 struct lruvec *lruvec;
242 unsigned long refault; 239 unsigned long refault;
243 struct zone *zone; 240 struct pglist_data *pgdat;
244 int memcgid; 241 int memcgid;
245 242
246 unpack_shadow(shadow, &memcgid, &zone, &eviction); 243 unpack_shadow(shadow, &memcgid, &pgdat, &eviction);
247 244
248 rcu_read_lock(); 245 rcu_read_lock();
249 /* 246 /*
@@ -267,7 +264,7 @@ bool workingset_refault(void *shadow)
267 rcu_read_unlock(); 264 rcu_read_unlock();
268 return false; 265 return false;
269 } 266 }
270 lruvec = mem_cgroup_zone_lruvec(zone, memcg); 267 lruvec = mem_cgroup_lruvec(pgdat, memcg);
271 refault = atomic_long_read(&lruvec->inactive_age); 268 refault = atomic_long_read(&lruvec->inactive_age);
272 active_file = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE); 269 active_file = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE);
273 rcu_read_unlock(); 270 rcu_read_unlock();
@@ -290,10 +287,10 @@ bool workingset_refault(void *shadow)
290 */ 287 */
291 refault_distance = (refault - eviction) & EVICTION_MASK; 288 refault_distance = (refault - eviction) & EVICTION_MASK;
292 289
293 inc_zone_state(zone, WORKINGSET_REFAULT); 290 inc_node_state(pgdat, WORKINGSET_REFAULT);
294 291
295 if (refault_distance <= active_file) { 292 if (refault_distance <= active_file) {
296 inc_zone_state(zone, WORKINGSET_ACTIVATE); 293 inc_node_state(pgdat, WORKINGSET_ACTIVATE);
297 return true; 294 return true;
298 } 295 }
299 return false; 296 return false;
@@ -305,9 +302,10 @@ bool workingset_refault(void *shadow)
305 */ 302 */
306void workingset_activation(struct page *page) 303void workingset_activation(struct page *page)
307{ 304{
305 struct mem_cgroup *memcg;
308 struct lruvec *lruvec; 306 struct lruvec *lruvec;
309 307
310 lock_page_memcg(page); 308 rcu_read_lock();
311 /* 309 /*
312 * Filter non-memcg pages here, e.g. unmap can call 310 * Filter non-memcg pages here, e.g. unmap can call
313 * mark_page_accessed() on VDSO pages. 311 * mark_page_accessed() on VDSO pages.
@@ -315,12 +313,13 @@ void workingset_activation(struct page *page)
315 * XXX: See workingset_refault() - this should return 313 * XXX: See workingset_refault() - this should return
316 * root_mem_cgroup even for !CONFIG_MEMCG. 314 * root_mem_cgroup even for !CONFIG_MEMCG.
317 */ 315 */
318 if (!mem_cgroup_disabled() && !page_memcg(page)) 316 memcg = page_memcg_rcu(page);
317 if (!mem_cgroup_disabled() && !memcg)
319 goto out; 318 goto out;
320 lruvec = mem_cgroup_zone_lruvec(page_zone(page), page_memcg(page)); 319 lruvec = mem_cgroup_lruvec(page_pgdat(page), memcg);
321 atomic_long_inc(&lruvec->inactive_age); 320 atomic_long_inc(&lruvec->inactive_age);
322out: 321out:
323 unlock_page_memcg(page); 322 rcu_read_unlock();
324} 323}
325 324
326/* 325/*
@@ -349,12 +348,13 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker,
349 shadow_nodes = list_lru_shrink_count(&workingset_shadow_nodes, sc); 348 shadow_nodes = list_lru_shrink_count(&workingset_shadow_nodes, sc);
350 local_irq_enable(); 349 local_irq_enable();
351 350
352 if (memcg_kmem_enabled()) 351 if (memcg_kmem_enabled()) {
353 pages = mem_cgroup_node_nr_lru_pages(sc->memcg, sc->nid, 352 pages = mem_cgroup_node_nr_lru_pages(sc->memcg, sc->nid,
354 LRU_ALL_FILE); 353 LRU_ALL_FILE);
355 else 354 } else {
356 pages = node_page_state(sc->nid, NR_ACTIVE_FILE) + 355 pages = node_page_state(NODE_DATA(sc->nid), NR_ACTIVE_FILE) +
357 node_page_state(sc->nid, NR_INACTIVE_FILE); 356 node_page_state(NODE_DATA(sc->nid), NR_INACTIVE_FILE);
357 }
358 358
359 /* 359 /*
360 * Active cache pages are limited to 50% of memory, and shadow 360 * Active cache pages are limited to 50% of memory, and shadow
@@ -433,7 +433,7 @@ static enum lru_status shadow_lru_isolate(struct list_head *item,
433 } 433 }
434 } 434 }
435 BUG_ON(node->count); 435 BUG_ON(node->count);
436 inc_zone_state(page_zone(virt_to_page(node)), WORKINGSET_NODERECLAIM); 436 inc_node_state(page_pgdat(virt_to_page(node)), WORKINGSET_NODERECLAIM);
437 if (!__radix_tree_delete_node(&mapping->page_tree, node)) 437 if (!__radix_tree_delete_node(&mapping->page_tree, node))
438 BUG(); 438 BUG();
439 439
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index 04176de6df70..b0bc023d25c5 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -20,6 +20,7 @@
20 * page->freelist(index): links together all component pages of a zspage 20 * page->freelist(index): links together all component pages of a zspage
21 * For the huge page, this is always 0, so we use this field 21 * For the huge page, this is always 0, so we use this field
22 * to store handle. 22 * to store handle.
23 * page->units: first object offset in a subpage of zspage
23 * 24 *
24 * Usage of struct page flags: 25 * Usage of struct page flags:
25 * PG_private: identifies the first component page 26 * PG_private: identifies the first component page
@@ -137,9 +138,6 @@
137 */ 138 */
138#define ZS_SIZE_CLASS_DELTA (PAGE_SIZE >> CLASS_BITS) 139#define ZS_SIZE_CLASS_DELTA (PAGE_SIZE >> CLASS_BITS)
139 140
140/*
141 * We do not maintain any list for completely empty or full pages
142 */
143enum fullness_group { 141enum fullness_group {
144 ZS_EMPTY, 142 ZS_EMPTY,
145 ZS_ALMOST_EMPTY, 143 ZS_ALMOST_EMPTY,
@@ -467,11 +465,6 @@ static struct zpool_driver zs_zpool_driver = {
467MODULE_ALIAS("zpool-zsmalloc"); 465MODULE_ALIAS("zpool-zsmalloc");
468#endif /* CONFIG_ZPOOL */ 466#endif /* CONFIG_ZPOOL */
469 467
470static unsigned int get_maxobj_per_zspage(int size, int pages_per_zspage)
471{
472 return pages_per_zspage * PAGE_SIZE / size;
473}
474
475/* per-cpu VM mapping areas for zspage accesses that cross page boundaries */ 468/* per-cpu VM mapping areas for zspage accesses that cross page boundaries */
476static DEFINE_PER_CPU(struct mapping_area, zs_map_area); 469static DEFINE_PER_CPU(struct mapping_area, zs_map_area);
477 470
@@ -635,8 +628,7 @@ static int zs_stats_size_show(struct seq_file *s, void *v)
635 freeable = zs_can_compact(class); 628 freeable = zs_can_compact(class);
636 spin_unlock(&class->lock); 629 spin_unlock(&class->lock);
637 630
638 objs_per_zspage = get_maxobj_per_zspage(class->size, 631 objs_per_zspage = class->objs_per_zspage;
639 class->pages_per_zspage);
640 pages_used = obj_allocated / objs_per_zspage * 632 pages_used = obj_allocated / objs_per_zspage *
641 class->pages_per_zspage; 633 class->pages_per_zspage;
642 634
@@ -945,8 +937,8 @@ static void unpin_tag(unsigned long handle)
945static void reset_page(struct page *page) 937static void reset_page(struct page *page)
946{ 938{
947 __ClearPageMovable(page); 939 __ClearPageMovable(page);
948 clear_bit(PG_private, &page->flags); 940 ClearPagePrivate(page);
949 clear_bit(PG_private_2, &page->flags); 941 ClearPagePrivate2(page);
950 set_page_private(page, 0); 942 set_page_private(page, 0);
951 page_mapcount_reset(page); 943 page_mapcount_reset(page);
952 ClearPageHugeObject(page); 944 ClearPageHugeObject(page);
@@ -1014,8 +1006,7 @@ static void __free_zspage(struct zs_pool *pool, struct size_class *class,
1014 1006
1015 cache_free_zspage(pool, zspage); 1007 cache_free_zspage(pool, zspage);
1016 1008
1017 zs_stat_dec(class, OBJ_ALLOCATED, get_maxobj_per_zspage( 1009 zs_stat_dec(class, OBJ_ALLOCATED, class->objs_per_zspage);
1018 class->size, class->pages_per_zspage));
1019 atomic_long_sub(class->pages_per_zspage, 1010 atomic_long_sub(class->pages_per_zspage,
1020 &pool->pages_allocated); 1011 &pool->pages_allocated);
1021} 1012}
@@ -1350,7 +1341,7 @@ static void zs_unregister_cpu_notifier(void)
1350 cpu_notifier_register_done(); 1341 cpu_notifier_register_done();
1351} 1342}
1352 1343
1353static void init_zs_size_classes(void) 1344static void __init init_zs_size_classes(void)
1354{ 1345{
1355 int nr; 1346 int nr;
1356 1347
@@ -1361,16 +1352,14 @@ static void init_zs_size_classes(void)
1361 zs_size_classes = nr; 1352 zs_size_classes = nr;
1362} 1353}
1363 1354
1364static bool can_merge(struct size_class *prev, int size, int pages_per_zspage) 1355static bool can_merge(struct size_class *prev, int pages_per_zspage,
1356 int objs_per_zspage)
1365{ 1357{
1366 if (prev->pages_per_zspage != pages_per_zspage) 1358 if (prev->pages_per_zspage == pages_per_zspage &&
1367 return false; 1359 prev->objs_per_zspage == objs_per_zspage)
1360 return true;
1368 1361
1369 if (get_maxobj_per_zspage(prev->size, prev->pages_per_zspage) 1362 return false;
1370 != get_maxobj_per_zspage(size, pages_per_zspage))
1371 return false;
1372
1373 return true;
1374} 1363}
1375 1364
1376static bool zspage_full(struct size_class *class, struct zspage *zspage) 1365static bool zspage_full(struct size_class *class, struct zspage *zspage)
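
The repeated get_maxobj_per_zspage(size, pages_per_zspage) calls deleted throughout this file all computed the same quantity that is now cached as class->objs_per_zspage, and can_merge() simply compares the cached values. The arithmetic, with example sizes and a 4K page (not zsmalloc's real class table):

    #include <stdio.h>

    #define PAGE_SIZE 4096   /* example page size */

    static int objs_per_zspage(int size, int pages_per_zspage)
    {
        return pages_per_zspage * PAGE_SIZE / size;
    }

    /* Two size classes may be merged when both derived values match. */
    static int can_merge(int prev_pages, int prev_objs,
                         int pages_per_zspage, int objs)
    {
        return prev_pages == pages_per_zspage && prev_objs == objs;
    }

    int main(void)
    {
        int a = objs_per_zspage(2048, 1);   /* 2 objects per 1-page zspage  */
        int b = objs_per_zspage(1720, 1);   /* also 2; the tail is wasted   */

        printf("2048B: %d, 1720B: %d, mergeable: %d\n",
               a, b, can_merge(1, a, 1, b));
        return 0;
    }
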
@@ -1541,6 +1530,7 @@ static unsigned long obj_malloc(struct size_class *class,
1541 * zs_malloc - Allocate block of given size from pool. 1530 * zs_malloc - Allocate block of given size from pool.
1542 * @pool: pool to allocate from 1531 * @pool: pool to allocate from
1543 * @size: size of block to allocate 1532 * @size: size of block to allocate
1533 * @gfp: gfp flags when allocating object
1544 * 1534 *
1545 * On success, handle to the allocated object is returned, 1535 * On success, handle to the allocated object is returned,
1546 * otherwise 0. 1536 * otherwise 0.
@@ -1592,8 +1582,7 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp)
1592 record_obj(handle, obj); 1582 record_obj(handle, obj);
1593 atomic_long_add(class->pages_per_zspage, 1583 atomic_long_add(class->pages_per_zspage,
1594 &pool->pages_allocated); 1584 &pool->pages_allocated);
1595 zs_stat_inc(class, OBJ_ALLOCATED, get_maxobj_per_zspage( 1585 zs_stat_inc(class, OBJ_ALLOCATED, class->objs_per_zspage);
1596 class->size, class->pages_per_zspage));
1597 1586
1598 /* We completely set up zspage so mark them as movable */ 1587 /* We completely set up zspage so mark them as movable */
1599 SetZsPageMovable(pool, zspage); 1588 SetZsPageMovable(pool, zspage);
@@ -1741,10 +1730,11 @@ static void zs_object_copy(struct size_class *class, unsigned long dst,
1741 * return handle. 1730 * return handle.
1742 */ 1731 */
1743static unsigned long find_alloced_obj(struct size_class *class, 1732static unsigned long find_alloced_obj(struct size_class *class,
1744 struct page *page, int index) 1733 struct page *page, int *obj_idx)
1745{ 1734{
1746 unsigned long head; 1735 unsigned long head;
1747 int offset = 0; 1736 int offset = 0;
1737 int index = *obj_idx;
1748 unsigned long handle = 0; 1738 unsigned long handle = 0;
1749 void *addr = kmap_atomic(page); 1739 void *addr = kmap_atomic(page);
1750 1740
@@ -1765,6 +1755,9 @@ static unsigned long find_alloced_obj(struct size_class *class,
1765 } 1755 }
1766 1756
1767 kunmap_atomic(addr); 1757 kunmap_atomic(addr);
1758
1759 *obj_idx = index;
1760
1768 return handle; 1761 return handle;
1769} 1762}
1770 1763
@@ -1776,7 +1769,7 @@ struct zs_compact_control {
1776 struct page *d_page; 1769 struct page *d_page;
1777 /* Starting object index within @s_page which used for live object 1770 /* Starting object index within @s_page which used for live object
1778 * in the subpage. */ 1771 * in the subpage. */
1779 int index; 1772 int obj_idx;
1780}; 1773};
1781 1774
1782static int migrate_zspage(struct zs_pool *pool, struct size_class *class, 1775static int migrate_zspage(struct zs_pool *pool, struct size_class *class,
@@ -1786,16 +1779,16 @@ static int migrate_zspage(struct zs_pool *pool, struct size_class *class,
 	unsigned long handle;
 	struct page *s_page = cc->s_page;
 	struct page *d_page = cc->d_page;
-	unsigned long index = cc->index;
+	int obj_idx = cc->obj_idx;
 	int ret = 0;
 
 	while (1) {
-		handle = find_alloced_obj(class, s_page, index);
+		handle = find_alloced_obj(class, s_page, &obj_idx);
 		if (!handle) {
 			s_page = get_next_page(s_page);
 			if (!s_page)
 				break;
-			index = 0;
+			obj_idx = 0;
 			continue;
 		}
 
@@ -1809,7 +1802,7 @@ static int migrate_zspage(struct zs_pool *pool, struct size_class *class,
 		used_obj = handle_to_obj(handle);
 		free_obj = obj_malloc(class, get_zspage(d_page), handle);
 		zs_object_copy(class, free_obj, used_obj);
-		index++;
+		obj_idx++;
 		/*
 		 * record_obj updates handle's value to free_obj and it will
 		 * invalidate lock bit(ie, HANDLE_PIN_BIT) of handle, which
@@ -1824,7 +1817,7 @@ static int migrate_zspage(struct zs_pool *pool, struct size_class *class,
 
 	/* Remember last position in this iteration */
 	cc->s_page = s_page;
-	cc->index = index;
+	cc->obj_idx = obj_idx;
 
 	return ret;
 }
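Note on the obj_idx rework above: find_alloced_obj() now takes the scan index by pointer and writes the final position back, so migrate_zspage() resumes from where the previous scan stopped instead of starting the page walk over. A self-contained sketch of that in/out-parameter pattern; find_next_used() and the slots[] array are hypothetical stand-ins for the real object walk:

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

/* toy "page": a fixed array of slots, true == slot holds a live object */
static const bool slots[] = { false, true, false, false, true, true };

/*
 * Return the index of the next used slot at or after *obj_idx, or -1.
 * Like find_alloced_obj() after this change, the scan position is
 * written back through obj_idx so the caller can continue from there.
 */
static int find_next_used(const bool *page, size_t nslots, int *obj_idx)
{
	int index = *obj_idx;

	while ((size_t)index < nslots && !page[index])
		index++;

	*obj_idx = index;		/* remember where the scan stopped */
	return (size_t)index < nslots ? index : -1;
}

int main(void)
{
	int obj_idx = 0;
	int found;

	/* the caller loop mirrors migrate_zspage(): step past each hit */
	while ((found = find_next_used(slots, sizeof(slots) / sizeof(slots[0]),
				       &obj_idx)) >= 0) {
		printf("live object at slot %d\n", found);
		obj_idx++;		/* like obj_idx++ after the copy */
	}
	return 0;
}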
@@ -2181,8 +2174,7 @@ static int zs_register_migration(struct zs_pool *pool)
 static void zs_unregister_migration(struct zs_pool *pool)
 {
 	flush_work(&pool->free_work);
-	if (pool->inode)
-		iput(pool->inode);
+	iput(pool->inode);
 }
 
 /*
@@ -2261,8 +2253,7 @@ static unsigned long zs_can_compact(struct size_class *class)
 		return 0;
 
 	obj_wasted = obj_allocated - obj_used;
-	obj_wasted /= get_maxobj_per_zspage(class->size,
-			class->pages_per_zspage);
+	obj_wasted /= class->objs_per_zspage;
 
 	return obj_wasted * class->pages_per_zspage;
 }
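The simplified zs_can_compact() arithmetic reads: the allocated-but-unused objects, divided by the class's objs_per_zspage, give the number of whole zspages that compaction could empty, each of which frees pages_per_zspage pages. A small worked example with made-up numbers (objs_per_zspage = 5, pages_per_zspage = 4):

#include <stdio.h>

/* illustrative per-class geometry, not taken from a real pool */
#define OBJS_PER_ZSPAGE		5
#define PAGES_PER_ZSPAGE	4

/* mirrors the zs_can_compact() calculation shown in the hunk above */
static unsigned long pages_reclaimable(unsigned long obj_allocated,
				       unsigned long obj_used)
{
	unsigned long obj_wasted;

	if (obj_used >= obj_allocated)
		return 0;

	obj_wasted = obj_allocated - obj_used;
	obj_wasted /= OBJS_PER_ZSPAGE;		/* whole zspages' worth of holes */

	return obj_wasted * PAGES_PER_ZSPAGE;	/* pages freed by compacting them */
}

int main(void)
{
	/* 120 object slots allocated across zspages, only 83 in use */
	printf("%lu pages reclaimable\n", pages_reclaimable(120, 83));
	/* (120 - 83) / 5 = 7 zspages, 7 * 4 = 28 pages */
	return 0;
}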
@@ -2279,7 +2270,7 @@ static void __zs_compact(struct zs_pool *pool, struct size_class *class)
 		if (!zs_can_compact(class))
 			break;
 
-		cc.index = 0;
+		cc.obj_idx = 0;
 		cc.s_page = get_first_page(src_zspage);
 
 		while ((dst_zspage = isolate_zspage(class, false))) {
@@ -2398,7 +2389,7 @@ static int zs_register_shrinker(struct zs_pool *pool)
 
 /**
  * zs_create_pool - Creates an allocation pool to work from.
- * @flags: allocation flags used to allocate pool metadata
+ * @name: pool name to be created
  *
  * This function must be called before anything when using
  * the zsmalloc allocator.
@@ -2438,6 +2429,7 @@ struct zs_pool *zs_create_pool(const char *name)
 	for (i = zs_size_classes - 1; i >= 0; i--) {
 		int size;
 		int pages_per_zspage;
+		int objs_per_zspage;
 		struct size_class *class;
 		int fullness = 0;
 
@@ -2445,6 +2437,7 @@ struct zs_pool *zs_create_pool(const char *name)
 		if (size > ZS_MAX_ALLOC_SIZE)
 			size = ZS_MAX_ALLOC_SIZE;
 		pages_per_zspage = get_pages_per_zspage(size);
+		objs_per_zspage = pages_per_zspage * PAGE_SIZE / size;
 
 		/*
 		 * size_class is used for normal zsmalloc operation such
@@ -2456,7 +2449,7 @@ struct zs_pool *zs_create_pool(const char *name)
 		 * previous size_class if possible.
 		 */
 		if (prev_class) {
-			if (can_merge(prev_class, size, pages_per_zspage)) {
+			if (can_merge(prev_class, pages_per_zspage, objs_per_zspage)) {
 				pool->size_class[i] = prev_class;
 				continue;
 			}
@@ -2469,8 +2462,7 @@ struct zs_pool *zs_create_pool(const char *name)
 		class->size = size;
 		class->index = i;
 		class->pages_per_zspage = pages_per_zspage;
-		class->objs_per_zspage = class->pages_per_zspage *
-						PAGE_SIZE / class->size;
+		class->objs_per_zspage = objs_per_zspage;
 		spin_lock_init(&class->lock);
 		pool->size_class[i] = class;
 		for (fullness = ZS_EMPTY; fullness < NR_ZS_FULLNESS;
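In zs_create_pool() the objects-per-zspage figure is now computed once per candidate class, objs_per_zspage = pages_per_zspage * PAGE_SIZE / size, and reused both for the can_merge() check and for class->objs_per_zspage. The loop below just walks that integer division for a few made-up (size, pages_per_zspage) pairs, assuming 4 KiB pages; it illustrates the formula, not the real class table:

#include <stdio.h>

#define DEMO_PAGE_SIZE 4096	/* assumption: 4 KiB pages */

int main(void)
{
	/* made-up (size, pages_per_zspage) pairs, not the kernel's classes */
	static const struct { int size; int pages_per_zspage; } classes[] = {
		{ 256, 1 }, { 1024, 1 }, { 2448, 3 }, { 3264, 4 },
	};
	size_t i;

	for (i = 0; i < sizeof(classes) / sizeof(classes[0]); i++) {
		int objs_per_zspage = classes[i].pages_per_zspage *
				      DEMO_PAGE_SIZE / classes[i].size;

		printf("size %4d: %d page(s) -> %2d objects per zspage\n",
		       classes[i].size, classes[i].pages_per_zspage,
		       objs_per_zspage);
	}
	return 0;
}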
diff --git a/tools/perf/builtin-kmem.c b/tools/perf/builtin-kmem.c
index b1d491c2e704..fdde1bd3e306 100644
--- a/tools/perf/builtin-kmem.c
+++ b/tools/perf/builtin-kmem.c
@@ -608,6 +608,7 @@ static const struct {
 	const char *compact;
 } gfp_compact_table[] = {
 	{ "GFP_TRANSHUGE",		"THP" },
+	{ "GFP_TRANSHUGE_LIGHT",	"THL" },
 	{ "GFP_HIGHUSER_MOVABLE",	"HUM" },
 	{ "GFP_HIGHUSER",		"HU" },
 	{ "GFP_USER",			"U" },
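The perf change adds a short column code for the new GFP_TRANSHUGE_LIGHT combination to builtin-kmem's table of human-readable GFP names. The sketch below shows how such a name-to-code table can be consulted; compact_gfp_name() is a hypothetical helper written for illustration, not the function perf itself uses:

#include <stdio.h>
#include <string.h>

static const struct {
	const char *original;
	const char *compact;
} gfp_compact_table[] = {
	{ "GFP_TRANSHUGE",		"THP" },
	{ "GFP_TRANSHUGE_LIGHT",	"THL" },	/* new in this series */
	{ "GFP_HIGHUSER_MOVABLE",	"HUM" },
	{ "GFP_HIGHUSER",		"HU" },
	{ "GFP_USER",			"U" },
};

/* hypothetical helper: map a long flag name to its short column code */
static const char *compact_gfp_name(const char *name)
{
	size_t i;

	for (i = 0; i < sizeof(gfp_compact_table) / sizeof(gfp_compact_table[0]); i++)
		if (!strcmp(gfp_compact_table[i].original, name))
			return gfp_compact_table[i].compact;
	return name;	/* unknown flag combinations are printed verbatim */
}

int main(void)
{
	printf("%s\n", compact_gfp_name("GFP_TRANSHUGE_LIGHT"));	/* prints THL */
	return 0;
}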