aboutsummaryrefslogtreecommitdiffstats
path: root/mm
diff options
context:
space:
mode:
authorH. Peter Anvin <hpa@linux.intel.com>2013-01-29 17:59:09 -0500
committerH. Peter Anvin <hpa@linux.intel.com>2013-01-29 18:10:15 -0500
commitde65d816aa44f9ddd79861ae21d75010cc1fd003 (patch)
tree04a637a43b2e52a733d0dcb7595a47057571e7da /mm
parent9710f581bb4c35589ac046b0cfc0deb7f369fc85 (diff)
parent5dcd14ecd41ea2b3ae3295a9b30d98769d52165f (diff)
Merge remote-tracking branch 'origin/x86/boot' into x86/mm2
Coming patches to x86/mm2 require the changes and advanced baseline in x86/boot. Resolved Conflicts: arch/x86/kernel/setup.c mm/nobootmem.c Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
Diffstat (limited to 'mm')
-rw-r--r--mm/Kconfig34
-rw-r--r--mm/Makefile3
-rw-r--r--mm/balloon_compaction.c302
-rw-r--r--mm/bootmem.c103
-rw-r--r--mm/compaction.c166
-rw-r--r--mm/dmapool.c55
-rw-r--r--mm/highmem.c30
-rw-r--r--mm/huge_memory.c658
-rw-r--r--mm/hugetlb.c63
-rw-r--r--mm/hugetlb_cgroup.c42
-rw-r--r--mm/internal.h13
-rw-r--r--mm/kmemleak.c3
-rw-r--r--mm/ksm.c37
-rw-r--r--mm/memblock.c3
-rw-r--r--mm/memcontrol.c1483
-rw-r--r--mm/memory-failure.c43
-rw-r--r--mm/memory.c251
-rw-r--r--mm/memory_hotplug.c430
-rw-r--r--mm/mempolicy.c470
-rw-r--r--mm/migrate.c450
-rw-r--r--mm/mmap.c569
-rw-r--r--mm/mprotect.c151
-rw-r--r--mm/mremap.c4
-rw-r--r--mm/nobootmem.c21
-rw-r--r--mm/nommu.c15
-rw-r--r--mm/oom_kill.c138
-rw-r--r--mm/page-writeback.c36
-rw-r--r--mm/page_alloc.c421
-rw-r--r--mm/page_cgroup.c5
-rw-r--r--mm/page_isolation.c53
-rw-r--r--mm/pagewalk.c2
-rw-r--r--mm/percpu.c5
-rw-r--r--mm/pgtable-generic.c9
-rw-r--r--mm/rmap.c134
-rw-r--r--mm/shmem.c122
-rw-r--r--mm/slab.c383
-rw-r--r--mm/slab.h190
-rw-r--r--mm/slab_common.c292
-rw-r--r--mm/slob.c48
-rw-r--r--mm/slub.c451
-rw-r--r--mm/sparse.c35
-rw-r--r--mm/swapfile.c31
-rw-r--r--mm/truncate.c23
-rw-r--r--mm/util.c2
-rw-r--r--mm/vmalloc.c4
-rw-r--r--mm/vmscan.c242
-rw-r--r--mm/vmstat.c28
47 files changed, 5954 insertions, 2099 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index a3f8dddaaab3..278e3ab1f169 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -143,6 +143,25 @@ config NO_BOOTMEM
143config MEMORY_ISOLATION 143config MEMORY_ISOLATION
144 boolean 144 boolean
145 145
146config MOVABLE_NODE
147 boolean "Enable to assign a node which has only movable memory"
148 depends on HAVE_MEMBLOCK
149 depends on NO_BOOTMEM
150 depends on X86_64
151 depends on NUMA
152 default n
153 help
154 Allow a node to have only movable memory. Pages used by the kernel,
155 such as direct mapping pages cannot be migrated. So the corresponding
156 memory device cannot be hotplugged. This option allows users to
157 online all the memory of a node as movable memory so that the whole
158 node can be hotplugged. Users who don't use the memory hotplug
159 feature are fine with this option on since they don't online memory
160 as movable.
161
162 Say Y here if you want to hotplug a whole node.
163 Say N here if you want kernel to use memory on all nodes evenly.
164
146# eventually, we can have this option just 'select SPARSEMEM' 165# eventually, we can have this option just 'select SPARSEMEM'
147config MEMORY_HOTPLUG 166config MEMORY_HOTPLUG
148 bool "Allow for memory hot-add" 167 bool "Allow for memory hot-add"
@@ -188,6 +207,21 @@ config SPLIT_PTLOCK_CPUS
188 default "4" 207 default "4"
189 208
190# 209#
210# support for memory balloon compaction
211config BALLOON_COMPACTION
212 bool "Allow for balloon memory compaction/migration"
213 def_bool y
214 depends on COMPACTION && VIRTIO_BALLOON
215 help
216 Memory fragmentation introduced by ballooning might reduce
217 significantly the number of 2MB contiguous memory blocks that can be
218 used within a guest, thus imposing performance penalties associated
219 with the reduced number of transparent huge pages that could be used
220 by the guest workload. Allowing the compaction & migration for memory
221 pages enlisted as being part of memory balloon devices avoids the
222 scenario aforementioned and helps improve memory defragmentation.
223
224#
191# support for memory compaction 225# support for memory compaction
192config COMPACTION 226config COMPACTION
193 bool "Allow for memory compaction" 227 bool "Allow for memory compaction"
diff --git a/mm/Makefile b/mm/Makefile
index 6b025f80af34..3a4628751f89 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -16,7 +16,8 @@ obj-y := filemap.o mempool.o oom_kill.o fadvise.o \
16 readahead.o swap.o truncate.o vmscan.o shmem.o \ 16 readahead.o swap.o truncate.o vmscan.o shmem.o \
17 util.o mmzone.o vmstat.o backing-dev.o \ 17 util.o mmzone.o vmstat.o backing-dev.o \
18 mm_init.o mmu_context.o percpu.o slab_common.o \ 18 mm_init.o mmu_context.o percpu.o slab_common.o \
19 compaction.o interval_tree.o $(mmu-y) 19 compaction.o balloon_compaction.o \
20 interval_tree.o $(mmu-y)
20 21
21obj-y += init-mm.o 22obj-y += init-mm.o
22 23
diff --git a/mm/balloon_compaction.c b/mm/balloon_compaction.c
new file mode 100644
index 000000000000..07dbc8ec46cf
--- /dev/null
+++ b/mm/balloon_compaction.c
@@ -0,0 +1,302 @@
1/*
2 * mm/balloon_compaction.c
3 *
4 * Common interface for making balloon pages movable by compaction.
5 *
6 * Copyright (C) 2012, Red Hat, Inc. Rafael Aquini <aquini@redhat.com>
7 */
8#include <linux/mm.h>
9#include <linux/slab.h>
10#include <linux/export.h>
11#include <linux/balloon_compaction.h>
12
13/*
14 * balloon_devinfo_alloc - allocates a balloon device information descriptor.
15 * @balloon_dev_descriptor: pointer to reference the balloon device which
16 * this struct balloon_dev_info will be servicing.
17 *
18 * Driver must call it to properly allocate and initialize an instance of
19 * struct balloon_dev_info which will be used to reference a balloon device
20 * as well as to keep track of the balloon device page list.
21 */
22struct balloon_dev_info *balloon_devinfo_alloc(void *balloon_dev_descriptor)
23{
24 struct balloon_dev_info *b_dev_info;
25 b_dev_info = kmalloc(sizeof(*b_dev_info), GFP_KERNEL);
26 if (!b_dev_info)
27 return ERR_PTR(-ENOMEM);
28
29 b_dev_info->balloon_device = balloon_dev_descriptor;
30 b_dev_info->mapping = NULL;
31 b_dev_info->isolated_pages = 0;
32 spin_lock_init(&b_dev_info->pages_lock);
33 INIT_LIST_HEAD(&b_dev_info->pages);
34
35 return b_dev_info;
36}
37EXPORT_SYMBOL_GPL(balloon_devinfo_alloc);
38
39/*
40 * balloon_page_enqueue - allocates a new page and inserts it into the balloon
41 * page list.
 42 * @b_dev_info: balloon device descriptor where we will insert a new page to
43 *
44 * Driver must call it to properly allocate a new enlisted balloon page
 45 * before definitively removing it from the guest system.
46 * This function returns the page address for the recently enqueued page or
47 * NULL in the case we fail to allocate a new page this turn.
48 */
49struct page *balloon_page_enqueue(struct balloon_dev_info *b_dev_info)
50{
51 unsigned long flags;
52 struct page *page = alloc_page(balloon_mapping_gfp_mask() |
53 __GFP_NOMEMALLOC | __GFP_NORETRY);
54 if (!page)
55 return NULL;
56
57 /*
58 * Block others from accessing the 'page' when we get around to
59 * establishing additional references. We should be the only one
60 * holding a reference to the 'page' at this point.
61 */
62 BUG_ON(!trylock_page(page));
63 spin_lock_irqsave(&b_dev_info->pages_lock, flags);
64 balloon_page_insert(page, b_dev_info->mapping, &b_dev_info->pages);
65 spin_unlock_irqrestore(&b_dev_info->pages_lock, flags);
66 unlock_page(page);
67 return page;
68}
69EXPORT_SYMBOL_GPL(balloon_page_enqueue);
70
71/*
72 * balloon_page_dequeue - removes a page from balloon's page list and returns
73 * the its address to allow the driver release the page.
 74 * @b_dev_info: balloon device descriptor where we will grab a page from.
75 *
76 * Driver must call it to properly de-allocate a previous enlisted balloon page
 77 * before definitively releasing it back to the guest system.
78 * This function returns the page address for the recently dequeued page or
79 * NULL in the case we find balloon's page list temporarily empty due to
80 * compaction isolated pages.
81 */
82struct page *balloon_page_dequeue(struct balloon_dev_info *b_dev_info)
83{
84 struct page *page, *tmp;
85 unsigned long flags;
86 bool dequeued_page;
87
88 dequeued_page = false;
89 list_for_each_entry_safe(page, tmp, &b_dev_info->pages, lru) {
90 /*
91 * Block others from accessing the 'page' while we get around
92 * establishing additional references and preparing the 'page'
93 * to be released by the balloon driver.
94 */
95 if (trylock_page(page)) {
96 spin_lock_irqsave(&b_dev_info->pages_lock, flags);
97 /*
98 * Raise the page refcount here to prevent any wrong
 99 * attempt to isolate this page, in case of colliding
100 * with balloon_page_isolate() just after we release
101 * the page lock.
102 *
103 * balloon_page_free() will take care of dropping
104 * this extra refcount later.
105 */
106 get_page(page);
107 balloon_page_delete(page);
108 spin_unlock_irqrestore(&b_dev_info->pages_lock, flags);
109 unlock_page(page);
110 dequeued_page = true;
111 break;
112 }
113 }
114
115 if (!dequeued_page) {
116 /*
117 * If we are unable to dequeue a balloon page because the page
118 * list is empty and there is no isolated pages, then something
119 * went out of track and some balloon pages are lost.
120 * BUG() here, otherwise the balloon driver may get stuck into
121 * an infinite loop while attempting to release all its pages.
122 */
123 spin_lock_irqsave(&b_dev_info->pages_lock, flags);
124 if (unlikely(list_empty(&b_dev_info->pages) &&
125 !b_dev_info->isolated_pages))
126 BUG();
127 spin_unlock_irqrestore(&b_dev_info->pages_lock, flags);
128 page = NULL;
129 }
130 return page;
131}
132EXPORT_SYMBOL_GPL(balloon_page_dequeue);
133
134#ifdef CONFIG_BALLOON_COMPACTION
135/*
136 * balloon_mapping_alloc - allocates a special ->mapping for ballooned pages.
137 * @b_dev_info: holds the balloon device information descriptor.
138 * @a_ops: balloon_mapping address_space_operations descriptor.
139 *
140 * Driver must call it to properly allocate and initialize an instance of
141 * struct address_space which will be used as the special page->mapping for
142 * balloon device enlisted page instances.
143 */
144struct address_space *balloon_mapping_alloc(struct balloon_dev_info *b_dev_info,
145 const struct address_space_operations *a_ops)
146{
147 struct address_space *mapping;
148
149 mapping = kmalloc(sizeof(*mapping), GFP_KERNEL);
150 if (!mapping)
151 return ERR_PTR(-ENOMEM);
152
153 /*
154 * Give a clean 'zeroed' status to all elements of this special
155 * balloon page->mapping struct address_space instance.
156 */
157 address_space_init_once(mapping);
158
159 /*
160 * Set mapping->flags appropriately, to allow balloon pages
161 * ->mapping identification.
162 */
163 mapping_set_balloon(mapping);
164 mapping_set_gfp_mask(mapping, balloon_mapping_gfp_mask());
165
166 /* balloon's page->mapping->a_ops callback descriptor */
167 mapping->a_ops = a_ops;
168
169 /*
170 * Establish a pointer reference back to the balloon device descriptor
171 * this particular page->mapping will be servicing.
172 * This is used by compaction / migration procedures to identify and
173 * access the balloon device pageset while isolating / migrating pages.
174 *
175 * As some balloon drivers can register multiple balloon devices
176 * for a single guest, this also helps compaction / migration to
177 * properly deal with multiple balloon pagesets, when required.
178 */
179 mapping->private_data = b_dev_info;
180 b_dev_info->mapping = mapping;
181
182 return mapping;
183}
184EXPORT_SYMBOL_GPL(balloon_mapping_alloc);
185
186static inline void __isolate_balloon_page(struct page *page)
187{
188 struct balloon_dev_info *b_dev_info = page->mapping->private_data;
189 unsigned long flags;
190 spin_lock_irqsave(&b_dev_info->pages_lock, flags);
191 list_del(&page->lru);
192 b_dev_info->isolated_pages++;
193 spin_unlock_irqrestore(&b_dev_info->pages_lock, flags);
194}
195
196static inline void __putback_balloon_page(struct page *page)
197{
198 struct balloon_dev_info *b_dev_info = page->mapping->private_data;
199 unsigned long flags;
200 spin_lock_irqsave(&b_dev_info->pages_lock, flags);
201 list_add(&page->lru, &b_dev_info->pages);
202 b_dev_info->isolated_pages--;
203 spin_unlock_irqrestore(&b_dev_info->pages_lock, flags);
204}
205
206static inline int __migrate_balloon_page(struct address_space *mapping,
207 struct page *newpage, struct page *page, enum migrate_mode mode)
208{
209 return page->mapping->a_ops->migratepage(mapping, newpage, page, mode);
210}
211
212/* __isolate_lru_page() counterpart for a ballooned page */
213bool balloon_page_isolate(struct page *page)
214{
215 /*
216 * Avoid burning cycles with pages that are yet under __free_pages(),
217 * or just got freed under us.
218 *
219 * In case we 'win' a race for a balloon page being freed under us and
220 * raise its refcount preventing __free_pages() from doing its job
221 * the put_page() at the end of this block will take care of
222 * release this page, thus avoiding a nasty leakage.
223 */
224 if (likely(get_page_unless_zero(page))) {
225 /*
226 * As balloon pages are not isolated from LRU lists, concurrent
227 * compaction threads can race against page migration functions
228 * as well as race against the balloon driver releasing a page.
229 *
230 * In order to avoid having an already isolated balloon page
231 * being (wrongly) re-isolated while it is under migration,
232 * or to avoid attempting to isolate pages being released by
233 * the balloon driver, lets be sure we have the page lock
234 * before proceeding with the balloon page isolation steps.
235 */
236 if (likely(trylock_page(page))) {
237 /*
238 * A ballooned page, by default, has just one refcount.
239 * Prevent concurrent compaction threads from isolating
240 * an already isolated balloon page by refcount check.
241 */
242 if (__is_movable_balloon_page(page) &&
243 page_count(page) == 2) {
244 __isolate_balloon_page(page);
245 unlock_page(page);
246 return true;
247 }
248 unlock_page(page);
249 }
250 put_page(page);
251 }
252 return false;
253}
254
255/* putback_lru_page() counterpart for a ballooned page */
256void balloon_page_putback(struct page *page)
257{
258 /*
259 * 'lock_page()' stabilizes the page and prevents races against
260 * concurrent isolation threads attempting to re-isolate it.
261 */
262 lock_page(page);
263
264 if (__is_movable_balloon_page(page)) {
265 __putback_balloon_page(page);
266 /* drop the extra ref count taken for page isolation */
267 put_page(page);
268 } else {
269 WARN_ON(1);
270 dump_page(page);
271 }
272 unlock_page(page);
273}
274
275/* move_to_new_page() counterpart for a ballooned page */
276int balloon_page_migrate(struct page *newpage,
277 struct page *page, enum migrate_mode mode)
278{
279 struct address_space *mapping;
280 int rc = -EAGAIN;
281
282 /*
283 * Block others from accessing the 'newpage' when we get around to
284 * establishing additional references. We should be the only one
285 * holding a reference to the 'newpage' at this point.
286 */
287 BUG_ON(!trylock_page(newpage));
288
289 if (WARN_ON(!__is_movable_balloon_page(page))) {
290 dump_page(page);
291 unlock_page(newpage);
292 return rc;
293 }
294
295 mapping = page->mapping;
296 if (mapping)
297 rc = __migrate_balloon_page(mapping, newpage, page, mode);
298
299 unlock_page(newpage);
300 return rc;
301}
302#endif /* CONFIG_BALLOON_COMPACTION */
diff --git a/mm/bootmem.c b/mm/bootmem.c
index f468185b3b28..b93376c39b61 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -147,21 +147,21 @@ unsigned long __init init_bootmem(unsigned long start, unsigned long pages)
147 147
148/* 148/*
149 * free_bootmem_late - free bootmem pages directly to page allocator 149 * free_bootmem_late - free bootmem pages directly to page allocator
150 * @addr: starting address of the range 150 * @addr: starting physical address of the range
151 * @size: size of the range in bytes 151 * @size: size of the range in bytes
152 * 152 *
153 * This is only useful when the bootmem allocator has already been torn 153 * This is only useful when the bootmem allocator has already been torn
154 * down, but we are still initializing the system. Pages are given directly 154 * down, but we are still initializing the system. Pages are given directly
155 * to the page allocator, no bootmem metadata is updated because it is gone. 155 * to the page allocator, no bootmem metadata is updated because it is gone.
156 */ 156 */
157void __init free_bootmem_late(unsigned long addr, unsigned long size) 157void __init free_bootmem_late(unsigned long physaddr, unsigned long size)
158{ 158{
159 unsigned long cursor, end; 159 unsigned long cursor, end;
160 160
161 kmemleak_free_part(__va(addr), size); 161 kmemleak_free_part(__va(physaddr), size);
162 162
163 cursor = PFN_UP(addr); 163 cursor = PFN_UP(physaddr);
164 end = PFN_DOWN(addr + size); 164 end = PFN_DOWN(physaddr + size);
165 165
166 for (; cursor < end; cursor++) { 166 for (; cursor < end; cursor++) {
167 __free_pages_bootmem(pfn_to_page(cursor), 0); 167 __free_pages_bootmem(pfn_to_page(cursor), 0);
@@ -185,10 +185,23 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
185 185
186 while (start < end) { 186 while (start < end) {
187 unsigned long *map, idx, vec; 187 unsigned long *map, idx, vec;
188 unsigned shift;
188 189
189 map = bdata->node_bootmem_map; 190 map = bdata->node_bootmem_map;
190 idx = start - bdata->node_min_pfn; 191 idx = start - bdata->node_min_pfn;
192 shift = idx & (BITS_PER_LONG - 1);
193 /*
194 * vec holds at most BITS_PER_LONG map bits,
195 * bit 0 corresponds to start.
196 */
191 vec = ~map[idx / BITS_PER_LONG]; 197 vec = ~map[idx / BITS_PER_LONG];
198
199 if (shift) {
200 vec >>= shift;
201 if (end - start >= BITS_PER_LONG)
202 vec |= ~map[idx / BITS_PER_LONG + 1] <<
203 (BITS_PER_LONG - shift);
204 }
192 /* 205 /*
193 * If we have a properly aligned and fully unreserved 206 * If we have a properly aligned and fully unreserved
194 * BITS_PER_LONG block of pages in front of us, free 207 * BITS_PER_LONG block of pages in front of us, free
@@ -201,19 +214,18 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
201 count += BITS_PER_LONG; 214 count += BITS_PER_LONG;
202 start += BITS_PER_LONG; 215 start += BITS_PER_LONG;
203 } else { 216 } else {
204 unsigned long off = 0; 217 unsigned long cur = start;
205 218
206 vec >>= start & (BITS_PER_LONG - 1); 219 start = ALIGN(start + 1, BITS_PER_LONG);
207 while (vec) { 220 while (vec && cur != start) {
208 if (vec & 1) { 221 if (vec & 1) {
209 page = pfn_to_page(start + off); 222 page = pfn_to_page(cur);
210 __free_pages_bootmem(page, 0); 223 __free_pages_bootmem(page, 0);
211 count++; 224 count++;
212 } 225 }
213 vec >>= 1; 226 vec >>= 1;
214 off++; 227 ++cur;
215 } 228 }
216 start = ALIGN(start + 1, BITS_PER_LONG);
217 } 229 }
218 } 230 }
219 231
@@ -229,6 +241,22 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
229 return count; 241 return count;
230} 242}
231 243
244static void reset_node_lowmem_managed_pages(pg_data_t *pgdat)
245{
246 struct zone *z;
247
248 /*
249 * In free_area_init_core(), highmem zone's managed_pages is set to
250 * present_pages, and bootmem allocator doesn't allocate from highmem
251 * zones. So there's no need to recalculate managed_pages because all
252 * highmem pages will be managed by the buddy system. Here highmem
253 * zone also includes highmem movable zone.
254 */
255 for (z = pgdat->node_zones; z < pgdat->node_zones + MAX_NR_ZONES; z++)
256 if (!is_highmem(z))
257 z->managed_pages = 0;
258}
259
232/** 260/**
233 * free_all_bootmem_node - release a node's free pages to the buddy allocator 261 * free_all_bootmem_node - release a node's free pages to the buddy allocator
234 * @pgdat: node to be released 262 * @pgdat: node to be released
@@ -238,6 +266,7 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
238unsigned long __init free_all_bootmem_node(pg_data_t *pgdat) 266unsigned long __init free_all_bootmem_node(pg_data_t *pgdat)
239{ 267{
240 register_page_bootmem_info_node(pgdat); 268 register_page_bootmem_info_node(pgdat);
269 reset_node_lowmem_managed_pages(pgdat);
241 return free_all_bootmem_core(pgdat->bdata); 270 return free_all_bootmem_core(pgdat->bdata);
242} 271}
243 272
@@ -250,6 +279,10 @@ unsigned long __init free_all_bootmem(void)
250{ 279{
251 unsigned long total_pages = 0; 280 unsigned long total_pages = 0;
252 bootmem_data_t *bdata; 281 bootmem_data_t *bdata;
282 struct pglist_data *pgdat;
283
284 for_each_online_pgdat(pgdat)
285 reset_node_lowmem_managed_pages(pgdat);
253 286
254 list_for_each_entry(bdata, &bdata_list, list) 287 list_for_each_entry(bdata, &bdata_list, list)
255 total_pages += free_all_bootmem_core(bdata); 288 total_pages += free_all_bootmem_core(bdata);
@@ -377,21 +410,21 @@ void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
377 410
378/** 411/**
379 * free_bootmem - mark a page range as usable 412 * free_bootmem - mark a page range as usable
380 * @addr: starting address of the range 413 * @addr: starting physical address of the range
381 * @size: size of the range in bytes 414 * @size: size of the range in bytes
382 * 415 *
383 * Partial pages will be considered reserved and left as they are. 416 * Partial pages will be considered reserved and left as they are.
384 * 417 *
385 * The range must be contiguous but may span node boundaries. 418 * The range must be contiguous but may span node boundaries.
386 */ 419 */
387void __init free_bootmem(unsigned long addr, unsigned long size) 420void __init free_bootmem(unsigned long physaddr, unsigned long size)
388{ 421{
389 unsigned long start, end; 422 unsigned long start, end;
390 423
391 kmemleak_free_part(__va(addr), size); 424 kmemleak_free_part(__va(physaddr), size);
392 425
393 start = PFN_UP(addr); 426 start = PFN_UP(physaddr);
394 end = PFN_DOWN(addr + size); 427 end = PFN_DOWN(physaddr + size);
395 428
396 mark_bootmem(start, end, 0, 0); 429 mark_bootmem(start, end, 0, 0);
397} 430}
@@ -439,12 +472,6 @@ int __init reserve_bootmem(unsigned long addr, unsigned long size,
439 return mark_bootmem(start, end, 1, flags); 472 return mark_bootmem(start, end, 1, flags);
440} 473}
441 474
442int __weak __init reserve_bootmem_generic(unsigned long phys, unsigned long len,
443 int flags)
444{
445 return reserve_bootmem(phys, len, flags);
446}
447
448static unsigned long __init align_idx(struct bootmem_data *bdata, 475static unsigned long __init align_idx(struct bootmem_data *bdata,
449 unsigned long idx, unsigned long step) 476 unsigned long idx, unsigned long step)
450{ 477{
@@ -575,27 +602,6 @@ find_block:
575 return NULL; 602 return NULL;
576} 603}
577 604
578static void * __init alloc_arch_preferred_bootmem(bootmem_data_t *bdata,
579 unsigned long size, unsigned long align,
580 unsigned long goal, unsigned long limit)
581{
582 if (WARN_ON_ONCE(slab_is_available()))
583 return kzalloc(size, GFP_NOWAIT);
584
585#ifdef CONFIG_HAVE_ARCH_BOOTMEM
586 {
587 bootmem_data_t *p_bdata;
588
589 p_bdata = bootmem_arch_preferred_node(bdata, size, align,
590 goal, limit);
591 if (p_bdata)
592 return alloc_bootmem_bdata(p_bdata, size, align,
593 goal, limit);
594 }
595#endif
596 return NULL;
597}
598
599static void * __init alloc_bootmem_core(unsigned long size, 605static void * __init alloc_bootmem_core(unsigned long size,
600 unsigned long align, 606 unsigned long align,
601 unsigned long goal, 607 unsigned long goal,
@@ -604,9 +610,8 @@ static void * __init alloc_bootmem_core(unsigned long size,
604 bootmem_data_t *bdata; 610 bootmem_data_t *bdata;
605 void *region; 611 void *region;
606 612
607 region = alloc_arch_preferred_bootmem(NULL, size, align, goal, limit); 613 if (WARN_ON_ONCE(slab_is_available()))
608 if (region) 614 return kzalloc(size, GFP_NOWAIT);
609 return region;
610 615
611 list_for_each_entry(bdata, &bdata_list, list) { 616 list_for_each_entry(bdata, &bdata_list, list) {
612 if (goal && bdata->node_low_pfn <= PFN_DOWN(goal)) 617 if (goal && bdata->node_low_pfn <= PFN_DOWN(goal))
@@ -704,11 +709,9 @@ void * __init ___alloc_bootmem_node_nopanic(pg_data_t *pgdat,
704{ 709{
705 void *ptr; 710 void *ptr;
706 711
712 if (WARN_ON_ONCE(slab_is_available()))
713 return kzalloc(size, GFP_NOWAIT);
707again: 714again:
708 ptr = alloc_arch_preferred_bootmem(pgdat->bdata, size,
709 align, goal, limit);
710 if (ptr)
711 return ptr;
712 715
713 /* do not panic in alloc_bootmem_bdata() */ 716 /* do not panic in alloc_bootmem_bdata() */
714 if (limit && goal + size > limit) 717 if (limit && goal + size > limit)
diff --git a/mm/compaction.c b/mm/compaction.c
index 9eef55838fca..c62bd063d766 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -14,8 +14,24 @@
14#include <linux/backing-dev.h> 14#include <linux/backing-dev.h>
15#include <linux/sysctl.h> 15#include <linux/sysctl.h>
16#include <linux/sysfs.h> 16#include <linux/sysfs.h>
17#include <linux/balloon_compaction.h>
17#include "internal.h" 18#include "internal.h"
18 19
20#ifdef CONFIG_COMPACTION
21static inline void count_compact_event(enum vm_event_item item)
22{
23 count_vm_event(item);
24}
25
26static inline void count_compact_events(enum vm_event_item item, long delta)
27{
28 count_vm_events(item, delta);
29}
30#else
31#define count_compact_event(item) do { } while (0)
32#define count_compact_events(item, delta) do { } while (0)
33#endif
34
19#if defined CONFIG_COMPACTION || defined CONFIG_CMA 35#if defined CONFIG_COMPACTION || defined CONFIG_CMA
20 36
21#define CREATE_TRACE_POINTS 37#define CREATE_TRACE_POINTS
@@ -214,60 +230,6 @@ static bool suitable_migration_target(struct page *page)
214 return false; 230 return false;
215} 231}
216 232
217static void compact_capture_page(struct compact_control *cc)
218{
219 unsigned long flags;
220 int mtype, mtype_low, mtype_high;
221
222 if (!cc->page || *cc->page)
223 return;
224
225 /*
226 * For MIGRATE_MOVABLE allocations we capture a suitable page ASAP
227 * regardless of the migratetype of the freelist is is captured from.
228 * This is fine because the order for a high-order MIGRATE_MOVABLE
229 * allocation is typically at least a pageblock size and overall
230 * fragmentation is not impaired. Other allocation types must
231 * capture pages from their own migratelist because otherwise they
232 * could pollute other pageblocks like MIGRATE_MOVABLE with
233 * difficult to move pages and making fragmentation worse overall.
234 */
235 if (cc->migratetype == MIGRATE_MOVABLE) {
236 mtype_low = 0;
237 mtype_high = MIGRATE_PCPTYPES;
238 } else {
239 mtype_low = cc->migratetype;
240 mtype_high = cc->migratetype + 1;
241 }
242
243 /* Speculatively examine the free lists without zone lock */
244 for (mtype = mtype_low; mtype < mtype_high; mtype++) {
245 int order;
246 for (order = cc->order; order < MAX_ORDER; order++) {
247 struct page *page;
248 struct free_area *area;
249 area = &(cc->zone->free_area[order]);
250 if (list_empty(&area->free_list[mtype]))
251 continue;
252
253 /* Take the lock and attempt capture of the page */
254 if (!compact_trylock_irqsave(&cc->zone->lock, &flags, cc))
255 return;
256 if (!list_empty(&area->free_list[mtype])) {
257 page = list_entry(area->free_list[mtype].next,
258 struct page, lru);
259 if (capture_free_page(page, cc->order, mtype)) {
260 spin_unlock_irqrestore(&cc->zone->lock,
261 flags);
262 *cc->page = page;
263 return;
264 }
265 }
266 spin_unlock_irqrestore(&cc->zone->lock, flags);
267 }
268 }
269}
270
271/* 233/*
272 * Isolate free pages onto a private freelist. Caller must hold zone->lock. 234 * Isolate free pages onto a private freelist. Caller must hold zone->lock.
273 * If @strict is true, will abort returning 0 on any invalid PFNs or non-free 235 * If @strict is true, will abort returning 0 on any invalid PFNs or non-free
@@ -356,6 +318,9 @@ static unsigned long isolate_freepages_block(struct compact_control *cc,
356 if (blockpfn == end_pfn) 318 if (blockpfn == end_pfn)
357 update_pageblock_skip(cc, valid_page, total_isolated, false); 319 update_pageblock_skip(cc, valid_page, total_isolated, false);
358 320
321 count_compact_events(COMPACTFREE_SCANNED, nr_scanned);
322 if (total_isolated)
323 count_compact_events(COMPACTISOLATED, total_isolated);
359 return total_isolated; 324 return total_isolated;
360} 325}
361 326
@@ -565,9 +530,24 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
565 goto next_pageblock; 530 goto next_pageblock;
566 } 531 }
567 532
568 /* Check may be lockless but that's ok as we recheck later */ 533 /*
569 if (!PageLRU(page)) 534 * Check may be lockless but that's ok as we recheck later.
535 * It's possible to migrate LRU pages and balloon pages
536 * Skip any other type of page
537 */
538 if (!PageLRU(page)) {
539 if (unlikely(balloon_page_movable(page))) {
540 if (locked && balloon_page_isolate(page)) {
541 /* Successfully isolated */
542 cc->finished_update_migrate = true;
543 list_add(&page->lru, migratelist);
544 cc->nr_migratepages++;
545 nr_isolated++;
546 goto check_compact_cluster;
547 }
548 }
570 continue; 549 continue;
550 }
571 551
572 /* 552 /*
573 * PageLRU is set. lru_lock normally excludes isolation 553 * PageLRU is set. lru_lock normally excludes isolation
@@ -621,6 +601,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
621 cc->nr_migratepages++; 601 cc->nr_migratepages++;
622 nr_isolated++; 602 nr_isolated++;
623 603
604check_compact_cluster:
624 /* Avoid isolating too much */ 605 /* Avoid isolating too much */
625 if (cc->nr_migratepages == COMPACT_CLUSTER_MAX) { 606 if (cc->nr_migratepages == COMPACT_CLUSTER_MAX) {
626 ++low_pfn; 607 ++low_pfn;
@@ -646,6 +627,10 @@ next_pageblock:
646 627
647 trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated); 628 trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated);
648 629
630 count_compact_events(COMPACTMIGRATE_SCANNED, nr_scanned);
631 if (nr_isolated)
632 count_compact_events(COMPACTISOLATED, nr_isolated);
633
649 return low_pfn; 634 return low_pfn;
650} 635}
651 636
@@ -713,7 +698,15 @@ static void isolate_freepages(struct zone *zone,
713 698
714 /* Found a block suitable for isolating free pages from */ 699 /* Found a block suitable for isolating free pages from */
715 isolated = 0; 700 isolated = 0;
716 end_pfn = min(pfn + pageblock_nr_pages, zone_end_pfn); 701
702 /*
703 * As pfn may not start aligned, pfn+pageblock_nr_page
704 * may cross a MAX_ORDER_NR_PAGES boundary and miss
705 * a pfn_valid check. Ensure isolate_freepages_block()
706 * only scans within a pageblock
707 */
708 end_pfn = ALIGN(pfn + 1, pageblock_nr_pages);
709 end_pfn = min(end_pfn, zone_end_pfn);
717 isolated = isolate_freepages_block(cc, pfn, end_pfn, 710 isolated = isolate_freepages_block(cc, pfn, end_pfn,
718 freelist, false); 711 freelist, false);
719 nr_freepages += isolated; 712 nr_freepages += isolated;
@@ -823,6 +816,7 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
823static int compact_finished(struct zone *zone, 816static int compact_finished(struct zone *zone,
824 struct compact_control *cc) 817 struct compact_control *cc)
825{ 818{
819 unsigned int order;
826 unsigned long watermark; 820 unsigned long watermark;
827 821
828 if (fatal_signal_pending(current)) 822 if (fatal_signal_pending(current))
@@ -857,22 +851,16 @@ static int compact_finished(struct zone *zone,
857 return COMPACT_CONTINUE; 851 return COMPACT_CONTINUE;
858 852
859 /* Direct compactor: Is a suitable page free? */ 853 /* Direct compactor: Is a suitable page free? */
860 if (cc->page) { 854 for (order = cc->order; order < MAX_ORDER; order++) {
861 /* Was a suitable page captured? */ 855 struct free_area *area = &zone->free_area[order];
862 if (*cc->page) 856
857 /* Job done if page is free of the right migratetype */
858 if (!list_empty(&area->free_list[cc->migratetype]))
859 return COMPACT_PARTIAL;
860
861 /* Job done if allocation would set block type */
862 if (cc->order >= pageblock_order && area->nr_free)
863 return COMPACT_PARTIAL; 863 return COMPACT_PARTIAL;
864 } else {
865 unsigned int order;
866 for (order = cc->order; order < MAX_ORDER; order++) {
867 struct free_area *area = &zone->free_area[cc->order];
868 /* Job done if page is free of the right migratetype */
869 if (!list_empty(&area->free_list[cc->migratetype]))
870 return COMPACT_PARTIAL;
871
872 /* Job done if allocation would set block type */
873 if (cc->order >= pageblock_order && area->nr_free)
874 return COMPACT_PARTIAL;
875 }
876 } 864 }
877 865
878 return COMPACT_CONTINUE; 866 return COMPACT_CONTINUE;
@@ -978,7 +966,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
978 switch (isolate_migratepages(zone, cc)) { 966 switch (isolate_migratepages(zone, cc)) {
979 case ISOLATE_ABORT: 967 case ISOLATE_ABORT:
980 ret = COMPACT_PARTIAL; 968 ret = COMPACT_PARTIAL;
981 putback_lru_pages(&cc->migratepages); 969 putback_movable_pages(&cc->migratepages);
982 cc->nr_migratepages = 0; 970 cc->nr_migratepages = 0;
983 goto out; 971 goto out;
984 case ISOLATE_NONE: 972 case ISOLATE_NONE:
@@ -990,29 +978,23 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
990 nr_migrate = cc->nr_migratepages; 978 nr_migrate = cc->nr_migratepages;
991 err = migrate_pages(&cc->migratepages, compaction_alloc, 979 err = migrate_pages(&cc->migratepages, compaction_alloc,
992 (unsigned long)cc, false, 980 (unsigned long)cc, false,
993 cc->sync ? MIGRATE_SYNC_LIGHT : MIGRATE_ASYNC); 981 cc->sync ? MIGRATE_SYNC_LIGHT : MIGRATE_ASYNC,
982 MR_COMPACTION);
994 update_nr_listpages(cc); 983 update_nr_listpages(cc);
995 nr_remaining = cc->nr_migratepages; 984 nr_remaining = cc->nr_migratepages;
996 985
997 count_vm_event(COMPACTBLOCKS);
998 count_vm_events(COMPACTPAGES, nr_migrate - nr_remaining);
999 if (nr_remaining)
1000 count_vm_events(COMPACTPAGEFAILED, nr_remaining);
1001 trace_mm_compaction_migratepages(nr_migrate - nr_remaining, 986 trace_mm_compaction_migratepages(nr_migrate - nr_remaining,
1002 nr_remaining); 987 nr_remaining);
1003 988
1004 /* Release LRU pages not migrated */ 989 /* Release isolated pages not migrated */
1005 if (err) { 990 if (err) {
1006 putback_lru_pages(&cc->migratepages); 991 putback_movable_pages(&cc->migratepages);
1007 cc->nr_migratepages = 0; 992 cc->nr_migratepages = 0;
1008 if (err == -ENOMEM) { 993 if (err == -ENOMEM) {
1009 ret = COMPACT_PARTIAL; 994 ret = COMPACT_PARTIAL;
1010 goto out; 995 goto out;
1011 } 996 }
1012 } 997 }
1013
1014 /* Capture a page now if it is a suitable size */
1015 compact_capture_page(cc);
1016 } 998 }
1017 999
1018out: 1000out:
@@ -1025,8 +1007,7 @@ out:
1025 1007
1026static unsigned long compact_zone_order(struct zone *zone, 1008static unsigned long compact_zone_order(struct zone *zone,
1027 int order, gfp_t gfp_mask, 1009 int order, gfp_t gfp_mask,
1028 bool sync, bool *contended, 1010 bool sync, bool *contended)
1029 struct page **page)
1030{ 1011{
1031 unsigned long ret; 1012 unsigned long ret;
1032 struct compact_control cc = { 1013 struct compact_control cc = {
@@ -1036,7 +1017,6 @@ static unsigned long compact_zone_order(struct zone *zone,
1036 .migratetype = allocflags_to_migratetype(gfp_mask), 1017 .migratetype = allocflags_to_migratetype(gfp_mask),
1037 .zone = zone, 1018 .zone = zone,
1038 .sync = sync, 1019 .sync = sync,
1039 .page = page,
1040 }; 1020 };
1041 INIT_LIST_HEAD(&cc.freepages); 1021 INIT_LIST_HEAD(&cc.freepages);
1042 INIT_LIST_HEAD(&cc.migratepages); 1022 INIT_LIST_HEAD(&cc.migratepages);
@@ -1066,7 +1046,7 @@ int sysctl_extfrag_threshold = 500;
1066 */ 1046 */
1067unsigned long try_to_compact_pages(struct zonelist *zonelist, 1047unsigned long try_to_compact_pages(struct zonelist *zonelist,
1068 int order, gfp_t gfp_mask, nodemask_t *nodemask, 1048 int order, gfp_t gfp_mask, nodemask_t *nodemask,
1069 bool sync, bool *contended, struct page **page) 1049 bool sync, bool *contended)
1070{ 1050{
1071 enum zone_type high_zoneidx = gfp_zone(gfp_mask); 1051 enum zone_type high_zoneidx = gfp_zone(gfp_mask);
1072 int may_enter_fs = gfp_mask & __GFP_FS; 1052 int may_enter_fs = gfp_mask & __GFP_FS;
@@ -1080,7 +1060,7 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist,
1080 if (!order || !may_enter_fs || !may_perform_io) 1060 if (!order || !may_enter_fs || !may_perform_io)
1081 return rc; 1061 return rc;
1082 1062
1083 count_vm_event(COMPACTSTALL); 1063 count_compact_event(COMPACTSTALL);
1084 1064
1085#ifdef CONFIG_CMA 1065#ifdef CONFIG_CMA
1086 if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE) 1066 if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
@@ -1092,7 +1072,7 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist,
1092 int status; 1072 int status;
1093 1073
1094 status = compact_zone_order(zone, order, gfp_mask, sync, 1074 status = compact_zone_order(zone, order, gfp_mask, sync,
1095 contended, page); 1075 contended);
1096 rc = max(status, rc); 1076 rc = max(status, rc);
1097 1077
1098 /* If a normal allocation would succeed, stop compacting */ 1078 /* If a normal allocation would succeed, stop compacting */
@@ -1148,7 +1128,6 @@ int compact_pgdat(pg_data_t *pgdat, int order)
1148 struct compact_control cc = { 1128 struct compact_control cc = {
1149 .order = order, 1129 .order = order,
1150 .sync = false, 1130 .sync = false,
1151 .page = NULL,
1152 }; 1131 };
1153 1132
1154 return __compact_pgdat(pgdat, &cc); 1133 return __compact_pgdat(pgdat, &cc);
@@ -1159,14 +1138,13 @@ static int compact_node(int nid)
1159 struct compact_control cc = { 1138 struct compact_control cc = {
1160 .order = -1, 1139 .order = -1,
1161 .sync = true, 1140 .sync = true,
1162 .page = NULL,
1163 }; 1141 };
1164 1142
1165 return __compact_pgdat(NODE_DATA(nid), &cc); 1143 return __compact_pgdat(NODE_DATA(nid), &cc);
1166} 1144}
1167 1145
1168/* Compact all nodes in the system */ 1146/* Compact all nodes in the system */
1169static int compact_nodes(void) 1147static void compact_nodes(void)
1170{ 1148{
1171 int nid; 1149 int nid;
1172 1150
@@ -1175,8 +1153,6 @@ static int compact_nodes(void)
1175 1153
1176 for_each_online_node(nid) 1154 for_each_online_node(nid)
1177 compact_node(nid); 1155 compact_node(nid);
1178
1179 return COMPACT_COMPLETE;
1180} 1156}
1181 1157
1182/* The written value is actually unused, all memory is compacted */ 1158/* The written value is actually unused, all memory is compacted */
@@ -1187,7 +1163,7 @@ int sysctl_compaction_handler(struct ctl_table *table, int write,
1187 void __user *buffer, size_t *length, loff_t *ppos) 1163 void __user *buffer, size_t *length, loff_t *ppos)
1188{ 1164{
1189 if (write) 1165 if (write)
1190 return compact_nodes(); 1166 compact_nodes();
1191 1167
1192 return 0; 1168 return 0;
1193} 1169}
diff --git a/mm/dmapool.c b/mm/dmapool.c
index c5ab33bca0a8..c69781e97cf9 100644
--- a/mm/dmapool.c
+++ b/mm/dmapool.c
@@ -50,7 +50,6 @@ struct dma_pool { /* the pool */
50 size_t allocation; 50 size_t allocation;
51 size_t boundary; 51 size_t boundary;
52 char name[32]; 52 char name[32];
53 wait_queue_head_t waitq;
54 struct list_head pools; 53 struct list_head pools;
55}; 54};
56 55
@@ -62,8 +61,6 @@ struct dma_page { /* cacheable header for 'allocation' bytes */
62 unsigned int offset; 61 unsigned int offset;
63}; 62};
64 63
65#define POOL_TIMEOUT_JIFFIES ((100 /* msec */ * HZ) / 1000)
66
67static DEFINE_MUTEX(pools_lock); 64static DEFINE_MUTEX(pools_lock);
68 65
69static ssize_t 66static ssize_t
@@ -172,7 +169,6 @@ struct dma_pool *dma_pool_create(const char *name, struct device *dev,
172 retval->size = size; 169 retval->size = size;
173 retval->boundary = boundary; 170 retval->boundary = boundary;
174 retval->allocation = allocation; 171 retval->allocation = allocation;
175 init_waitqueue_head(&retval->waitq);
176 172
177 if (dev) { 173 if (dev) {
178 int ret; 174 int ret;
@@ -227,7 +223,6 @@ static struct dma_page *pool_alloc_page(struct dma_pool *pool, gfp_t mem_flags)
227 memset(page->vaddr, POOL_POISON_FREED, pool->allocation); 223 memset(page->vaddr, POOL_POISON_FREED, pool->allocation);
228#endif 224#endif
229 pool_initialise_page(pool, page); 225 pool_initialise_page(pool, page);
230 list_add(&page->page_list, &pool->page_list);
231 page->in_use = 0; 226 page->in_use = 0;
232 page->offset = 0; 227 page->offset = 0;
233 } else { 228 } else {
@@ -315,30 +310,21 @@ void *dma_pool_alloc(struct dma_pool *pool, gfp_t mem_flags,
315 might_sleep_if(mem_flags & __GFP_WAIT); 310 might_sleep_if(mem_flags & __GFP_WAIT);
316 311
317 spin_lock_irqsave(&pool->lock, flags); 312 spin_lock_irqsave(&pool->lock, flags);
318 restart:
319 list_for_each_entry(page, &pool->page_list, page_list) { 313 list_for_each_entry(page, &pool->page_list, page_list) {
320 if (page->offset < pool->allocation) 314 if (page->offset < pool->allocation)
321 goto ready; 315 goto ready;
322 } 316 }
323 page = pool_alloc_page(pool, GFP_ATOMIC);
324 if (!page) {
325 if (mem_flags & __GFP_WAIT) {
326 DECLARE_WAITQUEUE(wait, current);
327 317
328 __set_current_state(TASK_UNINTERRUPTIBLE); 318 /* pool_alloc_page() might sleep, so temporarily drop &pool->lock */
329 __add_wait_queue(&pool->waitq, &wait); 319 spin_unlock_irqrestore(&pool->lock, flags);
330 spin_unlock_irqrestore(&pool->lock, flags);
331 320
332 schedule_timeout(POOL_TIMEOUT_JIFFIES); 321 page = pool_alloc_page(pool, mem_flags);
322 if (!page)
323 return NULL;
333 324
334 spin_lock_irqsave(&pool->lock, flags); 325 spin_lock_irqsave(&pool->lock, flags);
335 __remove_wait_queue(&pool->waitq, &wait);
336 goto restart;
337 }
338 retval = NULL;
339 goto done;
340 }
341 326
327 list_add(&page->page_list, &pool->page_list);
342 ready: 328 ready:
343 page->in_use++; 329 page->in_use++;
344 offset = page->offset; 330 offset = page->offset;
@@ -346,9 +332,32 @@ void *dma_pool_alloc(struct dma_pool *pool, gfp_t mem_flags,
346 retval = offset + page->vaddr; 332 retval = offset + page->vaddr;
347 *handle = offset + page->dma; 333 *handle = offset + page->dma;
348#ifdef DMAPOOL_DEBUG 334#ifdef DMAPOOL_DEBUG
335 {
336 int i;
337 u8 *data = retval;
338 /* page->offset is stored in first 4 bytes */
339 for (i = sizeof(page->offset); i < pool->size; i++) {
340 if (data[i] == POOL_POISON_FREED)
341 continue;
342 if (pool->dev)
343 dev_err(pool->dev,
344 "dma_pool_alloc %s, %p (corruped)\n",
345 pool->name, retval);
346 else
347 pr_err("dma_pool_alloc %s, %p (corruped)\n",
348 pool->name, retval);
349
350 /*
351 * Dump the first 4 bytes even if they are not
352 * POOL_POISON_FREED
353 */
354 print_hex_dump(KERN_ERR, "", DUMP_PREFIX_OFFSET, 16, 1,
355 data, pool->size, 1);
356 break;
357 }
358 }
349 memset(retval, POOL_POISON_ALLOCATED, pool->size); 359 memset(retval, POOL_POISON_ALLOCATED, pool->size);
350#endif 360#endif
351 done:
352 spin_unlock_irqrestore(&pool->lock, flags); 361 spin_unlock_irqrestore(&pool->lock, flags);
353 return retval; 362 return retval;
354} 363}
@@ -435,8 +444,6 @@ void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t dma)
435 page->in_use--; 444 page->in_use--;
436 *(int *)vaddr = page->offset; 445 *(int *)vaddr = page->offset;
437 page->offset = offset; 446 page->offset = offset;
438 if (waitqueue_active(&pool->waitq))
439 wake_up_locked(&pool->waitq);
440 /* 447 /*
441 * Resist a temptation to do 448 * Resist a temptation to do
442 * if (!is_page_busy(page)) pool_free_page(pool, page); 449 * if (!is_page_busy(page)) pool_free_page(pool, page);
diff --git a/mm/highmem.c b/mm/highmem.c
index 2da13a5c50e2..b32b70cdaed6 100644
--- a/mm/highmem.c
+++ b/mm/highmem.c
@@ -99,12 +99,13 @@ struct page *kmap_to_page(void *vaddr)
99 unsigned long addr = (unsigned long)vaddr; 99 unsigned long addr = (unsigned long)vaddr;
100 100
101 if (addr >= PKMAP_ADDR(0) && addr < PKMAP_ADDR(LAST_PKMAP)) { 101 if (addr >= PKMAP_ADDR(0) && addr < PKMAP_ADDR(LAST_PKMAP)) {
102 int i = (addr - PKMAP_ADDR(0)) >> PAGE_SHIFT; 102 int i = PKMAP_NR(addr);
103 return pte_page(pkmap_page_table[i]); 103 return pte_page(pkmap_page_table[i]);
104 } 104 }
105 105
106 return virt_to_page(addr); 106 return virt_to_page(addr);
107} 107}
108EXPORT_SYMBOL(kmap_to_page);
108 109
109static void flush_all_zero_pkmaps(void) 110static void flush_all_zero_pkmaps(void)
110{ 111{
@@ -137,8 +138,7 @@ static void flush_all_zero_pkmaps(void)
137 * So no dangers, even with speculative execution. 138 * So no dangers, even with speculative execution.
138 */ 139 */
139 page = pte_page(pkmap_page_table[i]); 140 page = pte_page(pkmap_page_table[i]);
140 pte_clear(&init_mm, (unsigned long)page_address(page), 141 pte_clear(&init_mm, PKMAP_ADDR(i), &pkmap_page_table[i]);
141 &pkmap_page_table[i]);
142 142
143 set_page_address(page, NULL); 143 set_page_address(page, NULL);
144 need_flush = 1; 144 need_flush = 1;
@@ -324,11 +324,7 @@ struct page_address_map {
324 struct list_head list; 324 struct list_head list;
325}; 325};
326 326
327/* 327static struct page_address_map page_address_maps[LAST_PKMAP];
328 * page_address_map freelist, allocated from page_address_maps.
329 */
330static struct list_head page_address_pool; /* freelist */
331static spinlock_t pool_lock; /* protects page_address_pool */
332 328
333/* 329/*
334 * Hash table bucket 330 * Hash table bucket
@@ -393,14 +389,7 @@ void set_page_address(struct page *page, void *virtual)
393 389
394 pas = page_slot(page); 390 pas = page_slot(page);
395 if (virtual) { /* Add */ 391 if (virtual) { /* Add */
396 BUG_ON(list_empty(&page_address_pool)); 392 pam = &page_address_maps[PKMAP_NR((unsigned long)virtual)];
397
398 spin_lock_irqsave(&pool_lock, flags);
399 pam = list_entry(page_address_pool.next,
400 struct page_address_map, list);
401 list_del(&pam->list);
402 spin_unlock_irqrestore(&pool_lock, flags);
403
404 pam->page = page; 393 pam->page = page;
405 pam->virtual = virtual; 394 pam->virtual = virtual;
406 395
@@ -413,9 +402,6 @@ void set_page_address(struct page *page, void *virtual)
413 if (pam->page == page) { 402 if (pam->page == page) {
414 list_del(&pam->list); 403 list_del(&pam->list);
415 spin_unlock_irqrestore(&pas->lock, flags); 404 spin_unlock_irqrestore(&pas->lock, flags);
416 spin_lock_irqsave(&pool_lock, flags);
417 list_add_tail(&pam->list, &page_address_pool);
418 spin_unlock_irqrestore(&pool_lock, flags);
419 goto done; 405 goto done;
420 } 406 }
421 } 407 }
@@ -425,20 +411,14 @@ done:
425 return; 411 return;
426} 412}
427 413
428static struct page_address_map page_address_maps[LAST_PKMAP];
429
430void __init page_address_init(void) 414void __init page_address_init(void)
431{ 415{
432 int i; 416 int i;
433 417
434 INIT_LIST_HEAD(&page_address_pool);
435 for (i = 0; i < ARRAY_SIZE(page_address_maps); i++)
436 list_add(&page_address_maps[i].list, &page_address_pool);
437 for (i = 0; i < ARRAY_SIZE(page_address_htable); i++) { 418 for (i = 0; i < ARRAY_SIZE(page_address_htable); i++) {
438 INIT_LIST_HEAD(&page_address_htable[i].lh); 419 INIT_LIST_HEAD(&page_address_htable[i].lh);
439 spin_lock_init(&page_address_htable[i].lock); 420 spin_lock_init(&page_address_htable[i].lock);
440 } 421 }
441 spin_lock_init(&pool_lock);
442} 422}
443 423
444#endif /* defined(CONFIG_HIGHMEM) && !defined(WANT_PAGE_VIRTUAL) */ 424#endif /* defined(CONFIG_HIGHMEM) && !defined(WANT_PAGE_VIRTUAL) */
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 40f17c34b415..6001ee6347a9 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -12,12 +12,15 @@
12#include <linux/mmu_notifier.h> 12#include <linux/mmu_notifier.h>
13#include <linux/rmap.h> 13#include <linux/rmap.h>
14#include <linux/swap.h> 14#include <linux/swap.h>
15#include <linux/shrinker.h>
15#include <linux/mm_inline.h> 16#include <linux/mm_inline.h>
16#include <linux/kthread.h> 17#include <linux/kthread.h>
17#include <linux/khugepaged.h> 18#include <linux/khugepaged.h>
18#include <linux/freezer.h> 19#include <linux/freezer.h>
19#include <linux/mman.h> 20#include <linux/mman.h>
20#include <linux/pagemap.h> 21#include <linux/pagemap.h>
22#include <linux/migrate.h>
23
21#include <asm/tlb.h> 24#include <asm/tlb.h>
22#include <asm/pgalloc.h> 25#include <asm/pgalloc.h>
23#include "internal.h" 26#include "internal.h"
@@ -37,7 +40,8 @@ unsigned long transparent_hugepage_flags __read_mostly =
37 (1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG)| 40 (1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG)|
38#endif 41#endif
39 (1<<TRANSPARENT_HUGEPAGE_DEFRAG_FLAG)| 42 (1<<TRANSPARENT_HUGEPAGE_DEFRAG_FLAG)|
40 (1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG); 43 (1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG)|
44 (1<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
41 45
42/* default scan 8*512 pte (or vmas) every 30 second */ 46/* default scan 8*512 pte (or vmas) every 30 second */
43static unsigned int khugepaged_pages_to_scan __read_mostly = HPAGE_PMD_NR*8; 47static unsigned int khugepaged_pages_to_scan __read_mostly = HPAGE_PMD_NR*8;
@@ -159,6 +163,77 @@ static int start_khugepaged(void)
159 return err; 163 return err;
160} 164}
161 165
166static atomic_t huge_zero_refcount;
167static unsigned long huge_zero_pfn __read_mostly;
168
169static inline bool is_huge_zero_pfn(unsigned long pfn)
170{
171 unsigned long zero_pfn = ACCESS_ONCE(huge_zero_pfn);
172 return zero_pfn && pfn == zero_pfn;
173}
174
175static inline bool is_huge_zero_pmd(pmd_t pmd)
176{
177 return is_huge_zero_pfn(pmd_pfn(pmd));
178}
179
180static unsigned long get_huge_zero_page(void)
181{
182 struct page *zero_page;
183retry:
184 if (likely(atomic_inc_not_zero(&huge_zero_refcount)))
185 return ACCESS_ONCE(huge_zero_pfn);
186
187 zero_page = alloc_pages((GFP_TRANSHUGE | __GFP_ZERO) & ~__GFP_MOVABLE,
188 HPAGE_PMD_ORDER);
189 if (!zero_page) {
190 count_vm_event(THP_ZERO_PAGE_ALLOC_FAILED);
191 return 0;
192 }
193 count_vm_event(THP_ZERO_PAGE_ALLOC);
194 preempt_disable();
195 if (cmpxchg(&huge_zero_pfn, 0, page_to_pfn(zero_page))) {
196 preempt_enable();
197 __free_page(zero_page);
198 goto retry;
199 }
200
201 /* We take additional reference here. It will be put back by shrinker */
202 atomic_set(&huge_zero_refcount, 2);
203 preempt_enable();
204 return ACCESS_ONCE(huge_zero_pfn);
205}
206
207static void put_huge_zero_page(void)
208{
209 /*
210 * Counter should never go to zero here. Only shrinker can put
211 * last reference.
212 */
213 BUG_ON(atomic_dec_and_test(&huge_zero_refcount));
214}
215
216static int shrink_huge_zero_page(struct shrinker *shrink,
217 struct shrink_control *sc)
218{
219 if (!sc->nr_to_scan)
220 /* we can free zero page only if last reference remains */
221 return atomic_read(&huge_zero_refcount) == 1 ? HPAGE_PMD_NR : 0;
222
223 if (atomic_cmpxchg(&huge_zero_refcount, 1, 0) == 1) {
224 unsigned long zero_pfn = xchg(&huge_zero_pfn, 0);
225 BUG_ON(zero_pfn == 0);
226 __free_page(__pfn_to_page(zero_pfn));
227 }
228
229 return 0;
230}
231
232static struct shrinker huge_zero_page_shrinker = {
233 .shrink = shrink_huge_zero_page,
234 .seeks = DEFAULT_SEEKS,
235};
236
162#ifdef CONFIG_SYSFS 237#ifdef CONFIG_SYSFS
163 238
164static ssize_t double_flag_show(struct kobject *kobj, 239static ssize_t double_flag_show(struct kobject *kobj,
@@ -284,6 +359,20 @@ static ssize_t defrag_store(struct kobject *kobj,
284static struct kobj_attribute defrag_attr = 359static struct kobj_attribute defrag_attr =
285 __ATTR(defrag, 0644, defrag_show, defrag_store); 360 __ATTR(defrag, 0644, defrag_show, defrag_store);
286 361
362static ssize_t use_zero_page_show(struct kobject *kobj,
363 struct kobj_attribute *attr, char *buf)
364{
365 return single_flag_show(kobj, attr, buf,
366 TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
367}
368static ssize_t use_zero_page_store(struct kobject *kobj,
369 struct kobj_attribute *attr, const char *buf, size_t count)
370{
371 return single_flag_store(kobj, attr, buf, count,
372 TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
373}
374static struct kobj_attribute use_zero_page_attr =
375 __ATTR(use_zero_page, 0644, use_zero_page_show, use_zero_page_store);
287#ifdef CONFIG_DEBUG_VM 376#ifdef CONFIG_DEBUG_VM
288static ssize_t debug_cow_show(struct kobject *kobj, 377static ssize_t debug_cow_show(struct kobject *kobj,
289 struct kobj_attribute *attr, char *buf) 378 struct kobj_attribute *attr, char *buf)
@@ -305,6 +394,7 @@ static struct kobj_attribute debug_cow_attr =
305static struct attribute *hugepage_attr[] = { 394static struct attribute *hugepage_attr[] = {
306 &enabled_attr.attr, 395 &enabled_attr.attr,
307 &defrag_attr.attr, 396 &defrag_attr.attr,
397 &use_zero_page_attr.attr,
308#ifdef CONFIG_DEBUG_VM 398#ifdef CONFIG_DEBUG_VM
309 &debug_cow_attr.attr, 399 &debug_cow_attr.attr,
310#endif 400#endif
@@ -484,19 +574,19 @@ static int __init hugepage_init_sysfs(struct kobject **hugepage_kobj)
484 574
485 *hugepage_kobj = kobject_create_and_add("transparent_hugepage", mm_kobj); 575 *hugepage_kobj = kobject_create_and_add("transparent_hugepage", mm_kobj);
486 if (unlikely(!*hugepage_kobj)) { 576 if (unlikely(!*hugepage_kobj)) {
487 printk(KERN_ERR "hugepage: failed kobject create\n"); 577 printk(KERN_ERR "hugepage: failed to create transparent hugepage kobject\n");
488 return -ENOMEM; 578 return -ENOMEM;
489 } 579 }
490 580
491 err = sysfs_create_group(*hugepage_kobj, &hugepage_attr_group); 581 err = sysfs_create_group(*hugepage_kobj, &hugepage_attr_group);
492 if (err) { 582 if (err) {
493 printk(KERN_ERR "hugepage: failed register hugeage group\n"); 583 printk(KERN_ERR "hugepage: failed to register transparent hugepage group\n");
494 goto delete_obj; 584 goto delete_obj;
495 } 585 }
496 586
497 err = sysfs_create_group(*hugepage_kobj, &khugepaged_attr_group); 587 err = sysfs_create_group(*hugepage_kobj, &khugepaged_attr_group);
498 if (err) { 588 if (err) {
499 printk(KERN_ERR "hugepage: failed register hugeage group\n"); 589 printk(KERN_ERR "hugepage: failed to register transparent hugepage group\n");
500 goto remove_hp_group; 590 goto remove_hp_group;
501 } 591 }
502 592
@@ -550,6 +640,8 @@ static int __init hugepage_init(void)
550 goto out; 640 goto out;
551 } 641 }
552 642
643 register_shrinker(&huge_zero_page_shrinker);
644
553 /* 645 /*
554 * By default disable transparent hugepages on smaller systems, 646 * By default disable transparent hugepages on smaller systems,
555 * where the extra memory used could hurt more than TLB overhead 647 * where the extra memory used could hurt more than TLB overhead
@@ -599,13 +691,22 @@ out:
599} 691}
600__setup("transparent_hugepage=", setup_transparent_hugepage); 692__setup("transparent_hugepage=", setup_transparent_hugepage);
601 693
602static inline pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma) 694pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
603{ 695{
604 if (likely(vma->vm_flags & VM_WRITE)) 696 if (likely(vma->vm_flags & VM_WRITE))
605 pmd = pmd_mkwrite(pmd); 697 pmd = pmd_mkwrite(pmd);
606 return pmd; 698 return pmd;
607} 699}
608 700
701static inline pmd_t mk_huge_pmd(struct page *page, struct vm_area_struct *vma)
702{
703 pmd_t entry;
704 entry = mk_pmd(page, vma->vm_page_prot);
705 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
706 entry = pmd_mkhuge(entry);
707 return entry;
708}
709
609static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, 710static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
610 struct vm_area_struct *vma, 711 struct vm_area_struct *vma,
611 unsigned long haddr, pmd_t *pmd, 712 unsigned long haddr, pmd_t *pmd,
@@ -629,9 +730,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
629 pte_free(mm, pgtable); 730 pte_free(mm, pgtable);
630 } else { 731 } else {
631 pmd_t entry; 732 pmd_t entry;
632 entry = mk_pmd(page, vma->vm_page_prot); 733 entry = mk_huge_pmd(page, vma);
633 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
634 entry = pmd_mkhuge(entry);
635 /* 734 /*
636 * The spinlocking to take the lru_lock inside 735 * The spinlocking to take the lru_lock inside
637 * page_add_new_anon_rmap() acts as a full memory 736 * page_add_new_anon_rmap() acts as a full memory
@@ -671,6 +770,22 @@ static inline struct page *alloc_hugepage(int defrag)
671} 770}
672#endif 771#endif
673 772
773static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm,
774 struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd,
775 unsigned long zero_pfn)
776{
777 pmd_t entry;
778 if (!pmd_none(*pmd))
779 return false;
780 entry = pfn_pmd(zero_pfn, vma->vm_page_prot);
781 entry = pmd_wrprotect(entry);
782 entry = pmd_mkhuge(entry);
783 set_pmd_at(mm, haddr, pmd, entry);
784 pgtable_trans_huge_deposit(mm, pgtable);
785 mm->nr_ptes++;
786 return true;
787}
788
674int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, 789int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
675 unsigned long address, pmd_t *pmd, 790 unsigned long address, pmd_t *pmd,
676 unsigned int flags) 791 unsigned int flags)
@@ -684,6 +799,30 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
684 return VM_FAULT_OOM; 799 return VM_FAULT_OOM;
685 if (unlikely(khugepaged_enter(vma))) 800 if (unlikely(khugepaged_enter(vma)))
686 return VM_FAULT_OOM; 801 return VM_FAULT_OOM;
802 if (!(flags & FAULT_FLAG_WRITE) &&
803 transparent_hugepage_use_zero_page()) {
804 pgtable_t pgtable;
805 unsigned long zero_pfn;
806 bool set;
807 pgtable = pte_alloc_one(mm, haddr);
808 if (unlikely(!pgtable))
809 return VM_FAULT_OOM;
810 zero_pfn = get_huge_zero_page();
811 if (unlikely(!zero_pfn)) {
812 pte_free(mm, pgtable);
813 count_vm_event(THP_FAULT_FALLBACK);
814 goto out;
815 }
816 spin_lock(&mm->page_table_lock);
817 set = set_huge_zero_page(pgtable, mm, vma, haddr, pmd,
818 zero_pfn);
819 spin_unlock(&mm->page_table_lock);
820 if (!set) {
821 pte_free(mm, pgtable);
822 put_huge_zero_page();
823 }
824 return 0;
825 }
687 page = alloc_hugepage_vma(transparent_hugepage_defrag(vma), 826 page = alloc_hugepage_vma(transparent_hugepage_defrag(vma),
688 vma, haddr, numa_node_id(), 0); 827 vma, haddr, numa_node_id(), 0);
689 if (unlikely(!page)) { 828 if (unlikely(!page)) {
@@ -710,7 +849,8 @@ out:
710 * run pte_offset_map on the pmd, if an huge pmd could 849 * run pte_offset_map on the pmd, if an huge pmd could
711 * materialize from under us from a different thread. 850 * materialize from under us from a different thread.
712 */ 851 */
713 if (unlikely(__pte_alloc(mm, vma, pmd, address))) 852 if (unlikely(pmd_none(*pmd)) &&
853 unlikely(__pte_alloc(mm, vma, pmd, address)))
714 return VM_FAULT_OOM; 854 return VM_FAULT_OOM;
715 /* if an huge pmd materialized from under us just retry later */ 855 /* if an huge pmd materialized from under us just retry later */
716 if (unlikely(pmd_trans_huge(*pmd))) 856 if (unlikely(pmd_trans_huge(*pmd)))
@@ -748,6 +888,26 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
748 pte_free(dst_mm, pgtable); 888 pte_free(dst_mm, pgtable);
749 goto out_unlock; 889 goto out_unlock;
750 } 890 }
891 /*
892 * mm->page_table_lock is enough to be sure that huge zero pmd is not
893 * under splitting since we don't split the page itself, only pmd to
894 * a page table.
895 */
896 if (is_huge_zero_pmd(pmd)) {
897 unsigned long zero_pfn;
898 bool set;
899 /*
900 * get_huge_zero_page() will never allocate a new page here,
901 * since we already have a zero page to copy. It just takes a
902 * reference.
903 */
904 zero_pfn = get_huge_zero_page();
905 set = set_huge_zero_page(pgtable, dst_mm, vma, addr, dst_pmd,
906 zero_pfn);
907 BUG_ON(!set); /* unexpected !pmd_none(dst_pmd) */
908 ret = 0;
909 goto out_unlock;
910 }
751 if (unlikely(pmd_trans_splitting(pmd))) { 911 if (unlikely(pmd_trans_splitting(pmd))) {
752 /* split huge page running from under us */ 912 /* split huge page running from under us */
753 spin_unlock(&src_mm->page_table_lock); 913 spin_unlock(&src_mm->page_table_lock);
@@ -777,6 +937,102 @@ out:
777 return ret; 937 return ret;
778} 938}
779 939
940void huge_pmd_set_accessed(struct mm_struct *mm,
941 struct vm_area_struct *vma,
942 unsigned long address,
943 pmd_t *pmd, pmd_t orig_pmd,
944 int dirty)
945{
946 pmd_t entry;
947 unsigned long haddr;
948
949 spin_lock(&mm->page_table_lock);
950 if (unlikely(!pmd_same(*pmd, orig_pmd)))
951 goto unlock;
952
953 entry = pmd_mkyoung(orig_pmd);
954 haddr = address & HPAGE_PMD_MASK;
955 if (pmdp_set_access_flags(vma, haddr, pmd, entry, dirty))
956 update_mmu_cache_pmd(vma, address, pmd);
957
958unlock:
959 spin_unlock(&mm->page_table_lock);
960}
961
962static int do_huge_pmd_wp_zero_page_fallback(struct mm_struct *mm,
963 struct vm_area_struct *vma, unsigned long address,
964 pmd_t *pmd, pmd_t orig_pmd, unsigned long haddr)
965{
966 pgtable_t pgtable;
967 pmd_t _pmd;
968 struct page *page;
969 int i, ret = 0;
970 unsigned long mmun_start; /* For mmu_notifiers */
971 unsigned long mmun_end; /* For mmu_notifiers */
972
973 page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
974 if (!page) {
975 ret |= VM_FAULT_OOM;
976 goto out;
977 }
978
979 if (mem_cgroup_newpage_charge(page, mm, GFP_KERNEL)) {
980 put_page(page);
981 ret |= VM_FAULT_OOM;
982 goto out;
983 }
984
985 clear_user_highpage(page, address);
986 __SetPageUptodate(page);
987
988 mmun_start = haddr;
989 mmun_end = haddr + HPAGE_PMD_SIZE;
990 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
991
992 spin_lock(&mm->page_table_lock);
993 if (unlikely(!pmd_same(*pmd, orig_pmd)))
994 goto out_free_page;
995
996 pmdp_clear_flush(vma, haddr, pmd);
997 /* leave pmd empty until pte is filled */
998
999 pgtable = pgtable_trans_huge_withdraw(mm);
1000 pmd_populate(mm, &_pmd, pgtable);
1001
1002 for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
1003 pte_t *pte, entry;
1004 if (haddr == (address & PAGE_MASK)) {
1005 entry = mk_pte(page, vma->vm_page_prot);
1006 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
1007 page_add_new_anon_rmap(page, vma, haddr);
1008 } else {
1009 entry = pfn_pte(my_zero_pfn(haddr), vma->vm_page_prot);
1010 entry = pte_mkspecial(entry);
1011 }
1012 pte = pte_offset_map(&_pmd, haddr);
1013 VM_BUG_ON(!pte_none(*pte));
1014 set_pte_at(mm, haddr, pte, entry);
1015 pte_unmap(pte);
1016 }
1017 smp_wmb(); /* make pte visible before pmd */
1018 pmd_populate(mm, pmd, pgtable);
1019 spin_unlock(&mm->page_table_lock);
1020 put_huge_zero_page();
1021 inc_mm_counter(mm, MM_ANONPAGES);
1022
1023 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
1024
1025 ret |= VM_FAULT_WRITE;
1026out:
1027 return ret;
1028out_free_page:
1029 spin_unlock(&mm->page_table_lock);
1030 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
1031 mem_cgroup_uncharge_page(page);
1032 put_page(page);
1033 goto out;
1034}
1035
780static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, 1036static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
781 struct vm_area_struct *vma, 1037 struct vm_area_struct *vma,
782 unsigned long address, 1038 unsigned long address,
@@ -883,19 +1139,21 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
883 unsigned long address, pmd_t *pmd, pmd_t orig_pmd) 1139 unsigned long address, pmd_t *pmd, pmd_t orig_pmd)
884{ 1140{
885 int ret = 0; 1141 int ret = 0;
886 struct page *page, *new_page; 1142 struct page *page = NULL, *new_page;
887 unsigned long haddr; 1143 unsigned long haddr;
888 unsigned long mmun_start; /* For mmu_notifiers */ 1144 unsigned long mmun_start; /* For mmu_notifiers */
889 unsigned long mmun_end; /* For mmu_notifiers */ 1145 unsigned long mmun_end; /* For mmu_notifiers */
890 1146
891 VM_BUG_ON(!vma->anon_vma); 1147 VM_BUG_ON(!vma->anon_vma);
1148 haddr = address & HPAGE_PMD_MASK;
1149 if (is_huge_zero_pmd(orig_pmd))
1150 goto alloc;
892 spin_lock(&mm->page_table_lock); 1151 spin_lock(&mm->page_table_lock);
893 if (unlikely(!pmd_same(*pmd, orig_pmd))) 1152 if (unlikely(!pmd_same(*pmd, orig_pmd)))
894 goto out_unlock; 1153 goto out_unlock;
895 1154
896 page = pmd_page(orig_pmd); 1155 page = pmd_page(orig_pmd);
897 VM_BUG_ON(!PageCompound(page) || !PageHead(page)); 1156 VM_BUG_ON(!PageCompound(page) || !PageHead(page));
898 haddr = address & HPAGE_PMD_MASK;
899 if (page_mapcount(page) == 1) { 1157 if (page_mapcount(page) == 1) {
900 pmd_t entry; 1158 pmd_t entry;
901 entry = pmd_mkyoung(orig_pmd); 1159 entry = pmd_mkyoung(orig_pmd);
@@ -907,7 +1165,7 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
907 } 1165 }
908 get_page(page); 1166 get_page(page);
909 spin_unlock(&mm->page_table_lock); 1167 spin_unlock(&mm->page_table_lock);
910 1168alloc:
911 if (transparent_hugepage_enabled(vma) && 1169 if (transparent_hugepage_enabled(vma) &&
912 !transparent_hugepage_debug_cow()) 1170 !transparent_hugepage_debug_cow())
913 new_page = alloc_hugepage_vma(transparent_hugepage_defrag(vma), 1171 new_page = alloc_hugepage_vma(transparent_hugepage_defrag(vma),
@@ -917,24 +1175,34 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
917 1175
918 if (unlikely(!new_page)) { 1176 if (unlikely(!new_page)) {
919 count_vm_event(THP_FAULT_FALLBACK); 1177 count_vm_event(THP_FAULT_FALLBACK);
920 ret = do_huge_pmd_wp_page_fallback(mm, vma, address, 1178 if (is_huge_zero_pmd(orig_pmd)) {
921 pmd, orig_pmd, page, haddr); 1179 ret = do_huge_pmd_wp_zero_page_fallback(mm, vma,
922 if (ret & VM_FAULT_OOM) 1180 address, pmd, orig_pmd, haddr);
923 split_huge_page(page); 1181 } else {
924 put_page(page); 1182 ret = do_huge_pmd_wp_page_fallback(mm, vma, address,
1183 pmd, orig_pmd, page, haddr);
1184 if (ret & VM_FAULT_OOM)
1185 split_huge_page(page);
1186 put_page(page);
1187 }
925 goto out; 1188 goto out;
926 } 1189 }
927 count_vm_event(THP_FAULT_ALLOC); 1190 count_vm_event(THP_FAULT_ALLOC);
928 1191
929 if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) { 1192 if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) {
930 put_page(new_page); 1193 put_page(new_page);
931 split_huge_page(page); 1194 if (page) {
932 put_page(page); 1195 split_huge_page(page);
1196 put_page(page);
1197 }
933 ret |= VM_FAULT_OOM; 1198 ret |= VM_FAULT_OOM;
934 goto out; 1199 goto out;
935 } 1200 }
936 1201
937 copy_user_huge_page(new_page, page, haddr, vma, HPAGE_PMD_NR); 1202 if (is_huge_zero_pmd(orig_pmd))
1203 clear_huge_page(new_page, haddr, HPAGE_PMD_NR);
1204 else
1205 copy_user_huge_page(new_page, page, haddr, vma, HPAGE_PMD_NR);
938 __SetPageUptodate(new_page); 1206 __SetPageUptodate(new_page);
939 1207
940 mmun_start = haddr; 1208 mmun_start = haddr;
@@ -942,7 +1210,8 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
942 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); 1210 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
943 1211
944 spin_lock(&mm->page_table_lock); 1212 spin_lock(&mm->page_table_lock);
945 put_page(page); 1213 if (page)
1214 put_page(page);
946 if (unlikely(!pmd_same(*pmd, orig_pmd))) { 1215 if (unlikely(!pmd_same(*pmd, orig_pmd))) {
947 spin_unlock(&mm->page_table_lock); 1216 spin_unlock(&mm->page_table_lock);
948 mem_cgroup_uncharge_page(new_page); 1217 mem_cgroup_uncharge_page(new_page);
@@ -950,16 +1219,19 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
950 goto out_mn; 1219 goto out_mn;
951 } else { 1220 } else {
952 pmd_t entry; 1221 pmd_t entry;
953 VM_BUG_ON(!PageHead(page)); 1222 entry = mk_huge_pmd(new_page, vma);
954 entry = mk_pmd(new_page, vma->vm_page_prot);
955 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
956 entry = pmd_mkhuge(entry);
957 pmdp_clear_flush(vma, haddr, pmd); 1223 pmdp_clear_flush(vma, haddr, pmd);
958 page_add_new_anon_rmap(new_page, vma, haddr); 1224 page_add_new_anon_rmap(new_page, vma, haddr);
959 set_pmd_at(mm, haddr, pmd, entry); 1225 set_pmd_at(mm, haddr, pmd, entry);
960 update_mmu_cache_pmd(vma, address, pmd); 1226 update_mmu_cache_pmd(vma, address, pmd);
961 page_remove_rmap(page); 1227 if (is_huge_zero_pmd(orig_pmd)) {
962 put_page(page); 1228 add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR);
1229 put_huge_zero_page();
1230 } else {
1231 VM_BUG_ON(!PageHead(page));
1232 page_remove_rmap(page);
1233 put_page(page);
1234 }
963 ret |= VM_FAULT_WRITE; 1235 ret |= VM_FAULT_WRITE;
964 } 1236 }
965 spin_unlock(&mm->page_table_lock); 1237 spin_unlock(&mm->page_table_lock);
@@ -1017,6 +1289,81 @@ out:
1017 return page; 1289 return page;
1018} 1290}
1019 1291
1292/* NUMA hinting page fault entry point for trans huge pmds */
1293int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
1294 unsigned long addr, pmd_t pmd, pmd_t *pmdp)
1295{
1296 struct page *page;
1297 unsigned long haddr = addr & HPAGE_PMD_MASK;
1298 int target_nid;
1299 int current_nid = -1;
1300 bool migrated;
1301 bool page_locked = false;
1302
1303 spin_lock(&mm->page_table_lock);
1304 if (unlikely(!pmd_same(pmd, *pmdp)))
1305 goto out_unlock;
1306
1307 page = pmd_page(pmd);
1308 get_page(page);
1309 current_nid = page_to_nid(page);
1310 count_vm_numa_event(NUMA_HINT_FAULTS);
1311 if (current_nid == numa_node_id())
1312 count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL);
1313
1314 target_nid = mpol_misplaced(page, vma, haddr);
1315 if (target_nid == -1) {
1316 put_page(page);
1317 goto clear_pmdnuma;
1318 }
1319
1320 /* Acquire the page lock to serialise THP migrations */
1321 spin_unlock(&mm->page_table_lock);
1322 lock_page(page);
1323 page_locked = true;
1324
1325 /* Confirm the PTE did not while locked */
1326 spin_lock(&mm->page_table_lock);
1327 if (unlikely(!pmd_same(pmd, *pmdp))) {
1328 unlock_page(page);
1329 put_page(page);
1330 goto out_unlock;
1331 }
1332 spin_unlock(&mm->page_table_lock);
1333
1334 /* Migrate the THP to the requested node */
1335 migrated = migrate_misplaced_transhuge_page(mm, vma,
1336 pmdp, pmd, addr,
1337 page, target_nid);
1338 if (migrated)
1339 current_nid = target_nid;
1340 else {
1341 spin_lock(&mm->page_table_lock);
1342 if (unlikely(!pmd_same(pmd, *pmdp))) {
1343 unlock_page(page);
1344 goto out_unlock;
1345 }
1346 goto clear_pmdnuma;
1347 }
1348
1349 task_numa_fault(current_nid, HPAGE_PMD_NR, migrated);
1350 return 0;
1351
1352clear_pmdnuma:
1353 pmd = pmd_mknonnuma(pmd);
1354 set_pmd_at(mm, haddr, pmdp, pmd);
1355 VM_BUG_ON(pmd_numa(*pmdp));
1356 update_mmu_cache_pmd(vma, addr, pmdp);
1357 if (page_locked)
1358 unlock_page(page);
1359
1360out_unlock:
1361 spin_unlock(&mm->page_table_lock);
1362 if (current_nid != -1)
1363 task_numa_fault(current_nid, HPAGE_PMD_NR, migrated);
1364 return 0;
1365}
1366
1020int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, 1367int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
1021 pmd_t *pmd, unsigned long addr) 1368 pmd_t *pmd, unsigned long addr)
1022{ 1369{
@@ -1028,15 +1375,21 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
1028 pmd_t orig_pmd; 1375 pmd_t orig_pmd;
1029 pgtable = pgtable_trans_huge_withdraw(tlb->mm); 1376 pgtable = pgtable_trans_huge_withdraw(tlb->mm);
1030 orig_pmd = pmdp_get_and_clear(tlb->mm, addr, pmd); 1377 orig_pmd = pmdp_get_and_clear(tlb->mm, addr, pmd);
1031 page = pmd_page(orig_pmd);
1032 tlb_remove_pmd_tlb_entry(tlb, pmd, addr); 1378 tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
1033 page_remove_rmap(page); 1379 if (is_huge_zero_pmd(orig_pmd)) {
1034 VM_BUG_ON(page_mapcount(page) < 0); 1380 tlb->mm->nr_ptes--;
1035 add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR); 1381 spin_unlock(&tlb->mm->page_table_lock);
1036 VM_BUG_ON(!PageHead(page)); 1382 put_huge_zero_page();
1037 tlb->mm->nr_ptes--; 1383 } else {
1038 spin_unlock(&tlb->mm->page_table_lock); 1384 page = pmd_page(orig_pmd);
1039 tlb_remove_page(tlb, page); 1385 page_remove_rmap(page);
1386 VM_BUG_ON(page_mapcount(page) < 0);
1387 add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
1388 VM_BUG_ON(!PageHead(page));
1389 tlb->mm->nr_ptes--;
1390 spin_unlock(&tlb->mm->page_table_lock);
1391 tlb_remove_page(tlb, page);
1392 }
1040 pte_free(tlb->mm, pgtable); 1393 pte_free(tlb->mm, pgtable);
1041 ret = 1; 1394 ret = 1;
1042 } 1395 }
@@ -1099,7 +1452,7 @@ out:
1099} 1452}
1100 1453
1101int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, 1454int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
1102 unsigned long addr, pgprot_t newprot) 1455 unsigned long addr, pgprot_t newprot, int prot_numa)
1103{ 1456{
1104 struct mm_struct *mm = vma->vm_mm; 1457 struct mm_struct *mm = vma->vm_mm;
1105 int ret = 0; 1458 int ret = 0;
@@ -1107,7 +1460,18 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
1107 if (__pmd_trans_huge_lock(pmd, vma) == 1) { 1460 if (__pmd_trans_huge_lock(pmd, vma) == 1) {
1108 pmd_t entry; 1461 pmd_t entry;
1109 entry = pmdp_get_and_clear(mm, addr, pmd); 1462 entry = pmdp_get_and_clear(mm, addr, pmd);
1110 entry = pmd_modify(entry, newprot); 1463 if (!prot_numa) {
1464 entry = pmd_modify(entry, newprot);
1465 BUG_ON(pmd_write(entry));
1466 } else {
1467 struct page *page = pmd_page(*pmd);
1468
1469 /* only check non-shared pages */
1470 if (page_mapcount(page) == 1 &&
1471 !pmd_numa(*pmd)) {
1472 entry = pmd_mknuma(entry);
1473 }
1474 }
1111 set_pmd_at(mm, addr, pmd, entry); 1475 set_pmd_at(mm, addr, pmd, entry);
1112 spin_unlock(&vma->vm_mm->page_table_lock); 1476 spin_unlock(&vma->vm_mm->page_table_lock);
1113 ret = 1; 1477 ret = 1;
@@ -1146,22 +1510,14 @@ pmd_t *page_check_address_pmd(struct page *page,
1146 unsigned long address, 1510 unsigned long address,
1147 enum page_check_address_pmd_flag flag) 1511 enum page_check_address_pmd_flag flag)
1148{ 1512{
1149 pgd_t *pgd;
1150 pud_t *pud;
1151 pmd_t *pmd, *ret = NULL; 1513 pmd_t *pmd, *ret = NULL;
1152 1514
1153 if (address & ~HPAGE_PMD_MASK) 1515 if (address & ~HPAGE_PMD_MASK)
1154 goto out; 1516 goto out;
1155 1517
1156 pgd = pgd_offset(mm, address); 1518 pmd = mm_find_pmd(mm, address);
1157 if (!pgd_present(*pgd)) 1519 if (!pmd)
1158 goto out; 1520 goto out;
1159
1160 pud = pud_offset(pgd, address);
1161 if (!pud_present(*pud))
1162 goto out;
1163
1164 pmd = pmd_offset(pud, address);
1165 if (pmd_none(*pmd)) 1521 if (pmd_none(*pmd))
1166 goto out; 1522 goto out;
1167 if (pmd_page(*pmd) != page) 1523 if (pmd_page(*pmd) != page)
@@ -1205,7 +1561,7 @@ static int __split_huge_page_splitting(struct page *page,
1205 * We can't temporarily set the pmd to null in order 1561 * We can't temporarily set the pmd to null in order
1206 * to split it, the pmd must remain marked huge at all 1562 * to split it, the pmd must remain marked huge at all
1207 * times or the VM won't take the pmd_trans_huge paths 1563 * times or the VM won't take the pmd_trans_huge paths
1208 * and it won't wait on the anon_vma->root->mutex to 1564 * and it won't wait on the anon_vma->root->rwsem to
1209 * serialize against split_huge_page*. 1565 * serialize against split_huge_page*.
1210 */ 1566 */
1211 pmdp_splitting_flush(vma, address, pmd); 1567 pmdp_splitting_flush(vma, address, pmd);
@@ -1296,6 +1652,7 @@ static void __split_huge_page_refcount(struct page *page)
1296 page_tail->mapping = page->mapping; 1652 page_tail->mapping = page->mapping;
1297 1653
1298 page_tail->index = page->index + i; 1654 page_tail->index = page->index + i;
1655 page_xchg_last_nid(page_tail, page_last_nid(page));
1299 1656
1300 BUG_ON(!PageAnon(page_tail)); 1657 BUG_ON(!PageAnon(page_tail));
1301 BUG_ON(!PageUptodate(page_tail)); 1658 BUG_ON(!PageUptodate(page_tail));
@@ -1363,6 +1720,8 @@ static int __split_huge_page_map(struct page *page,
1363 BUG_ON(page_mapcount(page) != 1); 1720 BUG_ON(page_mapcount(page) != 1);
1364 if (!pmd_young(*pmd)) 1721 if (!pmd_young(*pmd))
1365 entry = pte_mkold(entry); 1722 entry = pte_mkold(entry);
1723 if (pmd_numa(*pmd))
1724 entry = pte_mknuma(entry);
1366 pte = pte_offset_map(&_pmd, haddr); 1725 pte = pte_offset_map(&_pmd, haddr);
1367 BUG_ON(!pte_none(*pte)); 1726 BUG_ON(!pte_none(*pte));
1368 set_pte_at(mm, haddr, pte, entry); 1727 set_pte_at(mm, haddr, pte, entry);
@@ -1405,7 +1764,7 @@ static int __split_huge_page_map(struct page *page,
1405 return ret; 1764 return ret;
1406} 1765}
1407 1766
1408/* must be called with anon_vma->root->mutex hold */ 1767/* must be called with anon_vma->root->rwsem held */
1409static void __split_huge_page(struct page *page, 1768static void __split_huge_page(struct page *page,
1410 struct anon_vma *anon_vma) 1769 struct anon_vma *anon_vma)
1411{ 1770{
@@ -1458,10 +1817,21 @@ int split_huge_page(struct page *page)
1458 struct anon_vma *anon_vma; 1817 struct anon_vma *anon_vma;
1459 int ret = 1; 1818 int ret = 1;
1460 1819
1820 BUG_ON(is_huge_zero_pfn(page_to_pfn(page)));
1461 BUG_ON(!PageAnon(page)); 1821 BUG_ON(!PageAnon(page));
1462 anon_vma = page_lock_anon_vma(page); 1822
1823 /*
1824 * The caller does not necessarily hold an mmap_sem that would prevent
1825 * the anon_vma disappearing so we first we take a reference to it
1826 * and then lock the anon_vma for write. This is similar to
1827 * page_lock_anon_vma_read except the write lock is taken to serialise
1828 * against parallel split or collapse operations.
1829 */
1830 anon_vma = page_get_anon_vma(page);
1463 if (!anon_vma) 1831 if (!anon_vma)
1464 goto out; 1832 goto out;
1833 anon_vma_lock_write(anon_vma);
1834
1465 ret = 0; 1835 ret = 0;
1466 if (!PageCompound(page)) 1836 if (!PageCompound(page))
1467 goto out_unlock; 1837 goto out_unlock;
@@ -1472,7 +1842,8 @@ int split_huge_page(struct page *page)
1472 1842
1473 BUG_ON(PageCompound(page)); 1843 BUG_ON(PageCompound(page));
1474out_unlock: 1844out_unlock:
1475 page_unlock_anon_vma(anon_vma); 1845 anon_vma_unlock(anon_vma);
1846 put_anon_vma(anon_vma);
1476out: 1847out:
1477 return ret; 1848 return ret;
1478} 1849}
@@ -1701,64 +2072,49 @@ static void release_pte_pages(pte_t *pte, pte_t *_pte)
1701 } 2072 }
1702} 2073}
1703 2074
1704static void release_all_pte_pages(pte_t *pte)
1705{
1706 release_pte_pages(pte, pte + HPAGE_PMD_NR);
1707}
1708
1709static int __collapse_huge_page_isolate(struct vm_area_struct *vma, 2075static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
1710 unsigned long address, 2076 unsigned long address,
1711 pte_t *pte) 2077 pte_t *pte)
1712{ 2078{
1713 struct page *page; 2079 struct page *page;
1714 pte_t *_pte; 2080 pte_t *_pte;
1715 int referenced = 0, isolated = 0, none = 0; 2081 int referenced = 0, none = 0;
1716 for (_pte = pte; _pte < pte+HPAGE_PMD_NR; 2082 for (_pte = pte; _pte < pte+HPAGE_PMD_NR;
1717 _pte++, address += PAGE_SIZE) { 2083 _pte++, address += PAGE_SIZE) {
1718 pte_t pteval = *_pte; 2084 pte_t pteval = *_pte;
1719 if (pte_none(pteval)) { 2085 if (pte_none(pteval)) {
1720 if (++none <= khugepaged_max_ptes_none) 2086 if (++none <= khugepaged_max_ptes_none)
1721 continue; 2087 continue;
1722 else { 2088 else
1723 release_pte_pages(pte, _pte);
1724 goto out; 2089 goto out;
1725 }
1726 } 2090 }
1727 if (!pte_present(pteval) || !pte_write(pteval)) { 2091 if (!pte_present(pteval) || !pte_write(pteval))
1728 release_pte_pages(pte, _pte);
1729 goto out; 2092 goto out;
1730 }
1731 page = vm_normal_page(vma, address, pteval); 2093 page = vm_normal_page(vma, address, pteval);
1732 if (unlikely(!page)) { 2094 if (unlikely(!page))
1733 release_pte_pages(pte, _pte);
1734 goto out; 2095 goto out;
1735 } 2096
1736 VM_BUG_ON(PageCompound(page)); 2097 VM_BUG_ON(PageCompound(page));
1737 BUG_ON(!PageAnon(page)); 2098 BUG_ON(!PageAnon(page));
1738 VM_BUG_ON(!PageSwapBacked(page)); 2099 VM_BUG_ON(!PageSwapBacked(page));
1739 2100
1740 /* cannot use mapcount: can't collapse if there's a gup pin */ 2101 /* cannot use mapcount: can't collapse if there's a gup pin */
1741 if (page_count(page) != 1) { 2102 if (page_count(page) != 1)
1742 release_pte_pages(pte, _pte);
1743 goto out; 2103 goto out;
1744 }
1745 /* 2104 /*
1746 * We can do it before isolate_lru_page because the 2105 * We can do it before isolate_lru_page because the
1747 * page can't be freed from under us. NOTE: PG_lock 2106 * page can't be freed from under us. NOTE: PG_lock
1748 * is needed to serialize against split_huge_page 2107 * is needed to serialize against split_huge_page
1749 * when invoked from the VM. 2108 * when invoked from the VM.
1750 */ 2109 */
1751 if (!trylock_page(page)) { 2110 if (!trylock_page(page))
1752 release_pte_pages(pte, _pte);
1753 goto out; 2111 goto out;
1754 }
1755 /* 2112 /*
1756 * Isolate the page to avoid collapsing an hugepage 2113 * Isolate the page to avoid collapsing an hugepage
1757 * currently in use by the VM. 2114 * currently in use by the VM.
1758 */ 2115 */
1759 if (isolate_lru_page(page)) { 2116 if (isolate_lru_page(page)) {
1760 unlock_page(page); 2117 unlock_page(page);
1761 release_pte_pages(pte, _pte);
1762 goto out; 2118 goto out;
1763 } 2119 }
1764 /* 0 stands for page_is_file_cache(page) == false */ 2120 /* 0 stands for page_is_file_cache(page) == false */
@@ -1771,12 +2127,11 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
1771 mmu_notifier_test_young(vma->vm_mm, address)) 2127 mmu_notifier_test_young(vma->vm_mm, address))
1772 referenced = 1; 2128 referenced = 1;
1773 } 2129 }
1774 if (unlikely(!referenced)) 2130 if (likely(referenced))
1775 release_all_pte_pages(pte); 2131 return 1;
1776 else
1777 isolated = 1;
1778out: 2132out:
1779 return isolated; 2133 release_pte_pages(pte, _pte);
2134 return 0;
1780} 2135}
1781 2136
1782static void __collapse_huge_page_copy(pte_t *pte, struct page *page, 2137static void __collapse_huge_page_copy(pte_t *pte, struct page *page,
@@ -1918,14 +2273,26 @@ static struct page
1918} 2273}
1919#endif 2274#endif
1920 2275
2276static bool hugepage_vma_check(struct vm_area_struct *vma)
2277{
2278 if ((!(vma->vm_flags & VM_HUGEPAGE) && !khugepaged_always()) ||
2279 (vma->vm_flags & VM_NOHUGEPAGE))
2280 return false;
2281
2282 if (!vma->anon_vma || vma->vm_ops)
2283 return false;
2284 if (is_vma_temporary_stack(vma))
2285 return false;
2286 VM_BUG_ON(vma->vm_flags & VM_NO_THP);
2287 return true;
2288}
2289
1921static void collapse_huge_page(struct mm_struct *mm, 2290static void collapse_huge_page(struct mm_struct *mm,
1922 unsigned long address, 2291 unsigned long address,
1923 struct page **hpage, 2292 struct page **hpage,
1924 struct vm_area_struct *vma, 2293 struct vm_area_struct *vma,
1925 int node) 2294 int node)
1926{ 2295{
1927 pgd_t *pgd;
1928 pud_t *pud;
1929 pmd_t *pmd, _pmd; 2296 pmd_t *pmd, _pmd;
1930 pte_t *pte; 2297 pte_t *pte;
1931 pgtable_t pgtable; 2298 pgtable_t pgtable;
@@ -1960,31 +2327,15 @@ static void collapse_huge_page(struct mm_struct *mm,
1960 hend = vma->vm_end & HPAGE_PMD_MASK; 2327 hend = vma->vm_end & HPAGE_PMD_MASK;
1961 if (address < hstart || address + HPAGE_PMD_SIZE > hend) 2328 if (address < hstart || address + HPAGE_PMD_SIZE > hend)
1962 goto out; 2329 goto out;
1963 2330 if (!hugepage_vma_check(vma))
1964 if ((!(vma->vm_flags & VM_HUGEPAGE) && !khugepaged_always()) ||
1965 (vma->vm_flags & VM_NOHUGEPAGE))
1966 goto out;
1967
1968 if (!vma->anon_vma || vma->vm_ops)
1969 goto out;
1970 if (is_vma_temporary_stack(vma))
1971 goto out; 2331 goto out;
1972 VM_BUG_ON(vma->vm_flags & VM_NO_THP); 2332 pmd = mm_find_pmd(mm, address);
1973 2333 if (!pmd)
1974 pgd = pgd_offset(mm, address);
1975 if (!pgd_present(*pgd))
1976 goto out;
1977
1978 pud = pud_offset(pgd, address);
1979 if (!pud_present(*pud))
1980 goto out; 2334 goto out;
1981 2335 if (pmd_trans_huge(*pmd))
1982 pmd = pmd_offset(pud, address);
1983 /* pmd can't go away or become huge under us */
1984 if (!pmd_present(*pmd) || pmd_trans_huge(*pmd))
1985 goto out; 2336 goto out;
1986 2337
1987 anon_vma_lock(vma->anon_vma); 2338 anon_vma_lock_write(vma->anon_vma);
1988 2339
1989 pte = pte_offset_map(pmd, address); 2340 pte = pte_offset_map(pmd, address);
1990 ptl = pte_lockptr(mm, pmd); 2341 ptl = pte_lockptr(mm, pmd);
@@ -2028,9 +2379,7 @@ static void collapse_huge_page(struct mm_struct *mm,
2028 __SetPageUptodate(new_page); 2379 __SetPageUptodate(new_page);
2029 pgtable = pmd_pgtable(_pmd); 2380 pgtable = pmd_pgtable(_pmd);
2030 2381
2031 _pmd = mk_pmd(new_page, vma->vm_page_prot); 2382 _pmd = mk_huge_pmd(new_page, vma);
2032 _pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma);
2033 _pmd = pmd_mkhuge(_pmd);
2034 2383
2035 /* 2384 /*
2036 * spin_lock() below is not the equivalent of smp_wmb(), so 2385 * spin_lock() below is not the equivalent of smp_wmb(), so
@@ -2064,8 +2413,6 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
2064 unsigned long address, 2413 unsigned long address,
2065 struct page **hpage) 2414 struct page **hpage)
2066{ 2415{
2067 pgd_t *pgd;
2068 pud_t *pud;
2069 pmd_t *pmd; 2416 pmd_t *pmd;
2070 pte_t *pte, *_pte; 2417 pte_t *pte, *_pte;
2071 int ret = 0, referenced = 0, none = 0; 2418 int ret = 0, referenced = 0, none = 0;
@@ -2076,16 +2423,10 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
2076 2423
2077 VM_BUG_ON(address & ~HPAGE_PMD_MASK); 2424 VM_BUG_ON(address & ~HPAGE_PMD_MASK);
2078 2425
2079 pgd = pgd_offset(mm, address); 2426 pmd = mm_find_pmd(mm, address);
2080 if (!pgd_present(*pgd)) 2427 if (!pmd)
2081 goto out; 2428 goto out;
2082 2429 if (pmd_trans_huge(*pmd))
2083 pud = pud_offset(pgd, address);
2084 if (!pud_present(*pud))
2085 goto out;
2086
2087 pmd = pmd_offset(pud, address);
2088 if (!pmd_present(*pmd) || pmd_trans_huge(*pmd))
2089 goto out; 2430 goto out;
2090 2431
2091 pte = pte_offset_map_lock(mm, pmd, address, &ptl); 2432 pte = pte_offset_map_lock(mm, pmd, address, &ptl);
@@ -2193,20 +2534,11 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages,
2193 progress++; 2534 progress++;
2194 break; 2535 break;
2195 } 2536 }
2196 2537 if (!hugepage_vma_check(vma)) {
2197 if ((!(vma->vm_flags & VM_HUGEPAGE) && 2538skip:
2198 !khugepaged_always()) ||
2199 (vma->vm_flags & VM_NOHUGEPAGE)) {
2200 skip:
2201 progress++; 2539 progress++;
2202 continue; 2540 continue;
2203 } 2541 }
2204 if (!vma->anon_vma || vma->vm_ops)
2205 goto skip;
2206 if (is_vma_temporary_stack(vma))
2207 goto skip;
2208 VM_BUG_ON(vma->vm_flags & VM_NO_THP);
2209
2210 hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; 2542 hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
2211 hend = vma->vm_end & HPAGE_PMD_MASK; 2543 hend = vma->vm_end & HPAGE_PMD_MASK;
2212 if (hstart >= hend) 2544 if (hstart >= hend)
@@ -2356,19 +2688,65 @@ static int khugepaged(void *none)
2356 return 0; 2688 return 0;
2357} 2689}
2358 2690
2359void __split_huge_page_pmd(struct mm_struct *mm, pmd_t *pmd) 2691static void __split_huge_zero_page_pmd(struct vm_area_struct *vma,
2692 unsigned long haddr, pmd_t *pmd)
2693{
2694 struct mm_struct *mm = vma->vm_mm;
2695 pgtable_t pgtable;
2696 pmd_t _pmd;
2697 int i;
2698
2699 pmdp_clear_flush(vma, haddr, pmd);
2700 /* leave pmd empty until pte is filled */
2701
2702 pgtable = pgtable_trans_huge_withdraw(mm);
2703 pmd_populate(mm, &_pmd, pgtable);
2704
2705 for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
2706 pte_t *pte, entry;
2707 entry = pfn_pte(my_zero_pfn(haddr), vma->vm_page_prot);
2708 entry = pte_mkspecial(entry);
2709 pte = pte_offset_map(&_pmd, haddr);
2710 VM_BUG_ON(!pte_none(*pte));
2711 set_pte_at(mm, haddr, pte, entry);
2712 pte_unmap(pte);
2713 }
2714 smp_wmb(); /* make pte visible before pmd */
2715 pmd_populate(mm, pmd, pgtable);
2716 put_huge_zero_page();
2717}
2718
2719void __split_huge_page_pmd(struct vm_area_struct *vma, unsigned long address,
2720 pmd_t *pmd)
2360{ 2721{
2361 struct page *page; 2722 struct page *page;
2723 struct mm_struct *mm = vma->vm_mm;
2724 unsigned long haddr = address & HPAGE_PMD_MASK;
2725 unsigned long mmun_start; /* For mmu_notifiers */
2726 unsigned long mmun_end; /* For mmu_notifiers */
2727
2728 BUG_ON(vma->vm_start > haddr || vma->vm_end < haddr + HPAGE_PMD_SIZE);
2362 2729
2730 mmun_start = haddr;
2731 mmun_end = haddr + HPAGE_PMD_SIZE;
2732 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
2363 spin_lock(&mm->page_table_lock); 2733 spin_lock(&mm->page_table_lock);
2364 if (unlikely(!pmd_trans_huge(*pmd))) { 2734 if (unlikely(!pmd_trans_huge(*pmd))) {
2365 spin_unlock(&mm->page_table_lock); 2735 spin_unlock(&mm->page_table_lock);
2736 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
2737 return;
2738 }
2739 if (is_huge_zero_pmd(*pmd)) {
2740 __split_huge_zero_page_pmd(vma, haddr, pmd);
2741 spin_unlock(&mm->page_table_lock);
2742 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
2366 return; 2743 return;
2367 } 2744 }
2368 page = pmd_page(*pmd); 2745 page = pmd_page(*pmd);
2369 VM_BUG_ON(!page_count(page)); 2746 VM_BUG_ON(!page_count(page));
2370 get_page(page); 2747 get_page(page);
2371 spin_unlock(&mm->page_table_lock); 2748 spin_unlock(&mm->page_table_lock);
2749 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
2372 2750
2373 split_huge_page(page); 2751 split_huge_page(page);
2374 2752
@@ -2376,31 +2754,31 @@ void __split_huge_page_pmd(struct mm_struct *mm, pmd_t *pmd)
2376 BUG_ON(pmd_trans_huge(*pmd)); 2754 BUG_ON(pmd_trans_huge(*pmd));
2377} 2755}
2378 2756
2757void split_huge_page_pmd_mm(struct mm_struct *mm, unsigned long address,
2758 pmd_t *pmd)
2759{
2760 struct vm_area_struct *vma;
2761
2762 vma = find_vma(mm, address);
2763 BUG_ON(vma == NULL);
2764 split_huge_page_pmd(vma, address, pmd);
2765}
2766
2379static void split_huge_page_address(struct mm_struct *mm, 2767static void split_huge_page_address(struct mm_struct *mm,
2380 unsigned long address) 2768 unsigned long address)
2381{ 2769{
2382 pgd_t *pgd;
2383 pud_t *pud;
2384 pmd_t *pmd; 2770 pmd_t *pmd;
2385 2771
2386 VM_BUG_ON(!(address & ~HPAGE_PMD_MASK)); 2772 VM_BUG_ON(!(address & ~HPAGE_PMD_MASK));
2387 2773
2388 pgd = pgd_offset(mm, address); 2774 pmd = mm_find_pmd(mm, address);
2389 if (!pgd_present(*pgd)) 2775 if (!pmd)
2390 return;
2391
2392 pud = pud_offset(pgd, address);
2393 if (!pud_present(*pud))
2394 return;
2395
2396 pmd = pmd_offset(pud, address);
2397 if (!pmd_present(*pmd))
2398 return; 2776 return;
2399 /* 2777 /*
2400 * Caller holds the mmap_sem write mode, so a huge pmd cannot 2778 * Caller holds the mmap_sem write mode, so a huge pmd cannot
2401 * materialize from under us. 2779 * materialize from under us.
2402 */ 2780 */
2403 split_huge_page_pmd(mm, pmd); 2781 split_huge_page_pmd_mm(mm, address, pmd);
2404} 2782}
2405 2783
2406void __vma_adjust_trans_huge(struct vm_area_struct *vma, 2784void __vma_adjust_trans_huge(struct vm_area_struct *vma,
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 59a0059b39e2..4f3ea0b1e57c 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1,6 +1,6 @@
1/* 1/*
2 * Generic hugetlb support. 2 * Generic hugetlb support.
3 * (C) William Irwin, April 2004 3 * (C) Nadia Yvette Chambers, April 2004
4 */ 4 */
5#include <linux/list.h> 5#include <linux/list.h>
6#include <linux/init.h> 6#include <linux/init.h>
@@ -1057,7 +1057,7 @@ static void return_unused_surplus_pages(struct hstate *h,
1057 * on-line nodes with memory and will handle the hstate accounting. 1057 * on-line nodes with memory and will handle the hstate accounting.
1058 */ 1058 */
1059 while (nr_pages--) { 1059 while (nr_pages--) {
1060 if (!free_pool_huge_page(h, &node_states[N_HIGH_MEMORY], 1)) 1060 if (!free_pool_huge_page(h, &node_states[N_MEMORY], 1))
1061 break; 1061 break;
1062 } 1062 }
1063} 1063}
@@ -1180,14 +1180,14 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
1180int __weak alloc_bootmem_huge_page(struct hstate *h) 1180int __weak alloc_bootmem_huge_page(struct hstate *h)
1181{ 1181{
1182 struct huge_bootmem_page *m; 1182 struct huge_bootmem_page *m;
1183 int nr_nodes = nodes_weight(node_states[N_HIGH_MEMORY]); 1183 int nr_nodes = nodes_weight(node_states[N_MEMORY]);
1184 1184
1185 while (nr_nodes) { 1185 while (nr_nodes) {
1186 void *addr; 1186 void *addr;
1187 1187
1188 addr = __alloc_bootmem_node_nopanic( 1188 addr = __alloc_bootmem_node_nopanic(
1189 NODE_DATA(hstate_next_node_to_alloc(h, 1189 NODE_DATA(hstate_next_node_to_alloc(h,
1190 &node_states[N_HIGH_MEMORY])), 1190 &node_states[N_MEMORY])),
1191 huge_page_size(h), huge_page_size(h), 0); 1191 huge_page_size(h), huge_page_size(h), 0);
1192 1192
1193 if (addr) { 1193 if (addr) {
@@ -1259,7 +1259,7 @@ static void __init hugetlb_hstate_alloc_pages(struct hstate *h)
1259 if (!alloc_bootmem_huge_page(h)) 1259 if (!alloc_bootmem_huge_page(h))
1260 break; 1260 break;
1261 } else if (!alloc_fresh_huge_page(h, 1261 } else if (!alloc_fresh_huge_page(h,
1262 &node_states[N_HIGH_MEMORY])) 1262 &node_states[N_MEMORY]))
1263 break; 1263 break;
1264 } 1264 }
1265 h->max_huge_pages = i; 1265 h->max_huge_pages = i;
@@ -1527,7 +1527,7 @@ static ssize_t nr_hugepages_store_common(bool obey_mempolicy,
1527 if (!(obey_mempolicy && 1527 if (!(obey_mempolicy &&
1528 init_nodemask_of_mempolicy(nodes_allowed))) { 1528 init_nodemask_of_mempolicy(nodes_allowed))) {
1529 NODEMASK_FREE(nodes_allowed); 1529 NODEMASK_FREE(nodes_allowed);
1530 nodes_allowed = &node_states[N_HIGH_MEMORY]; 1530 nodes_allowed = &node_states[N_MEMORY];
1531 } 1531 }
1532 } else if (nodes_allowed) { 1532 } else if (nodes_allowed) {
1533 /* 1533 /*
@@ -1537,11 +1537,11 @@ static ssize_t nr_hugepages_store_common(bool obey_mempolicy,
1537 count += h->nr_huge_pages - h->nr_huge_pages_node[nid]; 1537 count += h->nr_huge_pages - h->nr_huge_pages_node[nid];
1538 init_nodemask_of_node(nodes_allowed, nid); 1538 init_nodemask_of_node(nodes_allowed, nid);
1539 } else 1539 } else
1540 nodes_allowed = &node_states[N_HIGH_MEMORY]; 1540 nodes_allowed = &node_states[N_MEMORY];
1541 1541
1542 h->max_huge_pages = set_max_huge_pages(h, count, nodes_allowed); 1542 h->max_huge_pages = set_max_huge_pages(h, count, nodes_allowed);
1543 1543
1544 if (nodes_allowed != &node_states[N_HIGH_MEMORY]) 1544 if (nodes_allowed != &node_states[N_MEMORY])
1545 NODEMASK_FREE(nodes_allowed); 1545 NODEMASK_FREE(nodes_allowed);
1546 1546
1547 return len; 1547 return len;
@@ -1800,7 +1800,7 @@ static void hugetlb_unregister_all_nodes(void)
1800 * remove hstate attributes from any nodes that have them. 1800 * remove hstate attributes from any nodes that have them.
1801 */ 1801 */
1802 for (nid = 0; nid < nr_node_ids; nid++) 1802 for (nid = 0; nid < nr_node_ids; nid++)
1803 hugetlb_unregister_node(&node_devices[nid]); 1803 hugetlb_unregister_node(node_devices[nid]);
1804} 1804}
1805 1805
1806/* 1806/*
@@ -1844,8 +1844,8 @@ static void hugetlb_register_all_nodes(void)
1844{ 1844{
1845 int nid; 1845 int nid;
1846 1846
1847 for_each_node_state(nid, N_HIGH_MEMORY) { 1847 for_each_node_state(nid, N_MEMORY) {
1848 struct node *node = &node_devices[nid]; 1848 struct node *node = node_devices[nid];
1849 if (node->dev.id == nid) 1849 if (node->dev.id == nid)
1850 hugetlb_register_node(node); 1850 hugetlb_register_node(node);
1851 } 1851 }
@@ -1906,14 +1906,12 @@ static int __init hugetlb_init(void)
1906 default_hstate.max_huge_pages = default_hstate_max_huge_pages; 1906 default_hstate.max_huge_pages = default_hstate_max_huge_pages;
1907 1907
1908 hugetlb_init_hstates(); 1908 hugetlb_init_hstates();
1909
1910 gather_bootmem_prealloc(); 1909 gather_bootmem_prealloc();
1911
1912 report_hugepages(); 1910 report_hugepages();
1913 1911
1914 hugetlb_sysfs_init(); 1912 hugetlb_sysfs_init();
1915
1916 hugetlb_register_all_nodes(); 1913 hugetlb_register_all_nodes();
1914 hugetlb_cgroup_file_init();
1917 1915
1918 return 0; 1916 return 0;
1919} 1917}
@@ -1939,17 +1937,10 @@ void __init hugetlb_add_hstate(unsigned order)
1939 for (i = 0; i < MAX_NUMNODES; ++i) 1937 for (i = 0; i < MAX_NUMNODES; ++i)
1940 INIT_LIST_HEAD(&h->hugepage_freelists[i]); 1938 INIT_LIST_HEAD(&h->hugepage_freelists[i]);
1941 INIT_LIST_HEAD(&h->hugepage_activelist); 1939 INIT_LIST_HEAD(&h->hugepage_activelist);
1942 h->next_nid_to_alloc = first_node(node_states[N_HIGH_MEMORY]); 1940 h->next_nid_to_alloc = first_node(node_states[N_MEMORY]);
1943 h->next_nid_to_free = first_node(node_states[N_HIGH_MEMORY]); 1941 h->next_nid_to_free = first_node(node_states[N_MEMORY]);
1944 snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB", 1942 snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB",
1945 huge_page_size(h)/1024); 1943 huge_page_size(h)/1024);
1946 /*
1947 * Add cgroup control files only if the huge page consists
1948 * of more than two normal pages. This is because we use
1949 * page[2].lru.next for storing cgoup details.
1950 */
1951 if (order >= HUGETLB_CGROUP_MIN_ORDER)
1952 hugetlb_cgroup_file_init(hugetlb_max_hstate - 1);
1953 1944
1954 parsed_hstate = h; 1945 parsed_hstate = h;
1955} 1946}
@@ -2035,11 +2026,11 @@ static int hugetlb_sysctl_handler_common(bool obey_mempolicy,
2035 if (!(obey_mempolicy && 2026 if (!(obey_mempolicy &&
2036 init_nodemask_of_mempolicy(nodes_allowed))) { 2027 init_nodemask_of_mempolicy(nodes_allowed))) {
2037 NODEMASK_FREE(nodes_allowed); 2028 NODEMASK_FREE(nodes_allowed);
2038 nodes_allowed = &node_states[N_HIGH_MEMORY]; 2029 nodes_allowed = &node_states[N_MEMORY];
2039 } 2030 }
2040 h->max_huge_pages = set_max_huge_pages(h, tmp, nodes_allowed); 2031 h->max_huge_pages = set_max_huge_pages(h, tmp, nodes_allowed);
2041 2032
2042 if (nodes_allowed != &node_states[N_HIGH_MEMORY]) 2033 if (nodes_allowed != &node_states[N_MEMORY])
2043 NODEMASK_FREE(nodes_allowed); 2034 NODEMASK_FREE(nodes_allowed);
2044 } 2035 }
2045out: 2036out:
@@ -2386,8 +2377,10 @@ again:
2386 /* 2377 /*
2387 * HWPoisoned hugepage is already unmapped and dropped reference 2378 * HWPoisoned hugepage is already unmapped and dropped reference
2388 */ 2379 */
2389 if (unlikely(is_hugetlb_entry_hwpoisoned(pte))) 2380 if (unlikely(is_hugetlb_entry_hwpoisoned(pte))) {
2381 pte_clear(mm, address, ptep);
2390 continue; 2382 continue;
2383 }
2391 2384
2392 page = pte_page(pte); 2385 page = pte_page(pte);
2393 /* 2386 /*
@@ -3014,7 +3007,7 @@ same_page:
3014 return i ? i : -EFAULT; 3007 return i ? i : -EFAULT;
3015} 3008}
3016 3009
3017void hugetlb_change_protection(struct vm_area_struct *vma, 3010unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
3018 unsigned long address, unsigned long end, pgprot_t newprot) 3011 unsigned long address, unsigned long end, pgprot_t newprot)
3019{ 3012{
3020 struct mm_struct *mm = vma->vm_mm; 3013 struct mm_struct *mm = vma->vm_mm;
@@ -3022,6 +3015,7 @@ void hugetlb_change_protection(struct vm_area_struct *vma,
3022 pte_t *ptep; 3015 pte_t *ptep;
3023 pte_t pte; 3016 pte_t pte;
3024 struct hstate *h = hstate_vma(vma); 3017 struct hstate *h = hstate_vma(vma);
3018 unsigned long pages = 0;
3025 3019
3026 BUG_ON(address >= end); 3020 BUG_ON(address >= end);
3027 flush_cache_range(vma, address, end); 3021 flush_cache_range(vma, address, end);
@@ -3032,12 +3026,15 @@ void hugetlb_change_protection(struct vm_area_struct *vma,
3032 ptep = huge_pte_offset(mm, address); 3026 ptep = huge_pte_offset(mm, address);
3033 if (!ptep) 3027 if (!ptep)
3034 continue; 3028 continue;
3035 if (huge_pmd_unshare(mm, &address, ptep)) 3029 if (huge_pmd_unshare(mm, &address, ptep)) {
3030 pages++;
3036 continue; 3031 continue;
3032 }
3037 if (!huge_pte_none(huge_ptep_get(ptep))) { 3033 if (!huge_pte_none(huge_ptep_get(ptep))) {
3038 pte = huge_ptep_get_and_clear(mm, address, ptep); 3034 pte = huge_ptep_get_and_clear(mm, address, ptep);
3039 pte = pte_mkhuge(pte_modify(pte, newprot)); 3035 pte = pte_mkhuge(pte_modify(pte, newprot));
3040 set_huge_pte_at(mm, address, ptep, pte); 3036 set_huge_pte_at(mm, address, ptep, pte);
3037 pages++;
3041 } 3038 }
3042 } 3039 }
3043 spin_unlock(&mm->page_table_lock); 3040 spin_unlock(&mm->page_table_lock);
@@ -3049,6 +3046,8 @@ void hugetlb_change_protection(struct vm_area_struct *vma,
3049 */ 3046 */
3050 flush_tlb_range(vma, start, end); 3047 flush_tlb_range(vma, start, end);
3051 mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex); 3048 mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex);
3049
3050 return pages << h->order;
3052} 3051}
3053 3052
3054int hugetlb_reserve_pages(struct inode *inode, 3053int hugetlb_reserve_pages(struct inode *inode,
@@ -3170,7 +3169,13 @@ int dequeue_hwpoisoned_huge_page(struct page *hpage)
3170 3169
3171 spin_lock(&hugetlb_lock); 3170 spin_lock(&hugetlb_lock);
3172 if (is_hugepage_on_freelist(hpage)) { 3171 if (is_hugepage_on_freelist(hpage)) {
3173 list_del(&hpage->lru); 3172 /*
3173 * Hwpoisoned hugepage isn't linked to activelist or freelist,
3174 * but dangling hpage->lru can trigger list-debug warnings
3175 * (this happens when we call unpoison_memory() on it),
3176 * so let it point to itself with list_del_init().
3177 */
3178 list_del_init(&hpage->lru);
3174 set_page_refcounted(hpage); 3179 set_page_refcounted(hpage);
3175 h->free_huge_pages--; 3180 h->free_huge_pages--;
3176 h->free_huge_pages_node[nid]--; 3181 h->free_huge_pages_node[nid]--;
diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c
index a3f358fb8a0c..9cea7de22ffb 100644
--- a/mm/hugetlb_cgroup.c
+++ b/mm/hugetlb_cgroup.c
@@ -77,7 +77,7 @@ static inline bool hugetlb_cgroup_have_usage(struct cgroup *cg)
77 return false; 77 return false;
78} 78}
79 79
80static struct cgroup_subsys_state *hugetlb_cgroup_create(struct cgroup *cgroup) 80static struct cgroup_subsys_state *hugetlb_cgroup_css_alloc(struct cgroup *cgroup)
81{ 81{
82 int idx; 82 int idx;
83 struct cgroup *parent_cgroup; 83 struct cgroup *parent_cgroup;
@@ -101,7 +101,7 @@ static struct cgroup_subsys_state *hugetlb_cgroup_create(struct cgroup *cgroup)
101 return &h_cgroup->css; 101 return &h_cgroup->css;
102} 102}
103 103
104static void hugetlb_cgroup_destroy(struct cgroup *cgroup) 104static void hugetlb_cgroup_css_free(struct cgroup *cgroup)
105{ 105{
106 struct hugetlb_cgroup *h_cgroup; 106 struct hugetlb_cgroup *h_cgroup;
107 107
@@ -155,18 +155,13 @@ out:
155 * Force the hugetlb cgroup to empty the hugetlb resources by moving them to 155 * Force the hugetlb cgroup to empty the hugetlb resources by moving them to
156 * the parent cgroup. 156 * the parent cgroup.
157 */ 157 */
158static int hugetlb_cgroup_pre_destroy(struct cgroup *cgroup) 158static void hugetlb_cgroup_css_offline(struct cgroup *cgroup)
159{ 159{
160 struct hstate *h; 160 struct hstate *h;
161 struct page *page; 161 struct page *page;
162 int ret = 0, idx = 0; 162 int idx = 0;
163 163
164 do { 164 do {
165 if (cgroup_task_count(cgroup) ||
166 !list_empty(&cgroup->children)) {
167 ret = -EBUSY;
168 goto out;
169 }
170 for_each_hstate(h) { 165 for_each_hstate(h) {
171 spin_lock(&hugetlb_lock); 166 spin_lock(&hugetlb_lock);
172 list_for_each_entry(page, &h->hugepage_activelist, lru) 167 list_for_each_entry(page, &h->hugepage_activelist, lru)
@@ -177,8 +172,6 @@ static int hugetlb_cgroup_pre_destroy(struct cgroup *cgroup)
177 } 172 }
178 cond_resched(); 173 cond_resched();
179 } while (hugetlb_cgroup_have_usage(cgroup)); 174 } while (hugetlb_cgroup_have_usage(cgroup));
180out:
181 return ret;
182} 175}
183 176
184int hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages, 177int hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages,
@@ -340,7 +333,7 @@ static char *mem_fmt(char *buf, int size, unsigned long hsize)
340 return buf; 333 return buf;
341} 334}
342 335
343int __init hugetlb_cgroup_file_init(int idx) 336static void __init __hugetlb_cgroup_file_init(int idx)
344{ 337{
345 char buf[32]; 338 char buf[32];
346 struct cftype *cft; 339 struct cftype *cft;
@@ -382,7 +375,22 @@ int __init hugetlb_cgroup_file_init(int idx)
382 375
383 WARN_ON(cgroup_add_cftypes(&hugetlb_subsys, h->cgroup_files)); 376 WARN_ON(cgroup_add_cftypes(&hugetlb_subsys, h->cgroup_files));
384 377
385 return 0; 378 return;
379}
380
381void __init hugetlb_cgroup_file_init(void)
382{
383 struct hstate *h;
384
385 for_each_hstate(h) {
386 /*
387 * Add cgroup control files only if the huge page consists
388 * of more than two normal pages. This is because we use
389 * page[2].lru.next for storing cgroup details.
390 */
391 if (huge_page_order(h) >= HUGETLB_CGROUP_MIN_ORDER)
392 __hugetlb_cgroup_file_init(hstate_index(h));
393 }
386} 394}
387 395
388/* 396/*
@@ -411,8 +419,8 @@ void hugetlb_cgroup_migrate(struct page *oldhpage, struct page *newhpage)
411 419
412struct cgroup_subsys hugetlb_subsys = { 420struct cgroup_subsys hugetlb_subsys = {
413 .name = "hugetlb", 421 .name = "hugetlb",
414 .create = hugetlb_cgroup_create, 422 .css_alloc = hugetlb_cgroup_css_alloc,
415 .pre_destroy = hugetlb_cgroup_pre_destroy, 423 .css_offline = hugetlb_cgroup_css_offline,
416 .destroy = hugetlb_cgroup_destroy, 424 .css_free = hugetlb_cgroup_css_free,
417 .subsys_id = hugetlb_subsys_id, 425 .subsys_id = hugetlb_subsys_id,
418}; 426};
diff --git a/mm/internal.h b/mm/internal.h
index a4fa284f6bc2..9ba21100ebf3 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -92,6 +92,11 @@ extern int isolate_lru_page(struct page *page);
92extern void putback_lru_page(struct page *page); 92extern void putback_lru_page(struct page *page);
93 93
94/* 94/*
95 * in mm/rmap.c:
96 */
97extern pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address);
98
99/*
95 * in mm/page_alloc.c 100 * in mm/page_alloc.c
96 */ 101 */
97extern void __free_pages_bootmem(struct page *page, unsigned int order); 102extern void __free_pages_bootmem(struct page *page, unsigned int order);
@@ -130,7 +135,6 @@ struct compact_control {
130 int migratetype; /* MOVABLE, RECLAIMABLE etc */ 135 int migratetype; /* MOVABLE, RECLAIMABLE etc */
131 struct zone *zone; 136 struct zone *zone;
132 bool contended; /* True if a lock was contended */ 137 bool contended; /* True if a lock was contended */
133 struct page **page; /* Page captured of requested size */
134}; 138};
135 139
136unsigned long 140unsigned long
@@ -212,15 +216,18 @@ static inline void mlock_migrate_page(struct page *newpage, struct page *page)
212{ 216{
213 if (TestClearPageMlocked(page)) { 217 if (TestClearPageMlocked(page)) {
214 unsigned long flags; 218 unsigned long flags;
219 int nr_pages = hpage_nr_pages(page);
215 220
216 local_irq_save(flags); 221 local_irq_save(flags);
217 __dec_zone_page_state(page, NR_MLOCK); 222 __mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages);
218 SetPageMlocked(newpage); 223 SetPageMlocked(newpage);
219 __inc_zone_page_state(newpage, NR_MLOCK); 224 __mod_zone_page_state(page_zone(newpage), NR_MLOCK, nr_pages);
220 local_irq_restore(flags); 225 local_irq_restore(flags);
221 } 226 }
222} 227}
223 228
229extern pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma);
230
224#ifdef CONFIG_TRANSPARENT_HUGEPAGE 231#ifdef CONFIG_TRANSPARENT_HUGEPAGE
225extern unsigned long vma_address(struct page *page, 232extern unsigned long vma_address(struct page *page,
226 struct vm_area_struct *vma); 233 struct vm_area_struct *vma);
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index a217cc544060..752a705c77c2 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -1556,7 +1556,8 @@ static int dump_str_object_info(const char *str)
1556 struct kmemleak_object *object; 1556 struct kmemleak_object *object;
1557 unsigned long addr; 1557 unsigned long addr;
1558 1558
1559 addr= simple_strtoul(str, NULL, 0); 1559 if (kstrtoul(str, 0, &addr))
1560 return -EINVAL;
1560 object = find_and_get_object(addr, 0); 1561 object = find_and_get_object(addr, 0);
1561 if (!object) { 1562 if (!object) {
1562 pr_info("Unknown object at 0x%08lx\n", addr); 1563 pr_info("Unknown object at 0x%08lx\n", addr);
diff --git a/mm/ksm.c b/mm/ksm.c
index ae539f0b8aa1..51573858938d 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -778,8 +778,6 @@ static int replace_page(struct vm_area_struct *vma, struct page *page,
778 struct page *kpage, pte_t orig_pte) 778 struct page *kpage, pte_t orig_pte)
779{ 779{
780 struct mm_struct *mm = vma->vm_mm; 780 struct mm_struct *mm = vma->vm_mm;
781 pgd_t *pgd;
782 pud_t *pud;
783 pmd_t *pmd; 781 pmd_t *pmd;
784 pte_t *ptep; 782 pte_t *ptep;
785 spinlock_t *ptl; 783 spinlock_t *ptl;
@@ -792,18 +790,10 @@ static int replace_page(struct vm_area_struct *vma, struct page *page,
792 if (addr == -EFAULT) 790 if (addr == -EFAULT)
793 goto out; 791 goto out;
794 792
795 pgd = pgd_offset(mm, addr); 793 pmd = mm_find_pmd(mm, addr);
796 if (!pgd_present(*pgd)) 794 if (!pmd)
797 goto out; 795 goto out;
798
799 pud = pud_offset(pgd, addr);
800 if (!pud_present(*pud))
801 goto out;
802
803 pmd = pmd_offset(pud, addr);
804 BUG_ON(pmd_trans_huge(*pmd)); 796 BUG_ON(pmd_trans_huge(*pmd));
805 if (!pmd_present(*pmd))
806 goto out;
807 797
808 mmun_start = addr; 798 mmun_start = addr;
809 mmun_end = addr + PAGE_SIZE; 799 mmun_end = addr + PAGE_SIZE;
@@ -1634,7 +1624,7 @@ again:
1634 struct anon_vma_chain *vmac; 1624 struct anon_vma_chain *vmac;
1635 struct vm_area_struct *vma; 1625 struct vm_area_struct *vma;
1636 1626
1637 anon_vma_lock(anon_vma); 1627 anon_vma_lock_read(anon_vma);
1638 anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root, 1628 anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root,
1639 0, ULONG_MAX) { 1629 0, ULONG_MAX) {
1640 vma = vmac->vma; 1630 vma = vmac->vma;
@@ -1658,7 +1648,7 @@ again:
1658 if (!search_new_forks || !mapcount) 1648 if (!search_new_forks || !mapcount)
1659 break; 1649 break;
1660 } 1650 }
1661 anon_vma_unlock(anon_vma); 1651 anon_vma_unlock_read(anon_vma);
1662 if (!mapcount) 1652 if (!mapcount)
1663 goto out; 1653 goto out;
1664 } 1654 }
@@ -1688,7 +1678,7 @@ again:
1688 struct anon_vma_chain *vmac; 1678 struct anon_vma_chain *vmac;
1689 struct vm_area_struct *vma; 1679 struct vm_area_struct *vma;
1690 1680
1691 anon_vma_lock(anon_vma); 1681 anon_vma_lock_read(anon_vma);
1692 anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root, 1682 anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root,
1693 0, ULONG_MAX) { 1683 0, ULONG_MAX) {
1694 vma = vmac->vma; 1684 vma = vmac->vma;
@@ -1707,11 +1697,11 @@ again:
1707 ret = try_to_unmap_one(page, vma, 1697 ret = try_to_unmap_one(page, vma,
1708 rmap_item->address, flags); 1698 rmap_item->address, flags);
1709 if (ret != SWAP_AGAIN || !page_mapped(page)) { 1699 if (ret != SWAP_AGAIN || !page_mapped(page)) {
1710 anon_vma_unlock(anon_vma); 1700 anon_vma_unlock_read(anon_vma);
1711 goto out; 1701 goto out;
1712 } 1702 }
1713 } 1703 }
1714 anon_vma_unlock(anon_vma); 1704 anon_vma_unlock_read(anon_vma);
1715 } 1705 }
1716 if (!search_new_forks++) 1706 if (!search_new_forks++)
1717 goto again; 1707 goto again;
@@ -1741,7 +1731,7 @@ again:
1741 struct anon_vma_chain *vmac; 1731 struct anon_vma_chain *vmac;
1742 struct vm_area_struct *vma; 1732 struct vm_area_struct *vma;
1743 1733
1744 anon_vma_lock(anon_vma); 1734 anon_vma_lock_read(anon_vma);
1745 anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root, 1735 anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root,
1746 0, ULONG_MAX) { 1736 0, ULONG_MAX) {
1747 vma = vmac->vma; 1737 vma = vmac->vma;
@@ -1759,11 +1749,11 @@ again:
1759 1749
1760 ret = rmap_one(page, vma, rmap_item->address, arg); 1750 ret = rmap_one(page, vma, rmap_item->address, arg);
1761 if (ret != SWAP_AGAIN) { 1751 if (ret != SWAP_AGAIN) {
1762 anon_vma_unlock(anon_vma); 1752 anon_vma_unlock_read(anon_vma);
1763 goto out; 1753 goto out;
1764 } 1754 }
1765 } 1755 }
1766 anon_vma_unlock(anon_vma); 1756 anon_vma_unlock_read(anon_vma);
1767 } 1757 }
1768 if (!search_new_forks++) 1758 if (!search_new_forks++)
1769 goto again; 1759 goto again;
@@ -1929,12 +1919,9 @@ static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr,
1929 if (ksm_run != flags) { 1919 if (ksm_run != flags) {
1930 ksm_run = flags; 1920 ksm_run = flags;
1931 if (flags & KSM_RUN_UNMERGE) { 1921 if (flags & KSM_RUN_UNMERGE) {
1932 int oom_score_adj; 1922 set_current_oom_origin();
1933
1934 oom_score_adj = test_set_oom_score_adj(OOM_SCORE_ADJ_MAX);
1935 err = unmerge_and_remove_all_rmap_items(); 1923 err = unmerge_and_remove_all_rmap_items();
1936 compare_swap_oom_score_adj(OOM_SCORE_ADJ_MAX, 1924 clear_current_oom_origin();
1937 oom_score_adj);
1938 if (err) { 1925 if (err) {
1939 ksm_run = KSM_RUN_STOP; 1926 ksm_run = KSM_RUN_STOP;
1940 count = err; 1927 count = err;
diff --git a/mm/memblock.c b/mm/memblock.c
index 625905523c2a..88adc8afb610 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -314,7 +314,8 @@ static void __init_memblock memblock_merge_regions(struct memblock_type *type)
314 } 314 }
315 315
316 this->size += next->size; 316 this->size += next->size;
317 memmove(next, next + 1, (type->cnt - (i + 1)) * sizeof(*next)); 317 /* move forward from next + 1, index of which is i + 2 */
318 memmove(next, next + 1, (type->cnt - (i + 2)) * sizeof(*next));
318 type->cnt--; 319 type->cnt--;
319 } 320 }
320} 321}
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index dd39ba000b31..09255ec8159c 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -10,6 +10,10 @@
10 * Copyright (C) 2009 Nokia Corporation 10 * Copyright (C) 2009 Nokia Corporation
11 * Author: Kirill A. Shutemov 11 * Author: Kirill A. Shutemov
12 * 12 *
13 * Kernel Memory Controller
14 * Copyright (C) 2012 Parallels Inc. and Google Inc.
15 * Authors: Glauber Costa and Suleiman Souhlal
16 *
13 * This program is free software; you can redistribute it and/or modify 17 * This program is free software; you can redistribute it and/or modify
14 * it under the terms of the GNU General Public License as published by 18 * it under the terms of the GNU General Public License as published by
15 * the Free Software Foundation; either version 2 of the License, or 19 * the Free Software Foundation; either version 2 of the License, or
@@ -59,6 +63,8 @@
59#include <trace/events/vmscan.h> 63#include <trace/events/vmscan.h>
60 64
61struct cgroup_subsys mem_cgroup_subsys __read_mostly; 65struct cgroup_subsys mem_cgroup_subsys __read_mostly;
66EXPORT_SYMBOL(mem_cgroup_subsys);
67
62#define MEM_CGROUP_RECLAIM_RETRIES 5 68#define MEM_CGROUP_RECLAIM_RETRIES 5
63static struct mem_cgroup *root_mem_cgroup __read_mostly; 69static struct mem_cgroup *root_mem_cgroup __read_mostly;
64 70
@@ -266,6 +272,10 @@ struct mem_cgroup {
266 }; 272 };
267 273
268 /* 274 /*
275 * the counter to account for kernel memory usage.
276 */
277 struct res_counter kmem;
278 /*
269 * Per cgroup active and inactive list, similar to the 279 * Per cgroup active and inactive list, similar to the
270 * per zone LRU lists. 280 * per zone LRU lists.
271 */ 281 */
@@ -280,6 +290,7 @@ struct mem_cgroup {
280 * Should the accounting and control be hierarchical, per subtree? 290 * Should the accounting and control be hierarchical, per subtree?
281 */ 291 */
282 bool use_hierarchy; 292 bool use_hierarchy;
293 unsigned long kmem_account_flags; /* See KMEM_ACCOUNTED_*, below */
283 294
284 bool oom_lock; 295 bool oom_lock;
285 atomic_t under_oom; 296 atomic_t under_oom;
@@ -330,8 +341,61 @@ struct mem_cgroup {
330#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET) 341#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET)
331 struct tcp_memcontrol tcp_mem; 342 struct tcp_memcontrol tcp_mem;
332#endif 343#endif
344#if defined(CONFIG_MEMCG_KMEM)
345 /* analogous to slab_common's slab_caches list. per-memcg */
346 struct list_head memcg_slab_caches;
347 /* Not a spinlock, we can take a lot of time walking the list */
348 struct mutex slab_caches_mutex;
349 /* Index in the kmem_cache->memcg_params->memcg_caches array */
350 int kmemcg_id;
351#endif
333}; 352};
334 353
354/* internal only representation about the status of kmem accounting. */
355enum {
356 KMEM_ACCOUNTED_ACTIVE = 0, /* accounted by this cgroup itself */
357 KMEM_ACCOUNTED_ACTIVATED, /* static key enabled. */
358 KMEM_ACCOUNTED_DEAD, /* dead memcg with pending kmem charges */
359};
360
361/* We account when limit is on, but only after call sites are patched */
362#define KMEM_ACCOUNTED_MASK \
363 ((1 << KMEM_ACCOUNTED_ACTIVE) | (1 << KMEM_ACCOUNTED_ACTIVATED))
364
365#ifdef CONFIG_MEMCG_KMEM
366static inline void memcg_kmem_set_active(struct mem_cgroup *memcg)
367{
368 set_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags);
369}
370
371static bool memcg_kmem_is_active(struct mem_cgroup *memcg)
372{
373 return test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags);
374}
375
376static void memcg_kmem_set_activated(struct mem_cgroup *memcg)
377{
378 set_bit(KMEM_ACCOUNTED_ACTIVATED, &memcg->kmem_account_flags);
379}
380
381static void memcg_kmem_clear_activated(struct mem_cgroup *memcg)
382{
383 clear_bit(KMEM_ACCOUNTED_ACTIVATED, &memcg->kmem_account_flags);
384}
385
386static void memcg_kmem_mark_dead(struct mem_cgroup *memcg)
387{
388 if (test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags))
389 set_bit(KMEM_ACCOUNTED_DEAD, &memcg->kmem_account_flags);
390}
391
392static bool memcg_kmem_test_and_clear_dead(struct mem_cgroup *memcg)
393{
394 return test_and_clear_bit(KMEM_ACCOUNTED_DEAD,
395 &memcg->kmem_account_flags);
396}
397#endif
398
335/* Stuffs for move charges at task migration. */ 399/* Stuffs for move charges at task migration. */
336/* 400/*
337 * Types of charges to be moved. "move_charge_at_immitgrate" is treated as a 401 * Types of charges to be moved. "move_charge_at_immitgrate" is treated as a
@@ -386,9 +450,13 @@ enum charge_type {
386}; 450};
387 451
388/* for encoding cft->private value on file */ 452/* for encoding cft->private value on file */
389#define _MEM (0) 453enum res_type {
390#define _MEMSWAP (1) 454 _MEM,
391#define _OOM_TYPE (2) 455 _MEMSWAP,
456 _OOM_TYPE,
457 _KMEM,
458};
459
392#define MEMFILE_PRIVATE(x, val) ((x) << 16 | (val)) 460#define MEMFILE_PRIVATE(x, val) ((x) << 16 | (val))
393#define MEMFILE_TYPE(val) ((val) >> 16 & 0xffff) 461#define MEMFILE_TYPE(val) ((val) >> 16 & 0xffff)
394#define MEMFILE_ATTR(val) ((val) & 0xffff) 462#define MEMFILE_ATTR(val) ((val) & 0xffff)
@@ -485,6 +553,75 @@ static void disarm_sock_keys(struct mem_cgroup *memcg)
485} 553}
486#endif 554#endif
487 555
556#ifdef CONFIG_MEMCG_KMEM
557/*
558 * This will be the memcg's index in each cache's ->memcg_params->memcg_caches.
559 * There are two main reasons for not using the css_id for this:
560 * 1) this works better in sparse environments, where we have a lot of memcgs,
561 * but only a few kmem-limited. Or also, if we have, for instance, 200
562 * memcgs, and none but the 200th is kmem-limited, we'd have to have a
563 * 200 entry array for that.
564 *
565 * 2) In order not to violate the cgroup API, we would like to do all memory
566 * allocation in ->create(). At that point, we haven't yet allocated the
567 * css_id. Having a separate index prevents us from messing with the cgroup
568 * core for this
569 *
570 * The current size of the caches array is stored in
571 * memcg_limited_groups_array_size. It will double each time we have to
572 * increase it.
573 */
574static DEFINE_IDA(kmem_limited_groups);
575int memcg_limited_groups_array_size;
576
577/*
578 * MIN_SIZE is different than 1, because we would like to avoid going through
579 * the alloc/free process all the time. In a small machine, 4 kmem-limited
580 * cgroups is a reasonable guess. In the future, it could be a parameter or
581 * tunable, but that is strictly not necessary.
582 *
583 * MAX_SIZE should be as large as the number of css_ids. Ideally, we could get
584 * this constant directly from cgroup, but it is understandable that this is
585 * better kept as an internal representation in cgroup.c. In any case, the
586 * css_id space is not getting any smaller, and we don't have to necessarily
587 * increase ours as well if it increases.
588 */
589#define MEMCG_CACHES_MIN_SIZE 4
590#define MEMCG_CACHES_MAX_SIZE 65535
591
592/*
593 * A lot of the calls to the cache allocation functions are expected to be
594 * inlined by the compiler. Since the calls to memcg_kmem_get_cache are
595 * conditional to this static branch, we'll have to allow modules that does
596 * kmem_cache_alloc and the such to see this symbol as well
597 */
598struct static_key memcg_kmem_enabled_key;
599EXPORT_SYMBOL(memcg_kmem_enabled_key);
600
601static void disarm_kmem_keys(struct mem_cgroup *memcg)
602{
603 if (memcg_kmem_is_active(memcg)) {
604 static_key_slow_dec(&memcg_kmem_enabled_key);
605 ida_simple_remove(&kmem_limited_groups, memcg->kmemcg_id);
606 }
607 /*
608 * This check can't live in kmem destruction function,
609 * since the charges will outlive the cgroup
610 */
611 WARN_ON(res_counter_read_u64(&memcg->kmem, RES_USAGE) != 0);
612}
613#else
614static void disarm_kmem_keys(struct mem_cgroup *memcg)
615{
616}
617#endif /* CONFIG_MEMCG_KMEM */
618
619static void disarm_static_keys(struct mem_cgroup *memcg)
620{
621 disarm_sock_keys(memcg);
622 disarm_kmem_keys(memcg);
623}
624
488static void drain_all_stock_async(struct mem_cgroup *memcg); 625static void drain_all_stock_async(struct mem_cgroup *memcg);
489 626
490static struct mem_cgroup_per_zone * 627static struct mem_cgroup_per_zone *
@@ -800,7 +937,7 @@ static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg,
800 int nid; 937 int nid;
801 u64 total = 0; 938 u64 total = 0;
802 939
803 for_each_node_state(nid, N_HIGH_MEMORY) 940 for_each_node_state(nid, N_MEMORY)
804 total += mem_cgroup_node_nr_lru_pages(memcg, nid, lru_mask); 941 total += mem_cgroup_node_nr_lru_pages(memcg, nid, lru_mask);
805 return total; 942 return total;
806} 943}
@@ -1015,13 +1152,10 @@ void mem_cgroup_iter_break(struct mem_cgroup *root,
1015 iter != NULL; \ 1152 iter != NULL; \
1016 iter = mem_cgroup_iter(NULL, iter, NULL)) 1153 iter = mem_cgroup_iter(NULL, iter, NULL))
1017 1154
1018void mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx) 1155void __mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx)
1019{ 1156{
1020 struct mem_cgroup *memcg; 1157 struct mem_cgroup *memcg;
1021 1158
1022 if (!mm)
1023 return;
1024
1025 rcu_read_lock(); 1159 rcu_read_lock();
1026 memcg = mem_cgroup_from_task(rcu_dereference(mm->owner)); 1160 memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
1027 if (unlikely(!memcg)) 1161 if (unlikely(!memcg))
@@ -1040,7 +1174,7 @@ void mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx)
1040out: 1174out:
1041 rcu_read_unlock(); 1175 rcu_read_unlock();
1042} 1176}
1043EXPORT_SYMBOL(mem_cgroup_count_vm_event); 1177EXPORT_SYMBOL(__mem_cgroup_count_vm_event);
1044 1178
1045/** 1179/**
1046 * mem_cgroup_zone_lruvec - get the lru list vector for a zone and memcg 1180 * mem_cgroup_zone_lruvec - get the lru list vector for a zone and memcg
@@ -1454,6 +1588,10 @@ done:
1454 res_counter_read_u64(&memcg->memsw, RES_USAGE) >> 10, 1588 res_counter_read_u64(&memcg->memsw, RES_USAGE) >> 10,
1455 res_counter_read_u64(&memcg->memsw, RES_LIMIT) >> 10, 1589 res_counter_read_u64(&memcg->memsw, RES_LIMIT) >> 10,
1456 res_counter_read_u64(&memcg->memsw, RES_FAILCNT)); 1590 res_counter_read_u64(&memcg->memsw, RES_FAILCNT));
1591 printk(KERN_INFO "kmem: usage %llukB, limit %llukB, failcnt %llu\n",
1592 res_counter_read_u64(&memcg->kmem, RES_USAGE) >> 10,
1593 res_counter_read_u64(&memcg->kmem, RES_LIMIT) >> 10,
1594 res_counter_read_u64(&memcg->kmem, RES_FAILCNT));
1457} 1595}
1458 1596
1459/* 1597/*
@@ -1498,8 +1636,8 @@ static u64 mem_cgroup_get_limit(struct mem_cgroup *memcg)
1498 return limit; 1636 return limit;
1499} 1637}
1500 1638
1501void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, 1639static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
1502 int order) 1640 int order)
1503{ 1641{
1504 struct mem_cgroup *iter; 1642 struct mem_cgroup *iter;
1505 unsigned long chosen_points = 0; 1643 unsigned long chosen_points = 0;
@@ -1644,9 +1782,9 @@ static void mem_cgroup_may_update_nodemask(struct mem_cgroup *memcg)
1644 return; 1782 return;
1645 1783
1646 /* make a nodemask where this memcg uses memory from */ 1784 /* make a nodemask where this memcg uses memory from */
1647 memcg->scan_nodes = node_states[N_HIGH_MEMORY]; 1785 memcg->scan_nodes = node_states[N_MEMORY];
1648 1786
1649 for_each_node_mask(nid, node_states[N_HIGH_MEMORY]) { 1787 for_each_node_mask(nid, node_states[N_MEMORY]) {
1650 1788
1651 if (!test_mem_cgroup_node_reclaimable(memcg, nid, false)) 1789 if (!test_mem_cgroup_node_reclaimable(memcg, nid, false))
1652 node_clear(nid, memcg->scan_nodes); 1790 node_clear(nid, memcg->scan_nodes);
@@ -1717,7 +1855,7 @@ static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap)
1717 /* 1855 /*
1718 * Check rest of nodes. 1856 * Check rest of nodes.
1719 */ 1857 */
1720 for_each_node_state(nid, N_HIGH_MEMORY) { 1858 for_each_node_state(nid, N_MEMORY) {
1721 if (node_isset(nid, memcg->scan_nodes)) 1859 if (node_isset(nid, memcg->scan_nodes))
1722 continue; 1860 continue;
1723 if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap)) 1861 if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap))
@@ -2061,20 +2199,28 @@ struct memcg_stock_pcp {
2061static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock); 2199static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock);
2062static DEFINE_MUTEX(percpu_charge_mutex); 2200static DEFINE_MUTEX(percpu_charge_mutex);
2063 2201
2064/* 2202/**
2065 * Try to consume stocked charge on this cpu. If success, one page is consumed 2203 * consume_stock: Try to consume stocked charge on this cpu.
2066 * from local stock and true is returned. If the stock is 0 or charges from a 2204 * @memcg: memcg to consume from.
2067 * cgroup which is not current target, returns false. This stock will be 2205 * @nr_pages: how many pages to charge.
2068 * refilled. 2206 *
2207 * The charges will only happen if @memcg matches the current cpu's memcg
2208 * stock, and at least @nr_pages are available in that stock. Failure to
2209 * service an allocation will refill the stock.
2210 *
2211 * returns true if successful, false otherwise.
2069 */ 2212 */
2070static bool consume_stock(struct mem_cgroup *memcg) 2213static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
2071{ 2214{
2072 struct memcg_stock_pcp *stock; 2215 struct memcg_stock_pcp *stock;
2073 bool ret = true; 2216 bool ret = true;
2074 2217
2218 if (nr_pages > CHARGE_BATCH)
2219 return false;
2220
2075 stock = &get_cpu_var(memcg_stock); 2221 stock = &get_cpu_var(memcg_stock);
2076 if (memcg == stock->cached && stock->nr_pages) 2222 if (memcg == stock->cached && stock->nr_pages >= nr_pages)
2077 stock->nr_pages--; 2223 stock->nr_pages -= nr_pages;
2078 else /* need to call res_counter_charge */ 2224 else /* need to call res_counter_charge */
2079 ret = false; 2225 ret = false;
2080 put_cpu_var(memcg_stock); 2226 put_cpu_var(memcg_stock);
@@ -2251,7 +2397,8 @@ enum {
2251}; 2397};
2252 2398
2253static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, 2399static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
2254 unsigned int nr_pages, bool oom_check) 2400 unsigned int nr_pages, unsigned int min_pages,
2401 bool oom_check)
2255{ 2402{
2256 unsigned long csize = nr_pages * PAGE_SIZE; 2403 unsigned long csize = nr_pages * PAGE_SIZE;
2257 struct mem_cgroup *mem_over_limit; 2404 struct mem_cgroup *mem_over_limit;
@@ -2274,18 +2421,18 @@ static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
2274 } else 2421 } else
2275 mem_over_limit = mem_cgroup_from_res_counter(fail_res, res); 2422 mem_over_limit = mem_cgroup_from_res_counter(fail_res, res);
2276 /* 2423 /*
2277 * nr_pages can be either a huge page (HPAGE_PMD_NR), a batch
2278 * of regular pages (CHARGE_BATCH), or a single regular page (1).
2279 *
2280 * Never reclaim on behalf of optional batching, retry with a 2424 * Never reclaim on behalf of optional batching, retry with a
2281 * single page instead. 2425 * single page instead.
2282 */ 2426 */
2283 if (nr_pages == CHARGE_BATCH) 2427 if (nr_pages > min_pages)
2284 return CHARGE_RETRY; 2428 return CHARGE_RETRY;
2285 2429
2286 if (!(gfp_mask & __GFP_WAIT)) 2430 if (!(gfp_mask & __GFP_WAIT))
2287 return CHARGE_WOULDBLOCK; 2431 return CHARGE_WOULDBLOCK;
2288 2432
2433 if (gfp_mask & __GFP_NORETRY)
2434 return CHARGE_NOMEM;
2435
2289 ret = mem_cgroup_reclaim(mem_over_limit, gfp_mask, flags); 2436 ret = mem_cgroup_reclaim(mem_over_limit, gfp_mask, flags);
2290 if (mem_cgroup_margin(mem_over_limit) >= nr_pages) 2437 if (mem_cgroup_margin(mem_over_limit) >= nr_pages)
2291 return CHARGE_RETRY; 2438 return CHARGE_RETRY;
@@ -2298,7 +2445,7 @@ static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
2298 * unlikely to succeed so close to the limit, and we fall back 2445 * unlikely to succeed so close to the limit, and we fall back
2299 * to regular pages anyway in case of failure. 2446 * to regular pages anyway in case of failure.
2300 */ 2447 */
2301 if (nr_pages == 1 && ret) 2448 if (nr_pages <= (1 << PAGE_ALLOC_COSTLY_ORDER) && ret)
2302 return CHARGE_RETRY; 2449 return CHARGE_RETRY;
2303 2450
2304 /* 2451 /*
@@ -2370,10 +2517,9 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
2370again: 2517again:
2371 if (*ptr) { /* css should be a valid one */ 2518 if (*ptr) { /* css should be a valid one */
2372 memcg = *ptr; 2519 memcg = *ptr;
2373 VM_BUG_ON(css_is_removed(&memcg->css));
2374 if (mem_cgroup_is_root(memcg)) 2520 if (mem_cgroup_is_root(memcg))
2375 goto done; 2521 goto done;
2376 if (nr_pages == 1 && consume_stock(memcg)) 2522 if (consume_stock(memcg, nr_pages))
2377 goto done; 2523 goto done;
2378 css_get(&memcg->css); 2524 css_get(&memcg->css);
2379 } else { 2525 } else {
@@ -2398,7 +2544,7 @@ again:
2398 rcu_read_unlock(); 2544 rcu_read_unlock();
2399 goto done; 2545 goto done;
2400 } 2546 }
2401 if (nr_pages == 1 && consume_stock(memcg)) { 2547 if (consume_stock(memcg, nr_pages)) {
2402 /* 2548 /*
2403 * It seems dagerous to access memcg without css_get(). 2549 * It seems dagerous to access memcg without css_get().
2404 * But considering how consume_stok works, it's not 2550 * But considering how consume_stok works, it's not
@@ -2433,7 +2579,8 @@ again:
2433 nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES; 2579 nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;
2434 } 2580 }
2435 2581
2436 ret = mem_cgroup_do_charge(memcg, gfp_mask, batch, oom_check); 2582 ret = mem_cgroup_do_charge(memcg, gfp_mask, batch, nr_pages,
2583 oom_check);
2437 switch (ret) { 2584 switch (ret) {
2438 case CHARGE_OK: 2585 case CHARGE_OK:
2439 break; 2586 break;
@@ -2510,9 +2657,9 @@ static void __mem_cgroup_cancel_local_charge(struct mem_cgroup *memcg,
2510 2657
2511/* 2658/*
2512 * A helper function to get mem_cgroup from ID. must be called under 2659 * A helper function to get mem_cgroup from ID. must be called under
2513 * rcu_read_lock(). The caller must check css_is_removed() or some if 2660 * rcu_read_lock(). The caller is responsible for calling css_tryget if
2514 * it's concern. (dropping refcnt from swap can be called against removed 2661 * the mem_cgroup is used for charging. (dropping refcnt from swap can be
2515 * memcg.) 2662 * called against removed memcg.)
2516 */ 2663 */
2517static struct mem_cgroup *mem_cgroup_lookup(unsigned short id) 2664static struct mem_cgroup *mem_cgroup_lookup(unsigned short id)
2518{ 2665{
@@ -2626,6 +2773,766 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
2626 memcg_check_events(memcg, page); 2773 memcg_check_events(memcg, page);
2627} 2774}
2628 2775
2776static DEFINE_MUTEX(set_limit_mutex);
2777
2778#ifdef CONFIG_MEMCG_KMEM
/*
 * Returns true when kernel-memory allocations should be accounted to
 * @memcg: memcg is enabled, @memcg is not the root cgroup, and kmem
 * accounting has been activated for it (KMEM_ACCOUNTED_MASK set).
 */
static inline bool memcg_can_account_kmem(struct mem_cgroup *memcg)
{
	return !mem_cgroup_disabled() && !mem_cgroup_is_root(memcg) &&
		(memcg->kmem_account_flags & KMEM_ACCOUNTED_MASK);
}
2784
/*
 * Map a per-memcg cache's params back to the kmem_cache it belongs to,
 * by indexing the root cache's memcg_caches[] array with the memcg's id.
 *
 * This is a bit cumbersome, but it is rarely used and avoids a backpointer
 * in the memcg_cache_params struct.
 */
static struct kmem_cache *memcg_params_to_cache(struct memcg_cache_params *p)
{
	struct kmem_cache *cachep;

	/* only per-memcg params carry a root_cache backlink */
	VM_BUG_ON(p->is_root_cache);
	cachep = p->root_cache;
	return cachep->memcg_params->memcg_caches[memcg_cache_id(p->memcg)];
}
2797
2798#ifdef CONFIG_SLABINFO
/*
 * cgroupfs read handler: emit a /proc/slabinfo-style listing of this
 * memcg's slab caches. Returns -EIO for cgroups that are not
 * kmem-accounted (nothing meaningful to show).
 */
static int mem_cgroup_slabinfo_read(struct cgroup *cont, struct cftype *cft,
				    struct seq_file *m)
{
	struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
	struct memcg_cache_params *params;

	if (!memcg_can_account_kmem(memcg))
		return -EIO;

	print_slabinfo_header(m);

	/* slab_caches_mutex guards the memcg_slab_caches list */
	mutex_lock(&memcg->slab_caches_mutex);
	list_for_each_entry(params, &memcg->memcg_slab_caches, list)
		cache_show(memcg_params_to_cache(params), m);
	mutex_unlock(&memcg->slab_caches_mutex);

	return 0;
}
2817#endif
2818
/*
 * Charge @size bytes of kernel memory to @memcg: first against the
 * dedicated kmem res_counter, then against the regular memory (and, with
 * swap accounting, memsw) counters via __mem_cgroup_try_charge().
 * Returns 0 on success or a negative errno; on failure the kmem charge
 * is rolled back.
 */
static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, u64 size)
{
	struct res_counter *fail_res;
	struct mem_cgroup *_memcg;
	int ret = 0;
	bool may_oom;

	ret = res_counter_charge(&memcg->kmem, size, &fail_res);
	if (ret)
		return ret;

	/*
	 * Conditions under which we can wait for the oom_killer. Those are
	 * the same conditions tested by the core page allocator
	 */
	may_oom = (gfp & __GFP_FS) && !(gfp & __GFP_NORETRY);

	_memcg = memcg;
	ret = __mem_cgroup_try_charge(NULL, gfp, size >> PAGE_SHIFT,
				      &_memcg, may_oom);

	if (ret == -EINTR)  {
		/*
		 * __mem_cgroup_try_charge() chose to bypass to root due to
		 * OOM kill or fatal signal. Since our only options are to
		 * either fail the allocation or charge it to this cgroup, do
		 * it as a temporary condition. But we can't fail. From a
		 * kmem/slab perspective, the cache has already been selected,
		 * by mem_cgroup_kmem_get_cache(), so it is too late to change
		 * our minds.
		 *
		 * This condition will only trigger if the task entered
		 * memcg_charge_kmem in a sane state, but was OOM-killed during
		 * __mem_cgroup_try_charge() above. Tasks that were already
		 * dying when the allocation triggers should have been already
		 * directed to the root cgroup in memcontrol.h
		 */
		res_counter_charge_nofail(&memcg->res, size, &fail_res);
		if (do_swap_account)
			res_counter_charge_nofail(&memcg->memsw, size,
						  &fail_res);
		ret = 0;
	} else if (ret)
		/* memory/memsw charge failed: undo the kmem charge */
		res_counter_uncharge(&memcg->kmem, size);

	return ret;
}
2866
/*
 * Undo a kmem charge of @size bytes against @memcg. When the kmem usage
 * of a dead (already removed) memcg drops to zero, also release the
 * reference that was keeping the memcg structure alive.
 */
static void memcg_uncharge_kmem(struct mem_cgroup *memcg, u64 size)
{
	res_counter_uncharge(&memcg->res, size);
	if (do_swap_account)
		res_counter_uncharge(&memcg->memsw, size);

	/* Not down to 0 */
	if (res_counter_uncharge(&memcg->kmem, size))
		return;

	/* last kmem byte gone from a dead memcg: drop its pinning ref */
	if (memcg_kmem_test_and_clear_dead(memcg))
		mem_cgroup_put(memcg);
}
2880
2881void memcg_cache_list_add(struct mem_cgroup *memcg, struct kmem_cache *cachep)
2882{
2883 if (!memcg)
2884 return;
2885
2886 mutex_lock(&memcg->slab_caches_mutex);
2887 list_add(&cachep->memcg_params->list, &memcg->memcg_slab_caches);
2888 mutex_unlock(&memcg->slab_caches_mutex);
2889}
2890
2891/*
2892 * helper for acessing a memcg's index. It will be used as an index in the
2893 * child cache array in kmem_cache, and also to derive its name. This function
2894 * will return -1 when this is not a kmem-limited memcg.
2895 */
2896int memcg_cache_id(struct mem_cgroup *memcg)
2897{
2898 return memcg ? memcg->kmemcg_id : -1;
2899}
2900
/*
 * Activate kmem accounting for @memcg: allocate a kmemcg id, grow every
 * root cache's memcg_caches[] array to fit it, and initialize the
 * memcg's slab-cache bookkeeping. Returns 0 on success, negative errno
 * otherwise (with the id and activated flag rolled back).
 *
 * This ends up being protected by the set_limit mutex, during normal
 * operation, because that is its main call site.
 *
 * But when we create a new cache, we can call this as well if its parent
 * is kmem-limited. That will have to hold set_limit_mutex as well.
 */
int memcg_update_cache_sizes(struct mem_cgroup *memcg)
{
	int num, ret;

	num = ida_simple_get(&kmem_limited_groups,
			     0, MEMCG_CACHES_MAX_SIZE, GFP_KERNEL);
	if (num < 0)
		return num;
	/*
	 * After this point, kmem_accounted (that we test atomically in
	 * the beginning of this conditional), is no longer 0. This
	 * guarantees only one process will set the following boolean
	 * to true. We don't need test_and_set because we're protected
	 * by the set_limit_mutex anyway.
	 */
	memcg_kmem_set_activated(memcg);

	ret = memcg_update_all_caches(num+1);
	if (ret) {
		/* roll back: return the id and clear the activated flag */
		ida_simple_remove(&kmem_limited_groups, num);
		memcg_kmem_clear_activated(memcg);
		return ret;
	}

	memcg->kmemcg_id = num;
	INIT_LIST_HEAD(&memcg->memcg_slab_caches);
	mutex_init(&memcg->slab_caches_mutex);
	return 0;
}
2937
2938static size_t memcg_caches_array_size(int num_groups)
2939{
2940 ssize_t size;
2941 if (num_groups <= 0)
2942 return 0;
2943
2944 size = 2 * num_groups;
2945 if (size < MEMCG_CACHES_MIN_SIZE)
2946 size = MEMCG_CACHES_MIN_SIZE;
2947 else if (size > MEMCG_CACHES_MAX_SIZE)
2948 size = MEMCG_CACHES_MAX_SIZE;
2949
2950 return size;
2951}
2952
/*
 * We should update the current array size iff all caches updates succeed. This
 * can only be done from the slab side. The slab mutex needs to be held when
 * calling this.
 */
void memcg_update_array_size(int num)
{
	/* the published array size only ever grows, never shrinks */
	if (num > memcg_limited_groups_array_size)
		memcg_limited_groups_array_size = memcg_caches_array_size(num);
}
2963
/*
 * Grow root cache @s's memcg_params (with its trailing memcg_caches[]
 * array) so it can hold at least @num_groups per-memcg caches. Existing
 * per-memcg cache pointers are carried over to the new array. Called
 * with the slab mutex held. Returns 0 or -ENOMEM.
 */
int memcg_update_cache_size(struct kmem_cache *s, int num_groups)
{
	struct memcg_cache_params *cur_params = s->memcg_params;

	VM_BUG_ON(s->memcg_params && !s->memcg_params->is_root_cache);

	if (num_groups > memcg_limited_groups_array_size) {
		int i;
		ssize_t size = memcg_caches_array_size(num_groups);

		size *= sizeof(void *);
		size += sizeof(struct memcg_cache_params);

		s->memcg_params = kzalloc(size, GFP_KERNEL);
		if (!s->memcg_params) {
			/* keep the old (smaller but valid) array in place */
			s->memcg_params = cur_params;
			return -ENOMEM;
		}

		s->memcg_params->is_root_cache = true;

		/*
		 * There is the chance it will be bigger than
		 * memcg_limited_groups_array_size, if we failed an allocation
		 * in a cache, in which case all caches updated before it, will
		 * have a bigger array.
		 *
		 * But if that is the case, the data after
		 * memcg_limited_groups_array_size is certainly unused
		 */
		for (i = 0; i < memcg_limited_groups_array_size; i++) {
			if (!cur_params->memcg_caches[i])
				continue;
			s->memcg_params->memcg_caches[i] =
						cur_params->memcg_caches[i];
		}

		/*
		 * Ideally, we would wait until all caches succeed, and only
		 * then free the old one. But this is not worth the extra
		 * pointer per-cache we'd have to have for this.
		 *
		 * It is not a big deal if some caches are left with a size
		 * bigger than the others. And all updates will reset this
		 * anyway.
		 */
		kfree(cur_params);
	}
	return 0;
}
3014
/*
 * Allocate and initialize @s->memcg_params. For a root cache (@memcg is
 * NULL) the params also carry the memcg_caches[] pointer array; for a
 * per-memcg cache they record the owning memcg and the root cache it
 * clones. Returns 0 or -ENOMEM.
 */
int memcg_register_cache(struct mem_cgroup *memcg, struct kmem_cache *s,
			 struct kmem_cache *root_cache)
{
	size_t size = sizeof(struct memcg_cache_params);

	if (!memcg_kmem_enabled())
		return 0;

	/* only root caches need room for the per-memcg cache array */
	if (!memcg)
		size += memcg_limited_groups_array_size * sizeof(void *);

	s->memcg_params = kzalloc(size, GFP_KERNEL);
	if (!s->memcg_params)
		return -ENOMEM;

	if (memcg) {
		s->memcg_params->memcg = memcg;
		s->memcg_params->root_cache = root_cache;
	}
	return 0;
}
3036
/*
 * Tear down @s's memcg bookkeeping when the cache is destroyed. For a
 * per-memcg cache this clears the root cache's pointer to it, drops the
 * memcg reference taken at creation and unlinks it from the memcg's
 * slab-cache list; in all cases the params struct is freed.
 */
void memcg_release_cache(struct kmem_cache *s)
{
	struct kmem_cache *root;
	struct mem_cgroup *memcg;
	int id;

	/*
	 * This happens, for instance, when a root cache goes away before we
	 * add any memcg.
	 */
	if (!s->memcg_params)
		return;

	if (s->memcg_params->is_root_cache)
		goto out;

	memcg = s->memcg_params->memcg;
	id = memcg_cache_id(memcg);

	root = s->memcg_params->root_cache;
	root->memcg_params->memcg_caches[id] = NULL;
	mem_cgroup_put(memcg);

	mutex_lock(&memcg->slab_caches_mutex);
	list_del(&s->memcg_params->list);
	mutex_unlock(&memcg->slab_caches_mutex);

out:
	kfree(s->memcg_params);
}
3067
/*
 * During the creation a new cache, we need to disable our accounting mechanism
 * altogether. This is true even if we are not creating, but rather just
 * enqueuing new caches to be created.
 *
 * This is because that process will trigger allocations; some visible, like
 * explicit kmallocs to auxiliary data structures, name strings and internal
 * cache structures; some well concealed, like INIT_WORK() that can allocate
 * objects during debug.
 *
 * If any allocation happens during memcg_kmem_get_cache, we will recurse back
 * to it. This may not be a bounded recursion: since the first cache creation
 * failed to complete (waiting on the allocation), we'll just try to create the
 * cache again, failing at the same point.
 *
 * memcg_kmem_get_cache is prepared to abort after seeing a positive count of
 * memcg_kmem_skip_account. So we enclose anything that might allocate memory
 * inside the following two functions.
 */
static inline void memcg_stop_kmem_account(void)
{
	/* per-task counter, so only meaningful for tasks with an mm */
	VM_BUG_ON(!current->mm);
	current->memcg_kmem_skip_account++;
}

static inline void memcg_resume_kmem_account(void)
{
	VM_BUG_ON(!current->mm);
	current->memcg_kmem_skip_account--;
}
3098
/*
 * Deferred destruction of a per-memcg cache. If the cache still holds
 * pages, shrink it and rely on memcg_release_pages() requeueing this
 * work when the last page is released; only destroy the cache outright
 * when it is already empty.
 */
static void kmem_cache_destroy_work_func(struct work_struct *w)
{
	struct kmem_cache *cachep;
	struct memcg_cache_params *p;

	p = container_of(w, struct memcg_cache_params, destroy);

	cachep = memcg_params_to_cache(p);

	/*
	 * If we get down to 0 after shrink, we could delete right away.
	 * However, memcg_release_pages() already puts us back in the workqueue
	 * in that case. If we proceed deleting, we'll get a dangling
	 * reference, and removing the object from the workqueue in that case
	 * is unnecessary complication. We are not a fast path.
	 *
	 * Note that this case is fundamentally different from racing with
	 * shrink_slab(): if memcg_cgroup_destroy_cache() is called in
	 * kmem_cache_shrink, not only we would be reinserting a dead cache
	 * into the queue, but doing so from inside the worker racing to
	 * destroy it.
	 *
	 * So if we aren't down to zero, we'll just schedule a worker and try
	 * again
	 */
	if (atomic_read(&cachep->memcg_params->nr_pages) != 0) {
		kmem_cache_shrink(cachep);
		if (atomic_read(&cachep->memcg_params->nr_pages) == 0)
			return;
	} else
		kmem_cache_destroy(cachep);
}
3131
/*
 * Request destruction of a per-memcg cache whose memcg has gone away
 * (dead flag set). The actual work is always deferred to a workqueue;
 * see below for why we must not cancel pending work here.
 */
void mem_cgroup_destroy_cache(struct kmem_cache *cachep)
{
	if (!cachep->memcg_params->dead)
		return;

	/*
	 * There are many ways in which we can get here.
	 *
	 * We can get to a memory-pressure situation while the delayed work is
	 * still pending to run. The vmscan shrinkers can then release all
	 * cache memory and get us to destruction. If this is the case, we'll
	 * be executed twice, which is a bug (the second time will execute over
	 * bogus data). In this case, cancelling the work should be fine.
	 *
	 * But we can also get here from the worker itself, if
	 * kmem_cache_shrink is enough to shake all the remaining objects and
	 * get the page count to 0. In this case, we'll deadlock if we try to
	 * cancel the work (the worker runs with an internal lock held, which
	 * is the same lock we would hold for cancel_work_sync().)
	 *
	 * Since we can't possibly know who got us here, just refrain from
	 * running if there is already work pending
	 */
	if (work_pending(&cachep->memcg_params->destroy))
		return;
	/*
	 * We have to defer the actual destroying to a workqueue, because
	 * we might currently be in a context that cannot sleep.
	 */
	schedule_work(&cachep->memcg_params->destroy);
}
3163
/*
 * Build the name for the per-memcg clone of cache @s:
 * "<root name>(<kmemcg id>:<cgroup directory name>)". Returns a
 * kasprintf()-allocated string the caller must kfree(), or NULL on
 * allocation failure.
 */
static char *memcg_cache_name(struct mem_cgroup *memcg, struct kmem_cache *s)
{
	char *name;
	struct dentry *dentry;

	rcu_read_lock();
	dentry = rcu_dereference(memcg->css.cgroup->dentry);
	rcu_read_unlock();

	/* the cgroup is alive (caller holds a css ref), so it has a dentry */
	BUG_ON(dentry == NULL);

	name = kasprintf(GFP_KERNEL, "%s(%d:%s)", s->name,
			 memcg_cache_id(memcg), dentry->d_name.name);

	return name;
}
3180
/*
 * Create the per-memcg clone of root cache @s: same object size,
 * alignment, flags (minus SLAB_PANIC) and constructor. The clone's page
 * allocations are tagged __GFP_KMEMCG so they get charged to the memcg.
 * Returns the new cache, or NULL on failure.
 */
static struct kmem_cache *kmem_cache_dup(struct mem_cgroup *memcg,
					 struct kmem_cache *s)
{
	char *name;
	struct kmem_cache *new;

	name = memcg_cache_name(memcg, s);
	if (!name)
		return NULL;

	new = kmem_cache_create_memcg(memcg, name, s->object_size, s->align,
				      (s->flags & ~SLAB_PANIC), s->ctor, s);

	if (new)
		new->allocflags |= __GFP_KMEMCG;

	/* kmem_cache_create_memcg() duplicates the name */
	kfree(name);
	return new;
}
3200
/*
 * This lock protects updaters, not readers. We want readers to be as fast as
 * they can, and they will either see NULL or a valid cache value. Our model
 * allow them to see NULL, in which case the root memcg will be selected.
 *
 * We need this lock because multiple allocations to the same cache may span
 * more than one worker. Only one of them can create the cache.
 */
static DEFINE_MUTEX(memcg_cache_mutex);

/*
 * Create (or find, if already created by a racing worker) the @memcg
 * clone of root cache @cachep. On any failure the root @cachep itself is
 * returned, so callers always get a usable cache. Takes a memcg
 * reference on behalf of the new cache.
 */
static struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg,
						  struct kmem_cache *cachep)
{
	struct kmem_cache *new_cachep;
	int idx;

	BUG_ON(!memcg_can_account_kmem(memcg));

	idx = memcg_cache_id(memcg);

	mutex_lock(&memcg_cache_mutex);
	new_cachep = cachep->memcg_params->memcg_caches[idx];
	if (new_cachep)
		goto out;

	new_cachep = kmem_cache_dup(memcg, cachep);
	if (new_cachep == NULL) {
		/* creation failed: fall back to the root cache */
		new_cachep = cachep;
		goto out;
	}

	mem_cgroup_get(memcg);
	atomic_set(&new_cachep->memcg_params->nr_pages , 0);

	cachep->memcg_params->memcg_caches[idx] = new_cachep;
	/*
	 * the readers won't lock, make sure everybody sees the updated value,
	 * so they won't put stuff in the queue again for no reason
	 */
	wmb();
out:
	mutex_unlock(&memcg_cache_mutex);
	return new_cachep;
}
3244
/*
 * Destroy every per-memcg clone of root cache @s, synchronously. Used
 * when the root cache itself is being destroyed.
 */
void kmem_cache_destroy_memcg_children(struct kmem_cache *s)
{
	struct kmem_cache *c;
	int i;

	if (!s->memcg_params)
		return;
	if (!s->memcg_params->is_root_cache)
		return;

	/*
	 * If the cache is being destroyed, we trust that there is no one else
	 * requesting objects from it. Even if there are, the sanity checks in
	 * kmem_cache_destroy should catch this ill-case.
	 *
	 * Still, we don't want anyone else freeing memcg_caches under our
	 * noses, which can happen if a new memcg comes to life. As usual,
	 * we'll take the set_limit_mutex to protect ourselves against this.
	 */
	mutex_lock(&set_limit_mutex);
	for (i = 0; i < memcg_limited_groups_array_size; i++) {
		c = s->memcg_params->memcg_caches[i];
		if (!c)
			continue;

		/*
		 * We will now manually delete the caches, so to avoid races
		 * we need to cancel all pending destruction workers and
		 * proceed with destruction ourselves.
		 *
		 * kmem_cache_destroy() will call kmem_cache_shrink internally,
		 * and that could spawn the workers again: it is likely that
		 * the cache still have active pages until this very moment.
		 * This would lead us back to mem_cgroup_destroy_cache.
		 *
		 * But that will not execute at all if the "dead" flag is not
		 * set, so flip it down to guarantee we are in control.
		 */
		c->memcg_params->dead = false;
		cancel_work_sync(&c->memcg_params->destroy);
		kmem_cache_destroy(c);
	}
	mutex_unlock(&set_limit_mutex);
}
3289
/* Deferred request to create a per-memcg clone of a root kmem cache. */
struct create_work {
	struct mem_cgroup *memcg;	/* target memcg; enqueue took a css ref */
	struct kmem_cache *cachep;	/* root cache to clone */
	struct work_struct work;
};
3295
/*
 * Called when @memcg is being torn down: mark every one of its slab
 * caches dead and queue their (deferred) destruction.
 */
static void mem_cgroup_destroy_all_caches(struct mem_cgroup *memcg)
{
	struct kmem_cache *cachep;
	struct memcg_cache_params *params;

	/* nothing to do if kmem accounting was never activated */
	if (!memcg_kmem_is_active(memcg))
		return;

	mutex_lock(&memcg->slab_caches_mutex);
	list_for_each_entry(params, &memcg->memcg_slab_caches, list) {
		cachep = memcg_params_to_cache(params);
		cachep->memcg_params->dead = true;
		INIT_WORK(&cachep->memcg_params->destroy,
				  kmem_cache_destroy_work_func);
		schedule_work(&cachep->memcg_params->destroy);
	}
	mutex_unlock(&memcg->slab_caches_mutex);
}
3314
/* Worker: actually create the per-memcg cache requested via enqueue. */
static void memcg_create_cache_work_func(struct work_struct *w)
{
	struct create_work *cw;

	cw = container_of(w, struct create_work, work);
	memcg_create_kmem_cache(cw->memcg, cw->cachep);
	/* Drop the reference gotten when we enqueued. */
	css_put(&cw->memcg->css);
	kfree(cw);
}
3325
/*
 * Enqueue the creation of a per-memcg kmem_cache.
 * Called with rcu_read_lock.
 */
static void __memcg_create_cache_enqueue(struct mem_cgroup *memcg,
					 struct kmem_cache *cachep)
{
	struct create_work *cw;

	/* GFP_NOWAIT: we hold rcu_read_lock and must not sleep */
	cw = kmalloc(sizeof(struct create_work), GFP_NOWAIT);
	if (cw == NULL)
		return;

	/* The corresponding put will be done in the workqueue. */
	if (!css_tryget(&memcg->css)) {
		/* memcg is on its way out; drop the request */
		kfree(cw);
		return;
	}

	cw->memcg = memcg;
	cw->cachep = cachep;

	INIT_WORK(&cw->work, memcg_create_cache_work_func);
	schedule_work(&cw->work);
}
3351
/*
 * Wrapper around __memcg_create_cache_enqueue() that suppresses kmem
 * accounting for the duration, to avoid unbounded recursion (see the
 * comment above memcg_stop_kmem_account()).
 */
static void memcg_create_cache_enqueue(struct mem_cgroup *memcg,
				       struct kmem_cache *cachep)
{
	/*
	 * We need to stop accounting when we kmalloc, because if the
	 * corresponding kmalloc cache is not yet created, the first allocation
	 * in __memcg_create_cache_enqueue will recurse.
	 *
	 * However, it is better to enclose the whole function. Depending on
	 * the debugging options enabled, INIT_WORK(), for instance, can
	 * trigger an allocation. This too, will make us recurse. Because at
	 * this point we can't allow ourselves back into memcg_kmem_get_cache,
	 * the safest choice is to do it like this, wrapping the whole function.
	 */
	memcg_stop_kmem_account();
	__memcg_create_cache_enqueue(memcg, cachep);
	memcg_resume_kmem_account();
}
/*
 * Return the kmem_cache we're supposed to use for a slab allocation.
 * We try to use the current memcg's version of the cache.
 *
 * If the cache does not exist yet, if we are the first user of it,
 * we either create it immediately, if possible, or create it asynchronously
 * in a workqueue.
 * In the latter case, we will let the current allocation go through with
 * the original cache.
 *
 * Can't be called in interrupt context or from kernel threads.
 * This function needs to be called with rcu_read_lock() held.
 */
struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep,
					  gfp_t gfp)
{
	struct mem_cgroup *memcg;
	int idx;

	VM_BUG_ON(!cachep->memcg_params);
	VM_BUG_ON(!cachep->memcg_params->is_root_cache);

	/* kernel threads, and tasks inside cache creation, use the root cache */
	if (!current->mm || current->memcg_kmem_skip_account)
		return cachep;

	rcu_read_lock();
	memcg = mem_cgroup_from_task(rcu_dereference(current->mm->owner));
	rcu_read_unlock();

	if (!memcg_can_account_kmem(memcg))
		return cachep;

	idx = memcg_cache_id(memcg);

	/*
	 * barrier to make sure we're always seeing the up to date value. The
	 * code updating memcg_caches will issue a write barrier to match this.
	 */
	read_barrier_depends();
	if (unlikely(cachep->memcg_params->memcg_caches[idx] == NULL)) {
		/*
		 * If we are in a safe context (can wait, and not in interrupt
		 * context), we could be predictable and return right away.
		 * This would guarantee that the allocation being performed
		 * already belongs in the new cache.
		 *
		 * However, there are some clashes that can arrive from locking.
		 * For instance, because we acquire the slab_mutex while doing
		 * kmem_cache_dup, this means no further allocation could happen
		 * with the slab_mutex held.
		 *
		 * Also, because cache creation issue get_online_cpus(), this
		 * creates a lock chain: memcg_slab_mutex -> cpu_hotplug_mutex,
		 * that ends up reversed during cpu hotplug. (cpuset allocates
		 * a bunch of GFP_KERNEL memory during cpuup). Due to all that,
		 * better to defer everything.
		 */
		memcg_create_cache_enqueue(memcg, cachep);
		return cachep;
	}

	return cachep->memcg_params->memcg_caches[idx];
}
EXPORT_SYMBOL(__memcg_kmem_get_cache);
3434
/*
 * We need to verify if the allocation against current->mm->owner's memcg is
 * possible for the given order. But the page is not allocated yet, so we'll
 * need a further commit step to do the final arrangements.
 *
 * It is possible for the task to switch cgroups in this mean time, so at
 * commit time, we can't rely on task conversion any longer. We'll then use
 * the handle argument to return to the caller which cgroup we should commit
 * against. We could also return the memcg directly and avoid the pointer
 * passing, but a boolean return value gives better semantics considering
 * the compiled-out case as well.
 *
 * Returning true means the allocation is possible.
 */
bool
__memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **_memcg, int order)
{
	struct mem_cgroup *memcg;
	int ret;

	*_memcg = NULL;
	memcg = try_get_mem_cgroup_from_mm(current->mm);

	/*
	 * very rare case described in mem_cgroup_from_task. Unfortunately there
	 * isn't much we can do without complicating this too much, and it would
	 * be gfp-dependent anyway. Just let it go
	 */
	if (unlikely(!memcg))
		return true;

	if (!memcg_can_account_kmem(memcg)) {
		/* no accounting for this memcg: allow, drop our ref */
		css_put(&memcg->css);
		return true;
	}

	ret = memcg_charge_kmem(memcg, gfp, PAGE_SIZE << order);
	if (!ret)
		*_memcg = memcg;

	css_put(&memcg->css);
	return (ret == 0);
}
3478
/*
 * Second half of a kmem page charge: bind the freshly-allocated @page to
 * @memcg (the cgroup returned by __memcg_kmem_newpage_charge). A NULL
 * @page means the allocation failed and the charge is reverted.
 */
void __memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg,
			      int order)
{
	struct page_cgroup *pc;

	VM_BUG_ON(mem_cgroup_is_root(memcg));

	/* The page allocation failed. Revert */
	if (!page) {
		memcg_uncharge_kmem(memcg, PAGE_SIZE << order);
		return;
	}

	pc = lookup_page_cgroup(page);
	lock_page_cgroup(pc);
	pc->mem_cgroup = memcg;
	SetPageCgroupUsed(pc);
	unlock_page_cgroup(pc);
}
3498
/*
 * Uncharge a kmem page on free: detach @page from its memcg (if any) and
 * return the charge of 2^@order pages to that memcg's counters.
 */
void __memcg_kmem_uncharge_pages(struct page *page, int order)
{
	struct mem_cgroup *memcg = NULL;
	struct page_cgroup *pc;


	pc = lookup_page_cgroup(page);
	/*
	 * Fast unlocked return. Theoretically might have changed, have to
	 * check again after locking.
	 */
	if (!PageCgroupUsed(pc))
		return;

	lock_page_cgroup(pc);
	if (PageCgroupUsed(pc)) {
		memcg = pc->mem_cgroup;
		ClearPageCgroupUsed(pc);
	}
	unlock_page_cgroup(pc);

	/*
	 * We trust that only if there is a memcg associated with the page, it
	 * is a valid allocation
	 */
	if (!memcg)
		return;

	VM_BUG_ON(mem_cgroup_is_root(memcg));
	memcg_uncharge_kmem(memcg, PAGE_SIZE << order);
}
3530#else
/* CONFIG_MEMCG_KMEM=n stub: no per-memcg slab caches exist to destroy. */
static inline void mem_cgroup_destroy_all_caches(struct mem_cgroup *memcg)
{
}
3534#endif /* CONFIG_MEMCG_KMEM */
3535
2629#ifdef CONFIG_TRANSPARENT_HUGEPAGE 3536#ifdef CONFIG_TRANSPARENT_HUGEPAGE
2630 3537
2631#define PCGF_NOCOPY_AT_SPLIT (1 << PCG_LOCK | 1 << PCG_MIGRATION) 3538#define PCGF_NOCOPY_AT_SPLIT (1 << PCG_LOCK | 1 << PCG_MIGRATION)
@@ -2709,13 +3616,6 @@ static int mem_cgroup_move_account(struct page *page,
2709 /* caller should have done css_get */ 3616 /* caller should have done css_get */
2710 pc->mem_cgroup = to; 3617 pc->mem_cgroup = to;
2711 mem_cgroup_charge_statistics(to, anon, nr_pages); 3618 mem_cgroup_charge_statistics(to, anon, nr_pages);
2712 /*
2713 * We charges against "to" which may not have any tasks. Then, "to"
2714 * can be under rmdir(). But in current implementation, caller of
2715 * this function is just force_empty() and move charge, so it's
2716 * guaranteed that "to" is never removed. So, we don't check rmdir
2717 * status here.
2718 */
2719 move_unlock_mem_cgroup(from, &flags); 3619 move_unlock_mem_cgroup(from, &flags);
2720 ret = 0; 3620 ret = 0;
2721unlock: 3621unlock:
@@ -2729,10 +3629,27 @@ out:
2729 return ret; 3629 return ret;
2730} 3630}
2731 3631
2732/* 3632/**
2733 * move charges to its parent. 3633 * mem_cgroup_move_parent - moves page to the parent group
3634 * @page: the page to move
3635 * @pc: page_cgroup of the page
3636 * @child: page's cgroup
3637 *
3638 * move charges to its parent or the root cgroup if the group has no
3639 * parent (aka use_hierarchy==0).
3640 * Although this might fail (get_page_unless_zero, isolate_lru_page or
3641 * mem_cgroup_move_account fails) the failure is always temporary and
3642 * it signals a race with a page removal/uncharge or migration. In the
3643 * first case the page is on the way out and it will vanish from the LRU
3644 * on the next attempt and the call should be retried later.
3645 * Isolation from the LRU fails only if page has been isolated from
3646 * the LRU since we looked at it and that usually means either global
3647 * reclaim or migration going on. The page will either get back to the
3648 * LRU or vanish.
3649 * Finaly mem_cgroup_move_account fails only if the page got uncharged
3650 * (!PageCgroupUsed) or moved to a different group. The page will
3651 * disappear in the next attempt.
2734 */ 3652 */
2735
2736static int mem_cgroup_move_parent(struct page *page, 3653static int mem_cgroup_move_parent(struct page *page,
2737 struct page_cgroup *pc, 3654 struct page_cgroup *pc,
2738 struct mem_cgroup *child) 3655 struct mem_cgroup *child)
@@ -2742,9 +3659,7 @@ static int mem_cgroup_move_parent(struct page *page,
2742 unsigned long uninitialized_var(flags); 3659 unsigned long uninitialized_var(flags);
2743 int ret; 3660 int ret;
2744 3661
2745 /* Is ROOT ? */ 3662 VM_BUG_ON(mem_cgroup_is_root(child));
2746 if (mem_cgroup_is_root(child))
2747 return -EINVAL;
2748 3663
2749 ret = -EBUSY; 3664 ret = -EBUSY;
2750 if (!get_page_unless_zero(page)) 3665 if (!get_page_unless_zero(page))
@@ -2761,8 +3676,10 @@ static int mem_cgroup_move_parent(struct page *page,
2761 if (!parent) 3676 if (!parent)
2762 parent = root_mem_cgroup; 3677 parent = root_mem_cgroup;
2763 3678
2764 if (nr_pages > 1) 3679 if (nr_pages > 1) {
3680 VM_BUG_ON(!PageTransHuge(page));
2765 flags = compound_lock_irqsave(page); 3681 flags = compound_lock_irqsave(page);
3682 }
2766 3683
2767 ret = mem_cgroup_move_account(page, nr_pages, 3684 ret = mem_cgroup_move_account(page, nr_pages,
2768 pc, child, parent); 3685 pc, child, parent);
@@ -2904,7 +3821,6 @@ __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *memcg,
2904 return; 3821 return;
2905 if (!memcg) 3822 if (!memcg)
2906 return; 3823 return;
2907 cgroup_exclude_rmdir(&memcg->css);
2908 3824
2909 __mem_cgroup_commit_charge(memcg, page, 1, ctype, true); 3825 __mem_cgroup_commit_charge(memcg, page, 1, ctype, true);
2910 /* 3826 /*
@@ -2918,12 +3834,6 @@ __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *memcg,
2918 swp_entry_t ent = {.val = page_private(page)}; 3834 swp_entry_t ent = {.val = page_private(page)};
2919 mem_cgroup_uncharge_swap(ent); 3835 mem_cgroup_uncharge_swap(ent);
2920 } 3836 }
2921 /*
2922 * At swapin, we may charge account against cgroup which has no tasks.
2923 * So, rmdir()->pre_destroy() can be called while we do this charge.
2924 * In that case, we need to call pre_destroy() again. check it here.
2925 */
2926 cgroup_release_and_wakeup_rmdir(&memcg->css);
2927} 3837}
2928 3838
2929void mem_cgroup_commit_charge_swapin(struct page *page, 3839void mem_cgroup_commit_charge_swapin(struct page *page,
@@ -3288,15 +4198,18 @@ void mem_cgroup_prepare_migration(struct page *page, struct page *newpage,
3288 struct mem_cgroup **memcgp) 4198 struct mem_cgroup **memcgp)
3289{ 4199{
3290 struct mem_cgroup *memcg = NULL; 4200 struct mem_cgroup *memcg = NULL;
4201 unsigned int nr_pages = 1;
3291 struct page_cgroup *pc; 4202 struct page_cgroup *pc;
3292 enum charge_type ctype; 4203 enum charge_type ctype;
3293 4204
3294 *memcgp = NULL; 4205 *memcgp = NULL;
3295 4206
3296 VM_BUG_ON(PageTransHuge(page));
3297 if (mem_cgroup_disabled()) 4207 if (mem_cgroup_disabled())
3298 return; 4208 return;
3299 4209
4210 if (PageTransHuge(page))
4211 nr_pages <<= compound_order(page);
4212
3300 pc = lookup_page_cgroup(page); 4213 pc = lookup_page_cgroup(page);
3301 lock_page_cgroup(pc); 4214 lock_page_cgroup(pc);
3302 if (PageCgroupUsed(pc)) { 4215 if (PageCgroupUsed(pc)) {
@@ -3358,7 +4271,7 @@ void mem_cgroup_prepare_migration(struct page *page, struct page *newpage,
3358 * charged to the res_counter since we plan on replacing the 4271 * charged to the res_counter since we plan on replacing the
3359 * old one and only one page is going to be left afterwards. 4272 * old one and only one page is going to be left afterwards.
3360 */ 4273 */
3361 __mem_cgroup_commit_charge(memcg, newpage, 1, ctype, false); 4274 __mem_cgroup_commit_charge(memcg, newpage, nr_pages, ctype, false);
3362} 4275}
3363 4276
3364/* remove redundant charge if migration failed*/ 4277/* remove redundant charge if migration failed*/
@@ -3371,8 +4284,7 @@ void mem_cgroup_end_migration(struct mem_cgroup *memcg,
3371 4284
3372 if (!memcg) 4285 if (!memcg)
3373 return; 4286 return;
3374 /* blocks rmdir() */ 4287
3375 cgroup_exclude_rmdir(&memcg->css);
3376 if (!migration_ok) { 4288 if (!migration_ok) {
3377 used = oldpage; 4289 used = oldpage;
3378 unused = newpage; 4290 unused = newpage;
@@ -3406,13 +4318,6 @@ void mem_cgroup_end_migration(struct mem_cgroup *memcg,
3406 */ 4318 */
3407 if (anon) 4319 if (anon)
3408 mem_cgroup_uncharge_page(used); 4320 mem_cgroup_uncharge_page(used);
3409 /*
3410 * At migration, we may charge account against cgroup which has no
3411 * tasks.
3412 * So, rmdir()->pre_destroy() can be called while we do this charge.
3413 * In that case, we need to call pre_destroy() again. check it here.
3414 */
3415 cgroup_release_and_wakeup_rmdir(&memcg->css);
3416} 4321}
3417 4322
3418/* 4323/*
@@ -3490,8 +4395,6 @@ void mem_cgroup_print_bad_page(struct page *page)
3490} 4395}
3491#endif 4396#endif
3492 4397
3493static DEFINE_MUTEX(set_limit_mutex);
3494
3495static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, 4398static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
3496 unsigned long long val) 4399 unsigned long long val)
3497{ 4400{
@@ -3712,17 +4615,22 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
3712 return nr_reclaimed; 4615 return nr_reclaimed;
3713} 4616}
3714 4617
3715/* 4618/**
4619 * mem_cgroup_force_empty_list - clears LRU of a group
4620 * @memcg: group to clear
4621 * @node: NUMA node
4622 * @zid: zone id
4623 * @lru: lru to to clear
4624 *
3716 * Traverse a specified page_cgroup list and try to drop them all. This doesn't 4625 * Traverse a specified page_cgroup list and try to drop them all. This doesn't
3717 * reclaim the pages page themselves - it just removes the page_cgroups. 4626 * reclaim the pages page themselves - pages are moved to the parent (or root)
3718 * Returns true if some page_cgroups were not freed, indicating that the caller 4627 * group.
3719 * must retry this operation.
3720 */ 4628 */
3721static bool mem_cgroup_force_empty_list(struct mem_cgroup *memcg, 4629static void mem_cgroup_force_empty_list(struct mem_cgroup *memcg,
3722 int node, int zid, enum lru_list lru) 4630 int node, int zid, enum lru_list lru)
3723{ 4631{
3724 struct lruvec *lruvec; 4632 struct lruvec *lruvec;
3725 unsigned long flags, loop; 4633 unsigned long flags;
3726 struct list_head *list; 4634 struct list_head *list;
3727 struct page *busy; 4635 struct page *busy;
3728 struct zone *zone; 4636 struct zone *zone;
@@ -3731,11 +4639,8 @@ static bool mem_cgroup_force_empty_list(struct mem_cgroup *memcg,
3731 lruvec = mem_cgroup_zone_lruvec(zone, memcg); 4639 lruvec = mem_cgroup_zone_lruvec(zone, memcg);
3732 list = &lruvec->lists[lru]; 4640 list = &lruvec->lists[lru];
3733 4641
3734 loop = mem_cgroup_get_lru_size(lruvec, lru);
3735 /* give some margin against EBUSY etc...*/
3736 loop += 256;
3737 busy = NULL; 4642 busy = NULL;
3738 while (loop--) { 4643 do {
3739 struct page_cgroup *pc; 4644 struct page_cgroup *pc;
3740 struct page *page; 4645 struct page *page;
3741 4646
@@ -3761,76 +4666,80 @@ static bool mem_cgroup_force_empty_list(struct mem_cgroup *memcg,
3761 cond_resched(); 4666 cond_resched();
3762 } else 4667 } else
3763 busy = NULL; 4668 busy = NULL;
3764 } 4669 } while (!list_empty(list));
3765 return !list_empty(list);
3766} 4670}
3767 4671
3768/* 4672/*
3769 * make mem_cgroup's charge to be 0 if there is no task. 4673 * make mem_cgroup's charge to be 0 if there is no task by moving
4674 * all the charges and pages to the parent.
3770 * This enables deleting this mem_cgroup. 4675 * This enables deleting this mem_cgroup.
4676 *
4677 * Caller is responsible for holding css reference on the memcg.
3771 */ 4678 */
3772static int mem_cgroup_force_empty(struct mem_cgroup *memcg, bool free_all) 4679static void mem_cgroup_reparent_charges(struct mem_cgroup *memcg)
3773{ 4680{
3774 int ret; 4681 int node, zid;
3775 int node, zid, shrink; 4682 u64 usage;
3776 int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
3777 struct cgroup *cgrp = memcg->css.cgroup;
3778
3779 css_get(&memcg->css);
3780 4683
3781 shrink = 0;
3782 /* should free all ? */
3783 if (free_all)
3784 goto try_to_free;
3785move_account:
3786 do { 4684 do {
3787 ret = -EBUSY;
3788 if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children))
3789 goto out;
3790 /* This is for making all *used* pages to be on LRU. */ 4685 /* This is for making all *used* pages to be on LRU. */
3791 lru_add_drain_all(); 4686 lru_add_drain_all();
3792 drain_all_stock_sync(memcg); 4687 drain_all_stock_sync(memcg);
3793 ret = 0;
3794 mem_cgroup_start_move(memcg); 4688 mem_cgroup_start_move(memcg);
3795 for_each_node_state(node, N_HIGH_MEMORY) { 4689 for_each_node_state(node, N_MEMORY) {
3796 for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) { 4690 for (zid = 0; zid < MAX_NR_ZONES; zid++) {
3797 enum lru_list lru; 4691 enum lru_list lru;
3798 for_each_lru(lru) { 4692 for_each_lru(lru) {
3799 ret = mem_cgroup_force_empty_list(memcg, 4693 mem_cgroup_force_empty_list(memcg,
3800 node, zid, lru); 4694 node, zid, lru);
3801 if (ret)
3802 break;
3803 } 4695 }
3804 } 4696 }
3805 if (ret)
3806 break;
3807 } 4697 }
3808 mem_cgroup_end_move(memcg); 4698 mem_cgroup_end_move(memcg);
3809 memcg_oom_recover(memcg); 4699 memcg_oom_recover(memcg);
3810 cond_resched(); 4700 cond_resched();
3811 /* "ret" should also be checked to ensure all lists are empty. */
3812 } while (res_counter_read_u64(&memcg->res, RES_USAGE) > 0 || ret);
3813out:
3814 css_put(&memcg->css);
3815 return ret;
3816 4701
3817try_to_free: 4702 /*
4703 * Kernel memory may not necessarily be trackable to a specific
4704 * process. So they are not migrated, and therefore we can't
4705 * expect their value to drop to 0 here.
4706 * Having res filled up with kmem only is enough.
4707 *
4708 * This is a safety check because mem_cgroup_force_empty_list
4709 * could have raced with mem_cgroup_replace_page_cache callers
4710 * so the lru seemed empty but the page could have been added
4711 * right after the check. RES_USAGE should be safe as we always
4712 * charge before adding to the LRU.
4713 */
4714 usage = res_counter_read_u64(&memcg->res, RES_USAGE) -
4715 res_counter_read_u64(&memcg->kmem, RES_USAGE);
4716 } while (usage > 0);
4717}
4718
4719/*
4720 * Reclaims as many pages from the given memcg as possible and moves
4721 * the rest to the parent.
4722 *
4723 * Caller is responsible for holding css reference for memcg.
4724 */
4725static int mem_cgroup_force_empty(struct mem_cgroup *memcg)
4726{
4727 int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
4728 struct cgroup *cgrp = memcg->css.cgroup;
4729
3818 /* returns EBUSY if there is a task or if we come here twice. */ 4730 /* returns EBUSY if there is a task or if we come here twice. */
3819 if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children) || shrink) { 4731 if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children))
3820 ret = -EBUSY; 4732 return -EBUSY;
3821 goto out; 4733
3822 }
3823 /* we call try-to-free pages for make this cgroup empty */ 4734 /* we call try-to-free pages for make this cgroup empty */
3824 lru_add_drain_all(); 4735 lru_add_drain_all();
3825 /* try to free all pages in this cgroup */ 4736 /* try to free all pages in this cgroup */
3826 shrink = 1;
3827 while (nr_retries && res_counter_read_u64(&memcg->res, RES_USAGE) > 0) { 4737 while (nr_retries && res_counter_read_u64(&memcg->res, RES_USAGE) > 0) {
3828 int progress; 4738 int progress;
3829 4739
3830 if (signal_pending(current)) { 4740 if (signal_pending(current))
3831 ret = -EINTR; 4741 return -EINTR;
3832 goto out; 4742
3833 }
3834 progress = try_to_free_mem_cgroup_pages(memcg, GFP_KERNEL, 4743 progress = try_to_free_mem_cgroup_pages(memcg, GFP_KERNEL,
3835 false); 4744 false);
3836 if (!progress) { 4745 if (!progress) {
@@ -3841,13 +4750,23 @@ try_to_free:
3841 4750
3842 } 4751 }
3843 lru_add_drain(); 4752 lru_add_drain();
3844 /* try move_account...there may be some *locked* pages. */ 4753 mem_cgroup_reparent_charges(memcg);
3845 goto move_account; 4754
4755 return 0;
3846} 4756}
3847 4757
3848static int mem_cgroup_force_empty_write(struct cgroup *cont, unsigned int event) 4758static int mem_cgroup_force_empty_write(struct cgroup *cont, unsigned int event)
3849{ 4759{
3850 return mem_cgroup_force_empty(mem_cgroup_from_cont(cont), true); 4760 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
4761 int ret;
4762
4763 if (mem_cgroup_is_root(memcg))
4764 return -EINVAL;
4765 css_get(&memcg->css);
4766 ret = mem_cgroup_force_empty(memcg);
4767 css_put(&memcg->css);
4768
4769 return ret;
3851} 4770}
3852 4771
3853 4772
@@ -3938,7 +4857,8 @@ static ssize_t mem_cgroup_read(struct cgroup *cont, struct cftype *cft,
3938 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); 4857 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
3939 char str[64]; 4858 char str[64];
3940 u64 val; 4859 u64 val;
3941 int type, name, len; 4860 int name, len;
4861 enum res_type type;
3942 4862
3943 type = MEMFILE_TYPE(cft->private); 4863 type = MEMFILE_TYPE(cft->private);
3944 name = MEMFILE_ATTR(cft->private); 4864 name = MEMFILE_ATTR(cft->private);
@@ -3959,6 +4879,9 @@ static ssize_t mem_cgroup_read(struct cgroup *cont, struct cftype *cft,
3959 else 4879 else
3960 val = res_counter_read_u64(&memcg->memsw, name); 4880 val = res_counter_read_u64(&memcg->memsw, name);
3961 break; 4881 break;
4882 case _KMEM:
4883 val = res_counter_read_u64(&memcg->kmem, name);
4884 break;
3962 default: 4885 default:
3963 BUG(); 4886 BUG();
3964 } 4887 }
@@ -3966,6 +4889,125 @@ static ssize_t mem_cgroup_read(struct cgroup *cont, struct cftype *cft,
3966 len = scnprintf(str, sizeof(str), "%llu\n", (unsigned long long)val); 4889 len = scnprintf(str, sizeof(str), "%llu\n", (unsigned long long)val);
3967 return simple_read_from_buffer(buf, nbytes, ppos, str, len); 4890 return simple_read_from_buffer(buf, nbytes, ppos, str, len);
3968} 4891}
4892
4893static int memcg_update_kmem_limit(struct cgroup *cont, u64 val)
4894{
4895 int ret = -EINVAL;
4896#ifdef CONFIG_MEMCG_KMEM
4897 bool must_inc_static_branch = false;
4898
4899 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
4900 /*
4901 * For simplicity, we won't allow this to be disabled. It also can't
4902 * be changed if the cgroup has children already, or if tasks had
4903 * already joined.
4904 *
4905 * If tasks join before we set the limit, a person looking at
4906 * kmem.usage_in_bytes will have no way to determine when it took
4907 * place, which makes the value quite meaningless.
4908 *
4909 * After it first became limited, changes in the value of the limit are
4910 * of course permitted.
4911 *
4912 * Taking the cgroup_lock is really offensive, but it is so far the only
4913 * way to guarantee that no children will appear. There are plenty of
4914 * other offenders, and they should all go away. Fine grained locking
4915 * is probably the way to go here. When we are fully hierarchical, we
4916 * can also get rid of the use_hierarchy check.
4917 */
4918 cgroup_lock();
4919 mutex_lock(&set_limit_mutex);
4920 if (!memcg->kmem_account_flags && val != RESOURCE_MAX) {
4921 if (cgroup_task_count(cont) || (memcg->use_hierarchy &&
4922 !list_empty(&cont->children))) {
4923 ret = -EBUSY;
4924 goto out;
4925 }
4926 ret = res_counter_set_limit(&memcg->kmem, val);
4927 VM_BUG_ON(ret);
4928
4929 ret = memcg_update_cache_sizes(memcg);
4930 if (ret) {
4931 res_counter_set_limit(&memcg->kmem, RESOURCE_MAX);
4932 goto out;
4933 }
4934 must_inc_static_branch = true;
4935 /*
4936 * kmem charges can outlive the cgroup. In the case of slab
4937 * pages, for instance, a page contain objects from various
4938 * processes, so it is unfeasible to migrate them away. We
4939 * need to reference count the memcg because of that.
4940 */
4941 mem_cgroup_get(memcg);
4942 } else
4943 ret = res_counter_set_limit(&memcg->kmem, val);
4944out:
4945 mutex_unlock(&set_limit_mutex);
4946 cgroup_unlock();
4947
4948 /*
4949 * We are by now familiar with the fact that we can't inc the static
4950 * branch inside cgroup_lock. See disarm functions for details. A
4951 * worker here is overkill, but also wrong: After the limit is set, we
4952 * must start accounting right away. Since this operation can't fail,
4953 * we can safely defer it to here - no rollback will be needed.
4954 *
4955 * The boolean used to control this is also safe, because
4956 * KMEM_ACCOUNTED_ACTIVATED guarantees that only one process will be
4957 * able to set it to true;
4958 */
4959 if (must_inc_static_branch) {
4960 static_key_slow_inc(&memcg_kmem_enabled_key);
4961 /*
4962 * setting the active bit after the inc will guarantee no one
4963 * starts accounting before all call sites are patched
4964 */
4965 memcg_kmem_set_active(memcg);
4966 }
4967
4968#endif
4969 return ret;
4970}
4971
4972static int memcg_propagate_kmem(struct mem_cgroup *memcg)
4973{
4974 int ret = 0;
4975 struct mem_cgroup *parent = parent_mem_cgroup(memcg);
4976 if (!parent)
4977 goto out;
4978
4979 memcg->kmem_account_flags = parent->kmem_account_flags;
4980#ifdef CONFIG_MEMCG_KMEM
4981 /*
4982 * When that happen, we need to disable the static branch only on those
4983 * memcgs that enabled it. To achieve this, we would be forced to
4984 * complicate the code by keeping track of which memcgs were the ones
4985 * that actually enabled limits, and which ones got it from its
4986 * parents.
4987 *
4988 * It is a lot simpler just to do static_key_slow_inc() on every child
4989 * that is accounted.
4990 */
4991 if (!memcg_kmem_is_active(memcg))
4992 goto out;
4993
4994 /*
4995 * destroy(), called if we fail, will issue static_key_slow_inc() and
4996 * mem_cgroup_put() if kmem is enabled. We have to either call them
4997 * unconditionally, or clear the KMEM_ACTIVE flag. I personally find
4998 * this more consistent, since it always leads to the same destroy path
4999 */
5000 mem_cgroup_get(memcg);
5001 static_key_slow_inc(&memcg_kmem_enabled_key);
5002
5003 mutex_lock(&set_limit_mutex);
5004 ret = memcg_update_cache_sizes(memcg);
5005 mutex_unlock(&set_limit_mutex);
5006#endif
5007out:
5008 return ret;
5009}
5010
3969/* 5011/*
3970 * The user of this function is... 5012 * The user of this function is...
3971 * RES_LIMIT. 5013 * RES_LIMIT.
@@ -3974,7 +5016,8 @@ static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft,
3974 const char *buffer) 5016 const char *buffer)
3975{ 5017{
3976 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); 5018 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
3977 int type, name; 5019 enum res_type type;
5020 int name;
3978 unsigned long long val; 5021 unsigned long long val;
3979 int ret; 5022 int ret;
3980 5023
@@ -3996,8 +5039,12 @@ static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft,
3996 break; 5039 break;
3997 if (type == _MEM) 5040 if (type == _MEM)
3998 ret = mem_cgroup_resize_limit(memcg, val); 5041 ret = mem_cgroup_resize_limit(memcg, val);
3999 else 5042 else if (type == _MEMSWAP)
4000 ret = mem_cgroup_resize_memsw_limit(memcg, val); 5043 ret = mem_cgroup_resize_memsw_limit(memcg, val);
5044 else if (type == _KMEM)
5045 ret = memcg_update_kmem_limit(cont, val);
5046 else
5047 return -EINVAL;
4001 break; 5048 break;
4002 case RES_SOFT_LIMIT: 5049 case RES_SOFT_LIMIT:
4003 ret = res_counter_memparse_write_strategy(buffer, &val); 5050 ret = res_counter_memparse_write_strategy(buffer, &val);
@@ -4050,7 +5097,8 @@ out:
4050static int mem_cgroup_reset(struct cgroup *cont, unsigned int event) 5097static int mem_cgroup_reset(struct cgroup *cont, unsigned int event)
4051{ 5098{
4052 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); 5099 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
4053 int type, name; 5100 int name;
5101 enum res_type type;
4054 5102
4055 type = MEMFILE_TYPE(event); 5103 type = MEMFILE_TYPE(event);
4056 name = MEMFILE_ATTR(event); 5104 name = MEMFILE_ATTR(event);
@@ -4062,14 +5110,22 @@ static int mem_cgroup_reset(struct cgroup *cont, unsigned int event)
4062 case RES_MAX_USAGE: 5110 case RES_MAX_USAGE:
4063 if (type == _MEM) 5111 if (type == _MEM)
4064 res_counter_reset_max(&memcg->res); 5112 res_counter_reset_max(&memcg->res);
4065 else 5113 else if (type == _MEMSWAP)
4066 res_counter_reset_max(&memcg->memsw); 5114 res_counter_reset_max(&memcg->memsw);
5115 else if (type == _KMEM)
5116 res_counter_reset_max(&memcg->kmem);
5117 else
5118 return -EINVAL;
4067 break; 5119 break;
4068 case RES_FAILCNT: 5120 case RES_FAILCNT:
4069 if (type == _MEM) 5121 if (type == _MEM)
4070 res_counter_reset_failcnt(&memcg->res); 5122 res_counter_reset_failcnt(&memcg->res);
4071 else 5123 else if (type == _MEMSWAP)
4072 res_counter_reset_failcnt(&memcg->memsw); 5124 res_counter_reset_failcnt(&memcg->memsw);
5125 else if (type == _KMEM)
5126 res_counter_reset_failcnt(&memcg->kmem);
5127 else
5128 return -EINVAL;
4073 break; 5129 break;
4074 } 5130 }
4075 5131
@@ -4120,7 +5176,7 @@ static int memcg_numa_stat_show(struct cgroup *cont, struct cftype *cft,
4120 5176
4121 total_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL); 5177 total_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL);
4122 seq_printf(m, "total=%lu", total_nr); 5178 seq_printf(m, "total=%lu", total_nr);
4123 for_each_node_state(nid, N_HIGH_MEMORY) { 5179 for_each_node_state(nid, N_MEMORY) {
4124 node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL); 5180 node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL);
4125 seq_printf(m, " N%d=%lu", nid, node_nr); 5181 seq_printf(m, " N%d=%lu", nid, node_nr);
4126 } 5182 }
@@ -4128,7 +5184,7 @@ static int memcg_numa_stat_show(struct cgroup *cont, struct cftype *cft,
4128 5184
4129 file_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL_FILE); 5185 file_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL_FILE);
4130 seq_printf(m, "file=%lu", file_nr); 5186 seq_printf(m, "file=%lu", file_nr);
4131 for_each_node_state(nid, N_HIGH_MEMORY) { 5187 for_each_node_state(nid, N_MEMORY) {
4132 node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid, 5188 node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid,
4133 LRU_ALL_FILE); 5189 LRU_ALL_FILE);
4134 seq_printf(m, " N%d=%lu", nid, node_nr); 5190 seq_printf(m, " N%d=%lu", nid, node_nr);
@@ -4137,7 +5193,7 @@ static int memcg_numa_stat_show(struct cgroup *cont, struct cftype *cft,
4137 5193
4138 anon_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL_ANON); 5194 anon_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL_ANON);
4139 seq_printf(m, "anon=%lu", anon_nr); 5195 seq_printf(m, "anon=%lu", anon_nr);
4140 for_each_node_state(nid, N_HIGH_MEMORY) { 5196 for_each_node_state(nid, N_MEMORY) {
4141 node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid, 5197 node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid,
4142 LRU_ALL_ANON); 5198 LRU_ALL_ANON);
4143 seq_printf(m, " N%d=%lu", nid, node_nr); 5199 seq_printf(m, " N%d=%lu", nid, node_nr);
@@ -4146,7 +5202,7 @@ static int memcg_numa_stat_show(struct cgroup *cont, struct cftype *cft,
4146 5202
4147 unevictable_nr = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_UNEVICTABLE)); 5203 unevictable_nr = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_UNEVICTABLE));
4148 seq_printf(m, "unevictable=%lu", unevictable_nr); 5204 seq_printf(m, "unevictable=%lu", unevictable_nr);
4149 for_each_node_state(nid, N_HIGH_MEMORY) { 5205 for_each_node_state(nid, N_MEMORY) {
4150 node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid, 5206 node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid,
4151 BIT(LRU_UNEVICTABLE)); 5207 BIT(LRU_UNEVICTABLE));
4152 seq_printf(m, " N%d=%lu", nid, node_nr); 5208 seq_printf(m, " N%d=%lu", nid, node_nr);
@@ -4386,7 +5442,7 @@ static int mem_cgroup_usage_register_event(struct cgroup *cgrp,
4386 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); 5442 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
4387 struct mem_cgroup_thresholds *thresholds; 5443 struct mem_cgroup_thresholds *thresholds;
4388 struct mem_cgroup_threshold_ary *new; 5444 struct mem_cgroup_threshold_ary *new;
4389 int type = MEMFILE_TYPE(cft->private); 5445 enum res_type type = MEMFILE_TYPE(cft->private);
4390 u64 threshold, usage; 5446 u64 threshold, usage;
4391 int i, size, ret; 5447 int i, size, ret;
4392 5448
@@ -4469,7 +5525,7 @@ static void mem_cgroup_usage_unregister_event(struct cgroup *cgrp,
4469 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); 5525 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
4470 struct mem_cgroup_thresholds *thresholds; 5526 struct mem_cgroup_thresholds *thresholds;
4471 struct mem_cgroup_threshold_ary *new; 5527 struct mem_cgroup_threshold_ary *new;
4472 int type = MEMFILE_TYPE(cft->private); 5528 enum res_type type = MEMFILE_TYPE(cft->private);
4473 u64 usage; 5529 u64 usage;
4474 int i, j, size; 5530 int i, j, size;
4475 5531
@@ -4547,7 +5603,7 @@ static int mem_cgroup_oom_register_event(struct cgroup *cgrp,
4547{ 5603{
4548 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); 5604 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
4549 struct mem_cgroup_eventfd_list *event; 5605 struct mem_cgroup_eventfd_list *event;
4550 int type = MEMFILE_TYPE(cft->private); 5606 enum res_type type = MEMFILE_TYPE(cft->private);
4551 5607
4552 BUG_ON(type != _OOM_TYPE); 5608 BUG_ON(type != _OOM_TYPE);
4553 event = kmalloc(sizeof(*event), GFP_KERNEL); 5609 event = kmalloc(sizeof(*event), GFP_KERNEL);
@@ -4572,7 +5628,7 @@ static void mem_cgroup_oom_unregister_event(struct cgroup *cgrp,
4572{ 5628{
4573 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); 5629 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
4574 struct mem_cgroup_eventfd_list *ev, *tmp; 5630 struct mem_cgroup_eventfd_list *ev, *tmp;
4575 int type = MEMFILE_TYPE(cft->private); 5631 enum res_type type = MEMFILE_TYPE(cft->private);
4576 5632
4577 BUG_ON(type != _OOM_TYPE); 5633 BUG_ON(type != _OOM_TYPE);
4578 5634
@@ -4631,12 +5687,33 @@ static int mem_cgroup_oom_control_write(struct cgroup *cgrp,
4631#ifdef CONFIG_MEMCG_KMEM 5687#ifdef CONFIG_MEMCG_KMEM
4632static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss) 5688static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
4633{ 5689{
5690 int ret;
5691
5692 memcg->kmemcg_id = -1;
5693 ret = memcg_propagate_kmem(memcg);
5694 if (ret)
5695 return ret;
5696
4634 return mem_cgroup_sockets_init(memcg, ss); 5697 return mem_cgroup_sockets_init(memcg, ss);
4635}; 5698};
4636 5699
4637static void kmem_cgroup_destroy(struct mem_cgroup *memcg) 5700static void kmem_cgroup_destroy(struct mem_cgroup *memcg)
4638{ 5701{
4639 mem_cgroup_sockets_destroy(memcg); 5702 mem_cgroup_sockets_destroy(memcg);
5703
5704 memcg_kmem_mark_dead(memcg);
5705
5706 if (res_counter_read_u64(&memcg->kmem, RES_USAGE) != 0)
5707 return;
5708
5709 /*
5710 * Charges already down to 0, undo mem_cgroup_get() done in the charge
5711 * path here, being careful not to race with memcg_uncharge_kmem: it is
5712 * possible that the charges went down to 0 between mark_dead and the
5713 * res_counter read, so in that case, we don't need the put
5714 */
5715 if (memcg_kmem_test_and_clear_dead(memcg))
5716 mem_cgroup_put(memcg);
4640} 5717}
4641#else 5718#else
4642static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss) 5719static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
@@ -4745,6 +5822,37 @@ static struct cftype mem_cgroup_files[] = {
4745 .read = mem_cgroup_read, 5822 .read = mem_cgroup_read,
4746 }, 5823 },
4747#endif 5824#endif
5825#ifdef CONFIG_MEMCG_KMEM
5826 {
5827 .name = "kmem.limit_in_bytes",
5828 .private = MEMFILE_PRIVATE(_KMEM, RES_LIMIT),
5829 .write_string = mem_cgroup_write,
5830 .read = mem_cgroup_read,
5831 },
5832 {
5833 .name = "kmem.usage_in_bytes",
5834 .private = MEMFILE_PRIVATE(_KMEM, RES_USAGE),
5835 .read = mem_cgroup_read,
5836 },
5837 {
5838 .name = "kmem.failcnt",
5839 .private = MEMFILE_PRIVATE(_KMEM, RES_FAILCNT),
5840 .trigger = mem_cgroup_reset,
5841 .read = mem_cgroup_read,
5842 },
5843 {
5844 .name = "kmem.max_usage_in_bytes",
5845 .private = MEMFILE_PRIVATE(_KMEM, RES_MAX_USAGE),
5846 .trigger = mem_cgroup_reset,
5847 .read = mem_cgroup_read,
5848 },
5849#ifdef CONFIG_SLABINFO
5850 {
5851 .name = "kmem.slabinfo",
5852 .read_seq_string = mem_cgroup_slabinfo_read,
5853 },
5854#endif
5855#endif
4748 { }, /* terminate */ 5856 { }, /* terminate */
4749}; 5857};
4750 5858
@@ -4812,16 +5920,29 @@ out_free:
4812} 5920}
4813 5921
4814/* 5922/*
4815 * Helpers for freeing a kmalloc()ed/vzalloc()ed mem_cgroup by RCU, 5923 * At destroying mem_cgroup, references from swap_cgroup can remain.
4816 * but in process context. The work_freeing structure is overlaid 5924 * (scanning all at force_empty is too costly...)
4817 * on the rcu_freeing structure, which itself is overlaid on memsw. 5925 *
5926 * Instead of clearing all references at force_empty, we remember
5927 * the number of reference from swap_cgroup and free mem_cgroup when
5928 * it goes down to 0.
5929 *
5930 * Removal of cgroup itself succeeds regardless of refs from swap.
4818 */ 5931 */
4819static void free_work(struct work_struct *work) 5932
5933static void __mem_cgroup_free(struct mem_cgroup *memcg)
4820{ 5934{
4821 struct mem_cgroup *memcg; 5935 int node;
4822 int size = sizeof(struct mem_cgroup); 5936 int size = sizeof(struct mem_cgroup);
4823 5937
4824 memcg = container_of(work, struct mem_cgroup, work_freeing); 5938 mem_cgroup_remove_from_trees(memcg);
5939 free_css_id(&mem_cgroup_subsys, &memcg->css);
5940
5941 for_each_node(node)
5942 free_mem_cgroup_per_zone_info(memcg, node);
5943
5944 free_percpu(memcg->stat);
5945
4825 /* 5946 /*
4826 * We need to make sure that (at least for now), the jump label 5947 * We need to make sure that (at least for now), the jump label
4827 * destruction code runs outside of the cgroup lock. This is because 5948 * destruction code runs outside of the cgroup lock. This is because
@@ -4833,45 +5954,34 @@ static void free_work(struct work_struct *work)
4833 * to move this code around, and make sure it is outside 5954 * to move this code around, and make sure it is outside
4834 * the cgroup_lock. 5955 * the cgroup_lock.
4835 */ 5956 */
4836 disarm_sock_keys(memcg); 5957 disarm_static_keys(memcg);
4837 if (size < PAGE_SIZE) 5958 if (size < PAGE_SIZE)
4838 kfree(memcg); 5959 kfree(memcg);
4839 else 5960 else
4840 vfree(memcg); 5961 vfree(memcg);
4841} 5962}
4842 5963
4843static void free_rcu(struct rcu_head *rcu_head)
4844{
4845 struct mem_cgroup *memcg;
4846
4847 memcg = container_of(rcu_head, struct mem_cgroup, rcu_freeing);
4848 INIT_WORK(&memcg->work_freeing, free_work);
4849 schedule_work(&memcg->work_freeing);
4850}
4851 5964
4852/* 5965/*
4853 * At destroying mem_cgroup, references from swap_cgroup can remain. 5966 * Helpers for freeing a kmalloc()ed/vzalloc()ed mem_cgroup by RCU,
4854 * (scanning all at force_empty is too costly...) 5967 * but in process context. The work_freeing structure is overlaid
4855 * 5968 * on the rcu_freeing structure, which itself is overlaid on memsw.
4856 * Instead of clearing all references at force_empty, we remember
4857 * the number of reference from swap_cgroup and free mem_cgroup when
4858 * it goes down to 0.
4859 *
4860 * Removal of cgroup itself succeeds regardless of refs from swap.
4861 */ 5969 */
4862 5970static void free_work(struct work_struct *work)
4863static void __mem_cgroup_free(struct mem_cgroup *memcg)
4864{ 5971{
4865 int node; 5972 struct mem_cgroup *memcg;
4866 5973
4867 mem_cgroup_remove_from_trees(memcg); 5974 memcg = container_of(work, struct mem_cgroup, work_freeing);
4868 free_css_id(&mem_cgroup_subsys, &memcg->css); 5975 __mem_cgroup_free(memcg);
5976}
4869 5977
4870 for_each_node(node) 5978static void free_rcu(struct rcu_head *rcu_head)
4871 free_mem_cgroup_per_zone_info(memcg, node); 5979{
5980 struct mem_cgroup *memcg;
4872 5981
4873 free_percpu(memcg->stat); 5982 memcg = container_of(rcu_head, struct mem_cgroup, rcu_freeing);
4874 call_rcu(&memcg->rcu_freeing, free_rcu); 5983 INIT_WORK(&memcg->work_freeing, free_work);
5984 schedule_work(&memcg->work_freeing);
4875} 5985}
4876 5986
4877static void mem_cgroup_get(struct mem_cgroup *memcg) 5987static void mem_cgroup_get(struct mem_cgroup *memcg)
@@ -4883,7 +5993,7 @@ static void __mem_cgroup_put(struct mem_cgroup *memcg, int count)
4883{ 5993{
4884 if (atomic_sub_and_test(count, &memcg->refcnt)) { 5994 if (atomic_sub_and_test(count, &memcg->refcnt)) {
4885 struct mem_cgroup *parent = parent_mem_cgroup(memcg); 5995 struct mem_cgroup *parent = parent_mem_cgroup(memcg);
4886 __mem_cgroup_free(memcg); 5996 call_rcu(&memcg->rcu_freeing, free_rcu);
4887 if (parent) 5997 if (parent)
4888 mem_cgroup_put(parent); 5998 mem_cgroup_put(parent);
4889 } 5999 }
@@ -4953,7 +6063,7 @@ err_cleanup:
4953} 6063}
4954 6064
4955static struct cgroup_subsys_state * __ref 6065static struct cgroup_subsys_state * __ref
4956mem_cgroup_create(struct cgroup *cont) 6066mem_cgroup_css_alloc(struct cgroup *cont)
4957{ 6067{
4958 struct mem_cgroup *memcg, *parent; 6068 struct mem_cgroup *memcg, *parent;
4959 long error = -ENOMEM; 6069 long error = -ENOMEM;
@@ -4980,7 +6090,6 @@ mem_cgroup_create(struct cgroup *cont)
4980 &per_cpu(memcg_stock, cpu); 6090 &per_cpu(memcg_stock, cpu);
4981 INIT_WORK(&stock->work, drain_local_stock); 6091 INIT_WORK(&stock->work, drain_local_stock);
4982 } 6092 }
4983 hotcpu_notifier(memcg_cpu_hotplug_callback, 0);
4984 } else { 6093 } else {
4985 parent = mem_cgroup_from_cont(cont->parent); 6094 parent = mem_cgroup_from_cont(cont->parent);
4986 memcg->use_hierarchy = parent->use_hierarchy; 6095 memcg->use_hierarchy = parent->use_hierarchy;
@@ -4990,6 +6099,8 @@ mem_cgroup_create(struct cgroup *cont)
4990 if (parent && parent->use_hierarchy) { 6099 if (parent && parent->use_hierarchy) {
4991 res_counter_init(&memcg->res, &parent->res); 6100 res_counter_init(&memcg->res, &parent->res);
4992 res_counter_init(&memcg->memsw, &parent->memsw); 6101 res_counter_init(&memcg->memsw, &parent->memsw);
6102 res_counter_init(&memcg->kmem, &parent->kmem);
6103
4993 /* 6104 /*
4994 * We increment refcnt of the parent to ensure that we can 6105 * We increment refcnt of the parent to ensure that we can
4995 * safely access it on res_counter_charge/uncharge. 6106 * safely access it on res_counter_charge/uncharge.
@@ -5000,6 +6111,7 @@ mem_cgroup_create(struct cgroup *cont)
5000 } else { 6111 } else {
5001 res_counter_init(&memcg->res, NULL); 6112 res_counter_init(&memcg->res, NULL);
5002 res_counter_init(&memcg->memsw, NULL); 6113 res_counter_init(&memcg->memsw, NULL);
6114 res_counter_init(&memcg->kmem, NULL);
5003 /* 6115 /*
5004 * Deeper hierachy with use_hierarchy == false doesn't make 6116 * Deeper hierachy with use_hierarchy == false doesn't make
5005 * much sense so let cgroup subsystem know about this 6117 * much sense so let cgroup subsystem know about this
@@ -5034,14 +6146,15 @@ free_out:
5034 return ERR_PTR(error); 6146 return ERR_PTR(error);
5035} 6147}
5036 6148
5037static int mem_cgroup_pre_destroy(struct cgroup *cont) 6149static void mem_cgroup_css_offline(struct cgroup *cont)
5038{ 6150{
5039 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); 6151 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
5040 6152
5041 return mem_cgroup_force_empty(memcg, false); 6153 mem_cgroup_reparent_charges(memcg);
6154 mem_cgroup_destroy_all_caches(memcg);
5042} 6155}
5043 6156
5044static void mem_cgroup_destroy(struct cgroup *cont) 6157static void mem_cgroup_css_free(struct cgroup *cont)
5045{ 6158{
5046 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); 6159 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
5047 6160
@@ -5631,18 +6744,30 @@ static void mem_cgroup_move_task(struct cgroup *cont,
5631struct cgroup_subsys mem_cgroup_subsys = { 6744struct cgroup_subsys mem_cgroup_subsys = {
5632 .name = "memory", 6745 .name = "memory",
5633 .subsys_id = mem_cgroup_subsys_id, 6746 .subsys_id = mem_cgroup_subsys_id,
5634 .create = mem_cgroup_create, 6747 .css_alloc = mem_cgroup_css_alloc,
5635 .pre_destroy = mem_cgroup_pre_destroy, 6748 .css_offline = mem_cgroup_css_offline,
5636 .destroy = mem_cgroup_destroy, 6749 .css_free = mem_cgroup_css_free,
5637 .can_attach = mem_cgroup_can_attach, 6750 .can_attach = mem_cgroup_can_attach,
5638 .cancel_attach = mem_cgroup_cancel_attach, 6751 .cancel_attach = mem_cgroup_cancel_attach,
5639 .attach = mem_cgroup_move_task, 6752 .attach = mem_cgroup_move_task,
5640 .base_cftypes = mem_cgroup_files, 6753 .base_cftypes = mem_cgroup_files,
5641 .early_init = 0, 6754 .early_init = 0,
5642 .use_id = 1, 6755 .use_id = 1,
5643 .__DEPRECATED_clear_css_refs = true,
5644}; 6756};
5645 6757
6758/*
6759 * The rest of init is performed during ->css_alloc() for root css which
6760 * happens before initcalls. hotcpu_notifier() can't be done together as
6761 * it would introduce circular locking by adding cgroup_lock -> cpu hotplug
6762 * dependency. Do it from a subsys_initcall().
6763 */
6764static int __init mem_cgroup_init(void)
6765{
6766 hotcpu_notifier(memcg_cpu_hotplug_callback, 0);
6767 return 0;
6768}
6769subsys_initcall(mem_cgroup_init);
6770
5646#ifdef CONFIG_MEMCG_SWAP 6771#ifdef CONFIG_MEMCG_SWAP
5647static int __init enable_swap_account(char *s) 6772static int __init enable_swap_account(char *s)
5648{ 6773{
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 6c5899b9034a..c6e4dd3e1c08 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -402,7 +402,7 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill,
402 struct anon_vma *av; 402 struct anon_vma *av;
403 pgoff_t pgoff; 403 pgoff_t pgoff;
404 404
405 av = page_lock_anon_vma(page); 405 av = page_lock_anon_vma_read(page);
406 if (av == NULL) /* Not actually mapped anymore */ 406 if (av == NULL) /* Not actually mapped anymore */
407 return; 407 return;
408 408
@@ -423,7 +423,7 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill,
423 } 423 }
424 } 424 }
425 read_unlock(&tasklist_lock); 425 read_unlock(&tasklist_lock);
426 page_unlock_anon_vma(av); 426 page_unlock_anon_vma_read(av);
427} 427}
428 428
429/* 429/*
@@ -781,16 +781,16 @@ static struct page_state {
781 { compound, compound, "huge", me_huge_page }, 781 { compound, compound, "huge", me_huge_page },
782#endif 782#endif
783 783
784 { sc|dirty, sc|dirty, "swapcache", me_swapcache_dirty }, 784 { sc|dirty, sc|dirty, "dirty swapcache", me_swapcache_dirty },
785 { sc|dirty, sc, "swapcache", me_swapcache_clean }, 785 { sc|dirty, sc, "clean swapcache", me_swapcache_clean },
786 786
787 { unevict|dirty, unevict|dirty, "unevictable LRU", me_pagecache_dirty}, 787 { unevict|dirty, unevict|dirty, "dirty unevictable LRU", me_pagecache_dirty },
788 { unevict, unevict, "unevictable LRU", me_pagecache_clean}, 788 { unevict, unevict, "clean unevictable LRU", me_pagecache_clean },
789 789
790 { mlock|dirty, mlock|dirty, "mlocked LRU", me_pagecache_dirty }, 790 { mlock|dirty, mlock|dirty, "dirty mlocked LRU", me_pagecache_dirty },
791 { mlock, mlock, "mlocked LRU", me_pagecache_clean }, 791 { mlock, mlock, "clean mlocked LRU", me_pagecache_clean },
792 792
793 { lru|dirty, lru|dirty, "LRU", me_pagecache_dirty }, 793 { lru|dirty, lru|dirty, "dirty LRU", me_pagecache_dirty },
794 { lru|dirty, lru, "clean LRU", me_pagecache_clean }, 794 { lru|dirty, lru, "clean LRU", me_pagecache_clean },
795 795
796 /* 796 /*
@@ -812,14 +812,14 @@ static struct page_state {
812#undef slab 812#undef slab
813#undef reserved 813#undef reserved
814 814
815/*
816 * "Dirty/Clean" indication is not 100% accurate due to the possibility of
817 * setting PG_dirty outside page lock. See also comment above set_page_dirty().
818 */
815static void action_result(unsigned long pfn, char *msg, int result) 819static void action_result(unsigned long pfn, char *msg, int result)
816{ 820{
817 struct page *page = pfn_to_page(pfn); 821 pr_err("MCE %#lx: %s page recovery: %s\n",
818 822 pfn, msg, action_name[result]);
819 printk(KERN_ERR "MCE %#lx: %s%s page recovery: %s\n",
820 pfn,
821 PageDirty(page) ? "dirty " : "",
822 msg, action_name[result]);
823} 823}
824 824
825static int page_action(struct page_state *ps, struct page *p, 825static int page_action(struct page_state *ps, struct page *p,
@@ -1385,7 +1385,7 @@ static int get_any_page(struct page *p, unsigned long pfn, int flags)
1385 * Isolate the page, so that it doesn't get reallocated if it 1385 * Isolate the page, so that it doesn't get reallocated if it
1386 * was free. 1386 * was free.
1387 */ 1387 */
1388 set_migratetype_isolate(p); 1388 set_migratetype_isolate(p, true);
1389 /* 1389 /*
1390 * When the target page is a free hugepage, just remove it 1390 * When the target page is a free hugepage, just remove it
1391 * from free hugepage list. 1391 * from free hugepage list.
@@ -1476,9 +1476,17 @@ int soft_offline_page(struct page *page, int flags)
1476{ 1476{
1477 int ret; 1477 int ret;
1478 unsigned long pfn = page_to_pfn(page); 1478 unsigned long pfn = page_to_pfn(page);
1479 struct page *hpage = compound_trans_head(page);
1479 1480
1480 if (PageHuge(page)) 1481 if (PageHuge(page))
1481 return soft_offline_huge_page(page, flags); 1482 return soft_offline_huge_page(page, flags);
1483 if (PageTransHuge(hpage)) {
1484 if (PageAnon(hpage) && unlikely(split_huge_page(hpage))) {
1485 pr_info("soft offline: %#lx: failed to split THP\n",
1486 pfn);
1487 return -EBUSY;
1488 }
1489 }
1482 1490
1483 ret = get_any_page(page, pfn, flags); 1491 ret = get_any_page(page, pfn, flags);
1484 if (ret < 0) 1492 if (ret < 0)
@@ -1558,7 +1566,8 @@ int soft_offline_page(struct page *page, int flags)
1558 page_is_file_cache(page)); 1566 page_is_file_cache(page));
1559 list_add(&page->lru, &pagelist); 1567 list_add(&page->lru, &pagelist);
1560 ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 1568 ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL,
1561 false, MIGRATE_SYNC); 1569 false, MIGRATE_SYNC,
1570 MR_MEMORY_FAILURE);
1562 if (ret) { 1571 if (ret) {
1563 putback_lru_pages(&pagelist); 1572 putback_lru_pages(&pagelist);
1564 pr_info("soft offline: %#lx: migration failed %d, type %lx\n", 1573 pr_info("soft offline: %#lx: migration failed %d, type %lx\n",
diff --git a/mm/memory.c b/mm/memory.c
index 221fc9ffcab1..bb1369f7b9b4 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -57,6 +57,8 @@
57#include <linux/swapops.h> 57#include <linux/swapops.h>
58#include <linux/elf.h> 58#include <linux/elf.h>
59#include <linux/gfp.h> 59#include <linux/gfp.h>
60#include <linux/migrate.h>
61#include <linux/string.h>
60 62
61#include <asm/io.h> 63#include <asm/io.h>
62#include <asm/pgalloc.h> 64#include <asm/pgalloc.h>
@@ -182,10 +184,14 @@ static int tlb_next_batch(struct mmu_gather *tlb)
182 return 1; 184 return 1;
183 } 185 }
184 186
187 if (tlb->batch_count == MAX_GATHER_BATCH_COUNT)
188 return 0;
189
185 batch = (void *)__get_free_pages(GFP_NOWAIT | __GFP_NOWARN, 0); 190 batch = (void *)__get_free_pages(GFP_NOWAIT | __GFP_NOWARN, 0);
186 if (!batch) 191 if (!batch)
187 return 0; 192 return 0;
188 193
194 tlb->batch_count++;
189 batch->next = NULL; 195 batch->next = NULL;
190 batch->nr = 0; 196 batch->nr = 0;
191 batch->max = MAX_GATHER_BATCH; 197 batch->max = MAX_GATHER_BATCH;
@@ -214,6 +220,7 @@ void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, bool fullmm)
214 tlb->local.nr = 0; 220 tlb->local.nr = 0;
215 tlb->local.max = ARRAY_SIZE(tlb->__pages); 221 tlb->local.max = ARRAY_SIZE(tlb->__pages);
216 tlb->active = &tlb->local; 222 tlb->active = &tlb->local;
223 tlb->batch_count = 0;
217 224
218#ifdef CONFIG_HAVE_RCU_TABLE_FREE 225#ifdef CONFIG_HAVE_RCU_TABLE_FREE
219 tlb->batch = NULL; 226 tlb->batch = NULL;
@@ -717,20 +724,6 @@ static inline bool is_cow_mapping(vm_flags_t flags)
717 return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; 724 return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
718} 725}
719 726
720#ifndef is_zero_pfn
721static inline int is_zero_pfn(unsigned long pfn)
722{
723 return pfn == zero_pfn;
724}
725#endif
726
727#ifndef my_zero_pfn
728static inline unsigned long my_zero_pfn(unsigned long addr)
729{
730 return zero_pfn;
731}
732#endif
733
734/* 727/*
735 * vm_normal_page -- This function gets the "struct page" associated with a pte. 728 * vm_normal_page -- This function gets the "struct page" associated with a pte.
736 * 729 *
@@ -1250,7 +1243,7 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
1250 BUG(); 1243 BUG();
1251 } 1244 }
1252#endif 1245#endif
1253 split_huge_page_pmd(vma->vm_mm, pmd); 1246 split_huge_page_pmd(vma, addr, pmd);
1254 } else if (zap_huge_pmd(tlb, vma, pmd, addr)) 1247 } else if (zap_huge_pmd(tlb, vma, pmd, addr))
1255 goto next; 1248 goto next;
1256 /* fall through */ 1249 /* fall through */
@@ -1517,9 +1510,11 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
1517 page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE); 1510 page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE);
1518 goto out; 1511 goto out;
1519 } 1512 }
1513 if ((flags & FOLL_NUMA) && pmd_numa(*pmd))
1514 goto no_page_table;
1520 if (pmd_trans_huge(*pmd)) { 1515 if (pmd_trans_huge(*pmd)) {
1521 if (flags & FOLL_SPLIT) { 1516 if (flags & FOLL_SPLIT) {
1522 split_huge_page_pmd(mm, pmd); 1517 split_huge_page_pmd(vma, address, pmd);
1523 goto split_fallthrough; 1518 goto split_fallthrough;
1524 } 1519 }
1525 spin_lock(&mm->page_table_lock); 1520 spin_lock(&mm->page_table_lock);
@@ -1546,6 +1541,8 @@ split_fallthrough:
1546 pte = *ptep; 1541 pte = *ptep;
1547 if (!pte_present(pte)) 1542 if (!pte_present(pte))
1548 goto no_page; 1543 goto no_page;
1544 if ((flags & FOLL_NUMA) && pte_numa(pte))
1545 goto no_page;
1549 if ((flags & FOLL_WRITE) && !pte_write(pte)) 1546 if ((flags & FOLL_WRITE) && !pte_write(pte))
1550 goto unlock; 1547 goto unlock;
1551 1548
@@ -1697,6 +1694,19 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1697 (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD); 1694 (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD);
1698 vm_flags &= (gup_flags & FOLL_FORCE) ? 1695 vm_flags &= (gup_flags & FOLL_FORCE) ?
1699 (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE); 1696 (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE);
1697
1698 /*
1699 * If FOLL_FORCE and FOLL_NUMA are both set, handle_mm_fault
1700 * would be called on PROT_NONE ranges. We must never invoke
1701 * handle_mm_fault on PROT_NONE ranges or the NUMA hinting
1702 * page faults would unprotect the PROT_NONE ranges if
1703 * _PAGE_NUMA and _PAGE_PROTNONE are sharing the same pte/pmd
1704 * bitflag. So to avoid that, don't set FOLL_NUMA if
1705 * FOLL_FORCE is set.
1706 */
1707 if (!(gup_flags & FOLL_FORCE))
1708 gup_flags |= FOLL_NUMA;
1709
1700 i = 0; 1710 i = 0;
1701 1711
1702 do { 1712 do {
@@ -2794,13 +2804,8 @@ unlock:
2794oom_free_new: 2804oom_free_new:
2795 page_cache_release(new_page); 2805 page_cache_release(new_page);
2796oom: 2806oom:
2797 if (old_page) { 2807 if (old_page)
2798 if (page_mkwrite) {
2799 unlock_page(old_page);
2800 page_cache_release(old_page);
2801 }
2802 page_cache_release(old_page); 2808 page_cache_release(old_page);
2803 }
2804 return VM_FAULT_OOM; 2809 return VM_FAULT_OOM;
2805 2810
2806unwritable_page: 2811unwritable_page:
@@ -3431,6 +3436,170 @@ static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3431 return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte); 3436 return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
3432} 3437}
3433 3438
3439int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
3440 unsigned long addr, int current_nid)
3441{
3442 get_page(page);
3443
3444 count_vm_numa_event(NUMA_HINT_FAULTS);
3445 if (current_nid == numa_node_id())
3446 count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL);
3447
3448 return mpol_misplaced(page, vma, addr);
3449}
3450
3451int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
3452 unsigned long addr, pte_t pte, pte_t *ptep, pmd_t *pmd)
3453{
3454 struct page *page = NULL;
3455 spinlock_t *ptl;
3456 int current_nid = -1;
3457 int target_nid;
3458 bool migrated = false;
3459
3460 /*
3461 * The "pte" at this point cannot be used safely without
3462 * validation through pte_unmap_same(). It's of NUMA type but
3463 * the pfn may be screwed if the read is non atomic.
3464 *
3465 * ptep_modify_prot_start is not called as this is clearing
3466 * the _PAGE_NUMA bit and it is not really expected that there
3467 * would be concurrent hardware modifications to the PTE.
3468 */
3469 ptl = pte_lockptr(mm, pmd);
3470 spin_lock(ptl);
3471 if (unlikely(!pte_same(*ptep, pte))) {
3472 pte_unmap_unlock(ptep, ptl);
3473 goto out;
3474 }
3475
3476 pte = pte_mknonnuma(pte);
3477 set_pte_at(mm, addr, ptep, pte);
3478 update_mmu_cache(vma, addr, ptep);
3479
3480 page = vm_normal_page(vma, addr, pte);
3481 if (!page) {
3482 pte_unmap_unlock(ptep, ptl);
3483 return 0;
3484 }
3485
3486 current_nid = page_to_nid(page);
3487 target_nid = numa_migrate_prep(page, vma, addr, current_nid);
3488 pte_unmap_unlock(ptep, ptl);
3489 if (target_nid == -1) {
3490 /*
3491 * Account for the fault against the current node if it not
3492 * being replaced regardless of where the page is located.
3493 */
3494 current_nid = numa_node_id();
3495 put_page(page);
3496 goto out;
3497 }
3498
3499 /* Migrate to the requested node */
3500 migrated = migrate_misplaced_page(page, target_nid);
3501 if (migrated)
3502 current_nid = target_nid;
3503
3504out:
3505 if (current_nid != -1)
3506 task_numa_fault(current_nid, 1, migrated);
3507 return 0;
3508}
3509
3510/* NUMA hinting page fault entry point for regular pmds */
3511#ifdef CONFIG_NUMA_BALANCING
3512static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
3513 unsigned long addr, pmd_t *pmdp)
3514{
3515 pmd_t pmd;
3516 pte_t *pte, *orig_pte;
3517 unsigned long _addr = addr & PMD_MASK;
3518 unsigned long offset;
3519 spinlock_t *ptl;
3520 bool numa = false;
3521 int local_nid = numa_node_id();
3522
3523 spin_lock(&mm->page_table_lock);
3524 pmd = *pmdp;
3525 if (pmd_numa(pmd)) {
3526 set_pmd_at(mm, _addr, pmdp, pmd_mknonnuma(pmd));
3527 numa = true;
3528 }
3529 spin_unlock(&mm->page_table_lock);
3530
3531 if (!numa)
3532 return 0;
3533
3534 /* we're in a page fault so some vma must be in the range */
3535 BUG_ON(!vma);
3536 BUG_ON(vma->vm_start >= _addr + PMD_SIZE);
3537 offset = max(_addr, vma->vm_start) & ~PMD_MASK;
3538 VM_BUG_ON(offset >= PMD_SIZE);
3539 orig_pte = pte = pte_offset_map_lock(mm, pmdp, _addr, &ptl);
3540 pte += offset >> PAGE_SHIFT;
3541 for (addr = _addr + offset; addr < _addr + PMD_SIZE; pte++, addr += PAGE_SIZE) {
3542 pte_t pteval = *pte;
3543 struct page *page;
3544 int curr_nid = local_nid;
3545 int target_nid;
3546 bool migrated;
3547 if (!pte_present(pteval))
3548 continue;
3549 if (!pte_numa(pteval))
3550 continue;
3551 if (addr >= vma->vm_end) {
3552 vma = find_vma(mm, addr);
3553 /* there's a pte present so there must be a vma */
3554 BUG_ON(!vma);
3555 BUG_ON(addr < vma->vm_start);
3556 }
3557 if (pte_numa(pteval)) {
3558 pteval = pte_mknonnuma(pteval);
3559 set_pte_at(mm, addr, pte, pteval);
3560 }
3561 page = vm_normal_page(vma, addr, pteval);
3562 if (unlikely(!page))
3563 continue;
3564 /* only check non-shared pages */
3565 if (unlikely(page_mapcount(page) != 1))
3566 continue;
3567
3568 /*
3569 * Note that the NUMA fault is later accounted to either
3570 * the node that is currently running or where the page is
3571 * migrated to.
3572 */
3573 curr_nid = local_nid;
3574 target_nid = numa_migrate_prep(page, vma, addr,
3575 page_to_nid(page));
3576 if (target_nid == -1) {
3577 put_page(page);
3578 continue;
3579 }
3580
3581 /* Migrate to the requested node */
3582 pte_unmap_unlock(pte, ptl);
3583 migrated = migrate_misplaced_page(page, target_nid);
3584 if (migrated)
3585 curr_nid = target_nid;
3586 task_numa_fault(curr_nid, 1, migrated);
3587
3588 pte = pte_offset_map_lock(mm, pmdp, addr, &ptl);
3589 }
3590 pte_unmap_unlock(orig_pte, ptl);
3591
3592 return 0;
3593}
3594#else
3595static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
3596 unsigned long addr, pmd_t *pmdp)
3597{
3598 BUG();
3599 return 0;
3600}
3601#endif /* CONFIG_NUMA_BALANCING */
3602
3434/* 3603/*
3435 * These routines also need to handle stuff like marking pages dirty 3604 * These routines also need to handle stuff like marking pages dirty
3436 * and/or accessed for architectures that don't do it in hardware (most 3605 * and/or accessed for architectures that don't do it in hardware (most
@@ -3469,6 +3638,9 @@ int handle_pte_fault(struct mm_struct *mm,
3469 pte, pmd, flags, entry); 3638 pte, pmd, flags, entry);
3470 } 3639 }
3471 3640
3641 if (pte_numa(entry))
3642 return do_numa_page(mm, vma, address, entry, pte, pmd);
3643
3472 ptl = pte_lockptr(mm, pmd); 3644 ptl = pte_lockptr(mm, pmd);
3473 spin_lock(ptl); 3645 spin_lock(ptl);
3474 if (unlikely(!pte_same(*pte, entry))) 3646 if (unlikely(!pte_same(*pte, entry)))
@@ -3537,9 +3709,21 @@ retry:
3537 3709
3538 barrier(); 3710 barrier();
3539 if (pmd_trans_huge(orig_pmd)) { 3711 if (pmd_trans_huge(orig_pmd)) {
3540 if (flags & FAULT_FLAG_WRITE && 3712 unsigned int dirty = flags & FAULT_FLAG_WRITE;
3541 !pmd_write(orig_pmd) && 3713
3542 !pmd_trans_splitting(orig_pmd)) { 3714 /*
3715 * If the pmd is splitting, return and retry the
3716 * the fault. Alternative: wait until the split
3717 * is done, and goto retry.
3718 */
3719 if (pmd_trans_splitting(orig_pmd))
3720 return 0;
3721
3722 if (pmd_numa(orig_pmd))
3723 return do_huge_pmd_numa_page(mm, vma, address,
3724 orig_pmd, pmd);
3725
3726 if (dirty && !pmd_write(orig_pmd)) {
3543 ret = do_huge_pmd_wp_page(mm, vma, address, pmd, 3727 ret = do_huge_pmd_wp_page(mm, vma, address, pmd,
3544 orig_pmd); 3728 orig_pmd);
3545 /* 3729 /*
@@ -3550,17 +3734,25 @@ retry:
3550 if (unlikely(ret & VM_FAULT_OOM)) 3734 if (unlikely(ret & VM_FAULT_OOM))
3551 goto retry; 3735 goto retry;
3552 return ret; 3736 return ret;
3737 } else {
3738 huge_pmd_set_accessed(mm, vma, address, pmd,
3739 orig_pmd, dirty);
3553 } 3740 }
3741
3554 return 0; 3742 return 0;
3555 } 3743 }
3556 } 3744 }
3557 3745
3746 if (pmd_numa(*pmd))
3747 return do_pmd_numa_page(mm, vma, address, pmd);
3748
3558 /* 3749 /*
3559 * Use __pte_alloc instead of pte_alloc_map, because we can't 3750 * Use __pte_alloc instead of pte_alloc_map, because we can't
3560 * run pte_offset_map on the pmd, if an huge pmd could 3751 * run pte_offset_map on the pmd, if an huge pmd could
3561 * materialize from under us from a different thread. 3752 * materialize from under us from a different thread.
3562 */ 3753 */
3563 if (unlikely(pmd_none(*pmd)) && __pte_alloc(mm, vma, pmd, address)) 3754 if (unlikely(pmd_none(*pmd)) &&
3755 unlikely(__pte_alloc(mm, vma, pmd, address)))
3564 return VM_FAULT_OOM; 3756 return VM_FAULT_OOM;
3565 /* if an huge pmd materialized from under us just retry later */ 3757 /* if an huge pmd materialized from under us just retry later */
3566 if (unlikely(pmd_trans_huge(*pmd))) 3758 if (unlikely(pmd_trans_huge(*pmd)))
@@ -3940,15 +4132,12 @@ void print_vma_addr(char *prefix, unsigned long ip)
3940 struct file *f = vma->vm_file; 4132 struct file *f = vma->vm_file;
3941 char *buf = (char *)__get_free_page(GFP_KERNEL); 4133 char *buf = (char *)__get_free_page(GFP_KERNEL);
3942 if (buf) { 4134 if (buf) {
3943 char *p, *s; 4135 char *p;
3944 4136
3945 p = d_path(&f->f_path, buf, PAGE_SIZE); 4137 p = d_path(&f->f_path, buf, PAGE_SIZE);
3946 if (IS_ERR(p)) 4138 if (IS_ERR(p))
3947 p = "?"; 4139 p = "?";
3948 s = strrchr(p, '/'); 4140 printk("%s%s[%lx+%lx]", prefix, kbasename(p),
3949 if (s)
3950 p = s+1;
3951 printk("%s%s[%lx+%lx]", prefix, p,
3952 vma->vm_start, 4141 vma->vm_start,
3953 vma->vm_end - vma->vm_start); 4142 vma->vm_end - vma->vm_start);
3954 free_page((unsigned long)buf); 4143 free_page((unsigned long)buf);
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index e4eeacae2b91..d04ed87bfacb 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -106,6 +106,7 @@ static void get_page_bootmem(unsigned long info, struct page *page,
106void __ref put_page_bootmem(struct page *page) 106void __ref put_page_bootmem(struct page *page)
107{ 107{
108 unsigned long type; 108 unsigned long type;
109 static DEFINE_MUTEX(ppb_lock);
109 110
110 type = (unsigned long) page->lru.next; 111 type = (unsigned long) page->lru.next;
111 BUG_ON(type < MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE || 112 BUG_ON(type < MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE ||
@@ -115,7 +116,14 @@ void __ref put_page_bootmem(struct page *page)
115 ClearPagePrivate(page); 116 ClearPagePrivate(page);
116 set_page_private(page, 0); 117 set_page_private(page, 0);
117 INIT_LIST_HEAD(&page->lru); 118 INIT_LIST_HEAD(&page->lru);
119
120 /*
121 * Please refer to comment for __free_pages_bootmem()
122 * for why we serialize here.
123 */
124 mutex_lock(&ppb_lock);
118 __free_pages_bootmem(page, 0); 125 __free_pages_bootmem(page, 0);
126 mutex_unlock(&ppb_lock);
119 } 127 }
120 128
121} 129}
@@ -205,7 +213,7 @@ static void grow_zone_span(struct zone *zone, unsigned long start_pfn,
205 zone_span_writelock(zone); 213 zone_span_writelock(zone);
206 214
207 old_zone_end_pfn = zone->zone_start_pfn + zone->spanned_pages; 215 old_zone_end_pfn = zone->zone_start_pfn + zone->spanned_pages;
208 if (start_pfn < zone->zone_start_pfn) 216 if (!zone->spanned_pages || start_pfn < zone->zone_start_pfn)
209 zone->zone_start_pfn = start_pfn; 217 zone->zone_start_pfn = start_pfn;
210 218
211 zone->spanned_pages = max(old_zone_end_pfn, end_pfn) - 219 zone->spanned_pages = max(old_zone_end_pfn, end_pfn) -
@@ -214,13 +222,134 @@ static void grow_zone_span(struct zone *zone, unsigned long start_pfn,
214 zone_span_writeunlock(zone); 222 zone_span_writeunlock(zone);
215} 223}
216 224
225static void resize_zone(struct zone *zone, unsigned long start_pfn,
226 unsigned long end_pfn)
227{
228 zone_span_writelock(zone);
229
230 if (end_pfn - start_pfn) {
231 zone->zone_start_pfn = start_pfn;
232 zone->spanned_pages = end_pfn - start_pfn;
233 } else {
234 /*
235 * make it consist as free_area_init_core(),
236 * if spanned_pages = 0, then keep start_pfn = 0
237 */
238 zone->zone_start_pfn = 0;
239 zone->spanned_pages = 0;
240 }
241
242 zone_span_writeunlock(zone);
243}
244
245static void fix_zone_id(struct zone *zone, unsigned long start_pfn,
246 unsigned long end_pfn)
247{
248 enum zone_type zid = zone_idx(zone);
249 int nid = zone->zone_pgdat->node_id;
250 unsigned long pfn;
251
252 for (pfn = start_pfn; pfn < end_pfn; pfn++)
253 set_page_links(pfn_to_page(pfn), zid, nid, pfn);
254}
255
256static int __meminit move_pfn_range_left(struct zone *z1, struct zone *z2,
257 unsigned long start_pfn, unsigned long end_pfn)
258{
259 int ret;
260 unsigned long flags;
261 unsigned long z1_start_pfn;
262
263 if (!z1->wait_table) {
264 ret = init_currently_empty_zone(z1, start_pfn,
265 end_pfn - start_pfn, MEMMAP_HOTPLUG);
266 if (ret)
267 return ret;
268 }
269
270 pgdat_resize_lock(z1->zone_pgdat, &flags);
271
272 /* can't move pfns which are higher than @z2 */
273 if (end_pfn > z2->zone_start_pfn + z2->spanned_pages)
274 goto out_fail;
275 /* the move out part mast at the left most of @z2 */
276 if (start_pfn > z2->zone_start_pfn)
277 goto out_fail;
278 /* must included/overlap */
279 if (end_pfn <= z2->zone_start_pfn)
280 goto out_fail;
281
282 /* use start_pfn for z1's start_pfn if z1 is empty */
283 if (z1->spanned_pages)
284 z1_start_pfn = z1->zone_start_pfn;
285 else
286 z1_start_pfn = start_pfn;
287
288 resize_zone(z1, z1_start_pfn, end_pfn);
289 resize_zone(z2, end_pfn, z2->zone_start_pfn + z2->spanned_pages);
290
291 pgdat_resize_unlock(z1->zone_pgdat, &flags);
292
293 fix_zone_id(z1, start_pfn, end_pfn);
294
295 return 0;
296out_fail:
297 pgdat_resize_unlock(z1->zone_pgdat, &flags);
298 return -1;
299}
300
301static int __meminit move_pfn_range_right(struct zone *z1, struct zone *z2,
302 unsigned long start_pfn, unsigned long end_pfn)
303{
304 int ret;
305 unsigned long flags;
306 unsigned long z2_end_pfn;
307
308 if (!z2->wait_table) {
309 ret = init_currently_empty_zone(z2, start_pfn,
310 end_pfn - start_pfn, MEMMAP_HOTPLUG);
311 if (ret)
312 return ret;
313 }
314
315 pgdat_resize_lock(z1->zone_pgdat, &flags);
316
317 /* can't move pfns which are lower than @z1 */
318 if (z1->zone_start_pfn > start_pfn)
319 goto out_fail;
320 /* the move out part mast at the right most of @z1 */
321 if (z1->zone_start_pfn + z1->spanned_pages > end_pfn)
322 goto out_fail;
323 /* must included/overlap */
324 if (start_pfn >= z1->zone_start_pfn + z1->spanned_pages)
325 goto out_fail;
326
327 /* use end_pfn for z2's end_pfn if z2 is empty */
328 if (z2->spanned_pages)
329 z2_end_pfn = z2->zone_start_pfn + z2->spanned_pages;
330 else
331 z2_end_pfn = end_pfn;
332
333 resize_zone(z1, z1->zone_start_pfn, start_pfn);
334 resize_zone(z2, start_pfn, z2_end_pfn);
335
336 pgdat_resize_unlock(z1->zone_pgdat, &flags);
337
338 fix_zone_id(z2, start_pfn, end_pfn);
339
340 return 0;
341out_fail:
342 pgdat_resize_unlock(z1->zone_pgdat, &flags);
343 return -1;
344}
345
217static void grow_pgdat_span(struct pglist_data *pgdat, unsigned long start_pfn, 346static void grow_pgdat_span(struct pglist_data *pgdat, unsigned long start_pfn,
218 unsigned long end_pfn) 347 unsigned long end_pfn)
219{ 348{
220 unsigned long old_pgdat_end_pfn = 349 unsigned long old_pgdat_end_pfn =
221 pgdat->node_start_pfn + pgdat->node_spanned_pages; 350 pgdat->node_start_pfn + pgdat->node_spanned_pages;
222 351
223 if (start_pfn < pgdat->node_start_pfn) 352 if (!pgdat->node_spanned_pages || start_pfn < pgdat->node_start_pfn)
224 pgdat->node_start_pfn = start_pfn; 353 pgdat->node_start_pfn = start_pfn;
225 354
226 pgdat->node_spanned_pages = max(old_pgdat_end_pfn, end_pfn) - 355 pgdat->node_spanned_pages = max(old_pgdat_end_pfn, end_pfn) -
@@ -460,8 +589,99 @@ static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages,
460 return 0; 589 return 0;
461} 590}
462 591
592#ifdef CONFIG_MOVABLE_NODE
593/*
594 * When CONFIG_MOVABLE_NODE, we permit onlining of a node which doesn't have
595 * normal memory.
596 */
597static bool can_online_high_movable(struct zone *zone)
598{
599 return true;
600}
601#else /* CONFIG_MOVABLE_NODE */
602/* ensure every online node has NORMAL memory */
603static bool can_online_high_movable(struct zone *zone)
604{
605 return node_state(zone_to_nid(zone), N_NORMAL_MEMORY);
606}
607#endif /* CONFIG_MOVABLE_NODE */
463 608
464int __ref online_pages(unsigned long pfn, unsigned long nr_pages) 609/* check which state of node_states will be changed when online memory */
610static void node_states_check_changes_online(unsigned long nr_pages,
611 struct zone *zone, struct memory_notify *arg)
612{
613 int nid = zone_to_nid(zone);
614 enum zone_type zone_last = ZONE_NORMAL;
615
616 /*
617 * If we have HIGHMEM or movable node, node_states[N_NORMAL_MEMORY]
618 * contains nodes which have zones of 0...ZONE_NORMAL,
619 * set zone_last to ZONE_NORMAL.
620 *
621 * If we don't have HIGHMEM nor movable node,
622 * node_states[N_NORMAL_MEMORY] contains nodes which have zones of
623 * 0...ZONE_MOVABLE, set zone_last to ZONE_MOVABLE.
624 */
625 if (N_MEMORY == N_NORMAL_MEMORY)
626 zone_last = ZONE_MOVABLE;
627
628 /*
629 * if the memory to be online is in a zone of 0...zone_last, and
630 * the zones of 0...zone_last don't have memory before online, we will
631 * need to set the node to node_states[N_NORMAL_MEMORY] after
632 * the memory is online.
633 */
634 if (zone_idx(zone) <= zone_last && !node_state(nid, N_NORMAL_MEMORY))
635 arg->status_change_nid_normal = nid;
636 else
637 arg->status_change_nid_normal = -1;
638
639#ifdef CONFIG_HIGHMEM
640 /*
641 * If we have movable node, node_states[N_HIGH_MEMORY]
642 * contains nodes which have zones of 0...ZONE_HIGHMEM,
643 * set zone_last to ZONE_HIGHMEM.
644 *
645 * If we don't have movable node, node_states[N_NORMAL_MEMORY]
646 * contains nodes which have zones of 0...ZONE_MOVABLE,
647 * set zone_last to ZONE_MOVABLE.
648 */
649 zone_last = ZONE_HIGHMEM;
650 if (N_MEMORY == N_HIGH_MEMORY)
651 zone_last = ZONE_MOVABLE;
652
653 if (zone_idx(zone) <= zone_last && !node_state(nid, N_HIGH_MEMORY))
654 arg->status_change_nid_high = nid;
655 else
656 arg->status_change_nid_high = -1;
657#else
658 arg->status_change_nid_high = arg->status_change_nid_normal;
659#endif
660
661 /*
662 * if the node don't have memory befor online, we will need to
663 * set the node to node_states[N_MEMORY] after the memory
664 * is online.
665 */
666 if (!node_state(nid, N_MEMORY))
667 arg->status_change_nid = nid;
668 else
669 arg->status_change_nid = -1;
670}
671
672static void node_states_set_node(int node, struct memory_notify *arg)
673{
674 if (arg->status_change_nid_normal >= 0)
675 node_set_state(node, N_NORMAL_MEMORY);
676
677 if (arg->status_change_nid_high >= 0)
678 node_set_state(node, N_HIGH_MEMORY);
679
680 node_set_state(node, N_MEMORY);
681}
682
683
684int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_type)
465{ 685{
466 unsigned long onlined_pages = 0; 686 unsigned long onlined_pages = 0;
467 struct zone *zone; 687 struct zone *zone;
@@ -471,13 +691,40 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages)
471 struct memory_notify arg; 691 struct memory_notify arg;
472 692
473 lock_memory_hotplug(); 693 lock_memory_hotplug();
694 /*
695 * This doesn't need a lock to do pfn_to_page().
696 * The section can't be removed here because of the
697 * memory_block->state_mutex.
698 */
699 zone = page_zone(pfn_to_page(pfn));
700
701 if ((zone_idx(zone) > ZONE_NORMAL || online_type == ONLINE_MOVABLE) &&
702 !can_online_high_movable(zone)) {
703 unlock_memory_hotplug();
704 return -1;
705 }
706
707 if (online_type == ONLINE_KERNEL && zone_idx(zone) == ZONE_MOVABLE) {
708 if (move_pfn_range_left(zone - 1, zone, pfn, pfn + nr_pages)) {
709 unlock_memory_hotplug();
710 return -1;
711 }
712 }
713 if (online_type == ONLINE_MOVABLE && zone_idx(zone) == ZONE_MOVABLE - 1) {
714 if (move_pfn_range_right(zone, zone + 1, pfn, pfn + nr_pages)) {
715 unlock_memory_hotplug();
716 return -1;
717 }
718 }
719
720 /* Previous code may changed the zone of the pfn range */
721 zone = page_zone(pfn_to_page(pfn));
722
474 arg.start_pfn = pfn; 723 arg.start_pfn = pfn;
475 arg.nr_pages = nr_pages; 724 arg.nr_pages = nr_pages;
476 arg.status_change_nid = -1; 725 node_states_check_changes_online(nr_pages, zone, &arg);
477 726
478 nid = page_to_nid(pfn_to_page(pfn)); 727 nid = page_to_nid(pfn_to_page(pfn));
479 if (node_present_pages(nid) == 0)
480 arg.status_change_nid = nid;
481 728
482 ret = memory_notify(MEM_GOING_ONLINE, &arg); 729 ret = memory_notify(MEM_GOING_ONLINE, &arg);
483 ret = notifier_to_errno(ret); 730 ret = notifier_to_errno(ret);
@@ -487,23 +734,21 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages)
487 return ret; 734 return ret;
488 } 735 }
489 /* 736 /*
490 * This doesn't need a lock to do pfn_to_page().
491 * The section can't be removed here because of the
492 * memory_block->state_mutex.
493 */
494 zone = page_zone(pfn_to_page(pfn));
495 /*
496 * If this zone is not populated, then it is not in zonelist. 737 * If this zone is not populated, then it is not in zonelist.
497 * This means the page allocator ignores this zone. 738 * This means the page allocator ignores this zone.
498 * So, zonelist must be updated after online. 739 * So, zonelist must be updated after online.
499 */ 740 */
500 mutex_lock(&zonelists_mutex); 741 mutex_lock(&zonelists_mutex);
501 if (!populated_zone(zone)) 742 if (!populated_zone(zone)) {
502 need_zonelists_rebuild = 1; 743 need_zonelists_rebuild = 1;
744 build_all_zonelists(NULL, zone);
745 }
503 746
504 ret = walk_system_ram_range(pfn, nr_pages, &onlined_pages, 747 ret = walk_system_ram_range(pfn, nr_pages, &onlined_pages,
505 online_pages_range); 748 online_pages_range);
506 if (ret) { 749 if (ret) {
750 if (need_zonelists_rebuild)
751 zone_pcp_reset(zone);
507 mutex_unlock(&zonelists_mutex); 752 mutex_unlock(&zonelists_mutex);
508 printk(KERN_DEBUG "online_pages [mem %#010llx-%#010llx] failed\n", 753 printk(KERN_DEBUG "online_pages [mem %#010llx-%#010llx] failed\n",
509 (unsigned long long) pfn << PAGE_SHIFT, 754 (unsigned long long) pfn << PAGE_SHIFT,
@@ -514,12 +759,13 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages)
514 return ret; 759 return ret;
515 } 760 }
516 761
762 zone->managed_pages += onlined_pages;
517 zone->present_pages += onlined_pages; 763 zone->present_pages += onlined_pages;
518 zone->zone_pgdat->node_present_pages += onlined_pages; 764 zone->zone_pgdat->node_present_pages += onlined_pages;
519 if (onlined_pages) { 765 if (onlined_pages) {
520 node_set_state(zone_to_nid(zone), N_HIGH_MEMORY); 766 node_states_set_node(zone_to_nid(zone), &arg);
521 if (need_zonelists_rebuild) 767 if (need_zonelists_rebuild)
522 build_all_zonelists(NULL, zone); 768 build_all_zonelists(NULL, NULL);
523 else 769 else
524 zone_pcp_update(zone); 770 zone_pcp_update(zone);
525 } 771 }
@@ -812,7 +1058,8 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
812 * migrate_pages returns # of failed pages. 1058 * migrate_pages returns # of failed pages.
813 */ 1059 */
814 ret = migrate_pages(&source, alloc_migrate_target, 0, 1060 ret = migrate_pages(&source, alloc_migrate_target, 0,
815 true, MIGRATE_SYNC); 1061 true, MIGRATE_SYNC,
1062 MR_MEMORY_HOTPLUG);
816 if (ret) 1063 if (ret)
817 putback_lru_pages(&source); 1064 putback_lru_pages(&source);
818 } 1065 }
@@ -847,7 +1094,7 @@ check_pages_isolated_cb(unsigned long start_pfn, unsigned long nr_pages,
847{ 1094{
848 int ret; 1095 int ret;
849 long offlined = *(long *)data; 1096 long offlined = *(long *)data;
850 ret = test_pages_isolated(start_pfn, start_pfn + nr_pages); 1097 ret = test_pages_isolated(start_pfn, start_pfn + nr_pages, true);
851 offlined = nr_pages; 1098 offlined = nr_pages;
852 if (!ret) 1099 if (!ret)
853 *(long *)data += offlined; 1100 *(long *)data += offlined;
@@ -867,6 +1114,132 @@ check_pages_isolated(unsigned long start_pfn, unsigned long end_pfn)
867 return offlined; 1114 return offlined;
868} 1115}
869 1116
1117#ifdef CONFIG_MOVABLE_NODE
1118/*
1119 * When CONFIG_MOVABLE_NODE, we permit offlining of a node which doesn't have
1120 * normal memory.
1121 */
1122static bool can_offline_normal(struct zone *zone, unsigned long nr_pages)
1123{
1124 return true;
1125}
1126#else /* CONFIG_MOVABLE_NODE */
1127/* ensure the node has NORMAL memory if it is still online */
1128static bool can_offline_normal(struct zone *zone, unsigned long nr_pages)
1129{
1130 struct pglist_data *pgdat = zone->zone_pgdat;
1131 unsigned long present_pages = 0;
1132 enum zone_type zt;
1133
1134 for (zt = 0; zt <= ZONE_NORMAL; zt++)
1135 present_pages += pgdat->node_zones[zt].present_pages;
1136
1137 if (present_pages > nr_pages)
1138 return true;
1139
1140 present_pages = 0;
1141 for (; zt <= ZONE_MOVABLE; zt++)
1142 present_pages += pgdat->node_zones[zt].present_pages;
1143
1144 /*
1145 * we can't offline the last normal memory until all
1146 * higher memory is offlined.
1147 */
1148 return present_pages == 0;
1149}
1150#endif /* CONFIG_MOVABLE_NODE */
1151
1152/* check which state of node_states will be changed when offline memory */
1153static void node_states_check_changes_offline(unsigned long nr_pages,
1154 struct zone *zone, struct memory_notify *arg)
1155{
1156 struct pglist_data *pgdat = zone->zone_pgdat;
1157 unsigned long present_pages = 0;
1158 enum zone_type zt, zone_last = ZONE_NORMAL;
1159
1160 /*
1161 * If we have HIGHMEM or movable node, node_states[N_NORMAL_MEMORY]
1162 * contains nodes which have zones of 0...ZONE_NORMAL,
1163 * set zone_last to ZONE_NORMAL.
1164 *
1165 * If we don't have HIGHMEM nor movable node,
1166 * node_states[N_NORMAL_MEMORY] contains nodes which have zones of
1167 * 0...ZONE_MOVABLE, set zone_last to ZONE_MOVABLE.
1168 */
1169 if (N_MEMORY == N_NORMAL_MEMORY)
1170 zone_last = ZONE_MOVABLE;
1171
1172 /*
1173 * check whether node_states[N_NORMAL_MEMORY] will be changed.
1174 * If the memory to be offline is in a zone of 0...zone_last,
1175 * and it is the last present memory, 0...zone_last will
1176 * become empty after offline , thus we can determind we will
1177 * need to clear the node from node_states[N_NORMAL_MEMORY].
1178 */
1179 for (zt = 0; zt <= zone_last; zt++)
1180 present_pages += pgdat->node_zones[zt].present_pages;
1181 if (zone_idx(zone) <= zone_last && nr_pages >= present_pages)
1182 arg->status_change_nid_normal = zone_to_nid(zone);
1183 else
1184 arg->status_change_nid_normal = -1;
1185
1186#ifdef CONFIG_HIGHMEM
1187 /*
1188 * If we have movable node, node_states[N_HIGH_MEMORY]
1189 * contains nodes which have zones of 0...ZONE_HIGHMEM,
1190 * set zone_last to ZONE_HIGHMEM.
1191 *
1192 * If we don't have movable node, node_states[N_NORMAL_MEMORY]
1193 * contains nodes which have zones of 0...ZONE_MOVABLE,
1194 * set zone_last to ZONE_MOVABLE.
1195 */
1196 zone_last = ZONE_HIGHMEM;
1197 if (N_MEMORY == N_HIGH_MEMORY)
1198 zone_last = ZONE_MOVABLE;
1199
1200 for (; zt <= zone_last; zt++)
1201 present_pages += pgdat->node_zones[zt].present_pages;
1202 if (zone_idx(zone) <= zone_last && nr_pages >= present_pages)
1203 arg->status_change_nid_high = zone_to_nid(zone);
1204 else
1205 arg->status_change_nid_high = -1;
1206#else
1207 arg->status_change_nid_high = arg->status_change_nid_normal;
1208#endif
1209
1210 /*
1211 * node_states[N_HIGH_MEMORY] contains nodes which have 0...ZONE_MOVABLE
1212 */
1213 zone_last = ZONE_MOVABLE;
1214
1215 /*
1216 * check whether node_states[N_HIGH_MEMORY] will be changed
1217 * If we try to offline the last present @nr_pages from the node,
1218 * we can determind we will need to clear the node from
1219 * node_states[N_HIGH_MEMORY].
1220 */
1221 for (; zt <= zone_last; zt++)
1222 present_pages += pgdat->node_zones[zt].present_pages;
1223 if (nr_pages >= present_pages)
1224 arg->status_change_nid = zone_to_nid(zone);
1225 else
1226 arg->status_change_nid = -1;
1227}
1228
1229static void node_states_clear_node(int node, struct memory_notify *arg)
1230{
1231 if (arg->status_change_nid_normal >= 0)
1232 node_clear_state(node, N_NORMAL_MEMORY);
1233
1234 if ((N_MEMORY != N_NORMAL_MEMORY) &&
1235 (arg->status_change_nid_high >= 0))
1236 node_clear_state(node, N_HIGH_MEMORY);
1237
1238 if ((N_MEMORY != N_HIGH_MEMORY) &&
1239 (arg->status_change_nid >= 0))
1240 node_clear_state(node, N_MEMORY);
1241}
1242
870static int __ref __offline_pages(unsigned long start_pfn, 1243static int __ref __offline_pages(unsigned long start_pfn,
871 unsigned long end_pfn, unsigned long timeout) 1244 unsigned long end_pfn, unsigned long timeout)
872{ 1245{
@@ -893,16 +1266,19 @@ static int __ref __offline_pages(unsigned long start_pfn,
893 node = zone_to_nid(zone); 1266 node = zone_to_nid(zone);
894 nr_pages = end_pfn - start_pfn; 1267 nr_pages = end_pfn - start_pfn;
895 1268
1269 ret = -EINVAL;
1270 if (zone_idx(zone) <= ZONE_NORMAL && !can_offline_normal(zone, nr_pages))
1271 goto out;
1272
896 /* set above range as isolated */ 1273 /* set above range as isolated */
897 ret = start_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE); 1274 ret = start_isolate_page_range(start_pfn, end_pfn,
1275 MIGRATE_MOVABLE, true);
898 if (ret) 1276 if (ret)
899 goto out; 1277 goto out;
900 1278
901 arg.start_pfn = start_pfn; 1279 arg.start_pfn = start_pfn;
902 arg.nr_pages = nr_pages; 1280 arg.nr_pages = nr_pages;
903 arg.status_change_nid = -1; 1281 node_states_check_changes_offline(nr_pages, zone, &arg);
904 if (nr_pages >= node_present_pages(node))
905 arg.status_change_nid = node;
906 1282
907 ret = memory_notify(MEM_GOING_OFFLINE, &arg); 1283 ret = memory_notify(MEM_GOING_OFFLINE, &arg);
908 ret = notifier_to_errno(ret); 1284 ret = notifier_to_errno(ret);
@@ -943,10 +1319,10 @@ repeat:
943 goto repeat; 1319 goto repeat;
944 } 1320 }
945 } 1321 }
946 /* drain all zone's lru pagevec, this is asyncronous... */ 1322 /* drain all zone's lru pagevec, this is asynchronous... */
947 lru_add_drain_all(); 1323 lru_add_drain_all();
948 yield(); 1324 yield();
949 /* drain pcp pages , this is synchrouns. */ 1325 /* drain pcp pages, this is synchronous. */
950 drain_all_pages(); 1326 drain_all_pages();
951 /* check again */ 1327 /* check again */
952 offlined_pages = check_pages_isolated(start_pfn, end_pfn); 1328 offlined_pages = check_pages_isolated(start_pfn, end_pfn);
@@ -955,12 +1331,13 @@ repeat:
955 goto failed_removal; 1331 goto failed_removal;
956 } 1332 }
957 printk(KERN_INFO "Offlined Pages %ld\n", offlined_pages); 1333 printk(KERN_INFO "Offlined Pages %ld\n", offlined_pages);
958 /* Ok, all of our target is islaoted. 1334 /* Ok, all of our target is isolated.
959 We cannot do rollback at this point. */ 1335 We cannot do rollback at this point. */
960 offline_isolated_pages(start_pfn, end_pfn); 1336 offline_isolated_pages(start_pfn, end_pfn);
961 /* reset pagetype flags and makes migrate type to be MOVABLE */ 1337 /* reset pagetype flags and makes migrate type to be MOVABLE */
962 undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE); 1338 undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE);
963 /* removal success */ 1339 /* removal success */
1340 zone->managed_pages -= offlined_pages;
964 zone->present_pages -= offlined_pages; 1341 zone->present_pages -= offlined_pages;
965 zone->zone_pgdat->node_present_pages -= offlined_pages; 1342 zone->zone_pgdat->node_present_pages -= offlined_pages;
966 totalram_pages -= offlined_pages; 1343 totalram_pages -= offlined_pages;
@@ -975,10 +1352,9 @@ repeat:
975 } else 1352 } else
976 zone_pcp_update(zone); 1353 zone_pcp_update(zone);
977 1354
978 if (!node_present_pages(node)) { 1355 node_states_clear_node(node, &arg);
979 node_clear_state(node, N_HIGH_MEMORY); 1356 if (arg.status_change_nid >= 0)
980 kswapd_stop(node); 1357 kswapd_stop(node);
981 }
982 1358
983 vm_total_pages = nr_free_pagecache_pages(); 1359 vm_total_pages = nr_free_pagecache_pages();
984 writeback_set_ratelimit(); 1360 writeback_set_ratelimit();
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index d04a8a54c294..e2df1c1fb41f 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -90,6 +90,7 @@
90#include <linux/syscalls.h> 90#include <linux/syscalls.h>
91#include <linux/ctype.h> 91#include <linux/ctype.h>
92#include <linux/mm_inline.h> 92#include <linux/mm_inline.h>
93#include <linux/mmu_notifier.h>
93 94
94#include <asm/tlbflush.h> 95#include <asm/tlbflush.h>
95#include <asm/uaccess.h> 96#include <asm/uaccess.h>
@@ -117,6 +118,26 @@ static struct mempolicy default_policy = {
117 .flags = MPOL_F_LOCAL, 118 .flags = MPOL_F_LOCAL,
118}; 119};
119 120
121static struct mempolicy preferred_node_policy[MAX_NUMNODES];
122
123static struct mempolicy *get_task_policy(struct task_struct *p)
124{
125 struct mempolicy *pol = p->mempolicy;
126 int node;
127
128 if (!pol) {
129 node = numa_node_id();
130 if (node != -1)
131 pol = &preferred_node_policy[node];
132
133 /* preferred_node_policy is not initialised early in boot */
134 if (!pol->mode)
135 pol = NULL;
136 }
137
138 return pol;
139}
140
120static const struct mempolicy_operations { 141static const struct mempolicy_operations {
121 int (*create)(struct mempolicy *pol, const nodemask_t *nodes); 142 int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
122 /* 143 /*
@@ -212,9 +233,9 @@ static int mpol_set_nodemask(struct mempolicy *pol,
212 /* if mode is MPOL_DEFAULT, pol is NULL. This is right. */ 233 /* if mode is MPOL_DEFAULT, pol is NULL. This is right. */
213 if (pol == NULL) 234 if (pol == NULL)
214 return 0; 235 return 0;
215 /* Check N_HIGH_MEMORY */ 236 /* Check N_MEMORY */
216 nodes_and(nsc->mask1, 237 nodes_and(nsc->mask1,
217 cpuset_current_mems_allowed, node_states[N_HIGH_MEMORY]); 238 cpuset_current_mems_allowed, node_states[N_MEMORY]);
218 239
219 VM_BUG_ON(!nodes); 240 VM_BUG_ON(!nodes);
220 if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes)) 241 if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes))
@@ -254,7 +275,7 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
254 if (mode == MPOL_DEFAULT) { 275 if (mode == MPOL_DEFAULT) {
255 if (nodes && !nodes_empty(*nodes)) 276 if (nodes && !nodes_empty(*nodes))
256 return ERR_PTR(-EINVAL); 277 return ERR_PTR(-EINVAL);
257 return NULL; /* simply delete any existing policy */ 278 return NULL;
258 } 279 }
259 VM_BUG_ON(!nodes); 280 VM_BUG_ON(!nodes);
260 281
@@ -269,6 +290,10 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
269 (flags & MPOL_F_RELATIVE_NODES))) 290 (flags & MPOL_F_RELATIVE_NODES)))
270 return ERR_PTR(-EINVAL); 291 return ERR_PTR(-EINVAL);
271 } 292 }
293 } else if (mode == MPOL_LOCAL) {
294 if (!nodes_empty(*nodes))
295 return ERR_PTR(-EINVAL);
296 mode = MPOL_PREFERRED;
272 } else if (nodes_empty(*nodes)) 297 } else if (nodes_empty(*nodes))
273 return ERR_PTR(-EINVAL); 298 return ERR_PTR(-EINVAL);
274 policy = kmem_cache_alloc(policy_cache, GFP_KERNEL); 299 policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
@@ -511,7 +536,7 @@ static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
511 pmd = pmd_offset(pud, addr); 536 pmd = pmd_offset(pud, addr);
512 do { 537 do {
513 next = pmd_addr_end(addr, end); 538 next = pmd_addr_end(addr, end);
514 split_huge_page_pmd(vma->vm_mm, pmd); 539 split_huge_page_pmd(vma, addr, pmd);
515 if (pmd_none_or_trans_huge_or_clear_bad(pmd)) 540 if (pmd_none_or_trans_huge_or_clear_bad(pmd))
516 continue; 541 continue;
517 if (check_pte_range(vma, pmd, addr, next, nodes, 542 if (check_pte_range(vma, pmd, addr, next, nodes,
@@ -561,6 +586,36 @@ static inline int check_pgd_range(struct vm_area_struct *vma,
561 return 0; 586 return 0;
562} 587}
563 588
589#ifdef CONFIG_ARCH_USES_NUMA_PROT_NONE
590/*
591 * This is used to mark a range of virtual addresses to be inaccessible.
592 * These are later cleared by a NUMA hinting fault. Depending on these
593 * faults, pages may be migrated for better NUMA placement.
594 *
595 * This is assuming that NUMA faults are handled using PROT_NONE. If
596 * an architecture makes a different choice, it will need further
597 * changes to the core.
598 */
599unsigned long change_prot_numa(struct vm_area_struct *vma,
600 unsigned long addr, unsigned long end)
601{
602 int nr_updated;
603 BUILD_BUG_ON(_PAGE_NUMA != _PAGE_PROTNONE);
604
605 nr_updated = change_protection(vma, addr, end, vma->vm_page_prot, 0, 1);
606 if (nr_updated)
607 count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated);
608
609 return nr_updated;
610}
611#else
612static unsigned long change_prot_numa(struct vm_area_struct *vma,
613 unsigned long addr, unsigned long end)
614{
615 return 0;
616}
617#endif /* CONFIG_ARCH_USES_NUMA_PROT_NONE */
618
564/* 619/*
565 * Check if all pages in a range are on a set of nodes. 620 * Check if all pages in a range are on a set of nodes.
566 * If pagelist != NULL then isolate pages from the LRU and 621 * If pagelist != NULL then isolate pages from the LRU and
@@ -579,22 +634,32 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
579 return ERR_PTR(-EFAULT); 634 return ERR_PTR(-EFAULT);
580 prev = NULL; 635 prev = NULL;
581 for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) { 636 for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
637 unsigned long endvma = vma->vm_end;
638
639 if (endvma > end)
640 endvma = end;
641 if (vma->vm_start > start)
642 start = vma->vm_start;
643
582 if (!(flags & MPOL_MF_DISCONTIG_OK)) { 644 if (!(flags & MPOL_MF_DISCONTIG_OK)) {
583 if (!vma->vm_next && vma->vm_end < end) 645 if (!vma->vm_next && vma->vm_end < end)
584 return ERR_PTR(-EFAULT); 646 return ERR_PTR(-EFAULT);
585 if (prev && prev->vm_end < vma->vm_start) 647 if (prev && prev->vm_end < vma->vm_start)
586 return ERR_PTR(-EFAULT); 648 return ERR_PTR(-EFAULT);
587 } 649 }
588 if (!is_vm_hugetlb_page(vma) && 650
589 ((flags & MPOL_MF_STRICT) || 651 if (is_vm_hugetlb_page(vma))
652 goto next;
653
654 if (flags & MPOL_MF_LAZY) {
655 change_prot_numa(vma, start, endvma);
656 goto next;
657 }
658
659 if ((flags & MPOL_MF_STRICT) ||
590 ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) && 660 ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
591 vma_migratable(vma)))) { 661 vma_migratable(vma))) {
592 unsigned long endvma = vma->vm_end;
593 662
594 if (endvma > end)
595 endvma = end;
596 if (vma->vm_start > start)
597 start = vma->vm_start;
598 err = check_pgd_range(vma, start, endvma, nodes, 663 err = check_pgd_range(vma, start, endvma, nodes,
599 flags, private); 664 flags, private);
600 if (err) { 665 if (err) {
@@ -602,6 +667,7 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
602 break; 667 break;
603 } 668 }
604 } 669 }
670next:
605 prev = vma; 671 prev = vma;
606 } 672 }
607 return first; 673 return first;
@@ -961,7 +1027,8 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest,
961 1027
962 if (!list_empty(&pagelist)) { 1028 if (!list_empty(&pagelist)) {
963 err = migrate_pages(&pagelist, new_node_page, dest, 1029 err = migrate_pages(&pagelist, new_node_page, dest,
964 false, MIGRATE_SYNC); 1030 false, MIGRATE_SYNC,
1031 MR_SYSCALL);
965 if (err) 1032 if (err)
966 putback_lru_pages(&pagelist); 1033 putback_lru_pages(&pagelist);
967 } 1034 }
@@ -1133,8 +1200,7 @@ static long do_mbind(unsigned long start, unsigned long len,
1133 int err; 1200 int err;
1134 LIST_HEAD(pagelist); 1201 LIST_HEAD(pagelist);
1135 1202
1136 if (flags & ~(unsigned long)(MPOL_MF_STRICT | 1203 if (flags & ~(unsigned long)MPOL_MF_VALID)
1137 MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
1138 return -EINVAL; 1204 return -EINVAL;
1139 if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE)) 1205 if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
1140 return -EPERM; 1206 return -EPERM;
@@ -1157,6 +1223,9 @@ static long do_mbind(unsigned long start, unsigned long len,
1157 if (IS_ERR(new)) 1223 if (IS_ERR(new))
1158 return PTR_ERR(new); 1224 return PTR_ERR(new);
1159 1225
1226 if (flags & MPOL_MF_LAZY)
1227 new->flags |= MPOL_F_MOF;
1228
1160 /* 1229 /*
1161 * If we are using the default policy then operation 1230 * If we are using the default policy then operation
1162 * on discontinuous address spaces is okay after all 1231 * on discontinuous address spaces is okay after all
@@ -1193,21 +1262,24 @@ static long do_mbind(unsigned long start, unsigned long len,
1193 vma = check_range(mm, start, end, nmask, 1262 vma = check_range(mm, start, end, nmask,
1194 flags | MPOL_MF_INVERT, &pagelist); 1263 flags | MPOL_MF_INVERT, &pagelist);
1195 1264
1196 err = PTR_ERR(vma); 1265 err = PTR_ERR(vma); /* maybe ... */
1197 if (!IS_ERR(vma)) { 1266 if (!IS_ERR(vma))
1198 int nr_failed = 0;
1199
1200 err = mbind_range(mm, start, end, new); 1267 err = mbind_range(mm, start, end, new);
1201 1268
1269 if (!err) {
1270 int nr_failed = 0;
1271
1202 if (!list_empty(&pagelist)) { 1272 if (!list_empty(&pagelist)) {
1273 WARN_ON_ONCE(flags & MPOL_MF_LAZY);
1203 nr_failed = migrate_pages(&pagelist, new_vma_page, 1274 nr_failed = migrate_pages(&pagelist, new_vma_page,
1204 (unsigned long)vma, 1275 (unsigned long)vma,
1205 false, MIGRATE_SYNC); 1276 false, MIGRATE_SYNC,
1277 MR_MEMPOLICY_MBIND);
1206 if (nr_failed) 1278 if (nr_failed)
1207 putback_lru_pages(&pagelist); 1279 putback_lru_pages(&pagelist);
1208 } 1280 }
1209 1281
1210 if (!err && nr_failed && (flags & MPOL_MF_STRICT)) 1282 if (nr_failed && (flags & MPOL_MF_STRICT))
1211 err = -EIO; 1283 err = -EIO;
1212 } else 1284 } else
1213 putback_lru_pages(&pagelist); 1285 putback_lru_pages(&pagelist);
@@ -1388,7 +1460,7 @@ SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
1388 goto out_put; 1460 goto out_put;
1389 } 1461 }
1390 1462
1391 if (!nodes_subset(*new, node_states[N_HIGH_MEMORY])) { 1463 if (!nodes_subset(*new, node_states[N_MEMORY])) {
1392 err = -EINVAL; 1464 err = -EINVAL;
1393 goto out_put; 1465 goto out_put;
1394 } 1466 }
@@ -1546,7 +1618,7 @@ asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
1546struct mempolicy *get_vma_policy(struct task_struct *task, 1618struct mempolicy *get_vma_policy(struct task_struct *task,
1547 struct vm_area_struct *vma, unsigned long addr) 1619 struct vm_area_struct *vma, unsigned long addr)
1548{ 1620{
1549 struct mempolicy *pol = task->mempolicy; 1621 struct mempolicy *pol = get_task_policy(task);
1550 1622
1551 if (vma) { 1623 if (vma) {
1552 if (vma->vm_ops && vma->vm_ops->get_policy) { 1624 if (vma->vm_ops && vma->vm_ops->get_policy) {
@@ -1907,7 +1979,6 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
1907 unsigned long addr, int node) 1979 unsigned long addr, int node)
1908{ 1980{
1909 struct mempolicy *pol; 1981 struct mempolicy *pol;
1910 struct zonelist *zl;
1911 struct page *page; 1982 struct page *page;
1912 unsigned int cpuset_mems_cookie; 1983 unsigned int cpuset_mems_cookie;
1913 1984
@@ -1926,23 +1997,11 @@ retry_cpuset:
1926 1997
1927 return page; 1998 return page;
1928 } 1999 }
1929 zl = policy_zonelist(gfp, pol, node); 2000 page = __alloc_pages_nodemask(gfp, order,
1930 if (unlikely(mpol_needs_cond_ref(pol))) { 2001 policy_zonelist(gfp, pol, node),
1931 /*
1932 * slow path: ref counted shared policy
1933 */
1934 struct page *page = __alloc_pages_nodemask(gfp, order,
1935 zl, policy_nodemask(gfp, pol));
1936 __mpol_put(pol);
1937 if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
1938 goto retry_cpuset;
1939 return page;
1940 }
1941 /*
1942 * fast path: default or task policy
1943 */
1944 page = __alloc_pages_nodemask(gfp, order, zl,
1945 policy_nodemask(gfp, pol)); 2002 policy_nodemask(gfp, pol));
2003 if (unlikely(mpol_needs_cond_ref(pol)))
2004 __mpol_put(pol);
1946 if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page)) 2005 if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
1947 goto retry_cpuset; 2006 goto retry_cpuset;
1948 return page; 2007 return page;
@@ -1969,7 +2028,7 @@ retry_cpuset:
1969 */ 2028 */
1970struct page *alloc_pages_current(gfp_t gfp, unsigned order) 2029struct page *alloc_pages_current(gfp_t gfp, unsigned order)
1971{ 2030{
1972 struct mempolicy *pol = current->mempolicy; 2031 struct mempolicy *pol = get_task_policy(current);
1973 struct page *page; 2032 struct page *page;
1974 unsigned int cpuset_mems_cookie; 2033 unsigned int cpuset_mems_cookie;
1975 2034
@@ -2037,28 +2096,6 @@ struct mempolicy *__mpol_dup(struct mempolicy *old)
2037 return new; 2096 return new;
2038} 2097}
2039 2098
2040/*
2041 * If *frompol needs [has] an extra ref, copy *frompol to *tompol ,
2042 * eliminate the * MPOL_F_* flags that require conditional ref and
2043 * [NOTE!!!] drop the extra ref. Not safe to reference *frompol directly
2044 * after return. Use the returned value.
2045 *
2046 * Allows use of a mempolicy for, e.g., multiple allocations with a single
2047 * policy lookup, even if the policy needs/has extra ref on lookup.
2048 * shmem_readahead needs this.
2049 */
2050struct mempolicy *__mpol_cond_copy(struct mempolicy *tompol,
2051 struct mempolicy *frompol)
2052{
2053 if (!mpol_needs_cond_ref(frompol))
2054 return frompol;
2055
2056 *tompol = *frompol;
2057 tompol->flags &= ~MPOL_F_SHARED; /* copy doesn't need unref */
2058 __mpol_put(frompol);
2059 return tompol;
2060}
2061
2062/* Slow path of a mempolicy comparison */ 2099/* Slow path of a mempolicy comparison */
2063bool __mpol_equal(struct mempolicy *a, struct mempolicy *b) 2100bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
2064{ 2101{
@@ -2095,7 +2132,7 @@ bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
2095 */ 2132 */
2096 2133
2097/* lookup first element intersecting start-end */ 2134/* lookup first element intersecting start-end */
2098/* Caller holds sp->mutex */ 2135/* Caller holds sp->lock */
2099static struct sp_node * 2136static struct sp_node *
2100sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end) 2137sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
2101{ 2138{
@@ -2159,13 +2196,13 @@ mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
2159 2196
2160 if (!sp->root.rb_node) 2197 if (!sp->root.rb_node)
2161 return NULL; 2198 return NULL;
2162 mutex_lock(&sp->mutex); 2199 spin_lock(&sp->lock);
2163 sn = sp_lookup(sp, idx, idx+1); 2200 sn = sp_lookup(sp, idx, idx+1);
2164 if (sn) { 2201 if (sn) {
2165 mpol_get(sn->policy); 2202 mpol_get(sn->policy);
2166 pol = sn->policy; 2203 pol = sn->policy;
2167 } 2204 }
2168 mutex_unlock(&sp->mutex); 2205 spin_unlock(&sp->lock);
2169 return pol; 2206 return pol;
2170} 2207}
2171 2208
@@ -2175,6 +2212,115 @@ static void sp_free(struct sp_node *n)
2175 kmem_cache_free(sn_cache, n); 2212 kmem_cache_free(sn_cache, n);
2176} 2213}
2177 2214
2215/**
2216 * mpol_misplaced - check whether current page node is valid in policy
2217 *
2218 * @page - page to be checked
2219 * @vma - vm area where page mapped
2220 * @addr - virtual address where page mapped
2221 *
2222 * Lookup current policy node id for vma,addr and "compare to" page's
2223 * node id.
2224 *
2225 * Returns:
2226 * -1 - not misplaced, page is in the right node
2227 * node - node id where the page should be
2228 *
2229 * Policy determination "mimics" alloc_page_vma().
2230 * Called from fault path where we know the vma and faulting address.
2231 */
2232int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long addr)
2233{
2234 struct mempolicy *pol;
2235 struct zone *zone;
2236 int curnid = page_to_nid(page);
2237 unsigned long pgoff;
2238 int polnid = -1;
2239 int ret = -1;
2240
2241 BUG_ON(!vma);
2242
2243 pol = get_vma_policy(current, vma, addr);
2244 if (!(pol->flags & MPOL_F_MOF))
2245 goto out;
2246
2247 switch (pol->mode) {
2248 case MPOL_INTERLEAVE:
2249 BUG_ON(addr >= vma->vm_end);
2250 BUG_ON(addr < vma->vm_start);
2251
2252 pgoff = vma->vm_pgoff;
2253 pgoff += (addr - vma->vm_start) >> PAGE_SHIFT;
2254 polnid = offset_il_node(pol, vma, pgoff);
2255 break;
2256
2257 case MPOL_PREFERRED:
2258 if (pol->flags & MPOL_F_LOCAL)
2259 polnid = numa_node_id();
2260 else
2261 polnid = pol->v.preferred_node;
2262 break;
2263
2264 case MPOL_BIND:
2265 /*
2266 * allows binding to multiple nodes.
2267 * use current page if in policy nodemask,
2268 * else select nearest allowed node, if any.
2269 * If no allowed nodes, use current [!misplaced].
2270 */
2271 if (node_isset(curnid, pol->v.nodes))
2272 goto out;
2273 (void)first_zones_zonelist(
2274 node_zonelist(numa_node_id(), GFP_HIGHUSER),
2275 gfp_zone(GFP_HIGHUSER),
2276 &pol->v.nodes, &zone);
2277 polnid = zone->node;
2278 break;
2279
2280 default:
2281 BUG();
2282 }
2283
2284 /* Migrate the page towards the node whose CPU is referencing it */
2285 if (pol->flags & MPOL_F_MORON) {
2286 int last_nid;
2287
2288 polnid = numa_node_id();
2289
2290 /*
2291 * Multi-stage node selection is used in conjunction
2292 * with a periodic migration fault to build a temporal
2293 * task<->page relation. By using a two-stage filter we
2294 * remove short/unlikely relations.
2295 *
2296 * Using P(p) ~ n_p / n_t as per frequentist
2297 * probability, we can equate a task's usage of a
2298 * particular page (n_p) per total usage of this
2299 * page (n_t) (in a given time-span) to a probability.
2300 *
2301 * Our periodic faults will sample this probability and
2302 * getting the same result twice in a row, given these
2303 * samples are fully independent, is then given by
2304 * P(n)^2, provided our sample period is sufficiently
2305 * short compared to the usage pattern.
2306 *
2307 * This quadric squishes small probabilities, making
2308 * it less likely we act on an unlikely task<->page
2309 * relation.
2310 */
2311 last_nid = page_xchg_last_nid(page, polnid);
2312 if (last_nid != polnid)
2313 goto out;
2314 }
2315
2316 if (curnid != polnid)
2317 ret = polnid;
2318out:
2319 mpol_cond_put(pol);
2320
2321 return ret;
2322}
2323
2178static void sp_delete(struct shared_policy *sp, struct sp_node *n) 2324static void sp_delete(struct shared_policy *sp, struct sp_node *n)
2179{ 2325{
2180 pr_debug("deleting %lx-l%lx\n", n->start, n->end); 2326 pr_debug("deleting %lx-l%lx\n", n->start, n->end);
@@ -2182,6 +2328,14 @@ static void sp_delete(struct shared_policy *sp, struct sp_node *n)
2182 sp_free(n); 2328 sp_free(n);
2183} 2329}
2184 2330
2331static void sp_node_init(struct sp_node *node, unsigned long start,
2332 unsigned long end, struct mempolicy *pol)
2333{
2334 node->start = start;
2335 node->end = end;
2336 node->policy = pol;
2337}
2338
2185static struct sp_node *sp_alloc(unsigned long start, unsigned long end, 2339static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
2186 struct mempolicy *pol) 2340 struct mempolicy *pol)
2187{ 2341{
@@ -2198,10 +2352,7 @@ static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
2198 return NULL; 2352 return NULL;
2199 } 2353 }
2200 newpol->flags |= MPOL_F_SHARED; 2354 newpol->flags |= MPOL_F_SHARED;
2201 2355 sp_node_init(n, start, end, newpol);
2202 n->start = start;
2203 n->end = end;
2204 n->policy = newpol;
2205 2356
2206 return n; 2357 return n;
2207} 2358}
@@ -2211,9 +2362,12 @@ static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
2211 unsigned long end, struct sp_node *new) 2362 unsigned long end, struct sp_node *new)
2212{ 2363{
2213 struct sp_node *n; 2364 struct sp_node *n;
2365 struct sp_node *n_new = NULL;
2366 struct mempolicy *mpol_new = NULL;
2214 int ret = 0; 2367 int ret = 0;
2215 2368
2216 mutex_lock(&sp->mutex); 2369restart:
2370 spin_lock(&sp->lock);
2217 n = sp_lookup(sp, start, end); 2371 n = sp_lookup(sp, start, end);
2218 /* Take care of old policies in the same range. */ 2372 /* Take care of old policies in the same range. */
2219 while (n && n->start < end) { 2373 while (n && n->start < end) {
@@ -2226,14 +2380,16 @@ static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
2226 } else { 2380 } else {
2227 /* Old policy spanning whole new range. */ 2381 /* Old policy spanning whole new range. */
2228 if (n->end > end) { 2382 if (n->end > end) {
2229 struct sp_node *new2; 2383 if (!n_new)
2230 new2 = sp_alloc(end, n->end, n->policy); 2384 goto alloc_new;
2231 if (!new2) { 2385
2232 ret = -ENOMEM; 2386 *mpol_new = *n->policy;
2233 goto out; 2387 atomic_set(&mpol_new->refcnt, 1);
2234 } 2388 sp_node_init(n_new, n->end, end, mpol_new);
2389 sp_insert(sp, n_new);
2235 n->end = start; 2390 n->end = start;
2236 sp_insert(sp, new2); 2391 n_new = NULL;
2392 mpol_new = NULL;
2237 break; 2393 break;
2238 } else 2394 } else
2239 n->end = start; 2395 n->end = start;
@@ -2244,9 +2400,27 @@ static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
2244 } 2400 }
2245 if (new) 2401 if (new)
2246 sp_insert(sp, new); 2402 sp_insert(sp, new);
2247out: 2403 spin_unlock(&sp->lock);
2248 mutex_unlock(&sp->mutex); 2404 ret = 0;
2405
2406err_out:
2407 if (mpol_new)
2408 mpol_put(mpol_new);
2409 if (n_new)
2410 kmem_cache_free(sn_cache, n_new);
2411
2249 return ret; 2412 return ret;
2413
2414alloc_new:
2415 spin_unlock(&sp->lock);
2416 ret = -ENOMEM;
2417 n_new = kmem_cache_alloc(sn_cache, GFP_KERNEL);
2418 if (!n_new)
2419 goto err_out;
2420 mpol_new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
2421 if (!mpol_new)
2422 goto err_out;
2423 goto restart;
2250} 2424}
2251 2425
2252/** 2426/**
@@ -2264,7 +2438,7 @@ void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
2264 int ret; 2438 int ret;
2265 2439
2266 sp->root = RB_ROOT; /* empty tree == default mempolicy */ 2440 sp->root = RB_ROOT; /* empty tree == default mempolicy */
2267 mutex_init(&sp->mutex); 2441 spin_lock_init(&sp->lock);
2268 2442
2269 if (mpol) { 2443 if (mpol) {
2270 struct vm_area_struct pvma; 2444 struct vm_area_struct pvma;
@@ -2330,16 +2504,60 @@ void mpol_free_shared_policy(struct shared_policy *p)
2330 2504
2331 if (!p->root.rb_node) 2505 if (!p->root.rb_node)
2332 return; 2506 return;
2333 mutex_lock(&p->mutex); 2507 spin_lock(&p->lock);
2334 next = rb_first(&p->root); 2508 next = rb_first(&p->root);
2335 while (next) { 2509 while (next) {
2336 n = rb_entry(next, struct sp_node, nd); 2510 n = rb_entry(next, struct sp_node, nd);
2337 next = rb_next(&n->nd); 2511 next = rb_next(&n->nd);
2338 sp_delete(p, n); 2512 sp_delete(p, n);
2339 } 2513 }
2340 mutex_unlock(&p->mutex); 2514 spin_unlock(&p->lock);
2515}
2516
2517#ifdef CONFIG_NUMA_BALANCING
2518static bool __initdata numabalancing_override;
2519
2520static void __init check_numabalancing_enable(void)
2521{
2522 bool numabalancing_default = false;
2523
2524 if (IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED))
2525 numabalancing_default = true;
2526
2527 if (nr_node_ids > 1 && !numabalancing_override) {
2528 printk(KERN_INFO "Enabling automatic NUMA balancing. "
2529 "Configure with numa_balancing= or sysctl");
2530 set_numabalancing_state(numabalancing_default);
2531 }
2341} 2532}
2342 2533
2534static int __init setup_numabalancing(char *str)
2535{
2536 int ret = 0;
2537 if (!str)
2538 goto out;
2539 numabalancing_override = true;
2540
2541 if (!strcmp(str, "enable")) {
2542 set_numabalancing_state(true);
2543 ret = 1;
2544 } else if (!strcmp(str, "disable")) {
2545 set_numabalancing_state(false);
2546 ret = 1;
2547 }
2548out:
2549 if (!ret)
2550 printk(KERN_WARNING "Unable to parse numa_balancing=\n");
2551
2552 return ret;
2553}
2554__setup("numa_balancing=", setup_numabalancing);
2555#else
2556static inline void __init check_numabalancing_enable(void)
2557{
2558}
2559#endif /* CONFIG_NUMA_BALANCING */
2560
2343/* assumes fs == KERNEL_DS */ 2561/* assumes fs == KERNEL_DS */
2344void __init numa_policy_init(void) 2562void __init numa_policy_init(void)
2345{ 2563{
@@ -2355,13 +2573,22 @@ void __init numa_policy_init(void)
2355 sizeof(struct sp_node), 2573 sizeof(struct sp_node),
2356 0, SLAB_PANIC, NULL); 2574 0, SLAB_PANIC, NULL);
2357 2575
2576 for_each_node(nid) {
2577 preferred_node_policy[nid] = (struct mempolicy) {
2578 .refcnt = ATOMIC_INIT(1),
2579 .mode = MPOL_PREFERRED,
2580 .flags = MPOL_F_MOF | MPOL_F_MORON,
2581 .v = { .preferred_node = nid, },
2582 };
2583 }
2584
2358 /* 2585 /*
2359 * Set interleaving policy for system init. Interleaving is only 2586 * Set interleaving policy for system init. Interleaving is only
2360 * enabled across suitably sized nodes (default is >= 16MB), or 2587 * enabled across suitably sized nodes (default is >= 16MB), or
2361 * fall back to the largest node if they're all smaller. 2588 * fall back to the largest node if they're all smaller.
2362 */ 2589 */
2363 nodes_clear(interleave_nodes); 2590 nodes_clear(interleave_nodes);
2364 for_each_node_state(nid, N_HIGH_MEMORY) { 2591 for_each_node_state(nid, N_MEMORY) {
2365 unsigned long total_pages = node_present_pages(nid); 2592 unsigned long total_pages = node_present_pages(nid);
2366 2593
2367 /* Preserve the largest node */ 2594 /* Preserve the largest node */
@@ -2381,6 +2608,8 @@ void __init numa_policy_init(void)
2381 2608
2382 if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes)) 2609 if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
2383 printk("numa_policy_init: interleaving failed\n"); 2610 printk("numa_policy_init: interleaving failed\n");
2611
2612 check_numabalancing_enable();
2384} 2613}
2385 2614
2386/* Reset policy of current process to default */ 2615/* Reset policy of current process to default */
@@ -2394,44 +2623,34 @@ void numa_default_policy(void)
2394 */ 2623 */
2395 2624
2396/* 2625/*
2397 * "local" is pseudo-policy: MPOL_PREFERRED with MPOL_F_LOCAL flag 2626 * "local" is implemented internally by MPOL_PREFERRED with MPOL_F_LOCAL flag.
2398 * Used only for mpol_parse_str() and mpol_to_str()
2399 */ 2627 */
2400#define MPOL_LOCAL MPOL_MAX
2401static const char * const policy_modes[] = 2628static const char * const policy_modes[] =
2402{ 2629{
2403 [MPOL_DEFAULT] = "default", 2630 [MPOL_DEFAULT] = "default",
2404 [MPOL_PREFERRED] = "prefer", 2631 [MPOL_PREFERRED] = "prefer",
2405 [MPOL_BIND] = "bind", 2632 [MPOL_BIND] = "bind",
2406 [MPOL_INTERLEAVE] = "interleave", 2633 [MPOL_INTERLEAVE] = "interleave",
2407 [MPOL_LOCAL] = "local" 2634 [MPOL_LOCAL] = "local",
2408}; 2635};
2409 2636
2410 2637
2411#ifdef CONFIG_TMPFS 2638#ifdef CONFIG_TMPFS
2412/** 2639/**
2413 * mpol_parse_str - parse string to mempolicy 2640 * mpol_parse_str - parse string to mempolicy, for tmpfs mpol mount option.
2414 * @str: string containing mempolicy to parse 2641 * @str: string containing mempolicy to parse
2415 * @mpol: pointer to struct mempolicy pointer, returned on success. 2642 * @mpol: pointer to struct mempolicy pointer, returned on success.
2416 * @no_context: flag whether to "contextualize" the mempolicy
2417 * 2643 *
2418 * Format of input: 2644 * Format of input:
2419 * <mode>[=<flags>][:<nodelist>] 2645 * <mode>[=<flags>][:<nodelist>]
2420 * 2646 *
2421 * if @no_context is true, save the input nodemask in w.user_nodemask in
2422 * the returned mempolicy. This will be used to "clone" the mempolicy in
2423 * a specific context [cpuset] at a later time. Used to parse tmpfs mpol
2424 * mount option. Note that if 'static' or 'relative' mode flags were
2425 * specified, the input nodemask will already have been saved. Saving
2426 * it again is redundant, but safe.
2427 *
2428 * On success, returns 0, else 1 2647 * On success, returns 0, else 1
2429 */ 2648 */
2430int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context) 2649int mpol_parse_str(char *str, struct mempolicy **mpol)
2431{ 2650{
2432 struct mempolicy *new = NULL; 2651 struct mempolicy *new = NULL;
2433 unsigned short mode; 2652 unsigned short mode;
2434 unsigned short uninitialized_var(mode_flags); 2653 unsigned short mode_flags;
2435 nodemask_t nodes; 2654 nodemask_t nodes;
2436 char *nodelist = strchr(str, ':'); 2655 char *nodelist = strchr(str, ':');
2437 char *flags = strchr(str, '='); 2656 char *flags = strchr(str, '=');
@@ -2442,7 +2661,7 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
2442 *nodelist++ = '\0'; 2661 *nodelist++ = '\0';
2443 if (nodelist_parse(nodelist, nodes)) 2662 if (nodelist_parse(nodelist, nodes))
2444 goto out; 2663 goto out;
2445 if (!nodes_subset(nodes, node_states[N_HIGH_MEMORY])) 2664 if (!nodes_subset(nodes, node_states[N_MEMORY]))
2446 goto out; 2665 goto out;
2447 } else 2666 } else
2448 nodes_clear(nodes); 2667 nodes_clear(nodes);
@@ -2450,12 +2669,12 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
2450 if (flags) 2669 if (flags)
2451 *flags++ = '\0'; /* terminate mode string */ 2670 *flags++ = '\0'; /* terminate mode string */
2452 2671
2453 for (mode = 0; mode <= MPOL_LOCAL; mode++) { 2672 for (mode = 0; mode < MPOL_MAX; mode++) {
2454 if (!strcmp(str, policy_modes[mode])) { 2673 if (!strcmp(str, policy_modes[mode])) {
2455 break; 2674 break;
2456 } 2675 }
2457 } 2676 }
2458 if (mode > MPOL_LOCAL) 2677 if (mode >= MPOL_MAX)
2459 goto out; 2678 goto out;
2460 2679
2461 switch (mode) { 2680 switch (mode) {
@@ -2476,7 +2695,7 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
2476 * Default to online nodes with memory if no nodelist 2695 * Default to online nodes with memory if no nodelist
2477 */ 2696 */
2478 if (!nodelist) 2697 if (!nodelist)
2479 nodes = node_states[N_HIGH_MEMORY]; 2698 nodes = node_states[N_MEMORY];
2480 break; 2699 break;
2481 case MPOL_LOCAL: 2700 case MPOL_LOCAL:
2482 /* 2701 /*
@@ -2519,24 +2738,23 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
2519 if (IS_ERR(new)) 2738 if (IS_ERR(new))
2520 goto out; 2739 goto out;
2521 2740
2522 if (no_context) { 2741 /*
2523 /* save for contextualization */ 2742 * Save nodes for mpol_to_str() to show the tmpfs mount options
2524 new->w.user_nodemask = nodes; 2743 * for /proc/mounts, /proc/pid/mounts and /proc/pid/mountinfo.
2525 } else { 2744 */
2526 int ret; 2745 if (mode != MPOL_PREFERRED)
2527 NODEMASK_SCRATCH(scratch); 2746 new->v.nodes = nodes;
2528 if (scratch) { 2747 else if (nodelist)
2529 task_lock(current); 2748 new->v.preferred_node = first_node(nodes);
2530 ret = mpol_set_nodemask(new, &nodes, scratch); 2749 else
2531 task_unlock(current); 2750 new->flags |= MPOL_F_LOCAL;
2532 } else 2751
2533 ret = -ENOMEM; 2752 /*
2534 NODEMASK_SCRATCH_FREE(scratch); 2753 * Save nodes for contextualization: this will be used to "clone"
2535 if (ret) { 2754 * the mempolicy in a specific context [cpuset] at a later time.
2536 mpol_put(new); 2755 */
2537 goto out; 2756 new->w.user_nodemask = nodes;
2538 } 2757
2539 }
2540 err = 0; 2758 err = 0;
2541 2759
2542out: 2760out:
@@ -2556,13 +2774,12 @@ out:
2556 * @buffer: to contain formatted mempolicy string 2774 * @buffer: to contain formatted mempolicy string
2557 * @maxlen: length of @buffer 2775 * @maxlen: length of @buffer
2558 * @pol: pointer to mempolicy to be formatted 2776 * @pol: pointer to mempolicy to be formatted
2559 * @no_context: "context free" mempolicy - use nodemask in w.user_nodemask
2560 * 2777 *
2561 * Convert a mempolicy into a string. 2778 * Convert a mempolicy into a string.
2562 * Returns the number of characters in buffer (if positive) 2779 * Returns the number of characters in buffer (if positive)
2563 * or an error (negative) 2780 * or an error (negative)
2564 */ 2781 */
2565int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol, int no_context) 2782int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
2566{ 2783{
2567 char *p = buffer; 2784 char *p = buffer;
2568 int l; 2785 int l;
@@ -2588,7 +2805,7 @@ int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol, int no_context)
2588 case MPOL_PREFERRED: 2805 case MPOL_PREFERRED:
2589 nodes_clear(nodes); 2806 nodes_clear(nodes);
2590 if (flags & MPOL_F_LOCAL) 2807 if (flags & MPOL_F_LOCAL)
2591 mode = MPOL_LOCAL; /* pseudo-policy */ 2808 mode = MPOL_LOCAL;
2592 else 2809 else
2593 node_set(pol->v.preferred_node, nodes); 2810 node_set(pol->v.preferred_node, nodes);
2594 break; 2811 break;
@@ -2596,10 +2813,7 @@ int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol, int no_context)
2596 case MPOL_BIND: 2813 case MPOL_BIND:
2597 /* Fall through */ 2814 /* Fall through */
2598 case MPOL_INTERLEAVE: 2815 case MPOL_INTERLEAVE:
2599 if (no_context) 2816 nodes = pol->v.nodes;
2600 nodes = pol->w.user_nodemask;
2601 else
2602 nodes = pol->v.nodes;
2603 break; 2817 break;
2604 2818
2605 default: 2819 default:
diff --git a/mm/migrate.c b/mm/migrate.c
index 77ed2d773705..c38778610aa8 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -35,9 +35,13 @@
35#include <linux/hugetlb.h> 35#include <linux/hugetlb.h>
36#include <linux/hugetlb_cgroup.h> 36#include <linux/hugetlb_cgroup.h>
37#include <linux/gfp.h> 37#include <linux/gfp.h>
38#include <linux/balloon_compaction.h>
38 39
39#include <asm/tlbflush.h> 40#include <asm/tlbflush.h>
40 41
42#define CREATE_TRACE_POINTS
43#include <trace/events/migrate.h>
44
41#include "internal.h" 45#include "internal.h"
42 46
43/* 47/*
@@ -79,7 +83,30 @@ void putback_lru_pages(struct list_head *l)
79 list_del(&page->lru); 83 list_del(&page->lru);
80 dec_zone_page_state(page, NR_ISOLATED_ANON + 84 dec_zone_page_state(page, NR_ISOLATED_ANON +
81 page_is_file_cache(page)); 85 page_is_file_cache(page));
82 putback_lru_page(page); 86 putback_lru_page(page);
87 }
88}
89
90/*
91 * Put previously isolated pages back onto the appropriate lists
92 * from where they were once taken off for compaction/migration.
93 *
94 * This function shall be used instead of putback_lru_pages(),
95 * whenever the isolated pageset has been built by isolate_migratepages_range()
96 */
97void putback_movable_pages(struct list_head *l)
98{
99 struct page *page;
100 struct page *page2;
101
102 list_for_each_entry_safe(page, page2, l, lru) {
103 list_del(&page->lru);
104 dec_zone_page_state(page, NR_ISOLATED_ANON +
105 page_is_file_cache(page));
106 if (unlikely(balloon_page_movable(page)))
107 balloon_page_putback(page);
108 else
109 putback_lru_page(page);
83 } 110 }
84} 111}
85 112
@@ -91,8 +118,6 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma,
91{ 118{
92 struct mm_struct *mm = vma->vm_mm; 119 struct mm_struct *mm = vma->vm_mm;
93 swp_entry_t entry; 120 swp_entry_t entry;
94 pgd_t *pgd;
95 pud_t *pud;
96 pmd_t *pmd; 121 pmd_t *pmd;
97 pte_t *ptep, pte; 122 pte_t *ptep, pte;
98 spinlock_t *ptl; 123 spinlock_t *ptl;
@@ -103,19 +128,11 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma,
103 goto out; 128 goto out;
104 ptl = &mm->page_table_lock; 129 ptl = &mm->page_table_lock;
105 } else { 130 } else {
106 pgd = pgd_offset(mm, addr); 131 pmd = mm_find_pmd(mm, addr);
107 if (!pgd_present(*pgd)) 132 if (!pmd)
108 goto out;
109
110 pud = pud_offset(pgd, addr);
111 if (!pud_present(*pud))
112 goto out; 133 goto out;
113
114 pmd = pmd_offset(pud, addr);
115 if (pmd_trans_huge(*pmd)) 134 if (pmd_trans_huge(*pmd))
116 goto out; 135 goto out;
117 if (!pmd_present(*pmd))
118 goto out;
119 136
120 ptep = pte_offset_map(pmd, addr); 137 ptep = pte_offset_map(pmd, addr);
121 138
@@ -279,14 +296,14 @@ static int migrate_page_move_mapping(struct address_space *mapping,
279 struct page *newpage, struct page *page, 296 struct page *newpage, struct page *page,
280 struct buffer_head *head, enum migrate_mode mode) 297 struct buffer_head *head, enum migrate_mode mode)
281{ 298{
282 int expected_count; 299 int expected_count = 0;
283 void **pslot; 300 void **pslot;
284 301
285 if (!mapping) { 302 if (!mapping) {
286 /* Anonymous page without mapping */ 303 /* Anonymous page without mapping */
287 if (page_count(page) != 1) 304 if (page_count(page) != 1)
288 return -EAGAIN; 305 return -EAGAIN;
289 return 0; 306 return MIGRATEPAGE_SUCCESS;
290 } 307 }
291 308
292 spin_lock_irq(&mapping->tree_lock); 309 spin_lock_irq(&mapping->tree_lock);
@@ -356,7 +373,7 @@ static int migrate_page_move_mapping(struct address_space *mapping,
356 } 373 }
357 spin_unlock_irq(&mapping->tree_lock); 374 spin_unlock_irq(&mapping->tree_lock);
358 375
359 return 0; 376 return MIGRATEPAGE_SUCCESS;
360} 377}
361 378
362/* 379/*
@@ -372,7 +389,7 @@ int migrate_huge_page_move_mapping(struct address_space *mapping,
372 if (!mapping) { 389 if (!mapping) {
373 if (page_count(page) != 1) 390 if (page_count(page) != 1)
374 return -EAGAIN; 391 return -EAGAIN;
375 return 0; 392 return MIGRATEPAGE_SUCCESS;
376 } 393 }
377 394
378 spin_lock_irq(&mapping->tree_lock); 395 spin_lock_irq(&mapping->tree_lock);
@@ -399,7 +416,7 @@ int migrate_huge_page_move_mapping(struct address_space *mapping,
399 page_unfreeze_refs(page, expected_count - 1); 416 page_unfreeze_refs(page, expected_count - 1);
400 417
401 spin_unlock_irq(&mapping->tree_lock); 418 spin_unlock_irq(&mapping->tree_lock);
402 return 0; 419 return MIGRATEPAGE_SUCCESS;
403} 420}
404 421
405/* 422/*
@@ -407,7 +424,7 @@ int migrate_huge_page_move_mapping(struct address_space *mapping,
407 */ 424 */
408void migrate_page_copy(struct page *newpage, struct page *page) 425void migrate_page_copy(struct page *newpage, struct page *page)
409{ 426{
410 if (PageHuge(page)) 427 if (PageHuge(page) || PageTransHuge(page))
411 copy_huge_page(newpage, page); 428 copy_huge_page(newpage, page);
412 else 429 else
413 copy_highpage(newpage, page); 430 copy_highpage(newpage, page);
@@ -486,11 +503,11 @@ int migrate_page(struct address_space *mapping,
486 503
487 rc = migrate_page_move_mapping(mapping, newpage, page, NULL, mode); 504 rc = migrate_page_move_mapping(mapping, newpage, page, NULL, mode);
488 505
489 if (rc) 506 if (rc != MIGRATEPAGE_SUCCESS)
490 return rc; 507 return rc;
491 508
492 migrate_page_copy(newpage, page); 509 migrate_page_copy(newpage, page);
493 return 0; 510 return MIGRATEPAGE_SUCCESS;
494} 511}
495EXPORT_SYMBOL(migrate_page); 512EXPORT_SYMBOL(migrate_page);
496 513
@@ -513,7 +530,7 @@ int buffer_migrate_page(struct address_space *mapping,
513 530
514 rc = migrate_page_move_mapping(mapping, newpage, page, head, mode); 531 rc = migrate_page_move_mapping(mapping, newpage, page, head, mode);
515 532
516 if (rc) 533 if (rc != MIGRATEPAGE_SUCCESS)
517 return rc; 534 return rc;
518 535
519 /* 536 /*
@@ -549,7 +566,7 @@ int buffer_migrate_page(struct address_space *mapping,
549 566
550 } while (bh != head); 567 } while (bh != head);
551 568
552 return 0; 569 return MIGRATEPAGE_SUCCESS;
553} 570}
554EXPORT_SYMBOL(buffer_migrate_page); 571EXPORT_SYMBOL(buffer_migrate_page);
555#endif 572#endif
@@ -628,7 +645,7 @@ static int fallback_migrate_page(struct address_space *mapping,
628 * 645 *
629 * Return value: 646 * Return value:
630 * < 0 - error code 647 * < 0 - error code
631 * == 0 - success 648 * MIGRATEPAGE_SUCCESS - success
632 */ 649 */
633static int move_to_new_page(struct page *newpage, struct page *page, 650static int move_to_new_page(struct page *newpage, struct page *page,
634 int remap_swapcache, enum migrate_mode mode) 651 int remap_swapcache, enum migrate_mode mode)
@@ -665,7 +682,7 @@ static int move_to_new_page(struct page *newpage, struct page *page,
665 else 682 else
666 rc = fallback_migrate_page(mapping, newpage, page, mode); 683 rc = fallback_migrate_page(mapping, newpage, page, mode);
667 684
668 if (rc) { 685 if (rc != MIGRATEPAGE_SUCCESS) {
669 newpage->mapping = NULL; 686 newpage->mapping = NULL;
670 } else { 687 } else {
671 if (remap_swapcache) 688 if (remap_swapcache)
@@ -751,7 +768,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
751 */ 768 */
752 if (PageAnon(page)) { 769 if (PageAnon(page)) {
753 /* 770 /*
754 * Only page_lock_anon_vma() understands the subtleties of 771 * Only page_lock_anon_vma_read() understands the subtleties of
755 * getting a hold on an anon_vma from outside one of its mms. 772 * getting a hold on an anon_vma from outside one of its mms.
756 */ 773 */
757 anon_vma = page_get_anon_vma(page); 774 anon_vma = page_get_anon_vma(page);
@@ -778,6 +795,18 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
778 } 795 }
779 } 796 }
780 797
798 if (unlikely(balloon_page_movable(page))) {
799 /*
800 * A ballooned page does not need any special attention from
801 * physical to virtual reverse mapping procedures.
802 * Skip any attempt to unmap PTEs or to remap swap cache,
803 * in order to avoid burning cycles at rmap level, and perform
804 * the page migration right away (proteced by page lock).
805 */
806 rc = balloon_page_migrate(newpage, page, mode);
807 goto uncharge;
808 }
809
781 /* 810 /*
782 * Corner case handling: 811 * Corner case handling:
783 * 1. When a new swap-cache page is read into, it is added to the LRU 812 * 1. When a new swap-cache page is read into, it is added to the LRU
@@ -814,7 +843,9 @@ skip_unmap:
814 put_anon_vma(anon_vma); 843 put_anon_vma(anon_vma);
815 844
816uncharge: 845uncharge:
817 mem_cgroup_end_migration(mem, page, newpage, rc == 0); 846 mem_cgroup_end_migration(mem, page, newpage,
847 (rc == MIGRATEPAGE_SUCCESS ||
848 rc == MIGRATEPAGE_BALLOON_SUCCESS));
818unlock: 849unlock:
819 unlock_page(page); 850 unlock_page(page);
820out: 851out:
@@ -846,6 +877,18 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
846 goto out; 877 goto out;
847 878
848 rc = __unmap_and_move(page, newpage, force, offlining, mode); 879 rc = __unmap_and_move(page, newpage, force, offlining, mode);
880
881 if (unlikely(rc == MIGRATEPAGE_BALLOON_SUCCESS)) {
882 /*
883 * A ballooned page has been migrated already.
884 * Now, it's the time to wrap-up counters,
885 * handle the page back to Buddy and return.
886 */
887 dec_zone_page_state(page, NR_ISOLATED_ANON +
888 page_is_file_cache(page));
889 balloon_page_free(page);
890 return MIGRATEPAGE_SUCCESS;
891 }
849out: 892out:
850 if (rc != -EAGAIN) { 893 if (rc != -EAGAIN) {
851 /* 894 /*
@@ -958,10 +1001,11 @@ out:
958 */ 1001 */
959int migrate_pages(struct list_head *from, 1002int migrate_pages(struct list_head *from,
960 new_page_t get_new_page, unsigned long private, bool offlining, 1003 new_page_t get_new_page, unsigned long private, bool offlining,
961 enum migrate_mode mode) 1004 enum migrate_mode mode, int reason)
962{ 1005{
963 int retry = 1; 1006 int retry = 1;
964 int nr_failed = 0; 1007 int nr_failed = 0;
1008 int nr_succeeded = 0;
965 int pass = 0; 1009 int pass = 0;
966 struct page *page; 1010 struct page *page;
967 struct page *page2; 1011 struct page *page2;
@@ -987,7 +1031,8 @@ int migrate_pages(struct list_head *from,
987 case -EAGAIN: 1031 case -EAGAIN:
988 retry++; 1032 retry++;
989 break; 1033 break;
990 case 0: 1034 case MIGRATEPAGE_SUCCESS:
1035 nr_succeeded++;
991 break; 1036 break;
992 default: 1037 default:
993 /* Permanent failure */ 1038 /* Permanent failure */
@@ -996,15 +1041,18 @@ int migrate_pages(struct list_head *from,
996 } 1041 }
997 } 1042 }
998 } 1043 }
999 rc = 0; 1044 rc = nr_failed + retry;
1000out: 1045out:
1046 if (nr_succeeded)
1047 count_vm_events(PGMIGRATE_SUCCESS, nr_succeeded);
1048 if (nr_failed)
1049 count_vm_events(PGMIGRATE_FAIL, nr_failed);
1050 trace_mm_migrate_pages(nr_succeeded, nr_failed, mode, reason);
1051
1001 if (!swapwrite) 1052 if (!swapwrite)
1002 current->flags &= ~PF_SWAPWRITE; 1053 current->flags &= ~PF_SWAPWRITE;
1003 1054
1004 if (rc) 1055 return rc;
1005 return rc;
1006
1007 return nr_failed + retry;
1008} 1056}
1009 1057
1010int migrate_huge_page(struct page *hpage, new_page_t get_new_page, 1058int migrate_huge_page(struct page *hpage, new_page_t get_new_page,
@@ -1024,7 +1072,7 @@ int migrate_huge_page(struct page *hpage, new_page_t get_new_page,
1024 /* try again */ 1072 /* try again */
1025 cond_resched(); 1073 cond_resched();
1026 break; 1074 break;
1027 case 0: 1075 case MIGRATEPAGE_SUCCESS:
1028 goto out; 1076 goto out;
1029 default: 1077 default:
1030 rc = -EIO; 1078 rc = -EIO;
@@ -1139,7 +1187,8 @@ set_status:
1139 err = 0; 1187 err = 0;
1140 if (!list_empty(&pagelist)) { 1188 if (!list_empty(&pagelist)) {
1141 err = migrate_pages(&pagelist, new_page_node, 1189 err = migrate_pages(&pagelist, new_page_node,
1142 (unsigned long)pm, 0, MIGRATE_SYNC); 1190 (unsigned long)pm, 0, MIGRATE_SYNC,
1191 MR_SYSCALL);
1143 if (err) 1192 if (err)
1144 putback_lru_pages(&pagelist); 1193 putback_lru_pages(&pagelist);
1145 } 1194 }
@@ -1201,7 +1250,7 @@ static int do_pages_move(struct mm_struct *mm, nodemask_t task_nodes,
1201 if (node < 0 || node >= MAX_NUMNODES) 1250 if (node < 0 || node >= MAX_NUMNODES)
1202 goto out_pm; 1251 goto out_pm;
1203 1252
1204 if (!node_state(node, N_HIGH_MEMORY)) 1253 if (!node_state(node, N_MEMORY))
1205 goto out_pm; 1254 goto out_pm;
1206 1255
1207 err = -EACCES; 1256 err = -EACCES;
@@ -1403,4 +1452,329 @@ int migrate_vmas(struct mm_struct *mm, const nodemask_t *to,
1403 } 1452 }
1404 return err; 1453 return err;
1405} 1454}
1406#endif 1455
1456#ifdef CONFIG_NUMA_BALANCING
1457/*
1458 * Returns true if this is a safe migration target node for misplaced NUMA
1459 * pages. Currently it only checks the watermarks which crude
1460 */
1461static bool migrate_balanced_pgdat(struct pglist_data *pgdat,
1462 int nr_migrate_pages)
1463{
1464 int z;
1465 for (z = pgdat->nr_zones - 1; z >= 0; z--) {
1466 struct zone *zone = pgdat->node_zones + z;
1467
1468 if (!populated_zone(zone))
1469 continue;
1470
1471 if (zone->all_unreclaimable)
1472 continue;
1473
1474 /* Avoid waking kswapd by allocating pages_to_migrate pages. */
1475 if (!zone_watermark_ok(zone, 0,
1476 high_wmark_pages(zone) +
1477 nr_migrate_pages,
1478 0, 0))
1479 continue;
1480 return true;
1481 }
1482 return false;
1483}
1484
1485static struct page *alloc_misplaced_dst_page(struct page *page,
1486 unsigned long data,
1487 int **result)
1488{
1489 int nid = (int) data;
1490 struct page *newpage;
1491
1492 newpage = alloc_pages_exact_node(nid,
1493 (GFP_HIGHUSER_MOVABLE | GFP_THISNODE |
1494 __GFP_NOMEMALLOC | __GFP_NORETRY |
1495 __GFP_NOWARN) &
1496 ~GFP_IOFS, 0);
1497 if (newpage)
1498 page_xchg_last_nid(newpage, page_last_nid(page));
1499
1500 return newpage;
1501}
1502
1503/*
1504 * page migration rate limiting control.
1505 * Do not migrate more than @pages_to_migrate in a @migrate_interval_millisecs
1506 * window of time. Default here says do not migrate more than 1280M per second.
1507 * If a node is rate-limited then PTE NUMA updates are also rate-limited. However
1508 * as it is faults that reset the window, pte updates will happen unconditionally
1509 * if there has not been a fault since @pteupdate_interval_millisecs after the
1510 * throttle window closed.
1511 */
1512static unsigned int migrate_interval_millisecs __read_mostly = 100;
1513static unsigned int pteupdate_interval_millisecs __read_mostly = 1000;
1514static unsigned int ratelimit_pages __read_mostly = 128 << (20 - PAGE_SHIFT);
1515
1516/* Returns true if NUMA migration is currently rate limited */
1517bool migrate_ratelimited(int node)
1518{
1519 pg_data_t *pgdat = NODE_DATA(node);
1520
1521 if (time_after(jiffies, pgdat->numabalancing_migrate_next_window +
1522 msecs_to_jiffies(pteupdate_interval_millisecs)))
1523 return false;
1524
1525 if (pgdat->numabalancing_migrate_nr_pages < ratelimit_pages)
1526 return false;
1527
1528 return true;
1529}
1530
1531/* Returns true if the node is migrate rate-limited after the update */
1532bool numamigrate_update_ratelimit(pg_data_t *pgdat, unsigned long nr_pages)
1533{
1534 bool rate_limited = false;
1535
1536 /*
1537 * Rate-limit the amount of data that is being migrated to a node.
1538 * Optimal placement is no good if the memory bus is saturated and
1539 * all the time is being spent migrating!
1540 */
1541 spin_lock(&pgdat->numabalancing_migrate_lock);
1542 if (time_after(jiffies, pgdat->numabalancing_migrate_next_window)) {
1543 pgdat->numabalancing_migrate_nr_pages = 0;
1544 pgdat->numabalancing_migrate_next_window = jiffies +
1545 msecs_to_jiffies(migrate_interval_millisecs);
1546 }
1547 if (pgdat->numabalancing_migrate_nr_pages > ratelimit_pages)
1548 rate_limited = true;
1549 else
1550 pgdat->numabalancing_migrate_nr_pages += nr_pages;
1551 spin_unlock(&pgdat->numabalancing_migrate_lock);
1552
1553 return rate_limited;
1554}
1555
1556int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page)
1557{
1558 int ret = 0;
1559
1560 /* Avoid migrating to a node that is nearly full */
1561 if (migrate_balanced_pgdat(pgdat, 1)) {
1562 int page_lru;
1563
1564 if (isolate_lru_page(page)) {
1565 put_page(page);
1566 return 0;
1567 }
1568
1569 /* Page is isolated */
1570 ret = 1;
1571 page_lru = page_is_file_cache(page);
1572 if (!PageTransHuge(page))
1573 inc_zone_page_state(page, NR_ISOLATED_ANON + page_lru);
1574 else
1575 mod_zone_page_state(page_zone(page),
1576 NR_ISOLATED_ANON + page_lru,
1577 HPAGE_PMD_NR);
1578 }
1579
1580 /*
1581 * Page is either isolated or there is not enough space on the target
1582 * node. If isolated, then it has taken a reference count and the
1583 * callers reference can be safely dropped without the page
1584 * disappearing underneath us during migration. Otherwise the page is
1585 * not to be migrated but the callers reference should still be
1586 * dropped so it does not leak.
1587 */
1588 put_page(page);
1589
1590 return ret;
1591}
1592
1593/*
1594 * Attempt to migrate a misplaced page to the specified destination
1595 * node. Caller is expected to have an elevated reference count on
1596 * the page that will be dropped by this function before returning.
1597 */
1598int migrate_misplaced_page(struct page *page, int node)
1599{
1600 pg_data_t *pgdat = NODE_DATA(node);
1601 int isolated = 0;
1602 int nr_remaining;
1603 LIST_HEAD(migratepages);
1604
1605 /*
1606 * Don't migrate pages that are mapped in multiple processes.
1607 * TODO: Handle false sharing detection instead of this hammer
1608 */
1609 if (page_mapcount(page) != 1) {
1610 put_page(page);
1611 goto out;
1612 }
1613
1614 /*
1615 * Rate-limit the amount of data that is being migrated to a node.
1616 * Optimal placement is no good if the memory bus is saturated and
1617 * all the time is being spent migrating!
1618 */
1619 if (numamigrate_update_ratelimit(pgdat, 1)) {
1620 put_page(page);
1621 goto out;
1622 }
1623
1624 isolated = numamigrate_isolate_page(pgdat, page);
1625 if (!isolated)
1626 goto out;
1627
1628 list_add(&page->lru, &migratepages);
1629 nr_remaining = migrate_pages(&migratepages,
1630 alloc_misplaced_dst_page,
1631 node, false, MIGRATE_ASYNC,
1632 MR_NUMA_MISPLACED);
1633 if (nr_remaining) {
1634 putback_lru_pages(&migratepages);
1635 isolated = 0;
1636 } else
1637 count_vm_numa_event(NUMA_PAGE_MIGRATE);
1638 BUG_ON(!list_empty(&migratepages));
1639out:
1640 return isolated;
1641}
1642#endif /* CONFIG_NUMA_BALANCING */
1643
1644#if defined(CONFIG_NUMA_BALANCING) && defined(CONFIG_TRANSPARENT_HUGEPAGE)
/*
 * Migrate a misplaced transparent huge page to @node without splitting
 * it.  @entry is the pmd value the caller observed; it is re-checked
 * under mm->page_table_lock before the new page is wired in, and the
 * whole copy is rolled back if the pmd changed in the meantime.
 * Returns non-zero only on a fully successful replacement; all failure
 * paths return 0.
 *
 * NOTE(review): the unlock_page(page) calls on the success and rollback
 * paths, and the out_keep_locked label, suggest the caller holds the
 * page lock and a pin on @page and expects the lock kept only on the
 * out_keep_locked exit -- confirm against the caller.
 */
1645int migrate_misplaced_transhuge_page(struct mm_struct *mm,
 1646 struct vm_area_struct *vma,
 1647 pmd_t *pmd, pmd_t entry,
 1648 unsigned long address,
 1649 struct page *page, int node)
 1650{
 1651 unsigned long haddr = address & HPAGE_PMD_MASK;
 1652 pg_data_t *pgdat = NODE_DATA(node);
 1653 int isolated = 0;
 1654 struct page *new_page = NULL;
 1655 struct mem_cgroup *memcg = NULL;
 1656 int page_lru = page_is_file_cache(page);
 1657
 1658 /*
 1659 * Don't migrate pages that are mapped in multiple processes.
 1660 * TODO: Handle false sharing detection instead of this hammer
 1661 */
 1662 if (page_mapcount(page) != 1)
 1663 goto out_dropref;
 1664
 1665 /*
 1666 * Rate-limit the amount of data that is being migrated to a node.
 1667 * Optimal placement is no good if the memory bus is saturated and
 1668 * all the time is being spent migrating!
 1669 */
 1670 if (numamigrate_update_ratelimit(pgdat, HPAGE_PMD_NR))
 1671 goto out_dropref;
 1672
 1673 new_page = alloc_pages_node(node,
 1674 (GFP_TRANSHUGE | GFP_THISNODE) & ~__GFP_WAIT, HPAGE_PMD_ORDER);
 1675 if (!new_page) {
 1676 count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR);
 1677 goto out_dropref;
 1678 }
 /* Carry the NUMA hinting-fault "last nid" tracking over to the copy. */
 1679 page_xchg_last_nid(new_page, page_last_nid(page));
 1680
 1681 isolated = numamigrate_isolate_page(pgdat, page);
 1682
 1683 /*
 1684 * Failing to isolate or a GUP pin prevents migration. The expected
 1685 * page count is 2. 1 for anonymous pages without a mapping and 1
 1686 * for the callers pin. If the page was isolated, the page will
 1687 * need to be put back on the LRU.
 1688 */
 1689 if (!isolated || page_count(page) != 2) {
 1690 count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR);
 1691 put_page(new_page);
 1692 if (isolated) {
 1693 putback_lru_page(page);
 1694 isolated = 0;
 1695 goto out;
 1696 }
 1697 goto out_keep_locked;
 1698 }
 1699
 1700 /* Prepare a page as a migration target */
 1701 __set_page_locked(new_page);
 1702 SetPageSwapBacked(new_page);
 1703
 1704 /* anon mapping, we can simply copy page->mapping to the new page: */
 1705 new_page->mapping = page->mapping;
 1706 new_page->index = page->index;
 1707 migrate_page_copy(new_page, page);
 1708 WARN_ON(PageLRU(new_page));
 1709
 /*
  * Recheck the target PMD: if it changed since the caller sampled
  * @entry, someone else modified the mapping; undo the page-state
  * changes made by migrate_page_copy(), free the copy and put the
  * original page back on the LRU.
  */
 1710 /* Recheck the target PMD */
 1711 spin_lock(&mm->page_table_lock);
 1712 if (unlikely(!pmd_same(*pmd, entry))) {
 1713 spin_unlock(&mm->page_table_lock);
 1714
 1715 /* Reverse changes made by migrate_page_copy() */
 1716 if (TestClearPageActive(new_page))
 1717 SetPageActive(page);
 1718 if (TestClearPageUnevictable(new_page))
 1719 SetPageUnevictable(page);
 1720 mlock_migrate_page(page, new_page);
 1721
 1722 unlock_page(new_page);
 1723 put_page(new_page); /* Free it */
 1724
 1725 unlock_page(page);
 1726 putback_lru_page(page);
 1727
 1728 count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR);
 1729 goto out;
 1730 }
 1731
 1732 /*
 1733 * Traditional migration needs to prepare the memcg charge
 1734 * transaction early to prevent the old page from being
 1735 * uncharged when installing migration entries. Here we can
 1736 * save the potential rollback and start the charge transfer
 1737 * only when migration is already known to end successfully.
 1738 */
 1739 mem_cgroup_prepare_migration(page, new_page, &memcg);
 1740
 /*
  * Build the replacement huge pmd: clear the NUMA hinting bit and
  * restore write permission where the vma allows it.
  */
 1741 entry = mk_pmd(new_page, vma->vm_page_prot);
 1742 entry = pmd_mknonnuma(entry);
 1743 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
 1744 entry = pmd_mkhuge(entry);
 1745
 1746 page_add_new_anon_rmap(new_page, vma, haddr);
 1747
 1748 set_pmd_at(mm, haddr, pmd, entry);
 1749 update_mmu_cache_pmd(vma, address, &entry);
 1750 page_remove_rmap(page);
 1751 /*
 1752 * Finish the charge transaction under the page table lock to
 1753 * prevent split_huge_page() from dividing up the charge
 1754 * before it's fully transferred to the new page.
 1755 */
 1756 mem_cgroup_end_migration(memcg, page, new_page, true);
 1757 spin_unlock(&mm->page_table_lock);
 1758
 1759 unlock_page(new_page);
 1760 unlock_page(page);
 1761 put_page(page); /* Drop the rmap reference */
 1762 put_page(page); /* Drop the LRU isolation reference */
 1763
 1764 count_vm_events(PGMIGRATE_SUCCESS, HPAGE_PMD_NR);
 1765 count_vm_numa_events(NUMA_PAGE_MIGRATE, HPAGE_PMD_NR);
 1766
 /*
  * out: the page was isolated at some point, so undo the
  * NR_ISOLATED accounting taken during isolation; LRU/refcount
  * handling already happened on each path that jumps here.
  */
 1767out:
 1768 mod_zone_page_state(page_zone(page),
 1769 NR_ISOLATED_ANON + page_lru,
 1770 -HPAGE_PMD_NR);
 1771 return isolated;
 1772
 /* out_dropref: page was never isolated; drop the caller's pin. */
 1773out_dropref:
 1774 put_page(page);
 /*
  * out_keep_locked: reached only when isolation failed after
  * numamigrate_isolate_page() ran.  No put_page() here --
  * NOTE(review): presumably the isolate helper already dropped the
  * caller's pin on failure; confirm against its definition.
  */
 1775out_keep_locked:
 1776 return 0;
 1777}
 1778#endif /* CONFIG_NUMA_BALANCING && CONFIG_TRANSPARENT_HUGEPAGE */
1779
1780#endif /* CONFIG_NUMA */
diff --git a/mm/mmap.c b/mm/mmap.c
index 9a796c41e7d9..35730ee9d515 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -31,6 +31,7 @@
31#include <linux/audit.h> 31#include <linux/audit.h>
32#include <linux/khugepaged.h> 32#include <linux/khugepaged.h>
33#include <linux/uprobes.h> 33#include <linux/uprobes.h>
34#include <linux/rbtree_augmented.h>
34 35
35#include <asm/uaccess.h> 36#include <asm/uaccess.h>
36#include <asm/cacheflush.h> 37#include <asm/cacheflush.h>
@@ -89,6 +90,20 @@ int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT;
89struct percpu_counter vm_committed_as ____cacheline_aligned_in_smp; 90struct percpu_counter vm_committed_as ____cacheline_aligned_in_smp;
90 91
91/* 92/*
93 * The global memory commitment made in the system can be a metric
94 * that can be used to drive ballooning decisions when Linux is hosted
95 * as a guest. On Hyper-V, the host implements a policy engine for dynamically
96 * balancing memory across competing virtual machines that are hosted.
97 * Several metrics drive this policy engine including the guest reported
98 * memory commitment.
99 */
100unsigned long vm_memory_committed(void)
101{
102 return percpu_counter_read_positive(&vm_committed_as);
103}
104EXPORT_SYMBOL_GPL(vm_memory_committed);
105
106/*
92 * Check that a process has enough memory to allocate a new virtual 107 * Check that a process has enough memory to allocate a new virtual
93 * mapping. 0 means there is enough memory for the allocation to 108 * mapping. 0 means there is enough memory for the allocation to
94 * succeed and -ENOMEM implies there is not. 109 * succeed and -ENOMEM implies there is not.
@@ -297,40 +312,88 @@ out:
297 return retval; 312 return retval;
298} 313}
299 314
315static long vma_compute_subtree_gap(struct vm_area_struct *vma)
316{
317 unsigned long max, subtree_gap;
318 max = vma->vm_start;
319 if (vma->vm_prev)
320 max -= vma->vm_prev->vm_end;
321 if (vma->vm_rb.rb_left) {
322 subtree_gap = rb_entry(vma->vm_rb.rb_left,
323 struct vm_area_struct, vm_rb)->rb_subtree_gap;
324 if (subtree_gap > max)
325 max = subtree_gap;
326 }
327 if (vma->vm_rb.rb_right) {
328 subtree_gap = rb_entry(vma->vm_rb.rb_right,
329 struct vm_area_struct, vm_rb)->rb_subtree_gap;
330 if (subtree_gap > max)
331 max = subtree_gap;
332 }
333 return max;
334}
335
300#ifdef CONFIG_DEBUG_VM_RB 336#ifdef CONFIG_DEBUG_VM_RB
301static int browse_rb(struct rb_root *root) 337static int browse_rb(struct rb_root *root)
302{ 338{
303 int i = 0, j; 339 int i = 0, j, bug = 0;
304 struct rb_node *nd, *pn = NULL; 340 struct rb_node *nd, *pn = NULL;
305 unsigned long prev = 0, pend = 0; 341 unsigned long prev = 0, pend = 0;
306 342
307 for (nd = rb_first(root); nd; nd = rb_next(nd)) { 343 for (nd = rb_first(root); nd; nd = rb_next(nd)) {
308 struct vm_area_struct *vma; 344 struct vm_area_struct *vma;
309 vma = rb_entry(nd, struct vm_area_struct, vm_rb); 345 vma = rb_entry(nd, struct vm_area_struct, vm_rb);
310 if (vma->vm_start < prev) 346 if (vma->vm_start < prev) {
311 printk("vm_start %lx prev %lx\n", vma->vm_start, prev), i = -1; 347 printk("vm_start %lx prev %lx\n", vma->vm_start, prev);
312 if (vma->vm_start < pend) 348 bug = 1;
349 }
350 if (vma->vm_start < pend) {
313 printk("vm_start %lx pend %lx\n", vma->vm_start, pend); 351 printk("vm_start %lx pend %lx\n", vma->vm_start, pend);
314 if (vma->vm_start > vma->vm_end) 352 bug = 1;
315 printk("vm_end %lx < vm_start %lx\n", vma->vm_end, vma->vm_start); 353 }
354 if (vma->vm_start > vma->vm_end) {
355 printk("vm_end %lx < vm_start %lx\n",
356 vma->vm_end, vma->vm_start);
357 bug = 1;
358 }
359 if (vma->rb_subtree_gap != vma_compute_subtree_gap(vma)) {
360 printk("free gap %lx, correct %lx\n",
361 vma->rb_subtree_gap,
362 vma_compute_subtree_gap(vma));
363 bug = 1;
364 }
316 i++; 365 i++;
317 pn = nd; 366 pn = nd;
318 prev = vma->vm_start; 367 prev = vma->vm_start;
319 pend = vma->vm_end; 368 pend = vma->vm_end;
320 } 369 }
321 j = 0; 370 j = 0;
322 for (nd = pn; nd; nd = rb_prev(nd)) { 371 for (nd = pn; nd; nd = rb_prev(nd))
323 j++; 372 j++;
373 if (i != j) {
374 printk("backwards %d, forwards %d\n", j, i);
375 bug = 1;
376 }
377 return bug ? -1 : i;
378}
379
380static void validate_mm_rb(struct rb_root *root, struct vm_area_struct *ignore)
381{
382 struct rb_node *nd;
383
384 for (nd = rb_first(root); nd; nd = rb_next(nd)) {
385 struct vm_area_struct *vma;
386 vma = rb_entry(nd, struct vm_area_struct, vm_rb);
387 BUG_ON(vma != ignore &&
388 vma->rb_subtree_gap != vma_compute_subtree_gap(vma));
324 } 389 }
325 if (i != j)
326 printk("backwards %d, forwards %d\n", j, i), i = 0;
327 return i;
328} 390}
329 391
330void validate_mm(struct mm_struct *mm) 392void validate_mm(struct mm_struct *mm)
331{ 393{
332 int bug = 0; 394 int bug = 0;
333 int i = 0; 395 int i = 0;
396 unsigned long highest_address = 0;
334 struct vm_area_struct *vma = mm->mmap; 397 struct vm_area_struct *vma = mm->mmap;
335 while (vma) { 398 while (vma) {
336 struct anon_vma_chain *avc; 399 struct anon_vma_chain *avc;
@@ -338,20 +401,73 @@ void validate_mm(struct mm_struct *mm)
338 list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) 401 list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
339 anon_vma_interval_tree_verify(avc); 402 anon_vma_interval_tree_verify(avc);
340 vma_unlock_anon_vma(vma); 403 vma_unlock_anon_vma(vma);
404 highest_address = vma->vm_end;
341 vma = vma->vm_next; 405 vma = vma->vm_next;
342 i++; 406 i++;
343 } 407 }
344 if (i != mm->map_count) 408 if (i != mm->map_count) {
345 printk("map_count %d vm_next %d\n", mm->map_count, i), bug = 1; 409 printk("map_count %d vm_next %d\n", mm->map_count, i);
410 bug = 1;
411 }
412 if (highest_address != mm->highest_vm_end) {
413 printk("mm->highest_vm_end %lx, found %lx\n",
414 mm->highest_vm_end, highest_address);
415 bug = 1;
416 }
346 i = browse_rb(&mm->mm_rb); 417 i = browse_rb(&mm->mm_rb);
347 if (i != mm->map_count) 418 if (i != mm->map_count) {
348 printk("map_count %d rb %d\n", mm->map_count, i), bug = 1; 419 printk("map_count %d rb %d\n", mm->map_count, i);
420 bug = 1;
421 }
349 BUG_ON(bug); 422 BUG_ON(bug);
350} 423}
351#else 424#else
425#define validate_mm_rb(root, ignore) do { } while (0)
352#define validate_mm(mm) do { } while (0) 426#define validate_mm(mm) do { } while (0)
353#endif 427#endif
354 428
429RB_DECLARE_CALLBACKS(static, vma_gap_callbacks, struct vm_area_struct, vm_rb,
430 unsigned long, rb_subtree_gap, vma_compute_subtree_gap)
431
432/*
433 * Update augmented rbtree rb_subtree_gap values after vma->vm_start or
434 * vma->vm_prev->vm_end values changed, without modifying the vma's position
435 * in the rbtree.
436 */
437static void vma_gap_update(struct vm_area_struct *vma)
438{
439 /*
440 * As it turns out, RB_DECLARE_CALLBACKS() already created a callback
441 * function that does exacltly what we want.
442 */
443 vma_gap_callbacks_propagate(&vma->vm_rb, NULL);
444}
445
446static inline void vma_rb_insert(struct vm_area_struct *vma,
447 struct rb_root *root)
448{
449 /* All rb_subtree_gap values must be consistent prior to insertion */
450 validate_mm_rb(root, NULL);
451
452 rb_insert_augmented(&vma->vm_rb, root, &vma_gap_callbacks);
453}
454
455static void vma_rb_erase(struct vm_area_struct *vma, struct rb_root *root)
456{
457 /*
458 * All rb_subtree_gap values must be consistent prior to erase,
459 * with the possible exception of the vma being erased.
460 */
461 validate_mm_rb(root, vma);
462
463 /*
464 * Note rb_erase_augmented is a fairly large inline function,
465 * so make sure we instantiate it only once with our desired
466 * augmented rbtree callbacks.
467 */
468 rb_erase_augmented(&vma->vm_rb, root, &vma_gap_callbacks);
469}
470
355/* 471/*
356 * vma has some anon_vma assigned, and is already inserted on that 472 * vma has some anon_vma assigned, and is already inserted on that
357 * anon_vma's interval trees. 473 * anon_vma's interval trees.
@@ -421,8 +537,25 @@ static int find_vma_links(struct mm_struct *mm, unsigned long addr,
421void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma, 537void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma,
422 struct rb_node **rb_link, struct rb_node *rb_parent) 538 struct rb_node **rb_link, struct rb_node *rb_parent)
423{ 539{
540 /* Update tracking information for the gap following the new vma. */
541 if (vma->vm_next)
542 vma_gap_update(vma->vm_next);
543 else
544 mm->highest_vm_end = vma->vm_end;
545
546 /*
547 * vma->vm_prev wasn't known when we followed the rbtree to find the
548 * correct insertion point for that vma. As a result, we could not
549 * update the vma vm_rb parents rb_subtree_gap values on the way down.
550 * So, we first insert the vma with a zero rb_subtree_gap value
551 * (to be consistent with what we did on the way down), and then
552 * immediately update the gap to the correct value. Finally we
553 * rebalance the rbtree after all augmented values have been set.
554 */
424 rb_link_node(&vma->vm_rb, rb_parent, rb_link); 555 rb_link_node(&vma->vm_rb, rb_parent, rb_link);
425 rb_insert_color(&vma->vm_rb, &mm->mm_rb); 556 vma->rb_subtree_gap = 0;
557 vma_gap_update(vma);
558 vma_rb_insert(vma, &mm->mm_rb);
426} 559}
427 560
428static void __vma_link_file(struct vm_area_struct *vma) 561static void __vma_link_file(struct vm_area_struct *vma)
@@ -498,12 +631,12 @@ static inline void
498__vma_unlink(struct mm_struct *mm, struct vm_area_struct *vma, 631__vma_unlink(struct mm_struct *mm, struct vm_area_struct *vma,
499 struct vm_area_struct *prev) 632 struct vm_area_struct *prev)
500{ 633{
501 struct vm_area_struct *next = vma->vm_next; 634 struct vm_area_struct *next;
502 635
503 prev->vm_next = next; 636 vma_rb_erase(vma, &mm->mm_rb);
637 prev->vm_next = next = vma->vm_next;
504 if (next) 638 if (next)
505 next->vm_prev = prev; 639 next->vm_prev = prev;
506 rb_erase(&vma->vm_rb, &mm->mm_rb);
507 if (mm->mmap_cache == vma) 640 if (mm->mmap_cache == vma)
508 mm->mmap_cache = prev; 641 mm->mmap_cache = prev;
509} 642}
@@ -525,6 +658,7 @@ int vma_adjust(struct vm_area_struct *vma, unsigned long start,
525 struct rb_root *root = NULL; 658 struct rb_root *root = NULL;
526 struct anon_vma *anon_vma = NULL; 659 struct anon_vma *anon_vma = NULL;
527 struct file *file = vma->vm_file; 660 struct file *file = vma->vm_file;
661 bool start_changed = false, end_changed = false;
528 long adjust_next = 0; 662 long adjust_next = 0;
529 int remove_next = 0; 663 int remove_next = 0;
530 664
@@ -602,7 +736,7 @@ again: remove_next = 1 + (end > next->vm_end);
602 if (anon_vma) { 736 if (anon_vma) {
603 VM_BUG_ON(adjust_next && next->anon_vma && 737 VM_BUG_ON(adjust_next && next->anon_vma &&
604 anon_vma != next->anon_vma); 738 anon_vma != next->anon_vma);
605 anon_vma_lock(anon_vma); 739 anon_vma_lock_write(anon_vma);
606 anon_vma_interval_tree_pre_update_vma(vma); 740 anon_vma_interval_tree_pre_update_vma(vma);
607 if (adjust_next) 741 if (adjust_next)
608 anon_vma_interval_tree_pre_update_vma(next); 742 anon_vma_interval_tree_pre_update_vma(next);
@@ -615,8 +749,14 @@ again: remove_next = 1 + (end > next->vm_end);
615 vma_interval_tree_remove(next, root); 749 vma_interval_tree_remove(next, root);
616 } 750 }
617 751
618 vma->vm_start = start; 752 if (start != vma->vm_start) {
619 vma->vm_end = end; 753 vma->vm_start = start;
754 start_changed = true;
755 }
756 if (end != vma->vm_end) {
757 vma->vm_end = end;
758 end_changed = true;
759 }
620 vma->vm_pgoff = pgoff; 760 vma->vm_pgoff = pgoff;
621 if (adjust_next) { 761 if (adjust_next) {
622 next->vm_start += adjust_next << PAGE_SHIFT; 762 next->vm_start += adjust_next << PAGE_SHIFT;
@@ -645,6 +785,15 @@ again: remove_next = 1 + (end > next->vm_end);
645 * (it may either follow vma or precede it). 785 * (it may either follow vma or precede it).
646 */ 786 */
647 __insert_vm_struct(mm, insert); 787 __insert_vm_struct(mm, insert);
788 } else {
789 if (start_changed)
790 vma_gap_update(vma);
791 if (end_changed) {
792 if (!next)
793 mm->highest_vm_end = end;
794 else if (!adjust_next)
795 vma_gap_update(next);
796 }
648 } 797 }
649 798
650 if (anon_vma) { 799 if (anon_vma) {
@@ -678,10 +827,13 @@ again: remove_next = 1 + (end > next->vm_end);
678 * we must remove another next too. It would clutter 827 * we must remove another next too. It would clutter
679 * up the code too much to do both in one go. 828 * up the code too much to do both in one go.
680 */ 829 */
681 if (remove_next == 2) { 830 next = vma->vm_next;
682 next = vma->vm_next; 831 if (remove_next == 2)
683 goto again; 832 goto again;
684 } 833 else if (next)
834 vma_gap_update(next);
835 else
836 mm->highest_vm_end = end;
685 } 837 }
686 if (insert && file) 838 if (insert && file)
687 uprobe_mmap(insert); 839 uprobe_mmap(insert);
@@ -1153,8 +1305,9 @@ SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
1153 * memory so no accounting is necessary 1305 * memory so no accounting is necessary
1154 */ 1306 */
1155 file = hugetlb_file_setup(HUGETLB_ANON_FILE, addr, len, 1307 file = hugetlb_file_setup(HUGETLB_ANON_FILE, addr, len,
1156 VM_NORESERVE, &user, 1308 VM_NORESERVE,
1157 HUGETLB_ANONHUGE_INODE); 1309 &user, HUGETLB_ANONHUGE_INODE,
1310 (flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK);
1158 if (IS_ERR(file)) 1311 if (IS_ERR(file))
1159 return PTR_ERR(file); 1312 return PTR_ERR(file);
1160 } 1313 }
@@ -1335,7 +1488,11 @@ munmap_back:
1335 * 1488 *
1336 * Answer: Yes, several device drivers can do it in their 1489 * Answer: Yes, several device drivers can do it in their
1337 * f_op->mmap method. -DaveM 1490 * f_op->mmap method. -DaveM
1491 * Bug: If addr is changed, prev, rb_link, rb_parent should
1492 * be updated for vma_link()
1338 */ 1493 */
1494 WARN_ON_ONCE(addr != vma->vm_start);
1495
1339 addr = vma->vm_start; 1496 addr = vma->vm_start;
1340 pgoff = vma->vm_pgoff; 1497 pgoff = vma->vm_pgoff;
1341 vm_flags = vma->vm_flags; 1498 vm_flags = vma->vm_flags;
@@ -1400,6 +1557,206 @@ unacct_error:
1400 return error; 1557 return error;
1401} 1558}
1402 1559
1560unsigned long unmapped_area(struct vm_unmapped_area_info *info)
1561{
1562 /*
1563 * We implement the search by looking for an rbtree node that
1564 * immediately follows a suitable gap. That is,
1565 * - gap_start = vma->vm_prev->vm_end <= info->high_limit - length;
1566 * - gap_end = vma->vm_start >= info->low_limit + length;
1567 * - gap_end - gap_start >= length
1568 */
1569
1570 struct mm_struct *mm = current->mm;
1571 struct vm_area_struct *vma;
1572 unsigned long length, low_limit, high_limit, gap_start, gap_end;
1573
1574 /* Adjust search length to account for worst case alignment overhead */
1575 length = info->length + info->align_mask;
1576 if (length < info->length)
1577 return -ENOMEM;
1578
1579 /* Adjust search limits by the desired length */
1580 if (info->high_limit < length)
1581 return -ENOMEM;
1582 high_limit = info->high_limit - length;
1583
1584 if (info->low_limit > high_limit)
1585 return -ENOMEM;
1586 low_limit = info->low_limit + length;
1587
1588 /* Check if rbtree root looks promising */
1589 if (RB_EMPTY_ROOT(&mm->mm_rb))
1590 goto check_highest;
1591 vma = rb_entry(mm->mm_rb.rb_node, struct vm_area_struct, vm_rb);
1592 if (vma->rb_subtree_gap < length)
1593 goto check_highest;
1594
1595 while (true) {
1596 /* Visit left subtree if it looks promising */
1597 gap_end = vma->vm_start;
1598 if (gap_end >= low_limit && vma->vm_rb.rb_left) {
1599 struct vm_area_struct *left =
1600 rb_entry(vma->vm_rb.rb_left,
1601 struct vm_area_struct, vm_rb);
1602 if (left->rb_subtree_gap >= length) {
1603 vma = left;
1604 continue;
1605 }
1606 }
1607
1608 gap_start = vma->vm_prev ? vma->vm_prev->vm_end : 0;
1609check_current:
1610 /* Check if current node has a suitable gap */
1611 if (gap_start > high_limit)
1612 return -ENOMEM;
1613 if (gap_end >= low_limit && gap_end - gap_start >= length)
1614 goto found;
1615
1616 /* Visit right subtree if it looks promising */
1617 if (vma->vm_rb.rb_right) {
1618 struct vm_area_struct *right =
1619 rb_entry(vma->vm_rb.rb_right,
1620 struct vm_area_struct, vm_rb);
1621 if (right->rb_subtree_gap >= length) {
1622 vma = right;
1623 continue;
1624 }
1625 }
1626
1627 /* Go back up the rbtree to find next candidate node */
1628 while (true) {
1629 struct rb_node *prev = &vma->vm_rb;
1630 if (!rb_parent(prev))
1631 goto check_highest;
1632 vma = rb_entry(rb_parent(prev),
1633 struct vm_area_struct, vm_rb);
1634 if (prev == vma->vm_rb.rb_left) {
1635 gap_start = vma->vm_prev->vm_end;
1636 gap_end = vma->vm_start;
1637 goto check_current;
1638 }
1639 }
1640 }
1641
1642check_highest:
1643 /* Check highest gap, which does not precede any rbtree node */
1644 gap_start = mm->highest_vm_end;
1645 gap_end = ULONG_MAX; /* Only for VM_BUG_ON below */
1646 if (gap_start > high_limit)
1647 return -ENOMEM;
1648
1649found:
1650 /* We found a suitable gap. Clip it with the original low_limit. */
1651 if (gap_start < info->low_limit)
1652 gap_start = info->low_limit;
1653
1654 /* Adjust gap address to the desired alignment */
1655 gap_start += (info->align_offset - gap_start) & info->align_mask;
1656
1657 VM_BUG_ON(gap_start + info->length > info->high_limit);
1658 VM_BUG_ON(gap_start + info->length > gap_end);
1659 return gap_start;
1660}
1661
1662unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info)
1663{
1664 struct mm_struct *mm = current->mm;
1665 struct vm_area_struct *vma;
1666 unsigned long length, low_limit, high_limit, gap_start, gap_end;
1667
1668 /* Adjust search length to account for worst case alignment overhead */
1669 length = info->length + info->align_mask;
1670 if (length < info->length)
1671 return -ENOMEM;
1672
1673 /*
1674 * Adjust search limits by the desired length.
1675 * See implementation comment at top of unmapped_area().
1676 */
1677 gap_end = info->high_limit;
1678 if (gap_end < length)
1679 return -ENOMEM;
1680 high_limit = gap_end - length;
1681
1682 if (info->low_limit > high_limit)
1683 return -ENOMEM;
1684 low_limit = info->low_limit + length;
1685
1686 /* Check highest gap, which does not precede any rbtree node */
1687 gap_start = mm->highest_vm_end;
1688 if (gap_start <= high_limit)
1689 goto found_highest;
1690
1691 /* Check if rbtree root looks promising */
1692 if (RB_EMPTY_ROOT(&mm->mm_rb))
1693 return -ENOMEM;
1694 vma = rb_entry(mm->mm_rb.rb_node, struct vm_area_struct, vm_rb);
1695 if (vma->rb_subtree_gap < length)
1696 return -ENOMEM;
1697
1698 while (true) {
1699 /* Visit right subtree if it looks promising */
1700 gap_start = vma->vm_prev ? vma->vm_prev->vm_end : 0;
1701 if (gap_start <= high_limit && vma->vm_rb.rb_right) {
1702 struct vm_area_struct *right =
1703 rb_entry(vma->vm_rb.rb_right,
1704 struct vm_area_struct, vm_rb);
1705 if (right->rb_subtree_gap >= length) {
1706 vma = right;
1707 continue;
1708 }
1709 }
1710
1711check_current:
1712 /* Check if current node has a suitable gap */
1713 gap_end = vma->vm_start;
1714 if (gap_end < low_limit)
1715 return -ENOMEM;
1716 if (gap_start <= high_limit && gap_end - gap_start >= length)
1717 goto found;
1718
1719 /* Visit left subtree if it looks promising */
1720 if (vma->vm_rb.rb_left) {
1721 struct vm_area_struct *left =
1722 rb_entry(vma->vm_rb.rb_left,
1723 struct vm_area_struct, vm_rb);
1724 if (left->rb_subtree_gap >= length) {
1725 vma = left;
1726 continue;
1727 }
1728 }
1729
1730 /* Go back up the rbtree to find next candidate node */
1731 while (true) {
1732 struct rb_node *prev = &vma->vm_rb;
1733 if (!rb_parent(prev))
1734 return -ENOMEM;
1735 vma = rb_entry(rb_parent(prev),
1736 struct vm_area_struct, vm_rb);
1737 if (prev == vma->vm_rb.rb_right) {
1738 gap_start = vma->vm_prev ?
1739 vma->vm_prev->vm_end : 0;
1740 goto check_current;
1741 }
1742 }
1743 }
1744
1745found:
1746 /* We found a suitable gap. Clip it with the original high_limit. */
1747 if (gap_end > info->high_limit)
1748 gap_end = info->high_limit;
1749
1750found_highest:
1751 /* Compute highest gap address at the desired alignment */
1752 gap_end -= info->length;
1753 gap_end -= (gap_end - info->align_offset) & info->align_mask;
1754
1755 VM_BUG_ON(gap_end < info->low_limit);
1756 VM_BUG_ON(gap_end < gap_start);
1757 return gap_end;
1758}
1759
1403/* Get an address range which is currently unmapped. 1760/* Get an address range which is currently unmapped.
1404 * For shmat() with addr=0. 1761 * For shmat() with addr=0.
1405 * 1762 *
@@ -1418,7 +1775,7 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
1418{ 1775{
1419 struct mm_struct *mm = current->mm; 1776 struct mm_struct *mm = current->mm;
1420 struct vm_area_struct *vma; 1777 struct vm_area_struct *vma;
1421 unsigned long start_addr; 1778 struct vm_unmapped_area_info info;
1422 1779
1423 if (len > TASK_SIZE) 1780 if (len > TASK_SIZE)
1424 return -ENOMEM; 1781 return -ENOMEM;
@@ -1433,40 +1790,13 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
1433 (!vma || addr + len <= vma->vm_start)) 1790 (!vma || addr + len <= vma->vm_start))
1434 return addr; 1791 return addr;
1435 } 1792 }
1436 if (len > mm->cached_hole_size) {
1437 start_addr = addr = mm->free_area_cache;
1438 } else {
1439 start_addr = addr = TASK_UNMAPPED_BASE;
1440 mm->cached_hole_size = 0;
1441 }
1442 1793
1443full_search: 1794 info.flags = 0;
1444 for (vma = find_vma(mm, addr); ; vma = vma->vm_next) { 1795 info.length = len;
1445 /* At this point: (!vma || addr < vma->vm_end). */ 1796 info.low_limit = TASK_UNMAPPED_BASE;
1446 if (TASK_SIZE - len < addr) { 1797 info.high_limit = TASK_SIZE;
1447 /* 1798 info.align_mask = 0;
1448 * Start a new search - just in case we missed 1799 return vm_unmapped_area(&info);
1449 * some holes.
1450 */
1451 if (start_addr != TASK_UNMAPPED_BASE) {
1452 addr = TASK_UNMAPPED_BASE;
1453 start_addr = addr;
1454 mm->cached_hole_size = 0;
1455 goto full_search;
1456 }
1457 return -ENOMEM;
1458 }
1459 if (!vma || addr + len <= vma->vm_start) {
1460 /*
1461 * Remember the place where we stopped the search:
1462 */
1463 mm->free_area_cache = addr + len;
1464 return addr;
1465 }
1466 if (addr + mm->cached_hole_size < vma->vm_start)
1467 mm->cached_hole_size = vma->vm_start - addr;
1468 addr = vma->vm_end;
1469 }
1470} 1800}
1471#endif 1801#endif
1472 1802
@@ -1491,7 +1821,8 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
1491{ 1821{
1492 struct vm_area_struct *vma; 1822 struct vm_area_struct *vma;
1493 struct mm_struct *mm = current->mm; 1823 struct mm_struct *mm = current->mm;
1494 unsigned long addr = addr0, start_addr; 1824 unsigned long addr = addr0;
1825 struct vm_unmapped_area_info info;
1495 1826
1496 /* requested length too big for entire address space */ 1827 /* requested length too big for entire address space */
1497 if (len > TASK_SIZE) 1828 if (len > TASK_SIZE)
@@ -1509,53 +1840,12 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
1509 return addr; 1840 return addr;
1510 } 1841 }
1511 1842
1512 /* check if free_area_cache is useful for us */ 1843 info.flags = VM_UNMAPPED_AREA_TOPDOWN;
1513 if (len <= mm->cached_hole_size) { 1844 info.length = len;
1514 mm->cached_hole_size = 0; 1845 info.low_limit = PAGE_SIZE;
1515 mm->free_area_cache = mm->mmap_base; 1846 info.high_limit = mm->mmap_base;
1516 } 1847 info.align_mask = 0;
1517 1848 addr = vm_unmapped_area(&info);
1518try_again:
1519 /* either no address requested or can't fit in requested address hole */
1520 start_addr = addr = mm->free_area_cache;
1521
1522 if (addr < len)
1523 goto fail;
1524
1525 addr -= len;
1526 do {
1527 /*
1528 * Lookup failure means no vma is above this address,
1529 * else if new region fits below vma->vm_start,
1530 * return with success:
1531 */
1532 vma = find_vma(mm, addr);
1533 if (!vma || addr+len <= vma->vm_start)
1534 /* remember the address as a hint for next time */
1535 return (mm->free_area_cache = addr);
1536
1537 /* remember the largest hole we saw so far */
1538 if (addr + mm->cached_hole_size < vma->vm_start)
1539 mm->cached_hole_size = vma->vm_start - addr;
1540
1541 /* try just below the current vma->vm_start */
1542 addr = vma->vm_start-len;
1543 } while (len < vma->vm_start);
1544
1545fail:
1546 /*
1547 * if hint left us with no space for the requested
1548 * mapping then try again:
1549 *
1550 * Note: this is different with the case of bottomup
1551 * which does the fully line-search, but we use find_vma
1552 * here that causes some holes skipped.
1553 */
1554 if (start_addr != mm->mmap_base) {
1555 mm->free_area_cache = mm->mmap_base;
1556 mm->cached_hole_size = 0;
1557 goto try_again;
1558 }
1559 1849
1560 /* 1850 /*
1561 * A failed mmap() very likely causes application failure, 1851 * A failed mmap() very likely causes application failure,
@@ -1563,14 +1853,13 @@ fail:
1563 * can happen with large stack limits and large mmap() 1853 * can happen with large stack limits and large mmap()
1564 * allocations. 1854 * allocations.
1565 */ 1855 */
1566 mm->cached_hole_size = ~0UL; 1856 if (addr & ~PAGE_MASK) {
1567 mm->free_area_cache = TASK_UNMAPPED_BASE; 1857 VM_BUG_ON(addr != -ENOMEM);
1568 addr = arch_get_unmapped_area(filp, addr0, len, pgoff, flags); 1858 info.flags = 0;
1569 /* 1859 info.low_limit = TASK_UNMAPPED_BASE;
1570 * Restore the topdown base: 1860 info.high_limit = TASK_SIZE;
1571 */ 1861 addr = vm_unmapped_area(&info);
1572 mm->free_area_cache = mm->mmap_base; 1862 }
1573 mm->cached_hole_size = ~0UL;
1574 1863
1575 return addr; 1864 return addr;
1576} 1865}
@@ -1780,9 +2069,27 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
1780 if (vma->vm_pgoff + (size >> PAGE_SHIFT) >= vma->vm_pgoff) { 2069 if (vma->vm_pgoff + (size >> PAGE_SHIFT) >= vma->vm_pgoff) {
1781 error = acct_stack_growth(vma, size, grow); 2070 error = acct_stack_growth(vma, size, grow);
1782 if (!error) { 2071 if (!error) {
2072 /*
2073 * vma_gap_update() doesn't support concurrent
2074 * updates, but we only hold a shared mmap_sem
2075 * lock here, so we need to protect against
2076 * concurrent vma expansions.
2077 * vma_lock_anon_vma() doesn't help here, as
2078 * we don't guarantee that all growable vmas
2079 * in a mm share the same root anon vma.
2080 * So, we reuse mm->page_table_lock to guard
2081 * against concurrent vma expansions.
2082 */
2083 spin_lock(&vma->vm_mm->page_table_lock);
1783 anon_vma_interval_tree_pre_update_vma(vma); 2084 anon_vma_interval_tree_pre_update_vma(vma);
1784 vma->vm_end = address; 2085 vma->vm_end = address;
1785 anon_vma_interval_tree_post_update_vma(vma); 2086 anon_vma_interval_tree_post_update_vma(vma);
2087 if (vma->vm_next)
2088 vma_gap_update(vma->vm_next);
2089 else
2090 vma->vm_mm->highest_vm_end = address;
2091 spin_unlock(&vma->vm_mm->page_table_lock);
2092
1786 perf_event_mmap(vma); 2093 perf_event_mmap(vma);
1787 } 2094 }
1788 } 2095 }
@@ -1833,10 +2140,25 @@ int expand_downwards(struct vm_area_struct *vma,
1833 if (grow <= vma->vm_pgoff) { 2140 if (grow <= vma->vm_pgoff) {
1834 error = acct_stack_growth(vma, size, grow); 2141 error = acct_stack_growth(vma, size, grow);
1835 if (!error) { 2142 if (!error) {
2143 /*
2144 * vma_gap_update() doesn't support concurrent
2145 * updates, but we only hold a shared mmap_sem
2146 * lock here, so we need to protect against
2147 * concurrent vma expansions.
2148 * vma_lock_anon_vma() doesn't help here, as
2149 * we don't guarantee that all growable vmas
2150 * in a mm share the same root anon vma.
2151 * So, we reuse mm->page_table_lock to guard
2152 * against concurrent vma expansions.
2153 */
2154 spin_lock(&vma->vm_mm->page_table_lock);
1836 anon_vma_interval_tree_pre_update_vma(vma); 2155 anon_vma_interval_tree_pre_update_vma(vma);
1837 vma->vm_start = address; 2156 vma->vm_start = address;
1838 vma->vm_pgoff -= grow; 2157 vma->vm_pgoff -= grow;
1839 anon_vma_interval_tree_post_update_vma(vma); 2158 anon_vma_interval_tree_post_update_vma(vma);
2159 vma_gap_update(vma);
2160 spin_unlock(&vma->vm_mm->page_table_lock);
2161
1840 perf_event_mmap(vma); 2162 perf_event_mmap(vma);
1841 } 2163 }
1842 } 2164 }
@@ -1959,14 +2281,17 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma,
1959 insertion_point = (prev ? &prev->vm_next : &mm->mmap); 2281 insertion_point = (prev ? &prev->vm_next : &mm->mmap);
1960 vma->vm_prev = NULL; 2282 vma->vm_prev = NULL;
1961 do { 2283 do {
1962 rb_erase(&vma->vm_rb, &mm->mm_rb); 2284 vma_rb_erase(vma, &mm->mm_rb);
1963 mm->map_count--; 2285 mm->map_count--;
1964 tail_vma = vma; 2286 tail_vma = vma;
1965 vma = vma->vm_next; 2287 vma = vma->vm_next;
1966 } while (vma && vma->vm_start < end); 2288 } while (vma && vma->vm_start < end);
1967 *insertion_point = vma; 2289 *insertion_point = vma;
1968 if (vma) 2290 if (vma) {
1969 vma->vm_prev = prev; 2291 vma->vm_prev = prev;
2292 vma_gap_update(vma);
2293 } else
2294 mm->highest_vm_end = prev ? prev->vm_end : 0;
1970 tail_vma->vm_next = NULL; 2295 tail_vma->vm_next = NULL;
1971 if (mm->unmap_area == arch_unmap_area) 2296 if (mm->unmap_area == arch_unmap_area)
1972 addr = prev ? prev->vm_end : mm->mmap_base; 2297 addr = prev ? prev->vm_end : mm->mmap_base;
@@ -2561,15 +2886,15 @@ static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma)
2561 * The LSB of head.next can't change from under us 2886 * The LSB of head.next can't change from under us
2562 * because we hold the mm_all_locks_mutex. 2887 * because we hold the mm_all_locks_mutex.
2563 */ 2888 */
2564 mutex_lock_nest_lock(&anon_vma->root->mutex, &mm->mmap_sem); 2889 down_write_nest_lock(&anon_vma->root->rwsem, &mm->mmap_sem);
2565 /* 2890 /*
2566 * We can safely modify head.next after taking the 2891 * We can safely modify head.next after taking the
2567 * anon_vma->root->mutex. If some other vma in this mm shares 2892 * anon_vma->root->rwsem. If some other vma in this mm shares
2568 * the same anon_vma we won't take it again. 2893 * the same anon_vma we won't take it again.
2569 * 2894 *
2570 * No need of atomic instructions here, head.next 2895 * No need of atomic instructions here, head.next
2571 * can't change from under us thanks to the 2896 * can't change from under us thanks to the
2572 * anon_vma->root->mutex. 2897 * anon_vma->root->rwsem.
2573 */ 2898 */
2574 if (__test_and_set_bit(0, (unsigned long *) 2899 if (__test_and_set_bit(0, (unsigned long *)
2575 &anon_vma->root->rb_root.rb_node)) 2900 &anon_vma->root->rb_root.rb_node))
@@ -2671,7 +2996,7 @@ static void vm_unlock_anon_vma(struct anon_vma *anon_vma)
2671 * 2996 *
2672 * No need of atomic instructions here, head.next 2997 * No need of atomic instructions here, head.next
2673 * can't change from under us until we release the 2998 * can't change from under us until we release the
2674 * anon_vma->root->mutex. 2999 * anon_vma->root->rwsem.
2675 */ 3000 */
2676 if (!__test_and_clear_bit(0, (unsigned long *) 3001 if (!__test_and_clear_bit(0, (unsigned long *)
2677 &anon_vma->root->rb_root.rb_node)) 3002 &anon_vma->root->rb_root.rb_node))
diff --git a/mm/mprotect.c b/mm/mprotect.c
index a40992610ab6..94722a4d6b43 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -35,12 +35,16 @@ static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot)
35} 35}
36#endif 36#endif
37 37
38static void change_pte_range(struct mm_struct *mm, pmd_t *pmd, 38static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
39 unsigned long addr, unsigned long end, pgprot_t newprot, 39 unsigned long addr, unsigned long end, pgprot_t newprot,
40 int dirty_accountable) 40 int dirty_accountable, int prot_numa, bool *ret_all_same_node)
41{ 41{
42 struct mm_struct *mm = vma->vm_mm;
42 pte_t *pte, oldpte; 43 pte_t *pte, oldpte;
43 spinlock_t *ptl; 44 spinlock_t *ptl;
45 unsigned long pages = 0;
46 bool all_same_node = true;
47 int last_nid = -1;
44 48
45 pte = pte_offset_map_lock(mm, pmd, addr, &ptl); 49 pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
46 arch_enter_lazy_mmu_mode(); 50 arch_enter_lazy_mmu_mode();
@@ -48,17 +52,43 @@ static void change_pte_range(struct mm_struct *mm, pmd_t *pmd,
48 oldpte = *pte; 52 oldpte = *pte;
49 if (pte_present(oldpte)) { 53 if (pte_present(oldpte)) {
50 pte_t ptent; 54 pte_t ptent;
55 bool updated = false;
51 56
52 ptent = ptep_modify_prot_start(mm, addr, pte); 57 ptent = ptep_modify_prot_start(mm, addr, pte);
53 ptent = pte_modify(ptent, newprot); 58 if (!prot_numa) {
59 ptent = pte_modify(ptent, newprot);
60 updated = true;
61 } else {
62 struct page *page;
63
64 page = vm_normal_page(vma, addr, oldpte);
65 if (page) {
66 int this_nid = page_to_nid(page);
67 if (last_nid == -1)
68 last_nid = this_nid;
69 if (last_nid != this_nid)
70 all_same_node = false;
71
72 /* only check non-shared pages */
73 if (!pte_numa(oldpte) &&
74 page_mapcount(page) == 1) {
75 ptent = pte_mknuma(ptent);
76 updated = true;
77 }
78 }
79 }
54 80
55 /* 81 /*
56 * Avoid taking write faults for pages we know to be 82 * Avoid taking write faults for pages we know to be
57 * dirty. 83 * dirty.
58 */ 84 */
59 if (dirty_accountable && pte_dirty(ptent)) 85 if (dirty_accountable && pte_dirty(ptent)) {
60 ptent = pte_mkwrite(ptent); 86 ptent = pte_mkwrite(ptent);
87 updated = true;
88 }
61 89
90 if (updated)
91 pages++;
62 ptep_modify_prot_commit(mm, addr, pte, ptent); 92 ptep_modify_prot_commit(mm, addr, pte, ptent);
63 } else if (IS_ENABLED(CONFIG_MIGRATION) && !pte_file(oldpte)) { 93 } else if (IS_ENABLED(CONFIG_MIGRATION) && !pte_file(oldpte)) {
64 swp_entry_t entry = pte_to_swp_entry(oldpte); 94 swp_entry_t entry = pte_to_swp_entry(oldpte);
@@ -72,61 +102,101 @@ static void change_pte_range(struct mm_struct *mm, pmd_t *pmd,
72 set_pte_at(mm, addr, pte, 102 set_pte_at(mm, addr, pte,
73 swp_entry_to_pte(entry)); 103 swp_entry_to_pte(entry));
74 } 104 }
105 pages++;
75 } 106 }
76 } while (pte++, addr += PAGE_SIZE, addr != end); 107 } while (pte++, addr += PAGE_SIZE, addr != end);
77 arch_leave_lazy_mmu_mode(); 108 arch_leave_lazy_mmu_mode();
78 pte_unmap_unlock(pte - 1, ptl); 109 pte_unmap_unlock(pte - 1, ptl);
110
111 *ret_all_same_node = all_same_node;
112 return pages;
79} 113}
80 114
81static inline void change_pmd_range(struct vm_area_struct *vma, pud_t *pud, 115#ifdef CONFIG_NUMA_BALANCING
82 unsigned long addr, unsigned long end, pgprot_t newprot, 116static inline void change_pmd_protnuma(struct mm_struct *mm, unsigned long addr,
83 int dirty_accountable) 117 pmd_t *pmd)
118{
119 spin_lock(&mm->page_table_lock);
120 set_pmd_at(mm, addr & PMD_MASK, pmd, pmd_mknuma(*pmd));
121 spin_unlock(&mm->page_table_lock);
122}
123#else
124static inline void change_pmd_protnuma(struct mm_struct *mm, unsigned long addr,
125 pmd_t *pmd)
126{
127 BUG();
128}
129#endif /* CONFIG_NUMA_BALANCING */
130
131static inline unsigned long change_pmd_range(struct vm_area_struct *vma,
132 pud_t *pud, unsigned long addr, unsigned long end,
133 pgprot_t newprot, int dirty_accountable, int prot_numa)
84{ 134{
85 pmd_t *pmd; 135 pmd_t *pmd;
86 unsigned long next; 136 unsigned long next;
137 unsigned long pages = 0;
138 bool all_same_node;
87 139
88 pmd = pmd_offset(pud, addr); 140 pmd = pmd_offset(pud, addr);
89 do { 141 do {
90 next = pmd_addr_end(addr, end); 142 next = pmd_addr_end(addr, end);
91 if (pmd_trans_huge(*pmd)) { 143 if (pmd_trans_huge(*pmd)) {
92 if (next - addr != HPAGE_PMD_SIZE) 144 if (next - addr != HPAGE_PMD_SIZE)
93 split_huge_page_pmd(vma->vm_mm, pmd); 145 split_huge_page_pmd(vma, addr, pmd);
94 else if (change_huge_pmd(vma, pmd, addr, newprot)) 146 else if (change_huge_pmd(vma, pmd, addr, newprot,
147 prot_numa)) {
148 pages += HPAGE_PMD_NR;
95 continue; 149 continue;
150 }
96 /* fall through */ 151 /* fall through */
97 } 152 }
98 if (pmd_none_or_clear_bad(pmd)) 153 if (pmd_none_or_clear_bad(pmd))
99 continue; 154 continue;
100 change_pte_range(vma->vm_mm, pmd, addr, next, newprot, 155 pages += change_pte_range(vma, pmd, addr, next, newprot,
101 dirty_accountable); 156 dirty_accountable, prot_numa, &all_same_node);
157
158 /*
159 * If we are changing protections for NUMA hinting faults then
160 * set pmd_numa if the examined pages were all on the same
161 * node. This allows a regular PMD to be handled as one fault
162 * and effectively batches the taking of the PTL
163 */
164 if (prot_numa && all_same_node)
165 change_pmd_protnuma(vma->vm_mm, addr, pmd);
102 } while (pmd++, addr = next, addr != end); 166 } while (pmd++, addr = next, addr != end);
167
168 return pages;
103} 169}
104 170
105static inline void change_pud_range(struct vm_area_struct *vma, pgd_t *pgd, 171static inline unsigned long change_pud_range(struct vm_area_struct *vma,
106 unsigned long addr, unsigned long end, pgprot_t newprot, 172 pgd_t *pgd, unsigned long addr, unsigned long end,
107 int dirty_accountable) 173 pgprot_t newprot, int dirty_accountable, int prot_numa)
108{ 174{
109 pud_t *pud; 175 pud_t *pud;
110 unsigned long next; 176 unsigned long next;
177 unsigned long pages = 0;
111 178
112 pud = pud_offset(pgd, addr); 179 pud = pud_offset(pgd, addr);
113 do { 180 do {
114 next = pud_addr_end(addr, end); 181 next = pud_addr_end(addr, end);
115 if (pud_none_or_clear_bad(pud)) 182 if (pud_none_or_clear_bad(pud))
116 continue; 183 continue;
117 change_pmd_range(vma, pud, addr, next, newprot, 184 pages += change_pmd_range(vma, pud, addr, next, newprot,
118 dirty_accountable); 185 dirty_accountable, prot_numa);
119 } while (pud++, addr = next, addr != end); 186 } while (pud++, addr = next, addr != end);
187
188 return pages;
120} 189}
121 190
122static void change_protection(struct vm_area_struct *vma, 191static unsigned long change_protection_range(struct vm_area_struct *vma,
123 unsigned long addr, unsigned long end, pgprot_t newprot, 192 unsigned long addr, unsigned long end, pgprot_t newprot,
124 int dirty_accountable) 193 int dirty_accountable, int prot_numa)
125{ 194{
126 struct mm_struct *mm = vma->vm_mm; 195 struct mm_struct *mm = vma->vm_mm;
127 pgd_t *pgd; 196 pgd_t *pgd;
128 unsigned long next; 197 unsigned long next;
129 unsigned long start = addr; 198 unsigned long start = addr;
199 unsigned long pages = 0;
130 200
131 BUG_ON(addr >= end); 201 BUG_ON(addr >= end);
132 pgd = pgd_offset(mm, addr); 202 pgd = pgd_offset(mm, addr);
@@ -135,10 +205,32 @@ static void change_protection(struct vm_area_struct *vma,
135 next = pgd_addr_end(addr, end); 205 next = pgd_addr_end(addr, end);
136 if (pgd_none_or_clear_bad(pgd)) 206 if (pgd_none_or_clear_bad(pgd))
137 continue; 207 continue;
138 change_pud_range(vma, pgd, addr, next, newprot, 208 pages += change_pud_range(vma, pgd, addr, next, newprot,
139 dirty_accountable); 209 dirty_accountable, prot_numa);
140 } while (pgd++, addr = next, addr != end); 210 } while (pgd++, addr = next, addr != end);
141 flush_tlb_range(vma, start, end); 211
212 /* Only flush the TLB if we actually modified any entries: */
213 if (pages)
214 flush_tlb_range(vma, start, end);
215
216 return pages;
217}
218
219unsigned long change_protection(struct vm_area_struct *vma, unsigned long start,
220 unsigned long end, pgprot_t newprot,
221 int dirty_accountable, int prot_numa)
222{
223 struct mm_struct *mm = vma->vm_mm;
224 unsigned long pages;
225
226 mmu_notifier_invalidate_range_start(mm, start, end);
227 if (is_vm_hugetlb_page(vma))
228 pages = hugetlb_change_protection(vma, start, end, newprot);
229 else
230 pages = change_protection_range(vma, start, end, newprot, dirty_accountable, prot_numa);
231 mmu_notifier_invalidate_range_end(mm, start, end);
232
233 return pages;
142} 234}
143 235
144int 236int
@@ -213,12 +305,9 @@ success:
213 dirty_accountable = 1; 305 dirty_accountable = 1;
214 } 306 }
215 307
216 mmu_notifier_invalidate_range_start(mm, start, end); 308 change_protection(vma, start, end, vma->vm_page_prot,
217 if (is_vm_hugetlb_page(vma)) 309 dirty_accountable, 0);
218 hugetlb_change_protection(vma, start, end, vma->vm_page_prot); 310
219 else
220 change_protection(vma, start, end, vma->vm_page_prot, dirty_accountable);
221 mmu_notifier_invalidate_range_end(mm, start, end);
222 vm_stat_account(mm, oldflags, vma->vm_file, -nrpages); 311 vm_stat_account(mm, oldflags, vma->vm_file, -nrpages);
223 vm_stat_account(mm, newflags, vma->vm_file, nrpages); 312 vm_stat_account(mm, newflags, vma->vm_file, nrpages);
224 perf_event_mmap(vma); 313 perf_event_mmap(vma);
@@ -274,8 +363,7 @@ SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len,
274 error = -EINVAL; 363 error = -EINVAL;
275 if (!(vma->vm_flags & VM_GROWSDOWN)) 364 if (!(vma->vm_flags & VM_GROWSDOWN))
276 goto out; 365 goto out;
277 } 366 } else {
278 else {
279 if (vma->vm_start > start) 367 if (vma->vm_start > start)
280 goto out; 368 goto out;
281 if (unlikely(grows & PROT_GROWSUP)) { 369 if (unlikely(grows & PROT_GROWSUP)) {
@@ -291,9 +379,10 @@ SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len,
291 for (nstart = start ; ; ) { 379 for (nstart = start ; ; ) {
292 unsigned long newflags; 380 unsigned long newflags;
293 381
294 /* Here we know that vma->vm_start <= nstart < vma->vm_end. */ 382 /* Here we know that vma->vm_start <= nstart < vma->vm_end. */
295 383
296 newflags = vm_flags | (vma->vm_flags & ~(VM_READ | VM_WRITE | VM_EXEC)); 384 newflags = vm_flags;
385 newflags |= (vma->vm_flags & ~(VM_READ | VM_WRITE | VM_EXEC));
297 386
298 /* newflags >> 4 shift VM_MAY% in place of VM_% */ 387 /* newflags >> 4 shift VM_MAY% in place of VM_% */
299 if ((newflags & ~(newflags >> 4)) & (VM_READ | VM_WRITE | VM_EXEC)) { 388 if ((newflags & ~(newflags >> 4)) & (VM_READ | VM_WRITE | VM_EXEC)) {
diff --git a/mm/mremap.c b/mm/mremap.c
index 1b61c2d3307a..e1031e1f6a61 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -104,7 +104,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
104 } 104 }
105 if (vma->anon_vma) { 105 if (vma->anon_vma) {
106 anon_vma = vma->anon_vma; 106 anon_vma = vma->anon_vma;
107 anon_vma_lock(anon_vma); 107 anon_vma_lock_write(anon_vma);
108 } 108 }
109 } 109 }
110 110
@@ -182,7 +182,7 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
182 need_flush = true; 182 need_flush = true;
183 continue; 183 continue;
184 } else if (!err) { 184 } else if (!err) {
185 split_huge_page_pmd(vma->vm_mm, old_pmd); 185 split_huge_page_pmd(vma, old_addr, old_pmd);
186 } 186 }
187 VM_BUG_ON(pmd_trans_huge(*old_pmd)); 187 VM_BUG_ON(pmd_trans_huge(*old_pmd));
188 } 188 }
diff --git a/mm/nobootmem.c b/mm/nobootmem.c
index ecc2f13d557d..03d152a76acf 100644
--- a/mm/nobootmem.c
+++ b/mm/nobootmem.c
@@ -137,6 +137,22 @@ unsigned long __init free_low_memory_core_early(int nodeid)
137 return count; 137 return count;
138} 138}
139 139
140static void reset_node_lowmem_managed_pages(pg_data_t *pgdat)
141{
142 struct zone *z;
143
144 /*
145 * In free_area_init_core(), highmem zone's managed_pages is set to
146 * present_pages, and bootmem allocator doesn't allocate from highmem
147 * zones. So there's no need to recalculate managed_pages because all
148 * highmem pages will be managed by the buddy system. Here highmem
149 * zone also includes highmem movable zone.
150 */
151 for (z = pgdat->node_zones; z < pgdat->node_zones + MAX_NR_ZONES; z++)
152 if (!is_highmem(z))
153 z->managed_pages = 0;
154}
155
140/** 156/**
141 * free_all_bootmem - release free pages to the buddy allocator 157 * free_all_bootmem - release free pages to the buddy allocator
142 * 158 *
@@ -144,6 +160,11 @@ unsigned long __init free_low_memory_core_early(int nodeid)
144 */ 160 */
145unsigned long __init free_all_bootmem(void) 161unsigned long __init free_all_bootmem(void)
146{ 162{
163 struct pglist_data *pgdat;
164
165 for_each_online_pgdat(pgdat)
166 reset_node_lowmem_managed_pages(pgdat);
167
147 /* 168 /*
148 * We need to use MAX_NUMNODES instead of NODE_DATA(0)->node_id 169 * We need to use MAX_NUMNODES instead of NODE_DATA(0)->node_id
149 * because in some case like Node0 doesn't have RAM installed 170 * because in some case like Node0 doesn't have RAM installed
diff --git a/mm/nommu.c b/mm/nommu.c
index 45131b41bcdb..79c3cac87afa 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -66,6 +66,21 @@ int heap_stack_gap = 0;
66 66
67atomic_long_t mmap_pages_allocated; 67atomic_long_t mmap_pages_allocated;
68 68
69/*
70 * The global memory commitment made in the system can be a metric
71 * that can be used to drive ballooning decisions when Linux is hosted
72 * as a guest. On Hyper-V, the host implements a policy engine for dynamically
73 * balancing memory across competing virtual machines that are hosted.
74 * Several metrics drive this policy engine including the guest reported
75 * memory commitment.
76 */
77unsigned long vm_memory_committed(void)
78{
79 return percpu_counter_read_positive(&vm_committed_as);
80}
81
82EXPORT_SYMBOL_GPL(vm_memory_committed);
83
69EXPORT_SYMBOL(mem_map); 84EXPORT_SYMBOL(mem_map);
70EXPORT_SYMBOL(num_physpages); 85EXPORT_SYMBOL(num_physpages);
71 86
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 79e0f3e24831..0399f146ae49 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -44,48 +44,6 @@ int sysctl_oom_kill_allocating_task;
44int sysctl_oom_dump_tasks = 1; 44int sysctl_oom_dump_tasks = 1;
45static DEFINE_SPINLOCK(zone_scan_lock); 45static DEFINE_SPINLOCK(zone_scan_lock);
46 46
47/*
48 * compare_swap_oom_score_adj() - compare and swap current's oom_score_adj
49 * @old_val: old oom_score_adj for compare
50 * @new_val: new oom_score_adj for swap
51 *
52 * Sets the oom_score_adj value for current to @new_val iff its present value is
53 * @old_val. Usually used to reinstate a previous value to prevent racing with
54 * userspacing tuning the value in the interim.
55 */
56void compare_swap_oom_score_adj(int old_val, int new_val)
57{
58 struct sighand_struct *sighand = current->sighand;
59
60 spin_lock_irq(&sighand->siglock);
61 if (current->signal->oom_score_adj == old_val)
62 current->signal->oom_score_adj = new_val;
63 trace_oom_score_adj_update(current);
64 spin_unlock_irq(&sighand->siglock);
65}
66
67/**
68 * test_set_oom_score_adj() - set current's oom_score_adj and return old value
69 * @new_val: new oom_score_adj value
70 *
71 * Sets the oom_score_adj value for current to @new_val with proper
72 * synchronization and returns the old value. Usually used to temporarily
73 * set a value, save the old value in the caller, and then reinstate it later.
74 */
75int test_set_oom_score_adj(int new_val)
76{
77 struct sighand_struct *sighand = current->sighand;
78 int old_val;
79
80 spin_lock_irq(&sighand->siglock);
81 old_val = current->signal->oom_score_adj;
82 current->signal->oom_score_adj = new_val;
83 trace_oom_score_adj_update(current);
84 spin_unlock_irq(&sighand->siglock);
85
86 return old_val;
87}
88
89#ifdef CONFIG_NUMA 47#ifdef CONFIG_NUMA
90/** 48/**
91 * has_intersects_mems_allowed() - check task eligiblity for kill 49 * has_intersects_mems_allowed() - check task eligiblity for kill
@@ -193,7 +151,7 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
193 if (!p) 151 if (!p)
194 return 0; 152 return 0;
195 153
196 adj = p->signal->oom_score_adj; 154 adj = (long)p->signal->oom_score_adj;
197 if (adj == OOM_SCORE_ADJ_MIN) { 155 if (adj == OOM_SCORE_ADJ_MIN) {
198 task_unlock(p); 156 task_unlock(p);
199 return 0; 157 return 0;
@@ -257,7 +215,7 @@ static enum oom_constraint constrained_alloc(struct zonelist *zonelist,
257 * the page allocator means a mempolicy is in effect. Cpuset policy 215 * the page allocator means a mempolicy is in effect. Cpuset policy
258 * is enforced in get_page_from_freelist(). 216 * is enforced in get_page_from_freelist().
259 */ 217 */
260 if (nodemask && !nodes_subset(node_states[N_HIGH_MEMORY], *nodemask)) { 218 if (nodemask && !nodes_subset(node_states[N_MEMORY], *nodemask)) {
261 *totalpages = total_swap_pages; 219 *totalpages = total_swap_pages;
262 for_each_node_mask(nid, *nodemask) 220 for_each_node_mask(nid, *nodemask)
263 *totalpages += node_spanned_pages(nid); 221 *totalpages += node_spanned_pages(nid);
@@ -310,26 +268,20 @@ enum oom_scan_t oom_scan_process_thread(struct task_struct *task,
310 if (!task->mm) 268 if (!task->mm)
311 return OOM_SCAN_CONTINUE; 269 return OOM_SCAN_CONTINUE;
312 270
313 if (task->flags & PF_EXITING) { 271 /*
272 * If task is allocating a lot of memory and has been marked to be
273 * killed first if it triggers an oom, then select it.
274 */
275 if (oom_task_origin(task))
276 return OOM_SCAN_SELECT;
277
278 if (task->flags & PF_EXITING && !force_kill) {
314 /* 279 /*
315 * If task is current and is in the process of releasing memory, 280 * If this task is not being ptraced on exit, then wait for it
316 * allow the "kill" to set TIF_MEMDIE, which will allow it to 281 * to finish before killing some other task unnecessarily.
317 * access memory reserves. Otherwise, it may stall forever.
318 *
319 * The iteration isn't broken here, however, in case other
320 * threads are found to have already been oom killed.
321 */ 282 */
322 if (task == current) 283 if (!(task->group_leader->ptrace & PT_TRACE_EXIT))
323 return OOM_SCAN_SELECT; 284 return OOM_SCAN_ABORT;
324 else if (!force_kill) {
325 /*
326 * If this task is not being ptraced on exit, then wait
327 * for it to finish before killing some other task
328 * unnecessarily.
329 */
330 if (!(task->group_leader->ptrace & PT_TRACE_EXIT))
331 return OOM_SCAN_ABORT;
332 }
333 } 285 }
334 return OOM_SCAN_OK; 286 return OOM_SCAN_OK;
335} 287}
@@ -412,7 +364,7 @@ static void dump_tasks(const struct mem_cgroup *memcg, const nodemask_t *nodemas
412 continue; 364 continue;
413 } 365 }
414 366
415 pr_info("[%5d] %5d %5d %8lu %8lu %7lu %8lu %5d %s\n", 367 pr_info("[%5d] %5d %5d %8lu %8lu %7lu %8lu %5hd %s\n",
416 task->pid, from_kuid(&init_user_ns, task_uid(task)), 368 task->pid, from_kuid(&init_user_ns, task_uid(task)),
417 task->tgid, task->mm->total_vm, get_mm_rss(task->mm), 369 task->tgid, task->mm->total_vm, get_mm_rss(task->mm),
418 task->mm->nr_ptes, 370 task->mm->nr_ptes,
@@ -428,7 +380,7 @@ static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order,
428{ 380{
429 task_lock(current); 381 task_lock(current);
430 pr_warning("%s invoked oom-killer: gfp_mask=0x%x, order=%d, " 382 pr_warning("%s invoked oom-killer: gfp_mask=0x%x, order=%d, "
431 "oom_score_adj=%d\n", 383 "oom_score_adj=%hd\n",
432 current->comm, gfp_mask, order, 384 current->comm, gfp_mask, order,
433 current->signal->oom_score_adj); 385 current->signal->oom_score_adj);
434 cpuset_print_task_mems_allowed(current); 386 cpuset_print_task_mems_allowed(current);
@@ -639,43 +591,6 @@ void clear_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_mask)
639 spin_unlock(&zone_scan_lock); 591 spin_unlock(&zone_scan_lock);
640} 592}
641 593
642/*
643 * Try to acquire the oom killer lock for all system zones. Returns zero if a
644 * parallel oom killing is taking place, otherwise locks all zones and returns
645 * non-zero.
646 */
647static int try_set_system_oom(void)
648{
649 struct zone *zone;
650 int ret = 1;
651
652 spin_lock(&zone_scan_lock);
653 for_each_populated_zone(zone)
654 if (zone_is_oom_locked(zone)) {
655 ret = 0;
656 goto out;
657 }
658 for_each_populated_zone(zone)
659 zone_set_flag(zone, ZONE_OOM_LOCKED);
660out:
661 spin_unlock(&zone_scan_lock);
662 return ret;
663}
664
665/*
666 * Clears ZONE_OOM_LOCKED for all system zones so that failed allocation
667 * attempts or page faults may now recall the oom killer, if necessary.
668 */
669static void clear_system_oom(void)
670{
671 struct zone *zone;
672
673 spin_lock(&zone_scan_lock);
674 for_each_populated_zone(zone)
675 zone_clear_flag(zone, ZONE_OOM_LOCKED);
676 spin_unlock(&zone_scan_lock);
677}
678
679/** 594/**
680 * out_of_memory - kill the "best" process when we run out of memory 595 * out_of_memory - kill the "best" process when we run out of memory
681 * @zonelist: zonelist pointer 596 * @zonelist: zonelist pointer
@@ -706,11 +621,11 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
706 return; 621 return;
707 622
708 /* 623 /*
709 * If current has a pending SIGKILL, then automatically select it. The 624 * If current has a pending SIGKILL or is exiting, then automatically
710 * goal is to allow it to allocate so that it may quickly exit and free 625 * select it. The goal is to allow it to allocate so that it may
711 * its memory. 626 * quickly exit and free its memory.
712 */ 627 */
713 if (fatal_signal_pending(current)) { 628 if (fatal_signal_pending(current) || current->flags & PF_EXITING) {
714 set_thread_flag(TIF_MEMDIE); 629 set_thread_flag(TIF_MEMDIE);
715 return; 630 return;
716 } 631 }
@@ -756,15 +671,16 @@ out:
756 671
757/* 672/*
758 * The pagefault handler calls here because it is out of memory, so kill a 673 * The pagefault handler calls here because it is out of memory, so kill a
759 * memory-hogging task. If a populated zone has ZONE_OOM_LOCKED set, a parallel 674 * memory-hogging task. If any populated zone has ZONE_OOM_LOCKED set, a
760 * oom killing is already in progress so do nothing. If a task is found with 675 * parallel oom killing is already in progress so do nothing.
761 * TIF_MEMDIE set, it has been killed so do nothing and allow it to exit.
762 */ 676 */
763void pagefault_out_of_memory(void) 677void pagefault_out_of_memory(void)
764{ 678{
765 if (try_set_system_oom()) { 679 struct zonelist *zonelist = node_zonelist(first_online_node,
680 GFP_KERNEL);
681
682 if (try_set_zonelist_oom(zonelist, GFP_KERNEL)) {
766 out_of_memory(NULL, 0, 0, NULL, false); 683 out_of_memory(NULL, 0, 0, NULL, false);
767 clear_system_oom(); 684 clear_zonelist_oom(zonelist, GFP_KERNEL);
768 } 685 }
769 schedule_timeout_killable(1);
770} 686}
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 830893b2b3c7..0713bfbf0954 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -201,6 +201,18 @@ static unsigned long highmem_dirtyable_memory(unsigned long total)
201 zone_reclaimable_pages(z) - z->dirty_balance_reserve; 201 zone_reclaimable_pages(z) - z->dirty_balance_reserve;
202 } 202 }
203 /* 203 /*
204 * Unreclaimable memory (kernel memory or anonymous memory
205 * without swap) can bring down the dirtyable pages below
206 * the zone's dirty balance reserve and the above calculation
207 * will underflow. However we still want to add in nodes
208 * which are below threshold (negative values) to get a more
209 * accurate calculation but make sure that the total never
210 * underflows.
211 */
212 if ((long)x < 0)
213 x = 0;
214
215 /*
204 * Make sure that the number of highmem pages is never larger 216 * Make sure that the number of highmem pages is never larger
205 * than the number of the total dirtyable memory. This can only 217 * than the number of the total dirtyable memory. This can only
206 * occur in very strange VM situations but we want to make sure 218 * occur in very strange VM situations but we want to make sure
@@ -222,8 +234,8 @@ static unsigned long global_dirtyable_memory(void)
222{ 234{
223 unsigned long x; 235 unsigned long x;
224 236
225 x = global_page_state(NR_FREE_PAGES) + global_reclaimable_pages() - 237 x = global_page_state(NR_FREE_PAGES) + global_reclaimable_pages();
226 dirty_balance_reserve; 238 x -= min(x, dirty_balance_reserve);
227 239
228 if (!vm_highmem_is_dirtyable) 240 if (!vm_highmem_is_dirtyable)
229 x -= highmem_dirtyable_memory(x); 241 x -= highmem_dirtyable_memory(x);
@@ -290,9 +302,12 @@ static unsigned long zone_dirtyable_memory(struct zone *zone)
290 * highmem zone can hold its share of dirty pages, so we don't 302 * highmem zone can hold its share of dirty pages, so we don't
291 * care about vm_highmem_is_dirtyable here. 303 * care about vm_highmem_is_dirtyable here.
292 */ 304 */
293 return zone_page_state(zone, NR_FREE_PAGES) + 305 unsigned long nr_pages = zone_page_state(zone, NR_FREE_PAGES) +
294 zone_reclaimable_pages(zone) - 306 zone_reclaimable_pages(zone);
295 zone->dirty_balance_reserve; 307
308 /* don't allow this to underflow */
309 nr_pages -= min(nr_pages, zone->dirty_balance_reserve);
310 return nr_pages;
296} 311}
297 312
298/** 313/**
@@ -1069,7 +1084,7 @@ static void bdi_update_bandwidth(struct backing_dev_info *bdi,
1069} 1084}
1070 1085
1071/* 1086/*
1072 * After a task dirtied this many pages, balance_dirty_pages_ratelimited_nr() 1087 * After a task dirtied this many pages, balance_dirty_pages_ratelimited()
1073 * will look to see if it needs to start dirty throttling. 1088 * will look to see if it needs to start dirty throttling.
1074 * 1089 *
1075 * If dirty_poll_interval is too low, big NUMA machines will call the expensive 1090 * If dirty_poll_interval is too low, big NUMA machines will call the expensive
@@ -1436,9 +1451,8 @@ static DEFINE_PER_CPU(int, bdp_ratelimits);
1436DEFINE_PER_CPU(int, dirty_throttle_leaks) = 0; 1451DEFINE_PER_CPU(int, dirty_throttle_leaks) = 0;
1437 1452
1438/** 1453/**
1439 * balance_dirty_pages_ratelimited_nr - balance dirty memory state 1454 * balance_dirty_pages_ratelimited - balance dirty memory state
1440 * @mapping: address_space which was dirtied 1455 * @mapping: address_space which was dirtied
1441 * @nr_pages_dirtied: number of pages which the caller has just dirtied
1442 * 1456 *
1443 * Processes which are dirtying memory should call in here once for each page 1457 * Processes which are dirtying memory should call in here once for each page
1444 * which was newly dirtied. The function will periodically check the system's 1458 * which was newly dirtied. The function will periodically check the system's
@@ -1449,8 +1463,7 @@ DEFINE_PER_CPU(int, dirty_throttle_leaks) = 0;
1449 * limit we decrease the ratelimiting by a lot, to prevent individual processes 1463 * limit we decrease the ratelimiting by a lot, to prevent individual processes
1450 * from overshooting the limit by (ratelimit_pages) each. 1464 * from overshooting the limit by (ratelimit_pages) each.
1451 */ 1465 */
1452void balance_dirty_pages_ratelimited_nr(struct address_space *mapping, 1466void balance_dirty_pages_ratelimited(struct address_space *mapping)
1453 unsigned long nr_pages_dirtied)
1454{ 1467{
1455 struct backing_dev_info *bdi = mapping->backing_dev_info; 1468 struct backing_dev_info *bdi = mapping->backing_dev_info;
1456 int ratelimit; 1469 int ratelimit;
@@ -1484,6 +1497,7 @@ void balance_dirty_pages_ratelimited_nr(struct address_space *mapping,
1484 */ 1497 */
1485 p = &__get_cpu_var(dirty_throttle_leaks); 1498 p = &__get_cpu_var(dirty_throttle_leaks);
1486 if (*p > 0 && current->nr_dirtied < ratelimit) { 1499 if (*p > 0 && current->nr_dirtied < ratelimit) {
1500 unsigned long nr_pages_dirtied;
1487 nr_pages_dirtied = min(*p, ratelimit - current->nr_dirtied); 1501 nr_pages_dirtied = min(*p, ratelimit - current->nr_dirtied);
1488 *p -= nr_pages_dirtied; 1502 *p -= nr_pages_dirtied;
1489 current->nr_dirtied += nr_pages_dirtied; 1503 current->nr_dirtied += nr_pages_dirtied;
@@ -1493,7 +1507,7 @@ void balance_dirty_pages_ratelimited_nr(struct address_space *mapping,
1493 if (unlikely(current->nr_dirtied >= ratelimit)) 1507 if (unlikely(current->nr_dirtied >= ratelimit))
1494 balance_dirty_pages(mapping, current->nr_dirtied); 1508 balance_dirty_pages(mapping, current->nr_dirtied);
1495} 1509}
1496EXPORT_SYMBOL(balance_dirty_pages_ratelimited_nr); 1510EXPORT_SYMBOL(balance_dirty_pages_ratelimited);
1497 1511
1498void throttle_vm_writeout(gfp_t gfp_mask) 1512void throttle_vm_writeout(gfp_t gfp_mask)
1499{ 1513{
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 7bb35ac0964a..df2022ff0c8a 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -90,6 +90,9 @@ nodemask_t node_states[NR_NODE_STATES] __read_mostly = {
90#ifdef CONFIG_HIGHMEM 90#ifdef CONFIG_HIGHMEM
91 [N_HIGH_MEMORY] = { { [0] = 1UL } }, 91 [N_HIGH_MEMORY] = { { [0] = 1UL } },
92#endif 92#endif
93#ifdef CONFIG_MOVABLE_NODE
94 [N_MEMORY] = { { [0] = 1UL } },
95#endif
93 [N_CPU] = { { [0] = 1UL } }, 96 [N_CPU] = { { [0] = 1UL } },
94#endif /* NUMA */ 97#endif /* NUMA */
95}; 98};
@@ -218,11 +221,6 @@ EXPORT_SYMBOL(nr_online_nodes);
218 221
219int page_group_by_mobility_disabled __read_mostly; 222int page_group_by_mobility_disabled __read_mostly;
220 223
221/*
222 * NOTE:
223 * Don't use set_pageblock_migratetype(page, MIGRATE_ISOLATE) directly.
224 * Instead, use {un}set_pageblock_isolate.
225 */
226void set_pageblock_migratetype(struct page *page, int migratetype) 224void set_pageblock_migratetype(struct page *page, int migratetype)
227{ 225{
228 226
@@ -368,8 +366,7 @@ static int destroy_compound_page(struct page *page, unsigned long order)
368 int nr_pages = 1 << order; 366 int nr_pages = 1 << order;
369 int bad = 0; 367 int bad = 0;
370 368
371 if (unlikely(compound_order(page) != order) || 369 if (unlikely(compound_order(page) != order)) {
372 unlikely(!PageHead(page))) {
373 bad_page(page); 370 bad_page(page);
374 bad++; 371 bad++;
375 } 372 }
@@ -523,7 +520,7 @@ static inline int page_is_buddy(struct page *page, struct page *buddy,
523 * If a block is freed, and its buddy is also free, then this 520 * If a block is freed, and its buddy is also free, then this
524 * triggers coalescing into a block of larger size. 521 * triggers coalescing into a block of larger size.
525 * 522 *
526 * -- wli 523 * -- nyc
527 */ 524 */
528 525
529static inline void __free_one_page(struct page *page, 526static inline void __free_one_page(struct page *page,
@@ -608,6 +605,7 @@ static inline int free_pages_check(struct page *page)
608 bad_page(page); 605 bad_page(page);
609 return 1; 606 return 1;
610 } 607 }
608 reset_page_last_nid(page);
611 if (page->flags & PAGE_FLAGS_CHECK_AT_PREP) 609 if (page->flags & PAGE_FLAGS_CHECK_AT_PREP)
612 page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; 610 page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
613 return 0; 611 return 0;
@@ -667,11 +665,13 @@ static void free_pcppages_bulk(struct zone *zone, int count,
667 /* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */ 665 /* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */
668 __free_one_page(page, zone, 0, mt); 666 __free_one_page(page, zone, 0, mt);
669 trace_mm_page_pcpu_drain(page, 0, mt); 667 trace_mm_page_pcpu_drain(page, 0, mt);
670 if (is_migrate_cma(mt)) 668 if (likely(get_pageblock_migratetype(page) != MIGRATE_ISOLATE)) {
671 __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, 1); 669 __mod_zone_page_state(zone, NR_FREE_PAGES, 1);
670 if (is_migrate_cma(mt))
671 __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, 1);
672 }
672 } while (--to_free && --batch_free && !list_empty(list)); 673 } while (--to_free && --batch_free && !list_empty(list));
673 } 674 }
674 __mod_zone_page_state(zone, NR_FREE_PAGES, count);
675 spin_unlock(&zone->lock); 675 spin_unlock(&zone->lock);
676} 676}
677 677
@@ -730,6 +730,13 @@ static void __free_pages_ok(struct page *page, unsigned int order)
730 local_irq_restore(flags); 730 local_irq_restore(flags);
731} 731}
732 732
733/*
734 * Read access to zone->managed_pages is safe because it's unsigned long,
735 * but we still need to serialize writers. Currently all callers of
736 * __free_pages_bootmem() except put_page_bootmem() should only be used
737 * at boot time. So for shorter boot time, we shift the burden to
738 * put_page_bootmem() to serialize writers.
739 */
733void __meminit __free_pages_bootmem(struct page *page, unsigned int order) 740void __meminit __free_pages_bootmem(struct page *page, unsigned int order)
734{ 741{
735 unsigned int nr_pages = 1 << order; 742 unsigned int nr_pages = 1 << order;
@@ -745,6 +752,7 @@ void __meminit __free_pages_bootmem(struct page *page, unsigned int order)
745 set_page_count(p, 0); 752 set_page_count(p, 0);
746 } 753 }
747 754
755 page_zone(page)->managed_pages += 1 << order;
748 set_page_refcounted(page); 756 set_page_refcounted(page);
749 __free_pages(page, order); 757 __free_pages(page, order);
750} 758}
@@ -780,7 +788,7 @@ void __init init_cma_reserved_pageblock(struct page *page)
780 * large block of memory acted on by a series of small allocations. 788 * large block of memory acted on by a series of small allocations.
781 * This behavior is a critical factor in sglist merging's success. 789 * This behavior is a critical factor in sglist merging's success.
782 * 790 *
783 * -- wli 791 * -- nyc
784 */ 792 */
785static inline void expand(struct zone *zone, struct page *page, 793static inline void expand(struct zone *zone, struct page *page,
786 int low, int high, struct free_area *area, 794 int low, int high, struct free_area *area,
@@ -1376,14 +1384,8 @@ void split_page(struct page *page, unsigned int order)
1376 set_page_refcounted(page + i); 1384 set_page_refcounted(page + i);
1377} 1385}
1378 1386
1379/* 1387static int __isolate_free_page(struct page *page, unsigned int order)
1380 * Similar to the split_page family of functions except that the page
1381 * required at the given order and being isolated now to prevent races
1382 * with parallel allocators
1383 */
1384int capture_free_page(struct page *page, int alloc_order, int migratetype)
1385{ 1388{
1386 unsigned int order;
1387 unsigned long watermark; 1389 unsigned long watermark;
1388 struct zone *zone; 1390 struct zone *zone;
1389 int mt; 1391 int mt;
@@ -1391,27 +1393,23 @@ int capture_free_page(struct page *page, int alloc_order, int migratetype)
1391 BUG_ON(!PageBuddy(page)); 1393 BUG_ON(!PageBuddy(page));
1392 1394
1393 zone = page_zone(page); 1395 zone = page_zone(page);
1394 order = page_order(page); 1396 mt = get_pageblock_migratetype(page);
1395 1397
1396 /* Obey watermarks as if the page was being allocated */ 1398 if (mt != MIGRATE_ISOLATE) {
1397 watermark = low_wmark_pages(zone) + (1 << order); 1399 /* Obey watermarks as if the page was being allocated */
1398 if (!zone_watermark_ok(zone, 0, watermark, 0, 0)) 1400 watermark = low_wmark_pages(zone) + (1 << order);
1399 return 0; 1401 if (!zone_watermark_ok(zone, 0, watermark, 0, 0))
1402 return 0;
1403
1404 __mod_zone_freepage_state(zone, -(1UL << order), mt);
1405 }
1400 1406
1401 /* Remove page from free list */ 1407 /* Remove page from free list */
1402 list_del(&page->lru); 1408 list_del(&page->lru);
1403 zone->free_area[order].nr_free--; 1409 zone->free_area[order].nr_free--;
1404 rmv_page_order(page); 1410 rmv_page_order(page);
1405 1411
1406 mt = get_pageblock_migratetype(page); 1412 /* Set the pageblock if the isolated page is at least a pageblock */
1407 if (unlikely(mt != MIGRATE_ISOLATE))
1408 __mod_zone_freepage_state(zone, -(1UL << order), mt);
1409
1410 if (alloc_order != order)
1411 expand(zone, page, alloc_order, order,
1412 &zone->free_area[order], migratetype);
1413
1414 /* Set the pageblock if the captured page is at least a pageblock */
1415 if (order >= pageblock_order - 1) { 1413 if (order >= pageblock_order - 1) {
1416 struct page *endpage = page + (1 << order) - 1; 1414 struct page *endpage = page + (1 << order) - 1;
1417 for (; page < endpage; page += pageblock_nr_pages) { 1415 for (; page < endpage; page += pageblock_nr_pages) {
@@ -1440,10 +1438,9 @@ int split_free_page(struct page *page)
1440 unsigned int order; 1438 unsigned int order;
1441 int nr_pages; 1439 int nr_pages;
1442 1440
1443 BUG_ON(!PageBuddy(page));
1444 order = page_order(page); 1441 order = page_order(page);
1445 1442
1446 nr_pages = capture_free_page(page, order, 0); 1443 nr_pages = __isolate_free_page(page, order);
1447 if (!nr_pages) 1444 if (!nr_pages)
1448 return 0; 1445 return 0;
1449 1446
@@ -1641,20 +1638,6 @@ static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
1641 return true; 1638 return true;
1642} 1639}
1643 1640
1644#ifdef CONFIG_MEMORY_ISOLATION
1645static inline unsigned long nr_zone_isolate_freepages(struct zone *zone)
1646{
1647 if (unlikely(zone->nr_pageblock_isolate))
1648 return zone->nr_pageblock_isolate * pageblock_nr_pages;
1649 return 0;
1650}
1651#else
1652static inline unsigned long nr_zone_isolate_freepages(struct zone *zone)
1653{
1654 return 0;
1655}
1656#endif
1657
1658bool zone_watermark_ok(struct zone *z, int order, unsigned long mark, 1641bool zone_watermark_ok(struct zone *z, int order, unsigned long mark,
1659 int classzone_idx, int alloc_flags) 1642 int classzone_idx, int alloc_flags)
1660{ 1643{
@@ -1670,14 +1653,6 @@ bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark,
1670 if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark) 1653 if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark)
1671 free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES); 1654 free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES);
1672 1655
1673 /*
1674 * If the zone has MIGRATE_ISOLATE type free pages, we should consider
1675 * it. nr_zone_isolate_freepages is never accurate so kswapd might not
1676 * sleep although it could do so. But this is more desirable for memory
1677 * hotplug than sleeping which can cause a livelock in the direct
1678 * reclaim path.
1679 */
1680 free_pages -= nr_zone_isolate_freepages(z);
1681 return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags, 1656 return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
1682 free_pages); 1657 free_pages);
1683} 1658}
@@ -1692,7 +1667,7 @@ bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark,
1692 * 1667 *
1693 * If the zonelist cache is present in the passed in zonelist, then 1668 * If the zonelist cache is present in the passed in zonelist, then
1694 * returns a pointer to the allowed node mask (either the current 1669 * returns a pointer to the allowed node mask (either the current
1695 * tasks mems_allowed, or node_states[N_HIGH_MEMORY].) 1670 * tasks mems_allowed, or node_states[N_MEMORY].)
1696 * 1671 *
1697 * If the zonelist cache is not available for this zonelist, does 1672 * If the zonelist cache is not available for this zonelist, does
1698 * nothing and returns NULL. 1673 * nothing and returns NULL.
@@ -1721,7 +1696,7 @@ static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags)
1721 1696
1722 allowednodes = !in_interrupt() && (alloc_flags & ALLOC_CPUSET) ? 1697 allowednodes = !in_interrupt() && (alloc_flags & ALLOC_CPUSET) ?
1723 &cpuset_current_mems_allowed : 1698 &cpuset_current_mems_allowed :
1724 &node_states[N_HIGH_MEMORY]; 1699 &node_states[N_MEMORY];
1725 return allowednodes; 1700 return allowednodes;
1726} 1701}
1727 1702
@@ -1871,7 +1846,7 @@ zonelist_scan:
1871 */ 1846 */
1872 for_each_zone_zonelist_nodemask(zone, z, zonelist, 1847 for_each_zone_zonelist_nodemask(zone, z, zonelist,
1873 high_zoneidx, nodemask) { 1848 high_zoneidx, nodemask) {
1874 if (NUMA_BUILD && zlc_active && 1849 if (IS_ENABLED(CONFIG_NUMA) && zlc_active &&
1875 !zlc_zone_worth_trying(zonelist, z, allowednodes)) 1850 !zlc_zone_worth_trying(zonelist, z, allowednodes))
1876 continue; 1851 continue;
1877 if ((alloc_flags & ALLOC_CPUSET) && 1852 if ((alloc_flags & ALLOC_CPUSET) &&
@@ -1917,7 +1892,8 @@ zonelist_scan:
1917 classzone_idx, alloc_flags)) 1892 classzone_idx, alloc_flags))
1918 goto try_this_zone; 1893 goto try_this_zone;
1919 1894
1920 if (NUMA_BUILD && !did_zlc_setup && nr_online_nodes > 1) { 1895 if (IS_ENABLED(CONFIG_NUMA) &&
1896 !did_zlc_setup && nr_online_nodes > 1) {
1921 /* 1897 /*
1922 * we do zlc_setup if there are multiple nodes 1898 * we do zlc_setup if there are multiple nodes
1923 * and before considering the first zone allowed 1899 * and before considering the first zone allowed
@@ -1936,7 +1912,7 @@ zonelist_scan:
1936 * As we may have just activated ZLC, check if the first 1912 * As we may have just activated ZLC, check if the first
1937 * eligible zone has failed zone_reclaim recently. 1913 * eligible zone has failed zone_reclaim recently.
1938 */ 1914 */
1939 if (NUMA_BUILD && zlc_active && 1915 if (IS_ENABLED(CONFIG_NUMA) && zlc_active &&
1940 !zlc_zone_worth_trying(zonelist, z, allowednodes)) 1916 !zlc_zone_worth_trying(zonelist, z, allowednodes))
1941 continue; 1917 continue;
1942 1918
@@ -1962,11 +1938,11 @@ try_this_zone:
1962 if (page) 1938 if (page)
1963 break; 1939 break;
1964this_zone_full: 1940this_zone_full:
1965 if (NUMA_BUILD) 1941 if (IS_ENABLED(CONFIG_NUMA))
1966 zlc_mark_zone_full(zonelist, z); 1942 zlc_mark_zone_full(zonelist, z);
1967 } 1943 }
1968 1944
1969 if (unlikely(NUMA_BUILD && page == NULL && zlc_active)) { 1945 if (unlikely(IS_ENABLED(CONFIG_NUMA) && page == NULL && zlc_active)) {
1970 /* Disable zlc cache for second zonelist scan */ 1946 /* Disable zlc cache for second zonelist scan */
1971 zlc_active = 0; 1947 zlc_active = 0;
1972 goto zonelist_scan; 1948 goto zonelist_scan;
@@ -2148,8 +2124,6 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
2148 bool *contended_compaction, bool *deferred_compaction, 2124 bool *contended_compaction, bool *deferred_compaction,
2149 unsigned long *did_some_progress) 2125 unsigned long *did_some_progress)
2150{ 2126{
2151 struct page *page = NULL;
2152
2153 if (!order) 2127 if (!order)
2154 return NULL; 2128 return NULL;
2155 2129
@@ -2161,16 +2135,12 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
2161 current->flags |= PF_MEMALLOC; 2135 current->flags |= PF_MEMALLOC;
2162 *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask, 2136 *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask,
2163 nodemask, sync_migration, 2137 nodemask, sync_migration,
2164 contended_compaction, &page); 2138 contended_compaction);
2165 current->flags &= ~PF_MEMALLOC; 2139 current->flags &= ~PF_MEMALLOC;
2166 2140
2167 /* If compaction captured a page, prep and use it */
2168 if (page) {
2169 prep_new_page(page, order, gfp_mask);
2170 goto got_page;
2171 }
2172
2173 if (*did_some_progress != COMPACT_SKIPPED) { 2141 if (*did_some_progress != COMPACT_SKIPPED) {
2142 struct page *page;
2143
2174 /* Page migration frees to the PCP lists but we want merging */ 2144 /* Page migration frees to the PCP lists but we want merging */
2175 drain_pages(get_cpu()); 2145 drain_pages(get_cpu());
2176 put_cpu(); 2146 put_cpu();
@@ -2180,7 +2150,6 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
2180 alloc_flags & ~ALLOC_NO_WATERMARKS, 2150 alloc_flags & ~ALLOC_NO_WATERMARKS,
2181 preferred_zone, migratetype); 2151 preferred_zone, migratetype);
2182 if (page) { 2152 if (page) {
2183got_page:
2184 preferred_zone->compact_blockskip_flush = false; 2153 preferred_zone->compact_blockskip_flush = false;
2185 preferred_zone->compact_considered = 0; 2154 preferred_zone->compact_considered = 0;
2186 preferred_zone->compact_defer_shift = 0; 2155 preferred_zone->compact_defer_shift = 0;
@@ -2266,7 +2235,7 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
2266 return NULL; 2235 return NULL;
2267 2236
2268 /* After successful reclaim, reconsider all zones for allocation */ 2237 /* After successful reclaim, reconsider all zones for allocation */
2269 if (NUMA_BUILD) 2238 if (IS_ENABLED(CONFIG_NUMA))
2270 zlc_clear_zones_full(zonelist); 2239 zlc_clear_zones_full(zonelist);
2271 2240
2272retry: 2241retry:
@@ -2412,12 +2381,14 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
2412 * allowed per node queues are empty and that nodes are 2381 * allowed per node queues are empty and that nodes are
2413 * over allocated. 2382 * over allocated.
2414 */ 2383 */
2415 if (NUMA_BUILD && (gfp_mask & GFP_THISNODE) == GFP_THISNODE) 2384 if (IS_ENABLED(CONFIG_NUMA) &&
2385 (gfp_mask & GFP_THISNODE) == GFP_THISNODE)
2416 goto nopage; 2386 goto nopage;
2417 2387
2418restart: 2388restart:
2419 wake_all_kswapd(order, zonelist, high_zoneidx, 2389 if (!(gfp_mask & __GFP_NO_KSWAPD))
2420 zone_idx(preferred_zone)); 2390 wake_all_kswapd(order, zonelist, high_zoneidx,
2391 zone_idx(preferred_zone));
2421 2392
2422 /* 2393 /*
2423 * OK, we're below the kswapd watermark and have kicked background 2394 * OK, we're below the kswapd watermark and have kicked background
@@ -2494,7 +2465,7 @@ rebalance:
2494 * system then fail the allocation instead of entering direct reclaim. 2465 * system then fail the allocation instead of entering direct reclaim.
2495 */ 2466 */
2496 if ((deferred_compaction || contended_compaction) && 2467 if ((deferred_compaction || contended_compaction) &&
2497 (gfp_mask & (__GFP_MOVABLE|__GFP_REPEAT)) == __GFP_MOVABLE) 2468 (gfp_mask & __GFP_NO_KSWAPD))
2498 goto nopage; 2469 goto nopage;
2499 2470
2500 /* Try direct reclaim and then allocating */ 2471 /* Try direct reclaim and then allocating */
@@ -2595,6 +2566,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
2595 int migratetype = allocflags_to_migratetype(gfp_mask); 2566 int migratetype = allocflags_to_migratetype(gfp_mask);
2596 unsigned int cpuset_mems_cookie; 2567 unsigned int cpuset_mems_cookie;
2597 int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET; 2568 int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET;
2569 struct mem_cgroup *memcg = NULL;
2598 2570
2599 gfp_mask &= gfp_allowed_mask; 2571 gfp_mask &= gfp_allowed_mask;
2600 2572
@@ -2613,6 +2585,13 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
2613 if (unlikely(!zonelist->_zonerefs->zone)) 2585 if (unlikely(!zonelist->_zonerefs->zone))
2614 return NULL; 2586 return NULL;
2615 2587
2588 /*
2589 * Will only have any effect when __GFP_KMEMCG is set. This is
2590 * verified in the (always inline) callee
2591 */
2592 if (!memcg_kmem_newpage_charge(gfp_mask, &memcg, order))
2593 return NULL;
2594
2616retry_cpuset: 2595retry_cpuset:
2617 cpuset_mems_cookie = get_mems_allowed(); 2596 cpuset_mems_cookie = get_mems_allowed();
2618 2597
@@ -2648,6 +2627,8 @@ out:
2648 if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page)) 2627 if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
2649 goto retry_cpuset; 2628 goto retry_cpuset;
2650 2629
2630 memcg_kmem_commit_charge(page, memcg, order);
2631
2651 return page; 2632 return page;
2652} 2633}
2653EXPORT_SYMBOL(__alloc_pages_nodemask); 2634EXPORT_SYMBOL(__alloc_pages_nodemask);
@@ -2700,6 +2681,31 @@ void free_pages(unsigned long addr, unsigned int order)
2700 2681
2701EXPORT_SYMBOL(free_pages); 2682EXPORT_SYMBOL(free_pages);
2702 2683
2684/*
2685 * __free_memcg_kmem_pages and free_memcg_kmem_pages will free
2686 * pages allocated with __GFP_KMEMCG.
2687 *
2688 * Those pages are accounted to a particular memcg, embedded in the
2689 * corresponding page_cgroup. To avoid adding a hit in the allocator to search
2690 * for that information only to find out that it is NULL for users who have no
2691 * interest in that whatsoever, we provide these functions.
2692 *
2693 * The caller knows better which flags it relies on.
2694 */
2695void __free_memcg_kmem_pages(struct page *page, unsigned int order)
2696{
2697 memcg_kmem_uncharge_pages(page, order);
2698 __free_pages(page, order);
2699}
2700
2701void free_memcg_kmem_pages(unsigned long addr, unsigned int order)
2702{
2703 if (addr != 0) {
2704 VM_BUG_ON(!virt_addr_valid((void *)addr));
2705 __free_memcg_kmem_pages(virt_to_page((void *)addr), order);
2706 }
2707}
2708
2703static void *make_alloc_exact(unsigned long addr, unsigned order, size_t size) 2709static void *make_alloc_exact(unsigned long addr, unsigned order, size_t size)
2704{ 2710{
2705 if (addr) { 2711 if (addr) {
@@ -2818,7 +2824,7 @@ unsigned int nr_free_pagecache_pages(void)
2818 2824
2819static inline void show_node(struct zone *zone) 2825static inline void show_node(struct zone *zone)
2820{ 2826{
2821 if (NUMA_BUILD) 2827 if (IS_ENABLED(CONFIG_NUMA))
2822 printk("Node %d ", zone_to_nid(zone)); 2828 printk("Node %d ", zone_to_nid(zone));
2823} 2829}
2824 2830
@@ -2876,6 +2882,31 @@ out:
2876 2882
2877#define K(x) ((x) << (PAGE_SHIFT-10)) 2883#define K(x) ((x) << (PAGE_SHIFT-10))
2878 2884
2885static void show_migration_types(unsigned char type)
2886{
2887 static const char types[MIGRATE_TYPES] = {
2888 [MIGRATE_UNMOVABLE] = 'U',
2889 [MIGRATE_RECLAIMABLE] = 'E',
2890 [MIGRATE_MOVABLE] = 'M',
2891 [MIGRATE_RESERVE] = 'R',
2892#ifdef CONFIG_CMA
2893 [MIGRATE_CMA] = 'C',
2894#endif
2895 [MIGRATE_ISOLATE] = 'I',
2896 };
2897 char tmp[MIGRATE_TYPES + 1];
2898 char *p = tmp;
2899 int i;
2900
2901 for (i = 0; i < MIGRATE_TYPES; i++) {
2902 if (type & (1 << i))
2903 *p++ = types[i];
2904 }
2905
2906 *p = '\0';
2907 printk("(%s) ", tmp);
2908}
2909
2879/* 2910/*
2880 * Show free area list (used inside shift_scroll-lock stuff) 2911 * Show free area list (used inside shift_scroll-lock stuff)
2881 * We also calculate the percentage fragmentation. We do this by counting the 2912 * We also calculate the percentage fragmentation. We do this by counting the
@@ -2950,6 +2981,7 @@ void show_free_areas(unsigned int filter)
2950 " isolated(anon):%lukB" 2981 " isolated(anon):%lukB"
2951 " isolated(file):%lukB" 2982 " isolated(file):%lukB"
2952 " present:%lukB" 2983 " present:%lukB"
2984 " managed:%lukB"
2953 " mlocked:%lukB" 2985 " mlocked:%lukB"
2954 " dirty:%lukB" 2986 " dirty:%lukB"
2955 " writeback:%lukB" 2987 " writeback:%lukB"
@@ -2979,6 +3011,7 @@ void show_free_areas(unsigned int filter)
2979 K(zone_page_state(zone, NR_ISOLATED_ANON)), 3011 K(zone_page_state(zone, NR_ISOLATED_ANON)),
2980 K(zone_page_state(zone, NR_ISOLATED_FILE)), 3012 K(zone_page_state(zone, NR_ISOLATED_FILE)),
2981 K(zone->present_pages), 3013 K(zone->present_pages),
3014 K(zone->managed_pages),
2982 K(zone_page_state(zone, NR_MLOCK)), 3015 K(zone_page_state(zone, NR_MLOCK)),
2983 K(zone_page_state(zone, NR_FILE_DIRTY)), 3016 K(zone_page_state(zone, NR_FILE_DIRTY)),
2984 K(zone_page_state(zone, NR_WRITEBACK)), 3017 K(zone_page_state(zone, NR_WRITEBACK)),
@@ -3004,6 +3037,7 @@ void show_free_areas(unsigned int filter)
3004 3037
3005 for_each_populated_zone(zone) { 3038 for_each_populated_zone(zone) {
3006 unsigned long nr[MAX_ORDER], flags, order, total = 0; 3039 unsigned long nr[MAX_ORDER], flags, order, total = 0;
3040 unsigned char types[MAX_ORDER];
3007 3041
3008 if (skip_free_areas_node(filter, zone_to_nid(zone))) 3042 if (skip_free_areas_node(filter, zone_to_nid(zone)))
3009 continue; 3043 continue;
@@ -3012,12 +3046,24 @@ void show_free_areas(unsigned int filter)
3012 3046
3013 spin_lock_irqsave(&zone->lock, flags); 3047 spin_lock_irqsave(&zone->lock, flags);
3014 for (order = 0; order < MAX_ORDER; order++) { 3048 for (order = 0; order < MAX_ORDER; order++) {
3015 nr[order] = zone->free_area[order].nr_free; 3049 struct free_area *area = &zone->free_area[order];
3050 int type;
3051
3052 nr[order] = area->nr_free;
3016 total += nr[order] << order; 3053 total += nr[order] << order;
3054
3055 types[order] = 0;
3056 for (type = 0; type < MIGRATE_TYPES; type++) {
3057 if (!list_empty(&area->free_list[type]))
3058 types[order] |= 1 << type;
3059 }
3017 } 3060 }
3018 spin_unlock_irqrestore(&zone->lock, flags); 3061 spin_unlock_irqrestore(&zone->lock, flags);
3019 for (order = 0; order < MAX_ORDER; order++) 3062 for (order = 0; order < MAX_ORDER; order++) {
3020 printk("%lu*%lukB ", nr[order], K(1UL) << order); 3063 printk("%lu*%lukB ", nr[order], K(1UL) << order);
3064 if (nr[order])
3065 show_migration_types(types[order]);
3066 }
3021 printk("= %lukB\n", K(total)); 3067 printk("= %lukB\n", K(total));
3022 } 3068 }
3023 3069
@@ -3194,7 +3240,7 @@ static int find_next_best_node(int node, nodemask_t *used_node_mask)
3194 return node; 3240 return node;
3195 } 3241 }
3196 3242
3197 for_each_node_state(n, N_HIGH_MEMORY) { 3243 for_each_node_state(n, N_MEMORY) {
3198 3244
3199 /* Don't want a node to appear more than once */ 3245 /* Don't want a node to appear more than once */
3200 if (node_isset(n, *used_node_mask)) 3246 if (node_isset(n, *used_node_mask))
@@ -3336,7 +3382,7 @@ static int default_zonelist_order(void)
3336 * local memory, NODE_ORDER may be suitable. 3382 * local memory, NODE_ORDER may be suitable.
3337 */ 3383 */
3338 average_size = total_size / 3384 average_size = total_size /
3339 (nodes_weight(node_states[N_HIGH_MEMORY]) + 1); 3385 (nodes_weight(node_states[N_MEMORY]) + 1);
3340 for_each_online_node(nid) { 3386 for_each_online_node(nid) {
3341 low_kmem_size = 0; 3387 low_kmem_size = 0;
3342 total_size = 0; 3388 total_size = 0;
@@ -3826,6 +3872,7 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
3826 mminit_verify_page_links(page, zone, nid, pfn); 3872 mminit_verify_page_links(page, zone, nid, pfn);
3827 init_page_count(page); 3873 init_page_count(page);
3828 reset_page_mapcount(page); 3874 reset_page_mapcount(page);
3875 reset_page_last_nid(page);
3829 SetPageReserved(page); 3876 SetPageReserved(page);
3830 /* 3877 /*
3831 * Mark the block movable so that blocks are reserved for 3878 * Mark the block movable so that blocks are reserved for
@@ -4432,6 +4479,26 @@ void __init set_pageblock_order(void)
4432 4479
4433#endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */ 4480#endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */
4434 4481
4482static unsigned long __paginginit calc_memmap_size(unsigned long spanned_pages,
4483 unsigned long present_pages)
4484{
4485 unsigned long pages = spanned_pages;
4486
4487 /*
4488 * Provide a more accurate estimation if there are holes within
4489 * the zone and SPARSEMEM is in use. If there are holes within the
4490 * zone, each populated memory region may cost us one or two extra
4491 * memmap pages due to alignment because memmap pages for each
4492 * populated regions may not naturally algined on page boundary.
4493 * So the (present_pages >> 4) heuristic is a tradeoff for that.
4494 */
4495 if (spanned_pages > present_pages + (present_pages >> 4) &&
4496 IS_ENABLED(CONFIG_SPARSEMEM))
4497 pages = present_pages;
4498
4499 return PAGE_ALIGN(pages * sizeof(struct page)) >> PAGE_SHIFT;
4500}
4501
4435/* 4502/*
4436 * Set up the zone data structures: 4503 * Set up the zone data structures:
4437 * - mark all pages reserved 4504 * - mark all pages reserved
@@ -4449,54 +4516,67 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
4449 int ret; 4516 int ret;
4450 4517
4451 pgdat_resize_init(pgdat); 4518 pgdat_resize_init(pgdat);
4519#ifdef CONFIG_NUMA_BALANCING
4520 spin_lock_init(&pgdat->numabalancing_migrate_lock);
4521 pgdat->numabalancing_migrate_nr_pages = 0;
4522 pgdat->numabalancing_migrate_next_window = jiffies;
4523#endif
4452 init_waitqueue_head(&pgdat->kswapd_wait); 4524 init_waitqueue_head(&pgdat->kswapd_wait);
4453 init_waitqueue_head(&pgdat->pfmemalloc_wait); 4525 init_waitqueue_head(&pgdat->pfmemalloc_wait);
4454 pgdat_page_cgroup_init(pgdat); 4526 pgdat_page_cgroup_init(pgdat);
4455 4527
4456 for (j = 0; j < MAX_NR_ZONES; j++) { 4528 for (j = 0; j < MAX_NR_ZONES; j++) {
4457 struct zone *zone = pgdat->node_zones + j; 4529 struct zone *zone = pgdat->node_zones + j;
4458 unsigned long size, realsize, memmap_pages; 4530 unsigned long size, realsize, freesize, memmap_pages;
4459 4531
4460 size = zone_spanned_pages_in_node(nid, j, zones_size); 4532 size = zone_spanned_pages_in_node(nid, j, zones_size);
4461 realsize = size - zone_absent_pages_in_node(nid, j, 4533 realsize = freesize = size - zone_absent_pages_in_node(nid, j,
4462 zholes_size); 4534 zholes_size);
4463 4535
4464 /* 4536 /*
4465 * Adjust realsize so that it accounts for how much memory 4537 * Adjust freesize so that it accounts for how much memory
4466 * is used by this zone for memmap. This affects the watermark 4538 * is used by this zone for memmap. This affects the watermark
4467 * and per-cpu initialisations 4539 * and per-cpu initialisations
4468 */ 4540 */
4469 memmap_pages = 4541 memmap_pages = calc_memmap_size(size, realsize);
4470 PAGE_ALIGN(size * sizeof(struct page)) >> PAGE_SHIFT; 4542 if (freesize >= memmap_pages) {
4471 if (realsize >= memmap_pages) { 4543 freesize -= memmap_pages;
4472 realsize -= memmap_pages;
4473 if (memmap_pages) 4544 if (memmap_pages)
4474 printk(KERN_DEBUG 4545 printk(KERN_DEBUG
4475 " %s zone: %lu pages used for memmap\n", 4546 " %s zone: %lu pages used for memmap\n",
4476 zone_names[j], memmap_pages); 4547 zone_names[j], memmap_pages);
4477 } else 4548 } else
4478 printk(KERN_WARNING 4549 printk(KERN_WARNING
4479 " %s zone: %lu pages exceeds realsize %lu\n", 4550 " %s zone: %lu pages exceeds freesize %lu\n",
4480 zone_names[j], memmap_pages, realsize); 4551 zone_names[j], memmap_pages, freesize);
4481 4552
4482 /* Account for reserved pages */ 4553 /* Account for reserved pages */
4483 if (j == 0 && realsize > dma_reserve) { 4554 if (j == 0 && freesize > dma_reserve) {
4484 realsize -= dma_reserve; 4555 freesize -= dma_reserve;
4485 printk(KERN_DEBUG " %s zone: %lu pages reserved\n", 4556 printk(KERN_DEBUG " %s zone: %lu pages reserved\n",
4486 zone_names[0], dma_reserve); 4557 zone_names[0], dma_reserve);
4487 } 4558 }
4488 4559
4489 if (!is_highmem_idx(j)) 4560 if (!is_highmem_idx(j))
4490 nr_kernel_pages += realsize; 4561 nr_kernel_pages += freesize;
4491 nr_all_pages += realsize; 4562 /* Charge for highmem memmap if there are enough kernel pages */
4563 else if (nr_kernel_pages > memmap_pages * 2)
4564 nr_kernel_pages -= memmap_pages;
4565 nr_all_pages += freesize;
4492 4566
4493 zone->spanned_pages = size; 4567 zone->spanned_pages = size;
4494 zone->present_pages = realsize; 4568 zone->present_pages = freesize;
4569 /*
4570 * Set an approximate value for lowmem here, it will be adjusted
4571 * when the bootmem allocator frees pages into the buddy system.
4572 * And all highmem pages will be managed by the buddy system.
4573 */
4574 zone->managed_pages = is_highmem_idx(j) ? realsize : freesize;
4495#ifdef CONFIG_NUMA 4575#ifdef CONFIG_NUMA
4496 zone->node = nid; 4576 zone->node = nid;
4497 zone->min_unmapped_pages = (realsize*sysctl_min_unmapped_ratio) 4577 zone->min_unmapped_pages = (freesize*sysctl_min_unmapped_ratio)
4498 / 100; 4578 / 100;
4499 zone->min_slab_pages = (realsize * sysctl_min_slab_ratio) / 100; 4579 zone->min_slab_pages = (freesize * sysctl_min_slab_ratio) / 100;
4500#endif 4580#endif
4501 zone->name = zone_names[j]; 4581 zone->name = zone_names[j];
4502 spin_lock_init(&zone->lock); 4582 spin_lock_init(&zone->lock);
@@ -4687,7 +4767,7 @@ unsigned long __init find_min_pfn_with_active_regions(void)
4687/* 4767/*
4688 * early_calculate_totalpages() 4768 * early_calculate_totalpages()
4689 * Sum pages in active regions for movable zone. 4769 * Sum pages in active regions for movable zone.
4690 * Populate N_HIGH_MEMORY for calculating usable_nodes. 4770 * Populate N_MEMORY for calculating usable_nodes.
4691 */ 4771 */
4692static unsigned long __init early_calculate_totalpages(void) 4772static unsigned long __init early_calculate_totalpages(void)
4693{ 4773{
@@ -4700,7 +4780,7 @@ static unsigned long __init early_calculate_totalpages(void)
4700 4780
4701 totalpages += pages; 4781 totalpages += pages;
4702 if (pages) 4782 if (pages)
4703 node_set_state(nid, N_HIGH_MEMORY); 4783 node_set_state(nid, N_MEMORY);
4704 } 4784 }
4705 return totalpages; 4785 return totalpages;
4706} 4786}
@@ -4717,9 +4797,9 @@ static void __init find_zone_movable_pfns_for_nodes(void)
4717 unsigned long usable_startpfn; 4797 unsigned long usable_startpfn;
4718 unsigned long kernelcore_node, kernelcore_remaining; 4798 unsigned long kernelcore_node, kernelcore_remaining;
4719 /* save the state before borrow the nodemask */ 4799 /* save the state before borrow the nodemask */
4720 nodemask_t saved_node_state = node_states[N_HIGH_MEMORY]; 4800 nodemask_t saved_node_state = node_states[N_MEMORY];
4721 unsigned long totalpages = early_calculate_totalpages(); 4801 unsigned long totalpages = early_calculate_totalpages();
4722 int usable_nodes = nodes_weight(node_states[N_HIGH_MEMORY]); 4802 int usable_nodes = nodes_weight(node_states[N_MEMORY]);
4723 4803
4724 /* 4804 /*
4725 * If movablecore was specified, calculate what size of 4805 * If movablecore was specified, calculate what size of
@@ -4754,7 +4834,7 @@ static void __init find_zone_movable_pfns_for_nodes(void)
4754restart: 4834restart:
4755 /* Spread kernelcore memory as evenly as possible throughout nodes */ 4835 /* Spread kernelcore memory as evenly as possible throughout nodes */
4756 kernelcore_node = required_kernelcore / usable_nodes; 4836 kernelcore_node = required_kernelcore / usable_nodes;
4757 for_each_node_state(nid, N_HIGH_MEMORY) { 4837 for_each_node_state(nid, N_MEMORY) {
4758 unsigned long start_pfn, end_pfn; 4838 unsigned long start_pfn, end_pfn;
4759 4839
4760 /* 4840 /*
@@ -4846,23 +4926,27 @@ restart:
4846 4926
4847out: 4927out:
4848 /* restore the node_state */ 4928 /* restore the node_state */
4849 node_states[N_HIGH_MEMORY] = saved_node_state; 4929 node_states[N_MEMORY] = saved_node_state;
4850} 4930}
4851 4931
4852/* Any regular memory on that node ? */ 4932/* Any regular or high memory on that node ? */
4853static void __init check_for_regular_memory(pg_data_t *pgdat) 4933static void check_for_memory(pg_data_t *pgdat, int nid)
4854{ 4934{
4855#ifdef CONFIG_HIGHMEM
4856 enum zone_type zone_type; 4935 enum zone_type zone_type;
4857 4936
4858 for (zone_type = 0; zone_type <= ZONE_NORMAL; zone_type++) { 4937 if (N_MEMORY == N_NORMAL_MEMORY)
4938 return;
4939
4940 for (zone_type = 0; zone_type <= ZONE_MOVABLE - 1; zone_type++) {
4859 struct zone *zone = &pgdat->node_zones[zone_type]; 4941 struct zone *zone = &pgdat->node_zones[zone_type];
4860 if (zone->present_pages) { 4942 if (zone->present_pages) {
4861 node_set_state(zone_to_nid(zone), N_NORMAL_MEMORY); 4943 node_set_state(nid, N_HIGH_MEMORY);
4944 if (N_NORMAL_MEMORY != N_HIGH_MEMORY &&
4945 zone_type <= ZONE_NORMAL)
4946 node_set_state(nid, N_NORMAL_MEMORY);
4862 break; 4947 break;
4863 } 4948 }
4864 } 4949 }
4865#endif
4866} 4950}
4867 4951
4868/** 4952/**
@@ -4945,8 +5029,8 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)
4945 5029
4946 /* Any memory on that node */ 5030 /* Any memory on that node */
4947 if (pgdat->node_present_pages) 5031 if (pgdat->node_present_pages)
4948 node_set_state(nid, N_HIGH_MEMORY); 5032 node_set_state(nid, N_MEMORY);
4949 check_for_regular_memory(pgdat); 5033 check_for_memory(pgdat, nid);
4950 } 5034 }
4951} 5035}
4952 5036
@@ -5174,10 +5258,6 @@ static void __setup_per_zone_wmarks(void)
5174 zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + (tmp >> 2); 5258 zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + (tmp >> 2);
5175 zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1); 5259 zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1);
5176 5260
5177 zone->watermark[WMARK_MIN] += cma_wmark_pages(zone);
5178 zone->watermark[WMARK_LOW] += cma_wmark_pages(zone);
5179 zone->watermark[WMARK_HIGH] += cma_wmark_pages(zone);
5180
5181 setup_zone_migrate_reserve(zone); 5261 setup_zone_migrate_reserve(zone);
5182 spin_unlock_irqrestore(&zone->lock, flags); 5262 spin_unlock_irqrestore(&zone->lock, flags);
5183 } 5263 }
@@ -5505,7 +5585,7 @@ static inline int pfn_to_bitidx(struct zone *zone, unsigned long pfn)
5505 pfn &= (PAGES_PER_SECTION-1); 5585 pfn &= (PAGES_PER_SECTION-1);
5506 return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS; 5586 return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
5507#else 5587#else
5508 pfn = pfn - zone->zone_start_pfn; 5588 pfn = pfn - round_down(zone->zone_start_pfn, pageblock_nr_pages);
5509 return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS; 5589 return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
5510#endif /* CONFIG_SPARSEMEM */ 5590#endif /* CONFIG_SPARSEMEM */
5511} 5591}
@@ -5575,7 +5655,8 @@ void set_pageblock_flags_group(struct page *page, unsigned long flags,
5575 * MIGRATE_MOVABLE block might include unmovable pages. It means you can't 5655 * MIGRATE_MOVABLE block might include unmovable pages. It means you can't
5576 * expect this function should be exact. 5656 * expect this function should be exact.
5577 */ 5657 */
5578bool has_unmovable_pages(struct zone *zone, struct page *page, int count) 5658bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
5659 bool skip_hwpoisoned_pages)
5579{ 5660{
5580 unsigned long pfn, iter, found; 5661 unsigned long pfn, iter, found;
5581 int mt; 5662 int mt;
@@ -5610,6 +5691,13 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count)
5610 continue; 5691 continue;
5611 } 5692 }
5612 5693
5694 /*
5695 * The HWPoisoned page may be not in buddy system, and
5696 * page_count() is not 0.
5697 */
5698 if (skip_hwpoisoned_pages && PageHWPoison(page))
5699 continue;
5700
5613 if (!PageLRU(page)) 5701 if (!PageLRU(page))
5614 found++; 5702 found++;
5615 /* 5703 /*
@@ -5652,7 +5740,7 @@ bool is_pageblock_removable_nolock(struct page *page)
5652 zone->zone_start_pfn + zone->spanned_pages <= pfn) 5740 zone->zone_start_pfn + zone->spanned_pages <= pfn)
5653 return false; 5741 return false;
5654 5742
5655 return !has_unmovable_pages(zone, page, 0); 5743 return !has_unmovable_pages(zone, page, 0, true);
5656} 5744}
5657 5745
5658#ifdef CONFIG_CMA 5746#ifdef CONFIG_CMA
@@ -5679,7 +5767,7 @@ static int __alloc_contig_migrate_range(struct compact_control *cc,
5679 unsigned int tries = 0; 5767 unsigned int tries = 0;
5680 int ret = 0; 5768 int ret = 0;
5681 5769
5682 migrate_prep_local(); 5770 migrate_prep();
5683 5771
5684 while (pfn < end || !list_empty(&cc->migratepages)) { 5772 while (pfn < end || !list_empty(&cc->migratepages)) {
5685 if (fatal_signal_pending(current)) { 5773 if (fatal_signal_pending(current)) {
@@ -5707,61 +5795,14 @@ static int __alloc_contig_migrate_range(struct compact_control *cc,
5707 5795
5708 ret = migrate_pages(&cc->migratepages, 5796 ret = migrate_pages(&cc->migratepages,
5709 alloc_migrate_target, 5797 alloc_migrate_target,
5710 0, false, MIGRATE_SYNC); 5798 0, false, MIGRATE_SYNC,
5799 MR_CMA);
5711 } 5800 }
5712 5801
5713 putback_lru_pages(&cc->migratepages); 5802 putback_movable_pages(&cc->migratepages);
5714 return ret > 0 ? 0 : ret; 5803 return ret > 0 ? 0 : ret;
5715} 5804}
5716 5805
5717/*
5718 * Update zone's cma pages counter used for watermark level calculation.
5719 */
5720static inline void __update_cma_watermarks(struct zone *zone, int count)
5721{
5722 unsigned long flags;
5723 spin_lock_irqsave(&zone->lock, flags);
5724 zone->min_cma_pages += count;
5725 spin_unlock_irqrestore(&zone->lock, flags);
5726 setup_per_zone_wmarks();
5727}
5728
5729/*
5730 * Trigger memory pressure bump to reclaim some pages in order to be able to
5731 * allocate 'count' pages in single page units. Does similar work as
5732 *__alloc_pages_slowpath() function.
5733 */
5734static int __reclaim_pages(struct zone *zone, gfp_t gfp_mask, int count)
5735{
5736 enum zone_type high_zoneidx = gfp_zone(gfp_mask);
5737 struct zonelist *zonelist = node_zonelist(0, gfp_mask);
5738 int did_some_progress = 0;
5739 int order = 1;
5740
5741 /*
5742 * Increase level of watermarks to force kswapd do his job
5743 * to stabilise at new watermark level.
5744 */
5745 __update_cma_watermarks(zone, count);
5746
5747 /* Obey watermarks as if the page was being allocated */
5748 while (!zone_watermark_ok(zone, 0, low_wmark_pages(zone), 0, 0)) {
5749 wake_all_kswapd(order, zonelist, high_zoneidx, zone_idx(zone));
5750
5751 did_some_progress = __perform_reclaim(gfp_mask, order, zonelist,
5752 NULL);
5753 if (!did_some_progress) {
5754 /* Exhausted what can be done so it's blamo time */
5755 out_of_memory(zonelist, gfp_mask, order, NULL, false);
5756 }
5757 }
5758
5759 /* Restore original watermark levels. */
5760 __update_cma_watermarks(zone, -count);
5761
5762 return count;
5763}
5764
5765/** 5806/**
5766 * alloc_contig_range() -- tries to allocate given range of pages 5807 * alloc_contig_range() -- tries to allocate given range of pages
5767 * @start: start PFN to allocate 5808 * @start: start PFN to allocate
@@ -5785,7 +5826,6 @@ static int __reclaim_pages(struct zone *zone, gfp_t gfp_mask, int count)
5785int alloc_contig_range(unsigned long start, unsigned long end, 5826int alloc_contig_range(unsigned long start, unsigned long end,
5786 unsigned migratetype) 5827 unsigned migratetype)
5787{ 5828{
5788 struct zone *zone = page_zone(pfn_to_page(start));
5789 unsigned long outer_start, outer_end; 5829 unsigned long outer_start, outer_end;
5790 int ret = 0, order; 5830 int ret = 0, order;
5791 5831
@@ -5823,7 +5863,8 @@ int alloc_contig_range(unsigned long start, unsigned long end,
5823 */ 5863 */
5824 5864
5825 ret = start_isolate_page_range(pfn_max_align_down(start), 5865 ret = start_isolate_page_range(pfn_max_align_down(start),
5826 pfn_max_align_up(end), migratetype); 5866 pfn_max_align_up(end), migratetype,
5867 false);
5827 if (ret) 5868 if (ret)
5828 return ret; 5869 return ret;
5829 5870
@@ -5862,18 +5903,13 @@ int alloc_contig_range(unsigned long start, unsigned long end,
5862 } 5903 }
5863 5904
5864 /* Make sure the range is really isolated. */ 5905 /* Make sure the range is really isolated. */
5865 if (test_pages_isolated(outer_start, end)) { 5906 if (test_pages_isolated(outer_start, end, false)) {
5866 pr_warn("alloc_contig_range test_pages_isolated(%lx, %lx) failed\n", 5907 pr_warn("alloc_contig_range test_pages_isolated(%lx, %lx) failed\n",
5867 outer_start, end); 5908 outer_start, end);
5868 ret = -EBUSY; 5909 ret = -EBUSY;
5869 goto done; 5910 goto done;
5870 } 5911 }
5871 5912
5872 /*
5873 * Reclaim enough pages to make sure that contiguous allocation
5874 * will not starve the system.
5875 */
5876 __reclaim_pages(zone, GFP_HIGHUSER_MOVABLE, end-start);
5877 5913
5878 /* Grab isolated pages from freelists. */ 5914 /* Grab isolated pages from freelists. */
5879 outer_end = isolate_freepages_range(&cc, outer_start, end); 5915 outer_end = isolate_freepages_range(&cc, outer_start, end);
@@ -5896,8 +5932,15 @@ done:
5896 5932
5897void free_contig_range(unsigned long pfn, unsigned nr_pages) 5933void free_contig_range(unsigned long pfn, unsigned nr_pages)
5898{ 5934{
5899 for (; nr_pages--; ++pfn) 5935 unsigned int count = 0;
5900 __free_page(pfn_to_page(pfn)); 5936
5937 for (; nr_pages--; pfn++) {
5938 struct page *page = pfn_to_page(pfn);
5939
5940 count += page_count(page) != 1;
5941 __free_page(page);
5942 }
5943 WARN(count != 0, "%d pages are still in use!\n", count);
5901} 5944}
5902#endif 5945#endif
5903 5946
@@ -5931,7 +5974,6 @@ void __meminit zone_pcp_update(struct zone *zone)
5931} 5974}
5932#endif 5975#endif
5933 5976
5934#ifdef CONFIG_MEMORY_HOTREMOVE
5935void zone_pcp_reset(struct zone *zone) 5977void zone_pcp_reset(struct zone *zone)
5936{ 5978{
5937 unsigned long flags; 5979 unsigned long flags;
@@ -5951,6 +5993,7 @@ void zone_pcp_reset(struct zone *zone)
5951 local_irq_restore(flags); 5993 local_irq_restore(flags);
5952} 5994}
5953 5995
5996#ifdef CONFIG_MEMORY_HOTREMOVE
5954/* 5997/*
5955 * All pages in the range must be isolated before calling this. 5998 * All pages in the range must be isolated before calling this.
5956 */ 5999 */
@@ -5977,6 +6020,16 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
5977 continue; 6020 continue;
5978 } 6021 }
5979 page = pfn_to_page(pfn); 6022 page = pfn_to_page(pfn);
6023 /*
6024 * The HWPoisoned page may be not in buddy system, and
6025 * page_count() is not 0.
6026 */
6027 if (unlikely(!PageBuddy(page) && PageHWPoison(page))) {
6028 pfn++;
6029 SetPageReserved(page);
6030 continue;
6031 }
6032
5980 BUG_ON(page_count(page)); 6033 BUG_ON(page_count(page));
5981 BUG_ON(!PageBuddy(page)); 6034 BUG_ON(!PageBuddy(page));
5982 order = page_order(page); 6035 order = page_order(page);
@@ -5987,8 +6040,6 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
5987 list_del(&page->lru); 6040 list_del(&page->lru);
5988 rmv_page_order(page); 6041 rmv_page_order(page);
5989 zone->free_area[order].nr_free--; 6042 zone->free_area[order].nr_free--;
5990 __mod_zone_page_state(zone, NR_FREE_PAGES,
5991 - (1UL << order));
5992 for (i = 0; i < (1 << order); i++) 6043 for (i = 0; i < (1 << order); i++)
5993 SetPageReserved((page+i)); 6044 SetPageReserved((page+i));
5994 pfn += (1 << order); 6045 pfn += (1 << order);
diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c
index 5ddad0c6daa6..6d757e3a872a 100644
--- a/mm/page_cgroup.c
+++ b/mm/page_cgroup.c
@@ -251,6 +251,9 @@ static int __meminit page_cgroup_callback(struct notifier_block *self,
251 mn->nr_pages, mn->status_change_nid); 251 mn->nr_pages, mn->status_change_nid);
252 break; 252 break;
253 case MEM_CANCEL_ONLINE: 253 case MEM_CANCEL_ONLINE:
254 offline_page_cgroup(mn->start_pfn,
255 mn->nr_pages, mn->status_change_nid);
256 break;
254 case MEM_GOING_OFFLINE: 257 case MEM_GOING_OFFLINE:
255 break; 258 break;
256 case MEM_ONLINE: 259 case MEM_ONLINE:
@@ -271,7 +274,7 @@ void __init page_cgroup_init(void)
271 if (mem_cgroup_disabled()) 274 if (mem_cgroup_disabled())
272 return; 275 return;
273 276
274 for_each_node_state(nid, N_HIGH_MEMORY) { 277 for_each_node_state(nid, N_MEMORY) {
275 unsigned long start_pfn, end_pfn; 278 unsigned long start_pfn, end_pfn;
276 279
277 start_pfn = node_start_pfn(nid); 280 start_pfn = node_start_pfn(nid);
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index f2f5b4818e94..383bdbb98b04 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -8,29 +8,7 @@
8#include <linux/memory.h> 8#include <linux/memory.h>
9#include "internal.h" 9#include "internal.h"
10 10
11/* called while holding zone->lock */ 11int set_migratetype_isolate(struct page *page, bool skip_hwpoisoned_pages)
12static void set_pageblock_isolate(struct page *page)
13{
14 if (get_pageblock_migratetype(page) == MIGRATE_ISOLATE)
15 return;
16
17 set_pageblock_migratetype(page, MIGRATE_ISOLATE);
18 page_zone(page)->nr_pageblock_isolate++;
19}
20
21/* called while holding zone->lock */
22static void restore_pageblock_isolate(struct page *page, int migratetype)
23{
24 struct zone *zone = page_zone(page);
25 if (WARN_ON(get_pageblock_migratetype(page) != MIGRATE_ISOLATE))
26 return;
27
28 BUG_ON(zone->nr_pageblock_isolate <= 0);
29 set_pageblock_migratetype(page, migratetype);
30 zone->nr_pageblock_isolate--;
31}
32
33int set_migratetype_isolate(struct page *page)
34{ 12{
35 struct zone *zone; 13 struct zone *zone;
36 unsigned long flags, pfn; 14 unsigned long flags, pfn;
@@ -66,7 +44,8 @@ int set_migratetype_isolate(struct page *page)
66 * FIXME: Now, memory hotplug doesn't call shrink_slab() by itself. 44 * FIXME: Now, memory hotplug doesn't call shrink_slab() by itself.
67 * We just check MOVABLE pages. 45 * We just check MOVABLE pages.
68 */ 46 */
69 if (!has_unmovable_pages(zone, page, arg.pages_found)) 47 if (!has_unmovable_pages(zone, page, arg.pages_found,
48 skip_hwpoisoned_pages))
70 ret = 0; 49 ret = 0;
71 50
72 /* 51 /*
@@ -79,7 +58,7 @@ out:
79 unsigned long nr_pages; 58 unsigned long nr_pages;
80 int migratetype = get_pageblock_migratetype(page); 59 int migratetype = get_pageblock_migratetype(page);
81 60
82 set_pageblock_isolate(page); 61 set_pageblock_migratetype(page, MIGRATE_ISOLATE);
83 nr_pages = move_freepages_block(zone, page, MIGRATE_ISOLATE); 62 nr_pages = move_freepages_block(zone, page, MIGRATE_ISOLATE);
84 63
85 __mod_zone_freepage_state(zone, -nr_pages, migratetype); 64 __mod_zone_freepage_state(zone, -nr_pages, migratetype);
@@ -102,7 +81,7 @@ void unset_migratetype_isolate(struct page *page, unsigned migratetype)
102 goto out; 81 goto out;
103 nr_pages = move_freepages_block(zone, page, migratetype); 82 nr_pages = move_freepages_block(zone, page, migratetype);
104 __mod_zone_freepage_state(zone, nr_pages, migratetype); 83 __mod_zone_freepage_state(zone, nr_pages, migratetype);
105 restore_pageblock_isolate(page, migratetype); 84 set_pageblock_migratetype(page, migratetype);
106out: 85out:
107 spin_unlock_irqrestore(&zone->lock, flags); 86 spin_unlock_irqrestore(&zone->lock, flags);
108} 87}
@@ -134,7 +113,7 @@ __first_valid_page(unsigned long pfn, unsigned long nr_pages)
134 * Returns 0 on success and -EBUSY if any part of range cannot be isolated. 113 * Returns 0 on success and -EBUSY if any part of range cannot be isolated.
135 */ 114 */
136int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, 115int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
137 unsigned migratetype) 116 unsigned migratetype, bool skip_hwpoisoned_pages)
138{ 117{
139 unsigned long pfn; 118 unsigned long pfn;
140 unsigned long undo_pfn; 119 unsigned long undo_pfn;
@@ -147,7 +126,8 @@ int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
147 pfn < end_pfn; 126 pfn < end_pfn;
148 pfn += pageblock_nr_pages) { 127 pfn += pageblock_nr_pages) {
149 page = __first_valid_page(pfn, pageblock_nr_pages); 128 page = __first_valid_page(pfn, pageblock_nr_pages);
150 if (page && set_migratetype_isolate(page)) { 129 if (page &&
130 set_migratetype_isolate(page, skip_hwpoisoned_pages)) {
151 undo_pfn = pfn; 131 undo_pfn = pfn;
152 goto undo; 132 goto undo;
153 } 133 }
@@ -190,7 +170,8 @@ int undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
190 * Returns 1 if all pages in the range are isolated. 170 * Returns 1 if all pages in the range are isolated.
191 */ 171 */
192static int 172static int
193__test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn) 173__test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn,
174 bool skip_hwpoisoned_pages)
194{ 175{
195 struct page *page; 176 struct page *page;
196 177
@@ -220,6 +201,14 @@ __test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn)
220 else if (page_count(page) == 0 && 201 else if (page_count(page) == 0 &&
221 get_freepage_migratetype(page) == MIGRATE_ISOLATE) 202 get_freepage_migratetype(page) == MIGRATE_ISOLATE)
222 pfn += 1; 203 pfn += 1;
204 else if (skip_hwpoisoned_pages && PageHWPoison(page)) {
205 /*
206 * The HWPoisoned page may be not in buddy
207 * system, and page_count() is not 0.
208 */
209 pfn++;
210 continue;
211 }
223 else 212 else
224 break; 213 break;
225 } 214 }
@@ -228,7 +217,8 @@ __test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn)
228 return 1; 217 return 1;
229} 218}
230 219
231int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn) 220int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn,
221 bool skip_hwpoisoned_pages)
232{ 222{
233 unsigned long pfn, flags; 223 unsigned long pfn, flags;
234 struct page *page; 224 struct page *page;
@@ -251,7 +241,8 @@ int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn)
251 /* Check all pages are free or Marked as ISOLATED */ 241 /* Check all pages are free or Marked as ISOLATED */
252 zone = page_zone(page); 242 zone = page_zone(page);
253 spin_lock_irqsave(&zone->lock, flags); 243 spin_lock_irqsave(&zone->lock, flags);
254 ret = __test_page_isolated_in_pageblock(start_pfn, end_pfn); 244 ret = __test_page_isolated_in_pageblock(start_pfn, end_pfn,
245 skip_hwpoisoned_pages);
255 spin_unlock_irqrestore(&zone->lock, flags); 246 spin_unlock_irqrestore(&zone->lock, flags);
256 return ret ? 0 : -EBUSY; 247 return ret ? 0 : -EBUSY;
257} 248}
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index 6c118d012bb5..35aa294656cd 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -58,7 +58,7 @@ again:
58 if (!walk->pte_entry) 58 if (!walk->pte_entry)
59 continue; 59 continue;
60 60
61 split_huge_page_pmd(walk->mm, pmd); 61 split_huge_page_pmd_mm(walk->mm, addr, pmd);
62 if (pmd_none_or_trans_huge_or_clear_bad(pmd)) 62 if (pmd_none_or_trans_huge_or_clear_bad(pmd))
63 goto again; 63 goto again;
64 err = walk_pte_range(pmd, addr, next, walk); 64 err = walk_pte_range(pmd, addr, next, walk);
diff --git a/mm/percpu.c b/mm/percpu.c
index ddc5efb9c5bb..8c8e08f3a692 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -631,7 +631,7 @@ static void pcpu_free_chunk(struct pcpu_chunk *chunk)
631 if (!chunk) 631 if (!chunk)
632 return; 632 return;
633 pcpu_mem_free(chunk->map, chunk->map_alloc * sizeof(chunk->map[0])); 633 pcpu_mem_free(chunk->map, chunk->map_alloc * sizeof(chunk->map[0]));
634 kfree(chunk); 634 pcpu_mem_free(chunk, pcpu_chunk_struct_size);
635} 635}
636 636
637/* 637/*
@@ -1380,6 +1380,9 @@ enum pcpu_fc pcpu_chosen_fc __initdata = PCPU_FC_AUTO;
1380 1380
1381static int __init percpu_alloc_setup(char *str) 1381static int __init percpu_alloc_setup(char *str)
1382{ 1382{
1383 if (!str)
1384 return -EINVAL;
1385
1383 if (0) 1386 if (0)
1384 /* nada */; 1387 /* nada */;
1385#ifdef CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK 1388#ifdef CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
index e642627da6b7..0c8323fe6c8f 100644
--- a/mm/pgtable-generic.c
+++ b/mm/pgtable-generic.c
@@ -12,8 +12,8 @@
12 12
13#ifndef __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS 13#ifndef __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
14/* 14/*
15 * Only sets the access flags (dirty, accessed, and 15 * Only sets the access flags (dirty, accessed), as well as write
16 * writable). Furthermore, we know it always gets set to a "more 16 * permission. Furthermore, we know it always gets set to a "more
17 * permissive" setting, which allows most architectures to optimize 17 * permissive" setting, which allows most architectures to optimize
18 * this. We return whether the PTE actually changed, which in turn 18 * this. We return whether the PTE actually changed, which in turn
19 * instructs the caller to do things like update__mmu_cache. This 19 * instructs the caller to do things like update__mmu_cache. This
@@ -27,7 +27,7 @@ int ptep_set_access_flags(struct vm_area_struct *vma,
27 int changed = !pte_same(*ptep, entry); 27 int changed = !pte_same(*ptep, entry);
28 if (changed) { 28 if (changed) {
29 set_pte_at(vma->vm_mm, address, ptep, entry); 29 set_pte_at(vma->vm_mm, address, ptep, entry);
30 flush_tlb_page(vma, address); 30 flush_tlb_fix_spurious_fault(vma, address);
31 } 31 }
32 return changed; 32 return changed;
33} 33}
@@ -88,7 +88,8 @@ pte_t ptep_clear_flush(struct vm_area_struct *vma, unsigned long address,
88{ 88{
89 pte_t pte; 89 pte_t pte;
90 pte = ptep_get_and_clear((vma)->vm_mm, address, ptep); 90 pte = ptep_get_and_clear((vma)->vm_mm, address, ptep);
91 flush_tlb_page(vma, address); 91 if (pte_accessible(pte))
92 flush_tlb_page(vma, address);
92 return pte; 93 return pte;
93} 94}
94#endif 95#endif
diff --git a/mm/rmap.c b/mm/rmap.c
index 2ee1ef0f317b..2c78f8cadc95 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -24,7 +24,7 @@
24 * mm->mmap_sem 24 * mm->mmap_sem
25 * page->flags PG_locked (lock_page) 25 * page->flags PG_locked (lock_page)
26 * mapping->i_mmap_mutex 26 * mapping->i_mmap_mutex
27 * anon_vma->mutex 27 * anon_vma->rwsem
28 * mm->page_table_lock or pte_lock 28 * mm->page_table_lock or pte_lock
29 * zone->lru_lock (in mark_page_accessed, isolate_lru_page) 29 * zone->lru_lock (in mark_page_accessed, isolate_lru_page)
30 * swap_lock (in swap_duplicate, swap_info_get) 30 * swap_lock (in swap_duplicate, swap_info_get)
@@ -37,7 +37,7 @@
37 * in arch-dependent flush_dcache_mmap_lock, 37 * in arch-dependent flush_dcache_mmap_lock,
38 * within bdi.wb->list_lock in __sync_single_inode) 38 * within bdi.wb->list_lock in __sync_single_inode)
39 * 39 *
40 * anon_vma->mutex,mapping->i_mutex (memory_failure, collect_procs_anon) 40 * anon_vma->rwsem,mapping->i_mutex (memory_failure, collect_procs_anon)
41 * ->tasklist_lock 41 * ->tasklist_lock
42 * pte map lock 42 * pte map lock
43 */ 43 */
@@ -87,24 +87,24 @@ static inline void anon_vma_free(struct anon_vma *anon_vma)
87 VM_BUG_ON(atomic_read(&anon_vma->refcount)); 87 VM_BUG_ON(atomic_read(&anon_vma->refcount));
88 88
89 /* 89 /*
90 * Synchronize against page_lock_anon_vma() such that 90 * Synchronize against page_lock_anon_vma_read() such that
91 * we can safely hold the lock without the anon_vma getting 91 * we can safely hold the lock without the anon_vma getting
92 * freed. 92 * freed.
93 * 93 *
94 * Relies on the full mb implied by the atomic_dec_and_test() from 94 * Relies on the full mb implied by the atomic_dec_and_test() from
95 * put_anon_vma() against the acquire barrier implied by 95 * put_anon_vma() against the acquire barrier implied by
96 * mutex_trylock() from page_lock_anon_vma(). This orders: 96 * down_read_trylock() from page_lock_anon_vma_read(). This orders:
97 * 97 *
98 * page_lock_anon_vma() VS put_anon_vma() 98 * page_lock_anon_vma_read() VS put_anon_vma()
99 * mutex_trylock() atomic_dec_and_test() 99 * down_read_trylock() atomic_dec_and_test()
100 * LOCK MB 100 * LOCK MB
101 * atomic_read() mutex_is_locked() 101 * atomic_read() rwsem_is_locked()
102 * 102 *
103 * LOCK should suffice since the actual taking of the lock must 103 * LOCK should suffice since the actual taking of the lock must
104 * happen _before_ what follows. 104 * happen _before_ what follows.
105 */ 105 */
106 if (mutex_is_locked(&anon_vma->root->mutex)) { 106 if (rwsem_is_locked(&anon_vma->root->rwsem)) {
107 anon_vma_lock(anon_vma); 107 anon_vma_lock_write(anon_vma);
108 anon_vma_unlock(anon_vma); 108 anon_vma_unlock(anon_vma);
109 } 109 }
110 110
@@ -146,7 +146,7 @@ static void anon_vma_chain_link(struct vm_area_struct *vma,
146 * allocate a new one. 146 * allocate a new one.
147 * 147 *
148 * Anon-vma allocations are very subtle, because we may have 148 * Anon-vma allocations are very subtle, because we may have
149 * optimistically looked up an anon_vma in page_lock_anon_vma() 149 * optimistically looked up an anon_vma in page_lock_anon_vma_read()
150 * and that may actually touch the spinlock even in the newly 150 * and that may actually touch the spinlock even in the newly
151 * allocated vma (it depends on RCU to make sure that the 151 * allocated vma (it depends on RCU to make sure that the
152 * anon_vma isn't actually destroyed). 152 * anon_vma isn't actually destroyed).
@@ -181,7 +181,7 @@ int anon_vma_prepare(struct vm_area_struct *vma)
181 allocated = anon_vma; 181 allocated = anon_vma;
182 } 182 }
183 183
184 anon_vma_lock(anon_vma); 184 anon_vma_lock_write(anon_vma);
185 /* page_table_lock to protect against threads */ 185 /* page_table_lock to protect against threads */
186 spin_lock(&mm->page_table_lock); 186 spin_lock(&mm->page_table_lock);
187 if (likely(!vma->anon_vma)) { 187 if (likely(!vma->anon_vma)) {
@@ -219,9 +219,9 @@ static inline struct anon_vma *lock_anon_vma_root(struct anon_vma *root, struct
219 struct anon_vma *new_root = anon_vma->root; 219 struct anon_vma *new_root = anon_vma->root;
220 if (new_root != root) { 220 if (new_root != root) {
221 if (WARN_ON_ONCE(root)) 221 if (WARN_ON_ONCE(root))
222 mutex_unlock(&root->mutex); 222 up_write(&root->rwsem);
223 root = new_root; 223 root = new_root;
224 mutex_lock(&root->mutex); 224 down_write(&root->rwsem);
225 } 225 }
226 return root; 226 return root;
227} 227}
@@ -229,7 +229,7 @@ static inline struct anon_vma *lock_anon_vma_root(struct anon_vma *root, struct
229static inline void unlock_anon_vma_root(struct anon_vma *root) 229static inline void unlock_anon_vma_root(struct anon_vma *root)
230{ 230{
231 if (root) 231 if (root)
232 mutex_unlock(&root->mutex); 232 up_write(&root->rwsem);
233} 233}
234 234
235/* 235/*
@@ -306,7 +306,7 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
306 get_anon_vma(anon_vma->root); 306 get_anon_vma(anon_vma->root);
307 /* Mark this anon_vma as the one where our new (COWed) pages go. */ 307 /* Mark this anon_vma as the one where our new (COWed) pages go. */
308 vma->anon_vma = anon_vma; 308 vma->anon_vma = anon_vma;
309 anon_vma_lock(anon_vma); 309 anon_vma_lock_write(anon_vma);
310 anon_vma_chain_link(vma, avc, anon_vma); 310 anon_vma_chain_link(vma, avc, anon_vma);
311 anon_vma_unlock(anon_vma); 311 anon_vma_unlock(anon_vma);
312 312
@@ -349,7 +349,7 @@ void unlink_anon_vmas(struct vm_area_struct *vma)
349 /* 349 /*
350 * Iterate the list once more, it now only contains empty and unlinked 350 * Iterate the list once more, it now only contains empty and unlinked
351 * anon_vmas, destroy them. Could not do before due to __put_anon_vma() 351 * anon_vmas, destroy them. Could not do before due to __put_anon_vma()
352 * needing to acquire the anon_vma->root->mutex. 352 * needing to write-acquire the anon_vma->root->rwsem.
353 */ 353 */
354 list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) { 354 list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) {
355 struct anon_vma *anon_vma = avc->anon_vma; 355 struct anon_vma *anon_vma = avc->anon_vma;
@@ -365,7 +365,7 @@ static void anon_vma_ctor(void *data)
365{ 365{
366 struct anon_vma *anon_vma = data; 366 struct anon_vma *anon_vma = data;
367 367
368 mutex_init(&anon_vma->mutex); 368 init_rwsem(&anon_vma->rwsem);
369 atomic_set(&anon_vma->refcount, 0); 369 atomic_set(&anon_vma->refcount, 0);
370 anon_vma->rb_root = RB_ROOT; 370 anon_vma->rb_root = RB_ROOT;
371} 371}
@@ -442,7 +442,7 @@ out:
442 * atomic op -- the trylock. If we fail the trylock, we fall back to getting a 442 * atomic op -- the trylock. If we fail the trylock, we fall back to getting a
443 * reference like with page_get_anon_vma() and then block on the mutex. 443 * reference like with page_get_anon_vma() and then block on the mutex.
444 */ 444 */
445struct anon_vma *page_lock_anon_vma(struct page *page) 445struct anon_vma *page_lock_anon_vma_read(struct page *page)
446{ 446{
447 struct anon_vma *anon_vma = NULL; 447 struct anon_vma *anon_vma = NULL;
448 struct anon_vma *root_anon_vma; 448 struct anon_vma *root_anon_vma;
@@ -457,14 +457,14 @@ struct anon_vma *page_lock_anon_vma(struct page *page)
457 457
458 anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON); 458 anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON);
459 root_anon_vma = ACCESS_ONCE(anon_vma->root); 459 root_anon_vma = ACCESS_ONCE(anon_vma->root);
460 if (mutex_trylock(&root_anon_vma->mutex)) { 460 if (down_read_trylock(&root_anon_vma->rwsem)) {
461 /* 461 /*
462 * If the page is still mapped, then this anon_vma is still 462 * If the page is still mapped, then this anon_vma is still
463 * its anon_vma, and holding the mutex ensures that it will 463 * its anon_vma, and holding the mutex ensures that it will
464 * not go away, see anon_vma_free(). 464 * not go away, see anon_vma_free().
465 */ 465 */
466 if (!page_mapped(page)) { 466 if (!page_mapped(page)) {
467 mutex_unlock(&root_anon_vma->mutex); 467 up_read(&root_anon_vma->rwsem);
468 anon_vma = NULL; 468 anon_vma = NULL;
469 } 469 }
470 goto out; 470 goto out;
@@ -484,15 +484,15 @@ struct anon_vma *page_lock_anon_vma(struct page *page)
484 484
485 /* we pinned the anon_vma, its safe to sleep */ 485 /* we pinned the anon_vma, its safe to sleep */
486 rcu_read_unlock(); 486 rcu_read_unlock();
487 anon_vma_lock(anon_vma); 487 anon_vma_lock_read(anon_vma);
488 488
489 if (atomic_dec_and_test(&anon_vma->refcount)) { 489 if (atomic_dec_and_test(&anon_vma->refcount)) {
490 /* 490 /*
491 * Oops, we held the last refcount, release the lock 491 * Oops, we held the last refcount, release the lock
492 * and bail -- can't simply use put_anon_vma() because 492 * and bail -- can't simply use put_anon_vma() because
493 * we'll deadlock on the anon_vma_lock() recursion. 493 * we'll deadlock on the anon_vma_lock_write() recursion.
494 */ 494 */
495 anon_vma_unlock(anon_vma); 495 anon_vma_unlock_read(anon_vma);
496 __put_anon_vma(anon_vma); 496 __put_anon_vma(anon_vma);
497 anon_vma = NULL; 497 anon_vma = NULL;
498 } 498 }
@@ -504,9 +504,9 @@ out:
504 return anon_vma; 504 return anon_vma;
505} 505}
506 506
507void page_unlock_anon_vma(struct anon_vma *anon_vma) 507void page_unlock_anon_vma_read(struct anon_vma *anon_vma)
508{ 508{
509 anon_vma_unlock(anon_vma); 509 anon_vma_unlock_read(anon_vma);
510} 510}
511 511
512/* 512/*
@@ -562,6 +562,27 @@ unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
562 return address; 562 return address;
563} 563}
564 564
565pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address)
566{
567 pgd_t *pgd;
568 pud_t *pud;
569 pmd_t *pmd = NULL;
570
571 pgd = pgd_offset(mm, address);
572 if (!pgd_present(*pgd))
573 goto out;
574
575 pud = pud_offset(pgd, address);
576 if (!pud_present(*pud))
577 goto out;
578
579 pmd = pmd_offset(pud, address);
580 if (!pmd_present(*pmd))
581 pmd = NULL;
582out:
583 return pmd;
584}
585
565/* 586/*
566 * Check that @page is mapped at @address into @mm. 587 * Check that @page is mapped at @address into @mm.
567 * 588 *
@@ -574,8 +595,6 @@ unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
574pte_t *__page_check_address(struct page *page, struct mm_struct *mm, 595pte_t *__page_check_address(struct page *page, struct mm_struct *mm,
575 unsigned long address, spinlock_t **ptlp, int sync) 596 unsigned long address, spinlock_t **ptlp, int sync)
576{ 597{
577 pgd_t *pgd;
578 pud_t *pud;
579 pmd_t *pmd; 598 pmd_t *pmd;
580 pte_t *pte; 599 pte_t *pte;
581 spinlock_t *ptl; 600 spinlock_t *ptl;
@@ -586,17 +605,10 @@ pte_t *__page_check_address(struct page *page, struct mm_struct *mm,
586 goto check; 605 goto check;
587 } 606 }
588 607
589 pgd = pgd_offset(mm, address); 608 pmd = mm_find_pmd(mm, address);
590 if (!pgd_present(*pgd)) 609 if (!pmd)
591 return NULL;
592
593 pud = pud_offset(pgd, address);
594 if (!pud_present(*pud))
595 return NULL; 610 return NULL;
596 611
597 pmd = pmd_offset(pud, address);
598 if (!pmd_present(*pmd))
599 return NULL;
600 if (pmd_trans_huge(*pmd)) 612 if (pmd_trans_huge(*pmd))
601 return NULL; 613 return NULL;
602 614
@@ -732,7 +744,7 @@ static int page_referenced_anon(struct page *page,
732 struct anon_vma_chain *avc; 744 struct anon_vma_chain *avc;
733 int referenced = 0; 745 int referenced = 0;
734 746
735 anon_vma = page_lock_anon_vma(page); 747 anon_vma = page_lock_anon_vma_read(page);
736 if (!anon_vma) 748 if (!anon_vma)
737 return referenced; 749 return referenced;
738 750
@@ -754,7 +766,7 @@ static int page_referenced_anon(struct page *page,
754 break; 766 break;
755 } 767 }
756 768
757 page_unlock_anon_vma(anon_vma); 769 page_unlock_anon_vma_read(anon_vma);
758 return referenced; 770 return referenced;
759} 771}
760 772
@@ -1139,9 +1151,11 @@ void page_remove_rmap(struct page *page)
1139 * containing the swap entry, but page not yet written to swap. 1151 * containing the swap entry, but page not yet written to swap.
1140 * 1152 *
1141 * And we can skip it on file pages, so long as the filesystem 1153 * And we can skip it on file pages, so long as the filesystem
1142 * participates in dirty tracking; but need to catch shm and tmpfs 1154 * participates in dirty tracking (note that this is not only an
1143 * and ramfs pages which have been modified since creation by read 1155 * optimization but also solves problems caused by dirty flag in
1144 * fault. 1156 * storage key getting set by a write from inside kernel); but need to
1157 * catch shm and tmpfs and ramfs pages which have been modified since
1158 * creation by read fault.
1145 * 1159 *
1146 * Note that mapping must be decided above, before decrementing 1160 * Note that mapping must be decided above, before decrementing
1147 * mapcount (which luckily provides a barrier): once page is unmapped, 1161 * mapcount (which luckily provides a barrier): once page is unmapped,
@@ -1235,12 +1249,14 @@ int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
1235 update_hiwater_rss(mm); 1249 update_hiwater_rss(mm);
1236 1250
1237 if (PageHWPoison(page) && !(flags & TTU_IGNORE_HWPOISON)) { 1251 if (PageHWPoison(page) && !(flags & TTU_IGNORE_HWPOISON)) {
1238 if (PageAnon(page)) 1252 if (!PageHuge(page)) {
1239 dec_mm_counter(mm, MM_ANONPAGES); 1253 if (PageAnon(page))
1240 else 1254 dec_mm_counter(mm, MM_ANONPAGES);
1241 dec_mm_counter(mm, MM_FILEPAGES); 1255 else
1256 dec_mm_counter(mm, MM_FILEPAGES);
1257 }
1242 set_pte_at(mm, address, pte, 1258 set_pte_at(mm, address, pte,
1243 swp_entry_to_pte(make_hwpoison_entry(page))); 1259 swp_entry_to_pte(make_hwpoison_entry(page)));
1244 } else if (PageAnon(page)) { 1260 } else if (PageAnon(page)) {
1245 swp_entry_t entry = { .val = page_private(page) }; 1261 swp_entry_t entry = { .val = page_private(page) };
1246 1262
@@ -1299,7 +1315,7 @@ out_mlock:
1299 /* 1315 /*
1300 * We need mmap_sem locking, Otherwise VM_LOCKED check makes 1316 * We need mmap_sem locking, Otherwise VM_LOCKED check makes
1301 * unstable result and race. Plus, We can't wait here because 1317 * unstable result and race. Plus, We can't wait here because
1302 * we now hold anon_vma->mutex or mapping->i_mmap_mutex. 1318 * we now hold anon_vma->rwsem or mapping->i_mmap_mutex.
1303 * if trylock failed, the page remain in evictable lru and later 1319 * if trylock failed, the page remain in evictable lru and later
1304 * vmscan could retry to move the page to unevictable lru if the 1320 * vmscan could retry to move the page to unevictable lru if the
1305 * page is actually mlocked. 1321 * page is actually mlocked.
@@ -1345,8 +1361,6 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount,
1345 struct vm_area_struct *vma, struct page *check_page) 1361 struct vm_area_struct *vma, struct page *check_page)
1346{ 1362{
1347 struct mm_struct *mm = vma->vm_mm; 1363 struct mm_struct *mm = vma->vm_mm;
1348 pgd_t *pgd;
1349 pud_t *pud;
1350 pmd_t *pmd; 1364 pmd_t *pmd;
1351 pte_t *pte; 1365 pte_t *pte;
1352 pte_t pteval; 1366 pte_t pteval;
@@ -1366,16 +1380,8 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount,
1366 if (end > vma->vm_end) 1380 if (end > vma->vm_end)
1367 end = vma->vm_end; 1381 end = vma->vm_end;
1368 1382
1369 pgd = pgd_offset(mm, address); 1383 pmd = mm_find_pmd(mm, address);
1370 if (!pgd_present(*pgd)) 1384 if (!pmd)
1371 return ret;
1372
1373 pud = pud_offset(pgd, address);
1374 if (!pud_present(*pud))
1375 return ret;
1376
1377 pmd = pmd_offset(pud, address);
1378 if (!pmd_present(*pmd))
1379 return ret; 1385 return ret;
1380 1386
1381 mmun_start = address; 1387 mmun_start = address;
@@ -1474,7 +1480,7 @@ static int try_to_unmap_anon(struct page *page, enum ttu_flags flags)
1474 struct anon_vma_chain *avc; 1480 struct anon_vma_chain *avc;
1475 int ret = SWAP_AGAIN; 1481 int ret = SWAP_AGAIN;
1476 1482
1477 anon_vma = page_lock_anon_vma(page); 1483 anon_vma = page_lock_anon_vma_read(page);
1478 if (!anon_vma) 1484 if (!anon_vma)
1479 return ret; 1485 return ret;
1480 1486
@@ -1501,7 +1507,7 @@ static int try_to_unmap_anon(struct page *page, enum ttu_flags flags)
1501 break; 1507 break;
1502 } 1508 }
1503 1509
1504 page_unlock_anon_vma(anon_vma); 1510 page_unlock_anon_vma_read(anon_vma);
1505 return ret; 1511 return ret;
1506} 1512}
1507 1513
@@ -1696,7 +1702,7 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *,
1696 int ret = SWAP_AGAIN; 1702 int ret = SWAP_AGAIN;
1697 1703
1698 /* 1704 /*
1699 * Note: remove_migration_ptes() cannot use page_lock_anon_vma() 1705 * Note: remove_migration_ptes() cannot use page_lock_anon_vma_read()
1700 * because that depends on page_mapped(); but not all its usages 1706 * because that depends on page_mapped(); but not all its usages
1701 * are holding mmap_sem. Users without mmap_sem are required to 1707 * are holding mmap_sem. Users without mmap_sem are required to
1702 * take a reference count to prevent the anon_vma disappearing 1708 * take a reference count to prevent the anon_vma disappearing
@@ -1704,7 +1710,7 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *,
1704 anon_vma = page_anon_vma(page); 1710 anon_vma = page_anon_vma(page);
1705 if (!anon_vma) 1711 if (!anon_vma)
1706 return ret; 1712 return ret;
1707 anon_vma_lock(anon_vma); 1713 anon_vma_lock_read(anon_vma);
1708 anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) { 1714 anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) {
1709 struct vm_area_struct *vma = avc->vma; 1715 struct vm_area_struct *vma = avc->vma;
1710 unsigned long address = vma_address(page, vma); 1716 unsigned long address = vma_address(page, vma);
@@ -1712,7 +1718,7 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *,
1712 if (ret != SWAP_AGAIN) 1718 if (ret != SWAP_AGAIN)
1713 break; 1719 break;
1714 } 1720 }
1715 anon_vma_unlock(anon_vma); 1721 anon_vma_unlock_read(anon_vma);
1716 return ret; 1722 return ret;
1717} 1723}
1718 1724
diff --git a/mm/shmem.c b/mm/shmem.c
index 89341b658bd0..5dd56f6efdbd 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -889,7 +889,7 @@ static void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol)
889 if (!mpol || mpol->mode == MPOL_DEFAULT) 889 if (!mpol || mpol->mode == MPOL_DEFAULT)
890 return; /* show nothing */ 890 return; /* show nothing */
891 891
892 mpol_to_str(buffer, sizeof(buffer), mpol, 1); 892 mpol_to_str(buffer, sizeof(buffer), mpol);
893 893
894 seq_printf(seq, ",mpol=%s", buffer); 894 seq_printf(seq, ",mpol=%s", buffer);
895} 895}
@@ -910,25 +910,29 @@ static struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo)
910static struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp, 910static struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp,
911 struct shmem_inode_info *info, pgoff_t index) 911 struct shmem_inode_info *info, pgoff_t index)
912{ 912{
913 struct mempolicy mpol, *spol;
914 struct vm_area_struct pvma; 913 struct vm_area_struct pvma;
915 914 struct page *page;
916 spol = mpol_cond_copy(&mpol,
917 mpol_shared_policy_lookup(&info->policy, index));
918 915
919 /* Create a pseudo vma that just contains the policy */ 916 /* Create a pseudo vma that just contains the policy */
920 pvma.vm_start = 0; 917 pvma.vm_start = 0;
921 /* Bias interleave by inode number to distribute better across nodes */ 918 /* Bias interleave by inode number to distribute better across nodes */
922 pvma.vm_pgoff = index + info->vfs_inode.i_ino; 919 pvma.vm_pgoff = index + info->vfs_inode.i_ino;
923 pvma.vm_ops = NULL; 920 pvma.vm_ops = NULL;
924 pvma.vm_policy = spol; 921 pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, index);
925 return swapin_readahead(swap, gfp, &pvma, 0); 922
923 page = swapin_readahead(swap, gfp, &pvma, 0);
924
925 /* Drop reference taken by mpol_shared_policy_lookup() */
926 mpol_cond_put(pvma.vm_policy);
927
928 return page;
926} 929}
927 930
928static struct page *shmem_alloc_page(gfp_t gfp, 931static struct page *shmem_alloc_page(gfp_t gfp,
929 struct shmem_inode_info *info, pgoff_t index) 932 struct shmem_inode_info *info, pgoff_t index)
930{ 933{
931 struct vm_area_struct pvma; 934 struct vm_area_struct pvma;
935 struct page *page;
932 936
933 /* Create a pseudo vma that just contains the policy */ 937 /* Create a pseudo vma that just contains the policy */
934 pvma.vm_start = 0; 938 pvma.vm_start = 0;
@@ -937,10 +941,12 @@ static struct page *shmem_alloc_page(gfp_t gfp,
937 pvma.vm_ops = NULL; 941 pvma.vm_ops = NULL;
938 pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, index); 942 pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, index);
939 943
940 /* 944 page = alloc_page_vma(gfp, &pvma, 0);
941 * alloc_page_vma() will drop the shared policy reference 945
942 */ 946 /* Drop reference taken by mpol_shared_policy_lookup() */
943 return alloc_page_vma(gfp, &pvma, 0); 947 mpol_cond_put(pvma.vm_policy);
948
949 return page;
944} 950}
945#else /* !CONFIG_NUMA */ 951#else /* !CONFIG_NUMA */
946#ifdef CONFIG_TMPFS 952#ifdef CONFIG_TMPFS
@@ -1709,6 +1715,96 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos,
1709 return error; 1715 return error;
1710} 1716}
1711 1717
1718/*
1719 * llseek SEEK_DATA or SEEK_HOLE through the radix_tree.
1720 */
1721static pgoff_t shmem_seek_hole_data(struct address_space *mapping,
1722 pgoff_t index, pgoff_t end, int whence)
1723{
1724 struct page *page;
1725 struct pagevec pvec;
1726 pgoff_t indices[PAGEVEC_SIZE];
1727 bool done = false;
1728 int i;
1729
1730 pagevec_init(&pvec, 0);
1731 pvec.nr = 1; /* start small: we may be there already */
1732 while (!done) {
1733 pvec.nr = shmem_find_get_pages_and_swap(mapping, index,
1734 pvec.nr, pvec.pages, indices);
1735 if (!pvec.nr) {
1736 if (whence == SEEK_DATA)
1737 index = end;
1738 break;
1739 }
1740 for (i = 0; i < pvec.nr; i++, index++) {
1741 if (index < indices[i]) {
1742 if (whence == SEEK_HOLE) {
1743 done = true;
1744 break;
1745 }
1746 index = indices[i];
1747 }
1748 page = pvec.pages[i];
1749 if (page && !radix_tree_exceptional_entry(page)) {
1750 if (!PageUptodate(page))
1751 page = NULL;
1752 }
1753 if (index >= end ||
1754 (page && whence == SEEK_DATA) ||
1755 (!page && whence == SEEK_HOLE)) {
1756 done = true;
1757 break;
1758 }
1759 }
1760 shmem_deswap_pagevec(&pvec);
1761 pagevec_release(&pvec);
1762 pvec.nr = PAGEVEC_SIZE;
1763 cond_resched();
1764 }
1765 return index;
1766}
1767
1768static loff_t shmem_file_llseek(struct file *file, loff_t offset, int whence)
1769{
1770 struct address_space *mapping = file->f_mapping;
1771 struct inode *inode = mapping->host;
1772 pgoff_t start, end;
1773 loff_t new_offset;
1774
1775 if (whence != SEEK_DATA && whence != SEEK_HOLE)
1776 return generic_file_llseek_size(file, offset, whence,
1777 MAX_LFS_FILESIZE, i_size_read(inode));
1778 mutex_lock(&inode->i_mutex);
1779 /* We're holding i_mutex so we can access i_size directly */
1780
1781 if (offset < 0)
1782 offset = -EINVAL;
1783 else if (offset >= inode->i_size)
1784 offset = -ENXIO;
1785 else {
1786 start = offset >> PAGE_CACHE_SHIFT;
1787 end = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
1788 new_offset = shmem_seek_hole_data(mapping, start, end, whence);
1789 new_offset <<= PAGE_CACHE_SHIFT;
1790 if (new_offset > offset) {
1791 if (new_offset < inode->i_size)
1792 offset = new_offset;
1793 else if (whence == SEEK_DATA)
1794 offset = -ENXIO;
1795 else
1796 offset = inode->i_size;
1797 }
1798 }
1799
1800 if (offset >= 0 && offset != file->f_pos) {
1801 file->f_pos = offset;
1802 file->f_version = 0;
1803 }
1804 mutex_unlock(&inode->i_mutex);
1805 return offset;
1806}
1807
1712static long shmem_fallocate(struct file *file, int mode, loff_t offset, 1808static long shmem_fallocate(struct file *file, int mode, loff_t offset,
1713 loff_t len) 1809 loff_t len)
1714{ 1810{
@@ -2367,7 +2463,7 @@ static int shmem_parse_options(char *options, struct shmem_sb_info *sbinfo,
2367 if (!gid_valid(sbinfo->gid)) 2463 if (!gid_valid(sbinfo->gid))
2368 goto bad_val; 2464 goto bad_val;
2369 } else if (!strcmp(this_char,"mpol")) { 2465 } else if (!strcmp(this_char,"mpol")) {
2370 if (mpol_parse_str(value, &sbinfo->mpol, 1)) 2466 if (mpol_parse_str(value, &sbinfo->mpol))
2371 goto bad_val; 2467 goto bad_val;
2372 } else { 2468 } else {
2373 printk(KERN_ERR "tmpfs: Bad mount option %s\n", 2469 printk(KERN_ERR "tmpfs: Bad mount option %s\n",
@@ -2580,7 +2676,7 @@ static const struct address_space_operations shmem_aops = {
2580static const struct file_operations shmem_file_operations = { 2676static const struct file_operations shmem_file_operations = {
2581 .mmap = shmem_mmap, 2677 .mmap = shmem_mmap,
2582#ifdef CONFIG_TMPFS 2678#ifdef CONFIG_TMPFS
2583 .llseek = generic_file_llseek, 2679 .llseek = shmem_file_llseek,
2584 .read = do_sync_read, 2680 .read = do_sync_read,
2585 .write = do_sync_write, 2681 .write = do_sync_write,
2586 .aio_read = shmem_file_aio_read, 2682 .aio_read = shmem_file_aio_read,
diff --git a/mm/slab.c b/mm/slab.c
index 33d3363658df..e7667a3584bc 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -87,7 +87,6 @@
87 */ 87 */
88 88
89#include <linux/slab.h> 89#include <linux/slab.h>
90#include "slab.h"
91#include <linux/mm.h> 90#include <linux/mm.h>
92#include <linux/poison.h> 91#include <linux/poison.h>
93#include <linux/swap.h> 92#include <linux/swap.h>
@@ -128,6 +127,8 @@
128 127
129#include "internal.h" 128#include "internal.h"
130 129
130#include "slab.h"
131
131/* 132/*
132 * DEBUG - 1 for kmem_cache_create() to honour; SLAB_RED_ZONE & SLAB_POISON. 133 * DEBUG - 1 for kmem_cache_create() to honour; SLAB_RED_ZONE & SLAB_POISON.
133 * 0 for faster, smaller code (especially in the critical paths). 134 * 0 for faster, smaller code (especially in the critical paths).
@@ -162,23 +163,6 @@
162 */ 163 */
163static bool pfmemalloc_active __read_mostly; 164static bool pfmemalloc_active __read_mostly;
164 165
165/* Legal flag mask for kmem_cache_create(). */
166#if DEBUG
167# define CREATE_MASK (SLAB_RED_ZONE | \
168 SLAB_POISON | SLAB_HWCACHE_ALIGN | \
169 SLAB_CACHE_DMA | \
170 SLAB_STORE_USER | \
171 SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \
172 SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD | \
173 SLAB_DEBUG_OBJECTS | SLAB_NOLEAKTRACE | SLAB_NOTRACK)
174#else
175# define CREATE_MASK (SLAB_HWCACHE_ALIGN | \
176 SLAB_CACHE_DMA | \
177 SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \
178 SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD | \
179 SLAB_DEBUG_OBJECTS | SLAB_NOLEAKTRACE | SLAB_NOTRACK)
180#endif
181
182/* 166/*
183 * kmem_bufctl_t: 167 * kmem_bufctl_t:
184 * 168 *
@@ -564,15 +548,11 @@ static struct cache_names __initdata cache_names[] = {
564#undef CACHE 548#undef CACHE
565}; 549};
566 550
567static struct arraycache_init initarray_cache __initdata =
568 { {0, BOOT_CPUCACHE_ENTRIES, 1, 0} };
569static struct arraycache_init initarray_generic = 551static struct arraycache_init initarray_generic =
570 { {0, BOOT_CPUCACHE_ENTRIES, 1, 0} }; 552 { {0, BOOT_CPUCACHE_ENTRIES, 1, 0} };
571 553
572/* internal cache of cache description objs */ 554/* internal cache of cache description objs */
573static struct kmem_list3 *kmem_cache_nodelists[MAX_NUMNODES];
574static struct kmem_cache kmem_cache_boot = { 555static struct kmem_cache kmem_cache_boot = {
575 .nodelists = kmem_cache_nodelists,
576 .batchcount = 1, 556 .batchcount = 1,
577 .limit = BOOT_CPUCACHE_ENTRIES, 557 .limit = BOOT_CPUCACHE_ENTRIES,
578 .shared = 1, 558 .shared = 1,
@@ -662,6 +642,26 @@ static void init_node_lock_keys(int q)
662 } 642 }
663} 643}
664 644
645static void on_slab_lock_classes_node(struct kmem_cache *cachep, int q)
646{
647 struct kmem_list3 *l3;
648 l3 = cachep->nodelists[q];
649 if (!l3)
650 return;
651
652 slab_set_lock_classes(cachep, &on_slab_l3_key,
653 &on_slab_alc_key, q);
654}
655
656static inline void on_slab_lock_classes(struct kmem_cache *cachep)
657{
658 int node;
659
660 VM_BUG_ON(OFF_SLAB(cachep));
661 for_each_node(node)
662 on_slab_lock_classes_node(cachep, node);
663}
664
665static inline void init_lock_keys(void) 665static inline void init_lock_keys(void)
666{ 666{
667 int node; 667 int node;
@@ -678,6 +678,14 @@ static inline void init_lock_keys(void)
678{ 678{
679} 679}
680 680
681static inline void on_slab_lock_classes(struct kmem_cache *cachep)
682{
683}
684
685static inline void on_slab_lock_classes_node(struct kmem_cache *cachep, int node)
686{
687}
688
681static void slab_set_debugobj_lock_classes_node(struct kmem_cache *cachep, int node) 689static void slab_set_debugobj_lock_classes_node(struct kmem_cache *cachep, int node)
682{ 690{
683} 691}
@@ -1406,6 +1414,9 @@ static int __cpuinit cpuup_prepare(long cpu)
1406 free_alien_cache(alien); 1414 free_alien_cache(alien);
1407 if (cachep->flags & SLAB_DEBUG_OBJECTS) 1415 if (cachep->flags & SLAB_DEBUG_OBJECTS)
1408 slab_set_debugobj_lock_classes_node(cachep, node); 1416 slab_set_debugobj_lock_classes_node(cachep, node);
1417 else if (!OFF_SLAB(cachep) &&
1418 !(cachep->flags & SLAB_DESTROY_BY_RCU))
1419 on_slab_lock_classes_node(cachep, node);
1409 } 1420 }
1410 init_node_lock_keys(node); 1421 init_node_lock_keys(node);
1411 1422
@@ -1577,28 +1588,33 @@ static void __init set_up_list3s(struct kmem_cache *cachep, int index)
1577} 1588}
1578 1589
1579/* 1590/*
1591 * The memory after the last cpu cache pointer is used for the
1592 * the nodelists pointer.
1593 */
1594static void setup_nodelists_pointer(struct kmem_cache *cachep)
1595{
1596 cachep->nodelists = (struct kmem_list3 **)&cachep->array[nr_cpu_ids];
1597}
1598
1599/*
1580 * Initialisation. Called after the page allocator have been initialised and 1600 * Initialisation. Called after the page allocator have been initialised and
1581 * before smp_init(). 1601 * before smp_init().
1582 */ 1602 */
1583void __init kmem_cache_init(void) 1603void __init kmem_cache_init(void)
1584{ 1604{
1585 size_t left_over;
1586 struct cache_sizes *sizes; 1605 struct cache_sizes *sizes;
1587 struct cache_names *names; 1606 struct cache_names *names;
1588 int i; 1607 int i;
1589 int order;
1590 int node;
1591 1608
1592 kmem_cache = &kmem_cache_boot; 1609 kmem_cache = &kmem_cache_boot;
1610 setup_nodelists_pointer(kmem_cache);
1593 1611
1594 if (num_possible_nodes() == 1) 1612 if (num_possible_nodes() == 1)
1595 use_alien_caches = 0; 1613 use_alien_caches = 0;
1596 1614
1597 for (i = 0; i < NUM_INIT_LISTS; i++) { 1615 for (i = 0; i < NUM_INIT_LISTS; i++)
1598 kmem_list3_init(&initkmem_list3[i]); 1616 kmem_list3_init(&initkmem_list3[i]);
1599 if (i < MAX_NUMNODES) 1617
1600 kmem_cache->nodelists[i] = NULL;
1601 }
1602 set_up_list3s(kmem_cache, CACHE_CACHE); 1618 set_up_list3s(kmem_cache, CACHE_CACHE);
1603 1619
1604 /* 1620 /*
@@ -1629,37 +1645,16 @@ void __init kmem_cache_init(void)
1629 * 6) Resize the head arrays of the kmalloc caches to their final sizes. 1645 * 6) Resize the head arrays of the kmalloc caches to their final sizes.
1630 */ 1646 */
1631 1647
1632 node = numa_mem_id();
1633
1634 /* 1) create the kmem_cache */ 1648 /* 1) create the kmem_cache */
1635 INIT_LIST_HEAD(&slab_caches);
1636 list_add(&kmem_cache->list, &slab_caches);
1637 kmem_cache->colour_off = cache_line_size();
1638 kmem_cache->array[smp_processor_id()] = &initarray_cache.cache;
1639 kmem_cache->nodelists[node] = &initkmem_list3[CACHE_CACHE + node];
1640 1649
1641 /* 1650 /*
1642 * struct kmem_cache size depends on nr_node_ids & nr_cpu_ids 1651 * struct kmem_cache size depends on nr_node_ids & nr_cpu_ids
1643 */ 1652 */
1644 kmem_cache->size = offsetof(struct kmem_cache, array[nr_cpu_ids]) + 1653 create_boot_cache(kmem_cache, "kmem_cache",
1645 nr_node_ids * sizeof(struct kmem_list3 *); 1654 offsetof(struct kmem_cache, array[nr_cpu_ids]) +
1646 kmem_cache->object_size = kmem_cache->size; 1655 nr_node_ids * sizeof(struct kmem_list3 *),
1647 kmem_cache->size = ALIGN(kmem_cache->object_size, 1656 SLAB_HWCACHE_ALIGN);
1648 cache_line_size()); 1657 list_add(&kmem_cache->list, &slab_caches);
1649 kmem_cache->reciprocal_buffer_size =
1650 reciprocal_value(kmem_cache->size);
1651
1652 for (order = 0; order < MAX_ORDER; order++) {
1653 cache_estimate(order, kmem_cache->size,
1654 cache_line_size(), 0, &left_over, &kmem_cache->num);
1655 if (kmem_cache->num)
1656 break;
1657 }
1658 BUG_ON(!kmem_cache->num);
1659 kmem_cache->gfporder = order;
1660 kmem_cache->colour = left_over / kmem_cache->colour_off;
1661 kmem_cache->slab_size = ALIGN(kmem_cache->num * sizeof(kmem_bufctl_t) +
1662 sizeof(struct slab), cache_line_size());
1663 1658
1664 /* 2+3) create the kmalloc caches */ 1659 /* 2+3) create the kmalloc caches */
1665 sizes = malloc_sizes; 1660 sizes = malloc_sizes;
@@ -1671,23 +1666,13 @@ void __init kmem_cache_init(void)
1671 * bug. 1666 * bug.
1672 */ 1667 */
1673 1668
1674 sizes[INDEX_AC].cs_cachep = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT); 1669 sizes[INDEX_AC].cs_cachep = create_kmalloc_cache(names[INDEX_AC].name,
1675 sizes[INDEX_AC].cs_cachep->name = names[INDEX_AC].name; 1670 sizes[INDEX_AC].cs_size, ARCH_KMALLOC_FLAGS);
1676 sizes[INDEX_AC].cs_cachep->size = sizes[INDEX_AC].cs_size; 1671
1677 sizes[INDEX_AC].cs_cachep->object_size = sizes[INDEX_AC].cs_size; 1672 if (INDEX_AC != INDEX_L3)
1678 sizes[INDEX_AC].cs_cachep->align = ARCH_KMALLOC_MINALIGN; 1673 sizes[INDEX_L3].cs_cachep =
1679 __kmem_cache_create(sizes[INDEX_AC].cs_cachep, ARCH_KMALLOC_FLAGS|SLAB_PANIC); 1674 create_kmalloc_cache(names[INDEX_L3].name,
1680 list_add(&sizes[INDEX_AC].cs_cachep->list, &slab_caches); 1675 sizes[INDEX_L3].cs_size, ARCH_KMALLOC_FLAGS);
1681
1682 if (INDEX_AC != INDEX_L3) {
1683 sizes[INDEX_L3].cs_cachep = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT);
1684 sizes[INDEX_L3].cs_cachep->name = names[INDEX_L3].name;
1685 sizes[INDEX_L3].cs_cachep->size = sizes[INDEX_L3].cs_size;
1686 sizes[INDEX_L3].cs_cachep->object_size = sizes[INDEX_L3].cs_size;
1687 sizes[INDEX_L3].cs_cachep->align = ARCH_KMALLOC_MINALIGN;
1688 __kmem_cache_create(sizes[INDEX_L3].cs_cachep, ARCH_KMALLOC_FLAGS|SLAB_PANIC);
1689 list_add(&sizes[INDEX_L3].cs_cachep->list, &slab_caches);
1690 }
1691 1676
1692 slab_early_init = 0; 1677 slab_early_init = 0;
1693 1678
@@ -1699,24 +1684,14 @@ void __init kmem_cache_init(void)
1699 * Note for systems short on memory removing the alignment will 1684 * Note for systems short on memory removing the alignment will
1700 * allow tighter packing of the smaller caches. 1685 * allow tighter packing of the smaller caches.
1701 */ 1686 */
1702 if (!sizes->cs_cachep) { 1687 if (!sizes->cs_cachep)
1703 sizes->cs_cachep = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT); 1688 sizes->cs_cachep = create_kmalloc_cache(names->name,
1704 sizes->cs_cachep->name = names->name; 1689 sizes->cs_size, ARCH_KMALLOC_FLAGS);
1705 sizes->cs_cachep->size = sizes->cs_size; 1690
1706 sizes->cs_cachep->object_size = sizes->cs_size;
1707 sizes->cs_cachep->align = ARCH_KMALLOC_MINALIGN;
1708 __kmem_cache_create(sizes->cs_cachep, ARCH_KMALLOC_FLAGS|SLAB_PANIC);
1709 list_add(&sizes->cs_cachep->list, &slab_caches);
1710 }
1711#ifdef CONFIG_ZONE_DMA 1691#ifdef CONFIG_ZONE_DMA
1712 sizes->cs_dmacachep = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT); 1692 sizes->cs_dmacachep = create_kmalloc_cache(
1713 sizes->cs_dmacachep->name = names->name_dma; 1693 names->name_dma, sizes->cs_size,
1714 sizes->cs_dmacachep->size = sizes->cs_size; 1694 SLAB_CACHE_DMA|ARCH_KMALLOC_FLAGS);
1715 sizes->cs_dmacachep->object_size = sizes->cs_size;
1716 sizes->cs_dmacachep->align = ARCH_KMALLOC_MINALIGN;
1717 __kmem_cache_create(sizes->cs_dmacachep,
1718 ARCH_KMALLOC_FLAGS|SLAB_CACHE_DMA| SLAB_PANIC);
1719 list_add(&sizes->cs_dmacachep->list, &slab_caches);
1720#endif 1695#endif
1721 sizes++; 1696 sizes++;
1722 names++; 1697 names++;
@@ -1727,7 +1702,6 @@ void __init kmem_cache_init(void)
1727 1702
1728 ptr = kmalloc(sizeof(struct arraycache_init), GFP_NOWAIT); 1703 ptr = kmalloc(sizeof(struct arraycache_init), GFP_NOWAIT);
1729 1704
1730 BUG_ON(cpu_cache_get(kmem_cache) != &initarray_cache.cache);
1731 memcpy(ptr, cpu_cache_get(kmem_cache), 1705 memcpy(ptr, cpu_cache_get(kmem_cache),
1732 sizeof(struct arraycache_init)); 1706 sizeof(struct arraycache_init));
1733 /* 1707 /*
@@ -1921,6 +1895,7 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid)
1921 if (page->pfmemalloc) 1895 if (page->pfmemalloc)
1922 SetPageSlabPfmemalloc(page + i); 1896 SetPageSlabPfmemalloc(page + i);
1923 } 1897 }
1898 memcg_bind_pages(cachep, cachep->gfporder);
1924 1899
1925 if (kmemcheck_enabled && !(cachep->flags & SLAB_NOTRACK)) { 1900 if (kmemcheck_enabled && !(cachep->flags & SLAB_NOTRACK)) {
1926 kmemcheck_alloc_shadow(page, cachep->gfporder, flags, nodeid); 1901 kmemcheck_alloc_shadow(page, cachep->gfporder, flags, nodeid);
@@ -1957,9 +1932,11 @@ static void kmem_freepages(struct kmem_cache *cachep, void *addr)
1957 __ClearPageSlab(page); 1932 __ClearPageSlab(page);
1958 page++; 1933 page++;
1959 } 1934 }
1935
1936 memcg_release_pages(cachep, cachep->gfporder);
1960 if (current->reclaim_state) 1937 if (current->reclaim_state)
1961 current->reclaim_state->reclaimed_slab += nr_freed; 1938 current->reclaim_state->reclaimed_slab += nr_freed;
1962 free_pages((unsigned long)addr, cachep->gfporder); 1939 free_memcg_kmem_pages((unsigned long)addr, cachep->gfporder);
1963} 1940}
1964 1941
1965static void kmem_rcu_free(struct rcu_head *head) 1942static void kmem_rcu_free(struct rcu_head *head)
@@ -2282,7 +2259,15 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp)
2282 2259
2283 if (slab_state == DOWN) { 2260 if (slab_state == DOWN) {
2284 /* 2261 /*
2285 * Note: the first kmem_cache_create must create the cache 2262 * Note: Creation of first cache (kmem_cache).
2263 * The setup_list3s is taken care
2264 * of by the caller of __kmem_cache_create
2265 */
2266 cachep->array[smp_processor_id()] = &initarray_generic.cache;
2267 slab_state = PARTIAL;
2268 } else if (slab_state == PARTIAL) {
2269 /*
2270 * Note: the second kmem_cache_create must create the cache
2286 * that's used by kmalloc(24), otherwise the creation of 2271 * that's used by kmalloc(24), otherwise the creation of
2287 * further caches will BUG(). 2272 * further caches will BUG().
2288 */ 2273 */
@@ -2290,7 +2275,7 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp)
2290 2275
2291 /* 2276 /*
2292 * If the cache that's used by kmalloc(sizeof(kmem_list3)) is 2277 * If the cache that's used by kmalloc(sizeof(kmem_list3)) is
2293 * the first cache, then we need to set up all its list3s, 2278 * the second cache, then we need to set up all its list3s,
2294 * otherwise the creation of further caches will BUG(). 2279 * otherwise the creation of further caches will BUG().
2295 */ 2280 */
2296 set_up_list3s(cachep, SIZE_AC); 2281 set_up_list3s(cachep, SIZE_AC);
@@ -2299,6 +2284,7 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp)
2299 else 2284 else
2300 slab_state = PARTIAL_ARRAYCACHE; 2285 slab_state = PARTIAL_ARRAYCACHE;
2301 } else { 2286 } else {
2287 /* Remaining boot caches */
2302 cachep->array[smp_processor_id()] = 2288 cachep->array[smp_processor_id()] =
2303 kmalloc(sizeof(struct arraycache_init), gfp); 2289 kmalloc(sizeof(struct arraycache_init), gfp);
2304 2290
@@ -2331,11 +2317,8 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp)
2331 2317
2332/** 2318/**
2333 * __kmem_cache_create - Create a cache. 2319 * __kmem_cache_create - Create a cache.
2334 * @name: A string which is used in /proc/slabinfo to identify this cache. 2320 * @cachep: cache management descriptor
2335 * @size: The size of objects to be created in this cache.
2336 * @align: The required alignment for the objects.
2337 * @flags: SLAB flags 2321 * @flags: SLAB flags
2338 * @ctor: A constructor for the objects.
2339 * 2322 *
2340 * Returns a ptr to the cache on success, NULL on failure. 2323 * Returns a ptr to the cache on success, NULL on failure.
2341 * Cannot be called within a int, but can be interrupted. 2324 * Cannot be called within a int, but can be interrupted.
@@ -2378,11 +2361,6 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags)
2378 if (flags & SLAB_DESTROY_BY_RCU) 2361 if (flags & SLAB_DESTROY_BY_RCU)
2379 BUG_ON(flags & SLAB_POISON); 2362 BUG_ON(flags & SLAB_POISON);
2380#endif 2363#endif
2381 /*
2382 * Always checks flags, a caller might be expecting debug support which
2383 * isn't available.
2384 */
2385 BUG_ON(flags & ~CREATE_MASK);
2386 2364
2387 /* 2365 /*
2388 * Check that size is in terms of words. This is needed to avoid 2366 * Check that size is in terms of words. This is needed to avoid
@@ -2394,22 +2372,6 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags)
2394 size &= ~(BYTES_PER_WORD - 1); 2372 size &= ~(BYTES_PER_WORD - 1);
2395 } 2373 }
2396 2374
2397 /* calculate the final buffer alignment: */
2398
2399 /* 1) arch recommendation: can be overridden for debug */
2400 if (flags & SLAB_HWCACHE_ALIGN) {
2401 /*
2402 * Default alignment: as specified by the arch code. Except if
2403 * an object is really small, then squeeze multiple objects into
2404 * one cacheline.
2405 */
2406 ralign = cache_line_size();
2407 while (size <= ralign / 2)
2408 ralign /= 2;
2409 } else {
2410 ralign = BYTES_PER_WORD;
2411 }
2412
2413 /* 2375 /*
2414 * Redzoning and user store require word alignment or possibly larger. 2376 * Redzoning and user store require word alignment or possibly larger.
2415 * Note this will be overridden by architecture or caller mandated 2377 * Note this will be overridden by architecture or caller mandated
@@ -2426,10 +2388,6 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags)
2426 size &= ~(REDZONE_ALIGN - 1); 2388 size &= ~(REDZONE_ALIGN - 1);
2427 } 2389 }
2428 2390
2429 /* 2) arch mandated alignment */
2430 if (ralign < ARCH_SLAB_MINALIGN) {
2431 ralign = ARCH_SLAB_MINALIGN;
2432 }
2433 /* 3) caller mandated alignment */ 2391 /* 3) caller mandated alignment */
2434 if (ralign < cachep->align) { 2392 if (ralign < cachep->align) {
2435 ralign = cachep->align; 2393 ralign = cachep->align;
@@ -2447,7 +2405,7 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags)
2447 else 2405 else
2448 gfp = GFP_NOWAIT; 2406 gfp = GFP_NOWAIT;
2449 2407
2450 cachep->nodelists = (struct kmem_list3 **)&cachep->array[nr_cpu_ids]; 2408 setup_nodelists_pointer(cachep);
2451#if DEBUG 2409#if DEBUG
2452 2410
2453 /* 2411 /*
@@ -2566,7 +2524,8 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags)
2566 WARN_ON_ONCE(flags & SLAB_DESTROY_BY_RCU); 2524 WARN_ON_ONCE(flags & SLAB_DESTROY_BY_RCU);
2567 2525
2568 slab_set_debugobj_lock_classes(cachep); 2526 slab_set_debugobj_lock_classes(cachep);
2569 } 2527 } else if (!OFF_SLAB(cachep) && !(flags & SLAB_DESTROY_BY_RCU))
2528 on_slab_lock_classes(cachep);
2570 2529
2571 return 0; 2530 return 0;
2572} 2531}
@@ -3530,6 +3489,8 @@ slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid,
3530 if (slab_should_failslab(cachep, flags)) 3489 if (slab_should_failslab(cachep, flags))
3531 return NULL; 3490 return NULL;
3532 3491
3492 cachep = memcg_kmem_get_cache(cachep, flags);
3493
3533 cache_alloc_debugcheck_before(cachep, flags); 3494 cache_alloc_debugcheck_before(cachep, flags);
3534 local_irq_save(save_flags); 3495 local_irq_save(save_flags);
3535 3496
@@ -3615,6 +3576,8 @@ slab_alloc(struct kmem_cache *cachep, gfp_t flags, unsigned long caller)
3615 if (slab_should_failslab(cachep, flags)) 3576 if (slab_should_failslab(cachep, flags))
3616 return NULL; 3577 return NULL;
3617 3578
3579 cachep = memcg_kmem_get_cache(cachep, flags);
3580
3618 cache_alloc_debugcheck_before(cachep, flags); 3581 cache_alloc_debugcheck_before(cachep, flags);
3619 local_irq_save(save_flags); 3582 local_irq_save(save_flags);
3620 objp = __do_cache_alloc(cachep, flags); 3583 objp = __do_cache_alloc(cachep, flags);
@@ -3928,6 +3891,9 @@ EXPORT_SYMBOL(__kmalloc);
3928void kmem_cache_free(struct kmem_cache *cachep, void *objp) 3891void kmem_cache_free(struct kmem_cache *cachep, void *objp)
3929{ 3892{
3930 unsigned long flags; 3893 unsigned long flags;
3894 cachep = cache_from_obj(cachep, objp);
3895 if (!cachep)
3896 return;
3931 3897
3932 local_irq_save(flags); 3898 local_irq_save(flags);
3933 debug_check_no_locks_freed(objp, cachep->object_size); 3899 debug_check_no_locks_freed(objp, cachep->object_size);
@@ -3969,12 +3935,6 @@ void kfree(const void *objp)
3969} 3935}
3970EXPORT_SYMBOL(kfree); 3936EXPORT_SYMBOL(kfree);
3971 3937
3972unsigned int kmem_cache_size(struct kmem_cache *cachep)
3973{
3974 return cachep->object_size;
3975}
3976EXPORT_SYMBOL(kmem_cache_size);
3977
3978/* 3938/*
3979 * This initializes kmem_list3 or resizes various caches for all nodes. 3939 * This initializes kmem_list3 or resizes various caches for all nodes.
3980 */ 3940 */
@@ -4081,7 +4041,7 @@ static void do_ccupdate_local(void *info)
4081} 4041}
4082 4042
4083/* Always called with the slab_mutex held */ 4043/* Always called with the slab_mutex held */
4084static int do_tune_cpucache(struct kmem_cache *cachep, int limit, 4044static int __do_tune_cpucache(struct kmem_cache *cachep, int limit,
4085 int batchcount, int shared, gfp_t gfp) 4045 int batchcount, int shared, gfp_t gfp)
4086{ 4046{
4087 struct ccupdate_struct *new; 4047 struct ccupdate_struct *new;
@@ -4124,12 +4084,49 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
4124 return alloc_kmemlist(cachep, gfp); 4084 return alloc_kmemlist(cachep, gfp);
4125} 4085}
4126 4086
4087static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
4088 int batchcount, int shared, gfp_t gfp)
4089{
4090 int ret;
4091 struct kmem_cache *c = NULL;
4092 int i = 0;
4093
4094 ret = __do_tune_cpucache(cachep, limit, batchcount, shared, gfp);
4095
4096 if (slab_state < FULL)
4097 return ret;
4098
4099 if ((ret < 0) || !is_root_cache(cachep))
4100 return ret;
4101
4102 VM_BUG_ON(!mutex_is_locked(&slab_mutex));
4103 for_each_memcg_cache_index(i) {
4104 c = cache_from_memcg(cachep, i);
4105 if (c)
4106 /* return value determined by the parent cache only */
4107 __do_tune_cpucache(c, limit, batchcount, shared, gfp);
4108 }
4109
4110 return ret;
4111}
4112
4127/* Called with slab_mutex held always */ 4113/* Called with slab_mutex held always */
4128static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp) 4114static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp)
4129{ 4115{
4130 int err; 4116 int err;
4131 int limit, shared; 4117 int limit = 0;
4118 int shared = 0;
4119 int batchcount = 0;
4120
4121 if (!is_root_cache(cachep)) {
4122 struct kmem_cache *root = memcg_root_cache(cachep);
4123 limit = root->limit;
4124 shared = root->shared;
4125 batchcount = root->batchcount;
4126 }
4132 4127
4128 if (limit && shared && batchcount)
4129 goto skip_setup;
4133 /* 4130 /*
4134 * The head array serves three purposes: 4131 * The head array serves three purposes:
4135 * - create a LIFO ordering, i.e. return objects that are cache-warm 4132 * - create a LIFO ordering, i.e. return objects that are cache-warm
@@ -4171,7 +4168,9 @@ static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp)
4171 if (limit > 32) 4168 if (limit > 32)
4172 limit = 32; 4169 limit = 32;
4173#endif 4170#endif
4174 err = do_tune_cpucache(cachep, limit, (limit + 1) / 2, shared, gfp); 4171 batchcount = (limit + 1) / 2;
4172skip_setup:
4173 err = do_tune_cpucache(cachep, limit, batchcount, shared, gfp);
4175 if (err) 4174 if (err)
4176 printk(KERN_ERR "enable_cpucache failed for %s, error %d.\n", 4175 printk(KERN_ERR "enable_cpucache failed for %s, error %d.\n",
4177 cachep->name, -err); 4176 cachep->name, -err);
@@ -4276,54 +4275,8 @@ out:
4276} 4275}
4277 4276
4278#ifdef CONFIG_SLABINFO 4277#ifdef CONFIG_SLABINFO
4279 4278void get_slabinfo(struct kmem_cache *cachep, struct slabinfo *sinfo)
4280static void print_slabinfo_header(struct seq_file *m)
4281{
4282 /*
4283 * Output format version, so at least we can change it
4284 * without _too_ many complaints.
4285 */
4286#if STATS
4287 seq_puts(m, "slabinfo - version: 2.1 (statistics)\n");
4288#else
4289 seq_puts(m, "slabinfo - version: 2.1\n");
4290#endif
4291 seq_puts(m, "# name <active_objs> <num_objs> <objsize> "
4292 "<objperslab> <pagesperslab>");
4293 seq_puts(m, " : tunables <limit> <batchcount> <sharedfactor>");
4294 seq_puts(m, " : slabdata <active_slabs> <num_slabs> <sharedavail>");
4295#if STATS
4296 seq_puts(m, " : globalstat <listallocs> <maxobjs> <grown> <reaped> "
4297 "<error> <maxfreeable> <nodeallocs> <remotefrees> <alienoverflow>");
4298 seq_puts(m, " : cpustat <allochit> <allocmiss> <freehit> <freemiss>");
4299#endif
4300 seq_putc(m, '\n');
4301}
4302
4303static void *s_start(struct seq_file *m, loff_t *pos)
4304{
4305 loff_t n = *pos;
4306
4307 mutex_lock(&slab_mutex);
4308 if (!n)
4309 print_slabinfo_header(m);
4310
4311 return seq_list_start(&slab_caches, *pos);
4312}
4313
4314static void *s_next(struct seq_file *m, void *p, loff_t *pos)
4315{ 4279{
4316 return seq_list_next(p, &slab_caches, pos);
4317}
4318
4319static void s_stop(struct seq_file *m, void *p)
4320{
4321 mutex_unlock(&slab_mutex);
4322}
4323
4324static int s_show(struct seq_file *m, void *p)
4325{
4326 struct kmem_cache *cachep = list_entry(p, struct kmem_cache, list);
4327 struct slab *slabp; 4280 struct slab *slabp;
4328 unsigned long active_objs; 4281 unsigned long active_objs;
4329 unsigned long num_objs; 4282 unsigned long num_objs;
@@ -4378,13 +4331,20 @@ static int s_show(struct seq_file *m, void *p)
4378 if (error) 4331 if (error)
4379 printk(KERN_ERR "slab: cache %s error: %s\n", name, error); 4332 printk(KERN_ERR "slab: cache %s error: %s\n", name, error);
4380 4333
4381 seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d", 4334 sinfo->active_objs = active_objs;
4382 name, active_objs, num_objs, cachep->size, 4335 sinfo->num_objs = num_objs;
4383 cachep->num, (1 << cachep->gfporder)); 4336 sinfo->active_slabs = active_slabs;
4384 seq_printf(m, " : tunables %4u %4u %4u", 4337 sinfo->num_slabs = num_slabs;
4385 cachep->limit, cachep->batchcount, cachep->shared); 4338 sinfo->shared_avail = shared_avail;
4386 seq_printf(m, " : slabdata %6lu %6lu %6lu", 4339 sinfo->limit = cachep->limit;
4387 active_slabs, num_slabs, shared_avail); 4340 sinfo->batchcount = cachep->batchcount;
4341 sinfo->shared = cachep->shared;
4342 sinfo->objects_per_slab = cachep->num;
4343 sinfo->cache_order = cachep->gfporder;
4344}
4345
4346void slabinfo_show_stats(struct seq_file *m, struct kmem_cache *cachep)
4347{
4388#if STATS 4348#if STATS
4389 { /* list3 stats */ 4349 { /* list3 stats */
4390 unsigned long high = cachep->high_mark; 4350 unsigned long high = cachep->high_mark;
@@ -4414,31 +4374,8 @@ static int s_show(struct seq_file *m, void *p)
4414 allochit, allocmiss, freehit, freemiss); 4374 allochit, allocmiss, freehit, freemiss);
4415 } 4375 }
4416#endif 4376#endif
4417 seq_putc(m, '\n');
4418 return 0;
4419} 4377}
4420 4378
4421/*
4422 * slabinfo_op - iterator that generates /proc/slabinfo
4423 *
4424 * Output layout:
4425 * cache-name
4426 * num-active-objs
4427 * total-objs
4428 * object size
4429 * num-active-slabs
4430 * total-slabs
4431 * num-pages-per-slab
4432 * + further values on SMP and with statistics enabled
4433 */
4434
4435static const struct seq_operations slabinfo_op = {
4436 .start = s_start,
4437 .next = s_next,
4438 .stop = s_stop,
4439 .show = s_show,
4440};
4441
4442#define MAX_SLABINFO_WRITE 128 4379#define MAX_SLABINFO_WRITE 128
4443/** 4380/**
4444 * slabinfo_write - Tuning for the slab allocator 4381 * slabinfo_write - Tuning for the slab allocator
@@ -4447,7 +4384,7 @@ static const struct seq_operations slabinfo_op = {
4447 * @count: data length 4384 * @count: data length
4448 * @ppos: unused 4385 * @ppos: unused
4449 */ 4386 */
4450static ssize_t slabinfo_write(struct file *file, const char __user *buffer, 4387ssize_t slabinfo_write(struct file *file, const char __user *buffer,
4451 size_t count, loff_t *ppos) 4388 size_t count, loff_t *ppos)
4452{ 4389{
4453 char kbuf[MAX_SLABINFO_WRITE + 1], *tmp; 4390 char kbuf[MAX_SLABINFO_WRITE + 1], *tmp;
@@ -4490,19 +4427,6 @@ static ssize_t slabinfo_write(struct file *file, const char __user *buffer,
4490 return res; 4427 return res;
4491} 4428}
4492 4429
4493static int slabinfo_open(struct inode *inode, struct file *file)
4494{
4495 return seq_open(file, &slabinfo_op);
4496}
4497
4498static const struct file_operations proc_slabinfo_operations = {
4499 .open = slabinfo_open,
4500 .read = seq_read,
4501 .write = slabinfo_write,
4502 .llseek = seq_lseek,
4503 .release = seq_release,
4504};
4505
4506#ifdef CONFIG_DEBUG_SLAB_LEAK 4430#ifdef CONFIG_DEBUG_SLAB_LEAK
4507 4431
4508static void *leaks_start(struct seq_file *m, loff_t *pos) 4432static void *leaks_start(struct seq_file *m, loff_t *pos)
@@ -4631,6 +4555,16 @@ static int leaks_show(struct seq_file *m, void *p)
4631 return 0; 4555 return 0;
4632} 4556}
4633 4557
4558static void *s_next(struct seq_file *m, void *p, loff_t *pos)
4559{
4560 return seq_list_next(p, &slab_caches, pos);
4561}
4562
4563static void s_stop(struct seq_file *m, void *p)
4564{
4565 mutex_unlock(&slab_mutex);
4566}
4567
4634static const struct seq_operations slabstats_op = { 4568static const struct seq_operations slabstats_op = {
4635 .start = leaks_start, 4569 .start = leaks_start,
4636 .next = s_next, 4570 .next = s_next,
@@ -4665,7 +4599,6 @@ static const struct file_operations proc_slabstats_operations = {
4665 4599
4666static int __init slab_proc_init(void) 4600static int __init slab_proc_init(void)
4667{ 4601{
4668 proc_create("slabinfo",S_IWUSR|S_IRUSR,NULL,&proc_slabinfo_operations);
4669#ifdef CONFIG_DEBUG_SLAB_LEAK 4602#ifdef CONFIG_DEBUG_SLAB_LEAK
4670 proc_create("slab_allocators", 0, NULL, &proc_slabstats_operations); 4603 proc_create("slab_allocators", 0, NULL, &proc_slabstats_operations);
4671#endif 4604#endif
diff --git a/mm/slab.h b/mm/slab.h
index 7deeb449a301..34a98d642196 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -32,19 +32,201 @@ extern struct list_head slab_caches;
32/* The slab cache that manages slab cache information */ 32/* The slab cache that manages slab cache information */
33extern struct kmem_cache *kmem_cache; 33extern struct kmem_cache *kmem_cache;
34 34
35unsigned long calculate_alignment(unsigned long flags,
36 unsigned long align, unsigned long size);
37
35/* Functions provided by the slab allocators */ 38/* Functions provided by the slab allocators */
36extern int __kmem_cache_create(struct kmem_cache *, unsigned long flags); 39extern int __kmem_cache_create(struct kmem_cache *, unsigned long flags);
37 40
41extern struct kmem_cache *create_kmalloc_cache(const char *name, size_t size,
42 unsigned long flags);
43extern void create_boot_cache(struct kmem_cache *, const char *name,
44 size_t size, unsigned long flags);
45
46struct mem_cgroup;
38#ifdef CONFIG_SLUB 47#ifdef CONFIG_SLUB
39struct kmem_cache *__kmem_cache_alias(const char *name, size_t size, 48struct kmem_cache *
40 size_t align, unsigned long flags, void (*ctor)(void *)); 49__kmem_cache_alias(struct mem_cgroup *memcg, const char *name, size_t size,
50 size_t align, unsigned long flags, void (*ctor)(void *));
41#else 51#else
42static inline struct kmem_cache *__kmem_cache_alias(const char *name, size_t size, 52static inline struct kmem_cache *
43 size_t align, unsigned long flags, void (*ctor)(void *)) 53__kmem_cache_alias(struct mem_cgroup *memcg, const char *name, size_t size,
54 size_t align, unsigned long flags, void (*ctor)(void *))
44{ return NULL; } 55{ return NULL; }
45#endif 56#endif
46 57
47 58
59/* Legal flag mask for kmem_cache_create(), for various configurations */
60#define SLAB_CORE_FLAGS (SLAB_HWCACHE_ALIGN | SLAB_CACHE_DMA | SLAB_PANIC | \
61 SLAB_DESTROY_BY_RCU | SLAB_DEBUG_OBJECTS )
62
63#if defined(CONFIG_DEBUG_SLAB)
64#define SLAB_DEBUG_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER)
65#elif defined(CONFIG_SLUB_DEBUG)
66#define SLAB_DEBUG_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \
67 SLAB_TRACE | SLAB_DEBUG_FREE)
68#else
69#define SLAB_DEBUG_FLAGS (0)
70#endif
71
72#if defined(CONFIG_SLAB)
73#define SLAB_CACHE_FLAGS (SLAB_MEM_SPREAD | SLAB_NOLEAKTRACE | \
74 SLAB_RECLAIM_ACCOUNT | SLAB_TEMPORARY | SLAB_NOTRACK)
75#elif defined(CONFIG_SLUB)
76#define SLAB_CACHE_FLAGS (SLAB_NOLEAKTRACE | SLAB_RECLAIM_ACCOUNT | \
77 SLAB_TEMPORARY | SLAB_NOTRACK)
78#else
79#define SLAB_CACHE_FLAGS (0)
80#endif
81
82#define CACHE_CREATE_MASK (SLAB_CORE_FLAGS | SLAB_DEBUG_FLAGS | SLAB_CACHE_FLAGS)
83
48int __kmem_cache_shutdown(struct kmem_cache *); 84int __kmem_cache_shutdown(struct kmem_cache *);
49 85
86struct seq_file;
87struct file;
88
89struct slabinfo {
90 unsigned long active_objs;
91 unsigned long num_objs;
92 unsigned long active_slabs;
93 unsigned long num_slabs;
94 unsigned long shared_avail;
95 unsigned int limit;
96 unsigned int batchcount;
97 unsigned int shared;
98 unsigned int objects_per_slab;
99 unsigned int cache_order;
100};
101
102void get_slabinfo(struct kmem_cache *s, struct slabinfo *sinfo);
103void slabinfo_show_stats(struct seq_file *m, struct kmem_cache *s);
104ssize_t slabinfo_write(struct file *file, const char __user *buffer,
105 size_t count, loff_t *ppos);
106
107#ifdef CONFIG_MEMCG_KMEM
108static inline bool is_root_cache(struct kmem_cache *s)
109{
110 return !s->memcg_params || s->memcg_params->is_root_cache;
111}
112
113static inline bool cache_match_memcg(struct kmem_cache *cachep,
114 struct mem_cgroup *memcg)
115{
116 return (is_root_cache(cachep) && !memcg) ||
117 (cachep->memcg_params->memcg == memcg);
118}
119
120static inline void memcg_bind_pages(struct kmem_cache *s, int order)
121{
122 if (!is_root_cache(s))
123 atomic_add(1 << order, &s->memcg_params->nr_pages);
124}
125
126static inline void memcg_release_pages(struct kmem_cache *s, int order)
127{
128 if (is_root_cache(s))
129 return;
130
131 if (atomic_sub_and_test((1 << order), &s->memcg_params->nr_pages))
132 mem_cgroup_destroy_cache(s);
133}
134
135static inline bool slab_equal_or_root(struct kmem_cache *s,
136 struct kmem_cache *p)
137{
138 return (p == s) ||
139 (s->memcg_params && (p == s->memcg_params->root_cache));
140}
141
142/*
143 * We use suffixes to the name in memcg because we can't have caches
144 * created in the system with the same name. But when we print them
145 * locally, better refer to them with the base name
146 */
147static inline const char *cache_name(struct kmem_cache *s)
148{
149 if (!is_root_cache(s))
150 return s->memcg_params->root_cache->name;
151 return s->name;
152}
153
154static inline struct kmem_cache *cache_from_memcg(struct kmem_cache *s, int idx)
155{
156 return s->memcg_params->memcg_caches[idx];
157}
158
159static inline struct kmem_cache *memcg_root_cache(struct kmem_cache *s)
160{
161 if (is_root_cache(s))
162 return s;
163 return s->memcg_params->root_cache;
164}
165#else
166static inline bool is_root_cache(struct kmem_cache *s)
167{
168 return true;
169}
170
171static inline bool cache_match_memcg(struct kmem_cache *cachep,
172 struct mem_cgroup *memcg)
173{
174 return true;
175}
176
177static inline void memcg_bind_pages(struct kmem_cache *s, int order)
178{
179}
180
181static inline void memcg_release_pages(struct kmem_cache *s, int order)
182{
183}
184
185static inline bool slab_equal_or_root(struct kmem_cache *s,
186 struct kmem_cache *p)
187{
188 return true;
189}
190
191static inline const char *cache_name(struct kmem_cache *s)
192{
193 return s->name;
194}
195
196static inline struct kmem_cache *cache_from_memcg(struct kmem_cache *s, int idx)
197{
198 return NULL;
199}
200
201static inline struct kmem_cache *memcg_root_cache(struct kmem_cache *s)
202{
203 return s;
204}
205#endif
206
207static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x)
208{
209 struct kmem_cache *cachep;
210 struct page *page;
211
212 /*
213 * When kmemcg is not being used, both assignments should return the
214 * same value. but we don't want to pay the assignment price in that
215 * case. If it is not compiled in, the compiler should be smart enough
216 * to not do even the assignment. In that case, slab_equal_or_root
217 * will also be a constant.
218 */
219 if (!memcg_kmem_enabled() && !unlikely(s->flags & SLAB_DEBUG_FREE))
220 return s;
221
222 page = virt_to_head_page(x);
223 cachep = page->slab_cache;
224 if (slab_equal_or_root(cachep, s))
225 return cachep;
226
227 pr_err("%s: Wrong slab cache. %s but object is from %s\n",
228 __FUNCTION__, cachep->name, s->name);
229 WARN_ON_ONCE(1);
230 return s;
231}
50#endif 232#endif
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 069a24e64403..3f3cd97d3fdf 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -13,9 +13,12 @@
13#include <linux/module.h> 13#include <linux/module.h>
14#include <linux/cpu.h> 14#include <linux/cpu.h>
15#include <linux/uaccess.h> 15#include <linux/uaccess.h>
16#include <linux/seq_file.h>
17#include <linux/proc_fs.h>
16#include <asm/cacheflush.h> 18#include <asm/cacheflush.h>
17#include <asm/tlbflush.h> 19#include <asm/tlbflush.h>
18#include <asm/page.h> 20#include <asm/page.h>
21#include <linux/memcontrol.h>
19 22
20#include "slab.h" 23#include "slab.h"
21 24
@@ -25,7 +28,8 @@ DEFINE_MUTEX(slab_mutex);
25struct kmem_cache *kmem_cache; 28struct kmem_cache *kmem_cache;
26 29
27#ifdef CONFIG_DEBUG_VM 30#ifdef CONFIG_DEBUG_VM
28static int kmem_cache_sanity_check(const char *name, size_t size) 31static int kmem_cache_sanity_check(struct mem_cgroup *memcg, const char *name,
32 size_t size)
29{ 33{
30 struct kmem_cache *s = NULL; 34 struct kmem_cache *s = NULL;
31 35
@@ -51,7 +55,13 @@ static int kmem_cache_sanity_check(const char *name, size_t size)
51 continue; 55 continue;
52 } 56 }
53 57
54 if (!strcmp(s->name, name)) { 58 /*
59 * For simplicity, we won't check this in the list of memcg
60 * caches. We have control over memcg naming, and if there
61 * aren't duplicates in the global list, there won't be any
62 * duplicates in the memcg lists as well.
63 */
64 if (!memcg && !strcmp(s->name, name)) {
55 pr_err("%s (%s): Cache name already exists.\n", 65 pr_err("%s (%s): Cache name already exists.\n",
56 __func__, name); 66 __func__, name);
57 dump_stack(); 67 dump_stack();
@@ -64,12 +74,69 @@ static int kmem_cache_sanity_check(const char *name, size_t size)
64 return 0; 74 return 0;
65} 75}
66#else 76#else
67static inline int kmem_cache_sanity_check(const char *name, size_t size) 77static inline int kmem_cache_sanity_check(struct mem_cgroup *memcg,
78 const char *name, size_t size)
68{ 79{
69 return 0; 80 return 0;
70} 81}
71#endif 82#endif
72 83
84#ifdef CONFIG_MEMCG_KMEM
85int memcg_update_all_caches(int num_memcgs)
86{
87 struct kmem_cache *s;
88 int ret = 0;
89 mutex_lock(&slab_mutex);
90
91 list_for_each_entry(s, &slab_caches, list) {
92 if (!is_root_cache(s))
93 continue;
94
95 ret = memcg_update_cache_size(s, num_memcgs);
96 /*
97 * See comment in memcontrol.c, memcg_update_cache_size:
98 * Instead of freeing the memory, we'll just leave the caches
99 * up to this point in an updated state.
100 */
101 if (ret)
102 goto out;
103 }
104
105 memcg_update_array_size(num_memcgs);
106out:
107 mutex_unlock(&slab_mutex);
108 return ret;
109}
110#endif
111
112/*
113 * Figure out what the alignment of the objects will be given a set of
114 * flags, a user specified alignment and the size of the objects.
115 */
116unsigned long calculate_alignment(unsigned long flags,
117 unsigned long align, unsigned long size)
118{
119 /*
120 * If the user wants hardware cache aligned objects then follow that
121 * suggestion if the object is sufficiently large.
122 *
123 * The hardware cache alignment cannot override the specified
124 * alignment though. If that is greater then use it.
125 */
126 if (flags & SLAB_HWCACHE_ALIGN) {
127 unsigned long ralign = cache_line_size();
128 while (size <= ralign / 2)
129 ralign /= 2;
130 align = max(align, ralign);
131 }
132
133 if (align < ARCH_SLAB_MINALIGN)
134 align = ARCH_SLAB_MINALIGN;
135
136 return ALIGN(align, sizeof(void *));
137}
138
139
73/* 140/*
74 * kmem_cache_create - Create a cache. 141 * kmem_cache_create - Create a cache.
75 * @name: A string which is used in /proc/slabinfo to identify this cache. 142 * @name: A string which is used in /proc/slabinfo to identify this cache.
@@ -95,8 +162,10 @@ static inline int kmem_cache_sanity_check(const char *name, size_t size)
95 * as davem. 162 * as davem.
96 */ 163 */
97 164
98struct kmem_cache *kmem_cache_create(const char *name, size_t size, size_t align, 165struct kmem_cache *
99 unsigned long flags, void (*ctor)(void *)) 166kmem_cache_create_memcg(struct mem_cgroup *memcg, const char *name, size_t size,
167 size_t align, unsigned long flags, void (*ctor)(void *),
168 struct kmem_cache *parent_cache)
100{ 169{
101 struct kmem_cache *s = NULL; 170 struct kmem_cache *s = NULL;
102 int err = 0; 171 int err = 0;
@@ -104,19 +173,33 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size, size_t align
104 get_online_cpus(); 173 get_online_cpus();
105 mutex_lock(&slab_mutex); 174 mutex_lock(&slab_mutex);
106 175
107 if (!kmem_cache_sanity_check(name, size) == 0) 176 if (!kmem_cache_sanity_check(memcg, name, size) == 0)
108 goto out_locked; 177 goto out_locked;
109 178
179 /*
180 * Some allocators will constraint the set of valid flags to a subset
181 * of all flags. We expect them to define CACHE_CREATE_MASK in this
182 * case, and we'll just provide them with a sanitized version of the
183 * passed flags.
184 */
185 flags &= CACHE_CREATE_MASK;
110 186
111 s = __kmem_cache_alias(name, size, align, flags, ctor); 187 s = __kmem_cache_alias(memcg, name, size, align, flags, ctor);
112 if (s) 188 if (s)
113 goto out_locked; 189 goto out_locked;
114 190
115 s = kmem_cache_zalloc(kmem_cache, GFP_KERNEL); 191 s = kmem_cache_zalloc(kmem_cache, GFP_KERNEL);
116 if (s) { 192 if (s) {
117 s->object_size = s->size = size; 193 s->object_size = s->size = size;
118 s->align = align; 194 s->align = calculate_alignment(flags, align, size);
119 s->ctor = ctor; 195 s->ctor = ctor;
196
197 if (memcg_register_cache(memcg, s, parent_cache)) {
198 kmem_cache_free(kmem_cache, s);
199 err = -ENOMEM;
200 goto out_locked;
201 }
202
120 s->name = kstrdup(name, GFP_KERNEL); 203 s->name = kstrdup(name, GFP_KERNEL);
121 if (!s->name) { 204 if (!s->name) {
122 kmem_cache_free(kmem_cache, s); 205 kmem_cache_free(kmem_cache, s);
@@ -126,10 +209,9 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size, size_t align
126 209
127 err = __kmem_cache_create(s, flags); 210 err = __kmem_cache_create(s, flags);
128 if (!err) { 211 if (!err) {
129
130 s->refcount = 1; 212 s->refcount = 1;
131 list_add(&s->list, &slab_caches); 213 list_add(&s->list, &slab_caches);
132 214 memcg_cache_list_add(memcg, s);
133 } else { 215 } else {
134 kfree(s->name); 216 kfree(s->name);
135 kmem_cache_free(kmem_cache, s); 217 kmem_cache_free(kmem_cache, s);
@@ -157,10 +239,20 @@ out_locked:
157 239
158 return s; 240 return s;
159} 241}
242
243struct kmem_cache *
244kmem_cache_create(const char *name, size_t size, size_t align,
245 unsigned long flags, void (*ctor)(void *))
246{
247 return kmem_cache_create_memcg(NULL, name, size, align, flags, ctor, NULL);
248}
160EXPORT_SYMBOL(kmem_cache_create); 249EXPORT_SYMBOL(kmem_cache_create);
161 250
162void kmem_cache_destroy(struct kmem_cache *s) 251void kmem_cache_destroy(struct kmem_cache *s)
163{ 252{
253 /* Destroy all the children caches if we aren't a memcg cache */
254 kmem_cache_destroy_memcg_children(s);
255
164 get_online_cpus(); 256 get_online_cpus();
165 mutex_lock(&slab_mutex); 257 mutex_lock(&slab_mutex);
166 s->refcount--; 258 s->refcount--;
@@ -172,6 +264,7 @@ void kmem_cache_destroy(struct kmem_cache *s)
172 if (s->flags & SLAB_DESTROY_BY_RCU) 264 if (s->flags & SLAB_DESTROY_BY_RCU)
173 rcu_barrier(); 265 rcu_barrier();
174 266
267 memcg_release_cache(s);
175 kfree(s->name); 268 kfree(s->name);
176 kmem_cache_free(kmem_cache, s); 269 kmem_cache_free(kmem_cache, s);
177 } else { 270 } else {
@@ -192,3 +285,182 @@ int slab_is_available(void)
192{ 285{
193 return slab_state >= UP; 286 return slab_state >= UP;
194} 287}
288
289#ifndef CONFIG_SLOB
290/* Create a cache during boot when no slab services are available yet */
291void __init create_boot_cache(struct kmem_cache *s, const char *name, size_t size,
292 unsigned long flags)
293{
294 int err;
295
296 s->name = name;
297 s->size = s->object_size = size;
298 s->align = calculate_alignment(flags, ARCH_KMALLOC_MINALIGN, size);
299 err = __kmem_cache_create(s, flags);
300
301 if (err)
302 panic("Creation of kmalloc slab %s size=%zd failed. Reason %d\n",
303 name, size, err);
304
305 s->refcount = -1; /* Exempt from merging for now */
306}
307
308struct kmem_cache *__init create_kmalloc_cache(const char *name, size_t size,
309 unsigned long flags)
310{
311 struct kmem_cache *s = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT);
312
313 if (!s)
314 panic("Out of memory when creating slab %s\n", name);
315
316 create_boot_cache(s, name, size, flags);
317 list_add(&s->list, &slab_caches);
318 s->refcount = 1;
319 return s;
320}
321
322#endif /* !CONFIG_SLOB */
323
324
325#ifdef CONFIG_SLABINFO
326void print_slabinfo_header(struct seq_file *m)
327{
328 /*
329 * Output format version, so at least we can change it
330 * without _too_ many complaints.
331 */
332#ifdef CONFIG_DEBUG_SLAB
333 seq_puts(m, "slabinfo - version: 2.1 (statistics)\n");
334#else
335 seq_puts(m, "slabinfo - version: 2.1\n");
336#endif
337 seq_puts(m, "# name <active_objs> <num_objs> <objsize> "
338 "<objperslab> <pagesperslab>");
339 seq_puts(m, " : tunables <limit> <batchcount> <sharedfactor>");
340 seq_puts(m, " : slabdata <active_slabs> <num_slabs> <sharedavail>");
341#ifdef CONFIG_DEBUG_SLAB
342 seq_puts(m, " : globalstat <listallocs> <maxobjs> <grown> <reaped> "
343 "<error> <maxfreeable> <nodeallocs> <remotefrees> <alienoverflow>");
344 seq_puts(m, " : cpustat <allochit> <allocmiss> <freehit> <freemiss>");
345#endif
346 seq_putc(m, '\n');
347}
348
349static void *s_start(struct seq_file *m, loff_t *pos)
350{
351 loff_t n = *pos;
352
353 mutex_lock(&slab_mutex);
354 if (!n)
355 print_slabinfo_header(m);
356
357 return seq_list_start(&slab_caches, *pos);
358}
359
360static void *s_next(struct seq_file *m, void *p, loff_t *pos)
361{
362 return seq_list_next(p, &slab_caches, pos);
363}
364
365static void s_stop(struct seq_file *m, void *p)
366{
367 mutex_unlock(&slab_mutex);
368}
369
370static void
371memcg_accumulate_slabinfo(struct kmem_cache *s, struct slabinfo *info)
372{
373 struct kmem_cache *c;
374 struct slabinfo sinfo;
375 int i;
376
377 if (!is_root_cache(s))
378 return;
379
380 for_each_memcg_cache_index(i) {
381 c = cache_from_memcg(s, i);
382 if (!c)
383 continue;
384
385 memset(&sinfo, 0, sizeof(sinfo));
386 get_slabinfo(c, &sinfo);
387
388 info->active_slabs += sinfo.active_slabs;
389 info->num_slabs += sinfo.num_slabs;
390 info->shared_avail += sinfo.shared_avail;
391 info->active_objs += sinfo.active_objs;
392 info->num_objs += sinfo.num_objs;
393 }
394}
395
396int cache_show(struct kmem_cache *s, struct seq_file *m)
397{
398 struct slabinfo sinfo;
399
400 memset(&sinfo, 0, sizeof(sinfo));
401 get_slabinfo(s, &sinfo);
402
403 memcg_accumulate_slabinfo(s, &sinfo);
404
405 seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d",
406 cache_name(s), sinfo.active_objs, sinfo.num_objs, s->size,
407 sinfo.objects_per_slab, (1 << sinfo.cache_order));
408
409 seq_printf(m, " : tunables %4u %4u %4u",
410 sinfo.limit, sinfo.batchcount, sinfo.shared);
411 seq_printf(m, " : slabdata %6lu %6lu %6lu",
412 sinfo.active_slabs, sinfo.num_slabs, sinfo.shared_avail);
413 slabinfo_show_stats(m, s);
414 seq_putc(m, '\n');
415 return 0;
416}
417
418static int s_show(struct seq_file *m, void *p)
419{
420 struct kmem_cache *s = list_entry(p, struct kmem_cache, list);
421
422 if (!is_root_cache(s))
423 return 0;
424 return cache_show(s, m);
425}
426
427/*
428 * slabinfo_op - iterator that generates /proc/slabinfo
429 *
430 * Output layout:
431 * cache-name
432 * num-active-objs
433 * total-objs
434 * object size
435 * num-active-slabs
436 * total-slabs
437 * num-pages-per-slab
438 * + further values on SMP and with statistics enabled
439 */
440static const struct seq_operations slabinfo_op = {
441 .start = s_start,
442 .next = s_next,
443 .stop = s_stop,
444 .show = s_show,
445};
446
447static int slabinfo_open(struct inode *inode, struct file *file)
448{
449 return seq_open(file, &slabinfo_op);
450}
451
452static const struct file_operations proc_slabinfo_operations = {
453 .open = slabinfo_open,
454 .read = seq_read,
455 .write = slabinfo_write,
456 .llseek = seq_lseek,
457 .release = seq_release,
458};
459
460static int __init slab_proc_init(void)
461{
462 proc_create("slabinfo", S_IRUSR, NULL, &proc_slabinfo_operations);
463 return 0;
464}
465module_init(slab_proc_init);
466#endif /* CONFIG_SLABINFO */
diff --git a/mm/slob.c b/mm/slob.c
index 1e921c5e9576..a99fdf7a0907 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -28,9 +28,8 @@
28 * from kmalloc are prepended with a 4-byte header with the kmalloc size. 28 * from kmalloc are prepended with a 4-byte header with the kmalloc size.
29 * If kmalloc is asked for objects of PAGE_SIZE or larger, it calls 29 * If kmalloc is asked for objects of PAGE_SIZE or larger, it calls
30 * alloc_pages() directly, allocating compound pages so the page order 30 * alloc_pages() directly, allocating compound pages so the page order
31 * does not have to be separately tracked, and also stores the exact 31 * does not have to be separately tracked.
32 * allocation size in page->private so that it can be used to accurately 32 * These objects are detected in kfree() because PageSlab()
33 * provide ksize(). These objects are detected in kfree() because slob_page()
34 * is false for them. 33 * is false for them.
35 * 34 *
36 * SLAB is emulated on top of SLOB by simply calling constructors and 35 * SLAB is emulated on top of SLOB by simply calling constructors and
@@ -59,7 +58,6 @@
59 58
60#include <linux/kernel.h> 59#include <linux/kernel.h>
61#include <linux/slab.h> 60#include <linux/slab.h>
62#include "slab.h"
63 61
64#include <linux/mm.h> 62#include <linux/mm.h>
65#include <linux/swap.h> /* struct reclaim_state */ 63#include <linux/swap.h> /* struct reclaim_state */
@@ -74,6 +72,7 @@
74 72
75#include <linux/atomic.h> 73#include <linux/atomic.h>
76 74
75#include "slab.h"
77/* 76/*
78 * slob_block has a field 'units', which indicates size of block if +ve, 77 * slob_block has a field 'units', which indicates size of block if +ve,
79 * or offset of next block if -ve (in SLOB_UNITs). 78 * or offset of next block if -ve (in SLOB_UNITs).
@@ -124,7 +123,6 @@ static inline void clear_slob_page_free(struct page *sp)
124 123
125#define SLOB_UNIT sizeof(slob_t) 124#define SLOB_UNIT sizeof(slob_t)
126#define SLOB_UNITS(size) (((size) + SLOB_UNIT - 1)/SLOB_UNIT) 125#define SLOB_UNITS(size) (((size) + SLOB_UNIT - 1)/SLOB_UNIT)
127#define SLOB_ALIGN L1_CACHE_BYTES
128 126
129/* 127/*
130 * struct slob_rcu is inserted at the tail of allocated slob blocks, which 128 * struct slob_rcu is inserted at the tail of allocated slob blocks, which
@@ -455,11 +453,6 @@ __do_kmalloc_node(size_t size, gfp_t gfp, int node, unsigned long caller)
455 if (likely(order)) 453 if (likely(order))
456 gfp |= __GFP_COMP; 454 gfp |= __GFP_COMP;
457 ret = slob_new_pages(gfp, order, node); 455 ret = slob_new_pages(gfp, order, node);
458 if (ret) {
459 struct page *page;
460 page = virt_to_page(ret);
461 page->private = size;
462 }
463 456
464 trace_kmalloc_node(caller, ret, 457 trace_kmalloc_node(caller, ret,
465 size, PAGE_SIZE << order, gfp, node); 458 size, PAGE_SIZE << order, gfp, node);
@@ -506,7 +499,7 @@ void kfree(const void *block)
506 unsigned int *m = (unsigned int *)(block - align); 499 unsigned int *m = (unsigned int *)(block - align);
507 slob_free(m, *m + align); 500 slob_free(m, *m + align);
508 } else 501 } else
509 put_page(sp); 502 __free_pages(sp, compound_order(sp));
510} 503}
511EXPORT_SYMBOL(kfree); 504EXPORT_SYMBOL(kfree);
512 505
@@ -514,37 +507,30 @@ EXPORT_SYMBOL(kfree);
514size_t ksize(const void *block) 507size_t ksize(const void *block)
515{ 508{
516 struct page *sp; 509 struct page *sp;
510 int align;
511 unsigned int *m;
517 512
518 BUG_ON(!block); 513 BUG_ON(!block);
519 if (unlikely(block == ZERO_SIZE_PTR)) 514 if (unlikely(block == ZERO_SIZE_PTR))
520 return 0; 515 return 0;
521 516
522 sp = virt_to_page(block); 517 sp = virt_to_page(block);
523 if (PageSlab(sp)) { 518 if (unlikely(!PageSlab(sp)))
524 int align = max_t(size_t, ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN); 519 return PAGE_SIZE << compound_order(sp);
525 unsigned int *m = (unsigned int *)(block - align); 520
526 return SLOB_UNITS(*m) * SLOB_UNIT; 521 align = max_t(size_t, ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN);
527 } else 522 m = (unsigned int *)(block - align);
528 return sp->private; 523 return SLOB_UNITS(*m) * SLOB_UNIT;
529} 524}
530EXPORT_SYMBOL(ksize); 525EXPORT_SYMBOL(ksize);
531 526
532int __kmem_cache_create(struct kmem_cache *c, unsigned long flags) 527int __kmem_cache_create(struct kmem_cache *c, unsigned long flags)
533{ 528{
534 size_t align = c->size;
535
536 if (flags & SLAB_DESTROY_BY_RCU) { 529 if (flags & SLAB_DESTROY_BY_RCU) {
537 /* leave room for rcu footer at the end of object */ 530 /* leave room for rcu footer at the end of object */
538 c->size += sizeof(struct slob_rcu); 531 c->size += sizeof(struct slob_rcu);
539 } 532 }
540 c->flags = flags; 533 c->flags = flags;
541 /* ignore alignment unless it's forced */
542 c->align = (flags & SLAB_HWCACHE_ALIGN) ? SLOB_ALIGN : 0;
543 if (c->align < ARCH_SLAB_MINALIGN)
544 c->align = ARCH_SLAB_MINALIGN;
545 if (c->align < align)
546 c->align = align;
547
548 return 0; 534 return 0;
549} 535}
550 536
@@ -558,12 +544,12 @@ void *kmem_cache_alloc_node(struct kmem_cache *c, gfp_t flags, int node)
558 544
559 if (c->size < PAGE_SIZE) { 545 if (c->size < PAGE_SIZE) {
560 b = slob_alloc(c->size, flags, c->align, node); 546 b = slob_alloc(c->size, flags, c->align, node);
561 trace_kmem_cache_alloc_node(_RET_IP_, b, c->size, 547 trace_kmem_cache_alloc_node(_RET_IP_, b, c->object_size,
562 SLOB_UNITS(c->size) * SLOB_UNIT, 548 SLOB_UNITS(c->size) * SLOB_UNIT,
563 flags, node); 549 flags, node);
564 } else { 550 } else {
565 b = slob_new_pages(flags, get_order(c->size), node); 551 b = slob_new_pages(flags, get_order(c->size), node);
566 trace_kmem_cache_alloc_node(_RET_IP_, b, c->size, 552 trace_kmem_cache_alloc_node(_RET_IP_, b, c->object_size,
567 PAGE_SIZE << get_order(c->size), 553 PAGE_SIZE << get_order(c->size),
568 flags, node); 554 flags, node);
569 } 555 }
@@ -608,12 +594,6 @@ void kmem_cache_free(struct kmem_cache *c, void *b)
608} 594}
609EXPORT_SYMBOL(kmem_cache_free); 595EXPORT_SYMBOL(kmem_cache_free);
610 596
611unsigned int kmem_cache_size(struct kmem_cache *c)
612{
613 return c->size;
614}
615EXPORT_SYMBOL(kmem_cache_size);
616
617int __kmem_cache_shutdown(struct kmem_cache *c) 597int __kmem_cache_shutdown(struct kmem_cache *c)
618{ 598{
619 /* No way to check for remaining objects */ 599 /* No way to check for remaining objects */
diff --git a/mm/slub.c b/mm/slub.c
index a0d698467f70..ba2ca53f6c3a 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -31,6 +31,7 @@
31#include <linux/fault-inject.h> 31#include <linux/fault-inject.h>
32#include <linux/stacktrace.h> 32#include <linux/stacktrace.h>
33#include <linux/prefetch.h> 33#include <linux/prefetch.h>
34#include <linux/memcontrol.h>
34 35
35#include <trace/events/kmem.h> 36#include <trace/events/kmem.h>
36 37
@@ -112,9 +113,6 @@
112 * the fast path and disables lockless freelists. 113 * the fast path and disables lockless freelists.
113 */ 114 */
114 115
115#define SLAB_DEBUG_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \
116 SLAB_TRACE | SLAB_DEBUG_FREE)
117
118static inline int kmem_cache_debug(struct kmem_cache *s) 116static inline int kmem_cache_debug(struct kmem_cache *s)
119{ 117{
120#ifdef CONFIG_SLUB_DEBUG 118#ifdef CONFIG_SLUB_DEBUG
@@ -179,8 +177,6 @@ static inline int kmem_cache_debug(struct kmem_cache *s)
179#define __OBJECT_POISON 0x80000000UL /* Poison object */ 177#define __OBJECT_POISON 0x80000000UL /* Poison object */
180#define __CMPXCHG_DOUBLE 0x40000000UL /* Use cmpxchg_double */ 178#define __CMPXCHG_DOUBLE 0x40000000UL /* Use cmpxchg_double */
181 179
182static int kmem_size = sizeof(struct kmem_cache);
183
184#ifdef CONFIG_SMP 180#ifdef CONFIG_SMP
185static struct notifier_block slab_notifier; 181static struct notifier_block slab_notifier;
186#endif 182#endif
@@ -205,13 +201,14 @@ enum track_item { TRACK_ALLOC, TRACK_FREE };
205static int sysfs_slab_add(struct kmem_cache *); 201static int sysfs_slab_add(struct kmem_cache *);
206static int sysfs_slab_alias(struct kmem_cache *, const char *); 202static int sysfs_slab_alias(struct kmem_cache *, const char *);
207static void sysfs_slab_remove(struct kmem_cache *); 203static void sysfs_slab_remove(struct kmem_cache *);
208 204static void memcg_propagate_slab_attrs(struct kmem_cache *s);
209#else 205#else
210static inline int sysfs_slab_add(struct kmem_cache *s) { return 0; } 206static inline int sysfs_slab_add(struct kmem_cache *s) { return 0; }
211static inline int sysfs_slab_alias(struct kmem_cache *s, const char *p) 207static inline int sysfs_slab_alias(struct kmem_cache *s, const char *p)
212 { return 0; } 208 { return 0; }
213static inline void sysfs_slab_remove(struct kmem_cache *s) { } 209static inline void sysfs_slab_remove(struct kmem_cache *s) { }
214 210
211static inline void memcg_propagate_slab_attrs(struct kmem_cache *s) { }
215#endif 212#endif
216 213
217static inline void stat(const struct kmem_cache *s, enum stat_item si) 214static inline void stat(const struct kmem_cache *s, enum stat_item si)
@@ -1092,11 +1089,11 @@ static noinline struct kmem_cache_node *free_debug_processing(
1092 if (!check_object(s, page, object, SLUB_RED_ACTIVE)) 1089 if (!check_object(s, page, object, SLUB_RED_ACTIVE))
1093 goto out; 1090 goto out;
1094 1091
1095 if (unlikely(s != page->slab)) { 1092 if (unlikely(s != page->slab_cache)) {
1096 if (!PageSlab(page)) { 1093 if (!PageSlab(page)) {
1097 slab_err(s, page, "Attempt to free object(0x%p) " 1094 slab_err(s, page, "Attempt to free object(0x%p) "
1098 "outside of slab", object); 1095 "outside of slab", object);
1099 } else if (!page->slab) { 1096 } else if (!page->slab_cache) {
1100 printk(KERN_ERR 1097 printk(KERN_ERR
1101 "SLUB <none>: no slab for object 0x%p.\n", 1098 "SLUB <none>: no slab for object 0x%p.\n",
1102 object); 1099 object);
@@ -1348,6 +1345,7 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
1348 void *start; 1345 void *start;
1349 void *last; 1346 void *last;
1350 void *p; 1347 void *p;
1348 int order;
1351 1349
1352 BUG_ON(flags & GFP_SLAB_BUG_MASK); 1350 BUG_ON(flags & GFP_SLAB_BUG_MASK);
1353 1351
@@ -1356,8 +1354,10 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
1356 if (!page) 1354 if (!page)
1357 goto out; 1355 goto out;
1358 1356
1357 order = compound_order(page);
1359 inc_slabs_node(s, page_to_nid(page), page->objects); 1358 inc_slabs_node(s, page_to_nid(page), page->objects);
1360 page->slab = s; 1359 memcg_bind_pages(s, order);
1360 page->slab_cache = s;
1361 __SetPageSlab(page); 1361 __SetPageSlab(page);
1362 if (page->pfmemalloc) 1362 if (page->pfmemalloc)
1363 SetPageSlabPfmemalloc(page); 1363 SetPageSlabPfmemalloc(page);
@@ -1365,7 +1365,7 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
1365 start = page_address(page); 1365 start = page_address(page);
1366 1366
1367 if (unlikely(s->flags & SLAB_POISON)) 1367 if (unlikely(s->flags & SLAB_POISON))
1368 memset(start, POISON_INUSE, PAGE_SIZE << compound_order(page)); 1368 memset(start, POISON_INUSE, PAGE_SIZE << order);
1369 1369
1370 last = start; 1370 last = start;
1371 for_each_object(p, s, start, page->objects) { 1371 for_each_object(p, s, start, page->objects) {
@@ -1406,10 +1406,12 @@ static void __free_slab(struct kmem_cache *s, struct page *page)
1406 1406
1407 __ClearPageSlabPfmemalloc(page); 1407 __ClearPageSlabPfmemalloc(page);
1408 __ClearPageSlab(page); 1408 __ClearPageSlab(page);
1409
1410 memcg_release_pages(s, order);
1409 reset_page_mapcount(page); 1411 reset_page_mapcount(page);
1410 if (current->reclaim_state) 1412 if (current->reclaim_state)
1411 current->reclaim_state->reclaimed_slab += pages; 1413 current->reclaim_state->reclaimed_slab += pages;
1412 __free_pages(page, order); 1414 __free_memcg_kmem_pages(page, order);
1413} 1415}
1414 1416
1415#define need_reserve_slab_rcu \ 1417#define need_reserve_slab_rcu \
@@ -1424,7 +1426,7 @@ static void rcu_free_slab(struct rcu_head *h)
1424 else 1426 else
1425 page = container_of((struct list_head *)h, struct page, lru); 1427 page = container_of((struct list_head *)h, struct page, lru);
1426 1428
1427 __free_slab(page->slab, page); 1429 __free_slab(page->slab_cache, page);
1428} 1430}
1429 1431
1430static void free_slab(struct kmem_cache *s, struct page *page) 1432static void free_slab(struct kmem_cache *s, struct page *page)
@@ -1872,12 +1874,14 @@ redo:
1872/* 1874/*
1873 * Unfreeze all the cpu partial slabs. 1875 * Unfreeze all the cpu partial slabs.
1874 * 1876 *
1875 * This function must be called with interrupt disabled. 1877 * This function must be called with interrupts disabled
1878 * for the cpu using c (or some other guarantee must be there
1879 * to guarantee no concurrent accesses).
1876 */ 1880 */
1877static void unfreeze_partials(struct kmem_cache *s) 1881static void unfreeze_partials(struct kmem_cache *s,
1882 struct kmem_cache_cpu *c)
1878{ 1883{
1879 struct kmem_cache_node *n = NULL, *n2 = NULL; 1884 struct kmem_cache_node *n = NULL, *n2 = NULL;
1880 struct kmem_cache_cpu *c = this_cpu_ptr(s->cpu_slab);
1881 struct page *page, *discard_page = NULL; 1885 struct page *page, *discard_page = NULL;
1882 1886
1883 while ((page = c->partial)) { 1887 while ((page = c->partial)) {
@@ -1963,7 +1967,7 @@ static int put_cpu_partial(struct kmem_cache *s, struct page *page, int drain)
1963 * set to the per node partial list. 1967 * set to the per node partial list.
1964 */ 1968 */
1965 local_irq_save(flags); 1969 local_irq_save(flags);
1966 unfreeze_partials(s); 1970 unfreeze_partials(s, this_cpu_ptr(s->cpu_slab));
1967 local_irq_restore(flags); 1971 local_irq_restore(flags);
1968 oldpage = NULL; 1972 oldpage = NULL;
1969 pobjects = 0; 1973 pobjects = 0;
@@ -2006,7 +2010,7 @@ static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu)
2006 if (c->page) 2010 if (c->page)
2007 flush_slab(s, c); 2011 flush_slab(s, c);
2008 2012
2009 unfreeze_partials(s); 2013 unfreeze_partials(s, c);
2010 } 2014 }
2011} 2015}
2012 2016
@@ -2325,6 +2329,7 @@ static __always_inline void *slab_alloc_node(struct kmem_cache *s,
2325 if (slab_pre_alloc_hook(s, gfpflags)) 2329 if (slab_pre_alloc_hook(s, gfpflags))
2326 return NULL; 2330 return NULL;
2327 2331
2332 s = memcg_kmem_get_cache(s, gfpflags);
2328redo: 2333redo:
2329 2334
2330 /* 2335 /*
@@ -2459,7 +2464,6 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
2459 void *prior; 2464 void *prior;
2460 void **object = (void *)x; 2465 void **object = (void *)x;
2461 int was_frozen; 2466 int was_frozen;
2462 int inuse;
2463 struct page new; 2467 struct page new;
2464 unsigned long counters; 2468 unsigned long counters;
2465 struct kmem_cache_node *n = NULL; 2469 struct kmem_cache_node *n = NULL;
@@ -2472,13 +2476,17 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
2472 return; 2476 return;
2473 2477
2474 do { 2478 do {
2479 if (unlikely(n)) {
2480 spin_unlock_irqrestore(&n->list_lock, flags);
2481 n = NULL;
2482 }
2475 prior = page->freelist; 2483 prior = page->freelist;
2476 counters = page->counters; 2484 counters = page->counters;
2477 set_freepointer(s, object, prior); 2485 set_freepointer(s, object, prior);
2478 new.counters = counters; 2486 new.counters = counters;
2479 was_frozen = new.frozen; 2487 was_frozen = new.frozen;
2480 new.inuse--; 2488 new.inuse--;
2481 if ((!new.inuse || !prior) && !was_frozen && !n) { 2489 if ((!new.inuse || !prior) && !was_frozen) {
2482 2490
2483 if (!kmem_cache_debug(s) && !prior) 2491 if (!kmem_cache_debug(s) && !prior)
2484 2492
@@ -2503,7 +2511,6 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
2503 2511
2504 } 2512 }
2505 } 2513 }
2506 inuse = new.inuse;
2507 2514
2508 } while (!cmpxchg_double_slab(s, page, 2515 } while (!cmpxchg_double_slab(s, page,
2509 prior, counters, 2516 prior, counters,
@@ -2529,25 +2536,17 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
2529 return; 2536 return;
2530 } 2537 }
2531 2538
2539 if (unlikely(!new.inuse && n->nr_partial > s->min_partial))
2540 goto slab_empty;
2541
2532 /* 2542 /*
2533 * was_frozen may have been set after we acquired the list_lock in 2543 * Objects left in the slab. If it was not on the partial list before
2534 * an earlier loop. So we need to check it here again. 2544 * then add it.
2535 */ 2545 */
2536 if (was_frozen) 2546 if (kmem_cache_debug(s) && unlikely(!prior)) {
2537 stat(s, FREE_FROZEN); 2547 remove_full(s, page);
2538 else { 2548 add_partial(n, page, DEACTIVATE_TO_TAIL);
2539 if (unlikely(!inuse && n->nr_partial > s->min_partial)) 2549 stat(s, FREE_ADD_PARTIAL);
2540 goto slab_empty;
2541
2542 /*
2543 * Objects left in the slab. If it was not on the partial list before
2544 * then add it.
2545 */
2546 if (unlikely(!prior)) {
2547 remove_full(s, page);
2548 add_partial(n, page, DEACTIVATE_TO_TAIL);
2549 stat(s, FREE_ADD_PARTIAL);
2550 }
2551 } 2550 }
2552 spin_unlock_irqrestore(&n->list_lock, flags); 2551 spin_unlock_irqrestore(&n->list_lock, flags);
2553 return; 2552 return;
@@ -2619,19 +2618,10 @@ redo:
2619 2618
2620void kmem_cache_free(struct kmem_cache *s, void *x) 2619void kmem_cache_free(struct kmem_cache *s, void *x)
2621{ 2620{
2622 struct page *page; 2621 s = cache_from_obj(s, x);
2623 2622 if (!s)
2624 page = virt_to_head_page(x);
2625
2626 if (kmem_cache_debug(s) && page->slab != s) {
2627 pr_err("kmem_cache_free: Wrong slab cache. %s but object"
2628 " is from %s\n", page->slab->name, s->name);
2629 WARN_ON_ONCE(1);
2630 return; 2623 return;
2631 } 2624 slab_free(s, virt_to_head_page(x), x, _RET_IP_);
2632
2633 slab_free(s, page, x, _RET_IP_);
2634
2635 trace_kmem_cache_free(_RET_IP_, x); 2625 trace_kmem_cache_free(_RET_IP_, x);
2636} 2626}
2637EXPORT_SYMBOL(kmem_cache_free); 2627EXPORT_SYMBOL(kmem_cache_free);
@@ -2769,32 +2759,6 @@ static inline int calculate_order(int size, int reserved)
2769 return -ENOSYS; 2759 return -ENOSYS;
2770} 2760}
2771 2761
2772/*
2773 * Figure out what the alignment of the objects will be.
2774 */
2775static unsigned long calculate_alignment(unsigned long flags,
2776 unsigned long align, unsigned long size)
2777{
2778 /*
2779 * If the user wants hardware cache aligned objects then follow that
2780 * suggestion if the object is sufficiently large.
2781 *
2782 * The hardware cache alignment cannot override the specified
2783 * alignment though. If that is greater then use it.
2784 */
2785 if (flags & SLAB_HWCACHE_ALIGN) {
2786 unsigned long ralign = cache_line_size();
2787 while (size <= ralign / 2)
2788 ralign /= 2;
2789 align = max(align, ralign);
2790 }
2791
2792 if (align < ARCH_SLAB_MINALIGN)
2793 align = ARCH_SLAB_MINALIGN;
2794
2795 return ALIGN(align, sizeof(void *));
2796}
2797
2798static void 2762static void
2799init_kmem_cache_node(struct kmem_cache_node *n) 2763init_kmem_cache_node(struct kmem_cache_node *n)
2800{ 2764{
@@ -2928,7 +2892,6 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order)
2928{ 2892{
2929 unsigned long flags = s->flags; 2893 unsigned long flags = s->flags;
2930 unsigned long size = s->object_size; 2894 unsigned long size = s->object_size;
2931 unsigned long align = s->align;
2932 int order; 2895 int order;
2933 2896
2934 /* 2897 /*
@@ -3000,19 +2963,11 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order)
3000#endif 2963#endif
3001 2964
3002 /* 2965 /*
3003 * Determine the alignment based on various parameters that the
3004 * user specified and the dynamic determination of cache line size
3005 * on bootup.
3006 */
3007 align = calculate_alignment(flags, align, s->object_size);
3008 s->align = align;
3009
3010 /*
3011 * SLUB stores one object immediately after another beginning from 2966 * SLUB stores one object immediately after another beginning from
3012 * offset 0. In order to align the objects we have to simply size 2967 * offset 0. In order to align the objects we have to simply size
3013 * each object to conform to the alignment. 2968 * each object to conform to the alignment.
3014 */ 2969 */
3015 size = ALIGN(size, align); 2970 size = ALIGN(size, s->align);
3016 s->size = size; 2971 s->size = size;
3017 if (forced_order >= 0) 2972 if (forced_order >= 0)
3018 order = forced_order; 2973 order = forced_order;
@@ -3041,7 +2996,6 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order)
3041 s->max = s->oo; 2996 s->max = s->oo;
3042 2997
3043 return !!oo_objects(s->oo); 2998 return !!oo_objects(s->oo);
3044
3045} 2999}
3046 3000
3047static int kmem_cache_open(struct kmem_cache *s, unsigned long flags) 3001static int kmem_cache_open(struct kmem_cache *s, unsigned long flags)
@@ -3127,15 +3081,6 @@ error:
3127 return -EINVAL; 3081 return -EINVAL;
3128} 3082}
3129 3083
3130/*
3131 * Determine the size of a slab object
3132 */
3133unsigned int kmem_cache_size(struct kmem_cache *s)
3134{
3135 return s->object_size;
3136}
3137EXPORT_SYMBOL(kmem_cache_size);
3138
3139static void list_slab_objects(struct kmem_cache *s, struct page *page, 3084static void list_slab_objects(struct kmem_cache *s, struct page *page,
3140 const char *text) 3085 const char *text)
3141{ 3086{
@@ -3208,8 +3153,19 @@ int __kmem_cache_shutdown(struct kmem_cache *s)
3208{ 3153{
3209 int rc = kmem_cache_close(s); 3154 int rc = kmem_cache_close(s);
3210 3155
3211 if (!rc) 3156 if (!rc) {
3157 /*
3158 * We do the same lock strategy around sysfs_slab_add, see
3159 * __kmem_cache_create. Because this is pretty much the last
3160 * operation we do and the lock will be released shortly after
3161 * that in slab_common.c, we could just move sysfs_slab_remove
3162 * to a later point in common code. We should do that when we
3163 * have a common sysfs framework for all allocators.
3164 */
3165 mutex_unlock(&slab_mutex);
3212 sysfs_slab_remove(s); 3166 sysfs_slab_remove(s);
3167 mutex_lock(&slab_mutex);
3168 }
3213 3169
3214 return rc; 3170 return rc;
3215} 3171}
@@ -3261,32 +3217,6 @@ static int __init setup_slub_nomerge(char *str)
3261 3217
3262__setup("slub_nomerge", setup_slub_nomerge); 3218__setup("slub_nomerge", setup_slub_nomerge);
3263 3219
3264static struct kmem_cache *__init create_kmalloc_cache(const char *name,
3265 int size, unsigned int flags)
3266{
3267 struct kmem_cache *s;
3268
3269 s = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT);
3270
3271 s->name = name;
3272 s->size = s->object_size = size;
3273 s->align = ARCH_KMALLOC_MINALIGN;
3274
3275 /*
3276 * This function is called with IRQs disabled during early-boot on
3277 * single CPU so there's no need to take slab_mutex here.
3278 */
3279 if (kmem_cache_open(s, flags))
3280 goto panic;
3281
3282 list_add(&s->list, &slab_caches);
3283 return s;
3284
3285panic:
3286 panic("Creation of kmalloc slab %s size=%d failed.\n", name, size);
3287 return NULL;
3288}
3289
3290/* 3220/*
3291 * Conversion table for small slabs sizes / 8 to the index in the 3221 * Conversion table for small slabs sizes / 8 to the index in the
3292 * kmalloc array. This is necessary for slabs < 192 since we have non power 3222 * kmalloc array. This is necessary for slabs < 192 since we have non power
@@ -3372,7 +3302,7 @@ static void *kmalloc_large_node(size_t size, gfp_t flags, int node)
3372 struct page *page; 3302 struct page *page;
3373 void *ptr = NULL; 3303 void *ptr = NULL;
3374 3304
3375 flags |= __GFP_COMP | __GFP_NOTRACK; 3305 flags |= __GFP_COMP | __GFP_NOTRACK | __GFP_KMEMCG;
3376 page = alloc_pages_node(node, flags, get_order(size)); 3306 page = alloc_pages_node(node, flags, get_order(size));
3377 if (page) 3307 if (page)
3378 ptr = page_address(page); 3308 ptr = page_address(page);
@@ -3424,7 +3354,7 @@ size_t ksize(const void *object)
3424 return PAGE_SIZE << compound_order(page); 3354 return PAGE_SIZE << compound_order(page);
3425 } 3355 }
3426 3356
3427 return slab_ksize(page->slab); 3357 return slab_ksize(page->slab_cache);
3428} 3358}
3429EXPORT_SYMBOL(ksize); 3359EXPORT_SYMBOL(ksize);
3430 3360
@@ -3449,8 +3379,8 @@ bool verify_mem_not_deleted(const void *x)
3449 } 3379 }
3450 3380
3451 slab_lock(page); 3381 slab_lock(page);
3452 if (on_freelist(page->slab, page, object)) { 3382 if (on_freelist(page->slab_cache, page, object)) {
3453 object_err(page->slab, page, object, "Object is on free-list"); 3383 object_err(page->slab_cache, page, object, "Object is on free-list");
3454 rv = false; 3384 rv = false;
3455 } else { 3385 } else {
3456 rv = true; 3386 rv = true;
@@ -3478,10 +3408,10 @@ void kfree(const void *x)
3478 if (unlikely(!PageSlab(page))) { 3408 if (unlikely(!PageSlab(page))) {
3479 BUG_ON(!PageCompound(page)); 3409 BUG_ON(!PageCompound(page));
3480 kmemleak_free(x); 3410 kmemleak_free(x);
3481 __free_pages(page, compound_order(page)); 3411 __free_memcg_kmem_pages(page, compound_order(page));
3482 return; 3412 return;
3483 } 3413 }
3484 slab_free(page->slab, page, object, _RET_IP_); 3414 slab_free(page->slab_cache, page, object, _RET_IP_);
3485} 3415}
3486EXPORT_SYMBOL(kfree); 3416EXPORT_SYMBOL(kfree);
3487 3417
@@ -3573,7 +3503,7 @@ static void slab_mem_offline_callback(void *arg)
3573 struct memory_notify *marg = arg; 3503 struct memory_notify *marg = arg;
3574 int offline_node; 3504 int offline_node;
3575 3505
3576 offline_node = marg->status_change_nid; 3506 offline_node = marg->status_change_nid_normal;
3577 3507
3578 /* 3508 /*
3579 * If the node still has available memory. we need kmem_cache_node 3509 * If the node still has available memory. we need kmem_cache_node
@@ -3606,7 +3536,7 @@ static int slab_mem_going_online_callback(void *arg)
3606 struct kmem_cache_node *n; 3536 struct kmem_cache_node *n;
3607 struct kmem_cache *s; 3537 struct kmem_cache *s;
3608 struct memory_notify *marg = arg; 3538 struct memory_notify *marg = arg;
3609 int nid = marg->status_change_nid; 3539 int nid = marg->status_change_nid_normal;
3610 int ret = 0; 3540 int ret = 0;
3611 3541
3612 /* 3542 /*
@@ -3676,15 +3606,16 @@ static int slab_memory_callback(struct notifier_block *self,
3676 3606
3677/* 3607/*
3678 * Used for early kmem_cache structures that were allocated using 3608 * Used for early kmem_cache structures that were allocated using
3679 * the page allocator 3609 * the page allocator. Allocate them properly then fix up the pointers
3610 * that may be pointing to the wrong kmem_cache structure.
3680 */ 3611 */
3681 3612
3682static void __init kmem_cache_bootstrap_fixup(struct kmem_cache *s) 3613static struct kmem_cache * __init bootstrap(struct kmem_cache *static_cache)
3683{ 3614{
3684 int node; 3615 int node;
3616 struct kmem_cache *s = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT);
3685 3617
3686 list_add(&s->list, &slab_caches); 3618 memcpy(s, static_cache, kmem_cache->object_size);
3687 s->refcount = -1;
3688 3619
3689 for_each_node_state(node, N_NORMAL_MEMORY) { 3620 for_each_node_state(node, N_NORMAL_MEMORY) {
3690 struct kmem_cache_node *n = get_node(s, node); 3621 struct kmem_cache_node *n = get_node(s, node);
@@ -3692,78 +3623,52 @@ static void __init kmem_cache_bootstrap_fixup(struct kmem_cache *s)
3692 3623
3693 if (n) { 3624 if (n) {
3694 list_for_each_entry(p, &n->partial, lru) 3625 list_for_each_entry(p, &n->partial, lru)
3695 p->slab = s; 3626 p->slab_cache = s;
3696 3627
3697#ifdef CONFIG_SLUB_DEBUG 3628#ifdef CONFIG_SLUB_DEBUG
3698 list_for_each_entry(p, &n->full, lru) 3629 list_for_each_entry(p, &n->full, lru)
3699 p->slab = s; 3630 p->slab_cache = s;
3700#endif 3631#endif
3701 } 3632 }
3702 } 3633 }
3634 list_add(&s->list, &slab_caches);
3635 return s;
3703} 3636}
3704 3637
3705void __init kmem_cache_init(void) 3638void __init kmem_cache_init(void)
3706{ 3639{
3640 static __initdata struct kmem_cache boot_kmem_cache,
3641 boot_kmem_cache_node;
3707 int i; 3642 int i;
3708 int caches = 0; 3643 int caches = 2;
3709 struct kmem_cache *temp_kmem_cache;
3710 int order;
3711 struct kmem_cache *temp_kmem_cache_node;
3712 unsigned long kmalloc_size;
3713 3644
3714 if (debug_guardpage_minorder()) 3645 if (debug_guardpage_minorder())
3715 slub_max_order = 0; 3646 slub_max_order = 0;
3716 3647
3717 kmem_size = offsetof(struct kmem_cache, node) + 3648 kmem_cache_node = &boot_kmem_cache_node;
3718 nr_node_ids * sizeof(struct kmem_cache_node *); 3649 kmem_cache = &boot_kmem_cache;
3719
3720 /* Allocate two kmem_caches from the page allocator */
3721 kmalloc_size = ALIGN(kmem_size, cache_line_size());
3722 order = get_order(2 * kmalloc_size);
3723 kmem_cache = (void *)__get_free_pages(GFP_NOWAIT | __GFP_ZERO, order);
3724
3725 /*
3726 * Must first have the slab cache available for the allocations of the
3727 * struct kmem_cache_node's. There is special bootstrap code in
3728 * kmem_cache_open for slab_state == DOWN.
3729 */
3730 kmem_cache_node = (void *)kmem_cache + kmalloc_size;
3731 3650
3732 kmem_cache_node->name = "kmem_cache_node"; 3651 create_boot_cache(kmem_cache_node, "kmem_cache_node",
3733 kmem_cache_node->size = kmem_cache_node->object_size = 3652 sizeof(struct kmem_cache_node), SLAB_HWCACHE_ALIGN);
3734 sizeof(struct kmem_cache_node);
3735 kmem_cache_open(kmem_cache_node, SLAB_HWCACHE_ALIGN | SLAB_PANIC);
3736 3653
3737 hotplug_memory_notifier(slab_memory_callback, SLAB_CALLBACK_PRI); 3654 hotplug_memory_notifier(slab_memory_callback, SLAB_CALLBACK_PRI);
3738 3655
3739 /* Able to allocate the per node structures */ 3656 /* Able to allocate the per node structures */
3740 slab_state = PARTIAL; 3657 slab_state = PARTIAL;
3741 3658
3742 temp_kmem_cache = kmem_cache; 3659 create_boot_cache(kmem_cache, "kmem_cache",
3743 kmem_cache->name = "kmem_cache"; 3660 offsetof(struct kmem_cache, node) +
3744 kmem_cache->size = kmem_cache->object_size = kmem_size; 3661 nr_node_ids * sizeof(struct kmem_cache_node *),
3745 kmem_cache_open(kmem_cache, SLAB_HWCACHE_ALIGN | SLAB_PANIC); 3662 SLAB_HWCACHE_ALIGN);
3746 3663
3747 kmem_cache = kmem_cache_alloc(kmem_cache, GFP_NOWAIT); 3664 kmem_cache = bootstrap(&boot_kmem_cache);
3748 memcpy(kmem_cache, temp_kmem_cache, kmem_size);
3749 3665
3750 /* 3666 /*
3751 * Allocate kmem_cache_node properly from the kmem_cache slab. 3667 * Allocate kmem_cache_node properly from the kmem_cache slab.
3752 * kmem_cache_node is separately allocated so no need to 3668 * kmem_cache_node is separately allocated so no need to
3753 * update any list pointers. 3669 * update any list pointers.
3754 */ 3670 */
3755 temp_kmem_cache_node = kmem_cache_node; 3671 kmem_cache_node = bootstrap(&boot_kmem_cache_node);
3756
3757 kmem_cache_node = kmem_cache_alloc(kmem_cache, GFP_NOWAIT);
3758 memcpy(kmem_cache_node, temp_kmem_cache_node, kmem_size);
3759
3760 kmem_cache_bootstrap_fixup(kmem_cache_node);
3761
3762 caches++;
3763 kmem_cache_bootstrap_fixup(kmem_cache);
3764 caches++;
3765 /* Free temporary boot structure */
3766 free_pages((unsigned long)temp_kmem_cache, order);
3767 3672
3768 /* Now we can use the kmem_cache to allocate kmalloc slabs */ 3673 /* Now we can use the kmem_cache to allocate kmalloc slabs */
3769 3674
@@ -3891,7 +3796,7 @@ static int slab_unmergeable(struct kmem_cache *s)
3891 return 0; 3796 return 0;
3892} 3797}
3893 3798
3894static struct kmem_cache *find_mergeable(size_t size, 3799static struct kmem_cache *find_mergeable(struct mem_cgroup *memcg, size_t size,
3895 size_t align, unsigned long flags, const char *name, 3800 size_t align, unsigned long flags, const char *name,
3896 void (*ctor)(void *)) 3801 void (*ctor)(void *))
3897{ 3802{
@@ -3927,17 +3832,21 @@ static struct kmem_cache *find_mergeable(size_t size,
3927 if (s->size - size >= sizeof(void *)) 3832 if (s->size - size >= sizeof(void *))
3928 continue; 3833 continue;
3929 3834
3835 if (!cache_match_memcg(s, memcg))
3836 continue;
3837
3930 return s; 3838 return s;
3931 } 3839 }
3932 return NULL; 3840 return NULL;
3933} 3841}
3934 3842
3935struct kmem_cache *__kmem_cache_alias(const char *name, size_t size, 3843struct kmem_cache *
3936 size_t align, unsigned long flags, void (*ctor)(void *)) 3844__kmem_cache_alias(struct mem_cgroup *memcg, const char *name, size_t size,
3845 size_t align, unsigned long flags, void (*ctor)(void *))
3937{ 3846{
3938 struct kmem_cache *s; 3847 struct kmem_cache *s;
3939 3848
3940 s = find_mergeable(size, align, flags, name, ctor); 3849 s = find_mergeable(memcg, size, align, flags, name, ctor);
3941 if (s) { 3850 if (s) {
3942 s->refcount++; 3851 s->refcount++;
3943 /* 3852 /*
@@ -3964,6 +3873,11 @@ int __kmem_cache_create(struct kmem_cache *s, unsigned long flags)
3964 if (err) 3873 if (err)
3965 return err; 3874 return err;
3966 3875
3876 /* Mutex is not taken during early boot */
3877 if (slab_state <= UP)
3878 return 0;
3879
3880 memcg_propagate_slab_attrs(s);
3967 mutex_unlock(&slab_mutex); 3881 mutex_unlock(&slab_mutex);
3968 err = sysfs_slab_add(s); 3882 err = sysfs_slab_add(s);
3969 mutex_lock(&slab_mutex); 3883 mutex_lock(&slab_mutex);
@@ -5197,10 +5111,95 @@ static ssize_t slab_attr_store(struct kobject *kobj,
5197 return -EIO; 5111 return -EIO;
5198 5112
5199 err = attribute->store(s, buf, len); 5113 err = attribute->store(s, buf, len);
5114#ifdef CONFIG_MEMCG_KMEM
5115 if (slab_state >= FULL && err >= 0 && is_root_cache(s)) {
5116 int i;
5117
5118 mutex_lock(&slab_mutex);
5119 if (s->max_attr_size < len)
5120 s->max_attr_size = len;
5200 5121
5122 /*
5123 * This is a best effort propagation, so this function's return
5124 * value will be determined by the parent cache only. This is
5125 * basically because not all attributes will have a well
5126 * defined semantics for rollbacks - most of the actions will
5127 * have permanent effects.
5128 *
5129 * Returning the error value of any of the children that fail
5130 * is not 100 % defined, in the sense that users seeing the
5131 * error code won't be able to know anything about the state of
5132 * the cache.
5133 *
5134 * Only returning the error code for the parent cache at least
5135 * has well defined semantics. The cache being written to
5136 * directly either failed or succeeded, in which case we loop
5137 * through the descendants with best-effort propagation.
5138 */
5139 for_each_memcg_cache_index(i) {
5140 struct kmem_cache *c = cache_from_memcg(s, i);
5141 if (c)
5142 attribute->store(c, buf, len);
5143 }
5144 mutex_unlock(&slab_mutex);
5145 }
5146#endif
5201 return err; 5147 return err;
5202} 5148}
5203 5149
5150static void memcg_propagate_slab_attrs(struct kmem_cache *s)
5151{
5152#ifdef CONFIG_MEMCG_KMEM
5153 int i;
5154 char *buffer = NULL;
5155
5156 if (!is_root_cache(s))
5157 return;
5158
5159 /*
5160 * This mean this cache had no attribute written. Therefore, no point
5161 * in copying default values around
5162 */
5163 if (!s->max_attr_size)
5164 return;
5165
5166 for (i = 0; i < ARRAY_SIZE(slab_attrs); i++) {
5167 char mbuf[64];
5168 char *buf;
5169 struct slab_attribute *attr = to_slab_attr(slab_attrs[i]);
5170
5171 if (!attr || !attr->store || !attr->show)
5172 continue;
5173
5174 /*
5175 * It is really bad that we have to allocate here, so we will
5176 * do it only as a fallback. If we actually allocate, though,
5177 * we can just use the allocated buffer until the end.
5178 *
5179 * Most of the slub attributes will tend to be very small in
5180 * size, but sysfs allows buffers up to a page, so they can
5181 * theoretically happen.
5182 */
5183 if (buffer)
5184 buf = buffer;
5185 else if (s->max_attr_size < ARRAY_SIZE(mbuf))
5186 buf = mbuf;
5187 else {
5188 buffer = (char *) get_zeroed_page(GFP_KERNEL);
5189 if (WARN_ON(!buffer))
5190 continue;
5191 buf = buffer;
5192 }
5193
5194 attr->show(s->memcg_params->root_cache, buf);
5195 attr->store(s, buf, strlen(buf));
5196 }
5197
5198 if (buffer)
5199 free_page((unsigned long)buffer);
5200#endif
5201}
5202
5204static const struct sysfs_ops slab_sysfs_ops = { 5203static const struct sysfs_ops slab_sysfs_ops = {
5205 .show = slab_attr_show, 5204 .show = slab_attr_show,
5206 .store = slab_attr_store, 5205 .store = slab_attr_store,
@@ -5257,6 +5256,12 @@ static char *create_unique_id(struct kmem_cache *s)
5257 if (p != name + 1) 5256 if (p != name + 1)
5258 *p++ = '-'; 5257 *p++ = '-';
5259 p += sprintf(p, "%07d", s->size); 5258 p += sprintf(p, "%07d", s->size);
5259
5260#ifdef CONFIG_MEMCG_KMEM
5261 if (!is_root_cache(s))
5262 p += sprintf(p, "-%08d", memcg_cache_id(s->memcg_params->memcg));
5263#endif
5264
5260 BUG_ON(p > name + ID_STR_LENGTH - 1); 5265 BUG_ON(p > name + ID_STR_LENGTH - 1);
5261 return name; 5266 return name;
5262} 5267}
@@ -5265,13 +5270,8 @@ static int sysfs_slab_add(struct kmem_cache *s)
5265{ 5270{
5266 int err; 5271 int err;
5267 const char *name; 5272 const char *name;
5268 int unmergeable; 5273 int unmergeable = slab_unmergeable(s);
5269
5270 if (slab_state < FULL)
5271 /* Defer until later */
5272 return 0;
5273 5274
5274 unmergeable = slab_unmergeable(s);
5275 if (unmergeable) { 5275 if (unmergeable) {
5276 /* 5276 /*
5277 * Slabcache can never be merged so we can use the name proper. 5277 * Slabcache can never be merged so we can use the name proper.
@@ -5405,49 +5405,14 @@ __initcall(slab_sysfs_init);
5405 * The /proc/slabinfo ABI 5405 * The /proc/slabinfo ABI
5406 */ 5406 */
5407#ifdef CONFIG_SLABINFO 5407#ifdef CONFIG_SLABINFO
5408static void print_slabinfo_header(struct seq_file *m) 5408void get_slabinfo(struct kmem_cache *s, struct slabinfo *sinfo)
5409{
5410 seq_puts(m, "slabinfo - version: 2.1\n");
5411 seq_puts(m, "# name <active_objs> <num_objs> <object_size> "
5412 "<objperslab> <pagesperslab>");
5413 seq_puts(m, " : tunables <limit> <batchcount> <sharedfactor>");
5414 seq_puts(m, " : slabdata <active_slabs> <num_slabs> <sharedavail>");
5415 seq_putc(m, '\n');
5416}
5417
5418static void *s_start(struct seq_file *m, loff_t *pos)
5419{
5420 loff_t n = *pos;
5421
5422 mutex_lock(&slab_mutex);
5423 if (!n)
5424 print_slabinfo_header(m);
5425
5426 return seq_list_start(&slab_caches, *pos);
5427}
5428
5429static void *s_next(struct seq_file *m, void *p, loff_t *pos)
5430{
5431 return seq_list_next(p, &slab_caches, pos);
5432}
5433
5434static void s_stop(struct seq_file *m, void *p)
5435{
5436 mutex_unlock(&slab_mutex);
5437}
5438
5439static int s_show(struct seq_file *m, void *p)
5440{ 5409{
5441 unsigned long nr_partials = 0; 5410 unsigned long nr_partials = 0;
5442 unsigned long nr_slabs = 0; 5411 unsigned long nr_slabs = 0;
5443 unsigned long nr_inuse = 0;
5444 unsigned long nr_objs = 0; 5412 unsigned long nr_objs = 0;
5445 unsigned long nr_free = 0; 5413 unsigned long nr_free = 0;
5446 struct kmem_cache *s;
5447 int node; 5414 int node;
5448 5415
5449 s = list_entry(p, struct kmem_cache, list);
5450
5451 for_each_online_node(node) { 5416 for_each_online_node(node) {
5452 struct kmem_cache_node *n = get_node(s, node); 5417 struct kmem_cache_node *n = get_node(s, node);
5453 5418
@@ -5460,41 +5425,21 @@ static int s_show(struct seq_file *m, void *p)
5460 nr_free += count_partial(n, count_free); 5425 nr_free += count_partial(n, count_free);
5461 } 5426 }
5462 5427
5463 nr_inuse = nr_objs - nr_free; 5428 sinfo->active_objs = nr_objs - nr_free;
5464 5429 sinfo->num_objs = nr_objs;
5465 seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d", s->name, nr_inuse, 5430 sinfo->active_slabs = nr_slabs;
5466 nr_objs, s->size, oo_objects(s->oo), 5431 sinfo->num_slabs = nr_slabs;
5467 (1 << oo_order(s->oo))); 5432 sinfo->objects_per_slab = oo_objects(s->oo);
5468 seq_printf(m, " : tunables %4u %4u %4u", 0, 0, 0); 5433 sinfo->cache_order = oo_order(s->oo);
5469 seq_printf(m, " : slabdata %6lu %6lu %6lu", nr_slabs, nr_slabs,
5470 0UL);
5471 seq_putc(m, '\n');
5472 return 0;
5473} 5434}
5474 5435
5475static const struct seq_operations slabinfo_op = { 5436void slabinfo_show_stats(struct seq_file *m, struct kmem_cache *s)
5476 .start = s_start,
5477 .next = s_next,
5478 .stop = s_stop,
5479 .show = s_show,
5480};
5481
5482static int slabinfo_open(struct inode *inode, struct file *file)
5483{ 5437{
5484 return seq_open(file, &slabinfo_op);
5485} 5438}
5486 5439
5487static const struct file_operations proc_slabinfo_operations = { 5440ssize_t slabinfo_write(struct file *file, const char __user *buffer,
5488 .open = slabinfo_open, 5441 size_t count, loff_t *ppos)
5489 .read = seq_read,
5490 .llseek = seq_lseek,
5491 .release = seq_release,
5492};
5493
5494static int __init slab_proc_init(void)
5495{ 5442{
5496 proc_create("slabinfo", S_IRUSR, NULL, &proc_slabinfo_operations); 5443 return -EIO;
5497 return 0;
5498} 5444}
5499module_init(slab_proc_init);
5500#endif /* CONFIG_SLABINFO */ 5445#endif /* CONFIG_SLABINFO */
diff --git a/mm/sparse.c b/mm/sparse.c
index fac95f2888f2..6b5fb762e2ca 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -617,7 +617,7 @@ static void __kfree_section_memmap(struct page *memmap, unsigned long nr_pages)
617{ 617{
618 return; /* XXX: Not implemented yet */ 618 return; /* XXX: Not implemented yet */
619} 619}
620static void free_map_bootmem(struct page *page, unsigned long nr_pages) 620static void free_map_bootmem(struct page *memmap, unsigned long nr_pages)
621{ 621{
622} 622}
623#else 623#else
@@ -638,7 +638,6 @@ static struct page *__kmalloc_section_memmap(unsigned long nr_pages)
638got_map_page: 638got_map_page:
639 ret = (struct page *)pfn_to_kaddr(page_to_pfn(page)); 639 ret = (struct page *)pfn_to_kaddr(page_to_pfn(page));
640got_map_ptr: 640got_map_ptr:
641 memset(ret, 0, memmap_size);
642 641
643 return ret; 642 return ret;
644} 643}
@@ -658,10 +657,11 @@ static void __kfree_section_memmap(struct page *memmap, unsigned long nr_pages)
658 get_order(sizeof(struct page) * nr_pages)); 657 get_order(sizeof(struct page) * nr_pages));
659} 658}
660 659
661static void free_map_bootmem(struct page *page, unsigned long nr_pages) 660static void free_map_bootmem(struct page *memmap, unsigned long nr_pages)
662{ 661{
663 unsigned long maps_section_nr, removing_section_nr, i; 662 unsigned long maps_section_nr, removing_section_nr, i;
664 unsigned long magic; 663 unsigned long magic;
664 struct page *page = virt_to_page(memmap);
665 665
666 for (i = 0; i < nr_pages; i++, page++) { 666 for (i = 0; i < nr_pages; i++, page++) {
667 magic = (unsigned long) page->lru.next; 667 magic = (unsigned long) page->lru.next;
@@ -710,13 +710,10 @@ static void free_section_usemap(struct page *memmap, unsigned long *usemap)
710 */ 710 */
711 711
712 if (memmap) { 712 if (memmap) {
713 struct page *memmap_page;
714 memmap_page = virt_to_page(memmap);
715
716 nr_pages = PAGE_ALIGN(PAGES_PER_SECTION * sizeof(struct page)) 713 nr_pages = PAGE_ALIGN(PAGES_PER_SECTION * sizeof(struct page))
717 >> PAGE_SHIFT; 714 >> PAGE_SHIFT;
718 715
719 free_map_bootmem(memmap_page, nr_pages); 716 free_map_bootmem(memmap, nr_pages);
720 } 717 }
721} 718}
722 719
@@ -760,6 +757,8 @@ int __meminit sparse_add_one_section(struct zone *zone, unsigned long start_pfn,
760 goto out; 757 goto out;
761 } 758 }
762 759
760 memset(memmap, 0, sizeof(struct page) * nr_pages);
761
763 ms->section_mem_map |= SECTION_MARKED_PRESENT; 762 ms->section_mem_map |= SECTION_MARKED_PRESENT;
764 763
765 ret = sparse_init_one_section(ms, section_nr, memmap, usemap); 764 ret = sparse_init_one_section(ms, section_nr, memmap, usemap);
@@ -773,6 +772,27 @@ out:
773 return ret; 772 return ret;
774} 773}
775 774
775#ifdef CONFIG_MEMORY_FAILURE
776static void clear_hwpoisoned_pages(struct page *memmap, int nr_pages)
777{
778 int i;
779
780 if (!memmap)
781 return;
782
783 for (i = 0; i < PAGES_PER_SECTION; i++) {
784 if (PageHWPoison(&memmap[i])) {
785 atomic_long_sub(1, &mce_bad_pages);
786 ClearPageHWPoison(&memmap[i]);
787 }
788 }
789}
790#else
791static inline void clear_hwpoisoned_pages(struct page *memmap, int nr_pages)
792{
793}
794#endif
795
776void sparse_remove_one_section(struct zone *zone, struct mem_section *ms) 796void sparse_remove_one_section(struct zone *zone, struct mem_section *ms)
777{ 797{
778 struct page *memmap = NULL; 798 struct page *memmap = NULL;
@@ -786,6 +806,7 @@ void sparse_remove_one_section(struct zone *zone, struct mem_section *ms)
786 ms->pageblock_flags = NULL; 806 ms->pageblock_flags = NULL;
787 } 807 }
788 808
809 clear_hwpoisoned_pages(memmap, PAGES_PER_SECTION);
789 free_section_usemap(memmap, usemap); 810 free_section_usemap(memmap, usemap);
790} 811}
791#endif 812#endif
diff --git a/mm/swapfile.c b/mm/swapfile.c
index f91a25547ffe..e97a0e5aea91 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1443,13 +1443,12 @@ static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span)
1443 return generic_swapfile_activate(sis, swap_file, span); 1443 return generic_swapfile_activate(sis, swap_file, span);
1444} 1444}
1445 1445
1446static void enable_swap_info(struct swap_info_struct *p, int prio, 1446static void _enable_swap_info(struct swap_info_struct *p, int prio,
1447 unsigned char *swap_map, 1447 unsigned char *swap_map,
1448 unsigned long *frontswap_map) 1448 unsigned long *frontswap_map)
1449{ 1449{
1450 int i, prev; 1450 int i, prev;
1451 1451
1452 spin_lock(&swap_lock);
1453 if (prio >= 0) 1452 if (prio >= 0)
1454 p->prio = prio; 1453 p->prio = prio;
1455 else 1454 else
@@ -1472,10 +1471,25 @@ static void enable_swap_info(struct swap_info_struct *p, int prio,
1472 swap_list.head = swap_list.next = p->type; 1471 swap_list.head = swap_list.next = p->type;
1473 else 1472 else
1474 swap_info[prev]->next = p->type; 1473 swap_info[prev]->next = p->type;
1474}
1475
1476static void enable_swap_info(struct swap_info_struct *p, int prio,
1477 unsigned char *swap_map,
1478 unsigned long *frontswap_map)
1479{
1480 spin_lock(&swap_lock);
1481 _enable_swap_info(p, prio, swap_map, frontswap_map);
1475 frontswap_init(p->type); 1482 frontswap_init(p->type);
1476 spin_unlock(&swap_lock); 1483 spin_unlock(&swap_lock);
1477} 1484}
1478 1485
1486static void reinsert_swap_info(struct swap_info_struct *p)
1487{
1488 spin_lock(&swap_lock);
1489 _enable_swap_info(p, p->prio, p->swap_map, frontswap_map_get(p));
1490 spin_unlock(&swap_lock);
1491}
1492
1479SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) 1493SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
1480{ 1494{
1481 struct swap_info_struct *p = NULL; 1495 struct swap_info_struct *p = NULL;
@@ -1484,7 +1498,6 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
1484 struct address_space *mapping; 1498 struct address_space *mapping;
1485 struct inode *inode; 1499 struct inode *inode;
1486 struct filename *pathname; 1500 struct filename *pathname;
1487 int oom_score_adj;
1488 int i, type, prev; 1501 int i, type, prev;
1489 int err; 1502 int err;
1490 1503
@@ -1543,19 +1556,13 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
1543 p->flags &= ~SWP_WRITEOK; 1556 p->flags &= ~SWP_WRITEOK;
1544 spin_unlock(&swap_lock); 1557 spin_unlock(&swap_lock);
1545 1558
1546 oom_score_adj = test_set_oom_score_adj(OOM_SCORE_ADJ_MAX); 1559 set_current_oom_origin();
1547 err = try_to_unuse(type, false, 0); /* force all pages to be unused */ 1560 err = try_to_unuse(type, false, 0); /* force all pages to be unused */
1548 compare_swap_oom_score_adj(OOM_SCORE_ADJ_MAX, oom_score_adj); 1561 clear_current_oom_origin();
1549 1562
1550 if (err) { 1563 if (err) {
1551 /*
1552 * reading p->prio and p->swap_map outside the lock is
1553 * safe here because only sys_swapon and sys_swapoff
1554 * change them, and there can be no other sys_swapon or
1555 * sys_swapoff for this swap_info_struct at this point.
1556 */
1557 /* re-insert swap space back into swap_list */ 1564 /* re-insert swap space back into swap_list */
1558 enable_swap_info(p, p->prio, p->swap_map, frontswap_map_get(p)); 1565 reinsert_swap_info(p);
1559 goto out_dput; 1566 goto out_dput;
1560 } 1567 }
1561 1568
diff --git a/mm/truncate.c b/mm/truncate.c
index d51ce92d6e83..c75b736e54b7 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -577,29 +577,6 @@ void truncate_setsize(struct inode *inode, loff_t newsize)
577EXPORT_SYMBOL(truncate_setsize); 577EXPORT_SYMBOL(truncate_setsize);
578 578
579/** 579/**
580 * vmtruncate - unmap mappings "freed" by truncate() syscall
581 * @inode: inode of the file used
582 * @newsize: file offset to start truncating
583 *
584 * This function is deprecated and truncate_setsize or truncate_pagecache
585 * should be used instead, together with filesystem specific block truncation.
586 */
587int vmtruncate(struct inode *inode, loff_t newsize)
588{
589 int error;
590
591 error = inode_newsize_ok(inode, newsize);
592 if (error)
593 return error;
594
595 truncate_setsize(inode, newsize);
596 if (inode->i_op->truncate)
597 inode->i_op->truncate(inode);
598 return 0;
599}
600EXPORT_SYMBOL(vmtruncate);
601
602/**
603 * truncate_pagecache_range - unmap and remove pagecache that is hole-punched 580 * truncate_pagecache_range - unmap and remove pagecache that is hole-punched
604 * @inode: inode 581 * @inode: inode
605 * @lstart: offset of beginning of hole 582 * @lstart: offset of beginning of hole
diff --git a/mm/util.c b/mm/util.c
index dc3036cdcc6a..c55e26b17d93 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -152,7 +152,7 @@ EXPORT_SYMBOL(__krealloc);
152 * 152 *
153 * The contents of the object pointed to are preserved up to the 153 * The contents of the object pointed to are preserved up to the
154 * lesser of the new and old sizes. If @p is %NULL, krealloc() 154 * lesser of the new and old sizes. If @p is %NULL, krealloc()
155 * behaves exactly like kmalloc(). If @size is 0 and @p is not a 155 * behaves exactly like kmalloc(). If @new_size is 0 and @p is not a
156 * %NULL pointer, the object pointed to is freed. 156 * %NULL pointer, the object pointed to is freed.
157 */ 157 */
158void *krealloc(const void *p, size_t new_size, gfp_t flags) 158void *krealloc(const void *p, size_t new_size, gfp_t flags)
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 78e08300db21..5123a169ab7b 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -2550,7 +2550,7 @@ static void s_stop(struct seq_file *m, void *p)
2550 2550
2551static void show_numa_info(struct seq_file *m, struct vm_struct *v) 2551static void show_numa_info(struct seq_file *m, struct vm_struct *v)
2552{ 2552{
2553 if (NUMA_BUILD) { 2553 if (IS_ENABLED(CONFIG_NUMA)) {
2554 unsigned int nr, *counters = m->private; 2554 unsigned int nr, *counters = m->private;
2555 2555
2556 if (!counters) 2556 if (!counters)
@@ -2615,7 +2615,7 @@ static int vmalloc_open(struct inode *inode, struct file *file)
2615 unsigned int *ptr = NULL; 2615 unsigned int *ptr = NULL;
2616 int ret; 2616 int ret;
2617 2617
2618 if (NUMA_BUILD) { 2618 if (IS_ENABLED(CONFIG_NUMA)) {
2619 ptr = kmalloc(nr_node_ids * sizeof(unsigned int), GFP_KERNEL); 2619 ptr = kmalloc(nr_node_ids * sizeof(unsigned int), GFP_KERNEL);
2620 if (ptr == NULL) 2620 if (ptr == NULL)
2621 return -ENOMEM; 2621 return -ENOMEM;
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 48550c66f1f2..196709f5ee58 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1177,7 +1177,11 @@ int isolate_lru_page(struct page *page)
1177} 1177}
1178 1178
1179/* 1179/*
1180 * Are there way too many processes in the direct reclaim path already? 1180 * A direct reclaimer may isolate SWAP_CLUSTER_MAX pages from the LRU list and
1181 * then get resheduled. When there are massive number of tasks doing page
1182 * allocation, such sleeping direct reclaimers may keep piling up on each CPU,
1183 * the LRU list will go small and be scanned faster than necessary, leading to
1184 * unnecessary swapping, thrashing and OOM.
1181 */ 1185 */
1182static int too_many_isolated(struct zone *zone, int file, 1186static int too_many_isolated(struct zone *zone, int file,
1183 struct scan_control *sc) 1187 struct scan_control *sc)
@@ -1198,6 +1202,14 @@ static int too_many_isolated(struct zone *zone, int file,
1198 isolated = zone_page_state(zone, NR_ISOLATED_ANON); 1202 isolated = zone_page_state(zone, NR_ISOLATED_ANON);
1199 } 1203 }
1200 1204
1205 /*
1206 * GFP_NOIO/GFP_NOFS callers are allowed to isolate more pages, so they
1207 * won't get blocked by normal direct-reclaimers, forming a circular
1208 * deadlock.
1209 */
1210 if ((sc->gfp_mask & GFP_IOFS) == GFP_IOFS)
1211 inactive >>= 3;
1212
1201 return isolated > inactive; 1213 return isolated > inactive;
1202} 1214}
1203 1215
@@ -1679,13 +1691,24 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
1679 1691
1680 if (global_reclaim(sc)) { 1692 if (global_reclaim(sc)) {
1681 free = zone_page_state(zone, NR_FREE_PAGES); 1693 free = zone_page_state(zone, NR_FREE_PAGES);
1682 /* If we have very few page cache pages,
1683 force-scan anon pages. */
1684 if (unlikely(file + free <= high_wmark_pages(zone))) { 1694 if (unlikely(file + free <= high_wmark_pages(zone))) {
1695 /*
1696 * If we have very few page cache pages, force-scan
1697 * anon pages.
1698 */
1685 fraction[0] = 1; 1699 fraction[0] = 1;
1686 fraction[1] = 0; 1700 fraction[1] = 0;
1687 denominator = 1; 1701 denominator = 1;
1688 goto out; 1702 goto out;
1703 } else if (!inactive_file_is_low_global(zone)) {
1704 /*
1705 * There is enough inactive page cache, do not
1706 * reclaim anything from the working set right now.
1707 */
1708 fraction[0] = 0;
1709 fraction[1] = 1;
1710 denominator = 1;
1711 goto out;
1689 } 1712 }
1690 } 1713 }
1691 1714
@@ -1752,7 +1775,7 @@ out:
1752/* Use reclaim/compaction for costly allocs or under memory pressure */ 1775/* Use reclaim/compaction for costly allocs or under memory pressure */
1753static bool in_reclaim_compaction(struct scan_control *sc) 1776static bool in_reclaim_compaction(struct scan_control *sc)
1754{ 1777{
1755 if (COMPACTION_BUILD && sc->order && 1778 if (IS_ENABLED(CONFIG_COMPACTION) && sc->order &&
1756 (sc->order > PAGE_ALLOC_COSTLY_ORDER || 1779 (sc->order > PAGE_ALLOC_COSTLY_ORDER ||
1757 sc->priority < DEF_PRIORITY - 2)) 1780 sc->priority < DEF_PRIORITY - 2))
1758 return true; 1781 return true;
@@ -2005,7 +2028,7 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
2005 if (zone->all_unreclaimable && 2028 if (zone->all_unreclaimable &&
2006 sc->priority != DEF_PRIORITY) 2029 sc->priority != DEF_PRIORITY)
2007 continue; /* Let kswapd poll it */ 2030 continue; /* Let kswapd poll it */
2008 if (COMPACTION_BUILD) { 2031 if (IS_ENABLED(CONFIG_COMPACTION)) {
2009 /* 2032 /*
2010 * If we already have plenty of memory free for 2033 * If we already have plenty of memory free for
2011 * compaction in this zone, don't free any more. 2034 * compaction in this zone, don't free any more.
@@ -2207,9 +2230,12 @@ static bool pfmemalloc_watermark_ok(pg_data_t *pgdat)
2207 * Throttle direct reclaimers if backing storage is backed by the network 2230 * Throttle direct reclaimers if backing storage is backed by the network
2208 * and the PFMEMALLOC reserve for the preferred node is getting dangerously 2231 * and the PFMEMALLOC reserve for the preferred node is getting dangerously
2209 * depleted. kswapd will continue to make progress and wake the processes 2232 * depleted. kswapd will continue to make progress and wake the processes
2210 * when the low watermark is reached 2233 * when the low watermark is reached.
2234 *
2235 * Returns true if a fatal signal was delivered during throttling. If this
2236 * happens, the page allocator should not consider triggering the OOM killer.
2211 */ 2237 */
2212static void throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist, 2238static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
2213 nodemask_t *nodemask) 2239 nodemask_t *nodemask)
2214{ 2240{
2215 struct zone *zone; 2241 struct zone *zone;
@@ -2224,13 +2250,20 @@ static void throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
2224 * processes to block on log_wait_commit(). 2250 * processes to block on log_wait_commit().
2225 */ 2251 */
2226 if (current->flags & PF_KTHREAD) 2252 if (current->flags & PF_KTHREAD)
2227 return; 2253 goto out;
2254
2255 /*
2256 * If a fatal signal is pending, this process should not throttle.
2257 * It should return quickly so it can exit and free its memory
2258 */
2259 if (fatal_signal_pending(current))
2260 goto out;
2228 2261
2229 /* Check if the pfmemalloc reserves are ok */ 2262 /* Check if the pfmemalloc reserves are ok */
2230 first_zones_zonelist(zonelist, high_zoneidx, NULL, &zone); 2263 first_zones_zonelist(zonelist, high_zoneidx, NULL, &zone);
2231 pgdat = zone->zone_pgdat; 2264 pgdat = zone->zone_pgdat;
2232 if (pfmemalloc_watermark_ok(pgdat)) 2265 if (pfmemalloc_watermark_ok(pgdat))
2233 return; 2266 goto out;
2234 2267
2235 /* Account for the throttling */ 2268 /* Account for the throttling */
2236 count_vm_event(PGSCAN_DIRECT_THROTTLE); 2269 count_vm_event(PGSCAN_DIRECT_THROTTLE);
@@ -2246,12 +2279,20 @@ static void throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
2246 if (!(gfp_mask & __GFP_FS)) { 2279 if (!(gfp_mask & __GFP_FS)) {
2247 wait_event_interruptible_timeout(pgdat->pfmemalloc_wait, 2280 wait_event_interruptible_timeout(pgdat->pfmemalloc_wait,
2248 pfmemalloc_watermark_ok(pgdat), HZ); 2281 pfmemalloc_watermark_ok(pgdat), HZ);
2249 return; 2282
2283 goto check_pending;
2250 } 2284 }
2251 2285
2252 /* Throttle until kswapd wakes the process */ 2286 /* Throttle until kswapd wakes the process */
2253 wait_event_killable(zone->zone_pgdat->pfmemalloc_wait, 2287 wait_event_killable(zone->zone_pgdat->pfmemalloc_wait,
2254 pfmemalloc_watermark_ok(pgdat)); 2288 pfmemalloc_watermark_ok(pgdat));
2289
2290check_pending:
2291 if (fatal_signal_pending(current))
2292 return true;
2293
2294out:
2295 return false;
2255} 2296}
2256 2297
2257unsigned long try_to_free_pages(struct zonelist *zonelist, int order, 2298unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
@@ -2273,13 +2314,12 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
2273 .gfp_mask = sc.gfp_mask, 2314 .gfp_mask = sc.gfp_mask,
2274 }; 2315 };
2275 2316
2276 throttle_direct_reclaim(gfp_mask, zonelist, nodemask);
2277
2278 /* 2317 /*
2279 * Do not enter reclaim if fatal signal is pending. 1 is returned so 2318 * Do not enter reclaim if fatal signal was delivered while throttled.
2280 * that the page allocator does not consider triggering OOM 2319 * 1 is returned so that the page allocator does not OOM kill at this
2320 * point.
2281 */ 2321 */
2282 if (fatal_signal_pending(current)) 2322 if (throttle_direct_reclaim(gfp_mask, zonelist, nodemask))
2283 return 1; 2323 return 1;
2284 2324
2285 trace_mm_vmscan_direct_reclaim_begin(order, 2325 trace_mm_vmscan_direct_reclaim_begin(order,
@@ -2397,13 +2437,31 @@ static void age_active_anon(struct zone *zone, struct scan_control *sc)
2397 } while (memcg); 2437 } while (memcg);
2398} 2438}
2399 2439
2440static bool zone_balanced(struct zone *zone, int order,
2441 unsigned long balance_gap, int classzone_idx)
2442{
2443 if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone) +
2444 balance_gap, classzone_idx, 0))
2445 return false;
2446
2447 if (IS_ENABLED(CONFIG_COMPACTION) && order &&
2448 !compaction_suitable(zone, order))
2449 return false;
2450
2451 return true;
2452}
2453
2400/* 2454/*
2401 * pgdat_balanced is used when checking if a node is balanced for high-order 2455 * pgdat_balanced() is used when checking if a node is balanced.
2402 * allocations. Only zones that meet watermarks and are in a zone allowed 2456 *
2403 * by the callers classzone_idx are added to balanced_pages. The total of 2457 * For order-0, all zones must be balanced!
2404 * balanced pages must be at least 25% of the zones allowed by classzone_idx 2458 *
2405 * for the node to be considered balanced. Forcing all zones to be balanced 2459 * For high-order allocations only zones that meet watermarks and are in a
2406 * for high orders can cause excessive reclaim when there are imbalanced zones. 2460 * zone allowed by the callers classzone_idx are added to balanced_pages. The
2461 * total of balanced pages must be at least 25% of the zones allowed by
2462 * classzone_idx for the node to be considered balanced. Forcing all zones to
2463 * be balanced for high orders can cause excessive reclaim when there are
2464 * imbalanced zones.
2407 * The choice of 25% is due to 2465 * The choice of 25% is due to
2408 * o a 16M DMA zone that is balanced will not balance a zone on any 2466 * o a 16M DMA zone that is balanced will not balance a zone on any
2409 * reasonable sized machine 2467 * reasonable sized machine
@@ -2413,17 +2471,43 @@ static void age_active_anon(struct zone *zone, struct scan_control *sc)
2413 * Similarly, on x86-64 the Normal zone would need to be at least 1G 2471 * Similarly, on x86-64 the Normal zone would need to be at least 1G
2414 * to balance a node on its own. These seemed like reasonable ratios. 2472 * to balance a node on its own. These seemed like reasonable ratios.
2415 */ 2473 */
2416static bool pgdat_balanced(pg_data_t *pgdat, unsigned long balanced_pages, 2474static bool pgdat_balanced(pg_data_t *pgdat, int order, int classzone_idx)
2417 int classzone_idx)
2418{ 2475{
2419 unsigned long present_pages = 0; 2476 unsigned long present_pages = 0;
2477 unsigned long balanced_pages = 0;
2420 int i; 2478 int i;
2421 2479
2422 for (i = 0; i <= classzone_idx; i++) 2480 /* Check the watermark levels */
2423 present_pages += pgdat->node_zones[i].present_pages; 2481 for (i = 0; i <= classzone_idx; i++) {
2482 struct zone *zone = pgdat->node_zones + i;
2483
2484 if (!populated_zone(zone))
2485 continue;
2486
2487 present_pages += zone->present_pages;
2424 2488
2425 /* A special case here: if zone has no page, we think it's balanced */ 2489 /*
2426 return balanced_pages >= (present_pages >> 2); 2490 * A special case here:
2491 *
2492 * balance_pgdat() skips over all_unreclaimable after
2493 * DEF_PRIORITY. Effectively, it considers them balanced so
2494 * they must be considered balanced here as well!
2495 */
2496 if (zone->all_unreclaimable) {
2497 balanced_pages += zone->present_pages;
2498 continue;
2499 }
2500
2501 if (zone_balanced(zone, order, 0, i))
2502 balanced_pages += zone->present_pages;
2503 else if (!order)
2504 return false;
2505 }
2506
2507 if (order)
2508 return balanced_pages >= (present_pages >> 2);
2509 else
2510 return true;
2427} 2511}
2428 2512
2429/* 2513/*
@@ -2435,10 +2519,6 @@ static bool pgdat_balanced(pg_data_t *pgdat, unsigned long balanced_pages,
2435static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining, 2519static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining,
2436 int classzone_idx) 2520 int classzone_idx)
2437{ 2521{
2438 int i;
2439 unsigned long balanced = 0;
2440 bool all_zones_ok = true;
2441
2442 /* If a direct reclaimer woke kswapd within HZ/10, it's premature */ 2522 /* If a direct reclaimer woke kswapd within HZ/10, it's premature */
2443 if (remaining) 2523 if (remaining)
2444 return false; 2524 return false;
@@ -2457,40 +2537,7 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining,
2457 return false; 2537 return false;
2458 } 2538 }
2459 2539
2460 /* Check the watermark levels */ 2540 return pgdat_balanced(pgdat, order, classzone_idx);
2461 for (i = 0; i <= classzone_idx; i++) {
2462 struct zone *zone = pgdat->node_zones + i;
2463
2464 if (!populated_zone(zone))
2465 continue;
2466
2467 /*
2468 * balance_pgdat() skips over all_unreclaimable after
2469 * DEF_PRIORITY. Effectively, it considers them balanced so
2470 * they must be considered balanced here as well if kswapd
2471 * is to sleep
2472 */
2473 if (zone->all_unreclaimable) {
2474 balanced += zone->present_pages;
2475 continue;
2476 }
2477
2478 if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone),
2479 i, 0))
2480 all_zones_ok = false;
2481 else
2482 balanced += zone->present_pages;
2483 }
2484
2485 /*
2486 * For high-order requests, the balanced zones must contain at least
2487 * 25% of the nodes pages for kswapd to sleep. For order-0, all zones
2488 * must be balanced
2489 */
2490 if (order)
2491 return pgdat_balanced(pgdat, balanced, classzone_idx);
2492 else
2493 return all_zones_ok;
2494} 2541}
2495 2542
2496/* 2543/*
@@ -2517,8 +2564,7 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining,
2517static unsigned long balance_pgdat(pg_data_t *pgdat, int order, 2564static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
2518 int *classzone_idx) 2565 int *classzone_idx)
2519{ 2566{
2520 int all_zones_ok; 2567 struct zone *unbalanced_zone;
2521 unsigned long balanced;
2522 int i; 2568 int i;
2523 int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */ 2569 int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */
2524 unsigned long total_scanned; 2570 unsigned long total_scanned;
@@ -2551,8 +2597,7 @@ loop_again:
2551 unsigned long lru_pages = 0; 2597 unsigned long lru_pages = 0;
2552 int has_under_min_watermark_zone = 0; 2598 int has_under_min_watermark_zone = 0;
2553 2599
2554 all_zones_ok = 1; 2600 unbalanced_zone = NULL;
2555 balanced = 0;
2556 2601
2557 /* 2602 /*
2558 * Scan in the highmem->dma direction for the highest 2603 * Scan in the highmem->dma direction for the highest
@@ -2585,8 +2630,7 @@ loop_again:
2585 break; 2630 break;
2586 } 2631 }
2587 2632
2588 if (!zone_watermark_ok_safe(zone, order, 2633 if (!zone_balanced(zone, order, 0, 0)) {
2589 high_wmark_pages(zone), 0, 0)) {
2590 end_zone = i; 2634 end_zone = i;
2591 break; 2635 break;
2592 } else { 2636 } else {
@@ -2656,15 +2700,14 @@ loop_again:
2656 * Do not reclaim more than needed for compaction. 2700 * Do not reclaim more than needed for compaction.
2657 */ 2701 */
2658 testorder = order; 2702 testorder = order;
2659 if (COMPACTION_BUILD && order && 2703 if (IS_ENABLED(CONFIG_COMPACTION) && order &&
2660 compaction_suitable(zone, order) != 2704 compaction_suitable(zone, order) !=
2661 COMPACT_SKIPPED) 2705 COMPACT_SKIPPED)
2662 testorder = 0; 2706 testorder = 0;
2663 2707
2664 if ((buffer_heads_over_limit && is_highmem_idx(i)) || 2708 if ((buffer_heads_over_limit && is_highmem_idx(i)) ||
2665 !zone_watermark_ok_safe(zone, testorder, 2709 !zone_balanced(zone, testorder,
2666 high_wmark_pages(zone) + balance_gap, 2710 balance_gap, end_zone)) {
2667 end_zone, 0)) {
2668 shrink_zone(zone, &sc); 2711 shrink_zone(zone, &sc);
2669 2712
2670 reclaim_state->reclaimed_slab = 0; 2713 reclaim_state->reclaimed_slab = 0;
@@ -2691,9 +2734,8 @@ loop_again:
2691 continue; 2734 continue;
2692 } 2735 }
2693 2736
2694 if (!zone_watermark_ok_safe(zone, testorder, 2737 if (!zone_balanced(zone, testorder, 0, end_zone)) {
2695 high_wmark_pages(zone), end_zone, 0)) { 2738 unbalanced_zone = zone;
2696 all_zones_ok = 0;
2697 /* 2739 /*
2698 * We are still under min water mark. This 2740 * We are still under min water mark. This
2699 * means that we have a GFP_ATOMIC allocation 2741 * means that we have a GFP_ATOMIC allocation
@@ -2711,8 +2753,6 @@ loop_again:
2711 * speculatively avoid congestion waits 2753 * speculatively avoid congestion waits
2712 */ 2754 */
2713 zone_clear_flag(zone, ZONE_CONGESTED); 2755 zone_clear_flag(zone, ZONE_CONGESTED);
2714 if (i <= *classzone_idx)
2715 balanced += zone->present_pages;
2716 } 2756 }
2717 2757
2718 } 2758 }
@@ -2726,7 +2766,7 @@ loop_again:
2726 pfmemalloc_watermark_ok(pgdat)) 2766 pfmemalloc_watermark_ok(pgdat))
2727 wake_up(&pgdat->pfmemalloc_wait); 2767 wake_up(&pgdat->pfmemalloc_wait);
2728 2768
2729 if (all_zones_ok || (order && pgdat_balanced(pgdat, balanced, *classzone_idx))) 2769 if (pgdat_balanced(pgdat, order, *classzone_idx))
2730 break; /* kswapd: all done */ 2770 break; /* kswapd: all done */
2731 /* 2771 /*
2732 * OK, kswapd is getting into trouble. Take a nap, then take 2772 * OK, kswapd is getting into trouble. Take a nap, then take
@@ -2735,8 +2775,8 @@ loop_again:
2735 if (total_scanned && (sc.priority < DEF_PRIORITY - 2)) { 2775 if (total_scanned && (sc.priority < DEF_PRIORITY - 2)) {
2736 if (has_under_min_watermark_zone) 2776 if (has_under_min_watermark_zone)
2737 count_vm_event(KSWAPD_SKIP_CONGESTION_WAIT); 2777 count_vm_event(KSWAPD_SKIP_CONGESTION_WAIT);
2738 else 2778 else if (unbalanced_zone)
2739 congestion_wait(BLK_RW_ASYNC, HZ/10); 2779 wait_iff_congested(unbalanced_zone, BLK_RW_ASYNC, HZ/10);
2740 } 2780 }
2741 2781
2742 /* 2782 /*
@@ -2750,12 +2790,7 @@ loop_again:
2750 } while (--sc.priority >= 0); 2790 } while (--sc.priority >= 0);
2751out: 2791out:
2752 2792
2753 /* 2793 if (!pgdat_balanced(pgdat, order, *classzone_idx)) {
2754 * order-0: All zones must meet high watermark for a balanced node
2755 * high-order: Balanced zones must make up at least 25% of the node
2756 * for the node to be balanced
2757 */
2758 if (!(all_zones_ok || (order && pgdat_balanced(pgdat, balanced, *classzone_idx)))) {
2759 cond_resched(); 2794 cond_resched();
2760 2795
2761 try_to_freeze(); 2796 try_to_freeze();
@@ -2797,29 +2832,10 @@ out:
2797 if (!populated_zone(zone)) 2832 if (!populated_zone(zone))
2798 continue; 2833 continue;
2799 2834
2800 if (zone->all_unreclaimable &&
2801 sc.priority != DEF_PRIORITY)
2802 continue;
2803
2804 /* Would compaction fail due to lack of free memory? */
2805 if (COMPACTION_BUILD &&
2806 compaction_suitable(zone, order) == COMPACT_SKIPPED)
2807 goto loop_again;
2808
2809 /* Confirm the zone is balanced for order-0 */
2810 if (!zone_watermark_ok(zone, 0,
2811 high_wmark_pages(zone), 0, 0)) {
2812 order = sc.order = 0;
2813 goto loop_again;
2814 }
2815
2816 /* Check if the memory needs to be defragmented. */ 2835 /* Check if the memory needs to be defragmented. */
2817 if (zone_watermark_ok(zone, order, 2836 if (zone_watermark_ok(zone, order,
2818 low_wmark_pages(zone), *classzone_idx, 0)) 2837 low_wmark_pages(zone), *classzone_idx, 0))
2819 zones_need_compaction = 0; 2838 zones_need_compaction = 0;
2820
2821 /* If balanced, clear the congested flag */
2822 zone_clear_flag(zone, ZONE_CONGESTED);
2823 } 2839 }
2824 2840
2825 if (zones_need_compaction) 2841 if (zones_need_compaction)
@@ -2944,7 +2960,7 @@ static int kswapd(void *p)
2944 classzone_idx = new_classzone_idx = pgdat->nr_zones - 1; 2960 classzone_idx = new_classzone_idx = pgdat->nr_zones - 1;
2945 balanced_classzone_idx = classzone_idx; 2961 balanced_classzone_idx = classzone_idx;
2946 for ( ; ; ) { 2962 for ( ; ; ) {
2947 int ret; 2963 bool ret;
2948 2964
2949 /* 2965 /*
2950 * If the last balance_pgdat was unsuccessful it's unlikely a 2966 * If the last balance_pgdat was unsuccessful it's unlikely a
@@ -3106,13 +3122,13 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
3106 not required for correctness. So if the last cpu in a node goes 3122 not required for correctness. So if the last cpu in a node goes
3107 away, we get changed to run anywhere: as the first one comes back, 3123 away, we get changed to run anywhere: as the first one comes back,
3108 restore their cpu bindings. */ 3124 restore their cpu bindings. */
3109static int __devinit cpu_callback(struct notifier_block *nfb, 3125static int cpu_callback(struct notifier_block *nfb, unsigned long action,
3110 unsigned long action, void *hcpu) 3126 void *hcpu)
3111{ 3127{
3112 int nid; 3128 int nid;
3113 3129
3114 if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN) { 3130 if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN) {
3115 for_each_node_state(nid, N_HIGH_MEMORY) { 3131 for_each_node_state(nid, N_MEMORY) {
3116 pg_data_t *pgdat = NODE_DATA(nid); 3132 pg_data_t *pgdat = NODE_DATA(nid);
3117 const struct cpumask *mask; 3133 const struct cpumask *mask;
3118 3134
@@ -3168,7 +3184,7 @@ static int __init kswapd_init(void)
3168 int nid; 3184 int nid;
3169 3185
3170 swap_setup(); 3186 swap_setup();
3171 for_each_node_state(nid, N_HIGH_MEMORY) 3187 for_each_node_state(nid, N_MEMORY)
3172 kswapd_run(nid); 3188 kswapd_run(nid);
3173 hotcpu_notifier(cpu_callback, 0); 3189 hotcpu_notifier(cpu_callback, 0);
3174 return 0; 3190 return 0;
diff --git a/mm/vmstat.c b/mm/vmstat.c
index c7370579111b..9800306c8195 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -774,10 +774,20 @@ const char * const vmstat_text[] = {
774 774
775 "pgrotated", 775 "pgrotated",
776 776
777#ifdef CONFIG_NUMA_BALANCING
778 "numa_pte_updates",
779 "numa_hint_faults",
780 "numa_hint_faults_local",
781 "numa_pages_migrated",
782#endif
783#ifdef CONFIG_MIGRATION
784 "pgmigrate_success",
785 "pgmigrate_fail",
786#endif
777#ifdef CONFIG_COMPACTION 787#ifdef CONFIG_COMPACTION
778 "compact_blocks_moved", 788 "compact_migrate_scanned",
779 "compact_pages_moved", 789 "compact_free_scanned",
780 "compact_pagemigrate_failed", 790 "compact_isolated",
781 "compact_stall", 791 "compact_stall",
782 "compact_fail", 792 "compact_fail",
783 "compact_success", 793 "compact_success",
@@ -801,6 +811,8 @@ const char * const vmstat_text[] = {
801 "thp_collapse_alloc", 811 "thp_collapse_alloc",
802 "thp_collapse_alloc_failed", 812 "thp_collapse_alloc_failed",
803 "thp_split", 813 "thp_split",
814 "thp_zero_page_alloc",
815 "thp_zero_page_alloc_failed",
804#endif 816#endif
805 817
806#endif /* CONFIG_VM_EVENTS_COUNTERS */ 818#endif /* CONFIG_VM_EVENTS_COUNTERS */
@@ -930,7 +942,7 @@ static int pagetypeinfo_show(struct seq_file *m, void *arg)
930 pg_data_t *pgdat = (pg_data_t *)arg; 942 pg_data_t *pgdat = (pg_data_t *)arg;
931 943
932 /* check memoryless node */ 944 /* check memoryless node */
933 if (!node_state(pgdat->node_id, N_HIGH_MEMORY)) 945 if (!node_state(pgdat->node_id, N_MEMORY))
934 return 0; 946 return 0;
935 947
936 seq_printf(m, "Page block order: %d\n", pageblock_order); 948 seq_printf(m, "Page block order: %d\n", pageblock_order);
@@ -992,14 +1004,16 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
992 "\n high %lu" 1004 "\n high %lu"
993 "\n scanned %lu" 1005 "\n scanned %lu"
994 "\n spanned %lu" 1006 "\n spanned %lu"
995 "\n present %lu", 1007 "\n present %lu"
1008 "\n managed %lu",
996 zone_page_state(zone, NR_FREE_PAGES), 1009 zone_page_state(zone, NR_FREE_PAGES),
997 min_wmark_pages(zone), 1010 min_wmark_pages(zone),
998 low_wmark_pages(zone), 1011 low_wmark_pages(zone),
999 high_wmark_pages(zone), 1012 high_wmark_pages(zone),
1000 zone->pages_scanned, 1013 zone->pages_scanned,
1001 zone->spanned_pages, 1014 zone->spanned_pages,
1002 zone->present_pages); 1015 zone->present_pages,
1016 zone->managed_pages);
1003 1017
1004 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) 1018 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
1005 seq_printf(m, "\n %-12s %lu", vmstat_text[i], 1019 seq_printf(m, "\n %-12s %lu", vmstat_text[i],
@@ -1292,7 +1306,7 @@ static int unusable_show(struct seq_file *m, void *arg)
1292 pg_data_t *pgdat = (pg_data_t *)arg; 1306 pg_data_t *pgdat = (pg_data_t *)arg;
1293 1307
1294 /* check memoryless node */ 1308 /* check memoryless node */
1295 if (!node_state(pgdat->node_id, N_HIGH_MEMORY)) 1309 if (!node_state(pgdat->node_id, N_MEMORY))
1296 return 0; 1310 return 0;
1297 1311
1298 walk_zones_in_node(m, pgdat, unusable_show_print); 1312 walk_zones_in_node(m, pgdat, unusable_show_print);