path: root/mm
Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig               |   34
-rw-r--r--  mm/Makefile              |    3
-rw-r--r--  mm/balloon_compaction.c  |  302
-rw-r--r--  mm/bootmem.c             |   79
-rw-r--r--  mm/compaction.c          |  150
-rw-r--r--  mm/dmapool.c             |   55
-rw-r--r--  mm/highmem.c             |   29
-rw-r--r--  mm/huge_memory.c         |  641
-rw-r--r--  mm/hugetlb.c             |   63
-rw-r--r--  mm/hugetlb_cgroup.c      |   42
-rw-r--r--  mm/internal.h            |   12
-rw-r--r--  mm/kmemleak.c            |    3
-rw-r--r--  mm/ksm.c                 |   37
-rw-r--r--  mm/memcontrol.c          | 1469
-rw-r--r--  mm/memory-failure.c      |   35
-rw-r--r--  mm/memory.c              |  238
-rw-r--r--  mm/memory_hotplug.c      |  430
-rw-r--r--  mm/mempolicy.c           |  318
-rw-r--r--  mm/migrate.c             |  438
-rw-r--r--  mm/mmap.c                |  569
-rw-r--r--  mm/mprotect.c            |  151
-rw-r--r--  mm/mremap.c              |    4
-rw-r--r--  mm/nobootmem.c           |   22
-rw-r--r--  mm/nommu.c               |   15
-rw-r--r--  mm/oom_kill.c            |  138
-rw-r--r--  mm/page-writeback.c      |   11
-rw-r--r--  mm/page_alloc.c          |  343
-rw-r--r--  mm/page_cgroup.c         |    5
-rw-r--r--  mm/page_isolation.c      |   27
-rw-r--r--  mm/pagewalk.c            |    2
-rw-r--r--  mm/percpu.c              |    5
-rw-r--r--  mm/pgtable-generic.c     |    9
-rw-r--r--  mm/rmap.c                |  134
-rw-r--r--  mm/shmem.c               |   92
-rw-r--r--  mm/slab.c                |  383
-rw-r--r--  mm/slab.h                |  190
-rw-r--r--  mm/slab_common.c         |  292
-rw-r--r--  mm/slob.c                |   48
-rw-r--r--  mm/slub.c                |  451
-rw-r--r--  mm/sparse.c              |   25
-rw-r--r--  mm/swapfile.c            |   31
-rw-r--r--  mm/util.c                |    2
-rw-r--r--  mm/vmalloc.c             |    4
-rw-r--r--  mm/vmscan.c              |   56
-rw-r--r--  mm/vmstat.c              |   28
45 files changed, 5664 insertions(+), 1751 deletions(-)
diff --git a/mm/Kconfig b/mm/Kconfig
index a3f8dddaaab3..278e3ab1f169 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -143,6 +143,25 @@ config NO_BOOTMEM
143config MEMORY_ISOLATION 143config MEMORY_ISOLATION
144 boolean 144 boolean
145 145
146config MOVABLE_NODE
147 boolean "Enable to assign a node which has only movable memory"
148 depends on HAVE_MEMBLOCK
149 depends on NO_BOOTMEM
150 depends on X86_64
151 depends on NUMA
152 default n
153 help
 154	  Allow a node to have only movable memory. Pages used by the kernel,
 155	  such as direct mapping pages, cannot be migrated, so the corresponding
156 memory device cannot be hotplugged. This option allows users to
157 online all the memory of a node as movable memory so that the whole
158 node can be hotplugged. Users who don't use the memory hotplug
159 feature are fine with this option on since they don't online memory
160 as movable.
161
162 Say Y here if you want to hotplug a whole node.
 163	  Say N here if you want the kernel to use memory on all nodes evenly.
164
146# eventually, we can have this option just 'select SPARSEMEM' 165# eventually, we can have this option just 'select SPARSEMEM'
147config MEMORY_HOTPLUG 166config MEMORY_HOTPLUG
148 bool "Allow for memory hot-add" 167 bool "Allow for memory hot-add"
@@ -188,6 +207,21 @@ config SPLIT_PTLOCK_CPUS
188 default "4" 207 default "4"
189 208
190# 209#
210# support for memory balloon compaction
211config BALLOON_COMPACTION
212 bool "Allow for balloon memory compaction/migration"
213 def_bool y
214 depends on COMPACTION && VIRTIO_BALLOON
215 help
 216	  Memory fragmentation introduced by ballooning might significantly
 217	  reduce the number of 2MB contiguous memory blocks that can be
 218	  used within a guest, thus imposing performance penalties associated
 219	  with the reduced number of transparent huge pages that could be used
 220	  by the guest workload. Allowing compaction & migration of memory
 221	  pages enlisted as being part of memory balloon devices avoids the
 222	  aforementioned scenario and helps improve memory defragmentation.
223
224#
191# support for memory compaction 225# support for memory compaction
192config COMPACTION 226config COMPACTION
193 bool "Allow for memory compaction" 227 bool "Allow for memory compaction"
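MOVABLE_NODE only pays off if the node's memory blocks are actually onlined as movable. A minimal userspace sketch of that step, assuming the running kernel's memory block 'state' attribute accepts "online_movable" (the memory42 block number is made up):

#include <stdio.h>

/*
 * Illustrative only: online one memory block into ZONE_MOVABLE so the
 * whole node remains hot-removable. "memory42" is a placeholder.
 */
int main(void)
{
	const char *path = "/sys/devices/system/memory/memory42/state";
	FILE *f = fopen(path, "w");

	if (!f) {
		perror(path);
		return 1;
	}
	fputs("online_movable\n", f);
	return fclose(f) ? 1 : 0;
}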
diff --git a/mm/Makefile b/mm/Makefile
index 6b025f80af34..3a4628751f89 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -16,7 +16,8 @@ obj-y := filemap.o mempool.o oom_kill.o fadvise.o \
16 readahead.o swap.o truncate.o vmscan.o shmem.o \ 16 readahead.o swap.o truncate.o vmscan.o shmem.o \
17 util.o mmzone.o vmstat.o backing-dev.o \ 17 util.o mmzone.o vmstat.o backing-dev.o \
18 mm_init.o mmu_context.o percpu.o slab_common.o \ 18 mm_init.o mmu_context.o percpu.o slab_common.o \
19 compaction.o interval_tree.o $(mmu-y) 19 compaction.o balloon_compaction.o \
20 interval_tree.o $(mmu-y)
20 21
21obj-y += init-mm.o 22obj-y += init-mm.o
22 23
diff --git a/mm/balloon_compaction.c b/mm/balloon_compaction.c
new file mode 100644
index 000000000000..07dbc8ec46cf
--- /dev/null
+++ b/mm/balloon_compaction.c
@@ -0,0 +1,302 @@
1/*
2 * mm/balloon_compaction.c
3 *
4 * Common interface for making balloon pages movable by compaction.
5 *
6 * Copyright (C) 2012, Red Hat, Inc. Rafael Aquini <aquini@redhat.com>
7 */
8#include <linux/mm.h>
9#include <linux/slab.h>
10#include <linux/export.h>
11#include <linux/balloon_compaction.h>
12
13/*
14 * balloon_devinfo_alloc - allocates a balloon device information descriptor.
15 * @balloon_dev_descriptor: pointer to reference the balloon device which
16 * this struct balloon_dev_info will be servicing.
17 *
18 * Driver must call it to properly allocate and initialize an instance of
19 * struct balloon_dev_info which will be used to reference a balloon device
20 * as well as to keep track of the balloon device page list.
21 */
22struct balloon_dev_info *balloon_devinfo_alloc(void *balloon_dev_descriptor)
23{
24 struct balloon_dev_info *b_dev_info;
25 b_dev_info = kmalloc(sizeof(*b_dev_info), GFP_KERNEL);
26 if (!b_dev_info)
27 return ERR_PTR(-ENOMEM);
28
29 b_dev_info->balloon_device = balloon_dev_descriptor;
30 b_dev_info->mapping = NULL;
31 b_dev_info->isolated_pages = 0;
32 spin_lock_init(&b_dev_info->pages_lock);
33 INIT_LIST_HEAD(&b_dev_info->pages);
34
35 return b_dev_info;
36}
37EXPORT_SYMBOL_GPL(balloon_devinfo_alloc);
38
39/*
40 * balloon_page_enqueue - allocates a new page and inserts it into the balloon
41 * page list.
 42 * @b_dev_info: balloon device descriptor where we will insert the new page
43 *
44 * Driver must call it to properly allocate a new enlisted balloon page
 45 * before definitively removing it from the guest system.
46 * This function returns the page address for the recently enqueued page or
47 * NULL in the case we fail to allocate a new page this turn.
48 */
49struct page *balloon_page_enqueue(struct balloon_dev_info *b_dev_info)
50{
51 unsigned long flags;
52 struct page *page = alloc_page(balloon_mapping_gfp_mask() |
53 __GFP_NOMEMALLOC | __GFP_NORETRY);
54 if (!page)
55 return NULL;
56
57 /*
58 * Block others from accessing the 'page' when we get around to
59 * establishing additional references. We should be the only one
60 * holding a reference to the 'page' at this point.
61 */
62 BUG_ON(!trylock_page(page));
63 spin_lock_irqsave(&b_dev_info->pages_lock, flags);
64 balloon_page_insert(page, b_dev_info->mapping, &b_dev_info->pages);
65 spin_unlock_irqrestore(&b_dev_info->pages_lock, flags);
66 unlock_page(page);
67 return page;
68}
69EXPORT_SYMBOL_GPL(balloon_page_enqueue);
70
71/*
72 * balloon_page_dequeue - removes a page from balloon's page list and returns
 73 * its address to allow the driver to release the page.
 74 * @b_dev_info: balloon device descriptor where we will grab a page from.
 75 *
 76 * Driver must call it to properly de-allocate a previously enlisted balloon page
 77 * before definitively releasing it back to the guest system.
78 * This function returns the page address for the recently dequeued page or
 79 * NULL if we find the balloon's page list temporarily empty because
 80 * compaction has isolated its pages.
81 */
82struct page *balloon_page_dequeue(struct balloon_dev_info *b_dev_info)
83{
84 struct page *page, *tmp;
85 unsigned long flags;
86 bool dequeued_page;
87
88 dequeued_page = false;
89 list_for_each_entry_safe(page, tmp, &b_dev_info->pages, lru) {
90 /*
 91 * Block others from accessing the 'page' while we get around to
92 * establishing additional references and preparing the 'page'
93 * to be released by the balloon driver.
94 */
95 if (trylock_page(page)) {
96 spin_lock_irqsave(&b_dev_info->pages_lock, flags);
97 /*
98 * Raise the page refcount here to prevent any wrong
 99 * attempt to isolate this page, in case of colliding
100 * with balloon_page_isolate() just after we release
101 * the page lock.
102 *
103 * balloon_page_free() will take care of dropping
104 * this extra refcount later.
105 */
106 get_page(page);
107 balloon_page_delete(page);
108 spin_unlock_irqrestore(&b_dev_info->pages_lock, flags);
109 unlock_page(page);
110 dequeued_page = true;
111 break;
112 }
113 }
114
115 if (!dequeued_page) {
116 /*
117 * If we are unable to dequeue a balloon page because the page
 118 * list is empty and there are no isolated pages, then something
 119 * has gone wrong and some balloon pages are lost.
 120 * BUG() here, otherwise the balloon driver may get stuck in
121 * an infinite loop while attempting to release all its pages.
122 */
123 spin_lock_irqsave(&b_dev_info->pages_lock, flags);
124 if (unlikely(list_empty(&b_dev_info->pages) &&
125 !b_dev_info->isolated_pages))
126 BUG();
127 spin_unlock_irqrestore(&b_dev_info->pages_lock, flags);
128 page = NULL;
129 }
130 return page;
131}
132EXPORT_SYMBOL_GPL(balloon_page_dequeue);
133
134#ifdef CONFIG_BALLOON_COMPACTION
135/*
136 * balloon_mapping_alloc - allocates a special ->mapping for ballooned pages.
137 * @b_dev_info: holds the balloon device information descriptor.
138 * @a_ops: balloon_mapping address_space_operations descriptor.
139 *
140 * Driver must call it to properly allocate and initialize an instance of
141 * struct address_space which will be used as the special page->mapping for
142 * balloon device enlisted page instances.
143 */
144struct address_space *balloon_mapping_alloc(struct balloon_dev_info *b_dev_info,
145 const struct address_space_operations *a_ops)
146{
147 struct address_space *mapping;
148
149 mapping = kmalloc(sizeof(*mapping), GFP_KERNEL);
150 if (!mapping)
151 return ERR_PTR(-ENOMEM);
152
153 /*
154 * Give a clean 'zeroed' status to all elements of this special
155 * balloon page->mapping struct address_space instance.
156 */
157 address_space_init_once(mapping);
158
159 /*
160 * Set mapping->flags appropriately, to allow balloon pages
161 * ->mapping identification.
162 */
163 mapping_set_balloon(mapping);
164 mapping_set_gfp_mask(mapping, balloon_mapping_gfp_mask());
165
166 /* balloon's page->mapping->a_ops callback descriptor */
167 mapping->a_ops = a_ops;
168
169 /*
170 * Establish a pointer reference back to the balloon device descriptor
171 * this particular page->mapping will be servicing.
172 * This is used by compaction / migration procedures to identify and
173 * access the balloon device pageset while isolating / migrating pages.
174 *
175 * As some balloon drivers can register multiple balloon devices
176 * for a single guest, this also helps compaction / migration to
177 * properly deal with multiple balloon pagesets, when required.
178 */
179 mapping->private_data = b_dev_info;
180 b_dev_info->mapping = mapping;
181
182 return mapping;
183}
184EXPORT_SYMBOL_GPL(balloon_mapping_alloc);
185
186static inline void __isolate_balloon_page(struct page *page)
187{
188 struct balloon_dev_info *b_dev_info = page->mapping->private_data;
189 unsigned long flags;
190 spin_lock_irqsave(&b_dev_info->pages_lock, flags);
191 list_del(&page->lru);
192 b_dev_info->isolated_pages++;
193 spin_unlock_irqrestore(&b_dev_info->pages_lock, flags);
194}
195
196static inline void __putback_balloon_page(struct page *page)
197{
198 struct balloon_dev_info *b_dev_info = page->mapping->private_data;
199 unsigned long flags;
200 spin_lock_irqsave(&b_dev_info->pages_lock, flags);
201 list_add(&page->lru, &b_dev_info->pages);
202 b_dev_info->isolated_pages--;
203 spin_unlock_irqrestore(&b_dev_info->pages_lock, flags);
204}
205
206static inline int __migrate_balloon_page(struct address_space *mapping,
207 struct page *newpage, struct page *page, enum migrate_mode mode)
208{
209 return page->mapping->a_ops->migratepage(mapping, newpage, page, mode);
210}
211
212/* __isolate_lru_page() counterpart for a ballooned page */
213bool balloon_page_isolate(struct page *page)
214{
215 /*
216 * Avoid burning cycles with pages that are yet under __free_pages(),
217 * or just got freed under us.
218 *
219 * In case we 'win' a race for a balloon page being freed under us and
 220 * raise its refcount, preventing __free_pages() from doing its job,
 221 * the put_page() at the end of this block will take care of
 222 * releasing this page, thus avoiding a nasty leakage.
223 */
224 if (likely(get_page_unless_zero(page))) {
225 /*
226 * As balloon pages are not isolated from LRU lists, concurrent
227 * compaction threads can race against page migration functions
228 * as well as race against the balloon driver releasing a page.
229 *
230 * In order to avoid having an already isolated balloon page
231 * being (wrongly) re-isolated while it is under migration,
232 * or to avoid attempting to isolate pages being released by
 233 * the balloon driver, let's be sure we have the page lock
234 * before proceeding with the balloon page isolation steps.
235 */
236 if (likely(trylock_page(page))) {
237 /*
238 * A ballooned page, by default, has just one refcount.
239 * Prevent concurrent compaction threads from isolating
240 * an already isolated balloon page by refcount check.
241 */
242 if (__is_movable_balloon_page(page) &&
243 page_count(page) == 2) {
244 __isolate_balloon_page(page);
245 unlock_page(page);
246 return true;
247 }
248 unlock_page(page);
249 }
250 put_page(page);
251 }
252 return false;
253}
254
255/* putback_lru_page() counterpart for a ballooned page */
256void balloon_page_putback(struct page *page)
257{
258 /*
259 * 'lock_page()' stabilizes the page and prevents races against
260 * concurrent isolation threads attempting to re-isolate it.
261 */
262 lock_page(page);
263
264 if (__is_movable_balloon_page(page)) {
265 __putback_balloon_page(page);
266 /* drop the extra ref count taken for page isolation */
267 put_page(page);
268 } else {
269 WARN_ON(1);
270 dump_page(page);
271 }
272 unlock_page(page);
273}
274
275/* move_to_new_page() counterpart for a ballooned page */
276int balloon_page_migrate(struct page *newpage,
277 struct page *page, enum migrate_mode mode)
278{
279 struct address_space *mapping;
280 int rc = -EAGAIN;
281
282 /*
283 * Block others from accessing the 'newpage' when we get around to
284 * establishing additional references. We should be the only one
285 * holding a reference to the 'newpage' at this point.
286 */
287 BUG_ON(!trylock_page(newpage));
288
289 if (WARN_ON(!__is_movable_balloon_page(page))) {
290 dump_page(page);
291 unlock_page(newpage);
292 return rc;
293 }
294
295 mapping = page->mapping;
296 if (mapping)
297 rc = __migrate_balloon_page(mapping, newpage, page, mode);
298
299 unlock_page(newpage);
300 return rc;
301}
302#endif /* CONFIG_BALLOON_COMPACTION */
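For context, this is roughly how a balloon driver is expected to consume the interface above; a hedged sketch only, where everything prefixed my_ is illustrative and not part of this patch:

#include <linux/balloon_compaction.h>
#include <linux/migrate.h>
#include <linux/err.h>
#include <linux/slab.h>

static struct balloon_dev_info *my_b_dev_info;

static int my_balloon_migratepage(struct address_space *mapping,
		struct page *newpage, struct page *page,
		enum migrate_mode mode)
{
	/*
	 * A real driver would tell the host to swap the ballooned pfn for
	 * page_to_pfn(newpage) and move the page between its device lists.
	 */
	return 0;
}

static const struct address_space_operations my_balloon_aops = {
	.migratepage = my_balloon_migratepage,
};

static int my_balloon_init(void *my_device)
{
	struct address_space *mapping;

	my_b_dev_info = balloon_devinfo_alloc(my_device);
	if (IS_ERR(my_b_dev_info))
		return PTR_ERR(my_b_dev_info);

	mapping = balloon_mapping_alloc(my_b_dev_info, &my_balloon_aops);
	if (IS_ERR(mapping)) {
		kfree(my_b_dev_info);
		return PTR_ERR(mapping);
	}
	return 0;
}

/* Inflate one page: take it from the guest and report its pfn to the host. */
static int my_balloon_inflate_one(void)
{
	struct page *page = balloon_page_enqueue(my_b_dev_info);

	if (!page)
		return -ENOMEM;
	/* ... tell the host about page_to_pfn(page) ... */
	return 0;
}

/* Deflate one page: hand it back to the guest, if one can be dequeued. */
static int my_balloon_deflate_one(void)
{
	struct page *page = balloon_page_dequeue(my_b_dev_info);

	if (!page)
		return -EAGAIN;	/* list empty or pages isolated by compaction */
	/* ... tell the host the pfn is gone, then drop it via the header helper */
	balloon_page_free(page);
	return 0;
}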
diff --git a/mm/bootmem.c b/mm/bootmem.c
index f468185b3b28..1324cd74faec 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -147,21 +147,21 @@ unsigned long __init init_bootmem(unsigned long start, unsigned long pages)
147 147
148/* 148/*
149 * free_bootmem_late - free bootmem pages directly to page allocator 149 * free_bootmem_late - free bootmem pages directly to page allocator
 150 * @addr: starting address of the range 150 * @physaddr: starting physical address of the range
151 * @size: size of the range in bytes 151 * @size: size of the range in bytes
152 * 152 *
153 * This is only useful when the bootmem allocator has already been torn 153 * This is only useful when the bootmem allocator has already been torn
154 * down, but we are still initializing the system. Pages are given directly 154 * down, but we are still initializing the system. Pages are given directly
155 * to the page allocator, no bootmem metadata is updated because it is gone. 155 * to the page allocator, no bootmem metadata is updated because it is gone.
156 */ 156 */
157void __init free_bootmem_late(unsigned long addr, unsigned long size) 157void __init free_bootmem_late(unsigned long physaddr, unsigned long size)
158{ 158{
159 unsigned long cursor, end; 159 unsigned long cursor, end;
160 160
161 kmemleak_free_part(__va(addr), size); 161 kmemleak_free_part(__va(physaddr), size);
162 162
163 cursor = PFN_UP(addr); 163 cursor = PFN_UP(physaddr);
164 end = PFN_DOWN(addr + size); 164 end = PFN_DOWN(physaddr + size);
165 165
166 for (; cursor < end; cursor++) { 166 for (; cursor < end; cursor++) {
167 __free_pages_bootmem(pfn_to_page(cursor), 0); 167 __free_pages_bootmem(pfn_to_page(cursor), 0);
@@ -229,6 +229,22 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
229 return count; 229 return count;
230} 230}
231 231
232static void reset_node_lowmem_managed_pages(pg_data_t *pgdat)
233{
234 struct zone *z;
235
236 /*
237 * In free_area_init_core(), highmem zone's managed_pages is set to
 238 * present_pages, and the bootmem allocator doesn't allocate from highmem
239 * zones. So there's no need to recalculate managed_pages because all
240 * highmem pages will be managed by the buddy system. Here highmem
241 * zone also includes highmem movable zone.
242 */
243 for (z = pgdat->node_zones; z < pgdat->node_zones + MAX_NR_ZONES; z++)
244 if (!is_highmem(z))
245 z->managed_pages = 0;
246}
247
232/** 248/**
233 * free_all_bootmem_node - release a node's free pages to the buddy allocator 249 * free_all_bootmem_node - release a node's free pages to the buddy allocator
234 * @pgdat: node to be released 250 * @pgdat: node to be released
@@ -238,6 +254,7 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
238unsigned long __init free_all_bootmem_node(pg_data_t *pgdat) 254unsigned long __init free_all_bootmem_node(pg_data_t *pgdat)
239{ 255{
240 register_page_bootmem_info_node(pgdat); 256 register_page_bootmem_info_node(pgdat);
257 reset_node_lowmem_managed_pages(pgdat);
241 return free_all_bootmem_core(pgdat->bdata); 258 return free_all_bootmem_core(pgdat->bdata);
242} 259}
243 260
@@ -250,6 +267,10 @@ unsigned long __init free_all_bootmem(void)
250{ 267{
251 unsigned long total_pages = 0; 268 unsigned long total_pages = 0;
252 bootmem_data_t *bdata; 269 bootmem_data_t *bdata;
270 struct pglist_data *pgdat;
271
272 for_each_online_pgdat(pgdat)
273 reset_node_lowmem_managed_pages(pgdat);
253 274
254 list_for_each_entry(bdata, &bdata_list, list) 275 list_for_each_entry(bdata, &bdata_list, list)
255 total_pages += free_all_bootmem_core(bdata); 276 total_pages += free_all_bootmem_core(bdata);
@@ -377,21 +398,21 @@ void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
377 398
378/** 399/**
379 * free_bootmem - mark a page range as usable 400 * free_bootmem - mark a page range as usable
 380 * @addr: starting address of the range 401 * @physaddr: starting physical address of the range
381 * @size: size of the range in bytes 402 * @size: size of the range in bytes
382 * 403 *
383 * Partial pages will be considered reserved and left as they are. 404 * Partial pages will be considered reserved and left as they are.
384 * 405 *
385 * The range must be contiguous but may span node boundaries. 406 * The range must be contiguous but may span node boundaries.
386 */ 407 */
387void __init free_bootmem(unsigned long addr, unsigned long size) 408void __init free_bootmem(unsigned long physaddr, unsigned long size)
388{ 409{
389 unsigned long start, end; 410 unsigned long start, end;
390 411
391 kmemleak_free_part(__va(addr), size); 412 kmemleak_free_part(__va(physaddr), size);
392 413
393 start = PFN_UP(addr); 414 start = PFN_UP(physaddr);
394 end = PFN_DOWN(addr + size); 415 end = PFN_DOWN(physaddr + size);
395 416
396 mark_bootmem(start, end, 0, 0); 417 mark_bootmem(start, end, 0, 0);
397} 418}
@@ -439,12 +460,6 @@ int __init reserve_bootmem(unsigned long addr, unsigned long size,
439 return mark_bootmem(start, end, 1, flags); 460 return mark_bootmem(start, end, 1, flags);
440} 461}
441 462
442int __weak __init reserve_bootmem_generic(unsigned long phys, unsigned long len,
443 int flags)
444{
445 return reserve_bootmem(phys, len, flags);
446}
447
448static unsigned long __init align_idx(struct bootmem_data *bdata, 463static unsigned long __init align_idx(struct bootmem_data *bdata,
449 unsigned long idx, unsigned long step) 464 unsigned long idx, unsigned long step)
450{ 465{
@@ -575,27 +590,6 @@ find_block:
575 return NULL; 590 return NULL;
576} 591}
577 592
578static void * __init alloc_arch_preferred_bootmem(bootmem_data_t *bdata,
579 unsigned long size, unsigned long align,
580 unsigned long goal, unsigned long limit)
581{
582 if (WARN_ON_ONCE(slab_is_available()))
583 return kzalloc(size, GFP_NOWAIT);
584
585#ifdef CONFIG_HAVE_ARCH_BOOTMEM
586 {
587 bootmem_data_t *p_bdata;
588
589 p_bdata = bootmem_arch_preferred_node(bdata, size, align,
590 goal, limit);
591 if (p_bdata)
592 return alloc_bootmem_bdata(p_bdata, size, align,
593 goal, limit);
594 }
595#endif
596 return NULL;
597}
598
599static void * __init alloc_bootmem_core(unsigned long size, 593static void * __init alloc_bootmem_core(unsigned long size,
600 unsigned long align, 594 unsigned long align,
601 unsigned long goal, 595 unsigned long goal,
@@ -604,9 +598,8 @@ static void * __init alloc_bootmem_core(unsigned long size,
604 bootmem_data_t *bdata; 598 bootmem_data_t *bdata;
605 void *region; 599 void *region;
606 600
607 region = alloc_arch_preferred_bootmem(NULL, size, align, goal, limit); 601 if (WARN_ON_ONCE(slab_is_available()))
608 if (region) 602 return kzalloc(size, GFP_NOWAIT);
609 return region;
610 603
611 list_for_each_entry(bdata, &bdata_list, list) { 604 list_for_each_entry(bdata, &bdata_list, list) {
612 if (goal && bdata->node_low_pfn <= PFN_DOWN(goal)) 605 if (goal && bdata->node_low_pfn <= PFN_DOWN(goal))
@@ -704,11 +697,9 @@ void * __init ___alloc_bootmem_node_nopanic(pg_data_t *pgdat,
704{ 697{
705 void *ptr; 698 void *ptr;
706 699
700 if (WARN_ON_ONCE(slab_is_available()))
701 return kzalloc(size, GFP_NOWAIT);
707again: 702again:
708 ptr = alloc_arch_preferred_bootmem(pgdat->bdata, size,
709 align, goal, limit);
710 if (ptr)
711 return ptr;
712 703
713 /* do not panic in alloc_bootmem_bdata() */ 704 /* do not panic in alloc_bootmem_bdata() */
714 if (limit && goal + size > limit) 705 if (limit && goal + size > limit)
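As the updated kernel-doc notes, free_bootmem()/free_bootmem_late() round the physical range inward with PFN_UP()/PFN_DOWN(), so partially covered pages stay reserved. A small standalone sketch of that rounding, assuming 4K pages purely for illustration:

#include <stdio.h>

#define PAGE_SHIFT 12		/* assumed 4K pages, illustration only */
#define PAGE_SIZE  (1UL << PAGE_SHIFT)
#define PFN_UP(x)   (((x) + PAGE_SIZE - 1) >> PAGE_SHIFT)
#define PFN_DOWN(x) ((x) >> PAGE_SHIFT)

int main(void)
{
	unsigned long physaddr = 0x100800;	/* not page aligned */
	unsigned long size     = 0x3000;	/* three pages worth of bytes */

	/* Same rounding free_bootmem()/free_bootmem_late() perform above. */
	unsigned long start = PFN_UP(physaddr);
	unsigned long end   = PFN_DOWN(physaddr + size);

	/* Only fully covered frames are freed: pfns 0x101..0x102 here. */
	printf("freeing pfns [%#lx, %#lx)\n", start, end);
	return 0;
}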
diff --git a/mm/compaction.c b/mm/compaction.c
index 694eaabaaebd..5ad7f4f4d6f7 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -14,6 +14,7 @@
14#include <linux/backing-dev.h> 14#include <linux/backing-dev.h>
15#include <linux/sysctl.h> 15#include <linux/sysctl.h>
16#include <linux/sysfs.h> 16#include <linux/sysfs.h>
17#include <linux/balloon_compaction.h>
17#include "internal.h" 18#include "internal.h"
18 19
19#if defined CONFIG_COMPACTION || defined CONFIG_CMA 20#if defined CONFIG_COMPACTION || defined CONFIG_CMA
@@ -214,60 +215,6 @@ static bool suitable_migration_target(struct page *page)
214 return false; 215 return false;
215} 216}
216 217
217static void compact_capture_page(struct compact_control *cc)
218{
219 unsigned long flags;
220 int mtype, mtype_low, mtype_high;
221
222 if (!cc->page || *cc->page)
223 return;
224
225 /*
226 * For MIGRATE_MOVABLE allocations we capture a suitable page ASAP
 227 * regardless of the migratetype of the freelist it is captured from.
228 * This is fine because the order for a high-order MIGRATE_MOVABLE
229 * allocation is typically at least a pageblock size and overall
230 * fragmentation is not impaired. Other allocation types must
231 * capture pages from their own migratelist because otherwise they
232 * could pollute other pageblocks like MIGRATE_MOVABLE with
233 * difficult to move pages and making fragmentation worse overall.
234 */
235 if (cc->migratetype == MIGRATE_MOVABLE) {
236 mtype_low = 0;
237 mtype_high = MIGRATE_PCPTYPES;
238 } else {
239 mtype_low = cc->migratetype;
240 mtype_high = cc->migratetype + 1;
241 }
242
243 /* Speculatively examine the free lists without zone lock */
244 for (mtype = mtype_low; mtype < mtype_high; mtype++) {
245 int order;
246 for (order = cc->order; order < MAX_ORDER; order++) {
247 struct page *page;
248 struct free_area *area;
249 area = &(cc->zone->free_area[order]);
250 if (list_empty(&area->free_list[mtype]))
251 continue;
252
253 /* Take the lock and attempt capture of the page */
254 if (!compact_trylock_irqsave(&cc->zone->lock, &flags, cc))
255 return;
256 if (!list_empty(&area->free_list[mtype])) {
257 page = list_entry(area->free_list[mtype].next,
258 struct page, lru);
259 if (capture_free_page(page, cc->order, mtype)) {
260 spin_unlock_irqrestore(&cc->zone->lock,
261 flags);
262 *cc->page = page;
263 return;
264 }
265 }
266 spin_unlock_irqrestore(&cc->zone->lock, flags);
267 }
268 }
269}
270
271/* 218/*
272 * Isolate free pages onto a private freelist. Caller must hold zone->lock. 219 * Isolate free pages onto a private freelist. Caller must hold zone->lock.
273 * If @strict is true, will abort returning 0 on any invalid PFNs or non-free 220 * If @strict is true, will abort returning 0 on any invalid PFNs or non-free
@@ -356,6 +303,10 @@ static unsigned long isolate_freepages_block(struct compact_control *cc,
356 if (blockpfn == end_pfn) 303 if (blockpfn == end_pfn)
357 update_pageblock_skip(cc, valid_page, total_isolated, false); 304 update_pageblock_skip(cc, valid_page, total_isolated, false);
358 305
306 count_vm_events(COMPACTFREE_SCANNED, nr_scanned);
307 if (total_isolated)
308 count_vm_events(COMPACTISOLATED, total_isolated);
309
359 return total_isolated; 310 return total_isolated;
360} 311}
361 312
@@ -565,9 +516,24 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
565 goto next_pageblock; 516 goto next_pageblock;
566 } 517 }
567 518
568 /* Check may be lockless but that's ok as we recheck later */ 519 /*
569 if (!PageLRU(page)) 520 * Check may be lockless but that's ok as we recheck later.
 521 * It's possible to migrate LRU pages and balloon pages;
 522 * skip any other type of page.
523 */
524 if (!PageLRU(page)) {
525 if (unlikely(balloon_page_movable(page))) {
526 if (locked && balloon_page_isolate(page)) {
527 /* Successfully isolated */
528 cc->finished_update_migrate = true;
529 list_add(&page->lru, migratelist);
530 cc->nr_migratepages++;
531 nr_isolated++;
532 goto check_compact_cluster;
533 }
534 }
570 continue; 535 continue;
536 }
571 537
572 /* 538 /*
573 * PageLRU is set. lru_lock normally excludes isolation 539 * PageLRU is set. lru_lock normally excludes isolation
@@ -621,6 +587,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
621 cc->nr_migratepages++; 587 cc->nr_migratepages++;
622 nr_isolated++; 588 nr_isolated++;
623 589
590check_compact_cluster:
624 /* Avoid isolating too much */ 591 /* Avoid isolating too much */
625 if (cc->nr_migratepages == COMPACT_CLUSTER_MAX) { 592 if (cc->nr_migratepages == COMPACT_CLUSTER_MAX) {
626 ++low_pfn; 593 ++low_pfn;
@@ -646,6 +613,10 @@ next_pageblock:
646 613
647 trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated); 614 trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated);
648 615
616 count_vm_events(COMPACTMIGRATE_SCANNED, nr_scanned);
617 if (nr_isolated)
618 count_vm_events(COMPACTISOLATED, nr_isolated);
619
649 return low_pfn; 620 return low_pfn;
650} 621}
651 622
@@ -936,6 +907,60 @@ unsigned long compaction_suitable(struct zone *zone, int order)
936 return COMPACT_CONTINUE; 907 return COMPACT_CONTINUE;
937} 908}
938 909
910static void compact_capture_page(struct compact_control *cc)
911{
912 unsigned long flags;
913 int mtype, mtype_low, mtype_high;
914
915 if (!cc->page || *cc->page)
916 return;
917
918 /*
919 * For MIGRATE_MOVABLE allocations we capture a suitable page ASAP
 920 * regardless of the migratetype of the freelist it is captured from.
921 * This is fine because the order for a high-order MIGRATE_MOVABLE
922 * allocation is typically at least a pageblock size and overall
923 * fragmentation is not impaired. Other allocation types must
924 * capture pages from their own migratelist because otherwise they
925 * could pollute other pageblocks like MIGRATE_MOVABLE with
926 * difficult to move pages and making fragmentation worse overall.
927 */
928 if (cc->migratetype == MIGRATE_MOVABLE) {
929 mtype_low = 0;
930 mtype_high = MIGRATE_PCPTYPES;
931 } else {
932 mtype_low = cc->migratetype;
933 mtype_high = cc->migratetype + 1;
934 }
935
936 /* Speculatively examine the free lists without zone lock */
937 for (mtype = mtype_low; mtype < mtype_high; mtype++) {
938 int order;
939 for (order = cc->order; order < MAX_ORDER; order++) {
940 struct page *page;
941 struct free_area *area;
942 area = &(cc->zone->free_area[order]);
943 if (list_empty(&area->free_list[mtype]))
944 continue;
945
946 /* Take the lock and attempt capture of the page */
947 if (!compact_trylock_irqsave(&cc->zone->lock, &flags, cc))
948 return;
949 if (!list_empty(&area->free_list[mtype])) {
950 page = list_entry(area->free_list[mtype].next,
951 struct page, lru);
952 if (capture_free_page(page, cc->order, mtype)) {
953 spin_unlock_irqrestore(&cc->zone->lock,
954 flags);
955 *cc->page = page;
956 return;
957 }
958 }
959 spin_unlock_irqrestore(&cc->zone->lock, flags);
960 }
961 }
962}
963
939static int compact_zone(struct zone *zone, struct compact_control *cc) 964static int compact_zone(struct zone *zone, struct compact_control *cc)
940{ 965{
941 int ret; 966 int ret;
@@ -986,7 +1011,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
986 switch (isolate_migratepages(zone, cc)) { 1011 switch (isolate_migratepages(zone, cc)) {
987 case ISOLATE_ABORT: 1012 case ISOLATE_ABORT:
988 ret = COMPACT_PARTIAL; 1013 ret = COMPACT_PARTIAL;
989 putback_lru_pages(&cc->migratepages); 1014 putback_movable_pages(&cc->migratepages);
990 cc->nr_migratepages = 0; 1015 cc->nr_migratepages = 0;
991 goto out; 1016 goto out;
992 case ISOLATE_NONE: 1017 case ISOLATE_NONE:
@@ -998,20 +1023,17 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
998 nr_migrate = cc->nr_migratepages; 1023 nr_migrate = cc->nr_migratepages;
999 err = migrate_pages(&cc->migratepages, compaction_alloc, 1024 err = migrate_pages(&cc->migratepages, compaction_alloc,
1000 (unsigned long)cc, false, 1025 (unsigned long)cc, false,
1001 cc->sync ? MIGRATE_SYNC_LIGHT : MIGRATE_ASYNC); 1026 cc->sync ? MIGRATE_SYNC_LIGHT : MIGRATE_ASYNC,
1027 MR_COMPACTION);
1002 update_nr_listpages(cc); 1028 update_nr_listpages(cc);
1003 nr_remaining = cc->nr_migratepages; 1029 nr_remaining = cc->nr_migratepages;
1004 1030
1005 count_vm_event(COMPACTBLOCKS);
1006 count_vm_events(COMPACTPAGES, nr_migrate - nr_remaining);
1007 if (nr_remaining)
1008 count_vm_events(COMPACTPAGEFAILED, nr_remaining);
1009 trace_mm_compaction_migratepages(nr_migrate - nr_remaining, 1031 trace_mm_compaction_migratepages(nr_migrate - nr_remaining,
1010 nr_remaining); 1032 nr_remaining);
1011 1033
1012 /* Release LRU pages not migrated */ 1034 /* Release isolated pages not migrated */
1013 if (err) { 1035 if (err) {
1014 putback_lru_pages(&cc->migratepages); 1036 putback_movable_pages(&cc->migratepages);
1015 cc->nr_migratepages = 0; 1037 cc->nr_migratepages = 0;
1016 if (err == -ENOMEM) { 1038 if (err == -ENOMEM) {
1017 ret = COMPACT_PARTIAL; 1039 ret = COMPACT_PARTIAL;
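The per-block COMPACTBLOCKS/COMPACTPAGES counters give way to per-page scan and isolation counters. Assuming the usual lower-cased vmstat naming, they surface in /proc/vmstat as compact_migrate_scanned, compact_free_scanned and compact_isolated; a small userspace sketch for watching them:

#include <stdio.h>
#include <string.h>

/* Print the compaction counters introduced by the events added above. */
int main(void)
{
	char line[256];
	FILE *f = fopen("/proc/vmstat", "r");

	if (!f) {
		perror("/proc/vmstat");
		return 1;
	}
	while (fgets(line, sizeof(line), f)) {
		if (!strncmp(line, "compact_migrate_scanned", 23) ||
		    !strncmp(line, "compact_free_scanned", 20) ||
		    !strncmp(line, "compact_isolated", 16))
			fputs(line, stdout);
	}
	fclose(f);
	return 0;
}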
diff --git a/mm/dmapool.c b/mm/dmapool.c
index c5ab33bca0a8..c69781e97cf9 100644
--- a/mm/dmapool.c
+++ b/mm/dmapool.c
@@ -50,7 +50,6 @@ struct dma_pool { /* the pool */
50 size_t allocation; 50 size_t allocation;
51 size_t boundary; 51 size_t boundary;
52 char name[32]; 52 char name[32];
53 wait_queue_head_t waitq;
54 struct list_head pools; 53 struct list_head pools;
55}; 54};
56 55
@@ -62,8 +61,6 @@ struct dma_page { /* cacheable header for 'allocation' bytes */
62 unsigned int offset; 61 unsigned int offset;
63}; 62};
64 63
65#define POOL_TIMEOUT_JIFFIES ((100 /* msec */ * HZ) / 1000)
66
67static DEFINE_MUTEX(pools_lock); 64static DEFINE_MUTEX(pools_lock);
68 65
69static ssize_t 66static ssize_t
@@ -172,7 +169,6 @@ struct dma_pool *dma_pool_create(const char *name, struct device *dev,
172 retval->size = size; 169 retval->size = size;
173 retval->boundary = boundary; 170 retval->boundary = boundary;
174 retval->allocation = allocation; 171 retval->allocation = allocation;
175 init_waitqueue_head(&retval->waitq);
176 172
177 if (dev) { 173 if (dev) {
178 int ret; 174 int ret;
@@ -227,7 +223,6 @@ static struct dma_page *pool_alloc_page(struct dma_pool *pool, gfp_t mem_flags)
227 memset(page->vaddr, POOL_POISON_FREED, pool->allocation); 223 memset(page->vaddr, POOL_POISON_FREED, pool->allocation);
228#endif 224#endif
229 pool_initialise_page(pool, page); 225 pool_initialise_page(pool, page);
230 list_add(&page->page_list, &pool->page_list);
231 page->in_use = 0; 226 page->in_use = 0;
232 page->offset = 0; 227 page->offset = 0;
233 } else { 228 } else {
@@ -315,30 +310,21 @@ void *dma_pool_alloc(struct dma_pool *pool, gfp_t mem_flags,
315 might_sleep_if(mem_flags & __GFP_WAIT); 310 might_sleep_if(mem_flags & __GFP_WAIT);
316 311
317 spin_lock_irqsave(&pool->lock, flags); 312 spin_lock_irqsave(&pool->lock, flags);
318 restart:
319 list_for_each_entry(page, &pool->page_list, page_list) { 313 list_for_each_entry(page, &pool->page_list, page_list) {
320 if (page->offset < pool->allocation) 314 if (page->offset < pool->allocation)
321 goto ready; 315 goto ready;
322 } 316 }
323 page = pool_alloc_page(pool, GFP_ATOMIC);
324 if (!page) {
325 if (mem_flags & __GFP_WAIT) {
326 DECLARE_WAITQUEUE(wait, current);
327 317
328 __set_current_state(TASK_UNINTERRUPTIBLE); 318 /* pool_alloc_page() might sleep, so temporarily drop &pool->lock */
329 __add_wait_queue(&pool->waitq, &wait); 319 spin_unlock_irqrestore(&pool->lock, flags);
330 spin_unlock_irqrestore(&pool->lock, flags);
331 320
332 schedule_timeout(POOL_TIMEOUT_JIFFIES); 321 page = pool_alloc_page(pool, mem_flags);
322 if (!page)
323 return NULL;
333 324
334 spin_lock_irqsave(&pool->lock, flags); 325 spin_lock_irqsave(&pool->lock, flags);
335 __remove_wait_queue(&pool->waitq, &wait);
336 goto restart;
337 }
338 retval = NULL;
339 goto done;
340 }
341 326
327 list_add(&page->page_list, &pool->page_list);
342 ready: 328 ready:
343 page->in_use++; 329 page->in_use++;
344 offset = page->offset; 330 offset = page->offset;
@@ -346,9 +332,32 @@ void *dma_pool_alloc(struct dma_pool *pool, gfp_t mem_flags,
346 retval = offset + page->vaddr; 332 retval = offset + page->vaddr;
347 *handle = offset + page->dma; 333 *handle = offset + page->dma;
348#ifdef DMAPOOL_DEBUG 334#ifdef DMAPOOL_DEBUG
335 {
336 int i;
337 u8 *data = retval;
338 /* page->offset is stored in first 4 bytes */
339 for (i = sizeof(page->offset); i < pool->size; i++) {
340 if (data[i] == POOL_POISON_FREED)
341 continue;
342 if (pool->dev)
343 dev_err(pool->dev,
 344 "dma_pool_alloc %s, %p (corrupted)\n",
345 pool->name, retval);
346 else
 347 pr_err("dma_pool_alloc %s, %p (corrupted)\n",
348 pool->name, retval);
349
350 /*
351 * Dump the first 4 bytes even if they are not
352 * POOL_POISON_FREED
353 */
354 print_hex_dump(KERN_ERR, "", DUMP_PREFIX_OFFSET, 16, 1,
355 data, pool->size, 1);
356 break;
357 }
358 }
349 memset(retval, POOL_POISON_ALLOCATED, pool->size); 359 memset(retval, POOL_POISON_ALLOCATED, pool->size);
350#endif 360#endif
351 done:
352 spin_unlock_irqrestore(&pool->lock, flags); 361 spin_unlock_irqrestore(&pool->lock, flags);
353 return retval; 362 return retval;
354} 363}
@@ -435,8 +444,6 @@ void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t dma)
435 page->in_use--; 444 page->in_use--;
436 *(int *)vaddr = page->offset; 445 *(int *)vaddr = page->offset;
437 page->offset = offset; 446 page->offset = offset;
438 if (waitqueue_active(&pool->waitq))
439 wake_up_locked(&pool->waitq);
440 /* 447 /*
441 * Resist a temptation to do 448 * Resist a temptation to do
442 * if (!is_page_busy(page)) pool_free_page(pool, page); 449 * if (!is_page_busy(page)) pool_free_page(pool, page);
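The calling convention of the pool API is unchanged by this cleanup; a GFP_KERNEL caller now simply sleeps inside the page allocator rather than on the removed pool waitqueue. A minimal usage sketch (my_dev/my_setup are illustrative names):

#include <linux/dmapool.h>
#include <linux/dma-mapping.h>

/* Illustrative only: allocate a small DMA-coherent descriptor from a pool. */
static int my_setup(struct device *my_dev)
{
	struct dma_pool *pool;
	dma_addr_t handle;
	void *desc;

	/* 64-byte objects, 64-byte aligned, no boundary-crossing constraint */
	pool = dma_pool_create("my_descs", my_dev, 64, 64, 0);
	if (!pool)
		return -ENOMEM;

	/* May sleep; with this patch that happens in the page allocator. */
	desc = dma_pool_alloc(pool, GFP_KERNEL, &handle);
	if (!desc) {
		dma_pool_destroy(pool);
		return -ENOMEM;
	}

	/* ... program the device with 'handle', touch 'desc' from the CPU ... */

	dma_pool_free(pool, desc, handle);
	dma_pool_destroy(pool);
	return 0;
}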
diff --git a/mm/highmem.c b/mm/highmem.c
index 2da13a5c50e2..d999077431df 100644
--- a/mm/highmem.c
+++ b/mm/highmem.c
@@ -99,7 +99,7 @@ struct page *kmap_to_page(void *vaddr)
99 unsigned long addr = (unsigned long)vaddr; 99 unsigned long addr = (unsigned long)vaddr;
100 100
101 if (addr >= PKMAP_ADDR(0) && addr < PKMAP_ADDR(LAST_PKMAP)) { 101 if (addr >= PKMAP_ADDR(0) && addr < PKMAP_ADDR(LAST_PKMAP)) {
102 int i = (addr - PKMAP_ADDR(0)) >> PAGE_SHIFT; 102 int i = PKMAP_NR(addr);
103 return pte_page(pkmap_page_table[i]); 103 return pte_page(pkmap_page_table[i]);
104 } 104 }
105 105
@@ -137,8 +137,7 @@ static void flush_all_zero_pkmaps(void)
137 * So no dangers, even with speculative execution. 137 * So no dangers, even with speculative execution.
138 */ 138 */
139 page = pte_page(pkmap_page_table[i]); 139 page = pte_page(pkmap_page_table[i]);
140 pte_clear(&init_mm, (unsigned long)page_address(page), 140 pte_clear(&init_mm, PKMAP_ADDR(i), &pkmap_page_table[i]);
141 &pkmap_page_table[i]);
142 141
143 set_page_address(page, NULL); 142 set_page_address(page, NULL);
144 need_flush = 1; 143 need_flush = 1;
@@ -324,11 +323,7 @@ struct page_address_map {
324 struct list_head list; 323 struct list_head list;
325}; 324};
326 325
327/* 326static struct page_address_map page_address_maps[LAST_PKMAP];
328 * page_address_map freelist, allocated from page_address_maps.
329 */
330static struct list_head page_address_pool; /* freelist */
331static spinlock_t pool_lock; /* protects page_address_pool */
332 327
333/* 328/*
334 * Hash table bucket 329 * Hash table bucket
@@ -393,14 +388,7 @@ void set_page_address(struct page *page, void *virtual)
393 388
394 pas = page_slot(page); 389 pas = page_slot(page);
395 if (virtual) { /* Add */ 390 if (virtual) { /* Add */
396 BUG_ON(list_empty(&page_address_pool)); 391 pam = &page_address_maps[PKMAP_NR((unsigned long)virtual)];
397
398 spin_lock_irqsave(&pool_lock, flags);
399 pam = list_entry(page_address_pool.next,
400 struct page_address_map, list);
401 list_del(&pam->list);
402 spin_unlock_irqrestore(&pool_lock, flags);
403
404 pam->page = page; 392 pam->page = page;
405 pam->virtual = virtual; 393 pam->virtual = virtual;
406 394
@@ -413,9 +401,6 @@ void set_page_address(struct page *page, void *virtual)
413 if (pam->page == page) { 401 if (pam->page == page) {
414 list_del(&pam->list); 402 list_del(&pam->list);
415 spin_unlock_irqrestore(&pas->lock, flags); 403 spin_unlock_irqrestore(&pas->lock, flags);
416 spin_lock_irqsave(&pool_lock, flags);
417 list_add_tail(&pam->list, &page_address_pool);
418 spin_unlock_irqrestore(&pool_lock, flags);
419 goto done; 404 goto done;
420 } 405 }
421 } 406 }
@@ -425,20 +410,14 @@ done:
425 return; 410 return;
426} 411}
427 412
428static struct page_address_map page_address_maps[LAST_PKMAP];
429
430void __init page_address_init(void) 413void __init page_address_init(void)
431{ 414{
432 int i; 415 int i;
433 416
434 INIT_LIST_HEAD(&page_address_pool);
435 for (i = 0; i < ARRAY_SIZE(page_address_maps); i++)
436 list_add(&page_address_maps[i].list, &page_address_pool);
437 for (i = 0; i < ARRAY_SIZE(page_address_htable); i++) { 417 for (i = 0; i < ARRAY_SIZE(page_address_htable); i++) {
438 INIT_LIST_HEAD(&page_address_htable[i].lh); 418 INIT_LIST_HEAD(&page_address_htable[i].lh);
439 spin_lock_init(&page_address_htable[i].lock); 419 spin_lock_init(&page_address_htable[i].lock);
440 } 420 }
441 spin_lock_init(&pool_lock);
442} 421}
443 422
444#endif /* defined(CONFIG_HIGHMEM) && !defined(WANT_PAGE_VIRTUAL) */ 423#endif /* defined(CONFIG_HIGHMEM) && !defined(WANT_PAGE_VIRTUAL) */
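The rewrite relies on PKMAP_NR() being the exact inverse of PKMAP_ADDR(), so a page_address_map slot can be computed directly from the mapped virtual address instead of being pulled off the old freelist. A standalone sketch of that invariant, with illustrative macro values (the real ones live in the architecture's <asm/highmem.h>):

#include <stdio.h>
#include <assert.h>

#define PAGE_SHIFT   12			/* assumed 4K pages */
#define PKMAP_BASE   0xff800000UL	/* illustrative value */
#define LAST_PKMAP   1024		/* illustrative value */
#define PKMAP_ADDR(nr)   (PKMAP_BASE + ((unsigned long)(nr) << PAGE_SHIFT))
#define PKMAP_NR(virt)   (((virt) - PKMAP_BASE) >> PAGE_SHIFT)

int main(void)
{
	int i;

	/* Each slot index round-trips through its mapped address. */
	for (i = 0; i < LAST_PKMAP; i++)
		assert(PKMAP_NR(PKMAP_ADDR(i)) == (unsigned long)i);

	printf("pkmap slot <-> address mapping is consistent\n");
	return 0;
}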
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 40f17c34b415..32754eece63e 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -12,12 +12,15 @@
12#include <linux/mmu_notifier.h> 12#include <linux/mmu_notifier.h>
13#include <linux/rmap.h> 13#include <linux/rmap.h>
14#include <linux/swap.h> 14#include <linux/swap.h>
15#include <linux/shrinker.h>
15#include <linux/mm_inline.h> 16#include <linux/mm_inline.h>
16#include <linux/kthread.h> 17#include <linux/kthread.h>
17#include <linux/khugepaged.h> 18#include <linux/khugepaged.h>
18#include <linux/freezer.h> 19#include <linux/freezer.h>
19#include <linux/mman.h> 20#include <linux/mman.h>
20#include <linux/pagemap.h> 21#include <linux/pagemap.h>
22#include <linux/migrate.h>
23
21#include <asm/tlb.h> 24#include <asm/tlb.h>
22#include <asm/pgalloc.h> 25#include <asm/pgalloc.h>
23#include "internal.h" 26#include "internal.h"
@@ -37,7 +40,8 @@ unsigned long transparent_hugepage_flags __read_mostly =
37 (1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG)| 40 (1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG)|
38#endif 41#endif
39 (1<<TRANSPARENT_HUGEPAGE_DEFRAG_FLAG)| 42 (1<<TRANSPARENT_HUGEPAGE_DEFRAG_FLAG)|
40 (1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG); 43 (1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG)|
44 (1<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
41 45
42/* default scan 8*512 pte (or vmas) every 30 second */ 46/* default scan 8*512 pte (or vmas) every 30 second */
43static unsigned int khugepaged_pages_to_scan __read_mostly = HPAGE_PMD_NR*8; 47static unsigned int khugepaged_pages_to_scan __read_mostly = HPAGE_PMD_NR*8;
@@ -159,6 +163,77 @@ static int start_khugepaged(void)
159 return err; 163 return err;
160} 164}
161 165
166static atomic_t huge_zero_refcount;
167static unsigned long huge_zero_pfn __read_mostly;
168
169static inline bool is_huge_zero_pfn(unsigned long pfn)
170{
171 unsigned long zero_pfn = ACCESS_ONCE(huge_zero_pfn);
172 return zero_pfn && pfn == zero_pfn;
173}
174
175static inline bool is_huge_zero_pmd(pmd_t pmd)
176{
177 return is_huge_zero_pfn(pmd_pfn(pmd));
178}
179
180static unsigned long get_huge_zero_page(void)
181{
182 struct page *zero_page;
183retry:
184 if (likely(atomic_inc_not_zero(&huge_zero_refcount)))
185 return ACCESS_ONCE(huge_zero_pfn);
186
187 zero_page = alloc_pages((GFP_TRANSHUGE | __GFP_ZERO) & ~__GFP_MOVABLE,
188 HPAGE_PMD_ORDER);
189 if (!zero_page) {
190 count_vm_event(THP_ZERO_PAGE_ALLOC_FAILED);
191 return 0;
192 }
193 count_vm_event(THP_ZERO_PAGE_ALLOC);
194 preempt_disable();
195 if (cmpxchg(&huge_zero_pfn, 0, page_to_pfn(zero_page))) {
196 preempt_enable();
197 __free_page(zero_page);
198 goto retry;
199 }
200
 201 /* We take an additional reference here. It will be put back by the shrinker */
202 atomic_set(&huge_zero_refcount, 2);
203 preempt_enable();
204 return ACCESS_ONCE(huge_zero_pfn);
205}
206
207static void put_huge_zero_page(void)
208{
209 /*
210 * Counter should never go to zero here. Only shrinker can put
211 * last reference.
212 */
213 BUG_ON(atomic_dec_and_test(&huge_zero_refcount));
214}
215
216static int shrink_huge_zero_page(struct shrinker *shrink,
217 struct shrink_control *sc)
218{
219 if (!sc->nr_to_scan)
220 /* we can free zero page only if last reference remains */
221 return atomic_read(&huge_zero_refcount) == 1 ? HPAGE_PMD_NR : 0;
222
223 if (atomic_cmpxchg(&huge_zero_refcount, 1, 0) == 1) {
224 unsigned long zero_pfn = xchg(&huge_zero_pfn, 0);
225 BUG_ON(zero_pfn == 0);
226 __free_page(__pfn_to_page(zero_pfn));
227 }
228
229 return 0;
230}
231
232static struct shrinker huge_zero_page_shrinker = {
233 .shrink = shrink_huge_zero_page,
234 .seeks = DEFAULT_SEEKS,
235};
236
162#ifdef CONFIG_SYSFS 237#ifdef CONFIG_SYSFS
163 238
164static ssize_t double_flag_show(struct kobject *kobj, 239static ssize_t double_flag_show(struct kobject *kobj,
@@ -284,6 +359,20 @@ static ssize_t defrag_store(struct kobject *kobj,
284static struct kobj_attribute defrag_attr = 359static struct kobj_attribute defrag_attr =
285 __ATTR(defrag, 0644, defrag_show, defrag_store); 360 __ATTR(defrag, 0644, defrag_show, defrag_store);
286 361
362static ssize_t use_zero_page_show(struct kobject *kobj,
363 struct kobj_attribute *attr, char *buf)
364{
365 return single_flag_show(kobj, attr, buf,
366 TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
367}
368static ssize_t use_zero_page_store(struct kobject *kobj,
369 struct kobj_attribute *attr, const char *buf, size_t count)
370{
371 return single_flag_store(kobj, attr, buf, count,
372 TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
373}
374static struct kobj_attribute use_zero_page_attr =
375 __ATTR(use_zero_page, 0644, use_zero_page_show, use_zero_page_store);
287#ifdef CONFIG_DEBUG_VM 376#ifdef CONFIG_DEBUG_VM
288static ssize_t debug_cow_show(struct kobject *kobj, 377static ssize_t debug_cow_show(struct kobject *kobj,
289 struct kobj_attribute *attr, char *buf) 378 struct kobj_attribute *attr, char *buf)
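The new attribute sits alongside the other THP knobs, so the huge zero page can be toggled at runtime; a small userspace sketch, assuming the usual sysfs location /sys/kernel/mm/transparent_hugepage/use_zero_page:

#include <stdio.h>

/* Toggle the huge zero page via the attribute added above. */
static int set_use_zero_page(int enable)
{
	FILE *f = fopen("/sys/kernel/mm/transparent_hugepage/use_zero_page", "w");

	if (!f)
		return -1;
	fprintf(f, "%d\n", enable);
	return fclose(f);
}

int main(void)
{
	return set_use_zero_page(1) ? 1 : 0;
}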
@@ -305,6 +394,7 @@ static struct kobj_attribute debug_cow_attr =
305static struct attribute *hugepage_attr[] = { 394static struct attribute *hugepage_attr[] = {
306 &enabled_attr.attr, 395 &enabled_attr.attr,
307 &defrag_attr.attr, 396 &defrag_attr.attr,
397 &use_zero_page_attr.attr,
308#ifdef CONFIG_DEBUG_VM 398#ifdef CONFIG_DEBUG_VM
309 &debug_cow_attr.attr, 399 &debug_cow_attr.attr,
310#endif 400#endif
@@ -550,6 +640,8 @@ static int __init hugepage_init(void)
550 goto out; 640 goto out;
551 } 641 }
552 642
643 register_shrinker(&huge_zero_page_shrinker);
644
553 /* 645 /*
554 * By default disable transparent hugepages on smaller systems, 646 * By default disable transparent hugepages on smaller systems,
555 * where the extra memory used could hurt more than TLB overhead 647 * where the extra memory used could hurt more than TLB overhead
@@ -599,13 +691,22 @@ out:
599} 691}
600__setup("transparent_hugepage=", setup_transparent_hugepage); 692__setup("transparent_hugepage=", setup_transparent_hugepage);
601 693
602static inline pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma) 694pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
603{ 695{
604 if (likely(vma->vm_flags & VM_WRITE)) 696 if (likely(vma->vm_flags & VM_WRITE))
605 pmd = pmd_mkwrite(pmd); 697 pmd = pmd_mkwrite(pmd);
606 return pmd; 698 return pmd;
607} 699}
608 700
701static inline pmd_t mk_huge_pmd(struct page *page, struct vm_area_struct *vma)
702{
703 pmd_t entry;
704 entry = mk_pmd(page, vma->vm_page_prot);
705 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
706 entry = pmd_mkhuge(entry);
707 return entry;
708}
709
609static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, 710static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
610 struct vm_area_struct *vma, 711 struct vm_area_struct *vma,
611 unsigned long haddr, pmd_t *pmd, 712 unsigned long haddr, pmd_t *pmd,
@@ -629,9 +730,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
629 pte_free(mm, pgtable); 730 pte_free(mm, pgtable);
630 } else { 731 } else {
631 pmd_t entry; 732 pmd_t entry;
632 entry = mk_pmd(page, vma->vm_page_prot); 733 entry = mk_huge_pmd(page, vma);
633 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
634 entry = pmd_mkhuge(entry);
635 /* 734 /*
636 * The spinlocking to take the lru_lock inside 735 * The spinlocking to take the lru_lock inside
637 * page_add_new_anon_rmap() acts as a full memory 736 * page_add_new_anon_rmap() acts as a full memory
@@ -671,6 +770,22 @@ static inline struct page *alloc_hugepage(int defrag)
671} 770}
672#endif 771#endif
673 772
773static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm,
774 struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd,
775 unsigned long zero_pfn)
776{
777 pmd_t entry;
778 if (!pmd_none(*pmd))
779 return false;
780 entry = pfn_pmd(zero_pfn, vma->vm_page_prot);
781 entry = pmd_wrprotect(entry);
782 entry = pmd_mkhuge(entry);
783 set_pmd_at(mm, haddr, pmd, entry);
784 pgtable_trans_huge_deposit(mm, pgtable);
785 mm->nr_ptes++;
786 return true;
787}
788
674int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, 789int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
675 unsigned long address, pmd_t *pmd, 790 unsigned long address, pmd_t *pmd,
676 unsigned int flags) 791 unsigned int flags)
@@ -684,6 +799,30 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
684 return VM_FAULT_OOM; 799 return VM_FAULT_OOM;
685 if (unlikely(khugepaged_enter(vma))) 800 if (unlikely(khugepaged_enter(vma)))
686 return VM_FAULT_OOM; 801 return VM_FAULT_OOM;
802 if (!(flags & FAULT_FLAG_WRITE) &&
803 transparent_hugepage_use_zero_page()) {
804 pgtable_t pgtable;
805 unsigned long zero_pfn;
806 bool set;
807 pgtable = pte_alloc_one(mm, haddr);
808 if (unlikely(!pgtable))
809 return VM_FAULT_OOM;
810 zero_pfn = get_huge_zero_page();
811 if (unlikely(!zero_pfn)) {
812 pte_free(mm, pgtable);
813 count_vm_event(THP_FAULT_FALLBACK);
814 goto out;
815 }
816 spin_lock(&mm->page_table_lock);
817 set = set_huge_zero_page(pgtable, mm, vma, haddr, pmd,
818 zero_pfn);
819 spin_unlock(&mm->page_table_lock);
820 if (!set) {
821 pte_free(mm, pgtable);
822 put_huge_zero_page();
823 }
824 return 0;
825 }
687 page = alloc_hugepage_vma(transparent_hugepage_defrag(vma), 826 page = alloc_hugepage_vma(transparent_hugepage_defrag(vma),
688 vma, haddr, numa_node_id(), 0); 827 vma, haddr, numa_node_id(), 0);
689 if (unlikely(!page)) { 828 if (unlikely(!page)) {
@@ -710,7 +849,8 @@ out:
710 * run pte_offset_map on the pmd, if an huge pmd could 849 * run pte_offset_map on the pmd, if an huge pmd could
711 * materialize from under us from a different thread. 850 * materialize from under us from a different thread.
712 */ 851 */
713 if (unlikely(__pte_alloc(mm, vma, pmd, address))) 852 if (unlikely(pmd_none(*pmd)) &&
853 unlikely(__pte_alloc(mm, vma, pmd, address)))
714 return VM_FAULT_OOM; 854 return VM_FAULT_OOM;
715 /* if an huge pmd materialized from under us just retry later */ 855 /* if an huge pmd materialized from under us just retry later */
716 if (unlikely(pmd_trans_huge(*pmd))) 856 if (unlikely(pmd_trans_huge(*pmd)))
@@ -748,6 +888,26 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
748 pte_free(dst_mm, pgtable); 888 pte_free(dst_mm, pgtable);
749 goto out_unlock; 889 goto out_unlock;
750 } 890 }
891 /*
892 * mm->page_table_lock is enough to be sure that huge zero pmd is not
893 * under splitting since we don't split the page itself, only pmd to
894 * a page table.
895 */
896 if (is_huge_zero_pmd(pmd)) {
897 unsigned long zero_pfn;
898 bool set;
899 /*
900 * get_huge_zero_page() will never allocate a new page here,
901 * since we already have a zero page to copy. It just takes a
902 * reference.
903 */
904 zero_pfn = get_huge_zero_page();
905 set = set_huge_zero_page(pgtable, dst_mm, vma, addr, dst_pmd,
906 zero_pfn);
907 BUG_ON(!set); /* unexpected !pmd_none(dst_pmd) */
908 ret = 0;
909 goto out_unlock;
910 }
751 if (unlikely(pmd_trans_splitting(pmd))) { 911 if (unlikely(pmd_trans_splitting(pmd))) {
752 /* split huge page running from under us */ 912 /* split huge page running from under us */
753 spin_unlock(&src_mm->page_table_lock); 913 spin_unlock(&src_mm->page_table_lock);
@@ -777,6 +937,102 @@ out:
777 return ret; 937 return ret;
778} 938}
779 939
940void huge_pmd_set_accessed(struct mm_struct *mm,
941 struct vm_area_struct *vma,
942 unsigned long address,
943 pmd_t *pmd, pmd_t orig_pmd,
944 int dirty)
945{
946 pmd_t entry;
947 unsigned long haddr;
948
949 spin_lock(&mm->page_table_lock);
950 if (unlikely(!pmd_same(*pmd, orig_pmd)))
951 goto unlock;
952
953 entry = pmd_mkyoung(orig_pmd);
954 haddr = address & HPAGE_PMD_MASK;
955 if (pmdp_set_access_flags(vma, haddr, pmd, entry, dirty))
956 update_mmu_cache_pmd(vma, address, pmd);
957
958unlock:
959 spin_unlock(&mm->page_table_lock);
960}
961
962static int do_huge_pmd_wp_zero_page_fallback(struct mm_struct *mm,
963 struct vm_area_struct *vma, unsigned long address,
964 pmd_t *pmd, pmd_t orig_pmd, unsigned long haddr)
965{
966 pgtable_t pgtable;
967 pmd_t _pmd;
968 struct page *page;
969 int i, ret = 0;
970 unsigned long mmun_start; /* For mmu_notifiers */
971 unsigned long mmun_end; /* For mmu_notifiers */
972
973 page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
974 if (!page) {
975 ret |= VM_FAULT_OOM;
976 goto out;
977 }
978
979 if (mem_cgroup_newpage_charge(page, mm, GFP_KERNEL)) {
980 put_page(page);
981 ret |= VM_FAULT_OOM;
982 goto out;
983 }
984
985 clear_user_highpage(page, address);
986 __SetPageUptodate(page);
987
988 mmun_start = haddr;
989 mmun_end = haddr + HPAGE_PMD_SIZE;
990 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
991
992 spin_lock(&mm->page_table_lock);
993 if (unlikely(!pmd_same(*pmd, orig_pmd)))
994 goto out_free_page;
995
996 pmdp_clear_flush(vma, haddr, pmd);
997 /* leave pmd empty until pte is filled */
998
999 pgtable = pgtable_trans_huge_withdraw(mm);
1000 pmd_populate(mm, &_pmd, pgtable);
1001
1002 for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
1003 pte_t *pte, entry;
1004 if (haddr == (address & PAGE_MASK)) {
1005 entry = mk_pte(page, vma->vm_page_prot);
1006 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
1007 page_add_new_anon_rmap(page, vma, haddr);
1008 } else {
1009 entry = pfn_pte(my_zero_pfn(haddr), vma->vm_page_prot);
1010 entry = pte_mkspecial(entry);
1011 }
1012 pte = pte_offset_map(&_pmd, haddr);
1013 VM_BUG_ON(!pte_none(*pte));
1014 set_pte_at(mm, haddr, pte, entry);
1015 pte_unmap(pte);
1016 }
1017 smp_wmb(); /* make pte visible before pmd */
1018 pmd_populate(mm, pmd, pgtable);
1019 spin_unlock(&mm->page_table_lock);
1020 put_huge_zero_page();
1021 inc_mm_counter(mm, MM_ANONPAGES);
1022
1023 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
1024
1025 ret |= VM_FAULT_WRITE;
1026out:
1027 return ret;
1028out_free_page:
1029 spin_unlock(&mm->page_table_lock);
1030 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
1031 mem_cgroup_uncharge_page(page);
1032 put_page(page);
1033 goto out;
1034}
1035
780static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, 1036static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
781 struct vm_area_struct *vma, 1037 struct vm_area_struct *vma,
782 unsigned long address, 1038 unsigned long address,
@@ -883,19 +1139,21 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
883 unsigned long address, pmd_t *pmd, pmd_t orig_pmd) 1139 unsigned long address, pmd_t *pmd, pmd_t orig_pmd)
884{ 1140{
885 int ret = 0; 1141 int ret = 0;
886 struct page *page, *new_page; 1142 struct page *page = NULL, *new_page;
887 unsigned long haddr; 1143 unsigned long haddr;
888 unsigned long mmun_start; /* For mmu_notifiers */ 1144 unsigned long mmun_start; /* For mmu_notifiers */
889 unsigned long mmun_end; /* For mmu_notifiers */ 1145 unsigned long mmun_end; /* For mmu_notifiers */
890 1146
891 VM_BUG_ON(!vma->anon_vma); 1147 VM_BUG_ON(!vma->anon_vma);
1148 haddr = address & HPAGE_PMD_MASK;
1149 if (is_huge_zero_pmd(orig_pmd))
1150 goto alloc;
892 spin_lock(&mm->page_table_lock); 1151 spin_lock(&mm->page_table_lock);
893 if (unlikely(!pmd_same(*pmd, orig_pmd))) 1152 if (unlikely(!pmd_same(*pmd, orig_pmd)))
894 goto out_unlock; 1153 goto out_unlock;
895 1154
896 page = pmd_page(orig_pmd); 1155 page = pmd_page(orig_pmd);
897 VM_BUG_ON(!PageCompound(page) || !PageHead(page)); 1156 VM_BUG_ON(!PageCompound(page) || !PageHead(page));
898 haddr = address & HPAGE_PMD_MASK;
899 if (page_mapcount(page) == 1) { 1157 if (page_mapcount(page) == 1) {
900 pmd_t entry; 1158 pmd_t entry;
901 entry = pmd_mkyoung(orig_pmd); 1159 entry = pmd_mkyoung(orig_pmd);
@@ -907,7 +1165,7 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
907 } 1165 }
908 get_page(page); 1166 get_page(page);
909 spin_unlock(&mm->page_table_lock); 1167 spin_unlock(&mm->page_table_lock);
910 1168alloc:
911 if (transparent_hugepage_enabled(vma) && 1169 if (transparent_hugepage_enabled(vma) &&
912 !transparent_hugepage_debug_cow()) 1170 !transparent_hugepage_debug_cow())
913 new_page = alloc_hugepage_vma(transparent_hugepage_defrag(vma), 1171 new_page = alloc_hugepage_vma(transparent_hugepage_defrag(vma),
@@ -917,24 +1175,34 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
917 1175
918 if (unlikely(!new_page)) { 1176 if (unlikely(!new_page)) {
919 count_vm_event(THP_FAULT_FALLBACK); 1177 count_vm_event(THP_FAULT_FALLBACK);
920 ret = do_huge_pmd_wp_page_fallback(mm, vma, address, 1178 if (is_huge_zero_pmd(orig_pmd)) {
921 pmd, orig_pmd, page, haddr); 1179 ret = do_huge_pmd_wp_zero_page_fallback(mm, vma,
922 if (ret & VM_FAULT_OOM) 1180 address, pmd, orig_pmd, haddr);
923 split_huge_page(page); 1181 } else {
924 put_page(page); 1182 ret = do_huge_pmd_wp_page_fallback(mm, vma, address,
1183 pmd, orig_pmd, page, haddr);
1184 if (ret & VM_FAULT_OOM)
1185 split_huge_page(page);
1186 put_page(page);
1187 }
925 goto out; 1188 goto out;
926 } 1189 }
927 count_vm_event(THP_FAULT_ALLOC); 1190 count_vm_event(THP_FAULT_ALLOC);
928 1191
929 if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) { 1192 if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) {
930 put_page(new_page); 1193 put_page(new_page);
931 split_huge_page(page); 1194 if (page) {
932 put_page(page); 1195 split_huge_page(page);
1196 put_page(page);
1197 }
933 ret |= VM_FAULT_OOM; 1198 ret |= VM_FAULT_OOM;
934 goto out; 1199 goto out;
935 } 1200 }
936 1201
937 copy_user_huge_page(new_page, page, haddr, vma, HPAGE_PMD_NR); 1202 if (is_huge_zero_pmd(orig_pmd))
1203 clear_huge_page(new_page, haddr, HPAGE_PMD_NR);
1204 else
1205 copy_user_huge_page(new_page, page, haddr, vma, HPAGE_PMD_NR);
938 __SetPageUptodate(new_page); 1206 __SetPageUptodate(new_page);
939 1207
940 mmun_start = haddr; 1208 mmun_start = haddr;
@@ -942,7 +1210,8 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
942 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); 1210 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
943 1211
944 spin_lock(&mm->page_table_lock); 1212 spin_lock(&mm->page_table_lock);
945 put_page(page); 1213 if (page)
1214 put_page(page);
946 if (unlikely(!pmd_same(*pmd, orig_pmd))) { 1215 if (unlikely(!pmd_same(*pmd, orig_pmd))) {
947 spin_unlock(&mm->page_table_lock); 1216 spin_unlock(&mm->page_table_lock);
948 mem_cgroup_uncharge_page(new_page); 1217 mem_cgroup_uncharge_page(new_page);
@@ -950,16 +1219,19 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
950 goto out_mn; 1219 goto out_mn;
951 } else { 1220 } else {
952 pmd_t entry; 1221 pmd_t entry;
953 VM_BUG_ON(!PageHead(page)); 1222 entry = mk_huge_pmd(new_page, vma);
954 entry = mk_pmd(new_page, vma->vm_page_prot);
955 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
956 entry = pmd_mkhuge(entry);
957 pmdp_clear_flush(vma, haddr, pmd); 1223 pmdp_clear_flush(vma, haddr, pmd);
958 page_add_new_anon_rmap(new_page, vma, haddr); 1224 page_add_new_anon_rmap(new_page, vma, haddr);
959 set_pmd_at(mm, haddr, pmd, entry); 1225 set_pmd_at(mm, haddr, pmd, entry);
960 update_mmu_cache_pmd(vma, address, pmd); 1226 update_mmu_cache_pmd(vma, address, pmd);
961 page_remove_rmap(page); 1227 if (is_huge_zero_pmd(orig_pmd)) {
962 put_page(page); 1228 add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR);
1229 put_huge_zero_page();
1230 } else {
1231 VM_BUG_ON(!PageHead(page));
1232 page_remove_rmap(page);
1233 put_page(page);
1234 }
963 ret |= VM_FAULT_WRITE; 1235 ret |= VM_FAULT_WRITE;
964 } 1236 }
965 spin_unlock(&mm->page_table_lock); 1237 spin_unlock(&mm->page_table_lock);
@@ -1017,6 +1289,81 @@ out:
1017 return page; 1289 return page;
1018} 1290}
1019 1291
1292/* NUMA hinting page fault entry point for trans huge pmds */
1293int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
1294 unsigned long addr, pmd_t pmd, pmd_t *pmdp)
1295{
1296 struct page *page;
1297 unsigned long haddr = addr & HPAGE_PMD_MASK;
1298 int target_nid;
1299 int current_nid = -1;
1300 bool migrated;
1301 bool page_locked = false;
1302
1303 spin_lock(&mm->page_table_lock);
1304 if (unlikely(!pmd_same(pmd, *pmdp)))
1305 goto out_unlock;
1306
1307 page = pmd_page(pmd);
1308 get_page(page);
1309 current_nid = page_to_nid(page);
1310 count_vm_numa_event(NUMA_HINT_FAULTS);
1311 if (current_nid == numa_node_id())
1312 count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL);
1313
1314 target_nid = mpol_misplaced(page, vma, haddr);
1315 if (target_nid == -1) {
1316 put_page(page);
1317 goto clear_pmdnuma;
1318 }
1319
1320 /* Acquire the page lock to serialise THP migrations */
1321 spin_unlock(&mm->page_table_lock);
1322 lock_page(page);
1323 page_locked = true;
1324
 1325	/* Confirm the PMD did not change while the page was locked */
1326 spin_lock(&mm->page_table_lock);
1327 if (unlikely(!pmd_same(pmd, *pmdp))) {
1328 unlock_page(page);
1329 put_page(page);
1330 goto out_unlock;
1331 }
1332 spin_unlock(&mm->page_table_lock);
1333
1334 /* Migrate the THP to the requested node */
1335 migrated = migrate_misplaced_transhuge_page(mm, vma,
1336 pmdp, pmd, addr,
1337 page, target_nid);
1338 if (migrated)
1339 current_nid = target_nid;
1340 else {
1341 spin_lock(&mm->page_table_lock);
1342 if (unlikely(!pmd_same(pmd, *pmdp))) {
1343 unlock_page(page);
1344 goto out_unlock;
1345 }
1346 goto clear_pmdnuma;
1347 }
1348
1349 task_numa_fault(current_nid, HPAGE_PMD_NR, migrated);
1350 return 0;
1351
1352clear_pmdnuma:
1353 pmd = pmd_mknonnuma(pmd);
1354 set_pmd_at(mm, haddr, pmdp, pmd);
1355 VM_BUG_ON(pmd_numa(*pmdp));
1356 update_mmu_cache_pmd(vma, addr, pmdp);
1357 if (page_locked)
1358 unlock_page(page);
1359
1360out_unlock:
1361 spin_unlock(&mm->page_table_lock);
1362 if (current_nid != -1)
1363 task_numa_fault(current_nid, HPAGE_PMD_NR, migrated);
1364 return 0;
1365}
1366
1020int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, 1367int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
1021 pmd_t *pmd, unsigned long addr) 1368 pmd_t *pmd, unsigned long addr)
1022{ 1369{
@@ -1028,15 +1375,21 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
1028 pmd_t orig_pmd; 1375 pmd_t orig_pmd;
1029 pgtable = pgtable_trans_huge_withdraw(tlb->mm); 1376 pgtable = pgtable_trans_huge_withdraw(tlb->mm);
1030 orig_pmd = pmdp_get_and_clear(tlb->mm, addr, pmd); 1377 orig_pmd = pmdp_get_and_clear(tlb->mm, addr, pmd);
1031 page = pmd_page(orig_pmd);
1032 tlb_remove_pmd_tlb_entry(tlb, pmd, addr); 1378 tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
1033 page_remove_rmap(page); 1379 if (is_huge_zero_pmd(orig_pmd)) {
1034 VM_BUG_ON(page_mapcount(page) < 0); 1380 tlb->mm->nr_ptes--;
1035 add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR); 1381 spin_unlock(&tlb->mm->page_table_lock);
1036 VM_BUG_ON(!PageHead(page)); 1382 put_huge_zero_page();
1037 tlb->mm->nr_ptes--; 1383 } else {
1038 spin_unlock(&tlb->mm->page_table_lock); 1384 page = pmd_page(orig_pmd);
1039 tlb_remove_page(tlb, page); 1385 page_remove_rmap(page);
1386 VM_BUG_ON(page_mapcount(page) < 0);
1387 add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
1388 VM_BUG_ON(!PageHead(page));
1389 tlb->mm->nr_ptes--;
1390 spin_unlock(&tlb->mm->page_table_lock);
1391 tlb_remove_page(tlb, page);
1392 }
1040 pte_free(tlb->mm, pgtable); 1393 pte_free(tlb->mm, pgtable);
1041 ret = 1; 1394 ret = 1;
1042 } 1395 }
@@ -1099,7 +1452,7 @@ out:
1099} 1452}
1100 1453
1101int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, 1454int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
1102 unsigned long addr, pgprot_t newprot) 1455 unsigned long addr, pgprot_t newprot, int prot_numa)
1103{ 1456{
1104 struct mm_struct *mm = vma->vm_mm; 1457 struct mm_struct *mm = vma->vm_mm;
1105 int ret = 0; 1458 int ret = 0;
@@ -1107,7 +1460,18 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
1107 if (__pmd_trans_huge_lock(pmd, vma) == 1) { 1460 if (__pmd_trans_huge_lock(pmd, vma) == 1) {
1108 pmd_t entry; 1461 pmd_t entry;
1109 entry = pmdp_get_and_clear(mm, addr, pmd); 1462 entry = pmdp_get_and_clear(mm, addr, pmd);
1110 entry = pmd_modify(entry, newprot); 1463 if (!prot_numa) {
1464 entry = pmd_modify(entry, newprot);
1465 BUG_ON(pmd_write(entry));
1466 } else {
1467 struct page *page = pmd_page(*pmd);
1468
1469 /* only check non-shared pages */
1470 if (page_mapcount(page) == 1 &&
1471 !pmd_numa(*pmd)) {
1472 entry = pmd_mknuma(entry);
1473 }
1474 }
1111 set_pmd_at(mm, addr, pmd, entry); 1475 set_pmd_at(mm, addr, pmd, entry);
1112 spin_unlock(&vma->vm_mm->page_table_lock); 1476 spin_unlock(&vma->vm_mm->page_table_lock);
1113 ret = 1; 1477 ret = 1;
@@ -1146,22 +1510,14 @@ pmd_t *page_check_address_pmd(struct page *page,
1146 unsigned long address, 1510 unsigned long address,
1147 enum page_check_address_pmd_flag flag) 1511 enum page_check_address_pmd_flag flag)
1148{ 1512{
1149 pgd_t *pgd;
1150 pud_t *pud;
1151 pmd_t *pmd, *ret = NULL; 1513 pmd_t *pmd, *ret = NULL;
1152 1514
1153 if (address & ~HPAGE_PMD_MASK) 1515 if (address & ~HPAGE_PMD_MASK)
1154 goto out; 1516 goto out;
1155 1517
1156 pgd = pgd_offset(mm, address); 1518 pmd = mm_find_pmd(mm, address);
1157 if (!pgd_present(*pgd)) 1519 if (!pmd)
1158 goto out;
1159
1160 pud = pud_offset(pgd, address);
1161 if (!pud_present(*pud))
1162 goto out; 1520 goto out;
1163
1164 pmd = pmd_offset(pud, address);
1165 if (pmd_none(*pmd)) 1521 if (pmd_none(*pmd))
1166 goto out; 1522 goto out;
1167 if (pmd_page(*pmd) != page) 1523 if (pmd_page(*pmd) != page)
@@ -1205,7 +1561,7 @@ static int __split_huge_page_splitting(struct page *page,
1205 * We can't temporarily set the pmd to null in order 1561 * We can't temporarily set the pmd to null in order
1206 * to split it, the pmd must remain marked huge at all 1562 * to split it, the pmd must remain marked huge at all
1207 * times or the VM won't take the pmd_trans_huge paths 1563 * times or the VM won't take the pmd_trans_huge paths
1208 * and it won't wait on the anon_vma->root->mutex to 1564 * and it won't wait on the anon_vma->root->rwsem to
1209 * serialize against split_huge_page*. 1565 * serialize against split_huge_page*.
1210 */ 1566 */
1211 pmdp_splitting_flush(vma, address, pmd); 1567 pmdp_splitting_flush(vma, address, pmd);
@@ -1296,6 +1652,7 @@ static void __split_huge_page_refcount(struct page *page)
1296 page_tail->mapping = page->mapping; 1652 page_tail->mapping = page->mapping;
1297 1653
1298 page_tail->index = page->index + i; 1654 page_tail->index = page->index + i;
1655 page_xchg_last_nid(page_tail, page_last_nid(page));
1299 1656
1300 BUG_ON(!PageAnon(page_tail)); 1657 BUG_ON(!PageAnon(page_tail));
1301 BUG_ON(!PageUptodate(page_tail)); 1658 BUG_ON(!PageUptodate(page_tail));
@@ -1363,6 +1720,8 @@ static int __split_huge_page_map(struct page *page,
1363 BUG_ON(page_mapcount(page) != 1); 1720 BUG_ON(page_mapcount(page) != 1);
1364 if (!pmd_young(*pmd)) 1721 if (!pmd_young(*pmd))
1365 entry = pte_mkold(entry); 1722 entry = pte_mkold(entry);
1723 if (pmd_numa(*pmd))
1724 entry = pte_mknuma(entry);
1366 pte = pte_offset_map(&_pmd, haddr); 1725 pte = pte_offset_map(&_pmd, haddr);
1367 BUG_ON(!pte_none(*pte)); 1726 BUG_ON(!pte_none(*pte));
1368 set_pte_at(mm, haddr, pte, entry); 1727 set_pte_at(mm, haddr, pte, entry);
@@ -1405,7 +1764,7 @@ static int __split_huge_page_map(struct page *page,
1405 return ret; 1764 return ret;
1406} 1765}
1407 1766
1408/* must be called with anon_vma->root->mutex hold */ 1767/* must be called with anon_vma->root->rwsem held */
1409static void __split_huge_page(struct page *page, 1768static void __split_huge_page(struct page *page,
1410 struct anon_vma *anon_vma) 1769 struct anon_vma *anon_vma)
1411{ 1770{
@@ -1458,8 +1817,9 @@ int split_huge_page(struct page *page)
1458 struct anon_vma *anon_vma; 1817 struct anon_vma *anon_vma;
1459 int ret = 1; 1818 int ret = 1;
1460 1819
1820 BUG_ON(is_huge_zero_pfn(page_to_pfn(page)));
1461 BUG_ON(!PageAnon(page)); 1821 BUG_ON(!PageAnon(page));
1462 anon_vma = page_lock_anon_vma(page); 1822 anon_vma = page_lock_anon_vma_read(page);
1463 if (!anon_vma) 1823 if (!anon_vma)
1464 goto out; 1824 goto out;
1465 ret = 0; 1825 ret = 0;
@@ -1472,7 +1832,7 @@ int split_huge_page(struct page *page)
1472 1832
1473 BUG_ON(PageCompound(page)); 1833 BUG_ON(PageCompound(page));
1474out_unlock: 1834out_unlock:
1475 page_unlock_anon_vma(anon_vma); 1835 page_unlock_anon_vma_read(anon_vma);
1476out: 1836out:
1477 return ret; 1837 return ret;
1478} 1838}
@@ -1701,64 +2061,49 @@ static void release_pte_pages(pte_t *pte, pte_t *_pte)
1701 } 2061 }
1702} 2062}
1703 2063
1704static void release_all_pte_pages(pte_t *pte)
1705{
1706 release_pte_pages(pte, pte + HPAGE_PMD_NR);
1707}
1708
1709static int __collapse_huge_page_isolate(struct vm_area_struct *vma, 2064static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
1710 unsigned long address, 2065 unsigned long address,
1711 pte_t *pte) 2066 pte_t *pte)
1712{ 2067{
1713 struct page *page; 2068 struct page *page;
1714 pte_t *_pte; 2069 pte_t *_pte;
1715 int referenced = 0, isolated = 0, none = 0; 2070 int referenced = 0, none = 0;
1716 for (_pte = pte; _pte < pte+HPAGE_PMD_NR; 2071 for (_pte = pte; _pte < pte+HPAGE_PMD_NR;
1717 _pte++, address += PAGE_SIZE) { 2072 _pte++, address += PAGE_SIZE) {
1718 pte_t pteval = *_pte; 2073 pte_t pteval = *_pte;
1719 if (pte_none(pteval)) { 2074 if (pte_none(pteval)) {
1720 if (++none <= khugepaged_max_ptes_none) 2075 if (++none <= khugepaged_max_ptes_none)
1721 continue; 2076 continue;
1722 else { 2077 else
1723 release_pte_pages(pte, _pte);
1724 goto out; 2078 goto out;
1725 }
1726 } 2079 }
1727 if (!pte_present(pteval) || !pte_write(pteval)) { 2080 if (!pte_present(pteval) || !pte_write(pteval))
1728 release_pte_pages(pte, _pte);
1729 goto out; 2081 goto out;
1730 }
1731 page = vm_normal_page(vma, address, pteval); 2082 page = vm_normal_page(vma, address, pteval);
1732 if (unlikely(!page)) { 2083 if (unlikely(!page))
1733 release_pte_pages(pte, _pte);
1734 goto out; 2084 goto out;
1735 } 2085
1736 VM_BUG_ON(PageCompound(page)); 2086 VM_BUG_ON(PageCompound(page));
1737 BUG_ON(!PageAnon(page)); 2087 BUG_ON(!PageAnon(page));
1738 VM_BUG_ON(!PageSwapBacked(page)); 2088 VM_BUG_ON(!PageSwapBacked(page));
1739 2089
1740 /* cannot use mapcount: can't collapse if there's a gup pin */ 2090 /* cannot use mapcount: can't collapse if there's a gup pin */
1741 if (page_count(page) != 1) { 2091 if (page_count(page) != 1)
1742 release_pte_pages(pte, _pte);
1743 goto out; 2092 goto out;
1744 }
1745 /* 2093 /*
1746 * We can do it before isolate_lru_page because the 2094 * We can do it before isolate_lru_page because the
1747 * page can't be freed from under us. NOTE: PG_lock 2095 * page can't be freed from under us. NOTE: PG_lock
1748 * is needed to serialize against split_huge_page 2096 * is needed to serialize against split_huge_page
1749 * when invoked from the VM. 2097 * when invoked from the VM.
1750 */ 2098 */
1751 if (!trylock_page(page)) { 2099 if (!trylock_page(page))
1752 release_pte_pages(pte, _pte);
1753 goto out; 2100 goto out;
1754 }
1755 /* 2101 /*
 1756		 * Isolate the page to avoid collapsing a hugepage 2102		 * Isolate the page to avoid collapsing a hugepage
1757 * currently in use by the VM. 2103 * currently in use by the VM.
1758 */ 2104 */
1759 if (isolate_lru_page(page)) { 2105 if (isolate_lru_page(page)) {
1760 unlock_page(page); 2106 unlock_page(page);
1761 release_pte_pages(pte, _pte);
1762 goto out; 2107 goto out;
1763 } 2108 }
1764 /* 0 stands for page_is_file_cache(page) == false */ 2109 /* 0 stands for page_is_file_cache(page) == false */
@@ -1771,12 +2116,11 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
1771 mmu_notifier_test_young(vma->vm_mm, address)) 2116 mmu_notifier_test_young(vma->vm_mm, address))
1772 referenced = 1; 2117 referenced = 1;
1773 } 2118 }
1774 if (unlikely(!referenced)) 2119 if (likely(referenced))
1775 release_all_pte_pages(pte); 2120 return 1;
1776 else
1777 isolated = 1;
1778out: 2121out:
1779 return isolated; 2122 release_pte_pages(pte, _pte);
2123 return 0;
1780} 2124}
1781 2125
1782static void __collapse_huge_page_copy(pte_t *pte, struct page *page, 2126static void __collapse_huge_page_copy(pte_t *pte, struct page *page,
@@ -1918,14 +2262,26 @@ static struct page
1918} 2262}
1919#endif 2263#endif
1920 2264
2265static bool hugepage_vma_check(struct vm_area_struct *vma)
2266{
2267 if ((!(vma->vm_flags & VM_HUGEPAGE) && !khugepaged_always()) ||
2268 (vma->vm_flags & VM_NOHUGEPAGE))
2269 return false;
2270
2271 if (!vma->anon_vma || vma->vm_ops)
2272 return false;
2273 if (is_vma_temporary_stack(vma))
2274 return false;
2275 VM_BUG_ON(vma->vm_flags & VM_NO_THP);
2276 return true;
2277}
2278
1921static void collapse_huge_page(struct mm_struct *mm, 2279static void collapse_huge_page(struct mm_struct *mm,
1922 unsigned long address, 2280 unsigned long address,
1923 struct page **hpage, 2281 struct page **hpage,
1924 struct vm_area_struct *vma, 2282 struct vm_area_struct *vma,
1925 int node) 2283 int node)
1926{ 2284{
1927 pgd_t *pgd;
1928 pud_t *pud;
1929 pmd_t *pmd, _pmd; 2285 pmd_t *pmd, _pmd;
1930 pte_t *pte; 2286 pte_t *pte;
1931 pgtable_t pgtable; 2287 pgtable_t pgtable;
@@ -1960,31 +2316,15 @@ static void collapse_huge_page(struct mm_struct *mm,
1960 hend = vma->vm_end & HPAGE_PMD_MASK; 2316 hend = vma->vm_end & HPAGE_PMD_MASK;
1961 if (address < hstart || address + HPAGE_PMD_SIZE > hend) 2317 if (address < hstart || address + HPAGE_PMD_SIZE > hend)
1962 goto out; 2318 goto out;
1963 2319 if (!hugepage_vma_check(vma))
1964 if ((!(vma->vm_flags & VM_HUGEPAGE) && !khugepaged_always()) ||
1965 (vma->vm_flags & VM_NOHUGEPAGE))
1966 goto out;
1967
1968 if (!vma->anon_vma || vma->vm_ops)
1969 goto out; 2320 goto out;
1970 if (is_vma_temporary_stack(vma)) 2321 pmd = mm_find_pmd(mm, address);
2322 if (!pmd)
1971 goto out; 2323 goto out;
1972 VM_BUG_ON(vma->vm_flags & VM_NO_THP); 2324 if (pmd_trans_huge(*pmd))
1973
1974 pgd = pgd_offset(mm, address);
1975 if (!pgd_present(*pgd))
1976 goto out;
1977
1978 pud = pud_offset(pgd, address);
1979 if (!pud_present(*pud))
1980 goto out;
1981
1982 pmd = pmd_offset(pud, address);
1983 /* pmd can't go away or become huge under us */
1984 if (!pmd_present(*pmd) || pmd_trans_huge(*pmd))
1985 goto out; 2325 goto out;
1986 2326
1987 anon_vma_lock(vma->anon_vma); 2327 anon_vma_lock_write(vma->anon_vma);
1988 2328
1989 pte = pte_offset_map(pmd, address); 2329 pte = pte_offset_map(pmd, address);
1990 ptl = pte_lockptr(mm, pmd); 2330 ptl = pte_lockptr(mm, pmd);
@@ -2028,9 +2368,7 @@ static void collapse_huge_page(struct mm_struct *mm,
2028 __SetPageUptodate(new_page); 2368 __SetPageUptodate(new_page);
2029 pgtable = pmd_pgtable(_pmd); 2369 pgtable = pmd_pgtable(_pmd);
2030 2370
2031 _pmd = mk_pmd(new_page, vma->vm_page_prot); 2371 _pmd = mk_huge_pmd(new_page, vma);
2032 _pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma);
2033 _pmd = pmd_mkhuge(_pmd);
2034 2372
2035 /* 2373 /*
2036 * spin_lock() below is not the equivalent of smp_wmb(), so 2374 * spin_lock() below is not the equivalent of smp_wmb(), so
@@ -2064,8 +2402,6 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
2064 unsigned long address, 2402 unsigned long address,
2065 struct page **hpage) 2403 struct page **hpage)
2066{ 2404{
2067 pgd_t *pgd;
2068 pud_t *pud;
2069 pmd_t *pmd; 2405 pmd_t *pmd;
2070 pte_t *pte, *_pte; 2406 pte_t *pte, *_pte;
2071 int ret = 0, referenced = 0, none = 0; 2407 int ret = 0, referenced = 0, none = 0;
@@ -2076,16 +2412,10 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
2076 2412
2077 VM_BUG_ON(address & ~HPAGE_PMD_MASK); 2413 VM_BUG_ON(address & ~HPAGE_PMD_MASK);
2078 2414
2079 pgd = pgd_offset(mm, address); 2415 pmd = mm_find_pmd(mm, address);
2080 if (!pgd_present(*pgd)) 2416 if (!pmd)
2081 goto out; 2417 goto out;
2082 2418 if (pmd_trans_huge(*pmd))
2083 pud = pud_offset(pgd, address);
2084 if (!pud_present(*pud))
2085 goto out;
2086
2087 pmd = pmd_offset(pud, address);
2088 if (!pmd_present(*pmd) || pmd_trans_huge(*pmd))
2089 goto out; 2419 goto out;
2090 2420
2091 pte = pte_offset_map_lock(mm, pmd, address, &ptl); 2421 pte = pte_offset_map_lock(mm, pmd, address, &ptl);
@@ -2193,20 +2523,11 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages,
2193 progress++; 2523 progress++;
2194 break; 2524 break;
2195 } 2525 }
2196 2526 if (!hugepage_vma_check(vma)) {
2197 if ((!(vma->vm_flags & VM_HUGEPAGE) && 2527skip:
2198 !khugepaged_always()) ||
2199 (vma->vm_flags & VM_NOHUGEPAGE)) {
2200 skip:
2201 progress++; 2528 progress++;
2202 continue; 2529 continue;
2203 } 2530 }
2204 if (!vma->anon_vma || vma->vm_ops)
2205 goto skip;
2206 if (is_vma_temporary_stack(vma))
2207 goto skip;
2208 VM_BUG_ON(vma->vm_flags & VM_NO_THP);
2209
2210 hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; 2531 hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
2211 hend = vma->vm_end & HPAGE_PMD_MASK; 2532 hend = vma->vm_end & HPAGE_PMD_MASK;
2212 if (hstart >= hend) 2533 if (hstart >= hend)
@@ -2356,19 +2677,65 @@ static int khugepaged(void *none)
2356 return 0; 2677 return 0;
2357} 2678}
2358 2679
2359void __split_huge_page_pmd(struct mm_struct *mm, pmd_t *pmd) 2680static void __split_huge_zero_page_pmd(struct vm_area_struct *vma,
2681 unsigned long haddr, pmd_t *pmd)
2682{
2683 struct mm_struct *mm = vma->vm_mm;
2684 pgtable_t pgtable;
2685 pmd_t _pmd;
2686 int i;
2687
2688 pmdp_clear_flush(vma, haddr, pmd);
2689 /* leave pmd empty until pte is filled */
2690
2691 pgtable = pgtable_trans_huge_withdraw(mm);
2692 pmd_populate(mm, &_pmd, pgtable);
2693
2694 for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
2695 pte_t *pte, entry;
2696 entry = pfn_pte(my_zero_pfn(haddr), vma->vm_page_prot);
2697 entry = pte_mkspecial(entry);
2698 pte = pte_offset_map(&_pmd, haddr);
2699 VM_BUG_ON(!pte_none(*pte));
2700 set_pte_at(mm, haddr, pte, entry);
2701 pte_unmap(pte);
2702 }
2703 smp_wmb(); /* make pte visible before pmd */
2704 pmd_populate(mm, pmd, pgtable);
2705 put_huge_zero_page();
2706}
2707
2708void __split_huge_page_pmd(struct vm_area_struct *vma, unsigned long address,
2709 pmd_t *pmd)
2360{ 2710{
2361 struct page *page; 2711 struct page *page;
2712 struct mm_struct *mm = vma->vm_mm;
2713 unsigned long haddr = address & HPAGE_PMD_MASK;
2714 unsigned long mmun_start; /* For mmu_notifiers */
2715 unsigned long mmun_end; /* For mmu_notifiers */
2716
2717 BUG_ON(vma->vm_start > haddr || vma->vm_end < haddr + HPAGE_PMD_SIZE);
2362 2718
2719 mmun_start = haddr;
2720 mmun_end = haddr + HPAGE_PMD_SIZE;
2721 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
2363 spin_lock(&mm->page_table_lock); 2722 spin_lock(&mm->page_table_lock);
2364 if (unlikely(!pmd_trans_huge(*pmd))) { 2723 if (unlikely(!pmd_trans_huge(*pmd))) {
2365 spin_unlock(&mm->page_table_lock); 2724 spin_unlock(&mm->page_table_lock);
2725 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
2726 return;
2727 }
2728 if (is_huge_zero_pmd(*pmd)) {
2729 __split_huge_zero_page_pmd(vma, haddr, pmd);
2730 spin_unlock(&mm->page_table_lock);
2731 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
2366 return; 2732 return;
2367 } 2733 }
2368 page = pmd_page(*pmd); 2734 page = pmd_page(*pmd);
2369 VM_BUG_ON(!page_count(page)); 2735 VM_BUG_ON(!page_count(page));
2370 get_page(page); 2736 get_page(page);
2371 spin_unlock(&mm->page_table_lock); 2737 spin_unlock(&mm->page_table_lock);
2738 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
2372 2739
2373 split_huge_page(page); 2740 split_huge_page(page);
2374 2741
@@ -2376,31 +2743,31 @@ void __split_huge_page_pmd(struct mm_struct *mm, pmd_t *pmd)
2376 BUG_ON(pmd_trans_huge(*pmd)); 2743 BUG_ON(pmd_trans_huge(*pmd));
2377} 2744}
2378 2745
2746void split_huge_page_pmd_mm(struct mm_struct *mm, unsigned long address,
2747 pmd_t *pmd)
2748{
2749 struct vm_area_struct *vma;
2750
2751 vma = find_vma(mm, address);
2752 BUG_ON(vma == NULL);
2753 split_huge_page_pmd(vma, address, pmd);
2754}
2755
2379static void split_huge_page_address(struct mm_struct *mm, 2756static void split_huge_page_address(struct mm_struct *mm,
2380 unsigned long address) 2757 unsigned long address)
2381{ 2758{
2382 pgd_t *pgd;
2383 pud_t *pud;
2384 pmd_t *pmd; 2759 pmd_t *pmd;
2385 2760
2386 VM_BUG_ON(!(address & ~HPAGE_PMD_MASK)); 2761 VM_BUG_ON(!(address & ~HPAGE_PMD_MASK));
2387 2762
2388 pgd = pgd_offset(mm, address); 2763 pmd = mm_find_pmd(mm, address);
2389 if (!pgd_present(*pgd)) 2764 if (!pmd)
2390 return;
2391
2392 pud = pud_offset(pgd, address);
2393 if (!pud_present(*pud))
2394 return;
2395
2396 pmd = pmd_offset(pud, address);
2397 if (!pmd_present(*pmd))
2398 return; 2765 return;
2399 /* 2766 /*
2400 * Caller holds the mmap_sem write mode, so a huge pmd cannot 2767 * Caller holds the mmap_sem write mode, so a huge pmd cannot
2401 * materialize from under us. 2768 * materialize from under us.
2402 */ 2769 */
2403 split_huge_page_pmd(mm, pmd); 2770 split_huge_page_pmd_mm(mm, address, pmd);
2404} 2771}
2405 2772
2406void __vma_adjust_trans_huge(struct vm_area_struct *vma, 2773void __vma_adjust_trans_huge(struct vm_area_struct *vma,
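The do_huge_pmd_numa_page() entry point added in the mm/huge_memory.c hunk above boils down to: account the hinting fault, ask the placement policy for a target node, then either migrate the THP there or clear the NUMA bit so the fault does not refire. Below is a minimal userspace sketch of that decision flow; mpol_misplaced() and migrate_misplaced() are stubs standing in for the real policy and migration calls, and huge_numa_fault() is an invented name, not a kernel function.

#include <stdbool.h>
#include <stdio.h>

static int mpol_misplaced(int page_nid)            /* stub placement policy */
{
        return page_nid == 0 ? 1 : -1;             /* pretend node-0 pages belong on node 1 */
}

static bool migrate_misplaced(int target_nid)      /* stub THP migration */
{
        (void)target_nid;
        return true;
}

/* returns the node whose fault statistics should be credited */
static int huge_numa_fault(int page_nid, bool *cleared_numa_bit)
{
        int target_nid = mpol_misplaced(page_nid);

        *cleared_numa_bit = false;
        if (target_nid == -1 || !migrate_misplaced(target_nid)) {
                *cleared_numa_bit = true;          /* the pmd_mknonnuma() fallback path */
                return page_nid;
        }
        return target_nid;                         /* migration succeeded */
}

int main(void)
{
        bool cleared;
        int nid = huge_numa_fault(0, &cleared);

        printf("credit node %d, cleared NUMA bit: %s\n", nid, cleared ? "yes" : "no");
        return 0;
}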
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 59a0059b39e2..4f3ea0b1e57c 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1,6 +1,6 @@
1/* 1/*
2 * Generic hugetlb support. 2 * Generic hugetlb support.
3 * (C) William Irwin, April 2004 3 * (C) Nadia Yvette Chambers, April 2004
4 */ 4 */
5#include <linux/list.h> 5#include <linux/list.h>
6#include <linux/init.h> 6#include <linux/init.h>
@@ -1057,7 +1057,7 @@ static void return_unused_surplus_pages(struct hstate *h,
1057 * on-line nodes with memory and will handle the hstate accounting. 1057 * on-line nodes with memory and will handle the hstate accounting.
1058 */ 1058 */
1059 while (nr_pages--) { 1059 while (nr_pages--) {
1060 if (!free_pool_huge_page(h, &node_states[N_HIGH_MEMORY], 1)) 1060 if (!free_pool_huge_page(h, &node_states[N_MEMORY], 1))
1061 break; 1061 break;
1062 } 1062 }
1063} 1063}
@@ -1180,14 +1180,14 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
1180int __weak alloc_bootmem_huge_page(struct hstate *h) 1180int __weak alloc_bootmem_huge_page(struct hstate *h)
1181{ 1181{
1182 struct huge_bootmem_page *m; 1182 struct huge_bootmem_page *m;
1183 int nr_nodes = nodes_weight(node_states[N_HIGH_MEMORY]); 1183 int nr_nodes = nodes_weight(node_states[N_MEMORY]);
1184 1184
1185 while (nr_nodes) { 1185 while (nr_nodes) {
1186 void *addr; 1186 void *addr;
1187 1187
1188 addr = __alloc_bootmem_node_nopanic( 1188 addr = __alloc_bootmem_node_nopanic(
1189 NODE_DATA(hstate_next_node_to_alloc(h, 1189 NODE_DATA(hstate_next_node_to_alloc(h,
1190 &node_states[N_HIGH_MEMORY])), 1190 &node_states[N_MEMORY])),
1191 huge_page_size(h), huge_page_size(h), 0); 1191 huge_page_size(h), huge_page_size(h), 0);
1192 1192
1193 if (addr) { 1193 if (addr) {
@@ -1259,7 +1259,7 @@ static void __init hugetlb_hstate_alloc_pages(struct hstate *h)
1259 if (!alloc_bootmem_huge_page(h)) 1259 if (!alloc_bootmem_huge_page(h))
1260 break; 1260 break;
1261 } else if (!alloc_fresh_huge_page(h, 1261 } else if (!alloc_fresh_huge_page(h,
1262 &node_states[N_HIGH_MEMORY])) 1262 &node_states[N_MEMORY]))
1263 break; 1263 break;
1264 } 1264 }
1265 h->max_huge_pages = i; 1265 h->max_huge_pages = i;
@@ -1527,7 +1527,7 @@ static ssize_t nr_hugepages_store_common(bool obey_mempolicy,
1527 if (!(obey_mempolicy && 1527 if (!(obey_mempolicy &&
1528 init_nodemask_of_mempolicy(nodes_allowed))) { 1528 init_nodemask_of_mempolicy(nodes_allowed))) {
1529 NODEMASK_FREE(nodes_allowed); 1529 NODEMASK_FREE(nodes_allowed);
1530 nodes_allowed = &node_states[N_HIGH_MEMORY]; 1530 nodes_allowed = &node_states[N_MEMORY];
1531 } 1531 }
1532 } else if (nodes_allowed) { 1532 } else if (nodes_allowed) {
1533 /* 1533 /*
@@ -1537,11 +1537,11 @@ static ssize_t nr_hugepages_store_common(bool obey_mempolicy,
1537 count += h->nr_huge_pages - h->nr_huge_pages_node[nid]; 1537 count += h->nr_huge_pages - h->nr_huge_pages_node[nid];
1538 init_nodemask_of_node(nodes_allowed, nid); 1538 init_nodemask_of_node(nodes_allowed, nid);
1539 } else 1539 } else
1540 nodes_allowed = &node_states[N_HIGH_MEMORY]; 1540 nodes_allowed = &node_states[N_MEMORY];
1541 1541
1542 h->max_huge_pages = set_max_huge_pages(h, count, nodes_allowed); 1542 h->max_huge_pages = set_max_huge_pages(h, count, nodes_allowed);
1543 1543
1544 if (nodes_allowed != &node_states[N_HIGH_MEMORY]) 1544 if (nodes_allowed != &node_states[N_MEMORY])
1545 NODEMASK_FREE(nodes_allowed); 1545 NODEMASK_FREE(nodes_allowed);
1546 1546
1547 return len; 1547 return len;
@@ -1800,7 +1800,7 @@ static void hugetlb_unregister_all_nodes(void)
1800 * remove hstate attributes from any nodes that have them. 1800 * remove hstate attributes from any nodes that have them.
1801 */ 1801 */
1802 for (nid = 0; nid < nr_node_ids; nid++) 1802 for (nid = 0; nid < nr_node_ids; nid++)
1803 hugetlb_unregister_node(&node_devices[nid]); 1803 hugetlb_unregister_node(node_devices[nid]);
1804} 1804}
1805 1805
1806/* 1806/*
@@ -1844,8 +1844,8 @@ static void hugetlb_register_all_nodes(void)
1844{ 1844{
1845 int nid; 1845 int nid;
1846 1846
1847 for_each_node_state(nid, N_HIGH_MEMORY) { 1847 for_each_node_state(nid, N_MEMORY) {
1848 struct node *node = &node_devices[nid]; 1848 struct node *node = node_devices[nid];
1849 if (node->dev.id == nid) 1849 if (node->dev.id == nid)
1850 hugetlb_register_node(node); 1850 hugetlb_register_node(node);
1851 } 1851 }
@@ -1906,14 +1906,12 @@ static int __init hugetlb_init(void)
1906 default_hstate.max_huge_pages = default_hstate_max_huge_pages; 1906 default_hstate.max_huge_pages = default_hstate_max_huge_pages;
1907 1907
1908 hugetlb_init_hstates(); 1908 hugetlb_init_hstates();
1909
1910 gather_bootmem_prealloc(); 1909 gather_bootmem_prealloc();
1911
1912 report_hugepages(); 1910 report_hugepages();
1913 1911
1914 hugetlb_sysfs_init(); 1912 hugetlb_sysfs_init();
1915
1916 hugetlb_register_all_nodes(); 1913 hugetlb_register_all_nodes();
1914 hugetlb_cgroup_file_init();
1917 1915
1918 return 0; 1916 return 0;
1919} 1917}
@@ -1939,17 +1937,10 @@ void __init hugetlb_add_hstate(unsigned order)
1939 for (i = 0; i < MAX_NUMNODES; ++i) 1937 for (i = 0; i < MAX_NUMNODES; ++i)
1940 INIT_LIST_HEAD(&h->hugepage_freelists[i]); 1938 INIT_LIST_HEAD(&h->hugepage_freelists[i]);
1941 INIT_LIST_HEAD(&h->hugepage_activelist); 1939 INIT_LIST_HEAD(&h->hugepage_activelist);
1942 h->next_nid_to_alloc = first_node(node_states[N_HIGH_MEMORY]); 1940 h->next_nid_to_alloc = first_node(node_states[N_MEMORY]);
1943 h->next_nid_to_free = first_node(node_states[N_HIGH_MEMORY]); 1941 h->next_nid_to_free = first_node(node_states[N_MEMORY]);
1944 snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB", 1942 snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB",
1945 huge_page_size(h)/1024); 1943 huge_page_size(h)/1024);
1946 /*
1947 * Add cgroup control files only if the huge page consists
1948 * of more than two normal pages. This is because we use
1949 * page[2].lru.next for storing cgoup details.
1950 */
1951 if (order >= HUGETLB_CGROUP_MIN_ORDER)
1952 hugetlb_cgroup_file_init(hugetlb_max_hstate - 1);
1953 1944
1954 parsed_hstate = h; 1945 parsed_hstate = h;
1955} 1946}
@@ -2035,11 +2026,11 @@ static int hugetlb_sysctl_handler_common(bool obey_mempolicy,
2035 if (!(obey_mempolicy && 2026 if (!(obey_mempolicy &&
2036 init_nodemask_of_mempolicy(nodes_allowed))) { 2027 init_nodemask_of_mempolicy(nodes_allowed))) {
2037 NODEMASK_FREE(nodes_allowed); 2028 NODEMASK_FREE(nodes_allowed);
2038 nodes_allowed = &node_states[N_HIGH_MEMORY]; 2029 nodes_allowed = &node_states[N_MEMORY];
2039 } 2030 }
2040 h->max_huge_pages = set_max_huge_pages(h, tmp, nodes_allowed); 2031 h->max_huge_pages = set_max_huge_pages(h, tmp, nodes_allowed);
2041 2032
2042 if (nodes_allowed != &node_states[N_HIGH_MEMORY]) 2033 if (nodes_allowed != &node_states[N_MEMORY])
2043 NODEMASK_FREE(nodes_allowed); 2034 NODEMASK_FREE(nodes_allowed);
2044 } 2035 }
2045out: 2036out:
@@ -2386,8 +2377,10 @@ again:
2386 /* 2377 /*
2387 * HWPoisoned hugepage is already unmapped and dropped reference 2378 * HWPoisoned hugepage is already unmapped and dropped reference
2388 */ 2379 */
2389 if (unlikely(is_hugetlb_entry_hwpoisoned(pte))) 2380 if (unlikely(is_hugetlb_entry_hwpoisoned(pte))) {
2381 pte_clear(mm, address, ptep);
2390 continue; 2382 continue;
2383 }
2391 2384
2392 page = pte_page(pte); 2385 page = pte_page(pte);
2393 /* 2386 /*
@@ -3014,7 +3007,7 @@ same_page:
3014 return i ? i : -EFAULT; 3007 return i ? i : -EFAULT;
3015} 3008}
3016 3009
3017void hugetlb_change_protection(struct vm_area_struct *vma, 3010unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
3018 unsigned long address, unsigned long end, pgprot_t newprot) 3011 unsigned long address, unsigned long end, pgprot_t newprot)
3019{ 3012{
3020 struct mm_struct *mm = vma->vm_mm; 3013 struct mm_struct *mm = vma->vm_mm;
@@ -3022,6 +3015,7 @@ void hugetlb_change_protection(struct vm_area_struct *vma,
3022 pte_t *ptep; 3015 pte_t *ptep;
3023 pte_t pte; 3016 pte_t pte;
3024 struct hstate *h = hstate_vma(vma); 3017 struct hstate *h = hstate_vma(vma);
3018 unsigned long pages = 0;
3025 3019
3026 BUG_ON(address >= end); 3020 BUG_ON(address >= end);
3027 flush_cache_range(vma, address, end); 3021 flush_cache_range(vma, address, end);
@@ -3032,12 +3026,15 @@ void hugetlb_change_protection(struct vm_area_struct *vma,
3032 ptep = huge_pte_offset(mm, address); 3026 ptep = huge_pte_offset(mm, address);
3033 if (!ptep) 3027 if (!ptep)
3034 continue; 3028 continue;
3035 if (huge_pmd_unshare(mm, &address, ptep)) 3029 if (huge_pmd_unshare(mm, &address, ptep)) {
3030 pages++;
3036 continue; 3031 continue;
3032 }
3037 if (!huge_pte_none(huge_ptep_get(ptep))) { 3033 if (!huge_pte_none(huge_ptep_get(ptep))) {
3038 pte = huge_ptep_get_and_clear(mm, address, ptep); 3034 pte = huge_ptep_get_and_clear(mm, address, ptep);
3039 pte = pte_mkhuge(pte_modify(pte, newprot)); 3035 pte = pte_mkhuge(pte_modify(pte, newprot));
3040 set_huge_pte_at(mm, address, ptep, pte); 3036 set_huge_pte_at(mm, address, ptep, pte);
3037 pages++;
3041 } 3038 }
3042 } 3039 }
3043 spin_unlock(&mm->page_table_lock); 3040 spin_unlock(&mm->page_table_lock);
@@ -3049,6 +3046,8 @@ void hugetlb_change_protection(struct vm_area_struct *vma,
3049 */ 3046 */
3050 flush_tlb_range(vma, start, end); 3047 flush_tlb_range(vma, start, end);
3051 mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex); 3048 mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex);
3049
3050 return pages << h->order;
3052} 3051}
3053 3052
3054int hugetlb_reserve_pages(struct inode *inode, 3053int hugetlb_reserve_pages(struct inode *inode,
@@ -3170,7 +3169,13 @@ int dequeue_hwpoisoned_huge_page(struct page *hpage)
3170 3169
3171 spin_lock(&hugetlb_lock); 3170 spin_lock(&hugetlb_lock);
3172 if (is_hugepage_on_freelist(hpage)) { 3171 if (is_hugepage_on_freelist(hpage)) {
3173 list_del(&hpage->lru); 3172 /*
3173 * Hwpoisoned hugepage isn't linked to activelist or freelist,
3174 * but dangling hpage->lru can trigger list-debug warnings
3175 * (this happens when we call unpoison_memory() on it),
3176 * so let it point to itself with list_del_init().
3177 */
3178 list_del_init(&hpage->lru);
3174 set_page_refcounted(hpage); 3179 set_page_refcounted(hpage);
3175 h->free_huge_pages--; 3180 h->free_huge_pages--;
3176 h->free_huge_pages_node[nid]--; 3181 h->free_huge_pages_node[nid]--;
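hugetlb_change_protection() now returns how many mappings it actually changed (including huge PMDs it unshared), scaled from huge PTEs to base pages with pages << h->order. A toy model of just that arithmetic, with invented names and sample values:

#include <stdio.h>

static unsigned long change_protection(unsigned long nr_huge_ptes_changed,
                                       unsigned int huge_page_order)
{
        /* each huge PTE covers 2^order base pages */
        return nr_huge_ptes_changed << huge_page_order;
}

int main(void)
{
        /* e.g. 3 huge PTEs of order 9 (2MB pages) => 1536 base pages */
        printf("%lu base pages changed\n", change_protection(3, 9));
        return 0;
}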
diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c
index a3f358fb8a0c..9cea7de22ffb 100644
--- a/mm/hugetlb_cgroup.c
+++ b/mm/hugetlb_cgroup.c
@@ -77,7 +77,7 @@ static inline bool hugetlb_cgroup_have_usage(struct cgroup *cg)
77 return false; 77 return false;
78} 78}
79 79
80static struct cgroup_subsys_state *hugetlb_cgroup_create(struct cgroup *cgroup) 80static struct cgroup_subsys_state *hugetlb_cgroup_css_alloc(struct cgroup *cgroup)
81{ 81{
82 int idx; 82 int idx;
83 struct cgroup *parent_cgroup; 83 struct cgroup *parent_cgroup;
@@ -101,7 +101,7 @@ static struct cgroup_subsys_state *hugetlb_cgroup_create(struct cgroup *cgroup)
101 return &h_cgroup->css; 101 return &h_cgroup->css;
102} 102}
103 103
104static void hugetlb_cgroup_destroy(struct cgroup *cgroup) 104static void hugetlb_cgroup_css_free(struct cgroup *cgroup)
105{ 105{
106 struct hugetlb_cgroup *h_cgroup; 106 struct hugetlb_cgroup *h_cgroup;
107 107
@@ -155,18 +155,13 @@ out:
155 * Force the hugetlb cgroup to empty the hugetlb resources by moving them to 155 * Force the hugetlb cgroup to empty the hugetlb resources by moving them to
156 * the parent cgroup. 156 * the parent cgroup.
157 */ 157 */
158static int hugetlb_cgroup_pre_destroy(struct cgroup *cgroup) 158static void hugetlb_cgroup_css_offline(struct cgroup *cgroup)
159{ 159{
160 struct hstate *h; 160 struct hstate *h;
161 struct page *page; 161 struct page *page;
162 int ret = 0, idx = 0; 162 int idx = 0;
163 163
164 do { 164 do {
165 if (cgroup_task_count(cgroup) ||
166 !list_empty(&cgroup->children)) {
167 ret = -EBUSY;
168 goto out;
169 }
170 for_each_hstate(h) { 165 for_each_hstate(h) {
171 spin_lock(&hugetlb_lock); 166 spin_lock(&hugetlb_lock);
172 list_for_each_entry(page, &h->hugepage_activelist, lru) 167 list_for_each_entry(page, &h->hugepage_activelist, lru)
@@ -177,8 +172,6 @@ static int hugetlb_cgroup_pre_destroy(struct cgroup *cgroup)
177 } 172 }
178 cond_resched(); 173 cond_resched();
179 } while (hugetlb_cgroup_have_usage(cgroup)); 174 } while (hugetlb_cgroup_have_usage(cgroup));
180out:
181 return ret;
182} 175}
183 176
184int hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages, 177int hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages,
@@ -340,7 +333,7 @@ static char *mem_fmt(char *buf, int size, unsigned long hsize)
340 return buf; 333 return buf;
341} 334}
342 335
343int __init hugetlb_cgroup_file_init(int idx) 336static void __init __hugetlb_cgroup_file_init(int idx)
344{ 337{
345 char buf[32]; 338 char buf[32];
346 struct cftype *cft; 339 struct cftype *cft;
@@ -382,7 +375,22 @@ int __init hugetlb_cgroup_file_init(int idx)
382 375
383 WARN_ON(cgroup_add_cftypes(&hugetlb_subsys, h->cgroup_files)); 376 WARN_ON(cgroup_add_cftypes(&hugetlb_subsys, h->cgroup_files));
384 377
385 return 0; 378 return;
379}
380
381void __init hugetlb_cgroup_file_init(void)
382{
383 struct hstate *h;
384
385 for_each_hstate(h) {
386 /*
387 * Add cgroup control files only if the huge page consists
388 * of more than two normal pages. This is because we use
389 * page[2].lru.next for storing cgroup details.
390 */
391 if (huge_page_order(h) >= HUGETLB_CGROUP_MIN_ORDER)
392 __hugetlb_cgroup_file_init(hstate_index(h));
393 }
386} 394}
387 395
388/* 396/*
@@ -411,8 +419,8 @@ void hugetlb_cgroup_migrate(struct page *oldhpage, struct page *newhpage)
411 419
412struct cgroup_subsys hugetlb_subsys = { 420struct cgroup_subsys hugetlb_subsys = {
413 .name = "hugetlb", 421 .name = "hugetlb",
414 .create = hugetlb_cgroup_create, 422 .css_alloc = hugetlb_cgroup_css_alloc,
415 .pre_destroy = hugetlb_cgroup_pre_destroy, 423 .css_offline = hugetlb_cgroup_css_offline,
416 .destroy = hugetlb_cgroup_destroy, 424 .css_free = hugetlb_cgroup_css_free,
417 .subsys_id = hugetlb_subsys_id, 425 .subsys_id = hugetlb_subsys_id,
418}; 426};
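hugetlb_cgroup_file_init() now iterates every hstate itself and registers cgroup control files only when the huge page is large enough that page[2] exists to carry the cgroup pointer (order >= HUGETLB_CGROUP_MIN_ORDER). A rough userspace sketch of that filter; the orders below are just sample values:

#include <stdio.h>

#define HUGETLB_CGROUP_MIN_ORDER 2   /* need at least page[2] in the compound page */

static void hugetlb_cgroup_file_init(const unsigned int *orders, int nr)
{
        for (int i = 0; i < nr; i++) {
                if (orders[i] >= HUGETLB_CGROUP_MIN_ORDER)
                        printf("register cgroup files for order-%u hstate\n",
                               orders[i]);
                else
                        printf("skip order-%u hstate (too small for page[2])\n",
                               orders[i]);
        }
}

int main(void)
{
        unsigned int orders[] = { 1, 9, 18 };   /* toy orders just to show both branches */

        hugetlb_cgroup_file_init(orders, 3);
        return 0;
}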
diff --git a/mm/internal.h b/mm/internal.h
index a4fa284f6bc2..d597f94cc205 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -92,6 +92,11 @@ extern int isolate_lru_page(struct page *page);
92extern void putback_lru_page(struct page *page); 92extern void putback_lru_page(struct page *page);
93 93
94/* 94/*
95 * in mm/rmap.c:
96 */
97extern pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address);
98
99/*
95 * in mm/page_alloc.c 100 * in mm/page_alloc.c
96 */ 101 */
97extern void __free_pages_bootmem(struct page *page, unsigned int order); 102extern void __free_pages_bootmem(struct page *page, unsigned int order);
@@ -212,15 +217,18 @@ static inline void mlock_migrate_page(struct page *newpage, struct page *page)
212{ 217{
213 if (TestClearPageMlocked(page)) { 218 if (TestClearPageMlocked(page)) {
214 unsigned long flags; 219 unsigned long flags;
220 int nr_pages = hpage_nr_pages(page);
215 221
216 local_irq_save(flags); 222 local_irq_save(flags);
217 __dec_zone_page_state(page, NR_MLOCK); 223 __mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages);
218 SetPageMlocked(newpage); 224 SetPageMlocked(newpage);
219 __inc_zone_page_state(newpage, NR_MLOCK); 225 __mod_zone_page_state(page_zone(newpage), NR_MLOCK, nr_pages);
220 local_irq_restore(flags); 226 local_irq_restore(flags);
221 } 227 }
222} 228}
223 229
230extern pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma);
231
224#ifdef CONFIG_TRANSPARENT_HUGEPAGE 232#ifdef CONFIG_TRANSPARENT_HUGEPAGE
225extern unsigned long vma_address(struct page *page, 233extern unsigned long vma_address(struct page *page,
226 struct vm_area_struct *vma); 234 struct vm_area_struct *vma);
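Several call sites in this series (ksm, khugepaged, split_huge_page_address) stop open-coding the pgd -> pud -> pmd descent and use the mm_find_pmd() helper declared here. A toy model of what that helper factors out; the struct layout is purely illustrative, not the real page-table format:

#include <stdio.h>
#include <stddef.h>

struct pmd { int present; };
struct pud { struct pmd *pmd; };
struct pgd { struct pud *pud; };
struct mm  { struct pgd *pgd; };

static struct pmd *mm_find_pmd(struct mm *mm)
{
        if (!mm->pgd)                                   /* !pgd_present() */
                return NULL;
        if (!mm->pgd->pud)                              /* !pud_present() */
                return NULL;
        if (!mm->pgd->pud->pmd || !mm->pgd->pud->pmd->present)
                return NULL;                            /* !pmd_present() */
        return mm->pgd->pud->pmd;
}

int main(void)
{
        struct pmd pmd = { .present = 1 };
        struct pud pud = { .pmd = &pmd };
        struct pgd pgd = { .pud = &pud };
        struct mm  mm  = { .pgd = &pgd };

        printf("found pmd: %s\n", mm_find_pmd(&mm) ? "yes" : "no");
        return 0;
}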
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index a217cc544060..752a705c77c2 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -1556,7 +1556,8 @@ static int dump_str_object_info(const char *str)
1556 struct kmemleak_object *object; 1556 struct kmemleak_object *object;
1557 unsigned long addr; 1557 unsigned long addr;
1558 1558
1559 addr= simple_strtoul(str, NULL, 0); 1559 if (kstrtoul(str, 0, &addr))
1560 return -EINVAL;
1560 object = find_and_get_object(addr, 0); 1561 object = find_and_get_object(addr, 0);
1561 if (!object) { 1562 if (!object) {
1562 pr_info("Unknown object at 0x%08lx\n", addr); 1563 pr_info("Unknown object at 0x%08lx\n", addr);
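The kmemleak change replaces simple_strtoul(), which silently returns 0 on garbage, with kstrtoul(), which fails with -EINVAL. A rough userspace analogue of that stricter parsing, built on strtoul() with explicit end/errno checks; parse_ulong_strict() is an invented helper and kstrtoul()'s exact rules differ slightly (e.g. around trailing newlines):

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

static int parse_ulong_strict(const char *s, unsigned long *out)
{
        char *end;

        errno = 0;
        *out = strtoul(s, &end, 0);
        if (errno || end == s || *end != '\0')
                return -EINVAL;          /* reject partial or malformed input */
        return 0;
}

int main(void)
{
        unsigned long addr;

        printf("\"0xdead\": %d\n", parse_ulong_strict("0xdead", &addr));
        printf("\"junk\":   %d\n", parse_ulong_strict("junk", &addr));
        return 0;
}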
diff --git a/mm/ksm.c b/mm/ksm.c
index ae539f0b8aa1..51573858938d 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -778,8 +778,6 @@ static int replace_page(struct vm_area_struct *vma, struct page *page,
778 struct page *kpage, pte_t orig_pte) 778 struct page *kpage, pte_t orig_pte)
779{ 779{
780 struct mm_struct *mm = vma->vm_mm; 780 struct mm_struct *mm = vma->vm_mm;
781 pgd_t *pgd;
782 pud_t *pud;
783 pmd_t *pmd; 781 pmd_t *pmd;
784 pte_t *ptep; 782 pte_t *ptep;
785 spinlock_t *ptl; 783 spinlock_t *ptl;
@@ -792,18 +790,10 @@ static int replace_page(struct vm_area_struct *vma, struct page *page,
792 if (addr == -EFAULT) 790 if (addr == -EFAULT)
793 goto out; 791 goto out;
794 792
795 pgd = pgd_offset(mm, addr); 793 pmd = mm_find_pmd(mm, addr);
796 if (!pgd_present(*pgd)) 794 if (!pmd)
797 goto out; 795 goto out;
798
799 pud = pud_offset(pgd, addr);
800 if (!pud_present(*pud))
801 goto out;
802
803 pmd = pmd_offset(pud, addr);
804 BUG_ON(pmd_trans_huge(*pmd)); 796 BUG_ON(pmd_trans_huge(*pmd));
805 if (!pmd_present(*pmd))
806 goto out;
807 797
808 mmun_start = addr; 798 mmun_start = addr;
809 mmun_end = addr + PAGE_SIZE; 799 mmun_end = addr + PAGE_SIZE;
@@ -1634,7 +1624,7 @@ again:
1634 struct anon_vma_chain *vmac; 1624 struct anon_vma_chain *vmac;
1635 struct vm_area_struct *vma; 1625 struct vm_area_struct *vma;
1636 1626
1637 anon_vma_lock(anon_vma); 1627 anon_vma_lock_read(anon_vma);
1638 anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root, 1628 anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root,
1639 0, ULONG_MAX) { 1629 0, ULONG_MAX) {
1640 vma = vmac->vma; 1630 vma = vmac->vma;
@@ -1658,7 +1648,7 @@ again:
1658 if (!search_new_forks || !mapcount) 1648 if (!search_new_forks || !mapcount)
1659 break; 1649 break;
1660 } 1650 }
1661 anon_vma_unlock(anon_vma); 1651 anon_vma_unlock_read(anon_vma);
1662 if (!mapcount) 1652 if (!mapcount)
1663 goto out; 1653 goto out;
1664 } 1654 }
@@ -1688,7 +1678,7 @@ again:
1688 struct anon_vma_chain *vmac; 1678 struct anon_vma_chain *vmac;
1689 struct vm_area_struct *vma; 1679 struct vm_area_struct *vma;
1690 1680
1691 anon_vma_lock(anon_vma); 1681 anon_vma_lock_read(anon_vma);
1692 anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root, 1682 anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root,
1693 0, ULONG_MAX) { 1683 0, ULONG_MAX) {
1694 vma = vmac->vma; 1684 vma = vmac->vma;
@@ -1707,11 +1697,11 @@ again:
1707 ret = try_to_unmap_one(page, vma, 1697 ret = try_to_unmap_one(page, vma,
1708 rmap_item->address, flags); 1698 rmap_item->address, flags);
1709 if (ret != SWAP_AGAIN || !page_mapped(page)) { 1699 if (ret != SWAP_AGAIN || !page_mapped(page)) {
1710 anon_vma_unlock(anon_vma); 1700 anon_vma_unlock_read(anon_vma);
1711 goto out; 1701 goto out;
1712 } 1702 }
1713 } 1703 }
1714 anon_vma_unlock(anon_vma); 1704 anon_vma_unlock_read(anon_vma);
1715 } 1705 }
1716 if (!search_new_forks++) 1706 if (!search_new_forks++)
1717 goto again; 1707 goto again;
@@ -1741,7 +1731,7 @@ again:
1741 struct anon_vma_chain *vmac; 1731 struct anon_vma_chain *vmac;
1742 struct vm_area_struct *vma; 1732 struct vm_area_struct *vma;
1743 1733
1744 anon_vma_lock(anon_vma); 1734 anon_vma_lock_read(anon_vma);
1745 anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root, 1735 anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root,
1746 0, ULONG_MAX) { 1736 0, ULONG_MAX) {
1747 vma = vmac->vma; 1737 vma = vmac->vma;
@@ -1759,11 +1749,11 @@ again:
1759 1749
1760 ret = rmap_one(page, vma, rmap_item->address, arg); 1750 ret = rmap_one(page, vma, rmap_item->address, arg);
1761 if (ret != SWAP_AGAIN) { 1751 if (ret != SWAP_AGAIN) {
1762 anon_vma_unlock(anon_vma); 1752 anon_vma_unlock_read(anon_vma);
1763 goto out; 1753 goto out;
1764 } 1754 }
1765 } 1755 }
1766 anon_vma_unlock(anon_vma); 1756 anon_vma_unlock_read(anon_vma);
1767 } 1757 }
1768 if (!search_new_forks++) 1758 if (!search_new_forks++)
1769 goto again; 1759 goto again;
@@ -1929,12 +1919,9 @@ static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr,
1929 if (ksm_run != flags) { 1919 if (ksm_run != flags) {
1930 ksm_run = flags; 1920 ksm_run = flags;
1931 if (flags & KSM_RUN_UNMERGE) { 1921 if (flags & KSM_RUN_UNMERGE) {
1932 int oom_score_adj; 1922 set_current_oom_origin();
1933
1934 oom_score_adj = test_set_oom_score_adj(OOM_SCORE_ADJ_MAX);
1935 err = unmerge_and_remove_all_rmap_items(); 1923 err = unmerge_and_remove_all_rmap_items();
1936 compare_swap_oom_score_adj(OOM_SCORE_ADJ_MAX, 1924 clear_current_oom_origin();
1937 oom_score_adj);
1938 if (err) { 1925 if (err) {
1939 ksm_run = KSM_RUN_STOP; 1926 ksm_run = KSM_RUN_STOP;
1940 count = err; 1927 count = err;
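The anon_vma_lock()/anon_vma_unlock() calls in the KSM rmap walkers become _read variants because anon_vma->root is now protected by an rwsem (see the mutex -> rwsem comment updates in the mm/huge_memory.c hunk), so several walkers can hold it concurrently while writers still get exclusion. A pthread sketch of that reader-side pattern, illustrative only (compile with -pthread):

#include <pthread.h>
#include <stdio.h>

static pthread_rwlock_t anon_vma_root = PTHREAD_RWLOCK_INITIALIZER;
static int walks_done;

static void *rmap_walker(void *arg)
{
        (void)arg;
        pthread_rwlock_rdlock(&anon_vma_root);   /* anon_vma_lock_read() */
        __sync_fetch_and_add(&walks_done, 1);    /* walk the interval tree */
        pthread_rwlock_unlock(&anon_vma_root);   /* anon_vma_unlock_read() */
        return NULL;
}

int main(void)
{
        pthread_t t[4];

        for (int i = 0; i < 4; i++)
                pthread_create(&t[i], NULL, rmap_walker, NULL);
        for (int i = 0; i < 4; i++)
                pthread_join(t[i], NULL);
        printf("%d read-side walks completed\n", walks_done);
        return 0;
}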
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index dd39ba000b31..f3009b4bae51 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -10,6 +10,10 @@
10 * Copyright (C) 2009 Nokia Corporation 10 * Copyright (C) 2009 Nokia Corporation
11 * Author: Kirill A. Shutemov 11 * Author: Kirill A. Shutemov
12 * 12 *
13 * Kernel Memory Controller
14 * Copyright (C) 2012 Parallels Inc. and Google Inc.
15 * Authors: Glauber Costa and Suleiman Souhlal
16 *
13 * This program is free software; you can redistribute it and/or modify 17 * This program is free software; you can redistribute it and/or modify
14 * it under the terms of the GNU General Public License as published by 18 * it under the terms of the GNU General Public License as published by
15 * the Free Software Foundation; either version 2 of the License, or 19 * the Free Software Foundation; either version 2 of the License, or
@@ -59,6 +63,8 @@
59#include <trace/events/vmscan.h> 63#include <trace/events/vmscan.h>
60 64
61struct cgroup_subsys mem_cgroup_subsys __read_mostly; 65struct cgroup_subsys mem_cgroup_subsys __read_mostly;
66EXPORT_SYMBOL(mem_cgroup_subsys);
67
62#define MEM_CGROUP_RECLAIM_RETRIES 5 68#define MEM_CGROUP_RECLAIM_RETRIES 5
63static struct mem_cgroup *root_mem_cgroup __read_mostly; 69static struct mem_cgroup *root_mem_cgroup __read_mostly;
64 70
@@ -266,6 +272,10 @@ struct mem_cgroup {
266 }; 272 };
267 273
268 /* 274 /*
275 * the counter to account for kernel memory usage.
276 */
277 struct res_counter kmem;
278 /*
269 * Per cgroup active and inactive list, similar to the 279 * Per cgroup active and inactive list, similar to the
270 * per zone LRU lists. 280 * per zone LRU lists.
271 */ 281 */
@@ -280,6 +290,7 @@ struct mem_cgroup {
280 * Should the accounting and control be hierarchical, per subtree? 290 * Should the accounting and control be hierarchical, per subtree?
281 */ 291 */
282 bool use_hierarchy; 292 bool use_hierarchy;
293 unsigned long kmem_account_flags; /* See KMEM_ACCOUNTED_*, below */
283 294
284 bool oom_lock; 295 bool oom_lock;
285 atomic_t under_oom; 296 atomic_t under_oom;
@@ -330,8 +341,61 @@ struct mem_cgroup {
330#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET) 341#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET)
331 struct tcp_memcontrol tcp_mem; 342 struct tcp_memcontrol tcp_mem;
332#endif 343#endif
344#if defined(CONFIG_MEMCG_KMEM)
345 /* analogous to slab_common's slab_caches list. per-memcg */
346 struct list_head memcg_slab_caches;
347 /* Not a spinlock, we can take a lot of time walking the list */
348 struct mutex slab_caches_mutex;
349 /* Index in the kmem_cache->memcg_params->memcg_caches array */
350 int kmemcg_id;
351#endif
333}; 352};
334 353
 354/* internal-only representation of the status of kmem accounting. */
355enum {
356 KMEM_ACCOUNTED_ACTIVE = 0, /* accounted by this cgroup itself */
357 KMEM_ACCOUNTED_ACTIVATED, /* static key enabled. */
358 KMEM_ACCOUNTED_DEAD, /* dead memcg with pending kmem charges */
359};
360
361/* We account when limit is on, but only after call sites are patched */
362#define KMEM_ACCOUNTED_MASK \
363 ((1 << KMEM_ACCOUNTED_ACTIVE) | (1 << KMEM_ACCOUNTED_ACTIVATED))
364
365#ifdef CONFIG_MEMCG_KMEM
366static inline void memcg_kmem_set_active(struct mem_cgroup *memcg)
367{
368 set_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags);
369}
370
371static bool memcg_kmem_is_active(struct mem_cgroup *memcg)
372{
373 return test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags);
374}
375
376static void memcg_kmem_set_activated(struct mem_cgroup *memcg)
377{
378 set_bit(KMEM_ACCOUNTED_ACTIVATED, &memcg->kmem_account_flags);
379}
380
381static void memcg_kmem_clear_activated(struct mem_cgroup *memcg)
382{
383 clear_bit(KMEM_ACCOUNTED_ACTIVATED, &memcg->kmem_account_flags);
384}
385
386static void memcg_kmem_mark_dead(struct mem_cgroup *memcg)
387{
388 if (test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags))
389 set_bit(KMEM_ACCOUNTED_DEAD, &memcg->kmem_account_flags);
390}
391
392static bool memcg_kmem_test_and_clear_dead(struct mem_cgroup *memcg)
393{
394 return test_and_clear_bit(KMEM_ACCOUNTED_DEAD,
395 &memcg->kmem_account_flags);
396}
397#endif
398
335/* Stuffs for move charges at task migration. */ 399/* Stuffs for move charges at task migration. */
336/* 400/*
 337 * Types of charges to be moved. "move_charge_at_immigrate" is treated as a 401 * Types of charges to be moved. "move_charge_at_immigrate" is treated as a
@@ -386,9 +450,13 @@ enum charge_type {
386}; 450};
387 451
388/* for encoding cft->private value on file */ 452/* for encoding cft->private value on file */
389#define _MEM (0) 453enum res_type {
390#define _MEMSWAP (1) 454 _MEM,
391#define _OOM_TYPE (2) 455 _MEMSWAP,
456 _OOM_TYPE,
457 _KMEM,
458};
459
392#define MEMFILE_PRIVATE(x, val) ((x) << 16 | (val)) 460#define MEMFILE_PRIVATE(x, val) ((x) << 16 | (val))
393#define MEMFILE_TYPE(val) ((val) >> 16 & 0xffff) 461#define MEMFILE_TYPE(val) ((val) >> 16 & 0xffff)
394#define MEMFILE_ATTR(val) ((val) & 0xffff) 462#define MEMFILE_ATTR(val) ((val) & 0xffff)
@@ -485,6 +553,75 @@ static void disarm_sock_keys(struct mem_cgroup *memcg)
485} 553}
486#endif 554#endif
487 555
556#ifdef CONFIG_MEMCG_KMEM
557/*
558 * This will be the memcg's index in each cache's ->memcg_params->memcg_caches.
559 * There are two main reasons for not using the css_id for this:
560 * 1) this works better in sparse environments, where we have a lot of memcgs,
561 * but only a few kmem-limited. Or also, if we have, for instance, 200
562 * memcgs, and none but the 200th is kmem-limited, we'd have to have a
563 * 200 entry array for that.
564 *
565 * 2) In order not to violate the cgroup API, we would like to do all memory
566 * allocation in ->create(). At that point, we haven't yet allocated the
567 * css_id. Having a separate index prevents us from messing with the cgroup
568 * core for this
569 *
570 * The current size of the caches array is stored in
571 * memcg_limited_groups_array_size. It will double each time we have to
572 * increase it.
573 */
574static DEFINE_IDA(kmem_limited_groups);
575int memcg_limited_groups_array_size;
576
577/*
 578 * MIN_SIZE is different from 1, because we would like to avoid going through
579 * the alloc/free process all the time. In a small machine, 4 kmem-limited
580 * cgroups is a reasonable guess. In the future, it could be a parameter or
581 * tunable, but that is strictly not necessary.
582 *
583 * MAX_SIZE should be as large as the number of css_ids. Ideally, we could get
584 * this constant directly from cgroup, but it is understandable that this is
585 * better kept as an internal representation in cgroup.c. In any case, the
586 * css_id space is not getting any smaller, and we don't have to necessarily
587 * increase ours as well if it increases.
588 */
589#define MEMCG_CACHES_MIN_SIZE 4
590#define MEMCG_CACHES_MAX_SIZE 65535
591
592/*
593 * A lot of the calls to the cache allocation functions are expected to be
594 * inlined by the compiler. Since the calls to memcg_kmem_get_cache are
 595 * conditional on this static branch, we'll have to allow modules that do
 596 * kmem_cache_alloc and the like to see this symbol as well.
597 */
598struct static_key memcg_kmem_enabled_key;
599EXPORT_SYMBOL(memcg_kmem_enabled_key);
600
601static void disarm_kmem_keys(struct mem_cgroup *memcg)
602{
603 if (memcg_kmem_is_active(memcg)) {
604 static_key_slow_dec(&memcg_kmem_enabled_key);
605 ida_simple_remove(&kmem_limited_groups, memcg->kmemcg_id);
606 }
607 /*
608 * This check can't live in kmem destruction function,
609 * since the charges will outlive the cgroup
610 */
611 WARN_ON(res_counter_read_u64(&memcg->kmem, RES_USAGE) != 0);
612}
613#else
614static void disarm_kmem_keys(struct mem_cgroup *memcg)
615{
616}
617#endif /* CONFIG_MEMCG_KMEM */
618
619static void disarm_static_keys(struct mem_cgroup *memcg)
620{
621 disarm_sock_keys(memcg);
622 disarm_kmem_keys(memcg);
623}
624
488static void drain_all_stock_async(struct mem_cgroup *memcg); 625static void drain_all_stock_async(struct mem_cgroup *memcg);
489 626
490static struct mem_cgroup_per_zone * 627static struct mem_cgroup_per_zone *
@@ -800,7 +937,7 @@ static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg,
800 int nid; 937 int nid;
801 u64 total = 0; 938 u64 total = 0;
802 939
803 for_each_node_state(nid, N_HIGH_MEMORY) 940 for_each_node_state(nid, N_MEMORY)
804 total += mem_cgroup_node_nr_lru_pages(memcg, nid, lru_mask); 941 total += mem_cgroup_node_nr_lru_pages(memcg, nid, lru_mask);
805 return total; 942 return total;
806} 943}
@@ -1015,13 +1152,10 @@ void mem_cgroup_iter_break(struct mem_cgroup *root,
1015 iter != NULL; \ 1152 iter != NULL; \
1016 iter = mem_cgroup_iter(NULL, iter, NULL)) 1153 iter = mem_cgroup_iter(NULL, iter, NULL))
1017 1154
1018void mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx) 1155void __mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx)
1019{ 1156{
1020 struct mem_cgroup *memcg; 1157 struct mem_cgroup *memcg;
1021 1158
1022 if (!mm)
1023 return;
1024
1025 rcu_read_lock(); 1159 rcu_read_lock();
1026 memcg = mem_cgroup_from_task(rcu_dereference(mm->owner)); 1160 memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
1027 if (unlikely(!memcg)) 1161 if (unlikely(!memcg))
@@ -1040,7 +1174,7 @@ void mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx)
1040out: 1174out:
1041 rcu_read_unlock(); 1175 rcu_read_unlock();
1042} 1176}
1043EXPORT_SYMBOL(mem_cgroup_count_vm_event); 1177EXPORT_SYMBOL(__mem_cgroup_count_vm_event);
1044 1178
1045/** 1179/**
1046 * mem_cgroup_zone_lruvec - get the lru list vector for a zone and memcg 1180 * mem_cgroup_zone_lruvec - get the lru list vector for a zone and memcg
@@ -1454,6 +1588,10 @@ done:
1454 res_counter_read_u64(&memcg->memsw, RES_USAGE) >> 10, 1588 res_counter_read_u64(&memcg->memsw, RES_USAGE) >> 10,
1455 res_counter_read_u64(&memcg->memsw, RES_LIMIT) >> 10, 1589 res_counter_read_u64(&memcg->memsw, RES_LIMIT) >> 10,
1456 res_counter_read_u64(&memcg->memsw, RES_FAILCNT)); 1590 res_counter_read_u64(&memcg->memsw, RES_FAILCNT));
1591 printk(KERN_INFO "kmem: usage %llukB, limit %llukB, failcnt %llu\n",
1592 res_counter_read_u64(&memcg->kmem, RES_USAGE) >> 10,
1593 res_counter_read_u64(&memcg->kmem, RES_LIMIT) >> 10,
1594 res_counter_read_u64(&memcg->kmem, RES_FAILCNT));
1457} 1595}
1458 1596
1459/* 1597/*
@@ -1498,8 +1636,8 @@ static u64 mem_cgroup_get_limit(struct mem_cgroup *memcg)
1498 return limit; 1636 return limit;
1499} 1637}
1500 1638
1501void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, 1639static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
1502 int order) 1640 int order)
1503{ 1641{
1504 struct mem_cgroup *iter; 1642 struct mem_cgroup *iter;
1505 unsigned long chosen_points = 0; 1643 unsigned long chosen_points = 0;
@@ -1644,9 +1782,9 @@ static void mem_cgroup_may_update_nodemask(struct mem_cgroup *memcg)
1644 return; 1782 return;
1645 1783
1646 /* make a nodemask where this memcg uses memory from */ 1784 /* make a nodemask where this memcg uses memory from */
1647 memcg->scan_nodes = node_states[N_HIGH_MEMORY]; 1785 memcg->scan_nodes = node_states[N_MEMORY];
1648 1786
1649 for_each_node_mask(nid, node_states[N_HIGH_MEMORY]) { 1787 for_each_node_mask(nid, node_states[N_MEMORY]) {
1650 1788
1651 if (!test_mem_cgroup_node_reclaimable(memcg, nid, false)) 1789 if (!test_mem_cgroup_node_reclaimable(memcg, nid, false))
1652 node_clear(nid, memcg->scan_nodes); 1790 node_clear(nid, memcg->scan_nodes);
@@ -1717,7 +1855,7 @@ static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap)
1717 /* 1855 /*
1718 * Check rest of nodes. 1856 * Check rest of nodes.
1719 */ 1857 */
1720 for_each_node_state(nid, N_HIGH_MEMORY) { 1858 for_each_node_state(nid, N_MEMORY) {
1721 if (node_isset(nid, memcg->scan_nodes)) 1859 if (node_isset(nid, memcg->scan_nodes))
1722 continue; 1860 continue;
1723 if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap)) 1861 if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap))
@@ -2061,20 +2199,28 @@ struct memcg_stock_pcp {
2061static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock); 2199static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock);
2062static DEFINE_MUTEX(percpu_charge_mutex); 2200static DEFINE_MUTEX(percpu_charge_mutex);
2063 2201
2064/* 2202/**
2065 * Try to consume stocked charge on this cpu. If success, one page is consumed 2203 * consume_stock: Try to consume stocked charge on this cpu.
2066 * from local stock and true is returned. If the stock is 0 or charges from a 2204 * @memcg: memcg to consume from.
2067 * cgroup which is not current target, returns false. This stock will be 2205 * @nr_pages: how many pages to charge.
2068 * refilled. 2206 *
2207 * The charges will only happen if @memcg matches the current cpu's memcg
2208 * stock, and at least @nr_pages are available in that stock. Failure to
2209 * service an allocation will refill the stock.
2210 *
2211 * returns true if successful, false otherwise.
2069 */ 2212 */
2070static bool consume_stock(struct mem_cgroup *memcg) 2213static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
2071{ 2214{
2072 struct memcg_stock_pcp *stock; 2215 struct memcg_stock_pcp *stock;
2073 bool ret = true; 2216 bool ret = true;
2074 2217
2218 if (nr_pages > CHARGE_BATCH)
2219 return false;
2220
2075 stock = &get_cpu_var(memcg_stock); 2221 stock = &get_cpu_var(memcg_stock);
2076 if (memcg == stock->cached && stock->nr_pages) 2222 if (memcg == stock->cached && stock->nr_pages >= nr_pages)
2077 stock->nr_pages--; 2223 stock->nr_pages -= nr_pages;
2078 else /* need to call res_counter_charge */ 2224 else /* need to call res_counter_charge */
2079 ret = false; 2225 ret = false;
2080 put_cpu_var(memcg_stock); 2226 put_cpu_var(memcg_stock);
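The consume_stock() rework above generalizes the per-cpu stock from a single page to an arbitrary nr_pages and refuses anything larger than the charge batch, so one caller can never drain more than a single refill. A minimal userspace sketch of that bookkeeping (struct names and the BATCH value are illustrative, not the kernel's):

#include <stdbool.h>
#include <stdio.h>

#define BATCH 32                 /* stand-in for CHARGE_BATCH */

struct stock {
    int cached_id;               /* which group the stock belongs to */
    unsigned int nr_pages;       /* pre-charged pages left in the stock */
};

/* Succeeds only if the stock belongs to @id and holds at least @nr pages. */
static bool consume_stock(struct stock *s, int id, unsigned int nr)
{
    if (nr > BATCH)              /* oversized requests bypass the stock */
        return false;
    if (s->cached_id == id && s->nr_pages >= nr) {
        s->nr_pages -= nr;
        return true;
    }
    return false;                /* caller falls back to the slow path */
}

int main(void)
{
    struct stock s = { .cached_id = 1, .nr_pages = BATCH };

    printf("%d\n", consume_stock(&s, 1, 4));   /* 1: served from stock */
    printf("%d\n", consume_stock(&s, 2, 1));   /* 0: wrong group       */
    printf("%d\n", consume_stock(&s, 1, 64));  /* 0: larger than batch */
    return 0;
}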
@@ -2251,7 +2397,8 @@ enum {
2251}; 2397};
2252 2398
2253static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, 2399static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
2254 unsigned int nr_pages, bool oom_check) 2400 unsigned int nr_pages, unsigned int min_pages,
2401 bool oom_check)
2255{ 2402{
2256 unsigned long csize = nr_pages * PAGE_SIZE; 2403 unsigned long csize = nr_pages * PAGE_SIZE;
2257 struct mem_cgroup *mem_over_limit; 2404 struct mem_cgroup *mem_over_limit;
@@ -2274,18 +2421,18 @@ static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
2274 } else 2421 } else
2275 mem_over_limit = mem_cgroup_from_res_counter(fail_res, res); 2422 mem_over_limit = mem_cgroup_from_res_counter(fail_res, res);
2276 /* 2423 /*
2277 * nr_pages can be either a huge page (HPAGE_PMD_NR), a batch
2278 * of regular pages (CHARGE_BATCH), or a single regular page (1).
2279 *
2280 * Never reclaim on behalf of optional batching, retry with a 2424 * Never reclaim on behalf of optional batching, retry with a
2281 * single page instead. 2425 * single page instead.
2282 */ 2426 */
2283 if (nr_pages == CHARGE_BATCH) 2427 if (nr_pages > min_pages)
2284 return CHARGE_RETRY; 2428 return CHARGE_RETRY;
2285 2429
2286 if (!(gfp_mask & __GFP_WAIT)) 2430 if (!(gfp_mask & __GFP_WAIT))
2287 return CHARGE_WOULDBLOCK; 2431 return CHARGE_WOULDBLOCK;
2288 2432
2433 if (gfp_mask & __GFP_NORETRY)
2434 return CHARGE_NOMEM;
2435
2289 ret = mem_cgroup_reclaim(mem_over_limit, gfp_mask, flags); 2436 ret = mem_cgroup_reclaim(mem_over_limit, gfp_mask, flags);
2290 if (mem_cgroup_margin(mem_over_limit) >= nr_pages) 2437 if (mem_cgroup_margin(mem_over_limit) >= nr_pages)
2291 return CHARGE_RETRY; 2438 return CHARGE_RETRY;
@@ -2298,7 +2445,7 @@ static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
2298 * unlikely to succeed so close to the limit, and we fall back 2445 * unlikely to succeed so close to the limit, and we fall back
2299 * to regular pages anyway in case of failure. 2446 * to regular pages anyway in case of failure.
2300 */ 2447 */
2301 if (nr_pages == 1 && ret) 2448 if (nr_pages <= (1 << PAGE_ALLOC_COSTLY_ORDER) && ret)
2302 return CHARGE_RETRY; 2449 return CHARGE_RETRY;
2303 2450
2304 /* 2451 /*
@@ -2370,10 +2517,9 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
2370again: 2517again:
2371 if (*ptr) { /* css should be a valid one */ 2518 if (*ptr) { /* css should be a valid one */
2372 memcg = *ptr; 2519 memcg = *ptr;
2373 VM_BUG_ON(css_is_removed(&memcg->css));
2374 if (mem_cgroup_is_root(memcg)) 2520 if (mem_cgroup_is_root(memcg))
2375 goto done; 2521 goto done;
2376 if (nr_pages == 1 && consume_stock(memcg)) 2522 if (consume_stock(memcg, nr_pages))
2377 goto done; 2523 goto done;
2378 css_get(&memcg->css); 2524 css_get(&memcg->css);
2379 } else { 2525 } else {
@@ -2398,7 +2544,7 @@ again:
2398 rcu_read_unlock(); 2544 rcu_read_unlock();
2399 goto done; 2545 goto done;
2400 } 2546 }
2401 if (nr_pages == 1 && consume_stock(memcg)) { 2547 if (consume_stock(memcg, nr_pages)) {
2402 /* 2548 /*
2403 * It seems dangerous to access memcg without css_get(). 2549 * It seems dangerous to access memcg without css_get().
2404 * But considering how consume_stock works, it's not 2550 * But considering how consume_stock works, it's not
@@ -2433,7 +2579,8 @@ again:
2433 nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES; 2579 nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;
2434 } 2580 }
2435 2581
2436 ret = mem_cgroup_do_charge(memcg, gfp_mask, batch, oom_check); 2582 ret = mem_cgroup_do_charge(memcg, gfp_mask, batch, nr_pages,
2583 oom_check);
2437 switch (ret) { 2584 switch (ret) {
2438 case CHARGE_OK: 2585 case CHARGE_OK:
2439 break; 2586 break;
@@ -2510,9 +2657,9 @@ static void __mem_cgroup_cancel_local_charge(struct mem_cgroup *memcg,
2510 2657
2511/* 2658/*
2512 * A helper function to get mem_cgroup from ID. must be called under 2659 * A helper function to get mem_cgroup from ID. must be called under
2513 * rcu_read_lock(). The caller must check css_is_removed() or some if 2660 * rcu_read_lock(). The caller is responsible for calling css_tryget if
2514 * it's concern. (dropping refcnt from swap can be called against removed 2661 * the mem_cgroup is used for charging. (dropping refcnt from swap can be
2515 * memcg.) 2662 * called against removed memcg.)
2516 */ 2663 */
2517static struct mem_cgroup *mem_cgroup_lookup(unsigned short id) 2664static struct mem_cgroup *mem_cgroup_lookup(unsigned short id)
2518{ 2665{
@@ -2626,6 +2773,766 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
2626 memcg_check_events(memcg, page); 2773 memcg_check_events(memcg, page);
2627} 2774}
2628 2775
2776static DEFINE_MUTEX(set_limit_mutex);
2777
2778#ifdef CONFIG_MEMCG_KMEM
2779static inline bool memcg_can_account_kmem(struct mem_cgroup *memcg)
2780{
2781 return !mem_cgroup_disabled() && !mem_cgroup_is_root(memcg) &&
2782 (memcg->kmem_account_flags & KMEM_ACCOUNTED_MASK);
2783}
2784
2785/*
2786 * This is a bit cumbersome, but it is rarely used and avoids a backpointer
2787 * in the memcg_cache_params struct.
2788 */
2789static struct kmem_cache *memcg_params_to_cache(struct memcg_cache_params *p)
2790{
2791 struct kmem_cache *cachep;
2792
2793 VM_BUG_ON(p->is_root_cache);
2794 cachep = p->root_cache;
2795 return cachep->memcg_params->memcg_caches[memcg_cache_id(p->memcg)];
2796}
2797
2798#ifdef CONFIG_SLABINFO
2799static int mem_cgroup_slabinfo_read(struct cgroup *cont, struct cftype *cft,
2800 struct seq_file *m)
2801{
2802 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
2803 struct memcg_cache_params *params;
2804
2805 if (!memcg_can_account_kmem(memcg))
2806 return -EIO;
2807
2808 print_slabinfo_header(m);
2809
2810 mutex_lock(&memcg->slab_caches_mutex);
2811 list_for_each_entry(params, &memcg->memcg_slab_caches, list)
2812 cache_show(memcg_params_to_cache(params), m);
2813 mutex_unlock(&memcg->slab_caches_mutex);
2814
2815 return 0;
2816}
2817#endif
2818
2819static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, u64 size)
2820{
2821 struct res_counter *fail_res;
2822 struct mem_cgroup *_memcg;
2823 int ret = 0;
2824 bool may_oom;
2825
2826 ret = res_counter_charge(&memcg->kmem, size, &fail_res);
2827 if (ret)
2828 return ret;
2829
2830 /*
2831 * Conditions under which we can wait for the oom_killer. Those are
2832 * the same conditions tested by the core page allocator
2833 */
2834 may_oom = (gfp & __GFP_FS) && !(gfp & __GFP_NORETRY);
2835
2836 _memcg = memcg;
2837 ret = __mem_cgroup_try_charge(NULL, gfp, size >> PAGE_SHIFT,
2838 &_memcg, may_oom);
2839
2840 if (ret == -EINTR) {
2841 /*
2842 * __mem_cgroup_try_charge() chose to bypass to root due to
2843 * OOM kill or fatal signal. Since our only options are to
2844 * either fail the allocation or charge it to this cgroup, do
2845 * it as a temporary condition. But we can't fail. From a
2846 * kmem/slab perspective, the cache has already been selected,
2847 * by mem_cgroup_kmem_get_cache(), so it is too late to change
2848 * our minds.
2849 *
2850 * This condition will only trigger if the task entered
2851 * memcg_charge_kmem in a sane state, but was OOM-killed during
2852 * __mem_cgroup_try_charge() above. Tasks that were already
2853 * dying when the allocation triggers should have been already
2854 * directed to the root cgroup in memcontrol.h
2855 */
2856 res_counter_charge_nofail(&memcg->res, size, &fail_res);
2857 if (do_swap_account)
2858 res_counter_charge_nofail(&memcg->memsw, size,
2859 &fail_res);
2860 ret = 0;
2861 } else if (ret)
2862 res_counter_uncharge(&memcg->kmem, size);
2863
2864 return ret;
2865}
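memcg_charge_kmem() charges the dedicated kmem counter first and then pushes the same amount through the regular memory counters, falling back to a must-not-fail charge when __mem_cgroup_try_charge() bypasses to root. A rough single-threaded model of that double-entry bookkeeping (simplified, no locking, names invented here):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct counter { uint64_t usage, limit, failcnt; };

static bool counter_charge(struct counter *c, uint64_t bytes)
{
    if (c->usage + bytes > c->limit) {
        c->failcnt++;
        return false;
    }
    c->usage += bytes;
    return true;
}

/* The "nofail" variant used when the charge must stick (e.g. OOM bypass). */
static void counter_charge_nofail(struct counter *c, uint64_t bytes)
{
    c->usage += bytes;           /* may exceed the limit on purpose */
}

static int charge_kmem(struct counter *kmem, struct counter *mem,
                       uint64_t bytes, bool bypassed_to_root)
{
    if (!counter_charge(kmem, bytes))
        return -1;                              /* kmem limit hit */
    if (!counter_charge(mem, bytes)) {
        if (bypassed_to_root) {
            counter_charge_nofail(mem, bytes);  /* keep the books consistent */
            return 0;
        }
        kmem->usage -= bytes;                   /* undo the kmem half */
        return -1;
    }
    return 0;
}

int main(void)
{
    struct counter kmem = { .limit = 1 << 20 }, mem = { .limit = 1 << 20 };

    printf("%d\n", charge_kmem(&kmem, &mem, 4096, false));   /* prints 0 */
    return 0;
}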
2866
2867static void memcg_uncharge_kmem(struct mem_cgroup *memcg, u64 size)
2868{
2869 res_counter_uncharge(&memcg->res, size);
2870 if (do_swap_account)
2871 res_counter_uncharge(&memcg->memsw, size);
2872
2873 /* Not down to 0 */
2874 if (res_counter_uncharge(&memcg->kmem, size))
2875 return;
2876
2877 if (memcg_kmem_test_and_clear_dead(memcg))
2878 mem_cgroup_put(memcg);
2879}
2880
2881void memcg_cache_list_add(struct mem_cgroup *memcg, struct kmem_cache *cachep)
2882{
2883 if (!memcg)
2884 return;
2885
2886 mutex_lock(&memcg->slab_caches_mutex);
2887 list_add(&cachep->memcg_params->list, &memcg->memcg_slab_caches);
2888 mutex_unlock(&memcg->slab_caches_mutex);
2889}
2890
2891/*
2892 * helper for accessing a memcg's index. It will be used as an index in the
2893 * child cache array in kmem_cache, and also to derive its name. This function
2894 * will return -1 when this is not a kmem-limited memcg.
2895 */
2896int memcg_cache_id(struct mem_cgroup *memcg)
2897{
2898 return memcg ? memcg->kmemcg_id : -1;
2899}
2900
2901/*
2902 * This ends up being protected by the set_limit mutex, during normal
2903 * operation, because that is its main call site.
2904 *
2905 * But when we create a new cache, we can call this as well if its parent
2906 * is kmem-limited. That will have to hold set_limit_mutex as well.
2907 */
2908int memcg_update_cache_sizes(struct mem_cgroup *memcg)
2909{
2910 int num, ret;
2911
2912 num = ida_simple_get(&kmem_limited_groups,
2913 0, MEMCG_CACHES_MAX_SIZE, GFP_KERNEL);
2914 if (num < 0)
2915 return num;
2916 /*
2917 * After this point, kmem_accounted (that we test atomically in
2918 * the beginning of this conditional) is no longer 0. This
2919 * guarantees only one process will set the following boolean
2920 * to true. We don't need test_and_set because we're protected
2921 * by the set_limit_mutex anyway.
2922 */
2923 memcg_kmem_set_activated(memcg);
2924
2925 ret = memcg_update_all_caches(num+1);
2926 if (ret) {
2927 ida_simple_remove(&kmem_limited_groups, num);
2928 memcg_kmem_clear_activated(memcg);
2929 return ret;
2930 }
2931
2932 memcg->kmemcg_id = num;
2933 INIT_LIST_HEAD(&memcg->memcg_slab_caches);
2934 mutex_init(&memcg->slab_caches_mutex);
2935 return 0;
2936}
2937
2938static size_t memcg_caches_array_size(int num_groups)
2939{
2940 ssize_t size;
2941 if (num_groups <= 0)
2942 return 0;
2943
2944 size = 2 * num_groups;
2945 if (size < MEMCG_CACHES_MIN_SIZE)
2946 size = MEMCG_CACHES_MIN_SIZE;
2947 else if (size > MEMCG_CACHES_MAX_SIZE)
2948 size = MEMCG_CACHES_MAX_SIZE;
2949
2950 return size;
2951}
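memcg_caches_array_size() just doubles the requested group count and clamps it between the MIN/MAX constants, so the per-root-cache pointer array does not need to grow on every new kmem-limited group. The same logic can be exercised stand-alone (the MIN/MAX values below are placeholders rather than the kernel's):

#include <stdio.h>

#define MEMCG_CACHES_MIN_SIZE 4        /* illustrative values */
#define MEMCG_CACHES_MAX_SIZE 65535

static long memcg_caches_array_size(int num_groups)
{
    long size;

    if (num_groups <= 0)
        return 0;

    size = 2 * num_groups;             /* leave headroom for future groups */
    if (size < MEMCG_CACHES_MIN_SIZE)
        size = MEMCG_CACHES_MIN_SIZE;
    else if (size > MEMCG_CACHES_MAX_SIZE)
        size = MEMCG_CACHES_MAX_SIZE;

    return size;
}

int main(void)
{
    int n[] = { 0, 1, 3, 100, 100000 };

    for (unsigned i = 0; i < sizeof(n) / sizeof(n[0]); i++)
        printf("%d -> %ld\n", n[i], memcg_caches_array_size(n[i]));
    return 0;
}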
2952
2953/*
2954 * We should update the current array size iff all cache updates succeed. This
2955 * can only be done from the slab side. The slab mutex needs to be held when
2956 * calling this.
2957 */
2958void memcg_update_array_size(int num)
2959{
2960 if (num > memcg_limited_groups_array_size)
2961 memcg_limited_groups_array_size = memcg_caches_array_size(num);
2962}
2963
2964int memcg_update_cache_size(struct kmem_cache *s, int num_groups)
2965{
2966 struct memcg_cache_params *cur_params = s->memcg_params;
2967
2968 VM_BUG_ON(s->memcg_params && !s->memcg_params->is_root_cache);
2969
2970 if (num_groups > memcg_limited_groups_array_size) {
2971 int i;
2972 ssize_t size = memcg_caches_array_size(num_groups);
2973
2974 size *= sizeof(void *);
2975 size += sizeof(struct memcg_cache_params);
2976
2977 s->memcg_params = kzalloc(size, GFP_KERNEL);
2978 if (!s->memcg_params) {
2979 s->memcg_params = cur_params;
2980 return -ENOMEM;
2981 }
2982
2983 s->memcg_params->is_root_cache = true;
2984
2985 /*
2986 * There is the chance it will be bigger than
2987 * memcg_limited_groups_array_size, if we failed an allocation
2988 * in a cache, in which case all caches updated before it will
2989 * have a bigger array.
2990 *
2991 * But if that is the case, the data after
2992 * memcg_limited_groups_array_size is certainly unused
2993 */
2994 for (i = 0; i < memcg_limited_groups_array_size; i++) {
2995 if (!cur_params->memcg_caches[i])
2996 continue;
2997 s->memcg_params->memcg_caches[i] =
2998 cur_params->memcg_caches[i];
2999 }
3000
3001 /*
3002 * Ideally, we would wait until all caches succeed, and only
3003 * then free the old one. But this is not worth the extra
3004 * pointer per-cache we'd have to have for this.
3005 *
3006 * It is not a big deal if some caches are left with a size
3007 * bigger than the others. And all updates will reset this
3008 * anyway.
3009 */
3010 kfree(cur_params);
3011 }
3012 return 0;
3013}
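memcg_update_cache_size() grows a root cache's memcg_caches[] array by allocating a larger params block and copying the slots that are already populated, keeping the old block only if the allocation fails. A hedged sketch of that grow-and-copy step using plain malloc/free instead of the kernel allocators:

#include <stdlib.h>

struct params {
    int nr_slots;
    void *caches[];              /* one slot per kmem-limited group */
};

/* Returns a block with at least @want slots; live entries are preserved. */
static struct params *grow_params(struct params *old, int want)
{
    struct params *new;
    int i;

    if (old && old->nr_slots >= want)
        return old;                        /* already big enough */

    new = calloc(1, sizeof(*new) + want * sizeof(void *));
    if (!new)
        return old;                        /* keep the old array on failure */

    new->nr_slots = want;
    if (old) {
        for (i = 0; i < old->nr_slots; i++)
            new->caches[i] = old->caches[i];   /* copy populated slots */
        free(old);
    }
    return new;
}

int main(void)
{
    int marker;
    struct params *p = grow_params(NULL, 4);

    if (!p)
        return 1;
    p->caches[0] = &marker;                /* pretend slot 0 is populated */
    p = grow_params(p, 16);                /* slot 0 survives the resize */
    free(p);
    return 0;
}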
3014
3015int memcg_register_cache(struct mem_cgroup *memcg, struct kmem_cache *s,
3016 struct kmem_cache *root_cache)
3017{
3018 size_t size = sizeof(struct memcg_cache_params);
3019
3020 if (!memcg_kmem_enabled())
3021 return 0;
3022
3023 if (!memcg)
3024 size += memcg_limited_groups_array_size * sizeof(void *);
3025
3026 s->memcg_params = kzalloc(size, GFP_KERNEL);
3027 if (!s->memcg_params)
3028 return -ENOMEM;
3029
3030 if (memcg) {
3031 s->memcg_params->memcg = memcg;
3032 s->memcg_params->root_cache = root_cache;
3033 }
3034 return 0;
3035}
3036
3037void memcg_release_cache(struct kmem_cache *s)
3038{
3039 struct kmem_cache *root;
3040 struct mem_cgroup *memcg;
3041 int id;
3042
3043 /*
3044 * This happens, for instance, when a root cache goes away before we
3045 * add any memcg.
3046 */
3047 if (!s->memcg_params)
3048 return;
3049
3050 if (s->memcg_params->is_root_cache)
3051 goto out;
3052
3053 memcg = s->memcg_params->memcg;
3054 id = memcg_cache_id(memcg);
3055
3056 root = s->memcg_params->root_cache;
3057 root->memcg_params->memcg_caches[id] = NULL;
3058 mem_cgroup_put(memcg);
3059
3060 mutex_lock(&memcg->slab_caches_mutex);
3061 list_del(&s->memcg_params->list);
3062 mutex_unlock(&memcg->slab_caches_mutex);
3063
3064out:
3065 kfree(s->memcg_params);
3066}
3067
3068/*
3069 * During the creation a new cache, we need to disable our accounting mechanism
3070 * altogether. This is true even if we are not creating, but rather just
3071 * enqueueing new caches to be created.
3072 *
3073 * This is because that process will trigger allocations; some visible, like
3074 * explicit kmallocs to auxiliary data structures, name strings and internal
3075 * cache structures; some well concealed, like INIT_WORK() that can allocate
3076 * objects during debug.
3077 *
3078 * If any allocation happens during memcg_kmem_get_cache, we will recurse back
3079 * to it. This may not be a bounded recursion: since the first cache creation
3080 * failed to complete (waiting on the allocation), we'll just try to create the
3081 * cache again, failing at the same point.
3082 *
3083 * memcg_kmem_get_cache is prepared to abort after seeing a positive count of
3084 * memcg_kmem_skip_account. So we enclose anything that might allocate memory
3085 * inside the following two functions.
3086 */
3087static inline void memcg_stop_kmem_account(void)
3088{
3089 VM_BUG_ON(!current->mm);
3090 current->memcg_kmem_skip_account++;
3091}
3092
3093static inline void memcg_resume_kmem_account(void)
3094{
3095 VM_BUG_ON(!current->mm);
3096 current->memcg_kmem_skip_account--;
3097}
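The stop/resume pair above is nothing more than a per-task depth counter: any allocation made while the counter is non-zero skips the per-memcg cache lookup, which is what breaks the recursion during cache creation. A minimal sketch of the same guard pattern with a thread-local counter (illustrative only):

#include <stdio.h>

static _Thread_local int skip_account;     /* per-"task" recursion guard */

static void stop_account(void)   { skip_account++; }
static void resume_account(void) { skip_account--; }

static void *get_cache(void *root_cache)
{
    if (skip_account)              /* we are inside cache creation */
        return root_cache;         /* fall back instead of recursing */
    /* ...normal per-group lookup would happen here... */
    return root_cache;
}

static void create_child_cache(void *root_cache)
{
    stop_account();
    /* every allocation made in here sees skip_account > 0 */
    get_cache(root_cache);
    resume_account();
}

int main(void)
{
    int dummy;

    create_child_cache(&dummy);
    printf("guard back to %d\n", skip_account);   /* prints 0 */
    return 0;
}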
3098
3099static void kmem_cache_destroy_work_func(struct work_struct *w)
3100{
3101 struct kmem_cache *cachep;
3102 struct memcg_cache_params *p;
3103
3104 p = container_of(w, struct memcg_cache_params, destroy);
3105
3106 cachep = memcg_params_to_cache(p);
3107
3108 /*
3109 * If we get down to 0 after shrink, we could delete right away.
3110 * However, memcg_release_pages() already puts us back in the workqueue
3111 * in that case. If we proceed deleting, we'll get a dangling
3112 * reference, and removing the object from the workqueue in that case
3113 * is unnecessary complication. We are not a fast path.
3114 *
3115 * Note that this case is fundamentally different from racing with
3116 * shrink_slab(): if mem_cgroup_destroy_cache() is called in
3117 * kmem_cache_shrink, not only would we be reinserting a dead cache
3118 * into the queue, but doing so from inside the worker racing to
3119 * destroy it.
3120 *
3121 * So if we aren't down to zero, we'll just schedule a worker and try
3122 * again
3123 */
3124 if (atomic_read(&cachep->memcg_params->nr_pages) != 0) {
3125 kmem_cache_shrink(cachep);
3126 if (atomic_read(&cachep->memcg_params->nr_pages) == 0)
3127 return;
3128 } else
3129 kmem_cache_destroy(cachep);
3130}
3131
3132void mem_cgroup_destroy_cache(struct kmem_cache *cachep)
3133{
3134 if (!cachep->memcg_params->dead)
3135 return;
3136
3137 /*
3138 * There are many ways in which we can get here.
3139 *
3140 * We can get to a memory-pressure situation while the delayed work is
3141 * still pending to run. The vmscan shrinkers can then release all
3142 * cache memory and get us to destruction. If this is the case, we'll
3143 * be executed twice, which is a bug (the second time will execute over
3144 * bogus data). In this case, cancelling the work should be fine.
3145 *
3146 * But we can also get here from the worker itself, if
3147 * kmem_cache_shrink is enough to shake all the remaining objects and
3148 * get the page count to 0. In this case, we'll deadlock if we try to
3149 * cancel the work (the worker runs with an internal lock held, which
3150 * is the same lock we would hold for cancel_work_sync().)
3151 *
3152 * Since we can't possibly know who got us here, just refrain from
3153 * running if there is already work pending
3154 */
3155 if (work_pending(&cachep->memcg_params->destroy))
3156 return;
3157 /*
3158 * We have to defer the actual destroying to a workqueue, because
3159 * we might currently be in a context that cannot sleep.
3160 */
3161 schedule_work(&cachep->memcg_params->destroy);
3162}
3163
3164static char *memcg_cache_name(struct mem_cgroup *memcg, struct kmem_cache *s)
3165{
3166 char *name;
3167 struct dentry *dentry;
3168
3169 rcu_read_lock();
3170 dentry = rcu_dereference(memcg->css.cgroup->dentry);
3171 rcu_read_unlock();
3172
3173 BUG_ON(dentry == NULL);
3174
3175 name = kasprintf(GFP_KERNEL, "%s(%d:%s)", s->name,
3176 memcg_cache_id(memcg), dentry->d_name.name);
3177
3178 return name;
3179}
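memcg_cache_name() builds the child cache name from the root cache name, the memcg's kmem id and the cgroup directory name using the "%s(%d:%s)" format seen above. A quick userspace stand-in shows what the resulting names look like (the sample cache and cgroup names are made up):

#include <stdio.h>

int main(void)
{
    char name[64];

    /* e.g. root cache "kmalloc-64", kmem id 3, cgroup directory "webserver" */
    snprintf(name, sizeof(name), "%s(%d:%s)", "kmalloc-64", 3, "webserver");
    puts(name);                  /* prints: kmalloc-64(3:webserver) */
    return 0;
}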
3180
3181static struct kmem_cache *kmem_cache_dup(struct mem_cgroup *memcg,
3182 struct kmem_cache *s)
3183{
3184 char *name;
3185 struct kmem_cache *new;
3186
3187 name = memcg_cache_name(memcg, s);
3188 if (!name)
3189 return NULL;
3190
3191 new = kmem_cache_create_memcg(memcg, name, s->object_size, s->align,
3192 (s->flags & ~SLAB_PANIC), s->ctor, s);
3193
3194 if (new)
3195 new->allocflags |= __GFP_KMEMCG;
3196
3197 kfree(name);
3198 return new;
3199}
3200
3201/*
3202 * This lock protects updaters, not readers. We want readers to be as fast as
3203 * they can, and they will either see NULL or a valid cache value. Our model
3204 * allows them to see NULL, in which case the root memcg will be selected.
3205 *
3206 * We need this lock because concurrent allocations to the same cache may
3207 * enqueue more than one creation worker. Only one of them can create the cache.
3208 */
3209static DEFINE_MUTEX(memcg_cache_mutex);
3210static struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg,
3211 struct kmem_cache *cachep)
3212{
3213 struct kmem_cache *new_cachep;
3214 int idx;
3215
3216 BUG_ON(!memcg_can_account_kmem(memcg));
3217
3218 idx = memcg_cache_id(memcg);
3219
3220 mutex_lock(&memcg_cache_mutex);
3221 new_cachep = cachep->memcg_params->memcg_caches[idx];
3222 if (new_cachep)
3223 goto out;
3224
3225 new_cachep = kmem_cache_dup(memcg, cachep);
3226 if (new_cachep == NULL) {
3227 new_cachep = cachep;
3228 goto out;
3229 }
3230
3231 mem_cgroup_get(memcg);
3232 atomic_set(&new_cachep->memcg_params->nr_pages , 0);
3233
3234 cachep->memcg_params->memcg_caches[idx] = new_cachep;
3235 /*
3236 * the readers won't lock, make sure everybody sees the updated value,
3237 * so they won't put stuff in the queue again for no reason
3238 */
3239 wmb();
3240out:
3241 mutex_unlock(&memcg_cache_mutex);
3242 return new_cachep;
3243}
3244
3245void kmem_cache_destroy_memcg_children(struct kmem_cache *s)
3246{
3247 struct kmem_cache *c;
3248 int i;
3249
3250 if (!s->memcg_params)
3251 return;
3252 if (!s->memcg_params->is_root_cache)
3253 return;
3254
3255 /*
3256 * If the cache is being destroyed, we trust that there is no one else
3257 * requesting objects from it. Even if there are, the sanity checks in
3258 * kmem_cache_destroy should catch this ill case.
3259 *
3260 * Still, we don't want anyone else freeing memcg_caches under our
3261 * noses, which can happen if a new memcg comes to life. As usual,
3262 * we'll take the set_limit_mutex to protect ourselves against this.
3263 */
3264 mutex_lock(&set_limit_mutex);
3265 for (i = 0; i < memcg_limited_groups_array_size; i++) {
3266 c = s->memcg_params->memcg_caches[i];
3267 if (!c)
3268 continue;
3269
3270 /*
3271 * We will now manually delete the caches, so to avoid races
3272 * we need to cancel all pending destruction workers and
3273 * proceed with destruction ourselves.
3274 *
3275 * kmem_cache_destroy() will call kmem_cache_shrink internally,
3276 * and that could spawn the workers again: it is likely that
3277 * the cache still has active pages until this very moment.
3278 * This would lead us back to mem_cgroup_destroy_cache.
3279 *
3280 * But that will not execute at all if the "dead" flag is not
3281 * set, so flip it down to guarantee we are in control.
3282 */
3283 c->memcg_params->dead = false;
3284 cancel_work_sync(&c->memcg_params->destroy);
3285 kmem_cache_destroy(c);
3286 }
3287 mutex_unlock(&set_limit_mutex);
3288}
3289
3290struct create_work {
3291 struct mem_cgroup *memcg;
3292 struct kmem_cache *cachep;
3293 struct work_struct work;
3294};
3295
3296static void mem_cgroup_destroy_all_caches(struct mem_cgroup *memcg)
3297{
3298 struct kmem_cache *cachep;
3299 struct memcg_cache_params *params;
3300
3301 if (!memcg_kmem_is_active(memcg))
3302 return;
3303
3304 mutex_lock(&memcg->slab_caches_mutex);
3305 list_for_each_entry(params, &memcg->memcg_slab_caches, list) {
3306 cachep = memcg_params_to_cache(params);
3307 cachep->memcg_params->dead = true;
3308 INIT_WORK(&cachep->memcg_params->destroy,
3309 kmem_cache_destroy_work_func);
3310 schedule_work(&cachep->memcg_params->destroy);
3311 }
3312 mutex_unlock(&memcg->slab_caches_mutex);
3313}
3314
3315static void memcg_create_cache_work_func(struct work_struct *w)
3316{
3317 struct create_work *cw;
3318
3319 cw = container_of(w, struct create_work, work);
3320 memcg_create_kmem_cache(cw->memcg, cw->cachep);
3321 /* Drop the reference gotten when we enqueued. */
3322 css_put(&cw->memcg->css);
3323 kfree(cw);
3324}
3325
3326/*
3327 * Enqueue the creation of a per-memcg kmem_cache.
3328 * Called with rcu_read_lock.
3329 */
3330static void __memcg_create_cache_enqueue(struct mem_cgroup *memcg,
3331 struct kmem_cache *cachep)
3332{
3333 struct create_work *cw;
3334
3335 cw = kmalloc(sizeof(struct create_work), GFP_NOWAIT);
3336 if (cw == NULL)
3337 return;
3338
3339 /* The corresponding put will be done in the workqueue. */
3340 if (!css_tryget(&memcg->css)) {
3341 kfree(cw);
3342 return;
3343 }
3344
3345 cw->memcg = memcg;
3346 cw->cachep = cachep;
3347
3348 INIT_WORK(&cw->work, memcg_create_cache_work_func);
3349 schedule_work(&cw->work);
3350}
3351
3352static void memcg_create_cache_enqueue(struct mem_cgroup *memcg,
3353 struct kmem_cache *cachep)
3354{
3355 /*
3356 * We need to stop accounting when we kmalloc, because if the
3357 * corresponding kmalloc cache is not yet created, the first allocation
3358 * in __memcg_create_cache_enqueue will recurse.
3359 *
3360 * However, it is better to enclose the whole function. Depending on
3361 * the debugging options enabled, INIT_WORK(), for instance, can
3362 * trigger an allocation. This too, will make us recurse. Because at
3363 * this point we can't allow ourselves back into memcg_kmem_get_cache,
3364 * the safest choice is to do it like this, wrapping the whole function.
3365 */
3366 memcg_stop_kmem_account();
3367 __memcg_create_cache_enqueue(memcg, cachep);
3368 memcg_resume_kmem_account();
3369}
3370/*
3371 * Return the kmem_cache we're supposed to use for a slab allocation.
3372 * We try to use the current memcg's version of the cache.
3373 *
3374 * If the cache does not exist yet, if we are the first user of it,
3375 * we either create it immediately, if possible, or create it asynchronously
3376 * in a workqueue.
3377 * In the latter case, we will let the current allocation go through with
3378 * the original cache.
3379 *
3380 * Can't be called in interrupt context or from kernel threads.
3381 * This function needs to be called with rcu_read_lock() held.
3382 */
3383struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep,
3384 gfp_t gfp)
3385{
3386 struct mem_cgroup *memcg;
3387 int idx;
3388
3389 VM_BUG_ON(!cachep->memcg_params);
3390 VM_BUG_ON(!cachep->memcg_params->is_root_cache);
3391
3392 if (!current->mm || current->memcg_kmem_skip_account)
3393 return cachep;
3394
3395 rcu_read_lock();
3396 memcg = mem_cgroup_from_task(rcu_dereference(current->mm->owner));
3397 rcu_read_unlock();
3398
3399 if (!memcg_can_account_kmem(memcg))
3400 return cachep;
3401
3402 idx = memcg_cache_id(memcg);
3403
3404 /*
3405 * barrier to make sure we're always seeing the up-to-date value. The
3406 * code updating memcg_caches will issue a write barrier to match this.
3407 */
3408 read_barrier_depends();
3409 if (unlikely(cachep->memcg_params->memcg_caches[idx] == NULL)) {
3410 /*
3411 * If we are in a safe context (can wait, and not in interrupt
3412 * context), we could be predictable and return right away.
3413 * This would guarantee that the allocation being performed
3414 * already belongs in the new cache.
3415 *
3416 * However, there are some clashes that can arrive from locking.
3417 * For instance, because we acquire the slab_mutex while doing
3418 * kmem_cache_dup, this means no further allocation could happen
3419 * with the slab_mutex held.
3420 *
3421 * Also, because cache creation issues get_online_cpus(), this
3422 * creates a lock chain: memcg_slab_mutex -> cpu_hotplug_mutex,
3423 * that ends up reversed during cpu hotplug. (cpuset allocates
3424 * a bunch of GFP_KERNEL memory during cpuup). Due to all that,
3425 * better to defer everything.
3426 */
3427 memcg_create_cache_enqueue(memcg, cachep);
3428 return cachep;
3429 }
3430
3431 return cachep->memcg_params->memcg_caches[idx];
3432}
3433EXPORT_SYMBOL(__memcg_kmem_get_cache);
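__memcg_kmem_get_cache() is a lock-free fast path: it indexes the root cache's per-memcg array with the group's id and, if the slot is still NULL, queues asynchronous creation and lets the current allocation proceed from the root cache. A condensed sketch of that decision (the array size and names are illustrative; creation is just logged instead of queued):

#include <stddef.h>
#include <stdio.h>

#define MAX_GROUPS 16

struct cache {
    const char *name;
    struct cache *per_group[MAX_GROUPS];  /* children, indexed by group id */
};

static void schedule_creation(struct cache *root, int id)
{
    /* In the kernel this is deferred to a workqueue; here just log it. */
    printf("queue creation of %s copy for group %d\n", root->name, id);
}

static struct cache *pick_cache(struct cache *root, int group_id)
{
    if (group_id < 0)                     /* group is not kmem-limited */
        return root;

    if (root->per_group[group_id] == NULL) {
        schedule_creation(root, group_id);
        return root;                      /* serve this allocation from root */
    }
    return root->per_group[group_id];     /* fast path: already created */
}

int main(void)
{
    struct cache root = { .name = "kmalloc-128" };
    struct cache child = { .name = "kmalloc-128(2:test)" };

    pick_cache(&root, 2);                 /* NULL slot: creation gets queued */
    root.per_group[2] = &child;
    printf("%s\n", pick_cache(&root, 2)->name);
    return 0;
}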
3434
3435/*
3436 * We need to verify if the allocation against current->mm->owner's memcg is
3437 * possible for the given order. But the page is not allocated yet, so we'll
3438 * need a further commit step to do the final arrangements.
3439 *
3440 * It is possible for the task to switch cgroups in the meantime, so at
3441 * commit time, we can't rely on task conversion any longer. We'll then use
3442 * the handle argument to return to the caller which cgroup we should commit
3443 * against. We could also return the memcg directly and avoid the pointer
3444 * passing, but a boolean return value gives better semantics considering
3445 * the compiled-out case as well.
3446 *
3447 * Returning true means the allocation is possible.
3448 */
3449bool
3450__memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **_memcg, int order)
3451{
3452 struct mem_cgroup *memcg;
3453 int ret;
3454
3455 *_memcg = NULL;
3456 memcg = try_get_mem_cgroup_from_mm(current->mm);
3457
3458 /*
3459 * very rare case described in mem_cgroup_from_task. Unfortunately there
3460 * isn't much we can do without complicating this too much, and it would
3461 * be gfp-dependent anyway. Just let it go
3462 */
3463 if (unlikely(!memcg))
3464 return true;
3465
3466 if (!memcg_can_account_kmem(memcg)) {
3467 css_put(&memcg->css);
3468 return true;
3469 }
3470
3471 ret = memcg_charge_kmem(memcg, gfp, PAGE_SIZE << order);
3472 if (!ret)
3473 *_memcg = memcg;
3474
3475 css_put(&memcg->css);
3476 return (ret == 0);
3477}
3478
3479void __memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg,
3480 int order)
3481{
3482 struct page_cgroup *pc;
3483
3484 VM_BUG_ON(mem_cgroup_is_root(memcg));
3485
3486 /* The page allocation failed. Revert */
3487 if (!page) {
3488 memcg_uncharge_kmem(memcg, PAGE_SIZE << order);
3489 return;
3490 }
3491
3492 pc = lookup_page_cgroup(page);
3493 lock_page_cgroup(pc);
3494 pc->mem_cgroup = memcg;
3495 SetPageCgroupUsed(pc);
3496 unlock_page_cgroup(pc);
3497}
3498
3499void __memcg_kmem_uncharge_pages(struct page *page, int order)
3500{
3501 struct mem_cgroup *memcg = NULL;
3502 struct page_cgroup *pc;
3503
3504
3505 pc = lookup_page_cgroup(page);
3506 /*
3507 * Fast unlocked return. Theoretically might have changed, have to
3508 * check again after locking.
3509 */
3510 if (!PageCgroupUsed(pc))
3511 return;
3512
3513 lock_page_cgroup(pc);
3514 if (PageCgroupUsed(pc)) {
3515 memcg = pc->mem_cgroup;
3516 ClearPageCgroupUsed(pc);
3517 }
3518 unlock_page_cgroup(pc);
3519
3520 /*
3521 * We trust that only if there is a memcg associated with the page, it
3522 * is a valid allocation
3523 */
3524 if (!memcg)
3525 return;
3526
3527 VM_BUG_ON(mem_cgroup_is_root(memcg));
3528 memcg_uncharge_kmem(memcg, PAGE_SIZE << order);
3529}
3530#else
3531static inline void mem_cgroup_destroy_all_caches(struct mem_cgroup *memcg)
3532{
3533}
3534#endif /* CONFIG_MEMCG_KMEM */
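The three page-level hooks above form a charge/commit/uncharge protocol: the charge step reserves against the current task's group before the page exists, the commit step binds the allocated page to that group (or reverts the reservation if allocation failed), and the uncharge step settles the books at free time. A userspace model of that protocol with stub types standing in for memcg and struct page (names here are placeholders, not the kernel API):

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

struct group { const char *name; long usage; };
struct page  { struct group *owner; size_t bytes; };

static bool newpage_charge(struct group *g, size_t bytes, struct group **out)
{
    g->usage += bytes;          /* reserve before the page exists */
    *out = g;
    return true;
}

static void commit_charge(struct page *p, struct group *g, size_t bytes)
{
    if (!p) {                   /* allocation failed: revert the reserve */
        g->usage -= bytes;
        return;
    }
    p->owner = g;               /* bind the page to the charged group */
    p->bytes = bytes;
}

static void uncharge_pages(struct page *p)
{
    if (p && p->owner)
        p->owner->usage -= p->bytes;
}

int main(void)
{
    struct group grp = { "demo", 0 };
    struct group *charged;
    struct page pg = { 0 };

    if (newpage_charge(&grp, 4096, &charged)) {
        commit_charge(&pg, charged, 4096);      /* page "allocated" */
        printf("usage after commit: %ld\n", grp.usage);
        uncharge_pages(&pg);                    /* page "freed" */
        printf("usage after free:   %ld\n", grp.usage);
    }
    return 0;
}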
3535
2629#ifdef CONFIG_TRANSPARENT_HUGEPAGE 3536#ifdef CONFIG_TRANSPARENT_HUGEPAGE
2630 3537
2631#define PCGF_NOCOPY_AT_SPLIT (1 << PCG_LOCK | 1 << PCG_MIGRATION) 3538#define PCGF_NOCOPY_AT_SPLIT (1 << PCG_LOCK | 1 << PCG_MIGRATION)
@@ -2709,13 +3616,6 @@ static int mem_cgroup_move_account(struct page *page,
2709 /* caller should have done css_get */ 3616 /* caller should have done css_get */
2710 pc->mem_cgroup = to; 3617 pc->mem_cgroup = to;
2711 mem_cgroup_charge_statistics(to, anon, nr_pages); 3618 mem_cgroup_charge_statistics(to, anon, nr_pages);
2712 /*
2713 * We charges against "to" which may not have any tasks. Then, "to"
2714 * can be under rmdir(). But in current implementation, caller of
2715 * this function is just force_empty() and move charge, so it's
2716 * guaranteed that "to" is never removed. So, we don't check rmdir
2717 * status here.
2718 */
2719 move_unlock_mem_cgroup(from, &flags); 3619 move_unlock_mem_cgroup(from, &flags);
2720 ret = 0; 3620 ret = 0;
2721unlock: 3621unlock:
@@ -2729,10 +3629,27 @@ out:
2729 return ret; 3629 return ret;
2730} 3630}
2731 3631
2732/* 3632/**
2733 * move charges to its parent. 3633 * mem_cgroup_move_parent - moves page to the parent group
3634 * @page: the page to move
3635 * @pc: page_cgroup of the page
3636 * @child: page's cgroup
3637 *
3638 * move charges to its parent or the root cgroup if the group has no
3639 * parent (aka use_hierarchy==0).
3640 * Although this might fail (get_page_unless_zero, isolate_lru_page or
3641 * mem_cgroup_move_account fails) the failure is always temporary and
3642 * it signals a race with a page removal/uncharge or migration. In the
3643 * first case the page is on the way out and it will vanish from the LRU
3644 * on the next attempt and the call should be retried later.
3645 * Isolation from the LRU fails only if the page has been isolated from
3646 * the LRU since we looked at it and that usually means either global
3647 * reclaim or migration going on. The page will either get back to the
3648 * LRU or vanish.
3649 * Finally, mem_cgroup_move_account fails only if the page got uncharged
3650 * (!PageCgroupUsed) or moved to a different group. The page will
3651 * disappear in the next attempt.
2734 */ 3652 */
2735
2736static int mem_cgroup_move_parent(struct page *page, 3653static int mem_cgroup_move_parent(struct page *page,
2737 struct page_cgroup *pc, 3654 struct page_cgroup *pc,
2738 struct mem_cgroup *child) 3655 struct mem_cgroup *child)
@@ -2742,9 +3659,7 @@ static int mem_cgroup_move_parent(struct page *page,
2742 unsigned long uninitialized_var(flags); 3659 unsigned long uninitialized_var(flags);
2743 int ret; 3660 int ret;
2744 3661
2745 /* Is ROOT ? */ 3662 VM_BUG_ON(mem_cgroup_is_root(child));
2746 if (mem_cgroup_is_root(child))
2747 return -EINVAL;
2748 3663
2749 ret = -EBUSY; 3664 ret = -EBUSY;
2750 if (!get_page_unless_zero(page)) 3665 if (!get_page_unless_zero(page))
@@ -2761,8 +3676,10 @@ static int mem_cgroup_move_parent(struct page *page,
2761 if (!parent) 3676 if (!parent)
2762 parent = root_mem_cgroup; 3677 parent = root_mem_cgroup;
2763 3678
2764 if (nr_pages > 1) 3679 if (nr_pages > 1) {
3680 VM_BUG_ON(!PageTransHuge(page));
2765 flags = compound_lock_irqsave(page); 3681 flags = compound_lock_irqsave(page);
3682 }
2766 3683
2767 ret = mem_cgroup_move_account(page, nr_pages, 3684 ret = mem_cgroup_move_account(page, nr_pages,
2768 pc, child, parent); 3685 pc, child, parent);
@@ -2904,7 +3821,6 @@ __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *memcg,
2904 return; 3821 return;
2905 if (!memcg) 3822 if (!memcg)
2906 return; 3823 return;
2907 cgroup_exclude_rmdir(&memcg->css);
2908 3824
2909 __mem_cgroup_commit_charge(memcg, page, 1, ctype, true); 3825 __mem_cgroup_commit_charge(memcg, page, 1, ctype, true);
2910 /* 3826 /*
@@ -2918,12 +3834,6 @@ __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *memcg,
2918 swp_entry_t ent = {.val = page_private(page)}; 3834 swp_entry_t ent = {.val = page_private(page)};
2919 mem_cgroup_uncharge_swap(ent); 3835 mem_cgroup_uncharge_swap(ent);
2920 } 3836 }
2921 /*
2922 * At swapin, we may charge account against cgroup which has no tasks.
2923 * So, rmdir()->pre_destroy() can be called while we do this charge.
2924 * In that case, we need to call pre_destroy() again. check it here.
2925 */
2926 cgroup_release_and_wakeup_rmdir(&memcg->css);
2927} 3837}
2928 3838
2929void mem_cgroup_commit_charge_swapin(struct page *page, 3839void mem_cgroup_commit_charge_swapin(struct page *page,
@@ -3288,15 +4198,18 @@ void mem_cgroup_prepare_migration(struct page *page, struct page *newpage,
3288 struct mem_cgroup **memcgp) 4198 struct mem_cgroup **memcgp)
3289{ 4199{
3290 struct mem_cgroup *memcg = NULL; 4200 struct mem_cgroup *memcg = NULL;
4201 unsigned int nr_pages = 1;
3291 struct page_cgroup *pc; 4202 struct page_cgroup *pc;
3292 enum charge_type ctype; 4203 enum charge_type ctype;
3293 4204
3294 *memcgp = NULL; 4205 *memcgp = NULL;
3295 4206
3296 VM_BUG_ON(PageTransHuge(page));
3297 if (mem_cgroup_disabled()) 4207 if (mem_cgroup_disabled())
3298 return; 4208 return;
3299 4209
4210 if (PageTransHuge(page))
4211 nr_pages <<= compound_order(page);
4212
3300 pc = lookup_page_cgroup(page); 4213 pc = lookup_page_cgroup(page);
3301 lock_page_cgroup(pc); 4214 lock_page_cgroup(pc);
3302 if (PageCgroupUsed(pc)) { 4215 if (PageCgroupUsed(pc)) {
@@ -3358,7 +4271,7 @@ void mem_cgroup_prepare_migration(struct page *page, struct page *newpage,
3358 * charged to the res_counter since we plan on replacing the 4271 * charged to the res_counter since we plan on replacing the
3359 * old one and only one page is going to be left afterwards. 4272 * old one and only one page is going to be left afterwards.
3360 */ 4273 */
3361 __mem_cgroup_commit_charge(memcg, newpage, 1, ctype, false); 4274 __mem_cgroup_commit_charge(memcg, newpage, nr_pages, ctype, false);
3362} 4275}
3363 4276
3364/* remove redundant charge if migration failed*/ 4277/* remove redundant charge if migration failed*/
@@ -3371,8 +4284,7 @@ void mem_cgroup_end_migration(struct mem_cgroup *memcg,
3371 4284
3372 if (!memcg) 4285 if (!memcg)
3373 return; 4286 return;
3374 /* blocks rmdir() */ 4287
3375 cgroup_exclude_rmdir(&memcg->css);
3376 if (!migration_ok) { 4288 if (!migration_ok) {
3377 used = oldpage; 4289 used = oldpage;
3378 unused = newpage; 4290 unused = newpage;
@@ -3406,13 +4318,6 @@ void mem_cgroup_end_migration(struct mem_cgroup *memcg,
3406 */ 4318 */
3407 if (anon) 4319 if (anon)
3408 mem_cgroup_uncharge_page(used); 4320 mem_cgroup_uncharge_page(used);
3409 /*
3410 * At migration, we may charge account against cgroup which has no
3411 * tasks.
3412 * So, rmdir()->pre_destroy() can be called while we do this charge.
3413 * In that case, we need to call pre_destroy() again. check it here.
3414 */
3415 cgroup_release_and_wakeup_rmdir(&memcg->css);
3416} 4321}
3417 4322
3418/* 4323/*
@@ -3490,8 +4395,6 @@ void mem_cgroup_print_bad_page(struct page *page)
3490} 4395}
3491#endif 4396#endif
3492 4397
3493static DEFINE_MUTEX(set_limit_mutex);
3494
3495static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, 4398static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
3496 unsigned long long val) 4399 unsigned long long val)
3497{ 4400{
@@ -3712,17 +4615,22 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
3712 return nr_reclaimed; 4615 return nr_reclaimed;
3713} 4616}
3714 4617
3715/* 4618/**
4619 * mem_cgroup_force_empty_list - clears LRU of a group
4620 * @memcg: group to clear
4621 * @node: NUMA node
4622 * @zid: zone id
4623 * @lru: lru to clear
4624 *
3716 * Traverse a specified page_cgroup list and try to drop them all. This doesn't 4625 * Traverse a specified page_cgroup list and try to drop them all. This doesn't
3717 * reclaim the pages themselves - it just removes the page_cgroups. 4626 * reclaim the pages themselves - pages are moved to the parent (or root)
3718 * Returns true if some page_cgroups were not freed, indicating that the caller 4627 * group.
3719 * must retry this operation.
3720 */ 4628 */
3721static bool mem_cgroup_force_empty_list(struct mem_cgroup *memcg, 4629static void mem_cgroup_force_empty_list(struct mem_cgroup *memcg,
3722 int node, int zid, enum lru_list lru) 4630 int node, int zid, enum lru_list lru)
3723{ 4631{
3724 struct lruvec *lruvec; 4632 struct lruvec *lruvec;
3725 unsigned long flags, loop; 4633 unsigned long flags;
3726 struct list_head *list; 4634 struct list_head *list;
3727 struct page *busy; 4635 struct page *busy;
3728 struct zone *zone; 4636 struct zone *zone;
@@ -3731,11 +4639,8 @@ static bool mem_cgroup_force_empty_list(struct mem_cgroup *memcg,
3731 lruvec = mem_cgroup_zone_lruvec(zone, memcg); 4639 lruvec = mem_cgroup_zone_lruvec(zone, memcg);
3732 list = &lruvec->lists[lru]; 4640 list = &lruvec->lists[lru];
3733 4641
3734 loop = mem_cgroup_get_lru_size(lruvec, lru);
3735 /* give some margin against EBUSY etc...*/
3736 loop += 256;
3737 busy = NULL; 4642 busy = NULL;
3738 while (loop--) { 4643 do {
3739 struct page_cgroup *pc; 4644 struct page_cgroup *pc;
3740 struct page *page; 4645 struct page *page;
3741 4646
@@ -3761,76 +4666,80 @@ static bool mem_cgroup_force_empty_list(struct mem_cgroup *memcg,
3761 cond_resched(); 4666 cond_resched();
3762 } else 4667 } else
3763 busy = NULL; 4668 busy = NULL;
3764 } 4669 } while (!list_empty(list));
3765 return !list_empty(list);
3766} 4670}
3767 4671
3768/* 4672/*
3769 * make mem_cgroup's charge to be 0 if there is no task. 4673 * make mem_cgroup's charge to be 0 if there is no task by moving
4674 * all the charges and pages to the parent.
3770 * This enables deleting this mem_cgroup. 4675 * This enables deleting this mem_cgroup.
4676 *
4677 * Caller is responsible for holding css reference on the memcg.
3771 */ 4678 */
3772static int mem_cgroup_force_empty(struct mem_cgroup *memcg, bool free_all) 4679static void mem_cgroup_reparent_charges(struct mem_cgroup *memcg)
3773{ 4680{
3774 int ret; 4681 int node, zid;
3775 int node, zid, shrink; 4682 u64 usage;
3776 int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
3777 struct cgroup *cgrp = memcg->css.cgroup;
3778
3779 css_get(&memcg->css);
3780 4683
3781 shrink = 0;
3782 /* should free all ? */
3783 if (free_all)
3784 goto try_to_free;
3785move_account:
3786 do { 4684 do {
3787 ret = -EBUSY;
3788 if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children))
3789 goto out;
3790 /* This is for making all *used* pages to be on LRU. */ 4685 /* This is for making all *used* pages to be on LRU. */
3791 lru_add_drain_all(); 4686 lru_add_drain_all();
3792 drain_all_stock_sync(memcg); 4687 drain_all_stock_sync(memcg);
3793 ret = 0;
3794 mem_cgroup_start_move(memcg); 4688 mem_cgroup_start_move(memcg);
3795 for_each_node_state(node, N_HIGH_MEMORY) { 4689 for_each_node_state(node, N_MEMORY) {
3796 for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) { 4690 for (zid = 0; zid < MAX_NR_ZONES; zid++) {
3797 enum lru_list lru; 4691 enum lru_list lru;
3798 for_each_lru(lru) { 4692 for_each_lru(lru) {
3799 ret = mem_cgroup_force_empty_list(memcg, 4693 mem_cgroup_force_empty_list(memcg,
3800 node, zid, lru); 4694 node, zid, lru);
3801 if (ret)
3802 break;
3803 } 4695 }
3804 } 4696 }
3805 if (ret)
3806 break;
3807 } 4697 }
3808 mem_cgroup_end_move(memcg); 4698 mem_cgroup_end_move(memcg);
3809 memcg_oom_recover(memcg); 4699 memcg_oom_recover(memcg);
3810 cond_resched(); 4700 cond_resched();
3811 /* "ret" should also be checked to ensure all lists are empty. */
3812 } while (res_counter_read_u64(&memcg->res, RES_USAGE) > 0 || ret);
3813out:
3814 css_put(&memcg->css);
3815 return ret;
3816 4701
3817try_to_free: 4702 /*
4703 * Kernel memory may not necessarily be trackable to a specific
4704 * process. So they are not migrated, and therefore we can't
4705 * expect their value to drop to 0 here.
4706 * Having res filled up with kmem only is enough.
4707 *
4708 * This is a safety check because mem_cgroup_force_empty_list
4709 * could have raced with mem_cgroup_replace_page_cache callers
4710 * so the lru seemed empty but the page could have been added
4711 * right after the check. RES_USAGE should be safe as we always
4712 * charge before adding to the LRU.
4713 */
4714 usage = res_counter_read_u64(&memcg->res, RES_USAGE) -
4715 res_counter_read_u64(&memcg->kmem, RES_USAGE);
4716 } while (usage > 0);
4717}
4718
4719/*
4720 * Reclaims as many pages from the given memcg as possible and moves
4721 * the rest to the parent.
4722 *
4723 * Caller is responsible for holding css reference for memcg.
4724 */
4725static int mem_cgroup_force_empty(struct mem_cgroup *memcg)
4726{
4727 int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
4728 struct cgroup *cgrp = memcg->css.cgroup;
4729
3818 /* returns EBUSY if there is a task or if we come here twice. */ 4730 /* returns EBUSY if there is a task or if we come here twice. */
3819 if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children) || shrink) { 4731 if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children))
3820 ret = -EBUSY; 4732 return -EBUSY;
3821 goto out; 4733
3822 }
3823 /* we call try-to-free pages for make this cgroup empty */ 4734 /* we call try-to-free pages for make this cgroup empty */
3824 lru_add_drain_all(); 4735 lru_add_drain_all();
3825 /* try to free all pages in this cgroup */ 4736 /* try to free all pages in this cgroup */
3826 shrink = 1;
3827 while (nr_retries && res_counter_read_u64(&memcg->res, RES_USAGE) > 0) { 4737 while (nr_retries && res_counter_read_u64(&memcg->res, RES_USAGE) > 0) {
3828 int progress; 4738 int progress;
3829 4739
3830 if (signal_pending(current)) { 4740 if (signal_pending(current))
3831 ret = -EINTR; 4741 return -EINTR;
3832 goto out; 4742
3833 }
3834 progress = try_to_free_mem_cgroup_pages(memcg, GFP_KERNEL, 4743 progress = try_to_free_mem_cgroup_pages(memcg, GFP_KERNEL,
3835 false); 4744 false);
3836 if (!progress) { 4745 if (!progress) {
@@ -3841,13 +4750,23 @@ try_to_free:
3841 4750
3842 } 4751 }
3843 lru_add_drain(); 4752 lru_add_drain();
3844 /* try move_account...there may be some *locked* pages. */ 4753 mem_cgroup_reparent_charges(memcg);
3845 goto move_account; 4754
4755 return 0;
3846} 4756}
3847 4757
3848static int mem_cgroup_force_empty_write(struct cgroup *cont, unsigned int event) 4758static int mem_cgroup_force_empty_write(struct cgroup *cont, unsigned int event)
3849{ 4759{
3850 return mem_cgroup_force_empty(mem_cgroup_from_cont(cont), true); 4760 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
4761 int ret;
4762
4763 if (mem_cgroup_is_root(memcg))
4764 return -EINVAL;
4765 css_get(&memcg->css);
4766 ret = mem_cgroup_force_empty(memcg);
4767 css_put(&memcg->css);
4768
4769 return ret;
3851} 4770}
3852 4771
3853 4772
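mem_cgroup_reparent_charges() above no longer counts iterations; it loops on the accounting itself and stops only when everything left in res is kmem, which cannot be reparented. A toy model of that termination condition (sizes and the per-pass batch are arbitrary):

#include <stdio.h>

struct group {
    long res_usage;     /* all charged memory, including kmem  */
    long kmem_usage;    /* kernel-memory part, not reparentable */
    struct group *parent;
};

/* Pretend one pass moves up to @batch bytes of user pages to the parent. */
static void reparent_pass(struct group *g, long batch)
{
    long movable = g->res_usage - g->kmem_usage;
    long moved = movable < batch ? movable : batch;

    g->res_usage -= moved;
    if (g->parent)
        g->parent->res_usage += moved;
}

static void reparent_charges(struct group *g)
{
    /* Loop on the counters, not on a retry budget. */
    while (g->res_usage - g->kmem_usage > 0)
        reparent_pass(g, 4096);
}

int main(void)
{
    struct group root = { 0, 0, NULL };
    struct group child = { 20480, 8192, &root };

    reparent_charges(&child);
    printf("child res=%ld kmem=%ld, root res=%ld\n",
           child.res_usage, child.kmem_usage, root.res_usage);
    return 0;
}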
@@ -3938,7 +4857,8 @@ static ssize_t mem_cgroup_read(struct cgroup *cont, struct cftype *cft,
3938 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); 4857 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
3939 char str[64]; 4858 char str[64];
3940 u64 val; 4859 u64 val;
3941 int type, name, len; 4860 int name, len;
4861 enum res_type type;
3942 4862
3943 type = MEMFILE_TYPE(cft->private); 4863 type = MEMFILE_TYPE(cft->private);
3944 name = MEMFILE_ATTR(cft->private); 4864 name = MEMFILE_ATTR(cft->private);
@@ -3959,6 +4879,9 @@ static ssize_t mem_cgroup_read(struct cgroup *cont, struct cftype *cft,
3959 else 4879 else
3960 val = res_counter_read_u64(&memcg->memsw, name); 4880 val = res_counter_read_u64(&memcg->memsw, name);
3961 break; 4881 break;
4882 case _KMEM:
4883 val = res_counter_read_u64(&memcg->kmem, name);
4884 break;
3962 default: 4885 default:
3963 BUG(); 4886 BUG();
3964 } 4887 }
@@ -3966,6 +4889,125 @@ static ssize_t mem_cgroup_read(struct cgroup *cont, struct cftype *cft,
3966 len = scnprintf(str, sizeof(str), "%llu\n", (unsigned long long)val); 4889 len = scnprintf(str, sizeof(str), "%llu\n", (unsigned long long)val);
3967 return simple_read_from_buffer(buf, nbytes, ppos, str, len); 4890 return simple_read_from_buffer(buf, nbytes, ppos, str, len);
3968} 4891}
4892
4893static int memcg_update_kmem_limit(struct cgroup *cont, u64 val)
4894{
4895 int ret = -EINVAL;
4896#ifdef CONFIG_MEMCG_KMEM
4897 bool must_inc_static_branch = false;
4898
4899 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
4900 /*
4901 * For simplicity, we won't allow this to be disabled. It also can't
4902 * be changed if the cgroup has children already, or if tasks had
4903 * already joined.
4904 *
4905 * If tasks join before we set the limit, a person looking at
4906 * kmem.usage_in_bytes will have no way to determine when it took
4907 * place, which makes the value quite meaningless.
4908 *
4909 * After it first became limited, changes in the value of the limit are
4910 * of course permitted.
4911 *
4912 * Taking the cgroup_lock is really offensive, but it is so far the only
4913 * way to guarantee that no children will appear. There are plenty of
4914 * other offenders, and they should all go away. Fine grained locking
4915 * is probably the way to go here. When we are fully hierarchical, we
4916 * can also get rid of the use_hierarchy check.
4917 */
4918 cgroup_lock();
4919 mutex_lock(&set_limit_mutex);
4920 if (!memcg->kmem_account_flags && val != RESOURCE_MAX) {
4921 if (cgroup_task_count(cont) || (memcg->use_hierarchy &&
4922 !list_empty(&cont->children))) {
4923 ret = -EBUSY;
4924 goto out;
4925 }
4926 ret = res_counter_set_limit(&memcg->kmem, val);
4927 VM_BUG_ON(ret);
4928
4929 ret = memcg_update_cache_sizes(memcg);
4930 if (ret) {
4931 res_counter_set_limit(&memcg->kmem, RESOURCE_MAX);
4932 goto out;
4933 }
4934 must_inc_static_branch = true;
4935 /*
4936 * kmem charges can outlive the cgroup. In the case of slab
4937 * pages, for instance, a page can contain objects from various
4938 * processes, so it is unfeasible to migrate them away. We
4939 * need to reference count the memcg because of that.
4940 */
4941 mem_cgroup_get(memcg);
4942 } else
4943 ret = res_counter_set_limit(&memcg->kmem, val);
4944out:
4945 mutex_unlock(&set_limit_mutex);
4946 cgroup_unlock();
4947
4948 /*
4949 * We are by now familiar with the fact that we can't inc the static
4950 * branch inside cgroup_lock. See disarm functions for details. A
4951 * worker here is overkill, but also wrong: After the limit is set, we
4952 * must start accounting right away. Since this operation can't fail,
4953 * we can safely defer it to here - no rollback will be needed.
4954 *
4955 * The boolean used to control this is also safe, because
4956 * KMEM_ACCOUNTED_ACTIVATED guarantees that only one process will be
4957 * able to set it to true;
4958 */
4959 if (must_inc_static_branch) {
4960 static_key_slow_inc(&memcg_kmem_enabled_key);
4961 /*
4962 * setting the active bit after the inc will guarantee no one
4963 * starts accounting before all call sites are patched
4964 */
4965 memcg_kmem_set_active(memcg);
4966 }
4967
4968#endif
4969 return ret;
4970}
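memcg_update_kmem_limit() only lets kmem accounting be switched on while the group is still empty (no tasks, no children), and once on it can never be switched off; later writes merely adjust the numeric limit. A small model of that one-way gate, with RESOURCE_MAX standing in for "no limit set":

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define RESOURCE_MAX UINT64_MAX    /* "no limit set yet" */

struct group {
    bool active;                   /* kmem accounting switched on?   */
    bool has_tasks, has_children;  /* state that blocks activation   */
    uint64_t kmem_limit;
};

static int set_kmem_limit(struct group *g, uint64_t val)
{
    if (!g->active && val != RESOURCE_MAX) {
        if (g->has_tasks || g->has_children)
            return -1;             /* -EBUSY: too late to start accounting */
        g->active = true;          /* one-way switch, never cleared */
    }
    g->kmem_limit = val;
    return 0;
}

int main(void)
{
    struct group g = { .kmem_limit = RESOURCE_MAX };

    printf("%d\n", set_kmem_limit(&g, 1 << 20));  /* 0: activated            */
    g.has_tasks = true;
    printf("%d\n", set_kmem_limit(&g, 2 << 20));  /* 0: only the value moves */
    return 0;
}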
4971
4972static int memcg_propagate_kmem(struct mem_cgroup *memcg)
4973{
4974 int ret = 0;
4975 struct mem_cgroup *parent = parent_mem_cgroup(memcg);
4976 if (!parent)
4977 goto out;
4978
4979 memcg->kmem_account_flags = parent->kmem_account_flags;
4980#ifdef CONFIG_MEMCG_KMEM
4981 /*
4982 * When that happens, we need to disable the static branch only on those
4983 * memcgs that enabled it. To achieve this, we would be forced to
4984 * complicate the code by keeping track of which memcgs were the ones
4985 * that actually enabled limits, and which ones got it from their
4986 * parents.
4987 *
4988 * It is a lot simpler just to do static_key_slow_inc() on every child
4989 * that is accounted.
4990 */
4991 if (!memcg_kmem_is_active(memcg))
4992 goto out;
4993
4994 /*
4995 * destroy(), called if we fail, will issue static_key_slow_inc() and
4996 * mem_cgroup_put() if kmem is enabled. We have to either call them
4997 * unconditionally, or clear the KMEM_ACTIVE flag. I personally find
4998 * this more consistent, since it always leads to the same destroy path
4999 */
5000 mem_cgroup_get(memcg);
5001 static_key_slow_inc(&memcg_kmem_enabled_key);
5002
5003 mutex_lock(&set_limit_mutex);
5004 ret = memcg_update_cache_sizes(memcg);
5005 mutex_unlock(&set_limit_mutex);
5006#endif
5007out:
5008 return ret;
5009}
5010
3969/* 5011/*
3970 * The user of this function is... 5012 * The user of this function is...
3971 * RES_LIMIT. 5013 * RES_LIMIT.
@@ -3974,7 +5016,8 @@ static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft,
3974 const char *buffer) 5016 const char *buffer)
3975{ 5017{
3976 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); 5018 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
3977 int type, name; 5019 enum res_type type;
5020 int name;
3978 unsigned long long val; 5021 unsigned long long val;
3979 int ret; 5022 int ret;
3980 5023
@@ -3996,8 +5039,12 @@ static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft,
3996 break; 5039 break;
3997 if (type == _MEM) 5040 if (type == _MEM)
3998 ret = mem_cgroup_resize_limit(memcg, val); 5041 ret = mem_cgroup_resize_limit(memcg, val);
3999 else 5042 else if (type == _MEMSWAP)
4000 ret = mem_cgroup_resize_memsw_limit(memcg, val); 5043 ret = mem_cgroup_resize_memsw_limit(memcg, val);
5044 else if (type == _KMEM)
5045 ret = memcg_update_kmem_limit(cont, val);
5046 else
5047 return -EINVAL;
4001 break; 5048 break;
4002 case RES_SOFT_LIMIT: 5049 case RES_SOFT_LIMIT:
4003 ret = res_counter_memparse_write_strategy(buffer, &val); 5050 ret = res_counter_memparse_write_strategy(buffer, &val);
@@ -4050,7 +5097,8 @@ out:
4050static int mem_cgroup_reset(struct cgroup *cont, unsigned int event) 5097static int mem_cgroup_reset(struct cgroup *cont, unsigned int event)
4051{ 5098{
4052 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); 5099 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
4053 int type, name; 5100 int name;
5101 enum res_type type;
4054 5102
4055 type = MEMFILE_TYPE(event); 5103 type = MEMFILE_TYPE(event);
4056 name = MEMFILE_ATTR(event); 5104 name = MEMFILE_ATTR(event);
@@ -4062,14 +5110,22 @@ static int mem_cgroup_reset(struct cgroup *cont, unsigned int event)
4062 case RES_MAX_USAGE: 5110 case RES_MAX_USAGE:
4063 if (type == _MEM) 5111 if (type == _MEM)
4064 res_counter_reset_max(&memcg->res); 5112 res_counter_reset_max(&memcg->res);
4065 else 5113 else if (type == _MEMSWAP)
4066 res_counter_reset_max(&memcg->memsw); 5114 res_counter_reset_max(&memcg->memsw);
5115 else if (type == _KMEM)
5116 res_counter_reset_max(&memcg->kmem);
5117 else
5118 return -EINVAL;
4067 break; 5119 break;
4068 case RES_FAILCNT: 5120 case RES_FAILCNT:
4069 if (type == _MEM) 5121 if (type == _MEM)
4070 res_counter_reset_failcnt(&memcg->res); 5122 res_counter_reset_failcnt(&memcg->res);
4071 else 5123 else if (type == _MEMSWAP)
4072 res_counter_reset_failcnt(&memcg->memsw); 5124 res_counter_reset_failcnt(&memcg->memsw);
5125 else if (type == _KMEM)
5126 res_counter_reset_failcnt(&memcg->kmem);
5127 else
5128 return -EINVAL;
4073 break; 5129 break;
4074 } 5130 }
4075 5131
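The read/write/reset handlers above all recover a resource type (_MEM, _MEMSWAP, _KMEM, ...) and an attribute (RES_LIMIT, RES_FAILCNT, ...) from the single cft->private integer via MEMFILE_TYPE()/MEMFILE_ATTR(). A plausible sketch of such a packing, assuming the usual type-in-the-high-bits layout; the shift and enum values here are illustrative, not taken from the kernel headers:

#include <stdio.h>

/* Illustrative packing: type in the high 16 bits, attribute in the low 16. */
enum res_type { _MEM, _MEMSWAP, _OOM_TYPE, _KMEM };
enum res_attr { RES_USAGE, RES_LIMIT, RES_MAX_USAGE, RES_FAILCNT };

#define MEMFILE_PRIVATE(type, attr) (((type) << 16) | (attr))
#define MEMFILE_TYPE(val)           ((enum res_type)(((val) >> 16) & 0xffff))
#define MEMFILE_ATTR(val)           ((val) & 0xffff)

int main(void)
{
    int private = MEMFILE_PRIVATE(_KMEM, RES_LIMIT);

    printf("type=%d attr=%d\n", MEMFILE_TYPE(private), MEMFILE_ATTR(private));
    return 0;   /* prints type=3 attr=1 with the values above */
}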
@@ -4120,7 +5176,7 @@ static int memcg_numa_stat_show(struct cgroup *cont, struct cftype *cft,
4120 5176
4121 total_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL); 5177 total_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL);
4122 seq_printf(m, "total=%lu", total_nr); 5178 seq_printf(m, "total=%lu", total_nr);
4123 for_each_node_state(nid, N_HIGH_MEMORY) { 5179 for_each_node_state(nid, N_MEMORY) {
4124 node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL); 5180 node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL);
4125 seq_printf(m, " N%d=%lu", nid, node_nr); 5181 seq_printf(m, " N%d=%lu", nid, node_nr);
4126 } 5182 }
@@ -4128,7 +5184,7 @@ static int memcg_numa_stat_show(struct cgroup *cont, struct cftype *cft,
4128 5184
4129 file_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL_FILE); 5185 file_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL_FILE);
4130 seq_printf(m, "file=%lu", file_nr); 5186 seq_printf(m, "file=%lu", file_nr);
4131 for_each_node_state(nid, N_HIGH_MEMORY) { 5187 for_each_node_state(nid, N_MEMORY) {
4132 node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid, 5188 node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid,
4133 LRU_ALL_FILE); 5189 LRU_ALL_FILE);
4134 seq_printf(m, " N%d=%lu", nid, node_nr); 5190 seq_printf(m, " N%d=%lu", nid, node_nr);
@@ -4137,7 +5193,7 @@ static int memcg_numa_stat_show(struct cgroup *cont, struct cftype *cft,
4137 5193
4138 anon_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL_ANON); 5194 anon_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL_ANON);
4139 seq_printf(m, "anon=%lu", anon_nr); 5195 seq_printf(m, "anon=%lu", anon_nr);
4140 for_each_node_state(nid, N_HIGH_MEMORY) { 5196 for_each_node_state(nid, N_MEMORY) {
4141 node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid, 5197 node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid,
4142 LRU_ALL_ANON); 5198 LRU_ALL_ANON);
4143 seq_printf(m, " N%d=%lu", nid, node_nr); 5199 seq_printf(m, " N%d=%lu", nid, node_nr);
@@ -4146,7 +5202,7 @@ static int memcg_numa_stat_show(struct cgroup *cont, struct cftype *cft,
4146 5202
4147 unevictable_nr = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_UNEVICTABLE)); 5203 unevictable_nr = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_UNEVICTABLE));
4148 seq_printf(m, "unevictable=%lu", unevictable_nr); 5204 seq_printf(m, "unevictable=%lu", unevictable_nr);
4149 for_each_node_state(nid, N_HIGH_MEMORY) { 5205 for_each_node_state(nid, N_MEMORY) {
4150 node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid, 5206 node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid,
4151 BIT(LRU_UNEVICTABLE)); 5207 BIT(LRU_UNEVICTABLE));
4152 seq_printf(m, " N%d=%lu", nid, node_nr); 5208 seq_printf(m, " N%d=%lu", nid, node_nr);
@@ -4386,7 +5442,7 @@ static int mem_cgroup_usage_register_event(struct cgroup *cgrp,
4386 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); 5442 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
4387 struct mem_cgroup_thresholds *thresholds; 5443 struct mem_cgroup_thresholds *thresholds;
4388 struct mem_cgroup_threshold_ary *new; 5444 struct mem_cgroup_threshold_ary *new;
4389 int type = MEMFILE_TYPE(cft->private); 5445 enum res_type type = MEMFILE_TYPE(cft->private);
4390 u64 threshold, usage; 5446 u64 threshold, usage;
4391 int i, size, ret; 5447 int i, size, ret;
4392 5448
@@ -4469,7 +5525,7 @@ static void mem_cgroup_usage_unregister_event(struct cgroup *cgrp,
4469 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); 5525 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
4470 struct mem_cgroup_thresholds *thresholds; 5526 struct mem_cgroup_thresholds *thresholds;
4471 struct mem_cgroup_threshold_ary *new; 5527 struct mem_cgroup_threshold_ary *new;
4472 int type = MEMFILE_TYPE(cft->private); 5528 enum res_type type = MEMFILE_TYPE(cft->private);
4473 u64 usage; 5529 u64 usage;
4474 int i, j, size; 5530 int i, j, size;
4475 5531
@@ -4547,7 +5603,7 @@ static int mem_cgroup_oom_register_event(struct cgroup *cgrp,
4547{ 5603{
4548 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); 5604 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
4549 struct mem_cgroup_eventfd_list *event; 5605 struct mem_cgroup_eventfd_list *event;
4550 int type = MEMFILE_TYPE(cft->private); 5606 enum res_type type = MEMFILE_TYPE(cft->private);
4551 5607
4552 BUG_ON(type != _OOM_TYPE); 5608 BUG_ON(type != _OOM_TYPE);
4553 event = kmalloc(sizeof(*event), GFP_KERNEL); 5609 event = kmalloc(sizeof(*event), GFP_KERNEL);
@@ -4572,7 +5628,7 @@ static void mem_cgroup_oom_unregister_event(struct cgroup *cgrp,
4572{ 5628{
4573 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); 5629 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
4574 struct mem_cgroup_eventfd_list *ev, *tmp; 5630 struct mem_cgroup_eventfd_list *ev, *tmp;
4575 int type = MEMFILE_TYPE(cft->private); 5631 enum res_type type = MEMFILE_TYPE(cft->private);
4576 5632
4577 BUG_ON(type != _OOM_TYPE); 5633 BUG_ON(type != _OOM_TYPE);
4578 5634
@@ -4631,12 +5687,33 @@ static int mem_cgroup_oom_control_write(struct cgroup *cgrp,
4631#ifdef CONFIG_MEMCG_KMEM 5687#ifdef CONFIG_MEMCG_KMEM
4632static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss) 5688static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
4633{ 5689{
5690 int ret;
5691
5692 memcg->kmemcg_id = -1;
5693 ret = memcg_propagate_kmem(memcg);
5694 if (ret)
5695 return ret;
5696
4634 return mem_cgroup_sockets_init(memcg, ss); 5697 return mem_cgroup_sockets_init(memcg, ss);
4635}; 5698};
4636 5699
4637static void kmem_cgroup_destroy(struct mem_cgroup *memcg) 5700static void kmem_cgroup_destroy(struct mem_cgroup *memcg)
4638{ 5701{
4639 mem_cgroup_sockets_destroy(memcg); 5702 mem_cgroup_sockets_destroy(memcg);
5703
5704 memcg_kmem_mark_dead(memcg);
5705
5706 if (res_counter_read_u64(&memcg->kmem, RES_USAGE) != 0)
5707 return;
5708
5709 /*
5710 * Charges already down to 0, undo mem_cgroup_get() done in the charge
5711 * path here, being careful not to race with memcg_uncharge_kmem: it is
5712 * possible that the charges went down to 0 between mark_dead and the
5713 * res_counter read, so in that case, we don't need the put
5714 */
5715 if (memcg_kmem_test_and_clear_dead(memcg))
5716 mem_cgroup_put(memcg);
4640} 5717}
4641#else 5718#else
4642static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss) 5719static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
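The comment block in kmem_cgroup_destroy() above describes a subtle ordering: mark the group dead first, then drop the extra reference only once the kmem usage has reached zero, on whichever path observes that last. Below is a minimal userspace sketch of that pattern, assuming C11 atomics; all names (fake_memcg, put_ref, destroy, uncharge) are invented for illustration and this is not the kernel implementation.

#include <stdatomic.h>
#include <stdio.h>

struct fake_memcg {
	atomic_long refcnt;   /* models memcg->refcnt              */
	atomic_long usage;    /* models the kmem res_counter usage */
	atomic_int  dead;     /* models the "marked dead" bit      */
};

static void put_ref(struct fake_memcg *m, const char *who)
{
	if (atomic_fetch_sub(&m->refcnt, 1) == 1)
		printf("%s drops the last reference\n", who);
}

/* destroy path: mark dead, but only put if the charges are already gone */
static void destroy(struct fake_memcg *m)
{
	atomic_store(&m->dead, 1);
	if (atomic_load(&m->usage) != 0)
		return;                        /* the final uncharge will put */
	if (atomic_exchange(&m->dead, 0))      /* test-and-clear: runs once   */
		put_ref(m, "destroy");
}

/* uncharge path: if usage hits zero and the group is dead, do the put */
static void uncharge(struct fake_memcg *m, long n)
{
	if (atomic_fetch_sub(&m->usage, n) == n &&
	    atomic_exchange(&m->dead, 0))
		put_ref(m, "uncharge");
}

int main(void)
{
	struct fake_memcg m = { 1, 2, 0 };

	destroy(&m);       /* usage is still 2: only marks the group dead */
	uncharge(&m, 2);   /* usage reaches 0: this side performs the put */
	return 0;
}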
@@ -4745,6 +5822,37 @@ static struct cftype mem_cgroup_files[] = {
4745 .read = mem_cgroup_read, 5822 .read = mem_cgroup_read,
4746 }, 5823 },
4747#endif 5824#endif
5825#ifdef CONFIG_MEMCG_KMEM
5826 {
5827 .name = "kmem.limit_in_bytes",
5828 .private = MEMFILE_PRIVATE(_KMEM, RES_LIMIT),
5829 .write_string = mem_cgroup_write,
5830 .read = mem_cgroup_read,
5831 },
5832 {
5833 .name = "kmem.usage_in_bytes",
5834 .private = MEMFILE_PRIVATE(_KMEM, RES_USAGE),
5835 .read = mem_cgroup_read,
5836 },
5837 {
5838 .name = "kmem.failcnt",
5839 .private = MEMFILE_PRIVATE(_KMEM, RES_FAILCNT),
5840 .trigger = mem_cgroup_reset,
5841 .read = mem_cgroup_read,
5842 },
5843 {
5844 .name = "kmem.max_usage_in_bytes",
5845 .private = MEMFILE_PRIVATE(_KMEM, RES_MAX_USAGE),
5846 .trigger = mem_cgroup_reset,
5847 .read = mem_cgroup_read,
5848 },
5849#ifdef CONFIG_SLABINFO
5850 {
5851 .name = "kmem.slabinfo",
5852 .read_seq_string = mem_cgroup_slabinfo_read,
5853 },
5854#endif
5855#endif
4748 { }, /* terminate */ 5856 { }, /* terminate */
4749}; 5857};
4750 5858
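The new kmem.* entries above reuse the same MEMFILE_PRIVATE(type, attr) encoding as the existing memory.* and memsw.* files, so one handler (mem_cgroup_read, mem_cgroup_write, mem_cgroup_reset) can serve all three res_counters. The macros themselves are not part of this hunk, so the packing below is an assumption sketched for illustration; only the idea of carrying both the counter type and the attribute in cftype->private matters.

#include <assert.h>

enum res_type { _MEM, _MEMSWAP, _OOM_TYPE, _KMEM };
enum { RES_USAGE, RES_LIMIT, RES_MAX_USAGE, RES_FAILCNT };

#define MEMFILE_PRIVATE(x, val)	((x) << 16 | (val))
#define MEMFILE_TYPE(val)	(((val) >> 16) & 0xffff)
#define MEMFILE_ATTR(val)	((val) & 0xffff)

int main(void)
{
	int priv = MEMFILE_PRIVATE(_KMEM, RES_FAILCNT);

	assert(MEMFILE_TYPE(priv) == _KMEM);
	assert(MEMFILE_ATTR(priv) == RES_FAILCNT);
	return 0;
}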
@@ -4812,16 +5920,29 @@ out_free:
4812} 5920}
4813 5921
4814/* 5922/*
4815 * Helpers for freeing a kmalloc()ed/vzalloc()ed mem_cgroup by RCU, 5923 * At destroying mem_cgroup, references from swap_cgroup can remain.
4816 * but in process context. The work_freeing structure is overlaid 5924 * (scanning all at force_empty is too costly...)
4817 * on the rcu_freeing structure, which itself is overlaid on memsw. 5925 *
5926 * Instead of clearing all references at force_empty, we remember
 5927 * the number of references from swap_cgroup and free mem_cgroup when
 5927 * the number of references from swap_cgroup and free mem_cgroup when
5928 * it goes down to 0.
5929 *
5930 * Removal of cgroup itself succeeds regardless of refs from swap.
4818 */ 5931 */
4819static void free_work(struct work_struct *work) 5932
5933static void __mem_cgroup_free(struct mem_cgroup *memcg)
4820{ 5934{
4821 struct mem_cgroup *memcg; 5935 int node;
4822 int size = sizeof(struct mem_cgroup); 5936 int size = sizeof(struct mem_cgroup);
4823 5937
4824 memcg = container_of(work, struct mem_cgroup, work_freeing); 5938 mem_cgroup_remove_from_trees(memcg);
5939 free_css_id(&mem_cgroup_subsys, &memcg->css);
5940
5941 for_each_node(node)
5942 free_mem_cgroup_per_zone_info(memcg, node);
5943
5944 free_percpu(memcg->stat);
5945
4825 /* 5946 /*
4826 * We need to make sure that (at least for now), the jump label 5947 * We need to make sure that (at least for now), the jump label
4827 * destruction code runs outside of the cgroup lock. This is because 5948 * destruction code runs outside of the cgroup lock. This is because
@@ -4833,45 +5954,34 @@ static void free_work(struct work_struct *work)
4833 * to move this code around, and make sure it is outside 5954 * to move this code around, and make sure it is outside
4834 * the cgroup_lock. 5955 * the cgroup_lock.
4835 */ 5956 */
4836 disarm_sock_keys(memcg); 5957 disarm_static_keys(memcg);
4837 if (size < PAGE_SIZE) 5958 if (size < PAGE_SIZE)
4838 kfree(memcg); 5959 kfree(memcg);
4839 else 5960 else
4840 vfree(memcg); 5961 vfree(memcg);
4841} 5962}
4842 5963
4843static void free_rcu(struct rcu_head *rcu_head)
4844{
4845 struct mem_cgroup *memcg;
4846
4847 memcg = container_of(rcu_head, struct mem_cgroup, rcu_freeing);
4848 INIT_WORK(&memcg->work_freeing, free_work);
4849 schedule_work(&memcg->work_freeing);
4850}
4851 5964
4852/* 5965/*
4853 * At destroying mem_cgroup, references from swap_cgroup can remain. 5966 * Helpers for freeing a kmalloc()ed/vzalloc()ed mem_cgroup by RCU,
4854 * (scanning all at force_empty is too costly...) 5967 * but in process context. The work_freeing structure is overlaid
4855 * 5968 * on the rcu_freeing structure, which itself is overlaid on memsw.
4856 * Instead of clearing all references at force_empty, we remember
4857 * the number of reference from swap_cgroup and free mem_cgroup when
4858 * it goes down to 0.
4859 *
4860 * Removal of cgroup itself succeeds regardless of refs from swap.
4861 */ 5969 */
4862 5970static void free_work(struct work_struct *work)
4863static void __mem_cgroup_free(struct mem_cgroup *memcg)
4864{ 5971{
4865 int node; 5972 struct mem_cgroup *memcg;
4866 5973
4867 mem_cgroup_remove_from_trees(memcg); 5974 memcg = container_of(work, struct mem_cgroup, work_freeing);
4868 free_css_id(&mem_cgroup_subsys, &memcg->css); 5975 __mem_cgroup_free(memcg);
5976}
4869 5977
4870 for_each_node(node) 5978static void free_rcu(struct rcu_head *rcu_head)
4871 free_mem_cgroup_per_zone_info(memcg, node); 5979{
5980 struct mem_cgroup *memcg;
4872 5981
4873 free_percpu(memcg->stat); 5982 memcg = container_of(rcu_head, struct mem_cgroup, rcu_freeing);
4874 call_rcu(&memcg->rcu_freeing, free_rcu); 5983 INIT_WORK(&memcg->work_freeing, free_work);
5984 schedule_work(&memcg->work_freeing);
4875} 5985}
4876 5986
4877static void mem_cgroup_get(struct mem_cgroup *memcg) 5987static void mem_cgroup_get(struct mem_cgroup *memcg)
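The reshuffle above splits freeing into two deferred stages: the last put queues free_rcu() via call_rcu(), and the RCU callback only schedules free_work(), so the heavy __mem_cgroup_free() (vfree, static-key teardown) runs later in process context. A toy userspace model of that hand-off, with the RCU grace period and the workqueue replaced by direct calls purely to show the ordering (all names here are invented):

#include <stdio.h>

struct obj { const char *name; };

static void obj_free(struct obj *o)        /* stands in for __mem_cgroup_free() */
{
	printf("freeing %s in process context\n", o->name);
}

static void free_work(struct obj *o)       /* worker body    */
{
	obj_free(o);
}

static void free_rcu(struct obj *o)        /* RCU callback   */
{
	free_work(o);      /* real code: schedule_work(), not a direct call  */
}

static void put(struct obj *o, int *refcnt)
{
	if (--*refcnt == 0)
		free_rcu(o);   /* real code: call_rcu(), after a grace period */
}

int main(void)
{
	struct obj o = { "memcg" };
	int refcnt = 1;

	put(&o, &refcnt);
	return 0;
}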
@@ -4883,7 +5993,7 @@ static void __mem_cgroup_put(struct mem_cgroup *memcg, int count)
4883{ 5993{
4884 if (atomic_sub_and_test(count, &memcg->refcnt)) { 5994 if (atomic_sub_and_test(count, &memcg->refcnt)) {
4885 struct mem_cgroup *parent = parent_mem_cgroup(memcg); 5995 struct mem_cgroup *parent = parent_mem_cgroup(memcg);
4886 __mem_cgroup_free(memcg); 5996 call_rcu(&memcg->rcu_freeing, free_rcu);
4887 if (parent) 5997 if (parent)
4888 mem_cgroup_put(parent); 5998 mem_cgroup_put(parent);
4889 } 5999 }
@@ -4953,7 +6063,7 @@ err_cleanup:
4953} 6063}
4954 6064
4955static struct cgroup_subsys_state * __ref 6065static struct cgroup_subsys_state * __ref
4956mem_cgroup_create(struct cgroup *cont) 6066mem_cgroup_css_alloc(struct cgroup *cont)
4957{ 6067{
4958 struct mem_cgroup *memcg, *parent; 6068 struct mem_cgroup *memcg, *parent;
4959 long error = -ENOMEM; 6069 long error = -ENOMEM;
@@ -4990,6 +6100,8 @@ mem_cgroup_create(struct cgroup *cont)
4990 if (parent && parent->use_hierarchy) { 6100 if (parent && parent->use_hierarchy) {
4991 res_counter_init(&memcg->res, &parent->res); 6101 res_counter_init(&memcg->res, &parent->res);
4992 res_counter_init(&memcg->memsw, &parent->memsw); 6102 res_counter_init(&memcg->memsw, &parent->memsw);
6103 res_counter_init(&memcg->kmem, &parent->kmem);
6104
4993 /* 6105 /*
4994 * We increment refcnt of the parent to ensure that we can 6106 * We increment refcnt of the parent to ensure that we can
4995 * safely access it on res_counter_charge/uncharge. 6107 * safely access it on res_counter_charge/uncharge.
@@ -5000,6 +6112,7 @@ mem_cgroup_create(struct cgroup *cont)
5000 } else { 6112 } else {
5001 res_counter_init(&memcg->res, NULL); 6113 res_counter_init(&memcg->res, NULL);
5002 res_counter_init(&memcg->memsw, NULL); 6114 res_counter_init(&memcg->memsw, NULL);
6115 res_counter_init(&memcg->kmem, NULL);
5003 /* 6116 /*
 5004 * Deeper hierarchy with use_hierarchy == false doesn't make 6117 * Deeper hierarchy with use_hierarchy == false doesn't make
5005 * much sense so let cgroup subsystem know about this 6118 * much sense so let cgroup subsystem know about this
@@ -5034,14 +6147,15 @@ free_out:
5034 return ERR_PTR(error); 6147 return ERR_PTR(error);
5035} 6148}
5036 6149
5037static int mem_cgroup_pre_destroy(struct cgroup *cont) 6150static void mem_cgroup_css_offline(struct cgroup *cont)
5038{ 6151{
5039 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); 6152 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
5040 6153
5041 return mem_cgroup_force_empty(memcg, false); 6154 mem_cgroup_reparent_charges(memcg);
6155 mem_cgroup_destroy_all_caches(memcg);
5042} 6156}
5043 6157
5044static void mem_cgroup_destroy(struct cgroup *cont) 6158static void mem_cgroup_css_free(struct cgroup *cont)
5045{ 6159{
5046 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); 6160 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
5047 6161
@@ -5631,16 +6745,15 @@ static void mem_cgroup_move_task(struct cgroup *cont,
5631struct cgroup_subsys mem_cgroup_subsys = { 6745struct cgroup_subsys mem_cgroup_subsys = {
5632 .name = "memory", 6746 .name = "memory",
5633 .subsys_id = mem_cgroup_subsys_id, 6747 .subsys_id = mem_cgroup_subsys_id,
5634 .create = mem_cgroup_create, 6748 .css_alloc = mem_cgroup_css_alloc,
5635 .pre_destroy = mem_cgroup_pre_destroy, 6749 .css_offline = mem_cgroup_css_offline,
5636 .destroy = mem_cgroup_destroy, 6750 .css_free = mem_cgroup_css_free,
5637 .can_attach = mem_cgroup_can_attach, 6751 .can_attach = mem_cgroup_can_attach,
5638 .cancel_attach = mem_cgroup_cancel_attach, 6752 .cancel_attach = mem_cgroup_cancel_attach,
5639 .attach = mem_cgroup_move_task, 6753 .attach = mem_cgroup_move_task,
5640 .base_cftypes = mem_cgroup_files, 6754 .base_cftypes = mem_cgroup_files,
5641 .early_init = 0, 6755 .early_init = 0,
5642 .use_id = 1, 6756 .use_id = 1,
5643 .__DEPRECATED_clear_css_refs = true,
5644}; 6757};
5645 6758
5646#ifdef CONFIG_MEMCG_SWAP 6759#ifdef CONFIG_MEMCG_SWAP
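The subsys registration above follows the cgroup core rename from create/pre_destroy/destroy to css_alloc/css_offline/css_free, with the reparenting of charges moved into the offline step. A small sketch of that lifecycle with function pointers; the driver and names below are invented to show only the call ordering, not the cgroup core API.

#include <stdio.h>

struct subsys_ops {
	void *(*css_alloc)(void);
	void  (*css_offline)(void *css);
	void  (*css_free)(void *css);
};

static int demo_state;

static void *demo_alloc(void)      { puts("css_alloc");   return &demo_state; }
static void  demo_offline(void *c) { (void)c; puts("css_offline: reparent charges"); }
static void  demo_free(void *c)    { (void)c; puts("css_free"); }

static void lifecycle(const struct subsys_ops *ops)
{
	void *css = ops->css_alloc();

	ops->css_offline(css);   /* group is being removed             */
	ops->css_free(css);      /* last reference gone: release state */
}

int main(void)
{
	const struct subsys_ops memory_like = {
		.css_alloc   = demo_alloc,
		.css_offline = demo_offline,
		.css_free    = demo_free,
	};

	lifecycle(&memory_like);
	return 0;
}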
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 8b20278be6a6..c6e4dd3e1c08 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -402,7 +402,7 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill,
402 struct anon_vma *av; 402 struct anon_vma *av;
403 pgoff_t pgoff; 403 pgoff_t pgoff;
404 404
405 av = page_lock_anon_vma(page); 405 av = page_lock_anon_vma_read(page);
406 if (av == NULL) /* Not actually mapped anymore */ 406 if (av == NULL) /* Not actually mapped anymore */
407 return; 407 return;
408 408
@@ -423,7 +423,7 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill,
423 } 423 }
424 } 424 }
425 read_unlock(&tasklist_lock); 425 read_unlock(&tasklist_lock);
426 page_unlock_anon_vma(av); 426 page_unlock_anon_vma_read(av);
427} 427}
428 428
429/* 429/*
@@ -781,16 +781,16 @@ static struct page_state {
781 { compound, compound, "huge", me_huge_page }, 781 { compound, compound, "huge", me_huge_page },
782#endif 782#endif
783 783
784 { sc|dirty, sc|dirty, "swapcache", me_swapcache_dirty }, 784 { sc|dirty, sc|dirty, "dirty swapcache", me_swapcache_dirty },
785 { sc|dirty, sc, "swapcache", me_swapcache_clean }, 785 { sc|dirty, sc, "clean swapcache", me_swapcache_clean },
786 786
787 { unevict|dirty, unevict|dirty, "unevictable LRU", me_pagecache_dirty}, 787 { unevict|dirty, unevict|dirty, "dirty unevictable LRU", me_pagecache_dirty },
788 { unevict, unevict, "unevictable LRU", me_pagecache_clean}, 788 { unevict, unevict, "clean unevictable LRU", me_pagecache_clean },
789 789
790 { mlock|dirty, mlock|dirty, "mlocked LRU", me_pagecache_dirty }, 790 { mlock|dirty, mlock|dirty, "dirty mlocked LRU", me_pagecache_dirty },
791 { mlock, mlock, "mlocked LRU", me_pagecache_clean }, 791 { mlock, mlock, "clean mlocked LRU", me_pagecache_clean },
792 792
793 { lru|dirty, lru|dirty, "LRU", me_pagecache_dirty }, 793 { lru|dirty, lru|dirty, "dirty LRU", me_pagecache_dirty },
794 { lru|dirty, lru, "clean LRU", me_pagecache_clean }, 794 { lru|dirty, lru, "clean LRU", me_pagecache_clean },
795 795
796 /* 796 /*
@@ -812,14 +812,14 @@ static struct page_state {
812#undef slab 812#undef slab
813#undef reserved 813#undef reserved
814 814
815/*
816 * "Dirty/Clean" indication is not 100% accurate due to the possibility of
817 * setting PG_dirty outside page lock. See also comment above set_page_dirty().
818 */
815static void action_result(unsigned long pfn, char *msg, int result) 819static void action_result(unsigned long pfn, char *msg, int result)
816{ 820{
817 struct page *page = pfn_to_page(pfn); 821 pr_err("MCE %#lx: %s page recovery: %s\n",
818 822 pfn, msg, action_name[result]);
819 printk(KERN_ERR "MCE %#lx: %s%s page recovery: %s\n",
820 pfn,
821 PageDirty(page) ? "dirty " : "",
822 msg, action_name[result]);
823} 823}
824 824
825static int page_action(struct page_state *ps, struct page *p, 825static int page_action(struct page_state *ps, struct page *p,
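The renamed table entries above ("dirty swapcache", "clean mlocked LRU", ...) are matched the same way as before: an entry applies when the page's flags masked by ps->mask equal ps->res, and the first match supplies the message that action_result() prints. A minimal sketch of that matching, with made-up flag bits and a trimmed table:

#include <assert.h>
#include <string.h>

#define PG_DIRTY     0x1UL
#define PG_SWAPCACHE 0x2UL

struct page_state_model {
	unsigned long mask, res;
	const char *msg;
};

static const struct page_state_model states[] = {
	{ PG_SWAPCACHE | PG_DIRTY, PG_SWAPCACHE | PG_DIRTY, "dirty swapcache" },
	{ PG_SWAPCACHE | PG_DIRTY, PG_SWAPCACHE,            "clean swapcache" },
	{ 0, 0, "unknown page state" },   /* catch-all, like the real table */
};

static const char *classify(unsigned long flags)
{
	const struct page_state_model *ps;

	for (ps = states; ; ps++)
		if ((flags & ps->mask) == ps->res)
			return ps->msg;
}

int main(void)
{
	assert(!strcmp(classify(PG_SWAPCACHE | PG_DIRTY), "dirty swapcache"));
	assert(!strcmp(classify(PG_SWAPCACHE), "clean swapcache"));
	return 0;
}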
@@ -1385,7 +1385,7 @@ static int get_any_page(struct page *p, unsigned long pfn, int flags)
1385 * Isolate the page, so that it doesn't get reallocated if it 1385 * Isolate the page, so that it doesn't get reallocated if it
1386 * was free. 1386 * was free.
1387 */ 1387 */
1388 set_migratetype_isolate(p); 1388 set_migratetype_isolate(p, true);
1389 /* 1389 /*
1390 * When the target page is a free hugepage, just remove it 1390 * When the target page is a free hugepage, just remove it
1391 * from free hugepage list. 1391 * from free hugepage list.
@@ -1566,7 +1566,8 @@ int soft_offline_page(struct page *page, int flags)
1566 page_is_file_cache(page)); 1566 page_is_file_cache(page));
1567 list_add(&page->lru, &pagelist); 1567 list_add(&page->lru, &pagelist);
1568 ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 1568 ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL,
1569 false, MIGRATE_SYNC); 1569 false, MIGRATE_SYNC,
1570 MR_MEMORY_FAILURE);
1570 if (ret) { 1571 if (ret) {
1571 putback_lru_pages(&pagelist); 1572 putback_lru_pages(&pagelist);
1572 pr_info("soft offline: %#lx: migration failed %d, type %lx\n", 1573 pr_info("soft offline: %#lx: migration failed %d, type %lx\n",
diff --git a/mm/memory.c b/mm/memory.c
index 221fc9ffcab1..e0a9b0ce4f10 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -57,6 +57,8 @@
57#include <linux/swapops.h> 57#include <linux/swapops.h>
58#include <linux/elf.h> 58#include <linux/elf.h>
59#include <linux/gfp.h> 59#include <linux/gfp.h>
60#include <linux/migrate.h>
61#include <linux/string.h>
60 62
61#include <asm/io.h> 63#include <asm/io.h>
62#include <asm/pgalloc.h> 64#include <asm/pgalloc.h>
@@ -717,20 +719,6 @@ static inline bool is_cow_mapping(vm_flags_t flags)
717 return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; 719 return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
718} 720}
719 721
720#ifndef is_zero_pfn
721static inline int is_zero_pfn(unsigned long pfn)
722{
723 return pfn == zero_pfn;
724}
725#endif
726
727#ifndef my_zero_pfn
728static inline unsigned long my_zero_pfn(unsigned long addr)
729{
730 return zero_pfn;
731}
732#endif
733
734/* 722/*
735 * vm_normal_page -- This function gets the "struct page" associated with a pte. 723 * vm_normal_page -- This function gets the "struct page" associated with a pte.
736 * 724 *
@@ -1250,7 +1238,7 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
1250 BUG(); 1238 BUG();
1251 } 1239 }
1252#endif 1240#endif
1253 split_huge_page_pmd(vma->vm_mm, pmd); 1241 split_huge_page_pmd(vma, addr, pmd);
1254 } else if (zap_huge_pmd(tlb, vma, pmd, addr)) 1242 } else if (zap_huge_pmd(tlb, vma, pmd, addr))
1255 goto next; 1243 goto next;
1256 /* fall through */ 1244 /* fall through */
@@ -1517,9 +1505,11 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
1517 page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE); 1505 page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE);
1518 goto out; 1506 goto out;
1519 } 1507 }
1508 if ((flags & FOLL_NUMA) && pmd_numa(*pmd))
1509 goto no_page_table;
1520 if (pmd_trans_huge(*pmd)) { 1510 if (pmd_trans_huge(*pmd)) {
1521 if (flags & FOLL_SPLIT) { 1511 if (flags & FOLL_SPLIT) {
1522 split_huge_page_pmd(mm, pmd); 1512 split_huge_page_pmd(vma, address, pmd);
1523 goto split_fallthrough; 1513 goto split_fallthrough;
1524 } 1514 }
1525 spin_lock(&mm->page_table_lock); 1515 spin_lock(&mm->page_table_lock);
@@ -1546,6 +1536,8 @@ split_fallthrough:
1546 pte = *ptep; 1536 pte = *ptep;
1547 if (!pte_present(pte)) 1537 if (!pte_present(pte))
1548 goto no_page; 1538 goto no_page;
1539 if ((flags & FOLL_NUMA) && pte_numa(pte))
1540 goto no_page;
1549 if ((flags & FOLL_WRITE) && !pte_write(pte)) 1541 if ((flags & FOLL_WRITE) && !pte_write(pte))
1550 goto unlock; 1542 goto unlock;
1551 1543
@@ -1697,6 +1689,19 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1697 (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD); 1689 (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD);
1698 vm_flags &= (gup_flags & FOLL_FORCE) ? 1690 vm_flags &= (gup_flags & FOLL_FORCE) ?
1699 (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE); 1691 (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE);
1692
1693 /*
1694 * If FOLL_FORCE and FOLL_NUMA are both set, handle_mm_fault
1695 * would be called on PROT_NONE ranges. We must never invoke
1696 * handle_mm_fault on PROT_NONE ranges or the NUMA hinting
1697 * page faults would unprotect the PROT_NONE ranges if
1698 * _PAGE_NUMA and _PAGE_PROTNONE are sharing the same pte/pmd
1699 * bitflag. So to avoid that, don't set FOLL_NUMA if
1700 * FOLL_FORCE is set.
1701 */
1702 if (!(gup_flags & FOLL_FORCE))
1703 gup_flags |= FOLL_NUMA;
1704
1700 i = 0; 1705 i = 0;
1701 1706
1702 do { 1707 do {
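The comment above explains why FOLL_NUMA must stay clear whenever FOLL_FORCE is set: forcing a fault on a PROT_NONE range would strip the hinting state when _PAGE_NUMA and _PAGE_PROTNONE share a bit. A minimal model of just that flag rule (the flag values below are made up; only the relationship matters):

#include <assert.h>

#define FOLL_FORCE 0x01u
#define FOLL_NUMA  0x02u

static unsigned int adjust_gup_flags(unsigned int gup_flags)
{
	if (!(gup_flags & FOLL_FORCE))
		gup_flags |= FOLL_NUMA;   /* default: skip pte_numa()/pmd_numa() entries */
	return gup_flags;
}

int main(void)
{
	assert(adjust_gup_flags(0) & FOLL_NUMA);
	assert(!(adjust_gup_flags(FOLL_FORCE) & FOLL_NUMA));
	return 0;
}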
@@ -2794,13 +2799,8 @@ unlock:
2794oom_free_new: 2799oom_free_new:
2795 page_cache_release(new_page); 2800 page_cache_release(new_page);
2796oom: 2801oom:
2797 if (old_page) { 2802 if (old_page)
2798 if (page_mkwrite) {
2799 unlock_page(old_page);
2800 page_cache_release(old_page);
2801 }
2802 page_cache_release(old_page); 2803 page_cache_release(old_page);
2803 }
2804 return VM_FAULT_OOM; 2804 return VM_FAULT_OOM;
2805 2805
2806unwritable_page: 2806unwritable_page:
@@ -3431,6 +3431,170 @@ static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3431 return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte); 3431 return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
3432} 3432}
3433 3433
3434int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
3435 unsigned long addr, int current_nid)
3436{
3437 get_page(page);
3438
3439 count_vm_numa_event(NUMA_HINT_FAULTS);
3440 if (current_nid == numa_node_id())
3441 count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL);
3442
3443 return mpol_misplaced(page, vma, addr);
3444}
3445
3446int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
3447 unsigned long addr, pte_t pte, pte_t *ptep, pmd_t *pmd)
3448{
3449 struct page *page = NULL;
3450 spinlock_t *ptl;
3451 int current_nid = -1;
3452 int target_nid;
3453 bool migrated = false;
3454
3455 /*
3456 * The "pte" at this point cannot be used safely without
3457 * validation through pte_unmap_same(). It's of NUMA type but
3458 * the pfn may be screwed if the read is non atomic.
3459 *
3460 * ptep_modify_prot_start is not called as this is clearing
3461 * the _PAGE_NUMA bit and it is not really expected that there
3462 * would be concurrent hardware modifications to the PTE.
3463 */
3464 ptl = pte_lockptr(mm, pmd);
3465 spin_lock(ptl);
3466 if (unlikely(!pte_same(*ptep, pte))) {
3467 pte_unmap_unlock(ptep, ptl);
3468 goto out;
3469 }
3470
3471 pte = pte_mknonnuma(pte);
3472 set_pte_at(mm, addr, ptep, pte);
3473 update_mmu_cache(vma, addr, ptep);
3474
3475 page = vm_normal_page(vma, addr, pte);
3476 if (!page) {
3477 pte_unmap_unlock(ptep, ptl);
3478 return 0;
3479 }
3480
3481 current_nid = page_to_nid(page);
3482 target_nid = numa_migrate_prep(page, vma, addr, current_nid);
3483 pte_unmap_unlock(ptep, ptl);
3484 if (target_nid == -1) {
3485 /*
 3486 * Account for the fault against the current node if it is not
3487 * being replaced regardless of where the page is located.
3488 */
3489 current_nid = numa_node_id();
3490 put_page(page);
3491 goto out;
3492 }
3493
3494 /* Migrate to the requested node */
3495 migrated = migrate_misplaced_page(page, target_nid);
3496 if (migrated)
3497 current_nid = target_nid;
3498
3499out:
3500 if (current_nid != -1)
3501 task_numa_fault(current_nid, 1, migrated);
3502 return 0;
3503}
3504
3505/* NUMA hinting page fault entry point for regular pmds */
3506#ifdef CONFIG_NUMA_BALANCING
3507static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
3508 unsigned long addr, pmd_t *pmdp)
3509{
3510 pmd_t pmd;
3511 pte_t *pte, *orig_pte;
3512 unsigned long _addr = addr & PMD_MASK;
3513 unsigned long offset;
3514 spinlock_t *ptl;
3515 bool numa = false;
3516 int local_nid = numa_node_id();
3517
3518 spin_lock(&mm->page_table_lock);
3519 pmd = *pmdp;
3520 if (pmd_numa(pmd)) {
3521 set_pmd_at(mm, _addr, pmdp, pmd_mknonnuma(pmd));
3522 numa = true;
3523 }
3524 spin_unlock(&mm->page_table_lock);
3525
3526 if (!numa)
3527 return 0;
3528
3529 /* we're in a page fault so some vma must be in the range */
3530 BUG_ON(!vma);
3531 BUG_ON(vma->vm_start >= _addr + PMD_SIZE);
3532 offset = max(_addr, vma->vm_start) & ~PMD_MASK;
3533 VM_BUG_ON(offset >= PMD_SIZE);
3534 orig_pte = pte = pte_offset_map_lock(mm, pmdp, _addr, &ptl);
3535 pte += offset >> PAGE_SHIFT;
3536 for (addr = _addr + offset; addr < _addr + PMD_SIZE; pte++, addr += PAGE_SIZE) {
3537 pte_t pteval = *pte;
3538 struct page *page;
3539 int curr_nid = local_nid;
3540 int target_nid;
3541 bool migrated;
3542 if (!pte_present(pteval))
3543 continue;
3544 if (!pte_numa(pteval))
3545 continue;
3546 if (addr >= vma->vm_end) {
3547 vma = find_vma(mm, addr);
3548 /* there's a pte present so there must be a vma */
3549 BUG_ON(!vma);
3550 BUG_ON(addr < vma->vm_start);
3551 }
3552 if (pte_numa(pteval)) {
3553 pteval = pte_mknonnuma(pteval);
3554 set_pte_at(mm, addr, pte, pteval);
3555 }
3556 page = vm_normal_page(vma, addr, pteval);
3557 if (unlikely(!page))
3558 continue;
3559 /* only check non-shared pages */
3560 if (unlikely(page_mapcount(page) != 1))
3561 continue;
3562
3563 /*
3564 * Note that the NUMA fault is later accounted to either
3565 * the node that is currently running or where the page is
3566 * migrated to.
3567 */
3568 curr_nid = local_nid;
3569 target_nid = numa_migrate_prep(page, vma, addr,
3570 page_to_nid(page));
3571 if (target_nid == -1) {
3572 put_page(page);
3573 continue;
3574 }
3575
3576 /* Migrate to the requested node */
3577 pte_unmap_unlock(pte, ptl);
3578 migrated = migrate_misplaced_page(page, target_nid);
3579 if (migrated)
3580 curr_nid = target_nid;
3581 task_numa_fault(curr_nid, 1, migrated);
3582
3583 pte = pte_offset_map_lock(mm, pmdp, addr, &ptl);
3584 }
3585 pte_unmap_unlock(orig_pte, ptl);
3586
3587 return 0;
3588}
3589#else
3590static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
3591 unsigned long addr, pmd_t *pmdp)
3592{
3593 BUG();
3594 return 0;
3595}
3596#endif /* CONFIG_NUMA_BALANCING */
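Both NUMA-hinting fault handlers above end with the same accounting decision: if no better node is suggested the fault is charged locally, and a migration target is only credited when the migration actually succeeded (otherwise do_numa_page() keeps charging the page's current node). A tiny model of that decision, with invented names:

#include <assert.h>

static int account_nid(int page_nid, int local_nid, int target_nid, int migrated)
{
	if (target_nid == -1)              /* no better placement suggested */
		return local_nid;
	return migrated ? target_nid : page_nid;
}

int main(void)
{
	assert(account_nid(3, 0, -1, 0) == 0);  /* page stays: charge local node */
	assert(account_nid(3, 0,  1, 1) == 1);  /* migrated: charge the target   */
	assert(account_nid(3, 0,  1, 0) == 3);  /* migration failed: page's node */
	return 0;
}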
3597
3434/* 3598/*
3435 * These routines also need to handle stuff like marking pages dirty 3599 * These routines also need to handle stuff like marking pages dirty
3436 * and/or accessed for architectures that don't do it in hardware (most 3600 * and/or accessed for architectures that don't do it in hardware (most
@@ -3469,6 +3633,9 @@ int handle_pte_fault(struct mm_struct *mm,
3469 pte, pmd, flags, entry); 3633 pte, pmd, flags, entry);
3470 } 3634 }
3471 3635
3636 if (pte_numa(entry))
3637 return do_numa_page(mm, vma, address, entry, pte, pmd);
3638
3472 ptl = pte_lockptr(mm, pmd); 3639 ptl = pte_lockptr(mm, pmd);
3473 spin_lock(ptl); 3640 spin_lock(ptl);
3474 if (unlikely(!pte_same(*pte, entry))) 3641 if (unlikely(!pte_same(*pte, entry)))
@@ -3537,9 +3704,13 @@ retry:
3537 3704
3538 barrier(); 3705 barrier();
3539 if (pmd_trans_huge(orig_pmd)) { 3706 if (pmd_trans_huge(orig_pmd)) {
3540 if (flags & FAULT_FLAG_WRITE && 3707 unsigned int dirty = flags & FAULT_FLAG_WRITE;
3541 !pmd_write(orig_pmd) && 3708
3542 !pmd_trans_splitting(orig_pmd)) { 3709 if (pmd_numa(orig_pmd))
3710 return do_huge_pmd_numa_page(mm, vma, address,
3711 orig_pmd, pmd);
3712
3713 if (dirty && !pmd_write(orig_pmd)) {
3543 ret = do_huge_pmd_wp_page(mm, vma, address, pmd, 3714 ret = do_huge_pmd_wp_page(mm, vma, address, pmd,
3544 orig_pmd); 3715 orig_pmd);
3545 /* 3716 /*
@@ -3550,17 +3721,25 @@ retry:
3550 if (unlikely(ret & VM_FAULT_OOM)) 3721 if (unlikely(ret & VM_FAULT_OOM))
3551 goto retry; 3722 goto retry;
3552 return ret; 3723 return ret;
3724 } else {
3725 huge_pmd_set_accessed(mm, vma, address, pmd,
3726 orig_pmd, dirty);
3553 } 3727 }
3728
3554 return 0; 3729 return 0;
3555 } 3730 }
3556 } 3731 }
3557 3732
3733 if (pmd_numa(*pmd))
3734 return do_pmd_numa_page(mm, vma, address, pmd);
3735
3558 /* 3736 /*
3559 * Use __pte_alloc instead of pte_alloc_map, because we can't 3737 * Use __pte_alloc instead of pte_alloc_map, because we can't
 3559 * run pte_offset_map on the pmd, if a huge pmd could 3738 * run pte_offset_map on the pmd, if a huge pmd could
3561 * materialize from under us from a different thread. 3739 * materialize from under us from a different thread.
3562 */ 3740 */
3563 if (unlikely(pmd_none(*pmd)) && __pte_alloc(mm, vma, pmd, address)) 3741 if (unlikely(pmd_none(*pmd)) &&
3742 unlikely(__pte_alloc(mm, vma, pmd, address)))
3564 return VM_FAULT_OOM; 3743 return VM_FAULT_OOM;
 3565 /* if a huge pmd materialized from under us just retry later */ 3744 /* if a huge pmd materialized from under us just retry later */
3566 if (unlikely(pmd_trans_huge(*pmd))) 3745 if (unlikely(pmd_trans_huge(*pmd)))
@@ -3940,15 +4119,12 @@ void print_vma_addr(char *prefix, unsigned long ip)
3940 struct file *f = vma->vm_file; 4119 struct file *f = vma->vm_file;
3941 char *buf = (char *)__get_free_page(GFP_KERNEL); 4120 char *buf = (char *)__get_free_page(GFP_KERNEL);
3942 if (buf) { 4121 if (buf) {
3943 char *p, *s; 4122 char *p;
3944 4123
3945 p = d_path(&f->f_path, buf, PAGE_SIZE); 4124 p = d_path(&f->f_path, buf, PAGE_SIZE);
3946 if (IS_ERR(p)) 4125 if (IS_ERR(p))
3947 p = "?"; 4126 p = "?";
3948 s = strrchr(p, '/'); 4127 printk("%s%s[%lx+%lx]", prefix, kbasename(p),
3949 if (s)
3950 p = s+1;
3951 printk("%s%s[%lx+%lx]", prefix, p,
3952 vma->vm_start, 4128 vma->vm_start,
3953 vma->vm_end - vma->vm_start); 4129 vma->vm_end - vma->vm_start);
3954 free_page((unsigned long)buf); 4130 free_page((unsigned long)buf);
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index e4eeacae2b91..d04ed87bfacb 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -106,6 +106,7 @@ static void get_page_bootmem(unsigned long info, struct page *page,
106void __ref put_page_bootmem(struct page *page) 106void __ref put_page_bootmem(struct page *page)
107{ 107{
108 unsigned long type; 108 unsigned long type;
109 static DEFINE_MUTEX(ppb_lock);
109 110
110 type = (unsigned long) page->lru.next; 111 type = (unsigned long) page->lru.next;
111 BUG_ON(type < MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE || 112 BUG_ON(type < MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE ||
@@ -115,7 +116,14 @@ void __ref put_page_bootmem(struct page *page)
115 ClearPagePrivate(page); 116 ClearPagePrivate(page);
116 set_page_private(page, 0); 117 set_page_private(page, 0);
117 INIT_LIST_HEAD(&page->lru); 118 INIT_LIST_HEAD(&page->lru);
119
120 /*
121 * Please refer to comment for __free_pages_bootmem()
122 * for why we serialize here.
123 */
124 mutex_lock(&ppb_lock);
118 __free_pages_bootmem(page, 0); 125 __free_pages_bootmem(page, 0);
126 mutex_unlock(&ppb_lock);
119 } 127 }
120 128
121} 129}
@@ -205,7 +213,7 @@ static void grow_zone_span(struct zone *zone, unsigned long start_pfn,
205 zone_span_writelock(zone); 213 zone_span_writelock(zone);
206 214
207 old_zone_end_pfn = zone->zone_start_pfn + zone->spanned_pages; 215 old_zone_end_pfn = zone->zone_start_pfn + zone->spanned_pages;
208 if (start_pfn < zone->zone_start_pfn) 216 if (!zone->spanned_pages || start_pfn < zone->zone_start_pfn)
209 zone->zone_start_pfn = start_pfn; 217 zone->zone_start_pfn = start_pfn;
210 218
211 zone->spanned_pages = max(old_zone_end_pfn, end_pfn) - 219 zone->spanned_pages = max(old_zone_end_pfn, end_pfn) -
@@ -214,13 +222,134 @@ static void grow_zone_span(struct zone *zone, unsigned long start_pfn,
214 zone_span_writeunlock(zone); 222 zone_span_writeunlock(zone);
215} 223}
216 224
225static void resize_zone(struct zone *zone, unsigned long start_pfn,
226 unsigned long end_pfn)
227{
228 zone_span_writelock(zone);
229
230 if (end_pfn - start_pfn) {
231 zone->zone_start_pfn = start_pfn;
232 zone->spanned_pages = end_pfn - start_pfn;
233 } else {
234 /*
 235 * keep it consistent with free_area_init_core():
236 * if spanned_pages = 0, then keep start_pfn = 0
237 */
238 zone->zone_start_pfn = 0;
239 zone->spanned_pages = 0;
240 }
241
242 zone_span_writeunlock(zone);
243}
244
245static void fix_zone_id(struct zone *zone, unsigned long start_pfn,
246 unsigned long end_pfn)
247{
248 enum zone_type zid = zone_idx(zone);
249 int nid = zone->zone_pgdat->node_id;
250 unsigned long pfn;
251
252 for (pfn = start_pfn; pfn < end_pfn; pfn++)
253 set_page_links(pfn_to_page(pfn), zid, nid, pfn);
254}
255
256static int __meminit move_pfn_range_left(struct zone *z1, struct zone *z2,
257 unsigned long start_pfn, unsigned long end_pfn)
258{
259 int ret;
260 unsigned long flags;
261 unsigned long z1_start_pfn;
262
263 if (!z1->wait_table) {
264 ret = init_currently_empty_zone(z1, start_pfn,
265 end_pfn - start_pfn, MEMMAP_HOTPLUG);
266 if (ret)
267 return ret;
268 }
269
270 pgdat_resize_lock(z1->zone_pgdat, &flags);
271
272 /* can't move pfns which are higher than @z2 */
273 if (end_pfn > z2->zone_start_pfn + z2->spanned_pages)
274 goto out_fail;
 275 /* the moved-out part must be at the leftmost of @z2 */
276 if (start_pfn > z2->zone_start_pfn)
277 goto out_fail;
 278 /* must include/overlap */
279 if (end_pfn <= z2->zone_start_pfn)
280 goto out_fail;
281
282 /* use start_pfn for z1's start_pfn if z1 is empty */
283 if (z1->spanned_pages)
284 z1_start_pfn = z1->zone_start_pfn;
285 else
286 z1_start_pfn = start_pfn;
287
288 resize_zone(z1, z1_start_pfn, end_pfn);
289 resize_zone(z2, end_pfn, z2->zone_start_pfn + z2->spanned_pages);
290
291 pgdat_resize_unlock(z1->zone_pgdat, &flags);
292
293 fix_zone_id(z1, start_pfn, end_pfn);
294
295 return 0;
296out_fail:
297 pgdat_resize_unlock(z1->zone_pgdat, &flags);
298 return -1;
299}
300
301static int __meminit move_pfn_range_right(struct zone *z1, struct zone *z2,
302 unsigned long start_pfn, unsigned long end_pfn)
303{
304 int ret;
305 unsigned long flags;
306 unsigned long z2_end_pfn;
307
308 if (!z2->wait_table) {
309 ret = init_currently_empty_zone(z2, start_pfn,
310 end_pfn - start_pfn, MEMMAP_HOTPLUG);
311 if (ret)
312 return ret;
313 }
314
315 pgdat_resize_lock(z1->zone_pgdat, &flags);
316
317 /* can't move pfns which are lower than @z1 */
318 if (z1->zone_start_pfn > start_pfn)
319 goto out_fail;
 320 /* the moved-out part must be at the rightmost of @z1 */
321 if (z1->zone_start_pfn + z1->spanned_pages > end_pfn)
322 goto out_fail;
 323 /* must include/overlap */
324 if (start_pfn >= z1->zone_start_pfn + z1->spanned_pages)
325 goto out_fail;
326
327 /* use end_pfn for z2's end_pfn if z2 is empty */
328 if (z2->spanned_pages)
329 z2_end_pfn = z2->zone_start_pfn + z2->spanned_pages;
330 else
331 z2_end_pfn = end_pfn;
332
333 resize_zone(z1, z1->zone_start_pfn, start_pfn);
334 resize_zone(z2, start_pfn, z2_end_pfn);
335
336 pgdat_resize_unlock(z1->zone_pgdat, &flags);
337
338 fix_zone_id(z2, start_pfn, end_pfn);
339
340 return 0;
341out_fail:
342 pgdat_resize_unlock(z1->zone_pgdat, &flags);
343 return -1;
344}
345
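move_pfn_range_left()/move_pfn_range_right() above only shuffle zone boundaries; the pages themselves stay where they are. A userspace sketch of that bookkeeping, where a "zone" is just a (start, spanned) pair and the empty-zone convention of resize_zone() (start = 0, spanned = 0) is kept; all names are invented and the locking and wait_table setup are left out.

#include <assert.h>

struct zone_span { unsigned long start, spanned; };

static void resize(struct zone_span *z, unsigned long start, unsigned long end)
{
	if (end - start) {
		z->start = start;
		z->spanned = end - start;
	} else {
		z->start = 0;      /* keep the empty-zone convention */
		z->spanned = 0;
	}
}

/* move [start, end) out of the left edge of z2 and into z1 */
static void move_left(struct zone_span *z1, struct zone_span *z2,
		      unsigned long start, unsigned long end)
{
	unsigned long z1_start = z1->spanned ? z1->start : start;

	resize(z1, z1_start, end);
	resize(z2, end, z2->start + z2->spanned);
}

int main(void)
{
	struct zone_span normal  = { 0, 0 };        /* empty lower zone */
	struct zone_span movable = { 1000, 512 };   /* pfns 1000..1511  */

	move_left(&normal, &movable, 1000, 1256);
	assert(normal.start == 1000 && normal.spanned == 256);
	assert(movable.start == 1256 && movable.spanned == 256);
	return 0;
}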
217static void grow_pgdat_span(struct pglist_data *pgdat, unsigned long start_pfn, 346static void grow_pgdat_span(struct pglist_data *pgdat, unsigned long start_pfn,
218 unsigned long end_pfn) 347 unsigned long end_pfn)
219{ 348{
220 unsigned long old_pgdat_end_pfn = 349 unsigned long old_pgdat_end_pfn =
221 pgdat->node_start_pfn + pgdat->node_spanned_pages; 350 pgdat->node_start_pfn + pgdat->node_spanned_pages;
222 351
223 if (start_pfn < pgdat->node_start_pfn) 352 if (!pgdat->node_spanned_pages || start_pfn < pgdat->node_start_pfn)
224 pgdat->node_start_pfn = start_pfn; 353 pgdat->node_start_pfn = start_pfn;
225 354
226 pgdat->node_spanned_pages = max(old_pgdat_end_pfn, end_pfn) - 355 pgdat->node_spanned_pages = max(old_pgdat_end_pfn, end_pfn) -
@@ -460,8 +589,99 @@ static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages,
460 return 0; 589 return 0;
461} 590}
462 591
592#ifdef CONFIG_MOVABLE_NODE
593/*
594 * When CONFIG_MOVABLE_NODE, we permit onlining of a node which doesn't have
595 * normal memory.
596 */
597static bool can_online_high_movable(struct zone *zone)
598{
599 return true;
600}
601#else /* CONFIG_MOVABLE_NODE */
602/* ensure every online node has NORMAL memory */
603static bool can_online_high_movable(struct zone *zone)
604{
605 return node_state(zone_to_nid(zone), N_NORMAL_MEMORY);
606}
607#endif /* CONFIG_MOVABLE_NODE */
463 608
464int __ref online_pages(unsigned long pfn, unsigned long nr_pages) 609/* check which state of node_states will be changed when online memory */
610static void node_states_check_changes_online(unsigned long nr_pages,
611 struct zone *zone, struct memory_notify *arg)
612{
613 int nid = zone_to_nid(zone);
614 enum zone_type zone_last = ZONE_NORMAL;
615
616 /*
617 * If we have HIGHMEM or movable node, node_states[N_NORMAL_MEMORY]
618 * contains nodes which have zones of 0...ZONE_NORMAL,
619 * set zone_last to ZONE_NORMAL.
620 *
621 * If we don't have HIGHMEM nor movable node,
622 * node_states[N_NORMAL_MEMORY] contains nodes which have zones of
623 * 0...ZONE_MOVABLE, set zone_last to ZONE_MOVABLE.
624 */
625 if (N_MEMORY == N_NORMAL_MEMORY)
626 zone_last = ZONE_MOVABLE;
627
628 /*
 629 * if the memory to be onlined is in a zone of 0...zone_last, and
630 * the zones of 0...zone_last don't have memory before online, we will
631 * need to set the node to node_states[N_NORMAL_MEMORY] after
632 * the memory is online.
633 */
634 if (zone_idx(zone) <= zone_last && !node_state(nid, N_NORMAL_MEMORY))
635 arg->status_change_nid_normal = nid;
636 else
637 arg->status_change_nid_normal = -1;
638
639#ifdef CONFIG_HIGHMEM
640 /*
641 * If we have movable node, node_states[N_HIGH_MEMORY]
642 * contains nodes which have zones of 0...ZONE_HIGHMEM,
643 * set zone_last to ZONE_HIGHMEM.
644 *
645 * If we don't have movable node, node_states[N_NORMAL_MEMORY]
646 * contains nodes which have zones of 0...ZONE_MOVABLE,
647 * set zone_last to ZONE_MOVABLE.
648 */
649 zone_last = ZONE_HIGHMEM;
650 if (N_MEMORY == N_HIGH_MEMORY)
651 zone_last = ZONE_MOVABLE;
652
653 if (zone_idx(zone) <= zone_last && !node_state(nid, N_HIGH_MEMORY))
654 arg->status_change_nid_high = nid;
655 else
656 arg->status_change_nid_high = -1;
657#else
658 arg->status_change_nid_high = arg->status_change_nid_normal;
659#endif
660
661 /*
 662 * if the node doesn't have memory before online, we will need to
663 * set the node to node_states[N_MEMORY] after the memory
664 * is online.
665 */
666 if (!node_state(nid, N_MEMORY))
667 arg->status_change_nid = nid;
668 else
669 arg->status_change_nid = -1;
670}
671
672static void node_states_set_node(int node, struct memory_notify *arg)
673{
674 if (arg->status_change_nid_normal >= 0)
675 node_set_state(node, N_NORMAL_MEMORY);
676
677 if (arg->status_change_nid_high >= 0)
678 node_set_state(node, N_HIGH_MEMORY);
679
680 node_set_state(node, N_MEMORY);
681}
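node_states_check_changes_online() above records, per node-state mask, which nid will need setting once the memory is actually online, using -1 for "no change"; node_states_set_node() then applies exactly those bits. A deliberately simplified model of that contract, ignoring the HIGHMEM/zone_last cases and using invented helper names:

#include <assert.h>
#include <stdbool.h>

struct notify { int nid_normal; int nid; };

static void check_changes_online(int nid, bool has_normal, bool has_memory,
				 struct notify *arg)
{
	arg->nid_normal = has_normal ? -1 : nid;   /* N_NORMAL_MEMORY candidate */
	arg->nid        = has_memory ? -1 : nid;   /* N_MEMORY candidate        */
}

int main(void)
{
	struct notify arg;

	/* first memory onlined on node 2: both states would need setting */
	check_changes_online(2, false, false, &arg);
	assert(arg.nid_normal == 2 && arg.nid == 2);

	/* node already has normal memory: nothing to change */
	check_changes_online(2, true, true, &arg);
	assert(arg.nid_normal == -1 && arg.nid == -1);
	return 0;
}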
682
683
684int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_type)
465{ 685{
466 unsigned long onlined_pages = 0; 686 unsigned long onlined_pages = 0;
467 struct zone *zone; 687 struct zone *zone;
@@ -471,13 +691,40 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages)
471 struct memory_notify arg; 691 struct memory_notify arg;
472 692
473 lock_memory_hotplug(); 693 lock_memory_hotplug();
694 /*
695 * This doesn't need a lock to do pfn_to_page().
696 * The section can't be removed here because of the
697 * memory_block->state_mutex.
698 */
699 zone = page_zone(pfn_to_page(pfn));
700
701 if ((zone_idx(zone) > ZONE_NORMAL || online_type == ONLINE_MOVABLE) &&
702 !can_online_high_movable(zone)) {
703 unlock_memory_hotplug();
704 return -1;
705 }
706
707 if (online_type == ONLINE_KERNEL && zone_idx(zone) == ZONE_MOVABLE) {
708 if (move_pfn_range_left(zone - 1, zone, pfn, pfn + nr_pages)) {
709 unlock_memory_hotplug();
710 return -1;
711 }
712 }
713 if (online_type == ONLINE_MOVABLE && zone_idx(zone) == ZONE_MOVABLE - 1) {
714 if (move_pfn_range_right(zone, zone + 1, pfn, pfn + nr_pages)) {
715 unlock_memory_hotplug();
716 return -1;
717 }
718 }
719
 720 /* Previous code may have changed the zone of the pfn range */
721 zone = page_zone(pfn_to_page(pfn));
722
474 arg.start_pfn = pfn; 723 arg.start_pfn = pfn;
475 arg.nr_pages = nr_pages; 724 arg.nr_pages = nr_pages;
476 arg.status_change_nid = -1; 725 node_states_check_changes_online(nr_pages, zone, &arg);
477 726
478 nid = page_to_nid(pfn_to_page(pfn)); 727 nid = page_to_nid(pfn_to_page(pfn));
479 if (node_present_pages(nid) == 0)
480 arg.status_change_nid = nid;
481 728
482 ret = memory_notify(MEM_GOING_ONLINE, &arg); 729 ret = memory_notify(MEM_GOING_ONLINE, &arg);
483 ret = notifier_to_errno(ret); 730 ret = notifier_to_errno(ret);
@@ -487,23 +734,21 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages)
487 return ret; 734 return ret;
488 } 735 }
489 /* 736 /*
490 * This doesn't need a lock to do pfn_to_page().
491 * The section can't be removed here because of the
492 * memory_block->state_mutex.
493 */
494 zone = page_zone(pfn_to_page(pfn));
495 /*
496 * If this zone is not populated, then it is not in zonelist. 737 * If this zone is not populated, then it is not in zonelist.
497 * This means the page allocator ignores this zone. 738 * This means the page allocator ignores this zone.
498 * So, zonelist must be updated after online. 739 * So, zonelist must be updated after online.
499 */ 740 */
500 mutex_lock(&zonelists_mutex); 741 mutex_lock(&zonelists_mutex);
501 if (!populated_zone(zone)) 742 if (!populated_zone(zone)) {
502 need_zonelists_rebuild = 1; 743 need_zonelists_rebuild = 1;
744 build_all_zonelists(NULL, zone);
745 }
503 746
504 ret = walk_system_ram_range(pfn, nr_pages, &onlined_pages, 747 ret = walk_system_ram_range(pfn, nr_pages, &onlined_pages,
505 online_pages_range); 748 online_pages_range);
506 if (ret) { 749 if (ret) {
750 if (need_zonelists_rebuild)
751 zone_pcp_reset(zone);
507 mutex_unlock(&zonelists_mutex); 752 mutex_unlock(&zonelists_mutex);
508 printk(KERN_DEBUG "online_pages [mem %#010llx-%#010llx] failed\n", 753 printk(KERN_DEBUG "online_pages [mem %#010llx-%#010llx] failed\n",
509 (unsigned long long) pfn << PAGE_SHIFT, 754 (unsigned long long) pfn << PAGE_SHIFT,
@@ -514,12 +759,13 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages)
514 return ret; 759 return ret;
515 } 760 }
516 761
762 zone->managed_pages += onlined_pages;
517 zone->present_pages += onlined_pages; 763 zone->present_pages += onlined_pages;
518 zone->zone_pgdat->node_present_pages += onlined_pages; 764 zone->zone_pgdat->node_present_pages += onlined_pages;
519 if (onlined_pages) { 765 if (onlined_pages) {
520 node_set_state(zone_to_nid(zone), N_HIGH_MEMORY); 766 node_states_set_node(zone_to_nid(zone), &arg);
521 if (need_zonelists_rebuild) 767 if (need_zonelists_rebuild)
522 build_all_zonelists(NULL, zone); 768 build_all_zonelists(NULL, NULL);
523 else 769 else
524 zone_pcp_update(zone); 770 zone_pcp_update(zone);
525 } 771 }
@@ -812,7 +1058,8 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
812 * migrate_pages returns # of failed pages. 1058 * migrate_pages returns # of failed pages.
813 */ 1059 */
814 ret = migrate_pages(&source, alloc_migrate_target, 0, 1060 ret = migrate_pages(&source, alloc_migrate_target, 0,
815 true, MIGRATE_SYNC); 1061 true, MIGRATE_SYNC,
1062 MR_MEMORY_HOTPLUG);
816 if (ret) 1063 if (ret)
817 putback_lru_pages(&source); 1064 putback_lru_pages(&source);
818 } 1065 }
@@ -847,7 +1094,7 @@ check_pages_isolated_cb(unsigned long start_pfn, unsigned long nr_pages,
847{ 1094{
848 int ret; 1095 int ret;
849 long offlined = *(long *)data; 1096 long offlined = *(long *)data;
850 ret = test_pages_isolated(start_pfn, start_pfn + nr_pages); 1097 ret = test_pages_isolated(start_pfn, start_pfn + nr_pages, true);
851 offlined = nr_pages; 1098 offlined = nr_pages;
852 if (!ret) 1099 if (!ret)
853 *(long *)data += offlined; 1100 *(long *)data += offlined;
@@ -867,6 +1114,132 @@ check_pages_isolated(unsigned long start_pfn, unsigned long end_pfn)
867 return offlined; 1114 return offlined;
868} 1115}
869 1116
1117#ifdef CONFIG_MOVABLE_NODE
1118/*
1119 * When CONFIG_MOVABLE_NODE, we permit offlining of a node which doesn't have
1120 * normal memory.
1121 */
1122static bool can_offline_normal(struct zone *zone, unsigned long nr_pages)
1123{
1124 return true;
1125}
1126#else /* CONFIG_MOVABLE_NODE */
1127/* ensure the node has NORMAL memory if it is still online */
1128static bool can_offline_normal(struct zone *zone, unsigned long nr_pages)
1129{
1130 struct pglist_data *pgdat = zone->zone_pgdat;
1131 unsigned long present_pages = 0;
1132 enum zone_type zt;
1133
1134 for (zt = 0; zt <= ZONE_NORMAL; zt++)
1135 present_pages += pgdat->node_zones[zt].present_pages;
1136
1137 if (present_pages > nr_pages)
1138 return true;
1139
1140 present_pages = 0;
1141 for (; zt <= ZONE_MOVABLE; zt++)
1142 present_pages += pgdat->node_zones[zt].present_pages;
1143
1144 /*
1145 * we can't offline the last normal memory until all
1146 * higher memory is offlined.
1147 */
1148 return present_pages == 0;
1149}
1150#endif /* CONFIG_MOVABLE_NODE */
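The !CONFIG_MOVABLE_NODE variant of can_offline_normal() above allows an offline either when some normal memory would remain on the node, or when all higher (highmem/movable) memory is already gone, so normal memory is always the last to leave. A small model of that rule with the zones collapsed into two page counts (names invented):

#include <assert.h>
#include <stdbool.h>

static bool can_offline_normal(unsigned long normal_pages,
			       unsigned long higher_pages,
			       unsigned long nr_pages)
{
	if (normal_pages > nr_pages)
		return true;            /* normal memory left afterwards   */
	return higher_pages == 0;       /* last normal memory must go last */
}

int main(void)
{
	assert(can_offline_normal(1024, 4096, 256));   /* plenty of normal left */
	assert(!can_offline_normal(256, 4096, 256));   /* movable still present */
	assert(can_offline_normal(256, 0, 256));       /* node fully emptied    */
	return 0;
}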
1151
1152/* check which state of node_states will be changed when offline memory */
1153static void node_states_check_changes_offline(unsigned long nr_pages,
1154 struct zone *zone, struct memory_notify *arg)
1155{
1156 struct pglist_data *pgdat = zone->zone_pgdat;
1157 unsigned long present_pages = 0;
1158 enum zone_type zt, zone_last = ZONE_NORMAL;
1159
1160 /*
1161 * If we have HIGHMEM or movable node, node_states[N_NORMAL_MEMORY]
1162 * contains nodes which have zones of 0...ZONE_NORMAL,
1163 * set zone_last to ZONE_NORMAL.
1164 *
1165 * If we don't have HIGHMEM nor movable node,
1166 * node_states[N_NORMAL_MEMORY] contains nodes which have zones of
1167 * 0...ZONE_MOVABLE, set zone_last to ZONE_MOVABLE.
1168 */
1169 if (N_MEMORY == N_NORMAL_MEMORY)
1170 zone_last = ZONE_MOVABLE;
1171
1172 /*
1173 * check whether node_states[N_NORMAL_MEMORY] will be changed.
 1174 * If the memory to be offlined is in a zone of 0...zone_last,
1175 * and it is the last present memory, 0...zone_last will
 1176 * become empty after offline, thus we can determine we will
1177 * need to clear the node from node_states[N_NORMAL_MEMORY].
1178 */
1179 for (zt = 0; zt <= zone_last; zt++)
1180 present_pages += pgdat->node_zones[zt].present_pages;
1181 if (zone_idx(zone) <= zone_last && nr_pages >= present_pages)
1182 arg->status_change_nid_normal = zone_to_nid(zone);
1183 else
1184 arg->status_change_nid_normal = -1;
1185
1186#ifdef CONFIG_HIGHMEM
1187 /*
1188 * If we have movable node, node_states[N_HIGH_MEMORY]
1189 * contains nodes which have zones of 0...ZONE_HIGHMEM,
1190 * set zone_last to ZONE_HIGHMEM.
1191 *
1192 * If we don't have movable node, node_states[N_NORMAL_MEMORY]
1193 * contains nodes which have zones of 0...ZONE_MOVABLE,
1194 * set zone_last to ZONE_MOVABLE.
1195 */
1196 zone_last = ZONE_HIGHMEM;
1197 if (N_MEMORY == N_HIGH_MEMORY)
1198 zone_last = ZONE_MOVABLE;
1199
1200 for (; zt <= zone_last; zt++)
1201 present_pages += pgdat->node_zones[zt].present_pages;
1202 if (zone_idx(zone) <= zone_last && nr_pages >= present_pages)
1203 arg->status_change_nid_high = zone_to_nid(zone);
1204 else
1205 arg->status_change_nid_high = -1;
1206#else
1207 arg->status_change_nid_high = arg->status_change_nid_normal;
1208#endif
1209
1210 /*
1211 * node_states[N_HIGH_MEMORY] contains nodes which have 0...ZONE_MOVABLE
1212 */
1213 zone_last = ZONE_MOVABLE;
1214
1215 /*
1216 * check whether node_states[N_HIGH_MEMORY] will be changed
1217 * If we try to offline the last present @nr_pages from the node,
 1218 * we can determine we will need to clear the node from
1219 * node_states[N_HIGH_MEMORY].
1220 */
1221 for (; zt <= zone_last; zt++)
1222 present_pages += pgdat->node_zones[zt].present_pages;
1223 if (nr_pages >= present_pages)
1224 arg->status_change_nid = zone_to_nid(zone);
1225 else
1226 arg->status_change_nid = -1;
1227}
1228
1229static void node_states_clear_node(int node, struct memory_notify *arg)
1230{
1231 if (arg->status_change_nid_normal >= 0)
1232 node_clear_state(node, N_NORMAL_MEMORY);
1233
1234 if ((N_MEMORY != N_NORMAL_MEMORY) &&
1235 (arg->status_change_nid_high >= 0))
1236 node_clear_state(node, N_HIGH_MEMORY);
1237
1238 if ((N_MEMORY != N_HIGH_MEMORY) &&
1239 (arg->status_change_nid >= 0))
1240 node_clear_state(node, N_MEMORY);
1241}
1242
870static int __ref __offline_pages(unsigned long start_pfn, 1243static int __ref __offline_pages(unsigned long start_pfn,
871 unsigned long end_pfn, unsigned long timeout) 1244 unsigned long end_pfn, unsigned long timeout)
872{ 1245{
@@ -893,16 +1266,19 @@ static int __ref __offline_pages(unsigned long start_pfn,
893 node = zone_to_nid(zone); 1266 node = zone_to_nid(zone);
894 nr_pages = end_pfn - start_pfn; 1267 nr_pages = end_pfn - start_pfn;
895 1268
1269 ret = -EINVAL;
1270 if (zone_idx(zone) <= ZONE_NORMAL && !can_offline_normal(zone, nr_pages))
1271 goto out;
1272
896 /* set above range as isolated */ 1273 /* set above range as isolated */
897 ret = start_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE); 1274 ret = start_isolate_page_range(start_pfn, end_pfn,
1275 MIGRATE_MOVABLE, true);
898 if (ret) 1276 if (ret)
899 goto out; 1277 goto out;
900 1278
901 arg.start_pfn = start_pfn; 1279 arg.start_pfn = start_pfn;
902 arg.nr_pages = nr_pages; 1280 arg.nr_pages = nr_pages;
903 arg.status_change_nid = -1; 1281 node_states_check_changes_offline(nr_pages, zone, &arg);
904 if (nr_pages >= node_present_pages(node))
905 arg.status_change_nid = node;
906 1282
907 ret = memory_notify(MEM_GOING_OFFLINE, &arg); 1283 ret = memory_notify(MEM_GOING_OFFLINE, &arg);
908 ret = notifier_to_errno(ret); 1284 ret = notifier_to_errno(ret);
@@ -943,10 +1319,10 @@ repeat:
943 goto repeat; 1319 goto repeat;
944 } 1320 }
945 } 1321 }
946 /* drain all zone's lru pagevec, this is asyncronous... */ 1322 /* drain all zone's lru pagevec, this is asynchronous... */
947 lru_add_drain_all(); 1323 lru_add_drain_all();
948 yield(); 1324 yield();
949 /* drain pcp pages , this is synchrouns. */ 1325 /* drain pcp pages, this is synchronous. */
950 drain_all_pages(); 1326 drain_all_pages();
951 /* check again */ 1327 /* check again */
952 offlined_pages = check_pages_isolated(start_pfn, end_pfn); 1328 offlined_pages = check_pages_isolated(start_pfn, end_pfn);
@@ -955,12 +1331,13 @@ repeat:
955 goto failed_removal; 1331 goto failed_removal;
956 } 1332 }
957 printk(KERN_INFO "Offlined Pages %ld\n", offlined_pages); 1333 printk(KERN_INFO "Offlined Pages %ld\n", offlined_pages);
958 /* Ok, all of our target is islaoted. 1334 /* Ok, all of our target is isolated.
959 We cannot do rollback at this point. */ 1335 We cannot do rollback at this point. */
960 offline_isolated_pages(start_pfn, end_pfn); 1336 offline_isolated_pages(start_pfn, end_pfn);
961 /* reset pagetype flags and makes migrate type to be MOVABLE */ 1337 /* reset pagetype flags and makes migrate type to be MOVABLE */
962 undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE); 1338 undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE);
963 /* removal success */ 1339 /* removal success */
1340 zone->managed_pages -= offlined_pages;
964 zone->present_pages -= offlined_pages; 1341 zone->present_pages -= offlined_pages;
965 zone->zone_pgdat->node_present_pages -= offlined_pages; 1342 zone->zone_pgdat->node_present_pages -= offlined_pages;
966 totalram_pages -= offlined_pages; 1343 totalram_pages -= offlined_pages;
@@ -975,10 +1352,9 @@ repeat:
975 } else 1352 } else
976 zone_pcp_update(zone); 1353 zone_pcp_update(zone);
977 1354
978 if (!node_present_pages(node)) { 1355 node_states_clear_node(node, &arg);
979 node_clear_state(node, N_HIGH_MEMORY); 1356 if (arg.status_change_nid >= 0)
980 kswapd_stop(node); 1357 kswapd_stop(node);
981 }
982 1358
983 vm_total_pages = nr_free_pagecache_pages(); 1359 vm_total_pages = nr_free_pagecache_pages();
984 writeback_set_ratelimit(); 1360 writeback_set_ratelimit();
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 4ea600da8940..d1b315e98627 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -90,6 +90,7 @@
90#include <linux/syscalls.h> 90#include <linux/syscalls.h>
91#include <linux/ctype.h> 91#include <linux/ctype.h>
92#include <linux/mm_inline.h> 92#include <linux/mm_inline.h>
93#include <linux/mmu_notifier.h>
93 94
94#include <asm/tlbflush.h> 95#include <asm/tlbflush.h>
95#include <asm/uaccess.h> 96#include <asm/uaccess.h>
@@ -117,6 +118,26 @@ static struct mempolicy default_policy = {
117 .flags = MPOL_F_LOCAL, 118 .flags = MPOL_F_LOCAL,
118}; 119};
119 120
121static struct mempolicy preferred_node_policy[MAX_NUMNODES];
122
123static struct mempolicy *get_task_policy(struct task_struct *p)
124{
125 struct mempolicy *pol = p->mempolicy;
126 int node;
127
128 if (!pol) {
129 node = numa_node_id();
130 if (node != -1)
131 pol = &preferred_node_policy[node];
132
133 /* preferred_node_policy is not initialised early in boot */
134 if (!pol->mode)
135 pol = NULL;
136 }
137
138 return pol;
139}
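get_task_policy() above makes a task without a mempolicy fall back to a per-node "prefer the local node" policy instead of NULL, except very early in boot before preferred_node_policy[] is initialised. A rough userspace model of that fallback; the types and the MAX_NODES table below are invented, and mode == 0 stands in for "not initialised yet".

#include <assert.h>
#include <stddef.h>

#define MAX_NODES      4
#define MPOL_PREFERRED 1

struct policy { int mode; };

static struct policy preferred_node_policy[MAX_NODES] = {
	[0] = { MPOL_PREFERRED }, [1] = { MPOL_PREFERRED },
};

static struct policy *task_policy(struct policy *task_pol, int node)
{
	struct policy *pol = task_pol;

	if (!pol && node != -1) {
		pol = &preferred_node_policy[node];
		if (!pol->mode)          /* table not initialised yet */
			pol = NULL;
	}
	return pol;
}

int main(void)
{
	struct policy mine = { MPOL_PREFERRED };

	assert(task_policy(&mine, 1) == &mine);                    /* own policy wins */
	assert(task_policy(NULL, 1) == &preferred_node_policy[1]); /* local fallback  */
	assert(task_policy(NULL, 3) == NULL);                      /* uninitialised   */
	return 0;
}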
140
120static const struct mempolicy_operations { 141static const struct mempolicy_operations {
121 int (*create)(struct mempolicy *pol, const nodemask_t *nodes); 142 int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
122 /* 143 /*
@@ -212,9 +233,9 @@ static int mpol_set_nodemask(struct mempolicy *pol,
212 /* if mode is MPOL_DEFAULT, pol is NULL. This is right. */ 233 /* if mode is MPOL_DEFAULT, pol is NULL. This is right. */
213 if (pol == NULL) 234 if (pol == NULL)
214 return 0; 235 return 0;
215 /* Check N_HIGH_MEMORY */ 236 /* Check N_MEMORY */
216 nodes_and(nsc->mask1, 237 nodes_and(nsc->mask1,
217 cpuset_current_mems_allowed, node_states[N_HIGH_MEMORY]); 238 cpuset_current_mems_allowed, node_states[N_MEMORY]);
218 239
219 VM_BUG_ON(!nodes); 240 VM_BUG_ON(!nodes);
220 if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes)) 241 if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes))
@@ -254,7 +275,7 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
254 if (mode == MPOL_DEFAULT) { 275 if (mode == MPOL_DEFAULT) {
255 if (nodes && !nodes_empty(*nodes)) 276 if (nodes && !nodes_empty(*nodes))
256 return ERR_PTR(-EINVAL); 277 return ERR_PTR(-EINVAL);
257 return NULL; /* simply delete any existing policy */ 278 return NULL;
258 } 279 }
259 VM_BUG_ON(!nodes); 280 VM_BUG_ON(!nodes);
260 281
@@ -269,6 +290,10 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
269 (flags & MPOL_F_RELATIVE_NODES))) 290 (flags & MPOL_F_RELATIVE_NODES)))
270 return ERR_PTR(-EINVAL); 291 return ERR_PTR(-EINVAL);
271 } 292 }
293 } else if (mode == MPOL_LOCAL) {
294 if (!nodes_empty(*nodes))
295 return ERR_PTR(-EINVAL);
296 mode = MPOL_PREFERRED;
272 } else if (nodes_empty(*nodes)) 297 } else if (nodes_empty(*nodes))
273 return ERR_PTR(-EINVAL); 298 return ERR_PTR(-EINVAL);
274 policy = kmem_cache_alloc(policy_cache, GFP_KERNEL); 299 policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
@@ -511,7 +536,7 @@ static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
511 pmd = pmd_offset(pud, addr); 536 pmd = pmd_offset(pud, addr);
512 do { 537 do {
513 next = pmd_addr_end(addr, end); 538 next = pmd_addr_end(addr, end);
514 split_huge_page_pmd(vma->vm_mm, pmd); 539 split_huge_page_pmd(vma, addr, pmd);
515 if (pmd_none_or_trans_huge_or_clear_bad(pmd)) 540 if (pmd_none_or_trans_huge_or_clear_bad(pmd))
516 continue; 541 continue;
517 if (check_pte_range(vma, pmd, addr, next, nodes, 542 if (check_pte_range(vma, pmd, addr, next, nodes,
@@ -561,6 +586,36 @@ static inline int check_pgd_range(struct vm_area_struct *vma,
561 return 0; 586 return 0;
562} 587}
563 588
589#ifdef CONFIG_ARCH_USES_NUMA_PROT_NONE
590/*
591 * This is used to mark a range of virtual addresses to be inaccessible.
592 * These are later cleared by a NUMA hinting fault. Depending on these
593 * faults, pages may be migrated for better NUMA placement.
594 *
595 * This is assuming that NUMA faults are handled using PROT_NONE. If
596 * an architecture makes a different choice, it will need further
597 * changes to the core.
598 */
599unsigned long change_prot_numa(struct vm_area_struct *vma,
600 unsigned long addr, unsigned long end)
601{
602 int nr_updated;
603 BUILD_BUG_ON(_PAGE_NUMA != _PAGE_PROTNONE);
604
605 nr_updated = change_protection(vma, addr, end, vma->vm_page_prot, 0, 1);
606 if (nr_updated)
607 count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated);
608
609 return nr_updated;
610}
611#else
612static unsigned long change_prot_numa(struct vm_area_struct *vma,
613 unsigned long addr, unsigned long end)
614{
615 return 0;
616}
617#endif /* CONFIG_ARCH_USES_NUMA_PROT_NONE */
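change_prot_numa() above implements the lazy side of mbind(): rather than isolating and migrating pages immediately, it only flips present ptes into the NUMA-hinting (PROT_NONE-based) state and reports how many were updated, leaving migration decisions to the later hinting faults. A toy model of that pass over a pte range, with an invented pte representation:

#include <assert.h>
#include <stddef.h>

enum pte_state { PTE_NONE, PTE_PRESENT, PTE_NUMA };

static unsigned long change_prot_numa_model(enum pte_state *ptes, size_t n)
{
	unsigned long nr_updated = 0;
	size_t i;

	for (i = 0; i < n; i++) {
		if (ptes[i] != PTE_PRESENT)
			continue;        /* skip holes and already-marked ptes */
		ptes[i] = PTE_NUMA;
		nr_updated++;            /* feeds the NUMA_PTE_UPDATES counter */
	}
	return nr_updated;
}

int main(void)
{
	enum pte_state range[5] = {
		PTE_PRESENT, PTE_NONE, PTE_PRESENT, PTE_NUMA, PTE_PRESENT
	};

	assert(change_prot_numa_model(range, 5) == 3);
	return 0;
}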
618
564/* 619/*
565 * Check if all pages in a range are on a set of nodes. 620 * Check if all pages in a range are on a set of nodes.
566 * If pagelist != NULL then isolate pages from the LRU and 621 * If pagelist != NULL then isolate pages from the LRU and
@@ -579,22 +634,32 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
579 return ERR_PTR(-EFAULT); 634 return ERR_PTR(-EFAULT);
580 prev = NULL; 635 prev = NULL;
581 for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) { 636 for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
637 unsigned long endvma = vma->vm_end;
638
639 if (endvma > end)
640 endvma = end;
641 if (vma->vm_start > start)
642 start = vma->vm_start;
643
582 if (!(flags & MPOL_MF_DISCONTIG_OK)) { 644 if (!(flags & MPOL_MF_DISCONTIG_OK)) {
583 if (!vma->vm_next && vma->vm_end < end) 645 if (!vma->vm_next && vma->vm_end < end)
584 return ERR_PTR(-EFAULT); 646 return ERR_PTR(-EFAULT);
585 if (prev && prev->vm_end < vma->vm_start) 647 if (prev && prev->vm_end < vma->vm_start)
586 return ERR_PTR(-EFAULT); 648 return ERR_PTR(-EFAULT);
587 } 649 }
588 if (!is_vm_hugetlb_page(vma) && 650
589 ((flags & MPOL_MF_STRICT) || 651 if (is_vm_hugetlb_page(vma))
652 goto next;
653
654 if (flags & MPOL_MF_LAZY) {
655 change_prot_numa(vma, start, endvma);
656 goto next;
657 }
658
659 if ((flags & MPOL_MF_STRICT) ||
590 ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) && 660 ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
591 vma_migratable(vma)))) { 661 vma_migratable(vma))) {
592 unsigned long endvma = vma->vm_end;
593 662
594 if (endvma > end)
595 endvma = end;
596 if (vma->vm_start > start)
597 start = vma->vm_start;
598 err = check_pgd_range(vma, start, endvma, nodes, 663 err = check_pgd_range(vma, start, endvma, nodes,
599 flags, private); 664 flags, private);
600 if (err) { 665 if (err) {
@@ -602,6 +667,7 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
602 break; 667 break;
603 } 668 }
604 } 669 }
670next:
605 prev = vma; 671 prev = vma;
606 } 672 }
607 return first; 673 return first;
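
The MPOL_MF_LAZY path added above is driven from user space through mbind(). A
minimal sketch of such a call on a kernel from this series follows; the MPOL_*
constants are defined locally because older libc/libnuma headers may not know
MPOL_MF_LAZY, and their values are assumed to mirror the uapi header that
accompanies this change:

    /* Sketch: request lazy migration of a range to node 0 via mbind(). */
    #define _GNU_SOURCE
    #include <stdio.h>
    #include <string.h>
    #include <sys/mman.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    #ifndef MPOL_BIND
    #define MPOL_BIND       2
    #endif
    #ifndef MPOL_MF_MOVE
    #define MPOL_MF_MOVE    (1 << 1)
    #endif
    #ifndef MPOL_MF_LAZY
    #define MPOL_MF_LAZY    (1 << 3)    /* assumed value, per this series */
    #endif

    int main(void)
    {
        size_t len = 64UL << 20;            /* 64MB test range */
        unsigned long nodemask = 1;         /* node 0 only */
        void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

        if (p == MAP_FAILED) {
            perror("mmap");
            return 1;
        }
        memset(p, 0, len);                  /* fault the pages in somewhere */

        /*
         * Lazy migration: PTE protections are changed now, the pages move
         * to node 0 only when they are next touched.
         */
        if (syscall(SYS_mbind, p, len, MPOL_BIND, &nodemask,
                    8 * sizeof(nodemask), MPOL_MF_MOVE | MPOL_MF_LAZY))
            perror("mbind");

        return 0;
    }

The raw syscall() is used only to avoid a hard dependency on libnuma; on kernels
without this series the call is expected to fail with EINVAL.
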
@@ -961,7 +1027,8 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest,
961 1027
962 if (!list_empty(&pagelist)) { 1028 if (!list_empty(&pagelist)) {
963 err = migrate_pages(&pagelist, new_node_page, dest, 1029 err = migrate_pages(&pagelist, new_node_page, dest,
964 false, MIGRATE_SYNC); 1030 false, MIGRATE_SYNC,
1031 MR_SYSCALL);
965 if (err) 1032 if (err)
966 putback_lru_pages(&pagelist); 1033 putback_lru_pages(&pagelist);
967 } 1034 }
@@ -1133,8 +1200,7 @@ static long do_mbind(unsigned long start, unsigned long len,
1133 int err; 1200 int err;
1134 LIST_HEAD(pagelist); 1201 LIST_HEAD(pagelist);
1135 1202
1136 if (flags & ~(unsigned long)(MPOL_MF_STRICT | 1203 if (flags & ~(unsigned long)MPOL_MF_VALID)
1137 MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
1138 return -EINVAL; 1204 return -EINVAL;
1139 if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE)) 1205 if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
1140 return -EPERM; 1206 return -EPERM;
@@ -1157,6 +1223,9 @@ static long do_mbind(unsigned long start, unsigned long len,
1157 if (IS_ERR(new)) 1223 if (IS_ERR(new))
1158 return PTR_ERR(new); 1224 return PTR_ERR(new);
1159 1225
1226 if (flags & MPOL_MF_LAZY)
1227 new->flags |= MPOL_F_MOF;
1228
1160 /* 1229 /*
1161 * If we are using the default policy then operation 1230 * If we are using the default policy then operation
1162 * on discontinuous address spaces is okay after all 1231 * on discontinuous address spaces is okay after all
@@ -1193,21 +1262,24 @@ static long do_mbind(unsigned long start, unsigned long len,
1193 vma = check_range(mm, start, end, nmask, 1262 vma = check_range(mm, start, end, nmask,
1194 flags | MPOL_MF_INVERT, &pagelist); 1263 flags | MPOL_MF_INVERT, &pagelist);
1195 1264
1196 err = PTR_ERR(vma); 1265 err = PTR_ERR(vma); /* maybe ... */
1197 if (!IS_ERR(vma)) { 1266 if (!IS_ERR(vma))
1198 int nr_failed = 0;
1199
1200 err = mbind_range(mm, start, end, new); 1267 err = mbind_range(mm, start, end, new);
1201 1268
1269 if (!err) {
1270 int nr_failed = 0;
1271
1202 if (!list_empty(&pagelist)) { 1272 if (!list_empty(&pagelist)) {
1273 WARN_ON_ONCE(flags & MPOL_MF_LAZY);
1203 nr_failed = migrate_pages(&pagelist, new_vma_page, 1274 nr_failed = migrate_pages(&pagelist, new_vma_page,
1204 (unsigned long)vma, 1275 (unsigned long)vma,
1205 false, MIGRATE_SYNC); 1276 false, MIGRATE_SYNC,
1277 MR_MEMPOLICY_MBIND);
1206 if (nr_failed) 1278 if (nr_failed)
1207 putback_lru_pages(&pagelist); 1279 putback_lru_pages(&pagelist);
1208 } 1280 }
1209 1281
1210 if (!err && nr_failed && (flags & MPOL_MF_STRICT)) 1282 if (nr_failed && (flags & MPOL_MF_STRICT))
1211 err = -EIO; 1283 err = -EIO;
1212 } else 1284 } else
1213 putback_lru_pages(&pagelist); 1285 putback_lru_pages(&pagelist);
@@ -1388,7 +1460,7 @@ SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
1388 goto out_put; 1460 goto out_put;
1389 } 1461 }
1390 1462
1391 if (!nodes_subset(*new, node_states[N_HIGH_MEMORY])) { 1463 if (!nodes_subset(*new, node_states[N_MEMORY])) {
1392 err = -EINVAL; 1464 err = -EINVAL;
1393 goto out_put; 1465 goto out_put;
1394 } 1466 }
@@ -1546,7 +1618,7 @@ asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
1546struct mempolicy *get_vma_policy(struct task_struct *task, 1618struct mempolicy *get_vma_policy(struct task_struct *task,
1547 struct vm_area_struct *vma, unsigned long addr) 1619 struct vm_area_struct *vma, unsigned long addr)
1548{ 1620{
1549 struct mempolicy *pol = task->mempolicy; 1621 struct mempolicy *pol = get_task_policy(task);
1550 1622
1551 if (vma) { 1623 if (vma) {
1552 if (vma->vm_ops && vma->vm_ops->get_policy) { 1624 if (vma->vm_ops && vma->vm_ops->get_policy) {
@@ -1907,7 +1979,6 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
1907 unsigned long addr, int node) 1979 unsigned long addr, int node)
1908{ 1980{
1909 struct mempolicy *pol; 1981 struct mempolicy *pol;
1910 struct zonelist *zl;
1911 struct page *page; 1982 struct page *page;
1912 unsigned int cpuset_mems_cookie; 1983 unsigned int cpuset_mems_cookie;
1913 1984
@@ -1926,23 +1997,11 @@ retry_cpuset:
1926 1997
1927 return page; 1998 return page;
1928 } 1999 }
1929 zl = policy_zonelist(gfp, pol, node); 2000 page = __alloc_pages_nodemask(gfp, order,
1930 if (unlikely(mpol_needs_cond_ref(pol))) { 2001 policy_zonelist(gfp, pol, node),
1931 /*
1932 * slow path: ref counted shared policy
1933 */
1934 struct page *page = __alloc_pages_nodemask(gfp, order,
1935 zl, policy_nodemask(gfp, pol));
1936 __mpol_put(pol);
1937 if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
1938 goto retry_cpuset;
1939 return page;
1940 }
1941 /*
1942 * fast path: default or task policy
1943 */
1944 page = __alloc_pages_nodemask(gfp, order, zl,
1945 policy_nodemask(gfp, pol)); 2002 policy_nodemask(gfp, pol));
2003 if (unlikely(mpol_needs_cond_ref(pol)))
2004 __mpol_put(pol);
1946 if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page)) 2005 if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
1947 goto retry_cpuset; 2006 goto retry_cpuset;
1948 return page; 2007 return page;
@@ -1969,7 +2028,7 @@ retry_cpuset:
1969 */ 2028 */
1970struct page *alloc_pages_current(gfp_t gfp, unsigned order) 2029struct page *alloc_pages_current(gfp_t gfp, unsigned order)
1971{ 2030{
1972 struct mempolicy *pol = current->mempolicy; 2031 struct mempolicy *pol = get_task_policy(current);
1973 struct page *page; 2032 struct page *page;
1974 unsigned int cpuset_mems_cookie; 2033 unsigned int cpuset_mems_cookie;
1975 2034
@@ -2153,6 +2212,115 @@ static void sp_free(struct sp_node *n)
2153 kmem_cache_free(sn_cache, n); 2212 kmem_cache_free(sn_cache, n);
2154} 2213}
2155 2214
2215/**
2216 * mpol_misplaced - check whether current page node is valid in policy
2217 *
2218 * @page - page to be checked
2219 * @vma - vm area where page mapped
2220 * @addr - virtual address where page mapped
2221 *
2222 * Lookup current policy node id for vma,addr and "compare to" page's
2223 * node id.
2224 *
2225 * Returns:
2226 * -1 - not misplaced, page is in the right node
2227 * node - node id where the page should be
2228 *
2229 * Policy determination "mimics" alloc_page_vma().
2230 * Called from fault path where we know the vma and faulting address.
2231 */
2232int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long addr)
2233{
2234 struct mempolicy *pol;
2235 struct zone *zone;
2236 int curnid = page_to_nid(page);
2237 unsigned long pgoff;
2238 int polnid = -1;
2239 int ret = -1;
2240
2241 BUG_ON(!vma);
2242
2243 pol = get_vma_policy(current, vma, addr);
2244 if (!(pol->flags & MPOL_F_MOF))
2245 goto out;
2246
2247 switch (pol->mode) {
2248 case MPOL_INTERLEAVE:
2249 BUG_ON(addr >= vma->vm_end);
2250 BUG_ON(addr < vma->vm_start);
2251
2252 pgoff = vma->vm_pgoff;
2253 pgoff += (addr - vma->vm_start) >> PAGE_SHIFT;
2254 polnid = offset_il_node(pol, vma, pgoff);
2255 break;
2256
2257 case MPOL_PREFERRED:
2258 if (pol->flags & MPOL_F_LOCAL)
2259 polnid = numa_node_id();
2260 else
2261 polnid = pol->v.preferred_node;
2262 break;
2263
2264 case MPOL_BIND:
2265 /*
2266 * allows binding to multiple nodes.
2267 * use current page if in policy nodemask,
2268 * else select nearest allowed node, if any.
2269 * If no allowed nodes, use current [!misplaced].
2270 */
2271 if (node_isset(curnid, pol->v.nodes))
2272 goto out;
2273 (void)first_zones_zonelist(
2274 node_zonelist(numa_node_id(), GFP_HIGHUSER),
2275 gfp_zone(GFP_HIGHUSER),
2276 &pol->v.nodes, &zone);
2277 polnid = zone->node;
2278 break;
2279
2280 default:
2281 BUG();
2282 }
2283
2284 /* Migrate the page towards the node whose CPU is referencing it */
2285 if (pol->flags & MPOL_F_MORON) {
2286 int last_nid;
2287
2288 polnid = numa_node_id();
2289
2290 /*
2291 * Multi-stage node selection is used in conjunction
2292 * with a periodic migration fault to build a temporal
2293 * task<->page relation. By using a two-stage filter we
2294 * remove short/unlikely relations.
2295 *
2296 * Using P(p) ~ n_p / n_t as per frequentist
2297 * probability, we can equate a task's usage of a
2298 * particular page (n_p) per total usage of this
2299 * page (n_t) (in a given time-span) to a probability.
2300 *
2301 * Our periodic faults will sample this probability and
2302 * getting the same result twice in a row, given these
2303 * samples are fully independent, is then given by
2304 * P(n)^2, provided our sample period is sufficiently
2305 * short compared to the usage pattern.
2306 *
2307 * This quadric squishes small probabilities, making
2308 * it less likely we act on an unlikely task<->page
2309 * relation.
2310 */
2311 last_nid = page_xchg_last_nid(page, polnid);
2312 if (last_nid != polnid)
2313 goto out;
2314 }
2315
2316 if (curnid != polnid)
2317 ret = polnid;
2318out:
2319 mpol_cond_put(pol);
2320
2321 return ret;
2322}
2323
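The two-stage filter described in the comment above is easy to sanity-check
numerically: acting only when two roughly independent samples agree turns an
occurrence probability p into about p^2, which is what suppresses weak
task<->page relations. A throwaway simulation (standalone C, nothing
kernel-specific):

    /* Empirically compare the two-sample acceptance rate with p^2. */
    #include <stdio.h>
    #include <stdlib.h>

    int main(void)
    {
        const double probs[] = { 0.1, 0.5, 0.9 };
        const int samples = 1000000;
        int i, s;

        srand(1);
        for (i = 0; i < 3; i++) {
            double p = probs[i];
            int acted = 0;

            for (s = 0; s < samples; s++) {
                int first  = ((double)rand() / RAND_MAX) < p;
                int second = ((double)rand() / RAND_MAX) < p;

                if (first && second)    /* both samples agree */
                    acted++;
            }
            printf("p=%.1f acted=%.3f expected p^2=%.3f\n",
                   p, (double)acted / samples, p * p);
        }
        return 0;
    }
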
2156static void sp_delete(struct shared_policy *sp, struct sp_node *n) 2324static void sp_delete(struct shared_policy *sp, struct sp_node *n)
2157{ 2325{
2158 pr_debug("deleting %lx-l%lx\n", n->start, n->end); 2326 pr_debug("deleting %lx-l%lx\n", n->start, n->end);
@@ -2318,6 +2486,50 @@ void mpol_free_shared_policy(struct shared_policy *p)
2318 mutex_unlock(&p->mutex); 2486 mutex_unlock(&p->mutex);
2319} 2487}
2320 2488
2489#ifdef CONFIG_NUMA_BALANCING
2490static bool __initdata numabalancing_override;
2491
2492static void __init check_numabalancing_enable(void)
2493{
2494 bool numabalancing_default = false;
2495
2496 if (IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED))
2497 numabalancing_default = true;
2498
2499 if (nr_node_ids > 1 && !numabalancing_override) {
2500 printk(KERN_INFO "Enabling automatic NUMA balancing. "
2501 "Configure with numa_balancing= or sysctl");
2502 set_numabalancing_state(numabalancing_default);
2503 }
2504}
2505
2506static int __init setup_numabalancing(char *str)
2507{
2508 int ret = 0;
2509 if (!str)
2510 goto out;
2511 numabalancing_override = true;
2512
2513 if (!strcmp(str, "enable")) {
2514 set_numabalancing_state(true);
2515 ret = 1;
2516 } else if (!strcmp(str, "disable")) {
2517 set_numabalancing_state(false);
2518 ret = 1;
2519 }
2520out:
2521 if (!ret)
2522 printk(KERN_WARNING "Unable to parse numa_balancing=\n");
2523
2524 return ret;
2525}
2526__setup("numa_balancing=", setup_numabalancing);
2527#else
2528static inline void __init check_numabalancing_enable(void)
2529{
2530}
2531#endif /* CONFIG_NUMA_BALANCING */
2532
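For reference, the handler above accepts only the literal strings "enable" and
"disable", so automatic balancing can be forced on or off at boot with, for
example, numa_balancing=disable on the kernel command line; anything else trips
the "Unable to parse" warning. The sysctl mentioned in the printk is the run-time
counterpart, whose exact name depends on the kernel version, so the boot
parameter is the interface this hunk itself guarantees.
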
2321/* assumes fs == KERNEL_DS */ 2533/* assumes fs == KERNEL_DS */
2322void __init numa_policy_init(void) 2534void __init numa_policy_init(void)
2323{ 2535{
@@ -2333,13 +2545,22 @@ void __init numa_policy_init(void)
2333 sizeof(struct sp_node), 2545 sizeof(struct sp_node),
2334 0, SLAB_PANIC, NULL); 2546 0, SLAB_PANIC, NULL);
2335 2547
2548 for_each_node(nid) {
2549 preferred_node_policy[nid] = (struct mempolicy) {
2550 .refcnt = ATOMIC_INIT(1),
2551 .mode = MPOL_PREFERRED,
2552 .flags = MPOL_F_MOF | MPOL_F_MORON,
2553 .v = { .preferred_node = nid, },
2554 };
2555 }
2556
2336 /* 2557 /*
2337 * Set interleaving policy for system init. Interleaving is only 2558 * Set interleaving policy for system init. Interleaving is only
2338 * enabled across suitably sized nodes (default is >= 16MB), or 2559 * enabled across suitably sized nodes (default is >= 16MB), or
2339 * fall back to the largest node if they're all smaller. 2560 * fall back to the largest node if they're all smaller.
2340 */ 2561 */
2341 nodes_clear(interleave_nodes); 2562 nodes_clear(interleave_nodes);
2342 for_each_node_state(nid, N_HIGH_MEMORY) { 2563 for_each_node_state(nid, N_MEMORY) {
2343 unsigned long total_pages = node_present_pages(nid); 2564 unsigned long total_pages = node_present_pages(nid);
2344 2565
2345 /* Preserve the largest node */ 2566 /* Preserve the largest node */
@@ -2359,6 +2580,8 @@ void __init numa_policy_init(void)
2359 2580
2360 if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes)) 2581 if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
2361 printk("numa_policy_init: interleaving failed\n"); 2582 printk("numa_policy_init: interleaving failed\n");
2583
2584 check_numabalancing_enable();
2362} 2585}
2363 2586
2364/* Reset policy of current process to default */ 2587/* Reset policy of current process to default */
@@ -2375,14 +2598,13 @@ void numa_default_policy(void)
2375 * "local" is pseudo-policy: MPOL_PREFERRED with MPOL_F_LOCAL flag 2598 * "local" is pseudo-policy: MPOL_PREFERRED with MPOL_F_LOCAL flag
2376 * Used only for mpol_parse_str() and mpol_to_str() 2599 * Used only for mpol_parse_str() and mpol_to_str()
2377 */ 2600 */
2378#define MPOL_LOCAL MPOL_MAX
2379static const char * const policy_modes[] = 2601static const char * const policy_modes[] =
2380{ 2602{
2381 [MPOL_DEFAULT] = "default", 2603 [MPOL_DEFAULT] = "default",
2382 [MPOL_PREFERRED] = "prefer", 2604 [MPOL_PREFERRED] = "prefer",
2383 [MPOL_BIND] = "bind", 2605 [MPOL_BIND] = "bind",
2384 [MPOL_INTERLEAVE] = "interleave", 2606 [MPOL_INTERLEAVE] = "interleave",
2385 [MPOL_LOCAL] = "local" 2607 [MPOL_LOCAL] = "local",
2386}; 2608};
2387 2609
2388 2610
@@ -2420,7 +2642,7 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
2420 *nodelist++ = '\0'; 2642 *nodelist++ = '\0';
2421 if (nodelist_parse(nodelist, nodes)) 2643 if (nodelist_parse(nodelist, nodes))
2422 goto out; 2644 goto out;
2423 if (!nodes_subset(nodes, node_states[N_HIGH_MEMORY])) 2645 if (!nodes_subset(nodes, node_states[N_MEMORY]))
2424 goto out; 2646 goto out;
2425 } else 2647 } else
2426 nodes_clear(nodes); 2648 nodes_clear(nodes);
@@ -2428,12 +2650,12 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
2428 if (flags) 2650 if (flags)
2429 *flags++ = '\0'; /* terminate mode string */ 2651 *flags++ = '\0'; /* terminate mode string */
2430 2652
2431 for (mode = 0; mode <= MPOL_LOCAL; mode++) { 2653 for (mode = 0; mode < MPOL_MAX; mode++) {
2432 if (!strcmp(str, policy_modes[mode])) { 2654 if (!strcmp(str, policy_modes[mode])) {
2433 break; 2655 break;
2434 } 2656 }
2435 } 2657 }
2436 if (mode > MPOL_LOCAL) 2658 if (mode >= MPOL_MAX)
2437 goto out; 2659 goto out;
2438 2660
2439 switch (mode) { 2661 switch (mode) {
@@ -2454,7 +2676,7 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
2454 * Default to online nodes with memory if no nodelist 2676 * Default to online nodes with memory if no nodelist
2455 */ 2677 */
2456 if (!nodelist) 2678 if (!nodelist)
2457 nodes = node_states[N_HIGH_MEMORY]; 2679 nodes = node_states[N_MEMORY];
2458 break; 2680 break;
2459 case MPOL_LOCAL: 2681 case MPOL_LOCAL:
2460 /* 2682 /*
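
mpol_parse_str() is the parser behind tmpfs' mpol= mount option, with
mpol_to_str() as its inverse for display, so after this change "local" is an
ordinary mode string there. Something like mount -t tmpfs -o mpol=interleave
tmpfs /mnt, or mpol=bind:0-1 with an optional =static / =relative suffix on the
mode, is what ends up in this code path.
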
diff --git a/mm/migrate.c b/mm/migrate.c
index 77ed2d773705..3b676b0c5c3e 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -35,9 +35,13 @@
35#include <linux/hugetlb.h> 35#include <linux/hugetlb.h>
36#include <linux/hugetlb_cgroup.h> 36#include <linux/hugetlb_cgroup.h>
37#include <linux/gfp.h> 37#include <linux/gfp.h>
38#include <linux/balloon_compaction.h>
38 39
39#include <asm/tlbflush.h> 40#include <asm/tlbflush.h>
40 41
42#define CREATE_TRACE_POINTS
43#include <trace/events/migrate.h>
44
41#include "internal.h" 45#include "internal.h"
42 46
43/* 47/*
@@ -79,7 +83,30 @@ void putback_lru_pages(struct list_head *l)
79 list_del(&page->lru); 83 list_del(&page->lru);
80 dec_zone_page_state(page, NR_ISOLATED_ANON + 84 dec_zone_page_state(page, NR_ISOLATED_ANON +
81 page_is_file_cache(page)); 85 page_is_file_cache(page));
82 putback_lru_page(page); 86 putback_lru_page(page);
87 }
88}
89
90/*
91 * Put previously isolated pages back onto the appropriate lists
92 * from where they were once taken off for compaction/migration.
93 *
94 * This function shall be used instead of putback_lru_pages(),
95 * whenever the isolated pageset has been built by isolate_migratepages_range()
96 */
97void putback_movable_pages(struct list_head *l)
98{
99 struct page *page;
100 struct page *page2;
101
102 list_for_each_entry_safe(page, page2, l, lru) {
103 list_del(&page->lru);
104 dec_zone_page_state(page, NR_ISOLATED_ANON +
105 page_is_file_cache(page));
106 if (unlikely(balloon_page_movable(page)))
107 balloon_page_putback(page);
108 else
109 putback_lru_page(page);
83 } 110 }
84} 111}
85 112
@@ -91,8 +118,6 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma,
91{ 118{
92 struct mm_struct *mm = vma->vm_mm; 119 struct mm_struct *mm = vma->vm_mm;
93 swp_entry_t entry; 120 swp_entry_t entry;
94 pgd_t *pgd;
95 pud_t *pud;
96 pmd_t *pmd; 121 pmd_t *pmd;
97 pte_t *ptep, pte; 122 pte_t *ptep, pte;
98 spinlock_t *ptl; 123 spinlock_t *ptl;
@@ -103,19 +128,11 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma,
103 goto out; 128 goto out;
104 ptl = &mm->page_table_lock; 129 ptl = &mm->page_table_lock;
105 } else { 130 } else {
106 pgd = pgd_offset(mm, addr); 131 pmd = mm_find_pmd(mm, addr);
107 if (!pgd_present(*pgd)) 132 if (!pmd)
108 goto out;
109
110 pud = pud_offset(pgd, addr);
111 if (!pud_present(*pud))
112 goto out; 133 goto out;
113
114 pmd = pmd_offset(pud, addr);
115 if (pmd_trans_huge(*pmd)) 134 if (pmd_trans_huge(*pmd))
116 goto out; 135 goto out;
117 if (!pmd_present(*pmd))
118 goto out;
119 136
120 ptep = pte_offset_map(pmd, addr); 137 ptep = pte_offset_map(pmd, addr);
121 138
@@ -279,14 +296,14 @@ static int migrate_page_move_mapping(struct address_space *mapping,
279 struct page *newpage, struct page *page, 296 struct page *newpage, struct page *page,
280 struct buffer_head *head, enum migrate_mode mode) 297 struct buffer_head *head, enum migrate_mode mode)
281{ 298{
282 int expected_count; 299 int expected_count = 0;
283 void **pslot; 300 void **pslot;
284 301
285 if (!mapping) { 302 if (!mapping) {
286 /* Anonymous page without mapping */ 303 /* Anonymous page without mapping */
287 if (page_count(page) != 1) 304 if (page_count(page) != 1)
288 return -EAGAIN; 305 return -EAGAIN;
289 return 0; 306 return MIGRATEPAGE_SUCCESS;
290 } 307 }
291 308
292 spin_lock_irq(&mapping->tree_lock); 309 spin_lock_irq(&mapping->tree_lock);
@@ -356,7 +373,7 @@ static int migrate_page_move_mapping(struct address_space *mapping,
356 } 373 }
357 spin_unlock_irq(&mapping->tree_lock); 374 spin_unlock_irq(&mapping->tree_lock);
358 375
359 return 0; 376 return MIGRATEPAGE_SUCCESS;
360} 377}
361 378
362/* 379/*
@@ -372,7 +389,7 @@ int migrate_huge_page_move_mapping(struct address_space *mapping,
372 if (!mapping) { 389 if (!mapping) {
373 if (page_count(page) != 1) 390 if (page_count(page) != 1)
374 return -EAGAIN; 391 return -EAGAIN;
375 return 0; 392 return MIGRATEPAGE_SUCCESS;
376 } 393 }
377 394
378 spin_lock_irq(&mapping->tree_lock); 395 spin_lock_irq(&mapping->tree_lock);
@@ -399,7 +416,7 @@ int migrate_huge_page_move_mapping(struct address_space *mapping,
399 page_unfreeze_refs(page, expected_count - 1); 416 page_unfreeze_refs(page, expected_count - 1);
400 417
401 spin_unlock_irq(&mapping->tree_lock); 418 spin_unlock_irq(&mapping->tree_lock);
402 return 0; 419 return MIGRATEPAGE_SUCCESS;
403} 420}
404 421
405/* 422/*
@@ -407,7 +424,7 @@ int migrate_huge_page_move_mapping(struct address_space *mapping,
407 */ 424 */
408void migrate_page_copy(struct page *newpage, struct page *page) 425void migrate_page_copy(struct page *newpage, struct page *page)
409{ 426{
410 if (PageHuge(page)) 427 if (PageHuge(page) || PageTransHuge(page))
411 copy_huge_page(newpage, page); 428 copy_huge_page(newpage, page);
412 else 429 else
413 copy_highpage(newpage, page); 430 copy_highpage(newpage, page);
@@ -486,11 +503,11 @@ int migrate_page(struct address_space *mapping,
486 503
487 rc = migrate_page_move_mapping(mapping, newpage, page, NULL, mode); 504 rc = migrate_page_move_mapping(mapping, newpage, page, NULL, mode);
488 505
489 if (rc) 506 if (rc != MIGRATEPAGE_SUCCESS)
490 return rc; 507 return rc;
491 508
492 migrate_page_copy(newpage, page); 509 migrate_page_copy(newpage, page);
493 return 0; 510 return MIGRATEPAGE_SUCCESS;
494} 511}
495EXPORT_SYMBOL(migrate_page); 512EXPORT_SYMBOL(migrate_page);
496 513
@@ -513,7 +530,7 @@ int buffer_migrate_page(struct address_space *mapping,
513 530
514 rc = migrate_page_move_mapping(mapping, newpage, page, head, mode); 531 rc = migrate_page_move_mapping(mapping, newpage, page, head, mode);
515 532
516 if (rc) 533 if (rc != MIGRATEPAGE_SUCCESS)
517 return rc; 534 return rc;
518 535
519 /* 536 /*
@@ -549,7 +566,7 @@ int buffer_migrate_page(struct address_space *mapping,
549 566
550 } while (bh != head); 567 } while (bh != head);
551 568
552 return 0; 569 return MIGRATEPAGE_SUCCESS;
553} 570}
554EXPORT_SYMBOL(buffer_migrate_page); 571EXPORT_SYMBOL(buffer_migrate_page);
555#endif 572#endif
@@ -628,7 +645,7 @@ static int fallback_migrate_page(struct address_space *mapping,
628 * 645 *
629 * Return value: 646 * Return value:
630 * < 0 - error code 647 * < 0 - error code
631 * == 0 - success 648 * MIGRATEPAGE_SUCCESS - success
632 */ 649 */
633static int move_to_new_page(struct page *newpage, struct page *page, 650static int move_to_new_page(struct page *newpage, struct page *page,
634 int remap_swapcache, enum migrate_mode mode) 651 int remap_swapcache, enum migrate_mode mode)
@@ -665,7 +682,7 @@ static int move_to_new_page(struct page *newpage, struct page *page,
665 else 682 else
666 rc = fallback_migrate_page(mapping, newpage, page, mode); 683 rc = fallback_migrate_page(mapping, newpage, page, mode);
667 684
668 if (rc) { 685 if (rc != MIGRATEPAGE_SUCCESS) {
669 newpage->mapping = NULL; 686 newpage->mapping = NULL;
670 } else { 687 } else {
671 if (remap_swapcache) 688 if (remap_swapcache)
@@ -751,7 +768,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
751 */ 768 */
752 if (PageAnon(page)) { 769 if (PageAnon(page)) {
753 /* 770 /*
754 * Only page_lock_anon_vma() understands the subtleties of 771 * Only page_lock_anon_vma_read() understands the subtleties of
755 * getting a hold on an anon_vma from outside one of its mms. 772 * getting a hold on an anon_vma from outside one of its mms.
756 */ 773 */
757 anon_vma = page_get_anon_vma(page); 774 anon_vma = page_get_anon_vma(page);
@@ -778,6 +795,18 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
778 } 795 }
779 } 796 }
780 797
798 if (unlikely(balloon_page_movable(page))) {
799 /*
800 * A ballooned page does not need any special attention from
801 * physical to virtual reverse mapping procedures.
802 * Skip any attempt to unmap PTEs or to remap swap cache,
803 * in order to avoid burning cycles at rmap level, and perform
 804 * the page migration right away (protected by page lock).
805 */
806 rc = balloon_page_migrate(newpage, page, mode);
807 goto uncharge;
808 }
809
781 /* 810 /*
782 * Corner case handling: 811 * Corner case handling:
783 * 1. When a new swap-cache page is read into, it is added to the LRU 812 * 1. When a new swap-cache page is read into, it is added to the LRU
@@ -814,7 +843,9 @@ skip_unmap:
814 put_anon_vma(anon_vma); 843 put_anon_vma(anon_vma);
815 844
816uncharge: 845uncharge:
817 mem_cgroup_end_migration(mem, page, newpage, rc == 0); 846 mem_cgroup_end_migration(mem, page, newpage,
847 (rc == MIGRATEPAGE_SUCCESS ||
848 rc == MIGRATEPAGE_BALLOON_SUCCESS));
818unlock: 849unlock:
819 unlock_page(page); 850 unlock_page(page);
820out: 851out:
@@ -846,6 +877,18 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
846 goto out; 877 goto out;
847 878
848 rc = __unmap_and_move(page, newpage, force, offlining, mode); 879 rc = __unmap_and_move(page, newpage, force, offlining, mode);
880
881 if (unlikely(rc == MIGRATEPAGE_BALLOON_SUCCESS)) {
882 /*
883 * A ballooned page has been migrated already.
 884 * Now it is time to wrap up counters,
 885 * hand the page back to Buddy and return.
886 */
887 dec_zone_page_state(page, NR_ISOLATED_ANON +
888 page_is_file_cache(page));
889 balloon_page_free(page);
890 return MIGRATEPAGE_SUCCESS;
891 }
849out: 892out:
850 if (rc != -EAGAIN) { 893 if (rc != -EAGAIN) {
851 /* 894 /*
@@ -958,10 +1001,11 @@ out:
958 */ 1001 */
959int migrate_pages(struct list_head *from, 1002int migrate_pages(struct list_head *from,
960 new_page_t get_new_page, unsigned long private, bool offlining, 1003 new_page_t get_new_page, unsigned long private, bool offlining,
961 enum migrate_mode mode) 1004 enum migrate_mode mode, int reason)
962{ 1005{
963 int retry = 1; 1006 int retry = 1;
964 int nr_failed = 0; 1007 int nr_failed = 0;
1008 int nr_succeeded = 0;
965 int pass = 0; 1009 int pass = 0;
966 struct page *page; 1010 struct page *page;
967 struct page *page2; 1011 struct page *page2;
@@ -987,7 +1031,8 @@ int migrate_pages(struct list_head *from,
987 case -EAGAIN: 1031 case -EAGAIN:
988 retry++; 1032 retry++;
989 break; 1033 break;
990 case 0: 1034 case MIGRATEPAGE_SUCCESS:
1035 nr_succeeded++;
991 break; 1036 break;
992 default: 1037 default:
993 /* Permanent failure */ 1038 /* Permanent failure */
@@ -996,15 +1041,18 @@ int migrate_pages(struct list_head *from,
996 } 1041 }
997 } 1042 }
998 } 1043 }
999 rc = 0; 1044 rc = nr_failed + retry;
1000out: 1045out:
1046 if (nr_succeeded)
1047 count_vm_events(PGMIGRATE_SUCCESS, nr_succeeded);
1048 if (nr_failed)
1049 count_vm_events(PGMIGRATE_FAIL, nr_failed);
1050 trace_mm_migrate_pages(nr_succeeded, nr_failed, mode, reason);
1051
1001 if (!swapwrite) 1052 if (!swapwrite)
1002 current->flags &= ~PF_SWAPWRITE; 1053 current->flags &= ~PF_SWAPWRITE;
1003 1054
1004 if (rc) 1055 return rc;
1005 return rc;
1006
1007 return nr_failed + retry;
1008} 1056}
1009 1057
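With the reason argument and the success/fail counters added here, the outcome of
each batch becomes observable from user space: the new events should appear as
pgmigrate_success and pgmigrate_fail in /proc/vmstat, and the
trace/events/migrate.h header pulled in above creates an mm_migrate_pages
tracepoint that records the per-call totals together with the migration mode and
reason.
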
1010int migrate_huge_page(struct page *hpage, new_page_t get_new_page, 1058int migrate_huge_page(struct page *hpage, new_page_t get_new_page,
@@ -1024,7 +1072,7 @@ int migrate_huge_page(struct page *hpage, new_page_t get_new_page,
1024 /* try again */ 1072 /* try again */
1025 cond_resched(); 1073 cond_resched();
1026 break; 1074 break;
1027 case 0: 1075 case MIGRATEPAGE_SUCCESS:
1028 goto out; 1076 goto out;
1029 default: 1077 default:
1030 rc = -EIO; 1078 rc = -EIO;
@@ -1139,7 +1187,8 @@ set_status:
1139 err = 0; 1187 err = 0;
1140 if (!list_empty(&pagelist)) { 1188 if (!list_empty(&pagelist)) {
1141 err = migrate_pages(&pagelist, new_page_node, 1189 err = migrate_pages(&pagelist, new_page_node,
1142 (unsigned long)pm, 0, MIGRATE_SYNC); 1190 (unsigned long)pm, 0, MIGRATE_SYNC,
1191 MR_SYSCALL);
1143 if (err) 1192 if (err)
1144 putback_lru_pages(&pagelist); 1193 putback_lru_pages(&pagelist);
1145 } 1194 }
@@ -1201,7 +1250,7 @@ static int do_pages_move(struct mm_struct *mm, nodemask_t task_nodes,
1201 if (node < 0 || node >= MAX_NUMNODES) 1250 if (node < 0 || node >= MAX_NUMNODES)
1202 goto out_pm; 1251 goto out_pm;
1203 1252
1204 if (!node_state(node, N_HIGH_MEMORY)) 1253 if (!node_state(node, N_MEMORY))
1205 goto out_pm; 1254 goto out_pm;
1206 1255
1207 err = -EACCES; 1256 err = -EACCES;
@@ -1403,4 +1452,317 @@ int migrate_vmas(struct mm_struct *mm, const nodemask_t *to,
1403 } 1452 }
1404 return err; 1453 return err;
1405} 1454}
1406#endif 1455
1456#ifdef CONFIG_NUMA_BALANCING
1457/*
1458 * Returns true if this is a safe migration target node for misplaced NUMA
 1459 * pages. Currently it only checks the watermarks, which is crude.
1460 */
1461static bool migrate_balanced_pgdat(struct pglist_data *pgdat,
1462 int nr_migrate_pages)
1463{
1464 int z;
1465 for (z = pgdat->nr_zones - 1; z >= 0; z--) {
1466 struct zone *zone = pgdat->node_zones + z;
1467
1468 if (!populated_zone(zone))
1469 continue;
1470
1471 if (zone->all_unreclaimable)
1472 continue;
1473
1474 /* Avoid waking kswapd by allocating pages_to_migrate pages. */
1475 if (!zone_watermark_ok(zone, 0,
1476 high_wmark_pages(zone) +
1477 nr_migrate_pages,
1478 0, 0))
1479 continue;
1480 return true;
1481 }
1482 return false;
1483}
1484
1485static struct page *alloc_misplaced_dst_page(struct page *page,
1486 unsigned long data,
1487 int **result)
1488{
1489 int nid = (int) data;
1490 struct page *newpage;
1491
1492 newpage = alloc_pages_exact_node(nid,
1493 (GFP_HIGHUSER_MOVABLE | GFP_THISNODE |
1494 __GFP_NOMEMALLOC | __GFP_NORETRY |
1495 __GFP_NOWARN) &
1496 ~GFP_IOFS, 0);
1497 if (newpage)
1498 page_xchg_last_nid(newpage, page_last_nid(page));
1499
1500 return newpage;
1501}
1502
1503/*
1504 * page migration rate limiting control.
1505 * Do not migrate more than @pages_to_migrate in a @migrate_interval_millisecs
1506 * window of time. Default here says do not migrate more than 1280M per second.
1507 * If a node is rate-limited then PTE NUMA updates are also rate-limited. However
1508 * as it is faults that reset the window, pte updates will happen unconditionally
1509 * if there has not been a fault since @pteupdate_interval_millisecs after the
1510 * throttle window closed.
1511 */
1512static unsigned int migrate_interval_millisecs __read_mostly = 100;
1513static unsigned int pteupdate_interval_millisecs __read_mostly = 1000;
1514static unsigned int ratelimit_pages __read_mostly = 128 << (20 - PAGE_SHIFT);
1515
1516/* Returns true if NUMA migration is currently rate limited */
1517bool migrate_ratelimited(int node)
1518{
1519 pg_data_t *pgdat = NODE_DATA(node);
1520
1521 if (time_after(jiffies, pgdat->numabalancing_migrate_next_window +
1522 msecs_to_jiffies(pteupdate_interval_millisecs)))
1523 return false;
1524
1525 if (pgdat->numabalancing_migrate_nr_pages < ratelimit_pages)
1526 return false;
1527
1528 return true;
1529}
1530
1531/* Returns true if the node is migrate rate-limited after the update */
1532bool numamigrate_update_ratelimit(pg_data_t *pgdat, unsigned long nr_pages)
1533{
1534 bool rate_limited = false;
1535
1536 /*
1537 * Rate-limit the amount of data that is being migrated to a node.
1538 * Optimal placement is no good if the memory bus is saturated and
1539 * all the time is being spent migrating!
1540 */
1541 spin_lock(&pgdat->numabalancing_migrate_lock);
1542 if (time_after(jiffies, pgdat->numabalancing_migrate_next_window)) {
1543 pgdat->numabalancing_migrate_nr_pages = 0;
1544 pgdat->numabalancing_migrate_next_window = jiffies +
1545 msecs_to_jiffies(migrate_interval_millisecs);
1546 }
1547 if (pgdat->numabalancing_migrate_nr_pages > ratelimit_pages)
1548 rate_limited = true;
1549 else
1550 pgdat->numabalancing_migrate_nr_pages += nr_pages;
1551 spin_unlock(&pgdat->numabalancing_migrate_lock);
1552
1553 return rate_limited;
1554}
1555
1556int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page)
1557{
1558 int ret = 0;
1559
1560 /* Avoid migrating to a node that is nearly full */
1561 if (migrate_balanced_pgdat(pgdat, 1)) {
1562 int page_lru;
1563
1564 if (isolate_lru_page(page)) {
1565 put_page(page);
1566 return 0;
1567 }
1568
1569 /* Page is isolated */
1570 ret = 1;
1571 page_lru = page_is_file_cache(page);
1572 if (!PageTransHuge(page))
1573 inc_zone_page_state(page, NR_ISOLATED_ANON + page_lru);
1574 else
1575 mod_zone_page_state(page_zone(page),
1576 NR_ISOLATED_ANON + page_lru,
1577 HPAGE_PMD_NR);
1578 }
1579
1580 /*
1581 * Page is either isolated or there is not enough space on the target
1582 * node. If isolated, then it has taken a reference count and the
1583 * callers reference can be safely dropped without the page
1584 * disappearing underneath us during migration. Otherwise the page is
1585 * not to be migrated but the callers reference should still be
1586 * dropped so it does not leak.
1587 */
1588 put_page(page);
1589
1590 return ret;
1591}
1592
1593/*
1594 * Attempt to migrate a misplaced page to the specified destination
1595 * node. Caller is expected to have an elevated reference count on
1596 * the page that will be dropped by this function before returning.
1597 */
1598int migrate_misplaced_page(struct page *page, int node)
1599{
1600 pg_data_t *pgdat = NODE_DATA(node);
1601 int isolated = 0;
1602 int nr_remaining;
1603 LIST_HEAD(migratepages);
1604
1605 /*
1606 * Don't migrate pages that are mapped in multiple processes.
1607 * TODO: Handle false sharing detection instead of this hammer
1608 */
1609 if (page_mapcount(page) != 1) {
1610 put_page(page);
1611 goto out;
1612 }
1613
1614 /*
1615 * Rate-limit the amount of data that is being migrated to a node.
1616 * Optimal placement is no good if the memory bus is saturated and
1617 * all the time is being spent migrating!
1618 */
1619 if (numamigrate_update_ratelimit(pgdat, 1)) {
1620 put_page(page);
1621 goto out;
1622 }
1623
1624 isolated = numamigrate_isolate_page(pgdat, page);
1625 if (!isolated)
1626 goto out;
1627
1628 list_add(&page->lru, &migratepages);
1629 nr_remaining = migrate_pages(&migratepages,
1630 alloc_misplaced_dst_page,
1631 node, false, MIGRATE_ASYNC,
1632 MR_NUMA_MISPLACED);
1633 if (nr_remaining) {
1634 putback_lru_pages(&migratepages);
1635 isolated = 0;
1636 } else
1637 count_vm_numa_event(NUMA_PAGE_MIGRATE);
1638 BUG_ON(!list_empty(&migratepages));
1639out:
1640 return isolated;
1641}
1642#endif /* CONFIG_NUMA_BALANCING */
1643
1644#if defined(CONFIG_NUMA_BALANCING) && defined(CONFIG_TRANSPARENT_HUGEPAGE)
1645int migrate_misplaced_transhuge_page(struct mm_struct *mm,
1646 struct vm_area_struct *vma,
1647 pmd_t *pmd, pmd_t entry,
1648 unsigned long address,
1649 struct page *page, int node)
1650{
1651 unsigned long haddr = address & HPAGE_PMD_MASK;
1652 pg_data_t *pgdat = NODE_DATA(node);
1653 int isolated = 0;
1654 struct page *new_page = NULL;
1655 struct mem_cgroup *memcg = NULL;
1656 int page_lru = page_is_file_cache(page);
1657
1658 /*
1659 * Don't migrate pages that are mapped in multiple processes.
1660 * TODO: Handle false sharing detection instead of this hammer
1661 */
1662 if (page_mapcount(page) != 1)
1663 goto out_dropref;
1664
1665 /*
1666 * Rate-limit the amount of data that is being migrated to a node.
1667 * Optimal placement is no good if the memory bus is saturated and
1668 * all the time is being spent migrating!
1669 */
1670 if (numamigrate_update_ratelimit(pgdat, HPAGE_PMD_NR))
1671 goto out_dropref;
1672
1673 new_page = alloc_pages_node(node,
1674 (GFP_TRANSHUGE | GFP_THISNODE) & ~__GFP_WAIT, HPAGE_PMD_ORDER);
1675 if (!new_page) {
1676 count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR);
1677 goto out_dropref;
1678 }
1679 page_xchg_last_nid(new_page, page_last_nid(page));
1680
1681 isolated = numamigrate_isolate_page(pgdat, page);
1682 if (!isolated) {
1683 count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR);
1684 put_page(new_page);
1685 goto out_keep_locked;
1686 }
1687
1688 /* Prepare a page as a migration target */
1689 __set_page_locked(new_page);
1690 SetPageSwapBacked(new_page);
1691
1692 /* anon mapping, we can simply copy page->mapping to the new page: */
1693 new_page->mapping = page->mapping;
1694 new_page->index = page->index;
1695 migrate_page_copy(new_page, page);
1696 WARN_ON(PageLRU(new_page));
1697
1698 /* Recheck the target PMD */
1699 spin_lock(&mm->page_table_lock);
1700 if (unlikely(!pmd_same(*pmd, entry))) {
1701 spin_unlock(&mm->page_table_lock);
1702
1703 /* Reverse changes made by migrate_page_copy() */
1704 if (TestClearPageActive(new_page))
1705 SetPageActive(page);
1706 if (TestClearPageUnevictable(new_page))
1707 SetPageUnevictable(page);
1708 mlock_migrate_page(page, new_page);
1709
1710 unlock_page(new_page);
1711 put_page(new_page); /* Free it */
1712
1713 unlock_page(page);
1714 putback_lru_page(page);
1715
1716 count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR);
1717 goto out;
1718 }
1719
1720 /*
1721 * Traditional migration needs to prepare the memcg charge
1722 * transaction early to prevent the old page from being
1723 * uncharged when installing migration entries. Here we can
1724 * save the potential rollback and start the charge transfer
1725 * only when migration is already known to end successfully.
1726 */
1727 mem_cgroup_prepare_migration(page, new_page, &memcg);
1728
1729 entry = mk_pmd(new_page, vma->vm_page_prot);
1730 entry = pmd_mknonnuma(entry);
1731 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
1732 entry = pmd_mkhuge(entry);
1733
1734 page_add_new_anon_rmap(new_page, vma, haddr);
1735
1736 set_pmd_at(mm, haddr, pmd, entry);
1737 update_mmu_cache_pmd(vma, address, &entry);
1738 page_remove_rmap(page);
1739 /*
1740 * Finish the charge transaction under the page table lock to
1741 * prevent split_huge_page() from dividing up the charge
1742 * before it's fully transferred to the new page.
1743 */
1744 mem_cgroup_end_migration(memcg, page, new_page, true);
1745 spin_unlock(&mm->page_table_lock);
1746
1747 unlock_page(new_page);
1748 unlock_page(page);
1749 put_page(page); /* Drop the rmap reference */
1750 put_page(page); /* Drop the LRU isolation reference */
1751
1752 count_vm_events(PGMIGRATE_SUCCESS, HPAGE_PMD_NR);
1753 count_vm_numa_events(NUMA_PAGE_MIGRATE, HPAGE_PMD_NR);
1754
1755out:
1756 mod_zone_page_state(page_zone(page),
1757 NR_ISOLATED_ANON + page_lru,
1758 -HPAGE_PMD_NR);
1759 return isolated;
1760
1761out_dropref:
1762 put_page(page);
1763out_keep_locked:
1764 return 0;
1765}
1766#endif /* CONFIG_NUMA_BALANCING */
1767
1768#endif /* CONFIG_NUMA */
diff --git a/mm/mmap.c b/mm/mmap.c
index 9a796c41e7d9..f54b235f29a9 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -31,6 +31,7 @@
31#include <linux/audit.h> 31#include <linux/audit.h>
32#include <linux/khugepaged.h> 32#include <linux/khugepaged.h>
33#include <linux/uprobes.h> 33#include <linux/uprobes.h>
34#include <linux/rbtree_augmented.h>
34 35
35#include <asm/uaccess.h> 36#include <asm/uaccess.h>
36#include <asm/cacheflush.h> 37#include <asm/cacheflush.h>
@@ -89,6 +90,20 @@ int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT;
89struct percpu_counter vm_committed_as ____cacheline_aligned_in_smp; 90struct percpu_counter vm_committed_as ____cacheline_aligned_in_smp;
90 91
91/* 92/*
93 * The global memory commitment made in the system can be a metric
94 * that can be used to drive ballooning decisions when Linux is hosted
95 * as a guest. On Hyper-V, the host implements a policy engine for dynamically
96 * balancing memory across competing virtual machines that are hosted.
97 * Several metrics drive this policy engine including the guest reported
98 * memory commitment.
99 */
100unsigned long vm_memory_committed(void)
101{
102 return percpu_counter_read_positive(&vm_committed_as);
103}
104EXPORT_SYMBOL_GPL(vm_memory_committed);
105
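The counter read here is the one /proc/meminfo reports as Committed_AS, so this
helper simply exposes to in-kernel users (the Hyper-V balloon driver being the
motivating case, per the comment above) the figure userspace can already see
there.
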
106/*
92 * Check that a process has enough memory to allocate a new virtual 107 * Check that a process has enough memory to allocate a new virtual
93 * mapping. 0 means there is enough memory for the allocation to 108 * mapping. 0 means there is enough memory for the allocation to
94 * succeed and -ENOMEM implies there is not. 109 * succeed and -ENOMEM implies there is not.
@@ -297,40 +312,88 @@ out:
297 return retval; 312 return retval;
298} 313}
299 314
315static long vma_compute_subtree_gap(struct vm_area_struct *vma)
316{
317 unsigned long max, subtree_gap;
318 max = vma->vm_start;
319 if (vma->vm_prev)
320 max -= vma->vm_prev->vm_end;
321 if (vma->vm_rb.rb_left) {
322 subtree_gap = rb_entry(vma->vm_rb.rb_left,
323 struct vm_area_struct, vm_rb)->rb_subtree_gap;
324 if (subtree_gap > max)
325 max = subtree_gap;
326 }
327 if (vma->vm_rb.rb_right) {
328 subtree_gap = rb_entry(vma->vm_rb.rb_right,
329 struct vm_area_struct, vm_rb)->rb_subtree_gap;
330 if (subtree_gap > max)
331 max = subtree_gap;
332 }
333 return max;
334}
335
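The recurrence above (a node's cached value is the largest of its own gap and
whatever its children cached) is what later lets the unmapped-area search skip
entire subtrees whose cached gap is too small. A toy, non-balanced model of the
same bookkeeping, with invented names:

    /* Toy model of rb_subtree_gap: plain binary tree, no rebalancing. */
    #include <stdio.h>

    struct toy_vma {
        unsigned long start, end;   /* mapping covers [start, end) */
        unsigned long prev_end;     /* end of previous vma in address order */
        unsigned long subtree_gap;
        struct toy_vma *left, *right;
    };

    static unsigned long compute_subtree_gap(struct toy_vma *v)
    {
        unsigned long max = v->start - v->prev_end;     /* own gap */

        if (v->left && v->left->subtree_gap > max)
            max = v->left->subtree_gap;
        if (v->right && v->right->subtree_gap > max)
            max = v->right->subtree_gap;
        return max;
    }

    int main(void)
    {
        /* Mappings [4K,8K) [20K,24K) [36K,40K): gaps of 4K, 12K, 12K. */
        struct toy_vma a = { 0x1000, 0x2000, 0x0000, 0, NULL, NULL };
        struct toy_vma c = { 0x9000, 0xa000, 0x6000, 0, NULL, NULL };
        struct toy_vma b = { 0x5000, 0x6000, 0x2000, 0, &a, &c };

        /* Propagate bottom-up, as the augmented-rbtree callbacks would. */
        a.subtree_gap = compute_subtree_gap(&a);
        c.subtree_gap = compute_subtree_gap(&c);
        b.subtree_gap = compute_subtree_gap(&b);

        printf("root subtree_gap = %#lx\n", b.subtree_gap);    /* 0x3000 */
        return 0;
    }
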
300#ifdef CONFIG_DEBUG_VM_RB 336#ifdef CONFIG_DEBUG_VM_RB
301static int browse_rb(struct rb_root *root) 337static int browse_rb(struct rb_root *root)
302{ 338{
303 int i = 0, j; 339 int i = 0, j, bug = 0;
304 struct rb_node *nd, *pn = NULL; 340 struct rb_node *nd, *pn = NULL;
305 unsigned long prev = 0, pend = 0; 341 unsigned long prev = 0, pend = 0;
306 342
307 for (nd = rb_first(root); nd; nd = rb_next(nd)) { 343 for (nd = rb_first(root); nd; nd = rb_next(nd)) {
308 struct vm_area_struct *vma; 344 struct vm_area_struct *vma;
309 vma = rb_entry(nd, struct vm_area_struct, vm_rb); 345 vma = rb_entry(nd, struct vm_area_struct, vm_rb);
310 if (vma->vm_start < prev) 346 if (vma->vm_start < prev) {
311 printk("vm_start %lx prev %lx\n", vma->vm_start, prev), i = -1; 347 printk("vm_start %lx prev %lx\n", vma->vm_start, prev);
312 if (vma->vm_start < pend) 348 bug = 1;
349 }
350 if (vma->vm_start < pend) {
313 printk("vm_start %lx pend %lx\n", vma->vm_start, pend); 351 printk("vm_start %lx pend %lx\n", vma->vm_start, pend);
314 if (vma->vm_start > vma->vm_end) 352 bug = 1;
315 printk("vm_end %lx < vm_start %lx\n", vma->vm_end, vma->vm_start); 353 }
354 if (vma->vm_start > vma->vm_end) {
355 printk("vm_end %lx < vm_start %lx\n",
356 vma->vm_end, vma->vm_start);
357 bug = 1;
358 }
359 if (vma->rb_subtree_gap != vma_compute_subtree_gap(vma)) {
360 printk("free gap %lx, correct %lx\n",
361 vma->rb_subtree_gap,
362 vma_compute_subtree_gap(vma));
363 bug = 1;
364 }
316 i++; 365 i++;
317 pn = nd; 366 pn = nd;
318 prev = vma->vm_start; 367 prev = vma->vm_start;
319 pend = vma->vm_end; 368 pend = vma->vm_end;
320 } 369 }
321 j = 0; 370 j = 0;
322 for (nd = pn; nd; nd = rb_prev(nd)) { 371 for (nd = pn; nd; nd = rb_prev(nd))
323 j++; 372 j++;
373 if (i != j) {
374 printk("backwards %d, forwards %d\n", j, i);
375 bug = 1;
376 }
377 return bug ? -1 : i;
378}
379
380static void validate_mm_rb(struct rb_root *root, struct vm_area_struct *ignore)
381{
382 struct rb_node *nd;
383
384 for (nd = rb_first(root); nd; nd = rb_next(nd)) {
385 struct vm_area_struct *vma;
386 vma = rb_entry(nd, struct vm_area_struct, vm_rb);
387 BUG_ON(vma != ignore &&
388 vma->rb_subtree_gap != vma_compute_subtree_gap(vma));
324 } 389 }
325 if (i != j)
326 printk("backwards %d, forwards %d\n", j, i), i = 0;
327 return i;
328} 390}
329 391
330void validate_mm(struct mm_struct *mm) 392void validate_mm(struct mm_struct *mm)
331{ 393{
332 int bug = 0; 394 int bug = 0;
333 int i = 0; 395 int i = 0;
396 unsigned long highest_address = 0;
334 struct vm_area_struct *vma = mm->mmap; 397 struct vm_area_struct *vma = mm->mmap;
335 while (vma) { 398 while (vma) {
336 struct anon_vma_chain *avc; 399 struct anon_vma_chain *avc;
@@ -338,20 +401,73 @@ void validate_mm(struct mm_struct *mm)
338 list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) 401 list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
339 anon_vma_interval_tree_verify(avc); 402 anon_vma_interval_tree_verify(avc);
340 vma_unlock_anon_vma(vma); 403 vma_unlock_anon_vma(vma);
404 highest_address = vma->vm_end;
341 vma = vma->vm_next; 405 vma = vma->vm_next;
342 i++; 406 i++;
343 } 407 }
344 if (i != mm->map_count) 408 if (i != mm->map_count) {
345 printk("map_count %d vm_next %d\n", mm->map_count, i), bug = 1; 409 printk("map_count %d vm_next %d\n", mm->map_count, i);
410 bug = 1;
411 }
412 if (highest_address != mm->highest_vm_end) {
413 printk("mm->highest_vm_end %lx, found %lx\n",
414 mm->highest_vm_end, highest_address);
415 bug = 1;
416 }
346 i = browse_rb(&mm->mm_rb); 417 i = browse_rb(&mm->mm_rb);
347 if (i != mm->map_count) 418 if (i != mm->map_count) {
348 printk("map_count %d rb %d\n", mm->map_count, i), bug = 1; 419 printk("map_count %d rb %d\n", mm->map_count, i);
420 bug = 1;
421 }
349 BUG_ON(bug); 422 BUG_ON(bug);
350} 423}
351#else 424#else
425#define validate_mm_rb(root, ignore) do { } while (0)
352#define validate_mm(mm) do { } while (0) 426#define validate_mm(mm) do { } while (0)
353#endif 427#endif
354 428
429RB_DECLARE_CALLBACKS(static, vma_gap_callbacks, struct vm_area_struct, vm_rb,
430 unsigned long, rb_subtree_gap, vma_compute_subtree_gap)
431
432/*
433 * Update augmented rbtree rb_subtree_gap values after vma->vm_start or
434 * vma->vm_prev->vm_end values changed, without modifying the vma's position
435 * in the rbtree.
436 */
437static void vma_gap_update(struct vm_area_struct *vma)
438{
439 /*
440 * As it turns out, RB_DECLARE_CALLBACKS() already created a callback
 441 * function that does exactly what we want.
442 */
443 vma_gap_callbacks_propagate(&vma->vm_rb, NULL);
444}
445
446static inline void vma_rb_insert(struct vm_area_struct *vma,
447 struct rb_root *root)
448{
449 /* All rb_subtree_gap values must be consistent prior to insertion */
450 validate_mm_rb(root, NULL);
451
452 rb_insert_augmented(&vma->vm_rb, root, &vma_gap_callbacks);
453}
454
455static void vma_rb_erase(struct vm_area_struct *vma, struct rb_root *root)
456{
457 /*
458 * All rb_subtree_gap values must be consistent prior to erase,
459 * with the possible exception of the vma being erased.
460 */
461 validate_mm_rb(root, vma);
462
463 /*
464 * Note rb_erase_augmented is a fairly large inline function,
465 * so make sure we instantiate it only once with our desired
466 * augmented rbtree callbacks.
467 */
468 rb_erase_augmented(&vma->vm_rb, root, &vma_gap_callbacks);
469}
470
355/* 471/*
356 * vma has some anon_vma assigned, and is already inserted on that 472 * vma has some anon_vma assigned, and is already inserted on that
357 * anon_vma's interval trees. 473 * anon_vma's interval trees.
@@ -421,8 +537,25 @@ static int find_vma_links(struct mm_struct *mm, unsigned long addr,
421void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma, 537void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma,
422 struct rb_node **rb_link, struct rb_node *rb_parent) 538 struct rb_node **rb_link, struct rb_node *rb_parent)
423{ 539{
540 /* Update tracking information for the gap following the new vma. */
541 if (vma->vm_next)
542 vma_gap_update(vma->vm_next);
543 else
544 mm->highest_vm_end = vma->vm_end;
545
546 /*
547 * vma->vm_prev wasn't known when we followed the rbtree to find the
548 * correct insertion point for that vma. As a result, we could not
549 * update the vma vm_rb parents rb_subtree_gap values on the way down.
550 * So, we first insert the vma with a zero rb_subtree_gap value
551 * (to be consistent with what we did on the way down), and then
552 * immediately update the gap to the correct value. Finally we
553 * rebalance the rbtree after all augmented values have been set.
554 */
424 rb_link_node(&vma->vm_rb, rb_parent, rb_link); 555 rb_link_node(&vma->vm_rb, rb_parent, rb_link);
425 rb_insert_color(&vma->vm_rb, &mm->mm_rb); 556 vma->rb_subtree_gap = 0;
557 vma_gap_update(vma);
558 vma_rb_insert(vma, &mm->mm_rb);
426} 559}
427 560
428static void __vma_link_file(struct vm_area_struct *vma) 561static void __vma_link_file(struct vm_area_struct *vma)
@@ -498,12 +631,12 @@ static inline void
498__vma_unlink(struct mm_struct *mm, struct vm_area_struct *vma, 631__vma_unlink(struct mm_struct *mm, struct vm_area_struct *vma,
499 struct vm_area_struct *prev) 632 struct vm_area_struct *prev)
500{ 633{
501 struct vm_area_struct *next = vma->vm_next; 634 struct vm_area_struct *next;
502 635
503 prev->vm_next = next; 636 vma_rb_erase(vma, &mm->mm_rb);
637 prev->vm_next = next = vma->vm_next;
504 if (next) 638 if (next)
505 next->vm_prev = prev; 639 next->vm_prev = prev;
506 rb_erase(&vma->vm_rb, &mm->mm_rb);
507 if (mm->mmap_cache == vma) 640 if (mm->mmap_cache == vma)
508 mm->mmap_cache = prev; 641 mm->mmap_cache = prev;
509} 642}
@@ -525,6 +658,7 @@ int vma_adjust(struct vm_area_struct *vma, unsigned long start,
525 struct rb_root *root = NULL; 658 struct rb_root *root = NULL;
526 struct anon_vma *anon_vma = NULL; 659 struct anon_vma *anon_vma = NULL;
527 struct file *file = vma->vm_file; 660 struct file *file = vma->vm_file;
661 bool start_changed = false, end_changed = false;
528 long adjust_next = 0; 662 long adjust_next = 0;
529 int remove_next = 0; 663 int remove_next = 0;
530 664
@@ -602,7 +736,7 @@ again: remove_next = 1 + (end > next->vm_end);
602 if (anon_vma) { 736 if (anon_vma) {
603 VM_BUG_ON(adjust_next && next->anon_vma && 737 VM_BUG_ON(adjust_next && next->anon_vma &&
604 anon_vma != next->anon_vma); 738 anon_vma != next->anon_vma);
605 anon_vma_lock(anon_vma); 739 anon_vma_lock_write(anon_vma);
606 anon_vma_interval_tree_pre_update_vma(vma); 740 anon_vma_interval_tree_pre_update_vma(vma);
607 if (adjust_next) 741 if (adjust_next)
608 anon_vma_interval_tree_pre_update_vma(next); 742 anon_vma_interval_tree_pre_update_vma(next);
@@ -615,8 +749,14 @@ again: remove_next = 1 + (end > next->vm_end);
615 vma_interval_tree_remove(next, root); 749 vma_interval_tree_remove(next, root);
616 } 750 }
617 751
618 vma->vm_start = start; 752 if (start != vma->vm_start) {
619 vma->vm_end = end; 753 vma->vm_start = start;
754 start_changed = true;
755 }
756 if (end != vma->vm_end) {
757 vma->vm_end = end;
758 end_changed = true;
759 }
620 vma->vm_pgoff = pgoff; 760 vma->vm_pgoff = pgoff;
621 if (adjust_next) { 761 if (adjust_next) {
622 next->vm_start += adjust_next << PAGE_SHIFT; 762 next->vm_start += adjust_next << PAGE_SHIFT;
@@ -645,6 +785,15 @@ again: remove_next = 1 + (end > next->vm_end);
645 * (it may either follow vma or precede it). 785 * (it may either follow vma or precede it).
646 */ 786 */
647 __insert_vm_struct(mm, insert); 787 __insert_vm_struct(mm, insert);
788 } else {
789 if (start_changed)
790 vma_gap_update(vma);
791 if (end_changed) {
792 if (!next)
793 mm->highest_vm_end = end;
794 else if (!adjust_next)
795 vma_gap_update(next);
796 }
648 } 797 }
649 798
650 if (anon_vma) { 799 if (anon_vma) {
@@ -678,10 +827,13 @@ again: remove_next = 1 + (end > next->vm_end);
678 * we must remove another next too. It would clutter 827 * we must remove another next too. It would clutter
679 * up the code too much to do both in one go. 828 * up the code too much to do both in one go.
680 */ 829 */
681 if (remove_next == 2) { 830 next = vma->vm_next;
682 next = vma->vm_next; 831 if (remove_next == 2)
683 goto again; 832 goto again;
684 } 833 else if (next)
834 vma_gap_update(next);
835 else
836 mm->highest_vm_end = end;
685 } 837 }
686 if (insert && file) 838 if (insert && file)
687 uprobe_mmap(insert); 839 uprobe_mmap(insert);
@@ -1153,8 +1305,9 @@ SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
1153 * memory so no accounting is necessary 1305 * memory so no accounting is necessary
1154 */ 1306 */
1155 file = hugetlb_file_setup(HUGETLB_ANON_FILE, addr, len, 1307 file = hugetlb_file_setup(HUGETLB_ANON_FILE, addr, len,
1156 VM_NORESERVE, &user, 1308 VM_NORESERVE,
1157 HUGETLB_ANONHUGE_INODE); 1309 &user, HUGETLB_ANONHUGE_INODE,
1310 (flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK);
1158 if (IS_ERR(file)) 1311 if (IS_ERR(file))
1159 return PTR_ERR(file); 1312 return PTR_ERR(file);
1160 } 1313 }
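
The new (flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK argument lets a caller encode
the desired huge page size directly in the mmap() flags. A user-space sketch of
such a call follows; the fallback MAP_* values mirror the uapi definitions that
accompany this change (MAP_HUGETLB is the x86 value) and should be treated as
assumptions if your headers differ:

    /* Sketch: anonymous mapping explicitly requesting 2MB huge pages. */
    #define _GNU_SOURCE
    #include <stdio.h>
    #include <sys/mman.h>

    #ifndef MAP_HUGETLB
    #define MAP_HUGETLB     0x40000                 /* x86 value */
    #endif
    #ifndef MAP_HUGE_SHIFT
    #define MAP_HUGE_SHIFT  26
    #endif
    #ifndef MAP_HUGE_2MB
    #define MAP_HUGE_2MB    (21 << MAP_HUGE_SHIFT)  /* log2(2MB) == 21 */
    #endif

    int main(void)
    {
        size_t len = 2UL << 20;
        void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB | MAP_HUGE_2MB,
                       -1, 0);

        if (p == MAP_FAILED)
            perror("mmap");     /* typically needs reserved hugetlb pages */
        else
            munmap(p, len);
        return 0;
    }
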
@@ -1335,7 +1488,11 @@ munmap_back:
1335 * 1488 *
1336 * Answer: Yes, several device drivers can do it in their 1489 * Answer: Yes, several device drivers can do it in their
1337 * f_op->mmap method. -DaveM 1490 * f_op->mmap method. -DaveM
1491 * Bug: If addr is changed, prev, rb_link, rb_parent should
1492 * be updated for vma_link()
1338 */ 1493 */
1494 WARN_ON_ONCE(addr != vma->vm_start);
1495
1339 addr = vma->vm_start; 1496 addr = vma->vm_start;
1340 pgoff = vma->vm_pgoff; 1497 pgoff = vma->vm_pgoff;
1341 vm_flags = vma->vm_flags; 1498 vm_flags = vma->vm_flags;
@@ -1400,6 +1557,206 @@ unacct_error:
1400 return error; 1557 return error;
1401} 1558}
1402 1559
1560unsigned long unmapped_area(struct vm_unmapped_area_info *info)
1561{
1562 /*
1563 * We implement the search by looking for an rbtree node that
1564 * immediately follows a suitable gap. That is,
1565 * - gap_start = vma->vm_prev->vm_end <= info->high_limit - length;
1566 * - gap_end = vma->vm_start >= info->low_limit + length;
1567 * - gap_end - gap_start >= length
1568 */
1569
1570 struct mm_struct *mm = current->mm;
1571 struct vm_area_struct *vma;
1572 unsigned long length, low_limit, high_limit, gap_start, gap_end;
1573
1574 /* Adjust search length to account for worst case alignment overhead */
1575 length = info->length + info->align_mask;
1576 if (length < info->length)
1577 return -ENOMEM;
1578
1579 /* Adjust search limits by the desired length */
1580 if (info->high_limit < length)
1581 return -ENOMEM;
1582 high_limit = info->high_limit - length;
1583
1584 if (info->low_limit > high_limit)
1585 return -ENOMEM;
1586 low_limit = info->low_limit + length;
1587
1588 /* Check if rbtree root looks promising */
1589 if (RB_EMPTY_ROOT(&mm->mm_rb))
1590 goto check_highest;
1591 vma = rb_entry(mm->mm_rb.rb_node, struct vm_area_struct, vm_rb);
1592 if (vma->rb_subtree_gap < length)
1593 goto check_highest;
1594
1595 while (true) {
1596 /* Visit left subtree if it looks promising */
1597 gap_end = vma->vm_start;
1598 if (gap_end >= low_limit && vma->vm_rb.rb_left) {
1599 struct vm_area_struct *left =
1600 rb_entry(vma->vm_rb.rb_left,
1601 struct vm_area_struct, vm_rb);
1602 if (left->rb_subtree_gap >= length) {
1603 vma = left;
1604 continue;
1605 }
1606 }
1607
1608 gap_start = vma->vm_prev ? vma->vm_prev->vm_end : 0;
1609check_current:
1610 /* Check if current node has a suitable gap */
1611 if (gap_start > high_limit)
1612 return -ENOMEM;
1613 if (gap_end >= low_limit && gap_end - gap_start >= length)
1614 goto found;
1615
1616 /* Visit right subtree if it looks promising */
1617 if (vma->vm_rb.rb_right) {
1618 struct vm_area_struct *right =
1619 rb_entry(vma->vm_rb.rb_right,
1620 struct vm_area_struct, vm_rb);
1621 if (right->rb_subtree_gap >= length) {
1622 vma = right;
1623 continue;
1624 }
1625 }
1626
1627 /* Go back up the rbtree to find next candidate node */
1628 while (true) {
1629 struct rb_node *prev = &vma->vm_rb;
1630 if (!rb_parent(prev))
1631 goto check_highest;
1632 vma = rb_entry(rb_parent(prev),
1633 struct vm_area_struct, vm_rb);
1634 if (prev == vma->vm_rb.rb_left) {
1635 gap_start = vma->vm_prev->vm_end;
1636 gap_end = vma->vm_start;
1637 goto check_current;
1638 }
1639 }
1640 }
1641
1642check_highest:
1643 /* Check highest gap, which does not precede any rbtree node */
1644 gap_start = mm->highest_vm_end;
1645 gap_end = ULONG_MAX; /* Only for VM_BUG_ON below */
1646 if (gap_start > high_limit)
1647 return -ENOMEM;
1648
1649found:
1650 /* We found a suitable gap. Clip it with the original low_limit. */
1651 if (gap_start < info->low_limit)
1652 gap_start = info->low_limit;
1653
1654 /* Adjust gap address to the desired alignment */
1655 gap_start += (info->align_offset - gap_start) & info->align_mask;
1656
1657 VM_BUG_ON(gap_start + info->length > info->high_limit);
1658 VM_BUG_ON(gap_start + info->length > gap_end);
1659 return gap_start;
1660}
1661
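The closing alignment step above, gap_start += (info->align_offset - gap_start) &
info->align_mask, rounds the gap start up to the next address congruent to
align_offset modulo the requested alignment; the top-down variant below rounds
the gap end down the same way. A standalone check of just that arithmetic:

    /* Verify the bottom-up and top-down gap alignment formulas. */
    #include <assert.h>
    #include <stdio.h>

    int main(void)
    {
        unsigned long align_mask = (1UL << 16) - 1;     /* 64K alignment */
        unsigned long align_offset = 0x1000;            /* desired offset */
        unsigned long gap_start = 0x123456;
        unsigned long gap_end = 0x7ff89abcUL;

        gap_start += (align_offset - gap_start) & align_mask;  /* round up */
        gap_end -= (gap_end - align_offset) & align_mask;      /* round down */

        printf("aligned start %#lx, aligned end %#lx\n", gap_start, gap_end);

        /* Both results are now congruent to align_offset modulo 64K. */
        assert((gap_start & align_mask) == align_offset);
        assert((gap_end & align_mask) == align_offset);
        return 0;
    }
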
1662unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info)
1663{
1664 struct mm_struct *mm = current->mm;
1665 struct vm_area_struct *vma;
1666 unsigned long length, low_limit, high_limit, gap_start, gap_end;
1667
1668 /* Adjust search length to account for worst case alignment overhead */
1669 length = info->length + info->align_mask;
1670 if (length < info->length)
1671 return -ENOMEM;
1672
1673 /*
1674 * Adjust search limits by the desired length.
1675 * See implementation comment at top of unmapped_area().
1676 */
1677 gap_end = info->high_limit;
1678 if (gap_end < length)
1679 return -ENOMEM;
1680 high_limit = gap_end - length;
1681
1682 if (info->low_limit > high_limit)
1683 return -ENOMEM;
1684 low_limit = info->low_limit + length;
1685
1686 /* Check highest gap, which does not precede any rbtree node */
1687 gap_start = mm->highest_vm_end;
1688 if (gap_start <= high_limit)
1689 goto found_highest;
1690
1691 /* Check if rbtree root looks promising */
1692 if (RB_EMPTY_ROOT(&mm->mm_rb))
1693 return -ENOMEM;
1694 vma = rb_entry(mm->mm_rb.rb_node, struct vm_area_struct, vm_rb);
1695 if (vma->rb_subtree_gap < length)
1696 return -ENOMEM;
1697
1698 while (true) {
1699 /* Visit right subtree if it looks promising */
1700 gap_start = vma->vm_prev ? vma->vm_prev->vm_end : 0;
1701 if (gap_start <= high_limit && vma->vm_rb.rb_right) {
1702 struct vm_area_struct *right =
1703 rb_entry(vma->vm_rb.rb_right,
1704 struct vm_area_struct, vm_rb);
1705 if (right->rb_subtree_gap >= length) {
1706 vma = right;
1707 continue;
1708 }
1709 }
1710
1711check_current:
1712 /* Check if current node has a suitable gap */
1713 gap_end = vma->vm_start;
1714 if (gap_end < low_limit)
1715 return -ENOMEM;
1716 if (gap_start <= high_limit && gap_end - gap_start >= length)
1717 goto found;
1718
1719 /* Visit left subtree if it looks promising */
1720 if (vma->vm_rb.rb_left) {
1721 struct vm_area_struct *left =
1722 rb_entry(vma->vm_rb.rb_left,
1723 struct vm_area_struct, vm_rb);
1724 if (left->rb_subtree_gap >= length) {
1725 vma = left;
1726 continue;
1727 }
1728 }
1729
1730 /* Go back up the rbtree to find next candidate node */
1731 while (true) {
1732 struct rb_node *prev = &vma->vm_rb;
1733 if (!rb_parent(prev))
1734 return -ENOMEM;
1735 vma = rb_entry(rb_parent(prev),
1736 struct vm_area_struct, vm_rb);
1737 if (prev == vma->vm_rb.rb_right) {
1738 gap_start = vma->vm_prev ?
1739 vma->vm_prev->vm_end : 0;
1740 goto check_current;
1741 }
1742 }
1743 }
1744
1745found:
1746 /* We found a suitable gap. Clip it with the original high_limit. */
1747 if (gap_end > info->high_limit)
1748 gap_end = info->high_limit;
1749
1750found_highest:
1751 /* Compute highest gap address at the desired alignment */
1752 gap_end -= info->length;
1753 gap_end -= (gap_end - info->align_offset) & info->align_mask;
1754
1755 VM_BUG_ON(gap_end < info->low_limit);
1756 VM_BUG_ON(gap_end < gap_start);
1757 return gap_end;
1758}
1759
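Both walkers above prune whole subtrees through rb_subtree_gap. As a reading aid, this is a sketch of the invariant that field is assumed to maintain (the matching helper is introduced earlier in this series, outside this hunk; the name below is illustrative only): each node caches the largest gap found at that vma or anywhere below it in the rbtree.

	static unsigned long subtree_gap_sketch(struct vm_area_struct *vma)
	{
		unsigned long max, gap;

		/* gap between this vma and its predecessor in address order */
		max = vma->vm_start - (vma->vm_prev ? vma->vm_prev->vm_end : 0);
		if (vma->vm_rb.rb_left) {
			gap = rb_entry(vma->vm_rb.rb_left,
				       struct vm_area_struct, vm_rb)->rb_subtree_gap;
			if (gap > max)
				max = gap;
		}
		if (vma->vm_rb.rb_right) {
			gap = rb_entry(vma->vm_rb.rb_right,
				       struct vm_area_struct, vm_rb)->rb_subtree_gap;
			if (gap > max)
				max = gap;
		}
		return max;
	}

With that invariant, a subtree whose cached gap is smaller than the requested length can be skipped without visiting any of its nodes, which is exactly what the "looks promising" checks above do.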
1403/* Get an address range which is currently unmapped. 1760/* Get an address range which is currently unmapped.
1404 * For shmat() with addr=0. 1761 * For shmat() with addr=0.
1405 * 1762 *
@@ -1418,7 +1775,7 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
1418{ 1775{
1419 struct mm_struct *mm = current->mm; 1776 struct mm_struct *mm = current->mm;
1420 struct vm_area_struct *vma; 1777 struct vm_area_struct *vma;
1421 unsigned long start_addr; 1778 struct vm_unmapped_area_info info;
1422 1779
1423 if (len > TASK_SIZE) 1780 if (len > TASK_SIZE)
1424 return -ENOMEM; 1781 return -ENOMEM;
@@ -1433,40 +1790,13 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
1433 (!vma || addr + len <= vma->vm_start)) 1790 (!vma || addr + len <= vma->vm_start))
1434 return addr; 1791 return addr;
1435 } 1792 }
1436 if (len > mm->cached_hole_size) {
1437 start_addr = addr = mm->free_area_cache;
1438 } else {
1439 start_addr = addr = TASK_UNMAPPED_BASE;
1440 mm->cached_hole_size = 0;
1441 }
1442 1793
1443full_search: 1794 info.flags = 0;
1444 for (vma = find_vma(mm, addr); ; vma = vma->vm_next) { 1795 info.length = len;
1445 /* At this point: (!vma || addr < vma->vm_end). */ 1796 info.low_limit = TASK_UNMAPPED_BASE;
1446 if (TASK_SIZE - len < addr) { 1797 info.high_limit = TASK_SIZE;
1447 /* 1798 info.align_mask = 0;
1448 * Start a new search - just in case we missed 1799 return vm_unmapped_area(&info);
1449 * some holes.
1450 */
1451 if (start_addr != TASK_UNMAPPED_BASE) {
1452 addr = TASK_UNMAPPED_BASE;
1453 start_addr = addr;
1454 mm->cached_hole_size = 0;
1455 goto full_search;
1456 }
1457 return -ENOMEM;
1458 }
1459 if (!vma || addr + len <= vma->vm_start) {
1460 /*
1461 * Remember the place where we stopped the search:
1462 */
1463 mm->free_area_cache = addr + len;
1464 return addr;
1465 }
1466 if (addr + mm->cached_hole_size < vma->vm_start)
1467 mm->cached_hole_size = vma->vm_start - addr;
1468 addr = vma->vm_end;
1469 }
1470} 1800}
1471#endif 1801#endif
1472 1802
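The converted callers above and below only fill in a struct vm_unmapped_area_info; the dispatch between the two walkers is expected to live in a small helper outside this file (include/linux/mm.h in this series), presumably along these lines — treat the exact name and placement as an assumption:

	static inline unsigned long
	vm_unmapped_area_sketch(struct vm_unmapped_area_info *info)
	{
		/* pick the bottom-up or top-down walker based on the flags */
		if (info->flags & VM_UNMAPPED_AREA_TOPDOWN)
			return unmapped_area_topdown(info);
		return unmapped_area(info);
	}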
@@ -1491,7 +1821,8 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
1491{ 1821{
1492 struct vm_area_struct *vma; 1822 struct vm_area_struct *vma;
1493 struct mm_struct *mm = current->mm; 1823 struct mm_struct *mm = current->mm;
1494 unsigned long addr = addr0, start_addr; 1824 unsigned long addr = addr0;
1825 struct vm_unmapped_area_info info;
1495 1826
1496 /* requested length too big for entire address space */ 1827 /* requested length too big for entire address space */
1497 if (len > TASK_SIZE) 1828 if (len > TASK_SIZE)
@@ -1509,53 +1840,12 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
1509 return addr; 1840 return addr;
1510 } 1841 }
1511 1842
1512 /* check if free_area_cache is useful for us */ 1843 info.flags = VM_UNMAPPED_AREA_TOPDOWN;
1513 if (len <= mm->cached_hole_size) { 1844 info.length = len;
1514 mm->cached_hole_size = 0; 1845 info.low_limit = PAGE_SIZE;
1515 mm->free_area_cache = mm->mmap_base; 1846 info.high_limit = mm->mmap_base;
1516 } 1847 info.align_mask = 0;
1517 1848 addr = vm_unmapped_area(&info);
1518try_again:
1519 /* either no address requested or can't fit in requested address hole */
1520 start_addr = addr = mm->free_area_cache;
1521
1522 if (addr < len)
1523 goto fail;
1524
1525 addr -= len;
1526 do {
1527 /*
1528 * Lookup failure means no vma is above this address,
1529 * else if new region fits below vma->vm_start,
1530 * return with success:
1531 */
1532 vma = find_vma(mm, addr);
1533 if (!vma || addr+len <= vma->vm_start)
1534 /* remember the address as a hint for next time */
1535 return (mm->free_area_cache = addr);
1536
1537 /* remember the largest hole we saw so far */
1538 if (addr + mm->cached_hole_size < vma->vm_start)
1539 mm->cached_hole_size = vma->vm_start - addr;
1540
1541 /* try just below the current vma->vm_start */
1542 addr = vma->vm_start-len;
1543 } while (len < vma->vm_start);
1544
1545fail:
1546 /*
1547 * if hint left us with no space for the requested
1548 * mapping then try again:
1549 *
1550 * Note: this is different with the case of bottomup
1551 * which does the fully line-search, but we use find_vma
1552 * here that causes some holes skipped.
1553 */
1554 if (start_addr != mm->mmap_base) {
1555 mm->free_area_cache = mm->mmap_base;
1556 mm->cached_hole_size = 0;
1557 goto try_again;
1558 }
1559 1849
1560 /* 1850 /*
1561 * A failed mmap() very likely causes application failure, 1851 * A failed mmap() very likely causes application failure,
@@ -1563,14 +1853,13 @@ fail:
1563 * can happen with large stack limits and large mmap() 1853 * can happen with large stack limits and large mmap()
1564 * allocations. 1854 * allocations.
1565 */ 1855 */
1566 mm->cached_hole_size = ~0UL; 1856 if (addr & ~PAGE_MASK) {
1567 mm->free_area_cache = TASK_UNMAPPED_BASE; 1857 VM_BUG_ON(addr != -ENOMEM);
1568 addr = arch_get_unmapped_area(filp, addr0, len, pgoff, flags); 1858 info.flags = 0;
1569 /* 1859 info.low_limit = TASK_UNMAPPED_BASE;
1570 * Restore the topdown base: 1860 info.high_limit = TASK_SIZE;
1571 */ 1861 addr = vm_unmapped_area(&info);
1572 mm->free_area_cache = mm->mmap_base; 1862 }
1573 mm->cached_hole_size = ~0UL;
1574 1863
1575 return addr; 1864 return addr;
1576} 1865}
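Neither caller in this file uses the alignment fields, so as a purely hypothetical example, an architecture that wants SHMLBA cache colouring for shared mappings could fill the same structure like this (helper name and constraint are made up for illustration; only the struct fields come from the code above):

	static unsigned long arch_align_example(struct file *filp, unsigned long len,
						unsigned long pgoff)
	{
		struct vm_unmapped_area_info info;
		unsigned long addr;

		info.flags = 0;
		info.length = len;
		info.low_limit = TASK_UNMAPPED_BASE;
		info.high_limit = TASK_SIZE;
		/* keep the mapping colour-aligned with the file offset */
		info.align_mask = PAGE_MASK & (SHMLBA - 1);
		info.align_offset = pgoff << PAGE_SHIFT;
		addr = vm_unmapped_area(&info);

		/* a page-unaligned result is an errno, -ENOMEM in practice */
		if (addr & ~PAGE_MASK)
			return -ENOMEM;
		return addr;
	}

The same "addr & ~PAGE_MASK" test is what the topdown fallback path below relies on when it retries bottom-up.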
@@ -1780,9 +2069,27 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
1780 if (vma->vm_pgoff + (size >> PAGE_SHIFT) >= vma->vm_pgoff) { 2069 if (vma->vm_pgoff + (size >> PAGE_SHIFT) >= vma->vm_pgoff) {
1781 error = acct_stack_growth(vma, size, grow); 2070 error = acct_stack_growth(vma, size, grow);
1782 if (!error) { 2071 if (!error) {
2072 /*
2073 * vma_gap_update() doesn't support concurrent
2074 * updates, but we only hold a shared mmap_sem
2075 * lock here, so we need to protect against
2076 * concurrent vma expansions.
2077 * vma_lock_anon_vma() doesn't help here, as
2078 * we don't guarantee that all growable vmas
2079 * in a mm share the same root anon vma.
2080 * So, we reuse mm->page_table_lock to guard
2081 * against concurrent vma expansions.
2082 */
2083 spin_lock(&vma->vm_mm->page_table_lock);
1783 anon_vma_interval_tree_pre_update_vma(vma); 2084 anon_vma_interval_tree_pre_update_vma(vma);
1784 vma->vm_end = address; 2085 vma->vm_end = address;
1785 anon_vma_interval_tree_post_update_vma(vma); 2086 anon_vma_interval_tree_post_update_vma(vma);
2087 if (vma->vm_next)
2088 vma_gap_update(vma->vm_next);
2089 else
2090 vma->vm_mm->highest_vm_end = address;
2091 spin_unlock(&vma->vm_mm->page_table_lock);
2092
1786 perf_event_mmap(vma); 2093 perf_event_mmap(vma);
1787 } 2094 }
1788 } 2095 }
@@ -1833,10 +2140,25 @@ int expand_downwards(struct vm_area_struct *vma,
1833 if (grow <= vma->vm_pgoff) { 2140 if (grow <= vma->vm_pgoff) {
1834 error = acct_stack_growth(vma, size, grow); 2141 error = acct_stack_growth(vma, size, grow);
1835 if (!error) { 2142 if (!error) {
2143 /*
2144 * vma_gap_update() doesn't support concurrent
2145 * updates, but we only hold a shared mmap_sem
2146 * lock here, so we need to protect against
2147 * concurrent vma expansions.
2148 * vma_lock_anon_vma() doesn't help here, as
2149 * we don't guarantee that all growable vmas
2150 * in a mm share the same root anon vma.
2151 * So, we reuse mm->page_table_lock to guard
2152 * against concurrent vma expansions.
2153 */
2154 spin_lock(&vma->vm_mm->page_table_lock);
1836 anon_vma_interval_tree_pre_update_vma(vma); 2155 anon_vma_interval_tree_pre_update_vma(vma);
1837 vma->vm_start = address; 2156 vma->vm_start = address;
1838 vma->vm_pgoff -= grow; 2157 vma->vm_pgoff -= grow;
1839 anon_vma_interval_tree_post_update_vma(vma); 2158 anon_vma_interval_tree_post_update_vma(vma);
2159 vma_gap_update(vma);
2160 spin_unlock(&vma->vm_mm->page_table_lock);
2161
1840 perf_event_mmap(vma); 2162 perf_event_mmap(vma);
1841 } 2163 }
1842 } 2164 }
@@ -1959,14 +2281,17 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma,
1959 insertion_point = (prev ? &prev->vm_next : &mm->mmap); 2281 insertion_point = (prev ? &prev->vm_next : &mm->mmap);
1960 vma->vm_prev = NULL; 2282 vma->vm_prev = NULL;
1961 do { 2283 do {
1962 rb_erase(&vma->vm_rb, &mm->mm_rb); 2284 vma_rb_erase(vma, &mm->mm_rb);
1963 mm->map_count--; 2285 mm->map_count--;
1964 tail_vma = vma; 2286 tail_vma = vma;
1965 vma = vma->vm_next; 2287 vma = vma->vm_next;
1966 } while (vma && vma->vm_start < end); 2288 } while (vma && vma->vm_start < end);
1967 *insertion_point = vma; 2289 *insertion_point = vma;
1968 if (vma) 2290 if (vma) {
1969 vma->vm_prev = prev; 2291 vma->vm_prev = prev;
2292 vma_gap_update(vma);
2293 } else
2294 mm->highest_vm_end = prev ? prev->vm_end : 0;
1970 tail_vma->vm_next = NULL; 2295 tail_vma->vm_next = NULL;
1971 if (mm->unmap_area == arch_unmap_area) 2296 if (mm->unmap_area == arch_unmap_area)
1972 addr = prev ? prev->vm_end : mm->mmap_base; 2297 addr = prev ? prev->vm_end : mm->mmap_base;
@@ -2561,15 +2886,15 @@ static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma)
2561 * The LSB of head.next can't change from under us 2886 * The LSB of head.next can't change from under us
2562 * because we hold the mm_all_locks_mutex. 2887 * because we hold the mm_all_locks_mutex.
2563 */ 2888 */
2564 mutex_lock_nest_lock(&anon_vma->root->mutex, &mm->mmap_sem); 2889 down_write(&anon_vma->root->rwsem);
2565 /* 2890 /*
2566 * We can safely modify head.next after taking the 2891 * We can safely modify head.next after taking the
2567 * anon_vma->root->mutex. If some other vma in this mm shares 2892 * anon_vma->root->rwsem. If some other vma in this mm shares
2568 * the same anon_vma we won't take it again. 2893 * the same anon_vma we won't take it again.
2569 * 2894 *
2570 * No need of atomic instructions here, head.next 2895 * No need of atomic instructions here, head.next
2571 * can't change from under us thanks to the 2896 * can't change from under us thanks to the
2572 * anon_vma->root->mutex. 2897 * anon_vma->root->rwsem.
2573 */ 2898 */
2574 if (__test_and_set_bit(0, (unsigned long *) 2899 if (__test_and_set_bit(0, (unsigned long *)
2575 &anon_vma->root->rb_root.rb_node)) 2900 &anon_vma->root->rb_root.rb_node))
@@ -2671,7 +2996,7 @@ static void vm_unlock_anon_vma(struct anon_vma *anon_vma)
2671 * 2996 *
2672 * No need of atomic instructions here, head.next 2997 * No need of atomic instructions here, head.next
2673 * can't change from under us until we release the 2998 * can't change from under us until we release the
2674 * anon_vma->root->mutex. 2999 * anon_vma->root->rwsem.
2675 */ 3000 */
2676 if (!__test_and_clear_bit(0, (unsigned long *) 3001 if (!__test_and_clear_bit(0, (unsigned long *)
2677 &anon_vma->root->rb_root.rb_node)) 3002 &anon_vma->root->rb_root.rb_node))
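The mutex-to-rwsem comment updates above (and the anon_vma_lock_write() call in the mremap.c hunk below) follow from the anon_vma root lock becoming an rw_semaphore elsewhere in this series; the write-side wrappers are assumed to reduce to roughly this (names suffixed _sketch because the real wrappers live in the rmap header, not here):

	static inline void anon_vma_lock_write_sketch(struct anon_vma *anon_vma)
	{
		/* exclusive writer lock on the shared root, as taken open-coded above */
		down_write(&anon_vma->root->rwsem);
	}

	static inline void anon_vma_unlock_write_sketch(struct anon_vma *anon_vma)
	{
		up_write(&anon_vma->root->rwsem);
	}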
diff --git a/mm/mprotect.c b/mm/mprotect.c
index a40992610ab6..94722a4d6b43 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -35,12 +35,16 @@ static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot)
35} 35}
36#endif 36#endif
37 37
38static void change_pte_range(struct mm_struct *mm, pmd_t *pmd, 38static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
39 unsigned long addr, unsigned long end, pgprot_t newprot, 39 unsigned long addr, unsigned long end, pgprot_t newprot,
40 int dirty_accountable) 40 int dirty_accountable, int prot_numa, bool *ret_all_same_node)
41{ 41{
42 struct mm_struct *mm = vma->vm_mm;
42 pte_t *pte, oldpte; 43 pte_t *pte, oldpte;
43 spinlock_t *ptl; 44 spinlock_t *ptl;
45 unsigned long pages = 0;
46 bool all_same_node = true;
47 int last_nid = -1;
44 48
45 pte = pte_offset_map_lock(mm, pmd, addr, &ptl); 49 pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
46 arch_enter_lazy_mmu_mode(); 50 arch_enter_lazy_mmu_mode();
@@ -48,17 +52,43 @@ static void change_pte_range(struct mm_struct *mm, pmd_t *pmd,
48 oldpte = *pte; 52 oldpte = *pte;
49 if (pte_present(oldpte)) { 53 if (pte_present(oldpte)) {
50 pte_t ptent; 54 pte_t ptent;
55 bool updated = false;
51 56
52 ptent = ptep_modify_prot_start(mm, addr, pte); 57 ptent = ptep_modify_prot_start(mm, addr, pte);
53 ptent = pte_modify(ptent, newprot); 58 if (!prot_numa) {
59 ptent = pte_modify(ptent, newprot);
60 updated = true;
61 } else {
62 struct page *page;
63
64 page = vm_normal_page(vma, addr, oldpte);
65 if (page) {
66 int this_nid = page_to_nid(page);
67 if (last_nid == -1)
68 last_nid = this_nid;
69 if (last_nid != this_nid)
70 all_same_node = false;
71
72 /* only check non-shared pages */
73 if (!pte_numa(oldpte) &&
74 page_mapcount(page) == 1) {
75 ptent = pte_mknuma(ptent);
76 updated = true;
77 }
78 }
79 }
54 80
55 /* 81 /*
56 * Avoid taking write faults for pages we know to be 82 * Avoid taking write faults for pages we know to be
57 * dirty. 83 * dirty.
58 */ 84 */
59 if (dirty_accountable && pte_dirty(ptent)) 85 if (dirty_accountable && pte_dirty(ptent)) {
60 ptent = pte_mkwrite(ptent); 86 ptent = pte_mkwrite(ptent);
87 updated = true;
88 }
61 89
90 if (updated)
91 pages++;
62 ptep_modify_prot_commit(mm, addr, pte, ptent); 92 ptep_modify_prot_commit(mm, addr, pte, ptent);
63 } else if (IS_ENABLED(CONFIG_MIGRATION) && !pte_file(oldpte)) { 93 } else if (IS_ENABLED(CONFIG_MIGRATION) && !pte_file(oldpte)) {
64 swp_entry_t entry = pte_to_swp_entry(oldpte); 94 swp_entry_t entry = pte_to_swp_entry(oldpte);
@@ -72,61 +102,101 @@ static void change_pte_range(struct mm_struct *mm, pmd_t *pmd,
72 set_pte_at(mm, addr, pte, 102 set_pte_at(mm, addr, pte,
73 swp_entry_to_pte(entry)); 103 swp_entry_to_pte(entry));
74 } 104 }
105 pages++;
75 } 106 }
76 } while (pte++, addr += PAGE_SIZE, addr != end); 107 } while (pte++, addr += PAGE_SIZE, addr != end);
77 arch_leave_lazy_mmu_mode(); 108 arch_leave_lazy_mmu_mode();
78 pte_unmap_unlock(pte - 1, ptl); 109 pte_unmap_unlock(pte - 1, ptl);
110
111 *ret_all_same_node = all_same_node;
112 return pages;
79} 113}
80 114
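The prot_numa branch above boils down to a single predicate per present pte. Distilled into a stand-alone helper (a sketch for readability, not a helper this patch adds), the decision is:

	static bool pte_wants_numa_hint(struct vm_area_struct *vma,
					unsigned long addr, pte_t oldpte)
	{
		struct page *page = vm_normal_page(vma, addr, oldpte);

		/* only normal pages, not yet marked, and mapped by a single process */
		return page && !pte_numa(oldpte) && page_mapcount(page) == 1;
	}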
81static inline void change_pmd_range(struct vm_area_struct *vma, pud_t *pud, 115#ifdef CONFIG_NUMA_BALANCING
82 unsigned long addr, unsigned long end, pgprot_t newprot, 116static inline void change_pmd_protnuma(struct mm_struct *mm, unsigned long addr,
83 int dirty_accountable) 117 pmd_t *pmd)
118{
119 spin_lock(&mm->page_table_lock);
120 set_pmd_at(mm, addr & PMD_MASK, pmd, pmd_mknuma(*pmd));
121 spin_unlock(&mm->page_table_lock);
122}
123#else
124static inline void change_pmd_protnuma(struct mm_struct *mm, unsigned long addr,
125 pmd_t *pmd)
126{
127 BUG();
128}
129#endif /* CONFIG_NUMA_BALANCING */
130
131static inline unsigned long change_pmd_range(struct vm_area_struct *vma,
132 pud_t *pud, unsigned long addr, unsigned long end,
133 pgprot_t newprot, int dirty_accountable, int prot_numa)
84{ 134{
85 pmd_t *pmd; 135 pmd_t *pmd;
86 unsigned long next; 136 unsigned long next;
137 unsigned long pages = 0;
138 bool all_same_node;
87 139
88 pmd = pmd_offset(pud, addr); 140 pmd = pmd_offset(pud, addr);
89 do { 141 do {
90 next = pmd_addr_end(addr, end); 142 next = pmd_addr_end(addr, end);
91 if (pmd_trans_huge(*pmd)) { 143 if (pmd_trans_huge(*pmd)) {
92 if (next - addr != HPAGE_PMD_SIZE) 144 if (next - addr != HPAGE_PMD_SIZE)
93 split_huge_page_pmd(vma->vm_mm, pmd); 145 split_huge_page_pmd(vma, addr, pmd);
94 else if (change_huge_pmd(vma, pmd, addr, newprot)) 146 else if (change_huge_pmd(vma, pmd, addr, newprot,
147 prot_numa)) {
148 pages += HPAGE_PMD_NR;
95 continue; 149 continue;
150 }
96 /* fall through */ 151 /* fall through */
97 } 152 }
98 if (pmd_none_or_clear_bad(pmd)) 153 if (pmd_none_or_clear_bad(pmd))
99 continue; 154 continue;
100 change_pte_range(vma->vm_mm, pmd, addr, next, newprot, 155 pages += change_pte_range(vma, pmd, addr, next, newprot,
101 dirty_accountable); 156 dirty_accountable, prot_numa, &all_same_node);
157
158 /*
159 * If we are changing protections for NUMA hinting faults then
160 * set pmd_numa if the examined pages were all on the same
161 * node. This allows a regular PMD to be handled as one fault
162 * and effectively batches the taking of the PTL
163 */
164 if (prot_numa && all_same_node)
165 change_pmd_protnuma(vma->vm_mm, addr, pmd);
102 } while (pmd++, addr = next, addr != end); 166 } while (pmd++, addr = next, addr != end);
167
168 return pages;
103} 169}
104 170
105static inline void change_pud_range(struct vm_area_struct *vma, pgd_t *pgd, 171static inline unsigned long change_pud_range(struct vm_area_struct *vma,
106 unsigned long addr, unsigned long end, pgprot_t newprot, 172 pgd_t *pgd, unsigned long addr, unsigned long end,
107 int dirty_accountable) 173 pgprot_t newprot, int dirty_accountable, int prot_numa)
108{ 174{
109 pud_t *pud; 175 pud_t *pud;
110 unsigned long next; 176 unsigned long next;
177 unsigned long pages = 0;
111 178
112 pud = pud_offset(pgd, addr); 179 pud = pud_offset(pgd, addr);
113 do { 180 do {
114 next = pud_addr_end(addr, end); 181 next = pud_addr_end(addr, end);
115 if (pud_none_or_clear_bad(pud)) 182 if (pud_none_or_clear_bad(pud))
116 continue; 183 continue;
117 change_pmd_range(vma, pud, addr, next, newprot, 184 pages += change_pmd_range(vma, pud, addr, next, newprot,
118 dirty_accountable); 185 dirty_accountable, prot_numa);
119 } while (pud++, addr = next, addr != end); 186 } while (pud++, addr = next, addr != end);
187
188 return pages;
120} 189}
121 190
122static void change_protection(struct vm_area_struct *vma, 191static unsigned long change_protection_range(struct vm_area_struct *vma,
123 unsigned long addr, unsigned long end, pgprot_t newprot, 192 unsigned long addr, unsigned long end, pgprot_t newprot,
124 int dirty_accountable) 193 int dirty_accountable, int prot_numa)
125{ 194{
126 struct mm_struct *mm = vma->vm_mm; 195 struct mm_struct *mm = vma->vm_mm;
127 pgd_t *pgd; 196 pgd_t *pgd;
128 unsigned long next; 197 unsigned long next;
129 unsigned long start = addr; 198 unsigned long start = addr;
199 unsigned long pages = 0;
130 200
131 BUG_ON(addr >= end); 201 BUG_ON(addr >= end);
132 pgd = pgd_offset(mm, addr); 202 pgd = pgd_offset(mm, addr);
@@ -135,10 +205,32 @@ static void change_protection(struct vm_area_struct *vma,
135 next = pgd_addr_end(addr, end); 205 next = pgd_addr_end(addr, end);
136 if (pgd_none_or_clear_bad(pgd)) 206 if (pgd_none_or_clear_bad(pgd))
137 continue; 207 continue;
138 change_pud_range(vma, pgd, addr, next, newprot, 208 pages += change_pud_range(vma, pgd, addr, next, newprot,
139 dirty_accountable); 209 dirty_accountable, prot_numa);
140 } while (pgd++, addr = next, addr != end); 210 } while (pgd++, addr = next, addr != end);
141 flush_tlb_range(vma, start, end); 211
212 /* Only flush the TLB if we actually modified any entries: */
213 if (pages)
214 flush_tlb_range(vma, start, end);
215
216 return pages;
217}
218
219unsigned long change_protection(struct vm_area_struct *vma, unsigned long start,
220 unsigned long end, pgprot_t newprot,
221 int dirty_accountable, int prot_numa)
222{
223 struct mm_struct *mm = vma->vm_mm;
224 unsigned long pages;
225
226 mmu_notifier_invalidate_range_start(mm, start, end);
227 if (is_vm_hugetlb_page(vma))
228 pages = hugetlb_change_protection(vma, start, end, newprot);
229 else
230 pages = change_protection_range(vma, start, end, newprot, dirty_accountable, prot_numa);
231 mmu_notifier_invalidate_range_end(mm, start, end);
232
233 return pages;
142} 234}
143 235
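With change_protection() now exported from this file and returning a page count, the NUMA hinting side elsewhere in this series is expected to drive it roughly as follows (a sketch; the real caller lives in mm/mempolicy.c and its statistics bookkeeping may differ):

	static unsigned long change_prot_numa_sketch(struct vm_area_struct *vma,
						     unsigned long addr,
						     unsigned long end)
	{
		unsigned long nr_updated;

		/* prot_numa=1: mark ptes for hinting faults, keep vm_page_prot as-is */
		nr_updated = change_protection(vma, addr, end, vma->vm_page_prot, 0, 1);
		return nr_updated;
	}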
144int 236int
@@ -213,12 +305,9 @@ success:
213 dirty_accountable = 1; 305 dirty_accountable = 1;
214 } 306 }
215 307
216 mmu_notifier_invalidate_range_start(mm, start, end); 308 change_protection(vma, start, end, vma->vm_page_prot,
217 if (is_vm_hugetlb_page(vma)) 309 dirty_accountable, 0);
218 hugetlb_change_protection(vma, start, end, vma->vm_page_prot); 310
219 else
220 change_protection(vma, start, end, vma->vm_page_prot, dirty_accountable);
221 mmu_notifier_invalidate_range_end(mm, start, end);
222 vm_stat_account(mm, oldflags, vma->vm_file, -nrpages); 311 vm_stat_account(mm, oldflags, vma->vm_file, -nrpages);
223 vm_stat_account(mm, newflags, vma->vm_file, nrpages); 312 vm_stat_account(mm, newflags, vma->vm_file, nrpages);
224 perf_event_mmap(vma); 313 perf_event_mmap(vma);
@@ -274,8 +363,7 @@ SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len,
274 error = -EINVAL; 363 error = -EINVAL;
275 if (!(vma->vm_flags & VM_GROWSDOWN)) 364 if (!(vma->vm_flags & VM_GROWSDOWN))
276 goto out; 365 goto out;
277 } 366 } else {
278 else {
279 if (vma->vm_start > start) 367 if (vma->vm_start > start)
280 goto out; 368 goto out;
281 if (unlikely(grows & PROT_GROWSUP)) { 369 if (unlikely(grows & PROT_GROWSUP)) {
@@ -291,9 +379,10 @@ SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len,
291 for (nstart = start ; ; ) { 379 for (nstart = start ; ; ) {
292 unsigned long newflags; 380 unsigned long newflags;
293 381
294 /* Here we know that vma->vm_start <= nstart < vma->vm_end. */ 382 /* Here we know that vma->vm_start <= nstart < vma->vm_end. */
295 383
296 newflags = vm_flags | (vma->vm_flags & ~(VM_READ | VM_WRITE | VM_EXEC)); 384 newflags = vm_flags;
385 newflags |= (vma->vm_flags & ~(VM_READ | VM_WRITE | VM_EXEC));
297 386
298 /* newflags >> 4 shift VM_MAY% in place of VM_% */ 387 /* newflags >> 4 shift VM_MAY% in place of VM_% */
299 if ((newflags & ~(newflags >> 4)) & (VM_READ | VM_WRITE | VM_EXEC)) { 388 if ((newflags & ~(newflags >> 4)) & (VM_READ | VM_WRITE | VM_EXEC)) {
diff --git a/mm/mremap.c b/mm/mremap.c
index 1b61c2d3307a..e1031e1f6a61 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -104,7 +104,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
104 } 104 }
105 if (vma->anon_vma) { 105 if (vma->anon_vma) {
106 anon_vma = vma->anon_vma; 106 anon_vma = vma->anon_vma;
107 anon_vma_lock(anon_vma); 107 anon_vma_lock_write(anon_vma);
108 } 108 }
109 } 109 }
110 110
@@ -182,7 +182,7 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
182 need_flush = true; 182 need_flush = true;
183 continue; 183 continue;
184 } else if (!err) { 184 } else if (!err) {
185 split_huge_page_pmd(vma->vm_mm, old_pmd); 185 split_huge_page_pmd(vma, old_addr, old_pmd);
186 } 186 }
187 VM_BUG_ON(pmd_trans_huge(*old_pmd)); 187 VM_BUG_ON(pmd_trans_huge(*old_pmd));
188 } 188 }
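Both callers above switch to the three-argument split_huge_page_pmd(vma, address, pmd). The wrapper itself is defined in the THP headers touched elsewhere in this series; its assumed shape is roughly the following (an approximation, not the exact macro):

	#define split_huge_page_pmd_sketch(__vma, __address, __pmd)		\
		do {								\
			pmd_t *____pmd = (__pmd);				\
			if (unlikely(pmd_trans_huge(*____pmd)))			\
				__split_huge_page_pmd(__vma, __address, ____pmd); \
		} while (0)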
diff --git a/mm/nobootmem.c b/mm/nobootmem.c
index bd82f6b31411..b8294fc03df8 100644
--- a/mm/nobootmem.c
+++ b/mm/nobootmem.c
@@ -137,6 +137,22 @@ unsigned long __init free_low_memory_core_early(int nodeid)
137 return count; 137 return count;
138} 138}
139 139
140static void reset_node_lowmem_managed_pages(pg_data_t *pgdat)
141{
142 struct zone *z;
143
144 /*
145 * In free_area_init_core(), highmem zone's managed_pages is set to
146 * present_pages, and the bootmem allocator doesn't allocate from highmem
147 * zones. So there's no need to recalculate managed_pages because all
148 * highmem pages will be managed by the buddy system. Here the highmem
149 * zone also includes the highmem movable zone.

150 */
151 for (z = pgdat->node_zones; z < pgdat->node_zones + MAX_NR_ZONES; z++)
152 if (!is_highmem(z))
153 z->managed_pages = 0;
154}
155
140/** 156/**
141 * free_all_bootmem_node - release a node's free pages to the buddy allocator 157 * free_all_bootmem_node - release a node's free pages to the buddy allocator
142 * @pgdat: node to be released 158 * @pgdat: node to be released
@@ -146,6 +162,7 @@ unsigned long __init free_low_memory_core_early(int nodeid)
146unsigned long __init free_all_bootmem_node(pg_data_t *pgdat) 162unsigned long __init free_all_bootmem_node(pg_data_t *pgdat)
147{ 163{
148 register_page_bootmem_info_node(pgdat); 164 register_page_bootmem_info_node(pgdat);
165 reset_node_lowmem_managed_pages(pgdat);
149 166
150 /* free_low_memory_core_early(MAX_NUMNODES) will be called later */ 167 /* free_low_memory_core_early(MAX_NUMNODES) will be called later */
151 return 0; 168 return 0;
@@ -158,6 +175,11 @@ unsigned long __init free_all_bootmem_node(pg_data_t *pgdat)
158 */ 175 */
159unsigned long __init free_all_bootmem(void) 176unsigned long __init free_all_bootmem(void)
160{ 177{
178 struct pglist_data *pgdat;
179
180 for_each_online_pgdat(pgdat)
181 reset_node_lowmem_managed_pages(pgdat);
182
161 /* 183 /*
162 * We need to use MAX_NUMNODES instead of NODE_DATA(0)->node_id 184 * We need to use MAX_NUMNODES instead of NODE_DATA(0)->node_id
163 * because in some case like Node0 doesn't have RAM installed 185 * because in some case like Node0 doesn't have RAM installed
diff --git a/mm/nommu.c b/mm/nommu.c
index 45131b41bcdb..79c3cac87afa 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -66,6 +66,21 @@ int heap_stack_gap = 0;
66 66
67atomic_long_t mmap_pages_allocated; 67atomic_long_t mmap_pages_allocated;
68 68
69/*
70 * The global memory commitment made in the system is a metric
71 * that can be used to drive ballooning decisions when Linux is hosted
72 * as a guest. On Hyper-V, the host implements a policy engine for dynamically
73 * balancing memory across competing virtual machines that are hosted.
74 * Several metrics drive this policy engine, including the guest-reported
75 * memory commitment.
76 */
77unsigned long vm_memory_committed(void)
78{
79 return percpu_counter_read_positive(&vm_committed_as);
80}
81
82EXPORT_SYMBOL_GPL(vm_memory_committed);
83
69EXPORT_SYMBOL(mem_map); 84EXPORT_SYMBOL(mem_map);
70EXPORT_SYMBOL(num_physpages); 85EXPORT_SYMBOL(num_physpages);
71 86
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 79e0f3e24831..0399f146ae49 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -44,48 +44,6 @@ int sysctl_oom_kill_allocating_task;
44int sysctl_oom_dump_tasks = 1; 44int sysctl_oom_dump_tasks = 1;
45static DEFINE_SPINLOCK(zone_scan_lock); 45static DEFINE_SPINLOCK(zone_scan_lock);
46 46
47/*
48 * compare_swap_oom_score_adj() - compare and swap current's oom_score_adj
49 * @old_val: old oom_score_adj for compare
50 * @new_val: new oom_score_adj for swap
51 *
52 * Sets the oom_score_adj value for current to @new_val iff its present value is
53 * @old_val. Usually used to reinstate a previous value to prevent racing with
54 * userspacing tuning the value in the interim.
55 */
56void compare_swap_oom_score_adj(int old_val, int new_val)
57{
58 struct sighand_struct *sighand = current->sighand;
59
60 spin_lock_irq(&sighand->siglock);
61 if (current->signal->oom_score_adj == old_val)
62 current->signal->oom_score_adj = new_val;
63 trace_oom_score_adj_update(current);
64 spin_unlock_irq(&sighand->siglock);
65}
66
67/**
68 * test_set_oom_score_adj() - set current's oom_score_adj and return old value
69 * @new_val: new oom_score_adj value
70 *
71 * Sets the oom_score_adj value for current to @new_val with proper
72 * synchronization and returns the old value. Usually used to temporarily
73 * set a value, save the old value in the caller, and then reinstate it later.
74 */
75int test_set_oom_score_adj(int new_val)
76{
77 struct sighand_struct *sighand = current->sighand;
78 int old_val;
79
80 spin_lock_irq(&sighand->siglock);
81 old_val = current->signal->oom_score_adj;
82 current->signal->oom_score_adj = new_val;
83 trace_oom_score_adj_update(current);
84 spin_unlock_irq(&sighand->siglock);
85
86 return old_val;
87}
88
89#ifdef CONFIG_NUMA 47#ifdef CONFIG_NUMA
90/** 48/**
91 * has_intersects_mems_allowed() - check task eligibility for kill 49 * has_intersects_mems_allowed() - check task eligibility for kill
@@ -193,7 +151,7 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
193 if (!p) 151 if (!p)
194 return 0; 152 return 0;
195 153
196 adj = p->signal->oom_score_adj; 154 adj = (long)p->signal->oom_score_adj;
197 if (adj == OOM_SCORE_ADJ_MIN) { 155 if (adj == OOM_SCORE_ADJ_MIN) {
198 task_unlock(p); 156 task_unlock(p);
199 return 0; 157 return 0;
@@ -257,7 +215,7 @@ static enum oom_constraint constrained_alloc(struct zonelist *zonelist,
257 * the page allocator means a mempolicy is in effect. Cpuset policy 215 * the page allocator means a mempolicy is in effect. Cpuset policy
258 * is enforced in get_page_from_freelist(). 216 * is enforced in get_page_from_freelist().
259 */ 217 */
260 if (nodemask && !nodes_subset(node_states[N_HIGH_MEMORY], *nodemask)) { 218 if (nodemask && !nodes_subset(node_states[N_MEMORY], *nodemask)) {
261 *totalpages = total_swap_pages; 219 *totalpages = total_swap_pages;
262 for_each_node_mask(nid, *nodemask) 220 for_each_node_mask(nid, *nodemask)
263 *totalpages += node_spanned_pages(nid); 221 *totalpages += node_spanned_pages(nid);
@@ -310,26 +268,20 @@ enum oom_scan_t oom_scan_process_thread(struct task_struct *task,
310 if (!task->mm) 268 if (!task->mm)
311 return OOM_SCAN_CONTINUE; 269 return OOM_SCAN_CONTINUE;
312 270
313 if (task->flags & PF_EXITING) { 271 /*
272 * If task is allocating a lot of memory and has been marked to be
273 * killed first if it triggers an oom, then select it.
274 */
275 if (oom_task_origin(task))
276 return OOM_SCAN_SELECT;
277
278 if (task->flags & PF_EXITING && !force_kill) {
314 /* 279 /*
315 * If task is current and is in the process of releasing memory, 280 * If this task is not being ptraced on exit, then wait for it
316 * allow the "kill" to set TIF_MEMDIE, which will allow it to 281 * to finish before killing some other task unnecessarily.
317 * access memory reserves. Otherwise, it may stall forever.
318 *
319 * The iteration isn't broken here, however, in case other
320 * threads are found to have already been oom killed.
321 */ 282 */
322 if (task == current) 283 if (!(task->group_leader->ptrace & PT_TRACE_EXIT))
323 return OOM_SCAN_SELECT; 284 return OOM_SCAN_ABORT;
324 else if (!force_kill) {
325 /*
326 * If this task is not being ptraced on exit, then wait
327 * for it to finish before killing some other task
328 * unnecessarily.
329 */
330 if (!(task->group_leader->ptrace & PT_TRACE_EXIT))
331 return OOM_SCAN_ABORT;
332 }
333 } 285 }
334 return OOM_SCAN_OK; 286 return OOM_SCAN_OK;
335} 287}
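oom_scan_process_thread() now keys off oom_task_origin() instead of the removed oom_score_adj compare/swap helpers. The origin marking is presumed to be a simple per-signal flag set by heavy allocators such as swapoff (an assumption; the helpers live in the oom header, not in this file):

	static inline void set_current_oom_origin_sketch(void)
	{
		current->signal->oom_flags |= OOM_FLAG_ORIGIN;
	}

	static inline bool oom_task_origin_sketch(const struct task_struct *p)
	{
		return !!(p->signal->oom_flags & OOM_FLAG_ORIGIN);
	}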
@@ -412,7 +364,7 @@ static void dump_tasks(const struct mem_cgroup *memcg, const nodemask_t *nodemas
412 continue; 364 continue;
413 } 365 }
414 366
415 pr_info("[%5d] %5d %5d %8lu %8lu %7lu %8lu %5d %s\n", 367 pr_info("[%5d] %5d %5d %8lu %8lu %7lu %8lu %5hd %s\n",
416 task->pid, from_kuid(&init_user_ns, task_uid(task)), 368 task->pid, from_kuid(&init_user_ns, task_uid(task)),
417 task->tgid, task->mm->total_vm, get_mm_rss(task->mm), 369 task->tgid, task->mm->total_vm, get_mm_rss(task->mm),
418 task->mm->nr_ptes, 370 task->mm->nr_ptes,
@@ -428,7 +380,7 @@ static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order,
428{ 380{
429 task_lock(current); 381 task_lock(current);
430 pr_warning("%s invoked oom-killer: gfp_mask=0x%x, order=%d, " 382 pr_warning("%s invoked oom-killer: gfp_mask=0x%x, order=%d, "
431 "oom_score_adj=%d\n", 383 "oom_score_adj=%hd\n",
432 current->comm, gfp_mask, order, 384 current->comm, gfp_mask, order,
433 current->signal->oom_score_adj); 385 current->signal->oom_score_adj);
434 cpuset_print_task_mems_allowed(current); 386 cpuset_print_task_mems_allowed(current);
@@ -639,43 +591,6 @@ void clear_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_mask)
639 spin_unlock(&zone_scan_lock); 591 spin_unlock(&zone_scan_lock);
640} 592}
641 593
642/*
643 * Try to acquire the oom killer lock for all system zones. Returns zero if a
644 * parallel oom killing is taking place, otherwise locks all zones and returns
645 * non-zero.
646 */
647static int try_set_system_oom(void)
648{
649 struct zone *zone;
650 int ret = 1;
651
652 spin_lock(&zone_scan_lock);
653 for_each_populated_zone(zone)
654 if (zone_is_oom_locked(zone)) {
655 ret = 0;
656 goto out;
657 }
658 for_each_populated_zone(zone)
659 zone_set_flag(zone, ZONE_OOM_LOCKED);
660out:
661 spin_unlock(&zone_scan_lock);
662 return ret;
663}
664
665/*
666 * Clears ZONE_OOM_LOCKED for all system zones so that failed allocation
667 * attempts or page faults may now recall the oom killer, if necessary.
668 */
669static void clear_system_oom(void)
670{
671 struct zone *zone;
672
673 spin_lock(&zone_scan_lock);
674 for_each_populated_zone(zone)
675 zone_clear_flag(zone, ZONE_OOM_LOCKED);
676 spin_unlock(&zone_scan_lock);
677}
678
679/** 594/**
680 * out_of_memory - kill the "best" process when we run out of memory 595 * out_of_memory - kill the "best" process when we run out of memory
681 * @zonelist: zonelist pointer 596 * @zonelist: zonelist pointer
@@ -706,11 +621,11 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
706 return; 621 return;
707 622
708 /* 623 /*
709 * If current has a pending SIGKILL, then automatically select it. The 624 * If current has a pending SIGKILL or is exiting, then automatically
710 * goal is to allow it to allocate so that it may quickly exit and free 625 * select it. The goal is to allow it to allocate so that it may
711 * its memory. 626 * quickly exit and free its memory.
712 */ 627 */
713 if (fatal_signal_pending(current)) { 628 if (fatal_signal_pending(current) || current->flags & PF_EXITING) {
714 set_thread_flag(TIF_MEMDIE); 629 set_thread_flag(TIF_MEMDIE);
715 return; 630 return;
716 } 631 }
@@ -756,15 +671,16 @@ out:
756 671
757/* 672/*
758 * The pagefault handler calls here because it is out of memory, so kill a 673 * The pagefault handler calls here because it is out of memory, so kill a
759 * memory-hogging task. If a populated zone has ZONE_OOM_LOCKED set, a parallel 674 * memory-hogging task. If any populated zone has ZONE_OOM_LOCKED set, a
760 * oom killing is already in progress so do nothing. If a task is found with 675 * parallel oom killing is already in progress so do nothing.
761 * TIF_MEMDIE set, it has been killed so do nothing and allow it to exit.
762 */ 676 */
763void pagefault_out_of_memory(void) 677void pagefault_out_of_memory(void)
764{ 678{
765 if (try_set_system_oom()) { 679 struct zonelist *zonelist = node_zonelist(first_online_node,
680 GFP_KERNEL);
681
682 if (try_set_zonelist_oom(zonelist, GFP_KERNEL)) {
766 out_of_memory(NULL, 0, 0, NULL, false); 683 out_of_memory(NULL, 0, 0, NULL, false);
767 clear_system_oom(); 684 clear_zonelist_oom(zonelist, GFP_KERNEL);
768 } 685 }
769 schedule_timeout_killable(1);
770} 686}
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 830893b2b3c7..6f4271224493 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -1069,7 +1069,7 @@ static void bdi_update_bandwidth(struct backing_dev_info *bdi,
1069} 1069}
1070 1070
1071/* 1071/*
1072 * After a task dirtied this many pages, balance_dirty_pages_ratelimited_nr() 1072 * After a task dirtied this many pages, balance_dirty_pages_ratelimited()
1073 * will look to see if it needs to start dirty throttling. 1073 * will look to see if it needs to start dirty throttling.
1074 * 1074 *
1075 * If dirty_poll_interval is too low, big NUMA machines will call the expensive 1075 * If dirty_poll_interval is too low, big NUMA machines will call the expensive
@@ -1436,9 +1436,8 @@ static DEFINE_PER_CPU(int, bdp_ratelimits);
1436DEFINE_PER_CPU(int, dirty_throttle_leaks) = 0; 1436DEFINE_PER_CPU(int, dirty_throttle_leaks) = 0;
1437 1437
1438/** 1438/**
1439 * balance_dirty_pages_ratelimited_nr - balance dirty memory state 1439 * balance_dirty_pages_ratelimited - balance dirty memory state
1440 * @mapping: address_space which was dirtied 1440 * @mapping: address_space which was dirtied
1441 * @nr_pages_dirtied: number of pages which the caller has just dirtied
1442 * 1441 *
1443 * Processes which are dirtying memory should call in here once for each page 1442 * Processes which are dirtying memory should call in here once for each page
1444 * which was newly dirtied. The function will periodically check the system's 1443 * which was newly dirtied. The function will periodically check the system's
@@ -1449,8 +1448,7 @@ DEFINE_PER_CPU(int, dirty_throttle_leaks) = 0;
1449 * limit we decrease the ratelimiting by a lot, to prevent individual processes 1448 * limit we decrease the ratelimiting by a lot, to prevent individual processes
1450 * from overshooting the limit by (ratelimit_pages) each. 1449 * from overshooting the limit by (ratelimit_pages) each.
1451 */ 1450 */
1452void balance_dirty_pages_ratelimited_nr(struct address_space *mapping, 1451void balance_dirty_pages_ratelimited(struct address_space *mapping)
1453 unsigned long nr_pages_dirtied)
1454{ 1452{
1455 struct backing_dev_info *bdi = mapping->backing_dev_info; 1453 struct backing_dev_info *bdi = mapping->backing_dev_info;
1456 int ratelimit; 1454 int ratelimit;
@@ -1484,6 +1482,7 @@ void balance_dirty_pages_ratelimited_nr(struct address_space *mapping,
1484 */ 1482 */
1485 p = &__get_cpu_var(dirty_throttle_leaks); 1483 p = &__get_cpu_var(dirty_throttle_leaks);
1486 if (*p > 0 && current->nr_dirtied < ratelimit) { 1484 if (*p > 0 && current->nr_dirtied < ratelimit) {
1485 unsigned long nr_pages_dirtied;
1487 nr_pages_dirtied = min(*p, ratelimit - current->nr_dirtied); 1486 nr_pages_dirtied = min(*p, ratelimit - current->nr_dirtied);
1488 *p -= nr_pages_dirtied; 1487 *p -= nr_pages_dirtied;
1489 current->nr_dirtied += nr_pages_dirtied; 1488 current->nr_dirtied += nr_pages_dirtied;
@@ -1493,7 +1492,7 @@ void balance_dirty_pages_ratelimited_nr(struct address_space *mapping,
1493 if (unlikely(current->nr_dirtied >= ratelimit)) 1492 if (unlikely(current->nr_dirtied >= ratelimit))
1494 balance_dirty_pages(mapping, current->nr_dirtied); 1493 balance_dirty_pages(mapping, current->nr_dirtied);
1495} 1494}
1496EXPORT_SYMBOL(balance_dirty_pages_ratelimited_nr); 1495EXPORT_SYMBOL(balance_dirty_pages_ratelimited);
1497 1496
1498void throttle_vm_writeout(gfp_t gfp_mask) 1497void throttle_vm_writeout(gfp_t gfp_mask)
1499{ 1498{
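With the _nr variant gone, a writer simply calls the ratelimited helper once for every page it dirties; a minimal hypothetical caller:

	static void dirty_one_page_example(struct address_space *mapping,
					   struct page *page)
	{
		set_page_dirty(page);
		/* may sleep to throttle the task once its dirty quota is used up */
		balance_dirty_pages_ratelimited(mapping);
	}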
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 7e208f0ad68c..2ad2ad168efe 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -90,6 +90,9 @@ nodemask_t node_states[NR_NODE_STATES] __read_mostly = {
90#ifdef CONFIG_HIGHMEM 90#ifdef CONFIG_HIGHMEM
91 [N_HIGH_MEMORY] = { { [0] = 1UL } }, 91 [N_HIGH_MEMORY] = { { [0] = 1UL } },
92#endif 92#endif
93#ifdef CONFIG_MOVABLE_NODE
94 [N_MEMORY] = { { [0] = 1UL } },
95#endif
93 [N_CPU] = { { [0] = 1UL } }, 96 [N_CPU] = { { [0] = 1UL } },
94#endif /* NUMA */ 97#endif /* NUMA */
95}; 98};
@@ -368,8 +371,7 @@ static int destroy_compound_page(struct page *page, unsigned long order)
368 int nr_pages = 1 << order; 371 int nr_pages = 1 << order;
369 int bad = 0; 372 int bad = 0;
370 373
371 if (unlikely(compound_order(page) != order) || 374 if (unlikely(compound_order(page) != order)) {
372 unlikely(!PageHead(page))) {
373 bad_page(page); 375 bad_page(page);
374 bad++; 376 bad++;
375 } 377 }
@@ -523,7 +525,7 @@ static inline int page_is_buddy(struct page *page, struct page *buddy,
523 * If a block is freed, and its buddy is also free, then this 525 * If a block is freed, and its buddy is also free, then this
524 * triggers coalescing into a block of larger size. 526 * triggers coalescing into a block of larger size.
525 * 527 *
526 * -- wli 528 * -- nyc
527 */ 529 */
528 530
529static inline void __free_one_page(struct page *page, 531static inline void __free_one_page(struct page *page,
@@ -608,6 +610,7 @@ static inline int free_pages_check(struct page *page)
608 bad_page(page); 610 bad_page(page);
609 return 1; 611 return 1;
610 } 612 }
613 reset_page_last_nid(page);
611 if (page->flags & PAGE_FLAGS_CHECK_AT_PREP) 614 if (page->flags & PAGE_FLAGS_CHECK_AT_PREP)
612 page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; 615 page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
613 return 0; 616 return 0;
@@ -667,11 +670,13 @@ static void free_pcppages_bulk(struct zone *zone, int count,
667 /* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */ 670 /* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */
668 __free_one_page(page, zone, 0, mt); 671 __free_one_page(page, zone, 0, mt);
669 trace_mm_page_pcpu_drain(page, 0, mt); 672 trace_mm_page_pcpu_drain(page, 0, mt);
670 if (is_migrate_cma(mt)) 673 if (likely(get_pageblock_migratetype(page) != MIGRATE_ISOLATE)) {
671 __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, 1); 674 __mod_zone_page_state(zone, NR_FREE_PAGES, 1);
675 if (is_migrate_cma(mt))
676 __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, 1);
677 }
672 } while (--to_free && --batch_free && !list_empty(list)); 678 } while (--to_free && --batch_free && !list_empty(list));
673 } 679 }
674 __mod_zone_page_state(zone, NR_FREE_PAGES, count);
675 spin_unlock(&zone->lock); 680 spin_unlock(&zone->lock);
676} 681}
677 682
@@ -730,6 +735,13 @@ static void __free_pages_ok(struct page *page, unsigned int order)
730 local_irq_restore(flags); 735 local_irq_restore(flags);
731} 736}
732 737
738/*
739 * Read access to zone->managed_pages is safe because it's unsigned long,
740 * but we still need to serialize writers. Currently all callers of
741 * __free_pages_bootmem() except put_page_bootmem() run only
742 * at boot time. So, to keep boot time short, we shift the burden to
743 * put_page_bootmem() to serialize writers.
744 */
733void __meminit __free_pages_bootmem(struct page *page, unsigned int order) 745void __meminit __free_pages_bootmem(struct page *page, unsigned int order)
734{ 746{
735 unsigned int nr_pages = 1 << order; 747 unsigned int nr_pages = 1 << order;
@@ -745,6 +757,7 @@ void __meminit __free_pages_bootmem(struct page *page, unsigned int order)
745 set_page_count(p, 0); 757 set_page_count(p, 0);
746 } 758 }
747 759
760 page_zone(page)->managed_pages += 1 << order;
748 set_page_refcounted(page); 761 set_page_refcounted(page);
749 __free_pages(page, order); 762 __free_pages(page, order);
750} 763}
@@ -780,7 +793,7 @@ void __init init_cma_reserved_pageblock(struct page *page)
780 * large block of memory acted on by a series of small allocations. 793 * large block of memory acted on by a series of small allocations.
781 * This behavior is a critical factor in sglist merging's success. 794 * This behavior is a critical factor in sglist merging's success.
782 * 795 *
783 * -- wli 796 * -- nyc
784 */ 797 */
785static inline void expand(struct zone *zone, struct page *page, 798static inline void expand(struct zone *zone, struct page *page,
786 int low, int high, struct free_area *area, 799 int low, int high, struct free_area *area,
@@ -1392,21 +1405,22 @@ int capture_free_page(struct page *page, int alloc_order, int migratetype)
1392 1405
1393 zone = page_zone(page); 1406 zone = page_zone(page);
1394 order = page_order(page); 1407 order = page_order(page);
1408 mt = get_pageblock_migratetype(page);
1395 1409
1396 /* Obey watermarks as if the page was being allocated */ 1410 if (mt != MIGRATE_ISOLATE) {
1397 watermark = low_wmark_pages(zone) + (1 << order); 1411 /* Obey watermarks as if the page was being allocated */
1398 if (!zone_watermark_ok(zone, 0, watermark, 0, 0)) 1412 watermark = low_wmark_pages(zone) + (1 << order);
1399 return 0; 1413 if (!zone_watermark_ok(zone, 0, watermark, 0, 0))
1414 return 0;
1415
1416 __mod_zone_freepage_state(zone, -(1UL << alloc_order), mt);
1417 }
1400 1418
1401 /* Remove page from free list */ 1419 /* Remove page from free list */
1402 list_del(&page->lru); 1420 list_del(&page->lru);
1403 zone->free_area[order].nr_free--; 1421 zone->free_area[order].nr_free--;
1404 rmv_page_order(page); 1422 rmv_page_order(page);
1405 1423
1406 mt = get_pageblock_migratetype(page);
1407 if (unlikely(mt != MIGRATE_ISOLATE))
1408 __mod_zone_freepage_state(zone, -(1UL << alloc_order), mt);
1409
1410 if (alloc_order != order) 1424 if (alloc_order != order)
1411 expand(zone, page, alloc_order, order, 1425 expand(zone, page, alloc_order, order,
1412 &zone->free_area[order], migratetype); 1426 &zone->free_area[order], migratetype);
@@ -1692,7 +1706,7 @@ bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark,
1692 * 1706 *
1693 * If the zonelist cache is present in the passed in zonelist, then 1707 * If the zonelist cache is present in the passed in zonelist, then
1694 * returns a pointer to the allowed node mask (either the current 1708 * returns a pointer to the allowed node mask (either the current
1695 * tasks mems_allowed, or node_states[N_HIGH_MEMORY].) 1709 * tasks mems_allowed, or node_states[N_MEMORY].)
1696 * 1710 *
1697 * If the zonelist cache is not available for this zonelist, does 1711 * If the zonelist cache is not available for this zonelist, does
1698 * nothing and returns NULL. 1712 * nothing and returns NULL.
@@ -1721,7 +1735,7 @@ static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags)
1721 1735
1722 allowednodes = !in_interrupt() && (alloc_flags & ALLOC_CPUSET) ? 1736 allowednodes = !in_interrupt() && (alloc_flags & ALLOC_CPUSET) ?
1723 &cpuset_current_mems_allowed : 1737 &cpuset_current_mems_allowed :
1724 &node_states[N_HIGH_MEMORY]; 1738 &node_states[N_MEMORY];
1725 return allowednodes; 1739 return allowednodes;
1726} 1740}
1727 1741
@@ -1871,7 +1885,7 @@ zonelist_scan:
1871 */ 1885 */
1872 for_each_zone_zonelist_nodemask(zone, z, zonelist, 1886 for_each_zone_zonelist_nodemask(zone, z, zonelist,
1873 high_zoneidx, nodemask) { 1887 high_zoneidx, nodemask) {
1874 if (NUMA_BUILD && zlc_active && 1888 if (IS_ENABLED(CONFIG_NUMA) && zlc_active &&
1875 !zlc_zone_worth_trying(zonelist, z, allowednodes)) 1889 !zlc_zone_worth_trying(zonelist, z, allowednodes))
1876 continue; 1890 continue;
1877 if ((alloc_flags & ALLOC_CPUSET) && 1891 if ((alloc_flags & ALLOC_CPUSET) &&
@@ -1917,7 +1931,8 @@ zonelist_scan:
1917 classzone_idx, alloc_flags)) 1931 classzone_idx, alloc_flags))
1918 goto try_this_zone; 1932 goto try_this_zone;
1919 1933
1920 if (NUMA_BUILD && !did_zlc_setup && nr_online_nodes > 1) { 1934 if (IS_ENABLED(CONFIG_NUMA) &&
1935 !did_zlc_setup && nr_online_nodes > 1) {
1921 /* 1936 /*
1922 * we do zlc_setup if there are multiple nodes 1937 * we do zlc_setup if there are multiple nodes
1923 * and before considering the first zone allowed 1938 * and before considering the first zone allowed
@@ -1936,7 +1951,7 @@ zonelist_scan:
1936 * As we may have just activated ZLC, check if the first 1951 * As we may have just activated ZLC, check if the first
1937 * eligible zone has failed zone_reclaim recently. 1952 * eligible zone has failed zone_reclaim recently.
1938 */ 1953 */
1939 if (NUMA_BUILD && zlc_active && 1954 if (IS_ENABLED(CONFIG_NUMA) && zlc_active &&
1940 !zlc_zone_worth_trying(zonelist, z, allowednodes)) 1955 !zlc_zone_worth_trying(zonelist, z, allowednodes))
1941 continue; 1956 continue;
1942 1957
@@ -1962,11 +1977,11 @@ try_this_zone:
1962 if (page) 1977 if (page)
1963 break; 1978 break;
1964this_zone_full: 1979this_zone_full:
1965 if (NUMA_BUILD) 1980 if (IS_ENABLED(CONFIG_NUMA))
1966 zlc_mark_zone_full(zonelist, z); 1981 zlc_mark_zone_full(zonelist, z);
1967 } 1982 }
1968 1983
1969 if (unlikely(NUMA_BUILD && page == NULL && zlc_active)) { 1984 if (unlikely(IS_ENABLED(CONFIG_NUMA) && page == NULL && zlc_active)) {
1970 /* Disable zlc cache for second zonelist scan */ 1985 /* Disable zlc cache for second zonelist scan */
1971 zlc_active = 0; 1986 zlc_active = 0;
1972 goto zonelist_scan; 1987 goto zonelist_scan;
@@ -2266,7 +2281,7 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
2266 return NULL; 2281 return NULL;
2267 2282
2268 /* After successful reclaim, reconsider all zones for allocation */ 2283 /* After successful reclaim, reconsider all zones for allocation */
2269 if (NUMA_BUILD) 2284 if (IS_ENABLED(CONFIG_NUMA))
2270 zlc_clear_zones_full(zonelist); 2285 zlc_clear_zones_full(zonelist);
2271 2286
2272retry: 2287retry:
@@ -2412,7 +2427,8 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
2412 * allowed per node queues are empty and that nodes are 2427 * allowed per node queues are empty and that nodes are
2413 * over allocated. 2428 * over allocated.
2414 */ 2429 */
2415 if (NUMA_BUILD && (gfp_mask & GFP_THISNODE) == GFP_THISNODE) 2430 if (IS_ENABLED(CONFIG_NUMA) &&
2431 (gfp_mask & GFP_THISNODE) == GFP_THISNODE)
2416 goto nopage; 2432 goto nopage;
2417 2433
2418restart: 2434restart:
@@ -2596,6 +2612,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
2596 int migratetype = allocflags_to_migratetype(gfp_mask); 2612 int migratetype = allocflags_to_migratetype(gfp_mask);
2597 unsigned int cpuset_mems_cookie; 2613 unsigned int cpuset_mems_cookie;
2598 int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET; 2614 int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET;
2615 struct mem_cgroup *memcg = NULL;
2599 2616
2600 gfp_mask &= gfp_allowed_mask; 2617 gfp_mask &= gfp_allowed_mask;
2601 2618
@@ -2614,6 +2631,13 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
2614 if (unlikely(!zonelist->_zonerefs->zone)) 2631 if (unlikely(!zonelist->_zonerefs->zone))
2615 return NULL; 2632 return NULL;
2616 2633
2634 /*
2635 * Will only have any effect when __GFP_KMEMCG is set. This is
2636 * verified in the (always inline) callee
2637 */
2638 if (!memcg_kmem_newpage_charge(gfp_mask, &memcg, order))
2639 return NULL;
2640
2617retry_cpuset: 2641retry_cpuset:
2618 cpuset_mems_cookie = get_mems_allowed(); 2642 cpuset_mems_cookie = get_mems_allowed();
2619 2643
@@ -2649,6 +2673,8 @@ out:
2649 if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page)) 2673 if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
2650 goto retry_cpuset; 2674 goto retry_cpuset;
2651 2675
2676 memcg_kmem_commit_charge(page, memcg, order);
2677
2652 return page; 2678 return page;
2653} 2679}
2654EXPORT_SYMBOL(__alloc_pages_nodemask); 2680EXPORT_SYMBOL(__alloc_pages_nodemask);
@@ -2701,6 +2727,31 @@ void free_pages(unsigned long addr, unsigned int order)
2701 2727
2702EXPORT_SYMBOL(free_pages); 2728EXPORT_SYMBOL(free_pages);
2703 2729
2730/*
2731 * __free_memcg_kmem_pages and free_memcg_kmem_pages will free
2732 * pages allocated with __GFP_KMEMCG.
2733 *
2734 * Those pages are accounted to a particular memcg, embedded in the
2735 * corresponding page_cgroup. To avoid making every allocation pay for a
2736 * lookup of that information only to find it is NULL for callers with no
2737 * interest in kmem accounting at all, we provide these functions.
2738 *
2739 * The caller knows better which flags it relies on.
2740 */
2741void __free_memcg_kmem_pages(struct page *page, unsigned int order)
2742{
2743 memcg_kmem_uncharge_pages(page, order);
2744 __free_pages(page, order);
2745}
2746
2747void free_memcg_kmem_pages(unsigned long addr, unsigned int order)
2748{
2749 if (addr != 0) {
2750 VM_BUG_ON(!virt_addr_valid((void *)addr));
2751 __free_memcg_kmem_pages(virt_to_page((void *)addr), order);
2752 }
2753}
2754
2704static void *make_alloc_exact(unsigned long addr, unsigned order, size_t size) 2755static void *make_alloc_exact(unsigned long addr, unsigned order, size_t size)
2705{ 2756{
2706 if (addr) { 2757 if (addr) {
@@ -2819,7 +2870,7 @@ unsigned int nr_free_pagecache_pages(void)
2819 2870
2820static inline void show_node(struct zone *zone) 2871static inline void show_node(struct zone *zone)
2821{ 2872{
2822 if (NUMA_BUILD) 2873 if (IS_ENABLED(CONFIG_NUMA))
2823 printk("Node %d ", zone_to_nid(zone)); 2874 printk("Node %d ", zone_to_nid(zone));
2824} 2875}
2825 2876
@@ -2877,6 +2928,31 @@ out:
2877 2928
2878#define K(x) ((x) << (PAGE_SHIFT-10)) 2929#define K(x) ((x) << (PAGE_SHIFT-10))
2879 2930
2931static void show_migration_types(unsigned char type)
2932{
2933 static const char types[MIGRATE_TYPES] = {
2934 [MIGRATE_UNMOVABLE] = 'U',
2935 [MIGRATE_RECLAIMABLE] = 'E',
2936 [MIGRATE_MOVABLE] = 'M',
2937 [MIGRATE_RESERVE] = 'R',
2938#ifdef CONFIG_CMA
2939 [MIGRATE_CMA] = 'C',
2940#endif
2941 [MIGRATE_ISOLATE] = 'I',
2942 };
2943 char tmp[MIGRATE_TYPES + 1];
2944 char *p = tmp;
2945 int i;
2946
2947 for (i = 0; i < MIGRATE_TYPES; i++) {
2948 if (type & (1 << i))
2949 *p++ = types[i];
2950 }
2951
2952 *p = '\0';
2953 printk("(%s) ", tmp);
2954}
2955
2880/* 2956/*
2881 * Show free area list (used inside shift_scroll-lock stuff) 2957 * Show free area list (used inside shift_scroll-lock stuff)
2882 * We also calculate the percentage fragmentation. We do this by counting the 2958 * We also calculate the percentage fragmentation. We do this by counting the
@@ -2951,6 +3027,7 @@ void show_free_areas(unsigned int filter)
2951 " isolated(anon):%lukB" 3027 " isolated(anon):%lukB"
2952 " isolated(file):%lukB" 3028 " isolated(file):%lukB"
2953 " present:%lukB" 3029 " present:%lukB"
3030 " managed:%lukB"
2954 " mlocked:%lukB" 3031 " mlocked:%lukB"
2955 " dirty:%lukB" 3032 " dirty:%lukB"
2956 " writeback:%lukB" 3033 " writeback:%lukB"
@@ -2980,6 +3057,7 @@ void show_free_areas(unsigned int filter)
2980 K(zone_page_state(zone, NR_ISOLATED_ANON)), 3057 K(zone_page_state(zone, NR_ISOLATED_ANON)),
2981 K(zone_page_state(zone, NR_ISOLATED_FILE)), 3058 K(zone_page_state(zone, NR_ISOLATED_FILE)),
2982 K(zone->present_pages), 3059 K(zone->present_pages),
3060 K(zone->managed_pages),
2983 K(zone_page_state(zone, NR_MLOCK)), 3061 K(zone_page_state(zone, NR_MLOCK)),
2984 K(zone_page_state(zone, NR_FILE_DIRTY)), 3062 K(zone_page_state(zone, NR_FILE_DIRTY)),
2985 K(zone_page_state(zone, NR_WRITEBACK)), 3063 K(zone_page_state(zone, NR_WRITEBACK)),
@@ -3005,6 +3083,7 @@ void show_free_areas(unsigned int filter)
3005 3083
3006 for_each_populated_zone(zone) { 3084 for_each_populated_zone(zone) {
3007 unsigned long nr[MAX_ORDER], flags, order, total = 0; 3085 unsigned long nr[MAX_ORDER], flags, order, total = 0;
3086 unsigned char types[MAX_ORDER];
3008 3087
3009 if (skip_free_areas_node(filter, zone_to_nid(zone))) 3088 if (skip_free_areas_node(filter, zone_to_nid(zone)))
3010 continue; 3089 continue;
@@ -3013,12 +3092,24 @@ void show_free_areas(unsigned int filter)
3013 3092
3014 spin_lock_irqsave(&zone->lock, flags); 3093 spin_lock_irqsave(&zone->lock, flags);
3015 for (order = 0; order < MAX_ORDER; order++) { 3094 for (order = 0; order < MAX_ORDER; order++) {
3016 nr[order] = zone->free_area[order].nr_free; 3095 struct free_area *area = &zone->free_area[order];
3096 int type;
3097
3098 nr[order] = area->nr_free;
3017 total += nr[order] << order; 3099 total += nr[order] << order;
3100
3101 types[order] = 0;
3102 for (type = 0; type < MIGRATE_TYPES; type++) {
3103 if (!list_empty(&area->free_list[type]))
3104 types[order] |= 1 << type;
3105 }
3018 } 3106 }
3019 spin_unlock_irqrestore(&zone->lock, flags); 3107 spin_unlock_irqrestore(&zone->lock, flags);
3020 for (order = 0; order < MAX_ORDER; order++) 3108 for (order = 0; order < MAX_ORDER; order++) {
3021 printk("%lu*%lukB ", nr[order], K(1UL) << order); 3109 printk("%lu*%lukB ", nr[order], K(1UL) << order);
3110 if (nr[order])
3111 show_migration_types(types[order]);
3112 }
3022 printk("= %lukB\n", K(total)); 3113 printk("= %lukB\n", K(total));
3023 } 3114 }
3024 3115
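For illustration (hypothetical counts, truncated before MAX_ORDER), a per-zone line of the buddy listing now looks like:

	Node 0 Normal: 42*4kB (UEM) 23*8kB (UM) 7*16kB (M) 1*32kB (R) 0*64kB = 496kB

where each parenthesised group comes from show_migration_types() above and lists the migration types that still have free pages at that order; orders with a zero count get no group because of the nr[order] check.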
@@ -3195,7 +3286,7 @@ static int find_next_best_node(int node, nodemask_t *used_node_mask)
3195 return node; 3286 return node;
3196 } 3287 }
3197 3288
3198 for_each_node_state(n, N_HIGH_MEMORY) { 3289 for_each_node_state(n, N_MEMORY) {
3199 3290
3200 /* Don't want a node to appear more than once */ 3291 /* Don't want a node to appear more than once */
3201 if (node_isset(n, *used_node_mask)) 3292 if (node_isset(n, *used_node_mask))
@@ -3337,7 +3428,7 @@ static int default_zonelist_order(void)
3337 * local memory, NODE_ORDER may be suitable. 3428 * local memory, NODE_ORDER may be suitable.
3338 */ 3429 */
3339 average_size = total_size / 3430 average_size = total_size /
3340 (nodes_weight(node_states[N_HIGH_MEMORY]) + 1); 3431 (nodes_weight(node_states[N_MEMORY]) + 1);
3341 for_each_online_node(nid) { 3432 for_each_online_node(nid) {
3342 low_kmem_size = 0; 3433 low_kmem_size = 0;
3343 total_size = 0; 3434 total_size = 0;
@@ -3827,6 +3918,7 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
3827 mminit_verify_page_links(page, zone, nid, pfn); 3918 mminit_verify_page_links(page, zone, nid, pfn);
3828 init_page_count(page); 3919 init_page_count(page);
3829 reset_page_mapcount(page); 3920 reset_page_mapcount(page);
3921 reset_page_last_nid(page);
3830 SetPageReserved(page); 3922 SetPageReserved(page);
3831 /* 3923 /*
3832 * Mark the block movable so that blocks are reserved for 3924 * Mark the block movable so that blocks are reserved for
@@ -4433,6 +4525,26 @@ void __init set_pageblock_order(void)
4433 4525
4434#endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */ 4526#endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */
4435 4527
4528static unsigned long __paginginit calc_memmap_size(unsigned long spanned_pages,
4529 unsigned long present_pages)
4530{
4531 unsigned long pages = spanned_pages;
4532
4533 /*
4534 * Provide a more accurate estimation if there are holes within
4535 * the zone and SPARSEMEM is in use. If there are holes within the
4536 * zone, each populated memory region may cost us one or two extra
4537 * memmap pages due to alignment because memmap pages for each
 4538	 * populated region may not be naturally aligned on a page boundary.
4539 * So the (present_pages >> 4) heuristic is a tradeoff for that.
4540 */
4541 if (spanned_pages > present_pages + (present_pages >> 4) &&
4542 IS_ENABLED(CONFIG_SPARSEMEM))
4543 pages = present_pages;
4544
4545 return PAGE_ALIGN(pages * sizeof(struct page)) >> PAGE_SHIFT;
4546}
4547
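As a rough worked example of the heuristic above, assuming 4 KiB pages and a 64-byte struct page (both common but configuration dependent):

/*
 * spanned_pages = 1 << 20                          (a 4 GiB span)
 * memmap        = PAGE_ALIGN((1 << 20) * 64) >> PAGE_SHIFT
 *               = 16384 pages                      (64 MiB of struct page)
 *
 * The fallback to present_pages only triggers when the holes exceed
 * roughly 1/16 of present_pages (spanned_pages > present_pages +
 * present_pages/16) and SPARSEMEM is enabled, because SPARSEMEM
 * allocates memmap per populated section rather than for the whole
 * span, so spanned_pages would badly overestimate the cost.
 */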
4436/* 4548/*
4437 * Set up the zone data structures: 4549 * Set up the zone data structures:
4438 * - mark all pages reserved 4550 * - mark all pages reserved
@@ -4450,54 +4562,67 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
4450 int ret; 4562 int ret;
4451 4563
4452 pgdat_resize_init(pgdat); 4564 pgdat_resize_init(pgdat);
4565#ifdef CONFIG_NUMA_BALANCING
4566 spin_lock_init(&pgdat->numabalancing_migrate_lock);
4567 pgdat->numabalancing_migrate_nr_pages = 0;
4568 pgdat->numabalancing_migrate_next_window = jiffies;
4569#endif
4453 init_waitqueue_head(&pgdat->kswapd_wait); 4570 init_waitqueue_head(&pgdat->kswapd_wait);
4454 init_waitqueue_head(&pgdat->pfmemalloc_wait); 4571 init_waitqueue_head(&pgdat->pfmemalloc_wait);
4455 pgdat_page_cgroup_init(pgdat); 4572 pgdat_page_cgroup_init(pgdat);
4456 4573
4457 for (j = 0; j < MAX_NR_ZONES; j++) { 4574 for (j = 0; j < MAX_NR_ZONES; j++) {
4458 struct zone *zone = pgdat->node_zones + j; 4575 struct zone *zone = pgdat->node_zones + j;
4459 unsigned long size, realsize, memmap_pages; 4576 unsigned long size, realsize, freesize, memmap_pages;
4460 4577
4461 size = zone_spanned_pages_in_node(nid, j, zones_size); 4578 size = zone_spanned_pages_in_node(nid, j, zones_size);
4462 realsize = size - zone_absent_pages_in_node(nid, j, 4579 realsize = freesize = size - zone_absent_pages_in_node(nid, j,
4463 zholes_size); 4580 zholes_size);
4464 4581
4465 /* 4582 /*
4466 * Adjust realsize so that it accounts for how much memory 4583 * Adjust freesize so that it accounts for how much memory
4467 * is used by this zone for memmap. This affects the watermark 4584 * is used by this zone for memmap. This affects the watermark
4468 * and per-cpu initialisations 4585 * and per-cpu initialisations
4469 */ 4586 */
4470 memmap_pages = 4587 memmap_pages = calc_memmap_size(size, realsize);
4471 PAGE_ALIGN(size * sizeof(struct page)) >> PAGE_SHIFT; 4588 if (freesize >= memmap_pages) {
4472 if (realsize >= memmap_pages) { 4589 freesize -= memmap_pages;
4473 realsize -= memmap_pages;
4474 if (memmap_pages) 4590 if (memmap_pages)
4475 printk(KERN_DEBUG 4591 printk(KERN_DEBUG
4476 " %s zone: %lu pages used for memmap\n", 4592 " %s zone: %lu pages used for memmap\n",
4477 zone_names[j], memmap_pages); 4593 zone_names[j], memmap_pages);
4478 } else 4594 } else
4479 printk(KERN_WARNING 4595 printk(KERN_WARNING
4480 " %s zone: %lu pages exceeds realsize %lu\n", 4596 " %s zone: %lu pages exceeds freesize %lu\n",
4481 zone_names[j], memmap_pages, realsize); 4597 zone_names[j], memmap_pages, freesize);
4482 4598
4483 /* Account for reserved pages */ 4599 /* Account for reserved pages */
4484 if (j == 0 && realsize > dma_reserve) { 4600 if (j == 0 && freesize > dma_reserve) {
4485 realsize -= dma_reserve; 4601 freesize -= dma_reserve;
4486 printk(KERN_DEBUG " %s zone: %lu pages reserved\n", 4602 printk(KERN_DEBUG " %s zone: %lu pages reserved\n",
4487 zone_names[0], dma_reserve); 4603 zone_names[0], dma_reserve);
4488 } 4604 }
4489 4605
4490 if (!is_highmem_idx(j)) 4606 if (!is_highmem_idx(j))
4491 nr_kernel_pages += realsize; 4607 nr_kernel_pages += freesize;
4492 nr_all_pages += realsize; 4608 /* Charge for highmem memmap if there are enough kernel pages */
4609 else if (nr_kernel_pages > memmap_pages * 2)
4610 nr_kernel_pages -= memmap_pages;
4611 nr_all_pages += freesize;
4493 4612
4494 zone->spanned_pages = size; 4613 zone->spanned_pages = size;
4495 zone->present_pages = realsize; 4614 zone->present_pages = freesize;
4615 /*
4616 * Set an approximate value for lowmem here, it will be adjusted
4617 * when the bootmem allocator frees pages into the buddy system.
4618 * And all highmem pages will be managed by the buddy system.
4619 */
4620 zone->managed_pages = is_highmem_idx(j) ? realsize : freesize;
4496#ifdef CONFIG_NUMA 4621#ifdef CONFIG_NUMA
4497 zone->node = nid; 4622 zone->node = nid;
4498 zone->min_unmapped_pages = (realsize*sysctl_min_unmapped_ratio) 4623 zone->min_unmapped_pages = (freesize*sysctl_min_unmapped_ratio)
4499 / 100; 4624 / 100;
4500 zone->min_slab_pages = (realsize * sysctl_min_slab_ratio) / 100; 4625 zone->min_slab_pages = (freesize * sysctl_min_slab_ratio) / 100;
4501#endif 4626#endif
4502 zone->name = zone_names[j]; 4627 zone->name = zone_names[j];
4503 spin_lock_init(&zone->lock); 4628 spin_lock_init(&zone->lock);
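For reference, after this hunk the three zone size fields carry roughly the following meaning (managed_pages starts as an estimate and is corrected later, when the bootmem allocator releases its pages into the buddy system, as the new comment notes):

/*
 * zone->spanned_pages: pfns covered by the zone, holes included
 * zone->present_pages: pages that physically exist in that range;
 *                      at this point still reduced by memmap and
 *                      dma_reserve, i.e. it is set to freesize
 * zone->managed_pages: pages intended for the buddy allocator; for
 *                      highmem the memmap lives in lowmem, so the
 *                      full realsize is used, otherwise freesize
 */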
@@ -4688,7 +4813,7 @@ unsigned long __init find_min_pfn_with_active_regions(void)
4688/* 4813/*
4689 * early_calculate_totalpages() 4814 * early_calculate_totalpages()
4690 * Sum pages in active regions for movable zone. 4815 * Sum pages in active regions for movable zone.
4691 * Populate N_HIGH_MEMORY for calculating usable_nodes. 4816 * Populate N_MEMORY for calculating usable_nodes.
4692 */ 4817 */
4693static unsigned long __init early_calculate_totalpages(void) 4818static unsigned long __init early_calculate_totalpages(void)
4694{ 4819{
@@ -4701,7 +4826,7 @@ static unsigned long __init early_calculate_totalpages(void)
4701 4826
4702 totalpages += pages; 4827 totalpages += pages;
4703 if (pages) 4828 if (pages)
4704 node_set_state(nid, N_HIGH_MEMORY); 4829 node_set_state(nid, N_MEMORY);
4705 } 4830 }
4706 return totalpages; 4831 return totalpages;
4707} 4832}
@@ -4718,9 +4843,9 @@ static void __init find_zone_movable_pfns_for_nodes(void)
4718 unsigned long usable_startpfn; 4843 unsigned long usable_startpfn;
4719 unsigned long kernelcore_node, kernelcore_remaining; 4844 unsigned long kernelcore_node, kernelcore_remaining;
4720 /* save the state before borrow the nodemask */ 4845 /* save the state before borrow the nodemask */
4721 nodemask_t saved_node_state = node_states[N_HIGH_MEMORY]; 4846 nodemask_t saved_node_state = node_states[N_MEMORY];
4722 unsigned long totalpages = early_calculate_totalpages(); 4847 unsigned long totalpages = early_calculate_totalpages();
4723 int usable_nodes = nodes_weight(node_states[N_HIGH_MEMORY]); 4848 int usable_nodes = nodes_weight(node_states[N_MEMORY]);
4724 4849
4725 /* 4850 /*
4726 * If movablecore was specified, calculate what size of 4851 * If movablecore was specified, calculate what size of
@@ -4755,7 +4880,7 @@ static void __init find_zone_movable_pfns_for_nodes(void)
4755restart: 4880restart:
4756 /* Spread kernelcore memory as evenly as possible throughout nodes */ 4881 /* Spread kernelcore memory as evenly as possible throughout nodes */
4757 kernelcore_node = required_kernelcore / usable_nodes; 4882 kernelcore_node = required_kernelcore / usable_nodes;
4758 for_each_node_state(nid, N_HIGH_MEMORY) { 4883 for_each_node_state(nid, N_MEMORY) {
4759 unsigned long start_pfn, end_pfn; 4884 unsigned long start_pfn, end_pfn;
4760 4885
4761 /* 4886 /*
@@ -4847,23 +4972,27 @@ restart:
4847 4972
4848out: 4973out:
4849 /* restore the node_state */ 4974 /* restore the node_state */
4850 node_states[N_HIGH_MEMORY] = saved_node_state; 4975 node_states[N_MEMORY] = saved_node_state;
4851} 4976}
4852 4977
4853/* Any regular memory on that node ? */ 4978/* Any regular or high memory on that node ? */
4854static void __init check_for_regular_memory(pg_data_t *pgdat) 4979static void check_for_memory(pg_data_t *pgdat, int nid)
4855{ 4980{
4856#ifdef CONFIG_HIGHMEM
4857 enum zone_type zone_type; 4981 enum zone_type zone_type;
4858 4982
4859 for (zone_type = 0; zone_type <= ZONE_NORMAL; zone_type++) { 4983 if (N_MEMORY == N_NORMAL_MEMORY)
4984 return;
4985
4986 for (zone_type = 0; zone_type <= ZONE_MOVABLE - 1; zone_type++) {
4860 struct zone *zone = &pgdat->node_zones[zone_type]; 4987 struct zone *zone = &pgdat->node_zones[zone_type];
4861 if (zone->present_pages) { 4988 if (zone->present_pages) {
4862 node_set_state(zone_to_nid(zone), N_NORMAL_MEMORY); 4989 node_set_state(nid, N_HIGH_MEMORY);
4990 if (N_NORMAL_MEMORY != N_HIGH_MEMORY &&
4991 zone_type <= ZONE_NORMAL)
4992 node_set_state(nid, N_NORMAL_MEMORY);
4863 break; 4993 break;
4864 } 4994 }
4865 } 4995 }
4866#endif
4867} 4996}
4868 4997
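With N_MEMORY in place, code that only cares about "nodes with any memory at all" (normal, high or movable) iterates that mask directly, as the conversions throughout this patch do. A minimal sketch:

	int nid;

	/*
	 * The masks nest: N_NORMAL_MEMORY is a subset of N_HIGH_MEMORY,
	 * which is a subset of N_MEMORY (which also covers nodes whose
	 * memory is entirely movable).
	 */
	for_each_node_state(nid, N_MEMORY)
		pr_info("node %d has memory\n", nid);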
4869/** 4998/**
@@ -4946,8 +5075,8 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)
4946 5075
4947 /* Any memory on that node */ 5076 /* Any memory on that node */
4948 if (pgdat->node_present_pages) 5077 if (pgdat->node_present_pages)
4949 node_set_state(nid, N_HIGH_MEMORY); 5078 node_set_state(nid, N_MEMORY);
4950 check_for_regular_memory(pgdat); 5079 check_for_memory(pgdat, nid);
4951 } 5080 }
4952} 5081}
4953 5082
@@ -5175,10 +5304,6 @@ static void __setup_per_zone_wmarks(void)
5175 zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + (tmp >> 2); 5304 zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + (tmp >> 2);
5176 zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1); 5305 zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1);
5177 5306
5178 zone->watermark[WMARK_MIN] += cma_wmark_pages(zone);
5179 zone->watermark[WMARK_LOW] += cma_wmark_pages(zone);
5180 zone->watermark[WMARK_HIGH] += cma_wmark_pages(zone);
5181
5182 setup_zone_migrate_reserve(zone); 5307 setup_zone_migrate_reserve(zone);
5183 spin_unlock_irqrestore(&zone->lock, flags); 5308 spin_unlock_irqrestore(&zone->lock, flags);
5184 } 5309 }
@@ -5576,7 +5701,8 @@ void set_pageblock_flags_group(struct page *page, unsigned long flags,
5576 * MIGRATE_MOVABLE block might include unmovable pages. It means you can't 5701 * MIGRATE_MOVABLE block might include unmovable pages. It means you can't
5577 * expect this function should be exact. 5702 * expect this function should be exact.
5578 */ 5703 */
5579bool has_unmovable_pages(struct zone *zone, struct page *page, int count) 5704bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
5705 bool skip_hwpoisoned_pages)
5580{ 5706{
5581 unsigned long pfn, iter, found; 5707 unsigned long pfn, iter, found;
5582 int mt; 5708 int mt;
@@ -5611,6 +5737,13 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count)
5611 continue; 5737 continue;
5612 } 5738 }
5613 5739
5740 /*
 5741		 * The HWPoisoned page may not be in the buddy system, and
5742 * page_count() is not 0.
5743 */
5744 if (skip_hwpoisoned_pages && PageHWPoison(page))
5745 continue;
5746
5614 if (!PageLRU(page)) 5747 if (!PageLRU(page))
5615 found++; 5748 found++;
5616 /* 5749 /*
@@ -5653,7 +5786,7 @@ bool is_pageblock_removable_nolock(struct page *page)
5653 zone->zone_start_pfn + zone->spanned_pages <= pfn) 5786 zone->zone_start_pfn + zone->spanned_pages <= pfn)
5654 return false; 5787 return false;
5655 5788
5656 return !has_unmovable_pages(zone, page, 0); 5789 return !has_unmovable_pages(zone, page, 0, true);
5657} 5790}
5658 5791
5659#ifdef CONFIG_CMA 5792#ifdef CONFIG_CMA
@@ -5680,7 +5813,7 @@ static int __alloc_contig_migrate_range(struct compact_control *cc,
5680 unsigned int tries = 0; 5813 unsigned int tries = 0;
5681 int ret = 0; 5814 int ret = 0;
5682 5815
5683 migrate_prep_local(); 5816 migrate_prep();
5684 5817
5685 while (pfn < end || !list_empty(&cc->migratepages)) { 5818 while (pfn < end || !list_empty(&cc->migratepages)) {
5686 if (fatal_signal_pending(current)) { 5819 if (fatal_signal_pending(current)) {
@@ -5708,61 +5841,14 @@ static int __alloc_contig_migrate_range(struct compact_control *cc,
5708 5841
5709 ret = migrate_pages(&cc->migratepages, 5842 ret = migrate_pages(&cc->migratepages,
5710 alloc_migrate_target, 5843 alloc_migrate_target,
5711 0, false, MIGRATE_SYNC); 5844 0, false, MIGRATE_SYNC,
5845 MR_CMA);
5712 } 5846 }
5713 5847
5714 putback_lru_pages(&cc->migratepages); 5848 putback_movable_pages(&cc->migratepages);
5715 return ret > 0 ? 0 : ret; 5849 return ret > 0 ? 0 : ret;
5716} 5850}
5717 5851
5718/*
5719 * Update zone's cma pages counter used for watermark level calculation.
5720 */
5721static inline void __update_cma_watermarks(struct zone *zone, int count)
5722{
5723 unsigned long flags;
5724 spin_lock_irqsave(&zone->lock, flags);
5725 zone->min_cma_pages += count;
5726 spin_unlock_irqrestore(&zone->lock, flags);
5727 setup_per_zone_wmarks();
5728}
5729
5730/*
5731 * Trigger memory pressure bump to reclaim some pages in order to be able to
5732 * allocate 'count' pages in single page units. Does similar work as
5733 *__alloc_pages_slowpath() function.
5734 */
5735static int __reclaim_pages(struct zone *zone, gfp_t gfp_mask, int count)
5736{
5737 enum zone_type high_zoneidx = gfp_zone(gfp_mask);
5738 struct zonelist *zonelist = node_zonelist(0, gfp_mask);
5739 int did_some_progress = 0;
5740 int order = 1;
5741
5742 /*
5743 * Increase level of watermarks to force kswapd do his job
5744 * to stabilise at new watermark level.
5745 */
5746 __update_cma_watermarks(zone, count);
5747
5748 /* Obey watermarks as if the page was being allocated */
5749 while (!zone_watermark_ok(zone, 0, low_wmark_pages(zone), 0, 0)) {
5750 wake_all_kswapd(order, zonelist, high_zoneidx, zone_idx(zone));
5751
5752 did_some_progress = __perform_reclaim(gfp_mask, order, zonelist,
5753 NULL);
5754 if (!did_some_progress) {
5755 /* Exhausted what can be done so it's blamo time */
5756 out_of_memory(zonelist, gfp_mask, order, NULL, false);
5757 }
5758 }
5759
5760 /* Restore original watermark levels. */
5761 __update_cma_watermarks(zone, -count);
5762
5763 return count;
5764}
5765
5766/** 5852/**
5767 * alloc_contig_range() -- tries to allocate given range of pages 5853 * alloc_contig_range() -- tries to allocate given range of pages
5768 * @start: start PFN to allocate 5854 * @start: start PFN to allocate
@@ -5786,7 +5872,6 @@ static int __reclaim_pages(struct zone *zone, gfp_t gfp_mask, int count)
5786int alloc_contig_range(unsigned long start, unsigned long end, 5872int alloc_contig_range(unsigned long start, unsigned long end,
5787 unsigned migratetype) 5873 unsigned migratetype)
5788{ 5874{
5789 struct zone *zone = page_zone(pfn_to_page(start));
5790 unsigned long outer_start, outer_end; 5875 unsigned long outer_start, outer_end;
5791 int ret = 0, order; 5876 int ret = 0, order;
5792 5877
@@ -5824,7 +5909,8 @@ int alloc_contig_range(unsigned long start, unsigned long end,
5824 */ 5909 */
5825 5910
5826 ret = start_isolate_page_range(pfn_max_align_down(start), 5911 ret = start_isolate_page_range(pfn_max_align_down(start),
5827 pfn_max_align_up(end), migratetype); 5912 pfn_max_align_up(end), migratetype,
5913 false);
5828 if (ret) 5914 if (ret)
5829 return ret; 5915 return ret;
5830 5916
@@ -5863,18 +5949,13 @@ int alloc_contig_range(unsigned long start, unsigned long end,
5863 } 5949 }
5864 5950
5865 /* Make sure the range is really isolated. */ 5951 /* Make sure the range is really isolated. */
5866 if (test_pages_isolated(outer_start, end)) { 5952 if (test_pages_isolated(outer_start, end, false)) {
5867 pr_warn("alloc_contig_range test_pages_isolated(%lx, %lx) failed\n", 5953 pr_warn("alloc_contig_range test_pages_isolated(%lx, %lx) failed\n",
5868 outer_start, end); 5954 outer_start, end);
5869 ret = -EBUSY; 5955 ret = -EBUSY;
5870 goto done; 5956 goto done;
5871 } 5957 }
5872 5958
5873 /*
5874 * Reclaim enough pages to make sure that contiguous allocation
5875 * will not starve the system.
5876 */
5877 __reclaim_pages(zone, GFP_HIGHUSER_MOVABLE, end-start);
5878 5959
5879 /* Grab isolated pages from freelists. */ 5960 /* Grab isolated pages from freelists. */
5880 outer_end = isolate_freepages_range(&cc, outer_start, end); 5961 outer_end = isolate_freepages_range(&cc, outer_start, end);
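With the artificial watermark bump gone, alloc_contig_range() leans entirely on the reclaim and compaction its callees already perform. A hypothetical CMA-style caller, reduced to the bare minimum and assuming the range is suitably pageblock-aligned and within a single zone:

	int ret;

	ret = alloc_contig_range(start_pfn, start_pfn + nr_pages, MIGRATE_CMA);
	if (ret)
		return ret;

	/* ... use the physically contiguous pages ... */

	free_contig_range(start_pfn, nr_pages);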
@@ -5932,7 +6013,6 @@ void __meminit zone_pcp_update(struct zone *zone)
5932} 6013}
5933#endif 6014#endif
5934 6015
5935#ifdef CONFIG_MEMORY_HOTREMOVE
5936void zone_pcp_reset(struct zone *zone) 6016void zone_pcp_reset(struct zone *zone)
5937{ 6017{
5938 unsigned long flags; 6018 unsigned long flags;
@@ -5952,6 +6032,7 @@ void zone_pcp_reset(struct zone *zone)
5952 local_irq_restore(flags); 6032 local_irq_restore(flags);
5953} 6033}
5954 6034
6035#ifdef CONFIG_MEMORY_HOTREMOVE
5955/* 6036/*
5956 * All pages in the range must be isolated before calling this. 6037 * All pages in the range must be isolated before calling this.
5957 */ 6038 */
@@ -5978,6 +6059,16 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
5978 continue; 6059 continue;
5979 } 6060 }
5980 page = pfn_to_page(pfn); 6061 page = pfn_to_page(pfn);
6062 /*
 6063		 * The HWPoisoned page may not be in the buddy system, and
6064 * page_count() is not 0.
6065 */
6066 if (unlikely(!PageBuddy(page) && PageHWPoison(page))) {
6067 pfn++;
6068 SetPageReserved(page);
6069 continue;
6070 }
6071
5981 BUG_ON(page_count(page)); 6072 BUG_ON(page_count(page));
5982 BUG_ON(!PageBuddy(page)); 6073 BUG_ON(!PageBuddy(page));
5983 order = page_order(page); 6074 order = page_order(page);
@@ -5988,8 +6079,6 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
5988 list_del(&page->lru); 6079 list_del(&page->lru);
5989 rmv_page_order(page); 6080 rmv_page_order(page);
5990 zone->free_area[order].nr_free--; 6081 zone->free_area[order].nr_free--;
5991 __mod_zone_page_state(zone, NR_FREE_PAGES,
5992 - (1UL << order));
5993 for (i = 0; i < (1 << order); i++) 6082 for (i = 0; i < (1 << order); i++)
5994 SetPageReserved((page+i)); 6083 SetPageReserved((page+i));
5995 pfn += (1 << order); 6084 pfn += (1 << order);
diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c
index 5ddad0c6daa6..6d757e3a872a 100644
--- a/mm/page_cgroup.c
+++ b/mm/page_cgroup.c
@@ -251,6 +251,9 @@ static int __meminit page_cgroup_callback(struct notifier_block *self,
251 mn->nr_pages, mn->status_change_nid); 251 mn->nr_pages, mn->status_change_nid);
252 break; 252 break;
253 case MEM_CANCEL_ONLINE: 253 case MEM_CANCEL_ONLINE:
254 offline_page_cgroup(mn->start_pfn,
255 mn->nr_pages, mn->status_change_nid);
256 break;
254 case MEM_GOING_OFFLINE: 257 case MEM_GOING_OFFLINE:
255 break; 258 break;
256 case MEM_ONLINE: 259 case MEM_ONLINE:
@@ -271,7 +274,7 @@ void __init page_cgroup_init(void)
271 if (mem_cgroup_disabled()) 274 if (mem_cgroup_disabled())
272 return; 275 return;
273 276
274 for_each_node_state(nid, N_HIGH_MEMORY) { 277 for_each_node_state(nid, N_MEMORY) {
275 unsigned long start_pfn, end_pfn; 278 unsigned long start_pfn, end_pfn;
276 279
277 start_pfn = node_start_pfn(nid); 280 start_pfn = node_start_pfn(nid);
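The MEM_CANCEL_ONLINE case added above matters because it used to fall through to MEM_GOING_OFFLINE, which is a no-op here, so the page_cgroup arrays allocated at MEM_GOING_ONLINE were left behind whenever onlining was aborted. The resulting callback behaves roughly as follows (sketch with the cases condensed; the actual code keeps MEM_OFFLINE and MEM_CANCEL_ONLINE as separate arms):

	switch (action) {
	case MEM_GOING_ONLINE:
		ret = online_page_cgroup(mn->start_pfn, mn->nr_pages,
					 mn->status_change_nid);
		break;
	case MEM_OFFLINE:
	case MEM_CANCEL_ONLINE:
		/* tear down whatever MEM_GOING_ONLINE set up */
		offline_page_cgroup(mn->start_pfn, mn->nr_pages,
				    mn->status_change_nid);
		break;
	case MEM_GOING_OFFLINE:
	case MEM_ONLINE:
	case MEM_CANCEL_OFFLINE:
		break;
	}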
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index f2f5b4818e94..9d2264ea4606 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -30,7 +30,7 @@ static void restore_pageblock_isolate(struct page *page, int migratetype)
30 zone->nr_pageblock_isolate--; 30 zone->nr_pageblock_isolate--;
31} 31}
32 32
33int set_migratetype_isolate(struct page *page) 33int set_migratetype_isolate(struct page *page, bool skip_hwpoisoned_pages)
34{ 34{
35 struct zone *zone; 35 struct zone *zone;
36 unsigned long flags, pfn; 36 unsigned long flags, pfn;
@@ -66,7 +66,8 @@ int set_migratetype_isolate(struct page *page)
66 * FIXME: Now, memory hotplug doesn't call shrink_slab() by itself. 66 * FIXME: Now, memory hotplug doesn't call shrink_slab() by itself.
67 * We just check MOVABLE pages. 67 * We just check MOVABLE pages.
68 */ 68 */
69 if (!has_unmovable_pages(zone, page, arg.pages_found)) 69 if (!has_unmovable_pages(zone, page, arg.pages_found,
70 skip_hwpoisoned_pages))
70 ret = 0; 71 ret = 0;
71 72
72 /* 73 /*
@@ -134,7 +135,7 @@ __first_valid_page(unsigned long pfn, unsigned long nr_pages)
134 * Returns 0 on success and -EBUSY if any part of range cannot be isolated. 135 * Returns 0 on success and -EBUSY if any part of range cannot be isolated.
135 */ 136 */
136int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, 137int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
137 unsigned migratetype) 138 unsigned migratetype, bool skip_hwpoisoned_pages)
138{ 139{
139 unsigned long pfn; 140 unsigned long pfn;
140 unsigned long undo_pfn; 141 unsigned long undo_pfn;
@@ -147,7 +148,8 @@ int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
147 pfn < end_pfn; 148 pfn < end_pfn;
148 pfn += pageblock_nr_pages) { 149 pfn += pageblock_nr_pages) {
149 page = __first_valid_page(pfn, pageblock_nr_pages); 150 page = __first_valid_page(pfn, pageblock_nr_pages);
150 if (page && set_migratetype_isolate(page)) { 151 if (page &&
152 set_migratetype_isolate(page, skip_hwpoisoned_pages)) {
151 undo_pfn = pfn; 153 undo_pfn = pfn;
152 goto undo; 154 goto undo;
153 } 155 }
@@ -190,7 +192,8 @@ int undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
190 * Returns 1 if all pages in the range are isolated. 192 * Returns 1 if all pages in the range are isolated.
191 */ 193 */
192static int 194static int
193__test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn) 195__test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn,
196 bool skip_hwpoisoned_pages)
194{ 197{
195 struct page *page; 198 struct page *page;
196 199
@@ -220,6 +223,14 @@ __test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn)
220 else if (page_count(page) == 0 && 223 else if (page_count(page) == 0 &&
221 get_freepage_migratetype(page) == MIGRATE_ISOLATE) 224 get_freepage_migratetype(page) == MIGRATE_ISOLATE)
222 pfn += 1; 225 pfn += 1;
226 else if (skip_hwpoisoned_pages && PageHWPoison(page)) {
227 /*
 228				 * The HWPoisoned page may not be in the buddy
229 * system, and page_count() is not 0.
230 */
231 pfn++;
232 continue;
233 }
223 else 234 else
224 break; 235 break;
225 } 236 }
@@ -228,7 +239,8 @@ __test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn)
228 return 1; 239 return 1;
229} 240}
230 241
231int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn) 242int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn,
243 bool skip_hwpoisoned_pages)
232{ 244{
233 unsigned long pfn, flags; 245 unsigned long pfn, flags;
234 struct page *page; 246 struct page *page;
@@ -251,7 +263,8 @@ int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn)
251 /* Check all pages are free or Marked as ISOLATED */ 263 /* Check all pages are free or Marked as ISOLATED */
252 zone = page_zone(page); 264 zone = page_zone(page);
253 spin_lock_irqsave(&zone->lock, flags); 265 spin_lock_irqsave(&zone->lock, flags);
254 ret = __test_page_isolated_in_pageblock(start_pfn, end_pfn); 266 ret = __test_page_isolated_in_pageblock(start_pfn, end_pfn,
267 skip_hwpoisoned_pages);
255 spin_unlock_irqrestore(&zone->lock, flags); 268 spin_unlock_irqrestore(&zone->lock, flags);
256 return ret ? 0 : -EBUSY; 269 return ret ? 0 : -EBUSY;
257} 270}
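The new flag lets memory offlining treat hardware-poisoned pages as if they were already isolated (they are unmapped and will never be handed out again), while CMA keeps the stricter check: alloc_contig_range() above passes false. The offline-side usage is an assumption based on this interface, sketched here as a hypothetical helper:

static int isolate_range_for_offline(unsigned long start_pfn,
				     unsigned long end_pfn)
{
	int ret;

	/* tolerate HWPoison pages: skip_hwpoisoned_pages = true */
	ret = start_isolate_page_range(start_pfn, end_pfn,
				       MIGRATE_MOVABLE, true);
	if (ret)
		return ret;

	/* ... migrate the movable pages out of the range ... */

	if (test_pages_isolated(start_pfn, end_pfn, true))
		return -EBUSY;	/* something other than poison is still busy */
	return 0;
}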
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index 6c118d012bb5..35aa294656cd 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -58,7 +58,7 @@ again:
58 if (!walk->pte_entry) 58 if (!walk->pte_entry)
59 continue; 59 continue;
60 60
61 split_huge_page_pmd(walk->mm, pmd); 61 split_huge_page_pmd_mm(walk->mm, addr, pmd);
62 if (pmd_none_or_trans_huge_or_clear_bad(pmd)) 62 if (pmd_none_or_trans_huge_or_clear_bad(pmd))
63 goto again; 63 goto again;
64 err = walk_pte_range(pmd, addr, next, walk); 64 err = walk_pte_range(pmd, addr, next, walk);
diff --git a/mm/percpu.c b/mm/percpu.c
index ddc5efb9c5bb..8c8e08f3a692 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -631,7 +631,7 @@ static void pcpu_free_chunk(struct pcpu_chunk *chunk)
631 if (!chunk) 631 if (!chunk)
632 return; 632 return;
633 pcpu_mem_free(chunk->map, chunk->map_alloc * sizeof(chunk->map[0])); 633 pcpu_mem_free(chunk->map, chunk->map_alloc * sizeof(chunk->map[0]));
634 kfree(chunk); 634 pcpu_mem_free(chunk, pcpu_chunk_struct_size);
635} 635}
636 636
637/* 637/*
@@ -1380,6 +1380,9 @@ enum pcpu_fc pcpu_chosen_fc __initdata = PCPU_FC_AUTO;
1380 1380
1381static int __init percpu_alloc_setup(char *str) 1381static int __init percpu_alloc_setup(char *str)
1382{ 1382{
1383 if (!str)
1384 return -EINVAL;
1385
1383 if (0) 1386 if (0)
1384 /* nada */; 1387 /* nada */;
1385#ifdef CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK 1388#ifdef CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK
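The chunk itself is allocated with pcpu_mem_zalloc(pcpu_chunk_struct_size), which, as far as the surrounding code suggests, falls back to vzalloc() once the size exceeds a page, so freeing it with kfree() was wrong in that case. The fix simply makes the release path mirror the allocation:

	/* allocation side, in pcpu_alloc_chunk(): */
	chunk = pcpu_mem_zalloc(pcpu_chunk_struct_size);

	/* matching release: pcpu_mem_free() picks kfree() or vfree() by size */
	pcpu_mem_free(chunk, pcpu_chunk_struct_size);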
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
index e642627da6b7..0c8323fe6c8f 100644
--- a/mm/pgtable-generic.c
+++ b/mm/pgtable-generic.c
@@ -12,8 +12,8 @@
12 12
13#ifndef __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS 13#ifndef __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
14/* 14/*
15 * Only sets the access flags (dirty, accessed, and 15 * Only sets the access flags (dirty, accessed), as well as write
16 * writable). Furthermore, we know it always gets set to a "more 16 * permission. Furthermore, we know it always gets set to a "more
17 * permissive" setting, which allows most architectures to optimize 17 * permissive" setting, which allows most architectures to optimize
18 * this. We return whether the PTE actually changed, which in turn 18 * this. We return whether the PTE actually changed, which in turn
19 * instructs the caller to do things like update__mmu_cache. This 19 * instructs the caller to do things like update__mmu_cache. This
@@ -27,7 +27,7 @@ int ptep_set_access_flags(struct vm_area_struct *vma,
27 int changed = !pte_same(*ptep, entry); 27 int changed = !pte_same(*ptep, entry);
28 if (changed) { 28 if (changed) {
29 set_pte_at(vma->vm_mm, address, ptep, entry); 29 set_pte_at(vma->vm_mm, address, ptep, entry);
30 flush_tlb_page(vma, address); 30 flush_tlb_fix_spurious_fault(vma, address);
31 } 31 }
32 return changed; 32 return changed;
33} 33}
@@ -88,7 +88,8 @@ pte_t ptep_clear_flush(struct vm_area_struct *vma, unsigned long address,
88{ 88{
89 pte_t pte; 89 pte_t pte;
90 pte = ptep_get_and_clear((vma)->vm_mm, address, ptep); 90 pte = ptep_get_and_clear((vma)->vm_mm, address, ptep);
91 flush_tlb_page(vma, address); 91 if (pte_accessible(pte))
92 flush_tlb_page(vma, address);
92 return pte; 93 return pte;
93} 94}
94#endif 95#endif
diff --git a/mm/rmap.c b/mm/rmap.c
index 2ee1ef0f317b..2c78f8cadc95 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -24,7 +24,7 @@
24 * mm->mmap_sem 24 * mm->mmap_sem
25 * page->flags PG_locked (lock_page) 25 * page->flags PG_locked (lock_page)
26 * mapping->i_mmap_mutex 26 * mapping->i_mmap_mutex
27 * anon_vma->mutex 27 * anon_vma->rwsem
28 * mm->page_table_lock or pte_lock 28 * mm->page_table_lock or pte_lock
29 * zone->lru_lock (in mark_page_accessed, isolate_lru_page) 29 * zone->lru_lock (in mark_page_accessed, isolate_lru_page)
30 * swap_lock (in swap_duplicate, swap_info_get) 30 * swap_lock (in swap_duplicate, swap_info_get)
@@ -37,7 +37,7 @@
37 * in arch-dependent flush_dcache_mmap_lock, 37 * in arch-dependent flush_dcache_mmap_lock,
38 * within bdi.wb->list_lock in __sync_single_inode) 38 * within bdi.wb->list_lock in __sync_single_inode)
39 * 39 *
40 * anon_vma->mutex,mapping->i_mutex (memory_failure, collect_procs_anon) 40 * anon_vma->rwsem,mapping->i_mutex (memory_failure, collect_procs_anon)
41 * ->tasklist_lock 41 * ->tasklist_lock
42 * pte map lock 42 * pte map lock
43 */ 43 */
@@ -87,24 +87,24 @@ static inline void anon_vma_free(struct anon_vma *anon_vma)
87 VM_BUG_ON(atomic_read(&anon_vma->refcount)); 87 VM_BUG_ON(atomic_read(&anon_vma->refcount));
88 88
89 /* 89 /*
90 * Synchronize against page_lock_anon_vma() such that 90 * Synchronize against page_lock_anon_vma_read() such that
91 * we can safely hold the lock without the anon_vma getting 91 * we can safely hold the lock without the anon_vma getting
92 * freed. 92 * freed.
93 * 93 *
94 * Relies on the full mb implied by the atomic_dec_and_test() from 94 * Relies on the full mb implied by the atomic_dec_and_test() from
95 * put_anon_vma() against the acquire barrier implied by 95 * put_anon_vma() against the acquire barrier implied by
96 * mutex_trylock() from page_lock_anon_vma(). This orders: 96 * down_read_trylock() from page_lock_anon_vma_read(). This orders:
97 * 97 *
98 * page_lock_anon_vma() VS put_anon_vma() 98 * page_lock_anon_vma_read() VS put_anon_vma()
99 * mutex_trylock() atomic_dec_and_test() 99 * down_read_trylock() atomic_dec_and_test()
100 * LOCK MB 100 * LOCK MB
101 * atomic_read() mutex_is_locked() 101 * atomic_read() rwsem_is_locked()
102 * 102 *
103 * LOCK should suffice since the actual taking of the lock must 103 * LOCK should suffice since the actual taking of the lock must
104 * happen _before_ what follows. 104 * happen _before_ what follows.
105 */ 105 */
106 if (mutex_is_locked(&anon_vma->root->mutex)) { 106 if (rwsem_is_locked(&anon_vma->root->rwsem)) {
107 anon_vma_lock(anon_vma); 107 anon_vma_lock_write(anon_vma);
108 anon_vma_unlock(anon_vma); 108 anon_vma_unlock(anon_vma);
109 } 109 }
110 110
@@ -146,7 +146,7 @@ static void anon_vma_chain_link(struct vm_area_struct *vma,
146 * allocate a new one. 146 * allocate a new one.
147 * 147 *
148 * Anon-vma allocations are very subtle, because we may have 148 * Anon-vma allocations are very subtle, because we may have
149 * optimistically looked up an anon_vma in page_lock_anon_vma() 149 * optimistically looked up an anon_vma in page_lock_anon_vma_read()
150 * and that may actually touch the spinlock even in the newly 150 * and that may actually touch the spinlock even in the newly
151 * allocated vma (it depends on RCU to make sure that the 151 * allocated vma (it depends on RCU to make sure that the
152 * anon_vma isn't actually destroyed). 152 * anon_vma isn't actually destroyed).
@@ -181,7 +181,7 @@ int anon_vma_prepare(struct vm_area_struct *vma)
181 allocated = anon_vma; 181 allocated = anon_vma;
182 } 182 }
183 183
184 anon_vma_lock(anon_vma); 184 anon_vma_lock_write(anon_vma);
185 /* page_table_lock to protect against threads */ 185 /* page_table_lock to protect against threads */
186 spin_lock(&mm->page_table_lock); 186 spin_lock(&mm->page_table_lock);
187 if (likely(!vma->anon_vma)) { 187 if (likely(!vma->anon_vma)) {
@@ -219,9 +219,9 @@ static inline struct anon_vma *lock_anon_vma_root(struct anon_vma *root, struct
219 struct anon_vma *new_root = anon_vma->root; 219 struct anon_vma *new_root = anon_vma->root;
220 if (new_root != root) { 220 if (new_root != root) {
221 if (WARN_ON_ONCE(root)) 221 if (WARN_ON_ONCE(root))
222 mutex_unlock(&root->mutex); 222 up_write(&root->rwsem);
223 root = new_root; 223 root = new_root;
224 mutex_lock(&root->mutex); 224 down_write(&root->rwsem);
225 } 225 }
226 return root; 226 return root;
227} 227}
@@ -229,7 +229,7 @@ static inline struct anon_vma *lock_anon_vma_root(struct anon_vma *root, struct
229static inline void unlock_anon_vma_root(struct anon_vma *root) 229static inline void unlock_anon_vma_root(struct anon_vma *root)
230{ 230{
231 if (root) 231 if (root)
232 mutex_unlock(&root->mutex); 232 up_write(&root->rwsem);
233} 233}
234 234
235/* 235/*
@@ -306,7 +306,7 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
306 get_anon_vma(anon_vma->root); 306 get_anon_vma(anon_vma->root);
307 /* Mark this anon_vma as the one where our new (COWed) pages go. */ 307 /* Mark this anon_vma as the one where our new (COWed) pages go. */
308 vma->anon_vma = anon_vma; 308 vma->anon_vma = anon_vma;
309 anon_vma_lock(anon_vma); 309 anon_vma_lock_write(anon_vma);
310 anon_vma_chain_link(vma, avc, anon_vma); 310 anon_vma_chain_link(vma, avc, anon_vma);
311 anon_vma_unlock(anon_vma); 311 anon_vma_unlock(anon_vma);
312 312
@@ -349,7 +349,7 @@ void unlink_anon_vmas(struct vm_area_struct *vma)
349 /* 349 /*
350 * Iterate the list once more, it now only contains empty and unlinked 350 * Iterate the list once more, it now only contains empty and unlinked
351 * anon_vmas, destroy them. Could not do before due to __put_anon_vma() 351 * anon_vmas, destroy them. Could not do before due to __put_anon_vma()
352 * needing to acquire the anon_vma->root->mutex. 352 * needing to write-acquire the anon_vma->root->rwsem.
353 */ 353 */
354 list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) { 354 list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) {
355 struct anon_vma *anon_vma = avc->anon_vma; 355 struct anon_vma *anon_vma = avc->anon_vma;
@@ -365,7 +365,7 @@ static void anon_vma_ctor(void *data)
365{ 365{
366 struct anon_vma *anon_vma = data; 366 struct anon_vma *anon_vma = data;
367 367
368 mutex_init(&anon_vma->mutex); 368 init_rwsem(&anon_vma->rwsem);
369 atomic_set(&anon_vma->refcount, 0); 369 atomic_set(&anon_vma->refcount, 0);
370 anon_vma->rb_root = RB_ROOT; 370 anon_vma->rb_root = RB_ROOT;
371} 371}
@@ -442,7 +442,7 @@ out:
442 * atomic op -- the trylock. If we fail the trylock, we fall back to getting a 442 * atomic op -- the trylock. If we fail the trylock, we fall back to getting a
443 * reference like with page_get_anon_vma() and then block on the mutex. 443 * reference like with page_get_anon_vma() and then block on the mutex.
444 */ 444 */
445struct anon_vma *page_lock_anon_vma(struct page *page) 445struct anon_vma *page_lock_anon_vma_read(struct page *page)
446{ 446{
447 struct anon_vma *anon_vma = NULL; 447 struct anon_vma *anon_vma = NULL;
448 struct anon_vma *root_anon_vma; 448 struct anon_vma *root_anon_vma;
@@ -457,14 +457,14 @@ struct anon_vma *page_lock_anon_vma(struct page *page)
457 457
458 anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON); 458 anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON);
459 root_anon_vma = ACCESS_ONCE(anon_vma->root); 459 root_anon_vma = ACCESS_ONCE(anon_vma->root);
460 if (mutex_trylock(&root_anon_vma->mutex)) { 460 if (down_read_trylock(&root_anon_vma->rwsem)) {
461 /* 461 /*
462 * If the page is still mapped, then this anon_vma is still 462 * If the page is still mapped, then this anon_vma is still
463 * its anon_vma, and holding the mutex ensures that it will 463 * its anon_vma, and holding the mutex ensures that it will
464 * not go away, see anon_vma_free(). 464 * not go away, see anon_vma_free().
465 */ 465 */
466 if (!page_mapped(page)) { 466 if (!page_mapped(page)) {
467 mutex_unlock(&root_anon_vma->mutex); 467 up_read(&root_anon_vma->rwsem);
468 anon_vma = NULL; 468 anon_vma = NULL;
469 } 469 }
470 goto out; 470 goto out;
@@ -484,15 +484,15 @@ struct anon_vma *page_lock_anon_vma(struct page *page)
484 484
485 /* we pinned the anon_vma, its safe to sleep */ 485 /* we pinned the anon_vma, its safe to sleep */
486 rcu_read_unlock(); 486 rcu_read_unlock();
487 anon_vma_lock(anon_vma); 487 anon_vma_lock_read(anon_vma);
488 488
489 if (atomic_dec_and_test(&anon_vma->refcount)) { 489 if (atomic_dec_and_test(&anon_vma->refcount)) {
490 /* 490 /*
491 * Oops, we held the last refcount, release the lock 491 * Oops, we held the last refcount, release the lock
492 * and bail -- can't simply use put_anon_vma() because 492 * and bail -- can't simply use put_anon_vma() because
493 * we'll deadlock on the anon_vma_lock() recursion. 493 * we'll deadlock on the anon_vma_lock_write() recursion.
494 */ 494 */
495 anon_vma_unlock(anon_vma); 495 anon_vma_unlock_read(anon_vma);
496 __put_anon_vma(anon_vma); 496 __put_anon_vma(anon_vma);
497 anon_vma = NULL; 497 anon_vma = NULL;
498 } 498 }
@@ -504,9 +504,9 @@ out:
504 return anon_vma; 504 return anon_vma;
505} 505}
506 506
507void page_unlock_anon_vma(struct anon_vma *anon_vma) 507void page_unlock_anon_vma_read(struct anon_vma *anon_vma)
508{ 508{
509 anon_vma_unlock(anon_vma); 509 anon_vma_unlock_read(anon_vma);
510} 510}
511 511
512/* 512/*
@@ -562,6 +562,27 @@ unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
562 return address; 562 return address;
563} 563}
564 564
565pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address)
566{
567 pgd_t *pgd;
568 pud_t *pud;
569 pmd_t *pmd = NULL;
570
571 pgd = pgd_offset(mm, address);
572 if (!pgd_present(*pgd))
573 goto out;
574
575 pud = pud_offset(pgd, address);
576 if (!pud_present(*pud))
577 goto out;
578
579 pmd = pmd_offset(pud, address);
580 if (!pmd_present(*pmd))
581 pmd = NULL;
582out:
583 return pmd;
584}
585
565/* 586/*
566 * Check that @page is mapped at @address into @mm. 587 * Check that @page is mapped at @address into @mm.
567 * 588 *
@@ -574,8 +595,6 @@ unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
574pte_t *__page_check_address(struct page *page, struct mm_struct *mm, 595pte_t *__page_check_address(struct page *page, struct mm_struct *mm,
575 unsigned long address, spinlock_t **ptlp, int sync) 596 unsigned long address, spinlock_t **ptlp, int sync)
576{ 597{
577 pgd_t *pgd;
578 pud_t *pud;
579 pmd_t *pmd; 598 pmd_t *pmd;
580 pte_t *pte; 599 pte_t *pte;
581 spinlock_t *ptl; 600 spinlock_t *ptl;
@@ -586,17 +605,10 @@ pte_t *__page_check_address(struct page *page, struct mm_struct *mm,
586 goto check; 605 goto check;
587 } 606 }
588 607
589 pgd = pgd_offset(mm, address); 608 pmd = mm_find_pmd(mm, address);
590 if (!pgd_present(*pgd)) 609 if (!pmd)
591 return NULL;
592
593 pud = pud_offset(pgd, address);
594 if (!pud_present(*pud))
595 return NULL; 610 return NULL;
596 611
597 pmd = pmd_offset(pud, address);
598 if (!pmd_present(*pmd))
599 return NULL;
600 if (pmd_trans_huge(*pmd)) 612 if (pmd_trans_huge(*pmd))
601 return NULL; 613 return NULL;
602 614
@@ -732,7 +744,7 @@ static int page_referenced_anon(struct page *page,
732 struct anon_vma_chain *avc; 744 struct anon_vma_chain *avc;
733 int referenced = 0; 745 int referenced = 0;
734 746
735 anon_vma = page_lock_anon_vma(page); 747 anon_vma = page_lock_anon_vma_read(page);
736 if (!anon_vma) 748 if (!anon_vma)
737 return referenced; 749 return referenced;
738 750
@@ -754,7 +766,7 @@ static int page_referenced_anon(struct page *page,
754 break; 766 break;
755 } 767 }
756 768
757 page_unlock_anon_vma(anon_vma); 769 page_unlock_anon_vma_read(anon_vma);
758 return referenced; 770 return referenced;
759} 771}
760 772
@@ -1139,9 +1151,11 @@ void page_remove_rmap(struct page *page)
1139 * containing the swap entry, but page not yet written to swap. 1151 * containing the swap entry, but page not yet written to swap.
1140 * 1152 *
1141 * And we can skip it on file pages, so long as the filesystem 1153 * And we can skip it on file pages, so long as the filesystem
1142 * participates in dirty tracking; but need to catch shm and tmpfs 1154 * participates in dirty tracking (note that this is not only an
1143	 * and ramfs pages which have been modified since creation by read 1155	 * optimization but also solves problems caused by the dirty flag in
1144	 * fault. 1156	 * the storage key getting set by a write from inside the kernel); but need to
1157 * catch shm and tmpfs and ramfs pages which have been modified since
1158 * creation by read fault.
1145 * 1159 *
1146 * Note that mapping must be decided above, before decrementing 1160 * Note that mapping must be decided above, before decrementing
1147 * mapcount (which luckily provides a barrier): once page is unmapped, 1161 * mapcount (which luckily provides a barrier): once page is unmapped,
@@ -1235,12 +1249,14 @@ int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
1235 update_hiwater_rss(mm); 1249 update_hiwater_rss(mm);
1236 1250
1237 if (PageHWPoison(page) && !(flags & TTU_IGNORE_HWPOISON)) { 1251 if (PageHWPoison(page) && !(flags & TTU_IGNORE_HWPOISON)) {
1238 if (PageAnon(page)) 1252 if (!PageHuge(page)) {
1239 dec_mm_counter(mm, MM_ANONPAGES); 1253 if (PageAnon(page))
1240 else 1254 dec_mm_counter(mm, MM_ANONPAGES);
1241 dec_mm_counter(mm, MM_FILEPAGES); 1255 else
1256 dec_mm_counter(mm, MM_FILEPAGES);
1257 }
1242 set_pte_at(mm, address, pte, 1258 set_pte_at(mm, address, pte,
1243 swp_entry_to_pte(make_hwpoison_entry(page))); 1259 swp_entry_to_pte(make_hwpoison_entry(page)));
1244 } else if (PageAnon(page)) { 1260 } else if (PageAnon(page)) {
1245 swp_entry_t entry = { .val = page_private(page) }; 1261 swp_entry_t entry = { .val = page_private(page) };
1246 1262
@@ -1299,7 +1315,7 @@ out_mlock:
1299 /* 1315 /*
1300 * We need mmap_sem locking, Otherwise VM_LOCKED check makes 1316 * We need mmap_sem locking, Otherwise VM_LOCKED check makes
1301 * unstable result and race. Plus, We can't wait here because 1317 * unstable result and race. Plus, We can't wait here because
1302 * we now hold anon_vma->mutex or mapping->i_mmap_mutex. 1318 * we now hold anon_vma->rwsem or mapping->i_mmap_mutex.
1303 * if trylock failed, the page remain in evictable lru and later 1319 * if trylock failed, the page remain in evictable lru and later
1304 * vmscan could retry to move the page to unevictable lru if the 1320 * vmscan could retry to move the page to unevictable lru if the
1305 * page is actually mlocked. 1321 * page is actually mlocked.
@@ -1345,8 +1361,6 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount,
1345 struct vm_area_struct *vma, struct page *check_page) 1361 struct vm_area_struct *vma, struct page *check_page)
1346{ 1362{
1347 struct mm_struct *mm = vma->vm_mm; 1363 struct mm_struct *mm = vma->vm_mm;
1348 pgd_t *pgd;
1349 pud_t *pud;
1350 pmd_t *pmd; 1364 pmd_t *pmd;
1351 pte_t *pte; 1365 pte_t *pte;
1352 pte_t pteval; 1366 pte_t pteval;
@@ -1366,16 +1380,8 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount,
1366 if (end > vma->vm_end) 1380 if (end > vma->vm_end)
1367 end = vma->vm_end; 1381 end = vma->vm_end;
1368 1382
1369 pgd = pgd_offset(mm, address); 1383 pmd = mm_find_pmd(mm, address);
1370 if (!pgd_present(*pgd)) 1384 if (!pmd)
1371 return ret;
1372
1373 pud = pud_offset(pgd, address);
1374 if (!pud_present(*pud))
1375 return ret;
1376
1377 pmd = pmd_offset(pud, address);
1378 if (!pmd_present(*pmd))
1379 return ret; 1385 return ret;
1380 1386
1381 mmun_start = address; 1387 mmun_start = address;
@@ -1474,7 +1480,7 @@ static int try_to_unmap_anon(struct page *page, enum ttu_flags flags)
1474 struct anon_vma_chain *avc; 1480 struct anon_vma_chain *avc;
1475 int ret = SWAP_AGAIN; 1481 int ret = SWAP_AGAIN;
1476 1482
1477 anon_vma = page_lock_anon_vma(page); 1483 anon_vma = page_lock_anon_vma_read(page);
1478 if (!anon_vma) 1484 if (!anon_vma)
1479 return ret; 1485 return ret;
1480 1486
@@ -1501,7 +1507,7 @@ static int try_to_unmap_anon(struct page *page, enum ttu_flags flags)
1501 break; 1507 break;
1502 } 1508 }
1503 1509
1504 page_unlock_anon_vma(anon_vma); 1510 page_unlock_anon_vma_read(anon_vma);
1505 return ret; 1511 return ret;
1506} 1512}
1507 1513
@@ -1696,7 +1702,7 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *,
1696 int ret = SWAP_AGAIN; 1702 int ret = SWAP_AGAIN;
1697 1703
1698 /* 1704 /*
1699 * Note: remove_migration_ptes() cannot use page_lock_anon_vma() 1705 * Note: remove_migration_ptes() cannot use page_lock_anon_vma_read()
1700 * because that depends on page_mapped(); but not all its usages 1706 * because that depends on page_mapped(); but not all its usages
1701 * are holding mmap_sem. Users without mmap_sem are required to 1707 * are holding mmap_sem. Users without mmap_sem are required to
1702 * take a reference count to prevent the anon_vma disappearing 1708 * take a reference count to prevent the anon_vma disappearing
@@ -1704,7 +1710,7 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *,
1704 anon_vma = page_anon_vma(page); 1710 anon_vma = page_anon_vma(page);
1705 if (!anon_vma) 1711 if (!anon_vma)
1706 return ret; 1712 return ret;
1707 anon_vma_lock(anon_vma); 1713 anon_vma_lock_read(anon_vma);
1708 anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) { 1714 anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) {
1709 struct vm_area_struct *vma = avc->vma; 1715 struct vm_area_struct *vma = avc->vma;
1710 unsigned long address = vma_address(page, vma); 1716 unsigned long address = vma_address(page, vma);
@@ -1712,7 +1718,7 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *,
1712 if (ret != SWAP_AGAIN) 1718 if (ret != SWAP_AGAIN)
1713 break; 1719 break;
1714 } 1720 }
1715 anon_vma_unlock(anon_vma); 1721 anon_vma_unlock_read(anon_vma);
1716 return ret; 1722 return ret;
1717} 1723}
1718 1724
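After the conversion, rmap walkers that only read the anon_vma interval tree take the root lock shared, mirroring rmap_walk_anon() and try_to_unmap_anon() above. The reader pattern, in outline:

	struct anon_vma *anon_vma;
	struct anon_vma_chain *avc;

	anon_vma = page_lock_anon_vma_read(page);	/* read side of root->rwsem */
	if (!anon_vma)
		return SWAP_AGAIN;

	anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) {
		/*
		 * avc->vma is one VMA that may map the page. Paths that
		 * modify the tree (anon_vma_prepare(), anon_vma_fork(), ...)
		 * now serialize against this walk via anon_vma_lock_write().
		 */
	}

	page_unlock_anon_vma_read(anon_vma);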
diff --git a/mm/shmem.c b/mm/shmem.c
index 50c5b8f3a359..5c90d84c2b02 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1715,6 +1715,96 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos,
1715 return error; 1715 return error;
1716} 1716}
1717 1717
1718/*
1719 * llseek SEEK_DATA or SEEK_HOLE through the radix_tree.
1720 */
1721static pgoff_t shmem_seek_hole_data(struct address_space *mapping,
1722 pgoff_t index, pgoff_t end, int whence)
1723{
1724 struct page *page;
1725 struct pagevec pvec;
1726 pgoff_t indices[PAGEVEC_SIZE];
1727 bool done = false;
1728 int i;
1729
1730 pagevec_init(&pvec, 0);
1731 pvec.nr = 1; /* start small: we may be there already */
1732 while (!done) {
1733 pvec.nr = shmem_find_get_pages_and_swap(mapping, index,
1734 pvec.nr, pvec.pages, indices);
1735 if (!pvec.nr) {
1736 if (whence == SEEK_DATA)
1737 index = end;
1738 break;
1739 }
1740 for (i = 0; i < pvec.nr; i++, index++) {
1741 if (index < indices[i]) {
1742 if (whence == SEEK_HOLE) {
1743 done = true;
1744 break;
1745 }
1746 index = indices[i];
1747 }
1748 page = pvec.pages[i];
1749 if (page && !radix_tree_exceptional_entry(page)) {
1750 if (!PageUptodate(page))
1751 page = NULL;
1752 }
1753 if (index >= end ||
1754 (page && whence == SEEK_DATA) ||
1755 (!page && whence == SEEK_HOLE)) {
1756 done = true;
1757 break;
1758 }
1759 }
1760 shmem_deswap_pagevec(&pvec);
1761 pagevec_release(&pvec);
1762 pvec.nr = PAGEVEC_SIZE;
1763 cond_resched();
1764 }
1765 return index;
1766}
1767
1768static loff_t shmem_file_llseek(struct file *file, loff_t offset, int whence)
1769{
1770 struct address_space *mapping = file->f_mapping;
1771 struct inode *inode = mapping->host;
1772 pgoff_t start, end;
1773 loff_t new_offset;
1774
1775 if (whence != SEEK_DATA && whence != SEEK_HOLE)
1776 return generic_file_llseek_size(file, offset, whence,
1777 MAX_LFS_FILESIZE, i_size_read(inode));
1778 mutex_lock(&inode->i_mutex);
1779 /* We're holding i_mutex so we can access i_size directly */
1780
1781 if (offset < 0)
1782 offset = -EINVAL;
1783 else if (offset >= inode->i_size)
1784 offset = -ENXIO;
1785 else {
1786 start = offset >> PAGE_CACHE_SHIFT;
1787 end = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
1788 new_offset = shmem_seek_hole_data(mapping, start, end, whence);
1789 new_offset <<= PAGE_CACHE_SHIFT;
1790 if (new_offset > offset) {
1791 if (new_offset < inode->i_size)
1792 offset = new_offset;
1793 else if (whence == SEEK_DATA)
1794 offset = -ENXIO;
1795 else
1796 offset = inode->i_size;
1797 }
1798 }
1799
1800 if (offset >= 0 && offset != file->f_pos) {
1801 file->f_pos = offset;
1802 file->f_version = 0;
1803 }
1804 mutex_unlock(&inode->i_mutex);
1805 return offset;
1806}
1807
1718static long shmem_fallocate(struct file *file, int mode, loff_t offset, 1808static long shmem_fallocate(struct file *file, int mode, loff_t offset,
1719 loff_t len) 1809 loff_t len)
1720{ 1810{
@@ -2586,7 +2676,7 @@ static const struct address_space_operations shmem_aops = {
2586static const struct file_operations shmem_file_operations = { 2676static const struct file_operations shmem_file_operations = {
2587 .mmap = shmem_mmap, 2677 .mmap = shmem_mmap,
2588#ifdef CONFIG_TMPFS 2678#ifdef CONFIG_TMPFS
2589 .llseek = generic_file_llseek, 2679 .llseek = shmem_file_llseek,
2590 .read = do_sync_read, 2680 .read = do_sync_read,
2591 .write = do_sync_write, 2681 .write = do_sync_write,
2592 .aio_read = shmem_file_aio_read, 2682 .aio_read = shmem_file_aio_read,
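With shmem_file_llseek() wired into shmem_file_operations, tmpfs now answers SEEK_DATA/SEEK_HOLE from the radix tree instead of treating the whole file as data, as generic_file_llseek() does. A small userspace check, assuming /dev/shm is tmpfs and the running kernel carries this change:

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <sys/types.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/dev/shm/seek-test", O_RDWR | O_CREAT | O_TRUNC, 0600);
	off_t data, hole;

	if (fd < 0)
		return 1;
	ftruncate(fd, 1 << 20);			/* 1 MiB file, all hole */
	pwrite(fd, "x", 1, 512 * 1024);		/* instantiate one page of data */

	data = lseek(fd, 0, SEEK_DATA);		/* expect ~512 KiB, page aligned */
	hole = lseek(fd, data, SEEK_HOLE);	/* expect the end of that page */
	printf("data at %lld, next hole at %lld\n",
	       (long long)data, (long long)hole);
	close(fd);
	return 0;
}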
diff --git a/mm/slab.c b/mm/slab.c
index 33d3363658df..e7667a3584bc 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -87,7 +87,6 @@
87 */ 87 */
88 88
89#include <linux/slab.h> 89#include <linux/slab.h>
90#include "slab.h"
91#include <linux/mm.h> 90#include <linux/mm.h>
92#include <linux/poison.h> 91#include <linux/poison.h>
93#include <linux/swap.h> 92#include <linux/swap.h>
@@ -128,6 +127,8 @@
128 127
129#include "internal.h" 128#include "internal.h"
130 129
130#include "slab.h"
131
131/* 132/*
132 * DEBUG - 1 for kmem_cache_create() to honour; SLAB_RED_ZONE & SLAB_POISON. 133 * DEBUG - 1 for kmem_cache_create() to honour; SLAB_RED_ZONE & SLAB_POISON.
133 * 0 for faster, smaller code (especially in the critical paths). 134 * 0 for faster, smaller code (especially in the critical paths).
@@ -162,23 +163,6 @@
162 */ 163 */
163static bool pfmemalloc_active __read_mostly; 164static bool pfmemalloc_active __read_mostly;
164 165
165/* Legal flag mask for kmem_cache_create(). */
166#if DEBUG
167# define CREATE_MASK (SLAB_RED_ZONE | \
168 SLAB_POISON | SLAB_HWCACHE_ALIGN | \
169 SLAB_CACHE_DMA | \
170 SLAB_STORE_USER | \
171 SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \
172 SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD | \
173 SLAB_DEBUG_OBJECTS | SLAB_NOLEAKTRACE | SLAB_NOTRACK)
174#else
175# define CREATE_MASK (SLAB_HWCACHE_ALIGN | \
176 SLAB_CACHE_DMA | \
177 SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \
178 SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD | \
179 SLAB_DEBUG_OBJECTS | SLAB_NOLEAKTRACE | SLAB_NOTRACK)
180#endif
181
182/* 166/*
183 * kmem_bufctl_t: 167 * kmem_bufctl_t:
184 * 168 *
@@ -564,15 +548,11 @@ static struct cache_names __initdata cache_names[] = {
564#undef CACHE 548#undef CACHE
565}; 549};
566 550
567static struct arraycache_init initarray_cache __initdata =
568 { {0, BOOT_CPUCACHE_ENTRIES, 1, 0} };
569static struct arraycache_init initarray_generic = 551static struct arraycache_init initarray_generic =
570 { {0, BOOT_CPUCACHE_ENTRIES, 1, 0} }; 552 { {0, BOOT_CPUCACHE_ENTRIES, 1, 0} };
571 553
572/* internal cache of cache description objs */ 554/* internal cache of cache description objs */
573static struct kmem_list3 *kmem_cache_nodelists[MAX_NUMNODES];
574static struct kmem_cache kmem_cache_boot = { 555static struct kmem_cache kmem_cache_boot = {
575 .nodelists = kmem_cache_nodelists,
576 .batchcount = 1, 556 .batchcount = 1,
577 .limit = BOOT_CPUCACHE_ENTRIES, 557 .limit = BOOT_CPUCACHE_ENTRIES,
578 .shared = 1, 558 .shared = 1,
@@ -662,6 +642,26 @@ static void init_node_lock_keys(int q)
662 } 642 }
663} 643}
664 644
645static void on_slab_lock_classes_node(struct kmem_cache *cachep, int q)
646{
647 struct kmem_list3 *l3;
648 l3 = cachep->nodelists[q];
649 if (!l3)
650 return;
651
652 slab_set_lock_classes(cachep, &on_slab_l3_key,
653 &on_slab_alc_key, q);
654}
655
656static inline void on_slab_lock_classes(struct kmem_cache *cachep)
657{
658 int node;
659
660 VM_BUG_ON(OFF_SLAB(cachep));
661 for_each_node(node)
662 on_slab_lock_classes_node(cachep, node);
663}
664
665static inline void init_lock_keys(void) 665static inline void init_lock_keys(void)
666{ 666{
667 int node; 667 int node;
@@ -678,6 +678,14 @@ static inline void init_lock_keys(void)
678{ 678{
679} 679}
680 680
681static inline void on_slab_lock_classes(struct kmem_cache *cachep)
682{
683}
684
685static inline void on_slab_lock_classes_node(struct kmem_cache *cachep, int node)
686{
687}
688
681static void slab_set_debugobj_lock_classes_node(struct kmem_cache *cachep, int node) 689static void slab_set_debugobj_lock_classes_node(struct kmem_cache *cachep, int node)
682{ 690{
683} 691}
@@ -1406,6 +1414,9 @@ static int __cpuinit cpuup_prepare(long cpu)
1406 free_alien_cache(alien); 1414 free_alien_cache(alien);
1407 if (cachep->flags & SLAB_DEBUG_OBJECTS) 1415 if (cachep->flags & SLAB_DEBUG_OBJECTS)
1408 slab_set_debugobj_lock_classes_node(cachep, node); 1416 slab_set_debugobj_lock_classes_node(cachep, node);
1417 else if (!OFF_SLAB(cachep) &&
1418 !(cachep->flags & SLAB_DESTROY_BY_RCU))
1419 on_slab_lock_classes_node(cachep, node);
1409 } 1420 }
1410 init_node_lock_keys(node); 1421 init_node_lock_keys(node);
1411 1422
@@ -1577,28 +1588,33 @@ static void __init set_up_list3s(struct kmem_cache *cachep, int index)
1577} 1588}
1578 1589
1579/* 1590/*
1591 * The memory after the last cpu cache pointer is used for the
 1592	 * nodelists pointer.
1593 */
1594static void setup_nodelists_pointer(struct kmem_cache *cachep)
1595{
1596 cachep->nodelists = (struct kmem_list3 **)&cachep->array[nr_cpu_ids];
1597}
1598
1599/*
1580 * Initialisation. Called after the page allocator have been initialised and 1600 * Initialisation. Called after the page allocator have been initialised and
1581 * before smp_init(). 1601 * before smp_init().
1582 */ 1602 */
1583void __init kmem_cache_init(void) 1603void __init kmem_cache_init(void)
1584{ 1604{
1585 size_t left_over;
1586 struct cache_sizes *sizes; 1605 struct cache_sizes *sizes;
1587 struct cache_names *names; 1606 struct cache_names *names;
1588 int i; 1607 int i;
1589 int order;
1590 int node;
1591 1608
1592 kmem_cache = &kmem_cache_boot; 1609 kmem_cache = &kmem_cache_boot;
1610 setup_nodelists_pointer(kmem_cache);
1593 1611
1594 if (num_possible_nodes() == 1) 1612 if (num_possible_nodes() == 1)
1595 use_alien_caches = 0; 1613 use_alien_caches = 0;
1596 1614
1597 for (i = 0; i < NUM_INIT_LISTS; i++) { 1615 for (i = 0; i < NUM_INIT_LISTS; i++)
1598 kmem_list3_init(&initkmem_list3[i]); 1616 kmem_list3_init(&initkmem_list3[i]);
1599 if (i < MAX_NUMNODES) 1617
1600 kmem_cache->nodelists[i] = NULL;
1601 }
1602 set_up_list3s(kmem_cache, CACHE_CACHE); 1618 set_up_list3s(kmem_cache, CACHE_CACHE);
1603 1619
1604 /* 1620 /*
@@ -1629,37 +1645,16 @@ void __init kmem_cache_init(void)
1629 * 6) Resize the head arrays of the kmalloc caches to their final sizes. 1645 * 6) Resize the head arrays of the kmalloc caches to their final sizes.
1630 */ 1646 */
1631 1647
1632 node = numa_mem_id();
1633
1634 /* 1) create the kmem_cache */ 1648 /* 1) create the kmem_cache */
1635 INIT_LIST_HEAD(&slab_caches);
1636 list_add(&kmem_cache->list, &slab_caches);
1637 kmem_cache->colour_off = cache_line_size();
1638 kmem_cache->array[smp_processor_id()] = &initarray_cache.cache;
1639 kmem_cache->nodelists[node] = &initkmem_list3[CACHE_CACHE + node];
1640 1649
1641 /* 1650 /*
1642 * struct kmem_cache size depends on nr_node_ids & nr_cpu_ids 1651 * struct kmem_cache size depends on nr_node_ids & nr_cpu_ids
1643 */ 1652 */
1644 kmem_cache->size = offsetof(struct kmem_cache, array[nr_cpu_ids]) + 1653 create_boot_cache(kmem_cache, "kmem_cache",
1645 nr_node_ids * sizeof(struct kmem_list3 *); 1654 offsetof(struct kmem_cache, array[nr_cpu_ids]) +
1646 kmem_cache->object_size = kmem_cache->size; 1655 nr_node_ids * sizeof(struct kmem_list3 *),
1647 kmem_cache->size = ALIGN(kmem_cache->object_size, 1656 SLAB_HWCACHE_ALIGN);
1648 cache_line_size()); 1657 list_add(&kmem_cache->list, &slab_caches);
1649 kmem_cache->reciprocal_buffer_size =
1650 reciprocal_value(kmem_cache->size);
1651
1652 for (order = 0; order < MAX_ORDER; order++) {
1653 cache_estimate(order, kmem_cache->size,
1654 cache_line_size(), 0, &left_over, &kmem_cache->num);
1655 if (kmem_cache->num)
1656 break;
1657 }
1658 BUG_ON(!kmem_cache->num);
1659 kmem_cache->gfporder = order;
1660 kmem_cache->colour = left_over / kmem_cache->colour_off;
1661 kmem_cache->slab_size = ALIGN(kmem_cache->num * sizeof(kmem_bufctl_t) +
1662 sizeof(struct slab), cache_line_size());
1663 1658
1664 /* 2+3) create the kmalloc caches */ 1659 /* 2+3) create the kmalloc caches */
1665 sizes = malloc_sizes; 1660 sizes = malloc_sizes;
@@ -1671,23 +1666,13 @@ void __init kmem_cache_init(void)
1671 * bug. 1666 * bug.
1672 */ 1667 */
1673 1668
1674 sizes[INDEX_AC].cs_cachep = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT); 1669 sizes[INDEX_AC].cs_cachep = create_kmalloc_cache(names[INDEX_AC].name,
1675 sizes[INDEX_AC].cs_cachep->name = names[INDEX_AC].name; 1670 sizes[INDEX_AC].cs_size, ARCH_KMALLOC_FLAGS);
1676 sizes[INDEX_AC].cs_cachep->size = sizes[INDEX_AC].cs_size; 1671
1677 sizes[INDEX_AC].cs_cachep->object_size = sizes[INDEX_AC].cs_size; 1672 if (INDEX_AC != INDEX_L3)
1678 sizes[INDEX_AC].cs_cachep->align = ARCH_KMALLOC_MINALIGN; 1673 sizes[INDEX_L3].cs_cachep =
1679 __kmem_cache_create(sizes[INDEX_AC].cs_cachep, ARCH_KMALLOC_FLAGS|SLAB_PANIC); 1674 create_kmalloc_cache(names[INDEX_L3].name,
1680 list_add(&sizes[INDEX_AC].cs_cachep->list, &slab_caches); 1675 sizes[INDEX_L3].cs_size, ARCH_KMALLOC_FLAGS);
1681
1682 if (INDEX_AC != INDEX_L3) {
1683 sizes[INDEX_L3].cs_cachep = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT);
1684 sizes[INDEX_L3].cs_cachep->name = names[INDEX_L3].name;
1685 sizes[INDEX_L3].cs_cachep->size = sizes[INDEX_L3].cs_size;
1686 sizes[INDEX_L3].cs_cachep->object_size = sizes[INDEX_L3].cs_size;
1687 sizes[INDEX_L3].cs_cachep->align = ARCH_KMALLOC_MINALIGN;
1688 __kmem_cache_create(sizes[INDEX_L3].cs_cachep, ARCH_KMALLOC_FLAGS|SLAB_PANIC);
1689 list_add(&sizes[INDEX_L3].cs_cachep->list, &slab_caches);
1690 }
1691 1676
1692 slab_early_init = 0; 1677 slab_early_init = 0;
1693 1678
@@ -1699,24 +1684,14 @@ void __init kmem_cache_init(void)
1699 * Note for systems short on memory removing the alignment will 1684 * Note for systems short on memory removing the alignment will
1700 * allow tighter packing of the smaller caches. 1685 * allow tighter packing of the smaller caches.
1701 */ 1686 */
1702 if (!sizes->cs_cachep) { 1687 if (!sizes->cs_cachep)
1703 sizes->cs_cachep = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT); 1688 sizes->cs_cachep = create_kmalloc_cache(names->name,
1704 sizes->cs_cachep->name = names->name; 1689 sizes->cs_size, ARCH_KMALLOC_FLAGS);
1705 sizes->cs_cachep->size = sizes->cs_size; 1690
1706 sizes->cs_cachep->object_size = sizes->cs_size;
1707 sizes->cs_cachep->align = ARCH_KMALLOC_MINALIGN;
1708 __kmem_cache_create(sizes->cs_cachep, ARCH_KMALLOC_FLAGS|SLAB_PANIC);
1709 list_add(&sizes->cs_cachep->list, &slab_caches);
1710 }
1711#ifdef CONFIG_ZONE_DMA 1691#ifdef CONFIG_ZONE_DMA
1712 sizes->cs_dmacachep = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT); 1692 sizes->cs_dmacachep = create_kmalloc_cache(
1713 sizes->cs_dmacachep->name = names->name_dma; 1693 names->name_dma, sizes->cs_size,
1714 sizes->cs_dmacachep->size = sizes->cs_size; 1694 SLAB_CACHE_DMA|ARCH_KMALLOC_FLAGS);
1715 sizes->cs_dmacachep->object_size = sizes->cs_size;
1716 sizes->cs_dmacachep->align = ARCH_KMALLOC_MINALIGN;
1717 __kmem_cache_create(sizes->cs_dmacachep,
1718 ARCH_KMALLOC_FLAGS|SLAB_CACHE_DMA| SLAB_PANIC);
1719 list_add(&sizes->cs_dmacachep->list, &slab_caches);
1720#endif 1695#endif
1721 sizes++; 1696 sizes++;
1722 names++; 1697 names++;
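The repeated open-coded sequence that used to set up each bootstrap kmalloc cache (zalloc the descriptor, fill in name/size/alignment, call __kmem_cache_create(), add it to slab_caches) is now folded into create_kmalloc_cache()/create_boot_cache(), which this patch adds to mm/slab_common.c further down. A rough userspace model of that consolidation follows; the toy_* names are made up and none of the kernel's locking or error handling is shown.

#include <stdio.h>
#include <stddef.h>

struct toy_cache {
	const char *name;
	size_t size;
	size_t object_size;
	struct toy_cache *next;		/* stands in for list_add(&s->list, &slab_caches) */
};

static struct toy_cache *toy_caches;

/* one helper instead of several hand-written assignments per boot cache */
static void toy_create_boot_cache(struct toy_cache *s, const char *name, size_t size)
{
	s->name = name;
	s->size = s->object_size = size;
	s->next = toy_caches;
	toy_caches = s;
}

int main(void)
{
	static struct toy_cache boot;	/* statically allocated, like kmem_cache_boot */

	toy_create_boot_cache(&boot, "kmem_cache", 128);
	printf("%s: %zu bytes\n", boot.name, boot.size);
	return 0;
}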
@@ -1727,7 +1702,6 @@ void __init kmem_cache_init(void)
1727 1702
1728 ptr = kmalloc(sizeof(struct arraycache_init), GFP_NOWAIT); 1703 ptr = kmalloc(sizeof(struct arraycache_init), GFP_NOWAIT);
1729 1704
1730 BUG_ON(cpu_cache_get(kmem_cache) != &initarray_cache.cache);
1731 memcpy(ptr, cpu_cache_get(kmem_cache), 1705 memcpy(ptr, cpu_cache_get(kmem_cache),
1732 sizeof(struct arraycache_init)); 1706 sizeof(struct arraycache_init));
1733 /* 1707 /*
@@ -1921,6 +1895,7 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid)
1921 if (page->pfmemalloc) 1895 if (page->pfmemalloc)
1922 SetPageSlabPfmemalloc(page + i); 1896 SetPageSlabPfmemalloc(page + i);
1923 } 1897 }
1898 memcg_bind_pages(cachep, cachep->gfporder);
1924 1899
1925 if (kmemcheck_enabled && !(cachep->flags & SLAB_NOTRACK)) { 1900 if (kmemcheck_enabled && !(cachep->flags & SLAB_NOTRACK)) {
1926 kmemcheck_alloc_shadow(page, cachep->gfporder, flags, nodeid); 1901 kmemcheck_alloc_shadow(page, cachep->gfporder, flags, nodeid);
@@ -1957,9 +1932,11 @@ static void kmem_freepages(struct kmem_cache *cachep, void *addr)
1957 __ClearPageSlab(page); 1932 __ClearPageSlab(page);
1958 page++; 1933 page++;
1959 } 1934 }
1935
1936 memcg_release_pages(cachep, cachep->gfporder);
1960 if (current->reclaim_state) 1937 if (current->reclaim_state)
1961 current->reclaim_state->reclaimed_slab += nr_freed; 1938 current->reclaim_state->reclaimed_slab += nr_freed;
1962 free_pages((unsigned long)addr, cachep->gfporder); 1939 free_memcg_kmem_pages((unsigned long)addr, cachep->gfporder);
1963} 1940}
1964 1941
1965static void kmem_rcu_free(struct rcu_head *head) 1942static void kmem_rcu_free(struct rcu_head *head)
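kmem_getpages()/kmem_freepages() now bracket every slab's page allocation with memcg_bind_pages()/memcg_release_pages() (added in the mm/slab.h hunk below), so a per-memcg cache keeps an atomic count of the pages it owns and can be destroyed once that count drops to zero. A minimal userspace model of that pairing, assuming C11 atomics; the toy_* names are illustrative only.

#include <stdatomic.h>
#include <stdio.h>

struct toy_memcg_cache {
	atomic_int nr_pages;		/* like memcg_params->nr_pages */
};

static struct toy_memcg_cache toy_cache;	/* zero pages outstanding */

static void toy_bind_pages(struct toy_memcg_cache *c, int order)
{
	atomic_fetch_add(&c->nr_pages, 1 << order);
}

static void toy_release_pages(struct toy_memcg_cache *c, int order)
{
	/* previous value equals what we subtract => counter just hit zero */
	if (atomic_fetch_sub(&c->nr_pages, 1 << order) == (1 << order))
		printf("last slab gone, child cache may now be destroyed\n");
}

int main(void)
{
	toy_bind_pages(&toy_cache, 2);		/* order-2 slab: 4 pages */
	toy_release_pages(&toy_cache, 2);	/* frees them, counter hits zero */
	return 0;
}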
@@ -2282,7 +2259,15 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp)
2282 2259
2283 if (slab_state == DOWN) { 2260 if (slab_state == DOWN) {
2284 /* 2261 /*
2285 * Note: the first kmem_cache_create must create the cache 2262 * Note: Creation of first cache (kmem_cache).
2263 * The setup_list3s is taken care
2264 * of by the caller of __kmem_cache_create
2265 */
2266 cachep->array[smp_processor_id()] = &initarray_generic.cache;
2267 slab_state = PARTIAL;
2268 } else if (slab_state == PARTIAL) {
2269 /*
2270 * Note: the second kmem_cache_create must create the cache
2286 * that's used by kmalloc(24), otherwise the creation of 2271 * that's used by kmalloc(24), otherwise the creation of
2287 * further caches will BUG(). 2272 * further caches will BUG().
2288 */ 2273 */
@@ -2290,7 +2275,7 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp)
2290 2275
2291 /* 2276 /*
2292 * If the cache that's used by kmalloc(sizeof(kmem_list3)) is 2277 * If the cache that's used by kmalloc(sizeof(kmem_list3)) is
2293 * the first cache, then we need to set up all its list3s, 2278 * the second cache, then we need to set up all its list3s,
2294 * otherwise the creation of further caches will BUG(). 2279 * otherwise the creation of further caches will BUG().
2295 */ 2280 */
2296 set_up_list3s(cachep, SIZE_AC); 2281 set_up_list3s(cachep, SIZE_AC);
@@ -2299,6 +2284,7 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp)
2299 else 2284 else
2300 slab_state = PARTIAL_ARRAYCACHE; 2285 slab_state = PARTIAL_ARRAYCACHE;
2301 } else { 2286 } else {
2287 /* Remaining boot caches */
2302 cachep->array[smp_processor_id()] = 2288 cachep->array[smp_processor_id()] =
2303 kmalloc(sizeof(struct arraycache_init), gfp); 2289 kmalloc(sizeof(struct arraycache_init), gfp);
2304 2290
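Because kmem_cache itself is now built through create_boot_cache(), setup_cpu_cache() gains an extra early stage: the very first __kmem_cache_create() call can only use static storage, the second one (roughly the kmalloc(24)-sized cache backing struct arraycache_init) advances the state machine again, and everything after that may allocate normally. A compressed sketch of that ordering; the names and states here merely stand in for slab_state and are not the kernel's code.

#include <stdio.h>

enum toy_state { DOWN, PARTIAL, PARTIAL_ARRAYCACHE, FULL };

static enum toy_state toy_state = DOWN;

static void toy_setup_cpu_cache(const char *name)
{
	switch (toy_state) {
	case DOWN:		/* first cache: kmem_cache itself, static arrays only */
		toy_state = PARTIAL;
		break;
	case PARTIAL:		/* second cache: the arraycache-sized kmalloc cache */
		toy_state = PARTIAL_ARRAYCACHE;
		break;
	default:		/* remaining boot caches can kmalloc() their arrays */
		break;
	}
	printf("%s created, state now %d\n", name, toy_state);
}

int main(void)
{
	toy_setup_cpu_cache("kmem_cache");
	toy_setup_cpu_cache("kmalloc-ac");
	toy_setup_cpu_cache("kmalloc-l3");
	return 0;
}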
@@ -2331,11 +2317,8 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp)
2331 2317
2332/** 2318/**
2333 * __kmem_cache_create - Create a cache. 2319 * __kmem_cache_create - Create a cache.
2334 * @name: A string which is used in /proc/slabinfo to identify this cache. 2320 * @cachep: cache management descriptor
2335 * @size: The size of objects to be created in this cache.
2336 * @align: The required alignment for the objects.
2337 * @flags: SLAB flags 2321 * @flags: SLAB flags
2338 * @ctor: A constructor for the objects.
2339 * 2322 *
2340 * Returns a ptr to the cache on success, NULL on failure. 2323 * Returns a ptr to the cache on success, NULL on failure.
2341 * Cannot be called within an interrupt, but can be interrupted. 2324 * Cannot be called within an interrupt, but can be interrupted.
@@ -2378,11 +2361,6 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags)
2378 if (flags & SLAB_DESTROY_BY_RCU) 2361 if (flags & SLAB_DESTROY_BY_RCU)
2379 BUG_ON(flags & SLAB_POISON); 2362 BUG_ON(flags & SLAB_POISON);
2380#endif 2363#endif
2381 /*
2382 * Always checks flags, a caller might be expecting debug support which
2383 * isn't available.
2384 */
2385 BUG_ON(flags & ~CREATE_MASK);
2386 2364
2387 /* 2365 /*
2388 * Check that size is in terms of words. This is needed to avoid 2366 * Check that size is in terms of words. This is needed to avoid
@@ -2394,22 +2372,6 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags)
2394 size &= ~(BYTES_PER_WORD - 1); 2372 size &= ~(BYTES_PER_WORD - 1);
2395 } 2373 }
2396 2374
2397 /* calculate the final buffer alignment: */
2398
2399 /* 1) arch recommendation: can be overridden for debug */
2400 if (flags & SLAB_HWCACHE_ALIGN) {
2401 /*
2402 * Default alignment: as specified by the arch code. Except if
2403 * an object is really small, then squeeze multiple objects into
2404 * one cacheline.
2405 */
2406 ralign = cache_line_size();
2407 while (size <= ralign / 2)
2408 ralign /= 2;
2409 } else {
2410 ralign = BYTES_PER_WORD;
2411 }
2412
2413 /* 2375 /*
2414 * Redzoning and user store require word alignment or possibly larger. 2376 * Redzoning and user store require word alignment or possibly larger.
2415 * Note this will be overridden by architecture or caller mandated 2377 * Note this will be overridden by architecture or caller mandated
@@ -2426,10 +2388,6 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags)
2426 size &= ~(REDZONE_ALIGN - 1); 2388 size &= ~(REDZONE_ALIGN - 1);
2427 } 2389 }
2428 2390
2429 /* 2) arch mandated alignment */
2430 if (ralign < ARCH_SLAB_MINALIGN) {
2431 ralign = ARCH_SLAB_MINALIGN;
2432 }
2433 /* 3) caller mandated alignment */ 2391 /* 3) caller mandated alignment */
2434 if (ralign < cachep->align) { 2392 if (ralign < cachep->align) {
2435 ralign = cachep->align; 2393 ralign = cachep->align;
@@ -2447,7 +2405,7 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags)
2447 else 2405 else
2448 gfp = GFP_NOWAIT; 2406 gfp = GFP_NOWAIT;
2449 2407
2450 cachep->nodelists = (struct kmem_list3 **)&cachep->array[nr_cpu_ids]; 2408 setup_nodelists_pointer(cachep);
2451#if DEBUG 2409#if DEBUG
2452 2410
2453 /* 2411 /*
@@ -2566,7 +2524,8 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags)
2566 WARN_ON_ONCE(flags & SLAB_DESTROY_BY_RCU); 2524 WARN_ON_ONCE(flags & SLAB_DESTROY_BY_RCU);
2567 2525
2568 slab_set_debugobj_lock_classes(cachep); 2526 slab_set_debugobj_lock_classes(cachep);
2569 } 2527 } else if (!OFF_SLAB(cachep) && !(flags & SLAB_DESTROY_BY_RCU))
2528 on_slab_lock_classes(cachep);
2570 2529
2571 return 0; 2530 return 0;
2572} 2531}
@@ -3530,6 +3489,8 @@ slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid,
3530 if (slab_should_failslab(cachep, flags)) 3489 if (slab_should_failslab(cachep, flags))
3531 return NULL; 3490 return NULL;
3532 3491
3492 cachep = memcg_kmem_get_cache(cachep, flags);
3493
3533 cache_alloc_debugcheck_before(cachep, flags); 3494 cache_alloc_debugcheck_before(cachep, flags);
3534 local_irq_save(save_flags); 3495 local_irq_save(save_flags);
3535 3496
@@ -3615,6 +3576,8 @@ slab_alloc(struct kmem_cache *cachep, gfp_t flags, unsigned long caller)
3615 if (slab_should_failslab(cachep, flags)) 3576 if (slab_should_failslab(cachep, flags))
3616 return NULL; 3577 return NULL;
3617 3578
3579 cachep = memcg_kmem_get_cache(cachep, flags);
3580
3618 cache_alloc_debugcheck_before(cachep, flags); 3581 cache_alloc_debugcheck_before(cachep, flags);
3619 local_irq_save(save_flags); 3582 local_irq_save(save_flags);
3620 objp = __do_cache_alloc(cachep, flags); 3583 objp = __do_cache_alloc(cachep, flags);
@@ -3928,6 +3891,9 @@ EXPORT_SYMBOL(__kmalloc);
3928void kmem_cache_free(struct kmem_cache *cachep, void *objp) 3891void kmem_cache_free(struct kmem_cache *cachep, void *objp)
3929{ 3892{
3930 unsigned long flags; 3893 unsigned long flags;
3894 cachep = cache_from_obj(cachep, objp);
3895 if (!cachep)
3896 return;
3931 3897
3932 local_irq_save(flags); 3898 local_irq_save(flags);
3933 debug_check_no_locks_freed(objp, cachep->object_size); 3899 debug_check_no_locks_freed(objp, cachep->object_size);
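kmem_cache_free() now routes through cache_from_obj() (added to mm/slab.h below): the cache that really owns the object is recovered from its page, so an object handed out by a per-memcg child cache is released back to that child even when the caller passes the root cache, and a genuine mismatch is only warned about. A simplified userspace model under those assumptions; the toy_* structures and the child name are made up.

#include <stdio.h>
#include <stdlib.h>
#include <stddef.h>

struct toy_cache {
	const char *name;
	struct toy_cache *root;		/* NULL for a root cache */
};

struct toy_obj {
	struct toy_cache *owner;	/* like page->slab_cache */
	char data[32];
};

static void *toy_alloc(struct toy_cache *c)
{
	struct toy_obj *o = malloc(sizeof(*o));

	o->owner = c;
	return o->data;
}

/* like cache_from_obj(): prefer the recorded owner when it is the passed
 * cache or one of its children, warn on a real mismatch */
static struct toy_cache *toy_cache_from_obj(struct toy_cache *claimed, void *p)
{
	struct toy_obj *o = (void *)((char *)p - offsetof(struct toy_obj, data));

	if (o->owner == claimed || o->owner->root == claimed)
		return o->owner;
	fprintf(stderr, "wrong cache: %s but object is from %s\n",
		claimed->name, o->owner->name);
	return claimed;
}

int main(void)
{
	struct toy_cache root = { "dentry", NULL };
	struct toy_cache child = { "dentry-memcg-copy", &root };	/* name is invented */
	void *p = toy_alloc(&child);

	/* caller frees through the root; the child is resolved from the object */
	printf("freed via %s\n", toy_cache_from_obj(&root, p)->name);
	free((char *)p - offsetof(struct toy_obj, data));
	return 0;
}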
@@ -3969,12 +3935,6 @@ void kfree(const void *objp)
3969} 3935}
3970EXPORT_SYMBOL(kfree); 3936EXPORT_SYMBOL(kfree);
3971 3937
3972unsigned int kmem_cache_size(struct kmem_cache *cachep)
3973{
3974 return cachep->object_size;
3975}
3976EXPORT_SYMBOL(kmem_cache_size);
3977
3978/* 3938/*
3979 * This initializes kmem_list3 or resizes various caches for all nodes. 3939 * This initializes kmem_list3 or resizes various caches for all nodes.
3980 */ 3940 */
@@ -4081,7 +4041,7 @@ static void do_ccupdate_local(void *info)
4081} 4041}
4082 4042
4083/* Always called with the slab_mutex held */ 4043/* Always called with the slab_mutex held */
4084static int do_tune_cpucache(struct kmem_cache *cachep, int limit, 4044static int __do_tune_cpucache(struct kmem_cache *cachep, int limit,
4085 int batchcount, int shared, gfp_t gfp) 4045 int batchcount, int shared, gfp_t gfp)
4086{ 4046{
4087 struct ccupdate_struct *new; 4047 struct ccupdate_struct *new;
@@ -4124,12 +4084,49 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
4124 return alloc_kmemlist(cachep, gfp); 4084 return alloc_kmemlist(cachep, gfp);
4125} 4085}
4126 4086
4087static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
4088 int batchcount, int shared, gfp_t gfp)
4089{
4090 int ret;
4091 struct kmem_cache *c = NULL;
4092 int i = 0;
4093
4094 ret = __do_tune_cpucache(cachep, limit, batchcount, shared, gfp);
4095
4096 if (slab_state < FULL)
4097 return ret;
4098
4099 if ((ret < 0) || !is_root_cache(cachep))
4100 return ret;
4101
4102 VM_BUG_ON(!mutex_is_locked(&slab_mutex));
4103 for_each_memcg_cache_index(i) {
4104 c = cache_from_memcg(cachep, i);
4105 if (c)
4106 /* return value determined by the parent cache only */
4107 __do_tune_cpucache(c, limit, batchcount, shared, gfp);
4108 }
4109
4110 return ret;
4111}
4112
4127/* Called with slab_mutex held always */ 4113/* Called with slab_mutex held always */
4128static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp) 4114static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp)
4129{ 4115{
4130 int err; 4116 int err;
4131 int limit, shared; 4117 int limit = 0;
4118 int shared = 0;
4119 int batchcount = 0;
4120
4121 if (!is_root_cache(cachep)) {
4122 struct kmem_cache *root = memcg_root_cache(cachep);
4123 limit = root->limit;
4124 shared = root->shared;
4125 batchcount = root->batchcount;
4126 }
4132 4127
4128 if (limit && shared && batchcount)
4129 goto skip_setup;
4133 /* 4130 /*
4134 * The head array serves three purposes: 4131 * The head array serves three purposes:
4135 * - create a LIFO ordering, i.e. return objects that are cache-warm 4132 * - create a LIFO ordering, i.e. return objects that are cache-warm
@@ -4171,7 +4168,9 @@ static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp)
4171 if (limit > 32) 4168 if (limit > 32)
4172 limit = 32; 4169 limit = 32;
4173#endif 4170#endif
4174 err = do_tune_cpucache(cachep, limit, (limit + 1) / 2, shared, gfp); 4171 batchcount = (limit + 1) / 2;
4172skip_setup:
4173 err = do_tune_cpucache(cachep, limit, batchcount, shared, gfp);
4175 if (err) 4174 if (err)
4176 printk(KERN_ERR "enable_cpucache failed for %s, error %d.\n", 4175 printk(KERN_ERR "enable_cpucache failed for %s, error %d.\n",
4177 cachep->name, -err); 4176 cachep->name, -err);
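enable_cpucache() and the new do_tune_cpucache() wrapper give the tunables a memcg-aware shape: a child cache starts from its root's limit/batchcount/shared and skips the size-based heuristics, while re-tuning a root walks its children so they stay in sync. A small standalone sketch of both directions, with toy_* names and a fixed-size child array replacing for_each_memcg_cache_index(); the numbers are arbitrary.

#include <stdio.h>

struct toy_cache {
	int limit, batchcount, shared;
	struct toy_cache *root;		/* NULL for a root cache */
	struct toy_cache *children[4];	/* per-memcg copies, may be NULL */
};

static void toy_tune(struct toy_cache *c, int limit, int batch, int shared)
{
	c->limit = limit;
	c->batchcount = batch;
	c->shared = shared;
}

static void toy_enable_cpucache(struct toy_cache *c)
{
	if (c->root) {			/* child: inherit, skip the heuristics */
		toy_tune(c, c->root->limit, c->root->batchcount, c->root->shared);
		return;
	}
	toy_tune(c, 120, (120 + 1) / 2, 8);	/* root: compute its own settings */
	for (int i = 0; i < 4; i++)		/* then propagate to the children */
		if (c->children[i])
			toy_tune(c->children[i], c->limit, c->batchcount, c->shared);
}

int main(void)
{
	struct toy_cache root = { 0 };
	struct toy_cache child = { .root = &root };

	root.children[0] = &child;
	toy_enable_cpucache(&root);
	printf("child limit=%d batch=%d\n", child.limit, child.batchcount);
	return 0;
}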
@@ -4276,54 +4275,8 @@ out:
4276} 4275}
4277 4276
4278#ifdef CONFIG_SLABINFO 4277#ifdef CONFIG_SLABINFO
4279 4278void get_slabinfo(struct kmem_cache *cachep, struct slabinfo *sinfo)
4280static void print_slabinfo_header(struct seq_file *m)
4281{
4282 /*
4283 * Output format version, so at least we can change it
4284 * without _too_ many complaints.
4285 */
4286#if STATS
4287 seq_puts(m, "slabinfo - version: 2.1 (statistics)\n");
4288#else
4289 seq_puts(m, "slabinfo - version: 2.1\n");
4290#endif
4291 seq_puts(m, "# name <active_objs> <num_objs> <objsize> "
4292 "<objperslab> <pagesperslab>");
4293 seq_puts(m, " : tunables <limit> <batchcount> <sharedfactor>");
4294 seq_puts(m, " : slabdata <active_slabs> <num_slabs> <sharedavail>");
4295#if STATS
4296 seq_puts(m, " : globalstat <listallocs> <maxobjs> <grown> <reaped> "
4297 "<error> <maxfreeable> <nodeallocs> <remotefrees> <alienoverflow>");
4298 seq_puts(m, " : cpustat <allochit> <allocmiss> <freehit> <freemiss>");
4299#endif
4300 seq_putc(m, '\n');
4301}
4302
4303static void *s_start(struct seq_file *m, loff_t *pos)
4304{
4305 loff_t n = *pos;
4306
4307 mutex_lock(&slab_mutex);
4308 if (!n)
4309 print_slabinfo_header(m);
4310
4311 return seq_list_start(&slab_caches, *pos);
4312}
4313
4314static void *s_next(struct seq_file *m, void *p, loff_t *pos)
4315{ 4279{
4316 return seq_list_next(p, &slab_caches, pos);
4317}
4318
4319static void s_stop(struct seq_file *m, void *p)
4320{
4321 mutex_unlock(&slab_mutex);
4322}
4323
4324static int s_show(struct seq_file *m, void *p)
4325{
4326 struct kmem_cache *cachep = list_entry(p, struct kmem_cache, list);
4327 struct slab *slabp; 4280 struct slab *slabp;
4328 unsigned long active_objs; 4281 unsigned long active_objs;
4329 unsigned long num_objs; 4282 unsigned long num_objs;
@@ -4378,13 +4331,20 @@ static int s_show(struct seq_file *m, void *p)
4378 if (error) 4331 if (error)
4379 printk(KERN_ERR "slab: cache %s error: %s\n", name, error); 4332 printk(KERN_ERR "slab: cache %s error: %s\n", name, error);
4380 4333
4381 seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d", 4334 sinfo->active_objs = active_objs;
4382 name, active_objs, num_objs, cachep->size, 4335 sinfo->num_objs = num_objs;
4383 cachep->num, (1 << cachep->gfporder)); 4336 sinfo->active_slabs = active_slabs;
4384 seq_printf(m, " : tunables %4u %4u %4u", 4337 sinfo->num_slabs = num_slabs;
4385 cachep->limit, cachep->batchcount, cachep->shared); 4338 sinfo->shared_avail = shared_avail;
4386 seq_printf(m, " : slabdata %6lu %6lu %6lu", 4339 sinfo->limit = cachep->limit;
4387 active_slabs, num_slabs, shared_avail); 4340 sinfo->batchcount = cachep->batchcount;
4341 sinfo->shared = cachep->shared;
4342 sinfo->objects_per_slab = cachep->num;
4343 sinfo->cache_order = cachep->gfporder;
4344}
4345
4346void slabinfo_show_stats(struct seq_file *m, struct kmem_cache *cachep)
4347{
4388#if STATS 4348#if STATS
4389 { /* list3 stats */ 4349 { /* list3 stats */
4390 unsigned long high = cachep->high_mark; 4350 unsigned long high = cachep->high_mark;
@@ -4414,31 +4374,8 @@ static int s_show(struct seq_file *m, void *p)
4414 allochit, allocmiss, freehit, freemiss); 4374 allochit, allocmiss, freehit, freemiss);
4415 } 4375 }
4416#endif 4376#endif
4417 seq_putc(m, '\n');
4418 return 0;
4419} 4377}
4420 4378
4421/*
4422 * slabinfo_op - iterator that generates /proc/slabinfo
4423 *
4424 * Output layout:
4425 * cache-name
4426 * num-active-objs
4427 * total-objs
4428 * object size
4429 * num-active-slabs
4430 * total-slabs
4431 * num-pages-per-slab
4432 * + further values on SMP and with statistics enabled
4433 */
4434
4435static const struct seq_operations slabinfo_op = {
4436 .start = s_start,
4437 .next = s_next,
4438 .stop = s_stop,
4439 .show = s_show,
4440};
4441
4442#define MAX_SLABINFO_WRITE 128 4379#define MAX_SLABINFO_WRITE 128
4443/** 4380/**
4444 * slabinfo_write - Tuning for the slab allocator 4381 * slabinfo_write - Tuning for the slab allocator
@@ -4447,7 +4384,7 @@ static const struct seq_operations slabinfo_op = {
4447 * @count: data length 4384 * @count: data length
4448 * @ppos: unused 4385 * @ppos: unused
4449 */ 4386 */
4450static ssize_t slabinfo_write(struct file *file, const char __user *buffer, 4387ssize_t slabinfo_write(struct file *file, const char __user *buffer,
4451 size_t count, loff_t *ppos) 4388 size_t count, loff_t *ppos)
4452{ 4389{
4453 char kbuf[MAX_SLABINFO_WRITE + 1], *tmp; 4390 char kbuf[MAX_SLABINFO_WRITE + 1], *tmp;
@@ -4490,19 +4427,6 @@ static ssize_t slabinfo_write(struct file *file, const char __user *buffer,
4490 return res; 4427 return res;
4491} 4428}
4492 4429
4493static int slabinfo_open(struct inode *inode, struct file *file)
4494{
4495 return seq_open(file, &slabinfo_op);
4496}
4497
4498static const struct file_operations proc_slabinfo_operations = {
4499 .open = slabinfo_open,
4500 .read = seq_read,
4501 .write = slabinfo_write,
4502 .llseek = seq_lseek,
4503 .release = seq_release,
4504};
4505
4506#ifdef CONFIG_DEBUG_SLAB_LEAK 4430#ifdef CONFIG_DEBUG_SLAB_LEAK
4507 4431
4508static void *leaks_start(struct seq_file *m, loff_t *pos) 4432static void *leaks_start(struct seq_file *m, loff_t *pos)
@@ -4631,6 +4555,16 @@ static int leaks_show(struct seq_file *m, void *p)
4631 return 0; 4555 return 0;
4632} 4556}
4633 4557
4558static void *s_next(struct seq_file *m, void *p, loff_t *pos)
4559{
4560 return seq_list_next(p, &slab_caches, pos);
4561}
4562
4563static void s_stop(struct seq_file *m, void *p)
4564{
4565 mutex_unlock(&slab_mutex);
4566}
4567
4634static const struct seq_operations slabstats_op = { 4568static const struct seq_operations slabstats_op = {
4635 .start = leaks_start, 4569 .start = leaks_start,
4636 .next = s_next, 4570 .next = s_next,
@@ -4665,7 +4599,6 @@ static const struct file_operations proc_slabstats_operations = {
4665 4599
4666static int __init slab_proc_init(void) 4600static int __init slab_proc_init(void)
4667{ 4601{
4668 proc_create("slabinfo",S_IWUSR|S_IRUSR,NULL,&proc_slabinfo_operations);
4669#ifdef CONFIG_DEBUG_SLAB_LEAK 4602#ifdef CONFIG_DEBUG_SLAB_LEAK
4670 proc_create("slab_allocators", 0, NULL, &proc_slabstats_operations); 4603 proc_create("slab_allocators", 0, NULL, &proc_slabstats_operations);
4671#endif 4604#endif
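The /proc/slabinfo machinery that used to live here moves to mm/slab_common.c (see the hunks below); SLAB now only fills a struct slabinfo in get_slabinfo() and appends its extra statistics in slabinfo_show_stats(), while the shared code owns the seq_file iteration and formatting. A toy version of that split, using the struct slabinfo field names introduced below and hard-coded numbers.

#include <stdio.h>

struct toy_slabinfo {
	unsigned long active_objs, num_objs;
	unsigned long active_slabs, num_slabs;
	unsigned int objects_per_slab, cache_order;
};

/* "allocator" side: counting only, no output */
static void toy_get_slabinfo(struct toy_slabinfo *si)
{
	si->active_objs = 90;
	si->num_objs = 128;
	si->active_slabs = 3;
	si->num_slabs = 4;
	si->objects_per_slab = 32;
	si->cache_order = 1;
}

/* common side: formatting only, no allocator internals */
static void toy_cache_show(const char *name, const struct toy_slabinfo *si)
{
	printf("%-17s %6lu %6lu %4u %4d\n", name, si->active_objs,
	       si->num_objs, si->objects_per_slab, 1 << si->cache_order);
}

int main(void)
{
	struct toy_slabinfo si;

	toy_get_slabinfo(&si);
	toy_cache_show("kmalloc-64", &si);
	return 0;
}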
diff --git a/mm/slab.h b/mm/slab.h
index 7deeb449a301..34a98d642196 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -32,19 +32,201 @@ extern struct list_head slab_caches;
32/* The slab cache that manages slab cache information */ 32/* The slab cache that manages slab cache information */
33extern struct kmem_cache *kmem_cache; 33extern struct kmem_cache *kmem_cache;
34 34
35unsigned long calculate_alignment(unsigned long flags,
36 unsigned long align, unsigned long size);
37
35/* Functions provided by the slab allocators */ 38/* Functions provided by the slab allocators */
36extern int __kmem_cache_create(struct kmem_cache *, unsigned long flags); 39extern int __kmem_cache_create(struct kmem_cache *, unsigned long flags);
37 40
41extern struct kmem_cache *create_kmalloc_cache(const char *name, size_t size,
42 unsigned long flags);
43extern void create_boot_cache(struct kmem_cache *, const char *name,
44 size_t size, unsigned long flags);
45
46struct mem_cgroup;
38#ifdef CONFIG_SLUB 47#ifdef CONFIG_SLUB
39struct kmem_cache *__kmem_cache_alias(const char *name, size_t size, 48struct kmem_cache *
40 size_t align, unsigned long flags, void (*ctor)(void *)); 49__kmem_cache_alias(struct mem_cgroup *memcg, const char *name, size_t size,
50 size_t align, unsigned long flags, void (*ctor)(void *));
41#else 51#else
42static inline struct kmem_cache *__kmem_cache_alias(const char *name, size_t size, 52static inline struct kmem_cache *
43 size_t align, unsigned long flags, void (*ctor)(void *)) 53__kmem_cache_alias(struct mem_cgroup *memcg, const char *name, size_t size,
54 size_t align, unsigned long flags, void (*ctor)(void *))
44{ return NULL; } 55{ return NULL; }
45#endif 56#endif
46 57
47 58
59/* Legal flag mask for kmem_cache_create(), for various configurations */
60#define SLAB_CORE_FLAGS (SLAB_HWCACHE_ALIGN | SLAB_CACHE_DMA | SLAB_PANIC | \
61 SLAB_DESTROY_BY_RCU | SLAB_DEBUG_OBJECTS )
62
63#if defined(CONFIG_DEBUG_SLAB)
64#define SLAB_DEBUG_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER)
65#elif defined(CONFIG_SLUB_DEBUG)
66#define SLAB_DEBUG_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \
67 SLAB_TRACE | SLAB_DEBUG_FREE)
68#else
69#define SLAB_DEBUG_FLAGS (0)
70#endif
71
72#if defined(CONFIG_SLAB)
73#define SLAB_CACHE_FLAGS (SLAB_MEM_SPREAD | SLAB_NOLEAKTRACE | \
74 SLAB_RECLAIM_ACCOUNT | SLAB_TEMPORARY | SLAB_NOTRACK)
75#elif defined(CONFIG_SLUB)
76#define SLAB_CACHE_FLAGS (SLAB_NOLEAKTRACE | SLAB_RECLAIM_ACCOUNT | \
77 SLAB_TEMPORARY | SLAB_NOTRACK)
78#else
79#define SLAB_CACHE_FLAGS (0)
80#endif
81
82#define CACHE_CREATE_MASK (SLAB_CORE_FLAGS | SLAB_DEBUG_FLAGS | SLAB_CACHE_FLAGS)
83
48int __kmem_cache_shutdown(struct kmem_cache *); 84int __kmem_cache_shutdown(struct kmem_cache *);
49 85
86struct seq_file;
87struct file;
88
89struct slabinfo {
90 unsigned long active_objs;
91 unsigned long num_objs;
92 unsigned long active_slabs;
93 unsigned long num_slabs;
94 unsigned long shared_avail;
95 unsigned int limit;
96 unsigned int batchcount;
97 unsigned int shared;
98 unsigned int objects_per_slab;
99 unsigned int cache_order;
100};
101
102void get_slabinfo(struct kmem_cache *s, struct slabinfo *sinfo);
103void slabinfo_show_stats(struct seq_file *m, struct kmem_cache *s);
104ssize_t slabinfo_write(struct file *file, const char __user *buffer,
105 size_t count, loff_t *ppos);
106
107#ifdef CONFIG_MEMCG_KMEM
108static inline bool is_root_cache(struct kmem_cache *s)
109{
110 return !s->memcg_params || s->memcg_params->is_root_cache;
111}
112
113static inline bool cache_match_memcg(struct kmem_cache *cachep,
114 struct mem_cgroup *memcg)
115{
116 return (is_root_cache(cachep) && !memcg) ||
117 (cachep->memcg_params->memcg == memcg);
118}
119
120static inline void memcg_bind_pages(struct kmem_cache *s, int order)
121{
122 if (!is_root_cache(s))
123 atomic_add(1 << order, &s->memcg_params->nr_pages);
124}
125
126static inline void memcg_release_pages(struct kmem_cache *s, int order)
127{
128 if (is_root_cache(s))
129 return;
130
131 if (atomic_sub_and_test((1 << order), &s->memcg_params->nr_pages))
132 mem_cgroup_destroy_cache(s);
133}
134
135static inline bool slab_equal_or_root(struct kmem_cache *s,
136 struct kmem_cache *p)
137{
138 return (p == s) ||
139 (s->memcg_params && (p == s->memcg_params->root_cache));
140}
141
142/*
143 * We use suffixes to the name in memcg because we can't have caches
144 * created in the system with the same name. But when we print them
 145 * locally, it is better to refer to them by the base name.
146 */
147static inline const char *cache_name(struct kmem_cache *s)
148{
149 if (!is_root_cache(s))
150 return s->memcg_params->root_cache->name;
151 return s->name;
152}
153
154static inline struct kmem_cache *cache_from_memcg(struct kmem_cache *s, int idx)
155{
156 return s->memcg_params->memcg_caches[idx];
157}
158
159static inline struct kmem_cache *memcg_root_cache(struct kmem_cache *s)
160{
161 if (is_root_cache(s))
162 return s;
163 return s->memcg_params->root_cache;
164}
165#else
166static inline bool is_root_cache(struct kmem_cache *s)
167{
168 return true;
169}
170
171static inline bool cache_match_memcg(struct kmem_cache *cachep,
172 struct mem_cgroup *memcg)
173{
174 return true;
175}
176
177static inline void memcg_bind_pages(struct kmem_cache *s, int order)
178{
179}
180
181static inline void memcg_release_pages(struct kmem_cache *s, int order)
182{
183}
184
185static inline bool slab_equal_or_root(struct kmem_cache *s,
186 struct kmem_cache *p)
187{
188 return true;
189}
190
191static inline const char *cache_name(struct kmem_cache *s)
192{
193 return s->name;
194}
195
196static inline struct kmem_cache *cache_from_memcg(struct kmem_cache *s, int idx)
197{
198 return NULL;
199}
200
201static inline struct kmem_cache *memcg_root_cache(struct kmem_cache *s)
202{
203 return s;
204}
205#endif
206
207static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x)
208{
209 struct kmem_cache *cachep;
210 struct page *page;
211
212 /*
213 * When kmemcg is not being used, both assignments should return the
 214 * same value. But we don't want to pay the assignment price in that
215 * case. If it is not compiled in, the compiler should be smart enough
216 * to not do even the assignment. In that case, slab_equal_or_root
217 * will also be a constant.
218 */
219 if (!memcg_kmem_enabled() && !unlikely(s->flags & SLAB_DEBUG_FREE))
220 return s;
221
222 page = virt_to_head_page(x);
223 cachep = page->slab_cache;
224 if (slab_equal_or_root(cachep, s))
225 return cachep;
226
227 pr_err("%s: Wrong slab cache. %s but object is from %s\n",
228 __FUNCTION__, cachep->name, s->name);
229 WARN_ON_ONCE(1);
230 return s;
231}
50#endif 232#endif
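The memcg helpers above all hang off one relationship: a root cache either has no memcg_params or is flagged as the root, and every child points back at its root, which is what cache_name(), memcg_root_cache() and friends resolve through. A compact model of that shape; the toy_* types and the child's stored name are illustrative only (the real per-memcg name is generated elsewhere, in mm/memcontrol.c).

#include <stdio.h>

struct toy_cache;

struct toy_memcg_params {
	struct toy_cache *root_cache;
	int is_root_cache;
};

struct toy_cache {
	const char *name;
	struct toy_memcg_params *memcg_params;	/* NULL for most root caches */
};

static int toy_is_root_cache(const struct toy_cache *s)
{
	return !s->memcg_params || s->memcg_params->is_root_cache;
}

static const char *toy_cache_name(const struct toy_cache *s)
{
	/* children are reported under the root's base name */
	if (!toy_is_root_cache(s))
		return s->memcg_params->root_cache->name;
	return s->name;
}

int main(void)
{
	struct toy_cache root = { "dentry", NULL };
	struct toy_memcg_params p = { &root, 0 };
	struct toy_cache child = { "dentry-memcg-copy", &p };	/* suffix is made up */

	printf("%s is shown as %s\n", child.name, toy_cache_name(&child));
	return 0;
}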
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 069a24e64403..3f3cd97d3fdf 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -13,9 +13,12 @@
13#include <linux/module.h> 13#include <linux/module.h>
14#include <linux/cpu.h> 14#include <linux/cpu.h>
15#include <linux/uaccess.h> 15#include <linux/uaccess.h>
16#include <linux/seq_file.h>
17#include <linux/proc_fs.h>
16#include <asm/cacheflush.h> 18#include <asm/cacheflush.h>
17#include <asm/tlbflush.h> 19#include <asm/tlbflush.h>
18#include <asm/page.h> 20#include <asm/page.h>
21#include <linux/memcontrol.h>
19 22
20#include "slab.h" 23#include "slab.h"
21 24
@@ -25,7 +28,8 @@ DEFINE_MUTEX(slab_mutex);
25struct kmem_cache *kmem_cache; 28struct kmem_cache *kmem_cache;
26 29
27#ifdef CONFIG_DEBUG_VM 30#ifdef CONFIG_DEBUG_VM
28static int kmem_cache_sanity_check(const char *name, size_t size) 31static int kmem_cache_sanity_check(struct mem_cgroup *memcg, const char *name,
32 size_t size)
29{ 33{
30 struct kmem_cache *s = NULL; 34 struct kmem_cache *s = NULL;
31 35
@@ -51,7 +55,13 @@ static int kmem_cache_sanity_check(const char *name, size_t size)
51 continue; 55 continue;
52 } 56 }
53 57
54 if (!strcmp(s->name, name)) { 58 /*
59 * For simplicity, we won't check this in the list of memcg
60 * caches. We have control over memcg naming, and if there
61 * aren't duplicates in the global list, there won't be any
62 * duplicates in the memcg lists as well.
63 */
64 if (!memcg && !strcmp(s->name, name)) {
55 pr_err("%s (%s): Cache name already exists.\n", 65 pr_err("%s (%s): Cache name already exists.\n",
56 __func__, name); 66 __func__, name);
57 dump_stack(); 67 dump_stack();
@@ -64,12 +74,69 @@ static int kmem_cache_sanity_check(const char *name, size_t size)
64 return 0; 74 return 0;
65} 75}
66#else 76#else
67static inline int kmem_cache_sanity_check(const char *name, size_t size) 77static inline int kmem_cache_sanity_check(struct mem_cgroup *memcg,
78 const char *name, size_t size)
68{ 79{
69 return 0; 80 return 0;
70} 81}
71#endif 82#endif
72 83
84#ifdef CONFIG_MEMCG_KMEM
85int memcg_update_all_caches(int num_memcgs)
86{
87 struct kmem_cache *s;
88 int ret = 0;
89 mutex_lock(&slab_mutex);
90
91 list_for_each_entry(s, &slab_caches, list) {
92 if (!is_root_cache(s))
93 continue;
94
95 ret = memcg_update_cache_size(s, num_memcgs);
96 /*
97 * See comment in memcontrol.c, memcg_update_cache_size:
98 * Instead of freeing the memory, we'll just leave the caches
99 * up to this point in an updated state.
100 */
101 if (ret)
102 goto out;
103 }
104
105 memcg_update_array_size(num_memcgs);
106out:
107 mutex_unlock(&slab_mutex);
108 return ret;
109}
110#endif
111
112/*
113 * Figure out what the alignment of the objects will be given a set of
114 * flags, a user specified alignment and the size of the objects.
115 */
116unsigned long calculate_alignment(unsigned long flags,
117 unsigned long align, unsigned long size)
118{
119 /*
120 * If the user wants hardware cache aligned objects then follow that
121 * suggestion if the object is sufficiently large.
122 *
123 * The hardware cache alignment cannot override the specified
 124 * alignment though. If that is greater, then use it.
125 */
126 if (flags & SLAB_HWCACHE_ALIGN) {
127 unsigned long ralign = cache_line_size();
128 while (size <= ralign / 2)
129 ralign /= 2;
130 align = max(align, ralign);
131 }
132
133 if (align < ARCH_SLAB_MINALIGN)
134 align = ARCH_SLAB_MINALIGN;
135
136 return ALIGN(align, sizeof(void *));
137}
138
139
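calculate_alignment() is now the single copy shared by the allocators (SLUB's private version is deleted further down). Below is a standalone trace of the same logic with the cache line size pinned to 64 bytes and an 8-byte minimum alignment so it compiles outside the kernel; the worked case in main() follows the halving loop step by step.

#include <stdio.h>

#define ALIGN(x, a)	(((x) + (a) - 1) & ~((unsigned long)(a) - 1))
#define TOY_CACHE_LINE	64UL		/* assumed cache line size */
#define TOY_MINALIGN	8UL		/* assumed ARCH_SLAB_MINALIGN */
#define TOY_HWCACHE	0x1UL		/* stand-in for SLAB_HWCACHE_ALIGN */

static unsigned long toy_calculate_alignment(unsigned long flags,
		unsigned long align, unsigned long size)
{
	if (flags & TOY_HWCACHE) {
		unsigned long ralign = TOY_CACHE_LINE;
		while (size <= ralign / 2)
			ralign /= 2;
		align = (align > ralign) ? align : ralign;
	}
	if (align < TOY_MINALIGN)
		align = TOY_MINALIGN;
	return ALIGN(align, sizeof(void *));
}

int main(void)
{
	/* 24-byte object, hw-cache alignment requested: 64 -> 32 (24 <= 32),
	 * stop at 32 (24 > 16), so the result is 32 */
	printf("%lu\n", toy_calculate_alignment(TOY_HWCACHE, 8, 24));
	return 0;
}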
73/* 140/*
74 * kmem_cache_create - Create a cache. 141 * kmem_cache_create - Create a cache.
75 * @name: A string which is used in /proc/slabinfo to identify this cache. 142 * @name: A string which is used in /proc/slabinfo to identify this cache.
@@ -95,8 +162,10 @@ static inline int kmem_cache_sanity_check(const char *name, size_t size)
95 * as davem. 162 * as davem.
96 */ 163 */
97 164
98struct kmem_cache *kmem_cache_create(const char *name, size_t size, size_t align, 165struct kmem_cache *
99 unsigned long flags, void (*ctor)(void *)) 166kmem_cache_create_memcg(struct mem_cgroup *memcg, const char *name, size_t size,
167 size_t align, unsigned long flags, void (*ctor)(void *),
168 struct kmem_cache *parent_cache)
100{ 169{
101 struct kmem_cache *s = NULL; 170 struct kmem_cache *s = NULL;
102 int err = 0; 171 int err = 0;
@@ -104,19 +173,33 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size, size_t align
104 get_online_cpus(); 173 get_online_cpus();
105 mutex_lock(&slab_mutex); 174 mutex_lock(&slab_mutex);
106 175
107 if (!kmem_cache_sanity_check(name, size) == 0) 176 if (!kmem_cache_sanity_check(memcg, name, size) == 0)
108 goto out_locked; 177 goto out_locked;
109 178
179 /*
180 * Some allocators will constraint the set of valid flags to a subset
181 * of all flags. We expect them to define CACHE_CREATE_MASK in this
182 * case, and we'll just provide them with a sanitized version of the
183 * passed flags.
184 */
185 flags &= CACHE_CREATE_MASK;
110 186
111 s = __kmem_cache_alias(name, size, align, flags, ctor); 187 s = __kmem_cache_alias(memcg, name, size, align, flags, ctor);
112 if (s) 188 if (s)
113 goto out_locked; 189 goto out_locked;
114 190
115 s = kmem_cache_zalloc(kmem_cache, GFP_KERNEL); 191 s = kmem_cache_zalloc(kmem_cache, GFP_KERNEL);
116 if (s) { 192 if (s) {
117 s->object_size = s->size = size; 193 s->object_size = s->size = size;
118 s->align = align; 194 s->align = calculate_alignment(flags, align, size);
119 s->ctor = ctor; 195 s->ctor = ctor;
196
197 if (memcg_register_cache(memcg, s, parent_cache)) {
198 kmem_cache_free(kmem_cache, s);
199 err = -ENOMEM;
200 goto out_locked;
201 }
202
120 s->name = kstrdup(name, GFP_KERNEL); 203 s->name = kstrdup(name, GFP_KERNEL);
121 if (!s->name) { 204 if (!s->name) {
122 kmem_cache_free(kmem_cache, s); 205 kmem_cache_free(kmem_cache, s);
@@ -126,10 +209,9 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size, size_t align
126 209
127 err = __kmem_cache_create(s, flags); 210 err = __kmem_cache_create(s, flags);
128 if (!err) { 211 if (!err) {
129
130 s->refcount = 1; 212 s->refcount = 1;
131 list_add(&s->list, &slab_caches); 213 list_add(&s->list, &slab_caches);
132 214 memcg_cache_list_add(memcg, s);
133 } else { 215 } else {
134 kfree(s->name); 216 kfree(s->name);
135 kmem_cache_free(kmem_cache, s); 217 kmem_cache_free(kmem_cache, s);
@@ -157,10 +239,20 @@ out_locked:
157 239
158 return s; 240 return s;
159} 241}
242
243struct kmem_cache *
244kmem_cache_create(const char *name, size_t size, size_t align,
245 unsigned long flags, void (*ctor)(void *))
246{
247 return kmem_cache_create_memcg(NULL, name, size, align, flags, ctor, NULL);
248}
160EXPORT_SYMBOL(kmem_cache_create); 249EXPORT_SYMBOL(kmem_cache_create);
161 250
162void kmem_cache_destroy(struct kmem_cache *s) 251void kmem_cache_destroy(struct kmem_cache *s)
163{ 252{
253 /* Destroy all the children caches if we aren't a memcg cache */
254 kmem_cache_destroy_memcg_children(s);
255
164 get_online_cpus(); 256 get_online_cpus();
165 mutex_lock(&slab_mutex); 257 mutex_lock(&slab_mutex);
166 s->refcount--; 258 s->refcount--;
@@ -172,6 +264,7 @@ void kmem_cache_destroy(struct kmem_cache *s)
172 if (s->flags & SLAB_DESTROY_BY_RCU) 264 if (s->flags & SLAB_DESTROY_BY_RCU)
173 rcu_barrier(); 265 rcu_barrier();
174 266
267 memcg_release_cache(s);
175 kfree(s->name); 268 kfree(s->name);
176 kmem_cache_free(kmem_cache, s); 269 kmem_cache_free(kmem_cache, s);
177 } else { 270 } else {
@@ -192,3 +285,182 @@ int slab_is_available(void)
192{ 285{
193 return slab_state >= UP; 286 return slab_state >= UP;
194} 287}
288
289#ifndef CONFIG_SLOB
290/* Create a cache during boot when no slab services are available yet */
291void __init create_boot_cache(struct kmem_cache *s, const char *name, size_t size,
292 unsigned long flags)
293{
294 int err;
295
296 s->name = name;
297 s->size = s->object_size = size;
298 s->align = calculate_alignment(flags, ARCH_KMALLOC_MINALIGN, size);
299 err = __kmem_cache_create(s, flags);
300
301 if (err)
302 panic("Creation of kmalloc slab %s size=%zd failed. Reason %d\n",
303 name, size, err);
304
305 s->refcount = -1; /* Exempt from merging for now */
306}
307
308struct kmem_cache *__init create_kmalloc_cache(const char *name, size_t size,
309 unsigned long flags)
310{
311 struct kmem_cache *s = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT);
312
313 if (!s)
314 panic("Out of memory when creating slab %s\n", name);
315
316 create_boot_cache(s, name, size, flags);
317 list_add(&s->list, &slab_caches);
318 s->refcount = 1;
319 return s;
320}
321
322#endif /* !CONFIG_SLOB */
323
324
325#ifdef CONFIG_SLABINFO
326void print_slabinfo_header(struct seq_file *m)
327{
328 /*
329 * Output format version, so at least we can change it
330 * without _too_ many complaints.
331 */
332#ifdef CONFIG_DEBUG_SLAB
333 seq_puts(m, "slabinfo - version: 2.1 (statistics)\n");
334#else
335 seq_puts(m, "slabinfo - version: 2.1\n");
336#endif
337 seq_puts(m, "# name <active_objs> <num_objs> <objsize> "
338 "<objperslab> <pagesperslab>");
339 seq_puts(m, " : tunables <limit> <batchcount> <sharedfactor>");
340 seq_puts(m, " : slabdata <active_slabs> <num_slabs> <sharedavail>");
341#ifdef CONFIG_DEBUG_SLAB
342 seq_puts(m, " : globalstat <listallocs> <maxobjs> <grown> <reaped> "
343 "<error> <maxfreeable> <nodeallocs> <remotefrees> <alienoverflow>");
344 seq_puts(m, " : cpustat <allochit> <allocmiss> <freehit> <freemiss>");
345#endif
346 seq_putc(m, '\n');
347}
348
349static void *s_start(struct seq_file *m, loff_t *pos)
350{
351 loff_t n = *pos;
352
353 mutex_lock(&slab_mutex);
354 if (!n)
355 print_slabinfo_header(m);
356
357 return seq_list_start(&slab_caches, *pos);
358}
359
360static void *s_next(struct seq_file *m, void *p, loff_t *pos)
361{
362 return seq_list_next(p, &slab_caches, pos);
363}
364
365static void s_stop(struct seq_file *m, void *p)
366{
367 mutex_unlock(&slab_mutex);
368}
369
370static void
371memcg_accumulate_slabinfo(struct kmem_cache *s, struct slabinfo *info)
372{
373 struct kmem_cache *c;
374 struct slabinfo sinfo;
375 int i;
376
377 if (!is_root_cache(s))
378 return;
379
380 for_each_memcg_cache_index(i) {
381 c = cache_from_memcg(s, i);
382 if (!c)
383 continue;
384
385 memset(&sinfo, 0, sizeof(sinfo));
386 get_slabinfo(c, &sinfo);
387
388 info->active_slabs += sinfo.active_slabs;
389 info->num_slabs += sinfo.num_slabs;
390 info->shared_avail += sinfo.shared_avail;
391 info->active_objs += sinfo.active_objs;
392 info->num_objs += sinfo.num_objs;
393 }
394}
395
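memcg_accumulate_slabinfo() makes the root cache's /proc/slabinfo line the sum of its own counters and those of every per-memcg copy, presumably so the reported totals still reflect system-wide usage now that objects may live in child caches. The arithmetic is just an accumulation loop, as in this toy example with made-up numbers.

#include <stdio.h>

struct toy_stats { unsigned long active_objs, num_objs; };

int main(void)
{
	struct toy_stats root = { 90, 128 };
	struct toy_stats children[] = { { 10, 32 }, { 4, 32 } };
	struct toy_stats total = root;

	for (unsigned int i = 0; i < sizeof(children) / sizeof(children[0]); i++) {
		total.active_objs += children[i].active_objs;
		total.num_objs += children[i].num_objs;
	}
	printf("active %lu / total %lu objects\n", total.active_objs, total.num_objs);
	return 0;
}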
396int cache_show(struct kmem_cache *s, struct seq_file *m)
397{
398 struct slabinfo sinfo;
399
400 memset(&sinfo, 0, sizeof(sinfo));
401 get_slabinfo(s, &sinfo);
402
403 memcg_accumulate_slabinfo(s, &sinfo);
404
405 seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d",
406 cache_name(s), sinfo.active_objs, sinfo.num_objs, s->size,
407 sinfo.objects_per_slab, (1 << sinfo.cache_order));
408
409 seq_printf(m, " : tunables %4u %4u %4u",
410 sinfo.limit, sinfo.batchcount, sinfo.shared);
411 seq_printf(m, " : slabdata %6lu %6lu %6lu",
412 sinfo.active_slabs, sinfo.num_slabs, sinfo.shared_avail);
413 slabinfo_show_stats(m, s);
414 seq_putc(m, '\n');
415 return 0;
416}
417
418static int s_show(struct seq_file *m, void *p)
419{
420 struct kmem_cache *s = list_entry(p, struct kmem_cache, list);
421
422 if (!is_root_cache(s))
423 return 0;
424 return cache_show(s, m);
425}
426
427/*
428 * slabinfo_op - iterator that generates /proc/slabinfo
429 *
430 * Output layout:
431 * cache-name
432 * num-active-objs
433 * total-objs
434 * object size
435 * num-active-slabs
436 * total-slabs
437 * num-pages-per-slab
438 * + further values on SMP and with statistics enabled
439 */
440static const struct seq_operations slabinfo_op = {
441 .start = s_start,
442 .next = s_next,
443 .stop = s_stop,
444 .show = s_show,
445};
446
447static int slabinfo_open(struct inode *inode, struct file *file)
448{
449 return seq_open(file, &slabinfo_op);
450}
451
452static const struct file_operations proc_slabinfo_operations = {
453 .open = slabinfo_open,
454 .read = seq_read,
455 .write = slabinfo_write,
456 .llseek = seq_lseek,
457 .release = seq_release,
458};
459
460static int __init slab_proc_init(void)
461{
462 proc_create("slabinfo", S_IRUSR, NULL, &proc_slabinfo_operations);
463 return 0;
464}
465module_init(slab_proc_init);
466#endif /* CONFIG_SLABINFO */
diff --git a/mm/slob.c b/mm/slob.c
index 1e921c5e9576..a99fdf7a0907 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -28,9 +28,8 @@
28 * from kmalloc are prepended with a 4-byte header with the kmalloc size. 28 * from kmalloc are prepended with a 4-byte header with the kmalloc size.
29 * If kmalloc is asked for objects of PAGE_SIZE or larger, it calls 29 * If kmalloc is asked for objects of PAGE_SIZE or larger, it calls
30 * alloc_pages() directly, allocating compound pages so the page order 30 * alloc_pages() directly, allocating compound pages so the page order
31 * does not have to be separately tracked, and also stores the exact 31 * does not have to be separately tracked.
32 * allocation size in page->private so that it can be used to accurately 32 * These objects are detected in kfree() because PageSlab()
33 * provide ksize(). These objects are detected in kfree() because slob_page()
34 * is false for them. 33 * is false for them.
35 * 34 *
36 * SLAB is emulated on top of SLOB by simply calling constructors and 35 * SLAB is emulated on top of SLOB by simply calling constructors and
@@ -59,7 +58,6 @@
59 58
60#include <linux/kernel.h> 59#include <linux/kernel.h>
61#include <linux/slab.h> 60#include <linux/slab.h>
62#include "slab.h"
63 61
64#include <linux/mm.h> 62#include <linux/mm.h>
65#include <linux/swap.h> /* struct reclaim_state */ 63#include <linux/swap.h> /* struct reclaim_state */
@@ -74,6 +72,7 @@
74 72
75#include <linux/atomic.h> 73#include <linux/atomic.h>
76 74
75#include "slab.h"
77/* 76/*
78 * slob_block has a field 'units', which indicates size of block if +ve, 77 * slob_block has a field 'units', which indicates size of block if +ve,
79 * or offset of next block if -ve (in SLOB_UNITs). 78 * or offset of next block if -ve (in SLOB_UNITs).
@@ -124,7 +123,6 @@ static inline void clear_slob_page_free(struct page *sp)
124 123
125#define SLOB_UNIT sizeof(slob_t) 124#define SLOB_UNIT sizeof(slob_t)
126#define SLOB_UNITS(size) (((size) + SLOB_UNIT - 1)/SLOB_UNIT) 125#define SLOB_UNITS(size) (((size) + SLOB_UNIT - 1)/SLOB_UNIT)
127#define SLOB_ALIGN L1_CACHE_BYTES
128 126
129/* 127/*
130 * struct slob_rcu is inserted at the tail of allocated slob blocks, which 128 * struct slob_rcu is inserted at the tail of allocated slob blocks, which
@@ -455,11 +453,6 @@ __do_kmalloc_node(size_t size, gfp_t gfp, int node, unsigned long caller)
455 if (likely(order)) 453 if (likely(order))
456 gfp |= __GFP_COMP; 454 gfp |= __GFP_COMP;
457 ret = slob_new_pages(gfp, order, node); 455 ret = slob_new_pages(gfp, order, node);
458 if (ret) {
459 struct page *page;
460 page = virt_to_page(ret);
461 page->private = size;
462 }
463 456
464 trace_kmalloc_node(caller, ret, 457 trace_kmalloc_node(caller, ret,
465 size, PAGE_SIZE << order, gfp, node); 458 size, PAGE_SIZE << order, gfp, node);
@@ -506,7 +499,7 @@ void kfree(const void *block)
506 unsigned int *m = (unsigned int *)(block - align); 499 unsigned int *m = (unsigned int *)(block - align);
507 slob_free(m, *m + align); 500 slob_free(m, *m + align);
508 } else 501 } else
509 put_page(sp); 502 __free_pages(sp, compound_order(sp));
510} 503}
511EXPORT_SYMBOL(kfree); 504EXPORT_SYMBOL(kfree);
512 505
@@ -514,37 +507,30 @@ EXPORT_SYMBOL(kfree);
514size_t ksize(const void *block) 507size_t ksize(const void *block)
515{ 508{
516 struct page *sp; 509 struct page *sp;
510 int align;
511 unsigned int *m;
517 512
518 BUG_ON(!block); 513 BUG_ON(!block);
519 if (unlikely(block == ZERO_SIZE_PTR)) 514 if (unlikely(block == ZERO_SIZE_PTR))
520 return 0; 515 return 0;
521 516
522 sp = virt_to_page(block); 517 sp = virt_to_page(block);
523 if (PageSlab(sp)) { 518 if (unlikely(!PageSlab(sp)))
524 int align = max_t(size_t, ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN); 519 return PAGE_SIZE << compound_order(sp);
525 unsigned int *m = (unsigned int *)(block - align); 520
526 return SLOB_UNITS(*m) * SLOB_UNIT; 521 align = max_t(size_t, ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN);
527 } else 522 m = (unsigned int *)(block - align);
528 return sp->private; 523 return SLOB_UNITS(*m) * SLOB_UNIT;
529} 524}
530EXPORT_SYMBOL(ksize); 525EXPORT_SYMBOL(ksize);
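With page->private no longer carrying the exact request size, SLOB's ksize() derives the size of a large allocation from the compound page order, i.e. it now reports the rounded-up page span rather than the byte count originally asked for. A one-line model of that computation, assuming a 4096-byte page; the example numbers are illustrative.

#include <stdio.h>

#define TOY_PAGE_SIZE 4096UL

static unsigned long toy_big_ksize(unsigned int compound_order)
{
	return TOY_PAGE_SIZE << compound_order;
}

int main(void)
{
	/* e.g. a kmalloc() of ~5000 bytes backed by an order-1 compound page */
	printf("%lu\n", toy_big_ksize(1));	/* reports 8192, not the request */
	return 0;
}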
531 526
532int __kmem_cache_create(struct kmem_cache *c, unsigned long flags) 527int __kmem_cache_create(struct kmem_cache *c, unsigned long flags)
533{ 528{
534 size_t align = c->size;
535
536 if (flags & SLAB_DESTROY_BY_RCU) { 529 if (flags & SLAB_DESTROY_BY_RCU) {
537 /* leave room for rcu footer at the end of object */ 530 /* leave room for rcu footer at the end of object */
538 c->size += sizeof(struct slob_rcu); 531 c->size += sizeof(struct slob_rcu);
539 } 532 }
540 c->flags = flags; 533 c->flags = flags;
541 /* ignore alignment unless it's forced */
542 c->align = (flags & SLAB_HWCACHE_ALIGN) ? SLOB_ALIGN : 0;
543 if (c->align < ARCH_SLAB_MINALIGN)
544 c->align = ARCH_SLAB_MINALIGN;
545 if (c->align < align)
546 c->align = align;
547
548 return 0; 534 return 0;
549} 535}
550 536
@@ -558,12 +544,12 @@ void *kmem_cache_alloc_node(struct kmem_cache *c, gfp_t flags, int node)
558 544
559 if (c->size < PAGE_SIZE) { 545 if (c->size < PAGE_SIZE) {
560 b = slob_alloc(c->size, flags, c->align, node); 546 b = slob_alloc(c->size, flags, c->align, node);
561 trace_kmem_cache_alloc_node(_RET_IP_, b, c->size, 547 trace_kmem_cache_alloc_node(_RET_IP_, b, c->object_size,
562 SLOB_UNITS(c->size) * SLOB_UNIT, 548 SLOB_UNITS(c->size) * SLOB_UNIT,
563 flags, node); 549 flags, node);
564 } else { 550 } else {
565 b = slob_new_pages(flags, get_order(c->size), node); 551 b = slob_new_pages(flags, get_order(c->size), node);
566 trace_kmem_cache_alloc_node(_RET_IP_, b, c->size, 552 trace_kmem_cache_alloc_node(_RET_IP_, b, c->object_size,
567 PAGE_SIZE << get_order(c->size), 553 PAGE_SIZE << get_order(c->size),
568 flags, node); 554 flags, node);
569 } 555 }
@@ -608,12 +594,6 @@ void kmem_cache_free(struct kmem_cache *c, void *b)
608} 594}
609EXPORT_SYMBOL(kmem_cache_free); 595EXPORT_SYMBOL(kmem_cache_free);
610 596
611unsigned int kmem_cache_size(struct kmem_cache *c)
612{
613 return c->size;
614}
615EXPORT_SYMBOL(kmem_cache_size);
616
617int __kmem_cache_shutdown(struct kmem_cache *c) 597int __kmem_cache_shutdown(struct kmem_cache *c)
618{ 598{
619 /* No way to check for remaining objects */ 599 /* No way to check for remaining objects */
diff --git a/mm/slub.c b/mm/slub.c
index a0d698467f70..ba2ca53f6c3a 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -31,6 +31,7 @@
31#include <linux/fault-inject.h> 31#include <linux/fault-inject.h>
32#include <linux/stacktrace.h> 32#include <linux/stacktrace.h>
33#include <linux/prefetch.h> 33#include <linux/prefetch.h>
34#include <linux/memcontrol.h>
34 35
35#include <trace/events/kmem.h> 36#include <trace/events/kmem.h>
36 37
@@ -112,9 +113,6 @@
112 * the fast path and disables lockless freelists. 113 * the fast path and disables lockless freelists.
113 */ 114 */
114 115
115#define SLAB_DEBUG_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \
116 SLAB_TRACE | SLAB_DEBUG_FREE)
117
118static inline int kmem_cache_debug(struct kmem_cache *s) 116static inline int kmem_cache_debug(struct kmem_cache *s)
119{ 117{
120#ifdef CONFIG_SLUB_DEBUG 118#ifdef CONFIG_SLUB_DEBUG
@@ -179,8 +177,6 @@ static inline int kmem_cache_debug(struct kmem_cache *s)
179#define __OBJECT_POISON 0x80000000UL /* Poison object */ 177#define __OBJECT_POISON 0x80000000UL /* Poison object */
180#define __CMPXCHG_DOUBLE 0x40000000UL /* Use cmpxchg_double */ 178#define __CMPXCHG_DOUBLE 0x40000000UL /* Use cmpxchg_double */
181 179
182static int kmem_size = sizeof(struct kmem_cache);
183
184#ifdef CONFIG_SMP 180#ifdef CONFIG_SMP
185static struct notifier_block slab_notifier; 181static struct notifier_block slab_notifier;
186#endif 182#endif
@@ -205,13 +201,14 @@ enum track_item { TRACK_ALLOC, TRACK_FREE };
205static int sysfs_slab_add(struct kmem_cache *); 201static int sysfs_slab_add(struct kmem_cache *);
206static int sysfs_slab_alias(struct kmem_cache *, const char *); 202static int sysfs_slab_alias(struct kmem_cache *, const char *);
207static void sysfs_slab_remove(struct kmem_cache *); 203static void sysfs_slab_remove(struct kmem_cache *);
208 204static void memcg_propagate_slab_attrs(struct kmem_cache *s);
209#else 205#else
210static inline int sysfs_slab_add(struct kmem_cache *s) { return 0; } 206static inline int sysfs_slab_add(struct kmem_cache *s) { return 0; }
211static inline int sysfs_slab_alias(struct kmem_cache *s, const char *p) 207static inline int sysfs_slab_alias(struct kmem_cache *s, const char *p)
212 { return 0; } 208 { return 0; }
213static inline void sysfs_slab_remove(struct kmem_cache *s) { } 209static inline void sysfs_slab_remove(struct kmem_cache *s) { }
214 210
211static inline void memcg_propagate_slab_attrs(struct kmem_cache *s) { }
215#endif 212#endif
216 213
217static inline void stat(const struct kmem_cache *s, enum stat_item si) 214static inline void stat(const struct kmem_cache *s, enum stat_item si)
@@ -1092,11 +1089,11 @@ static noinline struct kmem_cache_node *free_debug_processing(
1092 if (!check_object(s, page, object, SLUB_RED_ACTIVE)) 1089 if (!check_object(s, page, object, SLUB_RED_ACTIVE))
1093 goto out; 1090 goto out;
1094 1091
1095 if (unlikely(s != page->slab)) { 1092 if (unlikely(s != page->slab_cache)) {
1096 if (!PageSlab(page)) { 1093 if (!PageSlab(page)) {
1097 slab_err(s, page, "Attempt to free object(0x%p) " 1094 slab_err(s, page, "Attempt to free object(0x%p) "
1098 "outside of slab", object); 1095 "outside of slab", object);
1099 } else if (!page->slab) { 1096 } else if (!page->slab_cache) {
1100 printk(KERN_ERR 1097 printk(KERN_ERR
1101 "SLUB <none>: no slab for object 0x%p.\n", 1098 "SLUB <none>: no slab for object 0x%p.\n",
1102 object); 1099 object);
@@ -1348,6 +1345,7 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
1348 void *start; 1345 void *start;
1349 void *last; 1346 void *last;
1350 void *p; 1347 void *p;
1348 int order;
1351 1349
1352 BUG_ON(flags & GFP_SLAB_BUG_MASK); 1350 BUG_ON(flags & GFP_SLAB_BUG_MASK);
1353 1351
@@ -1356,8 +1354,10 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
1356 if (!page) 1354 if (!page)
1357 goto out; 1355 goto out;
1358 1356
1357 order = compound_order(page);
1359 inc_slabs_node(s, page_to_nid(page), page->objects); 1358 inc_slabs_node(s, page_to_nid(page), page->objects);
1360 page->slab = s; 1359 memcg_bind_pages(s, order);
1360 page->slab_cache = s;
1361 __SetPageSlab(page); 1361 __SetPageSlab(page);
1362 if (page->pfmemalloc) 1362 if (page->pfmemalloc)
1363 SetPageSlabPfmemalloc(page); 1363 SetPageSlabPfmemalloc(page);
@@ -1365,7 +1365,7 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
1365 start = page_address(page); 1365 start = page_address(page);
1366 1366
1367 if (unlikely(s->flags & SLAB_POISON)) 1367 if (unlikely(s->flags & SLAB_POISON))
1368 memset(start, POISON_INUSE, PAGE_SIZE << compound_order(page)); 1368 memset(start, POISON_INUSE, PAGE_SIZE << order);
1369 1369
1370 last = start; 1370 last = start;
1371 for_each_object(p, s, start, page->objects) { 1371 for_each_object(p, s, start, page->objects) {
@@ -1406,10 +1406,12 @@ static void __free_slab(struct kmem_cache *s, struct page *page)
1406 1406
1407 __ClearPageSlabPfmemalloc(page); 1407 __ClearPageSlabPfmemalloc(page);
1408 __ClearPageSlab(page); 1408 __ClearPageSlab(page);
1409
1410 memcg_release_pages(s, order);
1409 reset_page_mapcount(page); 1411 reset_page_mapcount(page);
1410 if (current->reclaim_state) 1412 if (current->reclaim_state)
1411 current->reclaim_state->reclaimed_slab += pages; 1413 current->reclaim_state->reclaimed_slab += pages;
1412 __free_pages(page, order); 1414 __free_memcg_kmem_pages(page, order);
1413} 1415}
1414 1416
1415#define need_reserve_slab_rcu \ 1417#define need_reserve_slab_rcu \
@@ -1424,7 +1426,7 @@ static void rcu_free_slab(struct rcu_head *h)
1424 else 1426 else
1425 page = container_of((struct list_head *)h, struct page, lru); 1427 page = container_of((struct list_head *)h, struct page, lru);
1426 1428
1427 __free_slab(page->slab, page); 1429 __free_slab(page->slab_cache, page);
1428} 1430}
1429 1431
1430static void free_slab(struct kmem_cache *s, struct page *page) 1432static void free_slab(struct kmem_cache *s, struct page *page)
@@ -1872,12 +1874,14 @@ redo:
1872/* 1874/*
1873 * Unfreeze all the cpu partial slabs. 1875 * Unfreeze all the cpu partial slabs.
1874 * 1876 *
1875 * This function must be called with interrupt disabled. 1877 * This function must be called with interrupts disabled
 1878 * for the cpu using c (or some other mechanism must rule out
 1879 * concurrent accesses).
1876 */ 1880 */
1877static void unfreeze_partials(struct kmem_cache *s) 1881static void unfreeze_partials(struct kmem_cache *s,
1882 struct kmem_cache_cpu *c)
1878{ 1883{
1879 struct kmem_cache_node *n = NULL, *n2 = NULL; 1884 struct kmem_cache_node *n = NULL, *n2 = NULL;
1880 struct kmem_cache_cpu *c = this_cpu_ptr(s->cpu_slab);
1881 struct page *page, *discard_page = NULL; 1885 struct page *page, *discard_page = NULL;
1882 1886
1883 while ((page = c->partial)) { 1887 while ((page = c->partial)) {
@@ -1963,7 +1967,7 @@ static int put_cpu_partial(struct kmem_cache *s, struct page *page, int drain)
1963 * set to the per node partial list. 1967 * set to the per node partial list.
1964 */ 1968 */
1965 local_irq_save(flags); 1969 local_irq_save(flags);
1966 unfreeze_partials(s); 1970 unfreeze_partials(s, this_cpu_ptr(s->cpu_slab));
1967 local_irq_restore(flags); 1971 local_irq_restore(flags);
1968 oldpage = NULL; 1972 oldpage = NULL;
1969 pobjects = 0; 1973 pobjects = 0;
@@ -2006,7 +2010,7 @@ static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu)
2006 if (c->page) 2010 if (c->page)
2007 flush_slab(s, c); 2011 flush_slab(s, c);
2008 2012
2009 unfreeze_partials(s); 2013 unfreeze_partials(s, c);
2010 } 2014 }
2011} 2015}
2012 2016
@@ -2325,6 +2329,7 @@ static __always_inline void *slab_alloc_node(struct kmem_cache *s,
2325 if (slab_pre_alloc_hook(s, gfpflags)) 2329 if (slab_pre_alloc_hook(s, gfpflags))
2326 return NULL; 2330 return NULL;
2327 2331
2332 s = memcg_kmem_get_cache(s, gfpflags);
2328redo: 2333redo:
2329 2334
2330 /* 2335 /*
@@ -2459,7 +2464,6 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
2459 void *prior; 2464 void *prior;
2460 void **object = (void *)x; 2465 void **object = (void *)x;
2461 int was_frozen; 2466 int was_frozen;
2462 int inuse;
2463 struct page new; 2467 struct page new;
2464 unsigned long counters; 2468 unsigned long counters;
2465 struct kmem_cache_node *n = NULL; 2469 struct kmem_cache_node *n = NULL;
@@ -2472,13 +2476,17 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
2472 return; 2476 return;
2473 2477
2474 do { 2478 do {
2479 if (unlikely(n)) {
2480 spin_unlock_irqrestore(&n->list_lock, flags);
2481 n = NULL;
2482 }
2475 prior = page->freelist; 2483 prior = page->freelist;
2476 counters = page->counters; 2484 counters = page->counters;
2477 set_freepointer(s, object, prior); 2485 set_freepointer(s, object, prior);
2478 new.counters = counters; 2486 new.counters = counters;
2479 was_frozen = new.frozen; 2487 was_frozen = new.frozen;
2480 new.inuse--; 2488 new.inuse--;
2481 if ((!new.inuse || !prior) && !was_frozen && !n) { 2489 if ((!new.inuse || !prior) && !was_frozen) {
2482 2490
2483 if (!kmem_cache_debug(s) && !prior) 2491 if (!kmem_cache_debug(s) && !prior)
2484 2492
@@ -2503,7 +2511,6 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
2503 2511
2504 } 2512 }
2505 } 2513 }
2506 inuse = new.inuse;
2507 2514
2508 } while (!cmpxchg_double_slab(s, page, 2515 } while (!cmpxchg_double_slab(s, page,
2509 prior, counters, 2516 prior, counters,
@@ -2529,25 +2536,17 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
2529 return; 2536 return;
2530 } 2537 }
2531 2538
2539 if (unlikely(!new.inuse && n->nr_partial > s->min_partial))
2540 goto slab_empty;
2541
2532 /* 2542 /*
2533 * was_frozen may have been set after we acquired the list_lock in 2543 * Objects left in the slab. If it was not on the partial list before
2534 * an earlier loop. So we need to check it here again. 2544 * then add it.
2535 */ 2545 */
2536 if (was_frozen) 2546 if (kmem_cache_debug(s) && unlikely(!prior)) {
2537 stat(s, FREE_FROZEN); 2547 remove_full(s, page);
2538 else { 2548 add_partial(n, page, DEACTIVATE_TO_TAIL);
2539 if (unlikely(!inuse && n->nr_partial > s->min_partial)) 2549 stat(s, FREE_ADD_PARTIAL);
2540 goto slab_empty;
2541
2542 /*
2543 * Objects left in the slab. If it was not on the partial list before
2544 * then add it.
2545 */
2546 if (unlikely(!prior)) {
2547 remove_full(s, page);
2548 add_partial(n, page, DEACTIVATE_TO_TAIL);
2549 stat(s, FREE_ADD_PARTIAL);
2550 }
2551 } 2550 }
2552 spin_unlock_irqrestore(&n->list_lock, flags); 2551 spin_unlock_irqrestore(&n->list_lock, flags);
2553 return; 2552 return;
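The restructured __slab_free() may take the node's list_lock speculatively inside the cmpxchg retry loop (when the free would empty the slab, or debugging needs list changes); the new check at the top of the loop drops that lock before retrying, so a lost race never leaves it held with stale state. Below is an abstract userspace model of that retry shape, using a plain counter, C11 atomics and a pthread mutex in place of cmpxchg_double_slab() and the spinlock; it is a sketch of the pattern, not the kernel's logic.

#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static _Atomic int counters = 5;		/* stands in for page->counters */
static pthread_mutex_t node_lock = PTHREAD_MUTEX_INITIALIZER;

static void toy_free_one(void)
{
	int old, new;
	bool locked = false;

	do {
		if (locked) {			/* lost the race last time: drop the lock first */
			pthread_mutex_unlock(&node_lock);
			locked = false;
		}
		old = atomic_load(&counters);
		new = old - 1;
		if (new == 0) {			/* this free would empty the slab */
			pthread_mutex_lock(&node_lock);
			locked = true;
		}
	} while (!atomic_compare_exchange_weak(&counters, &old, new));

	if (locked)				/* do the list manipulation, then unlock */
		pthread_mutex_unlock(&node_lock);
}

int main(void)
{
	for (int i = 0; i < 5; i++)
		toy_free_one();
	printf("remaining objects: %d\n", atomic_load(&counters));
	return 0;
}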
@@ -2619,19 +2618,10 @@ redo:
2619 2618
2620void kmem_cache_free(struct kmem_cache *s, void *x) 2619void kmem_cache_free(struct kmem_cache *s, void *x)
2621{ 2620{
2622 struct page *page; 2621 s = cache_from_obj(s, x);
2623 2622 if (!s)
2624 page = virt_to_head_page(x);
2625
2626 if (kmem_cache_debug(s) && page->slab != s) {
2627 pr_err("kmem_cache_free: Wrong slab cache. %s but object"
2628 " is from %s\n", page->slab->name, s->name);
2629 WARN_ON_ONCE(1);
2630 return; 2623 return;
2631 } 2624 slab_free(s, virt_to_head_page(x), x, _RET_IP_);
2632
2633 slab_free(s, page, x, _RET_IP_);
2634
2635 trace_kmem_cache_free(_RET_IP_, x); 2625 trace_kmem_cache_free(_RET_IP_, x);
2636} 2626}
2637EXPORT_SYMBOL(kmem_cache_free); 2627EXPORT_SYMBOL(kmem_cache_free);
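kmem_cache_free() above no longer open-codes the 'object really belongs to this cache' debug check; it relies on cache_from_obj() in the common slab code. A toy sketch of that kind of check, with invented types standing in for struct page and struct kmem_cache:

/* Illustrative sketch only; the types and names below are invented. */
#include <stdio.h>

struct toy_cache { const char *name; };
struct toy_page  { struct toy_cache *slab_cache; };

/* Return the cache if it matches what the object's backing page records,
 * otherwise warn and return NULL so the caller can skip the free. */
static struct toy_cache *toy_cache_from_obj(struct toy_cache *s,
					    struct toy_page *page)
{
	if (page->slab_cache != s) {
		fprintf(stderr, "free: object is from %s, not %s\n",
			page->slab_cache->name, s->name);
		return NULL;
	}
	return s;
}

int main(void)
{
	struct toy_cache a = { "cache-a" }, b = { "cache-b" };
	struct toy_page page = { &a };

	toy_cache_from_obj(&a, &page);	/* match: free would proceed */
	toy_cache_from_obj(&b, &page);	/* mismatch: warns, free skipped */
	return 0;
}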
@@ -2769,32 +2759,6 @@ static inline int calculate_order(int size, int reserved)
2769 return -ENOSYS; 2759 return -ENOSYS;
2770} 2760}
2771 2761
2772/*
2773 * Figure out what the alignment of the objects will be.
2774 */
2775static unsigned long calculate_alignment(unsigned long flags,
2776 unsigned long align, unsigned long size)
2777{
2778 /*
2779 * If the user wants hardware cache aligned objects then follow that
2780 * suggestion if the object is sufficiently large.
2781 *
2782 * The hardware cache alignment cannot override the specified
2783 * alignment though. If that is greater then use it.
2784 */
2785 if (flags & SLAB_HWCACHE_ALIGN) {
2786 unsigned long ralign = cache_line_size();
2787 while (size <= ralign / 2)
2788 ralign /= 2;
2789 align = max(align, ralign);
2790 }
2791
2792 if (align < ARCH_SLAB_MINALIGN)
2793 align = ARCH_SLAB_MINALIGN;
2794
2795 return ALIGN(align, sizeof(void *));
2796}
2797
2798static void 2762static void
2799init_kmem_cache_node(struct kmem_cache_node *n) 2763init_kmem_cache_node(struct kmem_cache_node *n)
2800{ 2764{
@@ -2928,7 +2892,6 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order)
2928{ 2892{
2929 unsigned long flags = s->flags; 2893 unsigned long flags = s->flags;
2930 unsigned long size = s->object_size; 2894 unsigned long size = s->object_size;
2931 unsigned long align = s->align;
2932 int order; 2895 int order;
2933 2896
2934 /* 2897 /*
@@ -3000,19 +2963,11 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order)
3000#endif 2963#endif
3001 2964
3002 /* 2965 /*
3003 * Determine the alignment based on various parameters that the
3004 * user specified and the dynamic determination of cache line size
3005 * on bootup.
3006 */
3007 align = calculate_alignment(flags, align, s->object_size);
3008 s->align = align;
3009
3010 /*
3011 * SLUB stores one object immediately after another beginning from 2966 * SLUB stores one object immediately after another beginning from
3012 * offset 0. In order to align the objects we have to simply size 2967 * offset 0. In order to align the objects we have to simply size
3013 * each object to conform to the alignment. 2968 * each object to conform to the alignment.
3014 */ 2969 */
3015 size = ALIGN(size, align); 2970 size = ALIGN(size, s->align);
3016 s->size = size; 2971 s->size = size;
3017 if (forced_order >= 0) 2972 if (forced_order >= 0)
3018 order = forced_order; 2973 order = forced_order;
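calculate_sizes() above no longer recomputes the alignment itself; s->align is settled by common code beforehand, and the object size is simply rounded up to it so that consecutively packed objects stay aligned. The rounding is ordinary power-of-two alignment arithmetic, re-derived here as a tiny stand-alone example (the macro and values are illustrative and assume a power-of-two alignment):

/* Stand-alone illustration of the size rounding used above. */
#include <assert.h>
#include <stddef.h>

/* Round x up to the next multiple of a; a must be a power of two. */
#define TOY_ALIGN(x, a)	(((x) + (a) - 1) & ~((size_t)(a) - 1))

int main(void)
{
	/* e.g. a 52-byte object in a cache aligned to 64 bytes */
	assert(TOY_ALIGN(52, 64) == 64);
	assert(TOY_ALIGN(64, 64) == 64);
	assert(TOY_ALIGN(65, 64) == 128);
	return 0;
}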
@@ -3041,7 +2996,6 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order)
3041 s->max = s->oo; 2996 s->max = s->oo;
3042 2997
3043 return !!oo_objects(s->oo); 2998 return !!oo_objects(s->oo);
3044
3045} 2999}
3046 3000
3047static int kmem_cache_open(struct kmem_cache *s, unsigned long flags) 3001static int kmem_cache_open(struct kmem_cache *s, unsigned long flags)
@@ -3127,15 +3081,6 @@ error:
3127 return -EINVAL; 3081 return -EINVAL;
3128} 3082}
3129 3083
3130/*
3131 * Determine the size of a slab object
3132 */
3133unsigned int kmem_cache_size(struct kmem_cache *s)
3134{
3135 return s->object_size;
3136}
3137EXPORT_SYMBOL(kmem_cache_size);
3138
3139static void list_slab_objects(struct kmem_cache *s, struct page *page, 3084static void list_slab_objects(struct kmem_cache *s, struct page *page,
3140 const char *text) 3085 const char *text)
3141{ 3086{
@@ -3208,8 +3153,19 @@ int __kmem_cache_shutdown(struct kmem_cache *s)
3208{ 3153{
3209 int rc = kmem_cache_close(s); 3154 int rc = kmem_cache_close(s);
3210 3155
3211 if (!rc) 3156 if (!rc) {
3157 /*
3158 * We use the same locking strategy around sysfs_slab_add; see
3159 * __kmem_cache_create. Because this is pretty much the last
3160 * operation we do and the lock will be released shortly after
3161 * that in slab_common.c, we could just move sysfs_slab_remove
3162 * to a later point in common code. We should do that when we
3163 * have a common sysfs framework for all allocators.
3164 */
3165 mutex_unlock(&slab_mutex);
3212 sysfs_slab_remove(s); 3166 sysfs_slab_remove(s);
3167 mutex_lock(&slab_mutex);
3168 }
3213 3169
3214 return rc; 3170 return rc;
3215} 3171}
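As the comment block added above explains, slab_mutex is deliberately released around sysfs_slab_remove() and re-acquired afterwards, matching what __kmem_cache_create() does around sysfs_slab_add(). A generic sketch of that 'drop the caller's lock around an external call' pattern, with a pthread mutex and invented names standing in for slab_mutex and the sysfs call:

/* Generic pattern sketch; names are illustrative only. */
#include <pthread.h>

static pthread_mutex_t registry_lock = PTHREAD_MUTEX_INITIALIZER;

/* Called with registry_lock held, the way __kmem_cache_shutdown() is
 * called with slab_mutex held by common code. */
static void shutdown_locked(void *obj, void (*external_remove)(void *))
{
	/* The external call may sleep or take other locks of its own, so
	 * the caller's lock is dropped for its duration and then retaken. */
	pthread_mutex_unlock(&registry_lock);
	external_remove(obj);
	pthread_mutex_lock(&registry_lock);
}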
@@ -3261,32 +3217,6 @@ static int __init setup_slub_nomerge(char *str)
3261 3217
3262__setup("slub_nomerge", setup_slub_nomerge); 3218__setup("slub_nomerge", setup_slub_nomerge);
3263 3219
3264static struct kmem_cache *__init create_kmalloc_cache(const char *name,
3265 int size, unsigned int flags)
3266{
3267 struct kmem_cache *s;
3268
3269 s = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT);
3270
3271 s->name = name;
3272 s->size = s->object_size = size;
3273 s->align = ARCH_KMALLOC_MINALIGN;
3274
3275 /*
3276 * This function is called with IRQs disabled during early-boot on
3277 * single CPU so there's no need to take slab_mutex here.
3278 */
3279 if (kmem_cache_open(s, flags))
3280 goto panic;
3281
3282 list_add(&s->list, &slab_caches);
3283 return s;
3284
3285panic:
3286 panic("Creation of kmalloc slab %s size=%d failed.\n", name, size);
3287 return NULL;
3288}
3289
3290/* 3220/*
3291 * Conversion table for small slabs sizes / 8 to the index in the 3221 * Conversion table for small slabs sizes / 8 to the index in the
3292 * kmalloc array. This is necessary for slabs < 192 since we have non power 3222 * kmalloc array. This is necessary for slabs < 192 since we have non power
@@ -3372,7 +3302,7 @@ static void *kmalloc_large_node(size_t size, gfp_t flags, int node)
3372 struct page *page; 3302 struct page *page;
3373 void *ptr = NULL; 3303 void *ptr = NULL;
3374 3304
3375 flags |= __GFP_COMP | __GFP_NOTRACK; 3305 flags |= __GFP_COMP | __GFP_NOTRACK | __GFP_KMEMCG;
3376 page = alloc_pages_node(node, flags, get_order(size)); 3306 page = alloc_pages_node(node, flags, get_order(size));
3377 if (page) 3307 if (page)
3378 ptr = page_address(page); 3308 ptr = page_address(page);
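kmalloc_large_node() above hands oversized requests straight to the page allocator as compound pages, now also tagged __GFP_KMEMCG so they are accounted to the memcg, and ksize() later reports PAGE_SIZE << compound_order for such allocations. A small self-contained illustration of the size-to-order rounding involved (page size and helper re-derived for the example; not the kernel's get_order()):

/* Illustration of how a large allocation size maps to a page order. */
#include <assert.h>
#include <stddef.h>

#define TOY_PAGE_SHIFT	12			/* assume 4 KiB pages */
#define TOY_PAGE_SIZE	((size_t)1 << TOY_PAGE_SHIFT)

/* Smallest order such that (TOY_PAGE_SIZE << order) >= size. */
static unsigned int toy_get_order(size_t size)
{
	unsigned int order = 0;

	while ((TOY_PAGE_SIZE << order) < size)
		order++;
	return order;
}

int main(void)
{
	assert(toy_get_order(4096) == 0);
	assert(toy_get_order(3 * 4096) == 2);	/* rounded up to 16 KiB */
	/* ksize()-style report: usable size is the whole compound block */
	assert((TOY_PAGE_SIZE << toy_get_order(5000)) == 8192);
	return 0;
}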
@@ -3424,7 +3354,7 @@ size_t ksize(const void *object)
3424 return PAGE_SIZE << compound_order(page); 3354 return PAGE_SIZE << compound_order(page);
3425 } 3355 }
3426 3356
3427 return slab_ksize(page->slab); 3357 return slab_ksize(page->slab_cache);
3428} 3358}
3429EXPORT_SYMBOL(ksize); 3359EXPORT_SYMBOL(ksize);
3430 3360
@@ -3449,8 +3379,8 @@ bool verify_mem_not_deleted(const void *x)
3449 } 3379 }
3450 3380
3451 slab_lock(page); 3381 slab_lock(page);
3452 if (on_freelist(page->slab, page, object)) { 3382 if (on_freelist(page->slab_cache, page, object)) {
3453 object_err(page->slab, page, object, "Object is on free-list"); 3383 object_err(page->slab_cache, page, object, "Object is on free-list");
3454 rv = false; 3384 rv = false;
3455 } else { 3385 } else {
3456 rv = true; 3386 rv = true;
@@ -3478,10 +3408,10 @@ void kfree(const void *x)
3478 if (unlikely(!PageSlab(page))) { 3408 if (unlikely(!PageSlab(page))) {
3479 BUG_ON(!PageCompound(page)); 3409 BUG_ON(!PageCompound(page));
3480 kmemleak_free(x); 3410 kmemleak_free(x);
3481 __free_pages(page, compound_order(page)); 3411 __free_memcg_kmem_pages(page, compound_order(page));
3482 return; 3412 return;
3483 } 3413 }
3484 slab_free(page->slab, page, object, _RET_IP_); 3414 slab_free(page->slab_cache, page, object, _RET_IP_);
3485} 3415}
3486EXPORT_SYMBOL(kfree); 3416EXPORT_SYMBOL(kfree);
3487 3417
@@ -3573,7 +3503,7 @@ static void slab_mem_offline_callback(void *arg)
3573 struct memory_notify *marg = arg; 3503 struct memory_notify *marg = arg;
3574 int offline_node; 3504 int offline_node;
3575 3505
3576 offline_node = marg->status_change_nid; 3506 offline_node = marg->status_change_nid_normal;
3577 3507
3578 /* 3508 /*
3579 * If the node still has available memory. we need kmem_cache_node 3509 * If the node still has available memory. we need kmem_cache_node
@@ -3606,7 +3536,7 @@ static int slab_mem_going_online_callback(void *arg)
3606 struct kmem_cache_node *n; 3536 struct kmem_cache_node *n;
3607 struct kmem_cache *s; 3537 struct kmem_cache *s;
3608 struct memory_notify *marg = arg; 3538 struct memory_notify *marg = arg;
3609 int nid = marg->status_change_nid; 3539 int nid = marg->status_change_nid_normal;
3610 int ret = 0; 3540 int ret = 0;
3611 3541
3612 /* 3542 /*
@@ -3676,15 +3606,16 @@ static int slab_memory_callback(struct notifier_block *self,
3676 3606
3677/* 3607/*
3678 * Used for early kmem_cache structures that were allocated using 3608 * Used for early kmem_cache structures that were allocated using
3679 * the page allocator 3609 * the page allocator. Allocate them properly then fix up the pointers
3610 * that may be pointing to the wrong kmem_cache structure.
3680 */ 3611 */
3681 3612
3682static void __init kmem_cache_bootstrap_fixup(struct kmem_cache *s) 3613static struct kmem_cache * __init bootstrap(struct kmem_cache *static_cache)
3683{ 3614{
3684 int node; 3615 int node;
3616 struct kmem_cache *s = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT);
3685 3617
3686 list_add(&s->list, &slab_caches); 3618 memcpy(s, static_cache, kmem_cache->object_size);
3687 s->refcount = -1;
3688 3619
3689 for_each_node_state(node, N_NORMAL_MEMORY) { 3620 for_each_node_state(node, N_NORMAL_MEMORY) {
3690 struct kmem_cache_node *n = get_node(s, node); 3621 struct kmem_cache_node *n = get_node(s, node);
@@ -3692,78 +3623,52 @@ static void __init kmem_cache_bootstrap_fixup(struct kmem_cache *s)
3692 3623
3693 if (n) { 3624 if (n) {
3694 list_for_each_entry(p, &n->partial, lru) 3625 list_for_each_entry(p, &n->partial, lru)
3695 p->slab = s; 3626 p->slab_cache = s;
3696 3627
3697#ifdef CONFIG_SLUB_DEBUG 3628#ifdef CONFIG_SLUB_DEBUG
3698 list_for_each_entry(p, &n->full, lru) 3629 list_for_each_entry(p, &n->full, lru)
3699 p->slab = s; 3630 p->slab_cache = s;
3700#endif 3631#endif
3701 } 3632 }
3702 } 3633 }
3634 list_add(&s->list, &slab_caches);
3635 return s;
3703} 3636}
3704 3637
3705void __init kmem_cache_init(void) 3638void __init kmem_cache_init(void)
3706{ 3639{
3640 static __initdata struct kmem_cache boot_kmem_cache,
3641 boot_kmem_cache_node;
3707 int i; 3642 int i;
3708 int caches = 0; 3643 int caches = 2;
3709 struct kmem_cache *temp_kmem_cache;
3710 int order;
3711 struct kmem_cache *temp_kmem_cache_node;
3712 unsigned long kmalloc_size;
3713 3644
3714 if (debug_guardpage_minorder()) 3645 if (debug_guardpage_minorder())
3715 slub_max_order = 0; 3646 slub_max_order = 0;
3716 3647
3717 kmem_size = offsetof(struct kmem_cache, node) + 3648 kmem_cache_node = &boot_kmem_cache_node;
3718 nr_node_ids * sizeof(struct kmem_cache_node *); 3649 kmem_cache = &boot_kmem_cache;
3719
3720 /* Allocate two kmem_caches from the page allocator */
3721 kmalloc_size = ALIGN(kmem_size, cache_line_size());
3722 order = get_order(2 * kmalloc_size);
3723 kmem_cache = (void *)__get_free_pages(GFP_NOWAIT | __GFP_ZERO, order);
3724
3725 /*
3726 * Must first have the slab cache available for the allocations of the
3727 * struct kmem_cache_node's. There is special bootstrap code in
3728 * kmem_cache_open for slab_state == DOWN.
3729 */
3730 kmem_cache_node = (void *)kmem_cache + kmalloc_size;
3731 3650
3732 kmem_cache_node->name = "kmem_cache_node"; 3651 create_boot_cache(kmem_cache_node, "kmem_cache_node",
3733 kmem_cache_node->size = kmem_cache_node->object_size = 3652 sizeof(struct kmem_cache_node), SLAB_HWCACHE_ALIGN);
3734 sizeof(struct kmem_cache_node);
3735 kmem_cache_open(kmem_cache_node, SLAB_HWCACHE_ALIGN | SLAB_PANIC);
3736 3653
3737 hotplug_memory_notifier(slab_memory_callback, SLAB_CALLBACK_PRI); 3654 hotplug_memory_notifier(slab_memory_callback, SLAB_CALLBACK_PRI);
3738 3655
3739 /* Able to allocate the per node structures */ 3656 /* Able to allocate the per node structures */
3740 slab_state = PARTIAL; 3657 slab_state = PARTIAL;
3741 3658
3742 temp_kmem_cache = kmem_cache; 3659 create_boot_cache(kmem_cache, "kmem_cache",
3743 kmem_cache->name = "kmem_cache"; 3660 offsetof(struct kmem_cache, node) +
3744 kmem_cache->size = kmem_cache->object_size = kmem_size; 3661 nr_node_ids * sizeof(struct kmem_cache_node *),
3745 kmem_cache_open(kmem_cache, SLAB_HWCACHE_ALIGN | SLAB_PANIC); 3662 SLAB_HWCACHE_ALIGN);
3746 3663
3747 kmem_cache = kmem_cache_alloc(kmem_cache, GFP_NOWAIT); 3664 kmem_cache = bootstrap(&boot_kmem_cache);
3748 memcpy(kmem_cache, temp_kmem_cache, kmem_size);
3749 3665
3750 /* 3666 /*
3751 * Allocate kmem_cache_node properly from the kmem_cache slab. 3667 * Allocate kmem_cache_node properly from the kmem_cache slab.
3752 * kmem_cache_node is separately allocated so no need to 3668 * kmem_cache_node is separately allocated so no need to
3753 * update any list pointers. 3669 * update any list pointers.
3754 */ 3670 */
3755 temp_kmem_cache_node = kmem_cache_node; 3671 kmem_cache_node = bootstrap(&boot_kmem_cache_node);
3756
3757 kmem_cache_node = kmem_cache_alloc(kmem_cache, GFP_NOWAIT);
3758 memcpy(kmem_cache_node, temp_kmem_cache_node, kmem_size);
3759
3760 kmem_cache_bootstrap_fixup(kmem_cache_node);
3761
3762 caches++;
3763 kmem_cache_bootstrap_fixup(kmem_cache);
3764 caches++;
3765 /* Free temporary boot structure */
3766 free_pages((unsigned long)temp_kmem_cache, order);
3767 3672
3768 /* Now we can use the kmem_cache to allocate kmalloc slabs */ 3673 /* Now we can use the kmem_cache to allocate kmalloc slabs */
3769 3674
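bootstrap() above replaces the old kmem_cache_bootstrap_fixup() dance: the boot copies of kmem_cache and kmem_cache_node now live in static __initdata structures, are copied into properly allocated objects once the allocator can serve them, and the slab_cache back-pointers of pages created during boot are repointed at the final copies. A condensed user-space sketch of that 'start static, relocate, fix back-references' idea (all types and names invented):

/* Sketch of the boot-time "start static, then relocate" pattern. */
#include <stdlib.h>
#include <string.h>

struct toy_cache {
	const char *name;
	struct toy_cache **users;	/* back-pointers recorded so far */
	int nr_users;
};

/* Copy the static boot descriptor into a properly allocated one and
 * repoint every recorded back-reference at the new copy, the way
 * bootstrap() fixes up page->slab_cache for early slabs. */
static struct toy_cache *toy_bootstrap(struct toy_cache *static_cache)
{
	struct toy_cache *s = malloc(sizeof(*s));
	int i;

	memcpy(s, static_cache, sizeof(*s));
	for (i = 0; i < s->nr_users; i++)
		s->users[i] = s;
	return s;
}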
@@ -3891,7 +3796,7 @@ static int slab_unmergeable(struct kmem_cache *s)
3891 return 0; 3796 return 0;
3892} 3797}
3893 3798
3894static struct kmem_cache *find_mergeable(size_t size, 3799static struct kmem_cache *find_mergeable(struct mem_cgroup *memcg, size_t size,
3895 size_t align, unsigned long flags, const char *name, 3800 size_t align, unsigned long flags, const char *name,
3896 void (*ctor)(void *)) 3801 void (*ctor)(void *))
3897{ 3802{
@@ -3927,17 +3832,21 @@ static struct kmem_cache *find_mergeable(size_t size,
3927 if (s->size - size >= sizeof(void *)) 3832 if (s->size - size >= sizeof(void *))
3928 continue; 3833 continue;
3929 3834
3835 if (!cache_match_memcg(s, memcg))
3836 continue;
3837
3930 return s; 3838 return s;
3931 } 3839 }
3932 return NULL; 3840 return NULL;
3933} 3841}
3934 3842
3935struct kmem_cache *__kmem_cache_alias(const char *name, size_t size, 3843struct kmem_cache *
3936 size_t align, unsigned long flags, void (*ctor)(void *)) 3844__kmem_cache_alias(struct mem_cgroup *memcg, const char *name, size_t size,
3845 size_t align, unsigned long flags, void (*ctor)(void *))
3937{ 3846{
3938 struct kmem_cache *s; 3847 struct kmem_cache *s;
3939 3848
3940 s = find_mergeable(size, align, flags, name, ctor); 3849 s = find_mergeable(memcg, size, align, flags, name, ctor);
3941 if (s) { 3850 if (s) {
3942 s->refcount++; 3851 s->refcount++;
3943 /* 3852 /*
@@ -3964,6 +3873,11 @@ int __kmem_cache_create(struct kmem_cache *s, unsigned long flags)
3964 if (err) 3873 if (err)
3965 return err; 3874 return err;
3966 3875
3876 /* Mutex is not taken during early boot */
3877 if (slab_state <= UP)
3878 return 0;
3879
3880 memcg_propagate_slab_attrs(s);
3967 mutex_unlock(&slab_mutex); 3881 mutex_unlock(&slab_mutex);
3968 err = sysfs_slab_add(s); 3882 err = sysfs_slab_add(s);
3969 mutex_lock(&slab_mutex); 3883 mutex_lock(&slab_mutex);
@@ -5197,10 +5111,95 @@ static ssize_t slab_attr_store(struct kobject *kobj,
5197 return -EIO; 5111 return -EIO;
5198 5112
5199 err = attribute->store(s, buf, len); 5113 err = attribute->store(s, buf, len);
5114#ifdef CONFIG_MEMCG_KMEM
5115 if (slab_state >= FULL && err >= 0 && is_root_cache(s)) {
5116 int i;
5117
5118 mutex_lock(&slab_mutex);
5119 if (s->max_attr_size < len)
5120 s->max_attr_size = len;
5200 5121
5122 /*
5123 * This is a best effort propagation, so this function's return
5124 * value will be determined by the parent cache only. This is
5125 * basically because not all attributes will have well defined
5126 * semantics for rollbacks - most of the actions will
5127 * have permanent effects.
5128 *
5129 * Returning the error value of any of the children that fail
5130 * is not 100% well defined, in the sense that users seeing the
5131 * error code won't be able to know anything about the state of
5132 * the cache.
5133 *
5134 * Only returning the error code for the parent cache at least
5135 * has well defined semantics. The cache being written to
5136 * directly either failed or succeeded; on success we loop
5137 * through the descendants with best-effort propagation.
5138 */
5139 for_each_memcg_cache_index(i) {
5140 struct kmem_cache *c = cache_from_memcg(s, i);
5141 if (c)
5142 attribute->store(c, buf, len);
5143 }
5144 mutex_unlock(&slab_mutex);
5145 }
5146#endif
5201 return err; 5147 return err;
5202} 5148}
5203 5149
5150static void memcg_propagate_slab_attrs(struct kmem_cache *s)
5151{
5152#ifdef CONFIG_MEMCG_KMEM
5153 int i;
5154 char *buffer = NULL;
5155
5156 if (!is_root_cache(s))
5157 return;
5158
5159 /*
5160 * This means this cache had no attributes written. Therefore, no point
5161 * in copying default values around.
5162 */
5163 if (!s->max_attr_size)
5164 return;
5165
5166 for (i = 0; i < ARRAY_SIZE(slab_attrs); i++) {
5167 char mbuf[64];
5168 char *buf;
5169 struct slab_attribute *attr = to_slab_attr(slab_attrs[i]);
5170
5171 if (!attr || !attr->store || !attr->show)
5172 continue;
5173
5174 /*
5175 * It is really bad that we have to allocate here, so we will
5176 * do it only as a fallback. If we actually allocate, though,
5177 * we can just use the allocated buffer until the end.
5178 *
5179 * Most of the slub attributes will tend to be very small in
5180 * size, but sysfs allows buffers up to a page, so page-sized
5181 * values can theoretically happen.
5182 */
5183 if (buffer)
5184 buf = buffer;
5185 else if (s->max_attr_size < ARRAY_SIZE(mbuf))
5186 buf = mbuf;
5187 else {
5188 buffer = (char *) get_zeroed_page(GFP_KERNEL);
5189 if (WARN_ON(!buffer))
5190 continue;
5191 buf = buffer;
5192 }
5193
5194 attr->show(s->memcg_params->root_cache, buf);
5195 attr->store(s, buf, strlen(buf));
5196 }
5197
5198 if (buffer)
5199 free_page((unsigned long)buffer);
5200#endif
5201}
5202
5204static const struct sysfs_ops slab_sysfs_ops = { 5203static const struct sysfs_ops slab_sysfs_ops = {
5205 .show = slab_attr_show, 5204 .show = slab_attr_show,
5206 .store = slab_attr_store, 5205 .store = slab_attr_store,
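The two additions above work as a pair: slab_attr_store() pushes a value written on a root cache down to every memcg child cache on a best-effort basis, and memcg_propagate_slab_attrs() replays the root cache's stored attributes onto a newly created child. A reduced sketch of the store-side loop, with invented types and a callback standing in for the sysfs attribute:

/* Best-effort propagation of a setting from a parent to its children.
 * Names are illustrative; as in the comment above, only the parent's
 * own result is reported back to the caller. */
#include <stddef.h>

struct toy_cache {
	struct toy_cache *children[8];
	size_t nr_children;
};

typedef int (*toy_store_fn)(struct toy_cache *c, const char *buf, size_t len);

static int toy_attr_store(struct toy_cache *parent, const char *buf,
			  size_t len, toy_store_fn store)
{
	int err = store(parent, buf, len);
	size_t i;

	if (err < 0)
		return err;

	/* Children are updated best-effort; their errors are ignored so
	 * the caller still gets a well defined answer for the parent. */
	for (i = 0; i < parent->nr_children; i++)
		if (parent->children[i])
			store(parent->children[i], buf, len);

	return err;
}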
@@ -5257,6 +5256,12 @@ static char *create_unique_id(struct kmem_cache *s)
5257 if (p != name + 1) 5256 if (p != name + 1)
5258 *p++ = '-'; 5257 *p++ = '-';
5259 p += sprintf(p, "%07d", s->size); 5258 p += sprintf(p, "%07d", s->size);
5259
5260#ifdef CONFIG_MEMCG_KMEM
5261 if (!is_root_cache(s))
5262 p += sprintf(p, "-%08d", memcg_cache_id(s->memcg_params->memcg));
5263#endif
5264
5260 BUG_ON(p > name + ID_STR_LENGTH - 1); 5265 BUG_ON(p > name + ID_STR_LENGTH - 1);
5261 return name; 5266 return name;
5262} 5267}
@@ -5265,13 +5270,8 @@ static int sysfs_slab_add(struct kmem_cache *s)
5265{ 5270{
5266 int err; 5271 int err;
5267 const char *name; 5272 const char *name;
5268 int unmergeable; 5273 int unmergeable = slab_unmergeable(s);
5269
5270 if (slab_state < FULL)
5271 /* Defer until later */
5272 return 0;
5273 5274
5274 unmergeable = slab_unmergeable(s);
5275 if (unmergeable) { 5275 if (unmergeable) {
5276 /* 5276 /*
5277 * Slabcache can never be merged so we can use the name proper. 5277 * Slabcache can never be merged so we can use the name proper.
@@ -5405,49 +5405,14 @@ __initcall(slab_sysfs_init);
5405 * The /proc/slabinfo ABI 5405 * The /proc/slabinfo ABI
5406 */ 5406 */
5407#ifdef CONFIG_SLABINFO 5407#ifdef CONFIG_SLABINFO
5408static void print_slabinfo_header(struct seq_file *m) 5408void get_slabinfo(struct kmem_cache *s, struct slabinfo *sinfo)
5409{
5410 seq_puts(m, "slabinfo - version: 2.1\n");
5411 seq_puts(m, "# name <active_objs> <num_objs> <object_size> "
5412 "<objperslab> <pagesperslab>");
5413 seq_puts(m, " : tunables <limit> <batchcount> <sharedfactor>");
5414 seq_puts(m, " : slabdata <active_slabs> <num_slabs> <sharedavail>");
5415 seq_putc(m, '\n');
5416}
5417
5418static void *s_start(struct seq_file *m, loff_t *pos)
5419{
5420 loff_t n = *pos;
5421
5422 mutex_lock(&slab_mutex);
5423 if (!n)
5424 print_slabinfo_header(m);
5425
5426 return seq_list_start(&slab_caches, *pos);
5427}
5428
5429static void *s_next(struct seq_file *m, void *p, loff_t *pos)
5430{
5431 return seq_list_next(p, &slab_caches, pos);
5432}
5433
5434static void s_stop(struct seq_file *m, void *p)
5435{
5436 mutex_unlock(&slab_mutex);
5437}
5438
5439static int s_show(struct seq_file *m, void *p)
5440{ 5409{
5441 unsigned long nr_partials = 0; 5410 unsigned long nr_partials = 0;
5442 unsigned long nr_slabs = 0; 5411 unsigned long nr_slabs = 0;
5443 unsigned long nr_inuse = 0;
5444 unsigned long nr_objs = 0; 5412 unsigned long nr_objs = 0;
5445 unsigned long nr_free = 0; 5413 unsigned long nr_free = 0;
5446 struct kmem_cache *s;
5447 int node; 5414 int node;
5448 5415
5449 s = list_entry(p, struct kmem_cache, list);
5450
5451 for_each_online_node(node) { 5416 for_each_online_node(node) {
5452 struct kmem_cache_node *n = get_node(s, node); 5417 struct kmem_cache_node *n = get_node(s, node);
5453 5418
@@ -5460,41 +5425,21 @@ static int s_show(struct seq_file *m, void *p)
5460 nr_free += count_partial(n, count_free); 5425 nr_free += count_partial(n, count_free);
5461 } 5426 }
5462 5427
5463 nr_inuse = nr_objs - nr_free; 5428 sinfo->active_objs = nr_objs - nr_free;
5464 5429 sinfo->num_objs = nr_objs;
5465 seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d", s->name, nr_inuse, 5430 sinfo->active_slabs = nr_slabs;
5466 nr_objs, s->size, oo_objects(s->oo), 5431 sinfo->num_slabs = nr_slabs;
5467 (1 << oo_order(s->oo))); 5432 sinfo->objects_per_slab = oo_objects(s->oo);
5468 seq_printf(m, " : tunables %4u %4u %4u", 0, 0, 0); 5433 sinfo->cache_order = oo_order(s->oo);
5469 seq_printf(m, " : slabdata %6lu %6lu %6lu", nr_slabs, nr_slabs,
5470 0UL);
5471 seq_putc(m, '\n');
5472 return 0;
5473} 5434}
5474 5435
5475static const struct seq_operations slabinfo_op = { 5436void slabinfo_show_stats(struct seq_file *m, struct kmem_cache *s)
5476 .start = s_start,
5477 .next = s_next,
5478 .stop = s_stop,
5479 .show = s_show,
5480};
5481
5482static int slabinfo_open(struct inode *inode, struct file *file)
5483{ 5437{
5484 return seq_open(file, &slabinfo_op);
5485} 5438}
5486 5439
5487static const struct file_operations proc_slabinfo_operations = { 5440ssize_t slabinfo_write(struct file *file, const char __user *buffer,
5488 .open = slabinfo_open, 5441 size_t count, loff_t *ppos)
5489 .read = seq_read,
5490 .llseek = seq_lseek,
5491 .release = seq_release,
5492};
5493
5494static int __init slab_proc_init(void)
5495{ 5442{
5496 proc_create("slabinfo", S_IRUSR, NULL, &proc_slabinfo_operations); 5443 return -EIO;
5497 return 0;
5498} 5444}
5499module_init(slab_proc_init);
5500#endif /* CONFIG_SLABINFO */ 5445#endif /* CONFIG_SLABINFO */
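With get_slabinfo() and slabinfo_show_stats() above, the seq_file plumbing behind /proc/slabinfo moves into common code and SLUB only fills in a struct slabinfo. A minimal sketch of that allocator-side half of the interface (fields abbreviated, names invented):

/* Allocator-side half of a shared stats interface: fill a plain struct
 * and let common code do the formatting. Names are illustrative. */
struct toy_slabinfo {
	unsigned long active_objs;
	unsigned long num_objs;
	unsigned long objects_per_slab;
};

struct toy_cache {
	unsigned long nr_objs;
	unsigned long nr_free;
	unsigned long objs_per_slab;
};

static void toy_get_slabinfo(const struct toy_cache *s,
			     struct toy_slabinfo *sinfo)
{
	sinfo->active_objs      = s->nr_objs - s->nr_free;
	sinfo->num_objs         = s->nr_objs;
	sinfo->objects_per_slab = s->objs_per_slab;
}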
diff --git a/mm/sparse.c b/mm/sparse.c
index a83de2f72b30..6b5fb762e2ca 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -638,7 +638,6 @@ static struct page *__kmalloc_section_memmap(unsigned long nr_pages)
638got_map_page: 638got_map_page:
639 ret = (struct page *)pfn_to_kaddr(page_to_pfn(page)); 639 ret = (struct page *)pfn_to_kaddr(page_to_pfn(page));
640got_map_ptr: 640got_map_ptr:
641 memset(ret, 0, memmap_size);
642 641
643 return ret; 642 return ret;
644} 643}
@@ -758,6 +757,8 @@ int __meminit sparse_add_one_section(struct zone *zone, unsigned long start_pfn,
758 goto out; 757 goto out;
759 } 758 }
760 759
760 memset(memmap, 0, sizeof(struct page) * nr_pages);
761
761 ms->section_mem_map |= SECTION_MARKED_PRESENT; 762 ms->section_mem_map |= SECTION_MARKED_PRESENT;
762 763
763 ret = sparse_init_one_section(ms, section_nr, memmap, usemap); 764 ret = sparse_init_one_section(ms, section_nr, memmap, usemap);
@@ -771,6 +772,27 @@ out:
771 return ret; 772 return ret;
772} 773}
773 774
775#ifdef CONFIG_MEMORY_FAILURE
776static void clear_hwpoisoned_pages(struct page *memmap, int nr_pages)
777{
778 int i;
779
780 if (!memmap)
781 return;
782
783 for (i = 0; i < PAGES_PER_SECTION; i++) {
784 if (PageHWPoison(&memmap[i])) {
785 atomic_long_sub(1, &mce_bad_pages);
786 ClearPageHWPoison(&memmap[i]);
787 }
788 }
789}
790#else
791static inline void clear_hwpoisoned_pages(struct page *memmap, int nr_pages)
792{
793}
794#endif
795
774void sparse_remove_one_section(struct zone *zone, struct mem_section *ms) 796void sparse_remove_one_section(struct zone *zone, struct mem_section *ms)
775{ 797{
776 struct page *memmap = NULL; 798 struct page *memmap = NULL;
@@ -784,6 +806,7 @@ void sparse_remove_one_section(struct zone *zone, struct mem_section *ms)
784 ms->pageblock_flags = NULL; 806 ms->pageblock_flags = NULL;
785 } 807 }
786 808
809 clear_hwpoisoned_pages(memmap, PAGES_PER_SECTION);
787 free_section_usemap(memmap, usemap); 810 free_section_usemap(memmap, usemap);
788} 811}
789#endif 812#endif
diff --git a/mm/swapfile.c b/mm/swapfile.c
index f91a25547ffe..e97a0e5aea91 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1443,13 +1443,12 @@ static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span)
1443 return generic_swapfile_activate(sis, swap_file, span); 1443 return generic_swapfile_activate(sis, swap_file, span);
1444} 1444}
1445 1445
1446static void enable_swap_info(struct swap_info_struct *p, int prio, 1446static void _enable_swap_info(struct swap_info_struct *p, int prio,
1447 unsigned char *swap_map, 1447 unsigned char *swap_map,
1448 unsigned long *frontswap_map) 1448 unsigned long *frontswap_map)
1449{ 1449{
1450 int i, prev; 1450 int i, prev;
1451 1451
1452 spin_lock(&swap_lock);
1453 if (prio >= 0) 1452 if (prio >= 0)
1454 p->prio = prio; 1453 p->prio = prio;
1455 else 1454 else
@@ -1472,10 +1471,25 @@ static void enable_swap_info(struct swap_info_struct *p, int prio,
1472 swap_list.head = swap_list.next = p->type; 1471 swap_list.head = swap_list.next = p->type;
1473 else 1472 else
1474 swap_info[prev]->next = p->type; 1473 swap_info[prev]->next = p->type;
1474}
1475
1476static void enable_swap_info(struct swap_info_struct *p, int prio,
1477 unsigned char *swap_map,
1478 unsigned long *frontswap_map)
1479{
1480 spin_lock(&swap_lock);
1481 _enable_swap_info(p, prio, swap_map, frontswap_map);
1475 frontswap_init(p->type); 1482 frontswap_init(p->type);
1476 spin_unlock(&swap_lock); 1483 spin_unlock(&swap_lock);
1477} 1484}
1478 1485
1486static void reinsert_swap_info(struct swap_info_struct *p)
1487{
1488 spin_lock(&swap_lock);
1489 _enable_swap_info(p, p->prio, p->swap_map, frontswap_map_get(p));
1490 spin_unlock(&swap_lock);
1491}
1492
1479SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) 1493SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
1480{ 1494{
1481 struct swap_info_struct *p = NULL; 1495 struct swap_info_struct *p = NULL;
@@ -1484,7 +1498,6 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
1484 struct address_space *mapping; 1498 struct address_space *mapping;
1485 struct inode *inode; 1499 struct inode *inode;
1486 struct filename *pathname; 1500 struct filename *pathname;
1487 int oom_score_adj;
1488 int i, type, prev; 1501 int i, type, prev;
1489 int err; 1502 int err;
1490 1503
@@ -1543,19 +1556,13 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
1543 p->flags &= ~SWP_WRITEOK; 1556 p->flags &= ~SWP_WRITEOK;
1544 spin_unlock(&swap_lock); 1557 spin_unlock(&swap_lock);
1545 1558
1546 oom_score_adj = test_set_oom_score_adj(OOM_SCORE_ADJ_MAX); 1559 set_current_oom_origin();
1547 err = try_to_unuse(type, false, 0); /* force all pages to be unused */ 1560 err = try_to_unuse(type, false, 0); /* force all pages to be unused */
1548 compare_swap_oom_score_adj(OOM_SCORE_ADJ_MAX, oom_score_adj); 1561 clear_current_oom_origin();
1549 1562
1550 if (err) { 1563 if (err) {
1551 /*
1552 * reading p->prio and p->swap_map outside the lock is
1553 * safe here because only sys_swapon and sys_swapoff
1554 * change them, and there can be no other sys_swapon or
1555 * sys_swapoff for this swap_info_struct at this point.
1556 */
1557 /* re-insert swap space back into swap_list */ 1564 /* re-insert swap space back into swap_list */
1558 enable_swap_info(p, p->prio, p->swap_map, frontswap_map_get(p)); 1565 reinsert_swap_info(p);
1559 goto out_dput; 1566 goto out_dput;
1560 } 1567 }
1561 1568
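The swapfile change above splits enable_swap_info() into an unlocked _enable_swap_info() helper plus thin wrappers that take swap_lock, so the swapoff failure path can re-insert the area via reinsert_swap_info() without duplicating the list manipulation. The same 'unlocked helper, locking wrappers' shape as a stand-alone sketch (a pthread mutex and invented names in place of swap_lock and the swap list):

/* "Unlocked helper plus locking wrappers" pattern, as in the hunk above.
 * All names are illustrative. */
#include <pthread.h>

static pthread_mutex_t toy_swap_lock = PTHREAD_MUTEX_INITIALIZER;

struct toy_swap_info { int prio; int enabled; };

/* Core logic; the caller must hold toy_swap_lock. */
static void _toy_enable(struct toy_swap_info *p, int prio)
{
	p->prio = prio;
	p->enabled = 1;
	/* ... insert into the priority-ordered list ... */
}

static void toy_enable(struct toy_swap_info *p, int prio)
{
	pthread_mutex_lock(&toy_swap_lock);
	_toy_enable(p, prio);
	pthread_mutex_unlock(&toy_swap_lock);
}

/* Failure path of a removal: put the entry back with its old settings. */
static void toy_reinsert(struct toy_swap_info *p)
{
	pthread_mutex_lock(&toy_swap_lock);
	_toy_enable(p, p->prio);
	pthread_mutex_unlock(&toy_swap_lock);
}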
diff --git a/mm/util.c b/mm/util.c
index dc3036cdcc6a..c55e26b17d93 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -152,7 +152,7 @@ EXPORT_SYMBOL(__krealloc);
152 * 152 *
153 * The contents of the object pointed to are preserved up to the 153 * The contents of the object pointed to are preserved up to the
154 * lesser of the new and old sizes. If @p is %NULL, krealloc() 154 * lesser of the new and old sizes. If @p is %NULL, krealloc()
155 * behaves exactly like kmalloc(). If @size is 0 and @p is not a 155 * behaves exactly like kmalloc(). If @new_size is 0 and @p is not a
156 * %NULL pointer, the object pointed to is freed. 156 * %NULL pointer, the object pointed to is freed.
157 */ 157 */
158void *krealloc(const void *p, size_t new_size, gfp_t flags) 158void *krealloc(const void *p, size_t new_size, gfp_t flags)
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 78e08300db21..5123a169ab7b 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -2550,7 +2550,7 @@ static void s_stop(struct seq_file *m, void *p)
2550 2550
2551static void show_numa_info(struct seq_file *m, struct vm_struct *v) 2551static void show_numa_info(struct seq_file *m, struct vm_struct *v)
2552{ 2552{
2553 if (NUMA_BUILD) { 2553 if (IS_ENABLED(CONFIG_NUMA)) {
2554 unsigned int nr, *counters = m->private; 2554 unsigned int nr, *counters = m->private;
2555 2555
2556 if (!counters) 2556 if (!counters)
@@ -2615,7 +2615,7 @@ static int vmalloc_open(struct inode *inode, struct file *file)
2615 unsigned int *ptr = NULL; 2615 unsigned int *ptr = NULL;
2616 int ret; 2616 int ret;
2617 2617
2618 if (NUMA_BUILD) { 2618 if (IS_ENABLED(CONFIG_NUMA)) {
2619 ptr = kmalloc(nr_node_ids * sizeof(unsigned int), GFP_KERNEL); 2619 ptr = kmalloc(nr_node_ids * sizeof(unsigned int), GFP_KERNEL);
2620 if (ptr == NULL) 2620 if (ptr == NULL)
2621 return -ENOMEM; 2621 return -ENOMEM;
diff --git a/mm/vmscan.c b/mm/vmscan.c
index b7ed37675644..adc7e9058181 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1177,7 +1177,11 @@ int isolate_lru_page(struct page *page)
1177} 1177}
1178 1178
1179/* 1179/*
1180 * Are there way too many processes in the direct reclaim path already? 1180 * A direct reclaimer may isolate SWAP_CLUSTER_MAX pages from the LRU list and
1181 * then get rescheduled. When there are massive numbers of tasks doing page
1182 * allocation, such sleeping direct reclaimers may keep piling up on each CPU,
1183 * the LRU list will shrink and be scanned faster than necessary, leading to
1184 * unnecessary swapping, thrashing and OOM.
1181 */ 1185 */
1182static int too_many_isolated(struct zone *zone, int file, 1186static int too_many_isolated(struct zone *zone, int file,
1183 struct scan_control *sc) 1187 struct scan_control *sc)
@@ -1198,6 +1202,14 @@ static int too_many_isolated(struct zone *zone, int file,
1198 isolated = zone_page_state(zone, NR_ISOLATED_ANON); 1202 isolated = zone_page_state(zone, NR_ISOLATED_ANON);
1199 } 1203 }
1200 1204
1205 /*
1206 * GFP_NOIO/GFP_NOFS callers are allowed to isolate more pages, so they
1207 * won't get blocked by normal direct-reclaimers, forming a circular
1208 * deadlock.
1209 */
1210 if ((sc->gfp_mask & GFP_IOFS) == GFP_IOFS)
1211 inactive >>= 3;
1212
1201 return isolated > inactive; 1213 return isolated > inactive;
1202} 1214}
1203 1215
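The comments added above describe the direct-reclaim throttle: a reclaimer backs off while more pages are isolated from the LRU than remain inactive, and callers that may enter the block layer and filesystem (GFP_IOFS fully set) are checked against an eight-fold smaller inactive count, so GFP_NOIO/GFP_NOFS reclaimers are throttled later and cannot be deadlocked behind them. The decision reduces to a small predicate; an isolated sketch with invented flag values:

/* Stand-alone version of the throttle decision described above.
 * The flag bits and names are invented for the example. */
#include <stdbool.h>

#define TOY_GFP_IO	0x1u
#define TOY_GFP_FS	0x2u
#define TOY_GFP_IOFS	(TOY_GFP_IO | TOY_GFP_FS)

static bool toy_too_many_isolated(unsigned long isolated,
				  unsigned long inactive,
				  unsigned int gfp_mask)
{
	/* Fully capable (I/O + FS) callers hit the limit sooner, which
	 * effectively lets GFP_NOIO/GFP_NOFS callers isolate more. */
	if ((gfp_mask & TOY_GFP_IOFS) == TOY_GFP_IOFS)
		inactive >>= 3;

	return isolated > inactive;
}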
@@ -1679,13 +1691,24 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
1679 1691
1680 if (global_reclaim(sc)) { 1692 if (global_reclaim(sc)) {
1681 free = zone_page_state(zone, NR_FREE_PAGES); 1693 free = zone_page_state(zone, NR_FREE_PAGES);
1682 /* If we have very few page cache pages,
1683 force-scan anon pages. */
1684 if (unlikely(file + free <= high_wmark_pages(zone))) { 1694 if (unlikely(file + free <= high_wmark_pages(zone))) {
1695 /*
1696 * If we have very few page cache pages, force-scan
1697 * anon pages.
1698 */
1685 fraction[0] = 1; 1699 fraction[0] = 1;
1686 fraction[1] = 0; 1700 fraction[1] = 0;
1687 denominator = 1; 1701 denominator = 1;
1688 goto out; 1702 goto out;
1703 } else if (!inactive_file_is_low_global(zone)) {
1704 /*
1705 * There is enough inactive page cache, do not
1706 * reclaim anything from the working set right now.
1707 */
1708 fraction[0] = 0;
1709 fraction[1] = 1;
1710 denominator = 1;
1711 goto out;
1689 } 1712 }
1690 } 1713 }
1691 1714
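The rework above settles the anon/file scan split early in two cases: when page cache plus free memory is at or below the high watermark, only anon pages are scanned; when there is still plenty of inactive page cache, only file pages are scanned so the anon working set is left alone. A compact sketch of that selection (thresholds, the helper predicate and all names are invented):

/* Sketch of the early anon/file scan-split decisions made above.
 * Names and the helper predicate are invented for the example. */
#include <stdbool.h>

struct toy_fractions { unsigned int anon, file; };

static struct toy_fractions toy_scan_split(unsigned long file_pages,
					   unsigned long free_pages,
					   unsigned long high_wmark,
					   bool plenty_inactive_file)
{
	struct toy_fractions f = { 1, 1 };	/* default: scan both */

	if (file_pages + free_pages <= high_wmark) {
		f.anon = 1;			/* almost no cache left:  */
		f.file = 0;			/* go after anon pages    */
	} else if (plenty_inactive_file) {
		f.anon = 0;			/* enough inactive cache: */
		f.file = 1;			/* spare the working set  */
	}
	return f;
}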
@@ -1752,7 +1775,7 @@ out:
1752/* Use reclaim/compaction for costly allocs or under memory pressure */ 1775/* Use reclaim/compaction for costly allocs or under memory pressure */
1753static bool in_reclaim_compaction(struct scan_control *sc) 1776static bool in_reclaim_compaction(struct scan_control *sc)
1754{ 1777{
1755 if (COMPACTION_BUILD && sc->order && 1778 if (IS_ENABLED(CONFIG_COMPACTION) && sc->order &&
1756 (sc->order > PAGE_ALLOC_COSTLY_ORDER || 1779 (sc->order > PAGE_ALLOC_COSTLY_ORDER ||
1757 sc->priority < DEF_PRIORITY - 2)) 1780 sc->priority < DEF_PRIORITY - 2))
1758 return true; 1781 return true;
@@ -2005,7 +2028,7 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
2005 if (zone->all_unreclaimable && 2028 if (zone->all_unreclaimable &&
2006 sc->priority != DEF_PRIORITY) 2029 sc->priority != DEF_PRIORITY)
2007 continue; /* Let kswapd poll it */ 2030 continue; /* Let kswapd poll it */
2008 if (COMPACTION_BUILD) { 2031 if (IS_ENABLED(CONFIG_COMPACTION)) {
2009 /* 2032 /*
2010 * If we already have plenty of memory free for 2033 * If we already have plenty of memory free for
2011 * compaction in this zone, don't free any more. 2034 * compaction in this zone, don't free any more.
@@ -2421,7 +2444,8 @@ static bool zone_balanced(struct zone *zone, int order,
2421 balance_gap, classzone_idx, 0)) 2444 balance_gap, classzone_idx, 0))
2422 return false; 2445 return false;
2423 2446
2424 if (COMPACTION_BUILD && order && !compaction_suitable(zone, order)) 2447 if (IS_ENABLED(CONFIG_COMPACTION) && order &&
2448 !compaction_suitable(zone, order))
2425 return false; 2449 return false;
2426 2450
2427 return true; 2451 return true;
@@ -2546,7 +2570,7 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining,
2546static unsigned long balance_pgdat(pg_data_t *pgdat, int order, 2570static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
2547 int *classzone_idx) 2571 int *classzone_idx)
2548{ 2572{
2549 int all_zones_ok; 2573 struct zone *unbalanced_zone;
2550 unsigned long balanced; 2574 unsigned long balanced;
2551 int i; 2575 int i;
2552 int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */ 2576 int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */
@@ -2580,7 +2604,7 @@ loop_again:
2580 unsigned long lru_pages = 0; 2604 unsigned long lru_pages = 0;
2581 int has_under_min_watermark_zone = 0; 2605 int has_under_min_watermark_zone = 0;
2582 2606
2583 all_zones_ok = 1; 2607 unbalanced_zone = NULL;
2584 balanced = 0; 2608 balanced = 0;
2585 2609
2586 /* 2610 /*
@@ -2684,7 +2708,7 @@ loop_again:
2684 * Do not reclaim more than needed for compaction. 2708 * Do not reclaim more than needed for compaction.
2685 */ 2709 */
2686 testorder = order; 2710 testorder = order;
2687 if (COMPACTION_BUILD && order && 2711 if (IS_ENABLED(CONFIG_COMPACTION) && order &&
2688 compaction_suitable(zone, order) != 2712 compaction_suitable(zone, order) !=
2689 COMPACT_SKIPPED) 2713 COMPACT_SKIPPED)
2690 testorder = 0; 2714 testorder = 0;
@@ -2719,7 +2743,7 @@ loop_again:
2719 } 2743 }
2720 2744
2721 if (!zone_balanced(zone, testorder, 0, end_zone)) { 2745 if (!zone_balanced(zone, testorder, 0, end_zone)) {
2722 all_zones_ok = 0; 2746 unbalanced_zone = zone;
2723 /* 2747 /*
2724 * We are still under min water mark. This 2748 * We are still under min water mark. This
2725 * means that we have a GFP_ATOMIC allocation 2749 * means that we have a GFP_ATOMIC allocation
@@ -2752,7 +2776,7 @@ loop_again:
2752 pfmemalloc_watermark_ok(pgdat)) 2776 pfmemalloc_watermark_ok(pgdat))
2753 wake_up(&pgdat->pfmemalloc_wait); 2777 wake_up(&pgdat->pfmemalloc_wait);
2754 2778
2755 if (all_zones_ok || (order && pgdat_balanced(pgdat, balanced, *classzone_idx))) 2779 if (!unbalanced_zone || (order && pgdat_balanced(pgdat, balanced, *classzone_idx)))
2756 break; /* kswapd: all done */ 2780 break; /* kswapd: all done */
2757 /* 2781 /*
2758 * OK, kswapd is getting into trouble. Take a nap, then take 2782 * OK, kswapd is getting into trouble. Take a nap, then take
@@ -2762,7 +2786,7 @@ loop_again:
2762 if (has_under_min_watermark_zone) 2786 if (has_under_min_watermark_zone)
2763 count_vm_event(KSWAPD_SKIP_CONGESTION_WAIT); 2787 count_vm_event(KSWAPD_SKIP_CONGESTION_WAIT);
2764 else 2788 else
2765 congestion_wait(BLK_RW_ASYNC, HZ/10); 2789 wait_iff_congested(unbalanced_zone, BLK_RW_ASYNC, HZ/10);
2766 } 2790 }
2767 2791
2768 /* 2792 /*
@@ -2781,7 +2805,7 @@ out:
2781 * high-order: Balanced zones must make up at least 25% of the node 2805 * high-order: Balanced zones must make up at least 25% of the node
2782 * for the node to be balanced 2806 * for the node to be balanced
2783 */ 2807 */
2784 if (!(all_zones_ok || (order && pgdat_balanced(pgdat, balanced, *classzone_idx)))) { 2808 if (unbalanced_zone && (!order || !pgdat_balanced(pgdat, balanced, *classzone_idx))) {
2785 cond_resched(); 2809 cond_resched();
2786 2810
2787 try_to_freeze(); 2811 try_to_freeze();
@@ -2951,7 +2975,7 @@ static int kswapd(void *p)
2951 classzone_idx = new_classzone_idx = pgdat->nr_zones - 1; 2975 classzone_idx = new_classzone_idx = pgdat->nr_zones - 1;
2952 balanced_classzone_idx = classzone_idx; 2976 balanced_classzone_idx = classzone_idx;
2953 for ( ; ; ) { 2977 for ( ; ; ) {
2954 int ret; 2978 bool ret;
2955 2979
2956 /* 2980 /*
2957 * If the last balance_pgdat was unsuccessful it's unlikely a 2981 * If the last balance_pgdat was unsuccessful it's unlikely a
@@ -3119,7 +3143,7 @@ static int __devinit cpu_callback(struct notifier_block *nfb,
3119 int nid; 3143 int nid;
3120 3144
3121 if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN) { 3145 if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN) {
3122 for_each_node_state(nid, N_HIGH_MEMORY) { 3146 for_each_node_state(nid, N_MEMORY) {
3123 pg_data_t *pgdat = NODE_DATA(nid); 3147 pg_data_t *pgdat = NODE_DATA(nid);
3124 const struct cpumask *mask; 3148 const struct cpumask *mask;
3125 3149
@@ -3175,7 +3199,7 @@ static int __init kswapd_init(void)
3175 int nid; 3199 int nid;
3176 3200
3177 swap_setup(); 3201 swap_setup();
3178 for_each_node_state(nid, N_HIGH_MEMORY) 3202 for_each_node_state(nid, N_MEMORY)
3179 kswapd_run(nid); 3203 kswapd_run(nid);
3180 hotcpu_notifier(cpu_callback, 0); 3204 hotcpu_notifier(cpu_callback, 0);
3181 return 0; 3205 return 0;
diff --git a/mm/vmstat.c b/mm/vmstat.c
index c7370579111b..9800306c8195 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -774,10 +774,20 @@ const char * const vmstat_text[] = {
774 774
775 "pgrotated", 775 "pgrotated",
776 776
777#ifdef CONFIG_NUMA_BALANCING
778 "numa_pte_updates",
779 "numa_hint_faults",
780 "numa_hint_faults_local",
781 "numa_pages_migrated",
782#endif
783#ifdef CONFIG_MIGRATION
784 "pgmigrate_success",
785 "pgmigrate_fail",
786#endif
777#ifdef CONFIG_COMPACTION 787#ifdef CONFIG_COMPACTION
778 "compact_blocks_moved", 788 "compact_migrate_scanned",
779 "compact_pages_moved", 789 "compact_free_scanned",
780 "compact_pagemigrate_failed", 790 "compact_isolated",
781 "compact_stall", 791 "compact_stall",
782 "compact_fail", 792 "compact_fail",
783 "compact_success", 793 "compact_success",
@@ -801,6 +811,8 @@ const char * const vmstat_text[] = {
801 "thp_collapse_alloc", 811 "thp_collapse_alloc",
802 "thp_collapse_alloc_failed", 812 "thp_collapse_alloc_failed",
803 "thp_split", 813 "thp_split",
814 "thp_zero_page_alloc",
815 "thp_zero_page_alloc_failed",
804#endif 816#endif
805 817
806#endif /* CONFIG_VM_EVENTS_COUNTERS */ 818#endif /* CONFIG_VM_EVENTS_COUNTERS */
@@ -930,7 +942,7 @@ static int pagetypeinfo_show(struct seq_file *m, void *arg)
930 pg_data_t *pgdat = (pg_data_t *)arg; 942 pg_data_t *pgdat = (pg_data_t *)arg;
931 943
932 /* check memoryless node */ 944 /* check memoryless node */
933 if (!node_state(pgdat->node_id, N_HIGH_MEMORY)) 945 if (!node_state(pgdat->node_id, N_MEMORY))
934 return 0; 946 return 0;
935 947
936 seq_printf(m, "Page block order: %d\n", pageblock_order); 948 seq_printf(m, "Page block order: %d\n", pageblock_order);
@@ -992,14 +1004,16 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
992 "\n high %lu" 1004 "\n high %lu"
993 "\n scanned %lu" 1005 "\n scanned %lu"
994 "\n spanned %lu" 1006 "\n spanned %lu"
995 "\n present %lu", 1007 "\n present %lu"
1008 "\n managed %lu",
996 zone_page_state(zone, NR_FREE_PAGES), 1009 zone_page_state(zone, NR_FREE_PAGES),
997 min_wmark_pages(zone), 1010 min_wmark_pages(zone),
998 low_wmark_pages(zone), 1011 low_wmark_pages(zone),
999 high_wmark_pages(zone), 1012 high_wmark_pages(zone),
1000 zone->pages_scanned, 1013 zone->pages_scanned,
1001 zone->spanned_pages, 1014 zone->spanned_pages,
1002 zone->present_pages); 1015 zone->present_pages,
1016 zone->managed_pages);
1003 1017
1004 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) 1018 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
1005 seq_printf(m, "\n %-12s %lu", vmstat_text[i], 1019 seq_printf(m, "\n %-12s %lu", vmstat_text[i],
@@ -1292,7 +1306,7 @@ static int unusable_show(struct seq_file *m, void *arg)
1292 pg_data_t *pgdat = (pg_data_t *)arg; 1306 pg_data_t *pgdat = (pg_data_t *)arg;
1293 1307
1294 /* check memoryless node */ 1308 /* check memoryless node */
1295 if (!node_state(pgdat->node_id, N_HIGH_MEMORY)) 1309 if (!node_state(pgdat->node_id, N_MEMORY))
1296 return 0; 1310 return 0;
1297 1311
1298 walk_zones_in_node(m, pgdat, unusable_show_print); 1312 walk_zones_in_node(m, pgdat, unusable_show_print);