Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig | 34
-rw-r--r--  mm/Makefile | 3
-rw-r--r--  mm/balloon_compaction.c | 302
-rw-r--r--  mm/bootmem.c | 103
-rw-r--r--  mm/compaction.c | 156
-rw-r--r--  mm/dmapool.c | 55
-rw-r--r--  mm/highmem.c | 30
-rw-r--r--  mm/huge_memory.c | 662
-rw-r--r--  mm/hugetlb.c | 64
-rw-r--r--  mm/hugetlb_cgroup.c | 42
-rw-r--r--  mm/internal.h | 13
-rw-r--r--  mm/kmemleak.c | 3
-rw-r--r--  mm/ksm.c | 37
-rw-r--r--  mm/memblock.c | 3
-rw-r--r--  mm/memcontrol.c | 1485
-rw-r--r--  mm/memory-failure.c | 35
-rw-r--r--  mm/memory.c | 251
-rw-r--r--  mm/memory_hotplug.c | 430
-rw-r--r--  mm/mempolicy.c | 448
-rw-r--r--  mm/migrate.c | 454
-rw-r--r--  mm/mlock.c | 6
-rw-r--r--  mm/mmap.c | 571
-rw-r--r--  mm/mprotect.c | 151
-rw-r--r--  mm/mremap.c | 4
-rw-r--r--  mm/nobootmem.c | 22
-rw-r--r--  mm/nommu.c | 15
-rw-r--r--  mm/oom_kill.c | 138
-rw-r--r--  mm/page-writeback.c | 36
-rw-r--r--  mm/page_alloc.c | 420
-rw-r--r--  mm/page_cgroup.c | 5
-rw-r--r--  mm/page_isolation.c | 53
-rw-r--r--  mm/pagewalk.c | 2
-rw-r--r--  mm/percpu.c | 5
-rw-r--r--  mm/pgtable-generic.c | 9
-rw-r--r--  mm/rmap.c | 134
-rw-r--r--  mm/shmem.c | 96
-rw-r--r--  mm/slab.c | 383
-rw-r--r--  mm/slab.h | 190
-rw-r--r--  mm/slab_common.c | 292
-rw-r--r--  mm/slob.c | 48
-rw-r--r--  mm/slub.c | 451
-rw-r--r--  mm/sparse.c | 25
-rw-r--r--  mm/swapfile.c | 31
-rw-r--r--  mm/truncate.c | 23
-rw-r--r--  mm/util.c | 2
-rw-r--r--  mm/vmalloc.c | 4
-rw-r--r--  mm/vmscan.c | 163
-rw-r--r--  mm/vmstat.c | 28
48 files changed, 5890 insertions, 2027 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index a3f8dddaaab3..278e3ab1f169 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -143,6 +143,25 @@ config NO_BOOTMEM
143config MEMORY_ISOLATION 143config MEMORY_ISOLATION
144 boolean 144 boolean
145 145
146config MOVABLE_NODE
147 boolean "Enable to assign a node which has only movable memory"
148 depends on HAVE_MEMBLOCK
149 depends on NO_BOOTMEM
150 depends on X86_64
151 depends on NUMA
152 default n
153 help
154 Allow a node to have only movable memory. Pages used by the kernel,
155 such as direct mapping pages, cannot be migrated. So the corresponding
156 memory device cannot be hotplugged. This option allows users to
157 online all the memory of a node as movable memory so that the whole
158 node can be hotplugged. Users who don't use the memory hotplug
159 feature are fine with this option on since they don't online memory
160 as movable.
161
162 Say Y here if you want to hotplug a whole node.
163 Say N here if you want the kernel to use memory on all nodes evenly.
164
146# eventually, we can have this option just 'select SPARSEMEM' 165# eventually, we can have this option just 'select SPARSEMEM'
147config MEMORY_HOTPLUG 166config MEMORY_HOTPLUG
148 bool "Allow for memory hot-add" 167 bool "Allow for memory hot-add"
@@ -188,6 +207,21 @@ config SPLIT_PTLOCK_CPUS
188 default "4" 207 default "4"
189 208
190# 209#
210# support for memory balloon compaction
211config BALLOON_COMPACTION
212 bool "Allow for balloon memory compaction/migration"
213 def_bool y
214 depends on COMPACTION && VIRTIO_BALLOON
215 help
216 Memory fragmentation introduced by ballooning can significantly
217 reduce the number of 2MB contiguous memory blocks that can be
218 used within a guest, imposing performance penalties associated
219 with the reduced number of transparent huge pages that could be used
220 by the guest workload. Allowing compaction and migration of memory
221 pages enlisted as part of memory balloon devices avoids this
222 scenario and helps improve memory defragmentation.
223
224#
191# support for memory compaction 225# support for memory compaction
192config COMPACTION 226config COMPACTION
193 bool "Allow for memory compaction" 227 bool "Allow for memory compaction"
diff --git a/mm/Makefile b/mm/Makefile
index 6b025f80af34..3a4628751f89 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -16,7 +16,8 @@ obj-y := filemap.o mempool.o oom_kill.o fadvise.o \
16 readahead.o swap.o truncate.o vmscan.o shmem.o \ 16 readahead.o swap.o truncate.o vmscan.o shmem.o \
17 util.o mmzone.o vmstat.o backing-dev.o \ 17 util.o mmzone.o vmstat.o backing-dev.o \
18 mm_init.o mmu_context.o percpu.o slab_common.o \ 18 mm_init.o mmu_context.o percpu.o slab_common.o \
19 compaction.o interval_tree.o $(mmu-y) 19 compaction.o balloon_compaction.o \
20 interval_tree.o $(mmu-y)
20 21
21obj-y += init-mm.o 22obj-y += init-mm.o
22 23
diff --git a/mm/balloon_compaction.c b/mm/balloon_compaction.c
new file mode 100644
index 000000000000..07dbc8ec46cf
--- /dev/null
+++ b/mm/balloon_compaction.c
@@ -0,0 +1,302 @@
1/*
2 * mm/balloon_compaction.c
3 *
4 * Common interface for making balloon pages movable by compaction.
5 *
6 * Copyright (C) 2012, Red Hat, Inc. Rafael Aquini <aquini@redhat.com>
7 */
8#include <linux/mm.h>
9#include <linux/slab.h>
10#include <linux/export.h>
11#include <linux/balloon_compaction.h>
12
13/*
14 * balloon_devinfo_alloc - allocates a balloon device information descriptor.
15 * @balloon_dev_descriptor: pointer to reference the balloon device which
16 * this struct balloon_dev_info will be servicing.
17 *
18 * Driver must call it to properly allocate and initialize an instance of
19 * struct balloon_dev_info which will be used to reference a balloon device
20 * as well as to keep track of the balloon device page list.
21 */
22struct balloon_dev_info *balloon_devinfo_alloc(void *balloon_dev_descriptor)
23{
24 struct balloon_dev_info *b_dev_info;
25 b_dev_info = kmalloc(sizeof(*b_dev_info), GFP_KERNEL);
26 if (!b_dev_info)
27 return ERR_PTR(-ENOMEM);
28
29 b_dev_info->balloon_device = balloon_dev_descriptor;
30 b_dev_info->mapping = NULL;
31 b_dev_info->isolated_pages = 0;
32 spin_lock_init(&b_dev_info->pages_lock);
33 INIT_LIST_HEAD(&b_dev_info->pages);
34
35 return b_dev_info;
36}
37EXPORT_SYMBOL_GPL(balloon_devinfo_alloc);
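As a usage sketch only (not part of this patch): a hypothetical balloon driver's probe path would call this allocator and check the ERR_PTR return. Every name prefixed with example_ is an assumption made for illustration; only balloon_devinfo_alloc() comes from this file.

#include <linux/err.h>
#include <linux/balloon_compaction.h>

struct example_balloon {			/* hypothetical driver state */
	struct balloon_dev_info *vb_dev_info;
};

static int example_balloon_probe(struct example_balloon *vb)
{
	struct balloon_dev_info *info;

	info = balloon_devinfo_alloc(vb);
	if (IS_ERR(info))
		return PTR_ERR(info);		/* -ENOMEM from the allocator */

	vb->vb_dev_info = info;
	return 0;
}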
38
39/*
40 * balloon_page_enqueue - allocates a new page and inserts it into the balloon
41 * page list.
42 * @b_dev_info: balloon device descriptor where we will insert a new page
43 *
44 * Driver must call it to properly allocate a new enlisted balloon page
45 * before definitively removing it from the guest system.
46 * This function returns the page address for the recently enqueued page or
47 * NULL in the case we fail to allocate a new page this turn.
48 */
49struct page *balloon_page_enqueue(struct balloon_dev_info *b_dev_info)
50{
51 unsigned long flags;
52 struct page *page = alloc_page(balloon_mapping_gfp_mask() |
53 __GFP_NOMEMALLOC | __GFP_NORETRY);
54 if (!page)
55 return NULL;
56
57 /*
58 * Block others from accessing the 'page' when we get around to
59 * establishing additional references. We should be the only one
60 * holding a reference to the 'page' at this point.
61 */
62 BUG_ON(!trylock_page(page));
63 spin_lock_irqsave(&b_dev_info->pages_lock, flags);
64 balloon_page_insert(page, b_dev_info->mapping, &b_dev_info->pages);
65 spin_unlock_irqrestore(&b_dev_info->pages_lock, flags);
66 unlock_page(page);
67 return page;
68}
69EXPORT_SYMBOL_GPL(balloon_page_enqueue);
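A minimal sketch of the inflate loop a driver would build around this helper, stopping as soon as it returns NULL (function name and the host-notification step are hypothetical, not part of this patch):

#include <linux/balloon_compaction.h>

static unsigned int example_fill_balloon(struct balloon_dev_info *info,
					 unsigned int nr_pages)
{
	unsigned int filled = 0;

	while (filled < nr_pages) {
		struct page *page = balloon_page_enqueue(info);

		if (!page)
			break;	/* allocation failed this turn, try again later */
		/* ... tell the host/hypervisor about 'page' here (driver specific) ... */
		filled++;
	}
	return filled;
}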
70
71/*
72 * balloon_page_dequeue - removes a page from balloon's page list and returns
73 * its address to allow the driver to release the page.
74 * @b_dev_info: balloon device descriptor where we will grab a page from.
75 *
76 * Driver must call it to properly de-allocate a previously enlisted balloon page
77 * before definitively releasing it back to the guest system.
78 * This function returns the page address for the recently dequeued page or
79 * NULL in the case we find balloon's page list temporarily empty due to
80 * compaction isolated pages.
81 */
82struct page *balloon_page_dequeue(struct balloon_dev_info *b_dev_info)
83{
84 struct page *page, *tmp;
85 unsigned long flags;
86 bool dequeued_page;
87
88 dequeued_page = false;
89 list_for_each_entry_safe(page, tmp, &b_dev_info->pages, lru) {
90 /*
91 * Block others from accessing the 'page' while we get around to
92 * establishing additional references and preparing the 'page'
93 * to be released by the balloon driver.
94 */
95 if (trylock_page(page)) {
96 spin_lock_irqsave(&b_dev_info->pages_lock, flags);
97 /*
98 * Raise the page refcount here to prevent any wrong
99 * attempt to isolate this page, in case of colliding
100 * with balloon_page_isolate() just after we release
101 * the page lock.
102 *
103 * balloon_page_free() will take care of dropping
104 * this extra refcount later.
105 */
106 get_page(page);
107 balloon_page_delete(page);
108 spin_unlock_irqrestore(&b_dev_info->pages_lock, flags);
109 unlock_page(page);
110 dequeued_page = true;
111 break;
112 }
113 }
114
115 if (!dequeued_page) {
116 /*
117 * If we are unable to dequeue a balloon page because the page
118 * list is empty and there are no isolated pages, then something
119 * has gone wrong and some balloon pages are lost.
120 * BUG() here, otherwise the balloon driver may get stuck in
121 * an infinite loop while attempting to release all its pages.
122 */
123 spin_lock_irqsave(&b_dev_info->pages_lock, flags);
124 if (unlikely(list_empty(&b_dev_info->pages) &&
125 !b_dev_info->isolated_pages))
126 BUG();
127 spin_unlock_irqrestore(&b_dev_info->pages_lock, flags);
128 page = NULL;
129 }
130 return page;
131}
132EXPORT_SYMBOL_GPL(balloon_page_dequeue);
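The NULL return matters to drivers: it means "stop for now", not "error". A sketch of a deflate loop built on this helper, using balloon_page_free() from the new header to drop the extra reference taken above (driver-side names are hypothetical):

#include <linux/balloon_compaction.h>

static unsigned int example_leak_balloon(struct balloon_dev_info *info,
					 unsigned int nr_pages)
{
	unsigned int freed = 0;

	while (freed < nr_pages) {
		struct page *page = balloon_page_dequeue(info);

		if (!page)	/* list empty, or pages temporarily isolated */
			break;
		/* ... tell the host/hypervisor the page is coming back ... */
		balloon_page_free(page);	/* also drops the extra refcount */
		freed++;
	}
	return freed;
}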
133
134#ifdef CONFIG_BALLOON_COMPACTION
135/*
136 * balloon_mapping_alloc - allocates a special ->mapping for ballooned pages.
137 * @b_dev_info: holds the balloon device information descriptor.
138 * @a_ops: balloon_mapping address_space_operations descriptor.
139 *
140 * Driver must call it to properly allocate and initialize an instance of
141 * struct address_space which will be used as the special page->mapping for
142 * balloon device enlisted page instances.
143 */
144struct address_space *balloon_mapping_alloc(struct balloon_dev_info *b_dev_info,
145 const struct address_space_operations *a_ops)
146{
147 struct address_space *mapping;
148
149 mapping = kmalloc(sizeof(*mapping), GFP_KERNEL);
150 if (!mapping)
151 return ERR_PTR(-ENOMEM);
152
153 /*
154 * Give a clean 'zeroed' status to all elements of this special
155 * balloon page->mapping struct address_space instance.
156 */
157 address_space_init_once(mapping);
158
159 /*
160 * Set mapping->flags appropriately, to allow balloon pages
161 * ->mapping identification.
162 */
163 mapping_set_balloon(mapping);
164 mapping_set_gfp_mask(mapping, balloon_mapping_gfp_mask());
165
166 /* balloon's page->mapping->a_ops callback descriptor */
167 mapping->a_ops = a_ops;
168
169 /*
170 * Establish a pointer reference back to the balloon device descriptor
171 * this particular page->mapping will be servicing.
172 * This is used by compaction / migration procedures to identify and
173 * access the balloon device pageset while isolating / migrating pages.
174 *
175 * As some balloon drivers can register multiple balloon devices
176 * for a single guest, this also helps compaction / migration to
177 * properly deal with multiple balloon pagesets, when required.
178 */
179 mapping->private_data = b_dev_info;
180 b_dev_info->mapping = mapping;
181
182 return mapping;
183}
184EXPORT_SYMBOL_GPL(balloon_mapping_alloc);
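A sketch of the driver-side wiring this function expects: the driver supplies an address_space_operations with a .migratepage callback, which compaction later reaches through page->mapping->a_ops->migratepage. The callback body is driver specific and elided; everything prefixed with example_ is an assumption, not part of this patch.

#include <linux/err.h>
#include <linux/balloon_compaction.h>

/* hypothetical driver callback, body not shown */
static int example_balloon_migratepage(struct address_space *mapping,
		struct page *newpage, struct page *page,
		enum migrate_mode mode);

static const struct address_space_operations example_balloon_aops = {
	/* invoked via page->mapping->a_ops->migratepage during compaction */
	.migratepage = example_balloon_migratepage,
};

static int example_balloon_init_mapping(struct balloon_dev_info *info)
{
	struct address_space *mapping;

	mapping = balloon_mapping_alloc(info, &example_balloon_aops);
	if (IS_ERR(mapping))
		return PTR_ERR(mapping);
	/* balloon_mapping_alloc() has already stored it in info->mapping */
	return 0;
}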
185
186static inline void __isolate_balloon_page(struct page *page)
187{
188 struct balloon_dev_info *b_dev_info = page->mapping->private_data;
189 unsigned long flags;
190 spin_lock_irqsave(&b_dev_info->pages_lock, flags);
191 list_del(&page->lru);
192 b_dev_info->isolated_pages++;
193 spin_unlock_irqrestore(&b_dev_info->pages_lock, flags);
194}
195
196static inline void __putback_balloon_page(struct page *page)
197{
198 struct balloon_dev_info *b_dev_info = page->mapping->private_data;
199 unsigned long flags;
200 spin_lock_irqsave(&b_dev_info->pages_lock, flags);
201 list_add(&page->lru, &b_dev_info->pages);
202 b_dev_info->isolated_pages--;
203 spin_unlock_irqrestore(&b_dev_info->pages_lock, flags);
204}
205
206static inline int __migrate_balloon_page(struct address_space *mapping,
207 struct page *newpage, struct page *page, enum migrate_mode mode)
208{
209 return page->mapping->a_ops->migratepage(mapping, newpage, page, mode);
210}
211
212/* __isolate_lru_page() counterpart for a ballooned page */
213bool balloon_page_isolate(struct page *page)
214{
215 /*
216 * Avoid burning cycles with pages that are yet under __free_pages(),
217 * or just got freed under us.
218 *
219 * In case we 'win' a race for a balloon page being freed under us and
220 * raise its refcount, preventing __free_pages() from doing its job,
221 * the put_page() at the end of this block will take care of
222 * releasing this page, thus avoiding a nasty leak.
223 */
224 if (likely(get_page_unless_zero(page))) {
225 /*
226 * As balloon pages are not isolated from LRU lists, concurrent
227 * compaction threads can race against page migration functions
228 * as well as race against the balloon driver releasing a page.
229 *
230 * In order to avoid having an already isolated balloon page
231 * being (wrongly) re-isolated while it is under migration,
232 * or to avoid attempting to isolate pages being released by
233 * the balloon driver, let's be sure we have the page lock
234 * before proceeding with the balloon page isolation steps.
235 */
236 if (likely(trylock_page(page))) {
237 /*
238 * A ballooned page, by default, has just one refcount.
239 * Prevent concurrent compaction threads from isolating
240 * an already isolated balloon page by refcount check.
241 */
242 if (__is_movable_balloon_page(page) &&
243 page_count(page) == 2) {
244 __isolate_balloon_page(page);
245 unlock_page(page);
246 return true;
247 }
248 unlock_page(page);
249 }
250 put_page(page);
251 }
252 return false;
253}
254
255/* putback_lru_page() counterpart for a ballooned page */
256void balloon_page_putback(struct page *page)
257{
258 /*
259 * 'lock_page()' stabilizes the page and prevents races against
260 * concurrent isolation threads attempting to re-isolate it.
261 */
262 lock_page(page);
263
264 if (__is_movable_balloon_page(page)) {
265 __putback_balloon_page(page);
266 /* drop the extra ref count taken for page isolation */
267 put_page(page);
268 } else {
269 WARN_ON(1);
270 dump_page(page);
271 }
272 unlock_page(page);
273}
274
275/* move_to_new_page() counterpart for a ballooned page */
276int balloon_page_migrate(struct page *newpage,
277 struct page *page, enum migrate_mode mode)
278{
279 struct address_space *mapping;
280 int rc = -EAGAIN;
281
282 /*
283 * Block others from accessing the 'newpage' when we get around to
284 * establishing additional references. We should be the only one
285 * holding a reference to the 'newpage' at this point.
286 */
287 BUG_ON(!trylock_page(newpage));
288
289 if (WARN_ON(!__is_movable_balloon_page(page))) {
290 dump_page(page);
291 unlock_page(newpage);
292 return rc;
293 }
294
295 mapping = page->mapping;
296 if (mapping)
297 rc = __migrate_balloon_page(mapping, newpage, page, mode);
298
299 unlock_page(newpage);
300 return rc;
301}
302#endif /* CONFIG_BALLOON_COMPACTION */
diff --git a/mm/bootmem.c b/mm/bootmem.c
index f468185b3b28..b93376c39b61 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -147,21 +147,21 @@ unsigned long __init init_bootmem(unsigned long start, unsigned long pages)
147 147
148/* 148/*
149 * free_bootmem_late - free bootmem pages directly to page allocator 149 * free_bootmem_late - free bootmem pages directly to page allocator
150 * @addr: starting address of the range 150 * @addr: starting physical address of the range
151 * @size: size of the range in bytes 151 * @size: size of the range in bytes
152 * 152 *
153 * This is only useful when the bootmem allocator has already been torn 153 * This is only useful when the bootmem allocator has already been torn
154 * down, but we are still initializing the system. Pages are given directly 154 * down, but we are still initializing the system. Pages are given directly
155 * to the page allocator, no bootmem metadata is updated because it is gone. 155 * to the page allocator, no bootmem metadata is updated because it is gone.
156 */ 156 */
157void __init free_bootmem_late(unsigned long addr, unsigned long size) 157void __init free_bootmem_late(unsigned long physaddr, unsigned long size)
158{ 158{
159 unsigned long cursor, end; 159 unsigned long cursor, end;
160 160
161 kmemleak_free_part(__va(addr), size); 161 kmemleak_free_part(__va(physaddr), size);
162 162
163 cursor = PFN_UP(addr); 163 cursor = PFN_UP(physaddr);
164 end = PFN_DOWN(addr + size); 164 end = PFN_DOWN(physaddr + size);
165 165
166 for (; cursor < end; cursor++) { 166 for (; cursor < end; cursor++) {
167 __free_pages_bootmem(pfn_to_page(cursor), 0); 167 __free_pages_bootmem(pfn_to_page(cursor), 0);
@@ -185,10 +185,23 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
185 185
186 while (start < end) { 186 while (start < end) {
187 unsigned long *map, idx, vec; 187 unsigned long *map, idx, vec;
188 unsigned shift;
188 189
189 map = bdata->node_bootmem_map; 190 map = bdata->node_bootmem_map;
190 idx = start - bdata->node_min_pfn; 191 idx = start - bdata->node_min_pfn;
192 shift = idx & (BITS_PER_LONG - 1);
193 /*
194 * vec holds at most BITS_PER_LONG map bits,
195 * bit 0 corresponds to start.
196 */
191 vec = ~map[idx / BITS_PER_LONG]; 197 vec = ~map[idx / BITS_PER_LONG];
198
199 if (shift) {
200 vec >>= shift;
201 if (end - start >= BITS_PER_LONG)
202 vec |= ~map[idx / BITS_PER_LONG + 1] <<
203 (BITS_PER_LONG - shift);
204 }
192 /* 205 /*
193 * If we have a properly aligned and fully unreserved 206 * If we have a properly aligned and fully unreserved
194 * BITS_PER_LONG block of pages in front of us, free 207 * BITS_PER_LONG block of pages in front of us, free
@@ -201,19 +214,18 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
201 count += BITS_PER_LONG; 214 count += BITS_PER_LONG;
202 start += BITS_PER_LONG; 215 start += BITS_PER_LONG;
203 } else { 216 } else {
204 unsigned long off = 0; 217 unsigned long cur = start;
205 218
206 vec >>= start & (BITS_PER_LONG - 1); 219 start = ALIGN(start + 1, BITS_PER_LONG);
207 while (vec) { 220 while (vec && cur != start) {
208 if (vec & 1) { 221 if (vec & 1) {
209 page = pfn_to_page(start + off); 222 page = pfn_to_page(cur);
210 __free_pages_bootmem(page, 0); 223 __free_pages_bootmem(page, 0);
211 count++; 224 count++;
212 } 225 }
213 vec >>= 1; 226 vec >>= 1;
214 off++; 227 ++cur;
215 } 228 }
216 start = ALIGN(start + 1, BITS_PER_LONG);
217 } 229 }
218 } 230 }
219 231
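The new shift handling can be seen in isolation with a small standalone userspace sketch (names and values are made up, and the constants assume 64-bit unsigned long): it builds the same 'vec' word, with bit 0 corresponding to 'start', out of two bitmap words when 'start' is not aligned to BITS_PER_LONG. In the bootmem bitmap a set bit means "reserved", hence the inversions.

#include <stdio.h>

#define BITS_PER_LONG (8 * sizeof(unsigned long))

static unsigned long free_bits_at(const unsigned long *map,
				  unsigned long start, unsigned long end)
{
	unsigned long idx = start;	/* bit index; node_min_pfn is 0 here */
	unsigned shift = idx & (BITS_PER_LONG - 1);
	unsigned long vec = ~map[idx / BITS_PER_LONG];

	if (shift) {
		vec >>= shift;
		if (end - start >= BITS_PER_LONG)
			vec |= ~map[idx / BITS_PER_LONG + 1]
						<< (BITS_PER_LONG - shift);
	}
	return vec;
}

int main(void)
{
	/* two bitmap words: pfns 0-63 and 64-127; mark pfns 70-73 reserved */
	unsigned long map[2] = { 0, 0xfUL << (70 - 64) };
	unsigned long vec = free_bits_at(map, 60, 128);

	/* bit k of vec is pfn 60+k: expect bits 10-13 (pfns 70-73) clear */
	printf("vec = %#lx\n", vec);
	return 0;
}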
@@ -229,6 +241,22 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
229 return count; 241 return count;
230} 242}
231 243
244static void reset_node_lowmem_managed_pages(pg_data_t *pgdat)
245{
246 struct zone *z;
247
248 /*
249 * In free_area_init_core(), highmem zone's managed_pages is set to
250 * present_pages, and bootmem allocator doesn't allocate from highmem
251 * zones. So there's no need to recalculate managed_pages because all
252 * highmem pages will be managed by the buddy system. Here highmem
253 * zone also includes highmem movable zone.
254 */
255 for (z = pgdat->node_zones; z < pgdat->node_zones + MAX_NR_ZONES; z++)
256 if (!is_highmem(z))
257 z->managed_pages = 0;
258}
259
232/** 260/**
233 * free_all_bootmem_node - release a node's free pages to the buddy allocator 261 * free_all_bootmem_node - release a node's free pages to the buddy allocator
234 * @pgdat: node to be released 262 * @pgdat: node to be released
@@ -238,6 +266,7 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
238unsigned long __init free_all_bootmem_node(pg_data_t *pgdat) 266unsigned long __init free_all_bootmem_node(pg_data_t *pgdat)
239{ 267{
240 register_page_bootmem_info_node(pgdat); 268 register_page_bootmem_info_node(pgdat);
269 reset_node_lowmem_managed_pages(pgdat);
241 return free_all_bootmem_core(pgdat->bdata); 270 return free_all_bootmem_core(pgdat->bdata);
242} 271}
243 272
@@ -250,6 +279,10 @@ unsigned long __init free_all_bootmem(void)
250{ 279{
251 unsigned long total_pages = 0; 280 unsigned long total_pages = 0;
252 bootmem_data_t *bdata; 281 bootmem_data_t *bdata;
282 struct pglist_data *pgdat;
283
284 for_each_online_pgdat(pgdat)
285 reset_node_lowmem_managed_pages(pgdat);
253 286
254 list_for_each_entry(bdata, &bdata_list, list) 287 list_for_each_entry(bdata, &bdata_list, list)
255 total_pages += free_all_bootmem_core(bdata); 288 total_pages += free_all_bootmem_core(bdata);
@@ -377,21 +410,21 @@ void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
377 410
378/** 411/**
379 * free_bootmem - mark a page range as usable 412 * free_bootmem - mark a page range as usable
380 * @addr: starting address of the range 413 * @addr: starting physical address of the range
381 * @size: size of the range in bytes 414 * @size: size of the range in bytes
382 * 415 *
383 * Partial pages will be considered reserved and left as they are. 416 * Partial pages will be considered reserved and left as they are.
384 * 417 *
385 * The range must be contiguous but may span node boundaries. 418 * The range must be contiguous but may span node boundaries.
386 */ 419 */
387void __init free_bootmem(unsigned long addr, unsigned long size) 420void __init free_bootmem(unsigned long physaddr, unsigned long size)
388{ 421{
389 unsigned long start, end; 422 unsigned long start, end;
390 423
391 kmemleak_free_part(__va(addr), size); 424 kmemleak_free_part(__va(physaddr), size);
392 425
393 start = PFN_UP(addr); 426 start = PFN_UP(physaddr);
394 end = PFN_DOWN(addr + size); 427 end = PFN_DOWN(physaddr + size);
395 428
396 mark_bootmem(start, end, 0, 0); 429 mark_bootmem(start, end, 0, 0);
397} 430}
@@ -439,12 +472,6 @@ int __init reserve_bootmem(unsigned long addr, unsigned long size,
439 return mark_bootmem(start, end, 1, flags); 472 return mark_bootmem(start, end, 1, flags);
440} 473}
441 474
442int __weak __init reserve_bootmem_generic(unsigned long phys, unsigned long len,
443 int flags)
444{
445 return reserve_bootmem(phys, len, flags);
446}
447
448static unsigned long __init align_idx(struct bootmem_data *bdata, 475static unsigned long __init align_idx(struct bootmem_data *bdata,
449 unsigned long idx, unsigned long step) 476 unsigned long idx, unsigned long step)
450{ 477{
@@ -575,27 +602,6 @@ find_block:
575 return NULL; 602 return NULL;
576} 603}
577 604
578static void * __init alloc_arch_preferred_bootmem(bootmem_data_t *bdata,
579 unsigned long size, unsigned long align,
580 unsigned long goal, unsigned long limit)
581{
582 if (WARN_ON_ONCE(slab_is_available()))
583 return kzalloc(size, GFP_NOWAIT);
584
585#ifdef CONFIG_HAVE_ARCH_BOOTMEM
586 {
587 bootmem_data_t *p_bdata;
588
589 p_bdata = bootmem_arch_preferred_node(bdata, size, align,
590 goal, limit);
591 if (p_bdata)
592 return alloc_bootmem_bdata(p_bdata, size, align,
593 goal, limit);
594 }
595#endif
596 return NULL;
597}
598
599static void * __init alloc_bootmem_core(unsigned long size, 605static void * __init alloc_bootmem_core(unsigned long size,
600 unsigned long align, 606 unsigned long align,
601 unsigned long goal, 607 unsigned long goal,
@@ -604,9 +610,8 @@ static void * __init alloc_bootmem_core(unsigned long size,
604 bootmem_data_t *bdata; 610 bootmem_data_t *bdata;
605 void *region; 611 void *region;
606 612
607 region = alloc_arch_preferred_bootmem(NULL, size, align, goal, limit); 613 if (WARN_ON_ONCE(slab_is_available()))
608 if (region) 614 return kzalloc(size, GFP_NOWAIT);
609 return region;
610 615
611 list_for_each_entry(bdata, &bdata_list, list) { 616 list_for_each_entry(bdata, &bdata_list, list) {
612 if (goal && bdata->node_low_pfn <= PFN_DOWN(goal)) 617 if (goal && bdata->node_low_pfn <= PFN_DOWN(goal))
@@ -704,11 +709,9 @@ void * __init ___alloc_bootmem_node_nopanic(pg_data_t *pgdat,
704{ 709{
705 void *ptr; 710 void *ptr;
706 711
712 if (WARN_ON_ONCE(slab_is_available()))
713 return kzalloc(size, GFP_NOWAIT);
707again: 714again:
708 ptr = alloc_arch_preferred_bootmem(pgdat->bdata, size,
709 align, goal, limit);
710 if (ptr)
711 return ptr;
712 715
713 /* do not panic in alloc_bootmem_bdata() */ 716 /* do not panic in alloc_bootmem_bdata() */
714 if (limit && goal + size > limit) 717 if (limit && goal + size > limit)
diff --git a/mm/compaction.c b/mm/compaction.c
index 694eaabaaebd..c62bd063d766 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -14,8 +14,24 @@
14#include <linux/backing-dev.h> 14#include <linux/backing-dev.h>
15#include <linux/sysctl.h> 15#include <linux/sysctl.h>
16#include <linux/sysfs.h> 16#include <linux/sysfs.h>
17#include <linux/balloon_compaction.h>
17#include "internal.h" 18#include "internal.h"
18 19
20#ifdef CONFIG_COMPACTION
21static inline void count_compact_event(enum vm_event_item item)
22{
23 count_vm_event(item);
24}
25
26static inline void count_compact_events(enum vm_event_item item, long delta)
27{
28 count_vm_events(item, delta);
29}
30#else
31#define count_compact_event(item) do { } while (0)
32#define count_compact_events(item, delta) do { } while (0)
33#endif
34
19#if defined CONFIG_COMPACTION || defined CONFIG_CMA 35#if defined CONFIG_COMPACTION || defined CONFIG_CMA
20 36
21#define CREATE_TRACE_POINTS 37#define CREATE_TRACE_POINTS
@@ -214,60 +230,6 @@ static bool suitable_migration_target(struct page *page)
214 return false; 230 return false;
215} 231}
216 232
217static void compact_capture_page(struct compact_control *cc)
218{
219 unsigned long flags;
220 int mtype, mtype_low, mtype_high;
221
222 if (!cc->page || *cc->page)
223 return;
224
225 /*
226 * For MIGRATE_MOVABLE allocations we capture a suitable page ASAP
227 * regardless of the migratetype of the freelist is is captured from.
228 * This is fine because the order for a high-order MIGRATE_MOVABLE
229 * allocation is typically at least a pageblock size and overall
230 * fragmentation is not impaired. Other allocation types must
231 * capture pages from their own migratelist because otherwise they
232 * could pollute other pageblocks like MIGRATE_MOVABLE with
233 * difficult to move pages and making fragmentation worse overall.
234 */
235 if (cc->migratetype == MIGRATE_MOVABLE) {
236 mtype_low = 0;
237 mtype_high = MIGRATE_PCPTYPES;
238 } else {
239 mtype_low = cc->migratetype;
240 mtype_high = cc->migratetype + 1;
241 }
242
243 /* Speculatively examine the free lists without zone lock */
244 for (mtype = mtype_low; mtype < mtype_high; mtype++) {
245 int order;
246 for (order = cc->order; order < MAX_ORDER; order++) {
247 struct page *page;
248 struct free_area *area;
249 area = &(cc->zone->free_area[order]);
250 if (list_empty(&area->free_list[mtype]))
251 continue;
252
253 /* Take the lock and attempt capture of the page */
254 if (!compact_trylock_irqsave(&cc->zone->lock, &flags, cc))
255 return;
256 if (!list_empty(&area->free_list[mtype])) {
257 page = list_entry(area->free_list[mtype].next,
258 struct page, lru);
259 if (capture_free_page(page, cc->order, mtype)) {
260 spin_unlock_irqrestore(&cc->zone->lock,
261 flags);
262 *cc->page = page;
263 return;
264 }
265 }
266 spin_unlock_irqrestore(&cc->zone->lock, flags);
267 }
268 }
269}
270
271/* 233/*
272 * Isolate free pages onto a private freelist. Caller must hold zone->lock. 234 * Isolate free pages onto a private freelist. Caller must hold zone->lock.
273 * If @strict is true, will abort returning 0 on any invalid PFNs or non-free 235 * If @strict is true, will abort returning 0 on any invalid PFNs or non-free
@@ -356,6 +318,9 @@ static unsigned long isolate_freepages_block(struct compact_control *cc,
356 if (blockpfn == end_pfn) 318 if (blockpfn == end_pfn)
357 update_pageblock_skip(cc, valid_page, total_isolated, false); 319 update_pageblock_skip(cc, valid_page, total_isolated, false);
358 320
321 count_compact_events(COMPACTFREE_SCANNED, nr_scanned);
322 if (total_isolated)
323 count_compact_events(COMPACTISOLATED, total_isolated);
359 return total_isolated; 324 return total_isolated;
360} 325}
361 326
@@ -565,9 +530,24 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
565 goto next_pageblock; 530 goto next_pageblock;
566 } 531 }
567 532
568 /* Check may be lockless but that's ok as we recheck later */ 533 /*
569 if (!PageLRU(page)) 534 * Check may be lockless but that's ok as we recheck later.
535 * It's possible to migrate LRU pages and balloon pages;
536 * skip any other type of page.
537 */
538 if (!PageLRU(page)) {
539 if (unlikely(balloon_page_movable(page))) {
540 if (locked && balloon_page_isolate(page)) {
541 /* Successfully isolated */
542 cc->finished_update_migrate = true;
543 list_add(&page->lru, migratelist);
544 cc->nr_migratepages++;
545 nr_isolated++;
546 goto check_compact_cluster;
547 }
548 }
570 continue; 549 continue;
550 }
571 551
572 /* 552 /*
573 * PageLRU is set. lru_lock normally excludes isolation 553 * PageLRU is set. lru_lock normally excludes isolation
@@ -621,6 +601,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
621 cc->nr_migratepages++; 601 cc->nr_migratepages++;
622 nr_isolated++; 602 nr_isolated++;
623 603
604check_compact_cluster:
624 /* Avoid isolating too much */ 605 /* Avoid isolating too much */
625 if (cc->nr_migratepages == COMPACT_CLUSTER_MAX) { 606 if (cc->nr_migratepages == COMPACT_CLUSTER_MAX) {
626 ++low_pfn; 607 ++low_pfn;
@@ -646,6 +627,10 @@ next_pageblock:
646 627
647 trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated); 628 trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated);
648 629
630 count_compact_events(COMPACTMIGRATE_SCANNED, nr_scanned);
631 if (nr_isolated)
632 count_compact_events(COMPACTISOLATED, nr_isolated);
633
649 return low_pfn; 634 return low_pfn;
650} 635}
651 636
@@ -831,6 +816,7 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
831static int compact_finished(struct zone *zone, 816static int compact_finished(struct zone *zone,
832 struct compact_control *cc) 817 struct compact_control *cc)
833{ 818{
819 unsigned int order;
834 unsigned long watermark; 820 unsigned long watermark;
835 821
836 if (fatal_signal_pending(current)) 822 if (fatal_signal_pending(current))
@@ -865,22 +851,16 @@ static int compact_finished(struct zone *zone,
865 return COMPACT_CONTINUE; 851 return COMPACT_CONTINUE;
866 852
867 /* Direct compactor: Is a suitable page free? */ 853 /* Direct compactor: Is a suitable page free? */
868 if (cc->page) { 854 for (order = cc->order; order < MAX_ORDER; order++) {
869 /* Was a suitable page captured? */ 855 struct free_area *area = &zone->free_area[order];
870 if (*cc->page) 856
857 /* Job done if page is free of the right migratetype */
858 if (!list_empty(&area->free_list[cc->migratetype]))
859 return COMPACT_PARTIAL;
860
861 /* Job done if allocation would set block type */
862 if (cc->order >= pageblock_order && area->nr_free)
871 return COMPACT_PARTIAL; 863 return COMPACT_PARTIAL;
872 } else {
873 unsigned int order;
874 for (order = cc->order; order < MAX_ORDER; order++) {
875 struct free_area *area = &zone->free_area[cc->order];
876 /* Job done if page is free of the right migratetype */
877 if (!list_empty(&area->free_list[cc->migratetype]))
878 return COMPACT_PARTIAL;
879
880 /* Job done if allocation would set block type */
881 if (cc->order >= pageblock_order && area->nr_free)
882 return COMPACT_PARTIAL;
883 }
884 } 864 }
885 865
886 return COMPACT_CONTINUE; 866 return COMPACT_CONTINUE;
@@ -986,7 +966,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
986 switch (isolate_migratepages(zone, cc)) { 966 switch (isolate_migratepages(zone, cc)) {
987 case ISOLATE_ABORT: 967 case ISOLATE_ABORT:
988 ret = COMPACT_PARTIAL; 968 ret = COMPACT_PARTIAL;
989 putback_lru_pages(&cc->migratepages); 969 putback_movable_pages(&cc->migratepages);
990 cc->nr_migratepages = 0; 970 cc->nr_migratepages = 0;
991 goto out; 971 goto out;
992 case ISOLATE_NONE: 972 case ISOLATE_NONE:
@@ -998,29 +978,23 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
998 nr_migrate = cc->nr_migratepages; 978 nr_migrate = cc->nr_migratepages;
999 err = migrate_pages(&cc->migratepages, compaction_alloc, 979 err = migrate_pages(&cc->migratepages, compaction_alloc,
1000 (unsigned long)cc, false, 980 (unsigned long)cc, false,
1001 cc->sync ? MIGRATE_SYNC_LIGHT : MIGRATE_ASYNC); 981 cc->sync ? MIGRATE_SYNC_LIGHT : MIGRATE_ASYNC,
982 MR_COMPACTION);
1002 update_nr_listpages(cc); 983 update_nr_listpages(cc);
1003 nr_remaining = cc->nr_migratepages; 984 nr_remaining = cc->nr_migratepages;
1004 985
1005 count_vm_event(COMPACTBLOCKS);
1006 count_vm_events(COMPACTPAGES, nr_migrate - nr_remaining);
1007 if (nr_remaining)
1008 count_vm_events(COMPACTPAGEFAILED, nr_remaining);
1009 trace_mm_compaction_migratepages(nr_migrate - nr_remaining, 986 trace_mm_compaction_migratepages(nr_migrate - nr_remaining,
1010 nr_remaining); 987 nr_remaining);
1011 988
1012 /* Release LRU pages not migrated */ 989 /* Release isolated pages not migrated */
1013 if (err) { 990 if (err) {
1014 putback_lru_pages(&cc->migratepages); 991 putback_movable_pages(&cc->migratepages);
1015 cc->nr_migratepages = 0; 992 cc->nr_migratepages = 0;
1016 if (err == -ENOMEM) { 993 if (err == -ENOMEM) {
1017 ret = COMPACT_PARTIAL; 994 ret = COMPACT_PARTIAL;
1018 goto out; 995 goto out;
1019 } 996 }
1020 } 997 }
1021
1022 /* Capture a page now if it is a suitable size */
1023 compact_capture_page(cc);
1024 } 998 }
1025 999
1026out: 1000out:
@@ -1033,8 +1007,7 @@ out:
1033 1007
1034static unsigned long compact_zone_order(struct zone *zone, 1008static unsigned long compact_zone_order(struct zone *zone,
1035 int order, gfp_t gfp_mask, 1009 int order, gfp_t gfp_mask,
1036 bool sync, bool *contended, 1010 bool sync, bool *contended)
1037 struct page **page)
1038{ 1011{
1039 unsigned long ret; 1012 unsigned long ret;
1040 struct compact_control cc = { 1013 struct compact_control cc = {
@@ -1044,7 +1017,6 @@ static unsigned long compact_zone_order(struct zone *zone,
1044 .migratetype = allocflags_to_migratetype(gfp_mask), 1017 .migratetype = allocflags_to_migratetype(gfp_mask),
1045 .zone = zone, 1018 .zone = zone,
1046 .sync = sync, 1019 .sync = sync,
1047 .page = page,
1048 }; 1020 };
1049 INIT_LIST_HEAD(&cc.freepages); 1021 INIT_LIST_HEAD(&cc.freepages);
1050 INIT_LIST_HEAD(&cc.migratepages); 1022 INIT_LIST_HEAD(&cc.migratepages);
@@ -1074,7 +1046,7 @@ int sysctl_extfrag_threshold = 500;
1074 */ 1046 */
1075unsigned long try_to_compact_pages(struct zonelist *zonelist, 1047unsigned long try_to_compact_pages(struct zonelist *zonelist,
1076 int order, gfp_t gfp_mask, nodemask_t *nodemask, 1048 int order, gfp_t gfp_mask, nodemask_t *nodemask,
1077 bool sync, bool *contended, struct page **page) 1049 bool sync, bool *contended)
1078{ 1050{
1079 enum zone_type high_zoneidx = gfp_zone(gfp_mask); 1051 enum zone_type high_zoneidx = gfp_zone(gfp_mask);
1080 int may_enter_fs = gfp_mask & __GFP_FS; 1052 int may_enter_fs = gfp_mask & __GFP_FS;
@@ -1088,7 +1060,7 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist,
1088 if (!order || !may_enter_fs || !may_perform_io) 1060 if (!order || !may_enter_fs || !may_perform_io)
1089 return rc; 1061 return rc;
1090 1062
1091 count_vm_event(COMPACTSTALL); 1063 count_compact_event(COMPACTSTALL);
1092 1064
1093#ifdef CONFIG_CMA 1065#ifdef CONFIG_CMA
1094 if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE) 1066 if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
@@ -1100,7 +1072,7 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist,
1100 int status; 1072 int status;
1101 1073
1102 status = compact_zone_order(zone, order, gfp_mask, sync, 1074 status = compact_zone_order(zone, order, gfp_mask, sync,
1103 contended, page); 1075 contended);
1104 rc = max(status, rc); 1076 rc = max(status, rc);
1105 1077
1106 /* If a normal allocation would succeed, stop compacting */ 1078 /* If a normal allocation would succeed, stop compacting */
@@ -1156,7 +1128,6 @@ int compact_pgdat(pg_data_t *pgdat, int order)
1156 struct compact_control cc = { 1128 struct compact_control cc = {
1157 .order = order, 1129 .order = order,
1158 .sync = false, 1130 .sync = false,
1159 .page = NULL,
1160 }; 1131 };
1161 1132
1162 return __compact_pgdat(pgdat, &cc); 1133 return __compact_pgdat(pgdat, &cc);
@@ -1167,14 +1138,13 @@ static int compact_node(int nid)
1167 struct compact_control cc = { 1138 struct compact_control cc = {
1168 .order = -1, 1139 .order = -1,
1169 .sync = true, 1140 .sync = true,
1170 .page = NULL,
1171 }; 1141 };
1172 1142
1173 return __compact_pgdat(NODE_DATA(nid), &cc); 1143 return __compact_pgdat(NODE_DATA(nid), &cc);
1174} 1144}
1175 1145
1176/* Compact all nodes in the system */ 1146/* Compact all nodes in the system */
1177static int compact_nodes(void) 1147static void compact_nodes(void)
1178{ 1148{
1179 int nid; 1149 int nid;
1180 1150
@@ -1183,8 +1153,6 @@ static int compact_nodes(void)
1183 1153
1184 for_each_online_node(nid) 1154 for_each_online_node(nid)
1185 compact_node(nid); 1155 compact_node(nid);
1186
1187 return COMPACT_COMPLETE;
1188} 1156}
1189 1157
1190/* The written value is actually unused, all memory is compacted */ 1158/* The written value is actually unused, all memory is compacted */
@@ -1195,7 +1163,7 @@ int sysctl_compaction_handler(struct ctl_table *table, int write,
1195 void __user *buffer, size_t *length, loff_t *ppos) 1163 void __user *buffer, size_t *length, loff_t *ppos)
1196{ 1164{
1197 if (write) 1165 if (write)
1198 return compact_nodes(); 1166 compact_nodes();
1199 1167
1200 return 0; 1168 return 0;
1201} 1169}
diff --git a/mm/dmapool.c b/mm/dmapool.c
index c5ab33bca0a8..c69781e97cf9 100644
--- a/mm/dmapool.c
+++ b/mm/dmapool.c
@@ -50,7 +50,6 @@ struct dma_pool { /* the pool */
50 size_t allocation; 50 size_t allocation;
51 size_t boundary; 51 size_t boundary;
52 char name[32]; 52 char name[32];
53 wait_queue_head_t waitq;
54 struct list_head pools; 53 struct list_head pools;
55}; 54};
56 55
@@ -62,8 +61,6 @@ struct dma_page { /* cacheable header for 'allocation' bytes */
62 unsigned int offset; 61 unsigned int offset;
63}; 62};
64 63
65#define POOL_TIMEOUT_JIFFIES ((100 /* msec */ * HZ) / 1000)
66
67static DEFINE_MUTEX(pools_lock); 64static DEFINE_MUTEX(pools_lock);
68 65
69static ssize_t 66static ssize_t
@@ -172,7 +169,6 @@ struct dma_pool *dma_pool_create(const char *name, struct device *dev,
172 retval->size = size; 169 retval->size = size;
173 retval->boundary = boundary; 170 retval->boundary = boundary;
174 retval->allocation = allocation; 171 retval->allocation = allocation;
175 init_waitqueue_head(&retval->waitq);
176 172
177 if (dev) { 173 if (dev) {
178 int ret; 174 int ret;
@@ -227,7 +223,6 @@ static struct dma_page *pool_alloc_page(struct dma_pool *pool, gfp_t mem_flags)
227 memset(page->vaddr, POOL_POISON_FREED, pool->allocation); 223 memset(page->vaddr, POOL_POISON_FREED, pool->allocation);
228#endif 224#endif
229 pool_initialise_page(pool, page); 225 pool_initialise_page(pool, page);
230 list_add(&page->page_list, &pool->page_list);
231 page->in_use = 0; 226 page->in_use = 0;
232 page->offset = 0; 227 page->offset = 0;
233 } else { 228 } else {
@@ -315,30 +310,21 @@ void *dma_pool_alloc(struct dma_pool *pool, gfp_t mem_flags,
315 might_sleep_if(mem_flags & __GFP_WAIT); 310 might_sleep_if(mem_flags & __GFP_WAIT);
316 311
317 spin_lock_irqsave(&pool->lock, flags); 312 spin_lock_irqsave(&pool->lock, flags);
318 restart:
319 list_for_each_entry(page, &pool->page_list, page_list) { 313 list_for_each_entry(page, &pool->page_list, page_list) {
320 if (page->offset < pool->allocation) 314 if (page->offset < pool->allocation)
321 goto ready; 315 goto ready;
322 } 316 }
323 page = pool_alloc_page(pool, GFP_ATOMIC);
324 if (!page) {
325 if (mem_flags & __GFP_WAIT) {
326 DECLARE_WAITQUEUE(wait, current);
327 317
328 __set_current_state(TASK_UNINTERRUPTIBLE); 318 /* pool_alloc_page() might sleep, so temporarily drop &pool->lock */
329 __add_wait_queue(&pool->waitq, &wait); 319 spin_unlock_irqrestore(&pool->lock, flags);
330 spin_unlock_irqrestore(&pool->lock, flags);
331 320
332 schedule_timeout(POOL_TIMEOUT_JIFFIES); 321 page = pool_alloc_page(pool, mem_flags);
322 if (!page)
323 return NULL;
333 324
334 spin_lock_irqsave(&pool->lock, flags); 325 spin_lock_irqsave(&pool->lock, flags);
335 __remove_wait_queue(&pool->waitq, &wait);
336 goto restart;
337 }
338 retval = NULL;
339 goto done;
340 }
341 326
327 list_add(&page->page_list, &pool->page_list);
342 ready: 328 ready:
343 page->in_use++; 329 page->in_use++;
344 offset = page->offset; 330 offset = page->offset;
@@ -346,9 +332,32 @@ void *dma_pool_alloc(struct dma_pool *pool, gfp_t mem_flags,
346 retval = offset + page->vaddr; 332 retval = offset + page->vaddr;
347 *handle = offset + page->dma; 333 *handle = offset + page->dma;
348#ifdef DMAPOOL_DEBUG 334#ifdef DMAPOOL_DEBUG
335 {
336 int i;
337 u8 *data = retval;
338 /* page->offset is stored in first 4 bytes */
339 for (i = sizeof(page->offset); i < pool->size; i++) {
340 if (data[i] == POOL_POISON_FREED)
341 continue;
342 if (pool->dev)
343 dev_err(pool->dev,
344 "dma_pool_alloc %s, %p (corruped)\n",
345 pool->name, retval);
346 else
347 pr_err("dma_pool_alloc %s, %p (corruped)\n",
348 pool->name, retval);
349
350 /*
351 * Dump the first 4 bytes even if they are not
352 * POOL_POISON_FREED
353 */
354 print_hex_dump(KERN_ERR, "", DUMP_PREFIX_OFFSET, 16, 1,
355 data, pool->size, 1);
356 break;
357 }
358 }
349 memset(retval, POOL_POISON_ALLOCATED, pool->size); 359 memset(retval, POOL_POISON_ALLOCATED, pool->size);
350#endif 360#endif
351 done:
352 spin_unlock_irqrestore(&pool->lock, flags); 361 spin_unlock_irqrestore(&pool->lock, flags);
353 return retval; 362 return retval;
354} 363}
@@ -435,8 +444,6 @@ void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t dma)
435 page->in_use--; 444 page->in_use--;
436 *(int *)vaddr = page->offset; 445 *(int *)vaddr = page->offset;
437 page->offset = offset; 446 page->offset = offset;
438 if (waitqueue_active(&pool->waitq))
439 wake_up_locked(&pool->waitq);
440 /* 447 /*
441 * Resist a temptation to do 448 * Resist a temptation to do
442 * if (!is_page_busy(page)) pool_free_page(pool, page); 449 * if (!is_page_busy(page)) pool_free_page(pool, page);
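With the waitqueue gone, a GFP_KERNEL caller simply sleeps inside pool_alloc_page() when a new pool page is needed. A minimal usage sketch of the dma_pool API as a driver sees it after this change (the device, block size, and function name are made up for illustration):

#include <linux/dmapool.h>

/* Hypothetical driver snippet: one 64-byte, 16-byte-aligned DMA block. */
static int example_setup_desc(struct device *dev)
{
	struct dma_pool *pool;
	dma_addr_t dma;
	void *desc;

	pool = dma_pool_create("example-desc", dev, 64, 16, 0);
	if (!pool)
		return -ENOMEM;

	/* may sleep and allocate a fresh pool page if none has room */
	desc = dma_pool_alloc(pool, GFP_KERNEL, &dma);
	if (!desc) {
		dma_pool_destroy(pool);
		return -ENOMEM;
	}

	/* ... hand 'dma' to the hardware, use 'desc' from the CPU ... */

	dma_pool_free(pool, desc, dma);
	dma_pool_destroy(pool);
	return 0;
}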
diff --git a/mm/highmem.c b/mm/highmem.c
index 2da13a5c50e2..b32b70cdaed6 100644
--- a/mm/highmem.c
+++ b/mm/highmem.c
@@ -99,12 +99,13 @@ struct page *kmap_to_page(void *vaddr)
99 unsigned long addr = (unsigned long)vaddr; 99 unsigned long addr = (unsigned long)vaddr;
100 100
101 if (addr >= PKMAP_ADDR(0) && addr < PKMAP_ADDR(LAST_PKMAP)) { 101 if (addr >= PKMAP_ADDR(0) && addr < PKMAP_ADDR(LAST_PKMAP)) {
102 int i = (addr - PKMAP_ADDR(0)) >> PAGE_SHIFT; 102 int i = PKMAP_NR(addr);
103 return pte_page(pkmap_page_table[i]); 103 return pte_page(pkmap_page_table[i]);
104 } 104 }
105 105
106 return virt_to_page(addr); 106 return virt_to_page(addr);
107} 107}
108EXPORT_SYMBOL(kmap_to_page);
108 109
109static void flush_all_zero_pkmaps(void) 110static void flush_all_zero_pkmaps(void)
110{ 111{
@@ -137,8 +138,7 @@ static void flush_all_zero_pkmaps(void)
137 * So no dangers, even with speculative execution. 138 * So no dangers, even with speculative execution.
138 */ 139 */
139 page = pte_page(pkmap_page_table[i]); 140 page = pte_page(pkmap_page_table[i]);
140 pte_clear(&init_mm, (unsigned long)page_address(page), 141 pte_clear(&init_mm, PKMAP_ADDR(i), &pkmap_page_table[i]);
141 &pkmap_page_table[i]);
142 142
143 set_page_address(page, NULL); 143 set_page_address(page, NULL);
144 need_flush = 1; 144 need_flush = 1;
@@ -324,11 +324,7 @@ struct page_address_map {
324 struct list_head list; 324 struct list_head list;
325}; 325};
326 326
327/* 327static struct page_address_map page_address_maps[LAST_PKMAP];
328 * page_address_map freelist, allocated from page_address_maps.
329 */
330static struct list_head page_address_pool; /* freelist */
331static spinlock_t pool_lock; /* protects page_address_pool */
332 328
333/* 329/*
334 * Hash table bucket 330 * Hash table bucket
@@ -393,14 +389,7 @@ void set_page_address(struct page *page, void *virtual)
393 389
394 pas = page_slot(page); 390 pas = page_slot(page);
395 if (virtual) { /* Add */ 391 if (virtual) { /* Add */
396 BUG_ON(list_empty(&page_address_pool)); 392 pam = &page_address_maps[PKMAP_NR((unsigned long)virtual)];
397
398 spin_lock_irqsave(&pool_lock, flags);
399 pam = list_entry(page_address_pool.next,
400 struct page_address_map, list);
401 list_del(&pam->list);
402 spin_unlock_irqrestore(&pool_lock, flags);
403
404 pam->page = page; 393 pam->page = page;
405 pam->virtual = virtual; 394 pam->virtual = virtual;
406 395
@@ -413,9 +402,6 @@ void set_page_address(struct page *page, void *virtual)
413 if (pam->page == page) { 402 if (pam->page == page) {
414 list_del(&pam->list); 403 list_del(&pam->list);
415 spin_unlock_irqrestore(&pas->lock, flags); 404 spin_unlock_irqrestore(&pas->lock, flags);
416 spin_lock_irqsave(&pool_lock, flags);
417 list_add_tail(&pam->list, &page_address_pool);
418 spin_unlock_irqrestore(&pool_lock, flags);
419 goto done; 405 goto done;
420 } 406 }
421 } 407 }
@@ -425,20 +411,14 @@ done:
425 return; 411 return;
426} 412}
427 413
428static struct page_address_map page_address_maps[LAST_PKMAP];
429
430void __init page_address_init(void) 414void __init page_address_init(void)
431{ 415{
432 int i; 416 int i;
433 417
434 INIT_LIST_HEAD(&page_address_pool);
435 for (i = 0; i < ARRAY_SIZE(page_address_maps); i++)
436 list_add(&page_address_maps[i].list, &page_address_pool);
437 for (i = 0; i < ARRAY_SIZE(page_address_htable); i++) { 418 for (i = 0; i < ARRAY_SIZE(page_address_htable); i++) {
438 INIT_LIST_HEAD(&page_address_htable[i].lh); 419 INIT_LIST_HEAD(&page_address_htable[i].lh);
439 spin_lock_init(&page_address_htable[i].lock); 420 spin_lock_init(&page_address_htable[i].lock);
440 } 421 }
441 spin_lock_init(&pool_lock);
442} 422}
443 423
444#endif /* defined(CONFIG_HIGHMEM) && !defined(WANT_PAGE_VIRTUAL) */ 424#endif /* defined(CONFIG_HIGHMEM) && !defined(WANT_PAGE_VIRTUAL) */
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 40f17c34b415..b5783d81eda9 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -12,12 +12,15 @@
12#include <linux/mmu_notifier.h> 12#include <linux/mmu_notifier.h>
13#include <linux/rmap.h> 13#include <linux/rmap.h>
14#include <linux/swap.h> 14#include <linux/swap.h>
15#include <linux/shrinker.h>
15#include <linux/mm_inline.h> 16#include <linux/mm_inline.h>
16#include <linux/kthread.h> 17#include <linux/kthread.h>
17#include <linux/khugepaged.h> 18#include <linux/khugepaged.h>
18#include <linux/freezer.h> 19#include <linux/freezer.h>
19#include <linux/mman.h> 20#include <linux/mman.h>
20#include <linux/pagemap.h> 21#include <linux/pagemap.h>
22#include <linux/migrate.h>
23
21#include <asm/tlb.h> 24#include <asm/tlb.h>
22#include <asm/pgalloc.h> 25#include <asm/pgalloc.h>
23#include "internal.h" 26#include "internal.h"
@@ -37,7 +40,8 @@ unsigned long transparent_hugepage_flags __read_mostly =
37 (1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG)| 40 (1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG)|
38#endif 41#endif
39 (1<<TRANSPARENT_HUGEPAGE_DEFRAG_FLAG)| 42 (1<<TRANSPARENT_HUGEPAGE_DEFRAG_FLAG)|
40 (1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG); 43 (1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG)|
44 (1<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
41 45
42/* default scan 8*512 pte (or vmas) every 30 second */ 46/* default scan 8*512 pte (or vmas) every 30 second */
43static unsigned int khugepaged_pages_to_scan __read_mostly = HPAGE_PMD_NR*8; 47static unsigned int khugepaged_pages_to_scan __read_mostly = HPAGE_PMD_NR*8;
@@ -159,6 +163,77 @@ static int start_khugepaged(void)
159 return err; 163 return err;
160} 164}
161 165
166static atomic_t huge_zero_refcount;
167static unsigned long huge_zero_pfn __read_mostly;
168
169static inline bool is_huge_zero_pfn(unsigned long pfn)
170{
171 unsigned long zero_pfn = ACCESS_ONCE(huge_zero_pfn);
172 return zero_pfn && pfn == zero_pfn;
173}
174
175static inline bool is_huge_zero_pmd(pmd_t pmd)
176{
177 return is_huge_zero_pfn(pmd_pfn(pmd));
178}
179
180static unsigned long get_huge_zero_page(void)
181{
182 struct page *zero_page;
183retry:
184 if (likely(atomic_inc_not_zero(&huge_zero_refcount)))
185 return ACCESS_ONCE(huge_zero_pfn);
186
187 zero_page = alloc_pages((GFP_TRANSHUGE | __GFP_ZERO) & ~__GFP_MOVABLE,
188 HPAGE_PMD_ORDER);
189 if (!zero_page) {
190 count_vm_event(THP_ZERO_PAGE_ALLOC_FAILED);
191 return 0;
192 }
193 count_vm_event(THP_ZERO_PAGE_ALLOC);
194 preempt_disable();
195 if (cmpxchg(&huge_zero_pfn, 0, page_to_pfn(zero_page))) {
196 preempt_enable();
197 __free_page(zero_page);
198 goto retry;
199 }
200
201 /* We take an additional reference here. It will be put back by the shrinker */
202 atomic_set(&huge_zero_refcount, 2);
203 preempt_enable();
204 return ACCESS_ONCE(huge_zero_pfn);
205}
206
207static void put_huge_zero_page(void)
208{
209 /*
210 * Counter should never go to zero here. Only shrinker can put
211 * last reference.
212 */
213 BUG_ON(atomic_dec_and_test(&huge_zero_refcount));
214}
215
216static int shrink_huge_zero_page(struct shrinker *shrink,
217 struct shrink_control *sc)
218{
219 if (!sc->nr_to_scan)
220 /* we can free zero page only if last reference remains */
221 return atomic_read(&huge_zero_refcount) == 1 ? HPAGE_PMD_NR : 0;
222
223 if (atomic_cmpxchg(&huge_zero_refcount, 1, 0) == 1) {
224 unsigned long zero_pfn = xchg(&huge_zero_pfn, 0);
225 BUG_ON(zero_pfn == 0);
226 __free_page(__pfn_to_page(zero_pfn));
227 }
228
229 return 0;
230}
231
232static struct shrinker huge_zero_page_shrinker = {
233 .shrink = shrink_huge_zero_page,
234 .seeks = DEFAULT_SEEKS,
235};
236
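The scheme above keeps one reference cached for the shrinker, so the huge zero page survives while any mapping uses it and is only freed once the shrinker's reference is the last one. A deliberately simplified, mutex-based userspace model of that lifecycle (the kernel code uses atomic_inc_not_zero()/cmpxchg() instead of a lock and stores a pfn rather than a pointer):

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static void *zero_page;		/* stand-in for huge_zero_pfn */
static int refcount;		/* one reference is always kept for the shrinker */

static void *get_zero(void)
{
	pthread_mutex_lock(&lock);
	if (!zero_page) {
		zero_page = calloc(1, 2 * 1024 * 1024);	/* 2MB page stand-in */
		if (!zero_page) {
			pthread_mutex_unlock(&lock);
			return NULL;
		}
		refcount = 1;		/* the shrinker's cached reference */
	}
	refcount++;			/* the caller's reference */
	pthread_mutex_unlock(&lock);
	return zero_page;
}

static void put_zero(void)
{
	pthread_mutex_lock(&lock);
	refcount--;	/* never reaches zero here; the shrinker owns the last ref */
	pthread_mutex_unlock(&lock);
}

/* shrinker: free only when no user holds a reference (count == 1) */
static int shrink_zero(void)
{
	int freed = 0;

	pthread_mutex_lock(&lock);
	if (zero_page && refcount == 1) {
		free(zero_page);
		zero_page = NULL;
		refcount = 0;
		freed = 1;
	}
	pthread_mutex_unlock(&lock);
	return freed;
}

int main(void)
{
	void *p = get_zero();

	printf("in use, shrink freed: %d\n", shrink_zero());	/* 0 */
	put_zero();
	printf("idle,   shrink freed: %d\n", shrink_zero());	/* 1 */
	(void)p;
	return 0;
}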
162#ifdef CONFIG_SYSFS 237#ifdef CONFIG_SYSFS
163 238
164static ssize_t double_flag_show(struct kobject *kobj, 239static ssize_t double_flag_show(struct kobject *kobj,
@@ -284,6 +359,20 @@ static ssize_t defrag_store(struct kobject *kobj,
284static struct kobj_attribute defrag_attr = 359static struct kobj_attribute defrag_attr =
285 __ATTR(defrag, 0644, defrag_show, defrag_store); 360 __ATTR(defrag, 0644, defrag_show, defrag_store);
286 361
362static ssize_t use_zero_page_show(struct kobject *kobj,
363 struct kobj_attribute *attr, char *buf)
364{
365 return single_flag_show(kobj, attr, buf,
366 TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
367}
368static ssize_t use_zero_page_store(struct kobject *kobj,
369 struct kobj_attribute *attr, const char *buf, size_t count)
370{
371 return single_flag_store(kobj, attr, buf, count,
372 TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
373}
374static struct kobj_attribute use_zero_page_attr =
375 __ATTR(use_zero_page, 0644, use_zero_page_show, use_zero_page_store);
287#ifdef CONFIG_DEBUG_VM 376#ifdef CONFIG_DEBUG_VM
288static ssize_t debug_cow_show(struct kobject *kobj, 377static ssize_t debug_cow_show(struct kobject *kobj,
289 struct kobj_attribute *attr, char *buf) 378 struct kobj_attribute *attr, char *buf)
@@ -305,6 +394,7 @@ static struct kobj_attribute debug_cow_attr =
305static struct attribute *hugepage_attr[] = { 394static struct attribute *hugepage_attr[] = {
306 &enabled_attr.attr, 395 &enabled_attr.attr,
307 &defrag_attr.attr, 396 &defrag_attr.attr,
397 &use_zero_page_attr.attr,
308#ifdef CONFIG_DEBUG_VM 398#ifdef CONFIG_DEBUG_VM
309 &debug_cow_attr.attr, 399 &debug_cow_attr.attr,
310#endif 400#endif
@@ -484,19 +574,19 @@ static int __init hugepage_init_sysfs(struct kobject **hugepage_kobj)
484 574
485 *hugepage_kobj = kobject_create_and_add("transparent_hugepage", mm_kobj); 575 *hugepage_kobj = kobject_create_and_add("transparent_hugepage", mm_kobj);
486 if (unlikely(!*hugepage_kobj)) { 576 if (unlikely(!*hugepage_kobj)) {
487 printk(KERN_ERR "hugepage: failed kobject create\n"); 577 printk(KERN_ERR "hugepage: failed to create transparent hugepage kobject\n");
488 return -ENOMEM; 578 return -ENOMEM;
489 } 579 }
490 580
491 err = sysfs_create_group(*hugepage_kobj, &hugepage_attr_group); 581 err = sysfs_create_group(*hugepage_kobj, &hugepage_attr_group);
492 if (err) { 582 if (err) {
493 printk(KERN_ERR "hugepage: failed register hugeage group\n"); 583 printk(KERN_ERR "hugepage: failed to register transparent hugepage group\n");
494 goto delete_obj; 584 goto delete_obj;
495 } 585 }
496 586
497 err = sysfs_create_group(*hugepage_kobj, &khugepaged_attr_group); 587 err = sysfs_create_group(*hugepage_kobj, &khugepaged_attr_group);
498 if (err) { 588 if (err) {
499 printk(KERN_ERR "hugepage: failed register hugeage group\n"); 589 printk(KERN_ERR "hugepage: failed to register transparent hugepage group\n");
500 goto remove_hp_group; 590 goto remove_hp_group;
501 } 591 }
502 592
@@ -550,6 +640,8 @@ static int __init hugepage_init(void)
550 goto out; 640 goto out;
551 } 641 }
552 642
643 register_shrinker(&huge_zero_page_shrinker);
644
553 /* 645 /*
554 * By default disable transparent hugepages on smaller systems, 646 * By default disable transparent hugepages on smaller systems,
555 * where the extra memory used could hurt more than TLB overhead 647 * where the extra memory used could hurt more than TLB overhead
@@ -599,13 +691,22 @@ out:
599} 691}
600__setup("transparent_hugepage=", setup_transparent_hugepage); 692__setup("transparent_hugepage=", setup_transparent_hugepage);
601 693
602static inline pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma) 694pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
603{ 695{
604 if (likely(vma->vm_flags & VM_WRITE)) 696 if (likely(vma->vm_flags & VM_WRITE))
605 pmd = pmd_mkwrite(pmd); 697 pmd = pmd_mkwrite(pmd);
606 return pmd; 698 return pmd;
607} 699}
608 700
701static inline pmd_t mk_huge_pmd(struct page *page, struct vm_area_struct *vma)
702{
703 pmd_t entry;
704 entry = mk_pmd(page, vma->vm_page_prot);
705 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
706 entry = pmd_mkhuge(entry);
707 return entry;
708}
709
609static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, 710static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
610 struct vm_area_struct *vma, 711 struct vm_area_struct *vma,
611 unsigned long haddr, pmd_t *pmd, 712 unsigned long haddr, pmd_t *pmd,
@@ -629,9 +730,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
629 pte_free(mm, pgtable); 730 pte_free(mm, pgtable);
630 } else { 731 } else {
631 pmd_t entry; 732 pmd_t entry;
632 entry = mk_pmd(page, vma->vm_page_prot); 733 entry = mk_huge_pmd(page, vma);
633 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
634 entry = pmd_mkhuge(entry);
635 /* 734 /*
636 * The spinlocking to take the lru_lock inside 735 * The spinlocking to take the lru_lock inside
637 * page_add_new_anon_rmap() acts as a full memory 736 * page_add_new_anon_rmap() acts as a full memory
@@ -671,6 +770,22 @@ static inline struct page *alloc_hugepage(int defrag)
671} 770}
672#endif 771#endif
673 772
773static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm,
774 struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd,
775 unsigned long zero_pfn)
776{
777 pmd_t entry;
778 if (!pmd_none(*pmd))
779 return false;
780 entry = pfn_pmd(zero_pfn, vma->vm_page_prot);
781 entry = pmd_wrprotect(entry);
782 entry = pmd_mkhuge(entry);
783 set_pmd_at(mm, haddr, pmd, entry);
784 pgtable_trans_huge_deposit(mm, pgtable);
785 mm->nr_ptes++;
786 return true;
787}
788
674int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, 789int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
675 unsigned long address, pmd_t *pmd, 790 unsigned long address, pmd_t *pmd,
676 unsigned int flags) 791 unsigned int flags)
@@ -684,6 +799,30 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
684 return VM_FAULT_OOM; 799 return VM_FAULT_OOM;
685 if (unlikely(khugepaged_enter(vma))) 800 if (unlikely(khugepaged_enter(vma)))
686 return VM_FAULT_OOM; 801 return VM_FAULT_OOM;
802 if (!(flags & FAULT_FLAG_WRITE) &&
803 transparent_hugepage_use_zero_page()) {
804 pgtable_t pgtable;
805 unsigned long zero_pfn;
806 bool set;
807 pgtable = pte_alloc_one(mm, haddr);
808 if (unlikely(!pgtable))
809 return VM_FAULT_OOM;
810 zero_pfn = get_huge_zero_page();
811 if (unlikely(!zero_pfn)) {
812 pte_free(mm, pgtable);
813 count_vm_event(THP_FAULT_FALLBACK);
814 goto out;
815 }
816 spin_lock(&mm->page_table_lock);
817 set = set_huge_zero_page(pgtable, mm, vma, haddr, pmd,
818 zero_pfn);
819 spin_unlock(&mm->page_table_lock);
820 if (!set) {
821 pte_free(mm, pgtable);
822 put_huge_zero_page();
823 }
824 return 0;
825 }
687 page = alloc_hugepage_vma(transparent_hugepage_defrag(vma), 826 page = alloc_hugepage_vma(transparent_hugepage_defrag(vma),
688 vma, haddr, numa_node_id(), 0); 827 vma, haddr, numa_node_id(), 0);
689 if (unlikely(!page)) { 828 if (unlikely(!page)) {
@@ -710,7 +849,8 @@ out:
710 * run pte_offset_map on the pmd, if an huge pmd could 849 * run pte_offset_map on the pmd, if an huge pmd could
711 * materialize from under us from a different thread. 850 * materialize from under us from a different thread.
712 */ 851 */
713 if (unlikely(__pte_alloc(mm, vma, pmd, address))) 852 if (unlikely(pmd_none(*pmd)) &&
853 unlikely(__pte_alloc(mm, vma, pmd, address)))
714 return VM_FAULT_OOM; 854 return VM_FAULT_OOM;
715 /* if an huge pmd materialized from under us just retry later */ 855 /* if an huge pmd materialized from under us just retry later */
716 if (unlikely(pmd_trans_huge(*pmd))) 856 if (unlikely(pmd_trans_huge(*pmd)))
@@ -748,6 +888,26 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
748 pte_free(dst_mm, pgtable); 888 pte_free(dst_mm, pgtable);
749 goto out_unlock; 889 goto out_unlock;
750 } 890 }
891 /*
892 * mm->page_table_lock is enough to be sure that the huge zero pmd is not
893 * under splitting, since we never split the zero page itself, only the
894 * pmd into a page table.
895 */
896 if (is_huge_zero_pmd(pmd)) {
897 unsigned long zero_pfn;
898 bool set;
899 /*
900 * get_huge_zero_page() will never allocate a new page here,
901 * since we already have a zero page to copy. It just takes a
902 * reference.
903 */
904 zero_pfn = get_huge_zero_page();
905 set = set_huge_zero_page(pgtable, dst_mm, vma, addr, dst_pmd,
906 zero_pfn);
907 BUG_ON(!set); /* unexpected !pmd_none(dst_pmd) */
908 ret = 0;
909 goto out_unlock;
910 }
751 if (unlikely(pmd_trans_splitting(pmd))) { 911 if (unlikely(pmd_trans_splitting(pmd))) {
752 /* split huge page running from under us */ 912 /* split huge page running from under us */
753 spin_unlock(&src_mm->page_table_lock); 913 spin_unlock(&src_mm->page_table_lock);
@@ -777,6 +937,102 @@ out:
777 return ret; 937 return ret;
778} 938}
779 939
940void huge_pmd_set_accessed(struct mm_struct *mm,
941 struct vm_area_struct *vma,
942 unsigned long address,
943 pmd_t *pmd, pmd_t orig_pmd,
944 int dirty)
945{
946 pmd_t entry;
947 unsigned long haddr;
948
949 spin_lock(&mm->page_table_lock);
950 if (unlikely(!pmd_same(*pmd, orig_pmd)))
951 goto unlock;
952
953 entry = pmd_mkyoung(orig_pmd);
954 haddr = address & HPAGE_PMD_MASK;
955 if (pmdp_set_access_flags(vma, haddr, pmd, entry, dirty))
956 update_mmu_cache_pmd(vma, address, pmd);
957
958unlock:
959 spin_unlock(&mm->page_table_lock);
960}
961
962static int do_huge_pmd_wp_zero_page_fallback(struct mm_struct *mm,
963 struct vm_area_struct *vma, unsigned long address,
964 pmd_t *pmd, pmd_t orig_pmd, unsigned long haddr)
965{
966 pgtable_t pgtable;
967 pmd_t _pmd;
968 struct page *page;
969 int i, ret = 0;
970 unsigned long mmun_start; /* For mmu_notifiers */
971 unsigned long mmun_end; /* For mmu_notifiers */
972
973 page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
974 if (!page) {
975 ret |= VM_FAULT_OOM;
976 goto out;
977 }
978
979 if (mem_cgroup_newpage_charge(page, mm, GFP_KERNEL)) {
980 put_page(page);
981 ret |= VM_FAULT_OOM;
982 goto out;
983 }
984
985 clear_user_highpage(page, address);
986 __SetPageUptodate(page);
987
988 mmun_start = haddr;
989 mmun_end = haddr + HPAGE_PMD_SIZE;
990 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
991
992 spin_lock(&mm->page_table_lock);
993 if (unlikely(!pmd_same(*pmd, orig_pmd)))
994 goto out_free_page;
995
996 pmdp_clear_flush(vma, haddr, pmd);
997 /* leave pmd empty until pte is filled */
998
999 pgtable = pgtable_trans_huge_withdraw(mm);
1000 pmd_populate(mm, &_pmd, pgtable);
1001
1002 for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
1003 pte_t *pte, entry;
1004 if (haddr == (address & PAGE_MASK)) {
1005 entry = mk_pte(page, vma->vm_page_prot);
1006 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
1007 page_add_new_anon_rmap(page, vma, haddr);
1008 } else {
1009 entry = pfn_pte(my_zero_pfn(haddr), vma->vm_page_prot);
1010 entry = pte_mkspecial(entry);
1011 }
1012 pte = pte_offset_map(&_pmd, haddr);
1013 VM_BUG_ON(!pte_none(*pte));
1014 set_pte_at(mm, haddr, pte, entry);
1015 pte_unmap(pte);
1016 }
1017 smp_wmb(); /* make pte visible before pmd */
1018 pmd_populate(mm, pmd, pgtable);
1019 spin_unlock(&mm->page_table_lock);
1020 put_huge_zero_page();
1021 inc_mm_counter(mm, MM_ANONPAGES);
1022
1023 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
1024
1025 ret |= VM_FAULT_WRITE;
1026out:
1027 return ret;
1028out_free_page:
1029 spin_unlock(&mm->page_table_lock);
1030 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
1031 mem_cgroup_uncharge_page(page);
1032 put_page(page);
1033 goto out;
1034}
1035
780static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, 1036static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
781 struct vm_area_struct *vma, 1037 struct vm_area_struct *vma,
782 unsigned long address, 1038 unsigned long address,
@@ -883,19 +1139,21 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
883 unsigned long address, pmd_t *pmd, pmd_t orig_pmd) 1139 unsigned long address, pmd_t *pmd, pmd_t orig_pmd)
884{ 1140{
885 int ret = 0; 1141 int ret = 0;
886 struct page *page, *new_page; 1142 struct page *page = NULL, *new_page;
887 unsigned long haddr; 1143 unsigned long haddr;
888 unsigned long mmun_start; /* For mmu_notifiers */ 1144 unsigned long mmun_start; /* For mmu_notifiers */
889 unsigned long mmun_end; /* For mmu_notifiers */ 1145 unsigned long mmun_end; /* For mmu_notifiers */
890 1146
891 VM_BUG_ON(!vma->anon_vma); 1147 VM_BUG_ON(!vma->anon_vma);
1148 haddr = address & HPAGE_PMD_MASK;
1149 if (is_huge_zero_pmd(orig_pmd))
1150 goto alloc;
892 spin_lock(&mm->page_table_lock); 1151 spin_lock(&mm->page_table_lock);
893 if (unlikely(!pmd_same(*pmd, orig_pmd))) 1152 if (unlikely(!pmd_same(*pmd, orig_pmd)))
894 goto out_unlock; 1153 goto out_unlock;
895 1154
896 page = pmd_page(orig_pmd); 1155 page = pmd_page(orig_pmd);
897 VM_BUG_ON(!PageCompound(page) || !PageHead(page)); 1156 VM_BUG_ON(!PageCompound(page) || !PageHead(page));
898 haddr = address & HPAGE_PMD_MASK;
899 if (page_mapcount(page) == 1) { 1157 if (page_mapcount(page) == 1) {
900 pmd_t entry; 1158 pmd_t entry;
901 entry = pmd_mkyoung(orig_pmd); 1159 entry = pmd_mkyoung(orig_pmd);
@@ -907,7 +1165,7 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
907 } 1165 }
908 get_page(page); 1166 get_page(page);
909 spin_unlock(&mm->page_table_lock); 1167 spin_unlock(&mm->page_table_lock);
910 1168alloc:
911 if (transparent_hugepage_enabled(vma) && 1169 if (transparent_hugepage_enabled(vma) &&
912 !transparent_hugepage_debug_cow()) 1170 !transparent_hugepage_debug_cow())
913 new_page = alloc_hugepage_vma(transparent_hugepage_defrag(vma), 1171 new_page = alloc_hugepage_vma(transparent_hugepage_defrag(vma),
@@ -917,24 +1175,34 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
917 1175
918 if (unlikely(!new_page)) { 1176 if (unlikely(!new_page)) {
919 count_vm_event(THP_FAULT_FALLBACK); 1177 count_vm_event(THP_FAULT_FALLBACK);
920 ret = do_huge_pmd_wp_page_fallback(mm, vma, address, 1178 if (is_huge_zero_pmd(orig_pmd)) {
921 pmd, orig_pmd, page, haddr); 1179 ret = do_huge_pmd_wp_zero_page_fallback(mm, vma,
922 if (ret & VM_FAULT_OOM) 1180 address, pmd, orig_pmd, haddr);
923 split_huge_page(page); 1181 } else {
924 put_page(page); 1182 ret = do_huge_pmd_wp_page_fallback(mm, vma, address,
1183 pmd, orig_pmd, page, haddr);
1184 if (ret & VM_FAULT_OOM)
1185 split_huge_page(page);
1186 put_page(page);
1187 }
925 goto out; 1188 goto out;
926 } 1189 }
927 count_vm_event(THP_FAULT_ALLOC); 1190 count_vm_event(THP_FAULT_ALLOC);
928 1191
929 if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) { 1192 if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) {
930 put_page(new_page); 1193 put_page(new_page);
931 split_huge_page(page); 1194 if (page) {
932 put_page(page); 1195 split_huge_page(page);
1196 put_page(page);
1197 }
933 ret |= VM_FAULT_OOM; 1198 ret |= VM_FAULT_OOM;
934 goto out; 1199 goto out;
935 } 1200 }
936 1201
937 copy_user_huge_page(new_page, page, haddr, vma, HPAGE_PMD_NR); 1202 if (is_huge_zero_pmd(orig_pmd))
1203 clear_huge_page(new_page, haddr, HPAGE_PMD_NR);
1204 else
1205 copy_user_huge_page(new_page, page, haddr, vma, HPAGE_PMD_NR);
938 __SetPageUptodate(new_page); 1206 __SetPageUptodate(new_page);
939 1207
940 mmun_start = haddr; 1208 mmun_start = haddr;
@@ -942,7 +1210,8 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
942 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); 1210 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
943 1211
944 spin_lock(&mm->page_table_lock); 1212 spin_lock(&mm->page_table_lock);
945 put_page(page); 1213 if (page)
1214 put_page(page);
946 if (unlikely(!pmd_same(*pmd, orig_pmd))) { 1215 if (unlikely(!pmd_same(*pmd, orig_pmd))) {
947 spin_unlock(&mm->page_table_lock); 1216 spin_unlock(&mm->page_table_lock);
948 mem_cgroup_uncharge_page(new_page); 1217 mem_cgroup_uncharge_page(new_page);
@@ -950,16 +1219,19 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
950 goto out_mn; 1219 goto out_mn;
951 } else { 1220 } else {
952 pmd_t entry; 1221 pmd_t entry;
953 VM_BUG_ON(!PageHead(page)); 1222 entry = mk_huge_pmd(new_page, vma);
954 entry = mk_pmd(new_page, vma->vm_page_prot);
955 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
956 entry = pmd_mkhuge(entry);
957 pmdp_clear_flush(vma, haddr, pmd); 1223 pmdp_clear_flush(vma, haddr, pmd);
958 page_add_new_anon_rmap(new_page, vma, haddr); 1224 page_add_new_anon_rmap(new_page, vma, haddr);
959 set_pmd_at(mm, haddr, pmd, entry); 1225 set_pmd_at(mm, haddr, pmd, entry);
960 update_mmu_cache_pmd(vma, address, pmd); 1226 update_mmu_cache_pmd(vma, address, pmd);
961 page_remove_rmap(page); 1227 if (is_huge_zero_pmd(orig_pmd)) {
962 put_page(page); 1228 add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR);
1229 put_huge_zero_page();
1230 } else {
1231 VM_BUG_ON(!PageHead(page));
1232 page_remove_rmap(page);
1233 put_page(page);
1234 }
963 ret |= VM_FAULT_WRITE; 1235 ret |= VM_FAULT_WRITE;
964 } 1236 }
965 spin_unlock(&mm->page_table_lock); 1237 spin_unlock(&mm->page_table_lock);
@@ -985,6 +1257,10 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
985 if (flags & FOLL_WRITE && !pmd_write(*pmd)) 1257 if (flags & FOLL_WRITE && !pmd_write(*pmd))
986 goto out; 1258 goto out;
987 1259
1260 /* Avoid dumping huge zero page */
1261 if ((flags & FOLL_DUMP) && is_huge_zero_pmd(*pmd))
1262 return ERR_PTR(-EFAULT);
1263
988 page = pmd_page(*pmd); 1264 page = pmd_page(*pmd);
989 VM_BUG_ON(!PageHead(page)); 1265 VM_BUG_ON(!PageHead(page));
990 if (flags & FOLL_TOUCH) { 1266 if (flags & FOLL_TOUCH) {
@@ -1017,6 +1293,81 @@ out:
1017 return page; 1293 return page;
1018} 1294}
1019 1295
1296/* NUMA hinting page fault entry point for trans huge pmds */
1297int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
1298 unsigned long addr, pmd_t pmd, pmd_t *pmdp)
1299{
1300 struct page *page;
1301 unsigned long haddr = addr & HPAGE_PMD_MASK;
1302 int target_nid;
1303 int current_nid = -1;
1304 bool migrated;
1305 bool page_locked = false;
1306
1307 spin_lock(&mm->page_table_lock);
1308 if (unlikely(!pmd_same(pmd, *pmdp)))
1309 goto out_unlock;
1310
1311 page = pmd_page(pmd);
1312 get_page(page);
1313 current_nid = page_to_nid(page);
1314 count_vm_numa_event(NUMA_HINT_FAULTS);
1315 if (current_nid == numa_node_id())
1316 count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL);
1317
1318 target_nid = mpol_misplaced(page, vma, haddr);
1319 if (target_nid == -1) {
1320 put_page(page);
1321 goto clear_pmdnuma;
1322 }
1323
1324 /* Acquire the page lock to serialise THP migrations */
1325 spin_unlock(&mm->page_table_lock);
1326 lock_page(page);
1327 page_locked = true;
1328
1329 /* Confirm the PMD did not change while the page lock was taken */
1330 spin_lock(&mm->page_table_lock);
1331 if (unlikely(!pmd_same(pmd, *pmdp))) {
1332 unlock_page(page);
1333 put_page(page);
1334 goto out_unlock;
1335 }
1336 spin_unlock(&mm->page_table_lock);
1337
1338 /* Migrate the THP to the requested node */
1339 migrated = migrate_misplaced_transhuge_page(mm, vma,
1340 pmdp, pmd, addr,
1341 page, target_nid);
1342 if (migrated)
1343 current_nid = target_nid;
1344 else {
1345 spin_lock(&mm->page_table_lock);
1346 if (unlikely(!pmd_same(pmd, *pmdp))) {
1347 unlock_page(page);
1348 goto out_unlock;
1349 }
1350 goto clear_pmdnuma;
1351 }
1352
1353 task_numa_fault(current_nid, HPAGE_PMD_NR, migrated);
1354 return 0;
1355
1356clear_pmdnuma:
1357 pmd = pmd_mknonnuma(pmd);
1358 set_pmd_at(mm, haddr, pmdp, pmd);
1359 VM_BUG_ON(pmd_numa(*pmdp));
1360 update_mmu_cache_pmd(vma, addr, pmdp);
1361 if (page_locked)
1362 unlock_page(page);
1363
1364out_unlock:
1365 spin_unlock(&mm->page_table_lock);
1366 if (current_nid != -1)
1367 task_numa_fault(current_nid, HPAGE_PMD_NR, migrated);
1368 return 0;
1369}
1370
1020int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, 1371int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
1021 pmd_t *pmd, unsigned long addr) 1372 pmd_t *pmd, unsigned long addr)
1022{ 1373{
@@ -1028,15 +1379,21 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
1028 pmd_t orig_pmd; 1379 pmd_t orig_pmd;
1029 pgtable = pgtable_trans_huge_withdraw(tlb->mm); 1380 pgtable = pgtable_trans_huge_withdraw(tlb->mm);
1030 orig_pmd = pmdp_get_and_clear(tlb->mm, addr, pmd); 1381 orig_pmd = pmdp_get_and_clear(tlb->mm, addr, pmd);
1031 page = pmd_page(orig_pmd);
1032 tlb_remove_pmd_tlb_entry(tlb, pmd, addr); 1382 tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
1033 page_remove_rmap(page); 1383 if (is_huge_zero_pmd(orig_pmd)) {
1034 VM_BUG_ON(page_mapcount(page) < 0); 1384 tlb->mm->nr_ptes--;
1035 add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR); 1385 spin_unlock(&tlb->mm->page_table_lock);
1036 VM_BUG_ON(!PageHead(page)); 1386 put_huge_zero_page();
1037 tlb->mm->nr_ptes--; 1387 } else {
1038 spin_unlock(&tlb->mm->page_table_lock); 1388 page = pmd_page(orig_pmd);
1039 tlb_remove_page(tlb, page); 1389 page_remove_rmap(page);
1390 VM_BUG_ON(page_mapcount(page) < 0);
1391 add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
1392 VM_BUG_ON(!PageHead(page));
1393 tlb->mm->nr_ptes--;
1394 spin_unlock(&tlb->mm->page_table_lock);
1395 tlb_remove_page(tlb, page);
1396 }
1040 pte_free(tlb->mm, pgtable); 1397 pte_free(tlb->mm, pgtable);
1041 ret = 1; 1398 ret = 1;
1042 } 1399 }
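
In the reworked zap path above, only a real transparent huge page adjusts the MM_ANONPAGES counter by HPAGE_PMD_NR; tearing down a huge zero pmd just drops a reference, because the zero page was never charged to the mm in the first place. A quick standalone sketch of the counter arithmetic, assuming the common x86-64 configuration of 4KB base pages and 2MB PMD-sized huge pages (assumed values, not taken from this diff):

    #include <stdio.h>

    int main(void)
    {
        unsigned long page_size = 4096;            /* assumed base page size */
        unsigned long hpage_pmd_size = 2UL << 20;  /* assumed 2MB huge page */
        unsigned long hpage_pmd_nr = hpage_pmd_size / page_size;

        printf("HPAGE_PMD_NR = %lu\n", hpage_pmd_nr);                  /* 512 */
        printf("MM_ANONPAGES delta for a THP zap:      -%lu\n", hpage_pmd_nr);
        printf("MM_ANONPAGES delta for a zero pmd zap:  0\n");
        return 0;
    }
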
@@ -1099,7 +1456,7 @@ out:
1099} 1456}
1100 1457
1101int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, 1458int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
1102 unsigned long addr, pgprot_t newprot) 1459 unsigned long addr, pgprot_t newprot, int prot_numa)
1103{ 1460{
1104 struct mm_struct *mm = vma->vm_mm; 1461 struct mm_struct *mm = vma->vm_mm;
1105 int ret = 0; 1462 int ret = 0;
@@ -1107,7 +1464,18 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
1107 if (__pmd_trans_huge_lock(pmd, vma) == 1) { 1464 if (__pmd_trans_huge_lock(pmd, vma) == 1) {
1108 pmd_t entry; 1465 pmd_t entry;
1109 entry = pmdp_get_and_clear(mm, addr, pmd); 1466 entry = pmdp_get_and_clear(mm, addr, pmd);
1110 entry = pmd_modify(entry, newprot); 1467 if (!prot_numa) {
1468 entry = pmd_modify(entry, newprot);
1469 BUG_ON(pmd_write(entry));
1470 } else {
1471 struct page *page = pmd_page(*pmd);
1472
1473 /* only check non-shared pages */
1474 if (page_mapcount(page) == 1 &&
1475 !pmd_numa(*pmd)) {
1476 entry = pmd_mknuma(entry);
1477 }
1478 }
1111 set_pmd_at(mm, addr, pmd, entry); 1479 set_pmd_at(mm, addr, pmd, entry);
1112 spin_unlock(&vma->vm_mm->page_table_lock); 1480 spin_unlock(&vma->vm_mm->page_table_lock);
1113 ret = 1; 1481 ret = 1;
@@ -1146,22 +1514,14 @@ pmd_t *page_check_address_pmd(struct page *page,
1146 unsigned long address, 1514 unsigned long address,
1147 enum page_check_address_pmd_flag flag) 1515 enum page_check_address_pmd_flag flag)
1148{ 1516{
1149 pgd_t *pgd;
1150 pud_t *pud;
1151 pmd_t *pmd, *ret = NULL; 1517 pmd_t *pmd, *ret = NULL;
1152 1518
1153 if (address & ~HPAGE_PMD_MASK) 1519 if (address & ~HPAGE_PMD_MASK)
1154 goto out; 1520 goto out;
1155 1521
1156 pgd = pgd_offset(mm, address); 1522 pmd = mm_find_pmd(mm, address);
1157 if (!pgd_present(*pgd)) 1523 if (!pmd)
1158 goto out;
1159
1160 pud = pud_offset(pgd, address);
1161 if (!pud_present(*pud))
1162 goto out; 1524 goto out;
1163
1164 pmd = pmd_offset(pud, address);
1165 if (pmd_none(*pmd)) 1525 if (pmd_none(*pmd))
1166 goto out; 1526 goto out;
1167 if (pmd_page(*pmd) != page) 1527 if (pmd_page(*pmd) != page)
@@ -1205,7 +1565,7 @@ static int __split_huge_page_splitting(struct page *page,
1205 * We can't temporarily set the pmd to null in order 1565 * We can't temporarily set the pmd to null in order
1206 * to split it, the pmd must remain marked huge at all 1566 * to split it, the pmd must remain marked huge at all
1207 * times or the VM won't take the pmd_trans_huge paths 1567 * times or the VM won't take the pmd_trans_huge paths
1208 * and it won't wait on the anon_vma->root->mutex to 1568 * and it won't wait on the anon_vma->root->rwsem to
1209 * serialize against split_huge_page*. 1569 * serialize against split_huge_page*.
1210 */ 1570 */
1211 pmdp_splitting_flush(vma, address, pmd); 1571 pmdp_splitting_flush(vma, address, pmd);
@@ -1296,6 +1656,7 @@ static void __split_huge_page_refcount(struct page *page)
1296 page_tail->mapping = page->mapping; 1656 page_tail->mapping = page->mapping;
1297 1657
1298 page_tail->index = page->index + i; 1658 page_tail->index = page->index + i;
1659 page_xchg_last_nid(page_tail, page_last_nid(page));
1299 1660
1300 BUG_ON(!PageAnon(page_tail)); 1661 BUG_ON(!PageAnon(page_tail));
1301 BUG_ON(!PageUptodate(page_tail)); 1662 BUG_ON(!PageUptodate(page_tail));
@@ -1363,6 +1724,8 @@ static int __split_huge_page_map(struct page *page,
1363 BUG_ON(page_mapcount(page) != 1); 1724 BUG_ON(page_mapcount(page) != 1);
1364 if (!pmd_young(*pmd)) 1725 if (!pmd_young(*pmd))
1365 entry = pte_mkold(entry); 1726 entry = pte_mkold(entry);
1727 if (pmd_numa(*pmd))
1728 entry = pte_mknuma(entry);
1366 pte = pte_offset_map(&_pmd, haddr); 1729 pte = pte_offset_map(&_pmd, haddr);
1367 BUG_ON(!pte_none(*pte)); 1730 BUG_ON(!pte_none(*pte));
1368 set_pte_at(mm, haddr, pte, entry); 1731 set_pte_at(mm, haddr, pte, entry);
@@ -1405,7 +1768,7 @@ static int __split_huge_page_map(struct page *page,
1405 return ret; 1768 return ret;
1406} 1769}
1407 1770
1408/* must be called with anon_vma->root->mutex hold */ 1771/* must be called with anon_vma->root->rwsem held */
1409static void __split_huge_page(struct page *page, 1772static void __split_huge_page(struct page *page,
1410 struct anon_vma *anon_vma) 1773 struct anon_vma *anon_vma)
1411{ 1774{
@@ -1458,10 +1821,21 @@ int split_huge_page(struct page *page)
1458 struct anon_vma *anon_vma; 1821 struct anon_vma *anon_vma;
1459 int ret = 1; 1822 int ret = 1;
1460 1823
1824 BUG_ON(is_huge_zero_pfn(page_to_pfn(page)));
1461 BUG_ON(!PageAnon(page)); 1825 BUG_ON(!PageAnon(page));
1462 anon_vma = page_lock_anon_vma(page); 1826
1827 /*
1828 * The caller does not necessarily hold an mmap_sem that would prevent
1829 * the anon_vma disappearing so we first we take a reference to it
1830 * and then lock the anon_vma for write. This is similar to
1831 * page_lock_anon_vma_read except the write lock is taken to serialise
1832 * against parallel split or collapse operations.
1833 */
1834 anon_vma = page_get_anon_vma(page);
1463 if (!anon_vma) 1835 if (!anon_vma)
1464 goto out; 1836 goto out;
1837 anon_vma_lock_write(anon_vma);
1838
1465 ret = 0; 1839 ret = 0;
1466 if (!PageCompound(page)) 1840 if (!PageCompound(page))
1467 goto out_unlock; 1841 goto out_unlock;
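
The comment above spells out why split_huge_page() now pins the anon_vma with page_get_anon_vma() before taking its write lock: without mmap_sem held, the anon_vma could be freed between lookup and locking. The sketch below is a generic userspace analogy of that "take a reference, then block on the lock" pattern, using pthreads and a C11 atomic refcount; the names and structure are illustrative only and do not correspond to any kernel API:

    #include <pthread.h>
    #include <stdatomic.h>
    #include <stdlib.h>

    struct obj {
        atomic_int refcount;    /* keeps the object alive */
        pthread_mutex_t lock;   /* serialises writers, like the anon_vma rwsem */
    };

    /* Pin the object first so it cannot vanish under us ... */
    static struct obj *obj_get(struct obj *o)
    {
        atomic_fetch_add(&o->refcount, 1);
        return o;
    }

    /* ... only then is it safe to sleep waiting for its lock. */
    static void obj_lock(struct obj *o)
    {
        pthread_mutex_lock(&o->lock);
    }

    static void obj_unlock_and_put(struct obj *o)
    {
        pthread_mutex_unlock(&o->lock);
        if (atomic_fetch_sub(&o->refcount, 1) == 1)
            free(o);            /* last reference is gone */
    }

    int main(void)
    {
        struct obj *o = calloc(1, sizeof(*o));
        atomic_init(&o->refcount, 1);
        pthread_mutex_init(&o->lock, NULL);

        obj_get(o);             /* reference first ... */
        obj_lock(o);            /* ... lock second */
        obj_unlock_and_put(o);  /* unlock, then drop that reference */

        if (atomic_fetch_sub(&o->refcount, 1) == 1) {
            pthread_mutex_destroy(&o->lock);
            free(o);            /* drop the initial reference */
        }
        return 0;
    }
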
@@ -1472,7 +1846,8 @@ int split_huge_page(struct page *page)
1472 1846
1473 BUG_ON(PageCompound(page)); 1847 BUG_ON(PageCompound(page));
1474out_unlock: 1848out_unlock:
1475 page_unlock_anon_vma(anon_vma); 1849 anon_vma_unlock(anon_vma);
1850 put_anon_vma(anon_vma);
1476out: 1851out:
1477 return ret; 1852 return ret;
1478} 1853}
@@ -1701,64 +2076,49 @@ static void release_pte_pages(pte_t *pte, pte_t *_pte)
1701 } 2076 }
1702} 2077}
1703 2078
1704static void release_all_pte_pages(pte_t *pte)
1705{
1706 release_pte_pages(pte, pte + HPAGE_PMD_NR);
1707}
1708
1709static int __collapse_huge_page_isolate(struct vm_area_struct *vma, 2079static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
1710 unsigned long address, 2080 unsigned long address,
1711 pte_t *pte) 2081 pte_t *pte)
1712{ 2082{
1713 struct page *page; 2083 struct page *page;
1714 pte_t *_pte; 2084 pte_t *_pte;
1715 int referenced = 0, isolated = 0, none = 0; 2085 int referenced = 0, none = 0;
1716 for (_pte = pte; _pte < pte+HPAGE_PMD_NR; 2086 for (_pte = pte; _pte < pte+HPAGE_PMD_NR;
1717 _pte++, address += PAGE_SIZE) { 2087 _pte++, address += PAGE_SIZE) {
1718 pte_t pteval = *_pte; 2088 pte_t pteval = *_pte;
1719 if (pte_none(pteval)) { 2089 if (pte_none(pteval)) {
1720 if (++none <= khugepaged_max_ptes_none) 2090 if (++none <= khugepaged_max_ptes_none)
1721 continue; 2091 continue;
1722 else { 2092 else
1723 release_pte_pages(pte, _pte);
1724 goto out; 2093 goto out;
1725 }
1726 } 2094 }
1727 if (!pte_present(pteval) || !pte_write(pteval)) { 2095 if (!pte_present(pteval) || !pte_write(pteval))
1728 release_pte_pages(pte, _pte);
1729 goto out; 2096 goto out;
1730 }
1731 page = vm_normal_page(vma, address, pteval); 2097 page = vm_normal_page(vma, address, pteval);
1732 if (unlikely(!page)) { 2098 if (unlikely(!page))
1733 release_pte_pages(pte, _pte);
1734 goto out; 2099 goto out;
1735 } 2100
1736 VM_BUG_ON(PageCompound(page)); 2101 VM_BUG_ON(PageCompound(page));
1737 BUG_ON(!PageAnon(page)); 2102 BUG_ON(!PageAnon(page));
1738 VM_BUG_ON(!PageSwapBacked(page)); 2103 VM_BUG_ON(!PageSwapBacked(page));
1739 2104
1740 /* cannot use mapcount: can't collapse if there's a gup pin */ 2105 /* cannot use mapcount: can't collapse if there's a gup pin */
1741 if (page_count(page) != 1) { 2106 if (page_count(page) != 1)
1742 release_pte_pages(pte, _pte);
1743 goto out; 2107 goto out;
1744 }
1745 /* 2108 /*
1746 * We can do it before isolate_lru_page because the 2109 * We can do it before isolate_lru_page because the
1747 * page can't be freed from under us. NOTE: PG_lock 2110 * page can't be freed from under us. NOTE: PG_lock
1748 * is needed to serialize against split_huge_page 2111 * is needed to serialize against split_huge_page
1749 * when invoked from the VM. 2112 * when invoked from the VM.
1750 */ 2113 */
1751 if (!trylock_page(page)) { 2114 if (!trylock_page(page))
1752 release_pte_pages(pte, _pte);
1753 goto out; 2115 goto out;
1754 }
1755 /* 2116 /*
1756 * Isolate the page to avoid collapsing an hugepage 2117 * Isolate the page to avoid collapsing an hugepage
1757 * currently in use by the VM. 2118 * currently in use by the VM.
1758 */ 2119 */
1759 if (isolate_lru_page(page)) { 2120 if (isolate_lru_page(page)) {
1760 unlock_page(page); 2121 unlock_page(page);
1761 release_pte_pages(pte, _pte);
1762 goto out; 2122 goto out;
1763 } 2123 }
1764 /* 0 stands for page_is_file_cache(page) == false */ 2124 /* 0 stands for page_is_file_cache(page) == false */
@@ -1771,12 +2131,11 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
1771 mmu_notifier_test_young(vma->vm_mm, address)) 2131 mmu_notifier_test_young(vma->vm_mm, address))
1772 referenced = 1; 2132 referenced = 1;
1773 } 2133 }
1774 if (unlikely(!referenced)) 2134 if (likely(referenced))
1775 release_all_pte_pages(pte); 2135 return 1;
1776 else
1777 isolated = 1;
1778out: 2136out:
1779 return isolated; 2137 release_pte_pages(pte, _pte);
2138 return 0;
1780} 2139}
1781 2140
1782static void __collapse_huge_page_copy(pte_t *pte, struct page *page, 2141static void __collapse_huge_page_copy(pte_t *pte, struct page *page,
@@ -1918,14 +2277,26 @@ static struct page
1918} 2277}
1919#endif 2278#endif
1920 2279
2280static bool hugepage_vma_check(struct vm_area_struct *vma)
2281{
2282 if ((!(vma->vm_flags & VM_HUGEPAGE) && !khugepaged_always()) ||
2283 (vma->vm_flags & VM_NOHUGEPAGE))
2284 return false;
2285
2286 if (!vma->anon_vma || vma->vm_ops)
2287 return false;
2288 if (is_vma_temporary_stack(vma))
2289 return false;
2290 VM_BUG_ON(vma->vm_flags & VM_NO_THP);
2291 return true;
2292}
2293
1921static void collapse_huge_page(struct mm_struct *mm, 2294static void collapse_huge_page(struct mm_struct *mm,
1922 unsigned long address, 2295 unsigned long address,
1923 struct page **hpage, 2296 struct page **hpage,
1924 struct vm_area_struct *vma, 2297 struct vm_area_struct *vma,
1925 int node) 2298 int node)
1926{ 2299{
1927 pgd_t *pgd;
1928 pud_t *pud;
1929 pmd_t *pmd, _pmd; 2300 pmd_t *pmd, _pmd;
1930 pte_t *pte; 2301 pte_t *pte;
1931 pgtable_t pgtable; 2302 pgtable_t pgtable;
@@ -1960,31 +2331,15 @@ static void collapse_huge_page(struct mm_struct *mm,
1960 hend = vma->vm_end & HPAGE_PMD_MASK; 2331 hend = vma->vm_end & HPAGE_PMD_MASK;
1961 if (address < hstart || address + HPAGE_PMD_SIZE > hend) 2332 if (address < hstart || address + HPAGE_PMD_SIZE > hend)
1962 goto out; 2333 goto out;
1963 2334 if (!hugepage_vma_check(vma))
1964 if ((!(vma->vm_flags & VM_HUGEPAGE) && !khugepaged_always()) ||
1965 (vma->vm_flags & VM_NOHUGEPAGE))
1966 goto out;
1967
1968 if (!vma->anon_vma || vma->vm_ops)
1969 goto out;
1970 if (is_vma_temporary_stack(vma))
1971 goto out; 2335 goto out;
1972 VM_BUG_ON(vma->vm_flags & VM_NO_THP); 2336 pmd = mm_find_pmd(mm, address);
1973 2337 if (!pmd)
1974 pgd = pgd_offset(mm, address);
1975 if (!pgd_present(*pgd))
1976 goto out;
1977
1978 pud = pud_offset(pgd, address);
1979 if (!pud_present(*pud))
1980 goto out; 2338 goto out;
1981 2339 if (pmd_trans_huge(*pmd))
1982 pmd = pmd_offset(pud, address);
1983 /* pmd can't go away or become huge under us */
1984 if (!pmd_present(*pmd) || pmd_trans_huge(*pmd))
1985 goto out; 2340 goto out;
1986 2341
1987 anon_vma_lock(vma->anon_vma); 2342 anon_vma_lock_write(vma->anon_vma);
1988 2343
1989 pte = pte_offset_map(pmd, address); 2344 pte = pte_offset_map(pmd, address);
1990 ptl = pte_lockptr(mm, pmd); 2345 ptl = pte_lockptr(mm, pmd);
@@ -2028,9 +2383,7 @@ static void collapse_huge_page(struct mm_struct *mm,
2028 __SetPageUptodate(new_page); 2383 __SetPageUptodate(new_page);
2029 pgtable = pmd_pgtable(_pmd); 2384 pgtable = pmd_pgtable(_pmd);
2030 2385
2031 _pmd = mk_pmd(new_page, vma->vm_page_prot); 2386 _pmd = mk_huge_pmd(new_page, vma);
2032 _pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma);
2033 _pmd = pmd_mkhuge(_pmd);
2034 2387
2035 /* 2388 /*
2036 * spin_lock() below is not the equivalent of smp_wmb(), so 2389 * spin_lock() below is not the equivalent of smp_wmb(), so
@@ -2064,8 +2417,6 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
2064 unsigned long address, 2417 unsigned long address,
2065 struct page **hpage) 2418 struct page **hpage)
2066{ 2419{
2067 pgd_t *pgd;
2068 pud_t *pud;
2069 pmd_t *pmd; 2420 pmd_t *pmd;
2070 pte_t *pte, *_pte; 2421 pte_t *pte, *_pte;
2071 int ret = 0, referenced = 0, none = 0; 2422 int ret = 0, referenced = 0, none = 0;
@@ -2076,16 +2427,10 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
2076 2427
2077 VM_BUG_ON(address & ~HPAGE_PMD_MASK); 2428 VM_BUG_ON(address & ~HPAGE_PMD_MASK);
2078 2429
2079 pgd = pgd_offset(mm, address); 2430 pmd = mm_find_pmd(mm, address);
2080 if (!pgd_present(*pgd)) 2431 if (!pmd)
2081 goto out; 2432 goto out;
2082 2433 if (pmd_trans_huge(*pmd))
2083 pud = pud_offset(pgd, address);
2084 if (!pud_present(*pud))
2085 goto out;
2086
2087 pmd = pmd_offset(pud, address);
2088 if (!pmd_present(*pmd) || pmd_trans_huge(*pmd))
2089 goto out; 2434 goto out;
2090 2435
2091 pte = pte_offset_map_lock(mm, pmd, address, &ptl); 2436 pte = pte_offset_map_lock(mm, pmd, address, &ptl);
@@ -2193,20 +2538,11 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages,
2193 progress++; 2538 progress++;
2194 break; 2539 break;
2195 } 2540 }
2196 2541 if (!hugepage_vma_check(vma)) {
2197 if ((!(vma->vm_flags & VM_HUGEPAGE) && 2542skip:
2198 !khugepaged_always()) ||
2199 (vma->vm_flags & VM_NOHUGEPAGE)) {
2200 skip:
2201 progress++; 2543 progress++;
2202 continue; 2544 continue;
2203 } 2545 }
2204 if (!vma->anon_vma || vma->vm_ops)
2205 goto skip;
2206 if (is_vma_temporary_stack(vma))
2207 goto skip;
2208 VM_BUG_ON(vma->vm_flags & VM_NO_THP);
2209
2210 hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; 2546 hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
2211 hend = vma->vm_end & HPAGE_PMD_MASK; 2547 hend = vma->vm_end & HPAGE_PMD_MASK;
2212 if (hstart >= hend) 2548 if (hstart >= hend)
@@ -2356,19 +2692,65 @@ static int khugepaged(void *none)
2356 return 0; 2692 return 0;
2357} 2693}
2358 2694
2359void __split_huge_page_pmd(struct mm_struct *mm, pmd_t *pmd) 2695static void __split_huge_zero_page_pmd(struct vm_area_struct *vma,
2696 unsigned long haddr, pmd_t *pmd)
2697{
2698 struct mm_struct *mm = vma->vm_mm;
2699 pgtable_t pgtable;
2700 pmd_t _pmd;
2701 int i;
2702
2703 pmdp_clear_flush(vma, haddr, pmd);
2704 /* leave pmd empty until pte is filled */
2705
2706 pgtable = pgtable_trans_huge_withdraw(mm);
2707 pmd_populate(mm, &_pmd, pgtable);
2708
2709 for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
2710 pte_t *pte, entry;
2711 entry = pfn_pte(my_zero_pfn(haddr), vma->vm_page_prot);
2712 entry = pte_mkspecial(entry);
2713 pte = pte_offset_map(&_pmd, haddr);
2714 VM_BUG_ON(!pte_none(*pte));
2715 set_pte_at(mm, haddr, pte, entry);
2716 pte_unmap(pte);
2717 }
2718 smp_wmb(); /* make pte visible before pmd */
2719 pmd_populate(mm, pmd, pgtable);
2720 put_huge_zero_page();
2721}
2722
2723void __split_huge_page_pmd(struct vm_area_struct *vma, unsigned long address,
2724 pmd_t *pmd)
2360{ 2725{
2361 struct page *page; 2726 struct page *page;
2727 struct mm_struct *mm = vma->vm_mm;
2728 unsigned long haddr = address & HPAGE_PMD_MASK;
2729 unsigned long mmun_start; /* For mmu_notifiers */
2730 unsigned long mmun_end; /* For mmu_notifiers */
2731
2732 BUG_ON(vma->vm_start > haddr || vma->vm_end < haddr + HPAGE_PMD_SIZE);
2362 2733
2734 mmun_start = haddr;
2735 mmun_end = haddr + HPAGE_PMD_SIZE;
2736 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
2363 spin_lock(&mm->page_table_lock); 2737 spin_lock(&mm->page_table_lock);
2364 if (unlikely(!pmd_trans_huge(*pmd))) { 2738 if (unlikely(!pmd_trans_huge(*pmd))) {
2365 spin_unlock(&mm->page_table_lock); 2739 spin_unlock(&mm->page_table_lock);
2740 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
2741 return;
2742 }
2743 if (is_huge_zero_pmd(*pmd)) {
2744 __split_huge_zero_page_pmd(vma, haddr, pmd);
2745 spin_unlock(&mm->page_table_lock);
2746 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
2366 return; 2747 return;
2367 } 2748 }
2368 page = pmd_page(*pmd); 2749 page = pmd_page(*pmd);
2369 VM_BUG_ON(!page_count(page)); 2750 VM_BUG_ON(!page_count(page));
2370 get_page(page); 2751 get_page(page);
2371 spin_unlock(&mm->page_table_lock); 2752 spin_unlock(&mm->page_table_lock);
2753 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
2372 2754
2373 split_huge_page(page); 2755 split_huge_page(page);
2374 2756
@@ -2376,31 +2758,31 @@ void __split_huge_page_pmd(struct mm_struct *mm, pmd_t *pmd)
2376 BUG_ON(pmd_trans_huge(*pmd)); 2758 BUG_ON(pmd_trans_huge(*pmd));
2377} 2759}
2378 2760
2761void split_huge_page_pmd_mm(struct mm_struct *mm, unsigned long address,
2762 pmd_t *pmd)
2763{
2764 struct vm_area_struct *vma;
2765
2766 vma = find_vma(mm, address);
2767 BUG_ON(vma == NULL);
2768 split_huge_page_pmd(vma, address, pmd);
2769}
2770
2379static void split_huge_page_address(struct mm_struct *mm, 2771static void split_huge_page_address(struct mm_struct *mm,
2380 unsigned long address) 2772 unsigned long address)
2381{ 2773{
2382 pgd_t *pgd;
2383 pud_t *pud;
2384 pmd_t *pmd; 2774 pmd_t *pmd;
2385 2775
2386 VM_BUG_ON(!(address & ~HPAGE_PMD_MASK)); 2776 VM_BUG_ON(!(address & ~HPAGE_PMD_MASK));
2387 2777
2388 pgd = pgd_offset(mm, address); 2778 pmd = mm_find_pmd(mm, address);
2389 if (!pgd_present(*pgd)) 2779 if (!pmd)
2390 return;
2391
2392 pud = pud_offset(pgd, address);
2393 if (!pud_present(*pud))
2394 return;
2395
2396 pmd = pmd_offset(pud, address);
2397 if (!pmd_present(*pmd))
2398 return; 2780 return;
2399 /* 2781 /*
2400 * Caller holds the mmap_sem write mode, so a huge pmd cannot 2782 * Caller holds the mmap_sem write mode, so a huge pmd cannot
2401 * materialize from under us. 2783 * materialize from under us.
2402 */ 2784 */
2403 split_huge_page_pmd(mm, pmd); 2785 split_huge_page_pmd_mm(mm, address, pmd);
2404} 2786}
2405 2787
2406void __vma_adjust_trans_huge(struct vm_area_struct *vma, 2788void __vma_adjust_trans_huge(struct vm_area_struct *vma,
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 59a0059b39e2..546db81820e4 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1,6 +1,6 @@
1/* 1/*
2 * Generic hugetlb support. 2 * Generic hugetlb support.
3 * (C) William Irwin, April 2004 3 * (C) Nadia Yvette Chambers, April 2004
4 */ 4 */
5#include <linux/list.h> 5#include <linux/list.h>
6#include <linux/init.h> 6#include <linux/init.h>
@@ -1057,7 +1057,7 @@ static void return_unused_surplus_pages(struct hstate *h,
1057 * on-line nodes with memory and will handle the hstate accounting. 1057 * on-line nodes with memory and will handle the hstate accounting.
1058 */ 1058 */
1059 while (nr_pages--) { 1059 while (nr_pages--) {
1060 if (!free_pool_huge_page(h, &node_states[N_HIGH_MEMORY], 1)) 1060 if (!free_pool_huge_page(h, &node_states[N_MEMORY], 1))
1061 break; 1061 break;
1062 } 1062 }
1063} 1063}
@@ -1180,14 +1180,14 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
1180int __weak alloc_bootmem_huge_page(struct hstate *h) 1180int __weak alloc_bootmem_huge_page(struct hstate *h)
1181{ 1181{
1182 struct huge_bootmem_page *m; 1182 struct huge_bootmem_page *m;
1183 int nr_nodes = nodes_weight(node_states[N_HIGH_MEMORY]); 1183 int nr_nodes = nodes_weight(node_states[N_MEMORY]);
1184 1184
1185 while (nr_nodes) { 1185 while (nr_nodes) {
1186 void *addr; 1186 void *addr;
1187 1187
1188 addr = __alloc_bootmem_node_nopanic( 1188 addr = __alloc_bootmem_node_nopanic(
1189 NODE_DATA(hstate_next_node_to_alloc(h, 1189 NODE_DATA(hstate_next_node_to_alloc(h,
1190 &node_states[N_HIGH_MEMORY])), 1190 &node_states[N_MEMORY])),
1191 huge_page_size(h), huge_page_size(h), 0); 1191 huge_page_size(h), huge_page_size(h), 0);
1192 1192
1193 if (addr) { 1193 if (addr) {
@@ -1259,7 +1259,7 @@ static void __init hugetlb_hstate_alloc_pages(struct hstate *h)
1259 if (!alloc_bootmem_huge_page(h)) 1259 if (!alloc_bootmem_huge_page(h))
1260 break; 1260 break;
1261 } else if (!alloc_fresh_huge_page(h, 1261 } else if (!alloc_fresh_huge_page(h,
1262 &node_states[N_HIGH_MEMORY])) 1262 &node_states[N_MEMORY]))
1263 break; 1263 break;
1264 } 1264 }
1265 h->max_huge_pages = i; 1265 h->max_huge_pages = i;
@@ -1527,7 +1527,7 @@ static ssize_t nr_hugepages_store_common(bool obey_mempolicy,
1527 if (!(obey_mempolicy && 1527 if (!(obey_mempolicy &&
1528 init_nodemask_of_mempolicy(nodes_allowed))) { 1528 init_nodemask_of_mempolicy(nodes_allowed))) {
1529 NODEMASK_FREE(nodes_allowed); 1529 NODEMASK_FREE(nodes_allowed);
1530 nodes_allowed = &node_states[N_HIGH_MEMORY]; 1530 nodes_allowed = &node_states[N_MEMORY];
1531 } 1531 }
1532 } else if (nodes_allowed) { 1532 } else if (nodes_allowed) {
1533 /* 1533 /*
@@ -1537,11 +1537,11 @@ static ssize_t nr_hugepages_store_common(bool obey_mempolicy,
1537 count += h->nr_huge_pages - h->nr_huge_pages_node[nid]; 1537 count += h->nr_huge_pages - h->nr_huge_pages_node[nid];
1538 init_nodemask_of_node(nodes_allowed, nid); 1538 init_nodemask_of_node(nodes_allowed, nid);
1539 } else 1539 } else
1540 nodes_allowed = &node_states[N_HIGH_MEMORY]; 1540 nodes_allowed = &node_states[N_MEMORY];
1541 1541
1542 h->max_huge_pages = set_max_huge_pages(h, count, nodes_allowed); 1542 h->max_huge_pages = set_max_huge_pages(h, count, nodes_allowed);
1543 1543
1544 if (nodes_allowed != &node_states[N_HIGH_MEMORY]) 1544 if (nodes_allowed != &node_states[N_MEMORY])
1545 NODEMASK_FREE(nodes_allowed); 1545 NODEMASK_FREE(nodes_allowed);
1546 1546
1547 return len; 1547 return len;
@@ -1800,7 +1800,7 @@ static void hugetlb_unregister_all_nodes(void)
1800 * remove hstate attributes from any nodes that have them. 1800 * remove hstate attributes from any nodes that have them.
1801 */ 1801 */
1802 for (nid = 0; nid < nr_node_ids; nid++) 1802 for (nid = 0; nid < nr_node_ids; nid++)
1803 hugetlb_unregister_node(&node_devices[nid]); 1803 hugetlb_unregister_node(node_devices[nid]);
1804} 1804}
1805 1805
1806/* 1806/*
@@ -1844,8 +1844,8 @@ static void hugetlb_register_all_nodes(void)
1844{ 1844{
1845 int nid; 1845 int nid;
1846 1846
1847 for_each_node_state(nid, N_HIGH_MEMORY) { 1847 for_each_node_state(nid, N_MEMORY) {
1848 struct node *node = &node_devices[nid]; 1848 struct node *node = node_devices[nid];
1849 if (node->dev.id == nid) 1849 if (node->dev.id == nid)
1850 hugetlb_register_node(node); 1850 hugetlb_register_node(node);
1851 } 1851 }
@@ -1906,14 +1906,12 @@ static int __init hugetlb_init(void)
1906 default_hstate.max_huge_pages = default_hstate_max_huge_pages; 1906 default_hstate.max_huge_pages = default_hstate_max_huge_pages;
1907 1907
1908 hugetlb_init_hstates(); 1908 hugetlb_init_hstates();
1909
1910 gather_bootmem_prealloc(); 1909 gather_bootmem_prealloc();
1911
1912 report_hugepages(); 1910 report_hugepages();
1913 1911
1914 hugetlb_sysfs_init(); 1912 hugetlb_sysfs_init();
1915
1916 hugetlb_register_all_nodes(); 1913 hugetlb_register_all_nodes();
1914 hugetlb_cgroup_file_init();
1917 1915
1918 return 0; 1916 return 0;
1919} 1917}
@@ -1939,17 +1937,10 @@ void __init hugetlb_add_hstate(unsigned order)
1939 for (i = 0; i < MAX_NUMNODES; ++i) 1937 for (i = 0; i < MAX_NUMNODES; ++i)
1940 INIT_LIST_HEAD(&h->hugepage_freelists[i]); 1938 INIT_LIST_HEAD(&h->hugepage_freelists[i]);
1941 INIT_LIST_HEAD(&h->hugepage_activelist); 1939 INIT_LIST_HEAD(&h->hugepage_activelist);
1942 h->next_nid_to_alloc = first_node(node_states[N_HIGH_MEMORY]); 1940 h->next_nid_to_alloc = first_node(node_states[N_MEMORY]);
1943 h->next_nid_to_free = first_node(node_states[N_HIGH_MEMORY]); 1941 h->next_nid_to_free = first_node(node_states[N_MEMORY]);
1944 snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB", 1942 snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB",
1945 huge_page_size(h)/1024); 1943 huge_page_size(h)/1024);
1946 /*
1947 * Add cgroup control files only if the huge page consists
1948 * of more than two normal pages. This is because we use
1949 * page[2].lru.next for storing cgoup details.
1950 */
1951 if (order >= HUGETLB_CGROUP_MIN_ORDER)
1952 hugetlb_cgroup_file_init(hugetlb_max_hstate - 1);
1953 1944
1954 parsed_hstate = h; 1945 parsed_hstate = h;
1955} 1946}
@@ -2035,11 +2026,11 @@ static int hugetlb_sysctl_handler_common(bool obey_mempolicy,
2035 if (!(obey_mempolicy && 2026 if (!(obey_mempolicy &&
2036 init_nodemask_of_mempolicy(nodes_allowed))) { 2027 init_nodemask_of_mempolicy(nodes_allowed))) {
2037 NODEMASK_FREE(nodes_allowed); 2028 NODEMASK_FREE(nodes_allowed);
2038 nodes_allowed = &node_states[N_HIGH_MEMORY]; 2029 nodes_allowed = &node_states[N_MEMORY];
2039 } 2030 }
2040 h->max_huge_pages = set_max_huge_pages(h, tmp, nodes_allowed); 2031 h->max_huge_pages = set_max_huge_pages(h, tmp, nodes_allowed);
2041 2032
2042 if (nodes_allowed != &node_states[N_HIGH_MEMORY]) 2033 if (nodes_allowed != &node_states[N_MEMORY])
2043 NODEMASK_FREE(nodes_allowed); 2034 NODEMASK_FREE(nodes_allowed);
2044 } 2035 }
2045out: 2036out:
@@ -2386,8 +2377,10 @@ again:
2386 /* 2377 /*
2387 * HWPoisoned hugepage is already unmapped and dropped reference 2378 * HWPoisoned hugepage is already unmapped and dropped reference
2388 */ 2379 */
2389 if (unlikely(is_hugetlb_entry_hwpoisoned(pte))) 2380 if (unlikely(is_hugetlb_entry_hwpoisoned(pte))) {
2381 pte_clear(mm, address, ptep);
2390 continue; 2382 continue;
2383 }
2391 2384
2392 page = pte_page(pte); 2385 page = pte_page(pte);
2393 /* 2386 /*
@@ -3014,7 +3007,7 @@ same_page:
3014 return i ? i : -EFAULT; 3007 return i ? i : -EFAULT;
3015} 3008}
3016 3009
3017void hugetlb_change_protection(struct vm_area_struct *vma, 3010unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
3018 unsigned long address, unsigned long end, pgprot_t newprot) 3011 unsigned long address, unsigned long end, pgprot_t newprot)
3019{ 3012{
3020 struct mm_struct *mm = vma->vm_mm; 3013 struct mm_struct *mm = vma->vm_mm;
@@ -3022,6 +3015,7 @@ void hugetlb_change_protection(struct vm_area_struct *vma,
3022 pte_t *ptep; 3015 pte_t *ptep;
3023 pte_t pte; 3016 pte_t pte;
3024 struct hstate *h = hstate_vma(vma); 3017 struct hstate *h = hstate_vma(vma);
3018 unsigned long pages = 0;
3025 3019
3026 BUG_ON(address >= end); 3020 BUG_ON(address >= end);
3027 flush_cache_range(vma, address, end); 3021 flush_cache_range(vma, address, end);
@@ -3032,12 +3026,16 @@ void hugetlb_change_protection(struct vm_area_struct *vma,
3032 ptep = huge_pte_offset(mm, address); 3026 ptep = huge_pte_offset(mm, address);
3033 if (!ptep) 3027 if (!ptep)
3034 continue; 3028 continue;
3035 if (huge_pmd_unshare(mm, &address, ptep)) 3029 if (huge_pmd_unshare(mm, &address, ptep)) {
3030 pages++;
3036 continue; 3031 continue;
3032 }
3037 if (!huge_pte_none(huge_ptep_get(ptep))) { 3033 if (!huge_pte_none(huge_ptep_get(ptep))) {
3038 pte = huge_ptep_get_and_clear(mm, address, ptep); 3034 pte = huge_ptep_get_and_clear(mm, address, ptep);
3039 pte = pte_mkhuge(pte_modify(pte, newprot)); 3035 pte = pte_mkhuge(pte_modify(pte, newprot));
3036 pte = arch_make_huge_pte(pte, vma, NULL, 0);
3040 set_huge_pte_at(mm, address, ptep, pte); 3037 set_huge_pte_at(mm, address, ptep, pte);
3038 pages++;
3041 } 3039 }
3042 } 3040 }
3043 spin_unlock(&mm->page_table_lock); 3041 spin_unlock(&mm->page_table_lock);
@@ -3049,6 +3047,8 @@ void hugetlb_change_protection(struct vm_area_struct *vma,
3049 */ 3047 */
3050 flush_tlb_range(vma, start, end); 3048 flush_tlb_range(vma, start, end);
3051 mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex); 3049 mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex);
3050
3051 return pages << h->order;
3052} 3052}
3053 3053
3054int hugetlb_reserve_pages(struct inode *inode, 3054int hugetlb_reserve_pages(struct inode *inode,
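
hugetlb_change_protection() now reports how many base pages it actually touched, computed as pages << h->order, so its caller can account the change in base-page units. A worked example of that shift, assuming 2MB huge pages over 4KB base pages, i.e. a huge-page order of 9 (assumed values, not taken from this diff):

    #include <stdio.h>

    int main(void)
    {
        unsigned int order = 9;     /* assumed: 2MB hugepage / 4KB base page */
        unsigned long pages = 2;    /* hugepages whose protection changed */

        /* Same computation as "return pages << h->order;" above. */
        printf("%lu hugepages -> %lu base pages reported\n",
               pages, pages << order);   /* prints: 2 hugepages -> 1024 ... */
        return 0;
    }
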
@@ -3170,7 +3170,13 @@ int dequeue_hwpoisoned_huge_page(struct page *hpage)
3170 3170
3171 spin_lock(&hugetlb_lock); 3171 spin_lock(&hugetlb_lock);
3172 if (is_hugepage_on_freelist(hpage)) { 3172 if (is_hugepage_on_freelist(hpage)) {
3173 list_del(&hpage->lru); 3173 /*
3174 * Hwpoisoned hugepage isn't linked to activelist or freelist,
3175 * but dangling hpage->lru can trigger list-debug warnings
3176 * (this happens when we call unpoison_memory() on it),
3177 * so let it point to itself with list_del_init().
3178 */
3179 list_del_init(&hpage->lru);
3174 set_page_refcounted(hpage); 3180 set_page_refcounted(hpage);
3175 h->free_huge_pages--; 3181 h->free_huge_pages--;
3176 h->free_huge_pages_node[nid]--; 3182 h->free_huge_pages_node[nid]--;
diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c
index a3f358fb8a0c..9cea7de22ffb 100644
--- a/mm/hugetlb_cgroup.c
+++ b/mm/hugetlb_cgroup.c
@@ -77,7 +77,7 @@ static inline bool hugetlb_cgroup_have_usage(struct cgroup *cg)
77 return false; 77 return false;
78} 78}
79 79
80static struct cgroup_subsys_state *hugetlb_cgroup_create(struct cgroup *cgroup) 80static struct cgroup_subsys_state *hugetlb_cgroup_css_alloc(struct cgroup *cgroup)
81{ 81{
82 int idx; 82 int idx;
83 struct cgroup *parent_cgroup; 83 struct cgroup *parent_cgroup;
@@ -101,7 +101,7 @@ static struct cgroup_subsys_state *hugetlb_cgroup_create(struct cgroup *cgroup)
101 return &h_cgroup->css; 101 return &h_cgroup->css;
102} 102}
103 103
104static void hugetlb_cgroup_destroy(struct cgroup *cgroup) 104static void hugetlb_cgroup_css_free(struct cgroup *cgroup)
105{ 105{
106 struct hugetlb_cgroup *h_cgroup; 106 struct hugetlb_cgroup *h_cgroup;
107 107
@@ -155,18 +155,13 @@ out:
155 * Force the hugetlb cgroup to empty the hugetlb resources by moving them to 155 * Force the hugetlb cgroup to empty the hugetlb resources by moving them to
156 * the parent cgroup. 156 * the parent cgroup.
157 */ 157 */
158static int hugetlb_cgroup_pre_destroy(struct cgroup *cgroup) 158static void hugetlb_cgroup_css_offline(struct cgroup *cgroup)
159{ 159{
160 struct hstate *h; 160 struct hstate *h;
161 struct page *page; 161 struct page *page;
162 int ret = 0, idx = 0; 162 int idx = 0;
163 163
164 do { 164 do {
165 if (cgroup_task_count(cgroup) ||
166 !list_empty(&cgroup->children)) {
167 ret = -EBUSY;
168 goto out;
169 }
170 for_each_hstate(h) { 165 for_each_hstate(h) {
171 spin_lock(&hugetlb_lock); 166 spin_lock(&hugetlb_lock);
172 list_for_each_entry(page, &h->hugepage_activelist, lru) 167 list_for_each_entry(page, &h->hugepage_activelist, lru)
@@ -177,8 +172,6 @@ static int hugetlb_cgroup_pre_destroy(struct cgroup *cgroup)
177 } 172 }
178 cond_resched(); 173 cond_resched();
179 } while (hugetlb_cgroup_have_usage(cgroup)); 174 } while (hugetlb_cgroup_have_usage(cgroup));
180out:
181 return ret;
182} 175}
183 176
184int hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages, 177int hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages,
@@ -340,7 +333,7 @@ static char *mem_fmt(char *buf, int size, unsigned long hsize)
340 return buf; 333 return buf;
341} 334}
342 335
343int __init hugetlb_cgroup_file_init(int idx) 336static void __init __hugetlb_cgroup_file_init(int idx)
344{ 337{
345 char buf[32]; 338 char buf[32];
346 struct cftype *cft; 339 struct cftype *cft;
@@ -382,7 +375,22 @@ int __init hugetlb_cgroup_file_init(int idx)
382 375
383 WARN_ON(cgroup_add_cftypes(&hugetlb_subsys, h->cgroup_files)); 376 WARN_ON(cgroup_add_cftypes(&hugetlb_subsys, h->cgroup_files));
384 377
385 return 0; 378 return;
379}
380
381void __init hugetlb_cgroup_file_init(void)
382{
383 struct hstate *h;
384
385 for_each_hstate(h) {
386 /*
387 * Add cgroup control files only if the huge page consists
388 * of more than two normal pages. This is because we use
389 * page[2].lru.next for storing cgroup details.
390 */
391 if (huge_page_order(h) >= HUGETLB_CGROUP_MIN_ORDER)
392 __hugetlb_cgroup_file_init(hstate_index(h));
393 }
386} 394}
387 395
388/* 396/*
@@ -411,8 +419,8 @@ void hugetlb_cgroup_migrate(struct page *oldhpage, struct page *newhpage)
411 419
412struct cgroup_subsys hugetlb_subsys = { 420struct cgroup_subsys hugetlb_subsys = {
413 .name = "hugetlb", 421 .name = "hugetlb",
414 .create = hugetlb_cgroup_create, 422 .css_alloc = hugetlb_cgroup_css_alloc,
415 .pre_destroy = hugetlb_cgroup_pre_destroy, 423 .css_offline = hugetlb_cgroup_css_offline,
416 .destroy = hugetlb_cgroup_destroy, 424 .css_free = hugetlb_cgroup_css_free,
417 .subsys_id = hugetlb_subsys_id, 425 .subsys_id = hugetlb_subsys_id,
418}; 426};
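
The new hugetlb_cgroup_file_init() above registers cgroup control files only for hstates whose order is at least HUGETLB_CGROUP_MIN_ORDER, because the cgroup pointer is stashed in page[2].lru.next and the compound page therefore needs more than two base pages. The smallest power-of-two size that satisfies this is order 2 (four pages); a tiny sketch of that check, with the threshold inferred from the comment rather than read from the header:

    #include <stdio.h>

    int main(void)
    {
        /* page[2].lru.next is used, so the hugepage must span > 2 base pages. */
        for (unsigned int order = 0; order <= 3; order++) {
            unsigned long pages = 1UL << order;
            printf("order %u: %lu pages -> %s\n", order, pages,
                   pages > 2 ? "can carry hugetlb_cgroup files" : "too small");
        }
        return 0;
    }
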
diff --git a/mm/internal.h b/mm/internal.h
index a4fa284f6bc2..9ba21100ebf3 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -92,6 +92,11 @@ extern int isolate_lru_page(struct page *page);
92extern void putback_lru_page(struct page *page); 92extern void putback_lru_page(struct page *page);
93 93
94/* 94/*
95 * in mm/rmap.c:
96 */
97extern pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address);
98
99/*
95 * in mm/page_alloc.c 100 * in mm/page_alloc.c
96 */ 101 */
97extern void __free_pages_bootmem(struct page *page, unsigned int order); 102extern void __free_pages_bootmem(struct page *page, unsigned int order);
@@ -130,7 +135,6 @@ struct compact_control {
130 int migratetype; /* MOVABLE, RECLAIMABLE etc */ 135 int migratetype; /* MOVABLE, RECLAIMABLE etc */
131 struct zone *zone; 136 struct zone *zone;
132 bool contended; /* True if a lock was contended */ 137 bool contended; /* True if a lock was contended */
133 struct page **page; /* Page captured of requested size */
134}; 138};
135 139
136unsigned long 140unsigned long
@@ -212,15 +216,18 @@ static inline void mlock_migrate_page(struct page *newpage, struct page *page)
212{ 216{
213 if (TestClearPageMlocked(page)) { 217 if (TestClearPageMlocked(page)) {
214 unsigned long flags; 218 unsigned long flags;
219 int nr_pages = hpage_nr_pages(page);
215 220
216 local_irq_save(flags); 221 local_irq_save(flags);
217 __dec_zone_page_state(page, NR_MLOCK); 222 __mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages);
218 SetPageMlocked(newpage); 223 SetPageMlocked(newpage);
219 __inc_zone_page_state(newpage, NR_MLOCK); 224 __mod_zone_page_state(page_zone(newpage), NR_MLOCK, nr_pages);
220 local_irq_restore(flags); 225 local_irq_restore(flags);
221 } 226 }
222} 227}
223 228
229extern pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma);
230
224#ifdef CONFIG_TRANSPARENT_HUGEPAGE 231#ifdef CONFIG_TRANSPARENT_HUGEPAGE
225extern unsigned long vma_address(struct page *page, 232extern unsigned long vma_address(struct page *page,
226 struct vm_area_struct *vma); 233 struct vm_area_struct *vma);
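
The prototype added above is the new mm_find_pmd() helper (implemented in mm/rmap.c, outside this excerpt); the huge_memory.c and ksm.c hunks in this series replace their open-coded pgd/pud/pmd walks with it. The sketch below reconstructs its likely shape purely from the walks being deleted, so treat it as an approximation of the helper rather than the authoritative implementation:

    /* Reconstructed from the removed call-site walks; see mm/rmap.c. */
    pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address)
    {
        pgd_t *pgd;
        pud_t *pud;
        pmd_t *pmd = NULL;

        pgd = pgd_offset(mm, address);
        if (!pgd_present(*pgd))
            goto out;

        pud = pud_offset(pgd, address);
        if (!pud_present(*pud))
            goto out;

        pmd = pmd_offset(pud, address);
        if (!pmd_present(*pmd))
            pmd = NULL;
    out:
        return pmd;
    }
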
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index a217cc544060..752a705c77c2 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -1556,7 +1556,8 @@ static int dump_str_object_info(const char *str)
1556 struct kmemleak_object *object; 1556 struct kmemleak_object *object;
1557 unsigned long addr; 1557 unsigned long addr;
1558 1558
1559 addr= simple_strtoul(str, NULL, 0); 1559 if (kstrtoul(str, 0, &addr))
1560 return -EINVAL;
1560 object = find_and_get_object(addr, 0); 1561 object = find_and_get_object(addr, 0);
1561 if (!object) { 1562 if (!object) {
1562 pr_info("Unknown object at 0x%08lx\n", addr); 1563 pr_info("Unknown object at 0x%08lx\n", addr);
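
The kmemleak change above swaps simple_strtoul(), which silently ignores malformed input, for kstrtoul(), which rejects the whole string and lets the write fail with -EINVAL. A userspace sketch of the same strict-parsing idea built on strtoul(); the helper name is made up for illustration:

    #include <errno.h>
    #include <stdio.h>
    #include <stdlib.h>

    /* Strict parse in the spirit of kstrtoul(): no trailing junk, no overflow. */
    static int parse_ulong(const char *s, int base, unsigned long *res)
    {
        char *end;

        errno = 0;
        *res = strtoul(s, &end, base);
        if (errno || end == s || *end != '\0')
            return -EINVAL;
        return 0;
    }

    int main(void)
    {
        unsigned long addr;

        printf("\"0x1000\" -> %d\n", parse_ulong("0x1000", 0, &addr)); /* 0 */
        printf("\"0x10zz\" -> %d\n", parse_ulong("0x10zz", 0, &addr)); /* -EINVAL */
        return 0;
    }
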
diff --git a/mm/ksm.c b/mm/ksm.c
index ae539f0b8aa1..51573858938d 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -778,8 +778,6 @@ static int replace_page(struct vm_area_struct *vma, struct page *page,
778 struct page *kpage, pte_t orig_pte) 778 struct page *kpage, pte_t orig_pte)
779{ 779{
780 struct mm_struct *mm = vma->vm_mm; 780 struct mm_struct *mm = vma->vm_mm;
781 pgd_t *pgd;
782 pud_t *pud;
783 pmd_t *pmd; 781 pmd_t *pmd;
784 pte_t *ptep; 782 pte_t *ptep;
785 spinlock_t *ptl; 783 spinlock_t *ptl;
@@ -792,18 +790,10 @@ static int replace_page(struct vm_area_struct *vma, struct page *page,
792 if (addr == -EFAULT) 790 if (addr == -EFAULT)
793 goto out; 791 goto out;
794 792
795 pgd = pgd_offset(mm, addr); 793 pmd = mm_find_pmd(mm, addr);
796 if (!pgd_present(*pgd)) 794 if (!pmd)
797 goto out; 795 goto out;
798
799 pud = pud_offset(pgd, addr);
800 if (!pud_present(*pud))
801 goto out;
802
803 pmd = pmd_offset(pud, addr);
804 BUG_ON(pmd_trans_huge(*pmd)); 796 BUG_ON(pmd_trans_huge(*pmd));
805 if (!pmd_present(*pmd))
806 goto out;
807 797
808 mmun_start = addr; 798 mmun_start = addr;
809 mmun_end = addr + PAGE_SIZE; 799 mmun_end = addr + PAGE_SIZE;
@@ -1634,7 +1624,7 @@ again:
1634 struct anon_vma_chain *vmac; 1624 struct anon_vma_chain *vmac;
1635 struct vm_area_struct *vma; 1625 struct vm_area_struct *vma;
1636 1626
1637 anon_vma_lock(anon_vma); 1627 anon_vma_lock_read(anon_vma);
1638 anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root, 1628 anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root,
1639 0, ULONG_MAX) { 1629 0, ULONG_MAX) {
1640 vma = vmac->vma; 1630 vma = vmac->vma;
@@ -1658,7 +1648,7 @@ again:
1658 if (!search_new_forks || !mapcount) 1648 if (!search_new_forks || !mapcount)
1659 break; 1649 break;
1660 } 1650 }
1661 anon_vma_unlock(anon_vma); 1651 anon_vma_unlock_read(anon_vma);
1662 if (!mapcount) 1652 if (!mapcount)
1663 goto out; 1653 goto out;
1664 } 1654 }
@@ -1688,7 +1678,7 @@ again:
1688 struct anon_vma_chain *vmac; 1678 struct anon_vma_chain *vmac;
1689 struct vm_area_struct *vma; 1679 struct vm_area_struct *vma;
1690 1680
1691 anon_vma_lock(anon_vma); 1681 anon_vma_lock_read(anon_vma);
1692 anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root, 1682 anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root,
1693 0, ULONG_MAX) { 1683 0, ULONG_MAX) {
1694 vma = vmac->vma; 1684 vma = vmac->vma;
@@ -1707,11 +1697,11 @@ again:
1707 ret = try_to_unmap_one(page, vma, 1697 ret = try_to_unmap_one(page, vma,
1708 rmap_item->address, flags); 1698 rmap_item->address, flags);
1709 if (ret != SWAP_AGAIN || !page_mapped(page)) { 1699 if (ret != SWAP_AGAIN || !page_mapped(page)) {
1710 anon_vma_unlock(anon_vma); 1700 anon_vma_unlock_read(anon_vma);
1711 goto out; 1701 goto out;
1712 } 1702 }
1713 } 1703 }
1714 anon_vma_unlock(anon_vma); 1704 anon_vma_unlock_read(anon_vma);
1715 } 1705 }
1716 if (!search_new_forks++) 1706 if (!search_new_forks++)
1717 goto again; 1707 goto again;
@@ -1741,7 +1731,7 @@ again:
1741 struct anon_vma_chain *vmac; 1731 struct anon_vma_chain *vmac;
1742 struct vm_area_struct *vma; 1732 struct vm_area_struct *vma;
1743 1733
1744 anon_vma_lock(anon_vma); 1734 anon_vma_lock_read(anon_vma);
1745 anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root, 1735 anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root,
1746 0, ULONG_MAX) { 1736 0, ULONG_MAX) {
1747 vma = vmac->vma; 1737 vma = vmac->vma;
@@ -1759,11 +1749,11 @@ again:
1759 1749
1760 ret = rmap_one(page, vma, rmap_item->address, arg); 1750 ret = rmap_one(page, vma, rmap_item->address, arg);
1761 if (ret != SWAP_AGAIN) { 1751 if (ret != SWAP_AGAIN) {
1762 anon_vma_unlock(anon_vma); 1752 anon_vma_unlock_read(anon_vma);
1763 goto out; 1753 goto out;
1764 } 1754 }
1765 } 1755 }
1766 anon_vma_unlock(anon_vma); 1756 anon_vma_unlock_read(anon_vma);
1767 } 1757 }
1768 if (!search_new_forks++) 1758 if (!search_new_forks++)
1769 goto again; 1759 goto again;
@@ -1929,12 +1919,9 @@ static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr,
1929 if (ksm_run != flags) { 1919 if (ksm_run != flags) {
1930 ksm_run = flags; 1920 ksm_run = flags;
1931 if (flags & KSM_RUN_UNMERGE) { 1921 if (flags & KSM_RUN_UNMERGE) {
1932 int oom_score_adj; 1922 set_current_oom_origin();
1933
1934 oom_score_adj = test_set_oom_score_adj(OOM_SCORE_ADJ_MAX);
1935 err = unmerge_and_remove_all_rmap_items(); 1923 err = unmerge_and_remove_all_rmap_items();
1936 compare_swap_oom_score_adj(OOM_SCORE_ADJ_MAX, 1924 clear_current_oom_origin();
1937 oom_score_adj);
1938 if (err) { 1925 if (err) {
1939 ksm_run = KSM_RUN_STOP; 1926 ksm_run = KSM_RUN_STOP;
1940 count = err; 1927 count = err;
diff --git a/mm/memblock.c b/mm/memblock.c
index 625905523c2a..88adc8afb610 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -314,7 +314,8 @@ static void __init_memblock memblock_merge_regions(struct memblock_type *type)
314 } 314 }
315 315
316 this->size += next->size; 316 this->size += next->size;
317 memmove(next, next + 1, (type->cnt - (i + 1)) * sizeof(*next)); 317 /* move forward from next + 1, whose index is i + 2 */
318 memmove(next, next + 1, (type->cnt - (i + 2)) * sizeof(*next));
318 type->cnt--; 319 type->cnt--;
319 } 320 }
320} 321}
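
The memblock_merge_regions() fix above changes the memmove() length from cnt - (i + 1) to cnt - (i + 2): once regions i and i + 1 have been merged, the entries that still need shifting start at index i + 2, so one element fewer must be copied. A standalone model with a plain array and assumed example sizes makes the off-by-one visible:

    #include <stdio.h>
    #include <string.h>

    int main(void)
    {
        /* Five region sizes; merge regions[1] and regions[2] (i == 1). */
        int regions[5] = { 10, 20, 30, 40, 50 };
        int cnt = 5, i = 1;

        regions[i] += regions[i + 1];           /* this->size += next->size */
        /* Entries to shift start at i + 2: exactly cnt - (i + 2) of them. */
        memmove(&regions[i + 1], &regions[i + 2],
                (cnt - (i + 2)) * sizeof(regions[0]));
        cnt--;

        for (int k = 0; k < cnt; k++)
            printf("%d ", regions[k]);          /* prints: 10 50 40 50 */
        printf("\n");
        return 0;
    }
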
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index dd39ba000b31..fbb60b103e64 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -10,6 +10,10 @@
10 * Copyright (C) 2009 Nokia Corporation 10 * Copyright (C) 2009 Nokia Corporation
11 * Author: Kirill A. Shutemov 11 * Author: Kirill A. Shutemov
12 * 12 *
13 * Kernel Memory Controller
14 * Copyright (C) 2012 Parallels Inc. and Google Inc.
15 * Authors: Glauber Costa and Suleiman Souhlal
16 *
13 * This program is free software; you can redistribute it and/or modify 17 * This program is free software; you can redistribute it and/or modify
14 * it under the terms of the GNU General Public License as published by 18 * it under the terms of the GNU General Public License as published by
15 * the Free Software Foundation; either version 2 of the License, or 19 * the Free Software Foundation; either version 2 of the License, or
@@ -59,6 +63,8 @@
59#include <trace/events/vmscan.h> 63#include <trace/events/vmscan.h>
60 64
61struct cgroup_subsys mem_cgroup_subsys __read_mostly; 65struct cgroup_subsys mem_cgroup_subsys __read_mostly;
66EXPORT_SYMBOL(mem_cgroup_subsys);
67
62#define MEM_CGROUP_RECLAIM_RETRIES 5 68#define MEM_CGROUP_RECLAIM_RETRIES 5
63static struct mem_cgroup *root_mem_cgroup __read_mostly; 69static struct mem_cgroup *root_mem_cgroup __read_mostly;
64 70
@@ -266,6 +272,10 @@ struct mem_cgroup {
266 }; 272 };
267 273
268 /* 274 /*
275 * the counter to account for kernel memory usage.
276 */
277 struct res_counter kmem;
278 /*
269 * Per cgroup active and inactive list, similar to the 279 * Per cgroup active and inactive list, similar to the
270 * per zone LRU lists. 280 * per zone LRU lists.
271 */ 281 */
@@ -280,6 +290,7 @@ struct mem_cgroup {
280 * Should the accounting and control be hierarchical, per subtree? 290 * Should the accounting and control be hierarchical, per subtree?
281 */ 291 */
282 bool use_hierarchy; 292 bool use_hierarchy;
293 unsigned long kmem_account_flags; /* See KMEM_ACCOUNTED_*, below */
283 294
284 bool oom_lock; 295 bool oom_lock;
285 atomic_t under_oom; 296 atomic_t under_oom;
@@ -330,8 +341,61 @@ struct mem_cgroup {
330#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET) 341#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET)
331 struct tcp_memcontrol tcp_mem; 342 struct tcp_memcontrol tcp_mem;
332#endif 343#endif
344#if defined(CONFIG_MEMCG_KMEM)
345 /* analogous to slab_common's slab_caches list. per-memcg */
346 struct list_head memcg_slab_caches;
347 /* Not a spinlock, we can take a lot of time walking the list */
348 struct mutex slab_caches_mutex;
349 /* Index in the kmem_cache->memcg_params->memcg_caches array */
350 int kmemcg_id;
351#endif
333}; 352};
334 353
354/* internal only representation about the status of kmem accounting. */
355enum {
356 KMEM_ACCOUNTED_ACTIVE = 0, /* accounted by this cgroup itself */
357 KMEM_ACCOUNTED_ACTIVATED, /* static key enabled. */
358 KMEM_ACCOUNTED_DEAD, /* dead memcg with pending kmem charges */
359};
360
361/* We account when limit is on, but only after call sites are patched */
362#define KMEM_ACCOUNTED_MASK \
363 ((1 << KMEM_ACCOUNTED_ACTIVE) | (1 << KMEM_ACCOUNTED_ACTIVATED))
364
365#ifdef CONFIG_MEMCG_KMEM
366static inline void memcg_kmem_set_active(struct mem_cgroup *memcg)
367{
368 set_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags);
369}
370
371static bool memcg_kmem_is_active(struct mem_cgroup *memcg)
372{
373 return test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags);
374}
375
376static void memcg_kmem_set_activated(struct mem_cgroup *memcg)
377{
378 set_bit(KMEM_ACCOUNTED_ACTIVATED, &memcg->kmem_account_flags);
379}
380
381static void memcg_kmem_clear_activated(struct mem_cgroup *memcg)
382{
383 clear_bit(KMEM_ACCOUNTED_ACTIVATED, &memcg->kmem_account_flags);
384}
385
386static void memcg_kmem_mark_dead(struct mem_cgroup *memcg)
387{
388 if (test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags))
389 set_bit(KMEM_ACCOUNTED_DEAD, &memcg->kmem_account_flags);
390}
391
392static bool memcg_kmem_test_and_clear_dead(struct mem_cgroup *memcg)
393{
394 return test_and_clear_bit(KMEM_ACCOUNTED_DEAD,
395 &memcg->kmem_account_flags);
396}
397#endif
398
335/* Stuffs for move charges at task migration. */ 399/* Stuffs for move charges at task migration. */
336/* 400/*
337 * Types of charges to be moved. "move_charge_at_immitgrate" is treated as a 401 * Types of charges to be moved. "move_charge_at_immitgrate" is treated as a
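
The three flags above drive the rest of the series: ACTIVATED records that the static key has been (or is being) enabled, ACTIVE that the group should actually be charged, and DEAD that a removed group still holds kmem charges. A small userspace model of the flag test (plain bit operations standing in for the kernel's set_bit/test_bit; can_account_kmem mirrors the mask check in memcg_can_account_kmem() added later in this patch):

    #include <stdbool.h>
    #include <stdio.h>

    enum {
            KMEM_ACCOUNTED_ACTIVE = 0,      /* accounted by this cgroup itself */
            KMEM_ACCOUNTED_ACTIVATED,       /* static key enabled */
            KMEM_ACCOUNTED_DEAD,            /* dead memcg with pending kmem charges */
    };

    #define KMEM_ACCOUNTED_MASK \
            ((1 << KMEM_ACCOUNTED_ACTIVE) | (1 << KMEM_ACCOUNTED_ACTIVATED))

    static bool can_account_kmem(unsigned long flags)
    {
            return flags & KMEM_ACCOUNTED_MASK;
    }

    int main(void)
    {
            unsigned long flags = 0;

            printf("fresh group:   %d\n", can_account_kmem(flags));   /* 0 */
            flags |= 1UL << KMEM_ACCOUNTED_ACTIVATED;  /* limit being set */
            flags |= 1UL << KMEM_ACCOUNTED_ACTIVE;     /* call sites patched */
            printf("limited group: %d\n", can_account_kmem(flags));   /* 1 */
            return 0;
    }
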
@@ -386,9 +450,13 @@ enum charge_type {
386}; 450};
387 451
388/* for encoding cft->private value on file */ 452/* for encoding cft->private value on file */
389#define _MEM (0) 453enum res_type {
390#define _MEMSWAP (1) 454 _MEM,
391#define _OOM_TYPE (2) 455 _MEMSWAP,
456 _OOM_TYPE,
457 _KMEM,
458};
459
392#define MEMFILE_PRIVATE(x, val) ((x) << 16 | (val)) 460#define MEMFILE_PRIVATE(x, val) ((x) << 16 | (val))
393#define MEMFILE_TYPE(val) ((val) >> 16 & 0xffff) 461#define MEMFILE_TYPE(val) ((val) >> 16 & 0xffff)
394#define MEMFILE_ATTR(val) ((val) & 0xffff) 462#define MEMFILE_ATTR(val) ((val) & 0xffff)
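
Since _KMEM is now routed through the same cft->private encoding as the other counters, a quick standalone check of the packing may help; the numeric value used for RES_LIMIT below is a stand-in for illustration, not the kernel constant:

    #include <assert.h>

    enum res_type { _MEM, _MEMSWAP, _OOM_TYPE, _KMEM };

    #define MEMFILE_PRIVATE(x, val) ((x) << 16 | (val))
    #define MEMFILE_TYPE(val)       ((val) >> 16 & 0xffff)
    #define MEMFILE_ATTR(val)       ((val) & 0xffff)

    int main(void)
    {
            int res_limit = 1;      /* stand-in for the kernel's RES_LIMIT */
            int priv = MEMFILE_PRIVATE(_KMEM, res_limit);

            assert(MEMFILE_TYPE(priv) == _KMEM);
            assert(MEMFILE_ATTR(priv) == res_limit);
            return 0;
    }
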
@@ -485,6 +553,75 @@ static void disarm_sock_keys(struct mem_cgroup *memcg)
485} 553}
486#endif 554#endif
487 555
556#ifdef CONFIG_MEMCG_KMEM
557/*
558 * This will be the memcg's index in each cache's ->memcg_params->memcg_caches.
559 * There are two main reasons for not using the css_id for this:
560 * 1) this works better in sparse environments, where we have a lot of memcgs,
 561 * but only a few kmem-limited. For instance, with 200 memcgs where only
 562 * the 200th is kmem-limited, indexing by css_id would still force a
 563 * 200-entry array for that single group.
564 *
565 * 2) In order not to violate the cgroup API, we would like to do all memory
566 * allocation in ->create(). At that point, we haven't yet allocated the
567 * css_id. Having a separate index prevents us from messing with the cgroup
568 * core for this
569 *
570 * The current size of the caches array is stored in
571 * memcg_limited_groups_array_size. It will double each time we have to
572 * increase it.
573 */
574static DEFINE_IDA(kmem_limited_groups);
575int memcg_limited_groups_array_size;
576
577/*
578 * MIN_SIZE is different than 1, because we would like to avoid going through
579 * the alloc/free process all the time. In a small machine, 4 kmem-limited
580 * cgroups is a reasonable guess. In the future, it could be a parameter or
581 * tunable, but that is strictly not necessary.
582 *
583 * MAX_SIZE should be as large as the number of css_ids. Ideally, we could get
584 * this constant directly from cgroup, but it is understandable that this is
585 * better kept as an internal representation in cgroup.c. In any case, the
 586 * css_id space is not getting any smaller, and we don't necessarily have to
587 * increase ours as well if it increases.
588 */
589#define MEMCG_CACHES_MIN_SIZE 4
590#define MEMCG_CACHES_MAX_SIZE 65535
591
592/*
593 * A lot of the calls to the cache allocation functions are expected to be
594 * inlined by the compiler. Since the calls to memcg_kmem_get_cache are
 595 * conditional to this static branch, we'll have to allow modules that do
 596 * kmem_cache_alloc and the like to see this symbol as well
597 */
598struct static_key memcg_kmem_enabled_key;
599EXPORT_SYMBOL(memcg_kmem_enabled_key);
600
601static void disarm_kmem_keys(struct mem_cgroup *memcg)
602{
603 if (memcg_kmem_is_active(memcg)) {
604 static_key_slow_dec(&memcg_kmem_enabled_key);
605 ida_simple_remove(&kmem_limited_groups, memcg->kmemcg_id);
606 }
607 /*
 608 * This check can't live in the kmem destruction function,
609 * since the charges will outlive the cgroup
610 */
611 WARN_ON(res_counter_read_u64(&memcg->kmem, RES_USAGE) != 0);
612}
613#else
614static void disarm_kmem_keys(struct mem_cgroup *memcg)
615{
616}
617#endif /* CONFIG_MEMCG_KMEM */
618
619static void disarm_static_keys(struct mem_cgroup *memcg)
620{
621 disarm_sock_keys(memcg);
622 disarm_kmem_keys(memcg);
623}
624
488static void drain_all_stock_async(struct mem_cgroup *memcg); 625static void drain_all_stock_async(struct mem_cgroup *memcg);
489 626
490static struct mem_cgroup_per_zone * 627static struct mem_cgroup_per_zone *
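
disarm_kmem_keys() is the teardown half of a static-key pair: the key is bumped once per kmem-limited group when the limit is first set (see memcg_update_kmem_limit further down) and dropped here when the group goes away. On the reader side the whole kmem path hides behind a patched branch; the wrapper below is only a sketch of what this series adds to the memcontrol headers, shown to make the arm/disarm pairing concrete:

    #include <linux/types.h>
    #include <linux/jump_label.h>

    extern struct static_key memcg_kmem_enabled_key;

    /* Compiles down to a no-op branch until static_key_slow_inc() has run. */
    static inline bool memcg_kmem_enabled(void)
    {
            return static_key_false(&memcg_kmem_enabled_key);
    }
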
@@ -800,7 +937,7 @@ static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg,
800 int nid; 937 int nid;
801 u64 total = 0; 938 u64 total = 0;
802 939
803 for_each_node_state(nid, N_HIGH_MEMORY) 940 for_each_node_state(nid, N_MEMORY)
804 total += mem_cgroup_node_nr_lru_pages(memcg, nid, lru_mask); 941 total += mem_cgroup_node_nr_lru_pages(memcg, nid, lru_mask);
805 return total; 942 return total;
806} 943}
@@ -1015,13 +1152,10 @@ void mem_cgroup_iter_break(struct mem_cgroup *root,
1015 iter != NULL; \ 1152 iter != NULL; \
1016 iter = mem_cgroup_iter(NULL, iter, NULL)) 1153 iter = mem_cgroup_iter(NULL, iter, NULL))
1017 1154
1018void mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx) 1155void __mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx)
1019{ 1156{
1020 struct mem_cgroup *memcg; 1157 struct mem_cgroup *memcg;
1021 1158
1022 if (!mm)
1023 return;
1024
1025 rcu_read_lock(); 1159 rcu_read_lock();
1026 memcg = mem_cgroup_from_task(rcu_dereference(mm->owner)); 1160 memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
1027 if (unlikely(!memcg)) 1161 if (unlikely(!memcg))
@@ -1040,7 +1174,7 @@ void mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx)
1040out: 1174out:
1041 rcu_read_unlock(); 1175 rcu_read_unlock();
1042} 1176}
1043EXPORT_SYMBOL(mem_cgroup_count_vm_event); 1177EXPORT_SYMBOL(__mem_cgroup_count_vm_event);
1044 1178
1045/** 1179/**
1046 * mem_cgroup_zone_lruvec - get the lru list vector for a zone and memcg 1180 * mem_cgroup_zone_lruvec - get the lru list vector for a zone and memcg
@@ -1454,6 +1588,10 @@ done:
1454 res_counter_read_u64(&memcg->memsw, RES_USAGE) >> 10, 1588 res_counter_read_u64(&memcg->memsw, RES_USAGE) >> 10,
1455 res_counter_read_u64(&memcg->memsw, RES_LIMIT) >> 10, 1589 res_counter_read_u64(&memcg->memsw, RES_LIMIT) >> 10,
1456 res_counter_read_u64(&memcg->memsw, RES_FAILCNT)); 1590 res_counter_read_u64(&memcg->memsw, RES_FAILCNT));
1591 printk(KERN_INFO "kmem: usage %llukB, limit %llukB, failcnt %llu\n",
1592 res_counter_read_u64(&memcg->kmem, RES_USAGE) >> 10,
1593 res_counter_read_u64(&memcg->kmem, RES_LIMIT) >> 10,
1594 res_counter_read_u64(&memcg->kmem, RES_FAILCNT));
1457} 1595}
1458 1596
1459/* 1597/*
@@ -1498,8 +1636,8 @@ static u64 mem_cgroup_get_limit(struct mem_cgroup *memcg)
1498 return limit; 1636 return limit;
1499} 1637}
1500 1638
1501void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, 1639static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
1502 int order) 1640 int order)
1503{ 1641{
1504 struct mem_cgroup *iter; 1642 struct mem_cgroup *iter;
1505 unsigned long chosen_points = 0; 1643 unsigned long chosen_points = 0;
@@ -1644,9 +1782,9 @@ static void mem_cgroup_may_update_nodemask(struct mem_cgroup *memcg)
1644 return; 1782 return;
1645 1783
1646 /* make a nodemask where this memcg uses memory from */ 1784 /* make a nodemask where this memcg uses memory from */
1647 memcg->scan_nodes = node_states[N_HIGH_MEMORY]; 1785 memcg->scan_nodes = node_states[N_MEMORY];
1648 1786
1649 for_each_node_mask(nid, node_states[N_HIGH_MEMORY]) { 1787 for_each_node_mask(nid, node_states[N_MEMORY]) {
1650 1788
1651 if (!test_mem_cgroup_node_reclaimable(memcg, nid, false)) 1789 if (!test_mem_cgroup_node_reclaimable(memcg, nid, false))
1652 node_clear(nid, memcg->scan_nodes); 1790 node_clear(nid, memcg->scan_nodes);
@@ -1717,7 +1855,7 @@ static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap)
1717 /* 1855 /*
1718 * Check rest of nodes. 1856 * Check rest of nodes.
1719 */ 1857 */
1720 for_each_node_state(nid, N_HIGH_MEMORY) { 1858 for_each_node_state(nid, N_MEMORY) {
1721 if (node_isset(nid, memcg->scan_nodes)) 1859 if (node_isset(nid, memcg->scan_nodes))
1722 continue; 1860 continue;
1723 if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap)) 1861 if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap))
@@ -2061,20 +2199,28 @@ struct memcg_stock_pcp {
2061static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock); 2199static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock);
2062static DEFINE_MUTEX(percpu_charge_mutex); 2200static DEFINE_MUTEX(percpu_charge_mutex);
2063 2201
2064/* 2202/**
2065 * Try to consume stocked charge on this cpu. If success, one page is consumed 2203 * consume_stock: Try to consume stocked charge on this cpu.
2066 * from local stock and true is returned. If the stock is 0 or charges from a 2204 * @memcg: memcg to consume from.
2067 * cgroup which is not current target, returns false. This stock will be 2205 * @nr_pages: how many pages to charge.
2068 * refilled. 2206 *
2207 * The charges will only happen if @memcg matches the current cpu's memcg
2208 * stock, and at least @nr_pages are available in that stock. Failure to
2209 * service an allocation will refill the stock.
2210 *
2211 * returns true if successful, false otherwise.
2069 */ 2212 */
2070static bool consume_stock(struct mem_cgroup *memcg) 2213static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
2071{ 2214{
2072 struct memcg_stock_pcp *stock; 2215 struct memcg_stock_pcp *stock;
2073 bool ret = true; 2216 bool ret = true;
2074 2217
2218 if (nr_pages > CHARGE_BATCH)
2219 return false;
2220
2075 stock = &get_cpu_var(memcg_stock); 2221 stock = &get_cpu_var(memcg_stock);
2076 if (memcg == stock->cached && stock->nr_pages) 2222 if (memcg == stock->cached && stock->nr_pages >= nr_pages)
2077 stock->nr_pages--; 2223 stock->nr_pages -= nr_pages;
2078 else /* need to call res_counter_charge */ 2224 else /* need to call res_counter_charge */
2079 ret = false; 2225 ret = false;
2080 put_cpu_var(memcg_stock); 2226 put_cpu_var(memcg_stock);
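
The behavioural change is that the per-cpu stock can now satisfy any request up to the batch size, not just single pages, which is what lets the kmem and THP charge paths reuse it below. A userspace model of the new logic (the CHARGE_BATCH value and the struct are stand-ins, not the kernel definitions):

    #include <assert.h>
    #include <stdbool.h>

    #define CHARGE_BATCH 32UL       /* stand-in for the kernel's batch size */

    struct stock { const void *cached; unsigned long nr_pages; };

    /* Any request up to the batch size may now be served from the stock. */
    static bool consume_stock(struct stock *s, const void *memcg,
                              unsigned long nr_pages)
    {
            if (nr_pages > CHARGE_BATCH)
                    return false;
            if (s->cached != memcg || s->nr_pages < nr_pages)
                    return false;
            s->nr_pages -= nr_pages;
            return true;
    }

    int main(void)
    {
            int memcg;                              /* identity only */
            struct stock s = { &memcg, 20 };

            assert(consume_stock(&s, &memcg, 8));   /* multi-page hit */
            assert(s.nr_pages == 12);
            assert(!consume_stock(&s, &memcg, 64)); /* larger than the batch */
            return 0;
    }
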
@@ -2251,7 +2397,8 @@ enum {
2251}; 2397};
2252 2398
2253static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, 2399static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
2254 unsigned int nr_pages, bool oom_check) 2400 unsigned int nr_pages, unsigned int min_pages,
2401 bool oom_check)
2255{ 2402{
2256 unsigned long csize = nr_pages * PAGE_SIZE; 2403 unsigned long csize = nr_pages * PAGE_SIZE;
2257 struct mem_cgroup *mem_over_limit; 2404 struct mem_cgroup *mem_over_limit;
@@ -2274,18 +2421,18 @@ static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
2274 } else 2421 } else
2275 mem_over_limit = mem_cgroup_from_res_counter(fail_res, res); 2422 mem_over_limit = mem_cgroup_from_res_counter(fail_res, res);
2276 /* 2423 /*
2277 * nr_pages can be either a huge page (HPAGE_PMD_NR), a batch
2278 * of regular pages (CHARGE_BATCH), or a single regular page (1).
2279 *
2280 * Never reclaim on behalf of optional batching, retry with a 2424 * Never reclaim on behalf of optional batching, retry with a
2281 * single page instead. 2425 * single page instead.
2282 */ 2426 */
2283 if (nr_pages == CHARGE_BATCH) 2427 if (nr_pages > min_pages)
2284 return CHARGE_RETRY; 2428 return CHARGE_RETRY;
2285 2429
2286 if (!(gfp_mask & __GFP_WAIT)) 2430 if (!(gfp_mask & __GFP_WAIT))
2287 return CHARGE_WOULDBLOCK; 2431 return CHARGE_WOULDBLOCK;
2288 2432
2433 if (gfp_mask & __GFP_NORETRY)
2434 return CHARGE_NOMEM;
2435
2289 ret = mem_cgroup_reclaim(mem_over_limit, gfp_mask, flags); 2436 ret = mem_cgroup_reclaim(mem_over_limit, gfp_mask, flags);
2290 if (mem_cgroup_margin(mem_over_limit) >= nr_pages) 2437 if (mem_cgroup_margin(mem_over_limit) >= nr_pages)
2291 return CHARGE_RETRY; 2438 return CHARGE_RETRY;
@@ -2298,7 +2445,7 @@ static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
2298 * unlikely to succeed so close to the limit, and we fall back 2445 * unlikely to succeed so close to the limit, and we fall back
2299 * to regular pages anyway in case of failure. 2446 * to regular pages anyway in case of failure.
2300 */ 2447 */
2301 if (nr_pages == 1 && ret) 2448 if (nr_pages <= (1 << PAGE_ALLOC_COSTLY_ORDER) && ret)
2302 return CHARGE_RETRY; 2449 return CHARGE_RETRY;
2303 2450
2304 /* 2451 /*
@@ -2370,10 +2517,9 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
2370again: 2517again:
2371 if (*ptr) { /* css should be a valid one */ 2518 if (*ptr) { /* css should be a valid one */
2372 memcg = *ptr; 2519 memcg = *ptr;
2373 VM_BUG_ON(css_is_removed(&memcg->css));
2374 if (mem_cgroup_is_root(memcg)) 2520 if (mem_cgroup_is_root(memcg))
2375 goto done; 2521 goto done;
2376 if (nr_pages == 1 && consume_stock(memcg)) 2522 if (consume_stock(memcg, nr_pages))
2377 goto done; 2523 goto done;
2378 css_get(&memcg->css); 2524 css_get(&memcg->css);
2379 } else { 2525 } else {
@@ -2398,7 +2544,7 @@ again:
2398 rcu_read_unlock(); 2544 rcu_read_unlock();
2399 goto done; 2545 goto done;
2400 } 2546 }
2401 if (nr_pages == 1 && consume_stock(memcg)) { 2547 if (consume_stock(memcg, nr_pages)) {
2402 /* 2548 /*
2403 * It seems dangerous to access memcg without css_get(). 2549 * It seems dangerous to access memcg without css_get().
2404 * But considering how consume_stock works, it's not 2550 * But considering how consume_stock works, it's not
@@ -2433,7 +2579,8 @@ again:
2433 nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES; 2579 nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;
2434 } 2580 }
2435 2581
2436 ret = mem_cgroup_do_charge(memcg, gfp_mask, batch, oom_check); 2582 ret = mem_cgroup_do_charge(memcg, gfp_mask, batch, nr_pages,
2583 oom_check);
2437 switch (ret) { 2584 switch (ret) {
2438 case CHARGE_OK: 2585 case CHARGE_OK:
2439 break; 2586 break;
@@ -2510,9 +2657,9 @@ static void __mem_cgroup_cancel_local_charge(struct mem_cgroup *memcg,
2510 2657
2511/* 2658/*
2512 * A helper function to get mem_cgroup from ID. Must be called under 2659 * A helper function to get mem_cgroup from ID. Must be called under
2513 * rcu_read_lock(). The caller must check css_is_removed() or some if 2660 * rcu_read_lock(). The caller is responsible for calling css_tryget if
2514 * it's concern. (dropping refcnt from swap can be called against removed 2661 * the mem_cgroup is used for charging. (dropping refcnt from swap can be
2515 * memcg.) 2662 * called against removed memcg.)
2516 */ 2663 */
2517static struct mem_cgroup *mem_cgroup_lookup(unsigned short id) 2664static struct mem_cgroup *mem_cgroup_lookup(unsigned short id)
2518{ 2665{
@@ -2626,6 +2773,768 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
2626 memcg_check_events(memcg, page); 2773 memcg_check_events(memcg, page);
2627} 2774}
2628 2775
2776static DEFINE_MUTEX(set_limit_mutex);
2777
2778#ifdef CONFIG_MEMCG_KMEM
2779static inline bool memcg_can_account_kmem(struct mem_cgroup *memcg)
2780{
2781 return !mem_cgroup_disabled() && !mem_cgroup_is_root(memcg) &&
2782 (memcg->kmem_account_flags & KMEM_ACCOUNTED_MASK);
2783}
2784
2785/*
2786 * This is a bit cumbersome, but it is rarely used and avoids a backpointer
2787 * in the memcg_cache_params struct.
2788 */
2789static struct kmem_cache *memcg_params_to_cache(struct memcg_cache_params *p)
2790{
2791 struct kmem_cache *cachep;
2792
2793 VM_BUG_ON(p->is_root_cache);
2794 cachep = p->root_cache;
2795 return cachep->memcg_params->memcg_caches[memcg_cache_id(p->memcg)];
2796}
2797
2798#ifdef CONFIG_SLABINFO
2799static int mem_cgroup_slabinfo_read(struct cgroup *cont, struct cftype *cft,
2800 struct seq_file *m)
2801{
2802 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
2803 struct memcg_cache_params *params;
2804
2805 if (!memcg_can_account_kmem(memcg))
2806 return -EIO;
2807
2808 print_slabinfo_header(m);
2809
2810 mutex_lock(&memcg->slab_caches_mutex);
2811 list_for_each_entry(params, &memcg->memcg_slab_caches, list)
2812 cache_show(memcg_params_to_cache(params), m);
2813 mutex_unlock(&memcg->slab_caches_mutex);
2814
2815 return 0;
2816}
2817#endif
2818
2819static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, u64 size)
2820{
2821 struct res_counter *fail_res;
2822 struct mem_cgroup *_memcg;
2823 int ret = 0;
2824 bool may_oom;
2825
2826 ret = res_counter_charge(&memcg->kmem, size, &fail_res);
2827 if (ret)
2828 return ret;
2829
2830 /*
2831 * Conditions under which we can wait for the oom_killer. Those are
2832 * the same conditions tested by the core page allocator
2833 */
2834 may_oom = (gfp & __GFP_FS) && !(gfp & __GFP_NORETRY);
2835
2836 _memcg = memcg;
2837 ret = __mem_cgroup_try_charge(NULL, gfp, size >> PAGE_SHIFT,
2838 &_memcg, may_oom);
2839
2840 if (ret == -EINTR) {
2841 /*
2842 * __mem_cgroup_try_charge() chose to bypass to root due to
2843 * OOM kill or fatal signal. Since our only options are to
2844 * either fail the allocation or charge it to this cgroup, do
2845 * it as a temporary condition. But we can't fail. From a
2846 * kmem/slab perspective, the cache has already been selected,
2847 * by mem_cgroup_kmem_get_cache(), so it is too late to change
2848 * our minds.
2849 *
2850 * This condition will only trigger if the task entered
2851 * memcg_charge_kmem in a sane state, but was OOM-killed during
2852 * __mem_cgroup_try_charge() above. Tasks that were already
2853 * dying when the allocation triggers should have been already
2854 * directed to the root cgroup in memcontrol.h
2855 */
2856 res_counter_charge_nofail(&memcg->res, size, &fail_res);
2857 if (do_swap_account)
2858 res_counter_charge_nofail(&memcg->memsw, size,
2859 &fail_res);
2860 ret = 0;
2861 } else if (ret)
2862 res_counter_uncharge(&memcg->kmem, size);
2863
2864 return ret;
2865}
2866
2867static void memcg_uncharge_kmem(struct mem_cgroup *memcg, u64 size)
2868{
2869 res_counter_uncharge(&memcg->res, size);
2870 if (do_swap_account)
2871 res_counter_uncharge(&memcg->memsw, size);
2872
2873 /* Not down to 0 */
2874 if (res_counter_uncharge(&memcg->kmem, size))
2875 return;
2876
2877 if (memcg_kmem_test_and_clear_dead(memcg))
2878 mem_cgroup_put(memcg);
2879}
2880
2881void memcg_cache_list_add(struct mem_cgroup *memcg, struct kmem_cache *cachep)
2882{
2883 if (!memcg)
2884 return;
2885
2886 mutex_lock(&memcg->slab_caches_mutex);
2887 list_add(&cachep->memcg_params->list, &memcg->memcg_slab_caches);
2888 mutex_unlock(&memcg->slab_caches_mutex);
2889}
2890
2891/*
2892 * helper for accessing a memcg's index. It will be used as an index in the
2893 * child cache array in kmem_cache, and also to derive its name. This function
2894 * will return -1 when this is not a kmem-limited memcg.
2895 */
2896int memcg_cache_id(struct mem_cgroup *memcg)
2897{
2898 return memcg ? memcg->kmemcg_id : -1;
2899}
2900
2901/*
2902 * This ends up being protected by the set_limit mutex, during normal
2903 * operation, because that is its main call site.
2904 *
2905 * But when we create a new cache, we can call this as well if its parent
2906 * is kmem-limited. That will have to hold set_limit_mutex as well.
2907 */
2908int memcg_update_cache_sizes(struct mem_cgroup *memcg)
2909{
2910 int num, ret;
2911
2912 num = ida_simple_get(&kmem_limited_groups,
2913 0, MEMCG_CACHES_MAX_SIZE, GFP_KERNEL);
2914 if (num < 0)
2915 return num;
2916 /*
2917 * After this point, kmem_accounted (which we test atomically at
2918 * the beginning of this conditional) is no longer 0. This
2919 * guarantees only one process will set the following boolean
2920 * to true. We don't need test_and_set because we're protected
2921 * by the set_limit_mutex anyway.
2922 */
2923 memcg_kmem_set_activated(memcg);
2924
2925 ret = memcg_update_all_caches(num+1);
2926 if (ret) {
2927 ida_simple_remove(&kmem_limited_groups, num);
2928 memcg_kmem_clear_activated(memcg);
2929 return ret;
2930 }
2931
2932 memcg->kmemcg_id = num;
2933 INIT_LIST_HEAD(&memcg->memcg_slab_caches);
2934 mutex_init(&memcg->slab_caches_mutex);
2935 return 0;
2936}
2937
2938static size_t memcg_caches_array_size(int num_groups)
2939{
2940 ssize_t size;
2941 if (num_groups <= 0)
2942 return 0;
2943
2944 size = 2 * num_groups;
2945 if (size < MEMCG_CACHES_MIN_SIZE)
2946 size = MEMCG_CACHES_MIN_SIZE;
2947 else if (size > MEMCG_CACHES_MAX_SIZE)
2948 size = MEMCG_CACHES_MAX_SIZE;
2949
2950 return size;
2951}
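
The sizing policy here is simple doubling clamped to the two bounds defined above; a standalone check of the same arithmetic, extracted only for illustration:

    #include <assert.h>
    #include <sys/types.h>

    #define MEMCG_CACHES_MIN_SIZE 4
    #define MEMCG_CACHES_MAX_SIZE 65535

    /* Doubling clamped to the bounds, as in memcg_caches_array_size() above. */
    static ssize_t caches_array_size(int num_groups)
    {
            ssize_t size;

            if (num_groups <= 0)
                    return 0;
            size = 2 * num_groups;
            if (size < MEMCG_CACHES_MIN_SIZE)
                    size = MEMCG_CACHES_MIN_SIZE;
            else if (size > MEMCG_CACHES_MAX_SIZE)
                    size = MEMCG_CACHES_MAX_SIZE;
            return size;
    }

    int main(void)
    {
            assert(caches_array_size(1) == 4);          /* clamped up to MIN */
            assert(caches_array_size(100) == 200);      /* plain doubling */
            assert(caches_array_size(40000) == 65535);  /* clamped to MAX */
            return 0;
    }
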
2952
2953/*
2954 * We should update the current array size iff all cache updates succeed. This
2955 * can only be done from the slab side. The slab mutex needs to be held when
2956 * calling this.
2957 */
2958void memcg_update_array_size(int num)
2959{
2960 if (num > memcg_limited_groups_array_size)
2961 memcg_limited_groups_array_size = memcg_caches_array_size(num);
2962}
2963
2964int memcg_update_cache_size(struct kmem_cache *s, int num_groups)
2965{
2966 struct memcg_cache_params *cur_params = s->memcg_params;
2967
2968 VM_BUG_ON(s->memcg_params && !s->memcg_params->is_root_cache);
2969
2970 if (num_groups > memcg_limited_groups_array_size) {
2971 int i;
2972 ssize_t size = memcg_caches_array_size(num_groups);
2973
2974 size *= sizeof(void *);
2975 size += sizeof(struct memcg_cache_params);
2976
2977 s->memcg_params = kzalloc(size, GFP_KERNEL);
2978 if (!s->memcg_params) {
2979 s->memcg_params = cur_params;
2980 return -ENOMEM;
2981 }
2982
2983 s->memcg_params->is_root_cache = true;
2984
2985 /*
2986 * There is the chance it will be bigger than
2987 * memcg_limited_groups_array_size, if we failed an allocation
2988 * in a cache, in which case all caches updated before it will
2989 * have a bigger array.
2990 *
2991 * But if that is the case, the data after
2992 * memcg_limited_groups_array_size is certainly unused
2993 */
2994 for (i = 0; i < memcg_limited_groups_array_size; i++) {
2995 if (!cur_params->memcg_caches[i])
2996 continue;
2997 s->memcg_params->memcg_caches[i] =
2998 cur_params->memcg_caches[i];
2999 }
3000
3001 /*
3002 * Ideally, we would wait until all caches succeed, and only
3003 * then free the old one. But this is not worth the extra
3004 * pointer per-cache we'd have to have for this.
3005 *
3006 * It is not a big deal if some caches are left with a size
3007 * bigger than the others. And all updates will reset this
3008 * anyway.
3009 */
3010 kfree(cur_params);
3011 }
3012 return 0;
3013}
3014
3015int memcg_register_cache(struct mem_cgroup *memcg, struct kmem_cache *s,
3016 struct kmem_cache *root_cache)
3017{
3018 size_t size = sizeof(struct memcg_cache_params);
3019
3020 if (!memcg_kmem_enabled())
3021 return 0;
3022
3023 if (!memcg)
3024 size += memcg_limited_groups_array_size * sizeof(void *);
3025
3026 s->memcg_params = kzalloc(size, GFP_KERNEL);
3027 if (!s->memcg_params)
3028 return -ENOMEM;
3029
3030 if (memcg) {
3031 s->memcg_params->memcg = memcg;
3032 s->memcg_params->root_cache = root_cache;
3033 } else
3034 s->memcg_params->is_root_cache = true;
3035
3036 return 0;
3037}
3038
3039void memcg_release_cache(struct kmem_cache *s)
3040{
3041 struct kmem_cache *root;
3042 struct mem_cgroup *memcg;
3043 int id;
3044
3045 /*
3046 * This happens, for instance, when a root cache goes away before we
3047 * add any memcg.
3048 */
3049 if (!s->memcg_params)
3050 return;
3051
3052 if (s->memcg_params->is_root_cache)
3053 goto out;
3054
3055 memcg = s->memcg_params->memcg;
3056 id = memcg_cache_id(memcg);
3057
3058 root = s->memcg_params->root_cache;
3059 root->memcg_params->memcg_caches[id] = NULL;
3060 mem_cgroup_put(memcg);
3061
3062 mutex_lock(&memcg->slab_caches_mutex);
3063 list_del(&s->memcg_params->list);
3064 mutex_unlock(&memcg->slab_caches_mutex);
3065
3066out:
3067 kfree(s->memcg_params);
3068}
3069
3070/*
3071 * During the creation a new cache, we need to disable our accounting mechanism
3072 * altogether. This is true even if we are not creating, but rather just
3073 * enqueuing new caches to be created.
3074 *
3075 * This is because that process will trigger allocations; some visible, like
3076 * explicit kmallocs to auxiliary data structures, name strings and internal
3077 * cache structures; some well concealed, like INIT_WORK() that can allocate
3078 * objects during debug.
3079 *
3080 * If any allocation happens during memcg_kmem_get_cache, we will recurse back
3081 * to it. This may not be a bounded recursion: since the first cache creation
3082 * failed to complete (waiting on the allocation), we'll just try to create the
3083 * cache again, failing at the same point.
3084 *
3085 * memcg_kmem_get_cache is prepared to abort after seeing a positive count of
3086 * memcg_kmem_skip_account. So we enclose anything that might allocate memory
3087 * inside the following two functions.
3088 */
3089static inline void memcg_stop_kmem_account(void)
3090{
3091 VM_BUG_ON(!current->mm);
3092 current->memcg_kmem_skip_account++;
3093}
3094
3095static inline void memcg_resume_kmem_account(void)
3096{
3097 VM_BUG_ON(!current->mm);
3098 current->memcg_kmem_skip_account--;
3099}
3100
3101static void kmem_cache_destroy_work_func(struct work_struct *w)
3102{
3103 struct kmem_cache *cachep;
3104 struct memcg_cache_params *p;
3105
3106 p = container_of(w, struct memcg_cache_params, destroy);
3107
3108 cachep = memcg_params_to_cache(p);
3109
3110 /*
3111 * If we get down to 0 after shrink, we could delete right away.
3112 * However, memcg_release_pages() already puts us back in the workqueue
3113 * in that case. If we proceed deleting, we'll get a dangling
3114 * reference, and removing the object from the workqueue in that case
3115 * is unnecessary complication. We are not a fast path.
3116 *
3117 * Note that this case is fundamentally different from racing with
3118 * shrink_slab(): if memcg_cgroup_destroy_cache() is called in
3119 * kmem_cache_shrink, not only we would be reinserting a dead cache
3120 * into the queue, but doing so from inside the worker racing to
3121 * destroy it.
3122 *
3123 * So if we aren't down to zero, we'll just schedule a worker and try
3124 * again
3125 */
3126 if (atomic_read(&cachep->memcg_params->nr_pages) != 0) {
3127 kmem_cache_shrink(cachep);
3128 if (atomic_read(&cachep->memcg_params->nr_pages) == 0)
3129 return;
3130 } else
3131 kmem_cache_destroy(cachep);
3132}
3133
3134void mem_cgroup_destroy_cache(struct kmem_cache *cachep)
3135{
3136 if (!cachep->memcg_params->dead)
3137 return;
3138
3139 /*
3140 * There are many ways in which we can get here.
3141 *
3142 * We can get to a memory-pressure situation while the delayed work is
3143 * still pending to run. The vmscan shrinkers can then release all
3144 * cache memory and get us to destruction. If this is the case, we'll
3145 * be executed twice, which is a bug (the second time will execute over
3146 * bogus data). In this case, cancelling the work should be fine.
3147 *
3148 * But we can also get here from the worker itself, if
3149 * kmem_cache_shrink is enough to shake all the remaining objects and
3150 * get the page count to 0. In this case, we'll deadlock if we try to
3151 * cancel the work (the worker runs with an internal lock held, which
3152 * is the same lock we would hold for cancel_work_sync().)
3153 *
3154 * Since we can't possibly know who got us here, just refrain from
3155 * running if there is already work pending
3156 */
3157 if (work_pending(&cachep->memcg_params->destroy))
3158 return;
3159 /*
3160 * We have to defer the actual destroying to a workqueue, because
3161 * we might currently be in a context that cannot sleep.
3162 */
3163 schedule_work(&cachep->memcg_params->destroy);
3164}
3165
3166static char *memcg_cache_name(struct mem_cgroup *memcg, struct kmem_cache *s)
3167{
3168 char *name;
3169 struct dentry *dentry;
3170
3171 rcu_read_lock();
3172 dentry = rcu_dereference(memcg->css.cgroup->dentry);
3173 rcu_read_unlock();
3174
3175 BUG_ON(dentry == NULL);
3176
3177 name = kasprintf(GFP_KERNEL, "%s(%d:%s)", s->name,
3178 memcg_cache_id(memcg), dentry->d_name.name);
3179
3180 return name;
3181}
3182
3183static struct kmem_cache *kmem_cache_dup(struct mem_cgroup *memcg,
3184 struct kmem_cache *s)
3185{
3186 char *name;
3187 struct kmem_cache *new;
3188
3189 name = memcg_cache_name(memcg, s);
3190 if (!name)
3191 return NULL;
3192
3193 new = kmem_cache_create_memcg(memcg, name, s->object_size, s->align,
3194 (s->flags & ~SLAB_PANIC), s->ctor, s);
3195
3196 if (new)
3197 new->allocflags |= __GFP_KMEMCG;
3198
3199 kfree(name);
3200 return new;
3201}
3202
3203/*
3204 * This lock protects updaters, not readers. We want readers to be as fast as
3205 * they can, and they will either see NULL or a valid cache value. Our model
3206 * allows them to see NULL, in which case the root memcg will be selected.
3207 *
3208 * We need this lock because multiple concurrent allocations to the same cache
3209 * can span more than one worker. Only one of them can create the cache.
3210 */
3211static DEFINE_MUTEX(memcg_cache_mutex);
3212static struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg,
3213 struct kmem_cache *cachep)
3214{
3215 struct kmem_cache *new_cachep;
3216 int idx;
3217
3218 BUG_ON(!memcg_can_account_kmem(memcg));
3219
3220 idx = memcg_cache_id(memcg);
3221
3222 mutex_lock(&memcg_cache_mutex);
3223 new_cachep = cachep->memcg_params->memcg_caches[idx];
3224 if (new_cachep)
3225 goto out;
3226
3227 new_cachep = kmem_cache_dup(memcg, cachep);
3228 if (new_cachep == NULL) {
3229 new_cachep = cachep;
3230 goto out;
3231 }
3232
3233 mem_cgroup_get(memcg);
3234 atomic_set(&new_cachep->memcg_params->nr_pages , 0);
3235
3236 cachep->memcg_params->memcg_caches[idx] = new_cachep;
3237 /*
3238 * the readers won't lock, make sure everybody sees the updated value,
3239 * so they won't put stuff in the queue again for no reason
3240 */
3241 wmb();
3242out:
3243 mutex_unlock(&memcg_cache_mutex);
3244 return new_cachep;
3245}
3246
3247void kmem_cache_destroy_memcg_children(struct kmem_cache *s)
3248{
3249 struct kmem_cache *c;
3250 int i;
3251
3252 if (!s->memcg_params)
3253 return;
3254 if (!s->memcg_params->is_root_cache)
3255 return;
3256
3257 /*
3258 * If the cache is being destroyed, we trust that there is no one else
3259 * requesting objects from it. Even if there are, the sanity checks in
3260 * kmem_cache_destroy should catch this ill case.
3261 *
3262 * Still, we don't want anyone else freeing memcg_caches under our
3263 * noses, which can happen if a new memcg comes to life. As usual,
3264 * we'll take the set_limit_mutex to protect ourselves against this.
3265 */
3266 mutex_lock(&set_limit_mutex);
3267 for (i = 0; i < memcg_limited_groups_array_size; i++) {
3268 c = s->memcg_params->memcg_caches[i];
3269 if (!c)
3270 continue;
3271
3272 /*
3273 * We will now manually delete the caches, so to avoid races
3274 * we need to cancel all pending destruction workers and
3275 * proceed with destruction ourselves.
3276 *
3277 * kmem_cache_destroy() will call kmem_cache_shrink internally,
3278 * and that could spawn the workers again: it is likely that
3279 * the cache still has active pages until this very moment.
3280 * This would lead us back to mem_cgroup_destroy_cache.
3281 *
3282 * But that will not execute at all if the "dead" flag is not
3283 * set, so flip it down to guarantee we are in control.
3284 */
3285 c->memcg_params->dead = false;
3286 cancel_work_sync(&c->memcg_params->destroy);
3287 kmem_cache_destroy(c);
3288 }
3289 mutex_unlock(&set_limit_mutex);
3290}
3291
3292struct create_work {
3293 struct mem_cgroup *memcg;
3294 struct kmem_cache *cachep;
3295 struct work_struct work;
3296};
3297
3298static void mem_cgroup_destroy_all_caches(struct mem_cgroup *memcg)
3299{
3300 struct kmem_cache *cachep;
3301 struct memcg_cache_params *params;
3302
3303 if (!memcg_kmem_is_active(memcg))
3304 return;
3305
3306 mutex_lock(&memcg->slab_caches_mutex);
3307 list_for_each_entry(params, &memcg->memcg_slab_caches, list) {
3308 cachep = memcg_params_to_cache(params);
3309 cachep->memcg_params->dead = true;
3310 INIT_WORK(&cachep->memcg_params->destroy,
3311 kmem_cache_destroy_work_func);
3312 schedule_work(&cachep->memcg_params->destroy);
3313 }
3314 mutex_unlock(&memcg->slab_caches_mutex);
3315}
3316
3317static void memcg_create_cache_work_func(struct work_struct *w)
3318{
3319 struct create_work *cw;
3320
3321 cw = container_of(w, struct create_work, work);
3322 memcg_create_kmem_cache(cw->memcg, cw->cachep);
3323 /* Drop the reference gotten when we enqueued. */
3324 css_put(&cw->memcg->css);
3325 kfree(cw);
3326}
3327
3328/*
3329 * Enqueue the creation of a per-memcg kmem_cache.
3330 * Called with rcu_read_lock.
3331 */
3332static void __memcg_create_cache_enqueue(struct mem_cgroup *memcg,
3333 struct kmem_cache *cachep)
3334{
3335 struct create_work *cw;
3336
3337 cw = kmalloc(sizeof(struct create_work), GFP_NOWAIT);
3338 if (cw == NULL)
3339 return;
3340
3341 /* The corresponding put will be done in the workqueue. */
3342 if (!css_tryget(&memcg->css)) {
3343 kfree(cw);
3344 return;
3345 }
3346
3347 cw->memcg = memcg;
3348 cw->cachep = cachep;
3349
3350 INIT_WORK(&cw->work, memcg_create_cache_work_func);
3351 schedule_work(&cw->work);
3352}
3353
3354static void memcg_create_cache_enqueue(struct mem_cgroup *memcg,
3355 struct kmem_cache *cachep)
3356{
3357 /*
3358 * We need to stop accounting when we kmalloc, because if the
3359 * corresponding kmalloc cache is not yet created, the first allocation
3360 * in __memcg_create_cache_enqueue will recurse.
3361 *
3362 * However, it is better to enclose the whole function. Depending on
3363 * the debugging options enabled, INIT_WORK(), for instance, can
3364 * trigger an allocation. This too, will make us recurse. Because at
3365 * this point we can't allow ourselves back into memcg_kmem_get_cache,
3366 * the safest choice is to do it like this, wrapping the whole function.
3367 */
3368 memcg_stop_kmem_account();
3369 __memcg_create_cache_enqueue(memcg, cachep);
3370 memcg_resume_kmem_account();
3371}
3372/*
3373 * Return the kmem_cache we're supposed to use for a slab allocation.
3374 * We try to use the current memcg's version of the cache.
3375 *
3376 * If the cache does not exist yet, if we are the first user of it,
3377 * we either create it immediately, if possible, or create it asynchronously
3378 * in a workqueue.
3379 * In the latter case, we will let the current allocation go through with
3380 * the original cache.
3381 *
3382 * Can't be called in interrupt context or from kernel threads.
3383 * This function needs to be called with rcu_read_lock() held.
3384 */
3385struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep,
3386 gfp_t gfp)
3387{
3388 struct mem_cgroup *memcg;
3389 int idx;
3390
3391 VM_BUG_ON(!cachep->memcg_params);
3392 VM_BUG_ON(!cachep->memcg_params->is_root_cache);
3393
3394 if (!current->mm || current->memcg_kmem_skip_account)
3395 return cachep;
3396
3397 rcu_read_lock();
3398 memcg = mem_cgroup_from_task(rcu_dereference(current->mm->owner));
3399 rcu_read_unlock();
3400
3401 if (!memcg_can_account_kmem(memcg))
3402 return cachep;
3403
3404 idx = memcg_cache_id(memcg);
3405
3406 /*
3407 * barrier to make sure we're always seeing the up-to-date value. The
3408 * code updating memcg_caches will issue a write barrier to match this.
3409 */
3410 read_barrier_depends();
3411 if (unlikely(cachep->memcg_params->memcg_caches[idx] == NULL)) {
3412 /*
3413 * If we are in a safe context (can wait, and not in interrupt
3414 * context), we could be predictable and return right away.
3415 * This would guarantee that the allocation being performed
3416 * already belongs in the new cache.
3417 *
3418 * However, there are some clashes that can arrive from locking.
3419 * For instance, because we acquire the slab_mutex while doing
3420 * kmem_cache_dup, this means no further allocation could happen
3421 * with the slab_mutex held.
3422 *
3423 * Also, because cache creation issue get_online_cpus(), this
3424 * creates a lock chain: memcg_slab_mutex -> cpu_hotplug_mutex,
3425 * that ends up reversed during cpu hotplug. (cpuset allocates
3426 * a bunch of GFP_KERNEL memory during cpuup). Due to all that,
3427 * better to defer everything.
3428 */
3429 memcg_create_cache_enqueue(memcg, cachep);
3430 return cachep;
3431 }
3432
3433 return cachep->memcg_params->memcg_caches[idx];
3434}
3435EXPORT_SYMBOL(__memcg_kmem_get_cache);
3436
3437/*
3438 * We need to verify if the allocation against current->mm->owner's memcg is
3439 * possible for the given order. But the page is not allocated yet, so we'll
3440 * need a further commit step to do the final arrangements.
3441 *
3442 * It is possible for the task to switch cgroups in the meantime, so at
3443 * commit time, we can't rely on task conversion any longer. We'll then use
3444 * the handle argument to return to the caller which cgroup we should commit
3445 * against. We could also return the memcg directly and avoid the pointer
3446 * passing, but a boolean return value gives better semantics considering
3447 * the compiled-out case as well.
3448 *
3449 * Returning true means the allocation is possible.
3450 */
3451bool
3452__memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **_memcg, int order)
3453{
3454 struct mem_cgroup *memcg;
3455 int ret;
3456
3457 *_memcg = NULL;
3458 memcg = try_get_mem_cgroup_from_mm(current->mm);
3459
3460 /*
3461 * very rare case described in mem_cgroup_from_task. Unfortunately there
3462 * isn't much we can do without complicating this too much, and it would
3463 * be gfp-dependent anyway. Just let it go
3464 */
3465 if (unlikely(!memcg))
3466 return true;
3467
3468 if (!memcg_can_account_kmem(memcg)) {
3469 css_put(&memcg->css);
3470 return true;
3471 }
3472
3473 ret = memcg_charge_kmem(memcg, gfp, PAGE_SIZE << order);
3474 if (!ret)
3475 *_memcg = memcg;
3476
3477 css_put(&memcg->css);
3478 return (ret == 0);
3479}
3480
3481void __memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg,
3482 int order)
3483{
3484 struct page_cgroup *pc;
3485
3486 VM_BUG_ON(mem_cgroup_is_root(memcg));
3487
3488 /* The page allocation failed. Revert */
3489 if (!page) {
3490 memcg_uncharge_kmem(memcg, PAGE_SIZE << order);
3491 return;
3492 }
3493
3494 pc = lookup_page_cgroup(page);
3495 lock_page_cgroup(pc);
3496 pc->mem_cgroup = memcg;
3497 SetPageCgroupUsed(pc);
3498 unlock_page_cgroup(pc);
3499}
3500
3501void __memcg_kmem_uncharge_pages(struct page *page, int order)
3502{
3503 struct mem_cgroup *memcg = NULL;
3504 struct page_cgroup *pc;
3505
3506
3507 pc = lookup_page_cgroup(page);
3508 /*
3509 * Fast unlocked return. Theoretically might have changed, have to
3510 * check again after locking.
3511 */
3512 if (!PageCgroupUsed(pc))
3513 return;
3514
3515 lock_page_cgroup(pc);
3516 if (PageCgroupUsed(pc)) {
3517 memcg = pc->mem_cgroup;
3518 ClearPageCgroupUsed(pc);
3519 }
3520 unlock_page_cgroup(pc);
3521
3522 /*
3523 * We trust that only if there is a memcg associated with the page, it
3524 * is a valid allocation
3525 */
3526 if (!memcg)
3527 return;
3528
3529 VM_BUG_ON(mem_cgroup_is_root(memcg));
3530 memcg_uncharge_kmem(memcg, PAGE_SIZE << order);
3531}
3532#else
3533static inline void mem_cgroup_destroy_all_caches(struct mem_cgroup *memcg)
3534{
3535}
3536#endif /* CONFIG_MEMCG_KMEM */
3537
2629#ifdef CONFIG_TRANSPARENT_HUGEPAGE 3538#ifdef CONFIG_TRANSPARENT_HUGEPAGE
2630 3539
2631#define PCGF_NOCOPY_AT_SPLIT (1 << PCG_LOCK | 1 << PCG_MIGRATION) 3540#define PCGF_NOCOPY_AT_SPLIT (1 << PCG_LOCK | 1 << PCG_MIGRATION)
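
Taken together, __memcg_kmem_newpage_charge(), __memcg_kmem_commit_charge() and __memcg_kmem_uncharge_pages() give the page allocator a charge-then-commit protocol: reserve against the kmem counter before allocating, bind the page to the memcg (or revert) afterwards, and uncharge on free. The fragment below sketches the expected calling convention based on the comments above; charged_alloc is hypothetical, the real call sites arrive with the page-allocator patches of this series, and the prototypes are assumed to come from the linux/memcontrol.h additions:

    #include <linux/gfp.h>
    #include <linux/memcontrol.h>

    /* Hypothetical caller; real users go through the __GFP_KMEMCG plumbing. */
    static struct page *charged_alloc(gfp_t gfp, int order)
    {
            struct mem_cgroup *memcg;
            struct page *page;

            if (!__memcg_kmem_newpage_charge(gfp, &memcg, order))
                    return NULL;            /* over the kmem limit */

            page = alloc_pages(gfp, order);
            if (memcg)
                    /* also reverts the charge when page == NULL */
                    __memcg_kmem_commit_charge(page, memcg, order);

            return page;
            /* on free: __memcg_kmem_uncharge_pages(page, order); */
    }
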
@@ -2709,13 +3618,6 @@ static int mem_cgroup_move_account(struct page *page,
2709 /* caller should have done css_get */ 3618 /* caller should have done css_get */
2710 pc->mem_cgroup = to; 3619 pc->mem_cgroup = to;
2711 mem_cgroup_charge_statistics(to, anon, nr_pages); 3620 mem_cgroup_charge_statistics(to, anon, nr_pages);
2712 /*
2713 * We charges against "to" which may not have any tasks. Then, "to"
2714 * can be under rmdir(). But in current implementation, caller of
2715 * this function is just force_empty() and move charge, so it's
2716 * guaranteed that "to" is never removed. So, we don't check rmdir
2717 * status here.
2718 */
2719 move_unlock_mem_cgroup(from, &flags); 3621 move_unlock_mem_cgroup(from, &flags);
2720 ret = 0; 3622 ret = 0;
2721unlock: 3623unlock:
@@ -2729,10 +3631,27 @@ out:
2729 return ret; 3631 return ret;
2730} 3632}
2731 3633
2732/* 3634/**
2733 * move charges to its parent. 3635 * mem_cgroup_move_parent - moves page to the parent group
3636 * @page: the page to move
3637 * @pc: page_cgroup of the page
3638 * @child: page's cgroup
3639 *
3640 * move charges to its parent or the root cgroup if the group has no
3641 * parent (aka use_hierarchy==0).
3642 * Although this might fail (get_page_unless_zero, isolate_lru_page or
3643 * mem_cgroup_move_account fails) the failure is always temporary and
3644 * it signals a race with a page removal/uncharge or migration. In the
3645 * first case the page is on the way out and it will vanish from the LRU
3646 * on the next attempt and the call should be retried later.
3647 * Isolation from the LRU fails only if page has been isolated from
3648 * the LRU since we looked at it and that usually means either global
3649 * reclaim or migration going on. The page will either get back to the
3650 * LRU or vanish.
3651 * Finally mem_cgroup_move_account fails only if the page got uncharged
3652 * (!PageCgroupUsed) or moved to a different group. The page will
3653 * disappear in the next attempt.
2734 */ 3654 */
2735
2736static int mem_cgroup_move_parent(struct page *page, 3655static int mem_cgroup_move_parent(struct page *page,
2737 struct page_cgroup *pc, 3656 struct page_cgroup *pc,
2738 struct mem_cgroup *child) 3657 struct mem_cgroup *child)
@@ -2742,9 +3661,7 @@ static int mem_cgroup_move_parent(struct page *page,
2742 unsigned long uninitialized_var(flags); 3661 unsigned long uninitialized_var(flags);
2743 int ret; 3662 int ret;
2744 3663
2745 /* Is ROOT ? */ 3664 VM_BUG_ON(mem_cgroup_is_root(child));
2746 if (mem_cgroup_is_root(child))
2747 return -EINVAL;
2748 3665
2749 ret = -EBUSY; 3666 ret = -EBUSY;
2750 if (!get_page_unless_zero(page)) 3667 if (!get_page_unless_zero(page))
@@ -2761,8 +3678,10 @@ static int mem_cgroup_move_parent(struct page *page,
2761 if (!parent) 3678 if (!parent)
2762 parent = root_mem_cgroup; 3679 parent = root_mem_cgroup;
2763 3680
2764 if (nr_pages > 1) 3681 if (nr_pages > 1) {
3682 VM_BUG_ON(!PageTransHuge(page));
2765 flags = compound_lock_irqsave(page); 3683 flags = compound_lock_irqsave(page);
3684 }
2766 3685
2767 ret = mem_cgroup_move_account(page, nr_pages, 3686 ret = mem_cgroup_move_account(page, nr_pages,
2768 pc, child, parent); 3687 pc, child, parent);
@@ -2904,7 +3823,6 @@ __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *memcg,
2904 return; 3823 return;
2905 if (!memcg) 3824 if (!memcg)
2906 return; 3825 return;
2907 cgroup_exclude_rmdir(&memcg->css);
2908 3826
2909 __mem_cgroup_commit_charge(memcg, page, 1, ctype, true); 3827 __mem_cgroup_commit_charge(memcg, page, 1, ctype, true);
2910 /* 3828 /*
@@ -2918,12 +3836,6 @@ __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *memcg,
2918 swp_entry_t ent = {.val = page_private(page)}; 3836 swp_entry_t ent = {.val = page_private(page)};
2919 mem_cgroup_uncharge_swap(ent); 3837 mem_cgroup_uncharge_swap(ent);
2920 } 3838 }
2921 /*
2922 * At swapin, we may charge account against cgroup which has no tasks.
2923 * So, rmdir()->pre_destroy() can be called while we do this charge.
2924 * In that case, we need to call pre_destroy() again. check it here.
2925 */
2926 cgroup_release_and_wakeup_rmdir(&memcg->css);
2927} 3839}
2928 3840
2929void mem_cgroup_commit_charge_swapin(struct page *page, 3841void mem_cgroup_commit_charge_swapin(struct page *page,
@@ -3288,15 +4200,18 @@ void mem_cgroup_prepare_migration(struct page *page, struct page *newpage,
3288 struct mem_cgroup **memcgp) 4200 struct mem_cgroup **memcgp)
3289{ 4201{
3290 struct mem_cgroup *memcg = NULL; 4202 struct mem_cgroup *memcg = NULL;
4203 unsigned int nr_pages = 1;
3291 struct page_cgroup *pc; 4204 struct page_cgroup *pc;
3292 enum charge_type ctype; 4205 enum charge_type ctype;
3293 4206
3294 *memcgp = NULL; 4207 *memcgp = NULL;
3295 4208
3296 VM_BUG_ON(PageTransHuge(page));
3297 if (mem_cgroup_disabled()) 4209 if (mem_cgroup_disabled())
3298 return; 4210 return;
3299 4211
4212 if (PageTransHuge(page))
4213 nr_pages <<= compound_order(page);
4214
3300 pc = lookup_page_cgroup(page); 4215 pc = lookup_page_cgroup(page);
3301 lock_page_cgroup(pc); 4216 lock_page_cgroup(pc);
3302 if (PageCgroupUsed(pc)) { 4217 if (PageCgroupUsed(pc)) {
@@ -3358,7 +4273,7 @@ void mem_cgroup_prepare_migration(struct page *page, struct page *newpage,
3358 * charged to the res_counter since we plan on replacing the 4273 * charged to the res_counter since we plan on replacing the
3359 * old one and only one page is going to be left afterwards. 4274 * old one and only one page is going to be left afterwards.
3360 */ 4275 */
3361 __mem_cgroup_commit_charge(memcg, newpage, 1, ctype, false); 4276 __mem_cgroup_commit_charge(memcg, newpage, nr_pages, ctype, false);
3362} 4277}
3363 4278
3364/* remove redundant charge if migration failed*/ 4279/* remove redundant charge if migration failed*/
@@ -3371,8 +4286,7 @@ void mem_cgroup_end_migration(struct mem_cgroup *memcg,
3371 4286
3372 if (!memcg) 4287 if (!memcg)
3373 return; 4288 return;
3374 /* blocks rmdir() */ 4289
3375 cgroup_exclude_rmdir(&memcg->css);
3376 if (!migration_ok) { 4290 if (!migration_ok) {
3377 used = oldpage; 4291 used = oldpage;
3378 unused = newpage; 4292 unused = newpage;
@@ -3406,13 +4320,6 @@ void mem_cgroup_end_migration(struct mem_cgroup *memcg,
3406 */ 4320 */
3407 if (anon) 4321 if (anon)
3408 mem_cgroup_uncharge_page(used); 4322 mem_cgroup_uncharge_page(used);
3409 /*
3410 * At migration, we may charge account against cgroup which has no
3411 * tasks.
3412 * So, rmdir()->pre_destroy() can be called while we do this charge.
3413 * In that case, we need to call pre_destroy() again. check it here.
3414 */
3415 cgroup_release_and_wakeup_rmdir(&memcg->css);
3416} 4323}
3417 4324
3418/* 4325/*
@@ -3490,8 +4397,6 @@ void mem_cgroup_print_bad_page(struct page *page)
3490} 4397}
3491#endif 4398#endif
3492 4399
3493static DEFINE_MUTEX(set_limit_mutex);
3494
3495static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, 4400static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
3496 unsigned long long val) 4401 unsigned long long val)
3497{ 4402{
@@ -3712,17 +4617,22 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
3712 return nr_reclaimed; 4617 return nr_reclaimed;
3713} 4618}
3714 4619
3715/* 4620/**
4621 * mem_cgroup_force_empty_list - clears LRU of a group
4622 * @memcg: group to clear
4623 * @node: NUMA node
4624 * @zid: zone id
4625 * @lru: lru to clear
4626 *
3716 * Traverse a specified page_cgroup list and try to drop them all. This doesn't 4627 * Traverse a specified page_cgroup list and try to drop them all. This doesn't
3717 * reclaim the pages themselves - it just removes the page_cgroups. 4628 * reclaim the pages themselves - pages are moved to the parent (or root)
3718 * Returns true if some page_cgroups were not freed, indicating that the caller 4629 * group.
3719 * must retry this operation.
3720 */ 4630 */
3721static bool mem_cgroup_force_empty_list(struct mem_cgroup *memcg, 4631static void mem_cgroup_force_empty_list(struct mem_cgroup *memcg,
3722 int node, int zid, enum lru_list lru) 4632 int node, int zid, enum lru_list lru)
3723{ 4633{
3724 struct lruvec *lruvec; 4634 struct lruvec *lruvec;
3725 unsigned long flags, loop; 4635 unsigned long flags;
3726 struct list_head *list; 4636 struct list_head *list;
3727 struct page *busy; 4637 struct page *busy;
3728 struct zone *zone; 4638 struct zone *zone;
@@ -3731,11 +4641,8 @@ static bool mem_cgroup_force_empty_list(struct mem_cgroup *memcg,
3731 lruvec = mem_cgroup_zone_lruvec(zone, memcg); 4641 lruvec = mem_cgroup_zone_lruvec(zone, memcg);
3732 list = &lruvec->lists[lru]; 4642 list = &lruvec->lists[lru];
3733 4643
3734 loop = mem_cgroup_get_lru_size(lruvec, lru);
3735 /* give some margin against EBUSY etc...*/
3736 loop += 256;
3737 busy = NULL; 4644 busy = NULL;
3738 while (loop--) { 4645 do {
3739 struct page_cgroup *pc; 4646 struct page_cgroup *pc;
3740 struct page *page; 4647 struct page *page;
3741 4648
@@ -3761,76 +4668,80 @@ static bool mem_cgroup_force_empty_list(struct mem_cgroup *memcg,
3761 cond_resched(); 4668 cond_resched();
3762 } else 4669 } else
3763 busy = NULL; 4670 busy = NULL;
3764 } 4671 } while (!list_empty(list));
3765 return !list_empty(list);
3766} 4672}
3767 4673
3768/* 4674/*
3769 * make the mem_cgroup's charge 0 if there is no task. 4675 * make the mem_cgroup's charge 0 if there is no task by moving
4676 * all the charges and pages to the parent.
3770 * This enables deleting this mem_cgroup. 4677 * This enables deleting this mem_cgroup.
4678 *
4679 * Caller is responsible for holding css reference on the memcg.
3771 */ 4680 */
3772static int mem_cgroup_force_empty(struct mem_cgroup *memcg, bool free_all) 4681static void mem_cgroup_reparent_charges(struct mem_cgroup *memcg)
3773{ 4682{
3774 int ret; 4683 int node, zid;
3775 int node, zid, shrink; 4684 u64 usage;
3776 int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
3777 struct cgroup *cgrp = memcg->css.cgroup;
3778
3779 css_get(&memcg->css);
3780 4685
3781 shrink = 0;
3782 /* should free all ? */
3783 if (free_all)
3784 goto try_to_free;
3785move_account:
3786 do { 4686 do {
3787 ret = -EBUSY;
3788 if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children))
3789 goto out;
3790 /* This is for making all *used* pages to be on LRU. */ 4687 /* This is for making all *used* pages to be on LRU. */
3791 lru_add_drain_all(); 4688 lru_add_drain_all();
3792 drain_all_stock_sync(memcg); 4689 drain_all_stock_sync(memcg);
3793 ret = 0;
3794 mem_cgroup_start_move(memcg); 4690 mem_cgroup_start_move(memcg);
3795 for_each_node_state(node, N_HIGH_MEMORY) { 4691 for_each_node_state(node, N_MEMORY) {
3796 for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) { 4692 for (zid = 0; zid < MAX_NR_ZONES; zid++) {
3797 enum lru_list lru; 4693 enum lru_list lru;
3798 for_each_lru(lru) { 4694 for_each_lru(lru) {
3799 ret = mem_cgroup_force_empty_list(memcg, 4695 mem_cgroup_force_empty_list(memcg,
3800 node, zid, lru); 4696 node, zid, lru);
3801 if (ret)
3802 break;
3803 } 4697 }
3804 } 4698 }
3805 if (ret)
3806 break;
3807 } 4699 }
3808 mem_cgroup_end_move(memcg); 4700 mem_cgroup_end_move(memcg);
3809 memcg_oom_recover(memcg); 4701 memcg_oom_recover(memcg);
3810 cond_resched(); 4702 cond_resched();
3811 /* "ret" should also be checked to ensure all lists are empty. */
3812 } while (res_counter_read_u64(&memcg->res, RES_USAGE) > 0 || ret);
3813out:
3814 css_put(&memcg->css);
3815 return ret;
3816 4703
3817try_to_free: 4704 /*
4705 * Kernel memory may not necessarily be trackable to a specific
4706 * process. So they are not migrated, and therefore we can't
4707 * expect their value to drop to 0 here.
4708 * Having res filled up with kmem only is enough.
4709 *
4710 * This is a safety check because mem_cgroup_force_empty_list
4711 * could have raced with mem_cgroup_replace_page_cache callers
4712 * so the lru seemed empty but the page could have been added
4713 * right after the check. RES_USAGE should be safe as we always
4714 * charge before adding to the LRU.
4715 */
4716 usage = res_counter_read_u64(&memcg->res, RES_USAGE) -
4717 res_counter_read_u64(&memcg->kmem, RES_USAGE);
4718 } while (usage > 0);
4719}
4720
4721/*
4722 * Reclaims as many pages from the given memcg as possible and moves
4723 * the rest to the parent.
4724 *
4725 * Caller is responsible for holding css reference for memcg.
4726 */
4727static int mem_cgroup_force_empty(struct mem_cgroup *memcg)
4728{
4729 int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
4730 struct cgroup *cgrp = memcg->css.cgroup;
4731
3818 /* returns EBUSY if there is a task or if we come here twice. */ 4732 /* returns EBUSY if there is a task or if we come here twice. */
3819 if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children) || shrink) { 4733 if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children))
3820 ret = -EBUSY; 4734 return -EBUSY;
3821 goto out; 4735
3822 }
3823 /* we call try-to-free pages for make this cgroup empty */ 4736 /* we call try-to-free pages for make this cgroup empty */
3824 lru_add_drain_all(); 4737 lru_add_drain_all();
3825 /* try to free all pages in this cgroup */ 4738 /* try to free all pages in this cgroup */
3826 shrink = 1;
3827 while (nr_retries && res_counter_read_u64(&memcg->res, RES_USAGE) > 0) { 4739 while (nr_retries && res_counter_read_u64(&memcg->res, RES_USAGE) > 0) {
3828 int progress; 4740 int progress;
3829 4741
3830 if (signal_pending(current)) { 4742 if (signal_pending(current))
3831 ret = -EINTR; 4743 return -EINTR;
3832 goto out; 4744
3833 }
3834 progress = try_to_free_mem_cgroup_pages(memcg, GFP_KERNEL, 4745 progress = try_to_free_mem_cgroup_pages(memcg, GFP_KERNEL,
3835 false); 4746 false);
3836 if (!progress) { 4747 if (!progress) {
@@ -3841,13 +4752,23 @@ try_to_free:
3841 4752
3842 } 4753 }
3843 lru_add_drain(); 4754 lru_add_drain();
3844 /* try move_account...there may be some *locked* pages. */ 4755 mem_cgroup_reparent_charges(memcg);
3845 goto move_account; 4756
4757 return 0;
3846} 4758}
3847 4759
3848static int mem_cgroup_force_empty_write(struct cgroup *cont, unsigned int event) 4760static int mem_cgroup_force_empty_write(struct cgroup *cont, unsigned int event)
3849{ 4761{
3850 return mem_cgroup_force_empty(mem_cgroup_from_cont(cont), true); 4762 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
4763 int ret;
4764
4765 if (mem_cgroup_is_root(memcg))
4766 return -EINVAL;
4767 css_get(&memcg->css);
4768 ret = mem_cgroup_force_empty(memcg);
4769 css_put(&memcg->css);
4770
4771 return ret;
3851} 4772}
3852 4773
3853 4774
@@ -3938,7 +4859,8 @@ static ssize_t mem_cgroup_read(struct cgroup *cont, struct cftype *cft,
3938 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); 4859 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
3939 char str[64]; 4860 char str[64];
3940 u64 val; 4861 u64 val;
3941 int type, name, len; 4862 int name, len;
4863 enum res_type type;
3942 4864
3943 type = MEMFILE_TYPE(cft->private); 4865 type = MEMFILE_TYPE(cft->private);
3944 name = MEMFILE_ATTR(cft->private); 4866 name = MEMFILE_ATTR(cft->private);
@@ -3959,6 +4881,9 @@ static ssize_t mem_cgroup_read(struct cgroup *cont, struct cftype *cft,
3959 else 4881 else
3960 val = res_counter_read_u64(&memcg->memsw, name); 4882 val = res_counter_read_u64(&memcg->memsw, name);
3961 break; 4883 break;
4884 case _KMEM:
4885 val = res_counter_read_u64(&memcg->kmem, name);
4886 break;
3962 default: 4887 default:
3963 BUG(); 4888 BUG();
3964 } 4889 }
@@ -3966,6 +4891,125 @@ static ssize_t mem_cgroup_read(struct cgroup *cont, struct cftype *cft,
3966 len = scnprintf(str, sizeof(str), "%llu\n", (unsigned long long)val); 4891 len = scnprintf(str, sizeof(str), "%llu\n", (unsigned long long)val);
3967 return simple_read_from_buffer(buf, nbytes, ppos, str, len); 4892 return simple_read_from_buffer(buf, nbytes, ppos, str, len);
3968} 4893}
4894
4895static int memcg_update_kmem_limit(struct cgroup *cont, u64 val)
4896{
4897 int ret = -EINVAL;
4898#ifdef CONFIG_MEMCG_KMEM
4899 bool must_inc_static_branch = false;
4900
4901 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
4902 /*
4903 * For simplicity, we won't allow this to be disabled. It also can't
4904 * be changed if the cgroup has children already, or if tasks had
4905 * already joined.
4906 *
4907 * If tasks join before we set the limit, a person looking at
4908 * kmem.usage_in_bytes will have no way to determine when it took
4909 * place, which makes the value quite meaningless.
4910 *
4911 * After it first became limited, changes in the value of the limit are
4912 * of course permitted.
4913 *
4914 * Taking the cgroup_lock is really offensive, but it is so far the only
4915 * way to guarantee that no children will appear. There are plenty of
4916 * other offenders, and they should all go away. Fine grained locking
4917 * is probably the way to go here. When we are fully hierarchical, we
4918 * can also get rid of the use_hierarchy check.
4919 */
4920 cgroup_lock();
4921 mutex_lock(&set_limit_mutex);
4922 if (!memcg->kmem_account_flags && val != RESOURCE_MAX) {
4923 if (cgroup_task_count(cont) || (memcg->use_hierarchy &&
4924 !list_empty(&cont->children))) {
4925 ret = -EBUSY;
4926 goto out;
4927 }
4928 ret = res_counter_set_limit(&memcg->kmem, val);
4929 VM_BUG_ON(ret);
4930
4931 ret = memcg_update_cache_sizes(memcg);
4932 if (ret) {
4933 res_counter_set_limit(&memcg->kmem, RESOURCE_MAX);
4934 goto out;
4935 }
4936 must_inc_static_branch = true;
4937 /*
4938 * kmem charges can outlive the cgroup. In the case of slab
 4939 * pages, for instance, a page can contain objects from various
 4940 * processes, so it is infeasible to migrate them away. We
4941 * need to reference count the memcg because of that.
4942 */
4943 mem_cgroup_get(memcg);
4944 } else
4945 ret = res_counter_set_limit(&memcg->kmem, val);
4946out:
4947 mutex_unlock(&set_limit_mutex);
4948 cgroup_unlock();
4949
4950 /*
4951 * We are by now familiar with the fact that we can't inc the static
4952 * branch inside cgroup_lock. See disarm functions for details. A
4953 * worker here is overkill, but also wrong: After the limit is set, we
4954 * must start accounting right away. Since this operation can't fail,
4955 * we can safely defer it to here - no rollback will be needed.
4956 *
4957 * The boolean used to control this is also safe, because
4958 * KMEM_ACCOUNTED_ACTIVATED guarantees that only one process will be
4959 * able to set it to true;
4960 */
4961 if (must_inc_static_branch) {
4962 static_key_slow_inc(&memcg_kmem_enabled_key);
4963 /*
4964 * setting the active bit after the inc will guarantee no one
4965 * starts accounting before all call sites are patched
4966 */
4967 memcg_kmem_set_active(memcg);
4968 }
4969
4970#endif
4971 return ret;
4972}
4973
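The long comment above boils down to a jump-label pattern: static_key_slow_inc() patches every call site that tests the key, which is why it must run outside cgroup_lock and why accounting only begins once the active bit is set. A rough kernel-style sketch of the fast path being enabled here; memcg_kmem_maybe_charge() is a hypothetical caller, and the exact helper names and their placement in the headers are assumptions:

#include <linux/jump_label.h>
#include <linux/types.h>

extern struct static_key memcg_kmem_enabled_key;

/* Compiles to a patched-out branch until the key is incremented. */
static inline bool memcg_kmem_enabled(void)
{
	return static_key_false(&memcg_kmem_enabled_key);
}

/* Hypothetical charge site: while the key is off, kmem accounting
 * costs a single no-op branch. */
static inline bool memcg_kmem_maybe_charge(size_t size)
{
	if (!memcg_kmem_enabled())
		return true;	/* accounting disabled, nothing to do */
	/* ... the real charge against memcg->kmem would go here ... */
	return true;
}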
4974static int memcg_propagate_kmem(struct mem_cgroup *memcg)
4975{
4976 int ret = 0;
4977 struct mem_cgroup *parent = parent_mem_cgroup(memcg);
4978 if (!parent)
4979 goto out;
4980
4981 memcg->kmem_account_flags = parent->kmem_account_flags;
4982#ifdef CONFIG_MEMCG_KMEM
4983 /*
 4984 * When that happens, we need to disable the static branch only on those
4985 * memcgs that enabled it. To achieve this, we would be forced to
4986 * complicate the code by keeping track of which memcgs were the ones
 4987 * that actually enabled limits, and which ones got it from their
4988 * parents.
4989 *
4990 * It is a lot simpler just to do static_key_slow_inc() on every child
4991 * that is accounted.
4992 */
4993 if (!memcg_kmem_is_active(memcg))
4994 goto out;
4995
4996 /*
4997 * destroy(), called if we fail, will issue static_key_slow_inc() and
4998 * mem_cgroup_put() if kmem is enabled. We have to either call them
4999 * unconditionally, or clear the KMEM_ACTIVE flag. I personally find
5000 * this more consistent, since it always leads to the same destroy path
5001 */
5002 mem_cgroup_get(memcg);
5003 static_key_slow_inc(&memcg_kmem_enabled_key);
5004
5005 mutex_lock(&set_limit_mutex);
5006 ret = memcg_update_cache_sizes(memcg);
5007 mutex_unlock(&set_limit_mutex);
5008#endif
5009out:
5010 return ret;
5011}
5012
3969/* 5013/*
3970 * The user of this function is... 5014 * The user of this function is...
3971 * RES_LIMIT. 5015 * RES_LIMIT.
@@ -3974,7 +5018,8 @@ static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft,
3974 const char *buffer) 5018 const char *buffer)
3975{ 5019{
3976 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); 5020 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
3977 int type, name; 5021 enum res_type type;
5022 int name;
3978 unsigned long long val; 5023 unsigned long long val;
3979 int ret; 5024 int ret;
3980 5025
@@ -3996,8 +5041,12 @@ static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft,
3996 break; 5041 break;
3997 if (type == _MEM) 5042 if (type == _MEM)
3998 ret = mem_cgroup_resize_limit(memcg, val); 5043 ret = mem_cgroup_resize_limit(memcg, val);
3999 else 5044 else if (type == _MEMSWAP)
4000 ret = mem_cgroup_resize_memsw_limit(memcg, val); 5045 ret = mem_cgroup_resize_memsw_limit(memcg, val);
5046 else if (type == _KMEM)
5047 ret = memcg_update_kmem_limit(cont, val);
5048 else
5049 return -EINVAL;
4001 break; 5050 break;
4002 case RES_SOFT_LIMIT: 5051 case RES_SOFT_LIMIT:
4003 ret = res_counter_memparse_write_strategy(buffer, &val); 5052 ret = res_counter_memparse_write_strategy(buffer, &val);
@@ -4050,7 +5099,8 @@ out:
4050static int mem_cgroup_reset(struct cgroup *cont, unsigned int event) 5099static int mem_cgroup_reset(struct cgroup *cont, unsigned int event)
4051{ 5100{
4052 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); 5101 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
4053 int type, name; 5102 int name;
5103 enum res_type type;
4054 5104
4055 type = MEMFILE_TYPE(event); 5105 type = MEMFILE_TYPE(event);
4056 name = MEMFILE_ATTR(event); 5106 name = MEMFILE_ATTR(event);
@@ -4062,14 +5112,22 @@ static int mem_cgroup_reset(struct cgroup *cont, unsigned int event)
4062 case RES_MAX_USAGE: 5112 case RES_MAX_USAGE:
4063 if (type == _MEM) 5113 if (type == _MEM)
4064 res_counter_reset_max(&memcg->res); 5114 res_counter_reset_max(&memcg->res);
4065 else 5115 else if (type == _MEMSWAP)
4066 res_counter_reset_max(&memcg->memsw); 5116 res_counter_reset_max(&memcg->memsw);
5117 else if (type == _KMEM)
5118 res_counter_reset_max(&memcg->kmem);
5119 else
5120 return -EINVAL;
4067 break; 5121 break;
4068 case RES_FAILCNT: 5122 case RES_FAILCNT:
4069 if (type == _MEM) 5123 if (type == _MEM)
4070 res_counter_reset_failcnt(&memcg->res); 5124 res_counter_reset_failcnt(&memcg->res);
4071 else 5125 else if (type == _MEMSWAP)
4072 res_counter_reset_failcnt(&memcg->memsw); 5126 res_counter_reset_failcnt(&memcg->memsw);
5127 else if (type == _KMEM)
5128 res_counter_reset_failcnt(&memcg->kmem);
5129 else
5130 return -EINVAL;
4073 break; 5131 break;
4074 } 5132 }
4075 5133
@@ -4120,7 +5178,7 @@ static int memcg_numa_stat_show(struct cgroup *cont, struct cftype *cft,
4120 5178
4121 total_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL); 5179 total_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL);
4122 seq_printf(m, "total=%lu", total_nr); 5180 seq_printf(m, "total=%lu", total_nr);
4123 for_each_node_state(nid, N_HIGH_MEMORY) { 5181 for_each_node_state(nid, N_MEMORY) {
4124 node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL); 5182 node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL);
4125 seq_printf(m, " N%d=%lu", nid, node_nr); 5183 seq_printf(m, " N%d=%lu", nid, node_nr);
4126 } 5184 }
@@ -4128,7 +5186,7 @@ static int memcg_numa_stat_show(struct cgroup *cont, struct cftype *cft,
4128 5186
4129 file_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL_FILE); 5187 file_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL_FILE);
4130 seq_printf(m, "file=%lu", file_nr); 5188 seq_printf(m, "file=%lu", file_nr);
4131 for_each_node_state(nid, N_HIGH_MEMORY) { 5189 for_each_node_state(nid, N_MEMORY) {
4132 node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid, 5190 node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid,
4133 LRU_ALL_FILE); 5191 LRU_ALL_FILE);
4134 seq_printf(m, " N%d=%lu", nid, node_nr); 5192 seq_printf(m, " N%d=%lu", nid, node_nr);
@@ -4137,7 +5195,7 @@ static int memcg_numa_stat_show(struct cgroup *cont, struct cftype *cft,
4137 5195
4138 anon_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL_ANON); 5196 anon_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL_ANON);
4139 seq_printf(m, "anon=%lu", anon_nr); 5197 seq_printf(m, "anon=%lu", anon_nr);
4140 for_each_node_state(nid, N_HIGH_MEMORY) { 5198 for_each_node_state(nid, N_MEMORY) {
4141 node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid, 5199 node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid,
4142 LRU_ALL_ANON); 5200 LRU_ALL_ANON);
4143 seq_printf(m, " N%d=%lu", nid, node_nr); 5201 seq_printf(m, " N%d=%lu", nid, node_nr);
@@ -4146,7 +5204,7 @@ static int memcg_numa_stat_show(struct cgroup *cont, struct cftype *cft,
4146 5204
4147 unevictable_nr = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_UNEVICTABLE)); 5205 unevictable_nr = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_UNEVICTABLE));
4148 seq_printf(m, "unevictable=%lu", unevictable_nr); 5206 seq_printf(m, "unevictable=%lu", unevictable_nr);
4149 for_each_node_state(nid, N_HIGH_MEMORY) { 5207 for_each_node_state(nid, N_MEMORY) {
4150 node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid, 5208 node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid,
4151 BIT(LRU_UNEVICTABLE)); 5209 BIT(LRU_UNEVICTABLE));
4152 seq_printf(m, " N%d=%lu", nid, node_nr); 5210 seq_printf(m, " N%d=%lu", nid, node_nr);
@@ -4386,7 +5444,7 @@ static int mem_cgroup_usage_register_event(struct cgroup *cgrp,
4386 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); 5444 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
4387 struct mem_cgroup_thresholds *thresholds; 5445 struct mem_cgroup_thresholds *thresholds;
4388 struct mem_cgroup_threshold_ary *new; 5446 struct mem_cgroup_threshold_ary *new;
4389 int type = MEMFILE_TYPE(cft->private); 5447 enum res_type type = MEMFILE_TYPE(cft->private);
4390 u64 threshold, usage; 5448 u64 threshold, usage;
4391 int i, size, ret; 5449 int i, size, ret;
4392 5450
@@ -4469,7 +5527,7 @@ static void mem_cgroup_usage_unregister_event(struct cgroup *cgrp,
4469 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); 5527 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
4470 struct mem_cgroup_thresholds *thresholds; 5528 struct mem_cgroup_thresholds *thresholds;
4471 struct mem_cgroup_threshold_ary *new; 5529 struct mem_cgroup_threshold_ary *new;
4472 int type = MEMFILE_TYPE(cft->private); 5530 enum res_type type = MEMFILE_TYPE(cft->private);
4473 u64 usage; 5531 u64 usage;
4474 int i, j, size; 5532 int i, j, size;
4475 5533
@@ -4547,7 +5605,7 @@ static int mem_cgroup_oom_register_event(struct cgroup *cgrp,
4547{ 5605{
4548 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); 5606 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
4549 struct mem_cgroup_eventfd_list *event; 5607 struct mem_cgroup_eventfd_list *event;
4550 int type = MEMFILE_TYPE(cft->private); 5608 enum res_type type = MEMFILE_TYPE(cft->private);
4551 5609
4552 BUG_ON(type != _OOM_TYPE); 5610 BUG_ON(type != _OOM_TYPE);
4553 event = kmalloc(sizeof(*event), GFP_KERNEL); 5611 event = kmalloc(sizeof(*event), GFP_KERNEL);
@@ -4572,7 +5630,7 @@ static void mem_cgroup_oom_unregister_event(struct cgroup *cgrp,
4572{ 5630{
4573 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); 5631 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
4574 struct mem_cgroup_eventfd_list *ev, *tmp; 5632 struct mem_cgroup_eventfd_list *ev, *tmp;
4575 int type = MEMFILE_TYPE(cft->private); 5633 enum res_type type = MEMFILE_TYPE(cft->private);
4576 5634
4577 BUG_ON(type != _OOM_TYPE); 5635 BUG_ON(type != _OOM_TYPE);
4578 5636
@@ -4631,12 +5689,33 @@ static int mem_cgroup_oom_control_write(struct cgroup *cgrp,
4631#ifdef CONFIG_MEMCG_KMEM 5689#ifdef CONFIG_MEMCG_KMEM
4632static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss) 5690static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
4633{ 5691{
5692 int ret;
5693
5694 memcg->kmemcg_id = -1;
5695 ret = memcg_propagate_kmem(memcg);
5696 if (ret)
5697 return ret;
5698
4634 return mem_cgroup_sockets_init(memcg, ss); 5699 return mem_cgroup_sockets_init(memcg, ss);
4635}; 5700};
4636 5701
4637static void kmem_cgroup_destroy(struct mem_cgroup *memcg) 5702static void kmem_cgroup_destroy(struct mem_cgroup *memcg)
4638{ 5703{
4639 mem_cgroup_sockets_destroy(memcg); 5704 mem_cgroup_sockets_destroy(memcg);
5705
5706 memcg_kmem_mark_dead(memcg);
5707
5708 if (res_counter_read_u64(&memcg->kmem, RES_USAGE) != 0)
5709 return;
5710
5711 /*
5712 * Charges already down to 0, undo mem_cgroup_get() done in the charge
5713 * path here, being careful not to race with memcg_uncharge_kmem: it is
5714 * possible that the charges went down to 0 between mark_dead and the
5715 * res_counter read, so in that case, we don't need the put
5716 */
5717 if (memcg_kmem_test_and_clear_dead(memcg))
5718 mem_cgroup_put(memcg);
4640} 5719}
4641#else 5720#else
4642static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss) 5721static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
@@ -4745,6 +5824,37 @@ static struct cftype mem_cgroup_files[] = {
4745 .read = mem_cgroup_read, 5824 .read = mem_cgroup_read,
4746 }, 5825 },
4747#endif 5826#endif
5827#ifdef CONFIG_MEMCG_KMEM
5828 {
5829 .name = "kmem.limit_in_bytes",
5830 .private = MEMFILE_PRIVATE(_KMEM, RES_LIMIT),
5831 .write_string = mem_cgroup_write,
5832 .read = mem_cgroup_read,
5833 },
5834 {
5835 .name = "kmem.usage_in_bytes",
5836 .private = MEMFILE_PRIVATE(_KMEM, RES_USAGE),
5837 .read = mem_cgroup_read,
5838 },
5839 {
5840 .name = "kmem.failcnt",
5841 .private = MEMFILE_PRIVATE(_KMEM, RES_FAILCNT),
5842 .trigger = mem_cgroup_reset,
5843 .read = mem_cgroup_read,
5844 },
5845 {
5846 .name = "kmem.max_usage_in_bytes",
5847 .private = MEMFILE_PRIVATE(_KMEM, RES_MAX_USAGE),
5848 .trigger = mem_cgroup_reset,
5849 .read = mem_cgroup_read,
5850 },
5851#ifdef CONFIG_SLABINFO
5852 {
5853 .name = "kmem.slabinfo",
5854 .read_seq_string = mem_cgroup_slabinfo_read,
5855 },
5856#endif
5857#endif
4748 { }, /* terminate */ 5858 { }, /* terminate */
4749}; 5859};
4750 5860
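The kmem.* files registered above are only useful in a particular order: per memcg_update_kmem_limit() earlier in this diff, kmem.limit_in_bytes must be written before any task joins the group (and before children exist), otherwise the write fails with EBUSY. A small userspace sketch of that ordering; the cgroup path and the limit value are assumptions:

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

/* Assumed v1 memory-controller mount point and group name. */
#define GROUP	"/sys/fs/cgroup/memory/test"

static int write_file(const char *name, const char *val)
{
	char path[256];
	int fd, ret;

	snprintf(path, sizeof(path), "%s/%s", GROUP, name);
	fd = open(path, O_WRONLY);
	if (fd < 0)
		return -1;
	ret = write(fd, val, strlen(val)) < 0 ? -1 : 0;
	close(fd);
	return ret;
}

int main(void)
{
	char pid[32];

	/* Set the kmem limit first; once tasks are attached this write
	 * is expected to fail with EBUSY. */
	if (write_file("memory.kmem.limit_in_bytes", "67108864"))
		perror("set kmem limit");

	/* Only then move the current task into the group. */
	snprintf(pid, sizeof(pid), "%d", getpid());
	if (write_file("tasks", pid))
		perror("attach task");
	return 0;
}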
@@ -4812,16 +5922,29 @@ out_free:
4812} 5922}
4813 5923
4814/* 5924/*
4815 * Helpers for freeing a kmalloc()ed/vzalloc()ed mem_cgroup by RCU, 5925 * At destroying mem_cgroup, references from swap_cgroup can remain.
4816 * but in process context. The work_freeing structure is overlaid 5926 * (scanning all at force_empty is too costly...)
4817 * on the rcu_freeing structure, which itself is overlaid on memsw. 5927 *
5928 * Instead of clearing all references at force_empty, we remember
5929 * the number of reference from swap_cgroup and free mem_cgroup when
5930 * it goes down to 0.
5931 *
5932 * Removal of cgroup itself succeeds regardless of refs from swap.
4818 */ 5933 */
4819static void free_work(struct work_struct *work) 5934
5935static void __mem_cgroup_free(struct mem_cgroup *memcg)
4820{ 5936{
4821 struct mem_cgroup *memcg; 5937 int node;
4822 int size = sizeof(struct mem_cgroup); 5938 int size = sizeof(struct mem_cgroup);
4823 5939
4824 memcg = container_of(work, struct mem_cgroup, work_freeing); 5940 mem_cgroup_remove_from_trees(memcg);
5941 free_css_id(&mem_cgroup_subsys, &memcg->css);
5942
5943 for_each_node(node)
5944 free_mem_cgroup_per_zone_info(memcg, node);
5945
5946 free_percpu(memcg->stat);
5947
4825 /* 5948 /*
4826 * We need to make sure that (at least for now), the jump label 5949 * We need to make sure that (at least for now), the jump label
4827 * destruction code runs outside of the cgroup lock. This is because 5950 * destruction code runs outside of the cgroup lock. This is because
@@ -4833,45 +5956,34 @@ static void free_work(struct work_struct *work)
4833 * to move this code around, and make sure it is outside 5956 * to move this code around, and make sure it is outside
4834 * the cgroup_lock. 5957 * the cgroup_lock.
4835 */ 5958 */
4836 disarm_sock_keys(memcg); 5959 disarm_static_keys(memcg);
4837 if (size < PAGE_SIZE) 5960 if (size < PAGE_SIZE)
4838 kfree(memcg); 5961 kfree(memcg);
4839 else 5962 else
4840 vfree(memcg); 5963 vfree(memcg);
4841} 5964}
4842 5965
4843static void free_rcu(struct rcu_head *rcu_head)
4844{
4845 struct mem_cgroup *memcg;
4846
4847 memcg = container_of(rcu_head, struct mem_cgroup, rcu_freeing);
4848 INIT_WORK(&memcg->work_freeing, free_work);
4849 schedule_work(&memcg->work_freeing);
4850}
4851 5966
4852/* 5967/*
4853 * At destroying mem_cgroup, references from swap_cgroup can remain. 5968 * Helpers for freeing a kmalloc()ed/vzalloc()ed mem_cgroup by RCU,
4854 * (scanning all at force_empty is too costly...) 5969 * but in process context. The work_freeing structure is overlaid
4855 * 5970 * on the rcu_freeing structure, which itself is overlaid on memsw.
4856 * Instead of clearing all references at force_empty, we remember
4857 * the number of reference from swap_cgroup and free mem_cgroup when
4858 * it goes down to 0.
4859 *
4860 * Removal of cgroup itself succeeds regardless of refs from swap.
4861 */ 5971 */
4862 5972static void free_work(struct work_struct *work)
4863static void __mem_cgroup_free(struct mem_cgroup *memcg)
4864{ 5973{
4865 int node; 5974 struct mem_cgroup *memcg;
4866 5975
4867 mem_cgroup_remove_from_trees(memcg); 5976 memcg = container_of(work, struct mem_cgroup, work_freeing);
4868 free_css_id(&mem_cgroup_subsys, &memcg->css); 5977 __mem_cgroup_free(memcg);
5978}
4869 5979
4870 for_each_node(node) 5980static void free_rcu(struct rcu_head *rcu_head)
4871 free_mem_cgroup_per_zone_info(memcg, node); 5981{
5982 struct mem_cgroup *memcg;
4872 5983
4873 free_percpu(memcg->stat); 5984 memcg = container_of(rcu_head, struct mem_cgroup, rcu_freeing);
4874 call_rcu(&memcg->rcu_freeing, free_rcu); 5985 INIT_WORK(&memcg->work_freeing, free_work);
5986 schedule_work(&memcg->work_freeing);
4875} 5987}
4876 5988
4877static void mem_cgroup_get(struct mem_cgroup *memcg) 5989static void mem_cgroup_get(struct mem_cgroup *memcg)
@@ -4883,7 +5995,7 @@ static void __mem_cgroup_put(struct mem_cgroup *memcg, int count)
4883{ 5995{
4884 if (atomic_sub_and_test(count, &memcg->refcnt)) { 5996 if (atomic_sub_and_test(count, &memcg->refcnt)) {
4885 struct mem_cgroup *parent = parent_mem_cgroup(memcg); 5997 struct mem_cgroup *parent = parent_mem_cgroup(memcg);
4886 __mem_cgroup_free(memcg); 5998 call_rcu(&memcg->rcu_freeing, free_rcu);
4887 if (parent) 5999 if (parent)
4888 mem_cgroup_put(parent); 6000 mem_cgroup_put(parent);
4889 } 6001 }
@@ -4953,7 +6065,7 @@ err_cleanup:
4953} 6065}
4954 6066
4955static struct cgroup_subsys_state * __ref 6067static struct cgroup_subsys_state * __ref
4956mem_cgroup_create(struct cgroup *cont) 6068mem_cgroup_css_alloc(struct cgroup *cont)
4957{ 6069{
4958 struct mem_cgroup *memcg, *parent; 6070 struct mem_cgroup *memcg, *parent;
4959 long error = -ENOMEM; 6071 long error = -ENOMEM;
@@ -4980,7 +6092,6 @@ mem_cgroup_create(struct cgroup *cont)
4980 &per_cpu(memcg_stock, cpu); 6092 &per_cpu(memcg_stock, cpu);
4981 INIT_WORK(&stock->work, drain_local_stock); 6093 INIT_WORK(&stock->work, drain_local_stock);
4982 } 6094 }
4983 hotcpu_notifier(memcg_cpu_hotplug_callback, 0);
4984 } else { 6095 } else {
4985 parent = mem_cgroup_from_cont(cont->parent); 6096 parent = mem_cgroup_from_cont(cont->parent);
4986 memcg->use_hierarchy = parent->use_hierarchy; 6097 memcg->use_hierarchy = parent->use_hierarchy;
@@ -4990,6 +6101,8 @@ mem_cgroup_create(struct cgroup *cont)
4990 if (parent && parent->use_hierarchy) { 6101 if (parent && parent->use_hierarchy) {
4991 res_counter_init(&memcg->res, &parent->res); 6102 res_counter_init(&memcg->res, &parent->res);
4992 res_counter_init(&memcg->memsw, &parent->memsw); 6103 res_counter_init(&memcg->memsw, &parent->memsw);
6104 res_counter_init(&memcg->kmem, &parent->kmem);
6105
4993 /* 6106 /*
4994 * We increment refcnt of the parent to ensure that we can 6107 * We increment refcnt of the parent to ensure that we can
4995 * safely access it on res_counter_charge/uncharge. 6108 * safely access it on res_counter_charge/uncharge.
@@ -5000,6 +6113,7 @@ mem_cgroup_create(struct cgroup *cont)
5000 } else { 6113 } else {
5001 res_counter_init(&memcg->res, NULL); 6114 res_counter_init(&memcg->res, NULL);
5002 res_counter_init(&memcg->memsw, NULL); 6115 res_counter_init(&memcg->memsw, NULL);
6116 res_counter_init(&memcg->kmem, NULL);
5003 /* 6117 /*
5004 * Deeper hierachy with use_hierarchy == false doesn't make 6118 * Deeper hierachy with use_hierarchy == false doesn't make
5005 * much sense so let cgroup subsystem know about this 6119 * much sense so let cgroup subsystem know about this
@@ -5034,14 +6148,15 @@ free_out:
5034 return ERR_PTR(error); 6148 return ERR_PTR(error);
5035} 6149}
5036 6150
5037static int mem_cgroup_pre_destroy(struct cgroup *cont) 6151static void mem_cgroup_css_offline(struct cgroup *cont)
5038{ 6152{
5039 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); 6153 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
5040 6154
5041 return mem_cgroup_force_empty(memcg, false); 6155 mem_cgroup_reparent_charges(memcg);
6156 mem_cgroup_destroy_all_caches(memcg);
5042} 6157}
5043 6158
5044static void mem_cgroup_destroy(struct cgroup *cont) 6159static void mem_cgroup_css_free(struct cgroup *cont)
5045{ 6160{
5046 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); 6161 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
5047 6162
@@ -5631,18 +6746,30 @@ static void mem_cgroup_move_task(struct cgroup *cont,
5631struct cgroup_subsys mem_cgroup_subsys = { 6746struct cgroup_subsys mem_cgroup_subsys = {
5632 .name = "memory", 6747 .name = "memory",
5633 .subsys_id = mem_cgroup_subsys_id, 6748 .subsys_id = mem_cgroup_subsys_id,
5634 .create = mem_cgroup_create, 6749 .css_alloc = mem_cgroup_css_alloc,
5635 .pre_destroy = mem_cgroup_pre_destroy, 6750 .css_offline = mem_cgroup_css_offline,
5636 .destroy = mem_cgroup_destroy, 6751 .css_free = mem_cgroup_css_free,
5637 .can_attach = mem_cgroup_can_attach, 6752 .can_attach = mem_cgroup_can_attach,
5638 .cancel_attach = mem_cgroup_cancel_attach, 6753 .cancel_attach = mem_cgroup_cancel_attach,
5639 .attach = mem_cgroup_move_task, 6754 .attach = mem_cgroup_move_task,
5640 .base_cftypes = mem_cgroup_files, 6755 .base_cftypes = mem_cgroup_files,
5641 .early_init = 0, 6756 .early_init = 0,
5642 .use_id = 1, 6757 .use_id = 1,
5643 .__DEPRECATED_clear_css_refs = true,
5644}; 6758};
5645 6759
6760/*
6761 * The rest of init is performed during ->css_alloc() for root css which
6762 * happens before initcalls. hotcpu_notifier() can't be done together as
6763 * it would introduce circular locking by adding cgroup_lock -> cpu hotplug
6764 * dependency. Do it from a subsys_initcall().
6765 */
6766static int __init mem_cgroup_init(void)
6767{
6768 hotcpu_notifier(memcg_cpu_hotplug_callback, 0);
6769 return 0;
6770}
6771subsys_initcall(mem_cgroup_init);
6772
5646#ifdef CONFIG_MEMCG_SWAP 6773#ifdef CONFIG_MEMCG_SWAP
5647static int __init enable_swap_account(char *s) 6774static int __init enable_swap_account(char *s)
5648{ 6775{
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 8b20278be6a6..c6e4dd3e1c08 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -402,7 +402,7 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill,
402 struct anon_vma *av; 402 struct anon_vma *av;
403 pgoff_t pgoff; 403 pgoff_t pgoff;
404 404
405 av = page_lock_anon_vma(page); 405 av = page_lock_anon_vma_read(page);
406 if (av == NULL) /* Not actually mapped anymore */ 406 if (av == NULL) /* Not actually mapped anymore */
407 return; 407 return;
408 408
@@ -423,7 +423,7 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill,
423 } 423 }
424 } 424 }
425 read_unlock(&tasklist_lock); 425 read_unlock(&tasklist_lock);
426 page_unlock_anon_vma(av); 426 page_unlock_anon_vma_read(av);
427} 427}
428 428
429/* 429/*
@@ -781,16 +781,16 @@ static struct page_state {
781 { compound, compound, "huge", me_huge_page }, 781 { compound, compound, "huge", me_huge_page },
782#endif 782#endif
783 783
784 { sc|dirty, sc|dirty, "swapcache", me_swapcache_dirty }, 784 { sc|dirty, sc|dirty, "dirty swapcache", me_swapcache_dirty },
785 { sc|dirty, sc, "swapcache", me_swapcache_clean }, 785 { sc|dirty, sc, "clean swapcache", me_swapcache_clean },
786 786
787 { unevict|dirty, unevict|dirty, "unevictable LRU", me_pagecache_dirty}, 787 { unevict|dirty, unevict|dirty, "dirty unevictable LRU", me_pagecache_dirty },
788 { unevict, unevict, "unevictable LRU", me_pagecache_clean}, 788 { unevict, unevict, "clean unevictable LRU", me_pagecache_clean },
789 789
790 { mlock|dirty, mlock|dirty, "mlocked LRU", me_pagecache_dirty }, 790 { mlock|dirty, mlock|dirty, "dirty mlocked LRU", me_pagecache_dirty },
791 { mlock, mlock, "mlocked LRU", me_pagecache_clean }, 791 { mlock, mlock, "clean mlocked LRU", me_pagecache_clean },
792 792
793 { lru|dirty, lru|dirty, "LRU", me_pagecache_dirty }, 793 { lru|dirty, lru|dirty, "dirty LRU", me_pagecache_dirty },
794 { lru|dirty, lru, "clean LRU", me_pagecache_clean }, 794 { lru|dirty, lru, "clean LRU", me_pagecache_clean },
795 795
796 /* 796 /*
@@ -812,14 +812,14 @@ static struct page_state {
812#undef slab 812#undef slab
813#undef reserved 813#undef reserved
814 814
815/*
816 * "Dirty/Clean" indication is not 100% accurate due to the possibility of
817 * setting PG_dirty outside page lock. See also comment above set_page_dirty().
818 */
815static void action_result(unsigned long pfn, char *msg, int result) 819static void action_result(unsigned long pfn, char *msg, int result)
816{ 820{
817 struct page *page = pfn_to_page(pfn); 821 pr_err("MCE %#lx: %s page recovery: %s\n",
818 822 pfn, msg, action_name[result]);
819 printk(KERN_ERR "MCE %#lx: %s%s page recovery: %s\n",
820 pfn,
821 PageDirty(page) ? "dirty " : "",
822 msg, action_name[result]);
823} 823}
824 824
825static int page_action(struct page_state *ps, struct page *p, 825static int page_action(struct page_state *ps, struct page *p,
@@ -1385,7 +1385,7 @@ static int get_any_page(struct page *p, unsigned long pfn, int flags)
1385 * Isolate the page, so that it doesn't get reallocated if it 1385 * Isolate the page, so that it doesn't get reallocated if it
1386 * was free. 1386 * was free.
1387 */ 1387 */
1388 set_migratetype_isolate(p); 1388 set_migratetype_isolate(p, true);
1389 /* 1389 /*
1390 * When the target page is a free hugepage, just remove it 1390 * When the target page is a free hugepage, just remove it
1391 * from free hugepage list. 1391 * from free hugepage list.
@@ -1566,7 +1566,8 @@ int soft_offline_page(struct page *page, int flags)
1566 page_is_file_cache(page)); 1566 page_is_file_cache(page));
1567 list_add(&page->lru, &pagelist); 1567 list_add(&page->lru, &pagelist);
1568 ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 1568 ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL,
1569 false, MIGRATE_SYNC); 1569 false, MIGRATE_SYNC,
1570 MR_MEMORY_FAILURE);
1570 if (ret) { 1571 if (ret) {
1571 putback_lru_pages(&pagelist); 1572 putback_lru_pages(&pagelist);
1572 pr_info("soft offline: %#lx: migration failed %d, type %lx\n", 1573 pr_info("soft offline: %#lx: migration failed %d, type %lx\n",
diff --git a/mm/memory.c b/mm/memory.c
index 221fc9ffcab1..bb1369f7b9b4 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -57,6 +57,8 @@
57#include <linux/swapops.h> 57#include <linux/swapops.h>
58#include <linux/elf.h> 58#include <linux/elf.h>
59#include <linux/gfp.h> 59#include <linux/gfp.h>
60#include <linux/migrate.h>
61#include <linux/string.h>
60 62
61#include <asm/io.h> 63#include <asm/io.h>
62#include <asm/pgalloc.h> 64#include <asm/pgalloc.h>
@@ -182,10 +184,14 @@ static int tlb_next_batch(struct mmu_gather *tlb)
182 return 1; 184 return 1;
183 } 185 }
184 186
187 if (tlb->batch_count == MAX_GATHER_BATCH_COUNT)
188 return 0;
189
185 batch = (void *)__get_free_pages(GFP_NOWAIT | __GFP_NOWARN, 0); 190 batch = (void *)__get_free_pages(GFP_NOWAIT | __GFP_NOWARN, 0);
186 if (!batch) 191 if (!batch)
187 return 0; 192 return 0;
188 193
194 tlb->batch_count++;
189 batch->next = NULL; 195 batch->next = NULL;
190 batch->nr = 0; 196 batch->nr = 0;
191 batch->max = MAX_GATHER_BATCH; 197 batch->max = MAX_GATHER_BATCH;
@@ -214,6 +220,7 @@ void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, bool fullmm)
214 tlb->local.nr = 0; 220 tlb->local.nr = 0;
215 tlb->local.max = ARRAY_SIZE(tlb->__pages); 221 tlb->local.max = ARRAY_SIZE(tlb->__pages);
216 tlb->active = &tlb->local; 222 tlb->active = &tlb->local;
223 tlb->batch_count = 0;
217 224
218#ifdef CONFIG_HAVE_RCU_TABLE_FREE 225#ifdef CONFIG_HAVE_RCU_TABLE_FREE
219 tlb->batch = NULL; 226 tlb->batch = NULL;
@@ -717,20 +724,6 @@ static inline bool is_cow_mapping(vm_flags_t flags)
717 return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; 724 return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
718} 725}
719 726
720#ifndef is_zero_pfn
721static inline int is_zero_pfn(unsigned long pfn)
722{
723 return pfn == zero_pfn;
724}
725#endif
726
727#ifndef my_zero_pfn
728static inline unsigned long my_zero_pfn(unsigned long addr)
729{
730 return zero_pfn;
731}
732#endif
733
734/* 727/*
735 * vm_normal_page -- This function gets the "struct page" associated with a pte. 728 * vm_normal_page -- This function gets the "struct page" associated with a pte.
736 * 729 *
@@ -1250,7 +1243,7 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
1250 BUG(); 1243 BUG();
1251 } 1244 }
1252#endif 1245#endif
1253 split_huge_page_pmd(vma->vm_mm, pmd); 1246 split_huge_page_pmd(vma, addr, pmd);
1254 } else if (zap_huge_pmd(tlb, vma, pmd, addr)) 1247 } else if (zap_huge_pmd(tlb, vma, pmd, addr))
1255 goto next; 1248 goto next;
1256 /* fall through */ 1249 /* fall through */
@@ -1517,9 +1510,11 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
1517 page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE); 1510 page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE);
1518 goto out; 1511 goto out;
1519 } 1512 }
1513 if ((flags & FOLL_NUMA) && pmd_numa(*pmd))
1514 goto no_page_table;
1520 if (pmd_trans_huge(*pmd)) { 1515 if (pmd_trans_huge(*pmd)) {
1521 if (flags & FOLL_SPLIT) { 1516 if (flags & FOLL_SPLIT) {
1522 split_huge_page_pmd(mm, pmd); 1517 split_huge_page_pmd(vma, address, pmd);
1523 goto split_fallthrough; 1518 goto split_fallthrough;
1524 } 1519 }
1525 spin_lock(&mm->page_table_lock); 1520 spin_lock(&mm->page_table_lock);
@@ -1546,6 +1541,8 @@ split_fallthrough:
1546 pte = *ptep; 1541 pte = *ptep;
1547 if (!pte_present(pte)) 1542 if (!pte_present(pte))
1548 goto no_page; 1543 goto no_page;
1544 if ((flags & FOLL_NUMA) && pte_numa(pte))
1545 goto no_page;
1549 if ((flags & FOLL_WRITE) && !pte_write(pte)) 1546 if ((flags & FOLL_WRITE) && !pte_write(pte))
1550 goto unlock; 1547 goto unlock;
1551 1548
@@ -1697,6 +1694,19 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1697 (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD); 1694 (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD);
1698 vm_flags &= (gup_flags & FOLL_FORCE) ? 1695 vm_flags &= (gup_flags & FOLL_FORCE) ?
1699 (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE); 1696 (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE);
1697
1698 /*
1699 * If FOLL_FORCE and FOLL_NUMA are both set, handle_mm_fault
1700 * would be called on PROT_NONE ranges. We must never invoke
1701 * handle_mm_fault on PROT_NONE ranges or the NUMA hinting
1702 * page faults would unprotect the PROT_NONE ranges if
1703 * _PAGE_NUMA and _PAGE_PROTNONE are sharing the same pte/pmd
1704 * bitflag. So to avoid that, don't set FOLL_NUMA if
1705 * FOLL_FORCE is set.
1706 */
1707 if (!(gup_flags & FOLL_FORCE))
1708 gup_flags |= FOLL_NUMA;
1709
1700 i = 0; 1710 i = 0;
1701 1711
1702 do { 1712 do {
@@ -2794,13 +2804,8 @@ unlock:
2794oom_free_new: 2804oom_free_new:
2795 page_cache_release(new_page); 2805 page_cache_release(new_page);
2796oom: 2806oom:
2797 if (old_page) { 2807 if (old_page)
2798 if (page_mkwrite) {
2799 unlock_page(old_page);
2800 page_cache_release(old_page);
2801 }
2802 page_cache_release(old_page); 2808 page_cache_release(old_page);
2803 }
2804 return VM_FAULT_OOM; 2809 return VM_FAULT_OOM;
2805 2810
2806unwritable_page: 2811unwritable_page:
@@ -3431,6 +3436,170 @@ static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3431 return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte); 3436 return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
3432} 3437}
3433 3438
3439int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
3440 unsigned long addr, int current_nid)
3441{
3442 get_page(page);
3443
3444 count_vm_numa_event(NUMA_HINT_FAULTS);
3445 if (current_nid == numa_node_id())
3446 count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL);
3447
3448 return mpol_misplaced(page, vma, addr);
3449}
3450
3451int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
3452 unsigned long addr, pte_t pte, pte_t *ptep, pmd_t *pmd)
3453{
3454 struct page *page = NULL;
3455 spinlock_t *ptl;
3456 int current_nid = -1;
3457 int target_nid;
3458 bool migrated = false;
3459
3460 /*
3461 * The "pte" at this point cannot be used safely without
3462 * validation through pte_unmap_same(). It's of NUMA type but
3463 * the pfn may be screwed if the read is non atomic.
3464 *
3465 * ptep_modify_prot_start is not called as this is clearing
3466 * the _PAGE_NUMA bit and it is not really expected that there
3467 * would be concurrent hardware modifications to the PTE.
3468 */
3469 ptl = pte_lockptr(mm, pmd);
3470 spin_lock(ptl);
3471 if (unlikely(!pte_same(*ptep, pte))) {
3472 pte_unmap_unlock(ptep, ptl);
3473 goto out;
3474 }
3475
3476 pte = pte_mknonnuma(pte);
3477 set_pte_at(mm, addr, ptep, pte);
3478 update_mmu_cache(vma, addr, ptep);
3479
3480 page = vm_normal_page(vma, addr, pte);
3481 if (!page) {
3482 pte_unmap_unlock(ptep, ptl);
3483 return 0;
3484 }
3485
3486 current_nid = page_to_nid(page);
3487 target_nid = numa_migrate_prep(page, vma, addr, current_nid);
3488 pte_unmap_unlock(ptep, ptl);
3489 if (target_nid == -1) {
3490 /*
 3491 * Account for the fault against the current node if it is not
3492 * being replaced regardless of where the page is located.
3493 */
3494 current_nid = numa_node_id();
3495 put_page(page);
3496 goto out;
3497 }
3498
3499 /* Migrate to the requested node */
3500 migrated = migrate_misplaced_page(page, target_nid);
3501 if (migrated)
3502 current_nid = target_nid;
3503
3504out:
3505 if (current_nid != -1)
3506 task_numa_fault(current_nid, 1, migrated);
3507 return 0;
3508}
3509
3510/* NUMA hinting page fault entry point for regular pmds */
3511#ifdef CONFIG_NUMA_BALANCING
3512static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
3513 unsigned long addr, pmd_t *pmdp)
3514{
3515 pmd_t pmd;
3516 pte_t *pte, *orig_pte;
3517 unsigned long _addr = addr & PMD_MASK;
3518 unsigned long offset;
3519 spinlock_t *ptl;
3520 bool numa = false;
3521 int local_nid = numa_node_id();
3522
3523 spin_lock(&mm->page_table_lock);
3524 pmd = *pmdp;
3525 if (pmd_numa(pmd)) {
3526 set_pmd_at(mm, _addr, pmdp, pmd_mknonnuma(pmd));
3527 numa = true;
3528 }
3529 spin_unlock(&mm->page_table_lock);
3530
3531 if (!numa)
3532 return 0;
3533
3534 /* we're in a page fault so some vma must be in the range */
3535 BUG_ON(!vma);
3536 BUG_ON(vma->vm_start >= _addr + PMD_SIZE);
3537 offset = max(_addr, vma->vm_start) & ~PMD_MASK;
3538 VM_BUG_ON(offset >= PMD_SIZE);
3539 orig_pte = pte = pte_offset_map_lock(mm, pmdp, _addr, &ptl);
3540 pte += offset >> PAGE_SHIFT;
3541 for (addr = _addr + offset; addr < _addr + PMD_SIZE; pte++, addr += PAGE_SIZE) {
3542 pte_t pteval = *pte;
3543 struct page *page;
3544 int curr_nid = local_nid;
3545 int target_nid;
3546 bool migrated;
3547 if (!pte_present(pteval))
3548 continue;
3549 if (!pte_numa(pteval))
3550 continue;
3551 if (addr >= vma->vm_end) {
3552 vma = find_vma(mm, addr);
3553 /* there's a pte present so there must be a vma */
3554 BUG_ON(!vma);
3555 BUG_ON(addr < vma->vm_start);
3556 }
3557 if (pte_numa(pteval)) {
3558 pteval = pte_mknonnuma(pteval);
3559 set_pte_at(mm, addr, pte, pteval);
3560 }
3561 page = vm_normal_page(vma, addr, pteval);
3562 if (unlikely(!page))
3563 continue;
3564 /* only check non-shared pages */
3565 if (unlikely(page_mapcount(page) != 1))
3566 continue;
3567
3568 /*
3569 * Note that the NUMA fault is later accounted to either
3570 * the node that is currently running or where the page is
3571 * migrated to.
3572 */
3573 curr_nid = local_nid;
3574 target_nid = numa_migrate_prep(page, vma, addr,
3575 page_to_nid(page));
3576 if (target_nid == -1) {
3577 put_page(page);
3578 continue;
3579 }
3580
3581 /* Migrate to the requested node */
3582 pte_unmap_unlock(pte, ptl);
3583 migrated = migrate_misplaced_page(page, target_nid);
3584 if (migrated)
3585 curr_nid = target_nid;
3586 task_numa_fault(curr_nid, 1, migrated);
3587
3588 pte = pte_offset_map_lock(mm, pmdp, addr, &ptl);
3589 }
3590 pte_unmap_unlock(orig_pte, ptl);
3591
3592 return 0;
3593}
3594#else
3595static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
3596 unsigned long addr, pmd_t *pmdp)
3597{
3598 BUG();
3599 return 0;
3600}
3601#endif /* CONFIG_NUMA_BALANCING */
3602
3434/* 3603/*
3435 * These routines also need to handle stuff like marking pages dirty 3604 * These routines also need to handle stuff like marking pages dirty
3436 * and/or accessed for architectures that don't do it in hardware (most 3605 * and/or accessed for architectures that don't do it in hardware (most
@@ -3469,6 +3638,9 @@ int handle_pte_fault(struct mm_struct *mm,
3469 pte, pmd, flags, entry); 3638 pte, pmd, flags, entry);
3470 } 3639 }
3471 3640
3641 if (pte_numa(entry))
3642 return do_numa_page(mm, vma, address, entry, pte, pmd);
3643
3472 ptl = pte_lockptr(mm, pmd); 3644 ptl = pte_lockptr(mm, pmd);
3473 spin_lock(ptl); 3645 spin_lock(ptl);
3474 if (unlikely(!pte_same(*pte, entry))) 3646 if (unlikely(!pte_same(*pte, entry)))
@@ -3537,9 +3709,21 @@ retry:
3537 3709
3538 barrier(); 3710 barrier();
3539 if (pmd_trans_huge(orig_pmd)) { 3711 if (pmd_trans_huge(orig_pmd)) {
3540 if (flags & FAULT_FLAG_WRITE && 3712 unsigned int dirty = flags & FAULT_FLAG_WRITE;
3541 !pmd_write(orig_pmd) && 3713
3542 !pmd_trans_splitting(orig_pmd)) { 3714 /*
3715 * If the pmd is splitting, return and retry the
 3716 * fault. Alternative: wait until the split
3717 * is done, and goto retry.
3718 */
3719 if (pmd_trans_splitting(orig_pmd))
3720 return 0;
3721
3722 if (pmd_numa(orig_pmd))
3723 return do_huge_pmd_numa_page(mm, vma, address,
3724 orig_pmd, pmd);
3725
3726 if (dirty && !pmd_write(orig_pmd)) {
3543 ret = do_huge_pmd_wp_page(mm, vma, address, pmd, 3727 ret = do_huge_pmd_wp_page(mm, vma, address, pmd,
3544 orig_pmd); 3728 orig_pmd);
3545 /* 3729 /*
@@ -3550,17 +3734,25 @@ retry:
3550 if (unlikely(ret & VM_FAULT_OOM)) 3734 if (unlikely(ret & VM_FAULT_OOM))
3551 goto retry; 3735 goto retry;
3552 return ret; 3736 return ret;
3737 } else {
3738 huge_pmd_set_accessed(mm, vma, address, pmd,
3739 orig_pmd, dirty);
3553 } 3740 }
3741
3554 return 0; 3742 return 0;
3555 } 3743 }
3556 } 3744 }
3557 3745
3746 if (pmd_numa(*pmd))
3747 return do_pmd_numa_page(mm, vma, address, pmd);
3748
3558 /* 3749 /*
3559 * Use __pte_alloc instead of pte_alloc_map, because we can't 3750 * Use __pte_alloc instead of pte_alloc_map, because we can't
3560 * run pte_offset_map on the pmd, if an huge pmd could 3751 * run pte_offset_map on the pmd, if an huge pmd could
3561 * materialize from under us from a different thread. 3752 * materialize from under us from a different thread.
3562 */ 3753 */
3563 if (unlikely(pmd_none(*pmd)) && __pte_alloc(mm, vma, pmd, address)) 3754 if (unlikely(pmd_none(*pmd)) &&
3755 unlikely(__pte_alloc(mm, vma, pmd, address)))
3564 return VM_FAULT_OOM; 3756 return VM_FAULT_OOM;
3565 /* if an huge pmd materialized from under us just retry later */ 3757 /* if an huge pmd materialized from under us just retry later */
3566 if (unlikely(pmd_trans_huge(*pmd))) 3758 if (unlikely(pmd_trans_huge(*pmd)))
@@ -3940,15 +4132,12 @@ void print_vma_addr(char *prefix, unsigned long ip)
3940 struct file *f = vma->vm_file; 4132 struct file *f = vma->vm_file;
3941 char *buf = (char *)__get_free_page(GFP_KERNEL); 4133 char *buf = (char *)__get_free_page(GFP_KERNEL);
3942 if (buf) { 4134 if (buf) {
3943 char *p, *s; 4135 char *p;
3944 4136
3945 p = d_path(&f->f_path, buf, PAGE_SIZE); 4137 p = d_path(&f->f_path, buf, PAGE_SIZE);
3946 if (IS_ERR(p)) 4138 if (IS_ERR(p))
3947 p = "?"; 4139 p = "?";
3948 s = strrchr(p, '/'); 4140 printk("%s%s[%lx+%lx]", prefix, kbasename(p),
3949 if (s)
3950 p = s+1;
3951 printk("%s%s[%lx+%lx]", prefix, p,
3952 vma->vm_start, 4141 vma->vm_start,
3953 vma->vm_end - vma->vm_start); 4142 vma->vm_end - vma->vm_start);
3954 free_page((unsigned long)buf); 4143 free_page((unsigned long)buf);
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index e4eeacae2b91..d04ed87bfacb 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -106,6 +106,7 @@ static void get_page_bootmem(unsigned long info, struct page *page,
106void __ref put_page_bootmem(struct page *page) 106void __ref put_page_bootmem(struct page *page)
107{ 107{
108 unsigned long type; 108 unsigned long type;
109 static DEFINE_MUTEX(ppb_lock);
109 110
110 type = (unsigned long) page->lru.next; 111 type = (unsigned long) page->lru.next;
111 BUG_ON(type < MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE || 112 BUG_ON(type < MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE ||
@@ -115,7 +116,14 @@ void __ref put_page_bootmem(struct page *page)
115 ClearPagePrivate(page); 116 ClearPagePrivate(page);
116 set_page_private(page, 0); 117 set_page_private(page, 0);
117 INIT_LIST_HEAD(&page->lru); 118 INIT_LIST_HEAD(&page->lru);
119
120 /*
121 * Please refer to comment for __free_pages_bootmem()
122 * for why we serialize here.
123 */
124 mutex_lock(&ppb_lock);
118 __free_pages_bootmem(page, 0); 125 __free_pages_bootmem(page, 0);
126 mutex_unlock(&ppb_lock);
119 } 127 }
120 128
121} 129}
@@ -205,7 +213,7 @@ static void grow_zone_span(struct zone *zone, unsigned long start_pfn,
205 zone_span_writelock(zone); 213 zone_span_writelock(zone);
206 214
207 old_zone_end_pfn = zone->zone_start_pfn + zone->spanned_pages; 215 old_zone_end_pfn = zone->zone_start_pfn + zone->spanned_pages;
208 if (start_pfn < zone->zone_start_pfn) 216 if (!zone->spanned_pages || start_pfn < zone->zone_start_pfn)
209 zone->zone_start_pfn = start_pfn; 217 zone->zone_start_pfn = start_pfn;
210 218
211 zone->spanned_pages = max(old_zone_end_pfn, end_pfn) - 219 zone->spanned_pages = max(old_zone_end_pfn, end_pfn) -
@@ -214,13 +222,134 @@ static void grow_zone_span(struct zone *zone, unsigned long start_pfn,
214 zone_span_writeunlock(zone); 222 zone_span_writeunlock(zone);
215} 223}
216 224
225static void resize_zone(struct zone *zone, unsigned long start_pfn,
226 unsigned long end_pfn)
227{
228 zone_span_writelock(zone);
229
230 if (end_pfn - start_pfn) {
231 zone->zone_start_pfn = start_pfn;
232 zone->spanned_pages = end_pfn - start_pfn;
233 } else {
234 /*
 235 * keep this consistent with free_area_init_core():
 236 * if spanned_pages == 0, then keep start_pfn = 0
237 */
238 zone->zone_start_pfn = 0;
239 zone->spanned_pages = 0;
240 }
241
242 zone_span_writeunlock(zone);
243}
244
245static void fix_zone_id(struct zone *zone, unsigned long start_pfn,
246 unsigned long end_pfn)
247{
248 enum zone_type zid = zone_idx(zone);
249 int nid = zone->zone_pgdat->node_id;
250 unsigned long pfn;
251
252 for (pfn = start_pfn; pfn < end_pfn; pfn++)
253 set_page_links(pfn_to_page(pfn), zid, nid, pfn);
254}
255
256static int __meminit move_pfn_range_left(struct zone *z1, struct zone *z2,
257 unsigned long start_pfn, unsigned long end_pfn)
258{
259 int ret;
260 unsigned long flags;
261 unsigned long z1_start_pfn;
262
263 if (!z1->wait_table) {
264 ret = init_currently_empty_zone(z1, start_pfn,
265 end_pfn - start_pfn, MEMMAP_HOTPLUG);
266 if (ret)
267 return ret;
268 }
269
270 pgdat_resize_lock(z1->zone_pgdat, &flags);
271
272 /* can't move pfns which are higher than @z2 */
273 if (end_pfn > z2->zone_start_pfn + z2->spanned_pages)
274 goto out_fail;
 275 /* the part being moved out must be at the leftmost of @z2 */
276 if (start_pfn > z2->zone_start_pfn)
277 goto out_fail;
 278 /* the range must include/overlap the start of @z2 */
279 if (end_pfn <= z2->zone_start_pfn)
280 goto out_fail;
281
282 /* use start_pfn for z1's start_pfn if z1 is empty */
283 if (z1->spanned_pages)
284 z1_start_pfn = z1->zone_start_pfn;
285 else
286 z1_start_pfn = start_pfn;
287
288 resize_zone(z1, z1_start_pfn, end_pfn);
289 resize_zone(z2, end_pfn, z2->zone_start_pfn + z2->spanned_pages);
290
291 pgdat_resize_unlock(z1->zone_pgdat, &flags);
292
293 fix_zone_id(z1, start_pfn, end_pfn);
294
295 return 0;
296out_fail:
297 pgdat_resize_unlock(z1->zone_pgdat, &flags);
298 return -1;
299}
300
301static int __meminit move_pfn_range_right(struct zone *z1, struct zone *z2,
302 unsigned long start_pfn, unsigned long end_pfn)
303{
304 int ret;
305 unsigned long flags;
306 unsigned long z2_end_pfn;
307
308 if (!z2->wait_table) {
309 ret = init_currently_empty_zone(z2, start_pfn,
310 end_pfn - start_pfn, MEMMAP_HOTPLUG);
311 if (ret)
312 return ret;
313 }
314
315 pgdat_resize_lock(z1->zone_pgdat, &flags);
316
317 /* can't move pfns which are lower than @z1 */
318 if (z1->zone_start_pfn > start_pfn)
319 goto out_fail;
 320 /* the part being moved out must be at the rightmost of @z1 */
321 if (z1->zone_start_pfn + z1->spanned_pages > end_pfn)
322 goto out_fail;
 323 /* the range must include/overlap the end of @z1 */
324 if (start_pfn >= z1->zone_start_pfn + z1->spanned_pages)
325 goto out_fail;
326
327 /* use end_pfn for z2's end_pfn if z2 is empty */
328 if (z2->spanned_pages)
329 z2_end_pfn = z2->zone_start_pfn + z2->spanned_pages;
330 else
331 z2_end_pfn = end_pfn;
332
333 resize_zone(z1, z1->zone_start_pfn, start_pfn);
334 resize_zone(z2, start_pfn, z2_end_pfn);
335
336 pgdat_resize_unlock(z1->zone_pgdat, &flags);
337
338 fix_zone_id(z2, start_pfn, end_pfn);
339
340 return 0;
341out_fail:
342 pgdat_resize_unlock(z1->zone_pgdat, &flags);
343 return -1;
344}
345
217static void grow_pgdat_span(struct pglist_data *pgdat, unsigned long start_pfn, 346static void grow_pgdat_span(struct pglist_data *pgdat, unsigned long start_pfn,
218 unsigned long end_pfn) 347 unsigned long end_pfn)
219{ 348{
220 unsigned long old_pgdat_end_pfn = 349 unsigned long old_pgdat_end_pfn =
221 pgdat->node_start_pfn + pgdat->node_spanned_pages; 350 pgdat->node_start_pfn + pgdat->node_spanned_pages;
222 351
223 if (start_pfn < pgdat->node_start_pfn) 352 if (!pgdat->node_spanned_pages || start_pfn < pgdat->node_start_pfn)
224 pgdat->node_start_pfn = start_pfn; 353 pgdat->node_start_pfn = start_pfn;
225 354
226 pgdat->node_spanned_pages = max(old_pgdat_end_pfn, end_pfn) - 355 pgdat->node_spanned_pages = max(old_pgdat_end_pfn, end_pfn) -
@@ -460,8 +589,99 @@ static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages,
460 return 0; 589 return 0;
461} 590}
462 591
592#ifdef CONFIG_MOVABLE_NODE
593/*
 594 * When CONFIG_MOVABLE_NODE is set, we permit onlining of a node which doesn't have
595 * normal memory.
596 */
597static bool can_online_high_movable(struct zone *zone)
598{
599 return true;
600}
601#else /* CONFIG_MOVABLE_NODE */
602/* ensure every online node has NORMAL memory */
603static bool can_online_high_movable(struct zone *zone)
604{
605 return node_state(zone_to_nid(zone), N_NORMAL_MEMORY);
606}
607#endif /* CONFIG_MOVABLE_NODE */
463 608
464int __ref online_pages(unsigned long pfn, unsigned long nr_pages) 609/* check which state of node_states will be changed when online memory */
610static void node_states_check_changes_online(unsigned long nr_pages,
611 struct zone *zone, struct memory_notify *arg)
612{
613 int nid = zone_to_nid(zone);
614 enum zone_type zone_last = ZONE_NORMAL;
615
616 /*
617 * If we have HIGHMEM or movable node, node_states[N_NORMAL_MEMORY]
618 * contains nodes which have zones of 0...ZONE_NORMAL,
619 * set zone_last to ZONE_NORMAL.
620 *
621 * If we don't have HIGHMEM nor movable node,
622 * node_states[N_NORMAL_MEMORY] contains nodes which have zones of
623 * 0...ZONE_MOVABLE, set zone_last to ZONE_MOVABLE.
624 */
625 if (N_MEMORY == N_NORMAL_MEMORY)
626 zone_last = ZONE_MOVABLE;
627
628 /*
 629 * if the memory to be onlined is in a zone of 0...zone_last, and
630 * the zones of 0...zone_last don't have memory before online, we will
631 * need to set the node to node_states[N_NORMAL_MEMORY] after
632 * the memory is online.
633 */
634 if (zone_idx(zone) <= zone_last && !node_state(nid, N_NORMAL_MEMORY))
635 arg->status_change_nid_normal = nid;
636 else
637 arg->status_change_nid_normal = -1;
638
639#ifdef CONFIG_HIGHMEM
640 /*
641 * If we have movable node, node_states[N_HIGH_MEMORY]
642 * contains nodes which have zones of 0...ZONE_HIGHMEM,
643 * set zone_last to ZONE_HIGHMEM.
644 *
645 * If we don't have movable node, node_states[N_NORMAL_MEMORY]
646 * contains nodes which have zones of 0...ZONE_MOVABLE,
647 * set zone_last to ZONE_MOVABLE.
648 */
649 zone_last = ZONE_HIGHMEM;
650 if (N_MEMORY == N_HIGH_MEMORY)
651 zone_last = ZONE_MOVABLE;
652
653 if (zone_idx(zone) <= zone_last && !node_state(nid, N_HIGH_MEMORY))
654 arg->status_change_nid_high = nid;
655 else
656 arg->status_change_nid_high = -1;
657#else
658 arg->status_change_nid_high = arg->status_change_nid_normal;
659#endif
660
661 /*
 662 * if the node doesn't have memory before onlining, we will need to
663 * set the node to node_states[N_MEMORY] after the memory
664 * is online.
665 */
666 if (!node_state(nid, N_MEMORY))
667 arg->status_change_nid = nid;
668 else
669 arg->status_change_nid = -1;
670}
671
672static void node_states_set_node(int node, struct memory_notify *arg)
673{
674 if (arg->status_change_nid_normal >= 0)
675 node_set_state(node, N_NORMAL_MEMORY);
676
677 if (arg->status_change_nid_high >= 0)
678 node_set_state(node, N_HIGH_MEMORY);
679
680 node_set_state(node, N_MEMORY);
681}
682
683
684int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_type)
465{ 685{
466 unsigned long onlined_pages = 0; 686 unsigned long onlined_pages = 0;
467 struct zone *zone; 687 struct zone *zone;
@@ -471,13 +691,40 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages)
471 struct memory_notify arg; 691 struct memory_notify arg;
472 692
473 lock_memory_hotplug(); 693 lock_memory_hotplug();
694 /*
695 * This doesn't need a lock to do pfn_to_page().
696 * The section can't be removed here because of the
697 * memory_block->state_mutex.
698 */
699 zone = page_zone(pfn_to_page(pfn));
700
701 if ((zone_idx(zone) > ZONE_NORMAL || online_type == ONLINE_MOVABLE) &&
702 !can_online_high_movable(zone)) {
703 unlock_memory_hotplug();
704 return -1;
705 }
706
707 if (online_type == ONLINE_KERNEL && zone_idx(zone) == ZONE_MOVABLE) {
708 if (move_pfn_range_left(zone - 1, zone, pfn, pfn + nr_pages)) {
709 unlock_memory_hotplug();
710 return -1;
711 }
712 }
713 if (online_type == ONLINE_MOVABLE && zone_idx(zone) == ZONE_MOVABLE - 1) {
714 if (move_pfn_range_right(zone, zone + 1, pfn, pfn + nr_pages)) {
715 unlock_memory_hotplug();
716 return -1;
717 }
718 }
719
 720 /* The previous code may have changed the zone of the pfn range */
721 zone = page_zone(pfn_to_page(pfn));
722
474 arg.start_pfn = pfn; 723 arg.start_pfn = pfn;
475 arg.nr_pages = nr_pages; 724 arg.nr_pages = nr_pages;
476 arg.status_change_nid = -1; 725 node_states_check_changes_online(nr_pages, zone, &arg);
477 726
478 nid = page_to_nid(pfn_to_page(pfn)); 727 nid = page_to_nid(pfn_to_page(pfn));
479 if (node_present_pages(nid) == 0)
480 arg.status_change_nid = nid;
481 728
482 ret = memory_notify(MEM_GOING_ONLINE, &arg); 729 ret = memory_notify(MEM_GOING_ONLINE, &arg);
483 ret = notifier_to_errno(ret); 730 ret = notifier_to_errno(ret);
@@ -487,23 +734,21 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages)
487 return ret; 734 return ret;
488 } 735 }
489 /* 736 /*
490 * This doesn't need a lock to do pfn_to_page().
491 * The section can't be removed here because of the
492 * memory_block->state_mutex.
493 */
494 zone = page_zone(pfn_to_page(pfn));
495 /*
496 * If this zone is not populated, then it is not in zonelist. 737 * If this zone is not populated, then it is not in zonelist.
497 * This means the page allocator ignores this zone. 738 * This means the page allocator ignores this zone.
498 * So, zonelist must be updated after online. 739 * So, zonelist must be updated after online.
499 */ 740 */
500 mutex_lock(&zonelists_mutex); 741 mutex_lock(&zonelists_mutex);
501 if (!populated_zone(zone)) 742 if (!populated_zone(zone)) {
502 need_zonelists_rebuild = 1; 743 need_zonelists_rebuild = 1;
744 build_all_zonelists(NULL, zone);
745 }
503 746
504 ret = walk_system_ram_range(pfn, nr_pages, &onlined_pages, 747 ret = walk_system_ram_range(pfn, nr_pages, &onlined_pages,
505 online_pages_range); 748 online_pages_range);
506 if (ret) { 749 if (ret) {
750 if (need_zonelists_rebuild)
751 zone_pcp_reset(zone);
507 mutex_unlock(&zonelists_mutex); 752 mutex_unlock(&zonelists_mutex);
508 printk(KERN_DEBUG "online_pages [mem %#010llx-%#010llx] failed\n", 753 printk(KERN_DEBUG "online_pages [mem %#010llx-%#010llx] failed\n",
509 (unsigned long long) pfn << PAGE_SHIFT, 754 (unsigned long long) pfn << PAGE_SHIFT,
@@ -514,12 +759,13 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages)
514 return ret; 759 return ret;
515 } 760 }
516 761
762 zone->managed_pages += onlined_pages;
517 zone->present_pages += onlined_pages; 763 zone->present_pages += onlined_pages;
518 zone->zone_pgdat->node_present_pages += onlined_pages; 764 zone->zone_pgdat->node_present_pages += onlined_pages;
519 if (onlined_pages) { 765 if (onlined_pages) {
520 node_set_state(zone_to_nid(zone), N_HIGH_MEMORY); 766 node_states_set_node(zone_to_nid(zone), &arg);
521 if (need_zonelists_rebuild) 767 if (need_zonelists_rebuild)
522 build_all_zonelists(NULL, zone); 768 build_all_zonelists(NULL, NULL);
523 else 769 else
524 zone_pcp_update(zone); 770 zone_pcp_update(zone);
525 } 771 }
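Together with the MOVABLE_NODE support, the new online_type argument is what lets userspace choose the target zone when onlining a memory block; the companion driver change in this series (not shown in this diff) is expected to accept matching "online_kernel"/"online_movable" strings in the sysfs state file, so treat the interface details below as assumptions. A hedged userspace sketch of driving it (the block number and sysfs layout are likewise assumed):

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	/* Block number is an assumption; pick a real offline block. */
	const char *state = "/sys/devices/system/memory/memory32/state";
	const char *mode = "online_movable";	/* or "online_kernel" */
	int fd = open(state, O_WRONLY);

	if (fd < 0) {
		perror("open state");
		return 1;
	}
	/* The write ends up in online_pages(..., ONLINE_MOVABLE), which
	 * may first move the pfn range into ZONE_MOVABLE as above. */
	if (write(fd, mode, strlen(mode)) < 0)
		perror("write state");
	close(fd);
	return 0;
}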
@@ -812,7 +1058,8 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
812 * migrate_pages returns # of failed pages. 1058 * migrate_pages returns # of failed pages.
813 */ 1059 */
814 ret = migrate_pages(&source, alloc_migrate_target, 0, 1060 ret = migrate_pages(&source, alloc_migrate_target, 0,
815 true, MIGRATE_SYNC); 1061 true, MIGRATE_SYNC,
1062 MR_MEMORY_HOTPLUG);
816 if (ret) 1063 if (ret)
817 putback_lru_pages(&source); 1064 putback_lru_pages(&source);
818 } 1065 }
@@ -847,7 +1094,7 @@ check_pages_isolated_cb(unsigned long start_pfn, unsigned long nr_pages,
847{ 1094{
848 int ret; 1095 int ret;
849 long offlined = *(long *)data; 1096 long offlined = *(long *)data;
850 ret = test_pages_isolated(start_pfn, start_pfn + nr_pages); 1097 ret = test_pages_isolated(start_pfn, start_pfn + nr_pages, true);
851 offlined = nr_pages; 1098 offlined = nr_pages;
852 if (!ret) 1099 if (!ret)
853 *(long *)data += offlined; 1100 *(long *)data += offlined;
@@ -867,6 +1114,132 @@ check_pages_isolated(unsigned long start_pfn, unsigned long end_pfn)
867 return offlined; 1114 return offlined;
868} 1115}
869 1116
1117#ifdef CONFIG_MOVABLE_NODE
1118/*
 1119 * When CONFIG_MOVABLE_NODE is set, we permit offlining of a node which doesn't have
1120 * normal memory.
1121 */
1122static bool can_offline_normal(struct zone *zone, unsigned long nr_pages)
1123{
1124 return true;
1125}
1126#else /* CONFIG_MOVABLE_NODE */
1127/* ensure the node has NORMAL memory if it is still online */
1128static bool can_offline_normal(struct zone *zone, unsigned long nr_pages)
1129{
1130 struct pglist_data *pgdat = zone->zone_pgdat;
1131 unsigned long present_pages = 0;
1132 enum zone_type zt;
1133
1134 for (zt = 0; zt <= ZONE_NORMAL; zt++)
1135 present_pages += pgdat->node_zones[zt].present_pages;
1136
1137 if (present_pages > nr_pages)
1138 return true;
1139
1140 present_pages = 0;
1141 for (; zt <= ZONE_MOVABLE; zt++)
1142 present_pages += pgdat->node_zones[zt].present_pages;
1143
1144 /*
1145 * we can't offline the last normal memory until all
1146 * higher memory is offlined.
1147 */
1148 return present_pages == 0;
1149}
1150#endif /* CONFIG_MOVABLE_NODE */
1151
1152/* check which state of node_states will be changed when offline memory */
1153static void node_states_check_changes_offline(unsigned long nr_pages,
1154 struct zone *zone, struct memory_notify *arg)
1155{
1156 struct pglist_data *pgdat = zone->zone_pgdat;
1157 unsigned long present_pages = 0;
1158 enum zone_type zt, zone_last = ZONE_NORMAL;
1159
1160 /*
1161 * If we have HIGHMEM or movable node, node_states[N_NORMAL_MEMORY]
1162 * contains nodes which have zones of 0...ZONE_NORMAL,
1163 * set zone_last to ZONE_NORMAL.
1164 *
1165 * If we don't have HIGHMEM nor movable node,
1166 * node_states[N_NORMAL_MEMORY] contains nodes which have zones of
1167 * 0...ZONE_MOVABLE, set zone_last to ZONE_MOVABLE.
1168 */
1169 if (N_MEMORY == N_NORMAL_MEMORY)
1170 zone_last = ZONE_MOVABLE;
1171
1172 /*
1173 * check whether node_states[N_NORMAL_MEMORY] will be changed.
1174 * If the memory to be offline is in a zone of 0...zone_last,
1175 * and it is the last present memory, 0...zone_last will
 1176 * become empty after offline, thus we can determine we will
1177 * need to clear the node from node_states[N_NORMAL_MEMORY].
1178 */
1179 for (zt = 0; zt <= zone_last; zt++)
1180 present_pages += pgdat->node_zones[zt].present_pages;
1181 if (zone_idx(zone) <= zone_last && nr_pages >= present_pages)
1182 arg->status_change_nid_normal = zone_to_nid(zone);
1183 else
1184 arg->status_change_nid_normal = -1;
1185
1186#ifdef CONFIG_HIGHMEM
1187 /*
1188 * If we have movable node, node_states[N_HIGH_MEMORY]
1189 * contains nodes which have zones of 0...ZONE_HIGHMEM,
1190 * set zone_last to ZONE_HIGHMEM.
1191 *
1192 * If we don't have movable node, node_states[N_NORMAL_MEMORY]
1193 * contains nodes which have zones of 0...ZONE_MOVABLE,
1194 * set zone_last to ZONE_MOVABLE.
1195 */
1196 zone_last = ZONE_HIGHMEM;
1197 if (N_MEMORY == N_HIGH_MEMORY)
1198 zone_last = ZONE_MOVABLE;
1199
1200 for (; zt <= zone_last; zt++)
1201 present_pages += pgdat->node_zones[zt].present_pages;
1202 if (zone_idx(zone) <= zone_last && nr_pages >= present_pages)
1203 arg->status_change_nid_high = zone_to_nid(zone);
1204 else
1205 arg->status_change_nid_high = -1;
1206#else
1207 arg->status_change_nid_high = arg->status_change_nid_normal;
1208#endif
1209
1210 /*
1211 * node_states[N_HIGH_MEMORY] contains nodes which have 0...ZONE_MOVABLE
1212 */
1213 zone_last = ZONE_MOVABLE;
1214
1215 /*
1216 * check whether node_states[N_HIGH_MEMORY] will be changed
1217 * If we try to offline the last present @nr_pages from the node,
 1218 * we can determine we will need to clear the node from
1219 * node_states[N_HIGH_MEMORY].
1220 */
1221 for (; zt <= zone_last; zt++)
1222 present_pages += pgdat->node_zones[zt].present_pages;
1223 if (nr_pages >= present_pages)
1224 arg->status_change_nid = zone_to_nid(zone);
1225 else
1226 arg->status_change_nid = -1;
1227}
1228
1229static void node_states_clear_node(int node, struct memory_notify *arg)
1230{
1231 if (arg->status_change_nid_normal >= 0)
1232 node_clear_state(node, N_NORMAL_MEMORY);
1233
1234 if ((N_MEMORY != N_NORMAL_MEMORY) &&
1235 (arg->status_change_nid_high >= 0))
1236 node_clear_state(node, N_HIGH_MEMORY);
1237
1238 if ((N_MEMORY != N_HIGH_MEMORY) &&
1239 (arg->status_change_nid >= 0))
1240 node_clear_state(node, N_MEMORY);
1241}
1242
870static int __ref __offline_pages(unsigned long start_pfn, 1243static int __ref __offline_pages(unsigned long start_pfn,
871 unsigned long end_pfn, unsigned long timeout) 1244 unsigned long end_pfn, unsigned long timeout)
872{ 1245{
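
The can_offline_normal() check added above boils down to zone bookkeeping: a node's last ZONE_NORMAL (and lower) memory may only be offlined once every higher zone on that node is already empty. The following toy program is only an illustration of that rule with made-up page counts, not code from this patch:

#include <stdbool.h>
#include <stdio.h>

/* Toy model of can_offline_normal() when CONFIG_MOVABLE_NODE is not set:
 * the last NORMAL (and lower) memory of a node may only go away once
 * every higher zone on that node is already empty. */
static bool can_offline_normal_toy(unsigned long normal_pages,
				   unsigned long higher_pages,
				   unsigned long nr_pages)
{
	if (normal_pages > nr_pages)	/* some normal memory survives */
		return true;
	return higher_pages == 0;	/* last normal memory: only if nothing above */
}

int main(void)
{
	/* hypothetical node: 65536 normal pages, 262144 movable pages */
	printf("%d\n", can_offline_normal_toy(65536, 262144, 65536)); /* 0: refused */
	printf("%d\n", can_offline_normal_toy(65536, 0, 65536));      /* 1: allowed */
	printf("%d\n", can_offline_normal_toy(65536, 262144, 16384)); /* 1: allowed */
	return 0;
}

With CONFIG_MOVABLE_NODE the whole check is short-circuited to true, which is what lets an all-movable node be unplugged as one unit.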
@@ -893,16 +1266,19 @@ static int __ref __offline_pages(unsigned long start_pfn,
893 node = zone_to_nid(zone); 1266 node = zone_to_nid(zone);
894 nr_pages = end_pfn - start_pfn; 1267 nr_pages = end_pfn - start_pfn;
895 1268
1269 ret = -EINVAL;
1270 if (zone_idx(zone) <= ZONE_NORMAL && !can_offline_normal(zone, nr_pages))
1271 goto out;
1272
896 /* set above range as isolated */ 1273 /* set above range as isolated */
897 ret = start_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE); 1274 ret = start_isolate_page_range(start_pfn, end_pfn,
1275 MIGRATE_MOVABLE, true);
898 if (ret) 1276 if (ret)
899 goto out; 1277 goto out;
900 1278
901 arg.start_pfn = start_pfn; 1279 arg.start_pfn = start_pfn;
902 arg.nr_pages = nr_pages; 1280 arg.nr_pages = nr_pages;
903 arg.status_change_nid = -1; 1281 node_states_check_changes_offline(nr_pages, zone, &arg);
904 if (nr_pages >= node_present_pages(node))
905 arg.status_change_nid = node;
906 1282
907 ret = memory_notify(MEM_GOING_OFFLINE, &arg); 1283 ret = memory_notify(MEM_GOING_OFFLINE, &arg);
908 ret = notifier_to_errno(ret); 1284 ret = notifier_to_errno(ret);
@@ -943,10 +1319,10 @@ repeat:
943 goto repeat; 1319 goto repeat;
944 } 1320 }
945 } 1321 }
946 /* drain all zone's lru pagevec, this is asyncronous... */ 1322 /* drain all zone's lru pagevec, this is asynchronous... */
947 lru_add_drain_all(); 1323 lru_add_drain_all();
948 yield(); 1324 yield();
949 /* drain pcp pages , this is synchrouns. */ 1325 /* drain pcp pages, this is synchronous. */
950 drain_all_pages(); 1326 drain_all_pages();
951 /* check again */ 1327 /* check again */
952 offlined_pages = check_pages_isolated(start_pfn, end_pfn); 1328 offlined_pages = check_pages_isolated(start_pfn, end_pfn);
@@ -955,12 +1331,13 @@ repeat:
955 goto failed_removal; 1331 goto failed_removal;
956 } 1332 }
957 printk(KERN_INFO "Offlined Pages %ld\n", offlined_pages); 1333 printk(KERN_INFO "Offlined Pages %ld\n", offlined_pages);
958 /* Ok, all of our target is islaoted. 1334 /* Ok, all of our target is isolated.
959 We cannot do rollback at this point. */ 1335 We cannot do rollback at this point. */
960 offline_isolated_pages(start_pfn, end_pfn); 1336 offline_isolated_pages(start_pfn, end_pfn);
961 /* reset pagetype flags and makes migrate type to be MOVABLE */ 1337 /* reset pagetype flags and makes migrate type to be MOVABLE */
962 undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE); 1338 undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE);
963 /* removal success */ 1339 /* removal success */
1340 zone->managed_pages -= offlined_pages;
964 zone->present_pages -= offlined_pages; 1341 zone->present_pages -= offlined_pages;
965 zone->zone_pgdat->node_present_pages -= offlined_pages; 1342 zone->zone_pgdat->node_present_pages -= offlined_pages;
966 totalram_pages -= offlined_pages; 1343 totalram_pages -= offlined_pages;
@@ -975,10 +1352,9 @@ repeat:
975 } else 1352 } else
976 zone_pcp_update(zone); 1353 zone_pcp_update(zone);
977 1354
978 if (!node_present_pages(node)) { 1355 node_states_clear_node(node, &arg);
979 node_clear_state(node, N_HIGH_MEMORY); 1356 if (arg.status_change_nid >= 0)
980 kswapd_stop(node); 1357 kswapd_stop(node);
981 }
982 1358
983 vm_total_pages = nr_free_pagecache_pages(); 1359 vm_total_pages = nr_free_pagecache_pages();
984 writeback_set_ratelimit(); 1360 writeback_set_ratelimit();
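
The new status_change_nid_normal and status_change_nid_high fields are computed purely for the benefit of memory_notify() listeners, alongside the existing status_change_nid. As a hedged sketch of the consumer side (not part of this diff; the callback and messages are invented for illustration), a notifier could react to them roughly like this:

#include <linux/memory.h>
#include <linux/module.h>
#include <linux/notifier.h>
#include <linux/printk.h>

/* Hypothetical consumer of MEM_GOING_OFFLINE: it only reports which
 * node_states entries the offline operation is about to clear. */
static int example_mem_cb(struct notifier_block *nb,
			  unsigned long action, void *data)
{
	struct memory_notify *arg = data;

	if (action == MEM_GOING_OFFLINE) {
		if (arg->status_change_nid_normal >= 0)
			pr_info("node %d is losing its last normal memory\n",
				arg->status_change_nid_normal);
		if (arg->status_change_nid >= 0)
			pr_info("node %d is losing its last memory\n",
				arg->status_change_nid);
	}
	return NOTIFY_OK;
}

static struct notifier_block example_mem_nb = {
	.notifier_call = example_mem_cb,
};

static int __init example_init(void)
{
	return register_memory_notifier(&example_mem_nb);
}
module_init(example_init);
MODULE_LICENSE("GPL");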
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 4ea600da8940..e2df1c1fb41f 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -90,6 +90,7 @@
90#include <linux/syscalls.h> 90#include <linux/syscalls.h>
91#include <linux/ctype.h> 91#include <linux/ctype.h>
92#include <linux/mm_inline.h> 92#include <linux/mm_inline.h>
93#include <linux/mmu_notifier.h>
93 94
94#include <asm/tlbflush.h> 95#include <asm/tlbflush.h>
95#include <asm/uaccess.h> 96#include <asm/uaccess.h>
@@ -117,6 +118,26 @@ static struct mempolicy default_policy = {
117 .flags = MPOL_F_LOCAL, 118 .flags = MPOL_F_LOCAL,
118}; 119};
119 120
121static struct mempolicy preferred_node_policy[MAX_NUMNODES];
122
123static struct mempolicy *get_task_policy(struct task_struct *p)
124{
125 struct mempolicy *pol = p->mempolicy;
126 int node;
127
128 if (!pol) {
129 node = numa_node_id();
130 if (node != -1)
131 pol = &preferred_node_policy[node];
132
133 /* preferred_node_policy is not initialised early in boot */
134 if (!pol->mode)
135 pol = NULL;
136 }
137
138 return pol;
139}
140
120static const struct mempolicy_operations { 141static const struct mempolicy_operations {
121 int (*create)(struct mempolicy *pol, const nodemask_t *nodes); 142 int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
122 /* 143 /*
@@ -212,9 +233,9 @@ static int mpol_set_nodemask(struct mempolicy *pol,
212 /* if mode is MPOL_DEFAULT, pol is NULL. This is right. */ 233 /* if mode is MPOL_DEFAULT, pol is NULL. This is right. */
213 if (pol == NULL) 234 if (pol == NULL)
214 return 0; 235 return 0;
215 /* Check N_HIGH_MEMORY */ 236 /* Check N_MEMORY */
216 nodes_and(nsc->mask1, 237 nodes_and(nsc->mask1,
217 cpuset_current_mems_allowed, node_states[N_HIGH_MEMORY]); 238 cpuset_current_mems_allowed, node_states[N_MEMORY]);
218 239
219 VM_BUG_ON(!nodes); 240 VM_BUG_ON(!nodes);
220 if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes)) 241 if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes))
@@ -254,7 +275,7 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
254 if (mode == MPOL_DEFAULT) { 275 if (mode == MPOL_DEFAULT) {
255 if (nodes && !nodes_empty(*nodes)) 276 if (nodes && !nodes_empty(*nodes))
256 return ERR_PTR(-EINVAL); 277 return ERR_PTR(-EINVAL);
257 return NULL; /* simply delete any existing policy */ 278 return NULL;
258 } 279 }
259 VM_BUG_ON(!nodes); 280 VM_BUG_ON(!nodes);
260 281
@@ -269,6 +290,10 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
269 (flags & MPOL_F_RELATIVE_NODES))) 290 (flags & MPOL_F_RELATIVE_NODES)))
270 return ERR_PTR(-EINVAL); 291 return ERR_PTR(-EINVAL);
271 } 292 }
293 } else if (mode == MPOL_LOCAL) {
294 if (!nodes_empty(*nodes))
295 return ERR_PTR(-EINVAL);
296 mode = MPOL_PREFERRED;
272 } else if (nodes_empty(*nodes)) 297 } else if (nodes_empty(*nodes))
273 return ERR_PTR(-EINVAL); 298 return ERR_PTR(-EINVAL);
274 policy = kmem_cache_alloc(policy_cache, GFP_KERNEL); 299 policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
@@ -511,7 +536,7 @@ static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
511 pmd = pmd_offset(pud, addr); 536 pmd = pmd_offset(pud, addr);
512 do { 537 do {
513 next = pmd_addr_end(addr, end); 538 next = pmd_addr_end(addr, end);
514 split_huge_page_pmd(vma->vm_mm, pmd); 539 split_huge_page_pmd(vma, addr, pmd);
515 if (pmd_none_or_trans_huge_or_clear_bad(pmd)) 540 if (pmd_none_or_trans_huge_or_clear_bad(pmd))
516 continue; 541 continue;
517 if (check_pte_range(vma, pmd, addr, next, nodes, 542 if (check_pte_range(vma, pmd, addr, next, nodes,
@@ -561,6 +586,36 @@ static inline int check_pgd_range(struct vm_area_struct *vma,
561 return 0; 586 return 0;
562} 587}
563 588
589#ifdef CONFIG_ARCH_USES_NUMA_PROT_NONE
590/*
591 * This is used to mark a range of virtual addresses to be inaccessible.
592 * These are later cleared by a NUMA hinting fault. Depending on these
593 * faults, pages may be migrated for better NUMA placement.
594 *
595 * This is assuming that NUMA faults are handled using PROT_NONE. If
596 * an architecture makes a different choice, it will need further
597 * changes to the core.
598 */
599unsigned long change_prot_numa(struct vm_area_struct *vma,
600 unsigned long addr, unsigned long end)
601{
602 int nr_updated;
603 BUILD_BUG_ON(_PAGE_NUMA != _PAGE_PROTNONE);
604
605 nr_updated = change_protection(vma, addr, end, vma->vm_page_prot, 0, 1);
606 if (nr_updated)
607 count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated);
608
609 return nr_updated;
610}
611#else
612static unsigned long change_prot_numa(struct vm_area_struct *vma,
613 unsigned long addr, unsigned long end)
614{
615 return 0;
616}
617#endif /* CONFIG_ARCH_USES_NUMA_PROT_NONE */
618
564/* 619/*
565 * Check if all pages in a range are on a set of nodes. 620 * Check if all pages in a range are on a set of nodes.
566 * If pagelist != NULL then isolate pages from the LRU and 621 * If pagelist != NULL then isolate pages from the LRU and
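
change_prot_numa() above reuses the PROT_NONE machinery: the PTEs stay allocated, but any access traps, and the NUMA hinting fault handler then decides whether the page should move before access is restored. The standalone program below is only a userspace analogy of that trap-and-restore idea using mprotect(), not kernel code (error handling is omitted for brevity):

#define _GNU_SOURCE
#include <signal.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

static char *region;
static long page_size;

/* The "hinting fault": restore access and let the faulting instruction
 * retry.  The kernel's handler would additionally consider migrating the
 * page towards the faulting CPU's node at this point. */
static void on_fault(int sig, siginfo_t *si, void *uc)
{
	static const char msg[] = "hinting-style fault taken\n";
	void *page = (void *)((unsigned long)si->si_addr & ~(page_size - 1));

	(void)sig;
	(void)uc;
	write(STDERR_FILENO, msg, sizeof(msg) - 1);
	mprotect(page, page_size, PROT_READ | PROT_WRITE);
}

int main(void)
{
	struct sigaction sa;

	page_size = sysconf(_SC_PAGESIZE);
	memset(&sa, 0, sizeof(sa));
	sa.sa_sigaction = on_fault;
	sa.sa_flags = SA_SIGINFO;
	sigemptyset(&sa.sa_mask);
	sigaction(SIGSEGV, &sa, NULL);		/* error checking omitted */

	region = mmap(NULL, page_size, PROT_READ | PROT_WRITE,
		      MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	region[0] = 1;				/* populate the page */
	mprotect(region, page_size, PROT_NONE);	/* the change_prot_numa() step */
	region[0] = 2;				/* traps, handler restores access */
	printf("after the fault: %d\n", region[0]);
	return 0;
}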
@@ -579,22 +634,32 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
579 return ERR_PTR(-EFAULT); 634 return ERR_PTR(-EFAULT);
580 prev = NULL; 635 prev = NULL;
581 for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) { 636 for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
637 unsigned long endvma = vma->vm_end;
638
639 if (endvma > end)
640 endvma = end;
641 if (vma->vm_start > start)
642 start = vma->vm_start;
643
582 if (!(flags & MPOL_MF_DISCONTIG_OK)) { 644 if (!(flags & MPOL_MF_DISCONTIG_OK)) {
583 if (!vma->vm_next && vma->vm_end < end) 645 if (!vma->vm_next && vma->vm_end < end)
584 return ERR_PTR(-EFAULT); 646 return ERR_PTR(-EFAULT);
585 if (prev && prev->vm_end < vma->vm_start) 647 if (prev && prev->vm_end < vma->vm_start)
586 return ERR_PTR(-EFAULT); 648 return ERR_PTR(-EFAULT);
587 } 649 }
588 if (!is_vm_hugetlb_page(vma) && 650
589 ((flags & MPOL_MF_STRICT) || 651 if (is_vm_hugetlb_page(vma))
652 goto next;
653
654 if (flags & MPOL_MF_LAZY) {
655 change_prot_numa(vma, start, endvma);
656 goto next;
657 }
658
659 if ((flags & MPOL_MF_STRICT) ||
590 ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) && 660 ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
591 vma_migratable(vma)))) { 661 vma_migratable(vma))) {
592 unsigned long endvma = vma->vm_end;
593 662
594 if (endvma > end)
595 endvma = end;
596 if (vma->vm_start > start)
597 start = vma->vm_start;
598 err = check_pgd_range(vma, start, endvma, nodes, 663 err = check_pgd_range(vma, start, endvma, nodes,
599 flags, private); 664 flags, private);
600 if (err) { 665 if (err) {
@@ -602,6 +667,7 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
602 break; 667 break;
603 } 668 }
604 } 669 }
670next:
605 prev = vma; 671 prev = vma;
606 } 672 }
607 return first; 673 return first;
@@ -961,7 +1027,8 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest,
961 1027
962 if (!list_empty(&pagelist)) { 1028 if (!list_empty(&pagelist)) {
963 err = migrate_pages(&pagelist, new_node_page, dest, 1029 err = migrate_pages(&pagelist, new_node_page, dest,
964 false, MIGRATE_SYNC); 1030 false, MIGRATE_SYNC,
1031 MR_SYSCALL);
965 if (err) 1032 if (err)
966 putback_lru_pages(&pagelist); 1033 putback_lru_pages(&pagelist);
967 } 1034 }
@@ -1133,8 +1200,7 @@ static long do_mbind(unsigned long start, unsigned long len,
1133 int err; 1200 int err;
1134 LIST_HEAD(pagelist); 1201 LIST_HEAD(pagelist);
1135 1202
1136 if (flags & ~(unsigned long)(MPOL_MF_STRICT | 1203 if (flags & ~(unsigned long)MPOL_MF_VALID)
1137 MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
1138 return -EINVAL; 1204 return -EINVAL;
1139 if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE)) 1205 if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
1140 return -EPERM; 1206 return -EPERM;
@@ -1157,6 +1223,9 @@ static long do_mbind(unsigned long start, unsigned long len,
1157 if (IS_ERR(new)) 1223 if (IS_ERR(new))
1158 return PTR_ERR(new); 1224 return PTR_ERR(new);
1159 1225
1226 if (flags & MPOL_MF_LAZY)
1227 new->flags |= MPOL_F_MOF;
1228
1160 /* 1229 /*
1161 * If we are using the default policy then operation 1230 * If we are using the default policy then operation
1162 * on discontinuous address spaces is okay after all 1231 * on discontinuous address spaces is okay after all
@@ -1193,21 +1262,24 @@ static long do_mbind(unsigned long start, unsigned long len,
1193 vma = check_range(mm, start, end, nmask, 1262 vma = check_range(mm, start, end, nmask,
1194 flags | MPOL_MF_INVERT, &pagelist); 1263 flags | MPOL_MF_INVERT, &pagelist);
1195 1264
1196 err = PTR_ERR(vma); 1265 err = PTR_ERR(vma); /* maybe ... */
1197 if (!IS_ERR(vma)) { 1266 if (!IS_ERR(vma))
1198 int nr_failed = 0;
1199
1200 err = mbind_range(mm, start, end, new); 1267 err = mbind_range(mm, start, end, new);
1201 1268
1269 if (!err) {
1270 int nr_failed = 0;
1271
1202 if (!list_empty(&pagelist)) { 1272 if (!list_empty(&pagelist)) {
1273 WARN_ON_ONCE(flags & MPOL_MF_LAZY);
1203 nr_failed = migrate_pages(&pagelist, new_vma_page, 1274 nr_failed = migrate_pages(&pagelist, new_vma_page,
1204 (unsigned long)vma, 1275 (unsigned long)vma,
1205 false, MIGRATE_SYNC); 1276 false, MIGRATE_SYNC,
1277 MR_MEMPOLICY_MBIND);
1206 if (nr_failed) 1278 if (nr_failed)
1207 putback_lru_pages(&pagelist); 1279 putback_lru_pages(&pagelist);
1208 } 1280 }
1209 1281
1210 if (!err && nr_failed && (flags & MPOL_MF_STRICT)) 1282 if (nr_failed && (flags & MPOL_MF_STRICT))
1211 err = -EIO; 1283 err = -EIO;
1212 } else 1284 } else
1213 putback_lru_pages(&pagelist); 1285 putback_lru_pages(&pagelist);
@@ -1388,7 +1460,7 @@ SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
1388 goto out_put; 1460 goto out_put;
1389 } 1461 }
1390 1462
1391 if (!nodes_subset(*new, node_states[N_HIGH_MEMORY])) { 1463 if (!nodes_subset(*new, node_states[N_MEMORY])) {
1392 err = -EINVAL; 1464 err = -EINVAL;
1393 goto out_put; 1465 goto out_put;
1394 } 1466 }
@@ -1546,7 +1618,7 @@ asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
1546struct mempolicy *get_vma_policy(struct task_struct *task, 1618struct mempolicy *get_vma_policy(struct task_struct *task,
1547 struct vm_area_struct *vma, unsigned long addr) 1619 struct vm_area_struct *vma, unsigned long addr)
1548{ 1620{
1549 struct mempolicy *pol = task->mempolicy; 1621 struct mempolicy *pol = get_task_policy(task);
1550 1622
1551 if (vma) { 1623 if (vma) {
1552 if (vma->vm_ops && vma->vm_ops->get_policy) { 1624 if (vma->vm_ops && vma->vm_ops->get_policy) {
@@ -1907,7 +1979,6 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
1907 unsigned long addr, int node) 1979 unsigned long addr, int node)
1908{ 1980{
1909 struct mempolicy *pol; 1981 struct mempolicy *pol;
1910 struct zonelist *zl;
1911 struct page *page; 1982 struct page *page;
1912 unsigned int cpuset_mems_cookie; 1983 unsigned int cpuset_mems_cookie;
1913 1984
@@ -1926,23 +1997,11 @@ retry_cpuset:
1926 1997
1927 return page; 1998 return page;
1928 } 1999 }
1929 zl = policy_zonelist(gfp, pol, node); 2000 page = __alloc_pages_nodemask(gfp, order,
1930 if (unlikely(mpol_needs_cond_ref(pol))) { 2001 policy_zonelist(gfp, pol, node),
1931 /*
1932 * slow path: ref counted shared policy
1933 */
1934 struct page *page = __alloc_pages_nodemask(gfp, order,
1935 zl, policy_nodemask(gfp, pol));
1936 __mpol_put(pol);
1937 if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
1938 goto retry_cpuset;
1939 return page;
1940 }
1941 /*
1942 * fast path: default or task policy
1943 */
1944 page = __alloc_pages_nodemask(gfp, order, zl,
1945 policy_nodemask(gfp, pol)); 2002 policy_nodemask(gfp, pol));
2003 if (unlikely(mpol_needs_cond_ref(pol)))
2004 __mpol_put(pol);
1946 if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page)) 2005 if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
1947 goto retry_cpuset; 2006 goto retry_cpuset;
1948 return page; 2007 return page;
@@ -1969,7 +2028,7 @@ retry_cpuset:
1969 */ 2028 */
1970struct page *alloc_pages_current(gfp_t gfp, unsigned order) 2029struct page *alloc_pages_current(gfp_t gfp, unsigned order)
1971{ 2030{
1972 struct mempolicy *pol = current->mempolicy; 2031 struct mempolicy *pol = get_task_policy(current);
1973 struct page *page; 2032 struct page *page;
1974 unsigned int cpuset_mems_cookie; 2033 unsigned int cpuset_mems_cookie;
1975 2034
@@ -2073,7 +2132,7 @@ bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
2073 */ 2132 */
2074 2133
2075/* lookup first element intersecting start-end */ 2134/* lookup first element intersecting start-end */
2076/* Caller holds sp->mutex */ 2135/* Caller holds sp->lock */
2077static struct sp_node * 2136static struct sp_node *
2078sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end) 2137sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
2079{ 2138{
@@ -2137,13 +2196,13 @@ mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
2137 2196
2138 if (!sp->root.rb_node) 2197 if (!sp->root.rb_node)
2139 return NULL; 2198 return NULL;
2140 mutex_lock(&sp->mutex); 2199 spin_lock(&sp->lock);
2141 sn = sp_lookup(sp, idx, idx+1); 2200 sn = sp_lookup(sp, idx, idx+1);
2142 if (sn) { 2201 if (sn) {
2143 mpol_get(sn->policy); 2202 mpol_get(sn->policy);
2144 pol = sn->policy; 2203 pol = sn->policy;
2145 } 2204 }
2146 mutex_unlock(&sp->mutex); 2205 spin_unlock(&sp->lock);
2147 return pol; 2206 return pol;
2148} 2207}
2149 2208
@@ -2153,6 +2212,115 @@ static void sp_free(struct sp_node *n)
2153 kmem_cache_free(sn_cache, n); 2212 kmem_cache_free(sn_cache, n);
2154} 2213}
2155 2214
2215/**
2216 * mpol_misplaced - check whether current page node is valid in policy
2217 *
2218 * @page - page to be checked
2219 * @vma - vm area where page mapped
2220 * @addr - virtual address where page mapped
2221 *
2222 * Lookup current policy node id for vma,addr and "compare to" page's
2223 * node id.
2224 *
2225 * Returns:
2226 * -1 - not misplaced, page is in the right node
2227 * node - node id where the page should be
2228 *
2229 * Policy determination "mimics" alloc_page_vma().
2230 * Called from fault path where we know the vma and faulting address.
2231 */
2232int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long addr)
2233{
2234 struct mempolicy *pol;
2235 struct zone *zone;
2236 int curnid = page_to_nid(page);
2237 unsigned long pgoff;
2238 int polnid = -1;
2239 int ret = -1;
2240
2241 BUG_ON(!vma);
2242
2243 pol = get_vma_policy(current, vma, addr);
2244 if (!(pol->flags & MPOL_F_MOF))
2245 goto out;
2246
2247 switch (pol->mode) {
2248 case MPOL_INTERLEAVE:
2249 BUG_ON(addr >= vma->vm_end);
2250 BUG_ON(addr < vma->vm_start);
2251
2252 pgoff = vma->vm_pgoff;
2253 pgoff += (addr - vma->vm_start) >> PAGE_SHIFT;
2254 polnid = offset_il_node(pol, vma, pgoff);
2255 break;
2256
2257 case MPOL_PREFERRED:
2258 if (pol->flags & MPOL_F_LOCAL)
2259 polnid = numa_node_id();
2260 else
2261 polnid = pol->v.preferred_node;
2262 break;
2263
2264 case MPOL_BIND:
2265 /*
2266 * allows binding to multiple nodes.
2267 * use current page if in policy nodemask,
2268 * else select nearest allowed node, if any.
2269 * If no allowed nodes, use current [!misplaced].
2270 */
2271 if (node_isset(curnid, pol->v.nodes))
2272 goto out;
2273 (void)first_zones_zonelist(
2274 node_zonelist(numa_node_id(), GFP_HIGHUSER),
2275 gfp_zone(GFP_HIGHUSER),
2276 &pol->v.nodes, &zone);
2277 polnid = zone->node;
2278 break;
2279
2280 default:
2281 BUG();
2282 }
2283
2284 /* Migrate the page towards the node whose CPU is referencing it */
2285 if (pol->flags & MPOL_F_MORON) {
2286 int last_nid;
2287
2288 polnid = numa_node_id();
2289
2290 /*
2291 * Multi-stage node selection is used in conjunction
2292 * with a periodic migration fault to build a temporal
2293 * task<->page relation. By using a two-stage filter we
2294 * remove short/unlikely relations.
2295 *
2296 * Using P(p) ~ n_p / n_t as per frequentist
2297 * probability, we can equate a task's usage of a
2298 * particular page (n_p) per total usage of this
2299 * page (n_t) (in a given time-span) to a probability.
2300 *
2301 * Our periodic faults will sample this probability and
2302 * getting the same result twice in a row, given these
2303 * samples are fully independent, is then given by
2304 * P(n)^2, provided our sample period is sufficiently
2305 * short compared to the usage pattern.
2306 *
 2307 * This quadratic squishes small probabilities, making
2308 * it less likely we act on an unlikely task<->page
2309 * relation.
2310 */
2311 last_nid = page_xchg_last_nid(page, polnid);
2312 if (last_nid != polnid)
2313 goto out;
2314 }
2315
2316 if (curnid != polnid)
2317 ret = polnid;
2318out:
2319 mpol_cond_put(pol);
2320
2321 return ret;
2322}
2323
2156static void sp_delete(struct shared_policy *sp, struct sp_node *n) 2324static void sp_delete(struct shared_policy *sp, struct sp_node *n)
2157{ 2325{
2158 pr_debug("deleting %lx-l%lx\n", n->start, n->end); 2326 pr_debug("deleting %lx-l%lx\n", n->start, n->end);
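
The MPOL_F_MORON branch above only migrates when two consecutive hinting faults on a page pick the same destination node, which is the P(p)^2 argument in the comment: a task generating, say, 30% of the faults on a shared page wins it only about 9% of the time. Stripped of the kernel types, the two-stage filter is just the following pattern (names invented for the sketch):

/* Two-stage filter: remember the node chosen by the previous hinting
 * fault on this page and only act when the same node is chosen twice
 * in a row. */
struct toy_page {
	int last_nid;	/* node recorded by the previous fault, -1 if none */
};

static int toy_should_migrate(struct toy_page *page, int this_nid)
{
	int last = page->last_nid;

	page->last_nid = this_nid;	/* the exchange step, cf. page_xchg_last_nid() */
	return last == this_nid;	/* act only on a repeated observation */
}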
@@ -2160,6 +2328,14 @@ static void sp_delete(struct shared_policy *sp, struct sp_node *n)
2160 sp_free(n); 2328 sp_free(n);
2161} 2329}
2162 2330
2331static void sp_node_init(struct sp_node *node, unsigned long start,
2332 unsigned long end, struct mempolicy *pol)
2333{
2334 node->start = start;
2335 node->end = end;
2336 node->policy = pol;
2337}
2338
2163static struct sp_node *sp_alloc(unsigned long start, unsigned long end, 2339static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
2164 struct mempolicy *pol) 2340 struct mempolicy *pol)
2165{ 2341{
@@ -2176,10 +2352,7 @@ static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
2176 return NULL; 2352 return NULL;
2177 } 2353 }
2178 newpol->flags |= MPOL_F_SHARED; 2354 newpol->flags |= MPOL_F_SHARED;
2179 2355 sp_node_init(n, start, end, newpol);
2180 n->start = start;
2181 n->end = end;
2182 n->policy = newpol;
2183 2356
2184 return n; 2357 return n;
2185} 2358}
@@ -2189,9 +2362,12 @@ static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
2189 unsigned long end, struct sp_node *new) 2362 unsigned long end, struct sp_node *new)
2190{ 2363{
2191 struct sp_node *n; 2364 struct sp_node *n;
2365 struct sp_node *n_new = NULL;
2366 struct mempolicy *mpol_new = NULL;
2192 int ret = 0; 2367 int ret = 0;
2193 2368
2194 mutex_lock(&sp->mutex); 2369restart:
2370 spin_lock(&sp->lock);
2195 n = sp_lookup(sp, start, end); 2371 n = sp_lookup(sp, start, end);
2196 /* Take care of old policies in the same range. */ 2372 /* Take care of old policies in the same range. */
2197 while (n && n->start < end) { 2373 while (n && n->start < end) {
@@ -2204,14 +2380,16 @@ static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
2204 } else { 2380 } else {
2205 /* Old policy spanning whole new range. */ 2381 /* Old policy spanning whole new range. */
2206 if (n->end > end) { 2382 if (n->end > end) {
2207 struct sp_node *new2; 2383 if (!n_new)
2208 new2 = sp_alloc(end, n->end, n->policy); 2384 goto alloc_new;
2209 if (!new2) { 2385
2210 ret = -ENOMEM; 2386 *mpol_new = *n->policy;
2211 goto out; 2387 atomic_set(&mpol_new->refcnt, 1);
2212 } 2388 sp_node_init(n_new, n->end, end, mpol_new);
2389 sp_insert(sp, n_new);
2213 n->end = start; 2390 n->end = start;
2214 sp_insert(sp, new2); 2391 n_new = NULL;
2392 mpol_new = NULL;
2215 break; 2393 break;
2216 } else 2394 } else
2217 n->end = start; 2395 n->end = start;
@@ -2222,9 +2400,27 @@ static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
2222 } 2400 }
2223 if (new) 2401 if (new)
2224 sp_insert(sp, new); 2402 sp_insert(sp, new);
2225out: 2403 spin_unlock(&sp->lock);
2226 mutex_unlock(&sp->mutex); 2404 ret = 0;
2405
2406err_out:
2407 if (mpol_new)
2408 mpol_put(mpol_new);
2409 if (n_new)
2410 kmem_cache_free(sn_cache, n_new);
2411
2227 return ret; 2412 return ret;
2413
2414alloc_new:
2415 spin_unlock(&sp->lock);
2416 ret = -ENOMEM;
2417 n_new = kmem_cache_alloc(sn_cache, GFP_KERNEL);
2418 if (!n_new)
2419 goto err_out;
2420 mpol_new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
2421 if (!mpol_new)
2422 goto err_out;
2423 goto restart;
2228} 2424}
2229 2425
2230/** 2426/**
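
The shared_policy_replace() rework above is driven by the mutex-to-spinlock conversion: kmem_cache_alloc(GFP_KERNEL) can sleep, so the allocation has to happen with sp->lock dropped, and the lookup must then be redone because the tree may have changed in the meantime. A self-contained sketch of that unlock-allocate-retry pattern, using placeholder toy types rather than the mempolicy structures:

#include <linux/errno.h>
#include <linux/slab.h>
#include <linux/spinlock.h>

struct toy_node { int key; struct toy_node *next; };
struct toy_list { spinlock_t lock; struct toy_node *head; };

/* Insert 'key' if it is not already present.  kmalloc(GFP_KERNEL) may
 * sleep, so it is never called with the spinlock held: drop the lock,
 * allocate, then redo the lookup because the list may have changed. */
static int toy_insert(struct toy_list *list, int key)
{
	struct toy_node *pos, *new = NULL;

restart:
	spin_lock(&list->lock);
	for (pos = list->head; pos; pos = pos->next)
		if (pos->key == key)
			goto out_unlock;	/* already there */
	if (!new) {
		spin_unlock(&list->lock);
		new = kmalloc(sizeof(*new), GFP_KERNEL);
		if (!new)
			return -ENOMEM;
		goto restart;
	}
	new->key = key;
	new->next = list->head;
	list->head = new;
	new = NULL;				/* now owned by the list */
out_unlock:
	spin_unlock(&list->lock);
	kfree(new);	/* kfree(NULL) is a no-op if the node was not needed */
	return 0;
}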
@@ -2242,7 +2438,7 @@ void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
2242 int ret; 2438 int ret;
2243 2439
2244 sp->root = RB_ROOT; /* empty tree == default mempolicy */ 2440 sp->root = RB_ROOT; /* empty tree == default mempolicy */
2245 mutex_init(&sp->mutex); 2441 spin_lock_init(&sp->lock);
2246 2442
2247 if (mpol) { 2443 if (mpol) {
2248 struct vm_area_struct pvma; 2444 struct vm_area_struct pvma;
@@ -2308,16 +2504,60 @@ void mpol_free_shared_policy(struct shared_policy *p)
2308 2504
2309 if (!p->root.rb_node) 2505 if (!p->root.rb_node)
2310 return; 2506 return;
2311 mutex_lock(&p->mutex); 2507 spin_lock(&p->lock);
2312 next = rb_first(&p->root); 2508 next = rb_first(&p->root);
2313 while (next) { 2509 while (next) {
2314 n = rb_entry(next, struct sp_node, nd); 2510 n = rb_entry(next, struct sp_node, nd);
2315 next = rb_next(&n->nd); 2511 next = rb_next(&n->nd);
2316 sp_delete(p, n); 2512 sp_delete(p, n);
2317 } 2513 }
2318 mutex_unlock(&p->mutex); 2514 spin_unlock(&p->lock);
2515}
2516
2517#ifdef CONFIG_NUMA_BALANCING
2518static bool __initdata numabalancing_override;
2519
2520static void __init check_numabalancing_enable(void)
2521{
2522 bool numabalancing_default = false;
2523
2524 if (IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED))
2525 numabalancing_default = true;
2526
2527 if (nr_node_ids > 1 && !numabalancing_override) {
2528 printk(KERN_INFO "Enabling automatic NUMA balancing. "
2529 "Configure with numa_balancing= or sysctl");
2530 set_numabalancing_state(numabalancing_default);
2531 }
2319} 2532}
2320 2533
2534static int __init setup_numabalancing(char *str)
2535{
2536 int ret = 0;
2537 if (!str)
2538 goto out;
2539 numabalancing_override = true;
2540
2541 if (!strcmp(str, "enable")) {
2542 set_numabalancing_state(true);
2543 ret = 1;
2544 } else if (!strcmp(str, "disable")) {
2545 set_numabalancing_state(false);
2546 ret = 1;
2547 }
2548out:
2549 if (!ret)
2550 printk(KERN_WARNING "Unable to parse numa_balancing=\n");
2551
2552 return ret;
2553}
2554__setup("numa_balancing=", setup_numabalancing);
2555#else
2556static inline void __init check_numabalancing_enable(void)
2557{
2558}
2559#endif /* CONFIG_NUMA_BALANCING */
2560
2321/* assumes fs == KERNEL_DS */ 2561/* assumes fs == KERNEL_DS */
2322void __init numa_policy_init(void) 2562void __init numa_policy_init(void)
2323{ 2563{
@@ -2333,13 +2573,22 @@ void __init numa_policy_init(void)
2333 sizeof(struct sp_node), 2573 sizeof(struct sp_node),
2334 0, SLAB_PANIC, NULL); 2574 0, SLAB_PANIC, NULL);
2335 2575
2576 for_each_node(nid) {
2577 preferred_node_policy[nid] = (struct mempolicy) {
2578 .refcnt = ATOMIC_INIT(1),
2579 .mode = MPOL_PREFERRED,
2580 .flags = MPOL_F_MOF | MPOL_F_MORON,
2581 .v = { .preferred_node = nid, },
2582 };
2583 }
2584
2336 /* 2585 /*
2337 * Set interleaving policy for system init. Interleaving is only 2586 * Set interleaving policy for system init. Interleaving is only
2338 * enabled across suitably sized nodes (default is >= 16MB), or 2587 * enabled across suitably sized nodes (default is >= 16MB), or
2339 * fall back to the largest node if they're all smaller. 2588 * fall back to the largest node if they're all smaller.
2340 */ 2589 */
2341 nodes_clear(interleave_nodes); 2590 nodes_clear(interleave_nodes);
2342 for_each_node_state(nid, N_HIGH_MEMORY) { 2591 for_each_node_state(nid, N_MEMORY) {
2343 unsigned long total_pages = node_present_pages(nid); 2592 unsigned long total_pages = node_present_pages(nid);
2344 2593
2345 /* Preserve the largest node */ 2594 /* Preserve the largest node */
@@ -2359,6 +2608,8 @@ void __init numa_policy_init(void)
2359 2608
2360 if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes)) 2609 if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
2361 printk("numa_policy_init: interleaving failed\n"); 2610 printk("numa_policy_init: interleaving failed\n");
2611
2612 check_numabalancing_enable();
2362} 2613}
2363 2614
2364/* Reset policy of current process to default */ 2615/* Reset policy of current process to default */
@@ -2372,44 +2623,34 @@ void numa_default_policy(void)
2372 */ 2623 */
2373 2624
2374/* 2625/*
2375 * "local" is pseudo-policy: MPOL_PREFERRED with MPOL_F_LOCAL flag 2626 * "local" is implemented internally by MPOL_PREFERRED with MPOL_F_LOCAL flag.
2376 * Used only for mpol_parse_str() and mpol_to_str()
2377 */ 2627 */
2378#define MPOL_LOCAL MPOL_MAX
2379static const char * const policy_modes[] = 2628static const char * const policy_modes[] =
2380{ 2629{
2381 [MPOL_DEFAULT] = "default", 2630 [MPOL_DEFAULT] = "default",
2382 [MPOL_PREFERRED] = "prefer", 2631 [MPOL_PREFERRED] = "prefer",
2383 [MPOL_BIND] = "bind", 2632 [MPOL_BIND] = "bind",
2384 [MPOL_INTERLEAVE] = "interleave", 2633 [MPOL_INTERLEAVE] = "interleave",
2385 [MPOL_LOCAL] = "local" 2634 [MPOL_LOCAL] = "local",
2386}; 2635};
2387 2636
2388 2637
2389#ifdef CONFIG_TMPFS 2638#ifdef CONFIG_TMPFS
2390/** 2639/**
2391 * mpol_parse_str - parse string to mempolicy 2640 * mpol_parse_str - parse string to mempolicy, for tmpfs mpol mount option.
2392 * @str: string containing mempolicy to parse 2641 * @str: string containing mempolicy to parse
2393 * @mpol: pointer to struct mempolicy pointer, returned on success. 2642 * @mpol: pointer to struct mempolicy pointer, returned on success.
2394 * @no_context: flag whether to "contextualize" the mempolicy
2395 * 2643 *
2396 * Format of input: 2644 * Format of input:
2397 * <mode>[=<flags>][:<nodelist>] 2645 * <mode>[=<flags>][:<nodelist>]
2398 * 2646 *
2399 * if @no_context is true, save the input nodemask in w.user_nodemask in
2400 * the returned mempolicy. This will be used to "clone" the mempolicy in
2401 * a specific context [cpuset] at a later time. Used to parse tmpfs mpol
2402 * mount option. Note that if 'static' or 'relative' mode flags were
2403 * specified, the input nodemask will already have been saved. Saving
2404 * it again is redundant, but safe.
2405 *
2406 * On success, returns 0, else 1 2647 * On success, returns 0, else 1
2407 */ 2648 */
2408int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context) 2649int mpol_parse_str(char *str, struct mempolicy **mpol)
2409{ 2650{
2410 struct mempolicy *new = NULL; 2651 struct mempolicy *new = NULL;
2411 unsigned short mode; 2652 unsigned short mode;
2412 unsigned short uninitialized_var(mode_flags); 2653 unsigned short mode_flags;
2413 nodemask_t nodes; 2654 nodemask_t nodes;
2414 char *nodelist = strchr(str, ':'); 2655 char *nodelist = strchr(str, ':');
2415 char *flags = strchr(str, '='); 2656 char *flags = strchr(str, '=');
@@ -2420,7 +2661,7 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
2420 *nodelist++ = '\0'; 2661 *nodelist++ = '\0';
2421 if (nodelist_parse(nodelist, nodes)) 2662 if (nodelist_parse(nodelist, nodes))
2422 goto out; 2663 goto out;
2423 if (!nodes_subset(nodes, node_states[N_HIGH_MEMORY])) 2664 if (!nodes_subset(nodes, node_states[N_MEMORY]))
2424 goto out; 2665 goto out;
2425 } else 2666 } else
2426 nodes_clear(nodes); 2667 nodes_clear(nodes);
@@ -2428,12 +2669,12 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
2428 if (flags) 2669 if (flags)
2429 *flags++ = '\0'; /* terminate mode string */ 2670 *flags++ = '\0'; /* terminate mode string */
2430 2671
2431 for (mode = 0; mode <= MPOL_LOCAL; mode++) { 2672 for (mode = 0; mode < MPOL_MAX; mode++) {
2432 if (!strcmp(str, policy_modes[mode])) { 2673 if (!strcmp(str, policy_modes[mode])) {
2433 break; 2674 break;
2434 } 2675 }
2435 } 2676 }
2436 if (mode > MPOL_LOCAL) 2677 if (mode >= MPOL_MAX)
2437 goto out; 2678 goto out;
2438 2679
2439 switch (mode) { 2680 switch (mode) {
@@ -2454,7 +2695,7 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
2454 * Default to online nodes with memory if no nodelist 2695 * Default to online nodes with memory if no nodelist
2455 */ 2696 */
2456 if (!nodelist) 2697 if (!nodelist)
2457 nodes = node_states[N_HIGH_MEMORY]; 2698 nodes = node_states[N_MEMORY];
2458 break; 2699 break;
2459 case MPOL_LOCAL: 2700 case MPOL_LOCAL:
2460 /* 2701 /*
@@ -2497,24 +2738,23 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
2497 if (IS_ERR(new)) 2738 if (IS_ERR(new))
2498 goto out; 2739 goto out;
2499 2740
2500 if (no_context) { 2741 /*
2501 /* save for contextualization */ 2742 * Save nodes for mpol_to_str() to show the tmpfs mount options
2502 new->w.user_nodemask = nodes; 2743 * for /proc/mounts, /proc/pid/mounts and /proc/pid/mountinfo.
2503 } else { 2744 */
2504 int ret; 2745 if (mode != MPOL_PREFERRED)
2505 NODEMASK_SCRATCH(scratch); 2746 new->v.nodes = nodes;
2506 if (scratch) { 2747 else if (nodelist)
2507 task_lock(current); 2748 new->v.preferred_node = first_node(nodes);
2508 ret = mpol_set_nodemask(new, &nodes, scratch); 2749 else
2509 task_unlock(current); 2750 new->flags |= MPOL_F_LOCAL;
2510 } else 2751
2511 ret = -ENOMEM; 2752 /*
2512 NODEMASK_SCRATCH_FREE(scratch); 2753 * Save nodes for contextualization: this will be used to "clone"
2513 if (ret) { 2754 * the mempolicy in a specific context [cpuset] at a later time.
2514 mpol_put(new); 2755 */
2515 goto out; 2756 new->w.user_nodemask = nodes;
2516 } 2757
2517 }
2518 err = 0; 2758 err = 0;
2519 2759
2520out: 2760out:
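
mpol_parse_str() accepts strings of the form <mode>[=<flags>][:<nodelist>], as passed via the tmpfs mpol= mount option (for example mpol=bind=static:0-3 or mpol=interleave). Purely as an illustration of that grammar, a userspace splitter along the same strchr() lines could look like this; it performs no validation of modes or nodelists:

#include <stdio.h>
#include <string.h>

/* Split "<mode>[=<flags>][:<nodelist>]" the way mpol_parse_str() does:
 * locate ':' and '=' first, then terminate the mode string. */
static void split_mpol(char *str)
{
	char *nodelist = strchr(str, ':');
	char *flags = strchr(str, '=');

	if (nodelist)
		*nodelist++ = '\0';
	if (flags)
		*flags++ = '\0';

	printf("mode=%s flags=%s nodelist=%s\n",
	       str, flags ? flags : "-", nodelist ? nodelist : "-");
}

int main(void)
{
	char a[] = "bind=static:0-3";
	char b[] = "interleave:0,2";
	char c[] = "local";

	split_mpol(a);	/* mode=bind flags=static nodelist=0-3 */
	split_mpol(b);	/* mode=interleave flags=- nodelist=0,2 */
	split_mpol(c);	/* mode=local flags=- nodelist=- */
	return 0;
}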
@@ -2534,13 +2774,12 @@ out:
2534 * @buffer: to contain formatted mempolicy string 2774 * @buffer: to contain formatted mempolicy string
2535 * @maxlen: length of @buffer 2775 * @maxlen: length of @buffer
2536 * @pol: pointer to mempolicy to be formatted 2776 * @pol: pointer to mempolicy to be formatted
2537 * @no_context: "context free" mempolicy - use nodemask in w.user_nodemask
2538 * 2777 *
2539 * Convert a mempolicy into a string. 2778 * Convert a mempolicy into a string.
2540 * Returns the number of characters in buffer (if positive) 2779 * Returns the number of characters in buffer (if positive)
2541 * or an error (negative) 2780 * or an error (negative)
2542 */ 2781 */
2543int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol, int no_context) 2782int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
2544{ 2783{
2545 char *p = buffer; 2784 char *p = buffer;
2546 int l; 2785 int l;
@@ -2566,7 +2805,7 @@ int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol, int no_context)
2566 case MPOL_PREFERRED: 2805 case MPOL_PREFERRED:
2567 nodes_clear(nodes); 2806 nodes_clear(nodes);
2568 if (flags & MPOL_F_LOCAL) 2807 if (flags & MPOL_F_LOCAL)
2569 mode = MPOL_LOCAL; /* pseudo-policy */ 2808 mode = MPOL_LOCAL;
2570 else 2809 else
2571 node_set(pol->v.preferred_node, nodes); 2810 node_set(pol->v.preferred_node, nodes);
2572 break; 2811 break;
@@ -2574,10 +2813,7 @@ int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol, int no_context)
2574 case MPOL_BIND: 2813 case MPOL_BIND:
2575 /* Fall through */ 2814 /* Fall through */
2576 case MPOL_INTERLEAVE: 2815 case MPOL_INTERLEAVE:
2577 if (no_context) 2816 nodes = pol->v.nodes;
2578 nodes = pol->w.user_nodemask;
2579 else
2580 nodes = pol->v.nodes;
2581 break; 2817 break;
2582 2818
2583 default: 2819 default:
diff --git a/mm/migrate.c b/mm/migrate.c
index 77ed2d773705..2fd8b4af4744 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -35,9 +35,13 @@
35#include <linux/hugetlb.h> 35#include <linux/hugetlb.h>
36#include <linux/hugetlb_cgroup.h> 36#include <linux/hugetlb_cgroup.h>
37#include <linux/gfp.h> 37#include <linux/gfp.h>
38#include <linux/balloon_compaction.h>
38 39
39#include <asm/tlbflush.h> 40#include <asm/tlbflush.h>
40 41
42#define CREATE_TRACE_POINTS
43#include <trace/events/migrate.h>
44
41#include "internal.h" 45#include "internal.h"
42 46
43/* 47/*
@@ -79,7 +83,30 @@ void putback_lru_pages(struct list_head *l)
79 list_del(&page->lru); 83 list_del(&page->lru);
80 dec_zone_page_state(page, NR_ISOLATED_ANON + 84 dec_zone_page_state(page, NR_ISOLATED_ANON +
81 page_is_file_cache(page)); 85 page_is_file_cache(page));
82 putback_lru_page(page); 86 putback_lru_page(page);
87 }
88}
89
90/*
91 * Put previously isolated pages back onto the appropriate lists
92 * from where they were once taken off for compaction/migration.
93 *
94 * This function shall be used instead of putback_lru_pages(),
95 * whenever the isolated pageset has been built by isolate_migratepages_range()
96 */
97void putback_movable_pages(struct list_head *l)
98{
99 struct page *page;
100 struct page *page2;
101
102 list_for_each_entry_safe(page, page2, l, lru) {
103 list_del(&page->lru);
104 dec_zone_page_state(page, NR_ISOLATED_ANON +
105 page_is_file_cache(page));
106 if (unlikely(balloon_page_movable(page)))
107 balloon_page_putback(page);
108 else
109 putback_lru_page(page);
83 } 110 }
84} 111}
85 112
@@ -91,8 +118,6 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma,
91{ 118{
92 struct mm_struct *mm = vma->vm_mm; 119 struct mm_struct *mm = vma->vm_mm;
93 swp_entry_t entry; 120 swp_entry_t entry;
94 pgd_t *pgd;
95 pud_t *pud;
96 pmd_t *pmd; 121 pmd_t *pmd;
97 pte_t *ptep, pte; 122 pte_t *ptep, pte;
98 spinlock_t *ptl; 123 spinlock_t *ptl;
@@ -103,19 +128,11 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma,
103 goto out; 128 goto out;
104 ptl = &mm->page_table_lock; 129 ptl = &mm->page_table_lock;
105 } else { 130 } else {
106 pgd = pgd_offset(mm, addr); 131 pmd = mm_find_pmd(mm, addr);
107 if (!pgd_present(*pgd)) 132 if (!pmd)
108 goto out;
109
110 pud = pud_offset(pgd, addr);
111 if (!pud_present(*pud))
112 goto out; 133 goto out;
113
114 pmd = pmd_offset(pud, addr);
115 if (pmd_trans_huge(*pmd)) 134 if (pmd_trans_huge(*pmd))
116 goto out; 135 goto out;
117 if (!pmd_present(*pmd))
118 goto out;
119 136
120 ptep = pte_offset_map(pmd, addr); 137 ptep = pte_offset_map(pmd, addr);
121 138
@@ -143,8 +160,10 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma,
143 if (is_write_migration_entry(entry)) 160 if (is_write_migration_entry(entry))
144 pte = pte_mkwrite(pte); 161 pte = pte_mkwrite(pte);
145#ifdef CONFIG_HUGETLB_PAGE 162#ifdef CONFIG_HUGETLB_PAGE
146 if (PageHuge(new)) 163 if (PageHuge(new)) {
147 pte = pte_mkhuge(pte); 164 pte = pte_mkhuge(pte);
165 pte = arch_make_huge_pte(pte, vma, new, 0);
166 }
148#endif 167#endif
149 flush_cache_page(vma, addr, pte_pfn(pte)); 168 flush_cache_page(vma, addr, pte_pfn(pte));
150 set_pte_at(mm, addr, ptep, pte); 169 set_pte_at(mm, addr, ptep, pte);
@@ -279,14 +298,14 @@ static int migrate_page_move_mapping(struct address_space *mapping,
279 struct page *newpage, struct page *page, 298 struct page *newpage, struct page *page,
280 struct buffer_head *head, enum migrate_mode mode) 299 struct buffer_head *head, enum migrate_mode mode)
281{ 300{
282 int expected_count; 301 int expected_count = 0;
283 void **pslot; 302 void **pslot;
284 303
285 if (!mapping) { 304 if (!mapping) {
286 /* Anonymous page without mapping */ 305 /* Anonymous page without mapping */
287 if (page_count(page) != 1) 306 if (page_count(page) != 1)
288 return -EAGAIN; 307 return -EAGAIN;
289 return 0; 308 return MIGRATEPAGE_SUCCESS;
290 } 309 }
291 310
292 spin_lock_irq(&mapping->tree_lock); 311 spin_lock_irq(&mapping->tree_lock);
@@ -356,7 +375,7 @@ static int migrate_page_move_mapping(struct address_space *mapping,
356 } 375 }
357 spin_unlock_irq(&mapping->tree_lock); 376 spin_unlock_irq(&mapping->tree_lock);
358 377
359 return 0; 378 return MIGRATEPAGE_SUCCESS;
360} 379}
361 380
362/* 381/*
@@ -372,7 +391,7 @@ int migrate_huge_page_move_mapping(struct address_space *mapping,
372 if (!mapping) { 391 if (!mapping) {
373 if (page_count(page) != 1) 392 if (page_count(page) != 1)
374 return -EAGAIN; 393 return -EAGAIN;
375 return 0; 394 return MIGRATEPAGE_SUCCESS;
376 } 395 }
377 396
378 spin_lock_irq(&mapping->tree_lock); 397 spin_lock_irq(&mapping->tree_lock);
@@ -399,7 +418,7 @@ int migrate_huge_page_move_mapping(struct address_space *mapping,
399 page_unfreeze_refs(page, expected_count - 1); 418 page_unfreeze_refs(page, expected_count - 1);
400 419
401 spin_unlock_irq(&mapping->tree_lock); 420 spin_unlock_irq(&mapping->tree_lock);
402 return 0; 421 return MIGRATEPAGE_SUCCESS;
403} 422}
404 423
405/* 424/*
@@ -407,7 +426,7 @@ int migrate_huge_page_move_mapping(struct address_space *mapping,
407 */ 426 */
408void migrate_page_copy(struct page *newpage, struct page *page) 427void migrate_page_copy(struct page *newpage, struct page *page)
409{ 428{
410 if (PageHuge(page)) 429 if (PageHuge(page) || PageTransHuge(page))
411 copy_huge_page(newpage, page); 430 copy_huge_page(newpage, page);
412 else 431 else
413 copy_highpage(newpage, page); 432 copy_highpage(newpage, page);
@@ -486,11 +505,11 @@ int migrate_page(struct address_space *mapping,
486 505
487 rc = migrate_page_move_mapping(mapping, newpage, page, NULL, mode); 506 rc = migrate_page_move_mapping(mapping, newpage, page, NULL, mode);
488 507
489 if (rc) 508 if (rc != MIGRATEPAGE_SUCCESS)
490 return rc; 509 return rc;
491 510
492 migrate_page_copy(newpage, page); 511 migrate_page_copy(newpage, page);
493 return 0; 512 return MIGRATEPAGE_SUCCESS;
494} 513}
495EXPORT_SYMBOL(migrate_page); 514EXPORT_SYMBOL(migrate_page);
496 515
@@ -513,7 +532,7 @@ int buffer_migrate_page(struct address_space *mapping,
513 532
514 rc = migrate_page_move_mapping(mapping, newpage, page, head, mode); 533 rc = migrate_page_move_mapping(mapping, newpage, page, head, mode);
515 534
516 if (rc) 535 if (rc != MIGRATEPAGE_SUCCESS)
517 return rc; 536 return rc;
518 537
519 /* 538 /*
@@ -549,7 +568,7 @@ int buffer_migrate_page(struct address_space *mapping,
549 568
550 } while (bh != head); 569 } while (bh != head);
551 570
552 return 0; 571 return MIGRATEPAGE_SUCCESS;
553} 572}
554EXPORT_SYMBOL(buffer_migrate_page); 573EXPORT_SYMBOL(buffer_migrate_page);
555#endif 574#endif
@@ -628,7 +647,7 @@ static int fallback_migrate_page(struct address_space *mapping,
628 * 647 *
629 * Return value: 648 * Return value:
630 * < 0 - error code 649 * < 0 - error code
631 * == 0 - success 650 * MIGRATEPAGE_SUCCESS - success
632 */ 651 */
633static int move_to_new_page(struct page *newpage, struct page *page, 652static int move_to_new_page(struct page *newpage, struct page *page,
634 int remap_swapcache, enum migrate_mode mode) 653 int remap_swapcache, enum migrate_mode mode)
@@ -665,7 +684,7 @@ static int move_to_new_page(struct page *newpage, struct page *page,
665 else 684 else
666 rc = fallback_migrate_page(mapping, newpage, page, mode); 685 rc = fallback_migrate_page(mapping, newpage, page, mode);
667 686
668 if (rc) { 687 if (rc != MIGRATEPAGE_SUCCESS) {
669 newpage->mapping = NULL; 688 newpage->mapping = NULL;
670 } else { 689 } else {
671 if (remap_swapcache) 690 if (remap_swapcache)
@@ -751,7 +770,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
751 */ 770 */
752 if (PageAnon(page)) { 771 if (PageAnon(page)) {
753 /* 772 /*
754 * Only page_lock_anon_vma() understands the subtleties of 773 * Only page_lock_anon_vma_read() understands the subtleties of
755 * getting a hold on an anon_vma from outside one of its mms. 774 * getting a hold on an anon_vma from outside one of its mms.
756 */ 775 */
757 anon_vma = page_get_anon_vma(page); 776 anon_vma = page_get_anon_vma(page);
@@ -778,6 +797,18 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
778 } 797 }
779 } 798 }
780 799
800 if (unlikely(balloon_page_movable(page))) {
801 /*
802 * A ballooned page does not need any special attention from
803 * physical to virtual reverse mapping procedures.
804 * Skip any attempt to unmap PTEs or to remap swap cache,
805 * in order to avoid burning cycles at rmap level, and perform
 806 * the page migration right away (protected by page lock).
807 */
808 rc = balloon_page_migrate(newpage, page, mode);
809 goto uncharge;
810 }
811
781 /* 812 /*
782 * Corner case handling: 813 * Corner case handling:
783 * 1. When a new swap-cache page is read into, it is added to the LRU 814 * 1. When a new swap-cache page is read into, it is added to the LRU
@@ -814,7 +845,9 @@ skip_unmap:
814 put_anon_vma(anon_vma); 845 put_anon_vma(anon_vma);
815 846
816uncharge: 847uncharge:
817 mem_cgroup_end_migration(mem, page, newpage, rc == 0); 848 mem_cgroup_end_migration(mem, page, newpage,
849 (rc == MIGRATEPAGE_SUCCESS ||
850 rc == MIGRATEPAGE_BALLOON_SUCCESS));
818unlock: 851unlock:
819 unlock_page(page); 852 unlock_page(page);
820out: 853out:
@@ -846,6 +879,18 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
846 goto out; 879 goto out;
847 880
848 rc = __unmap_and_move(page, newpage, force, offlining, mode); 881 rc = __unmap_and_move(page, newpage, force, offlining, mode);
882
883 if (unlikely(rc == MIGRATEPAGE_BALLOON_SUCCESS)) {
884 /*
885 * A ballooned page has been migrated already.
 886 * Now it's time to wrap up the counters,
 887 * hand the page back to Buddy and return.
888 */
889 dec_zone_page_state(page, NR_ISOLATED_ANON +
890 page_is_file_cache(page));
891 balloon_page_free(page);
892 return MIGRATEPAGE_SUCCESS;
893 }
849out: 894out:
850 if (rc != -EAGAIN) { 895 if (rc != -EAGAIN) {
851 /* 896 /*
@@ -958,10 +1003,11 @@ out:
958 */ 1003 */
959int migrate_pages(struct list_head *from, 1004int migrate_pages(struct list_head *from,
960 new_page_t get_new_page, unsigned long private, bool offlining, 1005 new_page_t get_new_page, unsigned long private, bool offlining,
961 enum migrate_mode mode) 1006 enum migrate_mode mode, int reason)
962{ 1007{
963 int retry = 1; 1008 int retry = 1;
964 int nr_failed = 0; 1009 int nr_failed = 0;
1010 int nr_succeeded = 0;
965 int pass = 0; 1011 int pass = 0;
966 struct page *page; 1012 struct page *page;
967 struct page *page2; 1013 struct page *page2;
@@ -987,7 +1033,8 @@ int migrate_pages(struct list_head *from,
987 case -EAGAIN: 1033 case -EAGAIN:
988 retry++; 1034 retry++;
989 break; 1035 break;
990 case 0: 1036 case MIGRATEPAGE_SUCCESS:
1037 nr_succeeded++;
991 break; 1038 break;
992 default: 1039 default:
993 /* Permanent failure */ 1040 /* Permanent failure */
@@ -996,15 +1043,18 @@ int migrate_pages(struct list_head *from,
996 } 1043 }
997 } 1044 }
998 } 1045 }
999 rc = 0; 1046 rc = nr_failed + retry;
1000out: 1047out:
1048 if (nr_succeeded)
1049 count_vm_events(PGMIGRATE_SUCCESS, nr_succeeded);
1050 if (nr_failed)
1051 count_vm_events(PGMIGRATE_FAIL, nr_failed);
1052 trace_mm_migrate_pages(nr_succeeded, nr_failed, mode, reason);
1053
1001 if (!swapwrite) 1054 if (!swapwrite)
1002 current->flags &= ~PF_SWAPWRITE; 1055 current->flags &= ~PF_SWAPWRITE;
1003 1056
1004 if (rc) 1057 return rc;
1005 return rc;
1006
1007 return nr_failed + retry;
1008} 1058}
1009 1059
1010int migrate_huge_page(struct page *hpage, new_page_t get_new_page, 1060int migrate_huge_page(struct page *hpage, new_page_t get_new_page,
@@ -1024,7 +1074,7 @@ int migrate_huge_page(struct page *hpage, new_page_t get_new_page,
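
With the return-value change above, migrate_pages() always reports the number of pages that were not migrated (permanent failures plus pages still pending after the retry passes), and the new PGMIGRATE_* counters and tracepoint fire on the way out. A condensed sketch of the caller pattern used throughout this patch (the page list and allocator names are placeholders):

/* Hypothetical caller: any non-zero return means some pages were left
 * behind and must be put back on the LRU. */
static void toy_migrate_list(struct list_head *pagelist)
{
	int nr_failed;

	nr_failed = migrate_pages(pagelist, new_page_somewhere, 0,
				  false, MIGRATE_SYNC, MR_SYSCALL);
	if (nr_failed)
		putback_lru_pages(pagelist);
}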
1024 /* try again */ 1074 /* try again */
1025 cond_resched(); 1075 cond_resched();
1026 break; 1076 break;
1027 case 0: 1077 case MIGRATEPAGE_SUCCESS:
1028 goto out; 1078 goto out;
1029 default: 1079 default:
1030 rc = -EIO; 1080 rc = -EIO;
@@ -1139,7 +1189,8 @@ set_status:
1139 err = 0; 1189 err = 0;
1140 if (!list_empty(&pagelist)) { 1190 if (!list_empty(&pagelist)) {
1141 err = migrate_pages(&pagelist, new_page_node, 1191 err = migrate_pages(&pagelist, new_page_node,
1142 (unsigned long)pm, 0, MIGRATE_SYNC); 1192 (unsigned long)pm, 0, MIGRATE_SYNC,
1193 MR_SYSCALL);
1143 if (err) 1194 if (err)
1144 putback_lru_pages(&pagelist); 1195 putback_lru_pages(&pagelist);
1145 } 1196 }
@@ -1201,7 +1252,7 @@ static int do_pages_move(struct mm_struct *mm, nodemask_t task_nodes,
1201 if (node < 0 || node >= MAX_NUMNODES) 1252 if (node < 0 || node >= MAX_NUMNODES)
1202 goto out_pm; 1253 goto out_pm;
1203 1254
1204 if (!node_state(node, N_HIGH_MEMORY)) 1255 if (!node_state(node, N_MEMORY))
1205 goto out_pm; 1256 goto out_pm;
1206 1257
1207 err = -EACCES; 1258 err = -EACCES;
@@ -1403,4 +1454,329 @@ int migrate_vmas(struct mm_struct *mm, const nodemask_t *to,
1403 } 1454 }
1404 return err; 1455 return err;
1405} 1456}
1406#endif 1457
1458#ifdef CONFIG_NUMA_BALANCING
1459/*
1460 * Returns true if this is a safe migration target node for misplaced NUMA
 1461 * pages. Currently it only checks the watermarks, which is crude.
1462 */
1463static bool migrate_balanced_pgdat(struct pglist_data *pgdat,
1464 int nr_migrate_pages)
1465{
1466 int z;
1467 for (z = pgdat->nr_zones - 1; z >= 0; z--) {
1468 struct zone *zone = pgdat->node_zones + z;
1469
1470 if (!populated_zone(zone))
1471 continue;
1472
1473 if (zone->all_unreclaimable)
1474 continue;
1475
1476 /* Avoid waking kswapd by allocating pages_to_migrate pages. */
1477 if (!zone_watermark_ok(zone, 0,
1478 high_wmark_pages(zone) +
1479 nr_migrate_pages,
1480 0, 0))
1481 continue;
1482 return true;
1483 }
1484 return false;
1485}
1486
1487static struct page *alloc_misplaced_dst_page(struct page *page,
1488 unsigned long data,
1489 int **result)
1490{
1491 int nid = (int) data;
1492 struct page *newpage;
1493
1494 newpage = alloc_pages_exact_node(nid,
1495 (GFP_HIGHUSER_MOVABLE | GFP_THISNODE |
1496 __GFP_NOMEMALLOC | __GFP_NORETRY |
1497 __GFP_NOWARN) &
1498 ~GFP_IOFS, 0);
1499 if (newpage)
1500 page_xchg_last_nid(newpage, page_last_nid(page));
1501
1502 return newpage;
1503}
1504
1505/*
1506 * page migration rate limiting control.
1507 * Do not migrate more than @pages_to_migrate in a @migrate_interval_millisecs
1508 * window of time. Default here says do not migrate more than 1280M per second.
1509 * If a node is rate-limited then PTE NUMA updates are also rate-limited. However
1510 * as it is faults that reset the window, pte updates will happen unconditionally
1511 * if there has not been a fault since @pteupdate_interval_millisecs after the
1512 * throttle window closed.
1513 */
1514static unsigned int migrate_interval_millisecs __read_mostly = 100;
1515static unsigned int pteupdate_interval_millisecs __read_mostly = 1000;
1516static unsigned int ratelimit_pages __read_mostly = 128 << (20 - PAGE_SHIFT);
1517
1518/* Returns true if NUMA migration is currently rate limited */
1519bool migrate_ratelimited(int node)
1520{
1521 pg_data_t *pgdat = NODE_DATA(node);
1522
1523 if (time_after(jiffies, pgdat->numabalancing_migrate_next_window +
1524 msecs_to_jiffies(pteupdate_interval_millisecs)))
1525 return false;
1526
1527 if (pgdat->numabalancing_migrate_nr_pages < ratelimit_pages)
1528 return false;
1529
1530 return true;
1531}
1532
1533/* Returns true if the node is migrate rate-limited after the update */
1534bool numamigrate_update_ratelimit(pg_data_t *pgdat, unsigned long nr_pages)
1535{
1536 bool rate_limited = false;
1537
1538 /*
1539 * Rate-limit the amount of data that is being migrated to a node.
1540 * Optimal placement is no good if the memory bus is saturated and
1541 * all the time is being spent migrating!
1542 */
1543 spin_lock(&pgdat->numabalancing_migrate_lock);
1544 if (time_after(jiffies, pgdat->numabalancing_migrate_next_window)) {
1545 pgdat->numabalancing_migrate_nr_pages = 0;
1546 pgdat->numabalancing_migrate_next_window = jiffies +
1547 msecs_to_jiffies(migrate_interval_millisecs);
1548 }
1549 if (pgdat->numabalancing_migrate_nr_pages > ratelimit_pages)
1550 rate_limited = true;
1551 else
1552 pgdat->numabalancing_migrate_nr_pages += nr_pages;
1553 spin_unlock(&pgdat->numabalancing_migrate_lock);
1554
1555 return rate_limited;
1556}
1557
1558int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page)
1559{
1560 int ret = 0;
1561
1562 /* Avoid migrating to a node that is nearly full */
1563 if (migrate_balanced_pgdat(pgdat, 1)) {
1564 int page_lru;
1565
1566 if (isolate_lru_page(page)) {
1567 put_page(page);
1568 return 0;
1569 }
1570
1571 /* Page is isolated */
1572 ret = 1;
1573 page_lru = page_is_file_cache(page);
1574 if (!PageTransHuge(page))
1575 inc_zone_page_state(page, NR_ISOLATED_ANON + page_lru);
1576 else
1577 mod_zone_page_state(page_zone(page),
1578 NR_ISOLATED_ANON + page_lru,
1579 HPAGE_PMD_NR);
1580 }
1581
1582 /*
1583 * Page is either isolated or there is not enough space on the target
1584 * node. If isolated, then it has taken a reference count and the
1585 * callers reference can be safely dropped without the page
1586 * disappearing underneath us during migration. Otherwise the page is
1587 * not to be migrated but the callers reference should still be
1588 * dropped so it does not leak.
1589 */
1590 put_page(page);
1591
1592 return ret;
1593}
1594
1595/*
1596 * Attempt to migrate a misplaced page to the specified destination
1597 * node. Caller is expected to have an elevated reference count on
1598 * the page that will be dropped by this function before returning.
1599 */
1600int migrate_misplaced_page(struct page *page, int node)
1601{
1602 pg_data_t *pgdat = NODE_DATA(node);
1603 int isolated = 0;
1604 int nr_remaining;
1605 LIST_HEAD(migratepages);
1606
1607 /*
1608 * Don't migrate pages that are mapped in multiple processes.
1609 * TODO: Handle false sharing detection instead of this hammer
1610 */
1611 if (page_mapcount(page) != 1) {
1612 put_page(page);
1613 goto out;
1614 }
1615
1616 /*
1617 * Rate-limit the amount of data that is being migrated to a node.
1618 * Optimal placement is no good if the memory bus is saturated and
1619 * all the time is being spent migrating!
1620 */
1621 if (numamigrate_update_ratelimit(pgdat, 1)) {
1622 put_page(page);
1623 goto out;
1624 }
1625
1626 isolated = numamigrate_isolate_page(pgdat, page);
1627 if (!isolated)
1628 goto out;
1629
1630 list_add(&page->lru, &migratepages);
1631 nr_remaining = migrate_pages(&migratepages,
1632 alloc_misplaced_dst_page,
1633 node, false, MIGRATE_ASYNC,
1634 MR_NUMA_MISPLACED);
1635 if (nr_remaining) {
1636 putback_lru_pages(&migratepages);
1637 isolated = 0;
1638 } else
1639 count_vm_numa_event(NUMA_PAGE_MIGRATE);
1640 BUG_ON(!list_empty(&migratepages));
1641out:
1642 return isolated;
1643}
1644#endif /* CONFIG_NUMA_BALANCING */
1645
1646#if defined(CONFIG_NUMA_BALANCING) && defined(CONFIG_TRANSPARENT_HUGEPAGE)
1647int migrate_misplaced_transhuge_page(struct mm_struct *mm,
1648 struct vm_area_struct *vma,
1649 pmd_t *pmd, pmd_t entry,
1650 unsigned long address,
1651 struct page *page, int node)
1652{
1653 unsigned long haddr = address & HPAGE_PMD_MASK;
1654 pg_data_t *pgdat = NODE_DATA(node);
1655 int isolated = 0;
1656 struct page *new_page = NULL;
1657 struct mem_cgroup *memcg = NULL;
1658 int page_lru = page_is_file_cache(page);
1659
1660 /*
1661 * Don't migrate pages that are mapped in multiple processes.
1662 * TODO: Handle false sharing detection instead of this hammer
1663 */
1664 if (page_mapcount(page) != 1)
1665 goto out_dropref;
1666
1667 /*
1668 * Rate-limit the amount of data that is being migrated to a node.
1669 * Optimal placement is no good if the memory bus is saturated and
1670 * all the time is being spent migrating!
1671 */
1672 if (numamigrate_update_ratelimit(pgdat, HPAGE_PMD_NR))
1673 goto out_dropref;
1674
1675 new_page = alloc_pages_node(node,
1676 (GFP_TRANSHUGE | GFP_THISNODE) & ~__GFP_WAIT, HPAGE_PMD_ORDER);
1677 if (!new_page) {
1678 count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR);
1679 goto out_dropref;
1680 }
1681 page_xchg_last_nid(new_page, page_last_nid(page));
1682
1683 isolated = numamigrate_isolate_page(pgdat, page);
1684
1685 /*
1686 * Failing to isolate or a GUP pin prevents migration. The expected
1687 * page count is 2. 1 for anonymous pages without a mapping and 1
1688	 * for the caller's pin. If the page was isolated, the page will
1689 * need to be put back on the LRU.
1690 */
1691 if (!isolated || page_count(page) != 2) {
1692 count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR);
1693 put_page(new_page);
1694 if (isolated) {
1695 putback_lru_page(page);
1696 isolated = 0;
1697 goto out;
1698 }
1699 goto out_keep_locked;
1700 }
1701
1702 /* Prepare a page as a migration target */
1703 __set_page_locked(new_page);
1704 SetPageSwapBacked(new_page);
1705
1706 /* anon mapping, we can simply copy page->mapping to the new page: */
1707 new_page->mapping = page->mapping;
1708 new_page->index = page->index;
1709 migrate_page_copy(new_page, page);
1710 WARN_ON(PageLRU(new_page));
1711
1712 /* Recheck the target PMD */
1713 spin_lock(&mm->page_table_lock);
1714 if (unlikely(!pmd_same(*pmd, entry))) {
1715 spin_unlock(&mm->page_table_lock);
1716
1717 /* Reverse changes made by migrate_page_copy() */
1718 if (TestClearPageActive(new_page))
1719 SetPageActive(page);
1720 if (TestClearPageUnevictable(new_page))
1721 SetPageUnevictable(page);
1722 mlock_migrate_page(page, new_page);
1723
1724 unlock_page(new_page);
1725 put_page(new_page); /* Free it */
1726
1727 unlock_page(page);
1728 putback_lru_page(page);
1729
1730 count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR);
1731 goto out;
1732 }
1733
1734 /*
1735 * Traditional migration needs to prepare the memcg charge
1736 * transaction early to prevent the old page from being
1737 * uncharged when installing migration entries. Here we can
1738 * save the potential rollback and start the charge transfer
1739 * only when migration is already known to end successfully.
1740 */
1741 mem_cgroup_prepare_migration(page, new_page, &memcg);
1742
1743 entry = mk_pmd(new_page, vma->vm_page_prot);
1744 entry = pmd_mknonnuma(entry);
1745 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
1746 entry = pmd_mkhuge(entry);
1747
1748 page_add_new_anon_rmap(new_page, vma, haddr);
1749
1750 set_pmd_at(mm, haddr, pmd, entry);
1751 update_mmu_cache_pmd(vma, address, &entry);
1752 page_remove_rmap(page);
1753 /*
1754 * Finish the charge transaction under the page table lock to
1755 * prevent split_huge_page() from dividing up the charge
1756 * before it's fully transferred to the new page.
1757 */
1758 mem_cgroup_end_migration(memcg, page, new_page, true);
1759 spin_unlock(&mm->page_table_lock);
1760
1761 unlock_page(new_page);
1762 unlock_page(page);
1763 put_page(page); /* Drop the rmap reference */
1764 put_page(page); /* Drop the LRU isolation reference */
1765
1766 count_vm_events(PGMIGRATE_SUCCESS, HPAGE_PMD_NR);
1767 count_vm_numa_events(NUMA_PAGE_MIGRATE, HPAGE_PMD_NR);
1768
1769out:
1770 mod_zone_page_state(page_zone(page),
1771 NR_ISOLATED_ANON + page_lru,
1772 -HPAGE_PMD_NR);
1773 return isolated;
1774
1775out_dropref:
1776 put_page(page);
1777out_keep_locked:
1778 return 0;
1779}
1780#endif /* CONFIG_NUMA_BALANCING */
1781
1782#endif /* CONFIG_NUMA */
diff --git a/mm/mlock.c b/mm/mlock.c
index f0b9ce572fc7..c9bd528b01d2 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -517,11 +517,11 @@ SYSCALL_DEFINE2(munlock, unsigned long, start, size_t, len)
517static int do_mlockall(int flags) 517static int do_mlockall(int flags)
518{ 518{
519 struct vm_area_struct * vma, * prev = NULL; 519 struct vm_area_struct * vma, * prev = NULL;
520 unsigned int def_flags = 0;
521 520
522 if (flags & MCL_FUTURE) 521 if (flags & MCL_FUTURE)
523 def_flags = VM_LOCKED; 522 current->mm->def_flags |= VM_LOCKED;
524 current->mm->def_flags = def_flags; 523 else
524 current->mm->def_flags &= ~VM_LOCKED;
525 if (flags == MCL_FUTURE) 525 if (flags == MCL_FUTURE)
526 goto out; 526 goto out;
527 527
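The mlock.c hunk above changes do_mlockall() from rewriting mm->def_flags wholesale to toggling only the VM_LOCKED bit, so any other default flag survives an mlockall()/munlockall() cycle. A small sketch of the behavioural difference, using an illustrative extra flag (VM_OTHER is made up; VM_LOCKED and MCL_FUTURE use their usual x86 values):

#include <stdio.h>

#define VM_LOCKED   0x00002000UL
#define VM_OTHER    0x00000100UL  /* stand-in for any other default flag */
#define MCL_FUTURE  2

int main(void)
{
	unsigned long def_flags = VM_OTHER;   /* pre-existing default flag */
	int flags = 0;                        /* do_mlockall(0), i.e. munlockall */

	/* Old behaviour: def_flags is overwritten, VM_OTHER is lost. */
	unsigned long old_way = (flags & MCL_FUTURE) ? VM_LOCKED : 0;

	/* New behaviour: only the VM_LOCKED bit is set or cleared. */
	unsigned long new_way = def_flags;
	if (flags & MCL_FUTURE)
		new_way |= VM_LOCKED;
	else
		new_way &= ~VM_LOCKED;

	printf("old: %#lx  new: %#lx\n", old_way, new_way);
	return 0;
}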
diff --git a/mm/mmap.c b/mm/mmap.c
index 9a796c41e7d9..d1e4124f3d0e 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -31,6 +31,7 @@
31#include <linux/audit.h> 31#include <linux/audit.h>
32#include <linux/khugepaged.h> 32#include <linux/khugepaged.h>
33#include <linux/uprobes.h> 33#include <linux/uprobes.h>
34#include <linux/rbtree_augmented.h>
34 35
35#include <asm/uaccess.h> 36#include <asm/uaccess.h>
36#include <asm/cacheflush.h> 37#include <asm/cacheflush.h>
@@ -89,6 +90,20 @@ int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT;
89struct percpu_counter vm_committed_as ____cacheline_aligned_in_smp; 90struct percpu_counter vm_committed_as ____cacheline_aligned_in_smp;
90 91
91/* 92/*
93 * The global memory commitment made in the system can be a metric
94 * that can be used to drive ballooning decisions when Linux is hosted
95 * as a guest. On Hyper-V, the host implements a policy engine for dynamically
96 * balancing memory across competing virtual machines that are hosted.
97 * Several metrics drive this policy engine including the guest reported
98 * memory commitment.
99 */
100unsigned long vm_memory_committed(void)
101{
102 return percpu_counter_read_positive(&vm_committed_as);
103}
104EXPORT_SYMBOL_GPL(vm_memory_committed);
105
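vm_memory_committed() above reads the vm_committed_as counter with percpu_counter_read_positive(), which returns the batched global part of the counter clamped at zero; the small per-CPU remainders are deliberately not summed on this fast path. A rough userspace model of that trade-off, with made-up numbers and a simplified counter layout:

#include <stdio.h>

#define NR_CPUS 4

/* Simplified percpu_counter: a batched global count plus per-CPU deltas. */
struct pcpu_counter {
	long count;           /* global part, folded in every "batch" events */
	long cpu[NR_CPUS];    /* small per-CPU remainders */
};

/* Cheap read: global part only, clamped at zero (may lag the true value). */
static unsigned long read_positive(const struct pcpu_counter *c)
{
	return c->count > 0 ? (unsigned long)c->count : 0;
}

/* Exact (but more expensive) sum across all CPUs. */
static long exact_sum(const struct pcpu_counter *c)
{
	long sum = c->count;
	int i;

	for (i = 0; i < NR_CPUS; i++)
		sum += c->cpu[i];
	return sum;
}

int main(void)
{
	struct pcpu_counter vm_committed = { 100, { 3, -2, 5, 1 } };

	printf("cheap: %lu  exact: %ld\n",
	       read_positive(&vm_committed), exact_sum(&vm_committed));
	return 0;
}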
106/*
92 * Check that a process has enough memory to allocate a new virtual 107 * Check that a process has enough memory to allocate a new virtual
93 * mapping. 0 means there is enough memory for the allocation to 108 * mapping. 0 means there is enough memory for the allocation to
94 * succeed and -ENOMEM implies there is not. 109 * succeed and -ENOMEM implies there is not.
@@ -297,40 +312,88 @@ out:
297 return retval; 312 return retval;
298} 313}
299 314
315static long vma_compute_subtree_gap(struct vm_area_struct *vma)
316{
317 unsigned long max, subtree_gap;
318 max = vma->vm_start;
319 if (vma->vm_prev)
320 max -= vma->vm_prev->vm_end;
321 if (vma->vm_rb.rb_left) {
322 subtree_gap = rb_entry(vma->vm_rb.rb_left,
323 struct vm_area_struct, vm_rb)->rb_subtree_gap;
324 if (subtree_gap > max)
325 max = subtree_gap;
326 }
327 if (vma->vm_rb.rb_right) {
328 subtree_gap = rb_entry(vma->vm_rb.rb_right,
329 struct vm_area_struct, vm_rb)->rb_subtree_gap;
330 if (subtree_gap > max)
331 max = subtree_gap;
332 }
333 return max;
334}
335
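vma_compute_subtree_gap() above defines the augmented value the rest of these mmap.c changes rely on: for each rbtree node it is the largest free gap found either immediately before that vma or anywhere in its subtree. A self-contained sketch of the same bottom-up computation over a toy tree (plain binary tree, illustrative field names, no rebalancing; the kernel recomputes only one node at a time from its children's cached values):

#include <stdio.h>

/* Toy stand-in for a vma node in an augmented tree. */
struct vnode {
	unsigned long vm_start, vm_end;
	unsigned long prev_end;        /* end of the vma just below in address order */
	struct vnode *left, *right;
	unsigned long subtree_gap;     /* what rb_subtree_gap caches */
};

static unsigned long max3(unsigned long a, unsigned long b, unsigned long c)
{
	unsigned long m = a > b ? a : b;
	return m > c ? m : c;
}

/* Bottom-up: a node's value covers its own gap and both subtrees. */
static unsigned long compute_gap(struct vnode *v)
{
	unsigned long own, l, r;

	if (!v)
		return 0;
	own = v->vm_start - v->prev_end;      /* gap just before this vma */
	l = compute_gap(v->left);
	r = compute_gap(v->right);
	v->subtree_gap = max3(own, l, r);
	return v->subtree_gap;
}

int main(void)
{
	struct vnode a = { 0x1000, 0x2000, 0x0000, 0, 0, 0 };    /* gap 0x1000 */
	struct vnode c = { 0x9000, 0xa000, 0x3000, 0, 0, 0 };    /* gap 0x6000 */
	struct vnode b = { 0x2000, 0x3000, 0x2000, &a, &c, 0 };  /* gap 0      */

	printf("largest gap below root: %#lx\n", compute_gap(&b));  /* 0x6000 */
	return 0;
}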
300#ifdef CONFIG_DEBUG_VM_RB 336#ifdef CONFIG_DEBUG_VM_RB
301static int browse_rb(struct rb_root *root) 337static int browse_rb(struct rb_root *root)
302{ 338{
303 int i = 0, j; 339 int i = 0, j, bug = 0;
304 struct rb_node *nd, *pn = NULL; 340 struct rb_node *nd, *pn = NULL;
305 unsigned long prev = 0, pend = 0; 341 unsigned long prev = 0, pend = 0;
306 342
307 for (nd = rb_first(root); nd; nd = rb_next(nd)) { 343 for (nd = rb_first(root); nd; nd = rb_next(nd)) {
308 struct vm_area_struct *vma; 344 struct vm_area_struct *vma;
309 vma = rb_entry(nd, struct vm_area_struct, vm_rb); 345 vma = rb_entry(nd, struct vm_area_struct, vm_rb);
310 if (vma->vm_start < prev) 346 if (vma->vm_start < prev) {
311 printk("vm_start %lx prev %lx\n", vma->vm_start, prev), i = -1; 347 printk("vm_start %lx prev %lx\n", vma->vm_start, prev);
312 if (vma->vm_start < pend) 348 bug = 1;
349 }
350 if (vma->vm_start < pend) {
313 printk("vm_start %lx pend %lx\n", vma->vm_start, pend); 351 printk("vm_start %lx pend %lx\n", vma->vm_start, pend);
314 if (vma->vm_start > vma->vm_end) 352 bug = 1;
315 printk("vm_end %lx < vm_start %lx\n", vma->vm_end, vma->vm_start); 353 }
354 if (vma->vm_start > vma->vm_end) {
355 printk("vm_end %lx < vm_start %lx\n",
356 vma->vm_end, vma->vm_start);
357 bug = 1;
358 }
359 if (vma->rb_subtree_gap != vma_compute_subtree_gap(vma)) {
360 printk("free gap %lx, correct %lx\n",
361 vma->rb_subtree_gap,
362 vma_compute_subtree_gap(vma));
363 bug = 1;
364 }
316 i++; 365 i++;
317 pn = nd; 366 pn = nd;
318 prev = vma->vm_start; 367 prev = vma->vm_start;
319 pend = vma->vm_end; 368 pend = vma->vm_end;
320 } 369 }
321 j = 0; 370 j = 0;
322 for (nd = pn; nd; nd = rb_prev(nd)) { 371 for (nd = pn; nd; nd = rb_prev(nd))
323 j++; 372 j++;
373 if (i != j) {
374 printk("backwards %d, forwards %d\n", j, i);
375 bug = 1;
376 }
377 return bug ? -1 : i;
378}
379
380static void validate_mm_rb(struct rb_root *root, struct vm_area_struct *ignore)
381{
382 struct rb_node *nd;
383
384 for (nd = rb_first(root); nd; nd = rb_next(nd)) {
385 struct vm_area_struct *vma;
386 vma = rb_entry(nd, struct vm_area_struct, vm_rb);
387 BUG_ON(vma != ignore &&
388 vma->rb_subtree_gap != vma_compute_subtree_gap(vma));
324 } 389 }
325 if (i != j)
326 printk("backwards %d, forwards %d\n", j, i), i = 0;
327 return i;
328} 390}
329 391
330void validate_mm(struct mm_struct *mm) 392void validate_mm(struct mm_struct *mm)
331{ 393{
332 int bug = 0; 394 int bug = 0;
333 int i = 0; 395 int i = 0;
396 unsigned long highest_address = 0;
334 struct vm_area_struct *vma = mm->mmap; 397 struct vm_area_struct *vma = mm->mmap;
335 while (vma) { 398 while (vma) {
336 struct anon_vma_chain *avc; 399 struct anon_vma_chain *avc;
@@ -338,20 +401,73 @@ void validate_mm(struct mm_struct *mm)
338 list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) 401 list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
339 anon_vma_interval_tree_verify(avc); 402 anon_vma_interval_tree_verify(avc);
340 vma_unlock_anon_vma(vma); 403 vma_unlock_anon_vma(vma);
404 highest_address = vma->vm_end;
341 vma = vma->vm_next; 405 vma = vma->vm_next;
342 i++; 406 i++;
343 } 407 }
344 if (i != mm->map_count) 408 if (i != mm->map_count) {
345 printk("map_count %d vm_next %d\n", mm->map_count, i), bug = 1; 409 printk("map_count %d vm_next %d\n", mm->map_count, i);
410 bug = 1;
411 }
412 if (highest_address != mm->highest_vm_end) {
413 printk("mm->highest_vm_end %lx, found %lx\n",
414 mm->highest_vm_end, highest_address);
415 bug = 1;
416 }
346 i = browse_rb(&mm->mm_rb); 417 i = browse_rb(&mm->mm_rb);
347 if (i != mm->map_count) 418 if (i != mm->map_count) {
348 printk("map_count %d rb %d\n", mm->map_count, i), bug = 1; 419 printk("map_count %d rb %d\n", mm->map_count, i);
420 bug = 1;
421 }
349 BUG_ON(bug); 422 BUG_ON(bug);
350} 423}
351#else 424#else
425#define validate_mm_rb(root, ignore) do { } while (0)
352#define validate_mm(mm) do { } while (0) 426#define validate_mm(mm) do { } while (0)
353#endif 427#endif
354 428
429RB_DECLARE_CALLBACKS(static, vma_gap_callbacks, struct vm_area_struct, vm_rb,
430 unsigned long, rb_subtree_gap, vma_compute_subtree_gap)
431
432/*
433 * Update augmented rbtree rb_subtree_gap values after vma->vm_start or
434 * vma->vm_prev->vm_end values changed, without modifying the vma's position
435 * in the rbtree.
436 */
437static void vma_gap_update(struct vm_area_struct *vma)
438{
439 /*
440 * As it turns out, RB_DECLARE_CALLBACKS() already created a callback
441	 * function that does exactly what we want.
442 */
443 vma_gap_callbacks_propagate(&vma->vm_rb, NULL);
444}
445
446static inline void vma_rb_insert(struct vm_area_struct *vma,
447 struct rb_root *root)
448{
449 /* All rb_subtree_gap values must be consistent prior to insertion */
450 validate_mm_rb(root, NULL);
451
452 rb_insert_augmented(&vma->vm_rb, root, &vma_gap_callbacks);
453}
454
455static void vma_rb_erase(struct vm_area_struct *vma, struct rb_root *root)
456{
457 /*
458 * All rb_subtree_gap values must be consistent prior to erase,
459 * with the possible exception of the vma being erased.
460 */
461 validate_mm_rb(root, vma);
462
463 /*
464 * Note rb_erase_augmented is a fairly large inline function,
465 * so make sure we instantiate it only once with our desired
466 * augmented rbtree callbacks.
467 */
468 rb_erase_augmented(&vma->vm_rb, root, &vma_gap_callbacks);
469}
470
355/* 471/*
356 * vma has some anon_vma assigned, and is already inserted on that 472 * vma has some anon_vma assigned, and is already inserted on that
357 * anon_vma's interval trees. 473 * anon_vma's interval trees.
@@ -421,8 +537,25 @@ static int find_vma_links(struct mm_struct *mm, unsigned long addr,
421void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma, 537void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma,
422 struct rb_node **rb_link, struct rb_node *rb_parent) 538 struct rb_node **rb_link, struct rb_node *rb_parent)
423{ 539{
540 /* Update tracking information for the gap following the new vma. */
541 if (vma->vm_next)
542 vma_gap_update(vma->vm_next);
543 else
544 mm->highest_vm_end = vma->vm_end;
545
546 /*
547 * vma->vm_prev wasn't known when we followed the rbtree to find the
548 * correct insertion point for that vma. As a result, we could not
549 * update the vma vm_rb parents rb_subtree_gap values on the way down.
550 * So, we first insert the vma with a zero rb_subtree_gap value
551 * (to be consistent with what we did on the way down), and then
552 * immediately update the gap to the correct value. Finally we
553 * rebalance the rbtree after all augmented values have been set.
554 */
424 rb_link_node(&vma->vm_rb, rb_parent, rb_link); 555 rb_link_node(&vma->vm_rb, rb_parent, rb_link);
425 rb_insert_color(&vma->vm_rb, &mm->mm_rb); 556 vma->rb_subtree_gap = 0;
557 vma_gap_update(vma);
558 vma_rb_insert(vma, &mm->mm_rb);
426} 559}
427 560
428static void __vma_link_file(struct vm_area_struct *vma) 561static void __vma_link_file(struct vm_area_struct *vma)
@@ -498,12 +631,12 @@ static inline void
498__vma_unlink(struct mm_struct *mm, struct vm_area_struct *vma, 631__vma_unlink(struct mm_struct *mm, struct vm_area_struct *vma,
499 struct vm_area_struct *prev) 632 struct vm_area_struct *prev)
500{ 633{
501 struct vm_area_struct *next = vma->vm_next; 634 struct vm_area_struct *next;
502 635
503 prev->vm_next = next; 636 vma_rb_erase(vma, &mm->mm_rb);
637 prev->vm_next = next = vma->vm_next;
504 if (next) 638 if (next)
505 next->vm_prev = prev; 639 next->vm_prev = prev;
506 rb_erase(&vma->vm_rb, &mm->mm_rb);
507 if (mm->mmap_cache == vma) 640 if (mm->mmap_cache == vma)
508 mm->mmap_cache = prev; 641 mm->mmap_cache = prev;
509} 642}
@@ -525,6 +658,7 @@ int vma_adjust(struct vm_area_struct *vma, unsigned long start,
525 struct rb_root *root = NULL; 658 struct rb_root *root = NULL;
526 struct anon_vma *anon_vma = NULL; 659 struct anon_vma *anon_vma = NULL;
527 struct file *file = vma->vm_file; 660 struct file *file = vma->vm_file;
661 bool start_changed = false, end_changed = false;
528 long adjust_next = 0; 662 long adjust_next = 0;
529 int remove_next = 0; 663 int remove_next = 0;
530 664
@@ -602,7 +736,7 @@ again: remove_next = 1 + (end > next->vm_end);
602 if (anon_vma) { 736 if (anon_vma) {
603 VM_BUG_ON(adjust_next && next->anon_vma && 737 VM_BUG_ON(adjust_next && next->anon_vma &&
604 anon_vma != next->anon_vma); 738 anon_vma != next->anon_vma);
605 anon_vma_lock(anon_vma); 739 anon_vma_lock_write(anon_vma);
606 anon_vma_interval_tree_pre_update_vma(vma); 740 anon_vma_interval_tree_pre_update_vma(vma);
607 if (adjust_next) 741 if (adjust_next)
608 anon_vma_interval_tree_pre_update_vma(next); 742 anon_vma_interval_tree_pre_update_vma(next);
@@ -615,8 +749,14 @@ again: remove_next = 1 + (end > next->vm_end);
615 vma_interval_tree_remove(next, root); 749 vma_interval_tree_remove(next, root);
616 } 750 }
617 751
618 vma->vm_start = start; 752 if (start != vma->vm_start) {
619 vma->vm_end = end; 753 vma->vm_start = start;
754 start_changed = true;
755 }
756 if (end != vma->vm_end) {
757 vma->vm_end = end;
758 end_changed = true;
759 }
620 vma->vm_pgoff = pgoff; 760 vma->vm_pgoff = pgoff;
621 if (adjust_next) { 761 if (adjust_next) {
622 next->vm_start += adjust_next << PAGE_SHIFT; 762 next->vm_start += adjust_next << PAGE_SHIFT;
@@ -645,6 +785,15 @@ again: remove_next = 1 + (end > next->vm_end);
645 * (it may either follow vma or precede it). 785 * (it may either follow vma or precede it).
646 */ 786 */
647 __insert_vm_struct(mm, insert); 787 __insert_vm_struct(mm, insert);
788 } else {
789 if (start_changed)
790 vma_gap_update(vma);
791 if (end_changed) {
792 if (!next)
793 mm->highest_vm_end = end;
794 else if (!adjust_next)
795 vma_gap_update(next);
796 }
648 } 797 }
649 798
650 if (anon_vma) { 799 if (anon_vma) {
@@ -678,10 +827,13 @@ again: remove_next = 1 + (end > next->vm_end);
678 * we must remove another next too. It would clutter 827 * we must remove another next too. It would clutter
679 * up the code too much to do both in one go. 828 * up the code too much to do both in one go.
680 */ 829 */
681 if (remove_next == 2) { 830 next = vma->vm_next;
682 next = vma->vm_next; 831 if (remove_next == 2)
683 goto again; 832 goto again;
684 } 833 else if (next)
834 vma_gap_update(next);
835 else
836 mm->highest_vm_end = end;
685 } 837 }
686 if (insert && file) 838 if (insert && file)
687 uprobe_mmap(insert); 839 uprobe_mmap(insert);
@@ -1153,8 +1305,9 @@ SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
1153 * memory so no accounting is necessary 1305 * memory so no accounting is necessary
1154 */ 1306 */
1155 file = hugetlb_file_setup(HUGETLB_ANON_FILE, addr, len, 1307 file = hugetlb_file_setup(HUGETLB_ANON_FILE, addr, len,
1156 VM_NORESERVE, &user, 1308 VM_NORESERVE,
1157 HUGETLB_ANONHUGE_INODE); 1309 &user, HUGETLB_ANONHUGE_INODE,
1310 (flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK);
1158 if (IS_ERR(file)) 1311 if (IS_ERR(file))
1159 return PTR_ERR(file); 1312 return PTR_ERR(file);
1160 } 1313 }
@@ -1335,7 +1488,11 @@ munmap_back:
1335 * 1488 *
1336 * Answer: Yes, several device drivers can do it in their 1489 * Answer: Yes, several device drivers can do it in their
1337 * f_op->mmap method. -DaveM 1490 * f_op->mmap method. -DaveM
1491 * Bug: If addr is changed, prev, rb_link, rb_parent should
1492 * be updated for vma_link()
1338 */ 1493 */
1494 WARN_ON_ONCE(addr != vma->vm_start);
1495
1339 addr = vma->vm_start; 1496 addr = vma->vm_start;
1340 pgoff = vma->vm_pgoff; 1497 pgoff = vma->vm_pgoff;
1341 vm_flags = vma->vm_flags; 1498 vm_flags = vma->vm_flags;
@@ -1400,6 +1557,206 @@ unacct_error:
1400 return error; 1557 return error;
1401} 1558}
1402 1559
1560unsigned long unmapped_area(struct vm_unmapped_area_info *info)
1561{
1562 /*
1563 * We implement the search by looking for an rbtree node that
1564 * immediately follows a suitable gap. That is,
1565 * - gap_start = vma->vm_prev->vm_end <= info->high_limit - length;
1566 * - gap_end = vma->vm_start >= info->low_limit + length;
1567 * - gap_end - gap_start >= length
1568 */
1569
1570 struct mm_struct *mm = current->mm;
1571 struct vm_area_struct *vma;
1572 unsigned long length, low_limit, high_limit, gap_start, gap_end;
1573
1574 /* Adjust search length to account for worst case alignment overhead */
1575 length = info->length + info->align_mask;
1576 if (length < info->length)
1577 return -ENOMEM;
1578
1579 /* Adjust search limits by the desired length */
1580 if (info->high_limit < length)
1581 return -ENOMEM;
1582 high_limit = info->high_limit - length;
1583
1584 if (info->low_limit > high_limit)
1585 return -ENOMEM;
1586 low_limit = info->low_limit + length;
1587
1588 /* Check if rbtree root looks promising */
1589 if (RB_EMPTY_ROOT(&mm->mm_rb))
1590 goto check_highest;
1591 vma = rb_entry(mm->mm_rb.rb_node, struct vm_area_struct, vm_rb);
1592 if (vma->rb_subtree_gap < length)
1593 goto check_highest;
1594
1595 while (true) {
1596 /* Visit left subtree if it looks promising */
1597 gap_end = vma->vm_start;
1598 if (gap_end >= low_limit && vma->vm_rb.rb_left) {
1599 struct vm_area_struct *left =
1600 rb_entry(vma->vm_rb.rb_left,
1601 struct vm_area_struct, vm_rb);
1602 if (left->rb_subtree_gap >= length) {
1603 vma = left;
1604 continue;
1605 }
1606 }
1607
1608 gap_start = vma->vm_prev ? vma->vm_prev->vm_end : 0;
1609check_current:
1610 /* Check if current node has a suitable gap */
1611 if (gap_start > high_limit)
1612 return -ENOMEM;
1613 if (gap_end >= low_limit && gap_end - gap_start >= length)
1614 goto found;
1615
1616 /* Visit right subtree if it looks promising */
1617 if (vma->vm_rb.rb_right) {
1618 struct vm_area_struct *right =
1619 rb_entry(vma->vm_rb.rb_right,
1620 struct vm_area_struct, vm_rb);
1621 if (right->rb_subtree_gap >= length) {
1622 vma = right;
1623 continue;
1624 }
1625 }
1626
1627 /* Go back up the rbtree to find next candidate node */
1628 while (true) {
1629 struct rb_node *prev = &vma->vm_rb;
1630 if (!rb_parent(prev))
1631 goto check_highest;
1632 vma = rb_entry(rb_parent(prev),
1633 struct vm_area_struct, vm_rb);
1634 if (prev == vma->vm_rb.rb_left) {
1635 gap_start = vma->vm_prev->vm_end;
1636 gap_end = vma->vm_start;
1637 goto check_current;
1638 }
1639 }
1640 }
1641
1642check_highest:
1643 /* Check highest gap, which does not precede any rbtree node */
1644 gap_start = mm->highest_vm_end;
1645 gap_end = ULONG_MAX; /* Only for VM_BUG_ON below */
1646 if (gap_start > high_limit)
1647 return -ENOMEM;
1648
1649found:
1650 /* We found a suitable gap. Clip it with the original low_limit. */
1651 if (gap_start < info->low_limit)
1652 gap_start = info->low_limit;
1653
1654 /* Adjust gap address to the desired alignment */
1655 gap_start += (info->align_offset - gap_start) & info->align_mask;
1656
1657 VM_BUG_ON(gap_start + info->length > info->high_limit);
1658 VM_BUG_ON(gap_start + info->length > gap_end);
1659 return gap_start;
1660}
1661
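unmapped_area() above walks the augmented rbtree, pruning any subtree whose cached rb_subtree_gap is too small, and stops at the lowest gap that satisfies the limits and alignment. The sketch below does the equivalent linear search over a sorted array of mappings; it drops the tree pruning (the whole point of rb_subtree_gap) but shows the limit clipping and alignment adjustment, with illustrative names.

#include <stdio.h>

#define NO_FIT ((unsigned long)-1)

struct mapping { unsigned long start, end; };   /* [start, end), sorted */

/* Lowest-address fit for `len` bytes within [low, high); align_mask is 2^n-1. */
static unsigned long find_gap(const struct mapping *m, int n, unsigned long len,
			      unsigned long low, unsigned long high,
			      unsigned long align_mask)
{
	unsigned long gap_start = 0, gap_end;
	int i;

	for (i = 0; i <= n; i++) {
		gap_end = (i < n) ? m[i].start : high;     /* gap before m[i], or the top gap */
		if (gap_start < low)
			gap_start = low;                   /* clip with the low limit */
		gap_start += (0 - gap_start) & align_mask; /* round up to the alignment */
		if (gap_start + len <= gap_end && gap_start + len <= high)
			return gap_start;
		if (i < n)
			gap_start = m[i].end;              /* next gap starts after m[i] */
	}
	return NO_FIT;
}

int main(void)
{
	struct mapping maps[] = { { 0x10000, 0x20000 }, { 0x22000, 0x30000 } };

	/* 0x3000 bytes, 4 KiB aligned, anywhere in [0x10000, 0x40000). */
	printf("gap at %#lx\n",
	       find_gap(maps, 2, 0x3000, 0x10000, 0x40000, 0xfff));
	return 0;
}

The topdown variant that follows, unmapped_area_topdown(), is the mirror image of the same search: it prunes with the same cached values but prefers the highest gap at or below the high limit.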
1662unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info)
1663{
1664 struct mm_struct *mm = current->mm;
1665 struct vm_area_struct *vma;
1666 unsigned long length, low_limit, high_limit, gap_start, gap_end;
1667
1668 /* Adjust search length to account for worst case alignment overhead */
1669 length = info->length + info->align_mask;
1670 if (length < info->length)
1671 return -ENOMEM;
1672
1673 /*
1674 * Adjust search limits by the desired length.
1675 * See implementation comment at top of unmapped_area().
1676 */
1677 gap_end = info->high_limit;
1678 if (gap_end < length)
1679 return -ENOMEM;
1680 high_limit = gap_end - length;
1681
1682 if (info->low_limit > high_limit)
1683 return -ENOMEM;
1684 low_limit = info->low_limit + length;
1685
1686 /* Check highest gap, which does not precede any rbtree node */
1687 gap_start = mm->highest_vm_end;
1688 if (gap_start <= high_limit)
1689 goto found_highest;
1690
1691 /* Check if rbtree root looks promising */
1692 if (RB_EMPTY_ROOT(&mm->mm_rb))
1693 return -ENOMEM;
1694 vma = rb_entry(mm->mm_rb.rb_node, struct vm_area_struct, vm_rb);
1695 if (vma->rb_subtree_gap < length)
1696 return -ENOMEM;
1697
1698 while (true) {
1699 /* Visit right subtree if it looks promising */
1700 gap_start = vma->vm_prev ? vma->vm_prev->vm_end : 0;
1701 if (gap_start <= high_limit && vma->vm_rb.rb_right) {
1702 struct vm_area_struct *right =
1703 rb_entry(vma->vm_rb.rb_right,
1704 struct vm_area_struct, vm_rb);
1705 if (right->rb_subtree_gap >= length) {
1706 vma = right;
1707 continue;
1708 }
1709 }
1710
1711check_current:
1712 /* Check if current node has a suitable gap */
1713 gap_end = vma->vm_start;
1714 if (gap_end < low_limit)
1715 return -ENOMEM;
1716 if (gap_start <= high_limit && gap_end - gap_start >= length)
1717 goto found;
1718
1719 /* Visit left subtree if it looks promising */
1720 if (vma->vm_rb.rb_left) {
1721 struct vm_area_struct *left =
1722 rb_entry(vma->vm_rb.rb_left,
1723 struct vm_area_struct, vm_rb);
1724 if (left->rb_subtree_gap >= length) {
1725 vma = left;
1726 continue;
1727 }
1728 }
1729
1730 /* Go back up the rbtree to find next candidate node */
1731 while (true) {
1732 struct rb_node *prev = &vma->vm_rb;
1733 if (!rb_parent(prev))
1734 return -ENOMEM;
1735 vma = rb_entry(rb_parent(prev),
1736 struct vm_area_struct, vm_rb);
1737 if (prev == vma->vm_rb.rb_right) {
1738 gap_start = vma->vm_prev ?
1739 vma->vm_prev->vm_end : 0;
1740 goto check_current;
1741 }
1742 }
1743 }
1744
1745found:
1746 /* We found a suitable gap. Clip it with the original high_limit. */
1747 if (gap_end > info->high_limit)
1748 gap_end = info->high_limit;
1749
1750found_highest:
1751 /* Compute highest gap address at the desired alignment */
1752 gap_end -= info->length;
1753 gap_end -= (gap_end - info->align_offset) & info->align_mask;
1754
1755 VM_BUG_ON(gap_end < info->low_limit);
1756 VM_BUG_ON(gap_end < gap_start);
1757 return gap_end;
1758}
1759
1403/* Get an address range which is currently unmapped. 1760/* Get an address range which is currently unmapped.
1404 * For shmat() with addr=0. 1761 * For shmat() with addr=0.
1405 * 1762 *
@@ -1418,7 +1775,7 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
1418{ 1775{
1419 struct mm_struct *mm = current->mm; 1776 struct mm_struct *mm = current->mm;
1420 struct vm_area_struct *vma; 1777 struct vm_area_struct *vma;
1421 unsigned long start_addr; 1778 struct vm_unmapped_area_info info;
1422 1779
1423 if (len > TASK_SIZE) 1780 if (len > TASK_SIZE)
1424 return -ENOMEM; 1781 return -ENOMEM;
@@ -1433,40 +1790,13 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
1433 (!vma || addr + len <= vma->vm_start)) 1790 (!vma || addr + len <= vma->vm_start))
1434 return addr; 1791 return addr;
1435 } 1792 }
1436 if (len > mm->cached_hole_size) {
1437 start_addr = addr = mm->free_area_cache;
1438 } else {
1439 start_addr = addr = TASK_UNMAPPED_BASE;
1440 mm->cached_hole_size = 0;
1441 }
1442 1793
1443full_search: 1794 info.flags = 0;
1444 for (vma = find_vma(mm, addr); ; vma = vma->vm_next) { 1795 info.length = len;
1445 /* At this point: (!vma || addr < vma->vm_end). */ 1796 info.low_limit = TASK_UNMAPPED_BASE;
1446 if (TASK_SIZE - len < addr) { 1797 info.high_limit = TASK_SIZE;
1447 /* 1798 info.align_mask = 0;
1448 * Start a new search - just in case we missed 1799 return vm_unmapped_area(&info);
1449 * some holes.
1450 */
1451 if (start_addr != TASK_UNMAPPED_BASE) {
1452 addr = TASK_UNMAPPED_BASE;
1453 start_addr = addr;
1454 mm->cached_hole_size = 0;
1455 goto full_search;
1456 }
1457 return -ENOMEM;
1458 }
1459 if (!vma || addr + len <= vma->vm_start) {
1460 /*
1461 * Remember the place where we stopped the search:
1462 */
1463 mm->free_area_cache = addr + len;
1464 return addr;
1465 }
1466 if (addr + mm->cached_hole_size < vma->vm_start)
1467 mm->cached_hole_size = vma->vm_start - addr;
1468 addr = vma->vm_end;
1469 }
1470} 1800}
1471#endif 1801#endif
1472 1802
@@ -1491,7 +1821,8 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
1491{ 1821{
1492 struct vm_area_struct *vma; 1822 struct vm_area_struct *vma;
1493 struct mm_struct *mm = current->mm; 1823 struct mm_struct *mm = current->mm;
1494 unsigned long addr = addr0, start_addr; 1824 unsigned long addr = addr0;
1825 struct vm_unmapped_area_info info;
1495 1826
1496 /* requested length too big for entire address space */ 1827 /* requested length too big for entire address space */
1497 if (len > TASK_SIZE) 1828 if (len > TASK_SIZE)
@@ -1509,53 +1840,12 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
1509 return addr; 1840 return addr;
1510 } 1841 }
1511 1842
1512 /* check if free_area_cache is useful for us */ 1843 info.flags = VM_UNMAPPED_AREA_TOPDOWN;
1513 if (len <= mm->cached_hole_size) { 1844 info.length = len;
1514 mm->cached_hole_size = 0; 1845 info.low_limit = PAGE_SIZE;
1515 mm->free_area_cache = mm->mmap_base; 1846 info.high_limit = mm->mmap_base;
1516 } 1847 info.align_mask = 0;
1517 1848 addr = vm_unmapped_area(&info);
1518try_again:
1519 /* either no address requested or can't fit in requested address hole */
1520 start_addr = addr = mm->free_area_cache;
1521
1522 if (addr < len)
1523 goto fail;
1524
1525 addr -= len;
1526 do {
1527 /*
1528 * Lookup failure means no vma is above this address,
1529 * else if new region fits below vma->vm_start,
1530 * return with success:
1531 */
1532 vma = find_vma(mm, addr);
1533 if (!vma || addr+len <= vma->vm_start)
1534 /* remember the address as a hint for next time */
1535 return (mm->free_area_cache = addr);
1536
1537 /* remember the largest hole we saw so far */
1538 if (addr + mm->cached_hole_size < vma->vm_start)
1539 mm->cached_hole_size = vma->vm_start - addr;
1540
1541 /* try just below the current vma->vm_start */
1542 addr = vma->vm_start-len;
1543 } while (len < vma->vm_start);
1544
1545fail:
1546 /*
1547 * if hint left us with no space for the requested
1548 * mapping then try again:
1549 *
1550 * Note: this is different with the case of bottomup
1551 * which does the fully line-search, but we use find_vma
1552 * here that causes some holes skipped.
1553 */
1554 if (start_addr != mm->mmap_base) {
1555 mm->free_area_cache = mm->mmap_base;
1556 mm->cached_hole_size = 0;
1557 goto try_again;
1558 }
1559 1849
1560 /* 1850 /*
1561 * A failed mmap() very likely causes application failure, 1851 * A failed mmap() very likely causes application failure,
@@ -1563,14 +1853,13 @@ fail:
1563 * can happen with large stack limits and large mmap() 1853 * can happen with large stack limits and large mmap()
1564 * allocations. 1854 * allocations.
1565 */ 1855 */
1566 mm->cached_hole_size = ~0UL; 1856 if (addr & ~PAGE_MASK) {
1567 mm->free_area_cache = TASK_UNMAPPED_BASE; 1857 VM_BUG_ON(addr != -ENOMEM);
1568 addr = arch_get_unmapped_area(filp, addr0, len, pgoff, flags); 1858 info.flags = 0;
1569 /* 1859 info.low_limit = TASK_UNMAPPED_BASE;
1570 * Restore the topdown base: 1860 info.high_limit = TASK_SIZE;
1571 */ 1861 addr = vm_unmapped_area(&info);
1572 mm->free_area_cache = mm->mmap_base; 1862 }
1573 mm->cached_hole_size = ~0UL;
1574 1863
1575 return addr; 1864 return addr;
1576} 1865}
@@ -1780,9 +2069,27 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
1780 if (vma->vm_pgoff + (size >> PAGE_SHIFT) >= vma->vm_pgoff) { 2069 if (vma->vm_pgoff + (size >> PAGE_SHIFT) >= vma->vm_pgoff) {
1781 error = acct_stack_growth(vma, size, grow); 2070 error = acct_stack_growth(vma, size, grow);
1782 if (!error) { 2071 if (!error) {
2072 /*
2073 * vma_gap_update() doesn't support concurrent
2074 * updates, but we only hold a shared mmap_sem
2075 * lock here, so we need to protect against
2076 * concurrent vma expansions.
2077 * vma_lock_anon_vma() doesn't help here, as
2078 * we don't guarantee that all growable vmas
2079 * in a mm share the same root anon vma.
2080 * So, we reuse mm->page_table_lock to guard
2081 * against concurrent vma expansions.
2082 */
2083 spin_lock(&vma->vm_mm->page_table_lock);
1783 anon_vma_interval_tree_pre_update_vma(vma); 2084 anon_vma_interval_tree_pre_update_vma(vma);
1784 vma->vm_end = address; 2085 vma->vm_end = address;
1785 anon_vma_interval_tree_post_update_vma(vma); 2086 anon_vma_interval_tree_post_update_vma(vma);
2087 if (vma->vm_next)
2088 vma_gap_update(vma->vm_next);
2089 else
2090 vma->vm_mm->highest_vm_end = address;
2091 spin_unlock(&vma->vm_mm->page_table_lock);
2092
1786 perf_event_mmap(vma); 2093 perf_event_mmap(vma);
1787 } 2094 }
1788 } 2095 }
@@ -1833,10 +2140,25 @@ int expand_downwards(struct vm_area_struct *vma,
1833 if (grow <= vma->vm_pgoff) { 2140 if (grow <= vma->vm_pgoff) {
1834 error = acct_stack_growth(vma, size, grow); 2141 error = acct_stack_growth(vma, size, grow);
1835 if (!error) { 2142 if (!error) {
2143 /*
2144 * vma_gap_update() doesn't support concurrent
2145 * updates, but we only hold a shared mmap_sem
2146 * lock here, so we need to protect against
2147 * concurrent vma expansions.
2148 * vma_lock_anon_vma() doesn't help here, as
2149 * we don't guarantee that all growable vmas
2150 * in a mm share the same root anon vma.
2151 * So, we reuse mm->page_table_lock to guard
2152 * against concurrent vma expansions.
2153 */
2154 spin_lock(&vma->vm_mm->page_table_lock);
1836 anon_vma_interval_tree_pre_update_vma(vma); 2155 anon_vma_interval_tree_pre_update_vma(vma);
1837 vma->vm_start = address; 2156 vma->vm_start = address;
1838 vma->vm_pgoff -= grow; 2157 vma->vm_pgoff -= grow;
1839 anon_vma_interval_tree_post_update_vma(vma); 2158 anon_vma_interval_tree_post_update_vma(vma);
2159 vma_gap_update(vma);
2160 spin_unlock(&vma->vm_mm->page_table_lock);
2161
1840 perf_event_mmap(vma); 2162 perf_event_mmap(vma);
1841 } 2163 }
1842 } 2164 }
@@ -1959,14 +2281,17 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma,
1959 insertion_point = (prev ? &prev->vm_next : &mm->mmap); 2281 insertion_point = (prev ? &prev->vm_next : &mm->mmap);
1960 vma->vm_prev = NULL; 2282 vma->vm_prev = NULL;
1961 do { 2283 do {
1962 rb_erase(&vma->vm_rb, &mm->mm_rb); 2284 vma_rb_erase(vma, &mm->mm_rb);
1963 mm->map_count--; 2285 mm->map_count--;
1964 tail_vma = vma; 2286 tail_vma = vma;
1965 vma = vma->vm_next; 2287 vma = vma->vm_next;
1966 } while (vma && vma->vm_start < end); 2288 } while (vma && vma->vm_start < end);
1967 *insertion_point = vma; 2289 *insertion_point = vma;
1968 if (vma) 2290 if (vma) {
1969 vma->vm_prev = prev; 2291 vma->vm_prev = prev;
2292 vma_gap_update(vma);
2293 } else
2294 mm->highest_vm_end = prev ? prev->vm_end : 0;
1970 tail_vma->vm_next = NULL; 2295 tail_vma->vm_next = NULL;
1971 if (mm->unmap_area == arch_unmap_area) 2296 if (mm->unmap_area == arch_unmap_area)
1972 addr = prev ? prev->vm_end : mm->mmap_base; 2297 addr = prev ? prev->vm_end : mm->mmap_base;
@@ -2561,15 +2886,15 @@ static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma)
2561 * The LSB of head.next can't change from under us 2886 * The LSB of head.next can't change from under us
2562 * because we hold the mm_all_locks_mutex. 2887 * because we hold the mm_all_locks_mutex.
2563 */ 2888 */
2564 mutex_lock_nest_lock(&anon_vma->root->mutex, &mm->mmap_sem); 2889 down_write_nest_lock(&anon_vma->root->rwsem, &mm->mmap_sem);
2565 /* 2890 /*
2566 * We can safely modify head.next after taking the 2891 * We can safely modify head.next after taking the
2567 * anon_vma->root->mutex. If some other vma in this mm shares 2892 * anon_vma->root->rwsem. If some other vma in this mm shares
2568 * the same anon_vma we won't take it again. 2893 * the same anon_vma we won't take it again.
2569 * 2894 *
2570 * No need of atomic instructions here, head.next 2895 * No need of atomic instructions here, head.next
2571 * can't change from under us thanks to the 2896 * can't change from under us thanks to the
2572 * anon_vma->root->mutex. 2897 * anon_vma->root->rwsem.
2573 */ 2898 */
2574 if (__test_and_set_bit(0, (unsigned long *) 2899 if (__test_and_set_bit(0, (unsigned long *)
2575 &anon_vma->root->rb_root.rb_node)) 2900 &anon_vma->root->rb_root.rb_node))
@@ -2618,7 +2943,7 @@ static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping)
2618 * vma in this mm is backed by the same anon_vma or address_space. 2943 * vma in this mm is backed by the same anon_vma or address_space.
2619 * 2944 *
2620 * We can take all the locks in random order because the VM code 2945 * We can take all the locks in random order because the VM code
2621 * taking i_mmap_mutex or anon_vma->mutex outside the mmap_sem never 2946 * taking i_mmap_mutex or anon_vma->rwsem outside the mmap_sem never
2622 * takes more than one of them in a row. Secondly we're protected 2947 * takes more than one of them in a row. Secondly we're protected
2623 * against a concurrent mm_take_all_locks() by the mm_all_locks_mutex. 2948 * against a concurrent mm_take_all_locks() by the mm_all_locks_mutex.
2624 * 2949 *
@@ -2671,7 +2996,7 @@ static void vm_unlock_anon_vma(struct anon_vma *anon_vma)
2671 * 2996 *
2672 * No need of atomic instructions here, head.next 2997 * No need of atomic instructions here, head.next
2673 * can't change from under us until we release the 2998 * can't change from under us until we release the
2674 * anon_vma->root->mutex. 2999 * anon_vma->root->rwsem.
2675 */ 3000 */
2676 if (!__test_and_clear_bit(0, (unsigned long *) 3001 if (!__test_and_clear_bit(0, (unsigned long *)
2677 &anon_vma->root->rb_root.rb_node)) 3002 &anon_vma->root->rb_root.rb_node))
diff --git a/mm/mprotect.c b/mm/mprotect.c
index a40992610ab6..94722a4d6b43 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -35,12 +35,16 @@ static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot)
35} 35}
36#endif 36#endif
37 37
38static void change_pte_range(struct mm_struct *mm, pmd_t *pmd, 38static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
39 unsigned long addr, unsigned long end, pgprot_t newprot, 39 unsigned long addr, unsigned long end, pgprot_t newprot,
40 int dirty_accountable) 40 int dirty_accountable, int prot_numa, bool *ret_all_same_node)
41{ 41{
42 struct mm_struct *mm = vma->vm_mm;
42 pte_t *pte, oldpte; 43 pte_t *pte, oldpte;
43 spinlock_t *ptl; 44 spinlock_t *ptl;
45 unsigned long pages = 0;
46 bool all_same_node = true;
47 int last_nid = -1;
44 48
45 pte = pte_offset_map_lock(mm, pmd, addr, &ptl); 49 pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
46 arch_enter_lazy_mmu_mode(); 50 arch_enter_lazy_mmu_mode();
@@ -48,17 +52,43 @@ static void change_pte_range(struct mm_struct *mm, pmd_t *pmd,
48 oldpte = *pte; 52 oldpte = *pte;
49 if (pte_present(oldpte)) { 53 if (pte_present(oldpte)) {
50 pte_t ptent; 54 pte_t ptent;
55 bool updated = false;
51 56
52 ptent = ptep_modify_prot_start(mm, addr, pte); 57 ptent = ptep_modify_prot_start(mm, addr, pte);
53 ptent = pte_modify(ptent, newprot); 58 if (!prot_numa) {
59 ptent = pte_modify(ptent, newprot);
60 updated = true;
61 } else {
62 struct page *page;
63
64 page = vm_normal_page(vma, addr, oldpte);
65 if (page) {
66 int this_nid = page_to_nid(page);
67 if (last_nid == -1)
68 last_nid = this_nid;
69 if (last_nid != this_nid)
70 all_same_node = false;
71
72 /* only check non-shared pages */
73 if (!pte_numa(oldpte) &&
74 page_mapcount(page) == 1) {
75 ptent = pte_mknuma(ptent);
76 updated = true;
77 }
78 }
79 }
54 80
55 /* 81 /*
56 * Avoid taking write faults for pages we know to be 82 * Avoid taking write faults for pages we know to be
57 * dirty. 83 * dirty.
58 */ 84 */
59 if (dirty_accountable && pte_dirty(ptent)) 85 if (dirty_accountable && pte_dirty(ptent)) {
60 ptent = pte_mkwrite(ptent); 86 ptent = pte_mkwrite(ptent);
87 updated = true;
88 }
61 89
90 if (updated)
91 pages++;
62 ptep_modify_prot_commit(mm, addr, pte, ptent); 92 ptep_modify_prot_commit(mm, addr, pte, ptent);
63 } else if (IS_ENABLED(CONFIG_MIGRATION) && !pte_file(oldpte)) { 93 } else if (IS_ENABLED(CONFIG_MIGRATION) && !pte_file(oldpte)) {
64 swp_entry_t entry = pte_to_swp_entry(oldpte); 94 swp_entry_t entry = pte_to_swp_entry(oldpte);
@@ -72,61 +102,101 @@ static void change_pte_range(struct mm_struct *mm, pmd_t *pmd,
72 set_pte_at(mm, addr, pte, 102 set_pte_at(mm, addr, pte,
73 swp_entry_to_pte(entry)); 103 swp_entry_to_pte(entry));
74 } 104 }
105 pages++;
75 } 106 }
76 } while (pte++, addr += PAGE_SIZE, addr != end); 107 } while (pte++, addr += PAGE_SIZE, addr != end);
77 arch_leave_lazy_mmu_mode(); 108 arch_leave_lazy_mmu_mode();
78 pte_unmap_unlock(pte - 1, ptl); 109 pte_unmap_unlock(pte - 1, ptl);
110
111 *ret_all_same_node = all_same_node;
112 return pages;
79} 113}
80 114
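With prot_numa set, change_pte_range() above does not rewrite protections; it walks the PTEs, marks private, not-yet-NUMA pages with pte_mknuma(), and reports both how many entries it touched and whether every page it saw lives on one node, so the caller can mark the whole PMD in one go. A compact model of that scan over an array of fake entries, with invented field names:

#include <stdbool.h>
#include <stdio.h>

/* Invented stand-in for one PTE's worth of state. */
struct fake_pte {
	int nid;          /* node the backing page lives on */
	int mapcount;     /* number of mappings of that page */
	bool numa_hint;   /* already marked for NUMA hinting faults? */
};

/* Returns the number of entries updated; *all_same_node tells the caller
 * whether one pmd_mknuma() could have covered the whole range instead. */
static unsigned long scan_range(struct fake_pte *pte, int n, bool *all_same_node)
{
	unsigned long updated = 0;
	int last_nid = -1, i;

	*all_same_node = true;
	for (i = 0; i < n; i++) {
		if (last_nid == -1)
			last_nid = pte[i].nid;
		if (pte[i].nid != last_nid)
			*all_same_node = false;

		/* Only mark non-shared pages that aren't already marked. */
		if (!pte[i].numa_hint && pte[i].mapcount == 1) {
			pte[i].numa_hint = true;
			updated++;
		}
	}
	return updated;
}

int main(void)
{
	struct fake_pte range[] = {
		{ 0, 1, false }, { 0, 2, false }, { 1, 1, false },
	};
	bool same;
	unsigned long n = scan_range(range, 3, &same);

	printf("updated=%lu all_same_node=%d\n", n, same);  /* 2, 0 */
	return 0;
}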
81static inline void change_pmd_range(struct vm_area_struct *vma, pud_t *pud, 115#ifdef CONFIG_NUMA_BALANCING
82 unsigned long addr, unsigned long end, pgprot_t newprot, 116static inline void change_pmd_protnuma(struct mm_struct *mm, unsigned long addr,
83 int dirty_accountable) 117 pmd_t *pmd)
118{
119 spin_lock(&mm->page_table_lock);
120 set_pmd_at(mm, addr & PMD_MASK, pmd, pmd_mknuma(*pmd));
121 spin_unlock(&mm->page_table_lock);
122}
123#else
124static inline void change_pmd_protnuma(struct mm_struct *mm, unsigned long addr,
125 pmd_t *pmd)
126{
127 BUG();
128}
129#endif /* CONFIG_NUMA_BALANCING */
130
131static inline unsigned long change_pmd_range(struct vm_area_struct *vma,
132 pud_t *pud, unsigned long addr, unsigned long end,
133 pgprot_t newprot, int dirty_accountable, int prot_numa)
84{ 134{
85 pmd_t *pmd; 135 pmd_t *pmd;
86 unsigned long next; 136 unsigned long next;
137 unsigned long pages = 0;
138 bool all_same_node;
87 139
88 pmd = pmd_offset(pud, addr); 140 pmd = pmd_offset(pud, addr);
89 do { 141 do {
90 next = pmd_addr_end(addr, end); 142 next = pmd_addr_end(addr, end);
91 if (pmd_trans_huge(*pmd)) { 143 if (pmd_trans_huge(*pmd)) {
92 if (next - addr != HPAGE_PMD_SIZE) 144 if (next - addr != HPAGE_PMD_SIZE)
93 split_huge_page_pmd(vma->vm_mm, pmd); 145 split_huge_page_pmd(vma, addr, pmd);
94 else if (change_huge_pmd(vma, pmd, addr, newprot)) 146 else if (change_huge_pmd(vma, pmd, addr, newprot,
147 prot_numa)) {
148 pages += HPAGE_PMD_NR;
95 continue; 149 continue;
150 }
96 /* fall through */ 151 /* fall through */
97 } 152 }
98 if (pmd_none_or_clear_bad(pmd)) 153 if (pmd_none_or_clear_bad(pmd))
99 continue; 154 continue;
100 change_pte_range(vma->vm_mm, pmd, addr, next, newprot, 155 pages += change_pte_range(vma, pmd, addr, next, newprot,
101 dirty_accountable); 156 dirty_accountable, prot_numa, &all_same_node);
157
158 /*
159 * If we are changing protections for NUMA hinting faults then
160 * set pmd_numa if the examined pages were all on the same
161 * node. This allows a regular PMD to be handled as one fault
162 * and effectively batches the taking of the PTL
163 */
164 if (prot_numa && all_same_node)
165 change_pmd_protnuma(vma->vm_mm, addr, pmd);
102 } while (pmd++, addr = next, addr != end); 166 } while (pmd++, addr = next, addr != end);
167
168 return pages;
103} 169}
104 170
105static inline void change_pud_range(struct vm_area_struct *vma, pgd_t *pgd, 171static inline unsigned long change_pud_range(struct vm_area_struct *vma,
106 unsigned long addr, unsigned long end, pgprot_t newprot, 172 pgd_t *pgd, unsigned long addr, unsigned long end,
107 int dirty_accountable) 173 pgprot_t newprot, int dirty_accountable, int prot_numa)
108{ 174{
109 pud_t *pud; 175 pud_t *pud;
110 unsigned long next; 176 unsigned long next;
177 unsigned long pages = 0;
111 178
112 pud = pud_offset(pgd, addr); 179 pud = pud_offset(pgd, addr);
113 do { 180 do {
114 next = pud_addr_end(addr, end); 181 next = pud_addr_end(addr, end);
115 if (pud_none_or_clear_bad(pud)) 182 if (pud_none_or_clear_bad(pud))
116 continue; 183 continue;
117 change_pmd_range(vma, pud, addr, next, newprot, 184 pages += change_pmd_range(vma, pud, addr, next, newprot,
118 dirty_accountable); 185 dirty_accountable, prot_numa);
119 } while (pud++, addr = next, addr != end); 186 } while (pud++, addr = next, addr != end);
187
188 return pages;
120} 189}
121 190
122static void change_protection(struct vm_area_struct *vma, 191static unsigned long change_protection_range(struct vm_area_struct *vma,
123 unsigned long addr, unsigned long end, pgprot_t newprot, 192 unsigned long addr, unsigned long end, pgprot_t newprot,
124 int dirty_accountable) 193 int dirty_accountable, int prot_numa)
125{ 194{
126 struct mm_struct *mm = vma->vm_mm; 195 struct mm_struct *mm = vma->vm_mm;
127 pgd_t *pgd; 196 pgd_t *pgd;
128 unsigned long next; 197 unsigned long next;
129 unsigned long start = addr; 198 unsigned long start = addr;
199 unsigned long pages = 0;
130 200
131 BUG_ON(addr >= end); 201 BUG_ON(addr >= end);
132 pgd = pgd_offset(mm, addr); 202 pgd = pgd_offset(mm, addr);
@@ -135,10 +205,32 @@ static void change_protection(struct vm_area_struct *vma,
135 next = pgd_addr_end(addr, end); 205 next = pgd_addr_end(addr, end);
136 if (pgd_none_or_clear_bad(pgd)) 206 if (pgd_none_or_clear_bad(pgd))
137 continue; 207 continue;
138 change_pud_range(vma, pgd, addr, next, newprot, 208 pages += change_pud_range(vma, pgd, addr, next, newprot,
139 dirty_accountable); 209 dirty_accountable, prot_numa);
140 } while (pgd++, addr = next, addr != end); 210 } while (pgd++, addr = next, addr != end);
141 flush_tlb_range(vma, start, end); 211
212 /* Only flush the TLB if we actually modified any entries: */
213 if (pages)
214 flush_tlb_range(vma, start, end);
215
216 return pages;
217}
218
219unsigned long change_protection(struct vm_area_struct *vma, unsigned long start,
220 unsigned long end, pgprot_t newprot,
221 int dirty_accountable, int prot_numa)
222{
223 struct mm_struct *mm = vma->vm_mm;
224 unsigned long pages;
225
226 mmu_notifier_invalidate_range_start(mm, start, end);
227 if (is_vm_hugetlb_page(vma))
228 pages = hugetlb_change_protection(vma, start, end, newprot);
229 else
230 pages = change_protection_range(vma, start, end, newprot, dirty_accountable, prot_numa);
231 mmu_notifier_invalidate_range_end(mm, start, end);
232
233 return pages;
142} 234}
143 235
144int 236int
@@ -213,12 +305,9 @@ success:
213 dirty_accountable = 1; 305 dirty_accountable = 1;
214 } 306 }
215 307
216 mmu_notifier_invalidate_range_start(mm, start, end); 308 change_protection(vma, start, end, vma->vm_page_prot,
217 if (is_vm_hugetlb_page(vma)) 309 dirty_accountable, 0);
218 hugetlb_change_protection(vma, start, end, vma->vm_page_prot); 310
219 else
220 change_protection(vma, start, end, vma->vm_page_prot, dirty_accountable);
221 mmu_notifier_invalidate_range_end(mm, start, end);
222 vm_stat_account(mm, oldflags, vma->vm_file, -nrpages); 311 vm_stat_account(mm, oldflags, vma->vm_file, -nrpages);
223 vm_stat_account(mm, newflags, vma->vm_file, nrpages); 312 vm_stat_account(mm, newflags, vma->vm_file, nrpages);
224 perf_event_mmap(vma); 313 perf_event_mmap(vma);
@@ -274,8 +363,7 @@ SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len,
274 error = -EINVAL; 363 error = -EINVAL;
275 if (!(vma->vm_flags & VM_GROWSDOWN)) 364 if (!(vma->vm_flags & VM_GROWSDOWN))
276 goto out; 365 goto out;
277 } 366 } else {
278 else {
279 if (vma->vm_start > start) 367 if (vma->vm_start > start)
280 goto out; 368 goto out;
281 if (unlikely(grows & PROT_GROWSUP)) { 369 if (unlikely(grows & PROT_GROWSUP)) {
@@ -291,9 +379,10 @@ SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len,
291 for (nstart = start ; ; ) { 379 for (nstart = start ; ; ) {
292 unsigned long newflags; 380 unsigned long newflags;
293 381
294 /* Here we know that vma->vm_start <= nstart < vma->vm_end. */ 382 /* Here we know that vma->vm_start <= nstart < vma->vm_end. */
295 383
296 newflags = vm_flags | (vma->vm_flags & ~(VM_READ | VM_WRITE | VM_EXEC)); 384 newflags = vm_flags;
385 newflags |= (vma->vm_flags & ~(VM_READ | VM_WRITE | VM_EXEC));
297 386
298 /* newflags >> 4 shift VM_MAY% in place of VM_% */ 387 /* newflags >> 4 shift VM_MAY% in place of VM_% */
299 if ((newflags & ~(newflags >> 4)) & (VM_READ | VM_WRITE | VM_EXEC)) { 388 if ((newflags & ~(newflags >> 4)) & (VM_READ | VM_WRITE | VM_EXEC)) {
diff --git a/mm/mremap.c b/mm/mremap.c
index 1b61c2d3307a..e1031e1f6a61 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -104,7 +104,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
104 } 104 }
105 if (vma->anon_vma) { 105 if (vma->anon_vma) {
106 anon_vma = vma->anon_vma; 106 anon_vma = vma->anon_vma;
107 anon_vma_lock(anon_vma); 107 anon_vma_lock_write(anon_vma);
108 } 108 }
109 } 109 }
110 110
@@ -182,7 +182,7 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
182 need_flush = true; 182 need_flush = true;
183 continue; 183 continue;
184 } else if (!err) { 184 } else if (!err) {
185 split_huge_page_pmd(vma->vm_mm, old_pmd); 185 split_huge_page_pmd(vma, old_addr, old_pmd);
186 } 186 }
187 VM_BUG_ON(pmd_trans_huge(*old_pmd)); 187 VM_BUG_ON(pmd_trans_huge(*old_pmd));
188 } 188 }
diff --git a/mm/nobootmem.c b/mm/nobootmem.c
index bd82f6b31411..b8294fc03df8 100644
--- a/mm/nobootmem.c
+++ b/mm/nobootmem.c
@@ -137,6 +137,22 @@ unsigned long __init free_low_memory_core_early(int nodeid)
137 return count; 137 return count;
138} 138}
139 139
140static void reset_node_lowmem_managed_pages(pg_data_t *pgdat)
141{
142 struct zone *z;
143
144 /*
145 * In free_area_init_core(), highmem zone's managed_pages is set to
146 * present_pages, and bootmem allocator doesn't allocate from highmem
147 * zones. So there's no need to recalculate managed_pages because all
148 * highmem pages will be managed by the buddy system. Here highmem
149 * zone also includes highmem movable zone.
150 */
151 for (z = pgdat->node_zones; z < pgdat->node_zones + MAX_NR_ZONES; z++)
152 if (!is_highmem(z))
153 z->managed_pages = 0;
154}
155
140/** 156/**
141 * free_all_bootmem_node - release a node's free pages to the buddy allocator 157 * free_all_bootmem_node - release a node's free pages to the buddy allocator
142 * @pgdat: node to be released 158 * @pgdat: node to be released
@@ -146,6 +162,7 @@ unsigned long __init free_low_memory_core_early(int nodeid)
146unsigned long __init free_all_bootmem_node(pg_data_t *pgdat) 162unsigned long __init free_all_bootmem_node(pg_data_t *pgdat)
147{ 163{
148 register_page_bootmem_info_node(pgdat); 164 register_page_bootmem_info_node(pgdat);
165 reset_node_lowmem_managed_pages(pgdat);
149 166
150 /* free_low_memory_core_early(MAX_NUMNODES) will be called later */ 167 /* free_low_memory_core_early(MAX_NUMNODES) will be called later */
151 return 0; 168 return 0;
@@ -158,6 +175,11 @@ unsigned long __init free_all_bootmem_node(pg_data_t *pgdat)
158 */ 175 */
159unsigned long __init free_all_bootmem(void) 176unsigned long __init free_all_bootmem(void)
160{ 177{
178 struct pglist_data *pgdat;
179
180 for_each_online_pgdat(pgdat)
181 reset_node_lowmem_managed_pages(pgdat);
182
161 /* 183 /*
162 * We need to use MAX_NUMNODES instead of NODE_DATA(0)->node_id 184 * We need to use MAX_NUMNODES instead of NODE_DATA(0)->node_id
163 * because in some case like Node0 doesn't have RAM installed 185 * because in some case like Node0 doesn't have RAM installed
diff --git a/mm/nommu.c b/mm/nommu.c
index 45131b41bcdb..79c3cac87afa 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -66,6 +66,21 @@ int heap_stack_gap = 0;
66 66
67atomic_long_t mmap_pages_allocated; 67atomic_long_t mmap_pages_allocated;
68 68
69/*
70 * The global memory commitment made in the system can be a metric
71 * that can be used to drive ballooning decisions when Linux is hosted
72 * as a guest. On Hyper-V, the host implements a policy engine for dynamically
73 * balancing memory across competing virtual machines that are hosted.
74 * Several metrics drive this policy engine including the guest reported
75 * memory commitment.
76 */
77unsigned long vm_memory_committed(void)
78{
79 return percpu_counter_read_positive(&vm_committed_as);
80}
81
82EXPORT_SYMBOL_GPL(vm_memory_committed);
83
69EXPORT_SYMBOL(mem_map); 84EXPORT_SYMBOL(mem_map);
70EXPORT_SYMBOL(num_physpages); 85EXPORT_SYMBOL(num_physpages);
71 86
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 79e0f3e24831..0399f146ae49 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -44,48 +44,6 @@ int sysctl_oom_kill_allocating_task;
44int sysctl_oom_dump_tasks = 1; 44int sysctl_oom_dump_tasks = 1;
45static DEFINE_SPINLOCK(zone_scan_lock); 45static DEFINE_SPINLOCK(zone_scan_lock);
46 46
47/*
48 * compare_swap_oom_score_adj() - compare and swap current's oom_score_adj
49 * @old_val: old oom_score_adj for compare
50 * @new_val: new oom_score_adj for swap
51 *
52 * Sets the oom_score_adj value for current to @new_val iff its present value is
53 * @old_val. Usually used to reinstate a previous value to prevent racing with
54 * userspace tuning the value in the interim.
55 */
56void compare_swap_oom_score_adj(int old_val, int new_val)
57{
58 struct sighand_struct *sighand = current->sighand;
59
60 spin_lock_irq(&sighand->siglock);
61 if (current->signal->oom_score_adj == old_val)
62 current->signal->oom_score_adj = new_val;
63 trace_oom_score_adj_update(current);
64 spin_unlock_irq(&sighand->siglock);
65}
66
67/**
68 * test_set_oom_score_adj() - set current's oom_score_adj and return old value
69 * @new_val: new oom_score_adj value
70 *
71 * Sets the oom_score_adj value for current to @new_val with proper
72 * synchronization and returns the old value. Usually used to temporarily
73 * set a value, save the old value in the caller, and then reinstate it later.
74 */
75int test_set_oom_score_adj(int new_val)
76{
77 struct sighand_struct *sighand = current->sighand;
78 int old_val;
79
80 spin_lock_irq(&sighand->siglock);
81 old_val = current->signal->oom_score_adj;
82 current->signal->oom_score_adj = new_val;
83 trace_oom_score_adj_update(current);
84 spin_unlock_irq(&sighand->siglock);
85
86 return old_val;
87}
88
89#ifdef CONFIG_NUMA 47#ifdef CONFIG_NUMA
90/** 48/**
91 * has_intersects_mems_allowed() - check task eligibility for kill 49 * has_intersects_mems_allowed() - check task eligibility for kill
@@ -193,7 +151,7 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
193 if (!p) 151 if (!p)
194 return 0; 152 return 0;
195 153
196 adj = p->signal->oom_score_adj; 154 adj = (long)p->signal->oom_score_adj;
197 if (adj == OOM_SCORE_ADJ_MIN) { 155 if (adj == OOM_SCORE_ADJ_MIN) {
198 task_unlock(p); 156 task_unlock(p);
199 return 0; 157 return 0;
@@ -257,7 +215,7 @@ static enum oom_constraint constrained_alloc(struct zonelist *zonelist,
257 * the page allocator means a mempolicy is in effect. Cpuset policy 215 * the page allocator means a mempolicy is in effect. Cpuset policy
258 * is enforced in get_page_from_freelist(). 216 * is enforced in get_page_from_freelist().
259 */ 217 */
260 if (nodemask && !nodes_subset(node_states[N_HIGH_MEMORY], *nodemask)) { 218 if (nodemask && !nodes_subset(node_states[N_MEMORY], *nodemask)) {
261 *totalpages = total_swap_pages; 219 *totalpages = total_swap_pages;
262 for_each_node_mask(nid, *nodemask) 220 for_each_node_mask(nid, *nodemask)
263 *totalpages += node_spanned_pages(nid); 221 *totalpages += node_spanned_pages(nid);
@@ -310,26 +268,20 @@ enum oom_scan_t oom_scan_process_thread(struct task_struct *task,
310 if (!task->mm) 268 if (!task->mm)
311 return OOM_SCAN_CONTINUE; 269 return OOM_SCAN_CONTINUE;
312 270
313 if (task->flags & PF_EXITING) { 271 /*
272 * If task is allocating a lot of memory and has been marked to be
273 * killed first if it triggers an oom, then select it.
274 */
275 if (oom_task_origin(task))
276 return OOM_SCAN_SELECT;
277
278 if (task->flags & PF_EXITING && !force_kill) {
314 /* 279 /*
315 * If task is current and is in the process of releasing memory, 280 * If this task is not being ptraced on exit, then wait for it
316 * allow the "kill" to set TIF_MEMDIE, which will allow it to 281 * to finish before killing some other task unnecessarily.
317 * access memory reserves. Otherwise, it may stall forever.
318 *
319 * The iteration isn't broken here, however, in case other
320 * threads are found to have already been oom killed.
321 */ 282 */
322 if (task == current) 283 if (!(task->group_leader->ptrace & PT_TRACE_EXIT))
323 return OOM_SCAN_SELECT; 284 return OOM_SCAN_ABORT;
324 else if (!force_kill) {
325 /*
326 * If this task is not being ptraced on exit, then wait
327 * for it to finish before killing some other task
328 * unnecessarily.
329 */
330 if (!(task->group_leader->ptrace & PT_TRACE_EXIT))
331 return OOM_SCAN_ABORT;
332 }
333 } 285 }
334 return OOM_SCAN_OK; 286 return OOM_SCAN_OK;
335} 287}
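The reworked oom_scan_process_thread() above boils down to a small policy: a task flagged as the origin of the allocation pressure is selected outright, and an exiting task that is not ptraced on exit makes the scan abort and wait rather than kill something else. A hypothetical decision function capturing just that ordering (the enum values mirror the kernel's names; the task structure and its fields are stand-ins):

#include <stdbool.h>
#include <stdio.h>

enum oom_scan_t { OOM_SCAN_OK, OOM_SCAN_CONTINUE, OOM_SCAN_ABORT, OOM_SCAN_SELECT };

/* Stand-in for the handful of task fields the scan looks at. */
struct fake_task {
	bool has_mm;           /* still owns an address space */
	bool oom_origin;       /* oom_task_origin(): marked "kill me first" */
	bool exiting;          /* PF_EXITING */
	bool ptraced_on_exit;  /* PT_TRACE_EXIT on the group leader */
};

static enum oom_scan_t scan(const struct fake_task *t, bool force_kill)
{
	if (!t->has_mm)
		return OOM_SCAN_CONTINUE;          /* nothing to reclaim here */
	if (t->oom_origin)
		return OOM_SCAN_SELECT;            /* volunteered itself */
	if (t->exiting && !force_kill && !t->ptraced_on_exit)
		return OOM_SCAN_ABORT;             /* let it finish dying */
	return OOM_SCAN_OK;                        /* score it normally */
}

int main(void)
{
	struct fake_task dying = { true, false, true, false };
	struct fake_task origin = { true, true, false, false };

	printf("dying:  %d\n", scan(&dying, false));   /* OOM_SCAN_ABORT */
	printf("origin: %d\n", scan(&origin, false));  /* OOM_SCAN_SELECT */
	return 0;
}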
@@ -412,7 +364,7 @@ static void dump_tasks(const struct mem_cgroup *memcg, const nodemask_t *nodemas
412 continue; 364 continue;
413 } 365 }
414 366
415 pr_info("[%5d] %5d %5d %8lu %8lu %7lu %8lu %5d %s\n", 367 pr_info("[%5d] %5d %5d %8lu %8lu %7lu %8lu %5hd %s\n",
416 task->pid, from_kuid(&init_user_ns, task_uid(task)), 368 task->pid, from_kuid(&init_user_ns, task_uid(task)),
417 task->tgid, task->mm->total_vm, get_mm_rss(task->mm), 369 task->tgid, task->mm->total_vm, get_mm_rss(task->mm),
418 task->mm->nr_ptes, 370 task->mm->nr_ptes,
@@ -428,7 +380,7 @@ static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order,
428{ 380{
429 task_lock(current); 381 task_lock(current);
430 pr_warning("%s invoked oom-killer: gfp_mask=0x%x, order=%d, " 382 pr_warning("%s invoked oom-killer: gfp_mask=0x%x, order=%d, "
431 "oom_score_adj=%d\n", 383 "oom_score_adj=%hd\n",
432 current->comm, gfp_mask, order, 384 current->comm, gfp_mask, order,
433 current->signal->oom_score_adj); 385 current->signal->oom_score_adj);
434 cpuset_print_task_mems_allowed(current); 386 cpuset_print_task_mems_allowed(current);
@@ -639,43 +591,6 @@ void clear_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_mask)
639 spin_unlock(&zone_scan_lock); 591 spin_unlock(&zone_scan_lock);
640} 592}
641 593
642/*
643 * Try to acquire the oom killer lock for all system zones. Returns zero if a
644 * parallel oom killing is taking place, otherwise locks all zones and returns
645 * non-zero.
646 */
647static int try_set_system_oom(void)
648{
649 struct zone *zone;
650 int ret = 1;
651
652 spin_lock(&zone_scan_lock);
653 for_each_populated_zone(zone)
654 if (zone_is_oom_locked(zone)) {
655 ret = 0;
656 goto out;
657 }
658 for_each_populated_zone(zone)
659 zone_set_flag(zone, ZONE_OOM_LOCKED);
660out:
661 spin_unlock(&zone_scan_lock);
662 return ret;
663}
664
665/*
666 * Clears ZONE_OOM_LOCKED for all system zones so that failed allocation
667 * attempts or page faults may now recall the oom killer, if necessary.
668 */
669static void clear_system_oom(void)
670{
671 struct zone *zone;
672
673 spin_lock(&zone_scan_lock);
674 for_each_populated_zone(zone)
675 zone_clear_flag(zone, ZONE_OOM_LOCKED);
676 spin_unlock(&zone_scan_lock);
677}
678
679/** 594/**
680 * out_of_memory - kill the "best" process when we run out of memory 595 * out_of_memory - kill the "best" process when we run out of memory
681 * @zonelist: zonelist pointer 596 * @zonelist: zonelist pointer
@@ -706,11 +621,11 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
706 return; 621 return;
707 622
708 /* 623 /*
709 * If current has a pending SIGKILL, then automatically select it. The 624 * If current has a pending SIGKILL or is exiting, then automatically
710 * goal is to allow it to allocate so that it may quickly exit and free 625 * select it. The goal is to allow it to allocate so that it may
711 * its memory. 626 * quickly exit and free its memory.
712 */ 627 */
713 if (fatal_signal_pending(current)) { 628 if (fatal_signal_pending(current) || current->flags & PF_EXITING) {
714 set_thread_flag(TIF_MEMDIE); 629 set_thread_flag(TIF_MEMDIE);
715 return; 630 return;
716 } 631 }
@@ -756,15 +671,16 @@ out:
756 671
757/* 672/*
758 * The pagefault handler calls here because it is out of memory, so kill a 673 * The pagefault handler calls here because it is out of memory, so kill a
759 * memory-hogging task. If a populated zone has ZONE_OOM_LOCKED set, a parallel 674 * memory-hogging task. If any populated zone has ZONE_OOM_LOCKED set, a
760 * oom killing is already in progress so do nothing. If a task is found with 675 * parallel oom killing is already in progress so do nothing.
761 * TIF_MEMDIE set, it has been killed so do nothing and allow it to exit.
762 */ 676 */
763void pagefault_out_of_memory(void) 677void pagefault_out_of_memory(void)
764{ 678{
765 if (try_set_system_oom()) { 679 struct zonelist *zonelist = node_zonelist(first_online_node,
680 GFP_KERNEL);
681
682 if (try_set_zonelist_oom(zonelist, GFP_KERNEL)) {
766 out_of_memory(NULL, 0, 0, NULL, false); 683 out_of_memory(NULL, 0, 0, NULL, false);
767 clear_system_oom(); 684 clear_zonelist_oom(zonelist, GFP_KERNEL);
768 } 685 }
769 schedule_timeout_killable(1);
770} 686}
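
The hunk above removes the system-wide try_set_system_oom()/clear_system_oom() pair and has pagefault_out_of_memory() reuse the per-zonelist serialization instead. The underlying idea is unchanged: take one spinlock, back off if any zone is already flagged ZONE_OOM_LOCKED, otherwise flag them all. Below is a minimal standalone sketch of that flag-under-a-lock pattern, with a pthread mutex standing in for zone_scan_lock and a plain bool array standing in for the per-zone flag; the names are illustrative, not the kernel's.

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

#define NR_ZONES 4

/* Illustrative stand-ins for zone->flags / ZONE_OOM_LOCKED. */
static bool zone_oom_locked[NR_ZONES];
static pthread_mutex_t zone_scan_lock = PTHREAD_MUTEX_INITIALIZER;

/* Returns false if another OOM kill already owns one of the zones. */
static bool try_set_oom(void)
{
	bool ret = true;
	int i;

	pthread_mutex_lock(&zone_scan_lock);
	for (i = 0; i < NR_ZONES; i++)
		if (zone_oom_locked[i]) {
			ret = false;
			goto out;
		}
	for (i = 0; i < NR_ZONES; i++)
		zone_oom_locked[i] = true;
out:
	pthread_mutex_unlock(&zone_scan_lock);
	return ret;
}

static void clear_oom(void)
{
	int i;

	pthread_mutex_lock(&zone_scan_lock);
	for (i = 0; i < NR_ZONES; i++)
		zone_oom_locked[i] = false;
	pthread_mutex_unlock(&zone_scan_lock);
}

int main(void)
{
	if (try_set_oom()) {
		printf("got the OOM lock, would kill a task here\n");
		clear_oom();
	} else {
		printf("parallel OOM kill in progress, backing off\n");
	}
	return 0;
}

Compile with -pthread. The zonelist variant kept by the patch walks only the zones of one zonelist rather than every populated zone in the system.
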
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 830893b2b3c7..0713bfbf0954 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -201,6 +201,18 @@ static unsigned long highmem_dirtyable_memory(unsigned long total)
201 zone_reclaimable_pages(z) - z->dirty_balance_reserve; 201 zone_reclaimable_pages(z) - z->dirty_balance_reserve;
202 } 202 }
203 /* 203 /*
204 * Unreclaimable memory (kernel memory or anonymous memory
205 * without swap) can bring down the dirtyable pages below
206 * the zone's dirty balance reserve and the above calculation
207 * will underflow. However we still want to add in nodes
208 * which are below threshold (negative values) to get a more
209 * accurate calculation but make sure that the total never
210 * underflows.
211 */
212 if ((long)x < 0)
213 x = 0;
214
215 /*
204 * Make sure that the number of highmem pages is never larger 216 * Make sure that the number of highmem pages is never larger
205 * than the number of the total dirtyable memory. This can only 217 * than the number of the total dirtyable memory. This can only
206 * occur in very strange VM situations but we want to make sure 218 * occur in very strange VM situations but we want to make sure
@@ -222,8 +234,8 @@ static unsigned long global_dirtyable_memory(void)
222{ 234{
223 unsigned long x; 235 unsigned long x;
224 236
225 x = global_page_state(NR_FREE_PAGES) + global_reclaimable_pages() - 237 x = global_page_state(NR_FREE_PAGES) + global_reclaimable_pages();
226 dirty_balance_reserve; 238 x -= min(x, dirty_balance_reserve);
227 239
228 if (!vm_highmem_is_dirtyable) 240 if (!vm_highmem_is_dirtyable)
229 x -= highmem_dirtyable_memory(x); 241 x -= highmem_dirtyable_memory(x);
@@ -290,9 +302,12 @@ static unsigned long zone_dirtyable_memory(struct zone *zone)
290 * highmem zone can hold its share of dirty pages, so we don't 302 * highmem zone can hold its share of dirty pages, so we don't
291 * care about vm_highmem_is_dirtyable here. 303 * care about vm_highmem_is_dirtyable here.
292 */ 304 */
293 return zone_page_state(zone, NR_FREE_PAGES) + 305 unsigned long nr_pages = zone_page_state(zone, NR_FREE_PAGES) +
294 zone_reclaimable_pages(zone) - 306 zone_reclaimable_pages(zone);
295 zone->dirty_balance_reserve; 307
308 /* don't allow this to underflow */
309 nr_pages -= min(nr_pages, zone->dirty_balance_reserve);
310 return nr_pages;
296} 311}
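
Both writeback hunks above replace a plain unsigned subtraction with "x -= min(x, reserve)" so that a dirty_balance_reserve larger than the reclaimable total clamps to zero instead of wrapping around. A tiny standalone demonstration of the difference (the page counts are made up):

#include <stdio.h>

#define min(a, b) ((a) < (b) ? (a) : (b))

int main(void)
{
	unsigned long free_pages = 100;	/* hypothetical free + reclaimable */
	unsigned long reserve = 150;	/* reserve larger than what is free */

	unsigned long wrapped = free_pages - reserve;			/* underflows to a huge value */
	unsigned long clamped = free_pages - min(free_pages, reserve);	/* clamps to 0 */

	printf("naive:   %lu\n", wrapped);
	printf("clamped: %lu\n", clamped);
	return 0;
}
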
297 312
298/** 313/**
@@ -1069,7 +1084,7 @@ static void bdi_update_bandwidth(struct backing_dev_info *bdi,
1069} 1084}
1070 1085
1071/* 1086/*
1072 * After a task dirtied this many pages, balance_dirty_pages_ratelimited_nr() 1087 * After a task dirtied this many pages, balance_dirty_pages_ratelimited()
1073 * will look to see if it needs to start dirty throttling. 1088 * will look to see if it needs to start dirty throttling.
1074 * 1089 *
1075 * If dirty_poll_interval is too low, big NUMA machines will call the expensive 1090 * If dirty_poll_interval is too low, big NUMA machines will call the expensive
@@ -1436,9 +1451,8 @@ static DEFINE_PER_CPU(int, bdp_ratelimits);
1436DEFINE_PER_CPU(int, dirty_throttle_leaks) = 0; 1451DEFINE_PER_CPU(int, dirty_throttle_leaks) = 0;
1437 1452
1438/** 1453/**
1439 * balance_dirty_pages_ratelimited_nr - balance dirty memory state 1454 * balance_dirty_pages_ratelimited - balance dirty memory state
1440 * @mapping: address_space which was dirtied 1455 * @mapping: address_space which was dirtied
1441 * @nr_pages_dirtied: number of pages which the caller has just dirtied
1442 * 1456 *
1443 * Processes which are dirtying memory should call in here once for each page 1457 * Processes which are dirtying memory should call in here once for each page
1444 * which was newly dirtied. The function will periodically check the system's 1458 * which was newly dirtied. The function will periodically check the system's
@@ -1449,8 +1463,7 @@ DEFINE_PER_CPU(int, dirty_throttle_leaks) = 0;
1449 * limit we decrease the ratelimiting by a lot, to prevent individual processes 1463 * limit we decrease the ratelimiting by a lot, to prevent individual processes
1450 * from overshooting the limit by (ratelimit_pages) each. 1464 * from overshooting the limit by (ratelimit_pages) each.
1451 */ 1465 */
1452void balance_dirty_pages_ratelimited_nr(struct address_space *mapping, 1466void balance_dirty_pages_ratelimited(struct address_space *mapping)
1453 unsigned long nr_pages_dirtied)
1454{ 1467{
1455 struct backing_dev_info *bdi = mapping->backing_dev_info; 1468 struct backing_dev_info *bdi = mapping->backing_dev_info;
1456 int ratelimit; 1469 int ratelimit;
@@ -1484,6 +1497,7 @@ void balance_dirty_pages_ratelimited_nr(struct address_space *mapping,
1484 */ 1497 */
1485 p = &__get_cpu_var(dirty_throttle_leaks); 1498 p = &__get_cpu_var(dirty_throttle_leaks);
1486 if (*p > 0 && current->nr_dirtied < ratelimit) { 1499 if (*p > 0 && current->nr_dirtied < ratelimit) {
1500 unsigned long nr_pages_dirtied;
1487 nr_pages_dirtied = min(*p, ratelimit - current->nr_dirtied); 1501 nr_pages_dirtied = min(*p, ratelimit - current->nr_dirtied);
1488 *p -= nr_pages_dirtied; 1502 *p -= nr_pages_dirtied;
1489 current->nr_dirtied += nr_pages_dirtied; 1503 current->nr_dirtied += nr_pages_dirtied;
@@ -1493,7 +1507,7 @@ void balance_dirty_pages_ratelimited_nr(struct address_space *mapping,
1493 if (unlikely(current->nr_dirtied >= ratelimit)) 1507 if (unlikely(current->nr_dirtied >= ratelimit))
1494 balance_dirty_pages(mapping, current->nr_dirtied); 1508 balance_dirty_pages(mapping, current->nr_dirtied);
1495} 1509}
1496EXPORT_SYMBOL(balance_dirty_pages_ratelimited_nr); 1510EXPORT_SYMBOL(balance_dirty_pages_ratelimited);
1497 1511
1498void throttle_vm_writeout(gfp_t gfp_mask) 1512void throttle_vm_writeout(gfp_t gfp_mask)
1499{ 1513{
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 7e208f0ad68c..9673d96b1ba7 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -90,6 +90,9 @@ nodemask_t node_states[NR_NODE_STATES] __read_mostly = {
90#ifdef CONFIG_HIGHMEM 90#ifdef CONFIG_HIGHMEM
91 [N_HIGH_MEMORY] = { { [0] = 1UL } }, 91 [N_HIGH_MEMORY] = { { [0] = 1UL } },
92#endif 92#endif
93#ifdef CONFIG_MOVABLE_NODE
94 [N_MEMORY] = { { [0] = 1UL } },
95#endif
93 [N_CPU] = { { [0] = 1UL } }, 96 [N_CPU] = { { [0] = 1UL } },
94#endif /* NUMA */ 97#endif /* NUMA */
95}; 98};
@@ -218,11 +221,6 @@ EXPORT_SYMBOL(nr_online_nodes);
218 221
219int page_group_by_mobility_disabled __read_mostly; 222int page_group_by_mobility_disabled __read_mostly;
220 223
221/*
222 * NOTE:
223 * Don't use set_pageblock_migratetype(page, MIGRATE_ISOLATE) directly.
224 * Instead, use {un}set_pageblock_isolate.
225 */
226void set_pageblock_migratetype(struct page *page, int migratetype) 224void set_pageblock_migratetype(struct page *page, int migratetype)
227{ 225{
228 226
@@ -368,8 +366,7 @@ static int destroy_compound_page(struct page *page, unsigned long order)
368 int nr_pages = 1 << order; 366 int nr_pages = 1 << order;
369 int bad = 0; 367 int bad = 0;
370 368
371 if (unlikely(compound_order(page) != order) || 369 if (unlikely(compound_order(page) != order)) {
372 unlikely(!PageHead(page))) {
373 bad_page(page); 370 bad_page(page);
374 bad++; 371 bad++;
375 } 372 }
@@ -523,7 +520,7 @@ static inline int page_is_buddy(struct page *page, struct page *buddy,
523 * If a block is freed, and its buddy is also free, then this 520 * If a block is freed, and its buddy is also free, then this
524 * triggers coalescing into a block of larger size. 521 * triggers coalescing into a block of larger size.
525 * 522 *
526 * -- wli 523 * -- nyc
527 */ 524 */
528 525
529static inline void __free_one_page(struct page *page, 526static inline void __free_one_page(struct page *page,
@@ -608,6 +605,7 @@ static inline int free_pages_check(struct page *page)
608 bad_page(page); 605 bad_page(page);
609 return 1; 606 return 1;
610 } 607 }
608 reset_page_last_nid(page);
611 if (page->flags & PAGE_FLAGS_CHECK_AT_PREP) 609 if (page->flags & PAGE_FLAGS_CHECK_AT_PREP)
612 page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; 610 page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
613 return 0; 611 return 0;
@@ -667,11 +665,13 @@ static void free_pcppages_bulk(struct zone *zone, int count,
667 /* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */ 665 /* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */
668 __free_one_page(page, zone, 0, mt); 666 __free_one_page(page, zone, 0, mt);
669 trace_mm_page_pcpu_drain(page, 0, mt); 667 trace_mm_page_pcpu_drain(page, 0, mt);
670 if (is_migrate_cma(mt)) 668 if (likely(get_pageblock_migratetype(page) != MIGRATE_ISOLATE)) {
671 __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, 1); 669 __mod_zone_page_state(zone, NR_FREE_PAGES, 1);
670 if (is_migrate_cma(mt))
671 __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, 1);
672 }
672 } while (--to_free && --batch_free && !list_empty(list)); 673 } while (--to_free && --batch_free && !list_empty(list));
673 } 674 }
674 __mod_zone_page_state(zone, NR_FREE_PAGES, count);
675 spin_unlock(&zone->lock); 675 spin_unlock(&zone->lock);
676} 676}
677 677
@@ -730,6 +730,13 @@ static void __free_pages_ok(struct page *page, unsigned int order)
730 local_irq_restore(flags); 730 local_irq_restore(flags);
731} 731}
732 732
733/*
734 * Read access to zone->managed_pages is safe because it's unsigned long,
735 * but we still need to serialize writers. Currently all callers of
 736 * __free_pages_bootmem() except put_page_bootmem() run only at boot
 737 * time. So for shorter boot time, we shift the burden to
738 * put_page_bootmem() to serialize writers.
739 */
733void __meminit __free_pages_bootmem(struct page *page, unsigned int order) 740void __meminit __free_pages_bootmem(struct page *page, unsigned int order)
734{ 741{
735 unsigned int nr_pages = 1 << order; 742 unsigned int nr_pages = 1 << order;
@@ -745,6 +752,7 @@ void __meminit __free_pages_bootmem(struct page *page, unsigned int order)
745 set_page_count(p, 0); 752 set_page_count(p, 0);
746 } 753 }
747 754
755 page_zone(page)->managed_pages += 1 << order;
748 set_page_refcounted(page); 756 set_page_refcounted(page);
749 __free_pages(page, order); 757 __free_pages(page, order);
750} 758}
@@ -765,6 +773,10 @@ void __init init_cma_reserved_pageblock(struct page *page)
765 set_pageblock_migratetype(page, MIGRATE_CMA); 773 set_pageblock_migratetype(page, MIGRATE_CMA);
766 __free_pages(page, pageblock_order); 774 __free_pages(page, pageblock_order);
767 totalram_pages += pageblock_nr_pages; 775 totalram_pages += pageblock_nr_pages;
776#ifdef CONFIG_HIGHMEM
777 if (PageHighMem(page))
778 totalhigh_pages += pageblock_nr_pages;
779#endif
768} 780}
769#endif 781#endif
770 782
@@ -780,7 +792,7 @@ void __init init_cma_reserved_pageblock(struct page *page)
780 * large block of memory acted on by a series of small allocations. 792 * large block of memory acted on by a series of small allocations.
781 * This behavior is a critical factor in sglist merging's success. 793 * This behavior is a critical factor in sglist merging's success.
782 * 794 *
783 * -- wli 795 * -- nyc
784 */ 796 */
785static inline void expand(struct zone *zone, struct page *page, 797static inline void expand(struct zone *zone, struct page *page,
786 int low, int high, struct free_area *area, 798 int low, int high, struct free_area *area,
@@ -1376,14 +1388,8 @@ void split_page(struct page *page, unsigned int order)
1376 set_page_refcounted(page + i); 1388 set_page_refcounted(page + i);
1377} 1389}
1378 1390
1379/* 1391static int __isolate_free_page(struct page *page, unsigned int order)
1380 * Similar to the split_page family of functions except that the page
1381 * required at the given order and being isolated now to prevent races
1382 * with parallel allocators
1383 */
1384int capture_free_page(struct page *page, int alloc_order, int migratetype)
1385{ 1392{
1386 unsigned int order;
1387 unsigned long watermark; 1393 unsigned long watermark;
1388 struct zone *zone; 1394 struct zone *zone;
1389 int mt; 1395 int mt;
@@ -1391,27 +1397,23 @@ int capture_free_page(struct page *page, int alloc_order, int migratetype)
1391 BUG_ON(!PageBuddy(page)); 1397 BUG_ON(!PageBuddy(page));
1392 1398
1393 zone = page_zone(page); 1399 zone = page_zone(page);
1394 order = page_order(page); 1400 mt = get_pageblock_migratetype(page);
1395 1401
1396 /* Obey watermarks as if the page was being allocated */ 1402 if (mt != MIGRATE_ISOLATE) {
1397 watermark = low_wmark_pages(zone) + (1 << order); 1403 /* Obey watermarks as if the page was being allocated */
1398 if (!zone_watermark_ok(zone, 0, watermark, 0, 0)) 1404 watermark = low_wmark_pages(zone) + (1 << order);
1399 return 0; 1405 if (!zone_watermark_ok(zone, 0, watermark, 0, 0))
1406 return 0;
1407
1408 __mod_zone_freepage_state(zone, -(1UL << order), mt);
1409 }
1400 1410
1401 /* Remove page from free list */ 1411 /* Remove page from free list */
1402 list_del(&page->lru); 1412 list_del(&page->lru);
1403 zone->free_area[order].nr_free--; 1413 zone->free_area[order].nr_free--;
1404 rmv_page_order(page); 1414 rmv_page_order(page);
1405 1415
1406 mt = get_pageblock_migratetype(page); 1416 /* Set the pageblock if the isolated page is at least a pageblock */
1407 if (unlikely(mt != MIGRATE_ISOLATE))
1408 __mod_zone_freepage_state(zone, -(1UL << alloc_order), mt);
1409
1410 if (alloc_order != order)
1411 expand(zone, page, alloc_order, order,
1412 &zone->free_area[order], migratetype);
1413
1414 /* Set the pageblock if the captured page is at least a pageblock */
1415 if (order >= pageblock_order - 1) { 1417 if (order >= pageblock_order - 1) {
1416 struct page *endpage = page + (1 << order) - 1; 1418 struct page *endpage = page + (1 << order) - 1;
1417 for (; page < endpage; page += pageblock_nr_pages) { 1419 for (; page < endpage; page += pageblock_nr_pages) {
@@ -1422,7 +1424,7 @@ int capture_free_page(struct page *page, int alloc_order, int migratetype)
1422 } 1424 }
1423 } 1425 }
1424 1426
1425 return 1UL << alloc_order; 1427 return 1UL << order;
1426} 1428}
1427 1429
1428/* 1430/*
@@ -1440,10 +1442,9 @@ int split_free_page(struct page *page)
1440 unsigned int order; 1442 unsigned int order;
1441 int nr_pages; 1443 int nr_pages;
1442 1444
1443 BUG_ON(!PageBuddy(page));
1444 order = page_order(page); 1445 order = page_order(page);
1445 1446
1446 nr_pages = capture_free_page(page, order, 0); 1447 nr_pages = __isolate_free_page(page, order);
1447 if (!nr_pages) 1448 if (!nr_pages)
1448 return 0; 1449 return 0;
1449 1450
@@ -1641,20 +1642,6 @@ static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
1641 return true; 1642 return true;
1642} 1643}
1643 1644
1644#ifdef CONFIG_MEMORY_ISOLATION
1645static inline unsigned long nr_zone_isolate_freepages(struct zone *zone)
1646{
1647 if (unlikely(zone->nr_pageblock_isolate))
1648 return zone->nr_pageblock_isolate * pageblock_nr_pages;
1649 return 0;
1650}
1651#else
1652static inline unsigned long nr_zone_isolate_freepages(struct zone *zone)
1653{
1654 return 0;
1655}
1656#endif
1657
1658bool zone_watermark_ok(struct zone *z, int order, unsigned long mark, 1645bool zone_watermark_ok(struct zone *z, int order, unsigned long mark,
1659 int classzone_idx, int alloc_flags) 1646 int classzone_idx, int alloc_flags)
1660{ 1647{
@@ -1670,14 +1657,6 @@ bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark,
1670 if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark) 1657 if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark)
1671 free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES); 1658 free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES);
1672 1659
1673 /*
1674 * If the zone has MIGRATE_ISOLATE type free pages, we should consider
1675 * it. nr_zone_isolate_freepages is never accurate so kswapd might not
1676 * sleep although it could do so. But this is more desirable for memory
1677 * hotplug than sleeping which can cause a livelock in the direct
1678 * reclaim path.
1679 */
1680 free_pages -= nr_zone_isolate_freepages(z);
1681 return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags, 1660 return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
1682 free_pages); 1661 free_pages);
1683} 1662}
@@ -1692,7 +1671,7 @@ bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark,
1692 * 1671 *
1693 * If the zonelist cache is present in the passed in zonelist, then 1672 * If the zonelist cache is present in the passed in zonelist, then
1694 * returns a pointer to the allowed node mask (either the current 1673 * returns a pointer to the allowed node mask (either the current
1695 * tasks mems_allowed, or node_states[N_HIGH_MEMORY].) 1674 * tasks mems_allowed, or node_states[N_MEMORY].)
1696 * 1675 *
1697 * If the zonelist cache is not available for this zonelist, does 1676 * If the zonelist cache is not available for this zonelist, does
1698 * nothing and returns NULL. 1677 * nothing and returns NULL.
@@ -1721,7 +1700,7 @@ static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags)
1721 1700
1722 allowednodes = !in_interrupt() && (alloc_flags & ALLOC_CPUSET) ? 1701 allowednodes = !in_interrupt() && (alloc_flags & ALLOC_CPUSET) ?
1723 &cpuset_current_mems_allowed : 1702 &cpuset_current_mems_allowed :
1724 &node_states[N_HIGH_MEMORY]; 1703 &node_states[N_MEMORY];
1725 return allowednodes; 1704 return allowednodes;
1726} 1705}
1727 1706
@@ -1871,7 +1850,7 @@ zonelist_scan:
1871 */ 1850 */
1872 for_each_zone_zonelist_nodemask(zone, z, zonelist, 1851 for_each_zone_zonelist_nodemask(zone, z, zonelist,
1873 high_zoneidx, nodemask) { 1852 high_zoneidx, nodemask) {
1874 if (NUMA_BUILD && zlc_active && 1853 if (IS_ENABLED(CONFIG_NUMA) && zlc_active &&
1875 !zlc_zone_worth_trying(zonelist, z, allowednodes)) 1854 !zlc_zone_worth_trying(zonelist, z, allowednodes))
1876 continue; 1855 continue;
1877 if ((alloc_flags & ALLOC_CPUSET) && 1856 if ((alloc_flags & ALLOC_CPUSET) &&
@@ -1917,7 +1896,8 @@ zonelist_scan:
1917 classzone_idx, alloc_flags)) 1896 classzone_idx, alloc_flags))
1918 goto try_this_zone; 1897 goto try_this_zone;
1919 1898
1920 if (NUMA_BUILD && !did_zlc_setup && nr_online_nodes > 1) { 1899 if (IS_ENABLED(CONFIG_NUMA) &&
1900 !did_zlc_setup && nr_online_nodes > 1) {
1921 /* 1901 /*
1922 * we do zlc_setup if there are multiple nodes 1902 * we do zlc_setup if there are multiple nodes
1923 * and before considering the first zone allowed 1903 * and before considering the first zone allowed
@@ -1936,7 +1916,7 @@ zonelist_scan:
1936 * As we may have just activated ZLC, check if the first 1916 * As we may have just activated ZLC, check if the first
1937 * eligible zone has failed zone_reclaim recently. 1917 * eligible zone has failed zone_reclaim recently.
1938 */ 1918 */
1939 if (NUMA_BUILD && zlc_active && 1919 if (IS_ENABLED(CONFIG_NUMA) && zlc_active &&
1940 !zlc_zone_worth_trying(zonelist, z, allowednodes)) 1920 !zlc_zone_worth_trying(zonelist, z, allowednodes))
1941 continue; 1921 continue;
1942 1922
@@ -1962,11 +1942,11 @@ try_this_zone:
1962 if (page) 1942 if (page)
1963 break; 1943 break;
1964this_zone_full: 1944this_zone_full:
1965 if (NUMA_BUILD) 1945 if (IS_ENABLED(CONFIG_NUMA))
1966 zlc_mark_zone_full(zonelist, z); 1946 zlc_mark_zone_full(zonelist, z);
1967 } 1947 }
1968 1948
1969 if (unlikely(NUMA_BUILD && page == NULL && zlc_active)) { 1949 if (unlikely(IS_ENABLED(CONFIG_NUMA) && page == NULL && zlc_active)) {
1970 /* Disable zlc cache for second zonelist scan */ 1950 /* Disable zlc cache for second zonelist scan */
1971 zlc_active = 0; 1951 zlc_active = 0;
1972 goto zonelist_scan; 1952 goto zonelist_scan;
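
The zonelist-scan hunks swap the old NUMA_BUILD constant for IS_ENABLED(CONFIG_NUMA). The gain over an #ifdef block is that disabled branches are still parsed and type-checked, then folded away by the compiler. A simplified illustration follows; the macro below is a stand-in, not the kernel's Kconfig-aware IS_ENABLED():

#include <stdio.h>

/* Stand-in for IS_ENABLED(CONFIG_NUMA); flip to 0 to "disable" NUMA. */
#define NUMA_ENABLED 1

static void zlc_mark_zone_full(int zone)
{
	printf("zone %d marked full in the zonelist cache\n", zone);
}

int main(void)
{
	int zone = 2;

	/*
	 * Unlike an #ifdef block, the call below is always compiled, so it
	 * cannot silently bit-rot when the option is off; the compiler simply
	 * drops the branch when the constant is 0.
	 */
	if (NUMA_ENABLED)
		zlc_mark_zone_full(zone);

	return 0;
}
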
@@ -2148,8 +2128,6 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
2148 bool *contended_compaction, bool *deferred_compaction, 2128 bool *contended_compaction, bool *deferred_compaction,
2149 unsigned long *did_some_progress) 2129 unsigned long *did_some_progress)
2150{ 2130{
2151 struct page *page = NULL;
2152
2153 if (!order) 2131 if (!order)
2154 return NULL; 2132 return NULL;
2155 2133
@@ -2161,16 +2139,12 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
2161 current->flags |= PF_MEMALLOC; 2139 current->flags |= PF_MEMALLOC;
2162 *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask, 2140 *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask,
2163 nodemask, sync_migration, 2141 nodemask, sync_migration,
2164 contended_compaction, &page); 2142 contended_compaction);
2165 current->flags &= ~PF_MEMALLOC; 2143 current->flags &= ~PF_MEMALLOC;
2166 2144
2167 /* If compaction captured a page, prep and use it */
2168 if (page) {
2169 prep_new_page(page, order, gfp_mask);
2170 goto got_page;
2171 }
2172
2173 if (*did_some_progress != COMPACT_SKIPPED) { 2145 if (*did_some_progress != COMPACT_SKIPPED) {
2146 struct page *page;
2147
2174 /* Page migration frees to the PCP lists but we want merging */ 2148 /* Page migration frees to the PCP lists but we want merging */
2175 drain_pages(get_cpu()); 2149 drain_pages(get_cpu());
2176 put_cpu(); 2150 put_cpu();
@@ -2180,7 +2154,6 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
2180 alloc_flags & ~ALLOC_NO_WATERMARKS, 2154 alloc_flags & ~ALLOC_NO_WATERMARKS,
2181 preferred_zone, migratetype); 2155 preferred_zone, migratetype);
2182 if (page) { 2156 if (page) {
2183got_page:
2184 preferred_zone->compact_blockskip_flush = false; 2157 preferred_zone->compact_blockskip_flush = false;
2185 preferred_zone->compact_considered = 0; 2158 preferred_zone->compact_considered = 0;
2186 preferred_zone->compact_defer_shift = 0; 2159 preferred_zone->compact_defer_shift = 0;
@@ -2266,7 +2239,7 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
2266 return NULL; 2239 return NULL;
2267 2240
2268 /* After successful reclaim, reconsider all zones for allocation */ 2241 /* After successful reclaim, reconsider all zones for allocation */
2269 if (NUMA_BUILD) 2242 if (IS_ENABLED(CONFIG_NUMA))
2270 zlc_clear_zones_full(zonelist); 2243 zlc_clear_zones_full(zonelist);
2271 2244
2272retry: 2245retry:
@@ -2412,7 +2385,8 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
2412 * allowed per node queues are empty and that nodes are 2385 * allowed per node queues are empty and that nodes are
2413 * over allocated. 2386 * over allocated.
2414 */ 2387 */
2415 if (NUMA_BUILD && (gfp_mask & GFP_THISNODE) == GFP_THISNODE) 2388 if (IS_ENABLED(CONFIG_NUMA) &&
2389 (gfp_mask & GFP_THISNODE) == GFP_THISNODE)
2416 goto nopage; 2390 goto nopage;
2417 2391
2418restart: 2392restart:
@@ -2596,6 +2570,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
2596 int migratetype = allocflags_to_migratetype(gfp_mask); 2570 int migratetype = allocflags_to_migratetype(gfp_mask);
2597 unsigned int cpuset_mems_cookie; 2571 unsigned int cpuset_mems_cookie;
2598 int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET; 2572 int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET;
2573 struct mem_cgroup *memcg = NULL;
2599 2574
2600 gfp_mask &= gfp_allowed_mask; 2575 gfp_mask &= gfp_allowed_mask;
2601 2576
@@ -2614,6 +2589,13 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
2614 if (unlikely(!zonelist->_zonerefs->zone)) 2589 if (unlikely(!zonelist->_zonerefs->zone))
2615 return NULL; 2590 return NULL;
2616 2591
2592 /*
2593 * Will only have any effect when __GFP_KMEMCG is set. This is
2594 * verified in the (always inline) callee
2595 */
2596 if (!memcg_kmem_newpage_charge(gfp_mask, &memcg, order))
2597 return NULL;
2598
2617retry_cpuset: 2599retry_cpuset:
2618 cpuset_mems_cookie = get_mems_allowed(); 2600 cpuset_mems_cookie = get_mems_allowed();
2619 2601
@@ -2649,6 +2631,8 @@ out:
2649 if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page)) 2631 if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
2650 goto retry_cpuset; 2632 goto retry_cpuset;
2651 2633
2634 memcg_kmem_commit_charge(page, memcg, order);
2635
2652 return page; 2636 return page;
2653} 2637}
2654EXPORT_SYMBOL(__alloc_pages_nodemask); 2638EXPORT_SYMBOL(__alloc_pages_nodemask);
@@ -2701,6 +2685,31 @@ void free_pages(unsigned long addr, unsigned int order)
2701 2685
2702EXPORT_SYMBOL(free_pages); 2686EXPORT_SYMBOL(free_pages);
2703 2687
2688/*
2689 * __free_memcg_kmem_pages and free_memcg_kmem_pages will free
2690 * pages allocated with __GFP_KMEMCG.
2691 *
2692 * Those pages are accounted to a particular memcg, embedded in the
2693 * corresponding page_cgroup. To avoid adding a hit in the allocator to search
2694 * for that information only to find out that it is NULL for users who have no
2695 * interest in that whatsoever, we provide these functions.
2696 *
2697 * The caller knows better which flags it relies on.
2698 */
2699void __free_memcg_kmem_pages(struct page *page, unsigned int order)
2700{
2701 memcg_kmem_uncharge_pages(page, order);
2702 __free_pages(page, order);
2703}
2704
2705void free_memcg_kmem_pages(unsigned long addr, unsigned int order)
2706{
2707 if (addr != 0) {
2708 VM_BUG_ON(!virt_addr_valid((void *)addr));
2709 __free_memcg_kmem_pages(virt_to_page((void *)addr), order);
2710 }
2711}
2712
2704static void *make_alloc_exact(unsigned long addr, unsigned order, size_t size) 2713static void *make_alloc_exact(unsigned long addr, unsigned order, size_t size)
2705{ 2714{
2706 if (addr) { 2715 if (addr) {
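
The new __free_memcg_kmem_pages()/free_memcg_kmem_pages() helpers exist so that only callers who allocated with __GFP_KMEMCG pay for the memcg uncharge on free: a charge taken at allocation must be released on the matching free path. A toy standalone sketch of that pairing, with a single counter standing in for the per-memcg kmem charge (all names below are invented for illustration):

#include <stdio.h>
#include <stdlib.h>

static long charged_bytes;	/* stand-in for a per-memcg kmem counter */

static void *alloc_charged(size_t size)
{
	void *p = malloc(size);

	if (p)
		charged_bytes += size;	/* "charge at allocation" */
	return p;
}

/* Must be used only for memory obtained from alloc_charged(). */
static void free_charged(void *p, size_t size)
{
	if (!p)
		return;
	charged_bytes -= size;		/* "uncharge on the matching free" */
	free(p);
}

int main(void)
{
	void *buf = alloc_charged(4096);

	printf("charged after alloc: %ld\n", charged_bytes);
	free_charged(buf, 4096);
	printf("charged after free:  %ld\n", charged_bytes);
	return 0;
}
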
@@ -2819,7 +2828,7 @@ unsigned int nr_free_pagecache_pages(void)
2819 2828
2820static inline void show_node(struct zone *zone) 2829static inline void show_node(struct zone *zone)
2821{ 2830{
2822 if (NUMA_BUILD) 2831 if (IS_ENABLED(CONFIG_NUMA))
2823 printk("Node %d ", zone_to_nid(zone)); 2832 printk("Node %d ", zone_to_nid(zone));
2824} 2833}
2825 2834
@@ -2877,6 +2886,31 @@ out:
2877 2886
2878#define K(x) ((x) << (PAGE_SHIFT-10)) 2887#define K(x) ((x) << (PAGE_SHIFT-10))
2879 2888
2889static void show_migration_types(unsigned char type)
2890{
2891 static const char types[MIGRATE_TYPES] = {
2892 [MIGRATE_UNMOVABLE] = 'U',
2893 [MIGRATE_RECLAIMABLE] = 'E',
2894 [MIGRATE_MOVABLE] = 'M',
2895 [MIGRATE_RESERVE] = 'R',
2896#ifdef CONFIG_CMA
2897 [MIGRATE_CMA] = 'C',
2898#endif
2899 [MIGRATE_ISOLATE] = 'I',
2900 };
2901 char tmp[MIGRATE_TYPES + 1];
2902 char *p = tmp;
2903 int i;
2904
2905 for (i = 0; i < MIGRATE_TYPES; i++) {
2906 if (type & (1 << i))
2907 *p++ = types[i];
2908 }
2909
2910 *p = '\0';
2911 printk("(%s) ", tmp);
2912}
2913
2880/* 2914/*
2881 * Show free area list (used inside shift_scroll-lock stuff) 2915 * Show free area list (used inside shift_scroll-lock stuff)
2882 * We also calculate the percentage fragmentation. We do this by counting the 2916 * We also calculate the percentage fragmentation. We do this by counting the
@@ -2951,6 +2985,7 @@ void show_free_areas(unsigned int filter)
2951 " isolated(anon):%lukB" 2985 " isolated(anon):%lukB"
2952 " isolated(file):%lukB" 2986 " isolated(file):%lukB"
2953 " present:%lukB" 2987 " present:%lukB"
2988 " managed:%lukB"
2954 " mlocked:%lukB" 2989 " mlocked:%lukB"
2955 " dirty:%lukB" 2990 " dirty:%lukB"
2956 " writeback:%lukB" 2991 " writeback:%lukB"
@@ -2980,6 +3015,7 @@ void show_free_areas(unsigned int filter)
2980 K(zone_page_state(zone, NR_ISOLATED_ANON)), 3015 K(zone_page_state(zone, NR_ISOLATED_ANON)),
2981 K(zone_page_state(zone, NR_ISOLATED_FILE)), 3016 K(zone_page_state(zone, NR_ISOLATED_FILE)),
2982 K(zone->present_pages), 3017 K(zone->present_pages),
3018 K(zone->managed_pages),
2983 K(zone_page_state(zone, NR_MLOCK)), 3019 K(zone_page_state(zone, NR_MLOCK)),
2984 K(zone_page_state(zone, NR_FILE_DIRTY)), 3020 K(zone_page_state(zone, NR_FILE_DIRTY)),
2985 K(zone_page_state(zone, NR_WRITEBACK)), 3021 K(zone_page_state(zone, NR_WRITEBACK)),
@@ -3005,6 +3041,7 @@ void show_free_areas(unsigned int filter)
3005 3041
3006 for_each_populated_zone(zone) { 3042 for_each_populated_zone(zone) {
3007 unsigned long nr[MAX_ORDER], flags, order, total = 0; 3043 unsigned long nr[MAX_ORDER], flags, order, total = 0;
3044 unsigned char types[MAX_ORDER];
3008 3045
3009 if (skip_free_areas_node(filter, zone_to_nid(zone))) 3046 if (skip_free_areas_node(filter, zone_to_nid(zone)))
3010 continue; 3047 continue;
@@ -3013,12 +3050,24 @@ void show_free_areas(unsigned int filter)
3013 3050
3014 spin_lock_irqsave(&zone->lock, flags); 3051 spin_lock_irqsave(&zone->lock, flags);
3015 for (order = 0; order < MAX_ORDER; order++) { 3052 for (order = 0; order < MAX_ORDER; order++) {
3016 nr[order] = zone->free_area[order].nr_free; 3053 struct free_area *area = &zone->free_area[order];
3054 int type;
3055
3056 nr[order] = area->nr_free;
3017 total += nr[order] << order; 3057 total += nr[order] << order;
3058
3059 types[order] = 0;
3060 for (type = 0; type < MIGRATE_TYPES; type++) {
3061 if (!list_empty(&area->free_list[type]))
3062 types[order] |= 1 << type;
3063 }
3018 } 3064 }
3019 spin_unlock_irqrestore(&zone->lock, flags); 3065 spin_unlock_irqrestore(&zone->lock, flags);
3020 for (order = 0; order < MAX_ORDER; order++) 3066 for (order = 0; order < MAX_ORDER; order++) {
3021 printk("%lu*%lukB ", nr[order], K(1UL) << order); 3067 printk("%lu*%lukB ", nr[order], K(1UL) << order);
3068 if (nr[order])
3069 show_migration_types(types[order]);
3070 }
3022 printk("= %lukB\n", K(total)); 3071 printk("= %lukB\n", K(total));
3023 } 3072 }
3024 3073
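
show_free_areas() now records, per order, which migrate types still have pages on the free list by OR-ing 1 << type into a byte, and show_migration_types() decodes that byte back into the letter codes added above. A standalone version of the same encode/decode round trip (the sample mask is invented and the CMA type is omitted for brevity):

#include <stdio.h>

enum migratetype {
	MIGRATE_UNMOVABLE,
	MIGRATE_RECLAIMABLE,
	MIGRATE_MOVABLE,
	MIGRATE_RESERVE,
	MIGRATE_ISOLATE,
	MIGRATE_TYPES
};

static void show_migration_types(unsigned char type)
{
	static const char types[MIGRATE_TYPES] = {
		[MIGRATE_UNMOVABLE]	= 'U',
		[MIGRATE_RECLAIMABLE]	= 'E',
		[MIGRATE_MOVABLE]	= 'M',
		[MIGRATE_RESERVE]	= 'R',
		[MIGRATE_ISOLATE]	= 'I',
	};
	char tmp[MIGRATE_TYPES + 1];
	char *p = tmp;

	for (int i = 0; i < MIGRATE_TYPES; i++)
		if (type & (1 << i))
			*p++ = types[i];
	*p = '\0';
	printf("(%s)\n", tmp);
}

int main(void)
{
	unsigned char mask = 0;

	/* Pretend only the unmovable and movable free lists are non-empty. */
	mask |= 1 << MIGRATE_UNMOVABLE;
	mask |= 1 << MIGRATE_MOVABLE;

	show_migration_types(mask);	/* prints "(UM)" */
	return 0;
}
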
@@ -3195,7 +3244,7 @@ static int find_next_best_node(int node, nodemask_t *used_node_mask)
3195 return node; 3244 return node;
3196 } 3245 }
3197 3246
3198 for_each_node_state(n, N_HIGH_MEMORY) { 3247 for_each_node_state(n, N_MEMORY) {
3199 3248
3200 /* Don't want a node to appear more than once */ 3249 /* Don't want a node to appear more than once */
3201 if (node_isset(n, *used_node_mask)) 3250 if (node_isset(n, *used_node_mask))
@@ -3337,7 +3386,7 @@ static int default_zonelist_order(void)
3337 * local memory, NODE_ORDER may be suitable. 3386 * local memory, NODE_ORDER may be suitable.
3338 */ 3387 */
3339 average_size = total_size / 3388 average_size = total_size /
3340 (nodes_weight(node_states[N_HIGH_MEMORY]) + 1); 3389 (nodes_weight(node_states[N_MEMORY]) + 1);
3341 for_each_online_node(nid) { 3390 for_each_online_node(nid) {
3342 low_kmem_size = 0; 3391 low_kmem_size = 0;
3343 total_size = 0; 3392 total_size = 0;
@@ -3827,6 +3876,7 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
3827 mminit_verify_page_links(page, zone, nid, pfn); 3876 mminit_verify_page_links(page, zone, nid, pfn);
3828 init_page_count(page); 3877 init_page_count(page);
3829 reset_page_mapcount(page); 3878 reset_page_mapcount(page);
3879 reset_page_last_nid(page);
3830 SetPageReserved(page); 3880 SetPageReserved(page);
3831 /* 3881 /*
3832 * Mark the block movable so that blocks are reserved for 3882 * Mark the block movable so that blocks are reserved for
@@ -4433,6 +4483,26 @@ void __init set_pageblock_order(void)
4433 4483
4434#endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */ 4484#endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */
4435 4485
4486static unsigned long __paginginit calc_memmap_size(unsigned long spanned_pages,
4487 unsigned long present_pages)
4488{
4489 unsigned long pages = spanned_pages;
4490
4491 /*
4492 * Provide a more accurate estimation if there are holes within
4493 * the zone and SPARSEMEM is in use. If there are holes within the
4494 * zone, each populated memory region may cost us one or two extra
4495 * memmap pages due to alignment because memmap pages for each
 4496	 * populated region may not be naturally aligned on a page boundary.
4497 * So the (present_pages >> 4) heuristic is a tradeoff for that.
4498 */
4499 if (spanned_pages > present_pages + (present_pages >> 4) &&
4500 IS_ENABLED(CONFIG_SPARSEMEM))
4501 pages = present_pages;
4502
4503 return PAGE_ALIGN(pages * sizeof(struct page)) >> PAGE_SHIFT;
4504}
4505
4436/* 4506/*
4437 * Set up the zone data structures: 4507 * Set up the zone data structures:
4438 * - mark all pages reserved 4508 * - mark all pages reserved
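
calc_memmap_size() estimates how many pages a zone's memmap costs: normally from the spanned page count, but when SPARSEMEM is enabled and the span exceeds the present pages by more than present_pages >> 4, the present count is the better estimate. A standalone restatement of the arithmetic with an illustrative struct page size:

#include <stdio.h>

#define PAGE_SHIFT	12
#define PAGE_SIZE	(1UL << PAGE_SHIFT)
#define PAGE_ALIGN(x)	(((x) + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1))
#define STRUCT_PAGE_SIZE 64UL	/* typical sizeof(struct page); illustrative */

static int sparsemem_enabled = 1;	/* stand-in for IS_ENABLED(CONFIG_SPARSEMEM) */

static unsigned long calc_memmap_size(unsigned long spanned_pages,
				      unsigned long present_pages)
{
	unsigned long pages = spanned_pages;

	/* Big holes + SPARSEMEM: present pages are the better estimate. */
	if (spanned_pages > present_pages + (present_pages >> 4) &&
	    sparsemem_enabled)
		pages = present_pages;

	return PAGE_ALIGN(pages * STRUCT_PAGE_SIZE) >> PAGE_SHIFT;
}

int main(void)
{
	/* A zone spanning 1M pages of which only 512K are present. */
	printf("memmap pages: %lu\n", calc_memmap_size(1UL << 20, 1UL << 19));
	return 0;
}

The present_pages >> 4 slack (roughly 6%) keeps small alignment holes from flipping the estimate between the two formulas.
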
@@ -4450,54 +4520,67 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
4450 int ret; 4520 int ret;
4451 4521
4452 pgdat_resize_init(pgdat); 4522 pgdat_resize_init(pgdat);
4523#ifdef CONFIG_NUMA_BALANCING
4524 spin_lock_init(&pgdat->numabalancing_migrate_lock);
4525 pgdat->numabalancing_migrate_nr_pages = 0;
4526 pgdat->numabalancing_migrate_next_window = jiffies;
4527#endif
4453 init_waitqueue_head(&pgdat->kswapd_wait); 4528 init_waitqueue_head(&pgdat->kswapd_wait);
4454 init_waitqueue_head(&pgdat->pfmemalloc_wait); 4529 init_waitqueue_head(&pgdat->pfmemalloc_wait);
4455 pgdat_page_cgroup_init(pgdat); 4530 pgdat_page_cgroup_init(pgdat);
4456 4531
4457 for (j = 0; j < MAX_NR_ZONES; j++) { 4532 for (j = 0; j < MAX_NR_ZONES; j++) {
4458 struct zone *zone = pgdat->node_zones + j; 4533 struct zone *zone = pgdat->node_zones + j;
4459 unsigned long size, realsize, memmap_pages; 4534 unsigned long size, realsize, freesize, memmap_pages;
4460 4535
4461 size = zone_spanned_pages_in_node(nid, j, zones_size); 4536 size = zone_spanned_pages_in_node(nid, j, zones_size);
4462 realsize = size - zone_absent_pages_in_node(nid, j, 4537 realsize = freesize = size - zone_absent_pages_in_node(nid, j,
4463 zholes_size); 4538 zholes_size);
4464 4539
4465 /* 4540 /*
4466 * Adjust realsize so that it accounts for how much memory 4541 * Adjust freesize so that it accounts for how much memory
4467 * is used by this zone for memmap. This affects the watermark 4542 * is used by this zone for memmap. This affects the watermark
4468 * and per-cpu initialisations 4543 * and per-cpu initialisations
4469 */ 4544 */
4470 memmap_pages = 4545 memmap_pages = calc_memmap_size(size, realsize);
4471 PAGE_ALIGN(size * sizeof(struct page)) >> PAGE_SHIFT; 4546 if (freesize >= memmap_pages) {
4472 if (realsize >= memmap_pages) { 4547 freesize -= memmap_pages;
4473 realsize -= memmap_pages;
4474 if (memmap_pages) 4548 if (memmap_pages)
4475 printk(KERN_DEBUG 4549 printk(KERN_DEBUG
4476 " %s zone: %lu pages used for memmap\n", 4550 " %s zone: %lu pages used for memmap\n",
4477 zone_names[j], memmap_pages); 4551 zone_names[j], memmap_pages);
4478 } else 4552 } else
4479 printk(KERN_WARNING 4553 printk(KERN_WARNING
4480 " %s zone: %lu pages exceeds realsize %lu\n", 4554 " %s zone: %lu pages exceeds freesize %lu\n",
4481 zone_names[j], memmap_pages, realsize); 4555 zone_names[j], memmap_pages, freesize);
4482 4556
4483 /* Account for reserved pages */ 4557 /* Account for reserved pages */
4484 if (j == 0 && realsize > dma_reserve) { 4558 if (j == 0 && freesize > dma_reserve) {
4485 realsize -= dma_reserve; 4559 freesize -= dma_reserve;
4486 printk(KERN_DEBUG " %s zone: %lu pages reserved\n", 4560 printk(KERN_DEBUG " %s zone: %lu pages reserved\n",
4487 zone_names[0], dma_reserve); 4561 zone_names[0], dma_reserve);
4488 } 4562 }
4489 4563
4490 if (!is_highmem_idx(j)) 4564 if (!is_highmem_idx(j))
4491 nr_kernel_pages += realsize; 4565 nr_kernel_pages += freesize;
4492 nr_all_pages += realsize; 4566 /* Charge for highmem memmap if there are enough kernel pages */
4567 else if (nr_kernel_pages > memmap_pages * 2)
4568 nr_kernel_pages -= memmap_pages;
4569 nr_all_pages += freesize;
4493 4570
4494 zone->spanned_pages = size; 4571 zone->spanned_pages = size;
4495 zone->present_pages = realsize; 4572 zone->present_pages = freesize;
4573 /*
4574 * Set an approximate value for lowmem here, it will be adjusted
4575 * when the bootmem allocator frees pages into the buddy system.
4576 * And all highmem pages will be managed by the buddy system.
4577 */
4578 zone->managed_pages = is_highmem_idx(j) ? realsize : freesize;
4496#ifdef CONFIG_NUMA 4579#ifdef CONFIG_NUMA
4497 zone->node = nid; 4580 zone->node = nid;
4498 zone->min_unmapped_pages = (realsize*sysctl_min_unmapped_ratio) 4581 zone->min_unmapped_pages = (freesize*sysctl_min_unmapped_ratio)
4499 / 100; 4582 / 100;
4500 zone->min_slab_pages = (realsize * sysctl_min_slab_ratio) / 100; 4583 zone->min_slab_pages = (freesize * sysctl_min_slab_ratio) / 100;
4501#endif 4584#endif
4502 zone->name = zone_names[j]; 4585 zone->name = zone_names[j];
4503 spin_lock_init(&zone->lock); 4586 spin_lock_init(&zone->lock);
@@ -4688,7 +4771,7 @@ unsigned long __init find_min_pfn_with_active_regions(void)
4688/* 4771/*
4689 * early_calculate_totalpages() 4772 * early_calculate_totalpages()
4690 * Sum pages in active regions for movable zone. 4773 * Sum pages in active regions for movable zone.
4691 * Populate N_HIGH_MEMORY for calculating usable_nodes. 4774 * Populate N_MEMORY for calculating usable_nodes.
4692 */ 4775 */
4693static unsigned long __init early_calculate_totalpages(void) 4776static unsigned long __init early_calculate_totalpages(void)
4694{ 4777{
@@ -4701,7 +4784,7 @@ static unsigned long __init early_calculate_totalpages(void)
4701 4784
4702 totalpages += pages; 4785 totalpages += pages;
4703 if (pages) 4786 if (pages)
4704 node_set_state(nid, N_HIGH_MEMORY); 4787 node_set_state(nid, N_MEMORY);
4705 } 4788 }
4706 return totalpages; 4789 return totalpages;
4707} 4790}
@@ -4718,9 +4801,9 @@ static void __init find_zone_movable_pfns_for_nodes(void)
4718 unsigned long usable_startpfn; 4801 unsigned long usable_startpfn;
4719 unsigned long kernelcore_node, kernelcore_remaining; 4802 unsigned long kernelcore_node, kernelcore_remaining;
4720 /* save the state before borrow the nodemask */ 4803 /* save the state before borrow the nodemask */
4721 nodemask_t saved_node_state = node_states[N_HIGH_MEMORY]; 4804 nodemask_t saved_node_state = node_states[N_MEMORY];
4722 unsigned long totalpages = early_calculate_totalpages(); 4805 unsigned long totalpages = early_calculate_totalpages();
4723 int usable_nodes = nodes_weight(node_states[N_HIGH_MEMORY]); 4806 int usable_nodes = nodes_weight(node_states[N_MEMORY]);
4724 4807
4725 /* 4808 /*
4726 * If movablecore was specified, calculate what size of 4809 * If movablecore was specified, calculate what size of
@@ -4755,7 +4838,7 @@ static void __init find_zone_movable_pfns_for_nodes(void)
4755restart: 4838restart:
4756 /* Spread kernelcore memory as evenly as possible throughout nodes */ 4839 /* Spread kernelcore memory as evenly as possible throughout nodes */
4757 kernelcore_node = required_kernelcore / usable_nodes; 4840 kernelcore_node = required_kernelcore / usable_nodes;
4758 for_each_node_state(nid, N_HIGH_MEMORY) { 4841 for_each_node_state(nid, N_MEMORY) {
4759 unsigned long start_pfn, end_pfn; 4842 unsigned long start_pfn, end_pfn;
4760 4843
4761 /* 4844 /*
@@ -4847,23 +4930,27 @@ restart:
4847 4930
4848out: 4931out:
4849 /* restore the node_state */ 4932 /* restore the node_state */
4850 node_states[N_HIGH_MEMORY] = saved_node_state; 4933 node_states[N_MEMORY] = saved_node_state;
4851} 4934}
4852 4935
4853/* Any regular memory on that node ? */ 4936/* Any regular or high memory on that node ? */
4854static void __init check_for_regular_memory(pg_data_t *pgdat) 4937static void check_for_memory(pg_data_t *pgdat, int nid)
4855{ 4938{
4856#ifdef CONFIG_HIGHMEM
4857 enum zone_type zone_type; 4939 enum zone_type zone_type;
4858 4940
4859 for (zone_type = 0; zone_type <= ZONE_NORMAL; zone_type++) { 4941 if (N_MEMORY == N_NORMAL_MEMORY)
4942 return;
4943
4944 for (zone_type = 0; zone_type <= ZONE_MOVABLE - 1; zone_type++) {
4860 struct zone *zone = &pgdat->node_zones[zone_type]; 4945 struct zone *zone = &pgdat->node_zones[zone_type];
4861 if (zone->present_pages) { 4946 if (zone->present_pages) {
4862 node_set_state(zone_to_nid(zone), N_NORMAL_MEMORY); 4947 node_set_state(nid, N_HIGH_MEMORY);
4948 if (N_NORMAL_MEMORY != N_HIGH_MEMORY &&
4949 zone_type <= ZONE_NORMAL)
4950 node_set_state(nid, N_NORMAL_MEMORY);
4863 break; 4951 break;
4864 } 4952 }
4865 } 4953 }
4866#endif
4867} 4954}
4868 4955
4869/** 4956/**
@@ -4946,8 +5033,8 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)
4946 5033
4947 /* Any memory on that node */ 5034 /* Any memory on that node */
4948 if (pgdat->node_present_pages) 5035 if (pgdat->node_present_pages)
4949 node_set_state(nid, N_HIGH_MEMORY); 5036 node_set_state(nid, N_MEMORY);
4950 check_for_regular_memory(pgdat); 5037 check_for_memory(pgdat, nid);
4951 } 5038 }
4952} 5039}
4953 5040
@@ -5175,10 +5262,6 @@ static void __setup_per_zone_wmarks(void)
5175 zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + (tmp >> 2); 5262 zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + (tmp >> 2);
5176 zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1); 5263 zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1);
5177 5264
5178 zone->watermark[WMARK_MIN] += cma_wmark_pages(zone);
5179 zone->watermark[WMARK_LOW] += cma_wmark_pages(zone);
5180 zone->watermark[WMARK_HIGH] += cma_wmark_pages(zone);
5181
5182 setup_zone_migrate_reserve(zone); 5265 setup_zone_migrate_reserve(zone);
5183 spin_unlock_irqrestore(&zone->lock, flags); 5266 spin_unlock_irqrestore(&zone->lock, flags);
5184 } 5267 }
@@ -5506,7 +5589,7 @@ static inline int pfn_to_bitidx(struct zone *zone, unsigned long pfn)
5506 pfn &= (PAGES_PER_SECTION-1); 5589 pfn &= (PAGES_PER_SECTION-1);
5507 return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS; 5590 return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
5508#else 5591#else
5509 pfn = pfn - zone->zone_start_pfn; 5592 pfn = pfn - round_down(zone->zone_start_pfn, pageblock_nr_pages);
5510 return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS; 5593 return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
5511#endif /* CONFIG_SPARSEMEM */ 5594#endif /* CONFIG_SPARSEMEM */
5512} 5595}
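
The non-SPARSEMEM pfn_to_bitidx() now offsets from round_down(zone_start_pfn, pageblock_nr_pages) instead of from zone_start_pfn itself, so a zone that starts mid-pageblock indexes the pageblock bitmap consistently. A small standalone comparison of the two calculations (the constants are illustrative):

#include <stdio.h>

#define pageblock_order		9
#define pageblock_nr_pages	(1UL << pageblock_order)	/* 512 pages */
#define NR_PAGEBLOCK_BITS	4
#define round_down(x, y)	((x) & ~((y) - 1))

/* The old calculation: offset from the (possibly unaligned) zone start. */
static unsigned long bitidx_old(unsigned long zone_start_pfn, unsigned long pfn)
{
	return ((pfn - zone_start_pfn) >> pageblock_order) * NR_PAGEBLOCK_BITS;
}

/* The fixed calculation: offset from the pageblock-aligned zone start. */
static unsigned long bitidx_new(unsigned long zone_start_pfn, unsigned long pfn)
{
	pfn -= round_down(zone_start_pfn, pageblock_nr_pages);
	return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
}

int main(void)
{
	/*
	 * A zone starting mid-pageblock: pfn 600 physically belongs to the
	 * second 512-page block, which only the fixed index reflects.
	 */
	unsigned long zone_start_pfn = 300;

	printf("old bitidx for pfn 600: %lu\n", bitidx_old(zone_start_pfn, 600));
	printf("new bitidx for pfn 600: %lu\n", bitidx_new(zone_start_pfn, 600));
	return 0;
}
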
@@ -5576,7 +5659,8 @@ void set_pageblock_flags_group(struct page *page, unsigned long flags,
5576 * MIGRATE_MOVABLE block might include unmovable pages. It means you can't 5659 * MIGRATE_MOVABLE block might include unmovable pages. It means you can't
5577 * expect this function should be exact. 5660 * expect this function should be exact.
5578 */ 5661 */
5579bool has_unmovable_pages(struct zone *zone, struct page *page, int count) 5662bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
5663 bool skip_hwpoisoned_pages)
5580{ 5664{
5581 unsigned long pfn, iter, found; 5665 unsigned long pfn, iter, found;
5582 int mt; 5666 int mt;
@@ -5611,6 +5695,13 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count)
5611 continue; 5695 continue;
5612 } 5696 }
5613 5697
5698 /*
 5699		 * The HWPoisoned page may not be in the buddy system, and
5700 * page_count() is not 0.
5701 */
5702 if (skip_hwpoisoned_pages && PageHWPoison(page))
5703 continue;
5704
5614 if (!PageLRU(page)) 5705 if (!PageLRU(page))
5615 found++; 5706 found++;
5616 /* 5707 /*
@@ -5653,7 +5744,7 @@ bool is_pageblock_removable_nolock(struct page *page)
5653 zone->zone_start_pfn + zone->spanned_pages <= pfn) 5744 zone->zone_start_pfn + zone->spanned_pages <= pfn)
5654 return false; 5745 return false;
5655 5746
5656 return !has_unmovable_pages(zone, page, 0); 5747 return !has_unmovable_pages(zone, page, 0, true);
5657} 5748}
5658 5749
5659#ifdef CONFIG_CMA 5750#ifdef CONFIG_CMA
@@ -5680,7 +5771,7 @@ static int __alloc_contig_migrate_range(struct compact_control *cc,
5680 unsigned int tries = 0; 5771 unsigned int tries = 0;
5681 int ret = 0; 5772 int ret = 0;
5682 5773
5683 migrate_prep_local(); 5774 migrate_prep();
5684 5775
5685 while (pfn < end || !list_empty(&cc->migratepages)) { 5776 while (pfn < end || !list_empty(&cc->migratepages)) {
5686 if (fatal_signal_pending(current)) { 5777 if (fatal_signal_pending(current)) {
@@ -5708,61 +5799,14 @@ static int __alloc_contig_migrate_range(struct compact_control *cc,
5708 5799
5709 ret = migrate_pages(&cc->migratepages, 5800 ret = migrate_pages(&cc->migratepages,
5710 alloc_migrate_target, 5801 alloc_migrate_target,
5711 0, false, MIGRATE_SYNC); 5802 0, false, MIGRATE_SYNC,
5803 MR_CMA);
5712 } 5804 }
5713 5805
5714 putback_lru_pages(&cc->migratepages); 5806 putback_movable_pages(&cc->migratepages);
5715 return ret > 0 ? 0 : ret; 5807 return ret > 0 ? 0 : ret;
5716} 5808}
5717 5809
5718/*
5719 * Update zone's cma pages counter used for watermark level calculation.
5720 */
5721static inline void __update_cma_watermarks(struct zone *zone, int count)
5722{
5723 unsigned long flags;
5724 spin_lock_irqsave(&zone->lock, flags);
5725 zone->min_cma_pages += count;
5726 spin_unlock_irqrestore(&zone->lock, flags);
5727 setup_per_zone_wmarks();
5728}
5729
5730/*
5731 * Trigger memory pressure bump to reclaim some pages in order to be able to
5732 * allocate 'count' pages in single page units. Does similar work as
5733 *__alloc_pages_slowpath() function.
5734 */
5735static int __reclaim_pages(struct zone *zone, gfp_t gfp_mask, int count)
5736{
5737 enum zone_type high_zoneidx = gfp_zone(gfp_mask);
5738 struct zonelist *zonelist = node_zonelist(0, gfp_mask);
5739 int did_some_progress = 0;
5740 int order = 1;
5741
5742 /*
5743 * Increase level of watermarks to force kswapd do his job
5744 * to stabilise at new watermark level.
5745 */
5746 __update_cma_watermarks(zone, count);
5747
5748 /* Obey watermarks as if the page was being allocated */
5749 while (!zone_watermark_ok(zone, 0, low_wmark_pages(zone), 0, 0)) {
5750 wake_all_kswapd(order, zonelist, high_zoneidx, zone_idx(zone));
5751
5752 did_some_progress = __perform_reclaim(gfp_mask, order, zonelist,
5753 NULL);
5754 if (!did_some_progress) {
5755 /* Exhausted what can be done so it's blamo time */
5756 out_of_memory(zonelist, gfp_mask, order, NULL, false);
5757 }
5758 }
5759
5760 /* Restore original watermark levels. */
5761 __update_cma_watermarks(zone, -count);
5762
5763 return count;
5764}
5765
5766/** 5810/**
5767 * alloc_contig_range() -- tries to allocate given range of pages 5811 * alloc_contig_range() -- tries to allocate given range of pages
5768 * @start: start PFN to allocate 5812 * @start: start PFN to allocate
@@ -5786,7 +5830,6 @@ static int __reclaim_pages(struct zone *zone, gfp_t gfp_mask, int count)
5786int alloc_contig_range(unsigned long start, unsigned long end, 5830int alloc_contig_range(unsigned long start, unsigned long end,
5787 unsigned migratetype) 5831 unsigned migratetype)
5788{ 5832{
5789 struct zone *zone = page_zone(pfn_to_page(start));
5790 unsigned long outer_start, outer_end; 5833 unsigned long outer_start, outer_end;
5791 int ret = 0, order; 5834 int ret = 0, order;
5792 5835
@@ -5824,7 +5867,8 @@ int alloc_contig_range(unsigned long start, unsigned long end,
5824 */ 5867 */
5825 5868
5826 ret = start_isolate_page_range(pfn_max_align_down(start), 5869 ret = start_isolate_page_range(pfn_max_align_down(start),
5827 pfn_max_align_up(end), migratetype); 5870 pfn_max_align_up(end), migratetype,
5871 false);
5828 if (ret) 5872 if (ret)
5829 return ret; 5873 return ret;
5830 5874
@@ -5863,18 +5907,13 @@ int alloc_contig_range(unsigned long start, unsigned long end,
5863 } 5907 }
5864 5908
5865 /* Make sure the range is really isolated. */ 5909 /* Make sure the range is really isolated. */
5866 if (test_pages_isolated(outer_start, end)) { 5910 if (test_pages_isolated(outer_start, end, false)) {
5867 pr_warn("alloc_contig_range test_pages_isolated(%lx, %lx) failed\n", 5911 pr_warn("alloc_contig_range test_pages_isolated(%lx, %lx) failed\n",
5868 outer_start, end); 5912 outer_start, end);
5869 ret = -EBUSY; 5913 ret = -EBUSY;
5870 goto done; 5914 goto done;
5871 } 5915 }
5872 5916
5873 /*
5874 * Reclaim enough pages to make sure that contiguous allocation
5875 * will not starve the system.
5876 */
5877 __reclaim_pages(zone, GFP_HIGHUSER_MOVABLE, end-start);
5878 5917
5879 /* Grab isolated pages from freelists. */ 5918 /* Grab isolated pages from freelists. */
5880 outer_end = isolate_freepages_range(&cc, outer_start, end); 5919 outer_end = isolate_freepages_range(&cc, outer_start, end);
@@ -5897,8 +5936,15 @@ done:
5897 5936
5898void free_contig_range(unsigned long pfn, unsigned nr_pages) 5937void free_contig_range(unsigned long pfn, unsigned nr_pages)
5899{ 5938{
5900 for (; nr_pages--; ++pfn) 5939 unsigned int count = 0;
5901 __free_page(pfn_to_page(pfn)); 5940
5941 for (; nr_pages--; pfn++) {
5942 struct page *page = pfn_to_page(pfn);
5943
5944 count += page_count(page) != 1;
5945 __free_page(page);
5946 }
5947 WARN(count != 0, "%d pages are still in use!\n", count);
5902} 5948}
5903#endif 5949#endif
5904 5950
@@ -5932,7 +5978,6 @@ void __meminit zone_pcp_update(struct zone *zone)
5932} 5978}
5933#endif 5979#endif
5934 5980
5935#ifdef CONFIG_MEMORY_HOTREMOVE
5936void zone_pcp_reset(struct zone *zone) 5981void zone_pcp_reset(struct zone *zone)
5937{ 5982{
5938 unsigned long flags; 5983 unsigned long flags;
@@ -5952,6 +5997,7 @@ void zone_pcp_reset(struct zone *zone)
5952 local_irq_restore(flags); 5997 local_irq_restore(flags);
5953} 5998}
5954 5999
6000#ifdef CONFIG_MEMORY_HOTREMOVE
5955/* 6001/*
5956 * All pages in the range must be isolated before calling this. 6002 * All pages in the range must be isolated before calling this.
5957 */ 6003 */
@@ -5978,6 +6024,16 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
5978 continue; 6024 continue;
5979 } 6025 }
5980 page = pfn_to_page(pfn); 6026 page = pfn_to_page(pfn);
6027 /*
 6028		 * The HWPoisoned page may not be in the buddy system, and
6029 * page_count() is not 0.
6030 */
6031 if (unlikely(!PageBuddy(page) && PageHWPoison(page))) {
6032 pfn++;
6033 SetPageReserved(page);
6034 continue;
6035 }
6036
5981 BUG_ON(page_count(page)); 6037 BUG_ON(page_count(page));
5982 BUG_ON(!PageBuddy(page)); 6038 BUG_ON(!PageBuddy(page));
5983 order = page_order(page); 6039 order = page_order(page);
@@ -5988,8 +6044,6 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
5988 list_del(&page->lru); 6044 list_del(&page->lru);
5989 rmv_page_order(page); 6045 rmv_page_order(page);
5990 zone->free_area[order].nr_free--; 6046 zone->free_area[order].nr_free--;
5991 __mod_zone_page_state(zone, NR_FREE_PAGES,
5992 - (1UL << order));
5993 for (i = 0; i < (1 << order); i++) 6047 for (i = 0; i < (1 << order); i++)
5994 SetPageReserved((page+i)); 6048 SetPageReserved((page+i));
5995 pfn += (1 << order); 6049 pfn += (1 << order);
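
Both __offline_isolated_pages() above and the page-isolation changes below learn to step over hardware-poisoned pages, which are not on the buddy lists and keep a non-zero page count. A toy standalone walk showing the same skip-and-reserve shape; the fake_page structure and its flags are invented for illustration:

#include <stdbool.h>
#include <stdio.h>

#define NR_PAGES 8

struct fake_page {
	bool hwpoison;	/* stand-in for PageHWPoison() */
	bool reserved;	/* stand-in for PageReserved() */
};

int main(void)
{
	struct fake_page range[NR_PAGES] = { [3].hwpoison = true };
	unsigned long pfn = 0;

	while (pfn < NR_PAGES) {
		struct fake_page *page = &range[pfn];

		if (page->hwpoison) {
			/* Not on the buddy lists: just reserve it and move on. */
			page->reserved = true;
			pfn++;
			continue;
		}
		printf("offlining pfn %lu\n", pfn);
		pfn++;
	}
	return 0;
}
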
diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c
index 5ddad0c6daa6..6d757e3a872a 100644
--- a/mm/page_cgroup.c
+++ b/mm/page_cgroup.c
@@ -251,6 +251,9 @@ static int __meminit page_cgroup_callback(struct notifier_block *self,
251 mn->nr_pages, mn->status_change_nid); 251 mn->nr_pages, mn->status_change_nid);
252 break; 252 break;
253 case MEM_CANCEL_ONLINE: 253 case MEM_CANCEL_ONLINE:
254 offline_page_cgroup(mn->start_pfn,
255 mn->nr_pages, mn->status_change_nid);
256 break;
254 case MEM_GOING_OFFLINE: 257 case MEM_GOING_OFFLINE:
255 break; 258 break;
256 case MEM_ONLINE: 259 case MEM_ONLINE:
@@ -271,7 +274,7 @@ void __init page_cgroup_init(void)
271 if (mem_cgroup_disabled()) 274 if (mem_cgroup_disabled())
272 return; 275 return;
273 276
274 for_each_node_state(nid, N_HIGH_MEMORY) { 277 for_each_node_state(nid, N_MEMORY) {
275 unsigned long start_pfn, end_pfn; 278 unsigned long start_pfn, end_pfn;
276 279
277 start_pfn = node_start_pfn(nid); 280 start_pfn = node_start_pfn(nid);
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index f2f5b4818e94..383bdbb98b04 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -8,29 +8,7 @@
8#include <linux/memory.h> 8#include <linux/memory.h>
9#include "internal.h" 9#include "internal.h"
10 10
11/* called while holding zone->lock */ 11int set_migratetype_isolate(struct page *page, bool skip_hwpoisoned_pages)
12static void set_pageblock_isolate(struct page *page)
13{
14 if (get_pageblock_migratetype(page) == MIGRATE_ISOLATE)
15 return;
16
17 set_pageblock_migratetype(page, MIGRATE_ISOLATE);
18 page_zone(page)->nr_pageblock_isolate++;
19}
20
21/* called while holding zone->lock */
22static void restore_pageblock_isolate(struct page *page, int migratetype)
23{
24 struct zone *zone = page_zone(page);
25 if (WARN_ON(get_pageblock_migratetype(page) != MIGRATE_ISOLATE))
26 return;
27
28 BUG_ON(zone->nr_pageblock_isolate <= 0);
29 set_pageblock_migratetype(page, migratetype);
30 zone->nr_pageblock_isolate--;
31}
32
33int set_migratetype_isolate(struct page *page)
34{ 12{
35 struct zone *zone; 13 struct zone *zone;
36 unsigned long flags, pfn; 14 unsigned long flags, pfn;
@@ -66,7 +44,8 @@ int set_migratetype_isolate(struct page *page)
66 * FIXME: Now, memory hotplug doesn't call shrink_slab() by itself. 44 * FIXME: Now, memory hotplug doesn't call shrink_slab() by itself.
67 * We just check MOVABLE pages. 45 * We just check MOVABLE pages.
68 */ 46 */
69 if (!has_unmovable_pages(zone, page, arg.pages_found)) 47 if (!has_unmovable_pages(zone, page, arg.pages_found,
48 skip_hwpoisoned_pages))
70 ret = 0; 49 ret = 0;
71 50
72 /* 51 /*
@@ -79,7 +58,7 @@ out:
79 unsigned long nr_pages; 58 unsigned long nr_pages;
80 int migratetype = get_pageblock_migratetype(page); 59 int migratetype = get_pageblock_migratetype(page);
81 60
82 set_pageblock_isolate(page); 61 set_pageblock_migratetype(page, MIGRATE_ISOLATE);
83 nr_pages = move_freepages_block(zone, page, MIGRATE_ISOLATE); 62 nr_pages = move_freepages_block(zone, page, MIGRATE_ISOLATE);
84 63
85 __mod_zone_freepage_state(zone, -nr_pages, migratetype); 64 __mod_zone_freepage_state(zone, -nr_pages, migratetype);
@@ -102,7 +81,7 @@ void unset_migratetype_isolate(struct page *page, unsigned migratetype)
102 goto out; 81 goto out;
103 nr_pages = move_freepages_block(zone, page, migratetype); 82 nr_pages = move_freepages_block(zone, page, migratetype);
104 __mod_zone_freepage_state(zone, nr_pages, migratetype); 83 __mod_zone_freepage_state(zone, nr_pages, migratetype);
105 restore_pageblock_isolate(page, migratetype); 84 set_pageblock_migratetype(page, migratetype);
106out: 85out:
107 spin_unlock_irqrestore(&zone->lock, flags); 86 spin_unlock_irqrestore(&zone->lock, flags);
108} 87}
@@ -134,7 +113,7 @@ __first_valid_page(unsigned long pfn, unsigned long nr_pages)
134 * Returns 0 on success and -EBUSY if any part of range cannot be isolated. 113 * Returns 0 on success and -EBUSY if any part of range cannot be isolated.
135 */ 114 */
136int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, 115int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
137 unsigned migratetype) 116 unsigned migratetype, bool skip_hwpoisoned_pages)
138{ 117{
139 unsigned long pfn; 118 unsigned long pfn;
140 unsigned long undo_pfn; 119 unsigned long undo_pfn;
@@ -147,7 +126,8 @@ int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
147 pfn < end_pfn; 126 pfn < end_pfn;
148 pfn += pageblock_nr_pages) { 127 pfn += pageblock_nr_pages) {
149 page = __first_valid_page(pfn, pageblock_nr_pages); 128 page = __first_valid_page(pfn, pageblock_nr_pages);
150 if (page && set_migratetype_isolate(page)) { 129 if (page &&
130 set_migratetype_isolate(page, skip_hwpoisoned_pages)) {
151 undo_pfn = pfn; 131 undo_pfn = pfn;
152 goto undo; 132 goto undo;
153 } 133 }
@@ -190,7 +170,8 @@ int undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
190 * Returns 1 if all pages in the range are isolated. 170 * Returns 1 if all pages in the range are isolated.
191 */ 171 */
192static int 172static int
193__test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn) 173__test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn,
174 bool skip_hwpoisoned_pages)
194{ 175{
195 struct page *page; 176 struct page *page;
196 177
@@ -220,6 +201,14 @@ __test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn)
220 else if (page_count(page) == 0 && 201 else if (page_count(page) == 0 &&
221 get_freepage_migratetype(page) == MIGRATE_ISOLATE) 202 get_freepage_migratetype(page) == MIGRATE_ISOLATE)
222 pfn += 1; 203 pfn += 1;
204 else if (skip_hwpoisoned_pages && PageHWPoison(page)) {
205 /*
206 * The HWPoisoned page may be not in buddy
207 * system, and page_count() is not 0.
208 */
209 pfn++;
210 continue;
211 }
223 else 212 else
224 break; 213 break;
225 } 214 }
@@ -228,7 +217,8 @@ __test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn)
228 return 1; 217 return 1;
229} 218}
230 219
231int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn) 220int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn,
221 bool skip_hwpoisoned_pages)
232{ 222{
233 unsigned long pfn, flags; 223 unsigned long pfn, flags;
234 struct page *page; 224 struct page *page;
@@ -251,7 +241,8 @@ int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn)
251 /* Check all pages are free or Marked as ISOLATED */ 241 /* Check all pages are free or Marked as ISOLATED */
252 zone = page_zone(page); 242 zone = page_zone(page);
253 spin_lock_irqsave(&zone->lock, flags); 243 spin_lock_irqsave(&zone->lock, flags);
254 ret = __test_page_isolated_in_pageblock(start_pfn, end_pfn); 244 ret = __test_page_isolated_in_pageblock(start_pfn, end_pfn,
245 skip_hwpoisoned_pages);
255 spin_unlock_irqrestore(&zone->lock, flags); 246 spin_unlock_irqrestore(&zone->lock, flags);
256 return ret ? 0 : -EBUSY; 247 return ret ? 0 : -EBUSY;
257} 248}
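The page_isolation.c hunks thread a skip_hwpoisoned_pages flag from start_isolate_page_range() down to __test_page_isolated_in_pageblock(), so a caller such as memory offlining can treat a hardware-poisoned page (which is not in the buddy allocator and has a non-zero page count) as something to step over rather than a reason to fail. A minimal sketch of that check with toy types, not the kernel's structures:

/* Toy model of the new hwpoison handling in the isolation test;
 * struct toy_page and its fields are invented, not kernel definitions. */
#include <stdbool.h>
#include <stdio.h>

struct toy_page {
	int count;		/* analogue of page_count()             */
	bool isolated_free;	/* free page sitting on MIGRATE_ISOLATE */
	bool hwpoison;		/* analogue of PageHWPoison()           */
};

static bool range_is_isolated(const struct toy_page *pages, int n,
			      bool skip_hwpoisoned_pages)
{
	for (int i = 0; i < n; ) {
		if (pages[i].count == 0 && pages[i].isolated_free) {
			i++;	/* free and already isolated: fine */
		} else if (skip_hwpoisoned_pages && pages[i].hwpoison) {
			i++;	/* poisoned page may hold a reference; tolerate it */
		} else {
			return false;	/* genuinely busy page in the range */
		}
	}
	return true;
}

int main(void)
{
	struct toy_page block[] = {
		{ .count = 0, .isolated_free = true },
		{ .count = 1, .hwpoison = true },	/* blocked the test before */
		{ .count = 0, .isolated_free = true },
	};

	printf("strict: %d, skipping hwpoison: %d\n",
	       range_is_isolated(block, 3, false),
	       range_is_isolated(block, 3, true));
	return 0;
}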
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index 6c118d012bb5..35aa294656cd 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -58,7 +58,7 @@ again:
58 if (!walk->pte_entry) 58 if (!walk->pte_entry)
59 continue; 59 continue;
60 60
61 split_huge_page_pmd(walk->mm, pmd); 61 split_huge_page_pmd_mm(walk->mm, addr, pmd);
62 if (pmd_none_or_trans_huge_or_clear_bad(pmd)) 62 if (pmd_none_or_trans_huge_or_clear_bad(pmd))
63 goto again; 63 goto again;
64 err = walk_pte_range(pmd, addr, next, walk); 64 err = walk_pte_range(pmd, addr, next, walk);
diff --git a/mm/percpu.c b/mm/percpu.c
index ddc5efb9c5bb..8c8e08f3a692 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -631,7 +631,7 @@ static void pcpu_free_chunk(struct pcpu_chunk *chunk)
631 if (!chunk) 631 if (!chunk)
632 return; 632 return;
633 pcpu_mem_free(chunk->map, chunk->map_alloc * sizeof(chunk->map[0])); 633 pcpu_mem_free(chunk->map, chunk->map_alloc * sizeof(chunk->map[0]));
634 kfree(chunk); 634 pcpu_mem_free(chunk, pcpu_chunk_struct_size);
635} 635}
636 636
637/* 637/*
@@ -1380,6 +1380,9 @@ enum pcpu_fc pcpu_chosen_fc __initdata = PCPU_FC_AUTO;
1380 1380
1381static int __init percpu_alloc_setup(char *str) 1381static int __init percpu_alloc_setup(char *str)
1382{ 1382{
1383 if (!str)
1384 return -EINVAL;
1385
1383 if (0) 1386 if (0)
1384 /* nada */; 1387 /* nada */;
1385#ifdef CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK 1388#ifdef CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK
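The percpu.c changes are two small hardening fixes: pcpu_free_chunk() now releases the chunk descriptor with pcpu_mem_free() so the free path mirrors the allocation path (pcpu_mem_alloc() can fall back to vmalloc for large sizes, which a plain kfree() cannot handle), and percpu_alloc_setup() rejects a NULL option string. The sketch below is a userspace analogue of the alloc/free symmetry point only, with an invented dual-path allocator rather than the percpu code:

/* Userspace sketch, assuming a malloc/mmap split as a stand-in for the
 * kmalloc/vmalloc split inside pcpu_mem_alloc()/pcpu_mem_free(). */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>

#define SMALL_LIMIT 4096	/* analogue of the page-size cut-over */

static void *mem_alloc(size_t size)
{
	if (size <= SMALL_LIMIT)
		return malloc(size);	/* "kmalloc"-like path */
	void *p = mmap(NULL, size, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);	/* "vmalloc"-like path */
	return p == MAP_FAILED ? NULL : p;
}

static void mem_free(void *ptr, size_t size)
{
	if (!ptr)
		return;
	if (size <= SMALL_LIMIT)
		free(ptr);		/* must mirror the allocation path ...   */
	else
		munmap(ptr, size);	/* ... or the wrong deallocator is used  */
}

int main(void)
{
	size_t big = 1 << 20;
	void *chunk = mem_alloc(big);

	if (!chunk)
		return 1;
	memset(chunk, 0, big);
	mem_free(chunk, big);	/* calling free() here would be the shape of the old bug */
	return 0;
}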
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
index e642627da6b7..0c8323fe6c8f 100644
--- a/mm/pgtable-generic.c
+++ b/mm/pgtable-generic.c
@@ -12,8 +12,8 @@
12 12
13#ifndef __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS 13#ifndef __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
14/* 14/*
15 * Only sets the access flags (dirty, accessed, and 15 * Only sets the access flags (dirty, accessed), as well as write
16 * writable). Furthermore, we know it always gets set to a "more 16 * permission. Furthermore, we know it always gets set to a "more
17 * permissive" setting, which allows most architectures to optimize 17 * permissive" setting, which allows most architectures to optimize
18 * this. We return whether the PTE actually changed, which in turn 18 * this. We return whether the PTE actually changed, which in turn
19 * instructs the caller to do things like update__mmu_cache. This 19 * instructs the caller to do things like update__mmu_cache. This
@@ -27,7 +27,7 @@ int ptep_set_access_flags(struct vm_area_struct *vma,
27 int changed = !pte_same(*ptep, entry); 27 int changed = !pte_same(*ptep, entry);
28 if (changed) { 28 if (changed) {
29 set_pte_at(vma->vm_mm, address, ptep, entry); 29 set_pte_at(vma->vm_mm, address, ptep, entry);
30 flush_tlb_page(vma, address); 30 flush_tlb_fix_spurious_fault(vma, address);
31 } 31 }
32 return changed; 32 return changed;
33} 33}
@@ -88,7 +88,8 @@ pte_t ptep_clear_flush(struct vm_area_struct *vma, unsigned long address,
88{ 88{
89 pte_t pte; 89 pte_t pte;
90 pte = ptep_get_and_clear((vma)->vm_mm, address, ptep); 90 pte = ptep_get_and_clear((vma)->vm_mm, address, ptep);
91 flush_tlb_page(vma, address); 91 if (pte_accessible(pte))
92 flush_tlb_page(vma, address);
92 return pte; 93 return pte;
93} 94}
94#endif 95#endif
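The pgtable-generic.c hunks trim unnecessary TLB work in the generic helpers: ptep_set_access_flags() only ever makes an entry more permissive, so its full flush_tlb_page() becomes flush_tlb_fix_spurious_fault(), and ptep_clear_flush() now skips the flush entirely when the cleared PTE was not accessible (no hardware walk could have cached a translation for it). A toy model of the second idea, with invented types and a stand-in flush:

/* Toy model of "only flush when a stale translation can exist";
 * the types and flush function here are invented, not kernel APIs. */
#include <stdbool.h>
#include <stdio.h>

struct toy_pte {
	unsigned long pfn;
	bool present;	/* rough analogue of pte_accessible() */
};

static int flushes;

static void toy_flush_tlb_page(unsigned long addr)
{
	flushes++;	/* stands in for an expensive, possibly remote, invalidation */
	printf("flush for %#lx\n", addr);
}

static struct toy_pte toy_ptep_clear_flush(struct toy_pte *ptep, unsigned long addr)
{
	struct toy_pte old = *ptep;

	ptep->present = false;
	ptep->pfn = 0;
	/* A TLB entry can only exist for a translation the MMU could walk. */
	if (old.present)
		toy_flush_tlb_page(addr);
	return old;
}

int main(void)
{
	struct toy_pte mapped = { .pfn = 42, .present = true };
	struct toy_pte empty  = { 0 };

	toy_ptep_clear_flush(&mapped, 0x1000);	/* flushes            */
	toy_ptep_clear_flush(&empty, 0x2000);	/* nothing to flush   */
	printf("total flushes: %d\n", flushes);
	return 0;
}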
diff --git a/mm/rmap.c b/mm/rmap.c
index 2ee1ef0f317b..2c78f8cadc95 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -24,7 +24,7 @@
24 * mm->mmap_sem 24 * mm->mmap_sem
25 * page->flags PG_locked (lock_page) 25 * page->flags PG_locked (lock_page)
26 * mapping->i_mmap_mutex 26 * mapping->i_mmap_mutex
27 * anon_vma->mutex 27 * anon_vma->rwsem
28 * mm->page_table_lock or pte_lock 28 * mm->page_table_lock or pte_lock
29 * zone->lru_lock (in mark_page_accessed, isolate_lru_page) 29 * zone->lru_lock (in mark_page_accessed, isolate_lru_page)
30 * swap_lock (in swap_duplicate, swap_info_get) 30 * swap_lock (in swap_duplicate, swap_info_get)
@@ -37,7 +37,7 @@
37 * in arch-dependent flush_dcache_mmap_lock, 37 * in arch-dependent flush_dcache_mmap_lock,
38 * within bdi.wb->list_lock in __sync_single_inode) 38 * within bdi.wb->list_lock in __sync_single_inode)
39 * 39 *
40 * anon_vma->mutex,mapping->i_mutex (memory_failure, collect_procs_anon) 40 * anon_vma->rwsem,mapping->i_mutex (memory_failure, collect_procs_anon)
41 * ->tasklist_lock 41 * ->tasklist_lock
42 * pte map lock 42 * pte map lock
43 */ 43 */
@@ -87,24 +87,24 @@ static inline void anon_vma_free(struct anon_vma *anon_vma)
87 VM_BUG_ON(atomic_read(&anon_vma->refcount)); 87 VM_BUG_ON(atomic_read(&anon_vma->refcount));
88 88
89 /* 89 /*
90 * Synchronize against page_lock_anon_vma() such that 90 * Synchronize against page_lock_anon_vma_read() such that
91 * we can safely hold the lock without the anon_vma getting 91 * we can safely hold the lock without the anon_vma getting
92 * freed. 92 * freed.
93 * 93 *
94 * Relies on the full mb implied by the atomic_dec_and_test() from 94 * Relies on the full mb implied by the atomic_dec_and_test() from
95 * put_anon_vma() against the acquire barrier implied by 95 * put_anon_vma() against the acquire barrier implied by
96 * mutex_trylock() from page_lock_anon_vma(). This orders: 96 * down_read_trylock() from page_lock_anon_vma_read(). This orders:
97 * 97 *
98 * page_lock_anon_vma() VS put_anon_vma() 98 * page_lock_anon_vma_read() VS put_anon_vma()
99 * mutex_trylock() atomic_dec_and_test() 99 * down_read_trylock() atomic_dec_and_test()
100 * LOCK MB 100 * LOCK MB
101 * atomic_read() mutex_is_locked() 101 * atomic_read() rwsem_is_locked()
102 * 102 *
103 * LOCK should suffice since the actual taking of the lock must 103 * LOCK should suffice since the actual taking of the lock must
104 * happen _before_ what follows. 104 * happen _before_ what follows.
105 */ 105 */
106 if (mutex_is_locked(&anon_vma->root->mutex)) { 106 if (rwsem_is_locked(&anon_vma->root->rwsem)) {
107 anon_vma_lock(anon_vma); 107 anon_vma_lock_write(anon_vma);
108 anon_vma_unlock(anon_vma); 108 anon_vma_unlock(anon_vma);
109 } 109 }
110 110
@@ -146,7 +146,7 @@ static void anon_vma_chain_link(struct vm_area_struct *vma,
146 * allocate a new one. 146 * allocate a new one.
147 * 147 *
148 * Anon-vma allocations are very subtle, because we may have 148 * Anon-vma allocations are very subtle, because we may have
149 * optimistically looked up an anon_vma in page_lock_anon_vma() 149 * optimistically looked up an anon_vma in page_lock_anon_vma_read()
150 * and that may actually touch the spinlock even in the newly 150 * and that may actually touch the spinlock even in the newly
151 * allocated vma (it depends on RCU to make sure that the 151 * allocated vma (it depends on RCU to make sure that the
152 * anon_vma isn't actually destroyed). 152 * anon_vma isn't actually destroyed).
@@ -181,7 +181,7 @@ int anon_vma_prepare(struct vm_area_struct *vma)
181 allocated = anon_vma; 181 allocated = anon_vma;
182 } 182 }
183 183
184 anon_vma_lock(anon_vma); 184 anon_vma_lock_write(anon_vma);
185 /* page_table_lock to protect against threads */ 185 /* page_table_lock to protect against threads */
186 spin_lock(&mm->page_table_lock); 186 spin_lock(&mm->page_table_lock);
187 if (likely(!vma->anon_vma)) { 187 if (likely(!vma->anon_vma)) {
@@ -219,9 +219,9 @@ static inline struct anon_vma *lock_anon_vma_root(struct anon_vma *root, struct
219 struct anon_vma *new_root = anon_vma->root; 219 struct anon_vma *new_root = anon_vma->root;
220 if (new_root != root) { 220 if (new_root != root) {
221 if (WARN_ON_ONCE(root)) 221 if (WARN_ON_ONCE(root))
222 mutex_unlock(&root->mutex); 222 up_write(&root->rwsem);
223 root = new_root; 223 root = new_root;
224 mutex_lock(&root->mutex); 224 down_write(&root->rwsem);
225 } 225 }
226 return root; 226 return root;
227} 227}
@@ -229,7 +229,7 @@ static inline struct anon_vma *lock_anon_vma_root(struct anon_vma *root, struct
229static inline void unlock_anon_vma_root(struct anon_vma *root) 229static inline void unlock_anon_vma_root(struct anon_vma *root)
230{ 230{
231 if (root) 231 if (root)
232 mutex_unlock(&root->mutex); 232 up_write(&root->rwsem);
233} 233}
234 234
235/* 235/*
@@ -306,7 +306,7 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
306 get_anon_vma(anon_vma->root); 306 get_anon_vma(anon_vma->root);
307 /* Mark this anon_vma as the one where our new (COWed) pages go. */ 307 /* Mark this anon_vma as the one where our new (COWed) pages go. */
308 vma->anon_vma = anon_vma; 308 vma->anon_vma = anon_vma;
309 anon_vma_lock(anon_vma); 309 anon_vma_lock_write(anon_vma);
310 anon_vma_chain_link(vma, avc, anon_vma); 310 anon_vma_chain_link(vma, avc, anon_vma);
311 anon_vma_unlock(anon_vma); 311 anon_vma_unlock(anon_vma);
312 312
@@ -349,7 +349,7 @@ void unlink_anon_vmas(struct vm_area_struct *vma)
349 /* 349 /*
350 * Iterate the list once more, it now only contains empty and unlinked 350 * Iterate the list once more, it now only contains empty and unlinked
351 * anon_vmas, destroy them. Could not do before due to __put_anon_vma() 351 * anon_vmas, destroy them. Could not do before due to __put_anon_vma()
352 * needing to acquire the anon_vma->root->mutex. 352 * needing to write-acquire the anon_vma->root->rwsem.
353 */ 353 */
354 list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) { 354 list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) {
355 struct anon_vma *anon_vma = avc->anon_vma; 355 struct anon_vma *anon_vma = avc->anon_vma;
@@ -365,7 +365,7 @@ static void anon_vma_ctor(void *data)
365{ 365{
366 struct anon_vma *anon_vma = data; 366 struct anon_vma *anon_vma = data;
367 367
368 mutex_init(&anon_vma->mutex); 368 init_rwsem(&anon_vma->rwsem);
369 atomic_set(&anon_vma->refcount, 0); 369 atomic_set(&anon_vma->refcount, 0);
370 anon_vma->rb_root = RB_ROOT; 370 anon_vma->rb_root = RB_ROOT;
371} 371}
@@ -442,7 +442,7 @@ out:
442 * atomic op -- the trylock. If we fail the trylock, we fall back to getting a 442 * atomic op -- the trylock. If we fail the trylock, we fall back to getting a
443 * reference like with page_get_anon_vma() and then block on the mutex. 443 * reference like with page_get_anon_vma() and then block on the mutex.
444 */ 444 */
445struct anon_vma *page_lock_anon_vma(struct page *page) 445struct anon_vma *page_lock_anon_vma_read(struct page *page)
446{ 446{
447 struct anon_vma *anon_vma = NULL; 447 struct anon_vma *anon_vma = NULL;
448 struct anon_vma *root_anon_vma; 448 struct anon_vma *root_anon_vma;
@@ -457,14 +457,14 @@ struct anon_vma *page_lock_anon_vma(struct page *page)
457 457
458 anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON); 458 anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON);
459 root_anon_vma = ACCESS_ONCE(anon_vma->root); 459 root_anon_vma = ACCESS_ONCE(anon_vma->root);
460 if (mutex_trylock(&root_anon_vma->mutex)) { 460 if (down_read_trylock(&root_anon_vma->rwsem)) {
461 /* 461 /*
462 * If the page is still mapped, then this anon_vma is still 462 * If the page is still mapped, then this anon_vma is still
463 * its anon_vma, and holding the mutex ensures that it will 463 * its anon_vma, and holding the mutex ensures that it will
464 * not go away, see anon_vma_free(). 464 * not go away, see anon_vma_free().
465 */ 465 */
466 if (!page_mapped(page)) { 466 if (!page_mapped(page)) {
467 mutex_unlock(&root_anon_vma->mutex); 467 up_read(&root_anon_vma->rwsem);
468 anon_vma = NULL; 468 anon_vma = NULL;
469 } 469 }
470 goto out; 470 goto out;
@@ -484,15 +484,15 @@ struct anon_vma *page_lock_anon_vma(struct page *page)
484 484
485 /* we pinned the anon_vma, its safe to sleep */ 485 /* we pinned the anon_vma, its safe to sleep */
486 rcu_read_unlock(); 486 rcu_read_unlock();
487 anon_vma_lock(anon_vma); 487 anon_vma_lock_read(anon_vma);
488 488
489 if (atomic_dec_and_test(&anon_vma->refcount)) { 489 if (atomic_dec_and_test(&anon_vma->refcount)) {
490 /* 490 /*
491 * Oops, we held the last refcount, release the lock 491 * Oops, we held the last refcount, release the lock
492 * and bail -- can't simply use put_anon_vma() because 492 * and bail -- can't simply use put_anon_vma() because
493 * we'll deadlock on the anon_vma_lock() recursion. 493 * we'll deadlock on the anon_vma_lock_write() recursion.
494 */ 494 */
495 anon_vma_unlock(anon_vma); 495 anon_vma_unlock_read(anon_vma);
496 __put_anon_vma(anon_vma); 496 __put_anon_vma(anon_vma);
497 anon_vma = NULL; 497 anon_vma = NULL;
498 } 498 }
@@ -504,9 +504,9 @@ out:
504 return anon_vma; 504 return anon_vma;
505} 505}
506 506
507void page_unlock_anon_vma(struct anon_vma *anon_vma) 507void page_unlock_anon_vma_read(struct anon_vma *anon_vma)
508{ 508{
509 anon_vma_unlock(anon_vma); 509 anon_vma_unlock_read(anon_vma);
510} 510}
511 511
512/* 512/*
@@ -562,6 +562,27 @@ unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
562 return address; 562 return address;
563} 563}
564 564
565pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address)
566{
567 pgd_t *pgd;
568 pud_t *pud;
569 pmd_t *pmd = NULL;
570
571 pgd = pgd_offset(mm, address);
572 if (!pgd_present(*pgd))
573 goto out;
574
575 pud = pud_offset(pgd, address);
576 if (!pud_present(*pud))
577 goto out;
578
579 pmd = pmd_offset(pud, address);
580 if (!pmd_present(*pmd))
581 pmd = NULL;
582out:
583 return pmd;
584}
585
565/* 586/*
566 * Check that @page is mapped at @address into @mm. 587 * Check that @page is mapped at @address into @mm.
567 * 588 *
@@ -574,8 +595,6 @@ unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
574pte_t *__page_check_address(struct page *page, struct mm_struct *mm, 595pte_t *__page_check_address(struct page *page, struct mm_struct *mm,
575 unsigned long address, spinlock_t **ptlp, int sync) 596 unsigned long address, spinlock_t **ptlp, int sync)
576{ 597{
577 pgd_t *pgd;
578 pud_t *pud;
579 pmd_t *pmd; 598 pmd_t *pmd;
580 pte_t *pte; 599 pte_t *pte;
581 spinlock_t *ptl; 600 spinlock_t *ptl;
@@ -586,17 +605,10 @@ pte_t *__page_check_address(struct page *page, struct mm_struct *mm,
586 goto check; 605 goto check;
587 } 606 }
588 607
589 pgd = pgd_offset(mm, address); 608 pmd = mm_find_pmd(mm, address);
590 if (!pgd_present(*pgd)) 609 if (!pmd)
591 return NULL;
592
593 pud = pud_offset(pgd, address);
594 if (!pud_present(*pud))
595 return NULL; 610 return NULL;
596 611
597 pmd = pmd_offset(pud, address);
598 if (!pmd_present(*pmd))
599 return NULL;
600 if (pmd_trans_huge(*pmd)) 612 if (pmd_trans_huge(*pmd))
601 return NULL; 613 return NULL;
602 614
@@ -732,7 +744,7 @@ static int page_referenced_anon(struct page *page,
732 struct anon_vma_chain *avc; 744 struct anon_vma_chain *avc;
733 int referenced = 0; 745 int referenced = 0;
734 746
735 anon_vma = page_lock_anon_vma(page); 747 anon_vma = page_lock_anon_vma_read(page);
736 if (!anon_vma) 748 if (!anon_vma)
737 return referenced; 749 return referenced;
738 750
@@ -754,7 +766,7 @@ static int page_referenced_anon(struct page *page,
754 break; 766 break;
755 } 767 }
756 768
757 page_unlock_anon_vma(anon_vma); 769 page_unlock_anon_vma_read(anon_vma);
758 return referenced; 770 return referenced;
759} 771}
760 772
@@ -1139,9 +1151,11 @@ void page_remove_rmap(struct page *page)
1139 * containing the swap entry, but page not yet written to swap. 1151 * containing the swap entry, but page not yet written to swap.
1140 * 1152 *
1141 * And we can skip it on file pages, so long as the filesystem 1153 * And we can skip it on file pages, so long as the filesystem
1142 * participates in dirty tracking; but need to catch shm and tmpfs 1154 * participates in dirty tracking (note that this is not only an
1143 * and ramfs pages which have been modified since creation by read 1155 * optimization but also solves problems caused by dirty flag in
1144 * fault. 1156 * storage key getting set by a write from inside kernel); but need to
1157 * catch shm and tmpfs and ramfs pages which have been modified since
1158 * creation by read fault.
1145 * 1159 *
1146 * Note that mapping must be decided above, before decrementing 1160 * Note that mapping must be decided above, before decrementing
1147 * mapcount (which luckily provides a barrier): once page is unmapped, 1161 * mapcount (which luckily provides a barrier): once page is unmapped,
@@ -1235,12 +1249,14 @@ int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
1235 update_hiwater_rss(mm); 1249 update_hiwater_rss(mm);
1236 1250
1237 if (PageHWPoison(page) && !(flags & TTU_IGNORE_HWPOISON)) { 1251 if (PageHWPoison(page) && !(flags & TTU_IGNORE_HWPOISON)) {
1238 if (PageAnon(page)) 1252 if (!PageHuge(page)) {
1239 dec_mm_counter(mm, MM_ANONPAGES); 1253 if (PageAnon(page))
1240 else 1254 dec_mm_counter(mm, MM_ANONPAGES);
1241 dec_mm_counter(mm, MM_FILEPAGES); 1255 else
1256 dec_mm_counter(mm, MM_FILEPAGES);
1257 }
1242 set_pte_at(mm, address, pte, 1258 set_pte_at(mm, address, pte,
1243 swp_entry_to_pte(make_hwpoison_entry(page))); 1259 swp_entry_to_pte(make_hwpoison_entry(page)));
1244 } else if (PageAnon(page)) { 1260 } else if (PageAnon(page)) {
1245 swp_entry_t entry = { .val = page_private(page) }; 1261 swp_entry_t entry = { .val = page_private(page) };
1246 1262
@@ -1299,7 +1315,7 @@ out_mlock:
1299 /* 1315 /*
1300 * We need mmap_sem locking, Otherwise VM_LOCKED check makes 1316 * We need mmap_sem locking, Otherwise VM_LOCKED check makes
1301 * unstable result and race. Plus, We can't wait here because 1317 * unstable result and race. Plus, We can't wait here because
1302 * we now hold anon_vma->mutex or mapping->i_mmap_mutex. 1318 * we now hold anon_vma->rwsem or mapping->i_mmap_mutex.
1303 * if trylock failed, the page remain in evictable lru and later 1319 * if trylock failed, the page remain in evictable lru and later
1304 * vmscan could retry to move the page to unevictable lru if the 1320 * vmscan could retry to move the page to unevictable lru if the
1305 * page is actually mlocked. 1321 * page is actually mlocked.
@@ -1345,8 +1361,6 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount,
1345 struct vm_area_struct *vma, struct page *check_page) 1361 struct vm_area_struct *vma, struct page *check_page)
1346{ 1362{
1347 struct mm_struct *mm = vma->vm_mm; 1363 struct mm_struct *mm = vma->vm_mm;
1348 pgd_t *pgd;
1349 pud_t *pud;
1350 pmd_t *pmd; 1364 pmd_t *pmd;
1351 pte_t *pte; 1365 pte_t *pte;
1352 pte_t pteval; 1366 pte_t pteval;
@@ -1366,16 +1380,8 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount,
1366 if (end > vma->vm_end) 1380 if (end > vma->vm_end)
1367 end = vma->vm_end; 1381 end = vma->vm_end;
1368 1382
1369 pgd = pgd_offset(mm, address); 1383 pmd = mm_find_pmd(mm, address);
1370 if (!pgd_present(*pgd)) 1384 if (!pmd)
1371 return ret;
1372
1373 pud = pud_offset(pgd, address);
1374 if (!pud_present(*pud))
1375 return ret;
1376
1377 pmd = pmd_offset(pud, address);
1378 if (!pmd_present(*pmd))
1379 return ret; 1385 return ret;
1380 1386
1381 mmun_start = address; 1387 mmun_start = address;
@@ -1474,7 +1480,7 @@ static int try_to_unmap_anon(struct page *page, enum ttu_flags flags)
1474 struct anon_vma_chain *avc; 1480 struct anon_vma_chain *avc;
1475 int ret = SWAP_AGAIN; 1481 int ret = SWAP_AGAIN;
1476 1482
1477 anon_vma = page_lock_anon_vma(page); 1483 anon_vma = page_lock_anon_vma_read(page);
1478 if (!anon_vma) 1484 if (!anon_vma)
1479 return ret; 1485 return ret;
1480 1486
@@ -1501,7 +1507,7 @@ static int try_to_unmap_anon(struct page *page, enum ttu_flags flags)
1501 break; 1507 break;
1502 } 1508 }
1503 1509
1504 page_unlock_anon_vma(anon_vma); 1510 page_unlock_anon_vma_read(anon_vma);
1505 return ret; 1511 return ret;
1506} 1512}
1507 1513
@@ -1696,7 +1702,7 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *,
1696 int ret = SWAP_AGAIN; 1702 int ret = SWAP_AGAIN;
1697 1703
1698 /* 1704 /*
1699 * Note: remove_migration_ptes() cannot use page_lock_anon_vma() 1705 * Note: remove_migration_ptes() cannot use page_lock_anon_vma_read()
1700 * because that depends on page_mapped(); but not all its usages 1706 * because that depends on page_mapped(); but not all its usages
1701 * are holding mmap_sem. Users without mmap_sem are required to 1707 * are holding mmap_sem. Users without mmap_sem are required to
1702 * take a reference count to prevent the anon_vma disappearing 1708 * take a reference count to prevent the anon_vma disappearing
@@ -1704,7 +1710,7 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *,
1704 anon_vma = page_anon_vma(page); 1710 anon_vma = page_anon_vma(page);
1705 if (!anon_vma) 1711 if (!anon_vma)
1706 return ret; 1712 return ret;
1707 anon_vma_lock(anon_vma); 1713 anon_vma_lock_read(anon_vma);
1708 anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) { 1714 anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) {
1709 struct vm_area_struct *vma = avc->vma; 1715 struct vm_area_struct *vma = avc->vma;
1710 unsigned long address = vma_address(page, vma); 1716 unsigned long address = vma_address(page, vma);
@@ -1712,7 +1718,7 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *,
1712 if (ret != SWAP_AGAIN) 1718 if (ret != SWAP_AGAIN)
1713 break; 1719 break;
1714 } 1720 }
1715 anon_vma_unlock(anon_vma); 1721 anon_vma_unlock_read(anon_vma);
1716 return ret; 1722 return ret;
1717} 1723}
1718 1724
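The rmap.c hunks do two things: they factor the repeated pgd/pud/pmd descent into a shared mm_find_pmd() helper, and they convert anon_vma->mutex into a reader/writer semaphore, so reverse-map walkers such as page_lock_anon_vma_read() and rmap_walk_anon() take the lock shared while chain modifications still take it exclusive. A userspace pthreads analogue of that locking split, assuming toy types rather than the kernel's anon_vma:

/* Userspace analogue (pthreads, not kernel code) of the mutex -> rwsem
 * conversion: rmap walks lock shared, chain updates lock exclusive. */
#include <pthread.h>
#include <stdio.h>

struct toy_anon_vma {
	pthread_rwlock_t rwsem;	/* was a plain mutex before the change */
	int nr_vmas;
};

static void walk_rmap(struct toy_anon_vma *av)	/* page_lock_anon_vma_read() side */
{
	pthread_rwlock_rdlock(&av->rwsem);
	printf("walking %d vmas; other readers may run concurrently\n", av->nr_vmas);
	pthread_rwlock_unlock(&av->rwsem);
}

static void link_vma(struct toy_anon_vma *av)	/* anon_vma_lock_write() side */
{
	pthread_rwlock_wrlock(&av->rwsem);
	av->nr_vmas++;	/* exclusive: the interval tree / chain is being modified */
	pthread_rwlock_unlock(&av->rwsem);
}

static void *reader(void *arg)
{
	walk_rmap(arg);
	return NULL;
}

int main(void)
{
	struct toy_anon_vma av = { .rwsem = PTHREAD_RWLOCK_INITIALIZER };
	pthread_t t[4];

	link_vma(&av);
	for (int i = 0; i < 4; i++)
		pthread_create(&t[i], NULL, reader, &av);
	for (int i = 0; i < 4; i++)
		pthread_join(t[i], NULL);
	return 0;
}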
diff --git a/mm/shmem.c b/mm/shmem.c
index 50c5b8f3a359..5dd56f6efdbd 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -889,7 +889,7 @@ static void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol)
889 if (!mpol || mpol->mode == MPOL_DEFAULT) 889 if (!mpol || mpol->mode == MPOL_DEFAULT)
890 return; /* show nothing */ 890 return; /* show nothing */
891 891
892 mpol_to_str(buffer, sizeof(buffer), mpol, 1); 892 mpol_to_str(buffer, sizeof(buffer), mpol);
893 893
894 seq_printf(seq, ",mpol=%s", buffer); 894 seq_printf(seq, ",mpol=%s", buffer);
895} 895}
@@ -1715,6 +1715,96 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos,
1715 return error; 1715 return error;
1716} 1716}
1717 1717
1718/*
1719 * llseek SEEK_DATA or SEEK_HOLE through the radix_tree.
1720 */
1721static pgoff_t shmem_seek_hole_data(struct address_space *mapping,
1722 pgoff_t index, pgoff_t end, int whence)
1723{
1724 struct page *page;
1725 struct pagevec pvec;
1726 pgoff_t indices[PAGEVEC_SIZE];
1727 bool done = false;
1728 int i;
1729
1730 pagevec_init(&pvec, 0);
1731 pvec.nr = 1; /* start small: we may be there already */
1732 while (!done) {
1733 pvec.nr = shmem_find_get_pages_and_swap(mapping, index,
1734 pvec.nr, pvec.pages, indices);
1735 if (!pvec.nr) {
1736 if (whence == SEEK_DATA)
1737 index = end;
1738 break;
1739 }
1740 for (i = 0; i < pvec.nr; i++, index++) {
1741 if (index < indices[i]) {
1742 if (whence == SEEK_HOLE) {
1743 done = true;
1744 break;
1745 }
1746 index = indices[i];
1747 }
1748 page = pvec.pages[i];
1749 if (page && !radix_tree_exceptional_entry(page)) {
1750 if (!PageUptodate(page))
1751 page = NULL;
1752 }
1753 if (index >= end ||
1754 (page && whence == SEEK_DATA) ||
1755 (!page && whence == SEEK_HOLE)) {
1756 done = true;
1757 break;
1758 }
1759 }
1760 shmem_deswap_pagevec(&pvec);
1761 pagevec_release(&pvec);
1762 pvec.nr = PAGEVEC_SIZE;
1763 cond_resched();
1764 }
1765 return index;
1766}
1767
1768static loff_t shmem_file_llseek(struct file *file, loff_t offset, int whence)
1769{
1770 struct address_space *mapping = file->f_mapping;
1771 struct inode *inode = mapping->host;
1772 pgoff_t start, end;
1773 loff_t new_offset;
1774
1775 if (whence != SEEK_DATA && whence != SEEK_HOLE)
1776 return generic_file_llseek_size(file, offset, whence,
1777 MAX_LFS_FILESIZE, i_size_read(inode));
1778 mutex_lock(&inode->i_mutex);
1779 /* We're holding i_mutex so we can access i_size directly */
1780
1781 if (offset < 0)
1782 offset = -EINVAL;
1783 else if (offset >= inode->i_size)
1784 offset = -ENXIO;
1785 else {
1786 start = offset >> PAGE_CACHE_SHIFT;
1787 end = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
1788 new_offset = shmem_seek_hole_data(mapping, start, end, whence);
1789 new_offset <<= PAGE_CACHE_SHIFT;
1790 if (new_offset > offset) {
1791 if (new_offset < inode->i_size)
1792 offset = new_offset;
1793 else if (whence == SEEK_DATA)
1794 offset = -ENXIO;
1795 else
1796 offset = inode->i_size;
1797 }
1798 }
1799
1800 if (offset >= 0 && offset != file->f_pos) {
1801 file->f_pos = offset;
1802 file->f_version = 0;
1803 }
1804 mutex_unlock(&inode->i_mutex);
1805 return offset;
1806}
1807
1718static long shmem_fallocate(struct file *file, int mode, loff_t offset, 1808static long shmem_fallocate(struct file *file, int mode, loff_t offset,
1719 loff_t len) 1809 loff_t len)
1720{ 1810{
@@ -2373,7 +2463,7 @@ static int shmem_parse_options(char *options, struct shmem_sb_info *sbinfo,
2373 if (!gid_valid(sbinfo->gid)) 2463 if (!gid_valid(sbinfo->gid))
2374 goto bad_val; 2464 goto bad_val;
2375 } else if (!strcmp(this_char,"mpol")) { 2465 } else if (!strcmp(this_char,"mpol")) {
2376 if (mpol_parse_str(value, &sbinfo->mpol, 1)) 2466 if (mpol_parse_str(value, &sbinfo->mpol))
2377 goto bad_val; 2467 goto bad_val;
2378 } else { 2468 } else {
2379 printk(KERN_ERR "tmpfs: Bad mount option %s\n", 2469 printk(KERN_ERR "tmpfs: Bad mount option %s\n",
@@ -2586,7 +2676,7 @@ static const struct address_space_operations shmem_aops = {
2586static const struct file_operations shmem_file_operations = { 2676static const struct file_operations shmem_file_operations = {
2587 .mmap = shmem_mmap, 2677 .mmap = shmem_mmap,
2588#ifdef CONFIG_TMPFS 2678#ifdef CONFIG_TMPFS
2589 .llseek = generic_file_llseek, 2679 .llseek = shmem_file_llseek,
2590 .read = do_sync_read, 2680 .read = do_sync_read,
2591 .write = do_sync_write, 2681 .write = do_sync_write,
2592 .aio_read = shmem_file_aio_read, 2682 .aio_read = shmem_file_aio_read,
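The shmem.c hunks add shmem_file_llseek(), which implements SEEK_DATA/SEEK_HOLE by scanning the radix tree for present, uptodate pages, instead of the generic llseek that reported the whole file as data. The user-visible effect can be checked with plain lseek(2); the sketch below assumes a /dev/shm path (any tmpfs mount works) and a kernel carrying this change:

/* Exercise tmpfs SEEK_DATA / SEEK_HOLE from userspace. */
#define _GNU_SOURCE		/* for SEEK_DATA / SEEK_HOLE */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	/* Path is an assumption: any file on a tmpfs mount will do. */
	const char *path = "/dev/shm/seek-demo";
	int fd = open(path, O_CREAT | O_RDWR | O_TRUNC, 0600);
	if (fd < 0) {
		perror("open");
		return 1;
	}

	/* Write one page at offset 1 MiB, leaving a leading hole. */
	char buf[4096] = { 'x' };
	pwrite(fd, buf, sizeof(buf), 1 << 20);

	off_t data = lseek(fd, 0, SEEK_DATA);		/* first data after 0     */
	off_t hole = lseek(fd, data, SEEK_HOLE);	/* first hole after that  */
	printf("data at %lld, next hole at %lld\n",
	       (long long)data, (long long)hole);

	close(fd);
	unlink(path);
	return 0;
}

With this patch the reported data offset lands at the written page rather than at 0, since the leading hole is no longer reported as data.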
diff --git a/mm/slab.c b/mm/slab.c
index 33d3363658df..e7667a3584bc 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -87,7 +87,6 @@
87 */ 87 */
88 88
89#include <linux/slab.h> 89#include <linux/slab.h>
90#include "slab.h"
91#include <linux/mm.h> 90#include <linux/mm.h>
92#include <linux/poison.h> 91#include <linux/poison.h>
93#include <linux/swap.h> 92#include <linux/swap.h>
@@ -128,6 +127,8 @@
128 127
129#include "internal.h" 128#include "internal.h"
130 129
130#include "slab.h"
131
131/* 132/*
132 * DEBUG - 1 for kmem_cache_create() to honour; SLAB_RED_ZONE & SLAB_POISON. 133 * DEBUG - 1 for kmem_cache_create() to honour; SLAB_RED_ZONE & SLAB_POISON.
133 * 0 for faster, smaller code (especially in the critical paths). 134 * 0 for faster, smaller code (especially in the critical paths).
@@ -162,23 +163,6 @@
162 */ 163 */
163static bool pfmemalloc_active __read_mostly; 164static bool pfmemalloc_active __read_mostly;
164 165
165/* Legal flag mask for kmem_cache_create(). */
166#if DEBUG
167# define CREATE_MASK (SLAB_RED_ZONE | \
168 SLAB_POISON | SLAB_HWCACHE_ALIGN | \
169 SLAB_CACHE_DMA | \
170 SLAB_STORE_USER | \
171 SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \
172 SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD | \
173 SLAB_DEBUG_OBJECTS | SLAB_NOLEAKTRACE | SLAB_NOTRACK)
174#else
175# define CREATE_MASK (SLAB_HWCACHE_ALIGN | \
176 SLAB_CACHE_DMA | \
177 SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \
178 SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD | \
179 SLAB_DEBUG_OBJECTS | SLAB_NOLEAKTRACE | SLAB_NOTRACK)
180#endif
181
182/* 166/*
183 * kmem_bufctl_t: 167 * kmem_bufctl_t:
184 * 168 *
@@ -564,15 +548,11 @@ static struct cache_names __initdata cache_names[] = {
564#undef CACHE 548#undef CACHE
565}; 549};
566 550
567static struct arraycache_init initarray_cache __initdata =
568 { {0, BOOT_CPUCACHE_ENTRIES, 1, 0} };
569static struct arraycache_init initarray_generic = 551static struct arraycache_init initarray_generic =
570 { {0, BOOT_CPUCACHE_ENTRIES, 1, 0} }; 552 { {0, BOOT_CPUCACHE_ENTRIES, 1, 0} };
571 553
572/* internal cache of cache description objs */ 554/* internal cache of cache description objs */
573static struct kmem_list3 *kmem_cache_nodelists[MAX_NUMNODES];
574static struct kmem_cache kmem_cache_boot = { 555static struct kmem_cache kmem_cache_boot = {
575 .nodelists = kmem_cache_nodelists,
576 .batchcount = 1, 556 .batchcount = 1,
577 .limit = BOOT_CPUCACHE_ENTRIES, 557 .limit = BOOT_CPUCACHE_ENTRIES,
578 .shared = 1, 558 .shared = 1,
@@ -662,6 +642,26 @@ static void init_node_lock_keys(int q)
662 } 642 }
663} 643}
664 644
645static void on_slab_lock_classes_node(struct kmem_cache *cachep, int q)
646{
647 struct kmem_list3 *l3;
648 l3 = cachep->nodelists[q];
649 if (!l3)
650 return;
651
652 slab_set_lock_classes(cachep, &on_slab_l3_key,
653 &on_slab_alc_key, q);
654}
655
656static inline void on_slab_lock_classes(struct kmem_cache *cachep)
657{
658 int node;
659
660 VM_BUG_ON(OFF_SLAB(cachep));
661 for_each_node(node)
662 on_slab_lock_classes_node(cachep, node);
663}
664
665static inline void init_lock_keys(void) 665static inline void init_lock_keys(void)
666{ 666{
667 int node; 667 int node;
@@ -678,6 +678,14 @@ static inline void init_lock_keys(void)
678{ 678{
679} 679}
680 680
681static inline void on_slab_lock_classes(struct kmem_cache *cachep)
682{
683}
684
685static inline void on_slab_lock_classes_node(struct kmem_cache *cachep, int node)
686{
687}
688
681static void slab_set_debugobj_lock_classes_node(struct kmem_cache *cachep, int node) 689static void slab_set_debugobj_lock_classes_node(struct kmem_cache *cachep, int node)
682{ 690{
683} 691}
@@ -1406,6 +1414,9 @@ static int __cpuinit cpuup_prepare(long cpu)
1406 free_alien_cache(alien); 1414 free_alien_cache(alien);
1407 if (cachep->flags & SLAB_DEBUG_OBJECTS) 1415 if (cachep->flags & SLAB_DEBUG_OBJECTS)
1408 slab_set_debugobj_lock_classes_node(cachep, node); 1416 slab_set_debugobj_lock_classes_node(cachep, node);
1417 else if (!OFF_SLAB(cachep) &&
1418 !(cachep->flags & SLAB_DESTROY_BY_RCU))
1419 on_slab_lock_classes_node(cachep, node);
1409 } 1420 }
1410 init_node_lock_keys(node); 1421 init_node_lock_keys(node);
1411 1422
@@ -1577,28 +1588,33 @@ static void __init set_up_list3s(struct kmem_cache *cachep, int index)
1577} 1588}
1578 1589
1579/* 1590/*
1591 * The memory after the last cpu cache pointer is used for the
1592 * the nodelists pointer.
1593 */
1594static void setup_nodelists_pointer(struct kmem_cache *cachep)
1595{
1596 cachep->nodelists = (struct kmem_list3 **)&cachep->array[nr_cpu_ids];
1597}
1598
1599/*
1580 * Initialisation. Called after the page allocator have been initialised and 1600 * Initialisation. Called after the page allocator have been initialised and
1581 * before smp_init(). 1601 * before smp_init().
1582 */ 1602 */
1583void __init kmem_cache_init(void) 1603void __init kmem_cache_init(void)
1584{ 1604{
1585 size_t left_over;
1586 struct cache_sizes *sizes; 1605 struct cache_sizes *sizes;
1587 struct cache_names *names; 1606 struct cache_names *names;
1588 int i; 1607 int i;
1589 int order;
1590 int node;
1591 1608
1592 kmem_cache = &kmem_cache_boot; 1609 kmem_cache = &kmem_cache_boot;
1610 setup_nodelists_pointer(kmem_cache);
1593 1611
1594 if (num_possible_nodes() == 1) 1612 if (num_possible_nodes() == 1)
1595 use_alien_caches = 0; 1613 use_alien_caches = 0;
1596 1614
1597 for (i = 0; i < NUM_INIT_LISTS; i++) { 1615 for (i = 0; i < NUM_INIT_LISTS; i++)
1598 kmem_list3_init(&initkmem_list3[i]); 1616 kmem_list3_init(&initkmem_list3[i]);
1599 if (i < MAX_NUMNODES) 1617
1600 kmem_cache->nodelists[i] = NULL;
1601 }
1602 set_up_list3s(kmem_cache, CACHE_CACHE); 1618 set_up_list3s(kmem_cache, CACHE_CACHE);
1603 1619
1604 /* 1620 /*
@@ -1629,37 +1645,16 @@ void __init kmem_cache_init(void)
1629 * 6) Resize the head arrays of the kmalloc caches to their final sizes. 1645 * 6) Resize the head arrays of the kmalloc caches to their final sizes.
1630 */ 1646 */
1631 1647
1632 node = numa_mem_id();
1633
1634 /* 1) create the kmem_cache */ 1648 /* 1) create the kmem_cache */
1635 INIT_LIST_HEAD(&slab_caches);
1636 list_add(&kmem_cache->list, &slab_caches);
1637 kmem_cache->colour_off = cache_line_size();
1638 kmem_cache->array[smp_processor_id()] = &initarray_cache.cache;
1639 kmem_cache->nodelists[node] = &initkmem_list3[CACHE_CACHE + node];
1640 1649
1641 /* 1650 /*
1642 * struct kmem_cache size depends on nr_node_ids & nr_cpu_ids 1651 * struct kmem_cache size depends on nr_node_ids & nr_cpu_ids
1643 */ 1652 */
1644 kmem_cache->size = offsetof(struct kmem_cache, array[nr_cpu_ids]) + 1653 create_boot_cache(kmem_cache, "kmem_cache",
1645 nr_node_ids * sizeof(struct kmem_list3 *); 1654 offsetof(struct kmem_cache, array[nr_cpu_ids]) +
1646 kmem_cache->object_size = kmem_cache->size; 1655 nr_node_ids * sizeof(struct kmem_list3 *),
1647 kmem_cache->size = ALIGN(kmem_cache->object_size, 1656 SLAB_HWCACHE_ALIGN);
1648 cache_line_size()); 1657 list_add(&kmem_cache->list, &slab_caches);
1649 kmem_cache->reciprocal_buffer_size =
1650 reciprocal_value(kmem_cache->size);
1651
1652 for (order = 0; order < MAX_ORDER; order++) {
1653 cache_estimate(order, kmem_cache->size,
1654 cache_line_size(), 0, &left_over, &kmem_cache->num);
1655 if (kmem_cache->num)
1656 break;
1657 }
1658 BUG_ON(!kmem_cache->num);
1659 kmem_cache->gfporder = order;
1660 kmem_cache->colour = left_over / kmem_cache->colour_off;
1661 kmem_cache->slab_size = ALIGN(kmem_cache->num * sizeof(kmem_bufctl_t) +
1662 sizeof(struct slab), cache_line_size());
1663 1658
1664 /* 2+3) create the kmalloc caches */ 1659 /* 2+3) create the kmalloc caches */
1665 sizes = malloc_sizes; 1660 sizes = malloc_sizes;
@@ -1671,23 +1666,13 @@ void __init kmem_cache_init(void)
1671 * bug. 1666 * bug.
1672 */ 1667 */
1673 1668
1674 sizes[INDEX_AC].cs_cachep = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT); 1669 sizes[INDEX_AC].cs_cachep = create_kmalloc_cache(names[INDEX_AC].name,
1675 sizes[INDEX_AC].cs_cachep->name = names[INDEX_AC].name; 1670 sizes[INDEX_AC].cs_size, ARCH_KMALLOC_FLAGS);
1676 sizes[INDEX_AC].cs_cachep->size = sizes[INDEX_AC].cs_size; 1671
1677 sizes[INDEX_AC].cs_cachep->object_size = sizes[INDEX_AC].cs_size; 1672 if (INDEX_AC != INDEX_L3)
1678 sizes[INDEX_AC].cs_cachep->align = ARCH_KMALLOC_MINALIGN; 1673 sizes[INDEX_L3].cs_cachep =
1679 __kmem_cache_create(sizes[INDEX_AC].cs_cachep, ARCH_KMALLOC_FLAGS|SLAB_PANIC); 1674 create_kmalloc_cache(names[INDEX_L3].name,
1680 list_add(&sizes[INDEX_AC].cs_cachep->list, &slab_caches); 1675 sizes[INDEX_L3].cs_size, ARCH_KMALLOC_FLAGS);
1681
1682 if (INDEX_AC != INDEX_L3) {
1683 sizes[INDEX_L3].cs_cachep = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT);
1684 sizes[INDEX_L3].cs_cachep->name = names[INDEX_L3].name;
1685 sizes[INDEX_L3].cs_cachep->size = sizes[INDEX_L3].cs_size;
1686 sizes[INDEX_L3].cs_cachep->object_size = sizes[INDEX_L3].cs_size;
1687 sizes[INDEX_L3].cs_cachep->align = ARCH_KMALLOC_MINALIGN;
1688 __kmem_cache_create(sizes[INDEX_L3].cs_cachep, ARCH_KMALLOC_FLAGS|SLAB_PANIC);
1689 list_add(&sizes[INDEX_L3].cs_cachep->list, &slab_caches);
1690 }
1691 1676
1692 slab_early_init = 0; 1677 slab_early_init = 0;
1693 1678
@@ -1699,24 +1684,14 @@ void __init kmem_cache_init(void)
1699 * Note for systems short on memory removing the alignment will 1684 * Note for systems short on memory removing the alignment will
1700 * allow tighter packing of the smaller caches. 1685 * allow tighter packing of the smaller caches.
1701 */ 1686 */
1702 if (!sizes->cs_cachep) { 1687 if (!sizes->cs_cachep)
1703 sizes->cs_cachep = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT); 1688 sizes->cs_cachep = create_kmalloc_cache(names->name,
1704 sizes->cs_cachep->name = names->name; 1689 sizes->cs_size, ARCH_KMALLOC_FLAGS);
1705 sizes->cs_cachep->size = sizes->cs_size; 1690
1706 sizes->cs_cachep->object_size = sizes->cs_size;
1707 sizes->cs_cachep->align = ARCH_KMALLOC_MINALIGN;
1708 __kmem_cache_create(sizes->cs_cachep, ARCH_KMALLOC_FLAGS|SLAB_PANIC);
1709 list_add(&sizes->cs_cachep->list, &slab_caches);
1710 }
1711#ifdef CONFIG_ZONE_DMA 1691#ifdef CONFIG_ZONE_DMA
1712 sizes->cs_dmacachep = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT); 1692 sizes->cs_dmacachep = create_kmalloc_cache(
1713 sizes->cs_dmacachep->name = names->name_dma; 1693 names->name_dma, sizes->cs_size,
1714 sizes->cs_dmacachep->size = sizes->cs_size; 1694 SLAB_CACHE_DMA|ARCH_KMALLOC_FLAGS);
1715 sizes->cs_dmacachep->object_size = sizes->cs_size;
1716 sizes->cs_dmacachep->align = ARCH_KMALLOC_MINALIGN;
1717 __kmem_cache_create(sizes->cs_dmacachep,
1718 ARCH_KMALLOC_FLAGS|SLAB_CACHE_DMA| SLAB_PANIC);
1719 list_add(&sizes->cs_dmacachep->list, &slab_caches);
1720#endif 1695#endif
1721 sizes++; 1696 sizes++;
1722 names++; 1697 names++;
@@ -1727,7 +1702,6 @@ void __init kmem_cache_init(void)
1727 1702
1728 ptr = kmalloc(sizeof(struct arraycache_init), GFP_NOWAIT); 1703 ptr = kmalloc(sizeof(struct arraycache_init), GFP_NOWAIT);
1729 1704
1730 BUG_ON(cpu_cache_get(kmem_cache) != &initarray_cache.cache);
1731 memcpy(ptr, cpu_cache_get(kmem_cache), 1705 memcpy(ptr, cpu_cache_get(kmem_cache),
1732 sizeof(struct arraycache_init)); 1706 sizeof(struct arraycache_init));
1733 /* 1707 /*
@@ -1921,6 +1895,7 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid)
1921 if (page->pfmemalloc) 1895 if (page->pfmemalloc)
1922 SetPageSlabPfmemalloc(page + i); 1896 SetPageSlabPfmemalloc(page + i);
1923 } 1897 }
1898 memcg_bind_pages(cachep, cachep->gfporder);
1924 1899
1925 if (kmemcheck_enabled && !(cachep->flags & SLAB_NOTRACK)) { 1900 if (kmemcheck_enabled && !(cachep->flags & SLAB_NOTRACK)) {
1926 kmemcheck_alloc_shadow(page, cachep->gfporder, flags, nodeid); 1901 kmemcheck_alloc_shadow(page, cachep->gfporder, flags, nodeid);
@@ -1957,9 +1932,11 @@ static void kmem_freepages(struct kmem_cache *cachep, void *addr)
1957 __ClearPageSlab(page); 1932 __ClearPageSlab(page);
1958 page++; 1933 page++;
1959 } 1934 }
1935
1936 memcg_release_pages(cachep, cachep->gfporder);
1960 if (current->reclaim_state) 1937 if (current->reclaim_state)
1961 current->reclaim_state->reclaimed_slab += nr_freed; 1938 current->reclaim_state->reclaimed_slab += nr_freed;
1962 free_pages((unsigned long)addr, cachep->gfporder); 1939 free_memcg_kmem_pages((unsigned long)addr, cachep->gfporder);
1963} 1940}
1964 1941
1965static void kmem_rcu_free(struct rcu_head *head) 1942static void kmem_rcu_free(struct rcu_head *head)
@@ -2282,7 +2259,15 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp)
2282 2259
2283 if (slab_state == DOWN) { 2260 if (slab_state == DOWN) {
2284 /* 2261 /*
2285 * Note: the first kmem_cache_create must create the cache 2262 * Note: Creation of first cache (kmem_cache).
2263 * The setup_list3s is taken care
2264 * of by the caller of __kmem_cache_create
2265 */
2266 cachep->array[smp_processor_id()] = &initarray_generic.cache;
2267 slab_state = PARTIAL;
2268 } else if (slab_state == PARTIAL) {
2269 /*
2270 * Note: the second kmem_cache_create must create the cache
2286 * that's used by kmalloc(24), otherwise the creation of 2271 * that's used by kmalloc(24), otherwise the creation of
2287 * further caches will BUG(). 2272 * further caches will BUG().
2288 */ 2273 */
@@ -2290,7 +2275,7 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp)
2290 2275
2291 /* 2276 /*
2292 * If the cache that's used by kmalloc(sizeof(kmem_list3)) is 2277 * If the cache that's used by kmalloc(sizeof(kmem_list3)) is
2293 * the first cache, then we need to set up all its list3s, 2278 * the second cache, then we need to set up all its list3s,
2294 * otherwise the creation of further caches will BUG(). 2279 * otherwise the creation of further caches will BUG().
2295 */ 2280 */
2296 set_up_list3s(cachep, SIZE_AC); 2281 set_up_list3s(cachep, SIZE_AC);
@@ -2299,6 +2284,7 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp)
2299 else 2284 else
2300 slab_state = PARTIAL_ARRAYCACHE; 2285 slab_state = PARTIAL_ARRAYCACHE;
2301 } else { 2286 } else {
2287 /* Remaining boot caches */
2302 cachep->array[smp_processor_id()] = 2288 cachep->array[smp_processor_id()] =
2303 kmalloc(sizeof(struct arraycache_init), gfp); 2289 kmalloc(sizeof(struct arraycache_init), gfp);
2304 2290
@@ -2331,11 +2317,8 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp)
2331 2317
2332/** 2318/**
2333 * __kmem_cache_create - Create a cache. 2319 * __kmem_cache_create - Create a cache.
2334 * @name: A string which is used in /proc/slabinfo to identify this cache. 2320 * @cachep: cache management descriptor
2335 * @size: The size of objects to be created in this cache.
2336 * @align: The required alignment for the objects.
2337 * @flags: SLAB flags 2321 * @flags: SLAB flags
2338 * @ctor: A constructor for the objects.
2339 * 2322 *
2340 * Returns a ptr to the cache on success, NULL on failure. 2323 * Returns a ptr to the cache on success, NULL on failure.
2341 * Cannot be called within a int, but can be interrupted. 2324 * Cannot be called within a int, but can be interrupted.
@@ -2378,11 +2361,6 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags)
2378 if (flags & SLAB_DESTROY_BY_RCU) 2361 if (flags & SLAB_DESTROY_BY_RCU)
2379 BUG_ON(flags & SLAB_POISON); 2362 BUG_ON(flags & SLAB_POISON);
2380#endif 2363#endif
2381 /*
2382 * Always checks flags, a caller might be expecting debug support which
2383 * isn't available.
2384 */
2385 BUG_ON(flags & ~CREATE_MASK);
2386 2364
2387 /* 2365 /*
2388 * Check that size is in terms of words. This is needed to avoid 2366 * Check that size is in terms of words. This is needed to avoid
@@ -2394,22 +2372,6 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags)
2394 size &= ~(BYTES_PER_WORD - 1); 2372 size &= ~(BYTES_PER_WORD - 1);
2395 } 2373 }
2396 2374
2397 /* calculate the final buffer alignment: */
2398
2399 /* 1) arch recommendation: can be overridden for debug */
2400 if (flags & SLAB_HWCACHE_ALIGN) {
2401 /*
2402 * Default alignment: as specified by the arch code. Except if
2403 * an object is really small, then squeeze multiple objects into
2404 * one cacheline.
2405 */
2406 ralign = cache_line_size();
2407 while (size <= ralign / 2)
2408 ralign /= 2;
2409 } else {
2410 ralign = BYTES_PER_WORD;
2411 }
2412
2413 /* 2375 /*
2414 * Redzoning and user store require word alignment or possibly larger. 2376 * Redzoning and user store require word alignment or possibly larger.
2415 * Note this will be overridden by architecture or caller mandated 2377 * Note this will be overridden by architecture or caller mandated
@@ -2426,10 +2388,6 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags)
2426 size &= ~(REDZONE_ALIGN - 1); 2388 size &= ~(REDZONE_ALIGN - 1);
2427 } 2389 }
2428 2390
2429 /* 2) arch mandated alignment */
2430 if (ralign < ARCH_SLAB_MINALIGN) {
2431 ralign = ARCH_SLAB_MINALIGN;
2432 }
2433 /* 3) caller mandated alignment */ 2391 /* 3) caller mandated alignment */
2434 if (ralign < cachep->align) { 2392 if (ralign < cachep->align) {
2435 ralign = cachep->align; 2393 ralign = cachep->align;
@@ -2447,7 +2405,7 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags)
2447 else 2405 else
2448 gfp = GFP_NOWAIT; 2406 gfp = GFP_NOWAIT;
2449 2407
2450 cachep->nodelists = (struct kmem_list3 **)&cachep->array[nr_cpu_ids]; 2408 setup_nodelists_pointer(cachep);
2451#if DEBUG 2409#if DEBUG
2452 2410
2453 /* 2411 /*
@@ -2566,7 +2524,8 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags)
2566 WARN_ON_ONCE(flags & SLAB_DESTROY_BY_RCU); 2524 WARN_ON_ONCE(flags & SLAB_DESTROY_BY_RCU);
2567 2525
2568 slab_set_debugobj_lock_classes(cachep); 2526 slab_set_debugobj_lock_classes(cachep);
2569 } 2527 } else if (!OFF_SLAB(cachep) && !(flags & SLAB_DESTROY_BY_RCU))
2528 on_slab_lock_classes(cachep);
2570 2529
2571 return 0; 2530 return 0;
2572} 2531}
@@ -3530,6 +3489,8 @@ slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid,
3530 if (slab_should_failslab(cachep, flags)) 3489 if (slab_should_failslab(cachep, flags))
3531 return NULL; 3490 return NULL;
3532 3491
3492 cachep = memcg_kmem_get_cache(cachep, flags);
3493
3533 cache_alloc_debugcheck_before(cachep, flags); 3494 cache_alloc_debugcheck_before(cachep, flags);
3534 local_irq_save(save_flags); 3495 local_irq_save(save_flags);
3535 3496
@@ -3615,6 +3576,8 @@ slab_alloc(struct kmem_cache *cachep, gfp_t flags, unsigned long caller)
3615 if (slab_should_failslab(cachep, flags)) 3576 if (slab_should_failslab(cachep, flags))
3616 return NULL; 3577 return NULL;
3617 3578
3579 cachep = memcg_kmem_get_cache(cachep, flags);
3580
3618 cache_alloc_debugcheck_before(cachep, flags); 3581 cache_alloc_debugcheck_before(cachep, flags);
3619 local_irq_save(save_flags); 3582 local_irq_save(save_flags);
3620 objp = __do_cache_alloc(cachep, flags); 3583 objp = __do_cache_alloc(cachep, flags);
@@ -3928,6 +3891,9 @@ EXPORT_SYMBOL(__kmalloc);
3928void kmem_cache_free(struct kmem_cache *cachep, void *objp) 3891void kmem_cache_free(struct kmem_cache *cachep, void *objp)
3929{ 3892{
3930 unsigned long flags; 3893 unsigned long flags;
3894 cachep = cache_from_obj(cachep, objp);
3895 if (!cachep)
3896 return;
3931 3897
3932 local_irq_save(flags); 3898 local_irq_save(flags);
3933 debug_check_no_locks_freed(objp, cachep->object_size); 3899 debug_check_no_locks_freed(objp, cachep->object_size);
@@ -3969,12 +3935,6 @@ void kfree(const void *objp)
3969} 3935}
3970EXPORT_SYMBOL(kfree); 3936EXPORT_SYMBOL(kfree);
3971 3937
3972unsigned int kmem_cache_size(struct kmem_cache *cachep)
3973{
3974 return cachep->object_size;
3975}
3976EXPORT_SYMBOL(kmem_cache_size);
3977
3978/* 3938/*
3979 * This initializes kmem_list3 or resizes various caches for all nodes. 3939 * This initializes kmem_list3 or resizes various caches for all nodes.
3980 */ 3940 */
@@ -4081,7 +4041,7 @@ static void do_ccupdate_local(void *info)
4081} 4041}
4082 4042
4083/* Always called with the slab_mutex held */ 4043/* Always called with the slab_mutex held */
4084static int do_tune_cpucache(struct kmem_cache *cachep, int limit, 4044static int __do_tune_cpucache(struct kmem_cache *cachep, int limit,
4085 int batchcount, int shared, gfp_t gfp) 4045 int batchcount, int shared, gfp_t gfp)
4086{ 4046{
4087 struct ccupdate_struct *new; 4047 struct ccupdate_struct *new;
@@ -4124,12 +4084,49 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
4124 return alloc_kmemlist(cachep, gfp); 4084 return alloc_kmemlist(cachep, gfp);
4125} 4085}
4126 4086
4087static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
4088 int batchcount, int shared, gfp_t gfp)
4089{
4090 int ret;
4091 struct kmem_cache *c = NULL;
4092 int i = 0;
4093
4094 ret = __do_tune_cpucache(cachep, limit, batchcount, shared, gfp);
4095
4096 if (slab_state < FULL)
4097 return ret;
4098
4099 if ((ret < 0) || !is_root_cache(cachep))
4100 return ret;
4101
4102 VM_BUG_ON(!mutex_is_locked(&slab_mutex));
4103 for_each_memcg_cache_index(i) {
4104 c = cache_from_memcg(cachep, i);
4105 if (c)
4106 /* return value determined by the parent cache only */
4107 __do_tune_cpucache(c, limit, batchcount, shared, gfp);
4108 }
4109
4110 return ret;
4111}
4112
4127/* Called with slab_mutex held always */ 4113/* Called with slab_mutex held always */
4128static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp) 4114static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp)
4129{ 4115{
4130 int err; 4116 int err;
4131 int limit, shared; 4117 int limit = 0;
4118 int shared = 0;
4119 int batchcount = 0;
4120
4121 if (!is_root_cache(cachep)) {
4122 struct kmem_cache *root = memcg_root_cache(cachep);
4123 limit = root->limit;
4124 shared = root->shared;
4125 batchcount = root->batchcount;
4126 }
4132 4127
4128 if (limit && shared && batchcount)
4129 goto skip_setup;
4133 /* 4130 /*
4134 * The head array serves three purposes: 4131 * The head array serves three purposes:
4135 * - create a LIFO ordering, i.e. return objects that are cache-warm 4132 * - create a LIFO ordering, i.e. return objects that are cache-warm
@@ -4171,7 +4168,9 @@ static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp)
4171 if (limit > 32) 4168 if (limit > 32)
4172 limit = 32; 4169 limit = 32;
4173#endif 4170#endif
4174 err = do_tune_cpucache(cachep, limit, (limit + 1) / 2, shared, gfp); 4171 batchcount = (limit + 1) / 2;
4172skip_setup:
4173 err = do_tune_cpucache(cachep, limit, batchcount, shared, gfp);
4175 if (err) 4174 if (err)
4176 printk(KERN_ERR "enable_cpucache failed for %s, error %d.\n", 4175 printk(KERN_ERR "enable_cpucache failed for %s, error %d.\n",
4177 cachep->name, -err); 4176 cachep->name, -err);
@@ -4276,54 +4275,8 @@ out:
4276} 4275}
4277 4276
4278#ifdef CONFIG_SLABINFO 4277#ifdef CONFIG_SLABINFO
4279 4278void get_slabinfo(struct kmem_cache *cachep, struct slabinfo *sinfo)
4280static void print_slabinfo_header(struct seq_file *m)
4281{
4282 /*
4283 * Output format version, so at least we can change it
4284 * without _too_ many complaints.
4285 */
4286#if STATS
4287 seq_puts(m, "slabinfo - version: 2.1 (statistics)\n");
4288#else
4289 seq_puts(m, "slabinfo - version: 2.1\n");
4290#endif
4291 seq_puts(m, "# name <active_objs> <num_objs> <objsize> "
4292 "<objperslab> <pagesperslab>");
4293 seq_puts(m, " : tunables <limit> <batchcount> <sharedfactor>");
4294 seq_puts(m, " : slabdata <active_slabs> <num_slabs> <sharedavail>");
4295#if STATS
4296 seq_puts(m, " : globalstat <listallocs> <maxobjs> <grown> <reaped> "
4297 "<error> <maxfreeable> <nodeallocs> <remotefrees> <alienoverflow>");
4298 seq_puts(m, " : cpustat <allochit> <allocmiss> <freehit> <freemiss>");
4299#endif
4300 seq_putc(m, '\n');
4301}
4302
4303static void *s_start(struct seq_file *m, loff_t *pos)
4304{
4305 loff_t n = *pos;
4306
4307 mutex_lock(&slab_mutex);
4308 if (!n)
4309 print_slabinfo_header(m);
4310
4311 return seq_list_start(&slab_caches, *pos);
4312}
4313
4314static void *s_next(struct seq_file *m, void *p, loff_t *pos)
4315{ 4279{
4316 return seq_list_next(p, &slab_caches, pos);
4317}
4318
4319static void s_stop(struct seq_file *m, void *p)
4320{
4321 mutex_unlock(&slab_mutex);
4322}
4323
4324static int s_show(struct seq_file *m, void *p)
4325{
4326 struct kmem_cache *cachep = list_entry(p, struct kmem_cache, list);
4327 struct slab *slabp; 4280 struct slab *slabp;
4328 unsigned long active_objs; 4281 unsigned long active_objs;
4329 unsigned long num_objs; 4282 unsigned long num_objs;
@@ -4378,13 +4331,20 @@ static int s_show(struct seq_file *m, void *p)
4378 if (error) 4331 if (error)
4379 printk(KERN_ERR "slab: cache %s error: %s\n", name, error); 4332 printk(KERN_ERR "slab: cache %s error: %s\n", name, error);
4380 4333
4381 seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d", 4334 sinfo->active_objs = active_objs;
4382 name, active_objs, num_objs, cachep->size, 4335 sinfo->num_objs = num_objs;
4383 cachep->num, (1 << cachep->gfporder)); 4336 sinfo->active_slabs = active_slabs;
4384 seq_printf(m, " : tunables %4u %4u %4u", 4337 sinfo->num_slabs = num_slabs;
4385 cachep->limit, cachep->batchcount, cachep->shared); 4338 sinfo->shared_avail = shared_avail;
4386 seq_printf(m, " : slabdata %6lu %6lu %6lu", 4339 sinfo->limit = cachep->limit;
4387 active_slabs, num_slabs, shared_avail); 4340 sinfo->batchcount = cachep->batchcount;
4341 sinfo->shared = cachep->shared;
4342 sinfo->objects_per_slab = cachep->num;
4343 sinfo->cache_order = cachep->gfporder;
4344}
4345
4346void slabinfo_show_stats(struct seq_file *m, struct kmem_cache *cachep)
4347{
4388#if STATS 4348#if STATS
4389 { /* list3 stats */ 4349 { /* list3 stats */
4390 unsigned long high = cachep->high_mark; 4350 unsigned long high = cachep->high_mark;
@@ -4414,31 +4374,8 @@ static int s_show(struct seq_file *m, void *p)
4414 allochit, allocmiss, freehit, freemiss); 4374 allochit, allocmiss, freehit, freemiss);
4415 } 4375 }
4416#endif 4376#endif
4417 seq_putc(m, '\n');
4418 return 0;
4419} 4377}
4420 4378
4421/*
4422 * slabinfo_op - iterator that generates /proc/slabinfo
4423 *
4424 * Output layout:
4425 * cache-name
4426 * num-active-objs
4427 * total-objs
4428 * object size
4429 * num-active-slabs
4430 * total-slabs
4431 * num-pages-per-slab
4432 * + further values on SMP and with statistics enabled
4433 */
4434
4435static const struct seq_operations slabinfo_op = {
4436 .start = s_start,
4437 .next = s_next,
4438 .stop = s_stop,
4439 .show = s_show,
4440};
4441
4442#define MAX_SLABINFO_WRITE 128 4379#define MAX_SLABINFO_WRITE 128
4443/** 4380/**
4444 * slabinfo_write - Tuning for the slab allocator 4381 * slabinfo_write - Tuning for the slab allocator
@@ -4447,7 +4384,7 @@ static const struct seq_operations slabinfo_op = {
4447 * @count: data length 4384 * @count: data length
4448 * @ppos: unused 4385 * @ppos: unused
4449 */ 4386 */
4450static ssize_t slabinfo_write(struct file *file, const char __user *buffer, 4387ssize_t slabinfo_write(struct file *file, const char __user *buffer,
4451 size_t count, loff_t *ppos) 4388 size_t count, loff_t *ppos)
4452{ 4389{
4453 char kbuf[MAX_SLABINFO_WRITE + 1], *tmp; 4390 char kbuf[MAX_SLABINFO_WRITE + 1], *tmp;
@@ -4490,19 +4427,6 @@ static ssize_t slabinfo_write(struct file *file, const char __user *buffer,
4490 return res; 4427 return res;
4491} 4428}
4492 4429
4493static int slabinfo_open(struct inode *inode, struct file *file)
4494{
4495 return seq_open(file, &slabinfo_op);
4496}
4497
4498static const struct file_operations proc_slabinfo_operations = {
4499 .open = slabinfo_open,
4500 .read = seq_read,
4501 .write = slabinfo_write,
4502 .llseek = seq_lseek,
4503 .release = seq_release,
4504};
4505
4506#ifdef CONFIG_DEBUG_SLAB_LEAK 4430#ifdef CONFIG_DEBUG_SLAB_LEAK
4507 4431
4508static void *leaks_start(struct seq_file *m, loff_t *pos) 4432static void *leaks_start(struct seq_file *m, loff_t *pos)
@@ -4631,6 +4555,16 @@ static int leaks_show(struct seq_file *m, void *p)
4631 return 0; 4555 return 0;
4632} 4556}
4633 4557
4558static void *s_next(struct seq_file *m, void *p, loff_t *pos)
4559{
4560 return seq_list_next(p, &slab_caches, pos);
4561}
4562
4563static void s_stop(struct seq_file *m, void *p)
4564{
4565 mutex_unlock(&slab_mutex);
4566}
4567
4634static const struct seq_operations slabstats_op = { 4568static const struct seq_operations slabstats_op = {
4635 .start = leaks_start, 4569 .start = leaks_start,
4636 .next = s_next, 4570 .next = s_next,
@@ -4665,7 +4599,6 @@ static const struct file_operations proc_slabstats_operations = {
4665 4599
4666static int __init slab_proc_init(void) 4600static int __init slab_proc_init(void)
4667{ 4601{
4668 proc_create("slabinfo",S_IWUSR|S_IRUSR,NULL,&proc_slabinfo_operations);
4669#ifdef CONFIG_DEBUG_SLAB_LEAK 4602#ifdef CONFIG_DEBUG_SLAB_LEAK
4670 proc_create("slab_allocators", 0, NULL, &proc_slabstats_operations); 4603 proc_create("slab_allocators", 0, NULL, &proc_slabstats_operations);
4671#endif 4604#endif
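For orientation, the net effect of the mm/slab.c hunks above is a split of the old s_show(): SLAB now only fills a plain struct slabinfo in get_slabinfo(), and the actual /proc/slabinfo formatting moves to common code. A minimal userspace sketch of that division of labour, assuming nothing beyond the field names visible in the patch (the numbers and main() are invented purely for illustration):

#include <stdio.h>

struct slabinfo {
	unsigned long active_objs, num_objs, active_slabs, num_slabs, shared_avail;
	unsigned int limit, batchcount, shared, objects_per_slab, cache_order;
};

/* Allocator half: fill in raw statistics, no formatting (compare get_slabinfo()). */
static void get_slabinfo_demo(struct slabinfo *sinfo)
{
	sinfo->active_objs = 120;
	sinfo->num_objs = 128;
	sinfo->active_slabs = 4;
	sinfo->num_slabs = 4;
	sinfo->shared_avail = 0;
	sinfo->limit = 54;
	sinfo->batchcount = 27;
	sinfo->shared = 8;
	sinfo->objects_per_slab = 32;
	sinfo->cache_order = 1;
}

/* Common half: turn the statistics into one slabinfo 2.1 line (compare cache_show()). */
static void cache_show_demo(const char *name, unsigned int objsize)
{
	struct slabinfo sinfo;

	get_slabinfo_demo(&sinfo);
	printf("%-17s %6lu %6lu %6u %4u %4d", name, sinfo.active_objs,
	       sinfo.num_objs, objsize, sinfo.objects_per_slab,
	       1 << sinfo.cache_order);
	printf(" : tunables %4u %4u %4u", sinfo.limit, sinfo.batchcount, sinfo.shared);
	printf(" : slabdata %6lu %6lu %6lu\n", sinfo.active_slabs,
	       sinfo.num_slabs, sinfo.shared_avail);
}

int main(void)
{
	cache_show_demo("dentry", 192);
	return 0;
}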
diff --git a/mm/slab.h b/mm/slab.h
index 7deeb449a301..34a98d642196 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -32,19 +32,201 @@ extern struct list_head slab_caches;
32/* The slab cache that manages slab cache information */ 32/* The slab cache that manages slab cache information */
33extern struct kmem_cache *kmem_cache; 33extern struct kmem_cache *kmem_cache;
34 34
35unsigned long calculate_alignment(unsigned long flags,
36 unsigned long align, unsigned long size);
37
35/* Functions provided by the slab allocators */ 38/* Functions provided by the slab allocators */
36extern int __kmem_cache_create(struct kmem_cache *, unsigned long flags); 39extern int __kmem_cache_create(struct kmem_cache *, unsigned long flags);
37 40
41extern struct kmem_cache *create_kmalloc_cache(const char *name, size_t size,
42 unsigned long flags);
43extern void create_boot_cache(struct kmem_cache *, const char *name,
44 size_t size, unsigned long flags);
45
46struct mem_cgroup;
38#ifdef CONFIG_SLUB 47#ifdef CONFIG_SLUB
39struct kmem_cache *__kmem_cache_alias(const char *name, size_t size, 48struct kmem_cache *
40 size_t align, unsigned long flags, void (*ctor)(void *)); 49__kmem_cache_alias(struct mem_cgroup *memcg, const char *name, size_t size,
50 size_t align, unsigned long flags, void (*ctor)(void *));
41#else 51#else
42static inline struct kmem_cache *__kmem_cache_alias(const char *name, size_t size, 52static inline struct kmem_cache *
43 size_t align, unsigned long flags, void (*ctor)(void *)) 53__kmem_cache_alias(struct mem_cgroup *memcg, const char *name, size_t size,
54 size_t align, unsigned long flags, void (*ctor)(void *))
44{ return NULL; } 55{ return NULL; }
45#endif 56#endif
46 57
47 58
59/* Legal flag mask for kmem_cache_create(), for various configurations */
60#define SLAB_CORE_FLAGS (SLAB_HWCACHE_ALIGN | SLAB_CACHE_DMA | SLAB_PANIC | \
61 SLAB_DESTROY_BY_RCU | SLAB_DEBUG_OBJECTS )
62
63#if defined(CONFIG_DEBUG_SLAB)
64#define SLAB_DEBUG_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER)
65#elif defined(CONFIG_SLUB_DEBUG)
66#define SLAB_DEBUG_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \
67 SLAB_TRACE | SLAB_DEBUG_FREE)
68#else
69#define SLAB_DEBUG_FLAGS (0)
70#endif
71
72#if defined(CONFIG_SLAB)
73#define SLAB_CACHE_FLAGS (SLAB_MEM_SPREAD | SLAB_NOLEAKTRACE | \
74 SLAB_RECLAIM_ACCOUNT | SLAB_TEMPORARY | SLAB_NOTRACK)
75#elif defined(CONFIG_SLUB)
76#define SLAB_CACHE_FLAGS (SLAB_NOLEAKTRACE | SLAB_RECLAIM_ACCOUNT | \
77 SLAB_TEMPORARY | SLAB_NOTRACK)
78#else
79#define SLAB_CACHE_FLAGS (0)
80#endif
81
82#define CACHE_CREATE_MASK (SLAB_CORE_FLAGS | SLAB_DEBUG_FLAGS | SLAB_CACHE_FLAGS)
83
48int __kmem_cache_shutdown(struct kmem_cache *); 84int __kmem_cache_shutdown(struct kmem_cache *);
49 85
86struct seq_file;
87struct file;
88
89struct slabinfo {
90 unsigned long active_objs;
91 unsigned long num_objs;
92 unsigned long active_slabs;
93 unsigned long num_slabs;
94 unsigned long shared_avail;
95 unsigned int limit;
96 unsigned int batchcount;
97 unsigned int shared;
98 unsigned int objects_per_slab;
99 unsigned int cache_order;
100};
101
102void get_slabinfo(struct kmem_cache *s, struct slabinfo *sinfo);
103void slabinfo_show_stats(struct seq_file *m, struct kmem_cache *s);
104ssize_t slabinfo_write(struct file *file, const char __user *buffer,
105 size_t count, loff_t *ppos);
106
107#ifdef CONFIG_MEMCG_KMEM
108static inline bool is_root_cache(struct kmem_cache *s)
109{
110 return !s->memcg_params || s->memcg_params->is_root_cache;
111}
112
113static inline bool cache_match_memcg(struct kmem_cache *cachep,
114 struct mem_cgroup *memcg)
115{
116 return (is_root_cache(cachep) && !memcg) ||
117 (cachep->memcg_params->memcg == memcg);
118}
119
120static inline void memcg_bind_pages(struct kmem_cache *s, int order)
121{
122 if (!is_root_cache(s))
123 atomic_add(1 << order, &s->memcg_params->nr_pages);
124}
125
126static inline void memcg_release_pages(struct kmem_cache *s, int order)
127{
128 if (is_root_cache(s))
129 return;
130
131 if (atomic_sub_and_test((1 << order), &s->memcg_params->nr_pages))
132 mem_cgroup_destroy_cache(s);
133}
134
135static inline bool slab_equal_or_root(struct kmem_cache *s,
136 struct kmem_cache *p)
137{
138 return (p == s) ||
139 (s->memcg_params && (p == s->memcg_params->root_cache));
140}
141
142/*
143 * We use suffixes to the name in memcg because we can't have caches
144 * created in the system with the same name. But when we print them
145 * locally, it is better to refer to them by the base name.
146 */
147static inline const char *cache_name(struct kmem_cache *s)
148{
149 if (!is_root_cache(s))
150 return s->memcg_params->root_cache->name;
151 return s->name;
152}
153
154static inline struct kmem_cache *cache_from_memcg(struct kmem_cache *s, int idx)
155{
156 return s->memcg_params->memcg_caches[idx];
157}
158
159static inline struct kmem_cache *memcg_root_cache(struct kmem_cache *s)
160{
161 if (is_root_cache(s))
162 return s;
163 return s->memcg_params->root_cache;
164}
165#else
166static inline bool is_root_cache(struct kmem_cache *s)
167{
168 return true;
169}
170
171static inline bool cache_match_memcg(struct kmem_cache *cachep,
172 struct mem_cgroup *memcg)
173{
174 return true;
175}
176
177static inline void memcg_bind_pages(struct kmem_cache *s, int order)
178{
179}
180
181static inline void memcg_release_pages(struct kmem_cache *s, int order)
182{
183}
184
185static inline bool slab_equal_or_root(struct kmem_cache *s,
186 struct kmem_cache *p)
187{
188 return true;
189}
190
191static inline const char *cache_name(struct kmem_cache *s)
192{
193 return s->name;
194}
195
196static inline struct kmem_cache *cache_from_memcg(struct kmem_cache *s, int idx)
197{
198 return NULL;
199}
200
201static inline struct kmem_cache *memcg_root_cache(struct kmem_cache *s)
202{
203 return s;
204}
205#endif
206
207static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x)
208{
209 struct kmem_cache *cachep;
210 struct page *page;
211
212 /*
213 * When kmemcg is not being used, both assignments should return the
214 * same value, but we don't want to pay the assignment price in that
215 * case. If it is not compiled in, the compiler should be smart enough
216 * to not do even the assignment. In that case, slab_equal_or_root
217 * will also be a constant.
218 */
219 if (!memcg_kmem_enabled() && !unlikely(s->flags & SLAB_DEBUG_FREE))
220 return s;
221
222 page = virt_to_head_page(x);
223 cachep = page->slab_cache;
224 if (slab_equal_or_root(cachep, s))
225 return cachep;
226
227 pr_err("%s: Wrong slab cache. %s but object is from %s\n",
228 __FUNCTION__, cachep->name, s->name);
229 WARN_ON_ONCE(1);
230 return s;
231}
50#endif 232#endif
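The cache_from_obj() helper added to mm/slab.h above lets kmem_cache_free() tolerate being handed the root cache while the object actually lives in a per-memcg child, and warn on any other mismatch. A rough userspace analogue of that acceptance test, with the virt_to_head_page() lookup replaced by an explicit pointer and the structures pared down to invented fields:

#include <stdio.h>
#include <stdbool.h>

struct kmem_cache {
	const char *name;
	struct kmem_cache *root_cache;		/* NULL for a root cache */
};

/* Accept the exact cache, or the root cache when the object sits in a memcg child. */
static bool slab_equal_or_root(struct kmem_cache *actual, struct kmem_cache *passed)
{
	return passed == actual ||
	       (actual->root_cache && passed == actual->root_cache);
}

static struct kmem_cache *cache_from_obj_demo(struct kmem_cache *passed,
					      struct kmem_cache *actual)
{
	if (slab_equal_or_root(actual, passed))
		return actual;

	fprintf(stderr, "wrong slab cache: %s passed but object is from %s\n",
		passed->name, actual->name);
	return passed;
}

int main(void)
{
	struct kmem_cache root  = { "dentry", NULL };
	struct kmem_cache child = { "dentry(2:demo)", &root };
	struct kmem_cache other = { "inode_cache", NULL };

	printf("ok:   %s\n", cache_from_obj_demo(&root, &child)->name);
	printf("warn: %s\n", cache_from_obj_demo(&other, &child)->name);
	return 0;
}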
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 069a24e64403..3f3cd97d3fdf 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -13,9 +13,12 @@
13#include <linux/module.h> 13#include <linux/module.h>
14#include <linux/cpu.h> 14#include <linux/cpu.h>
15#include <linux/uaccess.h> 15#include <linux/uaccess.h>
16#include <linux/seq_file.h>
17#include <linux/proc_fs.h>
16#include <asm/cacheflush.h> 18#include <asm/cacheflush.h>
17#include <asm/tlbflush.h> 19#include <asm/tlbflush.h>
18#include <asm/page.h> 20#include <asm/page.h>
21#include <linux/memcontrol.h>
19 22
20#include "slab.h" 23#include "slab.h"
21 24
@@ -25,7 +28,8 @@ DEFINE_MUTEX(slab_mutex);
25struct kmem_cache *kmem_cache; 28struct kmem_cache *kmem_cache;
26 29
27#ifdef CONFIG_DEBUG_VM 30#ifdef CONFIG_DEBUG_VM
28static int kmem_cache_sanity_check(const char *name, size_t size) 31static int kmem_cache_sanity_check(struct mem_cgroup *memcg, const char *name,
32 size_t size)
29{ 33{
30 struct kmem_cache *s = NULL; 34 struct kmem_cache *s = NULL;
31 35
@@ -51,7 +55,13 @@ static int kmem_cache_sanity_check(const char *name, size_t size)
51 continue; 55 continue;
52 } 56 }
53 57
54 if (!strcmp(s->name, name)) { 58 /*
59 * For simplicity, we won't check this in the list of memcg
60 * caches. We have control over memcg naming, and if there
61 * aren't duplicates in the global list, there won't be any
62 * duplicates in the memcg lists as well.
63 */
64 if (!memcg && !strcmp(s->name, name)) {
55 pr_err("%s (%s): Cache name already exists.\n", 65 pr_err("%s (%s): Cache name already exists.\n",
56 __func__, name); 66 __func__, name);
57 dump_stack(); 67 dump_stack();
@@ -64,12 +74,69 @@ static int kmem_cache_sanity_check(const char *name, size_t size)
64 return 0; 74 return 0;
65} 75}
66#else 76#else
67static inline int kmem_cache_sanity_check(const char *name, size_t size) 77static inline int kmem_cache_sanity_check(struct mem_cgroup *memcg,
78 const char *name, size_t size)
68{ 79{
69 return 0; 80 return 0;
70} 81}
71#endif 82#endif
72 83
84#ifdef CONFIG_MEMCG_KMEM
85int memcg_update_all_caches(int num_memcgs)
86{
87 struct kmem_cache *s;
88 int ret = 0;
89 mutex_lock(&slab_mutex);
90
91 list_for_each_entry(s, &slab_caches, list) {
92 if (!is_root_cache(s))
93 continue;
94
95 ret = memcg_update_cache_size(s, num_memcgs);
96 /*
97 * See comment in memcontrol.c, memcg_update_cache_size:
98 * Instead of freeing the memory, we'll just leave the caches
99 * up to this point in an updated state.
100 */
101 if (ret)
102 goto out;
103 }
104
105 memcg_update_array_size(num_memcgs);
106out:
107 mutex_unlock(&slab_mutex);
108 return ret;
109}
110#endif
111
112/*
113 * Figure out what the alignment of the objects will be given a set of
114 * flags, a user specified alignment and the size of the objects.
115 */
116unsigned long calculate_alignment(unsigned long flags,
117 unsigned long align, unsigned long size)
118{
119 /*
120 * If the user wants hardware cache aligned objects then follow that
121 * suggestion if the object is sufficiently large.
122 *
123 * The hardware cache alignment cannot override the specified
124 * alignment though. If that is greater then use it.
125 */
126 if (flags & SLAB_HWCACHE_ALIGN) {
127 unsigned long ralign = cache_line_size();
128 while (size <= ralign / 2)
129 ralign /= 2;
130 align = max(align, ralign);
131 }
132
133 if (align < ARCH_SLAB_MINALIGN)
134 align = ARCH_SLAB_MINALIGN;
135
136 return ALIGN(align, sizeof(void *));
137}
138
139
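To make the consolidated alignment helper concrete, here is a standalone re-implementation one can compile and run, with cache_line_size() pinned at 64 bytes and the flag and minimum-alignment constants invented for the demo; note how a 24-byte object settles on half a cache line while a 200-byte object keeps the full line:

#include <stdio.h>

#define DEMO_HWCACHE_ALIGN	0x1UL	/* stand-in for SLAB_HWCACHE_ALIGN */
#define DEMO_MINALIGN		8UL	/* stand-in for ARCH_SLAB_MINALIGN */
#define ALIGN_UP(x, a)		(((x) + (a) - 1) & ~((a) - 1))

static unsigned long cache_line_size(void) { return 64; }

static unsigned long calculate_alignment(unsigned long flags,
					 unsigned long align, unsigned long size)
{
	if (flags & DEMO_HWCACHE_ALIGN) {
		unsigned long ralign = cache_line_size();
		while (size <= ralign / 2)
			ralign /= 2;
		if (ralign > align)
			align = ralign;
	}
	if (align < DEMO_MINALIGN)
		align = DEMO_MINALIGN;
	return ALIGN_UP(align, sizeof(void *));
}

int main(void)
{
	/* 24-byte objects: half a line (32) is enough, a full line would waste space. */
	printf("%lu\n", calculate_alignment(DEMO_HWCACHE_ALIGN, 0, 24));  /* 32 */
	/* 200-byte objects: too big to halve, keep the full 64-byte line. */
	printf("%lu\n", calculate_alignment(DEMO_HWCACHE_ALIGN, 0, 200)); /* 64 */
	/* No flag, no explicit alignment: fall back to the minimum. */
	printf("%lu\n", calculate_alignment(0, 0, 24));                   /* 8 */
	return 0;
}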
73/* 140/*
74 * kmem_cache_create - Create a cache. 141 * kmem_cache_create - Create a cache.
75 * @name: A string which is used in /proc/slabinfo to identify this cache. 142 * @name: A string which is used in /proc/slabinfo to identify this cache.
@@ -95,8 +162,10 @@ static inline int kmem_cache_sanity_check(const char *name, size_t size)
95 * as davem. 162 * as davem.
96 */ 163 */
97 164
98struct kmem_cache *kmem_cache_create(const char *name, size_t size, size_t align, 165struct kmem_cache *
99 unsigned long flags, void (*ctor)(void *)) 166kmem_cache_create_memcg(struct mem_cgroup *memcg, const char *name, size_t size,
167 size_t align, unsigned long flags, void (*ctor)(void *),
168 struct kmem_cache *parent_cache)
100{ 169{
101 struct kmem_cache *s = NULL; 170 struct kmem_cache *s = NULL;
102 int err = 0; 171 int err = 0;
@@ -104,19 +173,33 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size, size_t align
104 get_online_cpus(); 173 get_online_cpus();
105 mutex_lock(&slab_mutex); 174 mutex_lock(&slab_mutex);
106 175
107 if (!kmem_cache_sanity_check(name, size) == 0) 176 if (!kmem_cache_sanity_check(memcg, name, size) == 0)
108 goto out_locked; 177 goto out_locked;
109 178
179 /*
180 * Some allocators will constrain the set of valid flags to a subset
181 * of all flags. We expect them to define CACHE_CREATE_MASK in this
182 * case, and we'll just provide them with a sanitized version of the
183 * passed flags.
184 */
185 flags &= CACHE_CREATE_MASK;
110 186
111 s = __kmem_cache_alias(name, size, align, flags, ctor); 187 s = __kmem_cache_alias(memcg, name, size, align, flags, ctor);
112 if (s) 188 if (s)
113 goto out_locked; 189 goto out_locked;
114 190
115 s = kmem_cache_zalloc(kmem_cache, GFP_KERNEL); 191 s = kmem_cache_zalloc(kmem_cache, GFP_KERNEL);
116 if (s) { 192 if (s) {
117 s->object_size = s->size = size; 193 s->object_size = s->size = size;
118 s->align = align; 194 s->align = calculate_alignment(flags, align, size);
119 s->ctor = ctor; 195 s->ctor = ctor;
196
197 if (memcg_register_cache(memcg, s, parent_cache)) {
198 kmem_cache_free(kmem_cache, s);
199 err = -ENOMEM;
200 goto out_locked;
201 }
202
120 s->name = kstrdup(name, GFP_KERNEL); 203 s->name = kstrdup(name, GFP_KERNEL);
121 if (!s->name) { 204 if (!s->name) {
122 kmem_cache_free(kmem_cache, s); 205 kmem_cache_free(kmem_cache, s);
@@ -126,10 +209,9 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size, size_t align
126 209
127 err = __kmem_cache_create(s, flags); 210 err = __kmem_cache_create(s, flags);
128 if (!err) { 211 if (!err) {
129
130 s->refcount = 1; 212 s->refcount = 1;
131 list_add(&s->list, &slab_caches); 213 list_add(&s->list, &slab_caches);
132 214 memcg_cache_list_add(memcg, s);
133 } else { 215 } else {
134 kfree(s->name); 216 kfree(s->name);
135 kmem_cache_free(kmem_cache, s); 217 kmem_cache_free(kmem_cache, s);
@@ -157,10 +239,20 @@ out_locked:
157 239
158 return s; 240 return s;
159} 241}
242
243struct kmem_cache *
244kmem_cache_create(const char *name, size_t size, size_t align,
245 unsigned long flags, void (*ctor)(void *))
246{
247 return kmem_cache_create_memcg(NULL, name, size, align, flags, ctor, NULL);
248}
160EXPORT_SYMBOL(kmem_cache_create); 249EXPORT_SYMBOL(kmem_cache_create);
161 250
162void kmem_cache_destroy(struct kmem_cache *s) 251void kmem_cache_destroy(struct kmem_cache *s)
163{ 252{
253 /* Destroy all the children caches if we aren't a memcg cache */
254 kmem_cache_destroy_memcg_children(s);
255
164 get_online_cpus(); 256 get_online_cpus();
165 mutex_lock(&slab_mutex); 257 mutex_lock(&slab_mutex);
166 s->refcount--; 258 s->refcount--;
@@ -172,6 +264,7 @@ void kmem_cache_destroy(struct kmem_cache *s)
172 if (s->flags & SLAB_DESTROY_BY_RCU) 264 if (s->flags & SLAB_DESTROY_BY_RCU)
173 rcu_barrier(); 265 rcu_barrier();
174 266
267 memcg_release_cache(s);
175 kfree(s->name); 268 kfree(s->name);
176 kmem_cache_free(kmem_cache, s); 269 kmem_cache_free(kmem_cache, s);
177 } else { 270 } else {
@@ -192,3 +285,182 @@ int slab_is_available(void)
192{ 285{
193 return slab_state >= UP; 286 return slab_state >= UP;
194} 287}
288
289#ifndef CONFIG_SLOB
290/* Create a cache during boot when no slab services are available yet */
291void __init create_boot_cache(struct kmem_cache *s, const char *name, size_t size,
292 unsigned long flags)
293{
294 int err;
295
296 s->name = name;
297 s->size = s->object_size = size;
298 s->align = calculate_alignment(flags, ARCH_KMALLOC_MINALIGN, size);
299 err = __kmem_cache_create(s, flags);
300
301 if (err)
302 panic("Creation of kmalloc slab %s size=%zd failed. Reason %d\n",
303 name, size, err);
304
305 s->refcount = -1; /* Exempt from merging for now */
306}
307
308struct kmem_cache *__init create_kmalloc_cache(const char *name, size_t size,
309 unsigned long flags)
310{
311 struct kmem_cache *s = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT);
312
313 if (!s)
314 panic("Out of memory when creating slab %s\n", name);
315
316 create_boot_cache(s, name, size, flags);
317 list_add(&s->list, &slab_caches);
318 s->refcount = 1;
319 return s;
320}
321
322#endif /* !CONFIG_SLOB */
323
324
325#ifdef CONFIG_SLABINFO
326void print_slabinfo_header(struct seq_file *m)
327{
328 /*
329 * Output format version, so at least we can change it
330 * without _too_ many complaints.
331 */
332#ifdef CONFIG_DEBUG_SLAB
333 seq_puts(m, "slabinfo - version: 2.1 (statistics)\n");
334#else
335 seq_puts(m, "slabinfo - version: 2.1\n");
336#endif
337 seq_puts(m, "# name <active_objs> <num_objs> <objsize> "
338 "<objperslab> <pagesperslab>");
339 seq_puts(m, " : tunables <limit> <batchcount> <sharedfactor>");
340 seq_puts(m, " : slabdata <active_slabs> <num_slabs> <sharedavail>");
341#ifdef CONFIG_DEBUG_SLAB
342 seq_puts(m, " : globalstat <listallocs> <maxobjs> <grown> <reaped> "
343 "<error> <maxfreeable> <nodeallocs> <remotefrees> <alienoverflow>");
344 seq_puts(m, " : cpustat <allochit> <allocmiss> <freehit> <freemiss>");
345#endif
346 seq_putc(m, '\n');
347}
348
349static void *s_start(struct seq_file *m, loff_t *pos)
350{
351 loff_t n = *pos;
352
353 mutex_lock(&slab_mutex);
354 if (!n)
355 print_slabinfo_header(m);
356
357 return seq_list_start(&slab_caches, *pos);
358}
359
360static void *s_next(struct seq_file *m, void *p, loff_t *pos)
361{
362 return seq_list_next(p, &slab_caches, pos);
363}
364
365static void s_stop(struct seq_file *m, void *p)
366{
367 mutex_unlock(&slab_mutex);
368}
369
370static void
371memcg_accumulate_slabinfo(struct kmem_cache *s, struct slabinfo *info)
372{
373 struct kmem_cache *c;
374 struct slabinfo sinfo;
375 int i;
376
377 if (!is_root_cache(s))
378 return;
379
380 for_each_memcg_cache_index(i) {
381 c = cache_from_memcg(s, i);
382 if (!c)
383 continue;
384
385 memset(&sinfo, 0, sizeof(sinfo));
386 get_slabinfo(c, &sinfo);
387
388 info->active_slabs += sinfo.active_slabs;
389 info->num_slabs += sinfo.num_slabs;
390 info->shared_avail += sinfo.shared_avail;
391 info->active_objs += sinfo.active_objs;
392 info->num_objs += sinfo.num_objs;
393 }
394}
395
396int cache_show(struct kmem_cache *s, struct seq_file *m)
397{
398 struct slabinfo sinfo;
399
400 memset(&sinfo, 0, sizeof(sinfo));
401 get_slabinfo(s, &sinfo);
402
403 memcg_accumulate_slabinfo(s, &sinfo);
404
405 seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d",
406 cache_name(s), sinfo.active_objs, sinfo.num_objs, s->size,
407 sinfo.objects_per_slab, (1 << sinfo.cache_order));
408
409 seq_printf(m, " : tunables %4u %4u %4u",
410 sinfo.limit, sinfo.batchcount, sinfo.shared);
411 seq_printf(m, " : slabdata %6lu %6lu %6lu",
412 sinfo.active_slabs, sinfo.num_slabs, sinfo.shared_avail);
413 slabinfo_show_stats(m, s);
414 seq_putc(m, '\n');
415 return 0;
416}
417
418static int s_show(struct seq_file *m, void *p)
419{
420 struct kmem_cache *s = list_entry(p, struct kmem_cache, list);
421
422 if (!is_root_cache(s))
423 return 0;
424 return cache_show(s, m);
425}
426
427/*
428 * slabinfo_op - iterator that generates /proc/slabinfo
429 *
430 * Output layout:
431 * cache-name
432 * num-active-objs
433 * total-objs
434 * object size
435 * num-active-slabs
436 * total-slabs
437 * num-pages-per-slab
438 * + further values on SMP and with statistics enabled
439 */
440static const struct seq_operations slabinfo_op = {
441 .start = s_start,
442 .next = s_next,
443 .stop = s_stop,
444 .show = s_show,
445};
446
447static int slabinfo_open(struct inode *inode, struct file *file)
448{
449 return seq_open(file, &slabinfo_op);
450}
451
452static const struct file_operations proc_slabinfo_operations = {
453 .open = slabinfo_open,
454 .read = seq_read,
455 .write = slabinfo_write,
456 .llseek = seq_lseek,
457 .release = seq_release,
458};
459
460static int __init slab_proc_init(void)
461{
462 proc_create("slabinfo", S_IRUSR, NULL, &proc_slabinfo_operations);
463 return 0;
464}
465module_init(slab_proc_init);
466#endif /* CONFIG_SLABINFO */
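One behavioural detail of the new common create path worth calling out: flags are masked against CACHE_CREATE_MASK before the allocator sees them, so debug or cache flags that the current configuration does not support are silently dropped instead of causing a failure. A toy illustration of that sanitization, with bit values invented for the demo (the kernel assembles the real masks in mm/slab.h, shown earlier):

#include <stdio.h>

/* Invented bit values, standing in for the SLAB_* flag bits. */
#define DEMO_HWCACHE_ALIGN	0x0001UL	/* core flag, always allowed */
#define DEMO_PANIC		0x0002UL	/* core flag, always allowed */
#define DEMO_RED_ZONE		0x0010UL	/* debug flag */
#define DEMO_POISON		0x0020UL	/* debug flag */
#define DEMO_RECLAIM_ACCOUNT	0x0100UL	/* cache flag */

#define DEMO_CORE_FLAGS		(DEMO_HWCACHE_ALIGN | DEMO_PANIC)

#ifdef DEMO_DEBUG			/* compare CONFIG_DEBUG_SLAB / CONFIG_SLUB_DEBUG */
#define DEMO_DEBUG_FLAGS	(DEMO_RED_ZONE | DEMO_POISON)
#else
#define DEMO_DEBUG_FLAGS	(0)
#endif

#define DEMO_CACHE_FLAGS	(DEMO_RECLAIM_ACCOUNT)
#define DEMO_CREATE_MASK	(DEMO_CORE_FLAGS | DEMO_DEBUG_FLAGS | DEMO_CACHE_FLAGS)

int main(void)
{
	unsigned long requested = DEMO_HWCACHE_ALIGN | DEMO_POISON | DEMO_RECLAIM_ACCOUNT;
	unsigned long accepted = requested & DEMO_CREATE_MASK;

	/* Without -DDEMO_DEBUG the poison request is quietly stripped. */
	printf("requested 0x%lx, accepted 0x%lx\n", requested, accepted);
	return 0;
}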
diff --git a/mm/slob.c b/mm/slob.c
index 1e921c5e9576..a99fdf7a0907 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -28,9 +28,8 @@
28 * from kmalloc are prepended with a 4-byte header with the kmalloc size. 28 * from kmalloc are prepended with a 4-byte header with the kmalloc size.
29 * If kmalloc is asked for objects of PAGE_SIZE or larger, it calls 29 * If kmalloc is asked for objects of PAGE_SIZE or larger, it calls
30 * alloc_pages() directly, allocating compound pages so the page order 30 * alloc_pages() directly, allocating compound pages so the page order
31 * does not have to be separately tracked, and also stores the exact 31 * does not have to be separately tracked.
32 * allocation size in page->private so that it can be used to accurately 32 * These objects are detected in kfree() because PageSlab()
33 * provide ksize(). These objects are detected in kfree() because slob_page()
34 * is false for them. 33 * is false for them.
35 * 34 *
36 * SLAB is emulated on top of SLOB by simply calling constructors and 35 * SLAB is emulated on top of SLOB by simply calling constructors and
@@ -59,7 +58,6 @@
59 58
60#include <linux/kernel.h> 59#include <linux/kernel.h>
61#include <linux/slab.h> 60#include <linux/slab.h>
62#include "slab.h"
63 61
64#include <linux/mm.h> 62#include <linux/mm.h>
65#include <linux/swap.h> /* struct reclaim_state */ 63#include <linux/swap.h> /* struct reclaim_state */
@@ -74,6 +72,7 @@
74 72
75#include <linux/atomic.h> 73#include <linux/atomic.h>
76 74
75#include "slab.h"
77/* 76/*
78 * slob_block has a field 'units', which indicates size of block if +ve, 77 * slob_block has a field 'units', which indicates size of block if +ve,
79 * or offset of next block if -ve (in SLOB_UNITs). 78 * or offset of next block if -ve (in SLOB_UNITs).
@@ -124,7 +123,6 @@ static inline void clear_slob_page_free(struct page *sp)
124 123
125#define SLOB_UNIT sizeof(slob_t) 124#define SLOB_UNIT sizeof(slob_t)
126#define SLOB_UNITS(size) (((size) + SLOB_UNIT - 1)/SLOB_UNIT) 125#define SLOB_UNITS(size) (((size) + SLOB_UNIT - 1)/SLOB_UNIT)
127#define SLOB_ALIGN L1_CACHE_BYTES
128 126
129/* 127/*
130 * struct slob_rcu is inserted at the tail of allocated slob blocks, which 128 * struct slob_rcu is inserted at the tail of allocated slob blocks, which
@@ -455,11 +453,6 @@ __do_kmalloc_node(size_t size, gfp_t gfp, int node, unsigned long caller)
455 if (likely(order)) 453 if (likely(order))
456 gfp |= __GFP_COMP; 454 gfp |= __GFP_COMP;
457 ret = slob_new_pages(gfp, order, node); 455 ret = slob_new_pages(gfp, order, node);
458 if (ret) {
459 struct page *page;
460 page = virt_to_page(ret);
461 page->private = size;
462 }
463 456
464 trace_kmalloc_node(caller, ret, 457 trace_kmalloc_node(caller, ret,
465 size, PAGE_SIZE << order, gfp, node); 458 size, PAGE_SIZE << order, gfp, node);
@@ -506,7 +499,7 @@ void kfree(const void *block)
506 unsigned int *m = (unsigned int *)(block - align); 499 unsigned int *m = (unsigned int *)(block - align);
507 slob_free(m, *m + align); 500 slob_free(m, *m + align);
508 } else 501 } else
509 put_page(sp); 502 __free_pages(sp, compound_order(sp));
510} 503}
511EXPORT_SYMBOL(kfree); 504EXPORT_SYMBOL(kfree);
512 505
@@ -514,37 +507,30 @@ EXPORT_SYMBOL(kfree);
514size_t ksize(const void *block) 507size_t ksize(const void *block)
515{ 508{
516 struct page *sp; 509 struct page *sp;
510 int align;
511 unsigned int *m;
517 512
518 BUG_ON(!block); 513 BUG_ON(!block);
519 if (unlikely(block == ZERO_SIZE_PTR)) 514 if (unlikely(block == ZERO_SIZE_PTR))
520 return 0; 515 return 0;
521 516
522 sp = virt_to_page(block); 517 sp = virt_to_page(block);
523 if (PageSlab(sp)) { 518 if (unlikely(!PageSlab(sp)))
524 int align = max_t(size_t, ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN); 519 return PAGE_SIZE << compound_order(sp);
525 unsigned int *m = (unsigned int *)(block - align); 520
526 return SLOB_UNITS(*m) * SLOB_UNIT; 521 align = max_t(size_t, ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN);
527 } else 522 m = (unsigned int *)(block - align);
528 return sp->private; 523 return SLOB_UNITS(*m) * SLOB_UNIT;
529} 524}
530EXPORT_SYMBOL(ksize); 525EXPORT_SYMBOL(ksize);
531 526
532int __kmem_cache_create(struct kmem_cache *c, unsigned long flags) 527int __kmem_cache_create(struct kmem_cache *c, unsigned long flags)
533{ 528{
534 size_t align = c->size;
535
536 if (flags & SLAB_DESTROY_BY_RCU) { 529 if (flags & SLAB_DESTROY_BY_RCU) {
537 /* leave room for rcu footer at the end of object */ 530 /* leave room for rcu footer at the end of object */
538 c->size += sizeof(struct slob_rcu); 531 c->size += sizeof(struct slob_rcu);
539 } 532 }
540 c->flags = flags; 533 c->flags = flags;
541 /* ignore alignment unless it's forced */
542 c->align = (flags & SLAB_HWCACHE_ALIGN) ? SLOB_ALIGN : 0;
543 if (c->align < ARCH_SLAB_MINALIGN)
544 c->align = ARCH_SLAB_MINALIGN;
545 if (c->align < align)
546 c->align = align;
547
548 return 0; 534 return 0;
549} 535}
550 536
@@ -558,12 +544,12 @@ void *kmem_cache_alloc_node(struct kmem_cache *c, gfp_t flags, int node)
558 544
559 if (c->size < PAGE_SIZE) { 545 if (c->size < PAGE_SIZE) {
560 b = slob_alloc(c->size, flags, c->align, node); 546 b = slob_alloc(c->size, flags, c->align, node);
561 trace_kmem_cache_alloc_node(_RET_IP_, b, c->size, 547 trace_kmem_cache_alloc_node(_RET_IP_, b, c->object_size,
562 SLOB_UNITS(c->size) * SLOB_UNIT, 548 SLOB_UNITS(c->size) * SLOB_UNIT,
563 flags, node); 549 flags, node);
564 } else { 550 } else {
565 b = slob_new_pages(flags, get_order(c->size), node); 551 b = slob_new_pages(flags, get_order(c->size), node);
566 trace_kmem_cache_alloc_node(_RET_IP_, b, c->size, 552 trace_kmem_cache_alloc_node(_RET_IP_, b, c->object_size,
567 PAGE_SIZE << get_order(c->size), 553 PAGE_SIZE << get_order(c->size),
568 flags, node); 554 flags, node);
569 } 555 }
@@ -608,12 +594,6 @@ void kmem_cache_free(struct kmem_cache *c, void *b)
608} 594}
609EXPORT_SYMBOL(kmem_cache_free); 595EXPORT_SYMBOL(kmem_cache_free);
610 596
611unsigned int kmem_cache_size(struct kmem_cache *c)
612{
613 return c->size;
614}
615EXPORT_SYMBOL(kmem_cache_size);
616
617int __kmem_cache_shutdown(struct kmem_cache *c) 597int __kmem_cache_shutdown(struct kmem_cache *c)
618{ 598{
619 /* No way to check for remaining objects */ 599 /* No way to check for remaining objects */
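With page->private no longer carrying the allocation size, SLOB's ksize() now recovers the size of a small kmalloc object from the word stored just ahead of the returned pointer, and sizes large objects from the compound page order. A userspace analogue of the small-object bookkeeping, with the header width and helper names invented for the demo (the real header stores SLOB units rather than bytes):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define DEMO_ALIGN 8	/* stand-in for max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN) */

/* Prepend the request size, return the aligned payload that follows it. */
static void *demo_kmalloc(size_t size)
{
	unsigned char *base = malloc(DEMO_ALIGN + size);

	if (!base)
		return NULL;
	memcpy(base, &size, sizeof(size));
	return base + DEMO_ALIGN;
}

/* Walk back over the header to recover the size, as slob's ksize() does. */
static size_t demo_ksize(const void *block)
{
	size_t size;

	memcpy(&size, (const unsigned char *)block - DEMO_ALIGN, sizeof(size));
	return size;
}

static void demo_kfree(void *block)
{
	free((unsigned char *)block - DEMO_ALIGN);
}

int main(void)
{
	void *p = demo_kmalloc(100);

	printf("ksize = %zu\n", demo_ksize(p));		/* 100 */
	demo_kfree(p);
	return 0;
}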
diff --git a/mm/slub.c b/mm/slub.c
index a0d698467f70..ba2ca53f6c3a 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -31,6 +31,7 @@
31#include <linux/fault-inject.h> 31#include <linux/fault-inject.h>
32#include <linux/stacktrace.h> 32#include <linux/stacktrace.h>
33#include <linux/prefetch.h> 33#include <linux/prefetch.h>
34#include <linux/memcontrol.h>
34 35
35#include <trace/events/kmem.h> 36#include <trace/events/kmem.h>
36 37
@@ -112,9 +113,6 @@
112 * the fast path and disables lockless freelists. 113 * the fast path and disables lockless freelists.
113 */ 114 */
114 115
115#define SLAB_DEBUG_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \
116 SLAB_TRACE | SLAB_DEBUG_FREE)
117
118static inline int kmem_cache_debug(struct kmem_cache *s) 116static inline int kmem_cache_debug(struct kmem_cache *s)
119{ 117{
120#ifdef CONFIG_SLUB_DEBUG 118#ifdef CONFIG_SLUB_DEBUG
@@ -179,8 +177,6 @@ static inline int kmem_cache_debug(struct kmem_cache *s)
179#define __OBJECT_POISON 0x80000000UL /* Poison object */ 177#define __OBJECT_POISON 0x80000000UL /* Poison object */
180#define __CMPXCHG_DOUBLE 0x40000000UL /* Use cmpxchg_double */ 178#define __CMPXCHG_DOUBLE 0x40000000UL /* Use cmpxchg_double */
181 179
182static int kmem_size = sizeof(struct kmem_cache);
183
184#ifdef CONFIG_SMP 180#ifdef CONFIG_SMP
185static struct notifier_block slab_notifier; 181static struct notifier_block slab_notifier;
186#endif 182#endif
@@ -205,13 +201,14 @@ enum track_item { TRACK_ALLOC, TRACK_FREE };
205static int sysfs_slab_add(struct kmem_cache *); 201static int sysfs_slab_add(struct kmem_cache *);
206static int sysfs_slab_alias(struct kmem_cache *, const char *); 202static int sysfs_slab_alias(struct kmem_cache *, const char *);
207static void sysfs_slab_remove(struct kmem_cache *); 203static void sysfs_slab_remove(struct kmem_cache *);
208 204static void memcg_propagate_slab_attrs(struct kmem_cache *s);
209#else 205#else
210static inline int sysfs_slab_add(struct kmem_cache *s) { return 0; } 206static inline int sysfs_slab_add(struct kmem_cache *s) { return 0; }
211static inline int sysfs_slab_alias(struct kmem_cache *s, const char *p) 207static inline int sysfs_slab_alias(struct kmem_cache *s, const char *p)
212 { return 0; } 208 { return 0; }
213static inline void sysfs_slab_remove(struct kmem_cache *s) { } 209static inline void sysfs_slab_remove(struct kmem_cache *s) { }
214 210
211static inline void memcg_propagate_slab_attrs(struct kmem_cache *s) { }
215#endif 212#endif
216 213
217static inline void stat(const struct kmem_cache *s, enum stat_item si) 214static inline void stat(const struct kmem_cache *s, enum stat_item si)
@@ -1092,11 +1089,11 @@ static noinline struct kmem_cache_node *free_debug_processing(
1092 if (!check_object(s, page, object, SLUB_RED_ACTIVE)) 1089 if (!check_object(s, page, object, SLUB_RED_ACTIVE))
1093 goto out; 1090 goto out;
1094 1091
1095 if (unlikely(s != page->slab)) { 1092 if (unlikely(s != page->slab_cache)) {
1096 if (!PageSlab(page)) { 1093 if (!PageSlab(page)) {
1097 slab_err(s, page, "Attempt to free object(0x%p) " 1094 slab_err(s, page, "Attempt to free object(0x%p) "
1098 "outside of slab", object); 1095 "outside of slab", object);
1099 } else if (!page->slab) { 1096 } else if (!page->slab_cache) {
1100 printk(KERN_ERR 1097 printk(KERN_ERR
1101 "SLUB <none>: no slab for object 0x%p.\n", 1098 "SLUB <none>: no slab for object 0x%p.\n",
1102 object); 1099 object);
@@ -1348,6 +1345,7 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
1348 void *start; 1345 void *start;
1349 void *last; 1346 void *last;
1350 void *p; 1347 void *p;
1348 int order;
1351 1349
1352 BUG_ON(flags & GFP_SLAB_BUG_MASK); 1350 BUG_ON(flags & GFP_SLAB_BUG_MASK);
1353 1351
@@ -1356,8 +1354,10 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
1356 if (!page) 1354 if (!page)
1357 goto out; 1355 goto out;
1358 1356
1357 order = compound_order(page);
1359 inc_slabs_node(s, page_to_nid(page), page->objects); 1358 inc_slabs_node(s, page_to_nid(page), page->objects);
1360 page->slab = s; 1359 memcg_bind_pages(s, order);
1360 page->slab_cache = s;
1361 __SetPageSlab(page); 1361 __SetPageSlab(page);
1362 if (page->pfmemalloc) 1362 if (page->pfmemalloc)
1363 SetPageSlabPfmemalloc(page); 1363 SetPageSlabPfmemalloc(page);
@@ -1365,7 +1365,7 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
1365 start = page_address(page); 1365 start = page_address(page);
1366 1366
1367 if (unlikely(s->flags & SLAB_POISON)) 1367 if (unlikely(s->flags & SLAB_POISON))
1368 memset(start, POISON_INUSE, PAGE_SIZE << compound_order(page)); 1368 memset(start, POISON_INUSE, PAGE_SIZE << order);
1369 1369
1370 last = start; 1370 last = start;
1371 for_each_object(p, s, start, page->objects) { 1371 for_each_object(p, s, start, page->objects) {
@@ -1406,10 +1406,12 @@ static void __free_slab(struct kmem_cache *s, struct page *page)
1406 1406
1407 __ClearPageSlabPfmemalloc(page); 1407 __ClearPageSlabPfmemalloc(page);
1408 __ClearPageSlab(page); 1408 __ClearPageSlab(page);
1409
1410 memcg_release_pages(s, order);
1409 reset_page_mapcount(page); 1411 reset_page_mapcount(page);
1410 if (current->reclaim_state) 1412 if (current->reclaim_state)
1411 current->reclaim_state->reclaimed_slab += pages; 1413 current->reclaim_state->reclaimed_slab += pages;
1412 __free_pages(page, order); 1414 __free_memcg_kmem_pages(page, order);
1413} 1415}
1414 1416
1415#define need_reserve_slab_rcu \ 1417#define need_reserve_slab_rcu \
@@ -1424,7 +1426,7 @@ static void rcu_free_slab(struct rcu_head *h)
1424 else 1426 else
1425 page = container_of((struct list_head *)h, struct page, lru); 1427 page = container_of((struct list_head *)h, struct page, lru);
1426 1428
1427 __free_slab(page->slab, page); 1429 __free_slab(page->slab_cache, page);
1428} 1430}
1429 1431
1430static void free_slab(struct kmem_cache *s, struct page *page) 1432static void free_slab(struct kmem_cache *s, struct page *page)
@@ -1872,12 +1874,14 @@ redo:
1872/* 1874/*
1873 * Unfreeze all the cpu partial slabs. 1875 * Unfreeze all the cpu partial slabs.
1874 * 1876 *
1875 * This function must be called with interrupt disabled. 1877 * This function must be called with interrupts disabled
1878 * for the cpu using c (or some other guarantee must be there
1879 * to ensure no concurrent accesses).
1876 */ 1880 */
1877static void unfreeze_partials(struct kmem_cache *s) 1881static void unfreeze_partials(struct kmem_cache *s,
1882 struct kmem_cache_cpu *c)
1878{ 1883{
1879 struct kmem_cache_node *n = NULL, *n2 = NULL; 1884 struct kmem_cache_node *n = NULL, *n2 = NULL;
1880 struct kmem_cache_cpu *c = this_cpu_ptr(s->cpu_slab);
1881 struct page *page, *discard_page = NULL; 1885 struct page *page, *discard_page = NULL;
1882 1886
1883 while ((page = c->partial)) { 1887 while ((page = c->partial)) {
@@ -1963,7 +1967,7 @@ static int put_cpu_partial(struct kmem_cache *s, struct page *page, int drain)
1963 * set to the per node partial list. 1967 * set to the per node partial list.
1964 */ 1968 */
1965 local_irq_save(flags); 1969 local_irq_save(flags);
1966 unfreeze_partials(s); 1970 unfreeze_partials(s, this_cpu_ptr(s->cpu_slab));
1967 local_irq_restore(flags); 1971 local_irq_restore(flags);
1968 oldpage = NULL; 1972 oldpage = NULL;
1969 pobjects = 0; 1973 pobjects = 0;
@@ -2006,7 +2010,7 @@ static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu)
2006 if (c->page) 2010 if (c->page)
2007 flush_slab(s, c); 2011 flush_slab(s, c);
2008 2012
2009 unfreeze_partials(s); 2013 unfreeze_partials(s, c);
2010 } 2014 }
2011} 2015}
2012 2016
@@ -2325,6 +2329,7 @@ static __always_inline void *slab_alloc_node(struct kmem_cache *s,
2325 if (slab_pre_alloc_hook(s, gfpflags)) 2329 if (slab_pre_alloc_hook(s, gfpflags))
2326 return NULL; 2330 return NULL;
2327 2331
2332 s = memcg_kmem_get_cache(s, gfpflags);
2328redo: 2333redo:
2329 2334
2330 /* 2335 /*
@@ -2459,7 +2464,6 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
2459 void *prior; 2464 void *prior;
2460 void **object = (void *)x; 2465 void **object = (void *)x;
2461 int was_frozen; 2466 int was_frozen;
2462 int inuse;
2463 struct page new; 2467 struct page new;
2464 unsigned long counters; 2468 unsigned long counters;
2465 struct kmem_cache_node *n = NULL; 2469 struct kmem_cache_node *n = NULL;
@@ -2472,13 +2476,17 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
2472 return; 2476 return;
2473 2477
2474 do { 2478 do {
2479 if (unlikely(n)) {
2480 spin_unlock_irqrestore(&n->list_lock, flags);
2481 n = NULL;
2482 }
2475 prior = page->freelist; 2483 prior = page->freelist;
2476 counters = page->counters; 2484 counters = page->counters;
2477 set_freepointer(s, object, prior); 2485 set_freepointer(s, object, prior);
2478 new.counters = counters; 2486 new.counters = counters;
2479 was_frozen = new.frozen; 2487 was_frozen = new.frozen;
2480 new.inuse--; 2488 new.inuse--;
2481 if ((!new.inuse || !prior) && !was_frozen && !n) { 2489 if ((!new.inuse || !prior) && !was_frozen) {
2482 2490
2483 if (!kmem_cache_debug(s) && !prior) 2491 if (!kmem_cache_debug(s) && !prior)
2484 2492
@@ -2503,7 +2511,6 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
2503 2511
2504 } 2512 }
2505 } 2513 }
2506 inuse = new.inuse;
2507 2514
2508 } while (!cmpxchg_double_slab(s, page, 2515 } while (!cmpxchg_double_slab(s, page,
2509 prior, counters, 2516 prior, counters,
@@ -2529,25 +2536,17 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
2529 return; 2536 return;
2530 } 2537 }
2531 2538
2539 if (unlikely(!new.inuse && n->nr_partial > s->min_partial))
2540 goto slab_empty;
2541
2532 /* 2542 /*
2533 * was_frozen may have been set after we acquired the list_lock in 2543 * Objects left in the slab. If it was not on the partial list before
2534 * an earlier loop. So we need to check it here again. 2544 * then add it.
2535 */ 2545 */
2536 if (was_frozen) 2546 if (kmem_cache_debug(s) && unlikely(!prior)) {
2537 stat(s, FREE_FROZEN); 2547 remove_full(s, page);
2538 else { 2548 add_partial(n, page, DEACTIVATE_TO_TAIL);
2539 if (unlikely(!inuse && n->nr_partial > s->min_partial)) 2549 stat(s, FREE_ADD_PARTIAL);
2540 goto slab_empty;
2541
2542 /*
2543 * Objects left in the slab. If it was not on the partial list before
2544 * then add it.
2545 */
2546 if (unlikely(!prior)) {
2547 remove_full(s, page);
2548 add_partial(n, page, DEACTIVATE_TO_TAIL);
2549 stat(s, FREE_ADD_PARTIAL);
2550 }
2551 } 2550 }
2552 spin_unlock_irqrestore(&n->list_lock, flags); 2551 spin_unlock_irqrestore(&n->list_lock, flags);
2553 return; 2552 return;
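The restructured __slab_free() above may take the node's list_lock inside the cmpxchg_double retry loop, which is why each new iteration starts by dropping any lock acquired on the previous, failed attempt. Here is that control flow reduced to a userspace analogue with a plain atomic counter and a pthread mutex; everything below is invented scaffolding rather than SLUB code, and the only point is the lock lifetime across retries (build with cc -pthread):

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static _Atomic long freelist = 5;			/* stand-in for page->freelist/counters */
static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;

static void demo_free_one(void)
{
	pthread_mutex_t *n = NULL;			/* node lock held from a failed attempt */
	long prior, next;

	do {
		if (n) {				/* drop the lock taken last time round */
			pthread_mutex_unlock(n);
			n = NULL;
		}
		prior = atomic_load(&freelist);
		next = prior - 1;
		if (next == 0) {			/* slab would empty: list work needed */
			n = &list_lock;
			pthread_mutex_lock(n);
		}
	} while (!atomic_compare_exchange_weak(&freelist, &prior, next));

	if (n) {
		/* ...move the now-empty slab between lists under the lock... */
		pthread_mutex_unlock(n);
	}
}

int main(void)
{
	while (atomic_load(&freelist) > 0)
		demo_free_one();
	printf("freelist drained\n");
	return 0;
}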
@@ -2619,19 +2618,10 @@ redo:
2619 2618
2620void kmem_cache_free(struct kmem_cache *s, void *x) 2619void kmem_cache_free(struct kmem_cache *s, void *x)
2621{ 2620{
2622 struct page *page; 2621 s = cache_from_obj(s, x);
2623 2622 if (!s)
2624 page = virt_to_head_page(x);
2625
2626 if (kmem_cache_debug(s) && page->slab != s) {
2627 pr_err("kmem_cache_free: Wrong slab cache. %s but object"
2628 " is from %s\n", page->slab->name, s->name);
2629 WARN_ON_ONCE(1);
2630 return; 2623 return;
2631 } 2624 slab_free(s, virt_to_head_page(x), x, _RET_IP_);
2632
2633 slab_free(s, page, x, _RET_IP_);
2634
2635 trace_kmem_cache_free(_RET_IP_, x); 2625 trace_kmem_cache_free(_RET_IP_, x);
2636} 2626}
2637EXPORT_SYMBOL(kmem_cache_free); 2627EXPORT_SYMBOL(kmem_cache_free);
@@ -2769,32 +2759,6 @@ static inline int calculate_order(int size, int reserved)
2769 return -ENOSYS; 2759 return -ENOSYS;
2770} 2760}
2771 2761
2772/*
2773 * Figure out what the alignment of the objects will be.
2774 */
2775static unsigned long calculate_alignment(unsigned long flags,
2776 unsigned long align, unsigned long size)
2777{
2778 /*
2779 * If the user wants hardware cache aligned objects then follow that
2780 * suggestion if the object is sufficiently large.
2781 *
2782 * The hardware cache alignment cannot override the specified
2783 * alignment though. If that is greater then use it.
2784 */
2785 if (flags & SLAB_HWCACHE_ALIGN) {
2786 unsigned long ralign = cache_line_size();
2787 while (size <= ralign / 2)
2788 ralign /= 2;
2789 align = max(align, ralign);
2790 }
2791
2792 if (align < ARCH_SLAB_MINALIGN)
2793 align = ARCH_SLAB_MINALIGN;
2794
2795 return ALIGN(align, sizeof(void *));
2796}
2797
2798static void 2762static void
2799init_kmem_cache_node(struct kmem_cache_node *n) 2763init_kmem_cache_node(struct kmem_cache_node *n)
2800{ 2764{
@@ -2928,7 +2892,6 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order)
2928{ 2892{
2929 unsigned long flags = s->flags; 2893 unsigned long flags = s->flags;
2930 unsigned long size = s->object_size; 2894 unsigned long size = s->object_size;
2931 unsigned long align = s->align;
2932 int order; 2895 int order;
2933 2896
2934 /* 2897 /*
@@ -3000,19 +2963,11 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order)
3000#endif 2963#endif
3001 2964
3002 /* 2965 /*
3003 * Determine the alignment based on various parameters that the
3004 * user specified and the dynamic determination of cache line size
3005 * on bootup.
3006 */
3007 align = calculate_alignment(flags, align, s->object_size);
3008 s->align = align;
3009
3010 /*
3011 * SLUB stores one object immediately after another beginning from 2966 * SLUB stores one object immediately after another beginning from
3012 * offset 0. In order to align the objects we have to simply size 2967 * offset 0. In order to align the objects we have to simply size
3013 * each object to conform to the alignment. 2968 * each object to conform to the alignment.
3014 */ 2969 */
3015 size = ALIGN(size, align); 2970 size = ALIGN(size, s->align);
3016 s->size = size; 2971 s->size = size;
3017 if (forced_order >= 0) 2972 if (forced_order >= 0)
3018 order = forced_order; 2973 order = forced_order;
@@ -3041,7 +2996,6 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order)
3041 s->max = s->oo; 2996 s->max = s->oo;
3042 2997
3043 return !!oo_objects(s->oo); 2998 return !!oo_objects(s->oo);
3044
3045} 2999}
3046 3000
3047static int kmem_cache_open(struct kmem_cache *s, unsigned long flags) 3001static int kmem_cache_open(struct kmem_cache *s, unsigned long flags)
@@ -3127,15 +3081,6 @@ error:
3127 return -EINVAL; 3081 return -EINVAL;
3128} 3082}
3129 3083
3130/*
3131 * Determine the size of a slab object
3132 */
3133unsigned int kmem_cache_size(struct kmem_cache *s)
3134{
3135 return s->object_size;
3136}
3137EXPORT_SYMBOL(kmem_cache_size);
3138
3139static void list_slab_objects(struct kmem_cache *s, struct page *page, 3084static void list_slab_objects(struct kmem_cache *s, struct page *page,
3140 const char *text) 3085 const char *text)
3141{ 3086{
@@ -3208,8 +3153,19 @@ int __kmem_cache_shutdown(struct kmem_cache *s)
3208{ 3153{
3209 int rc = kmem_cache_close(s); 3154 int rc = kmem_cache_close(s);
3210 3155
3211 if (!rc) 3156 if (!rc) {
3157 /*
3158 * We do the same lock strategy around sysfs_slab_add, see
3159 * __kmem_cache_create. Because this is pretty much the last
3160 * operation we do and the lock will be released shortly after
3161 * that in slab_common.c, we could just move sysfs_slab_remove
3162 * to a later point in common code. We should do that when we
3163 * have a common sysfs framework for all allocators.
3164 */
3165 mutex_unlock(&slab_mutex);
3212 sysfs_slab_remove(s); 3166 sysfs_slab_remove(s);
3167 mutex_lock(&slab_mutex);
3168 }
3213 3169
3214 return rc; 3170 return rc;
3215} 3171}
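The unlock/relock dance around sysfs_slab_remove() matches what __kmem_cache_create() already does for sysfs_slab_add(): the sysfs work must not run under slab_mutex, yet the common code that called us still expects to hold the mutex on return. A bare-bones sketch of that pattern with invented names:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t demo_mutex = PTHREAD_MUTEX_INITIALIZER;

/* Something that must not run under demo_mutex (it may sleep or take other locks). */
static void slow_teardown(const char *name)
{
	printf("tearing down %s outside the mutex\n", name);
}

/* Called with demo_mutex held; returns with it held again. */
static void shutdown_locked(const char *name)
{
	pthread_mutex_unlock(&demo_mutex);
	slow_teardown(name);
	pthread_mutex_lock(&demo_mutex);
}

int main(void)
{
	pthread_mutex_lock(&demo_mutex);
	shutdown_locked("demo_cache");
	pthread_mutex_unlock(&demo_mutex);
	return 0;
}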
@@ -3261,32 +3217,6 @@ static int __init setup_slub_nomerge(char *str)
3261 3217
3262__setup("slub_nomerge", setup_slub_nomerge); 3218__setup("slub_nomerge", setup_slub_nomerge);
3263 3219
3264static struct kmem_cache *__init create_kmalloc_cache(const char *name,
3265 int size, unsigned int flags)
3266{
3267 struct kmem_cache *s;
3268
3269 s = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT);
3270
3271 s->name = name;
3272 s->size = s->object_size = size;
3273 s->align = ARCH_KMALLOC_MINALIGN;
3274
3275 /*
3276 * This function is called with IRQs disabled during early-boot on
3277 * single CPU so there's no need to take slab_mutex here.
3278 */
3279 if (kmem_cache_open(s, flags))
3280 goto panic;
3281
3282 list_add(&s->list, &slab_caches);
3283 return s;
3284
3285panic:
3286 panic("Creation of kmalloc slab %s size=%d failed.\n", name, size);
3287 return NULL;
3288}
3289
3290/* 3220/*
3291 * Conversion table for small slabs sizes / 8 to the index in the 3221 * Conversion table for small slabs sizes / 8 to the index in the
3292 * kmalloc array. This is necessary for slabs < 192 since we have non power 3222 * kmalloc array. This is necessary for slabs < 192 since we have non power
@@ -3372,7 +3302,7 @@ static void *kmalloc_large_node(size_t size, gfp_t flags, int node)
3372 struct page *page; 3302 struct page *page;
3373 void *ptr = NULL; 3303 void *ptr = NULL;
3374 3304
3375 flags |= __GFP_COMP | __GFP_NOTRACK; 3305 flags |= __GFP_COMP | __GFP_NOTRACK | __GFP_KMEMCG;
3376 page = alloc_pages_node(node, flags, get_order(size)); 3306 page = alloc_pages_node(node, flags, get_order(size));
3377 if (page) 3307 if (page)
3378 ptr = page_address(page); 3308 ptr = page_address(page);
@@ -3424,7 +3354,7 @@ size_t ksize(const void *object)
3424 return PAGE_SIZE << compound_order(page); 3354 return PAGE_SIZE << compound_order(page);
3425 } 3355 }
3426 3356
3427 return slab_ksize(page->slab); 3357 return slab_ksize(page->slab_cache);
3428} 3358}
3429EXPORT_SYMBOL(ksize); 3359EXPORT_SYMBOL(ksize);
3430 3360
@@ -3449,8 +3379,8 @@ bool verify_mem_not_deleted(const void *x)
3449 } 3379 }
3450 3380
3451 slab_lock(page); 3381 slab_lock(page);
3452 if (on_freelist(page->slab, page, object)) { 3382 if (on_freelist(page->slab_cache, page, object)) {
3453 object_err(page->slab, page, object, "Object is on free-list"); 3383 object_err(page->slab_cache, page, object, "Object is on free-list");
3454 rv = false; 3384 rv = false;
3455 } else { 3385 } else {
3456 rv = true; 3386 rv = true;
@@ -3478,10 +3408,10 @@ void kfree(const void *x)
3478 if (unlikely(!PageSlab(page))) { 3408 if (unlikely(!PageSlab(page))) {
3479 BUG_ON(!PageCompound(page)); 3409 BUG_ON(!PageCompound(page));
3480 kmemleak_free(x); 3410 kmemleak_free(x);
3481 __free_pages(page, compound_order(page)); 3411 __free_memcg_kmem_pages(page, compound_order(page));
3482 return; 3412 return;
3483 } 3413 }
3484 slab_free(page->slab, page, object, _RET_IP_); 3414 slab_free(page->slab_cache, page, object, _RET_IP_);
3485} 3415}
3486EXPORT_SYMBOL(kfree); 3416EXPORT_SYMBOL(kfree);
3487 3417
@@ -3573,7 +3503,7 @@ static void slab_mem_offline_callback(void *arg)
3573 struct memory_notify *marg = arg; 3503 struct memory_notify *marg = arg;
3574 int offline_node; 3504 int offline_node;
3575 3505
3576 offline_node = marg->status_change_nid; 3506 offline_node = marg->status_change_nid_normal;
3577 3507
3578 /* 3508 /*
3579 * If the node still has available memory. we need kmem_cache_node 3509 * If the node still has available memory. we need kmem_cache_node
@@ -3606,7 +3536,7 @@ static int slab_mem_going_online_callback(void *arg)
3606 struct kmem_cache_node *n; 3536 struct kmem_cache_node *n;
3607 struct kmem_cache *s; 3537 struct kmem_cache *s;
3608 struct memory_notify *marg = arg; 3538 struct memory_notify *marg = arg;
3609 int nid = marg->status_change_nid; 3539 int nid = marg->status_change_nid_normal;
3610 int ret = 0; 3540 int ret = 0;
3611 3541
3612 /* 3542 /*
@@ -3676,15 +3606,16 @@ static int slab_memory_callback(struct notifier_block *self,
3676 3606
3677/* 3607/*
3678 * Used for early kmem_cache structures that were allocated using 3608 * Used for early kmem_cache structures that were allocated using
3679 * the page allocator 3609 * the page allocator. Allocate them properly then fix up the pointers
3610 * that may be pointing to the wrong kmem_cache structure.
3680 */ 3611 */
3681 3612
3682static void __init kmem_cache_bootstrap_fixup(struct kmem_cache *s) 3613static struct kmem_cache * __init bootstrap(struct kmem_cache *static_cache)
3683{ 3614{
3684 int node; 3615 int node;
3616 struct kmem_cache *s = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT);
3685 3617
3686 list_add(&s->list, &slab_caches); 3618 memcpy(s, static_cache, kmem_cache->object_size);
3687 s->refcount = -1;
3688 3619
3689 for_each_node_state(node, N_NORMAL_MEMORY) { 3620 for_each_node_state(node, N_NORMAL_MEMORY) {
3690 struct kmem_cache_node *n = get_node(s, node); 3621 struct kmem_cache_node *n = get_node(s, node);
@@ -3692,78 +3623,52 @@ static void __init kmem_cache_bootstrap_fixup(struct kmem_cache *s)
3692 3623
3693 if (n) { 3624 if (n) {
3694 list_for_each_entry(p, &n->partial, lru) 3625 list_for_each_entry(p, &n->partial, lru)
3695 p->slab = s; 3626 p->slab_cache = s;
3696 3627
3697#ifdef CONFIG_SLUB_DEBUG 3628#ifdef CONFIG_SLUB_DEBUG
3698 list_for_each_entry(p, &n->full, lru) 3629 list_for_each_entry(p, &n->full, lru)
3699 p->slab = s; 3630 p->slab_cache = s;
3700#endif 3631#endif
3701 } 3632 }
3702 } 3633 }
3634 list_add(&s->list, &slab_caches);
3635 return s;
3703} 3636}
3704 3637
3705void __init kmem_cache_init(void) 3638void __init kmem_cache_init(void)
3706{ 3639{
3640 static __initdata struct kmem_cache boot_kmem_cache,
3641 boot_kmem_cache_node;
3707 int i; 3642 int i;
3708 int caches = 0; 3643 int caches = 2;
3709 struct kmem_cache *temp_kmem_cache;
3710 int order;
3711 struct kmem_cache *temp_kmem_cache_node;
3712 unsigned long kmalloc_size;
3713 3644
3714 if (debug_guardpage_minorder()) 3645 if (debug_guardpage_minorder())
3715 slub_max_order = 0; 3646 slub_max_order = 0;
3716 3647
3717 kmem_size = offsetof(struct kmem_cache, node) + 3648 kmem_cache_node = &boot_kmem_cache_node;
3718 nr_node_ids * sizeof(struct kmem_cache_node *); 3649 kmem_cache = &boot_kmem_cache;
3719
3720 /* Allocate two kmem_caches from the page allocator */
3721 kmalloc_size = ALIGN(kmem_size, cache_line_size());
3722 order = get_order(2 * kmalloc_size);
3723 kmem_cache = (void *)__get_free_pages(GFP_NOWAIT | __GFP_ZERO, order);
3724
3725 /*
3726 * Must first have the slab cache available for the allocations of the
3727 * struct kmem_cache_node's. There is special bootstrap code in
3728 * kmem_cache_open for slab_state == DOWN.
3729 */
3730 kmem_cache_node = (void *)kmem_cache + kmalloc_size;
3731 3650
3732 kmem_cache_node->name = "kmem_cache_node"; 3651 create_boot_cache(kmem_cache_node, "kmem_cache_node",
3733 kmem_cache_node->size = kmem_cache_node->object_size = 3652 sizeof(struct kmem_cache_node), SLAB_HWCACHE_ALIGN);
3734 sizeof(struct kmem_cache_node);
3735 kmem_cache_open(kmem_cache_node, SLAB_HWCACHE_ALIGN | SLAB_PANIC);
3736 3653
3737 hotplug_memory_notifier(slab_memory_callback, SLAB_CALLBACK_PRI); 3654 hotplug_memory_notifier(slab_memory_callback, SLAB_CALLBACK_PRI);
3738 3655
3739 /* Able to allocate the per node structures */ 3656 /* Able to allocate the per node structures */
3740 slab_state = PARTIAL; 3657 slab_state = PARTIAL;
3741 3658
3742 temp_kmem_cache = kmem_cache; 3659 create_boot_cache(kmem_cache, "kmem_cache",
3743 kmem_cache->name = "kmem_cache"; 3660 offsetof(struct kmem_cache, node) +
3744 kmem_cache->size = kmem_cache->object_size = kmem_size; 3661 nr_node_ids * sizeof(struct kmem_cache_node *),
3745 kmem_cache_open(kmem_cache, SLAB_HWCACHE_ALIGN | SLAB_PANIC); 3662 SLAB_HWCACHE_ALIGN);
3746 3663
3747 kmem_cache = kmem_cache_alloc(kmem_cache, GFP_NOWAIT); 3664 kmem_cache = bootstrap(&boot_kmem_cache);
3748 memcpy(kmem_cache, temp_kmem_cache, kmem_size);
3749 3665
3750 /* 3666 /*
3751 * Allocate kmem_cache_node properly from the kmem_cache slab. 3667 * Allocate kmem_cache_node properly from the kmem_cache slab.
3752 * kmem_cache_node is separately allocated so no need to 3668 * kmem_cache_node is separately allocated so no need to
3753 * update any list pointers. 3669 * update any list pointers.
3754 */ 3670 */
3755 temp_kmem_cache_node = kmem_cache_node; 3671 kmem_cache_node = bootstrap(&boot_kmem_cache_node);
3756
3757 kmem_cache_node = kmem_cache_alloc(kmem_cache, GFP_NOWAIT);
3758 memcpy(kmem_cache_node, temp_kmem_cache_node, kmem_size);
3759
3760 kmem_cache_bootstrap_fixup(kmem_cache_node);
3761
3762 caches++;
3763 kmem_cache_bootstrap_fixup(kmem_cache);
3764 caches++;
3765 /* Free temporary boot structure */
3766 free_pages((unsigned long)temp_kmem_cache, order);
3767 3672
3768 /* Now we can use the kmem_cache to allocate kmalloc slabs */ 3673 /* Now we can use the kmem_cache to allocate kmalloc slabs */
3769 3674
@@ -3891,7 +3796,7 @@ static int slab_unmergeable(struct kmem_cache *s)
3891 return 0; 3796 return 0;
3892} 3797}
3893 3798
3894static struct kmem_cache *find_mergeable(size_t size, 3799static struct kmem_cache *find_mergeable(struct mem_cgroup *memcg, size_t size,
3895 size_t align, unsigned long flags, const char *name, 3800 size_t align, unsigned long flags, const char *name,
3896 void (*ctor)(void *)) 3801 void (*ctor)(void *))
3897{ 3802{
@@ -3927,17 +3832,21 @@ static struct kmem_cache *find_mergeable(size_t size,
3927 if (s->size - size >= sizeof(void *)) 3832 if (s->size - size >= sizeof(void *))
3928 continue; 3833 continue;
3929 3834
3835 if (!cache_match_memcg(s, memcg))
3836 continue;
3837
3930 return s; 3838 return s;
3931 } 3839 }
3932 return NULL; 3840 return NULL;
3933} 3841}
3934 3842
3935struct kmem_cache *__kmem_cache_alias(const char *name, size_t size, 3843struct kmem_cache *
3936 size_t align, unsigned long flags, void (*ctor)(void *)) 3844__kmem_cache_alias(struct mem_cgroup *memcg, const char *name, size_t size,
3845 size_t align, unsigned long flags, void (*ctor)(void *))
3937{ 3846{
3938 struct kmem_cache *s; 3847 struct kmem_cache *s;
3939 3848
3940 s = find_mergeable(size, align, flags, name, ctor); 3849 s = find_mergeable(memcg, size, align, flags, name, ctor);
3941 if (s) { 3850 if (s) {
3942 s->refcount++; 3851 s->refcount++;
3943 /* 3852 /*
@@ -3964,6 +3873,11 @@ int __kmem_cache_create(struct kmem_cache *s, unsigned long flags)
3964 if (err) 3873 if (err)
3965 return err; 3874 return err;
3966 3875
3876 /* Mutex is not taken during early boot */
3877 if (slab_state <= UP)
3878 return 0;
3879
3880 memcg_propagate_slab_attrs(s);
3967 mutex_unlock(&slab_mutex); 3881 mutex_unlock(&slab_mutex);
3968 err = sysfs_slab_add(s); 3882 err = sysfs_slab_add(s);
3969 mutex_lock(&slab_mutex); 3883 mutex_lock(&slab_mutex);
@@ -5197,10 +5111,95 @@ static ssize_t slab_attr_store(struct kobject *kobj,
5197 return -EIO; 5111 return -EIO;
5198 5112
5199 err = attribute->store(s, buf, len); 5113 err = attribute->store(s, buf, len);
5114#ifdef CONFIG_MEMCG_KMEM
5115 if (slab_state >= FULL && err >= 0 && is_root_cache(s)) {
5116 int i;
5117
5118 mutex_lock(&slab_mutex);
5119 if (s->max_attr_size < len)
5120 s->max_attr_size = len;
5200 5121
5122 /*
5123 * This is a best effort propagation, so this function's return
5124 * value will be determined by the parent cache only. This is
5125 * basically because not all attributes will have a well
5126 * defined semantics for rollbacks - most of the actions will
5127 * have permanent effects.
5128 *
5129 * Returning the error value of any of the children that fail
5130 * is not 100 % defined, in the sense that users seeing the
5131 * error code won't be able to know anything about the state of
5132 * the cache.
5133 *
5134 * Only returning the error code for the parent cache at least
5135 * has well defined semantics. The cache being written to
5136 * directly either failed or succeeded, in which case we loop
5137 * through the descendants with best-effort propagation.
5138 */
5139 for_each_memcg_cache_index(i) {
5140 struct kmem_cache *c = cache_from_memcg(s, i);
5141 if (c)
5142 attribute->store(c, buf, len);
5143 }
5144 mutex_unlock(&slab_mutex);
5145 }
5146#endif
5201 return err; 5147 return err;
5202} 5148}
5203 5149
5150static void memcg_propagate_slab_attrs(struct kmem_cache *s)
5151{
5152#ifdef CONFIG_MEMCG_KMEM
5153 int i;
5154 char *buffer = NULL;
5155
5156 if (!is_root_cache(s))
5157 return;
5158
5159 /*
5160 * This means this cache had no attribute written. Therefore, no point
5161 * in copying default values around.
5162 */
5163 if (!s->max_attr_size)
5164 return;
5165
5166 for (i = 0; i < ARRAY_SIZE(slab_attrs); i++) {
5167 char mbuf[64];
5168 char *buf;
5169 struct slab_attribute *attr = to_slab_attr(slab_attrs[i]);
5170
5171 if (!attr || !attr->store || !attr->show)
5172 continue;
5173
5174 /*
5175 * It is really bad that we have to allocate here, so we will
5176 * do it only as a fallback. If we actually allocate, though,
5177 * we can just use the allocated buffer until the end.
5178 *
5179 * Most of the slub attributes will tend to be very small in
5180 * size, but sysfs allows buffers up to a page, so they can
5181 * theoretically happen.
5182 */
5183 if (buffer)
5184 buf = buffer;
5185 else if (s->max_attr_size < ARRAY_SIZE(mbuf))
5186 buf = mbuf;
5187 else {
5188 buffer = (char *) get_zeroed_page(GFP_KERNEL);
5189 if (WARN_ON(!buffer))
5190 continue;
5191 buf = buffer;
5192 }
5193
5194 attr->show(s->memcg_params->root_cache, buf);
5195 attr->store(s, buf, strlen(buf));
5196 }
5197
5198 if (buffer)
5199 free_page((unsigned long)buffer);
5200#endif
5201}
5202
5204static const struct sysfs_ops slab_sysfs_ops = { 5203static const struct sysfs_ops slab_sysfs_ops = {
5205 .show = slab_attr_show, 5204 .show = slab_attr_show,
5206 .store = slab_attr_store, 5205 .store = slab_attr_store,
@@ -5257,6 +5256,12 @@ static char *create_unique_id(struct kmem_cache *s)
5257 if (p != name + 1) 5256 if (p != name + 1)
5258 *p++ = '-'; 5257 *p++ = '-';
5259 p += sprintf(p, "%07d", s->size); 5258 p += sprintf(p, "%07d", s->size);
5259
5260#ifdef CONFIG_MEMCG_KMEM
5261 if (!is_root_cache(s))
5262 p += sprintf(p, "-%08d", memcg_cache_id(s->memcg_params->memcg));
5263#endif
5264
5260 BUG_ON(p > name + ID_STR_LENGTH - 1); 5265 BUG_ON(p > name + ID_STR_LENGTH - 1);
5261 return name; 5266 return name;
5262} 5267}
@@ -5265,13 +5270,8 @@ static int sysfs_slab_add(struct kmem_cache *s)
5265{ 5270{
5266 int err; 5271 int err;
5267 const char *name; 5272 const char *name;
5268 int unmergeable; 5273 int unmergeable = slab_unmergeable(s);
5269
5270 if (slab_state < FULL)
5271 /* Defer until later */
5272 return 0;
5273 5274
5274 unmergeable = slab_unmergeable(s);
5275 if (unmergeable) { 5275 if (unmergeable) {
5276 /* 5276 /*
5277 * Slabcache can never be merged so we can use the name proper. 5277 * Slabcache can never be merged so we can use the name proper.
@@ -5405,49 +5405,14 @@ __initcall(slab_sysfs_init);
5405 * The /proc/slabinfo ABI 5405 * The /proc/slabinfo ABI
5406 */ 5406 */
5407#ifdef CONFIG_SLABINFO 5407#ifdef CONFIG_SLABINFO
5408static void print_slabinfo_header(struct seq_file *m) 5408void get_slabinfo(struct kmem_cache *s, struct slabinfo *sinfo)
5409{
5410 seq_puts(m, "slabinfo - version: 2.1\n");
5411 seq_puts(m, "# name <active_objs> <num_objs> <object_size> "
5412 "<objperslab> <pagesperslab>");
5413 seq_puts(m, " : tunables <limit> <batchcount> <sharedfactor>");
5414 seq_puts(m, " : slabdata <active_slabs> <num_slabs> <sharedavail>");
5415 seq_putc(m, '\n');
5416}
5417
5418static void *s_start(struct seq_file *m, loff_t *pos)
5419{
5420 loff_t n = *pos;
5421
5422 mutex_lock(&slab_mutex);
5423 if (!n)
5424 print_slabinfo_header(m);
5425
5426 return seq_list_start(&slab_caches, *pos);
5427}
5428
5429static void *s_next(struct seq_file *m, void *p, loff_t *pos)
5430{
5431 return seq_list_next(p, &slab_caches, pos);
5432}
5433
5434static void s_stop(struct seq_file *m, void *p)
5435{
5436 mutex_unlock(&slab_mutex);
5437}
5438
5439static int s_show(struct seq_file *m, void *p)
5440{ 5409{
5441 unsigned long nr_partials = 0; 5410 unsigned long nr_partials = 0;
5442 unsigned long nr_slabs = 0; 5411 unsigned long nr_slabs = 0;
5443 unsigned long nr_inuse = 0;
5444 unsigned long nr_objs = 0; 5412 unsigned long nr_objs = 0;
5445 unsigned long nr_free = 0; 5413 unsigned long nr_free = 0;
5446 struct kmem_cache *s;
5447 int node; 5414 int node;
5448 5415
5449 s = list_entry(p, struct kmem_cache, list);
5450
5451 for_each_online_node(node) { 5416 for_each_online_node(node) {
5452 struct kmem_cache_node *n = get_node(s, node); 5417 struct kmem_cache_node *n = get_node(s, node);
5453 5418
@@ -5460,41 +5425,21 @@ static int s_show(struct seq_file *m, void *p)
5460 nr_free += count_partial(n, count_free); 5425 nr_free += count_partial(n, count_free);
5461 } 5426 }
5462 5427
5463 nr_inuse = nr_objs - nr_free; 5428 sinfo->active_objs = nr_objs - nr_free;
5464 5429 sinfo->num_objs = nr_objs;
5465 seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d", s->name, nr_inuse, 5430 sinfo->active_slabs = nr_slabs;
5466 nr_objs, s->size, oo_objects(s->oo), 5431 sinfo->num_slabs = nr_slabs;
5467 (1 << oo_order(s->oo))); 5432 sinfo->objects_per_slab = oo_objects(s->oo);
5468 seq_printf(m, " : tunables %4u %4u %4u", 0, 0, 0); 5433 sinfo->cache_order = oo_order(s->oo);
5469 seq_printf(m, " : slabdata %6lu %6lu %6lu", nr_slabs, nr_slabs,
5470 0UL);
5471 seq_putc(m, '\n');
5472 return 0;
5473} 5434}
5474 5435
5475static const struct seq_operations slabinfo_op = { 5436void slabinfo_show_stats(struct seq_file *m, struct kmem_cache *s)
5476 .start = s_start,
5477 .next = s_next,
5478 .stop = s_stop,
5479 .show = s_show,
5480};
5481
5482static int slabinfo_open(struct inode *inode, struct file *file)
5483{ 5437{
5484 return seq_open(file, &slabinfo_op);
5485} 5438}
5486 5439
5487static const struct file_operations proc_slabinfo_operations = { 5440ssize_t slabinfo_write(struct file *file, const char __user *buffer,
5488 .open = slabinfo_open, 5441 size_t count, loff_t *ppos)
5489 .read = seq_read,
5490 .llseek = seq_lseek,
5491 .release = seq_release,
5492};
5493
5494static int __init slab_proc_init(void)
5495{ 5442{
5496 proc_create("slabinfo", S_IRUSR, NULL, &proc_slabinfo_operations); 5443 return -EIO;
5497 return 0;
5498} 5444}
5499module_init(slab_proc_init);
5500#endif /* CONFIG_SLABINFO */ 5445#endif /* CONFIG_SLABINFO */
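With the hunk above, slub.c stops implementing the /proc/slabinfo seq_file machinery itself: it only fills a struct slabinfo in get_slabinfo(), leaves slabinfo_show_stats() empty and rejects writes in slabinfo_write(), while the shared proc plumbing is expected to live in common slab code. The sketch below models how such a consumer might format one slabinfo line from that structure; the tunables/shared fields and the demo numbers are assumptions, not copied from mm/slab_common.c.

#include <stdio.h>

/* Fields mirrored from what get_slabinfo() fills in above; tunables assumed. */
struct slabinfo {
        unsigned long active_objs, num_objs;
        unsigned long active_slabs, num_slabs;
        unsigned int objects_per_slab, cache_order;
        unsigned long limit, batchcount, shared, shared_avail;  /* unused by SLUB */
};

/* Stand-in for the allocator-specific hook, here returning canned numbers. */
static void get_slabinfo_demo(const char *name, struct slabinfo *s)
{
        (void)name;
        *s = (struct slabinfo){ .active_objs = 120, .num_objs = 128,
                                .active_slabs = 4, .num_slabs = 4,
                                .objects_per_slab = 32, .cache_order = 1 };
}

/* The common code would then format one slabinfo line per cache. */
static void print_slabinfo_line(const char *name, unsigned int obj_size)
{
        struct slabinfo s;

        get_slabinfo_demo(name, &s);
        printf("%-17s %6lu %6lu %6u %4u %4d", name, s.active_objs, s.num_objs,
               obj_size, s.objects_per_slab, 1 << s.cache_order);
        printf(" : tunables %4lu %4lu %4lu", s.limit, s.batchcount, s.shared);
        printf(" : slabdata %6lu %6lu %6lu\n", s.active_slabs, s.num_slabs,
               s.shared_avail);
}

int main(void)
{
        print_slabinfo_line("kmalloc-256", 256);
        return 0;
}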
diff --git a/mm/sparse.c b/mm/sparse.c
index a83de2f72b30..6b5fb762e2ca 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -638,7 +638,6 @@ static struct page *__kmalloc_section_memmap(unsigned long nr_pages)
638got_map_page: 638got_map_page:
639 ret = (struct page *)pfn_to_kaddr(page_to_pfn(page)); 639 ret = (struct page *)pfn_to_kaddr(page_to_pfn(page));
640got_map_ptr: 640got_map_ptr:
641 memset(ret, 0, memmap_size);
642 641
643 return ret; 642 return ret;
644} 643}
@@ -758,6 +757,8 @@ int __meminit sparse_add_one_section(struct zone *zone, unsigned long start_pfn,
758 goto out; 757 goto out;
759 } 758 }
760 759
760 memset(memmap, 0, sizeof(struct page) * nr_pages);
761
761 ms->section_mem_map |= SECTION_MARKED_PRESENT; 762 ms->section_mem_map |= SECTION_MARKED_PRESENT;
762 763
763 ret = sparse_init_one_section(ms, section_nr, memmap, usemap); 764 ret = sparse_init_one_section(ms, section_nr, memmap, usemap);
@@ -771,6 +772,27 @@ out:
771 return ret; 772 return ret;
772} 773}
773 774
775#ifdef CONFIG_MEMORY_FAILURE
776static void clear_hwpoisoned_pages(struct page *memmap, int nr_pages)
777{
778 int i;
779
780 if (!memmap)
781 return;
782
783 for (i = 0; i < PAGES_PER_SECTION; i++) {
784 if (PageHWPoison(&memmap[i])) {
785 atomic_long_sub(1, &mce_bad_pages);
786 ClearPageHWPoison(&memmap[i]);
787 }
788 }
789}
790#else
791static inline void clear_hwpoisoned_pages(struct page *memmap, int nr_pages)
792{
793}
794#endif
795
774void sparse_remove_one_section(struct zone *zone, struct mem_section *ms) 796void sparse_remove_one_section(struct zone *zone, struct mem_section *ms)
775{ 797{
776 struct page *memmap = NULL; 798 struct page *memmap = NULL;
@@ -784,6 +806,7 @@ void sparse_remove_one_section(struct zone *zone, struct mem_section *ms)
784 ms->pageblock_flags = NULL; 806 ms->pageblock_flags = NULL;
785 } 807 }
786 808
809 clear_hwpoisoned_pages(memmap, PAGES_PER_SECTION);
787 free_section_usemap(memmap, usemap); 810 free_section_usemap(memmap, usemap);
788} 811}
789#endif 812#endif
diff --git a/mm/swapfile.c b/mm/swapfile.c
index f91a25547ffe..e97a0e5aea91 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1443,13 +1443,12 @@ static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span)
1443 return generic_swapfile_activate(sis, swap_file, span); 1443 return generic_swapfile_activate(sis, swap_file, span);
1444} 1444}
1445 1445
1446static void enable_swap_info(struct swap_info_struct *p, int prio, 1446static void _enable_swap_info(struct swap_info_struct *p, int prio,
1447 unsigned char *swap_map, 1447 unsigned char *swap_map,
1448 unsigned long *frontswap_map) 1448 unsigned long *frontswap_map)
1449{ 1449{
1450 int i, prev; 1450 int i, prev;
1451 1451
1452 spin_lock(&swap_lock);
1453 if (prio >= 0) 1452 if (prio >= 0)
1454 p->prio = prio; 1453 p->prio = prio;
1455 else 1454 else
@@ -1472,10 +1471,25 @@ static void enable_swap_info(struct swap_info_struct *p, int prio,
1472 swap_list.head = swap_list.next = p->type; 1471 swap_list.head = swap_list.next = p->type;
1473 else 1472 else
1474 swap_info[prev]->next = p->type; 1473 swap_info[prev]->next = p->type;
1474}
1475
1476static void enable_swap_info(struct swap_info_struct *p, int prio,
1477 unsigned char *swap_map,
1478 unsigned long *frontswap_map)
1479{
1480 spin_lock(&swap_lock);
1481 _enable_swap_info(p, prio, swap_map, frontswap_map);
1475 frontswap_init(p->type); 1482 frontswap_init(p->type);
1476 spin_unlock(&swap_lock); 1483 spin_unlock(&swap_lock);
1477} 1484}
1478 1485
1486static void reinsert_swap_info(struct swap_info_struct *p)
1487{
1488 spin_lock(&swap_lock);
1489 _enable_swap_info(p, p->prio, p->swap_map, frontswap_map_get(p));
1490 spin_unlock(&swap_lock);
1491}
1492
1479SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) 1493SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
1480{ 1494{
1481 struct swap_info_struct *p = NULL; 1495 struct swap_info_struct *p = NULL;
@@ -1484,7 +1498,6 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
1484 struct address_space *mapping; 1498 struct address_space *mapping;
1485 struct inode *inode; 1499 struct inode *inode;
1486 struct filename *pathname; 1500 struct filename *pathname;
1487 int oom_score_adj;
1488 int i, type, prev; 1501 int i, type, prev;
1489 int err; 1502 int err;
1490 1503
@@ -1543,19 +1556,13 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
1543 p->flags &= ~SWP_WRITEOK; 1556 p->flags &= ~SWP_WRITEOK;
1544 spin_unlock(&swap_lock); 1557 spin_unlock(&swap_lock);
1545 1558
1546 oom_score_adj = test_set_oom_score_adj(OOM_SCORE_ADJ_MAX); 1559 set_current_oom_origin();
1547 err = try_to_unuse(type, false, 0); /* force all pages to be unused */ 1560 err = try_to_unuse(type, false, 0); /* force all pages to be unused */
1548 compare_swap_oom_score_adj(OOM_SCORE_ADJ_MAX, oom_score_adj); 1561 clear_current_oom_origin();
1549 1562
1550 if (err) { 1563 if (err) {
1551 /*
1552 * reading p->prio and p->swap_map outside the lock is
1553 * safe here because only sys_swapon and sys_swapoff
1554 * change them, and there can be no other sys_swapon or
1555 * sys_swapoff for this swap_info_struct at this point.
1556 */
1557 /* re-insert swap space back into swap_list */ 1564 /* re-insert swap space back into swap_list */
1558 enable_swap_info(p, p->prio, p->swap_map, frontswap_map_get(p)); 1565 reinsert_swap_info(p);
1559 goto out_dput; 1566 goto out_dput;
1560 } 1567 }
1561 1568
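The swapfile.c change above is a lock-split refactor: the part of enable_swap_info() that must run under swap_lock moves into _enable_swap_info(), and two thin wrappers take the lock themselves, so the swapoff error path can re-insert a swap area without duplicating the list insertion. A generic pthread-based sketch of the same pattern, with illustrative names only:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;
static int active_entries;

/* Core logic; the caller must hold list_lock (the "_" prefix marks this). */
static void _insert_entry(int prio)
{
        /* a real implementation would link the entry into a priority list */
        active_entries++;
        printf("inserted with prio %d (total %d)\n", prio, active_entries);
}

/* First-time enable: takes the lock and also does one-time setup. */
static void insert_entry(int prio)
{
        pthread_mutex_lock(&list_lock);
        _insert_entry(prio);
        /* one-time setup only the initial path needs, cf. frontswap_init() */
        pthread_mutex_unlock(&list_lock);
}

/* Error-path re-insert: same core, same lock, no one-time setup. */
static void reinsert_entry(int prio)
{
        pthread_mutex_lock(&list_lock);
        _insert_entry(prio);
        pthread_mutex_unlock(&list_lock);
}

int main(void)
{
        insert_entry(5);
        reinsert_entry(5);
        return 0;
}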
diff --git a/mm/truncate.c b/mm/truncate.c
index d51ce92d6e83..c75b736e54b7 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -577,29 +577,6 @@ void truncate_setsize(struct inode *inode, loff_t newsize)
577EXPORT_SYMBOL(truncate_setsize); 577EXPORT_SYMBOL(truncate_setsize);
578 578
579/** 579/**
580 * vmtruncate - unmap mappings "freed" by truncate() syscall
581 * @inode: inode of the file used
582 * @newsize: file offset to start truncating
583 *
584 * This function is deprecated and truncate_setsize or truncate_pagecache
585 * should be used instead, together with filesystem specific block truncation.
586 */
587int vmtruncate(struct inode *inode, loff_t newsize)
588{
589 int error;
590
591 error = inode_newsize_ok(inode, newsize);
592 if (error)
593 return error;
594
595 truncate_setsize(inode, newsize);
596 if (inode->i_op->truncate)
597 inode->i_op->truncate(inode);
598 return 0;
599}
600EXPORT_SYMBOL(vmtruncate);
601
602/**
603 * truncate_pagecache_range - unmap and remove pagecache that is hole-punched 580 * truncate_pagecache_range - unmap and remove pagecache that is hole-punched
604 * @inode: inode 581 * @inode: inode
605 * @lstart: offset of beginning of hole 582 * @lstart: offset of beginning of hole
diff --git a/mm/util.c b/mm/util.c
index dc3036cdcc6a..c55e26b17d93 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -152,7 +152,7 @@ EXPORT_SYMBOL(__krealloc);
152 * 152 *
153 * The contents of the object pointed to are preserved up to the 153 * The contents of the object pointed to are preserved up to the
154 * lesser of the new and old sizes. If @p is %NULL, krealloc() 154 * lesser of the new and old sizes. If @p is %NULL, krealloc()
155 * behaves exactly like kmalloc(). If @size is 0 and @p is not a 155 * behaves exactly like kmalloc(). If @new_size is 0 and @p is not a
156 * %NULL pointer, the object pointed to is freed. 156 * %NULL pointer, the object pointed to is freed.
157 */ 157 */
158void *krealloc(const void *p, size_t new_size, gfp_t flags) 158void *krealloc(const void *p, size_t new_size, gfp_t flags)
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 78e08300db21..5123a169ab7b 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -2550,7 +2550,7 @@ static void s_stop(struct seq_file *m, void *p)
2550 2550
2551static void show_numa_info(struct seq_file *m, struct vm_struct *v) 2551static void show_numa_info(struct seq_file *m, struct vm_struct *v)
2552{ 2552{
2553 if (NUMA_BUILD) { 2553 if (IS_ENABLED(CONFIG_NUMA)) {
2554 unsigned int nr, *counters = m->private; 2554 unsigned int nr, *counters = m->private;
2555 2555
2556 if (!counters) 2556 if (!counters)
@@ -2615,7 +2615,7 @@ static int vmalloc_open(struct inode *inode, struct file *file)
2615 unsigned int *ptr = NULL; 2615 unsigned int *ptr = NULL;
2616 int ret; 2616 int ret;
2617 2617
2618 if (NUMA_BUILD) { 2618 if (IS_ENABLED(CONFIG_NUMA)) {
2619 ptr = kmalloc(nr_node_ids * sizeof(unsigned int), GFP_KERNEL); 2619 ptr = kmalloc(nr_node_ids * sizeof(unsigned int), GFP_KERNEL);
2620 if (ptr == NULL) 2620 if (ptr == NULL)
2621 return -ENOMEM; 2621 return -ENOMEM;
diff --git a/mm/vmscan.c b/mm/vmscan.c
index b7ed37675644..196709f5ee58 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1177,7 +1177,11 @@ int isolate_lru_page(struct page *page)
1177} 1177}
1178 1178
1179/* 1179/*
1180 * Are there way too many processes in the direct reclaim path already? 1180 * A direct reclaimer may isolate SWAP_CLUSTER_MAX pages from the LRU list and
1181 * then get rescheduled. When there is a massive number of tasks doing page
1182 * allocation, such sleeping direct reclaimers may keep piling up on each CPU;
1183 * the LRU list then shrinks and is scanned faster than necessary, leading to
1184 * unnecessary swapping, thrashing and OOM.
1181 */ 1185 */
1182static int too_many_isolated(struct zone *zone, int file, 1186static int too_many_isolated(struct zone *zone, int file,
1183 struct scan_control *sc) 1187 struct scan_control *sc)
@@ -1198,6 +1202,14 @@ static int too_many_isolated(struct zone *zone, int file,
1198 isolated = zone_page_state(zone, NR_ISOLATED_ANON); 1202 isolated = zone_page_state(zone, NR_ISOLATED_ANON);
1199 } 1203 }
1200 1204
1205 /*
1206 * GFP_NOIO/GFP_NOFS callers are allowed to isolate more pages, so that
1207 * they do not get blocked behind normal direct reclaimers, which could
1208 * otherwise form a circular deadlock.
1209 */
1210 if ((sc->gfp_mask & GFP_IOFS) == GFP_IOFS)
1211 inactive >>= 3;
1212
1201 return isolated > inactive; 1213 return isolated > inactive;
1202} 1214}
1203 1215
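Note what the new check above actually does: the shift applies when both __GFP_IO and __GFP_FS are set, so fully capable direct reclaimers are throttled once isolated pages exceed one eighth of the inactive list, while GFP_NOFS/GFP_NOIO reclaimers keep the full budget and therefore cannot be blocked behind them. A self-contained model of the arithmetic (the flag constants are stand-ins, not the kernel's gfp bits):

#include <stdbool.h>
#include <stdio.h>

#define MAY_IO   0x1
#define MAY_FS   0x2
#define MAY_IOFS (MAY_IO | MAY_FS)

/* Same shape as the patched check: shift only for IO+FS capable callers. */
static bool too_many_isolated(unsigned long isolated, unsigned long inactive,
                              unsigned int gfp_mask)
{
        if ((gfp_mask & MAY_IOFS) == MAY_IOFS)
                inactive >>= 3;
        return isolated > inactive;
}

int main(void)
{
        unsigned long inactive = 8000;

        /* A normal reclaimer is throttled once ~1000 pages are isolated... */
        printf("normal, 1200 isolated: %d\n",
               too_many_isolated(1200, inactive, MAY_IOFS));
        /* ...while a GFP_NOFS reclaimer still gets the full 8000-page budget. */
        printf("NOFS,   1200 isolated: %d\n",
               too_many_isolated(1200, inactive, MAY_IO));
        return 0;
}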
@@ -1679,13 +1691,24 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
1679 1691
1680 if (global_reclaim(sc)) { 1692 if (global_reclaim(sc)) {
1681 free = zone_page_state(zone, NR_FREE_PAGES); 1693 free = zone_page_state(zone, NR_FREE_PAGES);
1682 /* If we have very few page cache pages,
1683 force-scan anon pages. */
1684 if (unlikely(file + free <= high_wmark_pages(zone))) { 1694 if (unlikely(file + free <= high_wmark_pages(zone))) {
1695 /*
1696 * If we have very few page cache pages, force-scan
1697 * anon pages.
1698 */
1685 fraction[0] = 1; 1699 fraction[0] = 1;
1686 fraction[1] = 0; 1700 fraction[1] = 0;
1687 denominator = 1; 1701 denominator = 1;
1688 goto out; 1702 goto out;
1703 } else if (!inactive_file_is_low_global(zone)) {
1704 /*
1705 * There is enough inactive page cache, do not
1706 * reclaim anything from the working set right now.
1707 */
1708 fraction[0] = 0;
1709 fraction[1] = 1;
1710 denominator = 1;
1711 goto out;
1689 } 1712 }
1690 } 1713 }
1691 1714
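For reference, the fraction/denominator pair set in the new branch above is later used to apportion scan pressure between the anon and file LRUs; with fraction[0] = 0 and fraction[1] = 1 all pressure lands on the file lists. A toy model of that apportioning, not the kernel's exact arithmetic:

#include <stdio.h>

/* Toy model of how get_scan_count()-style fractions split scan pressure. */
static unsigned long apportion(unsigned long scan_target,
                               unsigned long fraction, unsigned long denominator)
{
        return scan_target * fraction / denominator;
}

int main(void)
{
        unsigned long lru_size = 4096;   /* pages eligible on one LRU list */

        /* "Plenty of inactive file cache": anon fraction 0, file fraction 1. */
        printf("anon scan: %lu\n", apportion(lru_size, 0, 1));  /* 0    */
        printf("file scan: %lu\n", apportion(lru_size, 1, 1));  /* 4096 */
        return 0;
}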
@@ -1752,7 +1775,7 @@ out:
1752/* Use reclaim/compaction for costly allocs or under memory pressure */ 1775/* Use reclaim/compaction for costly allocs or under memory pressure */
1753static bool in_reclaim_compaction(struct scan_control *sc) 1776static bool in_reclaim_compaction(struct scan_control *sc)
1754{ 1777{
1755 if (COMPACTION_BUILD && sc->order && 1778 if (IS_ENABLED(CONFIG_COMPACTION) && sc->order &&
1756 (sc->order > PAGE_ALLOC_COSTLY_ORDER || 1779 (sc->order > PAGE_ALLOC_COSTLY_ORDER ||
1757 sc->priority < DEF_PRIORITY - 2)) 1780 sc->priority < DEF_PRIORITY - 2))
1758 return true; 1781 return true;
@@ -2005,7 +2028,7 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
2005 if (zone->all_unreclaimable && 2028 if (zone->all_unreclaimable &&
2006 sc->priority != DEF_PRIORITY) 2029 sc->priority != DEF_PRIORITY)
2007 continue; /* Let kswapd poll it */ 2030 continue; /* Let kswapd poll it */
2008 if (COMPACTION_BUILD) { 2031 if (IS_ENABLED(CONFIG_COMPACTION)) {
2009 /* 2032 /*
2010 * If we already have plenty of memory free for 2033 * If we already have plenty of memory free for
2011 * compaction in this zone, don't free any more. 2034 * compaction in this zone, don't free any more.
@@ -2421,19 +2444,24 @@ static bool zone_balanced(struct zone *zone, int order,
2421 balance_gap, classzone_idx, 0)) 2444 balance_gap, classzone_idx, 0))
2422 return false; 2445 return false;
2423 2446
2424 if (COMPACTION_BUILD && order && !compaction_suitable(zone, order)) 2447 if (IS_ENABLED(CONFIG_COMPACTION) && order &&
2448 !compaction_suitable(zone, order))
2425 return false; 2449 return false;
2426 2450
2427 return true; 2451 return true;
2428} 2452}
2429 2453
2430/* 2454/*
2431 * pgdat_balanced is used when checking if a node is balanced for high-order 2455 * pgdat_balanced() is used when checking if a node is balanced.
2432 * allocations. Only zones that meet watermarks and are in a zone allowed 2456 *
2433 * by the callers classzone_idx are added to balanced_pages. The total of 2457 * For order-0, all zones must be balanced!
2434 * balanced pages must be at least 25% of the zones allowed by classzone_idx 2458 *
2435 * for the node to be considered balanced. Forcing all zones to be balanced 2459 * For high-order allocations only zones that meet watermarks and are in a
2436 * for high orders can cause excessive reclaim when there are imbalanced zones. 2460 * zone allowed by the callers classzone_idx are added to balanced_pages. The
2461 * total of balanced pages must be at least 25% of the zones allowed by
2462 * classzone_idx for the node to be considered balanced. Forcing all zones to
2463 * be balanced for high orders can cause excessive reclaim when there are
2464 * imbalanced zones.
2437 * The choice of 25% is due to 2465 * The choice of 25% is due to
2438 * o a 16M DMA zone that is balanced will not balance a zone on any 2466 * o a 16M DMA zone that is balanced will not balance a zone on any
2439 * reasonable sized machine 2467 * reasonable sized machine
@@ -2443,17 +2471,43 @@ static bool zone_balanced(struct zone *zone, int order,
2443 * Similarly, on x86-64 the Normal zone would need to be at least 1G 2471 * Similarly, on x86-64 the Normal zone would need to be at least 1G
2444 * to balance a node on its own. These seemed like reasonable ratios. 2472 * to balance a node on its own. These seemed like reasonable ratios.
2445 */ 2473 */
2446static bool pgdat_balanced(pg_data_t *pgdat, unsigned long balanced_pages, 2474static bool pgdat_balanced(pg_data_t *pgdat, int order, int classzone_idx)
2447 int classzone_idx)
2448{ 2475{
2449 unsigned long present_pages = 0; 2476 unsigned long present_pages = 0;
2477 unsigned long balanced_pages = 0;
2450 int i; 2478 int i;
2451 2479
2452 for (i = 0; i <= classzone_idx; i++) 2480 /* Check the watermark levels */
2453 present_pages += pgdat->node_zones[i].present_pages; 2481 for (i = 0; i <= classzone_idx; i++) {
2482 struct zone *zone = pgdat->node_zones + i;
2483
2484 if (!populated_zone(zone))
2485 continue;
2486
2487 present_pages += zone->present_pages;
2488
2489 /*
2490 * A special case here:
2491 *
2492 * balance_pgdat() skips over all_unreclaimable after
2493 * DEF_PRIORITY. Effectively, it considers them balanced, so
2494 * they must be considered balanced here as well!
2495 */
2496 if (zone->all_unreclaimable) {
2497 balanced_pages += zone->present_pages;
2498 continue;
2499 }
2500
2501 if (zone_balanced(zone, order, 0, i))
2502 balanced_pages += zone->present_pages;
2503 else if (!order)
2504 return false;
2505 }
2454 2506
2455 /* A special case here: if zone has no page, we think it's balanced */ 2507 if (order)
2456 return balanced_pages >= (present_pages >> 2); 2508 return balanced_pages >= (present_pages >> 2);
2509 else
2510 return true;
2457} 2511}
2458 2512
2459/* 2513/*
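A worked example of the 25% rule implemented above, under assumed zone sizes: for order-0 every populated, reclaimable zone must pass zone_balanced(), while for order > 0 it is enough that balanced (or all_unreclaimable) zones cover at least present_pages >> 2 of the pages up to classzone_idx.

#include <stdio.h>

int main(void)
{
        /* Assumed zone sizes, in pages, up to the caller's classzone_idx. */
        unsigned long dma = 4096, dma32 = 491520, normal = 524288;
        unsigned long present = dma + dma32 + normal;
        /* Say zone_balanced() passed for DMA and Normal but not for DMA32. */
        unsigned long balanced = dma + normal;

        printf("threshold (present >> 2) = %lu\n", present >> 2);  /* 254976 */
        printf("balanced pages           = %lu\n", balanced);      /* 528384 */
        printf("high-order balanced?       %d\n", balanced >= (present >> 2)); /* 1 */
        printf("order-0 balanced?          %d\n", 0); /* a populated zone failed */
        return 0;
}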
@@ -2465,10 +2519,6 @@ static bool pgdat_balanced(pg_data_t *pgdat, unsigned long balanced_pages,
2465static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining, 2519static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining,
2466 int classzone_idx) 2520 int classzone_idx)
2467{ 2521{
2468 int i;
2469 unsigned long balanced = 0;
2470 bool all_zones_ok = true;
2471
2472 /* If a direct reclaimer woke kswapd within HZ/10, it's premature */ 2522 /* If a direct reclaimer woke kswapd within HZ/10, it's premature */
2473 if (remaining) 2523 if (remaining)
2474 return false; 2524 return false;
@@ -2487,39 +2537,7 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining,
2487 return false; 2537 return false;
2488 } 2538 }
2489 2539
2490 /* Check the watermark levels */ 2540 return pgdat_balanced(pgdat, order, classzone_idx);
2491 for (i = 0; i <= classzone_idx; i++) {
2492 struct zone *zone = pgdat->node_zones + i;
2493
2494 if (!populated_zone(zone))
2495 continue;
2496
2497 /*
2498 * balance_pgdat() skips over all_unreclaimable after
2499 * DEF_PRIORITY. Effectively, it considers them balanced so
2500 * they must be considered balanced here as well if kswapd
2501 * is to sleep
2502 */
2503 if (zone->all_unreclaimable) {
2504 balanced += zone->present_pages;
2505 continue;
2506 }
2507
2508 if (!zone_balanced(zone, order, 0, i))
2509 all_zones_ok = false;
2510 else
2511 balanced += zone->present_pages;
2512 }
2513
2514 /*
2515 * For high-order requests, the balanced zones must contain at least
2516 * 25% of the nodes pages for kswapd to sleep. For order-0, all zones
2517 * must be balanced
2518 */
2519 if (order)
2520 return pgdat_balanced(pgdat, balanced, classzone_idx);
2521 else
2522 return all_zones_ok;
2523} 2541}
2524 2542
2525/* 2543/*
@@ -2546,8 +2564,7 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining,
2546static unsigned long balance_pgdat(pg_data_t *pgdat, int order, 2564static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
2547 int *classzone_idx) 2565 int *classzone_idx)
2548{ 2566{
2549 int all_zones_ok; 2567 struct zone *unbalanced_zone;
2550 unsigned long balanced;
2551 int i; 2568 int i;
2552 int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */ 2569 int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */
2553 unsigned long total_scanned; 2570 unsigned long total_scanned;
@@ -2580,8 +2597,7 @@ loop_again:
2580 unsigned long lru_pages = 0; 2597 unsigned long lru_pages = 0;
2581 int has_under_min_watermark_zone = 0; 2598 int has_under_min_watermark_zone = 0;
2582 2599
2583 all_zones_ok = 1; 2600 unbalanced_zone = NULL;
2584 balanced = 0;
2585 2601
2586 /* 2602 /*
2587 * Scan in the highmem->dma direction for the highest 2603 * Scan in the highmem->dma direction for the highest
@@ -2684,7 +2700,7 @@ loop_again:
2684 * Do not reclaim more than needed for compaction. 2700 * Do not reclaim more than needed for compaction.
2685 */ 2701 */
2686 testorder = order; 2702 testorder = order;
2687 if (COMPACTION_BUILD && order && 2703 if (IS_ENABLED(CONFIG_COMPACTION) && order &&
2688 compaction_suitable(zone, order) != 2704 compaction_suitable(zone, order) !=
2689 COMPACT_SKIPPED) 2705 COMPACT_SKIPPED)
2690 testorder = 0; 2706 testorder = 0;
@@ -2719,7 +2735,7 @@ loop_again:
2719 } 2735 }
2720 2736
2721 if (!zone_balanced(zone, testorder, 0, end_zone)) { 2737 if (!zone_balanced(zone, testorder, 0, end_zone)) {
2722 all_zones_ok = 0; 2738 unbalanced_zone = zone;
2723 /* 2739 /*
2724 * We are still under min water mark. This 2740 * We are still under min water mark. This
2725 * means that we have a GFP_ATOMIC allocation 2741 * means that we have a GFP_ATOMIC allocation
@@ -2737,8 +2753,6 @@ loop_again:
2737 * speculatively avoid congestion waits 2753 * speculatively avoid congestion waits
2738 */ 2754 */
2739 zone_clear_flag(zone, ZONE_CONGESTED); 2755 zone_clear_flag(zone, ZONE_CONGESTED);
2740 if (i <= *classzone_idx)
2741 balanced += zone->present_pages;
2742 } 2756 }
2743 2757
2744 } 2758 }
@@ -2752,7 +2766,7 @@ loop_again:
2752 pfmemalloc_watermark_ok(pgdat)) 2766 pfmemalloc_watermark_ok(pgdat))
2753 wake_up(&pgdat->pfmemalloc_wait); 2767 wake_up(&pgdat->pfmemalloc_wait);
2754 2768
2755 if (all_zones_ok || (order && pgdat_balanced(pgdat, balanced, *classzone_idx))) 2769 if (pgdat_balanced(pgdat, order, *classzone_idx))
2756 break; /* kswapd: all done */ 2770 break; /* kswapd: all done */
2757 /* 2771 /*
2758 * OK, kswapd is getting into trouble. Take a nap, then take 2772 * OK, kswapd is getting into trouble. Take a nap, then take
@@ -2761,8 +2775,8 @@ loop_again:
2761 if (total_scanned && (sc.priority < DEF_PRIORITY - 2)) { 2775 if (total_scanned && (sc.priority < DEF_PRIORITY - 2)) {
2762 if (has_under_min_watermark_zone) 2776 if (has_under_min_watermark_zone)
2763 count_vm_event(KSWAPD_SKIP_CONGESTION_WAIT); 2777 count_vm_event(KSWAPD_SKIP_CONGESTION_WAIT);
2764 else 2778 else if (unbalanced_zone)
2765 congestion_wait(BLK_RW_ASYNC, HZ/10); 2779 wait_iff_congested(unbalanced_zone, BLK_RW_ASYNC, HZ/10);
2766 } 2780 }
2767 2781
2768 /* 2782 /*
@@ -2776,12 +2790,7 @@ loop_again:
2776 } while (--sc.priority >= 0); 2790 } while (--sc.priority >= 0);
2777out: 2791out:
2778 2792
2779 /* 2793 if (!pgdat_balanced(pgdat, order, *classzone_idx)) {
2780 * order-0: All zones must meet high watermark for a balanced node
2781 * high-order: Balanced zones must make up at least 25% of the node
2782 * for the node to be balanced
2783 */
2784 if (!(all_zones_ok || (order && pgdat_balanced(pgdat, balanced, *classzone_idx)))) {
2785 cond_resched(); 2794 cond_resched();
2786 2795
2787 try_to_freeze(); 2796 try_to_freeze();
@@ -2951,7 +2960,7 @@ static int kswapd(void *p)
2951 classzone_idx = new_classzone_idx = pgdat->nr_zones - 1; 2960 classzone_idx = new_classzone_idx = pgdat->nr_zones - 1;
2952 balanced_classzone_idx = classzone_idx; 2961 balanced_classzone_idx = classzone_idx;
2953 for ( ; ; ) { 2962 for ( ; ; ) {
2954 int ret; 2963 bool ret;
2955 2964
2956 /* 2965 /*
2957 * If the last balance_pgdat was unsuccessful it's unlikely a 2966 * If the last balance_pgdat was unsuccessful it's unlikely a
@@ -3113,13 +3122,13 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
3113 not required for correctness. So if the last cpu in a node goes 3122 not required for correctness. So if the last cpu in a node goes
3114 away, we get changed to run anywhere: as the first one comes back, 3123 away, we get changed to run anywhere: as the first one comes back,
3115 restore their cpu bindings. */ 3124 restore their cpu bindings. */
3116static int __devinit cpu_callback(struct notifier_block *nfb, 3125static int cpu_callback(struct notifier_block *nfb, unsigned long action,
3117 unsigned long action, void *hcpu) 3126 void *hcpu)
3118{ 3127{
3119 int nid; 3128 int nid;
3120 3129
3121 if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN) { 3130 if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN) {
3122 for_each_node_state(nid, N_HIGH_MEMORY) { 3131 for_each_node_state(nid, N_MEMORY) {
3123 pg_data_t *pgdat = NODE_DATA(nid); 3132 pg_data_t *pgdat = NODE_DATA(nid);
3124 const struct cpumask *mask; 3133 const struct cpumask *mask;
3125 3134
@@ -3175,7 +3184,7 @@ static int __init kswapd_init(void)
3175 int nid; 3184 int nid;
3176 3185
3177 swap_setup(); 3186 swap_setup();
3178 for_each_node_state(nid, N_HIGH_MEMORY) 3187 for_each_node_state(nid, N_MEMORY)
3179 kswapd_run(nid); 3188 kswapd_run(nid);
3180 hotcpu_notifier(cpu_callback, 0); 3189 hotcpu_notifier(cpu_callback, 0);
3181 return 0; 3190 return 0;
diff --git a/mm/vmstat.c b/mm/vmstat.c
index c7370579111b..9800306c8195 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -774,10 +774,20 @@ const char * const vmstat_text[] = {
774 774
775 "pgrotated", 775 "pgrotated",
776 776
777#ifdef CONFIG_NUMA_BALANCING
778 "numa_pte_updates",
779 "numa_hint_faults",
780 "numa_hint_faults_local",
781 "numa_pages_migrated",
782#endif
783#ifdef CONFIG_MIGRATION
784 "pgmigrate_success",
785 "pgmigrate_fail",
786#endif
777#ifdef CONFIG_COMPACTION 787#ifdef CONFIG_COMPACTION
778 "compact_blocks_moved", 788 "compact_migrate_scanned",
779 "compact_pages_moved", 789 "compact_free_scanned",
780 "compact_pagemigrate_failed", 790 "compact_isolated",
781 "compact_stall", 791 "compact_stall",
782 "compact_fail", 792 "compact_fail",
783 "compact_success", 793 "compact_success",
@@ -801,6 +811,8 @@ const char * const vmstat_text[] = {
801 "thp_collapse_alloc", 811 "thp_collapse_alloc",
802 "thp_collapse_alloc_failed", 812 "thp_collapse_alloc_failed",
803 "thp_split", 813 "thp_split",
814 "thp_zero_page_alloc",
815 "thp_zero_page_alloc_failed",
804#endif 816#endif
805 817
806#endif /* CONFIG_VM_EVENTS_COUNTERS */ 818#endif /* CONFIG_VM_EVENTS_COUNTERS */
@@ -930,7 +942,7 @@ static int pagetypeinfo_show(struct seq_file *m, void *arg)
930 pg_data_t *pgdat = (pg_data_t *)arg; 942 pg_data_t *pgdat = (pg_data_t *)arg;
931 943
932 /* check memoryless node */ 944 /* check memoryless node */
933 if (!node_state(pgdat->node_id, N_HIGH_MEMORY)) 945 if (!node_state(pgdat->node_id, N_MEMORY))
934 return 0; 946 return 0;
935 947
936 seq_printf(m, "Page block order: %d\n", pageblock_order); 948 seq_printf(m, "Page block order: %d\n", pageblock_order);
@@ -992,14 +1004,16 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
992 "\n high %lu" 1004 "\n high %lu"
993 "\n scanned %lu" 1005 "\n scanned %lu"
994 "\n spanned %lu" 1006 "\n spanned %lu"
995 "\n present %lu", 1007 "\n present %lu"
1008 "\n managed %lu",
996 zone_page_state(zone, NR_FREE_PAGES), 1009 zone_page_state(zone, NR_FREE_PAGES),
997 min_wmark_pages(zone), 1010 min_wmark_pages(zone),
998 low_wmark_pages(zone), 1011 low_wmark_pages(zone),
999 high_wmark_pages(zone), 1012 high_wmark_pages(zone),
1000 zone->pages_scanned, 1013 zone->pages_scanned,
1001 zone->spanned_pages, 1014 zone->spanned_pages,
1002 zone->present_pages); 1015 zone->present_pages,
1016 zone->managed_pages);
1003 1017
1004 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) 1018 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
1005 seq_printf(m, "\n %-12s %lu", vmstat_text[i], 1019 seq_printf(m, "\n %-12s %lu", vmstat_text[i],
@@ -1292,7 +1306,7 @@ static int unusable_show(struct seq_file *m, void *arg)
1292 pg_data_t *pgdat = (pg_data_t *)arg; 1306 pg_data_t *pgdat = (pg_data_t *)arg;
1293 1307
1294 /* check memoryless node */ 1308 /* check memoryless node */
1295 if (!node_state(pgdat->node_id, N_HIGH_MEMORY)) 1309 if (!node_state(pgdat->node_id, N_MEMORY))
1296 return 0; 1310 return 0;
1297 1311
1298 walk_zones_in_node(m, pgdat, unusable_show_print); 1312 walk_zones_in_node(m, pgdat, unusable_show_print);