Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig          |  17
-rw-r--r--  mm/Makefile         |   1
-rw-r--r--  mm/backing-dev.c    |  15
-rw-r--r--  mm/compaction.c     | 605
-rw-r--r--  mm/filemap.c        |  20
-rw-r--r--  mm/highmem.c        |   2
-rw-r--r--  mm/hugetlb.c        |  12
-rw-r--r--  mm/ksm.c            |   4
-rw-r--r--  mm/memcontrol.c     | 691
-rw-r--r--  mm/memory.c         |  13
-rw-r--r--  mm/memory_hotplug.c |  36
-rw-r--r--  mm/mempolicy.c      | 227
-rw-r--r--  mm/migrate.c        |  74
-rw-r--r--  mm/mincore.c        | 263
-rw-r--r--  mm/mlock.c          |  41
-rw-r--r--  mm/msync.c          |   2
-rw-r--r--  mm/nommu.c          |  32
-rw-r--r--  mm/oom_kill.c       |   5
-rw-r--r--  mm/page-writeback.c |  44
-rw-r--r--  mm/page_alloc.c     | 319
-rw-r--r--  mm/percpu-km.c      | 104
-rw-r--r--  mm/percpu-vm.c      | 451
-rw-r--r--  mm/percpu.c         | 585
-rw-r--r--  mm/readahead.c      |   2
-rw-r--r--  mm/rmap.c           |  40
-rw-r--r--  mm/shmem.c          |  95
-rw-r--r--  mm/slab.c           | 249
-rw-r--r--  mm/slob.c           |   8
-rw-r--r--  mm/slub.c           |  52
-rw-r--r--  mm/sparse.c         |   9
-rw-r--r--  mm/swapfile.c       |  14
-rw-r--r--  mm/vmscan.c         | 213
-rw-r--r--  mm/vmstat.c         | 253
33 files changed, 3227 insertions, 1271 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 9c61158308dc..527136b22384 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -172,6 +172,15 @@ config SPLIT_PTLOCK_CPUS
172 default "4" 172 default "4"
173 173
174# 174#
175# support for memory compaction
176config COMPACTION
177 bool "Allow for memory compaction"
178 select MIGRATION
179 depends on EXPERIMENTAL && HUGETLB_PAGE && MMU
180 help
181 Allows the compaction of memory for the allocation of huge pages.
182
183#
175# support for page migration 184# support for page migration
176# 185#
177config MIGRATION 186config MIGRATION
@@ -180,9 +189,11 @@ config MIGRATION
180 depends on NUMA || ARCH_ENABLE_MEMORY_HOTREMOVE 189 depends on NUMA || ARCH_ENABLE_MEMORY_HOTREMOVE
181 help 190 help
182 Allows the migration of the physical location of pages of processes 191 Allows the migration of the physical location of pages of processes
183 while the virtual addresses are not changed. This is useful for 192 while the virtual addresses are not changed. This is useful in
184 example on NUMA systems to put pages nearer to the processors accessing 193 two situations. The first is on NUMA systems to put pages nearer
185 the page. 194 to the processors accessing them. The second is when allocating huge
195 pages as migration can relocate pages to satisfy a huge page
196 allocation instead of reclaiming.
186 197
187config PHYS_ADDR_T_64BIT 198config PHYS_ADDR_T_64BIT
188 def_bool 64BIT || ARCH_PHYS_ADDR_T_64BIT 199 def_bool 64BIT || ARCH_PHYS_ADDR_T_64BIT
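
The new option is consumed like any other mm Kconfig symbol: callers of the compaction entry points are expected to compile down to no-ops when it is disabled. A minimal sketch of such a guard, assuming a header along the lines of include/linux/compaction.h (the stub and its COMPACT_CONTINUE return value are assumptions, not part of this hunk):

#ifdef CONFIG_COMPACTION
extern unsigned long try_to_compact_pages(struct zonelist *zonelist,
			int order, gfp_t gfp_mask, nodemask_t *nodemask);
#else
/* Sketch: with compaction configured out, callers see a no-op that reports
 * "nothing was done" so the page allocator falls back to reclaim. */
static inline unsigned long try_to_compact_pages(struct zonelist *zonelist,
			int order, gfp_t gfp_mask, nodemask_t *nodemask)
{
	return COMPACT_CONTINUE;
}
#endif /* CONFIG_COMPACTION */
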
diff --git a/mm/Makefile b/mm/Makefile
index 6c2a73a54a43..8982504bd03b 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -23,6 +23,7 @@ obj-$(CONFIG_NUMA) += mempolicy.o
23obj-$(CONFIG_SPARSEMEM) += sparse.o 23obj-$(CONFIG_SPARSEMEM) += sparse.o
24obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o 24obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o
25obj-$(CONFIG_SLOB) += slob.o 25obj-$(CONFIG_SLOB) += slob.o
26obj-$(CONFIG_COMPACTION) += compaction.o
26obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o 27obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o
27obj-$(CONFIG_KSM) += ksm.o 28obj-$(CONFIG_KSM) += ksm.o
28obj-$(CONFIG_PAGE_POISONING) += debug-pagealloc.o 29obj-$(CONFIG_PAGE_POISONING) += debug-pagealloc.o
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 707d0dc6da0f..660a87a22511 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -48,7 +48,6 @@ static struct timer_list sync_supers_timer;
48 48
49static int bdi_sync_supers(void *); 49static int bdi_sync_supers(void *);
50static void sync_supers_timer_fn(unsigned long); 50static void sync_supers_timer_fn(unsigned long);
51static void arm_supers_timer(void);
52 51
53static void bdi_add_default_flusher_task(struct backing_dev_info *bdi); 52static void bdi_add_default_flusher_task(struct backing_dev_info *bdi);
54 53
@@ -252,7 +251,7 @@ static int __init default_bdi_init(void)
252 251
253 init_timer(&sync_supers_timer); 252 init_timer(&sync_supers_timer);
254 setup_timer(&sync_supers_timer, sync_supers_timer_fn, 0); 253 setup_timer(&sync_supers_timer, sync_supers_timer_fn, 0);
255 arm_supers_timer(); 254 bdi_arm_supers_timer();
256 255
257 err = bdi_init(&default_backing_dev_info); 256 err = bdi_init(&default_backing_dev_info);
258 if (!err) 257 if (!err)
@@ -374,10 +373,13 @@ static int bdi_sync_supers(void *unused)
374 return 0; 373 return 0;
375} 374}
376 375
377static void arm_supers_timer(void) 376void bdi_arm_supers_timer(void)
378{ 377{
379 unsigned long next; 378 unsigned long next;
380 379
380 if (!dirty_writeback_interval)
381 return;
382
381 next = msecs_to_jiffies(dirty_writeback_interval * 10) + jiffies; 383 next = msecs_to_jiffies(dirty_writeback_interval * 10) + jiffies;
382 mod_timer(&sync_supers_timer, round_jiffies_up(next)); 384 mod_timer(&sync_supers_timer, round_jiffies_up(next));
383} 385}
@@ -385,7 +387,7 @@ static void arm_supers_timer(void)
385static void sync_supers_timer_fn(unsigned long unused) 387static void sync_supers_timer_fn(unsigned long unused)
386{ 388{
387 wake_up_process(sync_supers_tsk); 389 wake_up_process(sync_supers_tsk);
388 arm_supers_timer(); 390 bdi_arm_supers_timer();
389} 391}
390 392
391static int bdi_forker_task(void *ptr) 393static int bdi_forker_task(void *ptr)
@@ -428,7 +430,10 @@ static int bdi_forker_task(void *ptr)
428 430
429 spin_unlock_bh(&bdi_lock); 431 spin_unlock_bh(&bdi_lock);
430 wait = msecs_to_jiffies(dirty_writeback_interval * 10); 432 wait = msecs_to_jiffies(dirty_writeback_interval * 10);
431 schedule_timeout(wait); 433 if (wait)
434 schedule_timeout(wait);
435 else
436 schedule();
432 try_to_freeze(); 437 try_to_freeze();
433 continue; 438 continue;
434 } 439 }
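
Making bdi_arm_supers_timer() non-static and bailing out while dirty_writeback_interval is zero means that writing 0 to the writeback interval now disables the periodic sync_supers timer entirely, so the timer has to be re-armed when a non-zero value is written back. A hedged sketch of the matching sysctl handler (the actual change is in mm/page-writeback.c, which this section only shows in the diffstat):

int dirty_writeback_centisecs_handler(struct ctl_table *table, int write,
	void __user *buffer, size_t *length, loff_t *ppos)
{
	proc_dointvec(table, write, buffer, length, ppos);
	/* Re-arm in case the interval went from 0 back to non-zero;
	 * bdi_arm_supers_timer() itself is a no-op while it is 0. */
	bdi_arm_supers_timer();
	return 0;
}
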
diff --git a/mm/compaction.c b/mm/compaction.c
new file mode 100644
index 000000000000..94cce51b0b35
--- /dev/null
+++ b/mm/compaction.c
@@ -0,0 +1,605 @@
1/*
2 * linux/mm/compaction.c
3 *
4 * Memory compaction for the reduction of external fragmentation. Note that
5 * this heavily depends upon page migration to do all the real heavy
6 * lifting
7 *
8 * Copyright IBM Corp. 2007-2010 Mel Gorman <mel@csn.ul.ie>
9 */
10#include <linux/swap.h>
11#include <linux/migrate.h>
12#include <linux/compaction.h>
13#include <linux/mm_inline.h>
14#include <linux/backing-dev.h>
15#include <linux/sysctl.h>
16#include <linux/sysfs.h>
17#include "internal.h"
18
19/*
20 * compact_control is used to track pages being migrated and the free pages
21 * they are being migrated to during memory compaction. The free_pfn starts
22 * at the end of a zone and migrate_pfn begins at the start. Movable pages
23 * are moved to the end of a zone during a compaction run and the run
24 * completes when free_pfn <= migrate_pfn
25 */
26struct compact_control {
27 struct list_head freepages; /* List of free pages to migrate to */
28 struct list_head migratepages; /* List of pages being migrated */
29 unsigned long nr_freepages; /* Number of isolated free pages */
30 unsigned long nr_migratepages; /* Number of pages to migrate */
31 unsigned long free_pfn; /* isolate_freepages search base */
32 unsigned long migrate_pfn; /* isolate_migratepages search base */
33
34 /* Account for isolated anon and file pages */
35 unsigned long nr_anon;
36 unsigned long nr_file;
37
38 unsigned int order; /* order a direct compactor needs */
39 int migratetype; /* MOVABLE, RECLAIMABLE etc */
40 struct zone *zone;
41};
42
43static unsigned long release_freepages(struct list_head *freelist)
44{
45 struct page *page, *next;
46 unsigned long count = 0;
47
48 list_for_each_entry_safe(page, next, freelist, lru) {
49 list_del(&page->lru);
50 __free_page(page);
51 count++;
52 }
53
54 return count;
55}
56
57/* Isolate free pages onto a private freelist. Must hold zone->lock */
58static unsigned long isolate_freepages_block(struct zone *zone,
59 unsigned long blockpfn,
60 struct list_head *freelist)
61{
62 unsigned long zone_end_pfn, end_pfn;
63 int total_isolated = 0;
64 struct page *cursor;
65
66 /* Get the last PFN we should scan for free pages at */
67 zone_end_pfn = zone->zone_start_pfn + zone->spanned_pages;
68 end_pfn = min(blockpfn + pageblock_nr_pages, zone_end_pfn);
69
70 /* Find the first usable PFN in the block to initialise page cursor */
71 for (; blockpfn < end_pfn; blockpfn++) {
72 if (pfn_valid_within(blockpfn))
73 break;
74 }
75 cursor = pfn_to_page(blockpfn);
76
77 /* Isolate free pages. This assumes the block is valid */
78 for (; blockpfn < end_pfn; blockpfn++, cursor++) {
79 int isolated, i;
80 struct page *page = cursor;
81
82 if (!pfn_valid_within(blockpfn))
83 continue;
84
85 if (!PageBuddy(page))
86 continue;
87
88 /* Found a free page, break it into order-0 pages */
89 isolated = split_free_page(page);
90 total_isolated += isolated;
91 for (i = 0; i < isolated; i++) {
92 list_add(&page->lru, freelist);
93 page++;
94 }
95
96 /* If a page was split, advance to the end of it */
97 if (isolated) {
98 blockpfn += isolated - 1;
99 cursor += isolated - 1;
100 }
101 }
102
103 return total_isolated;
104}
105
106/* Returns true if the page is within a block suitable for migration to */
107static bool suitable_migration_target(struct page *page)
108{
109
110 int migratetype = get_pageblock_migratetype(page);
111
112 /* Don't interfere with memory hot-remove or the min_free_kbytes blocks */
113 if (migratetype == MIGRATE_ISOLATE || migratetype == MIGRATE_RESERVE)
114 return false;
115
116 /* If the page is a large free page, then allow migration */
117 if (PageBuddy(page) && page_order(page) >= pageblock_order)
118 return true;
119
120 /* If the block is MIGRATE_MOVABLE, allow migration */
121 if (migratetype == MIGRATE_MOVABLE)
122 return true;
123
124 /* Otherwise skip the block */
125 return false;
126}
127
128/*
129 * Based on information in the current compact_control, find blocks
130 * suitable for isolating free pages from and then isolate them.
131 */
132static void isolate_freepages(struct zone *zone,
133 struct compact_control *cc)
134{
135 struct page *page;
136 unsigned long high_pfn, low_pfn, pfn;
137 unsigned long flags;
138 int nr_freepages = cc->nr_freepages;
139 struct list_head *freelist = &cc->freepages;
140
141 pfn = cc->free_pfn;
142 low_pfn = cc->migrate_pfn + pageblock_nr_pages;
143 high_pfn = low_pfn;
144
145 /*
146 * Isolate free pages until enough are available to migrate the
147 * pages on cc->migratepages. We stop searching if the migrate
148 * and free page scanners meet or enough free pages are isolated.
149 */
150 spin_lock_irqsave(&zone->lock, flags);
151 for (; pfn > low_pfn && cc->nr_migratepages > nr_freepages;
152 pfn -= pageblock_nr_pages) {
153 unsigned long isolated;
154
155 if (!pfn_valid(pfn))
156 continue;
157
158 /*
159 * Check for overlapping nodes/zones. It's possible on some
160 * configurations to have a setup like
161 * node0 node1 node0
162 * i.e. it's possible that all pages within a zone's range of
163 * pages do not belong to a single zone.
164 */
165 page = pfn_to_page(pfn);
166 if (page_zone(page) != zone)
167 continue;
168
169 /* Check the block is suitable for migration */
170 if (!suitable_migration_target(page))
171 continue;
172
173 /* Found a block suitable for isolating free pages from */
174 isolated = isolate_freepages_block(zone, pfn, freelist);
175 nr_freepages += isolated;
176
177 /*
178 * Record the highest PFN we isolated pages from. When next
179 * looking for free pages, the search will restart here as
180 * page migration may have returned some pages to the allocator
181 */
182 if (isolated)
183 high_pfn = max(high_pfn, pfn);
184 }
185 spin_unlock_irqrestore(&zone->lock, flags);
186
187 /* split_free_page does not map the pages */
188 list_for_each_entry(page, freelist, lru) {
189 arch_alloc_page(page, 0);
190 kernel_map_pages(page, 1, 1);
191 }
192
193 cc->free_pfn = high_pfn;
194 cc->nr_freepages = nr_freepages;
195}
196
197/* Update the number of anon and file isolated pages in the zone */
198static void acct_isolated(struct zone *zone, struct compact_control *cc)
199{
200 struct page *page;
201 unsigned int count[NR_LRU_LISTS] = { 0, };
202
203 list_for_each_entry(page, &cc->migratepages, lru) {
204 int lru = page_lru_base_type(page);
205 count[lru]++;
206 }
207
208 cc->nr_anon = count[LRU_ACTIVE_ANON] + count[LRU_INACTIVE_ANON];
209 cc->nr_file = count[LRU_ACTIVE_FILE] + count[LRU_INACTIVE_FILE];
210 __mod_zone_page_state(zone, NR_ISOLATED_ANON, cc->nr_anon);
211 __mod_zone_page_state(zone, NR_ISOLATED_FILE, cc->nr_file);
212}
213
214/* Similar to reclaim, but different enough that they don't share logic */
215static bool too_many_isolated(struct zone *zone)
216{
217
218 unsigned long inactive, isolated;
219
220 inactive = zone_page_state(zone, NR_INACTIVE_FILE) +
221 zone_page_state(zone, NR_INACTIVE_ANON);
222 isolated = zone_page_state(zone, NR_ISOLATED_FILE) +
223 zone_page_state(zone, NR_ISOLATED_ANON);
224
225 return isolated > inactive;
226}
227
228/*
229 * Isolate all pages that can be migrated from the block pointed to by
230 * the migrate scanner within compact_control.
231 */
232static unsigned long isolate_migratepages(struct zone *zone,
233 struct compact_control *cc)
234{
235 unsigned long low_pfn, end_pfn;
236 struct list_head *migratelist = &cc->migratepages;
237
238 /* Do not scan outside zone boundaries */
239 low_pfn = max(cc->migrate_pfn, zone->zone_start_pfn);
240
241 /* Only scan within a pageblock boundary */
242 end_pfn = ALIGN(low_pfn + pageblock_nr_pages, pageblock_nr_pages);
243
244 /* Do not cross the free scanner or scan within a memory hole */
245 if (end_pfn > cc->free_pfn || !pfn_valid(low_pfn)) {
246 cc->migrate_pfn = end_pfn;
247 return 0;
248 }
249
250 /*
251 * Ensure that there are not too many pages isolated from the LRU
252 * list by either parallel reclaimers or compaction. If there are,
253 * delay for some time until fewer pages are isolated
254 */
255 while (unlikely(too_many_isolated(zone))) {
256 congestion_wait(BLK_RW_ASYNC, HZ/10);
257
258 if (fatal_signal_pending(current))
259 return 0;
260 }
261
262 /* Time to isolate some pages for migration */
263 spin_lock_irq(&zone->lru_lock);
264 for (; low_pfn < end_pfn; low_pfn++) {
265 struct page *page;
266 if (!pfn_valid_within(low_pfn))
267 continue;
268
269 /* Get the page and skip if free */
270 page = pfn_to_page(low_pfn);
271 if (PageBuddy(page))
272 continue;
273
274 /* Try to isolate the page */
275 if (__isolate_lru_page(page, ISOLATE_BOTH, 0) != 0)
276 continue;
277
278 /* Successfully isolated */
279 del_page_from_lru_list(zone, page, page_lru(page));
280 list_add(&page->lru, migratelist);
281 mem_cgroup_del_lru(page);
282 cc->nr_migratepages++;
283
284 /* Avoid isolating too much */
285 if (cc->nr_migratepages == COMPACT_CLUSTER_MAX)
286 break;
287 }
288
289 acct_isolated(zone, cc);
290
291 spin_unlock_irq(&zone->lru_lock);
292 cc->migrate_pfn = low_pfn;
293
294 return cc->nr_migratepages;
295}
296
297/*
298 * This is a migrate-callback that "allocates" freepages by taking pages
299 * from the isolated freelists in the block we are migrating to.
300 */
301static struct page *compaction_alloc(struct page *migratepage,
302 unsigned long data,
303 int **result)
304{
305 struct compact_control *cc = (struct compact_control *)data;
306 struct page *freepage;
307
308 /* Isolate free pages if necessary */
309 if (list_empty(&cc->freepages)) {
310 isolate_freepages(cc->zone, cc);
311
312 if (list_empty(&cc->freepages))
313 return NULL;
314 }
315
316 freepage = list_entry(cc->freepages.next, struct page, lru);
317 list_del(&freepage->lru);
318 cc->nr_freepages--;
319
320 return freepage;
321}
322
323/*
324 * We cannot control nr_migratepages and nr_freepages fully when migration is
325 * running as migrate_pages() has no knowledge of compact_control. When
326 * migration is complete, we count the number of pages on the lists by hand.
327 */
328static void update_nr_listpages(struct compact_control *cc)
329{
330 int nr_migratepages = 0;
331 int nr_freepages = 0;
332 struct page *page;
333
334 list_for_each_entry(page, &cc->migratepages, lru)
335 nr_migratepages++;
336 list_for_each_entry(page, &cc->freepages, lru)
337 nr_freepages++;
338
339 cc->nr_migratepages = nr_migratepages;
340 cc->nr_freepages = nr_freepages;
341}
342
343static int compact_finished(struct zone *zone,
344 struct compact_control *cc)
345{
346 unsigned int order;
347 unsigned long watermark = low_wmark_pages(zone) + (1 << cc->order);
348
349 if (fatal_signal_pending(current))
350 return COMPACT_PARTIAL;
351
352 /* Compaction run completes if the migrate and free scanner meet */
353 if (cc->free_pfn <= cc->migrate_pfn)
354 return COMPACT_COMPLETE;
355
356 /* Compaction run is not finished if the watermark is not met */
357 if (!zone_watermark_ok(zone, cc->order, watermark, 0, 0))
358 return COMPACT_CONTINUE;
359
360 if (cc->order == -1)
361 return COMPACT_CONTINUE;
362
363 /* Direct compactor: Is a suitable page free? */
364 for (order = cc->order; order < MAX_ORDER; order++) {
365 /* Job done if page is free of the right migratetype */
366 if (!list_empty(&zone->free_area[order].free_list[cc->migratetype]))
367 return COMPACT_PARTIAL;
368
369 /* Job done if allocation would set block type */
370 if (order >= pageblock_order && zone->free_area[order].nr_free)
371 return COMPACT_PARTIAL;
372 }
373
374 return COMPACT_CONTINUE;
375}
376
377static int compact_zone(struct zone *zone, struct compact_control *cc)
378{
379 int ret;
380
381 /* Setup to move all movable pages to the end of the zone */
382 cc->migrate_pfn = zone->zone_start_pfn;
383 cc->free_pfn = cc->migrate_pfn + zone->spanned_pages;
384 cc->free_pfn &= ~(pageblock_nr_pages-1);
385
386 migrate_prep_local();
387
388 while ((ret = compact_finished(zone, cc)) == COMPACT_CONTINUE) {
389 unsigned long nr_migrate, nr_remaining;
390
391 if (!isolate_migratepages(zone, cc))
392 continue;
393
394 nr_migrate = cc->nr_migratepages;
395 migrate_pages(&cc->migratepages, compaction_alloc,
396 (unsigned long)cc, 0);
397 update_nr_listpages(cc);
398 nr_remaining = cc->nr_migratepages;
399
400 count_vm_event(COMPACTBLOCKS);
401 count_vm_events(COMPACTPAGES, nr_migrate - nr_remaining);
402 if (nr_remaining)
403 count_vm_events(COMPACTPAGEFAILED, nr_remaining);
404
405 /* Release LRU pages not migrated */
406 if (!list_empty(&cc->migratepages)) {
407 putback_lru_pages(&cc->migratepages);
408 cc->nr_migratepages = 0;
409 }
410
411 }
412
413 /* Release free pages and check accounting */
414 cc->nr_freepages -= release_freepages(&cc->freepages);
415 VM_BUG_ON(cc->nr_freepages != 0);
416
417 return ret;
418}
419
420static unsigned long compact_zone_order(struct zone *zone,
421 int order, gfp_t gfp_mask)
422{
423 struct compact_control cc = {
424 .nr_freepages = 0,
425 .nr_migratepages = 0,
426 .order = order,
427 .migratetype = allocflags_to_migratetype(gfp_mask),
428 .zone = zone,
429 };
430 INIT_LIST_HEAD(&cc.freepages);
431 INIT_LIST_HEAD(&cc.migratepages);
432
433 return compact_zone(zone, &cc);
434}
435
436int sysctl_extfrag_threshold = 500;
437
438/**
439 * try_to_compact_pages - Direct compact to satisfy a high-order allocation
440 * @zonelist: The zonelist used for the current allocation
441 * @order: The order of the current allocation
442 * @gfp_mask: The GFP mask of the current allocation
443 * @nodemask: The allowed nodes to allocate from
444 *
445 * This is the main entry point for direct page compaction.
446 */
447unsigned long try_to_compact_pages(struct zonelist *zonelist,
448 int order, gfp_t gfp_mask, nodemask_t *nodemask)
449{
450 enum zone_type high_zoneidx = gfp_zone(gfp_mask);
451 int may_enter_fs = gfp_mask & __GFP_FS;
452 int may_perform_io = gfp_mask & __GFP_IO;
453 unsigned long watermark;
454 struct zoneref *z;
455 struct zone *zone;
456 int rc = COMPACT_SKIPPED;
457
458 /*
459 * Check whether it is worth even starting compaction. The order check is
460 * made because an assumption is made that the page allocator can satisfy
461 * the "cheaper" orders without taking special steps
462 */
463 if (order <= PAGE_ALLOC_COSTLY_ORDER || !may_enter_fs || !may_perform_io)
464 return rc;
465
466 count_vm_event(COMPACTSTALL);
467
468 /* Compact each zone in the list */
469 for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx,
470 nodemask) {
471 int fragindex;
472 int status;
473
474 /*
475 * Watermarks for order-0 must be met for compaction. Note
476 * the 2UL. This is because during migration, copies of
477 * pages need to be allocated and for a short time, the
478 * footprint is higher
479 */
480 watermark = low_wmark_pages(zone) + (2UL << order);
481 if (!zone_watermark_ok(zone, 0, watermark, 0, 0))
482 continue;
483
484 /*
485 * fragmentation index determines if allocation failures are
486 * due to low memory or external fragmentation
487 *
488 * index of -1 implies allocations might succeed depending
489 * on watermarks
490 * index towards 0 implies failure is due to lack of memory
491 * index towards 1000 implies failure is due to fragmentation
492 *
493 * Only compact if a failure would be due to fragmentation.
494 */
495 fragindex = fragmentation_index(zone, order);
496 if (fragindex >= 0 && fragindex <= sysctl_extfrag_threshold)
497 continue;
498
499 if (fragindex == -1 && zone_watermark_ok(zone, order, watermark, 0, 0)) {
500 rc = COMPACT_PARTIAL;
501 break;
502 }
503
504 status = compact_zone_order(zone, order, gfp_mask);
505 rc = max(status, rc);
506
507 if (zone_watermark_ok(zone, order, watermark, 0, 0))
508 break;
509 }
510
511 return rc;
512}
513
514
515/* Compact all zones within a node */
516static int compact_node(int nid)
517{
518 int zoneid;
519 pg_data_t *pgdat;
520 struct zone *zone;
521
522 if (nid < 0 || nid >= nr_node_ids || !node_online(nid))
523 return -EINVAL;
524 pgdat = NODE_DATA(nid);
525
526 /* Flush pending updates to the LRU lists */
527 lru_add_drain_all();
528
529 for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) {
530 struct compact_control cc = {
531 .nr_freepages = 0,
532 .nr_migratepages = 0,
533 .order = -1,
534 };
535
536 zone = &pgdat->node_zones[zoneid];
537 if (!populated_zone(zone))
538 continue;
539
540 cc.zone = zone;
541 INIT_LIST_HEAD(&cc.freepages);
542 INIT_LIST_HEAD(&cc.migratepages);
543
544 compact_zone(zone, &cc);
545
546 VM_BUG_ON(!list_empty(&cc.freepages));
547 VM_BUG_ON(!list_empty(&cc.migratepages));
548 }
549
550 return 0;
551}
552
553/* Compact all nodes in the system */
554static int compact_nodes(void)
555{
556 int nid;
557
558 for_each_online_node(nid)
559 compact_node(nid);
560
561 return COMPACT_COMPLETE;
562}
563
564/* The written value is actually unused, all memory is compacted */
565int sysctl_compact_memory;
566
567/* This is the entry point for compacting all nodes via /proc/sys/vm */
568int sysctl_compaction_handler(struct ctl_table *table, int write,
569 void __user *buffer, size_t *length, loff_t *ppos)
570{
571 if (write)
572 return compact_nodes();
573
574 return 0;
575}
576
577int sysctl_extfrag_handler(struct ctl_table *table, int write,
578 void __user *buffer, size_t *length, loff_t *ppos)
579{
580 proc_dointvec_minmax(table, write, buffer, length, ppos);
581
582 return 0;
583}
584
585#if defined(CONFIG_SYSFS) && defined(CONFIG_NUMA)
586ssize_t sysfs_compact_node(struct sys_device *dev,
587 struct sysdev_attribute *attr,
588 const char *buf, size_t count)
589{
590 compact_node(dev->id);
591
592 return count;
593}
594static SYSDEV_ATTR(compact, S_IWUSR, NULL, sysfs_compact_node);
595
596int compaction_register_node(struct node *node)
597{
598 return sysdev_create_file(&node->sysdev, &attr_compact);
599}
600
601void compaction_unregister_node(struct node *node)
602{
603 return sysdev_remove_file(&node->sysdev, &attr_compact);
604}
605#endif /* CONFIG_SYSFS && CONFIG_NUMA */
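
Taken together, the new file wires compaction up to three triggers: direct compaction from the page allocator via try_to_compact_pages(), a global /proc/sys/vm knob backed by sysctl_compaction_handler(), and a per-node sysfs "compact" attribute on NUMA builds. A small userspace sketch of poking the global knob; the /proc/sys/vm/compact_memory path follows the usual vm sysctl naming and is an assumption here, since the sysctl table entry itself is added outside mm/:

#include <stdio.h>

int main(void)
{
	/* Any write compacts all online nodes; the written value is ignored
	 * (see the comment above sysctl_compact_memory). */
	FILE *f = fopen("/proc/sys/vm/compact_memory", "w");

	if (!f) {
		perror("compact_memory");
		return 1;
	}
	fputs("1\n", f);
	return fclose(f) ? 1 : 0;
}

The per-node equivalent on a NUMA system is a write to the node's "compact" sysfs file registered by compaction_register_node() above.
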
diff --git a/mm/filemap.c b/mm/filemap.c
index 829ac9cdbd70..45a2d18df849 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -441,7 +441,7 @@ int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
441 /* 441 /*
442 * Splice_read and readahead add shmem/tmpfs pages into the page cache 442 * Splice_read and readahead add shmem/tmpfs pages into the page cache
443 * before shmem_readpage has a chance to mark them as SwapBacked: they 443 * before shmem_readpage has a chance to mark them as SwapBacked: they
444 * need to go on the active_anon lru below, and mem_cgroup_cache_charge 444 * need to go on the anon lru below, and mem_cgroup_cache_charge
445 * (called in add_to_page_cache) needs to know where they're going too. 445 * (called in add_to_page_cache) needs to know where they're going too.
446 */ 446 */
447 if (mapping_cap_swap_backed(mapping)) 447 if (mapping_cap_swap_backed(mapping))
@@ -452,7 +452,7 @@ int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
452 if (page_is_file_cache(page)) 452 if (page_is_file_cache(page))
453 lru_cache_add_file(page); 453 lru_cache_add_file(page);
454 else 454 else
455 lru_cache_add_active_anon(page); 455 lru_cache_add_anon(page);
456 } 456 }
457 return ret; 457 return ret;
458} 458}
@@ -461,9 +461,15 @@ EXPORT_SYMBOL_GPL(add_to_page_cache_lru);
461#ifdef CONFIG_NUMA 461#ifdef CONFIG_NUMA
462struct page *__page_cache_alloc(gfp_t gfp) 462struct page *__page_cache_alloc(gfp_t gfp)
463{ 463{
464 int n;
465 struct page *page;
466
464 if (cpuset_do_page_mem_spread()) { 467 if (cpuset_do_page_mem_spread()) {
465 int n = cpuset_mem_spread_node(); 468 get_mems_allowed();
466 return alloc_pages_exact_node(n, gfp, 0); 469 n = cpuset_mem_spread_node();
470 page = alloc_pages_exact_node(n, gfp, 0);
471 put_mems_allowed();
472 return page;
467 } 473 }
468 return alloc_pages(gfp, 0); 474 return alloc_pages(gfp, 0);
469} 475}
@@ -1099,6 +1105,12 @@ page_not_up_to_date_locked:
1099 } 1105 }
1100 1106
1101readpage: 1107readpage:
1108 /*
1109 * A previous I/O error may have been due to temporary
1110 * failures, e.g. multipath errors.
1111 * PG_error will be set again if readpage fails.
1112 */
1113 ClearPageError(page);
1102 /* Start the actual read. The read will unlock the page. */ 1114 /* Start the actual read. The read will unlock the page. */
1103 error = mapping->a_ops->readpage(filp, page); 1115 error = mapping->a_ops->readpage(filp, page);
1104 1116
diff --git a/mm/highmem.c b/mm/highmem.c
index bed8a8bfd01f..66baa20f78f5 100644
--- a/mm/highmem.c
+++ b/mm/highmem.c
@@ -422,7 +422,7 @@ void __init page_address_init(void)
422 422
423#endif /* defined(CONFIG_HIGHMEM) && !defined(WANT_PAGE_VIRTUAL) */ 423#endif /* defined(CONFIG_HIGHMEM) && !defined(WANT_PAGE_VIRTUAL) */
424 424
425#if defined(CONFIG_DEBUG_HIGHMEM) && defined(CONFIG_TRACE_IRQFLAGS_SUPPORT) 425#ifdef CONFIG_DEBUG_HIGHMEM
426 426
427void debug_kmap_atomic(enum km_type type) 427void debug_kmap_atomic(enum km_type type)
428{ 428{
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 4c9e6bbf3772..54d42b009dbe 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -465,11 +465,13 @@ static struct page *dequeue_huge_page_vma(struct hstate *h,
465 struct page *page = NULL; 465 struct page *page = NULL;
466 struct mempolicy *mpol; 466 struct mempolicy *mpol;
467 nodemask_t *nodemask; 467 nodemask_t *nodemask;
468 struct zonelist *zonelist = huge_zonelist(vma, address, 468 struct zonelist *zonelist;
469 htlb_alloc_mask, &mpol, &nodemask);
470 struct zone *zone; 469 struct zone *zone;
471 struct zoneref *z; 470 struct zoneref *z;
472 471
472 get_mems_allowed();
473 zonelist = huge_zonelist(vma, address,
474 htlb_alloc_mask, &mpol, &nodemask);
473 /* 475 /*
474 * A child process with MAP_PRIVATE mappings created by their parent 476 * A child process with MAP_PRIVATE mappings created by their parent
475 * have no page reserves. This check ensures that reservations are 477 * have no page reserves. This check ensures that reservations are
@@ -477,11 +479,11 @@ static struct page *dequeue_huge_page_vma(struct hstate *h,
477 */ 479 */
478 if (!vma_has_reserves(vma) && 480 if (!vma_has_reserves(vma) &&
479 h->free_huge_pages - h->resv_huge_pages == 0) 481 h->free_huge_pages - h->resv_huge_pages == 0)
480 return NULL; 482 goto err;
481 483
482 /* If reserves cannot be used, ensure enough pages are in the pool */ 484 /* If reserves cannot be used, ensure enough pages are in the pool */
483 if (avoid_reserve && h->free_huge_pages - h->resv_huge_pages == 0) 485 if (avoid_reserve && h->free_huge_pages - h->resv_huge_pages == 0)
484 return NULL; 486 goto err;
485 487
486 for_each_zone_zonelist_nodemask(zone, z, zonelist, 488 for_each_zone_zonelist_nodemask(zone, z, zonelist,
487 MAX_NR_ZONES - 1, nodemask) { 489 MAX_NR_ZONES - 1, nodemask) {
@@ -500,7 +502,9 @@ static struct page *dequeue_huge_page_vma(struct hstate *h,
500 break; 502 break;
501 } 503 }
502 } 504 }
505err:
503 mpol_cond_put(mpol); 506 mpol_cond_put(mpol);
507 put_mems_allowed();
504 return page; 508 return page;
505} 509}
506 510
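
The filemap.c and hugetlb.c hunks above follow the same convention: any allocation path that derives a node or zonelist from the cpuset's mems_allowed brackets that use in get_mems_allowed()/put_mems_allowed(), so a concurrent cpuset rebind cannot be observed half-applied. A condensed sketch of the pattern (an illustration, not a new kernel function):

static struct page *alloc_from_allowed_nodes(gfp_t gfp, unsigned int order)
{
	struct page *page;
	int nid;

	get_mems_allowed();
	nid = cpuset_mem_spread_node();		/* reads current's mems_allowed */
	page = alloc_pages_exact_node(nid, gfp, order);
	put_mems_allowed();			/* nodemask no longer referenced */

	return page;
}
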
diff --git a/mm/ksm.c b/mm/ksm.c
index 956880f2ff49..6c3e99b4ae7c 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -318,14 +318,14 @@ static void hold_anon_vma(struct rmap_item *rmap_item,
318 struct anon_vma *anon_vma) 318 struct anon_vma *anon_vma)
319{ 319{
320 rmap_item->anon_vma = anon_vma; 320 rmap_item->anon_vma = anon_vma;
321 atomic_inc(&anon_vma->ksm_refcount); 321 atomic_inc(&anon_vma->external_refcount);
322} 322}
323 323
324static void drop_anon_vma(struct rmap_item *rmap_item) 324static void drop_anon_vma(struct rmap_item *rmap_item)
325{ 325{
326 struct anon_vma *anon_vma = rmap_item->anon_vma; 326 struct anon_vma *anon_vma = rmap_item->anon_vma;
327 327
328 if (atomic_dec_and_lock(&anon_vma->ksm_refcount, &anon_vma->lock)) { 328 if (atomic_dec_and_lock(&anon_vma->external_refcount, &anon_vma->lock)) {
329 int empty = list_empty(&anon_vma->head); 329 int empty = list_empty(&anon_vma->head);
330 spin_unlock(&anon_vma->lock); 330 spin_unlock(&anon_vma->lock);
331 if (empty) 331 if (empty)
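
Renaming ksm_refcount to external_refcount signals that KSM is no longer the only external pinner of an anon_vma: page migration (see the mm/migrate.c entry in the diffstat) can take the same reference so the anon_vma cannot be freed while a page is unmapped, copied and remapped. A hedged sketch of the migration-side pairing, mirroring the ksm.c helpers above (the function names are illustrative):

static void migration_hold_anon_vma(struct anon_vma *anon_vma)
{
	atomic_inc(&anon_vma->external_refcount);
}

static void migration_drop_anon_vma(struct anon_vma *anon_vma)
{
	/* Same dec-and-test-under-lock dance as drop_anon_vma() in ksm.c:
	 * the last external reference frees the anon_vma if its VMA list
	 * is already empty. */
	if (atomic_dec_and_lock(&anon_vma->external_refcount, &anon_vma->lock)) {
		int empty = list_empty(&anon_vma->head);
		spin_unlock(&anon_vma->lock);
		if (empty)
			anon_vma_free(anon_vma);
	}
}
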
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 8a79a6f0f029..c6ece0a57595 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -149,16 +149,35 @@ struct mem_cgroup_threshold {
149 u64 threshold; 149 u64 threshold;
150}; 150};
151 151
152/* For threshold */
152struct mem_cgroup_threshold_ary { 153struct mem_cgroup_threshold_ary {
153 /* An array index points to threshold just below usage. */ 154 /* An array index points to threshold just below usage. */
154 atomic_t current_threshold; 155 int current_threshold;
155 /* Size of entries[] */ 156 /* Size of entries[] */
156 unsigned int size; 157 unsigned int size;
157 /* Array of thresholds */ 158 /* Array of thresholds */
158 struct mem_cgroup_threshold entries[0]; 159 struct mem_cgroup_threshold entries[0];
159}; 160};
160 161
162struct mem_cgroup_thresholds {
163 /* Primary thresholds array */
164 struct mem_cgroup_threshold_ary *primary;
165 /*
166 * Spare threshold array.
167 * This is needed to make mem_cgroup_unregister_event() "never fail".
168 * It must be able to store at least primary->size - 1 entries.
169 */
170 struct mem_cgroup_threshold_ary *spare;
171};
172
173/* for OOM */
174struct mem_cgroup_eventfd_list {
175 struct list_head list;
176 struct eventfd_ctx *eventfd;
177};
178
161static void mem_cgroup_threshold(struct mem_cgroup *mem); 179static void mem_cgroup_threshold(struct mem_cgroup *mem);
180static void mem_cgroup_oom_notify(struct mem_cgroup *mem);
162 181
163/* 182/*
164 * The memory controller data structure. The memory controller controls both 183 * The memory controller data structure. The memory controller controls both
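
The point of splitting each threshold set into a primary and a spare array is that unregistering an event no longer has to allocate: the handler rebuilds the table into the preallocated spare (guaranteed to hold at least primary->size - 1 entries) and swaps the two pointers, so the unregister path cannot fail with -ENOMEM. A userspace-style sketch of that idea, kept separate from the kernel code itself:

#include <stddef.h>

struct threshold_ary {
	size_t size;
	unsigned long long entries[];
};

struct thresholds {
	struct threshold_ary *primary;	/* published to readers */
	struct threshold_ary *spare;	/* preallocated scratch array */
};

/* Drop entries[victim]; never allocates, so it cannot fail. */
static void unregister_threshold(struct thresholds *t, size_t victim)
{
	struct threshold_ary *old = t->primary, *new = t->spare;
	size_t i, j = 0;

	for (i = 0; i < old->size; i++)
		if (i != victim)
			new->entries[j++] = old->entries[i];
	new->size = j;

	t->primary = new;	/* the kernel publishes this with rcu_assign_pointer() */
	t->spare = old;		/* the old primary becomes the next spare */
}
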
@@ -207,6 +226,8 @@ struct mem_cgroup {
207 atomic_t refcnt; 226 atomic_t refcnt;
208 227
209 unsigned int swappiness; 228 unsigned int swappiness;
229 /* OOM-Killer disable */
230 int oom_kill_disable;
210 231
211 /* set when res.limit == memsw.limit */ 232 /* set when res.limit == memsw.limit */
212 bool memsw_is_minimum; 233 bool memsw_is_minimum;
@@ -215,17 +236,19 @@ struct mem_cgroup {
215 struct mutex thresholds_lock; 236 struct mutex thresholds_lock;
216 237
217 /* thresholds for memory usage. RCU-protected */ 238 /* thresholds for memory usage. RCU-protected */
218 struct mem_cgroup_threshold_ary *thresholds; 239 struct mem_cgroup_thresholds thresholds;
219 240
220 /* thresholds for mem+swap usage. RCU-protected */ 241 /* thresholds for mem+swap usage. RCU-protected */
221 struct mem_cgroup_threshold_ary *memsw_thresholds; 242 struct mem_cgroup_thresholds memsw_thresholds;
243
244 /* For oom notifier event fd */
245 struct list_head oom_notify;
222 246
223 /* 247 /*
224 * Should we move charges of a task when a task is moved into this 248 * Should we move charges of a task when a task is moved into this
225 * mem_cgroup ? And what type of charges should we move ? 249 * mem_cgroup ? And what type of charges should we move ?
226 */ 250 */
227 unsigned long move_charge_at_immigrate; 251 unsigned long move_charge_at_immigrate;
228
229 /* 252 /*
230 * percpu counter. 253 * percpu counter.
231 */ 254 */
@@ -239,6 +262,7 @@ struct mem_cgroup {
239 */ 262 */
240enum move_type { 263enum move_type {
241 MOVE_CHARGE_TYPE_ANON, /* private anonymous page and swap of it */ 264 MOVE_CHARGE_TYPE_ANON, /* private anonymous page and swap of it */
265 MOVE_CHARGE_TYPE_FILE, /* file page (including tmpfs) and swap of it */
242 NR_MOVE_TYPE, 266 NR_MOVE_TYPE,
243}; 267};
244 268
@@ -255,6 +279,18 @@ static struct move_charge_struct {
255 .waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq), 279 .waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq),
256}; 280};
257 281
282static bool move_anon(void)
283{
284 return test_bit(MOVE_CHARGE_TYPE_ANON,
285 &mc.to->move_charge_at_immigrate);
286}
287
288static bool move_file(void)
289{
290 return test_bit(MOVE_CHARGE_TYPE_FILE,
291 &mc.to->move_charge_at_immigrate);
292}
293
258/* 294/*
259 * Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft 295 * Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft
260 * limit reclaim to prevent infinite loops, if they ever occur. 296 * limit reclaim to prevent infinite loops, if they ever occur.
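
With MOVE_CHARGE_TYPE_FILE added next to MOVE_CHARGE_TYPE_ANON, memory.move_charge_at_immigrate becomes a two-bit mask, and the move_anon()/move_file() helpers above test the destination group's setting. A hedged userspace sketch of opting a group into moving both charge types; the cgroupfs mount point and group name are assumptions:

#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/sys/fs/cgroup/memory/test/memory.move_charge_at_immigrate", "w");

	if (!f)
		return 1;
	/* bit 0 = anon pages (and their swap), bit 1 = file/tmpfs pages */
	fputs("3\n", f);
	return fclose(f) ? 1 : 0;
}
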
@@ -282,9 +318,12 @@ enum charge_type {
282/* for encoding cft->private value on file */ 318/* for encoding cft->private value on file */
283#define _MEM (0) 319#define _MEM (0)
284#define _MEMSWAP (1) 320#define _MEMSWAP (1)
321#define _OOM_TYPE (2)
285#define MEMFILE_PRIVATE(x, val) (((x) << 16) | (val)) 322#define MEMFILE_PRIVATE(x, val) (((x) << 16) | (val))
286#define MEMFILE_TYPE(val) (((val) >> 16) & 0xffff) 323#define MEMFILE_TYPE(val) (((val) >> 16) & 0xffff)
287#define MEMFILE_ATTR(val) ((val) & 0xffff) 324#define MEMFILE_ATTR(val) ((val) & 0xffff)
325/* Used for OOM notifier */
326#define OOM_CONTROL (0)
288 327
289/* 328/*
290 * Reclaim flags for mem_cgroup_hierarchical_reclaim 329 * Reclaim flags for mem_cgroup_hierarchical_reclaim
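
_OOM_TYPE and OOM_CONTROL extend the cft->private encoding so the OOM notifier can reuse the same eventfd plumbing as the usage thresholds. From userspace the flow is the usual cgroup v1 event registration: open memory.oom_control, create an eventfd, and write both descriptors to cgroup.event_control. A sketch of that flow; the cgroupfs paths are assumptions about the local mount:

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/eventfd.h>
#include <unistd.h>

int main(void)
{
	int efd = eventfd(0, 0);
	int ofd = open("/sys/fs/cgroup/memory/test/memory.oom_control", O_RDONLY);
	int cfd = open("/sys/fs/cgroup/memory/test/cgroup.event_control", O_WRONLY);
	char cmd[64];
	uint64_t events;

	if (efd < 0 || ofd < 0 || cfd < 0)
		return 1;

	snprintf(cmd, sizeof(cmd), "%d %d", efd, ofd);
	if (write(cfd, cmd, strlen(cmd)) < 0)	/* arm the OOM notification */
		return 1;

	read(efd, &events, sizeof(events));	/* blocks until the group hits OOM */
	printf("memcg OOM events: %llu\n", (unsigned long long)events);
	return 0;
}
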
@@ -1293,14 +1332,62 @@ static void mem_cgroup_oom_unlock(struct mem_cgroup *mem)
1293static DEFINE_MUTEX(memcg_oom_mutex); 1332static DEFINE_MUTEX(memcg_oom_mutex);
1294static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq); 1333static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);
1295 1334
1335struct oom_wait_info {
1336 struct mem_cgroup *mem;
1337 wait_queue_t wait;
1338};
1339
1340static int memcg_oom_wake_function(wait_queue_t *wait,
1341 unsigned mode, int sync, void *arg)
1342{
1343 struct mem_cgroup *wake_mem = (struct mem_cgroup *)arg;
1344 struct oom_wait_info *oom_wait_info;
1345
1346 oom_wait_info = container_of(wait, struct oom_wait_info, wait);
1347
1348 if (oom_wait_info->mem == wake_mem)
1349 goto wakeup;
1350 /* if no hierarchy, no match */
1351 if (!oom_wait_info->mem->use_hierarchy || !wake_mem->use_hierarchy)
1352 return 0;
1353 /*
1354 * Both of oom_wait_info->mem and wake_mem are stable under us.
1355 * Then we can use css_is_ancestor without taking care of RCU.
1356 */
1357 if (!css_is_ancestor(&oom_wait_info->mem->css, &wake_mem->css) &&
1358 !css_is_ancestor(&wake_mem->css, &oom_wait_info->mem->css))
1359 return 0;
1360
1361wakeup:
1362 return autoremove_wake_function(wait, mode, sync, arg);
1363}
1364
1365static void memcg_wakeup_oom(struct mem_cgroup *mem)
1366{
1367 /* for filtering, pass "mem" as argument. */
1368 __wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, mem);
1369}
1370
1371static void memcg_oom_recover(struct mem_cgroup *mem)
1372{
1373 if (mem->oom_kill_disable && atomic_read(&mem->oom_lock))
1374 memcg_wakeup_oom(mem);
1375}
1376
1296/* 1377/*
1297 * try to call OOM killer. returns false if we should exit memory-reclaim loop. 1378 * try to call OOM killer. returns false if we should exit memory-reclaim loop.
1298 */ 1379 */
1299bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask) 1380bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask)
1300{ 1381{
1301 DEFINE_WAIT(wait); 1382 struct oom_wait_info owait;
1302 bool locked; 1383 bool locked, need_to_kill;
1303 1384
1385 owait.mem = mem;
1386 owait.wait.flags = 0;
1387 owait.wait.func = memcg_oom_wake_function;
1388 owait.wait.private = current;
1389 INIT_LIST_HEAD(&owait.wait.task_list);
1390 need_to_kill = true;
1304 /* At first, try to OOM lock hierarchy under mem.*/ 1391 /* At first, try to OOM lock hierarchy under mem.*/
1305 mutex_lock(&memcg_oom_mutex); 1392 mutex_lock(&memcg_oom_mutex);
1306 locked = mem_cgroup_oom_lock(mem); 1393 locked = mem_cgroup_oom_lock(mem);
@@ -1309,32 +1396,23 @@ bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask)
1309 * accounting. So, UNINTERRUPTIBLE is appropriate. But SIGKILL 1396 * accounting. So, UNINTERRUPTIBLE is appropriate. But SIGKILL
1310 * under OOM is always welcomed, use TASK_KILLABLE here. 1397 * under OOM is always welcomed, use TASK_KILLABLE here.
1311 */ 1398 */
1312 if (!locked) 1399 prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
1313 prepare_to_wait(&memcg_oom_waitq, &wait, TASK_KILLABLE); 1400 if (!locked || mem->oom_kill_disable)
1401 need_to_kill = false;
1402 if (locked)
1403 mem_cgroup_oom_notify(mem);
1314 mutex_unlock(&memcg_oom_mutex); 1404 mutex_unlock(&memcg_oom_mutex);
1315 1405
1316 if (locked) 1406 if (need_to_kill) {
1407 finish_wait(&memcg_oom_waitq, &owait.wait);
1317 mem_cgroup_out_of_memory(mem, mask); 1408 mem_cgroup_out_of_memory(mem, mask);
1318 else { 1409 } else {
1319 schedule(); 1410 schedule();
1320 finish_wait(&memcg_oom_waitq, &wait); 1411 finish_wait(&memcg_oom_waitq, &owait.wait);
1321 } 1412 }
1322 mutex_lock(&memcg_oom_mutex); 1413 mutex_lock(&memcg_oom_mutex);
1323 mem_cgroup_oom_unlock(mem); 1414 mem_cgroup_oom_unlock(mem);
1324 /* 1415 memcg_wakeup_oom(mem);
1325 * Here, we use global waitq .....more fine grained waitq ?
1326 * Assume following hierarchy.
1327 * A/
1328 * 01
1329 * 02
1330 * assume OOM happens both in A and 01 at the same time. Tthey are
1331 * mutually exclusive by lock. (kill in 01 helps A.)
1332 * When we use per memcg waitq, we have to wake up waiters on A and 02
1333 * in addtion to waiters on 01. We use global waitq for avoiding mess.
1334 * It will not be a big problem.
1335 * (And a task may be moved to other groups while it's waiting for OOM.)
1336 */
1337 wake_up_all(&memcg_oom_waitq);
1338 mutex_unlock(&memcg_oom_mutex); 1416 mutex_unlock(&memcg_oom_mutex);
1339 1417
1340 if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current)) 1418 if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current))
@@ -1438,7 +1516,7 @@ static void drain_local_stock(struct work_struct *dummy)
1438 1516
1439/* 1517/*
1440 * Cache charges(val) which is from res_counter, to local per_cpu area. 1518 * Cache charges(val) which is from res_counter, to local per_cpu area.
1441 * This will be consumed by consumt_stock() function, later. 1519 * This will be consumed by consume_stock() function, later.
1442 */ 1520 */
1443static void refill_stock(struct mem_cgroup *mem, int val) 1521static void refill_stock(struct mem_cgroup *mem, int val)
1444{ 1522{
@@ -2118,15 +2196,6 @@ __do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype)
2118 /* If swapout, usage of swap doesn't decrease */ 2196 /* If swapout, usage of swap doesn't decrease */
2119 if (!do_swap_account || ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) 2197 if (!do_swap_account || ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
2120 uncharge_memsw = false; 2198 uncharge_memsw = false;
2121 /*
2122 * do_batch > 0 when unmapping pages or inode invalidate/truncate.
2123 * In those cases, all pages freed continously can be expected to be in
2124 * the same cgroup and we have chance to coalesce uncharges.
2125 * But we do uncharge one by one if this is killed by OOM(TIF_MEMDIE)
2126 * because we want to do uncharge as soon as possible.
2127 */
2128 if (!current->memcg_batch.do_batch || test_thread_flag(TIF_MEMDIE))
2129 goto direct_uncharge;
2130 2199
2131 batch = &current->memcg_batch; 2200 batch = &current->memcg_batch;
2132 /* 2201 /*
@@ -2137,6 +2206,17 @@ __do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype)
2137 if (!batch->memcg) 2206 if (!batch->memcg)
2138 batch->memcg = mem; 2207 batch->memcg = mem;
2139 /* 2208 /*
2209 * do_batch > 0 when unmapping pages or inode invalidate/truncate.
 2210 * In those cases, all pages freed continuously can be expected to be in
 2211 * the same cgroup and we have a chance to coalesce uncharges.
2212 * But we do uncharge one by one if this is killed by OOM(TIF_MEMDIE)
2213 * because we want to do uncharge as soon as possible.
2214 */
2215
2216 if (!batch->do_batch || test_thread_flag(TIF_MEMDIE))
2217 goto direct_uncharge;
2218
2219 /*
2140 * In typical case, batch->memcg == mem. This means we can 2220 * In typical case, batch->memcg == mem. This means we can
2141 * merge a series of uncharges to an uncharge of res_counter. 2221 * merge a series of uncharges to an uncharge of res_counter.
2142 * If not, we uncharge res_counter ony by one. 2222 * If not, we uncharge res_counter ony by one.
@@ -2152,6 +2232,8 @@ direct_uncharge:
2152 res_counter_uncharge(&mem->res, PAGE_SIZE); 2232 res_counter_uncharge(&mem->res, PAGE_SIZE);
2153 if (uncharge_memsw) 2233 if (uncharge_memsw)
2154 res_counter_uncharge(&mem->memsw, PAGE_SIZE); 2234 res_counter_uncharge(&mem->memsw, PAGE_SIZE);
2235 if (unlikely(batch->memcg != mem))
2236 memcg_oom_recover(mem);
2155 return; 2237 return;
2156} 2238}
2157 2239
@@ -2188,7 +2270,8 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
2188 switch (ctype) { 2270 switch (ctype) {
2189 case MEM_CGROUP_CHARGE_TYPE_MAPPED: 2271 case MEM_CGROUP_CHARGE_TYPE_MAPPED:
2190 case MEM_CGROUP_CHARGE_TYPE_DROP: 2272 case MEM_CGROUP_CHARGE_TYPE_DROP:
2191 if (page_mapped(page)) 2273 /* See mem_cgroup_prepare_migration() */
2274 if (page_mapped(page) || PageCgroupMigration(pc))
2192 goto unlock_out; 2275 goto unlock_out;
2193 break; 2276 break;
2194 case MEM_CGROUP_CHARGE_TYPE_SWAPOUT: 2277 case MEM_CGROUP_CHARGE_TYPE_SWAPOUT:
@@ -2288,6 +2371,7 @@ void mem_cgroup_uncharge_end(void)
2288 res_counter_uncharge(&batch->memcg->res, batch->bytes); 2371 res_counter_uncharge(&batch->memcg->res, batch->bytes);
2289 if (batch->memsw_bytes) 2372 if (batch->memsw_bytes)
2290 res_counter_uncharge(&batch->memcg->memsw, batch->memsw_bytes); 2373 res_counter_uncharge(&batch->memcg->memsw, batch->memsw_bytes);
2374 memcg_oom_recover(batch->memcg);
2291 /* forget this pointer (for sanity check) */ 2375 /* forget this pointer (for sanity check) */
2292 batch->memcg = NULL; 2376 batch->memcg = NULL;
2293} 2377}
@@ -2410,10 +2494,12 @@ static inline int mem_cgroup_move_swap_account(swp_entry_t entry,
2410 * Before starting migration, account PAGE_SIZE to mem_cgroup that the old 2494 * Before starting migration, account PAGE_SIZE to mem_cgroup that the old
2411 * page belongs to. 2495 * page belongs to.
2412 */ 2496 */
2413int mem_cgroup_prepare_migration(struct page *page, struct mem_cgroup **ptr) 2497int mem_cgroup_prepare_migration(struct page *page,
2498 struct page *newpage, struct mem_cgroup **ptr)
2414{ 2499{
2415 struct page_cgroup *pc; 2500 struct page_cgroup *pc;
2416 struct mem_cgroup *mem = NULL; 2501 struct mem_cgroup *mem = NULL;
2502 enum charge_type ctype;
2417 int ret = 0; 2503 int ret = 0;
2418 2504
2419 if (mem_cgroup_disabled()) 2505 if (mem_cgroup_disabled())
@@ -2424,69 +2510,125 @@ int mem_cgroup_prepare_migration(struct page *page, struct mem_cgroup **ptr)
2424 if (PageCgroupUsed(pc)) { 2510 if (PageCgroupUsed(pc)) {
2425 mem = pc->mem_cgroup; 2511 mem = pc->mem_cgroup;
2426 css_get(&mem->css); 2512 css_get(&mem->css);
2513 /*
2514 * At migrating an anonymous page, its mapcount goes down
2515 * to 0 and uncharge() will be called. But, even if it's fully
2516 * unmapped, migration may fail and this page has to be
2517 * charged again. We set MIGRATION flag here and delay uncharge
2518 * until end_migration() is called
2519 *
2520 * Corner Case Thinking
2521 * A)
 2522 * When the old page was mapped as Anon and is unmapped and freed
2523 * while migration was ongoing.
2524 * If unmap finds the old page, uncharge() of it will be delayed
2525 * until end_migration(). If unmap finds a new page, it's
 2526 * uncharged when unmap drops its mapcount from 1 to 0. If unmap code
2527 * finds swap_migration_entry, the new page will not be mapped
 2528 * and end_migration() will find it (mapcount==0).
2529 *
2530 * B)
 2531 * When the old page was mapped but migration fails, the kernel
 2532 * remaps it. A charge for it is kept by the MIGRATION flag even
2533 * if mapcount goes down to 0. We can do remap successfully
2534 * without charging it again.
2535 *
2536 * C)
2537 * The "old" page is under lock_page() until the end of
2538 * migration, so, the old page itself will not be swapped-out.
 2539 * If the new page is swapped out before end_migration, our
2540 * hook to usual swap-out path will catch the event.
2541 */
2542 if (PageAnon(page))
2543 SetPageCgroupMigration(pc);
2427 } 2544 }
2428 unlock_page_cgroup(pc); 2545 unlock_page_cgroup(pc);
2546 /*
2547 * If the page is not charged at this point,
2548 * we return here.
2549 */
2550 if (!mem)
2551 return 0;
2429 2552
2430 *ptr = mem; 2553 *ptr = mem;
2431 if (mem) { 2554 ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, ptr, false);
2432 ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, ptr, false); 2555 css_put(&mem->css);/* drop extra refcnt */
2433 css_put(&mem->css); 2556 if (ret || *ptr == NULL) {
2557 if (PageAnon(page)) {
2558 lock_page_cgroup(pc);
2559 ClearPageCgroupMigration(pc);
2560 unlock_page_cgroup(pc);
2561 /*
2562 * The old page may be fully unmapped while we kept it.
2563 */
2564 mem_cgroup_uncharge_page(page);
2565 }
2566 return -ENOMEM;
2434 } 2567 }
2568 /*
2569 * We charge new page before it's used/mapped. So, even if unlock_page()
2570 * is called before end_migration, we can catch all events on this new
2571 * page. In the case new page is migrated but not remapped, new page's
2572 * mapcount will be finally 0 and we call uncharge in end_migration().
2573 */
2574 pc = lookup_page_cgroup(newpage);
2575 if (PageAnon(page))
2576 ctype = MEM_CGROUP_CHARGE_TYPE_MAPPED;
2577 else if (page_is_file_cache(page))
2578 ctype = MEM_CGROUP_CHARGE_TYPE_CACHE;
2579 else
2580 ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM;
2581 __mem_cgroup_commit_charge(mem, pc, ctype);
2435 return ret; 2582 return ret;
2436} 2583}
2437 2584
2438/* remove redundant charge if migration failed*/ 2585/* remove redundant charge if migration failed*/
2439void mem_cgroup_end_migration(struct mem_cgroup *mem, 2586void mem_cgroup_end_migration(struct mem_cgroup *mem,
2440 struct page *oldpage, struct page *newpage) 2587 struct page *oldpage, struct page *newpage)
2441{ 2588{
2442 struct page *target, *unused; 2589 struct page *used, *unused;
2443 struct page_cgroup *pc; 2590 struct page_cgroup *pc;
2444 enum charge_type ctype;
2445 2591
2446 if (!mem) 2592 if (!mem)
2447 return; 2593 return;
2594 /* blocks rmdir() */
2448 cgroup_exclude_rmdir(&mem->css); 2595 cgroup_exclude_rmdir(&mem->css);
2449 /* at migration success, oldpage->mapping is NULL. */ 2596 /* at migration success, oldpage->mapping is NULL. */
2450 if (oldpage->mapping) { 2597 if (oldpage->mapping) {
2451 target = oldpage; 2598 used = oldpage;
2452 unused = NULL; 2599 unused = newpage;
2453 } else { 2600 } else {
2454 target = newpage; 2601 used = newpage;
2455 unused = oldpage; 2602 unused = oldpage;
2456 } 2603 }
2457
2458 if (PageAnon(target))
2459 ctype = MEM_CGROUP_CHARGE_TYPE_MAPPED;
2460 else if (page_is_file_cache(target))
2461 ctype = MEM_CGROUP_CHARGE_TYPE_CACHE;
2462 else
2463 ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM;
2464
2465 /* unused page is not on radix-tree now. */
2466 if (unused)
2467 __mem_cgroup_uncharge_common(unused, ctype);
2468
2469 pc = lookup_page_cgroup(target);
2470 /* 2604 /*
2471 * __mem_cgroup_commit_charge() check PCG_USED bit of page_cgroup. 2605 * We disallowed uncharge of pages under migration because mapcount
 2472 * So, double-counting is effectively avoided. 2606 * of the page goes down to zero, temporarily.
 2607 * Clear the flag and check whether the page should be charged.
2473 */ 2608 */
2474 __mem_cgroup_commit_charge(mem, pc, ctype); 2609 pc = lookup_page_cgroup(oldpage);
2610 lock_page_cgroup(pc);
2611 ClearPageCgroupMigration(pc);
2612 unlock_page_cgroup(pc);
2613
2614 if (unused != oldpage)
2615 pc = lookup_page_cgroup(unused);
2616 __mem_cgroup_uncharge_common(unused, MEM_CGROUP_CHARGE_TYPE_FORCE);
2475 2617
2618 pc = lookup_page_cgroup(used);
2476 /* 2619 /*
2477 * Both of oldpage and newpage are still under lock_page(). 2620 * If a page is a file cache, radix-tree replacement is very atomic
2478 * Then, we don't have to care about race in radix-tree. 2621 * and we can skip this check. When it was an Anon page, its mapcount
 2479 * But we have to be careful that this page is unmapped or not. 2622 * goes down to 0. But because we added the MIGRATION flag, it's not
 2480 * 2623 * uncharged yet. There are several cases, but the page->mapcount check
2481 * There is a case for !page_mapped(). At the start of 2624 * and USED bit check in mem_cgroup_uncharge_page() will do enough
2482 * migration, oldpage was mapped. But now, it's zapped. 2625 * check. (see prepare_charge() also)
2483 * But we know *target* page is not freed/reused under us.
2484 * mem_cgroup_uncharge_page() does all necessary checks.
2485 */ 2626 */
2486 if (ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED) 2627 if (PageAnon(used))
2487 mem_cgroup_uncharge_page(target); 2628 mem_cgroup_uncharge_page(used);
2488 /* 2629 /*
2489 * At migration, we may charge account against cgroup which has no tasks 2630 * At migration, we may charge account against cgroup which has no
2631 * tasks.
2490 * So, rmdir()->pre_destroy() can be called while we do this charge. 2632 * So, rmdir()->pre_destroy() can be called while we do this charge.
2491 * In that case, we need to call pre_destroy() again. check it here. 2633 * In that case, we need to call pre_destroy() again. check it here.
2492 */ 2634 */
@@ -2524,10 +2666,11 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
2524 unsigned long long val) 2666 unsigned long long val)
2525{ 2667{
2526 int retry_count; 2668 int retry_count;
2527 u64 memswlimit; 2669 u64 memswlimit, memlimit;
2528 int ret = 0; 2670 int ret = 0;
2529 int children = mem_cgroup_count_children(memcg); 2671 int children = mem_cgroup_count_children(memcg);
2530 u64 curusage, oldusage; 2672 u64 curusage, oldusage;
2673 int enlarge;
2531 2674
2532 /* 2675 /*
2533 * For keeping hierarchical_reclaim simple, how long we should retry 2676 * For keeping hierarchical_reclaim simple, how long we should retry
@@ -2538,6 +2681,7 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
2538 2681
2539 oldusage = res_counter_read_u64(&memcg->res, RES_USAGE); 2682 oldusage = res_counter_read_u64(&memcg->res, RES_USAGE);
2540 2683
2684 enlarge = 0;
2541 while (retry_count) { 2685 while (retry_count) {
2542 if (signal_pending(current)) { 2686 if (signal_pending(current)) {
2543 ret = -EINTR; 2687 ret = -EINTR;
@@ -2555,6 +2699,11 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
2555 mutex_unlock(&set_limit_mutex); 2699 mutex_unlock(&set_limit_mutex);
2556 break; 2700 break;
2557 } 2701 }
2702
2703 memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT);
2704 if (memlimit < val)
2705 enlarge = 1;
2706
2558 ret = res_counter_set_limit(&memcg->res, val); 2707 ret = res_counter_set_limit(&memcg->res, val);
2559 if (!ret) { 2708 if (!ret) {
2560 if (memswlimit == val) 2709 if (memswlimit == val)
@@ -2576,6 +2725,8 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
2576 else 2725 else
2577 oldusage = curusage; 2726 oldusage = curusage;
2578 } 2727 }
2728 if (!ret && enlarge)
2729 memcg_oom_recover(memcg);
2579 2730
2580 return ret; 2731 return ret;
2581} 2732}
@@ -2584,9 +2735,10 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
2584 unsigned long long val) 2735 unsigned long long val)
2585{ 2736{
2586 int retry_count; 2737 int retry_count;
2587 u64 memlimit, oldusage, curusage; 2738 u64 memlimit, memswlimit, oldusage, curusage;
2588 int children = mem_cgroup_count_children(memcg); 2739 int children = mem_cgroup_count_children(memcg);
2589 int ret = -EBUSY; 2740 int ret = -EBUSY;
2741 int enlarge = 0;
2590 2742
2591 /* see mem_cgroup_resize_res_limit */ 2743 /* see mem_cgroup_resize_res_limit */
2592 retry_count = children * MEM_CGROUP_RECLAIM_RETRIES; 2744 retry_count = children * MEM_CGROUP_RECLAIM_RETRIES;
@@ -2608,6 +2760,9 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
2608 mutex_unlock(&set_limit_mutex); 2760 mutex_unlock(&set_limit_mutex);
2609 break; 2761 break;
2610 } 2762 }
2763 memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
2764 if (memswlimit < val)
2765 enlarge = 1;
2611 ret = res_counter_set_limit(&memcg->memsw, val); 2766 ret = res_counter_set_limit(&memcg->memsw, val);
2612 if (!ret) { 2767 if (!ret) {
2613 if (memlimit == val) 2768 if (memlimit == val)
@@ -2630,6 +2785,8 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
2630 else 2785 else
2631 oldusage = curusage; 2786 oldusage = curusage;
2632 } 2787 }
2788 if (!ret && enlarge)
2789 memcg_oom_recover(memcg);
2633 return ret; 2790 return ret;
2634} 2791}
2635 2792
@@ -2821,6 +2978,7 @@ move_account:
2821 if (ret) 2978 if (ret)
2822 break; 2979 break;
2823 } 2980 }
2981 memcg_oom_recover(mem);
2824 /* it seems parent cgroup doesn't have enough mem */ 2982 /* it seems parent cgroup doesn't have enough mem */
2825 if (ret == -ENOMEM) 2983 if (ret == -ENOMEM)
2826 goto try_to_free; 2984 goto try_to_free;
@@ -3311,9 +3469,9 @@ static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap)
3311 3469
3312 rcu_read_lock(); 3470 rcu_read_lock();
3313 if (!swap) 3471 if (!swap)
3314 t = rcu_dereference(memcg->thresholds); 3472 t = rcu_dereference(memcg->thresholds.primary);
3315 else 3473 else
3316 t = rcu_dereference(memcg->memsw_thresholds); 3474 t = rcu_dereference(memcg->memsw_thresholds.primary);
3317 3475
3318 if (!t) 3476 if (!t)
3319 goto unlock; 3477 goto unlock;
@@ -3325,7 +3483,7 @@ static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap)
3325 * If it's not true, a threshold was crossed after last 3483 * If it's not true, a threshold was crossed after last
3326 * call of __mem_cgroup_threshold(). 3484 * call of __mem_cgroup_threshold().
3327 */ 3485 */
3328 i = atomic_read(&t->current_threshold); 3486 i = t->current_threshold;
3329 3487
3330 /* 3488 /*
3331 * Iterate backward over array of thresholds starting from 3489 * Iterate backward over array of thresholds starting from
@@ -3349,7 +3507,7 @@ static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap)
3349 eventfd_signal(t->entries[i].eventfd, 1); 3507 eventfd_signal(t->entries[i].eventfd, 1);
3350 3508
3351 /* Update current_threshold */ 3509 /* Update current_threshold */
3352 atomic_set(&t->current_threshold, i - 1); 3510 t->current_threshold = i - 1;
3353unlock: 3511unlock:
3354 rcu_read_unlock(); 3512 rcu_read_unlock();
3355} 3513}
@@ -3369,106 +3527,117 @@ static int compare_thresholds(const void *a, const void *b)
3369 return _a->threshold - _b->threshold; 3527 return _a->threshold - _b->threshold;
3370} 3528}
3371 3529
3372static int mem_cgroup_register_event(struct cgroup *cgrp, struct cftype *cft, 3530static int mem_cgroup_oom_notify_cb(struct mem_cgroup *mem, void *data)
3373 struct eventfd_ctx *eventfd, const char *args) 3531{
3532 struct mem_cgroup_eventfd_list *ev;
3533
3534 list_for_each_entry(ev, &mem->oom_notify, list)
3535 eventfd_signal(ev->eventfd, 1);
3536 return 0;
3537}
3538
3539static void mem_cgroup_oom_notify(struct mem_cgroup *mem)
3540{
3541 mem_cgroup_walk_tree(mem, NULL, mem_cgroup_oom_notify_cb);
3542}
3543
3544static int mem_cgroup_usage_register_event(struct cgroup *cgrp,
3545 struct cftype *cft, struct eventfd_ctx *eventfd, const char *args)
3374{ 3546{
3375 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); 3547 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
3376 struct mem_cgroup_threshold_ary *thresholds, *thresholds_new; 3548 struct mem_cgroup_thresholds *thresholds;
3549 struct mem_cgroup_threshold_ary *new;
3377 int type = MEMFILE_TYPE(cft->private); 3550 int type = MEMFILE_TYPE(cft->private);
3378 u64 threshold, usage; 3551 u64 threshold, usage;
3379 int size; 3552 int i, size, ret;
3380 int i, ret;
3381 3553
3382 ret = res_counter_memparse_write_strategy(args, &threshold); 3554 ret = res_counter_memparse_write_strategy(args, &threshold);
3383 if (ret) 3555 if (ret)
3384 return ret; 3556 return ret;
3385 3557
3386 mutex_lock(&memcg->thresholds_lock); 3558 mutex_lock(&memcg->thresholds_lock);
3559
3387 if (type == _MEM) 3560 if (type == _MEM)
3388 thresholds = memcg->thresholds; 3561 thresholds = &memcg->thresholds;
3389 else if (type == _MEMSWAP) 3562 else if (type == _MEMSWAP)
3390 thresholds = memcg->memsw_thresholds; 3563 thresholds = &memcg->memsw_thresholds;
3391 else 3564 else
3392 BUG(); 3565 BUG();
3393 3566
3394 usage = mem_cgroup_usage(memcg, type == _MEMSWAP); 3567 usage = mem_cgroup_usage(memcg, type == _MEMSWAP);
3395 3568
3396 /* Check if a threshold crossed before adding a new one */ 3569 /* Check if a threshold crossed before adding a new one */
3397 if (thresholds) 3570 if (thresholds->primary)
3398 __mem_cgroup_threshold(memcg, type == _MEMSWAP); 3571 __mem_cgroup_threshold(memcg, type == _MEMSWAP);
3399 3572
3400 if (thresholds) 3573 size = thresholds->primary ? thresholds->primary->size + 1 : 1;
3401 size = thresholds->size + 1;
3402 else
3403 size = 1;
3404 3574
3405 /* Allocate memory for new array of thresholds */ 3575 /* Allocate memory for new array of thresholds */
3406 thresholds_new = kmalloc(sizeof(*thresholds_new) + 3576 new = kmalloc(sizeof(*new) + size * sizeof(struct mem_cgroup_threshold),
3407 size * sizeof(struct mem_cgroup_threshold),
3408 GFP_KERNEL); 3577 GFP_KERNEL);
3409 if (!thresholds_new) { 3578 if (!new) {
3410 ret = -ENOMEM; 3579 ret = -ENOMEM;
3411 goto unlock; 3580 goto unlock;
3412 } 3581 }
3413 thresholds_new->size = size; 3582 new->size = size;
3414 3583
3415 /* Copy thresholds (if any) to new array */ 3584 /* Copy thresholds (if any) to new array */
3416 if (thresholds) 3585 if (thresholds->primary) {
3417 memcpy(thresholds_new->entries, thresholds->entries, 3586 memcpy(new->entries, thresholds->primary->entries, (size - 1) *
3418 thresholds->size *
3419 sizeof(struct mem_cgroup_threshold)); 3587 sizeof(struct mem_cgroup_threshold));
3588 }
3589
3420 /* Add new threshold */ 3590 /* Add new threshold */
3421 thresholds_new->entries[size - 1].eventfd = eventfd; 3591 new->entries[size - 1].eventfd = eventfd;
3422 thresholds_new->entries[size - 1].threshold = threshold; 3592 new->entries[size - 1].threshold = threshold;
3423 3593
3424 /* Sort thresholds. Registering of new threshold isn't time-critical */ 3594 /* Sort thresholds. Registering of new threshold isn't time-critical */
3425 sort(thresholds_new->entries, size, 3595 sort(new->entries, size, sizeof(struct mem_cgroup_threshold),
3426 sizeof(struct mem_cgroup_threshold),
3427 compare_thresholds, NULL); 3596 compare_thresholds, NULL);
3428 3597
3429 /* Find current threshold */ 3598 /* Find current threshold */
3430 atomic_set(&thresholds_new->current_threshold, -1); 3599 new->current_threshold = -1;
3431 for (i = 0; i < size; i++) { 3600 for (i = 0; i < size; i++) {
3432 if (thresholds_new->entries[i].threshold < usage) { 3601 if (new->entries[i].threshold < usage) {
3433 /* 3602 /*
3434 * thresholds_new->current_threshold will not be used 3603 * new->current_threshold will not be used until
3435 * until rcu_assign_pointer(), so it's safe to increment 3604 * rcu_assign_pointer(), so it's safe to increment
3436 * it here. 3605 * it here.
3437 */ 3606 */
3438 atomic_inc(&thresholds_new->current_threshold); 3607 ++new->current_threshold;
3439 } 3608 }
3440 } 3609 }
3441 3610
3442 if (type == _MEM) 3611 /* Free old spare buffer and save old primary buffer as spare */
3443 rcu_assign_pointer(memcg->thresholds, thresholds_new); 3612 kfree(thresholds->spare);
3444 else 3613 thresholds->spare = thresholds->primary;
3445 rcu_assign_pointer(memcg->memsw_thresholds, thresholds_new); 3614
3615 rcu_assign_pointer(thresholds->primary, new);
3446 3616
3447 /* To be sure that nobody uses thresholds before freeing it */ 3617 /* To be sure that nobody uses thresholds */
3448 synchronize_rcu(); 3618 synchronize_rcu();
3449 3619
3450 kfree(thresholds);
3451unlock: 3620unlock:
3452 mutex_unlock(&memcg->thresholds_lock); 3621 mutex_unlock(&memcg->thresholds_lock);
3453 3622
3454 return ret; 3623 return ret;
3455} 3624}
3456 3625
3457static int mem_cgroup_unregister_event(struct cgroup *cgrp, struct cftype *cft, 3626static void mem_cgroup_usage_unregister_event(struct cgroup *cgrp,
3458 struct eventfd_ctx *eventfd) 3627 struct cftype *cft, struct eventfd_ctx *eventfd)
3459{ 3628{
3460 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); 3629 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
3461 struct mem_cgroup_threshold_ary *thresholds, *thresholds_new; 3630 struct mem_cgroup_thresholds *thresholds;
3631 struct mem_cgroup_threshold_ary *new;
3462 int type = MEMFILE_TYPE(cft->private); 3632 int type = MEMFILE_TYPE(cft->private);
3463 u64 usage; 3633 u64 usage;
3464 int size = 0; 3634 int i, j, size;
3465 int i, j, ret;
3466 3635
3467 mutex_lock(&memcg->thresholds_lock); 3636 mutex_lock(&memcg->thresholds_lock);
3468 if (type == _MEM) 3637 if (type == _MEM)
3469 thresholds = memcg->thresholds; 3638 thresholds = &memcg->thresholds;
3470 else if (type == _MEMSWAP) 3639 else if (type == _MEMSWAP)
3471 thresholds = memcg->memsw_thresholds; 3640 thresholds = &memcg->memsw_thresholds;
3472 else 3641 else
3473 BUG(); 3642 BUG();
3474 3643
@@ -3484,59 +3653,136 @@ static int mem_cgroup_unregister_event(struct cgroup *cgrp, struct cftype *cft,
3484 __mem_cgroup_threshold(memcg, type == _MEMSWAP); 3653 __mem_cgroup_threshold(memcg, type == _MEMSWAP);
3485 3654
3486 /* Calculate new number of threshold */ 3655 /* Calculate new number of threshold */
3487 for (i = 0; i < thresholds->size; i++) { 3656 size = 0;
3488 if (thresholds->entries[i].eventfd != eventfd) 3657 for (i = 0; i < thresholds->primary->size; i++) {
3658 if (thresholds->primary->entries[i].eventfd != eventfd)
3489 size++; 3659 size++;
3490 } 3660 }
3491 3661
3662 new = thresholds->spare;
3663
3492 /* Set thresholds array to NULL if we don't have thresholds */ 3664 /* Set thresholds array to NULL if we don't have thresholds */
3493 if (!size) { 3665 if (!size) {
3494 thresholds_new = NULL; 3666 kfree(new);
3495 goto assign; 3667 new = NULL;
3668 goto swap_buffers;
3496 } 3669 }
3497 3670
3498 /* Allocate memory for new array of thresholds */ 3671 new->size = size;
3499 thresholds_new = kmalloc(sizeof(*thresholds_new) +
3500 size * sizeof(struct mem_cgroup_threshold),
3501 GFP_KERNEL);
3502 if (!thresholds_new) {
3503 ret = -ENOMEM;
3504 goto unlock;
3505 }
3506 thresholds_new->size = size;
3507 3672
3508 /* Copy thresholds and find current threshold */ 3673 /* Copy thresholds and find current threshold */
3509 atomic_set(&thresholds_new->current_threshold, -1); 3674 new->current_threshold = -1;
3510 for (i = 0, j = 0; i < thresholds->size; i++) { 3675 for (i = 0, j = 0; i < thresholds->primary->size; i++) {
3511 if (thresholds->entries[i].eventfd == eventfd) 3676 if (thresholds->primary->entries[i].eventfd == eventfd)
3512 continue; 3677 continue;
3513 3678
3514 thresholds_new->entries[j] = thresholds->entries[i]; 3679 new->entries[j] = thresholds->primary->entries[i];
3515 if (thresholds_new->entries[j].threshold < usage) { 3680 if (new->entries[j].threshold < usage) {
3516 /* 3681 /*
3517 * thresholds_new->current_threshold will not be used 3682 * new->current_threshold will not be used
3518 * until rcu_assign_pointer(), so it's safe to increment 3683 * until rcu_assign_pointer(), so it's safe to increment
3519 * it here. 3684 * it here.
3520 */ 3685 */
3521 atomic_inc(&thresholds_new->current_threshold); 3686 ++new->current_threshold;
3522 } 3687 }
3523 j++; 3688 j++;
3524 } 3689 }
3525 3690
3526assign: 3691swap_buffers:
3527 if (type == _MEM) 3692 /* Swap primary and spare array */
3528 rcu_assign_pointer(memcg->thresholds, thresholds_new); 3693 thresholds->spare = thresholds->primary;
3529 else 3694 rcu_assign_pointer(thresholds->primary, new);
3530 rcu_assign_pointer(memcg->memsw_thresholds, thresholds_new);
3531 3695
3532 /* To be sure that nobody uses thresholds before freeing it */ 3696 /* To be sure that nobody uses thresholds */
3533 synchronize_rcu(); 3697 synchronize_rcu();
3534 3698
3535 kfree(thresholds);
3536unlock:
3537 mutex_unlock(&memcg->thresholds_lock); 3699 mutex_unlock(&memcg->thresholds_lock);
3700}
3538 3701
3539 return ret; 3702static int mem_cgroup_oom_register_event(struct cgroup *cgrp,
3703 struct cftype *cft, struct eventfd_ctx *eventfd, const char *args)
3704{
3705 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
3706 struct mem_cgroup_eventfd_list *event;
3707 int type = MEMFILE_TYPE(cft->private);
3708
3709 BUG_ON(type != _OOM_TYPE);
3710 event = kmalloc(sizeof(*event), GFP_KERNEL);
3711 if (!event)
3712 return -ENOMEM;
3713
3714 mutex_lock(&memcg_oom_mutex);
3715
3716 event->eventfd = eventfd;
3717 list_add(&event->list, &memcg->oom_notify);
3718
3719 /* already in OOM ? */
3720 if (atomic_read(&memcg->oom_lock))
3721 eventfd_signal(eventfd, 1);
3722 mutex_unlock(&memcg_oom_mutex);
3723
3724 return 0;
3725}
3726
3727static void mem_cgroup_oom_unregister_event(struct cgroup *cgrp,
3728 struct cftype *cft, struct eventfd_ctx *eventfd)
3729{
3730 struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp);
3731 struct mem_cgroup_eventfd_list *ev, *tmp;
3732 int type = MEMFILE_TYPE(cft->private);
3733
3734 BUG_ON(type != _OOM_TYPE);
3735
3736 mutex_lock(&memcg_oom_mutex);
3737
3738 list_for_each_entry_safe(ev, tmp, &mem->oom_notify, list) {
3739 if (ev->eventfd == eventfd) {
3740 list_del(&ev->list);
3741 kfree(ev);
3742 }
3743 }
3744
3745 mutex_unlock(&memcg_oom_mutex);
3746}
3747
3748static int mem_cgroup_oom_control_read(struct cgroup *cgrp,
3749 struct cftype *cft, struct cgroup_map_cb *cb)
3750{
3751 struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp);
3752
3753 cb->fill(cb, "oom_kill_disable", mem->oom_kill_disable);
3754
3755 if (atomic_read(&mem->oom_lock))
3756 cb->fill(cb, "under_oom", 1);
3757 else
3758 cb->fill(cb, "under_oom", 0);
3759 return 0;
3760}
3761
 3762/* Set oom_kill_disable for this memcg: 0 re-enables the OOM killer, 1
 3763 * disables it; the hierarchy restrictions are checked below. */
3764static int mem_cgroup_oom_control_write(struct cgroup *cgrp,
3765 struct cftype *cft, u64 val)
3766{
3767 struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp);
3768 struct mem_cgroup *parent;
3769
3770 /* cannot set to root cgroup and only 0 and 1 are allowed */
3771 if (!cgrp->parent || !((val == 0) || (val == 1)))
3772 return -EINVAL;
3773
3774 parent = mem_cgroup_from_cont(cgrp->parent);
3775
3776 cgroup_lock();
 3777 /* oom_kill_disable applies to the whole subhierarchy. */
3778 if ((parent->use_hierarchy) ||
3779 (mem->use_hierarchy && !list_empty(&cgrp->children))) {
3780 cgroup_unlock();
3781 return -EINVAL;
3782 }
3783 mem->oom_kill_disable = val;
3784 cgroup_unlock();
3785 return 0;
3540} 3786}
3541 3787
3542static struct cftype mem_cgroup_files[] = { 3788static struct cftype mem_cgroup_files[] = {
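
The reworked threshold arrays (a primary array published with rcu_assign_pointer() and a spare kept for the next update) and the new oom_notify list are both driven from userspace through cgroup.event_control. A minimal sketch, assuming a cgroup-v1 memcg mounted at /sys/fs/cgroup/memory with a group named "demo"; the 64 MB threshold is an arbitrary example value:

    #include <fcntl.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>
    #include <unistd.h>
    #include <sys/eventfd.h>

    #define CG "/sys/fs/cgroup/memory/demo"    /* assumed mount point + group */

    static int open_cg(const char *file, int flags)
    {
            char path[256];
            int fd;

            snprintf(path, sizeof(path), "%s/%s", CG, file);
            fd = open(path, flags);
            if (fd < 0) {
                    perror(path);
                    exit(1);
            }
            return fd;
    }

    int main(void)
    {
            char buf[64];
            uint64_t count;
            int efd   = eventfd(0, 0);
            int ctrl  = open_cg("cgroup.event_control", O_WRONLY);
            int usage = open_cg("memory.usage_in_bytes", O_RDONLY);
            int oom   = open_cg("memory.oom_control", O_RDONLY);

            /* threshold event: "<event_fd> <fd of memory.usage_in_bytes> <bytes>" */
            snprintf(buf, sizeof(buf), "%d %d %llu", efd, usage, 64ULL << 20);
            if (write(ctrl, buf, strlen(buf)) < 0)
                    perror("register threshold");

            /* OOM event: "<event_fd> <fd of memory.oom_control>" */
            snprintf(buf, sizeof(buf), "%d %d", efd, oom);
            if (write(ctrl, buf, strlen(buf)) < 0)
                    perror("register oom notification");

            /* blocks until a threshold is crossed or the group enters OOM */
            if (read(efd, &count, sizeof(count)) == sizeof(count))
                    printf("memcg event fired %llu time(s)\n",
                           (unsigned long long)count);
            return 0;
    }

Both registrations may share one eventfd; the kernel only bumps the counter, so the reader re-checks memory.usage_in_bytes or memory.oom_control to see which event fired.
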
@@ -3544,8 +3790,8 @@ static struct cftype mem_cgroup_files[] = {
3544 .name = "usage_in_bytes", 3790 .name = "usage_in_bytes",
3545 .private = MEMFILE_PRIVATE(_MEM, RES_USAGE), 3791 .private = MEMFILE_PRIVATE(_MEM, RES_USAGE),
3546 .read_u64 = mem_cgroup_read, 3792 .read_u64 = mem_cgroup_read,
3547 .register_event = mem_cgroup_register_event, 3793 .register_event = mem_cgroup_usage_register_event,
3548 .unregister_event = mem_cgroup_unregister_event, 3794 .unregister_event = mem_cgroup_usage_unregister_event,
3549 }, 3795 },
3550 { 3796 {
3551 .name = "max_usage_in_bytes", 3797 .name = "max_usage_in_bytes",
@@ -3594,6 +3840,14 @@ static struct cftype mem_cgroup_files[] = {
3594 .read_u64 = mem_cgroup_move_charge_read, 3840 .read_u64 = mem_cgroup_move_charge_read,
3595 .write_u64 = mem_cgroup_move_charge_write, 3841 .write_u64 = mem_cgroup_move_charge_write,
3596 }, 3842 },
3843 {
3844 .name = "oom_control",
3845 .read_map = mem_cgroup_oom_control_read,
3846 .write_u64 = mem_cgroup_oom_control_write,
3847 .register_event = mem_cgroup_oom_register_event,
3848 .unregister_event = mem_cgroup_oom_unregister_event,
3849 .private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL),
3850 },
3597}; 3851};
3598 3852
3599#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 3853#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
@@ -3602,8 +3856,8 @@ static struct cftype memsw_cgroup_files[] = {
3602 .name = "memsw.usage_in_bytes", 3856 .name = "memsw.usage_in_bytes",
3603 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE), 3857 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
3604 .read_u64 = mem_cgroup_read, 3858 .read_u64 = mem_cgroup_read,
3605 .register_event = mem_cgroup_register_event, 3859 .register_event = mem_cgroup_usage_register_event,
3606 .unregister_event = mem_cgroup_unregister_event, 3860 .unregister_event = mem_cgroup_usage_unregister_event,
3607 }, 3861 },
3608 { 3862 {
3609 .name = "memsw.max_usage_in_bytes", 3863 .name = "memsw.max_usage_in_bytes",
@@ -3831,6 +4085,7 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
3831 } else { 4085 } else {
3832 parent = mem_cgroup_from_cont(cont->parent); 4086 parent = mem_cgroup_from_cont(cont->parent);
3833 mem->use_hierarchy = parent->use_hierarchy; 4087 mem->use_hierarchy = parent->use_hierarchy;
4088 mem->oom_kill_disable = parent->oom_kill_disable;
3834 } 4089 }
3835 4090
3836 if (parent && parent->use_hierarchy) { 4091 if (parent && parent->use_hierarchy) {
@@ -3849,6 +4104,7 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
3849 } 4104 }
3850 mem->last_scanned_child = 0; 4105 mem->last_scanned_child = 0;
3851 spin_lock_init(&mem->reclaim_param_lock); 4106 spin_lock_init(&mem->reclaim_param_lock);
4107 INIT_LIST_HEAD(&mem->oom_notify);
3852 4108
3853 if (parent) 4109 if (parent)
3854 mem->swappiness = get_swappiness(parent); 4110 mem->swappiness = get_swappiness(parent);
@@ -3976,6 +4232,80 @@ enum mc_target_type {
3976 MC_TARGET_SWAP, 4232 MC_TARGET_SWAP,
3977}; 4233};
3978 4234
4235static struct page *mc_handle_present_pte(struct vm_area_struct *vma,
4236 unsigned long addr, pte_t ptent)
4237{
4238 struct page *page = vm_normal_page(vma, addr, ptent);
4239
4240 if (!page || !page_mapped(page))
4241 return NULL;
4242 if (PageAnon(page)) {
4243 /* we don't move shared anon */
4244 if (!move_anon() || page_mapcount(page) > 2)
4245 return NULL;
4246 } else if (!move_file())
4247 /* we ignore mapcount for file pages */
4248 return NULL;
4249 if (!get_page_unless_zero(page))
4250 return NULL;
4251
4252 return page;
4253}
4254
4255static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
4256 unsigned long addr, pte_t ptent, swp_entry_t *entry)
4257{
4258 int usage_count;
4259 struct page *page = NULL;
4260 swp_entry_t ent = pte_to_swp_entry(ptent);
4261
4262 if (!move_anon() || non_swap_entry(ent))
4263 return NULL;
4264 usage_count = mem_cgroup_count_swap_user(ent, &page);
4265 if (usage_count > 1) { /* we don't move shared anon */
4266 if (page)
4267 put_page(page);
4268 return NULL;
4269 }
4270 if (do_swap_account)
4271 entry->val = ent.val;
4272
4273 return page;
4274}
4275
4276static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
4277 unsigned long addr, pte_t ptent, swp_entry_t *entry)
4278{
4279 struct page *page = NULL;
4280 struct inode *inode;
4281 struct address_space *mapping;
4282 pgoff_t pgoff;
4283
4284 if (!vma->vm_file) /* anonymous vma */
4285 return NULL;
4286 if (!move_file())
4287 return NULL;
4288
4289 inode = vma->vm_file->f_path.dentry->d_inode;
4290 mapping = vma->vm_file->f_mapping;
4291 if (pte_none(ptent))
4292 pgoff = linear_page_index(vma, addr);
4293 else /* pte_file(ptent) is true */
4294 pgoff = pte_to_pgoff(ptent);
4295
 4296 /* page is moved even if it's not RSS of this task (page-faulted). */
4297 if (!mapping_cap_swap_backed(mapping)) { /* normal file */
4298 page = find_get_page(mapping, pgoff);
4299 } else { /* shmem/tmpfs file. we should take account of swap too. */
4300 swp_entry_t ent;
4301 mem_cgroup_get_shmem_target(inode, pgoff, &page, &ent);
4302 if (do_swap_account)
4303 entry->val = ent.val;
4304 }
4305
4306 return page;
4307}
4308
3979static int is_target_pte_for_mc(struct vm_area_struct *vma, 4309static int is_target_pte_for_mc(struct vm_area_struct *vma,
3980 unsigned long addr, pte_t ptent, union mc_target *target) 4310 unsigned long addr, pte_t ptent, union mc_target *target)
3981{ 4311{
@@ -3983,43 +4313,16 @@ static int is_target_pte_for_mc(struct vm_area_struct *vma,
3983 struct page_cgroup *pc; 4313 struct page_cgroup *pc;
3984 int ret = 0; 4314 int ret = 0;
3985 swp_entry_t ent = { .val = 0 }; 4315 swp_entry_t ent = { .val = 0 };
3986 int usage_count = 0;
3987 bool move_anon = test_bit(MOVE_CHARGE_TYPE_ANON,
3988 &mc.to->move_charge_at_immigrate);
3989 4316
3990 if (!pte_present(ptent)) { 4317 if (pte_present(ptent))
3991 /* TODO: handle swap of shmes/tmpfs */ 4318 page = mc_handle_present_pte(vma, addr, ptent);
3992 if (pte_none(ptent) || pte_file(ptent)) 4319 else if (is_swap_pte(ptent))
3993 return 0; 4320 page = mc_handle_swap_pte(vma, addr, ptent, &ent);
3994 else if (is_swap_pte(ptent)) { 4321 else if (pte_none(ptent) || pte_file(ptent))
3995 ent = pte_to_swp_entry(ptent); 4322 page = mc_handle_file_pte(vma, addr, ptent, &ent);
3996 if (!move_anon || non_swap_entry(ent)) 4323
3997 return 0; 4324 if (!page && !ent.val)
3998 usage_count = mem_cgroup_count_swap_user(ent, &page);
3999 }
4000 } else {
4001 page = vm_normal_page(vma, addr, ptent);
4002 if (!page || !page_mapped(page))
4003 return 0;
4004 /*
4005 * TODO: We don't move charges of file(including shmem/tmpfs)
4006 * pages for now.
4007 */
4008 if (!move_anon || !PageAnon(page))
4009 return 0;
4010 if (!get_page_unless_zero(page))
4011 return 0;
4012 usage_count = page_mapcount(page);
4013 }
4014 if (usage_count > 1) {
4015 /*
4016 * TODO: We don't move charges of shared(used by multiple
4017 * processes) pages for now.
4018 */
4019 if (page)
4020 put_page(page);
4021 return 0; 4325 return 0;
4022 }
4023 if (page) { 4326 if (page) {
4024 pc = lookup_page_cgroup(page); 4327 pc = lookup_page_cgroup(page);
4025 /* 4328 /*
@@ -4035,8 +4338,8 @@ static int is_target_pte_for_mc(struct vm_area_struct *vma,
4035 if (!ret || !target) 4338 if (!ret || !target)
4036 put_page(page); 4339 put_page(page);
4037 } 4340 }
4038 /* throught */ 4341 /* There is a swap entry and a page doesn't exist or isn't charged */
4039 if (ent.val && do_swap_account && !ret && 4342 if (ent.val && !ret &&
4040 css_id(&mc.from->css) == lookup_swap_cgroup(ent)) { 4343 css_id(&mc.from->css) == lookup_swap_cgroup(ent)) {
4041 ret = MC_TARGET_SWAP; 4344 ret = MC_TARGET_SWAP;
4042 if (target) 4345 if (target)
@@ -4077,9 +4380,6 @@ static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
4077 }; 4380 };
4078 if (is_vm_hugetlb_page(vma)) 4381 if (is_vm_hugetlb_page(vma))
4079 continue; 4382 continue;
4080 /* TODO: We don't move charges of shmem/tmpfs pages for now. */
4081 if (vma->vm_flags & VM_SHARED)
4082 continue;
4083 walk_page_range(vma->vm_start, vma->vm_end, 4383 walk_page_range(vma->vm_start, vma->vm_end,
4084 &mem_cgroup_count_precharge_walk); 4384 &mem_cgroup_count_precharge_walk);
4085 } 4385 }
@@ -4102,6 +4402,7 @@ static void mem_cgroup_clear_mc(void)
4102 if (mc.precharge) { 4402 if (mc.precharge) {
4103 __mem_cgroup_cancel_charge(mc.to, mc.precharge); 4403 __mem_cgroup_cancel_charge(mc.to, mc.precharge);
4104 mc.precharge = 0; 4404 mc.precharge = 0;
4405 memcg_oom_recover(mc.to);
4105 } 4406 }
4106 /* 4407 /*
4107 * we didn't uncharge from mc.from at mem_cgroup_move_account(), so 4408 * we didn't uncharge from mc.from at mem_cgroup_move_account(), so
@@ -4110,6 +4411,7 @@ static void mem_cgroup_clear_mc(void)
4110 if (mc.moved_charge) { 4411 if (mc.moved_charge) {
4111 __mem_cgroup_cancel_charge(mc.from, mc.moved_charge); 4412 __mem_cgroup_cancel_charge(mc.from, mc.moved_charge);
4112 mc.moved_charge = 0; 4413 mc.moved_charge = 0;
4414 memcg_oom_recover(mc.from);
4113 } 4415 }
4114 /* we must fixup refcnts and charges */ 4416 /* we must fixup refcnts and charges */
4115 if (mc.moved_swap) { 4417 if (mc.moved_swap) {
@@ -4274,9 +4576,6 @@ static void mem_cgroup_move_charge(struct mm_struct *mm)
4274 }; 4576 };
4275 if (is_vm_hugetlb_page(vma)) 4577 if (is_vm_hugetlb_page(vma))
4276 continue; 4578 continue;
4277 /* TODO: We don't move charges of shmem/tmpfs pages for now. */
4278 if (vma->vm_flags & VM_SHARED)
4279 continue;
4280 ret = walk_page_range(vma->vm_start, vma->vm_end, 4579 ret = walk_page_range(vma->vm_start, vma->vm_end,
4281 &mem_cgroup_move_charge_walk); 4580 &mem_cgroup_move_charge_walk);
4282 if (ret) 4581 if (ret)
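
The new mc_handle_present_pte()/mc_handle_swap_pte()/mc_handle_file_pte() helpers extend charge moving from anonymous pages to file and shmem/tmpfs pages. From userspace this is selected through memory.move_charge_at_immigrate: bit 0 covers anonymous pages (and their swap), bit 1 the newly supported file pages. A rough sketch, assuming the same hypothetical /sys/fs/cgroup/memory/demo group as above:

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>
    #include <unistd.h>

    static int write_str(const char *path, const char *val)
    {
            FILE *f = fopen(path, "w");

            if (!f) {
                    perror(path);
                    return -1;
            }
            fputs(val, f);
            return fclose(f);
    }

    int main(int argc, char **argv)
    {
            const char *cg = "/sys/fs/cgroup/memory/demo";  /* assumed path */
            char path[256], pid[32];

            /* bit 0 = anon pages, bit 1 = file/shmem pages => 3 moves both */
            snprintf(path, sizeof(path), "%s/memory.move_charge_at_immigrate", cg);
            write_str(path, "3");

            /* attaching a task now moves its charges along with it */
            snprintf(path, sizeof(path), "%s/tasks", cg);
            snprintf(pid, sizeof(pid), "%d", argc > 1 ? atoi(argv[1]) : getpid());
            write_str(path, pid);
            return 0;
    }
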
diff --git a/mm/memory.c b/mm/memory.c
index 833952d8b74d..119b7ccdf39b 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1227,8 +1227,17 @@ int zap_vma_ptes(struct vm_area_struct *vma, unsigned long address,
1227} 1227}
1228EXPORT_SYMBOL_GPL(zap_vma_ptes); 1228EXPORT_SYMBOL_GPL(zap_vma_ptes);
1229 1229
1230/* 1230/**
1231 * Do a quick page-table lookup for a single page. 1231 * follow_page - look up a page descriptor from a user-virtual address
1232 * @vma: vm_area_struct mapping @address
1233 * @address: virtual address to look up
1234 * @flags: flags modifying lookup behaviour
1235 *
1236 * @flags can have FOLL_ flags set, defined in <linux/mm.h>
1237 *
1238 * Returns the mapped (struct page *), %NULL if no mapping exists, or
1239 * an error pointer if there is a mapping to something not represented
1240 * by a page descriptor (see also vm_normal_page()).
1232 */ 1241 */
1233struct page *follow_page(struct vm_area_struct *vma, unsigned long address, 1242struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
1234 unsigned int flags) 1243 unsigned int flags)
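
The new kernel-doc spells out the three possible outcomes of follow_page(). An illustrative caller (not part of this patch), assuming mmap_sem is already held for read, might handle them like this:

    #include <linux/err.h>
    #include <linux/mm.h>

    /* illustrative helper only; names and error choices are assumptions */
    static int probe_one_page(struct vm_area_struct *vma, unsigned long address)
    {
            struct page *page;

            page = follow_page(vma, address, FOLL_GET);
            if (!page)                      /* nothing mapped at address */
                    return -ENOENT;
            if (IS_ERR(page))               /* e.g. a special/PFN mapping */
                    return PTR_ERR(page);

            /* ... inspect the page ... */

            put_page(page);                 /* drop the FOLL_GET reference */
            return 0;
    }
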
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index be211a582930..a4cfcdc00455 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -415,12 +415,14 @@ int online_pages(unsigned long pfn, unsigned long nr_pages)
415 * This means the page allocator ignores this zone. 415 * This means the page allocator ignores this zone.
416 * So, zonelist must be updated after online. 416 * So, zonelist must be updated after online.
417 */ 417 */
418 mutex_lock(&zonelists_mutex);
418 if (!populated_zone(zone)) 419 if (!populated_zone(zone))
419 need_zonelists_rebuild = 1; 420 need_zonelists_rebuild = 1;
420 421
421 ret = walk_system_ram_range(pfn, nr_pages, &onlined_pages, 422 ret = walk_system_ram_range(pfn, nr_pages, &onlined_pages,
422 online_pages_range); 423 online_pages_range);
423 if (ret) { 424 if (ret) {
425 mutex_unlock(&zonelists_mutex);
424 printk(KERN_DEBUG "online_pages %lx at %lx failed\n", 426 printk(KERN_DEBUG "online_pages %lx at %lx failed\n",
425 nr_pages, pfn); 427 nr_pages, pfn);
426 memory_notify(MEM_CANCEL_ONLINE, &arg); 428 memory_notify(MEM_CANCEL_ONLINE, &arg);
@@ -429,8 +431,12 @@ int online_pages(unsigned long pfn, unsigned long nr_pages)
429 431
430 zone->present_pages += onlined_pages; 432 zone->present_pages += onlined_pages;
431 zone->zone_pgdat->node_present_pages += onlined_pages; 433 zone->zone_pgdat->node_present_pages += onlined_pages;
434 if (need_zonelists_rebuild)
435 build_all_zonelists(zone);
436 else
437 zone_pcp_update(zone);
432 438
433 zone_pcp_update(zone); 439 mutex_unlock(&zonelists_mutex);
434 setup_per_zone_wmarks(); 440 setup_per_zone_wmarks();
435 calculate_zone_inactive_ratio(zone); 441 calculate_zone_inactive_ratio(zone);
436 if (onlined_pages) { 442 if (onlined_pages) {
@@ -438,10 +444,7 @@ int online_pages(unsigned long pfn, unsigned long nr_pages)
438 node_set_state(zone_to_nid(zone), N_HIGH_MEMORY); 444 node_set_state(zone_to_nid(zone), N_HIGH_MEMORY);
439 } 445 }
440 446
441 if (need_zonelists_rebuild) 447 vm_total_pages = nr_free_pagecache_pages();
442 build_all_zonelists();
443 else
444 vm_total_pages = nr_free_pagecache_pages();
445 448
446 writeback_set_ratelimit(); 449 writeback_set_ratelimit();
447 450
@@ -482,6 +485,29 @@ static void rollback_node_hotadd(int nid, pg_data_t *pgdat)
482} 485}
483 486
484 487
488/*
 489 * Called by cpu_up() to online a node that has no onlined memory yet.
490 */
491int mem_online_node(int nid)
492{
493 pg_data_t *pgdat;
494 int ret;
495
496 lock_system_sleep();
497 pgdat = hotadd_new_pgdat(nid, 0);
 498 if (!pgdat) {
499 ret = -ENOMEM;
500 goto out;
501 }
502 node_set_online(nid);
503 ret = register_one_node(nid);
504 BUG_ON(ret);
505
506out:
507 unlock_system_sleep();
508 return ret;
509}
510
485/* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */ 511/* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */
486int __ref add_memory(int nid, u64 start, u64 size) 512int __ref add_memory(int nid, u64 start, u64 size)
487{ 513{
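
The online_pages() changes above (zonelists_mutex, rebuilding the zonelists or updating the per-cpu pagesets) and the new mem_online_node() are normally reached through the memory and CPU hotplug sysfs interfaces. A small sketch that onlines one memory block; the block number is an assumption and has to exist under /sys/devices/system/memory/:

    #include <stdio.h>

    int main(void)
    {
            /* block number 32 is only an example */
            const char *state = "/sys/devices/system/memory/memory32/state";
            FILE *f = fopen(state, "w");

            if (!f) {
                    perror(state);
                    return 1;
            }
            fputs("online", f);     /* ends up in online_pages() */
            return fclose(f) ? 1 : 0;
    }
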
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 08f40a2f3fe0..5d6fb339de03 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -119,7 +119,22 @@ struct mempolicy default_policy = {
119 119
120static const struct mempolicy_operations { 120static const struct mempolicy_operations {
121 int (*create)(struct mempolicy *pol, const nodemask_t *nodes); 121 int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
122 void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes); 122 /*
 123 * If the read-side task has no lock to protect task->mempolicy, the
 124 * write-side task rebinds task->mempolicy in two steps. The first step
 125 * sets all the newly allowed nodes, and the second step clears all the
 126 * now-disallowed nodes. This way a lockless reader never sees an empty
 127 * nodemask and can always find a node to allocate a page from.
 128 * If we have a lock to protect task->mempolicy on the read side, we
 129 * rebind directly.
130 *
131 * step:
132 * MPOL_REBIND_ONCE - do rebind work at once
133 * MPOL_REBIND_STEP1 - set all the newly nodes
134 * MPOL_REBIND_STEP2 - clean all the disallowed nodes
135 */
136 void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes,
137 enum mpol_rebind_step step);
123} mpol_ops[MPOL_MAX]; 138} mpol_ops[MPOL_MAX];
124 139
125/* Check that the nodemask contains at least one populated zone */ 140/* Check that the nodemask contains at least one populated zone */
@@ -127,9 +142,6 @@ static int is_valid_nodemask(const nodemask_t *nodemask)
127{ 142{
128 int nd, k; 143 int nd, k;
129 144
130 /* Check that there is something useful in this mask */
131 k = policy_zone;
132
133 for_each_node_mask(nd, *nodemask) { 145 for_each_node_mask(nd, *nodemask) {
134 struct zone *z; 146 struct zone *z;
135 147
@@ -145,7 +157,7 @@ static int is_valid_nodemask(const nodemask_t *nodemask)
145 157
146static inline int mpol_store_user_nodemask(const struct mempolicy *pol) 158static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
147{ 159{
148 return pol->flags & (MPOL_F_STATIC_NODES | MPOL_F_RELATIVE_NODES); 160 return pol->flags & MPOL_MODE_FLAGS;
149} 161}
150 162
151static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig, 163static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
@@ -277,12 +289,19 @@ void __mpol_put(struct mempolicy *p)
277 kmem_cache_free(policy_cache, p); 289 kmem_cache_free(policy_cache, p);
278} 290}
279 291
280static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes) 292static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes,
293 enum mpol_rebind_step step)
281{ 294{
282} 295}
283 296
284static void mpol_rebind_nodemask(struct mempolicy *pol, 297/*
285 const nodemask_t *nodes) 298 * step:
299 * MPOL_REBIND_ONCE - do rebind work at once
300 * MPOL_REBIND_STEP1 - set all the newly nodes
301 * MPOL_REBIND_STEP2 - clean all the disallowed nodes
302 */
303static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes,
304 enum mpol_rebind_step step)
286{ 305{
287 nodemask_t tmp; 306 nodemask_t tmp;
288 307
@@ -291,12 +310,31 @@ static void mpol_rebind_nodemask(struct mempolicy *pol,
291 else if (pol->flags & MPOL_F_RELATIVE_NODES) 310 else if (pol->flags & MPOL_F_RELATIVE_NODES)
292 mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes); 311 mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
293 else { 312 else {
294 nodes_remap(tmp, pol->v.nodes, pol->w.cpuset_mems_allowed, 313 /*
295 *nodes); 314 * if step == 1, we use ->w.cpuset_mems_allowed to cache the
296 pol->w.cpuset_mems_allowed = *nodes; 315 * result
316 */
317 if (step == MPOL_REBIND_ONCE || step == MPOL_REBIND_STEP1) {
318 nodes_remap(tmp, pol->v.nodes,
319 pol->w.cpuset_mems_allowed, *nodes);
320 pol->w.cpuset_mems_allowed = step ? tmp : *nodes;
321 } else if (step == MPOL_REBIND_STEP2) {
322 tmp = pol->w.cpuset_mems_allowed;
323 pol->w.cpuset_mems_allowed = *nodes;
324 } else
325 BUG();
297 } 326 }
298 327
299 pol->v.nodes = tmp; 328 if (nodes_empty(tmp))
329 tmp = *nodes;
330
331 if (step == MPOL_REBIND_STEP1)
332 nodes_or(pol->v.nodes, pol->v.nodes, tmp);
333 else if (step == MPOL_REBIND_ONCE || step == MPOL_REBIND_STEP2)
334 pol->v.nodes = tmp;
335 else
336 BUG();
337
300 if (!node_isset(current->il_next, tmp)) { 338 if (!node_isset(current->il_next, tmp)) {
301 current->il_next = next_node(current->il_next, tmp); 339 current->il_next = next_node(current->il_next, tmp);
302 if (current->il_next >= MAX_NUMNODES) 340 if (current->il_next >= MAX_NUMNODES)
@@ -307,7 +345,8 @@ static void mpol_rebind_nodemask(struct mempolicy *pol,
307} 345}
308 346
309static void mpol_rebind_preferred(struct mempolicy *pol, 347static void mpol_rebind_preferred(struct mempolicy *pol,
310 const nodemask_t *nodes) 348 const nodemask_t *nodes,
349 enum mpol_rebind_step step)
311{ 350{
312 nodemask_t tmp; 351 nodemask_t tmp;
313 352
@@ -330,16 +369,45 @@ static void mpol_rebind_preferred(struct mempolicy *pol,
330 } 369 }
331} 370}
332 371
333/* Migrate a policy to a different set of nodes */ 372/*
334static void mpol_rebind_policy(struct mempolicy *pol, 373 * mpol_rebind_policy - Migrate a policy to a different set of nodes
335 const nodemask_t *newmask) 374 *
 375 * If the read-side task has no lock to protect task->mempolicy, the
 376 * write-side task rebinds task->mempolicy in two steps. The first step
 377 * sets all the newly allowed nodes, and the second step clears all the
 378 * now-disallowed nodes. This way a lockless reader never sees an empty
 379 * nodemask and can always find a node to allocate a page from.
 380 * If we have a lock to protect task->mempolicy on the read side, we
 381 * rebind directly.
382 *
383 * step:
384 * MPOL_REBIND_ONCE - do rebind work at once
385 * MPOL_REBIND_STEP1 - set all the newly nodes
386 * MPOL_REBIND_STEP2 - clean all the disallowed nodes
387 */
388static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask,
389 enum mpol_rebind_step step)
336{ 390{
337 if (!pol) 391 if (!pol)
338 return; 392 return;
339 if (!mpol_store_user_nodemask(pol) && 393 if (!mpol_store_user_nodemask(pol) && step == 0 &&
340 nodes_equal(pol->w.cpuset_mems_allowed, *newmask)) 394 nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
341 return; 395 return;
342 mpol_ops[pol->mode].rebind(pol, newmask); 396
397 if (step == MPOL_REBIND_STEP1 && (pol->flags & MPOL_F_REBINDING))
398 return;
399
400 if (step == MPOL_REBIND_STEP2 && !(pol->flags & MPOL_F_REBINDING))
401 BUG();
402
403 if (step == MPOL_REBIND_STEP1)
404 pol->flags |= MPOL_F_REBINDING;
405 else if (step == MPOL_REBIND_STEP2)
406 pol->flags &= ~MPOL_F_REBINDING;
407 else if (step >= MPOL_REBIND_NSTEP)
408 BUG();
409
410 mpol_ops[pol->mode].rebind(pol, newmask, step);
343} 411}
344 412
345/* 413/*
@@ -349,9 +417,10 @@ static void mpol_rebind_policy(struct mempolicy *pol,
349 * Called with task's alloc_lock held. 417 * Called with task's alloc_lock held.
350 */ 418 */
351 419
352void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new) 420void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new,
421 enum mpol_rebind_step step)
353{ 422{
354 mpol_rebind_policy(tsk->mempolicy, new); 423 mpol_rebind_policy(tsk->mempolicy, new, step);
355} 424}
356 425
357/* 426/*
@@ -366,7 +435,7 @@ void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
366 435
367 down_write(&mm->mmap_sem); 436 down_write(&mm->mmap_sem);
368 for (vma = mm->mmap; vma; vma = vma->vm_next) 437 for (vma = mm->mmap; vma; vma = vma->vm_next)
369 mpol_rebind_policy(vma->vm_policy, new); 438 mpol_rebind_policy(vma->vm_policy, new, MPOL_REBIND_ONCE);
370 up_write(&mm->mmap_sem); 439 up_write(&mm->mmap_sem);
371} 440}
372 441
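
The effect of the two rebind steps described above can be seen with plain bitmasks standing in for nodemask_t (a toy model, not the kernel nodemask API): the union in step 1 guarantees the mask is never empty while a lockless reader races with the rebind, and step 2 then drops the nodes that are no longer allowed.

    #include <assert.h>
    #include <stdio.h>

    int main(void)
    {
            unsigned int pol_nodes = 0x3;   /* old cpuset: nodes 0-1 */
            unsigned int new_nodes = 0xc;   /* new cpuset: nodes 2-3 */

            /* MPOL_REBIND_STEP1: OR in the newly allowed nodes */
            pol_nodes |= new_nodes;
            assert(pol_nodes != 0);         /* a racing reader still sees nodes */

            /* MPOL_REBIND_STEP2: drop the now-disallowed nodes */
            pol_nodes &= new_nodes;
            assert(pol_nodes == new_nodes);

            printf("final node mask: 0x%x\n", pol_nodes);
            return 0;
    }
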
@@ -859,7 +928,7 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest,
859 nodes_clear(nmask); 928 nodes_clear(nmask);
860 node_set(source, nmask); 929 node_set(source, nmask);
861 930
862 check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nmask, 931 check_range(mm, mm->mmap->vm_start, mm->task_size, &nmask,
863 flags | MPOL_MF_DISCONTIG_OK, &pagelist); 932 flags | MPOL_MF_DISCONTIG_OK, &pagelist);
864 933
865 if (!list_empty(&pagelist)) 934 if (!list_empty(&pagelist))
@@ -1444,15 +1513,13 @@ static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy)
1444 /* 1513 /*
1445 * Normally, MPOL_BIND allocations are node-local within the 1514 * Normally, MPOL_BIND allocations are node-local within the
1446 * allowed nodemask. However, if __GFP_THISNODE is set and the 1515 * allowed nodemask. However, if __GFP_THISNODE is set and the
1447 * current node is part of the mask, we use the zonelist for 1516 * current node isn't part of the mask, we use the zonelist for
1448 * the first node in the mask instead. 1517 * the first node in the mask instead.
1449 */ 1518 */
1450 if (unlikely(gfp & __GFP_THISNODE) && 1519 if (unlikely(gfp & __GFP_THISNODE) &&
1451 unlikely(!node_isset(nd, policy->v.nodes))) 1520 unlikely(!node_isset(nd, policy->v.nodes)))
1452 nd = first_node(policy->v.nodes); 1521 nd = first_node(policy->v.nodes);
1453 break; 1522 break;
1454 case MPOL_INTERLEAVE: /* should not happen */
1455 break;
1456 default: 1523 default:
1457 BUG(); 1524 BUG();
1458 } 1525 }
@@ -1572,6 +1639,8 @@ static inline unsigned interleave_nid(struct mempolicy *pol,
1572 * to the struct mempolicy for conditional unref after allocation. 1639 * to the struct mempolicy for conditional unref after allocation.
1573 * If the effective policy is 'BIND, returns a pointer to the mempolicy's 1640 * If the effective policy is 'BIND, returns a pointer to the mempolicy's
1574 * @nodemask for filtering the zonelist. 1641 * @nodemask for filtering the zonelist.
1642 *
1643 * Must be protected by get_mems_allowed()
1575 */ 1644 */
1576struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr, 1645struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
1577 gfp_t gfp_flags, struct mempolicy **mpol, 1646 gfp_t gfp_flags, struct mempolicy **mpol,
@@ -1617,6 +1686,7 @@ bool init_nodemask_of_mempolicy(nodemask_t *mask)
1617 if (!(mask && current->mempolicy)) 1686 if (!(mask && current->mempolicy))
1618 return false; 1687 return false;
1619 1688
1689 task_lock(current);
1620 mempolicy = current->mempolicy; 1690 mempolicy = current->mempolicy;
1621 switch (mempolicy->mode) { 1691 switch (mempolicy->mode) {
1622 case MPOL_PREFERRED: 1692 case MPOL_PREFERRED:
@@ -1636,6 +1706,7 @@ bool init_nodemask_of_mempolicy(nodemask_t *mask)
1636 default: 1706 default:
1637 BUG(); 1707 BUG();
1638 } 1708 }
1709 task_unlock(current);
1639 1710
1640 return true; 1711 return true;
1641} 1712}
@@ -1683,13 +1754,17 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
1683{ 1754{
1684 struct mempolicy *pol = get_vma_policy(current, vma, addr); 1755 struct mempolicy *pol = get_vma_policy(current, vma, addr);
1685 struct zonelist *zl; 1756 struct zonelist *zl;
1757 struct page *page;
1686 1758
1759 get_mems_allowed();
1687 if (unlikely(pol->mode == MPOL_INTERLEAVE)) { 1760 if (unlikely(pol->mode == MPOL_INTERLEAVE)) {
1688 unsigned nid; 1761 unsigned nid;
1689 1762
1690 nid = interleave_nid(pol, vma, addr, PAGE_SHIFT); 1763 nid = interleave_nid(pol, vma, addr, PAGE_SHIFT);
1691 mpol_cond_put(pol); 1764 mpol_cond_put(pol);
1692 return alloc_page_interleave(gfp, 0, nid); 1765 page = alloc_page_interleave(gfp, 0, nid);
1766 put_mems_allowed();
1767 return page;
1693 } 1768 }
1694 zl = policy_zonelist(gfp, pol); 1769 zl = policy_zonelist(gfp, pol);
1695 if (unlikely(mpol_needs_cond_ref(pol))) { 1770 if (unlikely(mpol_needs_cond_ref(pol))) {
@@ -1699,12 +1774,15 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
1699 struct page *page = __alloc_pages_nodemask(gfp, 0, 1774 struct page *page = __alloc_pages_nodemask(gfp, 0,
1700 zl, policy_nodemask(gfp, pol)); 1775 zl, policy_nodemask(gfp, pol));
1701 __mpol_put(pol); 1776 __mpol_put(pol);
1777 put_mems_allowed();
1702 return page; 1778 return page;
1703 } 1779 }
1704 /* 1780 /*
1705 * fast path: default or task policy 1781 * fast path: default or task policy
1706 */ 1782 */
1707 return __alloc_pages_nodemask(gfp, 0, zl, policy_nodemask(gfp, pol)); 1783 page = __alloc_pages_nodemask(gfp, 0, zl, policy_nodemask(gfp, pol));
1784 put_mems_allowed();
1785 return page;
1708} 1786}
1709 1787
1710/** 1788/**
@@ -1729,18 +1807,23 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
1729struct page *alloc_pages_current(gfp_t gfp, unsigned order) 1807struct page *alloc_pages_current(gfp_t gfp, unsigned order)
1730{ 1808{
1731 struct mempolicy *pol = current->mempolicy; 1809 struct mempolicy *pol = current->mempolicy;
1810 struct page *page;
1732 1811
1733 if (!pol || in_interrupt() || (gfp & __GFP_THISNODE)) 1812 if (!pol || in_interrupt() || (gfp & __GFP_THISNODE))
1734 pol = &default_policy; 1813 pol = &default_policy;
1735 1814
1815 get_mems_allowed();
1736 /* 1816 /*
1737 * No reference counting needed for current->mempolicy 1817 * No reference counting needed for current->mempolicy
1738 * nor system default_policy 1818 * nor system default_policy
1739 */ 1819 */
1740 if (pol->mode == MPOL_INTERLEAVE) 1820 if (pol->mode == MPOL_INTERLEAVE)
1741 return alloc_page_interleave(gfp, order, interleave_nodes(pol)); 1821 page = alloc_page_interleave(gfp, order, interleave_nodes(pol));
1742 return __alloc_pages_nodemask(gfp, order, 1822 else
1823 page = __alloc_pages_nodemask(gfp, order,
1743 policy_zonelist(gfp, pol), policy_nodemask(gfp, pol)); 1824 policy_zonelist(gfp, pol), policy_nodemask(gfp, pol));
1825 put_mems_allowed();
1826 return page;
1744} 1827}
1745EXPORT_SYMBOL(alloc_pages_current); 1828EXPORT_SYMBOL(alloc_pages_current);
1746 1829
@@ -1750,6 +1833,9 @@ EXPORT_SYMBOL(alloc_pages_current);
1750 * with the mems_allowed returned by cpuset_mems_allowed(). This 1833 * with the mems_allowed returned by cpuset_mems_allowed(). This
1751 * keeps mempolicies cpuset relative after its cpuset moves. See 1834 * keeps mempolicies cpuset relative after its cpuset moves. See
1752 * further kernel/cpuset.c update_nodemask(). 1835 * further kernel/cpuset.c update_nodemask().
1836 *
1837 * current's mempolicy may be rebinded by the other task(the task that changes
1838 * cpuset's mems), so we needn't do rebind work for current task.
1753 */ 1839 */
1754 1840
1755/* Slow path of a mempolicy duplicate */ 1841/* Slow path of a mempolicy duplicate */
@@ -1759,13 +1845,24 @@ struct mempolicy *__mpol_dup(struct mempolicy *old)
1759 1845
1760 if (!new) 1846 if (!new)
1761 return ERR_PTR(-ENOMEM); 1847 return ERR_PTR(-ENOMEM);
1848
1849 /* task's mempolicy is protected by alloc_lock */
1850 if (old == current->mempolicy) {
1851 task_lock(current);
1852 *new = *old;
1853 task_unlock(current);
1854 } else
1855 *new = *old;
1856
1762 rcu_read_lock(); 1857 rcu_read_lock();
1763 if (current_cpuset_is_being_rebound()) { 1858 if (current_cpuset_is_being_rebound()) {
1764 nodemask_t mems = cpuset_mems_allowed(current); 1859 nodemask_t mems = cpuset_mems_allowed(current);
1765 mpol_rebind_policy(old, &mems); 1860 if (new->flags & MPOL_F_REBINDING)
1861 mpol_rebind_policy(new, &mems, MPOL_REBIND_STEP2);
1862 else
1863 mpol_rebind_policy(new, &mems, MPOL_REBIND_ONCE);
1766 } 1864 }
1767 rcu_read_unlock(); 1865 rcu_read_unlock();
1768 *new = *old;
1769 atomic_set(&new->refcnt, 1); 1866 atomic_set(&new->refcnt, 1);
1770 return new; 1867 return new;
1771} 1868}
@@ -1792,16 +1889,6 @@ struct mempolicy *__mpol_cond_copy(struct mempolicy *tompol,
1792 return tompol; 1889 return tompol;
1793} 1890}
1794 1891
1795static int mpol_match_intent(const struct mempolicy *a,
1796 const struct mempolicy *b)
1797{
1798 if (a->flags != b->flags)
1799 return 0;
1800 if (!mpol_store_user_nodemask(a))
1801 return 1;
1802 return nodes_equal(a->w.user_nodemask, b->w.user_nodemask);
1803}
1804
1805/* Slow path of a mempolicy comparison */ 1892/* Slow path of a mempolicy comparison */
1806int __mpol_equal(struct mempolicy *a, struct mempolicy *b) 1893int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
1807{ 1894{
@@ -1809,8 +1896,12 @@ int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
1809 return 0; 1896 return 0;
1810 if (a->mode != b->mode) 1897 if (a->mode != b->mode)
1811 return 0; 1898 return 0;
1812 if (a->mode != MPOL_DEFAULT && !mpol_match_intent(a, b)) 1899 if (a->flags != b->flags)
1813 return 0; 1900 return 0;
1901 if (mpol_store_user_nodemask(a))
1902 if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask))
1903 return 0;
1904
1814 switch (a->mode) { 1905 switch (a->mode) {
1815 case MPOL_BIND: 1906 case MPOL_BIND:
1816 /* Fall through */ 1907 /* Fall through */
@@ -2006,27 +2097,24 @@ void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
2006 return; 2097 return;
2007 /* contextualize the tmpfs mount point mempolicy */ 2098 /* contextualize the tmpfs mount point mempolicy */
2008 new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask); 2099 new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
2009 if (IS_ERR(new)) { 2100 if (IS_ERR(new))
2010 mpol_put(mpol); /* drop our ref on sb mpol */ 2101 goto free_scratch; /* no valid nodemask intersection */
2011 NODEMASK_SCRATCH_FREE(scratch);
2012 return; /* no valid nodemask intersection */
2013 }
2014 2102
2015 task_lock(current); 2103 task_lock(current);
2016 ret = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch); 2104 ret = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch);
2017 task_unlock(current); 2105 task_unlock(current);
2018 mpol_put(mpol); /* drop our ref on sb mpol */ 2106 mpol_put(mpol); /* drop our ref on sb mpol */
2019 if (ret) { 2107 if (ret)
2020 NODEMASK_SCRATCH_FREE(scratch); 2108 goto put_free;
2021 mpol_put(new);
2022 return;
2023 }
2024 2109
2025 /* Create pseudo-vma that contains just the policy */ 2110 /* Create pseudo-vma that contains just the policy */
2026 memset(&pvma, 0, sizeof(struct vm_area_struct)); 2111 memset(&pvma, 0, sizeof(struct vm_area_struct));
2027 pvma.vm_end = TASK_SIZE; /* policy covers entire file */ 2112 pvma.vm_end = TASK_SIZE; /* policy covers entire file */
2028 mpol_set_shared_policy(sp, &pvma, new); /* adds ref */ 2113 mpol_set_shared_policy(sp, &pvma, new); /* adds ref */
2114
2115put_free:
2029 mpol_put(new); /* drop initial ref */ 2116 mpol_put(new); /* drop initial ref */
2117free_scratch:
2030 NODEMASK_SCRATCH_FREE(scratch); 2118 NODEMASK_SCRATCH_FREE(scratch);
2031 } 2119 }
2032} 2120}
@@ -2132,9 +2220,15 @@ void numa_default_policy(void)
2132 * "local" is pseudo-policy: MPOL_PREFERRED with MPOL_F_LOCAL flag 2220 * "local" is pseudo-policy: MPOL_PREFERRED with MPOL_F_LOCAL flag
2133 * Used only for mpol_parse_str() and mpol_to_str() 2221 * Used only for mpol_parse_str() and mpol_to_str()
2134 */ 2222 */
2135#define MPOL_LOCAL (MPOL_INTERLEAVE + 1) 2223#define MPOL_LOCAL MPOL_MAX
2136static const char * const policy_types[] = 2224static const char * const policy_modes[] =
2137 { "default", "prefer", "bind", "interleave", "local" }; 2225{
2226 [MPOL_DEFAULT] = "default",
2227 [MPOL_PREFERRED] = "prefer",
2228 [MPOL_BIND] = "bind",
2229 [MPOL_INTERLEAVE] = "interleave",
2230 [MPOL_LOCAL] = "local"
2231};
2138 2232
2139 2233
2140#ifdef CONFIG_TMPFS 2234#ifdef CONFIG_TMPFS
@@ -2159,12 +2253,11 @@ static const char * const policy_types[] =
2159int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context) 2253int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
2160{ 2254{
2161 struct mempolicy *new = NULL; 2255 struct mempolicy *new = NULL;
2162 unsigned short uninitialized_var(mode); 2256 unsigned short mode;
2163 unsigned short uninitialized_var(mode_flags); 2257 unsigned short uninitialized_var(mode_flags);
2164 nodemask_t nodes; 2258 nodemask_t nodes;
2165 char *nodelist = strchr(str, ':'); 2259 char *nodelist = strchr(str, ':');
2166 char *flags = strchr(str, '='); 2260 char *flags = strchr(str, '=');
2167 int i;
2168 int err = 1; 2261 int err = 1;
2169 2262
2170 if (nodelist) { 2263 if (nodelist) {
@@ -2180,13 +2273,12 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
2180 if (flags) 2273 if (flags)
2181 *flags++ = '\0'; /* terminate mode string */ 2274 *flags++ = '\0'; /* terminate mode string */
2182 2275
2183 for (i = 0; i <= MPOL_LOCAL; i++) { 2276 for (mode = 0; mode <= MPOL_LOCAL; mode++) {
2184 if (!strcmp(str, policy_types[i])) { 2277 if (!strcmp(str, policy_modes[mode])) {
2185 mode = i;
2186 break; 2278 break;
2187 } 2279 }
2188 } 2280 }
2189 if (i > MPOL_LOCAL) 2281 if (mode > MPOL_LOCAL)
2190 goto out; 2282 goto out;
2191 2283
2192 switch (mode) { 2284 switch (mode) {
@@ -2250,7 +2342,10 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
2250 if (IS_ERR(new)) 2342 if (IS_ERR(new))
2251 goto out; 2343 goto out;
2252 2344
2253 { 2345 if (no_context) {
2346 /* save for contextualization */
2347 new->w.user_nodemask = nodes;
2348 } else {
2254 int ret; 2349 int ret;
2255 NODEMASK_SCRATCH(scratch); 2350 NODEMASK_SCRATCH(scratch);
2256 if (scratch) { 2351 if (scratch) {
@@ -2266,10 +2361,6 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
2266 } 2361 }
2267 } 2362 }
2268 err = 0; 2363 err = 0;
2269 if (no_context) {
2270 /* save for contextualization */
2271 new->w.user_nodemask = nodes;
2272 }
2273 2364
2274out: 2365out:
2275 /* Restore string for error message */ 2366 /* Restore string for error message */
@@ -2338,11 +2429,11 @@ int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol, int no_context)
2338 BUG(); 2429 BUG();
2339 } 2430 }
2340 2431
2341 l = strlen(policy_types[mode]); 2432 l = strlen(policy_modes[mode]);
2342 if (buffer + maxlen < p + l + 1) 2433 if (buffer + maxlen < p + l + 1)
2343 return -ENOSPC; 2434 return -ENOSPC;
2344 2435
2345 strcpy(p, policy_types[mode]); 2436 strcpy(p, policy_modes[mode]);
2346 p += l; 2437 p += l;
2347 2438
2348 if (flags & MPOL_MODE_FLAGS) { 2439 if (flags & MPOL_MODE_FLAGS) {
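
mpol_parse_str() is what consumes the tmpfs "mpol=" mount option, matching the mode string against the policy_modes[] table reworked above, in the form "<mode>[=<flags>][:<nodelist>]". A hedged example that mounts a tmpfs with an interleave policy over nodes 0-1 (mount point, size and node list are assumptions; needs CAP_SYS_ADMIN and a NUMA-enabled kernel):

    #include <stdio.h>
    #include <sys/mount.h>

    int main(void)
    {
            if (mount("tmpfs", "/mnt/interleaved", "tmpfs", 0,
                      "size=64m,mpol=interleave:0-1") != 0) {
                    perror("mount");
                    return 1;
            }
            return 0;
    }
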
diff --git a/mm/migrate.c b/mm/migrate.c
index d3f3f7f81075..4205b1d6049e 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -40,7 +40,8 @@
40 40
41/* 41/*
42 * migrate_prep() needs to be called before we start compiling a list of pages 42 * migrate_prep() needs to be called before we start compiling a list of pages
43 * to be migrated using isolate_lru_page(). 43 * to be migrated using isolate_lru_page(). If scheduling work on other CPUs is
 44 * undesirable, use migrate_prep_local().
44 */ 45 */
45int migrate_prep(void) 46int migrate_prep(void)
46{ 47{
@@ -55,26 +56,29 @@ int migrate_prep(void)
55 return 0; 56 return 0;
56} 57}
57 58
59/* Do the necessary work of migrate_prep but not if it involves other CPUs */
60int migrate_prep_local(void)
61{
62 lru_add_drain();
63
64 return 0;
65}
66
58/* 67/*
59 * Add isolated pages on the list back to the LRU under page lock 68 * Add isolated pages on the list back to the LRU under page lock
60 * to avoid leaking evictable pages back onto unevictable list. 69 * to avoid leaking evictable pages back onto unevictable list.
61 *
62 * returns the number of pages put back.
63 */ 70 */
64int putback_lru_pages(struct list_head *l) 71void putback_lru_pages(struct list_head *l)
65{ 72{
66 struct page *page; 73 struct page *page;
67 struct page *page2; 74 struct page *page2;
68 int count = 0;
69 75
70 list_for_each_entry_safe(page, page2, l, lru) { 76 list_for_each_entry_safe(page, page2, l, lru) {
71 list_del(&page->lru); 77 list_del(&page->lru);
72 dec_zone_page_state(page, NR_ISOLATED_ANON + 78 dec_zone_page_state(page, NR_ISOLATED_ANON +
73 page_is_file_cache(page)); 79 page_is_file_cache(page));
74 putback_lru_page(page); 80 putback_lru_page(page);
75 count++;
76 } 81 }
77 return count;
78} 82}
79 83
80/* 84/*
@@ -490,7 +494,8 @@ static int fallback_migrate_page(struct address_space *mapping,
490 * < 0 - error code 494 * < 0 - error code
491 * == 0 - success 495 * == 0 - success
492 */ 496 */
493static int move_to_new_page(struct page *newpage, struct page *page) 497static int move_to_new_page(struct page *newpage, struct page *page,
498 int remap_swapcache)
494{ 499{
495 struct address_space *mapping; 500 struct address_space *mapping;
496 int rc; 501 int rc;
@@ -525,10 +530,12 @@ static int move_to_new_page(struct page *newpage, struct page *page)
525 else 530 else
526 rc = fallback_migrate_page(mapping, newpage, page); 531 rc = fallback_migrate_page(mapping, newpage, page);
527 532
528 if (!rc) 533 if (rc) {
529 remove_migration_ptes(page, newpage);
530 else
531 newpage->mapping = NULL; 534 newpage->mapping = NULL;
535 } else {
536 if (remap_swapcache)
537 remove_migration_ptes(page, newpage);
538 }
532 539
533 unlock_page(newpage); 540 unlock_page(newpage);
534 541
@@ -545,9 +552,11 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
545 int rc = 0; 552 int rc = 0;
546 int *result = NULL; 553 int *result = NULL;
547 struct page *newpage = get_new_page(page, private, &result); 554 struct page *newpage = get_new_page(page, private, &result);
555 int remap_swapcache = 1;
548 int rcu_locked = 0; 556 int rcu_locked = 0;
549 int charge = 0; 557 int charge = 0;
550 struct mem_cgroup *mem = NULL; 558 struct mem_cgroup *mem = NULL;
559 struct anon_vma *anon_vma = NULL;
551 560
552 if (!newpage) 561 if (!newpage)
553 return -ENOMEM; 562 return -ENOMEM;
@@ -581,7 +590,7 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
581 } 590 }
582 591
583 /* charge against new page */ 592 /* charge against new page */
584 charge = mem_cgroup_prepare_migration(page, &mem); 593 charge = mem_cgroup_prepare_migration(page, newpage, &mem);
585 if (charge == -ENOMEM) { 594 if (charge == -ENOMEM) {
586 rc = -ENOMEM; 595 rc = -ENOMEM;
587 goto unlock; 596 goto unlock;
@@ -604,6 +613,34 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
604 if (PageAnon(page)) { 613 if (PageAnon(page)) {
605 rcu_read_lock(); 614 rcu_read_lock();
606 rcu_locked = 1; 615 rcu_locked = 1;
616
617 /* Determine how to safely use anon_vma */
618 if (!page_mapped(page)) {
619 if (!PageSwapCache(page))
620 goto rcu_unlock;
621
622 /*
623 * We cannot be sure that the anon_vma of an unmapped
624 * swapcache page is safe to use because we don't
625 * know in advance if the VMA that this page belonged
626 * to still exists. If the VMA and others sharing the
627 * data have been freed, then the anon_vma could
628 * already be invalid.
629 *
630 * To avoid this possibility, swapcache pages get
631 * migrated but are not remapped when migration
632 * completes
633 */
634 remap_swapcache = 0;
635 } else {
636 /*
637 * Take a reference count on the anon_vma if the
638 * page is mapped so that it is guaranteed to
639 * exist when the page is remapped later
640 */
641 anon_vma = page_anon_vma(page);
642 atomic_inc(&anon_vma->external_refcount);
643 }
607 } 644 }
608 645
609 /* 646 /*
@@ -638,11 +675,20 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
638 675
639skip_unmap: 676skip_unmap:
640 if (!page_mapped(page)) 677 if (!page_mapped(page))
641 rc = move_to_new_page(newpage, page); 678 rc = move_to_new_page(newpage, page, remap_swapcache);
642 679
643 if (rc) 680 if (rc && remap_swapcache)
644 remove_migration_ptes(page, page); 681 remove_migration_ptes(page, page);
645rcu_unlock: 682rcu_unlock:
683
684 /* Drop an anon_vma reference if we took one */
685 if (anon_vma && atomic_dec_and_lock(&anon_vma->external_refcount, &anon_vma->lock)) {
686 int empty = list_empty(&anon_vma->head);
687 spin_unlock(&anon_vma->lock);
688 if (empty)
689 anon_vma_free(anon_vma);
690 }
691
646 if (rcu_locked) 692 if (rcu_locked)
647 rcu_read_unlock(); 693 rcu_read_unlock();
648uncharge: 694uncharge:
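
unmap_and_move() above is the per-page engine behind, among other callers, the move_pages(2) system call. A small userspace sketch that migrates one of its own pages (assumes a NUMA machine where node 0 exists, libnuma's <numaif.h>, and linking with -lnuma):

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>
    #include <unistd.h>
    #include <numaif.h>

    int main(void)
    {
            long page_size = sysconf(_SC_PAGESIZE);
            void *buf;
            void *pages[1];
            int nodes[1] = { 0 };           /* target node; assumes node 0 */
            int status[1] = { -1 };

            if (posix_memalign(&buf, page_size, page_size))
                    return 1;
            memset(buf, 0, page_size);      /* fault the page in */

            pages[0] = buf;
            if (move_pages(0 /* self */, 1, pages, nodes, status,
                           MPOL_MF_MOVE) != 0) {
                    perror("move_pages");
                    return 1;
            }
            printf("page now on node %d\n", status[0]);
            return 0;
    }
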
diff --git a/mm/mincore.c b/mm/mincore.c
index f77433c20279..9ac42dc6d7b6 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -19,6 +19,40 @@
19#include <asm/uaccess.h> 19#include <asm/uaccess.h>
20#include <asm/pgtable.h> 20#include <asm/pgtable.h>
21 21
22static void mincore_hugetlb_page_range(struct vm_area_struct *vma,
23 unsigned long addr, unsigned long end,
24 unsigned char *vec)
25{
26#ifdef CONFIG_HUGETLB_PAGE
27 struct hstate *h;
28
29 h = hstate_vma(vma);
30 while (1) {
31 unsigned char present;
32 pte_t *ptep;
33 /*
 34 * Huge pages are always resident in RAM for now, but
 35 * in theory this still needs to be checked.
36 */
37 ptep = huge_pte_offset(current->mm,
38 addr & huge_page_mask(h));
39 present = ptep && !huge_pte_none(huge_ptep_get(ptep));
40 while (1) {
41 *vec = present;
42 vec++;
43 addr += PAGE_SIZE;
44 if (addr == end)
45 return;
46 /* check hugepage border */
47 if (!(addr & ~huge_page_mask(h)))
48 break;
49 }
50 }
51#else
52 BUG();
53#endif
54}
55
22/* 56/*
23 * Later we can get more picky about what "in core" means precisely. 57 * Later we can get more picky about what "in core" means precisely.
24 * For now, simply check to see if the page is in the page cache, 58 * For now, simply check to see if the page is in the page cache,
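
The hunks that follow replace the single do_mincore() body with a conventional page-table walk (pgd, pud, pmd, then pte), filling one byte of the user vector per page. A minimal userspace sketch of the interface being served (anonymous mapping; only the touched half should report as resident):

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>
    #include <unistd.h>
    #include <sys/mman.h>

    int main(void)
    {
            long page = sysconf(_SC_PAGESIZE);
            size_t len = 16 * page;
            unsigned char *vec = malloc(len / page);
            void *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
                             MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
            size_t i, resident = 0;

            if (!vec || buf == MAP_FAILED)
                    return 1;

            memset(buf, 0, len / 2);        /* touch only the first half */

            if (mincore(buf, len, vec) != 0) {
                    perror("mincore");
                    return 1;
            }
            for (i = 0; i < len / page; i++)
                    resident += vec[i] & 1;
            printf("%zu of %zu pages resident\n", resident, len / page);
            return 0;
    }
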
@@ -49,145 +83,150 @@ static unsigned char mincore_page(struct address_space *mapping, pgoff_t pgoff)
49 return present; 83 return present;
50} 84}
51 85
52/* 86static void mincore_unmapped_range(struct vm_area_struct *vma,
53 * Do a chunk of "sys_mincore()". We've already checked 87 unsigned long addr, unsigned long end,
54 * all the arguments, we hold the mmap semaphore: we should 88 unsigned char *vec)
55 * just return the amount of info we're asked for.
56 */
57static long do_mincore(unsigned long addr, unsigned char *vec, unsigned long pages)
58{ 89{
59 pgd_t *pgd; 90 unsigned long nr = (end - addr) >> PAGE_SHIFT;
60 pud_t *pud;
61 pmd_t *pmd;
62 pte_t *ptep;
63 spinlock_t *ptl;
64 unsigned long nr;
65 int i; 91 int i;
66 pgoff_t pgoff;
67 struct vm_area_struct *vma = find_vma(current->mm, addr);
68 92
69 /* 93 if (vma->vm_file) {
70 * find_vma() didn't find anything above us, or we're 94 pgoff_t pgoff;
71 * in an unmapped hole in the address space: ENOMEM.
72 */
73 if (!vma || addr < vma->vm_start)
74 return -ENOMEM;
75
76#ifdef CONFIG_HUGETLB_PAGE
77 if (is_vm_hugetlb_page(vma)) {
78 struct hstate *h;
79 unsigned long nr_huge;
80 unsigned char present;
81 95
82 i = 0; 96 pgoff = linear_page_index(vma, addr);
83 nr = min(pages, (vma->vm_end - addr) >> PAGE_SHIFT); 97 for (i = 0; i < nr; i++, pgoff++)
84 h = hstate_vma(vma); 98 vec[i] = mincore_page(vma->vm_file->f_mapping, pgoff);
85 nr_huge = ((addr + pages * PAGE_SIZE - 1) >> huge_page_shift(h)) 99 } else {
86 - (addr >> huge_page_shift(h)) + 1; 100 for (i = 0; i < nr; i++)
87 nr_huge = min(nr_huge, 101 vec[i] = 0;
88 (vma->vm_end - addr) >> huge_page_shift(h));
89 while (1) {
90 /* hugepage always in RAM for now,
91 * but generally it needs to be check */
92 ptep = huge_pte_offset(current->mm,
93 addr & huge_page_mask(h));
94 present = !!(ptep &&
95 !huge_pte_none(huge_ptep_get(ptep)));
96 while (1) {
97 vec[i++] = present;
98 addr += PAGE_SIZE;
99 /* reach buffer limit */
100 if (i == nr)
101 return nr;
102 /* check hugepage border */
103 if (!((addr & ~huge_page_mask(h))
104 >> PAGE_SHIFT))
105 break;
106 }
107 }
108 return nr;
109 } 102 }
110#endif 103}
111
112 /*
113 * Calculate how many pages there are left in the last level of the
114 * PTE array for our address.
115 */
116 nr = PTRS_PER_PTE - ((addr >> PAGE_SHIFT) & (PTRS_PER_PTE-1));
117
118 /*
119 * Don't overrun this vma
120 */
121 nr = min(nr, (vma->vm_end - addr) >> PAGE_SHIFT);
122
123 /*
124 * Don't return more than the caller asked for
125 */
126 nr = min(nr, pages);
127 104
128 pgd = pgd_offset(vma->vm_mm, addr); 105static void mincore_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
129 if (pgd_none_or_clear_bad(pgd)) 106 unsigned long addr, unsigned long end,
130 goto none_mapped; 107 unsigned char *vec)
131 pud = pud_offset(pgd, addr); 108{
132 if (pud_none_or_clear_bad(pud)) 109 unsigned long next;
133 goto none_mapped; 110 spinlock_t *ptl;
134 pmd = pmd_offset(pud, addr); 111 pte_t *ptep;
135 if (pmd_none_or_clear_bad(pmd))
136 goto none_mapped;
137 112
138 ptep = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); 113 ptep = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
139 for (i = 0; i < nr; i++, ptep++, addr += PAGE_SIZE) { 114 do {
140 unsigned char present;
141 pte_t pte = *ptep; 115 pte_t pte = *ptep;
116 pgoff_t pgoff;
142 117
143 if (pte_present(pte)) { 118 next = addr + PAGE_SIZE;
144 present = 1; 119 if (pte_none(pte))
145 120 mincore_unmapped_range(vma, addr, next, vec);
146 } else if (pte_none(pte)) { 121 else if (pte_present(pte))
147 if (vma->vm_file) { 122 *vec = 1;
148 pgoff = linear_page_index(vma, addr); 123 else if (pte_file(pte)) {
149 present = mincore_page(vma->vm_file->f_mapping,
150 pgoff);
151 } else
152 present = 0;
153
154 } else if (pte_file(pte)) {
155 pgoff = pte_to_pgoff(pte); 124 pgoff = pte_to_pgoff(pte);
156 present = mincore_page(vma->vm_file->f_mapping, pgoff); 125 *vec = mincore_page(vma->vm_file->f_mapping, pgoff);
157
158 } else { /* pte is a swap entry */ 126 } else { /* pte is a swap entry */
159 swp_entry_t entry = pte_to_swp_entry(pte); 127 swp_entry_t entry = pte_to_swp_entry(pte);
128
160 if (is_migration_entry(entry)) { 129 if (is_migration_entry(entry)) {
161 /* migration entries are always uptodate */ 130 /* migration entries are always uptodate */
162 present = 1; 131 *vec = 1;
163 } else { 132 } else {
164#ifdef CONFIG_SWAP 133#ifdef CONFIG_SWAP
165 pgoff = entry.val; 134 pgoff = entry.val;
166 present = mincore_page(&swapper_space, pgoff); 135 *vec = mincore_page(&swapper_space, pgoff);
167#else 136#else
168 WARN_ON(1); 137 WARN_ON(1);
169 present = 1; 138 *vec = 1;
170#endif 139#endif
171 } 140 }
172 } 141 }
142 vec++;
143 } while (ptep++, addr = next, addr != end);
144 pte_unmap_unlock(ptep - 1, ptl);
145}
173 146
174 vec[i] = present; 147static void mincore_pmd_range(struct vm_area_struct *vma, pud_t *pud,
175 } 148 unsigned long addr, unsigned long end,
176 pte_unmap_unlock(ptep-1, ptl); 149 unsigned char *vec)
150{
151 unsigned long next;
152 pmd_t *pmd;
177 153
178 return nr; 154 pmd = pmd_offset(pud, addr);
155 do {
156 next = pmd_addr_end(addr, end);
157 if (pmd_none_or_clear_bad(pmd))
158 mincore_unmapped_range(vma, addr, next, vec);
159 else
160 mincore_pte_range(vma, pmd, addr, next, vec);
161 vec += (next - addr) >> PAGE_SHIFT;
162 } while (pmd++, addr = next, addr != end);
163}
179 164
180none_mapped: 165static void mincore_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
181 if (vma->vm_file) { 166 unsigned long addr, unsigned long end,
182 pgoff = linear_page_index(vma, addr); 167 unsigned char *vec)
183 for (i = 0; i < nr; i++, pgoff++) 168{
184 vec[i] = mincore_page(vma->vm_file->f_mapping, pgoff); 169 unsigned long next;
185 } else { 170 pud_t *pud;
186 for (i = 0; i < nr; i++) 171
187 vec[i] = 0; 172 pud = pud_offset(pgd, addr);
173 do {
174 next = pud_addr_end(addr, end);
175 if (pud_none_or_clear_bad(pud))
176 mincore_unmapped_range(vma, addr, next, vec);
177 else
178 mincore_pmd_range(vma, pud, addr, next, vec);
179 vec += (next - addr) >> PAGE_SHIFT;
180 } while (pud++, addr = next, addr != end);
181}
182
183static void mincore_page_range(struct vm_area_struct *vma,
184 unsigned long addr, unsigned long end,
185 unsigned char *vec)
186{
187 unsigned long next;
188 pgd_t *pgd;
189
190 pgd = pgd_offset(vma->vm_mm, addr);
191 do {
192 next = pgd_addr_end(addr, end);
193 if (pgd_none_or_clear_bad(pgd))
194 mincore_unmapped_range(vma, addr, next, vec);
195 else
196 mincore_pud_range(vma, pgd, addr, next, vec);
197 vec += (next - addr) >> PAGE_SHIFT;
198 } while (pgd++, addr = next, addr != end);
199}
200
201/*
202 * Do a chunk of "sys_mincore()". We've already checked
203 * all the arguments, we hold the mmap semaphore: we should
204 * just return the amount of info we're asked for.
205 */
206static long do_mincore(unsigned long addr, unsigned long pages, unsigned char *vec)
207{
208 struct vm_area_struct *vma;
209 unsigned long end;
210
211 vma = find_vma(current->mm, addr);
212 if (!vma || addr < vma->vm_start)
213 return -ENOMEM;
214
215 end = min(vma->vm_end, addr + (pages << PAGE_SHIFT));
216
217 if (is_vm_hugetlb_page(vma)) {
218 mincore_hugetlb_page_range(vma, addr, end, vec);
219 return (end - addr) >> PAGE_SHIFT;
188 } 220 }
189 221
190 return nr; 222 end = pmd_addr_end(addr, end);
223
224 if (is_vm_hugetlb_page(vma))
225 mincore_hugetlb_page_range(vma, addr, end, vec);
226 else
227 mincore_page_range(vma, addr, end, vec);
228
229 return (end - addr) >> PAGE_SHIFT;
191} 230}
192 231
193/* 232/*
@@ -247,7 +286,7 @@ SYSCALL_DEFINE3(mincore, unsigned long, start, size_t, len,
247 * the temporary buffer size. 286 * the temporary buffer size.
248 */ 287 */
249 down_read(&current->mm->mmap_sem); 288 down_read(&current->mm->mmap_sem);
250 retval = do_mincore(start, tmp, min(pages, PAGE_SIZE)); 289 retval = do_mincore(start, min(pages, PAGE_SIZE), tmp);
251 up_read(&current->mm->mmap_sem); 290 up_read(&current->mm->mmap_sem);
252 291
253 if (retval <= 0) 292 if (retval <= 0)
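For reference, the interface being reworked above is the ordinary mincore(2) syscall; the new do_mincore() simply walks the page tables in the usual pgd/pud/pmd/pte order instead of special-casing everything in one loop. A minimal user-space sketch that exercises this path (the page count and the use of an anonymous mapping are illustrative only):

#define _DEFAULT_SOURCE
#include <stdio.h>
#include <unistd.h>
#include <sys/mman.h>

int main(void)
{
	long page = sysconf(_SC_PAGESIZE);
	size_t npages = 8;                      /* arbitrary test size */
	size_t len = npages * page;
	unsigned char vec[8];

	void *map = mmap(NULL, len, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (map == MAP_FAILED)
		return 1;

	/* Touch only the first page so something is actually resident. */
	((volatile char *)map)[0] = 1;

	/* This syscall is what ends up in do_mincore() above. */
	if (mincore(map, len, vec) == 0) {
		for (size_t i = 0; i < npages; i++)
			printf("page %zu: %s\n", i,
			       (vec[i] & 1) ? "in core" : "not in core");
	}

	munmap(map, len);
	return 0;
}

Touching only the first page should make vec[0] report residency, while the untouched pages are pte_none and fall through the new mincore_unmapped_range() path.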
diff --git a/mm/mlock.c b/mm/mlock.c
index 8f4e2dfceec1..3f82720e0515 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -607,44 +607,3 @@ void user_shm_unlock(size_t size, struct user_struct *user)
607 spin_unlock(&shmlock_user_lock); 607 spin_unlock(&shmlock_user_lock);
608 free_uid(user); 608 free_uid(user);
609} 609}
610
611int account_locked_memory(struct mm_struct *mm, struct rlimit *rlim,
612 size_t size)
613{
614 unsigned long lim, vm, pgsz;
615 int error = -ENOMEM;
616
617 pgsz = PAGE_ALIGN(size) >> PAGE_SHIFT;
618
619 down_write(&mm->mmap_sem);
620
621 lim = ACCESS_ONCE(rlim[RLIMIT_AS].rlim_cur) >> PAGE_SHIFT;
622 vm = mm->total_vm + pgsz;
623 if (lim < vm)
624 goto out;
625
626 lim = ACCESS_ONCE(rlim[RLIMIT_MEMLOCK].rlim_cur) >> PAGE_SHIFT;
627 vm = mm->locked_vm + pgsz;
628 if (lim < vm)
629 goto out;
630
631 mm->total_vm += pgsz;
632 mm->locked_vm += pgsz;
633
634 error = 0;
635 out:
636 up_write(&mm->mmap_sem);
637 return error;
638}
639
640void refund_locked_memory(struct mm_struct *mm, size_t size)
641{
642 unsigned long pgsz = PAGE_ALIGN(size) >> PAGE_SHIFT;
643
644 down_write(&mm->mmap_sem);
645
646 mm->total_vm -= pgsz;
647 mm->locked_vm -= pgsz;
648
649 up_write(&mm->mmap_sem);
650}
diff --git a/mm/msync.c b/mm/msync.c
index 4083209b7f02..632df4527c01 100644
--- a/mm/msync.c
+++ b/mm/msync.c
@@ -82,7 +82,7 @@ SYSCALL_DEFINE3(msync, unsigned long, start, size_t, len, int, flags)
82 (vma->vm_flags & VM_SHARED)) { 82 (vma->vm_flags & VM_SHARED)) {
83 get_file(file); 83 get_file(file);
84 up_read(&mm->mmap_sem); 84 up_read(&mm->mmap_sem);
85 error = vfs_fsync(file, file->f_path.dentry, 0); 85 error = vfs_fsync(file, 0);
86 fput(file); 86 fput(file);
87 if (error || start >= end) 87 if (error || start >= end)
88 goto out; 88 goto out;
diff --git a/mm/nommu.c b/mm/nommu.c
index 63fa17d121f0..b76f3ee0abe0 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -918,14 +918,6 @@ static int validate_mmap_request(struct file *file,
918 if (!(capabilities & BDI_CAP_MAP_DIRECT)) 918 if (!(capabilities & BDI_CAP_MAP_DIRECT))
919 return -ENODEV; 919 return -ENODEV;
920 920
921 if (((prot & PROT_READ) && !(capabilities & BDI_CAP_READ_MAP)) ||
922 ((prot & PROT_WRITE) && !(capabilities & BDI_CAP_WRITE_MAP)) ||
923 ((prot & PROT_EXEC) && !(capabilities & BDI_CAP_EXEC_MAP))
924 ) {
925 printk("MAP_SHARED not completely supported on !MMU\n");
926 return -EINVAL;
927 }
928
929 /* we mustn't privatise shared mappings */ 921 /* we mustn't privatise shared mappings */
930 capabilities &= ~BDI_CAP_MAP_COPY; 922 capabilities &= ~BDI_CAP_MAP_COPY;
931 } 923 }
@@ -941,6 +933,20 @@ static int validate_mmap_request(struct file *file,
941 capabilities &= ~BDI_CAP_MAP_DIRECT; 933 capabilities &= ~BDI_CAP_MAP_DIRECT;
942 } 934 }
943 935
936 if (capabilities & BDI_CAP_MAP_DIRECT) {
937 if (((prot & PROT_READ) && !(capabilities & BDI_CAP_READ_MAP)) ||
938 ((prot & PROT_WRITE) && !(capabilities & BDI_CAP_WRITE_MAP)) ||
939 ((prot & PROT_EXEC) && !(capabilities & BDI_CAP_EXEC_MAP))
940 ) {
941 capabilities &= ~BDI_CAP_MAP_DIRECT;
942 if (flags & MAP_SHARED) {
943 printk(KERN_WARNING
944 "MAP_SHARED not completely supported on !MMU\n");
945 return -EINVAL;
946 }
947 }
948 }
949
944 /* handle executable mappings and implied executable 950 /* handle executable mappings and implied executable
945 * mappings */ 951 * mappings */
946 if (file->f_path.mnt->mnt_flags & MNT_NOEXEC) { 952 if (file->f_path.mnt->mnt_flags & MNT_NOEXEC) {
@@ -996,22 +1002,20 @@ static unsigned long determine_vm_flags(struct file *file,
996 unsigned long vm_flags; 1002 unsigned long vm_flags;
997 1003
998 vm_flags = calc_vm_prot_bits(prot) | calc_vm_flag_bits(flags); 1004 vm_flags = calc_vm_prot_bits(prot) | calc_vm_flag_bits(flags);
999 vm_flags |= VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
1000 /* vm_flags |= mm->def_flags; */ 1005 /* vm_flags |= mm->def_flags; */
1001 1006
1002 if (!(capabilities & BDI_CAP_MAP_DIRECT)) { 1007 if (!(capabilities & BDI_CAP_MAP_DIRECT)) {
1003 /* attempt to share read-only copies of mapped file chunks */ 1008 /* attempt to share read-only copies of mapped file chunks */
1009 vm_flags |= VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
1004 if (file && !(prot & PROT_WRITE)) 1010 if (file && !(prot & PROT_WRITE))
1005 vm_flags |= VM_MAYSHARE; 1011 vm_flags |= VM_MAYSHARE;
1006 } 1012 } else {
1007 else {
1008 /* overlay a shareable mapping on the backing device or inode 1013 /* overlay a shareable mapping on the backing device or inode
1009 * if possible - used for chardevs, ramfs/tmpfs/shmfs and 1014 * if possible - used for chardevs, ramfs/tmpfs/shmfs and
1010 * romfs/cramfs */ 1015 * romfs/cramfs */
1016 vm_flags |= VM_MAYSHARE | (capabilities & BDI_CAP_VMFLAGS);
1011 if (flags & MAP_SHARED) 1017 if (flags & MAP_SHARED)
1012 vm_flags |= VM_MAYSHARE | VM_SHARED; 1018 vm_flags |= VM_SHARED;
1013 else if ((((vm_flags & capabilities) ^ vm_flags) & BDI_CAP_VMFLAGS) == 0)
1014 vm_flags |= VM_MAYSHARE;
1015 } 1019 }
1016 1020
1017 /* refuse to let anyone share private mappings with this process if 1021 /* refuse to let anyone share private mappings with this process if
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index b68e802a7a7d..709aedfaa014 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -479,12 +479,9 @@ void mem_cgroup_out_of_memory(struct mem_cgroup *mem, gfp_t gfp_mask)
479 read_lock(&tasklist_lock); 479 read_lock(&tasklist_lock);
480retry: 480retry:
481 p = select_bad_process(&points, mem); 481 p = select_bad_process(&points, mem);
482 if (PTR_ERR(p) == -1UL) 482 if (!p || PTR_ERR(p) == -1UL)
483 goto out; 483 goto out;
484 484
485 if (!p)
486 p = current;
487
488 if (oom_kill_process(p, gfp_mask, 0, points, mem, 485 if (oom_kill_process(p, gfp_mask, 0, points, mem,
489 "Memory cgroup out of memory")) 486 "Memory cgroup out of memory"))
490 goto retry; 487 goto retry;
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 0b19943ecf8b..b289310e2c89 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -597,7 +597,7 @@ static void balance_dirty_pages(struct address_space *mapping,
597 (!laptop_mode && ((global_page_state(NR_FILE_DIRTY) 597 (!laptop_mode && ((global_page_state(NR_FILE_DIRTY)
598 + global_page_state(NR_UNSTABLE_NFS)) 598 + global_page_state(NR_UNSTABLE_NFS))
599 > background_thresh))) 599 > background_thresh)))
600 bdi_start_writeback(bdi, NULL, 0); 600 bdi_start_writeback(bdi, NULL, 0, 0);
601} 601}
602 602
603void set_page_dirty_balance(struct page *page, int page_mkwrite) 603void set_page_dirty_balance(struct page *page, int page_mkwrite)
@@ -683,10 +683,6 @@ void throttle_vm_writeout(gfp_t gfp_mask)
683 } 683 }
684} 684}
685 685
686static void laptop_timer_fn(unsigned long unused);
687
688static DEFINE_TIMER(laptop_mode_wb_timer, laptop_timer_fn, 0, 0);
689
690/* 686/*
691 * sysctl handler for /proc/sys/vm/dirty_writeback_centisecs 687 * sysctl handler for /proc/sys/vm/dirty_writeback_centisecs
692 */ 688 */
@@ -694,24 +690,24 @@ int dirty_writeback_centisecs_handler(ctl_table *table, int write,
694 void __user *buffer, size_t *length, loff_t *ppos) 690 void __user *buffer, size_t *length, loff_t *ppos)
695{ 691{
696 proc_dointvec(table, write, buffer, length, ppos); 692 proc_dointvec(table, write, buffer, length, ppos);
693 bdi_arm_supers_timer();
697 return 0; 694 return 0;
698} 695}
699 696
700static void do_laptop_sync(struct work_struct *work) 697#ifdef CONFIG_BLOCK
698void laptop_mode_timer_fn(unsigned long data)
701{ 699{
702 wakeup_flusher_threads(0); 700 struct request_queue *q = (struct request_queue *)data;
703 kfree(work); 701 int nr_pages = global_page_state(NR_FILE_DIRTY) +
704} 702 global_page_state(NR_UNSTABLE_NFS);
705 703
706static void laptop_timer_fn(unsigned long unused) 704 /*
707{ 705 * We want to write everything out, not just down to the dirty
708 struct work_struct *work; 706 * threshold
707 */
709 708
710 work = kmalloc(sizeof(*work), GFP_ATOMIC); 709 if (bdi_has_dirty_io(&q->backing_dev_info))
711 if (work) { 710 bdi_start_writeback(&q->backing_dev_info, NULL, nr_pages, 0);
712 INIT_WORK(work, do_laptop_sync);
713 schedule_work(work);
714 }
715} 711}
716 712
717/* 713/*
@@ -719,9 +715,9 @@ static void laptop_timer_fn(unsigned long unused)
719 * of all dirty data a few seconds from now. If the flush is already scheduled 715 * of all dirty data a few seconds from now. If the flush is already scheduled
720 * then push it back - the user is still using the disk. 716 * then push it back - the user is still using the disk.
721 */ 717 */
722void laptop_io_completion(void) 718void laptop_io_completion(struct backing_dev_info *info)
723{ 719{
724 mod_timer(&laptop_mode_wb_timer, jiffies + laptop_mode); 720 mod_timer(&info->laptop_mode_wb_timer, jiffies + laptop_mode);
725} 721}
726 722
727/* 723/*
@@ -731,8 +727,16 @@ void laptop_io_completion(void)
731 */ 727 */
732void laptop_sync_completion(void) 728void laptop_sync_completion(void)
733{ 729{
734 del_timer(&laptop_mode_wb_timer); 730 struct backing_dev_info *bdi;
731
732 rcu_read_lock();
733
734 list_for_each_entry_rcu(bdi, &bdi_list, bdi_list)
735 del_timer(&bdi->laptop_mode_wb_timer);
736
737 rcu_read_unlock();
735} 738}
739#endif
736 740
737/* 741/*
738 * If ratelimit_pages is too high then we can get into dirty-data overload 742 * If ratelimit_pages is too high then we can get into dirty-data overload
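With the timer moved into struct backing_dev_info, each device now arms its own laptop-mode writeback timer. A hedged sketch of the completion-side call (the block layer is the real caller; the function below is hypothetical):

#include <linux/blkdev.h>
#include <linux/writeback.h>

/* Hypothetical completion hook: kick laptop-mode writeback for this queue. */
static void my_io_done(struct request_queue *q)
{
	if (laptop_mode)
		laptop_io_completion(&q->backing_dev_info);
}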
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index d03c946d5566..431214b941ac 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -49,6 +49,7 @@
49#include <linux/debugobjects.h> 49#include <linux/debugobjects.h>
50#include <linux/kmemleak.h> 50#include <linux/kmemleak.h>
51#include <linux/memory.h> 51#include <linux/memory.h>
52#include <linux/compaction.h>
52#include <trace/events/kmem.h> 53#include <trace/events/kmem.h>
53#include <linux/ftrace_event.h> 54#include <linux/ftrace_event.h>
54 55
@@ -56,6 +57,22 @@
56#include <asm/div64.h> 57#include <asm/div64.h>
57#include "internal.h" 58#include "internal.h"
58 59
60#ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID
61DEFINE_PER_CPU(int, numa_node);
62EXPORT_PER_CPU_SYMBOL(numa_node);
63#endif
64
65#ifdef CONFIG_HAVE_MEMORYLESS_NODES
66/*
67 * N.B., Do NOT reference the '_numa_mem_' per cpu variable directly.
68 * It will not be defined when CONFIG_HAVE_MEMORYLESS_NODES is not defined.
69 * Use the accessor functions set_numa_mem(), numa_mem_id() and cpu_to_mem()
70 * defined in <linux/topology.h>.
71 */
72DEFINE_PER_CPU(int, _numa_mem_); /* Kernel "local memory" node */
73EXPORT_PER_CPU_SYMBOL(_numa_mem_);
74#endif
75
59/* 76/*
60 * Array of node states. 77 * Array of node states.
61 */ 78 */
@@ -475,6 +492,8 @@ static inline void __free_one_page(struct page *page,
475 int migratetype) 492 int migratetype)
476{ 493{
477 unsigned long page_idx; 494 unsigned long page_idx;
495 unsigned long combined_idx;
496 struct page *buddy;
478 497
479 if (unlikely(PageCompound(page))) 498 if (unlikely(PageCompound(page)))
480 if (unlikely(destroy_compound_page(page, order))) 499 if (unlikely(destroy_compound_page(page, order)))
@@ -488,9 +507,6 @@ static inline void __free_one_page(struct page *page,
488 VM_BUG_ON(bad_range(zone, page)); 507 VM_BUG_ON(bad_range(zone, page));
489 508
490 while (order < MAX_ORDER-1) { 509 while (order < MAX_ORDER-1) {
491 unsigned long combined_idx;
492 struct page *buddy;
493
494 buddy = __page_find_buddy(page, page_idx, order); 510 buddy = __page_find_buddy(page, page_idx, order);
495 if (!page_is_buddy(page, buddy, order)) 511 if (!page_is_buddy(page, buddy, order))
496 break; 512 break;
@@ -505,8 +521,29 @@ static inline void __free_one_page(struct page *page,
505 order++; 521 order++;
506 } 522 }
507 set_page_order(page, order); 523 set_page_order(page, order);
508 list_add(&page->lru, 524
509 &zone->free_area[order].free_list[migratetype]); 525 /*
526 * If this is not the largest possible page, check if the buddy
527 * of the next-highest order is free. If it is, it's possible
 528 * that pages are being freed that will coalesce soon. In case
529 * that is happening, add the free page to the tail of the list
530 * so it's less likely to be used soon and more likely to be merged
531 * as a higher order page
532 */
533 if ((order < MAX_ORDER-1) && pfn_valid_within(page_to_pfn(buddy))) {
534 struct page *higher_page, *higher_buddy;
535 combined_idx = __find_combined_index(page_idx, order);
536 higher_page = page + combined_idx - page_idx;
537 higher_buddy = __page_find_buddy(higher_page, combined_idx, order + 1);
538 if (page_is_buddy(higher_page, higher_buddy, order + 1)) {
539 list_add_tail(&page->lru,
540 &zone->free_area[order].free_list[migratetype]);
541 goto out;
542 }
543 }
544
545 list_add(&page->lru, &zone->free_area[order].free_list[migratetype]);
546out:
510 zone->free_area[order].nr_free++; 547 zone->free_area[order].nr_free++;
511} 548}
512 549
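The tail-placement heuristic added above leans on the buddy allocator's index arithmetic: a block's buddy at order o sits at page_idx XOR (1 << o), and the merged order o+1 block starts at page_idx AND ~(1 << o). A small stand-alone illustration of that arithmetic (the helper names are made up here; they mirror what __page_find_buddy() and __find_combined_index() compute):

#include <stdio.h>

/* Index of the buddy of the block starting at page_idx, at the given order. */
static unsigned long buddy_index(unsigned long page_idx, unsigned int order)
{
	return page_idx ^ (1UL << order);
}

/* Start of the order+1 block formed by merging page_idx with its buddy. */
static unsigned long combined_index(unsigned long page_idx, unsigned int order)
{
	return page_idx & ~(1UL << order);
}

int main(void)
{
	unsigned long idx = 12;         /* block of 4 pages at index 12 */
	unsigned int order = 2;

	printf("buddy of %lu at order %u: %lu\n",
	       idx, order, buddy_index(idx, order));       /* prints 8 */
	printf("combined block starts at: %lu\n",
	       combined_index(idx, order));                 /* prints 8 */
	return 0;
}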
@@ -599,20 +636,23 @@ static void free_one_page(struct zone *zone, struct page *page, int order,
599 spin_unlock(&zone->lock); 636 spin_unlock(&zone->lock);
600} 637}
601 638
602static void __free_pages_ok(struct page *page, unsigned int order) 639static bool free_pages_prepare(struct page *page, unsigned int order)
603{ 640{
604 unsigned long flags;
605 int i; 641 int i;
606 int bad = 0; 642 int bad = 0;
607 int wasMlocked = __TestClearPageMlocked(page);
608 643
609 trace_mm_page_free_direct(page, order); 644 trace_mm_page_free_direct(page, order);
610 kmemcheck_free_shadow(page, order); 645 kmemcheck_free_shadow(page, order);
611 646
612 for (i = 0 ; i < (1 << order) ; ++i) 647 for (i = 0; i < (1 << order); i++) {
613 bad += free_pages_check(page + i); 648 struct page *pg = page + i;
649
650 if (PageAnon(pg))
651 pg->mapping = NULL;
652 bad += free_pages_check(pg);
653 }
614 if (bad) 654 if (bad)
615 return; 655 return false;
616 656
617 if (!PageHighMem(page)) { 657 if (!PageHighMem(page)) {
618 debug_check_no_locks_freed(page_address(page),PAGE_SIZE<<order); 658 debug_check_no_locks_freed(page_address(page),PAGE_SIZE<<order);
@@ -622,6 +662,17 @@ static void __free_pages_ok(struct page *page, unsigned int order)
622 arch_free_page(page, order); 662 arch_free_page(page, order);
623 kernel_map_pages(page, 1 << order, 0); 663 kernel_map_pages(page, 1 << order, 0);
624 664
665 return true;
666}
667
668static void __free_pages_ok(struct page *page, unsigned int order)
669{
670 unsigned long flags;
671 int wasMlocked = __TestClearPageMlocked(page);
672
673 if (!free_pages_prepare(page, order))
674 return;
675
625 local_irq_save(flags); 676 local_irq_save(flags);
626 if (unlikely(wasMlocked)) 677 if (unlikely(wasMlocked))
627 free_page_mlock(page); 678 free_page_mlock(page);
@@ -1107,21 +1158,9 @@ void free_hot_cold_page(struct page *page, int cold)
1107 int migratetype; 1158 int migratetype;
1108 int wasMlocked = __TestClearPageMlocked(page); 1159 int wasMlocked = __TestClearPageMlocked(page);
1109 1160
1110 trace_mm_page_free_direct(page, 0); 1161 if (!free_pages_prepare(page, 0))
1111 kmemcheck_free_shadow(page, 0);
1112
1113 if (PageAnon(page))
1114 page->mapping = NULL;
1115 if (free_pages_check(page))
1116 return; 1162 return;
1117 1163
1118 if (!PageHighMem(page)) {
1119 debug_check_no_locks_freed(page_address(page), PAGE_SIZE);
1120 debug_check_no_obj_freed(page_address(page), PAGE_SIZE);
1121 }
1122 arch_free_page(page, 0);
1123 kernel_map_pages(page, 1, 0);
1124
1125 migratetype = get_pageblock_migratetype(page); 1164 migratetype = get_pageblock_migratetype(page);
1126 set_page_private(page, migratetype); 1165 set_page_private(page, migratetype);
1127 local_irq_save(flags); 1166 local_irq_save(flags);
@@ -1188,6 +1227,51 @@ void split_page(struct page *page, unsigned int order)
1188} 1227}
1189 1228
1190/* 1229/*
1230 * Similar to split_page except the page is already free. As this is only
1231 * being used for migration, the migratetype of the block also changes.
1232 * As this is called with interrupts disabled, the caller is responsible
1233 * for calling arch_alloc_page() and kernel_map_page() after interrupts
1234 * are enabled.
1235 *
1236 * Note: this is probably too low level an operation for use in drivers.
1237 * Please consult with lkml before using this in your driver.
1238 */
1239int split_free_page(struct page *page)
1240{
1241 unsigned int order;
1242 unsigned long watermark;
1243 struct zone *zone;
1244
1245 BUG_ON(!PageBuddy(page));
1246
1247 zone = page_zone(page);
1248 order = page_order(page);
1249
1250 /* Obey watermarks as if the page was being allocated */
1251 watermark = low_wmark_pages(zone) + (1 << order);
1252 if (!zone_watermark_ok(zone, 0, watermark, 0, 0))
1253 return 0;
1254
1255 /* Remove page from free list */
1256 list_del(&page->lru);
1257 zone->free_area[order].nr_free--;
1258 rmv_page_order(page);
1259 __mod_zone_page_state(zone, NR_FREE_PAGES, -(1UL << order));
1260
1261 /* Split into individual pages */
1262 set_page_refcounted(page);
1263 split_page(page, order);
1264
1265 if (order >= pageblock_order - 1) {
1266 struct page *endpage = page + (1 << order) - 1;
1267 for (; page < endpage; page += pageblock_nr_pages)
1268 set_pageblock_migratetype(page, MIGRATE_MOVABLE);
1269 }
1270
1271 return 1 << order;
1272}
1273
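split_free_page() returns either 0 (the watermark check forbids taking the block) or the number of order-0 pages the block was split into. A hedged sketch of a consumer, written as a hypothetical helper rather than the actual mm/compaction.c caller; it assumes zone->lock is held and the page has already been verified as PageBuddy:

#include <linux/mm.h>
#include <linux/list.h>

/* Hypothetical helper, for illustration only. */
static unsigned long take_free_block(struct page *page,
				     struct list_head *freelist)
{
	unsigned long i, nr;

	nr = split_free_page(page);	/* 0 if watermarks would be broken */
	for (i = 0; i < nr; i++) {
		list_add(&page->lru, freelist);
		page++;
	}
	return nr;
}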
1274/*
1191 * Really, prep_compound_page() should be called from __rmqueue_bulk(). But 1275 * Really, prep_compound_page() should be called from __rmqueue_bulk(). But
1192 * we cheat by calling it from here, in the order > 0 path. Saves a branch 1276 * we cheat by calling it from here, in the order > 0 path. Saves a branch
1193 * or two. 1277 * or two.
@@ -1693,6 +1777,62 @@ out:
1693 return page; 1777 return page;
1694} 1778}
1695 1779
1780#ifdef CONFIG_COMPACTION
1781/* Try memory compaction for high-order allocations before reclaim */
1782static struct page *
1783__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
1784 struct zonelist *zonelist, enum zone_type high_zoneidx,
1785 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
1786 int migratetype, unsigned long *did_some_progress)
1787{
1788 struct page *page;
1789
1790 if (!order || compaction_deferred(preferred_zone))
1791 return NULL;
1792
1793 *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask,
1794 nodemask);
1795 if (*did_some_progress != COMPACT_SKIPPED) {
1796
1797 /* Page migration frees to the PCP lists but we want merging */
1798 drain_pages(get_cpu());
1799 put_cpu();
1800
1801 page = get_page_from_freelist(gfp_mask, nodemask,
1802 order, zonelist, high_zoneidx,
1803 alloc_flags, preferred_zone,
1804 migratetype);
1805 if (page) {
1806 preferred_zone->compact_considered = 0;
1807 preferred_zone->compact_defer_shift = 0;
1808 count_vm_event(COMPACTSUCCESS);
1809 return page;
1810 }
1811
1812 /*
 1813 * It's bad if a compaction run occurs and fails.
1814 * The most likely reason is that pages exist,
1815 * but not enough to satisfy watermarks.
1816 */
1817 count_vm_event(COMPACTFAIL);
1818 defer_compaction(preferred_zone);
1819
1820 cond_resched();
1821 }
1822
1823 return NULL;
1824}
1825#else
1826static inline struct page *
1827__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
1828 struct zonelist *zonelist, enum zone_type high_zoneidx,
1829 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
1830 int migratetype, unsigned long *did_some_progress)
1831{
1832 return NULL;
1833}
1834#endif /* CONFIG_COMPACTION */
1835
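compaction_deferred() and defer_compaction() are not part of this hunk, but the fields reset on success above (compact_considered, compact_defer_shift) imply an exponential backoff: every failed compaction widens the window of allocation attempts during which further compaction is skipped. A sketch of that logic, with the cap value treated as an assumption:

#include <linux/mmzone.h>

#define COMPACT_MAX_DEFER_SHIFT_SKETCH	6	/* assumed cap on the backoff */

/* Called after a compaction run fails to produce a usable page. */
static void defer_compaction_sketch(struct zone *zone)
{
	zone->compact_considered = 0;
	if (++zone->compact_defer_shift > COMPACT_MAX_DEFER_SHIFT_SKETCH)
		zone->compact_defer_shift = COMPACT_MAX_DEFER_SHIFT_SKETCH;
}

/* Returns true while compaction should still be skipped for this zone. */
static bool compaction_deferred_sketch(struct zone *zone)
{
	unsigned long defer_limit = 1UL << zone->compact_defer_shift;

	if (++zone->compact_considered > defer_limit)
		zone->compact_considered = defer_limit;	/* avoid overflow */

	return zone->compact_considered < defer_limit;
}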
1696/* The really slow allocator path where we enter direct reclaim */ 1836/* The really slow allocator path where we enter direct reclaim */
1697static inline struct page * 1837static inline struct page *
1698__alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, 1838__alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
@@ -1879,6 +2019,15 @@ rebalance:
1879 if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL)) 2019 if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL))
1880 goto nopage; 2020 goto nopage;
1881 2021
2022 /* Try direct compaction */
2023 page = __alloc_pages_direct_compact(gfp_mask, order,
2024 zonelist, high_zoneidx,
2025 nodemask,
2026 alloc_flags, preferred_zone,
2027 migratetype, &did_some_progress);
2028 if (page)
2029 goto got_pg;
2030
1882 /* Try direct reclaim and then allocating */ 2031 /* Try direct reclaim and then allocating */
1883 page = __alloc_pages_direct_reclaim(gfp_mask, order, 2032 page = __alloc_pages_direct_reclaim(gfp_mask, order,
1884 zonelist, high_zoneidx, 2033 zonelist, high_zoneidx,
@@ -1970,10 +2119,13 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
1970 if (unlikely(!zonelist->_zonerefs->zone)) 2119 if (unlikely(!zonelist->_zonerefs->zone))
1971 return NULL; 2120 return NULL;
1972 2121
2122 get_mems_allowed();
1973 /* The preferred zone is used for statistics later */ 2123 /* The preferred zone is used for statistics later */
1974 first_zones_zonelist(zonelist, high_zoneidx, nodemask, &preferred_zone); 2124 first_zones_zonelist(zonelist, high_zoneidx, nodemask, &preferred_zone);
1975 if (!preferred_zone) 2125 if (!preferred_zone) {
2126 put_mems_allowed();
1976 return NULL; 2127 return NULL;
2128 }
1977 2129
1978 /* First allocation attempt */ 2130 /* First allocation attempt */
1979 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, 2131 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
@@ -1983,6 +2135,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
1983 page = __alloc_pages_slowpath(gfp_mask, order, 2135 page = __alloc_pages_slowpath(gfp_mask, order,
1984 zonelist, high_zoneidx, nodemask, 2136 zonelist, high_zoneidx, nodemask,
1985 preferred_zone, migratetype); 2137 preferred_zone, migratetype);
2138 put_mems_allowed();
1986 2139
1987 trace_mm_page_alloc(page, order, gfp_mask, migratetype); 2140 trace_mm_page_alloc(page, order, gfp_mask, migratetype);
1988 return page; 2141 return page;
@@ -2434,8 +2587,11 @@ int numa_zonelist_order_handler(ctl_table *table, int write,
2434 strncpy((char*)table->data, saved_string, 2587 strncpy((char*)table->data, saved_string,
2435 NUMA_ZONELIST_ORDER_LEN); 2588 NUMA_ZONELIST_ORDER_LEN);
2436 user_zonelist_order = oldval; 2589 user_zonelist_order = oldval;
2437 } else if (oldval != user_zonelist_order) 2590 } else if (oldval != user_zonelist_order) {
2438 build_all_zonelists(); 2591 mutex_lock(&zonelists_mutex);
2592 build_all_zonelists(NULL);
2593 mutex_unlock(&zonelists_mutex);
2594 }
2439 } 2595 }
2440out: 2596out:
2441 mutex_unlock(&zl_order_mutex); 2597 mutex_unlock(&zl_order_mutex);
@@ -2579,10 +2735,10 @@ static int default_zonelist_order(void)
2579 struct zone *z; 2735 struct zone *z;
2580 int average_size; 2736 int average_size;
2581 /* 2737 /*
2582 * ZONE_DMA and ZONE_DMA32 can be very small area in the sytem. 2738 * ZONE_DMA and ZONE_DMA32 can be very small area in the system.
2583 * If they are really small and used heavily, the system can fall 2739 * If they are really small and used heavily, the system can fall
2584 * into OOM very easily. 2740 * into OOM very easily.
2585 * This function detect ZONE_DMA/DMA32 size and confgigures zone order. 2741 * This function detects ZONE_DMA/DMA32 size and configures zone order.
2586 */ 2742 */
2587 /* Is there ZONE_NORMAL ? (ex. ppc has only DMA zone..) */ 2743 /* Is there ZONE_NORMAL ? (ex. ppc has only DMA zone..) */
2588 low_kmem_size = 0; 2744 low_kmem_size = 0;
@@ -2594,6 +2750,15 @@ static int default_zonelist_order(void)
2594 if (zone_type < ZONE_NORMAL) 2750 if (zone_type < ZONE_NORMAL)
2595 low_kmem_size += z->present_pages; 2751 low_kmem_size += z->present_pages;
2596 total_size += z->present_pages; 2752 total_size += z->present_pages;
2753 } else if (zone_type == ZONE_NORMAL) {
2754 /*
2755 * If any node has only lowmem, then node order
2756 * is preferred to allow kernel allocations
2757 * locally; otherwise, they can easily infringe
2758 * on other nodes when there is an abundance of
2759 * lowmem available to allocate from.
2760 */
2761 return ZONELIST_ORDER_NODE;
2597 } 2762 }
2598 } 2763 }
2599 } 2764 }
@@ -2707,6 +2872,24 @@ static void build_zonelist_cache(pg_data_t *pgdat)
2707 zlc->z_to_n[z - zonelist->_zonerefs] = zonelist_node_idx(z); 2872 zlc->z_to_n[z - zonelist->_zonerefs] = zonelist_node_idx(z);
2708} 2873}
2709 2874
2875#ifdef CONFIG_HAVE_MEMORYLESS_NODES
2876/*
2877 * Return node id of node used for "local" allocations.
2878 * I.e., first node id of first zone in arg node's generic zonelist.
2879 * Used for initializing percpu 'numa_mem', which is used primarily
2880 * for kernel allocations, so use GFP_KERNEL flags to locate zonelist.
2881 */
2882int local_memory_node(int node)
2883{
2884 struct zone *zone;
2885
2886 (void)first_zones_zonelist(node_zonelist(node, GFP_KERNEL),
2887 gfp_zone(GFP_KERNEL),
2888 NULL,
2889 &zone);
2890 return zone->node;
2891}
2892#endif
2710 2893
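The point of the _numa_mem_ percpu variable is that a cpu attached to a memoryless node needs a different "home" node for kernel allocations than numa_node_id() would give it. A hedged usage sketch using the accessors named in the comment above (the wrapper function itself is hypothetical):

#include <linux/slab.h>
#include <linux/topology.h>

/* Hypothetical wrapper: allocate from the nearest node that has memory. */
static void *alloc_on_local_memory_node(size_t size)
{
	int nid = numa_mem_id();	/* same as numa_node_id() when
					 * CONFIG_HAVE_MEMORYLESS_NODES=n */

	return kmalloc_node(size, GFP_KERNEL, nid);
}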
2711#else /* CONFIG_NUMA */ 2894#else /* CONFIG_NUMA */
2712 2895
@@ -2776,9 +2959,16 @@ static void build_zonelist_cache(pg_data_t *pgdat)
2776 */ 2959 */
2777static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch); 2960static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch);
2778static DEFINE_PER_CPU(struct per_cpu_pageset, boot_pageset); 2961static DEFINE_PER_CPU(struct per_cpu_pageset, boot_pageset);
2962static void setup_zone_pageset(struct zone *zone);
2963
2964/*
2965 * Global mutex to protect against size modification of zonelists
2966 * as well as to serialize pageset setup for the new populated zone.
2967 */
2968DEFINE_MUTEX(zonelists_mutex);
2779 2969
2780/* return values int ....just for stop_machine() */ 2970/* return values int ....just for stop_machine() */
2781static int __build_all_zonelists(void *dummy) 2971static __init_refok int __build_all_zonelists(void *data)
2782{ 2972{
2783 int nid; 2973 int nid;
2784 int cpu; 2974 int cpu;
@@ -2793,6 +2983,14 @@ static int __build_all_zonelists(void *dummy)
2793 build_zonelist_cache(pgdat); 2983 build_zonelist_cache(pgdat);
2794 } 2984 }
2795 2985
2986#ifdef CONFIG_MEMORY_HOTPLUG
2987 /* Setup real pagesets for the new zone */
2988 if (data) {
2989 struct zone *zone = data;
2990 setup_zone_pageset(zone);
2991 }
2992#endif
2993
2796 /* 2994 /*
2797 * Initialize the boot_pagesets that are going to be used 2995 * Initialize the boot_pagesets that are going to be used
2798 * for bootstrapping processors. The real pagesets for 2996 * for bootstrapping processors. The real pagesets for
@@ -2806,13 +3004,31 @@ static int __build_all_zonelists(void *dummy)
2806 * needs the percpu allocator in order to allocate its pagesets 3004 * needs the percpu allocator in order to allocate its pagesets
2807 * (a chicken-egg dilemma). 3005 * (a chicken-egg dilemma).
2808 */ 3006 */
2809 for_each_possible_cpu(cpu) 3007 for_each_possible_cpu(cpu) {
2810 setup_pageset(&per_cpu(boot_pageset, cpu), 0); 3008 setup_pageset(&per_cpu(boot_pageset, cpu), 0);
2811 3009
3010#ifdef CONFIG_HAVE_MEMORYLESS_NODES
3011 /*
3012 * We now know the "local memory node" for each node--
3013 * i.e., the node of the first zone in the generic zonelist.
3014 * Set up numa_mem percpu variable for on-line cpus. During
3015 * boot, only the boot cpu should be on-line; we'll init the
3016 * secondary cpus' numa_mem as they come on-line. During
3017 * node/memory hotplug, we'll fixup all on-line cpus.
3018 */
3019 if (cpu_online(cpu))
3020 set_cpu_numa_mem(cpu, local_memory_node(cpu_to_node(cpu)));
3021#endif
3022 }
3023
2812 return 0; 3024 return 0;
2813} 3025}
2814 3026
2815void build_all_zonelists(void) 3027/*
3028 * Called with zonelists_mutex held always
3029 * unless system_state == SYSTEM_BOOTING.
3030 */
3031void build_all_zonelists(void *data)
2816{ 3032{
2817 set_zonelist_order(); 3033 set_zonelist_order();
2818 3034
@@ -2823,7 +3039,7 @@ void build_all_zonelists(void)
2823 } else { 3039 } else {
2824 /* we have to stop all cpus to guarantee there is no user 3040 /* we have to stop all cpus to guarantee there is no user
2825 of zonelist */ 3041 of zonelist */
2826 stop_machine(__build_all_zonelists, NULL, NULL); 3042 stop_machine(__build_all_zonelists, data, NULL);
2827 /* cpuset refresh routine should be here */ 3043 /* cpuset refresh routine should be here */
2828 } 3044 }
2829 vm_total_pages = nr_free_pagecache_pages(); 3045 vm_total_pages = nr_free_pagecache_pages();
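build_all_zonelists() now takes the zone being brought online so that __build_all_zonelists() can allocate its real pagesets, and non-boot callers are expected to hold zonelists_mutex. A simplified, hedged sketch of the memory-hotplug call site this is shaped for (the real logic lives in mm/memory_hotplug.c, whose hunk is not included in this section):

#include <linux/memory_hotplug.h>
#include <linux/mmzone.h>
#include <linux/mutex.h>

/* Simplified sketch of an online_pages()-style caller. */
static void bring_zone_online_sketch(struct zone *zone)
{
	if (populated_zone(zone))
		return;

	/*
	 * First memory appearing in this zone: rebuild the zonelists and
	 * let __build_all_zonelists() set up the zone's pagesets.
	 */
	mutex_lock(&zonelists_mutex);
	build_all_zonelists(zone);
	mutex_unlock(&zonelists_mutex);
}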
@@ -3146,31 +3362,34 @@ static void setup_pagelist_highmark(struct per_cpu_pageset *p,
3146 pcp->batch = PAGE_SHIFT * 8; 3362 pcp->batch = PAGE_SHIFT * 8;
3147} 3363}
3148 3364
3365static __meminit void setup_zone_pageset(struct zone *zone)
3366{
3367 int cpu;
3368
3369 zone->pageset = alloc_percpu(struct per_cpu_pageset);
3370
3371 for_each_possible_cpu(cpu) {
3372 struct per_cpu_pageset *pcp = per_cpu_ptr(zone->pageset, cpu);
3373
3374 setup_pageset(pcp, zone_batchsize(zone));
3375
3376 if (percpu_pagelist_fraction)
3377 setup_pagelist_highmark(pcp,
3378 (zone->present_pages /
3379 percpu_pagelist_fraction));
3380 }
3381}
3382
3149/* 3383/*
3150 * Allocate per cpu pagesets and initialize them. 3384 * Allocate per cpu pagesets and initialize them.
3151 * Before this call only boot pagesets were available. 3385 * Before this call only boot pagesets were available.
3152 * Boot pagesets will no longer be used by this processorr
3153 * after setup_per_cpu_pageset().
3154 */ 3386 */
3155void __init setup_per_cpu_pageset(void) 3387void __init setup_per_cpu_pageset(void)
3156{ 3388{
3157 struct zone *zone; 3389 struct zone *zone;
3158 int cpu;
3159
3160 for_each_populated_zone(zone) {
3161 zone->pageset = alloc_percpu(struct per_cpu_pageset);
3162
3163 for_each_possible_cpu(cpu) {
3164 struct per_cpu_pageset *pcp = per_cpu_ptr(zone->pageset, cpu);
3165
3166 setup_pageset(pcp, zone_batchsize(zone));
3167 3390
3168 if (percpu_pagelist_fraction) 3391 for_each_populated_zone(zone)
3169 setup_pagelist_highmark(pcp, 3392 setup_zone_pageset(zone);
3170 (zone->present_pages /
3171 percpu_pagelist_fraction));
3172 }
3173 }
3174} 3393}
3175 3394
3176static noinline __init_refok 3395static noinline __init_refok
diff --git a/mm/percpu-km.c b/mm/percpu-km.c
new file mode 100644
index 000000000000..df680855540a
--- /dev/null
+++ b/mm/percpu-km.c
@@ -0,0 +1,104 @@
1/*
2 * mm/percpu-km.c - kernel memory based chunk allocation
3 *
4 * Copyright (C) 2010 SUSE Linux Products GmbH
5 * Copyright (C) 2010 Tejun Heo <tj@kernel.org>
6 *
7 * This file is released under the GPLv2.
8 *
9 * Chunks are allocated as a contiguous kernel memory using gfp
10 * allocation. This is to be used on nommu architectures.
11 *
12 * To use percpu-km,
13 *
14 * - define CONFIG_NEED_PER_CPU_KM from the arch Kconfig.
15 *
16 * - CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK must not be defined. It's
17 * not compatible with PER_CPU_KM. EMBED_FIRST_CHUNK should work
18 * fine.
19 *
20 * - NUMA is not supported. When setting up the first chunk,
21 * @cpu_distance_fn should be NULL or report all CPUs to be nearer
22 * than or at LOCAL_DISTANCE.
23 *
 24 * - It's best if the chunk size is a power-of-two multiple of
25 * PAGE_SIZE. Because each chunk is allocated as a contiguous
26 * kernel memory block using alloc_pages(), memory will be wasted if
27 * chunk size is not aligned. percpu-km code will whine about it.
28 */
29
30#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK
31#error "contiguous percpu allocation is incompatible with paged first chunk"
32#endif
33
34#include <linux/log2.h>
35
36static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size)
37{
38 /* noop */
39 return 0;
40}
41
42static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size)
43{
44 /* nada */
45}
46
47static struct pcpu_chunk *pcpu_create_chunk(void)
48{
49 const int nr_pages = pcpu_group_sizes[0] >> PAGE_SHIFT;
50 struct pcpu_chunk *chunk;
51 struct page *pages;
52 int i;
53
54 chunk = pcpu_alloc_chunk();
55 if (!chunk)
56 return NULL;
57
58 pages = alloc_pages(GFP_KERNEL, order_base_2(nr_pages));
59 if (!pages) {
60 pcpu_free_chunk(chunk);
61 return NULL;
62 }
63
64 for (i = 0; i < nr_pages; i++)
65 pcpu_set_page_chunk(nth_page(pages, i), chunk);
66
67 chunk->data = pages;
68 chunk->base_addr = page_address(pages) - pcpu_group_offsets[0];
69 return chunk;
70}
71
72static void pcpu_destroy_chunk(struct pcpu_chunk *chunk)
73{
74 const int nr_pages = pcpu_group_sizes[0] >> PAGE_SHIFT;
75
76 if (chunk && chunk->data)
77 __free_pages(chunk->data, order_base_2(nr_pages));
78 pcpu_free_chunk(chunk);
79}
80
81static struct page *pcpu_addr_to_page(void *addr)
82{
83 return virt_to_page(addr);
84}
85
86static int __init pcpu_verify_alloc_info(const struct pcpu_alloc_info *ai)
87{
88 size_t nr_pages, alloc_pages;
89
90 /* all units must be in a single group */
91 if (ai->nr_groups != 1) {
 92 printk(KERN_CRIT "percpu: can't handle more than one group\n");
93 return -EINVAL;
94 }
95
96 nr_pages = (ai->groups[0].nr_units * ai->unit_size) >> PAGE_SHIFT;
97 alloc_pages = roundup_pow_of_two(nr_pages);
98
99 if (alloc_pages > nr_pages)
100 printk(KERN_WARNING "percpu: wasting %zu pages per chunk\n",
101 alloc_pages - nr_pages);
102
103 return 0;
104}
diff --git a/mm/percpu-vm.c b/mm/percpu-vm.c
new file mode 100644
index 000000000000..7d9c1d0ebd3f
--- /dev/null
+++ b/mm/percpu-vm.c
@@ -0,0 +1,451 @@
1/*
2 * mm/percpu-vm.c - vmalloc area based chunk allocation
3 *
4 * Copyright (C) 2010 SUSE Linux Products GmbH
5 * Copyright (C) 2010 Tejun Heo <tj@kernel.org>
6 *
7 * This file is released under the GPLv2.
8 *
9 * Chunks are mapped into vmalloc areas and populated page by page.
10 * This is the default chunk allocator.
11 */
12
13static struct page *pcpu_chunk_page(struct pcpu_chunk *chunk,
14 unsigned int cpu, int page_idx)
15{
16 /* must not be used on pre-mapped chunk */
17 WARN_ON(chunk->immutable);
18
19 return vmalloc_to_page((void *)pcpu_chunk_addr(chunk, cpu, page_idx));
20}
21
22/**
23 * pcpu_get_pages_and_bitmap - get temp pages array and bitmap
24 * @chunk: chunk of interest
25 * @bitmapp: output parameter for bitmap
26 * @may_alloc: may allocate the array
27 *
28 * Returns pointer to array of pointers to struct page and bitmap,
29 * both of which can be indexed with pcpu_page_idx(). The returned
30 * array is cleared to zero and *@bitmapp is copied from
31 * @chunk->populated. Note that there is only one array and bitmap
32 * and access exclusion is the caller's responsibility.
33 *
34 * CONTEXT:
35 * pcpu_alloc_mutex and does GFP_KERNEL allocation if @may_alloc.
36 * Otherwise, don't care.
37 *
38 * RETURNS:
39 * Pointer to temp pages array on success, NULL on failure.
40 */
41static struct page **pcpu_get_pages_and_bitmap(struct pcpu_chunk *chunk,
42 unsigned long **bitmapp,
43 bool may_alloc)
44{
45 static struct page **pages;
46 static unsigned long *bitmap;
47 size_t pages_size = pcpu_nr_units * pcpu_unit_pages * sizeof(pages[0]);
48 size_t bitmap_size = BITS_TO_LONGS(pcpu_unit_pages) *
49 sizeof(unsigned long);
50
51 if (!pages || !bitmap) {
52 if (may_alloc && !pages)
53 pages = pcpu_mem_alloc(pages_size);
54 if (may_alloc && !bitmap)
55 bitmap = pcpu_mem_alloc(bitmap_size);
56 if (!pages || !bitmap)
57 return NULL;
58 }
59
60 memset(pages, 0, pages_size);
61 bitmap_copy(bitmap, chunk->populated, pcpu_unit_pages);
62
63 *bitmapp = bitmap;
64 return pages;
65}
66
67/**
68 * pcpu_free_pages - free pages which were allocated for @chunk
69 * @chunk: chunk pages were allocated for
70 * @pages: array of pages to be freed, indexed by pcpu_page_idx()
71 * @populated: populated bitmap
72 * @page_start: page index of the first page to be freed
73 * @page_end: page index of the last page to be freed + 1
74 *
 75 * Free pages [@page_start, @page_end) in @pages for all units.
76 * The pages were allocated for @chunk.
77 */
78static void pcpu_free_pages(struct pcpu_chunk *chunk,
79 struct page **pages, unsigned long *populated,
80 int page_start, int page_end)
81{
82 unsigned int cpu;
83 int i;
84
85 for_each_possible_cpu(cpu) {
86 for (i = page_start; i < page_end; i++) {
87 struct page *page = pages[pcpu_page_idx(cpu, i)];
88
89 if (page)
90 __free_page(page);
91 }
92 }
93}
94
95/**
96 * pcpu_alloc_pages - allocates pages for @chunk
97 * @chunk: target chunk
98 * @pages: array to put the allocated pages into, indexed by pcpu_page_idx()
99 * @populated: populated bitmap
100 * @page_start: page index of the first page to be allocated
101 * @page_end: page index of the last page to be allocated + 1
102 *
103 * Allocate pages [@page_start,@page_end) into @pages for all units.
104 * The allocation is for @chunk. Percpu core doesn't care about the
105 * content of @pages and will pass it verbatim to pcpu_map_pages().
106 */
107static int pcpu_alloc_pages(struct pcpu_chunk *chunk,
108 struct page **pages, unsigned long *populated,
109 int page_start, int page_end)
110{
111 const gfp_t gfp = GFP_KERNEL | __GFP_HIGHMEM | __GFP_COLD;
112 unsigned int cpu;
113 int i;
114
115 for_each_possible_cpu(cpu) {
116 for (i = page_start; i < page_end; i++) {
117 struct page **pagep = &pages[pcpu_page_idx(cpu, i)];
118
119 *pagep = alloc_pages_node(cpu_to_node(cpu), gfp, 0);
120 if (!*pagep) {
121 pcpu_free_pages(chunk, pages, populated,
122 page_start, page_end);
123 return -ENOMEM;
124 }
125 }
126 }
127 return 0;
128}
129
130/**
131 * pcpu_pre_unmap_flush - flush cache prior to unmapping
132 * @chunk: chunk the regions to be flushed belongs to
133 * @page_start: page index of the first page to be flushed
134 * @page_end: page index of the last page to be flushed + 1
135 *
136 * Pages in [@page_start,@page_end) of @chunk are about to be
137 * unmapped. Flush cache. As each flushing trial can be very
138 * expensive, issue flush on the whole region at once rather than
 139 * doing it for each cpu. This could be overkill but is more
140 * scalable.
141 */
142static void pcpu_pre_unmap_flush(struct pcpu_chunk *chunk,
143 int page_start, int page_end)
144{
145 flush_cache_vunmap(
146 pcpu_chunk_addr(chunk, pcpu_first_unit_cpu, page_start),
147 pcpu_chunk_addr(chunk, pcpu_last_unit_cpu, page_end));
148}
149
150static void __pcpu_unmap_pages(unsigned long addr, int nr_pages)
151{
152 unmap_kernel_range_noflush(addr, nr_pages << PAGE_SHIFT);
153}
154
155/**
156 * pcpu_unmap_pages - unmap pages out of a pcpu_chunk
157 * @chunk: chunk of interest
158 * @pages: pages array which can be used to pass information to free
159 * @populated: populated bitmap
160 * @page_start: page index of the first page to unmap
161 * @page_end: page index of the last page to unmap + 1
162 *
163 * For each cpu, unmap pages [@page_start,@page_end) out of @chunk.
164 * Corresponding elements in @pages were cleared by the caller and can
165 * be used to carry information to pcpu_free_pages() which will be
166 * called after all unmaps are finished. The caller should call
167 * proper pre/post flush functions.
168 */
169static void pcpu_unmap_pages(struct pcpu_chunk *chunk,
170 struct page **pages, unsigned long *populated,
171 int page_start, int page_end)
172{
173 unsigned int cpu;
174 int i;
175
176 for_each_possible_cpu(cpu) {
177 for (i = page_start; i < page_end; i++) {
178 struct page *page;
179
180 page = pcpu_chunk_page(chunk, cpu, i);
181 WARN_ON(!page);
182 pages[pcpu_page_idx(cpu, i)] = page;
183 }
184 __pcpu_unmap_pages(pcpu_chunk_addr(chunk, cpu, page_start),
185 page_end - page_start);
186 }
187
188 for (i = page_start; i < page_end; i++)
189 __clear_bit(i, populated);
190}
191
192/**
193 * pcpu_post_unmap_tlb_flush - flush TLB after unmapping
194 * @chunk: pcpu_chunk the regions to be flushed belong to
195 * @page_start: page index of the first page to be flushed
196 * @page_end: page index of the last page to be flushed + 1
197 *
198 * Pages [@page_start,@page_end) of @chunk have been unmapped. Flush
199 * TLB for the regions. This can be skipped if the area is to be
200 * returned to vmalloc as vmalloc will handle TLB flushing lazily.
201 *
202 * As with pcpu_pre_unmap_flush(), TLB flushing also is done at once
203 * for the whole region.
204 */
205static void pcpu_post_unmap_tlb_flush(struct pcpu_chunk *chunk,
206 int page_start, int page_end)
207{
208 flush_tlb_kernel_range(
209 pcpu_chunk_addr(chunk, pcpu_first_unit_cpu, page_start),
210 pcpu_chunk_addr(chunk, pcpu_last_unit_cpu, page_end));
211}
212
213static int __pcpu_map_pages(unsigned long addr, struct page **pages,
214 int nr_pages)
215{
216 return map_kernel_range_noflush(addr, nr_pages << PAGE_SHIFT,
217 PAGE_KERNEL, pages);
218}
219
220/**
221 * pcpu_map_pages - map pages into a pcpu_chunk
222 * @chunk: chunk of interest
223 * @pages: pages array containing pages to be mapped
224 * @populated: populated bitmap
225 * @page_start: page index of the first page to map
226 * @page_end: page index of the last page to map + 1
227 *
228 * For each cpu, map pages [@page_start,@page_end) into @chunk. The
229 * caller is responsible for calling pcpu_post_map_flush() after all
230 * mappings are complete.
231 *
232 * This function is responsible for setting corresponding bits in
233 * @chunk->populated bitmap and whatever is necessary for reverse
234 * lookup (addr -> chunk).
235 */
236static int pcpu_map_pages(struct pcpu_chunk *chunk,
237 struct page **pages, unsigned long *populated,
238 int page_start, int page_end)
239{
240 unsigned int cpu, tcpu;
241 int i, err;
242
243 for_each_possible_cpu(cpu) {
244 err = __pcpu_map_pages(pcpu_chunk_addr(chunk, cpu, page_start),
245 &pages[pcpu_page_idx(cpu, page_start)],
246 page_end - page_start);
247 if (err < 0)
248 goto err;
249 }
250
251 /* mapping successful, link chunk and mark populated */
252 for (i = page_start; i < page_end; i++) {
253 for_each_possible_cpu(cpu)
254 pcpu_set_page_chunk(pages[pcpu_page_idx(cpu, i)],
255 chunk);
256 __set_bit(i, populated);
257 }
258
259 return 0;
260
261err:
262 for_each_possible_cpu(tcpu) {
263 if (tcpu == cpu)
264 break;
265 __pcpu_unmap_pages(pcpu_chunk_addr(chunk, tcpu, page_start),
266 page_end - page_start);
267 }
268 return err;
269}
270
271/**
272 * pcpu_post_map_flush - flush cache after mapping
273 * @chunk: pcpu_chunk the regions to be flushed belong to
274 * @page_start: page index of the first page to be flushed
275 * @page_end: page index of the last page to be flushed + 1
276 *
277 * Pages [@page_start,@page_end) of @chunk have been mapped. Flush
278 * cache.
279 *
280 * As with pcpu_pre_unmap_flush(), TLB flushing also is done at once
281 * for the whole region.
282 */
283static void pcpu_post_map_flush(struct pcpu_chunk *chunk,
284 int page_start, int page_end)
285{
286 flush_cache_vmap(
287 pcpu_chunk_addr(chunk, pcpu_first_unit_cpu, page_start),
288 pcpu_chunk_addr(chunk, pcpu_last_unit_cpu, page_end));
289}
290
291/**
292 * pcpu_populate_chunk - populate and map an area of a pcpu_chunk
293 * @chunk: chunk of interest
294 * @off: offset to the area to populate
295 * @size: size of the area to populate in bytes
296 *
297 * For each cpu, populate and map pages [@page_start,@page_end) into
298 * @chunk. The area is cleared on return.
299 *
300 * CONTEXT:
301 * pcpu_alloc_mutex, does GFP_KERNEL allocation.
302 */
303static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size)
304{
305 int page_start = PFN_DOWN(off);
306 int page_end = PFN_UP(off + size);
307 int free_end = page_start, unmap_end = page_start;
308 struct page **pages;
309 unsigned long *populated;
310 unsigned int cpu;
311 int rs, re, rc;
312
313 /* quick path, check whether all pages are already there */
314 rs = page_start;
315 pcpu_next_pop(chunk, &rs, &re, page_end);
316 if (rs == page_start && re == page_end)
317 goto clear;
318
319 /* need to allocate and map pages, this chunk can't be immutable */
320 WARN_ON(chunk->immutable);
321
322 pages = pcpu_get_pages_and_bitmap(chunk, &populated, true);
323 if (!pages)
324 return -ENOMEM;
325
326 /* alloc and map */
327 pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) {
328 rc = pcpu_alloc_pages(chunk, pages, populated, rs, re);
329 if (rc)
330 goto err_free;
331 free_end = re;
332 }
333
334 pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) {
335 rc = pcpu_map_pages(chunk, pages, populated, rs, re);
336 if (rc)
337 goto err_unmap;
338 unmap_end = re;
339 }
340 pcpu_post_map_flush(chunk, page_start, page_end);
341
342 /* commit new bitmap */
343 bitmap_copy(chunk->populated, populated, pcpu_unit_pages);
344clear:
345 for_each_possible_cpu(cpu)
346 memset((void *)pcpu_chunk_addr(chunk, cpu, 0) + off, 0, size);
347 return 0;
348
349err_unmap:
350 pcpu_pre_unmap_flush(chunk, page_start, unmap_end);
351 pcpu_for_each_unpop_region(chunk, rs, re, page_start, unmap_end)
352 pcpu_unmap_pages(chunk, pages, populated, rs, re);
353 pcpu_post_unmap_tlb_flush(chunk, page_start, unmap_end);
354err_free:
355 pcpu_for_each_unpop_region(chunk, rs, re, page_start, free_end)
356 pcpu_free_pages(chunk, pages, populated, rs, re);
357 return rc;
358}
359
360/**
361 * pcpu_depopulate_chunk - depopulate and unmap an area of a pcpu_chunk
362 * @chunk: chunk to depopulate
363 * @off: offset to the area to depopulate
364 * @size: size of the area to depopulate in bytes
 365 *
 366 * For each cpu, depopulate and unmap pages [@page_start,@page_end)
 367 * from @chunk. The cache is flushed before unmapping; the TLB
 368 * flush is left to vmalloc, which purges lazily (see the note in
 369 * the function body).
370 *
371 * CONTEXT:
372 * pcpu_alloc_mutex.
373 */
374static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size)
375{
376 int page_start = PFN_DOWN(off);
377 int page_end = PFN_UP(off + size);
378 struct page **pages;
379 unsigned long *populated;
380 int rs, re;
381
382 /* quick path, check whether it's empty already */
383 rs = page_start;
384 pcpu_next_unpop(chunk, &rs, &re, page_end);
385 if (rs == page_start && re == page_end)
386 return;
387
388 /* immutable chunks can't be depopulated */
389 WARN_ON(chunk->immutable);
390
391 /*
392 * If control reaches here, there must have been at least one
393 * successful population attempt so the temp pages array must
394 * be available now.
395 */
396 pages = pcpu_get_pages_and_bitmap(chunk, &populated, false);
397 BUG_ON(!pages);
398
399 /* unmap and free */
400 pcpu_pre_unmap_flush(chunk, page_start, page_end);
401
402 pcpu_for_each_pop_region(chunk, rs, re, page_start, page_end)
403 pcpu_unmap_pages(chunk, pages, populated, rs, re);
404
405 /* no need to flush tlb, vmalloc will handle it lazily */
406
407 pcpu_for_each_pop_region(chunk, rs, re, page_start, page_end)
408 pcpu_free_pages(chunk, pages, populated, rs, re);
409
410 /* commit new bitmap */
411 bitmap_copy(chunk->populated, populated, pcpu_unit_pages);
412}
413
414static struct pcpu_chunk *pcpu_create_chunk(void)
415{
416 struct pcpu_chunk *chunk;
417 struct vm_struct **vms;
418
419 chunk = pcpu_alloc_chunk();
420 if (!chunk)
421 return NULL;
422
423 vms = pcpu_get_vm_areas(pcpu_group_offsets, pcpu_group_sizes,
424 pcpu_nr_groups, pcpu_atom_size, GFP_KERNEL);
425 if (!vms) {
426 pcpu_free_chunk(chunk);
427 return NULL;
428 }
429
430 chunk->data = vms;
431 chunk->base_addr = vms[0]->addr - pcpu_group_offsets[0];
432 return chunk;
433}
434
435static void pcpu_destroy_chunk(struct pcpu_chunk *chunk)
436{
437 if (chunk && chunk->data)
438 pcpu_free_vm_areas(chunk->data, pcpu_nr_groups);
439 pcpu_free_chunk(chunk);
440}
441
442static struct page *pcpu_addr_to_page(void *addr)
443{
444 return vmalloc_to_page(addr);
445}
446
447static int __init pcpu_verify_alloc_info(const struct pcpu_alloc_info *ai)
448{
449 /* no extra restriction */
450 return 0;
451}
diff --git a/mm/percpu.c b/mm/percpu.c
index 6e09741ddc62..39f7dfd59585 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * linux/mm/percpu.c - percpu memory allocator 2 * mm/percpu.c - percpu memory allocator
3 * 3 *
4 * Copyright (C) 2009 SUSE Linux Products GmbH 4 * Copyright (C) 2009 SUSE Linux Products GmbH
5 * Copyright (C) 2009 Tejun Heo <tj@kernel.org> 5 * Copyright (C) 2009 Tejun Heo <tj@kernel.org>
@@ -7,14 +7,13 @@
7 * This file is released under the GPLv2. 7 * This file is released under the GPLv2.
8 * 8 *
9 * This is percpu allocator which can handle both static and dynamic 9 * This is percpu allocator which can handle both static and dynamic
10 * areas. Percpu areas are allocated in chunks in vmalloc area. Each 10 * areas. Percpu areas are allocated in chunks. Each chunk is
11 * chunk is consisted of boot-time determined number of units and the 11 * consisted of boot-time determined number of units and the first
12 * first chunk is used for static percpu variables in the kernel image 12 * chunk is used for static percpu variables in the kernel image
13 * (special boot time alloc/init handling necessary as these areas 13 * (special boot time alloc/init handling necessary as these areas
14 * need to be brought up before allocation services are running). 14 * need to be brought up before allocation services are running).
15 * Unit grows as necessary and all units grow or shrink in unison. 15 * Unit grows as necessary and all units grow or shrink in unison.
16 * When a chunk is filled up, another chunk is allocated. ie. in 16 * When a chunk is filled up, another chunk is allocated.
17 * vmalloc area
18 * 17 *
19 * c0 c1 c2 18 * c0 c1 c2
20 * ------------------- ------------------- ------------ 19 * ------------------- ------------------- ------------
@@ -99,7 +98,7 @@ struct pcpu_chunk {
99 int map_used; /* # of map entries used */ 98 int map_used; /* # of map entries used */
100 int map_alloc; /* # of map entries allocated */ 99 int map_alloc; /* # of map entries allocated */
101 int *map; /* allocation map */ 100 int *map; /* allocation map */
102 struct vm_struct **vms; /* mapped vmalloc regions */ 101 void *data; /* chunk data */
103 bool immutable; /* no [de]population allowed */ 102 bool immutable; /* no [de]population allowed */
104 unsigned long populated[]; /* populated bitmap */ 103 unsigned long populated[]; /* populated bitmap */
105}; 104};
@@ -177,6 +176,21 @@ static struct list_head *pcpu_slot __read_mostly; /* chunk list slots */
177static void pcpu_reclaim(struct work_struct *work); 176static void pcpu_reclaim(struct work_struct *work);
178static DECLARE_WORK(pcpu_reclaim_work, pcpu_reclaim); 177static DECLARE_WORK(pcpu_reclaim_work, pcpu_reclaim);
179 178
179static bool pcpu_addr_in_first_chunk(void *addr)
180{
181 void *first_start = pcpu_first_chunk->base_addr;
182
183 return addr >= first_start && addr < first_start + pcpu_unit_size;
184}
185
186static bool pcpu_addr_in_reserved_chunk(void *addr)
187{
188 void *first_start = pcpu_first_chunk->base_addr;
189
190 return addr >= first_start &&
191 addr < first_start + pcpu_reserved_chunk_limit;
192}
193
180static int __pcpu_size_to_slot(int size) 194static int __pcpu_size_to_slot(int size)
181{ 195{
182 int highbit = fls(size); /* size is in bytes */ 196 int highbit = fls(size); /* size is in bytes */
@@ -198,27 +212,6 @@ static int pcpu_chunk_slot(const struct pcpu_chunk *chunk)
198 return pcpu_size_to_slot(chunk->free_size); 212 return pcpu_size_to_slot(chunk->free_size);
199} 213}
200 214
201static int pcpu_page_idx(unsigned int cpu, int page_idx)
202{
203 return pcpu_unit_map[cpu] * pcpu_unit_pages + page_idx;
204}
205
206static unsigned long pcpu_chunk_addr(struct pcpu_chunk *chunk,
207 unsigned int cpu, int page_idx)
208{
209 return (unsigned long)chunk->base_addr + pcpu_unit_offsets[cpu] +
210 (page_idx << PAGE_SHIFT);
211}
212
213static struct page *pcpu_chunk_page(struct pcpu_chunk *chunk,
214 unsigned int cpu, int page_idx)
215{
216 /* must not be used on pre-mapped chunk */
217 WARN_ON(chunk->immutable);
218
219 return vmalloc_to_page((void *)pcpu_chunk_addr(chunk, cpu, page_idx));
220}
221
222/* set the pointer to a chunk in a page struct */ 215/* set the pointer to a chunk in a page struct */
223static void pcpu_set_page_chunk(struct page *page, struct pcpu_chunk *pcpu) 216static void pcpu_set_page_chunk(struct page *page, struct pcpu_chunk *pcpu)
224{ 217{
@@ -231,13 +224,27 @@ static struct pcpu_chunk *pcpu_get_page_chunk(struct page *page)
231 return (struct pcpu_chunk *)page->index; 224 return (struct pcpu_chunk *)page->index;
232} 225}
233 226
234static void pcpu_next_unpop(struct pcpu_chunk *chunk, int *rs, int *re, int end) 227static int __maybe_unused pcpu_page_idx(unsigned int cpu, int page_idx)
228{
229 return pcpu_unit_map[cpu] * pcpu_unit_pages + page_idx;
230}
231
232static unsigned long __maybe_unused pcpu_chunk_addr(struct pcpu_chunk *chunk,
233 unsigned int cpu, int page_idx)
234{
235 return (unsigned long)chunk->base_addr + pcpu_unit_offsets[cpu] +
236 (page_idx << PAGE_SHIFT);
237}
238
239static void __maybe_unused pcpu_next_unpop(struct pcpu_chunk *chunk,
240 int *rs, int *re, int end)
235{ 241{
236 *rs = find_next_zero_bit(chunk->populated, end, *rs); 242 *rs = find_next_zero_bit(chunk->populated, end, *rs);
237 *re = find_next_bit(chunk->populated, end, *rs + 1); 243 *re = find_next_bit(chunk->populated, end, *rs + 1);
238} 244}
239 245
240static void pcpu_next_pop(struct pcpu_chunk *chunk, int *rs, int *re, int end) 246static void __maybe_unused pcpu_next_pop(struct pcpu_chunk *chunk,
247 int *rs, int *re, int end)
241{ 248{
242 *rs = find_next_bit(chunk->populated, end, *rs); 249 *rs = find_next_bit(chunk->populated, end, *rs);
243 *re = find_next_zero_bit(chunk->populated, end, *rs + 1); 250 *re = find_next_zero_bit(chunk->populated, end, *rs + 1);
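The relocated pcpu_page_idx()/pcpu_chunk_addr() helpers encode the unit-interleaved layout: a cpu's unit number indexes into the flat pages[] array, and its byte offset plus the page offset gives the virtual address inside a chunk. A compilable sketch of that arithmetic with invented unit geometry (the map, offsets and page size are illustrative, not the kernel's):

#include <stdio.h>

#define PAGE_SHIFT_	12			/* assume 4 KiB pages */
#define NR_CPUS_	4

static int unit_pages = 8;			/* plays pcpu_unit_pages */
static int unit_map[NR_CPUS_] = { 0, 1, 2, 3 };	/* plays pcpu_unit_map[cpu] */
static unsigned long unit_offsets[NR_CPUS_] = {	/* plays pcpu_unit_offsets[cpu] */
	0, 8 << PAGE_SHIFT_, 16 << PAGE_SHIFT_, 24 << PAGE_SHIFT_,
};

static int page_idx(unsigned int cpu, int pg)
{
	/* global index into the per-chunk pages[] array */
	return unit_map[cpu] * unit_pages + pg;
}

static unsigned long chunk_addr(unsigned long base, unsigned int cpu, int pg)
{
	/* virtual address of page pg of cpu's unit inside the chunk */
	return base + unit_offsets[cpu] + ((unsigned long)pg << PAGE_SHIFT_);
}

int main(void)
{
	unsigned long base = 0x10000000UL;	/* made-up chunk base address */

	printf("cpu2 page3 -> pages[%d], addr %#lx\n",
	       page_idx(2, 3), chunk_addr(base, 2, 3));
	return 0;
}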
@@ -326,36 +333,6 @@ static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot)
326} 333}
327 334
328/** 335/**
329 * pcpu_chunk_addr_search - determine chunk containing specified address
330 * @addr: address for which the chunk needs to be determined.
331 *
332 * RETURNS:
333 * The address of the found chunk.
334 */
335static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr)
336{
337 void *first_start = pcpu_first_chunk->base_addr;
338
339 /* is it in the first chunk? */
340 if (addr >= first_start && addr < first_start + pcpu_unit_size) {
341 /* is it in the reserved area? */
342 if (addr < first_start + pcpu_reserved_chunk_limit)
343 return pcpu_reserved_chunk;
344 return pcpu_first_chunk;
345 }
346
347 /*
348 * The address is relative to unit0 which might be unused and
349 * thus unmapped. Offset the address to the unit space of the
350 * current processor before looking it up in the vmalloc
351 * space. Note that any possible cpu id can be used here, so
352 * there's no need to worry about preemption or cpu hotplug.
353 */
354 addr += pcpu_unit_offsets[raw_smp_processor_id()];
355 return pcpu_get_page_chunk(vmalloc_to_page(addr));
356}
357
358/**
359 * pcpu_need_to_extend - determine whether chunk area map needs to be extended 336 * pcpu_need_to_extend - determine whether chunk area map needs to be extended
360 * @chunk: chunk of interest 337 * @chunk: chunk of interest
361 * 338 *
@@ -623,434 +600,92 @@ static void pcpu_free_area(struct pcpu_chunk *chunk, int freeme)
623 pcpu_chunk_relocate(chunk, oslot); 600 pcpu_chunk_relocate(chunk, oslot);
624} 601}
625 602
626/** 603static struct pcpu_chunk *pcpu_alloc_chunk(void)
627 * pcpu_get_pages_and_bitmap - get temp pages array and bitmap
628 * @chunk: chunk of interest
629 * @bitmapp: output parameter for bitmap
630 * @may_alloc: may allocate the array
631 *
632 * Returns pointer to array of pointers to struct page and bitmap,
633 * both of which can be indexed with pcpu_page_idx(). The returned
634 * array is cleared to zero and *@bitmapp is copied from
635 * @chunk->populated. Note that there is only one array and bitmap
636 * and access exclusion is the caller's responsibility.
637 *
638 * CONTEXT:
639 * pcpu_alloc_mutex and does GFP_KERNEL allocation if @may_alloc.
640 * Otherwise, don't care.
641 *
642 * RETURNS:
643 * Pointer to temp pages array on success, NULL on failure.
644 */
645static struct page **pcpu_get_pages_and_bitmap(struct pcpu_chunk *chunk,
646 unsigned long **bitmapp,
647 bool may_alloc)
648{
649 static struct page **pages;
650 static unsigned long *bitmap;
651 size_t pages_size = pcpu_nr_units * pcpu_unit_pages * sizeof(pages[0]);
652 size_t bitmap_size = BITS_TO_LONGS(pcpu_unit_pages) *
653 sizeof(unsigned long);
654
655 if (!pages || !bitmap) {
656 if (may_alloc && !pages)
657 pages = pcpu_mem_alloc(pages_size);
658 if (may_alloc && !bitmap)
659 bitmap = pcpu_mem_alloc(bitmap_size);
660 if (!pages || !bitmap)
661 return NULL;
662 }
663
664 memset(pages, 0, pages_size);
665 bitmap_copy(bitmap, chunk->populated, pcpu_unit_pages);
666
667 *bitmapp = bitmap;
668 return pages;
669}
670
671/**
672 * pcpu_free_pages - free pages which were allocated for @chunk
673 * @chunk: chunk pages were allocated for
674 * @pages: array of pages to be freed, indexed by pcpu_page_idx()
675 * @populated: populated bitmap
676 * @page_start: page index of the first page to be freed
677 * @page_end: page index of the last page to be freed + 1
678 *
679 * Free pages [@page_start and @page_end) in @pages for all units.
680 * The pages were allocated for @chunk.
681 */
682static void pcpu_free_pages(struct pcpu_chunk *chunk,
683 struct page **pages, unsigned long *populated,
684 int page_start, int page_end)
685{ 604{
686 unsigned int cpu; 605 struct pcpu_chunk *chunk;
687 int i;
688 606
689 for_each_possible_cpu(cpu) { 607 chunk = kzalloc(pcpu_chunk_struct_size, GFP_KERNEL);
690 for (i = page_start; i < page_end; i++) { 608 if (!chunk)
691 struct page *page = pages[pcpu_page_idx(cpu, i)]; 609 return NULL;
692 610
693 if (page) 611 chunk->map = pcpu_mem_alloc(PCPU_DFL_MAP_ALLOC * sizeof(chunk->map[0]));
694 __free_page(page); 612 if (!chunk->map) {
695 } 613 kfree(chunk);
614 return NULL;
696 } 615 }
697}
698 616
699/** 617 chunk->map_alloc = PCPU_DFL_MAP_ALLOC;
700 * pcpu_alloc_pages - allocates pages for @chunk 618 chunk->map[chunk->map_used++] = pcpu_unit_size;
701 * @chunk: target chunk
702 * @pages: array to put the allocated pages into, indexed by pcpu_page_idx()
703 * @populated: populated bitmap
704 * @page_start: page index of the first page to be allocated
705 * @page_end: page index of the last page to be allocated + 1
706 *
707 * Allocate pages [@page_start,@page_end) into @pages for all units.
708 * The allocation is for @chunk. Percpu core doesn't care about the
709 * content of @pages and will pass it verbatim to pcpu_map_pages().
710 */
711static int pcpu_alloc_pages(struct pcpu_chunk *chunk,
712 struct page **pages, unsigned long *populated,
713 int page_start, int page_end)
714{
715 const gfp_t gfp = GFP_KERNEL | __GFP_HIGHMEM | __GFP_COLD;
716 unsigned int cpu;
717 int i;
718 619
719 for_each_possible_cpu(cpu) { 620 INIT_LIST_HEAD(&chunk->list);
720 for (i = page_start; i < page_end; i++) { 621 chunk->free_size = pcpu_unit_size;
721 struct page **pagep = &pages[pcpu_page_idx(cpu, i)]; 622 chunk->contig_hint = pcpu_unit_size;
722
723 *pagep = alloc_pages_node(cpu_to_node(cpu), gfp, 0);
724 if (!*pagep) {
725 pcpu_free_pages(chunk, pages, populated,
726 page_start, page_end);
727 return -ENOMEM;
728 }
729 }
730 }
731 return 0;
732}
733 623
734/** 624 return chunk;
735 * pcpu_pre_unmap_flush - flush cache prior to unmapping
736 * @chunk: chunk the regions to be flushed belongs to
737 * @page_start: page index of the first page to be flushed
738 * @page_end: page index of the last page to be flushed + 1
739 *
740 * Pages in [@page_start,@page_end) of @chunk are about to be
741 * unmapped. Flush cache. As each flushing trial can be very
742 * expensive, issue flush on the whole region at once rather than
743 * doing it for each cpu. This could be an overkill but is more
744 * scalable.
745 */
746static void pcpu_pre_unmap_flush(struct pcpu_chunk *chunk,
747 int page_start, int page_end)
748{
749 flush_cache_vunmap(
750 pcpu_chunk_addr(chunk, pcpu_first_unit_cpu, page_start),
751 pcpu_chunk_addr(chunk, pcpu_last_unit_cpu, page_end));
752} 625}
753 626
754static void __pcpu_unmap_pages(unsigned long addr, int nr_pages) 627static void pcpu_free_chunk(struct pcpu_chunk *chunk)
755{ 628{
756 unmap_kernel_range_noflush(addr, nr_pages << PAGE_SHIFT); 629 if (!chunk)
630 return;
631 pcpu_mem_free(chunk->map, chunk->map_alloc * sizeof(chunk->map[0]));
632 kfree(chunk);
757} 633}
758 634
759/** 635/*
760 * pcpu_unmap_pages - unmap pages out of a pcpu_chunk 636 * Chunk management implementation.
761 * @chunk: chunk of interest 637 *
762 * @pages: pages array which can be used to pass information to free 638 * To allow different implementations, chunk alloc/free and
763 * @populated: populated bitmap 639 * [de]population are implemented in a separate file which is pulled
764 * @page_start: page index of the first page to unmap 640 * into this file and compiled together. The following functions
765 * @page_end: page index of the last page to unmap + 1 641 * should be implemented.
766 * 642 *
767 * For each cpu, unmap pages [@page_start,@page_end) out of @chunk. 643 * pcpu_populate_chunk - populate the specified range of a chunk
768 * Corresponding elements in @pages were cleared by the caller and can 644 * pcpu_depopulate_chunk - depopulate the specified range of a chunk
769 * be used to carry information to pcpu_free_pages() which will be 645 * pcpu_create_chunk - create a new chunk
770 * called after all unmaps are finished. The caller should call 646 * pcpu_destroy_chunk - destroy a chunk, always preceded by full depop
771 * proper pre/post flush functions. 647 * pcpu_addr_to_page - translate address to physical address
648 * pcpu_verify_alloc_info - check alloc_info is acceptable during init
772 */ 649 */
773static void pcpu_unmap_pages(struct pcpu_chunk *chunk, 650static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size);
774 struct page **pages, unsigned long *populated, 651static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size);
775 int page_start, int page_end) 652static struct pcpu_chunk *pcpu_create_chunk(void);
776{ 653static void pcpu_destroy_chunk(struct pcpu_chunk *chunk);
777 unsigned int cpu; 654static struct page *pcpu_addr_to_page(void *addr);
778 int i; 655static int __init pcpu_verify_alloc_info(const struct pcpu_alloc_info *ai);
779 656
780 for_each_possible_cpu(cpu) { 657#ifdef CONFIG_NEED_PER_CPU_KM
781 for (i = page_start; i < page_end; i++) { 658#include "percpu-km.c"
782 struct page *page; 659#else
783 660#include "percpu-vm.c"
784 page = pcpu_chunk_page(chunk, cpu, i); 661#endif
785 WARN_ON(!page);
786 pages[pcpu_page_idx(cpu, i)] = page;
787 }
788 __pcpu_unmap_pages(pcpu_chunk_addr(chunk, cpu, page_start),
789 page_end - page_start);
790 }
791
792 for (i = page_start; i < page_end; i++)
793 __clear_bit(i, populated);
794}
795 662
796/** 663/**
797 * pcpu_post_unmap_tlb_flush - flush TLB after unmapping 664 * pcpu_chunk_addr_search - determine chunk containing specified address
798 * @chunk: pcpu_chunk the regions to be flushed belong to 665 * @addr: address for which the chunk needs to be determined.
799 * @page_start: page index of the first page to be flushed
800 * @page_end: page index of the last page to be flushed + 1
801 *
802 * Pages [@page_start,@page_end) of @chunk have been unmapped. Flush
803 * TLB for the regions. This can be skipped if the area is to be
804 * returned to vmalloc as vmalloc will handle TLB flushing lazily.
805 * 666 *
806 * As with pcpu_pre_unmap_flush(), TLB flushing also is done at once 667 * RETURNS:
807 * for the whole region. 668 * The address of the found chunk.
808 */
809static void pcpu_post_unmap_tlb_flush(struct pcpu_chunk *chunk,
810 int page_start, int page_end)
811{
812 flush_tlb_kernel_range(
813 pcpu_chunk_addr(chunk, pcpu_first_unit_cpu, page_start),
814 pcpu_chunk_addr(chunk, pcpu_last_unit_cpu, page_end));
815}
816
817static int __pcpu_map_pages(unsigned long addr, struct page **pages,
818 int nr_pages)
819{
820 return map_kernel_range_noflush(addr, nr_pages << PAGE_SHIFT,
821 PAGE_KERNEL, pages);
822}
823
824/**
825 * pcpu_map_pages - map pages into a pcpu_chunk
826 * @chunk: chunk of interest
827 * @pages: pages array containing pages to be mapped
828 * @populated: populated bitmap
829 * @page_start: page index of the first page to map
830 * @page_end: page index of the last page to map + 1
831 *
832 * For each cpu, map pages [@page_start,@page_end) into @chunk. The
833 * caller is responsible for calling pcpu_post_map_flush() after all
834 * mappings are complete.
835 *
836 * This function is responsible for setting corresponding bits in
837 * @chunk->populated bitmap and whatever is necessary for reverse
838 * lookup (addr -> chunk).
839 */ 669 */
840static int pcpu_map_pages(struct pcpu_chunk *chunk, 670static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr)
841 struct page **pages, unsigned long *populated,
842 int page_start, int page_end)
843{ 671{
844 unsigned int cpu, tcpu; 672 /* is it in the first chunk? */
845 int i, err; 673 if (pcpu_addr_in_first_chunk(addr)) {
846 674 /* is it in the reserved area? */
847 for_each_possible_cpu(cpu) { 675 if (pcpu_addr_in_reserved_chunk(addr))
848 err = __pcpu_map_pages(pcpu_chunk_addr(chunk, cpu, page_start), 676 return pcpu_reserved_chunk;
849 &pages[pcpu_page_idx(cpu, page_start)], 677 return pcpu_first_chunk;
850 page_end - page_start);
851 if (err < 0)
852 goto err;
853 }
854
855 /* mapping successful, link chunk and mark populated */
856 for (i = page_start; i < page_end; i++) {
857 for_each_possible_cpu(cpu)
858 pcpu_set_page_chunk(pages[pcpu_page_idx(cpu, i)],
859 chunk);
860 __set_bit(i, populated);
861 }
862
863 return 0;
864
865err:
866 for_each_possible_cpu(tcpu) {
867 if (tcpu == cpu)
868 break;
869 __pcpu_unmap_pages(pcpu_chunk_addr(chunk, tcpu, page_start),
870 page_end - page_start);
871 } 678 }
872 return err;
873}
874
875/**
876 * pcpu_post_map_flush - flush cache after mapping
877 * @chunk: pcpu_chunk the regions to be flushed belong to
878 * @page_start: page index of the first page to be flushed
879 * @page_end: page index of the last page to be flushed + 1
880 *
881 * Pages [@page_start,@page_end) of @chunk have been mapped. Flush
882 * cache.
883 *
884 * As with pcpu_pre_unmap_flush(), TLB flushing also is done at once
885 * for the whole region.
886 */
887static void pcpu_post_map_flush(struct pcpu_chunk *chunk,
888 int page_start, int page_end)
889{
890 flush_cache_vmap(
891 pcpu_chunk_addr(chunk, pcpu_first_unit_cpu, page_start),
892 pcpu_chunk_addr(chunk, pcpu_last_unit_cpu, page_end));
893}
894
895/**
896 * pcpu_depopulate_chunk - depopulate and unmap an area of a pcpu_chunk
897 * @chunk: chunk to depopulate
898 * @off: offset to the area to depopulate
899 * @size: size of the area to depopulate in bytes
900 * @flush: whether to flush cache and tlb or not
901 *
902 * For each cpu, depopulate and unmap pages [@page_start,@page_end)
903 * from @chunk. If @flush is true, vcache is flushed before unmapping
904 * and tlb after.
905 *
906 * CONTEXT:
907 * pcpu_alloc_mutex.
908 */
909static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size)
910{
911 int page_start = PFN_DOWN(off);
912 int page_end = PFN_UP(off + size);
913 struct page **pages;
914 unsigned long *populated;
915 int rs, re;
916
917 /* quick path, check whether it's empty already */
918 rs = page_start;
919 pcpu_next_unpop(chunk, &rs, &re, page_end);
920 if (rs == page_start && re == page_end)
921 return;
922
923 /* immutable chunks can't be depopulated */
924 WARN_ON(chunk->immutable);
925 679
926 /* 680 /*
927 * If control reaches here, there must have been at least one 681 * The address is relative to unit0 which might be unused and
928 * successful population attempt so the temp pages array must 682 * thus unmapped. Offset the address to the unit space of the
929 * be available now. 683 * current processor before looking it up in the vmalloc
684 * space. Note that any possible cpu id can be used here, so
685 * there's no need to worry about preemption or cpu hotplug.
930 */ 686 */
931 pages = pcpu_get_pages_and_bitmap(chunk, &populated, false); 687 addr += pcpu_unit_offsets[raw_smp_processor_id()];
932 BUG_ON(!pages); 688 return pcpu_get_page_chunk(pcpu_addr_to_page(addr));
933
934 /* unmap and free */
935 pcpu_pre_unmap_flush(chunk, page_start, page_end);
936
937 pcpu_for_each_pop_region(chunk, rs, re, page_start, page_end)
938 pcpu_unmap_pages(chunk, pages, populated, rs, re);
939
940 /* no need to flush tlb, vmalloc will handle it lazily */
941
942 pcpu_for_each_pop_region(chunk, rs, re, page_start, page_end)
943 pcpu_free_pages(chunk, pages, populated, rs, re);
944
945 /* commit new bitmap */
946 bitmap_copy(chunk->populated, populated, pcpu_unit_pages);
947}
948
949/**
950 * pcpu_populate_chunk - populate and map an area of a pcpu_chunk
951 * @chunk: chunk of interest
952 * @off: offset to the area to populate
953 * @size: size of the area to populate in bytes
954 *
955 * For each cpu, populate and map pages [@page_start,@page_end) into
956 * @chunk. The area is cleared on return.
957 *
958 * CONTEXT:
959 * pcpu_alloc_mutex, does GFP_KERNEL allocation.
960 */
961static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size)
962{
963 int page_start = PFN_DOWN(off);
964 int page_end = PFN_UP(off + size);
965 int free_end = page_start, unmap_end = page_start;
966 struct page **pages;
967 unsigned long *populated;
968 unsigned int cpu;
969 int rs, re, rc;
970
971 /* quick path, check whether all pages are already there */
972 rs = page_start;
973 pcpu_next_pop(chunk, &rs, &re, page_end);
974 if (rs == page_start && re == page_end)
975 goto clear;
976
977 /* need to allocate and map pages, this chunk can't be immutable */
978 WARN_ON(chunk->immutable);
979
980 pages = pcpu_get_pages_and_bitmap(chunk, &populated, true);
981 if (!pages)
982 return -ENOMEM;
983
984 /* alloc and map */
985 pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) {
986 rc = pcpu_alloc_pages(chunk, pages, populated, rs, re);
987 if (rc)
988 goto err_free;
989 free_end = re;
990 }
991
992 pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) {
993 rc = pcpu_map_pages(chunk, pages, populated, rs, re);
994 if (rc)
995 goto err_unmap;
996 unmap_end = re;
997 }
998 pcpu_post_map_flush(chunk, page_start, page_end);
999
1000 /* commit new bitmap */
1001 bitmap_copy(chunk->populated, populated, pcpu_unit_pages);
1002clear:
1003 for_each_possible_cpu(cpu)
1004 memset((void *)pcpu_chunk_addr(chunk, cpu, 0) + off, 0, size);
1005 return 0;
1006
1007err_unmap:
1008 pcpu_pre_unmap_flush(chunk, page_start, unmap_end);
1009 pcpu_for_each_unpop_region(chunk, rs, re, page_start, unmap_end)
1010 pcpu_unmap_pages(chunk, pages, populated, rs, re);
1011 pcpu_post_unmap_tlb_flush(chunk, page_start, unmap_end);
1012err_free:
1013 pcpu_for_each_unpop_region(chunk, rs, re, page_start, free_end)
1014 pcpu_free_pages(chunk, pages, populated, rs, re);
1015 return rc;
1016}
1017
1018static void free_pcpu_chunk(struct pcpu_chunk *chunk)
1019{
1020 if (!chunk)
1021 return;
1022 if (chunk->vms)
1023 pcpu_free_vm_areas(chunk->vms, pcpu_nr_groups);
1024 pcpu_mem_free(chunk->map, chunk->map_alloc * sizeof(chunk->map[0]));
1025 kfree(chunk);
1026}
1027
1028static struct pcpu_chunk *alloc_pcpu_chunk(void)
1029{
1030 struct pcpu_chunk *chunk;
1031
1032 chunk = kzalloc(pcpu_chunk_struct_size, GFP_KERNEL);
1033 if (!chunk)
1034 return NULL;
1035
1036 chunk->map = pcpu_mem_alloc(PCPU_DFL_MAP_ALLOC * sizeof(chunk->map[0]));
1037 chunk->map_alloc = PCPU_DFL_MAP_ALLOC;
1038 chunk->map[chunk->map_used++] = pcpu_unit_size;
1039
1040 chunk->vms = pcpu_get_vm_areas(pcpu_group_offsets, pcpu_group_sizes,
1041 pcpu_nr_groups, pcpu_atom_size,
1042 GFP_KERNEL);
1043 if (!chunk->vms) {
1044 free_pcpu_chunk(chunk);
1045 return NULL;
1046 }
1047
1048 INIT_LIST_HEAD(&chunk->list);
1049 chunk->free_size = pcpu_unit_size;
1050 chunk->contig_hint = pcpu_unit_size;
1051 chunk->base_addr = chunk->vms[0]->addr - pcpu_group_offsets[0];
1052
1053 return chunk;
1054} 689}
1055 690
1056/** 691/**
@@ -1142,7 +777,7 @@ restart:
1142 /* hmmm... no space left, create a new chunk */ 777 /* hmmm... no space left, create a new chunk */
1143 spin_unlock_irqrestore(&pcpu_lock, flags); 778 spin_unlock_irqrestore(&pcpu_lock, flags);
1144 779
1145 chunk = alloc_pcpu_chunk(); 780 chunk = pcpu_create_chunk();
1146 if (!chunk) { 781 if (!chunk) {
1147 err = "failed to allocate new chunk"; 782 err = "failed to allocate new chunk";
1148 goto fail_unlock_mutex; 783 goto fail_unlock_mutex;
@@ -1254,7 +889,7 @@ static void pcpu_reclaim(struct work_struct *work)
1254 889
1255 list_for_each_entry_safe(chunk, next, &todo, list) { 890 list_for_each_entry_safe(chunk, next, &todo, list) {
1256 pcpu_depopulate_chunk(chunk, 0, pcpu_unit_size); 891 pcpu_depopulate_chunk(chunk, 0, pcpu_unit_size);
1257 free_pcpu_chunk(chunk); 892 pcpu_destroy_chunk(chunk);
1258 } 893 }
1259 894
1260 mutex_unlock(&pcpu_alloc_mutex); 895 mutex_unlock(&pcpu_alloc_mutex);
@@ -1343,11 +978,14 @@ bool is_kernel_percpu_address(unsigned long addr)
1343 */ 978 */
1344phys_addr_t per_cpu_ptr_to_phys(void *addr) 979phys_addr_t per_cpu_ptr_to_phys(void *addr)
1345{ 980{
1346 if ((unsigned long)addr < VMALLOC_START || 981 if (pcpu_addr_in_first_chunk(addr)) {
1347 (unsigned long)addr >= VMALLOC_END) 982 if ((unsigned long)addr < VMALLOC_START ||
1348 return __pa(addr); 983 (unsigned long)addr >= VMALLOC_END)
1349 else 984 return __pa(addr);
1350 return page_to_phys(vmalloc_to_page(addr)); 985 else
986 return page_to_phys(vmalloc_to_page(addr));
987 } else
988 return page_to_phys(pcpu_addr_to_page(addr));
1351} 989}
1352 990
1353static inline size_t pcpu_calc_fc_sizes(size_t static_size, 991static inline size_t pcpu_calc_fc_sizes(size_t static_size,
@@ -1719,6 +1357,7 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
1719 PCPU_SETUP_BUG_ON(ai->unit_size < size_sum); 1357 PCPU_SETUP_BUG_ON(ai->unit_size < size_sum);
1720 PCPU_SETUP_BUG_ON(ai->unit_size & ~PAGE_MASK); 1358 PCPU_SETUP_BUG_ON(ai->unit_size & ~PAGE_MASK);
1721 PCPU_SETUP_BUG_ON(ai->unit_size < PCPU_MIN_UNIT_SIZE); 1359 PCPU_SETUP_BUG_ON(ai->unit_size < PCPU_MIN_UNIT_SIZE);
1360 PCPU_SETUP_BUG_ON(pcpu_verify_alloc_info(ai) < 0);
1722 1361
1723 /* process group information and build config tables accordingly */ 1362 /* process group information and build config tables accordingly */
1724 group_offsets = alloc_bootmem(ai->nr_groups * sizeof(group_offsets[0])); 1363 group_offsets = alloc_bootmem(ai->nr_groups * sizeof(group_offsets[0]));
diff --git a/mm/readahead.c b/mm/readahead.c
index dfa9a1a03a11..77506a291a2d 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -523,7 +523,7 @@ EXPORT_SYMBOL_GPL(page_cache_sync_readahead);
523 * @req_size: hint: total size of the read which the caller is performing in 523 * @req_size: hint: total size of the read which the caller is performing in
524 * pagecache pages 524 * pagecache pages
525 * 525 *
526 * page_cache_async_ondemand() should be called when a page is used which 526 * page_cache_async_readahead() should be called when a page is used which
527 * has the PG_readahead flag; this is a marker to suggest that the application 527 * has the PG_readahead flag; this is a marker to suggest that the application
528 * has used up enough of the readahead window that we should start pulling in 528 * has used up enough of the readahead window that we should start pulling in
529 * more pages. 529 * more pages.
diff --git a/mm/rmap.c b/mm/rmap.c
index 0feeef860a8f..38a336e2eea1 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -250,7 +250,7 @@ static void anon_vma_unlink(struct anon_vma_chain *anon_vma_chain)
250 list_del(&anon_vma_chain->same_anon_vma); 250 list_del(&anon_vma_chain->same_anon_vma);
251 251
252 /* We must garbage collect the anon_vma if it's empty */ 252 /* We must garbage collect the anon_vma if it's empty */
253 empty = list_empty(&anon_vma->head) && !ksm_refcount(anon_vma); 253 empty = list_empty(&anon_vma->head) && !anonvma_external_refcount(anon_vma);
254 spin_unlock(&anon_vma->lock); 254 spin_unlock(&anon_vma->lock);
255 255
256 if (empty) 256 if (empty)
@@ -274,7 +274,7 @@ static void anon_vma_ctor(void *data)
274 struct anon_vma *anon_vma = data; 274 struct anon_vma *anon_vma = data;
275 275
276 spin_lock_init(&anon_vma->lock); 276 spin_lock_init(&anon_vma->lock);
277 ksm_refcount_init(anon_vma); 277 anonvma_external_refcount_init(anon_vma);
278 INIT_LIST_HEAD(&anon_vma->head); 278 INIT_LIST_HEAD(&anon_vma->head);
279} 279}
280 280
@@ -1131,6 +1131,20 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount,
1131 return ret; 1131 return ret;
1132} 1132}
1133 1133
1134static bool is_vma_temporary_stack(struct vm_area_struct *vma)
1135{
1136 int maybe_stack = vma->vm_flags & (VM_GROWSDOWN | VM_GROWSUP);
1137
1138 if (!maybe_stack)
1139 return false;
1140
1141 if ((vma->vm_flags & VM_STACK_INCOMPLETE_SETUP) ==
1142 VM_STACK_INCOMPLETE_SETUP)
1143 return true;
1144
1145 return false;
1146}
1147
1134/** 1148/**
1135 * try_to_unmap_anon - unmap or unlock anonymous page using the object-based 1149 * try_to_unmap_anon - unmap or unlock anonymous page using the object-based
1136 * rmap method 1150 * rmap method
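The new is_vma_temporary_stack() relies on a two-step flag test: the VMA must look like a stack at all, and then every bit of the incomplete-setup marker must be set, not just any one of them. A compilable model of that pattern; the flag values are illustrative, not the kernel's VM_* encoding:

#include <stdbool.h>
#include <stdio.h>

#define F_GROWSDOWN		0x01
#define F_GROWSUP		0x02
#define F_INCOMPLETE_SETUP	(0x10 | 0x20)	/* multi-bit marker, like VM_STACK_INCOMPLETE_SETUP */

static bool is_temporary_stack(unsigned long vm_flags)
{
	if (!(vm_flags & (F_GROWSDOWN | F_GROWSUP)))
		return false;			/* not a stack-like mapping */

	/* all marker bits must be present, hence (flags & mask) == mask */
	return (vm_flags & F_INCOMPLETE_SETUP) == F_INCOMPLETE_SETUP;
}

int main(void)
{
	printf("%d\n", is_temporary_stack(F_GROWSDOWN | F_INCOMPLETE_SETUP)); /* 1 */
	printf("%d\n", is_temporary_stack(F_GROWSDOWN | 0x10));               /* 0: only half the marker */
	printf("%d\n", is_temporary_stack(F_INCOMPLETE_SETUP));               /* 0: not a stack VMA */
	return 0;
}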
@@ -1159,7 +1173,21 @@ static int try_to_unmap_anon(struct page *page, enum ttu_flags flags)
1159 1173
1160 list_for_each_entry(avc, &anon_vma->head, same_anon_vma) { 1174 list_for_each_entry(avc, &anon_vma->head, same_anon_vma) {
1161 struct vm_area_struct *vma = avc->vma; 1175 struct vm_area_struct *vma = avc->vma;
1162 unsigned long address = vma_address(page, vma); 1176 unsigned long address;
1177
1178 /*
1179 * During exec, a temporary VMA is setup and later moved.
1180 * The VMA is moved under the anon_vma lock but not the
1181 * page tables leading to a race where migration cannot
1182 * find the migration ptes. Rather than increasing the
1183 * locking requirements of exec(), migration skips
1184 * temporary VMAs until after exec() completes.
1185 */
1186 if (PAGE_MIGRATION && (flags & TTU_MIGRATION) &&
1187 is_vma_temporary_stack(vma))
1188 continue;
1189
1190 address = vma_address(page, vma);
1163 if (address == -EFAULT) 1191 if (address == -EFAULT)
1164 continue; 1192 continue;
1165 ret = try_to_unmap_one(page, vma, address, flags); 1193 ret = try_to_unmap_one(page, vma, address, flags);
@@ -1355,10 +1383,8 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *,
1355 /* 1383 /*
1356 * Note: remove_migration_ptes() cannot use page_lock_anon_vma() 1384 * Note: remove_migration_ptes() cannot use page_lock_anon_vma()
1357 * because that depends on page_mapped(); but not all its usages 1385 * because that depends on page_mapped(); but not all its usages
1358 * are holding mmap_sem, which also gave the necessary guarantee 1386 * are holding mmap_sem. Users without mmap_sem are required to
1359 * (that this anon_vma's slab has not already been destroyed). 1387 * take a reference count to prevent the anon_vma disappearing
1360 * This needs to be reviewed later: avoiding page_lock_anon_vma()
1361 * is risky, and currently limits the usefulness of rmap_walk().
1362 */ 1388 */
1363 anon_vma = page_anon_vma(page); 1389 anon_vma = page_anon_vma(page);
1364 if (!anon_vma) 1390 if (!anon_vma)
diff --git a/mm/shmem.c b/mm/shmem.c
index eef4ebea5158..855eaf5b8d5b 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -433,8 +433,6 @@ static swp_entry_t *shmem_swp_alloc(struct shmem_inode_info *info, unsigned long
433 433
434 spin_unlock(&info->lock); 434 spin_unlock(&info->lock);
435 page = shmem_dir_alloc(mapping_gfp_mask(inode->i_mapping)); 435 page = shmem_dir_alloc(mapping_gfp_mask(inode->i_mapping));
436 if (page)
437 set_page_private(page, 0);
438 spin_lock(&info->lock); 436 spin_lock(&info->lock);
439 437
440 if (!page) { 438 if (!page) {
@@ -1545,8 +1543,8 @@ static int shmem_mmap(struct file *file, struct vm_area_struct *vma)
1545 return 0; 1543 return 0;
1546} 1544}
1547 1545
1548static struct inode *shmem_get_inode(struct super_block *sb, int mode, 1546static struct inode *shmem_get_inode(struct super_block *sb, const struct inode *dir,
1549 dev_t dev, unsigned long flags) 1547 int mode, dev_t dev, unsigned long flags)
1550{ 1548{
1551 struct inode *inode; 1549 struct inode *inode;
1552 struct shmem_inode_info *info; 1550 struct shmem_inode_info *info;
@@ -1557,9 +1555,7 @@ static struct inode *shmem_get_inode(struct super_block *sb, int mode,
1557 1555
1558 inode = new_inode(sb); 1556 inode = new_inode(sb);
1559 if (inode) { 1557 if (inode) {
1560 inode->i_mode = mode; 1558 inode_init_owner(inode, dir, mode);
1561 inode->i_uid = current_fsuid();
1562 inode->i_gid = current_fsgid();
1563 inode->i_blocks = 0; 1559 inode->i_blocks = 0;
1564 inode->i_mapping->backing_dev_info = &shmem_backing_dev_info; 1560 inode->i_mapping->backing_dev_info = &shmem_backing_dev_info;
1565 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; 1561 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
@@ -1814,7 +1810,7 @@ shmem_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev)
1814 struct inode *inode; 1810 struct inode *inode;
1815 int error = -ENOSPC; 1811 int error = -ENOSPC;
1816 1812
1817 inode = shmem_get_inode(dir->i_sb, mode, dev, VM_NORESERVE); 1813 inode = shmem_get_inode(dir->i_sb, dir, mode, dev, VM_NORESERVE);
1818 if (inode) { 1814 if (inode) {
1819 error = security_inode_init_security(inode, dir, NULL, NULL, 1815 error = security_inode_init_security(inode, dir, NULL, NULL,
1820 NULL); 1816 NULL);
@@ -1833,11 +1829,6 @@ shmem_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev)
1833#else 1829#else
1834 error = 0; 1830 error = 0;
1835#endif 1831#endif
1836 if (dir->i_mode & S_ISGID) {
1837 inode->i_gid = dir->i_gid;
1838 if (S_ISDIR(mode))
1839 inode->i_mode |= S_ISGID;
1840 }
1841 dir->i_size += BOGO_DIRENT_SIZE; 1832 dir->i_size += BOGO_DIRENT_SIZE;
1842 dir->i_ctime = dir->i_mtime = CURRENT_TIME; 1833 dir->i_ctime = dir->i_mtime = CURRENT_TIME;
1843 d_instantiate(dentry, inode); 1834 d_instantiate(dentry, inode);
@@ -1957,7 +1948,7 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s
1957 if (len > PAGE_CACHE_SIZE) 1948 if (len > PAGE_CACHE_SIZE)
1958 return -ENAMETOOLONG; 1949 return -ENAMETOOLONG;
1959 1950
1960 inode = shmem_get_inode(dir->i_sb, S_IFLNK|S_IRWXUGO, 0, VM_NORESERVE); 1951 inode = shmem_get_inode(dir->i_sb, dir, S_IFLNK|S_IRWXUGO, 0, VM_NORESERVE);
1961 if (!inode) 1952 if (!inode)
1962 return -ENOSPC; 1953 return -ENOSPC;
1963 1954
@@ -1992,8 +1983,6 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s
1992 unlock_page(page); 1983 unlock_page(page);
1993 page_cache_release(page); 1984 page_cache_release(page);
1994 } 1985 }
1995 if (dir->i_mode & S_ISGID)
1996 inode->i_gid = dir->i_gid;
1997 dir->i_size += BOGO_DIRENT_SIZE; 1986 dir->i_size += BOGO_DIRENT_SIZE;
1998 dir->i_ctime = dir->i_mtime = CURRENT_TIME; 1987 dir->i_ctime = dir->i_mtime = CURRENT_TIME;
1999 d_instantiate(dentry, inode); 1988 d_instantiate(dentry, inode);
@@ -2071,14 +2060,14 @@ static int shmem_xattr_security_set(struct dentry *dentry, const char *name,
2071 size, flags); 2060 size, flags);
2072} 2061}
2073 2062
2074static struct xattr_handler shmem_xattr_security_handler = { 2063static const struct xattr_handler shmem_xattr_security_handler = {
2075 .prefix = XATTR_SECURITY_PREFIX, 2064 .prefix = XATTR_SECURITY_PREFIX,
2076 .list = shmem_xattr_security_list, 2065 .list = shmem_xattr_security_list,
2077 .get = shmem_xattr_security_get, 2066 .get = shmem_xattr_security_get,
2078 .set = shmem_xattr_security_set, 2067 .set = shmem_xattr_security_set,
2079}; 2068};
2080 2069
2081static struct xattr_handler *shmem_xattr_handlers[] = { 2070static const struct xattr_handler *shmem_xattr_handlers[] = {
2082 &generic_acl_access_handler, 2071 &generic_acl_access_handler,
2083 &generic_acl_default_handler, 2072 &generic_acl_default_handler,
2084 &shmem_xattr_security_handler, 2073 &shmem_xattr_security_handler,
@@ -2366,7 +2355,7 @@ int shmem_fill_super(struct super_block *sb, void *data, int silent)
2366 sb->s_flags |= MS_POSIXACL; 2355 sb->s_flags |= MS_POSIXACL;
2367#endif 2356#endif
2368 2357
2369 inode = shmem_get_inode(sb, S_IFDIR | sbinfo->mode, 0, VM_NORESERVE); 2358 inode = shmem_get_inode(sb, NULL, S_IFDIR | sbinfo->mode, 0, VM_NORESERVE);
2370 if (!inode) 2359 if (!inode)
2371 goto failed; 2360 goto failed;
2372 inode->i_uid = sbinfo->uid; 2361 inode->i_uid = sbinfo->uid;
@@ -2570,6 +2559,45 @@ out4:
2570 return error; 2559 return error;
2571} 2560}
2572 2561
2562#ifdef CONFIG_CGROUP_MEM_RES_CTLR
2563/**
2564 * mem_cgroup_get_shmem_target - find a page or entry assigned to the shmem file
2565 * @inode: the inode to be searched
2566 * @pgoff: the offset to be searched
2567 * @pagep: the pointer for the found page to be stored
2568 * @ent: the pointer for the found swap entry to be stored
2569 *
2570 * If a page is found, refcount of it is incremented. Callers should handle
2571 * these refcount.
2572 */
2573void mem_cgroup_get_shmem_target(struct inode *inode, pgoff_t pgoff,
2574 struct page **pagep, swp_entry_t *ent)
2575{
2576 swp_entry_t entry = { .val = 0 }, *ptr;
2577 struct page *page = NULL;
2578 struct shmem_inode_info *info = SHMEM_I(inode);
2579
2580 if ((pgoff << PAGE_CACHE_SHIFT) >= i_size_read(inode))
2581 goto out;
2582
2583 spin_lock(&info->lock);
2584 ptr = shmem_swp_entry(info, pgoff, NULL);
2585#ifdef CONFIG_SWAP
2586 if (ptr && ptr->val) {
2587 entry.val = ptr->val;
2588 page = find_get_page(&swapper_space, entry.val);
2589 } else
2590#endif
2591 page = find_get_page(inode->i_mapping, pgoff);
2592 if (ptr)
2593 shmem_swp_unmap(ptr);
2594 spin_unlock(&info->lock);
2595out:
2596 *pagep = page;
2597 *ent = entry;
2598}
2599#endif
2600
2573#else /* !CONFIG_SHMEM */ 2601#else /* !CONFIG_SHMEM */
2574 2602
2575/* 2603/*
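The new mem_cgroup_get_shmem_target() follows a simple lookup order: offsets past i_size yield nothing; otherwise a stored swap entry is reported (and looked up in the swap cache), and only a resident object falls back to the page cache. A small sketch of that control flow; struct result and the cache parameters are invented stand-ins for the kernel's page/swp_entry_t outputs:

#include <stdio.h>

struct result {
	void          *page;		/* found page, if any */
	unsigned long  swap_val;	/* found swap entry value, 0 if none */
};

static struct result shmem_target(long pgoff, long nr_pages,
				  unsigned long stored_swap,
				  void *swap_cache_page, void *page_cache_page)
{
	struct result r = { NULL, 0 };

	if (pgoff >= nr_pages)			/* offset past i_size: nothing to report */
		return r;

	if (stored_swap) {			/* swapped out: report the entry... */
		r.swap_val = stored_swap;
		r.page = swap_cache_page;	/* ...plus whatever the swap cache holds */
	} else {
		r.page = page_cache_page;	/* resident: plain page cache lookup */
	}
	return r;
}

int main(void)
{
	int dummy_page;
	struct result r = shmem_target(3, 10, 0, NULL, &dummy_page);

	printf("page=%p swap=%lu\n", r.page, r.swap_val);
	return 0;
}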
@@ -2609,9 +2637,34 @@ int shmem_lock(struct file *file, int lock, struct user_struct *user)
2609 return 0; 2637 return 0;
2610} 2638}
2611 2639
2640#ifdef CONFIG_CGROUP_MEM_RES_CTLR
2641/**
2642 * mem_cgroup_get_shmem_target - find a page or entry assigned to the shmem file
2643 * @inode: the inode to be searched
2644 * @pgoff: the offset to be searched
2645 * @pagep: the pointer for the found page to be stored
2646 * @ent: the pointer for the found swap entry to be stored
2647 *
2648 * If a page is found, refcount of it is incremented. Callers should handle
2649 * these refcount.
2650 */
2651void mem_cgroup_get_shmem_target(struct inode *inode, pgoff_t pgoff,
2652 struct page **pagep, swp_entry_t *ent)
2653{
2654 struct page *page = NULL;
2655
2656 if ((pgoff << PAGE_CACHE_SHIFT) >= i_size_read(inode))
2657 goto out;
2658 page = find_get_page(inode->i_mapping, pgoff);
2659out:
2660 *pagep = page;
2661 *ent = (swp_entry_t){ .val = 0 };
2662}
2663#endif
2664
2612#define shmem_vm_ops generic_file_vm_ops 2665#define shmem_vm_ops generic_file_vm_ops
2613#define shmem_file_operations ramfs_file_operations 2666#define shmem_file_operations ramfs_file_operations
2614#define shmem_get_inode(sb, mode, dev, flags) ramfs_get_inode(sb, mode, dev) 2667#define shmem_get_inode(sb, dir, mode, dev, flags) ramfs_get_inode(sb, dir, mode, dev)
2615#define shmem_acct_size(flags, size) 0 2668#define shmem_acct_size(flags, size) 0
2616#define shmem_unacct_size(flags, size) do {} while (0) 2669#define shmem_unacct_size(flags, size) do {} while (0)
2617#define SHMEM_MAX_BYTES MAX_LFS_FILESIZE 2670#define SHMEM_MAX_BYTES MAX_LFS_FILESIZE
@@ -2655,7 +2708,7 @@ struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags
2655 path.mnt = mntget(shm_mnt); 2708 path.mnt = mntget(shm_mnt);
2656 2709
2657 error = -ENOSPC; 2710 error = -ENOSPC;
2658 inode = shmem_get_inode(root->d_sb, S_IFREG | S_IRWXUGO, 0, flags); 2711 inode = shmem_get_inode(root->d_sb, NULL, S_IFREG | S_IRWXUGO, 0, flags);
2659 if (!inode) 2712 if (!inode)
2660 goto put_dentry; 2713 goto put_dentry;
2661 2714
diff --git a/mm/slab.c b/mm/slab.c
index bac0f4fcc216..e49f8f46f46d 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -115,6 +115,7 @@
115#include <linux/reciprocal_div.h> 115#include <linux/reciprocal_div.h>
116#include <linux/debugobjects.h> 116#include <linux/debugobjects.h>
117#include <linux/kmemcheck.h> 117#include <linux/kmemcheck.h>
118#include <linux/memory.h>
118 119
119#include <asm/cacheflush.h> 120#include <asm/cacheflush.h>
120#include <asm/tlbflush.h> 121#include <asm/tlbflush.h>
@@ -144,30 +145,6 @@
144#define BYTES_PER_WORD sizeof(void *) 145#define BYTES_PER_WORD sizeof(void *)
145#define REDZONE_ALIGN max(BYTES_PER_WORD, __alignof__(unsigned long long)) 146#define REDZONE_ALIGN max(BYTES_PER_WORD, __alignof__(unsigned long long))
146 147
147#ifndef ARCH_KMALLOC_MINALIGN
148/*
149 * Enforce a minimum alignment for the kmalloc caches.
150 * Usually, the kmalloc caches are cache_line_size() aligned, except when
151 * DEBUG and FORCED_DEBUG are enabled, then they are BYTES_PER_WORD aligned.
152 * Some archs want to perform DMA into kmalloc caches and need a guaranteed
153 * alignment larger than the alignment of a 64-bit integer.
154 * ARCH_KMALLOC_MINALIGN allows that.
155 * Note that increasing this value may disable some debug features.
156 */
157#define ARCH_KMALLOC_MINALIGN __alignof__(unsigned long long)
158#endif
159
160#ifndef ARCH_SLAB_MINALIGN
161/*
162 * Enforce a minimum alignment for all caches.
163 * Intended for archs that get misalignment faults even for BYTES_PER_WORD
164 * aligned buffers. Includes ARCH_KMALLOC_MINALIGN.
165 * If possible: Do not enable this flag for CONFIG_DEBUG_SLAB, it disables
166 * some debug features.
167 */
168#define ARCH_SLAB_MINALIGN 0
169#endif
170
171#ifndef ARCH_KMALLOC_FLAGS 148#ifndef ARCH_KMALLOC_FLAGS
172#define ARCH_KMALLOC_FLAGS SLAB_HWCACHE_ALIGN 149#define ARCH_KMALLOC_FLAGS SLAB_HWCACHE_ALIGN
173#endif 150#endif
@@ -844,7 +821,7 @@ static void init_reap_node(int cpu)
844{ 821{
845 int node; 822 int node;
846 823
847 node = next_node(cpu_to_node(cpu), node_online_map); 824 node = next_node(cpu_to_mem(cpu), node_online_map);
848 if (node == MAX_NUMNODES) 825 if (node == MAX_NUMNODES)
849 node = first_node(node_online_map); 826 node = first_node(node_online_map);
850 827
@@ -1073,7 +1050,7 @@ static inline int cache_free_alien(struct kmem_cache *cachep, void *objp)
1073 struct array_cache *alien = NULL; 1050 struct array_cache *alien = NULL;
1074 int node; 1051 int node;
1075 1052
1076 node = numa_node_id(); 1053 node = numa_mem_id();
1077 1054
1078 /* 1055 /*
1079 * Make sure we are not freeing a object from another node to the array 1056 * Make sure we are not freeing a object from another node to the array
@@ -1102,11 +1079,57 @@ static inline int cache_free_alien(struct kmem_cache *cachep, void *objp)
1102} 1079}
1103#endif 1080#endif
1104 1081
1082/*
1083 * Allocates and initializes nodelists for a node on each slab cache, used for
1084 * either memory or cpu hotplug. If memory is being hot-added, the kmem_list3
1085 * will be allocated off-node since memory is not yet online for the new node.
1086 * When hotplugging memory or a cpu, existing nodelists are not replaced if
1087 * already in use.
1088 *
1089 * Must hold cache_chain_mutex.
1090 */
1091static int init_cache_nodelists_node(int node)
1092{
1093 struct kmem_cache *cachep;
1094 struct kmem_list3 *l3;
1095 const int memsize = sizeof(struct kmem_list3);
1096
1097 list_for_each_entry(cachep, &cache_chain, next) {
1098 /*
1099 * Set up the size64 kmemlist for cpu before we can
1100 * begin anything. Make sure some other cpu on this
1101 * node has not already allocated this
1102 */
1103 if (!cachep->nodelists[node]) {
1104 l3 = kmalloc_node(memsize, GFP_KERNEL, node);
1105 if (!l3)
1106 return -ENOMEM;
1107 kmem_list3_init(l3);
1108 l3->next_reap = jiffies + REAPTIMEOUT_LIST3 +
1109 ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
1110
1111 /*
1112 * The l3s don't come and go as CPUs come and
1113 * go. cache_chain_mutex is sufficient
1114 * protection here.
1115 */
1116 cachep->nodelists[node] = l3;
1117 }
1118
1119 spin_lock_irq(&cachep->nodelists[node]->list_lock);
1120 cachep->nodelists[node]->free_limit =
1121 (1 + nr_cpus_node(node)) *
1122 cachep->batchcount + cachep->num;
1123 spin_unlock_irq(&cachep->nodelists[node]->list_lock);
1124 }
1125 return 0;
1126}
1127
1105static void __cpuinit cpuup_canceled(long cpu) 1128static void __cpuinit cpuup_canceled(long cpu)
1106{ 1129{
1107 struct kmem_cache *cachep; 1130 struct kmem_cache *cachep;
1108 struct kmem_list3 *l3 = NULL; 1131 struct kmem_list3 *l3 = NULL;
1109 int node = cpu_to_node(cpu); 1132 int node = cpu_to_mem(cpu);
1110 const struct cpumask *mask = cpumask_of_node(node); 1133 const struct cpumask *mask = cpumask_of_node(node);
1111 1134
1112 list_for_each_entry(cachep, &cache_chain, next) { 1135 list_for_each_entry(cachep, &cache_chain, next) {
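The factored-out init_cache_nodelists_node() applies an allocate-if-absent pattern to every cache: under the chain mutex, set up the per-node structure only if no other path installed one yet, then refresh its free limit. A userspace sketch of that pattern with pthreads; it collapses the kernel's separate cache_chain_mutex and per-node list_lock into one mutex, and the types and limit formula inputs are illustrative:

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

#define MAX_NODES 4

struct node_list { int free_limit; };

static struct node_list *nodelists[MAX_NODES];
static pthread_mutex_t chain_mutex = PTHREAD_MUTEX_INITIALIZER;

static int init_nodelists_node(int node, int cpus_on_node, int batchcount, int num)
{
	int ret = 0;

	pthread_mutex_lock(&chain_mutex);
	if (!nodelists[node]) {			/* nobody has set this node up yet */
		nodelists[node] = calloc(1, sizeof(*nodelists[node]));
		if (!nodelists[node]) {
			ret = -1;		/* -ENOMEM in the kernel version */
			goto out;
		}
	}
	/* recompute the free limit for the (possibly grown) node */
	nodelists[node]->free_limit = (1 + cpus_on_node) * batchcount + num;
out:
	pthread_mutex_unlock(&chain_mutex);
	return ret;
}

int main(void)
{
	if (init_nodelists_node(1, 2, 16, 8) == 0)
		printf("node 1 free_limit = %d\n", nodelists[1]->free_limit);
	return 0;
}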
@@ -1171,8 +1194,8 @@ static int __cpuinit cpuup_prepare(long cpu)
1171{ 1194{
1172 struct kmem_cache *cachep; 1195 struct kmem_cache *cachep;
1173 struct kmem_list3 *l3 = NULL; 1196 struct kmem_list3 *l3 = NULL;
1174 int node = cpu_to_node(cpu); 1197 int node = cpu_to_mem(cpu);
1175 const int memsize = sizeof(struct kmem_list3); 1198 int err;
1176 1199
1177 /* 1200 /*
1178 * We need to do this right in the beginning since 1201 * We need to do this right in the beginning since
@@ -1180,35 +1203,9 @@ static int __cpuinit cpuup_prepare(long cpu)
1180 * kmalloc_node allows us to add the slab to the right 1203 * kmalloc_node allows us to add the slab to the right
1181 * kmem_list3 and not this cpu's kmem_list3 1204 * kmem_list3 and not this cpu's kmem_list3
1182 */ 1205 */
1183 1206 err = init_cache_nodelists_node(node);
1184 list_for_each_entry(cachep, &cache_chain, next) { 1207 if (err < 0)
1185 /* 1208 goto bad;
1186 * Set up the size64 kmemlist for cpu before we can
1187 * begin anything. Make sure some other cpu on this
1188 * node has not already allocated this
1189 */
1190 if (!cachep->nodelists[node]) {
1191 l3 = kmalloc_node(memsize, GFP_KERNEL, node);
1192 if (!l3)
1193 goto bad;
1194 kmem_list3_init(l3);
1195 l3->next_reap = jiffies + REAPTIMEOUT_LIST3 +
1196 ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
1197
1198 /*
1199 * The l3s don't come and go as CPUs come and
1200 * go. cache_chain_mutex is sufficient
1201 * protection here.
1202 */
1203 cachep->nodelists[node] = l3;
1204 }
1205
1206 spin_lock_irq(&cachep->nodelists[node]->list_lock);
1207 cachep->nodelists[node]->free_limit =
1208 (1 + nr_cpus_node(node)) *
1209 cachep->batchcount + cachep->num;
1210 spin_unlock_irq(&cachep->nodelists[node]->list_lock);
1211 }
1212 1209
1213 /* 1210 /*
1214 * Now we can go ahead with allocating the shared arrays and 1211 * Now we can go ahead with allocating the shared arrays and
@@ -1324,18 +1321,82 @@ static int __cpuinit cpuup_callback(struct notifier_block *nfb,
1324 mutex_unlock(&cache_chain_mutex); 1321 mutex_unlock(&cache_chain_mutex);
1325 break; 1322 break;
1326 } 1323 }
1327 return err ? NOTIFY_BAD : NOTIFY_OK; 1324 return notifier_from_errno(err);
1328} 1325}
1329 1326
1330static struct notifier_block __cpuinitdata cpucache_notifier = { 1327static struct notifier_block __cpuinitdata cpucache_notifier = {
1331 &cpuup_callback, NULL, 0 1328 &cpuup_callback, NULL, 0
1332}; 1329};
1333 1330
1331#if defined(CONFIG_NUMA) && defined(CONFIG_MEMORY_HOTPLUG)
1332/*
1333 * Drains freelist for a node on each slab cache, used for memory hot-remove.
1334 * Returns -EBUSY if all objects cannot be drained so that the node is not
1335 * removed.
1336 *
1337 * Must hold cache_chain_mutex.
1338 */
1339static int __meminit drain_cache_nodelists_node(int node)
1340{
1341 struct kmem_cache *cachep;
1342 int ret = 0;
1343
1344 list_for_each_entry(cachep, &cache_chain, next) {
1345 struct kmem_list3 *l3;
1346
1347 l3 = cachep->nodelists[node];
1348 if (!l3)
1349 continue;
1350
1351 drain_freelist(cachep, l3, l3->free_objects);
1352
1353 if (!list_empty(&l3->slabs_full) ||
1354 !list_empty(&l3->slabs_partial)) {
1355 ret = -EBUSY;
1356 break;
1357 }
1358 }
1359 return ret;
1360}
1361
1362static int __meminit slab_memory_callback(struct notifier_block *self,
1363 unsigned long action, void *arg)
1364{
1365 struct memory_notify *mnb = arg;
1366 int ret = 0;
1367 int nid;
1368
1369 nid = mnb->status_change_nid;
1370 if (nid < 0)
1371 goto out;
1372
1373 switch (action) {
1374 case MEM_GOING_ONLINE:
1375 mutex_lock(&cache_chain_mutex);
1376 ret = init_cache_nodelists_node(nid);
1377 mutex_unlock(&cache_chain_mutex);
1378 break;
1379 case MEM_GOING_OFFLINE:
1380 mutex_lock(&cache_chain_mutex);
1381 ret = drain_cache_nodelists_node(nid);
1382 mutex_unlock(&cache_chain_mutex);
1383 break;
1384 case MEM_ONLINE:
1385 case MEM_OFFLINE:
1386 case MEM_CANCEL_ONLINE:
1387 case MEM_CANCEL_OFFLINE:
1388 break;
1389 }
1390out:
1391 return ret ? notifier_from_errno(ret) : NOTIFY_OK;
1392}
1393#endif /* CONFIG_NUMA && CONFIG_MEMORY_HOTPLUG */
1394
1334/* 1395/*
1335 * swap the static kmem_list3 with kmalloced memory 1396 * swap the static kmem_list3 with kmalloced memory
1336 */ 1397 */
1337static void init_list(struct kmem_cache *cachep, struct kmem_list3 *list, 1398static void __init init_list(struct kmem_cache *cachep, struct kmem_list3 *list,
1338 int nodeid) 1399 int nodeid)
1339{ 1400{
1340 struct kmem_list3 *ptr; 1401 struct kmem_list3 *ptr;
1341 1402
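The new slab_memory_callback() and the cpuup_callback() change both use the same return convention: hand back NOTIFY_OK on success, or wrap the errno so the notifier chain stops and the caller can recover the error. A compilable model of that encode/decode idea; the bit layout below mirrors the spirit of notifier_from_errno()/notifier_to_errno() but is not claimed to match the kernel's exact values:

#include <stdio.h>

#define NOTIFY_OK_	0x0001
#define NOTIFY_STOP_MASK_ 0x8000

static int from_errno(int err)
{
	return err ? (NOTIFY_STOP_MASK_ | (NOTIFY_OK_ - err)) : NOTIFY_OK_;
}

static int to_errno(int ret)
{
	return (ret & NOTIFY_STOP_MASK_) ? NOTIFY_OK_ - (ret & ~NOTIFY_STOP_MASK_) : 0;
}

int main(void)
{
	int enomem = -12;			/* what drain/init might fail with */
	int ret = from_errno(enomem);		/* what the callback would return */

	printf("encoded %#x, decoded errno %d\n", ret, to_errno(ret));
	return 0;
}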
@@ -1418,7 +1479,7 @@ void __init kmem_cache_init(void)
1418 * 6) Resize the head arrays of the kmalloc caches to their final sizes. 1479 * 6) Resize the head arrays of the kmalloc caches to their final sizes.
1419 */ 1480 */
1420 1481
1421 node = numa_node_id(); 1482 node = numa_mem_id();
1422 1483
1423 /* 1) create the cache_cache */ 1484 /* 1) create the cache_cache */
1424 INIT_LIST_HEAD(&cache_chain); 1485 INIT_LIST_HEAD(&cache_chain);
@@ -1580,6 +1641,14 @@ void __init kmem_cache_init_late(void)
1580 */ 1641 */
1581 register_cpu_notifier(&cpucache_notifier); 1642 register_cpu_notifier(&cpucache_notifier);
1582 1643
1644#ifdef CONFIG_NUMA
1645 /*
1646 * Register a memory hotplug callback that initializes and frees
1647 * nodelists.
1648 */
1649 hotplug_memory_notifier(slab_memory_callback, SLAB_CALLBACK_PRI);
1650#endif
1651
1583 /* 1652 /*
1584 * The reap timers are started later, with a module init call: That part 1653 * The reap timers are started later, with a module init call: That part
1585 * of the kernel is not yet operational. 1654 * of the kernel is not yet operational.
@@ -2052,7 +2121,7 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp)
2052 } 2121 }
2053 } 2122 }
2054 } 2123 }
2055 cachep->nodelists[numa_node_id()]->next_reap = 2124 cachep->nodelists[numa_mem_id()]->next_reap =
2056 jiffies + REAPTIMEOUT_LIST3 + 2125 jiffies + REAPTIMEOUT_LIST3 +
2057 ((unsigned long)cachep) % REAPTIMEOUT_LIST3; 2126 ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
2058 2127
@@ -2220,8 +2289,8 @@ kmem_cache_create (const char *name, size_t size, size_t align,
2220 if (ralign < align) { 2289 if (ralign < align) {
2221 ralign = align; 2290 ralign = align;
2222 } 2291 }
2223 /* disable debug if necessary */ 2292 /* disable debug if not aligning with REDZONE_ALIGN */
2224 if (ralign > __alignof__(unsigned long long)) 2293 if (ralign & (__alignof__(unsigned long long) - 1))
2225 flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER); 2294 flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
2226 /* 2295 /*
2227 * 4) Store it. 2296 * 4) Store it.
@@ -2247,8 +2316,8 @@ kmem_cache_create (const char *name, size_t size, size_t align,
2247 */ 2316 */
2248 if (flags & SLAB_RED_ZONE) { 2317 if (flags & SLAB_RED_ZONE) {
2249 /* add space for red zone words */ 2318 /* add space for red zone words */
2250 cachep->obj_offset += sizeof(unsigned long long); 2319 cachep->obj_offset += align;
2251 size += 2 * sizeof(unsigned long long); 2320 size += align + sizeof(unsigned long long);
2252 } 2321 }
2253 if (flags & SLAB_STORE_USER) { 2322 if (flags & SLAB_STORE_USER) {
2254 /* user store requires one word storage behind the end of 2323 /* user store requires one word storage behind the end of
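The red-zone hunk above changes the leading red zone from one word to a full alignment unit, so the object after it keeps its cache alignment; the trailing red zone stays one word. A worked example of the old versus new layout arithmetic; the 24-byte object and 16-byte alignment are illustrative numbers only:

#include <stdio.h>

int main(void)
{
	unsigned long obj_size = 24;
	unsigned long align = 16;			/* plays ralign */
	unsigned long word = sizeof(unsigned long long);

	unsigned long old_off  = word;			/* old: obj_offset += sizeof(ull) */
	unsigned long old_size = obj_size + 2 * word;	/* old: size += 2 * sizeof(ull) */

	unsigned long new_off  = align;			/* new: obj_offset += align */
	unsigned long new_size = obj_size + align + word; /* new: size += align + sizeof(ull) */

	printf("old: object offset %lu (breaks %lu-byte alignment), slab size %lu\n",
	       old_off, align, old_size);
	printf("new: object offset %lu (keeps %lu-byte alignment), slab size %lu\n",
	       new_off, align, new_size);
	return 0;
}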
@@ -2383,7 +2452,7 @@ static void check_spinlock_acquired(struct kmem_cache *cachep)
2383{ 2452{
2384#ifdef CONFIG_SMP 2453#ifdef CONFIG_SMP
2385 check_irq_off(); 2454 check_irq_off();
2386 assert_spin_locked(&cachep->nodelists[numa_node_id()]->list_lock); 2455 assert_spin_locked(&cachep->nodelists[numa_mem_id()]->list_lock);
2387#endif 2456#endif
2388} 2457}
2389 2458
@@ -2410,7 +2479,7 @@ static void do_drain(void *arg)
2410{ 2479{
2411 struct kmem_cache *cachep = arg; 2480 struct kmem_cache *cachep = arg;
2412 struct array_cache *ac; 2481 struct array_cache *ac;
2413 int node = numa_node_id(); 2482 int node = numa_mem_id();
2414 2483
2415 check_irq_off(); 2484 check_irq_off();
2416 ac = cpu_cache_get(cachep); 2485 ac = cpu_cache_get(cachep);
@@ -2943,7 +3012,7 @@ static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags)
2943 3012
2944retry: 3013retry:
2945 check_irq_off(); 3014 check_irq_off();
2946 node = numa_node_id(); 3015 node = numa_mem_id();
2947 ac = cpu_cache_get(cachep); 3016 ac = cpu_cache_get(cachep);
2948 batchcount = ac->batchcount; 3017 batchcount = ac->batchcount;
2949 if (!ac->touched && batchcount > BATCHREFILL_LIMIT) { 3018 if (!ac->touched && batchcount > BATCHREFILL_LIMIT) {
@@ -3147,11 +3216,13 @@ static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags)
3147 3216
3148 if (in_interrupt() || (flags & __GFP_THISNODE)) 3217 if (in_interrupt() || (flags & __GFP_THISNODE))
3149 return NULL; 3218 return NULL;
3150 nid_alloc = nid_here = numa_node_id(); 3219 nid_alloc = nid_here = numa_mem_id();
3220 get_mems_allowed();
3151 if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD)) 3221 if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD))
3152 nid_alloc = cpuset_mem_spread_node(); 3222 nid_alloc = cpuset_slab_spread_node();
3153 else if (current->mempolicy) 3223 else if (current->mempolicy)
3154 nid_alloc = slab_node(current->mempolicy); 3224 nid_alloc = slab_node(current->mempolicy);
3225 put_mems_allowed();
3155 if (nid_alloc != nid_here) 3226 if (nid_alloc != nid_here)
3156 return ____cache_alloc_node(cachep, flags, nid_alloc); 3227 return ____cache_alloc_node(cachep, flags, nid_alloc);
3157 return NULL; 3228 return NULL;
@@ -3178,6 +3249,7 @@ static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags)
3178 if (flags & __GFP_THISNODE) 3249 if (flags & __GFP_THISNODE)
3179 return NULL; 3250 return NULL;
3180 3251
3252 get_mems_allowed();
3181 zonelist = node_zonelist(slab_node(current->mempolicy), flags); 3253 zonelist = node_zonelist(slab_node(current->mempolicy), flags);
3182 local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK); 3254 local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK);
3183 3255
@@ -3209,7 +3281,7 @@ retry:
3209 if (local_flags & __GFP_WAIT) 3281 if (local_flags & __GFP_WAIT)
3210 local_irq_enable(); 3282 local_irq_enable();
3211 kmem_flagcheck(cache, flags); 3283 kmem_flagcheck(cache, flags);
3212 obj = kmem_getpages(cache, local_flags, numa_node_id()); 3284 obj = kmem_getpages(cache, local_flags, numa_mem_id());
3213 if (local_flags & __GFP_WAIT) 3285 if (local_flags & __GFP_WAIT)
3214 local_irq_disable(); 3286 local_irq_disable();
3215 if (obj) { 3287 if (obj) {
@@ -3233,6 +3305,7 @@ retry:
3233 } 3305 }
3234 } 3306 }
3235 } 3307 }
3308 put_mems_allowed();
3236 return obj; 3309 return obj;
3237} 3310}
3238 3311
@@ -3316,6 +3389,7 @@ __cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid,
3316{ 3389{
3317 unsigned long save_flags; 3390 unsigned long save_flags;
3318 void *ptr; 3391 void *ptr;
3392 int slab_node = numa_mem_id();
3319 3393
3320 flags &= gfp_allowed_mask; 3394 flags &= gfp_allowed_mask;
3321 3395
@@ -3328,7 +3402,7 @@ __cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid,
3328 local_irq_save(save_flags); 3402 local_irq_save(save_flags);
3329 3403
3330 if (nodeid == -1) 3404 if (nodeid == -1)
3331 nodeid = numa_node_id(); 3405 nodeid = slab_node;
3332 3406
3333 if (unlikely(!cachep->nodelists[nodeid])) { 3407 if (unlikely(!cachep->nodelists[nodeid])) {
3334 /* Node not bootstrapped yet */ 3408 /* Node not bootstrapped yet */
@@ -3336,7 +3410,7 @@ __cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid,
3336 goto out; 3410 goto out;
3337 } 3411 }
3338 3412
3339 if (nodeid == numa_node_id()) { 3413 if (nodeid == slab_node) {
3340 /* 3414 /*
3341 * Use the locally cached objects if possible. 3415 * Use the locally cached objects if possible.
3342 * However ____cache_alloc does not allow fallback 3416 * However ____cache_alloc does not allow fallback
@@ -3380,8 +3454,8 @@ __do_cache_alloc(struct kmem_cache *cache, gfp_t flags)
3380 * We may just have run out of memory on the local node. 3454 * We may just have run out of memory on the local node.
3381 * ____cache_alloc_node() knows how to locate memory on other nodes 3455 * ____cache_alloc_node() knows how to locate memory on other nodes
3382 */ 3456 */
3383 if (!objp) 3457 if (!objp)
3384 objp = ____cache_alloc_node(cache, flags, numa_node_id()); 3458 objp = ____cache_alloc_node(cache, flags, numa_mem_id());
3385 3459
3386 out: 3460 out:
3387 return objp; 3461 return objp;
@@ -3478,7 +3552,7 @@ static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac)
3478{ 3552{
3479 int batchcount; 3553 int batchcount;
3480 struct kmem_list3 *l3; 3554 struct kmem_list3 *l3;
3481 int node = numa_node_id(); 3555 int node = numa_mem_id();
3482 3556
3483 batchcount = ac->batchcount; 3557 batchcount = ac->batchcount;
3484#if DEBUG 3558#if DEBUG
@@ -3912,7 +3986,7 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
3912 return -ENOMEM; 3986 return -ENOMEM;
3913 3987
3914 for_each_online_cpu(i) { 3988 for_each_online_cpu(i) {
3915 new->new[i] = alloc_arraycache(cpu_to_node(i), limit, 3989 new->new[i] = alloc_arraycache(cpu_to_mem(i), limit,
3916 batchcount, gfp); 3990 batchcount, gfp);
3917 if (!new->new[i]) { 3991 if (!new->new[i]) {
3918 for (i--; i >= 0; i--) 3992 for (i--; i >= 0; i--)
@@ -3934,9 +4008,9 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
3934 struct array_cache *ccold = new->new[i]; 4008 struct array_cache *ccold = new->new[i];
3935 if (!ccold) 4009 if (!ccold)
3936 continue; 4010 continue;
3937 spin_lock_irq(&cachep->nodelists[cpu_to_node(i)]->list_lock); 4011 spin_lock_irq(&cachep->nodelists[cpu_to_mem(i)]->list_lock);
3938 free_block(cachep, ccold->entry, ccold->avail, cpu_to_node(i)); 4012 free_block(cachep, ccold->entry, ccold->avail, cpu_to_mem(i));
3939 spin_unlock_irq(&cachep->nodelists[cpu_to_node(i)]->list_lock); 4013 spin_unlock_irq(&cachep->nodelists[cpu_to_mem(i)]->list_lock);
3940 kfree(ccold); 4014 kfree(ccold);
3941 } 4015 }
3942 kfree(new); 4016 kfree(new);
@@ -4042,7 +4116,7 @@ static void cache_reap(struct work_struct *w)
4042{ 4116{
4043 struct kmem_cache *searchp; 4117 struct kmem_cache *searchp;
4044 struct kmem_list3 *l3; 4118 struct kmem_list3 *l3;
4045 int node = numa_node_id(); 4119 int node = numa_mem_id();
4046 struct delayed_work *work = to_delayed_work(w); 4120 struct delayed_work *work = to_delayed_work(w);
4047 4121
4048 if (!mutex_trylock(&cache_chain_mutex)) 4122 if (!mutex_trylock(&cache_chain_mutex))
@@ -4216,10 +4290,11 @@ static int s_show(struct seq_file *m, void *p)
4216 unsigned long node_frees = cachep->node_frees; 4290 unsigned long node_frees = cachep->node_frees;
4217 unsigned long overflows = cachep->node_overflow; 4291 unsigned long overflows = cachep->node_overflow;
4218 4292
4219 seq_printf(m, " : globalstat %7lu %6lu %5lu %4lu \ 4293 seq_printf(m, " : globalstat %7lu %6lu %5lu %4lu "
4220 %4lu %4lu %4lu %4lu %4lu", allocs, high, grown, 4294 "%4lu %4lu %4lu %4lu %4lu",
4221 reaped, errors, max_freeable, node_allocs, 4295 allocs, high, grown,
4222 node_frees, overflows); 4296 reaped, errors, max_freeable, node_allocs,
4297 node_frees, overflows);
4223 } 4298 }
4224 /* cpu stats */ 4299 /* cpu stats */
4225 { 4300 {
diff --git a/mm/slob.c b/mm/slob.c
index 837ebd64cc34..23631e2bb57a 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -467,14 +467,6 @@ out:
467 * End of slob allocator proper. Begin kmem_cache_alloc and kmalloc frontend. 467 * End of slob allocator proper. Begin kmem_cache_alloc and kmalloc frontend.
468 */ 468 */
469 469
470#ifndef ARCH_KMALLOC_MINALIGN
471#define ARCH_KMALLOC_MINALIGN __alignof__(unsigned long)
472#endif
473
474#ifndef ARCH_SLAB_MINALIGN
475#define ARCH_SLAB_MINALIGN __alignof__(unsigned long)
476#endif
477
478void *__kmalloc_node(size_t size, gfp_t gfp, int node) 470void *__kmalloc_node(size_t size, gfp_t gfp, int node)
479{ 471{
480 unsigned int *m; 472 unsigned int *m;
diff --git a/mm/slub.c b/mm/slub.c
index d2a54fe71ea2..26f0cb9cc584 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -157,14 +157,6 @@
157#define SLUB_MERGE_SAME (SLAB_DEBUG_FREE | SLAB_RECLAIM_ACCOUNT | \ 157#define SLUB_MERGE_SAME (SLAB_DEBUG_FREE | SLAB_RECLAIM_ACCOUNT | \
158 SLAB_CACHE_DMA | SLAB_NOTRACK) 158 SLAB_CACHE_DMA | SLAB_NOTRACK)
159 159
160#ifndef ARCH_KMALLOC_MINALIGN
161#define ARCH_KMALLOC_MINALIGN __alignof__(unsigned long long)
162#endif
163
164#ifndef ARCH_SLAB_MINALIGN
165#define ARCH_SLAB_MINALIGN __alignof__(unsigned long long)
166#endif
167
168#define OO_SHIFT 16 160#define OO_SHIFT 16
169#define OO_MASK ((1 << OO_SHIFT) - 1) 161#define OO_MASK ((1 << OO_SHIFT) - 1)
170#define MAX_OBJS_PER_PAGE 65535 /* since page.objects is u16 */ 162#define MAX_OBJS_PER_PAGE 65535 /* since page.objects is u16 */
@@ -1084,7 +1076,7 @@ static inline struct page *alloc_slab_page(gfp_t flags, int node,
1084 if (node == -1) 1076 if (node == -1)
1085 return alloc_pages(flags, order); 1077 return alloc_pages(flags, order);
1086 else 1078 else
1087 return alloc_pages_node(node, flags, order); 1079 return alloc_pages_exact_node(node, flags, order);
1088} 1080}
1089 1081
1090static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) 1082static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
@@ -1368,6 +1360,7 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags)
1368 get_cycles() % 1024 > s->remote_node_defrag_ratio) 1360 get_cycles() % 1024 > s->remote_node_defrag_ratio)
1369 return NULL; 1361 return NULL;
1370 1362
1363 get_mems_allowed();
1371 zonelist = node_zonelist(slab_node(current->mempolicy), flags); 1364 zonelist = node_zonelist(slab_node(current->mempolicy), flags);
1372 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { 1365 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
1373 struct kmem_cache_node *n; 1366 struct kmem_cache_node *n;
@@ -1377,10 +1370,13 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags)
1377 if (n && cpuset_zone_allowed_hardwall(zone, flags) && 1370 if (n && cpuset_zone_allowed_hardwall(zone, flags) &&
1378 n->nr_partial > s->min_partial) { 1371 n->nr_partial > s->min_partial) {
1379 page = get_partial_node(n); 1372 page = get_partial_node(n);
1380 if (page) 1373 if (page) {
1374 put_mems_allowed();
1381 return page; 1375 return page;
1376 }
1382 } 1377 }
1383 } 1378 }
1379 put_mems_allowed();
1384#endif 1380#endif
1385 return NULL; 1381 return NULL;
1386} 1382}
@@ -2429,9 +2425,11 @@ static void list_slab_objects(struct kmem_cache *s, struct page *page,
2429#ifdef CONFIG_SLUB_DEBUG 2425#ifdef CONFIG_SLUB_DEBUG
2430 void *addr = page_address(page); 2426 void *addr = page_address(page);
2431 void *p; 2427 void *p;
2432 DECLARE_BITMAP(map, page->objects); 2428 long *map = kzalloc(BITS_TO_LONGS(page->objects) * sizeof(long),
2429 GFP_ATOMIC);
2433 2430
2434 bitmap_zero(map, page->objects); 2431 if (!map)
2432 return;
2435 slab_err(s, page, "%s", text); 2433 slab_err(s, page, "%s", text);
2436 slab_lock(page); 2434 slab_lock(page);
2437 for_each_free_object(p, s, page->freelist) 2435 for_each_free_object(p, s, page->freelist)
@@ -2446,6 +2444,7 @@ static void list_slab_objects(struct kmem_cache *s, struct page *page,
2446 } 2444 }
2447 } 2445 }
2448 slab_unlock(page); 2446 slab_unlock(page);
2447 kfree(map);
2449#endif 2448#endif
2450} 2449}
2451 2450
@@ -3338,8 +3337,15 @@ void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags,
3338 struct kmem_cache *s; 3337 struct kmem_cache *s;
3339 void *ret; 3338 void *ret;
3340 3339
3341 if (unlikely(size > SLUB_MAX_SIZE)) 3340 if (unlikely(size > SLUB_MAX_SIZE)) {
3342 return kmalloc_large_node(size, gfpflags, node); 3341 ret = kmalloc_large_node(size, gfpflags, node);
3342
3343 trace_kmalloc_node(caller, ret,
3344 size, PAGE_SIZE << get_order(size),
3345 gfpflags, node);
3346
3347 return ret;
3348 }
3343 3349
3344 s = get_slab(size, gfpflags); 3350 s = get_slab(size, gfpflags);
3345 3351
@@ -3651,10 +3657,10 @@ static int add_location(struct loc_track *t, struct kmem_cache *s,
3651} 3657}
3652 3658
3653static void process_slab(struct loc_track *t, struct kmem_cache *s, 3659static void process_slab(struct loc_track *t, struct kmem_cache *s,
3654 struct page *page, enum track_item alloc) 3660 struct page *page, enum track_item alloc,
3661 long *map)
3655{ 3662{
3656 void *addr = page_address(page); 3663 void *addr = page_address(page);
3657 DECLARE_BITMAP(map, page->objects);
3658 void *p; 3664 void *p;
3659 3665
3660 bitmap_zero(map, page->objects); 3666 bitmap_zero(map, page->objects);
@@ -3673,11 +3679,14 @@ static int list_locations(struct kmem_cache *s, char *buf,
3673 unsigned long i; 3679 unsigned long i;
3674 struct loc_track t = { 0, 0, NULL }; 3680 struct loc_track t = { 0, 0, NULL };
3675 int node; 3681 int node;
3682 unsigned long *map = kmalloc(BITS_TO_LONGS(oo_objects(s->max)) *
3683 sizeof(unsigned long), GFP_KERNEL);
3676 3684
3677 if (!alloc_loc_track(&t, PAGE_SIZE / sizeof(struct location), 3685 if (!map || !alloc_loc_track(&t, PAGE_SIZE / sizeof(struct location),
3678 GFP_TEMPORARY)) 3686 GFP_TEMPORARY)) {
3687 kfree(map);
3679 return sprintf(buf, "Out of memory\n"); 3688 return sprintf(buf, "Out of memory\n");
3680 3689 }
3681 /* Push back cpu slabs */ 3690 /* Push back cpu slabs */
3682 flush_all(s); 3691 flush_all(s);
3683 3692
@@ -3691,9 +3700,9 @@ static int list_locations(struct kmem_cache *s, char *buf,
3691 3700
3692 spin_lock_irqsave(&n->list_lock, flags); 3701 spin_lock_irqsave(&n->list_lock, flags);
3693 list_for_each_entry(page, &n->partial, lru) 3702 list_for_each_entry(page, &n->partial, lru)
3694 process_slab(&t, s, page, alloc); 3703 process_slab(&t, s, page, alloc, map);
3695 list_for_each_entry(page, &n->full, lru) 3704 list_for_each_entry(page, &n->full, lru)
3696 process_slab(&t, s, page, alloc); 3705 process_slab(&t, s, page, alloc, map);
3697 spin_unlock_irqrestore(&n->list_lock, flags); 3706 spin_unlock_irqrestore(&n->list_lock, flags);
3698 } 3707 }
3699 3708
@@ -3744,6 +3753,7 @@ static int list_locations(struct kmem_cache *s, char *buf,
3744 } 3753 }
3745 3754
3746 free_loc_track(&t); 3755 free_loc_track(&t);
3756 kfree(map);
3747 if (!t.count) 3757 if (!t.count)
3748 len += sprintf(buf, "No data\n"); 3758 len += sprintf(buf, "No data\n");
3749 return len; 3759 return len;
diff --git a/mm/sparse.c b/mm/sparse.c
index dc0cc4d43ff3..95ac219af379 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -382,13 +382,15 @@ static void __init sparse_early_usemaps_alloc_node(unsigned long**usemap_map,
382struct page __init *sparse_mem_map_populate(unsigned long pnum, int nid) 382struct page __init *sparse_mem_map_populate(unsigned long pnum, int nid)
383{ 383{
384 struct page *map; 384 struct page *map;
385 unsigned long size;
385 386
386 map = alloc_remap(nid, sizeof(struct page) * PAGES_PER_SECTION); 387 map = alloc_remap(nid, sizeof(struct page) * PAGES_PER_SECTION);
387 if (map) 388 if (map)
388 return map; 389 return map;
389 390
390 map = alloc_bootmem_pages_node(NODE_DATA(nid), 391 size = PAGE_ALIGN(sizeof(struct page) * PAGES_PER_SECTION);
391 PAGE_ALIGN(sizeof(struct page) * PAGES_PER_SECTION)); 392 map = __alloc_bootmem_node_high(NODE_DATA(nid), size,
393 PAGE_SIZE, __pa(MAX_DMA_ADDRESS));
392 return map; 394 return map;
393} 395}
394void __init sparse_mem_maps_populate_node(struct page **map_map, 396void __init sparse_mem_maps_populate_node(struct page **map_map,
@@ -412,7 +414,8 @@ void __init sparse_mem_maps_populate_node(struct page **map_map,
412 } 414 }
413 415
414 size = PAGE_ALIGN(size); 416 size = PAGE_ALIGN(size);
415 map = alloc_bootmem_pages_node(NODE_DATA(nodeid), size * map_count); 417 map = __alloc_bootmem_node_high(NODE_DATA(nodeid), size * map_count,
418 PAGE_SIZE, __pa(MAX_DMA_ADDRESS));
416 if (map) { 419 if (map) {
417 for (pnum = pnum_begin; pnum < pnum_end; pnum++) { 420 for (pnum = pnum_begin; pnum < pnum_end; pnum++) {
418 if (!present_section_nr(pnum)) 421 if (!present_section_nr(pnum))
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 6cd0a8f90dc7..03aa2d55f1a2 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -139,7 +139,8 @@ static int discard_swap(struct swap_info_struct *si)
139 nr_blocks = ((sector_t)se->nr_pages - 1) << (PAGE_SHIFT - 9); 139 nr_blocks = ((sector_t)se->nr_pages - 1) << (PAGE_SHIFT - 9);
140 if (nr_blocks) { 140 if (nr_blocks) {
141 err = blkdev_issue_discard(si->bdev, start_block, 141 err = blkdev_issue_discard(si->bdev, start_block,
142 nr_blocks, GFP_KERNEL, DISCARD_FL_BARRIER); 142 nr_blocks, GFP_KERNEL,
143 BLKDEV_IFL_WAIT | BLKDEV_IFL_BARRIER);
143 if (err) 144 if (err)
144 return err; 145 return err;
145 cond_resched(); 146 cond_resched();
@@ -150,7 +151,8 @@ static int discard_swap(struct swap_info_struct *si)
150 nr_blocks = (sector_t)se->nr_pages << (PAGE_SHIFT - 9); 151 nr_blocks = (sector_t)se->nr_pages << (PAGE_SHIFT - 9);
151 152
152 err = blkdev_issue_discard(si->bdev, start_block, 153 err = blkdev_issue_discard(si->bdev, start_block,
153 nr_blocks, GFP_KERNEL, DISCARD_FL_BARRIER); 154 nr_blocks, GFP_KERNEL,
155 BLKDEV_IFL_WAIT | BLKDEV_IFL_BARRIER);
154 if (err) 156 if (err)
155 break; 157 break;
156 158
@@ -189,7 +191,8 @@ static void discard_swap_cluster(struct swap_info_struct *si,
189 start_block <<= PAGE_SHIFT - 9; 191 start_block <<= PAGE_SHIFT - 9;
190 nr_blocks <<= PAGE_SHIFT - 9; 192 nr_blocks <<= PAGE_SHIFT - 9;
191 if (blkdev_issue_discard(si->bdev, start_block, 193 if (blkdev_issue_discard(si->bdev, start_block,
192 nr_blocks, GFP_NOIO, DISCARD_FL_BARRIER)) 194 nr_blocks, GFP_NOIO, BLKDEV_IFL_WAIT |
195 BLKDEV_IFL_BARRIER))
193 break; 196 break;
194 } 197 }
195 198
@@ -574,6 +577,7 @@ static unsigned char swap_entry_free(struct swap_info_struct *p,
574 577
575 /* free if no reference */ 578 /* free if no reference */
576 if (!usage) { 579 if (!usage) {
580 struct gendisk *disk = p->bdev->bd_disk;
577 if (offset < p->lowest_bit) 581 if (offset < p->lowest_bit)
578 p->lowest_bit = offset; 582 p->lowest_bit = offset;
579 if (offset > p->highest_bit) 583 if (offset > p->highest_bit)
@@ -583,6 +587,9 @@ static unsigned char swap_entry_free(struct swap_info_struct *p,
583 swap_list.next = p->type; 587 swap_list.next = p->type;
584 nr_swap_pages++; 588 nr_swap_pages++;
585 p->inuse_pages--; 589 p->inuse_pages--;
590 if ((p->flags & SWP_BLKDEV) &&
591 disk->fops->swap_slot_free_notify)
592 disk->fops->swap_slot_free_notify(p->bdev, offset);
586 } 593 }
587 594
588 return usage; 595 return usage;
@@ -1884,6 +1891,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
1884 if (error < 0) 1891 if (error < 0)
1885 goto bad_swap; 1892 goto bad_swap;
1886 p->bdev = bdev; 1893 p->bdev = bdev;
1894 p->flags |= SWP_BLKDEV;
1887 } else if (S_ISREG(inode->i_mode)) { 1895 } else if (S_ISREG(inode->i_mode)) {
1888 p->bdev = inode->i_sb->s_bdev; 1896 p->bdev = inode->i_sb->s_bdev;
1889 mutex_lock(&inode->i_mutex); 1897 mutex_lock(&inode->i_mutex);
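Editor's note: the SWP_BLKDEV flag and the swap_slot_free_notify() call added above let a block driver learn when a swap slot is no longer referenced (useful for memory-backed swap devices that can then drop their stale copy of the page). A minimal, hedged sketch of the driver side follows; the mydrv_* names are placeholders, and the callback signature is inferred from the call site in swap_entry_free() above.

#include <linux/module.h>
#include <linux/blkdev.h>

/* invoked with the swap device's bdev and the page offset of the freed slot */
static void mydrv_swap_slot_free_notify(struct block_device *bdev,
					unsigned long offset)
{
	/* e.g. discard any per-slot data the driver keeps for this offset */
}

static const struct block_device_operations mydrv_fops = {
	.owner			= THIS_MODULE,
	.swap_slot_free_notify	= mydrv_swap_slot_free_notify,
};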
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 3ff3311447f5..915dceb487c1 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -73,10 +73,14 @@ struct scan_control {
73 73
74 int swappiness; 74 int swappiness;
75 75
76 int all_unreclaimable;
77
78 int order; 76 int order;
79 77
78 /*
 79 * Intend to reclaim enough contiguous memory rather than just enough
 80 * memory. I.e., this is the mode for high-order allocations.
81 */
82 bool lumpy_reclaim_mode;
83
80 /* Which cgroup do we reclaim from */ 84 /* Which cgroup do we reclaim from */
81 struct mem_cgroup *mem_cgroup; 85 struct mem_cgroup *mem_cgroup;
82 86
@@ -85,12 +89,6 @@ struct scan_control {
85 * are scanned. 89 * are scanned.
86 */ 90 */
87 nodemask_t *nodemask; 91 nodemask_t *nodemask;
88
89 /* Pluggable isolate pages callback */
90 unsigned long (*isolate_pages)(unsigned long nr, struct list_head *dst,
91 unsigned long *scanned, int order, int mode,
92 struct zone *z, struct mem_cgroup *mem_cont,
93 int active, int file);
94}; 92};
95 93
96#define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru)) 94#define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))
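Editor's note: the lumpy_reclaim_mode flag added above centralises a decision that used to be recomputed inside shrink_inactive_list(). As a condensed restatement of the rule this patch applies (see set_lumpy_reclaim_mode() further down in this diff), and illustrative only, with 3 and 12 assumed for PAGE_ALLOC_COSTLY_ORDER and DEF_PRIORITY:

/* Illustrative only -- not part of the patch. */
static int would_use_lumpy_reclaim(int order, int priority)
{
	if (order > 3 /* PAGE_ALLOC_COSTLY_ORDER */)
		return 1;	/* e.g. an order-9 huge page allocation */
	if (order && priority < 12 - 2 /* DEF_PRIORITY - 2 */)
		return 1;	/* smaller high-order request, reclaim struggling */
	return 0;
}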
@@ -575,7 +573,7 @@ static enum page_references page_check_references(struct page *page,
575 referenced_page = TestClearPageReferenced(page); 573 referenced_page = TestClearPageReferenced(page);
576 574
577 /* Lumpy reclaim - ignore references */ 575 /* Lumpy reclaim - ignore references */
578 if (sc->order > PAGE_ALLOC_COSTLY_ORDER) 576 if (sc->lumpy_reclaim_mode)
579 return PAGEREF_RECLAIM; 577 return PAGEREF_RECLAIM;
580 578
581 /* 579 /*
@@ -839,11 +837,6 @@ keep:
839 return nr_reclaimed; 837 return nr_reclaimed;
840} 838}
841 839
842/* LRU Isolation modes. */
843#define ISOLATE_INACTIVE 0 /* Isolate inactive pages. */
844#define ISOLATE_ACTIVE 1 /* Isolate active pages. */
845#define ISOLATE_BOTH 2 /* Isolate both active and inactive pages. */
846
847/* 840/*
848 * Attempt to remove the specified page from its LRU. Only take this page 841 * Attempt to remove the specified page from its LRU. Only take this page
849 * if it is of the appropriate PageActive status. Pages which are being 842 * if it is of the appropriate PageActive status. Pages which are being
@@ -1011,7 +1004,6 @@ static unsigned long isolate_pages_global(unsigned long nr,
1011 struct list_head *dst, 1004 struct list_head *dst,
1012 unsigned long *scanned, int order, 1005 unsigned long *scanned, int order,
1013 int mode, struct zone *z, 1006 int mode, struct zone *z,
1014 struct mem_cgroup *mem_cont,
1015 int active, int file) 1007 int active, int file)
1016{ 1008{
1017 int lru = LRU_BASE; 1009 int lru = LRU_BASE;
@@ -1130,7 +1122,6 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
1130 unsigned long nr_scanned = 0; 1122 unsigned long nr_scanned = 0;
1131 unsigned long nr_reclaimed = 0; 1123 unsigned long nr_reclaimed = 0;
1132 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); 1124 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
1133 int lumpy_reclaim = 0;
1134 1125
1135 while (unlikely(too_many_isolated(zone, file, sc))) { 1126 while (unlikely(too_many_isolated(zone, file, sc))) {
1136 congestion_wait(BLK_RW_ASYNC, HZ/10); 1127 congestion_wait(BLK_RW_ASYNC, HZ/10);
@@ -1140,17 +1131,6 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
1140 return SWAP_CLUSTER_MAX; 1131 return SWAP_CLUSTER_MAX;
1141 } 1132 }
1142 1133
1143 /*
1144 * If we need a large contiguous chunk of memory, or have
1145 * trouble getting a small set of contiguous pages, we
1146 * will reclaim both active and inactive pages.
1147 *
1148 * We use the same threshold as pageout congestion_wait below.
1149 */
1150 if (sc->order > PAGE_ALLOC_COSTLY_ORDER)
1151 lumpy_reclaim = 1;
1152 else if (sc->order && priority < DEF_PRIORITY - 2)
1153 lumpy_reclaim = 1;
1154 1134
1155 pagevec_init(&pvec, 1); 1135 pagevec_init(&pvec, 1);
1156 1136
@@ -1163,15 +1143,15 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
1163 unsigned long nr_freed; 1143 unsigned long nr_freed;
1164 unsigned long nr_active; 1144 unsigned long nr_active;
1165 unsigned int count[NR_LRU_LISTS] = { 0, }; 1145 unsigned int count[NR_LRU_LISTS] = { 0, };
1166 int mode = lumpy_reclaim ? ISOLATE_BOTH : ISOLATE_INACTIVE; 1146 int mode = sc->lumpy_reclaim_mode ? ISOLATE_BOTH : ISOLATE_INACTIVE;
1167 unsigned long nr_anon; 1147 unsigned long nr_anon;
1168 unsigned long nr_file; 1148 unsigned long nr_file;
1169 1149
1170 nr_taken = sc->isolate_pages(SWAP_CLUSTER_MAX,
1171 &page_list, &nr_scan, sc->order, mode,
1172 zone, sc->mem_cgroup, 0, file);
1173
1174 if (scanning_global_lru(sc)) { 1150 if (scanning_global_lru(sc)) {
1151 nr_taken = isolate_pages_global(SWAP_CLUSTER_MAX,
1152 &page_list, &nr_scan,
1153 sc->order, mode,
1154 zone, 0, file);
1175 zone->pages_scanned += nr_scan; 1155 zone->pages_scanned += nr_scan;
1176 if (current_is_kswapd()) 1156 if (current_is_kswapd())
1177 __count_zone_vm_events(PGSCAN_KSWAPD, zone, 1157 __count_zone_vm_events(PGSCAN_KSWAPD, zone,
@@ -1179,6 +1159,16 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
1179 else 1159 else
1180 __count_zone_vm_events(PGSCAN_DIRECT, zone, 1160 __count_zone_vm_events(PGSCAN_DIRECT, zone,
1181 nr_scan); 1161 nr_scan);
1162 } else {
1163 nr_taken = mem_cgroup_isolate_pages(SWAP_CLUSTER_MAX,
1164 &page_list, &nr_scan,
1165 sc->order, mode,
1166 zone, sc->mem_cgroup,
1167 0, file);
1168 /*
1169 * mem_cgroup_isolate_pages() keeps track of
1170 * scanned pages on its own.
1171 */
1182 } 1172 }
1183 1173
1184 if (nr_taken == 0) 1174 if (nr_taken == 0)
@@ -1216,7 +1206,7 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
1216 * but that should be acceptable to the caller 1206 * but that should be acceptable to the caller
1217 */ 1207 */
1218 if (nr_freed < nr_taken && !current_is_kswapd() && 1208 if (nr_freed < nr_taken && !current_is_kswapd() &&
1219 lumpy_reclaim) { 1209 sc->lumpy_reclaim_mode) {
1220 congestion_wait(BLK_RW_ASYNC, HZ/10); 1210 congestion_wait(BLK_RW_ASYNC, HZ/10);
1221 1211
1222 /* 1212 /*
@@ -1356,16 +1346,23 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
1356 1346
1357 lru_add_drain(); 1347 lru_add_drain();
1358 spin_lock_irq(&zone->lru_lock); 1348 spin_lock_irq(&zone->lru_lock);
1359 nr_taken = sc->isolate_pages(nr_pages, &l_hold, &pgscanned, sc->order,
1360 ISOLATE_ACTIVE, zone,
1361 sc->mem_cgroup, 1, file);
1362 /*
1363 * zone->pages_scanned is used for detect zone's oom
1364 * mem_cgroup remembers nr_scan by itself.
1365 */
1366 if (scanning_global_lru(sc)) { 1349 if (scanning_global_lru(sc)) {
1350 nr_taken = isolate_pages_global(nr_pages, &l_hold,
1351 &pgscanned, sc->order,
1352 ISOLATE_ACTIVE, zone,
1353 1, file);
1367 zone->pages_scanned += pgscanned; 1354 zone->pages_scanned += pgscanned;
1355 } else {
1356 nr_taken = mem_cgroup_isolate_pages(nr_pages, &l_hold,
1357 &pgscanned, sc->order,
1358 ISOLATE_ACTIVE, zone,
1359 sc->mem_cgroup, 1, file);
1360 /*
1361 * mem_cgroup_isolate_pages() keeps track of
1362 * scanned pages on its own.
1363 */
1368 } 1364 }
1365
1369 reclaim_stat->recent_scanned[file] += nr_taken; 1366 reclaim_stat->recent_scanned[file] += nr_taken;
1370 1367
1371 __count_zone_vm_events(PGREFILL, zone, pgscanned); 1368 __count_zone_vm_events(PGREFILL, zone, pgscanned);
@@ -1519,21 +1516,52 @@ static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
1519} 1516}
1520 1517
1521/* 1518/*
1519 * Smallish @nr_to_scan's are deposited in @nr_saved_scan,
1520 * until we collected @swap_cluster_max pages to scan.
1521 */
1522static unsigned long nr_scan_try_batch(unsigned long nr_to_scan,
1523 unsigned long *nr_saved_scan)
1524{
1525 unsigned long nr;
1526
1527 *nr_saved_scan += nr_to_scan;
1528 nr = *nr_saved_scan;
1529
1530 if (nr >= SWAP_CLUSTER_MAX)
1531 *nr_saved_scan = 0;
1532 else
1533 nr = 0;
1534
1535 return nr;
1536}
1537
1538/*
1522 * Determine how aggressively the anon and file LRU lists should be 1539 * Determine how aggressively the anon and file LRU lists should be
1523 * scanned. The relative value of each set of LRU lists is determined 1540 * scanned. The relative value of each set of LRU lists is determined
1524 * by looking at the fraction of the pages scanned we did rotate back 1541 * by looking at the fraction of the pages scanned we did rotate back
1525 * onto the active list instead of evict. 1542 * onto the active list instead of evict.
1526 * 1543 *
1527 * percent[0] specifies how much pressure to put on ram/swap backed 1544 * nr[0] = anon pages to scan; nr[1] = file pages to scan
1528 * memory, while percent[1] determines pressure on the file LRUs.
1529 */ 1545 */
1530static void get_scan_ratio(struct zone *zone, struct scan_control *sc, 1546static void get_scan_count(struct zone *zone, struct scan_control *sc,
1531 unsigned long *percent) 1547 unsigned long *nr, int priority)
1532{ 1548{
1533 unsigned long anon, file, free; 1549 unsigned long anon, file, free;
1534 unsigned long anon_prio, file_prio; 1550 unsigned long anon_prio, file_prio;
1535 unsigned long ap, fp; 1551 unsigned long ap, fp;
1536 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); 1552 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
1553 u64 fraction[2], denominator;
1554 enum lru_list l;
1555 int noswap = 0;
1556
1557 /* If we have no swap space, do not bother scanning anon pages. */
1558 if (!sc->may_swap || (nr_swap_pages <= 0)) {
1559 noswap = 1;
1560 fraction[0] = 0;
1561 fraction[1] = 1;
1562 denominator = 1;
1563 goto out;
1564 }
1537 1565
1538 anon = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_ANON) + 1566 anon = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_ANON) +
1539 zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON); 1567 zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON);
@@ -1545,9 +1573,10 @@ static void get_scan_ratio(struct zone *zone, struct scan_control *sc,
1545 /* If we have very few page cache pages, 1573 /* If we have very few page cache pages,
1546 force-scan anon pages. */ 1574 force-scan anon pages. */
1547 if (unlikely(file + free <= high_wmark_pages(zone))) { 1575 if (unlikely(file + free <= high_wmark_pages(zone))) {
1548 percent[0] = 100; 1576 fraction[0] = 1;
1549 percent[1] = 0; 1577 fraction[1] = 0;
1550 return; 1578 denominator = 1;
1579 goto out;
1551 } 1580 }
1552 } 1581 }
1553 1582
@@ -1594,29 +1623,37 @@ static void get_scan_ratio(struct zone *zone, struct scan_control *sc,
1594 fp = (file_prio + 1) * (reclaim_stat->recent_scanned[1] + 1); 1623 fp = (file_prio + 1) * (reclaim_stat->recent_scanned[1] + 1);
1595 fp /= reclaim_stat->recent_rotated[1] + 1; 1624 fp /= reclaim_stat->recent_rotated[1] + 1;
1596 1625
1597 /* Normalize to percentages */ 1626 fraction[0] = ap;
1598 percent[0] = 100 * ap / (ap + fp + 1); 1627 fraction[1] = fp;
1599 percent[1] = 100 - percent[0]; 1628 denominator = ap + fp + 1;
1629out:
1630 for_each_evictable_lru(l) {
1631 int file = is_file_lru(l);
1632 unsigned long scan;
1633
1634 scan = zone_nr_lru_pages(zone, sc, l);
1635 if (priority || noswap) {
1636 scan >>= priority;
1637 scan = div64_u64(scan * fraction[file], denominator);
1638 }
1639 nr[l] = nr_scan_try_batch(scan,
1640 &reclaim_stat->nr_saved_scan[l]);
1641 }
1600} 1642}
1601 1643
1602/* 1644static void set_lumpy_reclaim_mode(int priority, struct scan_control *sc)
1603 * Smallish @nr_to_scan's are deposited in @nr_saved_scan,
1604 * until we collected @swap_cluster_max pages to scan.
1605 */
1606static unsigned long nr_scan_try_batch(unsigned long nr_to_scan,
1607 unsigned long *nr_saved_scan)
1608{ 1645{
1609 unsigned long nr; 1646 /*
1610 1647 * If we need a large contiguous chunk of memory, or have
1611 *nr_saved_scan += nr_to_scan; 1648 * trouble getting a small set of contiguous pages, we
1612 nr = *nr_saved_scan; 1649 * will reclaim both active and inactive pages.
1613 1650 */
1614 if (nr >= SWAP_CLUSTER_MAX) 1651 if (sc->order > PAGE_ALLOC_COSTLY_ORDER)
1615 *nr_saved_scan = 0; 1652 sc->lumpy_reclaim_mode = 1;
1653 else if (sc->order && priority < DEF_PRIORITY - 2)
1654 sc->lumpy_reclaim_mode = 1;
1616 else 1655 else
1617 nr = 0; 1656 sc->lumpy_reclaim_mode = 0;
1618
1619 return nr;
1620} 1657}
1621 1658
1622/* 1659/*
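Editor's note: to make the reworked scan accounting concrete, here is a small userspace-style sketch. It is illustrative only; the sample numbers, the main() wrapper, and the plain 64-bit division standing in for div64_u64() are assumptions, not part of the patch. It shows how the fraction[]/denominator pair from get_scan_count() and the nr_scan_try_batch() batching combine into a per-LRU scan target.

#include <stdio.h>

#define SWAP_CLUSTER_MAX 32	/* same value the kernel uses */

/* mirrors nr_scan_try_batch() above: accumulate small requests, flush at 32 */
static unsigned long try_batch(unsigned long nr_to_scan, unsigned long *saved)
{
	*saved += nr_to_scan;
	if (*saved >= SWAP_CLUSTER_MAX) {
		unsigned long nr = *saved;
		*saved = 0;
		return nr;
	}
	return 0;
}

int main(void)
{
	/* assumed sample pressure values and one file LRU of 100000 pages */
	unsigned long long fraction[2] = { 300, 700 };	/* [0] anon, [1] file */
	unsigned long long denominator = 300 + 700 + 1;
	unsigned long lru_pages = 100000, saved = 0;
	int priority = 10;

	unsigned long scan = lru_pages >> priority;	/* 97 pages */
	scan = scan * fraction[1] / denominator;	/* ~67 file pages */
	printf("batched scan target: %lu\n", try_batch(scan, &saved));
	return 0;
}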
@@ -1627,33 +1664,13 @@ static void shrink_zone(int priority, struct zone *zone,
1627{ 1664{
1628 unsigned long nr[NR_LRU_LISTS]; 1665 unsigned long nr[NR_LRU_LISTS];
1629 unsigned long nr_to_scan; 1666 unsigned long nr_to_scan;
1630 unsigned long percent[2]; /* anon @ 0; file @ 1 */
1631 enum lru_list l; 1667 enum lru_list l;
1632 unsigned long nr_reclaimed = sc->nr_reclaimed; 1668 unsigned long nr_reclaimed = sc->nr_reclaimed;
1633 unsigned long nr_to_reclaim = sc->nr_to_reclaim; 1669 unsigned long nr_to_reclaim = sc->nr_to_reclaim;
1634 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
1635 int noswap = 0;
1636
1637 /* If we have no swap space, do not bother scanning anon pages. */
1638 if (!sc->may_swap || (nr_swap_pages <= 0)) {
1639 noswap = 1;
1640 percent[0] = 0;
1641 percent[1] = 100;
1642 } else
1643 get_scan_ratio(zone, sc, percent);
1644 1670
1645 for_each_evictable_lru(l) { 1671 get_scan_count(zone, sc, nr, priority);
1646 int file = is_file_lru(l);
1647 unsigned long scan;
1648 1672
1649 scan = zone_nr_lru_pages(zone, sc, l); 1673 set_lumpy_reclaim_mode(priority, sc);
1650 if (priority || noswap) {
1651 scan >>= priority;
1652 scan = (scan * percent[file]) / 100;
1653 }
1654 nr[l] = nr_scan_try_batch(scan,
1655 &reclaim_stat->nr_saved_scan[l]);
1656 }
1657 1674
1658 while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || 1675 while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
1659 nr[LRU_INACTIVE_FILE]) { 1676 nr[LRU_INACTIVE_FILE]) {
@@ -1707,14 +1724,14 @@ static void shrink_zone(int priority, struct zone *zone,
1707 * If a zone is deemed to be full of pinned pages then just give it a light 1724 * If a zone is deemed to be full of pinned pages then just give it a light
1708 * scan then give up on it. 1725 * scan then give up on it.
1709 */ 1726 */
1710static void shrink_zones(int priority, struct zonelist *zonelist, 1727static int shrink_zones(int priority, struct zonelist *zonelist,
1711 struct scan_control *sc) 1728 struct scan_control *sc)
1712{ 1729{
1713 enum zone_type high_zoneidx = gfp_zone(sc->gfp_mask); 1730 enum zone_type high_zoneidx = gfp_zone(sc->gfp_mask);
1714 struct zoneref *z; 1731 struct zoneref *z;
1715 struct zone *zone; 1732 struct zone *zone;
1733 int progress = 0;
1716 1734
1717 sc->all_unreclaimable = 1;
1718 for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx, 1735 for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx,
1719 sc->nodemask) { 1736 sc->nodemask) {
1720 if (!populated_zone(zone)) 1737 if (!populated_zone(zone))
@@ -1730,19 +1747,19 @@ static void shrink_zones(int priority, struct zonelist *zonelist,
1730 1747
1731 if (zone->all_unreclaimable && priority != DEF_PRIORITY) 1748 if (zone->all_unreclaimable && priority != DEF_PRIORITY)
1732 continue; /* Let kswapd poll it */ 1749 continue; /* Let kswapd poll it */
1733 sc->all_unreclaimable = 0;
1734 } else { 1750 } else {
1735 /* 1751 /*
1736 * Ignore cpuset limitation here. We just want to reduce 1752 * Ignore cpuset limitation here. We just want to reduce
1737 * # of used pages by us regardless of memory shortage. 1753 * # of used pages by us regardless of memory shortage.
1738 */ 1754 */
1739 sc->all_unreclaimable = 0;
1740 mem_cgroup_note_reclaim_priority(sc->mem_cgroup, 1755 mem_cgroup_note_reclaim_priority(sc->mem_cgroup,
1741 priority); 1756 priority);
1742 } 1757 }
1743 1758
1744 shrink_zone(priority, zone, sc); 1759 shrink_zone(priority, zone, sc);
1760 progress = 1;
1745 } 1761 }
1762 return progress;
1746} 1763}
1747 1764
1748/* 1765/*
@@ -1774,6 +1791,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
1774 enum zone_type high_zoneidx = gfp_zone(sc->gfp_mask); 1791 enum zone_type high_zoneidx = gfp_zone(sc->gfp_mask);
1775 unsigned long writeback_threshold; 1792 unsigned long writeback_threshold;
1776 1793
1794 get_mems_allowed();
1777 delayacct_freepages_start(); 1795 delayacct_freepages_start();
1778 1796
1779 if (scanning_global_lru(sc)) 1797 if (scanning_global_lru(sc))
@@ -1795,7 +1813,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
1795 sc->nr_scanned = 0; 1813 sc->nr_scanned = 0;
1796 if (!priority) 1814 if (!priority)
1797 disable_swap_token(); 1815 disable_swap_token();
1798 shrink_zones(priority, zonelist, sc); 1816 ret = shrink_zones(priority, zonelist, sc);
1799 /* 1817 /*
1800 * Don't shrink slabs when reclaiming memory from 1818 * Don't shrink slabs when reclaiming memory from
1801 * over limit cgroups 1819 * over limit cgroups
@@ -1832,7 +1850,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
1832 congestion_wait(BLK_RW_ASYNC, HZ/10); 1850 congestion_wait(BLK_RW_ASYNC, HZ/10);
1833 } 1851 }
1834 /* top priority shrink_zones still had more to do? don't OOM, then */ 1852 /* top priority shrink_zones still had more to do? don't OOM, then */
1835 if (!sc->all_unreclaimable && scanning_global_lru(sc)) 1853 if (ret && scanning_global_lru(sc))
1836 ret = sc->nr_reclaimed; 1854 ret = sc->nr_reclaimed;
1837out: 1855out:
1838 /* 1856 /*
@@ -1857,6 +1875,7 @@ out:
1857 mem_cgroup_record_reclaim_priority(sc->mem_cgroup, priority); 1875 mem_cgroup_record_reclaim_priority(sc->mem_cgroup, priority);
1858 1876
1859 delayacct_freepages_end(); 1877 delayacct_freepages_end();
1878 put_mems_allowed();
1860 1879
1861 return ret; 1880 return ret;
1862} 1881}
@@ -1873,7 +1892,6 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
1873 .swappiness = vm_swappiness, 1892 .swappiness = vm_swappiness,
1874 .order = order, 1893 .order = order,
1875 .mem_cgroup = NULL, 1894 .mem_cgroup = NULL,
1876 .isolate_pages = isolate_pages_global,
1877 .nodemask = nodemask, 1895 .nodemask = nodemask,
1878 }; 1896 };
1879 1897
@@ -1894,7 +1912,6 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
1894 .swappiness = swappiness, 1912 .swappiness = swappiness,
1895 .order = 0, 1913 .order = 0,
1896 .mem_cgroup = mem, 1914 .mem_cgroup = mem,
1897 .isolate_pages = mem_cgroup_isolate_pages,
1898 }; 1915 };
1899 nodemask_t nm = nodemask_of_node(nid); 1916 nodemask_t nm = nodemask_of_node(nid);
1900 1917
@@ -1928,7 +1945,6 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
1928 .swappiness = swappiness, 1945 .swappiness = swappiness,
1929 .order = 0, 1946 .order = 0,
1930 .mem_cgroup = mem_cont, 1947 .mem_cgroup = mem_cont,
1931 .isolate_pages = mem_cgroup_isolate_pages,
1932 .nodemask = NULL, /* we don't care the placement */ 1948 .nodemask = NULL, /* we don't care the placement */
1933 }; 1949 };
1934 1950
@@ -2006,7 +2022,6 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order)
2006 .swappiness = vm_swappiness, 2022 .swappiness = vm_swappiness,
2007 .order = order, 2023 .order = order,
2008 .mem_cgroup = NULL, 2024 .mem_cgroup = NULL,
2009 .isolate_pages = isolate_pages_global,
2010 }; 2025 };
2011 /* 2026 /*
2012 * temp_priority is used to remember the scanning priority at which 2027 * temp_priority is used to remember the scanning priority at which
@@ -2385,7 +2400,6 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
2385 .hibernation_mode = 1, 2400 .hibernation_mode = 1,
2386 .swappiness = vm_swappiness, 2401 .swappiness = vm_swappiness,
2387 .order = 0, 2402 .order = 0,
2388 .isolate_pages = isolate_pages_global,
2389 }; 2403 };
2390 struct zonelist * zonelist = node_zonelist(numa_node_id(), sc.gfp_mask); 2404 struct zonelist * zonelist = node_zonelist(numa_node_id(), sc.gfp_mask);
2391 struct task_struct *p = current; 2405 struct task_struct *p = current;
@@ -2570,7 +2584,6 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
2570 .gfp_mask = gfp_mask, 2584 .gfp_mask = gfp_mask,
2571 .swappiness = vm_swappiness, 2585 .swappiness = vm_swappiness,
2572 .order = order, 2586 .order = order,
2573 .isolate_pages = isolate_pages_global,
2574 }; 2587 };
2575 unsigned long slab_reclaimable; 2588 unsigned long slab_reclaimable;
2576 2589
diff --git a/mm/vmstat.c b/mm/vmstat.c
index fa12ea3051fb..7759941d4e77 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -16,6 +16,7 @@
16#include <linux/cpu.h> 16#include <linux/cpu.h>
17#include <linux/vmstat.h> 17#include <linux/vmstat.h>
18#include <linux/sched.h> 18#include <linux/sched.h>
19#include <linux/math64.h>
19 20
20#ifdef CONFIG_VM_EVENT_COUNTERS 21#ifdef CONFIG_VM_EVENT_COUNTERS
21DEFINE_PER_CPU(struct vm_event_state, vm_event_states) = {{0}}; 22DEFINE_PER_CPU(struct vm_event_state, vm_event_states) = {{0}};
@@ -379,7 +380,86 @@ void zone_statistics(struct zone *preferred_zone, struct zone *z)
379} 380}
380#endif 381#endif
381 382
382#ifdef CONFIG_PROC_FS 383#ifdef CONFIG_COMPACTION
384struct contig_page_info {
385 unsigned long free_pages;
386 unsigned long free_blocks_total;
387 unsigned long free_blocks_suitable;
388};
389
390/*
391 * Calculate the number of free pages in a zone, how many contiguous
392 * pages are free and how many are large enough to satisfy an allocation of
393 * the target size. Note that this function makes no attempt to estimate
394 * how many suitable free blocks there *might* be if MOVABLE pages were
395 * migrated. Calculating that is possible, but expensive and can be
396 * figured out from userspace
397 */
398static void fill_contig_page_info(struct zone *zone,
399 unsigned int suitable_order,
400 struct contig_page_info *info)
401{
402 unsigned int order;
403
404 info->free_pages = 0;
405 info->free_blocks_total = 0;
406 info->free_blocks_suitable = 0;
407
408 for (order = 0; order < MAX_ORDER; order++) {
409 unsigned long blocks;
410
411 /* Count number of free blocks */
412 blocks = zone->free_area[order].nr_free;
413 info->free_blocks_total += blocks;
414
415 /* Count free base pages */
416 info->free_pages += blocks << order;
417
418 /* Count the suitable free blocks */
419 if (order >= suitable_order)
420 info->free_blocks_suitable += blocks <<
421 (order - suitable_order);
422 }
423}
424
425/*
426 * A fragmentation index only makes sense if an allocation of a requested
427 * size would fail. If that is true, the fragmentation index indicates
428 * whether external fragmentation or a lack of memory was the problem.
429 * The value can be used to determine if page reclaim or compaction
430 * should be used
431 */
432static int __fragmentation_index(unsigned int order, struct contig_page_info *info)
433{
434 unsigned long requested = 1UL << order;
435
436 if (!info->free_blocks_total)
437 return 0;
438
439 /* Fragmentation index only makes sense when a request would fail */
440 if (info->free_blocks_suitable)
441 return -1000;
442
443 /*
444 * Index is between 0 and 1 so return within 3 decimal places
445 *
446 * 0 => allocation would fail due to lack of memory
447 * 1 => allocation would fail due to fragmentation
448 */
449 return 1000 - div_u64( (1000+(div_u64(info->free_pages * 1000ULL, requested))), info->free_blocks_total);
450}
451
452/* Same as __fragmentation index but allocs contig_page_info on stack */
453int fragmentation_index(struct zone *zone, unsigned int order)
454{
455 struct contig_page_info info;
456
457 fill_contig_page_info(zone, order, &info);
458 return __fragmentation_index(order, &info);
459}
460#endif
461
462#if defined(CONFIG_PROC_FS) || defined(CONFIG_COMPACTION)
383#include <linux/proc_fs.h> 463#include <linux/proc_fs.h>
384#include <linux/seq_file.h> 464#include <linux/seq_file.h>
385 465
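Editor's note: as a worked example of the __fragmentation_index() arithmetic added above. The zone numbers are invented for illustration, and the helper simply repeats the patch's formula with ordinary 64-bit division in place of div_u64().

#include <stdio.h>

/* same formula as __fragmentation_index(), in userspace arithmetic */
static int frag_index(unsigned long long free_pages,
		      unsigned long long free_blocks_total,
		      unsigned long long free_blocks_suitable,
		      unsigned int order)
{
	unsigned long long requested = 1ULL << order;

	if (!free_blocks_total)
		return 0;
	if (free_blocks_suitable)
		return -1000;	/* the request would succeed; index is meaningless */
	return 1000 - (1000 + free_pages * 1000ULL / requested) / free_blocks_total;
}

int main(void)
{
	/*
	 * Assumed zone state: 1000 free base pages spread over 500 free
	 * blocks, none of them order-4 or larger.  For an order-4 request:
	 * 1000 - (1000 + 1000*1000/16) / 500 = 1000 - 127 = 873, i.e. 0.873,
	 * so the failure is mostly due to fragmentation, not lack of memory.
	 */
	printf("%d\n", frag_index(1000, 500, 0, 4));
	return 0;
}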
@@ -432,7 +512,9 @@ static void walk_zones_in_node(struct seq_file *m, pg_data_t *pgdat,
432 spin_unlock_irqrestore(&zone->lock, flags); 512 spin_unlock_irqrestore(&zone->lock, flags);
433 } 513 }
434} 514}
515#endif
435 516
517#ifdef CONFIG_PROC_FS
436static void frag_show_print(struct seq_file *m, pg_data_t *pgdat, 518static void frag_show_print(struct seq_file *m, pg_data_t *pgdat,
437 struct zone *zone) 519 struct zone *zone)
438{ 520{
@@ -693,6 +775,16 @@ static const char * const vmstat_text[] = {
693 "allocstall", 775 "allocstall",
694 776
695 "pgrotated", 777 "pgrotated",
778
779#ifdef CONFIG_COMPACTION
780 "compact_blocks_moved",
781 "compact_pages_moved",
782 "compact_pagemigrate_failed",
783 "compact_stall",
784 "compact_fail",
785 "compact_success",
786#endif
787
696#ifdef CONFIG_HUGETLB_PAGE 788#ifdef CONFIG_HUGETLB_PAGE
697 "htlb_buddy_alloc_success", 789 "htlb_buddy_alloc_success",
698 "htlb_buddy_alloc_fail", 790 "htlb_buddy_alloc_fail",
@@ -954,3 +1046,162 @@ static int __init setup_vmstat(void)
954 return 0; 1046 return 0;
955} 1047}
956module_init(setup_vmstat) 1048module_init(setup_vmstat)
1049
1050#if defined(CONFIG_DEBUG_FS) && defined(CONFIG_COMPACTION)
1051#include <linux/debugfs.h>
1052
1053static struct dentry *extfrag_debug_root;
1054
1055/*
1056 * Return an index indicating how much of the available free memory is
1057 * unusable for an allocation of the requested size.
1058 */
1059static int unusable_free_index(unsigned int order,
1060 struct contig_page_info *info)
1061{
1062 /* No free memory is interpreted as all free memory is unusable */
1063 if (info->free_pages == 0)
1064 return 1000;
1065
1066 /*
1067 * Index should be a value between 0 and 1. Return a value to 3
1068 * decimal places.
1069 *
1070 * 0 => no fragmentation
1071 * 1 => high fragmentation
1072 */
1073 return div_u64((info->free_pages - (info->free_blocks_suitable << order)) * 1000ULL, info->free_pages);
1074
1075}
1076
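/*
 * Editor's note: a quick worked example of unusable_free_index() above,
 * again with invented numbers.  Suppose a zone has 1000 free pages and
 * 10 free blocks of order 4 or larger (free_blocks_suitable = 10).
 * For an order-4 request:
 *
 *	usable = free_blocks_suitable << order = 10 << 4 = 160 pages
 *	index  = (free_pages - usable) * 1000 / free_pages
 *	       = (1000 - 160) * 1000 / 1000 = 840
 *
 * i.e. 0.840: 84% of the free memory cannot back an order-4 allocation.
 */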
1077static void unusable_show_print(struct seq_file *m,
1078 pg_data_t *pgdat, struct zone *zone)
1079{
1080 unsigned int order;
1081 int index;
1082 struct contig_page_info info;
1083
1084 seq_printf(m, "Node %d, zone %8s ",
1085 pgdat->node_id,
1086 zone->name);
1087 for (order = 0; order < MAX_ORDER; ++order) {
1088 fill_contig_page_info(zone, order, &info);
1089 index = unusable_free_index(order, &info);
1090 seq_printf(m, "%d.%03d ", index / 1000, index % 1000);
1091 }
1092
1093 seq_putc(m, '\n');
1094}
1095
1096/*
1097 * Display unusable free space index
1098 *
1099 * The unusable free space index measures how much of the available free
1100 * memory cannot be used to satisfy an allocation of a given size and is a
1101 * value between 0 and 1. The higher the value, the more of free memory is
1102 * unusable and by implication, the worse the external fragmentation is. This
1103 * can be expressed as a percentage by multiplying by 100.
1104 */
1105static int unusable_show(struct seq_file *m, void *arg)
1106{
1107 pg_data_t *pgdat = (pg_data_t *)arg;
1108
1109 /* check memoryless node */
1110 if (!node_state(pgdat->node_id, N_HIGH_MEMORY))
1111 return 0;
1112
1113 walk_zones_in_node(m, pgdat, unusable_show_print);
1114
1115 return 0;
1116}
1117
1118static const struct seq_operations unusable_op = {
1119 .start = frag_start,
1120 .next = frag_next,
1121 .stop = frag_stop,
1122 .show = unusable_show,
1123};
1124
1125static int unusable_open(struct inode *inode, struct file *file)
1126{
1127 return seq_open(file, &unusable_op);
1128}
1129
1130static const struct file_operations unusable_file_ops = {
1131 .open = unusable_open,
1132 .read = seq_read,
1133 .llseek = seq_lseek,
1134 .release = seq_release,
1135};
1136
1137static void extfrag_show_print(struct seq_file *m,
1138 pg_data_t *pgdat, struct zone *zone)
1139{
1140 unsigned int order;
1141 int index;
1142
1143 /* Alloc on stack as interrupts are disabled for zone walk */
1144 struct contig_page_info info;
1145
1146 seq_printf(m, "Node %d, zone %8s ",
1147 pgdat->node_id,
1148 zone->name);
1149 for (order = 0; order < MAX_ORDER; ++order) {
1150 fill_contig_page_info(zone, order, &info);
1151 index = __fragmentation_index(order, &info);
1152 seq_printf(m, "%d.%03d ", index / 1000, index % 1000);
1153 }
1154
1155 seq_putc(m, '\n');
1156}
1157
1158/*
1159 * Display fragmentation index for orders that allocations would fail for
1160 */
1161static int extfrag_show(struct seq_file *m, void *arg)
1162{
1163 pg_data_t *pgdat = (pg_data_t *)arg;
1164
1165 walk_zones_in_node(m, pgdat, extfrag_show_print);
1166
1167 return 0;
1168}
1169
1170static const struct seq_operations extfrag_op = {
1171 .start = frag_start,
1172 .next = frag_next,
1173 .stop = frag_stop,
1174 .show = extfrag_show,
1175};
1176
1177static int extfrag_open(struct inode *inode, struct file *file)
1178{
1179 return seq_open(file, &extfrag_op);
1180}
1181
1182static const struct file_operations extfrag_file_ops = {
1183 .open = extfrag_open,
1184 .read = seq_read,
1185 .llseek = seq_lseek,
1186 .release = seq_release,
1187};
1188
1189static int __init extfrag_debug_init(void)
1190{
1191 extfrag_debug_root = debugfs_create_dir("extfrag", NULL);
1192 if (!extfrag_debug_root)
1193 return -ENOMEM;
1194
1195 if (!debugfs_create_file("unusable_index", 0444,
1196 extfrag_debug_root, NULL, &unusable_file_ops))
1197 return -ENOMEM;
1198
1199 if (!debugfs_create_file("extfrag_index", 0444,
1200 extfrag_debug_root, NULL, &extfrag_file_ops))
1201 return -ENOMEM;
1202
1203 return 0;
1204}
1205
1206module_init(extfrag_debug_init);
1207#endif
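Editor's note: the two debugfs files registered above can be inspected from userspace once debugfs is mounted (the conventional /sys/kernel/debug mount point is assumed here). A small illustrative reader, not part of the patch:

#include <stdio.h>

int main(void)
{
	/* each file prints one line per zone: "Node N, zone NAME" followed by
	 * one three-decimal index per order, as in extfrag_show_print() above */
	const char *paths[] = {
		"/sys/kernel/debug/extfrag/unusable_index",
		"/sys/kernel/debug/extfrag/extfrag_index",
	};
	char line[256];

	for (int i = 0; i < 2; i++) {
		FILE *f = fopen(paths[i], "r");
		if (!f) {
			perror(paths[i]);
			continue;
		}
		printf("== %s ==\n", paths[i]);
		while (fgets(line, sizeof(line), f))
			fputs(line, stdout);
		fclose(f);
	}
	return 0;
}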