path: root/mm
Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig              17
-rw-r--r--  mm/Makefile              1
-rw-r--r--  mm/compaction.c        605
-rw-r--r--  mm/filemap.c            14
-rw-r--r--  mm/highmem.c             2
-rw-r--r--  mm/hugetlb.c            12
-rw-r--r--  mm/ksm.c                 4
-rw-r--r--  mm/memory.c             13
-rw-r--r--  mm/memory_hotplug.c     36
-rw-r--r--  mm/mempolicy.c         226
-rw-r--r--  mm/migrate.c            72
-rw-r--r--  mm/mincore.c           263
-rw-r--r--  mm/page_alloc.c        267
-rw-r--r--  mm/readahead.c           2
-rw-r--r--  mm/rmap.c               40
-rw-r--r--  mm/shmem.c               2
-rw-r--r--  mm/slab.c                4
-rw-r--r--  mm/slub.c                6
-rw-r--r--  mm/sparse.c              9
-rw-r--r--  mm/vmscan.c            213
-rw-r--r--  mm/vmstat.c            253
21 files changed, 1684 insertions, 377 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 9c61158308dc..527136b22384 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -172,6 +172,15 @@ config SPLIT_PTLOCK_CPUS
172 default "4" 172 default "4"
173 173
174# 174#
175# support for memory compaction
176config COMPACTION
177 bool "Allow for memory compaction"
178 select MIGRATION
179 depends on EXPERIMENTAL && HUGETLB_PAGE && MMU
180 help
181 Allows the compaction of memory for the allocation of huge pages.
182
183#
175# support for page migration 184# support for page migration
176# 185#
177config MIGRATION 186config MIGRATION
@@ -180,9 +189,11 @@ config MIGRATION
180 depends on NUMA || ARCH_ENABLE_MEMORY_HOTREMOVE 189 depends on NUMA || ARCH_ENABLE_MEMORY_HOTREMOVE
181 help 190 help
182 Allows the migration of the physical location of pages of processes 191 Allows the migration of the physical location of pages of processes
183 while the virtual addresses are not changed. This is useful for 192 while the virtual addresses are not changed. This is useful in
184 example on NUMA systems to put pages nearer to the processors accessing 193 two situations. The first is on NUMA systems to put pages nearer
185 the page. 194 to the processors accessing. The second is when allocating huge
195 pages as migration can relocate pages to satisfy a huge page
196 allocation instead of reclaiming.
186 197
187config PHYS_ADDR_T_64BIT 198config PHYS_ADDR_T_64BIT
188 def_bool 64BIT || ARCH_PHYS_ADDR_T_64BIT 199 def_bool 64BIT || ARCH_PHYS_ADDR_T_64BIT
diff --git a/mm/Makefile b/mm/Makefile
index 6c2a73a54a43..8982504bd03b 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -23,6 +23,7 @@ obj-$(CONFIG_NUMA) += mempolicy.o
23obj-$(CONFIG_SPARSEMEM) += sparse.o 23obj-$(CONFIG_SPARSEMEM) += sparse.o
24obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o 24obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o
25obj-$(CONFIG_SLOB) += slob.o 25obj-$(CONFIG_SLOB) += slob.o
26obj-$(CONFIG_COMPACTION) += compaction.o
26obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o 27obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o
27obj-$(CONFIG_KSM) += ksm.o 28obj-$(CONFIG_KSM) += ksm.o
28obj-$(CONFIG_PAGE_POISONING) += debug-pagealloc.o 29obj-$(CONFIG_PAGE_POISONING) += debug-pagealloc.o
diff --git a/mm/compaction.c b/mm/compaction.c
new file mode 100644
index 000000000000..94cce51b0b35
--- /dev/null
+++ b/mm/compaction.c
@@ -0,0 +1,605 @@
1/*
2 * linux/mm/compaction.c
3 *
4 * Memory compaction for the reduction of external fragmentation. Note that
5 * this heavily depends upon page migration to do all the real heavy
6 * lifting
7 *
8 * Copyright IBM Corp. 2007-2010 Mel Gorman <mel@csn.ul.ie>
9 */
10#include <linux/swap.h>
11#include <linux/migrate.h>
12#include <linux/compaction.h>
13#include <linux/mm_inline.h>
14#include <linux/backing-dev.h>
15#include <linux/sysctl.h>
16#include <linux/sysfs.h>
17#include "internal.h"
18
19/*
20 * compact_control is used to track pages being migrated and the free pages
21 * they are being migrated to during memory compaction. The free_pfn starts
22 * at the end of a zone and migrate_pfn begins at the start. Movable pages
23 * are moved to the end of a zone during a compaction run and the run
24 * completes when free_pfn <= migrate_pfn
25 */
26struct compact_control {
27 struct list_head freepages; /* List of free pages to migrate to */
28 struct list_head migratepages; /* List of pages being migrated */
29 unsigned long nr_freepages; /* Number of isolated free pages */
30 unsigned long nr_migratepages; /* Number of pages to migrate */
31 unsigned long free_pfn; /* isolate_freepages search base */
32 unsigned long migrate_pfn; /* isolate_migratepages search base */
33
34 /* Account for isolated anon and file pages */
35 unsigned long nr_anon;
36 unsigned long nr_file;
37
38 unsigned int order; /* order a direct compactor needs */
39 int migratetype; /* MOVABLE, RECLAIMABLE etc */
40 struct zone *zone;
41};
42
43static unsigned long release_freepages(struct list_head *freelist)
44{
45 struct page *page, *next;
46 unsigned long count = 0;
47
48 list_for_each_entry_safe(page, next, freelist, lru) {
49 list_del(&page->lru);
50 __free_page(page);
51 count++;
52 }
53
54 return count;
55}
56
57/* Isolate free pages onto a private freelist. Must hold zone->lock */
58static unsigned long isolate_freepages_block(struct zone *zone,
59 unsigned long blockpfn,
60 struct list_head *freelist)
61{
62 unsigned long zone_end_pfn, end_pfn;
63 int total_isolated = 0;
64 struct page *cursor;
65
66 /* Get the last PFN we should scan for free pages at */
67 zone_end_pfn = zone->zone_start_pfn + zone->spanned_pages;
68 end_pfn = min(blockpfn + pageblock_nr_pages, zone_end_pfn);
69
70 /* Find the first usable PFN in the block to initialse page cursor */
71 for (; blockpfn < end_pfn; blockpfn++) {
72 if (pfn_valid_within(blockpfn))
73 break;
74 }
75 cursor = pfn_to_page(blockpfn);
76
77 /* Isolate free pages. This assumes the block is valid */
78 for (; blockpfn < end_pfn; blockpfn++, cursor++) {
79 int isolated, i;
80 struct page *page = cursor;
81
82 if (!pfn_valid_within(blockpfn))
83 continue;
84
85 if (!PageBuddy(page))
86 continue;
87
88 /* Found a free page, break it into order-0 pages */
89 isolated = split_free_page(page);
90 total_isolated += isolated;
91 for (i = 0; i < isolated; i++) {
92 list_add(&page->lru, freelist);
93 page++;
94 }
95
96 /* If a page was split, advance to the end of it */
97 if (isolated) {
98 blockpfn += isolated - 1;
99 cursor += isolated - 1;
100 }
101 }
102
103 return total_isolated;
104}
105
106/* Returns true if the page is within a block suitable for migration to */
107static bool suitable_migration_target(struct page *page)
108{
109
110 int migratetype = get_pageblock_migratetype(page);
111
112 /* Don't interfere with memory hot-remove or the min_free_kbytes blocks */
113 if (migratetype == MIGRATE_ISOLATE || migratetype == MIGRATE_RESERVE)
114 return false;
115
116 /* If the page is a large free page, then allow migration */
117 if (PageBuddy(page) && page_order(page) >= pageblock_order)
118 return true;
119
120 /* If the block is MIGRATE_MOVABLE, allow migration */
121 if (migratetype == MIGRATE_MOVABLE)
122 return true;
123
124 /* Otherwise skip the block */
125 return false;
126}
127
128/*
129 * Based on information in the current compact_control, find blocks
130 * suitable for isolating free pages from and then isolate them.
131 */
132static void isolate_freepages(struct zone *zone,
133 struct compact_control *cc)
134{
135 struct page *page;
136 unsigned long high_pfn, low_pfn, pfn;
137 unsigned long flags;
138 int nr_freepages = cc->nr_freepages;
139 struct list_head *freelist = &cc->freepages;
140
141 pfn = cc->free_pfn;
142 low_pfn = cc->migrate_pfn + pageblock_nr_pages;
143 high_pfn = low_pfn;
144
145 /*
146 * Isolate free pages until enough are available to migrate the
147 * pages on cc->migratepages. We stop searching if the migrate
148 * and free page scanners meet or enough free pages are isolated.
149 */
150 spin_lock_irqsave(&zone->lock, flags);
151 for (; pfn > low_pfn && cc->nr_migratepages > nr_freepages;
152 pfn -= pageblock_nr_pages) {
153 unsigned long isolated;
154
155 if (!pfn_valid(pfn))
156 continue;
157
158 /*
159 * Check for overlapping nodes/zones. It's possible on some
160 * configurations to have a setup like
161 * node0 node1 node0
162 * i.e. it's possible that all pages within a zones range of
163 * pages do not belong to a single zone.
164 */
165 page = pfn_to_page(pfn);
166 if (page_zone(page) != zone)
167 continue;
168
169 /* Check the block is suitable for migration */
170 if (!suitable_migration_target(page))
171 continue;
172
173 /* Found a block suitable for isolating free pages from */
174 isolated = isolate_freepages_block(zone, pfn, freelist);
175 nr_freepages += isolated;
176
177 /*
178 * Record the highest PFN we isolated pages from. When next
179 * looking for free pages, the search will restart here as
180 * page migration may have returned some pages to the allocator
181 */
182 if (isolated)
183 high_pfn = max(high_pfn, pfn);
184 }
185 spin_unlock_irqrestore(&zone->lock, flags);
186
187 /* split_free_page does not map the pages */
188 list_for_each_entry(page, freelist, lru) {
189 arch_alloc_page(page, 0);
190 kernel_map_pages(page, 1, 1);
191 }
192
193 cc->free_pfn = high_pfn;
194 cc->nr_freepages = nr_freepages;
195}
196
197/* Update the number of anon and file isolated pages in the zone */
198static void acct_isolated(struct zone *zone, struct compact_control *cc)
199{
200 struct page *page;
201 unsigned int count[NR_LRU_LISTS] = { 0, };
202
203 list_for_each_entry(page, &cc->migratepages, lru) {
204 int lru = page_lru_base_type(page);
205 count[lru]++;
206 }
207
208 cc->nr_anon = count[LRU_ACTIVE_ANON] + count[LRU_INACTIVE_ANON];
209 cc->nr_file = count[LRU_ACTIVE_FILE] + count[LRU_INACTIVE_FILE];
210 __mod_zone_page_state(zone, NR_ISOLATED_ANON, cc->nr_anon);
211 __mod_zone_page_state(zone, NR_ISOLATED_FILE, cc->nr_file);
212}
213
214/* Similar to reclaim, but different enough that they don't share logic */
215static bool too_many_isolated(struct zone *zone)
216{
217
218 unsigned long inactive, isolated;
219
220 inactive = zone_page_state(zone, NR_INACTIVE_FILE) +
221 zone_page_state(zone, NR_INACTIVE_ANON);
222 isolated = zone_page_state(zone, NR_ISOLATED_FILE) +
223 zone_page_state(zone, NR_ISOLATED_ANON);
224
225 return isolated > inactive;
226}
227
228/*
229 * Isolate all pages that can be migrated from the block pointed to by
230 * the migrate scanner within compact_control.
231 */
232static unsigned long isolate_migratepages(struct zone *zone,
233 struct compact_control *cc)
234{
235 unsigned long low_pfn, end_pfn;
236 struct list_head *migratelist = &cc->migratepages;
237
238 /* Do not scan outside zone boundaries */
239 low_pfn = max(cc->migrate_pfn, zone->zone_start_pfn);
240
241 /* Only scan within a pageblock boundary */
242 end_pfn = ALIGN(low_pfn + pageblock_nr_pages, pageblock_nr_pages);
243
244 /* Do not cross the free scanner or scan within a memory hole */
245 if (end_pfn > cc->free_pfn || !pfn_valid(low_pfn)) {
246 cc->migrate_pfn = end_pfn;
247 return 0;
248 }
249
250 /*
251 * Ensure that there are not too many pages isolated from the LRU
252 * list by either parallel reclaimers or compaction. If there are,
253 * delay for some time until fewer pages are isolated
254 */
255 while (unlikely(too_many_isolated(zone))) {
256 congestion_wait(BLK_RW_ASYNC, HZ/10);
257
258 if (fatal_signal_pending(current))
259 return 0;
260 }
261
262 /* Time to isolate some pages for migration */
263 spin_lock_irq(&zone->lru_lock);
264 for (; low_pfn < end_pfn; low_pfn++) {
265 struct page *page;
266 if (!pfn_valid_within(low_pfn))
267 continue;
268
269 /* Get the page and skip if free */
270 page = pfn_to_page(low_pfn);
271 if (PageBuddy(page))
272 continue;
273
274 /* Try isolate the page */
275 if (__isolate_lru_page(page, ISOLATE_BOTH, 0) != 0)
276 continue;
277
278 /* Successfully isolated */
279 del_page_from_lru_list(zone, page, page_lru(page));
280 list_add(&page->lru, migratelist);
281 mem_cgroup_del_lru(page);
282 cc->nr_migratepages++;
283
284 /* Avoid isolating too much */
285 if (cc->nr_migratepages == COMPACT_CLUSTER_MAX)
286 break;
287 }
288
289 acct_isolated(zone, cc);
290
291 spin_unlock_irq(&zone->lru_lock);
292 cc->migrate_pfn = low_pfn;
293
294 return cc->nr_migratepages;
295}
296
297/*
298 * This is a migrate-callback that "allocates" freepages by taking pages
299 * from the isolated freelists in the block we are migrating to.
300 */
301static struct page *compaction_alloc(struct page *migratepage,
302 unsigned long data,
303 int **result)
304{
305 struct compact_control *cc = (struct compact_control *)data;
306 struct page *freepage;
307
308 /* Isolate free pages if necessary */
309 if (list_empty(&cc->freepages)) {
310 isolate_freepages(cc->zone, cc);
311
312 if (list_empty(&cc->freepages))
313 return NULL;
314 }
315
316 freepage = list_entry(cc->freepages.next, struct page, lru);
317 list_del(&freepage->lru);
318 cc->nr_freepages--;
319
320 return freepage;
321}
322
323/*
324 * We cannot control nr_migratepages and nr_freepages fully when migration is
325 * running as migrate_pages() has no knowledge of compact_control. When
326 * migration is complete, we count the number of pages on the lists by hand.
327 */
328static void update_nr_listpages(struct compact_control *cc)
329{
330 int nr_migratepages = 0;
331 int nr_freepages = 0;
332 struct page *page;
333
334 list_for_each_entry(page, &cc->migratepages, lru)
335 nr_migratepages++;
336 list_for_each_entry(page, &cc->freepages, lru)
337 nr_freepages++;
338
339 cc->nr_migratepages = nr_migratepages;
340 cc->nr_freepages = nr_freepages;
341}
342
343static int compact_finished(struct zone *zone,
344 struct compact_control *cc)
345{
346 unsigned int order;
347 unsigned long watermark = low_wmark_pages(zone) + (1 << cc->order);
348
349 if (fatal_signal_pending(current))
350 return COMPACT_PARTIAL;
351
352 /* Compaction run completes if the migrate and free scanner meet */
353 if (cc->free_pfn <= cc->migrate_pfn)
354 return COMPACT_COMPLETE;
355
356 /* Compaction run is not finished if the watermark is not met */
357 if (!zone_watermark_ok(zone, cc->order, watermark, 0, 0))
358 return COMPACT_CONTINUE;
359
360 if (cc->order == -1)
361 return COMPACT_CONTINUE;
362
363 /* Direct compactor: Is a suitable page free? */
364 for (order = cc->order; order < MAX_ORDER; order++) {
365 /* Job done if page is free of the right migratetype */
366 if (!list_empty(&zone->free_area[order].free_list[cc->migratetype]))
367 return COMPACT_PARTIAL;
368
369 /* Job done if allocation would set block type */
370 if (order >= pageblock_order && zone->free_area[order].nr_free)
371 return COMPACT_PARTIAL;
372 }
373
374 return COMPACT_CONTINUE;
375}
376
377static int compact_zone(struct zone *zone, struct compact_control *cc)
378{
379 int ret;
380
381 /* Setup to move all movable pages to the end of the zone */
382 cc->migrate_pfn = zone->zone_start_pfn;
383 cc->free_pfn = cc->migrate_pfn + zone->spanned_pages;
384 cc->free_pfn &= ~(pageblock_nr_pages-1);
385
386 migrate_prep_local();
387
388 while ((ret = compact_finished(zone, cc)) == COMPACT_CONTINUE) {
389 unsigned long nr_migrate, nr_remaining;
390
391 if (!isolate_migratepages(zone, cc))
392 continue;
393
394 nr_migrate = cc->nr_migratepages;
395 migrate_pages(&cc->migratepages, compaction_alloc,
396 (unsigned long)cc, 0);
397 update_nr_listpages(cc);
398 nr_remaining = cc->nr_migratepages;
399
400 count_vm_event(COMPACTBLOCKS);
401 count_vm_events(COMPACTPAGES, nr_migrate - nr_remaining);
402 if (nr_remaining)
403 count_vm_events(COMPACTPAGEFAILED, nr_remaining);
404
405 /* Release LRU pages not migrated */
406 if (!list_empty(&cc->migratepages)) {
407 putback_lru_pages(&cc->migratepages);
408 cc->nr_migratepages = 0;
409 }
410
411 }
412
413 /* Release free pages and check accounting */
414 cc->nr_freepages -= release_freepages(&cc->freepages);
415 VM_BUG_ON(cc->nr_freepages != 0);
416
417 return ret;
418}
419
420static unsigned long compact_zone_order(struct zone *zone,
421 int order, gfp_t gfp_mask)
422{
423 struct compact_control cc = {
424 .nr_freepages = 0,
425 .nr_migratepages = 0,
426 .order = order,
427 .migratetype = allocflags_to_migratetype(gfp_mask),
428 .zone = zone,
429 };
430 INIT_LIST_HEAD(&cc.freepages);
431 INIT_LIST_HEAD(&cc.migratepages);
432
433 return compact_zone(zone, &cc);
434}
435
436int sysctl_extfrag_threshold = 500;
437
438/**
439 * try_to_compact_pages - Direct compact to satisfy a high-order allocation
440 * @zonelist: The zonelist used for the current allocation
441 * @order: The order of the current allocation
442 * @gfp_mask: The GFP mask of the current allocation
443 * @nodemask: The allowed nodes to allocate from
444 *
445 * This is the main entry point for direct page compaction.
446 */
447unsigned long try_to_compact_pages(struct zonelist *zonelist,
448 int order, gfp_t gfp_mask, nodemask_t *nodemask)
449{
450 enum zone_type high_zoneidx = gfp_zone(gfp_mask);
451 int may_enter_fs = gfp_mask & __GFP_FS;
452 int may_perform_io = gfp_mask & __GFP_IO;
453 unsigned long watermark;
454 struct zoneref *z;
455 struct zone *zone;
456 int rc = COMPACT_SKIPPED;
457
458 /*
459 * Check whether it is worth even starting compaction. The order check is
460 * made because an assumption is made that the page allocator can satisfy
461 * the "cheaper" orders without taking special steps
462 */
463 if (order <= PAGE_ALLOC_COSTLY_ORDER || !may_enter_fs || !may_perform_io)
464 return rc;
465
466 count_vm_event(COMPACTSTALL);
467
468 /* Compact each zone in the list */
469 for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx,
470 nodemask) {
471 int fragindex;
472 int status;
473
474 /*
475 * Watermarks for order-0 must be met for compaction. Note
476 * the 2UL. This is because during migration, copies of
477 * pages need to be allocated and for a short time, the
478 * footprint is higher
479 */
480 watermark = low_wmark_pages(zone) + (2UL << order);
481 if (!zone_watermark_ok(zone, 0, watermark, 0, 0))
482 continue;
483
484 /*
485 * fragmentation index determines if allocation failures are
486 * due to low memory or external fragmentation
487 *
488 * index of -1 implies allocations might succeed depending
489 * on watermarks
490 * index towards 0 implies failure is due to lack of memory
491 * index towards 1000 implies failure is due to fragmentation
492 *
493 * Only compact if a failure would be due to fragmentation.
494 */
495 fragindex = fragmentation_index(zone, order);
496 if (fragindex >= 0 && fragindex <= sysctl_extfrag_threshold)
497 continue;
498
499 if (fragindex == -1 && zone_watermark_ok(zone, order, watermark, 0, 0)) {
500 rc = COMPACT_PARTIAL;
501 break;
502 }
503
504 status = compact_zone_order(zone, order, gfp_mask);
505 rc = max(status, rc);
506
507 if (zone_watermark_ok(zone, order, watermark, 0, 0))
508 break;
509 }
510
511 return rc;
512}
513
514
515/* Compact all zones within a node */
516static int compact_node(int nid)
517{
518 int zoneid;
519 pg_data_t *pgdat;
520 struct zone *zone;
521
522 if (nid < 0 || nid >= nr_node_ids || !node_online(nid))
523 return -EINVAL;
524 pgdat = NODE_DATA(nid);
525
526 /* Flush pending updates to the LRU lists */
527 lru_add_drain_all();
528
529 for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) {
530 struct compact_control cc = {
531 .nr_freepages = 0,
532 .nr_migratepages = 0,
533 .order = -1,
534 };
535
536 zone = &pgdat->node_zones[zoneid];
537 if (!populated_zone(zone))
538 continue;
539
540 cc.zone = zone;
541 INIT_LIST_HEAD(&cc.freepages);
542 INIT_LIST_HEAD(&cc.migratepages);
543
544 compact_zone(zone, &cc);
545
546 VM_BUG_ON(!list_empty(&cc.freepages));
547 VM_BUG_ON(!list_empty(&cc.migratepages));
548 }
549
550 return 0;
551}
552
553/* Compact all nodes in the system */
554static int compact_nodes(void)
555{
556 int nid;
557
558 for_each_online_node(nid)
559 compact_node(nid);
560
561 return COMPACT_COMPLETE;
562}
563
564/* The written value is actually unused, all memory is compacted */
565int sysctl_compact_memory;
566
567/* This is the entry point for compacting all nodes via /proc/sys/vm */
568int sysctl_compaction_handler(struct ctl_table *table, int write,
569 void __user *buffer, size_t *length, loff_t *ppos)
570{
571 if (write)
572 return compact_nodes();
573
574 return 0;
575}
576
577int sysctl_extfrag_handler(struct ctl_table *table, int write,
578 void __user *buffer, size_t *length, loff_t *ppos)
579{
580 proc_dointvec_minmax(table, write, buffer, length, ppos);
581
582 return 0;
583}
584
585#if defined(CONFIG_SYSFS) && defined(CONFIG_NUMA)
586ssize_t sysfs_compact_node(struct sys_device *dev,
587 struct sysdev_attribute *attr,
588 const char *buf, size_t count)
589{
590 compact_node(dev->id);
591
592 return count;
593}
594static SYSDEV_ATTR(compact, S_IWUSR, NULL, sysfs_compact_node);
595
596int compaction_register_node(struct node *node)
597{
598 return sysdev_create_file(&node->sysdev, &attr_compact);
599}
600
601void compaction_unregister_node(struct node *node)
602{
603 return sysdev_remove_file(&node->sysdev, &attr_compact);
604}
605#endif /* CONFIG_SYSFS && CONFIG_NUMA */
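
Aside (not part of the patch): sysctl_compaction_handler() above is described as the entry point for compacting all nodes via /proc/sys/vm, so a minimal userspace sketch for triggering it might look like the following. The exact proc path (/proc/sys/vm/compact_memory) and the written value are assumptions here, since the sysctl table entry is wired up outside this diff and the comment notes the written value is unused.

#include <stdio.h>

/* Illustrative sketch only: ask the kernel to compact all nodes via the
 * sysctl added by this patch. The proc path is assumed from the handler's
 * comment; sysctl_compaction_handler() ignores the value written. */
int main(void)
{
	FILE *f = fopen("/proc/sys/vm/compact_memory", "w");

	if (!f) {
		perror("compact_memory");
		return 1;
	}
	fputs("1\n", f);
	fclose(f);
	return 0;
}
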
diff --git a/mm/filemap.c b/mm/filemap.c
index 140ebda9640f..88d719665a28 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -441,7 +441,7 @@ int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
441 /* 441 /*
442 * Splice_read and readahead add shmem/tmpfs pages into the page cache 442 * Splice_read and readahead add shmem/tmpfs pages into the page cache
443 * before shmem_readpage has a chance to mark them as SwapBacked: they 443 * before shmem_readpage has a chance to mark them as SwapBacked: they
444 * need to go on the active_anon lru below, and mem_cgroup_cache_charge 444 * need to go on the anon lru below, and mem_cgroup_cache_charge
445 * (called in add_to_page_cache) needs to know where they're going too. 445 * (called in add_to_page_cache) needs to know where they're going too.
446 */ 446 */
447 if (mapping_cap_swap_backed(mapping)) 447 if (mapping_cap_swap_backed(mapping))
@@ -452,7 +452,7 @@ int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
452 if (page_is_file_cache(page)) 452 if (page_is_file_cache(page))
453 lru_cache_add_file(page); 453 lru_cache_add_file(page);
454 else 454 else
455 lru_cache_add_active_anon(page); 455 lru_cache_add_anon(page);
456 } 456 }
457 return ret; 457 return ret;
458} 458}
@@ -461,9 +461,15 @@ EXPORT_SYMBOL_GPL(add_to_page_cache_lru);
461#ifdef CONFIG_NUMA 461#ifdef CONFIG_NUMA
462struct page *__page_cache_alloc(gfp_t gfp) 462struct page *__page_cache_alloc(gfp_t gfp)
463{ 463{
464 int n;
465 struct page *page;
466
464 if (cpuset_do_page_mem_spread()) { 467 if (cpuset_do_page_mem_spread()) {
465 int n = cpuset_mem_spread_node(); 468 get_mems_allowed();
466 return alloc_pages_exact_node(n, gfp, 0); 469 n = cpuset_mem_spread_node();
470 page = alloc_pages_exact_node(n, gfp, 0);
471 put_mems_allowed();
472 return page;
467 } 473 }
468 return alloc_pages(gfp, 0); 474 return alloc_pages(gfp, 0);
469} 475}
diff --git a/mm/highmem.c b/mm/highmem.c
index bed8a8bfd01f..66baa20f78f5 100644
--- a/mm/highmem.c
+++ b/mm/highmem.c
@@ -422,7 +422,7 @@ void __init page_address_init(void)
422 422
423#endif /* defined(CONFIG_HIGHMEM) && !defined(WANT_PAGE_VIRTUAL) */ 423#endif /* defined(CONFIG_HIGHMEM) && !defined(WANT_PAGE_VIRTUAL) */
424 424
425#if defined(CONFIG_DEBUG_HIGHMEM) && defined(CONFIG_TRACE_IRQFLAGS_SUPPORT) 425#ifdef CONFIG_DEBUG_HIGHMEM
426 426
427void debug_kmap_atomic(enum km_type type) 427void debug_kmap_atomic(enum km_type type)
428{ 428{
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 4c9e6bbf3772..54d42b009dbe 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -465,11 +465,13 @@ static struct page *dequeue_huge_page_vma(struct hstate *h,
465 struct page *page = NULL; 465 struct page *page = NULL;
466 struct mempolicy *mpol; 466 struct mempolicy *mpol;
467 nodemask_t *nodemask; 467 nodemask_t *nodemask;
468 struct zonelist *zonelist = huge_zonelist(vma, address, 468 struct zonelist *zonelist;
469 htlb_alloc_mask, &mpol, &nodemask);
470 struct zone *zone; 469 struct zone *zone;
471 struct zoneref *z; 470 struct zoneref *z;
472 471
472 get_mems_allowed();
473 zonelist = huge_zonelist(vma, address,
474 htlb_alloc_mask, &mpol, &nodemask);
473 /* 475 /*
474 * A child process with MAP_PRIVATE mappings created by their parent 476 * A child process with MAP_PRIVATE mappings created by their parent
475 * have no page reserves. This check ensures that reservations are 477 * have no page reserves. This check ensures that reservations are
@@ -477,11 +479,11 @@ static struct page *dequeue_huge_page_vma(struct hstate *h,
477 */ 479 */
478 if (!vma_has_reserves(vma) && 480 if (!vma_has_reserves(vma) &&
479 h->free_huge_pages - h->resv_huge_pages == 0) 481 h->free_huge_pages - h->resv_huge_pages == 0)
480 return NULL; 482 goto err;
481 483
482 /* If reserves cannot be used, ensure enough pages are in the pool */ 484 /* If reserves cannot be used, ensure enough pages are in the pool */
483 if (avoid_reserve && h->free_huge_pages - h->resv_huge_pages == 0) 485 if (avoid_reserve && h->free_huge_pages - h->resv_huge_pages == 0)
484 return NULL; 486 goto err;;
485 487
486 for_each_zone_zonelist_nodemask(zone, z, zonelist, 488 for_each_zone_zonelist_nodemask(zone, z, zonelist,
487 MAX_NR_ZONES - 1, nodemask) { 489 MAX_NR_ZONES - 1, nodemask) {
@@ -500,7 +502,9 @@ static struct page *dequeue_huge_page_vma(struct hstate *h,
500 break; 502 break;
501 } 503 }
502 } 504 }
505err:
503 mpol_cond_put(mpol); 506 mpol_cond_put(mpol);
507 put_mems_allowed();
504 return page; 508 return page;
505} 509}
506 510
diff --git a/mm/ksm.c b/mm/ksm.c
index 956880f2ff49..6c3e99b4ae7c 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -318,14 +318,14 @@ static void hold_anon_vma(struct rmap_item *rmap_item,
318 struct anon_vma *anon_vma) 318 struct anon_vma *anon_vma)
319{ 319{
320 rmap_item->anon_vma = anon_vma; 320 rmap_item->anon_vma = anon_vma;
321 atomic_inc(&anon_vma->ksm_refcount); 321 atomic_inc(&anon_vma->external_refcount);
322} 322}
323 323
324static void drop_anon_vma(struct rmap_item *rmap_item) 324static void drop_anon_vma(struct rmap_item *rmap_item)
325{ 325{
326 struct anon_vma *anon_vma = rmap_item->anon_vma; 326 struct anon_vma *anon_vma = rmap_item->anon_vma;
327 327
328 if (atomic_dec_and_lock(&anon_vma->ksm_refcount, &anon_vma->lock)) { 328 if (atomic_dec_and_lock(&anon_vma->external_refcount, &anon_vma->lock)) {
329 int empty = list_empty(&anon_vma->head); 329 int empty = list_empty(&anon_vma->head);
330 spin_unlock(&anon_vma->lock); 330 spin_unlock(&anon_vma->lock);
331 if (empty) 331 if (empty)
diff --git a/mm/memory.c b/mm/memory.c
index 833952d8b74d..119b7ccdf39b 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1227,8 +1227,17 @@ int zap_vma_ptes(struct vm_area_struct *vma, unsigned long address,
1227} 1227}
1228EXPORT_SYMBOL_GPL(zap_vma_ptes); 1228EXPORT_SYMBOL_GPL(zap_vma_ptes);
1229 1229
1230/* 1230/**
1231 * Do a quick page-table lookup for a single page. 1231 * follow_page - look up a page descriptor from a user-virtual address
1232 * @vma: vm_area_struct mapping @address
1233 * @address: virtual address to look up
1234 * @flags: flags modifying lookup behaviour
1235 *
1236 * @flags can have FOLL_ flags set, defined in <linux/mm.h>
1237 *
1238 * Returns the mapped (struct page *), %NULL if no mapping exists, or
1239 * an error pointer if there is a mapping to something not represented
1240 * by a page descriptor (see also vm_normal_page()).
1232 */ 1241 */
1233struct page *follow_page(struct vm_area_struct *vma, unsigned long address, 1242struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
1234 unsigned int flags) 1243 unsigned int flags)
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index be211a582930..a4cfcdc00455 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -415,12 +415,14 @@ int online_pages(unsigned long pfn, unsigned long nr_pages)
415 * This means the page allocator ignores this zone. 415 * This means the page allocator ignores this zone.
416 * So, zonelist must be updated after online. 416 * So, zonelist must be updated after online.
417 */ 417 */
418 mutex_lock(&zonelists_mutex);
418 if (!populated_zone(zone)) 419 if (!populated_zone(zone))
419 need_zonelists_rebuild = 1; 420 need_zonelists_rebuild = 1;
420 421
421 ret = walk_system_ram_range(pfn, nr_pages, &onlined_pages, 422 ret = walk_system_ram_range(pfn, nr_pages, &onlined_pages,
422 online_pages_range); 423 online_pages_range);
423 if (ret) { 424 if (ret) {
425 mutex_unlock(&zonelists_mutex);
424 printk(KERN_DEBUG "online_pages %lx at %lx failed\n", 426 printk(KERN_DEBUG "online_pages %lx at %lx failed\n",
425 nr_pages, pfn); 427 nr_pages, pfn);
426 memory_notify(MEM_CANCEL_ONLINE, &arg); 428 memory_notify(MEM_CANCEL_ONLINE, &arg);
@@ -429,8 +431,12 @@ int online_pages(unsigned long pfn, unsigned long nr_pages)
429 431
430 zone->present_pages += onlined_pages; 432 zone->present_pages += onlined_pages;
431 zone->zone_pgdat->node_present_pages += onlined_pages; 433 zone->zone_pgdat->node_present_pages += onlined_pages;
434 if (need_zonelists_rebuild)
435 build_all_zonelists(zone);
436 else
437 zone_pcp_update(zone);
432 438
433 zone_pcp_update(zone); 439 mutex_unlock(&zonelists_mutex);
434 setup_per_zone_wmarks(); 440 setup_per_zone_wmarks();
435 calculate_zone_inactive_ratio(zone); 441 calculate_zone_inactive_ratio(zone);
436 if (onlined_pages) { 442 if (onlined_pages) {
@@ -438,10 +444,7 @@ int online_pages(unsigned long pfn, unsigned long nr_pages)
438 node_set_state(zone_to_nid(zone), N_HIGH_MEMORY); 444 node_set_state(zone_to_nid(zone), N_HIGH_MEMORY);
439 } 445 }
440 446
441 if (need_zonelists_rebuild) 447 vm_total_pages = nr_free_pagecache_pages();
442 build_all_zonelists();
443 else
444 vm_total_pages = nr_free_pagecache_pages();
445 448
446 writeback_set_ratelimit(); 449 writeback_set_ratelimit();
447 450
@@ -482,6 +485,29 @@ static void rollback_node_hotadd(int nid, pg_data_t *pgdat)
482} 485}
483 486
484 487
488/*
489 * called by cpu_up() to online a node without onlined memory.
490 */
491int mem_online_node(int nid)
492{
493 pg_data_t *pgdat;
494 int ret;
495
496 lock_system_sleep();
497 pgdat = hotadd_new_pgdat(nid, 0);
498 if (pgdat) {
499 ret = -ENOMEM;
500 goto out;
501 }
502 node_set_online(nid);
503 ret = register_one_node(nid);
504 BUG_ON(ret);
505
506out:
507 unlock_system_sleep();
508 return ret;
509}
510
485/* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */ 511/* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */
486int __ref add_memory(int nid, u64 start, u64 size) 512int __ref add_memory(int nid, u64 start, u64 size)
487{ 513{
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 08f40a2f3fe0..75751012c552 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -119,7 +119,22 @@ struct mempolicy default_policy = {
119 119
120static const struct mempolicy_operations { 120static const struct mempolicy_operations {
121 int (*create)(struct mempolicy *pol, const nodemask_t *nodes); 121 int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
122 void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes); 122 /*
123 * If read-side task has no lock to protect task->mempolicy, write-side
124 * task will rebind the task->mempolicy by two step. The first step is
125 * setting all the newly nodes, and the second step is cleaning all the
126 * disallowed nodes. In this way, we can avoid finding no node to alloc
127 * page.
128 * If we have a lock to protect task->mempolicy in read-side, we do
129 * rebind directly.
130 *
131 * step:
132 * MPOL_REBIND_ONCE - do rebind work at once
133 * MPOL_REBIND_STEP1 - set all the newly nodes
134 * MPOL_REBIND_STEP2 - clean all the disallowed nodes
135 */
136 void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes,
137 enum mpol_rebind_step step);
123} mpol_ops[MPOL_MAX]; 138} mpol_ops[MPOL_MAX];
124 139
125/* Check that the nodemask contains at least one populated zone */ 140/* Check that the nodemask contains at least one populated zone */
@@ -127,9 +142,6 @@ static int is_valid_nodemask(const nodemask_t *nodemask)
127{ 142{
128 int nd, k; 143 int nd, k;
129 144
130 /* Check that there is something useful in this mask */
131 k = policy_zone;
132
133 for_each_node_mask(nd, *nodemask) { 145 for_each_node_mask(nd, *nodemask) {
134 struct zone *z; 146 struct zone *z;
135 147
@@ -145,7 +157,7 @@ static int is_valid_nodemask(const nodemask_t *nodemask)
145 157
146static inline int mpol_store_user_nodemask(const struct mempolicy *pol) 158static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
147{ 159{
148 return pol->flags & (MPOL_F_STATIC_NODES | MPOL_F_RELATIVE_NODES); 160 return pol->flags & MPOL_MODE_FLAGS;
149} 161}
150 162
151static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig, 163static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
@@ -277,12 +289,19 @@ void __mpol_put(struct mempolicy *p)
277 kmem_cache_free(policy_cache, p); 289 kmem_cache_free(policy_cache, p);
278} 290}
279 291
280static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes) 292static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes,
293 enum mpol_rebind_step step)
281{ 294{
282} 295}
283 296
284static void mpol_rebind_nodemask(struct mempolicy *pol, 297/*
285 const nodemask_t *nodes) 298 * step:
299 * MPOL_REBIND_ONCE - do rebind work at once
300 * MPOL_REBIND_STEP1 - set all the newly nodes
301 * MPOL_REBIND_STEP2 - clean all the disallowed nodes
302 */
303static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes,
304 enum mpol_rebind_step step)
286{ 305{
287 nodemask_t tmp; 306 nodemask_t tmp;
288 307
@@ -291,12 +310,31 @@ static void mpol_rebind_nodemask(struct mempolicy *pol,
291 else if (pol->flags & MPOL_F_RELATIVE_NODES) 310 else if (pol->flags & MPOL_F_RELATIVE_NODES)
292 mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes); 311 mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
293 else { 312 else {
294 nodes_remap(tmp, pol->v.nodes, pol->w.cpuset_mems_allowed, 313 /*
295 *nodes); 314 * if step == 1, we use ->w.cpuset_mems_allowed to cache the
296 pol->w.cpuset_mems_allowed = *nodes; 315 * result
316 */
317 if (step == MPOL_REBIND_ONCE || step == MPOL_REBIND_STEP1) {
318 nodes_remap(tmp, pol->v.nodes,
319 pol->w.cpuset_mems_allowed, *nodes);
320 pol->w.cpuset_mems_allowed = step ? tmp : *nodes;
321 } else if (step == MPOL_REBIND_STEP2) {
322 tmp = pol->w.cpuset_mems_allowed;
323 pol->w.cpuset_mems_allowed = *nodes;
324 } else
325 BUG();
297 } 326 }
298 327
299 pol->v.nodes = tmp; 328 if (nodes_empty(tmp))
329 tmp = *nodes;
330
331 if (step == MPOL_REBIND_STEP1)
332 nodes_or(pol->v.nodes, pol->v.nodes, tmp);
333 else if (step == MPOL_REBIND_ONCE || step == MPOL_REBIND_STEP2)
334 pol->v.nodes = tmp;
335 else
336 BUG();
337
300 if (!node_isset(current->il_next, tmp)) { 338 if (!node_isset(current->il_next, tmp)) {
301 current->il_next = next_node(current->il_next, tmp); 339 current->il_next = next_node(current->il_next, tmp);
302 if (current->il_next >= MAX_NUMNODES) 340 if (current->il_next >= MAX_NUMNODES)
@@ -307,7 +345,8 @@ static void mpol_rebind_nodemask(struct mempolicy *pol,
307} 345}
308 346
309static void mpol_rebind_preferred(struct mempolicy *pol, 347static void mpol_rebind_preferred(struct mempolicy *pol,
310 const nodemask_t *nodes) 348 const nodemask_t *nodes,
349 enum mpol_rebind_step step)
311{ 350{
312 nodemask_t tmp; 351 nodemask_t tmp;
313 352
@@ -330,16 +369,45 @@ static void mpol_rebind_preferred(struct mempolicy *pol,
330 } 369 }
331} 370}
332 371
333/* Migrate a policy to a different set of nodes */ 372/*
334static void mpol_rebind_policy(struct mempolicy *pol, 373 * mpol_rebind_policy - Migrate a policy to a different set of nodes
335 const nodemask_t *newmask) 374 *
375 * If read-side task has no lock to protect task->mempolicy, write-side
376 * task will rebind the task->mempolicy by two step. The first step is
377 * setting all the newly nodes, and the second step is cleaning all the
378 * disallowed nodes. In this way, we can avoid finding no node to alloc
379 * page.
380 * If we have a lock to protect task->mempolicy in read-side, we do
381 * rebind directly.
382 *
383 * step:
384 * MPOL_REBIND_ONCE - do rebind work at once
385 * MPOL_REBIND_STEP1 - set all the newly nodes
386 * MPOL_REBIND_STEP2 - clean all the disallowed nodes
387 */
388static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask,
389 enum mpol_rebind_step step)
336{ 390{
337 if (!pol) 391 if (!pol)
338 return; 392 return;
339 if (!mpol_store_user_nodemask(pol) && 393 if (!mpol_store_user_nodemask(pol) && step == 0 &&
340 nodes_equal(pol->w.cpuset_mems_allowed, *newmask)) 394 nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
341 return; 395 return;
342 mpol_ops[pol->mode].rebind(pol, newmask); 396
397 if (step == MPOL_REBIND_STEP1 && (pol->flags & MPOL_F_REBINDING))
398 return;
399
400 if (step == MPOL_REBIND_STEP2 && !(pol->flags & MPOL_F_REBINDING))
401 BUG();
402
403 if (step == MPOL_REBIND_STEP1)
404 pol->flags |= MPOL_F_REBINDING;
405 else if (step == MPOL_REBIND_STEP2)
406 pol->flags &= ~MPOL_F_REBINDING;
407 else if (step >= MPOL_REBIND_NSTEP)
408 BUG();
409
410 mpol_ops[pol->mode].rebind(pol, newmask, step);
343} 411}
344 412
345/* 413/*
@@ -349,9 +417,10 @@ static void mpol_rebind_policy(struct mempolicy *pol,
349 * Called with task's alloc_lock held. 417 * Called with task's alloc_lock held.
350 */ 418 */
351 419
352void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new) 420void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new,
421 enum mpol_rebind_step step)
353{ 422{
354 mpol_rebind_policy(tsk->mempolicy, new); 423 mpol_rebind_policy(tsk->mempolicy, new, step);
355} 424}
356 425
357/* 426/*
@@ -366,7 +435,7 @@ void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
366 435
367 down_write(&mm->mmap_sem); 436 down_write(&mm->mmap_sem);
368 for (vma = mm->mmap; vma; vma = vma->vm_next) 437 for (vma = mm->mmap; vma; vma = vma->vm_next)
369 mpol_rebind_policy(vma->vm_policy, new); 438 mpol_rebind_policy(vma->vm_policy, new, MPOL_REBIND_ONCE);
370 up_write(&mm->mmap_sem); 439 up_write(&mm->mmap_sem);
371} 440}
372 441
@@ -859,7 +928,7 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest,
859 nodes_clear(nmask); 928 nodes_clear(nmask);
860 node_set(source, nmask); 929 node_set(source, nmask);
861 930
862 check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nmask, 931 check_range(mm, mm->mmap->vm_start, mm->task_size, &nmask,
863 flags | MPOL_MF_DISCONTIG_OK, &pagelist); 932 flags | MPOL_MF_DISCONTIG_OK, &pagelist);
864 933
865 if (!list_empty(&pagelist)) 934 if (!list_empty(&pagelist))
@@ -1444,15 +1513,13 @@ static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy)
1444 /* 1513 /*
1445 * Normally, MPOL_BIND allocations are node-local within the 1514 * Normally, MPOL_BIND allocations are node-local within the
1446 * allowed nodemask. However, if __GFP_THISNODE is set and the 1515 * allowed nodemask. However, if __GFP_THISNODE is set and the
1447 * current node is part of the mask, we use the zonelist for 1516 * current node isn't part of the mask, we use the zonelist for
1448 * the first node in the mask instead. 1517 * the first node in the mask instead.
1449 */ 1518 */
1450 if (unlikely(gfp & __GFP_THISNODE) && 1519 if (unlikely(gfp & __GFP_THISNODE) &&
1451 unlikely(!node_isset(nd, policy->v.nodes))) 1520 unlikely(!node_isset(nd, policy->v.nodes)))
1452 nd = first_node(policy->v.nodes); 1521 nd = first_node(policy->v.nodes);
1453 break; 1522 break;
1454 case MPOL_INTERLEAVE: /* should not happen */
1455 break;
1456 default: 1523 default:
1457 BUG(); 1524 BUG();
1458 } 1525 }
@@ -1572,6 +1639,8 @@ static inline unsigned interleave_nid(struct mempolicy *pol,
1572 * to the struct mempolicy for conditional unref after allocation. 1639 * to the struct mempolicy for conditional unref after allocation.
1573 * If the effective policy is 'BIND, returns a pointer to the mempolicy's 1640 * If the effective policy is 'BIND, returns a pointer to the mempolicy's
1574 * @nodemask for filtering the zonelist. 1641 * @nodemask for filtering the zonelist.
1642 *
1643 * Must be protected by get_mems_allowed()
1575 */ 1644 */
1576struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr, 1645struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
1577 gfp_t gfp_flags, struct mempolicy **mpol, 1646 gfp_t gfp_flags, struct mempolicy **mpol,
@@ -1617,6 +1686,7 @@ bool init_nodemask_of_mempolicy(nodemask_t *mask)
1617 if (!(mask && current->mempolicy)) 1686 if (!(mask && current->mempolicy))
1618 return false; 1687 return false;
1619 1688
1689 task_lock(current);
1620 mempolicy = current->mempolicy; 1690 mempolicy = current->mempolicy;
1621 switch (mempolicy->mode) { 1691 switch (mempolicy->mode) {
1622 case MPOL_PREFERRED: 1692 case MPOL_PREFERRED:
@@ -1636,6 +1706,7 @@ bool init_nodemask_of_mempolicy(nodemask_t *mask)
1636 default: 1706 default:
1637 BUG(); 1707 BUG();
1638 } 1708 }
1709 task_unlock(current);
1639 1710
1640 return true; 1711 return true;
1641} 1712}
@@ -1683,13 +1754,17 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
1683{ 1754{
1684 struct mempolicy *pol = get_vma_policy(current, vma, addr); 1755 struct mempolicy *pol = get_vma_policy(current, vma, addr);
1685 struct zonelist *zl; 1756 struct zonelist *zl;
1757 struct page *page;
1686 1758
1759 get_mems_allowed();
1687 if (unlikely(pol->mode == MPOL_INTERLEAVE)) { 1760 if (unlikely(pol->mode == MPOL_INTERLEAVE)) {
1688 unsigned nid; 1761 unsigned nid;
1689 1762
1690 nid = interleave_nid(pol, vma, addr, PAGE_SHIFT); 1763 nid = interleave_nid(pol, vma, addr, PAGE_SHIFT);
1691 mpol_cond_put(pol); 1764 mpol_cond_put(pol);
1692 return alloc_page_interleave(gfp, 0, nid); 1765 page = alloc_page_interleave(gfp, 0, nid);
1766 put_mems_allowed();
1767 return page;
1693 } 1768 }
1694 zl = policy_zonelist(gfp, pol); 1769 zl = policy_zonelist(gfp, pol);
1695 if (unlikely(mpol_needs_cond_ref(pol))) { 1770 if (unlikely(mpol_needs_cond_ref(pol))) {
@@ -1699,12 +1774,15 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
1699 struct page *page = __alloc_pages_nodemask(gfp, 0, 1774 struct page *page = __alloc_pages_nodemask(gfp, 0,
1700 zl, policy_nodemask(gfp, pol)); 1775 zl, policy_nodemask(gfp, pol));
1701 __mpol_put(pol); 1776 __mpol_put(pol);
1777 put_mems_allowed();
1702 return page; 1778 return page;
1703 } 1779 }
1704 /* 1780 /*
1705 * fast path: default or task policy 1781 * fast path: default or task policy
1706 */ 1782 */
1707 return __alloc_pages_nodemask(gfp, 0, zl, policy_nodemask(gfp, pol)); 1783 page = __alloc_pages_nodemask(gfp, 0, zl, policy_nodemask(gfp, pol));
1784 put_mems_allowed();
1785 return page;
1708} 1786}
1709 1787
1710/** 1788/**
@@ -1729,18 +1807,23 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
1729struct page *alloc_pages_current(gfp_t gfp, unsigned order) 1807struct page *alloc_pages_current(gfp_t gfp, unsigned order)
1730{ 1808{
1731 struct mempolicy *pol = current->mempolicy; 1809 struct mempolicy *pol = current->mempolicy;
1810 struct page *page;
1732 1811
1733 if (!pol || in_interrupt() || (gfp & __GFP_THISNODE)) 1812 if (!pol || in_interrupt() || (gfp & __GFP_THISNODE))
1734 pol = &default_policy; 1813 pol = &default_policy;
1735 1814
1815 get_mems_allowed();
1736 /* 1816 /*
1737 * No reference counting needed for current->mempolicy 1817 * No reference counting needed for current->mempolicy
1738 * nor system default_policy 1818 * nor system default_policy
1739 */ 1819 */
1740 if (pol->mode == MPOL_INTERLEAVE) 1820 if (pol->mode == MPOL_INTERLEAVE)
1741 return alloc_page_interleave(gfp, order, interleave_nodes(pol)); 1821 page = alloc_page_interleave(gfp, order, interleave_nodes(pol));
1742 return __alloc_pages_nodemask(gfp, order, 1822 else
1823 page = __alloc_pages_nodemask(gfp, order,
1743 policy_zonelist(gfp, pol), policy_nodemask(gfp, pol)); 1824 policy_zonelist(gfp, pol), policy_nodemask(gfp, pol));
1825 put_mems_allowed();
1826 return page;
1744} 1827}
1745EXPORT_SYMBOL(alloc_pages_current); 1828EXPORT_SYMBOL(alloc_pages_current);
1746 1829
@@ -1750,6 +1833,9 @@ EXPORT_SYMBOL(alloc_pages_current);
1750 * with the mems_allowed returned by cpuset_mems_allowed(). This 1833 * with the mems_allowed returned by cpuset_mems_allowed(). This
1751 * keeps mempolicies cpuset relative after its cpuset moves. See 1834 * keeps mempolicies cpuset relative after its cpuset moves. See
1752 * further kernel/cpuset.c update_nodemask(). 1835 * further kernel/cpuset.c update_nodemask().
1836 *
1837 * current's mempolicy may be rebinded by the other task(the task that changes
1838 * cpuset's mems), so we needn't do rebind work for current task.
1753 */ 1839 */
1754 1840
1755/* Slow path of a mempolicy duplicate */ 1841/* Slow path of a mempolicy duplicate */
@@ -1759,13 +1845,24 @@ struct mempolicy *__mpol_dup(struct mempolicy *old)
1759 1845
1760 if (!new) 1846 if (!new)
1761 return ERR_PTR(-ENOMEM); 1847 return ERR_PTR(-ENOMEM);
1848
1849 /* task's mempolicy is protected by alloc_lock */
1850 if (old == current->mempolicy) {
1851 task_lock(current);
1852 *new = *old;
1853 task_unlock(current);
1854 } else
1855 *new = *old;
1856
1762 rcu_read_lock(); 1857 rcu_read_lock();
1763 if (current_cpuset_is_being_rebound()) { 1858 if (current_cpuset_is_being_rebound()) {
1764 nodemask_t mems = cpuset_mems_allowed(current); 1859 nodemask_t mems = cpuset_mems_allowed(current);
1765 mpol_rebind_policy(old, &mems); 1860 if (new->flags & MPOL_F_REBINDING)
1861 mpol_rebind_policy(new, &mems, MPOL_REBIND_STEP2);
1862 else
1863 mpol_rebind_policy(new, &mems, MPOL_REBIND_ONCE);
1766 } 1864 }
1767 rcu_read_unlock(); 1865 rcu_read_unlock();
1768 *new = *old;
1769 atomic_set(&new->refcnt, 1); 1866 atomic_set(&new->refcnt, 1);
1770 return new; 1867 return new;
1771} 1868}
@@ -1792,16 +1889,6 @@ struct mempolicy *__mpol_cond_copy(struct mempolicy *tompol,
1792 return tompol; 1889 return tompol;
1793} 1890}
1794 1891
1795static int mpol_match_intent(const struct mempolicy *a,
1796 const struct mempolicy *b)
1797{
1798 if (a->flags != b->flags)
1799 return 0;
1800 if (!mpol_store_user_nodemask(a))
1801 return 1;
1802 return nodes_equal(a->w.user_nodemask, b->w.user_nodemask);
1803}
1804
1805/* Slow path of a mempolicy comparison */ 1892/* Slow path of a mempolicy comparison */
1806int __mpol_equal(struct mempolicy *a, struct mempolicy *b) 1893int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
1807{ 1894{
@@ -1809,8 +1896,12 @@ int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
1809 return 0; 1896 return 0;
1810 if (a->mode != b->mode) 1897 if (a->mode != b->mode)
1811 return 0; 1898 return 0;
1812 if (a->mode != MPOL_DEFAULT && !mpol_match_intent(a, b)) 1899 if (a->flags != b->flags)
1813 return 0; 1900 return 0;
1901 if (mpol_store_user_nodemask(a))
1902 if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask))
1903 return 0;
1904
1814 switch (a->mode) { 1905 switch (a->mode) {
1815 case MPOL_BIND: 1906 case MPOL_BIND:
1816 /* Fall through */ 1907 /* Fall through */
@@ -2006,26 +2097,22 @@ void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
2006 return; 2097 return;
2007 /* contextualize the tmpfs mount point mempolicy */ 2098 /* contextualize the tmpfs mount point mempolicy */
2008 new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask); 2099 new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
2009 if (IS_ERR(new)) { 2100 if (IS_ERR(new))
2010 mpol_put(mpol); /* drop our ref on sb mpol */ 2101 goto put_free; /* no valid nodemask intersection */
2011 NODEMASK_SCRATCH_FREE(scratch);
2012 return; /* no valid nodemask intersection */
2013 }
2014 2102
2015 task_lock(current); 2103 task_lock(current);
2016 ret = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch); 2104 ret = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch);
2017 task_unlock(current); 2105 task_unlock(current);
2018 mpol_put(mpol); /* drop our ref on sb mpol */ 2106 mpol_put(mpol); /* drop our ref on sb mpol */
2019 if (ret) { 2107 if (ret)
2020 NODEMASK_SCRATCH_FREE(scratch); 2108 goto put_free;
2021 mpol_put(new);
2022 return;
2023 }
2024 2109
2025 /* Create pseudo-vma that contains just the policy */ 2110 /* Create pseudo-vma that contains just the policy */
2026 memset(&pvma, 0, sizeof(struct vm_area_struct)); 2111 memset(&pvma, 0, sizeof(struct vm_area_struct));
2027 pvma.vm_end = TASK_SIZE; /* policy covers entire file */ 2112 pvma.vm_end = TASK_SIZE; /* policy covers entire file */
2028 mpol_set_shared_policy(sp, &pvma, new); /* adds ref */ 2113 mpol_set_shared_policy(sp, &pvma, new); /* adds ref */
2114
2115put_free:
2029 mpol_put(new); /* drop initial ref */ 2116 mpol_put(new); /* drop initial ref */
2030 NODEMASK_SCRATCH_FREE(scratch); 2117 NODEMASK_SCRATCH_FREE(scratch);
2031 } 2118 }
@@ -2132,9 +2219,15 @@ void numa_default_policy(void)
2132 * "local" is pseudo-policy: MPOL_PREFERRED with MPOL_F_LOCAL flag 2219 * "local" is pseudo-policy: MPOL_PREFERRED with MPOL_F_LOCAL flag
2133 * Used only for mpol_parse_str() and mpol_to_str() 2220 * Used only for mpol_parse_str() and mpol_to_str()
2134 */ 2221 */
2135#define MPOL_LOCAL (MPOL_INTERLEAVE + 1) 2222#define MPOL_LOCAL MPOL_MAX
2136static const char * const policy_types[] = 2223static const char * const policy_modes[] =
2137 { "default", "prefer", "bind", "interleave", "local" }; 2224{
2225 [MPOL_DEFAULT] = "default",
2226 [MPOL_PREFERRED] = "prefer",
2227 [MPOL_BIND] = "bind",
2228 [MPOL_INTERLEAVE] = "interleave",
2229 [MPOL_LOCAL] = "local"
2230};
2138 2231
2139 2232
2140#ifdef CONFIG_TMPFS 2233#ifdef CONFIG_TMPFS
@@ -2159,12 +2252,11 @@ static const char * const policy_types[] =
2159int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context) 2252int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
2160{ 2253{
2161 struct mempolicy *new = NULL; 2254 struct mempolicy *new = NULL;
2162 unsigned short uninitialized_var(mode); 2255 unsigned short mode;
2163 unsigned short uninitialized_var(mode_flags); 2256 unsigned short uninitialized_var(mode_flags);
2164 nodemask_t nodes; 2257 nodemask_t nodes;
2165 char *nodelist = strchr(str, ':'); 2258 char *nodelist = strchr(str, ':');
2166 char *flags = strchr(str, '='); 2259 char *flags = strchr(str, '=');
2167 int i;
2168 int err = 1; 2260 int err = 1;
2169 2261
2170 if (nodelist) { 2262 if (nodelist) {
@@ -2180,13 +2272,12 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
2180 if (flags) 2272 if (flags)
2181 *flags++ = '\0'; /* terminate mode string */ 2273 *flags++ = '\0'; /* terminate mode string */
2182 2274
2183 for (i = 0; i <= MPOL_LOCAL; i++) { 2275 for (mode = 0; mode <= MPOL_LOCAL; mode++) {
2184 if (!strcmp(str, policy_types[i])) { 2276 if (!strcmp(str, policy_modes[mode])) {
2185 mode = i;
2186 break; 2277 break;
2187 } 2278 }
2188 } 2279 }
2189 if (i > MPOL_LOCAL) 2280 if (mode > MPOL_LOCAL)
2190 goto out; 2281 goto out;
2191 2282
2192 switch (mode) { 2283 switch (mode) {
@@ -2250,7 +2341,10 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
2250 if (IS_ERR(new)) 2341 if (IS_ERR(new))
2251 goto out; 2342 goto out;
2252 2343
2253 { 2344 if (no_context) {
2345 /* save for contextualization */
2346 new->w.user_nodemask = nodes;
2347 } else {
2254 int ret; 2348 int ret;
2255 NODEMASK_SCRATCH(scratch); 2349 NODEMASK_SCRATCH(scratch);
2256 if (scratch) { 2350 if (scratch) {
@@ -2266,10 +2360,6 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
2266 } 2360 }
2267 } 2361 }
2268 err = 0; 2362 err = 0;
2269 if (no_context) {
2270 /* save for contextualization */
2271 new->w.user_nodemask = nodes;
2272 }
2273 2363
2274out: 2364out:
2275 /* Restore string for error message */ 2365 /* Restore string for error message */
@@ -2338,11 +2428,11 @@ int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol, int no_context)
2338 BUG(); 2428 BUG();
2339 } 2429 }
2340 2430
2341 l = strlen(policy_types[mode]); 2431 l = strlen(policy_modes[mode]);
2342 if (buffer + maxlen < p + l + 1) 2432 if (buffer + maxlen < p + l + 1)
2343 return -ENOSPC; 2433 return -ENOSPC;
2344 2434
2345 strcpy(p, policy_types[mode]); 2435 strcpy(p, policy_modes[mode]);
2346 p += l; 2436 p += l;
2347 2437
2348 if (flags & MPOL_MODE_FLAGS) { 2438 if (flags & MPOL_MODE_FLAGS) {
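
Aside (not part of the patch): the two-step rebind described in the mpol_rebind_* comments above exists so that a lockless reader never observes an empty nodemask: step 1 only adds the newly allowed nodes, step 2 then drops the disallowed ones. A stand-alone sketch of the idea using plain bitmasks (illustrative names only, not kernel code):

#include <stdio.h>

/* Illustrative only: node masks as plain bitmasks, not the kernel's nodemask_t. */
static unsigned long mask = 0x3;	/* currently allowed: nodes 0 and 1 */

static void rebind_step1(unsigned long newmask)
{
	mask |= newmask;	/* STEP1: set the newly allowed nodes */
}

static void rebind_step2(unsigned long newmask)
{
	mask = newmask;		/* STEP2: clear the now-disallowed nodes */
}

int main(void)
{
	unsigned long newmask = 0xc;	/* moving to nodes 2 and 3 */

	rebind_step1(newmask);	/* a lockless reader sees 0xf: old OR new, never empty */
	rebind_step2(newmask);	/* the reader now sees 0xc: only the new nodes */
	printf("final mask: %#lx\n", mask);
	return 0;
}
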
diff --git a/mm/migrate.c b/mm/migrate.c
index d3f3f7f81075..09e2471afa0f 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -40,7 +40,8 @@
40 40
41/* 41/*
42 * migrate_prep() needs to be called before we start compiling a list of pages 42 * migrate_prep() needs to be called before we start compiling a list of pages
43 * to be migrated using isolate_lru_page(). 43 * to be migrated using isolate_lru_page(). If scheduling work on other CPUs is
44 * undesirable, use migrate_prep_local()
44 */ 45 */
45int migrate_prep(void) 46int migrate_prep(void)
46{ 47{
@@ -55,26 +56,29 @@ int migrate_prep(void)
55 return 0; 56 return 0;
56} 57}
57 58
59/* Do the necessary work of migrate_prep but not if it involves other CPUs */
60int migrate_prep_local(void)
61{
62 lru_add_drain();
63
64 return 0;
65}
66
58/* 67/*
59 * Add isolated pages on the list back to the LRU under page lock 68 * Add isolated pages on the list back to the LRU under page lock
60 * to avoid leaking evictable pages back onto unevictable list. 69 * to avoid leaking evictable pages back onto unevictable list.
61 *
62 * returns the number of pages put back.
63 */ 70 */
64int putback_lru_pages(struct list_head *l) 71void putback_lru_pages(struct list_head *l)
65{ 72{
66 struct page *page; 73 struct page *page;
67 struct page *page2; 74 struct page *page2;
68 int count = 0;
69 75
70 list_for_each_entry_safe(page, page2, l, lru) { 76 list_for_each_entry_safe(page, page2, l, lru) {
71 list_del(&page->lru); 77 list_del(&page->lru);
72 dec_zone_page_state(page, NR_ISOLATED_ANON + 78 dec_zone_page_state(page, NR_ISOLATED_ANON +
73 page_is_file_cache(page)); 79 page_is_file_cache(page));
74 putback_lru_page(page); 80 putback_lru_page(page);
75 count++;
76 } 81 }
77 return count;
78} 82}
79 83
80/* 84/*
@@ -490,7 +494,8 @@ static int fallback_migrate_page(struct address_space *mapping,
490 * < 0 - error code 494 * < 0 - error code
491 * == 0 - success 495 * == 0 - success
492 */ 496 */
493static int move_to_new_page(struct page *newpage, struct page *page) 497static int move_to_new_page(struct page *newpage, struct page *page,
498 int remap_swapcache)
494{ 499{
495 struct address_space *mapping; 500 struct address_space *mapping;
496 int rc; 501 int rc;
@@ -525,10 +530,12 @@ static int move_to_new_page(struct page *newpage, struct page *page)
525 else 530 else
526 rc = fallback_migrate_page(mapping, newpage, page); 531 rc = fallback_migrate_page(mapping, newpage, page);
527 532
528 if (!rc) 533 if (rc) {
529 remove_migration_ptes(page, newpage);
530 else
531 newpage->mapping = NULL; 534 newpage->mapping = NULL;
535 } else {
536 if (remap_swapcache)
537 remove_migration_ptes(page, newpage);
538 }
532 539
533 unlock_page(newpage); 540 unlock_page(newpage);
534 541
@@ -545,9 +552,11 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
545 int rc = 0; 552 int rc = 0;
546 int *result = NULL; 553 int *result = NULL;
547 struct page *newpage = get_new_page(page, private, &result); 554 struct page *newpage = get_new_page(page, private, &result);
555 int remap_swapcache = 1;
548 int rcu_locked = 0; 556 int rcu_locked = 0;
549 int charge = 0; 557 int charge = 0;
550 struct mem_cgroup *mem = NULL; 558 struct mem_cgroup *mem = NULL;
559 struct anon_vma *anon_vma = NULL;
551 560
552 if (!newpage) 561 if (!newpage)
553 return -ENOMEM; 562 return -ENOMEM;
@@ -604,6 +613,34 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
604 if (PageAnon(page)) { 613 if (PageAnon(page)) {
605 rcu_read_lock(); 614 rcu_read_lock();
606 rcu_locked = 1; 615 rcu_locked = 1;
616
617 /* Determine how to safely use anon_vma */
618 if (!page_mapped(page)) {
619 if (!PageSwapCache(page))
620 goto rcu_unlock;
621
622 /*
623 * We cannot be sure that the anon_vma of an unmapped
624 * swapcache page is safe to use because we don't
625 * know in advance if the VMA that this page belonged
626 * to still exists. If the VMA and others sharing the
627 * data have been freed, then the anon_vma could
628 * already be invalid.
629 *
630 * To avoid this possibility, swapcache pages get
631 * migrated but are not remapped when migration
632 * completes
633 */
634 remap_swapcache = 0;
635 } else {
636 /*
637 * Take a reference count on the anon_vma if the
638 * page is mapped so that it is guaranteed to
639 * exist when the page is remapped later
640 */
641 anon_vma = page_anon_vma(page);
642 atomic_inc(&anon_vma->external_refcount);
643 }
607 } 644 }
608 645
609 /* 646 /*
@@ -638,11 +675,20 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
638 675
639skip_unmap: 676skip_unmap:
640 if (!page_mapped(page)) 677 if (!page_mapped(page))
641 rc = move_to_new_page(newpage, page); 678 rc = move_to_new_page(newpage, page, remap_swapcache);
642 679
643 if (rc) 680 if (rc && remap_swapcache)
644 remove_migration_ptes(page, page); 681 remove_migration_ptes(page, page);
645rcu_unlock: 682rcu_unlock:
683
684 /* Drop an anon_vma reference if we took one */
685 if (anon_vma && atomic_dec_and_lock(&anon_vma->external_refcount, &anon_vma->lock)) {
686 int empty = list_empty(&anon_vma->head);
687 spin_unlock(&anon_vma->lock);
688 if (empty)
689 anon_vma_free(anon_vma);
690 }
691
646 if (rcu_locked) 692 if (rcu_locked)
647 rcu_read_unlock(); 693 rcu_read_unlock();
648uncharge: 694uncharge:
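The anon_vma handling above pins the structure with an external reference while the page is unmapped, and the last dropper re-checks the VMA list under the lock before freeing. A minimal user-space sketch of that take/drop-and-free pattern follows, using C11 atomics and a pthread mutex as stand-ins for the kernel's atomic_t and spinlock; struct anon_vma_model and its field names are hypothetical, and the real atomic_dec_and_lock() performs the decrement-to-zero test and lock acquisition atomically rather than in two steps.

#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

/* Hypothetical stand-in for struct anon_vma: a lock, an "is the VMA
 * list empty" flag and the external reference count migration takes. */
struct anon_vma_model {
        pthread_mutex_t lock;
        atomic_int external_refcount;
        bool list_empty;
};

/* Pin the structure before unmapping the page. */
static void get_external_ref(struct anon_vma_model *av)
{
        atomic_fetch_add(&av->external_refcount, 1);
}

/*
 * Drop the reference.  The kernel's atomic_dec_and_lock() does the
 * decrement and the lock acquisition atomically; this sketch separates
 * the two steps for simplicity.
 */
static void put_external_ref(struct anon_vma_model *av)
{
        bool empty;

        if (atomic_fetch_sub(&av->external_refcount, 1) != 1)
                return;                 /* not the last reference */

        pthread_mutex_lock(&av->lock);
        empty = av->list_empty;
        pthread_mutex_unlock(&av->lock);

        if (empty) {                    /* anon_vma_free() equivalent */
                pthread_mutex_destroy(&av->lock);
                free(av);
        }
}

int main(void)
{
        struct anon_vma_model *av = calloc(1, sizeof(*av));

        pthread_mutex_init(&av->lock, NULL);
        av->list_empty = true;

        get_external_ref(av);   /* page is mapped: pin the anon_vma   */
        printf("refcount while unmapped: %d\n",
               atomic_load(&av->external_refcount));
        put_external_ref(av);   /* remap finished: drop, maybe free   */
        return 0;
}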
diff --git a/mm/mincore.c b/mm/mincore.c
index f77433c20279..9ac42dc6d7b6 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -19,6 +19,40 @@
19#include <asm/uaccess.h> 19#include <asm/uaccess.h>
20#include <asm/pgtable.h> 20#include <asm/pgtable.h>
21 21
22static void mincore_hugetlb_page_range(struct vm_area_struct *vma,
23 unsigned long addr, unsigned long end,
24 unsigned char *vec)
25{
26#ifdef CONFIG_HUGETLB_PAGE
27 struct hstate *h;
28
29 h = hstate_vma(vma);
30 while (1) {
31 unsigned char present;
32 pte_t *ptep;
33 /*
34 * Huge pages are always in RAM for now, but
35 * theoretically it needs to be checked.
36 */
37 ptep = huge_pte_offset(current->mm,
38 addr & huge_page_mask(h));
39 present = ptep && !huge_pte_none(huge_ptep_get(ptep));
40 while (1) {
41 *vec = present;
42 vec++;
43 addr += PAGE_SIZE;
44 if (addr == end)
45 return;
46 /* check hugepage border */
47 if (!(addr & ~huge_page_mask(h)))
48 break;
49 }
50 }
51#else
52 BUG();
53#endif
54}
55
22/* 56/*
23 * Later we can get more picky about what "in core" means precisely. 57 * Later we can get more picky about what "in core" means precisely.
24 * For now, simply check to see if the page is in the page cache, 58 * For now, simply check to see if the page is in the page cache,
@@ -49,145 +83,150 @@ static unsigned char mincore_page(struct address_space *mapping, pgoff_t pgoff)
49 return present; 83 return present;
50} 84}
51 85
52/* 86static void mincore_unmapped_range(struct vm_area_struct *vma,
53 * Do a chunk of "sys_mincore()". We've already checked 87 unsigned long addr, unsigned long end,
54 * all the arguments, we hold the mmap semaphore: we should 88 unsigned char *vec)
55 * just return the amount of info we're asked for.
56 */
57static long do_mincore(unsigned long addr, unsigned char *vec, unsigned long pages)
58{ 89{
59 pgd_t *pgd; 90 unsigned long nr = (end - addr) >> PAGE_SHIFT;
60 pud_t *pud;
61 pmd_t *pmd;
62 pte_t *ptep;
63 spinlock_t *ptl;
64 unsigned long nr;
65 int i; 91 int i;
66 pgoff_t pgoff;
67 struct vm_area_struct *vma = find_vma(current->mm, addr);
68 92
69 /* 93 if (vma->vm_file) {
70 * find_vma() didn't find anything above us, or we're 94 pgoff_t pgoff;
71 * in an unmapped hole in the address space: ENOMEM.
72 */
73 if (!vma || addr < vma->vm_start)
74 return -ENOMEM;
75
76#ifdef CONFIG_HUGETLB_PAGE
77 if (is_vm_hugetlb_page(vma)) {
78 struct hstate *h;
79 unsigned long nr_huge;
80 unsigned char present;
81 95
82 i = 0; 96 pgoff = linear_page_index(vma, addr);
83 nr = min(pages, (vma->vm_end - addr) >> PAGE_SHIFT); 97 for (i = 0; i < nr; i++, pgoff++)
84 h = hstate_vma(vma); 98 vec[i] = mincore_page(vma->vm_file->f_mapping, pgoff);
85 nr_huge = ((addr + pages * PAGE_SIZE - 1) >> huge_page_shift(h)) 99 } else {
86 - (addr >> huge_page_shift(h)) + 1; 100 for (i = 0; i < nr; i++)
87 nr_huge = min(nr_huge, 101 vec[i] = 0;
88 (vma->vm_end - addr) >> huge_page_shift(h));
89 while (1) {
90 /* hugepage always in RAM for now,
91 * but generally it needs to be check */
92 ptep = huge_pte_offset(current->mm,
93 addr & huge_page_mask(h));
94 present = !!(ptep &&
95 !huge_pte_none(huge_ptep_get(ptep)));
96 while (1) {
97 vec[i++] = present;
98 addr += PAGE_SIZE;
99 /* reach buffer limit */
100 if (i == nr)
101 return nr;
102 /* check hugepage border */
103 if (!((addr & ~huge_page_mask(h))
104 >> PAGE_SHIFT))
105 break;
106 }
107 }
108 return nr;
109 } 102 }
110#endif 103}
111
112 /*
113 * Calculate how many pages there are left in the last level of the
114 * PTE array for our address.
115 */
116 nr = PTRS_PER_PTE - ((addr >> PAGE_SHIFT) & (PTRS_PER_PTE-1));
117
118 /*
119 * Don't overrun this vma
120 */
121 nr = min(nr, (vma->vm_end - addr) >> PAGE_SHIFT);
122
123 /*
124 * Don't return more than the caller asked for
125 */
126 nr = min(nr, pages);
127 104
128 pgd = pgd_offset(vma->vm_mm, addr); 105static void mincore_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
129 if (pgd_none_or_clear_bad(pgd)) 106 unsigned long addr, unsigned long end,
130 goto none_mapped; 107 unsigned char *vec)
131 pud = pud_offset(pgd, addr); 108{
132 if (pud_none_or_clear_bad(pud)) 109 unsigned long next;
133 goto none_mapped; 110 spinlock_t *ptl;
134 pmd = pmd_offset(pud, addr); 111 pte_t *ptep;
135 if (pmd_none_or_clear_bad(pmd))
136 goto none_mapped;
137 112
138 ptep = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); 113 ptep = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
139 for (i = 0; i < nr; i++, ptep++, addr += PAGE_SIZE) { 114 do {
140 unsigned char present;
141 pte_t pte = *ptep; 115 pte_t pte = *ptep;
116 pgoff_t pgoff;
142 117
143 if (pte_present(pte)) { 118 next = addr + PAGE_SIZE;
144 present = 1; 119 if (pte_none(pte))
145 120 mincore_unmapped_range(vma, addr, next, vec);
146 } else if (pte_none(pte)) { 121 else if (pte_present(pte))
147 if (vma->vm_file) { 122 *vec = 1;
148 pgoff = linear_page_index(vma, addr); 123 else if (pte_file(pte)) {
149 present = mincore_page(vma->vm_file->f_mapping,
150 pgoff);
151 } else
152 present = 0;
153
154 } else if (pte_file(pte)) {
155 pgoff = pte_to_pgoff(pte); 124 pgoff = pte_to_pgoff(pte);
156 present = mincore_page(vma->vm_file->f_mapping, pgoff); 125 *vec = mincore_page(vma->vm_file->f_mapping, pgoff);
157
158 } else { /* pte is a swap entry */ 126 } else { /* pte is a swap entry */
159 swp_entry_t entry = pte_to_swp_entry(pte); 127 swp_entry_t entry = pte_to_swp_entry(pte);
128
160 if (is_migration_entry(entry)) { 129 if (is_migration_entry(entry)) {
161 /* migration entries are always uptodate */ 130 /* migration entries are always uptodate */
162 present = 1; 131 *vec = 1;
163 } else { 132 } else {
164#ifdef CONFIG_SWAP 133#ifdef CONFIG_SWAP
165 pgoff = entry.val; 134 pgoff = entry.val;
166 present = mincore_page(&swapper_space, pgoff); 135 *vec = mincore_page(&swapper_space, pgoff);
167#else 136#else
168 WARN_ON(1); 137 WARN_ON(1);
169 present = 1; 138 *vec = 1;
170#endif 139#endif
171 } 140 }
172 } 141 }
142 vec++;
143 } while (ptep++, addr = next, addr != end);
144 pte_unmap_unlock(ptep - 1, ptl);
145}
173 146
174 vec[i] = present; 147static void mincore_pmd_range(struct vm_area_struct *vma, pud_t *pud,
175 } 148 unsigned long addr, unsigned long end,
176 pte_unmap_unlock(ptep-1, ptl); 149 unsigned char *vec)
150{
151 unsigned long next;
152 pmd_t *pmd;
177 153
178 return nr; 154 pmd = pmd_offset(pud, addr);
155 do {
156 next = pmd_addr_end(addr, end);
157 if (pmd_none_or_clear_bad(pmd))
158 mincore_unmapped_range(vma, addr, next, vec);
159 else
160 mincore_pte_range(vma, pmd, addr, next, vec);
161 vec += (next - addr) >> PAGE_SHIFT;
162 } while (pmd++, addr = next, addr != end);
163}
179 164
180none_mapped: 165static void mincore_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
181 if (vma->vm_file) { 166 unsigned long addr, unsigned long end,
182 pgoff = linear_page_index(vma, addr); 167 unsigned char *vec)
183 for (i = 0; i < nr; i++, pgoff++) 168{
184 vec[i] = mincore_page(vma->vm_file->f_mapping, pgoff); 169 unsigned long next;
185 } else { 170 pud_t *pud;
186 for (i = 0; i < nr; i++) 171
187 vec[i] = 0; 172 pud = pud_offset(pgd, addr);
173 do {
174 next = pud_addr_end(addr, end);
175 if (pud_none_or_clear_bad(pud))
176 mincore_unmapped_range(vma, addr, next, vec);
177 else
178 mincore_pmd_range(vma, pud, addr, next, vec);
179 vec += (next - addr) >> PAGE_SHIFT;
180 } while (pud++, addr = next, addr != end);
181}
182
183static void mincore_page_range(struct vm_area_struct *vma,
184 unsigned long addr, unsigned long end,
185 unsigned char *vec)
186{
187 unsigned long next;
188 pgd_t *pgd;
189
190 pgd = pgd_offset(vma->vm_mm, addr);
191 do {
192 next = pgd_addr_end(addr, end);
193 if (pgd_none_or_clear_bad(pgd))
194 mincore_unmapped_range(vma, addr, next, vec);
195 else
196 mincore_pud_range(vma, pgd, addr, next, vec);
197 vec += (next - addr) >> PAGE_SHIFT;
198 } while (pgd++, addr = next, addr != end);
199}
200
201/*
202 * Do a chunk of "sys_mincore()". We've already checked
203 * all the arguments, we hold the mmap semaphore: we should
204 * just return the amount of info we're asked for.
205 */
206static long do_mincore(unsigned long addr, unsigned long pages, unsigned char *vec)
207{
208 struct vm_area_struct *vma;
209 unsigned long end;
210
211 vma = find_vma(current->mm, addr);
212 if (!vma || addr < vma->vm_start)
213 return -ENOMEM;
214
215 end = min(vma->vm_end, addr + (pages << PAGE_SHIFT));
216
217 if (is_vm_hugetlb_page(vma)) {
218 mincore_hugetlb_page_range(vma, addr, end, vec);
219 return (end - addr) >> PAGE_SHIFT;
188 } 220 }
189 221
190 return nr; 222 end = pmd_addr_end(addr, end);
223
224 if (is_vm_hugetlb_page(vma))
225 mincore_hugetlb_page_range(vma, addr, end, vec);
226 else
227 mincore_page_range(vma, addr, end, vec);
228
229 return (end - addr) >> PAGE_SHIFT;
191} 230}
192 231
193/* 232/*
@@ -247,7 +286,7 @@ SYSCALL_DEFINE3(mincore, unsigned long, start, size_t, len,
247 * the temporary buffer size. 286 * the temporary buffer size.
248 */ 287 */
249 down_read(&current->mm->mmap_sem); 288 down_read(&current->mm->mmap_sem);
250 retval = do_mincore(start, tmp, min(pages, PAGE_SIZE)); 289 retval = do_mincore(start, min(pages, PAGE_SIZE), tmp);
251 up_read(&current->mm->mmap_sem); 290 up_read(&current->mm->mmap_sem);
252 291
253 if (retval <= 0) 292 if (retval <= 0)
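The rewritten do_mincore() walks the range one page-table level at a time, clamping each step with pgd/pud/pmd_addr_end() so an entire unmapped span can be filled in a single call to mincore_unmapped_range(). Below is a small user-space sketch of that chunked-walk pattern, under the assumption that a fixed CHUNK_SIZE can play the role of a pmd span; chunk_addr_end(), fill_chunk() and the alternating "present" flag are purely illustrative.

#include <stdio.h>
#include <string.h>

#define PAGE_SIZE   4096UL
#define PAGE_SHIFT  12
#define CHUNK_SIZE  (PAGE_SIZE * 512)   /* plays the role of a pmd span */

/* Clamp the end of the current chunk, like pmd_addr_end() does. */
static unsigned long chunk_addr_end(unsigned long addr, unsigned long end)
{
        unsigned long boundary = (addr + CHUNK_SIZE) & ~(CHUNK_SIZE - 1);

        return boundary < end ? boundary : end;
}

/* Pretend every other chunk is "mapped"; a real walker would inspect
 * the page-table entry covering this chunk here. */
static void fill_chunk(unsigned long addr, unsigned long next,
                       unsigned char *vec, int present)
{
        memset(vec, present, (next - addr) >> PAGE_SHIFT);
}

static void walk_range(unsigned long addr, unsigned long end,
                       unsigned char *vec)
{
        unsigned long next;
        int chunk = 0;

        do {
                next = chunk_addr_end(addr, end);
                fill_chunk(addr, next, vec, chunk++ & 1);
                vec += (next - addr) >> PAGE_SHIFT;     /* advance output */
        } while (addr = next, addr != end);
}

int main(void)
{
        unsigned char vec[2048];

        walk_range(0, sizeof(vec) * PAGE_SIZE, vec);
        printf("first=%d last=%d\n", vec[0], vec[sizeof(vec) - 1]);
        return 0;
}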
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index a6326c71b663..08b349931ebc 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -49,6 +49,7 @@
49#include <linux/debugobjects.h> 49#include <linux/debugobjects.h>
50#include <linux/kmemleak.h> 50#include <linux/kmemleak.h>
51#include <linux/memory.h> 51#include <linux/memory.h>
52#include <linux/compaction.h>
52#include <trace/events/kmem.h> 53#include <trace/events/kmem.h>
53#include <linux/ftrace_event.h> 54#include <linux/ftrace_event.h>
54 55
@@ -475,6 +476,8 @@ static inline void __free_one_page(struct page *page,
475 int migratetype) 476 int migratetype)
476{ 477{
477 unsigned long page_idx; 478 unsigned long page_idx;
479 unsigned long combined_idx;
480 struct page *buddy;
478 481
479 if (unlikely(PageCompound(page))) 482 if (unlikely(PageCompound(page)))
480 if (unlikely(destroy_compound_page(page, order))) 483 if (unlikely(destroy_compound_page(page, order)))
@@ -488,9 +491,6 @@ static inline void __free_one_page(struct page *page,
488 VM_BUG_ON(bad_range(zone, page)); 491 VM_BUG_ON(bad_range(zone, page));
489 492
490 while (order < MAX_ORDER-1) { 493 while (order < MAX_ORDER-1) {
491 unsigned long combined_idx;
492 struct page *buddy;
493
494 buddy = __page_find_buddy(page, page_idx, order); 494 buddy = __page_find_buddy(page, page_idx, order);
495 if (!page_is_buddy(page, buddy, order)) 495 if (!page_is_buddy(page, buddy, order))
496 break; 496 break;
@@ -505,8 +505,29 @@ static inline void __free_one_page(struct page *page,
505 order++; 505 order++;
506 } 506 }
507 set_page_order(page, order); 507 set_page_order(page, order);
508 list_add(&page->lru, 508
509 &zone->free_area[order].free_list[migratetype]); 509 /*
510 * If this is not the largest possible page, check if the buddy
511 * of the next-highest order is free. If it is, it's possible
 512 * that pages are being freed that will coalesce soon. In case
513 * that is happening, add the free page to the tail of the list
514 * so it's less likely to be used soon and more likely to be merged
515 * as a higher order page
516 */
517 if ((order < MAX_ORDER-1) && pfn_valid_within(page_to_pfn(buddy))) {
518 struct page *higher_page, *higher_buddy;
519 combined_idx = __find_combined_index(page_idx, order);
520 higher_page = page + combined_idx - page_idx;
521 higher_buddy = __page_find_buddy(higher_page, combined_idx, order + 1);
522 if (page_is_buddy(higher_page, higher_buddy, order + 1)) {
523 list_add_tail(&page->lru,
524 &zone->free_area[order].free_list[migratetype]);
525 goto out;
526 }
527 }
528
529 list_add(&page->lru, &zone->free_area[order].free_list[migratetype]);
530out:
510 zone->free_area[order].nr_free++; 531 zone->free_area[order].nr_free++;
511} 532}
512 533
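The new tail-placement check only needs the buddy index arithmetic: the buddy of a block at page_idx and order sits at page_idx ^ (1 << order), and the merged block would start at page_idx & buddy_idx. The toy model below makes that decision against a made-up free bitmap; free_map[] and is_free() are illustrative stand-ins for the kernel's PageBuddy()/page_order() tests, not real interfaces.

#include <stdbool.h>
#include <stdio.h>

#define MAX_ORDER 11

/* Toy free map: free_map[order][idx >> order] is nonzero when the block
 * of 2^order pages starting at idx sits on that order's free list. */
static unsigned char free_map[MAX_ORDER][1 << MAX_ORDER];

static bool is_free(unsigned long idx, unsigned int order)
{
        return free_map[order][idx >> order];
}

/*
 * Decide whether a block being freed at (page_idx, order) should go to
 * the tail of its free list: if the buddy of the next-higher order is
 * already free, this block is likely to merge soon, so keep it cold.
 */
static bool add_to_tail(unsigned long page_idx, unsigned int order)
{
        unsigned long buddy_idx, combined_idx, higher_buddy;

        if (order >= MAX_ORDER - 1)
                return false;

        buddy_idx = page_idx ^ (1UL << order);
        combined_idx = page_idx & buddy_idx;    /* start of merged block */
        higher_buddy = combined_idx ^ (1UL << (order + 1));

        return is_free(higher_buddy, order + 1);
}

int main(void)
{
        /* Mark the order-1 block at page index 2 free: freeing page 0
         * at order 0 would merge up to order 1, whose buddy (index 2)
         * is free, so the page should be queued at the tail. */
        free_map[1][2 >> 1] = 1;
        printf("tail? %d\n", add_to_tail(0, 0));
        return 0;
}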
@@ -599,20 +620,23 @@ static void free_one_page(struct zone *zone, struct page *page, int order,
599 spin_unlock(&zone->lock); 620 spin_unlock(&zone->lock);
600} 621}
601 622
602static void __free_pages_ok(struct page *page, unsigned int order) 623static bool free_pages_prepare(struct page *page, unsigned int order)
603{ 624{
604 unsigned long flags;
605 int i; 625 int i;
606 int bad = 0; 626 int bad = 0;
607 int wasMlocked = __TestClearPageMlocked(page);
608 627
609 trace_mm_page_free_direct(page, order); 628 trace_mm_page_free_direct(page, order);
610 kmemcheck_free_shadow(page, order); 629 kmemcheck_free_shadow(page, order);
611 630
612 for (i = 0 ; i < (1 << order) ; ++i) 631 for (i = 0; i < (1 << order); i++) {
613 bad += free_pages_check(page + i); 632 struct page *pg = page + i;
633
634 if (PageAnon(pg))
635 pg->mapping = NULL;
636 bad += free_pages_check(pg);
637 }
614 if (bad) 638 if (bad)
615 return; 639 return false;
616 640
617 if (!PageHighMem(page)) { 641 if (!PageHighMem(page)) {
618 debug_check_no_locks_freed(page_address(page),PAGE_SIZE<<order); 642 debug_check_no_locks_freed(page_address(page),PAGE_SIZE<<order);
@@ -622,6 +646,17 @@ static void __free_pages_ok(struct page *page, unsigned int order)
622 arch_free_page(page, order); 646 arch_free_page(page, order);
623 kernel_map_pages(page, 1 << order, 0); 647 kernel_map_pages(page, 1 << order, 0);
624 648
649 return true;
650}
651
652static void __free_pages_ok(struct page *page, unsigned int order)
653{
654 unsigned long flags;
655 int wasMlocked = __TestClearPageMlocked(page);
656
657 if (!free_pages_prepare(page, order))
658 return;
659
625 local_irq_save(flags); 660 local_irq_save(flags);
626 if (unlikely(wasMlocked)) 661 if (unlikely(wasMlocked))
627 free_page_mlock(page); 662 free_page_mlock(page);
@@ -1107,21 +1142,9 @@ void free_hot_cold_page(struct page *page, int cold)
1107 int migratetype; 1142 int migratetype;
1108 int wasMlocked = __TestClearPageMlocked(page); 1143 int wasMlocked = __TestClearPageMlocked(page);
1109 1144
1110 trace_mm_page_free_direct(page, 0); 1145 if (!free_pages_prepare(page, 0))
1111 kmemcheck_free_shadow(page, 0);
1112
1113 if (PageAnon(page))
1114 page->mapping = NULL;
1115 if (free_pages_check(page))
1116 return; 1146 return;
1117 1147
1118 if (!PageHighMem(page)) {
1119 debug_check_no_locks_freed(page_address(page), PAGE_SIZE);
1120 debug_check_no_obj_freed(page_address(page), PAGE_SIZE);
1121 }
1122 arch_free_page(page, 0);
1123 kernel_map_pages(page, 1, 0);
1124
1125 migratetype = get_pageblock_migratetype(page); 1148 migratetype = get_pageblock_migratetype(page);
1126 set_page_private(page, migratetype); 1149 set_page_private(page, migratetype);
1127 local_irq_save(flags); 1150 local_irq_save(flags);
@@ -1188,6 +1211,51 @@ void split_page(struct page *page, unsigned int order)
1188} 1211}
1189 1212
1190/* 1213/*
1214 * Similar to split_page except the page is already free. As this is only
1215 * being used for migration, the migratetype of the block also changes.
1216 * As this is called with interrupts disabled, the caller is responsible
 1217 * for calling arch_alloc_page() and kernel_map_pages() after interrupts
1218 * are enabled.
1219 *
1220 * Note: this is probably too low level an operation for use in drivers.
1221 * Please consult with lkml before using this in your driver.
1222 */
1223int split_free_page(struct page *page)
1224{
1225 unsigned int order;
1226 unsigned long watermark;
1227 struct zone *zone;
1228
1229 BUG_ON(!PageBuddy(page));
1230
1231 zone = page_zone(page);
1232 order = page_order(page);
1233
1234 /* Obey watermarks as if the page was being allocated */
1235 watermark = low_wmark_pages(zone) + (1 << order);
1236 if (!zone_watermark_ok(zone, 0, watermark, 0, 0))
1237 return 0;
1238
1239 /* Remove page from free list */
1240 list_del(&page->lru);
1241 zone->free_area[order].nr_free--;
1242 rmv_page_order(page);
1243 __mod_zone_page_state(zone, NR_FREE_PAGES, -(1UL << order));
1244
1245 /* Split into individual pages */
1246 set_page_refcounted(page);
1247 split_page(page, order);
1248
1249 if (order >= pageblock_order - 1) {
1250 struct page *endpage = page + (1 << order) - 1;
1251 for (; page < endpage; page += pageblock_nr_pages)
1252 set_pageblock_migratetype(page, MIGRATE_MOVABLE);
1253 }
1254
1255 return 1 << order;
1256}
1257
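split_free_page() only takes the block if the zone can spare it: the threshold is the low watermark plus the size of the block being removed. A minimal sketch of that admission test follows; struct zone_model and its two fields are hypothetical, and the real zone_watermark_ok() does a more careful per-order accounting than this simple headroom check.

#include <stdbool.h>
#include <stdio.h>

/* Hypothetical zone summary: just the numbers the check needs. */
struct zone_model {
        unsigned long nr_free_pages;
        unsigned long low_wmark;
};

/*
 * Obey watermarks as if the block were being allocated: the zone must
 * still hold low_wmark pages after 2^order pages are removed.
 */
static bool may_take_free_block(const struct zone_model *z,
                                unsigned int order)
{
        unsigned long watermark = z->low_wmark + (1UL << order);

        return z->nr_free_pages >= watermark;
}

int main(void)
{
        struct zone_model z = { .nr_free_pages = 1024, .low_wmark = 1000 };

        printf("order 4: %d\n", may_take_free_block(&z, 4)); /* needs 1016 */
        printf("order 6: %d\n", may_take_free_block(&z, 6)); /* needs 1064 */
        return 0;
}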
1258/*
1191 * Really, prep_compound_page() should be called from __rmqueue_bulk(). But 1259 * Really, prep_compound_page() should be called from __rmqueue_bulk(). But
1192 * we cheat by calling it from here, in the order > 0 path. Saves a branch 1260 * we cheat by calling it from here, in the order > 0 path. Saves a branch
1193 * or two. 1261 * or two.
@@ -1693,6 +1761,62 @@ out:
1693 return page; 1761 return page;
1694} 1762}
1695 1763
1764#ifdef CONFIG_COMPACTION
1765/* Try memory compaction for high-order allocations before reclaim */
1766static struct page *
1767__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
1768 struct zonelist *zonelist, enum zone_type high_zoneidx,
1769 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
1770 int migratetype, unsigned long *did_some_progress)
1771{
1772 struct page *page;
1773
1774 if (!order || compaction_deferred(preferred_zone))
1775 return NULL;
1776
1777 *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask,
1778 nodemask);
1779 if (*did_some_progress != COMPACT_SKIPPED) {
1780
1781 /* Page migration frees to the PCP lists but we want merging */
1782 drain_pages(get_cpu());
1783 put_cpu();
1784
1785 page = get_page_from_freelist(gfp_mask, nodemask,
1786 order, zonelist, high_zoneidx,
1787 alloc_flags, preferred_zone,
1788 migratetype);
1789 if (page) {
1790 preferred_zone->compact_considered = 0;
1791 preferred_zone->compact_defer_shift = 0;
1792 count_vm_event(COMPACTSUCCESS);
1793 return page;
1794 }
1795
1796 /*
1797 * It's bad if compaction run occurs and fails.
1798 * The most likely reason is that pages exist,
1799 * but not enough to satisfy watermarks.
1800 */
1801 count_vm_event(COMPACTFAIL);
1802 defer_compaction(preferred_zone);
1803
1804 cond_resched();
1805 }
1806
1807 return NULL;
1808}
1809#else
1810static inline struct page *
1811__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
1812 struct zonelist *zonelist, enum zone_type high_zoneidx,
1813 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
1814 int migratetype, unsigned long *did_some_progress)
1815{
1816 return NULL;
1817}
1818#endif /* CONFIG_COMPACTION */
1819
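The call sites above hint at how deferred compaction works: a failed run calls defer_compaction(), a successful allocation resets compact_considered and compact_defer_shift, and compaction_deferred() skips new attempts until roughly 1 << compact_defer_shift allocations have gone by. The user-space model below follows that reading; the COMPACT_MAX_DEFER_SHIFT cap and the exact helper bodies are assumptions, since the real helpers live in the compaction headers and may differ in detail.

#include <stdbool.h>
#include <stdio.h>

#define COMPACT_MAX_DEFER_SHIFT 6       /* assumed cap on the window */

struct zone_compact_state {
        unsigned int compact_considered;
        unsigned int compact_defer_shift;
};

/* Called after a compaction run that still failed to produce the page. */
static void defer_compaction(struct zone_compact_state *z)
{
        z->compact_considered = 0;
        if (z->compact_defer_shift < COMPACT_MAX_DEFER_SHIFT)
                z->compact_defer_shift++;
}

/* Returns true while we are still inside the back-off window. */
static bool compaction_deferred(struct zone_compact_state *z)
{
        unsigned int defer_limit = 1U << z->compact_defer_shift;

        if (++z->compact_considered > defer_limit)
                z->compact_considered = defer_limit;

        return z->compact_considered < defer_limit;
}

int main(void)
{
        struct zone_compact_state z = { 0, 0 };
        int attempt, skipped = 0;

        defer_compaction(&z);                 /* first failure: shift -> 1 */
        for (attempt = 0; attempt < 8; attempt++)
                if (compaction_deferred(&z))
                        skipped++;

        printf("skipped %d of 8 attempts\n", skipped);
        return 0;
}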
1696/* The really slow allocator path where we enter direct reclaim */ 1820/* The really slow allocator path where we enter direct reclaim */
1697static inline struct page * 1821static inline struct page *
1698__alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, 1822__alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
@@ -1879,6 +2003,15 @@ rebalance:
1879 if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL)) 2003 if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL))
1880 goto nopage; 2004 goto nopage;
1881 2005
2006 /* Try direct compaction */
2007 page = __alloc_pages_direct_compact(gfp_mask, order,
2008 zonelist, high_zoneidx,
2009 nodemask,
2010 alloc_flags, preferred_zone,
2011 migratetype, &did_some_progress);
2012 if (page)
2013 goto got_pg;
2014
1882 /* Try direct reclaim and then allocating */ 2015 /* Try direct reclaim and then allocating */
1883 page = __alloc_pages_direct_reclaim(gfp_mask, order, 2016 page = __alloc_pages_direct_reclaim(gfp_mask, order,
1884 zonelist, high_zoneidx, 2017 zonelist, high_zoneidx,
@@ -1970,10 +2103,13 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
1970 if (unlikely(!zonelist->_zonerefs->zone)) 2103 if (unlikely(!zonelist->_zonerefs->zone))
1971 return NULL; 2104 return NULL;
1972 2105
2106 get_mems_allowed();
1973 /* The preferred zone is used for statistics later */ 2107 /* The preferred zone is used for statistics later */
1974 first_zones_zonelist(zonelist, high_zoneidx, nodemask, &preferred_zone); 2108 first_zones_zonelist(zonelist, high_zoneidx, nodemask, &preferred_zone);
1975 if (!preferred_zone) 2109 if (!preferred_zone) {
2110 put_mems_allowed();
1976 return NULL; 2111 return NULL;
2112 }
1977 2113
1978 /* First allocation attempt */ 2114 /* First allocation attempt */
1979 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, 2115 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
@@ -1983,6 +2119,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
1983 page = __alloc_pages_slowpath(gfp_mask, order, 2119 page = __alloc_pages_slowpath(gfp_mask, order,
1984 zonelist, high_zoneidx, nodemask, 2120 zonelist, high_zoneidx, nodemask,
1985 preferred_zone, migratetype); 2121 preferred_zone, migratetype);
2122 put_mems_allowed();
1986 2123
1987 trace_mm_page_alloc(page, order, gfp_mask, migratetype); 2124 trace_mm_page_alloc(page, order, gfp_mask, migratetype);
1988 return page; 2125 return page;
@@ -2434,8 +2571,11 @@ int numa_zonelist_order_handler(ctl_table *table, int write,
2434 strncpy((char*)table->data, saved_string, 2571 strncpy((char*)table->data, saved_string,
2435 NUMA_ZONELIST_ORDER_LEN); 2572 NUMA_ZONELIST_ORDER_LEN);
2436 user_zonelist_order = oldval; 2573 user_zonelist_order = oldval;
2437 } else if (oldval != user_zonelist_order) 2574 } else if (oldval != user_zonelist_order) {
2438 build_all_zonelists(); 2575 mutex_lock(&zonelists_mutex);
2576 build_all_zonelists(NULL);
2577 mutex_unlock(&zonelists_mutex);
2578 }
2439 } 2579 }
2440out: 2580out:
2441 mutex_unlock(&zl_order_mutex); 2581 mutex_unlock(&zl_order_mutex);
@@ -2582,7 +2722,7 @@ static int default_zonelist_order(void)
2582 * ZONE_DMA and ZONE_DMA32 can be very small area in the system. 2722 * ZONE_DMA and ZONE_DMA32 can be very small area in the system.
2583 * If they are really small and used heavily, the system can fall 2723 * If they are really small and used heavily, the system can fall
2584 * into OOM very easily. 2724 * into OOM very easily.
2585 * This function detect ZONE_DMA/DMA32 size and confgigures zone order. 2725 * This function detect ZONE_DMA/DMA32 size and configures zone order.
2586 */ 2726 */
2587 /* Is there ZONE_NORMAL ? (ex. ppc has only DMA zone..) */ 2727 /* Is there ZONE_NORMAL ? (ex. ppc has only DMA zone..) */
2588 low_kmem_size = 0; 2728 low_kmem_size = 0;
@@ -2594,6 +2734,15 @@ static int default_zonelist_order(void)
2594 if (zone_type < ZONE_NORMAL) 2734 if (zone_type < ZONE_NORMAL)
2595 low_kmem_size += z->present_pages; 2735 low_kmem_size += z->present_pages;
2596 total_size += z->present_pages; 2736 total_size += z->present_pages;
2737 } else if (zone_type == ZONE_NORMAL) {
2738 /*
2739 * If any node has only lowmem, then node order
2740 * is preferred to allow kernel allocations
2741 * locally; otherwise, they can easily infringe
2742 * on other nodes when there is an abundance of
2743 * lowmem available to allocate from.
2744 */
2745 return ZONELIST_ORDER_NODE;
2597 } 2746 }
2598 } 2747 }
2599 } 2748 }
@@ -2776,9 +2925,16 @@ static void build_zonelist_cache(pg_data_t *pgdat)
2776 */ 2925 */
2777static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch); 2926static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch);
2778static DEFINE_PER_CPU(struct per_cpu_pageset, boot_pageset); 2927static DEFINE_PER_CPU(struct per_cpu_pageset, boot_pageset);
2928static void setup_zone_pageset(struct zone *zone);
2929
2930/*
2931 * Global mutex to protect against size modification of zonelists
2932 * as well as to serialize pageset setup for the new populated zone.
2933 */
2934DEFINE_MUTEX(zonelists_mutex);
2779 2935
2780/* return values int ....just for stop_machine() */ 2936/* return values int ....just for stop_machine() */
2781static int __build_all_zonelists(void *dummy) 2937static __init_refok int __build_all_zonelists(void *data)
2782{ 2938{
2783 int nid; 2939 int nid;
2784 int cpu; 2940 int cpu;
@@ -2793,6 +2949,14 @@ static int __build_all_zonelists(void *dummy)
2793 build_zonelist_cache(pgdat); 2949 build_zonelist_cache(pgdat);
2794 } 2950 }
2795 2951
2952#ifdef CONFIG_MEMORY_HOTPLUG
2953 /* Setup real pagesets for the new zone */
2954 if (data) {
2955 struct zone *zone = data;
2956 setup_zone_pageset(zone);
2957 }
2958#endif
2959
2796 /* 2960 /*
2797 * Initialize the boot_pagesets that are going to be used 2961 * Initialize the boot_pagesets that are going to be used
2798 * for bootstrapping processors. The real pagesets for 2962 * for bootstrapping processors. The real pagesets for
@@ -2812,7 +2976,11 @@ static int __build_all_zonelists(void *dummy)
2812 return 0; 2976 return 0;
2813} 2977}
2814 2978
2815void build_all_zonelists(void) 2979/*
 2980 * Always called with zonelists_mutex held,
2981 * unless system_state == SYSTEM_BOOTING.
2982 */
2983void build_all_zonelists(void *data)
2816{ 2984{
2817 set_zonelist_order(); 2985 set_zonelist_order();
2818 2986
@@ -2823,7 +2991,7 @@ void build_all_zonelists(void)
2823 } else { 2991 } else {
2824 /* we have to stop all cpus to guarantee there is no user 2992 /* we have to stop all cpus to guarantee there is no user
2825 of zonelist */ 2993 of zonelist */
2826 stop_machine(__build_all_zonelists, NULL, NULL); 2994 stop_machine(__build_all_zonelists, data, NULL);
2827 /* cpuset refresh routine should be here */ 2995 /* cpuset refresh routine should be here */
2828 } 2996 }
2829 vm_total_pages = nr_free_pagecache_pages(); 2997 vm_total_pages = nr_free_pagecache_pages();
@@ -3146,31 +3314,34 @@ static void setup_pagelist_highmark(struct per_cpu_pageset *p,
3146 pcp->batch = PAGE_SHIFT * 8; 3314 pcp->batch = PAGE_SHIFT * 8;
3147} 3315}
3148 3316
3317static __meminit void setup_zone_pageset(struct zone *zone)
3318{
3319 int cpu;
3320
3321 zone->pageset = alloc_percpu(struct per_cpu_pageset);
3322
3323 for_each_possible_cpu(cpu) {
3324 struct per_cpu_pageset *pcp = per_cpu_ptr(zone->pageset, cpu);
3325
3326 setup_pageset(pcp, zone_batchsize(zone));
3327
3328 if (percpu_pagelist_fraction)
3329 setup_pagelist_highmark(pcp,
3330 (zone->present_pages /
3331 percpu_pagelist_fraction));
3332 }
3333}
3334
3149/* 3335/*
3150 * Allocate per cpu pagesets and initialize them. 3336 * Allocate per cpu pagesets and initialize them.
3151 * Before this call only boot pagesets were available. 3337 * Before this call only boot pagesets were available.
3152 * Boot pagesets will no longer be used by this processorr
3153 * after setup_per_cpu_pageset().
3154 */ 3338 */
3155void __init setup_per_cpu_pageset(void) 3339void __init setup_per_cpu_pageset(void)
3156{ 3340{
3157 struct zone *zone; 3341 struct zone *zone;
3158 int cpu;
3159 3342
3160 for_each_populated_zone(zone) { 3343 for_each_populated_zone(zone)
3161 zone->pageset = alloc_percpu(struct per_cpu_pageset); 3344 setup_zone_pageset(zone);
3162
3163 for_each_possible_cpu(cpu) {
3164 struct per_cpu_pageset *pcp = per_cpu_ptr(zone->pageset, cpu);
3165
3166 setup_pageset(pcp, zone_batchsize(zone));
3167
3168 if (percpu_pagelist_fraction)
3169 setup_pagelist_highmark(pcp,
3170 (zone->present_pages /
3171 percpu_pagelist_fraction));
3172 }
3173 }
3174} 3345}
3175 3346
3176static noinline __init_refok 3347static noinline __init_refok
diff --git a/mm/readahead.c b/mm/readahead.c
index dfa9a1a03a11..77506a291a2d 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -523,7 +523,7 @@ EXPORT_SYMBOL_GPL(page_cache_sync_readahead);
523 * @req_size: hint: total size of the read which the caller is performing in 523 * @req_size: hint: total size of the read which the caller is performing in
524 * pagecache pages 524 * pagecache pages
525 * 525 *
526 * page_cache_async_ondemand() should be called when a page is used which 526 * page_cache_async_readahead() should be called when a page is used which
527 * has the PG_readahead flag; this is a marker to suggest that the application 527 * has the PG_readahead flag; this is a marker to suggest that the application
528 * has used up enough of the readahead window that we should start pulling in 528 * has used up enough of the readahead window that we should start pulling in
529 * more pages. 529 * more pages.
diff --git a/mm/rmap.c b/mm/rmap.c
index 0feeef860a8f..38a336e2eea1 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -250,7 +250,7 @@ static void anon_vma_unlink(struct anon_vma_chain *anon_vma_chain)
250 list_del(&anon_vma_chain->same_anon_vma); 250 list_del(&anon_vma_chain->same_anon_vma);
251 251
252 /* We must garbage collect the anon_vma if it's empty */ 252 /* We must garbage collect the anon_vma if it's empty */
253 empty = list_empty(&anon_vma->head) && !ksm_refcount(anon_vma); 253 empty = list_empty(&anon_vma->head) && !anonvma_external_refcount(anon_vma);
254 spin_unlock(&anon_vma->lock); 254 spin_unlock(&anon_vma->lock);
255 255
256 if (empty) 256 if (empty)
@@ -274,7 +274,7 @@ static void anon_vma_ctor(void *data)
274 struct anon_vma *anon_vma = data; 274 struct anon_vma *anon_vma = data;
275 275
276 spin_lock_init(&anon_vma->lock); 276 spin_lock_init(&anon_vma->lock);
277 ksm_refcount_init(anon_vma); 277 anonvma_external_refcount_init(anon_vma);
278 INIT_LIST_HEAD(&anon_vma->head); 278 INIT_LIST_HEAD(&anon_vma->head);
279} 279}
280 280
@@ -1131,6 +1131,20 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount,
1131 return ret; 1131 return ret;
1132} 1132}
1133 1133
1134static bool is_vma_temporary_stack(struct vm_area_struct *vma)
1135{
1136 int maybe_stack = vma->vm_flags & (VM_GROWSDOWN | VM_GROWSUP);
1137
1138 if (!maybe_stack)
1139 return false;
1140
1141 if ((vma->vm_flags & VM_STACK_INCOMPLETE_SETUP) ==
1142 VM_STACK_INCOMPLETE_SETUP)
1143 return true;
1144
1145 return false;
1146}
1147
1134/** 1148/**
1135 * try_to_unmap_anon - unmap or unlock anonymous page using the object-based 1149 * try_to_unmap_anon - unmap or unlock anonymous page using the object-based
1136 * rmap method 1150 * rmap method
@@ -1159,7 +1173,21 @@ static int try_to_unmap_anon(struct page *page, enum ttu_flags flags)
1159 1173
1160 list_for_each_entry(avc, &anon_vma->head, same_anon_vma) { 1174 list_for_each_entry(avc, &anon_vma->head, same_anon_vma) {
1161 struct vm_area_struct *vma = avc->vma; 1175 struct vm_area_struct *vma = avc->vma;
1162 unsigned long address = vma_address(page, vma); 1176 unsigned long address;
1177
1178 /*
 1179 * During exec, a temporary VMA is set up and later moved.
 1180 * The VMA is moved under the anon_vma lock but not the
 1181 * page tables, leading to a race where migration cannot
1182 * find the migration ptes. Rather than increasing the
1183 * locking requirements of exec(), migration skips
1184 * temporary VMAs until after exec() completes.
1185 */
1186 if (PAGE_MIGRATION && (flags & TTU_MIGRATION) &&
1187 is_vma_temporary_stack(vma))
1188 continue;
1189
1190 address = vma_address(page, vma);
1163 if (address == -EFAULT) 1191 if (address == -EFAULT)
1164 continue; 1192 continue;
1165 ret = try_to_unmap_one(page, vma, address, flags); 1193 ret = try_to_unmap_one(page, vma, address, flags);
@@ -1355,10 +1383,8 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *,
1355 /* 1383 /*
1356 * Note: remove_migration_ptes() cannot use page_lock_anon_vma() 1384 * Note: remove_migration_ptes() cannot use page_lock_anon_vma()
1357 * because that depends on page_mapped(); but not all its usages 1385 * because that depends on page_mapped(); but not all its usages
1358 * are holding mmap_sem, which also gave the necessary guarantee 1386 * are holding mmap_sem. Users without mmap_sem are required to
1359 * (that this anon_vma's slab has not already been destroyed). 1387 * take a reference count to prevent the anon_vma disappearing
1360 * This needs to be reviewed later: avoiding page_lock_anon_vma()
1361 * is risky, and currently limits the usefulness of rmap_walk().
1362 */ 1388 */
1363 anon_vma = page_anon_vma(page); 1389 anon_vma = page_anon_vma(page);
1364 if (!anon_vma) 1390 if (!anon_vma)
diff --git a/mm/shmem.c b/mm/shmem.c
index 0cd7f66f1c66..4ef9797bd430 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -433,8 +433,6 @@ static swp_entry_t *shmem_swp_alloc(struct shmem_inode_info *info, unsigned long
433 433
434 spin_unlock(&info->lock); 434 spin_unlock(&info->lock);
435 page = shmem_dir_alloc(mapping_gfp_mask(inode->i_mapping)); 435 page = shmem_dir_alloc(mapping_gfp_mask(inode->i_mapping));
436 if (page)
437 set_page_private(page, 0);
438 spin_lock(&info->lock); 436 spin_lock(&info->lock);
439 437
440 if (!page) { 438 if (!page) {
diff --git a/mm/slab.c b/mm/slab.c
index 50a73fca19c4..02786e1a32d2 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -3217,10 +3217,12 @@ static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags)
3217 if (in_interrupt() || (flags & __GFP_THISNODE)) 3217 if (in_interrupt() || (flags & __GFP_THISNODE))
3218 return NULL; 3218 return NULL;
3219 nid_alloc = nid_here = numa_node_id(); 3219 nid_alloc = nid_here = numa_node_id();
3220 get_mems_allowed();
3220 if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD)) 3221 if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD))
3221 nid_alloc = cpuset_mem_spread_node(); 3222 nid_alloc = cpuset_mem_spread_node();
3222 else if (current->mempolicy) 3223 else if (current->mempolicy)
3223 nid_alloc = slab_node(current->mempolicy); 3224 nid_alloc = slab_node(current->mempolicy);
3225 put_mems_allowed();
3224 if (nid_alloc != nid_here) 3226 if (nid_alloc != nid_here)
3225 return ____cache_alloc_node(cachep, flags, nid_alloc); 3227 return ____cache_alloc_node(cachep, flags, nid_alloc);
3226 return NULL; 3228 return NULL;
@@ -3247,6 +3249,7 @@ static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags)
3247 if (flags & __GFP_THISNODE) 3249 if (flags & __GFP_THISNODE)
3248 return NULL; 3250 return NULL;
3249 3251
3252 get_mems_allowed();
3250 zonelist = node_zonelist(slab_node(current->mempolicy), flags); 3253 zonelist = node_zonelist(slab_node(current->mempolicy), flags);
3251 local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK); 3254 local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK);
3252 3255
@@ -3302,6 +3305,7 @@ retry:
3302 } 3305 }
3303 } 3306 }
3304 } 3307 }
3308 put_mems_allowed();
3305 return obj; 3309 return obj;
3306} 3310}
3307 3311
diff --git a/mm/slub.c b/mm/slub.c
index e46e3129697d..26f0cb9cc584 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1360,6 +1360,7 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags)
1360 get_cycles() % 1024 > s->remote_node_defrag_ratio) 1360 get_cycles() % 1024 > s->remote_node_defrag_ratio)
1361 return NULL; 1361 return NULL;
1362 1362
1363 get_mems_allowed();
1363 zonelist = node_zonelist(slab_node(current->mempolicy), flags); 1364 zonelist = node_zonelist(slab_node(current->mempolicy), flags);
1364 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { 1365 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
1365 struct kmem_cache_node *n; 1366 struct kmem_cache_node *n;
@@ -1369,10 +1370,13 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags)
1369 if (n && cpuset_zone_allowed_hardwall(zone, flags) && 1370 if (n && cpuset_zone_allowed_hardwall(zone, flags) &&
1370 n->nr_partial > s->min_partial) { 1371 n->nr_partial > s->min_partial) {
1371 page = get_partial_node(n); 1372 page = get_partial_node(n);
1372 if (page) 1373 if (page) {
1374 put_mems_allowed();
1373 return page; 1375 return page;
1376 }
1374 } 1377 }
1375 } 1378 }
1379 put_mems_allowed();
1376#endif 1380#endif
1377 return NULL; 1381 return NULL;
1378} 1382}
diff --git a/mm/sparse.c b/mm/sparse.c
index dc0cc4d43ff3..95ac219af379 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -382,13 +382,15 @@ static void __init sparse_early_usemaps_alloc_node(unsigned long**usemap_map,
382struct page __init *sparse_mem_map_populate(unsigned long pnum, int nid) 382struct page __init *sparse_mem_map_populate(unsigned long pnum, int nid)
383{ 383{
384 struct page *map; 384 struct page *map;
385 unsigned long size;
385 386
386 map = alloc_remap(nid, sizeof(struct page) * PAGES_PER_SECTION); 387 map = alloc_remap(nid, sizeof(struct page) * PAGES_PER_SECTION);
387 if (map) 388 if (map)
388 return map; 389 return map;
389 390
390 map = alloc_bootmem_pages_node(NODE_DATA(nid), 391 size = PAGE_ALIGN(sizeof(struct page) * PAGES_PER_SECTION);
391 PAGE_ALIGN(sizeof(struct page) * PAGES_PER_SECTION)); 392 map = __alloc_bootmem_node_high(NODE_DATA(nid), size,
393 PAGE_SIZE, __pa(MAX_DMA_ADDRESS));
392 return map; 394 return map;
393} 395}
394void __init sparse_mem_maps_populate_node(struct page **map_map, 396void __init sparse_mem_maps_populate_node(struct page **map_map,
@@ -412,7 +414,8 @@ void __init sparse_mem_maps_populate_node(struct page **map_map,
412 } 414 }
413 415
414 size = PAGE_ALIGN(size); 416 size = PAGE_ALIGN(size);
415 map = alloc_bootmem_pages_node(NODE_DATA(nodeid), size * map_count); 417 map = __alloc_bootmem_node_high(NODE_DATA(nodeid), size * map_count,
418 PAGE_SIZE, __pa(MAX_DMA_ADDRESS));
416 if (map) { 419 if (map) {
417 for (pnum = pnum_begin; pnum < pnum_end; pnum++) { 420 for (pnum = pnum_begin; pnum < pnum_end; pnum++) {
418 if (!present_section_nr(pnum)) 421 if (!present_section_nr(pnum))
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 3ff3311447f5..915dceb487c1 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -73,10 +73,14 @@ struct scan_control {
73 73
74 int swappiness; 74 int swappiness;
75 75
76 int all_unreclaimable;
77
78 int order; 76 int order;
79 77
78 /*
 79 * Intend to reclaim enough contiguous memory rather than just enough
 80 * memory in total, i.e. it is the mode for high-order allocation.
81 */
82 bool lumpy_reclaim_mode;
83
80 /* Which cgroup do we reclaim from */ 84 /* Which cgroup do we reclaim from */
81 struct mem_cgroup *mem_cgroup; 85 struct mem_cgroup *mem_cgroup;
82 86
@@ -85,12 +89,6 @@ struct scan_control {
85 * are scanned. 89 * are scanned.
86 */ 90 */
87 nodemask_t *nodemask; 91 nodemask_t *nodemask;
88
89 /* Pluggable isolate pages callback */
90 unsigned long (*isolate_pages)(unsigned long nr, struct list_head *dst,
91 unsigned long *scanned, int order, int mode,
92 struct zone *z, struct mem_cgroup *mem_cont,
93 int active, int file);
94}; 92};
95 93
96#define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru)) 94#define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))
@@ -575,7 +573,7 @@ static enum page_references page_check_references(struct page *page,
575 referenced_page = TestClearPageReferenced(page); 573 referenced_page = TestClearPageReferenced(page);
576 574
577 /* Lumpy reclaim - ignore references */ 575 /* Lumpy reclaim - ignore references */
578 if (sc->order > PAGE_ALLOC_COSTLY_ORDER) 576 if (sc->lumpy_reclaim_mode)
579 return PAGEREF_RECLAIM; 577 return PAGEREF_RECLAIM;
580 578
581 /* 579 /*
@@ -839,11 +837,6 @@ keep:
839 return nr_reclaimed; 837 return nr_reclaimed;
840} 838}
841 839
842/* LRU Isolation modes. */
843#define ISOLATE_INACTIVE 0 /* Isolate inactive pages. */
844#define ISOLATE_ACTIVE 1 /* Isolate active pages. */
845#define ISOLATE_BOTH 2 /* Isolate both active and inactive pages. */
846
847/* 840/*
848 * Attempt to remove the specified page from its LRU. Only take this page 841 * Attempt to remove the specified page from its LRU. Only take this page
849 * if it is of the appropriate PageActive status. Pages which are being 842 * if it is of the appropriate PageActive status. Pages which are being
@@ -1011,7 +1004,6 @@ static unsigned long isolate_pages_global(unsigned long nr,
1011 struct list_head *dst, 1004 struct list_head *dst,
1012 unsigned long *scanned, int order, 1005 unsigned long *scanned, int order,
1013 int mode, struct zone *z, 1006 int mode, struct zone *z,
1014 struct mem_cgroup *mem_cont,
1015 int active, int file) 1007 int active, int file)
1016{ 1008{
1017 int lru = LRU_BASE; 1009 int lru = LRU_BASE;
@@ -1130,7 +1122,6 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
1130 unsigned long nr_scanned = 0; 1122 unsigned long nr_scanned = 0;
1131 unsigned long nr_reclaimed = 0; 1123 unsigned long nr_reclaimed = 0;
1132 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); 1124 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
1133 int lumpy_reclaim = 0;
1134 1125
1135 while (unlikely(too_many_isolated(zone, file, sc))) { 1126 while (unlikely(too_many_isolated(zone, file, sc))) {
1136 congestion_wait(BLK_RW_ASYNC, HZ/10); 1127 congestion_wait(BLK_RW_ASYNC, HZ/10);
@@ -1140,17 +1131,6 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
1140 return SWAP_CLUSTER_MAX; 1131 return SWAP_CLUSTER_MAX;
1141 } 1132 }
1142 1133
1143 /*
1144 * If we need a large contiguous chunk of memory, or have
1145 * trouble getting a small set of contiguous pages, we
1146 * will reclaim both active and inactive pages.
1147 *
1148 * We use the same threshold as pageout congestion_wait below.
1149 */
1150 if (sc->order > PAGE_ALLOC_COSTLY_ORDER)
1151 lumpy_reclaim = 1;
1152 else if (sc->order && priority < DEF_PRIORITY - 2)
1153 lumpy_reclaim = 1;
1154 1134
1155 pagevec_init(&pvec, 1); 1135 pagevec_init(&pvec, 1);
1156 1136
@@ -1163,15 +1143,15 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
1163 unsigned long nr_freed; 1143 unsigned long nr_freed;
1164 unsigned long nr_active; 1144 unsigned long nr_active;
1165 unsigned int count[NR_LRU_LISTS] = { 0, }; 1145 unsigned int count[NR_LRU_LISTS] = { 0, };
1166 int mode = lumpy_reclaim ? ISOLATE_BOTH : ISOLATE_INACTIVE; 1146 int mode = sc->lumpy_reclaim_mode ? ISOLATE_BOTH : ISOLATE_INACTIVE;
1167 unsigned long nr_anon; 1147 unsigned long nr_anon;
1168 unsigned long nr_file; 1148 unsigned long nr_file;
1169 1149
1170 nr_taken = sc->isolate_pages(SWAP_CLUSTER_MAX,
1171 &page_list, &nr_scan, sc->order, mode,
1172 zone, sc->mem_cgroup, 0, file);
1173
1174 if (scanning_global_lru(sc)) { 1150 if (scanning_global_lru(sc)) {
1151 nr_taken = isolate_pages_global(SWAP_CLUSTER_MAX,
1152 &page_list, &nr_scan,
1153 sc->order, mode,
1154 zone, 0, file);
1175 zone->pages_scanned += nr_scan; 1155 zone->pages_scanned += nr_scan;
1176 if (current_is_kswapd()) 1156 if (current_is_kswapd())
1177 __count_zone_vm_events(PGSCAN_KSWAPD, zone, 1157 __count_zone_vm_events(PGSCAN_KSWAPD, zone,
@@ -1179,6 +1159,16 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
1179 else 1159 else
1180 __count_zone_vm_events(PGSCAN_DIRECT, zone, 1160 __count_zone_vm_events(PGSCAN_DIRECT, zone,
1181 nr_scan); 1161 nr_scan);
1162 } else {
1163 nr_taken = mem_cgroup_isolate_pages(SWAP_CLUSTER_MAX,
1164 &page_list, &nr_scan,
1165 sc->order, mode,
1166 zone, sc->mem_cgroup,
1167 0, file);
1168 /*
1169 * mem_cgroup_isolate_pages() keeps track of
1170 * scanned pages on its own.
1171 */
1182 } 1172 }
1183 1173
1184 if (nr_taken == 0) 1174 if (nr_taken == 0)
@@ -1216,7 +1206,7 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
1216 * but that should be acceptable to the caller 1206 * but that should be acceptable to the caller
1217 */ 1207 */
1218 if (nr_freed < nr_taken && !current_is_kswapd() && 1208 if (nr_freed < nr_taken && !current_is_kswapd() &&
1219 lumpy_reclaim) { 1209 sc->lumpy_reclaim_mode) {
1220 congestion_wait(BLK_RW_ASYNC, HZ/10); 1210 congestion_wait(BLK_RW_ASYNC, HZ/10);
1221 1211
1222 /* 1212 /*
@@ -1356,16 +1346,23 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
1356 1346
1357 lru_add_drain(); 1347 lru_add_drain();
1358 spin_lock_irq(&zone->lru_lock); 1348 spin_lock_irq(&zone->lru_lock);
1359 nr_taken = sc->isolate_pages(nr_pages, &l_hold, &pgscanned, sc->order,
1360 ISOLATE_ACTIVE, zone,
1361 sc->mem_cgroup, 1, file);
1362 /*
1363 * zone->pages_scanned is used for detect zone's oom
1364 * mem_cgroup remembers nr_scan by itself.
1365 */
1366 if (scanning_global_lru(sc)) { 1349 if (scanning_global_lru(sc)) {
1350 nr_taken = isolate_pages_global(nr_pages, &l_hold,
1351 &pgscanned, sc->order,
1352 ISOLATE_ACTIVE, zone,
1353 1, file);
1367 zone->pages_scanned += pgscanned; 1354 zone->pages_scanned += pgscanned;
1355 } else {
1356 nr_taken = mem_cgroup_isolate_pages(nr_pages, &l_hold,
1357 &pgscanned, sc->order,
1358 ISOLATE_ACTIVE, zone,
1359 sc->mem_cgroup, 1, file);
1360 /*
1361 * mem_cgroup_isolate_pages() keeps track of
1362 * scanned pages on its own.
1363 */
1368 } 1364 }
1365
1369 reclaim_stat->recent_scanned[file] += nr_taken; 1366 reclaim_stat->recent_scanned[file] += nr_taken;
1370 1367
1371 __count_zone_vm_events(PGREFILL, zone, pgscanned); 1368 __count_zone_vm_events(PGREFILL, zone, pgscanned);
@@ -1519,21 +1516,52 @@ static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
1519} 1516}
1520 1517
1521/* 1518/*
1519 * Smallish @nr_to_scan's are deposited in @nr_saved_scan,
1520 * until we collected @swap_cluster_max pages to scan.
1521 */
1522static unsigned long nr_scan_try_batch(unsigned long nr_to_scan,
1523 unsigned long *nr_saved_scan)
1524{
1525 unsigned long nr;
1526
1527 *nr_saved_scan += nr_to_scan;
1528 nr = *nr_saved_scan;
1529
1530 if (nr >= SWAP_CLUSTER_MAX)
1531 *nr_saved_scan = 0;
1532 else
1533 nr = 0;
1534
1535 return nr;
1536}
1537
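nr_scan_try_batch() banks small scan requests in *nr_saved_scan and only releases them once at least SWAP_CLUSTER_MAX pages are pending. A user-space copy of the helper with a short driver shows that accumulate-then-flush behaviour; SWAP_CLUSTER_MAX is hard-coded to 32 here purely for the demo.

#include <stdio.h>

#define SWAP_CLUSTER_MAX 32UL   /* hard-coded for this demo */

/* Same logic as the helper above: bank small requests, pay out in
 * lumps of at least SWAP_CLUSTER_MAX pages. */
static unsigned long nr_scan_try_batch(unsigned long nr_to_scan,
                                       unsigned long *nr_saved_scan)
{
        unsigned long nr;

        *nr_saved_scan += nr_to_scan;
        nr = *nr_saved_scan;

        if (nr >= SWAP_CLUSTER_MAX)
                *nr_saved_scan = 0;
        else
                nr = 0;

        return nr;
}

int main(void)
{
        unsigned long saved = 0;
        int i;

        for (i = 0; i < 6; i++) {       /* six requests of 12 pages each */
                unsigned long got = nr_scan_try_batch(12, &saved);

                printf("request 12 -> scan %lu (saved %lu)\n", got, saved);
        }
        return 0;
}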
1538/*
1522 * Determine how aggressively the anon and file LRU lists should be 1539 * Determine how aggressively the anon and file LRU lists should be
1523 * scanned. The relative value of each set of LRU lists is determined 1540 * scanned. The relative value of each set of LRU lists is determined
1524 * by looking at the fraction of the pages scanned we did rotate back 1541 * by looking at the fraction of the pages scanned we did rotate back
1525 * onto the active list instead of evict. 1542 * onto the active list instead of evict.
1526 * 1543 *
1527 * percent[0] specifies how much pressure to put on ram/swap backed 1544 * nr[0] = anon pages to scan; nr[1] = file pages to scan
1528 * memory, while percent[1] determines pressure on the file LRUs.
1529 */ 1545 */
1530static void get_scan_ratio(struct zone *zone, struct scan_control *sc, 1546static void get_scan_count(struct zone *zone, struct scan_control *sc,
1531 unsigned long *percent) 1547 unsigned long *nr, int priority)
1532{ 1548{
1533 unsigned long anon, file, free; 1549 unsigned long anon, file, free;
1534 unsigned long anon_prio, file_prio; 1550 unsigned long anon_prio, file_prio;
1535 unsigned long ap, fp; 1551 unsigned long ap, fp;
1536 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); 1552 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
1553 u64 fraction[2], denominator;
1554 enum lru_list l;
1555 int noswap = 0;
1556
1557 /* If we have no swap space, do not bother scanning anon pages. */
1558 if (!sc->may_swap || (nr_swap_pages <= 0)) {
1559 noswap = 1;
1560 fraction[0] = 0;
1561 fraction[1] = 1;
1562 denominator = 1;
1563 goto out;
1564 }
1537 1565
1538 anon = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_ANON) + 1566 anon = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_ANON) +
1539 zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON); 1567 zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON);
@@ -1545,9 +1573,10 @@ static void get_scan_ratio(struct zone *zone, struct scan_control *sc,
1545 /* If we have very few page cache pages, 1573 /* If we have very few page cache pages,
1546 force-scan anon pages. */ 1574 force-scan anon pages. */
1547 if (unlikely(file + free <= high_wmark_pages(zone))) { 1575 if (unlikely(file + free <= high_wmark_pages(zone))) {
1548 percent[0] = 100; 1576 fraction[0] = 1;
1549 percent[1] = 0; 1577 fraction[1] = 0;
1550 return; 1578 denominator = 1;
1579 goto out;
1551 } 1580 }
1552 } 1581 }
1553 1582
@@ -1594,29 +1623,37 @@ static void get_scan_ratio(struct zone *zone, struct scan_control *sc,
1594 fp = (file_prio + 1) * (reclaim_stat->recent_scanned[1] + 1); 1623 fp = (file_prio + 1) * (reclaim_stat->recent_scanned[1] + 1);
1595 fp /= reclaim_stat->recent_rotated[1] + 1; 1624 fp /= reclaim_stat->recent_rotated[1] + 1;
1596 1625
1597 /* Normalize to percentages */ 1626 fraction[0] = ap;
1598 percent[0] = 100 * ap / (ap + fp + 1); 1627 fraction[1] = fp;
1599 percent[1] = 100 - percent[0]; 1628 denominator = ap + fp + 1;
1629out:
1630 for_each_evictable_lru(l) {
1631 int file = is_file_lru(l);
1632 unsigned long scan;
1633
1634 scan = zone_nr_lru_pages(zone, sc, l);
1635 if (priority || noswap) {
1636 scan >>= priority;
1637 scan = div64_u64(scan * fraction[file], denominator);
1638 }
1639 nr[l] = nr_scan_try_batch(scan,
1640 &reclaim_stat->nr_saved_scan[l]);
1641 }
1600} 1642}
1601 1643
1602/* 1644static void set_lumpy_reclaim_mode(int priority, struct scan_control *sc)
1603 * Smallish @nr_to_scan's are deposited in @nr_saved_scan,
1604 * until we collected @swap_cluster_max pages to scan.
1605 */
1606static unsigned long nr_scan_try_batch(unsigned long nr_to_scan,
1607 unsigned long *nr_saved_scan)
1608{ 1645{
1609 unsigned long nr; 1646 /*
1610 1647 * If we need a large contiguous chunk of memory, or have
1611 *nr_saved_scan += nr_to_scan; 1648 * trouble getting a small set of contiguous pages, we
1612 nr = *nr_saved_scan; 1649 * will reclaim both active and inactive pages.
1613 1650 */
1614 if (nr >= SWAP_CLUSTER_MAX) 1651 if (sc->order > PAGE_ALLOC_COSTLY_ORDER)
1615 *nr_saved_scan = 0; 1652 sc->lumpy_reclaim_mode = 1;
1653 else if (sc->order && priority < DEF_PRIORITY - 2)
1654 sc->lumpy_reclaim_mode = 1;
1616 else 1655 else
1617 nr = 0; 1656 sc->lumpy_reclaim_mode = 0;
1618
1619 return nr;
1620} 1657}
1621 1658
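get_scan_count() now keeps the anon/file balance as a 64-bit fraction[] over a common denominator and scales each list's scan target with div64_u64(), instead of rounding to whole percentages first. The sketch below models only that arithmetic: the swappiness weighting is left out, the sample recent_scanned/recent_rotated numbers are invented, and plain 64-bit division stands in for div64_u64().

#include <stdint.h>
#include <stdio.h>

/*
 * Weight two LRU lists (anon = 0, file = 1) by how profitably each has
 * been scanned lately, then scale each list's scan target by
 * fraction[file] / denominator.
 */
static void scan_targets(const uint64_t scanned[2],
                         const uint64_t rotated[2],
                         const unsigned long lru_pages[2],
                         int priority, unsigned long nr[2])
{
        uint64_t fraction[2], denominator;
        int file;

        /* More rotation back to the active list => less worth scanning. */
        for (file = 0; file < 2; file++)
                fraction[file] = (scanned[file] + 1) / (rotated[file] + 1);
        denominator = fraction[0] + fraction[1] + 1;

        for (file = 0; file < 2; file++) {
                unsigned long scan = lru_pages[file] >> priority;

                nr[file] = (unsigned long)(scan * fraction[file] /
                                           denominator);
        }
}

int main(void)
{
        uint64_t scanned[2] = { 1000, 4000 };   /* invented sample numbers */
        uint64_t rotated[2] = { 900, 400 };
        unsigned long lru_pages[2] = { 1 << 20, 1 << 20 };
        unsigned long nr[2];

        scan_targets(scanned, rotated, lru_pages, 12 /* priority */, nr);
        printf("scan anon=%lu file=%lu pages\n", nr[0], nr[1]);
        return 0;
}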
1622/* 1659/*
@@ -1627,33 +1664,13 @@ static void shrink_zone(int priority, struct zone *zone,
1627{ 1664{
1628 unsigned long nr[NR_LRU_LISTS]; 1665 unsigned long nr[NR_LRU_LISTS];
1629 unsigned long nr_to_scan; 1666 unsigned long nr_to_scan;
1630 unsigned long percent[2]; /* anon @ 0; file @ 1 */
1631 enum lru_list l; 1667 enum lru_list l;
1632 unsigned long nr_reclaimed = sc->nr_reclaimed; 1668 unsigned long nr_reclaimed = sc->nr_reclaimed;
1633 unsigned long nr_to_reclaim = sc->nr_to_reclaim; 1669 unsigned long nr_to_reclaim = sc->nr_to_reclaim;
1634 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
1635 int noswap = 0;
1636
1637 /* If we have no swap space, do not bother scanning anon pages. */
1638 if (!sc->may_swap || (nr_swap_pages <= 0)) {
1639 noswap = 1;
1640 percent[0] = 0;
1641 percent[1] = 100;
1642 } else
1643 get_scan_ratio(zone, sc, percent);
1644 1670
1645 for_each_evictable_lru(l) { 1671 get_scan_count(zone, sc, nr, priority);
1646 int file = is_file_lru(l);
1647 unsigned long scan;
1648 1672
1649 scan = zone_nr_lru_pages(zone, sc, l); 1673 set_lumpy_reclaim_mode(priority, sc);
1650 if (priority || noswap) {
1651 scan >>= priority;
1652 scan = (scan * percent[file]) / 100;
1653 }
1654 nr[l] = nr_scan_try_batch(scan,
1655 &reclaim_stat->nr_saved_scan[l]);
1656 }
1657 1674
1658 while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || 1675 while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
1659 nr[LRU_INACTIVE_FILE]) { 1676 nr[LRU_INACTIVE_FILE]) {
@@ -1707,14 +1724,14 @@ static void shrink_zone(int priority, struct zone *zone,
1707 * If a zone is deemed to be full of pinned pages then just give it a light 1724 * If a zone is deemed to be full of pinned pages then just give it a light
1708 * scan then give up on it. 1725 * scan then give up on it.
1709 */ 1726 */
1710static void shrink_zones(int priority, struct zonelist *zonelist, 1727static int shrink_zones(int priority, struct zonelist *zonelist,
1711 struct scan_control *sc) 1728 struct scan_control *sc)
1712{ 1729{
1713 enum zone_type high_zoneidx = gfp_zone(sc->gfp_mask); 1730 enum zone_type high_zoneidx = gfp_zone(sc->gfp_mask);
1714 struct zoneref *z; 1731 struct zoneref *z;
1715 struct zone *zone; 1732 struct zone *zone;
1733 int progress = 0;
1716 1734
1717 sc->all_unreclaimable = 1;
1718 for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx, 1735 for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx,
1719 sc->nodemask) { 1736 sc->nodemask) {
1720 if (!populated_zone(zone)) 1737 if (!populated_zone(zone))
@@ -1730,19 +1747,19 @@ static void shrink_zones(int priority, struct zonelist *zonelist,
1730 1747
1731 if (zone->all_unreclaimable && priority != DEF_PRIORITY) 1748 if (zone->all_unreclaimable && priority != DEF_PRIORITY)
1732 continue; /* Let kswapd poll it */ 1749 continue; /* Let kswapd poll it */
1733 sc->all_unreclaimable = 0;
1734 } else { 1750 } else {
1735 /* 1751 /*
1736 * Ignore cpuset limitation here. We just want to reduce 1752 * Ignore cpuset limitation here. We just want to reduce
1737 * # of used pages by us regardless of memory shortage. 1753 * # of used pages by us regardless of memory shortage.
1738 */ 1754 */
1739 sc->all_unreclaimable = 0;
1740 mem_cgroup_note_reclaim_priority(sc->mem_cgroup, 1755 mem_cgroup_note_reclaim_priority(sc->mem_cgroup,
1741 priority); 1756 priority);
1742 } 1757 }
1743 1758
1744 shrink_zone(priority, zone, sc); 1759 shrink_zone(priority, zone, sc);
1760 progress = 1;
1745 } 1761 }
1762 return progress;
1746} 1763}
1747 1764
1748/* 1765/*
@@ -1774,6 +1791,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
1774 enum zone_type high_zoneidx = gfp_zone(sc->gfp_mask); 1791 enum zone_type high_zoneidx = gfp_zone(sc->gfp_mask);
1775 unsigned long writeback_threshold; 1792 unsigned long writeback_threshold;
1776 1793
1794 get_mems_allowed();
1777 delayacct_freepages_start(); 1795 delayacct_freepages_start();
1778 1796
1779 if (scanning_global_lru(sc)) 1797 if (scanning_global_lru(sc))
@@ -1795,7 +1813,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
1795 sc->nr_scanned = 0; 1813 sc->nr_scanned = 0;
1796 if (!priority) 1814 if (!priority)
1797 disable_swap_token(); 1815 disable_swap_token();
1798 shrink_zones(priority, zonelist, sc); 1816 ret = shrink_zones(priority, zonelist, sc);
1799 /* 1817 /*
1800 * Don't shrink slabs when reclaiming memory from 1818 * Don't shrink slabs when reclaiming memory from
1801 * over limit cgroups 1819 * over limit cgroups
@@ -1832,7 +1850,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
1832 congestion_wait(BLK_RW_ASYNC, HZ/10); 1850 congestion_wait(BLK_RW_ASYNC, HZ/10);
1833 } 1851 }
1834 /* top priority shrink_zones still had more to do? don't OOM, then */ 1852 /* top priority shrink_zones still had more to do? don't OOM, then */
1835 if (!sc->all_unreclaimable && scanning_global_lru(sc)) 1853 if (ret && scanning_global_lru(sc))
1836 ret = sc->nr_reclaimed; 1854 ret = sc->nr_reclaimed;
1837out: 1855out:
1838 /* 1856 /*
@@ -1857,6 +1875,7 @@ out:
1857 mem_cgroup_record_reclaim_priority(sc->mem_cgroup, priority); 1875 mem_cgroup_record_reclaim_priority(sc->mem_cgroup, priority);
1858 1876
1859 delayacct_freepages_end(); 1877 delayacct_freepages_end();
1878 put_mems_allowed();
1860 1879
1861 return ret; 1880 return ret;
1862} 1881}
@@ -1873,7 +1892,6 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
1873 .swappiness = vm_swappiness, 1892 .swappiness = vm_swappiness,
1874 .order = order, 1893 .order = order,
1875 .mem_cgroup = NULL, 1894 .mem_cgroup = NULL,
1876 .isolate_pages = isolate_pages_global,
1877 .nodemask = nodemask, 1895 .nodemask = nodemask,
1878 }; 1896 };
1879 1897
@@ -1894,7 +1912,6 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
1894 .swappiness = swappiness, 1912 .swappiness = swappiness,
1895 .order = 0, 1913 .order = 0,
1896 .mem_cgroup = mem, 1914 .mem_cgroup = mem,
1897 .isolate_pages = mem_cgroup_isolate_pages,
1898 }; 1915 };
1899 nodemask_t nm = nodemask_of_node(nid); 1916 nodemask_t nm = nodemask_of_node(nid);
1900 1917
@@ -1928,7 +1945,6 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
1928 .swappiness = swappiness, 1945 .swappiness = swappiness,
1929 .order = 0, 1946 .order = 0,
1930 .mem_cgroup = mem_cont, 1947 .mem_cgroup = mem_cont,
1931 .isolate_pages = mem_cgroup_isolate_pages,
1932 .nodemask = NULL, /* we don't care the placement */ 1948 .nodemask = NULL, /* we don't care the placement */
1933 }; 1949 };
1934 1950
@@ -2006,7 +2022,6 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order)
2006 .swappiness = vm_swappiness, 2022 .swappiness = vm_swappiness,
2007 .order = order, 2023 .order = order,
2008 .mem_cgroup = NULL, 2024 .mem_cgroup = NULL,
2009 .isolate_pages = isolate_pages_global,
2010 }; 2025 };
2011 /* 2026 /*
2012 * temp_priority is used to remember the scanning priority at which 2027 * temp_priority is used to remember the scanning priority at which
@@ -2385,7 +2400,6 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
2385 .hibernation_mode = 1, 2400 .hibernation_mode = 1,
2386 .swappiness = vm_swappiness, 2401 .swappiness = vm_swappiness,
2387 .order = 0, 2402 .order = 0,
2388 .isolate_pages = isolate_pages_global,
2389 }; 2403 };
2390 struct zonelist * zonelist = node_zonelist(numa_node_id(), sc.gfp_mask); 2404 struct zonelist * zonelist = node_zonelist(numa_node_id(), sc.gfp_mask);
2391 struct task_struct *p = current; 2405 struct task_struct *p = current;
@@ -2570,7 +2584,6 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
2570 .gfp_mask = gfp_mask, 2584 .gfp_mask = gfp_mask,
2571 .swappiness = vm_swappiness, 2585 .swappiness = vm_swappiness,
2572 .order = order, 2586 .order = order,
2573 .isolate_pages = isolate_pages_global,
2574 }; 2587 };
2575 unsigned long slab_reclaimable; 2588 unsigned long slab_reclaimable;
2576 2589
diff --git a/mm/vmstat.c b/mm/vmstat.c
index fa12ea3051fb..7759941d4e77 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -16,6 +16,7 @@
16#include <linux/cpu.h> 16#include <linux/cpu.h>
17#include <linux/vmstat.h> 17#include <linux/vmstat.h>
18#include <linux/sched.h> 18#include <linux/sched.h>
19#include <linux/math64.h>
19 20
20#ifdef CONFIG_VM_EVENT_COUNTERS 21#ifdef CONFIG_VM_EVENT_COUNTERS
21DEFINE_PER_CPU(struct vm_event_state, vm_event_states) = {{0}}; 22DEFINE_PER_CPU(struct vm_event_state, vm_event_states) = {{0}};
@@ -379,7 +380,86 @@ void zone_statistics(struct zone *preferred_zone, struct zone *z)
379} 380}
380#endif 381#endif
381 382
382#ifdef CONFIG_PROC_FS 383#ifdef CONFIG_COMPACTION
384struct contig_page_info {
385 unsigned long free_pages;
386 unsigned long free_blocks_total;
387 unsigned long free_blocks_suitable;
388};
389
390/*
391 * Calculate the number of free pages in a zone, how many contiguous
392 * pages are free and how many are large enough to satisfy an allocation of
393 * the target size. Note that this function makes no attempt to estimate
394 * how many suitable free blocks there *might* be if MOVABLE pages were
395 * migrated. Calculating that is possible, but expensive and can be
396 * figured out from userspace
397 */
398static void fill_contig_page_info(struct zone *zone,
399 unsigned int suitable_order,
400 struct contig_page_info *info)
401{
402 unsigned int order;
403
404 info->free_pages = 0;
405 info->free_blocks_total = 0;
406 info->free_blocks_suitable = 0;
407
408 for (order = 0; order < MAX_ORDER; order++) {
409 unsigned long blocks;
410
411 /* Count number of free blocks */
412 blocks = zone->free_area[order].nr_free;
413 info->free_blocks_total += blocks;
414
415 /* Count free base pages */
416 info->free_pages += blocks << order;
417
418 /* Count the suitable free blocks */
419 if (order >= suitable_order)
420 info->free_blocks_suitable += blocks <<
421 (order - suitable_order);
422 }
423}
424
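For reference, the counting loop above can be reproduced in userspace. The sketch below is illustrative only: nr_free[] is an invented stand-in for zone->free_area[order].nr_free, and MAX_ORDER is assumed to be 11 here.

/* Hypothetical standalone sketch of fill_contig_page_info()'s counting.
 * nr_free[] stands in for zone->free_area[order].nr_free. */
#include <stdio.h>

#define MAX_ORDER 11

int main(void)
{
	/* Toy buddy state: 100 order-0 blocks, 10 order-3 blocks, 1 order-5 block */
	unsigned long nr_free[MAX_ORDER] = { 100, 0, 0, 10, 0, 1 };
	unsigned int suitable_order = 4;	/* e.g. a 16-page allocation */
	unsigned long free_pages = 0, free_blocks_total = 0, free_blocks_suitable = 0;
	unsigned int order;

	for (order = 0; order < MAX_ORDER; order++) {
		unsigned long blocks = nr_free[order];

		free_blocks_total += blocks;
		free_pages += blocks << order;	/* blocks * 2^order base pages */
		if (order >= suitable_order)
			free_blocks_suitable += blocks << (order - suitable_order);
	}

	/* For this toy state: 212 free pages, 111 free blocks, 2 suitable blocks */
	printf("free_pages=%lu total_blocks=%lu suitable_blocks=%lu\n",
	       free_pages, free_blocks_total, free_blocks_suitable);
	return 0;
}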
425/*
426 * A fragmentation index only makes sense if an allocation of a requested
427 * size would fail. If that is true, the fragmentation index indicates
428 * whether external fragmentation or a lack of memory was the problem.
429 * The value can be used to determine if page reclaim or compaction
430 * should be used
431 */
432static int __fragmentation_index(unsigned int order, struct contig_page_info *info)
433{
434 unsigned long requested = 1UL << order;
435
436 if (!info->free_blocks_total)
437 return 0;
438
439 /* Fragmentation index only makes sense when a request would fail */
440 if (info->free_blocks_suitable)
441 return -1000;
442
443 /*
444 * Index is between 0 and 1 so return within 3 decimal places
445 *
446 * 0 => allocation would fail due to lack of memory
447 * 1 => allocation would fail due to fragmentation
448 */
449 return 1000 - div_u64( (1000+(div_u64(info->free_pages * 1000ULL, requested))), info->free_blocks_total);
450}
451
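Worked example (with invented zone values): 212 free pages spread over 111 free blocks, none of them large enough for an order-6 (64-page) request, gives 1000 - (1000 + 212*1000/64)/111 = 962, i.e. an index of roughly 0.962, so the failure is down to fragmentation rather than a shortage of memory. A minimal standalone sketch of that arithmetic:

/* Hypothetical worked example of __fragmentation_index()'s integer math. */
#include <stdio.h>

int main(void)
{
	/* Toy zone state: no free block can satisfy the order-6 request,
	 * so the fragmentation index applies. */
	unsigned long free_pages = 212;
	unsigned long free_blocks_total = 111;
	unsigned int order = 6;
	unsigned long requested = 1UL << order;
	unsigned long index;

	index = 1000 - (1000 + free_pages * 1000UL / requested) / free_blocks_total;

	/* Prints 0.962: the failure is mostly due to fragmentation, not lack
	 * of memory, so compaction is a better tool than reclaim here. */
	printf("fragmentation index = 0.%03lu\n", index);
	return 0;
}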
 452/* Same as __fragmentation_index but allocs contig_page_info on stack */
453int fragmentation_index(struct zone *zone, unsigned int order)
454{
455 struct contig_page_info info;
456
457 fill_contig_page_info(zone, order, &info);
458 return __fragmentation_index(order, &info);
459}
460#endif
461
462#if defined(CONFIG_PROC_FS) || defined(CONFIG_COMPACTION)
383#include <linux/proc_fs.h> 463#include <linux/proc_fs.h>
384#include <linux/seq_file.h> 464#include <linux/seq_file.h>
385 465
@@ -432,7 +512,9 @@ static void walk_zones_in_node(struct seq_file *m, pg_data_t *pgdat,
432 spin_unlock_irqrestore(&zone->lock, flags); 512 spin_unlock_irqrestore(&zone->lock, flags);
433 } 513 }
434} 514}
515#endif
435 516
517#ifdef CONFIG_PROC_FS
436static void frag_show_print(struct seq_file *m, pg_data_t *pgdat, 518static void frag_show_print(struct seq_file *m, pg_data_t *pgdat,
437 struct zone *zone) 519 struct zone *zone)
438{ 520{
@@ -693,6 +775,16 @@ static const char * const vmstat_text[] = {
693 "allocstall", 775 "allocstall",
694 776
695 "pgrotated", 777 "pgrotated",
778
779#ifdef CONFIG_COMPACTION
780 "compact_blocks_moved",
781 "compact_pages_moved",
782 "compact_pagemigrate_failed",
783 "compact_stall",
784 "compact_fail",
785 "compact_success",
786#endif
787
696#ifdef CONFIG_HUGETLB_PAGE 788#ifdef CONFIG_HUGETLB_PAGE
697 "htlb_buddy_alloc_success", 789 "htlb_buddy_alloc_success",
698 "htlb_buddy_alloc_fail", 790 "htlb_buddy_alloc_fail",
@@ -954,3 +1046,162 @@ static int __init setup_vmstat(void)
954 return 0; 1046 return 0;
955} 1047}
956module_init(setup_vmstat) 1048module_init(setup_vmstat)
1049
1050#if defined(CONFIG_DEBUG_FS) && defined(CONFIG_COMPACTION)
1051#include <linux/debugfs.h>
1052
1053static struct dentry *extfrag_debug_root;
1054
1055/*
1056 * Return an index indicating how much of the available free memory is
1057 * unusable for an allocation of the requested size.
1058 */
1059static int unusable_free_index(unsigned int order,
1060 struct contig_page_info *info)
1061{
1062 /* No free memory is interpreted as all free memory is unusable */
1063 if (info->free_pages == 0)
1064 return 1000;
1065
1066 /*
1067 * Index should be a value between 0 and 1. Return a value to 3
1068 * decimal places.
1069 *
1070 * 0 => no fragmentation
1071 * 1 => high fragmentation
1072 */
1073 return div_u64((info->free_pages - (info->free_blocks_suitable << order)) * 1000ULL, info->free_pages);
1074
1075}
1076
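Worked example (same invented zone values as earlier): 212 free pages of which two order-4 blocks (32 pages) are usable gives (212 - 32) * 1000 / 212 = 849, i.e. roughly 85% of free memory cannot serve an order-4 allocation. A standalone sketch:

/* Hypothetical worked example of unusable_free_index()'s arithmetic. */
#include <stdio.h>

int main(void)
{
	unsigned long free_pages = 212, free_blocks_suitable = 2;
	unsigned int order = 4;
	unsigned long index;

	if (free_pages == 0)
		index = 1000;	/* no free memory: all of it is "unusable" */
	else
		index = (free_pages - (free_blocks_suitable << order)) * 1000UL
			/ free_pages;

	/* Prints 0.849 for these values */
	printf("unusable free index = 0.%03lu\n", index);
	return 0;
}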
1077static void unusable_show_print(struct seq_file *m,
1078 pg_data_t *pgdat, struct zone *zone)
1079{
1080 unsigned int order;
1081 int index;
1082 struct contig_page_info info;
1083
1084 seq_printf(m, "Node %d, zone %8s ",
1085 pgdat->node_id,
1086 zone->name);
1087 for (order = 0; order < MAX_ORDER; ++order) {
1088 fill_contig_page_info(zone, order, &info);
1089 index = unusable_free_index(order, &info);
1090 seq_printf(m, "%d.%03d ", index / 1000, index % 1000);
1091 }
1092
1093 seq_putc(m, '\n');
1094}
1095
1096/*
1097 * Display unusable free space index
1098 *
1099 * The unusable free space index measures how much of the available free
1100 * memory cannot be used to satisfy an allocation of a given size and is a
1101 * value between 0 and 1. The higher the value, the more of free memory is
1102 * unusable and by implication, the worse the external fragmentation is. This
1103 * can be expressed as a percentage by multiplying by 100.
1104 */
1105static int unusable_show(struct seq_file *m, void *arg)
1106{
1107 pg_data_t *pgdat = (pg_data_t *)arg;
1108
1109 /* check memoryless node */
1110 if (!node_state(pgdat->node_id, N_HIGH_MEMORY))
1111 return 0;
1112
1113 walk_zones_in_node(m, pgdat, unusable_show_print);
1114
1115 return 0;
1116}
1117
1118static const struct seq_operations unusable_op = {
1119 .start = frag_start,
1120 .next = frag_next,
1121 .stop = frag_stop,
1122 .show = unusable_show,
1123};
1124
1125static int unusable_open(struct inode *inode, struct file *file)
1126{
1127 return seq_open(file, &unusable_op);
1128}
1129
1130static const struct file_operations unusable_file_ops = {
1131 .open = unusable_open,
1132 .read = seq_read,
1133 .llseek = seq_lseek,
1134 .release = seq_release,
1135};
1136
1137static void extfrag_show_print(struct seq_file *m,
1138 pg_data_t *pgdat, struct zone *zone)
1139{
1140 unsigned int order;
1141 int index;
1142
1143 /* Alloc on stack as interrupts are disabled for zone walk */
1144 struct contig_page_info info;
1145
1146 seq_printf(m, "Node %d, zone %8s ",
1147 pgdat->node_id,
1148 zone->name);
1149 for (order = 0; order < MAX_ORDER; ++order) {
1150 fill_contig_page_info(zone, order, &info);
1151 index = __fragmentation_index(order, &info);
1152 seq_printf(m, "%d.%03d ", index / 1000, index % 1000);
1153 }
1154
1155 seq_putc(m, '\n');
1156}
1157
1158/*
1159 * Display fragmentation index for orders that allocations would fail for
1160 */
1161static int extfrag_show(struct seq_file *m, void *arg)
1162{
1163 pg_data_t *pgdat = (pg_data_t *)arg;
1164
1165 walk_zones_in_node(m, pgdat, extfrag_show_print);
1166
1167 return 0;
1168}
1169
1170static const struct seq_operations extfrag_op = {
1171 .start = frag_start,
1172 .next = frag_next,
1173 .stop = frag_stop,
1174 .show = extfrag_show,
1175};
1176
1177static int extfrag_open(struct inode *inode, struct file *file)
1178{
1179 return seq_open(file, &extfrag_op);
1180}
1181
1182static const struct file_operations extfrag_file_ops = {
1183 .open = extfrag_open,
1184 .read = seq_read,
1185 .llseek = seq_lseek,
1186 .release = seq_release,
1187};
1188
1189static int __init extfrag_debug_init(void)
1190{
1191 extfrag_debug_root = debugfs_create_dir("extfrag", NULL);
1192 if (!extfrag_debug_root)
1193 return -ENOMEM;
1194
1195 if (!debugfs_create_file("unusable_index", 0444,
1196 extfrag_debug_root, NULL, &unusable_file_ops))
1197 return -ENOMEM;
1198
1199 if (!debugfs_create_file("extfrag_index", 0444,
1200 extfrag_debug_root, NULL, &extfrag_file_ops))
1201 return -ENOMEM;
1202
1203 return 0;
1204}
1205
1206module_init(extfrag_debug_init);
1207#endif
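The two files created above live in the "extfrag" debugfs directory and can be read like ordinary seq_files. A hedged userspace sketch, assuming debugfs is mounted at the conventional /sys/kernel/debug (the mount point is an assumption, not part of the patch):

/* Sketch: print the two debugfs files this patch creates. */
#include <stdio.h>

static void dump(const char *path)
{
	char line[256];
	FILE *fp = fopen(path, "r");

	if (!fp) {
		perror(path);
		return;
	}
	printf("== %s ==\n", path);
	while (fgets(line, sizeof(line), fp))
		fputs(line, stdout);
	fclose(fp);
}

int main(void)
{
	dump("/sys/kernel/debug/extfrag/unusable_index");
	dump("/sys/kernel/debug/extfrag/extfrag_index");
	return 0;
}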