author    Jiri Kosina <jkosina@suse.cz>  2011-04-26 04:22:15 -0400
committer Jiri Kosina <jkosina@suse.cz>  2011-04-26 04:22:59 -0400
commit    07f9479a40cc778bc1462ada11f95b01360ae4ff (patch)
tree      0676cf38df3844004bb3ebfd99dfa67a4a8998f5 /mm
parent    9d5e6bdb3013acfb311ab407eeca0b6a6a3dedbf (diff)
parent    cd2e49e90f1cae7726c9a2c54488d881d7f1cd1c (diff)
Merge branch 'master' into for-next
Fast-forwarded to current state of Linus' tree as there are patches to be applied for files that didn't exist on the old branch.
Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig.debug        25
-rw-r--r--  mm/backing-dev.c        18
-rw-r--r--  mm/bootmem.c             8
-rw-r--r--  mm/compaction.c         65
-rw-r--r--  mm/filemap.c           211
-rw-r--r--  mm/huge_memory.c        69
-rw-r--r--  mm/hugetlb.c            16
-rw-r--r--  mm/hwpoison-inject.c     2
-rw-r--r--  mm/internal.h            2
-rw-r--r--  mm/kmemleak.c            6
-rw-r--r--  mm/ksm.c                25
-rw-r--r--  mm/memblock.c          241
-rw-r--r--  mm/memcontrol.c        669
-rw-r--r--  mm/memory-failure.c     16
-rw-r--r--  mm/memory.c            106
-rw-r--r--  mm/memory_hotplug.c      4
-rw-r--r--  mm/mempolicy.c           3
-rw-r--r--  mm/migrate.c            58
-rw-r--r--  mm/mlock.c              17
-rw-r--r--  mm/mmap.c               15
-rw-r--r--  mm/mremap.c             11
-rw-r--r--  mm/nobootmem.c          10
-rw-r--r--  mm/nommu.c              58
-rw-r--r--  mm/oom_kill.c           89
-rw-r--r--  mm/page-writeback.c     25
-rw-r--r--  mm/page_alloc.c         95
-rw-r--r--  mm/page_cgroup.c       140
-rw-r--r--  mm/page_io.c             2
-rw-r--r--  mm/pagewalk.c           24
-rw-r--r--  mm/percpu.c             13
-rw-r--r--  mm/readahead.c          18
-rw-r--r--  mm/rmap.c               85
-rw-r--r--  mm/shmem.c              11
-rw-r--r--  mm/slab.c               61
-rw-r--r--  mm/slob.c                6
-rw-r--r--  mm/slub.c              376
-rw-r--r--  mm/sparse.c              2
-rw-r--r--  mm/swap.c              189
-rw-r--r--  mm/swap_state.c          5
-rw-r--r--  mm/swapfile.c          411
-rw-r--r--  mm/truncate.c           22
-rw-r--r--  mm/util.c                2
-rw-r--r--  mm/vmalloc.c           158
-rw-r--r--  mm/vmscan.c             66
-rw-r--r--  mm/vmstat.c             27
45 files changed, 2123 insertions, 1359 deletions
diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug
index af7cfb43d2f0..8b1a477162dc 100644
--- a/mm/Kconfig.debug
+++ b/mm/Kconfig.debug
@@ -1,27 +1,24 @@
1config DEBUG_PAGEALLOC 1config DEBUG_PAGEALLOC
2 bool "Debug page memory allocations" 2 bool "Debug page memory allocations"
3 depends on DEBUG_KERNEL && ARCH_SUPPORTS_DEBUG_PAGEALLOC 3 depends on DEBUG_KERNEL
4 depends on !HIBERNATION || !PPC && !SPARC 4 depends on !HIBERNATION || ARCH_SUPPORTS_DEBUG_PAGEALLOC && !PPC && !SPARC
5 depends on !KMEMCHECK 5 depends on !KMEMCHECK
6 select PAGE_POISONING if !ARCH_SUPPORTS_DEBUG_PAGEALLOC
6 ---help--- 7 ---help---
7 Unmap pages from the kernel linear mapping after free_pages(). 8 Unmap pages from the kernel linear mapping after free_pages().
8 This results in a large slowdown, but helps to find certain types 9 This results in a large slowdown, but helps to find certain types
9 of memory corruption. 10 of memory corruption.
10 11
12 For architectures which don't enable ARCH_SUPPORTS_DEBUG_PAGEALLOC,
13 fill the pages with poison patterns after free_pages() and verify
14 the patterns before alloc_pages(). Additionally,
15 this option cannot be enabled in combination with hibernation as
16 that would result in incorrect warnings of memory corruption after
17 a resume because free pages are not saved to the suspend image.
18
11config WANT_PAGE_DEBUG_FLAGS 19config WANT_PAGE_DEBUG_FLAGS
12 bool 20 bool
13 21
14config PAGE_POISONING 22config PAGE_POISONING
15 bool "Debug page memory allocations" 23 bool
16 depends on DEBUG_KERNEL && !ARCH_SUPPORTS_DEBUG_PAGEALLOC
17 depends on !HIBERNATION
18 select DEBUG_PAGEALLOC
19 select WANT_PAGE_DEBUG_FLAGS 24 select WANT_PAGE_DEBUG_FLAGS
20 ---help---
21 Fill the pages with poison patterns after free_pages() and verify
22 the patterns before alloc_pages(). This results in a large slowdown,
23 but helps to find certain types of memory corruption.
24
25 This option cannot be enabled in combination with hibernation as
26 that would result in incorrect warnings of memory corruption after
27 a resume because free pages are not saved to the suspend image.
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 027100d30227..befc87531e4f 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -14,17 +14,11 @@
14 14
15static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0); 15static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0);
16 16
17void default_unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
18{
19}
20EXPORT_SYMBOL(default_unplug_io_fn);
21
22struct backing_dev_info default_backing_dev_info = { 17struct backing_dev_info default_backing_dev_info = {
23 .name = "default", 18 .name = "default",
24 .ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_CACHE_SIZE, 19 .ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_CACHE_SIZE,
25 .state = 0, 20 .state = 0,
26 .capabilities = BDI_CAP_MAP_COPY, 21 .capabilities = BDI_CAP_MAP_COPY,
27 .unplug_io_fn = default_unplug_io_fn,
28}; 22};
29EXPORT_SYMBOL_GPL(default_backing_dev_info); 23EXPORT_SYMBOL_GPL(default_backing_dev_info);
30 24
@@ -73,14 +67,14 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v)
73 struct inode *inode; 67 struct inode *inode;
74 68
75 nr_wb = nr_dirty = nr_io = nr_more_io = 0; 69 nr_wb = nr_dirty = nr_io = nr_more_io = 0;
76 spin_lock(&inode_lock); 70 spin_lock(&inode_wb_list_lock);
77 list_for_each_entry(inode, &wb->b_dirty, i_wb_list) 71 list_for_each_entry(inode, &wb->b_dirty, i_wb_list)
78 nr_dirty++; 72 nr_dirty++;
79 list_for_each_entry(inode, &wb->b_io, i_wb_list) 73 list_for_each_entry(inode, &wb->b_io, i_wb_list)
80 nr_io++; 74 nr_io++;
81 list_for_each_entry(inode, &wb->b_more_io, i_wb_list) 75 list_for_each_entry(inode, &wb->b_more_io, i_wb_list)
82 nr_more_io++; 76 nr_more_io++;
83 spin_unlock(&inode_lock); 77 spin_unlock(&inode_wb_list_lock);
84 78
85 global_dirty_limits(&background_thresh, &dirty_thresh); 79 global_dirty_limits(&background_thresh, &dirty_thresh);
86 bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh); 80 bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh);
@@ -604,7 +598,7 @@ static void bdi_prune_sb(struct backing_dev_info *bdi)
604 spin_lock(&sb_lock); 598 spin_lock(&sb_lock);
605 list_for_each_entry(sb, &super_blocks, s_list) { 599 list_for_each_entry(sb, &super_blocks, s_list) {
606 if (sb->s_bdi == bdi) 600 if (sb->s_bdi == bdi)
607 sb->s_bdi = NULL; 601 sb->s_bdi = &default_backing_dev_info;
608 } 602 }
609 spin_unlock(&sb_lock); 603 spin_unlock(&sb_lock);
610} 604}
@@ -682,11 +676,11 @@ void bdi_destroy(struct backing_dev_info *bdi)
682 if (bdi_has_dirty_io(bdi)) { 676 if (bdi_has_dirty_io(bdi)) {
683 struct bdi_writeback *dst = &default_backing_dev_info.wb; 677 struct bdi_writeback *dst = &default_backing_dev_info.wb;
684 678
685 spin_lock(&inode_lock); 679 spin_lock(&inode_wb_list_lock);
686 list_splice(&bdi->wb.b_dirty, &dst->b_dirty); 680 list_splice(&bdi->wb.b_dirty, &dst->b_dirty);
687 list_splice(&bdi->wb.b_io, &dst->b_io); 681 list_splice(&bdi->wb.b_io, &dst->b_io);
688 list_splice(&bdi->wb.b_more_io, &dst->b_more_io); 682 list_splice(&bdi->wb.b_more_io, &dst->b_more_io);
689 spin_unlock(&inode_lock); 683 spin_unlock(&inode_wb_list_lock);
690 } 684 }
691 685
692 bdi_unregister(bdi); 686 bdi_unregister(bdi);
@@ -793,7 +787,7 @@ EXPORT_SYMBOL(congestion_wait);
793 * jiffies for either a BDI to exit congestion of the given @sync queue 787 * jiffies for either a BDI to exit congestion of the given @sync queue
794 * or a write to complete. 788 * or a write to complete.
795 * 789 *
796 * In the absense of zone congestion, cond_resched() is called to yield 790 * In the absence of zone congestion, cond_resched() is called to yield
797 * the processor if necessary but otherwise does not sleep. 791 * the processor if necessary but otherwise does not sleep.
798 * 792 *
799 * The return value is 0 if the sleep is for the full timeout. Otherwise, 793 * The return value is 0 if the sleep is for the full timeout. Otherwise,
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 07aeb89e396e..01d5a4b3dd0c 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -34,14 +34,6 @@ unsigned long max_low_pfn;
34unsigned long min_low_pfn; 34unsigned long min_low_pfn;
35unsigned long max_pfn; 35unsigned long max_pfn;
36 36
37#ifdef CONFIG_CRASH_DUMP
38/*
39 * If we have booted due to a crash, max_pfn will be a very low value. We need
40 * to know the amount of memory that the previous kernel used.
41 */
42unsigned long saved_max_pfn;
43#endif
44
45bootmem_data_t bootmem_node_data[MAX_NUMNODES] __initdata; 37bootmem_data_t bootmem_node_data[MAX_NUMNODES] __initdata;
46 38
47static struct list_head bdata_list __initdata = LIST_HEAD_INIT(bdata_list); 39static struct list_head bdata_list __initdata = LIST_HEAD_INIT(bdata_list);
diff --git a/mm/compaction.c b/mm/compaction.c
index 8be430b812de..021a2960ef9e 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -42,8 +42,6 @@ struct compact_control {
42 unsigned int order; /* order a direct compactor needs */ 42 unsigned int order; /* order a direct compactor needs */
43 int migratetype; /* MOVABLE, RECLAIMABLE etc */ 43 int migratetype; /* MOVABLE, RECLAIMABLE etc */
44 struct zone *zone; 44 struct zone *zone;
45
46 int compact_mode;
47}; 45};
48 46
49static unsigned long release_freepages(struct list_head *freelist) 47static unsigned long release_freepages(struct list_head *freelist)
@@ -155,7 +153,6 @@ static void isolate_freepages(struct zone *zone,
155 * pages on cc->migratepages. We stop searching if the migrate 153 * pages on cc->migratepages. We stop searching if the migrate
156 * and free page scanners meet or enough free pages are isolated. 154 * and free page scanners meet or enough free pages are isolated.
157 */ 155 */
158 spin_lock_irqsave(&zone->lock, flags);
159 for (; pfn > low_pfn && cc->nr_migratepages > nr_freepages; 156 for (; pfn > low_pfn && cc->nr_migratepages > nr_freepages;
160 pfn -= pageblock_nr_pages) { 157 pfn -= pageblock_nr_pages) {
161 unsigned long isolated; 158 unsigned long isolated;
@@ -178,9 +175,19 @@ static void isolate_freepages(struct zone *zone,
178 if (!suitable_migration_target(page)) 175 if (!suitable_migration_target(page))
179 continue; 176 continue;
180 177
181 /* Found a block suitable for isolating free pages from */ 178 /*
182 isolated = isolate_freepages_block(zone, pfn, freelist); 179 * Found a block suitable for isolating free pages from. Now
183 nr_freepages += isolated; 180 * we disabled interrupts, double check things are ok and
181 * isolate the pages. This is to minimise the time IRQs
182 * are disabled
183 */
184 isolated = 0;
185 spin_lock_irqsave(&zone->lock, flags);
186 if (suitable_migration_target(page)) {
187 isolated = isolate_freepages_block(zone, pfn, freelist);
188 nr_freepages += isolated;
189 }
190 spin_unlock_irqrestore(&zone->lock, flags);
184 191
185 /* 192 /*
186 * Record the highest PFN we isolated pages from. When next 193 * Record the highest PFN we isolated pages from. When next
@@ -190,7 +197,6 @@ static void isolate_freepages(struct zone *zone,
190 if (isolated) 197 if (isolated)
191 high_pfn = max(high_pfn, pfn); 198 high_pfn = max(high_pfn, pfn);
192 } 199 }
193 spin_unlock_irqrestore(&zone->lock, flags);
194 200
195 /* split_free_page does not map the pages */ 201 /* split_free_page does not map the pages */
196 list_for_each_entry(page, freelist, lru) { 202 list_for_each_entry(page, freelist, lru) {
@@ -271,9 +277,27 @@ static unsigned long isolate_migratepages(struct zone *zone,
271 } 277 }
272 278
273 /* Time to isolate some pages for migration */ 279 /* Time to isolate some pages for migration */
280 cond_resched();
274 spin_lock_irq(&zone->lru_lock); 281 spin_lock_irq(&zone->lru_lock);
275 for (; low_pfn < end_pfn; low_pfn++) { 282 for (; low_pfn < end_pfn; low_pfn++) {
276 struct page *page; 283 struct page *page;
284 bool locked = true;
285
286 /* give a chance to irqs before checking need_resched() */
287 if (!((low_pfn+1) % SWAP_CLUSTER_MAX)) {
288 spin_unlock_irq(&zone->lru_lock);
289 locked = false;
290 }
291 if (need_resched() || spin_is_contended(&zone->lru_lock)) {
292 if (locked)
293 spin_unlock_irq(&zone->lru_lock);
294 cond_resched();
295 spin_lock_irq(&zone->lru_lock);
296 if (fatal_signal_pending(current))
297 break;
298 } else if (!locked)
299 spin_lock_irq(&zone->lru_lock);
300
277 if (!pfn_valid_within(low_pfn)) 301 if (!pfn_valid_within(low_pfn))
278 continue; 302 continue;
279 nr_scanned++; 303 nr_scanned++;
@@ -397,10 +421,7 @@ static int compact_finished(struct zone *zone,
397 return COMPACT_COMPLETE; 421 return COMPACT_COMPLETE;
398 422
399 /* Compaction run is not finished if the watermark is not met */ 423 /* Compaction run is not finished if the watermark is not met */
400 if (cc->compact_mode != COMPACT_MODE_KSWAPD) 424 watermark = low_wmark_pages(zone);
401 watermark = low_wmark_pages(zone);
402 else
403 watermark = high_wmark_pages(zone);
404 watermark += (1 << cc->order); 425 watermark += (1 << cc->order);
405 426
406 if (!zone_watermark_ok(zone, cc->order, watermark, 0, 0)) 427 if (!zone_watermark_ok(zone, cc->order, watermark, 0, 0))
@@ -413,15 +434,6 @@ static int compact_finished(struct zone *zone,
413 if (cc->order == -1) 434 if (cc->order == -1)
414 return COMPACT_CONTINUE; 435 return COMPACT_CONTINUE;
415 436
416 /*
417 * Generating only one page of the right order is not enough
418 * for kswapd, we must continue until we're above the high
419 * watermark as a pool for high order GFP_ATOMIC allocations
420 * too.
421 */
422 if (cc->compact_mode == COMPACT_MODE_KSWAPD)
423 return COMPACT_CONTINUE;
424
425 /* Direct compactor: Is a suitable page free? */ 437 /* Direct compactor: Is a suitable page free? */
426 for (order = cc->order; order < MAX_ORDER; order++) { 438 for (order = cc->order; order < MAX_ORDER; order++) {
427 /* Job done if page is free of the right migratetype */ 439 /* Job done if page is free of the right migratetype */
@@ -508,12 +520,13 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
508 520
509 while ((ret = compact_finished(zone, cc)) == COMPACT_CONTINUE) { 521 while ((ret = compact_finished(zone, cc)) == COMPACT_CONTINUE) {
510 unsigned long nr_migrate, nr_remaining; 522 unsigned long nr_migrate, nr_remaining;
523 int err;
511 524
512 if (!isolate_migratepages(zone, cc)) 525 if (!isolate_migratepages(zone, cc))
513 continue; 526 continue;
514 527
515 nr_migrate = cc->nr_migratepages; 528 nr_migrate = cc->nr_migratepages;
516 migrate_pages(&cc->migratepages, compaction_alloc, 529 err = migrate_pages(&cc->migratepages, compaction_alloc,
517 (unsigned long)cc, false, 530 (unsigned long)cc, false,
518 cc->sync); 531 cc->sync);
519 update_nr_listpages(cc); 532 update_nr_listpages(cc);
@@ -527,7 +540,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
527 nr_remaining); 540 nr_remaining);
528 541
529 /* Release LRU pages not migrated */ 542 /* Release LRU pages not migrated */
530 if (!list_empty(&cc->migratepages)) { 543 if (err) {
531 putback_lru_pages(&cc->migratepages); 544 putback_lru_pages(&cc->migratepages);
532 cc->nr_migratepages = 0; 545 cc->nr_migratepages = 0;
533 } 546 }
@@ -543,8 +556,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
543 556
544unsigned long compact_zone_order(struct zone *zone, 557unsigned long compact_zone_order(struct zone *zone,
545 int order, gfp_t gfp_mask, 558 int order, gfp_t gfp_mask,
546 bool sync, 559 bool sync)
547 int compact_mode)
548{ 560{
549 struct compact_control cc = { 561 struct compact_control cc = {
550 .nr_freepages = 0, 562 .nr_freepages = 0,
@@ -553,7 +565,6 @@ unsigned long compact_zone_order(struct zone *zone,
553 .migratetype = allocflags_to_migratetype(gfp_mask), 565 .migratetype = allocflags_to_migratetype(gfp_mask),
554 .zone = zone, 566 .zone = zone,
555 .sync = sync, 567 .sync = sync,
556 .compact_mode = compact_mode,
557 }; 568 };
558 INIT_LIST_HEAD(&cc.freepages); 569 INIT_LIST_HEAD(&cc.freepages);
559 INIT_LIST_HEAD(&cc.migratepages); 570 INIT_LIST_HEAD(&cc.migratepages);
@@ -599,8 +610,7 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist,
599 nodemask) { 610 nodemask) {
600 int status; 611 int status;
601 612
602 status = compact_zone_order(zone, order, gfp_mask, sync, 613 status = compact_zone_order(zone, order, gfp_mask, sync);
603 COMPACT_MODE_DIRECT_RECLAIM);
604 rc = max(status, rc); 614 rc = max(status, rc);
605 615
606 /* If a normal allocation would succeed, stop compacting */ 616 /* If a normal allocation would succeed, stop compacting */
@@ -631,7 +641,6 @@ static int compact_node(int nid)
631 .nr_freepages = 0, 641 .nr_freepages = 0,
632 .nr_migratepages = 0, 642 .nr_migratepages = 0,
633 .order = -1, 643 .order = -1,
634 .compact_mode = COMPACT_MODE_DIRECT_RECLAIM,
635 }; 644 };
636 645
637 zone = &pgdat->node_zones[zoneid]; 646 zone = &pgdat->node_zones[zoneid];
diff --git a/mm/filemap.c b/mm/filemap.c
index 83a45d35468b..c641edf553a9 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -80,8 +80,8 @@
80 * ->i_mutex 80 * ->i_mutex
81 * ->i_alloc_sem (various) 81 * ->i_alloc_sem (various)
82 * 82 *
83 * ->inode_lock 83 * inode_wb_list_lock
84 * ->sb_lock (fs/fs-writeback.c) 84 * sb_lock (fs/fs-writeback.c)
85 * ->mapping->tree_lock (__sync_single_inode) 85 * ->mapping->tree_lock (__sync_single_inode)
86 * 86 *
87 * ->i_mmap_lock 87 * ->i_mmap_lock
@@ -98,8 +98,10 @@
98 * ->zone.lru_lock (check_pte_range->isolate_lru_page) 98 * ->zone.lru_lock (check_pte_range->isolate_lru_page)
99 * ->private_lock (page_remove_rmap->set_page_dirty) 99 * ->private_lock (page_remove_rmap->set_page_dirty)
100 * ->tree_lock (page_remove_rmap->set_page_dirty) 100 * ->tree_lock (page_remove_rmap->set_page_dirty)
101 * ->inode_lock (page_remove_rmap->set_page_dirty) 101 * inode_wb_list_lock (page_remove_rmap->set_page_dirty)
102 * ->inode_lock (zap_pte_range->set_page_dirty) 102 * ->inode->i_lock (page_remove_rmap->set_page_dirty)
103 * inode_wb_list_lock (zap_pte_range->set_page_dirty)
104 * ->inode->i_lock (zap_pte_range->set_page_dirty)
103 * ->private_lock (zap_pte_range->__set_page_dirty_buffers) 105 * ->private_lock (zap_pte_range->__set_page_dirty_buffers)
104 * 106 *
105 * (code doesn't rely on that order, so you could switch it around) 107 * (code doesn't rely on that order, so you could switch it around)
@@ -108,11 +110,11 @@
108 */ 110 */
109 111
110/* 112/*
111 * Remove a page from the page cache and free it. Caller has to make 113 * Delete a page from the page cache and free it. Caller has to make
112 * sure the page is locked and that nobody else uses it - or that usage 114 * sure the page is locked and that nobody else uses it - or that usage
113 * is safe. The caller must hold the mapping's tree_lock. 115 * is safe. The caller must hold the mapping's tree_lock.
114 */ 116 */
115void __remove_from_page_cache(struct page *page) 117void __delete_from_page_cache(struct page *page)
116{ 118{
117 struct address_space *mapping = page->mapping; 119 struct address_space *mapping = page->mapping;
118 120
@@ -137,7 +139,15 @@ void __remove_from_page_cache(struct page *page)
137 } 139 }
138} 140}
139 141
140void remove_from_page_cache(struct page *page) 142/**
143 * delete_from_page_cache - delete page from page cache
144 * @page: the page which the kernel is trying to remove from page cache
145 *
146 * This must be called only on pages that have been verified to be in the page
147 * cache and locked. It will never put the page into the free list, the caller
148 * has a reference on the page.
149 */
150void delete_from_page_cache(struct page *page)
141{ 151{
142 struct address_space *mapping = page->mapping; 152 struct address_space *mapping = page->mapping;
143 void (*freepage)(struct page *); 153 void (*freepage)(struct page *);
@@ -146,54 +156,25 @@ void remove_from_page_cache(struct page *page)
146 156
147 freepage = mapping->a_ops->freepage; 157 freepage = mapping->a_ops->freepage;
148 spin_lock_irq(&mapping->tree_lock); 158 spin_lock_irq(&mapping->tree_lock);
149 __remove_from_page_cache(page); 159 __delete_from_page_cache(page);
150 spin_unlock_irq(&mapping->tree_lock); 160 spin_unlock_irq(&mapping->tree_lock);
151 mem_cgroup_uncharge_cache_page(page); 161 mem_cgroup_uncharge_cache_page(page);
152 162
153 if (freepage) 163 if (freepage)
154 freepage(page); 164 freepage(page);
165 page_cache_release(page);
155} 166}
156EXPORT_SYMBOL(remove_from_page_cache); 167EXPORT_SYMBOL(delete_from_page_cache);
157 168
158static int sync_page(void *word) 169static int sleep_on_page(void *word)
159{ 170{
160 struct address_space *mapping;
161 struct page *page;
162
163 page = container_of((unsigned long *)word, struct page, flags);
164
165 /*
166 * page_mapping() is being called without PG_locked held.
167 * Some knowledge of the state and use of the page is used to
168 * reduce the requirements down to a memory barrier.
169 * The danger here is of a stale page_mapping() return value
170 * indicating a struct address_space different from the one it's
171 * associated with when it is associated with one.
172 * After smp_mb(), it's either the correct page_mapping() for
173 * the page, or an old page_mapping() and the page's own
174 * page_mapping() has gone NULL.
175 * The ->sync_page() address_space operation must tolerate
176 * page_mapping() going NULL. By an amazing coincidence,
177 * this comes about because none of the users of the page
178 * in the ->sync_page() methods make essential use of the
179 * page_mapping(), merely passing the page down to the backing
180 * device's unplug functions when it's non-NULL, which in turn
181 * ignore it for all cases but swap, where only page_private(page) is
182 * of interest. When page_mapping() does go NULL, the entire
183 * call stack gracefully ignores the page and returns.
184 * -- wli
185 */
186 smp_mb();
187 mapping = page_mapping(page);
188 if (mapping && mapping->a_ops && mapping->a_ops->sync_page)
189 mapping->a_ops->sync_page(page);
190 io_schedule(); 171 io_schedule();
191 return 0; 172 return 0;
192} 173}
193 174
194static int sync_page_killable(void *word) 175static int sleep_on_page_killable(void *word)
195{ 176{
196 sync_page(word); 177 sleep_on_page(word);
197 return fatal_signal_pending(current) ? -EINTR : 0; 178 return fatal_signal_pending(current) ? -EINTR : 0;
198} 179}
199 180
@@ -387,6 +368,76 @@ int filemap_write_and_wait_range(struct address_space *mapping,
387EXPORT_SYMBOL(filemap_write_and_wait_range); 368EXPORT_SYMBOL(filemap_write_and_wait_range);
388 369
389/** 370/**
371 * replace_page_cache_page - replace a pagecache page with a new one
372 * @old: page to be replaced
373 * @new: page to replace with
374 * @gfp_mask: allocation mode
375 *
376 * This function replaces a page in the pagecache with a new one. On
377 * success it acquires the pagecache reference for the new page and
378 * drops it for the old page. Both the old and new pages must be
379 * locked. This function does not add the new page to the LRU, the
380 * caller must do that.
381 *
382 * The remove + add is atomic. The only way this function can fail is
383 * memory allocation failure.
384 */
385int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask)
386{
387 int error;
388 struct mem_cgroup *memcg = NULL;
389
390 VM_BUG_ON(!PageLocked(old));
391 VM_BUG_ON(!PageLocked(new));
392 VM_BUG_ON(new->mapping);
393
394 /*
395 * This is not page migration, but prepare_migration and
396 * end_migration does enough work for charge replacement.
397 *
398 * In the longer term we probably want a specialized function
399 * for moving the charge from old to new in a more efficient
400 * manner.
401 */
402 error = mem_cgroup_prepare_migration(old, new, &memcg, gfp_mask);
403 if (error)
404 return error;
405
406 error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM);
407 if (!error) {
408 struct address_space *mapping = old->mapping;
409 void (*freepage)(struct page *);
410
411 pgoff_t offset = old->index;
412 freepage = mapping->a_ops->freepage;
413
414 page_cache_get(new);
415 new->mapping = mapping;
416 new->index = offset;
417
418 spin_lock_irq(&mapping->tree_lock);
419 __delete_from_page_cache(old);
420 error = radix_tree_insert(&mapping->page_tree, offset, new);
421 BUG_ON(error);
422 mapping->nrpages++;
423 __inc_zone_page_state(new, NR_FILE_PAGES);
424 if (PageSwapBacked(new))
425 __inc_zone_page_state(new, NR_SHMEM);
426 spin_unlock_irq(&mapping->tree_lock);
427 radix_tree_preload_end();
428 if (freepage)
429 freepage(old);
430 page_cache_release(old);
431 mem_cgroup_end_migration(memcg, old, new, true);
432 } else {
433 mem_cgroup_end_migration(memcg, old, new, false);
434 }
435
436 return error;
437}
438EXPORT_SYMBOL_GPL(replace_page_cache_page);
439
440/**
390 * add_to_page_cache_locked - add a locked page to the pagecache 441 * add_to_page_cache_locked - add a locked page to the pagecache
391 * @page: page to add 442 * @page: page to add
392 * @mapping: the page's address_space 443 * @mapping: the page's address_space
@@ -479,12 +530,6 @@ struct page *__page_cache_alloc(gfp_t gfp)
479EXPORT_SYMBOL(__page_cache_alloc); 530EXPORT_SYMBOL(__page_cache_alloc);
480#endif 531#endif
481 532
482static int __sleep_on_page_lock(void *word)
483{
484 io_schedule();
485 return 0;
486}
487
488/* 533/*
489 * In order to wait for pages to become available there must be 534 * In order to wait for pages to become available there must be
490 * waitqueues associated with pages. By using a hash table of 535 * waitqueues associated with pages. By using a hash table of
@@ -512,7 +557,7 @@ void wait_on_page_bit(struct page *page, int bit_nr)
512 DEFINE_WAIT_BIT(wait, &page->flags, bit_nr); 557 DEFINE_WAIT_BIT(wait, &page->flags, bit_nr);
513 558
514 if (test_bit(bit_nr, &page->flags)) 559 if (test_bit(bit_nr, &page->flags))
515 __wait_on_bit(page_waitqueue(page), &wait, sync_page, 560 __wait_on_bit(page_waitqueue(page), &wait, sleep_on_page,
516 TASK_UNINTERRUPTIBLE); 561 TASK_UNINTERRUPTIBLE);
517} 562}
518EXPORT_SYMBOL(wait_on_page_bit); 563EXPORT_SYMBOL(wait_on_page_bit);
@@ -576,17 +621,12 @@ EXPORT_SYMBOL(end_page_writeback);
576/** 621/**
577 * __lock_page - get a lock on the page, assuming we need to sleep to get it 622 * __lock_page - get a lock on the page, assuming we need to sleep to get it
578 * @page: the page to lock 623 * @page: the page to lock
579 *
580 * Ugly. Running sync_page() in state TASK_UNINTERRUPTIBLE is scary. If some
581 * random driver's requestfn sets TASK_RUNNING, we could busywait. However
582 * chances are that on the second loop, the block layer's plug list is empty,
583 * so sync_page() will then return in state TASK_UNINTERRUPTIBLE.
584 */ 624 */
585void __lock_page(struct page *page) 625void __lock_page(struct page *page)
586{ 626{
587 DEFINE_WAIT_BIT(wait, &page->flags, PG_locked); 627 DEFINE_WAIT_BIT(wait, &page->flags, PG_locked);
588 628
589 __wait_on_bit_lock(page_waitqueue(page), &wait, sync_page, 629 __wait_on_bit_lock(page_waitqueue(page), &wait, sleep_on_page,
590 TASK_UNINTERRUPTIBLE); 630 TASK_UNINTERRUPTIBLE);
591} 631}
592EXPORT_SYMBOL(__lock_page); 632EXPORT_SYMBOL(__lock_page);
@@ -596,24 +636,10 @@ int __lock_page_killable(struct page *page)
596 DEFINE_WAIT_BIT(wait, &page->flags, PG_locked); 636 DEFINE_WAIT_BIT(wait, &page->flags, PG_locked);
597 637
598 return __wait_on_bit_lock(page_waitqueue(page), &wait, 638 return __wait_on_bit_lock(page_waitqueue(page), &wait,
599 sync_page_killable, TASK_KILLABLE); 639 sleep_on_page_killable, TASK_KILLABLE);
600} 640}
601EXPORT_SYMBOL_GPL(__lock_page_killable); 641EXPORT_SYMBOL_GPL(__lock_page_killable);
602 642
603/**
604 * __lock_page_nosync - get a lock on the page, without calling sync_page()
605 * @page: the page to lock
606 *
607 * Variant of lock_page that does not require the caller to hold a reference
608 * on the page's mapping.
609 */
610void __lock_page_nosync(struct page *page)
611{
612 DEFINE_WAIT_BIT(wait, &page->flags, PG_locked);
613 __wait_on_bit_lock(page_waitqueue(page), &wait, __sleep_on_page_lock,
614 TASK_UNINTERRUPTIBLE);
615}
616
617int __lock_page_or_retry(struct page *page, struct mm_struct *mm, 643int __lock_page_or_retry(struct page *page, struct mm_struct *mm,
618 unsigned int flags) 644 unsigned int flags)
619{ 645{
@@ -621,8 +647,10 @@ int __lock_page_or_retry(struct page *page, struct mm_struct *mm,
621 __lock_page(page); 647 __lock_page(page);
622 return 1; 648 return 1;
623 } else { 649 } else {
624 up_read(&mm->mmap_sem); 650 if (!(flags & FAULT_FLAG_RETRY_NOWAIT)) {
625 wait_on_page_locked(page); 651 up_read(&mm->mmap_sem);
652 wait_on_page_locked(page);
653 }
626 return 0; 654 return 0;
627 } 655 }
628} 656}
@@ -782,9 +810,13 @@ repeat:
782 page = radix_tree_deref_slot((void **)pages[i]); 810 page = radix_tree_deref_slot((void **)pages[i]);
783 if (unlikely(!page)) 811 if (unlikely(!page))
784 continue; 812 continue;
813
814 /*
815 * This can only trigger when the entry at index 0 moves out
816 * of or back to the root: none yet gotten, safe to restart.
817 */
785 if (radix_tree_deref_retry(page)) { 818 if (radix_tree_deref_retry(page)) {
786 if (ret) 819 WARN_ON(start | i);
787 start = pages[ret-1]->index;
788 goto restart; 820 goto restart;
789 } 821 }
790 822
@@ -800,6 +832,13 @@ repeat:
800 pages[ret] = page; 832 pages[ret] = page;
801 ret++; 833 ret++;
802 } 834 }
835
836 /*
837 * If all entries were removed before we could secure them,
838 * try again, because callers stop trying once 0 is returned.
839 */
840 if (unlikely(!ret && nr_found))
841 goto restart;
803 rcu_read_unlock(); 842 rcu_read_unlock();
804 return ret; 843 return ret;
805} 844}
@@ -834,6 +873,11 @@ repeat:
834 page = radix_tree_deref_slot((void **)pages[i]); 873 page = radix_tree_deref_slot((void **)pages[i]);
835 if (unlikely(!page)) 874 if (unlikely(!page))
836 continue; 875 continue;
876
877 /*
878 * This can only trigger when the entry at index 0 moves out
879 * of or back to the root: none yet gotten, safe to restart.
880 */
837 if (radix_tree_deref_retry(page)) 881 if (radix_tree_deref_retry(page))
838 goto restart; 882 goto restart;
839 883
@@ -894,6 +938,11 @@ repeat:
894 page = radix_tree_deref_slot((void **)pages[i]); 938 page = radix_tree_deref_slot((void **)pages[i]);
895 if (unlikely(!page)) 939 if (unlikely(!page))
896 continue; 940 continue;
941
942 /*
943 * This can only trigger when the entry at index 0 moves out
944 * of or back to the root: none yet gotten, safe to restart.
945 */
897 if (radix_tree_deref_retry(page)) 946 if (radix_tree_deref_retry(page))
898 goto restart; 947 goto restart;
899 948
@@ -909,6 +958,13 @@ repeat:
909 pages[ret] = page; 958 pages[ret] = page;
910 ret++; 959 ret++;
911 } 960 }
961
962 /*
963 * If all entries were removed before we could secure them,
964 * try again, because callers stop trying once 0 is returned.
965 */
966 if (unlikely(!ret && nr_found))
967 goto restart;
912 rcu_read_unlock(); 968 rcu_read_unlock();
913 969
914 if (ret) 970 if (ret)
@@ -1298,12 +1354,15 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1298 unsigned long seg = 0; 1354 unsigned long seg = 0;
1299 size_t count; 1355 size_t count;
1300 loff_t *ppos = &iocb->ki_pos; 1356 loff_t *ppos = &iocb->ki_pos;
1357 struct blk_plug plug;
1301 1358
1302 count = 0; 1359 count = 0;
1303 retval = generic_segment_checks(iov, &nr_segs, &count, VERIFY_WRITE); 1360 retval = generic_segment_checks(iov, &nr_segs, &count, VERIFY_WRITE);
1304 if (retval) 1361 if (retval)
1305 return retval; 1362 return retval;
1306 1363
1364 blk_start_plug(&plug);
1365
1307 /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */ 1366 /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */
1308 if (filp->f_flags & O_DIRECT) { 1367 if (filp->f_flags & O_DIRECT) {
1309 loff_t size; 1368 loff_t size;
@@ -1376,6 +1435,7 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1376 break; 1435 break;
1377 } 1436 }
1378out: 1437out:
1438 blk_finish_plug(&plug);
1379 return retval; 1439 return retval;
1380} 1440}
1381EXPORT_SYMBOL(generic_file_aio_read); 1441EXPORT_SYMBOL(generic_file_aio_read);
@@ -2487,11 +2547,13 @@ ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
2487{ 2547{
2488 struct file *file = iocb->ki_filp; 2548 struct file *file = iocb->ki_filp;
2489 struct inode *inode = file->f_mapping->host; 2549 struct inode *inode = file->f_mapping->host;
2550 struct blk_plug plug;
2490 ssize_t ret; 2551 ssize_t ret;
2491 2552
2492 BUG_ON(iocb->ki_pos != pos); 2553 BUG_ON(iocb->ki_pos != pos);
2493 2554
2494 mutex_lock(&inode->i_mutex); 2555 mutex_lock(&inode->i_mutex);
2556 blk_start_plug(&plug);
2495 ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos); 2557 ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos);
2496 mutex_unlock(&inode->i_mutex); 2558 mutex_unlock(&inode->i_mutex);
2497 2559
@@ -2502,6 +2564,7 @@ ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
2502 if (err < 0 && ret > 0) 2564 if (err < 0 && ret > 0)
2503 ret = err; 2565 ret = err;
2504 } 2566 }
2567 blk_finish_plug(&plug);
2505 return ret; 2568 return ret;
2506} 2569}
2507EXPORT_SYMBOL(generic_file_aio_write); 2570EXPORT_SYMBOL(generic_file_aio_write);
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 113e35c47502..470dcda10add 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -244,24 +244,28 @@ static ssize_t single_flag_show(struct kobject *kobj,
244 struct kobj_attribute *attr, char *buf, 244 struct kobj_attribute *attr, char *buf,
245 enum transparent_hugepage_flag flag) 245 enum transparent_hugepage_flag flag)
246{ 246{
247 if (test_bit(flag, &transparent_hugepage_flags)) 247 return sprintf(buf, "%d\n",
248 return sprintf(buf, "[yes] no\n"); 248 !!test_bit(flag, &transparent_hugepage_flags));
249 else
250 return sprintf(buf, "yes [no]\n");
251} 249}
250
252static ssize_t single_flag_store(struct kobject *kobj, 251static ssize_t single_flag_store(struct kobject *kobj,
253 struct kobj_attribute *attr, 252 struct kobj_attribute *attr,
254 const char *buf, size_t count, 253 const char *buf, size_t count,
255 enum transparent_hugepage_flag flag) 254 enum transparent_hugepage_flag flag)
256{ 255{
257 if (!memcmp("yes", buf, 256 unsigned long value;
258 min(sizeof("yes")-1, count))) { 257 int ret;
258
259 ret = kstrtoul(buf, 10, &value);
260 if (ret < 0)
261 return ret;
262 if (value > 1)
263 return -EINVAL;
264
265 if (value)
259 set_bit(flag, &transparent_hugepage_flags); 266 set_bit(flag, &transparent_hugepage_flags);
260 } else if (!memcmp("no", buf, 267 else
261 min(sizeof("no")-1, count))) {
262 clear_bit(flag, &transparent_hugepage_flags); 268 clear_bit(flag, &transparent_hugepage_flags);
263 } else
264 return -EINVAL;
265 269
266 return count; 270 return count;
267} 271}
@@ -643,23 +647,24 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
643 return ret; 647 return ret;
644} 648}
645 649
646static inline gfp_t alloc_hugepage_gfpmask(int defrag) 650static inline gfp_t alloc_hugepage_gfpmask(int defrag, gfp_t extra_gfp)
647{ 651{
648 return GFP_TRANSHUGE & ~(defrag ? 0 : __GFP_WAIT); 652 return (GFP_TRANSHUGE & ~(defrag ? 0 : __GFP_WAIT)) | extra_gfp;
649} 653}
650 654
651static inline struct page *alloc_hugepage_vma(int defrag, 655static inline struct page *alloc_hugepage_vma(int defrag,
652 struct vm_area_struct *vma, 656 struct vm_area_struct *vma,
653 unsigned long haddr, int nd) 657 unsigned long haddr, int nd,
658 gfp_t extra_gfp)
654{ 659{
655 return alloc_pages_vma(alloc_hugepage_gfpmask(defrag), 660 return alloc_pages_vma(alloc_hugepage_gfpmask(defrag, extra_gfp),
656 HPAGE_PMD_ORDER, vma, haddr, nd); 661 HPAGE_PMD_ORDER, vma, haddr, nd);
657} 662}
658 663
659#ifndef CONFIG_NUMA 664#ifndef CONFIG_NUMA
660static inline struct page *alloc_hugepage(int defrag) 665static inline struct page *alloc_hugepage(int defrag)
661{ 666{
662 return alloc_pages(alloc_hugepage_gfpmask(defrag), 667 return alloc_pages(alloc_hugepage_gfpmask(defrag, 0),
663 HPAGE_PMD_ORDER); 668 HPAGE_PMD_ORDER);
664} 669}
665#endif 670#endif
@@ -678,9 +683,12 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
678 if (unlikely(khugepaged_enter(vma))) 683 if (unlikely(khugepaged_enter(vma)))
679 return VM_FAULT_OOM; 684 return VM_FAULT_OOM;
680 page = alloc_hugepage_vma(transparent_hugepage_defrag(vma), 685 page = alloc_hugepage_vma(transparent_hugepage_defrag(vma),
681 vma, haddr, numa_node_id()); 686 vma, haddr, numa_node_id(), 0);
682 if (unlikely(!page)) 687 if (unlikely(!page)) {
688 count_vm_event(THP_FAULT_FALLBACK);
683 goto out; 689 goto out;
690 }
691 count_vm_event(THP_FAULT_ALLOC);
684 if (unlikely(mem_cgroup_newpage_charge(page, mm, GFP_KERNEL))) { 692 if (unlikely(mem_cgroup_newpage_charge(page, mm, GFP_KERNEL))) {
685 put_page(page); 693 put_page(page);
686 goto out; 694 goto out;
@@ -799,7 +807,8 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
799 } 807 }
800 808
801 for (i = 0; i < HPAGE_PMD_NR; i++) { 809 for (i = 0; i < HPAGE_PMD_NR; i++) {
802 pages[i] = alloc_page_vma_node(GFP_HIGHUSER_MOVABLE, 810 pages[i] = alloc_page_vma_node(GFP_HIGHUSER_MOVABLE |
811 __GFP_OTHER_NODE,
803 vma, address, page_to_nid(page)); 812 vma, address, page_to_nid(page));
804 if (unlikely(!pages[i] || 813 if (unlikely(!pages[i] ||
805 mem_cgroup_newpage_charge(pages[i], mm, 814 mem_cgroup_newpage_charge(pages[i], mm,
@@ -902,16 +911,18 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
902 if (transparent_hugepage_enabled(vma) && 911 if (transparent_hugepage_enabled(vma) &&
903 !transparent_hugepage_debug_cow()) 912 !transparent_hugepage_debug_cow())
904 new_page = alloc_hugepage_vma(transparent_hugepage_defrag(vma), 913 new_page = alloc_hugepage_vma(transparent_hugepage_defrag(vma),
905 vma, haddr, numa_node_id()); 914 vma, haddr, numa_node_id(), 0);
906 else 915 else
907 new_page = NULL; 916 new_page = NULL;
908 917
909 if (unlikely(!new_page)) { 918 if (unlikely(!new_page)) {
919 count_vm_event(THP_FAULT_FALLBACK);
910 ret = do_huge_pmd_wp_page_fallback(mm, vma, address, 920 ret = do_huge_pmd_wp_page_fallback(mm, vma, address,
911 pmd, orig_pmd, page, haddr); 921 pmd, orig_pmd, page, haddr);
912 put_page(page); 922 put_page(page);
913 goto out; 923 goto out;
914 } 924 }
925 count_vm_event(THP_FAULT_ALLOC);
915 926
916 if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) { 927 if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) {
917 put_page(new_page); 928 put_page(new_page);
@@ -1388,6 +1399,7 @@ int split_huge_page(struct page *page)
1388 1399
1389 BUG_ON(!PageSwapBacked(page)); 1400 BUG_ON(!PageSwapBacked(page));
1390 __split_huge_page(page, anon_vma); 1401 __split_huge_page(page, anon_vma);
1402 count_vm_event(THP_SPLIT);
1391 1403
1392 BUG_ON(PageCompound(page)); 1404 BUG_ON(PageCompound(page));
1393out_unlock: 1405out_unlock:
@@ -1779,12 +1791,14 @@ static void collapse_huge_page(struct mm_struct *mm,
1779 * scalability. 1791 * scalability.
1780 */ 1792 */
1781 new_page = alloc_hugepage_vma(khugepaged_defrag(), vma, address, 1793 new_page = alloc_hugepage_vma(khugepaged_defrag(), vma, address,
1782 node); 1794 node, __GFP_OTHER_NODE);
1783 if (unlikely(!new_page)) { 1795 if (unlikely(!new_page)) {
1784 up_read(&mm->mmap_sem); 1796 up_read(&mm->mmap_sem);
1797 count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
1785 *hpage = ERR_PTR(-ENOMEM); 1798 *hpage = ERR_PTR(-ENOMEM);
1786 return; 1799 return;
1787 } 1800 }
1801 count_vm_event(THP_COLLAPSE_ALLOC);
1788 if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) { 1802 if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) {
1789 up_read(&mm->mmap_sem); 1803 up_read(&mm->mmap_sem);
1790 put_page(new_page); 1804 put_page(new_page);
@@ -2149,8 +2163,11 @@ static void khugepaged_do_scan(struct page **hpage)
2149#ifndef CONFIG_NUMA 2163#ifndef CONFIG_NUMA
2150 if (!*hpage) { 2164 if (!*hpage) {
2151 *hpage = alloc_hugepage(khugepaged_defrag()); 2165 *hpage = alloc_hugepage(khugepaged_defrag());
2152 if (unlikely(!*hpage)) 2166 if (unlikely(!*hpage)) {
2167 count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
2153 break; 2168 break;
2169 }
2170 count_vm_event(THP_COLLAPSE_ALLOC);
2154 } 2171 }
2155#else 2172#else
2156 if (IS_ERR(*hpage)) 2173 if (IS_ERR(*hpage))
@@ -2190,8 +2207,11 @@ static struct page *khugepaged_alloc_hugepage(void)
2190 2207
2191 do { 2208 do {
2192 hpage = alloc_hugepage(khugepaged_defrag()); 2209 hpage = alloc_hugepage(khugepaged_defrag());
2193 if (!hpage) 2210 if (!hpage) {
2211 count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
2194 khugepaged_alloc_sleep(); 2212 khugepaged_alloc_sleep();
2213 } else
2214 count_vm_event(THP_COLLAPSE_ALLOC);
2195 } while (unlikely(!hpage) && 2215 } while (unlikely(!hpage) &&
2196 likely(khugepaged_enabled())); 2216 likely(khugepaged_enabled()));
2197 return hpage; 2217 return hpage;
@@ -2208,8 +2228,11 @@ static void khugepaged_loop(void)
2208 while (likely(khugepaged_enabled())) { 2228 while (likely(khugepaged_enabled())) {
2209#ifndef CONFIG_NUMA 2229#ifndef CONFIG_NUMA
2210 hpage = khugepaged_alloc_hugepage(); 2230 hpage = khugepaged_alloc_hugepage();
2211 if (unlikely(!hpage)) 2231 if (unlikely(!hpage)) {
2232 count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
2212 break; 2233 break;
2234 }
2235 count_vm_event(THP_COLLAPSE_ALLOC);
2213#else 2236#else
2214 if (IS_ERR(hpage)) { 2237 if (IS_ERR(hpage)) {
2215 khugepaged_alloc_sleep(); 2238 khugepaged_alloc_sleep();
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 838fe25f704c..bbb4a5bbb958 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -146,7 +146,7 @@ static long region_chg(struct list_head *head, long f, long t)
146 if (rg->from > t) 146 if (rg->from > t)
147 return chg; 147 return chg;
148 148
149 /* We overlap with this area, if it extends futher than 149 /* We overlap with this area, if it extends further than
150 * us then we must extend ourselves. Account for its 150 * us then we must extend ourselves. Account for its
151 * existing reservation. */ 151 * existing reservation. */
152 if (rg->to > t) { 152 if (rg->to > t) {
@@ -842,7 +842,7 @@ struct page *alloc_huge_page_node(struct hstate *h, int nid)
842} 842}
843 843
844/* 844/*
845 * Increase the hugetlb pool such that it can accomodate a reservation 845 * Increase the hugetlb pool such that it can accommodate a reservation
846 * of size 'delta'. 846 * of size 'delta'.
847 */ 847 */
848static int gather_surplus_pages(struct hstate *h, int delta) 848static int gather_surplus_pages(struct hstate *h, int delta)
@@ -890,7 +890,7 @@ retry:
890 890
891 /* 891 /*
892 * The surplus_list now contains _at_least_ the number of extra pages 892 * The surplus_list now contains _at_least_ the number of extra pages
893 * needed to accomodate the reservation. Add the appropriate number 893 * needed to accommodate the reservation. Add the appropriate number
894 * of pages to the hugetlb pool and free the extras back to the buddy 894 * of pages to the hugetlb pool and free the extras back to the buddy
895 * allocator. Commit the entire reservation here to prevent another 895 * allocator. Commit the entire reservation here to prevent another
896 * process from stealing the pages as they are added to the pool but 896 * process from stealing the pages as they are added to the pool but
@@ -1872,8 +1872,7 @@ static int hugetlb_sysctl_handler_common(bool obey_mempolicy,
1872 unsigned long tmp; 1872 unsigned long tmp;
1873 int ret; 1873 int ret;
1874 1874
1875 if (!write) 1875 tmp = h->max_huge_pages;
1876 tmp = h->max_huge_pages;
1877 1876
1878 if (write && h->order >= MAX_ORDER) 1877 if (write && h->order >= MAX_ORDER)
1879 return -EINVAL; 1878 return -EINVAL;
@@ -1938,8 +1937,7 @@ int hugetlb_overcommit_handler(struct ctl_table *table, int write,
1938 unsigned long tmp; 1937 unsigned long tmp;
1939 int ret; 1938 int ret;
1940 1939
1941 if (!write) 1940 tmp = h->nr_overcommit_huge_pages;
1942 tmp = h->nr_overcommit_huge_pages;
1943 1941
1944 if (write && h->order >= MAX_ORDER) 1942 if (write && h->order >= MAX_ORDER)
1945 return -EINVAL; 1943 return -EINVAL;
@@ -2045,7 +2043,7 @@ static void hugetlb_vm_op_open(struct vm_area_struct *vma)
2045 * This new VMA should share its siblings reservation map if present. 2043 * This new VMA should share its siblings reservation map if present.
2046 * The VMA will only ever have a valid reservation map pointer where 2044 * The VMA will only ever have a valid reservation map pointer where
2047 * it is being copied for another still existing VMA. As that VMA 2045 * it is being copied for another still existing VMA. As that VMA
2048 * has a reference to the reservation map it cannot dissappear until 2046 * has a reference to the reservation map it cannot disappear until
2049 * after this open call completes. It is therefore safe to take a 2047 * after this open call completes. It is therefore safe to take a
2050 * new reference here without additional locking. 2048 * new reference here without additional locking.
2051 */ 2049 */
@@ -2492,7 +2490,7 @@ static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
2492 /* 2490 /*
2493 * Currently, we are forced to kill the process in the event the 2491 * Currently, we are forced to kill the process in the event the
2494 * original mapper has unmapped pages from the child due to a failed 2492 * original mapper has unmapped pages from the child due to a failed
2495 * COW. Warn that such a situation has occured as it may not be obvious 2493 * COW. Warn that such a situation has occurred as it may not be obvious
2496 */ 2494 */
2497 if (is_vma_resv_set(vma, HPAGE_RESV_UNMAPPED)) { 2495 if (is_vma_resv_set(vma, HPAGE_RESV_UNMAPPED)) {
2498 printk(KERN_WARNING 2496 printk(KERN_WARNING
diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c
index 0948f1072d6b..c7fc7fd00e32 100644
--- a/mm/hwpoison-inject.c
+++ b/mm/hwpoison-inject.c
@@ -1,4 +1,4 @@
1/* Inject a hwpoison memory failure on a arbitary pfn */ 1/* Inject a hwpoison memory failure on a arbitrary pfn */
2#include <linux/module.h> 2#include <linux/module.h>
3#include <linux/debugfs.h> 3#include <linux/debugfs.h>
4#include <linux/kernel.h> 4#include <linux/kernel.h>
diff --git a/mm/internal.h b/mm/internal.h
index 3438dd43a062..9d0ced8e505e 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -162,7 +162,7 @@ static inline struct page *mem_map_offset(struct page *base, int offset)
162} 162}
163 163
164/* 164/*
165 * Iterator over all subpages withing the maximally aligned gigantic 165 * Iterator over all subpages within the maximally aligned gigantic
166 * page 'base'. Handle any discontiguity in the mem_map. 166 * page 'base'. Handle any discontiguity in the mem_map.
167 */ 167 */
168static inline struct page *mem_map_next(struct page *iter, 168static inline struct page *mem_map_next(struct page *iter,
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index 84225f3b7190..c1d5867543e4 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -265,7 +265,7 @@ static void kmemleak_disable(void);
265} while (0) 265} while (0)
266 266
267/* 267/*
268 * Macro invoked when a serious kmemleak condition occured and cannot be 268 * Macro invoked when a serious kmemleak condition occurred and cannot be
269 * recovered from. Kmemleak will be disabled and further allocation/freeing 269 * recovered from. Kmemleak will be disabled and further allocation/freeing
270 * tracing no longer available. 270 * tracing no longer available.
271 */ 271 */
@@ -1006,7 +1006,7 @@ static bool update_checksum(struct kmemleak_object *object)
1006 1006
1007/* 1007/*
1008 * Memory scanning is a long process and it needs to be interruptable. This 1008 * Memory scanning is a long process and it needs to be interruptable. This
1009 * function checks whether such interrupt condition occured. 1009 * function checks whether such interrupt condition occurred.
1010 */ 1010 */
1011static int scan_should_stop(void) 1011static int scan_should_stop(void)
1012{ 1012{
@@ -1733,7 +1733,7 @@ static int __init kmemleak_late_init(void)
1733 1733
1734 if (atomic_read(&kmemleak_error)) { 1734 if (atomic_read(&kmemleak_error)) {
1735 /* 1735 /*
1736 * Some error occured and kmemleak was disabled. There is a 1736 * Some error occurred and kmemleak was disabled. There is a
1737 * small chance that kmemleak_disable() was called immediately 1737 * small chance that kmemleak_disable() was called immediately
1738 * after setting kmemleak_initialized and we may end up with 1738 * after setting kmemleak_initialized and we may end up with
1739 * two clean-up threads but serialized by scan_mutex. 1739 * two clean-up threads but serialized by scan_mutex.
diff --git a/mm/ksm.c b/mm/ksm.c
index c2b2a94f9d67..942dfc73a2ff 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -301,20 +301,6 @@ static inline int in_stable_tree(struct rmap_item *rmap_item)
301 return rmap_item->address & STABLE_FLAG; 301 return rmap_item->address & STABLE_FLAG;
302} 302}
303 303
304static void hold_anon_vma(struct rmap_item *rmap_item,
305 struct anon_vma *anon_vma)
306{
307 rmap_item->anon_vma = anon_vma;
308 get_anon_vma(anon_vma);
309}
310
311static void ksm_drop_anon_vma(struct rmap_item *rmap_item)
312{
313 struct anon_vma *anon_vma = rmap_item->anon_vma;
314
315 drop_anon_vma(anon_vma);
316}
317
318/* 304/*
319 * ksmd, and unmerge_and_remove_all_rmap_items(), must not touch an mm's 305 * ksmd, and unmerge_and_remove_all_rmap_items(), must not touch an mm's
320 * page tables after it has passed through ksm_exit() - which, if necessary, 306 * page tables after it has passed through ksm_exit() - which, if necessary,
@@ -397,7 +383,7 @@ static void break_cow(struct rmap_item *rmap_item)
397 * It is not an accident that whenever we want to break COW 383 * It is not an accident that whenever we want to break COW
398 * to undo, we also need to drop a reference to the anon_vma. 384 * to undo, we also need to drop a reference to the anon_vma.
399 */ 385 */
400 ksm_drop_anon_vma(rmap_item); 386 put_anon_vma(rmap_item->anon_vma);
401 387
402 down_read(&mm->mmap_sem); 388 down_read(&mm->mmap_sem);
403 if (ksm_test_exit(mm)) 389 if (ksm_test_exit(mm))
@@ -466,7 +452,7 @@ static void remove_node_from_stable_tree(struct stable_node *stable_node)
466 ksm_pages_sharing--; 452 ksm_pages_sharing--;
467 else 453 else
468 ksm_pages_shared--; 454 ksm_pages_shared--;
469 ksm_drop_anon_vma(rmap_item); 455 put_anon_vma(rmap_item->anon_vma);
470 rmap_item->address &= PAGE_MASK; 456 rmap_item->address &= PAGE_MASK;
471 cond_resched(); 457 cond_resched();
472 } 458 }
@@ -554,7 +540,7 @@ static void remove_rmap_item_from_tree(struct rmap_item *rmap_item)
554 else 540 else
555 ksm_pages_shared--; 541 ksm_pages_shared--;
556 542
557 ksm_drop_anon_vma(rmap_item); 543 put_anon_vma(rmap_item->anon_vma);
558 rmap_item->address &= PAGE_MASK; 544 rmap_item->address &= PAGE_MASK;
559 545
560 } else if (rmap_item->address & UNSTABLE_FLAG) { 546 } else if (rmap_item->address & UNSTABLE_FLAG) {
@@ -734,7 +720,7 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page,
734 swapped = PageSwapCache(page); 720 swapped = PageSwapCache(page);
735 flush_cache_page(vma, addr, page_to_pfn(page)); 721 flush_cache_page(vma, addr, page_to_pfn(page));
736 /* 722 /*
737 * Ok this is tricky, when get_user_pages_fast() run it doesnt 723 * Ok this is tricky, when get_user_pages_fast() run it doesn't
738 * take any lock, therefore the check that we are going to make 724 * take any lock, therefore the check that we are going to make
739 * with the pagecount against the mapcount is racey and 725 * with the pagecount against the mapcount is racey and
740 * O_DIRECT can happen right after the check. 726 * O_DIRECT can happen right after the check.
@@ -949,7 +935,8 @@ static int try_to_merge_with_ksm_page(struct rmap_item *rmap_item,
949 goto out; 935 goto out;
950 936
951 /* Must get reference to anon_vma while still holding mmap_sem */ 937 /* Must get reference to anon_vma while still holding mmap_sem */
952 hold_anon_vma(rmap_item, vma->anon_vma); 938 rmap_item->anon_vma = vma->anon_vma;
939 get_anon_vma(vma->anon_vma);
953out: 940out:
954 up_read(&mm->mmap_sem); 941 up_read(&mm->mmap_sem);
955 return err; 942 return err;
diff --git a/mm/memblock.c b/mm/memblock.c
index 4618fda975a0..a0562d1a6ad4 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -58,28 +58,6 @@ static unsigned long __init_memblock memblock_addrs_overlap(phys_addr_t base1, p
58 return ((base1 < (base2 + size2)) && (base2 < (base1 + size1))); 58 return ((base1 < (base2 + size2)) && (base2 < (base1 + size1)));
59} 59}
60 60
61static long __init_memblock memblock_addrs_adjacent(phys_addr_t base1, phys_addr_t size1,
62 phys_addr_t base2, phys_addr_t size2)
63{
64 if (base2 == base1 + size1)
65 return 1;
66 else if (base1 == base2 + size2)
67 return -1;
68
69 return 0;
70}
71
72static long __init_memblock memblock_regions_adjacent(struct memblock_type *type,
73 unsigned long r1, unsigned long r2)
74{
75 phys_addr_t base1 = type->regions[r1].base;
76 phys_addr_t size1 = type->regions[r1].size;
77 phys_addr_t base2 = type->regions[r2].base;
78 phys_addr_t size2 = type->regions[r2].size;
79
80 return memblock_addrs_adjacent(base1, size1, base2, size2);
81}
82
83long __init_memblock memblock_overlaps_region(struct memblock_type *type, phys_addr_t base, phys_addr_t size) 61long __init_memblock memblock_overlaps_region(struct memblock_type *type, phys_addr_t base, phys_addr_t size)
84{ 62{
85 unsigned long i; 63 unsigned long i;
@@ -206,14 +184,13 @@ static void __init_memblock memblock_remove_region(struct memblock_type *type, u
206 type->regions[i].size = type->regions[i + 1].size; 184 type->regions[i].size = type->regions[i + 1].size;
207 } 185 }
208 type->cnt--; 186 type->cnt--;
209}
210 187
211/* Assumption: base addr of region 1 < base addr of region 2 */ 188 /* Special case for empty arrays */
212static void __init_memblock memblock_coalesce_regions(struct memblock_type *type, 189 if (type->cnt == 0) {
213 unsigned long r1, unsigned long r2) 190 type->cnt = 1;
214{ 191 type->regions[0].base = 0;
215 type->regions[r1].size += type->regions[r2].size; 192 type->regions[0].size = 0;
216 memblock_remove_region(type, r2); 193 }
217} 194}
218 195
219/* Defined below but needed now */ 196/* Defined below but needed now */
@@ -276,7 +253,7 @@ static int __init_memblock memblock_double_array(struct memblock_type *type)
276 return 0; 253 return 0;
277 254
278 /* Add the new reserved region now. Should not fail ! */ 255 /* Add the new reserved region now. Should not fail ! */
279 BUG_ON(memblock_add_region(&memblock.reserved, addr, new_size) < 0); 256 BUG_ON(memblock_add_region(&memblock.reserved, addr, new_size));
280 257
281 /* If the array wasn't our static init one, then free it. We only do 258 /* If the array wasn't our static init one, then free it. We only do
282 * that before SLAB is available as later on, we don't know whether 259 * that before SLAB is available as later on, we don't know whether
@@ -296,58 +273,99 @@ extern int __init_memblock __weak memblock_memory_can_coalesce(phys_addr_t addr1
296 return 1; 273 return 1;
297} 274}
298 275
299static long __init_memblock memblock_add_region(struct memblock_type *type, phys_addr_t base, phys_addr_t size) 276static long __init_memblock memblock_add_region(struct memblock_type *type,
277 phys_addr_t base, phys_addr_t size)
300{ 278{
301 unsigned long coalesced = 0; 279 phys_addr_t end = base + size;
302 long adjacent, i; 280 int i, slot = -1;
303
304 if ((type->cnt == 1) && (type->regions[0].size == 0)) {
305 type->regions[0].base = base;
306 type->regions[0].size = size;
307 return 0;
308 }
309 281
310 /* First try and coalesce this MEMBLOCK with another. */ 282 /* First try and coalesce this MEMBLOCK with others */
311 for (i = 0; i < type->cnt; i++) { 283 for (i = 0; i < type->cnt; i++) {
312 phys_addr_t rgnbase = type->regions[i].base; 284 struct memblock_region *rgn = &type->regions[i];
313 phys_addr_t rgnsize = type->regions[i].size; 285 phys_addr_t rend = rgn->base + rgn->size;
286
287 /* Exit if there's no possible hits */
288 if (rgn->base > end || rgn->size == 0)
289 break;
314 290
315 if ((rgnbase == base) && (rgnsize == size)) 291 /* Check if we are fully enclosed within an existing
316 /* Already have this region, so we're done */ 292 * block
293 */
294 if (rgn->base <= base && rend >= end)
317 return 0; 295 return 0;
318 296
319 adjacent = memblock_addrs_adjacent(base, size, rgnbase, rgnsize); 297 /* Check if we overlap or are adjacent with the bottom
320 /* Check if arch allows coalescing */ 298 * of a block.
321 if (adjacent != 0 && type == &memblock.memory && 299 */
322 !memblock_memory_can_coalesce(base, size, rgnbase, rgnsize)) 300 if (base < rgn->base && end >= rgn->base) {
323 break; 301 /* If we can't coalesce, create a new block */
324 if (adjacent > 0) { 302 if (!memblock_memory_can_coalesce(base, size,
325 type->regions[i].base -= size; 303 rgn->base,
326 type->regions[i].size += size; 304 rgn->size)) {
327 coalesced++; 305 /* Overlap & can't coalesce are mutually
328 break; 306 * exclusive, if you do that, be prepared
329 } else if (adjacent < 0) { 307 * for trouble
330 type->regions[i].size += size; 308 */
331 coalesced++; 309 WARN_ON(end != rgn->base);
332 break; 310 goto new_block;
311 }
312 /* We extend the bottom of the block down to our
313 * base
314 */
315 rgn->base = base;
316 rgn->size = rend - base;
317
318 /* Return if we have nothing else to allocate
319 * (fully coalesced)
320 */
321 if (rend >= end)
322 return 0;
323
324 /* We continue processing from the end of the
325 * coalesced block.
326 */
327 base = rend;
328 size = end - base;
329 }
330
331 /* Now check if we overlap or are adjacent with the
332 * top of a block
333 */
334 if (base <= rend && end >= rend) {
335 /* If we can't coalesce, create a new block */
336 if (!memblock_memory_can_coalesce(rgn->base,
337 rgn->size,
338 base, size)) {
339 /* Overlap & can't coalesce are mutually
340 * exclusive, if you do that, be prepared
341 * for trouble
342 */
343 WARN_ON(rend != base);
344 goto new_block;
345 }
346 /* We adjust our base down to enclose the
347 * original block and destroy it. It will be
348 * part of our new allocation. Since we've
349 * freed an entry, we know we won't fail
350 * to allocate one later, so we won't risk
351 * losing the original block allocation.
352 */
353 size += (base - rgn->base);
354 base = rgn->base;
355 memblock_remove_region(type, i--);
333 } 356 }
334 } 357 }
335 358
336 /* If we plugged a hole, we may want to also coalesce with the 359 /* If the array is empty, special case, replace the fake
337 * next region 360 * filler region and return
338 */ 361 */
339 if ((i < type->cnt - 1) && memblock_regions_adjacent(type, i, i+1) && 362 if ((type->cnt == 1) && (type->regions[0].size == 0)) {
340 ((type != &memblock.memory || memblock_memory_can_coalesce(type->regions[i].base, 363 type->regions[0].base = base;
341 type->regions[i].size, 364 type->regions[0].size = size;
342 type->regions[i+1].base, 365 return 0;
343 type->regions[i+1].size)))) {
344 memblock_coalesce_regions(type, i, i+1);
345 coalesced++;
346 } 366 }
347 367
348 if (coalesced) 368 new_block:
349 return coalesced;
350
351 /* If we are out of space, we fail. It's too late to resize the array 369 /* If we are out of space, we fail. It's too late to resize the array
352 * but then this shouldn't have happened in the first place. 370 * but then this shouldn't have happened in the first place.
353 */ 371 */
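
The rewritten memblock_add_region() above folds every existing region that overlaps or touches the new range into it, keeping the array sorted with no stale entries left behind. As a rough illustration only (the file name, the regs[]/add_range() names and the fixed-size array below are invented for this sketch and are not the kernel code), the same coalescing idea can be modelled in userspace like this:

/* region_add.c - toy model of inserting a range into a sorted region array
 * with coalescing. Illustrative userspace sketch only, not kernel code. */
#include <stdio.h>
#include <string.h>

struct region { unsigned long base, size; };

static struct region regs[16];
static int nregs;

/* Insert [base, base+size) keeping the array sorted and merged. */
static void add_range(unsigned long base, unsigned long size)
{
    unsigned long end = base + size;
    int i;

    /* Swallow every existing region that overlaps or touches the new one. */
    for (i = 0; i < nregs; ) {
        unsigned long rend = regs[i].base + regs[i].size;

        if (regs[i].base > end || rend < base) {
            i++;                            /* disjoint, keep it */
            continue;
        }
        if (regs[i].base < base)            /* grow the new range downwards */
            base = regs[i].base;
        if (rend > end)                     /* grow the new range upwards */
            end = rend;
        memmove(&regs[i], &regs[i + 1], (nregs - i - 1) * sizeof(regs[0]));
        nregs--;
    }
    /* Find the insertion slot and shift the tail up by one. */
    for (i = 0; i < nregs && regs[i].base < base; i++)
        ;
    memmove(&regs[i + 1], &regs[i], (nregs - i) * sizeof(regs[0]));
    regs[i].base = base;
    regs[i].size = end - base;
    nregs++;
}

int main(void)
{
    add_range(0x1000, 0x1000);
    add_range(0x3000, 0x1000);
    add_range(0x1800, 0x2000);              /* bridges the two regions above */
    for (int i = 0; i < nregs; i++)
        printf("[%#lx - %#lx)\n", regs[i].base, regs[i].base + regs[i].size);
    return 0;
}

Running it prints a single coalesced region, [0x1000 - 0x4000), once the third insertion bridges the first two.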
@@ -362,13 +380,14 @@ static long __init_memblock memblock_add_region(struct memblock_type *type, phys
362 } else { 380 } else {
363 type->regions[i+1].base = base; 381 type->regions[i+1].base = base;
364 type->regions[i+1].size = size; 382 type->regions[i+1].size = size;
383 slot = i + 1;
365 break; 384 break;
366 } 385 }
367 } 386 }
368
369 if (base < type->regions[0].base) { 387 if (base < type->regions[0].base) {
370 type->regions[0].base = base; 388 type->regions[0].base = base;
371 type->regions[0].size = size; 389 type->regions[0].size = size;
390 slot = 0;
372 } 391 }
373 type->cnt++; 392 type->cnt++;
374 393
@@ -376,7 +395,8 @@ static long __init_memblock memblock_add_region(struct memblock_type *type, phys
376 * our allocation and return an error 395 * our allocation and return an error
377 */ 396 */
378 if (type->cnt == type->max && memblock_double_array(type)) { 397 if (type->cnt == type->max && memblock_double_array(type)) {
379 type->cnt--; 398 BUG_ON(slot < 0);
399 memblock_remove_region(type, slot);
380 return -1; 400 return -1;
381 } 401 }
382 402
@@ -389,52 +409,55 @@ long __init_memblock memblock_add(phys_addr_t base, phys_addr_t size)
389 409
390} 410}
391 411
392static long __init_memblock __memblock_remove(struct memblock_type *type, phys_addr_t base, phys_addr_t size) 412static long __init_memblock __memblock_remove(struct memblock_type *type,
413 phys_addr_t base, phys_addr_t size)
393{ 414{
394 phys_addr_t rgnbegin, rgnend;
395 phys_addr_t end = base + size; 415 phys_addr_t end = base + size;
396 int i; 416 int i;
397 417
398 rgnbegin = rgnend = 0; /* supress gcc warnings */ 418 /* Walk through the array for collisions */
399 419 for (i = 0; i < type->cnt; i++) {
400 /* Find the region where (base, size) belongs to */ 420 struct memblock_region *rgn = &type->regions[i];
401 for (i=0; i < type->cnt; i++) { 421 phys_addr_t rend = rgn->base + rgn->size;
402 rgnbegin = type->regions[i].base;
403 rgnend = rgnbegin + type->regions[i].size;
404 422
405 if ((rgnbegin <= base) && (end <= rgnend)) 423 /* Nothing more to do, exit */
424 if (rgn->base > end || rgn->size == 0)
406 break; 425 break;
407 }
408 426
409 /* Didn't find the region */ 427 /* If we fully enclose the block, drop it */
410 if (i == type->cnt) 428 if (base <= rgn->base && end >= rend) {
411 return -1; 429 memblock_remove_region(type, i--);
430 continue;
431 }
412 432
413 /* Check to see if we are removing entire region */ 433 /* If we are fully enclosed within a block
414 if ((rgnbegin == base) && (rgnend == end)) { 434 * then we need to split it and we are done
415 memblock_remove_region(type, i); 435 */
416 return 0; 436 if (base > rgn->base && end < rend) {
417 } 437 rgn->size = base - rgn->base;
438 if (!memblock_add_region(type, end, rend - end))
439 return 0;
440 /* Failure to split is bad, we at least
441 * restore the block before erroring
442 */
443 rgn->size = rend - rgn->base;
444 WARN_ON(1);
445 return -1;
446 }
418 447
419 /* Check to see if region is matching at the front */ 448 /* Check if we need to trim the bottom of a block */
420 if (rgnbegin == base) { 449 if (rgn->base < end && rend > end) {
421 type->regions[i].base = end; 450 rgn->size -= end - rgn->base;
422 type->regions[i].size -= size; 451 rgn->base = end;
423 return 0; 452 break;
424 } 453 }
425 454
426 /* Check to see if the region is matching at the end */ 455 /* And check if we need to trim the top of a block */
427 if (rgnend == end) { 456 if (base < rend)
428 type->regions[i].size -= size; 457 rgn->size -= rend - base;
429 return 0;
430 }
431 458
432 /* 459 }
433 * We need to split the entry - adjust the current one to the 460 return 0;
434 * beginging of the hole and add the region after hole.
435 */
436 type->regions[i].size = base - type->regions[i].base;
437 return memblock_add_region(type, end, rgnend - end);
438} 461}
439 462
440long __init_memblock memblock_remove(phys_addr_t base, phys_addr_t size) 463long __init_memblock memblock_remove(phys_addr_t base, phys_addr_t size)
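
The reworked __memblock_remove() above distinguishes four cases per region: fully enclosed (drop it), hole strictly inside (split it), and the two partial overlaps (trim the front or the tail). A standalone toy version of that case analysis, again with invented names and no relation to the real memblock data structures, might read:

/* remove_range.c - toy model of punching a hole out of a sorted, disjoint
 * region array, mirroring the enclose/split/trim cases above. Sketch only. */
#include <stdio.h>
#include <string.h>

struct region { unsigned long base, size; };

static struct region regs[16] = { { 0x1000, 0x3000 } };     /* [0x1000,0x4000) */
static int nregs = 1;

static void remove_range(unsigned long base, unsigned long size)
{
    unsigned long end = base + size;
    int i;

    for (i = 0; i < nregs; i++) {
        unsigned long rbase = regs[i].base;
        unsigned long rend = rbase + regs[i].size;

        if (rbase >= end || rend <= base)
            continue;                       /* disjoint, nothing to do */

        if (base <= rbase && end >= rend) {
            /* The hole fully encloses this region: drop it. */
            memmove(&regs[i], &regs[i + 1],
                    (nregs - i - 1) * sizeof(regs[0]));
            nregs--;
            i--;
        } else if (base > rbase && end < rend) {
            /* The hole is strictly inside: split into two pieces. */
            memmove(&regs[i + 2], &regs[i + 1],
                    (nregs - i - 1) * sizeof(regs[0]));
            regs[i].size = base - rbase;            /* lower piece */
            regs[i + 1].base = end;                 /* upper piece */
            regs[i + 1].size = rend - end;
            nregs++;
            return;     /* regions are disjoint, nothing else overlaps */
        } else if (base <= rbase) {
            /* Hole overlaps the bottom: trim the front. */
            regs[i].base = end;
            regs[i].size = rend - end;
        } else {
            /* Hole overlaps the top: trim the tail. */
            regs[i].size = base - rbase;
        }
    }
}

int main(void)
{
    remove_range(0x2000, 0x800);            /* splits the single region */
    for (int i = 0; i < nregs; i++)
        printf("[%#lx - %#lx)\n", regs[i].base, regs[i].base + regs[i].size);
    return 0;
}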
@@ -467,7 +490,7 @@ phys_addr_t __init __memblock_alloc_base(phys_addr_t size, phys_addr_t align, ph
467 490
468 found = memblock_find_base(size, align, 0, max_addr); 491 found = memblock_find_base(size, align, 0, max_addr);
469 if (found != MEMBLOCK_ERROR && 492 if (found != MEMBLOCK_ERROR &&
470 memblock_add_region(&memblock.reserved, found, size) >= 0) 493 !memblock_add_region(&memblock.reserved, found, size))
471 return found; 494 return found;
472 495
473 return 0; 496 return 0;
@@ -548,7 +571,7 @@ static phys_addr_t __init memblock_alloc_nid_region(struct memblock_region *mp,
548 if (this_nid == nid) { 571 if (this_nid == nid) {
549 phys_addr_t ret = memblock_find_region(start, this_end, size, align); 572 phys_addr_t ret = memblock_find_region(start, this_end, size, align);
550 if (ret != MEMBLOCK_ERROR && 573 if (ret != MEMBLOCK_ERROR &&
551 memblock_add_region(&memblock.reserved, ret, size) >= 0) 574 !memblock_add_region(&memblock.reserved, ret, size))
552 return ret; 575 return ret;
553 } 576 }
554 start = this_end; 577 start = this_end;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index da53a252b259..010f9166fa6e 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -73,15 +73,6 @@ static int really_do_swap_account __initdata = 0;
73#define do_swap_account (0) 73#define do_swap_account (0)
74#endif 74#endif
75 75
76/*
77 * Per memcg event counter is incremented at every pagein/pageout. This counter
78 * is used for trigger some periodic events. This is straightforward and better
79 * than using jiffies etc. to handle periodic memcg event.
80 *
81 * These values will be used as !((event) & ((1 <<(thresh)) - 1))
82 */
83#define THRESHOLDS_EVENTS_THRESH (7) /* once in 128 */
84#define SOFTLIMIT_EVENTS_THRESH (10) /* once in 1024 */
85 76
86/* 77/*
87 * Statistics for memory cgroup. 78 * Statistics for memory cgroup.
@@ -93,19 +84,36 @@ enum mem_cgroup_stat_index {
93 MEM_CGROUP_STAT_CACHE, /* # of pages charged as cache */ 84 MEM_CGROUP_STAT_CACHE, /* # of pages charged as cache */
94 MEM_CGROUP_STAT_RSS, /* # of pages charged as anon rss */ 85 MEM_CGROUP_STAT_RSS, /* # of pages charged as anon rss */
95 MEM_CGROUP_STAT_FILE_MAPPED, /* # of pages charged as file rss */ 86 MEM_CGROUP_STAT_FILE_MAPPED, /* # of pages charged as file rss */
96 MEM_CGROUP_STAT_PGPGIN_COUNT, /* # of pages paged in */
97 MEM_CGROUP_STAT_PGPGOUT_COUNT, /* # of pages paged out */
98 MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */ 87 MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */
99 MEM_CGROUP_STAT_DATA, /* end of data requires synchronization */ 88 MEM_CGROUP_STAT_DATA, /* end of data requires synchronization */
100 /* incremented at every pagein/pageout */
101 MEM_CGROUP_EVENTS = MEM_CGROUP_STAT_DATA,
102 MEM_CGROUP_ON_MOVE, /* someone is moving account between groups */ 89 MEM_CGROUP_ON_MOVE, /* someone is moving account between groups */
103
104 MEM_CGROUP_STAT_NSTATS, 90 MEM_CGROUP_STAT_NSTATS,
105}; 91};
106 92
93enum mem_cgroup_events_index {
94 MEM_CGROUP_EVENTS_PGPGIN, /* # of pages paged in */
95 MEM_CGROUP_EVENTS_PGPGOUT, /* # of pages paged out */
96 MEM_CGROUP_EVENTS_COUNT, /* # of pages paged in/out */
97 MEM_CGROUP_EVENTS_NSTATS,
98};
99/*
100 * Per memcg event counter is incremented at every pagein/pageout. With THP,
101 * it will be incremented by the number of pages. This counter is used to
102 * trigger some periodic events. This is straightforward and better
103 * than using jiffies etc. to handle periodic memcg event.
104 */
105enum mem_cgroup_events_target {
106 MEM_CGROUP_TARGET_THRESH,
107 MEM_CGROUP_TARGET_SOFTLIMIT,
108 MEM_CGROUP_NTARGETS,
109};
110#define THRESHOLDS_EVENTS_TARGET (128)
111#define SOFTLIMIT_EVENTS_TARGET (1024)
112
107struct mem_cgroup_stat_cpu { 113struct mem_cgroup_stat_cpu {
108 s64 count[MEM_CGROUP_STAT_NSTATS]; 114 long count[MEM_CGROUP_STAT_NSTATS];
115 unsigned long events[MEM_CGROUP_EVENTS_NSTATS];
116 unsigned long targets[MEM_CGROUP_NTARGETS];
109}; 117};
110 118
111/* 119/*
@@ -218,12 +226,6 @@ struct mem_cgroup {
218 * per zone LRU lists. 226 * per zone LRU lists.
219 */ 227 */
220 struct mem_cgroup_lru_info info; 228 struct mem_cgroup_lru_info info;
221
222 /*
223 protect against reclaim related member.
224 */
225 spinlock_t reclaim_param_lock;
226
227 /* 229 /*
228 * While reclaiming in a hierarchy, we cache the last child we 230 * While reclaiming in a hierarchy, we cache the last child we
229 * reclaimed from. 231 * reclaimed from.
@@ -327,13 +329,6 @@ enum charge_type {
327 NR_CHARGE_TYPE, 329 NR_CHARGE_TYPE,
328}; 330};
329 331
330/* only for here (for easy reading.) */
331#define PCGF_CACHE (1UL << PCG_CACHE)
332#define PCGF_USED (1UL << PCG_USED)
333#define PCGF_LOCK (1UL << PCG_LOCK)
334/* Not used, but added here for completeness */
335#define PCGF_ACCT (1UL << PCG_ACCT)
336
337/* for encoding cft->private value on file */ 332/* for encoding cft->private value on file */
338#define _MEM (0) 333#define _MEM (0)
339#define _MEMSWAP (1) 334#define _MEMSWAP (1)
@@ -371,14 +366,10 @@ struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *mem)
371} 366}
372 367
373static struct mem_cgroup_per_zone * 368static struct mem_cgroup_per_zone *
374page_cgroup_zoneinfo(struct page_cgroup *pc) 369page_cgroup_zoneinfo(struct mem_cgroup *mem, struct page *page)
375{ 370{
376 struct mem_cgroup *mem = pc->mem_cgroup; 371 int nid = page_to_nid(page);
377 int nid = page_cgroup_nid(pc); 372 int zid = page_zonenum(page);
378 int zid = page_cgroup_zid(pc);
379
380 if (!mem)
381 return NULL;
382 373
383 return mem_cgroup_zoneinfo(mem, nid, zid); 374 return mem_cgroup_zoneinfo(mem, nid, zid);
384} 375}
@@ -504,11 +495,6 @@ static void mem_cgroup_remove_from_trees(struct mem_cgroup *mem)
504 } 495 }
505} 496}
506 497
507static inline unsigned long mem_cgroup_get_excess(struct mem_cgroup *mem)
508{
509 return res_counter_soft_limit_excess(&mem->res) >> PAGE_SHIFT;
510}
511
512static struct mem_cgroup_per_zone * 498static struct mem_cgroup_per_zone *
513__mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz) 499__mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
514{ 500{
@@ -565,11 +551,11 @@ mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
565 * common workload, threshold and synchronization as vmstat[] should be 551 * common workload, threshold and synchronization as vmstat[] should be
566 * implemented. 552 * implemented.
567 */ 553 */
568static s64 mem_cgroup_read_stat(struct mem_cgroup *mem, 554static long mem_cgroup_read_stat(struct mem_cgroup *mem,
569 enum mem_cgroup_stat_index idx) 555 enum mem_cgroup_stat_index idx)
570{ 556{
557 long val = 0;
571 int cpu; 558 int cpu;
572 s64 val = 0;
573 559
574 get_online_cpus(); 560 get_online_cpus();
575 for_each_online_cpu(cpu) 561 for_each_online_cpu(cpu)
@@ -583,9 +569,9 @@ static s64 mem_cgroup_read_stat(struct mem_cgroup *mem,
583 return val; 569 return val;
584} 570}
585 571
586static s64 mem_cgroup_local_usage(struct mem_cgroup *mem) 572static long mem_cgroup_local_usage(struct mem_cgroup *mem)
587{ 573{
588 s64 ret; 574 long ret;
589 575
590 ret = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_RSS); 576 ret = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_RSS);
591 ret += mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_CACHE); 577 ret += mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_CACHE);
@@ -599,6 +585,22 @@ static void mem_cgroup_swap_statistics(struct mem_cgroup *mem,
599 this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_SWAPOUT], val); 585 this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_SWAPOUT], val);
600} 586}
601 587
588static unsigned long mem_cgroup_read_events(struct mem_cgroup *mem,
589 enum mem_cgroup_events_index idx)
590{
591 unsigned long val = 0;
592 int cpu;
593
594 for_each_online_cpu(cpu)
595 val += per_cpu(mem->stat->events[idx], cpu);
596#ifdef CONFIG_HOTPLUG_CPU
597 spin_lock(&mem->pcp_counter_lock);
598 val += mem->nocpu_base.events[idx];
599 spin_unlock(&mem->pcp_counter_lock);
600#endif
601 return val;
602}
603
602static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, 604static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
603 bool file, int nr_pages) 605 bool file, int nr_pages)
604{ 606{
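
The new mem_cgroup_read_events() follows the usual per-CPU counter pattern: each writer bumps only its own CPU's slot, and readers sum all slots plus a base that collects the counts of CPUs that have gone away (see the mem_cgroup_drain_pcp_counter() hunk further down). A simplified single-threaded model of that pattern, with illustrative names only, could be:

/* percpu_sum.c - toy illustration of per-CPU event counters summed on read,
 * plus folding a departing CPU's counts into a shared base. Sketch only. */
#include <stdio.h>

#define NR_CPUS     4
#define NR_EVENTS   2

static unsigned long events[NR_CPUS][NR_EVENTS];    /* per-cpu counters */
static unsigned long nocpu_base[NR_EVENTS];         /* folded-in offline CPUs */

static void count_event(int cpu, int idx, unsigned long nr)
{
    events[cpu][idx] += nr;         /* a writer touches only its own slot */
}

static unsigned long read_events(int idx)
{
    unsigned long val = nocpu_base[idx];

    for (int cpu = 0; cpu < NR_CPUS; cpu++)
        val += events[cpu][idx];    /* a reader tolerates being slightly stale */
    return val;
}

/* Rough analogue of draining a hot-unplugged CPU so its counts survive. */
static void drain_cpu(int cpu)
{
    for (int idx = 0; idx < NR_EVENTS; idx++) {
        nocpu_base[idx] += events[cpu][idx];
        events[cpu][idx] = 0;
    }
}

int main(void)
{
    count_event(0, 0, 3);
    count_event(1, 0, 5);
    drain_cpu(1);
    printf("event 0 total: %lu\n", read_events(0));     /* prints 8 */
    return 0;
}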
@@ -611,13 +613,13 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
611 613
612 /* pagein of a big page is an event. So, ignore page size */ 614 /* pagein of a big page is an event. So, ignore page size */
613 if (nr_pages > 0) 615 if (nr_pages > 0)
614 __this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_PGPGIN_COUNT]); 616 __this_cpu_inc(mem->stat->events[MEM_CGROUP_EVENTS_PGPGIN]);
615 else { 617 else {
616 __this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_PGPGOUT_COUNT]); 618 __this_cpu_inc(mem->stat->events[MEM_CGROUP_EVENTS_PGPGOUT]);
617 nr_pages = -nr_pages; /* for event */ 619 nr_pages = -nr_pages; /* for event */
618 } 620 }
619 621
620 __this_cpu_add(mem->stat->count[MEM_CGROUP_EVENTS], nr_pages); 622 __this_cpu_add(mem->stat->events[MEM_CGROUP_EVENTS_COUNT], nr_pages);
621 623
622 preempt_enable(); 624 preempt_enable();
623} 625}
@@ -637,13 +639,34 @@ static unsigned long mem_cgroup_get_local_zonestat(struct mem_cgroup *mem,
637 return total; 639 return total;
638} 640}
639 641
640static bool __memcg_event_check(struct mem_cgroup *mem, int event_mask_shift) 642static bool __memcg_event_check(struct mem_cgroup *mem, int target)
641{ 643{
642 s64 val; 644 unsigned long val, next;
645
646 val = this_cpu_read(mem->stat->events[MEM_CGROUP_EVENTS_COUNT]);
647 next = this_cpu_read(mem->stat->targets[target]);
648 /* from time_after() in jiffies.h */
649 return ((long)next - (long)val < 0);
650}
651
652static void __mem_cgroup_target_update(struct mem_cgroup *mem, int target)
653{
654 unsigned long val, next;
643 655
644 val = this_cpu_read(mem->stat->count[MEM_CGROUP_EVENTS]); 656 val = this_cpu_read(mem->stat->events[MEM_CGROUP_EVENTS_COUNT]);
645 657
646 return !(val & ((1 << event_mask_shift) - 1)); 658 switch (target) {
659 case MEM_CGROUP_TARGET_THRESH:
660 next = val + THRESHOLDS_EVENTS_TARGET;
661 break;
662 case MEM_CGROUP_TARGET_SOFTLIMIT:
663 next = val + SOFTLIMIT_EVENTS_TARGET;
664 break;
665 default:
666 return;
667 }
668
669 this_cpu_write(mem->stat->targets[target], next);
647} 670}
648 671
649/* 672/*
@@ -653,10 +676,15 @@ static bool __memcg_event_check(struct mem_cgroup *mem, int event_mask_shift)
653static void memcg_check_events(struct mem_cgroup *mem, struct page *page) 676static void memcg_check_events(struct mem_cgroup *mem, struct page *page)
654{ 677{
655 /* threshold event is triggered in finer grain than soft limit */ 678 /* threshold event is triggered in finer grain than soft limit */
656 if (unlikely(__memcg_event_check(mem, THRESHOLDS_EVENTS_THRESH))) { 679 if (unlikely(__memcg_event_check(mem, MEM_CGROUP_TARGET_THRESH))) {
657 mem_cgroup_threshold(mem); 680 mem_cgroup_threshold(mem);
658 if (unlikely(__memcg_event_check(mem, SOFTLIMIT_EVENTS_THRESH))) 681 __mem_cgroup_target_update(mem, MEM_CGROUP_TARGET_THRESH);
682 if (unlikely(__memcg_event_check(mem,
683 MEM_CGROUP_TARGET_SOFTLIMIT))){
659 mem_cgroup_update_tree(mem, page); 684 mem_cgroup_update_tree(mem, page);
685 __mem_cgroup_target_update(mem,
686 MEM_CGROUP_TARGET_SOFTLIMIT);
687 }
660 } 688 }
661} 689}
662 690
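
The THRESHOLDS/SOFTLIMIT rework above replaces the old power-of-two mask test with a free-running event counter and per-target "next" values compared time_after()-style, so each target fires once per interval and is then pushed forward by its own step. A hedged userspace sketch of that pattern (names, constants and output are illustrative, not the memcg code):

/* event_target.c - toy model of a free-running counter checked against
 * per-target "next" values with wraparound-safe signed comparison. */
#include <stdio.h>
#include <stdbool.h>

#define THRESH_TARGET       128UL
#define SOFTLIMIT_TARGET    1024UL

static unsigned long events;            /* incremented per page in/out */
static unsigned long next_thresh;
static unsigned long next_softlimit;

static bool target_crossed(unsigned long next)
{
    /* Same trick as time_after(): still correct across counter wraparound. */
    return (long)(next - events) < 0;
}

static void account_pages(unsigned long nr)
{
    events += nr;
    if (target_crossed(next_thresh)) {
        printf("threshold event at %lu\n", events);
        next_thresh = events + THRESH_TARGET;
        if (target_crossed(next_softlimit)) {
            printf("softlimit event at %lu\n", events);
            next_softlimit = events + SOFTLIMIT_TARGET;
        }
    }
}

int main(void)
{
    for (int i = 0; i < 300; i++)
        account_pages(1);
    return 0;
}

The softlimit target is only re-evaluated when the finer-grained threshold target fires, which mirrors the nesting in memcg_check_events() above.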
@@ -815,7 +843,7 @@ void mem_cgroup_del_lru_list(struct page *page, enum lru_list lru)
815 * We don't check PCG_USED bit. It's cleared when the "page" is finally 843 * We don't check PCG_USED bit. It's cleared when the "page" is finally
816 * removed from global LRU. 844 * removed from global LRU.
817 */ 845 */
818 mz = page_cgroup_zoneinfo(pc); 846 mz = page_cgroup_zoneinfo(pc->mem_cgroup, page);
819 /* huge page split is done under lru_lock. so, we have no races. */ 847 /* huge page split is done under lru_lock. so, we have no races. */
820 MEM_CGROUP_ZSTAT(mz, lru) -= 1 << compound_order(page); 848 MEM_CGROUP_ZSTAT(mz, lru) -= 1 << compound_order(page);
821 if (mem_cgroup_is_root(pc->mem_cgroup)) 849 if (mem_cgroup_is_root(pc->mem_cgroup))
@@ -829,6 +857,32 @@ void mem_cgroup_del_lru(struct page *page)
829 mem_cgroup_del_lru_list(page, page_lru(page)); 857 mem_cgroup_del_lru_list(page, page_lru(page));
830} 858}
831 859
860/*
861 * Writeback is about to end against a page which has been marked for immediate
862 * reclaim. If it still appears to be reclaimable, move it to the tail of the
863 * inactive list.
864 */
865void mem_cgroup_rotate_reclaimable_page(struct page *page)
866{
867 struct mem_cgroup_per_zone *mz;
868 struct page_cgroup *pc;
869 enum lru_list lru = page_lru(page);
870
871 if (mem_cgroup_disabled())
872 return;
873
874 pc = lookup_page_cgroup(page);
875 /* unused or root page is not rotated. */
876 if (!PageCgroupUsed(pc))
877 return;
878 /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */
879 smp_rmb();
880 if (mem_cgroup_is_root(pc->mem_cgroup))
881 return;
882 mz = page_cgroup_zoneinfo(pc->mem_cgroup, page);
883 list_move_tail(&pc->lru, &mz->lists[lru]);
884}
885
832void mem_cgroup_rotate_lru_list(struct page *page, enum lru_list lru) 886void mem_cgroup_rotate_lru_list(struct page *page, enum lru_list lru)
833{ 887{
834 struct mem_cgroup_per_zone *mz; 888 struct mem_cgroup_per_zone *mz;
@@ -845,7 +899,7 @@ void mem_cgroup_rotate_lru_list(struct page *page, enum lru_list lru)
845 smp_rmb(); 899 smp_rmb();
846 if (mem_cgroup_is_root(pc->mem_cgroup)) 900 if (mem_cgroup_is_root(pc->mem_cgroup))
847 return; 901 return;
848 mz = page_cgroup_zoneinfo(pc); 902 mz = page_cgroup_zoneinfo(pc->mem_cgroup, page);
849 list_move(&pc->lru, &mz->lists[lru]); 903 list_move(&pc->lru, &mz->lists[lru]);
850} 904}
851 905
@@ -862,7 +916,7 @@ void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru)
862 return; 916 return;
863 /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */ 917 /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */
864 smp_rmb(); 918 smp_rmb();
865 mz = page_cgroup_zoneinfo(pc); 919 mz = page_cgroup_zoneinfo(pc->mem_cgroup, page);
866 /* huge page split is done under lru_lock. so, we have no races. */ 920 /* huge page split is done under lru_lock. so, we have no races. */
867 MEM_CGROUP_ZSTAT(mz, lru) += 1 << compound_order(page); 921 MEM_CGROUP_ZSTAT(mz, lru) += 1 << compound_order(page);
868 SetPageCgroupAcctLRU(pc); 922 SetPageCgroupAcctLRU(pc);
@@ -872,18 +926,28 @@ void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru)
872} 926}
873 927
874/* 928/*
875 * At handling SwapCache, pc->mem_cgroup may be changed while it's linked to 929 * At handling SwapCache and other FUSE stuff, pc->mem_cgroup may be changed
876 * lru because the page may.be reused after it's fully uncharged (because of 930 * while it's linked to lru because the page may be reused after it's fully
877 * SwapCache behavior).To handle that, unlink page_cgroup from LRU when charge 931 * uncharged. To handle that, unlink page_cgroup from LRU when charge it again.
878 * it again. This function is only used to charge SwapCache. It's done under 932 * It's done under lock_page and expected that zone->lru_lock is never held.
879 * lock_page and expected that zone->lru_lock is never held.
880 */ 933 */
881static void mem_cgroup_lru_del_before_commit_swapcache(struct page *page) 934static void mem_cgroup_lru_del_before_commit(struct page *page)
882{ 935{
883 unsigned long flags; 936 unsigned long flags;
884 struct zone *zone = page_zone(page); 937 struct zone *zone = page_zone(page);
885 struct page_cgroup *pc = lookup_page_cgroup(page); 938 struct page_cgroup *pc = lookup_page_cgroup(page);
886 939
940 /*
941 * Doing this check without taking ->lru_lock seems wrong but this
942 * is safe. Because if page_cgroup's USED bit is unset, the page
943 * will not be added to any memcg's LRU. If page_cgroup's USED bit is
944 * set, the commit after this will fail, anyway.
945 * This charge/uncharge is all done under some mutual exclusion.
946 * So, we don't need to take care of changes in the USED bit.
947 */
948 if (likely(!PageLRU(page)))
949 return;
950
887 spin_lock_irqsave(&zone->lru_lock, flags); 951 spin_lock_irqsave(&zone->lru_lock, flags);
888 /* 952 /*
889 * Forget old LRU when this page_cgroup is *not* used. This Used bit 953 * Forget old LRU when this page_cgroup is *not* used. This Used bit
@@ -894,12 +958,15 @@ static void mem_cgroup_lru_del_before_commit_swapcache(struct page *page)
894 spin_unlock_irqrestore(&zone->lru_lock, flags); 958 spin_unlock_irqrestore(&zone->lru_lock, flags);
895} 959}
896 960
897static void mem_cgroup_lru_add_after_commit_swapcache(struct page *page) 961static void mem_cgroup_lru_add_after_commit(struct page *page)
898{ 962{
899 unsigned long flags; 963 unsigned long flags;
900 struct zone *zone = page_zone(page); 964 struct zone *zone = page_zone(page);
901 struct page_cgroup *pc = lookup_page_cgroup(page); 965 struct page_cgroup *pc = lookup_page_cgroup(page);
902 966
967 /* take care of the case where the page is added to LRU while we commit it */
968 if (likely(!PageLRU(page)))
969 return;
903 spin_lock_irqsave(&zone->lru_lock, flags); 970 spin_lock_irqsave(&zone->lru_lock, flags);
904 /* link when the page is linked to LRU but page_cgroup isn't */ 971 /* link when the page is linked to LRU but page_cgroup isn't */
905 if (PageLRU(page) && !PageCgroupAcctLRU(pc)) 972 if (PageLRU(page) && !PageCgroupAcctLRU(pc))
@@ -1032,10 +1099,7 @@ mem_cgroup_get_reclaim_stat_from_page(struct page *page)
1032 return NULL; 1099 return NULL;
1033 /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */ 1100 /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */
1034 smp_rmb(); 1101 smp_rmb();
1035 mz = page_cgroup_zoneinfo(pc); 1102 mz = page_cgroup_zoneinfo(pc->mem_cgroup, page);
1036 if (!mz)
1037 return NULL;
1038
1039 return &mz->reclaim_stat; 1103 return &mz->reclaim_stat;
1040} 1104}
1041 1105
@@ -1067,9 +1131,11 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
1067 if (scan >= nr_to_scan) 1131 if (scan >= nr_to_scan)
1068 break; 1132 break;
1069 1133
1070 page = pc->page;
1071 if (unlikely(!PageCgroupUsed(pc))) 1134 if (unlikely(!PageCgroupUsed(pc)))
1072 continue; 1135 continue;
1136
1137 page = lookup_cgroup_page(pc);
1138
1073 if (unlikely(!PageLRU(page))) 1139 if (unlikely(!PageLRU(page)))
1074 continue; 1140 continue;
1075 1141
@@ -1101,49 +1167,32 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
1101#define mem_cgroup_from_res_counter(counter, member) \ 1167#define mem_cgroup_from_res_counter(counter, member) \
1102 container_of(counter, struct mem_cgroup, member) 1168 container_of(counter, struct mem_cgroup, member)
1103 1169
1104static bool mem_cgroup_check_under_limit(struct mem_cgroup *mem)
1105{
1106 if (do_swap_account) {
1107 if (res_counter_check_under_limit(&mem->res) &&
1108 res_counter_check_under_limit(&mem->memsw))
1109 return true;
1110 } else
1111 if (res_counter_check_under_limit(&mem->res))
1112 return true;
1113 return false;
1114}
1115
1116/** 1170/**
1117 * mem_cgroup_check_margin - check if the memory cgroup allows charging 1171 * mem_cgroup_margin - calculate chargeable space of a memory cgroup
1118 * @mem: memory cgroup to check 1172 * @mem: the memory cgroup
1119 * @bytes: the number of bytes the caller intends to charge
1120 * 1173 *
1121 * Returns a boolean value on whether @mem can be charged @bytes or 1174 * Returns the maximum amount of memory @mem can be charged with, in
1122 * whether this would exceed the limit. 1175 * pages.
1123 */ 1176 */
1124static bool mem_cgroup_check_margin(struct mem_cgroup *mem, unsigned long bytes) 1177static unsigned long mem_cgroup_margin(struct mem_cgroup *mem)
1125{ 1178{
1126 if (!res_counter_check_margin(&mem->res, bytes)) 1179 unsigned long long margin;
1127 return false; 1180
1128 if (do_swap_account && !res_counter_check_margin(&mem->memsw, bytes)) 1181 margin = res_counter_margin(&mem->res);
1129 return false; 1182 if (do_swap_account)
1130 return true; 1183 margin = min(margin, res_counter_margin(&mem->memsw));
1184 return margin >> PAGE_SHIFT;
1131} 1185}
1132 1186
1133static unsigned int get_swappiness(struct mem_cgroup *memcg) 1187static unsigned int get_swappiness(struct mem_cgroup *memcg)
1134{ 1188{
1135 struct cgroup *cgrp = memcg->css.cgroup; 1189 struct cgroup *cgrp = memcg->css.cgroup;
1136 unsigned int swappiness;
1137 1190
1138 /* root ? */ 1191 /* root ? */
1139 if (cgrp->parent == NULL) 1192 if (cgrp->parent == NULL)
1140 return vm_swappiness; 1193 return vm_swappiness;
1141 1194
1142 spin_lock(&memcg->reclaim_param_lock); 1195 return memcg->swappiness;
1143 swappiness = memcg->swappiness;
1144 spin_unlock(&memcg->reclaim_param_lock);
1145
1146 return swappiness;
1147} 1196}
1148 1197
1149static void mem_cgroup_start_move(struct mem_cgroup *mem) 1198static void mem_cgroup_start_move(struct mem_cgroup *mem)
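
mem_cgroup_margin() above reports the chargeable headroom in pages as the smaller of the res and memsw margins. The arithmetic is small enough to show in toy form; every number and name below is invented for the example:

/* margin.c - toy version of "headroom = min(res margin, memsw margin) in pages". */
#include <stdio.h>

#define PAGE_SHIFT 12

struct counter { unsigned long long usage, limit; };

static unsigned long long counter_margin(const struct counter *c)
{
    return c->usage < c->limit ? c->limit - c->usage : 0;
}

static unsigned long margin_pages(const struct counter *res,
                                  const struct counter *memsw,
                                  int do_swap_account)
{
    unsigned long long margin = counter_margin(res);

    if (do_swap_account && counter_margin(memsw) < margin)
        margin = counter_margin(memsw);
    return margin >> PAGE_SHIFT;            /* bytes -> pages */
}

int main(void)
{
    struct counter res   = { .usage = 90ULL << 20, .limit = 100ULL << 20 };
    struct counter memsw = { .usage = 98ULL << 20, .limit = 100ULL << 20 };

    printf("margin: %lu pages\n", margin_pages(&res, &memsw, 1));   /* 512 */
    return 0;
}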
@@ -1359,13 +1408,11 @@ mem_cgroup_select_victim(struct mem_cgroup *root_mem)
1359 1408
1360 rcu_read_unlock(); 1409 rcu_read_unlock();
1361 /* Updates scanning parameter */ 1410 /* Updates scanning parameter */
1362 spin_lock(&root_mem->reclaim_param_lock);
1363 if (!css) { 1411 if (!css) {
1364 /* this means start scan from ID:1 */ 1412 /* this means start scan from ID:1 */
1365 root_mem->last_scanned_child = 0; 1413 root_mem->last_scanned_child = 0;
1366 } else 1414 } else
1367 root_mem->last_scanned_child = found; 1415 root_mem->last_scanned_child = found;
1368 spin_unlock(&root_mem->reclaim_param_lock);
1369 } 1416 }
1370 1417
1371 return ret; 1418 return ret;
@@ -1394,7 +1441,9 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
1394 bool noswap = reclaim_options & MEM_CGROUP_RECLAIM_NOSWAP; 1441 bool noswap = reclaim_options & MEM_CGROUP_RECLAIM_NOSWAP;
1395 bool shrink = reclaim_options & MEM_CGROUP_RECLAIM_SHRINK; 1442 bool shrink = reclaim_options & MEM_CGROUP_RECLAIM_SHRINK;
1396 bool check_soft = reclaim_options & MEM_CGROUP_RECLAIM_SOFT; 1443 bool check_soft = reclaim_options & MEM_CGROUP_RECLAIM_SOFT;
1397 unsigned long excess = mem_cgroup_get_excess(root_mem); 1444 unsigned long excess;
1445
1446 excess = res_counter_soft_limit_excess(&root_mem->res) >> PAGE_SHIFT;
1398 1447
1399 /* If memsw_is_minimum==1, swap-out is of-no-use. */ 1448 /* If memsw_is_minimum==1, swap-out is of-no-use. */
1400 if (root_mem->memsw_is_minimum) 1449 if (root_mem->memsw_is_minimum)
@@ -1417,7 +1466,7 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
1417 break; 1466 break;
1418 } 1467 }
1419 /* 1468 /*
1420 * We want to do more targetted reclaim. 1469 * We want to do more targeted reclaim.
1421 * excess >> 2 is not too excessive so as to 1470 * excess >> 2 is not too excessive so as to
1422 * reclaim too much, nor so little that we keep 1471 * reclaim too much, nor so little that we keep
1423 * coming back to reclaim from this cgroup 1472 * coming back to reclaim from this cgroup
@@ -1451,9 +1500,9 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
1451 return ret; 1500 return ret;
1452 total += ret; 1501 total += ret;
1453 if (check_soft) { 1502 if (check_soft) {
1454 if (res_counter_check_under_soft_limit(&root_mem->res)) 1503 if (!res_counter_soft_limit_excess(&root_mem->res))
1455 return total; 1504 return total;
1456 } else if (mem_cgroup_check_under_limit(root_mem)) 1505 } else if (mem_cgroup_margin(root_mem))
1457 return 1 + total; 1506 return 1 + total;
1458 } 1507 }
1459 return total; 1508 return total;
@@ -1661,17 +1710,17 @@ EXPORT_SYMBOL(mem_cgroup_update_page_stat);
1661 * size of first charge trial. "32" comes from vmscan.c's magic value. 1710 * size of first charge trial. "32" comes from vmscan.c's magic value.
1662 * TODO: maybe necessary to use big numbers in big irons. 1711 * TODO: maybe necessary to use big numbers in big irons.
1663 */ 1712 */
1664#define CHARGE_SIZE (32 * PAGE_SIZE) 1713#define CHARGE_BATCH 32U
1665struct memcg_stock_pcp { 1714struct memcg_stock_pcp {
1666 struct mem_cgroup *cached; /* this never be root cgroup */ 1715 struct mem_cgroup *cached; /* this never be root cgroup */
1667 int charge; 1716 unsigned int nr_pages;
1668 struct work_struct work; 1717 struct work_struct work;
1669}; 1718};
1670static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock); 1719static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock);
1671static atomic_t memcg_drain_count; 1720static atomic_t memcg_drain_count;
1672 1721
1673/* 1722/*
1674 * Try to consume stocked charge on this cpu. If success, PAGE_SIZE is consumed 1723 * Try to consume stocked charge on this cpu. If success, one page is consumed
1675 * from local stock and true is returned. If the stock is 0 or charges from a 1724 * from local stock and true is returned. If the stock is 0 or charges from a
1676 * cgroup which is not current target, returns false. This stock will be 1725 * cgroup which is not current target, returns false. This stock will be
1677 * refilled. 1726 * refilled.
@@ -1682,8 +1731,8 @@ static bool consume_stock(struct mem_cgroup *mem)
1682 bool ret = true; 1731 bool ret = true;
1683 1732
1684 stock = &get_cpu_var(memcg_stock); 1733 stock = &get_cpu_var(memcg_stock);
1685 if (mem == stock->cached && stock->charge) 1734 if (mem == stock->cached && stock->nr_pages)
1686 stock->charge -= PAGE_SIZE; 1735 stock->nr_pages--;
1687 else /* need to call res_counter_charge */ 1736 else /* need to call res_counter_charge */
1688 ret = false; 1737 ret = false;
1689 put_cpu_var(memcg_stock); 1738 put_cpu_var(memcg_stock);
@@ -1697,13 +1746,15 @@ static void drain_stock(struct memcg_stock_pcp *stock)
1697{ 1746{
1698 struct mem_cgroup *old = stock->cached; 1747 struct mem_cgroup *old = stock->cached;
1699 1748
1700 if (stock->charge) { 1749 if (stock->nr_pages) {
1701 res_counter_uncharge(&old->res, stock->charge); 1750 unsigned long bytes = stock->nr_pages * PAGE_SIZE;
1751
1752 res_counter_uncharge(&old->res, bytes);
1702 if (do_swap_account) 1753 if (do_swap_account)
1703 res_counter_uncharge(&old->memsw, stock->charge); 1754 res_counter_uncharge(&old->memsw, bytes);
1755 stock->nr_pages = 0;
1704 } 1756 }
1705 stock->cached = NULL; 1757 stock->cached = NULL;
1706 stock->charge = 0;
1707} 1758}
1708 1759
1709/* 1760/*
@@ -1720,7 +1771,7 @@ static void drain_local_stock(struct work_struct *dummy)
1720 * Cache charges(val) which is from res_counter, to local per_cpu area. 1771 * Cache charges(val) which is from res_counter, to local per_cpu area.
1721 * This will be consumed by consume_stock() function, later. 1772 * This will be consumed by consume_stock() function, later.
1722 */ 1773 */
1723static void refill_stock(struct mem_cgroup *mem, int val) 1774static void refill_stock(struct mem_cgroup *mem, unsigned int nr_pages)
1724{ 1775{
1725 struct memcg_stock_pcp *stock = &get_cpu_var(memcg_stock); 1776 struct memcg_stock_pcp *stock = &get_cpu_var(memcg_stock);
1726 1777
@@ -1728,7 +1779,7 @@ static void refill_stock(struct mem_cgroup *mem, int val)
1728 drain_stock(stock); 1779 drain_stock(stock);
1729 stock->cached = mem; 1780 stock->cached = mem;
1730 } 1781 }
1731 stock->charge += val; 1782 stock->nr_pages += nr_pages;
1732 put_cpu_var(memcg_stock); 1783 put_cpu_var(memcg_stock);
1733} 1784}
1734 1785
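
consume_stock()/drain_stock()/refill_stock() above implement a per-CPU cache of pre-charged pages so that most single-page charges never touch the shared res_counter. Ignoring preemption and per-CPU plumbing, the idea can be sketched in plain C as follows; all names and the batch-of-32 policy here are illustrative, and the real code's fallback to a one-page charge is omitted:

/* stock.c - toy model of a per-cpu cache of pre-charged pages. Sketch only. */
#include <stdio.h>
#include <stdbool.h>

struct counter { unsigned long usage, limit; };
struct stock   { struct counter *cached; unsigned int nr_pages; };

static struct counter memcg = { .usage = 0, .limit = 1024 };
static struct stock stock;                  /* one per CPU in the kernel */

static bool consume_stock(struct counter *c)
{
    if (stock.cached == c && stock.nr_pages) {
        stock.nr_pages--;                   /* charge satisfied locally */
        return true;
    }
    return false;
}

static void drain_stock(void)
{
    if (stock.cached && stock.nr_pages)
        stock.cached->usage -= stock.nr_pages;      /* give charges back */
    stock.cached = NULL;
    stock.nr_pages = 0;
}

static void refill_stock(struct counter *c, unsigned int nr_pages)
{
    if (stock.cached != c)
        drain_stock();                      /* the stock belongs to one memcg */
    stock.cached = c;
    stock.nr_pages += nr_pages;
}

static bool charge_one_page(struct counter *c)
{
    if (consume_stock(c))
        return true;
    if (c->usage + 32 > c->limit)           /* no room for a whole batch */
        return false;
    c->usage += 32;                         /* charge a batch of 32 pages */
    refill_stock(c, 31);                    /* keep the surplus for later */
    return true;
}

int main(void)
{
    for (int i = 0; i < 40; i++)
        charge_one_page(&memcg);
    printf("usage=%lu stocked=%u\n", memcg.usage, stock.nr_pages);  /* 64 24 */
    return 0;
}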
@@ -1780,11 +1831,17 @@ static void mem_cgroup_drain_pcp_counter(struct mem_cgroup *mem, int cpu)
1780 1831
1781 spin_lock(&mem->pcp_counter_lock); 1832 spin_lock(&mem->pcp_counter_lock);
1782 for (i = 0; i < MEM_CGROUP_STAT_DATA; i++) { 1833 for (i = 0; i < MEM_CGROUP_STAT_DATA; i++) {
1783 s64 x = per_cpu(mem->stat->count[i], cpu); 1834 long x = per_cpu(mem->stat->count[i], cpu);
1784 1835
1785 per_cpu(mem->stat->count[i], cpu) = 0; 1836 per_cpu(mem->stat->count[i], cpu) = 0;
1786 mem->nocpu_base.count[i] += x; 1837 mem->nocpu_base.count[i] += x;
1787 } 1838 }
1839 for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) {
1840 unsigned long x = per_cpu(mem->stat->events[i], cpu);
1841
1842 per_cpu(mem->stat->events[i], cpu) = 0;
1843 mem->nocpu_base.events[i] += x;
1844 }
1788 /* need to clear ON_MOVE value, works as a kind of lock. */ 1845 /* need to clear ON_MOVE value, works as a kind of lock. */
1789 per_cpu(mem->stat->count[MEM_CGROUP_ON_MOVE], cpu) = 0; 1846 per_cpu(mem->stat->count[MEM_CGROUP_ON_MOVE], cpu) = 0;
1790 spin_unlock(&mem->pcp_counter_lock); 1847 spin_unlock(&mem->pcp_counter_lock);
@@ -1834,9 +1891,10 @@ enum {
1834 CHARGE_OOM_DIE, /* the current is killed because of OOM */ 1891 CHARGE_OOM_DIE, /* the current is killed because of OOM */
1835}; 1892};
1836 1893
1837static int __mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask, 1894static int mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask,
1838 int csize, bool oom_check) 1895 unsigned int nr_pages, bool oom_check)
1839{ 1896{
1897 unsigned long csize = nr_pages * PAGE_SIZE;
1840 struct mem_cgroup *mem_over_limit; 1898 struct mem_cgroup *mem_over_limit;
1841 struct res_counter *fail_res; 1899 struct res_counter *fail_res;
1842 unsigned long flags = 0; 1900 unsigned long flags = 0;
@@ -1857,14 +1915,13 @@ static int __mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask,
1857 } else 1915 } else
1858 mem_over_limit = mem_cgroup_from_res_counter(fail_res, res); 1916 mem_over_limit = mem_cgroup_from_res_counter(fail_res, res);
1859 /* 1917 /*
1860 * csize can be either a huge page (HPAGE_SIZE), a batch of 1918 * nr_pages can be either a huge page (HPAGE_PMD_NR), a batch
1861 * regular pages (CHARGE_SIZE), or a single regular page 1919 * of regular pages (CHARGE_BATCH), or a single regular page (1).
1862 * (PAGE_SIZE).
1863 * 1920 *
1864 * Never reclaim on behalf of optional batching, retry with a 1921 * Never reclaim on behalf of optional batching, retry with a
1865 * single page instead. 1922 * single page instead.
1866 */ 1923 */
1867 if (csize == CHARGE_SIZE) 1924 if (nr_pages == CHARGE_BATCH)
1868 return CHARGE_RETRY; 1925 return CHARGE_RETRY;
1869 1926
1870 if (!(gfp_mask & __GFP_WAIT)) 1927 if (!(gfp_mask & __GFP_WAIT))
@@ -1872,7 +1929,7 @@ static int __mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask,
1872 1929
1873 ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, NULL, 1930 ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, NULL,
1874 gfp_mask, flags); 1931 gfp_mask, flags);
1875 if (mem_cgroup_check_margin(mem_over_limit, csize)) 1932 if (mem_cgroup_margin(mem_over_limit) >= nr_pages)
1876 return CHARGE_RETRY; 1933 return CHARGE_RETRY;
1877 /* 1934 /*
1878 * Even though the limit is exceeded at this point, reclaim 1935 * Even though the limit is exceeded at this point, reclaim
@@ -1883,7 +1940,7 @@ static int __mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask,
1883 * unlikely to succeed so close to the limit, and we fall back 1940 * unlikely to succeed so close to the limit, and we fall back
1884 * to regular pages anyway in case of failure. 1941 * to regular pages anyway in case of failure.
1885 */ 1942 */
1886 if (csize == PAGE_SIZE && ret) 1943 if (nr_pages == 1 && ret)
1887 return CHARGE_RETRY; 1944 return CHARGE_RETRY;
1888 1945
1889 /* 1946 /*
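
mem_cgroup_do_charge() above first tries to charge an optional batch, never reclaims on behalf of that batch, and retries with exactly the requested number of pages once the limit pushes back, using the margin check to decide whether another retry is worthwhile. A compressed model of that retry ladder, with fake reclaim and invented names, might look like:

/* do_charge.c - toy model of "try a batch, fall back to the real request,
 * reclaim and re-check the margin". Sketch only, not the memcg code. */
#include <stdio.h>

#define CHARGE_BATCH    32U

struct counter { unsigned long usage, limit; };

enum { CHARGE_OK, CHARGE_RETRY, CHARGE_NOMEM };

static int try_charge(struct counter *c, unsigned int nr_pages)
{
    if (c->usage + nr_pages > c->limit)
        return -1;
    c->usage += nr_pages;
    return 0;
}

static unsigned int fake_reclaim(struct counter *c, unsigned int want)
{
    unsigned int got = want < c->usage ? want : c->usage;

    c->usage -= got;                        /* pretend we freed some pages */
    return got;
}

static int do_charge(struct counter *c, unsigned int nr_pages)
{
    if (!try_charge(c, nr_pages))
        return CHARGE_OK;
    if (nr_pages == CHARGE_BATCH)           /* batching is optional: */
        return CHARGE_RETRY;                /* retry with the real request */
    if (fake_reclaim(c, nr_pages) &&
        c->limit - c->usage >= nr_pages)    /* margin check after reclaim */
        return CHARGE_RETRY;
    return CHARGE_NOMEM;
}

int main(void)
{
    struct counter c = { .usage = 1000, .limit = 1024 };
    unsigned int batch = CHARGE_BATCH, nr_pages = 1;
    int ret;

    while ((ret = do_charge(&c, batch)) == CHARGE_RETRY)
        batch = nr_pages;                   /* shrink to the real request */
    printf("ret=%d usage=%lu\n", ret, c.usage);     /* ret=0 usage=1001 */
    return 0;
}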
@@ -1909,13 +1966,14 @@ static int __mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask,
1909 */ 1966 */
1910static int __mem_cgroup_try_charge(struct mm_struct *mm, 1967static int __mem_cgroup_try_charge(struct mm_struct *mm,
1911 gfp_t gfp_mask, 1968 gfp_t gfp_mask,
1912 struct mem_cgroup **memcg, bool oom, 1969 unsigned int nr_pages,
1913 int page_size) 1970 struct mem_cgroup **memcg,
1971 bool oom)
1914{ 1972{
1973 unsigned int batch = max(CHARGE_BATCH, nr_pages);
1915 int nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES; 1974 int nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;
1916 struct mem_cgroup *mem = NULL; 1975 struct mem_cgroup *mem = NULL;
1917 int ret; 1976 int ret;
1918 int csize = max(CHARGE_SIZE, (unsigned long) page_size);
1919 1977
1920 /* 1978 /*
1921 * Unlike global VM's OOM-kill, we're not in memory shortage 1979 * Unlike global VM's OOM-kill, we're not in memory shortage
@@ -1940,7 +1998,7 @@ again:
1940 VM_BUG_ON(css_is_removed(&mem->css)); 1998 VM_BUG_ON(css_is_removed(&mem->css));
1941 if (mem_cgroup_is_root(mem)) 1999 if (mem_cgroup_is_root(mem))
1942 goto done; 2000 goto done;
1943 if (page_size == PAGE_SIZE && consume_stock(mem)) 2001 if (nr_pages == 1 && consume_stock(mem))
1944 goto done; 2002 goto done;
1945 css_get(&mem->css); 2003 css_get(&mem->css);
1946 } else { 2004 } else {
@@ -1963,7 +2021,7 @@ again:
1963 rcu_read_unlock(); 2021 rcu_read_unlock();
1964 goto done; 2022 goto done;
1965 } 2023 }
1966 if (page_size == PAGE_SIZE && consume_stock(mem)) { 2024 if (nr_pages == 1 && consume_stock(mem)) {
1967 /* 2025 /*
1968 * It seems dangerous to access memcg without css_get(). 2026 * It seems dangerous to access memcg without css_get().
1969 * But considering how consume_stock works, it's not 2027 * But considering how consume_stock works, it's not
@@ -1998,13 +2056,12 @@ again:
1998 nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES; 2056 nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;
1999 } 2057 }
2000 2058
2001 ret = __mem_cgroup_do_charge(mem, gfp_mask, csize, oom_check); 2059 ret = mem_cgroup_do_charge(mem, gfp_mask, batch, oom_check);
2002
2003 switch (ret) { 2060 switch (ret) {
2004 case CHARGE_OK: 2061 case CHARGE_OK:
2005 break; 2062 break;
2006 case CHARGE_RETRY: /* not in OOM situation but retry */ 2063 case CHARGE_RETRY: /* not in OOM situation but retry */
2007 csize = page_size; 2064 batch = nr_pages;
2008 css_put(&mem->css); 2065 css_put(&mem->css);
2009 mem = NULL; 2066 mem = NULL;
2010 goto again; 2067 goto again;
@@ -2025,8 +2082,8 @@ again:
2025 } 2082 }
2026 } while (ret != CHARGE_OK); 2083 } while (ret != CHARGE_OK);
2027 2084
2028 if (csize > page_size) 2085 if (batch > nr_pages)
2029 refill_stock(mem, csize - page_size); 2086 refill_stock(mem, batch - nr_pages);
2030 css_put(&mem->css); 2087 css_put(&mem->css);
2031done: 2088done:
2032 *memcg = mem; 2089 *memcg = mem;
@@ -2045,21 +2102,17 @@ bypass:
2045 * gotten by try_charge(). 2102 * gotten by try_charge().
2046 */ 2103 */
2047static void __mem_cgroup_cancel_charge(struct mem_cgroup *mem, 2104static void __mem_cgroup_cancel_charge(struct mem_cgroup *mem,
2048 unsigned long count) 2105 unsigned int nr_pages)
2049{ 2106{
2050 if (!mem_cgroup_is_root(mem)) { 2107 if (!mem_cgroup_is_root(mem)) {
2051 res_counter_uncharge(&mem->res, PAGE_SIZE * count); 2108 unsigned long bytes = nr_pages * PAGE_SIZE;
2109
2110 res_counter_uncharge(&mem->res, bytes);
2052 if (do_swap_account) 2111 if (do_swap_account)
2053 res_counter_uncharge(&mem->memsw, PAGE_SIZE * count); 2112 res_counter_uncharge(&mem->memsw, bytes);
2054 } 2113 }
2055} 2114}
2056 2115
2057static void mem_cgroup_cancel_charge(struct mem_cgroup *mem,
2058 int page_size)
2059{
2060 __mem_cgroup_cancel_charge(mem, page_size >> PAGE_SHIFT);
2061}
2062
2063/* 2116/*
2064 * A helper function to get mem_cgroup from ID. must be called under 2117 * A helper function to get mem_cgroup from ID. must be called under
2065 * rcu_read_lock(). The caller must check css_is_removed() or some if 2118 * rcu_read_lock(). The caller must check css_is_removed() or some if
@@ -2108,20 +2161,15 @@ struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
2108} 2161}
2109 2162
2110static void __mem_cgroup_commit_charge(struct mem_cgroup *mem, 2163static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
2164 struct page *page,
2165 unsigned int nr_pages,
2111 struct page_cgroup *pc, 2166 struct page_cgroup *pc,
2112 enum charge_type ctype, 2167 enum charge_type ctype)
2113 int page_size)
2114{ 2168{
2115 int nr_pages = page_size >> PAGE_SHIFT;
2116
2117 /* try_charge() can return NULL to *memcg, taking care of it. */
2118 if (!mem)
2119 return;
2120
2121 lock_page_cgroup(pc); 2169 lock_page_cgroup(pc);
2122 if (unlikely(PageCgroupUsed(pc))) { 2170 if (unlikely(PageCgroupUsed(pc))) {
2123 unlock_page_cgroup(pc); 2171 unlock_page_cgroup(pc);
2124 mem_cgroup_cancel_charge(mem, page_size); 2172 __mem_cgroup_cancel_charge(mem, nr_pages);
2125 return; 2173 return;
2126 } 2174 }
2127 /* 2175 /*
@@ -2158,7 +2206,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
2158 * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree. 2206 * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree.
2159 * if they exceeds softlimit. 2207 * if they exceeds softlimit.
2160 */ 2208 */
2161 memcg_check_events(mem, pc->page); 2209 memcg_check_events(mem, page);
2162} 2210}
2163 2211
2164#ifdef CONFIG_TRANSPARENT_HUGEPAGE 2212#ifdef CONFIG_TRANSPARENT_HUGEPAGE
@@ -2195,7 +2243,7 @@ void mem_cgroup_split_huge_fixup(struct page *head, struct page *tail)
2195 * We hold lru_lock, then, reduce counter directly. 2243 * We hold lru_lock, then, reduce counter directly.
2196 */ 2244 */
2197 lru = page_lru(head); 2245 lru = page_lru(head);
2198 mz = page_cgroup_zoneinfo(head_pc); 2246 mz = page_cgroup_zoneinfo(head_pc->mem_cgroup, head);
2199 MEM_CGROUP_ZSTAT(mz, lru) -= 1; 2247 MEM_CGROUP_ZSTAT(mz, lru) -= 1;
2200 } 2248 }
2201 tail_pc->flags = head_pc->flags & ~PCGF_NOCOPY_AT_SPLIT; 2249 tail_pc->flags = head_pc->flags & ~PCGF_NOCOPY_AT_SPLIT;
@@ -2204,7 +2252,9 @@ void mem_cgroup_split_huge_fixup(struct page *head, struct page *tail)
2204#endif 2252#endif
2205 2253
2206/** 2254/**
2207 * __mem_cgroup_move_account - move account of the page 2255 * mem_cgroup_move_account - move account of the page
2256 * @page: the page
2257 * @nr_pages: number of regular pages (>1 for huge pages)
2208 * @pc: page_cgroup of the page. 2258 * @pc: page_cgroup of the page.
2209 * @from: mem_cgroup which the page is moved from. 2259 * @from: mem_cgroup which the page is moved from.
2210 * @to: mem_cgroup which the page is moved to. @from != @to. 2260 * @to: mem_cgroup which the page is moved to. @from != @to.
@@ -2212,25 +2262,42 @@ void mem_cgroup_split_huge_fixup(struct page *head, struct page *tail)
2212 * 2262 *
2213 * The caller must confirm following. 2263 * The caller must confirm following.
2214 * - page is not on LRU (isolate_page() is useful.) 2264 * - page is not on LRU (isolate_page() is useful.)
2215 * - the pc is locked, used, and ->mem_cgroup points to @from. 2265 * - compound_lock is held when nr_pages > 1
2216 * 2266 *
2217 * This function doesn't do "charge" nor css_get to new cgroup. It should be 2267 * This function doesn't do "charge" nor css_get to new cgroup. It should be
2218 * done by a caller(__mem_cgroup_try_charge would be usefull). If @uncharge is 2268 * done by a caller(__mem_cgroup_try_charge would be useful). If @uncharge is
2219 * true, this function does "uncharge" from old cgroup, but it doesn't if 2269 * true, this function does "uncharge" from old cgroup, but it doesn't if
2220 * @uncharge is false, so a caller should do "uncharge". 2270 * @uncharge is false, so a caller should do "uncharge".
2221 */ 2271 */
2222 2272static int mem_cgroup_move_account(struct page *page,
2223static void __mem_cgroup_move_account(struct page_cgroup *pc, 2273 unsigned int nr_pages,
2224 struct mem_cgroup *from, struct mem_cgroup *to, bool uncharge, 2274 struct page_cgroup *pc,
2225 int charge_size) 2275 struct mem_cgroup *from,
2276 struct mem_cgroup *to,
2277 bool uncharge)
2226{ 2278{
2227 int nr_pages = charge_size >> PAGE_SHIFT; 2279 unsigned long flags;
2280 int ret;
2228 2281
2229 VM_BUG_ON(from == to); 2282 VM_BUG_ON(from == to);
2230 VM_BUG_ON(PageLRU(pc->page)); 2283 VM_BUG_ON(PageLRU(page));
2231 VM_BUG_ON(!page_is_cgroup_locked(pc)); 2284 /*
2232 VM_BUG_ON(!PageCgroupUsed(pc)); 2285 * The page is isolated from LRU. So, collapse function
2233 VM_BUG_ON(pc->mem_cgroup != from); 2286 * will not handle this page. But page splitting can happen.
2287 * Do this check under compound_page_lock(). The caller should
2288 * hold it.
2289 */
2290 ret = -EBUSY;
2291 if (nr_pages > 1 && !PageTransHuge(page))
2292 goto out;
2293
2294 lock_page_cgroup(pc);
2295
2296 ret = -EINVAL;
2297 if (!PageCgroupUsed(pc) || pc->mem_cgroup != from)
2298 goto unlock;
2299
2300 move_lock_page_cgroup(pc, &flags);
2234 2301
2235 if (PageCgroupFileMapped(pc)) { 2302 if (PageCgroupFileMapped(pc)) {
2236 /* Update mapped_file data for mem_cgroup */ 2303 /* Update mapped_file data for mem_cgroup */
@@ -2242,7 +2309,7 @@ static void __mem_cgroup_move_account(struct page_cgroup *pc,
2242 mem_cgroup_charge_statistics(from, PageCgroupCache(pc), -nr_pages); 2309 mem_cgroup_charge_statistics(from, PageCgroupCache(pc), -nr_pages);
2243 if (uncharge) 2310 if (uncharge)
2244 /* This is not "cancel", but cancel_charge does all we need. */ 2311 /* This is not "cancel", but cancel_charge does all we need. */
2245 mem_cgroup_cancel_charge(from, charge_size); 2312 __mem_cgroup_cancel_charge(from, nr_pages);
2246 2313
2247 /* caller should have done css_get */ 2314 /* caller should have done css_get */
2248 pc->mem_cgroup = to; 2315 pc->mem_cgroup = to;
@@ -2251,43 +2318,19 @@ static void __mem_cgroup_move_account(struct page_cgroup *pc,
2251 * We charge against "to" which may not have any tasks. Then, "to" 2318 * We charge against "to" which may not have any tasks. Then, "to"
2252 * can be under rmdir(). But in current implementation, caller of 2319 * can be under rmdir(). But in current implementation, caller of
2253 * this function is just force_empty() and move charge, so it's 2320 * this function is just force_empty() and move charge, so it's
2254 * garanteed that "to" is never removed. So, we don't check rmdir 2321 * guaranteed that "to" is never removed. So, we don't check rmdir
2255 * status here. 2322 * status here.
2256 */ 2323 */
2257} 2324 move_unlock_page_cgroup(pc, &flags);
2258 2325 ret = 0;
2259/* 2326unlock:
2260 * check whether the @pc is valid for moving account and call
2261 * __mem_cgroup_move_account()
2262 */
2263static int mem_cgroup_move_account(struct page_cgroup *pc,
2264 struct mem_cgroup *from, struct mem_cgroup *to,
2265 bool uncharge, int charge_size)
2266{
2267 int ret = -EINVAL;
2268 unsigned long flags;
2269 /*
2270 * The page is isolated from LRU. So, collapse function
2271 * will not handle this page. But page splitting can happen.
2272 * Do this check under compound_page_lock(). The caller should
2273 * hold it.
2274 */
2275 if ((charge_size > PAGE_SIZE) && !PageTransHuge(pc->page))
2276 return -EBUSY;
2277
2278 lock_page_cgroup(pc);
2279 if (PageCgroupUsed(pc) && pc->mem_cgroup == from) {
2280 move_lock_page_cgroup(pc, &flags);
2281 __mem_cgroup_move_account(pc, from, to, uncharge, charge_size);
2282 move_unlock_page_cgroup(pc, &flags);
2283 ret = 0;
2284 }
2285 unlock_page_cgroup(pc); 2327 unlock_page_cgroup(pc);
2286 /* 2328 /*
2287 * check events 2329 * check events
2288 */ 2330 */
2289 memcg_check_events(to, pc->page); 2331 memcg_check_events(to, page);
2290 memcg_check_events(from, pc->page); 2332 memcg_check_events(from, page);
2333out:
2291 return ret; 2334 return ret;
2292} 2335}
2293 2336
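
The consolidated mem_cgroup_move_account() above moves the page's statistics to the destination group and, when @uncharge is true, drops the source's charge while relying on the caller to have pre-charged the destination. Stripped of page_cgroup locking and the THP check, the accounting itself reduces to something like this sketch (illustrative names, no locking):

/* move_account.c - toy model of migrating nr_pages of accounting from one
 * group to another. Sketch only. */
#include <stdio.h>

struct group {
    const char *name;
    long rss_pages;             /* charged pages accounted here */
    unsigned long usage;        /* res_counter-style usage */
};

static void cancel_charge(struct group *g, unsigned int nr_pages)
{
    g->usage -= nr_pages;       /* analogue of cancelling a charge */
}

static int move_account(unsigned int nr_pages,
                        struct group *from, struct group *to, int uncharge)
{
    if (from == to)
        return -1;
    from->rss_pages -= nr_pages;        /* statistics follow the page */
    to->rss_pages += nr_pages;
    if (uncharge)
        cancel_charge(from, nr_pages);  /* caller already pre-charged "to" */
    return 0;
}

int main(void)
{
    struct group child  = { "child",  512, 512 };
    struct group parent = { "parent",   0, 512 };   /* pre-charged */

    move_account(512, &child, &parent, 1);
    printf("%s: rss=%ld usage=%lu\n", child.name, child.rss_pages, child.usage);
    printf("%s: rss=%ld usage=%lu\n", parent.name, parent.rss_pages, parent.usage);
    return 0;
}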
@@ -2295,16 +2338,16 @@ static int mem_cgroup_move_account(struct page_cgroup *pc,
2295 * move charges to its parent. 2338 * move charges to its parent.
2296 */ 2339 */
2297 2340
2298static int mem_cgroup_move_parent(struct page_cgroup *pc, 2341static int mem_cgroup_move_parent(struct page *page,
2342 struct page_cgroup *pc,
2299 struct mem_cgroup *child, 2343 struct mem_cgroup *child,
2300 gfp_t gfp_mask) 2344 gfp_t gfp_mask)
2301{ 2345{
2302 struct page *page = pc->page;
2303 struct cgroup *cg = child->css.cgroup; 2346 struct cgroup *cg = child->css.cgroup;
2304 struct cgroup *pcg = cg->parent; 2347 struct cgroup *pcg = cg->parent;
2305 struct mem_cgroup *parent; 2348 struct mem_cgroup *parent;
2306 int page_size = PAGE_SIZE; 2349 unsigned int nr_pages;
2307 unsigned long flags; 2350 unsigned long uninitialized_var(flags);
2308 int ret; 2351 int ret;
2309 2352
2310 /* Is ROOT ? */ 2353 /* Is ROOT ? */
@@ -2317,23 +2360,21 @@ static int mem_cgroup_move_parent(struct page_cgroup *pc,
2317 if (isolate_lru_page(page)) 2360 if (isolate_lru_page(page))
2318 goto put; 2361 goto put;
2319 2362
2320 if (PageTransHuge(page)) 2363 nr_pages = hpage_nr_pages(page);
2321 page_size = HPAGE_SIZE;
2322 2364
2323 parent = mem_cgroup_from_cont(pcg); 2365 parent = mem_cgroup_from_cont(pcg);
2324 ret = __mem_cgroup_try_charge(NULL, gfp_mask, 2366 ret = __mem_cgroup_try_charge(NULL, gfp_mask, nr_pages, &parent, false);
2325 &parent, false, page_size);
2326 if (ret || !parent) 2367 if (ret || !parent)
2327 goto put_back; 2368 goto put_back;
2328 2369
2329 if (page_size > PAGE_SIZE) 2370 if (nr_pages > 1)
2330 flags = compound_lock_irqsave(page); 2371 flags = compound_lock_irqsave(page);
2331 2372
2332 ret = mem_cgroup_move_account(pc, child, parent, true, page_size); 2373 ret = mem_cgroup_move_account(page, nr_pages, pc, child, parent, true);
2333 if (ret) 2374 if (ret)
2334 mem_cgroup_cancel_charge(parent, page_size); 2375 __mem_cgroup_cancel_charge(parent, nr_pages);
2335 2376
2336 if (page_size > PAGE_SIZE) 2377 if (nr_pages > 1)
2337 compound_unlock_irqrestore(page, flags); 2378 compound_unlock_irqrestore(page, flags);
2338put_back: 2379put_back:
2339 putback_lru_page(page); 2380 putback_lru_page(page);
@@ -2353,13 +2394,13 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
2353 gfp_t gfp_mask, enum charge_type ctype) 2394 gfp_t gfp_mask, enum charge_type ctype)
2354{ 2395{
2355 struct mem_cgroup *mem = NULL; 2396 struct mem_cgroup *mem = NULL;
2356 int page_size = PAGE_SIZE; 2397 unsigned int nr_pages = 1;
2357 struct page_cgroup *pc; 2398 struct page_cgroup *pc;
2358 bool oom = true; 2399 bool oom = true;
2359 int ret; 2400 int ret;
2360 2401
2361 if (PageTransHuge(page)) { 2402 if (PageTransHuge(page)) {
2362 page_size <<= compound_order(page); 2403 nr_pages <<= compound_order(page);
2363 VM_BUG_ON(!PageTransHuge(page)); 2404 VM_BUG_ON(!PageTransHuge(page));
2364 /* 2405 /*
2365 * Never OOM-kill a process for a huge page. The 2406 * Never OOM-kill a process for a huge page. The
@@ -2369,16 +2410,13 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
2369 } 2410 }
2370 2411
2371 pc = lookup_page_cgroup(page); 2412 pc = lookup_page_cgroup(page);
2372 /* can happen at boot */ 2413 BUG_ON(!pc); /* XXX: remove this and move pc lookup into commit */
2373 if (unlikely(!pc))
2374 return 0;
2375 prefetchw(pc);
2376 2414
2377 ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, oom, page_size); 2415 ret = __mem_cgroup_try_charge(mm, gfp_mask, nr_pages, &mem, oom);
2378 if (ret || !mem) 2416 if (ret || !mem)
2379 return ret; 2417 return ret;
2380 2418
2381 __mem_cgroup_commit_charge(mem, pc, ctype, page_size); 2419 __mem_cgroup_commit_charge(mem, page, nr_pages, pc, ctype);
2382 return 0; 2420 return 0;
2383} 2421}
2384 2422
@@ -2406,9 +2444,26 @@ static void
2406__mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr, 2444__mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr,
2407 enum charge_type ctype); 2445 enum charge_type ctype);
2408 2446
2447static void
2448__mem_cgroup_commit_charge_lrucare(struct page *page, struct mem_cgroup *mem,
2449 enum charge_type ctype)
2450{
2451 struct page_cgroup *pc = lookup_page_cgroup(page);
2452 /*
2453 * In some cases, SwapCache, FUSE(splice_buf->radixtree), the page
2454 * is already on LRU. It means the page may be on some other page_cgroup's
2455 * LRU. Take care of it.
2456 */
2457 mem_cgroup_lru_del_before_commit(page);
2458 __mem_cgroup_commit_charge(mem, page, 1, pc, ctype);
2459 mem_cgroup_lru_add_after_commit(page);
2460 return;
2461}
2462
2409int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, 2463int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
2410 gfp_t gfp_mask) 2464 gfp_t gfp_mask)
2411{ 2465{
2466 struct mem_cgroup *mem = NULL;
2412 int ret; 2467 int ret;
2413 2468
2414 if (mem_cgroup_disabled()) 2469 if (mem_cgroup_disabled())
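
__mem_cgroup_commit_charge_lrucare() above handles pages that are already on some LRU (SwapCache, FUSE splice buffers): unlink before the commit, record the new owner, relink afterwards. A toy list-based model of that unlink/commit/relink dance, with made-up structures and no locking:

/* lrucare.c - toy model of re-homing a page that is already on an LRU. */
#include <stdio.h>
#include <stddef.h>

struct page {
    struct page *next;          /* LRU linkage */
    int on_lru;
    int owner;                  /* which group's LRU holds it */
};

static struct page *lru[2];     /* one list head per group */

static void lru_del(struct page *pg)
{
    struct page **pp = &lru[pg->owner];

    for (; *pp; pp = &(*pp)->next) {
        if (*pp == pg) {
            *pp = pg->next;
            pg->on_lru = 0;
            return;
        }
    }
}

static void lru_add(struct page *pg, int owner)
{
    pg->owner = owner;
    pg->next = lru[owner];
    lru[owner] = pg;
    pg->on_lru = 1;
}

static void commit_charge_lrucare(struct page *pg, int new_owner)
{
    int was_on_lru = pg->on_lru;

    if (was_on_lru)
        lru_del(pg);                    /* may sit on the *old* owner's LRU */
    pg->owner = new_owner;              /* the "commit": record the new owner */
    if (was_on_lru)
        lru_add(pg, new_owner);         /* relink under the new owner */
}

int main(void)
{
    struct page pg = { .next = NULL };

    lru_add(&pg, 0);                    /* e.g. initially on group 0's LRU */
    commit_charge_lrucare(&pg, 1);      /* recharge to group 1 */
    printf("page now on LRU of group %d\n", pg.owner);
    return 0;
}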
@@ -2443,14 +2498,22 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
2443 if (unlikely(!mm)) 2498 if (unlikely(!mm))
2444 mm = &init_mm; 2499 mm = &init_mm;
2445 2500
2446 if (page_is_file_cache(page)) 2501 if (page_is_file_cache(page)) {
2447 return mem_cgroup_charge_common(page, mm, gfp_mask, 2502 ret = __mem_cgroup_try_charge(mm, gfp_mask, 1, &mem, true);
2448 MEM_CGROUP_CHARGE_TYPE_CACHE); 2503 if (ret || !mem)
2504 return ret;
2449 2505
2506 /*
2507 * FUSE reuses pages without going through the final
2508 * put that would remove them from the LRU list, make
2509 * sure that they get relinked properly.
2510 */
2511 __mem_cgroup_commit_charge_lrucare(page, mem,
2512 MEM_CGROUP_CHARGE_TYPE_CACHE);
2513 return ret;
2514 }
2450 /* shmem */ 2515 /* shmem */
2451 if (PageSwapCache(page)) { 2516 if (PageSwapCache(page)) {
2452 struct mem_cgroup *mem = NULL;
2453
2454 ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &mem); 2517 ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &mem);
2455 if (!ret) 2518 if (!ret)
2456 __mem_cgroup_commit_charge_swapin(page, mem, 2519 __mem_cgroup_commit_charge_swapin(page, mem,
@@ -2475,6 +2538,8 @@ int mem_cgroup_try_charge_swapin(struct mm_struct *mm,
2475 struct mem_cgroup *mem; 2538 struct mem_cgroup *mem;
2476 int ret; 2539 int ret;
2477 2540
2541 *ptr = NULL;
2542
2478 if (mem_cgroup_disabled()) 2543 if (mem_cgroup_disabled())
2479 return 0; 2544 return 0;
2480 2545
@@ -2492,30 +2557,26 @@ int mem_cgroup_try_charge_swapin(struct mm_struct *mm,
2492 if (!mem) 2557 if (!mem)
2493 goto charge_cur_mm; 2558 goto charge_cur_mm;
2494 *ptr = mem; 2559 *ptr = mem;
2495 ret = __mem_cgroup_try_charge(NULL, mask, ptr, true, PAGE_SIZE); 2560 ret = __mem_cgroup_try_charge(NULL, mask, 1, ptr, true);
2496 css_put(&mem->css); 2561 css_put(&mem->css);
2497 return ret; 2562 return ret;
2498charge_cur_mm: 2563charge_cur_mm:
2499 if (unlikely(!mm)) 2564 if (unlikely(!mm))
2500 mm = &init_mm; 2565 mm = &init_mm;
2501 return __mem_cgroup_try_charge(mm, mask, ptr, true, PAGE_SIZE); 2566 return __mem_cgroup_try_charge(mm, mask, 1, ptr, true);
2502} 2567}
2503 2568
2504static void 2569static void
2505__mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr, 2570__mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr,
2506 enum charge_type ctype) 2571 enum charge_type ctype)
2507{ 2572{
2508 struct page_cgroup *pc;
2509
2510 if (mem_cgroup_disabled()) 2573 if (mem_cgroup_disabled())
2511 return; 2574 return;
2512 if (!ptr) 2575 if (!ptr)
2513 return; 2576 return;
2514 cgroup_exclude_rmdir(&ptr->css); 2577 cgroup_exclude_rmdir(&ptr->css);
2515 pc = lookup_page_cgroup(page); 2578
2516 mem_cgroup_lru_del_before_commit_swapcache(page); 2579 __mem_cgroup_commit_charge_lrucare(page, ptr, ctype);
2517 __mem_cgroup_commit_charge(ptr, pc, ctype, PAGE_SIZE);
2518 mem_cgroup_lru_add_after_commit_swapcache(page);
2519 /* 2580 /*
2520 * Now swap is on-memory. This means this page may be 2581 * Now swap is on-memory. This means this page may be
2521 * counted both as mem and swap....double count. 2582 * counted both as mem and swap....double count.
@@ -2563,15 +2624,16 @@ void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem)
2563 return; 2624 return;
2564 if (!mem) 2625 if (!mem)
2565 return; 2626 return;
2566 mem_cgroup_cancel_charge(mem, PAGE_SIZE); 2627 __mem_cgroup_cancel_charge(mem, 1);
2567} 2628}
2568 2629
2569static void 2630static void mem_cgroup_do_uncharge(struct mem_cgroup *mem,
2570__do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype, 2631 unsigned int nr_pages,
2571 int page_size) 2632 const enum charge_type ctype)
2572{ 2633{
2573 struct memcg_batch_info *batch = NULL; 2634 struct memcg_batch_info *batch = NULL;
2574 bool uncharge_memsw = true; 2635 bool uncharge_memsw = true;
2636
2575 /* If swapout, usage of swap doesn't decrease */ 2637 /* If swapout, usage of swap doesn't decrease */
2576 if (!do_swap_account || ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) 2638 if (!do_swap_account || ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
2577 uncharge_memsw = false; 2639 uncharge_memsw = false;
@@ -2586,7 +2648,7 @@ __do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype,
2586 batch->memcg = mem; 2648 batch->memcg = mem;
2587 /* 2649 /*
2588 * do_batch > 0 when unmapping pages or inode invalidate/truncate. 2650 * do_batch > 0 when unmapping pages or inode invalidate/truncate.
2589 * In those cases, all pages freed continously can be expected to be in 2651 * In those cases, all pages freed continuously can be expected to be in
2590 * the same cgroup and we have chance to coalesce uncharges. 2652 * the same cgroup and we have chance to coalesce uncharges.
2591 * But we do uncharge one by one if this is killed by OOM(TIF_MEMDIE) 2653 * But we do uncharge one by one if this is killed by OOM(TIF_MEMDIE)
2592 * because we want to do uncharge as soon as possible. 2654 * because we want to do uncharge as soon as possible.
@@ -2595,7 +2657,7 @@ __do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype,
2595 if (!batch->do_batch || test_thread_flag(TIF_MEMDIE)) 2657 if (!batch->do_batch || test_thread_flag(TIF_MEMDIE))
2596 goto direct_uncharge; 2658 goto direct_uncharge;
2597 2659
2598 if (page_size != PAGE_SIZE) 2660 if (nr_pages > 1)
2599 goto direct_uncharge; 2661 goto direct_uncharge;
2600 2662
2601 /* 2663 /*
@@ -2606,14 +2668,14 @@ __do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype,
2606 if (batch->memcg != mem) 2668 if (batch->memcg != mem)
2607 goto direct_uncharge; 2669 goto direct_uncharge;
2608 /* remember freed charge and uncharge it later */ 2670 /* remember freed charge and uncharge it later */
2609 batch->bytes += PAGE_SIZE; 2671 batch->nr_pages++;
2610 if (uncharge_memsw) 2672 if (uncharge_memsw)
2611 batch->memsw_bytes += PAGE_SIZE; 2673 batch->memsw_nr_pages++;
2612 return; 2674 return;
2613direct_uncharge: 2675direct_uncharge:
2614 res_counter_uncharge(&mem->res, page_size); 2676 res_counter_uncharge(&mem->res, nr_pages * PAGE_SIZE);
2615 if (uncharge_memsw) 2677 if (uncharge_memsw)
2616 res_counter_uncharge(&mem->memsw, page_size); 2678 res_counter_uncharge(&mem->memsw, nr_pages * PAGE_SIZE);
2617 if (unlikely(batch->memcg != mem)) 2679 if (unlikely(batch->memcg != mem))
2618 memcg_oom_recover(mem); 2680 memcg_oom_recover(mem);
2619 return; 2681 return;
@@ -2625,10 +2687,9 @@ direct_uncharge:
2625static struct mem_cgroup * 2687static struct mem_cgroup *
2626__mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) 2688__mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
2627{ 2689{
2628 int count;
2629 struct page_cgroup *pc;
2630 struct mem_cgroup *mem = NULL; 2690 struct mem_cgroup *mem = NULL;
2631 int page_size = PAGE_SIZE; 2691 unsigned int nr_pages = 1;
2692 struct page_cgroup *pc;
2632 2693
2633 if (mem_cgroup_disabled()) 2694 if (mem_cgroup_disabled())
2634 return NULL; 2695 return NULL;
@@ -2637,11 +2698,9 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
2637 return NULL; 2698 return NULL;
2638 2699
2639 if (PageTransHuge(page)) { 2700 if (PageTransHuge(page)) {
2640 page_size <<= compound_order(page); 2701 nr_pages <<= compound_order(page);
2641 VM_BUG_ON(!PageTransHuge(page)); 2702 VM_BUG_ON(!PageTransHuge(page));
2642 } 2703 }
2643
2644 count = page_size >> PAGE_SHIFT;
2645 /* 2704 /*
2646 * Check if our page_cgroup is valid 2705 * Check if our page_cgroup is valid
2647 */ 2706 */
@@ -2674,7 +2733,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
2674 break; 2733 break;
2675 } 2734 }
2676 2735
2677 mem_cgroup_charge_statistics(mem, PageCgroupCache(pc), -count); 2736 mem_cgroup_charge_statistics(mem, PageCgroupCache(pc), -nr_pages);
2678 2737
2679 ClearPageCgroupUsed(pc); 2738 ClearPageCgroupUsed(pc);
2680 /* 2739 /*
@@ -2695,7 +2754,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
2695 mem_cgroup_get(mem); 2754 mem_cgroup_get(mem);
2696 } 2755 }
2697 if (!mem_cgroup_is_root(mem)) 2756 if (!mem_cgroup_is_root(mem))
2698 __do_uncharge(mem, ctype, page_size); 2757 mem_cgroup_do_uncharge(mem, nr_pages, ctype);
2699 2758
2700 return mem; 2759 return mem;
2701 2760
@@ -2735,8 +2794,8 @@ void mem_cgroup_uncharge_start(void)
2735 /* We can do nest. */ 2794 /* We can do nest. */
2736 if (current->memcg_batch.do_batch == 1) { 2795 if (current->memcg_batch.do_batch == 1) {
2737 current->memcg_batch.memcg = NULL; 2796 current->memcg_batch.memcg = NULL;
2738 current->memcg_batch.bytes = 0; 2797 current->memcg_batch.nr_pages = 0;
2739 current->memcg_batch.memsw_bytes = 0; 2798 current->memcg_batch.memsw_nr_pages = 0;
2740 } 2799 }
2741} 2800}
2742 2801
@@ -2757,10 +2816,12 @@ void mem_cgroup_uncharge_end(void)
2757 * This "batch->memcg" is valid without any css_get/put etc... 2816 * This "batch->memcg" is valid without any css_get/put etc...
2758 * bacause we hide charges behind us. 2817 * bacause we hide charges behind us.
2759 */ 2818 */
2760 if (batch->bytes) 2819 if (batch->nr_pages)
2761 res_counter_uncharge(&batch->memcg->res, batch->bytes); 2820 res_counter_uncharge(&batch->memcg->res,
2762 if (batch->memsw_bytes) 2821 batch->nr_pages * PAGE_SIZE);
2763 res_counter_uncharge(&batch->memcg->memsw, batch->memsw_bytes); 2822 if (batch->memsw_nr_pages)
2823 res_counter_uncharge(&batch->memcg->memsw,
2824 batch->memsw_nr_pages * PAGE_SIZE);
2764 memcg_oom_recover(batch->memcg); 2825 memcg_oom_recover(batch->memcg);
2765 /* forget this pointer (for sanity check) */ 2826 /* forget this pointer (for sanity check) */
2766 batch->memcg = NULL; 2827 batch->memcg = NULL;
@@ -2883,13 +2944,15 @@ static inline int mem_cgroup_move_swap_account(swp_entry_t entry,
2883 * page belongs to. 2944 * page belongs to.
2884 */ 2945 */
2885int mem_cgroup_prepare_migration(struct page *page, 2946int mem_cgroup_prepare_migration(struct page *page,
2886 struct page *newpage, struct mem_cgroup **ptr) 2947 struct page *newpage, struct mem_cgroup **ptr, gfp_t gfp_mask)
2887{ 2948{
2888 struct page_cgroup *pc;
2889 struct mem_cgroup *mem = NULL; 2949 struct mem_cgroup *mem = NULL;
2950 struct page_cgroup *pc;
2890 enum charge_type ctype; 2951 enum charge_type ctype;
2891 int ret = 0; 2952 int ret = 0;
2892 2953
2954 *ptr = NULL;
2955
2893 VM_BUG_ON(PageTransHuge(page)); 2956 VM_BUG_ON(PageTransHuge(page));
2894 if (mem_cgroup_disabled()) 2957 if (mem_cgroup_disabled())
2895 return 0; 2958 return 0;
@@ -2940,7 +3003,7 @@ int mem_cgroup_prepare_migration(struct page *page,
2940 return 0; 3003 return 0;
2941 3004
2942 *ptr = mem; 3005 *ptr = mem;
2943 ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, ptr, false, PAGE_SIZE); 3006 ret = __mem_cgroup_try_charge(NULL, gfp_mask, 1, ptr, false);
2944 css_put(&mem->css);/* drop extra refcnt */ 3007 css_put(&mem->css);/* drop extra refcnt */
2945 if (ret || *ptr == NULL) { 3008 if (ret || *ptr == NULL) {
2946 if (PageAnon(page)) { 3009 if (PageAnon(page)) {
@@ -2967,7 +3030,7 @@ int mem_cgroup_prepare_migration(struct page *page,
2967 ctype = MEM_CGROUP_CHARGE_TYPE_CACHE; 3030 ctype = MEM_CGROUP_CHARGE_TYPE_CACHE;
2968 else 3031 else
2969 ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM; 3032 ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM;
2970 __mem_cgroup_commit_charge(mem, pc, ctype, PAGE_SIZE); 3033 __mem_cgroup_commit_charge(mem, page, 1, pc, ctype);
2971 return ret; 3034 return ret;
2972} 3035}
2973 3036
@@ -3032,7 +3095,7 @@ int mem_cgroup_shmem_charge_fallback(struct page *page,
3032 struct mm_struct *mm, 3095 struct mm_struct *mm,
3033 gfp_t gfp_mask) 3096 gfp_t gfp_mask)
3034{ 3097{
3035 struct mem_cgroup *mem = NULL; 3098 struct mem_cgroup *mem;
3036 int ret; 3099 int ret;
3037 3100
3038 if (mem_cgroup_disabled()) 3101 if (mem_cgroup_disabled())
@@ -3045,6 +3108,52 @@ int mem_cgroup_shmem_charge_fallback(struct page *page,
3045 return ret; 3108 return ret;
3046} 3109}
3047 3110
3111#ifdef CONFIG_DEBUG_VM
3112static struct page_cgroup *lookup_page_cgroup_used(struct page *page)
3113{
3114 struct page_cgroup *pc;
3115
3116 pc = lookup_page_cgroup(page);
3117 if (likely(pc) && PageCgroupUsed(pc))
3118 return pc;
3119 return NULL;
3120}
3121
3122bool mem_cgroup_bad_page_check(struct page *page)
3123{
3124 if (mem_cgroup_disabled())
3125 return false;
3126
3127 return lookup_page_cgroup_used(page) != NULL;
3128}
3129
3130void mem_cgroup_print_bad_page(struct page *page)
3131{
3132 struct page_cgroup *pc;
3133
3134 pc = lookup_page_cgroup_used(page);
3135 if (pc) {
3136 int ret = -1;
3137 char *path;
3138
3139 printk(KERN_ALERT "pc:%p pc->flags:%lx pc->mem_cgroup:%p",
3140 pc, pc->flags, pc->mem_cgroup);
3141
3142 path = kmalloc(PATH_MAX, GFP_KERNEL);
3143 if (path) {
3144 rcu_read_lock();
3145 ret = cgroup_path(pc->mem_cgroup->css.cgroup,
3146 path, PATH_MAX);
3147 rcu_read_unlock();
3148 }
3149
3150 printk(KERN_CONT "(%s)\n",
3151 (ret < 0) ? "cannot get the path" : path);
3152 kfree(path);
3153 }
3154}
3155#endif
3156
3048static DEFINE_MUTEX(set_limit_mutex); 3157static DEFINE_MUTEX(set_limit_mutex);
3049 3158
3050static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, 3159static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
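The CONFIG_DEBUG_VM helpers added in this hunk let the page allocator ask whether a page being freed or handed out still carries live memcg state; the page_alloc.c hunks later in this diff wire mem_cgroup_bad_page_check() into free_pages_check() and check_new_page(). A rough userspace model of the predicate follows, with toy_* structures standing in for page and page_cgroup (assumptions, not kernel types).

/* Sketch only: invented types; lookup_used() mimics lookup_page_cgroup_used(). */
#include <stdbool.h>
#include <stdio.h>

struct toy_page_cgroup { bool used; };
struct toy_page { struct toy_page_cgroup pc; };

static struct toy_page_cgroup *lookup_used(struct toy_page *page)
{
	return page->pc.used ? &page->pc : NULL;
}

static bool toy_bad_page_check(struct toy_page *page)
{
	/* a page reaching the allocator while still charged is a bug */
	return lookup_used(page) != NULL;
}

int main(void)
{
	struct toy_page ok = { { false } }, leaked = { { true } };

	printf("ok: %d, leaked: %d\n",
	       toy_bad_page_check(&ok), toy_bad_page_check(&leaked));
	return 0;
}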
@@ -3288,6 +3397,8 @@ static int mem_cgroup_force_empty_list(struct mem_cgroup *mem,
3288 loop += 256; 3397 loop += 256;
3289 busy = NULL; 3398 busy = NULL;
3290 while (loop--) { 3399 while (loop--) {
3400 struct page *page;
3401
3291 ret = 0; 3402 ret = 0;
3292 spin_lock_irqsave(&zone->lru_lock, flags); 3403 spin_lock_irqsave(&zone->lru_lock, flags);
3293 if (list_empty(list)) { 3404 if (list_empty(list)) {
@@ -3303,7 +3414,9 @@ static int mem_cgroup_force_empty_list(struct mem_cgroup *mem,
3303 } 3414 }
3304 spin_unlock_irqrestore(&zone->lru_lock, flags); 3415 spin_unlock_irqrestore(&zone->lru_lock, flags);
3305 3416
3306 ret = mem_cgroup_move_parent(pc, mem, GFP_KERNEL); 3417 page = lookup_cgroup_page(pc);
3418
3419 ret = mem_cgroup_move_parent(page, pc, mem, GFP_KERNEL);
3307 if (ret == -ENOMEM) 3420 if (ret == -ENOMEM)
3308 break; 3421 break;
3309 3422
@@ -3451,13 +3564,13 @@ static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft,
3451} 3564}
3452 3565
3453 3566
3454static u64 mem_cgroup_get_recursive_idx_stat(struct mem_cgroup *mem, 3567static unsigned long mem_cgroup_recursive_stat(struct mem_cgroup *mem,
3455 enum mem_cgroup_stat_index idx) 3568 enum mem_cgroup_stat_index idx)
3456{ 3569{
3457 struct mem_cgroup *iter; 3570 struct mem_cgroup *iter;
3458 s64 val = 0; 3571 long val = 0;
3459 3572
3460 /* each per cpu's value can be minus.Then, use s64 */ 3573 /* Per-cpu values can be negative, use a signed accumulator */
3461 for_each_mem_cgroup_tree(iter, mem) 3574 for_each_mem_cgroup_tree(iter, mem)
3462 val += mem_cgroup_read_stat(iter, idx); 3575 val += mem_cgroup_read_stat(iter, idx);
3463 3576
@@ -3477,12 +3590,11 @@ static inline u64 mem_cgroup_usage(struct mem_cgroup *mem, bool swap)
3477 return res_counter_read_u64(&mem->memsw, RES_USAGE); 3590 return res_counter_read_u64(&mem->memsw, RES_USAGE);
3478 } 3591 }
3479 3592
3480 val = mem_cgroup_get_recursive_idx_stat(mem, MEM_CGROUP_STAT_CACHE); 3593 val = mem_cgroup_recursive_stat(mem, MEM_CGROUP_STAT_CACHE);
3481 val += mem_cgroup_get_recursive_idx_stat(mem, MEM_CGROUP_STAT_RSS); 3594 val += mem_cgroup_recursive_stat(mem, MEM_CGROUP_STAT_RSS);
3482 3595
3483 if (swap) 3596 if (swap)
3484 val += mem_cgroup_get_recursive_idx_stat(mem, 3597 val += mem_cgroup_recursive_stat(mem, MEM_CGROUP_STAT_SWAPOUT);
3485 MEM_CGROUP_STAT_SWAPOUT);
3486 3598
3487 return val << PAGE_SHIFT; 3599 return val << PAGE_SHIFT;
3488} 3600}
@@ -3702,9 +3814,9 @@ mem_cgroup_get_local_stat(struct mem_cgroup *mem, struct mcs_total_stat *s)
3702 s->stat[MCS_RSS] += val * PAGE_SIZE; 3814 s->stat[MCS_RSS] += val * PAGE_SIZE;
3703 val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_FILE_MAPPED); 3815 val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_FILE_MAPPED);
3704 s->stat[MCS_FILE_MAPPED] += val * PAGE_SIZE; 3816 s->stat[MCS_FILE_MAPPED] += val * PAGE_SIZE;
3705 val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_PGPGIN_COUNT); 3817 val = mem_cgroup_read_events(mem, MEM_CGROUP_EVENTS_PGPGIN);
3706 s->stat[MCS_PGPGIN] += val; 3818 s->stat[MCS_PGPGIN] += val;
3707 val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_PGPGOUT_COUNT); 3819 val = mem_cgroup_read_events(mem, MEM_CGROUP_EVENTS_PGPGOUT);
3708 s->stat[MCS_PGPGOUT] += val; 3820 s->stat[MCS_PGPGOUT] += val;
3709 if (do_swap_account) { 3821 if (do_swap_account) {
3710 val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_SWAPOUT); 3822 val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_SWAPOUT);
@@ -3828,9 +3940,7 @@ static int mem_cgroup_swappiness_write(struct cgroup *cgrp, struct cftype *cft,
3828 return -EINVAL; 3940 return -EINVAL;
3829 } 3941 }
3830 3942
3831 spin_lock(&memcg->reclaim_param_lock);
3832 memcg->swappiness = val; 3943 memcg->swappiness = val;
3833 spin_unlock(&memcg->reclaim_param_lock);
3834 3944
3835 cgroup_unlock(); 3945 cgroup_unlock();
3836 3946
@@ -4486,7 +4596,6 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
4486 res_counter_init(&mem->memsw, NULL); 4596 res_counter_init(&mem->memsw, NULL);
4487 } 4597 }
4488 mem->last_scanned_child = 0; 4598 mem->last_scanned_child = 0;
4489 spin_lock_init(&mem->reclaim_param_lock);
4490 INIT_LIST_HEAD(&mem->oom_notify); 4599 INIT_LIST_HEAD(&mem->oom_notify);
4491 4600
4492 if (parent) 4601 if (parent)
@@ -4574,8 +4683,7 @@ one_by_one:
4574 batch_count = PRECHARGE_COUNT_AT_ONCE; 4683 batch_count = PRECHARGE_COUNT_AT_ONCE;
4575 cond_resched(); 4684 cond_resched();
4576 } 4685 }
4577 ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false, 4686 ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, 1, &mem, false);
4578 PAGE_SIZE);
4579 if (ret || !mem) 4687 if (ret || !mem)
4580 /* mem_cgroup_clear_mc() will do uncharge later */ 4688 /* mem_cgroup_clear_mc() will do uncharge later */
4581 return -ENOMEM; 4689 return -ENOMEM;
@@ -4737,7 +4845,8 @@ static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
4737 pte_t *pte; 4845 pte_t *pte;
4738 spinlock_t *ptl; 4846 spinlock_t *ptl;
4739 4847
4740 VM_BUG_ON(pmd_trans_huge(*pmd)); 4848 split_huge_page_pmd(walk->mm, pmd);
4849
4741 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); 4850 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
4742 for (; addr != end; pte++, addr += PAGE_SIZE) 4851 for (; addr != end; pte++, addr += PAGE_SIZE)
4743 if (is_target_pte_for_mc(vma, addr, *pte, NULL)) 4852 if (is_target_pte_for_mc(vma, addr, *pte, NULL))
@@ -4899,8 +5008,8 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
4899 pte_t *pte; 5008 pte_t *pte;
4900 spinlock_t *ptl; 5009 spinlock_t *ptl;
4901 5010
5011 split_huge_page_pmd(walk->mm, pmd);
4902retry: 5012retry:
4903 VM_BUG_ON(pmd_trans_huge(*pmd));
4904 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); 5013 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
4905 for (; addr != end; addr += PAGE_SIZE) { 5014 for (; addr != end; addr += PAGE_SIZE) {
4906 pte_t ptent = *(pte++); 5015 pte_t ptent = *(pte++);
@@ -4920,8 +5029,8 @@ retry:
4920 if (isolate_lru_page(page)) 5029 if (isolate_lru_page(page))
4921 goto put; 5030 goto put;
4922 pc = lookup_page_cgroup(page); 5031 pc = lookup_page_cgroup(page);
4923 if (!mem_cgroup_move_account(pc, 5032 if (!mem_cgroup_move_account(page, 1, pc,
4924 mc.from, mc.to, false, PAGE_SIZE)) { 5033 mc.from, mc.to, false)) {
4925 mc.precharge--; 5034 mc.precharge--;
4926 /* we uncharge from mc.from later. */ 5035 /* we uncharge from mc.from later. */
4927 mc.moved_charge++; 5036 mc.moved_charge++;
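Taken together, the memcontrol.c hunks switch the charge/uncharge plumbing from byte counts (page_size, batch->bytes) to page counts (nr_pages, batch->nr_pages), converting to bytes only at the res_counter boundary, while keeping the batched uncharge in which per-page uncharges are accumulated and flushed once in mem_cgroup_uncharge_end(). The toy program below models only that bookkeeping; struct batch and the helper names are invented, and the res/memsw counters are plain integers rather than res_counters.

/* Toy model of page-based uncharge batching; not kernel code. */
#include <stdio.h>

#define PAGE_SIZE 4096UL

struct batch { unsigned long nr_pages, memsw_nr_pages; };

static void batch_uncharge(struct batch *b, unsigned int nr_pages, int count_memsw)
{
	b->nr_pages += nr_pages;		/* remember freed charge ... */
	if (count_memsw)
		b->memsw_nr_pages += nr_pages;	/* ... and uncharge it later */
}

static void batch_flush(struct batch *b, unsigned long *res, unsigned long *memsw)
{
	*res -= b->nr_pages * PAGE_SIZE;	/* one counter update per flush */
	*memsw -= b->memsw_nr_pages * PAGE_SIZE;
	b->nr_pages = b->memsw_nr_pages = 0;
}

int main(void)
{
	unsigned long res = 128 * PAGE_SIZE, memsw = 128 * PAGE_SIZE;
	struct batch b = { 0, 0 };
	int i;

	for (i = 0; i < 32; i++)		/* a burst of order-0 frees */
		batch_uncharge(&b, 1, 1);
	batch_flush(&b, &res, &memsw);

	printf("res=%lu pages, memsw=%lu pages\n",
	       res / PAGE_SIZE, memsw / PAGE_SIZE);
	return 0;
}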
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 99ccb4472623..2b9a5eef39e0 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -208,7 +208,7 @@ static int kill_proc_ao(struct task_struct *t, unsigned long addr, int trapno,
208 * Don't use force here, it's convenient if the signal 208 * Don't use force here, it's convenient if the signal
209 * can be temporarily blocked. 209 * can be temporarily blocked.
210 * This could cause a loop when the user sets SIGBUS 210 * This could cause a loop when the user sets SIGBUS
211 * to SIG_IGN, but hopefully noone will do that? 211 * to SIG_IGN, but hopefully no one will do that?
212 */ 212 */
213 ret = send_sig_info(SIGBUS, &si, t); /* synchronous? */ 213 ret = send_sig_info(SIGBUS, &si, t); /* synchronous? */
214 if (ret < 0) 214 if (ret < 0)
@@ -634,7 +634,7 @@ static int me_pagecache_dirty(struct page *p, unsigned long pfn)
634 * when the page is reread or dropped. If an 634 * when the page is reread or dropped. If an
635 * application assumes it will always get error on 635 * application assumes it will always get error on
636 * fsync, but does other operations on the fd before 636 * fsync, but does other operations on the fd before
637 * and the page is dropped inbetween then the error 637 * and the page is dropped between then the error
638 * will not be properly reported. 638 * will not be properly reported.
639 * 639 *
640 * This can already happen even without hwpoisoned 640 * This can already happen even without hwpoisoned
@@ -728,7 +728,7 @@ static int me_huge_page(struct page *p, unsigned long pfn)
728 * The table matches them in order and calls the right handler. 728 * The table matches them in order and calls the right handler.
729 * 729 *
730 * This is quite tricky because we can access page at any time 730 * This is quite tricky because we can access page at any time
731 * in its live cycle, so all accesses have to be extremly careful. 731 * in its live cycle, so all accesses have to be extremely careful.
732 * 732 *
733 * This is not complete. More states could be added. 733 * This is not complete. More states could be added.
734 * For any missing state don't attempt recovery. 734 * For any missing state don't attempt recovery.
@@ -945,7 +945,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
945 collect_procs(ppage, &tokill); 945 collect_procs(ppage, &tokill);
946 946
947 if (hpage != ppage) 947 if (hpage != ppage)
948 lock_page_nosync(ppage); 948 lock_page(ppage);
949 949
950 ret = try_to_unmap(ppage, ttu); 950 ret = try_to_unmap(ppage, ttu);
951 if (ret != SWAP_SUCCESS) 951 if (ret != SWAP_SUCCESS)
@@ -1038,7 +1038,7 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
1038 * Check "just unpoisoned", "filter hit", and 1038 * Check "just unpoisoned", "filter hit", and
1039 * "race with other subpage." 1039 * "race with other subpage."
1040 */ 1040 */
1041 lock_page_nosync(hpage); 1041 lock_page(hpage);
1042 if (!PageHWPoison(hpage) 1042 if (!PageHWPoison(hpage)
1043 || (hwpoison_filter(p) && TestClearPageHWPoison(p)) 1043 || (hwpoison_filter(p) && TestClearPageHWPoison(p))
1044 || (p != hpage && TestSetPageHWPoison(hpage))) { 1044 || (p != hpage && TestSetPageHWPoison(hpage))) {
@@ -1088,7 +1088,7 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
1088 * It's very difficult to mess with pages currently under IO 1088 * It's very difficult to mess with pages currently under IO
1089 * and in many cases impossible, so we just avoid it here. 1089 * and in many cases impossible, so we just avoid it here.
1090 */ 1090 */
1091 lock_page_nosync(hpage); 1091 lock_page(hpage);
1092 1092
1093 /* 1093 /*
1094 * unpoison always clear PG_hwpoison inside page lock 1094 * unpoison always clear PG_hwpoison inside page lock
@@ -1130,7 +1130,7 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
1130 1130
1131 /* 1131 /*
1132 * Now take care of user space mappings. 1132 * Now take care of user space mappings.
1133 * Abort on fail: __remove_from_page_cache() assumes unmapped page. 1133 * Abort on fail: __delete_from_page_cache() assumes unmapped page.
1134 */ 1134 */
1135 if (hwpoison_user_mappings(p, pfn, trapno) != SWAP_SUCCESS) { 1135 if (hwpoison_user_mappings(p, pfn, trapno) != SWAP_SUCCESS) {
1136 printk(KERN_ERR "MCE %#lx: cannot unmap page, give up\n", pfn); 1136 printk(KERN_ERR "MCE %#lx: cannot unmap page, give up\n", pfn);
@@ -1231,7 +1231,7 @@ int unpoison_memory(unsigned long pfn)
1231 return 0; 1231 return 0;
1232 } 1232 }
1233 1233
1234 lock_page_nosync(page); 1234 lock_page(page);
1235 /* 1235 /*
1236 * This test is racy because PG_hwpoison is set outside of page lock. 1236 * This test is racy because PG_hwpoison is set outside of page lock.
1237 * That's acceptable because that won't trigger kernel panic. Instead, 1237 * That's acceptable because that won't trigger kernel panic. Instead,
diff --git a/mm/memory.c b/mm/memory.c
index e48945ab362b..ce22a250926f 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1410,6 +1410,13 @@ no_page_table:
1410 return page; 1410 return page;
1411} 1411}
1412 1412
1413static inline int stack_guard_page(struct vm_area_struct *vma, unsigned long addr)
1414{
1415 return (vma->vm_flags & VM_GROWSDOWN) &&
1416 (vma->vm_start == addr) &&
1417 !vma_stack_continue(vma->vm_prev, addr);
1418}
1419
1413/** 1420/**
1414 * __get_user_pages() - pin user pages in memory 1421 * __get_user_pages() - pin user pages in memory
1415 * @tsk: task_struct of target task 1422 * @tsk: task_struct of target task
@@ -1486,9 +1493,8 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1486 struct vm_area_struct *vma; 1493 struct vm_area_struct *vma;
1487 1494
1488 vma = find_extend_vma(mm, start); 1495 vma = find_extend_vma(mm, start);
1489 if (!vma && in_gate_area(tsk, start)) { 1496 if (!vma && in_gate_area(mm, start)) {
1490 unsigned long pg = start & PAGE_MASK; 1497 unsigned long pg = start & PAGE_MASK;
1491 struct vm_area_struct *gate_vma = get_gate_vma(tsk);
1492 pgd_t *pgd; 1498 pgd_t *pgd;
1493 pud_t *pud; 1499 pud_t *pud;
1494 pmd_t *pmd; 1500 pmd_t *pmd;
@@ -1513,10 +1519,11 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1513 pte_unmap(pte); 1519 pte_unmap(pte);
1514 return i ? : -EFAULT; 1520 return i ? : -EFAULT;
1515 } 1521 }
1522 vma = get_gate_vma(mm);
1516 if (pages) { 1523 if (pages) {
1517 struct page *page; 1524 struct page *page;
1518 1525
1519 page = vm_normal_page(gate_vma, start, *pte); 1526 page = vm_normal_page(vma, start, *pte);
1520 if (!page) { 1527 if (!page) {
1521 if (!(gup_flags & FOLL_DUMP) && 1528 if (!(gup_flags & FOLL_DUMP) &&
1522 is_zero_pfn(pte_pfn(*pte))) 1529 is_zero_pfn(pte_pfn(*pte)))
@@ -1530,12 +1537,7 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1530 get_page(page); 1537 get_page(page);
1531 } 1538 }
1532 pte_unmap(pte); 1539 pte_unmap(pte);
1533 if (vmas) 1540 goto next_page;
1534 vmas[i] = gate_vma;
1535 i++;
1536 start += PAGE_SIZE;
1537 nr_pages--;
1538 continue;
1539 } 1541 }
1540 1542
1541 if (!vma || 1543 if (!vma ||
@@ -1549,6 +1551,13 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1549 continue; 1551 continue;
1550 } 1552 }
1551 1553
1554 /*
1555 * If we don't actually want the page itself,
1556 * and it's the stack guard page, just skip it.
1557 */
1558 if (!pages && stack_guard_page(vma, start))
1559 goto next_page;
1560
1552 do { 1561 do {
1553 struct page *page; 1562 struct page *page;
1554 unsigned int foll_flags = gup_flags; 1563 unsigned int foll_flags = gup_flags;
@@ -1569,6 +1578,8 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1569 fault_flags |= FAULT_FLAG_WRITE; 1578 fault_flags |= FAULT_FLAG_WRITE;
1570 if (nonblocking) 1579 if (nonblocking)
1571 fault_flags |= FAULT_FLAG_ALLOW_RETRY; 1580 fault_flags |= FAULT_FLAG_ALLOW_RETRY;
1581 if (foll_flags & FOLL_NOWAIT)
1582 fault_flags |= (FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT);
1572 1583
1573 ret = handle_mm_fault(mm, vma, start, 1584 ret = handle_mm_fault(mm, vma, start,
1574 fault_flags); 1585 fault_flags);
@@ -1589,13 +1600,17 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1589 return i ? i : -EFAULT; 1600 return i ? i : -EFAULT;
1590 BUG(); 1601 BUG();
1591 } 1602 }
1592 if (ret & VM_FAULT_MAJOR) 1603
1593 tsk->maj_flt++; 1604 if (tsk) {
1594 else 1605 if (ret & VM_FAULT_MAJOR)
1595 tsk->min_flt++; 1606 tsk->maj_flt++;
1607 else
1608 tsk->min_flt++;
1609 }
1596 1610
1597 if (ret & VM_FAULT_RETRY) { 1611 if (ret & VM_FAULT_RETRY) {
1598 *nonblocking = 0; 1612 if (nonblocking)
1613 *nonblocking = 0;
1599 return i; 1614 return i;
1600 } 1615 }
1601 1616
@@ -1625,6 +1640,7 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1625 flush_anon_page(vma, page, start); 1640 flush_anon_page(vma, page, start);
1626 flush_dcache_page(page); 1641 flush_dcache_page(page);
1627 } 1642 }
1643next_page:
1628 if (vmas) 1644 if (vmas)
1629 vmas[i] = vma; 1645 vmas[i] = vma;
1630 i++; 1646 i++;
@@ -1638,7 +1654,8 @@ EXPORT_SYMBOL(__get_user_pages);
1638 1654
1639/** 1655/**
1640 * get_user_pages() - pin user pages in memory 1656 * get_user_pages() - pin user pages in memory
1641 * @tsk: task_struct of target task 1657 * @tsk: the task_struct to use for page fault accounting, or
1658 * NULL if faults are not to be recorded.
1642 * @mm: mm_struct of target mm 1659 * @mm: mm_struct of target mm
1643 * @start: starting user address 1660 * @start: starting user address
1644 * @nr_pages: number of pages from start to pin 1661 * @nr_pages: number of pages from start to pin
@@ -2764,7 +2781,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
2764 swp_entry_t entry; 2781 swp_entry_t entry;
2765 pte_t pte; 2782 pte_t pte;
2766 int locked; 2783 int locked;
2767 struct mem_cgroup *ptr = NULL; 2784 struct mem_cgroup *ptr;
2768 int exclusive = 0; 2785 int exclusive = 0;
2769 int ret = 0; 2786 int ret = 0;
2770 2787
@@ -3496,7 +3513,7 @@ static int __init gate_vma_init(void)
3496__initcall(gate_vma_init); 3513__initcall(gate_vma_init);
3497#endif 3514#endif
3498 3515
3499struct vm_area_struct *get_gate_vma(struct task_struct *tsk) 3516struct vm_area_struct *get_gate_vma(struct mm_struct *mm)
3500{ 3517{
3501#ifdef AT_SYSINFO_EHDR 3518#ifdef AT_SYSINFO_EHDR
3502 return &gate_vma; 3519 return &gate_vma;
@@ -3505,7 +3522,7 @@ struct vm_area_struct *get_gate_vma(struct task_struct *tsk)
3505#endif 3522#endif
3506} 3523}
3507 3524
3508int in_gate_area_no_task(unsigned long addr) 3525int in_gate_area_no_mm(unsigned long addr)
3509{ 3526{
3510#ifdef AT_SYSINFO_EHDR 3527#ifdef AT_SYSINFO_EHDR
3511 if ((addr >= FIXADDR_USER_START) && (addr < FIXADDR_USER_END)) 3528 if ((addr >= FIXADDR_USER_START) && (addr < FIXADDR_USER_END))
@@ -3646,20 +3663,15 @@ int generic_access_phys(struct vm_area_struct *vma, unsigned long addr,
3646#endif 3663#endif
3647 3664
3648/* 3665/*
3649 * Access another process' address space. 3666 * Access another process' address space as given in mm. If non-NULL, use the
3650 * Source/target buffer must be kernel space, 3667 * given task for page fault accounting.
3651 * Do not walk the page table directly, use get_user_pages
3652 */ 3668 */
3653int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write) 3669static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm,
3670 unsigned long addr, void *buf, int len, int write)
3654{ 3671{
3655 struct mm_struct *mm;
3656 struct vm_area_struct *vma; 3672 struct vm_area_struct *vma;
3657 void *old_buf = buf; 3673 void *old_buf = buf;
3658 3674
3659 mm = get_task_mm(tsk);
3660 if (!mm)
3661 return 0;
3662
3663 down_read(&mm->mmap_sem); 3675 down_read(&mm->mmap_sem);
3664 /* ignore errors, just check how much was successfully transferred */ 3676 /* ignore errors, just check how much was successfully transferred */
3665 while (len) { 3677 while (len) {
@@ -3676,7 +3688,7 @@ int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, in
3676 */ 3688 */
3677#ifdef CONFIG_HAVE_IOREMAP_PROT 3689#ifdef CONFIG_HAVE_IOREMAP_PROT
3678 vma = find_vma(mm, addr); 3690 vma = find_vma(mm, addr);
3679 if (!vma) 3691 if (!vma || vma->vm_start > addr)
3680 break; 3692 break;
3681 if (vma->vm_ops && vma->vm_ops->access) 3693 if (vma->vm_ops && vma->vm_ops->access)
3682 ret = vma->vm_ops->access(vma, addr, buf, 3694 ret = vma->vm_ops->access(vma, addr, buf,
@@ -3708,11 +3720,47 @@ int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, in
3708 addr += bytes; 3720 addr += bytes;
3709 } 3721 }
3710 up_read(&mm->mmap_sem); 3722 up_read(&mm->mmap_sem);
3711 mmput(mm);
3712 3723
3713 return buf - old_buf; 3724 return buf - old_buf;
3714} 3725}
3715 3726
3727/**
3728 * access_remote_vm - access another process' address space
3729 * @mm: the mm_struct of the target address space
3730 * @addr: start address to access
3731 * @buf: source or destination buffer
3732 * @len: number of bytes to transfer
3733 * @write: whether the access is a write
3734 *
3735 * The caller must hold a reference on @mm.
3736 */
3737int access_remote_vm(struct mm_struct *mm, unsigned long addr,
3738 void *buf, int len, int write)
3739{
3740 return __access_remote_vm(NULL, mm, addr, buf, len, write);
3741}
3742
3743/*
3744 * Access another process' address space.
3745 * Source/target buffer must be kernel space,
3746 * Do not walk the page table directly, use get_user_pages
3747 */
3748int access_process_vm(struct task_struct *tsk, unsigned long addr,
3749 void *buf, int len, int write)
3750{
3751 struct mm_struct *mm;
3752 int ret;
3753
3754 mm = get_task_mm(tsk);
3755 if (!mm)
3756 return 0;
3757
3758 ret = __access_remote_vm(tsk, mm, addr, buf, len, write);
3759 mmput(mm);
3760
3761 return ret;
3762}
3763
3716/* 3764/*
3717 * Print the name of a VMA. 3765 * Print the name of a VMA.
3718 */ 3766 */
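The largest memory.c change above splits access_process_vm() into a core __access_remote_vm() that works on an mm the caller already holds, an access_remote_vm() entry point for callers that have an mm but no task, and a task-based wrapper that still does the get_task_mm()/mmput() dance. The userspace sketch below shows only that wrapper-around-a-core shape; toy_mm, toy_task and both toy_access_*() functions are stand-ins, not kernel APIs.

/* Sketch of the core-plus-wrapper split; invented types, not kernel code. */
#include <stddef.h>
#include <stdio.h>
#include <string.h>

struct toy_mm { char mem[256]; int refcount; };
struct toy_task { struct toy_mm *mm; };

/* core: the caller must already hold a reference on mm */
static int toy_access_remote_vm(struct toy_mm *mm, size_t addr,
				void *buf, size_t len, int write)
{
	if (addr >= sizeof(mm->mem))
		return 0;
	if (addr + len > sizeof(mm->mem))
		len = sizeof(mm->mem) - addr;
	if (write)
		memcpy(mm->mem + addr, buf, len);
	else
		memcpy(buf, mm->mem + addr, len);
	return (int)len;			/* bytes transferred */
}

/* wrapper: resolve the task to its mm, pin it, do the work, unpin */
static int toy_access_process_vm(struct toy_task *tsk, size_t addr,
				 void *buf, size_t len, int write)
{
	struct toy_mm *mm = tsk->mm;
	int ret;

	if (!mm)
		return 0;
	mm->refcount++;				/* get_task_mm() stand-in */
	ret = toy_access_remote_vm(mm, addr, buf, len, write);
	mm->refcount--;				/* mmput() stand-in */
	return ret;
}

int main(void)
{
	struct toy_mm mm = { .refcount = 1 };
	struct toy_task task = { .mm = &mm };
	char hello[] = "hello";
	char out[6] = { 0 };

	toy_access_process_vm(&task, 16, hello, 5, 1);
	toy_access_process_vm(&task, 16, out, 5, 0);
	printf("%s\n", out);
	return 0;
}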
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 321fc7455df7..9ca1d604f7cd 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -375,7 +375,7 @@ void online_page(struct page *page)
375#endif 375#endif
376 376
377#ifdef CONFIG_FLATMEM 377#ifdef CONFIG_FLATMEM
378 max_mapnr = max(page_to_pfn(page), max_mapnr); 378 max_mapnr = max(pfn, max_mapnr);
379#endif 379#endif
380 380
381 ClearPageReserved(page); 381 ClearPageReserved(page);
@@ -724,7 +724,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
724 pfn); 724 pfn);
725 dump_page(page); 725 dump_page(page);
726#endif 726#endif
727 /* Becasue we don't have big zone->lock. we should 727 /* Because we don't have big zone->lock. we should
728 check this again here. */ 728 check this again here. */
729 if (page_count(page)) { 729 if (page_count(page)) {
730 not_managed++; 730 not_managed++;
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 78062ab641ff..959a8b8c7350 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1979,8 +1979,7 @@ int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
1979 case MPOL_INTERLEAVE: 1979 case MPOL_INTERLEAVE:
1980 return nodes_equal(a->v.nodes, b->v.nodes); 1980 return nodes_equal(a->v.nodes, b->v.nodes);
1981 case MPOL_PREFERRED: 1981 case MPOL_PREFERRED:
1982 return a->v.preferred_node == b->v.preferred_node && 1982 return a->v.preferred_node == b->v.preferred_node;
1983 a->flags == b->flags;
1984 default: 1983 default:
1985 BUG(); 1984 BUG();
1986 return 0; 1985 return 0;
diff --git a/mm/migrate.c b/mm/migrate.c
index 352de555626c..34132f8e9109 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -375,7 +375,7 @@ void migrate_page_copy(struct page *newpage, struct page *page)
375 * redo the accounting that clear_page_dirty_for_io undid, 375 * redo the accounting that clear_page_dirty_for_io undid,
376 * but we can't use set_page_dirty because that function 376 * but we can't use set_page_dirty because that function
377 * is actually a signal that all of the page has become dirty. 377 * is actually a signal that all of the page has become dirty.
378 * Wheras only part of our page may be dirty. 378 * Whereas only part of our page may be dirty.
379 */ 379 */
380 __set_page_dirty_nobuffers(newpage); 380 __set_page_dirty_nobuffers(newpage);
381 } 381 }
@@ -564,7 +564,7 @@ static int fallback_migrate_page(struct address_space *mapping,
564 * == 0 - success 564 * == 0 - success
565 */ 565 */
566static int move_to_new_page(struct page *newpage, struct page *page, 566static int move_to_new_page(struct page *newpage, struct page *page,
567 int remap_swapcache) 567 int remap_swapcache, bool sync)
568{ 568{
569 struct address_space *mapping; 569 struct address_space *mapping;
570 int rc; 570 int rc;
@@ -586,18 +586,28 @@ static int move_to_new_page(struct page *newpage, struct page *page,
586 mapping = page_mapping(page); 586 mapping = page_mapping(page);
587 if (!mapping) 587 if (!mapping)
588 rc = migrate_page(mapping, newpage, page); 588 rc = migrate_page(mapping, newpage, page);
589 else if (mapping->a_ops->migratepage) 589 else {
590 /* 590 /*
591 * Most pages have a mapping and most filesystems 591 * Do not writeback pages if !sync and migratepage is
592 * should provide a migration function. Anonymous 592 * not pointing to migrate_page() which is nonblocking
593 * pages are part of swap space which also has its 593 * (swapcache/tmpfs uses migratepage = migrate_page).
594 * own migration function. This is the most common
595 * path for page migration.
596 */ 594 */
597 rc = mapping->a_ops->migratepage(mapping, 595 if (PageDirty(page) && !sync &&
598 newpage, page); 596 mapping->a_ops->migratepage != migrate_page)
599 else 597 rc = -EBUSY;
600 rc = fallback_migrate_page(mapping, newpage, page); 598 else if (mapping->a_ops->migratepage)
599 /*
600 * Most pages have a mapping and most filesystems
601 * should provide a migration function. Anonymous
602 * pages are part of swap space which also has its
603 * own migration function. This is the most common
604 * path for page migration.
605 */
606 rc = mapping->a_ops->migratepage(mapping,
607 newpage, page);
608 else
609 rc = fallback_migrate_page(mapping, newpage, page);
610 }
601 611
602 if (rc) { 612 if (rc) {
603 newpage->mapping = NULL; 613 newpage->mapping = NULL;
@@ -623,7 +633,7 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
623 struct page *newpage = get_new_page(page, private, &result); 633 struct page *newpage = get_new_page(page, private, &result);
624 int remap_swapcache = 1; 634 int remap_swapcache = 1;
625 int charge = 0; 635 int charge = 0;
626 struct mem_cgroup *mem = NULL; 636 struct mem_cgroup *mem;
627 struct anon_vma *anon_vma = NULL; 637 struct anon_vma *anon_vma = NULL;
628 638
629 if (!newpage) 639 if (!newpage)
@@ -641,7 +651,7 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
641 rc = -EAGAIN; 651 rc = -EAGAIN;
642 652
643 if (!trylock_page(page)) { 653 if (!trylock_page(page)) {
644 if (!force) 654 if (!force || !sync)
645 goto move_newpage; 655 goto move_newpage;
646 656
647 /* 657 /*
@@ -678,7 +688,7 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
678 } 688 }
679 689
680 /* charge against new page */ 690 /* charge against new page */
681 charge = mem_cgroup_prepare_migration(page, newpage, &mem); 691 charge = mem_cgroup_prepare_migration(page, newpage, &mem, GFP_KERNEL);
682 if (charge == -ENOMEM) { 692 if (charge == -ENOMEM) {
683 rc = -ENOMEM; 693 rc = -ENOMEM;
684 goto unlock; 694 goto unlock;
@@ -686,7 +696,15 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
686 BUG_ON(charge); 696 BUG_ON(charge);
687 697
688 if (PageWriteback(page)) { 698 if (PageWriteback(page)) {
689 if (!force || !sync) 699 /*
700 * For !sync, there is no point retrying as the retry loop
701 * is expected to be too short for PageWriteback to be cleared
702 */
703 if (!sync) {
704 rc = -EBUSY;
705 goto uncharge;
706 }
707 if (!force)
690 goto uncharge; 708 goto uncharge;
691 wait_on_page_writeback(page); 709 wait_on_page_writeback(page);
692 } 710 }
@@ -757,14 +775,14 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
757 775
758skip_unmap: 776skip_unmap:
759 if (!page_mapped(page)) 777 if (!page_mapped(page))
760 rc = move_to_new_page(newpage, page, remap_swapcache); 778 rc = move_to_new_page(newpage, page, remap_swapcache, sync);
761 779
762 if (rc && remap_swapcache) 780 if (rc && remap_swapcache)
763 remove_migration_ptes(page, page); 781 remove_migration_ptes(page, page);
764 782
765 /* Drop an anon_vma reference if we took one */ 783 /* Drop an anon_vma reference if we took one */
766 if (anon_vma) 784 if (anon_vma)
767 drop_anon_vma(anon_vma); 785 put_anon_vma(anon_vma);
768 786
769uncharge: 787uncharge:
770 if (!charge) 788 if (!charge)
@@ -850,13 +868,13 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
850 try_to_unmap(hpage, TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS); 868 try_to_unmap(hpage, TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS);
851 869
852 if (!page_mapped(hpage)) 870 if (!page_mapped(hpage))
853 rc = move_to_new_page(new_hpage, hpage, 1); 871 rc = move_to_new_page(new_hpage, hpage, 1, sync);
854 872
855 if (rc) 873 if (rc)
856 remove_migration_ptes(hpage, hpage); 874 remove_migration_ptes(hpage, hpage);
857 875
858 if (anon_vma) 876 if (anon_vma)
859 drop_anon_vma(anon_vma); 877 put_anon_vma(anon_vma);
860out: 878out:
861 unlock_page(hpage); 879 unlock_page(hpage);
862 880
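Running through the migrate.c hunks is the new sync flag: asynchronous migration (the compaction path) now refuses work that could block, returning -EBUSY for pages under writeback and for dirty pages whose ->migratepage is not the nonblocking migrate_page() used by swapcache/tmpfs, and it no longer blocks on page locks either. A compact sketch of just that decision, with invented types and none of the real callback plumbing:

/* Toy policy check only; types and field names are made up. */
#include <stdbool.h>
#include <stdio.h>

enum { MIG_OK = 0, MIG_EBUSY = -1 };

struct toy_page {
	bool dirty;
	bool writeback;
	bool nonblocking_migratepage;	/* e.g. swapcache/tmpfs */
};

static int toy_should_migrate(const struct toy_page *p, bool sync)
{
	if (p->writeback && !sync)
		return MIG_EBUSY;	/* a short async retry loop won't outlive writeback */
	if (p->dirty && !sync && !p->nonblocking_migratepage)
		return MIG_EBUSY;	/* ->migratepage could block on I/O */
	return MIG_OK;			/* fall through to the normal migration path */
}

int main(void)
{
	struct toy_page dirty_file = { .dirty = true };
	struct toy_page swapcache = { .dirty = true, .nonblocking_migratepage = true };

	printf("dirty file, async: %d\n", toy_should_migrate(&dirty_file, false));
	printf("dirty swapcache, async: %d\n", toy_should_migrate(&swapcache, false));
	printf("dirty file, sync: %d\n", toy_should_migrate(&dirty_file, true));
	return 0;
}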
diff --git a/mm/mlock.c b/mm/mlock.c
index c3924c7f00be..6b55e3efe0df 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -135,13 +135,6 @@ void munlock_vma_page(struct page *page)
135 } 135 }
136} 136}
137 137
138static inline int stack_guard_page(struct vm_area_struct *vma, unsigned long addr)
139{
140 return (vma->vm_flags & VM_GROWSDOWN) &&
141 (vma->vm_start == addr) &&
142 !vma_stack_continue(vma->vm_prev, addr);
143}
144
145/** 138/**
146 * __mlock_vma_pages_range() - mlock a range of pages in the vma. 139 * __mlock_vma_pages_range() - mlock a range of pages in the vma.
147 * @vma: target vma 140 * @vma: target vma
@@ -188,12 +181,6 @@ static long __mlock_vma_pages_range(struct vm_area_struct *vma,
188 if (vma->vm_flags & VM_LOCKED) 181 if (vma->vm_flags & VM_LOCKED)
189 gup_flags |= FOLL_MLOCK; 182 gup_flags |= FOLL_MLOCK;
190 183
191 /* We don't try to access the guard page of a stack vma */
192 if (stack_guard_page(vma, start)) {
193 addr += PAGE_SIZE;
194 nr_pages--;
195 }
196
197 return __get_user_pages(current, mm, addr, nr_pages, gup_flags, 184 return __get_user_pages(current, mm, addr, nr_pages, gup_flags,
198 NULL, NULL, nonblocking); 185 NULL, NULL, nonblocking);
199} 186}
@@ -237,7 +224,7 @@ long mlock_vma_pages_range(struct vm_area_struct *vma,
237 224
238 if (!((vma->vm_flags & (VM_DONTEXPAND | VM_RESERVED)) || 225 if (!((vma->vm_flags & (VM_DONTEXPAND | VM_RESERVED)) ||
239 is_vm_hugetlb_page(vma) || 226 is_vm_hugetlb_page(vma) ||
240 vma == get_gate_vma(current))) { 227 vma == get_gate_vma(current->mm))) {
241 228
242 __mlock_vma_pages_range(vma, start, end, NULL); 229 __mlock_vma_pages_range(vma, start, end, NULL);
243 230
@@ -332,7 +319,7 @@ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev,
332 int lock = newflags & VM_LOCKED; 319 int lock = newflags & VM_LOCKED;
333 320
334 if (newflags == vma->vm_flags || (vma->vm_flags & VM_SPECIAL) || 321 if (newflags == vma->vm_flags || (vma->vm_flags & VM_SPECIAL) ||
335 is_vm_hugetlb_page(vma) || vma == get_gate_vma(current)) 322 is_vm_hugetlb_page(vma) || vma == get_gate_vma(current->mm))
336 goto out; /* don't set VM_LOCKED, don't count */ 323 goto out; /* don't set VM_LOCKED, don't count */
337 324
338 pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); 325 pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
diff --git a/mm/mmap.c b/mm/mmap.c
index 2ec8eb5a9cdd..e27e0cf0de03 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -259,7 +259,7 @@ SYSCALL_DEFINE1(brk, unsigned long, brk)
259 * randomize_va_space to 2, which will still cause mm->start_brk 259 * randomize_va_space to 2, which will still cause mm->start_brk
260 * to be arbitrarily shifted 260 * to be arbitrarily shifted
261 */ 261 */
262 if (mm->start_brk > PAGE_ALIGN(mm->end_data)) 262 if (current->brk_randomized)
263 min_brk = mm->start_brk; 263 min_brk = mm->start_brk;
264 else 264 else
265 min_brk = mm->end_data; 265 min_brk = mm->end_data;
@@ -1814,11 +1814,14 @@ static int expand_downwards(struct vm_area_struct *vma,
1814 size = vma->vm_end - address; 1814 size = vma->vm_end - address;
1815 grow = (vma->vm_start - address) >> PAGE_SHIFT; 1815 grow = (vma->vm_start - address) >> PAGE_SHIFT;
1816 1816
1817 error = acct_stack_growth(vma, size, grow); 1817 error = -ENOMEM;
1818 if (!error) { 1818 if (grow <= vma->vm_pgoff) {
1819 vma->vm_start = address; 1819 error = acct_stack_growth(vma, size, grow);
1820 vma->vm_pgoff -= grow; 1820 if (!error) {
1821 perf_event_mmap(vma); 1821 vma->vm_start = address;
1822 vma->vm_pgoff -= grow;
1823 perf_event_mmap(vma);
1824 }
1822 } 1825 }
1823 } 1826 }
1824 vma_unlock_anon_vma(vma); 1827 vma_unlock_anon_vma(vma);
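expand_downwards() above now refuses to grow a stack VMA down by more pages than vma->vm_pgoff can absorb, because the following vma->vm_pgoff -= grow would wrap the unsigned offset. The same guard on a bare unsigned long, in plain C (grow_down() is a made-up helper, not a kernel function):

/* Illustration only; grow_down() and its counter are invented. */
#include <errno.h>
#include <stdio.h>

static int grow_down(unsigned long *vm_pgoff, unsigned long grow_pages)
{
	if (grow_pages > *vm_pgoff)	/* would underflow the page offset */
		return -ENOMEM;
	*vm_pgoff -= grow_pages;	/* safe: mirrors vma->vm_pgoff -= grow */
	return 0;
}

int main(void)
{
	unsigned long pgoff = 4;
	int ret;

	ret = grow_down(&pgoff, 3);
	printf("grow 3: %d (pgoff now %lu)\n", ret, pgoff);
	ret = grow_down(&pgoff, 9);
	printf("grow 9: %d (pgoff still %lu)\n", ret, pgoff);
	return 0;
}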
diff --git a/mm/mremap.c b/mm/mremap.c
index 1de98d492ddc..a7c1f9f9b941 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -277,9 +277,16 @@ static struct vm_area_struct *vma_to_resize(unsigned long addr,
277 if (old_len > vma->vm_end - addr) 277 if (old_len > vma->vm_end - addr)
278 goto Efault; 278 goto Efault;
279 279
280 if (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP)) { 280 /* Need to be careful about a growing mapping */
281 if (new_len > old_len) 281 if (new_len > old_len) {
282 unsigned long pgoff;
283
284 if (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP))
282 goto Efault; 285 goto Efault;
286 pgoff = (addr - vma->vm_start) >> PAGE_SHIFT;
287 pgoff += vma->vm_pgoff;
288 if (pgoff + (new_len >> PAGE_SHIFT) < pgoff)
289 goto Einval;
283 } 290 }
284 291
285 if (vma->vm_flags & VM_LOCKED) { 292 if (vma->vm_flags & VM_LOCKED) {
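vma_to_resize() gains a matching check in the other direction: a growing remap is rejected when the new end would push the file page offset past the top of the unsigned range, detected with the usual idiom that an overflowed unsigned sum compares smaller than its addend. Illustrated standalone below, with a hypothetical helper name:

/* Illustration of the unsigned wrap test; pgoff_would_overflow() is invented. */
#include <limits.h>
#include <stdbool.h>
#include <stdio.h>

static bool pgoff_would_overflow(unsigned long pgoff, unsigned long extra_pages)
{
	return pgoff + extra_pages < pgoff;	/* same idiom as the hunk above */
}

int main(void)
{
	printf("%d\n", pgoff_would_overflow(100, 50));			/* 0: fits  */
	printf("%d\n", pgoff_would_overflow(ULONG_MAX - 5, 50));	/* 1: wraps */
	return 0;
}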
diff --git a/mm/nobootmem.c b/mm/nobootmem.c
index e2bdb07079ce..9109049f0bbc 100644
--- a/mm/nobootmem.c
+++ b/mm/nobootmem.c
@@ -32,14 +32,6 @@ unsigned long max_low_pfn;
32unsigned long min_low_pfn; 32unsigned long min_low_pfn;
33unsigned long max_pfn; 33unsigned long max_pfn;
34 34
35#ifdef CONFIG_CRASH_DUMP
36/*
37 * If we have booted due to a crash, max_pfn will be a very low value. We need
38 * to know the amount of memory that the previous kernel used.
39 */
40unsigned long saved_max_pfn;
41#endif
42
43static void * __init __alloc_memory_core_early(int nid, u64 size, u64 align, 35static void * __init __alloc_memory_core_early(int nid, u64 size, u64 align,
44 u64 goal, u64 limit) 36 u64 goal, u64 limit)
45{ 37{
@@ -158,7 +150,7 @@ unsigned long __init free_all_bootmem(void)
158{ 150{
159 /* 151 /*
160 * We need to use MAX_NUMNODES instead of NODE_DATA(0)->node_id 152 * We need to use MAX_NUMNODES instead of NODE_DATA(0)->node_id
161 * because in some case like Node0 doesnt have RAM installed 153 * because in some case like Node0 doesn't have RAM installed
162 * low ram will be on Node1 154 * low ram will be on Node1
163 * Use MAX_NUMNODES will make sure all ranges in early_node_map[] 155 * Use MAX_NUMNODES will make sure all ranges in early_node_map[]
164 * will be used instead of only Node0 related 156 * will be used instead of only Node0 related
diff --git a/mm/nommu.c b/mm/nommu.c
index f59e1424d3db..c4c542c736a9 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -1842,10 +1842,6 @@ int remap_vmalloc_range(struct vm_area_struct *vma, void *addr,
1842} 1842}
1843EXPORT_SYMBOL(remap_vmalloc_range); 1843EXPORT_SYMBOL(remap_vmalloc_range);
1844 1844
1845void swap_unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
1846{
1847}
1848
1849unsigned long arch_get_unmapped_area(struct file *file, unsigned long addr, 1845unsigned long arch_get_unmapped_area(struct file *file, unsigned long addr,
1850 unsigned long len, unsigned long pgoff, unsigned long flags) 1846 unsigned long len, unsigned long pgoff, unsigned long flags)
1851{ 1847{
@@ -1963,7 +1959,7 @@ error:
1963 return -ENOMEM; 1959 return -ENOMEM;
1964} 1960}
1965 1961
1966int in_gate_area_no_task(unsigned long addr) 1962int in_gate_area_no_mm(unsigned long addr)
1967{ 1963{
1968 return 0; 1964 return 0;
1969} 1965}
@@ -1975,21 +1971,10 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
1975} 1971}
1976EXPORT_SYMBOL(filemap_fault); 1972EXPORT_SYMBOL(filemap_fault);
1977 1973
1978/* 1974static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm,
1979 * Access another process' address space. 1975 unsigned long addr, void *buf, int len, int write)
1980 * - source/target buffer must be kernel space
1981 */
1982int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write)
1983{ 1976{
1984 struct vm_area_struct *vma; 1977 struct vm_area_struct *vma;
1985 struct mm_struct *mm;
1986
1987 if (addr + len < addr)
1988 return 0;
1989
1990 mm = get_task_mm(tsk);
1991 if (!mm)
1992 return 0;
1993 1978
1994 down_read(&mm->mmap_sem); 1979 down_read(&mm->mmap_sem);
1995 1980
@@ -2014,6 +1999,43 @@ int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, in
2014 } 1999 }
2015 2000
2016 up_read(&mm->mmap_sem); 2001 up_read(&mm->mmap_sem);
2002
2003 return len;
2004}
2005
2006/**
2007 * @access_remote_vm - access another process' address space
2008 * @mm: the mm_struct of the target address space
2009 * @addr: start address to access
2010 * @buf: source or destination buffer
2011 * @len: number of bytes to transfer
2012 * @write: whether the access is a write
2013 *
2014 * The caller must hold a reference on @mm.
2015 */
2016int access_remote_vm(struct mm_struct *mm, unsigned long addr,
2017 void *buf, int len, int write)
2018{
2019 return __access_remote_vm(NULL, mm, addr, buf, len, write);
2020}
2021
2022/*
2023 * Access another process' address space.
2024 * - source/target buffer must be kernel space
2025 */
2026int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write)
2027{
2028 struct mm_struct *mm;
2029
2030 if (addr + len < addr)
2031 return 0;
2032
2033 mm = get_task_mm(tsk);
2034 if (!mm)
2035 return 0;
2036
2037 len = __access_remote_vm(tsk, mm, addr, buf, len, write);
2038
2017 mmput(mm); 2039 mmput(mm);
2018 return len; 2040 return len;
2019} 2041}
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 7dcca55ede7c..83fb72c108b7 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -31,6 +31,7 @@
31#include <linux/memcontrol.h> 31#include <linux/memcontrol.h>
32#include <linux/mempolicy.h> 32#include <linux/mempolicy.h>
33#include <linux/security.h> 33#include <linux/security.h>
34#include <linux/ptrace.h>
34 35
35int sysctl_panic_on_oom; 36int sysctl_panic_on_oom;
36int sysctl_oom_kill_allocating_task; 37int sysctl_oom_kill_allocating_task;
@@ -83,24 +84,6 @@ static bool has_intersects_mems_allowed(struct task_struct *tsk,
83#endif /* CONFIG_NUMA */ 84#endif /* CONFIG_NUMA */
84 85
85/* 86/*
86 * If this is a system OOM (not a memcg OOM) and the task selected to be
87 * killed is not already running at high (RT) priorities, speed up the
88 * recovery by boosting the dying task to the lowest FIFO priority.
89 * That helps with the recovery and avoids interfering with RT tasks.
90 */
91static void boost_dying_task_prio(struct task_struct *p,
92 struct mem_cgroup *mem)
93{
94 struct sched_param param = { .sched_priority = 1 };
95
96 if (mem)
97 return;
98
99 if (!rt_task(p))
100 sched_setscheduler_nocheck(p, SCHED_FIFO, &param);
101}
102
103/*
104 * The process p may have detached its own ->mm while exiting or through 87 * The process p may have detached its own ->mm while exiting or through
105 * use_mm(), but one or more of its subthreads may still have a valid 88 * use_mm(), but one or more of its subthreads may still have a valid
106 * pointer. Return p, or any of its subthreads with a valid ->mm, with 89 * pointer. Return p, or any of its subthreads with a valid ->mm, with
@@ -292,13 +275,15 @@ static struct task_struct *select_bad_process(unsigned int *ppoints,
292 unsigned long totalpages, struct mem_cgroup *mem, 275 unsigned long totalpages, struct mem_cgroup *mem,
293 const nodemask_t *nodemask) 276 const nodemask_t *nodemask)
294{ 277{
295 struct task_struct *p; 278 struct task_struct *g, *p;
296 struct task_struct *chosen = NULL; 279 struct task_struct *chosen = NULL;
297 *ppoints = 0; 280 *ppoints = 0;
298 281
299 for_each_process(p) { 282 do_each_thread(g, p) {
300 unsigned int points; 283 unsigned int points;
301 284
285 if (!p->mm)
286 continue;
302 if (oom_unkillable_task(p, mem, nodemask)) 287 if (oom_unkillable_task(p, mem, nodemask))
303 continue; 288 continue;
304 289
@@ -314,22 +299,29 @@ static struct task_struct *select_bad_process(unsigned int *ppoints,
314 if (test_tsk_thread_flag(p, TIF_MEMDIE)) 299 if (test_tsk_thread_flag(p, TIF_MEMDIE))
315 return ERR_PTR(-1UL); 300 return ERR_PTR(-1UL);
316 301
317 /* 302 if (p->flags & PF_EXITING) {
318 * This is in the process of releasing memory so wait for it 303 /*
319 * to finish before killing some other task by mistake. 304 * If p is the current task and is in the process of
320 * 305 * releasing memory, we allow the "kill" to set
321 * However, if p is the current task, we allow the 'kill' to 306 * TIF_MEMDIE, which will allow it to gain access to
322 * go ahead if it is exiting: this will simply set TIF_MEMDIE, 307 * memory reserves. Otherwise, it may stall forever.
323 * which will allow it to gain access to memory reserves in 308 *
324 * the process of exiting and releasing its resources. 309 * The loop isn't broken here, however, in case other
325 * Otherwise we could get an easy OOM deadlock. 310 * threads are found to have already been oom killed.
326 */ 311 */
327 if (thread_group_empty(p) && (p->flags & PF_EXITING) && p->mm) { 312 if (p == current) {
328 if (p != current) 313 chosen = p;
329 return ERR_PTR(-1UL); 314 *ppoints = 1000;
330 315 } else {
331 chosen = p; 316 /*
332 *ppoints = 1000; 317 * If this task is not being ptraced on exit,
318 * then wait for it to finish before killing
319 * some other task unnecessarily.
320 */
321 if (!(task_ptrace(p->group_leader) &
322 PT_TRACE_EXIT))
323 return ERR_PTR(-1UL);
324 }
333 } 325 }
334 326
335 points = oom_badness(p, mem, nodemask, totalpages); 327 points = oom_badness(p, mem, nodemask, totalpages);
@@ -337,7 +329,7 @@ static struct task_struct *select_bad_process(unsigned int *ppoints,
337 chosen = p; 329 chosen = p;
338 *ppoints = points; 330 *ppoints = points;
339 } 331 }
340 } 332 } while_each_thread(g, p);
341 333
342 return chosen; 334 return chosen;
343} 335}
@@ -396,7 +388,7 @@ static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order,
396 task_unlock(current); 388 task_unlock(current);
397 dump_stack(); 389 dump_stack();
398 mem_cgroup_print_oom_info(mem, p); 390 mem_cgroup_print_oom_info(mem, p);
399 show_mem(); 391 show_mem(SHOW_MEM_FILTER_NODES);
400 if (sysctl_oom_dump_tasks) 392 if (sysctl_oom_dump_tasks)
401 dump_tasks(mem, nodemask); 393 dump_tasks(mem, nodemask);
402} 394}
@@ -442,13 +434,6 @@ static int oom_kill_task(struct task_struct *p, struct mem_cgroup *mem)
442 set_tsk_thread_flag(p, TIF_MEMDIE); 434 set_tsk_thread_flag(p, TIF_MEMDIE);
443 force_sig(SIGKILL, p); 435 force_sig(SIGKILL, p);
444 436
445 /*
446 * We give our sacrificial lamb high priority and access to
447 * all the memory it needs. That way it should be able to
448 * exit() and clear out its resources quickly...
449 */
450 boost_dying_task_prio(p, mem);
451
452 return 0; 437 return 0;
453} 438}
454#undef K 439#undef K
@@ -472,7 +457,6 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
472 */ 457 */
473 if (p->flags & PF_EXITING) { 458 if (p->flags & PF_EXITING) {
474 set_tsk_thread_flag(p, TIF_MEMDIE); 459 set_tsk_thread_flag(p, TIF_MEMDIE);
475 boost_dying_task_prio(p, mem);
476 return 0; 460 return 0;
477 } 461 }
478 462
@@ -491,6 +475,8 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
491 list_for_each_entry(child, &t->children, sibling) { 475 list_for_each_entry(child, &t->children, sibling) {
492 unsigned int child_points; 476 unsigned int child_points;
493 477
478 if (child->mm == p->mm)
479 continue;
494 /* 480 /*
495 * oom_badness() returns 0 if the thread is unkillable 481 * oom_badness() returns 0 if the thread is unkillable
496 */ 482 */
@@ -537,6 +523,16 @@ void mem_cgroup_out_of_memory(struct mem_cgroup *mem, gfp_t gfp_mask)
537 unsigned int points = 0; 523 unsigned int points = 0;
538 struct task_struct *p; 524 struct task_struct *p;
539 525
526 /*
527 * If current has a pending SIGKILL, then automatically select it. The
528 * goal is to allow it to allocate so that it may quickly exit and free
529 * its memory.
530 */
531 if (fatal_signal_pending(current)) {
532 set_thread_flag(TIF_MEMDIE);
533 return;
534 }
535
540 check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, 0, NULL); 536 check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, 0, NULL);
541 limit = mem_cgroup_get_limit(mem) >> PAGE_SHIFT; 537 limit = mem_cgroup_get_limit(mem) >> PAGE_SHIFT;
542 read_lock(&tasklist_lock); 538 read_lock(&tasklist_lock);
@@ -689,7 +685,6 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
689 */ 685 */
690 if (fatal_signal_pending(current)) { 686 if (fatal_signal_pending(current)) {
691 set_thread_flag(TIF_MEMDIE); 687 set_thread_flag(TIF_MEMDIE);
692 boost_dying_task_prio(current, NULL);
693 return; 688 return;
694 } 689 }
695 690
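Besides dropping the priority boost for the chosen victim, the oom_kill.c hunks change how an exiting task is treated: select_bad_process() now walks every thread, and a PF_EXITING task short-circuits the scan only when it is current (pick it so it can finish exiting) or when it is not being ptraced through exit (in which case the killer waits for it rather than shooting someone else). A toy model of that branch follows; the structures and flag names are invented, and only the shape of the decision mirrors the hunk.

/* Toy decision model; not kernel code. */
#include <stdbool.h>
#include <stdio.h>

struct toy_task {
	const char *name;
	bool exiting;		/* PF_EXITING stand-in */
	bool ptraced_on_exit;	/* PT_TRACE_EXIT stand-in */
};

enum pick { PICK_NONE, PICK_THIS, PICK_ABORT };

static enum pick consider_exiting(const struct toy_task *p,
				  const struct toy_task *current_task)
{
	if (!p->exiting)
		return PICK_NONE;	/* score it normally */
	if (p == current_task)
		return PICK_THIS;	/* let current exit quickly */
	if (!p->ptraced_on_exit)
		return PICK_ABORT;	/* someone else is already dying: wait for it */
	return PICK_NONE;		/* exit is blocked by a ptracer: keep scanning */
}

int main(void)
{
	struct toy_task a = { "a", true, false }, b = { "b", true, true };

	printf("%d %d %d\n",
	       consider_exiting(&a, &a),	/* current and exiting: pick it */
	       consider_exiting(&a, &b),	/* another task exiting: back off */
	       consider_exiting(&b, &a));	/* exit held up by ptrace: continue */
	return 0;
}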
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 2cb01f6ec5d0..31f698862420 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -927,7 +927,7 @@ retry:
927 break; 927 break;
928 } 928 }
929 929
930 done_index = page->index + 1; 930 done_index = page->index;
931 931
932 lock_page(page); 932 lock_page(page);
933 933
@@ -977,6 +977,7 @@ continue_unlock:
977 * not be suitable for data integrity 977 * not be suitable for data integrity
978 * writeout). 978 * writeout).
979 */ 979 */
980 done_index = page->index + 1;
980 done = 1; 981 done = 1;
981 break; 982 break;
982 } 983 }
@@ -1039,11 +1040,17 @@ static int __writepage(struct page *page, struct writeback_control *wbc,
1039int generic_writepages(struct address_space *mapping, 1040int generic_writepages(struct address_space *mapping,
1040 struct writeback_control *wbc) 1041 struct writeback_control *wbc)
1041{ 1042{
1043 struct blk_plug plug;
1044 int ret;
1045
1042 /* deal with chardevs and other special file */ 1046 /* deal with chardevs and other special file */
1043 if (!mapping->a_ops->writepage) 1047 if (!mapping->a_ops->writepage)
1044 return 0; 1048 return 0;
1045 1049
1046 return write_cache_pages(mapping, wbc, __writepage, mapping); 1050 blk_start_plug(&plug);
1051 ret = write_cache_pages(mapping, wbc, __writepage, mapping);
1052 blk_finish_plug(&plug);
1053 return ret;
1047} 1054}
1048 1055
1049EXPORT_SYMBOL(generic_writepages); 1056EXPORT_SYMBOL(generic_writepages);
@@ -1211,6 +1218,17 @@ int set_page_dirty(struct page *page)
1211 1218
1212 if (likely(mapping)) { 1219 if (likely(mapping)) {
1213 int (*spd)(struct page *) = mapping->a_ops->set_page_dirty; 1220 int (*spd)(struct page *) = mapping->a_ops->set_page_dirty;
1221 /*
1222 * readahead/lru_deactivate_page could remain
1223 * PG_readahead/PG_reclaim due to race with end_page_writeback
1224 * About readahead, if the page is written, the flags would be
1225 * reset. So no problem.
1226 * About lru_deactivate_page, if the page is redirty, the flag
1227 * will be reset. So no problem. but if the page is used by readahead
1228 * it will confuse readahead and make it restart the size rampup
1229 * process. But it's a trivial problem.
1230 */
1231 ClearPageReclaim(page);
1214#ifdef CONFIG_BLOCK 1232#ifdef CONFIG_BLOCK
1215 if (!spd) 1233 if (!spd)
1216 spd = __set_page_dirty_buffers; 1234 spd = __set_page_dirty_buffers;
@@ -1239,7 +1257,7 @@ int set_page_dirty_lock(struct page *page)
1239{ 1257{
1240 int ret; 1258 int ret;
1241 1259
1242 lock_page_nosync(page); 1260 lock_page(page);
1243 ret = set_page_dirty(page); 1261 ret = set_page_dirty(page);
1244 unlock_page(page); 1262 unlock_page(page);
1245 return ret; 1263 return ret;
@@ -1266,7 +1284,6 @@ int clear_page_dirty_for_io(struct page *page)
1266 1284
1267 BUG_ON(!PageLocked(page)); 1285 BUG_ON(!PageLocked(page));
1268 1286
1269 ClearPageReclaim(page);
1270 if (mapping && mapping_cap_account_dirty(mapping)) { 1287 if (mapping && mapping_cap_account_dirty(mapping)) {
1271 /* 1288 /*
1272 * Yes, Virginia, this is indeed insane. 1289 * Yes, Virginia, this is indeed insane.
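
The first two page-writeback.c hunks change what done_index records: it is set to the index of the page about to be processed, and only advanced past that page once the page has actually been written before the scan stops, so a later pass resumes at the first unwritten page. A rough userspace model of that resume logic follows; the congested[] array is a made-up stand-in for pages whose writeout has to be deferred, not kernel state.

#include <stdio.h>
#include <stdbool.h>

#define NPAGES 8

/* One flag per "page": does writing it hit congestion? */
static const bool congested[NPAGES] = { false, false, false, true,
                                        false, false, false, false };

/*
 * Model of the write_cache_pages() scan: walk pages from *start, stop
 * on congestion, and report where a later pass should resume.
 */
static int writeback_pass(int *start)
{
        int done_index = *start;
        int written = 0;

        for (int index = *start; index < NPAGES; index++) {
                /* Record the page we are about to process... */
                done_index = index;

                if (congested[index]) {
                        /*
                         * ...but only skip past it once it has actually
                         * been written.  Here it was not, so the next
                         * pass must start at this very page.
                         */
                        break;
                }
                written++;
                done_index = index + 1;  /* page handled, resume after it */
        }
        *start = done_index;
        return written;
}

int main(void)
{
        int start = 0;

        printf("wrote %d pages, resume at index %d\n",
               writeback_pass(&start), start);
        return 0;
}
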
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 48c9737ad49a..df9fc3385fb2 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -53,6 +53,7 @@
53#include <linux/compaction.h> 53#include <linux/compaction.h>
54#include <trace/events/kmem.h> 54#include <trace/events/kmem.h>
55#include <linux/ftrace_event.h> 55#include <linux/ftrace_event.h>
56#include <linux/memcontrol.h>
56 57
57#include <asm/tlbflush.h> 58#include <asm/tlbflush.h>
58#include <asm/div64.h> 59#include <asm/div64.h>
@@ -565,7 +566,8 @@ static inline int free_pages_check(struct page *page)
565 if (unlikely(page_mapcount(page) | 566 if (unlikely(page_mapcount(page) |
566 (page->mapping != NULL) | 567 (page->mapping != NULL) |
567 (atomic_read(&page->_count) != 0) | 568 (atomic_read(&page->_count) != 0) |
568 (page->flags & PAGE_FLAGS_CHECK_AT_FREE))) { 569 (page->flags & PAGE_FLAGS_CHECK_AT_FREE) |
570 (mem_cgroup_bad_page_check(page)))) {
569 bad_page(page); 571 bad_page(page);
570 return 1; 572 return 1;
571 } 573 }
@@ -614,6 +616,10 @@ static void free_pcppages_bulk(struct zone *zone, int count,
614 list = &pcp->lists[migratetype]; 616 list = &pcp->lists[migratetype];
615 } while (list_empty(list)); 617 } while (list_empty(list));
616 618
619 /* This is the only non-empty list. Free them all. */
620 if (batch_free == MIGRATE_PCPTYPES)
621 batch_free = to_free;
622
617 do { 623 do {
618 page = list_entry(list->prev, struct page, lru); 624 page = list_entry(list->prev, struct page, lru);
619 /* must delete as __free_one_page list manipulates */ 625 /* must delete as __free_one_page list manipulates */
@@ -750,7 +756,8 @@ static inline int check_new_page(struct page *page)
750 if (unlikely(page_mapcount(page) | 756 if (unlikely(page_mapcount(page) |
751 (page->mapping != NULL) | 757 (page->mapping != NULL) |
752 (atomic_read(&page->_count) != 0) | 758 (atomic_read(&page->_count) != 0) |
753 (page->flags & PAGE_FLAGS_CHECK_AT_PREP))) { 759 (page->flags & PAGE_FLAGS_CHECK_AT_PREP) |
760 (mem_cgroup_bad_page_check(page)))) {
754 bad_page(page); 761 bad_page(page);
755 return 1; 762 return 1;
756 } 763 }
@@ -863,9 +870,8 @@ static int move_freepages(struct zone *zone,
863 } 870 }
864 871
865 order = page_order(page); 872 order = page_order(page);
866 list_del(&page->lru); 873 list_move(&page->lru,
867 list_add(&page->lru, 874 &zone->free_area[order].free_list[migratetype]);
868 &zone->free_area[order].free_list[migratetype]);
869 page += 1 << order; 875 page += 1 << order;
870 pages_moved += 1 << order; 876 pages_moved += 1 << order;
871 } 877 }
@@ -936,7 +942,7 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype)
936 * If breaking a large block of pages, move all free 942 * If breaking a large block of pages, move all free
937 * pages to the preferred allocation list. If falling 943 * pages to the preferred allocation list. If falling
938 * back for a reclaimable kernel allocation, be more 944 * back for a reclaimable kernel allocation, be more
939 * agressive about taking ownership of free pages 945 * aggressive about taking ownership of free pages
940 */ 946 */
941 if (unlikely(current_order >= (pageblock_order >> 1)) || 947 if (unlikely(current_order >= (pageblock_order >> 1)) ||
942 start_migratetype == MIGRATE_RECLAIMABLE || 948 start_migratetype == MIGRATE_RECLAIMABLE ||
@@ -1333,7 +1339,7 @@ again:
1333 } 1339 }
1334 1340
1335 __count_zone_vm_events(PGALLOC, zone, 1 << order); 1341 __count_zone_vm_events(PGALLOC, zone, 1 << order);
1336 zone_statistics(preferred_zone, zone); 1342 zone_statistics(preferred_zone, zone, gfp_flags);
1337 local_irq_restore(flags); 1343 local_irq_restore(flags);
1338 1344
1339 VM_BUG_ON(bad_range(zone, page)); 1345 VM_BUG_ON(bad_range(zone, page));
@@ -1714,6 +1720,20 @@ try_next_zone:
1714 return page; 1720 return page;
1715} 1721}
1716 1722
1723/*
1724 * Large machines with many possible nodes should not always dump per-node
1725 * meminfo in irq context.
1726 */
1727static inline bool should_suppress_show_mem(void)
1728{
1729 bool ret = false;
1730
1731#if NODES_SHIFT > 8
1732 ret = in_interrupt();
1733#endif
1734 return ret;
1735}
1736
1717static inline int 1737static inline int
1718should_alloc_retry(gfp_t gfp_mask, unsigned int order, 1738should_alloc_retry(gfp_t gfp_mask, unsigned int order,
1719 unsigned long pages_reclaimed) 1739 unsigned long pages_reclaimed)
@@ -2085,7 +2105,7 @@ rebalance:
2085 sync_migration); 2105 sync_migration);
2086 if (page) 2106 if (page)
2087 goto got_pg; 2107 goto got_pg;
2088 sync_migration = true; 2108 sync_migration = !(gfp_mask & __GFP_NO_KSWAPD);
2089 2109
2090 /* Try direct reclaim and then allocating */ 2110 /* Try direct reclaim and then allocating */
2091 page = __alloc_pages_direct_reclaim(gfp_mask, order, 2111 page = __alloc_pages_direct_reclaim(gfp_mask, order,
@@ -2157,11 +2177,25 @@ rebalance:
2157 2177
2158nopage: 2178nopage:
2159 if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit()) { 2179 if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit()) {
2160 printk(KERN_WARNING "%s: page allocation failure." 2180 unsigned int filter = SHOW_MEM_FILTER_NODES;
2161 " order:%d, mode:0x%x\n", 2181
2182 /*
2183 * This documents exceptions given to allocations in certain
2184 * contexts that are allowed to allocate outside current's set
2185 * of allowed nodes.
2186 */
2187 if (!(gfp_mask & __GFP_NOMEMALLOC))
2188 if (test_thread_flag(TIF_MEMDIE) ||
2189 (current->flags & (PF_MEMALLOC | PF_EXITING)))
2190 filter &= ~SHOW_MEM_FILTER_NODES;
2191 if (in_interrupt() || !wait)
2192 filter &= ~SHOW_MEM_FILTER_NODES;
2193
2194 pr_warning("%s: page allocation failure. order:%d, mode:0x%x\n",
2162 current->comm, order, gfp_mask); 2195 current->comm, order, gfp_mask);
2163 dump_stack(); 2196 dump_stack();
2164 show_mem(); 2197 if (!should_suppress_show_mem())
2198 show_mem(filter);
2165 } 2199 }
2166 return page; 2200 return page;
2167got_pg: 2201got_pg:
@@ -2411,19 +2445,42 @@ void si_meminfo_node(struct sysinfo *val, int nid)
2411} 2445}
2412#endif 2446#endif
2413 2447
2448/*
2449 * Determine whether the zone's node should be displayed or not, depending on
2450 * whether SHOW_MEM_FILTER_NODES was passed to __show_free_areas().
2451 */
2452static bool skip_free_areas_zone(unsigned int flags, const struct zone *zone)
2453{
2454 bool ret = false;
2455
2456 if (!(flags & SHOW_MEM_FILTER_NODES))
2457 goto out;
2458
2459 get_mems_allowed();
2460 ret = !node_isset(zone->zone_pgdat->node_id,
2461 cpuset_current_mems_allowed);
2462 put_mems_allowed();
2463out:
2464 return ret;
2465}
2466
2414#define K(x) ((x) << (PAGE_SHIFT-10)) 2467#define K(x) ((x) << (PAGE_SHIFT-10))
2415 2468
2416/* 2469/*
2417 * Show free area list (used inside shift_scroll-lock stuff) 2470 * Show free area list (used inside shift_scroll-lock stuff)
2418 * We also calculate the percentage fragmentation. We do this by counting the 2471 * We also calculate the percentage fragmentation. We do this by counting the
2419 * memory on each free list with the exception of the first item on the list. 2472 * memory on each free list with the exception of the first item on the list.
2473 * Suppresses nodes that are not allowed by current's cpuset if
2474 * SHOW_MEM_FILTER_NODES is passed.
2420 */ 2475 */
2421void show_free_areas(void) 2476void __show_free_areas(unsigned int filter)
2422{ 2477{
2423 int cpu; 2478 int cpu;
2424 struct zone *zone; 2479 struct zone *zone;
2425 2480
2426 for_each_populated_zone(zone) { 2481 for_each_populated_zone(zone) {
2482 if (skip_free_areas_zone(filter, zone))
2483 continue;
2427 show_node(zone); 2484 show_node(zone);
2428 printk("%s per-cpu:\n", zone->name); 2485 printk("%s per-cpu:\n", zone->name);
2429 2486
@@ -2465,6 +2522,8 @@ void show_free_areas(void)
2465 for_each_populated_zone(zone) { 2522 for_each_populated_zone(zone) {
2466 int i; 2523 int i;
2467 2524
2525 if (skip_free_areas_zone(filter, zone))
2526 continue;
2468 show_node(zone); 2527 show_node(zone);
2469 printk("%s" 2528 printk("%s"
2470 " free:%lukB" 2529 " free:%lukB"
@@ -2532,6 +2591,8 @@ void show_free_areas(void)
2532 for_each_populated_zone(zone) { 2591 for_each_populated_zone(zone) {
2533 unsigned long nr[MAX_ORDER], flags, order, total = 0; 2592 unsigned long nr[MAX_ORDER], flags, order, total = 0;
2534 2593
2594 if (skip_free_areas_zone(filter, zone))
2595 continue;
2535 show_node(zone); 2596 show_node(zone);
2536 printk("%s: ", zone->name); 2597 printk("%s: ", zone->name);
2537 2598
@@ -2551,6 +2612,11 @@ void show_free_areas(void)
2551 show_swap_cache_info(); 2612 show_swap_cache_info();
2552} 2613}
2553 2614
2615void show_free_areas(void)
2616{
2617 __show_free_areas(0);
2618}
2619
2554static void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref) 2620static void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref)
2555{ 2621{
2556 zoneref->zone = zone; 2622 zoneref->zone = zone;
@@ -3110,7 +3176,7 @@ static __init_refok int __build_all_zonelists(void *data)
3110 * Called with zonelists_mutex held always 3176 * Called with zonelists_mutex held always
3111 * unless system_state == SYSTEM_BOOTING. 3177 * unless system_state == SYSTEM_BOOTING.
3112 */ 3178 */
3113void build_all_zonelists(void *data) 3179void __ref build_all_zonelists(void *data)
3114{ 3180{
3115 set_zonelist_order(); 3181 set_zonelist_order();
3116 3182
@@ -3860,7 +3926,7 @@ static void __init find_usable_zone_for_movable(void)
3860 3926
3861/* 3927/*
3862 * The zone ranges provided by the architecture do not include ZONE_MOVABLE 3928 * The zone ranges provided by the architecture do not include ZONE_MOVABLE
3863 * because it is sized independant of architecture. Unlike the other zones, 3929 * because it is sized independent of architecture. Unlike the other zones,
3864 * the starting point for ZONE_MOVABLE is not fixed. It may be different 3930 * the starting point for ZONE_MOVABLE is not fixed. It may be different
3865 * in each node depending on the size of each node and how evenly kernelcore 3931 * in each node depending on the size of each node and how evenly kernelcore
3866 * is distributed. This helper function adjusts the zone ranges 3932 * is distributed. This helper function adjusts the zone ranges
@@ -5621,4 +5687,5 @@ void dump_page(struct page *page)
5621 page, atomic_read(&page->_count), page_mapcount(page), 5687 page, atomic_read(&page->_count), page_mapcount(page),
5622 page->mapping, page->index); 5688 page->mapping, page->index);
5623 dump_page_flags(page->flags); 5689 dump_page_flags(page->flags);
5690 mem_cgroup_print_bad_page(page);
5624} 5691}
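
Inside free_pcppages_bulk(), the added check short-circuits the round-robin draining: once the search for a non-empty list has cycled through all MIGRATE_PCPTYPES lists, the other lists are known to be empty and the whole remaining count can be freed from the single list that still has pages. A small userspace model of that loop, using plain counters instead of the per-cpu page lists (a sketch of the control flow, not the kernel function):

#include <stdio.h>

#define MIGRATE_PCPTYPES 3

/*
 * Model of free_pcppages_bulk()'s round-robin draining.  lists[] holds
 * how many pages each per-cpu migratetype list currently has; to_free
 * must not exceed the total number of queued pages.
 */
static void drain(int lists[MIGRATE_PCPTYPES], int to_free)
{
        int migratetype = 0;

        while (to_free) {
                int batch_free = 0;

                /* Find the next non-empty list, counting the empties. */
                do {
                        batch_free++;
                        migratetype = (migratetype + 1) % MIGRATE_PCPTYPES;
                } while (!lists[migratetype]);

                /* Only one list is non-empty: drain it in one go. */
                if (batch_free == MIGRATE_PCPTYPES)
                        batch_free = to_free;

                do {
                        lists[migratetype]--;        /* "free" one page */
                        printf("freed a page from list %d\n", migratetype);
                } while (--to_free && --batch_free && lists[migratetype]);
        }
}

int main(void)
{
        int lists[MIGRATE_PCPTYPES] = { 0, 5, 0 };

        drain(lists, 4);
        return 0;
}
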
diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c
index 5bffada7cde1..99055010cece 100644
--- a/mm/page_cgroup.c
+++ b/mm/page_cgroup.c
@@ -11,12 +11,11 @@
11#include <linux/swapops.h> 11#include <linux/swapops.h>
12#include <linux/kmemleak.h> 12#include <linux/kmemleak.h>
13 13
14static void __meminit 14static void __meminit init_page_cgroup(struct page_cgroup *pc, unsigned long id)
15__init_page_cgroup(struct page_cgroup *pc, unsigned long pfn)
16{ 15{
17 pc->flags = 0; 16 pc->flags = 0;
17 set_page_cgroup_array_id(pc, id);
18 pc->mem_cgroup = NULL; 18 pc->mem_cgroup = NULL;
19 pc->page = pfn_to_page(pfn);
20 INIT_LIST_HEAD(&pc->lru); 19 INIT_LIST_HEAD(&pc->lru);
21} 20}
22static unsigned long total_usage; 21static unsigned long total_usage;
@@ -43,6 +42,19 @@ struct page_cgroup *lookup_page_cgroup(struct page *page)
43 return base + offset; 42 return base + offset;
44} 43}
45 44
45struct page *lookup_cgroup_page(struct page_cgroup *pc)
46{
47 unsigned long pfn;
48 struct page *page;
49 pg_data_t *pgdat;
50
51 pgdat = NODE_DATA(page_cgroup_array_id(pc));
52 pfn = pc - pgdat->node_page_cgroup + pgdat->node_start_pfn;
53 page = pfn_to_page(pfn);
54 VM_BUG_ON(pc != lookup_page_cgroup(page));
55 return page;
56}
57
46static int __init alloc_node_page_cgroup(int nid) 58static int __init alloc_node_page_cgroup(int nid)
47{ 59{
48 struct page_cgroup *base, *pc; 60 struct page_cgroup *base, *pc;
@@ -63,7 +75,7 @@ static int __init alloc_node_page_cgroup(int nid)
63 return -ENOMEM; 75 return -ENOMEM;
64 for (index = 0; index < nr_pages; index++) { 76 for (index = 0; index < nr_pages; index++) {
65 pc = base + index; 77 pc = base + index;
66 __init_page_cgroup(pc, start_pfn + index); 78 init_page_cgroup(pc, nid);
67 } 79 }
68 NODE_DATA(nid)->node_page_cgroup = base; 80 NODE_DATA(nid)->node_page_cgroup = base;
69 total_usage += table_size; 81 total_usage += table_size;
@@ -105,46 +117,75 @@ struct page_cgroup *lookup_page_cgroup(struct page *page)
105 return section->page_cgroup + pfn; 117 return section->page_cgroup + pfn;
106} 118}
107 119
108/* __alloc_bootmem...() is protected by !slab_available() */ 120struct page *lookup_cgroup_page(struct page_cgroup *pc)
121{
122 struct mem_section *section;
123 struct page *page;
124 unsigned long nr;
125
126 nr = page_cgroup_array_id(pc);
127 section = __nr_to_section(nr);
128 page = pfn_to_page(pc - section->page_cgroup);
129 VM_BUG_ON(pc != lookup_page_cgroup(page));
130 return page;
131}
132
133static void *__init_refok alloc_page_cgroup(size_t size, int nid)
134{
135 void *addr = NULL;
136
137 addr = alloc_pages_exact(size, GFP_KERNEL | __GFP_NOWARN);
138 if (addr)
139 return addr;
140
141 if (node_state(nid, N_HIGH_MEMORY))
142 addr = vmalloc_node(size, nid);
143 else
144 addr = vmalloc(size);
145
146 return addr;
147}
148
149#ifdef CONFIG_MEMORY_HOTPLUG
150static void free_page_cgroup(void *addr)
151{
152 if (is_vmalloc_addr(addr)) {
153 vfree(addr);
154 } else {
155 struct page *page = virt_to_page(addr);
156 size_t table_size =
157 sizeof(struct page_cgroup) * PAGES_PER_SECTION;
158
159 BUG_ON(PageReserved(page));
160 free_pages_exact(addr, table_size);
161 }
162}
163#endif
164
109static int __init_refok init_section_page_cgroup(unsigned long pfn) 165static int __init_refok init_section_page_cgroup(unsigned long pfn)
110{ 166{
111 struct mem_section *section = __pfn_to_section(pfn);
112 struct page_cgroup *base, *pc; 167 struct page_cgroup *base, *pc;
168 struct mem_section *section;
113 unsigned long table_size; 169 unsigned long table_size;
170 unsigned long nr;
114 int nid, index; 171 int nid, index;
115 172
116 if (!section->page_cgroup) { 173 nr = pfn_to_section_nr(pfn);
117 nid = page_to_nid(pfn_to_page(pfn)); 174 section = __nr_to_section(nr);
118 table_size = sizeof(struct page_cgroup) * PAGES_PER_SECTION; 175
119 VM_BUG_ON(!slab_is_available()); 176 if (section->page_cgroup)
120 if (node_state(nid, N_HIGH_MEMORY)) { 177 return 0;
121 base = kmalloc_node(table_size, 178
122 GFP_KERNEL | __GFP_NOWARN, nid); 179 nid = page_to_nid(pfn_to_page(pfn));
123 if (!base) 180 table_size = sizeof(struct page_cgroup) * PAGES_PER_SECTION;
124 base = vmalloc_node(table_size, nid); 181 base = alloc_page_cgroup(table_size, nid);
125 } else { 182
126 base = kmalloc(table_size, GFP_KERNEL | __GFP_NOWARN); 183 /*
127 if (!base) 184 * The value stored in section->page_cgroup is (base - pfn)
128 base = vmalloc(table_size); 185 * and it does not point to the memory block allocated above,
129 } 186 * causing kmemleak false positives.
130 /* 187 */
131 * The value stored in section->page_cgroup is (base - pfn) 188 kmemleak_not_leak(base);
132 * and it does not point to the memory block allocated above,
133 * causing kmemleak false positives.
134 */
135 kmemleak_not_leak(base);
136 } else {
137 /*
138 * We don't have to allocate page_cgroup again, but
139 * address of memmap may be changed. So, we have to initialize
140 * again.
141 */
142 base = section->page_cgroup + pfn;
143 table_size = 0;
144 /* check address of memmap is changed or not. */
145 if (base->page == pfn_to_page(pfn))
146 return 0;
147 }
148 189
149 if (!base) { 190 if (!base) {
150 printk(KERN_ERR "page cgroup allocation failure\n"); 191 printk(KERN_ERR "page cgroup allocation failure\n");
@@ -153,7 +194,7 @@ static int __init_refok init_section_page_cgroup(unsigned long pfn)
153 194
154 for (index = 0; index < PAGES_PER_SECTION; index++) { 195 for (index = 0; index < PAGES_PER_SECTION; index++) {
155 pc = base + index; 196 pc = base + index;
156 __init_page_cgroup(pc, pfn + index); 197 init_page_cgroup(pc, nr);
157 } 198 }
158 199
159 section->page_cgroup = base - pfn; 200 section->page_cgroup = base - pfn;
@@ -170,16 +211,8 @@ void __free_page_cgroup(unsigned long pfn)
170 if (!ms || !ms->page_cgroup) 211 if (!ms || !ms->page_cgroup)
171 return; 212 return;
172 base = ms->page_cgroup + pfn; 213 base = ms->page_cgroup + pfn;
173 if (is_vmalloc_addr(base)) { 214 free_page_cgroup(base);
174 vfree(base); 215 ms->page_cgroup = NULL;
175 ms->page_cgroup = NULL;
176 } else {
177 struct page *page = virt_to_page(base);
178 if (!PageReserved(page)) { /* Is bootmem ? */
179 kfree(base);
180 ms->page_cgroup = NULL;
181 }
182 }
183} 216}
184 217
185int __meminit online_page_cgroup(unsigned long start_pfn, 218int __meminit online_page_cgroup(unsigned long start_pfn,
@@ -243,12 +276,7 @@ static int __meminit page_cgroup_callback(struct notifier_block *self,
243 break; 276 break;
244 } 277 }
245 278
246 if (ret) 279 return notifier_from_errno(ret);
247 ret = notifier_from_errno(ret);
248 else
249 ret = NOTIFY_OK;
250
251 return ret;
252} 280}
253 281
254#endif 282#endif
@@ -349,7 +377,7 @@ not_enough_page:
349 * @new: new id 377 * @new: new id
350 * 378 *
351 * Returns old id at success, 0 at failure. 379 * Returns old id at success, 0 at failure.
352 * (There is no mem_cgroup useing 0 as its id) 380 * (There is no mem_cgroup using 0 as its id)
353 */ 381 */
354unsigned short swap_cgroup_cmpxchg(swp_entry_t ent, 382unsigned short swap_cgroup_cmpxchg(swp_entry_t ent,
355 unsigned short old, unsigned short new) 383 unsigned short old, unsigned short new)
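
The page_cgroup rework drops the stored struct page pointer: each descriptor only records which array it lives in, and lookup_cgroup_page() recovers the pfn from the descriptor's position within that array. The arithmetic is just the reverse of the pfn-indexed lookup, as in this standalone model (struct page_desc and the helper names are illustrative, not the kernel types):

#include <assert.h>
#include <stdio.h>
#include <stdlib.h>

struct page_desc {
        unsigned long flags;    /* the array id would be packed in here */
};

static struct page_desc *base;           /* descriptor for start_pfn */
static unsigned long start_pfn = 4096;   /* first pfn covered (arbitrary) */
static unsigned long nr_pages = 1024;

static struct page_desc *lookup_desc(unsigned long pfn)
{
        /* Forward lookup: index the flat array by (pfn - start_pfn). */
        return base + (pfn - start_pfn);
}

static unsigned long desc_to_pfn(struct page_desc *pc)
{
        /* Reverse lookup: pointer difference gives the array offset. */
        return (unsigned long)(pc - base) + start_pfn;
}

int main(void)
{
        base = calloc(nr_pages, sizeof(*base));
        if (!base)
                return 1;

        unsigned long pfn = start_pfn + 123;
        struct page_desc *pc = lookup_desc(pfn);

        /* The round trip must hand back the pfn we started from. */
        assert(desc_to_pfn(pc) == pfn);
        printf("pfn %lu -> desc %p -> pfn %lu\n", pfn, (void *)pc,
               desc_to_pfn(pc));
        free(base);
        return 0;
}
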
diff --git a/mm/page_io.c b/mm/page_io.c
index 2dee975bf469..dc76b4d0611e 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -106,7 +106,7 @@ int swap_writepage(struct page *page, struct writeback_control *wbc)
106 goto out; 106 goto out;
107 } 107 }
108 if (wbc->sync_mode == WB_SYNC_ALL) 108 if (wbc->sync_mode == WB_SYNC_ALL)
109 rw |= REQ_SYNC | REQ_UNPLUG; 109 rw |= REQ_SYNC;
110 count_vm_event(PSWPOUT); 110 count_vm_event(PSWPOUT);
111 set_page_writeback(page); 111 set_page_writeback(page);
112 unlock_page(page); 112 unlock_page(page);
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index 7cfa6ae02303..c3450d533611 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -33,19 +33,35 @@ static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
33 33
34 pmd = pmd_offset(pud, addr); 34 pmd = pmd_offset(pud, addr);
35 do { 35 do {
36again:
36 next = pmd_addr_end(addr, end); 37 next = pmd_addr_end(addr, end);
37 split_huge_page_pmd(walk->mm, pmd); 38 if (pmd_none(*pmd)) {
38 if (pmd_none_or_clear_bad(pmd)) {
39 if (walk->pte_hole) 39 if (walk->pte_hole)
40 err = walk->pte_hole(addr, next, walk); 40 err = walk->pte_hole(addr, next, walk);
41 if (err) 41 if (err)
42 break; 42 break;
43 continue; 43 continue;
44 } 44 }
45 /*
46 * This implies that each ->pmd_entry() handler
47 * needs to know about pmd_trans_huge() pmds
48 */
45 if (walk->pmd_entry) 49 if (walk->pmd_entry)
46 err = walk->pmd_entry(pmd, addr, next, walk); 50 err = walk->pmd_entry(pmd, addr, next, walk);
47 if (!err && walk->pte_entry) 51 if (err)
48 err = walk_pte_range(pmd, addr, next, walk); 52 break;
53
54 /*
55 * Check this here so we only break down trans_huge
56 * pages when we _need_ to
57 */
58 if (!walk->pte_entry)
59 continue;
60
61 split_huge_page_pmd(walk->mm, pmd);
62 if (pmd_none_or_clear_bad(pmd))
63 goto again;
64 err = walk_pte_range(pmd, addr, next, walk);
49 if (err) 65 if (err)
50 break; 66 break;
51 } while (pmd++, addr = next, addr != end); 67 } while (pmd++, addr = next, addr != end);
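
The rewritten walk_pmd_range() hands possibly-huge pmds to the ->pmd_entry callback untouched and only calls split_huge_page_pmd() when a ->pte_entry callback actually needs individual ptes, re-examining the entry afterwards. A toy model of that "split only when needed" walk, with a flat array standing in for the page-table level (all names here are invented for the sketch):

#include <stdio.h>

#define PTES_PER_PMD 4

enum pmd_state { PMD_NONE, PMD_HUGE, PMD_TABLE };

struct pmd {
        enum pmd_state state;
        int pte[PTES_PER_PMD];       /* valid when state == PMD_TABLE */
};

/* Replace a huge mapping by an equivalent table of small entries. */
static void split_huge(struct pmd *pmd)
{
        for (int i = 0; i < PTES_PER_PMD; i++)
                pmd->pte[i] = 1;
        pmd->state = PMD_TABLE;
}

typedef void (*pmd_cb)(int idx, struct pmd *pmd);
typedef void (*pte_cb)(int idx, int pte_idx);

/*
 * Model of the reworked walk: huge entries reach the pmd callback as-is
 * and are only split when a pte callback actually needs to see ptes.
 */
static void walk(struct pmd *pmds, int n, pmd_cb on_pmd, pte_cb on_pte)
{
        for (int i = 0; i < n; i++) {
                struct pmd *pmd = &pmds[i];

                if (pmd->state == PMD_NONE)
                        continue;            /* a hole: nothing to visit */
                if (on_pmd)
                        on_pmd(i, pmd);      /* may see a huge entry */
                if (!on_pte)
                        continue;            /* no need to break it down */
                if (pmd->state == PMD_HUGE)
                        split_huge(pmd);     /* only now pay for the split */
                for (int j = 0; j < PTES_PER_PMD; j++)
                        if (pmd->pte[j])
                                on_pte(i, j);
        }
}

static void show_pmd(int i, struct pmd *p)
{
        printf("pmd %d is %s\n", i, p->state == PMD_HUGE ? "huge" : "a table");
}

static void show_pte(int i, int j)
{
        printf("  pte %d.%d present\n", i, j);
}

int main(void)
{
        struct pmd pmds[2] = { { .state = PMD_HUGE }, { .state = PMD_NONE } };

        walk(pmds, 2, show_pmd, NULL);       /* huge entry left intact */
        walk(pmds, 2, show_pmd, show_pte);   /* now it gets split */
        return 0;
}
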
diff --git a/mm/percpu.c b/mm/percpu.c
index 3f930018aa60..a160db39b810 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -342,7 +342,7 @@ static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot)
342 * @chunk: chunk of interest 342 * @chunk: chunk of interest
343 * 343 *
344 * Determine whether area map of @chunk needs to be extended to 344 * Determine whether area map of @chunk needs to be extended to
345 * accomodate a new allocation. 345 * accommodate a new allocation.
346 * 346 *
347 * CONTEXT: 347 * CONTEXT:
348 * pcpu_lock. 348 * pcpu_lock.
@@ -431,7 +431,7 @@ out_unlock:
431 * depending on @head, is reduced by @tail bytes and @tail byte block 431 * depending on @head, is reduced by @tail bytes and @tail byte block
432 * is inserted after the target block. 432 * is inserted after the target block.
433 * 433 *
434 * @chunk->map must have enough free slots to accomodate the split. 434 * @chunk->map must have enough free slots to accommodate the split.
435 * 435 *
436 * CONTEXT: 436 * CONTEXT:
437 * pcpu_lock. 437 * pcpu_lock.
@@ -1008,8 +1008,7 @@ phys_addr_t per_cpu_ptr_to_phys(void *addr)
1008 } 1008 }
1009 1009
1010 if (in_first_chunk) { 1010 if (in_first_chunk) {
1011 if ((unsigned long)addr < VMALLOC_START || 1011 if (!is_vmalloc_addr(addr))
1012 (unsigned long)addr >= VMALLOC_END)
1013 return __pa(addr); 1012 return __pa(addr);
1014 else 1013 else
1015 return page_to_phys(vmalloc_to_page(addr)); 1014 return page_to_phys(vmalloc_to_page(addr));
@@ -1436,7 +1435,7 @@ static struct pcpu_alloc_info * __init pcpu_build_alloc_info(
1436 /* 1435 /*
1437 * Determine min_unit_size, alloc_size and max_upa such that 1436 * Determine min_unit_size, alloc_size and max_upa such that
1438 * alloc_size is multiple of atom_size and is the smallest 1437 * alloc_size is multiple of atom_size and is the smallest
1439 * which can accomodate 4k aligned segments which are equal to 1438 * which can accommodate 4k aligned segments which are equal to
1440 * or larger than min_unit_size. 1439 * or larger than min_unit_size.
1441 */ 1440 */
1442 min_unit_size = max_t(size_t, size_sum, PCPU_MIN_UNIT_SIZE); 1441 min_unit_size = max_t(size_t, size_sum, PCPU_MIN_UNIT_SIZE);
@@ -1551,7 +1550,7 @@ static struct pcpu_alloc_info * __init pcpu_build_alloc_info(
1551 * @atom_size: allocation atom size 1550 * @atom_size: allocation atom size
1552 * @cpu_distance_fn: callback to determine distance between cpus, optional 1551 * @cpu_distance_fn: callback to determine distance between cpus, optional
1553 * @alloc_fn: function to allocate percpu page 1552 * @alloc_fn: function to allocate percpu page
1554 * @free_fn: funtion to free percpu page 1553 * @free_fn: function to free percpu page
1555 * 1554 *
1556 * This is a helper to ease setting up embedded first percpu chunk and 1555 * This is a helper to ease setting up embedded first percpu chunk and
1557 * can be called where pcpu_setup_first_chunk() is expected. 1556 * can be called where pcpu_setup_first_chunk() is expected.
@@ -1679,7 +1678,7 @@ out_free:
1679 * pcpu_page_first_chunk - map the first chunk using PAGE_SIZE pages 1678 * pcpu_page_first_chunk - map the first chunk using PAGE_SIZE pages
1680 * @reserved_size: the size of reserved percpu area in bytes 1679 * @reserved_size: the size of reserved percpu area in bytes
1681 * @alloc_fn: function to allocate percpu page, always called with PAGE_SIZE 1680 * @alloc_fn: function to allocate percpu page, always called with PAGE_SIZE
1682 * @free_fn: funtion to free percpu page, always called with PAGE_SIZE 1681 * @free_fn: function to free percpu page, always called with PAGE_SIZE
1683 * @populate_pte_fn: function to populate pte 1682 * @populate_pte_fn: function to populate pte
1684 * 1683 *
1685 * This is a helper to ease setting up page-remapped first percpu 1684 * This is a helper to ease setting up page-remapped first percpu
diff --git a/mm/readahead.c b/mm/readahead.c
index 77506a291a2d..2c0cc489e288 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -109,9 +109,12 @@ EXPORT_SYMBOL(read_cache_pages);
109static int read_pages(struct address_space *mapping, struct file *filp, 109static int read_pages(struct address_space *mapping, struct file *filp,
110 struct list_head *pages, unsigned nr_pages) 110 struct list_head *pages, unsigned nr_pages)
111{ 111{
112 struct blk_plug plug;
112 unsigned page_idx; 113 unsigned page_idx;
113 int ret; 114 int ret;
114 115
116 blk_start_plug(&plug);
117
115 if (mapping->a_ops->readpages) { 118 if (mapping->a_ops->readpages) {
116 ret = mapping->a_ops->readpages(filp, mapping, pages, nr_pages); 119 ret = mapping->a_ops->readpages(filp, mapping, pages, nr_pages);
117 /* Clean up the remaining pages */ 120 /* Clean up the remaining pages */
@@ -129,7 +132,10 @@ static int read_pages(struct address_space *mapping, struct file *filp,
129 page_cache_release(page); 132 page_cache_release(page);
130 } 133 }
131 ret = 0; 134 ret = 0;
135
132out: 136out:
137 blk_finish_plug(&plug);
138
133 return ret; 139 return ret;
134} 140}
135 141
@@ -554,17 +560,5 @@ page_cache_async_readahead(struct address_space *mapping,
554 560
555 /* do read-ahead */ 561 /* do read-ahead */
556 ondemand_readahead(mapping, ra, filp, true, offset, req_size); 562 ondemand_readahead(mapping, ra, filp, true, offset, req_size);
557
558#ifdef CONFIG_BLOCK
559 /*
560 * Normally the current page is !uptodate and lock_page() will be
561 * immediately called to implicitly unplug the device. However this
562 * is not always true for RAID conifgurations, where data arrives
563 * not strictly in their submission order. In this case we need to
564 * explicitly kick off the IO.
565 */
566 if (PageUptodate(page))
567 blk_run_backing_dev(mapping->backing_dev_info, NULL);
568#endif
569} 563}
570EXPORT_SYMBOL_GPL(page_cache_async_readahead); 564EXPORT_SYMBOL_GPL(page_cache_async_readahead);
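
Both this readahead hunk and the generic_writepages() change above replace the old explicit-unplug calls with an on-stack plug: requests issued while the plug is held are queued and submitted to the device in one batch when the plug is finished. A userspace sketch of the batching idea follows; struct plug here is a stand-in, not the block-layer type.

#include <stdio.h>

#define PLUG_MAX 16

/*
 * Userspace model of on-stack request plugging: submissions made while
 * the plug is active are queued and dispatched together on finish.
 */
struct plug {
        int queued[PLUG_MAX];
        int nr;
};

static void start_plug(struct plug *plug)
{
        plug->nr = 0;
}

static void submit(struct plug *plug, int request)
{
        if (plug && plug->nr < PLUG_MAX) {
                plug->queued[plug->nr++] = request;   /* batch it */
                return;
        }
        printf("dispatch request %d immediately\n", request);
}

static void finish_plug(struct plug *plug)
{
        /* One pass to the "device" for everything that was queued. */
        for (int i = 0; i < plug->nr; i++)
                printf("dispatch queued request %d\n", plug->queued[i]);
        plug->nr = 0;
}

int main(void)
{
        struct plug plug;

        start_plug(&plug);
        for (int i = 0; i < 4; i++)
                submit(&plug, i);      /* e.g. one readpage per page */
        finish_plug(&plug);            /* single flush, as in read_pages() */
        return 0;
}
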
diff --git a/mm/rmap.c b/mm/rmap.c
index 941bf82e8961..8da044a1db0f 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -31,11 +31,12 @@
31 * swap_lock (in swap_duplicate, swap_info_get) 31 * swap_lock (in swap_duplicate, swap_info_get)
32 * mmlist_lock (in mmput, drain_mmlist and others) 32 * mmlist_lock (in mmput, drain_mmlist and others)
33 * mapping->private_lock (in __set_page_dirty_buffers) 33 * mapping->private_lock (in __set_page_dirty_buffers)
34 * inode_lock (in set_page_dirty's __mark_inode_dirty) 34 * inode->i_lock (in set_page_dirty's __mark_inode_dirty)
35 * inode_wb_list_lock (in set_page_dirty's __mark_inode_dirty)
35 * sb_lock (within inode_lock in fs/fs-writeback.c) 36 * sb_lock (within inode_lock in fs/fs-writeback.c)
36 * mapping->tree_lock (widely used, in set_page_dirty, 37 * mapping->tree_lock (widely used, in set_page_dirty,
37 * in arch-dependent flush_dcache_mmap_lock, 38 * in arch-dependent flush_dcache_mmap_lock,
38 * within inode_lock in __sync_single_inode) 39 * within inode_wb_list_lock in __sync_single_inode)
39 * 40 *
40 * (code doesn't rely on that order so it could be switched around) 41 * (code doesn't rely on that order so it could be switched around)
41 * ->tasklist_lock 42 * ->tasklist_lock
@@ -67,11 +68,24 @@ static struct kmem_cache *anon_vma_chain_cachep;
67 68
68static inline struct anon_vma *anon_vma_alloc(void) 69static inline struct anon_vma *anon_vma_alloc(void)
69{ 70{
70 return kmem_cache_alloc(anon_vma_cachep, GFP_KERNEL); 71 struct anon_vma *anon_vma;
72
73 anon_vma = kmem_cache_alloc(anon_vma_cachep, GFP_KERNEL);
74 if (anon_vma) {
75 atomic_set(&anon_vma->refcount, 1);
76 /*
77 * Initialise the anon_vma root to point to itself. If called
78 * from fork, the root will be reset to the parents anon_vma.
79 */
80 anon_vma->root = anon_vma;
81 }
82
83 return anon_vma;
71} 84}
72 85
73void anon_vma_free(struct anon_vma *anon_vma) 86static inline void anon_vma_free(struct anon_vma *anon_vma)
74{ 87{
88 VM_BUG_ON(atomic_read(&anon_vma->refcount));
75 kmem_cache_free(anon_vma_cachep, anon_vma); 89 kmem_cache_free(anon_vma_cachep, anon_vma);
76} 90}
77 91
@@ -133,11 +147,6 @@ int anon_vma_prepare(struct vm_area_struct *vma)
133 if (unlikely(!anon_vma)) 147 if (unlikely(!anon_vma))
134 goto out_enomem_free_avc; 148 goto out_enomem_free_avc;
135 allocated = anon_vma; 149 allocated = anon_vma;
136 /*
137 * This VMA had no anon_vma yet. This anon_vma is
138 * the root of any anon_vma tree that might form.
139 */
140 anon_vma->root = anon_vma;
141 } 150 }
142 151
143 anon_vma_lock(anon_vma); 152 anon_vma_lock(anon_vma);
@@ -156,7 +165,7 @@ int anon_vma_prepare(struct vm_area_struct *vma)
156 anon_vma_unlock(anon_vma); 165 anon_vma_unlock(anon_vma);
157 166
158 if (unlikely(allocated)) 167 if (unlikely(allocated))
159 anon_vma_free(allocated); 168 put_anon_vma(allocated);
160 if (unlikely(avc)) 169 if (unlikely(avc))
161 anon_vma_chain_free(avc); 170 anon_vma_chain_free(avc);
162 } 171 }
@@ -241,9 +250,9 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
241 */ 250 */
242 anon_vma->root = pvma->anon_vma->root; 251 anon_vma->root = pvma->anon_vma->root;
243 /* 252 /*
244 * With KSM refcounts, an anon_vma can stay around longer than the 253 * With refcounts, an anon_vma can stay around longer than the
245 * process it belongs to. The root anon_vma needs to be pinned 254 * process it belongs to. The root anon_vma needs to be pinned until
246 * until this anon_vma is freed, because the lock lives in the root. 255 * this anon_vma is freed, because the lock lives in the root.
247 */ 256 */
248 get_anon_vma(anon_vma->root); 257 get_anon_vma(anon_vma->root);
249 /* Mark this anon_vma as the one where our new (COWed) pages go. */ 258 /* Mark this anon_vma as the one where our new (COWed) pages go. */
@@ -253,7 +262,7 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
253 return 0; 262 return 0;
254 263
255 out_error_free_anon_vma: 264 out_error_free_anon_vma:
256 anon_vma_free(anon_vma); 265 put_anon_vma(anon_vma);
257 out_error: 266 out_error:
258 unlink_anon_vmas(vma); 267 unlink_anon_vmas(vma);
259 return -ENOMEM; 268 return -ENOMEM;
@@ -272,15 +281,11 @@ static void anon_vma_unlink(struct anon_vma_chain *anon_vma_chain)
272 list_del(&anon_vma_chain->same_anon_vma); 281 list_del(&anon_vma_chain->same_anon_vma);
273 282
274 /* We must garbage collect the anon_vma if it's empty */ 283 /* We must garbage collect the anon_vma if it's empty */
275 empty = list_empty(&anon_vma->head) && !anonvma_external_refcount(anon_vma); 284 empty = list_empty(&anon_vma->head);
276 anon_vma_unlock(anon_vma); 285 anon_vma_unlock(anon_vma);
277 286
278 if (empty) { 287 if (empty)
279 /* We no longer need the root anon_vma */ 288 put_anon_vma(anon_vma);
280 if (anon_vma->root != anon_vma)
281 drop_anon_vma(anon_vma->root);
282 anon_vma_free(anon_vma);
283 }
284} 289}
285 290
286void unlink_anon_vmas(struct vm_area_struct *vma) 291void unlink_anon_vmas(struct vm_area_struct *vma)
@@ -303,7 +308,7 @@ static void anon_vma_ctor(void *data)
303 struct anon_vma *anon_vma = data; 308 struct anon_vma *anon_vma = data;
304 309
305 spin_lock_init(&anon_vma->lock); 310 spin_lock_init(&anon_vma->lock);
306 anonvma_external_refcount_init(anon_vma); 311 atomic_set(&anon_vma->refcount, 0);
307 INIT_LIST_HEAD(&anon_vma->head); 312 INIT_LIST_HEAD(&anon_vma->head);
308} 313}
309 314
@@ -1486,41 +1491,15 @@ int try_to_munlock(struct page *page)
1486 return try_to_unmap_file(page, TTU_MUNLOCK); 1491 return try_to_unmap_file(page, TTU_MUNLOCK);
1487} 1492}
1488 1493
1489#if defined(CONFIG_KSM) || defined(CONFIG_MIGRATION) 1494void __put_anon_vma(struct anon_vma *anon_vma)
1490/*
1491 * Drop an anon_vma refcount, freeing the anon_vma and anon_vma->root
1492 * if necessary. Be careful to do all the tests under the lock. Once
1493 * we know we are the last user, nobody else can get a reference and we
1494 * can do the freeing without the lock.
1495 */
1496void drop_anon_vma(struct anon_vma *anon_vma)
1497{ 1495{
1498 BUG_ON(atomic_read(&anon_vma->external_refcount) <= 0); 1496 struct anon_vma *root = anon_vma->root;
1499 if (atomic_dec_and_lock(&anon_vma->external_refcount, &anon_vma->root->lock)) {
1500 struct anon_vma *root = anon_vma->root;
1501 int empty = list_empty(&anon_vma->head);
1502 int last_root_user = 0;
1503 int root_empty = 0;
1504 1497
1505 /* 1498 if (root != anon_vma && atomic_dec_and_test(&root->refcount))
1506 * The refcount on a non-root anon_vma got dropped. Drop 1499 anon_vma_free(root);
1507 * the refcount on the root and check if we need to free it.
1508 */
1509 if (empty && anon_vma != root) {
1510 BUG_ON(atomic_read(&root->external_refcount) <= 0);
1511 last_root_user = atomic_dec_and_test(&root->external_refcount);
1512 root_empty = list_empty(&root->head);
1513 }
1514 anon_vma_unlock(anon_vma);
1515 1500
1516 if (empty) { 1501 anon_vma_free(anon_vma);
1517 anon_vma_free(anon_vma);
1518 if (root_empty && last_root_user)
1519 anon_vma_free(root);
1520 }
1521 }
1522} 1502}
1523#endif
1524 1503
1525#ifdef CONFIG_MIGRATION 1504#ifdef CONFIG_MIGRATION
1526/* 1505/*
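
The rmap.c changes collapse the separate KSM/migration external refcount into a single anon_vma refcount: an anon_vma is born with one reference, a non-root anon_vma pins its root, and the final put releases the root as well once nothing else holds it. A compact userspace model of that lifetime using C11 atomics; the avma_* names are invented for the sketch and the kernel additionally serializes freeing under the root lock.

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct avma {
        atomic_int refcount;
        struct avma *root;
        const char *name;
};

static struct avma *avma_alloc(const char *name, struct avma *root)
{
        struct avma *a = malloc(sizeof(*a));

        if (!a)
                return NULL;
        atomic_init(&a->refcount, 1);       /* born with one reference */
        a->name = name;
        a->root = root ? root : a;          /* a root points to itself */
        if (root)
                atomic_fetch_add(&root->refcount, 1);   /* pin the root */
        return a;
}

static void avma_free(struct avma *a)
{
        printf("freeing %s\n", a->name);
        free(a);
}

static void avma_put(struct avma *a)
{
        if (atomic_fetch_sub(&a->refcount, 1) != 1)
                return;                     /* someone else still holds it */

        /* Last reference: release the pin on the root first. */
        if (a->root != a && atomic_fetch_sub(&a->root->refcount, 1) == 1)
                avma_free(a->root);
        avma_free(a);
}

int main(void)
{
        struct avma *root = avma_alloc("root", NULL);
        struct avma *child = avma_alloc("child", root);

        avma_put(root);    /* root stays: the child still pins it */
        avma_put(child);   /* frees child, then the root */
        return 0;
}
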
diff --git a/mm/shmem.c b/mm/shmem.c
index 048a95a5244d..8fa27e4e582a 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -224,7 +224,6 @@ static const struct vm_operations_struct shmem_vm_ops;
224static struct backing_dev_info shmem_backing_dev_info __read_mostly = { 224static struct backing_dev_info shmem_backing_dev_info __read_mostly = {
225 .ra_pages = 0, /* No readahead */ 225 .ra_pages = 0, /* No readahead */
226 .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_SWAP_BACKED, 226 .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_SWAP_BACKED,
227 .unplug_io_fn = default_unplug_io_fn,
228}; 227};
229 228
230static LIST_HEAD(shmem_swaplist); 229static LIST_HEAD(shmem_swaplist);
@@ -422,7 +421,8 @@ static swp_entry_t *shmem_swp_alloc(struct shmem_inode_info *info, unsigned long
422 * a waste to allocate index if we cannot allocate data. 421 * a waste to allocate index if we cannot allocate data.
423 */ 422 */
424 if (sbinfo->max_blocks) { 423 if (sbinfo->max_blocks) {
425 if (percpu_counter_compare(&sbinfo->used_blocks, (sbinfo->max_blocks - 1)) > 0) 424 if (percpu_counter_compare(&sbinfo->used_blocks,
425 sbinfo->max_blocks - 1) >= 0)
426 return ERR_PTR(-ENOSPC); 426 return ERR_PTR(-ENOSPC);
427 percpu_counter_inc(&sbinfo->used_blocks); 427 percpu_counter_inc(&sbinfo->used_blocks);
428 spin_lock(&inode->i_lock); 428 spin_lock(&inode->i_lock);
@@ -1081,7 +1081,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
1081 shmem_recalc_inode(inode); 1081 shmem_recalc_inode(inode);
1082 1082
1083 if (swap.val && add_to_swap_cache(page, swap, GFP_ATOMIC) == 0) { 1083 if (swap.val && add_to_swap_cache(page, swap, GFP_ATOMIC) == 0) {
1084 remove_from_page_cache(page); 1084 delete_from_page_cache(page);
1085 shmem_swp_set(info, entry, swap.val); 1085 shmem_swp_set(info, entry, swap.val);
1086 shmem_swp_unmap(entry); 1086 shmem_swp_unmap(entry);
1087 if (list_empty(&info->swaplist)) 1087 if (list_empty(&info->swaplist))
@@ -1091,7 +1091,6 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
1091 spin_unlock(&info->lock); 1091 spin_unlock(&info->lock);
1092 swap_shmem_alloc(swap); 1092 swap_shmem_alloc(swap);
1093 BUG_ON(page_mapped(page)); 1093 BUG_ON(page_mapped(page));
1094 page_cache_release(page); /* pagecache ref */
1095 swap_writepage(page, wbc); 1094 swap_writepage(page, wbc);
1096 if (inode) { 1095 if (inode) {
1097 mutex_lock(&shmem_swaplist_mutex); 1096 mutex_lock(&shmem_swaplist_mutex);
@@ -1399,7 +1398,8 @@ repeat:
1399 shmem_swp_unmap(entry); 1398 shmem_swp_unmap(entry);
1400 sbinfo = SHMEM_SB(inode->i_sb); 1399 sbinfo = SHMEM_SB(inode->i_sb);
1401 if (sbinfo->max_blocks) { 1400 if (sbinfo->max_blocks) {
1402 if ((percpu_counter_compare(&sbinfo->used_blocks, sbinfo->max_blocks) > 0) || 1401 if (percpu_counter_compare(&sbinfo->used_blocks,
1402 sbinfo->max_blocks) >= 0 ||
1403 shmem_acct_block(info->flags)) { 1403 shmem_acct_block(info->flags)) {
1404 spin_unlock(&info->lock); 1404 spin_unlock(&info->lock);
1405 error = -ENOSPC; 1405 error = -ENOSPC;
@@ -2794,5 +2794,6 @@ int shmem_zero_setup(struct vm_area_struct *vma)
2794 fput(vma->vm_file); 2794 fput(vma->vm_file);
2795 vma->vm_file = file; 2795 vma->vm_file = file;
2796 vma->vm_ops = &shmem_vm_ops; 2796 vma->vm_ops = &shmem_vm_ops;
2797 vma->vm_flags |= VM_CAN_NONLINEAR;
2797 return 0; 2798 return 0;
2798} 2799}
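
The shmem.c comparison change closes an off-by-one: with the old strict ">" test, the used-blocks counter could pass the check at the limit and be incremented one block beyond it, while comparing with ">=" refuses the allocation exactly at the limit. A tiny illustration with a plain counter in place of the percpu counter (the function names are invented for the sketch):

#include <stdio.h>

static int try_alloc_strict(long *used, long max)
{
        if (*used - max > 0)     /* old style: rejects only above the limit */
                return -1;
        (*used)++;
        return 0;
}

static int try_alloc_fixed(long *used, long max)
{
        if (*used - max >= 0)    /* new style: rejects at the limit */
                return -1;
        (*used)++;
        return 0;
}

int main(void)
{
        long used, max = 4;

        used = 0;
        while (try_alloc_strict(&used, max) == 0)
                ;
        printf("strict compare: ended with %ld/%ld blocks\n", used, max);

        used = 0;
        while (try_alloc_fixed(&used, max) == 0)
                ;
        printf("fixed compare : ended with %ld/%ld blocks\n", used, max);
        return 0;
}
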
diff --git a/mm/slab.c b/mm/slab.c
index 37961d1f584f..46a9c163a92f 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -191,22 +191,6 @@ typedef unsigned int kmem_bufctl_t;
191#define SLAB_LIMIT (((kmem_bufctl_t)(~0U))-3) 191#define SLAB_LIMIT (((kmem_bufctl_t)(~0U))-3)
192 192
193/* 193/*
194 * struct slab
195 *
196 * Manages the objs in a slab. Placed either at the beginning of mem allocated
197 * for a slab, or allocated from an general cache.
198 * Slabs are chained into three list: fully used, partial, fully free slabs.
199 */
200struct slab {
201 struct list_head list;
202 unsigned long colouroff;
203 void *s_mem; /* including colour offset */
204 unsigned int inuse; /* num of objs active in slab */
205 kmem_bufctl_t free;
206 unsigned short nodeid;
207};
208
209/*
210 * struct slab_rcu 194 * struct slab_rcu
211 * 195 *
212 * slab_destroy on a SLAB_DESTROY_BY_RCU cache uses this structure to 196 * slab_destroy on a SLAB_DESTROY_BY_RCU cache uses this structure to
@@ -219,8 +203,6 @@ struct slab {
219 * 203 *
220 * rcu_read_lock before reading the address, then rcu_read_unlock after 204 * rcu_read_lock before reading the address, then rcu_read_unlock after
221 * taking the spinlock within the structure expected at that address. 205 * taking the spinlock within the structure expected at that address.
222 *
223 * We assume struct slab_rcu can overlay struct slab when destroying.
224 */ 206 */
225struct slab_rcu { 207struct slab_rcu {
226 struct rcu_head head; 208 struct rcu_head head;
@@ -229,6 +211,27 @@ struct slab_rcu {
229}; 211};
230 212
231/* 213/*
214 * struct slab
215 *
216 * Manages the objs in a slab. Placed either at the beginning of mem allocated
217 * for a slab, or allocated from an general cache.
218 * Slabs are chained into three list: fully used, partial, fully free slabs.
219 */
220struct slab {
221 union {
222 struct {
223 struct list_head list;
224 unsigned long colouroff;
225 void *s_mem; /* including colour offset */
226 unsigned int inuse; /* num of objs active in slab */
227 kmem_bufctl_t free;
228 unsigned short nodeid;
229 };
230 struct slab_rcu __slab_cover_slab_rcu;
231 };
232};
233
234/*
232 * struct array_cache 235 * struct array_cache
233 * 236 *
234 * Purpose: 237 * Purpose:
@@ -875,7 +878,7 @@ static struct array_cache *alloc_arraycache(int node, int entries,
875 nc = kmalloc_node(memsize, gfp, node); 878 nc = kmalloc_node(memsize, gfp, node);
876 /* 879 /*
877 * The array_cache structures contain pointers to free object. 880 * The array_cache structures contain pointers to free object.
878 * However, when such objects are allocated or transfered to another 881 * However, when such objects are allocated or transferred to another
879 * cache the pointers are not cleared and they could be counted as 882 * cache the pointers are not cleared and they could be counted as
880 * valid references during a kmemleak scan. Therefore, kmemleak must 883 * valid references during a kmemleak scan. Therefore, kmemleak must
881 * not scan such objects. 884 * not scan such objects.
@@ -1387,7 +1390,7 @@ static int __meminit slab_memory_callback(struct notifier_block *self,
1387 break; 1390 break;
1388 } 1391 }
1389out: 1392out:
1390 return ret ? notifier_from_errno(ret) : NOTIFY_OK; 1393 return notifier_from_errno(ret);
1391} 1394}
1392#endif /* CONFIG_NUMA && CONFIG_MEMORY_HOTPLUG */ 1395#endif /* CONFIG_NUMA && CONFIG_MEMORY_HOTPLUG */
1393 1396
@@ -2147,8 +2150,6 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp)
2147 * 2150 *
2148 * @name must be valid until the cache is destroyed. This implies that 2151 * @name must be valid until the cache is destroyed. This implies that
2149 * the module calling this has to destroy the cache before getting unloaded. 2152 * the module calling this has to destroy the cache before getting unloaded.
2150 * Note that kmem_cache_name() is not guaranteed to return the same pointer,
2151 * therefore applications must manage it themselves.
2152 * 2153 *
2153 * The flags are 2154 * The flags are
2154 * 2155 *
@@ -2288,8 +2289,8 @@ kmem_cache_create (const char *name, size_t size, size_t align,
2288 if (ralign < align) { 2289 if (ralign < align) {
2289 ralign = align; 2290 ralign = align;
2290 } 2291 }
2291 /* disable debug if not aligning with REDZONE_ALIGN */ 2292 /* disable debug if necessary */
2292 if (ralign & (__alignof__(unsigned long long) - 1)) 2293 if (ralign > __alignof__(unsigned long long))
2293 flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER); 2294 flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
2294 /* 2295 /*
2295 * 4) Store it. 2296 * 4) Store it.
@@ -2315,8 +2316,8 @@ kmem_cache_create (const char *name, size_t size, size_t align,
2315 */ 2316 */
2316 if (flags & SLAB_RED_ZONE) { 2317 if (flags & SLAB_RED_ZONE) {
2317 /* add space for red zone words */ 2318 /* add space for red zone words */
2318 cachep->obj_offset += align; 2319 cachep->obj_offset += sizeof(unsigned long long);
2319 size += align + sizeof(unsigned long long); 2320 size += 2 * sizeof(unsigned long long);
2320 } 2321 }
2321 if (flags & SLAB_STORE_USER) { 2322 if (flags & SLAB_STORE_USER) {
2322 /* user store requires one word storage behind the end of 2323 /* user store requires one word storage behind the end of
@@ -2605,7 +2606,7 @@ EXPORT_SYMBOL(kmem_cache_shrink);
2605 * 2606 *
2606 * The cache must be empty before calling this function. 2607 * The cache must be empty before calling this function.
2607 * 2608 *
2608 * The caller must guarantee that noone will allocate memory from the cache 2609 * The caller must guarantee that no one will allocate memory from the cache
2609 * during the kmem_cache_destroy(). 2610 * during the kmem_cache_destroy().
2610 */ 2611 */
2611void kmem_cache_destroy(struct kmem_cache *cachep) 2612void kmem_cache_destroy(struct kmem_cache *cachep)
@@ -3840,12 +3841,6 @@ unsigned int kmem_cache_size(struct kmem_cache *cachep)
3840} 3841}
3841EXPORT_SYMBOL(kmem_cache_size); 3842EXPORT_SYMBOL(kmem_cache_size);
3842 3843
3843const char *kmem_cache_name(struct kmem_cache *cachep)
3844{
3845 return cachep->name;
3846}
3847EXPORT_SYMBOL_GPL(kmem_cache_name);
3848
3849/* 3844/*
3850 * This initializes kmem_list3 or resizes various caches for all nodes. 3845 * This initializes kmem_list3 or resizes various caches for all nodes.
3851 */ 3846 */
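
slab.c now expresses the long-standing assumption that struct slab_rcu can overlay struct slab by putting both layouts in one anonymous union, so the compiler sizes the object for whichever use is larger instead of relying on a comment. A self-contained illustration of the same pattern, with invented stand-in structs rather than the kernel ones (build with -std=c11 or later):

#include <stdio.h>

struct mgmt {               /* normal slab-style management data */
        void *list_next, *list_prev;
        unsigned long colouroff;
        void *s_mem;
        unsigned int inuse;
};

struct rcu_part {           /* what a deferred-free path would need */
        void (*func)(void *);
        void *cache;
        void *addr;
};

/*
 * The union makes the overlay explicit: the descriptor is large enough
 * for both uses by construction, no "we assume it fits" comment needed.
 */
struct descriptor {
        union {
                struct mgmt mgmt;
                struct rcu_part rcu;
        };
};

int main(void)
{
        printf("mgmt %zu bytes, rcu %zu bytes, descriptor %zu bytes\n",
               sizeof(struct mgmt), sizeof(struct rcu_part),
               sizeof(struct descriptor));
        /* A union is always at least as large as its biggest member. */
        return sizeof(struct descriptor) >= sizeof(struct rcu_part) ? 0 : 1;
}
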
diff --git a/mm/slob.c b/mm/slob.c
index 3588eaaef726..46e0aee33a23 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -666,12 +666,6 @@ unsigned int kmem_cache_size(struct kmem_cache *c)
666} 666}
667EXPORT_SYMBOL(kmem_cache_size); 667EXPORT_SYMBOL(kmem_cache_size);
668 668
669const char *kmem_cache_name(struct kmem_cache *c)
670{
671 return c->name;
672}
673EXPORT_SYMBOL(kmem_cache_name);
674
675int kmem_cache_shrink(struct kmem_cache *d) 669int kmem_cache_shrink(struct kmem_cache *d)
676{ 670{
677 return 0; 671 return 0;
diff --git a/mm/slub.c b/mm/slub.c
index e15aa7f193c9..94d2a33a866e 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -64,7 +64,7 @@
64 * we must stay away from it for a while since we may cause a bouncing 64 * we must stay away from it for a while since we may cause a bouncing
65 * cacheline if we try to acquire the lock. So go onto the next slab. 65 * cacheline if we try to acquire the lock. So go onto the next slab.
66 * If all pages are busy then we may allocate a new slab instead of reusing 66 * If all pages are busy then we may allocate a new slab instead of reusing
67 * a partial slab. A new slab has noone operating on it and thus there is 67 * a partial slab. A new slab has no one operating on it and thus there is
68 * no danger of cacheline contention. 68 * no danger of cacheline contention.
69 * 69 *
70 * Interrupts are disabled during allocation and deallocation in order to 70 * Interrupts are disabled during allocation and deallocation in order to
@@ -217,7 +217,7 @@ static inline void sysfs_slab_remove(struct kmem_cache *s)
217 217
218#endif 218#endif
219 219
220static inline void stat(struct kmem_cache *s, enum stat_item si) 220static inline void stat(const struct kmem_cache *s, enum stat_item si)
221{ 221{
222#ifdef CONFIG_SLUB_STATS 222#ifdef CONFIG_SLUB_STATS
223 __this_cpu_inc(s->cpu_slab->stat[si]); 223 __this_cpu_inc(s->cpu_slab->stat[si]);
@@ -281,11 +281,40 @@ static inline int slab_index(void *p, struct kmem_cache *s, void *addr)
281 return (p - addr) / s->size; 281 return (p - addr) / s->size;
282} 282}
283 283
284static inline size_t slab_ksize(const struct kmem_cache *s)
285{
286#ifdef CONFIG_SLUB_DEBUG
287 /*
288 * Debugging requires use of the padding between object
289 * and whatever may come after it.
290 */
291 if (s->flags & (SLAB_RED_ZONE | SLAB_POISON))
292 return s->objsize;
293
294#endif
295 /*
296 * If we have the need to store the freelist pointer
297 * back there or track user information then we can
298 * only use the space before that information.
299 */
300 if (s->flags & (SLAB_DESTROY_BY_RCU | SLAB_STORE_USER))
301 return s->inuse;
302 /*
303 * Else we can use all the padding etc for the allocation
304 */
305 return s->size;
306}
307
308static inline int order_objects(int order, unsigned long size, int reserved)
309{
310 return ((PAGE_SIZE << order) - reserved) / size;
311}
312
284static inline struct kmem_cache_order_objects oo_make(int order, 313static inline struct kmem_cache_order_objects oo_make(int order,
285 unsigned long size) 314 unsigned long size, int reserved)
286{ 315{
287 struct kmem_cache_order_objects x = { 316 struct kmem_cache_order_objects x = {
288 (order << OO_SHIFT) + (PAGE_SIZE << order) / size 317 (order << OO_SHIFT) + order_objects(order, size, reserved)
289 }; 318 };
290 319
291 return x; 320 return x;
@@ -617,7 +646,7 @@ static int slab_pad_check(struct kmem_cache *s, struct page *page)
617 return 1; 646 return 1;
618 647
619 start = page_address(page); 648 start = page_address(page);
620 length = (PAGE_SIZE << compound_order(page)); 649 length = (PAGE_SIZE << compound_order(page)) - s->reserved;
621 end = start + length; 650 end = start + length;
622 remainder = length % s->size; 651 remainder = length % s->size;
623 if (!remainder) 652 if (!remainder)
@@ -698,7 +727,7 @@ static int check_slab(struct kmem_cache *s, struct page *page)
698 return 0; 727 return 0;
699 } 728 }
700 729
701 maxobj = (PAGE_SIZE << compound_order(page)) / s->size; 730 maxobj = order_objects(compound_order(page), s->size, s->reserved);
702 if (page->objects > maxobj) { 731 if (page->objects > maxobj) {
703 slab_err(s, page, "objects %u > max %u", 732 slab_err(s, page, "objects %u > max %u",
704 s->name, page->objects, maxobj); 733 s->name, page->objects, maxobj);
@@ -748,7 +777,7 @@ static int on_freelist(struct kmem_cache *s, struct page *page, void *search)
748 nr++; 777 nr++;
749 } 778 }
750 779
751 max_objects = (PAGE_SIZE << compound_order(page)) / s->size; 780 max_objects = order_objects(compound_order(page), s->size, s->reserved);
752 if (max_objects > MAX_OBJS_PER_PAGE) 781 if (max_objects > MAX_OBJS_PER_PAGE)
753 max_objects = MAX_OBJS_PER_PAGE; 782 max_objects = MAX_OBJS_PER_PAGE;
754 783
@@ -800,21 +829,31 @@ static inline int slab_pre_alloc_hook(struct kmem_cache *s, gfp_t flags)
800static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags, void *object) 829static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags, void *object)
801{ 830{
802 flags &= gfp_allowed_mask; 831 flags &= gfp_allowed_mask;
803 kmemcheck_slab_alloc(s, flags, object, s->objsize); 832 kmemcheck_slab_alloc(s, flags, object, slab_ksize(s));
804 kmemleak_alloc_recursive(object, s->objsize, 1, s->flags, flags); 833 kmemleak_alloc_recursive(object, s->objsize, 1, s->flags, flags);
805} 834}
806 835
807static inline void slab_free_hook(struct kmem_cache *s, void *x) 836static inline void slab_free_hook(struct kmem_cache *s, void *x)
808{ 837{
809 kmemleak_free_recursive(x, s->flags); 838 kmemleak_free_recursive(x, s->flags);
810}
811 839
812static inline void slab_free_hook_irq(struct kmem_cache *s, void *object) 840 /*
813{ 841 * Trouble is that we may no longer disable interupts in the fast path
814 kmemcheck_slab_free(s, object, s->objsize); 842 * So in order to make the debug calls that expect irqs to be
815 debug_check_no_locks_freed(object, s->objsize); 843 * disabled we need to disable interrupts temporarily.
844 */
845#if defined(CONFIG_KMEMCHECK) || defined(CONFIG_LOCKDEP)
846 {
847 unsigned long flags;
848
849 local_irq_save(flags);
850 kmemcheck_slab_free(s, x, s->objsize);
851 debug_check_no_locks_freed(x, s->objsize);
852 local_irq_restore(flags);
853 }
854#endif
816 if (!(s->flags & SLAB_DEBUG_OBJECTS)) 855 if (!(s->flags & SLAB_DEBUG_OBJECTS))
817 debug_check_no_obj_freed(object, s->objsize); 856 debug_check_no_obj_freed(x, s->objsize);
818} 857}
819 858
820/* 859/*
@@ -1101,9 +1140,6 @@ static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags,
1101 1140
1102static inline void slab_free_hook(struct kmem_cache *s, void *x) {} 1141static inline void slab_free_hook(struct kmem_cache *s, void *x) {}
1103 1142
1104static inline void slab_free_hook_irq(struct kmem_cache *s,
1105 void *object) {}
1106
1107#endif /* CONFIG_SLUB_DEBUG */ 1143#endif /* CONFIG_SLUB_DEBUG */
1108 1144
1109/* 1145/*
@@ -1249,21 +1285,38 @@ static void __free_slab(struct kmem_cache *s, struct page *page)
1249 __free_pages(page, order); 1285 __free_pages(page, order);
1250} 1286}
1251 1287
1288#define need_reserve_slab_rcu \
1289 (sizeof(((struct page *)NULL)->lru) < sizeof(struct rcu_head))
1290
1252static void rcu_free_slab(struct rcu_head *h) 1291static void rcu_free_slab(struct rcu_head *h)
1253{ 1292{
1254 struct page *page; 1293 struct page *page;
1255 1294
1256 page = container_of((struct list_head *)h, struct page, lru); 1295 if (need_reserve_slab_rcu)
1296 page = virt_to_head_page(h);
1297 else
1298 page = container_of((struct list_head *)h, struct page, lru);
1299
1257 __free_slab(page->slab, page); 1300 __free_slab(page->slab, page);
1258} 1301}
1259 1302
1260static void free_slab(struct kmem_cache *s, struct page *page) 1303static void free_slab(struct kmem_cache *s, struct page *page)
1261{ 1304{
1262 if (unlikely(s->flags & SLAB_DESTROY_BY_RCU)) { 1305 if (unlikely(s->flags & SLAB_DESTROY_BY_RCU)) {
1263 /* 1306 struct rcu_head *head;
1264 * RCU free overloads the RCU head over the LRU 1307
1265 */ 1308 if (need_reserve_slab_rcu) {
1266 struct rcu_head *head = (void *)&page->lru; 1309 int order = compound_order(page);
1310 int offset = (PAGE_SIZE << order) - s->reserved;
1311
1312 VM_BUG_ON(s->reserved != sizeof(*head));
1313 head = page_address(page) + offset;
1314 } else {
1315 /*
1316 * RCU free overloads the RCU head over the LRU
1317 */
1318 head = (void *)&page->lru;
1319 }
1267 1320
1268 call_rcu(head, rcu_free_slab); 1321 call_rcu(head, rcu_free_slab);
1269 } else 1322 } else
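
When struct rcu_head no longer fits in page->lru, these SLUB hunks reserve s->reserved bytes at the very end of the slab and place the rcu_head there; the callback then recovers the page from the head's address. A userspace approximation follows, using a page-aligned buffer and address masking in place of virt_to_head_page(); SLAB_SIZE and struct footer are assumptions made for the sketch.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define SLAB_SIZE 4096   /* one "page" worth of slab memory */

/* Stand-in for struct rcu_head: the deferred-free bookkeeping. */
struct footer {
        void *next;
        void (*func)(void *);
};

int main(void)
{
        /* Page-aligned buffer, as a slab page would be. */
        unsigned char *slab = aligned_alloc(SLAB_SIZE, SLAB_SIZE);
        if (!slab)
                return 1;

        /*
         * Reserve room for the callback data at the very end of the
         * slab, mirroring free_slab()'s
         * offset = (PAGE_SIZE << order) - s->reserved.  Object space
         * shrinks by the same amount, as slab_pad_check() now assumes.
         */
        size_t reserved = sizeof(struct footer);
        struct footer *head = (struct footer *)(slab + SLAB_SIZE - reserved);

        memset(head, 0, sizeof(*head));

        /* The callback can still find the slab by masking the address. */
        unsigned char *recovered =
                (unsigned char *)((uintptr_t)head &
                                  ~(uintptr_t)(SLAB_SIZE - 1));

        printf("slab %p, footer %p, recovered %p\n",
               (void *)slab, (void *)head, (void *)recovered);
        free(slab);
        return recovered == slab ? 0 : 1;
}
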
@@ -1487,6 +1540,78 @@ static void unfreeze_slab(struct kmem_cache *s, struct page *page, int tail)
1487 } 1540 }
1488} 1541}
1489 1542
1543#ifdef CONFIG_CMPXCHG_LOCAL
1544#ifdef CONFIG_PREEMPT
1545/*
1546 * Calculate the next globally unique transaction for disambiguiation
1547 * during cmpxchg. The transactions start with the cpu number and are then
1548 * incremented by CONFIG_NR_CPUS.
1549 */
1550#define TID_STEP roundup_pow_of_two(CONFIG_NR_CPUS)
1551#else
1552/*
1553 * No preemption supported therefore also no need to check for
1554 * different cpus.
1555 */
1556#define TID_STEP 1
1557#endif
1558
1559static inline unsigned long next_tid(unsigned long tid)
1560{
1561 return tid + TID_STEP;
1562}
1563
1564static inline unsigned int tid_to_cpu(unsigned long tid)
1565{
1566 return tid % TID_STEP;
1567}
1568
1569static inline unsigned long tid_to_event(unsigned long tid)
1570{
1571 return tid / TID_STEP;
1572}
1573
1574static inline unsigned int init_tid(int cpu)
1575{
1576 return cpu;
1577}
1578
1579static inline void note_cmpxchg_failure(const char *n,
1580 const struct kmem_cache *s, unsigned long tid)
1581{
1582#ifdef SLUB_DEBUG_CMPXCHG
1583 unsigned long actual_tid = __this_cpu_read(s->cpu_slab->tid);
1584
1585 printk(KERN_INFO "%s %s: cmpxchg redo ", n, s->name);
1586
1587#ifdef CONFIG_PREEMPT
1588 if (tid_to_cpu(tid) != tid_to_cpu(actual_tid))
1589 printk("due to cpu change %d -> %d\n",
1590 tid_to_cpu(tid), tid_to_cpu(actual_tid));
1591 else
1592#endif
1593 if (tid_to_event(tid) != tid_to_event(actual_tid))
1594 printk("due to cpu running other code. Event %ld->%ld\n",
1595 tid_to_event(tid), tid_to_event(actual_tid));
1596 else
1597 printk("for unknown reason: actual=%lx was=%lx target=%lx\n",
1598 actual_tid, tid, next_tid(tid));
1599#endif
1600 stat(s, CMPXCHG_DOUBLE_CPU_FAIL);
1601}
1602
1603#endif
1604
1605void init_kmem_cache_cpus(struct kmem_cache *s)
1606{
1607#ifdef CONFIG_CMPXCHG_LOCAL
1608 int cpu;
1609
1610 for_each_possible_cpu(cpu)
1611 per_cpu_ptr(s->cpu_slab, cpu)->tid = init_tid(cpu);
1612#endif
1613
1614}
1490/* 1615/*
1491 * Remove the cpu slab 1616 * Remove the cpu slab
1492 */ 1617 */
@@ -1518,6 +1643,9 @@ static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
1518 page->inuse--; 1643 page->inuse--;
1519 } 1644 }
1520 c->page = NULL; 1645 c->page = NULL;
1646#ifdef CONFIG_CMPXCHG_LOCAL
1647 c->tid = next_tid(c->tid);
1648#endif
1521 unfreeze_slab(s, page, tail); 1649 unfreeze_slab(s, page, tail);
1522} 1650}
1523 1651
@@ -1652,6 +1780,19 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
1652{ 1780{
1653 void **object; 1781 void **object;
1654 struct page *new; 1782 struct page *new;
1783#ifdef CONFIG_CMPXCHG_LOCAL
1784 unsigned long flags;
1785
1786 local_irq_save(flags);
1787#ifdef CONFIG_PREEMPT
1788 /*
1789 * We may have been preempted and rescheduled on a different
1790 * cpu before disabling interrupts. Need to reload cpu area
1791 * pointer.
1792 */
1793 c = this_cpu_ptr(s->cpu_slab);
1794#endif
1795#endif
1655 1796
1656 /* We handle __GFP_ZERO in the caller */ 1797 /* We handle __GFP_ZERO in the caller */
1657 gfpflags &= ~__GFP_ZERO; 1798 gfpflags &= ~__GFP_ZERO;
@@ -1678,6 +1819,10 @@ load_freelist:
1678 c->node = page_to_nid(c->page); 1819 c->node = page_to_nid(c->page);
1679unlock_out: 1820unlock_out:
1680 slab_unlock(c->page); 1821 slab_unlock(c->page);
1822#ifdef CONFIG_CMPXCHG_LOCAL
1823 c->tid = next_tid(c->tid);
1824 local_irq_restore(flags);
1825#endif
1681 stat(s, ALLOC_SLOWPATH); 1826 stat(s, ALLOC_SLOWPATH);
1682 return object; 1827 return object;
1683 1828
@@ -1713,6 +1858,9 @@ new_slab:
1713 } 1858 }
1714 if (!(gfpflags & __GFP_NOWARN) && printk_ratelimit()) 1859 if (!(gfpflags & __GFP_NOWARN) && printk_ratelimit())
1715 slab_out_of_memory(s, gfpflags, node); 1860 slab_out_of_memory(s, gfpflags, node);
1861#ifdef CONFIG_CMPXCHG_LOCAL
1862 local_irq_restore(flags);
1863#endif
1716 return NULL; 1864 return NULL;
1717debug: 1865debug:
1718 if (!alloc_debug_processing(s, c->page, object, addr)) 1866 if (!alloc_debug_processing(s, c->page, object, addr))
@@ -1739,23 +1887,76 @@ static __always_inline void *slab_alloc(struct kmem_cache *s,
1739{ 1887{
1740 void **object; 1888 void **object;
1741 struct kmem_cache_cpu *c; 1889 struct kmem_cache_cpu *c;
1890#ifdef CONFIG_CMPXCHG_LOCAL
1891 unsigned long tid;
1892#else
1742 unsigned long flags; 1893 unsigned long flags;
1894#endif
1743 1895
1744 if (slab_pre_alloc_hook(s, gfpflags)) 1896 if (slab_pre_alloc_hook(s, gfpflags))
1745 return NULL; 1897 return NULL;
1746 1898
1899#ifndef CONFIG_CMPXCHG_LOCAL
1747 local_irq_save(flags); 1900 local_irq_save(flags);
1901#else
1902redo:
1903#endif
1904
1905 /*
1906 * Must read kmem_cache cpu data via this cpu ptr. Preemption is
1907 * enabled. We may switch back and forth between cpus while
1908 * reading from one cpu area. That does not matter as long
1909 * as we end up on the original cpu again when doing the cmpxchg.
1910 */
1748 c = __this_cpu_ptr(s->cpu_slab); 1911 c = __this_cpu_ptr(s->cpu_slab);
1912
1913#ifdef CONFIG_CMPXCHG_LOCAL
1914 /*
1915 * The transaction ids are globally unique per cpu and per operation on
1916 * a per cpu queue. Thus they can be used to guarantee that the cmpxchg_double
1917 * occurs on the right processor and that there was no operation on the
1918 * linked list in between.
1919 */
1920 tid = c->tid;
1921 barrier();
1922#endif
1923
1749 object = c->freelist; 1924 object = c->freelist;
1750 if (unlikely(!object || !node_match(c, node))) 1925 if (unlikely(!object || !node_match(c, node)))
1751 1926
1752 object = __slab_alloc(s, gfpflags, node, addr, c); 1927 object = __slab_alloc(s, gfpflags, node, addr, c);
1753 1928
1754 else { 1929 else {
1930#ifdef CONFIG_CMPXCHG_LOCAL
1931 /*
1932 * The cmpxchg will only match if there was no additional
1933 * operation and if we are on the right processor.
1934 *
1935 * The cmpxchg does the following atomically (without lock semantics!)
1936 * 1. Relocate first pointer to the current per cpu area.
1937 * 2. Verify that tid and freelist have not been changed
1938 * 3. If they were not changed replace tid and freelist
1939 *
1940 * Since this is without lock semantics the protection is only against
1941 * code executing on this cpu *not* from access by other cpus.
1942 */
1943 if (unlikely(!this_cpu_cmpxchg_double(
1944 s->cpu_slab->freelist, s->cpu_slab->tid,
1945 object, tid,
1946 get_freepointer(s, object), next_tid(tid)))) {
1947
1948 note_cmpxchg_failure("slab_alloc", s, tid);
1949 goto redo;
1950 }
1951#else
1755 c->freelist = get_freepointer(s, object); 1952 c->freelist = get_freepointer(s, object);
1953#endif
1756 stat(s, ALLOC_FASTPATH); 1954 stat(s, ALLOC_FASTPATH);
1757 } 1955 }
1956
1957#ifndef CONFIG_CMPXCHG_LOCAL
1758 local_irq_restore(flags); 1958 local_irq_restore(flags);
1959#endif
1759 1960
1760 if (unlikely(gfpflags & __GFP_ZERO) && object) 1961 if (unlikely(gfpflags & __GFP_ZERO) && object)
1761 memset(object, 0, s->objsize); 1962 memset(object, 0, s->objsize);
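
The slab_alloc() fastpath above snapshots the per-cpu freelist head together with the tid, computes the next state, and publishes it only if both values are still unchanged, retrying from the redo: label otherwise. Below is a compressed user-space model of that retry loop. The real code relies on this_cpu_cmpxchg_double() with no lock; the model emulates the double-width compare-and-swap with a mutex purely to stay portable, and the names and structure are illustrative only.

    #include <pthread.h>
    #include <stdbool.h>
    #include <stdio.h>

    struct object { struct object *next; };

    struct cpu_slab {
        struct object *freelist;
        unsigned long tid;
        pthread_mutex_t lock;       /* stands in for cmpxchg_double atomicity */
    };

    /* Atomically: if (freelist, tid) still match, install (new_head, new_tid). */
    static bool cmpxchg_double_model(struct cpu_slab *c,
                                     struct object *old_head, unsigned long old_tid,
                                     struct object *new_head, unsigned long new_tid)
    {
        bool ok;

        pthread_mutex_lock(&c->lock);
        ok = (c->freelist == old_head && c->tid == old_tid);
        if (ok) {
            c->freelist = new_head;
            c->tid = new_tid;
        }
        pthread_mutex_unlock(&c->lock);
        return ok;
    }

    static struct object *model_alloc(struct cpu_slab *c)
    {
        struct object *obj;
        unsigned long tid;

        do {
            tid = c->tid;           /* snapshot the tid before the freelist */
            obj = c->freelist;
            if (!obj)
                return NULL;        /* the real allocator takes the slow path */
        } while (!cmpxchg_double_model(c, obj, tid, obj->next, tid + 1));
        return obj;
    }

    int main(void)
    {
        static struct object objs[2];
        static struct cpu_slab c = { NULL, 0, PTHREAD_MUTEX_INITIALIZER };
        struct object *first, *second;

        objs[0].next = &objs[1];
        c.freelist = &objs[0];

        first = model_alloc(&c);
        second = model_alloc(&c);
        printf("tid=%lu ok=%d\n", c.tid,
               first == &objs[0] && second == &objs[1]);   /* tid=2 ok=1 */
        return 0;
    }

The design point the hunk is making is that preemption and interrupts can stay enabled on the fastpath: any interference between the snapshot and the publish simply makes the compare-and-swap fail and the loop retry.
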
@@ -1833,9 +2034,13 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
1833{ 2034{
1834 void *prior; 2035 void *prior;
1835 void **object = (void *)x; 2036 void **object = (void *)x;
2037#ifdef CONFIG_CMPXCHG_LOCAL
2038 unsigned long flags;
1836 2039
1837 stat(s, FREE_SLOWPATH); 2040 local_irq_save(flags);
2041#endif
1838 slab_lock(page); 2042 slab_lock(page);
2043 stat(s, FREE_SLOWPATH);
1839 2044
1840 if (kmem_cache_debug(s)) 2045 if (kmem_cache_debug(s))
1841 goto debug; 2046 goto debug;
@@ -1865,6 +2070,9 @@ checks_ok:
1865 2070
1866out_unlock: 2071out_unlock:
1867 slab_unlock(page); 2072 slab_unlock(page);
2073#ifdef CONFIG_CMPXCHG_LOCAL
2074 local_irq_restore(flags);
2075#endif
1868 return; 2076 return;
1869 2077
1870slab_empty: 2078slab_empty:
@@ -1876,6 +2084,9 @@ slab_empty:
1876 stat(s, FREE_REMOVE_PARTIAL); 2084 stat(s, FREE_REMOVE_PARTIAL);
1877 } 2085 }
1878 slab_unlock(page); 2086 slab_unlock(page);
2087#ifdef CONFIG_CMPXCHG_LOCAL
2088 local_irq_restore(flags);
2089#endif
1879 stat(s, FREE_SLAB); 2090 stat(s, FREE_SLAB);
1880 discard_slab(s, page); 2091 discard_slab(s, page);
1881 return; 2092 return;
@@ -1902,23 +2113,56 @@ static __always_inline void slab_free(struct kmem_cache *s,
1902{ 2113{
1903 void **object = (void *)x; 2114 void **object = (void *)x;
1904 struct kmem_cache_cpu *c; 2115 struct kmem_cache_cpu *c;
2116#ifdef CONFIG_CMPXCHG_LOCAL
2117 unsigned long tid;
2118#else
1905 unsigned long flags; 2119 unsigned long flags;
2120#endif
1906 2121
1907 slab_free_hook(s, x); 2122 slab_free_hook(s, x);
1908 2123
2124#ifndef CONFIG_CMPXCHG_LOCAL
1909 local_irq_save(flags); 2125 local_irq_save(flags);
2126
2127#else
2128redo:
2129#endif
2130
2131 /*
2132 * Determine the current cpu's per cpu slab.
2133 * The cpu may change afterward. However that does not matter since
2134 * data is retrieved via this pointer. If we are on the same cpu
2135 * during the cmpxchg then the free will succeed.
2136 */
1910 c = __this_cpu_ptr(s->cpu_slab); 2137 c = __this_cpu_ptr(s->cpu_slab);
1911 2138
1912 slab_free_hook_irq(s, x); 2139#ifdef CONFIG_CMPXCHG_LOCAL
2140 tid = c->tid;
2141 barrier();
2142#endif
1913 2143
1914 if (likely(page == c->page && c->node != NUMA_NO_NODE)) { 2144 if (likely(page == c->page && c->node != NUMA_NO_NODE)) {
1915 set_freepointer(s, object, c->freelist); 2145 set_freepointer(s, object, c->freelist);
2146
2147#ifdef CONFIG_CMPXCHG_LOCAL
2148 if (unlikely(!this_cpu_cmpxchg_double(
2149 s->cpu_slab->freelist, s->cpu_slab->tid,
2150 c->freelist, tid,
2151 object, next_tid(tid)))) {
2152
2153 note_cmpxchg_failure("slab_free", s, tid);
2154 goto redo;
2155 }
2156#else
1916 c->freelist = object; 2157 c->freelist = object;
2158#endif
1917 stat(s, FREE_FASTPATH); 2159 stat(s, FREE_FASTPATH);
1918 } else 2160 } else
1919 __slab_free(s, page, x, addr); 2161 __slab_free(s, page, x, addr);
1920 2162
2163#ifndef CONFIG_CMPXCHG_LOCAL
1921 local_irq_restore(flags); 2164 local_irq_restore(flags);
2165#endif
1922} 2166}
1923 2167
1924void kmem_cache_free(struct kmem_cache *s, void *x) 2168void kmem_cache_free(struct kmem_cache *s, void *x)
@@ -1988,13 +2232,13 @@ static int slub_nomerge;
1988 * the smallest order which will fit the object. 2232 * the smallest order which will fit the object.
1989 */ 2233 */
1990static inline int slab_order(int size, int min_objects, 2234static inline int slab_order(int size, int min_objects,
1991 int max_order, int fract_leftover) 2235 int max_order, int fract_leftover, int reserved)
1992{ 2236{
1993 int order; 2237 int order;
1994 int rem; 2238 int rem;
1995 int min_order = slub_min_order; 2239 int min_order = slub_min_order;
1996 2240
1997 if ((PAGE_SIZE << min_order) / size > MAX_OBJS_PER_PAGE) 2241 if (order_objects(min_order, size, reserved) > MAX_OBJS_PER_PAGE)
1998 return get_order(size * MAX_OBJS_PER_PAGE) - 1; 2242 return get_order(size * MAX_OBJS_PER_PAGE) - 1;
1999 2243
2000 for (order = max(min_order, 2244 for (order = max(min_order,
@@ -2003,10 +2247,10 @@ static inline int slab_order(int size, int min_objects,
2003 2247
2004 unsigned long slab_size = PAGE_SIZE << order; 2248 unsigned long slab_size = PAGE_SIZE << order;
2005 2249
2006 if (slab_size < min_objects * size) 2250 if (slab_size < min_objects * size + reserved)
2007 continue; 2251 continue;
2008 2252
2009 rem = slab_size % size; 2253 rem = (slab_size - reserved) % size;
2010 2254
2011 if (rem <= slab_size / fract_leftover) 2255 if (rem <= slab_size / fract_leftover)
2012 break; 2256 break;
@@ -2016,7 +2260,7 @@ static inline int slab_order(int size, int min_objects,
2016 return order; 2260 return order;
2017} 2261}
2018 2262
2019static inline int calculate_order(int size) 2263static inline int calculate_order(int size, int reserved)
2020{ 2264{
2021 int order; 2265 int order;
2022 int min_objects; 2266 int min_objects;
@@ -2034,14 +2278,14 @@ static inline int calculate_order(int size)
2034 min_objects = slub_min_objects; 2278 min_objects = slub_min_objects;
2035 if (!min_objects) 2279 if (!min_objects)
2036 min_objects = 4 * (fls(nr_cpu_ids) + 1); 2280 min_objects = 4 * (fls(nr_cpu_ids) + 1);
2037 max_objects = (PAGE_SIZE << slub_max_order)/size; 2281 max_objects = order_objects(slub_max_order, size, reserved);
2038 min_objects = min(min_objects, max_objects); 2282 min_objects = min(min_objects, max_objects);
2039 2283
2040 while (min_objects > 1) { 2284 while (min_objects > 1) {
2041 fraction = 16; 2285 fraction = 16;
2042 while (fraction >= 4) { 2286 while (fraction >= 4) {
2043 order = slab_order(size, min_objects, 2287 order = slab_order(size, min_objects,
2044 slub_max_order, fraction); 2288 slub_max_order, fraction, reserved);
2045 if (order <= slub_max_order) 2289 if (order <= slub_max_order)
2046 return order; 2290 return order;
2047 fraction /= 2; 2291 fraction /= 2;
@@ -2053,14 +2297,14 @@ static inline int calculate_order(int size)
2053 * We were unable to place multiple objects in a slab. Now 2297 * We were unable to place multiple objects in a slab. Now
2054 * lets see if we can place a single object there. 2298 * lets see if we can place a single object there.
2055 */ 2299 */
2056 order = slab_order(size, 1, slub_max_order, 1); 2300 order = slab_order(size, 1, slub_max_order, 1, reserved);
2057 if (order <= slub_max_order) 2301 if (order <= slub_max_order)
2058 return order; 2302 return order;
2059 2303
2060 /* 2304 /*
2061 * Doh this slab cannot be placed using slub_max_order. 2305 * Doh this slab cannot be placed using slub_max_order.
2062 */ 2306 */
2063 order = slab_order(size, 1, MAX_ORDER, 1); 2307 order = slab_order(size, 1, MAX_ORDER, 1, reserved);
2064 if (order < MAX_ORDER) 2308 if (order < MAX_ORDER)
2065 return order; 2309 return order;
2066 return -ENOSYS; 2310 return -ENOSYS;
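
The slab_order()/calculate_order() changes above thread a per-slab reserved byte count through the sizing math; further down it is used to reserve room for a struct rcu_head in SLAB_DESTROY_BY_RCU caches. Assuming order_objects() simply divides the usable bytes of a slab by the object size, the arithmetic works out roughly as follows:

    #include <stdio.h>

    #define PAGE_SIZE 4096UL

    /* Assumed shape of order_objects(): usable bytes of a slab / object size. */
    static unsigned long order_objects(int order, unsigned long size,
                                       unsigned long reserved)
    {
        return ((PAGE_SIZE << order) - reserved) / size;
    }

    int main(void)
    {
        /* 256-byte objects with 16 bytes reserved at the end of each slab */
        printf("order 0: %lu objects\n", order_objects(0, 256, 16));   /* 15 */
        printf("order 1: %lu objects\n", order_objects(1, 256, 16));   /* 31 */
        return 0;
    }

The same reservation shows up in the leftover check (rem is computed on slab_size minus reserved) so that the order chosen still keeps the wasted fraction of each slab small.
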
@@ -2110,9 +2354,23 @@ static inline int alloc_kmem_cache_cpus(struct kmem_cache *s)
2110 BUILD_BUG_ON(PERCPU_DYNAMIC_EARLY_SIZE < 2354 BUILD_BUG_ON(PERCPU_DYNAMIC_EARLY_SIZE <
2111 SLUB_PAGE_SHIFT * sizeof(struct kmem_cache_cpu)); 2355 SLUB_PAGE_SHIFT * sizeof(struct kmem_cache_cpu));
2112 2356
2357#ifdef CONFIG_CMPXCHG_LOCAL
2358 /*
2359 * Must align to double word boundary for the double cmpxchg instructions
2360 * to work.
2361 */
2362 s->cpu_slab = __alloc_percpu(sizeof(struct kmem_cache_cpu), 2 * sizeof(void *));
2363#else
2364 /* Regular alignment is sufficient */
2113 s->cpu_slab = alloc_percpu(struct kmem_cache_cpu); 2365 s->cpu_slab = alloc_percpu(struct kmem_cache_cpu);
2366#endif
2367
2368 if (!s->cpu_slab)
2369 return 0;
2114 2370
2115 return s->cpu_slab != NULL; 2371 init_kmem_cache_cpus(s);
2372
2373 return 1;
2116} 2374}
2117 2375
2118static struct kmem_cache *kmem_cache_node; 2376static struct kmem_cache *kmem_cache_node;
@@ -2311,7 +2569,7 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order)
2311 if (forced_order >= 0) 2569 if (forced_order >= 0)
2312 order = forced_order; 2570 order = forced_order;
2313 else 2571 else
2314 order = calculate_order(size); 2572 order = calculate_order(size, s->reserved);
2315 2573
2316 if (order < 0) 2574 if (order < 0)
2317 return 0; 2575 return 0;
@@ -2329,8 +2587,8 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order)
2329 /* 2587 /*
2330 * Determine the number of objects per slab 2588 * Determine the number of objects per slab
2331 */ 2589 */
2332 s->oo = oo_make(order, size); 2590 s->oo = oo_make(order, size, s->reserved);
2333 s->min = oo_make(get_order(size), size); 2591 s->min = oo_make(get_order(size), size, s->reserved);
2334 if (oo_objects(s->oo) > oo_objects(s->max)) 2592 if (oo_objects(s->oo) > oo_objects(s->max))
2335 s->max = s->oo; 2593 s->max = s->oo;
2336 2594
@@ -2349,6 +2607,10 @@ static int kmem_cache_open(struct kmem_cache *s,
2349 s->objsize = size; 2607 s->objsize = size;
2350 s->align = align; 2608 s->align = align;
2351 s->flags = kmem_cache_flags(size, flags, name, ctor); 2609 s->flags = kmem_cache_flags(size, flags, name, ctor);
2610 s->reserved = 0;
2611
2612 if (need_reserve_slab_rcu && (s->flags & SLAB_DESTROY_BY_RCU))
2613 s->reserved = sizeof(struct rcu_head);
2352 2614
2353 if (!calculate_sizes(s, -1)) 2615 if (!calculate_sizes(s, -1))
2354 goto error; 2616 goto error;
@@ -2399,12 +2661,6 @@ unsigned int kmem_cache_size(struct kmem_cache *s)
2399} 2661}
2400EXPORT_SYMBOL(kmem_cache_size); 2662EXPORT_SYMBOL(kmem_cache_size);
2401 2663
2402const char *kmem_cache_name(struct kmem_cache *s)
2403{
2404 return s->name;
2405}
2406EXPORT_SYMBOL(kmem_cache_name);
2407
2408static void list_slab_objects(struct kmem_cache *s, struct page *page, 2664static void list_slab_objects(struct kmem_cache *s, struct page *page,
2409 const char *text) 2665 const char *text)
2410{ 2666{
@@ -2696,7 +2952,6 @@ EXPORT_SYMBOL(__kmalloc_node);
2696size_t ksize(const void *object) 2952size_t ksize(const void *object)
2697{ 2953{
2698 struct page *page; 2954 struct page *page;
2699 struct kmem_cache *s;
2700 2955
2701 if (unlikely(object == ZERO_SIZE_PTR)) 2956 if (unlikely(object == ZERO_SIZE_PTR))
2702 return 0; 2957 return 0;
@@ -2707,28 +2962,8 @@ size_t ksize(const void *object)
2707 WARN_ON(!PageCompound(page)); 2962 WARN_ON(!PageCompound(page));
2708 return PAGE_SIZE << compound_order(page); 2963 return PAGE_SIZE << compound_order(page);
2709 } 2964 }
2710 s = page->slab;
2711
2712#ifdef CONFIG_SLUB_DEBUG
2713 /*
2714 * Debugging requires use of the padding between object
2715 * and whatever may come after it.
2716 */
2717 if (s->flags & (SLAB_RED_ZONE | SLAB_POISON))
2718 return s->objsize;
2719 2965
2720#endif 2966 return slab_ksize(page->slab);
2721 /*
2722 * If we have the need to store the freelist pointer
2723 * back there or track user information then we can
2724 * only use the space before that information.
2725 */
2726 if (s->flags & (SLAB_DESTROY_BY_RCU | SLAB_STORE_USER))
2727 return s->inuse;
2728 /*
2729 * Else we can use all the padding etc for the allocation
2730 */
2731 return s->size;
2732} 2967}
2733EXPORT_SYMBOL(ksize); 2968EXPORT_SYMBOL(ksize);
2734 2969
@@ -3312,7 +3547,7 @@ void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, unsigned long caller)
3312 3547
3313 ret = slab_alloc(s, gfpflags, NUMA_NO_NODE, caller); 3548 ret = slab_alloc(s, gfpflags, NUMA_NO_NODE, caller);
3314 3549
3315 /* Honor the call site pointer we recieved. */ 3550 /* Honor the call site pointer we received. */
3316 trace_kmalloc(caller, ret, size, s->size, gfpflags); 3551 trace_kmalloc(caller, ret, size, s->size, gfpflags);
3317 3552
3318 return ret; 3553 return ret;
@@ -3342,7 +3577,7 @@ void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags,
3342 3577
3343 ret = slab_alloc(s, gfpflags, node, caller); 3578 ret = slab_alloc(s, gfpflags, node, caller);
3344 3579
3345 /* Honor the call site pointer we recieved. */ 3580 /* Honor the call site pointer we received. */
3346 trace_kmalloc_node(caller, ret, size, s->size, gfpflags, node); 3581 trace_kmalloc_node(caller, ret, size, s->size, gfpflags, node);
3347 3582
3348 return ret; 3583 return ret;
@@ -4017,6 +4252,12 @@ static ssize_t destroy_by_rcu_show(struct kmem_cache *s, char *buf)
4017} 4252}
4018SLAB_ATTR_RO(destroy_by_rcu); 4253SLAB_ATTR_RO(destroy_by_rcu);
4019 4254
4255static ssize_t reserved_show(struct kmem_cache *s, char *buf)
4256{
4257 return sprintf(buf, "%d\n", s->reserved);
4258}
4259SLAB_ATTR_RO(reserved);
4260
4020#ifdef CONFIG_SLUB_DEBUG 4261#ifdef CONFIG_SLUB_DEBUG
4021static ssize_t slabs_show(struct kmem_cache *s, char *buf) 4262static ssize_t slabs_show(struct kmem_cache *s, char *buf)
4022{ 4263{
@@ -4303,6 +4544,7 @@ static struct attribute *slab_attrs[] = {
4303 &reclaim_account_attr.attr, 4544 &reclaim_account_attr.attr,
4304 &destroy_by_rcu_attr.attr, 4545 &destroy_by_rcu_attr.attr,
4305 &shrink_attr.attr, 4546 &shrink_attr.attr,
4547 &reserved_attr.attr,
4306#ifdef CONFIG_SLUB_DEBUG 4548#ifdef CONFIG_SLUB_DEBUG
4307 &total_objects_attr.attr, 4549 &total_objects_attr.attr,
4308 &slabs_attr.attr, 4550 &slabs_attr.attr,
diff --git a/mm/sparse.c b/mm/sparse.c
index 93250207c5cf..aa64b12831a2 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -500,7 +500,7 @@ void __init sparse_init(void)
500 * so alloc 2M (with 2M align) and 24 bytes in turn will 500 * so alloc 2M (with 2M align) and 24 bytes in turn will
501 * make next 2M slip to one more 2M later. 501 * make next 2M slip to one more 2M later.
502 * then in big system, the memory will have a lot of holes... 502 * then in big system, the memory will have a lot of holes...
503 * here try to allocate 2M pages continously. 503 * here try to allocate 2M pages continuously.
504 * 504 *
505 * powerpc need to call sparse_init_one_section right after each 505 * powerpc need to call sparse_init_one_section right after each
506 * sparse_early_mem_map_alloc, so allocate usemap_map at first. 506 * sparse_early_mem_map_alloc, so allocate usemap_map at first.
diff --git a/mm/swap.c b/mm/swap.c
index c02f93611a84..a448db377cb0 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -39,6 +39,7 @@ int page_cluster;
39 39
40static DEFINE_PER_CPU(struct pagevec[NR_LRU_LISTS], lru_add_pvecs); 40static DEFINE_PER_CPU(struct pagevec[NR_LRU_LISTS], lru_add_pvecs);
41static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs); 41static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs);
42static DEFINE_PER_CPU(struct pagevec, lru_deactivate_pvecs);
42 43
43/* 44/*
44 * This path almost never happens for VM activity - pages are normally 45 * This path almost never happens for VM activity - pages are normally
@@ -178,15 +179,13 @@ void put_pages_list(struct list_head *pages)
178} 179}
179EXPORT_SYMBOL(put_pages_list); 180EXPORT_SYMBOL(put_pages_list);
180 181
181/* 182static void pagevec_lru_move_fn(struct pagevec *pvec,
182 * pagevec_move_tail() must be called with IRQ disabled. 183 void (*move_fn)(struct page *page, void *arg),
183 * Otherwise this may cause nasty races. 184 void *arg)
184 */
185static void pagevec_move_tail(struct pagevec *pvec)
186{ 185{
187 int i; 186 int i;
188 int pgmoved = 0;
189 struct zone *zone = NULL; 187 struct zone *zone = NULL;
188 unsigned long flags = 0;
190 189
191 for (i = 0; i < pagevec_count(pvec); i++) { 190 for (i = 0; i < pagevec_count(pvec); i++) {
192 struct page *page = pvec->pages[i]; 191 struct page *page = pvec->pages[i];
@@ -194,29 +193,50 @@ static void pagevec_move_tail(struct pagevec *pvec)
194 193
195 if (pagezone != zone) { 194 if (pagezone != zone) {
196 if (zone) 195 if (zone)
197 spin_unlock(&zone->lru_lock); 196 spin_unlock_irqrestore(&zone->lru_lock, flags);
198 zone = pagezone; 197 zone = pagezone;
199 spin_lock(&zone->lru_lock); 198 spin_lock_irqsave(&zone->lru_lock, flags);
200 }
201 if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
202 int lru = page_lru_base_type(page);
203 list_move_tail(&page->lru, &zone->lru[lru].list);
204 pgmoved++;
205 } 199 }
200
201 (*move_fn)(page, arg);
206 } 202 }
207 if (zone) 203 if (zone)
208 spin_unlock(&zone->lru_lock); 204 spin_unlock_irqrestore(&zone->lru_lock, flags);
209 __count_vm_events(PGROTATED, pgmoved);
210 release_pages(pvec->pages, pvec->nr, pvec->cold); 205 release_pages(pvec->pages, pvec->nr, pvec->cold);
211 pagevec_reinit(pvec); 206 pagevec_reinit(pvec);
212} 207}
213 208
209static void pagevec_move_tail_fn(struct page *page, void *arg)
210{
211 int *pgmoved = arg;
212 struct zone *zone = page_zone(page);
213
214 if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
215 enum lru_list lru = page_lru_base_type(page);
216 list_move_tail(&page->lru, &zone->lru[lru].list);
217 mem_cgroup_rotate_reclaimable_page(page);
218 (*pgmoved)++;
219 }
220}
221
222/*
223 * pagevec_move_tail() must be called with IRQ disabled.
224 * Otherwise this may cause nasty races.
225 */
226static void pagevec_move_tail(struct pagevec *pvec)
227{
228 int pgmoved = 0;
229
230 pagevec_lru_move_fn(pvec, pagevec_move_tail_fn, &pgmoved);
231 __count_vm_events(PGROTATED, pgmoved);
232}
233
214/* 234/*
215 * Writeback is about to end against a page which has been marked for immediate 235 * Writeback is about to end against a page which has been marked for immediate
216 * reclaim. If it still appears to be reclaimable, move it to the tail of the 236 * reclaim. If it still appears to be reclaimable, move it to the tail of the
217 * inactive list. 237 * inactive list.
218 */ 238 */
219void rotate_reclaimable_page(struct page *page) 239void rotate_reclaimable_page(struct page *page)
220{ 240{
221 if (!PageLocked(page) && !PageDirty(page) && !PageActive(page) && 241 if (!PageLocked(page) && !PageDirty(page) && !PageActive(page) &&
222 !PageUnevictable(page) && PageLRU(page)) { 242 !PageUnevictable(page) && PageLRU(page)) {
@@ -347,6 +367,71 @@ void add_page_to_unevictable_list(struct page *page)
347} 367}
348 368
349/* 369/*
370 * If the page cannot be invalidated, it is moved to the
371 * inactive list to speed up its reclaim. It is moved to the
372 * head of the list, rather than the tail, to give the flusher
373 * threads some time to write it out, as this is much more
374 * effective than the single-page writeout from reclaim.
375 *
376 * If the page isn't page_mapped() and is dirty or under writeback,
377 * it can be reclaimed asap using PG_reclaim.
378 *
379 * 1. active, mapped page -> none
380 * 2. active, dirty/writeback page -> inactive, head, PG_reclaim
381 * 3. inactive, mapped page -> none
382 * 4. inactive, dirty/writeback page -> inactive, head, PG_reclaim
383 * 5. inactive, clean -> inactive, tail
384 * 6. Others -> none
385 *
386 * In case 4, the page is moved to the head of the inactive list because
387 * the VM expects flusher threads to write it out, as this is much more
388 * effective than the single-page writeout from reclaim.
389 */
390static void lru_deactivate_fn(struct page *page, void *arg)
391{
392 int lru, file;
393 bool active;
394 struct zone *zone = page_zone(page);
395
396 if (!PageLRU(page))
397 return;
398
399 /* Some processes are using the page */
400 if (page_mapped(page))
401 return;
402
403 active = PageActive(page);
404
405 file = page_is_file_cache(page);
406 lru = page_lru_base_type(page);
407 del_page_from_lru_list(zone, page, lru + active);
408 ClearPageActive(page);
409 ClearPageReferenced(page);
410 add_page_to_lru_list(zone, page, lru);
411
412 if (PageWriteback(page) || PageDirty(page)) {
413 /*
414 * PG_reclaim could be raced with end_page_writeback
415 * It can make readahead confusing. But race window
416 * is _really_ small and it's non-critical problem.
417 */
418 SetPageReclaim(page);
419 } else {
420 /*
421 * The page's writeback finished while it sat in the pagevec;
422 * move the page to the tail of the inactive list.
423 */
424 list_move_tail(&page->lru, &zone->lru[lru].list);
425 mem_cgroup_rotate_reclaimable_page(page);
426 __count_vm_event(PGROTATED);
427 }
428
429 if (active)
430 __count_vm_event(PGDEACTIVATE);
431 update_page_reclaim_stat(zone, page, file, 0);
432}
433
434/*
350 * Drain pages out of the cpu's pagevecs. 435 * Drain pages out of the cpu's pagevecs.
351 * Either "cpu" is the current CPU, and preemption has already been 436 * Either "cpu" is the current CPU, and preemption has already been
352 * disabled; or "cpu" is being hot-unplugged, and is already dead. 437 * disabled; or "cpu" is being hot-unplugged, and is already dead.
@@ -372,6 +457,29 @@ static void drain_cpu_pagevecs(int cpu)
372 pagevec_move_tail(pvec); 457 pagevec_move_tail(pvec);
373 local_irq_restore(flags); 458 local_irq_restore(flags);
374 } 459 }
460
461 pvec = &per_cpu(lru_deactivate_pvecs, cpu);
462 if (pagevec_count(pvec))
463 pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL);
464}
465
466/**
467 * deactivate_page - forcefully deactivate a page
468 * @page: page to deactivate
469 *
470 * This function hints the VM that @page is a good reclaim candidate,
471 * for example if its invalidation fails due to the page being dirty
472 * or under writeback.
473 */
474void deactivate_page(struct page *page)
475{
476 if (likely(get_page_unless_zero(page))) {
477 struct pagevec *pvec = &get_cpu_var(lru_deactivate_pvecs);
478
479 if (!pagevec_add(pvec, page))
480 pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL);
481 put_cpu_var(lru_deactivate_pvecs);
482 }
375} 483}
376 484
377void lru_add_drain(void) 485void lru_add_drain(void)
@@ -516,44 +624,33 @@ void lru_add_page_tail(struct zone* zone,
516 } 624 }
517} 625}
518 626
627static void ____pagevec_lru_add_fn(struct page *page, void *arg)
628{
629 enum lru_list lru = (enum lru_list)arg;
630 struct zone *zone = page_zone(page);
631 int file = is_file_lru(lru);
632 int active = is_active_lru(lru);
633
634 VM_BUG_ON(PageActive(page));
635 VM_BUG_ON(PageUnevictable(page));
636 VM_BUG_ON(PageLRU(page));
637
638 SetPageLRU(page);
639 if (active)
640 SetPageActive(page);
641 update_page_reclaim_stat(zone, page, file, active);
642 add_page_to_lru_list(zone, page, lru);
643}
644
519/* 645/*
520 * Add the passed pages to the LRU, then drop the caller's refcount 646 * Add the passed pages to the LRU, then drop the caller's refcount
521 * on them. Reinitialises the caller's pagevec. 647 * on them. Reinitialises the caller's pagevec.
522 */ 648 */
523void ____pagevec_lru_add(struct pagevec *pvec, enum lru_list lru) 649void ____pagevec_lru_add(struct pagevec *pvec, enum lru_list lru)
524{ 650{
525 int i;
526 struct zone *zone = NULL;
527
528 VM_BUG_ON(is_unevictable_lru(lru)); 651 VM_BUG_ON(is_unevictable_lru(lru));
529 652
530 for (i = 0; i < pagevec_count(pvec); i++) { 653 pagevec_lru_move_fn(pvec, ____pagevec_lru_add_fn, (void *)lru);
531 struct page *page = pvec->pages[i];
532 struct zone *pagezone = page_zone(page);
533 int file;
534 int active;
535
536 if (pagezone != zone) {
537 if (zone)
538 spin_unlock_irq(&zone->lru_lock);
539 zone = pagezone;
540 spin_lock_irq(&zone->lru_lock);
541 }
542 VM_BUG_ON(PageActive(page));
543 VM_BUG_ON(PageUnevictable(page));
544 VM_BUG_ON(PageLRU(page));
545 SetPageLRU(page);
546 active = is_active_lru(lru);
547 file = is_file_lru(lru);
548 if (active)
549 SetPageActive(page);
550 update_page_reclaim_stat(zone, page, file, active);
551 add_page_to_lru_list(zone, page, lru);
552 }
553 if (zone)
554 spin_unlock_irq(&zone->lru_lock);
555 release_pages(pvec->pages, pvec->nr, pvec->cold);
556 pagevec_reinit(pvec);
557} 654}
558 655
559EXPORT_SYMBOL(____pagevec_lru_add); 656EXPORT_SYMBOL(____pagevec_lru_add);
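
The swap.c changes above collapse several open-coded loops that walked a pagevec and took each zone's lru_lock only when the zone changed into a single pagevec_lru_move_fn() plus per-page callbacks. A stripped-down model of that shape, with hypothetical names and printf() standing in for the locking:

    #include <stdio.h>

    #define BATCH 14    /* roughly PAGEVEC_SIZE, illustrative */

    struct item { int zone_id; int moved; };

    struct batch {
        int nr;
        struct item *items[BATCH];
    };

    /*
     * Walk the batch, "locking" a zone only when it changes, the way
     * pagevec_lru_move_fn() handles zone->lru_lock, then hand each
     * element to the caller's callback.
     */
    static void batch_move_fn(struct batch *b,
                              void (*move_fn)(struct item *, void *), void *arg)
    {
        int locked_zone = -1;
        int i;

        for (i = 0; i < b->nr; i++) {
            struct item *it = b->items[i];

            if (it->zone_id != locked_zone) {
                if (locked_zone >= 0)
                    printf("unlock zone %d\n", locked_zone);
                locked_zone = it->zone_id;
                printf("lock zone %d\n", locked_zone);
            }
            move_fn(it, arg);
        }
        if (locked_zone >= 0)
            printf("unlock zone %d\n", locked_zone);
        b->nr = 0;      /* the pagevec_reinit() equivalent */
    }

    static void mark_moved(struct item *it, void *arg)
    {
        int *count = arg;

        it->moved = 1;
        (*count)++;
    }

    int main(void)
    {
        struct item a = { 0, 0 }, b = { 0, 0 }, c = { 1, 0 };
        struct batch pv = { 3, { &a, &b, &c } };
        int moved = 0;

        batch_move_fn(&pv, mark_moved, &moved);
        printf("moved %d items\n", moved);   /* moved 3 items */
        return 0;
    }

pagevec_move_tail_fn(), lru_deactivate_fn() and ____pagevec_lru_add_fn() in the hunks above are simply different callbacks plugged into that one traversal.
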
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 5c8cfabbc9bc..46680461785b 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -24,12 +24,10 @@
24 24
25/* 25/*
26 * swapper_space is a fiction, retained to simplify the path through 26 * swapper_space is a fiction, retained to simplify the path through
27 * vmscan's shrink_page_list, to make sync_page look nicer, and to allow 27 * vmscan's shrink_page_list.
28 * future use of radix_tree tags in the swap cache.
29 */ 28 */
30static const struct address_space_operations swap_aops = { 29static const struct address_space_operations swap_aops = {
31 .writepage = swap_writepage, 30 .writepage = swap_writepage,
32 .sync_page = block_sync_page,
33 .set_page_dirty = __set_page_dirty_nobuffers, 31 .set_page_dirty = __set_page_dirty_nobuffers,
34 .migratepage = migrate_page, 32 .migratepage = migrate_page,
35}; 33};
@@ -37,7 +35,6 @@ static const struct address_space_operations swap_aops = {
37static struct backing_dev_info swap_backing_dev_info = { 35static struct backing_dev_info swap_backing_dev_info = {
38 .name = "swap", 36 .name = "swap",
39 .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_SWAP_BACKED, 37 .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_SWAP_BACKED,
40 .unplug_io_fn = swap_unplug_io_fn,
41}; 38};
42 39
43struct address_space swapper_space = { 40struct address_space swapper_space = {
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 0341c5700e34..8c6b3ce38f09 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -95,39 +95,6 @@ __try_to_reclaim_swap(struct swap_info_struct *si, unsigned long offset)
95} 95}
96 96
97/* 97/*
98 * We need this because the bdev->unplug_fn can sleep and we cannot
99 * hold swap_lock while calling the unplug_fn. And swap_lock
100 * cannot be turned into a mutex.
101 */
102static DECLARE_RWSEM(swap_unplug_sem);
103
104void swap_unplug_io_fn(struct backing_dev_info *unused_bdi, struct page *page)
105{
106 swp_entry_t entry;
107
108 down_read(&swap_unplug_sem);
109 entry.val = page_private(page);
110 if (PageSwapCache(page)) {
111 struct block_device *bdev = swap_info[swp_type(entry)]->bdev;
112 struct backing_dev_info *bdi;
113
114 /*
115 * If the page is removed from swapcache from under us (with a
116 * racy try_to_unuse/swapoff) we need an additional reference
117 * count to avoid reading garbage from page_private(page) above.
118 * If the WARN_ON triggers during a swapoff it maybe the race
119 * condition and it's harmless. However if it triggers without
120 * swapoff it signals a problem.
121 */
122 WARN_ON(page_count(page) <= 1);
123
124 bdi = bdev->bd_inode->i_mapping->backing_dev_info;
125 blk_run_backing_dev(bdi, page);
126 }
127 up_read(&swap_unplug_sem);
128}
129
130/*
131 * swapon tell device that all the old swap contents can be discarded, 98 * swapon tell device that all the old swap contents can be discarded,
132 * to allow the swap device to optimize its wear-levelling. 99 * to allow the swap device to optimize its wear-levelling.
133 */ 100 */
@@ -212,8 +179,8 @@ static int wait_for_discard(void *word)
212#define SWAPFILE_CLUSTER 256 179#define SWAPFILE_CLUSTER 256
213#define LATENCY_LIMIT 256 180#define LATENCY_LIMIT 256
214 181
215static inline unsigned long scan_swap_map(struct swap_info_struct *si, 182static unsigned long scan_swap_map(struct swap_info_struct *si,
216 unsigned char usage) 183 unsigned char usage)
217{ 184{
218 unsigned long offset; 185 unsigned long offset;
219 unsigned long scan_base; 186 unsigned long scan_base;
@@ -880,7 +847,7 @@ unsigned int count_swap_pages(int type, int free)
880static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, 847static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
881 unsigned long addr, swp_entry_t entry, struct page *page) 848 unsigned long addr, swp_entry_t entry, struct page *page)
882{ 849{
883 struct mem_cgroup *ptr = NULL; 850 struct mem_cgroup *ptr;
884 spinlock_t *ptl; 851 spinlock_t *ptl;
885 pte_t *pte; 852 pte_t *pte;
886 int ret = 1; 853 int ret = 1;
@@ -1550,6 +1517,36 @@ bad_bmap:
1550 goto out; 1517 goto out;
1551} 1518}
1552 1519
1520static void enable_swap_info(struct swap_info_struct *p, int prio,
1521 unsigned char *swap_map)
1522{
1523 int i, prev;
1524
1525 spin_lock(&swap_lock);
1526 if (prio >= 0)
1527 p->prio = prio;
1528 else
1529 p->prio = --least_priority;
1530 p->swap_map = swap_map;
1531 p->flags |= SWP_WRITEOK;
1532 nr_swap_pages += p->pages;
1533 total_swap_pages += p->pages;
1534
1535 /* insert swap space into swap_list: */
1536 prev = -1;
1537 for (i = swap_list.head; i >= 0; i = swap_info[i]->next) {
1538 if (p->prio >= swap_info[i]->prio)
1539 break;
1540 prev = i;
1541 }
1542 p->next = i;
1543 if (prev < 0)
1544 swap_list.head = swap_list.next = p->type;
1545 else
1546 swap_info[prev]->next = p->type;
1547 spin_unlock(&swap_lock);
1548}
1549
1553SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) 1550SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
1554{ 1551{
1555 struct swap_info_struct *p = NULL; 1552 struct swap_info_struct *p = NULL;
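
enable_swap_info() above centralizes the insertion of a swap area into swap_list, which is kept sorted by descending priority and threaded through integer indices (swap_info[i]->next) rather than pointers. A self-contained sketch of that insertion with simplified types; the array size and priorities are made up:

    #include <stdio.h>

    struct swap_entry { int prio; int next; };

    static struct swap_entry swap_info[4] = {
        { .prio = 10, .next =  2 },     /* index 0 */
        { .prio = -1, .next = -1 },     /* index 1: the one being enabled */
        { .prio =  5, .next = -1 },     /* index 2 */
    };
    static int swap_list_head = 0;

    /* Insert entry 'type' so the list stays sorted by descending priority. */
    static void insert_by_prio(int type)
    {
        int i, prev = -1;

        for (i = swap_list_head; i >= 0; i = swap_info[i].next) {
            if (swap_info[type].prio >= swap_info[i].prio)
                break;
            prev = i;
        }
        swap_info[type].next = i;
        if (prev < 0)
            swap_list_head = type;
        else
            swap_info[prev].next = type;
    }

    int main(void)
    {
        int i;

        swap_info[1].prio = 7;          /* lands between prio 10 and prio 5 */
        insert_by_prio(1);
        for (i = swap_list_head; i >= 0; i = swap_info[i].next)
            printf("index %d prio %d\n", i, swap_info[i].prio);
        return 0;                       /* prints 0/10, then 1/7, then 2/5 */
    }

Because both swapon and the swapoff error path now call the same helper, the walk is written once and the locking around it lives in one place.
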
@@ -1621,32 +1618,17 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
1621 current->flags &= ~PF_OOM_ORIGIN; 1618 current->flags &= ~PF_OOM_ORIGIN;
1622 1619
1623 if (err) { 1620 if (err) {
1621 /*
1622 * reading p->prio and p->swap_map outside the lock is
1623 * safe here because only sys_swapon and sys_swapoff
1624 * change them, and there can be no other sys_swapon or
1625 * sys_swapoff for this swap_info_struct at this point.
1626 */
1624 /* re-insert swap space back into swap_list */ 1627 /* re-insert swap space back into swap_list */
1625 spin_lock(&swap_lock); 1628 enable_swap_info(p, p->prio, p->swap_map);
1626 if (p->prio < 0)
1627 p->prio = --least_priority;
1628 prev = -1;
1629 for (i = swap_list.head; i >= 0; i = swap_info[i]->next) {
1630 if (p->prio >= swap_info[i]->prio)
1631 break;
1632 prev = i;
1633 }
1634 p->next = i;
1635 if (prev < 0)
1636 swap_list.head = swap_list.next = type;
1637 else
1638 swap_info[prev]->next = type;
1639 nr_swap_pages += p->pages;
1640 total_swap_pages += p->pages;
1641 p->flags |= SWP_WRITEOK;
1642 spin_unlock(&swap_lock);
1643 goto out_dput; 1629 goto out_dput;
1644 } 1630 }
1645 1631
1646 /* wait for any unplug function to finish */
1647 down_write(&swap_unplug_sem);
1648 up_write(&swap_unplug_sem);
1649
1650 destroy_swap_extents(p); 1632 destroy_swap_extents(p);
1651 if (p->flags & SWP_CONTINUED) 1633 if (p->flags & SWP_CONTINUED)
1652 free_swap_count_continuations(p); 1634 free_swap_count_continuations(p);
@@ -1844,49 +1826,24 @@ static int __init max_swapfiles_check(void)
1844late_initcall(max_swapfiles_check); 1826late_initcall(max_swapfiles_check);
1845#endif 1827#endif
1846 1828
1847/* 1829static struct swap_info_struct *alloc_swap_info(void)
1848 * Written 01/25/92 by Simmule Turner, heavily changed by Linus.
1849 *
1850 * The swapon system call
1851 */
1852SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
1853{ 1830{
1854 struct swap_info_struct *p; 1831 struct swap_info_struct *p;
1855 char *name = NULL;
1856 struct block_device *bdev = NULL;
1857 struct file *swap_file = NULL;
1858 struct address_space *mapping;
1859 unsigned int type; 1832 unsigned int type;
1860 int i, prev;
1861 int error;
1862 union swap_header *swap_header;
1863 unsigned int nr_good_pages;
1864 int nr_extents = 0;
1865 sector_t span;
1866 unsigned long maxpages;
1867 unsigned long swapfilepages;
1868 unsigned char *swap_map = NULL;
1869 struct page *page = NULL;
1870 struct inode *inode = NULL;
1871 int did_down = 0;
1872
1873 if (!capable(CAP_SYS_ADMIN))
1874 return -EPERM;
1875 1833
1876 p = kzalloc(sizeof(*p), GFP_KERNEL); 1834 p = kzalloc(sizeof(*p), GFP_KERNEL);
1877 if (!p) 1835 if (!p)
1878 return -ENOMEM; 1836 return ERR_PTR(-ENOMEM);
1879 1837
1880 spin_lock(&swap_lock); 1838 spin_lock(&swap_lock);
1881 for (type = 0; type < nr_swapfiles; type++) { 1839 for (type = 0; type < nr_swapfiles; type++) {
1882 if (!(swap_info[type]->flags & SWP_USED)) 1840 if (!(swap_info[type]->flags & SWP_USED))
1883 break; 1841 break;
1884 } 1842 }
1885 error = -EPERM;
1886 if (type >= MAX_SWAPFILES) { 1843 if (type >= MAX_SWAPFILES) {
1887 spin_unlock(&swap_lock); 1844 spin_unlock(&swap_lock);
1888 kfree(p); 1845 kfree(p);
1889 goto out; 1846 return ERR_PTR(-EPERM);
1890 } 1847 }
1891 if (type >= nr_swapfiles) { 1848 if (type >= nr_swapfiles) {
1892 p->type = type; 1849 p->type = type;
@@ -1911,81 +1868,49 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
1911 p->next = -1; 1868 p->next = -1;
1912 spin_unlock(&swap_lock); 1869 spin_unlock(&swap_lock);
1913 1870
1914 name = getname(specialfile); 1871 return p;
1915 error = PTR_ERR(name); 1872}
1916 if (IS_ERR(name)) {
1917 name = NULL;
1918 goto bad_swap_2;
1919 }
1920 swap_file = filp_open(name, O_RDWR|O_LARGEFILE, 0);
1921 error = PTR_ERR(swap_file);
1922 if (IS_ERR(swap_file)) {
1923 swap_file = NULL;
1924 goto bad_swap_2;
1925 }
1926
1927 p->swap_file = swap_file;
1928 mapping = swap_file->f_mapping;
1929 inode = mapping->host;
1930
1931 error = -EBUSY;
1932 for (i = 0; i < nr_swapfiles; i++) {
1933 struct swap_info_struct *q = swap_info[i];
1934 1873
1935 if (i == type || !q->swap_file) 1874static int claim_swapfile(struct swap_info_struct *p, struct inode *inode)
1936 continue; 1875{
1937 if (mapping == q->swap_file->f_mapping) 1876 int error;
1938 goto bad_swap;
1939 }
1940 1877
1941 error = -EINVAL;
1942 if (S_ISBLK(inode->i_mode)) { 1878 if (S_ISBLK(inode->i_mode)) {
1943 bdev = bdgrab(I_BDEV(inode)); 1879 p->bdev = bdgrab(I_BDEV(inode));
1944 error = blkdev_get(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL, 1880 error = blkdev_get(p->bdev,
1881 FMODE_READ | FMODE_WRITE | FMODE_EXCL,
1945 sys_swapon); 1882 sys_swapon);
1946 if (error < 0) { 1883 if (error < 0) {
1947 bdev = NULL; 1884 p->bdev = NULL;
1948 error = -EINVAL; 1885 return -EINVAL;
1949 goto bad_swap;
1950 } 1886 }
1951 p->old_block_size = block_size(bdev); 1887 p->old_block_size = block_size(p->bdev);
1952 error = set_blocksize(bdev, PAGE_SIZE); 1888 error = set_blocksize(p->bdev, PAGE_SIZE);
1953 if (error < 0) 1889 if (error < 0)
1954 goto bad_swap; 1890 return error;
1955 p->bdev = bdev;
1956 p->flags |= SWP_BLKDEV; 1891 p->flags |= SWP_BLKDEV;
1957 } else if (S_ISREG(inode->i_mode)) { 1892 } else if (S_ISREG(inode->i_mode)) {
1958 p->bdev = inode->i_sb->s_bdev; 1893 p->bdev = inode->i_sb->s_bdev;
1959 mutex_lock(&inode->i_mutex); 1894 mutex_lock(&inode->i_mutex);
1960 did_down = 1; 1895 if (IS_SWAPFILE(inode))
1961 if (IS_SWAPFILE(inode)) { 1896 return -EBUSY;
1962 error = -EBUSY; 1897 } else
1963 goto bad_swap; 1898 return -EINVAL;
1964 }
1965 } else {
1966 goto bad_swap;
1967 }
1968 1899
1969 swapfilepages = i_size_read(inode) >> PAGE_SHIFT; 1900 return 0;
1901}
1970 1902
1971 /* 1903static unsigned long read_swap_header(struct swap_info_struct *p,
1972 * Read the swap header. 1904 union swap_header *swap_header,
1973 */ 1905 struct inode *inode)
1974 if (!mapping->a_ops->readpage) { 1906{
1975 error = -EINVAL; 1907 int i;
1976 goto bad_swap; 1908 unsigned long maxpages;
1977 } 1909 unsigned long swapfilepages;
1978 page = read_mapping_page(mapping, 0, swap_file);
1979 if (IS_ERR(page)) {
1980 error = PTR_ERR(page);
1981 goto bad_swap;
1982 }
1983 swap_header = kmap(page);
1984 1910
1985 if (memcmp("SWAPSPACE2", swap_header->magic.magic, 10)) { 1911 if (memcmp("SWAPSPACE2", swap_header->magic.magic, 10)) {
1986 printk(KERN_ERR "Unable to find swap-space signature\n"); 1912 printk(KERN_ERR "Unable to find swap-space signature\n");
1987 error = -EINVAL; 1913 return 0;
1988 goto bad_swap;
1989 } 1914 }
1990 1915
1991 /* swap partition endianess hack... */ 1916 /* swap partition endianess hack... */
@@ -2001,8 +1926,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
2001 printk(KERN_WARNING 1926 printk(KERN_WARNING
2002 "Unable to handle swap header version %d\n", 1927 "Unable to handle swap header version %d\n",
2003 swap_header->info.version); 1928 swap_header->info.version);
2004 error = -EINVAL; 1929 return 0;
2005 goto bad_swap;
2006 } 1930 }
2007 1931
2008 p->lowest_bit = 1; 1932 p->lowest_bit = 1;
@@ -2033,61 +1957,155 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
2033 } 1957 }
2034 p->highest_bit = maxpages - 1; 1958 p->highest_bit = maxpages - 1;
2035 1959
2036 error = -EINVAL;
2037 if (!maxpages) 1960 if (!maxpages)
2038 goto bad_swap; 1961 return 0;
1962 swapfilepages = i_size_read(inode) >> PAGE_SHIFT;
2039 if (swapfilepages && maxpages > swapfilepages) { 1963 if (swapfilepages && maxpages > swapfilepages) {
2040 printk(KERN_WARNING 1964 printk(KERN_WARNING
2041 "Swap area shorter than signature indicates\n"); 1965 "Swap area shorter than signature indicates\n");
2042 goto bad_swap; 1966 return 0;
2043 } 1967 }
2044 if (swap_header->info.nr_badpages && S_ISREG(inode->i_mode)) 1968 if (swap_header->info.nr_badpages && S_ISREG(inode->i_mode))
2045 goto bad_swap; 1969 return 0;
2046 if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES) 1970 if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES)
2047 goto bad_swap; 1971 return 0;
2048 1972
2049 /* OK, set up the swap map and apply the bad block list */ 1973 return maxpages;
2050 swap_map = vmalloc(maxpages); 1974}
2051 if (!swap_map) { 1975
2052 error = -ENOMEM; 1976static int setup_swap_map_and_extents(struct swap_info_struct *p,
2053 goto bad_swap; 1977 union swap_header *swap_header,
2054 } 1978 unsigned char *swap_map,
1979 unsigned long maxpages,
1980 sector_t *span)
1981{
1982 int i;
1983 unsigned int nr_good_pages;
1984 int nr_extents;
2055 1985
2056 memset(swap_map, 0, maxpages);
2057 nr_good_pages = maxpages - 1; /* omit header page */ 1986 nr_good_pages = maxpages - 1; /* omit header page */
2058 1987
2059 for (i = 0; i < swap_header->info.nr_badpages; i++) { 1988 for (i = 0; i < swap_header->info.nr_badpages; i++) {
2060 unsigned int page_nr = swap_header->info.badpages[i]; 1989 unsigned int page_nr = swap_header->info.badpages[i];
2061 if (page_nr == 0 || page_nr > swap_header->info.last_page) { 1990 if (page_nr == 0 || page_nr > swap_header->info.last_page)
2062 error = -EINVAL; 1991 return -EINVAL;
2063 goto bad_swap;
2064 }
2065 if (page_nr < maxpages) { 1992 if (page_nr < maxpages) {
2066 swap_map[page_nr] = SWAP_MAP_BAD; 1993 swap_map[page_nr] = SWAP_MAP_BAD;
2067 nr_good_pages--; 1994 nr_good_pages--;
2068 } 1995 }
2069 } 1996 }
2070 1997
2071 error = swap_cgroup_swapon(type, maxpages);
2072 if (error)
2073 goto bad_swap;
2074
2075 if (nr_good_pages) { 1998 if (nr_good_pages) {
2076 swap_map[0] = SWAP_MAP_BAD; 1999 swap_map[0] = SWAP_MAP_BAD;
2077 p->max = maxpages; 2000 p->max = maxpages;
2078 p->pages = nr_good_pages; 2001 p->pages = nr_good_pages;
2079 nr_extents = setup_swap_extents(p, &span); 2002 nr_extents = setup_swap_extents(p, span);
2080 if (nr_extents < 0) { 2003 if (nr_extents < 0)
2081 error = nr_extents; 2004 return nr_extents;
2082 goto bad_swap;
2083 }
2084 nr_good_pages = p->pages; 2005 nr_good_pages = p->pages;
2085 } 2006 }
2086 if (!nr_good_pages) { 2007 if (!nr_good_pages) {
2087 printk(KERN_WARNING "Empty swap-file\n"); 2008 printk(KERN_WARNING "Empty swap-file\n");
2009 return -EINVAL;
2010 }
2011
2012 return nr_extents;
2013}
2014
2015SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
2016{
2017 struct swap_info_struct *p;
2018 char *name;
2019 struct file *swap_file = NULL;
2020 struct address_space *mapping;
2021 int i;
2022 int prio;
2023 int error;
2024 union swap_header *swap_header;
2025 int nr_extents;
2026 sector_t span;
2027 unsigned long maxpages;
2028 unsigned char *swap_map = NULL;
2029 struct page *page = NULL;
2030 struct inode *inode = NULL;
2031
2032 if (!capable(CAP_SYS_ADMIN))
2033 return -EPERM;
2034
2035 p = alloc_swap_info();
2036 if (IS_ERR(p))
2037 return PTR_ERR(p);
2038
2039 name = getname(specialfile);
2040 if (IS_ERR(name)) {
2041 error = PTR_ERR(name);
2042 name = NULL;
2043 goto bad_swap;
2044 }
2045 swap_file = filp_open(name, O_RDWR|O_LARGEFILE, 0);
2046 if (IS_ERR(swap_file)) {
2047 error = PTR_ERR(swap_file);
2048 swap_file = NULL;
2049 goto bad_swap;
2050 }
2051
2052 p->swap_file = swap_file;
2053 mapping = swap_file->f_mapping;
2054
2055 for (i = 0; i < nr_swapfiles; i++) {
2056 struct swap_info_struct *q = swap_info[i];
2057
2058 if (q == p || !q->swap_file)
2059 continue;
2060 if (mapping == q->swap_file->f_mapping) {
2061 error = -EBUSY;
2062 goto bad_swap;
2063 }
2064 }
2065
2066 inode = mapping->host;
2067 /* If S_ISREG(inode->i_mode), claim_swapfile() will do mutex_lock(&inode->i_mutex) */
2068 error = claim_swapfile(p, inode);
2069 if (unlikely(error))
2070 goto bad_swap;
2071
2072 /*
2073 * Read the swap header.
2074 */
2075 if (!mapping->a_ops->readpage) {
2088 error = -EINVAL; 2076 error = -EINVAL;
2089 goto bad_swap; 2077 goto bad_swap;
2090 } 2078 }
2079 page = read_mapping_page(mapping, 0, swap_file);
2080 if (IS_ERR(page)) {
2081 error = PTR_ERR(page);
2082 goto bad_swap;
2083 }
2084 swap_header = kmap(page);
2085
2086 maxpages = read_swap_header(p, swap_header, inode);
2087 if (unlikely(!maxpages)) {
2088 error = -EINVAL;
2089 goto bad_swap;
2090 }
2091
2092 /* OK, set up the swap map and apply the bad block list */
2093 swap_map = vzalloc(maxpages);
2094 if (!swap_map) {
2095 error = -ENOMEM;
2096 goto bad_swap;
2097 }
2098
2099 error = swap_cgroup_swapon(p->type, maxpages);
2100 if (error)
2101 goto bad_swap;
2102
2103 nr_extents = setup_swap_map_and_extents(p, swap_header, swap_map,
2104 maxpages, &span);
2105 if (unlikely(nr_extents < 0)) {
2106 error = nr_extents;
2107 goto bad_swap;
2108 }
2091 2109
2092 if (p->bdev) { 2110 if (p->bdev) {
2093 if (blk_queue_nonrot(bdev_get_queue(p->bdev))) { 2111 if (blk_queue_nonrot(bdev_get_queue(p->bdev))) {
@@ -2099,58 +2117,46 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
2099 } 2117 }
2100 2118
2101 mutex_lock(&swapon_mutex); 2119 mutex_lock(&swapon_mutex);
2102 spin_lock(&swap_lock); 2120 prio = -1;
2103 if (swap_flags & SWAP_FLAG_PREFER) 2121 if (swap_flags & SWAP_FLAG_PREFER)
2104 p->prio = 2122 prio =
2105 (swap_flags & SWAP_FLAG_PRIO_MASK) >> SWAP_FLAG_PRIO_SHIFT; 2123 (swap_flags & SWAP_FLAG_PRIO_MASK) >> SWAP_FLAG_PRIO_SHIFT;
2106 else 2124 enable_swap_info(p, prio, swap_map);
2107 p->prio = --least_priority;
2108 p->swap_map = swap_map;
2109 p->flags |= SWP_WRITEOK;
2110 nr_swap_pages += nr_good_pages;
2111 total_swap_pages += nr_good_pages;
2112 2125
2113 printk(KERN_INFO "Adding %uk swap on %s. " 2126 printk(KERN_INFO "Adding %uk swap on %s. "
2114 "Priority:%d extents:%d across:%lluk %s%s\n", 2127 "Priority:%d extents:%d across:%lluk %s%s\n",
2115 nr_good_pages<<(PAGE_SHIFT-10), name, p->prio, 2128 p->pages<<(PAGE_SHIFT-10), name, p->prio,
2116 nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10), 2129 nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10),
2117 (p->flags & SWP_SOLIDSTATE) ? "SS" : "", 2130 (p->flags & SWP_SOLIDSTATE) ? "SS" : "",
2118 (p->flags & SWP_DISCARDABLE) ? "D" : ""); 2131 (p->flags & SWP_DISCARDABLE) ? "D" : "");
2119 2132
2120 /* insert swap space into swap_list: */
2121 prev = -1;
2122 for (i = swap_list.head; i >= 0; i = swap_info[i]->next) {
2123 if (p->prio >= swap_info[i]->prio)
2124 break;
2125 prev = i;
2126 }
2127 p->next = i;
2128 if (prev < 0)
2129 swap_list.head = swap_list.next = type;
2130 else
2131 swap_info[prev]->next = type;
2132 spin_unlock(&swap_lock);
2133 mutex_unlock(&swapon_mutex); 2133 mutex_unlock(&swapon_mutex);
2134 atomic_inc(&proc_poll_event); 2134 atomic_inc(&proc_poll_event);
2135 wake_up_interruptible(&proc_poll_wait); 2135 wake_up_interruptible(&proc_poll_wait);
2136 2136
2137 if (S_ISREG(inode->i_mode))
2138 inode->i_flags |= S_SWAPFILE;
2137 error = 0; 2139 error = 0;
2138 goto out; 2140 goto out;
2139bad_swap: 2141bad_swap:
2140 if (bdev) { 2142 if (inode && S_ISBLK(inode->i_mode) && p->bdev) {
2141 set_blocksize(bdev, p->old_block_size); 2143 set_blocksize(p->bdev, p->old_block_size);
2142 blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL); 2144 blkdev_put(p->bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
2143 } 2145 }
2144 destroy_swap_extents(p); 2146 destroy_swap_extents(p);
2145 swap_cgroup_swapoff(type); 2147 swap_cgroup_swapoff(p->type);
2146bad_swap_2:
2147 spin_lock(&swap_lock); 2148 spin_lock(&swap_lock);
2148 p->swap_file = NULL; 2149 p->swap_file = NULL;
2149 p->flags = 0; 2150 p->flags = 0;
2150 spin_unlock(&swap_lock); 2151 spin_unlock(&swap_lock);
2151 vfree(swap_map); 2152 vfree(swap_map);
2152 if (swap_file) 2153 if (swap_file) {
2154 if (inode && S_ISREG(inode->i_mode)) {
2155 mutex_unlock(&inode->i_mutex);
2156 inode = NULL;
2157 }
2153 filp_close(swap_file, NULL); 2158 filp_close(swap_file, NULL);
2159 }
2154out: 2160out:
2155 if (page && !IS_ERR(page)) { 2161 if (page && !IS_ERR(page)) {
2156 kunmap(page); 2162 kunmap(page);
@@ -2158,11 +2164,8 @@ out:
2158 } 2164 }
2159 if (name) 2165 if (name)
2160 putname(name); 2166 putname(name);
2161 if (did_down) { 2167 if (inode && S_ISREG(inode->i_mode))
2162 if (!error)
2163 inode->i_flags |= S_SWAPFILE;
2164 mutex_unlock(&inode->i_mutex); 2168 mutex_unlock(&inode->i_mutex);
2165 }
2166 return error; 2169 return error;
2167} 2170}
2168 2171
diff --git a/mm/truncate.c b/mm/truncate.c
index d64296be00d3..a95667529135 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -106,9 +106,8 @@ truncate_complete_page(struct address_space *mapping, struct page *page)
106 cancel_dirty_page(page, PAGE_CACHE_SIZE); 106 cancel_dirty_page(page, PAGE_CACHE_SIZE);
107 107
108 clear_page_mlock(page); 108 clear_page_mlock(page);
109 remove_from_page_cache(page);
110 ClearPageMappedToDisk(page); 109 ClearPageMappedToDisk(page);
111 page_cache_release(page); /* pagecache ref */ 110 delete_from_page_cache(page);
112 return 0; 111 return 0;
113} 112}
114 113
@@ -322,11 +321,12 @@ EXPORT_SYMBOL(truncate_inode_pages);
322 * pagetables. 321 * pagetables.
323 */ 322 */
324unsigned long invalidate_mapping_pages(struct address_space *mapping, 323unsigned long invalidate_mapping_pages(struct address_space *mapping,
325 pgoff_t start, pgoff_t end) 324 pgoff_t start, pgoff_t end)
326{ 325{
327 struct pagevec pvec; 326 struct pagevec pvec;
328 pgoff_t next = start; 327 pgoff_t next = start;
329 unsigned long ret = 0; 328 unsigned long ret;
329 unsigned long count = 0;
330 int i; 330 int i;
331 331
332 pagevec_init(&pvec, 0); 332 pagevec_init(&pvec, 0);
@@ -353,9 +353,15 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping,
353 if (lock_failed) 353 if (lock_failed)
354 continue; 354 continue;
355 355
356 ret += invalidate_inode_page(page); 356 ret = invalidate_inode_page(page);
357
358 unlock_page(page); 357 unlock_page(page);
358 /*
359 * Invalidation is a hint that the page is no longer
360 * of interest and try to speed up its reclaim.
361 */
362 if (!ret)
363 deactivate_page(page);
364 count += ret;
359 if (next > end) 365 if (next > end)
360 break; 366 break;
361 } 367 }
@@ -363,7 +369,7 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping,
363 mem_cgroup_uncharge_end(); 369 mem_cgroup_uncharge_end();
364 cond_resched(); 370 cond_resched();
365 } 371 }
366 return ret; 372 return count;
367} 373}
368EXPORT_SYMBOL(invalidate_mapping_pages); 374EXPORT_SYMBOL(invalidate_mapping_pages);
369 375
@@ -389,7 +395,7 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page)
389 395
390 clear_page_mlock(page); 396 clear_page_mlock(page);
391 BUG_ON(page_has_private(page)); 397 BUG_ON(page_has_private(page));
392 __remove_from_page_cache(page); 398 __delete_from_page_cache(page);
393 spin_unlock_irq(&mapping->tree_lock); 399 spin_unlock_irq(&mapping->tree_lock);
394 mem_cgroup_uncharge_cache_page(page); 400 mem_cgroup_uncharge_cache_page(page);
395 401
diff --git a/mm/util.c b/mm/util.c
index f126975ef23e..e7b103a6fd21 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -227,7 +227,7 @@ void arch_pick_mmap_layout(struct mm_struct *mm)
227/* 227/*
228 * Like get_user_pages_fast() except its IRQ-safe in that it won't fall 228 * Like get_user_pages_fast() except its IRQ-safe in that it won't fall
229 * back to the regular GUP. 229 * back to the regular GUP.
230 * If the architecture not support this fucntion, simply return with no 230 * If the architecture not support this function, simply return with no
231 * page pinned 231 * page pinned
232 */ 232 */
233int __attribute__((weak)) __get_user_pages_fast(unsigned long start, 233int __attribute__((weak)) __get_user_pages_fast(unsigned long start,
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index f9b166732e70..5d6030235d7a 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -261,8 +261,15 @@ struct vmap_area {
261}; 261};
262 262
263static DEFINE_SPINLOCK(vmap_area_lock); 263static DEFINE_SPINLOCK(vmap_area_lock);
264static struct rb_root vmap_area_root = RB_ROOT;
265static LIST_HEAD(vmap_area_list); 264static LIST_HEAD(vmap_area_list);
265static struct rb_root vmap_area_root = RB_ROOT;
266
267/* The vmap cache globals are protected by vmap_area_lock */
268static struct rb_node *free_vmap_cache;
269static unsigned long cached_hole_size;
270static unsigned long cached_vstart;
271static unsigned long cached_align;
272
266static unsigned long vmap_area_pcpu_hole; 273static unsigned long vmap_area_pcpu_hole;
267 274
268static struct vmap_area *__find_vmap_area(unsigned long addr) 275static struct vmap_area *__find_vmap_area(unsigned long addr)
@@ -331,9 +338,11 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
331 struct rb_node *n; 338 struct rb_node *n;
332 unsigned long addr; 339 unsigned long addr;
333 int purged = 0; 340 int purged = 0;
341 struct vmap_area *first;
334 342
335 BUG_ON(!size); 343 BUG_ON(!size);
336 BUG_ON(size & ~PAGE_MASK); 344 BUG_ON(size & ~PAGE_MASK);
345 BUG_ON(!is_power_of_2(align));
337 346
338 va = kmalloc_node(sizeof(struct vmap_area), 347 va = kmalloc_node(sizeof(struct vmap_area),
339 gfp_mask & GFP_RECLAIM_MASK, node); 348 gfp_mask & GFP_RECLAIM_MASK, node);
@@ -341,79 +350,106 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
341 return ERR_PTR(-ENOMEM); 350 return ERR_PTR(-ENOMEM);
342 351
343retry: 352retry:
344 addr = ALIGN(vstart, align);
345
346 spin_lock(&vmap_area_lock); 353 spin_lock(&vmap_area_lock);
347 if (addr + size - 1 < addr) 354 /*
348 goto overflow; 355 * Invalidate cache if we have more permissive parameters.
356 * cached_hole_size notes the largest hole noticed _below_
357 * the vmap_area cached in free_vmap_cache: if size fits
358 * into that hole, we want to scan from vstart to reuse
359 * the hole instead of allocating above free_vmap_cache.
360 * Note that __free_vmap_area may update free_vmap_cache
361 * without updating cached_hole_size or cached_align.
362 */
363 if (!free_vmap_cache ||
364 size < cached_hole_size ||
365 vstart < cached_vstart ||
366 align < cached_align) {
367nocache:
368 cached_hole_size = 0;
369 free_vmap_cache = NULL;
370 }
371 /* record if we encounter less permissive parameters */
372 cached_vstart = vstart;
373 cached_align = align;
374
375 /* find starting point for our search */
376 if (free_vmap_cache) {
377 first = rb_entry(free_vmap_cache, struct vmap_area, rb_node);
378 addr = ALIGN(first->va_end + PAGE_SIZE, align);
379 if (addr < vstart)
380 goto nocache;
381 if (addr + size - 1 < addr)
382 goto overflow;
383
384 } else {
385 addr = ALIGN(vstart, align);
386 if (addr + size - 1 < addr)
387 goto overflow;
349 388
350 /* XXX: could have a last_hole cache */ 389 n = vmap_area_root.rb_node;
351 n = vmap_area_root.rb_node; 390 first = NULL;
352 if (n) {
353 struct vmap_area *first = NULL;
354 391
355 do { 392 while (n) {
356 struct vmap_area *tmp; 393 struct vmap_area *tmp;
357 tmp = rb_entry(n, struct vmap_area, rb_node); 394 tmp = rb_entry(n, struct vmap_area, rb_node);
358 if (tmp->va_end >= addr) { 395 if (tmp->va_end >= addr) {
359 if (!first && tmp->va_start < addr + size)
360 first = tmp;
361 n = n->rb_left;
362 } else {
363 first = tmp; 396 first = tmp;
397 if (tmp->va_start <= addr)
398 break;
399 n = n->rb_left;
400 } else
364 n = n->rb_right; 401 n = n->rb_right;
365 } 402 }
366 } while (n);
367 403
368 if (!first) 404 if (!first)
369 goto found; 405 goto found;
370
371 if (first->va_end < addr) {
372 n = rb_next(&first->rb_node);
373 if (n)
374 first = rb_entry(n, struct vmap_area, rb_node);
375 else
376 goto found;
377 }
378
379 while (addr + size > first->va_start && addr + size <= vend) {
380 addr = ALIGN(first->va_end + PAGE_SIZE, align);
381 if (addr + size - 1 < addr)
382 goto overflow;
383
384 n = rb_next(&first->rb_node);
385 if (n)
386 first = rb_entry(n, struct vmap_area, rb_node);
387 else
388 goto found;
389 }
390 } 406 }
391found: 407
392 if (addr + size > vend) { 408 /* from the starting point, walk areas until a suitable hole is found */
393overflow: 409 while (addr + size >= first->va_start && addr + size <= vend) {
394 spin_unlock(&vmap_area_lock); 410 if (addr + cached_hole_size < first->va_start)
395 if (!purged) { 411 cached_hole_size = first->va_start - addr;
396 purge_vmap_area_lazy(); 412 addr = ALIGN(first->va_end + PAGE_SIZE, align);
397 purged = 1; 413 if (addr + size - 1 < addr)
398 goto retry; 414 goto overflow;
399 } 415
400 if (printk_ratelimit()) 416 n = rb_next(&first->rb_node);
401 printk(KERN_WARNING 417 if (n)
402 "vmap allocation for size %lu failed: " 418 first = rb_entry(n, struct vmap_area, rb_node);
403 "use vmalloc=<size> to increase size.\n", size); 419 else
404 kfree(va); 420 goto found;
405 return ERR_PTR(-EBUSY);
406 } 421 }
407 422
408 BUG_ON(addr & (align-1)); 423found:
424 if (addr + size > vend)
425 goto overflow;
409 426
410 va->va_start = addr; 427 va->va_start = addr;
411 va->va_end = addr + size; 428 va->va_end = addr + size;
412 va->flags = 0; 429 va->flags = 0;
413 __insert_vmap_area(va); 430 __insert_vmap_area(va);
431 free_vmap_cache = &va->rb_node;
414 spin_unlock(&vmap_area_lock); 432 spin_unlock(&vmap_area_lock);
415 433
434 BUG_ON(va->va_start & (align-1));
435 BUG_ON(va->va_start < vstart);
436 BUG_ON(va->va_end > vend);
437
416 return va; 438 return va;
439
440overflow:
441 spin_unlock(&vmap_area_lock);
442 if (!purged) {
443 purge_vmap_area_lazy();
444 purged = 1;
445 goto retry;
446 }
447 if (printk_ratelimit())
448 printk(KERN_WARNING
449 "vmap allocation for size %lu failed: "
450 "use vmalloc=<size> to increase size.\n", size);
451 kfree(va);
452 return ERR_PTR(-EBUSY);
417} 453}
418 454
419static void rcu_free_va(struct rcu_head *head) 455static void rcu_free_va(struct rcu_head *head)
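
The free_vmap_cache additions above remember where the previous search ended and how large the biggest hole below that point was, so that repeated allocations with compatible vstart, align and size do not rescan from the bottom of the address range. Below is a toy first-fit search over a sorted array of busy ranges with an analogous cache; the names are hypothetical, alignment and the rbtree are ignored, and the chosen range is not recorded back into the array:

    #include <stdio.h>

    struct range { unsigned long start, end; };   /* busy [start, end) */

    static struct range busy[] = {
        { 0x1000, 0x3000 }, { 0x5000, 0x6000 }, { 0x9000, 0xa000 },
    };
    static const int nbusy = 3;

    /* Cache of the last search position, in the spirit of free_vmap_cache
     * and cached_hole_size. */
    static int cache_idx = -1;
    static unsigned long cached_hole;

    static unsigned long alloc_range(unsigned long vstart, unsigned long size)
    {
        unsigned long addr = vstart;
        int i = 0;

        if (cache_idx >= 0 && size >= cached_hole) {
            /* resume just after the cached busy range */
            addr = busy[cache_idx].end;
            i = cache_idx + 1;
        } else {
            cached_hole = 0;    /* a lower hole might fit: rescan from vstart */
        }

        for (; i < nbusy; i++) {
            if (addr + size <= busy[i].start)
                break;          /* the hole before busy[i] fits */
            if (busy[i].start > addr && busy[i].start - addr > cached_hole)
                cached_hole = busy[i].start - addr;
            if (busy[i].end > addr)
                addr = busy[i].end;
        }
        cache_idx = i - 1;
        return addr;            /* search only: nothing is inserted here */
    }

    int main(void)
    {
        printf("0x%lx\n", alloc_range(0x0, 0x1000));  /* 0x0: first hole fits */
        printf("0x%lx\n", alloc_range(0x0, 0x3000));  /* 0x6000: needs 3 pages */
        return 0;
    }

As in the kernel version, the cache is only a hint: it is dropped whenever the caller's parameters become more permissive, and a stale hint merely costs a longer walk rather than a wrong result.
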
@@ -426,6 +462,22 @@ static void rcu_free_va(struct rcu_head *head)
426static void __free_vmap_area(struct vmap_area *va) 462static void __free_vmap_area(struct vmap_area *va)
427{ 463{
428 BUG_ON(RB_EMPTY_NODE(&va->rb_node)); 464 BUG_ON(RB_EMPTY_NODE(&va->rb_node));
465
466 if (free_vmap_cache) {
467 if (va->va_end < cached_vstart) {
468 free_vmap_cache = NULL;
469 } else {
470 struct vmap_area *cache;
471 cache = rb_entry(free_vmap_cache, struct vmap_area, rb_node);
472 if (va->va_start <= cache->va_start) {
473 free_vmap_cache = rb_prev(&va->rb_node);
474 /*
475 * We don't try to update cached_hole_size or
476 * cached_align, but it won't go very wrong.
477 */
478 }
479 }
480 }
429 rb_erase(&va->rb_node, &vmap_area_root); 481 rb_erase(&va->rb_node, &vmap_area_root);
430 RB_CLEAR_NODE(&va->rb_node); 482 RB_CLEAR_NODE(&va->rb_node);
431 list_del_rcu(&va->list); 483 list_del_rcu(&va->list);
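
The invalidation rule that __free_vmap_area() applies above can be read as a three-way decision; the helper below is only an illustration of that rule under hypothetical names, not kernel code.

/*
 * Illustration of the search-cache invalidation on free -- not kernel code.
 */
enum cache_action { CACHE_KEEP, CACHE_DROP, CACHE_STEP_BACK };

static enum cache_action cache_action_on_free(unsigned long freed_start,
					      unsigned long freed_end,
					      unsigned long cached_vstart,
					      unsigned long cached_node_start)
{
	if (freed_end < cached_vstart)
		return CACHE_DROP;	/* freed area lies below the cached window */
	if (freed_start <= cached_node_start)
		return CACHE_STEP_BACK;	/* a hole opened at or before the cached
					   node: rewind to the freed node's predecessor */
	return CACHE_KEEP;
}

Stepping the cache back rather than recomputing cached_hole_size or cached_align is the cheap option the comment alludes to: the next search may walk a little further, but it starts before the newly opened hole and so can still find it.
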
@@ -1951,8 +2003,6 @@ finished:
1951 * should know vmalloc() area is valid and can use memcpy(). 2003 * should know vmalloc() area is valid and can use memcpy().
1952 * This is for routines which have to access vmalloc area without 2004 * This is for routines which have to access vmalloc area without
1953 * any information, such as /dev/kmem. 2005 * any information, such as /dev/kmem.
1954 *
1955 * The caller should guarantee KM_USER1 is not used.
1956 */ 2006 */
1957 2007
1958long vwrite(char *buf, char *addr, unsigned long count) 2008long vwrite(char *buf, char *addr, unsigned long count)
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 6771ea70bfe7..f6b435c80079 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -41,6 +41,7 @@
41#include <linux/memcontrol.h> 41#include <linux/memcontrol.h>
42#include <linux/delayacct.h> 42#include <linux/delayacct.h>
43#include <linux/sysctl.h> 43#include <linux/sysctl.h>
44#include <linux/oom.h>
44 45
45#include <asm/tlbflush.h> 46#include <asm/tlbflush.h>
46#include <asm/div64.h> 47#include <asm/div64.h>
@@ -358,7 +359,7 @@ static int may_write_to_queue(struct backing_dev_info *bdi,
358static void handle_write_error(struct address_space *mapping, 359static void handle_write_error(struct address_space *mapping,
359 struct page *page, int error) 360 struct page *page, int error)
360{ 361{
361 lock_page_nosync(page); 362 lock_page(page);
362 if (page_mapping(page) == mapping) 363 if (page_mapping(page) == mapping)
363 mapping_set_error(mapping, error); 364 mapping_set_error(mapping, error);
364 unlock_page(page); 365 unlock_page(page);
@@ -514,7 +515,7 @@ static int __remove_mapping(struct address_space *mapping, struct page *page)
514 515
515 freepage = mapping->a_ops->freepage; 516 freepage = mapping->a_ops->freepage;
516 517
517 __remove_from_page_cache(page); 518 __delete_from_page_cache(page);
518 spin_unlock_irq(&mapping->tree_lock); 519 spin_unlock_irq(&mapping->tree_lock);
519 mem_cgroup_uncharge_cache_page(page); 520 mem_cgroup_uncharge_cache_page(page);
520 521
@@ -1065,7 +1066,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
1065 * surrounding the tag page. Only take those pages of 1066 * surrounding the tag page. Only take those pages of
1066 * the same active state as that tag page. We may safely 1067 * the same active state as that tag page. We may safely
1067 * round the target page pfn down to the requested order 1068 * round the target page pfn down to the requested order
1068 * as the mem_map is guarenteed valid out to MAX_ORDER, 1069 * as the mem_map is guaranteed valid out to MAX_ORDER,
1069 * where that page is in a different zone we will detect 1070 * where that page is in a different zone we will detect
1070 * it from its zone id and abort this block scan. 1071 * it from its zone id and abort this block scan.
1071 */ 1072 */
@@ -1988,17 +1989,12 @@ static bool zone_reclaimable(struct zone *zone)
1988 return zone->pages_scanned < zone_reclaimable_pages(zone) * 6; 1989 return zone->pages_scanned < zone_reclaimable_pages(zone) * 6;
1989} 1990}
1990 1991
1991/* 1992/* All zones in zonelist are unreclaimable? */
1992 * As hibernation is going on, kswapd is freezed so that it can't mark
1993 * the zone into all_unreclaimable. It can't handle OOM during hibernation.
1994 * So let's check zone's unreclaimable in direct reclaim as well as kswapd.
1995 */
1996static bool all_unreclaimable(struct zonelist *zonelist, 1993static bool all_unreclaimable(struct zonelist *zonelist,
1997 struct scan_control *sc) 1994 struct scan_control *sc)
1998{ 1995{
1999 struct zoneref *z; 1996 struct zoneref *z;
2000 struct zone *zone; 1997 struct zone *zone;
2001 bool all_unreclaimable = true;
2002 1998
2003 for_each_zone_zonelist_nodemask(zone, z, zonelist, 1999 for_each_zone_zonelist_nodemask(zone, z, zonelist,
2004 gfp_zone(sc->gfp_mask), sc->nodemask) { 2000 gfp_zone(sc->gfp_mask), sc->nodemask) {
@@ -2006,13 +2002,11 @@ static bool all_unreclaimable(struct zonelist *zonelist,
2006 continue; 2002 continue;
2007 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) 2003 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
2008 continue; 2004 continue;
2009 if (zone_reclaimable(zone)) { 2005 if (!zone->all_unreclaimable)
2010 all_unreclaimable = false; 2006 return false;
2011 break;
2012 }
2013 } 2007 }
2014 2008
2015 return all_unreclaimable; 2009 return true;
2016} 2010}
2017 2011
2018/* 2012/*
@@ -2108,6 +2102,14 @@ out:
2108 if (sc->nr_reclaimed) 2102 if (sc->nr_reclaimed)
2109 return sc->nr_reclaimed; 2103 return sc->nr_reclaimed;
2110 2104
2105 /*
2106 * As hibernation is going on, kswapd is frozen so that it can't mark
2107 * the zone all_unreclaimable. Thus we bypass the all_unreclaimable
2108 * check.
2109 */
2110 if (oom_killer_disabled)
2111 return 0;
2112
2111 /* top priority shrink_zones still had more to do? don't OOM, then */ 2113 /* top priority shrink_zones still had more to do? don't OOM, then */
2112 if (scanning_global_lru(sc) && !all_unreclaimable(zonelist, sc)) 2114 if (scanning_global_lru(sc) && !all_unreclaimable(zonelist, sc))
2113 return 1; 2115 return 1;
@@ -2224,7 +2226,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
2224 * o a 16M DMA zone that is balanced will not balance a zone on any 2226 * o a 16M DMA zone that is balanced will not balance a zone on any
2225 * reasonable sized machine 2227 * reasonable sized machine
2226 * o On all other machines, the top zone must be at least a reasonable 2228 * o On all other machines, the top zone must be at least a reasonable
2227 * precentage of the middle zones. For example, on 32-bit x86, highmem 2229 * percentage of the middle zones. For example, on 32-bit x86, highmem
2228 * would need to be at least 256M for it to balance a whole node. 2230 * would need to be at least 256M for it to balance a whole node.
2229 * Similarly, on x86-64 the Normal zone would need to be at least 1G 2231 * Similarly, on x86-64 the Normal zone would need to be at least 1G
2230 * to balance a node on its own. These seemed like reasonable ratios. 2232 * to balance a node on its own. These seemed like reasonable ratios.
@@ -2397,9 +2399,9 @@ loop_again:
2397 * cause too much scanning of the lower zones. 2399 * cause too much scanning of the lower zones.
2398 */ 2400 */
2399 for (i = 0; i <= end_zone; i++) { 2401 for (i = 0; i <= end_zone; i++) {
2400 int compaction;
2401 struct zone *zone = pgdat->node_zones + i; 2402 struct zone *zone = pgdat->node_zones + i;
2402 int nr_slab; 2403 int nr_slab;
2404 unsigned long balance_gap;
2403 2405
2404 if (!populated_zone(zone)) 2406 if (!populated_zone(zone))
2405 continue; 2407 continue;
@@ -2416,11 +2418,20 @@ loop_again:
2416 mem_cgroup_soft_limit_reclaim(zone, order, sc.gfp_mask); 2418 mem_cgroup_soft_limit_reclaim(zone, order, sc.gfp_mask);
2417 2419
2418 /* 2420 /*
2419 * We put equal pressure on every zone, unless one 2421 * We put equal pressure on every zone, unless
2420 * zone has way too many pages free already. 2422 * one zone has way too many pages free
2423 * already. The "too many pages" is defined
2424 * as the high wmark plus a "gap" where the
2425 * gap is either the low watermark or 1%
2426 * of the zone, whichever is smaller.
2421 */ 2427 */
2428 balance_gap = min(low_wmark_pages(zone),
2429 (zone->present_pages +
2430 KSWAPD_ZONE_BALANCE_GAP_RATIO-1) /
2431 KSWAPD_ZONE_BALANCE_GAP_RATIO);
2422 if (!zone_watermark_ok_safe(zone, order, 2432 if (!zone_watermark_ok_safe(zone, order,
2423 8*high_wmark_pages(zone), end_zone, 0)) 2433 high_wmark_pages(zone) + balance_gap,
2434 end_zone, 0))
2424 shrink_zone(priority, zone, &sc); 2435 shrink_zone(priority, zone, &sc);
2425 reclaim_state->reclaimed_slab = 0; 2436 reclaim_state->reclaimed_slab = 0;
2426 nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL, 2437 nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL,
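
To make the new reclaim target concrete, the sketch below works the balance_gap arithmetic through with made-up numbers; the ratio of 100 stands in for KSWAPD_ZONE_BALANCE_GAP_RATIO, matching the "1% of the zone" in the comment, and every other value is hypothetical.

/* Worked example of the balance_gap computation -- illustration only. */
#include <stdio.h>

#define RATIO		100UL	/* stands in for KSWAPD_ZONE_BALANCE_GAP_RATIO */
#define MIN(a, b)	((a) < (b) ? (a) : (b))

int main(void)
{
	unsigned long present_pages = 262144;	/* ~1 GiB worth of 4 KiB pages */
	unsigned long low_wmark     = 1541;	/* hypothetical low watermark */
	unsigned long high_wmark    = 1849;	/* hypothetical high watermark */

	/* one percent of the zone, rounded up, capped at the low watermark */
	unsigned long one_percent = (present_pages + RATIO - 1) / RATIO;	/* 2622 */
	unsigned long balance_gap = MIN(low_wmark, one_percent);		/* 1541 */

	printf("kswapd keeps reclaiming until free pages exceed %lu (%lu + %lu)\n",
	       high_wmark + balance_gap, high_wmark, balance_gap);
	return 0;
}

With these hypothetical numbers the old check against 8*high_wmark_pages(zone) would have demanded roughly 14,800 free pages before leaving the zone alone, while the new target is about 3,400, so kswapd no longer over-reclaims small or already comfortable zones.
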
@@ -2428,24 +2439,9 @@ loop_again:
2428 sc.nr_reclaimed += reclaim_state->reclaimed_slab; 2439 sc.nr_reclaimed += reclaim_state->reclaimed_slab;
2429 total_scanned += sc.nr_scanned; 2440 total_scanned += sc.nr_scanned;
2430 2441
2431 compaction = 0;
2432 if (order &&
2433 zone_watermark_ok(zone, 0,
2434 high_wmark_pages(zone),
2435 end_zone, 0) &&
2436 !zone_watermark_ok(zone, order,
2437 high_wmark_pages(zone),
2438 end_zone, 0)) {
2439 compact_zone_order(zone,
2440 order,
2441 sc.gfp_mask, false,
2442 COMPACT_MODE_KSWAPD);
2443 compaction = 1;
2444 }
2445
2446 if (zone->all_unreclaimable) 2442 if (zone->all_unreclaimable)
2447 continue; 2443 continue;
2448 if (!compaction && nr_slab == 0 && 2444 if (nr_slab == 0 &&
2449 !zone_reclaimable(zone)) 2445 !zone_reclaimable(zone))
2450 zone->all_unreclaimable = 1; 2446 zone->all_unreclaimable = 1;
2451 /* 2447 /*
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 0c3b5048773e..897ea9e88238 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -321,9 +321,12 @@ static inline void mod_state(struct zone *zone,
321 /* 321 /*
322 * The fetching of the stat_threshold is racy. We may apply 322 * The fetching of the stat_threshold is racy. We may apply
323 * a counter threshold to the wrong cpu if we get 323 * a counter threshold to the wrong cpu if we get
324 * rescheduled while executing here. However, the following 324 * rescheduled while executing here. However, the next
325 * will apply the threshold again and therefore bring the 325 * counter update will apply the threshold again and
326 * counter under the threshold. 326 * therefore bring the counter under the threshold again.
327 *
328 * Most of the time the thresholds are the same anyway
329 * for all cpus in a zone.
327 */ 330 */
328 t = this_cpu_read(pcp->stat_threshold); 331 t = this_cpu_read(pcp->stat_threshold);
329 332
@@ -500,8 +503,12 @@ void refresh_cpu_vm_stats(int cpu)
500 * z = the zone from which the allocation occurred. 503 * z = the zone from which the allocation occurred.
501 * 504 *
502 * Must be called with interrupts disabled. 505 * Must be called with interrupts disabled.
506 *
507 * When __GFP_OTHER_NODE is set, assume the node of the preferred
508 * zone is the local node. This is useful for daemons that allocate
509 * memory on behalf of other processes.
503 */ 510 */
504void zone_statistics(struct zone *preferred_zone, struct zone *z) 511void zone_statistics(struct zone *preferred_zone, struct zone *z, gfp_t flags)
505{ 512{
506 if (z->zone_pgdat == preferred_zone->zone_pgdat) { 513 if (z->zone_pgdat == preferred_zone->zone_pgdat) {
507 __inc_zone_state(z, NUMA_HIT); 514 __inc_zone_state(z, NUMA_HIT);
@@ -509,7 +516,8 @@ void zone_statistics(struct zone *preferred_zone, struct zone *z)
509 __inc_zone_state(z, NUMA_MISS); 516 __inc_zone_state(z, NUMA_MISS);
510 __inc_zone_state(preferred_zone, NUMA_FOREIGN); 517 __inc_zone_state(preferred_zone, NUMA_FOREIGN);
511 } 518 }
512 if (z->node == numa_node_id()) 519 if (z->node == ((flags & __GFP_OTHER_NODE) ?
520 preferred_zone->node : numa_node_id()))
513 __inc_zone_state(z, NUMA_LOCAL); 521 __inc_zone_state(z, NUMA_LOCAL);
514 else 522 else
515 __inc_zone_state(z, NUMA_OTHER); 523 __inc_zone_state(z, NUMA_OTHER);
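
The effect of the new gfp_t argument on the NUMA counters comes down to a single decision: which node counts as "home" for the allocation. The helper below is an illustration under hypothetical names, not the kernel function.

/*
 * Illustration of the NUMA_LOCAL/NUMA_OTHER decision -- not kernel code.
 */
#include <stdbool.h>

struct model_zone { int node; };

static bool counts_as_numa_local(const struct model_zone *preferred_zone,
				 const struct model_zone *z,
				 bool gfp_other_node, int running_node)
{
	/*
	 * With __GFP_OTHER_NODE, "local" means local to the node the memory
	 * is being allocated for (the preferred zone's node), not to the
	 * node the calling daemon happens to be running on.
	 */
	int home = gfp_other_node ? preferred_zone->node : running_node;

	return z->node == home;
}

So a daemon running on node 0 that allocates on behalf of a task on node 1, and gets a page from node 1, is now counted under NUMA_LOCAL instead of NUMA_OTHER, which keeps the statistics meaningful for daemons that allocate on another process's behalf.
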
@@ -940,7 +948,16 @@ static const char * const vmstat_text[] = {
940 "unevictable_pgs_cleared", 948 "unevictable_pgs_cleared",
941 "unevictable_pgs_stranded", 949 "unevictable_pgs_stranded",
942 "unevictable_pgs_mlockfreed", 950 "unevictable_pgs_mlockfreed",
951
952#ifdef CONFIG_TRANSPARENT_HUGEPAGE
953 "thp_fault_alloc",
954 "thp_fault_fallback",
955 "thp_collapse_alloc",
956 "thp_collapse_alloc_failed",
957 "thp_split",
943#endif 958#endif
959
960#endif /* CONFIG_VM_EVENTS_COUNTERS */
944}; 961};
945 962
946static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, 963static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,