Diffstat (limited to 'mm')
-rw-r--r--  mm/backing-dev.c  |  23
-rw-r--r--  mm/bootmem.c  |  5
-rw-r--r--  mm/bounce.c  |  4
-rw-r--r--  mm/cleancache.c  |  98
-rw-r--r--  mm/compaction.c  |  101
-rw-r--r--  mm/filemap.c  |  38
-rw-r--r--  mm/filemap_xip.c  |  7
-rw-r--r--  mm/huge_memory.c  |  133
-rw-r--r--  mm/hugetlb.c  |  211
-rw-r--r--  mm/hwpoison-inject.c  |  4
-rw-r--r--  mm/kmemleak.c  |  3
-rw-r--r--  mm/ksm.c  |  57
-rw-r--r--  mm/madvise.c  |  10
-rw-r--r--  mm/memblock.c  |  6
-rw-r--r--  mm/memcontrol.c  |  680
-rw-r--r--  mm/memory-failure.c  |  98
-rw-r--r--  mm/memory.c  |  198
-rw-r--r--  mm/mempolicy.c  |  65
-rw-r--r--  mm/migrate.c  |  40
-rw-r--r--  mm/mincore.c  |  2
-rw-r--r--  mm/mlock.c  |  3
-rw-r--r--  mm/mmap.c  |  90
-rw-r--r--  mm/mmu_context.c  |  2
-rw-r--r--  mm/mprotect.c  |  7
-rw-r--r--  mm/mremap.c  |  2
-rw-r--r--  mm/nommu.c  |  9
-rw-r--r--  mm/oom_kill.c  |  166
-rw-r--r--  mm/page-writeback.c  |  1
-rw-r--r--  mm/page_alloc.c  |  59
-rw-r--r--  mm/page_cgroup.c  |  4
-rw-r--r--  mm/pagewalk.c  |  2
-rw-r--r--  mm/percpu-vm.c  |  3
-rw-r--r--  mm/pgtable-generic.c  |  5
-rw-r--r--  mm/process_vm_access.c  |  23
-rw-r--r--  mm/rmap.c  |  70
-rw-r--r--  mm/shmem.c  |  106
-rw-r--r--  mm/slab.c  |  13
-rw-r--r--  mm/slub.c  |  40
-rw-r--r--  mm/sparse.c  |  30
-rw-r--r--  mm/swap.c  |  14
-rw-r--r--  mm/swap_state.c  |  34
-rw-r--r--  mm/swapfile.c  |  92
-rw-r--r--  mm/truncate.c  |  12
-rw-r--r--  mm/util.c  |  41
-rw-r--r--  mm/vmalloc.c  |  8
-rw-r--r--  mm/vmscan.c  |  152
46 files changed, 1598 insertions, 1173 deletions
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 7ba8feae11b..dd8e2aafb07 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -318,7 +318,7 @@ static void wakeup_timer_fn(unsigned long data)
318 if (bdi->wb.task) { 318 if (bdi->wb.task) {
319 trace_writeback_wake_thread(bdi); 319 trace_writeback_wake_thread(bdi);
320 wake_up_process(bdi->wb.task); 320 wake_up_process(bdi->wb.task);
321 } else { 321 } else if (bdi->dev) {
322 /* 322 /*
323 * When bdi tasks are inactive for long time, they are killed. 323 * When bdi tasks are inactive for long time, they are killed.
324 * In this case we have to wake-up the forker thread which 324 * In this case we have to wake-up the forker thread which
@@ -584,6 +584,8 @@ EXPORT_SYMBOL(bdi_register_dev);
584 */ 584 */
585static void bdi_wb_shutdown(struct backing_dev_info *bdi) 585static void bdi_wb_shutdown(struct backing_dev_info *bdi)
586{ 586{
587 struct task_struct *task;
588
587 if (!bdi_cap_writeback_dirty(bdi)) 589 if (!bdi_cap_writeback_dirty(bdi))
588 return; 590 return;
589 591
@@ -602,8 +604,13 @@ static void bdi_wb_shutdown(struct backing_dev_info *bdi)
602 * Finally, kill the kernel thread. We don't need to be RCU 604 * Finally, kill the kernel thread. We don't need to be RCU
603 * safe anymore, since the bdi is gone from visibility. 605 * safe anymore, since the bdi is gone from visibility.
604 */ 606 */
605 if (bdi->wb.task) 607 spin_lock_bh(&bdi->wb_lock);
606 kthread_stop(bdi->wb.task); 608 task = bdi->wb.task;
609 bdi->wb.task = NULL;
610 spin_unlock_bh(&bdi->wb_lock);
611
612 if (task)
613 kthread_stop(task);
607} 614}
608 615
609/* 616/*
@@ -623,7 +630,9 @@ static void bdi_prune_sb(struct backing_dev_info *bdi)
623 630
624void bdi_unregister(struct backing_dev_info *bdi) 631void bdi_unregister(struct backing_dev_info *bdi)
625{ 632{
626 if (bdi->dev) { 633 struct device *dev = bdi->dev;
634
635 if (dev) {
627 bdi_set_min_ratio(bdi, 0); 636 bdi_set_min_ratio(bdi, 0);
628 trace_writeback_bdi_unregister(bdi); 637 trace_writeback_bdi_unregister(bdi);
629 bdi_prune_sb(bdi); 638 bdi_prune_sb(bdi);
@@ -632,8 +641,12 @@ void bdi_unregister(struct backing_dev_info *bdi)
632 if (!bdi_cap_flush_forker(bdi)) 641 if (!bdi_cap_flush_forker(bdi))
633 bdi_wb_shutdown(bdi); 642 bdi_wb_shutdown(bdi);
634 bdi_debug_unregister(bdi); 643 bdi_debug_unregister(bdi);
635 device_unregister(bdi->dev); 644
645 spin_lock_bh(&bdi->wb_lock);
636 bdi->dev = NULL; 646 bdi->dev = NULL;
647 spin_unlock_bh(&bdi->wb_lock);
648
649 device_unregister(dev);
637 } 650 }
638} 651}
639EXPORT_SYMBOL(bdi_unregister); 652EXPORT_SYMBOL(bdi_unregister);
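
A minimal sketch of the shutdown pattern the bdi_wb_shutdown()/bdi_unregister() hunks above adopt: detach the shared pointer under wb_lock so concurrent wakers back off, then do the blocking kthread_stop() outside the lock. The structure and field names below are hypothetical stand-ins, not the real bdi types:

#include <linux/kthread.h>
#include <linux/spinlock.h>

struct flusher {
	spinlock_t lock;		/* plays the role of bdi->wb_lock */
	struct task_struct *task;	/* plays the role of bdi->wb.task */
};

static void flusher_shutdown(struct flusher *f)
{
	struct task_struct *task;

	/* Publish the teardown first: concurrent wakers (cf. the
	 * wakeup_timer_fn() hunk) now see task == NULL and back off. */
	spin_lock_bh(&f->lock);
	task = f->task;
	f->task = NULL;
	spin_unlock_bh(&f->lock);

	/* The blocking call happens outside the spinlock. */
	if (task)
		kthread_stop(task);
}
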
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 668e94df8cf..0131170c9d5 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -766,14 +766,13 @@ void * __init alloc_bootmem_section(unsigned long size,
766 unsigned long section_nr) 766 unsigned long section_nr)
767{ 767{
768 bootmem_data_t *bdata; 768 bootmem_data_t *bdata;
769 unsigned long pfn, goal, limit; 769 unsigned long pfn, goal;
770 770
771 pfn = section_nr_to_pfn(section_nr); 771 pfn = section_nr_to_pfn(section_nr);
772 goal = pfn << PAGE_SHIFT; 772 goal = pfn << PAGE_SHIFT;
773 limit = section_nr_to_pfn(section_nr + 1) << PAGE_SHIFT;
774 bdata = &bootmem_node_data[early_pfn_to_nid(pfn)]; 773 bdata = &bootmem_node_data[early_pfn_to_nid(pfn)];
775 774
776 return alloc_bootmem_core(bdata, size, SMP_CACHE_BYTES, goal, limit); 775 return alloc_bootmem_core(bdata, size, SMP_CACHE_BYTES, goal, 0);
777} 776}
778#endif 777#endif
779 778
diff --git a/mm/bounce.c b/mm/bounce.c
index 4e9ae722af8..d1be02ca188 100644
--- a/mm/bounce.c
+++ b/mm/bounce.c
@@ -50,9 +50,9 @@ static void bounce_copy_vec(struct bio_vec *to, unsigned char *vfrom)
50 unsigned char *vto; 50 unsigned char *vto;
51 51
52 local_irq_save(flags); 52 local_irq_save(flags);
53 vto = kmap_atomic(to->bv_page, KM_BOUNCE_READ); 53 vto = kmap_atomic(to->bv_page);
54 memcpy(vto + to->bv_offset, vfrom, to->bv_len); 54 memcpy(vto + to->bv_offset, vfrom, to->bv_len);
55 kunmap_atomic(vto, KM_BOUNCE_READ); 55 kunmap_atomic(vto);
56 local_irq_restore(flags); 56 local_irq_restore(flags);
57} 57}
58 58
diff --git a/mm/cleancache.c b/mm/cleancache.c
index bcaae4c2a77..5646c740f61 100644
--- a/mm/cleancache.c
+++ b/mm/cleancache.c
@@ -15,29 +15,34 @@
15#include <linux/fs.h> 15#include <linux/fs.h>
16#include <linux/exportfs.h> 16#include <linux/exportfs.h>
17#include <linux/mm.h> 17#include <linux/mm.h>
18#include <linux/debugfs.h>
18#include <linux/cleancache.h> 19#include <linux/cleancache.h>
19 20
20/* 21/*
21 * This global enablement flag may be read thousands of times per second 22 * This global enablement flag may be read thousands of times per second
22 * by cleancache_get/put/flush even on systems where cleancache_ops 23 * by cleancache_get/put/invalidate even on systems where cleancache_ops
23 * is not claimed (e.g. cleancache is config'ed on but remains 24 * is not claimed (e.g. cleancache is config'ed on but remains
24 * disabled), so is preferred to the slower alternative: a function 25 * disabled), so is preferred to the slower alternative: a function
25 * call that checks a non-global. 26 * call that checks a non-global.
26 */ 27 */
27int cleancache_enabled; 28int cleancache_enabled __read_mostly;
28EXPORT_SYMBOL(cleancache_enabled); 29EXPORT_SYMBOL(cleancache_enabled);
29 30
30/* 31/*
31 * cleancache_ops is set by cleancache_ops_register to contain the pointers 32 * cleancache_ops is set by cleancache_ops_register to contain the pointers
32 * to the cleancache "backend" implementation functions. 33 * to the cleancache "backend" implementation functions.
33 */ 34 */
34static struct cleancache_ops cleancache_ops; 35static struct cleancache_ops cleancache_ops __read_mostly;
35 36
36/* useful stats available in /sys/kernel/mm/cleancache */ 37/*
37static unsigned long cleancache_succ_gets; 38 * Counters available via /sys/kernel/debug/frontswap (if debugfs is
38static unsigned long cleancache_failed_gets; 39 * properly configured. These are for information only so are not protected
39static unsigned long cleancache_puts; 40 * against increment races.
40static unsigned long cleancache_flushes; 41 */
42static u64 cleancache_succ_gets;
43static u64 cleancache_failed_gets;
44static u64 cleancache_puts;
45static u64 cleancache_invalidates;
41 46
42/* 47/*
43 * register operations for cleancache, returning previous thus allowing 48 * register operations for cleancache, returning previous thus allowing
@@ -148,10 +153,11 @@ void __cleancache_put_page(struct page *page)
148EXPORT_SYMBOL(__cleancache_put_page); 153EXPORT_SYMBOL(__cleancache_put_page);
149 154
150/* 155/*
151 * Flush any data from cleancache associated with the poolid and the 156 * Invalidate any data from cleancache associated with the poolid and the
152 * page's inode and page index so that a subsequent "get" will fail. 157 * page's inode and page index so that a subsequent "get" will fail.
153 */ 158 */
154void __cleancache_flush_page(struct address_space *mapping, struct page *page) 159void __cleancache_invalidate_page(struct address_space *mapping,
160 struct page *page)
155{ 161{
156 /* careful... page->mapping is NULL sometimes when this is called */ 162 /* careful... page->mapping is NULL sometimes when this is called */
157 int pool_id = mapping->host->i_sb->cleancache_poolid; 163 int pool_id = mapping->host->i_sb->cleancache_poolid;
@@ -160,85 +166,57 @@ void __cleancache_flush_page(struct address_space *mapping, struct page *page)
160 if (pool_id >= 0) { 166 if (pool_id >= 0) {
161 VM_BUG_ON(!PageLocked(page)); 167 VM_BUG_ON(!PageLocked(page));
162 if (cleancache_get_key(mapping->host, &key) >= 0) { 168 if (cleancache_get_key(mapping->host, &key) >= 0) {
163 (*cleancache_ops.flush_page)(pool_id, key, page->index); 169 (*cleancache_ops.invalidate_page)(pool_id,
164 cleancache_flushes++; 170 key, page->index);
171 cleancache_invalidates++;
165 } 172 }
166 } 173 }
167} 174}
168EXPORT_SYMBOL(__cleancache_flush_page); 175EXPORT_SYMBOL(__cleancache_invalidate_page);
169 176
170/* 177/*
171 * Flush all data from cleancache associated with the poolid and the 178 * Invalidate all data from cleancache associated with the poolid and the
172 * mappings's inode so that all subsequent gets to this poolid/inode 179 * mappings's inode so that all subsequent gets to this poolid/inode
173 * will fail. 180 * will fail.
174 */ 181 */
175void __cleancache_flush_inode(struct address_space *mapping) 182void __cleancache_invalidate_inode(struct address_space *mapping)
176{ 183{
177 int pool_id = mapping->host->i_sb->cleancache_poolid; 184 int pool_id = mapping->host->i_sb->cleancache_poolid;
178 struct cleancache_filekey key = { .u.key = { 0 } }; 185 struct cleancache_filekey key = { .u.key = { 0 } };
179 186
180 if (pool_id >= 0 && cleancache_get_key(mapping->host, &key) >= 0) 187 if (pool_id >= 0 && cleancache_get_key(mapping->host, &key) >= 0)
181 (*cleancache_ops.flush_inode)(pool_id, key); 188 (*cleancache_ops.invalidate_inode)(pool_id, key);
182} 189}
183EXPORT_SYMBOL(__cleancache_flush_inode); 190EXPORT_SYMBOL(__cleancache_invalidate_inode);
184 191
185/* 192/*
186 * Called by any cleancache-enabled filesystem at time of unmount; 193 * Called by any cleancache-enabled filesystem at time of unmount;
187 * note that pool_id is surrendered and may be reutrned by a subsequent 194 * note that pool_id is surrendered and may be reutrned by a subsequent
188 * cleancache_init_fs or cleancache_init_shared_fs 195 * cleancache_init_fs or cleancache_init_shared_fs
189 */ 196 */
190void __cleancache_flush_fs(struct super_block *sb) 197void __cleancache_invalidate_fs(struct super_block *sb)
191{ 198{
192 if (sb->cleancache_poolid >= 0) { 199 if (sb->cleancache_poolid >= 0) {
193 int old_poolid = sb->cleancache_poolid; 200 int old_poolid = sb->cleancache_poolid;
194 sb->cleancache_poolid = -1; 201 sb->cleancache_poolid = -1;
195 (*cleancache_ops.flush_fs)(old_poolid); 202 (*cleancache_ops.invalidate_fs)(old_poolid);
196 } 203 }
197} 204}
198EXPORT_SYMBOL(__cleancache_flush_fs); 205EXPORT_SYMBOL(__cleancache_invalidate_fs);
199
200#ifdef CONFIG_SYSFS
201
202/* see Documentation/ABI/xxx/sysfs-kernel-mm-cleancache */
203
204#define CLEANCACHE_SYSFS_RO(_name) \
205 static ssize_t cleancache_##_name##_show(struct kobject *kobj, \
206 struct kobj_attribute *attr, char *buf) \
207 { \
208 return sprintf(buf, "%lu\n", cleancache_##_name); \
209 } \
210 static struct kobj_attribute cleancache_##_name##_attr = { \
211 .attr = { .name = __stringify(_name), .mode = 0444 }, \
212 .show = cleancache_##_name##_show, \
213 }
214
215CLEANCACHE_SYSFS_RO(succ_gets);
216CLEANCACHE_SYSFS_RO(failed_gets);
217CLEANCACHE_SYSFS_RO(puts);
218CLEANCACHE_SYSFS_RO(flushes);
219
220static struct attribute *cleancache_attrs[] = {
221 &cleancache_succ_gets_attr.attr,
222 &cleancache_failed_gets_attr.attr,
223 &cleancache_puts_attr.attr,
224 &cleancache_flushes_attr.attr,
225 NULL,
226};
227
228static struct attribute_group cleancache_attr_group = {
229 .attrs = cleancache_attrs,
230 .name = "cleancache",
231};
232
233#endif /* CONFIG_SYSFS */
234 206
235static int __init init_cleancache(void) 207static int __init init_cleancache(void)
236{ 208{
237#ifdef CONFIG_SYSFS 209#ifdef CONFIG_DEBUG_FS
238 int err; 210 struct dentry *root = debugfs_create_dir("cleancache", NULL);
239 211 if (root == NULL)
240 err = sysfs_create_group(mm_kobj, &cleancache_attr_group); 212 return -ENXIO;
241#endif /* CONFIG_SYSFS */ 213 debugfs_create_u64("succ_gets", S_IRUGO, root, &cleancache_succ_gets);
214 debugfs_create_u64("failed_gets", S_IRUGO,
215 root, &cleancache_failed_gets);
216 debugfs_create_u64("puts", S_IRUGO, root, &cleancache_puts);
217 debugfs_create_u64("invalidates", S_IRUGO,
218 root, &cleancache_invalidates);
219#endif
242 return 0; 220 return 0;
243} 221}
244module_init(init_cleancache) 222module_init(init_cleancache)
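
The statistics above move from a hand-rolled sysfs attribute group to debugfs_create_u64(), which exports a plain u64 as a read-only file with no show/store boilerplate. A minimal sketch of that pattern, reusing only the calls visible in the hunk (the "example" directory and counter name are made up):

#include <linux/debugfs.h>
#include <linux/module.h>

static u64 example_hits;	/* incremented elsewhere, racily, like the counters above */

static int __init example_stats_init(void)
{
	struct dentry *root = debugfs_create_dir("example", NULL);

	if (root == NULL)
		return -ENXIO;
	/* Reads of this file return the current value of example_hits. */
	debugfs_create_u64("hits", S_IRUGO, root, &example_hits);
	return 0;
}
module_init(example_stats_init);
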
diff --git a/mm/compaction.c b/mm/compaction.c
index 71a58f67f48..74a8c825ff2 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -35,7 +35,7 @@ struct compact_control {
35 unsigned long migrate_pfn; /* isolate_migratepages search base */ 35 unsigned long migrate_pfn; /* isolate_migratepages search base */
36 bool sync; /* Synchronous migration */ 36 bool sync; /* Synchronous migration */
37 37
38 unsigned int order; /* order a direct compactor needs */ 38 int order; /* order a direct compactor needs */
39 int migratetype; /* MOVABLE, RECLAIMABLE etc */ 39 int migratetype; /* MOVABLE, RECLAIMABLE etc */
40 struct zone *zone; 40 struct zone *zone;
41}; 41};
@@ -313,12 +313,34 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
313 } else if (!locked) 313 } else if (!locked)
314 spin_lock_irq(&zone->lru_lock); 314 spin_lock_irq(&zone->lru_lock);
315 315
316 /*
317 * migrate_pfn does not necessarily start aligned to a
318 * pageblock. Ensure that pfn_valid is called when moving
319 * into a new MAX_ORDER_NR_PAGES range in case of large
320 * memory holes within the zone
321 */
322 if ((low_pfn & (MAX_ORDER_NR_PAGES - 1)) == 0) {
323 if (!pfn_valid(low_pfn)) {
324 low_pfn += MAX_ORDER_NR_PAGES - 1;
325 continue;
326 }
327 }
328
316 if (!pfn_valid_within(low_pfn)) 329 if (!pfn_valid_within(low_pfn))
317 continue; 330 continue;
318 nr_scanned++; 331 nr_scanned++;
319 332
320 /* Get the page and skip if free */ 333 /*
334 * Get the page and ensure the page is within the same zone.
335 * See the comment in isolate_freepages about overlapping
336 * nodes. It is deliberate that the new zone lock is not taken
337 * as memory compaction should not move pages between nodes.
338 */
321 page = pfn_to_page(low_pfn); 339 page = pfn_to_page(low_pfn);
340 if (page_zone(page) != zone)
341 continue;
342
343 /* Skip if free */
322 if (PageBuddy(page)) 344 if (PageBuddy(page))
323 continue; 345 continue;
324 346
@@ -653,49 +675,71 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist,
653 675
654 676
655/* Compact all zones within a node */ 677/* Compact all zones within a node */
656static int compact_node(int nid) 678static int __compact_pgdat(pg_data_t *pgdat, struct compact_control *cc)
657{ 679{
658 int zoneid; 680 int zoneid;
659 pg_data_t *pgdat;
660 struct zone *zone; 681 struct zone *zone;
661 682
662 if (nid < 0 || nid >= nr_node_ids || !node_online(nid))
663 return -EINVAL;
664 pgdat = NODE_DATA(nid);
665
666 /* Flush pending updates to the LRU lists */
667 lru_add_drain_all();
668
669 for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) { 683 for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) {
670 struct compact_control cc = {
671 .nr_freepages = 0,
672 .nr_migratepages = 0,
673 .order = -1,
674 .sync = true,
675 };
676 684
677 zone = &pgdat->node_zones[zoneid]; 685 zone = &pgdat->node_zones[zoneid];
678 if (!populated_zone(zone)) 686 if (!populated_zone(zone))
679 continue; 687 continue;
680 688
681 cc.zone = zone; 689 cc->nr_freepages = 0;
682 INIT_LIST_HEAD(&cc.freepages); 690 cc->nr_migratepages = 0;
683 INIT_LIST_HEAD(&cc.migratepages); 691 cc->zone = zone;
684 692 INIT_LIST_HEAD(&cc->freepages);
685 compact_zone(zone, &cc); 693 INIT_LIST_HEAD(&cc->migratepages);
694
695 if (cc->order == -1 || !compaction_deferred(zone, cc->order))
696 compact_zone(zone, cc);
697
698 if (cc->order > 0) {
699 int ok = zone_watermark_ok(zone, cc->order,
700 low_wmark_pages(zone), 0, 0);
701 if (ok && cc->order > zone->compact_order_failed)
702 zone->compact_order_failed = cc->order + 1;
703 /* Currently async compaction is never deferred. */
704 else if (!ok && cc->sync)
705 defer_compaction(zone, cc->order);
706 }
686 707
687 VM_BUG_ON(!list_empty(&cc.freepages)); 708 VM_BUG_ON(!list_empty(&cc->freepages));
688 VM_BUG_ON(!list_empty(&cc.migratepages)); 709 VM_BUG_ON(!list_empty(&cc->migratepages));
689 } 710 }
690 711
691 return 0; 712 return 0;
692} 713}
693 714
715int compact_pgdat(pg_data_t *pgdat, int order)
716{
717 struct compact_control cc = {
718 .order = order,
719 .sync = false,
720 };
721
722 return __compact_pgdat(pgdat, &cc);
723}
724
725static int compact_node(int nid)
726{
727 struct compact_control cc = {
728 .order = -1,
729 .sync = true,
730 };
731
732 return __compact_pgdat(NODE_DATA(nid), &cc);
733}
734
694/* Compact all nodes in the system */ 735/* Compact all nodes in the system */
695static int compact_nodes(void) 736static int compact_nodes(void)
696{ 737{
697 int nid; 738 int nid;
698 739
740 /* Flush pending updates to the LRU lists */
741 lru_add_drain_all();
742
699 for_each_online_node(nid) 743 for_each_online_node(nid)
700 compact_node(nid); 744 compact_node(nid);
701 745
@@ -728,7 +772,14 @@ ssize_t sysfs_compact_node(struct device *dev,
728 struct device_attribute *attr, 772 struct device_attribute *attr,
729 const char *buf, size_t count) 773 const char *buf, size_t count)
730{ 774{
731 compact_node(dev->id); 775 int nid = dev->id;
776
777 if (nid >= 0 && nid < nr_node_ids && node_online(nid)) {
778 /* Flush pending updates to the LRU lists */
779 lru_add_drain_all();
780
781 compact_node(nid);
782 }
732 783
733 return count; 784 return count;
734} 785}
diff --git a/mm/filemap.c b/mm/filemap.c
index 97f49ed35bd..c3811bc6b9e 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -101,9 +101,8 @@
101 * ->inode->i_lock (zap_pte_range->set_page_dirty) 101 * ->inode->i_lock (zap_pte_range->set_page_dirty)
102 * ->private_lock (zap_pte_range->__set_page_dirty_buffers) 102 * ->private_lock (zap_pte_range->__set_page_dirty_buffers)
103 * 103 *
104 * (code doesn't rely on that order, so you could switch it around) 104 * ->i_mmap_mutex
105 * ->tasklist_lock (memory_failure, collect_procs_ao) 105 * ->tasklist_lock (memory_failure, collect_procs_ao)
106 * ->i_mmap_mutex
107 */ 106 */
108 107
109/* 108/*
@@ -123,7 +122,7 @@ void __delete_from_page_cache(struct page *page)
123 if (PageUptodate(page) && PageMappedToDisk(page)) 122 if (PageUptodate(page) && PageMappedToDisk(page))
124 cleancache_put_page(page); 123 cleancache_put_page(page);
125 else 124 else
126 cleancache_flush_page(mapping, page); 125 cleancache_invalidate_page(mapping, page);
127 126
128 radix_tree_delete(&mapping->page_tree, page->index); 127 radix_tree_delete(&mapping->page_tree, page->index);
129 page->mapping = NULL; 128 page->mapping = NULL;
@@ -500,10 +499,13 @@ struct page *__page_cache_alloc(gfp_t gfp)
500 struct page *page; 499 struct page *page;
501 500
502 if (cpuset_do_page_mem_spread()) { 501 if (cpuset_do_page_mem_spread()) {
503 get_mems_allowed(); 502 unsigned int cpuset_mems_cookie;
504 n = cpuset_mem_spread_node(); 503 do {
505 page = alloc_pages_exact_node(n, gfp, 0); 504 cpuset_mems_cookie = get_mems_allowed();
506 put_mems_allowed(); 505 n = cpuset_mem_spread_node();
506 page = alloc_pages_exact_node(n, gfp, 0);
507 } while (!put_mems_allowed(cpuset_mems_cookie) && !page);
508
507 return page; 509 return page;
508 } 510 }
509 return alloc_pages(gfp, 0); 511 return alloc_pages(gfp, 0);
@@ -1318,10 +1320,10 @@ int file_read_actor(read_descriptor_t *desc, struct page *page,
1318 * taking the kmap. 1320 * taking the kmap.
1319 */ 1321 */
1320 if (!fault_in_pages_writeable(desc->arg.buf, size)) { 1322 if (!fault_in_pages_writeable(desc->arg.buf, size)) {
1321 kaddr = kmap_atomic(page, KM_USER0); 1323 kaddr = kmap_atomic(page);
1322 left = __copy_to_user_inatomic(desc->arg.buf, 1324 left = __copy_to_user_inatomic(desc->arg.buf,
1323 kaddr + offset, size); 1325 kaddr + offset, size);
1324 kunmap_atomic(kaddr, KM_USER0); 1326 kunmap_atomic(kaddr);
1325 if (left == 0) 1327 if (left == 0)
1326 goto success; 1328 goto success;
1327 } 1329 }
@@ -1400,15 +1402,12 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1400 unsigned long seg = 0; 1402 unsigned long seg = 0;
1401 size_t count; 1403 size_t count;
1402 loff_t *ppos = &iocb->ki_pos; 1404 loff_t *ppos = &iocb->ki_pos;
1403 struct blk_plug plug;
1404 1405
1405 count = 0; 1406 count = 0;
1406 retval = generic_segment_checks(iov, &nr_segs, &count, VERIFY_WRITE); 1407 retval = generic_segment_checks(iov, &nr_segs, &count, VERIFY_WRITE);
1407 if (retval) 1408 if (retval)
1408 return retval; 1409 return retval;
1409 1410
1410 blk_start_plug(&plug);
1411
1412 /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */ 1411 /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */
1413 if (filp->f_flags & O_DIRECT) { 1412 if (filp->f_flags & O_DIRECT) {
1414 loff_t size; 1413 loff_t size;
@@ -1424,8 +1423,12 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1424 retval = filemap_write_and_wait_range(mapping, pos, 1423 retval = filemap_write_and_wait_range(mapping, pos,
1425 pos + iov_length(iov, nr_segs) - 1); 1424 pos + iov_length(iov, nr_segs) - 1);
1426 if (!retval) { 1425 if (!retval) {
1426 struct blk_plug plug;
1427
1428 blk_start_plug(&plug);
1427 retval = mapping->a_ops->direct_IO(READ, iocb, 1429 retval = mapping->a_ops->direct_IO(READ, iocb,
1428 iov, pos, nr_segs); 1430 iov, pos, nr_segs);
1431 blk_finish_plug(&plug);
1429 } 1432 }
1430 if (retval > 0) { 1433 if (retval > 0) {
1431 *ppos = pos + retval; 1434 *ppos = pos + retval;
@@ -1481,7 +1484,6 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1481 break; 1484 break;
1482 } 1485 }
1483out: 1486out:
1484 blk_finish_plug(&plug);
1485 return retval; 1487 return retval;
1486} 1488}
1487EXPORT_SYMBOL(generic_file_aio_read); 1489EXPORT_SYMBOL(generic_file_aio_read);
@@ -2045,7 +2047,7 @@ size_t iov_iter_copy_from_user_atomic(struct page *page,
2045 size_t copied; 2047 size_t copied;
2046 2048
2047 BUG_ON(!in_atomic()); 2049 BUG_ON(!in_atomic());
2048 kaddr = kmap_atomic(page, KM_USER0); 2050 kaddr = kmap_atomic(page);
2049 if (likely(i->nr_segs == 1)) { 2051 if (likely(i->nr_segs == 1)) {
2050 int left; 2052 int left;
2051 char __user *buf = i->iov->iov_base + i->iov_offset; 2053 char __user *buf = i->iov->iov_base + i->iov_offset;
@@ -2055,7 +2057,7 @@ size_t iov_iter_copy_from_user_atomic(struct page *page,
2055 copied = __iovec_copy_from_user_inatomic(kaddr + offset, 2057 copied = __iovec_copy_from_user_inatomic(kaddr + offset,
2056 i->iov, i->iov_offset, bytes); 2058 i->iov, i->iov_offset, bytes);
2057 } 2059 }
2058 kunmap_atomic(kaddr, KM_USER0); 2060 kunmap_atomic(kaddr);
2059 2061
2060 return copied; 2062 return copied;
2061} 2063}
@@ -2341,7 +2343,9 @@ struct page *grab_cache_page_write_begin(struct address_space *mapping,
2341 struct page *page; 2343 struct page *page;
2342 gfp_t gfp_notmask = 0; 2344 gfp_t gfp_notmask = 0;
2343 2345
2344 gfp_mask = mapping_gfp_mask(mapping) | __GFP_WRITE; 2346 gfp_mask = mapping_gfp_mask(mapping);
2347 if (mapping_cap_account_dirty(mapping))
2348 gfp_mask |= __GFP_WRITE;
2345 if (flags & AOP_FLAG_NOFS) 2349 if (flags & AOP_FLAG_NOFS)
2346 gfp_notmask = __GFP_FS; 2350 gfp_notmask = __GFP_FS;
2347repeat: 2351repeat:
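
The __page_cache_alloc() hunk above (and dequeue_huge_page_vma() in the hugetlb.c hunks further down) replaces the plain get_mems_allowed()/put_mems_allowed() pair with a cookie-based retry loop: put_mems_allowed() now reports whether the cpuset's allowed-node mask changed while the allocation ran, and the caller retries only if it did and the allocation failed. A sketch of the loop shape, using only the interfaces that appear in this diff:

static struct page *spread_alloc(gfp_t gfp)
{
	unsigned int cpuset_mems_cookie;
	struct page *page;
	int n;

	do {
		/* Snapshot the mems_allowed generation. */
		cpuset_mems_cookie = get_mems_allowed();
		n = cpuset_mem_spread_node();
		page = alloc_pages_exact_node(n, gfp, 0);
		/* Retry only when the mask changed under us and the
		 * allocation came back empty. */
	} while (!put_mems_allowed(cpuset_mems_cookie) && !page);

	return page;
}
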
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
index f91b2f68734..a4eb3113222 100644
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
@@ -263,7 +263,12 @@ found:
263 xip_pfn); 263 xip_pfn);
264 if (err == -ENOMEM) 264 if (err == -ENOMEM)
265 return VM_FAULT_OOM; 265 return VM_FAULT_OOM;
266 BUG_ON(err); 266 /*
267 * err == -EBUSY is fine, we've raced against another thread
268 * that faulted-in the same page
269 */
270 if (err != -EBUSY)
271 BUG_ON(err);
267 return VM_FAULT_NOPAGE; 272 return VM_FAULT_NOPAGE;
268 } else { 273 } else {
269 int err, ret = VM_FAULT_OOM; 274 int err, ret = VM_FAULT_OOM;
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index b3ffc21ce80..f0e5306eeb5 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -671,6 +671,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
671 set_pmd_at(mm, haddr, pmd, entry); 671 set_pmd_at(mm, haddr, pmd, entry);
672 prepare_pmd_huge_pte(pgtable, mm); 672 prepare_pmd_huge_pte(pgtable, mm);
673 add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR); 673 add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR);
674 mm->nr_ptes++;
674 spin_unlock(&mm->page_table_lock); 675 spin_unlock(&mm->page_table_lock);
675 } 676 }
676 677
@@ -789,6 +790,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
789 pmd = pmd_mkold(pmd_wrprotect(pmd)); 790 pmd = pmd_mkold(pmd_wrprotect(pmd));
790 set_pmd_at(dst_mm, addr, dst_pmd, pmd); 791 set_pmd_at(dst_mm, addr, dst_pmd, pmd);
791 prepare_pmd_huge_pte(pgtable, dst_mm); 792 prepare_pmd_huge_pte(pgtable, dst_mm);
793 dst_mm->nr_ptes++;
792 794
793 ret = 0; 795 ret = 0;
794out_unlock: 796out_unlock:
@@ -887,7 +889,6 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
887 } 889 }
888 kfree(pages); 890 kfree(pages);
889 891
890 mm->nr_ptes++;
891 smp_wmb(); /* make pte visible before pmd */ 892 smp_wmb(); /* make pte visible before pmd */
892 pmd_populate(mm, pmd, pgtable); 893 pmd_populate(mm, pmd, pgtable);
893 page_remove_rmap(page); 894 page_remove_rmap(page);
@@ -1030,31 +1031,23 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
1030{ 1031{
1031 int ret = 0; 1032 int ret = 0;
1032 1033
1033 spin_lock(&tlb->mm->page_table_lock); 1034 if (__pmd_trans_huge_lock(pmd, vma) == 1) {
1034 if (likely(pmd_trans_huge(*pmd))) { 1035 struct page *page;
1035 if (unlikely(pmd_trans_splitting(*pmd))) { 1036 pgtable_t pgtable;
1036 spin_unlock(&tlb->mm->page_table_lock); 1037 pgtable = get_pmd_huge_pte(tlb->mm);
1037 wait_split_huge_page(vma->anon_vma, 1038 page = pmd_page(*pmd);
1038 pmd); 1039 pmd_clear(pmd);
1039 } else { 1040 tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
1040 struct page *page; 1041 page_remove_rmap(page);
1041 pgtable_t pgtable; 1042 VM_BUG_ON(page_mapcount(page) < 0);
1042 pgtable = get_pmd_huge_pte(tlb->mm); 1043 add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
1043 page = pmd_page(*pmd); 1044 VM_BUG_ON(!PageHead(page));
1044 pmd_clear(pmd); 1045 tlb->mm->nr_ptes--;
1045 tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
1046 page_remove_rmap(page);
1047 VM_BUG_ON(page_mapcount(page) < 0);
1048 add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
1049 VM_BUG_ON(!PageHead(page));
1050 spin_unlock(&tlb->mm->page_table_lock);
1051 tlb_remove_page(tlb, page);
1052 pte_free(tlb->mm, pgtable);
1053 ret = 1;
1054 }
1055 } else
1056 spin_unlock(&tlb->mm->page_table_lock); 1046 spin_unlock(&tlb->mm->page_table_lock);
1057 1047 tlb_remove_page(tlb, page);
1048 pte_free(tlb->mm, pgtable);
1049 ret = 1;
1050 }
1058 return ret; 1051 return ret;
1059} 1052}
1060 1053
@@ -1064,21 +1057,15 @@ int mincore_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
1064{ 1057{
1065 int ret = 0; 1058 int ret = 0;
1066 1059
1067 spin_lock(&vma->vm_mm->page_table_lock); 1060 if (__pmd_trans_huge_lock(pmd, vma) == 1) {
1068 if (likely(pmd_trans_huge(*pmd))) { 1061 /*
1069 ret = !pmd_trans_splitting(*pmd); 1062 * All logical pages in the range are present
1070 spin_unlock(&vma->vm_mm->page_table_lock); 1063 * if backed by a huge page.
1071 if (unlikely(!ret)) 1064 */
1072 wait_split_huge_page(vma->anon_vma, pmd);
1073 else {
1074 /*
1075 * All logical pages in the range are present
1076 * if backed by a huge page.
1077 */
1078 memset(vec, 1, (end - addr) >> PAGE_SHIFT);
1079 }
1080 } else
1081 spin_unlock(&vma->vm_mm->page_table_lock); 1065 spin_unlock(&vma->vm_mm->page_table_lock);
1066 memset(vec, 1, (end - addr) >> PAGE_SHIFT);
1067 ret = 1;
1068 }
1082 1069
1083 return ret; 1070 return ret;
1084} 1071}
@@ -1108,20 +1095,11 @@ int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma,
1108 goto out; 1095 goto out;
1109 } 1096 }
1110 1097
1111 spin_lock(&mm->page_table_lock); 1098 ret = __pmd_trans_huge_lock(old_pmd, vma);
1112 if (likely(pmd_trans_huge(*old_pmd))) { 1099 if (ret == 1) {
1113 if (pmd_trans_splitting(*old_pmd)) { 1100 pmd = pmdp_get_and_clear(mm, old_addr, old_pmd);
1114 spin_unlock(&mm->page_table_lock); 1101 VM_BUG_ON(!pmd_none(*new_pmd));
1115 wait_split_huge_page(vma->anon_vma, old_pmd); 1102 set_pmd_at(mm, new_addr, new_pmd, pmd);
1116 ret = -1;
1117 } else {
1118 pmd = pmdp_get_and_clear(mm, old_addr, old_pmd);
1119 VM_BUG_ON(!pmd_none(*new_pmd));
1120 set_pmd_at(mm, new_addr, new_pmd, pmd);
1121 spin_unlock(&mm->page_table_lock);
1122 ret = 1;
1123 }
1124 } else {
1125 spin_unlock(&mm->page_table_lock); 1103 spin_unlock(&mm->page_table_lock);
1126 } 1104 }
1127out: 1105out:
@@ -1134,24 +1112,41 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
1134 struct mm_struct *mm = vma->vm_mm; 1112 struct mm_struct *mm = vma->vm_mm;
1135 int ret = 0; 1113 int ret = 0;
1136 1114
1137 spin_lock(&mm->page_table_lock); 1115 if (__pmd_trans_huge_lock(pmd, vma) == 1) {
1116 pmd_t entry;
1117 entry = pmdp_get_and_clear(mm, addr, pmd);
1118 entry = pmd_modify(entry, newprot);
1119 set_pmd_at(mm, addr, pmd, entry);
1120 spin_unlock(&vma->vm_mm->page_table_lock);
1121 ret = 1;
1122 }
1123
1124 return ret;
1125}
1126
1127/*
1128 * Returns 1 if a given pmd maps a stable (not under splitting) thp.
1129 * Returns -1 if it maps a thp under splitting. Returns 0 otherwise.
1130 *
1131 * Note that if it returns 1, this routine returns without unlocking page
1132 * table locks. So callers must unlock them.
1133 */
1134int __pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma)
1135{
1136 spin_lock(&vma->vm_mm->page_table_lock);
1138 if (likely(pmd_trans_huge(*pmd))) { 1137 if (likely(pmd_trans_huge(*pmd))) {
1139 if (unlikely(pmd_trans_splitting(*pmd))) { 1138 if (unlikely(pmd_trans_splitting(*pmd))) {
1140 spin_unlock(&mm->page_table_lock); 1139 spin_unlock(&vma->vm_mm->page_table_lock);
1141 wait_split_huge_page(vma->anon_vma, pmd); 1140 wait_split_huge_page(vma->anon_vma, pmd);
1141 return -1;
1142 } else { 1142 } else {
1143 pmd_t entry; 1143 /* Thp mapped by 'pmd' is stable, so we can
1144 1144 * handle it as it is. */
1145 entry = pmdp_get_and_clear(mm, addr, pmd); 1145 return 1;
1146 entry = pmd_modify(entry, newprot);
1147 set_pmd_at(mm, addr, pmd, entry);
1148 spin_unlock(&vma->vm_mm->page_table_lock);
1149 ret = 1;
1150 } 1146 }
1151 } else 1147 }
1152 spin_unlock(&vma->vm_mm->page_table_lock); 1148 spin_unlock(&vma->vm_mm->page_table_lock);
1153 1149 return 0;
1154 return ret;
1155} 1150}
1156 1151
1157pmd_t *page_check_address_pmd(struct page *page, 1152pmd_t *page_check_address_pmd(struct page *page,
@@ -1375,7 +1370,6 @@ static int __split_huge_page_map(struct page *page,
1375 pte_unmap(pte); 1370 pte_unmap(pte);
1376 } 1371 }
1377 1372
1378 mm->nr_ptes++;
1379 smp_wmb(); /* make pte visible before pmd */ 1373 smp_wmb(); /* make pte visible before pmd */
1380 /* 1374 /*
1381 * Up to this point the pmd is present and huge and 1375 * Up to this point the pmd is present and huge and
@@ -1988,7 +1982,6 @@ static void collapse_huge_page(struct mm_struct *mm,
1988 set_pmd_at(mm, address, pmd, _pmd); 1982 set_pmd_at(mm, address, pmd, _pmd);
1989 update_mmu_cache(vma, address, _pmd); 1983 update_mmu_cache(vma, address, _pmd);
1990 prepare_pmd_huge_pte(pgtable, mm); 1984 prepare_pmd_huge_pte(pgtable, mm);
1991 mm->nr_ptes--;
1992 spin_unlock(&mm->page_table_lock); 1985 spin_unlock(&mm->page_table_lock);
1993 1986
1994#ifndef CONFIG_NUMA 1987#ifndef CONFIG_NUMA
@@ -2083,7 +2076,7 @@ static void collect_mm_slot(struct mm_slot *mm_slot)
2083{ 2076{
2084 struct mm_struct *mm = mm_slot->mm; 2077 struct mm_struct *mm = mm_slot->mm;
2085 2078
2086 VM_BUG_ON(!spin_is_locked(&khugepaged_mm_lock)); 2079 VM_BUG_ON(NR_CPUS != 1 && !spin_is_locked(&khugepaged_mm_lock));
2087 2080
2088 if (khugepaged_test_exit(mm)) { 2081 if (khugepaged_test_exit(mm)) {
2089 /* free mm_slot */ 2082 /* free mm_slot */
@@ -2113,7 +2106,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages,
2113 int progress = 0; 2106 int progress = 0;
2114 2107
2115 VM_BUG_ON(!pages); 2108 VM_BUG_ON(!pages);
2116 VM_BUG_ON(!spin_is_locked(&khugepaged_mm_lock)); 2109 VM_BUG_ON(NR_CPUS != 1 && !spin_is_locked(&khugepaged_mm_lock));
2117 2110
2118 if (khugepaged_scan.mm_slot) 2111 if (khugepaged_scan.mm_slot)
2119 mm_slot = khugepaged_scan.mm_slot; 2112 mm_slot = khugepaged_scan.mm_slot;
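
Each of the rewritten helpers above (zap_huge_pmd, mincore_huge_pmd, move_huge_pmd, change_huge_pmd) now funnels through __pmd_trans_huge_lock(), whose contract is spelled out in the new comment: 1 means a stable THP with page_table_lock still held (the caller must unlock), -1 means the helper already dropped the lock and waited out a split, 0 means no THP. A hedged sketch of a caller written to that contract (handle_stable_thp() is a placeholder, not a real function):

static int with_stable_thp(struct vm_area_struct *vma, pmd_t *pmd)
{
	int ret = 0;

	if (__pmd_trans_huge_lock(pmd, vma) == 1) {
		/* Stable THP; page_table_lock is held for us. */
		handle_stable_thp(vma->vm_mm, pmd);	/* placeholder */
		spin_unlock(&vma->vm_mm->page_table_lock);
		ret = 1;
	}
	/* 0 (no THP) and -1 (split raced, lock already dropped)
	 * both fall through with ret == 0. */
	return ret;
}
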
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 5f34bd8dda3..b8ce6f45095 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -53,6 +53,84 @@ static unsigned long __initdata default_hstate_size;
53 */ 53 */
54static DEFINE_SPINLOCK(hugetlb_lock); 54static DEFINE_SPINLOCK(hugetlb_lock);
55 55
56static inline void unlock_or_release_subpool(struct hugepage_subpool *spool)
57{
58 bool free = (spool->count == 0) && (spool->used_hpages == 0);
59
60 spin_unlock(&spool->lock);
61
62 /* If no pages are used, and no other handles to the subpool
63 * remain, free the subpool the subpool remain */
64 if (free)
65 kfree(spool);
66}
67
68struct hugepage_subpool *hugepage_new_subpool(long nr_blocks)
69{
70 struct hugepage_subpool *spool;
71
72 spool = kmalloc(sizeof(*spool), GFP_KERNEL);
73 if (!spool)
74 return NULL;
75
76 spin_lock_init(&spool->lock);
77 spool->count = 1;
78 spool->max_hpages = nr_blocks;
79 spool->used_hpages = 0;
80
81 return spool;
82}
83
84void hugepage_put_subpool(struct hugepage_subpool *spool)
85{
86 spin_lock(&spool->lock);
87 BUG_ON(!spool->count);
88 spool->count--;
89 unlock_or_release_subpool(spool);
90}
91
92static int hugepage_subpool_get_pages(struct hugepage_subpool *spool,
93 long delta)
94{
95 int ret = 0;
96
97 if (!spool)
98 return 0;
99
100 spin_lock(&spool->lock);
101 if ((spool->used_hpages + delta) <= spool->max_hpages) {
102 spool->used_hpages += delta;
103 } else {
104 ret = -ENOMEM;
105 }
106 spin_unlock(&spool->lock);
107
108 return ret;
109}
110
111static void hugepage_subpool_put_pages(struct hugepage_subpool *spool,
112 long delta)
113{
114 if (!spool)
115 return;
116
117 spin_lock(&spool->lock);
118 spool->used_hpages -= delta;
119 /* If hugetlbfs_put_super couldn't free spool due to
120 * an outstanding quota reference, free it now. */
121 unlock_or_release_subpool(spool);
122}
123
124static inline struct hugepage_subpool *subpool_inode(struct inode *inode)
125{
126 return HUGETLBFS_SB(inode->i_sb)->spool;
127}
128
129static inline struct hugepage_subpool *subpool_vma(struct vm_area_struct *vma)
130{
131 return subpool_inode(vma->vm_file->f_dentry->d_inode);
132}
133
56/* 134/*
57 * Region tracking -- allows tracking of reservations and instantiated pages 135 * Region tracking -- allows tracking of reservations and instantiated pages
58 * across the pages in a mapping. 136 * across the pages in a mapping.
@@ -454,14 +532,16 @@ static struct page *dequeue_huge_page_vma(struct hstate *h,
454 struct vm_area_struct *vma, 532 struct vm_area_struct *vma,
455 unsigned long address, int avoid_reserve) 533 unsigned long address, int avoid_reserve)
456{ 534{
457 struct page *page = NULL; 535 struct page *page;
458 struct mempolicy *mpol; 536 struct mempolicy *mpol;
459 nodemask_t *nodemask; 537 nodemask_t *nodemask;
460 struct zonelist *zonelist; 538 struct zonelist *zonelist;
461 struct zone *zone; 539 struct zone *zone;
462 struct zoneref *z; 540 struct zoneref *z;
541 unsigned int cpuset_mems_cookie;
463 542
464 get_mems_allowed(); 543retry_cpuset:
544 cpuset_mems_cookie = get_mems_allowed();
465 zonelist = huge_zonelist(vma, address, 545 zonelist = huge_zonelist(vma, address,
466 htlb_alloc_mask, &mpol, &nodemask); 546 htlb_alloc_mask, &mpol, &nodemask);
467 /* 547 /*
@@ -488,10 +568,15 @@ static struct page *dequeue_huge_page_vma(struct hstate *h,
488 } 568 }
489 } 569 }
490 } 570 }
491err: 571
492 mpol_cond_put(mpol); 572 mpol_cond_put(mpol);
493 put_mems_allowed(); 573 if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
574 goto retry_cpuset;
494 return page; 575 return page;
576
577err:
578 mpol_cond_put(mpol);
579 return NULL;
495} 580}
496 581
497static void update_and_free_page(struct hstate *h, struct page *page) 582static void update_and_free_page(struct hstate *h, struct page *page)
@@ -533,9 +618,9 @@ static void free_huge_page(struct page *page)
533 */ 618 */
534 struct hstate *h = page_hstate(page); 619 struct hstate *h = page_hstate(page);
535 int nid = page_to_nid(page); 620 int nid = page_to_nid(page);
536 struct address_space *mapping; 621 struct hugepage_subpool *spool =
622 (struct hugepage_subpool *)page_private(page);
537 623
538 mapping = (struct address_space *) page_private(page);
539 set_page_private(page, 0); 624 set_page_private(page, 0);
540 page->mapping = NULL; 625 page->mapping = NULL;
541 BUG_ON(page_count(page)); 626 BUG_ON(page_count(page));
@@ -551,8 +636,7 @@ static void free_huge_page(struct page *page)
551 enqueue_huge_page(h, page); 636 enqueue_huge_page(h, page);
552 } 637 }
553 spin_unlock(&hugetlb_lock); 638 spin_unlock(&hugetlb_lock);
554 if (mapping) 639 hugepage_subpool_put_pages(spool, 1);
555 hugetlb_put_quota(mapping, 1);
556} 640}
557 641
558static void prep_new_huge_page(struct hstate *h, struct page *page, int nid) 642static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
@@ -852,6 +936,7 @@ static int gather_surplus_pages(struct hstate *h, int delta)
852 struct page *page, *tmp; 936 struct page *page, *tmp;
853 int ret, i; 937 int ret, i;
854 int needed, allocated; 938 int needed, allocated;
939 bool alloc_ok = true;
855 940
856 needed = (h->resv_huge_pages + delta) - h->free_huge_pages; 941 needed = (h->resv_huge_pages + delta) - h->free_huge_pages;
857 if (needed <= 0) { 942 if (needed <= 0) {
@@ -867,17 +952,13 @@ retry:
867 spin_unlock(&hugetlb_lock); 952 spin_unlock(&hugetlb_lock);
868 for (i = 0; i < needed; i++) { 953 for (i = 0; i < needed; i++) {
869 page = alloc_buddy_huge_page(h, NUMA_NO_NODE); 954 page = alloc_buddy_huge_page(h, NUMA_NO_NODE);
870 if (!page) 955 if (!page) {
871 /* 956 alloc_ok = false;
872 * We were not able to allocate enough pages to 957 break;
873 * satisfy the entire reservation so we free what 958 }
874 * we've allocated so far.
875 */
876 goto free;
877
878 list_add(&page->lru, &surplus_list); 959 list_add(&page->lru, &surplus_list);
879 } 960 }
880 allocated += needed; 961 allocated += i;
881 962
882 /* 963 /*
883 * After retaking hugetlb_lock, we need to recalculate 'needed' 964 * After retaking hugetlb_lock, we need to recalculate 'needed'
@@ -886,9 +967,16 @@ retry:
886 spin_lock(&hugetlb_lock); 967 spin_lock(&hugetlb_lock);
887 needed = (h->resv_huge_pages + delta) - 968 needed = (h->resv_huge_pages + delta) -
888 (h->free_huge_pages + allocated); 969 (h->free_huge_pages + allocated);
889 if (needed > 0) 970 if (needed > 0) {
890 goto retry; 971 if (alloc_ok)
891 972 goto retry;
973 /*
974 * We were not able to allocate enough pages to
975 * satisfy the entire reservation so we free what
976 * we've allocated so far.
977 */
978 goto free;
979 }
892 /* 980 /*
893 * The surplus_list now contains _at_least_ the number of extra pages 981 * The surplus_list now contains _at_least_ the number of extra pages
894 * needed to accommodate the reservation. Add the appropriate number 982 * needed to accommodate the reservation. Add the appropriate number
@@ -914,10 +1002,10 @@ retry:
914 VM_BUG_ON(page_count(page)); 1002 VM_BUG_ON(page_count(page));
915 enqueue_huge_page(h, page); 1003 enqueue_huge_page(h, page);
916 } 1004 }
1005free:
917 spin_unlock(&hugetlb_lock); 1006 spin_unlock(&hugetlb_lock);
918 1007
919 /* Free unnecessary surplus pages to the buddy allocator */ 1008 /* Free unnecessary surplus pages to the buddy allocator */
920free:
921 if (!list_empty(&surplus_list)) { 1009 if (!list_empty(&surplus_list)) {
922 list_for_each_entry_safe(page, tmp, &surplus_list, lru) { 1010 list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
923 list_del(&page->lru); 1011 list_del(&page->lru);
@@ -966,11 +1054,12 @@ static void return_unused_surplus_pages(struct hstate *h,
966/* 1054/*
967 * Determine if the huge page at addr within the vma has an associated 1055 * Determine if the huge page at addr within the vma has an associated
968 * reservation. Where it does not we will need to logically increase 1056 * reservation. Where it does not we will need to logically increase
969 * reservation and actually increase quota before an allocation can occur. 1057 * reservation and actually increase subpool usage before an allocation
970 * Where any new reservation would be required the reservation change is 1058 * can occur. Where any new reservation would be required the
971 * prepared, but not committed. Once the page has been quota'd allocated 1059 * reservation change is prepared, but not committed. Once the page
972 * an instantiated the change should be committed via vma_commit_reservation. 1060 * has been allocated from the subpool and instantiated the change should
973 * No action is required on failure. 1061 * be committed via vma_commit_reservation. No action is required on
1062 * failure.
974 */ 1063 */
975static long vma_needs_reservation(struct hstate *h, 1064static long vma_needs_reservation(struct hstate *h,
976 struct vm_area_struct *vma, unsigned long addr) 1065 struct vm_area_struct *vma, unsigned long addr)
@@ -1019,24 +1108,24 @@ static void vma_commit_reservation(struct hstate *h,
1019static struct page *alloc_huge_page(struct vm_area_struct *vma, 1108static struct page *alloc_huge_page(struct vm_area_struct *vma,
1020 unsigned long addr, int avoid_reserve) 1109 unsigned long addr, int avoid_reserve)
1021{ 1110{
1111 struct hugepage_subpool *spool = subpool_vma(vma);
1022 struct hstate *h = hstate_vma(vma); 1112 struct hstate *h = hstate_vma(vma);
1023 struct page *page; 1113 struct page *page;
1024 struct address_space *mapping = vma->vm_file->f_mapping;
1025 struct inode *inode = mapping->host;
1026 long chg; 1114 long chg;
1027 1115
1028 /* 1116 /*
1029 * Processes that did not create the mapping will have no reserves and 1117 * Processes that did not create the mapping will have no
1030 * will not have accounted against quota. Check that the quota can be 1118 * reserves and will not have accounted against subpool
1031 * made before satisfying the allocation 1119 * limit. Check that the subpool limit can be made before
1032 * MAP_NORESERVE mappings may also need pages and quota allocated 1120 * satisfying the allocation MAP_NORESERVE mappings may also
1033 * if no reserve mapping overlaps. 1121 * need pages and subpool limit allocated allocated if no reserve
1122 * mapping overlaps.
1034 */ 1123 */
1035 chg = vma_needs_reservation(h, vma, addr); 1124 chg = vma_needs_reservation(h, vma, addr);
1036 if (chg < 0) 1125 if (chg < 0)
1037 return ERR_PTR(-VM_FAULT_OOM); 1126 return ERR_PTR(-VM_FAULT_OOM);
1038 if (chg) 1127 if (chg)
1039 if (hugetlb_get_quota(inode->i_mapping, chg)) 1128 if (hugepage_subpool_get_pages(spool, chg))
1040 return ERR_PTR(-VM_FAULT_SIGBUS); 1129 return ERR_PTR(-VM_FAULT_SIGBUS);
1041 1130
1042 spin_lock(&hugetlb_lock); 1131 spin_lock(&hugetlb_lock);
@@ -1046,12 +1135,12 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
1046 if (!page) { 1135 if (!page) {
1047 page = alloc_buddy_huge_page(h, NUMA_NO_NODE); 1136 page = alloc_buddy_huge_page(h, NUMA_NO_NODE);
1048 if (!page) { 1137 if (!page) {
1049 hugetlb_put_quota(inode->i_mapping, chg); 1138 hugepage_subpool_put_pages(spool, chg);
1050 return ERR_PTR(-VM_FAULT_SIGBUS); 1139 return ERR_PTR(-VM_FAULT_SIGBUS);
1051 } 1140 }
1052 } 1141 }
1053 1142
1054 set_page_private(page, (unsigned long) mapping); 1143 set_page_private(page, (unsigned long)spool);
1055 1144
1056 vma_commit_reservation(h, vma, addr); 1145 vma_commit_reservation(h, vma, addr);
1057 1146
@@ -2072,6 +2161,7 @@ static void hugetlb_vm_op_close(struct vm_area_struct *vma)
2072{ 2161{
2073 struct hstate *h = hstate_vma(vma); 2162 struct hstate *h = hstate_vma(vma);
2074 struct resv_map *reservations = vma_resv_map(vma); 2163 struct resv_map *reservations = vma_resv_map(vma);
2164 struct hugepage_subpool *spool = subpool_vma(vma);
2075 unsigned long reserve; 2165 unsigned long reserve;
2076 unsigned long start; 2166 unsigned long start;
2077 unsigned long end; 2167 unsigned long end;
@@ -2087,7 +2177,7 @@ static void hugetlb_vm_op_close(struct vm_area_struct *vma)
2087 2177
2088 if (reserve) { 2178 if (reserve) {
2089 hugetlb_acct_memory(h, -reserve); 2179 hugetlb_acct_memory(h, -reserve);
2090 hugetlb_put_quota(vma->vm_file->f_mapping, reserve); 2180 hugepage_subpool_put_pages(spool, reserve);
2091 } 2181 }
2092 } 2182 }
2093} 2183}
@@ -2241,16 +2331,23 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
2241 if (huge_pmd_unshare(mm, &address, ptep)) 2331 if (huge_pmd_unshare(mm, &address, ptep))
2242 continue; 2332 continue;
2243 2333
2334 pte = huge_ptep_get(ptep);
2335 if (huge_pte_none(pte))
2336 continue;
2337
2338 /*
2339 * HWPoisoned hugepage is already unmapped and dropped reference
2340 */
2341 if (unlikely(is_hugetlb_entry_hwpoisoned(pte)))
2342 continue;
2343
2344 page = pte_page(pte);
2244 /* 2345 /*
2245 * If a reference page is supplied, it is because a specific 2346 * If a reference page is supplied, it is because a specific
2246 * page is being unmapped, not a range. Ensure the page we 2347 * page is being unmapped, not a range. Ensure the page we
2247 * are about to unmap is the actual page of interest. 2348 * are about to unmap is the actual page of interest.
2248 */ 2349 */
2249 if (ref_page) { 2350 if (ref_page) {
2250 pte = huge_ptep_get(ptep);
2251 if (huge_pte_none(pte))
2252 continue;
2253 page = pte_page(pte);
2254 if (page != ref_page) 2351 if (page != ref_page)
2255 continue; 2352 continue;
2256 2353
@@ -2263,22 +2360,16 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
2263 } 2360 }
2264 2361
2265 pte = huge_ptep_get_and_clear(mm, address, ptep); 2362 pte = huge_ptep_get_and_clear(mm, address, ptep);
2266 if (huge_pte_none(pte))
2267 continue;
2268
2269 /*
2270 * HWPoisoned hugepage is already unmapped and dropped reference
2271 */
2272 if (unlikely(is_hugetlb_entry_hwpoisoned(pte)))
2273 continue;
2274
2275 page = pte_page(pte);
2276 if (pte_dirty(pte)) 2363 if (pte_dirty(pte))
2277 set_page_dirty(page); 2364 set_page_dirty(page);
2278 list_add(&page->lru, &page_list); 2365 list_add(&page->lru, &page_list);
2366
2367 /* Bail out after unmapping reference page if supplied */
2368 if (ref_page)
2369 break;
2279 } 2370 }
2280 spin_unlock(&mm->page_table_lock);
2281 flush_tlb_range(vma, start, end); 2371 flush_tlb_range(vma, start, end);
2372 spin_unlock(&mm->page_table_lock);
2282 mmu_notifier_invalidate_range_end(mm, start, end); 2373 mmu_notifier_invalidate_range_end(mm, start, end);
2283 list_for_each_entry_safe(page, tmp, &page_list, lru) { 2374 list_for_each_entry_safe(page, tmp, &page_list, lru) {
2284 page_remove_rmap(page); 2375 page_remove_rmap(page);
@@ -2316,7 +2407,7 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
2316 */ 2407 */
2317 address = address & huge_page_mask(h); 2408 address = address & huge_page_mask(h);
2318 pgoff = vma_hugecache_offset(h, vma, address); 2409 pgoff = vma_hugecache_offset(h, vma, address);
2319 mapping = (struct address_space *)page_private(page); 2410 mapping = vma->vm_file->f_dentry->d_inode->i_mapping;
2320 2411
2321 /* 2412 /*
2322 * Take the mapping lock for the duration of the table walk. As 2413 * Take the mapping lock for the duration of the table walk. As
@@ -2869,11 +2960,12 @@ int hugetlb_reserve_pages(struct inode *inode,
2869{ 2960{
2870 long ret, chg; 2961 long ret, chg;
2871 struct hstate *h = hstate_inode(inode); 2962 struct hstate *h = hstate_inode(inode);
2963 struct hugepage_subpool *spool = subpool_inode(inode);
2872 2964
2873 /* 2965 /*
2874 * Only apply hugepage reservation if asked. At fault time, an 2966 * Only apply hugepage reservation if asked. At fault time, an
2875 * attempt will be made for VM_NORESERVE to allocate a page 2967 * attempt will be made for VM_NORESERVE to allocate a page
2876 * and filesystem quota without using reserves 2968 * without using reserves
2877 */ 2969 */
2878 if (vm_flags & VM_NORESERVE) 2970 if (vm_flags & VM_NORESERVE)
2879 return 0; 2971 return 0;
@@ -2900,17 +2992,17 @@ int hugetlb_reserve_pages(struct inode *inode,
2900 if (chg < 0) 2992 if (chg < 0)
2901 return chg; 2993 return chg;
2902 2994
2903 /* There must be enough filesystem quota for the mapping */ 2995 /* There must be enough pages in the subpool for the mapping */
2904 if (hugetlb_get_quota(inode->i_mapping, chg)) 2996 if (hugepage_subpool_get_pages(spool, chg))
2905 return -ENOSPC; 2997 return -ENOSPC;
2906 2998
2907 /* 2999 /*
2908 * Check enough hugepages are available for the reservation. 3000 * Check enough hugepages are available for the reservation.
2909 * Hand back the quota if there are not 3001 * Hand the pages back to the subpool if there are not
2910 */ 3002 */
2911 ret = hugetlb_acct_memory(h, chg); 3003 ret = hugetlb_acct_memory(h, chg);
2912 if (ret < 0) { 3004 if (ret < 0) {
2913 hugetlb_put_quota(inode->i_mapping, chg); 3005 hugepage_subpool_put_pages(spool, chg);
2914 return ret; 3006 return ret;
2915 } 3007 }
2916 3008
@@ -2934,12 +3026,13 @@ void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
2934{ 3026{
2935 struct hstate *h = hstate_inode(inode); 3027 struct hstate *h = hstate_inode(inode);
2936 long chg = region_truncate(&inode->i_mapping->private_list, offset); 3028 long chg = region_truncate(&inode->i_mapping->private_list, offset);
3029 struct hugepage_subpool *spool = subpool_inode(inode);
2937 3030
2938 spin_lock(&inode->i_lock); 3031 spin_lock(&inode->i_lock);
2939 inode->i_blocks -= (blocks_per_huge_page(h) * freed); 3032 inode->i_blocks -= (blocks_per_huge_page(h) * freed);
2940 spin_unlock(&inode->i_lock); 3033 spin_unlock(&inode->i_lock);
2941 3034
2942 hugetlb_put_quota(inode->i_mapping, (chg - freed)); 3035 hugepage_subpool_put_pages(spool, (chg - freed));
2943 hugetlb_acct_memory(h, -(chg - freed)); 3036 hugetlb_acct_memory(h, -(chg - freed));
2944} 3037}
2945 3038
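
The hugetlb.c changes above replace hugetlb_get_quota()/hugetlb_put_quota() with a per-superblock, reference-counted struct hugepage_subpool. The discipline is charge before allocating, uncharge on failure, and stash the subpool in page_private() so free_huge_page() can return the page later. A condensed sketch of that flow, mirroring alloc_huge_page() (dequeue_or_alloc() stands in for the real dequeue/alloc-buddy pair):

static struct page *charged_huge_page(struct hugepage_subpool *spool, long chg)
{
	struct page *page;

	if (chg && hugepage_subpool_get_pages(spool, chg))
		return ERR_PTR(-VM_FAULT_SIGBUS);	/* subpool limit hit */

	page = dequeue_or_alloc();			/* placeholder */
	if (!page) {
		hugepage_subpool_put_pages(spool, chg);	/* undo the charge */
		return ERR_PTR(-VM_FAULT_SIGBUS);
	}

	/* free_huge_page() reads this back and calls
	 * hugepage_subpool_put_pages(spool, 1). */
	set_page_private(page, (unsigned long)spool);
	return page;
}
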
diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c
index c7fc7fd00e3..cc448bb983b 100644
--- a/mm/hwpoison-inject.c
+++ b/mm/hwpoison-inject.c
@@ -45,7 +45,7 @@ static int hwpoison_inject(void *data, u64 val)
45 * do a racy check with elevated page count, to make sure PG_hwpoison 45 * do a racy check with elevated page count, to make sure PG_hwpoison
46 * will only be set for the targeted owner (or on a free page). 46 * will only be set for the targeted owner (or on a free page).
47 * We temporarily take page lock for try_get_mem_cgroup_from_page(). 47 * We temporarily take page lock for try_get_mem_cgroup_from_page().
48 * __memory_failure() will redo the check reliably inside page lock. 48 * memory_failure() will redo the check reliably inside page lock.
49 */ 49 */
50 lock_page(hpage); 50 lock_page(hpage);
51 err = hwpoison_filter(hpage); 51 err = hwpoison_filter(hpage);
@@ -55,7 +55,7 @@ static int hwpoison_inject(void *data, u64 val)
55 55
56inject: 56inject:
57 printk(KERN_INFO "Injecting memory failure at pfn %lx\n", pfn); 57 printk(KERN_INFO "Injecting memory failure at pfn %lx\n", pfn);
58 return __memory_failure(pfn, 18, MF_COUNT_INCREASED); 58 return memory_failure(pfn, 18, MF_COUNT_INCREASED);
59} 59}
60 60
61static int hwpoison_unpoison(void *data, u64 val) 61static int hwpoison_unpoison(void *data, u64 val)
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index c833addd94d..45eb6217bf3 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -1036,7 +1036,7 @@ void __ref kmemleak_scan_area(const void *ptr, size_t size, gfp_t gfp)
1036{ 1036{
1037 pr_debug("%s(0x%p)\n", __func__, ptr); 1037 pr_debug("%s(0x%p)\n", __func__, ptr);
1038 1038
1039 if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) 1039 if (atomic_read(&kmemleak_enabled) && ptr && size && !IS_ERR(ptr))
1040 add_scan_area((unsigned long)ptr, size, gfp); 1040 add_scan_area((unsigned long)ptr, size, gfp);
1041 else if (atomic_read(&kmemleak_early_log)) 1041 else if (atomic_read(&kmemleak_early_log))
1042 log_early(KMEMLEAK_SCAN_AREA, ptr, size, 0); 1042 log_early(KMEMLEAK_SCAN_AREA, ptr, size, 0);
@@ -1757,6 +1757,7 @@ void __init kmemleak_init(void)
1757 1757
1758#ifdef CONFIG_DEBUG_KMEMLEAK_DEFAULT_OFF 1758#ifdef CONFIG_DEBUG_KMEMLEAK_DEFAULT_OFF
1759 if (!kmemleak_skip_disable) { 1759 if (!kmemleak_skip_disable) {
1760 atomic_set(&kmemleak_early_log, 0);
1760 kmemleak_disable(); 1761 kmemleak_disable();
1761 return; 1762 return;
1762 } 1763 }
diff --git a/mm/ksm.c b/mm/ksm.c
index 1925ffbfb27..47c88536889 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -28,7 +28,6 @@
28#include <linux/kthread.h> 28#include <linux/kthread.h>
29#include <linux/wait.h> 29#include <linux/wait.h>
30#include <linux/slab.h> 30#include <linux/slab.h>
31#include <linux/memcontrol.h>
32#include <linux/rbtree.h> 31#include <linux/rbtree.h>
33#include <linux/memory.h> 32#include <linux/memory.h>
34#include <linux/mmu_notifier.h> 33#include <linux/mmu_notifier.h>
@@ -375,6 +374,20 @@ static int break_ksm(struct vm_area_struct *vma, unsigned long addr)
375 return (ret & VM_FAULT_OOM) ? -ENOMEM : 0; 374 return (ret & VM_FAULT_OOM) ? -ENOMEM : 0;
376} 375}
377 376
377static struct vm_area_struct *find_mergeable_vma(struct mm_struct *mm,
378 unsigned long addr)
379{
380 struct vm_area_struct *vma;
381 if (ksm_test_exit(mm))
382 return NULL;
383 vma = find_vma(mm, addr);
384 if (!vma || vma->vm_start > addr)
385 return NULL;
386 if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma)
387 return NULL;
388 return vma;
389}
390
378static void break_cow(struct rmap_item *rmap_item) 391static void break_cow(struct rmap_item *rmap_item)
379{ 392{
380 struct mm_struct *mm = rmap_item->mm; 393 struct mm_struct *mm = rmap_item->mm;
@@ -388,15 +401,9 @@ static void break_cow(struct rmap_item *rmap_item)
388 put_anon_vma(rmap_item->anon_vma); 401 put_anon_vma(rmap_item->anon_vma);
389 402
390 down_read(&mm->mmap_sem); 403 down_read(&mm->mmap_sem);
391 if (ksm_test_exit(mm)) 404 vma = find_mergeable_vma(mm, addr);
392 goto out; 405 if (vma)
393 vma = find_vma(mm, addr); 406 break_ksm(vma, addr);
394 if (!vma || vma->vm_start > addr)
395 goto out;
396 if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma)
397 goto out;
398 break_ksm(vma, addr);
399out:
400 up_read(&mm->mmap_sem); 407 up_read(&mm->mmap_sem);
401} 408}
402 409
@@ -422,12 +429,8 @@ static struct page *get_mergeable_page(struct rmap_item *rmap_item)
422 struct page *page; 429 struct page *page;
423 430
424 down_read(&mm->mmap_sem); 431 down_read(&mm->mmap_sem);
425 if (ksm_test_exit(mm)) 432 vma = find_mergeable_vma(mm, addr);
426 goto out; 433 if (!vma)
427 vma = find_vma(mm, addr);
428 if (!vma || vma->vm_start > addr)
429 goto out;
430 if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma)
431 goto out; 434 goto out;
432 435
433 page = follow_page(vma, addr, FOLL_GET); 436 page = follow_page(vma, addr, FOLL_GET);
@@ -673,9 +676,9 @@ error:
673static u32 calc_checksum(struct page *page) 676static u32 calc_checksum(struct page *page)
674{ 677{
675 u32 checksum; 678 u32 checksum;
676 void *addr = kmap_atomic(page, KM_USER0); 679 void *addr = kmap_atomic(page);
677 checksum = jhash2(addr, PAGE_SIZE / 4, 17); 680 checksum = jhash2(addr, PAGE_SIZE / 4, 17);
678 kunmap_atomic(addr, KM_USER0); 681 kunmap_atomic(addr);
679 return checksum; 682 return checksum;
680} 683}
681 684
@@ -684,11 +687,11 @@ static int memcmp_pages(struct page *page1, struct page *page2)
684 char *addr1, *addr2; 687 char *addr1, *addr2;
685 int ret; 688 int ret;
686 689
687 addr1 = kmap_atomic(page1, KM_USER0); 690 addr1 = kmap_atomic(page1);
688 addr2 = kmap_atomic(page2, KM_USER1); 691 addr2 = kmap_atomic(page2);
689 ret = memcmp(addr1, addr2, PAGE_SIZE); 692 ret = memcmp(addr1, addr2, PAGE_SIZE);
690 kunmap_atomic(addr2, KM_USER1); 693 kunmap_atomic(addr2);
691 kunmap_atomic(addr1, KM_USER0); 694 kunmap_atomic(addr1);
692 return ret; 695 return ret;
693} 696}
694 697
@@ -1572,16 +1575,6 @@ struct page *ksm_does_need_to_copy(struct page *page,
1572 1575
1573 new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); 1576 new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
1574 if (new_page) { 1577 if (new_page) {
1575 /*
1576 * The memcg-specific accounting when moving
1577 * pages around the LRU lists relies on the
1578 * page's owner (memcg) to be valid. Usually,
1579 * pages are assigned to a new owner before
1580 * being put on the LRU list, but since this
1581 * is not the case here, the stale owner from
1582 * a previous allocation cycle must be reset.
1583 */
1584 mem_cgroup_reset_owner(new_page);
1585 copy_user_highpage(new_page, page, address, vma); 1578 copy_user_highpage(new_page, page, address, vma);
1586 1579
1587 SetPageDirty(new_page); 1580 SetPageDirty(new_page);
diff --git a/mm/madvise.c b/mm/madvise.c
index 74bf193eff0..1ccbba5b667 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -65,6 +65,12 @@ static long madvise_behavior(struct vm_area_struct * vma,
65 } 65 }
66 new_flags &= ~VM_DONTCOPY; 66 new_flags &= ~VM_DONTCOPY;
67 break; 67 break;
68 case MADV_DONTDUMP:
69 new_flags |= VM_NODUMP;
70 break;
71 case MADV_DODUMP:
72 new_flags &= ~VM_NODUMP;
73 break;
68 case MADV_MERGEABLE: 74 case MADV_MERGEABLE:
69 case MADV_UNMERGEABLE: 75 case MADV_UNMERGEABLE:
70 error = ksm_madvise(vma, start, end, behavior, &new_flags); 76 error = ksm_madvise(vma, start, end, behavior, &new_flags);
@@ -251,7 +257,7 @@ static int madvise_hwpoison(int bhv, unsigned long start, unsigned long end)
251 printk(KERN_INFO "Injecting memory failure for page %lx at %lx\n", 257 printk(KERN_INFO "Injecting memory failure for page %lx at %lx\n",
252 page_to_pfn(p), start); 258 page_to_pfn(p), start);
253 /* Ignore return value for now */ 259 /* Ignore return value for now */
254 __memory_failure(page_to_pfn(p), 0, MF_COUNT_INCREASED); 260 memory_failure(page_to_pfn(p), 0, MF_COUNT_INCREASED);
255 } 261 }
256 return ret; 262 return ret;
257} 263}
@@ -293,6 +299,8 @@ madvise_behavior_valid(int behavior)
293 case MADV_HUGEPAGE: 299 case MADV_HUGEPAGE:
294 case MADV_NOHUGEPAGE: 300 case MADV_NOHUGEPAGE:
295#endif 301#endif
302 case MADV_DONTDUMP:
303 case MADV_DODUMP:
296 return 1; 304 return 1;
297 305
298 default: 306 default:
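The new pair is driven from userspace through the ordinary madvise(2) call. A minimal sketch (the helper name is hypothetical, and it assumes a libc whose headers already expose MADV_DONTDUMP and MADV_DODUMP; on older headers the numeric values from the kernel uapi would have to be used):

#include <stddef.h>
#include <sys/mman.h>

/* Hypothetical helper: keep a large scratch buffer out of core dumps.
 * MADV_DONTDUMP sets VM_NODUMP on the VMAs covering [buf, buf + len);
 * a later MADV_DODUMP on the same range clears it again. */
static int exclude_from_coredump(void *buf, size_t len)
{
        return madvise(buf, len, MADV_DONTDUMP);
}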
diff --git a/mm/memblock.c b/mm/memblock.c
index 77b5f227e1d..99f28559950 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -99,9 +99,6 @@ phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t start,
99 phys_addr_t this_start, this_end, cand; 99 phys_addr_t this_start, this_end, cand;
100 u64 i; 100 u64 i;
101 101
102 /* align @size to avoid excessive fragmentation on reserved array */
103 size = round_up(size, align);
104
105 /* pump up @end */ 102 /* pump up @end */
106 if (end == MEMBLOCK_ALLOC_ACCESSIBLE) 103 if (end == MEMBLOCK_ALLOC_ACCESSIBLE)
107 end = memblock.current_limit; 104 end = memblock.current_limit;
@@ -731,6 +728,9 @@ static phys_addr_t __init memblock_alloc_base_nid(phys_addr_t size,
731{ 728{
732 phys_addr_t found; 729 phys_addr_t found;
733 730
731 /* align @size to avoid excessive fragmentation on reserved array */
732 size = round_up(size, align);
733
734 found = memblock_find_in_range_node(0, max_addr, size, align, nid); 734 found = memblock_find_in_range_node(0, max_addr, size, align, nid);
735 if (found && !memblock_reserve(found, size)) 735 if (found && !memblock_reserve(found, size))
736 return found; 736 return found;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 556859fec4e..b2ee6df0e9b 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -89,7 +89,6 @@ enum mem_cgroup_stat_index {
89 MEM_CGROUP_STAT_FILE_MAPPED, /* # of pages charged as file rss */ 89 MEM_CGROUP_STAT_FILE_MAPPED, /* # of pages charged as file rss */
90 MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */ 90 MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */
91 MEM_CGROUP_STAT_DATA, /* end of data requires synchronization */ 91 MEM_CGROUP_STAT_DATA, /* end of data requires synchronization */
92 MEM_CGROUP_ON_MOVE, /* someone is moving account between groups */
93 MEM_CGROUP_STAT_NSTATS, 92 MEM_CGROUP_STAT_NSTATS,
94}; 93};
95 94
@@ -135,7 +134,7 @@ struct mem_cgroup_reclaim_iter {
135 */ 134 */
136struct mem_cgroup_per_zone { 135struct mem_cgroup_per_zone {
137 struct lruvec lruvec; 136 struct lruvec lruvec;
138 unsigned long count[NR_LRU_LISTS]; 137 unsigned long lru_size[NR_LRU_LISTS];
139 138
140 struct mem_cgroup_reclaim_iter reclaim_iter[DEF_PRIORITY + 1]; 139 struct mem_cgroup_reclaim_iter reclaim_iter[DEF_PRIORITY + 1];
141 140
@@ -144,11 +143,9 @@ struct mem_cgroup_per_zone {
144 unsigned long long usage_in_excess;/* Set to the value by which */ 143 unsigned long long usage_in_excess;/* Set to the value by which */
145 /* the soft limit is exceeded*/ 144 /* the soft limit is exceeded*/
146 bool on_tree; 145 bool on_tree;
147 struct mem_cgroup *mem; /* Back pointer, we cannot */ 146 struct mem_cgroup *memcg; /* Back pointer, we cannot */
148 /* use container_of */ 147 /* use container_of */
149}; 148};
150/* Macro for accessing counter */
151#define MEM_CGROUP_ZSTAT(mz, idx) ((mz)->count[(idx)])
152 149
153struct mem_cgroup_per_node { 150struct mem_cgroup_per_node {
154 struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES]; 151 struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES];
@@ -230,10 +227,30 @@ struct mem_cgroup {
230 * the counter to account for memory usage 227 * the counter to account for memory usage
231 */ 228 */
232 struct res_counter res; 229 struct res_counter res;
233 /* 230
234 * the counter to account for mem+swap usage. 231 union {
235 */ 232 /*
236 struct res_counter memsw; 233 * the counter to account for mem+swap usage.
234 */
235 struct res_counter memsw;
236
237 /*
238 * rcu_freeing is used only when freeing struct mem_cgroup,
239 * so put it into a union to avoid wasting more memory.
240 * It must be disjoint from the css field. It could be
241 * in a union with the res field, but res plays a much
242 * larger part in mem_cgroup life than memsw, and might
243 * be of interest, even at time of free, when debugging.
244 * So share rcu_head with the less interesting memsw.
245 */
246 struct rcu_head rcu_freeing;
247 /*
248 * But when using vfree(), that cannot be done at
249 * interrupt time, so we must then queue the work.
250 */
251 struct work_struct work_freeing;
252 };
253
237 /* 254 /*
238 * Per cgroup active and inactive list, similar to the 255 * Per cgroup active and inactive list, similar to the
239 * per zone LRU lists. 256 * per zone LRU lists.
@@ -280,6 +297,12 @@ struct mem_cgroup {
280 */ 297 */
281 unsigned long move_charge_at_immigrate; 298 unsigned long move_charge_at_immigrate;
282 /* 299 /*
300 * set > 0 if pages under this cgroup are moving to other cgroup.
301 */
302 atomic_t moving_account;
303 /* taken only while moving_account > 0 */
304 spinlock_t move_lock;
305 /*
283 * percpu counter. 306 * percpu counter.
284 */ 307 */
285 struct mem_cgroup_stat_cpu *stat; 308 struct mem_cgroup_stat_cpu *stat;
@@ -592,9 +615,9 @@ retry:
592 * we will add it back at the end of reclaim to its correct 615 * we will add it back at the end of reclaim to its correct
593 * position in the tree. 616 * position in the tree.
594 */ 617 */
595 __mem_cgroup_remove_exceeded(mz->mem, mz, mctz); 618 __mem_cgroup_remove_exceeded(mz->memcg, mz, mctz);
596 if (!res_counter_soft_limit_excess(&mz->mem->res) || 619 if (!res_counter_soft_limit_excess(&mz->memcg->res) ||
597 !css_tryget(&mz->mem->css)) 620 !css_tryget(&mz->memcg->css))
598 goto retry; 621 goto retry;
599done: 622done:
600 return mz; 623 return mz;
@@ -672,15 +695,19 @@ static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg,
672} 695}
673 696
674static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg, 697static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
675 bool file, int nr_pages) 698 bool anon, int nr_pages)
676{ 699{
677 preempt_disable(); 700 preempt_disable();
678 701
679 if (file) 702 /*
680 __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_CACHE], 703 * Here, RSS means 'mapped anon' and anon's SwapCache. Shmem/tmpfs is
704 * counted as CACHE even if it's on ANON LRU.
705 */
706 if (anon)
707 __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS],
681 nr_pages); 708 nr_pages);
682 else 709 else
683 __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS], 710 __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_CACHE],
684 nr_pages); 711 nr_pages);
685 712
686 /* pagein of a big page is an event. So, ignore page size */ 713 /* pagein of a big page is an event. So, ignore page size */
@@ -701,14 +728,14 @@ mem_cgroup_zone_nr_lru_pages(struct mem_cgroup *memcg, int nid, int zid,
701 unsigned int lru_mask) 728 unsigned int lru_mask)
702{ 729{
703 struct mem_cgroup_per_zone *mz; 730 struct mem_cgroup_per_zone *mz;
704 enum lru_list l; 731 enum lru_list lru;
705 unsigned long ret = 0; 732 unsigned long ret = 0;
706 733
707 mz = mem_cgroup_zoneinfo(memcg, nid, zid); 734 mz = mem_cgroup_zoneinfo(memcg, nid, zid);
708 735
709 for_each_lru(l) { 736 for_each_lru(lru) {
710 if (BIT(l) & lru_mask) 737 if (BIT(lru) & lru_mask)
711 ret += MEM_CGROUP_ZSTAT(mz, l); 738 ret += mz->lru_size[lru];
712 } 739 }
713 return ret; 740 return ret;
714} 741}
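As a concrete example of the renamed counter in use: with lru_mask == LRU_ALL_FILE (conventionally BIT(LRU_INACTIVE_FILE) | BIT(LRU_ACTIVE_FILE)), the loop above returns mz->lru_size[LRU_INACTIVE_FILE] + mz->lru_size[LRU_ACTIVE_FILE] for the requested node and zone.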
@@ -776,7 +803,8 @@ static void memcg_check_events(struct mem_cgroup *memcg, struct page *page)
776 /* threshold event is triggered in finer grain than soft limit */ 803 /* threshold event is triggered in finer grain than soft limit */
777 if (unlikely(mem_cgroup_event_ratelimit(memcg, 804 if (unlikely(mem_cgroup_event_ratelimit(memcg,
778 MEM_CGROUP_TARGET_THRESH))) { 805 MEM_CGROUP_TARGET_THRESH))) {
779 bool do_softlimit, do_numainfo; 806 bool do_softlimit;
807 bool do_numainfo __maybe_unused;
780 808
781 do_softlimit = mem_cgroup_event_ratelimit(memcg, 809 do_softlimit = mem_cgroup_event_ratelimit(memcg,
782 MEM_CGROUP_TARGET_SOFTLIMIT); 810 MEM_CGROUP_TARGET_SOFTLIMIT);
@@ -1041,9 +1069,22 @@ struct lruvec *mem_cgroup_lru_add_list(struct zone *zone, struct page *page,
1041 1069
1042 pc = lookup_page_cgroup(page); 1070 pc = lookup_page_cgroup(page);
1043 memcg = pc->mem_cgroup; 1071 memcg = pc->mem_cgroup;
1072
1073 /*
1074 * Surreptitiously switch any uncharged page to root:
1075 * an uncharged page off lru does nothing to secure
1076 * its former mem_cgroup from sudden removal.
1077 *
1078 * Our caller holds lru_lock, and PageCgroupUsed is updated
1079 * under page_cgroup lock: between them, they make all uses
1080 * of pc->mem_cgroup safe.
1081 */
1082 if (!PageCgroupUsed(pc) && memcg != root_mem_cgroup)
1083 pc->mem_cgroup = memcg = root_mem_cgroup;
1084
1044 mz = page_cgroup_zoneinfo(memcg, page); 1085 mz = page_cgroup_zoneinfo(memcg, page);
1045 /* compound_order() is stabilized through lru_lock */ 1086 /* compound_order() is stabilized through lru_lock */
1046 MEM_CGROUP_ZSTAT(mz, lru) += 1 << compound_order(page); 1087 mz->lru_size[lru] += 1 << compound_order(page);
1047 return &mz->lruvec; 1088 return &mz->lruvec;
1048} 1089}
1049 1090
@@ -1071,8 +1112,8 @@ void mem_cgroup_lru_del_list(struct page *page, enum lru_list lru)
1071 VM_BUG_ON(!memcg); 1112 VM_BUG_ON(!memcg);
1072 mz = page_cgroup_zoneinfo(memcg, page); 1113 mz = page_cgroup_zoneinfo(memcg, page);
1073 /* huge page split is done under lru_lock. so, we have no races. */ 1114 /* huge page split is done under lru_lock. so, we have no races. */
1074 VM_BUG_ON(MEM_CGROUP_ZSTAT(mz, lru) < (1 << compound_order(page))); 1115 VM_BUG_ON(mz->lru_size[lru] < (1 << compound_order(page)));
1075 MEM_CGROUP_ZSTAT(mz, lru) -= 1 << compound_order(page); 1116 mz->lru_size[lru] -= 1 << compound_order(page);
1076} 1117}
1077 1118
1078void mem_cgroup_lru_del(struct page *page) 1119void mem_cgroup_lru_del(struct page *page)
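For scale: a 2 MiB transparent huge page on a system with 4 KiB base pages has compound_order(page) == 9, so linking it onto an LRU adds 1 << 9 = 512 to mz->lru_size[lru] and unlinking subtracts the same 512, which is exactly the underflow the VM_BUG_ON above guards against.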
@@ -1251,40 +1292,48 @@ int mem_cgroup_swappiness(struct mem_cgroup *memcg)
1251 return memcg->swappiness; 1292 return memcg->swappiness;
1252} 1293}
1253 1294
1254static void mem_cgroup_start_move(struct mem_cgroup *memcg) 1295/*
1255{ 1296 * memcg->moving_account is used for checking possibility that some thread is
1256 int cpu; 1297 * calling move_account(). When a thread on CPU-A starts moving pages under
1298 * a memcg, other threads should check memcg->moving_account under
1299 * rcu_read_lock(), like this:
1300 *
1301 * CPU-A CPU-B
1302 * rcu_read_lock()
1303 * memcg->moving_account+1 if (memcg->moving_account)
1304 * take heavy locks.
1305 * synchronize_rcu() update something.
1306 * rcu_read_unlock()
1307 * start move here.
1308 */
1257 1309
1258 get_online_cpus(); 1310/* for quick checking without looking up memcg */
1259 spin_lock(&memcg->pcp_counter_lock); 1311atomic_t memcg_moving __read_mostly;
1260 for_each_online_cpu(cpu)
1261 per_cpu(memcg->stat->count[MEM_CGROUP_ON_MOVE], cpu) += 1;
1262 memcg->nocpu_base.count[MEM_CGROUP_ON_MOVE] += 1;
1263 spin_unlock(&memcg->pcp_counter_lock);
1264 put_online_cpus();
1265 1312
1313static void mem_cgroup_start_move(struct mem_cgroup *memcg)
1314{
1315 atomic_inc(&memcg_moving);
1316 atomic_inc(&memcg->moving_account);
1266 synchronize_rcu(); 1317 synchronize_rcu();
1267} 1318}
1268 1319
1269static void mem_cgroup_end_move(struct mem_cgroup *memcg) 1320static void mem_cgroup_end_move(struct mem_cgroup *memcg)
1270{ 1321{
1271 int cpu; 1322 /*
1272 1323 * Now, mem_cgroup_clear_mc() may call this function with NULL.
1273 if (!memcg) 1324 * We check NULL in callee rather than caller.
1274 return; 1325 */
1275 get_online_cpus(); 1326 if (memcg) {
1276 spin_lock(&memcg->pcp_counter_lock); 1327 atomic_dec(&memcg_moving);
1277 for_each_online_cpu(cpu) 1328 atomic_dec(&memcg->moving_account);
1278 per_cpu(memcg->stat->count[MEM_CGROUP_ON_MOVE], cpu) -= 1; 1329 }
1279 memcg->nocpu_base.count[MEM_CGROUP_ON_MOVE] -= 1;
1280 spin_unlock(&memcg->pcp_counter_lock);
1281 put_online_cpus();
1282} 1330}
1331
1283/* 1332/*
1284 * 2 routines for checking "mem" is under move_account() or not. 1333 * 2 routines for checking "mem" is under move_account() or not.
1285 * 1334 *
1286 * mem_cgroup_stealed() - checking a cgroup is mc.from or not. This is used 1335 * mem_cgroup_stolen() - checking whether a cgroup is mc.from or not. This
1287 * for avoiding race in accounting. If true, 1336 * is used for avoiding races in accounting. If true,
1288 * pc->mem_cgroup may be overwritten. 1337 * pc->mem_cgroup may be overwritten.
1289 * 1338 *
1290 * mem_cgroup_under_move() - checking whether a cgroup is mc.from or mc.to or 1339 * mem_cgroup_under_move() - checking whether a cgroup is mc.from or mc.to or
@@ -1292,10 +1341,10 @@ static void mem_cgroup_end_move(struct mem_cgroup *memcg)
1292 * waiting at high memory pressure caused by "move". 1341 * waiting at high memory pressure caused by "move".
1293 */ 1342 */
1294 1343
1295static bool mem_cgroup_stealed(struct mem_cgroup *memcg) 1344static bool mem_cgroup_stolen(struct mem_cgroup *memcg)
1296{ 1345{
1297 VM_BUG_ON(!rcu_read_lock_held()); 1346 VM_BUG_ON(!rcu_read_lock_held());
1298 return this_cpu_read(memcg->stat->count[MEM_CGROUP_ON_MOVE]) > 0; 1347 return atomic_read(&memcg->moving_account) > 0;
1299} 1348}
1300 1349
1301static bool mem_cgroup_under_move(struct mem_cgroup *memcg) 1350static bool mem_cgroup_under_move(struct mem_cgroup *memcg)
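A minimal sketch of the writer ("mover") side of this protocol, with a hypothetical do_move_pages() standing in for the real move-charge work, to show how the two atomic counters and synchronize_rcu() bracket the heavy part:

/* Sketch only: expected usage of start/end_move by the mover.
 * do_move_pages() is a stand-in, not a real memcontrol.c function. */
static void move_pages_between_memcgs(struct mem_cgroup *from,
                                      struct mem_cgroup *to)
{
        /* bump memcg_moving and from->moving_account, then
         * synchronize_rcu() so every reader observes the flags */
        mem_cgroup_start_move(from);

        /* readers that see the flags fall back to the per-memcg
         * move_lock, so the actual page moves are now safe */
        do_move_pages(from, to);

        mem_cgroup_end_move(from);      /* drop the counters again */
}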
@@ -1336,6 +1385,24 @@ static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg)
1336 return false; 1385 return false;
1337} 1386}
1338 1387
1388/*
1389 * Take this lock when
1390 * - a code tries to modify page's memcg while it's USED.
1391 * - a code tries to modify page state accounting in a memcg.
1392 * see mem_cgroup_stolen(), too.
1393 */
1394static void move_lock_mem_cgroup(struct mem_cgroup *memcg,
1395 unsigned long *flags)
1396{
1397 spin_lock_irqsave(&memcg->move_lock, *flags);
1398}
1399
1400static void move_unlock_mem_cgroup(struct mem_cgroup *memcg,
1401 unsigned long *flags)
1402{
1403 spin_unlock_irqrestore(&memcg->move_lock, *flags);
1404}
1405
1339/** 1406/**
1340 * mem_cgroup_print_oom_info: Called from OOM with tasklist_lock held in read mode. 1407 * mem_cgroup_print_oom_info: Called from OOM with tasklist_lock held in read mode.
1341 * @memcg: The memory cgroup that went over limit 1408 * @memcg: The memory cgroup that went over limit
@@ -1359,7 +1426,6 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
1359 if (!memcg || !p) 1426 if (!memcg || !p)
1360 return; 1427 return;
1361 1428
1362
1363 rcu_read_lock(); 1429 rcu_read_lock();
1364 1430
1365 mem_cgrp = memcg->css.cgroup; 1431 mem_cgrp = memcg->css.cgroup;
@@ -1738,22 +1804,22 @@ static DEFINE_SPINLOCK(memcg_oom_lock);
1738static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq); 1804static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);
1739 1805
1740struct oom_wait_info { 1806struct oom_wait_info {
1741 struct mem_cgroup *mem; 1807 struct mem_cgroup *memcg;
1742 wait_queue_t wait; 1808 wait_queue_t wait;
1743}; 1809};
1744 1810
1745static int memcg_oom_wake_function(wait_queue_t *wait, 1811static int memcg_oom_wake_function(wait_queue_t *wait,
1746 unsigned mode, int sync, void *arg) 1812 unsigned mode, int sync, void *arg)
1747{ 1813{
1748 struct mem_cgroup *wake_memcg = (struct mem_cgroup *)arg, 1814 struct mem_cgroup *wake_memcg = (struct mem_cgroup *)arg;
1749 *oom_wait_memcg; 1815 struct mem_cgroup *oom_wait_memcg;
1750 struct oom_wait_info *oom_wait_info; 1816 struct oom_wait_info *oom_wait_info;
1751 1817
1752 oom_wait_info = container_of(wait, struct oom_wait_info, wait); 1818 oom_wait_info = container_of(wait, struct oom_wait_info, wait);
1753 oom_wait_memcg = oom_wait_info->mem; 1819 oom_wait_memcg = oom_wait_info->memcg;
1754 1820
1755 /* 1821 /*
1756 * Both of oom_wait_info->mem and wake_mem are stable under us. 1822 * Both of oom_wait_info->memcg and wake_memcg are stable under us.
1757 * Then we can use css_is_ancestor without taking care of RCU. 1823 * Then we can use css_is_ancestor without taking care of RCU.
1758 */ 1824 */
1759 if (!mem_cgroup_same_or_subtree(oom_wait_memcg, wake_memcg) 1825 if (!mem_cgroup_same_or_subtree(oom_wait_memcg, wake_memcg)
@@ -1777,12 +1843,12 @@ static void memcg_oom_recover(struct mem_cgroup *memcg)
1777/* 1843/*
1778 * try to call OOM killer. returns false if we should exit memory-reclaim loop. 1844 * try to call OOM killer. returns false if we should exit memory-reclaim loop.
1779 */ 1845 */
1780bool mem_cgroup_handle_oom(struct mem_cgroup *memcg, gfp_t mask) 1846bool mem_cgroup_handle_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
1781{ 1847{
1782 struct oom_wait_info owait; 1848 struct oom_wait_info owait;
1783 bool locked, need_to_kill; 1849 bool locked, need_to_kill;
1784 1850
1785 owait.mem = memcg; 1851 owait.memcg = memcg;
1786 owait.wait.flags = 0; 1852 owait.wait.flags = 0;
1787 owait.wait.func = memcg_oom_wake_function; 1853 owait.wait.func = memcg_oom_wake_function;
1788 owait.wait.private = current; 1854 owait.wait.private = current;
@@ -1807,7 +1873,7 @@ bool mem_cgroup_handle_oom(struct mem_cgroup *memcg, gfp_t mask)
1807 1873
1808 if (need_to_kill) { 1874 if (need_to_kill) {
1809 finish_wait(&memcg_oom_waitq, &owait.wait); 1875 finish_wait(&memcg_oom_waitq, &owait.wait);
1810 mem_cgroup_out_of_memory(memcg, mask); 1876 mem_cgroup_out_of_memory(memcg, mask, order);
1811 } else { 1877 } else {
1812 schedule(); 1878 schedule();
1813 finish_wait(&memcg_oom_waitq, &owait.wait); 1879 finish_wait(&memcg_oom_waitq, &owait.wait);
@@ -1847,41 +1913,66 @@ bool mem_cgroup_handle_oom(struct mem_cgroup *memcg, gfp_t mask)
1847 * by flags. 1913 * by flags.
1848 * 1914 *
1849 * Considering "move", this is the only case we see a race. To make the race 1915 * Considering "move", this is the only case we see a race. To make the race
1850 * small, we check MEM_CGROUP_ON_MOVE percpu value and detect there are 1916 * small, we check memcg->moving_account and detect a possibility of race.
1851 * possibility of race condition. If there is, we take a lock. 1917 * If there is, we take a lock.
1852 */ 1918 */
1853 1919
1920void __mem_cgroup_begin_update_page_stat(struct page *page,
1921 bool *locked, unsigned long *flags)
1922{
1923 struct mem_cgroup *memcg;
1924 struct page_cgroup *pc;
1925
1926 pc = lookup_page_cgroup(page);
1927again:
1928 memcg = pc->mem_cgroup;
1929 if (unlikely(!memcg || !PageCgroupUsed(pc)))
1930 return;
1931 /*
1932 * If this memory cgroup is not under account moving, we don't
1933 * need to take move_lock_mem_cgroup(). Because we already hold
1934 * rcu_read_lock(), any calls to move_account will be delayed until
1935 * rcu_read_unlock() if mem_cgroup_stolen() == true.
1936 */
1937 if (!mem_cgroup_stolen(memcg))
1938 return;
1939
1940 move_lock_mem_cgroup(memcg, flags);
1941 if (memcg != pc->mem_cgroup || !PageCgroupUsed(pc)) {
1942 move_unlock_mem_cgroup(memcg, flags);
1943 goto again;
1944 }
1945 *locked = true;
1946}
1947
1948void __mem_cgroup_end_update_page_stat(struct page *page, unsigned long *flags)
1949{
1950 struct page_cgroup *pc = lookup_page_cgroup(page);
1951
1952 /*
1953 * It's guaranteed that pc->mem_cgroup never changes while
1954 * lock is held because a routine modifying pc->mem_cgroup
1955 * should take move_lock_mem_cgroup().
1956 */
1957 move_unlock_mem_cgroup(pc->mem_cgroup, flags);
1958}
1959
1854void mem_cgroup_update_page_stat(struct page *page, 1960void mem_cgroup_update_page_stat(struct page *page,
1855 enum mem_cgroup_page_stat_item idx, int val) 1961 enum mem_cgroup_page_stat_item idx, int val)
1856{ 1962{
1857 struct mem_cgroup *memcg; 1963 struct mem_cgroup *memcg;
1858 struct page_cgroup *pc = lookup_page_cgroup(page); 1964 struct page_cgroup *pc = lookup_page_cgroup(page);
1859 bool need_unlock = false;
1860 unsigned long uninitialized_var(flags); 1965 unsigned long uninitialized_var(flags);
1861 1966
1862 if (mem_cgroup_disabled()) 1967 if (mem_cgroup_disabled())
1863 return; 1968 return;
1864 1969
1865 rcu_read_lock();
1866 memcg = pc->mem_cgroup; 1970 memcg = pc->mem_cgroup;
1867 if (unlikely(!memcg || !PageCgroupUsed(pc))) 1971 if (unlikely(!memcg || !PageCgroupUsed(pc)))
1868 goto out; 1972 return;
1869 /* pc->mem_cgroup is unstable ? */
1870 if (unlikely(mem_cgroup_stealed(memcg)) || PageTransHuge(page)) {
1871 /* take a lock against to access pc->mem_cgroup */
1872 move_lock_page_cgroup(pc, &flags);
1873 need_unlock = true;
1874 memcg = pc->mem_cgroup;
1875 if (!memcg || !PageCgroupUsed(pc))
1876 goto out;
1877 }
1878 1973
1879 switch (idx) { 1974 switch (idx) {
1880 case MEMCG_NR_FILE_MAPPED: 1975 case MEMCG_NR_FILE_MAPPED:
1881 if (val > 0)
1882 SetPageCgroupFileMapped(pc);
1883 else if (!page_mapped(page))
1884 ClearPageCgroupFileMapped(pc);
1885 idx = MEM_CGROUP_STAT_FILE_MAPPED; 1976 idx = MEM_CGROUP_STAT_FILE_MAPPED;
1886 break; 1977 break;
1887 default: 1978 default:
@@ -1889,14 +1980,7 @@ void mem_cgroup_update_page_stat(struct page *page,
1889 } 1980 }
1890 1981
1891 this_cpu_add(memcg->stat->count[idx], val); 1982 this_cpu_add(memcg->stat->count[idx], val);
1892
1893out:
1894 if (unlikely(need_unlock))
1895 move_unlock_page_cgroup(pc, &flags);
1896 rcu_read_unlock();
1897 return;
1898} 1983}
1899EXPORT_SYMBOL(mem_cgroup_update_page_stat);
1900 1984
1901/* 1985/*
1902 * size of first charge trial. "32" comes from vmscan.c's magic value. 1986 * size of first charge trial. "32" comes from vmscan.c's magic value.
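A minimal sketch of the calling pattern the two helpers above expect from statistics updaters such as the file-rmap paths (the function name is hypothetical; rcu_read_lock() is written out explicitly here, though in practice a header wrapper may supply it):

/* Sketch: bump the FILE_MAPPED statistic race-free against move_account. */
static void account_file_mapped(struct page *page, int val)
{
        bool locked = false;
        unsigned long flags = 0;

        rcu_read_lock();
        /* takes memcg->move_lock only if an account move is in flight */
        __mem_cgroup_begin_update_page_stat(page, &locked, &flags);
        mem_cgroup_update_page_stat(page, MEMCG_NR_FILE_MAPPED, val);
        if (locked)
                __mem_cgroup_end_update_page_stat(page, &flags);
        rcu_read_unlock();
}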
@@ -2067,17 +2151,6 @@ static void mem_cgroup_drain_pcp_counter(struct mem_cgroup *memcg, int cpu)
2067 per_cpu(memcg->stat->events[i], cpu) = 0; 2151 per_cpu(memcg->stat->events[i], cpu) = 0;
2068 memcg->nocpu_base.events[i] += x; 2152 memcg->nocpu_base.events[i] += x;
2069 } 2153 }
2070 /* need to clear ON_MOVE value, works as a kind of lock. */
2071 per_cpu(memcg->stat->count[MEM_CGROUP_ON_MOVE], cpu) = 0;
2072 spin_unlock(&memcg->pcp_counter_lock);
2073}
2074
2075static void synchronize_mem_cgroup_on_move(struct mem_cgroup *memcg, int cpu)
2076{
2077 int idx = MEM_CGROUP_ON_MOVE;
2078
2079 spin_lock(&memcg->pcp_counter_lock);
2080 per_cpu(memcg->stat->count[idx], cpu) = memcg->nocpu_base.count[idx];
2081 spin_unlock(&memcg->pcp_counter_lock); 2154 spin_unlock(&memcg->pcp_counter_lock);
2082} 2155}
2083 2156
@@ -2089,11 +2162,8 @@ static int __cpuinit memcg_cpu_hotplug_callback(struct notifier_block *nb,
2089 struct memcg_stock_pcp *stock; 2162 struct memcg_stock_pcp *stock;
2090 struct mem_cgroup *iter; 2163 struct mem_cgroup *iter;
2091 2164
2092 if ((action == CPU_ONLINE)) { 2165 if (action == CPU_ONLINE)
2093 for_each_mem_cgroup(iter)
2094 synchronize_mem_cgroup_on_move(iter, cpu);
2095 return NOTIFY_OK; 2166 return NOTIFY_OK;
2096 }
2097 2167
2098 if ((action != CPU_DEAD) || action != CPU_DEAD_FROZEN) 2168 if ((action != CPU_DEAD) || action != CPU_DEAD_FROZEN)
2099 return NOTIFY_OK; 2169 return NOTIFY_OK;
@@ -2178,7 +2248,7 @@ static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
2178 if (!oom_check) 2248 if (!oom_check)
2179 return CHARGE_NOMEM; 2249 return CHARGE_NOMEM;
2180 /* check OOM */ 2250 /* check OOM */
2181 if (!mem_cgroup_handle_oom(mem_over_limit, gfp_mask)) 2251 if (!mem_cgroup_handle_oom(mem_over_limit, gfp_mask, get_order(csize)))
2182 return CHARGE_OOM_DIE; 2252 return CHARGE_OOM_DIE;
2183 2253
2184 return CHARGE_RETRY; 2254 return CHARGE_RETRY;
@@ -2407,8 +2477,13 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
2407 struct page *page, 2477 struct page *page,
2408 unsigned int nr_pages, 2478 unsigned int nr_pages,
2409 struct page_cgroup *pc, 2479 struct page_cgroup *pc,
2410 enum charge_type ctype) 2480 enum charge_type ctype,
2481 bool lrucare)
2411{ 2482{
2483 struct zone *uninitialized_var(zone);
2484 bool was_on_lru = false;
2485 bool anon;
2486
2412 lock_page_cgroup(pc); 2487 lock_page_cgroup(pc);
2413 if (unlikely(PageCgroupUsed(pc))) { 2488 if (unlikely(PageCgroupUsed(pc))) {
2414 unlock_page_cgroup(pc); 2489 unlock_page_cgroup(pc);
@@ -2419,6 +2494,21 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
2419 * we don't need page_cgroup_lock about tail pages, because they are not 2494 * we don't need page_cgroup_lock about tail pages, because they are not
2420 * accessed by any other context at this point. 2495 * accessed by any other context at this point.
2421 */ 2496 */
2497
2498 /*
2499 * In some cases, such as SwapCache and FUSE (splice_buf->radixtree), the page
2500 * may already be on some other mem_cgroup's LRU. Take care of it.
2501 */
2502 if (lrucare) {
2503 zone = page_zone(page);
2504 spin_lock_irq(&zone->lru_lock);
2505 if (PageLRU(page)) {
2506 ClearPageLRU(page);
2507 del_page_from_lru_list(zone, page, page_lru(page));
2508 was_on_lru = true;
2509 }
2510 }
2511
2422 pc->mem_cgroup = memcg; 2512 pc->mem_cgroup = memcg;
2423 /* 2513 /*
2424 * We access a page_cgroup asynchronously without lock_page_cgroup(). 2514 * We access a page_cgroup asynchronously without lock_page_cgroup().
@@ -2428,23 +2518,25 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
2428 * See mem_cgroup_add_lru_list(), etc. 2518 * See mem_cgroup_add_lru_list(), etc.
2429 */ 2519 */
2430 smp_wmb(); 2520 smp_wmb();
2431 switch (ctype) { 2521 SetPageCgroupUsed(pc);
2432 case MEM_CGROUP_CHARGE_TYPE_CACHE: 2522
2433 case MEM_CGROUP_CHARGE_TYPE_SHMEM: 2523 if (lrucare) {
2434 SetPageCgroupCache(pc); 2524 if (was_on_lru) {
2435 SetPageCgroupUsed(pc); 2525 VM_BUG_ON(PageLRU(page));
2436 break; 2526 SetPageLRU(page);
2437 case MEM_CGROUP_CHARGE_TYPE_MAPPED: 2527 add_page_to_lru_list(zone, page, page_lru(page));
2438 ClearPageCgroupCache(pc); 2528 }
2439 SetPageCgroupUsed(pc); 2529 spin_unlock_irq(&zone->lru_lock);
2440 break;
2441 default:
2442 break;
2443 } 2530 }
2444 2531
2445 mem_cgroup_charge_statistics(memcg, PageCgroupCache(pc), nr_pages); 2532 if (ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED)
2533 anon = true;
2534 else
2535 anon = false;
2536
2537 mem_cgroup_charge_statistics(memcg, anon, nr_pages);
2446 unlock_page_cgroup(pc); 2538 unlock_page_cgroup(pc);
2447 WARN_ON_ONCE(PageLRU(page)); 2539
2448 /* 2540 /*
2449 * "charge_statistics" updated event counter. Then, check it. 2541 * "charge_statistics" updated event counter. Then, check it.
2450 * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree. 2542 * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree.
@@ -2455,8 +2547,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
2455 2547
2456#ifdef CONFIG_TRANSPARENT_HUGEPAGE 2548#ifdef CONFIG_TRANSPARENT_HUGEPAGE
2457 2549
2458#define PCGF_NOCOPY_AT_SPLIT ((1 << PCG_LOCK) | (1 << PCG_MOVE_LOCK) |\ 2550#define PCGF_NOCOPY_AT_SPLIT ((1 << PCG_LOCK) | (1 << PCG_MIGRATION))
2459 (1 << PCG_MIGRATION))
2460/* 2551/*
2461 * Because tail pages are not marked as "used", set it. We're under 2552 * Because tail pages are not marked as "used", set it. We're under
2462 * zone->lru_lock, 'splitting on pmd' and compound_lock. 2553 * zone->lru_lock, 'splitting on pmd' and compound_lock.
@@ -2507,6 +2598,7 @@ static int mem_cgroup_move_account(struct page *page,
2507{ 2598{
2508 unsigned long flags; 2599 unsigned long flags;
2509 int ret; 2600 int ret;
2601 bool anon = PageAnon(page);
2510 2602
2511 VM_BUG_ON(from == to); 2603 VM_BUG_ON(from == to);
2512 VM_BUG_ON(PageLRU(page)); 2604 VM_BUG_ON(PageLRU(page));
@@ -2526,23 +2618,23 @@ static int mem_cgroup_move_account(struct page *page,
2526 if (!PageCgroupUsed(pc) || pc->mem_cgroup != from) 2618 if (!PageCgroupUsed(pc) || pc->mem_cgroup != from)
2527 goto unlock; 2619 goto unlock;
2528 2620
2529 move_lock_page_cgroup(pc, &flags); 2621 move_lock_mem_cgroup(from, &flags);
2530 2622
2531 if (PageCgroupFileMapped(pc)) { 2623 if (!anon && page_mapped(page)) {
2532 /* Update mapped_file data for mem_cgroup */ 2624 /* Update mapped_file data for mem_cgroup */
2533 preempt_disable(); 2625 preempt_disable();
2534 __this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]); 2626 __this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
2535 __this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]); 2627 __this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
2536 preempt_enable(); 2628 preempt_enable();
2537 } 2629 }
2538 mem_cgroup_charge_statistics(from, PageCgroupCache(pc), -nr_pages); 2630 mem_cgroup_charge_statistics(from, anon, -nr_pages);
2539 if (uncharge) 2631 if (uncharge)
2540 /* This is not "cancel", but cancel_charge does all we need. */ 2632 /* This is not "cancel", but cancel_charge does all we need. */
2541 __mem_cgroup_cancel_charge(from, nr_pages); 2633 __mem_cgroup_cancel_charge(from, nr_pages);
2542 2634
2543 /* caller should have done css_get */ 2635 /* caller should have done css_get */
2544 pc->mem_cgroup = to; 2636 pc->mem_cgroup = to;
2545 mem_cgroup_charge_statistics(to, PageCgroupCache(pc), nr_pages); 2637 mem_cgroup_charge_statistics(to, anon, nr_pages);
2546 /* 2638 /*
2547 * We charge against "to" which may not have any tasks. Then, "to" 2639 * We charge against "to" which may not have any tasks. Then, "to"
2548 * can be under rmdir(). But in current implementation, caller of 2640 * can be under rmdir(). But in current implementation, caller of
@@ -2550,7 +2642,7 @@ static int mem_cgroup_move_account(struct page *page,
2550 * guaranteed that "to" is never removed. So, we don't check rmdir 2642 * guaranteed that "to" is never removed. So, we don't check rmdir
2551 * status here. 2643 * status here.
2552 */ 2644 */
2553 move_unlock_page_cgroup(pc, &flags); 2645 move_unlock_mem_cgroup(from, &flags);
2554 ret = 0; 2646 ret = 0;
2555unlock: 2647unlock:
2556 unlock_page_cgroup(pc); 2648 unlock_page_cgroup(pc);
@@ -2642,7 +2734,7 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
2642 ret = __mem_cgroup_try_charge(mm, gfp_mask, nr_pages, &memcg, oom); 2734 ret = __mem_cgroup_try_charge(mm, gfp_mask, nr_pages, &memcg, oom);
2643 if (ret == -ENOMEM) 2735 if (ret == -ENOMEM)
2644 return ret; 2736 return ret;
2645 __mem_cgroup_commit_charge(memcg, page, nr_pages, pc, ctype); 2737 __mem_cgroup_commit_charge(memcg, page, nr_pages, pc, ctype, false);
2646 return 0; 2738 return 0;
2647} 2739}
2648 2740
@@ -2662,35 +2754,6 @@ static void
2662__mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr, 2754__mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr,
2663 enum charge_type ctype); 2755 enum charge_type ctype);
2664 2756
2665static void
2666__mem_cgroup_commit_charge_lrucare(struct page *page, struct mem_cgroup *memcg,
2667 enum charge_type ctype)
2668{
2669 struct page_cgroup *pc = lookup_page_cgroup(page);
2670 struct zone *zone = page_zone(page);
2671 unsigned long flags;
2672 bool removed = false;
2673
2674 /*
2675 * In some case, SwapCache, FUSE(splice_buf->radixtree), the page
2676 * is already on LRU. It means the page may on some other page_cgroup's
2677 * LRU. Take care of it.
2678 */
2679 spin_lock_irqsave(&zone->lru_lock, flags);
2680 if (PageLRU(page)) {
2681 del_page_from_lru_list(zone, page, page_lru(page));
2682 ClearPageLRU(page);
2683 removed = true;
2684 }
2685 __mem_cgroup_commit_charge(memcg, page, 1, pc, ctype);
2686 if (removed) {
2687 add_page_to_lru_list(zone, page, page_lru(page));
2688 SetPageLRU(page);
2689 }
2690 spin_unlock_irqrestore(&zone->lru_lock, flags);
2691 return;
2692}
2693
2694int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, 2757int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
2695 gfp_t gfp_mask) 2758 gfp_t gfp_mask)
2696{ 2759{
@@ -2768,13 +2831,16 @@ static void
2768__mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *memcg, 2831__mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *memcg,
2769 enum charge_type ctype) 2832 enum charge_type ctype)
2770{ 2833{
2834 struct page_cgroup *pc;
2835
2771 if (mem_cgroup_disabled()) 2836 if (mem_cgroup_disabled())
2772 return; 2837 return;
2773 if (!memcg) 2838 if (!memcg)
2774 return; 2839 return;
2775 cgroup_exclude_rmdir(&memcg->css); 2840 cgroup_exclude_rmdir(&memcg->css);
2776 2841
2777 __mem_cgroup_commit_charge_lrucare(page, memcg, ctype); 2842 pc = lookup_page_cgroup(page);
2843 __mem_cgroup_commit_charge(memcg, page, 1, pc, ctype, true);
2778 /* 2844 /*
2779 * Now swap is on-memory. This means this page may be 2845 * Now swap is on-memory. This means this page may be
2780 * counted both as mem and swap....double count. 2846 * counted both as mem and swap....double count.
@@ -2878,7 +2944,6 @@ direct_uncharge:
2878 res_counter_uncharge(&memcg->memsw, nr_pages * PAGE_SIZE); 2944 res_counter_uncharge(&memcg->memsw, nr_pages * PAGE_SIZE);
2879 if (unlikely(batch->memcg != memcg)) 2945 if (unlikely(batch->memcg != memcg))
2880 memcg_oom_recover(memcg); 2946 memcg_oom_recover(memcg);
2881 return;
2882} 2947}
2883 2948
2884/* 2949/*
@@ -2890,6 +2955,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
2890 struct mem_cgroup *memcg = NULL; 2955 struct mem_cgroup *memcg = NULL;
2891 unsigned int nr_pages = 1; 2956 unsigned int nr_pages = 1;
2892 struct page_cgroup *pc; 2957 struct page_cgroup *pc;
2958 bool anon;
2893 2959
2894 if (mem_cgroup_disabled()) 2960 if (mem_cgroup_disabled())
2895 return NULL; 2961 return NULL;
@@ -2915,8 +2981,17 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
2915 if (!PageCgroupUsed(pc)) 2981 if (!PageCgroupUsed(pc))
2916 goto unlock_out; 2982 goto unlock_out;
2917 2983
2984 anon = PageAnon(page);
2985
2918 switch (ctype) { 2986 switch (ctype) {
2919 case MEM_CGROUP_CHARGE_TYPE_MAPPED: 2987 case MEM_CGROUP_CHARGE_TYPE_MAPPED:
2988 /*
2989 * Generally PageAnon tells if it's the anon statistics to be
2990 * updated; but sometimes e.g. mem_cgroup_uncharge_page() is
2991 * used before the page has reached the stage of being marked PageAnon.
2992 */
2993 anon = true;
2994 /* fallthrough */
2920 case MEM_CGROUP_CHARGE_TYPE_DROP: 2995 case MEM_CGROUP_CHARGE_TYPE_DROP:
2921 /* See mem_cgroup_prepare_migration() */ 2996 /* See mem_cgroup_prepare_migration() */
2922 if (page_mapped(page) || PageCgroupMigration(pc)) 2997 if (page_mapped(page) || PageCgroupMigration(pc))
@@ -2933,7 +3008,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
2933 break; 3008 break;
2934 } 3009 }
2935 3010
2936 mem_cgroup_charge_statistics(memcg, PageCgroupCache(pc), -nr_pages); 3011 mem_cgroup_charge_statistics(memcg, anon, -nr_pages);
2937 3012
2938 ClearPageCgroupUsed(pc); 3013 ClearPageCgroupUsed(pc);
2939 /* 3014 /*
@@ -3026,23 +3101,6 @@ void mem_cgroup_uncharge_end(void)
3026 batch->memcg = NULL; 3101 batch->memcg = NULL;
3027} 3102}
3028 3103
3029/*
3030 * A function for resetting pc->mem_cgroup for newly allocated pages.
3031 * This function should be called if the newpage will be added to LRU
3032 * before start accounting.
3033 */
3034void mem_cgroup_reset_owner(struct page *newpage)
3035{
3036 struct page_cgroup *pc;
3037
3038 if (mem_cgroup_disabled())
3039 return;
3040
3041 pc = lookup_page_cgroup(newpage);
3042 VM_BUG_ON(PageCgroupUsed(pc));
3043 pc->mem_cgroup = root_mem_cgroup;
3044}
3045
3046#ifdef CONFIG_SWAP 3104#ifdef CONFIG_SWAP
3047/* 3105/*
3048 * called after __delete_from_swap_cache() and drop "page" account. 3106 * called after __delete_from_swap_cache() and drop "page" account.
@@ -3247,7 +3305,7 @@ int mem_cgroup_prepare_migration(struct page *page,
3247 ctype = MEM_CGROUP_CHARGE_TYPE_CACHE; 3305 ctype = MEM_CGROUP_CHARGE_TYPE_CACHE;
3248 else 3306 else
3249 ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM; 3307 ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM;
3250 __mem_cgroup_commit_charge(memcg, newpage, 1, pc, ctype); 3308 __mem_cgroup_commit_charge(memcg, newpage, 1, pc, ctype, false);
3251 return ret; 3309 return ret;
3252} 3310}
3253 3311
@@ -3257,6 +3315,7 @@ void mem_cgroup_end_migration(struct mem_cgroup *memcg,
3257{ 3315{
3258 struct page *used, *unused; 3316 struct page *used, *unused;
3259 struct page_cgroup *pc; 3317 struct page_cgroup *pc;
3318 bool anon;
3260 3319
3261 if (!memcg) 3320 if (!memcg)
3262 return; 3321 return;
@@ -3278,8 +3337,10 @@ void mem_cgroup_end_migration(struct mem_cgroup *memcg,
3278 lock_page_cgroup(pc); 3337 lock_page_cgroup(pc);
3279 ClearPageCgroupMigration(pc); 3338 ClearPageCgroupMigration(pc);
3280 unlock_page_cgroup(pc); 3339 unlock_page_cgroup(pc);
3281 3340 anon = PageAnon(used);
3282 __mem_cgroup_uncharge_common(unused, MEM_CGROUP_CHARGE_TYPE_FORCE); 3341 __mem_cgroup_uncharge_common(unused,
3342 anon ? MEM_CGROUP_CHARGE_TYPE_MAPPED
3343 : MEM_CGROUP_CHARGE_TYPE_CACHE);
3283 3344
3284 /* 3345 /*
3285 * If a page is a file cache, radix-tree replacement is very atomic 3346 * If a page is a file cache, radix-tree replacement is very atomic
@@ -3289,7 +3350,7 @@ void mem_cgroup_end_migration(struct mem_cgroup *memcg,
3289 * and USED bit check in mem_cgroup_uncharge_page() will do enough 3350 * and USED bit check in mem_cgroup_uncharge_page() will do enough
3290 * check. (see prepare_charge() also) 3351 * check. (see prepare_charge() also)
3291 */ 3352 */
3292 if (PageAnon(used)) 3353 if (anon)
3293 mem_cgroup_uncharge_page(used); 3354 mem_cgroup_uncharge_page(used);
3294 /* 3355 /*
3295 * At migration, we may charge account against cgroup which has no 3356 * At migration, we may charge account against cgroup which has no
@@ -3319,7 +3380,7 @@ void mem_cgroup_replace_page_cache(struct page *oldpage,
3319 /* fix accounting on old pages */ 3380 /* fix accounting on old pages */
3320 lock_page_cgroup(pc); 3381 lock_page_cgroup(pc);
3321 memcg = pc->mem_cgroup; 3382 memcg = pc->mem_cgroup;
3322 mem_cgroup_charge_statistics(memcg, PageCgroupCache(pc), -1); 3383 mem_cgroup_charge_statistics(memcg, false, -1);
3323 ClearPageCgroupUsed(pc); 3384 ClearPageCgroupUsed(pc);
3324 unlock_page_cgroup(pc); 3385 unlock_page_cgroup(pc);
3325 3386
@@ -3331,7 +3392,7 @@ void mem_cgroup_replace_page_cache(struct page *oldpage,
3331 * the newpage may be on LRU(or pagevec for LRU) already. We lock 3392 * the newpage may be on LRU(or pagevec for LRU) already. We lock
3332 * LRU while we overwrite pc->mem_cgroup. 3393 * LRU while we overwrite pc->mem_cgroup.
3333 */ 3394 */
3334 __mem_cgroup_commit_charge_lrucare(newpage, memcg, type); 3395 __mem_cgroup_commit_charge(memcg, newpage, 1, pc, type, true);
3335} 3396}
3336 3397
3337#ifdef CONFIG_DEBUG_VM 3398#ifdef CONFIG_DEBUG_VM
@@ -3530,7 +3591,7 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
3530 break; 3591 break;
3531 3592
3532 nr_scanned = 0; 3593 nr_scanned = 0;
3533 reclaimed = mem_cgroup_soft_reclaim(mz->mem, zone, 3594 reclaimed = mem_cgroup_soft_reclaim(mz->memcg, zone,
3534 gfp_mask, &nr_scanned); 3595 gfp_mask, &nr_scanned);
3535 nr_reclaimed += reclaimed; 3596 nr_reclaimed += reclaimed;
3536 *total_scanned += nr_scanned; 3597 *total_scanned += nr_scanned;
@@ -3557,13 +3618,13 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
3557 next_mz = 3618 next_mz =
3558 __mem_cgroup_largest_soft_limit_node(mctz); 3619 __mem_cgroup_largest_soft_limit_node(mctz);
3559 if (next_mz == mz) 3620 if (next_mz == mz)
3560 css_put(&next_mz->mem->css); 3621 css_put(&next_mz->memcg->css);
3561 else /* next_mz == NULL or other memcg */ 3622 else /* next_mz == NULL or other memcg */
3562 break; 3623 break;
3563 } while (1); 3624 } while (1);
3564 } 3625 }
3565 __mem_cgroup_remove_exceeded(mz->mem, mz, mctz); 3626 __mem_cgroup_remove_exceeded(mz->memcg, mz, mctz);
3566 excess = res_counter_soft_limit_excess(&mz->mem->res); 3627 excess = res_counter_soft_limit_excess(&mz->memcg->res);
3567 /* 3628 /*
3568 * One school of thought says that we should not add 3629 * One school of thought says that we should not add
3569 * back the node to the tree if reclaim returns 0. 3630 * back the node to the tree if reclaim returns 0.
@@ -3573,9 +3634,9 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
3573 * term TODO. 3634 * term TODO.
3574 */ 3635 */
3575 /* If excess == 0, no tree ops */ 3636 /* If excess == 0, no tree ops */
3576 __mem_cgroup_insert_exceeded(mz->mem, mz, mctz, excess); 3637 __mem_cgroup_insert_exceeded(mz->memcg, mz, mctz, excess);
3577 spin_unlock(&mctz->lock); 3638 spin_unlock(&mctz->lock);
3578 css_put(&mz->mem->css); 3639 css_put(&mz->memcg->css);
3579 loop++; 3640 loop++;
3580 /* 3641 /*
3581 * Could not reclaim anything and there are no more 3642 * Could not reclaim anything and there are no more
@@ -3588,7 +3649,7 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
3588 break; 3649 break;
3589 } while (!nr_reclaimed); 3650 } while (!nr_reclaimed);
3590 if (next_mz) 3651 if (next_mz)
3591 css_put(&next_mz->mem->css); 3652 css_put(&next_mz->memcg->css);
3592 return nr_reclaimed; 3653 return nr_reclaimed;
3593} 3654}
3594 3655
@@ -3610,7 +3671,7 @@ static int mem_cgroup_force_empty_list(struct mem_cgroup *memcg,
3610 mz = mem_cgroup_zoneinfo(memcg, node, zid); 3671 mz = mem_cgroup_zoneinfo(memcg, node, zid);
3611 list = &mz->lruvec.lists[lru]; 3672 list = &mz->lruvec.lists[lru];
3612 3673
3613 loop = MEM_CGROUP_ZSTAT(mz, lru); 3674 loop = mz->lru_size[lru];
3614 /* give some margin against EBUSY etc...*/ 3675 /* give some margin against EBUSY etc...*/
3615 loop += 256; 3676 loop += 256;
3616 busy = NULL; 3677 busy = NULL;
@@ -3684,10 +3745,10 @@ move_account:
3684 mem_cgroup_start_move(memcg); 3745 mem_cgroup_start_move(memcg);
3685 for_each_node_state(node, N_HIGH_MEMORY) { 3746 for_each_node_state(node, N_HIGH_MEMORY) {
3686 for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) { 3747 for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) {
3687 enum lru_list l; 3748 enum lru_list lru;
3688 for_each_lru(l) { 3749 for_each_lru(lru) {
3689 ret = mem_cgroup_force_empty_list(memcg, 3750 ret = mem_cgroup_force_empty_list(memcg,
3690 node, zid, l); 3751 node, zid, lru);
3691 if (ret) 3752 if (ret)
3692 break; 3753 break;
3693 } 3754 }
@@ -3841,7 +3902,6 @@ static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft)
3841 break; 3902 break;
3842 default: 3903 default:
3843 BUG(); 3904 BUG();
3844 break;
3845 } 3905 }
3846 return val; 3906 return val;
3847} 3907}
@@ -3920,7 +3980,6 @@ static void memcg_get_hierarchical_limit(struct mem_cgroup *memcg,
3920out: 3980out:
3921 *mem_limit = min_limit; 3981 *mem_limit = min_limit;
3922 *memsw_limit = min_memsw_limit; 3982 *memsw_limit = min_memsw_limit;
3923 return;
3924} 3983}
3925 3984
3926static int mem_cgroup_reset(struct cgroup *cont, unsigned int event) 3985static int mem_cgroup_reset(struct cgroup *cont, unsigned int event)
@@ -4079,38 +4138,38 @@ static int mem_control_numa_stat_show(struct seq_file *m, void *arg)
4079 unsigned long total_nr, file_nr, anon_nr, unevictable_nr; 4138 unsigned long total_nr, file_nr, anon_nr, unevictable_nr;
4080 unsigned long node_nr; 4139 unsigned long node_nr;
4081 struct cgroup *cont = m->private; 4140 struct cgroup *cont = m->private;
4082 struct mem_cgroup *mem_cont = mem_cgroup_from_cont(cont); 4141 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
4083 4142
4084 total_nr = mem_cgroup_nr_lru_pages(mem_cont, LRU_ALL); 4143 total_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL);
4085 seq_printf(m, "total=%lu", total_nr); 4144 seq_printf(m, "total=%lu", total_nr);
4086 for_each_node_state(nid, N_HIGH_MEMORY) { 4145 for_each_node_state(nid, N_HIGH_MEMORY) {
4087 node_nr = mem_cgroup_node_nr_lru_pages(mem_cont, nid, LRU_ALL); 4146 node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL);
4088 seq_printf(m, " N%d=%lu", nid, node_nr); 4147 seq_printf(m, " N%d=%lu", nid, node_nr);
4089 } 4148 }
4090 seq_putc(m, '\n'); 4149 seq_putc(m, '\n');
4091 4150
4092 file_nr = mem_cgroup_nr_lru_pages(mem_cont, LRU_ALL_FILE); 4151 file_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL_FILE);
4093 seq_printf(m, "file=%lu", file_nr); 4152 seq_printf(m, "file=%lu", file_nr);
4094 for_each_node_state(nid, N_HIGH_MEMORY) { 4153 for_each_node_state(nid, N_HIGH_MEMORY) {
4095 node_nr = mem_cgroup_node_nr_lru_pages(mem_cont, nid, 4154 node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid,
4096 LRU_ALL_FILE); 4155 LRU_ALL_FILE);
4097 seq_printf(m, " N%d=%lu", nid, node_nr); 4156 seq_printf(m, " N%d=%lu", nid, node_nr);
4098 } 4157 }
4099 seq_putc(m, '\n'); 4158 seq_putc(m, '\n');
4100 4159
4101 anon_nr = mem_cgroup_nr_lru_pages(mem_cont, LRU_ALL_ANON); 4160 anon_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL_ANON);
4102 seq_printf(m, "anon=%lu", anon_nr); 4161 seq_printf(m, "anon=%lu", anon_nr);
4103 for_each_node_state(nid, N_HIGH_MEMORY) { 4162 for_each_node_state(nid, N_HIGH_MEMORY) {
4104 node_nr = mem_cgroup_node_nr_lru_pages(mem_cont, nid, 4163 node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid,
4105 LRU_ALL_ANON); 4164 LRU_ALL_ANON);
4106 seq_printf(m, " N%d=%lu", nid, node_nr); 4165 seq_printf(m, " N%d=%lu", nid, node_nr);
4107 } 4166 }
4108 seq_putc(m, '\n'); 4167 seq_putc(m, '\n');
4109 4168
4110 unevictable_nr = mem_cgroup_nr_lru_pages(mem_cont, BIT(LRU_UNEVICTABLE)); 4169 unevictable_nr = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_UNEVICTABLE));
4111 seq_printf(m, "unevictable=%lu", unevictable_nr); 4170 seq_printf(m, "unevictable=%lu", unevictable_nr);
4112 for_each_node_state(nid, N_HIGH_MEMORY) { 4171 for_each_node_state(nid, N_HIGH_MEMORY) {
4113 node_nr = mem_cgroup_node_nr_lru_pages(mem_cont, nid, 4172 node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid,
4114 BIT(LRU_UNEVICTABLE)); 4173 BIT(LRU_UNEVICTABLE));
4115 seq_printf(m, " N%d=%lu", nid, node_nr); 4174 seq_printf(m, " N%d=%lu", nid, node_nr);
4116 } 4175 }
@@ -4122,12 +4181,12 @@ static int mem_control_numa_stat_show(struct seq_file *m, void *arg)
4122static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft, 4181static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft,
4123 struct cgroup_map_cb *cb) 4182 struct cgroup_map_cb *cb)
4124{ 4183{
4125 struct mem_cgroup *mem_cont = mem_cgroup_from_cont(cont); 4184 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
4126 struct mcs_total_stat mystat; 4185 struct mcs_total_stat mystat;
4127 int i; 4186 int i;
4128 4187
4129 memset(&mystat, 0, sizeof(mystat)); 4188 memset(&mystat, 0, sizeof(mystat));
4130 mem_cgroup_get_local_stat(mem_cont, &mystat); 4189 mem_cgroup_get_local_stat(memcg, &mystat);
4131 4190
4132 4191
4133 for (i = 0; i < NR_MCS_STAT; i++) { 4192 for (i = 0; i < NR_MCS_STAT; i++) {
@@ -4139,14 +4198,14 @@ static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft,
4139 /* Hierarchical information */ 4198 /* Hierarchical information */
4140 { 4199 {
4141 unsigned long long limit, memsw_limit; 4200 unsigned long long limit, memsw_limit;
4142 memcg_get_hierarchical_limit(mem_cont, &limit, &memsw_limit); 4201 memcg_get_hierarchical_limit(memcg, &limit, &memsw_limit);
4143 cb->fill(cb, "hierarchical_memory_limit", limit); 4202 cb->fill(cb, "hierarchical_memory_limit", limit);
4144 if (do_swap_account) 4203 if (do_swap_account)
4145 cb->fill(cb, "hierarchical_memsw_limit", memsw_limit); 4204 cb->fill(cb, "hierarchical_memsw_limit", memsw_limit);
4146 } 4205 }
4147 4206
4148 memset(&mystat, 0, sizeof(mystat)); 4207 memset(&mystat, 0, sizeof(mystat));
4149 mem_cgroup_get_total_stat(mem_cont, &mystat); 4208 mem_cgroup_get_total_stat(memcg, &mystat);
4150 for (i = 0; i < NR_MCS_STAT; i++) { 4209 for (i = 0; i < NR_MCS_STAT; i++) {
4151 if (i == MCS_SWAP && !do_swap_account) 4210 if (i == MCS_SWAP && !do_swap_account)
4152 continue; 4211 continue;
@@ -4162,7 +4221,7 @@ static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft,
4162 4221
4163 for_each_online_node(nid) 4222 for_each_online_node(nid)
4164 for (zid = 0; zid < MAX_NR_ZONES; zid++) { 4223 for (zid = 0; zid < MAX_NR_ZONES; zid++) {
4165 mz = mem_cgroup_zoneinfo(mem_cont, nid, zid); 4224 mz = mem_cgroup_zoneinfo(memcg, nid, zid);
4166 4225
4167 recent_rotated[0] += 4226 recent_rotated[0] +=
4168 mz->reclaim_stat.recent_rotated[0]; 4227 mz->reclaim_stat.recent_rotated[0];
@@ -4407,11 +4466,8 @@ static void mem_cgroup_usage_unregister_event(struct cgroup *cgrp,
4407 else 4466 else
4408 BUG(); 4467 BUG();
4409 4468
4410 /* 4469 if (!thresholds->primary)
4411 * Something went wrong if we trying to unregister a threshold 4470 goto unlock;
4412 * if we don't have thresholds
4413 */
4414 BUG_ON(!thresholds);
4415 4471
4416 usage = mem_cgroup_usage(memcg, type == _MEMSWAP); 4472 usage = mem_cgroup_usage(memcg, type == _MEMSWAP);
4417 4473
@@ -4461,7 +4517,7 @@ swap_buffers:
4461 4517
4462 /* To be sure that nobody uses thresholds */ 4518 /* To be sure that nobody uses thresholds */
4463 synchronize_rcu(); 4519 synchronize_rcu();
4464 4520unlock:
4465 mutex_unlock(&memcg->thresholds_lock); 4521 mutex_unlock(&memcg->thresholds_lock);
4466} 4522}
4467 4523
@@ -4580,10 +4636,9 @@ static int register_kmem_files(struct cgroup *cont, struct cgroup_subsys *ss)
4580 return mem_cgroup_sockets_init(cont, ss); 4636 return mem_cgroup_sockets_init(cont, ss);
4581}; 4637};
4582 4638
4583static void kmem_cgroup_destroy(struct cgroup_subsys *ss, 4639static void kmem_cgroup_destroy(struct cgroup *cont)
4584 struct cgroup *cont)
4585{ 4640{
4586 mem_cgroup_sockets_destroy(cont, ss); 4641 mem_cgroup_sockets_destroy(cont);
4587} 4642}
4588#else 4643#else
4589static int register_kmem_files(struct cgroup *cont, struct cgroup_subsys *ss) 4644static int register_kmem_files(struct cgroup *cont, struct cgroup_subsys *ss)
@@ -4591,8 +4646,7 @@ static int register_kmem_files(struct cgroup *cont, struct cgroup_subsys *ss)
4591 return 0; 4646 return 0;
4592} 4647}
4593 4648
4594static void kmem_cgroup_destroy(struct cgroup_subsys *ss, 4649static void kmem_cgroup_destroy(struct cgroup *cont)
4595 struct cgroup *cont)
4596{ 4650{
4597} 4651}
4598#endif 4652#endif
@@ -4716,7 +4770,7 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
4716{ 4770{
4717 struct mem_cgroup_per_node *pn; 4771 struct mem_cgroup_per_node *pn;
4718 struct mem_cgroup_per_zone *mz; 4772 struct mem_cgroup_per_zone *mz;
4719 enum lru_list l; 4773 enum lru_list lru;
4720 int zone, tmp = node; 4774 int zone, tmp = node;
4721 /* 4775 /*
4722 * This routine is called against possible nodes. 4776 * This routine is called against possible nodes.
@@ -4734,11 +4788,11 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
4734 4788
4735 for (zone = 0; zone < MAX_NR_ZONES; zone++) { 4789 for (zone = 0; zone < MAX_NR_ZONES; zone++) {
4736 mz = &pn->zoneinfo[zone]; 4790 mz = &pn->zoneinfo[zone];
4737 for_each_lru(l) 4791 for_each_lru(lru)
4738 INIT_LIST_HEAD(&mz->lruvec.lists[l]); 4792 INIT_LIST_HEAD(&mz->lruvec.lists[lru]);
4739 mz->usage_in_excess = 0; 4793 mz->usage_in_excess = 0;
4740 mz->on_tree = false; 4794 mz->on_tree = false;
4741 mz->mem = memcg; 4795 mz->memcg = memcg;
4742 } 4796 }
4743 memcg->info.nodeinfo[node] = pn; 4797 memcg->info.nodeinfo[node] = pn;
4744 return 0; 4798 return 0;
@@ -4751,33 +4805,54 @@ static void free_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
4751 4805
4752static struct mem_cgroup *mem_cgroup_alloc(void) 4806static struct mem_cgroup *mem_cgroup_alloc(void)
4753{ 4807{
4754 struct mem_cgroup *mem; 4808 struct mem_cgroup *memcg;
4755 int size = sizeof(struct mem_cgroup); 4809 int size = sizeof(struct mem_cgroup);
4756 4810
4757 /* Can be very big if MAX_NUMNODES is very big */ 4811 /* Can be very big if MAX_NUMNODES is very big */
4758 if (size < PAGE_SIZE) 4812 if (size < PAGE_SIZE)
4759 mem = kzalloc(size, GFP_KERNEL); 4813 memcg = kzalloc(size, GFP_KERNEL);
4760 else 4814 else
4761 mem = vzalloc(size); 4815 memcg = vzalloc(size);
4762 4816
4763 if (!mem) 4817 if (!memcg)
4764 return NULL; 4818 return NULL;
4765 4819
4766 mem->stat = alloc_percpu(struct mem_cgroup_stat_cpu); 4820 memcg->stat = alloc_percpu(struct mem_cgroup_stat_cpu);
4767 if (!mem->stat) 4821 if (!memcg->stat)
4768 goto out_free; 4822 goto out_free;
4769 spin_lock_init(&mem->pcp_counter_lock); 4823 spin_lock_init(&memcg->pcp_counter_lock);
4770 return mem; 4824 return memcg;
4771 4825
4772out_free: 4826out_free:
4773 if (size < PAGE_SIZE) 4827 if (size < PAGE_SIZE)
4774 kfree(mem); 4828 kfree(memcg);
4775 else 4829 else
4776 vfree(mem); 4830 vfree(memcg);
4777 return NULL; 4831 return NULL;
4778} 4832}
4779 4833
4780/* 4834/*
4835 * Helpers for freeing a vzalloc()ed mem_cgroup by RCU,
4836 * but in process context. The work_freeing structure is overlaid
4837 * on the rcu_freeing structure, which itself is overlaid on memsw.
4838 */
4839static void vfree_work(struct work_struct *work)
4840{
4841 struct mem_cgroup *memcg;
4842
4843 memcg = container_of(work, struct mem_cgroup, work_freeing);
4844 vfree(memcg);
4845}
4846static void vfree_rcu(struct rcu_head *rcu_head)
4847{
4848 struct mem_cgroup *memcg;
4849
4850 memcg = container_of(rcu_head, struct mem_cgroup, rcu_freeing);
4851 INIT_WORK(&memcg->work_freeing, vfree_work);
4852 schedule_work(&memcg->work_freeing);
4853}
4854
4855/*
4781 * At destroying mem_cgroup, references from swap_cgroup can remain. 4856 * At destroying mem_cgroup, references from swap_cgroup can remain.
4782 * (scanning all at force_empty is too costly...) 4857 * (scanning all at force_empty is too costly...)
4783 * 4858 *
@@ -4800,9 +4875,9 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)
4800 4875
4801 free_percpu(memcg->stat); 4876 free_percpu(memcg->stat);
4802 if (sizeof(struct mem_cgroup) < PAGE_SIZE) 4877 if (sizeof(struct mem_cgroup) < PAGE_SIZE)
4803 kfree(memcg); 4878 kfree_rcu(memcg, rcu_freeing);
4804 else 4879 else
4805 vfree(memcg); 4880 call_rcu(&memcg->rcu_freeing, vfree_rcu);
4806} 4881}
4807 4882
4808static void mem_cgroup_get(struct mem_cgroup *memcg) 4883static void mem_cgroup_get(struct mem_cgroup *memcg)
@@ -4884,7 +4959,7 @@ err_cleanup:
4884} 4959}
4885 4960
4886static struct cgroup_subsys_state * __ref 4961static struct cgroup_subsys_state * __ref
4887mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) 4962mem_cgroup_create(struct cgroup *cont)
4888{ 4963{
4889 struct mem_cgroup *memcg, *parent; 4964 struct mem_cgroup *memcg, *parent;
4890 long error = -ENOMEM; 4965 long error = -ENOMEM;
@@ -4940,26 +5015,25 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
4940 atomic_set(&memcg->refcnt, 1); 5015 atomic_set(&memcg->refcnt, 1);
4941 memcg->move_charge_at_immigrate = 0; 5016 memcg->move_charge_at_immigrate = 0;
4942 mutex_init(&memcg->thresholds_lock); 5017 mutex_init(&memcg->thresholds_lock);
5018 spin_lock_init(&memcg->move_lock);
4943 return &memcg->css; 5019 return &memcg->css;
4944free_out: 5020free_out:
4945 __mem_cgroup_free(memcg); 5021 __mem_cgroup_free(memcg);
4946 return ERR_PTR(error); 5022 return ERR_PTR(error);
4947} 5023}
4948 5024
4949static int mem_cgroup_pre_destroy(struct cgroup_subsys *ss, 5025static int mem_cgroup_pre_destroy(struct cgroup *cont)
4950 struct cgroup *cont)
4951{ 5026{
4952 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); 5027 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
4953 5028
4954 return mem_cgroup_force_empty(memcg, false); 5029 return mem_cgroup_force_empty(memcg, false);
4955} 5030}
4956 5031
4957static void mem_cgroup_destroy(struct cgroup_subsys *ss, 5032static void mem_cgroup_destroy(struct cgroup *cont)
4958 struct cgroup *cont)
4959{ 5033{
4960 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); 5034 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
4961 5035
4962 kmem_cgroup_destroy(ss, cont); 5036 kmem_cgroup_destroy(cont);
4963 5037
4964 mem_cgroup_put(memcg); 5038 mem_cgroup_put(memcg);
4965} 5039}
@@ -5036,7 +5110,7 @@ one_by_one:
5036} 5110}
5037 5111
5038/** 5112/**
5039 * is_target_pte_for_mc - check a pte whether it is valid for move charge 5113 * get_mctgt_type - get target type of moving charge
5040 * @vma: the vma the pte to be checked belongs 5114 * @vma: the vma the pte to be checked belongs
5041 * @addr: the address corresponding to the pte to be checked 5115 * @addr: the address corresponding to the pte to be checked
5042 * @ptent: the pte to be checked 5116 * @ptent: the pte to be checked
@@ -5059,7 +5133,7 @@ union mc_target {
5059}; 5133};
5060 5134
5061enum mc_target_type { 5135enum mc_target_type {
5062 MC_TARGET_NONE, /* not used */ 5136 MC_TARGET_NONE = 0,
5063 MC_TARGET_PAGE, 5137 MC_TARGET_PAGE,
5064 MC_TARGET_SWAP, 5138 MC_TARGET_SWAP,
5065}; 5139};
@@ -5140,12 +5214,12 @@ static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
5140 return page; 5214 return page;
5141} 5215}
5142 5216
5143static int is_target_pte_for_mc(struct vm_area_struct *vma, 5217static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
5144 unsigned long addr, pte_t ptent, union mc_target *target) 5218 unsigned long addr, pte_t ptent, union mc_target *target)
5145{ 5219{
5146 struct page *page = NULL; 5220 struct page *page = NULL;
5147 struct page_cgroup *pc; 5221 struct page_cgroup *pc;
5148 int ret = 0; 5222 enum mc_target_type ret = MC_TARGET_NONE;
5149 swp_entry_t ent = { .val = 0 }; 5223 swp_entry_t ent = { .val = 0 };
5150 5224
5151 if (pte_present(ptent)) 5225 if (pte_present(ptent))
@@ -5156,7 +5230,7 @@ static int is_target_pte_for_mc(struct vm_area_struct *vma,
5156 page = mc_handle_file_pte(vma, addr, ptent, &ent); 5230 page = mc_handle_file_pte(vma, addr, ptent, &ent);
5157 5231
5158 if (!page && !ent.val) 5232 if (!page && !ent.val)
5159 return 0; 5233 return ret;
5160 if (page) { 5234 if (page) {
5161 pc = lookup_page_cgroup(page); 5235 pc = lookup_page_cgroup(page);
5162 /* 5236 /*
@@ -5182,6 +5256,41 @@ static int is_target_pte_for_mc(struct vm_area_struct *vma,
5182 return ret; 5256 return ret;
5183} 5257}
5184 5258
5259#ifdef CONFIG_TRANSPARENT_HUGEPAGE
5260/*
5261 * We don't consider swapping or file mapped pages because THP does not
5262 * support them for now.
5263 * Caller should make sure that pmd_trans_huge(pmd) is true.
5264 */
5265static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
5266 unsigned long addr, pmd_t pmd, union mc_target *target)
5267{
5268 struct page *page = NULL;
5269 struct page_cgroup *pc;
5270 enum mc_target_type ret = MC_TARGET_NONE;
5271
5272 page = pmd_page(pmd);
5273 VM_BUG_ON(!page || !PageHead(page));
5274 if (!move_anon())
5275 return ret;
5276 pc = lookup_page_cgroup(page);
5277 if (PageCgroupUsed(pc) && pc->mem_cgroup == mc.from) {
5278 ret = MC_TARGET_PAGE;
5279 if (target) {
5280 get_page(page);
5281 target->page = page;
5282 }
5283 }
5284 return ret;
5285}
5286#else
5287static inline enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
5288 unsigned long addr, pmd_t pmd, union mc_target *target)
5289{
5290 return MC_TARGET_NONE;
5291}
5292#endif
5293
5185static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd, 5294static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
5186 unsigned long addr, unsigned long end, 5295 unsigned long addr, unsigned long end,
5187 struct mm_walk *walk) 5296 struct mm_walk *walk)
@@ -5190,11 +5299,16 @@ static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
5190 pte_t *pte; 5299 pte_t *pte;
5191 spinlock_t *ptl; 5300 spinlock_t *ptl;
5192 5301
5193 split_huge_page_pmd(walk->mm, pmd); 5302 if (pmd_trans_huge_lock(pmd, vma) == 1) {
5303 if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE)
5304 mc.precharge += HPAGE_PMD_NR;
5305 spin_unlock(&vma->vm_mm->page_table_lock);
5306 return 0;
5307 }
5194 5308
5195 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); 5309 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
5196 for (; addr != end; pte++, addr += PAGE_SIZE) 5310 for (; addr != end; pte++, addr += PAGE_SIZE)
5197 if (is_target_pte_for_mc(vma, addr, *pte, NULL)) 5311 if (get_mctgt_type(vma, addr, *pte, NULL))
5198 mc.precharge++; /* increment precharge temporarily */ 5312 mc.precharge++; /* increment precharge temporarily */
5199 pte_unmap_unlock(pte - 1, ptl); 5313 pte_unmap_unlock(pte - 1, ptl);
5200 cond_resched(); 5314 cond_resched();
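Both page-table walkers in this file now follow the same shape: try pmd_trans_huge_lock() first, and only fall back to the per-pte loop when the pmd is not a stable transparent huge page. An illustrative skeleton of that shape (hypothetical walker name; the real callbacks above and below carry the memcg-specific logic):

#include <linux/mm.h>
#include <linux/huge_mm.h>
#include <linux/sched.h>

static int example_pmd_entry(pmd_t *pmd, unsigned long addr,
                             unsigned long end, struct mm_walk *walk)
{
        struct vm_area_struct *vma = walk->private;
        pte_t *pte;
        spinlock_t *ptl;

        if (pmd_trans_huge_lock(pmd, vma) == 1) {
                /* Stable huge pmd; page_table_lock is held by the helper. */
                /* ... act on the whole huge page, e.g. via pmd_page(*pmd) ... */
                spin_unlock(&vma->vm_mm->page_table_lock);
                return 0;
        }

        /* Not (or no longer) a stable huge pmd: walk the ptes instead. */
        pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
        for (; addr != end; pte++, addr += PAGE_SIZE)
                ;       /* ... per-pte work on *pte goes here ... */
        pte_unmap_unlock(pte - 1, ptl);
        cond_resched();
        return 0;
}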
@@ -5296,9 +5410,8 @@ static void mem_cgroup_clear_mc(void)
5296 mem_cgroup_end_move(from); 5410 mem_cgroup_end_move(from);
5297} 5411}
5298 5412
5299static int mem_cgroup_can_attach(struct cgroup_subsys *ss, 5413static int mem_cgroup_can_attach(struct cgroup *cgroup,
5300 struct cgroup *cgroup, 5414 struct cgroup_taskset *tset)
5301 struct cgroup_taskset *tset)
5302{ 5415{
5303 struct task_struct *p = cgroup_taskset_first(tset); 5416 struct task_struct *p = cgroup_taskset_first(tset);
5304 int ret = 0; 5417 int ret = 0;
@@ -5336,9 +5449,8 @@ static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
5336 return ret; 5449 return ret;
5337} 5450}
5338 5451
5339static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss, 5452static void mem_cgroup_cancel_attach(struct cgroup *cgroup,
5340 struct cgroup *cgroup, 5453 struct cgroup_taskset *tset)
5341 struct cgroup_taskset *tset)
5342{ 5454{
5343 mem_cgroup_clear_mc(); 5455 mem_cgroup_clear_mc();
5344} 5456}
@@ -5351,23 +5463,55 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
5351 struct vm_area_struct *vma = walk->private; 5463 struct vm_area_struct *vma = walk->private;
5352 pte_t *pte; 5464 pte_t *pte;
5353 spinlock_t *ptl; 5465 spinlock_t *ptl;
5466 enum mc_target_type target_type;
5467 union mc_target target;
5468 struct page *page;
5469 struct page_cgroup *pc;
5470
5471 /*
5472 * We don't take compound_lock() here but no race with splitting thp
5473 * happens because:
5474 * - if pmd_trans_huge_lock() returns 1, the relevant thp is not
5475 * under splitting, which means there's no concurrent thp split,
5476 * - if another thread runs into split_huge_page() just after we
5477 * entered this if-block, the thread must wait for page table lock
5478 * to be unlocked in __split_huge_page_splitting(), where the main
5479 * part of thp split is not executed yet.
5480 */
5481 if (pmd_trans_huge_lock(pmd, vma) == 1) {
5482 if (!mc.precharge) {
5483 spin_unlock(&vma->vm_mm->page_table_lock);
5484 return 0;
5485 }
5486 target_type = get_mctgt_type_thp(vma, addr, *pmd, &target);
5487 if (target_type == MC_TARGET_PAGE) {
5488 page = target.page;
5489 if (!isolate_lru_page(page)) {
5490 pc = lookup_page_cgroup(page);
5491 if (!mem_cgroup_move_account(page, HPAGE_PMD_NR,
5492 pc, mc.from, mc.to,
5493 false)) {
5494 mc.precharge -= HPAGE_PMD_NR;
5495 mc.moved_charge += HPAGE_PMD_NR;
5496 }
5497 putback_lru_page(page);
5498 }
5499 put_page(page);
5500 }
5501 spin_unlock(&vma->vm_mm->page_table_lock);
5502 return 0;
5503 }
5354 5504
5355 split_huge_page_pmd(walk->mm, pmd);
5356retry: 5505retry:
5357 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); 5506 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
5358 for (; addr != end; addr += PAGE_SIZE) { 5507 for (; addr != end; addr += PAGE_SIZE) {
5359 pte_t ptent = *(pte++); 5508 pte_t ptent = *(pte++);
5360 union mc_target target;
5361 int type;
5362 struct page *page;
5363 struct page_cgroup *pc;
5364 swp_entry_t ent; 5509 swp_entry_t ent;
5365 5510
5366 if (!mc.precharge) 5511 if (!mc.precharge)
5367 break; 5512 break;
5368 5513
5369 type = is_target_pte_for_mc(vma, addr, ptent, &target); 5514 switch (get_mctgt_type(vma, addr, ptent, &target)) {
5370 switch (type) {
5371 case MC_TARGET_PAGE: 5515 case MC_TARGET_PAGE:
5372 page = target.page; 5516 page = target.page;
5373 if (isolate_lru_page(page)) 5517 if (isolate_lru_page(page))
@@ -5380,7 +5524,7 @@ retry:
5380 mc.moved_charge++; 5524 mc.moved_charge++;
5381 } 5525 }
5382 putback_lru_page(page); 5526 putback_lru_page(page);
5383put: /* is_target_pte_for_mc() gets the page */ 5527put: /* get_mctgt_type() gets the page */
5384 put_page(page); 5528 put_page(page);
5385 break; 5529 break;
5386 case MC_TARGET_SWAP: 5530 case MC_TARGET_SWAP:
@@ -5453,9 +5597,8 @@ retry:
5453 up_read(&mm->mmap_sem); 5597 up_read(&mm->mmap_sem);
5454} 5598}
5455 5599
5456static void mem_cgroup_move_task(struct cgroup_subsys *ss, 5600static void mem_cgroup_move_task(struct cgroup *cont,
5457 struct cgroup *cont, 5601 struct cgroup_taskset *tset)
5458 struct cgroup_taskset *tset)
5459{ 5602{
5460 struct task_struct *p = cgroup_taskset_first(tset); 5603 struct task_struct *p = cgroup_taskset_first(tset);
5461 struct mm_struct *mm = get_task_mm(p); 5604 struct mm_struct *mm = get_task_mm(p);
@@ -5470,20 +5613,17 @@ static void mem_cgroup_move_task(struct cgroup_subsys *ss,
5470 mem_cgroup_clear_mc(); 5613 mem_cgroup_clear_mc();
5471} 5614}
5472#else /* !CONFIG_MMU */ 5615#else /* !CONFIG_MMU */
5473static int mem_cgroup_can_attach(struct cgroup_subsys *ss, 5616static int mem_cgroup_can_attach(struct cgroup *cgroup,
5474 struct cgroup *cgroup, 5617 struct cgroup_taskset *tset)
5475 struct cgroup_taskset *tset)
5476{ 5618{
5477 return 0; 5619 return 0;
5478} 5620}
5479static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss, 5621static void mem_cgroup_cancel_attach(struct cgroup *cgroup,
5480 struct cgroup *cgroup, 5622 struct cgroup_taskset *tset)
5481 struct cgroup_taskset *tset)
5482{ 5623{
5483} 5624}
5484static void mem_cgroup_move_task(struct cgroup_subsys *ss, 5625static void mem_cgroup_move_task(struct cgroup *cont,
5485 struct cgroup *cont, 5626 struct cgroup_taskset *tset)
5486 struct cgroup_taskset *tset)
5487{ 5627{
5488} 5628}
5489#endif 5629#endif
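One small but deliberate detail above: the old helper returned a plain int and marked MC_TARGET_NONE as "not used", while get_mctgt_type() now returns the enum itself with MC_TARGET_NONE pinned to 0, so callers that only care whether there is anything to move can keep treating the result as a boolean. A self-contained toy showing the same convention (the names here are invented, not the kernel's):

enum target_type { TARGET_NONE = 0, TARGET_PAGE, TARGET_SWAP };

static enum target_type classify(int has_page, int has_swap)
{
        if (has_page)
                return TARGET_PAGE;
        if (has_swap)
                return TARGET_SWAP;
        return TARGET_NONE;             /* == 0, so "if (classify(...))" still works */
}

static int demo(void)
{
        int hits = 0;

        if (classify(1, 0))             /* boolean use: the precharge-style caller */
                hits++;

        switch (classify(0, 1)) {       /* switch use: the move-charge-style caller */
        case TARGET_PAGE:
        case TARGET_SWAP:
                hits++;
                break;
        case TARGET_NONE:
                break;
        }
        return hits;                    /* 2 */
}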
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 56080ea3614..97cc2733551 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -187,33 +187,40 @@ int hwpoison_filter(struct page *p)
187EXPORT_SYMBOL_GPL(hwpoison_filter); 187EXPORT_SYMBOL_GPL(hwpoison_filter);
188 188
189/* 189/*
190 * Send all the processes who have the page mapped an ``action optional'' 190 * Send all the processes who have the page mapped a signal.
191 * signal. 191 * ``action optional'' if they are not immediately affected by the error
192 * ``action required'' if error happened in current execution context
192 */ 193 */
193static int kill_proc_ao(struct task_struct *t, unsigned long addr, int trapno, 194static int kill_proc(struct task_struct *t, unsigned long addr, int trapno,
194 unsigned long pfn, struct page *page) 195 unsigned long pfn, struct page *page, int flags)
195{ 196{
196 struct siginfo si; 197 struct siginfo si;
197 int ret; 198 int ret;
198 199
199 printk(KERN_ERR 200 printk(KERN_ERR
200 "MCE %#lx: Killing %s:%d early due to hardware memory corruption\n", 201 "MCE %#lx: Killing %s:%d due to hardware memory corruption\n",
201 pfn, t->comm, t->pid); 202 pfn, t->comm, t->pid);
202 si.si_signo = SIGBUS; 203 si.si_signo = SIGBUS;
203 si.si_errno = 0; 204 si.si_errno = 0;
204 si.si_code = BUS_MCEERR_AO;
205 si.si_addr = (void *)addr; 205 si.si_addr = (void *)addr;
206#ifdef __ARCH_SI_TRAPNO 206#ifdef __ARCH_SI_TRAPNO
207 si.si_trapno = trapno; 207 si.si_trapno = trapno;
208#endif 208#endif
209 si.si_addr_lsb = compound_trans_order(compound_head(page)) + PAGE_SHIFT; 209 si.si_addr_lsb = compound_trans_order(compound_head(page)) + PAGE_SHIFT;
210 /* 210
211 * Don't use force here, it's convenient if the signal 211 if ((flags & MF_ACTION_REQUIRED) && t == current) {
212 * can be temporarily blocked. 212 si.si_code = BUS_MCEERR_AR;
213 * This could cause a loop when the user sets SIGBUS 213 ret = force_sig_info(SIGBUS, &si, t);
214 * to SIG_IGN, but hopefully no one will do that? 214 } else {
215 */ 215 /*
216 ret = send_sig_info(SIGBUS, &si, t); /* synchronous? */ 216 * Don't use force here, it's convenient if the signal
217 * can be temporarily blocked.
218 * This could cause a loop when the user sets SIGBUS
219 * to SIG_IGN, but hopefully no one will do that?
220 */
221 si.si_code = BUS_MCEERR_AO;
222 ret = send_sig_info(SIGBUS, &si, t); /* synchronous? */
223 }
217 if (ret < 0) 224 if (ret < 0)
218 printk(KERN_INFO "MCE: Error sending signal to %s:%d: %d\n", 225 printk(KERN_INFO "MCE: Error sending signal to %s:%d: %d\n",
219 t->comm, t->pid, ret); 226 t->comm, t->pid, ret);
@@ -338,8 +345,9 @@ static void add_to_kill(struct task_struct *tsk, struct page *p,
338 * Also when FAIL is set do a force kill because something went 345 * Also when FAIL is set do a force kill because something went
339 * wrong earlier. 346 * wrong earlier.
340 */ 347 */
341static void kill_procs_ao(struct list_head *to_kill, int doit, int trapno, 348static void kill_procs(struct list_head *to_kill, int doit, int trapno,
342 int fail, struct page *page, unsigned long pfn) 349 int fail, struct page *page, unsigned long pfn,
350 int flags)
343{ 351{
344 struct to_kill *tk, *next; 352 struct to_kill *tk, *next;
345 353
@@ -363,8 +371,8 @@ static void kill_procs_ao(struct list_head *to_kill, int doit, int trapno,
363 * check for that, but we need to tell the 371 * check for that, but we need to tell the
364 * process anyways. 372 * process anyways.
365 */ 373 */
366 else if (kill_proc_ao(tk->tsk, tk->addr, trapno, 374 else if (kill_proc(tk->tsk, tk->addr, trapno,
367 pfn, page) < 0) 375 pfn, page, flags) < 0)
368 printk(KERN_ERR 376 printk(KERN_ERR
369 "MCE %#lx: Cannot send advisory machine check signal to %s:%d\n", 377 "MCE %#lx: Cannot send advisory machine check signal to %s:%d\n",
370 pfn, tk->tsk->comm, tk->tsk->pid); 378 pfn, tk->tsk->comm, tk->tsk->pid);
@@ -844,7 +852,7 @@ static int page_action(struct page_state *ps, struct page *p,
844 * the pages and send SIGBUS to the processes if the data was dirty. 852 * the pages and send SIGBUS to the processes if the data was dirty.
845 */ 853 */
846static int hwpoison_user_mappings(struct page *p, unsigned long pfn, 854static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
847 int trapno) 855 int trapno, int flags)
848{ 856{
849 enum ttu_flags ttu = TTU_UNMAP | TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS; 857 enum ttu_flags ttu = TTU_UNMAP | TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS;
850 struct address_space *mapping; 858 struct address_space *mapping;
@@ -962,8 +970,8 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
962 * use a more force-full uncatchable kill to prevent 970 * use a more force-full uncatchable kill to prevent
963 * any accesses to the poisoned memory. 971 * any accesses to the poisoned memory.
964 */ 972 */
965 kill_procs_ao(&tokill, !!PageDirty(ppage), trapno, 973 kill_procs(&tokill, !!PageDirty(ppage), trapno,
966 ret != SWAP_SUCCESS, p, pfn); 974 ret != SWAP_SUCCESS, p, pfn, flags);
967 975
968 return ret; 976 return ret;
969} 977}
@@ -984,7 +992,25 @@ static void clear_page_hwpoison_huge_page(struct page *hpage)
984 ClearPageHWPoison(hpage + i); 992 ClearPageHWPoison(hpage + i);
985} 993}
986 994
987int __memory_failure(unsigned long pfn, int trapno, int flags) 995/**
996 * memory_failure - Handle memory failure of a page.
997 * @pfn: Page Number of the corrupted page
998 * @trapno: Trap number reported in the signal to user space.
999 * @flags: fine tune action taken
1000 *
1001 * This function is called by the low level machine check code
1002 * of an architecture when it detects hardware memory corruption
1003 * of a page. It tries its best to recover, which includes
1004 * dropping pages, killing processes etc.
1005 *
1006 * The function is primarily of use for corruptions that
1007 * happen outside the current execution context (e.g. when
1008 * detected by a background scrubber)
1009 *
1010 * Must run in process context (e.g. a work queue) with interrupts
 1011 * enabled and no spinlocks held.
1012 */
1013int memory_failure(unsigned long pfn, int trapno, int flags)
988{ 1014{
989 struct page_state *ps; 1015 struct page_state *ps;
990 struct page *p; 1016 struct page *p;
@@ -1063,7 +1089,7 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
1063 * The check (unnecessarily) ignores LRU pages being isolated and 1089 * The check (unnecessarily) ignores LRU pages being isolated and
1064 * walked by the page reclaim code, however that's not a big loss. 1090 * walked by the page reclaim code, however that's not a big loss.
1065 */ 1091 */
1066 if (!PageHuge(p) && !PageTransCompound(p)) { 1092 if (!PageHuge(p) && !PageTransTail(p)) {
1067 if (!PageLRU(p)) 1093 if (!PageLRU(p))
1068 shake_page(p, 0); 1094 shake_page(p, 0);
1069 if (!PageLRU(p)) { 1095 if (!PageLRU(p)) {
@@ -1130,7 +1156,7 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
1130 * Now take care of user space mappings. 1156 * Now take care of user space mappings.
1131 * Abort on fail: __delete_from_page_cache() assumes unmapped page. 1157 * Abort on fail: __delete_from_page_cache() assumes unmapped page.
1132 */ 1158 */
1133 if (hwpoison_user_mappings(p, pfn, trapno) != SWAP_SUCCESS) { 1159 if (hwpoison_user_mappings(p, pfn, trapno, flags) != SWAP_SUCCESS) {
1134 printk(KERN_ERR "MCE %#lx: cannot unmap page, give up\n", pfn); 1160 printk(KERN_ERR "MCE %#lx: cannot unmap page, give up\n", pfn);
1135 res = -EBUSY; 1161 res = -EBUSY;
1136 goto out; 1162 goto out;
@@ -1156,29 +1182,7 @@ out:
1156 unlock_page(hpage); 1182 unlock_page(hpage);
1157 return res; 1183 return res;
1158} 1184}
1159EXPORT_SYMBOL_GPL(__memory_failure); 1185EXPORT_SYMBOL_GPL(memory_failure);
1160
1161/**
1162 * memory_failure - Handle memory failure of a page.
1163 * @pfn: Page Number of the corrupted page
1164 * @trapno: Trap number reported in the signal to user space.
1165 *
1166 * This function is called by the low level machine check code
1167 * of an architecture when it detects hardware memory corruption
1168 * of a page. It tries its best to recover, which includes
1169 * dropping pages, killing processes etc.
1170 *
1171 * The function is primarily of use for corruptions that
1172 * happen outside the current execution context (e.g. when
1173 * detected by a background scrubber)
1174 *
1175 * Must run in process context (e.g. a work queue) with interrupts
1176 * enabled and no spinlocks hold.
1177 */
1178void memory_failure(unsigned long pfn, int trapno)
1179{
1180 __memory_failure(pfn, trapno, 0);
1181}
1182 1186
1183#define MEMORY_FAILURE_FIFO_ORDER 4 1187#define MEMORY_FAILURE_FIFO_ORDER 4
1184#define MEMORY_FAILURE_FIFO_SIZE (1 << MEMORY_FAILURE_FIFO_ORDER) 1188#define MEMORY_FAILURE_FIFO_SIZE (1 << MEMORY_FAILURE_FIFO_ORDER)
@@ -1251,7 +1255,7 @@ static void memory_failure_work_func(struct work_struct *work)
1251 spin_unlock_irqrestore(&mf_cpu->lock, proc_flags); 1255 spin_unlock_irqrestore(&mf_cpu->lock, proc_flags);
1252 if (!gotten) 1256 if (!gotten)
1253 break; 1257 break;
1254 __memory_failure(entry.pfn, entry.trapno, entry.flags); 1258 memory_failure(entry.pfn, entry.trapno, entry.flags);
1255 } 1259 }
1256} 1260}
1257 1261
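The core of the kill_proc() change is the choice of siginfo: BUS_MCEERR_AR delivered with force_sig_info() when the corruption was hit by the current execution context (MF_ACTION_REQUIRED), and BUS_MCEERR_AO delivered with send_sig_info() otherwise, so the task may block or handle the signal at its leisure. A condensed restatement of that decision as a hypothetical helper (not the patch's exact code; the trap number and error reporting are omitted):

#include <linux/mm.h>
#include <linux/sched.h>
#include <linux/signal.h>

static int deliver_mce_sigbus(struct task_struct *t, void *addr,
                              int addr_lsb, int flags)
{
        struct siginfo si;

        si.si_signo    = SIGBUS;
        si.si_errno    = 0;
        si.si_addr     = addr;
        si.si_addr_lsb = addr_lsb;      /* log2 granularity of the poisoned area */

        if ((flags & MF_ACTION_REQUIRED) && t == current) {
                /* The faulting context itself: the signal must not be blocked. */
                si.si_code = BUS_MCEERR_AR;
                return force_sig_info(SIGBUS, &si, t);
        }

        /* Advisory case: a blockable, ignorable heads-up is good enough. */
        si.si_code = BUS_MCEERR_AO;
        return send_sig_info(SIGBUS, &si, t);
}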
diff --git a/mm/memory.c b/mm/memory.c
index fa2f04e0337..6105f475fa8 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -125,17 +125,17 @@ core_initcall(init_zero_pfn);
125 125
126#if defined(SPLIT_RSS_COUNTING) 126#if defined(SPLIT_RSS_COUNTING)
127 127
128static void __sync_task_rss_stat(struct task_struct *task, struct mm_struct *mm) 128void sync_mm_rss(struct mm_struct *mm)
129{ 129{
130 int i; 130 int i;
131 131
132 for (i = 0; i < NR_MM_COUNTERS; i++) { 132 for (i = 0; i < NR_MM_COUNTERS; i++) {
133 if (task->rss_stat.count[i]) { 133 if (current->rss_stat.count[i]) {
134 add_mm_counter(mm, i, task->rss_stat.count[i]); 134 add_mm_counter(mm, i, current->rss_stat.count[i]);
135 task->rss_stat.count[i] = 0; 135 current->rss_stat.count[i] = 0;
136 } 136 }
137 } 137 }
138 task->rss_stat.events = 0; 138 current->rss_stat.events = 0;
139} 139}
140 140
141static void add_mm_counter_fast(struct mm_struct *mm, int member, int val) 141static void add_mm_counter_fast(struct mm_struct *mm, int member, int val)
@@ -157,30 +157,7 @@ static void check_sync_rss_stat(struct task_struct *task)
157 if (unlikely(task != current)) 157 if (unlikely(task != current))
158 return; 158 return;
159 if (unlikely(task->rss_stat.events++ > TASK_RSS_EVENTS_THRESH)) 159 if (unlikely(task->rss_stat.events++ > TASK_RSS_EVENTS_THRESH))
160 __sync_task_rss_stat(task, task->mm); 160 sync_mm_rss(task->mm);
161}
162
163unsigned long get_mm_counter(struct mm_struct *mm, int member)
164{
165 long val = 0;
166
167 /*
168 * Don't use task->mm here...for avoiding to use task_get_mm()..
169 * The caller must guarantee task->mm is not invalid.
170 */
171 val = atomic_long_read(&mm->rss_stat.count[member]);
172 /*
173 * counter is updated in asynchronous manner and may go to minus.
174 * But it's never be expected number for users.
175 */
176 if (val < 0)
177 return 0;
178 return (unsigned long)val;
179}
180
181void sync_mm_rss(struct task_struct *task, struct mm_struct *mm)
182{
183 __sync_task_rss_stat(task, mm);
184} 161}
185#else /* SPLIT_RSS_COUNTING */ 162#else /* SPLIT_RSS_COUNTING */
186 163
@@ -661,7 +638,7 @@ static inline void add_mm_rss_vec(struct mm_struct *mm, int *rss)
661 int i; 638 int i;
662 639
663 if (current->mm == mm) 640 if (current->mm == mm)
664 sync_mm_rss(current, mm); 641 sync_mm_rss(mm);
665 for (i = 0; i < NR_MM_COUNTERS; i++) 642 for (i = 0; i < NR_MM_COUNTERS; i++)
666 if (rss[i]) 643 if (rss[i])
667 add_mm_counter(mm, i, rss[i]); 644 add_mm_counter(mm, i, rss[i]);
@@ -1247,16 +1224,24 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
1247 do { 1224 do {
1248 next = pmd_addr_end(addr, end); 1225 next = pmd_addr_end(addr, end);
1249 if (pmd_trans_huge(*pmd)) { 1226 if (pmd_trans_huge(*pmd)) {
1250 if (next-addr != HPAGE_PMD_SIZE) { 1227 if (next - addr != HPAGE_PMD_SIZE) {
1251 VM_BUG_ON(!rwsem_is_locked(&tlb->mm->mmap_sem)); 1228 VM_BUG_ON(!rwsem_is_locked(&tlb->mm->mmap_sem));
1252 split_huge_page_pmd(vma->vm_mm, pmd); 1229 split_huge_page_pmd(vma->vm_mm, pmd);
1253 } else if (zap_huge_pmd(tlb, vma, pmd, addr)) 1230 } else if (zap_huge_pmd(tlb, vma, pmd, addr))
1254 continue; 1231 goto next;
1255 /* fall through */ 1232 /* fall through */
1256 } 1233 }
1257 if (pmd_none_or_clear_bad(pmd)) 1234 /*
1258 continue; 1235 * Here there can be other concurrent MADV_DONTNEED or
1236 * trans huge page faults running, and if the pmd is
1237 * none or trans huge it can change under us. This is
1238 * because MADV_DONTNEED holds the mmap_sem in read
1239 * mode.
1240 */
1241 if (pmd_none_or_trans_huge_or_clear_bad(pmd))
1242 goto next;
1259 next = zap_pte_range(tlb, vma, pmd, addr, next, details); 1243 next = zap_pte_range(tlb, vma, pmd, addr, next, details);
1244next:
1260 cond_resched(); 1245 cond_resched();
1261 } while (pmd++, addr = next, addr != end); 1246 } while (pmd++, addr = next, addr != end);
1262 1247
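The new pmd_none_or_trans_huge_or_clear_bad() check (also adopted below in mempolicy.c and mincore.c) exists for exactly the race the comment describes: with only mmap_sem held for read, a pmd can turn none or trans-huge underneath the walker, and the old pmd_none_or_clear_bad() would have reported such a pmd as corrupt. A generic sketch of a pmd-level loop using it (illustrative function, not from the patch):

#include <linux/mm.h>
#include <asm/pgtable.h>

static void example_walk_pmds(pud_t *pud, unsigned long addr, unsigned long end)
{
        pmd_t *pmd = pmd_offset(pud, addr);
        unsigned long next;

        do {
                next = pmd_addr_end(addr, end);
                /*
                 * Under mmap_sem held for read, concurrent MADV_DONTNEED or
                 * THP faults may flip this pmd to none or trans huge; skip
                 * it quietly instead of flagging it as a bad pmd.
                 */
                if (pmd_none_or_trans_huge_or_clear_bad(pmd))
                        continue;
                /* ... descend to the pte level for [addr, next) ... */
        } while (pmd++, addr = next, addr != end);
}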
@@ -1282,10 +1267,10 @@ static inline unsigned long zap_pud_range(struct mmu_gather *tlb,
1282 return addr; 1267 return addr;
1283} 1268}
1284 1269
1285static unsigned long unmap_page_range(struct mmu_gather *tlb, 1270static void unmap_page_range(struct mmu_gather *tlb,
1286 struct vm_area_struct *vma, 1271 struct vm_area_struct *vma,
1287 unsigned long addr, unsigned long end, 1272 unsigned long addr, unsigned long end,
1288 struct zap_details *details) 1273 struct zap_details *details)
1289{ 1274{
1290 pgd_t *pgd; 1275 pgd_t *pgd;
1291 unsigned long next; 1276 unsigned long next;
@@ -1305,8 +1290,47 @@ static unsigned long unmap_page_range(struct mmu_gather *tlb,
1305 } while (pgd++, addr = next, addr != end); 1290 } while (pgd++, addr = next, addr != end);
1306 tlb_end_vma(tlb, vma); 1291 tlb_end_vma(tlb, vma);
1307 mem_cgroup_uncharge_end(); 1292 mem_cgroup_uncharge_end();
1293}
1308 1294
1309 return addr; 1295
1296static void unmap_single_vma(struct mmu_gather *tlb,
1297 struct vm_area_struct *vma, unsigned long start_addr,
1298 unsigned long end_addr, unsigned long *nr_accounted,
1299 struct zap_details *details)
1300{
1301 unsigned long start = max(vma->vm_start, start_addr);
1302 unsigned long end;
1303
1304 if (start >= vma->vm_end)
1305 return;
1306 end = min(vma->vm_end, end_addr);
1307 if (end <= vma->vm_start)
1308 return;
1309
1310 if (vma->vm_flags & VM_ACCOUNT)
1311 *nr_accounted += (end - start) >> PAGE_SHIFT;
1312
1313 if (unlikely(is_pfn_mapping(vma)))
1314 untrack_pfn_vma(vma, 0, 0);
1315
1316 if (start != end) {
1317 if (unlikely(is_vm_hugetlb_page(vma))) {
1318 /*
1319 * It is undesirable to test vma->vm_file as it
1320 * should be non-null for valid hugetlb area.
1321 * However, vm_file will be NULL in the error
1322 * cleanup path of do_mmap_pgoff. When
1323 * hugetlbfs ->mmap method fails,
1324 * do_mmap_pgoff() nullifies vma->vm_file
1325 * before calling this function to clean up.
1326 * Since no pte has actually been setup, it is
1327 * safe to do nothing in this case.
1328 */
1329 if (vma->vm_file)
1330 unmap_hugepage_range(vma, start, end, NULL);
1331 } else
1332 unmap_page_range(tlb, vma, start, end, details);
1333 }
1310} 1334}
1311 1335
1312/** 1336/**
@@ -1318,8 +1342,6 @@ static unsigned long unmap_page_range(struct mmu_gather *tlb,
1318 * @nr_accounted: Place number of unmapped pages in vm-accountable vma's here 1342 * @nr_accounted: Place number of unmapped pages in vm-accountable vma's here
1319 * @details: details of nonlinear truncation or shared cache invalidation 1343 * @details: details of nonlinear truncation or shared cache invalidation
1320 * 1344 *
1321 * Returns the end address of the unmapping (restart addr if interrupted).
1322 *
1323 * Unmap all pages in the vma list. 1345 * Unmap all pages in the vma list.
1324 * 1346 *
1325 * Only addresses between `start' and `end' will be unmapped. 1347 * Only addresses between `start' and `end' will be unmapped.
@@ -1331,55 +1353,18 @@ static unsigned long unmap_page_range(struct mmu_gather *tlb,
1331 * ensure that any thus-far unmapped pages are flushed before unmap_vmas() 1353 * ensure that any thus-far unmapped pages are flushed before unmap_vmas()
1332 * drops the lock and schedules. 1354 * drops the lock and schedules.
1333 */ 1355 */
1334unsigned long unmap_vmas(struct mmu_gather *tlb, 1356void unmap_vmas(struct mmu_gather *tlb,
1335 struct vm_area_struct *vma, unsigned long start_addr, 1357 struct vm_area_struct *vma, unsigned long start_addr,
1336 unsigned long end_addr, unsigned long *nr_accounted, 1358 unsigned long end_addr, unsigned long *nr_accounted,
1337 struct zap_details *details) 1359 struct zap_details *details)
1338{ 1360{
1339 unsigned long start = start_addr;
1340 struct mm_struct *mm = vma->vm_mm; 1361 struct mm_struct *mm = vma->vm_mm;
1341 1362
1342 mmu_notifier_invalidate_range_start(mm, start_addr, end_addr); 1363 mmu_notifier_invalidate_range_start(mm, start_addr, end_addr);
1343 for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next) { 1364 for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next)
1344 unsigned long end; 1365 unmap_single_vma(tlb, vma, start_addr, end_addr, nr_accounted,
1345 1366 details);
1346 start = max(vma->vm_start, start_addr);
1347 if (start >= vma->vm_end)
1348 continue;
1349 end = min(vma->vm_end, end_addr);
1350 if (end <= vma->vm_start)
1351 continue;
1352
1353 if (vma->vm_flags & VM_ACCOUNT)
1354 *nr_accounted += (end - start) >> PAGE_SHIFT;
1355
1356 if (unlikely(is_pfn_mapping(vma)))
1357 untrack_pfn_vma(vma, 0, 0);
1358
1359 while (start != end) {
1360 if (unlikely(is_vm_hugetlb_page(vma))) {
1361 /*
1362 * It is undesirable to test vma->vm_file as it
1363 * should be non-null for valid hugetlb area.
1364 * However, vm_file will be NULL in the error
1365 * cleanup path of do_mmap_pgoff. When
1366 * hugetlbfs ->mmap method fails,
1367 * do_mmap_pgoff() nullifies vma->vm_file
1368 * before calling this function to clean up.
1369 * Since no pte has actually been setup, it is
1370 * safe to do nothing in this case.
1371 */
1372 if (vma->vm_file)
1373 unmap_hugepage_range(vma, start, end, NULL);
1374
1375 start = end;
1376 } else
1377 start = unmap_page_range(tlb, vma, start, end, details);
1378 }
1379 }
1380
1381 mmu_notifier_invalidate_range_end(mm, start_addr, end_addr); 1367 mmu_notifier_invalidate_range_end(mm, start_addr, end_addr);
1382 return start; /* which is now the end (or restart) address */
1383} 1368}
1384 1369
1385/** 1370/**
@@ -1388,8 +1373,10 @@ unsigned long unmap_vmas(struct mmu_gather *tlb,
1388 * @address: starting address of pages to zap 1373 * @address: starting address of pages to zap
1389 * @size: number of bytes to zap 1374 * @size: number of bytes to zap
1390 * @details: details of nonlinear truncation or shared cache invalidation 1375 * @details: details of nonlinear truncation or shared cache invalidation
1376 *
1377 * Caller must protect the VMA list
1391 */ 1378 */
1392unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address, 1379void zap_page_range(struct vm_area_struct *vma, unsigned long address,
1393 unsigned long size, struct zap_details *details) 1380 unsigned long size, struct zap_details *details)
1394{ 1381{
1395 struct mm_struct *mm = vma->vm_mm; 1382 struct mm_struct *mm = vma->vm_mm;
@@ -1400,9 +1387,34 @@ unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address,
1400 lru_add_drain(); 1387 lru_add_drain();
1401 tlb_gather_mmu(&tlb, mm, 0); 1388 tlb_gather_mmu(&tlb, mm, 0);
1402 update_hiwater_rss(mm); 1389 update_hiwater_rss(mm);
1403 end = unmap_vmas(&tlb, vma, address, end, &nr_accounted, details); 1390 unmap_vmas(&tlb, vma, address, end, &nr_accounted, details);
1391 tlb_finish_mmu(&tlb, address, end);
1392}
1393
1394/**
1395 * zap_page_range_single - remove user pages in a given range
1396 * @vma: vm_area_struct holding the applicable pages
1397 * @address: starting address of pages to zap
1398 * @size: number of bytes to zap
1399 * @details: details of nonlinear truncation or shared cache invalidation
1400 *
1401 * The range must fit into one VMA.
1402 */
1403static void zap_page_range_single(struct vm_area_struct *vma, unsigned long address,
1404 unsigned long size, struct zap_details *details)
1405{
1406 struct mm_struct *mm = vma->vm_mm;
1407 struct mmu_gather tlb;
1408 unsigned long end = address + size;
1409 unsigned long nr_accounted = 0;
1410
1411 lru_add_drain();
1412 tlb_gather_mmu(&tlb, mm, 0);
1413 update_hiwater_rss(mm);
1414 mmu_notifier_invalidate_range_start(mm, address, end);
1415 unmap_single_vma(&tlb, vma, address, end, &nr_accounted, details);
1416 mmu_notifier_invalidate_range_end(mm, address, end);
1404 tlb_finish_mmu(&tlb, address, end); 1417 tlb_finish_mmu(&tlb, address, end);
1405 return end;
1406} 1418}
1407 1419
1408/** 1420/**
@@ -1423,7 +1435,7 @@ int zap_vma_ptes(struct vm_area_struct *vma, unsigned long address,
1423 if (address < vma->vm_start || address + size > vma->vm_end || 1435 if (address < vma->vm_start || address + size > vma->vm_end ||
1424 !(vma->vm_flags & VM_PFNMAP)) 1436 !(vma->vm_flags & VM_PFNMAP))
1425 return -1; 1437 return -1;
1426 zap_page_range(vma, address, size, NULL); 1438 zap_page_range_single(vma, address, size, NULL);
1427 return 0; 1439 return 0;
1428} 1440}
1429EXPORT_SYMBOL_GPL(zap_vma_ptes); 1441EXPORT_SYMBOL_GPL(zap_vma_ptes);
@@ -2447,7 +2459,7 @@ static inline void cow_user_page(struct page *dst, struct page *src, unsigned lo
2447 * fails, we just zero-fill it. Live with it. 2459 * fails, we just zero-fill it. Live with it.
2448 */ 2460 */
2449 if (unlikely(!src)) { 2461 if (unlikely(!src)) {
2450 void *kaddr = kmap_atomic(dst, KM_USER0); 2462 void *kaddr = kmap_atomic(dst);
2451 void __user *uaddr = (void __user *)(va & PAGE_MASK); 2463 void __user *uaddr = (void __user *)(va & PAGE_MASK);
2452 2464
2453 /* 2465 /*
@@ -2458,7 +2470,7 @@ static inline void cow_user_page(struct page *dst, struct page *src, unsigned lo
2458 */ 2470 */
2459 if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE)) 2471 if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE))
2460 clear_page(kaddr); 2472 clear_page(kaddr);
2461 kunmap_atomic(kaddr, KM_USER0); 2473 kunmap_atomic(kaddr);
2462 flush_dcache_page(dst); 2474 flush_dcache_page(dst);
2463 } else 2475 } else
2464 copy_user_highpage(dst, src, va, vma); 2476 copy_user_highpage(dst, src, va, vma);
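The kmap_atomic()/kunmap_atomic() calls above reflect the tree-wide interface change this series rides on: the KM_USER0-style slot argument is gone, and the unmap takes the kernel address returned by the map. A small example of the new convention (hypothetical helper; it assumes the copy fits within one page):

#include <linux/highmem.h>
#include <linux/string.h>

static void copy_into_page(struct page *dst, const void *src, size_t len)
{
        void *kaddr = kmap_atomic(dst);         /* no KM_USER0 slot any more */

        memcpy(kaddr, src, len);                /* caller guarantees len <= PAGE_SIZE */
        kunmap_atomic(kaddr);                   /* takes the mapped address, not a slot */
        flush_dcache_page(dst);
}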
@@ -2770,7 +2782,7 @@ static void unmap_mapping_range_vma(struct vm_area_struct *vma,
2770 unsigned long start_addr, unsigned long end_addr, 2782 unsigned long start_addr, unsigned long end_addr,
2771 struct zap_details *details) 2783 struct zap_details *details)
2772{ 2784{
2773 zap_page_range(vma, start_addr, end_addr - start_addr, details); 2785 zap_page_range_single(vma, start_addr, end_addr - start_addr, details);
2774} 2786}
2775 2787
2776static inline void unmap_mapping_range_tree(struct prio_tree_root *root, 2788static inline void unmap_mapping_range_tree(struct prio_tree_root *root,
@@ -3611,13 +3623,7 @@ static int __init gate_vma_init(void)
3611 gate_vma.vm_end = FIXADDR_USER_END; 3623 gate_vma.vm_end = FIXADDR_USER_END;
3612 gate_vma.vm_flags = VM_READ | VM_MAYREAD | VM_EXEC | VM_MAYEXEC; 3624 gate_vma.vm_flags = VM_READ | VM_MAYREAD | VM_EXEC | VM_MAYEXEC;
3613 gate_vma.vm_page_prot = __P101; 3625 gate_vma.vm_page_prot = __P101;
3614 /* 3626
3615 * Make sure the vDSO gets into every core dump.
3616 * Dumping its contents makes post-mortem fully interpretable later
3617 * without matching up the same kernel and hardware config to see
3618 * what PC values meant.
3619 */
3620 gate_vma.vm_flags |= VM_ALWAYSDUMP;
3621 return 0; 3627 return 0;
3622} 3628}
3623__initcall(gate_vma_init); 3629__initcall(gate_vma_init);
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 06b145fb64a..cfb6c867875 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -512,7 +512,7 @@ static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
512 do { 512 do {
513 next = pmd_addr_end(addr, end); 513 next = pmd_addr_end(addr, end);
514 split_huge_page_pmd(vma->vm_mm, pmd); 514 split_huge_page_pmd(vma->vm_mm, pmd);
515 if (pmd_none_or_clear_bad(pmd)) 515 if (pmd_none_or_trans_huge_or_clear_bad(pmd))
516 continue; 516 continue;
517 if (check_pte_range(vma, pmd, addr, next, nodes, 517 if (check_pte_range(vma, pmd, addr, next, nodes,
518 flags, private)) 518 flags, private))
@@ -640,10 +640,11 @@ static int mbind_range(struct mm_struct *mm, unsigned long start,
640 unsigned long vmstart; 640 unsigned long vmstart;
641 unsigned long vmend; 641 unsigned long vmend;
642 642
643 vma = find_vma_prev(mm, start, &prev); 643 vma = find_vma(mm, start);
644 if (!vma || vma->vm_start > start) 644 if (!vma || vma->vm_start > start)
645 return -EFAULT; 645 return -EFAULT;
646 646
647 prev = vma->vm_prev;
647 if (start > vma->vm_start) 648 if (start > vma->vm_start)
648 prev = vma; 649 prev = vma;
649 650
@@ -1322,12 +1323,9 @@ SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
1322 err = -ESRCH; 1323 err = -ESRCH;
1323 goto out; 1324 goto out;
1324 } 1325 }
1325 mm = get_task_mm(task); 1326 get_task_struct(task);
1326 rcu_read_unlock();
1327 1327
1328 err = -EINVAL; 1328 err = -EINVAL;
1329 if (!mm)
1330 goto out;
1331 1329
1332 /* 1330 /*
1333 * Check if this process has the right to modify the specified 1331 * Check if this process has the right to modify the specified
@@ -1335,14 +1333,13 @@ SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
1335 * capabilities, superuser privileges or the same 1333 * capabilities, superuser privileges or the same
1336 * userid as the target process. 1334 * userid as the target process.
1337 */ 1335 */
1338 rcu_read_lock();
1339 tcred = __task_cred(task); 1336 tcred = __task_cred(task);
1340 if (cred->euid != tcred->suid && cred->euid != tcred->uid && 1337 if (cred->euid != tcred->suid && cred->euid != tcred->uid &&
1341 cred->uid != tcred->suid && cred->uid != tcred->uid && 1338 cred->uid != tcred->suid && cred->uid != tcred->uid &&
1342 !capable(CAP_SYS_NICE)) { 1339 !capable(CAP_SYS_NICE)) {
1343 rcu_read_unlock(); 1340 rcu_read_unlock();
1344 err = -EPERM; 1341 err = -EPERM;
1345 goto out; 1342 goto out_put;
1346 } 1343 }
1347 rcu_read_unlock(); 1344 rcu_read_unlock();
1348 1345
@@ -1350,26 +1347,36 @@ SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
1350 /* Is the user allowed to access the target nodes? */ 1347 /* Is the user allowed to access the target nodes? */
1351 if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) { 1348 if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) {
1352 err = -EPERM; 1349 err = -EPERM;
1353 goto out; 1350 goto out_put;
1354 } 1351 }
1355 1352
1356 if (!nodes_subset(*new, node_states[N_HIGH_MEMORY])) { 1353 if (!nodes_subset(*new, node_states[N_HIGH_MEMORY])) {
1357 err = -EINVAL; 1354 err = -EINVAL;
1358 goto out; 1355 goto out_put;
1359 } 1356 }
1360 1357
1361 err = security_task_movememory(task); 1358 err = security_task_movememory(task);
1362 if (err) 1359 if (err)
1363 goto out; 1360 goto out_put;
1364 1361
1365 err = do_migrate_pages(mm, old, new, 1362 mm = get_task_mm(task);
1366 capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE); 1363 put_task_struct(task);
1367out:
1368 if (mm) 1364 if (mm)
1369 mmput(mm); 1365 err = do_migrate_pages(mm, old, new,
1366 capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
1367 else
1368 err = -EINVAL;
1369
1370 mmput(mm);
1371out:
1370 NODEMASK_SCRATCH_FREE(scratch); 1372 NODEMASK_SCRATCH_FREE(scratch);
1371 1373
1372 return err; 1374 return err;
1375
1376out_put:
1377 put_task_struct(task);
1378 goto out;
1379
1373} 1380}
1374 1381
1375 1382
@@ -1843,18 +1850,24 @@ struct page *
1843alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, 1850alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
1844 unsigned long addr, int node) 1851 unsigned long addr, int node)
1845{ 1852{
1846 struct mempolicy *pol = get_vma_policy(current, vma, addr); 1853 struct mempolicy *pol;
1847 struct zonelist *zl; 1854 struct zonelist *zl;
1848 struct page *page; 1855 struct page *page;
1856 unsigned int cpuset_mems_cookie;
1857
1858retry_cpuset:
1859 pol = get_vma_policy(current, vma, addr);
1860 cpuset_mems_cookie = get_mems_allowed();
1849 1861
1850 get_mems_allowed();
1851 if (unlikely(pol->mode == MPOL_INTERLEAVE)) { 1862 if (unlikely(pol->mode == MPOL_INTERLEAVE)) {
1852 unsigned nid; 1863 unsigned nid;
1853 1864
1854 nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order); 1865 nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
1855 mpol_cond_put(pol); 1866 mpol_cond_put(pol);
1856 page = alloc_page_interleave(gfp, order, nid); 1867 page = alloc_page_interleave(gfp, order, nid);
1857 put_mems_allowed(); 1868 if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
1869 goto retry_cpuset;
1870
1858 return page; 1871 return page;
1859 } 1872 }
1860 zl = policy_zonelist(gfp, pol, node); 1873 zl = policy_zonelist(gfp, pol, node);
@@ -1865,7 +1878,8 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
1865 struct page *page = __alloc_pages_nodemask(gfp, order, 1878 struct page *page = __alloc_pages_nodemask(gfp, order,
1866 zl, policy_nodemask(gfp, pol)); 1879 zl, policy_nodemask(gfp, pol));
1867 __mpol_put(pol); 1880 __mpol_put(pol);
1868 put_mems_allowed(); 1881 if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
1882 goto retry_cpuset;
1869 return page; 1883 return page;
1870 } 1884 }
1871 /* 1885 /*
@@ -1873,7 +1887,8 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
1873 */ 1887 */
1874 page = __alloc_pages_nodemask(gfp, order, zl, 1888 page = __alloc_pages_nodemask(gfp, order, zl,
1875 policy_nodemask(gfp, pol)); 1889 policy_nodemask(gfp, pol));
1876 put_mems_allowed(); 1890 if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
1891 goto retry_cpuset;
1877 return page; 1892 return page;
1878} 1893}
1879 1894
@@ -1900,11 +1915,14 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order)
1900{ 1915{
1901 struct mempolicy *pol = current->mempolicy; 1916 struct mempolicy *pol = current->mempolicy;
1902 struct page *page; 1917 struct page *page;
1918 unsigned int cpuset_mems_cookie;
1903 1919
1904 if (!pol || in_interrupt() || (gfp & __GFP_THISNODE)) 1920 if (!pol || in_interrupt() || (gfp & __GFP_THISNODE))
1905 pol = &default_policy; 1921 pol = &default_policy;
1906 1922
1907 get_mems_allowed(); 1923retry_cpuset:
1924 cpuset_mems_cookie = get_mems_allowed();
1925
1908 /* 1926 /*
1909 * No reference counting needed for current->mempolicy 1927 * No reference counting needed for current->mempolicy
1910 * nor system default_policy 1928 * nor system default_policy
@@ -1915,7 +1933,10 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order)
1915 page = __alloc_pages_nodemask(gfp, order, 1933 page = __alloc_pages_nodemask(gfp, order,
1916 policy_zonelist(gfp, pol, numa_node_id()), 1934 policy_zonelist(gfp, pol, numa_node_id()),
1917 policy_nodemask(gfp, pol)); 1935 policy_nodemask(gfp, pol));
1918 put_mems_allowed(); 1936
1937 if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
1938 goto retry_cpuset;
1939
1919 return page; 1940 return page;
1920} 1941}
1921EXPORT_SYMBOL(alloc_pages_current); 1942EXPORT_SYMBOL(alloc_pages_current);
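All of the allocation paths in this file now use the same seqcount-style idiom: sample a cookie with get_mems_allowed(), allocate, and retry only if put_mems_allowed() reports that the cpuset's allowed nodes changed mid-allocation and the allocation also failed, so a failure caused purely by a stale nodemask is never returned to the caller. A minimal sketch of the idiom with an invented wrapper name:

#include <linux/cpuset.h>
#include <linux/gfp.h>

static struct page *alloc_pages_stable_mems(gfp_t gfp, unsigned int order)
{
        struct page *page;
        unsigned int cpuset_mems_cookie;

retry_cpuset:
        cpuset_mems_cookie = get_mems_allowed();        /* begin read section */

        page = alloc_pages(gfp, order);

        /*
         * put_mems_allowed() returns false if mems_allowed was rewritten
         * while we were allocating; only then can a NULL page be blamed on
         * the stale nodemask rather than on genuine memory pressure.
         */
        if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
                goto retry_cpuset;

        return page;
}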
diff --git a/mm/migrate.c b/mm/migrate.c
index 9871a56d82c..51c08a0c6f6 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -445,7 +445,6 @@ void migrate_page_copy(struct page *newpage, struct page *page)
445 ClearPageSwapCache(page); 445 ClearPageSwapCache(page);
446 ClearPagePrivate(page); 446 ClearPagePrivate(page);
447 set_page_private(page, 0); 447 set_page_private(page, 0);
448 page->mapping = NULL;
449 448
450 /* 449 /*
451 * If any waiters have accumulated on the new page then 450 * If any waiters have accumulated on the new page then
@@ -667,6 +666,7 @@ static int move_to_new_page(struct page *newpage, struct page *page,
667 } else { 666 } else {
668 if (remap_swapcache) 667 if (remap_swapcache)
669 remove_migration_ptes(page, newpage); 668 remove_migration_ptes(page, newpage);
669 page->mapping = NULL;
670 } 670 }
671 671
672 unlock_page(newpage); 672 unlock_page(newpage);
@@ -839,8 +839,6 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
839 if (!newpage) 839 if (!newpage)
840 return -ENOMEM; 840 return -ENOMEM;
841 841
842 mem_cgroup_reset_owner(newpage);
843
844 if (page_count(page) == 1) { 842 if (page_count(page) == 1) {
845 /* page was freed from under us. So we are done. */ 843 /* page was freed from under us. So we are done. */
846 goto out; 844 goto out;
@@ -1176,20 +1174,17 @@ set_status:
1176 * Migrate an array of page address onto an array of nodes and fill 1174 * Migrate an array of page address onto an array of nodes and fill
1177 * the corresponding array of status. 1175 * the corresponding array of status.
1178 */ 1176 */
1179static int do_pages_move(struct mm_struct *mm, struct task_struct *task, 1177static int do_pages_move(struct mm_struct *mm, nodemask_t task_nodes,
1180 unsigned long nr_pages, 1178 unsigned long nr_pages,
1181 const void __user * __user *pages, 1179 const void __user * __user *pages,
1182 const int __user *nodes, 1180 const int __user *nodes,
1183 int __user *status, int flags) 1181 int __user *status, int flags)
1184{ 1182{
1185 struct page_to_node *pm; 1183 struct page_to_node *pm;
1186 nodemask_t task_nodes;
1187 unsigned long chunk_nr_pages; 1184 unsigned long chunk_nr_pages;
1188 unsigned long chunk_start; 1185 unsigned long chunk_start;
1189 int err; 1186 int err;
1190 1187
1191 task_nodes = cpuset_mems_allowed(task);
1192
1193 err = -ENOMEM; 1188 err = -ENOMEM;
1194 pm = (struct page_to_node *)__get_free_page(GFP_KERNEL); 1189 pm = (struct page_to_node *)__get_free_page(GFP_KERNEL);
1195 if (!pm) 1190 if (!pm)
@@ -1351,6 +1346,7 @@ SYSCALL_DEFINE6(move_pages, pid_t, pid, unsigned long, nr_pages,
1351 struct task_struct *task; 1346 struct task_struct *task;
1352 struct mm_struct *mm; 1347 struct mm_struct *mm;
1353 int err; 1348 int err;
1349 nodemask_t task_nodes;
1354 1350
1355 /* Check flags */ 1351 /* Check flags */
1356 if (flags & ~(MPOL_MF_MOVE|MPOL_MF_MOVE_ALL)) 1352 if (flags & ~(MPOL_MF_MOVE|MPOL_MF_MOVE_ALL))
@@ -1366,11 +1362,7 @@ SYSCALL_DEFINE6(move_pages, pid_t, pid, unsigned long, nr_pages,
1366 rcu_read_unlock(); 1362 rcu_read_unlock();
1367 return -ESRCH; 1363 return -ESRCH;
1368 } 1364 }
1369 mm = get_task_mm(task); 1365 get_task_struct(task);
1370 rcu_read_unlock();
1371
1372 if (!mm)
1373 return -EINVAL;
1374 1366
1375 /* 1367 /*
1376 * Check if this process has the right to modify the specified 1368 * Check if this process has the right to modify the specified
@@ -1378,7 +1370,6 @@ SYSCALL_DEFINE6(move_pages, pid_t, pid, unsigned long, nr_pages,
1378 * capabilities, superuser privileges or the same 1370 * capabilities, superuser privileges or the same
1379 * userid as the target process. 1371 * userid as the target process.
1380 */ 1372 */
1381 rcu_read_lock();
1382 tcred = __task_cred(task); 1373 tcred = __task_cred(task);
1383 if (cred->euid != tcred->suid && cred->euid != tcred->uid && 1374 if (cred->euid != tcred->suid && cred->euid != tcred->uid &&
1384 cred->uid != tcred->suid && cred->uid != tcred->uid && 1375 cred->uid != tcred->suid && cred->uid != tcred->uid &&
@@ -1393,16 +1384,25 @@ SYSCALL_DEFINE6(move_pages, pid_t, pid, unsigned long, nr_pages,
1393 if (err) 1384 if (err)
1394 goto out; 1385 goto out;
1395 1386
1396 if (nodes) { 1387 task_nodes = cpuset_mems_allowed(task);
1397 err = do_pages_move(mm, task, nr_pages, pages, nodes, status, 1388 mm = get_task_mm(task);
1398 flags); 1389 put_task_struct(task);
1399 } else { 1390
1400 err = do_pages_stat(mm, nr_pages, pages, status); 1391 if (mm) {
1401 } 1392 if (nodes)
1393 err = do_pages_move(mm, task_nodes, nr_pages, pages,
1394 nodes, status, flags);
1395 else
1396 err = do_pages_stat(mm, nr_pages, pages, status);
1397 } else
1398 err = -EINVAL;
1402 1399
1403out:
1404 mmput(mm); 1400 mmput(mm);
1405 return err; 1401 return err;
1402
1403out:
1404 put_task_struct(task);
1405 return err;
1406} 1406}
1407 1407
1408/* 1408/*
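sys_move_pages() here and sys_migrate_pages() in mempolicy.c above now share the same reference discipline: pin the task with get_task_struct() so the RCU read section can end before any sleeping work, read cpuset_mems_allowed() while the task reference is held, and convert the task reference into an mm reference as late as possible. A condensed sketch of that ordering (hypothetical function; the credential and security checks are elided to comments):

#include <linux/sched.h>
#include <linux/mm.h>
#include <linux/rcupdate.h>
#include <linux/errno.h>

static int act_on_target_mm(pid_t pid)
{
        struct task_struct *task;
        struct mm_struct *mm;
        int err = 0;

        rcu_read_lock();
        task = pid ? find_task_by_vpid(pid) : current;
        if (!task) {
                rcu_read_unlock();
                return -ESRCH;
        }
        get_task_struct(task);          /* pin the task beyond the RCU section */

        /* ... __task_cred()-based permission check, still under RCU ... */
        rcu_read_unlock();

        /* ... capability / security_task_movememory()-style checks ... */

        mm = get_task_mm(task);         /* NULL if the task already lost its mm */
        put_task_struct(task);          /* from here on only the mm is needed */
        if (!mm)
                return -EINVAL;

        /* ... operate on mm ... */

        mmput(mm);
        return err;
}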
diff --git a/mm/mincore.c b/mm/mincore.c
index 636a86876ff..936b4cee8cb 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -164,7 +164,7 @@ static void mincore_pmd_range(struct vm_area_struct *vma, pud_t *pud,
164 } 164 }
165 /* fall through */ 165 /* fall through */
166 } 166 }
167 if (pmd_none_or_clear_bad(pmd)) 167 if (pmd_none_or_trans_huge_or_clear_bad(pmd))
168 mincore_unmapped_range(vma, addr, next, vec); 168 mincore_unmapped_range(vma, addr, next, vec);
169 else 169 else
170 mincore_pte_range(vma, pmd, addr, next, vec); 170 mincore_pte_range(vma, pmd, addr, next, vec);
diff --git a/mm/mlock.c b/mm/mlock.c
index 4f4f53bdc65..ef726e8aa8e 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -385,10 +385,11 @@ static int do_mlock(unsigned long start, size_t len, int on)
385 return -EINVAL; 385 return -EINVAL;
386 if (end == start) 386 if (end == start)
387 return 0; 387 return 0;
388 vma = find_vma_prev(current->mm, start, &prev); 388 vma = find_vma(current->mm, start);
389 if (!vma || vma->vm_start > start) 389 if (!vma || vma->vm_start > start)
390 return -ENOMEM; 390 return -ENOMEM;
391 391
392 prev = vma->vm_prev;
392 if (start > vma->vm_start) 393 if (start > vma->vm_start)
393 prev = vma; 394 prev = vma;
394 395
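do_mlock() above, mbind_range() and sys_mprotect() elsewhere in this series all switch from find_vma_prev() to plain find_vma() plus the vma's own vm_prev link, since the predecessor comes for free from the VMA list once a covering vma has been found. A sketch of the idiom wrapped in a hypothetical helper:

#include <linux/mm.h>

static struct vm_area_struct *vma_covering(struct mm_struct *mm,
                                           unsigned long start,
                                           struct vm_area_struct **pprev)
{
        struct vm_area_struct *vma = find_vma(mm, start);

        if (!vma || vma->vm_start > start)
                return NULL;            /* 'start' lies in a hole or above all vmas */

        *pprev = vma->vm_prev;          /* no second lookup needed */
        return vma;
}

find_vma_prev() itself survives for callers that still want a predecessor even when no vma covers the address; as the mmap.c hunk below shows, it now walks the rbtree rightwards to find that predecessor instead of returning a NULL *pprev.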
diff --git a/mm/mmap.c b/mm/mmap.c
index 3f758c7f4c8..a7bf6a31c9f 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -451,9 +451,8 @@ static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma,
451} 451}
452 452
453/* 453/*
454 * Helper for vma_adjust in the split_vma insert case: 454 * Helper for vma_adjust() in the split_vma insert case: insert a vma into the
455 * insert vm structure into list and rbtree and anon_vma, 455 * mm's list and rbtree. It has already been inserted into the prio_tree.
456 * but it has already been inserted into prio_tree earlier.
457 */ 456 */
458static void __insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma) 457static void __insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma)
459{ 458{
@@ -936,6 +935,19 @@ void vm_stat_account(struct mm_struct *mm, unsigned long flags,
936#endif /* CONFIG_PROC_FS */ 935#endif /* CONFIG_PROC_FS */
937 936
938/* 937/*
 938 * If a hint addr is less than mmap_min_addr, change the hint to be as
 939 * low as possible but still greater than mmap_min_addr.
940 */
941static inline unsigned long round_hint_to_min(unsigned long hint)
942{
943 hint &= PAGE_MASK;
944 if (((void *)hint != NULL) &&
945 (hint < mmap_min_addr))
946 return PAGE_ALIGN(mmap_min_addr);
947 return hint;
948}
949
950/*
939 * The caller must hold down_write(&current->mm->mmap_sem). 951 * The caller must hold down_write(&current->mm->mmap_sem).
940 */ 952 */
941 953
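round_hint_to_min(), added just above, keeps non-NULL mmap() hints out of the protected low address range. Worked examples of its effect, assuming PAGE_SIZE == 4096 and mmap_min_addr == 0x10000 (both values are assumptions made for the illustration):

/*
 * round_hint_to_min(0)       -> 0         a NULL hint is passed through untouched
 * round_hint_to_min(0x1234)  -> 0x10000   page-masked to 0x1000, which is below
 *                                         mmap_min_addr, so bumped up to
 *                                         PAGE_ALIGN(mmap_min_addr)
 * round_hint_to_min(0x20001) -> 0x20000   only page-masked; already acceptable
 */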
@@ -1099,9 +1111,9 @@ SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
1099 * A dummy user value is used because we are not locking 1111 * A dummy user value is used because we are not locking
1100 * memory so no accounting is necessary 1112 * memory so no accounting is necessary
1101 */ 1113 */
1102 len = ALIGN(len, huge_page_size(&default_hstate)); 1114 file = hugetlb_file_setup(HUGETLB_ANON_FILE, addr, len,
1103 file = hugetlb_file_setup(HUGETLB_ANON_FILE, len, VM_NORESERVE, 1115 VM_NORESERVE, &user,
1104 &user, HUGETLB_ANONHUGE_INODE); 1116 HUGETLB_ANONHUGE_INODE);
1105 if (IS_ERR(file)) 1117 if (IS_ERR(file))
1106 return PTR_ERR(file); 1118 return PTR_ERR(file);
1107 } 1119 }
@@ -1235,7 +1247,7 @@ munmap_back:
1235 */ 1247 */
1236 if (accountable_mapping(file, vm_flags)) { 1248 if (accountable_mapping(file, vm_flags)) {
1237 charged = len >> PAGE_SHIFT; 1249 charged = len >> PAGE_SHIFT;
1238 if (security_vm_enough_memory(charged)) 1250 if (security_vm_enough_memory_mm(mm, charged))
1239 return -ENOMEM; 1251 return -ENOMEM;
1240 vm_flags |= VM_ACCOUNT; 1252 vm_flags |= VM_ACCOUNT;
1241 } 1253 }
@@ -1266,8 +1278,9 @@ munmap_back:
1266 vma->vm_pgoff = pgoff; 1278 vma->vm_pgoff = pgoff;
1267 INIT_LIST_HEAD(&vma->anon_vma_chain); 1279 INIT_LIST_HEAD(&vma->anon_vma_chain);
1268 1280
1281 error = -EINVAL; /* when rejecting VM_GROWSDOWN|VM_GROWSUP */
1282
1269 if (file) { 1283 if (file) {
1270 error = -EINVAL;
1271 if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP)) 1284 if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP))
1272 goto free_vma; 1285 goto free_vma;
1273 if (vm_flags & VM_DENYWRITE) { 1286 if (vm_flags & VM_DENYWRITE) {
@@ -1293,6 +1306,8 @@ munmap_back:
1293 pgoff = vma->vm_pgoff; 1306 pgoff = vma->vm_pgoff;
1294 vm_flags = vma->vm_flags; 1307 vm_flags = vma->vm_flags;
1295 } else if (vm_flags & VM_SHARED) { 1308 } else if (vm_flags & VM_SHARED) {
1309 if (unlikely(vm_flags & (VM_GROWSDOWN|VM_GROWSUP)))
1310 goto free_vma;
1296 error = shmem_zero_setup(vma); 1311 error = shmem_zero_setup(vma);
1297 if (error) 1312 if (error)
1298 goto free_vma; 1313 goto free_vma;
@@ -1423,10 +1438,8 @@ void arch_unmap_area(struct mm_struct *mm, unsigned long addr)
1423 /* 1438 /*
1424 * Is this a new hole at the lowest possible address? 1439 * Is this a new hole at the lowest possible address?
1425 */ 1440 */
1426 if (addr >= TASK_UNMAPPED_BASE && addr < mm->free_area_cache) { 1441 if (addr >= TASK_UNMAPPED_BASE && addr < mm->free_area_cache)
1427 mm->free_area_cache = addr; 1442 mm->free_area_cache = addr;
1428 mm->cached_hole_size = ~0UL;
1429 }
1430} 1443}
1431 1444
1432/* 1445/*
@@ -1441,7 +1454,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
1441{ 1454{
1442 struct vm_area_struct *vma; 1455 struct vm_area_struct *vma;
1443 struct mm_struct *mm = current->mm; 1456 struct mm_struct *mm = current->mm;
1444 unsigned long addr = addr0; 1457 unsigned long addr = addr0, start_addr;
1445 1458
1446 /* requested length too big for entire address space */ 1459 /* requested length too big for entire address space */
1447 if (len > TASK_SIZE) 1460 if (len > TASK_SIZE)
@@ -1465,22 +1478,14 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
1465 mm->free_area_cache = mm->mmap_base; 1478 mm->free_area_cache = mm->mmap_base;
1466 } 1479 }
1467 1480
1481try_again:
1468 /* either no address requested or can't fit in requested address hole */ 1482 /* either no address requested or can't fit in requested address hole */
1469 addr = mm->free_area_cache; 1483 start_addr = addr = mm->free_area_cache;
1470
1471 /* make sure it can fit in the remaining address space */
1472 if (addr > len) {
1473 vma = find_vma(mm, addr-len);
1474 if (!vma || addr <= vma->vm_start)
1475 /* remember the address as a hint for next time */
1476 return (mm->free_area_cache = addr-len);
1477 }
1478 1484
1479 if (mm->mmap_base < len) 1485 if (addr < len)
1480 goto bottomup; 1486 goto fail;
1481
1482 addr = mm->mmap_base-len;
1483 1487
1488 addr -= len;
1484 do { 1489 do {
1485 /* 1490 /*
1486 * Lookup failure means no vma is above this address, 1491 * Lookup failure means no vma is above this address,
@@ -1500,7 +1505,21 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
1500 addr = vma->vm_start-len; 1505 addr = vma->vm_start-len;
1501 } while (len < vma->vm_start); 1506 } while (len < vma->vm_start);
1502 1507
1503bottomup: 1508fail:
1509 /*
1510 * if hint left us with no space for the requested
1511 * mapping then try again:
1512 *
 1513 * Note: this differs from the bottom-up case, which does a full
 1514 * linear search; here we use find_vma(), which may skip over
 1515 * some holes.
1516 */
1517 if (start_addr != mm->mmap_base) {
1518 mm->free_area_cache = mm->mmap_base;
1519 mm->cached_hole_size = 0;
1520 goto try_again;
1521 }
1522
1504 /* 1523 /*
1505 * A failed mmap() very likely causes application failure, 1524 * A failed mmap() very likely causes application failure,
1506 * so fall back to the bottom-up function here. This scenario 1525 * so fall back to the bottom-up function here. This scenario
@@ -1605,7 +1624,6 @@ EXPORT_SYMBOL(find_vma);
1605 1624
1606/* 1625/*
1607 * Same as find_vma, but also return a pointer to the previous VMA in *pprev. 1626 * Same as find_vma, but also return a pointer to the previous VMA in *pprev.
1608 * Note: pprev is set to NULL when return value is NULL.
1609 */ 1627 */
1610struct vm_area_struct * 1628struct vm_area_struct *
1611find_vma_prev(struct mm_struct *mm, unsigned long addr, 1629find_vma_prev(struct mm_struct *mm, unsigned long addr,
@@ -1614,7 +1632,16 @@ find_vma_prev(struct mm_struct *mm, unsigned long addr,
1614 struct vm_area_struct *vma; 1632 struct vm_area_struct *vma;
1615 1633
1616 vma = find_vma(mm, addr); 1634 vma = find_vma(mm, addr);
1617 *pprev = vma ? vma->vm_prev : NULL; 1635 if (vma) {
1636 *pprev = vma->vm_prev;
1637 } else {
1638 struct rb_node *rb_node = mm->mm_rb.rb_node;
1639 *pprev = NULL;
1640 while (rb_node) {
1641 *pprev = rb_entry(rb_node, struct vm_area_struct, vm_rb);
1642 rb_node = rb_node->rb_right;
1643 }
1644 }
1618 return vma; 1645 return vma;
1619} 1646}
1620 1647
@@ -2169,7 +2196,7 @@ unsigned long do_brk(unsigned long addr, unsigned long len)
2169 if (mm->map_count > sysctl_max_map_count) 2196 if (mm->map_count > sysctl_max_map_count)
2170 return -ENOMEM; 2197 return -ENOMEM;
2171 2198
2172 if (security_vm_enough_memory(len >> PAGE_SHIFT)) 2199 if (security_vm_enough_memory_mm(mm, len >> PAGE_SHIFT))
2173 return -ENOMEM; 2200 return -ENOMEM;
2174 2201
2175 /* Can we just expand an old private anonymous mapping? */ 2202 /* Can we just expand an old private anonymous mapping? */
@@ -2213,7 +2240,6 @@ void exit_mmap(struct mm_struct *mm)
2213 struct mmu_gather tlb; 2240 struct mmu_gather tlb;
2214 struct vm_area_struct *vma; 2241 struct vm_area_struct *vma;
2215 unsigned long nr_accounted = 0; 2242 unsigned long nr_accounted = 0;
2216 unsigned long end;
2217 2243
2218 /* mm's last user has gone, and its about to be pulled down */ 2244 /* mm's last user has gone, and its about to be pulled down */
2219 mmu_notifier_release(mm); 2245 mmu_notifier_release(mm);
@@ -2238,11 +2264,11 @@ void exit_mmap(struct mm_struct *mm)
2238 tlb_gather_mmu(&tlb, mm, 1); 2264 tlb_gather_mmu(&tlb, mm, 1);
2239 /* update_hiwater_rss(mm) here? but nobody should be looking */ 2265 /* update_hiwater_rss(mm) here? but nobody should be looking */
2240 /* Use -1 here to ensure all VMAs in the mm are unmapped */ 2266 /* Use -1 here to ensure all VMAs in the mm are unmapped */
2241 end = unmap_vmas(&tlb, vma, 0, -1, &nr_accounted, NULL); 2267 unmap_vmas(&tlb, vma, 0, -1, &nr_accounted, NULL);
2242 vm_unacct_memory(nr_accounted); 2268 vm_unacct_memory(nr_accounted);
2243 2269
2244 free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, 0); 2270 free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, 0);
2245 tlb_finish_mmu(&tlb, 0, end); 2271 tlb_finish_mmu(&tlb, 0, -1);
2246 2272
2247 /* 2273 /*
2248 * Walk the list again, actually closing and freeing it, 2274 * Walk the list again, actually closing and freeing it,
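Two behavioural changes in the mm/mmap.c hunks above are easy to miss in the side-by-side view: the top-down unmapped-area search now widens back to mmap_base once before falling back to bottom-up, and find_vma_prev() now returns the highest VMA in *pprev when the address lies above every mapping. The latter is just a walk down right children to the rightmost rb-tree node; a stand-alone illustration (editorial sketch, not kernel code, with a plain binary tree standing in for struct rb_node):

#include <stdio.h>

/*
 * When find_vma() returns NULL the address is above every VMA, so the
 * "previous" VMA is the rightmost node of the tree, reached by
 * following right children (the kernel walks rb_node->rb_right the
 * same way in the hunk above).
 */
struct node {
	unsigned long start;
	struct node *left, *right;
};

static struct node *rightmost(struct node *n)
{
	struct node *last = NULL;

	while (n) {
		last = n;
		n = n->right;
	}
	return last;	/* NULL only when the tree is empty */
}

int main(void)
{
	struct node low  = { 0x1000, NULL, NULL };
	struct node high = { 0x3000, NULL, NULL };
	struct node root = { 0x2000, &low, &high };

	printf("*pprev would start at 0x%lx\n", rightmost(&root)->start);
	return 0;
}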
diff --git a/mm/mmu_context.c b/mm/mmu_context.c
index cf332bc0080..3dcfaf4ed35 100644
--- a/mm/mmu_context.c
+++ b/mm/mmu_context.c
@@ -53,7 +53,7 @@ void unuse_mm(struct mm_struct *mm)
53 struct task_struct *tsk = current; 53 struct task_struct *tsk = current;
54 54
55 task_lock(tsk); 55 task_lock(tsk);
56 sync_mm_rss(tsk, mm); 56 sync_mm_rss(mm);
57 tsk->mm = NULL; 57 tsk->mm = NULL;
58 /* active_mm is still 'mm' */ 58 /* active_mm is still 'mm' */
59 enter_lazy_tlb(mm, tsk); 59 enter_lazy_tlb(mm, tsk);
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 5a688a2756b..a40992610ab 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -60,7 +60,7 @@ static void change_pte_range(struct mm_struct *mm, pmd_t *pmd,
60 ptent = pte_mkwrite(ptent); 60 ptent = pte_mkwrite(ptent);
61 61
62 ptep_modify_prot_commit(mm, addr, pte, ptent); 62 ptep_modify_prot_commit(mm, addr, pte, ptent);
63 } else if (PAGE_MIGRATION && !pte_file(oldpte)) { 63 } else if (IS_ENABLED(CONFIG_MIGRATION) && !pte_file(oldpte)) {
64 swp_entry_t entry = pte_to_swp_entry(oldpte); 64 swp_entry_t entry = pte_to_swp_entry(oldpte);
65 65
66 if (is_write_migration_entry(entry)) { 66 if (is_write_migration_entry(entry)) {
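This hunk (and the matching ones in mm/rmap.c further down) replaces the PAGE_MIGRATION constant with IS_ENABLED(CONFIG_MIGRATION), so the migration branch is always parsed and type-checked and merely dead-code-eliminated when the option is off. A simplified, self-contained sketch of the preprocessor trick behind IS_ENABLED (condensed from include/linux/kconfig.h; macro names shortened, module handling omitted):

#include <stdio.h>

/*
 * When CONFIG_X is #defined to 1 the placeholder expands to "0," and
 * the selector picks the 1; when CONFIG_X is undefined the junk token
 * stays in the first slot and 0 is picked instead.
 */
#define __ARG_PLACEHOLDER_1 0,
#define __take_second(ignored, val, ...) val
#define __is_defined(arg_or_junk) __take_second(arg_or_junk 1, 0)
#define _IS_ENABLED(val) __is_defined(__ARG_PLACEHOLDER_##val)
#define IS_ENABLED(cfg) _IS_ENABLED(cfg)

#define CONFIG_MIGRATION 1	/* comment out to get the disabled case */

int main(void)
{
	if (IS_ENABLED(CONFIG_MIGRATION))	/* folds to a constant 1 or 0 */
		printf("CONFIG_MIGRATION enabled\n");
	else
		printf("CONFIG_MIGRATION disabled\n");
	return 0;
}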
@@ -168,7 +168,7 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
168 if (!(oldflags & (VM_ACCOUNT|VM_WRITE|VM_HUGETLB| 168 if (!(oldflags & (VM_ACCOUNT|VM_WRITE|VM_HUGETLB|
169 VM_SHARED|VM_NORESERVE))) { 169 VM_SHARED|VM_NORESERVE))) {
170 charged = nrpages; 170 charged = nrpages;
171 if (security_vm_enough_memory(charged)) 171 if (security_vm_enough_memory_mm(mm, charged))
172 return -ENOMEM; 172 return -ENOMEM;
173 newflags |= VM_ACCOUNT; 173 newflags |= VM_ACCOUNT;
174 } 174 }
@@ -262,10 +262,11 @@ SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len,
262 262
263 down_write(&current->mm->mmap_sem); 263 down_write(&current->mm->mmap_sem);
264 264
265 vma = find_vma_prev(current->mm, start, &prev); 265 vma = find_vma(current->mm, start);
266 error = -ENOMEM; 266 error = -ENOMEM;
267 if (!vma) 267 if (!vma)
268 goto out; 268 goto out;
269 prev = vma->vm_prev;
269 if (unlikely(grows & PROT_GROWSDOWN)) { 270 if (unlikely(grows & PROT_GROWSDOWN)) {
270 if (vma->vm_start >= end) 271 if (vma->vm_start >= end)
271 goto out; 272 goto out;
diff --git a/mm/mremap.c b/mm/mremap.c
index 87bb8393e7d..db8d983b5a7 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -329,7 +329,7 @@ static struct vm_area_struct *vma_to_resize(unsigned long addr,
329 329
330 if (vma->vm_flags & VM_ACCOUNT) { 330 if (vma->vm_flags & VM_ACCOUNT) {
331 unsigned long charged = (new_len - old_len) >> PAGE_SHIFT; 331 unsigned long charged = (new_len - old_len) >> PAGE_SHIFT;
332 if (security_vm_enough_memory(charged)) 332 if (security_vm_enough_memory_mm(mm, charged))
333 goto Efault; 333 goto Efault;
334 *p = charged; 334 *p = charged;
335 } 335 }
diff --git a/mm/nommu.c b/mm/nommu.c
index b982290fd96..f59e170fceb 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -696,9 +696,11 @@ static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma)
696 if (vma->vm_file) { 696 if (vma->vm_file) {
697 mapping = vma->vm_file->f_mapping; 697 mapping = vma->vm_file->f_mapping;
698 698
699 mutex_lock(&mapping->i_mmap_mutex);
699 flush_dcache_mmap_lock(mapping); 700 flush_dcache_mmap_lock(mapping);
700 vma_prio_tree_insert(vma, &mapping->i_mmap); 701 vma_prio_tree_insert(vma, &mapping->i_mmap);
701 flush_dcache_mmap_unlock(mapping); 702 flush_dcache_mmap_unlock(mapping);
703 mutex_unlock(&mapping->i_mmap_mutex);
702 } 704 }
703 705
704 /* add the VMA to the tree */ 706 /* add the VMA to the tree */
@@ -760,9 +762,11 @@ static void delete_vma_from_mm(struct vm_area_struct *vma)
760 if (vma->vm_file) { 762 if (vma->vm_file) {
761 mapping = vma->vm_file->f_mapping; 763 mapping = vma->vm_file->f_mapping;
762 764
765 mutex_lock(&mapping->i_mmap_mutex);
763 flush_dcache_mmap_lock(mapping); 766 flush_dcache_mmap_lock(mapping);
764 vma_prio_tree_remove(vma, &mapping->i_mmap); 767 vma_prio_tree_remove(vma, &mapping->i_mmap);
765 flush_dcache_mmap_unlock(mapping); 768 flush_dcache_mmap_unlock(mapping);
769 mutex_unlock(&mapping->i_mmap_mutex);
766 } 770 }
767 771
768 /* remove from the MM's tree and list */ 772 /* remove from the MM's tree and list */
@@ -775,8 +779,6 @@ static void delete_vma_from_mm(struct vm_area_struct *vma)
775 779
776 if (vma->vm_next) 780 if (vma->vm_next)
777 vma->vm_next->vm_prev = vma->vm_prev; 781 vma->vm_next->vm_prev = vma->vm_prev;
778
779 vma->vm_mm = NULL;
780} 782}
781 783
782/* 784/*
@@ -2052,6 +2054,7 @@ int nommu_shrink_inode_mappings(struct inode *inode, size_t size,
2052 high = (size + PAGE_SIZE - 1) >> PAGE_SHIFT; 2054 high = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
2053 2055
2054 down_write(&nommu_region_sem); 2056 down_write(&nommu_region_sem);
2057 mutex_lock(&inode->i_mapping->i_mmap_mutex);
2055 2058
2056 /* search for VMAs that fall within the dead zone */ 2059 /* search for VMAs that fall within the dead zone */
2057 vma_prio_tree_foreach(vma, &iter, &inode->i_mapping->i_mmap, 2060 vma_prio_tree_foreach(vma, &iter, &inode->i_mapping->i_mmap,
@@ -2059,6 +2062,7 @@ int nommu_shrink_inode_mappings(struct inode *inode, size_t size,
2059 /* found one - only interested if it's shared out of the page 2062 /* found one - only interested if it's shared out of the page
2060 * cache */ 2063 * cache */
2061 if (vma->vm_flags & VM_SHARED) { 2064 if (vma->vm_flags & VM_SHARED) {
2065 mutex_unlock(&inode->i_mapping->i_mmap_mutex);
2062 up_write(&nommu_region_sem); 2066 up_write(&nommu_region_sem);
2063 return -ETXTBSY; /* not quite true, but near enough */ 2067 return -ETXTBSY; /* not quite true, but near enough */
2064 } 2068 }
@@ -2086,6 +2090,7 @@ int nommu_shrink_inode_mappings(struct inode *inode, size_t size,
2086 } 2090 }
2087 } 2091 }
2088 2092
2093 mutex_unlock(&inode->i_mapping->i_mmap_mutex);
2089 up_write(&nommu_region_sem); 2094 up_write(&nommu_region_sem);
2090 return 0; 2095 return 0;
2091} 2096}
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 2958fd8e7c9..46bf2ed5594 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -34,6 +34,7 @@
34#include <linux/ptrace.h> 34#include <linux/ptrace.h>
35#include <linux/freezer.h> 35#include <linux/freezer.h>
36#include <linux/ftrace.h> 36#include <linux/ftrace.h>
37#include <linux/ratelimit.h>
37 38
38#define CREATE_TRACE_POINTS 39#define CREATE_TRACE_POINTS
39#include <trace/events/oom.h> 40#include <trace/events/oom.h>
@@ -309,7 +310,7 @@ static enum oom_constraint constrained_alloc(struct zonelist *zonelist,
309 */ 310 */
310static struct task_struct *select_bad_process(unsigned int *ppoints, 311static struct task_struct *select_bad_process(unsigned int *ppoints,
311 unsigned long totalpages, struct mem_cgroup *memcg, 312 unsigned long totalpages, struct mem_cgroup *memcg,
312 const nodemask_t *nodemask) 313 const nodemask_t *nodemask, bool force_kill)
313{ 314{
314 struct task_struct *g, *p; 315 struct task_struct *g, *p;
315 struct task_struct *chosen = NULL; 316 struct task_struct *chosen = NULL;
@@ -335,7 +336,8 @@ static struct task_struct *select_bad_process(unsigned int *ppoints,
335 if (test_tsk_thread_flag(p, TIF_MEMDIE)) { 336 if (test_tsk_thread_flag(p, TIF_MEMDIE)) {
336 if (unlikely(frozen(p))) 337 if (unlikely(frozen(p)))
337 __thaw_task(p); 338 __thaw_task(p);
338 return ERR_PTR(-1UL); 339 if (!force_kill)
340 return ERR_PTR(-1UL);
339 } 341 }
340 if (!p->mm) 342 if (!p->mm)
341 continue; 343 continue;
@@ -353,7 +355,7 @@ static struct task_struct *select_bad_process(unsigned int *ppoints,
353 if (p == current) { 355 if (p == current) {
354 chosen = p; 356 chosen = p;
355 *ppoints = 1000; 357 *ppoints = 1000;
356 } else { 358 } else if (!force_kill) {
357 /* 359 /*
358 * If this task is not being ptraced on exit, 360 * If this task is not being ptraced on exit,
359 * then wait for it to finish before killing 361 * then wait for it to finish before killing
@@ -434,66 +436,18 @@ static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order,
434} 436}
435 437
436#define K(x) ((x) << (PAGE_SHIFT-10)) 438#define K(x) ((x) << (PAGE_SHIFT-10))
437static int oom_kill_task(struct task_struct *p) 439static void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
438{ 440 unsigned int points, unsigned long totalpages,
439 struct task_struct *q; 441 struct mem_cgroup *memcg, nodemask_t *nodemask,
440 struct mm_struct *mm; 442 const char *message)
441
442 p = find_lock_task_mm(p);
443 if (!p)
444 return 1;
445
446 /* mm cannot be safely dereferenced after task_unlock(p) */
447 mm = p->mm;
448
449 pr_err("Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB\n",
450 task_pid_nr(p), p->comm, K(p->mm->total_vm),
451 K(get_mm_counter(p->mm, MM_ANONPAGES)),
452 K(get_mm_counter(p->mm, MM_FILEPAGES)));
453 task_unlock(p);
454
455 /*
456 * Kill all user processes sharing p->mm in other thread groups, if any.
457 * They don't get access to memory reserves or a higher scheduler
458 * priority, though, to avoid depletion of all memory or task
459 * starvation. This prevents mm->mmap_sem livelock when an oom killed
460 * task cannot exit because it requires the semaphore and its contended
461 * by another thread trying to allocate memory itself. That thread will
462 * now get access to memory reserves since it has a pending fatal
463 * signal.
464 */
465 for_each_process(q)
466 if (q->mm == mm && !same_thread_group(q, p) &&
467 !(q->flags & PF_KTHREAD)) {
468 if (q->signal->oom_score_adj == OOM_SCORE_ADJ_MIN)
469 continue;
470
471 task_lock(q); /* Protect ->comm from prctl() */
472 pr_err("Kill process %d (%s) sharing same memory\n",
473 task_pid_nr(q), q->comm);
474 task_unlock(q);
475 force_sig(SIGKILL, q);
476 }
477
478 set_tsk_thread_flag(p, TIF_MEMDIE);
479 force_sig(SIGKILL, p);
480
481 return 0;
482}
483#undef K
484
485static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
486 unsigned int points, unsigned long totalpages,
487 struct mem_cgroup *memcg, nodemask_t *nodemask,
488 const char *message)
489{ 443{
490 struct task_struct *victim = p; 444 struct task_struct *victim = p;
491 struct task_struct *child; 445 struct task_struct *child;
492 struct task_struct *t = p; 446 struct task_struct *t = p;
447 struct mm_struct *mm;
493 unsigned int victim_points = 0; 448 unsigned int victim_points = 0;
494 449 static DEFINE_RATELIMIT_STATE(oom_rs, DEFAULT_RATELIMIT_INTERVAL,
495 if (printk_ratelimit()) 450 DEFAULT_RATELIMIT_BURST);
496 dump_header(p, gfp_mask, order, memcg, nodemask);
497 451
498 /* 452 /*
499 * If the task is already exiting, don't alarm the sysadmin or kill 453 * If the task is already exiting, don't alarm the sysadmin or kill
@@ -501,9 +455,12 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
501 */ 455 */
502 if (p->flags & PF_EXITING) { 456 if (p->flags & PF_EXITING) {
503 set_tsk_thread_flag(p, TIF_MEMDIE); 457 set_tsk_thread_flag(p, TIF_MEMDIE);
504 return 0; 458 return;
505 } 459 }
506 460
461 if (__ratelimit(&oom_rs))
462 dump_header(p, gfp_mask, order, memcg, nodemask);
463
507 task_lock(p); 464 task_lock(p);
508 pr_err("%s: Kill process %d (%s) score %d or sacrifice child\n", 465 pr_err("%s: Kill process %d (%s) score %d or sacrifice child\n",
509 message, task_pid_nr(p), p->comm, points); 466 message, task_pid_nr(p), p->comm, points);
@@ -533,8 +490,44 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
533 } 490 }
534 } while_each_thread(p, t); 491 } while_each_thread(p, t);
535 492
536 return oom_kill_task(victim); 493 victim = find_lock_task_mm(victim);
494 if (!victim)
495 return;
496
497 /* mm cannot safely be dereferenced after task_unlock(victim) */
498 mm = victim->mm;
499 pr_err("Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB\n",
500 task_pid_nr(victim), victim->comm, K(victim->mm->total_vm),
501 K(get_mm_counter(victim->mm, MM_ANONPAGES)),
502 K(get_mm_counter(victim->mm, MM_FILEPAGES)));
503 task_unlock(victim);
504
505 /*
506 * Kill all user processes sharing victim->mm in other thread groups, if
507 * any. They don't get access to memory reserves, though, to avoid
508 * depletion of all memory. This prevents mm->mmap_sem livelock when an
509 * oom killed thread cannot exit because it requires the semaphore and
510 * it's contended by another thread trying to allocate memory itself.
511 * That thread will now get access to memory reserves since it has a
512 * pending fatal signal.
513 */
514 for_each_process(p)
515 if (p->mm == mm && !same_thread_group(p, victim) &&
516 !(p->flags & PF_KTHREAD)) {
517 if (p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN)
518 continue;
519
520 task_lock(p); /* Protect ->comm from prctl() */
521 pr_err("Kill process %d (%s) sharing same memory\n",
522 task_pid_nr(p), p->comm);
523 task_unlock(p);
524 do_send_sig_info(SIGKILL, SEND_SIG_FORCED, p, true);
525 }
526
527 set_tsk_thread_flag(victim, TIF_MEMDIE);
528 do_send_sig_info(SIGKILL, SEND_SIG_FORCED, victim, true);
537} 529}
530#undef K
538 531
539/* 532/*
540 * Determines whether the kernel must panic because of the panic_on_oom sysctl. 533 * Determines whether the kernel must panic because of the panic_on_oom sysctl.
@@ -561,7 +554,8 @@ static void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask,
561} 554}
562 555
563#ifdef CONFIG_CGROUP_MEM_RES_CTLR 556#ifdef CONFIG_CGROUP_MEM_RES_CTLR
564void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask) 557void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
558 int order)
565{ 559{
566 unsigned long limit; 560 unsigned long limit;
567 unsigned int points = 0; 561 unsigned int points = 0;
@@ -577,18 +571,13 @@ void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask)
577 return; 571 return;
578 } 572 }
579 573
580 check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, 0, NULL); 574 check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL);
581 limit = mem_cgroup_get_limit(memcg) >> PAGE_SHIFT; 575 limit = mem_cgroup_get_limit(memcg) >> PAGE_SHIFT;
582 read_lock(&tasklist_lock); 576 read_lock(&tasklist_lock);
583retry: 577 p = select_bad_process(&points, limit, memcg, NULL, false);
584 p = select_bad_process(&points, limit, memcg, NULL); 578 if (p && PTR_ERR(p) != -1UL)
585 if (!p || PTR_ERR(p) == -1UL) 579 oom_kill_process(p, gfp_mask, order, points, limit, memcg, NULL,
586 goto out; 580 "Memory cgroup out of memory");
587
588 if (oom_kill_process(p, gfp_mask, 0, points, limit, memcg, NULL,
589 "Memory cgroup out of memory"))
590 goto retry;
591out:
592 read_unlock(&tasklist_lock); 581 read_unlock(&tasklist_lock);
593} 582}
594#endif 583#endif
@@ -700,6 +689,7 @@ static void clear_system_oom(void)
700 * @gfp_mask: memory allocation flags 689 * @gfp_mask: memory allocation flags
701 * @order: amount of memory being requested as a power of 2 690 * @order: amount of memory being requested as a power of 2
702 * @nodemask: nodemask passed to page allocator 691 * @nodemask: nodemask passed to page allocator
692 * @force_kill: true if a task must be killed, even if others are exiting
703 * 693 *
704 * If we run out of memory, we have the choice between either 694 * If we run out of memory, we have the choice between either
705 * killing a random task (bad), letting the system crash (worse) 695 * killing a random task (bad), letting the system crash (worse)
@@ -707,7 +697,7 @@ static void clear_system_oom(void)
707 * don't have to be perfect here, we just have to be good. 697 * don't have to be perfect here, we just have to be good.
708 */ 698 */
709void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, 699void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
710 int order, nodemask_t *nodemask) 700 int order, nodemask_t *nodemask, bool force_kill)
711{ 701{
712 const nodemask_t *mpol_mask; 702 const nodemask_t *mpol_mask;
713 struct task_struct *p; 703 struct task_struct *p;
@@ -745,33 +735,25 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
745 if (sysctl_oom_kill_allocating_task && 735 if (sysctl_oom_kill_allocating_task &&
746 !oom_unkillable_task(current, NULL, nodemask) && 736 !oom_unkillable_task(current, NULL, nodemask) &&
747 current->mm) { 737 current->mm) {
748 /* 738 oom_kill_process(current, gfp_mask, order, 0, totalpages, NULL,
749 * oom_kill_process() needs tasklist_lock held. If it returns 739 nodemask,
750 * non-zero, current could not be killed so we must fallback to 740 "Out of memory (oom_kill_allocating_task)");
751 * the tasklist scan.
752 */
753 if (!oom_kill_process(current, gfp_mask, order, 0, totalpages,
754 NULL, nodemask,
755 "Out of memory (oom_kill_allocating_task)"))
756 goto out;
757 }
758
759retry:
760 p = select_bad_process(&points, totalpages, NULL, mpol_mask);
761 if (PTR_ERR(p) == -1UL)
762 goto out; 741 goto out;
742 }
763 743
744 p = select_bad_process(&points, totalpages, NULL, mpol_mask,
745 force_kill);
764 /* Found nothing?!?! Either we hang forever, or we panic. */ 746 /* Found nothing?!?! Either we hang forever, or we panic. */
765 if (!p) { 747 if (!p) {
766 dump_header(NULL, gfp_mask, order, NULL, mpol_mask); 748 dump_header(NULL, gfp_mask, order, NULL, mpol_mask);
767 read_unlock(&tasklist_lock); 749 read_unlock(&tasklist_lock);
768 panic("Out of memory and no killable processes...\n"); 750 panic("Out of memory and no killable processes...\n");
769 } 751 }
770 752 if (PTR_ERR(p) != -1UL) {
771 if (oom_kill_process(p, gfp_mask, order, points, totalpages, NULL, 753 oom_kill_process(p, gfp_mask, order, points, totalpages, NULL,
772 nodemask, "Out of memory")) 754 nodemask, "Out of memory");
773 goto retry; 755 killed = 1;
774 killed = 1; 756 }
775out: 757out:
776 read_unlock(&tasklist_lock); 758 read_unlock(&tasklist_lock);
777 759
@@ -792,7 +774,7 @@ out:
792void pagefault_out_of_memory(void) 774void pagefault_out_of_memory(void)
793{ 775{
794 if (try_set_system_oom()) { 776 if (try_set_system_oom()) {
795 out_of_memory(NULL, 0, 0, NULL); 777 out_of_memory(NULL, 0, 0, NULL, false);
796 clear_system_oom(); 778 clear_system_oom();
797 } 779 }
798 if (!test_thread_flag(TIF_MEMDIE)) 780 if (!test_thread_flag(TIF_MEMDIE))
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 5e39858880f..26adea8ca2e 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -1474,6 +1474,7 @@ void throttle_vm_writeout(gfp_t gfp_mask)
1474 1474
1475 for ( ; ; ) { 1475 for ( ; ; ) {
1476 global_dirty_limits(&background_thresh, &dirty_thresh); 1476 global_dirty_limits(&background_thresh, &dirty_thresh);
1477 dirty_thresh = hard_dirty_limit(dirty_thresh);
1477 1478
1478 /* 1479 /*
1479 * Boost the allowable dirty threshold a bit for page 1480 * Boost the allowable dirty threshold a bit for page
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index d2186ecb36f..caea788628e 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1968,7 +1968,7 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
1968 goto out; 1968 goto out;
1969 } 1969 }
1970 /* Exhausted what can be done so it's blamo time */ 1970 /* Exhausted what can be done so it's blamo time */
1971 out_of_memory(zonelist, gfp_mask, order, nodemask); 1971 out_of_memory(zonelist, gfp_mask, order, nodemask, false);
1972 1972
1973out: 1973out:
1974 clear_zonelist_oom(zonelist, gfp_mask); 1974 clear_zonelist_oom(zonelist, gfp_mask);
@@ -1990,7 +1990,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
1990 if (!order) 1990 if (!order)
1991 return NULL; 1991 return NULL;
1992 1992
1993 if (compaction_deferred(preferred_zone)) { 1993 if (compaction_deferred(preferred_zone, order)) {
1994 *deferred_compaction = true; 1994 *deferred_compaction = true;
1995 return NULL; 1995 return NULL;
1996 } 1996 }
@@ -2012,6 +2012,8 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
2012 if (page) { 2012 if (page) {
2013 preferred_zone->compact_considered = 0; 2013 preferred_zone->compact_considered = 0;
2014 preferred_zone->compact_defer_shift = 0; 2014 preferred_zone->compact_defer_shift = 0;
2015 if (order >= preferred_zone->compact_order_failed)
2016 preferred_zone->compact_order_failed = order + 1;
2015 count_vm_event(COMPACTSUCCESS); 2017 count_vm_event(COMPACTSUCCESS);
2016 return page; 2018 return page;
2017 } 2019 }
@@ -2028,7 +2030,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
2028 * defer if the failure was a sync compaction failure. 2030 * defer if the failure was a sync compaction failure.
2029 */ 2031 */
2030 if (sync_migration) 2032 if (sync_migration)
2031 defer_compaction(preferred_zone); 2033 defer_compaction(preferred_zone, order);
2032 2034
2033 cond_resched(); 2035 cond_resched();
2034 } 2036 }
@@ -2378,8 +2380,9 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
2378{ 2380{
2379 enum zone_type high_zoneidx = gfp_zone(gfp_mask); 2381 enum zone_type high_zoneidx = gfp_zone(gfp_mask);
2380 struct zone *preferred_zone; 2382 struct zone *preferred_zone;
2381 struct page *page; 2383 struct page *page = NULL;
2382 int migratetype = allocflags_to_migratetype(gfp_mask); 2384 int migratetype = allocflags_to_migratetype(gfp_mask);
2385 unsigned int cpuset_mems_cookie;
2383 2386
2384 gfp_mask &= gfp_allowed_mask; 2387 gfp_mask &= gfp_allowed_mask;
2385 2388
@@ -2398,15 +2401,15 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
2398 if (unlikely(!zonelist->_zonerefs->zone)) 2401 if (unlikely(!zonelist->_zonerefs->zone))
2399 return NULL; 2402 return NULL;
2400 2403
2401 get_mems_allowed(); 2404retry_cpuset:
2405 cpuset_mems_cookie = get_mems_allowed();
2406
2402 /* The preferred zone is used for statistics later */ 2407 /* The preferred zone is used for statistics later */
2403 first_zones_zonelist(zonelist, high_zoneidx, 2408 first_zones_zonelist(zonelist, high_zoneidx,
2404 nodemask ? : &cpuset_current_mems_allowed, 2409 nodemask ? : &cpuset_current_mems_allowed,
2405 &preferred_zone); 2410 &preferred_zone);
2406 if (!preferred_zone) { 2411 if (!preferred_zone)
2407 put_mems_allowed(); 2412 goto out;
2408 return NULL;
2409 }
2410 2413
2411 /* First allocation attempt */ 2414 /* First allocation attempt */
2412 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, 2415 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
@@ -2416,9 +2419,19 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
2416 page = __alloc_pages_slowpath(gfp_mask, order, 2419 page = __alloc_pages_slowpath(gfp_mask, order,
2417 zonelist, high_zoneidx, nodemask, 2420 zonelist, high_zoneidx, nodemask,
2418 preferred_zone, migratetype); 2421 preferred_zone, migratetype);
2419 put_mems_allowed();
2420 2422
2421 trace_mm_page_alloc(page, order, gfp_mask, migratetype); 2423 trace_mm_page_alloc(page, order, gfp_mask, migratetype);
2424
2425out:
2426 /*
2427 * When updating a task's mems_allowed, it is possible to race with
2428 * parallel threads in such a way that an allocation can fail while
2429 * the mask is being updated. If a page allocation is about to fail,
2430 * check if the cpuset changed during allocation and if so, retry.
2431 */
2432 if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
2433 goto retry_cpuset;
2434
2422 return page; 2435 return page;
2423} 2436}
2424EXPORT_SYMBOL(__alloc_pages_nodemask); 2437EXPORT_SYMBOL(__alloc_pages_nodemask);
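The cpuset_mems_cookie changes above (and the analogous ones in mm/slab.c and mm/slub.c further down) turn get_mems_allowed()/put_mems_allowed() into a seqcount-style retry: read a cookie, attempt the allocation, and retry only if the attempt failed while the cpuset's mems_allowed was being rewritten. A condensed caller-side sketch, assuming the API exactly as used in this patch (fragment, not compilable on its own; try_to_allocate() is a hypothetical stand-in for the real allocation step):

	unsigned int cookie;
	struct page *page;

	do {
		cookie = get_mems_allowed();	/* open the read-side section */
		page = try_to_allocate();	/* may fail spuriously while
						 * mems_allowed is rewritten */
		/*
		 * put_mems_allowed() returns true when mems_allowed stayed
		 * stable across the section; retry only a failed attempt
		 * that raced with an update, otherwise hand the result back
		 * to the caller unchanged.
		 */
	} while (!put_mems_allowed(cookie) && !page);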
@@ -2632,13 +2645,15 @@ void si_meminfo_node(struct sysinfo *val, int nid)
2632bool skip_free_areas_node(unsigned int flags, int nid) 2645bool skip_free_areas_node(unsigned int flags, int nid)
2633{ 2646{
2634 bool ret = false; 2647 bool ret = false;
2648 unsigned int cpuset_mems_cookie;
2635 2649
2636 if (!(flags & SHOW_MEM_FILTER_NODES)) 2650 if (!(flags & SHOW_MEM_FILTER_NODES))
2637 goto out; 2651 goto out;
2638 2652
2639 get_mems_allowed(); 2653 do {
2640 ret = !node_isset(nid, cpuset_current_mems_allowed); 2654 cpuset_mems_cookie = get_mems_allowed();
2641 put_mems_allowed(); 2655 ret = !node_isset(nid, cpuset_current_mems_allowed);
2656 } while (!put_mems_allowed(cpuset_mems_cookie));
2642out: 2657out:
2643 return ret; 2658 return ret;
2644} 2659}
@@ -3925,18 +3940,6 @@ void __init free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn)
3925 } 3940 }
3926} 3941}
3927 3942
3928int __init add_from_early_node_map(struct range *range, int az,
3929 int nr_range, int nid)
3930{
3931 unsigned long start_pfn, end_pfn;
3932 int i;
3933
3934 /* need to go over early_node_map to find out good range for node */
3935 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL)
3936 nr_range = add_range(range, az, nr_range, start_pfn, end_pfn);
3937 return nr_range;
3938}
3939
3940/** 3943/**
3941 * sparse_memory_present_with_active_regions - Call memory_present for each active range 3944 * sparse_memory_present_with_active_regions - Call memory_present for each active range
3942 * @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used. 3945 * @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used.
@@ -4521,7 +4524,7 @@ static unsigned long __init early_calculate_totalpages(void)
4521 * memory. When they don't, some nodes will have more kernelcore than 4524 * memory. When they don't, some nodes will have more kernelcore than
4522 * others 4525 * others
4523 */ 4526 */
4524static void __init find_zone_movable_pfns_for_nodes(unsigned long *movable_pfn) 4527static void __init find_zone_movable_pfns_for_nodes(void)
4525{ 4528{
4526 int i, nid; 4529 int i, nid;
4527 unsigned long usable_startpfn; 4530 unsigned long usable_startpfn;
@@ -4713,7 +4716,7 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)
4713 4716
4714 /* Find the PFNs that ZONE_MOVABLE begins at in each node */ 4717 /* Find the PFNs that ZONE_MOVABLE begins at in each node */
4715 memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn)); 4718 memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn));
4716 find_zone_movable_pfns_for_nodes(zone_movable_pfn); 4719 find_zone_movable_pfns_for_nodes();
4717 4720
4718 /* Print out the zone ranges */ 4721 /* Print out the zone ranges */
4719 printk("Zone PFN ranges:\n"); 4722 printk("Zone PFN ranges:\n");
@@ -4823,6 +4826,7 @@ static int page_alloc_cpu_notify(struct notifier_block *self,
4823 int cpu = (unsigned long)hcpu; 4826 int cpu = (unsigned long)hcpu;
4824 4827
4825 if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) { 4828 if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
4829 lru_add_drain_cpu(cpu);
4826 drain_pages(cpu); 4830 drain_pages(cpu);
4827 4831
4828 /* 4832 /*
@@ -5236,6 +5240,7 @@ void *__init alloc_large_system_hash(const char *tablename,
5236 max = ((unsigned long long)nr_all_pages << PAGE_SHIFT) >> 4; 5240 max = ((unsigned long long)nr_all_pages << PAGE_SHIFT) >> 4;
5237 do_div(max, bucketsize); 5241 do_div(max, bucketsize);
5238 } 5242 }
5243 max = min(max, 0x80000000ULL);
5239 5244
5240 if (numentries > max) 5245 if (numentries > max)
5241 numentries = max; 5246 numentries = max;
diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c
index de1616aa9b1..1ccbd714059 100644
--- a/mm/page_cgroup.c
+++ b/mm/page_cgroup.c
@@ -379,13 +379,15 @@ static struct swap_cgroup *lookup_swap_cgroup(swp_entry_t ent,
379 pgoff_t offset = swp_offset(ent); 379 pgoff_t offset = swp_offset(ent);
380 struct swap_cgroup_ctrl *ctrl; 380 struct swap_cgroup_ctrl *ctrl;
381 struct page *mappage; 381 struct page *mappage;
382 struct swap_cgroup *sc;
382 383
383 ctrl = &swap_cgroup_ctrl[swp_type(ent)]; 384 ctrl = &swap_cgroup_ctrl[swp_type(ent)];
384 if (ctrlp) 385 if (ctrlp)
385 *ctrlp = ctrl; 386 *ctrlp = ctrl;
386 387
387 mappage = ctrl->map[offset / SC_PER_PAGE]; 388 mappage = ctrl->map[offset / SC_PER_PAGE];
388 return page_address(mappage) + offset % SC_PER_PAGE; 389 sc = page_address(mappage);
390 return sc + offset % SC_PER_PAGE;
389} 391}
390 392
391/** 393/**
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index 2f5cf10ff66..aa9701e1271 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -59,7 +59,7 @@ again:
59 continue; 59 continue;
60 60
61 split_huge_page_pmd(walk->mm, pmd); 61 split_huge_page_pmd(walk->mm, pmd);
62 if (pmd_none_or_clear_bad(pmd)) 62 if (pmd_none_or_trans_huge_or_clear_bad(pmd))
63 goto again; 63 goto again;
64 err = walk_pte_range(pmd, addr, next, walk); 64 err = walk_pte_range(pmd, addr, next, walk);
65 if (err) 65 if (err)
diff --git a/mm/percpu-vm.c b/mm/percpu-vm.c
index 12a48a88c0d..405d331804c 100644
--- a/mm/percpu-vm.c
+++ b/mm/percpu-vm.c
@@ -184,8 +184,7 @@ static void pcpu_unmap_pages(struct pcpu_chunk *chunk,
184 page_end - page_start); 184 page_end - page_start);
185 } 185 }
186 186
187 for (i = page_start; i < page_end; i++) 187 bitmap_clear(populated, page_start, page_end - page_start);
188 __clear_bit(i, populated);
189} 188}
190 189
191/** 190/**
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
index eb663fb533e..5a74fea182f 100644
--- a/mm/pgtable-generic.c
+++ b/mm/pgtable-generic.c
@@ -70,10 +70,11 @@ int pmdp_clear_flush_young(struct vm_area_struct *vma,
70 unsigned long address, pmd_t *pmdp) 70 unsigned long address, pmd_t *pmdp)
71{ 71{
72 int young; 72 int young;
73#ifndef CONFIG_TRANSPARENT_HUGEPAGE 73#ifdef CONFIG_TRANSPARENT_HUGEPAGE
74 VM_BUG_ON(address & ~HPAGE_PMD_MASK);
75#else
74 BUG(); 76 BUG();
75#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ 77#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
76 VM_BUG_ON(address & ~HPAGE_PMD_MASK);
77 young = pmdp_test_and_clear_young(vma, address, pmdp); 78 young = pmdp_test_and_clear_young(vma, address, pmdp);
78 if (young) 79 if (young)
79 flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); 80 flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
diff --git a/mm/process_vm_access.c b/mm/process_vm_access.c
index e920aa3ce10..c20ff48994c 100644
--- a/mm/process_vm_access.c
+++ b/mm/process_vm_access.c
@@ -298,23 +298,18 @@ static ssize_t process_vm_rw_core(pid_t pid, const struct iovec *lvec,
298 goto free_proc_pages; 298 goto free_proc_pages;
299 } 299 }
300 300
301 task_lock(task); 301 mm = mm_access(task, PTRACE_MODE_ATTACH);
302 if (__ptrace_may_access(task, PTRACE_MODE_ATTACH)) { 302 if (!mm || IS_ERR(mm)) {
303 task_unlock(task); 303 rc = IS_ERR(mm) ? PTR_ERR(mm) : -ESRCH;
304 rc = -EPERM; 304 /*
305 goto put_task_struct; 305 * Explicitly map EACCES to EPERM as EPERM is a more
306 } 306 * appropriate error code for process_vm_readv/writev
307 mm = task->mm; 307 */
308 308 if (rc == -EACCES)
309 if (!mm || (task->flags & PF_KTHREAD)) { 309 rc = -EPERM;
310 task_unlock(task);
311 rc = -EINVAL;
312 goto put_task_struct; 310 goto put_task_struct;
313 } 311 }
314 312
315 atomic_inc(&mm->mm_users);
316 task_unlock(task);
317
318 for (i = 0; i < riovcnt && iov_l_curr_idx < liovcnt; i++) { 313 for (i = 0; i < riovcnt && iov_l_curr_idx < liovcnt; i++) {
319 rc = process_vm_rw_single_vec( 314 rc = process_vm_rw_single_vec(
320 (unsigned long)rvec[i].iov_base, rvec[i].iov_len, 315 (unsigned long)rvec[i].iov_base, rvec[i].iov_len,
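The rewrite above leans on the mm_access() helper: it performs the ptrace permission check and, on success, returns the target mm with a reference held. A condensed sketch of the calling convention as relied on here, with the balancing mmput() that falls outside the quoted context (assumed, not shown in this hunk):

	struct mm_struct *mm;

	mm = mm_access(task, PTRACE_MODE_ATTACH);
	if (!mm || IS_ERR(mm)) {
		/* NULL: no mm to attach to; ERR_PTR: permission denied */
		rc = IS_ERR(mm) ? PTR_ERR(mm) : -ESRCH;
		goto out;
	}

	/* ... read from / write to the remote mm ... */

	mmput(mm);	/* drop the reference mm_access() took for us */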
diff --git a/mm/rmap.c b/mm/rmap.c
index c8454e06b6c..5b5ad584ffb 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -120,6 +120,21 @@ static void anon_vma_chain_free(struct anon_vma_chain *anon_vma_chain)
120 kmem_cache_free(anon_vma_chain_cachep, anon_vma_chain); 120 kmem_cache_free(anon_vma_chain_cachep, anon_vma_chain);
121} 121}
122 122
123static void anon_vma_chain_link(struct vm_area_struct *vma,
124 struct anon_vma_chain *avc,
125 struct anon_vma *anon_vma)
126{
127 avc->vma = vma;
128 avc->anon_vma = anon_vma;
129 list_add(&avc->same_vma, &vma->anon_vma_chain);
130
131 /*
132 * It's critical to add new vmas to the tail of the anon_vma,
133 * see comment in huge_memory.c:__split_huge_page().
134 */
135 list_add_tail(&avc->same_anon_vma, &anon_vma->head);
136}
137
123/** 138/**
124 * anon_vma_prepare - attach an anon_vma to a memory region 139 * anon_vma_prepare - attach an anon_vma to a memory region
125 * @vma: the memory region in question 140 * @vma: the memory region in question
@@ -175,10 +190,7 @@ int anon_vma_prepare(struct vm_area_struct *vma)
175 spin_lock(&mm->page_table_lock); 190 spin_lock(&mm->page_table_lock);
176 if (likely(!vma->anon_vma)) { 191 if (likely(!vma->anon_vma)) {
177 vma->anon_vma = anon_vma; 192 vma->anon_vma = anon_vma;
178 avc->anon_vma = anon_vma; 193 anon_vma_chain_link(vma, avc, anon_vma);
179 avc->vma = vma;
180 list_add(&avc->same_vma, &vma->anon_vma_chain);
181 list_add_tail(&avc->same_anon_vma, &anon_vma->head);
182 allocated = NULL; 194 allocated = NULL;
183 avc = NULL; 195 avc = NULL;
184 } 196 }
@@ -224,21 +236,6 @@ static inline void unlock_anon_vma_root(struct anon_vma *root)
224 mutex_unlock(&root->mutex); 236 mutex_unlock(&root->mutex);
225} 237}
226 238
227static void anon_vma_chain_link(struct vm_area_struct *vma,
228 struct anon_vma_chain *avc,
229 struct anon_vma *anon_vma)
230{
231 avc->vma = vma;
232 avc->anon_vma = anon_vma;
233 list_add(&avc->same_vma, &vma->anon_vma_chain);
234
235 /*
236 * It's critical to add new vmas to the tail of the anon_vma,
237 * see comment in huge_memory.c:__split_huge_page().
238 */
239 list_add_tail(&avc->same_anon_vma, &anon_vma->head);
240}
241
242/* 239/*
243 * Attach the anon_vmas from src to dst. 240 * Attach the anon_vmas from src to dst.
244 * Returns 0 on success, -ENOMEM on failure. 241 * Returns 0 on success, -ENOMEM on failure.
@@ -1151,10 +1148,15 @@ void page_add_new_anon_rmap(struct page *page,
1151 */ 1148 */
1152void page_add_file_rmap(struct page *page) 1149void page_add_file_rmap(struct page *page)
1153{ 1150{
1151 bool locked;
1152 unsigned long flags;
1153
1154 mem_cgroup_begin_update_page_stat(page, &locked, &flags);
1154 if (atomic_inc_and_test(&page->_mapcount)) { 1155 if (atomic_inc_and_test(&page->_mapcount)) {
1155 __inc_zone_page_state(page, NR_FILE_MAPPED); 1156 __inc_zone_page_state(page, NR_FILE_MAPPED);
1156 mem_cgroup_inc_page_stat(page, MEMCG_NR_FILE_MAPPED); 1157 mem_cgroup_inc_page_stat(page, MEMCG_NR_FILE_MAPPED);
1157 } 1158 }
1159 mem_cgroup_end_update_page_stat(page, &locked, &flags);
1158} 1160}
1159 1161
1160/** 1162/**
@@ -1165,9 +1167,21 @@ void page_add_file_rmap(struct page *page)
1165 */ 1167 */
1166void page_remove_rmap(struct page *page) 1168void page_remove_rmap(struct page *page)
1167{ 1169{
1170 bool anon = PageAnon(page);
1171 bool locked;
1172 unsigned long flags;
1173
1174 /*
1175 * The anon case has no mem_cgroup page_stat to update; but may
1176 * uncharge_page() below, where the lock ordering can deadlock if
1177 * we hold the lock against page_stat move: so avoid it on anon.
1178 */
1179 if (!anon)
1180 mem_cgroup_begin_update_page_stat(page, &locked, &flags);
1181
1168 /* page still mapped by someone else? */ 1182 /* page still mapped by someone else? */
1169 if (!atomic_add_negative(-1, &page->_mapcount)) 1183 if (!atomic_add_negative(-1, &page->_mapcount))
1170 return; 1184 goto out;
1171 1185
1172 /* 1186 /*
1173 * Now that the last pte has gone, s390 must transfer dirty 1187 * Now that the last pte has gone, s390 must transfer dirty
@@ -1176,7 +1190,7 @@ void page_remove_rmap(struct page *page)
1176 * not if it's in swapcache - there might be another pte slot 1190 * not if it's in swapcache - there might be another pte slot
1177 * containing the swap entry, but page not yet written to swap. 1191 * containing the swap entry, but page not yet written to swap.
1178 */ 1192 */
1179 if ((!PageAnon(page) || PageSwapCache(page)) && 1193 if ((!anon || PageSwapCache(page)) &&
1180 page_test_and_clear_dirty(page_to_pfn(page), 1)) 1194 page_test_and_clear_dirty(page_to_pfn(page), 1))
1181 set_page_dirty(page); 1195 set_page_dirty(page);
1182 /* 1196 /*
@@ -1184,8 +1198,8 @@ void page_remove_rmap(struct page *page)
1184 * and not charged by memcg for now. 1198 * and not charged by memcg for now.
1185 */ 1199 */
1186 if (unlikely(PageHuge(page))) 1200 if (unlikely(PageHuge(page)))
1187 return; 1201 goto out;
1188 if (PageAnon(page)) { 1202 if (anon) {
1189 mem_cgroup_uncharge_page(page); 1203 mem_cgroup_uncharge_page(page);
1190 if (!PageTransHuge(page)) 1204 if (!PageTransHuge(page))
1191 __dec_zone_page_state(page, NR_ANON_PAGES); 1205 __dec_zone_page_state(page, NR_ANON_PAGES);
@@ -1205,6 +1219,9 @@ void page_remove_rmap(struct page *page)
1205 * Leaving it set also helps swapoff to reinstate ptes 1219 * Leaving it set also helps swapoff to reinstate ptes
1206 * faster for those pages still in swapcache. 1220 * faster for those pages still in swapcache.
1207 */ 1221 */
1222out:
1223 if (!anon)
1224 mem_cgroup_end_update_page_stat(page, &locked, &flags);
1208} 1225}
1209 1226
1210/* 1227/*
@@ -1282,7 +1299,7 @@ int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
1282 } 1299 }
1283 dec_mm_counter(mm, MM_ANONPAGES); 1300 dec_mm_counter(mm, MM_ANONPAGES);
1284 inc_mm_counter(mm, MM_SWAPENTS); 1301 inc_mm_counter(mm, MM_SWAPENTS);
1285 } else if (PAGE_MIGRATION) { 1302 } else if (IS_ENABLED(CONFIG_MIGRATION)) {
1286 /* 1303 /*
1287 * Store the pfn of the page in a special migration 1304 * Store the pfn of the page in a special migration
1288 * pte. do_swap_page() will wait until the migration 1305 * pte. do_swap_page() will wait until the migration
@@ -1293,7 +1310,8 @@ int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
1293 } 1310 }
1294 set_pte_at(mm, address, pte, swp_entry_to_pte(entry)); 1311 set_pte_at(mm, address, pte, swp_entry_to_pte(entry));
1295 BUG_ON(pte_file(*pte)); 1312 BUG_ON(pte_file(*pte));
1296 } else if (PAGE_MIGRATION && (TTU_ACTION(flags) == TTU_MIGRATION)) { 1313 } else if (IS_ENABLED(CONFIG_MIGRATION) &&
1314 (TTU_ACTION(flags) == TTU_MIGRATION)) {
1297 /* Establish migration entry for a file page */ 1315 /* Establish migration entry for a file page */
1298 swp_entry_t entry; 1316 swp_entry_t entry;
1299 entry = make_migration_entry(page, pte_write(pteval)); 1317 entry = make_migration_entry(page, pte_write(pteval));
@@ -1499,7 +1517,7 @@ static int try_to_unmap_anon(struct page *page, enum ttu_flags flags)
1499 * locking requirements of exec(), migration skips 1517 * locking requirements of exec(), migration skips
1500 * temporary VMAs until after exec() completes. 1518 * temporary VMAs until after exec() completes.
1501 */ 1519 */
1502 if (PAGE_MIGRATION && (flags & TTU_MIGRATION) && 1520 if (IS_ENABLED(CONFIG_MIGRATION) && (flags & TTU_MIGRATION) &&
1503 is_vma_temporary_stack(vma)) 1521 is_vma_temporary_stack(vma))
1504 continue; 1522 continue;
1505 1523
diff --git a/mm/shmem.c b/mm/shmem.c
index 269d049294a..f99ff3e50bd 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -127,7 +127,7 @@ static inline struct shmem_sb_info *SHMEM_SB(struct super_block *sb)
127static inline int shmem_acct_size(unsigned long flags, loff_t size) 127static inline int shmem_acct_size(unsigned long flags, loff_t size)
128{ 128{
129 return (flags & VM_NORESERVE) ? 129 return (flags & VM_NORESERVE) ?
130 0 : security_vm_enough_memory_kern(VM_ACCT(size)); 130 0 : security_vm_enough_memory_mm(current->mm, VM_ACCT(size));
131} 131}
132 132
133static inline void shmem_unacct_size(unsigned long flags, loff_t size) 133static inline void shmem_unacct_size(unsigned long flags, loff_t size)
@@ -145,7 +145,7 @@ static inline void shmem_unacct_size(unsigned long flags, loff_t size)
145static inline int shmem_acct_block(unsigned long flags) 145static inline int shmem_acct_block(unsigned long flags)
146{ 146{
147 return (flags & VM_NORESERVE) ? 147 return (flags & VM_NORESERVE) ?
148 security_vm_enough_memory_kern(VM_ACCT(PAGE_CACHE_SIZE)) : 0; 148 security_vm_enough_memory_mm(current->mm, VM_ACCT(PAGE_CACHE_SIZE)) : 0;
149} 149}
150 150
151static inline void shmem_unacct_blocks(unsigned long flags, long pages) 151static inline void shmem_unacct_blocks(unsigned long flags, long pages)
@@ -1178,6 +1178,12 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode
1178static const struct inode_operations shmem_symlink_inode_operations; 1178static const struct inode_operations shmem_symlink_inode_operations;
1179static const struct inode_operations shmem_short_symlink_operations; 1179static const struct inode_operations shmem_short_symlink_operations;
1180 1180
1181#ifdef CONFIG_TMPFS_XATTR
1182static int shmem_initxattrs(struct inode *, const struct xattr *, void *);
1183#else
1184#define shmem_initxattrs NULL
1185#endif
1186
1181static int 1187static int
1182shmem_write_begin(struct file *file, struct address_space *mapping, 1188shmem_write_begin(struct file *file, struct address_space *mapping,
1183 loff_t pos, unsigned len, unsigned flags, 1189 loff_t pos, unsigned len, unsigned flags,
@@ -1490,7 +1496,7 @@ shmem_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev)
1490 if (inode) { 1496 if (inode) {
1491 error = security_inode_init_security(inode, dir, 1497 error = security_inode_init_security(inode, dir,
1492 &dentry->d_name, 1498 &dentry->d_name,
1493 NULL, NULL); 1499 shmem_initxattrs, NULL);
1494 if (error) { 1500 if (error) {
1495 if (error != -EOPNOTSUPP) { 1501 if (error != -EOPNOTSUPP) {
1496 iput(inode); 1502 iput(inode);
@@ -1630,7 +1636,7 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s
1630 return -ENOSPC; 1636 return -ENOSPC;
1631 1637
1632 error = security_inode_init_security(inode, dir, &dentry->d_name, 1638 error = security_inode_init_security(inode, dir, &dentry->d_name,
1633 NULL, NULL); 1639 shmem_initxattrs, NULL);
1634 if (error) { 1640 if (error) {
1635 if (error != -EOPNOTSUPP) { 1641 if (error != -EOPNOTSUPP) {
1636 iput(inode); 1642 iput(inode);
@@ -1656,9 +1662,9 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s
1656 } 1662 }
1657 inode->i_mapping->a_ops = &shmem_aops; 1663 inode->i_mapping->a_ops = &shmem_aops;
1658 inode->i_op = &shmem_symlink_inode_operations; 1664 inode->i_op = &shmem_symlink_inode_operations;
1659 kaddr = kmap_atomic(page, KM_USER0); 1665 kaddr = kmap_atomic(page);
1660 memcpy(kaddr, symname, len); 1666 memcpy(kaddr, symname, len);
1661 kunmap_atomic(kaddr, KM_USER0); 1667 kunmap_atomic(kaddr);
1662 set_page_dirty(page); 1668 set_page_dirty(page);
1663 unlock_page(page); 1669 unlock_page(page);
1664 page_cache_release(page); 1670 page_cache_release(page);
@@ -1704,6 +1710,66 @@ static void shmem_put_link(struct dentry *dentry, struct nameidata *nd, void *co
1704 * filesystem level, though. 1710 * filesystem level, though.
1705 */ 1711 */
1706 1712
1713/*
1714 * Allocate a new xattr and copy in the value; leave the name to the callers.
1715 */
1716static struct shmem_xattr *shmem_xattr_alloc(const void *value, size_t size)
1717{
1718 struct shmem_xattr *new_xattr;
1719 size_t len;
1720
1721 /* wrap around? */
1722 len = sizeof(*new_xattr) + size;
1723 if (len <= sizeof(*new_xattr))
1724 return NULL;
1725
1726 new_xattr = kmalloc(len, GFP_KERNEL);
1727 if (!new_xattr)
1728 return NULL;
1729
1730 new_xattr->size = size;
1731 memcpy(new_xattr->value, value, size);
1732 return new_xattr;
1733}
1734
1735/*
1736 * Callback for security_inode_init_security() for acquiring xattrs.
1737 */
1738static int shmem_initxattrs(struct inode *inode,
1739 const struct xattr *xattr_array,
1740 void *fs_info)
1741{
1742 struct shmem_inode_info *info = SHMEM_I(inode);
1743 const struct xattr *xattr;
1744 struct shmem_xattr *new_xattr;
1745 size_t len;
1746
1747 for (xattr = xattr_array; xattr->name != NULL; xattr++) {
1748 new_xattr = shmem_xattr_alloc(xattr->value, xattr->value_len);
1749 if (!new_xattr)
1750 return -ENOMEM;
1751
1752 len = strlen(xattr->name) + 1;
1753 new_xattr->name = kmalloc(XATTR_SECURITY_PREFIX_LEN + len,
1754 GFP_KERNEL);
1755 if (!new_xattr->name) {
1756 kfree(new_xattr);
1757 return -ENOMEM;
1758 }
1759
1760 memcpy(new_xattr->name, XATTR_SECURITY_PREFIX,
1761 XATTR_SECURITY_PREFIX_LEN);
1762 memcpy(new_xattr->name + XATTR_SECURITY_PREFIX_LEN,
1763 xattr->name, len);
1764
1765 spin_lock(&info->lock);
1766 list_add(&new_xattr->list, &info->xattr_list);
1767 spin_unlock(&info->lock);
1768 }
1769
1770 return 0;
1771}
1772
1707static int shmem_xattr_get(struct dentry *dentry, const char *name, 1773static int shmem_xattr_get(struct dentry *dentry, const char *name,
1708 void *buffer, size_t size) 1774 void *buffer, size_t size)
1709{ 1775{
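The new shmem_xattr_alloc() keeps the old "wrap around?" guard: if sizeof(header) + size overflows, the sum wraps and ends up no larger than the header alone, so the allocation is refused rather than undersized. A stand-alone illustration (editorial example, not kernel code; struct hdr stands in for struct shmem_xattr):

#include <stdio.h>
#include <stddef.h>

/* header plus flexible payload, like struct shmem_xattr */
struct hdr {
	size_t size;
	char value[];
};

int main(void)
{
	size_t huge = (size_t)-1;		/* hostile payload length */
	size_t len = sizeof(struct hdr) + huge;	/* unsigned add wraps around */

	if (len <= sizeof(struct hdr))
		printf("overflow detected: refuse the allocation\n");
	else
		printf("len = %zu is safe to allocate\n", len);
	return 0;
}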
@@ -1731,24 +1797,17 @@ static int shmem_xattr_get(struct dentry *dentry, const char *name,
1731 return ret; 1797 return ret;
1732} 1798}
1733 1799
1734static int shmem_xattr_set(struct dentry *dentry, const char *name, 1800static int shmem_xattr_set(struct inode *inode, const char *name,
1735 const void *value, size_t size, int flags) 1801 const void *value, size_t size, int flags)
1736{ 1802{
1737 struct inode *inode = dentry->d_inode;
1738 struct shmem_inode_info *info = SHMEM_I(inode); 1803 struct shmem_inode_info *info = SHMEM_I(inode);
1739 struct shmem_xattr *xattr; 1804 struct shmem_xattr *xattr;
1740 struct shmem_xattr *new_xattr = NULL; 1805 struct shmem_xattr *new_xattr = NULL;
1741 size_t len;
1742 int err = 0; 1806 int err = 0;
1743 1807
1744 /* value == NULL means remove */ 1808 /* value == NULL means remove */
1745 if (value) { 1809 if (value) {
1746 /* wrap around? */ 1810 new_xattr = shmem_xattr_alloc(value, size);
1747 len = sizeof(*new_xattr) + size;
1748 if (len <= sizeof(*new_xattr))
1749 return -ENOMEM;
1750
1751 new_xattr = kmalloc(len, GFP_KERNEL);
1752 if (!new_xattr) 1811 if (!new_xattr)
1753 return -ENOMEM; 1812 return -ENOMEM;
1754 1813
@@ -1757,9 +1816,6 @@ static int shmem_xattr_set(struct dentry *dentry, const char *name,
1757 kfree(new_xattr); 1816 kfree(new_xattr);
1758 return -ENOMEM; 1817 return -ENOMEM;
1759 } 1818 }
1760
1761 new_xattr->size = size;
1762 memcpy(new_xattr->value, value, size);
1763 } 1819 }
1764 1820
1765 spin_lock(&info->lock); 1821 spin_lock(&info->lock);
@@ -1858,7 +1914,7 @@ static int shmem_setxattr(struct dentry *dentry, const char *name,
1858 if (size == 0) 1914 if (size == 0)
1859 value = ""; /* empty EA, do not remove */ 1915 value = ""; /* empty EA, do not remove */
1860 1916
1861 return shmem_xattr_set(dentry, name, value, size, flags); 1917 return shmem_xattr_set(dentry->d_inode, name, value, size, flags);
1862 1918
1863} 1919}
1864 1920
@@ -1878,7 +1934,7 @@ static int shmem_removexattr(struct dentry *dentry, const char *name)
1878 if (err) 1934 if (err)
1879 return err; 1935 return err;
1880 1936
1881 return shmem_xattr_set(dentry, name, NULL, 0, XATTR_REPLACE); 1937 return shmem_xattr_set(dentry->d_inode, name, NULL, 0, XATTR_REPLACE);
1882} 1938}
1883 1939
1884static bool xattr_is_trusted(const char *name) 1940static bool xattr_is_trusted(const char *name)
@@ -2175,7 +2231,6 @@ static void shmem_put_super(struct super_block *sb)
2175int shmem_fill_super(struct super_block *sb, void *data, int silent) 2231int shmem_fill_super(struct super_block *sb, void *data, int silent)
2176{ 2232{
2177 struct inode *inode; 2233 struct inode *inode;
2178 struct dentry *root;
2179 struct shmem_sb_info *sbinfo; 2234 struct shmem_sb_info *sbinfo;
2180 int err = -ENOMEM; 2235 int err = -ENOMEM;
2181 2236
@@ -2232,14 +2287,11 @@ int shmem_fill_super(struct super_block *sb, void *data, int silent)
2232 goto failed; 2287 goto failed;
2233 inode->i_uid = sbinfo->uid; 2288 inode->i_uid = sbinfo->uid;
2234 inode->i_gid = sbinfo->gid; 2289 inode->i_gid = sbinfo->gid;
2235 root = d_alloc_root(inode); 2290 sb->s_root = d_make_root(inode);
2236 if (!root) 2291 if (!sb->s_root)
2237 goto failed_iput; 2292 goto failed;
2238 sb->s_root = root;
2239 return 0; 2293 return 0;
2240 2294
2241failed_iput:
2242 iput(inode);
2243failed: 2295failed:
2244 shmem_put_super(sb); 2296 shmem_put_super(sb);
2245 return err; 2297 return err;
diff --git a/mm/slab.c b/mm/slab.c
index f0bd7857ab3..29c8716eb7a 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -3284,12 +3284,10 @@ static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags)
3284 if (in_interrupt() || (flags & __GFP_THISNODE)) 3284 if (in_interrupt() || (flags & __GFP_THISNODE))
3285 return NULL; 3285 return NULL;
3286 nid_alloc = nid_here = numa_mem_id(); 3286 nid_alloc = nid_here = numa_mem_id();
3287 get_mems_allowed();
3288 if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD)) 3287 if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD))
3289 nid_alloc = cpuset_slab_spread_node(); 3288 nid_alloc = cpuset_slab_spread_node();
3290 else if (current->mempolicy) 3289 else if (current->mempolicy)
3291 nid_alloc = slab_node(current->mempolicy); 3290 nid_alloc = slab_node(current->mempolicy);
3292 put_mems_allowed();
3293 if (nid_alloc != nid_here) 3291 if (nid_alloc != nid_here)
3294 return ____cache_alloc_node(cachep, flags, nid_alloc); 3292 return ____cache_alloc_node(cachep, flags, nid_alloc);
3295 return NULL; 3293 return NULL;
@@ -3312,14 +3310,17 @@ static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags)
3312 enum zone_type high_zoneidx = gfp_zone(flags); 3310 enum zone_type high_zoneidx = gfp_zone(flags);
3313 void *obj = NULL; 3311 void *obj = NULL;
3314 int nid; 3312 int nid;
3313 unsigned int cpuset_mems_cookie;
3315 3314
3316 if (flags & __GFP_THISNODE) 3315 if (flags & __GFP_THISNODE)
3317 return NULL; 3316 return NULL;
3318 3317
3319 get_mems_allowed();
3320 zonelist = node_zonelist(slab_node(current->mempolicy), flags);
3321 local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK); 3318 local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK);
3322 3319
3320retry_cpuset:
3321 cpuset_mems_cookie = get_mems_allowed();
3322 zonelist = node_zonelist(slab_node(current->mempolicy), flags);
3323
3323retry: 3324retry:
3324 /* 3325 /*
3325 * Look through allowed nodes for objects available 3326 * Look through allowed nodes for objects available
@@ -3372,7 +3373,9 @@ retry:
3372 } 3373 }
3373 } 3374 }
3374 } 3375 }
3375 put_mems_allowed(); 3376
3377 if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !obj))
3378 goto retry_cpuset;
3376 return obj; 3379 return obj;
3377} 3380}
3378 3381
diff --git a/mm/slub.c b/mm/slub.c
index 4907563ef7f..f4a6229848f 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1581,6 +1581,7 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags,
1581 struct zone *zone; 1581 struct zone *zone;
1582 enum zone_type high_zoneidx = gfp_zone(flags); 1582 enum zone_type high_zoneidx = gfp_zone(flags);
1583 void *object; 1583 void *object;
1584 unsigned int cpuset_mems_cookie;
1584 1585
1585 /* 1586 /*
1586 * The defrag ratio allows a configuration of the tradeoffs between 1587 * The defrag ratio allows a configuration of the tradeoffs between
@@ -1604,23 +1605,32 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags,
1604 get_cycles() % 1024 > s->remote_node_defrag_ratio) 1605 get_cycles() % 1024 > s->remote_node_defrag_ratio)
1605 return NULL; 1606 return NULL;
1606 1607
1607 get_mems_allowed(); 1608 do {
1608 zonelist = node_zonelist(slab_node(current->mempolicy), flags); 1609 cpuset_mems_cookie = get_mems_allowed();
1609 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { 1610 zonelist = node_zonelist(slab_node(current->mempolicy), flags);
1610 struct kmem_cache_node *n; 1611 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
1611 1612 struct kmem_cache_node *n;
1612 n = get_node(s, zone_to_nid(zone)); 1613
1613 1614 n = get_node(s, zone_to_nid(zone));
1614 if (n && cpuset_zone_allowed_hardwall(zone, flags) && 1615
1615 n->nr_partial > s->min_partial) { 1616 if (n && cpuset_zone_allowed_hardwall(zone, flags) &&
1616 object = get_partial_node(s, n, c); 1617 n->nr_partial > s->min_partial) {
1617 if (object) { 1618 object = get_partial_node(s, n, c);
1618 put_mems_allowed(); 1619 if (object) {
1619 return object; 1620 /*
1621 * Return the object even if
1622 * put_mems_allowed indicated that
1623 * the cpuset mems_allowed was
1624 * updated in parallel. It's a
1625 * harmless race between the alloc
1626 * and the cpuset update.
1627 */
1628 put_mems_allowed(cpuset_mems_cookie);
1629 return object;
1630 }
1620 } 1631 }
1621 } 1632 }
1622 } 1633 } while (!put_mems_allowed(cpuset_mems_cookie));
1623 put_mems_allowed();
1624#endif 1634#endif
1625 return NULL; 1635 return NULL;
1626} 1636}
diff --git a/mm/sparse.c b/mm/sparse.c
index 61d7cde2311..a8bc7d364de 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -353,29 +353,21 @@ static void __init sparse_early_usemaps_alloc_node(unsigned long**usemap_map,
353 353
354 usemap = sparse_early_usemaps_alloc_pgdat_section(NODE_DATA(nodeid), 354 usemap = sparse_early_usemaps_alloc_pgdat_section(NODE_DATA(nodeid),
355 usemap_count); 355 usemap_count);
356 if (usemap) { 356 if (!usemap) {
357 for (pnum = pnum_begin; pnum < pnum_end; pnum++) { 357 usemap = alloc_bootmem_node(NODE_DATA(nodeid), size * usemap_count);
358 if (!present_section_nr(pnum)) 358 if (!usemap) {
359 continue; 359 printk(KERN_WARNING "%s: allocation failed\n", __func__);
360 usemap_map[pnum] = usemap; 360 return;
361 usemap += size;
362 } 361 }
363 return;
364 } 362 }
365 363
366 usemap = alloc_bootmem_node(NODE_DATA(nodeid), size * usemap_count); 364 for (pnum = pnum_begin; pnum < pnum_end; pnum++) {
367 if (usemap) { 365 if (!present_section_nr(pnum))
368 for (pnum = pnum_begin; pnum < pnum_end; pnum++) { 366 continue;
369 if (!present_section_nr(pnum)) 367 usemap_map[pnum] = usemap;
370 continue; 368 usemap += size;
371 usemap_map[pnum] = usemap; 369 check_usemap_section_nr(nodeid, usemap_map[pnum]);
372 usemap += size;
373 check_usemap_section_nr(nodeid, usemap_map[pnum]);
374 }
375 return;
376 } 370 }
377
378 printk(KERN_WARNING "%s: allocation failed\n", __func__);
379} 371}
380 372
381#ifndef CONFIG_SPARSEMEM_VMEMMAP 373#ifndef CONFIG_SPARSEMEM_VMEMMAP
diff --git a/mm/swap.c b/mm/swap.c
index b0f529b3897..5c13f133897 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -496,7 +496,7 @@ static void lru_deactivate_fn(struct page *page, void *arg)
496 * Either "cpu" is the current CPU, and preemption has already been 496 * Either "cpu" is the current CPU, and preemption has already been
497 * disabled; or "cpu" is being hot-unplugged, and is already dead. 497 * disabled; or "cpu" is being hot-unplugged, and is already dead.
498 */ 498 */
499static void drain_cpu_pagevecs(int cpu) 499void lru_add_drain_cpu(int cpu)
500{ 500{
501 struct pagevec *pvecs = per_cpu(lru_add_pvecs, cpu); 501 struct pagevec *pvecs = per_cpu(lru_add_pvecs, cpu);
502 struct pagevec *pvec; 502 struct pagevec *pvec;
@@ -553,7 +553,7 @@ void deactivate_page(struct page *page)
553 553
554void lru_add_drain(void) 554void lru_add_drain(void)
555{ 555{
556 drain_cpu_pagevecs(get_cpu()); 556 lru_add_drain_cpu(get_cpu());
557 put_cpu(); 557 put_cpu();
558} 558}
559 559
@@ -652,14 +652,14 @@ EXPORT_SYMBOL(__pagevec_release);
652void lru_add_page_tail(struct zone* zone, 652void lru_add_page_tail(struct zone* zone,
653 struct page *page, struct page *page_tail) 653 struct page *page, struct page *page_tail)
654{ 654{
655 int active; 655 int uninitialized_var(active);
656 enum lru_list lru; 656 enum lru_list lru;
657 const int file = 0; 657 const int file = 0;
658 658
659 VM_BUG_ON(!PageHead(page)); 659 VM_BUG_ON(!PageHead(page));
660 VM_BUG_ON(PageCompound(page_tail)); 660 VM_BUG_ON(PageCompound(page_tail));
661 VM_BUG_ON(PageLRU(page_tail)); 661 VM_BUG_ON(PageLRU(page_tail));
662 VM_BUG_ON(!spin_is_locked(&zone->lru_lock)); 662 VM_BUG_ON(NR_CPUS != 1 && !spin_is_locked(&zone->lru_lock));
663 663
664 SetPageLRU(page_tail); 664 SetPageLRU(page_tail);
665 665
@@ -672,7 +672,6 @@ void lru_add_page_tail(struct zone* zone,
672 active = 0; 672 active = 0;
673 lru = LRU_INACTIVE_ANON; 673 lru = LRU_INACTIVE_ANON;
674 } 674 }
675 update_page_reclaim_stat(zone, page_tail, file, active);
676 } else { 675 } else {
677 SetPageUnevictable(page_tail); 676 SetPageUnevictable(page_tail);
678 lru = LRU_UNEVICTABLE; 677 lru = LRU_UNEVICTABLE;
@@ -693,6 +692,9 @@ void lru_add_page_tail(struct zone* zone,
693 list_head = page_tail->lru.prev; 692 list_head = page_tail->lru.prev;
694 list_move_tail(&page_tail->lru, list_head); 693 list_move_tail(&page_tail->lru, list_head);
695 } 694 }
695
696 if (!PageUnevictable(page))
697 update_page_reclaim_stat(zone, page_tail, file, active);
696} 698}
697#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ 699#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
698 700
@@ -710,8 +712,8 @@ static void __pagevec_lru_add_fn(struct page *page, void *arg)
710 SetPageLRU(page); 712 SetPageLRU(page);
711 if (active) 713 if (active)
712 SetPageActive(page); 714 SetPageActive(page);
713 update_page_reclaim_stat(zone, page, file, active);
714 add_page_to_lru_list(zone, page, lru); 715 add_page_to_lru_list(zone, page, lru);
716 update_page_reclaim_stat(zone, page, file, active);
715} 717}
716 718
717/* 719/*
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 470038a9187..9d3dd3763cf 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -300,16 +300,6 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
300 new_page = alloc_page_vma(gfp_mask, vma, addr); 300 new_page = alloc_page_vma(gfp_mask, vma, addr);
301 if (!new_page) 301 if (!new_page)
302 break; /* Out of memory */ 302 break; /* Out of memory */
303 /*
304 * The memcg-specific accounting when moving
305 * pages around the LRU lists relies on the
306 * page's owner (memcg) to be valid. Usually,
307 * pages are assigned to a new owner before
308 * being put on the LRU list, but since this
309 * is not the case here, the stale owner from
310 * a previous allocation cycle must be reset.
311 */
312 mem_cgroup_reset_owner(new_page);
313 } 303 }
314 304
315 /* 305 /*
@@ -382,25 +372,23 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
382struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask, 372struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask,
383 struct vm_area_struct *vma, unsigned long addr) 373 struct vm_area_struct *vma, unsigned long addr)
384{ 374{
385 int nr_pages;
386 struct page *page; 375 struct page *page;
387 unsigned long offset; 376 unsigned long offset = swp_offset(entry);
388 unsigned long end_offset; 377 unsigned long start_offset, end_offset;
378 unsigned long mask = (1UL << page_cluster) - 1;
389 379
390 /* 380 /* Read a page_cluster sized and aligned cluster around offset. */
391 * Get starting offset for readaround, and number of pages to read. 381 start_offset = offset & ~mask;
392 * Adjust starting address by readbehind (for NUMA interleave case)? 382 end_offset = offset | mask;
393 * No, it's very unlikely that swap layout would follow vma layout, 383 if (!start_offset) /* First page is swap header. */
394 * more likely that neighbouring swap pages came from the same node: 384 start_offset++;
395 * so use the same "addr" to choose the same node for each swap read. 385
396 */ 386 for (offset = start_offset; offset <= end_offset ; offset++) {
397 nr_pages = valid_swaphandles(entry, &offset);
398 for (end_offset = offset + nr_pages; offset < end_offset; offset++) {
399 /* Ok, do the async read-ahead now */ 387 /* Ok, do the async read-ahead now */
400 page = read_swap_cache_async(swp_entry(swp_type(entry), offset), 388 page = read_swap_cache_async(swp_entry(swp_type(entry), offset),
401 gfp_mask, vma, addr); 389 gfp_mask, vma, addr);
402 if (!page) 390 if (!page)
403 break; 391 continue;
404 page_cache_release(page); 392 page_cache_release(page);
405 } 393 }
406 lru_add_drain(); /* Push any new pages onto the LRU now */ 394 lru_add_drain(); /* Push any new pages onto the LRU now */
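The rewritten swapin_readahead() above replaces valid_swaphandles() with simple mask arithmetic: it reads a 2^page_cluster aligned window around the faulting offset and skips offset 0, which holds the swap header. A small stand-alone sketch of that arithmetic (plain C, with page_cluster passed in as a parameter rather than read from the kernel tunable, and a hypothetical helper name):

#include <stdio.h>

/* Compute the readahead window [start, end] around a swap offset,
 * aligned to a 2^page_cluster sized cluster, skipping offset 0
 * (the swap header) -- mirroring the arithmetic in the hunk above. */
static void readahead_window(unsigned long offset, unsigned int page_cluster,
			     unsigned long *start, unsigned long *end)
{
	unsigned long mask = (1UL << page_cluster) - 1;

	*start = offset & ~mask;
	*end = offset | mask;
	if (*start == 0)	/* first page is the swap header */
		*start = 1;
}

int main(void)
{
	unsigned long start, end;

	readahead_window(37, 3, &start, &end);	/* 8-page cluster */
	printf("read offsets %lu..%lu\n", start, end);	/* prints 32..39 */
	return 0;
}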
diff --git a/mm/swapfile.c b/mm/swapfile.c
index d999f090dfd..dae42f380d6 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -932,9 +932,7 @@ static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud,
932 pmd = pmd_offset(pud, addr); 932 pmd = pmd_offset(pud, addr);
933 do { 933 do {
934 next = pmd_addr_end(addr, end); 934 next = pmd_addr_end(addr, end);
935 if (unlikely(pmd_trans_huge(*pmd))) 935 if (pmd_none_or_trans_huge_or_clear_bad(pmd))
936 continue;
937 if (pmd_none_or_clear_bad(pmd))
938 continue; 936 continue;
939 ret = unuse_pte_range(vma, pmd, addr, next, entry, page); 937 ret = unuse_pte_range(vma, pmd, addr, next, entry, page);
940 if (ret) 938 if (ret)
@@ -1563,6 +1561,8 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
1563 if (!capable(CAP_SYS_ADMIN)) 1561 if (!capable(CAP_SYS_ADMIN))
1564 return -EPERM; 1562 return -EPERM;
1565 1563
1564 BUG_ON(!current->mm);
1565
1566 pathname = getname(specialfile); 1566 pathname = getname(specialfile);
1567 err = PTR_ERR(pathname); 1567 err = PTR_ERR(pathname);
1568 if (IS_ERR(pathname)) 1568 if (IS_ERR(pathname))
@@ -1590,7 +1590,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
1590 spin_unlock(&swap_lock); 1590 spin_unlock(&swap_lock);
1591 goto out_dput; 1591 goto out_dput;
1592 } 1592 }
1593 if (!security_vm_enough_memory(p->pages)) 1593 if (!security_vm_enough_memory_mm(current->mm, p->pages))
1594 vm_unacct_memory(p->pages); 1594 vm_unacct_memory(p->pages);
1595 else { 1595 else {
1596 err = -ENOMEM; 1596 err = -ENOMEM;
@@ -2105,7 +2105,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
2105 p->flags |= SWP_SOLIDSTATE; 2105 p->flags |= SWP_SOLIDSTATE;
2106 p->cluster_next = 1 + (random32() % p->highest_bit); 2106 p->cluster_next = 1 + (random32() % p->highest_bit);
2107 } 2107 }
2108 if (discard_swap(p) == 0 && (swap_flags & SWAP_FLAG_DISCARD)) 2108 if ((swap_flags & SWAP_FLAG_DISCARD) && discard_swap(p) == 0)
2109 p->flags |= SWP_DISCARDABLE; 2109 p->flags |= SWP_DISCARDABLE;
2110 } 2110 }
2111 2111
@@ -2290,58 +2290,6 @@ int swapcache_prepare(swp_entry_t entry)
2290} 2290}
2291 2291
2292/* 2292/*
2293 * swap_lock prevents swap_map being freed. Don't grab an extra
2294 * reference on the swaphandle, it doesn't matter if it becomes unused.
2295 */
2296int valid_swaphandles(swp_entry_t entry, unsigned long *offset)
2297{
2298 struct swap_info_struct *si;
2299 int our_page_cluster = page_cluster;
2300 pgoff_t target, toff;
2301 pgoff_t base, end;
2302 int nr_pages = 0;
2303
2304 if (!our_page_cluster) /* no readahead */
2305 return 0;
2306
2307 si = swap_info[swp_type(entry)];
2308 target = swp_offset(entry);
2309 base = (target >> our_page_cluster) << our_page_cluster;
2310 end = base + (1 << our_page_cluster);
2311 if (!base) /* first page is swap header */
2312 base++;
2313
2314 spin_lock(&swap_lock);
2315 if (end > si->max) /* don't go beyond end of map */
2316 end = si->max;
2317
2318 /* Count contiguous allocated slots above our target */
2319 for (toff = target; ++toff < end; nr_pages++) {
2320 /* Don't read in free or bad pages */
2321 if (!si->swap_map[toff])
2322 break;
2323 if (swap_count(si->swap_map[toff]) == SWAP_MAP_BAD)
2324 break;
2325 }
2326 /* Count contiguous allocated slots below our target */
2327 for (toff = target; --toff >= base; nr_pages++) {
2328 /* Don't read in free or bad pages */
2329 if (!si->swap_map[toff])
2330 break;
2331 if (swap_count(si->swap_map[toff]) == SWAP_MAP_BAD)
2332 break;
2333 }
2334 spin_unlock(&swap_lock);
2335
2336 /*
2337 * Indicate starting offset, and return number of pages to get:
2338 * if only 1, say 0, since there's then no readahead to be done.
2339 */
2340 *offset = ++toff;
2341 return nr_pages? ++nr_pages: 0;
2342}
2343
2344/*
2345 * add_swap_count_continuation - called when a swap count is duplicated 2293 * add_swap_count_continuation - called when a swap count is duplicated
2346 * beyond SWAP_MAP_MAX, it allocates a new page and links that to the entry's 2294 * beyond SWAP_MAP_MAX, it allocates a new page and links that to the entry's
2347 * page of the original vmalloc'ed swap_map, to hold the continuation count 2295 * page of the original vmalloc'ed swap_map, to hold the continuation count
@@ -2427,9 +2375,9 @@ int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask)
2427 if (!(count & COUNT_CONTINUED)) 2375 if (!(count & COUNT_CONTINUED))
2428 goto out; 2376 goto out;
2429 2377
2430 map = kmap_atomic(list_page, KM_USER0) + offset; 2378 map = kmap_atomic(list_page) + offset;
2431 count = *map; 2379 count = *map;
2432 kunmap_atomic(map, KM_USER0); 2380 kunmap_atomic(map);
2433 2381
2434 /* 2382 /*
2435 * If this continuation count now has some space in it, 2383 * If this continuation count now has some space in it,
@@ -2472,7 +2420,7 @@ static bool swap_count_continued(struct swap_info_struct *si,
2472 2420
2473 offset &= ~PAGE_MASK; 2421 offset &= ~PAGE_MASK;
2474 page = list_entry(head->lru.next, struct page, lru); 2422 page = list_entry(head->lru.next, struct page, lru);
2475 map = kmap_atomic(page, KM_USER0) + offset; 2423 map = kmap_atomic(page) + offset;
2476 2424
2477 if (count == SWAP_MAP_MAX) /* initial increment from swap_map */ 2425 if (count == SWAP_MAP_MAX) /* initial increment from swap_map */
2478 goto init_map; /* jump over SWAP_CONT_MAX checks */ 2426 goto init_map; /* jump over SWAP_CONT_MAX checks */
@@ -2482,26 +2430,26 @@ static bool swap_count_continued(struct swap_info_struct *si,
2482 * Think of how you add 1 to 999 2430 * Think of how you add 1 to 999
2483 */ 2431 */
2484 while (*map == (SWAP_CONT_MAX | COUNT_CONTINUED)) { 2432 while (*map == (SWAP_CONT_MAX | COUNT_CONTINUED)) {
2485 kunmap_atomic(map, KM_USER0); 2433 kunmap_atomic(map);
2486 page = list_entry(page->lru.next, struct page, lru); 2434 page = list_entry(page->lru.next, struct page, lru);
2487 BUG_ON(page == head); 2435 BUG_ON(page == head);
2488 map = kmap_atomic(page, KM_USER0) + offset; 2436 map = kmap_atomic(page) + offset;
2489 } 2437 }
2490 if (*map == SWAP_CONT_MAX) { 2438 if (*map == SWAP_CONT_MAX) {
2491 kunmap_atomic(map, KM_USER0); 2439 kunmap_atomic(map);
2492 page = list_entry(page->lru.next, struct page, lru); 2440 page = list_entry(page->lru.next, struct page, lru);
2493 if (page == head) 2441 if (page == head)
2494 return false; /* add count continuation */ 2442 return false; /* add count continuation */
2495 map = kmap_atomic(page, KM_USER0) + offset; 2443 map = kmap_atomic(page) + offset;
2496init_map: *map = 0; /* we didn't zero the page */ 2444init_map: *map = 0; /* we didn't zero the page */
2497 } 2445 }
2498 *map += 1; 2446 *map += 1;
2499 kunmap_atomic(map, KM_USER0); 2447 kunmap_atomic(map);
2500 page = list_entry(page->lru.prev, struct page, lru); 2448 page = list_entry(page->lru.prev, struct page, lru);
2501 while (page != head) { 2449 while (page != head) {
2502 map = kmap_atomic(page, KM_USER0) + offset; 2450 map = kmap_atomic(page) + offset;
2503 *map = COUNT_CONTINUED; 2451 *map = COUNT_CONTINUED;
2504 kunmap_atomic(map, KM_USER0); 2452 kunmap_atomic(map);
2505 page = list_entry(page->lru.prev, struct page, lru); 2453 page = list_entry(page->lru.prev, struct page, lru);
2506 } 2454 }
2507 return true; /* incremented */ 2455 return true; /* incremented */
@@ -2512,22 +2460,22 @@ init_map: *map = 0; /* we didn't zero the page */
2512 */ 2460 */
2513 BUG_ON(count != COUNT_CONTINUED); 2461 BUG_ON(count != COUNT_CONTINUED);
2514 while (*map == COUNT_CONTINUED) { 2462 while (*map == COUNT_CONTINUED) {
2515 kunmap_atomic(map, KM_USER0); 2463 kunmap_atomic(map);
2516 page = list_entry(page->lru.next, struct page, lru); 2464 page = list_entry(page->lru.next, struct page, lru);
2517 BUG_ON(page == head); 2465 BUG_ON(page == head);
2518 map = kmap_atomic(page, KM_USER0) + offset; 2466 map = kmap_atomic(page) + offset;
2519 } 2467 }
2520 BUG_ON(*map == 0); 2468 BUG_ON(*map == 0);
2521 *map -= 1; 2469 *map -= 1;
2522 if (*map == 0) 2470 if (*map == 0)
2523 count = 0; 2471 count = 0;
2524 kunmap_atomic(map, KM_USER0); 2472 kunmap_atomic(map);
2525 page = list_entry(page->lru.prev, struct page, lru); 2473 page = list_entry(page->lru.prev, struct page, lru);
2526 while (page != head) { 2474 while (page != head) {
2527 map = kmap_atomic(page, KM_USER0) + offset; 2475 map = kmap_atomic(page) + offset;
2528 *map = SWAP_CONT_MAX | count; 2476 *map = SWAP_CONT_MAX | count;
2529 count = COUNT_CONTINUED; 2477 count = COUNT_CONTINUED;
2530 kunmap_atomic(map, KM_USER0); 2478 kunmap_atomic(map);
2531 page = list_entry(page->lru.prev, struct page, lru); 2479 page = list_entry(page->lru.prev, struct page, lru);
2532 } 2480 }
2533 return count == COUNT_CONTINUED; 2481 return count == COUNT_CONTINUED;
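The swap-continuation changes in this hunk are mostly mechanical: the series converts callers to the atomic kmap API without the KM_USER0 slot argument. A minimal sketch of the updated call pattern, using a hypothetical helper (kernel C, not part of the patch):

#include <linux/highmem.h>

/* Hypothetical helper: bump one byte in a possibly-highmem page using
 * the slot-less atomic kmap API that this series converts callers to. */
static void bump_count_byte(struct page *page, unsigned int offset)
{
	unsigned char *map;

	map = kmap_atomic(page);	/* was: kmap_atomic(page, KM_USER0) */
	map[offset] += 1;
	kunmap_atomic(map);		/* was: kunmap_atomic(map, KM_USER0) */
}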
diff --git a/mm/truncate.c b/mm/truncate.c
index 632b15e29f7..18aded3a89f 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -52,7 +52,7 @@ void do_invalidatepage(struct page *page, unsigned long offset)
52static inline void truncate_partial_page(struct page *page, unsigned partial) 52static inline void truncate_partial_page(struct page *page, unsigned partial)
53{ 53{
54 zero_user_segment(page, partial, PAGE_CACHE_SIZE); 54 zero_user_segment(page, partial, PAGE_CACHE_SIZE);
55 cleancache_flush_page(page->mapping, page); 55 cleancache_invalidate_page(page->mapping, page);
56 if (page_has_private(page)) 56 if (page_has_private(page))
57 do_invalidatepage(page, partial); 57 do_invalidatepage(page, partial);
58} 58}
@@ -184,7 +184,7 @@ int invalidate_inode_page(struct page *page)
184} 184}
185 185
186/** 186/**
187 * truncate_inode_pages - truncate range of pages specified by start & end byte offsets 187 * truncate_inode_pages_range - truncate range of pages specified by start & end byte offsets
188 * @mapping: mapping to truncate 188 * @mapping: mapping to truncate
189 * @lstart: offset from which to truncate 189 * @lstart: offset from which to truncate
190 * @lend: offset to which to truncate 190 * @lend: offset to which to truncate
@@ -213,7 +213,7 @@ void truncate_inode_pages_range(struct address_space *mapping,
213 pgoff_t end; 213 pgoff_t end;
214 int i; 214 int i;
215 215
216 cleancache_flush_inode(mapping); 216 cleancache_invalidate_inode(mapping);
217 if (mapping->nrpages == 0) 217 if (mapping->nrpages == 0)
218 return; 218 return;
219 219
@@ -292,7 +292,7 @@ void truncate_inode_pages_range(struct address_space *mapping,
292 mem_cgroup_uncharge_end(); 292 mem_cgroup_uncharge_end();
293 index++; 293 index++;
294 } 294 }
295 cleancache_flush_inode(mapping); 295 cleancache_invalidate_inode(mapping);
296} 296}
297EXPORT_SYMBOL(truncate_inode_pages_range); 297EXPORT_SYMBOL(truncate_inode_pages_range);
298 298
@@ -444,7 +444,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
444 int ret2 = 0; 444 int ret2 = 0;
445 int did_range_unmap = 0; 445 int did_range_unmap = 0;
446 446
447 cleancache_flush_inode(mapping); 447 cleancache_invalidate_inode(mapping);
448 pagevec_init(&pvec, 0); 448 pagevec_init(&pvec, 0);
449 index = start; 449 index = start;
450 while (index <= end && pagevec_lookup(&pvec, mapping, index, 450 while (index <= end && pagevec_lookup(&pvec, mapping, index,
@@ -500,7 +500,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
500 cond_resched(); 500 cond_resched();
501 index++; 501 index++;
502 } 502 }
503 cleancache_flush_inode(mapping); 503 cleancache_invalidate_inode(mapping);
504 return ret; 504 return ret;
505} 505}
506EXPORT_SYMBOL_GPL(invalidate_inode_pages2_range); 506EXPORT_SYMBOL_GPL(invalidate_inode_pages2_range);
diff --git a/mm/util.c b/mm/util.c
index 136ac4f322b..ae962b31de8 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -239,6 +239,47 @@ void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma,
239 next->vm_prev = vma; 239 next->vm_prev = vma;
240} 240}
241 241
242/* Check if the vma is being used as a stack by this task */
243static int vm_is_stack_for_task(struct task_struct *t,
244 struct vm_area_struct *vma)
245{
246 return (vma->vm_start <= KSTK_ESP(t) && vma->vm_end >= KSTK_ESP(t));
247}
248
249/*
250 * Check if the vma is being used as a stack.
251 * If is_group is non-zero, check in the entire thread group or else
252 * just check in the current task. Returns the pid of the task that
253 * the vma is stack for.
254 */
255pid_t vm_is_stack(struct task_struct *task,
256 struct vm_area_struct *vma, int in_group)
257{
258 pid_t ret = 0;
259
260 if (vm_is_stack_for_task(task, vma))
261 return task->pid;
262
263 if (in_group) {
264 struct task_struct *t;
265 rcu_read_lock();
266 if (!pid_alive(task))
267 goto done;
268
269 t = task;
270 do {
271 if (vm_is_stack_for_task(t, vma)) {
272 ret = t->pid;
273 goto done;
274 }
275 } while_each_thread(task, t);
276done:
277 rcu_read_unlock();
278 }
279
280 return ret;
281}
282
242#if defined(CONFIG_MMU) && !defined(HAVE_ARCH_PICK_MMAP_LAYOUT) 283#if defined(CONFIG_MMU) && !defined(HAVE_ARCH_PICK_MMAP_LAYOUT)
243void arch_pick_mmap_layout(struct mm_struct *mm) 284void arch_pick_mmap_layout(struct mm_struct *mm)
244{ 285{
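The vm_is_stack() helper added to mm/util.c above returns the pid of the task (or, when in_group is non-zero, of any thread in the group) whose stack the vma backs, and 0 otherwise. A hypothetical caller, assuming the declaration is exported via linux/mm.h, might wrap it like this when labelling a task's mappings:

#include <linux/mm.h>
#include <linux/sched.h>

/* Hypothetical wrapper: decide whether to tag a vma as a thread stack
 * when reporting a task's mappings; *tid is only valid on true. */
static bool vma_is_thread_stack(struct task_struct *task,
				struct vm_area_struct *vma, pid_t *tid)
{
	*tid = vm_is_stack(task, vma, 1);	/* search the whole thread group */
	return *tid != 0;
}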
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 86ce9a526c1..94dff883b44 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -1906,9 +1906,9 @@ static int aligned_vread(char *buf, char *addr, unsigned long count)
1906 * we can expect USER0 is not used (see vread/vwrite's 1906 * we can expect USER0 is not used (see vread/vwrite's
1907 * function description) 1907 * function description)
1908 */ 1908 */
1909 void *map = kmap_atomic(p, KM_USER0); 1909 void *map = kmap_atomic(p);
1910 memcpy(buf, map + offset, length); 1910 memcpy(buf, map + offset, length);
1911 kunmap_atomic(map, KM_USER0); 1911 kunmap_atomic(map);
1912 } else 1912 } else
1913 memset(buf, 0, length); 1913 memset(buf, 0, length);
1914 1914
@@ -1945,9 +1945,9 @@ static int aligned_vwrite(char *buf, char *addr, unsigned long count)
1945 * we can expect USER0 is not used (see vread/vwrite's 1945 * we can expect USER0 is not used (see vread/vwrite's
1946 * function description) 1946 * function description)
1947 */ 1947 */
1948 void *map = kmap_atomic(p, KM_USER0); 1948 void *map = kmap_atomic(p);
1949 memcpy(map + offset, buf, length); 1949 memcpy(map + offset, buf, length);
1950 kunmap_atomic(map, KM_USER0); 1950 kunmap_atomic(map);
1951 } 1951 }
1952 addr += length; 1952 addr += length;
1953 buf += length; 1953 buf += length;
diff --git a/mm/vmscan.c b/mm/vmscan.c
index c52b2355265..33c332bbab7 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1138,7 +1138,7 @@ int __isolate_lru_page(struct page *page, isolate_mode_t mode, int file)
1138 * @mz: The mem_cgroup_zone to pull pages from. 1138 * @mz: The mem_cgroup_zone to pull pages from.
1139 * @dst: The temp list to put pages on to. 1139 * @dst: The temp list to put pages on to.
1140 * @nr_scanned: The number of pages that were scanned. 1140 * @nr_scanned: The number of pages that were scanned.
1141 * @order: The caller's attempted allocation order 1141 * @sc: The scan_control struct for this reclaim session
1142 * @mode: One of the LRU isolation modes 1142 * @mode: One of the LRU isolation modes
1143 * @active: True [1] if isolating active pages 1143 * @active: True [1] if isolating active pages
1144 * @file: True [1] if isolating file [!anon] pages 1144 * @file: True [1] if isolating file [!anon] pages
@@ -1147,8 +1147,8 @@ int __isolate_lru_page(struct page *page, isolate_mode_t mode, int file)
1147 */ 1147 */
1148static unsigned long isolate_lru_pages(unsigned long nr_to_scan, 1148static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
1149 struct mem_cgroup_zone *mz, struct list_head *dst, 1149 struct mem_cgroup_zone *mz, struct list_head *dst,
1150 unsigned long *nr_scanned, int order, isolate_mode_t mode, 1150 unsigned long *nr_scanned, struct scan_control *sc,
1151 int active, int file) 1151 isolate_mode_t mode, int active, int file)
1152{ 1152{
1153 struct lruvec *lruvec; 1153 struct lruvec *lruvec;
1154 struct list_head *src; 1154 struct list_head *src;
@@ -1194,7 +1194,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
1194 BUG(); 1194 BUG();
1195 } 1195 }
1196 1196
1197 if (!order) 1197 if (!sc->order || !(sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM))
1198 continue; 1198 continue;
1199 1199
1200 /* 1200 /*
@@ -1208,8 +1208,8 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
1208 */ 1208 */
1209 zone_id = page_zone_id(page); 1209 zone_id = page_zone_id(page);
1210 page_pfn = page_to_pfn(page); 1210 page_pfn = page_to_pfn(page);
1211 pfn = page_pfn & ~((1 << order) - 1); 1211 pfn = page_pfn & ~((1 << sc->order) - 1);
1212 end_pfn = pfn + (1 << order); 1212 end_pfn = pfn + (1 << sc->order);
1213 for (; pfn < end_pfn; pfn++) { 1213 for (; pfn < end_pfn; pfn++) {
1214 struct page *cursor_page; 1214 struct page *cursor_page;
1215 1215
@@ -1275,7 +1275,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
1275 1275
1276 *nr_scanned = scan; 1276 *nr_scanned = scan;
1277 1277
1278 trace_mm_vmscan_lru_isolate(order, 1278 trace_mm_vmscan_lru_isolate(sc->order,
1279 nr_to_scan, scan, 1279 nr_to_scan, scan,
1280 nr_taken, 1280 nr_taken,
1281 nr_lumpy_taken, nr_lumpy_dirty, nr_lumpy_failed, 1281 nr_lumpy_taken, nr_lumpy_dirty, nr_lumpy_failed,
@@ -1413,7 +1413,6 @@ update_isolated_counts(struct mem_cgroup_zone *mz,
1413 unsigned long *nr_anon, 1413 unsigned long *nr_anon,
1414 unsigned long *nr_file) 1414 unsigned long *nr_file)
1415{ 1415{
1416 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(mz);
1417 struct zone *zone = mz->zone; 1416 struct zone *zone = mz->zone;
1418 unsigned int count[NR_LRU_LISTS] = { 0, }; 1417 unsigned int count[NR_LRU_LISTS] = { 0, };
1419 unsigned long nr_active = 0; 1418 unsigned long nr_active = 0;
@@ -1434,6 +1433,7 @@ update_isolated_counts(struct mem_cgroup_zone *mz,
1434 count[lru] += numpages; 1433 count[lru] += numpages;
1435 } 1434 }
1436 1435
1436 preempt_disable();
1437 __count_vm_events(PGDEACTIVATE, nr_active); 1437 __count_vm_events(PGDEACTIVATE, nr_active);
1438 1438
1439 __mod_zone_page_state(zone, NR_ACTIVE_FILE, 1439 __mod_zone_page_state(zone, NR_ACTIVE_FILE,
@@ -1448,8 +1448,9 @@ update_isolated_counts(struct mem_cgroup_zone *mz,
1448 *nr_anon = count[LRU_ACTIVE_ANON] + count[LRU_INACTIVE_ANON]; 1448 *nr_anon = count[LRU_ACTIVE_ANON] + count[LRU_INACTIVE_ANON];
1449 *nr_file = count[LRU_ACTIVE_FILE] + count[LRU_INACTIVE_FILE]; 1449 *nr_file = count[LRU_ACTIVE_FILE] + count[LRU_INACTIVE_FILE];
1450 1450
1451 reclaim_stat->recent_scanned[0] += *nr_anon; 1451 __mod_zone_page_state(zone, NR_ISOLATED_ANON, *nr_anon);
1452 reclaim_stat->recent_scanned[1] += *nr_file; 1452 __mod_zone_page_state(zone, NR_ISOLATED_FILE, *nr_file);
1453 preempt_enable();
1453} 1454}
1454 1455
1455/* 1456/*
@@ -1509,8 +1510,9 @@ shrink_inactive_list(unsigned long nr_to_scan, struct mem_cgroup_zone *mz,
1509 unsigned long nr_file; 1510 unsigned long nr_file;
1510 unsigned long nr_dirty = 0; 1511 unsigned long nr_dirty = 0;
1511 unsigned long nr_writeback = 0; 1512 unsigned long nr_writeback = 0;
1512 isolate_mode_t reclaim_mode = ISOLATE_INACTIVE; 1513 isolate_mode_t isolate_mode = ISOLATE_INACTIVE;
1513 struct zone *zone = mz->zone; 1514 struct zone *zone = mz->zone;
1515 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(mz);
1514 1516
1515 while (unlikely(too_many_isolated(zone, file, sc))) { 1517 while (unlikely(too_many_isolated(zone, file, sc))) {
1516 congestion_wait(BLK_RW_ASYNC, HZ/10); 1518 congestion_wait(BLK_RW_ASYNC, HZ/10);
@@ -1522,20 +1524,19 @@ shrink_inactive_list(unsigned long nr_to_scan, struct mem_cgroup_zone *mz,
1522 1524
1523 set_reclaim_mode(priority, sc, false); 1525 set_reclaim_mode(priority, sc, false);
1524 if (sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM) 1526 if (sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM)
1525 reclaim_mode |= ISOLATE_ACTIVE; 1527 isolate_mode |= ISOLATE_ACTIVE;
1526 1528
1527 lru_add_drain(); 1529 lru_add_drain();
1528 1530
1529 if (!sc->may_unmap) 1531 if (!sc->may_unmap)
1530 reclaim_mode |= ISOLATE_UNMAPPED; 1532 isolate_mode |= ISOLATE_UNMAPPED;
1531 if (!sc->may_writepage) 1533 if (!sc->may_writepage)
1532 reclaim_mode |= ISOLATE_CLEAN; 1534 isolate_mode |= ISOLATE_CLEAN;
1533 1535
1534 spin_lock_irq(&zone->lru_lock); 1536 spin_lock_irq(&zone->lru_lock);
1535 1537
1536 nr_taken = isolate_lru_pages(nr_to_scan, mz, &page_list, 1538 nr_taken = isolate_lru_pages(nr_to_scan, mz, &page_list, &nr_scanned,
1537 &nr_scanned, sc->order, 1539 sc, isolate_mode, 0, file);
1538 reclaim_mode, 0, file);
1539 if (global_reclaim(sc)) { 1540 if (global_reclaim(sc)) {
1540 zone->pages_scanned += nr_scanned; 1541 zone->pages_scanned += nr_scanned;
1541 if (current_is_kswapd()) 1542 if (current_is_kswapd())
@@ -1545,19 +1546,13 @@ shrink_inactive_list(unsigned long nr_to_scan, struct mem_cgroup_zone *mz,
1545 __count_zone_vm_events(PGSCAN_DIRECT, zone, 1546 __count_zone_vm_events(PGSCAN_DIRECT, zone,
1546 nr_scanned); 1547 nr_scanned);
1547 } 1548 }
1549 spin_unlock_irq(&zone->lru_lock);
1548 1550
1549 if (nr_taken == 0) { 1551 if (nr_taken == 0)
1550 spin_unlock_irq(&zone->lru_lock);
1551 return 0; 1552 return 0;
1552 }
1553 1553
1554 update_isolated_counts(mz, &page_list, &nr_anon, &nr_file); 1554 update_isolated_counts(mz, &page_list, &nr_anon, &nr_file);
1555 1555
1556 __mod_zone_page_state(zone, NR_ISOLATED_ANON, nr_anon);
1557 __mod_zone_page_state(zone, NR_ISOLATED_FILE, nr_file);
1558
1559 spin_unlock_irq(&zone->lru_lock);
1560
1561 nr_reclaimed = shrink_page_list(&page_list, mz, sc, priority, 1556 nr_reclaimed = shrink_page_list(&page_list, mz, sc, priority,
1562 &nr_dirty, &nr_writeback); 1557 &nr_dirty, &nr_writeback);
1563 1558
@@ -1570,6 +1565,9 @@ shrink_inactive_list(unsigned long nr_to_scan, struct mem_cgroup_zone *mz,
1570 1565
1571 spin_lock_irq(&zone->lru_lock); 1566 spin_lock_irq(&zone->lru_lock);
1572 1567
1568 reclaim_stat->recent_scanned[0] += nr_anon;
1569 reclaim_stat->recent_scanned[1] += nr_file;
1570
1573 if (current_is_kswapd()) 1571 if (current_is_kswapd())
1574 __count_vm_events(KSWAPD_STEAL, nr_reclaimed); 1572 __count_vm_events(KSWAPD_STEAL, nr_reclaimed);
1575 __count_zone_vm_events(PGSTEAL, zone, nr_reclaimed); 1573 __count_zone_vm_events(PGSTEAL, zone, nr_reclaimed);
@@ -1643,18 +1641,6 @@ static void move_active_pages_to_lru(struct zone *zone,
1643 unsigned long pgmoved = 0; 1641 unsigned long pgmoved = 0;
1644 struct page *page; 1642 struct page *page;
1645 1643
1646 if (buffer_heads_over_limit) {
1647 spin_unlock_irq(&zone->lru_lock);
1648 list_for_each_entry(page, list, lru) {
1649 if (page_has_private(page) && trylock_page(page)) {
1650 if (page_has_private(page))
1651 try_to_release_page(page, 0);
1652 unlock_page(page);
1653 }
1654 }
1655 spin_lock_irq(&zone->lru_lock);
1656 }
1657
1658 while (!list_empty(list)) { 1644 while (!list_empty(list)) {
1659 struct lruvec *lruvec; 1645 struct lruvec *lruvec;
1660 1646
@@ -1699,21 +1685,22 @@ static void shrink_active_list(unsigned long nr_to_scan,
1699 struct page *page; 1685 struct page *page;
1700 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(mz); 1686 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(mz);
1701 unsigned long nr_rotated = 0; 1687 unsigned long nr_rotated = 0;
1702 isolate_mode_t reclaim_mode = ISOLATE_ACTIVE; 1688 isolate_mode_t isolate_mode = ISOLATE_ACTIVE;
1703 struct zone *zone = mz->zone; 1689 struct zone *zone = mz->zone;
1704 1690
1705 lru_add_drain(); 1691 lru_add_drain();
1706 1692
1693 reset_reclaim_mode(sc);
1694
1707 if (!sc->may_unmap) 1695 if (!sc->may_unmap)
1708 reclaim_mode |= ISOLATE_UNMAPPED; 1696 isolate_mode |= ISOLATE_UNMAPPED;
1709 if (!sc->may_writepage) 1697 if (!sc->may_writepage)
1710 reclaim_mode |= ISOLATE_CLEAN; 1698 isolate_mode |= ISOLATE_CLEAN;
1711 1699
1712 spin_lock_irq(&zone->lru_lock); 1700 spin_lock_irq(&zone->lru_lock);
1713 1701
1714 nr_taken = isolate_lru_pages(nr_to_scan, mz, &l_hold, 1702 nr_taken = isolate_lru_pages(nr_to_scan, mz, &l_hold, &nr_scanned, sc,
1715 &nr_scanned, sc->order, 1703 isolate_mode, 1, file);
1716 reclaim_mode, 1, file);
1717 if (global_reclaim(sc)) 1704 if (global_reclaim(sc))
1718 zone->pages_scanned += nr_scanned; 1705 zone->pages_scanned += nr_scanned;
1719 1706
@@ -1737,6 +1724,14 @@ static void shrink_active_list(unsigned long nr_to_scan,
1737 continue; 1724 continue;
1738 } 1725 }
1739 1726
1727 if (unlikely(buffer_heads_over_limit)) {
1728 if (page_has_private(page) && trylock_page(page)) {
1729 if (page_has_private(page))
1730 try_to_release_page(page, 0);
1731 unlock_page(page);
1732 }
1733 }
1734
1740 if (page_referenced(page, 0, mz->mem_cgroup, &vm_flags)) { 1735 if (page_referenced(page, 0, mz->mem_cgroup, &vm_flags)) {
1741 nr_rotated += hpage_nr_pages(page); 1736 nr_rotated += hpage_nr_pages(page);
1742 /* 1737 /*
@@ -2112,7 +2107,12 @@ restart:
2112 * with multiple processes reclaiming pages, the total 2107 * with multiple processes reclaiming pages, the total
2113 * freeing target can get unreasonably large. 2108 * freeing target can get unreasonably large.
2114 */ 2109 */
2115 if (nr_reclaimed >= nr_to_reclaim && priority < DEF_PRIORITY) 2110 if (nr_reclaimed >= nr_to_reclaim)
2111 nr_to_reclaim = 0;
2112 else
2113 nr_to_reclaim -= nr_reclaimed;
2114
2115 if (!nr_to_reclaim && priority < DEF_PRIORITY)
2116 break; 2116 break;
2117 } 2117 }
2118 blk_finish_plug(&plug); 2118 blk_finish_plug(&plug);
@@ -2195,7 +2195,7 @@ static inline bool compaction_ready(struct zone *zone, struct scan_control *sc)
2195 * If compaction is deferred, reclaim up to a point where 2195 * If compaction is deferred, reclaim up to a point where
2196 * compaction will have a chance of success when re-enabled 2196 * compaction will have a chance of success when re-enabled
2197 */ 2197 */
2198 if (compaction_deferred(zone)) 2198 if (compaction_deferred(zone, sc->order))
2199 return watermark_ok; 2199 return watermark_ok;
2200 2200
2201 /* If compaction is not ready to start, keep reclaiming */ 2201 /* If compaction is not ready to start, keep reclaiming */
@@ -2235,6 +2235,14 @@ static bool shrink_zones(int priority, struct zonelist *zonelist,
2235 unsigned long nr_soft_scanned; 2235 unsigned long nr_soft_scanned;
2236 bool aborted_reclaim = false; 2236 bool aborted_reclaim = false;
2237 2237
2238 /*
2239 * If the number of buffer_heads in the machine exceeds the maximum
2240 * allowed level, force direct reclaim to scan the highmem zone as
2241 * highmem pages could be pinning lowmem pages storing buffer_heads
2242 */
2243 if (buffer_heads_over_limit)
2244 sc->gfp_mask |= __GFP_HIGHMEM;
2245
2238 for_each_zone_zonelist_nodemask(zone, z, zonelist, 2246 for_each_zone_zonelist_nodemask(zone, z, zonelist,
2239 gfp_zone(sc->gfp_mask), sc->nodemask) { 2247 gfp_zone(sc->gfp_mask), sc->nodemask) {
2240 if (!populated_zone(zone)) 2248 if (!populated_zone(zone))
@@ -2255,8 +2263,8 @@ static bool shrink_zones(int priority, struct zonelist *zonelist,
2255 * Even though compaction is invoked for any 2263 * Even though compaction is invoked for any
2256 * non-zero order, only frequent costly order 2264 * non-zero order, only frequent costly order
2257 * reclamation is disruptive enough to become a 2265 * reclamation is disruptive enough to become a
2258 * noticable problem, like transparent huge page 2266 * noticeable problem, like transparent huge
2259 * allocations. 2267 * page allocations.
2260 */ 2268 */
2261 if (compaction_ready(zone, sc)) { 2269 if (compaction_ready(zone, sc)) {
2262 aborted_reclaim = true; 2270 aborted_reclaim = true;
@@ -2337,7 +2345,6 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
2337 unsigned long writeback_threshold; 2345 unsigned long writeback_threshold;
2338 bool aborted_reclaim; 2346 bool aborted_reclaim;
2339 2347
2340 get_mems_allowed();
2341 delayacct_freepages_start(); 2348 delayacct_freepages_start();
2342 2349
2343 if (global_reclaim(sc)) 2350 if (global_reclaim(sc))
@@ -2401,7 +2408,6 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
2401 2408
2402out: 2409out:
2403 delayacct_freepages_end(); 2410 delayacct_freepages_end();
2404 put_mems_allowed();
2405 2411
2406 if (sc->nr_reclaimed) 2412 if (sc->nr_reclaimed)
2407 return sc->nr_reclaimed; 2413 return sc->nr_reclaimed;
@@ -2724,6 +2730,17 @@ loop_again:
2724 */ 2730 */
2725 age_active_anon(zone, &sc, priority); 2731 age_active_anon(zone, &sc, priority);
2726 2732
2733 /*
2734 * If the number of buffer_heads in the machine
2735 * exceeds the maximum allowed level and this node
2736 * has a highmem zone, force kswapd to reclaim from
2737 * it to relieve lowmem pressure.
2738 */
2739 if (buffer_heads_over_limit && is_highmem_idx(i)) {
2740 end_zone = i;
2741 break;
2742 }
2743
2727 if (!zone_watermark_ok_safe(zone, order, 2744 if (!zone_watermark_ok_safe(zone, order,
2728 high_wmark_pages(zone), 0, 0)) { 2745 high_wmark_pages(zone), 0, 0)) {
2729 end_zone = i; 2746 end_zone = i;
@@ -2753,7 +2770,7 @@ loop_again:
2753 */ 2770 */
2754 for (i = 0; i <= end_zone; i++) { 2771 for (i = 0; i <= end_zone; i++) {
2755 struct zone *zone = pgdat->node_zones + i; 2772 struct zone *zone = pgdat->node_zones + i;
2756 int nr_slab; 2773 int nr_slab, testorder;
2757 unsigned long balance_gap; 2774 unsigned long balance_gap;
2758 2775
2759 if (!populated_zone(zone)) 2776 if (!populated_zone(zone))
@@ -2786,7 +2803,21 @@ loop_again:
2786 (zone->present_pages + 2803 (zone->present_pages +
2787 KSWAPD_ZONE_BALANCE_GAP_RATIO-1) / 2804 KSWAPD_ZONE_BALANCE_GAP_RATIO-1) /
2788 KSWAPD_ZONE_BALANCE_GAP_RATIO); 2805 KSWAPD_ZONE_BALANCE_GAP_RATIO);
2789 if (!zone_watermark_ok_safe(zone, order, 2806 /*
2807 * Kswapd reclaims only single pages with compaction
2808 * enabled. Trying too hard to reclaim until contiguous
2809 * free pages have become available can hurt performance
2810 * by evicting too much useful data from memory.
2811 * Do not reclaim more than needed for compaction.
2812 */
2813 testorder = order;
2814 if (COMPACTION_BUILD && order &&
2815 compaction_suitable(zone, order) !=
2816 COMPACT_SKIPPED)
2817 testorder = 0;
2818
2819 if ((buffer_heads_over_limit && is_highmem_idx(i)) ||
2820 !zone_watermark_ok_safe(zone, testorder,
2790 high_wmark_pages(zone) + balance_gap, 2821 high_wmark_pages(zone) + balance_gap,
2791 end_zone, 0)) { 2822 end_zone, 0)) {
2792 shrink_zone(priority, zone, &sc); 2823 shrink_zone(priority, zone, &sc);
@@ -2815,7 +2846,7 @@ loop_again:
2815 continue; 2846 continue;
2816 } 2847 }
2817 2848
2818 if (!zone_watermark_ok_safe(zone, order, 2849 if (!zone_watermark_ok_safe(zone, testorder,
2819 high_wmark_pages(zone), end_zone, 0)) { 2850 high_wmark_pages(zone), end_zone, 0)) {
2820 all_zones_ok = 0; 2851 all_zones_ok = 0;
2821 /* 2852 /*
@@ -2903,6 +2934,8 @@ out:
2903 * and it is potentially going to sleep here. 2934 * and it is potentially going to sleep here.
2904 */ 2935 */
2905 if (order) { 2936 if (order) {
2937 int zones_need_compaction = 1;
2938
2906 for (i = 0; i <= end_zone; i++) { 2939 for (i = 0; i <= end_zone; i++) {
2907 struct zone *zone = pgdat->node_zones + i; 2940 struct zone *zone = pgdat->node_zones + i;
2908 2941
@@ -2912,6 +2945,11 @@ out:
2912 if (zone->all_unreclaimable && priority != DEF_PRIORITY) 2945 if (zone->all_unreclaimable && priority != DEF_PRIORITY)
2913 continue; 2946 continue;
2914 2947
2948 /* Would compaction fail due to lack of free memory? */
2949 if (COMPACTION_BUILD &&
2950 compaction_suitable(zone, order) == COMPACT_SKIPPED)
2951 goto loop_again;
2952
2915 /* Confirm the zone is balanced for order-0 */ 2953 /* Confirm the zone is balanced for order-0 */
2916 if (!zone_watermark_ok(zone, 0, 2954 if (!zone_watermark_ok(zone, 0,
2917 high_wmark_pages(zone), 0, 0)) { 2955 high_wmark_pages(zone), 0, 0)) {
@@ -2919,11 +2957,17 @@ out:
2919 goto loop_again; 2957 goto loop_again;
2920 } 2958 }
2921 2959
2960 /* Check if the memory needs to be defragmented. */
2961 if (zone_watermark_ok(zone, order,
2962 low_wmark_pages(zone), *classzone_idx, 0))
2963 zones_need_compaction = 0;
2964
2922 /* If balanced, clear the congested flag */ 2965 /* If balanced, clear the congested flag */
2923 zone_clear_flag(zone, ZONE_CONGESTED); 2966 zone_clear_flag(zone, ZONE_CONGESTED);
2924 if (i <= *classzone_idx)
2925 balanced += zone->present_pages;
2926 } 2967 }
2968
2969 if (zones_need_compaction)
2970 compact_pgdat(pgdat, order);
2927 } 2971 }
2928 2972
2929 /* 2973 /*