aboutsummaryrefslogtreecommitdiffstats
path: root/mm
diff options
context:
space:
mode:
authorIngo Molnar <mingo@kernel.org>2012-04-14 07:18:27 -0400
committerIngo Molnar <mingo@kernel.org>2012-04-14 07:19:04 -0400
commit6ac1ef482d7ae0c690f1640bf6eb818ff9a2d91e (patch)
tree021cc9f6b477146fcebe6f3be4752abfa2ba18a9 /mm
parent682968e0c425c60f0dde37977e5beb2b12ddc4cc (diff)
parenta385ec4f11bdcf81af094c03e2444ee9b7fad2e5 (diff)
Merge branch 'perf/core' into perf/uprobes
Merge in latest upstream (and the latest perf development tree), to prepare for tooling changes, and also to pick up v3.4 MM changes that the uprobes code needs to take care of. Signed-off-by: Ingo Molnar <mingo@kernel.org>
Diffstat (limited to 'mm')
-rw-r--r--mm/bootmem.c5
-rw-r--r--mm/bounce.c4
-rw-r--r--mm/cleancache.c98
-rw-r--r--mm/compaction.c77
-rw-r--r--mm/filemap.c116
-rw-r--r--mm/huge_memory.c129
-rw-r--r--mm/hugetlb.c211
-rw-r--r--mm/hwpoison-inject.c4
-rw-r--r--mm/ksm.c57
-rw-r--r--mm/madvise.c10
-rw-r--r--mm/memcontrol.c678
-rw-r--r--mm/memory-failure.c98
-rw-r--r--mm/memory.c198
-rw-r--r--mm/mempolicy.c65
-rw-r--r--mm/migrate.c38
-rw-r--r--mm/mincore.c2
-rw-r--r--mm/mlock.c3
-rw-r--r--mm/mmap.c90
-rw-r--r--mm/mmu_context.c2
-rw-r--r--mm/mprotect.c7
-rw-r--r--mm/mremap.c2
-rw-r--r--mm/oom_kill.c166
-rw-r--r--mm/page-writeback.c3
-rw-r--r--mm/page_alloc.c102
-rw-r--r--mm/page_cgroup.c4
-rw-r--r--mm/pagewalk.c2
-rw-r--r--mm/percpu-vm.c3
-rw-r--r--mm/pgtable-generic.c5
-rw-r--r--mm/rmap.c70
-rw-r--r--mm/shmem.c106
-rw-r--r--mm/slab.c69
-rw-r--r--mm/slub.c76
-rw-r--r--mm/sparse.c30
-rw-r--r--mm/swap.c12
-rw-r--r--mm/swap_state.c34
-rw-r--r--mm/swapfile.c95
-rw-r--r--mm/truncate.c52
-rw-r--r--mm/util.c41
-rw-r--r--mm/vmalloc.c8
-rw-r--r--mm/vmscan.c152
40 files changed, 1727 insertions, 1197 deletions
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 668e94df8cf2..0131170c9d54 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -766,14 +766,13 @@ void * __init alloc_bootmem_section(unsigned long size,
766 unsigned long section_nr) 766 unsigned long section_nr)
767{ 767{
768 bootmem_data_t *bdata; 768 bootmem_data_t *bdata;
769 unsigned long pfn, goal, limit; 769 unsigned long pfn, goal;
770 770
771 pfn = section_nr_to_pfn(section_nr); 771 pfn = section_nr_to_pfn(section_nr);
772 goal = pfn << PAGE_SHIFT; 772 goal = pfn << PAGE_SHIFT;
773 limit = section_nr_to_pfn(section_nr + 1) << PAGE_SHIFT;
774 bdata = &bootmem_node_data[early_pfn_to_nid(pfn)]; 773 bdata = &bootmem_node_data[early_pfn_to_nid(pfn)];
775 774
776 return alloc_bootmem_core(bdata, size, SMP_CACHE_BYTES, goal, limit); 775 return alloc_bootmem_core(bdata, size, SMP_CACHE_BYTES, goal, 0);
777} 776}
778#endif 777#endif
779 778
diff --git a/mm/bounce.c b/mm/bounce.c
index 4e9ae722af83..d1be02ca1889 100644
--- a/mm/bounce.c
+++ b/mm/bounce.c
@@ -50,9 +50,9 @@ static void bounce_copy_vec(struct bio_vec *to, unsigned char *vfrom)
50 unsigned char *vto; 50 unsigned char *vto;
51 51
52 local_irq_save(flags); 52 local_irq_save(flags);
53 vto = kmap_atomic(to->bv_page, KM_BOUNCE_READ); 53 vto = kmap_atomic(to->bv_page);
54 memcpy(vto + to->bv_offset, vfrom, to->bv_len); 54 memcpy(vto + to->bv_offset, vfrom, to->bv_len);
55 kunmap_atomic(vto, KM_BOUNCE_READ); 55 kunmap_atomic(vto);
56 local_irq_restore(flags); 56 local_irq_restore(flags);
57} 57}
58 58
diff --git a/mm/cleancache.c b/mm/cleancache.c
index bcaae4c2a770..5646c740f613 100644
--- a/mm/cleancache.c
+++ b/mm/cleancache.c
@@ -15,29 +15,34 @@
15#include <linux/fs.h> 15#include <linux/fs.h>
16#include <linux/exportfs.h> 16#include <linux/exportfs.h>
17#include <linux/mm.h> 17#include <linux/mm.h>
18#include <linux/debugfs.h>
18#include <linux/cleancache.h> 19#include <linux/cleancache.h>
19 20
20/* 21/*
21 * This global enablement flag may be read thousands of times per second 22 * This global enablement flag may be read thousands of times per second
22 * by cleancache_get/put/flush even on systems where cleancache_ops 23 * by cleancache_get/put/invalidate even on systems where cleancache_ops
23 * is not claimed (e.g. cleancache is config'ed on but remains 24 * is not claimed (e.g. cleancache is config'ed on but remains
24 * disabled), so is preferred to the slower alternative: a function 25 * disabled), so is preferred to the slower alternative: a function
25 * call that checks a non-global. 26 * call that checks a non-global.
26 */ 27 */
27int cleancache_enabled; 28int cleancache_enabled __read_mostly;
28EXPORT_SYMBOL(cleancache_enabled); 29EXPORT_SYMBOL(cleancache_enabled);
29 30
30/* 31/*
31 * cleancache_ops is set by cleancache_ops_register to contain the pointers 32 * cleancache_ops is set by cleancache_ops_register to contain the pointers
32 * to the cleancache "backend" implementation functions. 33 * to the cleancache "backend" implementation functions.
33 */ 34 */
34static struct cleancache_ops cleancache_ops; 35static struct cleancache_ops cleancache_ops __read_mostly;
35 36
36/* useful stats available in /sys/kernel/mm/cleancache */ 37/*
37static unsigned long cleancache_succ_gets; 38 * Counters available via /sys/kernel/debug/frontswap (if debugfs is
38static unsigned long cleancache_failed_gets; 39 * properly configured. These are for information only so are not protected
39static unsigned long cleancache_puts; 40 * against increment races.
40static unsigned long cleancache_flushes; 41 */
42static u64 cleancache_succ_gets;
43static u64 cleancache_failed_gets;
44static u64 cleancache_puts;
45static u64 cleancache_invalidates;
41 46
42/* 47/*
43 * register operations for cleancache, returning previous thus allowing 48 * register operations for cleancache, returning previous thus allowing
@@ -148,10 +153,11 @@ void __cleancache_put_page(struct page *page)
148EXPORT_SYMBOL(__cleancache_put_page); 153EXPORT_SYMBOL(__cleancache_put_page);
149 154
150/* 155/*
151 * Flush any data from cleancache associated with the poolid and the 156 * Invalidate any data from cleancache associated with the poolid and the
152 * page's inode and page index so that a subsequent "get" will fail. 157 * page's inode and page index so that a subsequent "get" will fail.
153 */ 158 */
154void __cleancache_flush_page(struct address_space *mapping, struct page *page) 159void __cleancache_invalidate_page(struct address_space *mapping,
160 struct page *page)
155{ 161{
156 /* careful... page->mapping is NULL sometimes when this is called */ 162 /* careful... page->mapping is NULL sometimes when this is called */
157 int pool_id = mapping->host->i_sb->cleancache_poolid; 163 int pool_id = mapping->host->i_sb->cleancache_poolid;
@@ -160,85 +166,57 @@ void __cleancache_flush_page(struct address_space *mapping, struct page *page)
160 if (pool_id >= 0) { 166 if (pool_id >= 0) {
161 VM_BUG_ON(!PageLocked(page)); 167 VM_BUG_ON(!PageLocked(page));
162 if (cleancache_get_key(mapping->host, &key) >= 0) { 168 if (cleancache_get_key(mapping->host, &key) >= 0) {
163 (*cleancache_ops.flush_page)(pool_id, key, page->index); 169 (*cleancache_ops.invalidate_page)(pool_id,
164 cleancache_flushes++; 170 key, page->index);
171 cleancache_invalidates++;
165 } 172 }
166 } 173 }
167} 174}
168EXPORT_SYMBOL(__cleancache_flush_page); 175EXPORT_SYMBOL(__cleancache_invalidate_page);
169 176
170/* 177/*
171 * Flush all data from cleancache associated with the poolid and the 178 * Invalidate all data from cleancache associated with the poolid and the
172 * mappings's inode so that all subsequent gets to this poolid/inode 179 * mappings's inode so that all subsequent gets to this poolid/inode
173 * will fail. 180 * will fail.
174 */ 181 */
175void __cleancache_flush_inode(struct address_space *mapping) 182void __cleancache_invalidate_inode(struct address_space *mapping)
176{ 183{
177 int pool_id = mapping->host->i_sb->cleancache_poolid; 184 int pool_id = mapping->host->i_sb->cleancache_poolid;
178 struct cleancache_filekey key = { .u.key = { 0 } }; 185 struct cleancache_filekey key = { .u.key = { 0 } };
179 186
180 if (pool_id >= 0 && cleancache_get_key(mapping->host, &key) >= 0) 187 if (pool_id >= 0 && cleancache_get_key(mapping->host, &key) >= 0)
181 (*cleancache_ops.flush_inode)(pool_id, key); 188 (*cleancache_ops.invalidate_inode)(pool_id, key);
182} 189}
183EXPORT_SYMBOL(__cleancache_flush_inode); 190EXPORT_SYMBOL(__cleancache_invalidate_inode);
184 191
185/* 192/*
186 * Called by any cleancache-enabled filesystem at time of unmount; 193 * Called by any cleancache-enabled filesystem at time of unmount;
187 * note that pool_id is surrendered and may be reutrned by a subsequent 194 * note that pool_id is surrendered and may be reutrned by a subsequent
188 * cleancache_init_fs or cleancache_init_shared_fs 195 * cleancache_init_fs or cleancache_init_shared_fs
189 */ 196 */
190void __cleancache_flush_fs(struct super_block *sb) 197void __cleancache_invalidate_fs(struct super_block *sb)
191{ 198{
192 if (sb->cleancache_poolid >= 0) { 199 if (sb->cleancache_poolid >= 0) {
193 int old_poolid = sb->cleancache_poolid; 200 int old_poolid = sb->cleancache_poolid;
194 sb->cleancache_poolid = -1; 201 sb->cleancache_poolid = -1;
195 (*cleancache_ops.flush_fs)(old_poolid); 202 (*cleancache_ops.invalidate_fs)(old_poolid);
196 } 203 }
197} 204}
198EXPORT_SYMBOL(__cleancache_flush_fs); 205EXPORT_SYMBOL(__cleancache_invalidate_fs);
199
200#ifdef CONFIG_SYSFS
201
202/* see Documentation/ABI/xxx/sysfs-kernel-mm-cleancache */
203
204#define CLEANCACHE_SYSFS_RO(_name) \
205 static ssize_t cleancache_##_name##_show(struct kobject *kobj, \
206 struct kobj_attribute *attr, char *buf) \
207 { \
208 return sprintf(buf, "%lu\n", cleancache_##_name); \
209 } \
210 static struct kobj_attribute cleancache_##_name##_attr = { \
211 .attr = { .name = __stringify(_name), .mode = 0444 }, \
212 .show = cleancache_##_name##_show, \
213 }
214
215CLEANCACHE_SYSFS_RO(succ_gets);
216CLEANCACHE_SYSFS_RO(failed_gets);
217CLEANCACHE_SYSFS_RO(puts);
218CLEANCACHE_SYSFS_RO(flushes);
219
220static struct attribute *cleancache_attrs[] = {
221 &cleancache_succ_gets_attr.attr,
222 &cleancache_failed_gets_attr.attr,
223 &cleancache_puts_attr.attr,
224 &cleancache_flushes_attr.attr,
225 NULL,
226};
227
228static struct attribute_group cleancache_attr_group = {
229 .attrs = cleancache_attrs,
230 .name = "cleancache",
231};
232
233#endif /* CONFIG_SYSFS */
234 206
235static int __init init_cleancache(void) 207static int __init init_cleancache(void)
236{ 208{
237#ifdef CONFIG_SYSFS 209#ifdef CONFIG_DEBUG_FS
238 int err; 210 struct dentry *root = debugfs_create_dir("cleancache", NULL);
239 211 if (root == NULL)
240 err = sysfs_create_group(mm_kobj, &cleancache_attr_group); 212 return -ENXIO;
241#endif /* CONFIG_SYSFS */ 213 debugfs_create_u64("succ_gets", S_IRUGO, root, &cleancache_succ_gets);
214 debugfs_create_u64("failed_gets", S_IRUGO,
215 root, &cleancache_failed_gets);
216 debugfs_create_u64("puts", S_IRUGO, root, &cleancache_puts);
217 debugfs_create_u64("invalidates", S_IRUGO,
218 root, &cleancache_invalidates);
219#endif
242 return 0; 220 return 0;
243} 221}
244module_init(init_cleancache) 222module_init(init_cleancache)
diff --git a/mm/compaction.c b/mm/compaction.c
index d9ebebe1a2aa..74a8c825ff28 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -35,7 +35,7 @@ struct compact_control {
35 unsigned long migrate_pfn; /* isolate_migratepages search base */ 35 unsigned long migrate_pfn; /* isolate_migratepages search base */
36 bool sync; /* Synchronous migration */ 36 bool sync; /* Synchronous migration */
37 37
38 unsigned int order; /* order a direct compactor needs */ 38 int order; /* order a direct compactor needs */
39 int migratetype; /* MOVABLE, RECLAIMABLE etc */ 39 int migratetype; /* MOVABLE, RECLAIMABLE etc */
40 struct zone *zone; 40 struct zone *zone;
41}; 41};
@@ -675,49 +675,71 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist,
675 675
676 676
677/* Compact all zones within a node */ 677/* Compact all zones within a node */
678static int compact_node(int nid) 678static int __compact_pgdat(pg_data_t *pgdat, struct compact_control *cc)
679{ 679{
680 int zoneid; 680 int zoneid;
681 pg_data_t *pgdat;
682 struct zone *zone; 681 struct zone *zone;
683 682
684 if (nid < 0 || nid >= nr_node_ids || !node_online(nid))
685 return -EINVAL;
686 pgdat = NODE_DATA(nid);
687
688 /* Flush pending updates to the LRU lists */
689 lru_add_drain_all();
690
691 for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) { 683 for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) {
692 struct compact_control cc = {
693 .nr_freepages = 0,
694 .nr_migratepages = 0,
695 .order = -1,
696 .sync = true,
697 };
698 684
699 zone = &pgdat->node_zones[zoneid]; 685 zone = &pgdat->node_zones[zoneid];
700 if (!populated_zone(zone)) 686 if (!populated_zone(zone))
701 continue; 687 continue;
702 688
703 cc.zone = zone; 689 cc->nr_freepages = 0;
704 INIT_LIST_HEAD(&cc.freepages); 690 cc->nr_migratepages = 0;
705 INIT_LIST_HEAD(&cc.migratepages); 691 cc->zone = zone;
706 692 INIT_LIST_HEAD(&cc->freepages);
707 compact_zone(zone, &cc); 693 INIT_LIST_HEAD(&cc->migratepages);
694
695 if (cc->order == -1 || !compaction_deferred(zone, cc->order))
696 compact_zone(zone, cc);
697
698 if (cc->order > 0) {
699 int ok = zone_watermark_ok(zone, cc->order,
700 low_wmark_pages(zone), 0, 0);
701 if (ok && cc->order > zone->compact_order_failed)
702 zone->compact_order_failed = cc->order + 1;
703 /* Currently async compaction is never deferred. */
704 else if (!ok && cc->sync)
705 defer_compaction(zone, cc->order);
706 }
708 707
709 VM_BUG_ON(!list_empty(&cc.freepages)); 708 VM_BUG_ON(!list_empty(&cc->freepages));
710 VM_BUG_ON(!list_empty(&cc.migratepages)); 709 VM_BUG_ON(!list_empty(&cc->migratepages));
711 } 710 }
712 711
713 return 0; 712 return 0;
714} 713}
715 714
715int compact_pgdat(pg_data_t *pgdat, int order)
716{
717 struct compact_control cc = {
718 .order = order,
719 .sync = false,
720 };
721
722 return __compact_pgdat(pgdat, &cc);
723}
724
725static int compact_node(int nid)
726{
727 struct compact_control cc = {
728 .order = -1,
729 .sync = true,
730 };
731
732 return __compact_pgdat(NODE_DATA(nid), &cc);
733}
734
716/* Compact all nodes in the system */ 735/* Compact all nodes in the system */
717static int compact_nodes(void) 736static int compact_nodes(void)
718{ 737{
719 int nid; 738 int nid;
720 739
740 /* Flush pending updates to the LRU lists */
741 lru_add_drain_all();
742
721 for_each_online_node(nid) 743 for_each_online_node(nid)
722 compact_node(nid); 744 compact_node(nid);
723 745
@@ -750,7 +772,14 @@ ssize_t sysfs_compact_node(struct device *dev,
750 struct device_attribute *attr, 772 struct device_attribute *attr,
751 const char *buf, size_t count) 773 const char *buf, size_t count)
752{ 774{
753 compact_node(dev->id); 775 int nid = dev->id;
776
777 if (nid >= 0 && nid < nr_node_ids && node_online(nid)) {
778 /* Flush pending updates to the LRU lists */
779 lru_add_drain_all();
780
781 compact_node(nid);
782 }
754 783
755 return count; 784 return count;
756} 785}
diff --git a/mm/filemap.c b/mm/filemap.c
index b66275757c28..79c4b2b0b14e 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -101,9 +101,8 @@
101 * ->inode->i_lock (zap_pte_range->set_page_dirty) 101 * ->inode->i_lock (zap_pte_range->set_page_dirty)
102 * ->private_lock (zap_pte_range->__set_page_dirty_buffers) 102 * ->private_lock (zap_pte_range->__set_page_dirty_buffers)
103 * 103 *
104 * (code doesn't rely on that order, so you could switch it around) 104 * ->i_mmap_mutex
105 * ->tasklist_lock (memory_failure, collect_procs_ao) 105 * ->tasklist_lock (memory_failure, collect_procs_ao)
106 * ->i_mmap_mutex
107 */ 106 */
108 107
109/* 108/*
@@ -123,7 +122,7 @@ void __delete_from_page_cache(struct page *page)
123 if (PageUptodate(page) && PageMappedToDisk(page)) 122 if (PageUptodate(page) && PageMappedToDisk(page))
124 cleancache_put_page(page); 123 cleancache_put_page(page);
125 else 124 else
126 cleancache_flush_page(mapping, page); 125 cleancache_invalidate_page(mapping, page);
127 126
128 radix_tree_delete(&mapping->page_tree, page->index); 127 radix_tree_delete(&mapping->page_tree, page->index);
129 page->mapping = NULL; 128 page->mapping = NULL;
@@ -500,10 +499,13 @@ struct page *__page_cache_alloc(gfp_t gfp)
500 struct page *page; 499 struct page *page;
501 500
502 if (cpuset_do_page_mem_spread()) { 501 if (cpuset_do_page_mem_spread()) {
503 get_mems_allowed(); 502 unsigned int cpuset_mems_cookie;
504 n = cpuset_mem_spread_node(); 503 do {
505 page = alloc_pages_exact_node(n, gfp, 0); 504 cpuset_mems_cookie = get_mems_allowed();
506 put_mems_allowed(); 505 n = cpuset_mem_spread_node();
506 page = alloc_pages_exact_node(n, gfp, 0);
507 } while (!put_mems_allowed(cpuset_mems_cookie) && !page);
508
507 return page; 509 return page;
508 } 510 }
509 return alloc_pages(gfp, 0); 511 return alloc_pages(gfp, 0);
@@ -811,20 +813,19 @@ EXPORT_SYMBOL(find_or_create_page);
811unsigned find_get_pages(struct address_space *mapping, pgoff_t start, 813unsigned find_get_pages(struct address_space *mapping, pgoff_t start,
812 unsigned int nr_pages, struct page **pages) 814 unsigned int nr_pages, struct page **pages)
813{ 815{
814 unsigned int i; 816 struct radix_tree_iter iter;
815 unsigned int ret; 817 void **slot;
816 unsigned int nr_found, nr_skip; 818 unsigned ret = 0;
819
820 if (unlikely(!nr_pages))
821 return 0;
817 822
818 rcu_read_lock(); 823 rcu_read_lock();
819restart: 824restart:
820 nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree, 825 radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) {
821 (void ***)pages, NULL, start, nr_pages);
822 ret = 0;
823 nr_skip = 0;
824 for (i = 0; i < nr_found; i++) {
825 struct page *page; 826 struct page *page;
826repeat: 827repeat:
827 page = radix_tree_deref_slot((void **)pages[i]); 828 page = radix_tree_deref_slot(slot);
828 if (unlikely(!page)) 829 if (unlikely(!page))
829 continue; 830 continue;
830 831
@@ -835,7 +836,7 @@ repeat:
835 * when entry at index 0 moves out of or back 836 * when entry at index 0 moves out of or back
836 * to root: none yet gotten, safe to restart. 837 * to root: none yet gotten, safe to restart.
837 */ 838 */
838 WARN_ON(start | i); 839 WARN_ON(iter.index);
839 goto restart; 840 goto restart;
840 } 841 }
841 /* 842 /*
@@ -843,7 +844,6 @@ repeat:
843 * here as an exceptional entry: so skip over it - 844 * here as an exceptional entry: so skip over it -
844 * we only reach this from invalidate_mapping_pages(). 845 * we only reach this from invalidate_mapping_pages().
845 */ 846 */
846 nr_skip++;
847 continue; 847 continue;
848 } 848 }
849 849
@@ -851,21 +851,16 @@ repeat:
851 goto repeat; 851 goto repeat;
852 852
853 /* Has the page moved? */ 853 /* Has the page moved? */
854 if (unlikely(page != *((void **)pages[i]))) { 854 if (unlikely(page != *slot)) {
855 page_cache_release(page); 855 page_cache_release(page);
856 goto repeat; 856 goto repeat;
857 } 857 }
858 858
859 pages[ret] = page; 859 pages[ret] = page;
860 ret++; 860 if (++ret == nr_pages)
861 break;
861 } 862 }
862 863
863 /*
864 * If all entries were removed before we could secure them,
865 * try again, because callers stop trying once 0 is returned.
866 */
867 if (unlikely(!ret && nr_found > nr_skip))
868 goto restart;
869 rcu_read_unlock(); 864 rcu_read_unlock();
870 return ret; 865 return ret;
871} 866}
@@ -885,21 +880,22 @@ repeat:
885unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index, 880unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index,
886 unsigned int nr_pages, struct page **pages) 881 unsigned int nr_pages, struct page **pages)
887{ 882{
888 unsigned int i; 883 struct radix_tree_iter iter;
889 unsigned int ret; 884 void **slot;
890 unsigned int nr_found; 885 unsigned int ret = 0;
886
887 if (unlikely(!nr_pages))
888 return 0;
891 889
892 rcu_read_lock(); 890 rcu_read_lock();
893restart: 891restart:
894 nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree, 892 radix_tree_for_each_contig(slot, &mapping->page_tree, &iter, index) {
895 (void ***)pages, NULL, index, nr_pages);
896 ret = 0;
897 for (i = 0; i < nr_found; i++) {
898 struct page *page; 893 struct page *page;
899repeat: 894repeat:
900 page = radix_tree_deref_slot((void **)pages[i]); 895 page = radix_tree_deref_slot(slot);
896 /* The hole, there no reason to continue */
901 if (unlikely(!page)) 897 if (unlikely(!page))
902 continue; 898 break;
903 899
904 if (radix_tree_exception(page)) { 900 if (radix_tree_exception(page)) {
905 if (radix_tree_deref_retry(page)) { 901 if (radix_tree_deref_retry(page)) {
@@ -922,7 +918,7 @@ repeat:
922 goto repeat; 918 goto repeat;
923 919
924 /* Has the page moved? */ 920 /* Has the page moved? */
925 if (unlikely(page != *((void **)pages[i]))) { 921 if (unlikely(page != *slot)) {
926 page_cache_release(page); 922 page_cache_release(page);
927 goto repeat; 923 goto repeat;
928 } 924 }
@@ -932,14 +928,14 @@ repeat:
932 * otherwise we can get both false positives and false 928 * otherwise we can get both false positives and false
933 * negatives, which is just confusing to the caller. 929 * negatives, which is just confusing to the caller.
934 */ 930 */
935 if (page->mapping == NULL || page->index != index) { 931 if (page->mapping == NULL || page->index != iter.index) {
936 page_cache_release(page); 932 page_cache_release(page);
937 break; 933 break;
938 } 934 }
939 935
940 pages[ret] = page; 936 pages[ret] = page;
941 ret++; 937 if (++ret == nr_pages)
942 index++; 938 break;
943 } 939 }
944 rcu_read_unlock(); 940 rcu_read_unlock();
945 return ret; 941 return ret;
@@ -960,19 +956,20 @@ EXPORT_SYMBOL(find_get_pages_contig);
960unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index, 956unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index,
961 int tag, unsigned int nr_pages, struct page **pages) 957 int tag, unsigned int nr_pages, struct page **pages)
962{ 958{
963 unsigned int i; 959 struct radix_tree_iter iter;
964 unsigned int ret; 960 void **slot;
965 unsigned int nr_found; 961 unsigned ret = 0;
962
963 if (unlikely(!nr_pages))
964 return 0;
966 965
967 rcu_read_lock(); 966 rcu_read_lock();
968restart: 967restart:
969 nr_found = radix_tree_gang_lookup_tag_slot(&mapping->page_tree, 968 radix_tree_for_each_tagged(slot, &mapping->page_tree,
970 (void ***)pages, *index, nr_pages, tag); 969 &iter, *index, tag) {
971 ret = 0;
972 for (i = 0; i < nr_found; i++) {
973 struct page *page; 970 struct page *page;
974repeat: 971repeat:
975 page = radix_tree_deref_slot((void **)pages[i]); 972 page = radix_tree_deref_slot(slot);
976 if (unlikely(!page)) 973 if (unlikely(!page))
977 continue; 974 continue;
978 975
@@ -996,21 +993,16 @@ repeat:
996 goto repeat; 993 goto repeat;
997 994
998 /* Has the page moved? */ 995 /* Has the page moved? */
999 if (unlikely(page != *((void **)pages[i]))) { 996 if (unlikely(page != *slot)) {
1000 page_cache_release(page); 997 page_cache_release(page);
1001 goto repeat; 998 goto repeat;
1002 } 999 }
1003 1000
1004 pages[ret] = page; 1001 pages[ret] = page;
1005 ret++; 1002 if (++ret == nr_pages)
1003 break;
1006 } 1004 }
1007 1005
1008 /*
1009 * If all entries were removed before we could secure them,
1010 * try again, because callers stop trying once 0 is returned.
1011 */
1012 if (unlikely(!ret && nr_found))
1013 goto restart;
1014 rcu_read_unlock(); 1006 rcu_read_unlock();
1015 1007
1016 if (ret) 1008 if (ret)
@@ -1318,10 +1310,10 @@ int file_read_actor(read_descriptor_t *desc, struct page *page,
1318 * taking the kmap. 1310 * taking the kmap.
1319 */ 1311 */
1320 if (!fault_in_pages_writeable(desc->arg.buf, size)) { 1312 if (!fault_in_pages_writeable(desc->arg.buf, size)) {
1321 kaddr = kmap_atomic(page, KM_USER0); 1313 kaddr = kmap_atomic(page);
1322 left = __copy_to_user_inatomic(desc->arg.buf, 1314 left = __copy_to_user_inatomic(desc->arg.buf,
1323 kaddr + offset, size); 1315 kaddr + offset, size);
1324 kunmap_atomic(kaddr, KM_USER0); 1316 kunmap_atomic(kaddr);
1325 if (left == 0) 1317 if (left == 0)
1326 goto success; 1318 goto success;
1327 } 1319 }
@@ -2045,7 +2037,7 @@ size_t iov_iter_copy_from_user_atomic(struct page *page,
2045 size_t copied; 2037 size_t copied;
2046 2038
2047 BUG_ON(!in_atomic()); 2039 BUG_ON(!in_atomic());
2048 kaddr = kmap_atomic(page, KM_USER0); 2040 kaddr = kmap_atomic(page);
2049 if (likely(i->nr_segs == 1)) { 2041 if (likely(i->nr_segs == 1)) {
2050 int left; 2042 int left;
2051 char __user *buf = i->iov->iov_base + i->iov_offset; 2043 char __user *buf = i->iov->iov_base + i->iov_offset;
@@ -2055,7 +2047,7 @@ size_t iov_iter_copy_from_user_atomic(struct page *page,
2055 copied = __iovec_copy_from_user_inatomic(kaddr + offset, 2047 copied = __iovec_copy_from_user_inatomic(kaddr + offset,
2056 i->iov, i->iov_offset, bytes); 2048 i->iov, i->iov_offset, bytes);
2057 } 2049 }
2058 kunmap_atomic(kaddr, KM_USER0); 2050 kunmap_atomic(kaddr);
2059 2051
2060 return copied; 2052 return copied;
2061} 2053}
@@ -2341,7 +2333,9 @@ struct page *grab_cache_page_write_begin(struct address_space *mapping,
2341 struct page *page; 2333 struct page *page;
2342 gfp_t gfp_notmask = 0; 2334 gfp_t gfp_notmask = 0;
2343 2335
2344 gfp_mask = mapping_gfp_mask(mapping) | __GFP_WRITE; 2336 gfp_mask = mapping_gfp_mask(mapping);
2337 if (mapping_cap_account_dirty(mapping))
2338 gfp_mask |= __GFP_WRITE;
2345 if (flags & AOP_FLAG_NOFS) 2339 if (flags & AOP_FLAG_NOFS)
2346 gfp_notmask = __GFP_FS; 2340 gfp_notmask = __GFP_FS;
2347repeat: 2341repeat:
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 91d3efb25d15..f0e5306eeb55 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -671,6 +671,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
671 set_pmd_at(mm, haddr, pmd, entry); 671 set_pmd_at(mm, haddr, pmd, entry);
672 prepare_pmd_huge_pte(pgtable, mm); 672 prepare_pmd_huge_pte(pgtable, mm);
673 add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR); 673 add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR);
674 mm->nr_ptes++;
674 spin_unlock(&mm->page_table_lock); 675 spin_unlock(&mm->page_table_lock);
675 } 676 }
676 677
@@ -789,6 +790,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
789 pmd = pmd_mkold(pmd_wrprotect(pmd)); 790 pmd = pmd_mkold(pmd_wrprotect(pmd));
790 set_pmd_at(dst_mm, addr, dst_pmd, pmd); 791 set_pmd_at(dst_mm, addr, dst_pmd, pmd);
791 prepare_pmd_huge_pte(pgtable, dst_mm); 792 prepare_pmd_huge_pte(pgtable, dst_mm);
793 dst_mm->nr_ptes++;
792 794
793 ret = 0; 795 ret = 0;
794out_unlock: 796out_unlock:
@@ -887,7 +889,6 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
887 } 889 }
888 kfree(pages); 890 kfree(pages);
889 891
890 mm->nr_ptes++;
891 smp_wmb(); /* make pte visible before pmd */ 892 smp_wmb(); /* make pte visible before pmd */
892 pmd_populate(mm, pmd, pgtable); 893 pmd_populate(mm, pmd, pgtable);
893 page_remove_rmap(page); 894 page_remove_rmap(page);
@@ -1030,31 +1031,23 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
1030{ 1031{
1031 int ret = 0; 1032 int ret = 0;
1032 1033
1033 spin_lock(&tlb->mm->page_table_lock); 1034 if (__pmd_trans_huge_lock(pmd, vma) == 1) {
1034 if (likely(pmd_trans_huge(*pmd))) { 1035 struct page *page;
1035 if (unlikely(pmd_trans_splitting(*pmd))) { 1036 pgtable_t pgtable;
1036 spin_unlock(&tlb->mm->page_table_lock); 1037 pgtable = get_pmd_huge_pte(tlb->mm);
1037 wait_split_huge_page(vma->anon_vma, 1038 page = pmd_page(*pmd);
1038 pmd); 1039 pmd_clear(pmd);
1039 } else { 1040 tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
1040 struct page *page; 1041 page_remove_rmap(page);
1041 pgtable_t pgtable; 1042 VM_BUG_ON(page_mapcount(page) < 0);
1042 pgtable = get_pmd_huge_pte(tlb->mm); 1043 add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
1043 page = pmd_page(*pmd); 1044 VM_BUG_ON(!PageHead(page));
1044 pmd_clear(pmd); 1045 tlb->mm->nr_ptes--;
1045 tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
1046 page_remove_rmap(page);
1047 VM_BUG_ON(page_mapcount(page) < 0);
1048 add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
1049 VM_BUG_ON(!PageHead(page));
1050 spin_unlock(&tlb->mm->page_table_lock);
1051 tlb_remove_page(tlb, page);
1052 pte_free(tlb->mm, pgtable);
1053 ret = 1;
1054 }
1055 } else
1056 spin_unlock(&tlb->mm->page_table_lock); 1046 spin_unlock(&tlb->mm->page_table_lock);
1057 1047 tlb_remove_page(tlb, page);
1048 pte_free(tlb->mm, pgtable);
1049 ret = 1;
1050 }
1058 return ret; 1051 return ret;
1059} 1052}
1060 1053
@@ -1064,21 +1057,15 @@ int mincore_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
1064{ 1057{
1065 int ret = 0; 1058 int ret = 0;
1066 1059
1067 spin_lock(&vma->vm_mm->page_table_lock); 1060 if (__pmd_trans_huge_lock(pmd, vma) == 1) {
1068 if (likely(pmd_trans_huge(*pmd))) { 1061 /*
1069 ret = !pmd_trans_splitting(*pmd); 1062 * All logical pages in the range are present
1070 spin_unlock(&vma->vm_mm->page_table_lock); 1063 * if backed by a huge page.
1071 if (unlikely(!ret)) 1064 */
1072 wait_split_huge_page(vma->anon_vma, pmd);
1073 else {
1074 /*
1075 * All logical pages in the range are present
1076 * if backed by a huge page.
1077 */
1078 memset(vec, 1, (end - addr) >> PAGE_SHIFT);
1079 }
1080 } else
1081 spin_unlock(&vma->vm_mm->page_table_lock); 1065 spin_unlock(&vma->vm_mm->page_table_lock);
1066 memset(vec, 1, (end - addr) >> PAGE_SHIFT);
1067 ret = 1;
1068 }
1082 1069
1083 return ret; 1070 return ret;
1084} 1071}
@@ -1108,20 +1095,11 @@ int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma,
1108 goto out; 1095 goto out;
1109 } 1096 }
1110 1097
1111 spin_lock(&mm->page_table_lock); 1098 ret = __pmd_trans_huge_lock(old_pmd, vma);
1112 if (likely(pmd_trans_huge(*old_pmd))) { 1099 if (ret == 1) {
1113 if (pmd_trans_splitting(*old_pmd)) { 1100 pmd = pmdp_get_and_clear(mm, old_addr, old_pmd);
1114 spin_unlock(&mm->page_table_lock); 1101 VM_BUG_ON(!pmd_none(*new_pmd));
1115 wait_split_huge_page(vma->anon_vma, old_pmd); 1102 set_pmd_at(mm, new_addr, new_pmd, pmd);
1116 ret = -1;
1117 } else {
1118 pmd = pmdp_get_and_clear(mm, old_addr, old_pmd);
1119 VM_BUG_ON(!pmd_none(*new_pmd));
1120 set_pmd_at(mm, new_addr, new_pmd, pmd);
1121 spin_unlock(&mm->page_table_lock);
1122 ret = 1;
1123 }
1124 } else {
1125 spin_unlock(&mm->page_table_lock); 1103 spin_unlock(&mm->page_table_lock);
1126 } 1104 }
1127out: 1105out:
@@ -1134,24 +1112,41 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
1134 struct mm_struct *mm = vma->vm_mm; 1112 struct mm_struct *mm = vma->vm_mm;
1135 int ret = 0; 1113 int ret = 0;
1136 1114
1137 spin_lock(&mm->page_table_lock); 1115 if (__pmd_trans_huge_lock(pmd, vma) == 1) {
1116 pmd_t entry;
1117 entry = pmdp_get_and_clear(mm, addr, pmd);
1118 entry = pmd_modify(entry, newprot);
1119 set_pmd_at(mm, addr, pmd, entry);
1120 spin_unlock(&vma->vm_mm->page_table_lock);
1121 ret = 1;
1122 }
1123
1124 return ret;
1125}
1126
1127/*
1128 * Returns 1 if a given pmd maps a stable (not under splitting) thp.
1129 * Returns -1 if it maps a thp under splitting. Returns 0 otherwise.
1130 *
1131 * Note that if it returns 1, this routine returns without unlocking page
1132 * table locks. So callers must unlock them.
1133 */
1134int __pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma)
1135{
1136 spin_lock(&vma->vm_mm->page_table_lock);
1138 if (likely(pmd_trans_huge(*pmd))) { 1137 if (likely(pmd_trans_huge(*pmd))) {
1139 if (unlikely(pmd_trans_splitting(*pmd))) { 1138 if (unlikely(pmd_trans_splitting(*pmd))) {
1140 spin_unlock(&mm->page_table_lock); 1139 spin_unlock(&vma->vm_mm->page_table_lock);
1141 wait_split_huge_page(vma->anon_vma, pmd); 1140 wait_split_huge_page(vma->anon_vma, pmd);
1141 return -1;
1142 } else { 1142 } else {
1143 pmd_t entry; 1143 /* Thp mapped by 'pmd' is stable, so we can
1144 1144 * handle it as it is. */
1145 entry = pmdp_get_and_clear(mm, addr, pmd); 1145 return 1;
1146 entry = pmd_modify(entry, newprot);
1147 set_pmd_at(mm, addr, pmd, entry);
1148 spin_unlock(&vma->vm_mm->page_table_lock);
1149 ret = 1;
1150 } 1146 }
1151 } else 1147 }
1152 spin_unlock(&vma->vm_mm->page_table_lock); 1148 spin_unlock(&vma->vm_mm->page_table_lock);
1153 1149 return 0;
1154 return ret;
1155} 1150}
1156 1151
1157pmd_t *page_check_address_pmd(struct page *page, 1152pmd_t *page_check_address_pmd(struct page *page,
@@ -1375,7 +1370,6 @@ static int __split_huge_page_map(struct page *page,
1375 pte_unmap(pte); 1370 pte_unmap(pte);
1376 } 1371 }
1377 1372
1378 mm->nr_ptes++;
1379 smp_wmb(); /* make pte visible before pmd */ 1373 smp_wmb(); /* make pte visible before pmd */
1380 /* 1374 /*
1381 * Up to this point the pmd is present and huge and 1375 * Up to this point the pmd is present and huge and
@@ -1988,7 +1982,6 @@ static void collapse_huge_page(struct mm_struct *mm,
1988 set_pmd_at(mm, address, pmd, _pmd); 1982 set_pmd_at(mm, address, pmd, _pmd);
1989 update_mmu_cache(vma, address, _pmd); 1983 update_mmu_cache(vma, address, _pmd);
1990 prepare_pmd_huge_pte(pgtable, mm); 1984 prepare_pmd_huge_pte(pgtable, mm);
1991 mm->nr_ptes--;
1992 spin_unlock(&mm->page_table_lock); 1985 spin_unlock(&mm->page_table_lock);
1993 1986
1994#ifndef CONFIG_NUMA 1987#ifndef CONFIG_NUMA
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 5f34bd8dda34..b8ce6f450956 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -53,6 +53,84 @@ static unsigned long __initdata default_hstate_size;
53 */ 53 */
54static DEFINE_SPINLOCK(hugetlb_lock); 54static DEFINE_SPINLOCK(hugetlb_lock);
55 55
56static inline void unlock_or_release_subpool(struct hugepage_subpool *spool)
57{
58 bool free = (spool->count == 0) && (spool->used_hpages == 0);
59
60 spin_unlock(&spool->lock);
61
62 /* If no pages are used, and no other handles to the subpool
63 * remain, free the subpool the subpool remain */
64 if (free)
65 kfree(spool);
66}
67
68struct hugepage_subpool *hugepage_new_subpool(long nr_blocks)
69{
70 struct hugepage_subpool *spool;
71
72 spool = kmalloc(sizeof(*spool), GFP_KERNEL);
73 if (!spool)
74 return NULL;
75
76 spin_lock_init(&spool->lock);
77 spool->count = 1;
78 spool->max_hpages = nr_blocks;
79 spool->used_hpages = 0;
80
81 return spool;
82}
83
84void hugepage_put_subpool(struct hugepage_subpool *spool)
85{
86 spin_lock(&spool->lock);
87 BUG_ON(!spool->count);
88 spool->count--;
89 unlock_or_release_subpool(spool);
90}
91
92static int hugepage_subpool_get_pages(struct hugepage_subpool *spool,
93 long delta)
94{
95 int ret = 0;
96
97 if (!spool)
98 return 0;
99
100 spin_lock(&spool->lock);
101 if ((spool->used_hpages + delta) <= spool->max_hpages) {
102 spool->used_hpages += delta;
103 } else {
104 ret = -ENOMEM;
105 }
106 spin_unlock(&spool->lock);
107
108 return ret;
109}
110
111static void hugepage_subpool_put_pages(struct hugepage_subpool *spool,
112 long delta)
113{
114 if (!spool)
115 return;
116
117 spin_lock(&spool->lock);
118 spool->used_hpages -= delta;
119 /* If hugetlbfs_put_super couldn't free spool due to
120 * an outstanding quota reference, free it now. */
121 unlock_or_release_subpool(spool);
122}
123
124static inline struct hugepage_subpool *subpool_inode(struct inode *inode)
125{
126 return HUGETLBFS_SB(inode->i_sb)->spool;
127}
128
129static inline struct hugepage_subpool *subpool_vma(struct vm_area_struct *vma)
130{
131 return subpool_inode(vma->vm_file->f_dentry->d_inode);
132}
133
56/* 134/*
57 * Region tracking -- allows tracking of reservations and instantiated pages 135 * Region tracking -- allows tracking of reservations and instantiated pages
58 * across the pages in a mapping. 136 * across the pages in a mapping.
@@ -454,14 +532,16 @@ static struct page *dequeue_huge_page_vma(struct hstate *h,
454 struct vm_area_struct *vma, 532 struct vm_area_struct *vma,
455 unsigned long address, int avoid_reserve) 533 unsigned long address, int avoid_reserve)
456{ 534{
457 struct page *page = NULL; 535 struct page *page;
458 struct mempolicy *mpol; 536 struct mempolicy *mpol;
459 nodemask_t *nodemask; 537 nodemask_t *nodemask;
460 struct zonelist *zonelist; 538 struct zonelist *zonelist;
461 struct zone *zone; 539 struct zone *zone;
462 struct zoneref *z; 540 struct zoneref *z;
541 unsigned int cpuset_mems_cookie;
463 542
464 get_mems_allowed(); 543retry_cpuset:
544 cpuset_mems_cookie = get_mems_allowed();
465 zonelist = huge_zonelist(vma, address, 545 zonelist = huge_zonelist(vma, address,
466 htlb_alloc_mask, &mpol, &nodemask); 546 htlb_alloc_mask, &mpol, &nodemask);
467 /* 547 /*
@@ -488,10 +568,15 @@ static struct page *dequeue_huge_page_vma(struct hstate *h,
488 } 568 }
489 } 569 }
490 } 570 }
491err: 571
492 mpol_cond_put(mpol); 572 mpol_cond_put(mpol);
493 put_mems_allowed(); 573 if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
574 goto retry_cpuset;
494 return page; 575 return page;
576
577err:
578 mpol_cond_put(mpol);
579 return NULL;
495} 580}
496 581
497static void update_and_free_page(struct hstate *h, struct page *page) 582static void update_and_free_page(struct hstate *h, struct page *page)
@@ -533,9 +618,9 @@ static void free_huge_page(struct page *page)
533 */ 618 */
534 struct hstate *h = page_hstate(page); 619 struct hstate *h = page_hstate(page);
535 int nid = page_to_nid(page); 620 int nid = page_to_nid(page);
536 struct address_space *mapping; 621 struct hugepage_subpool *spool =
622 (struct hugepage_subpool *)page_private(page);
537 623
538 mapping = (struct address_space *) page_private(page);
539 set_page_private(page, 0); 624 set_page_private(page, 0);
540 page->mapping = NULL; 625 page->mapping = NULL;
541 BUG_ON(page_count(page)); 626 BUG_ON(page_count(page));
@@ -551,8 +636,7 @@ static void free_huge_page(struct page *page)
551 enqueue_huge_page(h, page); 636 enqueue_huge_page(h, page);
552 } 637 }
553 spin_unlock(&hugetlb_lock); 638 spin_unlock(&hugetlb_lock);
554 if (mapping) 639 hugepage_subpool_put_pages(spool, 1);
555 hugetlb_put_quota(mapping, 1);
556} 640}
557 641
558static void prep_new_huge_page(struct hstate *h, struct page *page, int nid) 642static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
@@ -852,6 +936,7 @@ static int gather_surplus_pages(struct hstate *h, int delta)
852 struct page *page, *tmp; 936 struct page *page, *tmp;
853 int ret, i; 937 int ret, i;
854 int needed, allocated; 938 int needed, allocated;
939 bool alloc_ok = true;
855 940
856 needed = (h->resv_huge_pages + delta) - h->free_huge_pages; 941 needed = (h->resv_huge_pages + delta) - h->free_huge_pages;
857 if (needed <= 0) { 942 if (needed <= 0) {
@@ -867,17 +952,13 @@ retry:
867 spin_unlock(&hugetlb_lock); 952 spin_unlock(&hugetlb_lock);
868 for (i = 0; i < needed; i++) { 953 for (i = 0; i < needed; i++) {
869 page = alloc_buddy_huge_page(h, NUMA_NO_NODE); 954 page = alloc_buddy_huge_page(h, NUMA_NO_NODE);
870 if (!page) 955 if (!page) {
871 /* 956 alloc_ok = false;
872 * We were not able to allocate enough pages to 957 break;
873 * satisfy the entire reservation so we free what 958 }
874 * we've allocated so far.
875 */
876 goto free;
877
878 list_add(&page->lru, &surplus_list); 959 list_add(&page->lru, &surplus_list);
879 } 960 }
880 allocated += needed; 961 allocated += i;
881 962
882 /* 963 /*
883 * After retaking hugetlb_lock, we need to recalculate 'needed' 964 * After retaking hugetlb_lock, we need to recalculate 'needed'
@@ -886,9 +967,16 @@ retry:
886 spin_lock(&hugetlb_lock); 967 spin_lock(&hugetlb_lock);
887 needed = (h->resv_huge_pages + delta) - 968 needed = (h->resv_huge_pages + delta) -
888 (h->free_huge_pages + allocated); 969 (h->free_huge_pages + allocated);
889 if (needed > 0) 970 if (needed > 0) {
890 goto retry; 971 if (alloc_ok)
891 972 goto retry;
973 /*
974 * We were not able to allocate enough pages to
975 * satisfy the entire reservation so we free what
976 * we've allocated so far.
977 */
978 goto free;
979 }
892 /* 980 /*
893 * The surplus_list now contains _at_least_ the number of extra pages 981 * The surplus_list now contains _at_least_ the number of extra pages
894 * needed to accommodate the reservation. Add the appropriate number 982 * needed to accommodate the reservation. Add the appropriate number
@@ -914,10 +1002,10 @@ retry:
914 VM_BUG_ON(page_count(page)); 1002 VM_BUG_ON(page_count(page));
915 enqueue_huge_page(h, page); 1003 enqueue_huge_page(h, page);
916 } 1004 }
1005free:
917 spin_unlock(&hugetlb_lock); 1006 spin_unlock(&hugetlb_lock);
918 1007
919 /* Free unnecessary surplus pages to the buddy allocator */ 1008 /* Free unnecessary surplus pages to the buddy allocator */
920free:
921 if (!list_empty(&surplus_list)) { 1009 if (!list_empty(&surplus_list)) {
922 list_for_each_entry_safe(page, tmp, &surplus_list, lru) { 1010 list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
923 list_del(&page->lru); 1011 list_del(&page->lru);
@@ -966,11 +1054,12 @@ static void return_unused_surplus_pages(struct hstate *h,
966/* 1054/*
967 * Determine if the huge page at addr within the vma has an associated 1055 * Determine if the huge page at addr within the vma has an associated
968 * reservation. Where it does not we will need to logically increase 1056 * reservation. Where it does not we will need to logically increase
969 * reservation and actually increase quota before an allocation can occur. 1057 * reservation and actually increase subpool usage before an allocation
970 * Where any new reservation would be required the reservation change is 1058 * can occur. Where any new reservation would be required the
971 * prepared, but not committed. Once the page has been quota'd allocated 1059 * reservation change is prepared, but not committed. Once the page
972 * an instantiated the change should be committed via vma_commit_reservation. 1060 * has been allocated from the subpool and instantiated the change should
973 * No action is required on failure. 1061 * be committed via vma_commit_reservation. No action is required on
1062 * failure.
974 */ 1063 */
975static long vma_needs_reservation(struct hstate *h, 1064static long vma_needs_reservation(struct hstate *h,
976 struct vm_area_struct *vma, unsigned long addr) 1065 struct vm_area_struct *vma, unsigned long addr)
@@ -1019,24 +1108,24 @@ static void vma_commit_reservation(struct hstate *h,
1019static struct page *alloc_huge_page(struct vm_area_struct *vma, 1108static struct page *alloc_huge_page(struct vm_area_struct *vma,
1020 unsigned long addr, int avoid_reserve) 1109 unsigned long addr, int avoid_reserve)
1021{ 1110{
1111 struct hugepage_subpool *spool = subpool_vma(vma);
1022 struct hstate *h = hstate_vma(vma); 1112 struct hstate *h = hstate_vma(vma);
1023 struct page *page; 1113 struct page *page;
1024 struct address_space *mapping = vma->vm_file->f_mapping;
1025 struct inode *inode = mapping->host;
1026 long chg; 1114 long chg;
1027 1115
1028 /* 1116 /*
1029 * Processes that did not create the mapping will have no reserves and 1117 * Processes that did not create the mapping will have no
1030 * will not have accounted against quota. Check that the quota can be 1118 * reserves and will not have accounted against subpool
1031 * made before satisfying the allocation 1119 * limit. Check that the subpool limit can be made before
1032 * MAP_NORESERVE mappings may also need pages and quota allocated 1120 * satisfying the allocation MAP_NORESERVE mappings may also
1033 * if no reserve mapping overlaps. 1121 * need pages and subpool limit allocated allocated if no reserve
1122 * mapping overlaps.
1034 */ 1123 */
1035 chg = vma_needs_reservation(h, vma, addr); 1124 chg = vma_needs_reservation(h, vma, addr);
1036 if (chg < 0) 1125 if (chg < 0)
1037 return ERR_PTR(-VM_FAULT_OOM); 1126 return ERR_PTR(-VM_FAULT_OOM);
1038 if (chg) 1127 if (chg)
1039 if (hugetlb_get_quota(inode->i_mapping, chg)) 1128 if (hugepage_subpool_get_pages(spool, chg))
1040 return ERR_PTR(-VM_FAULT_SIGBUS); 1129 return ERR_PTR(-VM_FAULT_SIGBUS);
1041 1130
1042 spin_lock(&hugetlb_lock); 1131 spin_lock(&hugetlb_lock);
@@ -1046,12 +1135,12 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
1046 if (!page) { 1135 if (!page) {
1047 page = alloc_buddy_huge_page(h, NUMA_NO_NODE); 1136 page = alloc_buddy_huge_page(h, NUMA_NO_NODE);
1048 if (!page) { 1137 if (!page) {
1049 hugetlb_put_quota(inode->i_mapping, chg); 1138 hugepage_subpool_put_pages(spool, chg);
1050 return ERR_PTR(-VM_FAULT_SIGBUS); 1139 return ERR_PTR(-VM_FAULT_SIGBUS);
1051 } 1140 }
1052 } 1141 }
1053 1142
1054 set_page_private(page, (unsigned long) mapping); 1143 set_page_private(page, (unsigned long)spool);
1055 1144
1056 vma_commit_reservation(h, vma, addr); 1145 vma_commit_reservation(h, vma, addr);
1057 1146
@@ -2072,6 +2161,7 @@ static void hugetlb_vm_op_close(struct vm_area_struct *vma)
2072{ 2161{
2073 struct hstate *h = hstate_vma(vma); 2162 struct hstate *h = hstate_vma(vma);
2074 struct resv_map *reservations = vma_resv_map(vma); 2163 struct resv_map *reservations = vma_resv_map(vma);
2164 struct hugepage_subpool *spool = subpool_vma(vma);
2075 unsigned long reserve; 2165 unsigned long reserve;
2076 unsigned long start; 2166 unsigned long start;
2077 unsigned long end; 2167 unsigned long end;
@@ -2087,7 +2177,7 @@ static void hugetlb_vm_op_close(struct vm_area_struct *vma)
2087 2177
2088 if (reserve) { 2178 if (reserve) {
2089 hugetlb_acct_memory(h, -reserve); 2179 hugetlb_acct_memory(h, -reserve);
2090 hugetlb_put_quota(vma->vm_file->f_mapping, reserve); 2180 hugepage_subpool_put_pages(spool, reserve);
2091 } 2181 }
2092 } 2182 }
2093} 2183}
@@ -2241,16 +2331,23 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
2241 if (huge_pmd_unshare(mm, &address, ptep)) 2331 if (huge_pmd_unshare(mm, &address, ptep))
2242 continue; 2332 continue;
2243 2333
2334 pte = huge_ptep_get(ptep);
2335 if (huge_pte_none(pte))
2336 continue;
2337
2338 /*
2339 * HWPoisoned hugepage is already unmapped and dropped reference
2340 */
2341 if (unlikely(is_hugetlb_entry_hwpoisoned(pte)))
2342 continue;
2343
2344 page = pte_page(pte);
2244 /* 2345 /*
2245 * If a reference page is supplied, it is because a specific 2346 * If a reference page is supplied, it is because a specific
2246 * page is being unmapped, not a range. Ensure the page we 2347 * page is being unmapped, not a range. Ensure the page we
2247 * are about to unmap is the actual page of interest. 2348 * are about to unmap is the actual page of interest.
2248 */ 2349 */
2249 if (ref_page) { 2350 if (ref_page) {
2250 pte = huge_ptep_get(ptep);
2251 if (huge_pte_none(pte))
2252 continue;
2253 page = pte_page(pte);
2254 if (page != ref_page) 2351 if (page != ref_page)
2255 continue; 2352 continue;
2256 2353
@@ -2263,22 +2360,16 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
2263 } 2360 }
2264 2361
2265 pte = huge_ptep_get_and_clear(mm, address, ptep); 2362 pte = huge_ptep_get_and_clear(mm, address, ptep);
2266 if (huge_pte_none(pte))
2267 continue;
2268
2269 /*
2270 * HWPoisoned hugepage is already unmapped and dropped reference
2271 */
2272 if (unlikely(is_hugetlb_entry_hwpoisoned(pte)))
2273 continue;
2274
2275 page = pte_page(pte);
2276 if (pte_dirty(pte)) 2363 if (pte_dirty(pte))
2277 set_page_dirty(page); 2364 set_page_dirty(page);
2278 list_add(&page->lru, &page_list); 2365 list_add(&page->lru, &page_list);
2366
2367 /* Bail out after unmapping reference page if supplied */
2368 if (ref_page)
2369 break;
2279 } 2370 }
2280 spin_unlock(&mm->page_table_lock);
2281 flush_tlb_range(vma, start, end); 2371 flush_tlb_range(vma, start, end);
2372 spin_unlock(&mm->page_table_lock);
2282 mmu_notifier_invalidate_range_end(mm, start, end); 2373 mmu_notifier_invalidate_range_end(mm, start, end);
2283 list_for_each_entry_safe(page, tmp, &page_list, lru) { 2374 list_for_each_entry_safe(page, tmp, &page_list, lru) {
2284 page_remove_rmap(page); 2375 page_remove_rmap(page);
@@ -2316,7 +2407,7 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
2316 */ 2407 */
2317 address = address & huge_page_mask(h); 2408 address = address & huge_page_mask(h);
2318 pgoff = vma_hugecache_offset(h, vma, address); 2409 pgoff = vma_hugecache_offset(h, vma, address);
2319 mapping = (struct address_space *)page_private(page); 2410 mapping = vma->vm_file->f_dentry->d_inode->i_mapping;
2320 2411
2321 /* 2412 /*
2322 * Take the mapping lock for the duration of the table walk. As 2413 * Take the mapping lock for the duration of the table walk. As
@@ -2869,11 +2960,12 @@ int hugetlb_reserve_pages(struct inode *inode,
2869{ 2960{
2870 long ret, chg; 2961 long ret, chg;
2871 struct hstate *h = hstate_inode(inode); 2962 struct hstate *h = hstate_inode(inode);
2963 struct hugepage_subpool *spool = subpool_inode(inode);
2872 2964
2873 /* 2965 /*
2874 * Only apply hugepage reservation if asked. At fault time, an 2966 * Only apply hugepage reservation if asked. At fault time, an
2875 * attempt will be made for VM_NORESERVE to allocate a page 2967 * attempt will be made for VM_NORESERVE to allocate a page
2876 * and filesystem quota without using reserves 2968 * without using reserves
2877 */ 2969 */
2878 if (vm_flags & VM_NORESERVE) 2970 if (vm_flags & VM_NORESERVE)
2879 return 0; 2971 return 0;
@@ -2900,17 +2992,17 @@ int hugetlb_reserve_pages(struct inode *inode,
2900 if (chg < 0) 2992 if (chg < 0)
2901 return chg; 2993 return chg;
2902 2994
2903 /* There must be enough filesystem quota for the mapping */ 2995 /* There must be enough pages in the subpool for the mapping */
2904 if (hugetlb_get_quota(inode->i_mapping, chg)) 2996 if (hugepage_subpool_get_pages(spool, chg))
2905 return -ENOSPC; 2997 return -ENOSPC;
2906 2998
2907 /* 2999 /*
2908 * Check enough hugepages are available for the reservation. 3000 * Check enough hugepages are available for the reservation.
2909 * Hand back the quota if there are not 3001 * Hand the pages back to the subpool if there are not
2910 */ 3002 */
2911 ret = hugetlb_acct_memory(h, chg); 3003 ret = hugetlb_acct_memory(h, chg);
2912 if (ret < 0) { 3004 if (ret < 0) {
2913 hugetlb_put_quota(inode->i_mapping, chg); 3005 hugepage_subpool_put_pages(spool, chg);
2914 return ret; 3006 return ret;
2915 } 3007 }
2916 3008
@@ -2934,12 +3026,13 @@ void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
2934{ 3026{
2935 struct hstate *h = hstate_inode(inode); 3027 struct hstate *h = hstate_inode(inode);
2936 long chg = region_truncate(&inode->i_mapping->private_list, offset); 3028 long chg = region_truncate(&inode->i_mapping->private_list, offset);
3029 struct hugepage_subpool *spool = subpool_inode(inode);
2937 3030
2938 spin_lock(&inode->i_lock); 3031 spin_lock(&inode->i_lock);
2939 inode->i_blocks -= (blocks_per_huge_page(h) * freed); 3032 inode->i_blocks -= (blocks_per_huge_page(h) * freed);
2940 spin_unlock(&inode->i_lock); 3033 spin_unlock(&inode->i_lock);
2941 3034
2942 hugetlb_put_quota(inode->i_mapping, (chg - freed)); 3035 hugepage_subpool_put_pages(spool, (chg - freed));
2943 hugetlb_acct_memory(h, -(chg - freed)); 3036 hugetlb_acct_memory(h, -(chg - freed));
2944} 3037}
2945 3038
diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c
index c7fc7fd00e32..cc448bb983ba 100644
--- a/mm/hwpoison-inject.c
+++ b/mm/hwpoison-inject.c
@@ -45,7 +45,7 @@ static int hwpoison_inject(void *data, u64 val)
45 * do a racy check with elevated page count, to make sure PG_hwpoison 45 * do a racy check with elevated page count, to make sure PG_hwpoison
46 * will only be set for the targeted owner (or on a free page). 46 * will only be set for the targeted owner (or on a free page).
47 * We temporarily take page lock for try_get_mem_cgroup_from_page(). 47 * We temporarily take page lock for try_get_mem_cgroup_from_page().
48 * __memory_failure() will redo the check reliably inside page lock. 48 * memory_failure() will redo the check reliably inside page lock.
49 */ 49 */
50 lock_page(hpage); 50 lock_page(hpage);
51 err = hwpoison_filter(hpage); 51 err = hwpoison_filter(hpage);
@@ -55,7 +55,7 @@ static int hwpoison_inject(void *data, u64 val)
55 55
56inject: 56inject:
57 printk(KERN_INFO "Injecting memory failure at pfn %lx\n", pfn); 57 printk(KERN_INFO "Injecting memory failure at pfn %lx\n", pfn);
58 return __memory_failure(pfn, 18, MF_COUNT_INCREASED); 58 return memory_failure(pfn, 18, MF_COUNT_INCREASED);
59} 59}
60 60
61static int hwpoison_unpoison(void *data, u64 val) 61static int hwpoison_unpoison(void *data, u64 val)
diff --git a/mm/ksm.c b/mm/ksm.c
index 1925ffbfb27f..47c885368890 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -28,7 +28,6 @@
28#include <linux/kthread.h> 28#include <linux/kthread.h>
29#include <linux/wait.h> 29#include <linux/wait.h>
30#include <linux/slab.h> 30#include <linux/slab.h>
31#include <linux/memcontrol.h>
32#include <linux/rbtree.h> 31#include <linux/rbtree.h>
33#include <linux/memory.h> 32#include <linux/memory.h>
34#include <linux/mmu_notifier.h> 33#include <linux/mmu_notifier.h>
@@ -375,6 +374,20 @@ static int break_ksm(struct vm_area_struct *vma, unsigned long addr)
375 return (ret & VM_FAULT_OOM) ? -ENOMEM : 0; 374 return (ret & VM_FAULT_OOM) ? -ENOMEM : 0;
376} 375}
377 376
377static struct vm_area_struct *find_mergeable_vma(struct mm_struct *mm,
378 unsigned long addr)
379{
380 struct vm_area_struct *vma;
381 if (ksm_test_exit(mm))
382 return NULL;
383 vma = find_vma(mm, addr);
384 if (!vma || vma->vm_start > addr)
385 return NULL;
386 if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma)
387 return NULL;
388 return vma;
389}
390
378static void break_cow(struct rmap_item *rmap_item) 391static void break_cow(struct rmap_item *rmap_item)
379{ 392{
380 struct mm_struct *mm = rmap_item->mm; 393 struct mm_struct *mm = rmap_item->mm;
@@ -388,15 +401,9 @@ static void break_cow(struct rmap_item *rmap_item)
388 put_anon_vma(rmap_item->anon_vma); 401 put_anon_vma(rmap_item->anon_vma);
389 402
390 down_read(&mm->mmap_sem); 403 down_read(&mm->mmap_sem);
391 if (ksm_test_exit(mm)) 404 vma = find_mergeable_vma(mm, addr);
392 goto out; 405 if (vma)
393 vma = find_vma(mm, addr); 406 break_ksm(vma, addr);
394 if (!vma || vma->vm_start > addr)
395 goto out;
396 if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma)
397 goto out;
398 break_ksm(vma, addr);
399out:
400 up_read(&mm->mmap_sem); 407 up_read(&mm->mmap_sem);
401} 408}
402 409
@@ -422,12 +429,8 @@ static struct page *get_mergeable_page(struct rmap_item *rmap_item)
422 struct page *page; 429 struct page *page;
423 430
424 down_read(&mm->mmap_sem); 431 down_read(&mm->mmap_sem);
425 if (ksm_test_exit(mm)) 432 vma = find_mergeable_vma(mm, addr);
426 goto out; 433 if (!vma)
427 vma = find_vma(mm, addr);
428 if (!vma || vma->vm_start > addr)
429 goto out;
430 if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma)
431 goto out; 434 goto out;
432 435
433 page = follow_page(vma, addr, FOLL_GET); 436 page = follow_page(vma, addr, FOLL_GET);
@@ -673,9 +676,9 @@ error:
673static u32 calc_checksum(struct page *page) 676static u32 calc_checksum(struct page *page)
674{ 677{
675 u32 checksum; 678 u32 checksum;
676 void *addr = kmap_atomic(page, KM_USER0); 679 void *addr = kmap_atomic(page);
677 checksum = jhash2(addr, PAGE_SIZE / 4, 17); 680 checksum = jhash2(addr, PAGE_SIZE / 4, 17);
678 kunmap_atomic(addr, KM_USER0); 681 kunmap_atomic(addr);
679 return checksum; 682 return checksum;
680} 683}
681 684
@@ -684,11 +687,11 @@ static int memcmp_pages(struct page *page1, struct page *page2)
684 char *addr1, *addr2; 687 char *addr1, *addr2;
685 int ret; 688 int ret;
686 689
687 addr1 = kmap_atomic(page1, KM_USER0); 690 addr1 = kmap_atomic(page1);
688 addr2 = kmap_atomic(page2, KM_USER1); 691 addr2 = kmap_atomic(page2);
689 ret = memcmp(addr1, addr2, PAGE_SIZE); 692 ret = memcmp(addr1, addr2, PAGE_SIZE);
690 kunmap_atomic(addr2, KM_USER1); 693 kunmap_atomic(addr2);
691 kunmap_atomic(addr1, KM_USER0); 694 kunmap_atomic(addr1);
692 return ret; 695 return ret;
693} 696}
694 697
@@ -1572,16 +1575,6 @@ struct page *ksm_does_need_to_copy(struct page *page,
1572 1575
1573 new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); 1576 new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
1574 if (new_page) { 1577 if (new_page) {
1575 /*
1576 * The memcg-specific accounting when moving
1577 * pages around the LRU lists relies on the
1578 * page's owner (memcg) to be valid. Usually,
1579 * pages are assigned to a new owner before
1580 * being put on the LRU list, but since this
1581 * is not the case here, the stale owner from
1582 * a previous allocation cycle must be reset.
1583 */
1584 mem_cgroup_reset_owner(new_page);
1585 copy_user_highpage(new_page, page, address, vma); 1578 copy_user_highpage(new_page, page, address, vma);
1586 1579
1587 SetPageDirty(new_page); 1580 SetPageDirty(new_page);
diff --git a/mm/madvise.c b/mm/madvise.c
index 74bf193eff04..1ccbba5b6674 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -65,6 +65,12 @@ static long madvise_behavior(struct vm_area_struct * vma,
65 } 65 }
66 new_flags &= ~VM_DONTCOPY; 66 new_flags &= ~VM_DONTCOPY;
67 break; 67 break;
68 case MADV_DONTDUMP:
69 new_flags |= VM_NODUMP;
70 break;
71 case MADV_DODUMP:
72 new_flags &= ~VM_NODUMP;
73 break;
68 case MADV_MERGEABLE: 74 case MADV_MERGEABLE:
69 case MADV_UNMERGEABLE: 75 case MADV_UNMERGEABLE:
70 error = ksm_madvise(vma, start, end, behavior, &new_flags); 76 error = ksm_madvise(vma, start, end, behavior, &new_flags);
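The new MADV_DONTDUMP/MADV_DODUMP advice is driven from user space through madvise(2). A minimal user-space sketch, assuming a libc that already exposes the two constants (they come from the uapi side of this series, not from this hunk):

#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <sys/mman.h>

int main(void)
{
	size_t len = 16 * 4096;
	void *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (buf == MAP_FAILED)
		return EXIT_FAILURE;

	/* Exclude the region from core dumps: sets VM_NODUMP on the VMA. */
	if (madvise(buf, len, MADV_DONTDUMP))
		perror("madvise(MADV_DONTDUMP)");

	/* ... and later opt it back in, clearing VM_NODUMP again. */
	if (madvise(buf, len, MADV_DODUMP))
		perror("madvise(MADV_DODUMP)");

	munmap(buf, len);
	return EXIT_SUCCESS;
}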
@@ -251,7 +257,7 @@ static int madvise_hwpoison(int bhv, unsigned long start, unsigned long end)
251 printk(KERN_INFO "Injecting memory failure for page %lx at %lx\n", 257 printk(KERN_INFO "Injecting memory failure for page %lx at %lx\n",
252 page_to_pfn(p), start); 258 page_to_pfn(p), start);
253 /* Ignore return value for now */ 259 /* Ignore return value for now */
254 __memory_failure(page_to_pfn(p), 0, MF_COUNT_INCREASED); 260 memory_failure(page_to_pfn(p), 0, MF_COUNT_INCREASED);
255 } 261 }
256 return ret; 262 return ret;
257} 263}
@@ -293,6 +299,8 @@ madvise_behavior_valid(int behavior)
293 case MADV_HUGEPAGE: 299 case MADV_HUGEPAGE:
294 case MADV_NOHUGEPAGE: 300 case MADV_NOHUGEPAGE:
295#endif 301#endif
302 case MADV_DONTDUMP:
303 case MADV_DODUMP:
296 return 1; 304 return 1;
297 305
298 default: 306 default:
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 228d6461c12a..7d698df4a067 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -89,7 +89,6 @@ enum mem_cgroup_stat_index {
89 MEM_CGROUP_STAT_FILE_MAPPED, /* # of pages charged as file rss */ 89 MEM_CGROUP_STAT_FILE_MAPPED, /* # of pages charged as file rss */
90 MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */ 90 MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */
91 MEM_CGROUP_STAT_DATA, /* end of data requires synchronization */ 91 MEM_CGROUP_STAT_DATA, /* end of data requires synchronization */
92 MEM_CGROUP_ON_MOVE, /* someone is moving account between groups */
93 MEM_CGROUP_STAT_NSTATS, 92 MEM_CGROUP_STAT_NSTATS,
94}; 93};
95 94
@@ -135,7 +134,7 @@ struct mem_cgroup_reclaim_iter {
135 */ 134 */
136struct mem_cgroup_per_zone { 135struct mem_cgroup_per_zone {
137 struct lruvec lruvec; 136 struct lruvec lruvec;
138 unsigned long count[NR_LRU_LISTS]; 137 unsigned long lru_size[NR_LRU_LISTS];
139 138
140 struct mem_cgroup_reclaim_iter reclaim_iter[DEF_PRIORITY + 1]; 139 struct mem_cgroup_reclaim_iter reclaim_iter[DEF_PRIORITY + 1];
141 140
@@ -144,11 +143,9 @@ struct mem_cgroup_per_zone {
144 unsigned long long usage_in_excess;/* Set to the value by which */ 143 unsigned long long usage_in_excess;/* Set to the value by which */
145 /* the soft limit is exceeded*/ 144 /* the soft limit is exceeded*/
146 bool on_tree; 145 bool on_tree;
147 struct mem_cgroup *mem; /* Back pointer, we cannot */ 146 struct mem_cgroup *memcg; /* Back pointer, we cannot */
148 /* use container_of */ 147 /* use container_of */
149}; 148};
150/* Macro for accessing counter */
151#define MEM_CGROUP_ZSTAT(mz, idx) ((mz)->count[(idx)])
152 149
153struct mem_cgroup_per_node { 150struct mem_cgroup_per_node {
154 struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES]; 151 struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES];
@@ -230,10 +227,30 @@ struct mem_cgroup {
230 * the counter to account for memory usage 227 * the counter to account for memory usage
231 */ 228 */
232 struct res_counter res; 229 struct res_counter res;
233 /* 230
234 * the counter to account for mem+swap usage. 231 union {
235 */ 232 /*
236 struct res_counter memsw; 233 * the counter to account for mem+swap usage.
234 */
235 struct res_counter memsw;
236
237 /*
238 * rcu_freeing is used only when freeing struct mem_cgroup,
239 * so put it into a union to avoid wasting more memory.
240 * It must be disjoint from the css field. It could be
241 * in a union with the res field, but res plays a much
242 * larger part in mem_cgroup life than memsw, and might
243 * be of interest, even at time of free, when debugging.
244 * So share rcu_head with the less interesting memsw.
245 */
246 struct rcu_head rcu_freeing;
247 /*
248 * But when using vfree(), that cannot be done at
249 * interrupt time, so we must then queue the work.
250 */
251 struct work_struct work_freeing;
252 };
253
237 /* 254 /*
238 * Per cgroup active and inactive list, similar to the 255 * Per cgroup active and inactive list, similar to the
239 * per zone LRU lists. 256 * per zone LRU lists.
@@ -280,6 +297,12 @@ struct mem_cgroup {
280 */ 297 */
281 unsigned long move_charge_at_immigrate; 298 unsigned long move_charge_at_immigrate;
282 /* 299 /*
300 * set > 0 if pages under this cgroup are moving to another cgroup.
301 */
302 atomic_t moving_account;
303 /* taken only while moving_account > 0 */
304 spinlock_t move_lock;
305 /*
283 * percpu counter. 306 * percpu counter.
284 */ 307 */
285 struct mem_cgroup_stat_cpu *stat; 308 struct mem_cgroup_stat_cpu *stat;
@@ -592,9 +615,9 @@ retry:
592 * we will add it back at the end of reclaim to its correct 615
593 * position in the tree. 616 * position in the tree.
594 */ 617 */
595 __mem_cgroup_remove_exceeded(mz->mem, mz, mctz); 618 __mem_cgroup_remove_exceeded(mz->memcg, mz, mctz);
596 if (!res_counter_soft_limit_excess(&mz->mem->res) || 619 if (!res_counter_soft_limit_excess(&mz->memcg->res) ||
597 !css_tryget(&mz->mem->css)) 620 !css_tryget(&mz->memcg->css))
598 goto retry; 621 goto retry;
599done: 622done:
600 return mz; 623 return mz;
@@ -672,15 +695,19 @@ static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg,
672} 695}
673 696
674static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg, 697static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
675 bool file, int nr_pages) 698 bool anon, int nr_pages)
676{ 699{
677 preempt_disable(); 700 preempt_disable();
678 701
679 if (file) 702 /*
680 __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_CACHE], 703 * Here, RSS means 'mapped anon' and anon's SwapCache. Shmem/tmpfs is
704 * counted as CACHE even if it's on ANON LRU.
705 */
706 if (anon)
707 __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS],
681 nr_pages); 708 nr_pages);
682 else 709 else
683 __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS], 710 __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_CACHE],
684 nr_pages); 711 nr_pages);
685 712
686 /* pagein of a big page is an event. So, ignore page size */ 713 /* pagein of a big page is an event. So, ignore page size */
@@ -701,14 +728,14 @@ mem_cgroup_zone_nr_lru_pages(struct mem_cgroup *memcg, int nid, int zid,
701 unsigned int lru_mask) 728 unsigned int lru_mask)
702{ 729{
703 struct mem_cgroup_per_zone *mz; 730 struct mem_cgroup_per_zone *mz;
704 enum lru_list l; 731 enum lru_list lru;
705 unsigned long ret = 0; 732 unsigned long ret = 0;
706 733
707 mz = mem_cgroup_zoneinfo(memcg, nid, zid); 734 mz = mem_cgroup_zoneinfo(memcg, nid, zid);
708 735
709 for_each_lru(l) { 736 for_each_lru(lru) {
710 if (BIT(l) & lru_mask) 737 if (BIT(lru) & lru_mask)
711 ret += MEM_CGROUP_ZSTAT(mz, l); 738 ret += mz->lru_size[lru];
712 } 739 }
713 return ret; 740 return ret;
714} 741}
@@ -1042,9 +1069,22 @@ struct lruvec *mem_cgroup_lru_add_list(struct zone *zone, struct page *page,
1042 1069
1043 pc = lookup_page_cgroup(page); 1070 pc = lookup_page_cgroup(page);
1044 memcg = pc->mem_cgroup; 1071 memcg = pc->mem_cgroup;
1072
1073 /*
1074 * Surreptitiously switch any uncharged page to root:
1075 * an uncharged page off lru does nothing to secure
1076 * its former mem_cgroup from sudden removal.
1077 *
1078 * Our caller holds lru_lock, and PageCgroupUsed is updated
1079 * under page_cgroup lock: between them, they make all uses
1080 * of pc->mem_cgroup safe.
1081 */
1082 if (!PageCgroupUsed(pc) && memcg != root_mem_cgroup)
1083 pc->mem_cgroup = memcg = root_mem_cgroup;
1084
1045 mz = page_cgroup_zoneinfo(memcg, page); 1085 mz = page_cgroup_zoneinfo(memcg, page);
1046 /* compound_order() is stabilized through lru_lock */ 1086 /* compound_order() is stabilized through lru_lock */
1047 MEM_CGROUP_ZSTAT(mz, lru) += 1 << compound_order(page); 1087 mz->lru_size[lru] += 1 << compound_order(page);
1048 return &mz->lruvec; 1088 return &mz->lruvec;
1049} 1089}
1050 1090
@@ -1072,8 +1112,8 @@ void mem_cgroup_lru_del_list(struct page *page, enum lru_list lru)
1072 VM_BUG_ON(!memcg); 1112 VM_BUG_ON(!memcg);
1073 mz = page_cgroup_zoneinfo(memcg, page); 1113 mz = page_cgroup_zoneinfo(memcg, page);
1074 /* huge page split is done under lru_lock. so, we have no races. */ 1114 /* huge page split is done under lru_lock. so, we have no races. */
1075 VM_BUG_ON(MEM_CGROUP_ZSTAT(mz, lru) < (1 << compound_order(page))); 1115 VM_BUG_ON(mz->lru_size[lru] < (1 << compound_order(page)));
1076 MEM_CGROUP_ZSTAT(mz, lru) -= 1 << compound_order(page); 1116 mz->lru_size[lru] -= 1 << compound_order(page);
1077} 1117}
1078 1118
1079void mem_cgroup_lru_del(struct page *page) 1119void mem_cgroup_lru_del(struct page *page)
@@ -1252,40 +1292,48 @@ int mem_cgroup_swappiness(struct mem_cgroup *memcg)
1252 return memcg->swappiness; 1292 return memcg->swappiness;
1253} 1293}
1254 1294
1255static void mem_cgroup_start_move(struct mem_cgroup *memcg) 1295/*
1256{ 1296 * memcg->moving_account is used for checking possibility that some thread is
1257 int cpu; 1297 * calling move_account(). When a thread on CPU-A starts moving pages under
1298 * a memcg, other threads should check memcg->moving_account under
1299 * rcu_read_lock(), like this:
1300 *
1301 * CPU-A CPU-B
1302 * rcu_read_lock()
1303 * memcg->moving_account+1 if (memcg->moving_account)
1304 * take heavy locks.
1305 * synchronize_rcu() update something.
1306 * rcu_read_unlock()
1307 * start move here.
1308 */
1258 1309
1259 get_online_cpus(); 1310/* for quick checking without looking up memcg */
1260 spin_lock(&memcg->pcp_counter_lock); 1311atomic_t memcg_moving __read_mostly;
1261 for_each_online_cpu(cpu)
1262 per_cpu(memcg->stat->count[MEM_CGROUP_ON_MOVE], cpu) += 1;
1263 memcg->nocpu_base.count[MEM_CGROUP_ON_MOVE] += 1;
1264 spin_unlock(&memcg->pcp_counter_lock);
1265 put_online_cpus();
1266 1312
1313static void mem_cgroup_start_move(struct mem_cgroup *memcg)
1314{
1315 atomic_inc(&memcg_moving);
1316 atomic_inc(&memcg->moving_account);
1267 synchronize_rcu(); 1317 synchronize_rcu();
1268} 1318}
1269 1319
1270static void mem_cgroup_end_move(struct mem_cgroup *memcg) 1320static void mem_cgroup_end_move(struct mem_cgroup *memcg)
1271{ 1321{
1272 int cpu; 1322 /*
1273 1323 * Now, mem_cgroup_clear_mc() may call this function with NULL.
1274 if (!memcg) 1324 * We check NULL in callee rather than caller.
1275 return; 1325 */
1276 get_online_cpus(); 1326 if (memcg) {
1277 spin_lock(&memcg->pcp_counter_lock); 1327 atomic_dec(&memcg_moving);
1278 for_each_online_cpu(cpu) 1328 atomic_dec(&memcg->moving_account);
1279 per_cpu(memcg->stat->count[MEM_CGROUP_ON_MOVE], cpu) -= 1; 1329 }
1280 memcg->nocpu_base.count[MEM_CGROUP_ON_MOVE] -= 1;
1281 spin_unlock(&memcg->pcp_counter_lock);
1282 put_online_cpus();
1283} 1330}
1331
1284/* 1332/*
1285 * 2 routines for checking whether "mem" is under move_account() or not. 1333
1286 * 1334 *
1287 * mem_cgroup_stealed() - checking a cgroup is mc.from or not. This is used 1335 * mem_cgroup_stolen() - checking whether a cgroup is mc.from or not. This
1288 * for avoiding race in accounting. If true, 1336 * is used for avoiding races in accounting. If true,
1289 * pc->mem_cgroup may be overwritten. 1337 * pc->mem_cgroup may be overwritten.
1290 * 1338 *
1291 * mem_cgroup_under_move() - checking a cgroup is mc.from or mc.to or 1339 * mem_cgroup_under_move() - checking a cgroup is mc.from or mc.to or
@@ -1293,10 +1341,10 @@ static void mem_cgroup_end_move(struct mem_cgroup *memcg)
1293 * waiting at high memory pressure caused by "move". 1341
1294 */ 1342 */
1295 1343
1296static bool mem_cgroup_stealed(struct mem_cgroup *memcg) 1344static bool mem_cgroup_stolen(struct mem_cgroup *memcg)
1297{ 1345{
1298 VM_BUG_ON(!rcu_read_lock_held()); 1346 VM_BUG_ON(!rcu_read_lock_held());
1299 return this_cpu_read(memcg->stat->count[MEM_CGROUP_ON_MOVE]) > 0; 1347 return atomic_read(&memcg->moving_account) > 0;
1300} 1348}
1301 1349
1302static bool mem_cgroup_under_move(struct mem_cgroup *memcg) 1350static bool mem_cgroup_under_move(struct mem_cgroup *memcg)
@@ -1337,6 +1385,24 @@ static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg)
1337 return false; 1385 return false;
1338} 1386}
1339 1387
1388/*
1389 * Take this lock when
1390 * - code tries to modify a page's memcg while it is USED.
1391 * - code tries to modify page state accounting in a memcg.
1392 * see mem_cgroup_stolen(), too.
1393 */
1394static void move_lock_mem_cgroup(struct mem_cgroup *memcg,
1395 unsigned long *flags)
1396{
1397 spin_lock_irqsave(&memcg->move_lock, *flags);
1398}
1399
1400static void move_unlock_mem_cgroup(struct mem_cgroup *memcg,
1401 unsigned long *flags)
1402{
1403 spin_unlock_irqrestore(&memcg->move_lock, *flags);
1404}
1405
1340/** 1406/**
1341 * mem_cgroup_print_oom_info: Called from OOM with tasklist_lock held in read mode. 1407 * mem_cgroup_print_oom_info: Called from OOM with tasklist_lock held in read mode.
1342 * @memcg: The memory cgroup that went over limit 1408 * @memcg: The memory cgroup that went over limit
@@ -1360,7 +1426,6 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
1360 if (!memcg || !p) 1426 if (!memcg || !p)
1361 return; 1427 return;
1362 1428
1363
1364 rcu_read_lock(); 1429 rcu_read_lock();
1365 1430
1366 mem_cgrp = memcg->css.cgroup; 1431 mem_cgrp = memcg->css.cgroup;
@@ -1739,22 +1804,22 @@ static DEFINE_SPINLOCK(memcg_oom_lock);
1739static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq); 1804static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);
1740 1805
1741struct oom_wait_info { 1806struct oom_wait_info {
1742 struct mem_cgroup *mem; 1807 struct mem_cgroup *memcg;
1743 wait_queue_t wait; 1808 wait_queue_t wait;
1744}; 1809};
1745 1810
1746static int memcg_oom_wake_function(wait_queue_t *wait, 1811static int memcg_oom_wake_function(wait_queue_t *wait,
1747 unsigned mode, int sync, void *arg) 1812 unsigned mode, int sync, void *arg)
1748{ 1813{
1749 struct mem_cgroup *wake_memcg = (struct mem_cgroup *)arg, 1814 struct mem_cgroup *wake_memcg = (struct mem_cgroup *)arg;
1750 *oom_wait_memcg; 1815 struct mem_cgroup *oom_wait_memcg;
1751 struct oom_wait_info *oom_wait_info; 1816 struct oom_wait_info *oom_wait_info;
1752 1817
1753 oom_wait_info = container_of(wait, struct oom_wait_info, wait); 1818 oom_wait_info = container_of(wait, struct oom_wait_info, wait);
1754 oom_wait_memcg = oom_wait_info->mem; 1819 oom_wait_memcg = oom_wait_info->memcg;
1755 1820
1756 /* 1821 /*
1757 * Both of oom_wait_info->mem and wake_mem are stable under us. 1822 * Both of oom_wait_info->memcg and wake_memcg are stable under us.
1758 * Then we can use css_is_ancestor without taking care of RCU. 1823 * Then we can use css_is_ancestor without taking care of RCU.
1759 */ 1824 */
1760 if (!mem_cgroup_same_or_subtree(oom_wait_memcg, wake_memcg) 1825 if (!mem_cgroup_same_or_subtree(oom_wait_memcg, wake_memcg)
@@ -1778,12 +1843,12 @@ static void memcg_oom_recover(struct mem_cgroup *memcg)
1778/* 1843/*
1779 * try to call OOM killer. returns false if we should exit memory-reclaim loop. 1844 * try to call OOM killer. returns false if we should exit memory-reclaim loop.
1780 */ 1845 */
1781bool mem_cgroup_handle_oom(struct mem_cgroup *memcg, gfp_t mask) 1846bool mem_cgroup_handle_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
1782{ 1847{
1783 struct oom_wait_info owait; 1848 struct oom_wait_info owait;
1784 bool locked, need_to_kill; 1849 bool locked, need_to_kill;
1785 1850
1786 owait.mem = memcg; 1851 owait.memcg = memcg;
1787 owait.wait.flags = 0; 1852 owait.wait.flags = 0;
1788 owait.wait.func = memcg_oom_wake_function; 1853 owait.wait.func = memcg_oom_wake_function;
1789 owait.wait.private = current; 1854 owait.wait.private = current;
@@ -1808,7 +1873,7 @@ bool mem_cgroup_handle_oom(struct mem_cgroup *memcg, gfp_t mask)
1808 1873
1809 if (need_to_kill) { 1874 if (need_to_kill) {
1810 finish_wait(&memcg_oom_waitq, &owait.wait); 1875 finish_wait(&memcg_oom_waitq, &owait.wait);
1811 mem_cgroup_out_of_memory(memcg, mask); 1876 mem_cgroup_out_of_memory(memcg, mask, order);
1812 } else { 1877 } else {
1813 schedule(); 1878 schedule();
1814 finish_wait(&memcg_oom_waitq, &owait.wait); 1879 finish_wait(&memcg_oom_waitq, &owait.wait);
@@ -1848,41 +1913,66 @@ bool mem_cgroup_handle_oom(struct mem_cgroup *memcg, gfp_t mask)
1848 * by flags. 1913 * by flags.
1849 * 1914 *
1850 * Considering "move", this is the only case where we see a race. To make the race 1915
1851 * small, we check MEM_CGROUP_ON_MOVE percpu value and detect there are 1916 small, we check memcg->moving_account and detect the possibility of a race.
1852 * possibility of race condition. If there is, we take a lock. 1917 * If there is, we take a lock.
1853 */ 1918 */
1854 1919
1920void __mem_cgroup_begin_update_page_stat(struct page *page,
1921 bool *locked, unsigned long *flags)
1922{
1923 struct mem_cgroup *memcg;
1924 struct page_cgroup *pc;
1925
1926 pc = lookup_page_cgroup(page);
1927again:
1928 memcg = pc->mem_cgroup;
1929 if (unlikely(!memcg || !PageCgroupUsed(pc)))
1930 return;
1931 /*
1932 * If this memory cgroup is not under account moving, we don't
1933 * need to take move_lock_mem_cgroup(). Because we already hold
1934 * rcu_read_lock(), any calls to move_account will be delayed until
1935 * rcu_read_unlock() if mem_cgroup_stolen() == true.
1936 */
1937 if (!mem_cgroup_stolen(memcg))
1938 return;
1939
1940 move_lock_mem_cgroup(memcg, flags);
1941 if (memcg != pc->mem_cgroup || !PageCgroupUsed(pc)) {
1942 move_unlock_mem_cgroup(memcg, flags);
1943 goto again;
1944 }
1945 *locked = true;
1946}
1947
1948void __mem_cgroup_end_update_page_stat(struct page *page, unsigned long *flags)
1949{
1950 struct page_cgroup *pc = lookup_page_cgroup(page);
1951
1952 /*
1953 * It's guaranteed that pc->mem_cgroup never changes while
1954 * lock is held because a routine modifies pc->mem_cgroup
1955 * should take move_lock_page_cgroup().
1956 */
1957 move_unlock_mem_cgroup(pc->mem_cgroup, flags);
1958}
1959
1855void mem_cgroup_update_page_stat(struct page *page, 1960void mem_cgroup_update_page_stat(struct page *page,
1856 enum mem_cgroup_page_stat_item idx, int val) 1961 enum mem_cgroup_page_stat_item idx, int val)
1857{ 1962{
1858 struct mem_cgroup *memcg; 1963 struct mem_cgroup *memcg;
1859 struct page_cgroup *pc = lookup_page_cgroup(page); 1964 struct page_cgroup *pc = lookup_page_cgroup(page);
1860 bool need_unlock = false;
1861 unsigned long uninitialized_var(flags); 1965 unsigned long uninitialized_var(flags);
1862 1966
1863 if (mem_cgroup_disabled()) 1967 if (mem_cgroup_disabled())
1864 return; 1968 return;
1865 1969
1866 rcu_read_lock();
1867 memcg = pc->mem_cgroup; 1970 memcg = pc->mem_cgroup;
1868 if (unlikely(!memcg || !PageCgroupUsed(pc))) 1971 if (unlikely(!memcg || !PageCgroupUsed(pc)))
1869 goto out; 1972 return;
1870 /* pc->mem_cgroup is unstable ? */
1871 if (unlikely(mem_cgroup_stealed(memcg)) || PageTransHuge(page)) {
1872 /* take a lock against to access pc->mem_cgroup */
1873 move_lock_page_cgroup(pc, &flags);
1874 need_unlock = true;
1875 memcg = pc->mem_cgroup;
1876 if (!memcg || !PageCgroupUsed(pc))
1877 goto out;
1878 }
1879 1973
1880 switch (idx) { 1974 switch (idx) {
1881 case MEMCG_NR_FILE_MAPPED: 1975 case MEMCG_NR_FILE_MAPPED:
1882 if (val > 0)
1883 SetPageCgroupFileMapped(pc);
1884 else if (!page_mapped(page))
1885 ClearPageCgroupFileMapped(pc);
1886 idx = MEM_CGROUP_STAT_FILE_MAPPED; 1976 idx = MEM_CGROUP_STAT_FILE_MAPPED;
1887 break; 1977 break;
1888 default: 1978 default:
@@ -1890,14 +1980,7 @@ void mem_cgroup_update_page_stat(struct page *page,
1890 } 1980 }
1891 1981
1892 this_cpu_add(memcg->stat->count[idx], val); 1982 this_cpu_add(memcg->stat->count[idx], val);
1893
1894out:
1895 if (unlikely(need_unlock))
1896 move_unlock_page_cgroup(pc, &flags);
1897 rcu_read_unlock();
1898 return;
1899} 1983}
1900EXPORT_SYMBOL(mem_cgroup_update_page_stat);
1901 1984
1902/* 1985/*
1903 * size of first charge trial. "32" comes from vmscan.c's magic value. 1986 * size of first charge trial. "32" comes from vmscan.c's magic value.
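For context, a hedged sketch of how a caller is expected to use the new page-stat hooks. The rcu_read_lock()-taking wrappers mem_cgroup_begin_update_page_stat()/mem_cgroup_end_update_page_stat() and mem_cgroup_inc_page_stat() live in include/linux/memcontrol.h rather than in this hunk, so treat the names as assumptions drawn from the rest of this series, illustrating the locking protocol only:

/* e.g. a file-rmap style update of MEMCG_NR_FILE_MAPPED */
static void example_account_file_mapped(struct page *page)
{
	bool locked;
	unsigned long flags;

	/* Takes rcu_read_lock(); falls back to move_lock_mem_cgroup()
	 * only while a charge move is pending (mem_cgroup_stolen()). */
	mem_cgroup_begin_update_page_stat(page, &locked, &flags);

	if (atomic_inc_and_test(&page->_mapcount)) {
		__inc_zone_page_state(page, NR_FILE_MAPPED);
		mem_cgroup_inc_page_stat(page, MEMCG_NR_FILE_MAPPED);
	}

	mem_cgroup_end_update_page_stat(page, &locked, &flags);
}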
@@ -2068,17 +2151,6 @@ static void mem_cgroup_drain_pcp_counter(struct mem_cgroup *memcg, int cpu)
2068 per_cpu(memcg->stat->events[i], cpu) = 0; 2151 per_cpu(memcg->stat->events[i], cpu) = 0;
2069 memcg->nocpu_base.events[i] += x; 2152 memcg->nocpu_base.events[i] += x;
2070 } 2153 }
2071 /* need to clear ON_MOVE value, works as a kind of lock. */
2072 per_cpu(memcg->stat->count[MEM_CGROUP_ON_MOVE], cpu) = 0;
2073 spin_unlock(&memcg->pcp_counter_lock);
2074}
2075
2076static void synchronize_mem_cgroup_on_move(struct mem_cgroup *memcg, int cpu)
2077{
2078 int idx = MEM_CGROUP_ON_MOVE;
2079
2080 spin_lock(&memcg->pcp_counter_lock);
2081 per_cpu(memcg->stat->count[idx], cpu) = memcg->nocpu_base.count[idx];
2082 spin_unlock(&memcg->pcp_counter_lock); 2154 spin_unlock(&memcg->pcp_counter_lock);
2083} 2155}
2084 2156
@@ -2090,11 +2162,8 @@ static int __cpuinit memcg_cpu_hotplug_callback(struct notifier_block *nb,
2090 struct memcg_stock_pcp *stock; 2162 struct memcg_stock_pcp *stock;
2091 struct mem_cgroup *iter; 2163 struct mem_cgroup *iter;
2092 2164
2093 if ((action == CPU_ONLINE)) { 2165 if (action == CPU_ONLINE)
2094 for_each_mem_cgroup(iter)
2095 synchronize_mem_cgroup_on_move(iter, cpu);
2096 return NOTIFY_OK; 2166 return NOTIFY_OK;
2097 }
2098 2167
2099 if ((action != CPU_DEAD) || action != CPU_DEAD_FROZEN) 2168 if ((action != CPU_DEAD) || action != CPU_DEAD_FROZEN)
2100 return NOTIFY_OK; 2169 return NOTIFY_OK;
@@ -2179,7 +2248,7 @@ static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
2179 if (!oom_check) 2248 if (!oom_check)
2180 return CHARGE_NOMEM; 2249 return CHARGE_NOMEM;
2181 /* check OOM */ 2250 /* check OOM */
2182 if (!mem_cgroup_handle_oom(mem_over_limit, gfp_mask)) 2251 if (!mem_cgroup_handle_oom(mem_over_limit, gfp_mask, get_order(csize)))
2183 return CHARGE_OOM_DIE; 2252 return CHARGE_OOM_DIE;
2184 2253
2185 return CHARGE_RETRY; 2254 return CHARGE_RETRY;
@@ -2408,8 +2477,13 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
2408 struct page *page, 2477 struct page *page,
2409 unsigned int nr_pages, 2478 unsigned int nr_pages,
2410 struct page_cgroup *pc, 2479 struct page_cgroup *pc,
2411 enum charge_type ctype) 2480 enum charge_type ctype,
2481 bool lrucare)
2412{ 2482{
2483 struct zone *uninitialized_var(zone);
2484 bool was_on_lru = false;
2485 bool anon;
2486
2413 lock_page_cgroup(pc); 2487 lock_page_cgroup(pc);
2414 if (unlikely(PageCgroupUsed(pc))) { 2488 if (unlikely(PageCgroupUsed(pc))) {
2415 unlock_page_cgroup(pc); 2489 unlock_page_cgroup(pc);
@@ -2420,6 +2494,21 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
2420 * we don't need page_cgroup_lock about tail pages, because they are not 2494
2421 * accessed by any other context at this point. 2495 * accessed by any other context at this point.
2422 */ 2496 */
2497
2498 /*
2499 * In some cases (SwapCache and FUSE's splice_buf->radixtree), the page
2500 * may already be on some other mem_cgroup's LRU. Take care of it.
2501 */
2502 if (lrucare) {
2503 zone = page_zone(page);
2504 spin_lock_irq(&zone->lru_lock);
2505 if (PageLRU(page)) {
2506 ClearPageLRU(page);
2507 del_page_from_lru_list(zone, page, page_lru(page));
2508 was_on_lru = true;
2509 }
2510 }
2511
2423 pc->mem_cgroup = memcg; 2512 pc->mem_cgroup = memcg;
2424 /* 2513 /*
2425 * We access a page_cgroup asynchronously without lock_page_cgroup(). 2514 * We access a page_cgroup asynchronously without lock_page_cgroup().
@@ -2429,23 +2518,25 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
2429 * See mem_cgroup_add_lru_list(), etc. 2518 * See mem_cgroup_add_lru_list(), etc.
2430 */ 2519 */
2431 smp_wmb(); 2520 smp_wmb();
2432 switch (ctype) { 2521 SetPageCgroupUsed(pc);
2433 case MEM_CGROUP_CHARGE_TYPE_CACHE: 2522
2434 case MEM_CGROUP_CHARGE_TYPE_SHMEM: 2523 if (lrucare) {
2435 SetPageCgroupCache(pc); 2524 if (was_on_lru) {
2436 SetPageCgroupUsed(pc); 2525 VM_BUG_ON(PageLRU(page));
2437 break; 2526 SetPageLRU(page);
2438 case MEM_CGROUP_CHARGE_TYPE_MAPPED: 2527 add_page_to_lru_list(zone, page, page_lru(page));
2439 ClearPageCgroupCache(pc); 2528 }
2440 SetPageCgroupUsed(pc); 2529 spin_unlock_irq(&zone->lru_lock);
2441 break;
2442 default:
2443 break;
2444 } 2530 }
2445 2531
2446 mem_cgroup_charge_statistics(memcg, PageCgroupCache(pc), nr_pages); 2532 if (ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED)
2533 anon = true;
2534 else
2535 anon = false;
2536
2537 mem_cgroup_charge_statistics(memcg, anon, nr_pages);
2447 unlock_page_cgroup(pc); 2538 unlock_page_cgroup(pc);
2448 WARN_ON_ONCE(PageLRU(page)); 2539
2449 /* 2540 /*
2450 * "charge_statistics" updated event counter. Then, check it. 2541 * "charge_statistics" updated event counter. Then, check it.
2451 * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree. 2542 * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree.
@@ -2456,8 +2547,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
2456 2547
2457#ifdef CONFIG_TRANSPARENT_HUGEPAGE 2548#ifdef CONFIG_TRANSPARENT_HUGEPAGE
2458 2549
2459#define PCGF_NOCOPY_AT_SPLIT ((1 << PCG_LOCK) | (1 << PCG_MOVE_LOCK) |\ 2550#define PCGF_NOCOPY_AT_SPLIT ((1 << PCG_LOCK) | (1 << PCG_MIGRATION))
2460 (1 << PCG_MIGRATION))
2461/* 2551/*
2462 * Because tail pages are not marked as "used", set it. We're under 2552 * Because tail pages are not marked as "used", set it. We're under
2463 * zone->lru_lock, 'splitting on pmd' and compound_lock. 2553 * zone->lru_lock, 'splitting on pmd' and compound_lock.
@@ -2508,6 +2598,7 @@ static int mem_cgroup_move_account(struct page *page,
2508{ 2598{
2509 unsigned long flags; 2599 unsigned long flags;
2510 int ret; 2600 int ret;
2601 bool anon = PageAnon(page);
2511 2602
2512 VM_BUG_ON(from == to); 2603 VM_BUG_ON(from == to);
2513 VM_BUG_ON(PageLRU(page)); 2604 VM_BUG_ON(PageLRU(page));
@@ -2527,23 +2618,23 @@ static int mem_cgroup_move_account(struct page *page,
2527 if (!PageCgroupUsed(pc) || pc->mem_cgroup != from) 2618 if (!PageCgroupUsed(pc) || pc->mem_cgroup != from)
2528 goto unlock; 2619 goto unlock;
2529 2620
2530 move_lock_page_cgroup(pc, &flags); 2621 move_lock_mem_cgroup(from, &flags);
2531 2622
2532 if (PageCgroupFileMapped(pc)) { 2623 if (!anon && page_mapped(page)) {
2533 /* Update mapped_file data for mem_cgroup */ 2624 /* Update mapped_file data for mem_cgroup */
2534 preempt_disable(); 2625 preempt_disable();
2535 __this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]); 2626 __this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
2536 __this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]); 2627 __this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
2537 preempt_enable(); 2628 preempt_enable();
2538 } 2629 }
2539 mem_cgroup_charge_statistics(from, PageCgroupCache(pc), -nr_pages); 2630 mem_cgroup_charge_statistics(from, anon, -nr_pages);
2540 if (uncharge) 2631 if (uncharge)
2541 /* This is not "cancel", but cancel_charge does all we need. */ 2632 /* This is not "cancel", but cancel_charge does all we need. */
2542 __mem_cgroup_cancel_charge(from, nr_pages); 2633 __mem_cgroup_cancel_charge(from, nr_pages);
2543 2634
2544 /* caller should have done css_get */ 2635 /* caller should have done css_get */
2545 pc->mem_cgroup = to; 2636 pc->mem_cgroup = to;
2546 mem_cgroup_charge_statistics(to, PageCgroupCache(pc), nr_pages); 2637 mem_cgroup_charge_statistics(to, anon, nr_pages);
2547 /* 2638 /*
2548 * We charge against "to", which may not have any tasks. Then, "to" 2639
2549 * can be under rmdir(). But in the current implementation, the caller of 2640
@@ -2551,7 +2642,7 @@ static int mem_cgroup_move_account(struct page *page,
2551 * guaranteed that "to" is never removed. So, we don't check rmdir 2642 * guaranteed that "to" is never removed. So, we don't check rmdir
2552 * status here. 2643 * status here.
2553 */ 2644 */
2554 move_unlock_page_cgroup(pc, &flags); 2645 move_unlock_mem_cgroup(from, &flags);
2555 ret = 0; 2646 ret = 0;
2556unlock: 2647unlock:
2557 unlock_page_cgroup(pc); 2648 unlock_page_cgroup(pc);
@@ -2643,7 +2734,7 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
2643 ret = __mem_cgroup_try_charge(mm, gfp_mask, nr_pages, &memcg, oom); 2734 ret = __mem_cgroup_try_charge(mm, gfp_mask, nr_pages, &memcg, oom);
2644 if (ret == -ENOMEM) 2735 if (ret == -ENOMEM)
2645 return ret; 2736 return ret;
2646 __mem_cgroup_commit_charge(memcg, page, nr_pages, pc, ctype); 2737 __mem_cgroup_commit_charge(memcg, page, nr_pages, pc, ctype, false);
2647 return 0; 2738 return 0;
2648} 2739}
2649 2740
@@ -2663,35 +2754,6 @@ static void
2663__mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr, 2754__mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr,
2664 enum charge_type ctype); 2755 enum charge_type ctype);
2665 2756
2666static void
2667__mem_cgroup_commit_charge_lrucare(struct page *page, struct mem_cgroup *memcg,
2668 enum charge_type ctype)
2669{
2670 struct page_cgroup *pc = lookup_page_cgroup(page);
2671 struct zone *zone = page_zone(page);
2672 unsigned long flags;
2673 bool removed = false;
2674
2675 /*
2676 * In some case, SwapCache, FUSE(splice_buf->radixtree), the page
2677 * is already on LRU. It means the page may on some other page_cgroup's
2678 * LRU. Take care of it.
2679 */
2680 spin_lock_irqsave(&zone->lru_lock, flags);
2681 if (PageLRU(page)) {
2682 del_page_from_lru_list(zone, page, page_lru(page));
2683 ClearPageLRU(page);
2684 removed = true;
2685 }
2686 __mem_cgroup_commit_charge(memcg, page, 1, pc, ctype);
2687 if (removed) {
2688 add_page_to_lru_list(zone, page, page_lru(page));
2689 SetPageLRU(page);
2690 }
2691 spin_unlock_irqrestore(&zone->lru_lock, flags);
2692 return;
2693}
2694
2695int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, 2757int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
2696 gfp_t gfp_mask) 2758 gfp_t gfp_mask)
2697{ 2759{
@@ -2769,13 +2831,16 @@ static void
2769__mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *memcg, 2831__mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *memcg,
2770 enum charge_type ctype) 2832 enum charge_type ctype)
2771{ 2833{
2834 struct page_cgroup *pc;
2835
2772 if (mem_cgroup_disabled()) 2836 if (mem_cgroup_disabled())
2773 return; 2837 return;
2774 if (!memcg) 2838 if (!memcg)
2775 return; 2839 return;
2776 cgroup_exclude_rmdir(&memcg->css); 2840 cgroup_exclude_rmdir(&memcg->css);
2777 2841
2778 __mem_cgroup_commit_charge_lrucare(page, memcg, ctype); 2842 pc = lookup_page_cgroup(page);
2843 __mem_cgroup_commit_charge(memcg, page, 1, pc, ctype, true);
2779 /* 2844 /*
2780 * Now swap is on-memory. This means this page may be 2845 * Now swap is on-memory. This means this page may be
2781 * counted both as mem and swap....double count. 2846 * counted both as mem and swap....double count.
@@ -2879,7 +2944,6 @@ direct_uncharge:
2879 res_counter_uncharge(&memcg->memsw, nr_pages * PAGE_SIZE); 2944 res_counter_uncharge(&memcg->memsw, nr_pages * PAGE_SIZE);
2880 if (unlikely(batch->memcg != memcg)) 2945 if (unlikely(batch->memcg != memcg))
2881 memcg_oom_recover(memcg); 2946 memcg_oom_recover(memcg);
2882 return;
2883} 2947}
2884 2948
2885/* 2949/*
@@ -2891,6 +2955,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
2891 struct mem_cgroup *memcg = NULL; 2955 struct mem_cgroup *memcg = NULL;
2892 unsigned int nr_pages = 1; 2956 unsigned int nr_pages = 1;
2893 struct page_cgroup *pc; 2957 struct page_cgroup *pc;
2958 bool anon;
2894 2959
2895 if (mem_cgroup_disabled()) 2960 if (mem_cgroup_disabled())
2896 return NULL; 2961 return NULL;
@@ -2916,8 +2981,17 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
2916 if (!PageCgroupUsed(pc)) 2981 if (!PageCgroupUsed(pc))
2917 goto unlock_out; 2982 goto unlock_out;
2918 2983
2984 anon = PageAnon(page);
2985
2919 switch (ctype) { 2986 switch (ctype) {
2920 case MEM_CGROUP_CHARGE_TYPE_MAPPED: 2987 case MEM_CGROUP_CHARGE_TYPE_MAPPED:
2988 /*
2989 * Generally PageAnon tells if it's the anon statistics to be
2990 * updated; but sometimes e.g. mem_cgroup_uncharge_page() is
2991 * used before the page has reached the stage of being marked PageAnon.
2992 */
2993 anon = true;
2994 /* fallthrough */
2921 case MEM_CGROUP_CHARGE_TYPE_DROP: 2995 case MEM_CGROUP_CHARGE_TYPE_DROP:
2922 /* See mem_cgroup_prepare_migration() */ 2996 /* See mem_cgroup_prepare_migration() */
2923 if (page_mapped(page) || PageCgroupMigration(pc)) 2997 if (page_mapped(page) || PageCgroupMigration(pc))
@@ -2934,7 +3008,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
2934 break; 3008 break;
2935 } 3009 }
2936 3010
2937 mem_cgroup_charge_statistics(memcg, PageCgroupCache(pc), -nr_pages); 3011 mem_cgroup_charge_statistics(memcg, anon, -nr_pages);
2938 3012
2939 ClearPageCgroupUsed(pc); 3013 ClearPageCgroupUsed(pc);
2940 /* 3014 /*
@@ -3027,23 +3101,6 @@ void mem_cgroup_uncharge_end(void)
3027 batch->memcg = NULL; 3101 batch->memcg = NULL;
3028} 3102}
3029 3103
3030/*
3031 * A function for resetting pc->mem_cgroup for newly allocated pages.
3032 * This function should be called if the newpage will be added to LRU
3033 * before start accounting.
3034 */
3035void mem_cgroup_reset_owner(struct page *newpage)
3036{
3037 struct page_cgroup *pc;
3038
3039 if (mem_cgroup_disabled())
3040 return;
3041
3042 pc = lookup_page_cgroup(newpage);
3043 VM_BUG_ON(PageCgroupUsed(pc));
3044 pc->mem_cgroup = root_mem_cgroup;
3045}
3046
3047#ifdef CONFIG_SWAP 3104#ifdef CONFIG_SWAP
3048/* 3105/*
3049 * called after __delete_from_swap_cache() and drop "page" account. 3106 * called after __delete_from_swap_cache() and drop "page" account.
@@ -3248,7 +3305,7 @@ int mem_cgroup_prepare_migration(struct page *page,
3248 ctype = MEM_CGROUP_CHARGE_TYPE_CACHE; 3305 ctype = MEM_CGROUP_CHARGE_TYPE_CACHE;
3249 else 3306 else
3250 ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM; 3307 ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM;
3251 __mem_cgroup_commit_charge(memcg, newpage, 1, pc, ctype); 3308 __mem_cgroup_commit_charge(memcg, newpage, 1, pc, ctype, false);
3252 return ret; 3309 return ret;
3253} 3310}
3254 3311
@@ -3258,6 +3315,7 @@ void mem_cgroup_end_migration(struct mem_cgroup *memcg,
3258{ 3315{
3259 struct page *used, *unused; 3316 struct page *used, *unused;
3260 struct page_cgroup *pc; 3317 struct page_cgroup *pc;
3318 bool anon;
3261 3319
3262 if (!memcg) 3320 if (!memcg)
3263 return; 3321 return;
@@ -3279,8 +3337,10 @@ void mem_cgroup_end_migration(struct mem_cgroup *memcg,
3279 lock_page_cgroup(pc); 3337 lock_page_cgroup(pc);
3280 ClearPageCgroupMigration(pc); 3338 ClearPageCgroupMigration(pc);
3281 unlock_page_cgroup(pc); 3339 unlock_page_cgroup(pc);
3282 3340 anon = PageAnon(used);
3283 __mem_cgroup_uncharge_common(unused, MEM_CGROUP_CHARGE_TYPE_FORCE); 3341 __mem_cgroup_uncharge_common(unused,
3342 anon ? MEM_CGROUP_CHARGE_TYPE_MAPPED
3343 : MEM_CGROUP_CHARGE_TYPE_CACHE);
3284 3344
3285 /* 3345 /*
3286 * If a page is a file cache, radix-tree replacement is very atomic 3346 * If a page is a file cache, radix-tree replacement is very atomic
@@ -3290,7 +3350,7 @@ void mem_cgroup_end_migration(struct mem_cgroup *memcg,
3290 * and USED bit check in mem_cgroup_uncharge_page() will do enough 3350 * and USED bit check in mem_cgroup_uncharge_page() will do enough
3291 * check. (see prepare_charge() also) 3351 * check. (see prepare_charge() also)
3292 */ 3352 */
3293 if (PageAnon(used)) 3353 if (anon)
3294 mem_cgroup_uncharge_page(used); 3354 mem_cgroup_uncharge_page(used);
3295 /* 3355 /*
3296 * At migration, we may charge account against cgroup which has no 3356 * At migration, we may charge account against cgroup which has no
@@ -3320,7 +3380,7 @@ void mem_cgroup_replace_page_cache(struct page *oldpage,
3320 /* fix accounting on old pages */ 3380 /* fix accounting on old pages */
3321 lock_page_cgroup(pc); 3381 lock_page_cgroup(pc);
3322 memcg = pc->mem_cgroup; 3382 memcg = pc->mem_cgroup;
3323 mem_cgroup_charge_statistics(memcg, PageCgroupCache(pc), -1); 3383 mem_cgroup_charge_statistics(memcg, false, -1);
3324 ClearPageCgroupUsed(pc); 3384 ClearPageCgroupUsed(pc);
3325 unlock_page_cgroup(pc); 3385 unlock_page_cgroup(pc);
3326 3386
@@ -3332,7 +3392,7 @@ void mem_cgroup_replace_page_cache(struct page *oldpage,
3332 * the newpage may be on LRU(or pagevec for LRU) already. We lock 3392 * the newpage may be on LRU(or pagevec for LRU) already. We lock
3333 * LRU while we overwrite pc->mem_cgroup. 3393 * LRU while we overwrite pc->mem_cgroup.
3334 */ 3394 */
3335 __mem_cgroup_commit_charge_lrucare(newpage, memcg, type); 3395 __mem_cgroup_commit_charge(memcg, newpage, 1, pc, type, true);
3336} 3396}
3337 3397
3338#ifdef CONFIG_DEBUG_VM 3398#ifdef CONFIG_DEBUG_VM
@@ -3531,7 +3591,7 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
3531 break; 3591 break;
3532 3592
3533 nr_scanned = 0; 3593 nr_scanned = 0;
3534 reclaimed = mem_cgroup_soft_reclaim(mz->mem, zone, 3594 reclaimed = mem_cgroup_soft_reclaim(mz->memcg, zone,
3535 gfp_mask, &nr_scanned); 3595 gfp_mask, &nr_scanned);
3536 nr_reclaimed += reclaimed; 3596 nr_reclaimed += reclaimed;
3537 *total_scanned += nr_scanned; 3597 *total_scanned += nr_scanned;
@@ -3558,13 +3618,13 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
3558 next_mz = 3618 next_mz =
3559 __mem_cgroup_largest_soft_limit_node(mctz); 3619 __mem_cgroup_largest_soft_limit_node(mctz);
3560 if (next_mz == mz) 3620 if (next_mz == mz)
3561 css_put(&next_mz->mem->css); 3621 css_put(&next_mz->memcg->css);
3562 else /* next_mz == NULL or other memcg */ 3622 else /* next_mz == NULL or other memcg */
3563 break; 3623 break;
3564 } while (1); 3624 } while (1);
3565 } 3625 }
3566 __mem_cgroup_remove_exceeded(mz->mem, mz, mctz); 3626 __mem_cgroup_remove_exceeded(mz->memcg, mz, mctz);
3567 excess = res_counter_soft_limit_excess(&mz->mem->res); 3627 excess = res_counter_soft_limit_excess(&mz->memcg->res);
3568 /* 3628 /*
3569 * One school of thought says that we should not add 3629 * One school of thought says that we should not add
3570 * back the node to the tree if reclaim returns 0. 3630 * back the node to the tree if reclaim returns 0.
@@ -3574,9 +3634,9 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
3574 * term TODO. 3634 * term TODO.
3575 */ 3635 */
3576 /* If excess == 0, no tree ops */ 3636 /* If excess == 0, no tree ops */
3577 __mem_cgroup_insert_exceeded(mz->mem, mz, mctz, excess); 3637 __mem_cgroup_insert_exceeded(mz->memcg, mz, mctz, excess);
3578 spin_unlock(&mctz->lock); 3638 spin_unlock(&mctz->lock);
3579 css_put(&mz->mem->css); 3639 css_put(&mz->memcg->css);
3580 loop++; 3640 loop++;
3581 /* 3641 /*
3582 * Could not reclaim anything and there are no more 3642 * Could not reclaim anything and there are no more
@@ -3589,7 +3649,7 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
3589 break; 3649 break;
3590 } while (!nr_reclaimed); 3650 } while (!nr_reclaimed);
3591 if (next_mz) 3651 if (next_mz)
3592 css_put(&next_mz->mem->css); 3652 css_put(&next_mz->memcg->css);
3593 return nr_reclaimed; 3653 return nr_reclaimed;
3594} 3654}
3595 3655
@@ -3611,7 +3671,7 @@ static int mem_cgroup_force_empty_list(struct mem_cgroup *memcg,
3611 mz = mem_cgroup_zoneinfo(memcg, node, zid); 3671 mz = mem_cgroup_zoneinfo(memcg, node, zid);
3612 list = &mz->lruvec.lists[lru]; 3672 list = &mz->lruvec.lists[lru];
3613 3673
3614 loop = MEM_CGROUP_ZSTAT(mz, lru); 3674 loop = mz->lru_size[lru];
3615 /* give some margin against EBUSY etc...*/ 3675 /* give some margin against EBUSY etc...*/
3616 loop += 256; 3676 loop += 256;
3617 busy = NULL; 3677 busy = NULL;
@@ -3685,10 +3745,10 @@ move_account:
3685 mem_cgroup_start_move(memcg); 3745 mem_cgroup_start_move(memcg);
3686 for_each_node_state(node, N_HIGH_MEMORY) { 3746 for_each_node_state(node, N_HIGH_MEMORY) {
3687 for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) { 3747 for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) {
3688 enum lru_list l; 3748 enum lru_list lru;
3689 for_each_lru(l) { 3749 for_each_lru(lru) {
3690 ret = mem_cgroup_force_empty_list(memcg, 3750 ret = mem_cgroup_force_empty_list(memcg,
3691 node, zid, l); 3751 node, zid, lru);
3692 if (ret) 3752 if (ret)
3693 break; 3753 break;
3694 } 3754 }
@@ -3842,7 +3902,6 @@ static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft)
3842 break; 3902 break;
3843 default: 3903 default:
3844 BUG(); 3904 BUG();
3845 break;
3846 } 3905 }
3847 return val; 3906 return val;
3848} 3907}
@@ -3921,7 +3980,6 @@ static void memcg_get_hierarchical_limit(struct mem_cgroup *memcg,
3921out: 3980out:
3922 *mem_limit = min_limit; 3981 *mem_limit = min_limit;
3923 *memsw_limit = min_memsw_limit; 3982 *memsw_limit = min_memsw_limit;
3924 return;
3925} 3983}
3926 3984
3927static int mem_cgroup_reset(struct cgroup *cont, unsigned int event) 3985static int mem_cgroup_reset(struct cgroup *cont, unsigned int event)
@@ -4080,38 +4138,38 @@ static int mem_control_numa_stat_show(struct seq_file *m, void *arg)
4080 unsigned long total_nr, file_nr, anon_nr, unevictable_nr; 4138 unsigned long total_nr, file_nr, anon_nr, unevictable_nr;
4081 unsigned long node_nr; 4139 unsigned long node_nr;
4082 struct cgroup *cont = m->private; 4140 struct cgroup *cont = m->private;
4083 struct mem_cgroup *mem_cont = mem_cgroup_from_cont(cont); 4141 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
4084 4142
4085 total_nr = mem_cgroup_nr_lru_pages(mem_cont, LRU_ALL); 4143 total_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL);
4086 seq_printf(m, "total=%lu", total_nr); 4144 seq_printf(m, "total=%lu", total_nr);
4087 for_each_node_state(nid, N_HIGH_MEMORY) { 4145 for_each_node_state(nid, N_HIGH_MEMORY) {
4088 node_nr = mem_cgroup_node_nr_lru_pages(mem_cont, nid, LRU_ALL); 4146 node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL);
4089 seq_printf(m, " N%d=%lu", nid, node_nr); 4147 seq_printf(m, " N%d=%lu", nid, node_nr);
4090 } 4148 }
4091 seq_putc(m, '\n'); 4149 seq_putc(m, '\n');
4092 4150
4093 file_nr = mem_cgroup_nr_lru_pages(mem_cont, LRU_ALL_FILE); 4151 file_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL_FILE);
4094 seq_printf(m, "file=%lu", file_nr); 4152 seq_printf(m, "file=%lu", file_nr);
4095 for_each_node_state(nid, N_HIGH_MEMORY) { 4153 for_each_node_state(nid, N_HIGH_MEMORY) {
4096 node_nr = mem_cgroup_node_nr_lru_pages(mem_cont, nid, 4154 node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid,
4097 LRU_ALL_FILE); 4155 LRU_ALL_FILE);
4098 seq_printf(m, " N%d=%lu", nid, node_nr); 4156 seq_printf(m, " N%d=%lu", nid, node_nr);
4099 } 4157 }
4100 seq_putc(m, '\n'); 4158 seq_putc(m, '\n');
4101 4159
4102 anon_nr = mem_cgroup_nr_lru_pages(mem_cont, LRU_ALL_ANON); 4160 anon_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL_ANON);
4103 seq_printf(m, "anon=%lu", anon_nr); 4161 seq_printf(m, "anon=%lu", anon_nr);
4104 for_each_node_state(nid, N_HIGH_MEMORY) { 4162 for_each_node_state(nid, N_HIGH_MEMORY) {
4105 node_nr = mem_cgroup_node_nr_lru_pages(mem_cont, nid, 4163 node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid,
4106 LRU_ALL_ANON); 4164 LRU_ALL_ANON);
4107 seq_printf(m, " N%d=%lu", nid, node_nr); 4165 seq_printf(m, " N%d=%lu", nid, node_nr);
4108 } 4166 }
4109 seq_putc(m, '\n'); 4167 seq_putc(m, '\n');
4110 4168
4111 unevictable_nr = mem_cgroup_nr_lru_pages(mem_cont, BIT(LRU_UNEVICTABLE)); 4169 unevictable_nr = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_UNEVICTABLE));
4112 seq_printf(m, "unevictable=%lu", unevictable_nr); 4170 seq_printf(m, "unevictable=%lu", unevictable_nr);
4113 for_each_node_state(nid, N_HIGH_MEMORY) { 4171 for_each_node_state(nid, N_HIGH_MEMORY) {
4114 node_nr = mem_cgroup_node_nr_lru_pages(mem_cont, nid, 4172 node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid,
4115 BIT(LRU_UNEVICTABLE)); 4173 BIT(LRU_UNEVICTABLE));
4116 seq_printf(m, " N%d=%lu", nid, node_nr); 4174 seq_printf(m, " N%d=%lu", nid, node_nr);
4117 } 4175 }
@@ -4123,12 +4181,12 @@ static int mem_control_numa_stat_show(struct seq_file *m, void *arg)
4123static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft, 4181static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft,
4124 struct cgroup_map_cb *cb) 4182 struct cgroup_map_cb *cb)
4125{ 4183{
4126 struct mem_cgroup *mem_cont = mem_cgroup_from_cont(cont); 4184 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
4127 struct mcs_total_stat mystat; 4185 struct mcs_total_stat mystat;
4128 int i; 4186 int i;
4129 4187
4130 memset(&mystat, 0, sizeof(mystat)); 4188 memset(&mystat, 0, sizeof(mystat));
4131 mem_cgroup_get_local_stat(mem_cont, &mystat); 4189 mem_cgroup_get_local_stat(memcg, &mystat);
4132 4190
4133 4191
4134 for (i = 0; i < NR_MCS_STAT; i++) { 4192 for (i = 0; i < NR_MCS_STAT; i++) {
@@ -4140,14 +4198,14 @@ static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft,
4140 /* Hierarchical information */ 4198 /* Hierarchical information */
4141 { 4199 {
4142 unsigned long long limit, memsw_limit; 4200 unsigned long long limit, memsw_limit;
4143 memcg_get_hierarchical_limit(mem_cont, &limit, &memsw_limit); 4201 memcg_get_hierarchical_limit(memcg, &limit, &memsw_limit);
4144 cb->fill(cb, "hierarchical_memory_limit", limit); 4202 cb->fill(cb, "hierarchical_memory_limit", limit);
4145 if (do_swap_account) 4203 if (do_swap_account)
4146 cb->fill(cb, "hierarchical_memsw_limit", memsw_limit); 4204 cb->fill(cb, "hierarchical_memsw_limit", memsw_limit);
4147 } 4205 }
4148 4206
4149 memset(&mystat, 0, sizeof(mystat)); 4207 memset(&mystat, 0, sizeof(mystat));
4150 mem_cgroup_get_total_stat(mem_cont, &mystat); 4208 mem_cgroup_get_total_stat(memcg, &mystat);
4151 for (i = 0; i < NR_MCS_STAT; i++) { 4209 for (i = 0; i < NR_MCS_STAT; i++) {
4152 if (i == MCS_SWAP && !do_swap_account) 4210 if (i == MCS_SWAP && !do_swap_account)
4153 continue; 4211 continue;
@@ -4163,7 +4221,7 @@ static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft,
4163 4221
4164 for_each_online_node(nid) 4222 for_each_online_node(nid)
4165 for (zid = 0; zid < MAX_NR_ZONES; zid++) { 4223 for (zid = 0; zid < MAX_NR_ZONES; zid++) {
4166 mz = mem_cgroup_zoneinfo(mem_cont, nid, zid); 4224 mz = mem_cgroup_zoneinfo(memcg, nid, zid);
4167 4225
4168 recent_rotated[0] += 4226 recent_rotated[0] +=
4169 mz->reclaim_stat.recent_rotated[0]; 4227 mz->reclaim_stat.recent_rotated[0];
@@ -4408,12 +4466,6 @@ static void mem_cgroup_usage_unregister_event(struct cgroup *cgrp,
4408 else 4466 else
4409 BUG(); 4467 BUG();
4410 4468
4411 /*
4412 * Something went wrong if we trying to unregister a threshold
4413 * if we don't have thresholds
4414 */
4415 BUG_ON(!thresholds);
4416
4417 if (!thresholds->primary) 4469 if (!thresholds->primary)
4418 goto unlock; 4470 goto unlock;
4419 4471
@@ -4584,10 +4636,9 @@ static int register_kmem_files(struct cgroup *cont, struct cgroup_subsys *ss)
4584 return mem_cgroup_sockets_init(cont, ss); 4636 return mem_cgroup_sockets_init(cont, ss);
4585}; 4637};
4586 4638
4587static void kmem_cgroup_destroy(struct cgroup_subsys *ss, 4639static void kmem_cgroup_destroy(struct cgroup *cont)
4588 struct cgroup *cont)
4589{ 4640{
4590 mem_cgroup_sockets_destroy(cont, ss); 4641 mem_cgroup_sockets_destroy(cont);
4591} 4642}
4592#else 4643#else
4593static int register_kmem_files(struct cgroup *cont, struct cgroup_subsys *ss) 4644static int register_kmem_files(struct cgroup *cont, struct cgroup_subsys *ss)
@@ -4595,8 +4646,7 @@ static int register_kmem_files(struct cgroup *cont, struct cgroup_subsys *ss)
4595 return 0; 4646 return 0;
4596} 4647}
4597 4648
4598static void kmem_cgroup_destroy(struct cgroup_subsys *ss, 4649static void kmem_cgroup_destroy(struct cgroup *cont)
4599 struct cgroup *cont)
4600{ 4650{
4601} 4651}
4602#endif 4652#endif
@@ -4720,7 +4770,7 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
4720{ 4770{
4721 struct mem_cgroup_per_node *pn; 4771 struct mem_cgroup_per_node *pn;
4722 struct mem_cgroup_per_zone *mz; 4772 struct mem_cgroup_per_zone *mz;
4723 enum lru_list l; 4773 enum lru_list lru;
4724 int zone, tmp = node; 4774 int zone, tmp = node;
4725 /* 4775 /*
4726 * This routine is called against possible nodes. 4776 * This routine is called against possible nodes.
@@ -4738,11 +4788,11 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
4738 4788
4739 for (zone = 0; zone < MAX_NR_ZONES; zone++) { 4789 for (zone = 0; zone < MAX_NR_ZONES; zone++) {
4740 mz = &pn->zoneinfo[zone]; 4790 mz = &pn->zoneinfo[zone];
4741 for_each_lru(l) 4791 for_each_lru(lru)
4742 INIT_LIST_HEAD(&mz->lruvec.lists[l]); 4792 INIT_LIST_HEAD(&mz->lruvec.lists[lru]);
4743 mz->usage_in_excess = 0; 4793 mz->usage_in_excess = 0;
4744 mz->on_tree = false; 4794 mz->on_tree = false;
4745 mz->mem = memcg; 4795 mz->memcg = memcg;
4746 } 4796 }
4747 memcg->info.nodeinfo[node] = pn; 4797 memcg->info.nodeinfo[node] = pn;
4748 return 0; 4798 return 0;
@@ -4755,33 +4805,54 @@ static void free_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
4755 4805
4756static struct mem_cgroup *mem_cgroup_alloc(void) 4806static struct mem_cgroup *mem_cgroup_alloc(void)
4757{ 4807{
4758 struct mem_cgroup *mem; 4808 struct mem_cgroup *memcg;
4759 int size = sizeof(struct mem_cgroup); 4809 int size = sizeof(struct mem_cgroup);
4760 4810
4761 /* Can be very big if MAX_NUMNODES is very big */ 4811 /* Can be very big if MAX_NUMNODES is very big */
4762 if (size < PAGE_SIZE) 4812 if (size < PAGE_SIZE)
4763 mem = kzalloc(size, GFP_KERNEL); 4813 memcg = kzalloc(size, GFP_KERNEL);
4764 else 4814 else
4765 mem = vzalloc(size); 4815 memcg = vzalloc(size);
4766 4816
4767 if (!mem) 4817 if (!memcg)
4768 return NULL; 4818 return NULL;
4769 4819
4770 mem->stat = alloc_percpu(struct mem_cgroup_stat_cpu); 4820 memcg->stat = alloc_percpu(struct mem_cgroup_stat_cpu);
4771 if (!mem->stat) 4821 if (!memcg->stat)
4772 goto out_free; 4822 goto out_free;
4773 spin_lock_init(&mem->pcp_counter_lock); 4823 spin_lock_init(&memcg->pcp_counter_lock);
4774 return mem; 4824 return memcg;
4775 4825
4776out_free: 4826out_free:
4777 if (size < PAGE_SIZE) 4827 if (size < PAGE_SIZE)
4778 kfree(mem); 4828 kfree(memcg);
4779 else 4829 else
4780 vfree(mem); 4830 vfree(memcg);
4781 return NULL; 4831 return NULL;
4782} 4832}
4783 4833
4784/* 4834/*
4835 * Helpers for freeing a vzalloc()ed mem_cgroup by RCU,
4836 * but in process context. The work_freeing structure is overlaid
4837 * on the rcu_freeing structure, which itself is overlaid on memsw.
4838 */
4839static void vfree_work(struct work_struct *work)
4840{
4841 struct mem_cgroup *memcg;
4842
4843 memcg = container_of(work, struct mem_cgroup, work_freeing);
4844 vfree(memcg);
4845}
4846static void vfree_rcu(struct rcu_head *rcu_head)
4847{
4848 struct mem_cgroup *memcg;
4849
4850 memcg = container_of(rcu_head, struct mem_cgroup, rcu_freeing);
4851 INIT_WORK(&memcg->work_freeing, vfree_work);
4852 schedule_work(&memcg->work_freeing);
4853}
4854
4855/*
4785 * At destroying mem_cgroup, references from swap_cgroup can remain. 4856 * At destroying mem_cgroup, references from swap_cgroup can remain.
4786 * (scanning all at force_empty is too costly...) 4857 * (scanning all at force_empty is too costly...)
4787 * 4858 *
@@ -4804,9 +4875,9 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)
4804 4875
4805 free_percpu(memcg->stat); 4876 free_percpu(memcg->stat);
4806 if (sizeof(struct mem_cgroup) < PAGE_SIZE) 4877 if (sizeof(struct mem_cgroup) < PAGE_SIZE)
4807 kfree(memcg); 4878 kfree_rcu(memcg, rcu_freeing);
4808 else 4879 else
4809 vfree(memcg); 4880 call_rcu(&memcg->rcu_freeing, vfree_rcu);
4810} 4881}
4811 4882
4812static void mem_cgroup_get(struct mem_cgroup *memcg) 4883static void mem_cgroup_get(struct mem_cgroup *memcg)
@@ -4888,7 +4959,7 @@ err_cleanup:
4888} 4959}
4889 4960
4890static struct cgroup_subsys_state * __ref 4961static struct cgroup_subsys_state * __ref
4891mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) 4962mem_cgroup_create(struct cgroup *cont)
4892{ 4963{
4893 struct mem_cgroup *memcg, *parent; 4964 struct mem_cgroup *memcg, *parent;
4894 long error = -ENOMEM; 4965 long error = -ENOMEM;
@@ -4944,26 +5015,25 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
4944 atomic_set(&memcg->refcnt, 1); 5015 atomic_set(&memcg->refcnt, 1);
4945 memcg->move_charge_at_immigrate = 0; 5016 memcg->move_charge_at_immigrate = 0;
4946 mutex_init(&memcg->thresholds_lock); 5017 mutex_init(&memcg->thresholds_lock);
5018 spin_lock_init(&memcg->move_lock);
4947 return &memcg->css; 5019 return &memcg->css;
4948free_out: 5020free_out:
4949 __mem_cgroup_free(memcg); 5021 __mem_cgroup_free(memcg);
4950 return ERR_PTR(error); 5022 return ERR_PTR(error);
4951} 5023}
4952 5024
4953static int mem_cgroup_pre_destroy(struct cgroup_subsys *ss, 5025static int mem_cgroup_pre_destroy(struct cgroup *cont)
4954 struct cgroup *cont)
4955{ 5026{
4956 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); 5027 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
4957 5028
4958 return mem_cgroup_force_empty(memcg, false); 5029 return mem_cgroup_force_empty(memcg, false);
4959} 5030}
4960 5031
4961static void mem_cgroup_destroy(struct cgroup_subsys *ss, 5032static void mem_cgroup_destroy(struct cgroup *cont)
4962 struct cgroup *cont)
4963{ 5033{
4964 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); 5034 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
4965 5035
4966 kmem_cgroup_destroy(ss, cont); 5036 kmem_cgroup_destroy(cont);
4967 5037
4968 mem_cgroup_put(memcg); 5038 mem_cgroup_put(memcg);
4969} 5039}
@@ -5040,7 +5110,7 @@ one_by_one:
5040} 5110}
5041 5111
5042/** 5112/**
5043 * is_target_pte_for_mc - check a pte whether it is valid for move charge 5113 * get_mctgt_type - get target type of moving charge
5044 * @vma: the vma the pte to be checked belongs 5114 * @vma: the vma the pte to be checked belongs
5045 * @addr: the address corresponding to the pte to be checked 5115 * @addr: the address corresponding to the pte to be checked
5046 * @ptent: the pte to be checked 5116 * @ptent: the pte to be checked
@@ -5063,7 +5133,7 @@ union mc_target {
5063}; 5133};
5064 5134
5065enum mc_target_type { 5135enum mc_target_type {
5066 MC_TARGET_NONE, /* not used */ 5136 MC_TARGET_NONE = 0,
5067 MC_TARGET_PAGE, 5137 MC_TARGET_PAGE,
5068 MC_TARGET_SWAP, 5138 MC_TARGET_SWAP,
5069}; 5139};
@@ -5144,12 +5214,12 @@ static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
5144 return page; 5214 return page;
5145} 5215}
5146 5216
5147static int is_target_pte_for_mc(struct vm_area_struct *vma, 5217static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
5148 unsigned long addr, pte_t ptent, union mc_target *target) 5218 unsigned long addr, pte_t ptent, union mc_target *target)
5149{ 5219{
5150 struct page *page = NULL; 5220 struct page *page = NULL;
5151 struct page_cgroup *pc; 5221 struct page_cgroup *pc;
5152 int ret = 0; 5222 enum mc_target_type ret = MC_TARGET_NONE;
5153 swp_entry_t ent = { .val = 0 }; 5223 swp_entry_t ent = { .val = 0 };
5154 5224
5155 if (pte_present(ptent)) 5225 if (pte_present(ptent))
@@ -5160,7 +5230,7 @@ static int is_target_pte_for_mc(struct vm_area_struct *vma,
5160 page = mc_handle_file_pte(vma, addr, ptent, &ent); 5230 page = mc_handle_file_pte(vma, addr, ptent, &ent);
5161 5231
5162 if (!page && !ent.val) 5232 if (!page && !ent.val)
5163 return 0; 5233 return ret;
5164 if (page) { 5234 if (page) {
5165 pc = lookup_page_cgroup(page); 5235 pc = lookup_page_cgroup(page);
5166 /* 5236 /*
@@ -5186,6 +5256,41 @@ static int is_target_pte_for_mc(struct vm_area_struct *vma,
5186 return ret; 5256 return ret;
5187} 5257}
5188 5258
5259#ifdef CONFIG_TRANSPARENT_HUGEPAGE
5260/*
5261 * We don't consider swapping or file mapped pages because THP does not
5262 * support them for now.
5263 * Caller should make sure that pmd_trans_huge(pmd) is true.
5264 */
5265static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
5266 unsigned long addr, pmd_t pmd, union mc_target *target)
5267{
5268 struct page *page = NULL;
5269 struct page_cgroup *pc;
5270 enum mc_target_type ret = MC_TARGET_NONE;
5271
5272 page = pmd_page(pmd);
5273 VM_BUG_ON(!page || !PageHead(page));
5274 if (!move_anon())
5275 return ret;
5276 pc = lookup_page_cgroup(page);
5277 if (PageCgroupUsed(pc) && pc->mem_cgroup == mc.from) {
5278 ret = MC_TARGET_PAGE;
5279 if (target) {
5280 get_page(page);
5281 target->page = page;
5282 }
5283 }
5284 return ret;
5285}
5286#else
5287static inline enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
5288 unsigned long addr, pmd_t pmd, union mc_target *target)
5289{
5290 return MC_TARGET_NONE;
5291}
5292#endif
5293
5189static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd, 5294static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
5190 unsigned long addr, unsigned long end, 5295 unsigned long addr, unsigned long end,
5191 struct mm_walk *walk) 5296 struct mm_walk *walk)
@@ -5194,11 +5299,18 @@ static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
5194 pte_t *pte; 5299 pte_t *pte;
5195 spinlock_t *ptl; 5300 spinlock_t *ptl;
5196 5301
5197 split_huge_page_pmd(walk->mm, pmd); 5302 if (pmd_trans_huge_lock(pmd, vma) == 1) {
5303 if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE)
5304 mc.precharge += HPAGE_PMD_NR;
5305 spin_unlock(&vma->vm_mm->page_table_lock);
5306 return 0;
5307 }
5198 5308
5309 if (pmd_trans_unstable(pmd))
5310 return 0;
5199 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); 5311 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
5200 for (; addr != end; pte++, addr += PAGE_SIZE) 5312 for (; addr != end; pte++, addr += PAGE_SIZE)
5201 if (is_target_pte_for_mc(vma, addr, *pte, NULL)) 5313 if (get_mctgt_type(vma, addr, *pte, NULL))
5202 mc.precharge++; /* increment precharge temporarily */ 5314 mc.precharge++; /* increment precharge temporarily */
5203 pte_unmap_unlock(pte - 1, ptl); 5315 pte_unmap_unlock(pte - 1, ptl);
5204 cond_resched(); 5316 cond_resched();
@@ -5300,9 +5412,8 @@ static void mem_cgroup_clear_mc(void)
5300 mem_cgroup_end_move(from); 5412 mem_cgroup_end_move(from);
5301} 5413}
5302 5414
5303static int mem_cgroup_can_attach(struct cgroup_subsys *ss, 5415static int mem_cgroup_can_attach(struct cgroup *cgroup,
5304 struct cgroup *cgroup, 5416 struct cgroup_taskset *tset)
5305 struct cgroup_taskset *tset)
5306{ 5417{
5307 struct task_struct *p = cgroup_taskset_first(tset); 5418 struct task_struct *p = cgroup_taskset_first(tset);
5308 int ret = 0; 5419 int ret = 0;
@@ -5340,9 +5451,8 @@ static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
5340 return ret; 5451 return ret;
5341} 5452}
5342 5453
5343static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss, 5454static void mem_cgroup_cancel_attach(struct cgroup *cgroup,
5344 struct cgroup *cgroup, 5455 struct cgroup_taskset *tset)
5345 struct cgroup_taskset *tset)
5346{ 5456{
5347 mem_cgroup_clear_mc(); 5457 mem_cgroup_clear_mc();
5348} 5458}
@@ -5355,23 +5465,57 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
5355 struct vm_area_struct *vma = walk->private; 5465 struct vm_area_struct *vma = walk->private;
5356 pte_t *pte; 5466 pte_t *pte;
5357 spinlock_t *ptl; 5467 spinlock_t *ptl;
5468 enum mc_target_type target_type;
5469 union mc_target target;
5470 struct page *page;
5471 struct page_cgroup *pc;
5358 5472
5359 split_huge_page_pmd(walk->mm, pmd); 5473 /*
5474 * We don't take compound_lock() here but no race with splitting thp
5475 * happens because:
5476 * - if pmd_trans_huge_lock() returns 1, the relevant thp is not
5477 * under splitting, which means there's no concurrent thp split,
5478 * - if another thread runs into split_huge_page() just after we
5479 * entered this if-block, the thread must wait for page table lock
5480 * to be unlocked in __split_huge_page_splitting(), where the main
5481 * part of thp split is not executed yet.
5482 */
5483 if (pmd_trans_huge_lock(pmd, vma) == 1) {
5484 if (!mc.precharge) {
5485 spin_unlock(&vma->vm_mm->page_table_lock);
5486 return 0;
5487 }
5488 target_type = get_mctgt_type_thp(vma, addr, *pmd, &target);
5489 if (target_type == MC_TARGET_PAGE) {
5490 page = target.page;
5491 if (!isolate_lru_page(page)) {
5492 pc = lookup_page_cgroup(page);
5493 if (!mem_cgroup_move_account(page, HPAGE_PMD_NR,
5494 pc, mc.from, mc.to,
5495 false)) {
5496 mc.precharge -= HPAGE_PMD_NR;
5497 mc.moved_charge += HPAGE_PMD_NR;
5498 }
5499 putback_lru_page(page);
5500 }
5501 put_page(page);
5502 }
5503 spin_unlock(&vma->vm_mm->page_table_lock);
5504 return 0;
5505 }
5506
5507 if (pmd_trans_unstable(pmd))
5508 return 0;
5360retry: 5509retry:
5361 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); 5510 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
5362 for (; addr != end; addr += PAGE_SIZE) { 5511 for (; addr != end; addr += PAGE_SIZE) {
5363 pte_t ptent = *(pte++); 5512 pte_t ptent = *(pte++);
5364 union mc_target target;
5365 int type;
5366 struct page *page;
5367 struct page_cgroup *pc;
5368 swp_entry_t ent; 5513 swp_entry_t ent;
5369 5514
5370 if (!mc.precharge) 5515 if (!mc.precharge)
5371 break; 5516 break;
5372 5517
5373 type = is_target_pte_for_mc(vma, addr, ptent, &target); 5518 switch (get_mctgt_type(vma, addr, ptent, &target)) {
5374 switch (type) {
5375 case MC_TARGET_PAGE: 5519 case MC_TARGET_PAGE:
5376 page = target.page; 5520 page = target.page;
5377 if (isolate_lru_page(page)) 5521 if (isolate_lru_page(page))
@@ -5384,7 +5528,7 @@ retry:
5384 mc.moved_charge++; 5528 mc.moved_charge++;
5385 } 5529 }
5386 putback_lru_page(page); 5530 putback_lru_page(page);
5387put: /* is_target_pte_for_mc() gets the page */ 5531put: /* get_mctgt_type() gets the page */
5388 put_page(page); 5532 put_page(page);
5389 break; 5533 break;
5390 case MC_TARGET_SWAP: 5534 case MC_TARGET_SWAP:
@@ -5457,9 +5601,8 @@ retry:
5457 up_read(&mm->mmap_sem); 5601 up_read(&mm->mmap_sem);
5458} 5602}
5459 5603
5460static void mem_cgroup_move_task(struct cgroup_subsys *ss, 5604static void mem_cgroup_move_task(struct cgroup *cont,
5461 struct cgroup *cont, 5605 struct cgroup_taskset *tset)
5462 struct cgroup_taskset *tset)
5463{ 5606{
5464 struct task_struct *p = cgroup_taskset_first(tset); 5607 struct task_struct *p = cgroup_taskset_first(tset);
5465 struct mm_struct *mm = get_task_mm(p); 5608 struct mm_struct *mm = get_task_mm(p);
@@ -5474,20 +5617,17 @@ static void mem_cgroup_move_task(struct cgroup_subsys *ss,
5474 mem_cgroup_clear_mc(); 5617 mem_cgroup_clear_mc();
5475} 5618}
5476#else /* !CONFIG_MMU */ 5619#else /* !CONFIG_MMU */
5477static int mem_cgroup_can_attach(struct cgroup_subsys *ss, 5620static int mem_cgroup_can_attach(struct cgroup *cgroup,
5478 struct cgroup *cgroup, 5621 struct cgroup_taskset *tset)
5479 struct cgroup_taskset *tset)
5480{ 5622{
5481 return 0; 5623 return 0;
5482} 5624}
5483static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss, 5625static void mem_cgroup_cancel_attach(struct cgroup *cgroup,
5484 struct cgroup *cgroup, 5626 struct cgroup_taskset *tset)
5485 struct cgroup_taskset *tset)
5486{ 5627{
5487} 5628}
5488static void mem_cgroup_move_task(struct cgroup_subsys *ss, 5629static void mem_cgroup_move_task(struct cgroup *cont,
5489 struct cgroup *cont, 5630 struct cgroup_taskset *tset)
5490 struct cgroup_taskset *tset)
5491{ 5631{
5492} 5632}
5493#endif 5633#endif
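
The memcontrol.c hunks above rename is_target_pte_for_mc() to get_mctgt_type(), add a THP-aware get_mctgt_type_thp(), and drop the cgroup_subsys argument from the cgroup callbacks. As a reading aid, a minimal sketch of the walker pattern the patch establishes, using only symbols that appear in the hunks (pmd_trans_huge_lock(), pmd_trans_unstable(), get_mctgt_type_thp(), get_mctgt_type()); it mirrors mem_cgroup_count_precharge_pte_range() and is not a compilable unit outside that file.

/* Sketch: how a page-table walker dispatches between the THP and pte paths
 * after this patch (condensed from the precharge-count hunk above). */
static int count_precharge_sketch(pmd_t *pmd, unsigned long addr,
				  unsigned long end, struct mm_walk *walk)
{
	struct vm_area_struct *vma = walk->private;
	spinlock_t *ptl;
	pte_t *pte;

	if (pmd_trans_huge_lock(pmd, vma) == 1) {	/* huge pmd, lock held */
		if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE)
			mc.precharge += HPAGE_PMD_NR;
		spin_unlock(&vma->vm_mm->page_table_lock);
		return 0;
	}
	if (pmd_trans_unstable(pmd))			/* pmd may be changing under us */
		return 0;
	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
	for (; addr != end; pte++, addr += PAGE_SIZE)
		if (get_mctgt_type(vma, addr, *pte, NULL))
			mc.precharge++;
	pte_unmap_unlock(pte - 1, ptl);
	return 0;
}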
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 56080ea36140..97cc2733551a 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -187,33 +187,40 @@ int hwpoison_filter(struct page *p)
187EXPORT_SYMBOL_GPL(hwpoison_filter); 187EXPORT_SYMBOL_GPL(hwpoison_filter);
188 188
189/* 189/*
190 * Send all the processes who have the page mapped an ``action optional'' 190 * Send all the processes who have the page mapped a signal.
191 * signal. 191 * ``action optional'' if they are not immediately affected by the error
192 * ``action required'' if error happened in current execution context
192 */ 193 */
193static int kill_proc_ao(struct task_struct *t, unsigned long addr, int trapno, 194static int kill_proc(struct task_struct *t, unsigned long addr, int trapno,
194 unsigned long pfn, struct page *page) 195 unsigned long pfn, struct page *page, int flags)
195{ 196{
196 struct siginfo si; 197 struct siginfo si;
197 int ret; 198 int ret;
198 199
199 printk(KERN_ERR 200 printk(KERN_ERR
200 "MCE %#lx: Killing %s:%d early due to hardware memory corruption\n", 201 "MCE %#lx: Killing %s:%d due to hardware memory corruption\n",
201 pfn, t->comm, t->pid); 202 pfn, t->comm, t->pid);
202 si.si_signo = SIGBUS; 203 si.si_signo = SIGBUS;
203 si.si_errno = 0; 204 si.si_errno = 0;
204 si.si_code = BUS_MCEERR_AO;
205 si.si_addr = (void *)addr; 205 si.si_addr = (void *)addr;
206#ifdef __ARCH_SI_TRAPNO 206#ifdef __ARCH_SI_TRAPNO
207 si.si_trapno = trapno; 207 si.si_trapno = trapno;
208#endif 208#endif
209 si.si_addr_lsb = compound_trans_order(compound_head(page)) + PAGE_SHIFT; 209 si.si_addr_lsb = compound_trans_order(compound_head(page)) + PAGE_SHIFT;
210 /* 210
211 * Don't use force here, it's convenient if the signal 211 if ((flags & MF_ACTION_REQUIRED) && t == current) {
212 * can be temporarily blocked. 212 si.si_code = BUS_MCEERR_AR;
213 * This could cause a loop when the user sets SIGBUS 213 ret = force_sig_info(SIGBUS, &si, t);
214 * to SIG_IGN, but hopefully no one will do that? 214 } else {
215 */ 215 /*
216 ret = send_sig_info(SIGBUS, &si, t); /* synchronous? */ 216 * Don't use force here, it's convenient if the signal
217 * can be temporarily blocked.
218 * This could cause a loop when the user sets SIGBUS
219 * to SIG_IGN, but hopefully no one will do that?
220 */
221 si.si_code = BUS_MCEERR_AO;
222 ret = send_sig_info(SIGBUS, &si, t); /* synchronous? */
223 }
217 if (ret < 0) 224 if (ret < 0)
218 printk(KERN_INFO "MCE: Error sending signal to %s:%d: %d\n", 225 printk(KERN_INFO "MCE: Error sending signal to %s:%d: %d\n",
219 t->comm, t->pid, ret); 226 t->comm, t->pid, ret);
@@ -338,8 +345,9 @@ static void add_to_kill(struct task_struct *tsk, struct page *p,
338 * Also when FAIL is set do a force kill because something went 345 * Also when FAIL is set do a force kill because something went
339 * wrong earlier. 346 * wrong earlier.
340 */ 347 */
341static void kill_procs_ao(struct list_head *to_kill, int doit, int trapno, 348static void kill_procs(struct list_head *to_kill, int doit, int trapno,
342 int fail, struct page *page, unsigned long pfn) 349 int fail, struct page *page, unsigned long pfn,
350 int flags)
343{ 351{
344 struct to_kill *tk, *next; 352 struct to_kill *tk, *next;
345 353
@@ -363,8 +371,8 @@ static void kill_procs_ao(struct list_head *to_kill, int doit, int trapno,
363 * check for that, but we need to tell the 371 * check for that, but we need to tell the
364 * process anyways. 372 * process anyways.
365 */ 373 */
366 else if (kill_proc_ao(tk->tsk, tk->addr, trapno, 374 else if (kill_proc(tk->tsk, tk->addr, trapno,
367 pfn, page) < 0) 375 pfn, page, flags) < 0)
368 printk(KERN_ERR 376 printk(KERN_ERR
369 "MCE %#lx: Cannot send advisory machine check signal to %s:%d\n", 377 "MCE %#lx: Cannot send advisory machine check signal to %s:%d\n",
370 pfn, tk->tsk->comm, tk->tsk->pid); 378 pfn, tk->tsk->comm, tk->tsk->pid);
@@ -844,7 +852,7 @@ static int page_action(struct page_state *ps, struct page *p,
844 * the pages and send SIGBUS to the processes if the data was dirty. 852 * the pages and send SIGBUS to the processes if the data was dirty.
845 */ 853 */
846static int hwpoison_user_mappings(struct page *p, unsigned long pfn, 854static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
847 int trapno) 855 int trapno, int flags)
848{ 856{
849 enum ttu_flags ttu = TTU_UNMAP | TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS; 857 enum ttu_flags ttu = TTU_UNMAP | TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS;
850 struct address_space *mapping; 858 struct address_space *mapping;
@@ -962,8 +970,8 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
962 * use a more force-full uncatchable kill to prevent 970 * use a more force-full uncatchable kill to prevent
963 * any accesses to the poisoned memory. 971 * any accesses to the poisoned memory.
964 */ 972 */
965 kill_procs_ao(&tokill, !!PageDirty(ppage), trapno, 973 kill_procs(&tokill, !!PageDirty(ppage), trapno,
966 ret != SWAP_SUCCESS, p, pfn); 974 ret != SWAP_SUCCESS, p, pfn, flags);
967 975
968 return ret; 976 return ret;
969} 977}
@@ -984,7 +992,25 @@ static void clear_page_hwpoison_huge_page(struct page *hpage)
984 ClearPageHWPoison(hpage + i); 992 ClearPageHWPoison(hpage + i);
985} 993}
986 994
987int __memory_failure(unsigned long pfn, int trapno, int flags) 995/**
996 * memory_failure - Handle memory failure of a page.
997 * @pfn: Page Number of the corrupted page
998 * @trapno: Trap number reported in the signal to user space.
999 * @flags: fine tune action taken
1000 *
1001 * This function is called by the low level machine check code
1002 * of an architecture when it detects hardware memory corruption
1003 * of a page. It tries its best to recover, which includes
1004 * dropping pages, killing processes etc.
1005 *
1006 * The function is primarily of use for corruptions that
1007 * happen outside the current execution context (e.g. when
1008 * detected by a background scrubber)
1009 *
1010 * Must run in process context (e.g. a work queue) with interrupts
1011 * enabled and no spinlocks hold.
1012 */
1013int memory_failure(unsigned long pfn, int trapno, int flags)
988{ 1014{
989 struct page_state *ps; 1015 struct page_state *ps;
990 struct page *p; 1016 struct page *p;
@@ -1063,7 +1089,7 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
1063 * The check (unnecessarily) ignores LRU pages being isolated and 1089 * The check (unnecessarily) ignores LRU pages being isolated and
1064 * walked by the page reclaim code, however that's not a big loss. 1090 * walked by the page reclaim code, however that's not a big loss.
1065 */ 1091 */
1066 if (!PageHuge(p) && !PageTransCompound(p)) { 1092 if (!PageHuge(p) && !PageTransTail(p)) {
1067 if (!PageLRU(p)) 1093 if (!PageLRU(p))
1068 shake_page(p, 0); 1094 shake_page(p, 0);
1069 if (!PageLRU(p)) { 1095 if (!PageLRU(p)) {
@@ -1130,7 +1156,7 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
1130 * Now take care of user space mappings. 1156 * Now take care of user space mappings.
1131 * Abort on fail: __delete_from_page_cache() assumes unmapped page. 1157 * Abort on fail: __delete_from_page_cache() assumes unmapped page.
1132 */ 1158 */
1133 if (hwpoison_user_mappings(p, pfn, trapno) != SWAP_SUCCESS) { 1159 if (hwpoison_user_mappings(p, pfn, trapno, flags) != SWAP_SUCCESS) {
1134 printk(KERN_ERR "MCE %#lx: cannot unmap page, give up\n", pfn); 1160 printk(KERN_ERR "MCE %#lx: cannot unmap page, give up\n", pfn);
1135 res = -EBUSY; 1161 res = -EBUSY;
1136 goto out; 1162 goto out;
@@ -1156,29 +1182,7 @@ out:
1156 unlock_page(hpage); 1182 unlock_page(hpage);
1157 return res; 1183 return res;
1158} 1184}
1159EXPORT_SYMBOL_GPL(__memory_failure); 1185EXPORT_SYMBOL_GPL(memory_failure);
1160
1161/**
1162 * memory_failure - Handle memory failure of a page.
1163 * @pfn: Page Number of the corrupted page
1164 * @trapno: Trap number reported in the signal to user space.
1165 *
1166 * This function is called by the low level machine check code
1167 * of an architecture when it detects hardware memory corruption
1168 * of a page. It tries its best to recover, which includes
1169 * dropping pages, killing processes etc.
1170 *
1171 * The function is primarily of use for corruptions that
1172 * happen outside the current execution context (e.g. when
1173 * detected by a background scrubber)
1174 *
1175 * Must run in process context (e.g. a work queue) with interrupts
1176 * enabled and no spinlocks hold.
1177 */
1178void memory_failure(unsigned long pfn, int trapno)
1179{
1180 __memory_failure(pfn, trapno, 0);
1181}
1182 1186
1183#define MEMORY_FAILURE_FIFO_ORDER 4 1187#define MEMORY_FAILURE_FIFO_ORDER 4
1184#define MEMORY_FAILURE_FIFO_SIZE (1 << MEMORY_FAILURE_FIFO_ORDER) 1188#define MEMORY_FAILURE_FIFO_SIZE (1 << MEMORY_FAILURE_FIFO_ORDER)
@@ -1251,7 +1255,7 @@ static void memory_failure_work_func(struct work_struct *work)
1251 spin_unlock_irqrestore(&mf_cpu->lock, proc_flags); 1255 spin_unlock_irqrestore(&mf_cpu->lock, proc_flags);
1252 if (!gotten) 1256 if (!gotten)
1253 break; 1257 break;
1254 __memory_failure(entry.pfn, entry.trapno, entry.flags); 1258 memory_failure(entry.pfn, entry.trapno, entry.flags);
1255 } 1259 }
1256} 1260}
1257 1261
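
The memory-failure.c hunks fold __memory_failure() into memory_failure(pfn, trapno, flags) and teach kill_proc() to send an uncatchable BUS_MCEERR_AR via force_sig_info() only when MF_ACTION_REQUIRED is set and the target is current, keeping the blockable BUS_MCEERR_AO path otherwise. A hedged caller-side sketch of the new entry point; memory_failure() and MF_ACTION_REQUIRED come from the hunks above, the surrounding handler is hypothetical.

/* Hypothetical reporting helper: hand a corrupted pfn to the recovery code.
 * Errors hit in the current execution context pass MF_ACTION_REQUIRED so the
 * affected task gets BUS_MCEERR_AR; a background scrubber would pass 0 and
 * keep the old "action optional" behaviour. */
static void report_poisoned_page(unsigned long pfn, int trapno, bool in_context)
{
	int ret = memory_failure(pfn, trapno, in_context ? MF_ACTION_REQUIRED : 0);

	if (ret)
		pr_err("MCE: recovery of pfn %#lx failed: %d\n", pfn, ret);
}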
diff --git a/mm/memory.c b/mm/memory.c
index fa2f04e0337c..6105f475fa86 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -125,17 +125,17 @@ core_initcall(init_zero_pfn);
125 125
126#if defined(SPLIT_RSS_COUNTING) 126#if defined(SPLIT_RSS_COUNTING)
127 127
128static void __sync_task_rss_stat(struct task_struct *task, struct mm_struct *mm) 128void sync_mm_rss(struct mm_struct *mm)
129{ 129{
130 int i; 130 int i;
131 131
132 for (i = 0; i < NR_MM_COUNTERS; i++) { 132 for (i = 0; i < NR_MM_COUNTERS; i++) {
133 if (task->rss_stat.count[i]) { 133 if (current->rss_stat.count[i]) {
134 add_mm_counter(mm, i, task->rss_stat.count[i]); 134 add_mm_counter(mm, i, current->rss_stat.count[i]);
135 task->rss_stat.count[i] = 0; 135 current->rss_stat.count[i] = 0;
136 } 136 }
137 } 137 }
138 task->rss_stat.events = 0; 138 current->rss_stat.events = 0;
139} 139}
140 140
141static void add_mm_counter_fast(struct mm_struct *mm, int member, int val) 141static void add_mm_counter_fast(struct mm_struct *mm, int member, int val)
@@ -157,30 +157,7 @@ static void check_sync_rss_stat(struct task_struct *task)
157 if (unlikely(task != current)) 157 if (unlikely(task != current))
158 return; 158 return;
159 if (unlikely(task->rss_stat.events++ > TASK_RSS_EVENTS_THRESH)) 159 if (unlikely(task->rss_stat.events++ > TASK_RSS_EVENTS_THRESH))
160 __sync_task_rss_stat(task, task->mm); 160 sync_mm_rss(task->mm);
161}
162
163unsigned long get_mm_counter(struct mm_struct *mm, int member)
164{
165 long val = 0;
166
167 /*
168 * Don't use task->mm here...for avoiding to use task_get_mm()..
169 * The caller must guarantee task->mm is not invalid.
170 */
171 val = atomic_long_read(&mm->rss_stat.count[member]);
172 /*
173 * counter is updated in asynchronous manner and may go to minus.
174 * But it's never be expected number for users.
175 */
176 if (val < 0)
177 return 0;
178 return (unsigned long)val;
179}
180
181void sync_mm_rss(struct task_struct *task, struct mm_struct *mm)
182{
183 __sync_task_rss_stat(task, mm);
184} 161}
185#else /* SPLIT_RSS_COUNTING */ 162#else /* SPLIT_RSS_COUNTING */
186 163
@@ -661,7 +638,7 @@ static inline void add_mm_rss_vec(struct mm_struct *mm, int *rss)
661 int i; 638 int i;
662 639
663 if (current->mm == mm) 640 if (current->mm == mm)
664 sync_mm_rss(current, mm); 641 sync_mm_rss(mm);
665 for (i = 0; i < NR_MM_COUNTERS; i++) 642 for (i = 0; i < NR_MM_COUNTERS; i++)
666 if (rss[i]) 643 if (rss[i])
667 add_mm_counter(mm, i, rss[i]); 644 add_mm_counter(mm, i, rss[i]);
@@ -1247,16 +1224,24 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
1247 do { 1224 do {
1248 next = pmd_addr_end(addr, end); 1225 next = pmd_addr_end(addr, end);
1249 if (pmd_trans_huge(*pmd)) { 1226 if (pmd_trans_huge(*pmd)) {
1250 if (next-addr != HPAGE_PMD_SIZE) { 1227 if (next - addr != HPAGE_PMD_SIZE) {
1251 VM_BUG_ON(!rwsem_is_locked(&tlb->mm->mmap_sem)); 1228 VM_BUG_ON(!rwsem_is_locked(&tlb->mm->mmap_sem));
1252 split_huge_page_pmd(vma->vm_mm, pmd); 1229 split_huge_page_pmd(vma->vm_mm, pmd);
1253 } else if (zap_huge_pmd(tlb, vma, pmd, addr)) 1230 } else if (zap_huge_pmd(tlb, vma, pmd, addr))
1254 continue; 1231 goto next;
1255 /* fall through */ 1232 /* fall through */
1256 } 1233 }
1257 if (pmd_none_or_clear_bad(pmd)) 1234 /*
1258 continue; 1235 * Here there can be other concurrent MADV_DONTNEED or
1236 * trans huge page faults running, and if the pmd is
1237 * none or trans huge it can change under us. This is
1238 * because MADV_DONTNEED holds the mmap_sem in read
1239 * mode.
1240 */
1241 if (pmd_none_or_trans_huge_or_clear_bad(pmd))
1242 goto next;
1259 next = zap_pte_range(tlb, vma, pmd, addr, next, details); 1243 next = zap_pte_range(tlb, vma, pmd, addr, next, details);
1244next:
1260 cond_resched(); 1245 cond_resched();
1261 } while (pmd++, addr = next, addr != end); 1246 } while (pmd++, addr = next, addr != end);
1262 1247
@@ -1282,10 +1267,10 @@ static inline unsigned long zap_pud_range(struct mmu_gather *tlb,
1282 return addr; 1267 return addr;
1283} 1268}
1284 1269
1285static unsigned long unmap_page_range(struct mmu_gather *tlb, 1270static void unmap_page_range(struct mmu_gather *tlb,
1286 struct vm_area_struct *vma, 1271 struct vm_area_struct *vma,
1287 unsigned long addr, unsigned long end, 1272 unsigned long addr, unsigned long end,
1288 struct zap_details *details) 1273 struct zap_details *details)
1289{ 1274{
1290 pgd_t *pgd; 1275 pgd_t *pgd;
1291 unsigned long next; 1276 unsigned long next;
@@ -1305,8 +1290,47 @@ static unsigned long unmap_page_range(struct mmu_gather *tlb,
1305 } while (pgd++, addr = next, addr != end); 1290 } while (pgd++, addr = next, addr != end);
1306 tlb_end_vma(tlb, vma); 1291 tlb_end_vma(tlb, vma);
1307 mem_cgroup_uncharge_end(); 1292 mem_cgroup_uncharge_end();
1293}
1308 1294
1309 return addr; 1295
1296static void unmap_single_vma(struct mmu_gather *tlb,
1297 struct vm_area_struct *vma, unsigned long start_addr,
1298 unsigned long end_addr, unsigned long *nr_accounted,
1299 struct zap_details *details)
1300{
1301 unsigned long start = max(vma->vm_start, start_addr);
1302 unsigned long end;
1303
1304 if (start >= vma->vm_end)
1305 return;
1306 end = min(vma->vm_end, end_addr);
1307 if (end <= vma->vm_start)
1308 return;
1309
1310 if (vma->vm_flags & VM_ACCOUNT)
1311 *nr_accounted += (end - start) >> PAGE_SHIFT;
1312
1313 if (unlikely(is_pfn_mapping(vma)))
1314 untrack_pfn_vma(vma, 0, 0);
1315
1316 if (start != end) {
1317 if (unlikely(is_vm_hugetlb_page(vma))) {
1318 /*
1319 * It is undesirable to test vma->vm_file as it
1320 * should be non-null for valid hugetlb area.
1321 * However, vm_file will be NULL in the error
1322 * cleanup path of do_mmap_pgoff. When
1323 * hugetlbfs ->mmap method fails,
1324 * do_mmap_pgoff() nullifies vma->vm_file
1325 * before calling this function to clean up.
1326 * Since no pte has actually been setup, it is
1327 * safe to do nothing in this case.
1328 */
1329 if (vma->vm_file)
1330 unmap_hugepage_range(vma, start, end, NULL);
1331 } else
1332 unmap_page_range(tlb, vma, start, end, details);
1333 }
1310} 1334}
1311 1335
1312/** 1336/**
@@ -1318,8 +1342,6 @@ static unsigned long unmap_page_range(struct mmu_gather *tlb,
1318 * @nr_accounted: Place number of unmapped pages in vm-accountable vma's here 1342 * @nr_accounted: Place number of unmapped pages in vm-accountable vma's here
1319 * @details: details of nonlinear truncation or shared cache invalidation 1343 * @details: details of nonlinear truncation or shared cache invalidation
1320 * 1344 *
1321 * Returns the end address of the unmapping (restart addr if interrupted).
1322 *
1323 * Unmap all pages in the vma list. 1345 * Unmap all pages in the vma list.
1324 * 1346 *
1325 * Only addresses between `start' and `end' will be unmapped. 1347 * Only addresses between `start' and `end' will be unmapped.
@@ -1331,55 +1353,18 @@ static unsigned long unmap_page_range(struct mmu_gather *tlb,
1331 * ensure that any thus-far unmapped pages are flushed before unmap_vmas() 1353 * ensure that any thus-far unmapped pages are flushed before unmap_vmas()
1332 * drops the lock and schedules. 1354 * drops the lock and schedules.
1333 */ 1355 */
1334unsigned long unmap_vmas(struct mmu_gather *tlb, 1356void unmap_vmas(struct mmu_gather *tlb,
1335 struct vm_area_struct *vma, unsigned long start_addr, 1357 struct vm_area_struct *vma, unsigned long start_addr,
1336 unsigned long end_addr, unsigned long *nr_accounted, 1358 unsigned long end_addr, unsigned long *nr_accounted,
1337 struct zap_details *details) 1359 struct zap_details *details)
1338{ 1360{
1339 unsigned long start = start_addr;
1340 struct mm_struct *mm = vma->vm_mm; 1361 struct mm_struct *mm = vma->vm_mm;
1341 1362
1342 mmu_notifier_invalidate_range_start(mm, start_addr, end_addr); 1363 mmu_notifier_invalidate_range_start(mm, start_addr, end_addr);
1343 for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next) { 1364 for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next)
1344 unsigned long end; 1365 unmap_single_vma(tlb, vma, start_addr, end_addr, nr_accounted,
1345 1366 details);
1346 start = max(vma->vm_start, start_addr);
1347 if (start >= vma->vm_end)
1348 continue;
1349 end = min(vma->vm_end, end_addr);
1350 if (end <= vma->vm_start)
1351 continue;
1352
1353 if (vma->vm_flags & VM_ACCOUNT)
1354 *nr_accounted += (end - start) >> PAGE_SHIFT;
1355
1356 if (unlikely(is_pfn_mapping(vma)))
1357 untrack_pfn_vma(vma, 0, 0);
1358
1359 while (start != end) {
1360 if (unlikely(is_vm_hugetlb_page(vma))) {
1361 /*
1362 * It is undesirable to test vma->vm_file as it
1363 * should be non-null for valid hugetlb area.
1364 * However, vm_file will be NULL in the error
1365 * cleanup path of do_mmap_pgoff. When
1366 * hugetlbfs ->mmap method fails,
1367 * do_mmap_pgoff() nullifies vma->vm_file
1368 * before calling this function to clean up.
1369 * Since no pte has actually been setup, it is
1370 * safe to do nothing in this case.
1371 */
1372 if (vma->vm_file)
1373 unmap_hugepage_range(vma, start, end, NULL);
1374
1375 start = end;
1376 } else
1377 start = unmap_page_range(tlb, vma, start, end, details);
1378 }
1379 }
1380
1381 mmu_notifier_invalidate_range_end(mm, start_addr, end_addr); 1367 mmu_notifier_invalidate_range_end(mm, start_addr, end_addr);
1382 return start; /* which is now the end (or restart) address */
1383} 1368}
1384 1369
1385/** 1370/**
@@ -1388,8 +1373,10 @@ unsigned long unmap_vmas(struct mmu_gather *tlb,
1388 * @address: starting address of pages to zap 1373 * @address: starting address of pages to zap
1389 * @size: number of bytes to zap 1374 * @size: number of bytes to zap
1390 * @details: details of nonlinear truncation or shared cache invalidation 1375 * @details: details of nonlinear truncation or shared cache invalidation
1376 *
1377 * Caller must protect the VMA list
1391 */ 1378 */
1392unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address, 1379void zap_page_range(struct vm_area_struct *vma, unsigned long address,
1393 unsigned long size, struct zap_details *details) 1380 unsigned long size, struct zap_details *details)
1394{ 1381{
1395 struct mm_struct *mm = vma->vm_mm; 1382 struct mm_struct *mm = vma->vm_mm;
@@ -1400,9 +1387,34 @@ unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address,
1400 lru_add_drain(); 1387 lru_add_drain();
1401 tlb_gather_mmu(&tlb, mm, 0); 1388 tlb_gather_mmu(&tlb, mm, 0);
1402 update_hiwater_rss(mm); 1389 update_hiwater_rss(mm);
1403 end = unmap_vmas(&tlb, vma, address, end, &nr_accounted, details); 1390 unmap_vmas(&tlb, vma, address, end, &nr_accounted, details);
1391 tlb_finish_mmu(&tlb, address, end);
1392}
1393
1394/**
1395 * zap_page_range_single - remove user pages in a given range
1396 * @vma: vm_area_struct holding the applicable pages
1397 * @address: starting address of pages to zap
1398 * @size: number of bytes to zap
1399 * @details: details of nonlinear truncation or shared cache invalidation
1400 *
1401 * The range must fit into one VMA.
1402 */
1403static void zap_page_range_single(struct vm_area_struct *vma, unsigned long address,
1404 unsigned long size, struct zap_details *details)
1405{
1406 struct mm_struct *mm = vma->vm_mm;
1407 struct mmu_gather tlb;
1408 unsigned long end = address + size;
1409 unsigned long nr_accounted = 0;
1410
1411 lru_add_drain();
1412 tlb_gather_mmu(&tlb, mm, 0);
1413 update_hiwater_rss(mm);
1414 mmu_notifier_invalidate_range_start(mm, address, end);
1415 unmap_single_vma(&tlb, vma, address, end, &nr_accounted, details);
1416 mmu_notifier_invalidate_range_end(mm, address, end);
1404 tlb_finish_mmu(&tlb, address, end); 1417 tlb_finish_mmu(&tlb, address, end);
1405 return end;
1406} 1418}
1407 1419
1408/** 1420/**
@@ -1423,7 +1435,7 @@ int zap_vma_ptes(struct vm_area_struct *vma, unsigned long address,
1423 if (address < vma->vm_start || address + size > vma->vm_end || 1435 if (address < vma->vm_start || address + size > vma->vm_end ||
1424 !(vma->vm_flags & VM_PFNMAP)) 1436 !(vma->vm_flags & VM_PFNMAP))
1425 return -1; 1437 return -1;
1426 zap_page_range(vma, address, size, NULL); 1438 zap_page_range_single(vma, address, size, NULL);
1427 return 0; 1439 return 0;
1428} 1440}
1429EXPORT_SYMBOL_GPL(zap_vma_ptes); 1441EXPORT_SYMBOL_GPL(zap_vma_ptes);
@@ -2447,7 +2459,7 @@ static inline void cow_user_page(struct page *dst, struct page *src, unsigned lo
2447 * fails, we just zero-fill it. Live with it. 2459 * fails, we just zero-fill it. Live with it.
2448 */ 2460 */
2449 if (unlikely(!src)) { 2461 if (unlikely(!src)) {
2450 void *kaddr = kmap_atomic(dst, KM_USER0); 2462 void *kaddr = kmap_atomic(dst);
2451 void __user *uaddr = (void __user *)(va & PAGE_MASK); 2463 void __user *uaddr = (void __user *)(va & PAGE_MASK);
2452 2464
2453 /* 2465 /*
@@ -2458,7 +2470,7 @@ static inline void cow_user_page(struct page *dst, struct page *src, unsigned lo
2458 */ 2470 */
2459 if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE)) 2471 if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE))
2460 clear_page(kaddr); 2472 clear_page(kaddr);
2461 kunmap_atomic(kaddr, KM_USER0); 2473 kunmap_atomic(kaddr);
2462 flush_dcache_page(dst); 2474 flush_dcache_page(dst);
2463 } else 2475 } else
2464 copy_user_highpage(dst, src, va, vma); 2476 copy_user_highpage(dst, src, va, vma);
@@ -2770,7 +2782,7 @@ static void unmap_mapping_range_vma(struct vm_area_struct *vma,
2770 unsigned long start_addr, unsigned long end_addr, 2782 unsigned long start_addr, unsigned long end_addr,
2771 struct zap_details *details) 2783 struct zap_details *details)
2772{ 2784{
2773 zap_page_range(vma, start_addr, end_addr - start_addr, details); 2785 zap_page_range_single(vma, start_addr, end_addr - start_addr, details);
2774} 2786}
2775 2787
2776static inline void unmap_mapping_range_tree(struct prio_tree_root *root, 2788static inline void unmap_mapping_range_tree(struct prio_tree_root *root,
@@ -3611,13 +3623,7 @@ static int __init gate_vma_init(void)
3611 gate_vma.vm_end = FIXADDR_USER_END; 3623 gate_vma.vm_end = FIXADDR_USER_END;
3612 gate_vma.vm_flags = VM_READ | VM_MAYREAD | VM_EXEC | VM_MAYEXEC; 3624 gate_vma.vm_flags = VM_READ | VM_MAYREAD | VM_EXEC | VM_MAYEXEC;
3613 gate_vma.vm_page_prot = __P101; 3625 gate_vma.vm_page_prot = __P101;
3614 /* 3626
3615 * Make sure the vDSO gets into every core dump.
3616 * Dumping its contents makes post-mortem fully interpretable later
3617 * without matching up the same kernel and hardware config to see
3618 * what PC values meant.
3619 */
3620 gate_vma.vm_flags |= VM_ALWAYSDUMP;
3621 return 0; 3627 return 0;
3622} 3628}
3623__initcall(gate_vma_init); 3629__initcall(gate_vma_init);
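
memory.c now exposes the single-argument sync_mm_rss(mm), makes unmap_vmas() and zap_page_range() return void, and adds zap_page_range_single() for ranges that must fit in one VMA (used by zap_vma_ptes() and unmap_mapping_range_vma()). A short sketch of the updated calling convention; the wrapper and its name are invented, only zap_vma_ptes() and sync_mm_rss() are taken from the hunks.

/* Hypothetical driver-style helper: tear down the PTEs of a PFN-mapped
 * window inside one VMA, then fold the caller's per-task RSS deltas back
 * into the mm.  zap_vma_ptes() now ends up in zap_page_range_single(),
 * so the range must lie entirely inside @vma. */
static int drop_pfnmap_window(struct vm_area_struct *vma, unsigned long addr,
			      unsigned long size)
{
	if (zap_vma_ptes(vma, addr, size))	/* -1 unless vma is VM_PFNMAP */
		return -EINVAL;
	if (current->mm == vma->vm_mm)
		sync_mm_rss(vma->vm_mm);	/* one-argument form after this patch */
	return 0;
}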
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 06b145fb64ab..cfb6c8678754 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -512,7 +512,7 @@ static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
512 do { 512 do {
513 next = pmd_addr_end(addr, end); 513 next = pmd_addr_end(addr, end);
514 split_huge_page_pmd(vma->vm_mm, pmd); 514 split_huge_page_pmd(vma->vm_mm, pmd);
515 if (pmd_none_or_clear_bad(pmd)) 515 if (pmd_none_or_trans_huge_or_clear_bad(pmd))
516 continue; 516 continue;
517 if (check_pte_range(vma, pmd, addr, next, nodes, 517 if (check_pte_range(vma, pmd, addr, next, nodes,
518 flags, private)) 518 flags, private))
@@ -640,10 +640,11 @@ static int mbind_range(struct mm_struct *mm, unsigned long start,
640 unsigned long vmstart; 640 unsigned long vmstart;
641 unsigned long vmend; 641 unsigned long vmend;
642 642
643 vma = find_vma_prev(mm, start, &prev); 643 vma = find_vma(mm, start);
644 if (!vma || vma->vm_start > start) 644 if (!vma || vma->vm_start > start)
645 return -EFAULT; 645 return -EFAULT;
646 646
647 prev = vma->vm_prev;
647 if (start > vma->vm_start) 648 if (start > vma->vm_start)
648 prev = vma; 649 prev = vma;
649 650
@@ -1322,12 +1323,9 @@ SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
1322 err = -ESRCH; 1323 err = -ESRCH;
1323 goto out; 1324 goto out;
1324 } 1325 }
1325 mm = get_task_mm(task); 1326 get_task_struct(task);
1326 rcu_read_unlock();
1327 1327
1328 err = -EINVAL; 1328 err = -EINVAL;
1329 if (!mm)
1330 goto out;
1331 1329
1332 /* 1330 /*
1333 * Check if this process has the right to modify the specified 1331 * Check if this process has the right to modify the specified
@@ -1335,14 +1333,13 @@ SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
1335 * capabilities, superuser privileges or the same 1333 * capabilities, superuser privileges or the same
1336 * userid as the target process. 1334 * userid as the target process.
1337 */ 1335 */
1338 rcu_read_lock();
1339 tcred = __task_cred(task); 1336 tcred = __task_cred(task);
1340 if (cred->euid != tcred->suid && cred->euid != tcred->uid && 1337 if (cred->euid != tcred->suid && cred->euid != tcred->uid &&
1341 cred->uid != tcred->suid && cred->uid != tcred->uid && 1338 cred->uid != tcred->suid && cred->uid != tcred->uid &&
1342 !capable(CAP_SYS_NICE)) { 1339 !capable(CAP_SYS_NICE)) {
1343 rcu_read_unlock(); 1340 rcu_read_unlock();
1344 err = -EPERM; 1341 err = -EPERM;
1345 goto out; 1342 goto out_put;
1346 } 1343 }
1347 rcu_read_unlock(); 1344 rcu_read_unlock();
1348 1345
@@ -1350,26 +1347,36 @@ SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
1350 /* Is the user allowed to access the target nodes? */ 1347 /* Is the user allowed to access the target nodes? */
1351 if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) { 1348 if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) {
1352 err = -EPERM; 1349 err = -EPERM;
1353 goto out; 1350 goto out_put;
1354 } 1351 }
1355 1352
1356 if (!nodes_subset(*new, node_states[N_HIGH_MEMORY])) { 1353 if (!nodes_subset(*new, node_states[N_HIGH_MEMORY])) {
1357 err = -EINVAL; 1354 err = -EINVAL;
1358 goto out; 1355 goto out_put;
1359 } 1356 }
1360 1357
1361 err = security_task_movememory(task); 1358 err = security_task_movememory(task);
1362 if (err) 1359 if (err)
1363 goto out; 1360 goto out_put;
1364 1361
1365 err = do_migrate_pages(mm, old, new, 1362 mm = get_task_mm(task);
1366 capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE); 1363 put_task_struct(task);
1367out:
1368 if (mm) 1364 if (mm)
1369 mmput(mm); 1365 err = do_migrate_pages(mm, old, new,
1366 capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
1367 else
1368 err = -EINVAL;
1369
1370 mmput(mm);
1371out:
1370 NODEMASK_SCRATCH_FREE(scratch); 1372 NODEMASK_SCRATCH_FREE(scratch);
1371 1373
1372 return err; 1374 return err;
1375
1376out_put:
1377 put_task_struct(task);
1378 goto out;
1379
1373} 1380}
1374 1381
1375 1382
@@ -1843,18 +1850,24 @@ struct page *
1843alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, 1850alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
1844 unsigned long addr, int node) 1851 unsigned long addr, int node)
1845{ 1852{
1846 struct mempolicy *pol = get_vma_policy(current, vma, addr); 1853 struct mempolicy *pol;
1847 struct zonelist *zl; 1854 struct zonelist *zl;
1848 struct page *page; 1855 struct page *page;
1856 unsigned int cpuset_mems_cookie;
1857
1858retry_cpuset:
1859 pol = get_vma_policy(current, vma, addr);
1860 cpuset_mems_cookie = get_mems_allowed();
1849 1861
1850 get_mems_allowed();
1851 if (unlikely(pol->mode == MPOL_INTERLEAVE)) { 1862 if (unlikely(pol->mode == MPOL_INTERLEAVE)) {
1852 unsigned nid; 1863 unsigned nid;
1853 1864
1854 nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order); 1865 nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
1855 mpol_cond_put(pol); 1866 mpol_cond_put(pol);
1856 page = alloc_page_interleave(gfp, order, nid); 1867 page = alloc_page_interleave(gfp, order, nid);
1857 put_mems_allowed(); 1868 if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
1869 goto retry_cpuset;
1870
1858 return page; 1871 return page;
1859 } 1872 }
1860 zl = policy_zonelist(gfp, pol, node); 1873 zl = policy_zonelist(gfp, pol, node);
@@ -1865,7 +1878,8 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
1865 struct page *page = __alloc_pages_nodemask(gfp, order, 1878 struct page *page = __alloc_pages_nodemask(gfp, order,
1866 zl, policy_nodemask(gfp, pol)); 1879 zl, policy_nodemask(gfp, pol));
1867 __mpol_put(pol); 1880 __mpol_put(pol);
1868 put_mems_allowed(); 1881 if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
1882 goto retry_cpuset;
1869 return page; 1883 return page;
1870 } 1884 }
1871 /* 1885 /*
@@ -1873,7 +1887,8 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
1873 */ 1887 */
1874 page = __alloc_pages_nodemask(gfp, order, zl, 1888 page = __alloc_pages_nodemask(gfp, order, zl,
1875 policy_nodemask(gfp, pol)); 1889 policy_nodemask(gfp, pol));
1876 put_mems_allowed(); 1890 if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
1891 goto retry_cpuset;
1877 return page; 1892 return page;
1878} 1893}
1879 1894
@@ -1900,11 +1915,14 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order)
1900{ 1915{
1901 struct mempolicy *pol = current->mempolicy; 1916 struct mempolicy *pol = current->mempolicy;
1902 struct page *page; 1917 struct page *page;
1918 unsigned int cpuset_mems_cookie;
1903 1919
1904 if (!pol || in_interrupt() || (gfp & __GFP_THISNODE)) 1920 if (!pol || in_interrupt() || (gfp & __GFP_THISNODE))
1905 pol = &default_policy; 1921 pol = &default_policy;
1906 1922
1907 get_mems_allowed(); 1923retry_cpuset:
1924 cpuset_mems_cookie = get_mems_allowed();
1925
1908 /* 1926 /*
1909 * No reference counting needed for current->mempolicy 1927 * No reference counting needed for current->mempolicy
1910 * nor system default_policy 1928 * nor system default_policy
@@ -1915,7 +1933,10 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order)
1915 page = __alloc_pages_nodemask(gfp, order, 1933 page = __alloc_pages_nodemask(gfp, order,
1916 policy_zonelist(gfp, pol, numa_node_id()), 1934 policy_zonelist(gfp, pol, numa_node_id()),
1917 policy_nodemask(gfp, pol)); 1935 policy_nodemask(gfp, pol));
1918 put_mems_allowed(); 1936
1937 if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
1938 goto retry_cpuset;
1939
1919 return page; 1940 return page;
1920} 1941}
1921EXPORT_SYMBOL(alloc_pages_current); 1942EXPORT_SYMBOL(alloc_pages_current);
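
Both alloc_pages_vma() and alloc_pages_current() are now wrapped in a get_mems_allowed()/put_mems_allowed() cookie pair and retry when an allocation fails while the cpuset's mems mask was changing underneath it. A minimal sketch of that retry idiom; only the cookie helpers and __alloc_pages_nodemask() are from the hunks, the function itself is invented.

/* Sketch of the cpuset mems-allowed retry loop introduced above. */
static struct page *alloc_with_cpuset_retry(gfp_t gfp, unsigned int order)
{
	unsigned int cpuset_mems_cookie;
	struct page *page;

retry_cpuset:
	cpuset_mems_cookie = get_mems_allowed();	/* read-side cookie */
	page = __alloc_pages_nodemask(gfp, order,
				      node_zonelist(numa_node_id(), gfp), NULL);
	/*
	 * A failure that races with a cpuset mems update may be spurious:
	 * put_mems_allowed() returns false when the mask changed, so retry.
	 */
	if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
		goto retry_cpuset;
	return page;
}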
diff --git a/mm/migrate.c b/mm/migrate.c
index df141f60289e..51c08a0c6f68 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -839,8 +839,6 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
839 if (!newpage) 839 if (!newpage)
840 return -ENOMEM; 840 return -ENOMEM;
841 841
842 mem_cgroup_reset_owner(newpage);
843
844 if (page_count(page) == 1) { 842 if (page_count(page) == 1) {
845 /* page was freed from under us. So we are done. */ 843 /* page was freed from under us. So we are done. */
846 goto out; 844 goto out;
@@ -1176,20 +1174,17 @@ set_status:
1176 * Migrate an array of page address onto an array of nodes and fill 1174 * Migrate an array of page address onto an array of nodes and fill
1177 * the corresponding array of status. 1175 * the corresponding array of status.
1178 */ 1176 */
1179static int do_pages_move(struct mm_struct *mm, struct task_struct *task, 1177static int do_pages_move(struct mm_struct *mm, nodemask_t task_nodes,
1180 unsigned long nr_pages, 1178 unsigned long nr_pages,
1181 const void __user * __user *pages, 1179 const void __user * __user *pages,
1182 const int __user *nodes, 1180 const int __user *nodes,
1183 int __user *status, int flags) 1181 int __user *status, int flags)
1184{ 1182{
1185 struct page_to_node *pm; 1183 struct page_to_node *pm;
1186 nodemask_t task_nodes;
1187 unsigned long chunk_nr_pages; 1184 unsigned long chunk_nr_pages;
1188 unsigned long chunk_start; 1185 unsigned long chunk_start;
1189 int err; 1186 int err;
1190 1187
1191 task_nodes = cpuset_mems_allowed(task);
1192
1193 err = -ENOMEM; 1188 err = -ENOMEM;
1194 pm = (struct page_to_node *)__get_free_page(GFP_KERNEL); 1189 pm = (struct page_to_node *)__get_free_page(GFP_KERNEL);
1195 if (!pm) 1190 if (!pm)
@@ -1351,6 +1346,7 @@ SYSCALL_DEFINE6(move_pages, pid_t, pid, unsigned long, nr_pages,
1351 struct task_struct *task; 1346 struct task_struct *task;
1352 struct mm_struct *mm; 1347 struct mm_struct *mm;
1353 int err; 1348 int err;
1349 nodemask_t task_nodes;
1354 1350
1355 /* Check flags */ 1351 /* Check flags */
1356 if (flags & ~(MPOL_MF_MOVE|MPOL_MF_MOVE_ALL)) 1352 if (flags & ~(MPOL_MF_MOVE|MPOL_MF_MOVE_ALL))
@@ -1366,11 +1362,7 @@ SYSCALL_DEFINE6(move_pages, pid_t, pid, unsigned long, nr_pages,
1366 rcu_read_unlock(); 1362 rcu_read_unlock();
1367 return -ESRCH; 1363 return -ESRCH;
1368 } 1364 }
1369 mm = get_task_mm(task); 1365 get_task_struct(task);
1370 rcu_read_unlock();
1371
1372 if (!mm)
1373 return -EINVAL;
1374 1366
1375 /* 1367 /*
1376 * Check if this process has the right to modify the specified 1368 * Check if this process has the right to modify the specified
@@ -1378,7 +1370,6 @@ SYSCALL_DEFINE6(move_pages, pid_t, pid, unsigned long, nr_pages,
1378 * capabilities, superuser privileges or the same 1370 * capabilities, superuser privileges or the same
1379 * userid as the target process. 1371 * userid as the target process.
1380 */ 1372 */
1381 rcu_read_lock();
1382 tcred = __task_cred(task); 1373 tcred = __task_cred(task);
1383 if (cred->euid != tcred->suid && cred->euid != tcred->uid && 1374 if (cred->euid != tcred->suid && cred->euid != tcred->uid &&
1384 cred->uid != tcred->suid && cred->uid != tcred->uid && 1375 cred->uid != tcred->suid && cred->uid != tcred->uid &&
@@ -1393,16 +1384,25 @@ SYSCALL_DEFINE6(move_pages, pid_t, pid, unsigned long, nr_pages,
1393 if (err) 1384 if (err)
1394 goto out; 1385 goto out;
1395 1386
1396 if (nodes) { 1387 task_nodes = cpuset_mems_allowed(task);
1397 err = do_pages_move(mm, task, nr_pages, pages, nodes, status, 1388 mm = get_task_mm(task);
1398 flags); 1389 put_task_struct(task);
1399 } else { 1390
1400 err = do_pages_stat(mm, nr_pages, pages, status); 1391 if (mm) {
1401 } 1392 if (nodes)
1393 err = do_pages_move(mm, task_nodes, nr_pages, pages,
1394 nodes, status, flags);
1395 else
1396 err = do_pages_stat(mm, nr_pages, pages, status);
1397 } else
1398 err = -EINVAL;
1402 1399
1403out:
1404 mmput(mm); 1400 mmput(mm);
1405 return err; 1401 return err;
1402
1403out:
1404 put_task_struct(task);
1405 return err;
1406} 1406}
1407 1407
1408/* 1408/*
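
The migrate.c hunks restructure sys_move_pages(): the target task is pinned with get_task_struct(), its cpuset mask is captured with cpuset_mems_allowed(), and get_task_mm() is only called after the permission checks, with do_pages_move() receiving the precomputed nodemask instead of the task pointer. A condensed sketch of that ordering, with the credential checks omitted; the helper name is invented.

/* Sketch: pin the task first, take its mm last (ordering taken from the
 * move_pages()/migrate_pages() hunks above; security checks left out). */
static struct mm_struct *pin_target_mm(pid_t pid, nodemask_t *task_nodes)
{
	struct task_struct *task;
	struct mm_struct *mm;

	rcu_read_lock();
	task = pid ? find_task_by_vpid(pid) : current;
	if (!task) {
		rcu_read_unlock();
		return ERR_PTR(-ESRCH);
	}
	get_task_struct(task);			/* keep task alive past the RCU section */
	rcu_read_unlock();

	*task_nodes = cpuset_mems_allowed(task);
	mm = get_task_mm(task);			/* NULL if the task has no mm */
	put_task_struct(task);
	return mm ? mm : ERR_PTR(-EINVAL);
}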
diff --git a/mm/mincore.c b/mm/mincore.c
index 636a86876ff2..936b4cee8cb1 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -164,7 +164,7 @@ static void mincore_pmd_range(struct vm_area_struct *vma, pud_t *pud,
164 } 164 }
165 /* fall through */ 165 /* fall through */
166 } 166 }
167 if (pmd_none_or_clear_bad(pmd)) 167 if (pmd_none_or_trans_huge_or_clear_bad(pmd))
168 mincore_unmapped_range(vma, addr, next, vec); 168 mincore_unmapped_range(vma, addr, next, vec);
169 else 169 else
170 mincore_pte_range(vma, pmd, addr, next, vec); 170 mincore_pte_range(vma, pmd, addr, next, vec);
diff --git a/mm/mlock.c b/mm/mlock.c
index 4f4f53bdc65d..ef726e8aa8e9 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -385,10 +385,11 @@ static int do_mlock(unsigned long start, size_t len, int on)
385 return -EINVAL; 385 return -EINVAL;
386 if (end == start) 386 if (end == start)
387 return 0; 387 return 0;
388 vma = find_vma_prev(current->mm, start, &prev); 388 vma = find_vma(current->mm, start);
389 if (!vma || vma->vm_start > start) 389 if (!vma || vma->vm_start > start)
390 return -ENOMEM; 390 return -ENOMEM;
391 391
392 prev = vma->vm_prev;
392 if (start > vma->vm_start) 393 if (start > vma->vm_start)
393 prev = vma; 394 prev = vma;
394 395
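
Here and in mprotect.c below, find_vma_prev() callers switch to plain find_vma() plus vma->vm_prev, since the previous VMA is only interesting once a covering VMA has been found. A tiny sketch of the replacement idiom; the helper is invented, the logic is lifted from the do_mlock() hunk above.

/* Sketch: the find_vma() + vm_prev idiom replacing find_vma_prev().
 * Caller must hold mmap_sem. */
static struct vm_area_struct *vma_and_prev(struct mm_struct *mm,
					   unsigned long start,
					   struct vm_area_struct **prev)
{
	struct vm_area_struct *vma = find_vma(mm, start);

	if (!vma || vma->vm_start > start)
		return NULL;			/* no VMA covers start */
	*prev = vma->vm_prev;
	if (start > vma->vm_start)		/* start lies inside vma itself */
		*prev = vma;
	return vma;
}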
diff --git a/mm/mmap.c b/mm/mmap.c
index 7c112fbca405..b17a39f31a5e 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -453,9 +453,8 @@ static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma,
453} 453}
454 454
455/* 455/*
456 * Helper for vma_adjust in the split_vma insert case: 456 * Helper for vma_adjust() in the split_vma insert case: insert a vma into the
457 * insert vm structure into list and rbtree and anon_vma, 457 * mm's list and rbtree. It has already been inserted into the prio_tree.
458 * but it has already been inserted into prio_tree earlier.
459 */ 458 */
460static void __insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma) 459static void __insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma)
461{ 460{
@@ -954,6 +953,19 @@ void vm_stat_account(struct mm_struct *mm, unsigned long flags,
954#endif /* CONFIG_PROC_FS */ 953#endif /* CONFIG_PROC_FS */
955 954
956/* 955/*
956 * If a hint addr is less than mmap_min_addr change hint to be as
957 * low as possible but still greater than mmap_min_addr
958 */
959static inline unsigned long round_hint_to_min(unsigned long hint)
960{
961 hint &= PAGE_MASK;
962 if (((void *)hint != NULL) &&
963 (hint < mmap_min_addr))
964 return PAGE_ALIGN(mmap_min_addr);
965 return hint;
966}
967
968/*
957 * The caller must hold down_write(&current->mm->mmap_sem). 969 * The caller must hold down_write(&current->mm->mmap_sem).
958 */ 970 */
959 971
@@ -1117,9 +1129,9 @@ SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
1117 * A dummy user value is used because we are not locking 1129 * A dummy user value is used because we are not locking
1118 * memory so no accounting is necessary 1130 * memory so no accounting is necessary
1119 */ 1131 */
1120 len = ALIGN(len, huge_page_size(&default_hstate)); 1132 file = hugetlb_file_setup(HUGETLB_ANON_FILE, addr, len,
1121 file = hugetlb_file_setup(HUGETLB_ANON_FILE, len, VM_NORESERVE, 1133 VM_NORESERVE, &user,
1122 &user, HUGETLB_ANONHUGE_INODE); 1134 HUGETLB_ANONHUGE_INODE);
1123 if (IS_ERR(file)) 1135 if (IS_ERR(file))
1124 return PTR_ERR(file); 1136 return PTR_ERR(file);
1125 } 1137 }
@@ -1253,7 +1265,7 @@ munmap_back:
1253 */ 1265 */
1254 if (accountable_mapping(file, vm_flags)) { 1266 if (accountable_mapping(file, vm_flags)) {
1255 charged = len >> PAGE_SHIFT; 1267 charged = len >> PAGE_SHIFT;
1256 if (security_vm_enough_memory(charged)) 1268 if (security_vm_enough_memory_mm(mm, charged))
1257 return -ENOMEM; 1269 return -ENOMEM;
1258 vm_flags |= VM_ACCOUNT; 1270 vm_flags |= VM_ACCOUNT;
1259 } 1271 }
@@ -1284,8 +1296,9 @@ munmap_back:
1284 vma->vm_pgoff = pgoff; 1296 vma->vm_pgoff = pgoff;
1285 INIT_LIST_HEAD(&vma->anon_vma_chain); 1297 INIT_LIST_HEAD(&vma->anon_vma_chain);
1286 1298
1299 error = -EINVAL; /* when rejecting VM_GROWSDOWN|VM_GROWSUP */
1300
1287 if (file) { 1301 if (file) {
1288 error = -EINVAL;
1289 if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP)) 1302 if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP))
1290 goto free_vma; 1303 goto free_vma;
1291 if (vm_flags & VM_DENYWRITE) { 1304 if (vm_flags & VM_DENYWRITE) {
@@ -1311,6 +1324,8 @@ munmap_back:
1311 pgoff = vma->vm_pgoff; 1324 pgoff = vma->vm_pgoff;
1312 vm_flags = vma->vm_flags; 1325 vm_flags = vma->vm_flags;
1313 } else if (vm_flags & VM_SHARED) { 1326 } else if (vm_flags & VM_SHARED) {
1327 if (unlikely(vm_flags & (VM_GROWSDOWN|VM_GROWSUP)))
1328 goto free_vma;
1314 error = shmem_zero_setup(vma); 1329 error = shmem_zero_setup(vma);
1315 if (error) 1330 if (error)
1316 goto free_vma; 1331 goto free_vma;
@@ -1446,10 +1461,8 @@ void arch_unmap_area(struct mm_struct *mm, unsigned long addr)
1446 /* 1461 /*
1447 * Is this a new hole at the lowest possible address? 1462 * Is this a new hole at the lowest possible address?
1448 */ 1463 */
1449 if (addr >= TASK_UNMAPPED_BASE && addr < mm->free_area_cache) { 1464 if (addr >= TASK_UNMAPPED_BASE && addr < mm->free_area_cache)
1450 mm->free_area_cache = addr; 1465 mm->free_area_cache = addr;
1451 mm->cached_hole_size = ~0UL;
1452 }
1453} 1466}
1454 1467
1455/* 1468/*
@@ -1464,7 +1477,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
1464{ 1477{
1465 struct vm_area_struct *vma; 1478 struct vm_area_struct *vma;
1466 struct mm_struct *mm = current->mm; 1479 struct mm_struct *mm = current->mm;
1467 unsigned long addr = addr0; 1480 unsigned long addr = addr0, start_addr;
1468 1481
1469 /* requested length too big for entire address space */ 1482 /* requested length too big for entire address space */
1470 if (len > TASK_SIZE) 1483 if (len > TASK_SIZE)
@@ -1488,22 +1501,14 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
1488 mm->free_area_cache = mm->mmap_base; 1501 mm->free_area_cache = mm->mmap_base;
1489 } 1502 }
1490 1503
1504try_again:
1491 /* either no address requested or can't fit in requested address hole */ 1505 /* either no address requested or can't fit in requested address hole */
1492 addr = mm->free_area_cache; 1506 start_addr = addr = mm->free_area_cache;
1493
1494 /* make sure it can fit in the remaining address space */
1495 if (addr > len) {
1496 vma = find_vma(mm, addr-len);
1497 if (!vma || addr <= vma->vm_start)
1498 /* remember the address as a hint for next time */
1499 return (mm->free_area_cache = addr-len);
1500 }
1501 1507
1502 if (mm->mmap_base < len) 1508 if (addr < len)
1503 goto bottomup; 1509 goto fail;
1504
1505 addr = mm->mmap_base-len;
1506 1510
1511 addr -= len;
1507 do { 1512 do {
1508 /* 1513 /*
1509 * Lookup failure means no vma is above this address, 1514 * Lookup failure means no vma is above this address,
@@ -1523,7 +1528,21 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
1523 addr = vma->vm_start-len; 1528 addr = vma->vm_start-len;
1524 } while (len < vma->vm_start); 1529 } while (len < vma->vm_start);
1525 1530
1526bottomup: 1531fail:
1532 /*
1533 * if hint left us with no space for the requested
1534 * mapping then try again:
1535 *
1536 * Note: this is different with the case of bottomup
1537 * which does the fully line-search, but we use find_vma
1538 * here that causes some holes skipped.
1539 */
1540 if (start_addr != mm->mmap_base) {
1541 mm->free_area_cache = mm->mmap_base;
1542 mm->cached_hole_size = 0;
1543 goto try_again;
1544 }
1545
1527 /* 1546 /*
1528 * A failed mmap() very likely causes application failure, 1547 * A failed mmap() very likely causes application failure,
1529 * so fall back to the bottom-up function here. This scenario 1548 * so fall back to the bottom-up function here. This scenario
@@ -1628,7 +1647,6 @@ EXPORT_SYMBOL(find_vma);
1628 1647
1629/* 1648/*
1630 * Same as find_vma, but also return a pointer to the previous VMA in *pprev. 1649 * Same as find_vma, but also return a pointer to the previous VMA in *pprev.
1631 * Note: pprev is set to NULL when return value is NULL.
1632 */ 1650 */
1633struct vm_area_struct * 1651struct vm_area_struct *
1634find_vma_prev(struct mm_struct *mm, unsigned long addr, 1652find_vma_prev(struct mm_struct *mm, unsigned long addr,
@@ -1637,7 +1655,16 @@ find_vma_prev(struct mm_struct *mm, unsigned long addr,
1637 struct vm_area_struct *vma; 1655 struct vm_area_struct *vma;
1638 1656
1639 vma = find_vma(mm, addr); 1657 vma = find_vma(mm, addr);
1640 *pprev = vma ? vma->vm_prev : NULL; 1658 if (vma) {
1659 *pprev = vma->vm_prev;
1660 } else {
1661 struct rb_node *rb_node = mm->mm_rb.rb_node;
1662 *pprev = NULL;
1663 while (rb_node) {
1664 *pprev = rb_entry(rb_node, struct vm_area_struct, vm_rb);
1665 rb_node = rb_node->rb_right;
1666 }
1667 }
1641 return vma; 1668 return vma;
1642} 1669}
1643 1670
@@ -2192,7 +2219,7 @@ unsigned long do_brk(unsigned long addr, unsigned long len)
2192 if (mm->map_count > sysctl_max_map_count) 2219 if (mm->map_count > sysctl_max_map_count)
2193 return -ENOMEM; 2220 return -ENOMEM;
2194 2221
2195 if (security_vm_enough_memory(len >> PAGE_SHIFT)) 2222 if (security_vm_enough_memory_mm(mm, len >> PAGE_SHIFT))
2196 return -ENOMEM; 2223 return -ENOMEM;
2197 2224
2198 /* Can we just expand an old private anonymous mapping? */ 2225 /* Can we just expand an old private anonymous mapping? */
@@ -2236,7 +2263,6 @@ void exit_mmap(struct mm_struct *mm)
2236 struct mmu_gather tlb; 2263 struct mmu_gather tlb;
2237 struct vm_area_struct *vma; 2264 struct vm_area_struct *vma;
2238 unsigned long nr_accounted = 0; 2265 unsigned long nr_accounted = 0;
2239 unsigned long end;
2240 2266
2241 /* mm's last user has gone, and its about to be pulled down */ 2267 /* mm's last user has gone, and its about to be pulled down */
2242 mmu_notifier_release(mm); 2268 mmu_notifier_release(mm);
@@ -2261,11 +2287,11 @@ void exit_mmap(struct mm_struct *mm)
2261 tlb_gather_mmu(&tlb, mm, 1); 2287 tlb_gather_mmu(&tlb, mm, 1);
2262 /* update_hiwater_rss(mm) here? but nobody should be looking */ 2288 /* update_hiwater_rss(mm) here? but nobody should be looking */
2263 /* Use -1 here to ensure all VMAs in the mm are unmapped */ 2289 /* Use -1 here to ensure all VMAs in the mm are unmapped */
2264 end = unmap_vmas(&tlb, vma, 0, -1, &nr_accounted, NULL); 2290 unmap_vmas(&tlb, vma, 0, -1, &nr_accounted, NULL);
2265 vm_unacct_memory(nr_accounted); 2291 vm_unacct_memory(nr_accounted);
2266 2292
2267 free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, 0); 2293 free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, 0);
2268 tlb_finish_mmu(&tlb, 0, end); 2294 tlb_finish_mmu(&tlb, 0, -1);
2269 2295
2270 /* 2296 /*
2271 * Walk the list again, actually closing and freeing it, 2297 * Walk the list again, actually closing and freeing it,
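
The mmap.c hunks move round_hint_to_min() into this file, switch charge checks to security_vm_enough_memory_mm(), and make arch_get_unmapped_area_topdown() restart from mm->mmap_base when the cached free_area_cache hint leaves no room, instead of giving up straight away. A condensed sketch of that retry shape; it strips the cached_hole_size bookkeeping of the real function and the name is invented.

/* Condensed sketch of the top-down search retry added above: if the search
 * that started from the cached hint fails, reset to mmap_base and walk the
 * full range once before the caller falls back to the bottom-up allocator. */
static unsigned long topdown_search_sketch(struct mm_struct *mm, unsigned long len)
{
	unsigned long start_addr, addr;
	struct vm_area_struct *vma;

try_again:
	start_addr = addr = mm->free_area_cache;
	if (addr < len)
		goto fail;
	addr -= len;
	do {
		vma = find_vma(mm, addr);
		if (!vma || addr + len <= vma->vm_start)
			return mm->free_area_cache = addr;	/* found a hole */
		addr = vma->vm_start - len;
	} while (len < vma->vm_start);
fail:
	if (start_addr != mm->mmap_base) {
		mm->free_area_cache = mm->mmap_base;	/* retry over the full range */
		mm->cached_hole_size = 0;
		goto try_again;
	}
	return -ENOMEM;		/* the real function falls back to the bottom-up path here */
}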
diff --git a/mm/mmu_context.c b/mm/mmu_context.c
index cf332bc0080a..3dcfaf4ed355 100644
--- a/mm/mmu_context.c
+++ b/mm/mmu_context.c
@@ -53,7 +53,7 @@ void unuse_mm(struct mm_struct *mm)
53 struct task_struct *tsk = current; 53 struct task_struct *tsk = current;
54 54
55 task_lock(tsk); 55 task_lock(tsk);
56 sync_mm_rss(tsk, mm); 56 sync_mm_rss(mm);
57 tsk->mm = NULL; 57 tsk->mm = NULL;
58 /* active_mm is still 'mm' */ 58 /* active_mm is still 'mm' */
59 enter_lazy_tlb(mm, tsk); 59 enter_lazy_tlb(mm, tsk);
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 5a688a2756be..a40992610ab6 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -60,7 +60,7 @@ static void change_pte_range(struct mm_struct *mm, pmd_t *pmd,
60 ptent = pte_mkwrite(ptent); 60 ptent = pte_mkwrite(ptent);
61 61
62 ptep_modify_prot_commit(mm, addr, pte, ptent); 62 ptep_modify_prot_commit(mm, addr, pte, ptent);
63 } else if (PAGE_MIGRATION && !pte_file(oldpte)) { 63 } else if (IS_ENABLED(CONFIG_MIGRATION) && !pte_file(oldpte)) {
64 swp_entry_t entry = pte_to_swp_entry(oldpte); 64 swp_entry_t entry = pte_to_swp_entry(oldpte);
65 65
66 if (is_write_migration_entry(entry)) { 66 if (is_write_migration_entry(entry)) {
@@ -168,7 +168,7 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
168 if (!(oldflags & (VM_ACCOUNT|VM_WRITE|VM_HUGETLB| 168 if (!(oldflags & (VM_ACCOUNT|VM_WRITE|VM_HUGETLB|
169 VM_SHARED|VM_NORESERVE))) { 169 VM_SHARED|VM_NORESERVE))) {
170 charged = nrpages; 170 charged = nrpages;
171 if (security_vm_enough_memory(charged)) 171 if (security_vm_enough_memory_mm(mm, charged))
172 return -ENOMEM; 172 return -ENOMEM;
173 newflags |= VM_ACCOUNT; 173 newflags |= VM_ACCOUNT;
174 } 174 }
@@ -262,10 +262,11 @@ SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len,
262 262
263 down_write(&current->mm->mmap_sem); 263 down_write(&current->mm->mmap_sem);
264 264
265 vma = find_vma_prev(current->mm, start, &prev); 265 vma = find_vma(current->mm, start);
266 error = -ENOMEM; 266 error = -ENOMEM;
267 if (!vma) 267 if (!vma)
268 goto out; 268 goto out;
269 prev = vma->vm_prev;
269 if (unlikely(grows & PROT_GROWSDOWN)) { 270 if (unlikely(grows & PROT_GROWSDOWN)) {
270 if (vma->vm_start >= end) 271 if (vma->vm_start >= end)
271 goto out; 272 goto out;
diff --git a/mm/mremap.c b/mm/mremap.c
index 87bb8393e7d2..db8d983b5a7d 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -329,7 +329,7 @@ static struct vm_area_struct *vma_to_resize(unsigned long addr,
329 329
330 if (vma->vm_flags & VM_ACCOUNT) { 330 if (vma->vm_flags & VM_ACCOUNT) {
331 unsigned long charged = (new_len - old_len) >> PAGE_SHIFT; 331 unsigned long charged = (new_len - old_len) >> PAGE_SHIFT;
332 if (security_vm_enough_memory(charged)) 332 if (security_vm_enough_memory_mm(mm, charged))
333 goto Efault; 333 goto Efault;
334 *p = charged; 334 *p = charged;
335 } 335 }
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 2958fd8e7c9a..46bf2ed5594c 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -34,6 +34,7 @@
34#include <linux/ptrace.h> 34#include <linux/ptrace.h>
35#include <linux/freezer.h> 35#include <linux/freezer.h>
36#include <linux/ftrace.h> 36#include <linux/ftrace.h>
37#include <linux/ratelimit.h>
37 38
38#define CREATE_TRACE_POINTS 39#define CREATE_TRACE_POINTS
39#include <trace/events/oom.h> 40#include <trace/events/oom.h>
@@ -309,7 +310,7 @@ static enum oom_constraint constrained_alloc(struct zonelist *zonelist,
309 */ 310 */
310static struct task_struct *select_bad_process(unsigned int *ppoints, 311static struct task_struct *select_bad_process(unsigned int *ppoints,
311 unsigned long totalpages, struct mem_cgroup *memcg, 312 unsigned long totalpages, struct mem_cgroup *memcg,
312 const nodemask_t *nodemask) 313 const nodemask_t *nodemask, bool force_kill)
313{ 314{
314 struct task_struct *g, *p; 315 struct task_struct *g, *p;
315 struct task_struct *chosen = NULL; 316 struct task_struct *chosen = NULL;
@@ -335,7 +336,8 @@ static struct task_struct *select_bad_process(unsigned int *ppoints,
335 if (test_tsk_thread_flag(p, TIF_MEMDIE)) { 336 if (test_tsk_thread_flag(p, TIF_MEMDIE)) {
336 if (unlikely(frozen(p))) 337 if (unlikely(frozen(p)))
337 __thaw_task(p); 338 __thaw_task(p);
338 return ERR_PTR(-1UL); 339 if (!force_kill)
340 return ERR_PTR(-1UL);
339 } 341 }
340 if (!p->mm) 342 if (!p->mm)
341 continue; 343 continue;
@@ -353,7 +355,7 @@ static struct task_struct *select_bad_process(unsigned int *ppoints,
353 if (p == current) { 355 if (p == current) {
354 chosen = p; 356 chosen = p;
355 *ppoints = 1000; 357 *ppoints = 1000;
356 } else { 358 } else if (!force_kill) {
357 /* 359 /*
358 * If this task is not being ptraced on exit, 360 * If this task is not being ptraced on exit,
359 * then wait for it to finish before killing 361 * then wait for it to finish before killing
@@ -434,66 +436,18 @@ static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order,
434} 436}
435 437
436#define K(x) ((x) << (PAGE_SHIFT-10)) 438#define K(x) ((x) << (PAGE_SHIFT-10))
437static int oom_kill_task(struct task_struct *p) 439static void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
438{ 440 unsigned int points, unsigned long totalpages,
439 struct task_struct *q; 441 struct mem_cgroup *memcg, nodemask_t *nodemask,
440 struct mm_struct *mm; 442 const char *message)
441
442 p = find_lock_task_mm(p);
443 if (!p)
444 return 1;
445
446 /* mm cannot be safely dereferenced after task_unlock(p) */
447 mm = p->mm;
448
449 pr_err("Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB\n",
450 task_pid_nr(p), p->comm, K(p->mm->total_vm),
451 K(get_mm_counter(p->mm, MM_ANONPAGES)),
452 K(get_mm_counter(p->mm, MM_FILEPAGES)));
453 task_unlock(p);
454
455 /*
456 * Kill all user processes sharing p->mm in other thread groups, if any.
457 * They don't get access to memory reserves or a higher scheduler
458 * priority, though, to avoid depletion of all memory or task
459 * starvation. This prevents mm->mmap_sem livelock when an oom killed
460 * task cannot exit because it requires the semaphore and its contended
461 * by another thread trying to allocate memory itself. That thread will
462 * now get access to memory reserves since it has a pending fatal
463 * signal.
464 */
465 for_each_process(q)
466 if (q->mm == mm && !same_thread_group(q, p) &&
467 !(q->flags & PF_KTHREAD)) {
468 if (q->signal->oom_score_adj == OOM_SCORE_ADJ_MIN)
469 continue;
470
471 task_lock(q); /* Protect ->comm from prctl() */
472 pr_err("Kill process %d (%s) sharing same memory\n",
473 task_pid_nr(q), q->comm);
474 task_unlock(q);
475 force_sig(SIGKILL, q);
476 }
477
478 set_tsk_thread_flag(p, TIF_MEMDIE);
479 force_sig(SIGKILL, p);
480
481 return 0;
482}
483#undef K
484
485static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
486 unsigned int points, unsigned long totalpages,
487 struct mem_cgroup *memcg, nodemask_t *nodemask,
488 const char *message)
489{ 443{
490 struct task_struct *victim = p; 444 struct task_struct *victim = p;
491 struct task_struct *child; 445 struct task_struct *child;
492 struct task_struct *t = p; 446 struct task_struct *t = p;
447 struct mm_struct *mm;
493 unsigned int victim_points = 0; 448 unsigned int victim_points = 0;
494 449 static DEFINE_RATELIMIT_STATE(oom_rs, DEFAULT_RATELIMIT_INTERVAL,
495 if (printk_ratelimit()) 450 DEFAULT_RATELIMIT_BURST);
496 dump_header(p, gfp_mask, order, memcg, nodemask);
497 451
498 /* 452 /*
499 * If the task is already exiting, don't alarm the sysadmin or kill 453 * If the task is already exiting, don't alarm the sysadmin or kill
@@ -501,9 +455,12 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
501 */ 455 */
502 if (p->flags & PF_EXITING) { 456 if (p->flags & PF_EXITING) {
503 set_tsk_thread_flag(p, TIF_MEMDIE); 457 set_tsk_thread_flag(p, TIF_MEMDIE);
504 return 0; 458 return;
505 } 459 }
506 460
461 if (__ratelimit(&oom_rs))
462 dump_header(p, gfp_mask, order, memcg, nodemask);
463
507 task_lock(p); 464 task_lock(p);
508 pr_err("%s: Kill process %d (%s) score %d or sacrifice child\n", 465 pr_err("%s: Kill process %d (%s) score %d or sacrifice child\n",
509 message, task_pid_nr(p), p->comm, points); 466 message, task_pid_nr(p), p->comm, points);
@@ -533,8 +490,44 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
533 } 490 }
534 } while_each_thread(p, t); 491 } while_each_thread(p, t);
535 492
536 return oom_kill_task(victim); 493 victim = find_lock_task_mm(victim);
494 if (!victim)
495 return;
496
497 /* mm cannot safely be dereferenced after task_unlock(victim) */
498 mm = victim->mm;
499 pr_err("Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB\n",
500 task_pid_nr(victim), victim->comm, K(victim->mm->total_vm),
501 K(get_mm_counter(victim->mm, MM_ANONPAGES)),
502 K(get_mm_counter(victim->mm, MM_FILEPAGES)));
503 task_unlock(victim);
504
505 /*
506 * Kill all user processes sharing victim->mm in other thread groups, if
507 * any. They don't get access to memory reserves, though, to avoid
508 * depletion of all memory. This prevents mm->mmap_sem livelock when an
509 * oom killed thread cannot exit because it requires the semaphore and
510 * its contended by another thread trying to allocate memory itself.
511 * That thread will now get access to memory reserves since it has a
512 * pending fatal signal.
513 */
514 for_each_process(p)
515 if (p->mm == mm && !same_thread_group(p, victim) &&
516 !(p->flags & PF_KTHREAD)) {
517 if (p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN)
518 continue;
519
520 task_lock(p); /* Protect ->comm from prctl() */
521 pr_err("Kill process %d (%s) sharing same memory\n",
522 task_pid_nr(p), p->comm);
523 task_unlock(p);
524 do_send_sig_info(SIGKILL, SEND_SIG_FORCED, p, true);
525 }
526
527 set_tsk_thread_flag(victim, TIF_MEMDIE);
528 do_send_sig_info(SIGKILL, SEND_SIG_FORCED, victim, true);
537} 529}
530#undef K
538 531
539/* 532/*
540 * Determines whether the kernel must panic because of the panic_on_oom sysctl. 533 * Determines whether the kernel must panic because of the panic_on_oom sysctl.
@@ -561,7 +554,8 @@ static void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask,
561} 554}
562 555
563#ifdef CONFIG_CGROUP_MEM_RES_CTLR 556#ifdef CONFIG_CGROUP_MEM_RES_CTLR
564void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask) 557void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
558 int order)
565{ 559{
566 unsigned long limit; 560 unsigned long limit;
567 unsigned int points = 0; 561 unsigned int points = 0;
@@ -577,18 +571,13 @@ void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask)
577 return; 571 return;
578 } 572 }
579 573
580 check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, 0, NULL); 574 check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL);
581 limit = mem_cgroup_get_limit(memcg) >> PAGE_SHIFT; 575 limit = mem_cgroup_get_limit(memcg) >> PAGE_SHIFT;
582 read_lock(&tasklist_lock); 576 read_lock(&tasklist_lock);
583retry: 577 p = select_bad_process(&points, limit, memcg, NULL, false);
584 p = select_bad_process(&points, limit, memcg, NULL); 578 if (p && PTR_ERR(p) != -1UL)
585 if (!p || PTR_ERR(p) == -1UL) 579 oom_kill_process(p, gfp_mask, order, points, limit, memcg, NULL,
586 goto out; 580 "Memory cgroup out of memory");
587
588 if (oom_kill_process(p, gfp_mask, 0, points, limit, memcg, NULL,
589 "Memory cgroup out of memory"))
590 goto retry;
591out:
592 read_unlock(&tasklist_lock); 581 read_unlock(&tasklist_lock);
593} 582}
594#endif 583#endif
@@ -700,6 +689,7 @@ static void clear_system_oom(void)
700 * @gfp_mask: memory allocation flags 689 * @gfp_mask: memory allocation flags
701 * @order: amount of memory being requested as a power of 2 690 * @order: amount of memory being requested as a power of 2
702 * @nodemask: nodemask passed to page allocator 691 * @nodemask: nodemask passed to page allocator
692 * @force_kill: true if a task must be killed, even if others are exiting
703 * 693 *
704 * If we run out of memory, we have the choice between either 694 * If we run out of memory, we have the choice between either
705 * killing a random task (bad), letting the system crash (worse) 695 * killing a random task (bad), letting the system crash (worse)
@@ -707,7 +697,7 @@ static void clear_system_oom(void)
707 * don't have to be perfect here, we just have to be good. 697 * don't have to be perfect here, we just have to be good.
708 */ 698 */
709void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, 699void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
710 int order, nodemask_t *nodemask) 700 int order, nodemask_t *nodemask, bool force_kill)
711{ 701{
712 const nodemask_t *mpol_mask; 702 const nodemask_t *mpol_mask;
713 struct task_struct *p; 703 struct task_struct *p;
@@ -745,33 +735,25 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
745 if (sysctl_oom_kill_allocating_task && 735 if (sysctl_oom_kill_allocating_task &&
746 !oom_unkillable_task(current, NULL, nodemask) && 736 !oom_unkillable_task(current, NULL, nodemask) &&
747 current->mm) { 737 current->mm) {
748 /* 738 oom_kill_process(current, gfp_mask, order, 0, totalpages, NULL,
749 * oom_kill_process() needs tasklist_lock held. If it returns 739 nodemask,
750 * non-zero, current could not be killed so we must fallback to 740 "Out of memory (oom_kill_allocating_task)");
751 * the tasklist scan.
752 */
753 if (!oom_kill_process(current, gfp_mask, order, 0, totalpages,
754 NULL, nodemask,
755 "Out of memory (oom_kill_allocating_task)"))
756 goto out;
757 }
758
759retry:
760 p = select_bad_process(&points, totalpages, NULL, mpol_mask);
761 if (PTR_ERR(p) == -1UL)
762 goto out; 741 goto out;
742 }
763 743
744 p = select_bad_process(&points, totalpages, NULL, mpol_mask,
745 force_kill);
764 /* Found nothing?!?! Either we hang forever, or we panic. */ 746 /* Found nothing?!?! Either we hang forever, or we panic. */
765 if (!p) { 747 if (!p) {
766 dump_header(NULL, gfp_mask, order, NULL, mpol_mask); 748 dump_header(NULL, gfp_mask, order, NULL, mpol_mask);
767 read_unlock(&tasklist_lock); 749 read_unlock(&tasklist_lock);
768 panic("Out of memory and no killable processes...\n"); 750 panic("Out of memory and no killable processes...\n");
769 } 751 }
770 752 if (PTR_ERR(p) != -1UL) {
771 if (oom_kill_process(p, gfp_mask, order, points, totalpages, NULL, 753 oom_kill_process(p, gfp_mask, order, points, totalpages, NULL,
772 nodemask, "Out of memory")) 754 nodemask, "Out of memory");
773 goto retry; 755 killed = 1;
774 killed = 1; 756 }
775out: 757out:
776 read_unlock(&tasklist_lock); 758 read_unlock(&tasklist_lock);
777 759
@@ -792,7 +774,7 @@ out:
792void pagefault_out_of_memory(void) 774void pagefault_out_of_memory(void)
793{ 775{
794 if (try_set_system_oom()) { 776 if (try_set_system_oom()) {
795 out_of_memory(NULL, 0, 0, NULL); 777 out_of_memory(NULL, 0, 0, NULL, false);
796 clear_system_oom(); 778 clear_system_oom();
797 } 779 }
798 if (!test_thread_flag(TIF_MEMDIE)) 780 if (!test_thread_flag(TIF_MEMDIE))
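
The oom_kill.c changes above fold oom_kill_task() into oom_kill_process(), add a force_kill path, and switch the header dump from the global printk_ratelimit() to a dedicated DEFINE_RATELIMIT_STATE, so OOM reports are throttled independently of other printk users. As a rough single-threaded userspace illustration of the interval-plus-burst idea behind struct ratelimit_state, here is a minimal sketch; the names and the 5 s / 10 event numbers mirror the usual DEFAULT_RATELIMIT_* values, but the implementation is not the kernel's.

#define _POSIX_C_SOURCE 200809L
#include <stdbool.h>
#include <stdio.h>
#include <time.h>

struct ratelimit {
        double interval;        /* window length, seconds */
        int burst;              /* events allowed per window */
        int printed;            /* events let through in the current window */
        int missed;             /* events suppressed in the current window */
        struct timespec begin;
        bool begun;
};

static bool ratelimit_ok(struct ratelimit *rs)
{
        struct timespec now;
        double elapsed;

        clock_gettime(CLOCK_MONOTONIC, &now);
        if (!rs->begun) {
                rs->begin = now;
                rs->begun = true;
        }
        elapsed = (double)(now.tv_sec - rs->begin.tv_sec) +
                  (double)(now.tv_nsec - rs->begin.tv_nsec) / 1e9;

        if (elapsed > rs->interval) {
                /* Window expired: report what was dropped, start a new one. */
                if (rs->missed)
                        fprintf(stderr, "%d callbacks suppressed\n", rs->missed);
                rs->begin = now;
                rs->printed = 0;
                rs->missed = 0;
        }
        if (rs->printed < rs->burst) {
                rs->printed++;
                return true;    /* caller may emit its (expensive) report */
        }
        rs->missed++;
        return false;
}

int main(void)
{
        struct ratelimit oom_rs = { .interval = 5.0, .burst = 10 };
        int i;

        for (i = 0; i < 100; i++)
                if (ratelimit_ok(&oom_rs))
                        printf("would dump OOM header, iteration %d\n", i);
        return 0;
}

The real ratelimit_state also takes a spinlock around this bookkeeping; the sketch skips that because it is single-threaded (and may need -lrt on older glibc).
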
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 363ba7082ef5..26adea8ca2e7 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -95,6 +95,8 @@ unsigned long vm_dirty_bytes;
95 */ 95 */
96unsigned int dirty_writeback_interval = 5 * 100; /* centiseconds */ 96unsigned int dirty_writeback_interval = 5 * 100; /* centiseconds */
97 97
98EXPORT_SYMBOL_GPL(dirty_writeback_interval);
99
98/* 100/*
99 * The longest time for which data is allowed to remain dirty 101 * The longest time for which data is allowed to remain dirty
100 */ 102 */
@@ -1472,6 +1474,7 @@ void throttle_vm_writeout(gfp_t gfp_mask)
1472 1474
1473 for ( ; ; ) { 1475 for ( ; ; ) {
1474 global_dirty_limits(&background_thresh, &dirty_thresh); 1476 global_dirty_limits(&background_thresh, &dirty_thresh);
1477 dirty_thresh = hard_dirty_limit(dirty_thresh);
1475 1478
1476 /* 1479 /*
1477 * Boost the allowable dirty threshold a bit for page 1480 * Boost the allowable dirty threshold a bit for page
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index a13ded1938f0..a712fb9e04ce 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1161,11 +1161,47 @@ void drain_local_pages(void *arg)
1161} 1161}
1162 1162
1163/* 1163/*
1164 * Spill all the per-cpu pages from all CPUs back into the buddy allocator 1164 * Spill all the per-cpu pages from all CPUs back into the buddy allocator.
1165 *
1166 * Note that this code is protected against sending an IPI to an offline
1167 * CPU but does not guarantee sending an IPI to newly hotplugged CPUs:
1168 * on_each_cpu_mask() blocks hotplug and won't talk to offlined CPUs but
1169 * nothing keeps CPUs from showing up after we populated the cpumask and
1170 * before the call to on_each_cpu_mask().
1165 */ 1171 */
1166void drain_all_pages(void) 1172void drain_all_pages(void)
1167{ 1173{
1168 on_each_cpu(drain_local_pages, NULL, 1); 1174 int cpu;
1175 struct per_cpu_pageset *pcp;
1176 struct zone *zone;
1177
1178 /*
1179 * Allocate in the BSS so we wont require allocation in
1180 * direct reclaim path for CONFIG_CPUMASK_OFFSTACK=y
1181 */
1182 static cpumask_t cpus_with_pcps;
1183
1184 /*
1185 * We don't care about racing with CPU hotplug event
1186 * as offline notification will cause the notified
1187 * cpu to drain that CPU pcps and on_each_cpu_mask
1188 * disables preemption as part of its processing
1189 */
1190 for_each_online_cpu(cpu) {
1191 bool has_pcps = false;
1192 for_each_populated_zone(zone) {
1193 pcp = per_cpu_ptr(zone->pageset, cpu);
1194 if (pcp->pcp.count) {
1195 has_pcps = true;
1196 break;
1197 }
1198 }
1199 if (has_pcps)
1200 cpumask_set_cpu(cpu, &cpus_with_pcps);
1201 else
1202 cpumask_clear_cpu(cpu, &cpus_with_pcps);
1203 }
1204 on_each_cpu_mask(&cpus_with_pcps, drain_local_pages, NULL, 1);
1169} 1205}
1170 1206
1171#ifdef CONFIG_HIBERNATION 1207#ifdef CONFIG_HIBERNATION
@@ -1968,7 +2004,7 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
1968 goto out; 2004 goto out;
1969 } 2005 }
1970 /* Exhausted what can be done so it's blamo time */ 2006 /* Exhausted what can be done so it's blamo time */
1971 out_of_memory(zonelist, gfp_mask, order, nodemask); 2007 out_of_memory(zonelist, gfp_mask, order, nodemask, false);
1972 2008
1973out: 2009out:
1974 clear_zonelist_oom(zonelist, gfp_mask); 2010 clear_zonelist_oom(zonelist, gfp_mask);
@@ -1990,7 +2026,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
1990 if (!order) 2026 if (!order)
1991 return NULL; 2027 return NULL;
1992 2028
1993 if (compaction_deferred(preferred_zone)) { 2029 if (compaction_deferred(preferred_zone, order)) {
1994 *deferred_compaction = true; 2030 *deferred_compaction = true;
1995 return NULL; 2031 return NULL;
1996 } 2032 }
@@ -2012,6 +2048,8 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
2012 if (page) { 2048 if (page) {
2013 preferred_zone->compact_considered = 0; 2049 preferred_zone->compact_considered = 0;
2014 preferred_zone->compact_defer_shift = 0; 2050 preferred_zone->compact_defer_shift = 0;
2051 if (order >= preferred_zone->compact_order_failed)
2052 preferred_zone->compact_order_failed = order + 1;
2015 count_vm_event(COMPACTSUCCESS); 2053 count_vm_event(COMPACTSUCCESS);
2016 return page; 2054 return page;
2017 } 2055 }
@@ -2028,7 +2066,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
2028 * defer if the failure was a sync compaction failure. 2066 * defer if the failure was a sync compaction failure.
2029 */ 2067 */
2030 if (sync_migration) 2068 if (sync_migration)
2031 defer_compaction(preferred_zone); 2069 defer_compaction(preferred_zone, order);
2032 2070
2033 cond_resched(); 2071 cond_resched();
2034 } 2072 }
@@ -2306,6 +2344,10 @@ rebalance:
2306 if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) { 2344 if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) {
2307 if (oom_killer_disabled) 2345 if (oom_killer_disabled)
2308 goto nopage; 2346 goto nopage;
2347 /* Coredumps can quickly deplete all memory reserves */
2348 if ((current->flags & PF_DUMPCORE) &&
2349 !(gfp_mask & __GFP_NOFAIL))
2350 goto nopage;
2309 page = __alloc_pages_may_oom(gfp_mask, order, 2351 page = __alloc_pages_may_oom(gfp_mask, order,
2310 zonelist, high_zoneidx, 2352 zonelist, high_zoneidx,
2311 nodemask, preferred_zone, 2353 nodemask, preferred_zone,
@@ -2378,8 +2420,9 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
2378{ 2420{
2379 enum zone_type high_zoneidx = gfp_zone(gfp_mask); 2421 enum zone_type high_zoneidx = gfp_zone(gfp_mask);
2380 struct zone *preferred_zone; 2422 struct zone *preferred_zone;
2381 struct page *page; 2423 struct page *page = NULL;
2382 int migratetype = allocflags_to_migratetype(gfp_mask); 2424 int migratetype = allocflags_to_migratetype(gfp_mask);
2425 unsigned int cpuset_mems_cookie;
2383 2426
2384 gfp_mask &= gfp_allowed_mask; 2427 gfp_mask &= gfp_allowed_mask;
2385 2428
@@ -2398,15 +2441,15 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
2398 if (unlikely(!zonelist->_zonerefs->zone)) 2441 if (unlikely(!zonelist->_zonerefs->zone))
2399 return NULL; 2442 return NULL;
2400 2443
2401 get_mems_allowed(); 2444retry_cpuset:
2445 cpuset_mems_cookie = get_mems_allowed();
2446
2402 /* The preferred zone is used for statistics later */ 2447 /* The preferred zone is used for statistics later */
2403 first_zones_zonelist(zonelist, high_zoneidx, 2448 first_zones_zonelist(zonelist, high_zoneidx,
2404 nodemask ? : &cpuset_current_mems_allowed, 2449 nodemask ? : &cpuset_current_mems_allowed,
2405 &preferred_zone); 2450 &preferred_zone);
2406 if (!preferred_zone) { 2451 if (!preferred_zone)
2407 put_mems_allowed(); 2452 goto out;
2408 return NULL;
2409 }
2410 2453
2411 /* First allocation attempt */ 2454 /* First allocation attempt */
2412 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, 2455 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
@@ -2416,9 +2459,19 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
2416 page = __alloc_pages_slowpath(gfp_mask, order, 2459 page = __alloc_pages_slowpath(gfp_mask, order,
2417 zonelist, high_zoneidx, nodemask, 2460 zonelist, high_zoneidx, nodemask,
2418 preferred_zone, migratetype); 2461 preferred_zone, migratetype);
2419 put_mems_allowed();
2420 2462
2421 trace_mm_page_alloc(page, order, gfp_mask, migratetype); 2463 trace_mm_page_alloc(page, order, gfp_mask, migratetype);
2464
2465out:
2466 /*
2467 * When updating a task's mems_allowed, it is possible to race with
2468 * parallel threads in such a way that an allocation can fail while
2469 * the mask is being updated. If a page allocation is about to fail,
2470 * check if the cpuset changed during allocation and if so, retry.
2471 */
2472 if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
2473 goto retry_cpuset;
2474
2422 return page; 2475 return page;
2423} 2476}
2424EXPORT_SYMBOL(__alloc_pages_nodemask); 2477EXPORT_SYMBOL(__alloc_pages_nodemask);
@@ -2632,13 +2685,15 @@ void si_meminfo_node(struct sysinfo *val, int nid)
2632bool skip_free_areas_node(unsigned int flags, int nid) 2685bool skip_free_areas_node(unsigned int flags, int nid)
2633{ 2686{
2634 bool ret = false; 2687 bool ret = false;
2688 unsigned int cpuset_mems_cookie;
2635 2689
2636 if (!(flags & SHOW_MEM_FILTER_NODES)) 2690 if (!(flags & SHOW_MEM_FILTER_NODES))
2637 goto out; 2691 goto out;
2638 2692
2639 get_mems_allowed(); 2693 do {
2640 ret = !node_isset(nid, cpuset_current_mems_allowed); 2694 cpuset_mems_cookie = get_mems_allowed();
2641 put_mems_allowed(); 2695 ret = !node_isset(nid, cpuset_current_mems_allowed);
2696 } while (!put_mems_allowed(cpuset_mems_cookie));
2642out: 2697out:
2643 return ret; 2698 return ret;
2644} 2699}
@@ -3925,18 +3980,6 @@ void __init free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn)
3925 } 3980 }
3926} 3981}
3927 3982
3928int __init add_from_early_node_map(struct range *range, int az,
3929 int nr_range, int nid)
3930{
3931 unsigned long start_pfn, end_pfn;
3932 int i;
3933
3934 /* need to go over early_node_map to find out good range for node */
3935 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL)
3936 nr_range = add_range(range, az, nr_range, start_pfn, end_pfn);
3937 return nr_range;
3938}
3939
3940/** 3983/**
3941 * sparse_memory_present_with_active_regions - Call memory_present for each active range 3984 * sparse_memory_present_with_active_regions - Call memory_present for each active range
3942 * @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used. 3985 * @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used.
@@ -4521,7 +4564,7 @@ static unsigned long __init early_calculate_totalpages(void)
4521 * memory. When they don't, some nodes will have more kernelcore than 4564 * memory. When they don't, some nodes will have more kernelcore than
4522 * others 4565 * others
4523 */ 4566 */
4524static void __init find_zone_movable_pfns_for_nodes(unsigned long *movable_pfn) 4567static void __init find_zone_movable_pfns_for_nodes(void)
4525{ 4568{
4526 int i, nid; 4569 int i, nid;
4527 unsigned long usable_startpfn; 4570 unsigned long usable_startpfn;
@@ -4713,7 +4756,7 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)
4713 4756
4714 /* Find the PFNs that ZONE_MOVABLE begins at in each node */ 4757 /* Find the PFNs that ZONE_MOVABLE begins at in each node */
4715 memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn)); 4758 memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn));
4716 find_zone_movable_pfns_for_nodes(zone_movable_pfn); 4759 find_zone_movable_pfns_for_nodes();
4717 4760
4718 /* Print out the zone ranges */ 4761 /* Print out the zone ranges */
4719 printk("Zone PFN ranges:\n"); 4762 printk("Zone PFN ranges:\n");
@@ -4823,6 +4866,7 @@ static int page_alloc_cpu_notify(struct notifier_block *self,
4823 int cpu = (unsigned long)hcpu; 4866 int cpu = (unsigned long)hcpu;
4824 4867
4825 if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) { 4868 if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
4869 lru_add_drain_cpu(cpu);
4826 drain_pages(cpu); 4870 drain_pages(cpu);
4827 4871
4828 /* 4872 /*
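
Two themes dominate the page_alloc.c hunks: drain_all_pages() now IPIs only the CPUs that actually hold per-cpu pages, and the allocator brackets its work with a cpuset_mems_cookie so a failed allocation is retried when put_mems_allowed() reports that mems_allowed changed underneath it. The sketch below shows the second pattern in plain C11: a sequence-counter cookie read before the attempt and rechecked afterwards. It is a deliberately single-threaded toy with invented names (read_begin, read_retry, try_alloc), and the kernel's seqcount has stricter memory-ordering rules than the default atomics used here.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

/* Sequence counter and the data it guards (a stand-in for mems_allowed). */
static _Atomic unsigned int seq;
static int allowed_node;

static unsigned int read_begin(void)           /* ~ get_mems_allowed() */
{
        unsigned int s;

        do {
                s = atomic_load(&seq);
        } while (s & 1);        /* odd means a writer is mid-update */
        return s;
}

static bool read_retry(unsigned int cookie)    /* ~ !put_mems_allowed(cookie) */
{
        return atomic_load(&seq) != cookie;
}

static void write_update(int node)             /* ~ a cpuset mems_allowed change */
{
        atomic_fetch_add(&seq, 1);      /* becomes odd: update in progress */
        allowed_node = node;
        atomic_fetch_add(&seq, 1);      /* becomes even again: update done */
}

/* Pretend allocation that only succeeds once node 1 is allowed. */
static bool try_alloc(void)
{
        return allowed_node == 1;
}

int main(void)
{
        unsigned int cookie;
        bool got_page;

        do {
                cookie = read_begin();
                got_page = try_alloc();
                if (!got_page)
                        write_update(1);   /* simulate a concurrent cpuset update */
                /* Retry only if the attempt failed AND the mask moved. */
        } while (!got_page && read_retry(cookie));

        printf("allocation %s\n", got_page ? "succeeded" : "failed");
        return 0;
}

The same cookie idiom reappears below in the slab.c and slub.c hunks, replacing the old get_mems_allowed()/put_mems_allowed() critical sections.
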
diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c
index de1616aa9b1e..1ccbd714059c 100644
--- a/mm/page_cgroup.c
+++ b/mm/page_cgroup.c
@@ -379,13 +379,15 @@ static struct swap_cgroup *lookup_swap_cgroup(swp_entry_t ent,
379 pgoff_t offset = swp_offset(ent); 379 pgoff_t offset = swp_offset(ent);
380 struct swap_cgroup_ctrl *ctrl; 380 struct swap_cgroup_ctrl *ctrl;
381 struct page *mappage; 381 struct page *mappage;
382 struct swap_cgroup *sc;
382 383
383 ctrl = &swap_cgroup_ctrl[swp_type(ent)]; 384 ctrl = &swap_cgroup_ctrl[swp_type(ent)];
384 if (ctrlp) 385 if (ctrlp)
385 *ctrlp = ctrl; 386 *ctrlp = ctrl;
386 387
387 mappage = ctrl->map[offset / SC_PER_PAGE]; 388 mappage = ctrl->map[offset / SC_PER_PAGE];
388 return page_address(mappage) + offset % SC_PER_PAGE; 389 sc = page_address(mappage);
390 return sc + offset % SC_PER_PAGE;
389} 391}
390 392
391/** 393/**
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index 2f5cf10ff660..aa9701e12714 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -59,7 +59,7 @@ again:
59 continue; 59 continue;
60 60
61 split_huge_page_pmd(walk->mm, pmd); 61 split_huge_page_pmd(walk->mm, pmd);
62 if (pmd_none_or_clear_bad(pmd)) 62 if (pmd_none_or_trans_huge_or_clear_bad(pmd))
63 goto again; 63 goto again;
64 err = walk_pte_range(pmd, addr, next, walk); 64 err = walk_pte_range(pmd, addr, next, walk);
65 if (err) 65 if (err)
diff --git a/mm/percpu-vm.c b/mm/percpu-vm.c
index 12a48a88c0d8..405d331804c3 100644
--- a/mm/percpu-vm.c
+++ b/mm/percpu-vm.c
@@ -184,8 +184,7 @@ static void pcpu_unmap_pages(struct pcpu_chunk *chunk,
184 page_end - page_start); 184 page_end - page_start);
185 } 185 }
186 186
187 for (i = page_start; i < page_end; i++) 187 bitmap_clear(populated, page_start, page_end - page_start);
188 __clear_bit(i, populated);
189} 188}
190 189
191/** 190/**
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
index eb663fb533e0..5a74fea182f1 100644
--- a/mm/pgtable-generic.c
+++ b/mm/pgtable-generic.c
@@ -70,10 +70,11 @@ int pmdp_clear_flush_young(struct vm_area_struct *vma,
70 unsigned long address, pmd_t *pmdp) 70 unsigned long address, pmd_t *pmdp)
71{ 71{
72 int young; 72 int young;
73#ifndef CONFIG_TRANSPARENT_HUGEPAGE 73#ifdef CONFIG_TRANSPARENT_HUGEPAGE
74 VM_BUG_ON(address & ~HPAGE_PMD_MASK);
75#else
74 BUG(); 76 BUG();
75#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ 77#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
76 VM_BUG_ON(address & ~HPAGE_PMD_MASK);
77 young = pmdp_test_and_clear_young(vma, address, pmdp); 78 young = pmdp_test_and_clear_young(vma, address, pmdp);
78 if (young) 79 if (young)
79 flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); 80 flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
diff --git a/mm/rmap.c b/mm/rmap.c
index c8454e06b6c8..5b5ad584ffb7 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -120,6 +120,21 @@ static void anon_vma_chain_free(struct anon_vma_chain *anon_vma_chain)
120 kmem_cache_free(anon_vma_chain_cachep, anon_vma_chain); 120 kmem_cache_free(anon_vma_chain_cachep, anon_vma_chain);
121} 121}
122 122
123static void anon_vma_chain_link(struct vm_area_struct *vma,
124 struct anon_vma_chain *avc,
125 struct anon_vma *anon_vma)
126{
127 avc->vma = vma;
128 avc->anon_vma = anon_vma;
129 list_add(&avc->same_vma, &vma->anon_vma_chain);
130
131 /*
132 * It's critical to add new vmas to the tail of the anon_vma,
133 * see comment in huge_memory.c:__split_huge_page().
134 */
135 list_add_tail(&avc->same_anon_vma, &anon_vma->head);
136}
137
123/** 138/**
124 * anon_vma_prepare - attach an anon_vma to a memory region 139 * anon_vma_prepare - attach an anon_vma to a memory region
125 * @vma: the memory region in question 140 * @vma: the memory region in question
@@ -175,10 +190,7 @@ int anon_vma_prepare(struct vm_area_struct *vma)
175 spin_lock(&mm->page_table_lock); 190 spin_lock(&mm->page_table_lock);
176 if (likely(!vma->anon_vma)) { 191 if (likely(!vma->anon_vma)) {
177 vma->anon_vma = anon_vma; 192 vma->anon_vma = anon_vma;
178 avc->anon_vma = anon_vma; 193 anon_vma_chain_link(vma, avc, anon_vma);
179 avc->vma = vma;
180 list_add(&avc->same_vma, &vma->anon_vma_chain);
181 list_add_tail(&avc->same_anon_vma, &anon_vma->head);
182 allocated = NULL; 194 allocated = NULL;
183 avc = NULL; 195 avc = NULL;
184 } 196 }
@@ -224,21 +236,6 @@ static inline void unlock_anon_vma_root(struct anon_vma *root)
224 mutex_unlock(&root->mutex); 236 mutex_unlock(&root->mutex);
225} 237}
226 238
227static void anon_vma_chain_link(struct vm_area_struct *vma,
228 struct anon_vma_chain *avc,
229 struct anon_vma *anon_vma)
230{
231 avc->vma = vma;
232 avc->anon_vma = anon_vma;
233 list_add(&avc->same_vma, &vma->anon_vma_chain);
234
235 /*
236 * It's critical to add new vmas to the tail of the anon_vma,
237 * see comment in huge_memory.c:__split_huge_page().
238 */
239 list_add_tail(&avc->same_anon_vma, &anon_vma->head);
240}
241
242/* 239/*
243 * Attach the anon_vmas from src to dst. 240 * Attach the anon_vmas from src to dst.
244 * Returns 0 on success, -ENOMEM on failure. 241 * Returns 0 on success, -ENOMEM on failure.
@@ -1151,10 +1148,15 @@ void page_add_new_anon_rmap(struct page *page,
1151 */ 1148 */
1152void page_add_file_rmap(struct page *page) 1149void page_add_file_rmap(struct page *page)
1153{ 1150{
1151 bool locked;
1152 unsigned long flags;
1153
1154 mem_cgroup_begin_update_page_stat(page, &locked, &flags);
1154 if (atomic_inc_and_test(&page->_mapcount)) { 1155 if (atomic_inc_and_test(&page->_mapcount)) {
1155 __inc_zone_page_state(page, NR_FILE_MAPPED); 1156 __inc_zone_page_state(page, NR_FILE_MAPPED);
1156 mem_cgroup_inc_page_stat(page, MEMCG_NR_FILE_MAPPED); 1157 mem_cgroup_inc_page_stat(page, MEMCG_NR_FILE_MAPPED);
1157 } 1158 }
1159 mem_cgroup_end_update_page_stat(page, &locked, &flags);
1158} 1160}
1159 1161
1160/** 1162/**
@@ -1165,9 +1167,21 @@ void page_add_file_rmap(struct page *page)
1165 */ 1167 */
1166void page_remove_rmap(struct page *page) 1168void page_remove_rmap(struct page *page)
1167{ 1169{
1170 bool anon = PageAnon(page);
1171 bool locked;
1172 unsigned long flags;
1173
1174 /*
1175 * The anon case has no mem_cgroup page_stat to update; but may
1176 * uncharge_page() below, where the lock ordering can deadlock if
1177 * we hold the lock against page_stat move: so avoid it on anon.
1178 */
1179 if (!anon)
1180 mem_cgroup_begin_update_page_stat(page, &locked, &flags);
1181
1168 /* page still mapped by someone else? */ 1182 /* page still mapped by someone else? */
1169 if (!atomic_add_negative(-1, &page->_mapcount)) 1183 if (!atomic_add_negative(-1, &page->_mapcount))
1170 return; 1184 goto out;
1171 1185
1172 /* 1186 /*
1173 * Now that the last pte has gone, s390 must transfer dirty 1187 * Now that the last pte has gone, s390 must transfer dirty
@@ -1176,7 +1190,7 @@ void page_remove_rmap(struct page *page)
1176 * not if it's in swapcache - there might be another pte slot 1190 * not if it's in swapcache - there might be another pte slot
1177 * containing the swap entry, but page not yet written to swap. 1191 * containing the swap entry, but page not yet written to swap.
1178 */ 1192 */
1179 if ((!PageAnon(page) || PageSwapCache(page)) && 1193 if ((!anon || PageSwapCache(page)) &&
1180 page_test_and_clear_dirty(page_to_pfn(page), 1)) 1194 page_test_and_clear_dirty(page_to_pfn(page), 1))
1181 set_page_dirty(page); 1195 set_page_dirty(page);
1182 /* 1196 /*
@@ -1184,8 +1198,8 @@ void page_remove_rmap(struct page *page)
1184 * and not charged by memcg for now. 1198 * and not charged by memcg for now.
1185 */ 1199 */
1186 if (unlikely(PageHuge(page))) 1200 if (unlikely(PageHuge(page)))
1187 return; 1201 goto out;
1188 if (PageAnon(page)) { 1202 if (anon) {
1189 mem_cgroup_uncharge_page(page); 1203 mem_cgroup_uncharge_page(page);
1190 if (!PageTransHuge(page)) 1204 if (!PageTransHuge(page))
1191 __dec_zone_page_state(page, NR_ANON_PAGES); 1205 __dec_zone_page_state(page, NR_ANON_PAGES);
@@ -1205,6 +1219,9 @@ void page_remove_rmap(struct page *page)
1205 * Leaving it set also helps swapoff to reinstate ptes 1219 * Leaving it set also helps swapoff to reinstate ptes
1206 * faster for those pages still in swapcache. 1220 * faster for those pages still in swapcache.
1207 */ 1221 */
1222out:
1223 if (!anon)
1224 mem_cgroup_end_update_page_stat(page, &locked, &flags);
1208} 1225}
1209 1226
1210/* 1227/*
@@ -1282,7 +1299,7 @@ int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
1282 } 1299 }
1283 dec_mm_counter(mm, MM_ANONPAGES); 1300 dec_mm_counter(mm, MM_ANONPAGES);
1284 inc_mm_counter(mm, MM_SWAPENTS); 1301 inc_mm_counter(mm, MM_SWAPENTS);
1285 } else if (PAGE_MIGRATION) { 1302 } else if (IS_ENABLED(CONFIG_MIGRATION)) {
1286 /* 1303 /*
1287 * Store the pfn of the page in a special migration 1304 * Store the pfn of the page in a special migration
1288 * pte. do_swap_page() will wait until the migration 1305 * pte. do_swap_page() will wait until the migration
@@ -1293,7 +1310,8 @@ int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
1293 } 1310 }
1294 set_pte_at(mm, address, pte, swp_entry_to_pte(entry)); 1311 set_pte_at(mm, address, pte, swp_entry_to_pte(entry));
1295 BUG_ON(pte_file(*pte)); 1312 BUG_ON(pte_file(*pte));
1296 } else if (PAGE_MIGRATION && (TTU_ACTION(flags) == TTU_MIGRATION)) { 1313 } else if (IS_ENABLED(CONFIG_MIGRATION) &&
1314 (TTU_ACTION(flags) == TTU_MIGRATION)) {
1297 /* Establish migration entry for a file page */ 1315 /* Establish migration entry for a file page */
1298 swp_entry_t entry; 1316 swp_entry_t entry;
1299 entry = make_migration_entry(page, pte_write(pteval)); 1317 entry = make_migration_entry(page, pte_write(pteval));
@@ -1499,7 +1517,7 @@ static int try_to_unmap_anon(struct page *page, enum ttu_flags flags)
1499 * locking requirements of exec(), migration skips 1517 * locking requirements of exec(), migration skips
1500 * temporary VMAs until after exec() completes. 1518 * temporary VMAs until after exec() completes.
1501 */ 1519 */
1502 if (PAGE_MIGRATION && (flags & TTU_MIGRATION) && 1520 if (IS_ENABLED(CONFIG_MIGRATION) && (flags & TTU_MIGRATION) &&
1503 is_vma_temporary_stack(vma)) 1521 is_vma_temporary_stack(vma))
1504 continue; 1522 continue;
1505 1523
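
Besides hoisting anon_vma_chain_link() above its first user, the rmap.c hunks wrap the file-backed paths of page_add_file_rmap() and page_remove_rmap() in mem_cgroup_begin/end_update_page_stat(), which forces every early return in page_remove_rmap() to become goto out so a conditionally taken lock is always released. Below is a small pthread sketch of that "decide once, single exit" shape; remove_mapping(), stat_lock and file_mapped are made-up names, not the kernel API.

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t stat_lock = PTHREAD_MUTEX_INITIALIZER;
static long file_mapped;

/*
 * Drop one mapping of an object. Only file-backed objects need the
 * statistics lock, and that decision is made once, up front, so the lock
 * and unlock can never disagree no matter which exit path is taken.
 */
static void remove_mapping(bool anon, long *mapcount)
{
        bool locked = false;

        if (!anon) {
                pthread_mutex_lock(&stat_lock);
                locked = true;
        }

        if (--(*mapcount) > 0)
                goto out;       /* still mapped elsewhere: nothing to account */

        if (!anon)
                file_mapped--;  /* last mapping gone: update the file stat */

out:
        if (locked)
                pthread_mutex_unlock(&stat_lock);
}

int main(void)
{
        long m = 2;

        file_mapped = 1;
        remove_mapping(false, &m);      /* first unmap: early exit through out: */
        remove_mapping(false, &m);      /* last unmap: stat drops to 0 */
        printf("file_mapped = %ld\n", file_mapped);
        return 0;
}

The locked flag plays the same role as the hunk's bool locked / bool anon locals: the decision is recorded where it is made, so the unlock at out: cannot drift out of sync with it.
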
diff --git a/mm/shmem.c b/mm/shmem.c
index 269d049294ab..f99ff3e50bd6 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -127,7 +127,7 @@ static inline struct shmem_sb_info *SHMEM_SB(struct super_block *sb)
127static inline int shmem_acct_size(unsigned long flags, loff_t size) 127static inline int shmem_acct_size(unsigned long flags, loff_t size)
128{ 128{
129 return (flags & VM_NORESERVE) ? 129 return (flags & VM_NORESERVE) ?
130 0 : security_vm_enough_memory_kern(VM_ACCT(size)); 130 0 : security_vm_enough_memory_mm(current->mm, VM_ACCT(size));
131} 131}
132 132
133static inline void shmem_unacct_size(unsigned long flags, loff_t size) 133static inline void shmem_unacct_size(unsigned long flags, loff_t size)
@@ -145,7 +145,7 @@ static inline void shmem_unacct_size(unsigned long flags, loff_t size)
145static inline int shmem_acct_block(unsigned long flags) 145static inline int shmem_acct_block(unsigned long flags)
146{ 146{
147 return (flags & VM_NORESERVE) ? 147 return (flags & VM_NORESERVE) ?
148 security_vm_enough_memory_kern(VM_ACCT(PAGE_CACHE_SIZE)) : 0; 148 security_vm_enough_memory_mm(current->mm, VM_ACCT(PAGE_CACHE_SIZE)) : 0;
149} 149}
150 150
151static inline void shmem_unacct_blocks(unsigned long flags, long pages) 151static inline void shmem_unacct_blocks(unsigned long flags, long pages)
@@ -1178,6 +1178,12 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode
1178static const struct inode_operations shmem_symlink_inode_operations; 1178static const struct inode_operations shmem_symlink_inode_operations;
1179static const struct inode_operations shmem_short_symlink_operations; 1179static const struct inode_operations shmem_short_symlink_operations;
1180 1180
1181#ifdef CONFIG_TMPFS_XATTR
1182static int shmem_initxattrs(struct inode *, const struct xattr *, void *);
1183#else
1184#define shmem_initxattrs NULL
1185#endif
1186
1181static int 1187static int
1182shmem_write_begin(struct file *file, struct address_space *mapping, 1188shmem_write_begin(struct file *file, struct address_space *mapping,
1183 loff_t pos, unsigned len, unsigned flags, 1189 loff_t pos, unsigned len, unsigned flags,
@@ -1490,7 +1496,7 @@ shmem_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev)
1490 if (inode) { 1496 if (inode) {
1491 error = security_inode_init_security(inode, dir, 1497 error = security_inode_init_security(inode, dir,
1492 &dentry->d_name, 1498 &dentry->d_name,
1493 NULL, NULL); 1499 shmem_initxattrs, NULL);
1494 if (error) { 1500 if (error) {
1495 if (error != -EOPNOTSUPP) { 1501 if (error != -EOPNOTSUPP) {
1496 iput(inode); 1502 iput(inode);
@@ -1630,7 +1636,7 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s
1630 return -ENOSPC; 1636 return -ENOSPC;
1631 1637
1632 error = security_inode_init_security(inode, dir, &dentry->d_name, 1638 error = security_inode_init_security(inode, dir, &dentry->d_name,
1633 NULL, NULL); 1639 shmem_initxattrs, NULL);
1634 if (error) { 1640 if (error) {
1635 if (error != -EOPNOTSUPP) { 1641 if (error != -EOPNOTSUPP) {
1636 iput(inode); 1642 iput(inode);
@@ -1656,9 +1662,9 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s
1656 } 1662 }
1657 inode->i_mapping->a_ops = &shmem_aops; 1663 inode->i_mapping->a_ops = &shmem_aops;
1658 inode->i_op = &shmem_symlink_inode_operations; 1664 inode->i_op = &shmem_symlink_inode_operations;
1659 kaddr = kmap_atomic(page, KM_USER0); 1665 kaddr = kmap_atomic(page);
1660 memcpy(kaddr, symname, len); 1666 memcpy(kaddr, symname, len);
1661 kunmap_atomic(kaddr, KM_USER0); 1667 kunmap_atomic(kaddr);
1662 set_page_dirty(page); 1668 set_page_dirty(page);
1663 unlock_page(page); 1669 unlock_page(page);
1664 page_cache_release(page); 1670 page_cache_release(page);
@@ -1704,6 +1710,66 @@ static void shmem_put_link(struct dentry *dentry, struct nameidata *nd, void *co
1704 * filesystem level, though. 1710 * filesystem level, though.
1705 */ 1711 */
1706 1712
1713/*
1714 * Allocate new xattr and copy in the value; but leave the name to callers.
1715 */
1716static struct shmem_xattr *shmem_xattr_alloc(const void *value, size_t size)
1717{
1718 struct shmem_xattr *new_xattr;
1719 size_t len;
1720
1721 /* wrap around? */
1722 len = sizeof(*new_xattr) + size;
1723 if (len <= sizeof(*new_xattr))
1724 return NULL;
1725
1726 new_xattr = kmalloc(len, GFP_KERNEL);
1727 if (!new_xattr)
1728 return NULL;
1729
1730 new_xattr->size = size;
1731 memcpy(new_xattr->value, value, size);
1732 return new_xattr;
1733}
1734
1735/*
1736 * Callback for security_inode_init_security() for acquiring xattrs.
1737 */
1738static int shmem_initxattrs(struct inode *inode,
1739 const struct xattr *xattr_array,
1740 void *fs_info)
1741{
1742 struct shmem_inode_info *info = SHMEM_I(inode);
1743 const struct xattr *xattr;
1744 struct shmem_xattr *new_xattr;
1745 size_t len;
1746
1747 for (xattr = xattr_array; xattr->name != NULL; xattr++) {
1748 new_xattr = shmem_xattr_alloc(xattr->value, xattr->value_len);
1749 if (!new_xattr)
1750 return -ENOMEM;
1751
1752 len = strlen(xattr->name) + 1;
1753 new_xattr->name = kmalloc(XATTR_SECURITY_PREFIX_LEN + len,
1754 GFP_KERNEL);
1755 if (!new_xattr->name) {
1756 kfree(new_xattr);
1757 return -ENOMEM;
1758 }
1759
1760 memcpy(new_xattr->name, XATTR_SECURITY_PREFIX,
1761 XATTR_SECURITY_PREFIX_LEN);
1762 memcpy(new_xattr->name + XATTR_SECURITY_PREFIX_LEN,
1763 xattr->name, len);
1764
1765 spin_lock(&info->lock);
1766 list_add(&new_xattr->list, &info->xattr_list);
1767 spin_unlock(&info->lock);
1768 }
1769
1770 return 0;
1771}
1772
1707static int shmem_xattr_get(struct dentry *dentry, const char *name, 1773static int shmem_xattr_get(struct dentry *dentry, const char *name,
1708 void *buffer, size_t size) 1774 void *buffer, size_t size)
1709{ 1775{
@@ -1731,24 +1797,17 @@ static int shmem_xattr_get(struct dentry *dentry, const char *name,
1731 return ret; 1797 return ret;
1732} 1798}
1733 1799
1734static int shmem_xattr_set(struct dentry *dentry, const char *name, 1800static int shmem_xattr_set(struct inode *inode, const char *name,
1735 const void *value, size_t size, int flags) 1801 const void *value, size_t size, int flags)
1736{ 1802{
1737 struct inode *inode = dentry->d_inode;
1738 struct shmem_inode_info *info = SHMEM_I(inode); 1803 struct shmem_inode_info *info = SHMEM_I(inode);
1739 struct shmem_xattr *xattr; 1804 struct shmem_xattr *xattr;
1740 struct shmem_xattr *new_xattr = NULL; 1805 struct shmem_xattr *new_xattr = NULL;
1741 size_t len;
1742 int err = 0; 1806 int err = 0;
1743 1807
1744 /* value == NULL means remove */ 1808 /* value == NULL means remove */
1745 if (value) { 1809 if (value) {
1746 /* wrap around? */ 1810 new_xattr = shmem_xattr_alloc(value, size);
1747 len = sizeof(*new_xattr) + size;
1748 if (len <= sizeof(*new_xattr))
1749 return -ENOMEM;
1750
1751 new_xattr = kmalloc(len, GFP_KERNEL);
1752 if (!new_xattr) 1811 if (!new_xattr)
1753 return -ENOMEM; 1812 return -ENOMEM;
1754 1813
@@ -1757,9 +1816,6 @@ static int shmem_xattr_set(struct dentry *dentry, const char *name,
1757 kfree(new_xattr); 1816 kfree(new_xattr);
1758 return -ENOMEM; 1817 return -ENOMEM;
1759 } 1818 }
1760
1761 new_xattr->size = size;
1762 memcpy(new_xattr->value, value, size);
1763 } 1819 }
1764 1820
1765 spin_lock(&info->lock); 1821 spin_lock(&info->lock);
@@ -1858,7 +1914,7 @@ static int shmem_setxattr(struct dentry *dentry, const char *name,
1858 if (size == 0) 1914 if (size == 0)
1859 value = ""; /* empty EA, do not remove */ 1915 value = ""; /* empty EA, do not remove */
1860 1916
1861 return shmem_xattr_set(dentry, name, value, size, flags); 1917 return shmem_xattr_set(dentry->d_inode, name, value, size, flags);
1862 1918
1863} 1919}
1864 1920
@@ -1878,7 +1934,7 @@ static int shmem_removexattr(struct dentry *dentry, const char *name)
1878 if (err) 1934 if (err)
1879 return err; 1935 return err;
1880 1936
1881 return shmem_xattr_set(dentry, name, NULL, 0, XATTR_REPLACE); 1937 return shmem_xattr_set(dentry->d_inode, name, NULL, 0, XATTR_REPLACE);
1882} 1938}
1883 1939
1884static bool xattr_is_trusted(const char *name) 1940static bool xattr_is_trusted(const char *name)
@@ -2175,7 +2231,6 @@ static void shmem_put_super(struct super_block *sb)
2175int shmem_fill_super(struct super_block *sb, void *data, int silent) 2231int shmem_fill_super(struct super_block *sb, void *data, int silent)
2176{ 2232{
2177 struct inode *inode; 2233 struct inode *inode;
2178 struct dentry *root;
2179 struct shmem_sb_info *sbinfo; 2234 struct shmem_sb_info *sbinfo;
2180 int err = -ENOMEM; 2235 int err = -ENOMEM;
2181 2236
@@ -2232,14 +2287,11 @@ int shmem_fill_super(struct super_block *sb, void *data, int silent)
2232 goto failed; 2287 goto failed;
2233 inode->i_uid = sbinfo->uid; 2288 inode->i_uid = sbinfo->uid;
2234 inode->i_gid = sbinfo->gid; 2289 inode->i_gid = sbinfo->gid;
2235 root = d_alloc_root(inode); 2290 sb->s_root = d_make_root(inode);
2236 if (!root) 2291 if (!sb->s_root)
2237 goto failed_iput; 2292 goto failed;
2238 sb->s_root = root;
2239 return 0; 2293 return 0;
2240 2294
2241failed_iput:
2242 iput(inode);
2243failed: 2295failed:
2244 shmem_put_super(sb); 2296 shmem_put_super(sb);
2245 return err; 2297 return err;
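
The shmem.c changes route inode creation through a shmem_initxattrs() callback and factor the common allocation into shmem_xattr_alloc(), whose len <= sizeof(*new_xattr) test rejects a size that would wrap the header-plus-payload length around. The fragment below reproduces just that allocation shape in portable C, a flexible-array struct plus a wrap-checked allocator; struct blob and blob_alloc() are invented names for illustration.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* A header followed by an inline payload, like struct shmem_xattr's value[]. */
struct blob {
        size_t size;
        unsigned char value[];  /* C99 flexible array member */
};

/*
 * Allocate a blob and copy in the value, refusing a size that would wrap
 * the header-plus-payload length around (or that has nothing to store).
 */
static struct blob *blob_alloc(const void *value, size_t size)
{
        size_t len = sizeof(struct blob) + size;
        struct blob *b;

        if (len <= sizeof(struct blob)) /* wrap around, or empty? */
                return NULL;

        b = malloc(len);
        if (!b)
                return NULL;

        b->size = size;
        memcpy(b->value, value, size);
        return b;
}

int main(void)
{
        const char secret[] = "selinux-context";
        struct blob *b = blob_alloc(secret, sizeof(secret));

        if (b) {
                printf("stored %zu bytes: %s\n", b->size, (char *)b->value);
                free(b);
        }
        return 0;
}

The check matters in the original because the same helper also serves shmem_xattr_set(), whose size arrives from userspace through setxattr().
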
diff --git a/mm/slab.c b/mm/slab.c
index f0bd7857ab3b..e901a36e2520 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -1731,6 +1731,52 @@ static int __init cpucache_init(void)
1731} 1731}
1732__initcall(cpucache_init); 1732__initcall(cpucache_init);
1733 1733
1734static noinline void
1735slab_out_of_memory(struct kmem_cache *cachep, gfp_t gfpflags, int nodeid)
1736{
1737 struct kmem_list3 *l3;
1738 struct slab *slabp;
1739 unsigned long flags;
1740 int node;
1741
1742 printk(KERN_WARNING
1743 "SLAB: Unable to allocate memory on node %d (gfp=0x%x)\n",
1744 nodeid, gfpflags);
1745 printk(KERN_WARNING " cache: %s, object size: %d, order: %d\n",
1746 cachep->name, cachep->buffer_size, cachep->gfporder);
1747
1748 for_each_online_node(node) {
1749 unsigned long active_objs = 0, num_objs = 0, free_objects = 0;
1750 unsigned long active_slabs = 0, num_slabs = 0;
1751
1752 l3 = cachep->nodelists[node];
1753 if (!l3)
1754 continue;
1755
1756 spin_lock_irqsave(&l3->list_lock, flags);
1757 list_for_each_entry(slabp, &l3->slabs_full, list) {
1758 active_objs += cachep->num;
1759 active_slabs++;
1760 }
1761 list_for_each_entry(slabp, &l3->slabs_partial, list) {
1762 active_objs += slabp->inuse;
1763 active_slabs++;
1764 }
1765 list_for_each_entry(slabp, &l3->slabs_free, list)
1766 num_slabs++;
1767
1768 free_objects += l3->free_objects;
1769 spin_unlock_irqrestore(&l3->list_lock, flags);
1770
1771 num_slabs += active_slabs;
1772 num_objs = num_slabs * cachep->num;
1773 printk(KERN_WARNING
1774 " node %d: slabs: %ld/%ld, objs: %ld/%ld, free: %ld\n",
1775 node, active_slabs, num_slabs, active_objs, num_objs,
1776 free_objects);
1777 }
1778}
1779
1734/* 1780/*
1735 * Interface to system's page allocator. No need to hold the cache-lock. 1781 * Interface to system's page allocator. No need to hold the cache-lock.
1736 * 1782 *
@@ -1757,8 +1803,11 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid)
1757 flags |= __GFP_RECLAIMABLE; 1803 flags |= __GFP_RECLAIMABLE;
1758 1804
1759 page = alloc_pages_exact_node(nodeid, flags | __GFP_NOTRACK, cachep->gfporder); 1805 page = alloc_pages_exact_node(nodeid, flags | __GFP_NOTRACK, cachep->gfporder);
1760 if (!page) 1806 if (!page) {
1807 if (!(flags & __GFP_NOWARN) && printk_ratelimit())
1808 slab_out_of_memory(cachep, flags, nodeid);
1761 return NULL; 1809 return NULL;
1810 }
1762 1811
1763 nr_pages = (1 << cachep->gfporder); 1812 nr_pages = (1 << cachep->gfporder);
1764 if (cachep->flags & SLAB_RECLAIM_ACCOUNT) 1813 if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
@@ -3284,12 +3333,10 @@ static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags)
3284 if (in_interrupt() || (flags & __GFP_THISNODE)) 3333 if (in_interrupt() || (flags & __GFP_THISNODE))
3285 return NULL; 3334 return NULL;
3286 nid_alloc = nid_here = numa_mem_id(); 3335 nid_alloc = nid_here = numa_mem_id();
3287 get_mems_allowed();
3288 if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD)) 3336 if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD))
3289 nid_alloc = cpuset_slab_spread_node(); 3337 nid_alloc = cpuset_slab_spread_node();
3290 else if (current->mempolicy) 3338 else if (current->mempolicy)
3291 nid_alloc = slab_node(current->mempolicy); 3339 nid_alloc = slab_node(current->mempolicy);
3292 put_mems_allowed();
3293 if (nid_alloc != nid_here) 3340 if (nid_alloc != nid_here)
3294 return ____cache_alloc_node(cachep, flags, nid_alloc); 3341 return ____cache_alloc_node(cachep, flags, nid_alloc);
3295 return NULL; 3342 return NULL;
@@ -3312,14 +3359,17 @@ static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags)
3312 enum zone_type high_zoneidx = gfp_zone(flags); 3359 enum zone_type high_zoneidx = gfp_zone(flags);
3313 void *obj = NULL; 3360 void *obj = NULL;
3314 int nid; 3361 int nid;
3362 unsigned int cpuset_mems_cookie;
3315 3363
3316 if (flags & __GFP_THISNODE) 3364 if (flags & __GFP_THISNODE)
3317 return NULL; 3365 return NULL;
3318 3366
3319 get_mems_allowed();
3320 zonelist = node_zonelist(slab_node(current->mempolicy), flags);
3321 local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK); 3367 local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK);
3322 3368
3369retry_cpuset:
3370 cpuset_mems_cookie = get_mems_allowed();
3371 zonelist = node_zonelist(slab_node(current->mempolicy), flags);
3372
3323retry: 3373retry:
3324 /* 3374 /*
3325 * Look through allowed nodes for objects available 3375 * Look through allowed nodes for objects available
@@ -3372,7 +3422,9 @@ retry:
3372 } 3422 }
3373 } 3423 }
3374 } 3424 }
3375 put_mems_allowed(); 3425
3426 if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !obj))
3427 goto retry_cpuset;
3376 return obj; 3428 return obj;
3377} 3429}
3378 3430
@@ -3693,13 +3745,12 @@ static inline void __cache_free(struct kmem_cache *cachep, void *objp,
3693 3745
3694 if (likely(ac->avail < ac->limit)) { 3746 if (likely(ac->avail < ac->limit)) {
3695 STATS_INC_FREEHIT(cachep); 3747 STATS_INC_FREEHIT(cachep);
3696 ac->entry[ac->avail++] = objp;
3697 return;
3698 } else { 3748 } else {
3699 STATS_INC_FREEMISS(cachep); 3749 STATS_INC_FREEMISS(cachep);
3700 cache_flusharray(cachep, ac); 3750 cache_flusharray(cachep, ac);
3701 ac->entry[ac->avail++] = objp;
3702 } 3751 }
3752
3753 ac->entry[ac->avail++] = objp;
3703} 3754}
3704 3755
3705/** 3756/**
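
The main slab.c addition is slab_out_of_memory(), which walks each node's full/partial/free slab lists and prints slab and object occupancy when growing a cache fails, gated by __GFP_NOWARN and printk_ratelimit(). The toy program below mimics that "report the pool's state at the moment an allocation fails" idea with made-up bookkeeping (struct toy_cache, toy_cache_oom_report()); it is not the SLAB data model, only the reporting shape.

#include <stdio.h>
#include <stdlib.h>

/* Toy cache bookkeeping: slabs per list and objects per slab. */
struct toy_cache {
        const char *name;
        int objs_per_slab;
        int full_slabs;
        int partial_slabs;
        int partial_inuse;      /* objects in use across partial slabs */
        int free_slabs;
};

/* Print an occupancy summary so a failed grow is diagnosable from the log. */
static void toy_cache_oom_report(const struct toy_cache *c)
{
        int num_slabs = c->full_slabs + c->partial_slabs + c->free_slabs;
        int num_objs = num_slabs * c->objs_per_slab;
        int active_objs = c->full_slabs * c->objs_per_slab + c->partial_inuse;
        int free_objs = num_objs - active_objs;

        fprintf(stderr, "cache %s: unable to grow\n", c->name);
        fprintf(stderr, "  slabs: %d/%d, objs: %d/%d, free objs: %d\n",
                c->full_slabs + c->partial_slabs, num_slabs,
                active_objs, num_objs, free_objs);
}

int main(void)
{
        struct toy_cache c = {
                .name = "demo", .objs_per_slab = 8,
                .full_slabs = 3, .partial_slabs = 1,
                .partial_inuse = 5, .free_slabs = 0,
        };

        void *p = malloc((size_t)-1);   /* a request that is sure to fail */
        if (!p)
                toy_cache_oom_report(&c);
        return 0;
}
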
diff --git a/mm/slub.c b/mm/slub.c
index 4907563ef7ff..ffe13fdf8144 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -29,6 +29,7 @@
29#include <linux/math64.h> 29#include <linux/math64.h>
30#include <linux/fault-inject.h> 30#include <linux/fault-inject.h>
31#include <linux/stacktrace.h> 31#include <linux/stacktrace.h>
32#include <linux/prefetch.h>
32 33
33#include <trace/events/kmem.h> 34#include <trace/events/kmem.h>
34 35
@@ -269,6 +270,11 @@ static inline void *get_freepointer(struct kmem_cache *s, void *object)
269 return *(void **)(object + s->offset); 270 return *(void **)(object + s->offset);
270} 271}
271 272
273static void prefetch_freepointer(const struct kmem_cache *s, void *object)
274{
275 prefetch(object + s->offset);
276}
277
272static inline void *get_freepointer_safe(struct kmem_cache *s, void *object) 278static inline void *get_freepointer_safe(struct kmem_cache *s, void *object)
273{ 279{
274 void *p; 280 void *p;
@@ -1560,6 +1566,7 @@ static void *get_partial_node(struct kmem_cache *s,
1560 } else { 1566 } else {
1561 page->freelist = t; 1567 page->freelist = t;
1562 available = put_cpu_partial(s, page, 0); 1568 available = put_cpu_partial(s, page, 0);
1569 stat(s, CPU_PARTIAL_NODE);
1563 } 1570 }
1564 if (kmem_cache_debug(s) || available > s->cpu_partial / 2) 1571 if (kmem_cache_debug(s) || available > s->cpu_partial / 2)
1565 break; 1572 break;
@@ -1581,6 +1588,7 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags,
1581 struct zone *zone; 1588 struct zone *zone;
1582 enum zone_type high_zoneidx = gfp_zone(flags); 1589 enum zone_type high_zoneidx = gfp_zone(flags);
1583 void *object; 1590 void *object;
1591 unsigned int cpuset_mems_cookie;
1584 1592
1585 /* 1593 /*
1586 * The defrag ratio allows a configuration of the tradeoffs between 1594 * The defrag ratio allows a configuration of the tradeoffs between
@@ -1604,23 +1612,32 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags,
1604 get_cycles() % 1024 > s->remote_node_defrag_ratio) 1612 get_cycles() % 1024 > s->remote_node_defrag_ratio)
1605 return NULL; 1613 return NULL;
1606 1614
1607 get_mems_allowed(); 1615 do {
1608 zonelist = node_zonelist(slab_node(current->mempolicy), flags); 1616 cpuset_mems_cookie = get_mems_allowed();
1609 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { 1617 zonelist = node_zonelist(slab_node(current->mempolicy), flags);
1610 struct kmem_cache_node *n; 1618 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
1611 1619 struct kmem_cache_node *n;
1612 n = get_node(s, zone_to_nid(zone)); 1620
1613 1621 n = get_node(s, zone_to_nid(zone));
1614 if (n && cpuset_zone_allowed_hardwall(zone, flags) && 1622
1615 n->nr_partial > s->min_partial) { 1623 if (n && cpuset_zone_allowed_hardwall(zone, flags) &&
1616 object = get_partial_node(s, n, c); 1624 n->nr_partial > s->min_partial) {
1617 if (object) { 1625 object = get_partial_node(s, n, c);
1618 put_mems_allowed(); 1626 if (object) {
1619 return object; 1627 /*
1628 * Return the object even if
1629 * put_mems_allowed indicated that
1630 * the cpuset mems_allowed was
1631 * updated in parallel. It's a
1632 * harmless race between the alloc
1633 * and the cpuset update.
1634 */
1635 put_mems_allowed(cpuset_mems_cookie);
1636 return object;
1637 }
1620 } 1638 }
1621 } 1639 }
1622 } 1640 } while (!put_mems_allowed(cpuset_mems_cookie));
1623 put_mems_allowed();
1624#endif 1641#endif
1625 return NULL; 1642 return NULL;
1626} 1643}
@@ -1973,6 +1990,7 @@ int put_cpu_partial(struct kmem_cache *s, struct page *page, int drain)
1973 local_irq_restore(flags); 1990 local_irq_restore(flags);
1974 pobjects = 0; 1991 pobjects = 0;
1975 pages = 0; 1992 pages = 0;
1993 stat(s, CPU_PARTIAL_DRAIN);
1976 } 1994 }
1977 } 1995 }
1978 1996
@@ -1984,7 +2002,6 @@ int put_cpu_partial(struct kmem_cache *s, struct page *page, int drain)
1984 page->next = oldpage; 2002 page->next = oldpage;
1985 2003
1986 } while (this_cpu_cmpxchg(s->cpu_slab->partial, oldpage, page) != oldpage); 2004 } while (this_cpu_cmpxchg(s->cpu_slab->partial, oldpage, page) != oldpage);
1987 stat(s, CPU_PARTIAL_FREE);
1988 return pobjects; 2005 return pobjects;
1989} 2006}
1990 2007
@@ -2018,9 +2035,17 @@ static void flush_cpu_slab(void *d)
2018 __flush_cpu_slab(s, smp_processor_id()); 2035 __flush_cpu_slab(s, smp_processor_id());
2019} 2036}
2020 2037
2038static bool has_cpu_slab(int cpu, void *info)
2039{
2040 struct kmem_cache *s = info;
2041 struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu);
2042
2043 return !!(c->page);
2044}
2045
2021static void flush_all(struct kmem_cache *s) 2046static void flush_all(struct kmem_cache *s)
2022{ 2047{
2023 on_each_cpu(flush_cpu_slab, s, 1); 2048 on_each_cpu_cond(has_cpu_slab, flush_cpu_slab, s, 1, GFP_ATOMIC);
2024} 2049}
2025 2050
2026/* 2051/*
@@ -2309,6 +2334,8 @@ redo:
2309 object = __slab_alloc(s, gfpflags, node, addr, c); 2334 object = __slab_alloc(s, gfpflags, node, addr, c);
2310 2335
2311 else { 2336 else {
2337 void *next_object = get_freepointer_safe(s, object);
2338
2312 /* 2339 /*
2313 * The cmpxchg will only match if there was no additional 2340 * The cmpxchg will only match if there was no additional
2314 * operation and if we are on the right processor. 2341 * operation and if we are on the right processor.
@@ -2324,11 +2351,12 @@ redo:
2324 if (unlikely(!this_cpu_cmpxchg_double( 2351 if (unlikely(!this_cpu_cmpxchg_double(
2325 s->cpu_slab->freelist, s->cpu_slab->tid, 2352 s->cpu_slab->freelist, s->cpu_slab->tid,
2326 object, tid, 2353 object, tid,
2327 get_freepointer_safe(s, object), next_tid(tid)))) { 2354 next_object, next_tid(tid)))) {
2328 2355
2329 note_cmpxchg_failure("slab_alloc", s, tid); 2356 note_cmpxchg_failure("slab_alloc", s, tid);
2330 goto redo; 2357 goto redo;
2331 } 2358 }
2359 prefetch_freepointer(s, next_object);
2332 stat(s, ALLOC_FASTPATH); 2360 stat(s, ALLOC_FASTPATH);
2333 } 2361 }
2334 2362
@@ -2465,9 +2493,10 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
2465 * If we just froze the page then put it onto the 2493 * If we just froze the page then put it onto the
2466 * per cpu partial list. 2494 * per cpu partial list.
2467 */ 2495 */
2468 if (new.frozen && !was_frozen) 2496 if (new.frozen && !was_frozen) {
2469 put_cpu_partial(s, page, 1); 2497 put_cpu_partial(s, page, 1);
2470 2498 stat(s, CPU_PARTIAL_FREE);
2499 }
2471 /* 2500 /*
2472 * The list lock was not taken therefore no list 2501 * The list lock was not taken therefore no list
2473 * activity can be necessary. 2502 * activity can be necessary.
@@ -3929,13 +3958,14 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size,
3929 if (kmem_cache_open(s, n, 3958 if (kmem_cache_open(s, n,
3930 size, align, flags, ctor)) { 3959 size, align, flags, ctor)) {
3931 list_add(&s->list, &slab_caches); 3960 list_add(&s->list, &slab_caches);
3961 up_write(&slub_lock);
3932 if (sysfs_slab_add(s)) { 3962 if (sysfs_slab_add(s)) {
3963 down_write(&slub_lock);
3933 list_del(&s->list); 3964 list_del(&s->list);
3934 kfree(n); 3965 kfree(n);
3935 kfree(s); 3966 kfree(s);
3936 goto err; 3967 goto err;
3937 } 3968 }
3938 up_write(&slub_lock);
3939 return s; 3969 return s;
3940 } 3970 }
3941 kfree(n); 3971 kfree(n);
@@ -5059,6 +5089,8 @@ STAT_ATTR(CMPXCHG_DOUBLE_CPU_FAIL, cmpxchg_double_cpu_fail);
5059STAT_ATTR(CMPXCHG_DOUBLE_FAIL, cmpxchg_double_fail); 5089STAT_ATTR(CMPXCHG_DOUBLE_FAIL, cmpxchg_double_fail);
5060STAT_ATTR(CPU_PARTIAL_ALLOC, cpu_partial_alloc); 5090STAT_ATTR(CPU_PARTIAL_ALLOC, cpu_partial_alloc);
5061STAT_ATTR(CPU_PARTIAL_FREE, cpu_partial_free); 5091STAT_ATTR(CPU_PARTIAL_FREE, cpu_partial_free);
5092STAT_ATTR(CPU_PARTIAL_NODE, cpu_partial_node);
5093STAT_ATTR(CPU_PARTIAL_DRAIN, cpu_partial_drain);
5062#endif 5094#endif
5063 5095
5064static struct attribute *slab_attrs[] = { 5096static struct attribute *slab_attrs[] = {
@@ -5124,6 +5156,8 @@ static struct attribute *slab_attrs[] = {
5124 &cmpxchg_double_cpu_fail_attr.attr, 5156 &cmpxchg_double_cpu_fail_attr.attr,
5125 &cpu_partial_alloc_attr.attr, 5157 &cpu_partial_alloc_attr.attr,
5126 &cpu_partial_free_attr.attr, 5158 &cpu_partial_free_attr.attr,
5159 &cpu_partial_node_attr.attr,
5160 &cpu_partial_drain_attr.attr,
5127#endif 5161#endif
5128#ifdef CONFIG_FAILSLAB 5162#ifdef CONFIG_FAILSLAB
5129 &failslab_attr.attr, 5163 &failslab_attr.attr,
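
Among the slub.c changes, prefetch_freepointer() issues a prefetch for the object that will satisfy the next allocation while the fast path hands out the current one, and flush_all() now uses an on_each_cpu_cond() predicate so only CPUs that actually own a cpu slab receive an IPI. The sketch below illustrates the first idea on a plain singly linked freelist using the GCC/Clang __builtin_prefetch(); struct object and freelist_pop() are illustrative names and the layout is not SLUB's.

#include <stdio.h>

struct object {
        struct object *next;    /* freelist link, like SLUB's free pointer */
        char payload[56];
};

/*
 * Pop the head of a freelist and prefetch the element that will be handed
 * out next, so its cache line is (hopefully) warm by the time the next
 * allocation runs.
 */
static struct object *freelist_pop(struct object **head)
{
        struct object *obj = *head;

        if (!obj)
                return NULL;

        *head = obj->next;
        if (obj->next)
                __builtin_prefetch(obj->next);  /* GCC/Clang builtin */
        return obj;
}

int main(void)
{
        enum { N = 4 };
        struct object pool[N];
        struct object *head = &pool[0];
        int i;

        /* Thread the pool into a freelist. */
        for (i = 0; i < N - 1; i++)
                pool[i].next = &pool[i + 1];
        pool[N - 1].next = NULL;

        while (head) {
                struct object *obj = freelist_pop(&head);
                printf("allocated object %td\n", obj - pool);
        }
        return 0;
}

Prefetching obj->next here only approximates what prefetch_freepointer() does with object + s->offset: touch the next object's link early so the following pop does not stall on it.
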
diff --git a/mm/sparse.c b/mm/sparse.c
index 61d7cde23111..a8bc7d364deb 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -353,29 +353,21 @@ static void __init sparse_early_usemaps_alloc_node(unsigned long**usemap_map,
353 353
354 usemap = sparse_early_usemaps_alloc_pgdat_section(NODE_DATA(nodeid), 354 usemap = sparse_early_usemaps_alloc_pgdat_section(NODE_DATA(nodeid),
355 usemap_count); 355 usemap_count);
356 if (usemap) { 356 if (!usemap) {
357 for (pnum = pnum_begin; pnum < pnum_end; pnum++) { 357 usemap = alloc_bootmem_node(NODE_DATA(nodeid), size * usemap_count);
358 if (!present_section_nr(pnum)) 358 if (!usemap) {
359 continue; 359 printk(KERN_WARNING "%s: allocation failed\n", __func__);
360 usemap_map[pnum] = usemap; 360 return;
361 usemap += size;
362 } 361 }
363 return;
364 } 362 }
365 363
366 usemap = alloc_bootmem_node(NODE_DATA(nodeid), size * usemap_count); 364 for (pnum = pnum_begin; pnum < pnum_end; pnum++) {
367 if (usemap) { 365 if (!present_section_nr(pnum))
368 for (pnum = pnum_begin; pnum < pnum_end; pnum++) { 366 continue;
369 if (!present_section_nr(pnum)) 367 usemap_map[pnum] = usemap;
370 continue; 368 usemap += size;
371 usemap_map[pnum] = usemap; 369 check_usemap_section_nr(nodeid, usemap_map[pnum]);
372 usemap += size;
373 check_usemap_section_nr(nodeid, usemap_map[pnum]);
374 }
375 return;
376 } 370 }
377
378 printk(KERN_WARNING "%s: allocation failed\n", __func__);
379} 371}
380 372
381#ifndef CONFIG_SPARSEMEM_VMEMMAP 373#ifndef CONFIG_SPARSEMEM_VMEMMAP
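The sparse.c hunk above is a control-flow refactor: try the per-pgdat-section allocation first, fall back to alloc_bootmem_node(), bail out with a single warning on failure, and keep one loop that carves the buffer into per-section usemaps. A small stand-alone sketch of that shape, with preferred_alloc()/fallback_alloc() as hypothetical stand-ins for the bootmem allocators.

#include <stdio.h>
#include <stdlib.h>

static void *preferred_alloc(size_t size)
{
	(void)size;		/* pretend the preferred pool is exhausted */
	return NULL;
}

static void *fallback_alloc(size_t size)
{
	return calloc(1, size);
}

int main(void)
{
	enum { SECTIONS = 4, SIZE = 32 };
	char *usemap_map[SECTIONS] = { NULL };
	char *usemap = preferred_alloc((size_t)SIZE * SECTIONS);

	if (!usemap) {
		usemap = fallback_alloc((size_t)SIZE * SECTIONS);
		if (!usemap) {
			fprintf(stderr, "%s: allocation failed\n", __func__);
			return 1;
		}
	}

	for (int pnum = 0; pnum < SECTIONS; pnum++) {
		usemap_map[pnum] = usemap;	/* one slice per present section */
		usemap += SIZE;
	}
	printf("first slice at %p, last at %p\n",
	       (void *)usemap_map[0], (void *)usemap_map[SECTIONS - 1]);
	free(usemap_map[0]);
	return 0;
}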
diff --git a/mm/swap.c b/mm/swap.c
index fff1ff7fb9ad..5c13f1338972 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -496,7 +496,7 @@ static void lru_deactivate_fn(struct page *page, void *arg)
496 * Either "cpu" is the current CPU, and preemption has already been 496 * Either "cpu" is the current CPU, and preemption has already been
497 * disabled; or "cpu" is being hot-unplugged, and is already dead. 497 * disabled; or "cpu" is being hot-unplugged, and is already dead.
498 */ 498 */
499static void drain_cpu_pagevecs(int cpu) 499void lru_add_drain_cpu(int cpu)
500{ 500{
501 struct pagevec *pvecs = per_cpu(lru_add_pvecs, cpu); 501 struct pagevec *pvecs = per_cpu(lru_add_pvecs, cpu);
502 struct pagevec *pvec; 502 struct pagevec *pvec;
@@ -553,7 +553,7 @@ void deactivate_page(struct page *page)
553 553
554void lru_add_drain(void) 554void lru_add_drain(void)
555{ 555{
556 drain_cpu_pagevecs(get_cpu()); 556 lru_add_drain_cpu(get_cpu());
557 put_cpu(); 557 put_cpu();
558} 558}
559 559
@@ -652,7 +652,7 @@ EXPORT_SYMBOL(__pagevec_release);
652void lru_add_page_tail(struct zone* zone, 652void lru_add_page_tail(struct zone* zone,
653 struct page *page, struct page *page_tail) 653 struct page *page, struct page *page_tail)
654{ 654{
655 int active; 655 int uninitialized_var(active);
656 enum lru_list lru; 656 enum lru_list lru;
657 const int file = 0; 657 const int file = 0;
658 658
@@ -672,7 +672,6 @@ void lru_add_page_tail(struct zone* zone,
672 active = 0; 672 active = 0;
673 lru = LRU_INACTIVE_ANON; 673 lru = LRU_INACTIVE_ANON;
674 } 674 }
675 update_page_reclaim_stat(zone, page_tail, file, active);
676 } else { 675 } else {
677 SetPageUnevictable(page_tail); 676 SetPageUnevictable(page_tail);
678 lru = LRU_UNEVICTABLE; 677 lru = LRU_UNEVICTABLE;
@@ -693,6 +692,9 @@ void lru_add_page_tail(struct zone* zone,
693 list_head = page_tail->lru.prev; 692 list_head = page_tail->lru.prev;
694 list_move_tail(&page_tail->lru, list_head); 693 list_move_tail(&page_tail->lru, list_head);
695 } 694 }
695
696 if (!PageUnevictable(page))
697 update_page_reclaim_stat(zone, page_tail, file, active);
696} 698}
697#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ 699#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
698 700
@@ -710,8 +712,8 @@ static void __pagevec_lru_add_fn(struct page *page, void *arg)
710 SetPageLRU(page); 712 SetPageLRU(page);
711 if (active) 713 if (active)
712 SetPageActive(page); 714 SetPageActive(page);
713 update_page_reclaim_stat(zone, page, file, active);
714 add_page_to_lru_list(zone, page, lru); 715 add_page_to_lru_list(zone, page, lru);
716 update_page_reclaim_stat(zone, page, file, active);
715} 717}
716 718
717/* 719/*
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 470038a91873..9d3dd3763cf7 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -300,16 +300,6 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
300 new_page = alloc_page_vma(gfp_mask, vma, addr); 300 new_page = alloc_page_vma(gfp_mask, vma, addr);
301 if (!new_page) 301 if (!new_page)
302 break; /* Out of memory */ 302 break; /* Out of memory */
303 /*
304 * The memcg-specific accounting when moving
305 * pages around the LRU lists relies on the
306 * page's owner (memcg) to be valid. Usually,
307 * pages are assigned to a new owner before
308 * being put on the LRU list, but since this
309 * is not the case here, the stale owner from
310 * a previous allocation cycle must be reset.
311 */
312 mem_cgroup_reset_owner(new_page);
313 } 303 }
314 304
315 /* 305 /*
@@ -382,25 +372,23 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
382struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask, 372struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask,
383 struct vm_area_struct *vma, unsigned long addr) 373 struct vm_area_struct *vma, unsigned long addr)
384{ 374{
385 int nr_pages;
386 struct page *page; 375 struct page *page;
387 unsigned long offset; 376 unsigned long offset = swp_offset(entry);
388 unsigned long end_offset; 377 unsigned long start_offset, end_offset;
378 unsigned long mask = (1UL << page_cluster) - 1;
389 379
390 /* 380 /* Read a page_cluster sized and aligned cluster around offset. */
391 * Get starting offset for readaround, and number of pages to read. 381 start_offset = offset & ~mask;
392 * Adjust starting address by readbehind (for NUMA interleave case)? 382 end_offset = offset | mask;
393 * No, it's very unlikely that swap layout would follow vma layout, 383 if (!start_offset) /* First page is swap header. */
394 * more likely that neighbouring swap pages came from the same node: 384 start_offset++;
395 * so use the same "addr" to choose the same node for each swap read. 385
396 */ 386 for (offset = start_offset; offset <= end_offset ; offset++) {
397 nr_pages = valid_swaphandles(entry, &offset);
398 for (end_offset = offset + nr_pages; offset < end_offset; offset++) {
399 /* Ok, do the async read-ahead now */ 387 /* Ok, do the async read-ahead now */
400 page = read_swap_cache_async(swp_entry(swp_type(entry), offset), 388 page = read_swap_cache_async(swp_entry(swp_type(entry), offset),
401 gfp_mask, vma, addr); 389 gfp_mask, vma, addr);
402 if (!page) 390 if (!page)
403 break; 391 continue;
404 page_cache_release(page); 392 page_cache_release(page);
405 } 393 }
406 lru_add_drain(); /* Push any new pages onto the LRU now */ 394 lru_add_drain(); /* Push any new pages onto the LRU now */
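swapin_readahead() now computes its readahead window with plain mask arithmetic instead of calling valid_swaphandles(): read a page_cluster-aligned cluster around the faulting swap offset and skip slot 0, which holds the swap header. A small sketch of just that arithmetic, assuming page_cluster = 3 (8-page clusters).

#include <stdio.h>

int main(void)
{
	unsigned int page_cluster = 3;
	unsigned long mask = (1UL << page_cluster) - 1;
	unsigned long offsets[] = { 1, 5, 8, 21 };

	for (unsigned int i = 0; i < sizeof(offsets) / sizeof(offsets[0]); i++) {
		unsigned long offset = offsets[i];
		unsigned long start_offset = offset & ~mask;
		unsigned long end_offset = offset | mask;

		if (!start_offset)	/* first page is the swap header */
			start_offset++;
		printf("fault at slot %2lu -> read slots [%2lu, %2lu]\n",
		       offset, start_offset, end_offset);
	}
	return 0;
}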
diff --git a/mm/swapfile.c b/mm/swapfile.c
index d999f090dfda..fafc26d1b1dc 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -932,9 +932,7 @@ static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud,
932 pmd = pmd_offset(pud, addr); 932 pmd = pmd_offset(pud, addr);
933 do { 933 do {
934 next = pmd_addr_end(addr, end); 934 next = pmd_addr_end(addr, end);
935 if (unlikely(pmd_trans_huge(*pmd))) 935 if (pmd_none_or_trans_huge_or_clear_bad(pmd))
936 continue;
937 if (pmd_none_or_clear_bad(pmd))
938 continue; 936 continue;
939 ret = unuse_pte_range(vma, pmd, addr, next, entry, page); 937 ret = unuse_pte_range(vma, pmd, addr, next, entry, page);
940 if (ret) 938 if (ret)
@@ -1563,6 +1561,8 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
1563 if (!capable(CAP_SYS_ADMIN)) 1561 if (!capable(CAP_SYS_ADMIN))
1564 return -EPERM; 1562 return -EPERM;
1565 1563
1564 BUG_ON(!current->mm);
1565
1566 pathname = getname(specialfile); 1566 pathname = getname(specialfile);
1567 err = PTR_ERR(pathname); 1567 err = PTR_ERR(pathname);
1568 if (IS_ERR(pathname)) 1568 if (IS_ERR(pathname))
@@ -1590,7 +1590,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
1590 spin_unlock(&swap_lock); 1590 spin_unlock(&swap_lock);
1591 goto out_dput; 1591 goto out_dput;
1592 } 1592 }
1593 if (!security_vm_enough_memory(p->pages)) 1593 if (!security_vm_enough_memory_mm(current->mm, p->pages))
1594 vm_unacct_memory(p->pages); 1594 vm_unacct_memory(p->pages);
1595 else { 1595 else {
1596 err = -ENOMEM; 1596 err = -ENOMEM;
@@ -2022,6 +2022,9 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
2022 struct page *page = NULL; 2022 struct page *page = NULL;
2023 struct inode *inode = NULL; 2023 struct inode *inode = NULL;
2024 2024
2025 if (swap_flags & ~SWAP_FLAGS_VALID)
2026 return -EINVAL;
2027
2025 if (!capable(CAP_SYS_ADMIN)) 2028 if (!capable(CAP_SYS_ADMIN))
2026 return -EPERM; 2029 return -EPERM;
2027 2030
@@ -2105,7 +2108,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
2105 p->flags |= SWP_SOLIDSTATE; 2108 p->flags |= SWP_SOLIDSTATE;
2106 p->cluster_next = 1 + (random32() % p->highest_bit); 2109 p->cluster_next = 1 + (random32() % p->highest_bit);
2107 } 2110 }
2108 if (discard_swap(p) == 0 && (swap_flags & SWAP_FLAG_DISCARD)) 2111 if ((swap_flags & SWAP_FLAG_DISCARD) && discard_swap(p) == 0)
2109 p->flags |= SWP_DISCARDABLE; 2112 p->flags |= SWP_DISCARDABLE;
2110 } 2113 }
2111 2114
@@ -2290,58 +2293,6 @@ int swapcache_prepare(swp_entry_t entry)
2290} 2293}
2291 2294
2292/* 2295/*
2293 * swap_lock prevents swap_map being freed. Don't grab an extra
2294 * reference on the swaphandle, it doesn't matter if it becomes unused.
2295 */
2296int valid_swaphandles(swp_entry_t entry, unsigned long *offset)
2297{
2298 struct swap_info_struct *si;
2299 int our_page_cluster = page_cluster;
2300 pgoff_t target, toff;
2301 pgoff_t base, end;
2302 int nr_pages = 0;
2303
2304 if (!our_page_cluster) /* no readahead */
2305 return 0;
2306
2307 si = swap_info[swp_type(entry)];
2308 target = swp_offset(entry);
2309 base = (target >> our_page_cluster) << our_page_cluster;
2310 end = base + (1 << our_page_cluster);
2311 if (!base) /* first page is swap header */
2312 base++;
2313
2314 spin_lock(&swap_lock);
2315 if (end > si->max) /* don't go beyond end of map */
2316 end = si->max;
2317
2318 /* Count contiguous allocated slots above our target */
2319 for (toff = target; ++toff < end; nr_pages++) {
2320 /* Don't read in free or bad pages */
2321 if (!si->swap_map[toff])
2322 break;
2323 if (swap_count(si->swap_map[toff]) == SWAP_MAP_BAD)
2324 break;
2325 }
2326 /* Count contiguous allocated slots below our target */
2327 for (toff = target; --toff >= base; nr_pages++) {
2328 /* Don't read in free or bad pages */
2329 if (!si->swap_map[toff])
2330 break;
2331 if (swap_count(si->swap_map[toff]) == SWAP_MAP_BAD)
2332 break;
2333 }
2334 spin_unlock(&swap_lock);
2335
2336 /*
2337 * Indicate starting offset, and return number of pages to get:
2338 * if only 1, say 0, since there's then no readahead to be done.
2339 */
2340 *offset = ++toff;
2341 return nr_pages? ++nr_pages: 0;
2342}
2343
2344/*
2345 * add_swap_count_continuation - called when a swap count is duplicated 2296 * add_swap_count_continuation - called when a swap count is duplicated
2346 * beyond SWAP_MAP_MAX, it allocates a new page and links that to the entry's 2297 * beyond SWAP_MAP_MAX, it allocates a new page and links that to the entry's
2347 * page of the original vmalloc'ed swap_map, to hold the continuation count 2298 * page of the original vmalloc'ed swap_map, to hold the continuation count
@@ -2427,9 +2378,9 @@ int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask)
2427 if (!(count & COUNT_CONTINUED)) 2378 if (!(count & COUNT_CONTINUED))
2428 goto out; 2379 goto out;
2429 2380
2430 map = kmap_atomic(list_page, KM_USER0) + offset; 2381 map = kmap_atomic(list_page) + offset;
2431 count = *map; 2382 count = *map;
2432 kunmap_atomic(map, KM_USER0); 2383 kunmap_atomic(map);
2433 2384
2434 /* 2385 /*
2435 * If this continuation count now has some space in it, 2386 * If this continuation count now has some space in it,
@@ -2472,7 +2423,7 @@ static bool swap_count_continued(struct swap_info_struct *si,
2472 2423
2473 offset &= ~PAGE_MASK; 2424 offset &= ~PAGE_MASK;
2474 page = list_entry(head->lru.next, struct page, lru); 2425 page = list_entry(head->lru.next, struct page, lru);
2475 map = kmap_atomic(page, KM_USER0) + offset; 2426 map = kmap_atomic(page) + offset;
2476 2427
2477 if (count == SWAP_MAP_MAX) /* initial increment from swap_map */ 2428 if (count == SWAP_MAP_MAX) /* initial increment from swap_map */
2478 goto init_map; /* jump over SWAP_CONT_MAX checks */ 2429 goto init_map; /* jump over SWAP_CONT_MAX checks */
@@ -2482,26 +2433,26 @@ static bool swap_count_continued(struct swap_info_struct *si,
2482 * Think of how you add 1 to 999 2433 * Think of how you add 1 to 999
2483 */ 2434 */
2484 while (*map == (SWAP_CONT_MAX | COUNT_CONTINUED)) { 2435 while (*map == (SWAP_CONT_MAX | COUNT_CONTINUED)) {
2485 kunmap_atomic(map, KM_USER0); 2436 kunmap_atomic(map);
2486 page = list_entry(page->lru.next, struct page, lru); 2437 page = list_entry(page->lru.next, struct page, lru);
2487 BUG_ON(page == head); 2438 BUG_ON(page == head);
2488 map = kmap_atomic(page, KM_USER0) + offset; 2439 map = kmap_atomic(page) + offset;
2489 } 2440 }
2490 if (*map == SWAP_CONT_MAX) { 2441 if (*map == SWAP_CONT_MAX) {
2491 kunmap_atomic(map, KM_USER0); 2442 kunmap_atomic(map);
2492 page = list_entry(page->lru.next, struct page, lru); 2443 page = list_entry(page->lru.next, struct page, lru);
2493 if (page == head) 2444 if (page == head)
2494 return false; /* add count continuation */ 2445 return false; /* add count continuation */
2495 map = kmap_atomic(page, KM_USER0) + offset; 2446 map = kmap_atomic(page) + offset;
2496init_map: *map = 0; /* we didn't zero the page */ 2447init_map: *map = 0; /* we didn't zero the page */
2497 } 2448 }
2498 *map += 1; 2449 *map += 1;
2499 kunmap_atomic(map, KM_USER0); 2450 kunmap_atomic(map);
2500 page = list_entry(page->lru.prev, struct page, lru); 2451 page = list_entry(page->lru.prev, struct page, lru);
2501 while (page != head) { 2452 while (page != head) {
2502 map = kmap_atomic(page, KM_USER0) + offset; 2453 map = kmap_atomic(page) + offset;
2503 *map = COUNT_CONTINUED; 2454 *map = COUNT_CONTINUED;
2504 kunmap_atomic(map, KM_USER0); 2455 kunmap_atomic(map);
2505 page = list_entry(page->lru.prev, struct page, lru); 2456 page = list_entry(page->lru.prev, struct page, lru);
2506 } 2457 }
2507 return true; /* incremented */ 2458 return true; /* incremented */
@@ -2512,22 +2463,22 @@ init_map: *map = 0; /* we didn't zero the page */
2512 */ 2463 */
2513 BUG_ON(count != COUNT_CONTINUED); 2464 BUG_ON(count != COUNT_CONTINUED);
2514 while (*map == COUNT_CONTINUED) { 2465 while (*map == COUNT_CONTINUED) {
2515 kunmap_atomic(map, KM_USER0); 2466 kunmap_atomic(map);
2516 page = list_entry(page->lru.next, struct page, lru); 2467 page = list_entry(page->lru.next, struct page, lru);
2517 BUG_ON(page == head); 2468 BUG_ON(page == head);
2518 map = kmap_atomic(page, KM_USER0) + offset; 2469 map = kmap_atomic(page) + offset;
2519 } 2470 }
2520 BUG_ON(*map == 0); 2471 BUG_ON(*map == 0);
2521 *map -= 1; 2472 *map -= 1;
2522 if (*map == 0) 2473 if (*map == 0)
2523 count = 0; 2474 count = 0;
2524 kunmap_atomic(map, KM_USER0); 2475 kunmap_atomic(map);
2525 page = list_entry(page->lru.prev, struct page, lru); 2476 page = list_entry(page->lru.prev, struct page, lru);
2526 while (page != head) { 2477 while (page != head) {
2527 map = kmap_atomic(page, KM_USER0) + offset; 2478 map = kmap_atomic(page) + offset;
2528 *map = SWAP_CONT_MAX | count; 2479 *map = SWAP_CONT_MAX | count;
2529 count = COUNT_CONTINUED; 2480 count = COUNT_CONTINUED;
2530 kunmap_atomic(map, KM_USER0); 2481 kunmap_atomic(map);
2531 page = list_entry(page->lru.prev, struct page, lru); 2482 page = list_entry(page->lru.prev, struct page, lru);
2532 } 2483 }
2533 return count == COUNT_CONTINUED; 2484 return count == COUNT_CONTINUED;
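The swapon() hunks above add two small hardening changes: unknown flag bits are rejected with -EINVAL before any work is done, and discard_swap() is only probed when SWAP_FLAG_DISCARD was actually requested. A user-space sketch of that flag handling; the flag constants and probe_discard() are illustrative stand-ins, not the kernel's definitions.

#include <stdio.h>
#include <errno.h>

#define FLAG_PREFER	0x8000
#define FLAG_DISCARD	0x10000
#define FLAGS_VALID	(FLAG_PREFER | FLAG_DISCARD)

static int probe_discard(void)
{
	puts("probing discard support (only when requested)");
	return 0;		/* 0 == supported, mirroring discard_swap() */
}

static int do_swapon(unsigned int flags)
{
	if (flags & ~FLAGS_VALID)
		return -EINVAL;
	if ((flags & FLAG_DISCARD) && probe_discard() == 0)
		puts("marking device discardable");
	return 0;
}

int main(void)
{
	printf("bogus flags    -> %d\n", do_swapon(0x1));
	printf("discard wanted -> %d\n", do_swapon(FLAG_DISCARD));
	printf("no discard     -> %d\n", do_swapon(FLAG_PREFER));
	return 0;
}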
diff --git a/mm/truncate.c b/mm/truncate.c
index 632b15e29f74..61a183b89df6 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -52,7 +52,7 @@ void do_invalidatepage(struct page *page, unsigned long offset)
52static inline void truncate_partial_page(struct page *page, unsigned partial) 52static inline void truncate_partial_page(struct page *page, unsigned partial)
53{ 53{
54 zero_user_segment(page, partial, PAGE_CACHE_SIZE); 54 zero_user_segment(page, partial, PAGE_CACHE_SIZE);
55 cleancache_flush_page(page->mapping, page); 55 cleancache_invalidate_page(page->mapping, page);
56 if (page_has_private(page)) 56 if (page_has_private(page))
57 do_invalidatepage(page, partial); 57 do_invalidatepage(page, partial);
58} 58}
@@ -184,7 +184,7 @@ int invalidate_inode_page(struct page *page)
184} 184}
185 185
186/** 186/**
187 * truncate_inode_pages - truncate range of pages specified by start & end byte offsets 187 * truncate_inode_pages_range - truncate range of pages specified by start & end byte offsets
188 * @mapping: mapping to truncate 188 * @mapping: mapping to truncate
189 * @lstart: offset from which to truncate 189 * @lstart: offset from which to truncate
190 * @lend: offset to which to truncate 190 * @lend: offset to which to truncate
@@ -213,7 +213,7 @@ void truncate_inode_pages_range(struct address_space *mapping,
213 pgoff_t end; 213 pgoff_t end;
214 int i; 214 int i;
215 215
216 cleancache_flush_inode(mapping); 216 cleancache_invalidate_inode(mapping);
217 if (mapping->nrpages == 0) 217 if (mapping->nrpages == 0)
218 return; 218 return;
219 219
@@ -292,7 +292,7 @@ void truncate_inode_pages_range(struct address_space *mapping,
292 mem_cgroup_uncharge_end(); 292 mem_cgroup_uncharge_end();
293 index++; 293 index++;
294 } 294 }
295 cleancache_flush_inode(mapping); 295 cleancache_invalidate_inode(mapping);
296} 296}
297EXPORT_SYMBOL(truncate_inode_pages_range); 297EXPORT_SYMBOL(truncate_inode_pages_range);
298 298
@@ -444,7 +444,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
444 int ret2 = 0; 444 int ret2 = 0;
445 int did_range_unmap = 0; 445 int did_range_unmap = 0;
446 446
447 cleancache_flush_inode(mapping); 447 cleancache_invalidate_inode(mapping);
448 pagevec_init(&pvec, 0); 448 pagevec_init(&pvec, 0);
449 index = start; 449 index = start;
450 while (index <= end && pagevec_lookup(&pvec, mapping, index, 450 while (index <= end && pagevec_lookup(&pvec, mapping, index,
@@ -500,7 +500,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
500 cond_resched(); 500 cond_resched();
501 index++; 501 index++;
502 } 502 }
503 cleancache_flush_inode(mapping); 503 cleancache_invalidate_inode(mapping);
504 return ret; 504 return ret;
505} 505}
506EXPORT_SYMBOL_GPL(invalidate_inode_pages2_range); 506EXPORT_SYMBOL_GPL(invalidate_inode_pages2_range);
@@ -626,3 +626,43 @@ int vmtruncate_range(struct inode *inode, loff_t lstart, loff_t lend)
626 626
627 return 0; 627 return 0;
628} 628}
629
630/**
631 * truncate_pagecache_range - unmap and remove pagecache that is hole-punched
632 * @inode: inode
633 * @lstart: offset of beginning of hole
634 * @lend: offset of last byte of hole
635 *
636 * This function should typically be called before the filesystem
637 * releases resources associated with the freed range (eg. deallocates
638 * blocks). This way, pagecache will always stay logically coherent
639 * with on-disk format, and the filesystem would not have to deal with
640 * situations such as writepage being called for a page that has already
641 * had its underlying blocks deallocated.
642 */
643void truncate_pagecache_range(struct inode *inode, loff_t lstart, loff_t lend)
644{
645 struct address_space *mapping = inode->i_mapping;
646 loff_t unmap_start = round_up(lstart, PAGE_SIZE);
647 loff_t unmap_end = round_down(1 + lend, PAGE_SIZE) - 1;
648 /*
649 * This rounding is currently just for example: unmap_mapping_range
650 * expands its hole outwards, whereas we want it to contract the hole
651 * inwards. However, existing callers of truncate_pagecache_range are
652 * doing their own page rounding first; and truncate_inode_pages_range
653 * currently BUGs if lend is not pagealigned-1 (it handles partial
654 * page at start of hole, but not partial page at end of hole). Note
655 * unmap_mapping_range allows holelen 0 for all, and we allow lend -1.
656 */
657
658 /*
659 * Unlike in truncate_pagecache, unmap_mapping_range is called only
660 * once (before truncating pagecache), and without "even_cows" flag:
661 * hole-punching should not remove private COWed pages from the hole.
662 */
663 if ((u64)unmap_end > (u64)unmap_start)
664 unmap_mapping_range(mapping, unmap_start,
665 1 + unmap_end - unmap_start, 0);
666 truncate_inode_pages_range(mapping, lstart, lend);
667}
668EXPORT_SYMBOL(truncate_pagecache_range);
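The new truncate_pagecache_range() contracts the unmap range inwards to whole pages (unmap_mapping_range() would otherwise expand the hole outwards) and only unmaps when at least one full page lies inside the hole. A stand-alone sketch of the rounding, assuming 4096-byte pages.

#include <stdio.h>
#include <stdint.h>

#define PAGE_SIZE	4096ULL

static uint64_t round_up_page(uint64_t x)   { return (x + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1); }
static uint64_t round_down_page(uint64_t x) { return x & ~(PAGE_SIZE - 1); }

int main(void)
{
	/* Hole from byte lstart to byte lend inclusive, not page aligned. */
	uint64_t lstart = 5000, lend = 20000;
	uint64_t unmap_start = round_up_page(lstart);
	uint64_t unmap_end = round_down_page(1 + lend) - 1;

	if (unmap_end > unmap_start)
		printf("unmap [%llu, %llu] (%llu bytes)\n",
		       (unsigned long long)unmap_start,
		       (unsigned long long)unmap_end,
		       (unsigned long long)(1 + unmap_end - unmap_start));
	else
		printf("hole smaller than a page: nothing to unmap\n");
	printf("truncate pagecache [%llu, %llu]\n",
	       (unsigned long long)lstart, (unsigned long long)lend);
	return 0;
}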
diff --git a/mm/util.c b/mm/util.c
index 136ac4f322b8..ae962b31de88 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -239,6 +239,47 @@ void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma,
239 next->vm_prev = vma; 239 next->vm_prev = vma;
240} 240}
241 241
242/* Check if the vma is being used as a stack by this task */
243static int vm_is_stack_for_task(struct task_struct *t,
244 struct vm_area_struct *vma)
245{
246 return (vma->vm_start <= KSTK_ESP(t) && vma->vm_end >= KSTK_ESP(t));
247}
248
249/*
250 * Check if the vma is being used as a stack.
251 * If is_group is non-zero, check in the entire thread group or else
252 * just check in the current task. Returns the pid of the task that
253 * the vma is stack for.
254 */
255pid_t vm_is_stack(struct task_struct *task,
256 struct vm_area_struct *vma, int in_group)
257{
258 pid_t ret = 0;
259
260 if (vm_is_stack_for_task(task, vma))
261 return task->pid;
262
263 if (in_group) {
264 struct task_struct *t;
265 rcu_read_lock();
266 if (!pid_alive(task))
267 goto done;
268
269 t = task;
270 do {
271 if (vm_is_stack_for_task(t, vma)) {
272 ret = t->pid;
273 goto done;
274 }
275 } while_each_thread(task, t);
276done:
277 rcu_read_unlock();
278 }
279
280 return ret;
281}
282
242#if defined(CONFIG_MMU) && !defined(HAVE_ARCH_PICK_MMAP_LAYOUT) 283#if defined(CONFIG_MMU) && !defined(HAVE_ARCH_PICK_MMAP_LAYOUT)
243void arch_pick_mmap_layout(struct mm_struct *mm) 284void arch_pick_mmap_layout(struct mm_struct *mm)
244{ 285{
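The new vm_is_stack() helper above reports which task in a thread group, if any, has its stack pointer inside a given VMA; the core test is a simple range check on KSTK_ESP(). A user-space sketch with an array of hypothetical tasks standing in for while_each_thread().

#include <stdio.h>

struct task { int pid; unsigned long sp; };
struct vma { unsigned long vm_start, vm_end; };

static int vm_is_stack_for_task(const struct task *t, const struct vma *vma)
{
	return vma->vm_start <= t->sp && t->sp <= vma->vm_end;
}

static int vm_is_stack(const struct task *tasks, int ntasks, const struct vma *vma)
{
	for (int i = 0; i < ntasks; i++)
		if (vm_is_stack_for_task(&tasks[i], vma))
			return tasks[i].pid;
	return 0;	/* 0 == no thread in the group uses this VMA as a stack */
}

int main(void)
{
	struct task group[] = { { 101, 0x1000 }, { 102, 0x7ffd0 }, { 103, 0x9000 } };
	struct vma stack_vma = { 0x7f000, 0x80000 };

	printf("stack VMA belongs to pid %d\n", vm_is_stack(group, 3, &stack_vma));
	return 0;
}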
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 86ce9a526c17..94dff883b449 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -1906,9 +1906,9 @@ static int aligned_vread(char *buf, char *addr, unsigned long count)
1906 * we can expect USER0 is not used (see vread/vwrite's 1906 * we can expect USER0 is not used (see vread/vwrite's
1907 * function description) 1907 * function description)
1908 */ 1908 */
1909 void *map = kmap_atomic(p, KM_USER0); 1909 void *map = kmap_atomic(p);
1910 memcpy(buf, map + offset, length); 1910 memcpy(buf, map + offset, length);
1911 kunmap_atomic(map, KM_USER0); 1911 kunmap_atomic(map);
1912 } else 1912 } else
1913 memset(buf, 0, length); 1913 memset(buf, 0, length);
1914 1914
@@ -1945,9 +1945,9 @@ static int aligned_vwrite(char *buf, char *addr, unsigned long count)
1945 * we can expect USER0 is not used (see vread/vwrite's 1945 * we can expect USER0 is not used (see vread/vwrite's
1946 * function description) 1946 * function description)
1947 */ 1947 */
1948 void *map = kmap_atomic(p, KM_USER0); 1948 void *map = kmap_atomic(p);
1949 memcpy(map + offset, buf, length); 1949 memcpy(map + offset, buf, length);
1950 kunmap_atomic(map, KM_USER0); 1950 kunmap_atomic(map);
1951 } 1951 }
1952 addr += length; 1952 addr += length;
1953 buf += length; 1953 buf += length;
diff --git a/mm/vmscan.c b/mm/vmscan.c
index c52b23552659..33c332bbab73 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1138,7 +1138,7 @@ int __isolate_lru_page(struct page *page, isolate_mode_t mode, int file)
1138 * @mz: The mem_cgroup_zone to pull pages from. 1138 * @mz: The mem_cgroup_zone to pull pages from.
1139 * @dst: The temp list to put pages on to. 1139 * @dst: The temp list to put pages on to.
1140 * @nr_scanned: The number of pages that were scanned. 1140 * @nr_scanned: The number of pages that were scanned.
1141 * @order: The caller's attempted allocation order 1141 * @sc: The scan_control struct for this reclaim session
1142 * @mode: One of the LRU isolation modes 1142 * @mode: One of the LRU isolation modes
1143 * @active: True [1] if isolating active pages 1143 * @active: True [1] if isolating active pages
1144 * @file: True [1] if isolating file [!anon] pages 1144 * @file: True [1] if isolating file [!anon] pages
@@ -1147,8 +1147,8 @@ int __isolate_lru_page(struct page *page, isolate_mode_t mode, int file)
1147 */ 1147 */
1148static unsigned long isolate_lru_pages(unsigned long nr_to_scan, 1148static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
1149 struct mem_cgroup_zone *mz, struct list_head *dst, 1149 struct mem_cgroup_zone *mz, struct list_head *dst,
1150 unsigned long *nr_scanned, int order, isolate_mode_t mode, 1150 unsigned long *nr_scanned, struct scan_control *sc,
1151 int active, int file) 1151 isolate_mode_t mode, int active, int file)
1152{ 1152{
1153 struct lruvec *lruvec; 1153 struct lruvec *lruvec;
1154 struct list_head *src; 1154 struct list_head *src;
@@ -1194,7 +1194,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
1194 BUG(); 1194 BUG();
1195 } 1195 }
1196 1196
1197 if (!order) 1197 if (!sc->order || !(sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM))
1198 continue; 1198 continue;
1199 1199
1200 /* 1200 /*
@@ -1208,8 +1208,8 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
1208 */ 1208 */
1209 zone_id = page_zone_id(page); 1209 zone_id = page_zone_id(page);
1210 page_pfn = page_to_pfn(page); 1210 page_pfn = page_to_pfn(page);
1211 pfn = page_pfn & ~((1 << order) - 1); 1211 pfn = page_pfn & ~((1 << sc->order) - 1);
1212 end_pfn = pfn + (1 << order); 1212 end_pfn = pfn + (1 << sc->order);
1213 for (; pfn < end_pfn; pfn++) { 1213 for (; pfn < end_pfn; pfn++) {
1214 struct page *cursor_page; 1214 struct page *cursor_page;
1215 1215
@@ -1275,7 +1275,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
1275 1275
1276 *nr_scanned = scan; 1276 *nr_scanned = scan;
1277 1277
1278 trace_mm_vmscan_lru_isolate(order, 1278 trace_mm_vmscan_lru_isolate(sc->order,
1279 nr_to_scan, scan, 1279 nr_to_scan, scan,
1280 nr_taken, 1280 nr_taken,
1281 nr_lumpy_taken, nr_lumpy_dirty, nr_lumpy_failed, 1281 nr_lumpy_taken, nr_lumpy_dirty, nr_lumpy_failed,
@@ -1413,7 +1413,6 @@ update_isolated_counts(struct mem_cgroup_zone *mz,
1413 unsigned long *nr_anon, 1413 unsigned long *nr_anon,
1414 unsigned long *nr_file) 1414 unsigned long *nr_file)
1415{ 1415{
1416 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(mz);
1417 struct zone *zone = mz->zone; 1416 struct zone *zone = mz->zone;
1418 unsigned int count[NR_LRU_LISTS] = { 0, }; 1417 unsigned int count[NR_LRU_LISTS] = { 0, };
1419 unsigned long nr_active = 0; 1418 unsigned long nr_active = 0;
@@ -1434,6 +1433,7 @@ update_isolated_counts(struct mem_cgroup_zone *mz,
1434 count[lru] += numpages; 1433 count[lru] += numpages;
1435 } 1434 }
1436 1435
1436 preempt_disable();
1437 __count_vm_events(PGDEACTIVATE, nr_active); 1437 __count_vm_events(PGDEACTIVATE, nr_active);
1438 1438
1439 __mod_zone_page_state(zone, NR_ACTIVE_FILE, 1439 __mod_zone_page_state(zone, NR_ACTIVE_FILE,
@@ -1448,8 +1448,9 @@ update_isolated_counts(struct mem_cgroup_zone *mz,
1448 *nr_anon = count[LRU_ACTIVE_ANON] + count[LRU_INACTIVE_ANON]; 1448 *nr_anon = count[LRU_ACTIVE_ANON] + count[LRU_INACTIVE_ANON];
1449 *nr_file = count[LRU_ACTIVE_FILE] + count[LRU_INACTIVE_FILE]; 1449 *nr_file = count[LRU_ACTIVE_FILE] + count[LRU_INACTIVE_FILE];
1450 1450
1451 reclaim_stat->recent_scanned[0] += *nr_anon; 1451 __mod_zone_page_state(zone, NR_ISOLATED_ANON, *nr_anon);
1452 reclaim_stat->recent_scanned[1] += *nr_file; 1452 __mod_zone_page_state(zone, NR_ISOLATED_FILE, *nr_file);
1453 preempt_enable();
1453} 1454}
1454 1455
1455/* 1456/*
@@ -1509,8 +1510,9 @@ shrink_inactive_list(unsigned long nr_to_scan, struct mem_cgroup_zone *mz,
1509 unsigned long nr_file; 1510 unsigned long nr_file;
1510 unsigned long nr_dirty = 0; 1511 unsigned long nr_dirty = 0;
1511 unsigned long nr_writeback = 0; 1512 unsigned long nr_writeback = 0;
1512 isolate_mode_t reclaim_mode = ISOLATE_INACTIVE; 1513 isolate_mode_t isolate_mode = ISOLATE_INACTIVE;
1513 struct zone *zone = mz->zone; 1514 struct zone *zone = mz->zone;
1515 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(mz);
1514 1516
1515 while (unlikely(too_many_isolated(zone, file, sc))) { 1517 while (unlikely(too_many_isolated(zone, file, sc))) {
1516 congestion_wait(BLK_RW_ASYNC, HZ/10); 1518 congestion_wait(BLK_RW_ASYNC, HZ/10);
@@ -1522,20 +1524,19 @@ shrink_inactive_list(unsigned long nr_to_scan, struct mem_cgroup_zone *mz,
1522 1524
1523 set_reclaim_mode(priority, sc, false); 1525 set_reclaim_mode(priority, sc, false);
1524 if (sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM) 1526 if (sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM)
1525 reclaim_mode |= ISOLATE_ACTIVE; 1527 isolate_mode |= ISOLATE_ACTIVE;
1526 1528
1527 lru_add_drain(); 1529 lru_add_drain();
1528 1530
1529 if (!sc->may_unmap) 1531 if (!sc->may_unmap)
1530 reclaim_mode |= ISOLATE_UNMAPPED; 1532 isolate_mode |= ISOLATE_UNMAPPED;
1531 if (!sc->may_writepage) 1533 if (!sc->may_writepage)
1532 reclaim_mode |= ISOLATE_CLEAN; 1534 isolate_mode |= ISOLATE_CLEAN;
1533 1535
1534 spin_lock_irq(&zone->lru_lock); 1536 spin_lock_irq(&zone->lru_lock);
1535 1537
1536 nr_taken = isolate_lru_pages(nr_to_scan, mz, &page_list, 1538 nr_taken = isolate_lru_pages(nr_to_scan, mz, &page_list, &nr_scanned,
1537 &nr_scanned, sc->order, 1539 sc, isolate_mode, 0, file);
1538 reclaim_mode, 0, file);
1539 if (global_reclaim(sc)) { 1540 if (global_reclaim(sc)) {
1540 zone->pages_scanned += nr_scanned; 1541 zone->pages_scanned += nr_scanned;
1541 if (current_is_kswapd()) 1542 if (current_is_kswapd())
@@ -1545,19 +1546,13 @@ shrink_inactive_list(unsigned long nr_to_scan, struct mem_cgroup_zone *mz,
1545 __count_zone_vm_events(PGSCAN_DIRECT, zone, 1546 __count_zone_vm_events(PGSCAN_DIRECT, zone,
1546 nr_scanned); 1547 nr_scanned);
1547 } 1548 }
1549 spin_unlock_irq(&zone->lru_lock);
1548 1550
1549 if (nr_taken == 0) { 1551 if (nr_taken == 0)
1550 spin_unlock_irq(&zone->lru_lock);
1551 return 0; 1552 return 0;
1552 }
1553 1553
1554 update_isolated_counts(mz, &page_list, &nr_anon, &nr_file); 1554 update_isolated_counts(mz, &page_list, &nr_anon, &nr_file);
1555 1555
1556 __mod_zone_page_state(zone, NR_ISOLATED_ANON, nr_anon);
1557 __mod_zone_page_state(zone, NR_ISOLATED_FILE, nr_file);
1558
1559 spin_unlock_irq(&zone->lru_lock);
1560
1561 nr_reclaimed = shrink_page_list(&page_list, mz, sc, priority, 1556 nr_reclaimed = shrink_page_list(&page_list, mz, sc, priority,
1562 &nr_dirty, &nr_writeback); 1557 &nr_dirty, &nr_writeback);
1563 1558
@@ -1570,6 +1565,9 @@ shrink_inactive_list(unsigned long nr_to_scan, struct mem_cgroup_zone *mz,
1570 1565
1571 spin_lock_irq(&zone->lru_lock); 1566 spin_lock_irq(&zone->lru_lock);
1572 1567
1568 reclaim_stat->recent_scanned[0] += nr_anon;
1569 reclaim_stat->recent_scanned[1] += nr_file;
1570
1573 if (current_is_kswapd()) 1571 if (current_is_kswapd())
1574 __count_vm_events(KSWAPD_STEAL, nr_reclaimed); 1572 __count_vm_events(KSWAPD_STEAL, nr_reclaimed);
1575 __count_zone_vm_events(PGSTEAL, zone, nr_reclaimed); 1573 __count_zone_vm_events(PGSTEAL, zone, nr_reclaimed);
@@ -1643,18 +1641,6 @@ static void move_active_pages_to_lru(struct zone *zone,
1643 unsigned long pgmoved = 0; 1641 unsigned long pgmoved = 0;
1644 struct page *page; 1642 struct page *page;
1645 1643
1646 if (buffer_heads_over_limit) {
1647 spin_unlock_irq(&zone->lru_lock);
1648 list_for_each_entry(page, list, lru) {
1649 if (page_has_private(page) && trylock_page(page)) {
1650 if (page_has_private(page))
1651 try_to_release_page(page, 0);
1652 unlock_page(page);
1653 }
1654 }
1655 spin_lock_irq(&zone->lru_lock);
1656 }
1657
1658 while (!list_empty(list)) { 1644 while (!list_empty(list)) {
1659 struct lruvec *lruvec; 1645 struct lruvec *lruvec;
1660 1646
@@ -1699,21 +1685,22 @@ static void shrink_active_list(unsigned long nr_to_scan,
1699 struct page *page; 1685 struct page *page;
1700 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(mz); 1686 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(mz);
1701 unsigned long nr_rotated = 0; 1687 unsigned long nr_rotated = 0;
1702 isolate_mode_t reclaim_mode = ISOLATE_ACTIVE; 1688 isolate_mode_t isolate_mode = ISOLATE_ACTIVE;
1703 struct zone *zone = mz->zone; 1689 struct zone *zone = mz->zone;
1704 1690
1705 lru_add_drain(); 1691 lru_add_drain();
1706 1692
1693 reset_reclaim_mode(sc);
1694
1707 if (!sc->may_unmap) 1695 if (!sc->may_unmap)
1708 reclaim_mode |= ISOLATE_UNMAPPED; 1696 isolate_mode |= ISOLATE_UNMAPPED;
1709 if (!sc->may_writepage) 1697 if (!sc->may_writepage)
1710 reclaim_mode |= ISOLATE_CLEAN; 1698 isolate_mode |= ISOLATE_CLEAN;
1711 1699
1712 spin_lock_irq(&zone->lru_lock); 1700 spin_lock_irq(&zone->lru_lock);
1713 1701
1714 nr_taken = isolate_lru_pages(nr_to_scan, mz, &l_hold, 1702 nr_taken = isolate_lru_pages(nr_to_scan, mz, &l_hold, &nr_scanned, sc,
1715 &nr_scanned, sc->order, 1703 isolate_mode, 1, file);
1716 reclaim_mode, 1, file);
1717 if (global_reclaim(sc)) 1704 if (global_reclaim(sc))
1718 zone->pages_scanned += nr_scanned; 1705 zone->pages_scanned += nr_scanned;
1719 1706
@@ -1737,6 +1724,14 @@ static void shrink_active_list(unsigned long nr_to_scan,
1737 continue; 1724 continue;
1738 } 1725 }
1739 1726
1727 if (unlikely(buffer_heads_over_limit)) {
1728 if (page_has_private(page) && trylock_page(page)) {
1729 if (page_has_private(page))
1730 try_to_release_page(page, 0);
1731 unlock_page(page);
1732 }
1733 }
1734
1740 if (page_referenced(page, 0, mz->mem_cgroup, &vm_flags)) { 1735 if (page_referenced(page, 0, mz->mem_cgroup, &vm_flags)) {
1741 nr_rotated += hpage_nr_pages(page); 1736 nr_rotated += hpage_nr_pages(page);
1742 /* 1737 /*
@@ -2112,7 +2107,12 @@ restart:
2112 * with multiple processes reclaiming pages, the total 2107 * with multiple processes reclaiming pages, the total
2113 * freeing target can get unreasonably large. 2108 * freeing target can get unreasonably large.
2114 */ 2109 */
2115 if (nr_reclaimed >= nr_to_reclaim && priority < DEF_PRIORITY) 2110 if (nr_reclaimed >= nr_to_reclaim)
2111 nr_to_reclaim = 0;
2112 else
2113 nr_to_reclaim -= nr_reclaimed;
2114
2115 if (!nr_to_reclaim && priority < DEF_PRIORITY)
2116 break; 2116 break;
2117 } 2117 }
2118 blk_finish_plug(&plug); 2118 blk_finish_plug(&plug);
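The hunk above changes the reclaim-target bookkeeping: pages reclaimed so far are subtracted from nr_to_reclaim (clamped at zero), so when several memcgs share one reclaim pass the remaining target shrinks rather than each iteration being measured against the full amount. A sketch of that accounting with made-up per-round numbers.

#include <stdio.h>

int main(void)
{
	unsigned long nr_to_reclaim = 32;	/* SWAP_CLUSTER_MAX-style target */
	unsigned long rounds[] = { 10, 15, 20 };
	const int priority = 5, def_priority = 12;

	for (unsigned int i = 0; i < sizeof(rounds) / sizeof(rounds[0]); i++) {
		unsigned long nr_reclaimed = rounds[i];

		if (nr_reclaimed >= nr_to_reclaim)
			nr_to_reclaim = 0;
		else
			nr_to_reclaim -= nr_reclaimed;

		printf("round %u: reclaimed %lu, %lu still to go\n",
		       i, nr_reclaimed, nr_to_reclaim);
		if (!nr_to_reclaim && priority < def_priority)
			break;
	}
	return 0;
}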
@@ -2195,7 +2195,7 @@ static inline bool compaction_ready(struct zone *zone, struct scan_control *sc)
2195 * If compaction is deferred, reclaim up to a point where 2195 * If compaction is deferred, reclaim up to a point where
2196 * compaction will have a chance of success when re-enabled 2196 * compaction will have a chance of success when re-enabled
2197 */ 2197 */
2198 if (compaction_deferred(zone)) 2198 if (compaction_deferred(zone, sc->order))
2199 return watermark_ok; 2199 return watermark_ok;
2200 2200
2201 /* If compaction is not ready to start, keep reclaiming */ 2201 /* If compaction is not ready to start, keep reclaiming */
@@ -2235,6 +2235,14 @@ static bool shrink_zones(int priority, struct zonelist *zonelist,
2235 unsigned long nr_soft_scanned; 2235 unsigned long nr_soft_scanned;
2236 bool aborted_reclaim = false; 2236 bool aborted_reclaim = false;
2237 2237
2238 /*
2239 * If the number of buffer_heads in the machine exceeds the maximum
2240 * allowed level, force direct reclaim to scan the highmem zone as
2241 * highmem pages could be pinning lowmem pages storing buffer_heads
2242 */
2243 if (buffer_heads_over_limit)
2244 sc->gfp_mask |= __GFP_HIGHMEM;
2245
2238 for_each_zone_zonelist_nodemask(zone, z, zonelist, 2246 for_each_zone_zonelist_nodemask(zone, z, zonelist,
2239 gfp_zone(sc->gfp_mask), sc->nodemask) { 2247 gfp_zone(sc->gfp_mask), sc->nodemask) {
2240 if (!populated_zone(zone)) 2248 if (!populated_zone(zone))
@@ -2255,8 +2263,8 @@ static bool shrink_zones(int priority, struct zonelist *zonelist,
2255 * Even though compaction is invoked for any 2263 * Even though compaction is invoked for any
2256 * non-zero order, only frequent costly order 2264 * non-zero order, only frequent costly order
2257 * reclamation is disruptive enough to become a 2265 * reclamation is disruptive enough to become a
2258 * noticable problem, like transparent huge page 2266 * noticeable problem, like transparent huge
2259 * allocations. 2267 * page allocations.
2260 */ 2268 */
2261 if (compaction_ready(zone, sc)) { 2269 if (compaction_ready(zone, sc)) {
2262 aborted_reclaim = true; 2270 aborted_reclaim = true;
@@ -2337,7 +2345,6 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
2337 unsigned long writeback_threshold; 2345 unsigned long writeback_threshold;
2338 bool aborted_reclaim; 2346 bool aborted_reclaim;
2339 2347
2340 get_mems_allowed();
2341 delayacct_freepages_start(); 2348 delayacct_freepages_start();
2342 2349
2343 if (global_reclaim(sc)) 2350 if (global_reclaim(sc))
@@ -2401,7 +2408,6 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
2401 2408
2402out: 2409out:
2403 delayacct_freepages_end(); 2410 delayacct_freepages_end();
2404 put_mems_allowed();
2405 2411
2406 if (sc->nr_reclaimed) 2412 if (sc->nr_reclaimed)
2407 return sc->nr_reclaimed; 2413 return sc->nr_reclaimed;
@@ -2724,6 +2730,17 @@ loop_again:
2724 */ 2730 */
2725 age_active_anon(zone, &sc, priority); 2731 age_active_anon(zone, &sc, priority);
2726 2732
2733 /*
2734 * If the number of buffer_heads in the machine
2735 * exceeds the maximum allowed level and this node
2736 * has a highmem zone, force kswapd to reclaim from
2737 * it to relieve lowmem pressure.
2738 */
2739 if (buffer_heads_over_limit && is_highmem_idx(i)) {
2740 end_zone = i;
2741 break;
2742 }
2743
2727 if (!zone_watermark_ok_safe(zone, order, 2744 if (!zone_watermark_ok_safe(zone, order,
2728 high_wmark_pages(zone), 0, 0)) { 2745 high_wmark_pages(zone), 0, 0)) {
2729 end_zone = i; 2746 end_zone = i;
@@ -2753,7 +2770,7 @@ loop_again:
2753 */ 2770 */
2754 for (i = 0; i <= end_zone; i++) { 2771 for (i = 0; i <= end_zone; i++) {
2755 struct zone *zone = pgdat->node_zones + i; 2772 struct zone *zone = pgdat->node_zones + i;
2756 int nr_slab; 2773 int nr_slab, testorder;
2757 unsigned long balance_gap; 2774 unsigned long balance_gap;
2758 2775
2759 if (!populated_zone(zone)) 2776 if (!populated_zone(zone))
@@ -2786,7 +2803,21 @@ loop_again:
2786 (zone->present_pages + 2803 (zone->present_pages +
2787 KSWAPD_ZONE_BALANCE_GAP_RATIO-1) / 2804 KSWAPD_ZONE_BALANCE_GAP_RATIO-1) /
2788 KSWAPD_ZONE_BALANCE_GAP_RATIO); 2805 KSWAPD_ZONE_BALANCE_GAP_RATIO);
2789 if (!zone_watermark_ok_safe(zone, order, 2806 /*
2807 * Kswapd reclaims only single pages with compaction
2808 * enabled. Trying too hard to reclaim until contiguous
2809 * free pages have become available can hurt performance
2810 * by evicting too much useful data from memory.
2811 * Do not reclaim more than needed for compaction.
2812 */
2813 testorder = order;
2814 if (COMPACTION_BUILD && order &&
2815 compaction_suitable(zone, order) !=
2816 COMPACT_SKIPPED)
2817 testorder = 0;
2818
2819 if ((buffer_heads_over_limit && is_highmem_idx(i)) ||
2820 !zone_watermark_ok_safe(zone, testorder,
2790 high_wmark_pages(zone) + balance_gap, 2821 high_wmark_pages(zone) + balance_gap,
2791 end_zone, 0)) { 2822 end_zone, 0)) {
2792 shrink_zone(priority, zone, &sc); 2823 shrink_zone(priority, zone, &sc);
@@ -2815,7 +2846,7 @@ loop_again:
2815 continue; 2846 continue;
2816 } 2847 }
2817 2848
2818 if (!zone_watermark_ok_safe(zone, order, 2849 if (!zone_watermark_ok_safe(zone, testorder,
2819 high_wmark_pages(zone), end_zone, 0)) { 2850 high_wmark_pages(zone), end_zone, 0)) {
2820 all_zones_ok = 0; 2851 all_zones_ok = 0;
2821 /* 2852 /*
@@ -2903,6 +2934,8 @@ out:
2903 * and it is potentially going to sleep here. 2934 * and it is potentially going to sleep here.
2904 */ 2935 */
2905 if (order) { 2936 if (order) {
2937 int zones_need_compaction = 1;
2938
2906 for (i = 0; i <= end_zone; i++) { 2939 for (i = 0; i <= end_zone; i++) {
2907 struct zone *zone = pgdat->node_zones + i; 2940 struct zone *zone = pgdat->node_zones + i;
2908 2941
@@ -2912,6 +2945,11 @@ out:
2912 if (zone->all_unreclaimable && priority != DEF_PRIORITY) 2945 if (zone->all_unreclaimable && priority != DEF_PRIORITY)
2913 continue; 2946 continue;
2914 2947
2948 /* Would compaction fail due to lack of free memory? */
2949 if (COMPACTION_BUILD &&
2950 compaction_suitable(zone, order) == COMPACT_SKIPPED)
2951 goto loop_again;
2952
2915 /* Confirm the zone is balanced for order-0 */ 2953 /* Confirm the zone is balanced for order-0 */
2916 if (!zone_watermark_ok(zone, 0, 2954 if (!zone_watermark_ok(zone, 0,
2917 high_wmark_pages(zone), 0, 0)) { 2955 high_wmark_pages(zone), 0, 0)) {
@@ -2919,11 +2957,17 @@ out:
2919 goto loop_again; 2957 goto loop_again;
2920 } 2958 }
2921 2959
2960 /* Check if the memory needs to be defragmented. */
2961 if (zone_watermark_ok(zone, order,
2962 low_wmark_pages(zone), *classzone_idx, 0))
2963 zones_need_compaction = 0;
2964
2922 /* If balanced, clear the congested flag */ 2965 /* If balanced, clear the congested flag */
2923 zone_clear_flag(zone, ZONE_CONGESTED); 2966 zone_clear_flag(zone, ZONE_CONGESTED);
2924 if (i <= *classzone_idx)
2925 balanced += zone->present_pages;
2926 } 2967 }
2968
2969 if (zones_need_compaction)
2970 compact_pgdat(pgdat, order);
2927 } 2971 }
2928 2972
2929 /* 2973 /*