Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig            |  27
-rw-r--r--  mm/backing-dev.c      |  11
-rw-r--r--  mm/bootmem.c          |  32
-rw-r--r--  mm/bounce.c           |  48
-rw-r--r--  mm/cleancache.c       |   2
-rw-r--r--  mm/compaction.c       | 133
-rw-r--r--  mm/fadvise.c          |  20
-rw-r--r--  mm/filemap.c          |   7
-rw-r--r--  mm/fremap.c           |  51
-rw-r--r--  mm/huge_memory.c      | 113
-rw-r--r--  mm/hugetlb.c          |  39
-rw-r--r--  mm/internal.h         |   7
-rw-r--r--  mm/kmemleak.c         |  14
-rw-r--r--  mm/ksm.c              | 670
-rw-r--r--  mm/madvise.c          | 105
-rw-r--r--  mm/memblock.c         |  70
-rw-r--r--  mm/memcontrol.c       | 477
-rw-r--r--  mm/memory-failure.c   | 202
-rw-r--r--  mm/memory.c           | 127
-rw-r--r--  mm/memory_hotplug.c   | 553
-rw-r--r--  mm/mempolicy.c        |  59
-rw-r--r--  mm/migrate.c          | 154
-rw-r--r--  mm/mincore.c          |   5
-rw-r--r--  mm/mlock.c            | 137
-rw-r--r--  mm/mm_init.c          |  31
-rw-r--r--  mm/mmap.c             | 123
-rw-r--r--  mm/mmu_notifier.c     | 102
-rw-r--r--  mm/mmzone.c           |  20
-rw-r--r--  mm/mremap.c           |  28
-rw-r--r--  mm/nobootmem.c        |  23
-rw-r--r--  mm/nommu.c            |  41
-rw-r--r--  mm/oom_kill.c         |   6
-rw-r--r--  mm/page-writeback.c   |  28
-rw-r--r--  mm/page_alloc.c       | 498
-rw-r--r--  mm/rmap.c             |  30
-rw-r--r--  mm/shmem.c            | 102
-rw-r--r--  mm/slab.c             |   2
-rw-r--r--  mm/slob.c             |   2
-rw-r--r--  mm/slub.c             |   4
-rw-r--r--  mm/sparse.c           |  12
-rw-r--r--  mm/swap.c             |   9
-rw-r--r--  mm/swap_state.c       |  58
-rw-r--r--  mm/swapfile.c         | 176
-rw-r--r--  mm/util.c             |  26
-rw-r--r--  mm/vmalloc.c          |  33
-rw-r--r--  mm/vmscan.c           | 397
-rw-r--r--  mm/vmstat.c           |   7
47 files changed, 3230 insertions, 1591 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 278e3ab1f169..ae55c1e04d10 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -1,6 +1,6 @@
1config SELECT_MEMORY_MODEL 1config SELECT_MEMORY_MODEL
2 def_bool y 2 def_bool y
3 depends on EXPERIMENTAL || ARCH_SELECT_MEMORY_MODEL 3 depends on ARCH_SELECT_MEMORY_MODEL
4 4
5choice 5choice
6 prompt "Memory model" 6 prompt "Memory model"
@@ -162,10 +162,16 @@ config MOVABLE_NODE
162 Say Y here if you want to hotplug a whole node. 162 Say Y here if you want to hotplug a whole node.
163 Say N here if you want kernel to use memory on all nodes evenly. 163 Say N here if you want kernel to use memory on all nodes evenly.
164 164
165#
166# Only be set on architectures that have completely implemented memory hotplug
167# feature. If you are not sure, don't touch it.
168#
169config HAVE_BOOTMEM_INFO_NODE
170 def_bool n
171
165# eventually, we can have this option just 'select SPARSEMEM' 172# eventually, we can have this option just 'select SPARSEMEM'
166config MEMORY_HOTPLUG 173config MEMORY_HOTPLUG
167 bool "Allow for memory hot-add" 174 bool "Allow for memory hot-add"
168 select MEMORY_ISOLATION
169 depends on SPARSEMEM || X86_64_ACPI_NUMA 175 depends on SPARSEMEM || X86_64_ACPI_NUMA
170 depends on HOTPLUG && ARCH_ENABLE_MEMORY_HOTPLUG 176 depends on HOTPLUG && ARCH_ENABLE_MEMORY_HOTPLUG
171 depends on (IA64 || X86 || PPC_BOOK3S_64 || SUPERH || S390) 177 depends on (IA64 || X86 || PPC_BOOK3S_64 || SUPERH || S390)
@@ -176,6 +182,8 @@ config MEMORY_HOTPLUG_SPARSE
176 182
177config MEMORY_HOTREMOVE 183config MEMORY_HOTREMOVE
178 bool "Allow for memory hot remove" 184 bool "Allow for memory hot remove"
185 select MEMORY_ISOLATION
186 select HAVE_BOOTMEM_INFO_NODE if X86_64
179 depends on MEMORY_HOTPLUG && ARCH_ENABLE_MEMORY_HOTREMOVE 187 depends on MEMORY_HOTPLUG && ARCH_ENABLE_MEMORY_HOTREMOVE
180 depends on MIGRATION 188 depends on MIGRATION
181 189
@@ -258,6 +266,19 @@ config BOUNCE
258 def_bool y 266 def_bool y
259 depends on BLOCK && MMU && (ZONE_DMA || HIGHMEM) 267 depends on BLOCK && MMU && (ZONE_DMA || HIGHMEM)
260 268
269# On the 'tile' arch, USB OHCI needs the bounce pool since tilegx will often
270# have more than 4GB of memory, but we don't currently use the IOTLB to present
271# a 32-bit address to OHCI. So we need to use a bounce pool instead.
272#
273# We also use the bounce pool to provide stable page writes for jbd. jbd
274# initiates buffer writeback without locking the page or setting PG_writeback,
275# and fixing that behavior (a second time; jbd2 doesn't have this problem) is
276# a major rework effort. Instead, use the bounce buffer to snapshot pages
277# (until jbd goes away). The only jbd user is ext3.
278config NEED_BOUNCE_POOL
279 bool
280 default y if (TILE && USB_OHCI_HCD) || (BLK_DEV_INTEGRITY && JBD)
281
261config NR_QUICK 282config NR_QUICK
262 int 283 int
263 depends on QUICKLIST 284 depends on QUICKLIST
@@ -266,7 +287,7 @@ config NR_QUICK
266 287
267config VIRT_TO_BUS 288config VIRT_TO_BUS
268 def_bool y 289 def_bool y
269 depends on !ARCH_NO_VIRT_TO_BUS 290 depends on HAVE_VIRT_TO_BUS
270 291
271config MMU_NOTIFIER 292config MMU_NOTIFIER
272 bool 293 bool
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index d3ca2b3ee176..41733c5dc820 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -221,12 +221,23 @@ static ssize_t max_ratio_store(struct device *dev,
221} 221}
222BDI_SHOW(max_ratio, bdi->max_ratio) 222BDI_SHOW(max_ratio, bdi->max_ratio)
223 223
224static ssize_t stable_pages_required_show(struct device *dev,
225 struct device_attribute *attr,
226 char *page)
227{
228 struct backing_dev_info *bdi = dev_get_drvdata(dev);
229
230 return snprintf(page, PAGE_SIZE-1, "%d\n",
231 bdi_cap_stable_pages_required(bdi) ? 1 : 0);
232}
233
224#define __ATTR_RW(attr) __ATTR(attr, 0644, attr##_show, attr##_store) 234#define __ATTR_RW(attr) __ATTR(attr, 0644, attr##_show, attr##_store)
225 235
226static struct device_attribute bdi_dev_attrs[] = { 236static struct device_attribute bdi_dev_attrs[] = {
227 __ATTR_RW(read_ahead_kb), 237 __ATTR_RW(read_ahead_kb),
228 __ATTR_RW(min_ratio), 238 __ATTR_RW(min_ratio),
229 __ATTR_RW(max_ratio), 239 __ATTR_RW(max_ratio),
240 __ATTR_RO(stable_pages_required),
230 __ATTR_NULL, 241 __ATTR_NULL,
231}; 242};
232 243
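
The new stable_pages_required attribute is exported read-only through the bdi sysfs class alongside read_ahead_kb, min_ratio and max_ratio. As a rough userspace sketch (not part of this patch), it can be read like any other sysfs file; the bdi name "8:0" below is only an example and depends on the device being queried.

#include <stdio.h>

/*
 * Read a bdi's stable_pages_required flag from sysfs.  The bdi name
 * "8:0" is illustrative; real names depend on the block device.
 */
int main(void)
{
	const char *path = "/sys/class/bdi/8:0/stable_pages_required";
	FILE *f = fopen(path, "r");
	int val;

	if (!f) {
		perror(path);
		return 1;
	}
	if (fscanf(f, "%d", &val) == 1)
		printf("%s = %d\n", path, val); /* 1: writers must wait for writeback */
	fclose(f);
	return 0;
}
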
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 1324cd74faec..2b0bcb019ec2 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -185,10 +185,23 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
185 185
186 while (start < end) { 186 while (start < end) {
187 unsigned long *map, idx, vec; 187 unsigned long *map, idx, vec;
188 unsigned shift;
188 189
189 map = bdata->node_bootmem_map; 190 map = bdata->node_bootmem_map;
190 idx = start - bdata->node_min_pfn; 191 idx = start - bdata->node_min_pfn;
192 shift = idx & (BITS_PER_LONG - 1);
193 /*
194 * vec holds at most BITS_PER_LONG map bits,
195 * bit 0 corresponds to start.
196 */
191 vec = ~map[idx / BITS_PER_LONG]; 197 vec = ~map[idx / BITS_PER_LONG];
198
199 if (shift) {
200 vec >>= shift;
201 if (end - start >= BITS_PER_LONG)
202 vec |= ~map[idx / BITS_PER_LONG + 1] <<
203 (BITS_PER_LONG - shift);
204 }
192 /* 205 /*
193 * If we have a properly aligned and fully unreserved 206 * If we have a properly aligned and fully unreserved
194 * BITS_PER_LONG block of pages in front of us, free 207 * BITS_PER_LONG block of pages in front of us, free
@@ -201,19 +214,18 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
201 count += BITS_PER_LONG; 214 count += BITS_PER_LONG;
202 start += BITS_PER_LONG; 215 start += BITS_PER_LONG;
203 } else { 216 } else {
204 unsigned long off = 0; 217 unsigned long cur = start;
205 218
206 vec >>= start & (BITS_PER_LONG - 1); 219 start = ALIGN(start + 1, BITS_PER_LONG);
207 while (vec) { 220 while (vec && cur != start) {
208 if (vec & 1) { 221 if (vec & 1) {
209 page = pfn_to_page(start + off); 222 page = pfn_to_page(cur);
210 __free_pages_bootmem(page, 0); 223 __free_pages_bootmem(page, 0);
211 count++; 224 count++;
212 } 225 }
213 vec >>= 1; 226 vec >>= 1;
214 off++; 227 ++cur;
215 } 228 }
216 start = ALIGN(start + 1, BITS_PER_LONG);
217 } 229 }
218 } 230 }
219 231
@@ -821,6 +833,14 @@ void * __init __alloc_bootmem_low(unsigned long size, unsigned long align,
821 return ___alloc_bootmem(size, align, goal, ARCH_LOW_ADDRESS_LIMIT); 833 return ___alloc_bootmem(size, align, goal, ARCH_LOW_ADDRESS_LIMIT);
822} 834}
823 835
836void * __init __alloc_bootmem_low_nopanic(unsigned long size,
837 unsigned long align,
838 unsigned long goal)
839{
840 return ___alloc_bootmem_nopanic(size, align, goal,
841 ARCH_LOW_ADDRESS_LIMIT);
842}
843
824/** 844/**
825 * __alloc_bootmem_low_node - allocate low boot memory from a specific node 845 * __alloc_bootmem_low_node - allocate low boot memory from a specific node
826 * @pgdat: node to allocate from 846 * @pgdat: node to allocate from
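
The free_all_bootmem_core() hunk above splices the free-page vector from two adjacent bitmap words whenever start is not BITS_PER_LONG-aligned, so that bit 0 of vec always corresponds to start. A minimal userspace sketch of the same bit arithmetic, with made-up map[] contents, is shown here for illustration only.

#include <stdio.h>

#define BITS_PER_LONG (8 * sizeof(unsigned long))

int main(void)
{
	unsigned long map[2] = { ~0x0fUL, 0x3UL };      /* set bit == reserved page */
	unsigned long start = 4, end = 70;              /* pfn range, start unaligned */
	unsigned long idx = start;                      /* offset into the bitmap */
	unsigned shift = idx & (BITS_PER_LONG - 1);
	unsigned long vec = ~map[idx / BITS_PER_LONG];  /* set bit == free page */

	if (shift) {
		vec >>= shift;
		if (end - start >= BITS_PER_LONG)
			vec |= ~map[idx / BITS_PER_LONG + 1] <<
				(BITS_PER_LONG - shift);
	}
	/* bit 0 of vec now corresponds to pfn 'start' */
	printf("free-page vector starting at pfn %lu: %#lx\n", start, vec);
	return 0;
}
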
diff --git a/mm/bounce.c b/mm/bounce.c
index 042086775561..5f8901768602 100644
--- a/mm/bounce.c
+++ b/mm/bounce.c
@@ -178,8 +178,45 @@ static void bounce_end_io_read_isa(struct bio *bio, int err)
178 __bounce_end_io_read(bio, isa_page_pool, err); 178 __bounce_end_io_read(bio, isa_page_pool, err);
179} 179}
180 180
181#ifdef CONFIG_NEED_BOUNCE_POOL
182static int must_snapshot_stable_pages(struct request_queue *q, struct bio *bio)
183{
184 struct page *page;
185 struct backing_dev_info *bdi;
186 struct address_space *mapping;
187 struct bio_vec *from;
188 int i;
189
190 if (bio_data_dir(bio) != WRITE)
191 return 0;
192
193 if (!bdi_cap_stable_pages_required(&q->backing_dev_info))
194 return 0;
195
196 /*
197 * Based on the first page that has a valid mapping, decide whether or
198 * not we have to employ bounce buffering to guarantee stable pages.
199 */
200 bio_for_each_segment(from, bio, i) {
201 page = from->bv_page;
202 mapping = page_mapping(page);
203 if (!mapping)
204 continue;
205 bdi = mapping->backing_dev_info;
206 return mapping->host->i_sb->s_flags & MS_SNAP_STABLE;
207 }
208
209 return 0;
210}
211#else
212static int must_snapshot_stable_pages(struct request_queue *q, struct bio *bio)
213{
214 return 0;
215}
216#endif /* CONFIG_NEED_BOUNCE_POOL */
217
181static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig, 218static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig,
182 mempool_t *pool) 219 mempool_t *pool, int force)
183{ 220{
184 struct page *page; 221 struct page *page;
185 struct bio *bio = NULL; 222 struct bio *bio = NULL;
@@ -192,7 +229,7 @@ static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig,
192 /* 229 /*
193 * is destination page below bounce pfn? 230 * is destination page below bounce pfn?
194 */ 231 */
195 if (page_to_pfn(page) <= queue_bounce_pfn(q)) 232 if (page_to_pfn(page) <= queue_bounce_pfn(q) && !force)
196 continue; 233 continue;
197 234
198 /* 235 /*
@@ -270,6 +307,7 @@ static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig,
270 307
271void blk_queue_bounce(struct request_queue *q, struct bio **bio_orig) 308void blk_queue_bounce(struct request_queue *q, struct bio **bio_orig)
272{ 309{
310 int must_bounce;
273 mempool_t *pool; 311 mempool_t *pool;
274 312
275 /* 313 /*
@@ -278,13 +316,15 @@ void blk_queue_bounce(struct request_queue *q, struct bio **bio_orig)
278 if (!bio_has_data(*bio_orig)) 316 if (!bio_has_data(*bio_orig))
279 return; 317 return;
280 318
319 must_bounce = must_snapshot_stable_pages(q, *bio_orig);
320
281 /* 321 /*
282 * for non-isa bounce case, just check if the bounce pfn is equal 322 * for non-isa bounce case, just check if the bounce pfn is equal
283 * to or bigger than the highest pfn in the system -- in that case, 323 * to or bigger than the highest pfn in the system -- in that case,
284 * don't waste time iterating over bio segments 324 * don't waste time iterating over bio segments
285 */ 325 */
286 if (!(q->bounce_gfp & GFP_DMA)) { 326 if (!(q->bounce_gfp & GFP_DMA)) {
287 if (queue_bounce_pfn(q) >= blk_max_pfn) 327 if (queue_bounce_pfn(q) >= blk_max_pfn && !must_bounce)
288 return; 328 return;
289 pool = page_pool; 329 pool = page_pool;
290 } else { 330 } else {
@@ -295,7 +335,7 @@ void blk_queue_bounce(struct request_queue *q, struct bio **bio_orig)
295 /* 335 /*
296 * slow path 336 * slow path
297 */ 337 */
298 __blk_queue_bounce(q, bio_orig, pool); 338 __blk_queue_bounce(q, bio_orig, pool, must_bounce);
299} 339}
300 340
301EXPORT_SYMBOL(blk_queue_bounce); 341EXPORT_SYMBOL(blk_queue_bounce);
diff --git a/mm/cleancache.c b/mm/cleancache.c
index 32e6f4136fa2..d76ba74be2d0 100644
--- a/mm/cleancache.c
+++ b/mm/cleancache.c
@@ -89,7 +89,7 @@ static int cleancache_get_key(struct inode *inode,
89 fhfn = sb->s_export_op->encode_fh; 89 fhfn = sb->s_export_op->encode_fh;
90 if (fhfn) { 90 if (fhfn) {
91 len = (*fhfn)(inode, &key->u.fh[0], &maxlen, NULL); 91 len = (*fhfn)(inode, &key->u.fh[0], &maxlen, NULL);
92 if (len <= 0 || len == 255) 92 if (len <= FILEID_ROOT || len == FILEID_INVALID)
93 return -1; 93 return -1;
94 if (maxlen > CLEANCACHE_KEY_MAX) 94 if (maxlen > CLEANCACHE_KEY_MAX)
95 return -1; 95 return -1;
diff --git a/mm/compaction.c b/mm/compaction.c
index 6b807e466497..05ccb4cc0bdb 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -15,6 +15,7 @@
15#include <linux/sysctl.h> 15#include <linux/sysctl.h>
16#include <linux/sysfs.h> 16#include <linux/sysfs.h>
17#include <linux/balloon_compaction.h> 17#include <linux/balloon_compaction.h>
18#include <linux/page-isolation.h>
18#include "internal.h" 19#include "internal.h"
19 20
20#ifdef CONFIG_COMPACTION 21#ifdef CONFIG_COMPACTION
@@ -85,7 +86,7 @@ static inline bool isolation_suitable(struct compact_control *cc,
85static void __reset_isolation_suitable(struct zone *zone) 86static void __reset_isolation_suitable(struct zone *zone)
86{ 87{
87 unsigned long start_pfn = zone->zone_start_pfn; 88 unsigned long start_pfn = zone->zone_start_pfn;
88 unsigned long end_pfn = zone->zone_start_pfn + zone->spanned_pages; 89 unsigned long end_pfn = zone_end_pfn(zone);
89 unsigned long pfn; 90 unsigned long pfn;
90 91
91 zone->compact_cached_migrate_pfn = start_pfn; 92 zone->compact_cached_migrate_pfn = start_pfn;
@@ -215,7 +216,10 @@ static bool suitable_migration_target(struct page *page)
215 int migratetype = get_pageblock_migratetype(page); 216 int migratetype = get_pageblock_migratetype(page);
216 217
217 /* Don't interfere with memory hot-remove or the min_free_kbytes blocks */ 218 /* Don't interfere with memory hot-remove or the min_free_kbytes blocks */
218 if (migratetype == MIGRATE_ISOLATE || migratetype == MIGRATE_RESERVE) 219 if (migratetype == MIGRATE_RESERVE)
220 return false;
221
222 if (is_migrate_isolate(migratetype))
219 return false; 223 return false;
220 224
221 /* If the page is a large free page, then allow migration */ 225 /* If the page is a large free page, then allow migration */
@@ -611,8 +615,7 @@ check_compact_cluster:
611 continue; 615 continue;
612 616
613next_pageblock: 617next_pageblock:
614 low_pfn += pageblock_nr_pages; 618 low_pfn = ALIGN(low_pfn + 1, pageblock_nr_pages) - 1;
615 low_pfn = ALIGN(low_pfn, pageblock_nr_pages) - 1;
616 last_pageblock_nr = pageblock_nr; 619 last_pageblock_nr = pageblock_nr;
617 } 620 }
618 621
@@ -644,7 +647,7 @@ static void isolate_freepages(struct zone *zone,
644 struct compact_control *cc) 647 struct compact_control *cc)
645{ 648{
646 struct page *page; 649 struct page *page;
647 unsigned long high_pfn, low_pfn, pfn, zone_end_pfn, end_pfn; 650 unsigned long high_pfn, low_pfn, pfn, z_end_pfn, end_pfn;
648 int nr_freepages = cc->nr_freepages; 651 int nr_freepages = cc->nr_freepages;
649 struct list_head *freelist = &cc->freepages; 652 struct list_head *freelist = &cc->freepages;
650 653
@@ -663,7 +666,7 @@ static void isolate_freepages(struct zone *zone,
663 */ 666 */
664 high_pfn = min(low_pfn, pfn); 667 high_pfn = min(low_pfn, pfn);
665 668
666 zone_end_pfn = zone->zone_start_pfn + zone->spanned_pages; 669 z_end_pfn = zone_end_pfn(zone);
667 670
668 /* 671 /*
669 * Isolate free pages until enough are available to migrate the 672 * Isolate free pages until enough are available to migrate the
@@ -706,7 +709,7 @@ static void isolate_freepages(struct zone *zone,
706 * only scans within a pageblock 709 * only scans within a pageblock
707 */ 710 */
708 end_pfn = ALIGN(pfn + 1, pageblock_nr_pages); 711 end_pfn = ALIGN(pfn + 1, pageblock_nr_pages);
709 end_pfn = min(end_pfn, zone_end_pfn); 712 end_pfn = min(end_pfn, z_end_pfn);
710 isolated = isolate_freepages_block(cc, pfn, end_pfn, 713 isolated = isolate_freepages_block(cc, pfn, end_pfn,
711 freelist, false); 714 freelist, false);
712 nr_freepages += isolated; 715 nr_freepages += isolated;
@@ -795,7 +798,7 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
795 low_pfn = max(cc->migrate_pfn, zone->zone_start_pfn); 798 low_pfn = max(cc->migrate_pfn, zone->zone_start_pfn);
796 799
797 /* Only scan within a pageblock boundary */ 800 /* Only scan within a pageblock boundary */
798 end_pfn = ALIGN(low_pfn + pageblock_nr_pages, pageblock_nr_pages); 801 end_pfn = ALIGN(low_pfn + 1, pageblock_nr_pages);
799 802
800 /* Do not cross the free scanner or scan within a memory hole */ 803 /* Do not cross the free scanner or scan within a memory hole */
801 if (end_pfn > cc->free_pfn || !pfn_valid(low_pfn)) { 804 if (end_pfn > cc->free_pfn || !pfn_valid(low_pfn)) {
@@ -816,6 +819,7 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
816static int compact_finished(struct zone *zone, 819static int compact_finished(struct zone *zone,
817 struct compact_control *cc) 820 struct compact_control *cc)
818{ 821{
822 unsigned int order;
819 unsigned long watermark; 823 unsigned long watermark;
820 824
821 if (fatal_signal_pending(current)) 825 if (fatal_signal_pending(current))
@@ -850,22 +854,16 @@ static int compact_finished(struct zone *zone,
850 return COMPACT_CONTINUE; 854 return COMPACT_CONTINUE;
851 855
852 /* Direct compactor: Is a suitable page free? */ 856 /* Direct compactor: Is a suitable page free? */
853 if (cc->page) { 857 for (order = cc->order; order < MAX_ORDER; order++) {
854 /* Was a suitable page captured? */ 858 struct free_area *area = &zone->free_area[order];
855 if (*cc->page) 859
860 /* Job done if page is free of the right migratetype */
861 if (!list_empty(&area->free_list[cc->migratetype]))
862 return COMPACT_PARTIAL;
863
864 /* Job done if allocation would set block type */
865 if (cc->order >= pageblock_order && area->nr_free)
856 return COMPACT_PARTIAL; 866 return COMPACT_PARTIAL;
857 } else {
858 unsigned int order;
859 for (order = cc->order; order < MAX_ORDER; order++) {
860 struct free_area *area = &zone->free_area[cc->order];
861 /* Job done if page is free of the right migratetype */
862 if (!list_empty(&area->free_list[cc->migratetype]))
863 return COMPACT_PARTIAL;
864
865 /* Job done if allocation would set block type */
866 if (cc->order >= pageblock_order && area->nr_free)
867 return COMPACT_PARTIAL;
868 }
869 } 867 }
870 868
871 return COMPACT_CONTINUE; 869 return COMPACT_CONTINUE;
@@ -921,65 +919,11 @@ unsigned long compaction_suitable(struct zone *zone, int order)
921 return COMPACT_CONTINUE; 919 return COMPACT_CONTINUE;
922} 920}
923 921
924static void compact_capture_page(struct compact_control *cc)
925{
926 unsigned long flags;
927 int mtype, mtype_low, mtype_high;
928
929 if (!cc->page || *cc->page)
930 return;
931
932 /*
933 * For MIGRATE_MOVABLE allocations we capture a suitable page ASAP
934 * regardless of the migratetype of the freelist is is captured from.
935 * This is fine because the order for a high-order MIGRATE_MOVABLE
936 * allocation is typically at least a pageblock size and overall
937 * fragmentation is not impaired. Other allocation types must
938 * capture pages from their own migratelist because otherwise they
939 * could pollute other pageblocks like MIGRATE_MOVABLE with
940 * difficult to move pages and making fragmentation worse overall.
941 */
942 if (cc->migratetype == MIGRATE_MOVABLE) {
943 mtype_low = 0;
944 mtype_high = MIGRATE_PCPTYPES;
945 } else {
946 mtype_low = cc->migratetype;
947 mtype_high = cc->migratetype + 1;
948 }
949
950 /* Speculatively examine the free lists without zone lock */
951 for (mtype = mtype_low; mtype < mtype_high; mtype++) {
952 int order;
953 for (order = cc->order; order < MAX_ORDER; order++) {
954 struct page *page;
955 struct free_area *area;
956 area = &(cc->zone->free_area[order]);
957 if (list_empty(&area->free_list[mtype]))
958 continue;
959
960 /* Take the lock and attempt capture of the page */
961 if (!compact_trylock_irqsave(&cc->zone->lock, &flags, cc))
962 return;
963 if (!list_empty(&area->free_list[mtype])) {
964 page = list_entry(area->free_list[mtype].next,
965 struct page, lru);
966 if (capture_free_page(page, cc->order, mtype)) {
967 spin_unlock_irqrestore(&cc->zone->lock,
968 flags);
969 *cc->page = page;
970 return;
971 }
972 }
973 spin_unlock_irqrestore(&cc->zone->lock, flags);
974 }
975 }
976}
977
978static int compact_zone(struct zone *zone, struct compact_control *cc) 922static int compact_zone(struct zone *zone, struct compact_control *cc)
979{ 923{
980 int ret; 924 int ret;
981 unsigned long start_pfn = zone->zone_start_pfn; 925 unsigned long start_pfn = zone->zone_start_pfn;
982 unsigned long end_pfn = zone->zone_start_pfn + zone->spanned_pages; 926 unsigned long end_pfn = zone_end_pfn(zone);
983 927
984 ret = compaction_suitable(zone, cc->order); 928 ret = compaction_suitable(zone, cc->order);
985 switch (ret) { 929 switch (ret) {
@@ -1036,7 +980,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
1036 980
1037 nr_migrate = cc->nr_migratepages; 981 nr_migrate = cc->nr_migratepages;
1038 err = migrate_pages(&cc->migratepages, compaction_alloc, 982 err = migrate_pages(&cc->migratepages, compaction_alloc,
1039 (unsigned long)cc, false, 983 (unsigned long)cc,
1040 cc->sync ? MIGRATE_SYNC_LIGHT : MIGRATE_ASYNC, 984 cc->sync ? MIGRATE_SYNC_LIGHT : MIGRATE_ASYNC,
1041 MR_COMPACTION); 985 MR_COMPACTION);
1042 update_nr_listpages(cc); 986 update_nr_listpages(cc);
@@ -1054,9 +998,6 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
1054 goto out; 998 goto out;
1055 } 999 }
1056 } 1000 }
1057
1058 /* Capture a page now if it is a suitable size */
1059 compact_capture_page(cc);
1060 } 1001 }
1061 1002
1062out: 1003out:
@@ -1069,8 +1010,7 @@ out:
1069 1010
1070static unsigned long compact_zone_order(struct zone *zone, 1011static unsigned long compact_zone_order(struct zone *zone,
1071 int order, gfp_t gfp_mask, 1012 int order, gfp_t gfp_mask,
1072 bool sync, bool *contended, 1013 bool sync, bool *contended)
1073 struct page **page)
1074{ 1014{
1075 unsigned long ret; 1015 unsigned long ret;
1076 struct compact_control cc = { 1016 struct compact_control cc = {
@@ -1080,7 +1020,6 @@ static unsigned long compact_zone_order(struct zone *zone,
1080 .migratetype = allocflags_to_migratetype(gfp_mask), 1020 .migratetype = allocflags_to_migratetype(gfp_mask),
1081 .zone = zone, 1021 .zone = zone,
1082 .sync = sync, 1022 .sync = sync,
1083 .page = page,
1084 }; 1023 };
1085 INIT_LIST_HEAD(&cc.freepages); 1024 INIT_LIST_HEAD(&cc.freepages);
1086 INIT_LIST_HEAD(&cc.migratepages); 1025 INIT_LIST_HEAD(&cc.migratepages);
@@ -1110,7 +1049,7 @@ int sysctl_extfrag_threshold = 500;
1110 */ 1049 */
1111unsigned long try_to_compact_pages(struct zonelist *zonelist, 1050unsigned long try_to_compact_pages(struct zonelist *zonelist,
1112 int order, gfp_t gfp_mask, nodemask_t *nodemask, 1051 int order, gfp_t gfp_mask, nodemask_t *nodemask,
1113 bool sync, bool *contended, struct page **page) 1052 bool sync, bool *contended)
1114{ 1053{
1115 enum zone_type high_zoneidx = gfp_zone(gfp_mask); 1054 enum zone_type high_zoneidx = gfp_zone(gfp_mask);
1116 int may_enter_fs = gfp_mask & __GFP_FS; 1055 int may_enter_fs = gfp_mask & __GFP_FS;
@@ -1136,7 +1075,7 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist,
1136 int status; 1075 int status;
1137 1076
1138 status = compact_zone_order(zone, order, gfp_mask, sync, 1077 status = compact_zone_order(zone, order, gfp_mask, sync,
1139 contended, page); 1078 contended);
1140 rc = max(status, rc); 1079 rc = max(status, rc);
1141 1080
1142 /* If a normal allocation would succeed, stop compacting */ 1081 /* If a normal allocation would succeed, stop compacting */
@@ -1150,7 +1089,7 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist,
1150 1089
1151 1090
1152/* Compact all zones within a node */ 1091/* Compact all zones within a node */
1153static int __compact_pgdat(pg_data_t *pgdat, struct compact_control *cc) 1092static void __compact_pgdat(pg_data_t *pgdat, struct compact_control *cc)
1154{ 1093{
1155 int zoneid; 1094 int zoneid;
1156 struct zone *zone; 1095 struct zone *zone;
@@ -1183,34 +1122,30 @@ static int __compact_pgdat(pg_data_t *pgdat, struct compact_control *cc)
1183 VM_BUG_ON(!list_empty(&cc->freepages)); 1122 VM_BUG_ON(!list_empty(&cc->freepages));
1184 VM_BUG_ON(!list_empty(&cc->migratepages)); 1123 VM_BUG_ON(!list_empty(&cc->migratepages));
1185 } 1124 }
1186
1187 return 0;
1188} 1125}
1189 1126
1190int compact_pgdat(pg_data_t *pgdat, int order) 1127void compact_pgdat(pg_data_t *pgdat, int order)
1191{ 1128{
1192 struct compact_control cc = { 1129 struct compact_control cc = {
1193 .order = order, 1130 .order = order,
1194 .sync = false, 1131 .sync = false,
1195 .page = NULL,
1196 }; 1132 };
1197 1133
1198 return __compact_pgdat(pgdat, &cc); 1134 __compact_pgdat(pgdat, &cc);
1199} 1135}
1200 1136
1201static int compact_node(int nid) 1137static void compact_node(int nid)
1202{ 1138{
1203 struct compact_control cc = { 1139 struct compact_control cc = {
1204 .order = -1, 1140 .order = -1,
1205 .sync = true, 1141 .sync = true,
1206 .page = NULL,
1207 }; 1142 };
1208 1143
1209 return __compact_pgdat(NODE_DATA(nid), &cc); 1144 __compact_pgdat(NODE_DATA(nid), &cc);
1210} 1145}
1211 1146
1212/* Compact all nodes in the system */ 1147/* Compact all nodes in the system */
1213static int compact_nodes(void) 1148static void compact_nodes(void)
1214{ 1149{
1215 int nid; 1150 int nid;
1216 1151
@@ -1219,8 +1154,6 @@ static int compact_nodes(void)
1219 1154
1220 for_each_online_node(nid) 1155 for_each_online_node(nid)
1221 compact_node(nid); 1156 compact_node(nid);
1222
1223 return COMPACT_COMPLETE;
1224} 1157}
1225 1158
1226/* The written value is actually unused, all memory is compacted */ 1159/* The written value is actually unused, all memory is compacted */
@@ -1231,7 +1164,7 @@ int sysctl_compaction_handler(struct ctl_table *table, int write,
1231 void __user *buffer, size_t *length, loff_t *ppos) 1164 void __user *buffer, size_t *length, loff_t *ppos)
1232{ 1165{
1233 if (write) 1166 if (write)
1234 return compact_nodes(); 1167 compact_nodes();
1235 1168
1236 return 0; 1169 return 0;
1237} 1170}
diff --git a/mm/fadvise.c b/mm/fadvise.c
index a47f0f50c89f..7e092689a12a 100644
--- a/mm/fadvise.c
+++ b/mm/fadvise.c
@@ -17,6 +17,7 @@
17#include <linux/fadvise.h> 17#include <linux/fadvise.h>
18#include <linux/writeback.h> 18#include <linux/writeback.h>
19#include <linux/syscalls.h> 19#include <linux/syscalls.h>
20#include <linux/swap.h>
20 21
21#include <asm/unistd.h> 22#include <asm/unistd.h>
22 23
@@ -38,7 +39,7 @@ SYSCALL_DEFINE(fadvise64_64)(int fd, loff_t offset, loff_t len, int advice)
38 if (!f.file) 39 if (!f.file)
39 return -EBADF; 40 return -EBADF;
40 41
41 if (S_ISFIFO(f.file->f_path.dentry->d_inode->i_mode)) { 42 if (S_ISFIFO(file_inode(f.file)->i_mode)) {
42 ret = -ESPIPE; 43 ret = -ESPIPE;
43 goto out; 44 goto out;
44 } 45 }
@@ -120,9 +121,22 @@ SYSCALL_DEFINE(fadvise64_64)(int fd, loff_t offset, loff_t len, int advice)
120 start_index = (offset+(PAGE_CACHE_SIZE-1)) >> PAGE_CACHE_SHIFT; 121 start_index = (offset+(PAGE_CACHE_SIZE-1)) >> PAGE_CACHE_SHIFT;
121 end_index = (endbyte >> PAGE_CACHE_SHIFT); 122 end_index = (endbyte >> PAGE_CACHE_SHIFT);
122 123
123 if (end_index >= start_index) 124 if (end_index >= start_index) {
124 invalidate_mapping_pages(mapping, start_index, 125 unsigned long count = invalidate_mapping_pages(mapping,
126 start_index, end_index);
127
128 /*
129 * If fewer pages were invalidated than expected then
130 * it is possible that some of the pages were on
131 * a per-cpu pagevec for a remote CPU. Drain all
132 * pagevecs and try again.
133 */
134 if (count < (end_index - start_index + 1)) {
135 lru_add_drain_all();
136 invalidate_mapping_pages(mapping, start_index,
125 end_index); 137 end_index);
138 }
139 }
126 break; 140 break;
127 default: 141 default:
128 ret = -EINVAL; 142 ret = -EINVAL;
diff --git a/mm/filemap.c b/mm/filemap.c
index 83efee76a5c0..e1979fdca805 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1711,7 +1711,7 @@ EXPORT_SYMBOL(filemap_fault);
1711int filemap_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) 1711int filemap_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
1712{ 1712{
1713 struct page *page = vmf->page; 1713 struct page *page = vmf->page;
1714 struct inode *inode = vma->vm_file->f_path.dentry->d_inode; 1714 struct inode *inode = file_inode(vma->vm_file);
1715 int ret = VM_FAULT_LOCKED; 1715 int ret = VM_FAULT_LOCKED;
1716 1716
1717 sb_start_pagefault(inode->i_sb); 1717 sb_start_pagefault(inode->i_sb);
@@ -1728,6 +1728,7 @@ int filemap_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
1728 * see the dirty page and writeprotect it again. 1728 * see the dirty page and writeprotect it again.
1729 */ 1729 */
1730 set_page_dirty(page); 1730 set_page_dirty(page);
1731 wait_for_stable_page(page);
1731out: 1732out:
1732 sb_end_pagefault(inode->i_sb); 1733 sb_end_pagefault(inode->i_sb);
1733 return ret; 1734 return ret;
@@ -2056,7 +2057,7 @@ EXPORT_SYMBOL(iov_iter_fault_in_readable);
2056/* 2057/*
2057 * Return the count of just the current iov_iter segment. 2058 * Return the count of just the current iov_iter segment.
2058 */ 2059 */
2059size_t iov_iter_single_seg_count(struct iov_iter *i) 2060size_t iov_iter_single_seg_count(const struct iov_iter *i)
2060{ 2061{
2061 const struct iovec *iov = i->iov; 2062 const struct iovec *iov = i->iov;
2062 if (i->nr_segs == 1) 2063 if (i->nr_segs == 1)
@@ -2274,7 +2275,7 @@ repeat:
2274 return NULL; 2275 return NULL;
2275 } 2276 }
2276found: 2277found:
2277 wait_on_page_writeback(page); 2278 wait_for_stable_page(page);
2278 return page; 2279 return page;
2279} 2280}
2280EXPORT_SYMBOL(grab_cache_page_write_begin); 2281EXPORT_SYMBOL(grab_cache_page_write_begin);
diff --git a/mm/fremap.c b/mm/fremap.c
index a0aaf0e56800..0cd4c11488ed 100644
--- a/mm/fremap.c
+++ b/mm/fremap.c
@@ -129,6 +129,7 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
129 struct vm_area_struct *vma; 129 struct vm_area_struct *vma;
130 int err = -EINVAL; 130 int err = -EINVAL;
131 int has_write_lock = 0; 131 int has_write_lock = 0;
132 vm_flags_t vm_flags;
132 133
133 if (prot) 134 if (prot)
134 return err; 135 return err;
@@ -160,15 +161,11 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
160 /* 161 /*
161 * Make sure the vma is shared, that it supports prefaulting, 162 * Make sure the vma is shared, that it supports prefaulting,
162 * and that the remapped range is valid and fully within 163 * and that the remapped range is valid and fully within
163 * the single existing vma. vm_private_data is used as a 164 * the single existing vma.
164 * swapout cursor in a VM_NONLINEAR vma.
165 */ 165 */
166 if (!vma || !(vma->vm_flags & VM_SHARED)) 166 if (!vma || !(vma->vm_flags & VM_SHARED))
167 goto out; 167 goto out;
168 168
169 if (vma->vm_private_data && !(vma->vm_flags & VM_NONLINEAR))
170 goto out;
171
172 if (!vma->vm_ops || !vma->vm_ops->remap_pages) 169 if (!vma->vm_ops || !vma->vm_ops->remap_pages)
173 goto out; 170 goto out;
174 171
@@ -177,6 +174,13 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
177 174
178 /* Must set VM_NONLINEAR before any pages are populated. */ 175 /* Must set VM_NONLINEAR before any pages are populated. */
179 if (!(vma->vm_flags & VM_NONLINEAR)) { 176 if (!(vma->vm_flags & VM_NONLINEAR)) {
177 /*
178 * vm_private_data is used as a swapout cursor
179 * in a VM_NONLINEAR vma.
180 */
181 if (vma->vm_private_data)
182 goto out;
183
180 /* Don't need a nonlinear mapping, exit success */ 184 /* Don't need a nonlinear mapping, exit success */
181 if (pgoff == linear_page_index(vma, start)) { 185 if (pgoff == linear_page_index(vma, start)) {
182 err = 0; 186 err = 0;
@@ -184,6 +188,7 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
184 } 188 }
185 189
186 if (!has_write_lock) { 190 if (!has_write_lock) {
191get_write_lock:
187 up_read(&mm->mmap_sem); 192 up_read(&mm->mmap_sem);
188 down_write(&mm->mmap_sem); 193 down_write(&mm->mmap_sem);
189 has_write_lock = 1; 194 has_write_lock = 1;
@@ -199,9 +204,10 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
199 unsigned long addr; 204 unsigned long addr;
200 struct file *file = get_file(vma->vm_file); 205 struct file *file = get_file(vma->vm_file);
201 206
202 flags &= MAP_NONBLOCK; 207 vm_flags = vma->vm_flags;
203 addr = mmap_region(file, start, size, 208 if (!(flags & MAP_NONBLOCK))
204 flags, vma->vm_flags, pgoff); 209 vm_flags |= VM_POPULATE;
210 addr = mmap_region(file, start, size, vm_flags, pgoff);
205 fput(file); 211 fput(file);
206 if (IS_ERR_VALUE(addr)) { 212 if (IS_ERR_VALUE(addr)) {
207 err = addr; 213 err = addr;
@@ -220,32 +226,26 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
220 mutex_unlock(&mapping->i_mmap_mutex); 226 mutex_unlock(&mapping->i_mmap_mutex);
221 } 227 }
222 228
229 if (!(flags & MAP_NONBLOCK) && !(vma->vm_flags & VM_POPULATE)) {
230 if (!has_write_lock)
231 goto get_write_lock;
232 vma->vm_flags |= VM_POPULATE;
233 }
234
223 if (vma->vm_flags & VM_LOCKED) { 235 if (vma->vm_flags & VM_LOCKED) {
224 /* 236 /*
225 * drop PG_Mlocked flag for over-mapped range 237 * drop PG_Mlocked flag for over-mapped range
226 */ 238 */
227 vm_flags_t saved_flags = vma->vm_flags; 239 if (!has_write_lock)
240 goto get_write_lock;
241 vm_flags = vma->vm_flags;
228 munlock_vma_pages_range(vma, start, start + size); 242 munlock_vma_pages_range(vma, start, start + size);
229 vma->vm_flags = saved_flags; 243 vma->vm_flags = vm_flags;
230 } 244 }
231 245
232 mmu_notifier_invalidate_range_start(mm, start, start + size); 246 mmu_notifier_invalidate_range_start(mm, start, start + size);
233 err = vma->vm_ops->remap_pages(vma, start, size, pgoff); 247 err = vma->vm_ops->remap_pages(vma, start, size, pgoff);
234 mmu_notifier_invalidate_range_end(mm, start, start + size); 248 mmu_notifier_invalidate_range_end(mm, start, start + size);
235 if (!err && !(flags & MAP_NONBLOCK)) {
236 if (vma->vm_flags & VM_LOCKED) {
237 /*
238 * might be mapping previously unmapped range of file
239 */
240 mlock_vma_pages_range(vma, start, start + size);
241 } else {
242 if (unlikely(has_write_lock)) {
243 downgrade_write(&mm->mmap_sem);
244 has_write_lock = 0;
245 }
246 make_pages_present(start, start+size);
247 }
248 }
249 249
250 /* 250 /*
251 * We can't clear VM_NONLINEAR because we'd have to do 251 * We can't clear VM_NONLINEAR because we'd have to do
@@ -254,10 +254,13 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
254 */ 254 */
255 255
256out: 256out:
257 vm_flags = vma->vm_flags;
257 if (likely(!has_write_lock)) 258 if (likely(!has_write_lock))
258 up_read(&mm->mmap_sem); 259 up_read(&mm->mmap_sem);
259 else 260 else
260 up_write(&mm->mmap_sem); 261 up_write(&mm->mmap_sem);
262 if (!err && ((vm_flags & VM_LOCKED) || !(flags & MAP_NONBLOCK)))
263 mm_populate(start, size);
261 264
262 return err; 265 return err;
263} 266}
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 9e894edc7811..e2f7f5aaaafb 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -20,6 +20,7 @@
20#include <linux/mman.h> 20#include <linux/mman.h>
21#include <linux/pagemap.h> 21#include <linux/pagemap.h>
22#include <linux/migrate.h> 22#include <linux/migrate.h>
23#include <linux/hashtable.h>
23 24
24#include <asm/tlb.h> 25#include <asm/tlb.h>
25#include <asm/pgalloc.h> 26#include <asm/pgalloc.h>
@@ -62,12 +63,11 @@ static DECLARE_WAIT_QUEUE_HEAD(khugepaged_wait);
62static unsigned int khugepaged_max_ptes_none __read_mostly = HPAGE_PMD_NR-1; 63static unsigned int khugepaged_max_ptes_none __read_mostly = HPAGE_PMD_NR-1;
63 64
64static int khugepaged(void *none); 65static int khugepaged(void *none);
65static int mm_slots_hash_init(void);
66static int khugepaged_slab_init(void); 66static int khugepaged_slab_init(void);
67static void khugepaged_slab_free(void);
68 67
69#define MM_SLOTS_HASH_HEADS 1024 68#define MM_SLOTS_HASH_BITS 10
70static struct hlist_head *mm_slots_hash __read_mostly; 69static __read_mostly DEFINE_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS);
70
71static struct kmem_cache *mm_slot_cache __read_mostly; 71static struct kmem_cache *mm_slot_cache __read_mostly;
72 72
73/** 73/**
@@ -105,7 +105,6 @@ static int set_recommended_min_free_kbytes(void)
105 struct zone *zone; 105 struct zone *zone;
106 int nr_zones = 0; 106 int nr_zones = 0;
107 unsigned long recommended_min; 107 unsigned long recommended_min;
108 extern int min_free_kbytes;
109 108
110 if (!khugepaged_enabled()) 109 if (!khugepaged_enabled())
111 return 0; 110 return 0;
@@ -634,12 +633,6 @@ static int __init hugepage_init(void)
634 if (err) 633 if (err)
635 goto out; 634 goto out;
636 635
637 err = mm_slots_hash_init();
638 if (err) {
639 khugepaged_slab_free();
640 goto out;
641 }
642
643 register_shrinker(&huge_zero_page_shrinker); 636 register_shrinker(&huge_zero_page_shrinker);
644 637
645 /* 638 /*
@@ -1257,6 +1250,10 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
1257 if (flags & FOLL_WRITE && !pmd_write(*pmd)) 1250 if (flags & FOLL_WRITE && !pmd_write(*pmd))
1258 goto out; 1251 goto out;
1259 1252
1253 /* Avoid dumping huge zero page */
1254 if ((flags & FOLL_DUMP) && is_huge_zero_pmd(*pmd))
1255 return ERR_PTR(-EFAULT);
1256
1260 page = pmd_page(*pmd); 1257 page = pmd_page(*pmd);
1261 VM_BUG_ON(!PageHead(page)); 1258 VM_BUG_ON(!PageHead(page));
1262 if (flags & FOLL_TOUCH) { 1259 if (flags & FOLL_TOUCH) {
@@ -1298,7 +1295,6 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
1298 int target_nid; 1295 int target_nid;
1299 int current_nid = -1; 1296 int current_nid = -1;
1300 bool migrated; 1297 bool migrated;
1301 bool page_locked = false;
1302 1298
1303 spin_lock(&mm->page_table_lock); 1299 spin_lock(&mm->page_table_lock);
1304 if (unlikely(!pmd_same(pmd, *pmdp))) 1300 if (unlikely(!pmd_same(pmd, *pmdp)))
@@ -1320,7 +1316,6 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
1320 /* Acquire the page lock to serialise THP migrations */ 1316 /* Acquire the page lock to serialise THP migrations */
1321 spin_unlock(&mm->page_table_lock); 1317 spin_unlock(&mm->page_table_lock);
1322 lock_page(page); 1318 lock_page(page);
1323 page_locked = true;
1324 1319
1325 /* Confirm the PTE did not while locked */ 1320 /* Confirm the PTE did not while locked */
1326 spin_lock(&mm->page_table_lock); 1321 spin_lock(&mm->page_table_lock);
@@ -1333,34 +1328,26 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
1333 1328
1334 /* Migrate the THP to the requested node */ 1329 /* Migrate the THP to the requested node */
1335 migrated = migrate_misplaced_transhuge_page(mm, vma, 1330 migrated = migrate_misplaced_transhuge_page(mm, vma,
1336 pmdp, pmd, addr, 1331 pmdp, pmd, addr, page, target_nid);
1337 page, target_nid); 1332 if (!migrated)
1338 if (migrated) 1333 goto check_same;
1339 current_nid = target_nid;
1340 else {
1341 spin_lock(&mm->page_table_lock);
1342 if (unlikely(!pmd_same(pmd, *pmdp))) {
1343 unlock_page(page);
1344 goto out_unlock;
1345 }
1346 goto clear_pmdnuma;
1347 }
1348 1334
1349 task_numa_fault(current_nid, HPAGE_PMD_NR, migrated); 1335 task_numa_fault(target_nid, HPAGE_PMD_NR, true);
1350 return 0; 1336 return 0;
1351 1337
1338check_same:
1339 spin_lock(&mm->page_table_lock);
1340 if (unlikely(!pmd_same(pmd, *pmdp)))
1341 goto out_unlock;
1352clear_pmdnuma: 1342clear_pmdnuma:
1353 pmd = pmd_mknonnuma(pmd); 1343 pmd = pmd_mknonnuma(pmd);
1354 set_pmd_at(mm, haddr, pmdp, pmd); 1344 set_pmd_at(mm, haddr, pmdp, pmd);
1355 VM_BUG_ON(pmd_numa(*pmdp)); 1345 VM_BUG_ON(pmd_numa(*pmdp));
1356 update_mmu_cache_pmd(vma, addr, pmdp); 1346 update_mmu_cache_pmd(vma, addr, pmdp);
1357 if (page_locked)
1358 unlock_page(page);
1359
1360out_unlock: 1347out_unlock:
1361 spin_unlock(&mm->page_table_lock); 1348 spin_unlock(&mm->page_table_lock);
1362 if (current_nid != -1) 1349 if (current_nid != -1)
1363 task_numa_fault(current_nid, HPAGE_PMD_NR, migrated); 1350 task_numa_fault(current_nid, HPAGE_PMD_NR, false);
1364 return 0; 1351 return 0;
1365} 1352}
1366 1353
@@ -1652,7 +1639,7 @@ static void __split_huge_page_refcount(struct page *page)
1652 page_tail->mapping = page->mapping; 1639 page_tail->mapping = page->mapping;
1653 1640
1654 page_tail->index = page->index + i; 1641 page_tail->index = page->index + i;
1655 page_xchg_last_nid(page_tail, page_last_nid(page)); 1642 page_nid_xchg_last(page_tail, page_nid_last(page));
1656 1643
1657 BUG_ON(!PageAnon(page_tail)); 1644 BUG_ON(!PageAnon(page_tail));
1658 BUG_ON(!PageUptodate(page_tail)); 1645 BUG_ON(!PageUptodate(page_tail));
@@ -1819,9 +1806,19 @@ int split_huge_page(struct page *page)
1819 1806
1820 BUG_ON(is_huge_zero_pfn(page_to_pfn(page))); 1807 BUG_ON(is_huge_zero_pfn(page_to_pfn(page)));
1821 BUG_ON(!PageAnon(page)); 1808 BUG_ON(!PageAnon(page));
1822 anon_vma = page_lock_anon_vma_read(page); 1809
1810 /*
1811 * The caller does not necessarily hold an mmap_sem that would prevent
1812 * the anon_vma disappearing so we first we take a reference to it
1813 * and then lock the anon_vma for write. This is similar to
1814 * page_lock_anon_vma_read except the write lock is taken to serialise
1815 * against parallel split or collapse operations.
1816 */
1817 anon_vma = page_get_anon_vma(page);
1823 if (!anon_vma) 1818 if (!anon_vma)
1824 goto out; 1819 goto out;
1820 anon_vma_lock_write(anon_vma);
1821
1825 ret = 0; 1822 ret = 0;
1826 if (!PageCompound(page)) 1823 if (!PageCompound(page))
1827 goto out_unlock; 1824 goto out_unlock;
@@ -1832,7 +1829,8 @@ int split_huge_page(struct page *page)
1832 1829
1833 BUG_ON(PageCompound(page)); 1830 BUG_ON(PageCompound(page));
1834out_unlock: 1831out_unlock:
1835 page_unlock_anon_vma_read(anon_vma); 1832 anon_vma_unlock_write(anon_vma);
1833 put_anon_vma(anon_vma);
1836out: 1834out:
1837 return ret; 1835 return ret;
1838} 1836}
@@ -1893,12 +1891,6 @@ static int __init khugepaged_slab_init(void)
1893 return 0; 1891 return 0;
1894} 1892}
1895 1893
1896static void __init khugepaged_slab_free(void)
1897{
1898 kmem_cache_destroy(mm_slot_cache);
1899 mm_slot_cache = NULL;
1900}
1901
1902static inline struct mm_slot *alloc_mm_slot(void) 1894static inline struct mm_slot *alloc_mm_slot(void)
1903{ 1895{
1904 if (!mm_slot_cache) /* initialization failed */ 1896 if (!mm_slot_cache) /* initialization failed */
@@ -1911,47 +1903,22 @@ static inline void free_mm_slot(struct mm_slot *mm_slot)
1911 kmem_cache_free(mm_slot_cache, mm_slot); 1903 kmem_cache_free(mm_slot_cache, mm_slot);
1912} 1904}
1913 1905
1914static int __init mm_slots_hash_init(void)
1915{
1916 mm_slots_hash = kzalloc(MM_SLOTS_HASH_HEADS * sizeof(struct hlist_head),
1917 GFP_KERNEL);
1918 if (!mm_slots_hash)
1919 return -ENOMEM;
1920 return 0;
1921}
1922
1923#if 0
1924static void __init mm_slots_hash_free(void)
1925{
1926 kfree(mm_slots_hash);
1927 mm_slots_hash = NULL;
1928}
1929#endif
1930
1931static struct mm_slot *get_mm_slot(struct mm_struct *mm) 1906static struct mm_slot *get_mm_slot(struct mm_struct *mm)
1932{ 1907{
1933 struct mm_slot *mm_slot; 1908 struct mm_slot *mm_slot;
1934 struct hlist_head *bucket;
1935 struct hlist_node *node;
1936 1909
1937 bucket = &mm_slots_hash[((unsigned long)mm / sizeof(struct mm_struct)) 1910 hash_for_each_possible(mm_slots_hash, mm_slot, hash, (unsigned long)mm)
1938 % MM_SLOTS_HASH_HEADS];
1939 hlist_for_each_entry(mm_slot, node, bucket, hash) {
1940 if (mm == mm_slot->mm) 1911 if (mm == mm_slot->mm)
1941 return mm_slot; 1912 return mm_slot;
1942 } 1913
1943 return NULL; 1914 return NULL;
1944} 1915}
1945 1916
1946static void insert_to_mm_slots_hash(struct mm_struct *mm, 1917static void insert_to_mm_slots_hash(struct mm_struct *mm,
1947 struct mm_slot *mm_slot) 1918 struct mm_slot *mm_slot)
1948{ 1919{
1949 struct hlist_head *bucket;
1950
1951 bucket = &mm_slots_hash[((unsigned long)mm / sizeof(struct mm_struct))
1952 % MM_SLOTS_HASH_HEADS];
1953 mm_slot->mm = mm; 1920 mm_slot->mm = mm;
1954 hlist_add_head(&mm_slot->hash, bucket); 1921 hash_add(mm_slots_hash, &mm_slot->hash, (long)mm);
1955} 1922}
1956 1923
1957static inline int khugepaged_test_exit(struct mm_struct *mm) 1924static inline int khugepaged_test_exit(struct mm_struct *mm)
@@ -2020,7 +1987,7 @@ void __khugepaged_exit(struct mm_struct *mm)
2020 spin_lock(&khugepaged_mm_lock); 1987 spin_lock(&khugepaged_mm_lock);
2021 mm_slot = get_mm_slot(mm); 1988 mm_slot = get_mm_slot(mm);
2022 if (mm_slot && khugepaged_scan.mm_slot != mm_slot) { 1989 if (mm_slot && khugepaged_scan.mm_slot != mm_slot) {
2023 hlist_del(&mm_slot->hash); 1990 hash_del(&mm_slot->hash);
2024 list_del(&mm_slot->mm_node); 1991 list_del(&mm_slot->mm_node);
2025 free = 1; 1992 free = 1;
2026 } 1993 }
@@ -2353,7 +2320,7 @@ static void collapse_huge_page(struct mm_struct *mm,
2353 BUG_ON(!pmd_none(*pmd)); 2320 BUG_ON(!pmd_none(*pmd));
2354 set_pmd_at(mm, address, pmd, _pmd); 2321 set_pmd_at(mm, address, pmd, _pmd);
2355 spin_unlock(&mm->page_table_lock); 2322 spin_unlock(&mm->page_table_lock);
2356 anon_vma_unlock(vma->anon_vma); 2323 anon_vma_unlock_write(vma->anon_vma);
2357 goto out; 2324 goto out;
2358 } 2325 }
2359 2326
@@ -2361,7 +2328,7 @@ static void collapse_huge_page(struct mm_struct *mm,
2361 * All pages are isolated and locked so anon_vma rmap 2328 * All pages are isolated and locked so anon_vma rmap
2362 * can't run anymore. 2329 * can't run anymore.
2363 */ 2330 */
2364 anon_vma_unlock(vma->anon_vma); 2331 anon_vma_unlock_write(vma->anon_vma);
2365 2332
2366 __collapse_huge_page_copy(pte, new_page, vma, address, ptl); 2333 __collapse_huge_page_copy(pte, new_page, vma, address, ptl);
2367 pte_unmap(pte); 2334 pte_unmap(pte);
@@ -2408,7 +2375,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
2408 struct page *page; 2375 struct page *page;
2409 unsigned long _address; 2376 unsigned long _address;
2410 spinlock_t *ptl; 2377 spinlock_t *ptl;
2411 int node = -1; 2378 int node = NUMA_NO_NODE;
2412 2379
2413 VM_BUG_ON(address & ~HPAGE_PMD_MASK); 2380 VM_BUG_ON(address & ~HPAGE_PMD_MASK);
2414 2381
@@ -2438,7 +2405,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
2438 * be more sophisticated and look at more pages, 2405 * be more sophisticated and look at more pages,
2439 * but isn't for now. 2406 * but isn't for now.
2440 */ 2407 */
2441 if (node == -1) 2408 if (node == NUMA_NO_NODE)
2442 node = page_to_nid(page); 2409 node = page_to_nid(page);
2443 VM_BUG_ON(PageCompound(page)); 2410 VM_BUG_ON(PageCompound(page));
2444 if (!PageLRU(page) || PageLocked(page) || !PageAnon(page)) 2411 if (!PageLRU(page) || PageLocked(page) || !PageAnon(page))
@@ -2469,7 +2436,7 @@ static void collect_mm_slot(struct mm_slot *mm_slot)
2469 2436
2470 if (khugepaged_test_exit(mm)) { 2437 if (khugepaged_test_exit(mm)) {
2471 /* free mm_slot */ 2438 /* free mm_slot */
2472 hlist_del(&mm_slot->hash); 2439 hash_del(&mm_slot->hash);
2473 list_del(&mm_slot->mm_node); 2440 list_del(&mm_slot->mm_node);
2474 2441
2475 /* 2442 /*
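
The khugepaged mm_slot bookkeeping above drops its hand-rolled, kzalloc'ed bucket array in favour of the generic <linux/hashtable.h> helpers. A minimal sketch of that API pattern follows; the struct and function names are illustrative only, not the actual khugepaged code.

#include <linux/hashtable.h>
#include <linux/mm_types.h>

struct item {
	struct mm_struct *mm;
	struct hlist_node hash;
};

static DEFINE_HASHTABLE(item_hash, 10);	/* 2^10 buckets, statically sized */

static struct item *item_lookup(struct mm_struct *mm)
{
	struct item *it;

	/* walk only the bucket that 'mm' hashes to */
	hash_for_each_possible(item_hash, it, hash, (unsigned long)mm)
		if (it->mm == mm)
			return it;
	return NULL;
}

static void item_insert(struct item *it, struct mm_struct *mm)
{
	it->mm = mm;
	hash_add(item_hash, &it->hash, (unsigned long)mm);
}

static void item_erase(struct item *it)
{
	hash_del(&it->hash);
}
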
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 4f3ea0b1e57c..0a0be33bb199 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -127,7 +127,7 @@ static inline struct hugepage_subpool *subpool_inode(struct inode *inode)
127 127
128static inline struct hugepage_subpool *subpool_vma(struct vm_area_struct *vma) 128static inline struct hugepage_subpool *subpool_vma(struct vm_area_struct *vma)
129{ 129{
130 return subpool_inode(vma->vm_file->f_dentry->d_inode); 130 return subpool_inode(file_inode(vma->vm_file));
131} 131}
132 132
133/* 133/*
@@ -1293,8 +1293,7 @@ static void __init report_hugepages(void)
1293 1293
1294 for_each_hstate(h) { 1294 for_each_hstate(h) {
1295 char buf[32]; 1295 char buf[32];
1296 printk(KERN_INFO "HugeTLB registered %s page size, " 1296 pr_info("HugeTLB registered %s page size, pre-allocated %ld pages\n",
1297 "pre-allocated %ld pages\n",
1298 memfmt(buf, huge_page_size(h)), 1297 memfmt(buf, huge_page_size(h)),
1299 h->free_huge_pages); 1298 h->free_huge_pages);
1300 } 1299 }
@@ -1702,8 +1701,7 @@ static void __init hugetlb_sysfs_init(void)
1702 err = hugetlb_sysfs_add_hstate(h, hugepages_kobj, 1701 err = hugetlb_sysfs_add_hstate(h, hugepages_kobj,
1703 hstate_kobjs, &hstate_attr_group); 1702 hstate_kobjs, &hstate_attr_group);
1704 if (err) 1703 if (err)
1705 printk(KERN_ERR "Hugetlb: Unable to add hstate %s", 1704 pr_err("Hugetlb: Unable to add hstate %s", h->name);
1706 h->name);
1707 } 1705 }
1708} 1706}
1709 1707
@@ -1826,9 +1824,8 @@ void hugetlb_register_node(struct node *node)
1826 nhs->hstate_kobjs, 1824 nhs->hstate_kobjs,
1827 &per_node_hstate_attr_group); 1825 &per_node_hstate_attr_group);
1828 if (err) { 1826 if (err) {
1829 printk(KERN_ERR "Hugetlb: Unable to add hstate %s" 1827 pr_err("Hugetlb: Unable to add hstate %s for node %d\n",
1830 " for node %d\n", 1828 h->name, node->dev.id);
1831 h->name, node->dev.id);
1832 hugetlb_unregister_node(node); 1829 hugetlb_unregister_node(node);
1833 break; 1830 break;
1834 } 1831 }
@@ -1924,7 +1921,7 @@ void __init hugetlb_add_hstate(unsigned order)
1924 unsigned long i; 1921 unsigned long i;
1925 1922
1926 if (size_to_hstate(PAGE_SIZE << order)) { 1923 if (size_to_hstate(PAGE_SIZE << order)) {
1927 printk(KERN_WARNING "hugepagesz= specified twice, ignoring\n"); 1924 pr_warning("hugepagesz= specified twice, ignoring\n");
1928 return; 1925 return;
1929 } 1926 }
1930 BUG_ON(hugetlb_max_hstate >= HUGE_MAX_HSTATE); 1927 BUG_ON(hugetlb_max_hstate >= HUGE_MAX_HSTATE);
@@ -1960,8 +1957,8 @@ static int __init hugetlb_nrpages_setup(char *s)
1960 mhp = &parsed_hstate->max_huge_pages; 1957 mhp = &parsed_hstate->max_huge_pages;
1961 1958
1962 if (mhp == last_mhp) { 1959 if (mhp == last_mhp) {
1963 printk(KERN_WARNING "hugepages= specified twice without " 1960 pr_warning("hugepages= specified twice without "
1964 "interleaving hugepagesz=, ignoring\n"); 1961 "interleaving hugepagesz=, ignoring\n");
1965 return 1; 1962 return 1;
1966 } 1963 }
1967 1964
@@ -2482,7 +2479,7 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
2482 address = address & huge_page_mask(h); 2479 address = address & huge_page_mask(h);
2483 pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) + 2480 pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) +
2484 vma->vm_pgoff; 2481 vma->vm_pgoff;
2485 mapping = vma->vm_file->f_dentry->d_inode->i_mapping; 2482 mapping = file_inode(vma->vm_file)->i_mapping;
2486 2483
2487 /* 2484 /*
2488 * Take the mapping lock for the duration of the table walk. As 2485 * Take the mapping lock for the duration of the table walk. As
@@ -2692,9 +2689,8 @@ static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
2692 * COW. Warn that such a situation has occurred as it may not be obvious 2689 * COW. Warn that such a situation has occurred as it may not be obvious
2693 */ 2690 */
2694 if (is_vma_resv_set(vma, HPAGE_RESV_UNMAPPED)) { 2691 if (is_vma_resv_set(vma, HPAGE_RESV_UNMAPPED)) {
2695 printk(KERN_WARNING 2692 pr_warning("PID %d killed due to inadequate hugepage pool\n",
2696 "PID %d killed due to inadequate hugepage pool\n", 2693 current->pid);
2697 current->pid);
2698 return ret; 2694 return ret;
2699 } 2695 }
2700 2696
@@ -2924,14 +2920,14 @@ follow_huge_pud(struct mm_struct *mm, unsigned long address,
2924 return NULL; 2920 return NULL;
2925} 2921}
2926 2922
2927int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, 2923long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
2928 struct page **pages, struct vm_area_struct **vmas, 2924 struct page **pages, struct vm_area_struct **vmas,
2929 unsigned long *position, int *length, int i, 2925 unsigned long *position, unsigned long *nr_pages,
2930 unsigned int flags) 2926 long i, unsigned int flags)
2931{ 2927{
2932 unsigned long pfn_offset; 2928 unsigned long pfn_offset;
2933 unsigned long vaddr = *position; 2929 unsigned long vaddr = *position;
2934 int remainder = *length; 2930 unsigned long remainder = *nr_pages;
2935 struct hstate *h = hstate_vma(vma); 2931 struct hstate *h = hstate_vma(vma);
2936 2932
2937 spin_lock(&mm->page_table_lock); 2933 spin_lock(&mm->page_table_lock);
@@ -3001,7 +2997,7 @@ same_page:
3001 } 2997 }
3002 } 2998 }
3003 spin_unlock(&mm->page_table_lock); 2999 spin_unlock(&mm->page_table_lock);
3004 *length = remainder; 3000 *nr_pages = remainder;
3005 *position = vaddr; 3001 *position = vaddr;
3006 3002
3007 return i ? i : -EFAULT; 3003 return i ? i : -EFAULT;
@@ -3033,6 +3029,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
3033 if (!huge_pte_none(huge_ptep_get(ptep))) { 3029 if (!huge_pte_none(huge_ptep_get(ptep))) {
3034 pte = huge_ptep_get_and_clear(mm, address, ptep); 3030 pte = huge_ptep_get_and_clear(mm, address, ptep);
3035 pte = pte_mkhuge(pte_modify(pte, newprot)); 3031 pte = pte_mkhuge(pte_modify(pte, newprot));
3032 pte = arch_make_huge_pte(pte, vma, NULL, 0);
3036 set_huge_pte_at(mm, address, ptep, pte); 3033 set_huge_pte_at(mm, address, ptep, pte);
3037 pages++; 3034 pages++;
3038 } 3035 }
diff --git a/mm/internal.h b/mm/internal.h
index d597f94cc205..8562de0a5197 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -135,7 +135,6 @@ struct compact_control {
135 int migratetype; /* MOVABLE, RECLAIMABLE etc */ 135 int migratetype; /* MOVABLE, RECLAIMABLE etc */
136 struct zone *zone; 136 struct zone *zone;
137 bool contended; /* True if a lock was contended */ 137 bool contended; /* True if a lock was contended */
138 struct page **page; /* Page captured of requested size */
139}; 138};
140 139
141unsigned long 140unsigned long
@@ -163,8 +162,8 @@ void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma,
163 struct vm_area_struct *prev, struct rb_node *rb_parent); 162 struct vm_area_struct *prev, struct rb_node *rb_parent);
164 163
165#ifdef CONFIG_MMU 164#ifdef CONFIG_MMU
166extern long mlock_vma_pages_range(struct vm_area_struct *vma, 165extern long __mlock_vma_pages_range(struct vm_area_struct *vma,
167 unsigned long start, unsigned long end); 166 unsigned long start, unsigned long end, int *nonblocking);
168extern void munlock_vma_pages_range(struct vm_area_struct *vma, 167extern void munlock_vma_pages_range(struct vm_area_struct *vma,
169 unsigned long start, unsigned long end); 168 unsigned long start, unsigned long end);
170static inline void munlock_vma_pages_all(struct vm_area_struct *vma) 169static inline void munlock_vma_pages_all(struct vm_area_struct *vma)
@@ -196,7 +195,7 @@ static inline int mlocked_vma_newpage(struct vm_area_struct *vma,
196 * must be called with vma's mmap_sem held for read or write, and page locked. 195 * must be called with vma's mmap_sem held for read or write, and page locked.
197 */ 196 */
198extern void mlock_vma_page(struct page *page); 197extern void mlock_vma_page(struct page *page);
199extern void munlock_vma_page(struct page *page); 198extern unsigned int munlock_vma_page(struct page *page);
200 199
201/* 200/*
202 * Clear the page's PageMlocked(). This can be useful in a situation where 201 * Clear the page's PageMlocked(). This can be useful in a situation where
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index 752a705c77c2..c8d7f3110fd0 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -436,7 +436,7 @@ static int get_object(struct kmemleak_object *object)
436 */ 436 */
437static void free_object_rcu(struct rcu_head *rcu) 437static void free_object_rcu(struct rcu_head *rcu)
438{ 438{
439 struct hlist_node *elem, *tmp; 439 struct hlist_node *tmp;
440 struct kmemleak_scan_area *area; 440 struct kmemleak_scan_area *area;
441 struct kmemleak_object *object = 441 struct kmemleak_object *object =
442 container_of(rcu, struct kmemleak_object, rcu); 442 container_of(rcu, struct kmemleak_object, rcu);
@@ -445,8 +445,8 @@ static void free_object_rcu(struct rcu_head *rcu)
445 * Once use_count is 0 (guaranteed by put_object), there is no other 445 * Once use_count is 0 (guaranteed by put_object), there is no other
446 * code accessing this object, hence no need for locking. 446 * code accessing this object, hence no need for locking.
447 */ 447 */
448 hlist_for_each_entry_safe(area, elem, tmp, &object->area_list, node) { 448 hlist_for_each_entry_safe(area, tmp, &object->area_list, node) {
449 hlist_del(elem); 449 hlist_del(&area->node);
450 kmem_cache_free(scan_area_cache, area); 450 kmem_cache_free(scan_area_cache, area);
451 } 451 }
452 kmem_cache_free(object_cache, object); 452 kmem_cache_free(object_cache, object);
@@ -1177,7 +1177,6 @@ static void scan_block(void *_start, void *_end,
1177static void scan_object(struct kmemleak_object *object) 1177static void scan_object(struct kmemleak_object *object)
1178{ 1178{
1179 struct kmemleak_scan_area *area; 1179 struct kmemleak_scan_area *area;
1180 struct hlist_node *elem;
1181 unsigned long flags; 1180 unsigned long flags;
1182 1181
1183 /* 1182 /*
@@ -1205,7 +1204,7 @@ static void scan_object(struct kmemleak_object *object)
1205 spin_lock_irqsave(&object->lock, flags); 1204 spin_lock_irqsave(&object->lock, flags);
1206 } 1205 }
1207 } else 1206 } else
1208 hlist_for_each_entry(area, elem, &object->area_list, node) 1207 hlist_for_each_entry(area, &object->area_list, node)
1209 scan_block((void *)area->start, 1208 scan_block((void *)area->start,
1210 (void *)(area->start + area->size), 1209 (void *)(area->start + area->size),
1211 object, 0); 1210 object, 0);
@@ -1300,9 +1299,8 @@ static void kmemleak_scan(void)
1300 */ 1299 */
1301 lock_memory_hotplug(); 1300 lock_memory_hotplug();
1302 for_each_online_node(i) { 1301 for_each_online_node(i) {
1303 pg_data_t *pgdat = NODE_DATA(i); 1302 unsigned long start_pfn = node_start_pfn(i);
1304 unsigned long start_pfn = pgdat->node_start_pfn; 1303 unsigned long end_pfn = node_end_pfn(i);
1305 unsigned long end_pfn = start_pfn + pgdat->node_spanned_pages;
1306 unsigned long pfn; 1304 unsigned long pfn;
1307 1305
1308 for (pfn = start_pfn; pfn < end_pfn; pfn++) { 1306 for (pfn = start_pfn; pfn < end_pfn; pfn++) {
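The kmemleak hunks above, like several ksm.c hunks below, track this release's hlist iterator cleanup: hlist_for_each_entry() and hlist_for_each_entry_safe() no longer take a separate struct hlist_node cursor, because the typed entry pointer is now the cursor itself. A minimal sketch of the new calling convention, using a hypothetical struct item with an embedded hlist_node named link (illustrative only, not part of the patch):

	#include <linux/list.h>
	#include <linux/printk.h>
	#include <linux/slab.h>

	struct item {
		int value;
		struct hlist_node link;
	};

	static void drain_items(struct hlist_head *head)
	{
		struct item *it;	/* the entry pointer doubles as the cursor */
		struct hlist_node *tmp;	/* only the _safe variant needs scratch space */

		hlist_for_each_entry(it, head, link)
			pr_debug("item %d\n", it->value);

		hlist_for_each_entry_safe(it, tmp, head, link) {
			hlist_del(&it->link);	/* delete through the embedded node */
			kfree(it);
		}
	}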
diff --git a/mm/ksm.c b/mm/ksm.c
index 51573858938d..85bfd4c16346 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -33,13 +33,22 @@
33#include <linux/mmu_notifier.h> 33#include <linux/mmu_notifier.h>
34#include <linux/swap.h> 34#include <linux/swap.h>
35#include <linux/ksm.h> 35#include <linux/ksm.h>
36#include <linux/hash.h> 36#include <linux/hashtable.h>
37#include <linux/freezer.h> 37#include <linux/freezer.h>
38#include <linux/oom.h> 38#include <linux/oom.h>
39#include <linux/numa.h>
39 40
40#include <asm/tlbflush.h> 41#include <asm/tlbflush.h>
41#include "internal.h" 42#include "internal.h"
42 43
44#ifdef CONFIG_NUMA
45#define NUMA(x) (x)
46#define DO_NUMA(x) do { (x); } while (0)
47#else
48#define NUMA(x) (0)
49#define DO_NUMA(x) do { } while (0)
50#endif
51
43/* 52/*
44 * A few notes about the KSM scanning process, 53 * A few notes about the KSM scanning process,
45 * to make it easier to understand the data structures below: 54 * to make it easier to understand the data structures below:
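The NUMA() and DO_NUMA() wrappers added above let the per-node bookkeeping compile away on !CONFIG_NUMA builds: the non-NUMA definitions drop their argument during preprocessing, so expressions naming the nid fields never reach the compiler and the fields themselves can stay inside #ifdef CONFIG_NUMA. For example, two statements that appear later in this patch expand as follows (expansion shown for illustration, assuming the definitions above):

	/* CONFIG_NUMA=y: the wrappers are transparent */
	rb_erase(&rmap_item->node, root_unstable_tree + NUMA(rmap_item->nid));
		/* -> rb_erase(&rmap_item->node, root_unstable_tree + (rmap_item->nid)); */
	DO_NUMA(rmap_item->nid = nid);
		/* -> do { (rmap_item->nid = nid); } while (0); */

	/* CONFIG_NUMA=n: the argument is discarded, so rmap_item->nid
	 * need not exist in this configuration at all */
	rb_erase(&rmap_item->node, root_unstable_tree + NUMA(rmap_item->nid));
		/* -> rb_erase(&rmap_item->node, root_unstable_tree + (0)); */
	DO_NUMA(rmap_item->nid = nid);
		/* -> do { } while (0); */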
@@ -78,6 +87,9 @@
78 * take 10 attempts to find a page in the unstable tree, once it is found, 87 * take 10 attempts to find a page in the unstable tree, once it is found,
79 * it is secured in the stable tree. (When we scan a new page, we first 88 * it is secured in the stable tree. (When we scan a new page, we first
80 * compare it against the stable tree, and then against the unstable tree.) 89 * compare it against the stable tree, and then against the unstable tree.)
90 *
91 * If the merge_across_nodes tunable is unset, then KSM maintains multiple
92 * stable trees and multiple unstable trees: one of each for each NUMA node.
81 */ 93 */
82 94
83/** 95/**
@@ -113,19 +125,32 @@ struct ksm_scan {
113/** 125/**
114 * struct stable_node - node of the stable rbtree 126 * struct stable_node - node of the stable rbtree
115 * @node: rb node of this ksm page in the stable tree 127 * @node: rb node of this ksm page in the stable tree
128 * @head: (overlaying parent) &migrate_nodes indicates temporarily on that list
129 * @list: linked into migrate_nodes, pending placement in the proper node tree
116 * @hlist: hlist head of rmap_items using this ksm page 130 * @hlist: hlist head of rmap_items using this ksm page
117 * @kpfn: page frame number of this ksm page 131 * @kpfn: page frame number of this ksm page (perhaps temporarily on wrong nid)
132 * @nid: NUMA node id of stable tree in which linked (may not match kpfn)
118 */ 133 */
119struct stable_node { 134struct stable_node {
120 struct rb_node node; 135 union {
136 struct rb_node node; /* when node of stable tree */
137 struct { /* when listed for migration */
138 struct list_head *head;
139 struct list_head list;
140 };
141 };
121 struct hlist_head hlist; 142 struct hlist_head hlist;
122 unsigned long kpfn; 143 unsigned long kpfn;
144#ifdef CONFIG_NUMA
145 int nid;
146#endif
123}; 147};
124 148
125/** 149/**
126 * struct rmap_item - reverse mapping item for virtual addresses 150 * struct rmap_item - reverse mapping item for virtual addresses
127 * @rmap_list: next rmap_item in mm_slot's singly-linked rmap_list 151 * @rmap_list: next rmap_item in mm_slot's singly-linked rmap_list
128 * @anon_vma: pointer to anon_vma for this mm,address, when in stable tree 152 * @anon_vma: pointer to anon_vma for this mm,address, when in stable tree
153 * @nid: NUMA node id of unstable tree in which linked (may not match page)
129 * @mm: the memory structure this rmap_item is pointing into 154 * @mm: the memory structure this rmap_item is pointing into
130 * @address: the virtual address this rmap_item tracks (+ flags in low bits) 155 * @address: the virtual address this rmap_item tracks (+ flags in low bits)
131 * @oldchecksum: previous checksum of the page at that virtual address 156 * @oldchecksum: previous checksum of the page at that virtual address
@@ -135,7 +160,12 @@ struct stable_node {
135 */ 160 */
136struct rmap_item { 161struct rmap_item {
137 struct rmap_item *rmap_list; 162 struct rmap_item *rmap_list;
138 struct anon_vma *anon_vma; /* when stable */ 163 union {
164 struct anon_vma *anon_vma; /* when stable */
165#ifdef CONFIG_NUMA
166 int nid; /* when node of unstable tree */
167#endif
168 };
139 struct mm_struct *mm; 169 struct mm_struct *mm;
140 unsigned long address; /* + low bits used for flags below */ 170 unsigned long address; /* + low bits used for flags below */
141 unsigned int oldchecksum; /* when unstable */ 171 unsigned int oldchecksum; /* when unstable */
@@ -153,12 +183,16 @@ struct rmap_item {
153#define STABLE_FLAG 0x200 /* is listed from the stable tree */ 183#define STABLE_FLAG 0x200 /* is listed from the stable tree */
154 184
155/* The stable and unstable tree heads */ 185/* The stable and unstable tree heads */
156static struct rb_root root_stable_tree = RB_ROOT; 186static struct rb_root one_stable_tree[1] = { RB_ROOT };
157static struct rb_root root_unstable_tree = RB_ROOT; 187static struct rb_root one_unstable_tree[1] = { RB_ROOT };
188static struct rb_root *root_stable_tree = one_stable_tree;
189static struct rb_root *root_unstable_tree = one_unstable_tree;
158 190
159#define MM_SLOTS_HASH_SHIFT 10 191/* Recently migrated nodes of stable tree, pending proper placement */
160#define MM_SLOTS_HASH_HEADS (1 << MM_SLOTS_HASH_SHIFT) 192static LIST_HEAD(migrate_nodes);
161static struct hlist_head mm_slots_hash[MM_SLOTS_HASH_HEADS]; 193
194#define MM_SLOTS_HASH_BITS 10
195static DEFINE_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS);
162 196
163static struct mm_slot ksm_mm_head = { 197static struct mm_slot ksm_mm_head = {
164 .mm_list = LIST_HEAD_INIT(ksm_mm_head.mm_list), 198 .mm_list = LIST_HEAD_INIT(ksm_mm_head.mm_list),
@@ -189,10 +223,21 @@ static unsigned int ksm_thread_pages_to_scan = 100;
189/* Milliseconds ksmd should sleep between batches */ 223/* Milliseconds ksmd should sleep between batches */
190static unsigned int ksm_thread_sleep_millisecs = 20; 224static unsigned int ksm_thread_sleep_millisecs = 20;
191 225
226#ifdef CONFIG_NUMA
227/* Zeroed when merging across nodes is not allowed */
228static unsigned int ksm_merge_across_nodes = 1;
229static int ksm_nr_node_ids = 1;
230#else
231#define ksm_merge_across_nodes 1U
232#define ksm_nr_node_ids 1
233#endif
234
192#define KSM_RUN_STOP 0 235#define KSM_RUN_STOP 0
193#define KSM_RUN_MERGE 1 236#define KSM_RUN_MERGE 1
194#define KSM_RUN_UNMERGE 2 237#define KSM_RUN_UNMERGE 2
195static unsigned int ksm_run = KSM_RUN_STOP; 238#define KSM_RUN_OFFLINE 4
239static unsigned long ksm_run = KSM_RUN_STOP;
240static void wait_while_offlining(void);
196 241
197static DECLARE_WAIT_QUEUE_HEAD(ksm_thread_wait); 242static DECLARE_WAIT_QUEUE_HEAD(ksm_thread_wait);
198static DEFINE_MUTEX(ksm_thread_mutex); 243static DEFINE_MUTEX(ksm_thread_mutex);
@@ -275,31 +320,20 @@ static inline void free_mm_slot(struct mm_slot *mm_slot)
275 320
276static struct mm_slot *get_mm_slot(struct mm_struct *mm) 321static struct mm_slot *get_mm_slot(struct mm_struct *mm)
277{ 322{
278 struct mm_slot *mm_slot; 323 struct mm_slot *slot;
279 struct hlist_head *bucket; 324
280 struct hlist_node *node; 325 hash_for_each_possible(mm_slots_hash, slot, link, (unsigned long)mm)
326 if (slot->mm == mm)
327 return slot;
281 328
282 bucket = &mm_slots_hash[hash_ptr(mm, MM_SLOTS_HASH_SHIFT)];
283 hlist_for_each_entry(mm_slot, node, bucket, link) {
284 if (mm == mm_slot->mm)
285 return mm_slot;
286 }
287 return NULL; 329 return NULL;
288} 330}
289 331
290static void insert_to_mm_slots_hash(struct mm_struct *mm, 332static void insert_to_mm_slots_hash(struct mm_struct *mm,
291 struct mm_slot *mm_slot) 333 struct mm_slot *mm_slot)
292{ 334{
293 struct hlist_head *bucket;
294
295 bucket = &mm_slots_hash[hash_ptr(mm, MM_SLOTS_HASH_SHIFT)];
296 mm_slot->mm = mm; 335 mm_slot->mm = mm;
297 hlist_add_head(&mm_slot->link, bucket); 336 hash_add(mm_slots_hash, &mm_slot->link, (unsigned long)mm);
298}
299
300static inline int in_stable_tree(struct rmap_item *rmap_item)
301{
302 return rmap_item->address & STABLE_FLAG;
303} 337}
304 338
305/* 339/*
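The mm_slots_hash conversion above swaps an open-coded bucket array (hash_ptr() plus hlist_add_head()) for the generic <linux/hashtable.h> helpers. A minimal sketch of that API, using a hypothetical struct foo keyed by a pointer; the names foo, foo_hash, add_foo, find_foo and del_foo are illustrative, not from the patch:

	#include <linux/hashtable.h>
	#include <linux/slab.h>

	struct foo {
		void *key;
		struct hlist_node link;
	};

	static DEFINE_HASHTABLE(foo_hash, 10);	/* 2^10 buckets, statically sized */

	static void add_foo(struct foo *f, void *key)
	{
		f->key = key;
		hash_add(foo_hash, &f->link, (unsigned long)key);
	}

	static struct foo *find_foo(void *key)
	{
		struct foo *f;

		/* walks only the bucket that the key hashes to */
		hash_for_each_possible(foo_hash, f, link, (unsigned long)key)
			if (f->key == key)
				return f;
		return NULL;
	}

	static void del_foo(struct foo *f)
	{
		hash_del(&f->link);
		kfree(f);
	}

get_mm_slot() and insert_to_mm_slots_hash() above follow exactly this shape, with the mm_struct pointer as the key.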
@@ -333,7 +367,7 @@ static int break_ksm(struct vm_area_struct *vma, unsigned long addr)
333 367
334 do { 368 do {
335 cond_resched(); 369 cond_resched();
336 page = follow_page(vma, addr, FOLL_GET); 370 page = follow_page(vma, addr, FOLL_GET | FOLL_MIGRATION);
337 if (IS_ERR_OR_NULL(page)) 371 if (IS_ERR_OR_NULL(page))
338 break; 372 break;
339 if (PageKsm(page)) 373 if (PageKsm(page))
@@ -447,12 +481,22 @@ out: page = NULL;
447 return page; 481 return page;
448} 482}
449 483
484/*
485 * This helper is used for getting the right index into the array of tree roots.
486 * When the merge_across_nodes knob is set to 1, there are only two rb-trees in
487 * total, for stable and unstable pages from all nodes, both at index 0. Otherwise,
488 * every node has its own stable and unstable tree.
489 */
490static inline int get_kpfn_nid(unsigned long kpfn)
491{
492 return ksm_merge_across_nodes ? 0 : pfn_to_nid(kpfn);
493}
494
450static void remove_node_from_stable_tree(struct stable_node *stable_node) 495static void remove_node_from_stable_tree(struct stable_node *stable_node)
451{ 496{
452 struct rmap_item *rmap_item; 497 struct rmap_item *rmap_item;
453 struct hlist_node *hlist;
454 498
455 hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) { 499 hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) {
456 if (rmap_item->hlist.next) 500 if (rmap_item->hlist.next)
457 ksm_pages_sharing--; 501 ksm_pages_sharing--;
458 else 502 else
@@ -462,7 +506,11 @@ static void remove_node_from_stable_tree(struct stable_node *stable_node)
462 cond_resched(); 506 cond_resched();
463 } 507 }
464 508
465 rb_erase(&stable_node->node, &root_stable_tree); 509 if (stable_node->head == &migrate_nodes)
510 list_del(&stable_node->list);
511 else
512 rb_erase(&stable_node->node,
513 root_stable_tree + NUMA(stable_node->nid));
466 free_stable_node(stable_node); 514 free_stable_node(stable_node);
467} 515}
468 516
@@ -472,6 +520,7 @@ static void remove_node_from_stable_tree(struct stable_node *stable_node)
472 * In which case we can trust the content of the page, and it 520 * In which case we can trust the content of the page, and it
473 * returns the gotten page; but if the page has now been zapped, 521 * returns the gotten page; but if the page has now been zapped,
474 * remove the stale node from the stable tree and return NULL. 522 * remove the stale node from the stable tree and return NULL.
523 * But beware, the stable node's page might be being migrated.
475 * 524 *
476 * You would expect the stable_node to hold a reference to the ksm page. 525 * You would expect the stable_node to hold a reference to the ksm page.
477 * But if it increments the page's count, swapping out has to wait for 526 * But if it increments the page's count, swapping out has to wait for
@@ -482,40 +531,77 @@ static void remove_node_from_stable_tree(struct stable_node *stable_node)
482 * pointing back to this stable node. This relies on freeing a PageAnon 531 * pointing back to this stable node. This relies on freeing a PageAnon
483 * page to reset its page->mapping to NULL, and relies on no other use of 532 * page to reset its page->mapping to NULL, and relies on no other use of
484 * a page to put something that might look like our key in page->mapping. 533 * a page to put something that might look like our key in page->mapping.
485 *
486 * include/linux/pagemap.h page_cache_get_speculative() is a good reference,
487 * but this is different - made simpler by ksm_thread_mutex being held, but
488 * interesting for assuming that no other use of the struct page could ever
489 * put our expected_mapping into page->mapping (or a field of the union which
490 * coincides with page->mapping). The RCU calls are not for KSM at all, but
491 * to keep the page_count protocol described with page_cache_get_speculative.
492 *
493 * Note: it is possible that get_ksm_page() will return NULL one moment,
494 * then page the next, if the page is in between page_freeze_refs() and
495 * page_unfreeze_refs(): this shouldn't be a problem anywhere, the page
496 * is on its way to being freed; but it is an anomaly to bear in mind. 534 * is on its way to being freed; but it is an anomaly to bear in mind.
497 */ 535 */
498static struct page *get_ksm_page(struct stable_node *stable_node) 536static struct page *get_ksm_page(struct stable_node *stable_node, bool lock_it)
499{ 537{
500 struct page *page; 538 struct page *page;
501 void *expected_mapping; 539 void *expected_mapping;
540 unsigned long kpfn;
502 541
503 page = pfn_to_page(stable_node->kpfn);
504 expected_mapping = (void *)stable_node + 542 expected_mapping = (void *)stable_node +
505 (PAGE_MAPPING_ANON | PAGE_MAPPING_KSM); 543 (PAGE_MAPPING_ANON | PAGE_MAPPING_KSM);
506 rcu_read_lock(); 544again:
507 if (page->mapping != expected_mapping) 545 kpfn = ACCESS_ONCE(stable_node->kpfn);
508 goto stale; 546 page = pfn_to_page(kpfn);
509 if (!get_page_unless_zero(page)) 547
548 /*
549 * page is computed from kpfn, so on most architectures reading
550 * page->mapping is naturally ordered after reading node->kpfn,
551 * but on Alpha we need to be more careful.
552 */
553 smp_read_barrier_depends();
554 if (ACCESS_ONCE(page->mapping) != expected_mapping)
510 goto stale; 555 goto stale;
511 if (page->mapping != expected_mapping) { 556
557 /*
558 * We cannot do anything with the page while its refcount is 0.
559 * Usually 0 means free, or tail of a higher-order page: in which
560 * case this node is no longer referenced, and should be freed;
561 * however, it might mean that the page is under page_freeze_refs().
562 * The __remove_mapping() case is easy, again the node is now stale;
563 * but if page is swapcache in migrate_page_move_mapping(), it might
564 * still be our page, in which case it's essential to keep the node.
565 */
566 while (!get_page_unless_zero(page)) {
567 /*
568 * Another check for page->mapping != expected_mapping would
569 * work here too. We have chosen the !PageSwapCache test to
570 * optimize the common case, when the page is or is about to
571 * be freed: PageSwapCache is cleared (under spin_lock_irq)
572 * in the freeze_refs section of __remove_mapping(); but Anon
573 * page->mapping reset to NULL later, in free_pages_prepare().
574 */
575 if (!PageSwapCache(page))
576 goto stale;
577 cpu_relax();
578 }
579
580 if (ACCESS_ONCE(page->mapping) != expected_mapping) {
512 put_page(page); 581 put_page(page);
513 goto stale; 582 goto stale;
514 } 583 }
515 rcu_read_unlock(); 584
585 if (lock_it) {
586 lock_page(page);
587 if (ACCESS_ONCE(page->mapping) != expected_mapping) {
588 unlock_page(page);
589 put_page(page);
590 goto stale;
591 }
592 }
516 return page; 593 return page;
594
517stale: 595stale:
518 rcu_read_unlock(); 596 /*
597 * We come here from above when page->mapping or !PageSwapCache
598 * suggests that the node is stale; but it might be under migration.
599 * We need smp_rmb(), matching the smp_wmb() in ksm_migrate_page(),
600 * before checking whether node->kpfn has been changed.
601 */
602 smp_rmb();
603 if (ACCESS_ONCE(stable_node->kpfn) != kpfn)
604 goto again;
519 remove_node_from_stable_tree(stable_node); 605 remove_node_from_stable_tree(stable_node);
520 return NULL; 606 return NULL;
521} 607}
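Condensed, the rewritten get_ksm_page() above is a speculative-reference idiom: read the key (kpfn), validate page->mapping, take a reference only if the count is non-zero, then re-validate with the reference held; and a node that looks stale may merely be mid-migration, so kpfn is re-read after an smp_rmb() that pairs with the smp_wmb() added to ksm_migrate_page() later in this patch. A stripped-down sketch of that flow, written as if inside ksm.c (the PageSwapCache busy-wait for pages caught in swapcache migration and the optional page lock are omitted; get_ksm_page_sketch is an illustrative name):

	static struct page *get_ksm_page_sketch(struct stable_node *stable_node,
						void *expected_mapping)
	{
		struct page *page;
		unsigned long kpfn;
	again:
		kpfn = ACCESS_ONCE(stable_node->kpfn);
		page = pfn_to_page(kpfn);
		smp_read_barrier_depends();	/* order ->mapping read after ->kpfn read */

		if (ACCESS_ONCE(page->mapping) != expected_mapping)
			goto stale;
		if (!get_page_unless_zero(page))	/* refcount 0: free or frozen */
			goto stale;
		if (ACCESS_ONCE(page->mapping) != expected_mapping) {
			put_page(page);			/* lost a race: drop our reference */
			goto stale;
		}
		return page;
	stale:
		smp_rmb();		/* pairs with smp_wmb() in ksm_migrate_page() */
		if (ACCESS_ONCE(stable_node->kpfn) != kpfn)
			goto again;	/* kpfn moved: the page was migrated, retry */
		remove_node_from_stable_tree(stable_node);
		return NULL;
	}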
@@ -531,11 +617,10 @@ static void remove_rmap_item_from_tree(struct rmap_item *rmap_item)
531 struct page *page; 617 struct page *page;
532 618
533 stable_node = rmap_item->head; 619 stable_node = rmap_item->head;
534 page = get_ksm_page(stable_node); 620 page = get_ksm_page(stable_node, true);
535 if (!page) 621 if (!page)
536 goto out; 622 goto out;
537 623
538 lock_page(page);
539 hlist_del(&rmap_item->hlist); 624 hlist_del(&rmap_item->hlist);
540 unlock_page(page); 625 unlock_page(page);
541 put_page(page); 626 put_page(page);
@@ -560,8 +645,8 @@ static void remove_rmap_item_from_tree(struct rmap_item *rmap_item)
560 age = (unsigned char)(ksm_scan.seqnr - rmap_item->address); 645 age = (unsigned char)(ksm_scan.seqnr - rmap_item->address);
561 BUG_ON(age > 1); 646 BUG_ON(age > 1);
562 if (!age) 647 if (!age)
563 rb_erase(&rmap_item->node, &root_unstable_tree); 648 rb_erase(&rmap_item->node,
564 649 root_unstable_tree + NUMA(rmap_item->nid));
565 ksm_pages_unshared--; 650 ksm_pages_unshared--;
566 rmap_item->address &= PAGE_MASK; 651 rmap_item->address &= PAGE_MASK;
567 } 652 }
@@ -581,7 +666,7 @@ static void remove_trailing_rmap_items(struct mm_slot *mm_slot,
581} 666}
582 667
583/* 668/*
584 * Though it's very tempting to unmerge in_stable_tree(rmap_item)s rather 669 * Though it's very tempting to unmerge rmap_items from stable tree rather
585 * than check every pte of a given vma, the locking doesn't quite work for 670 * than check every pte of a given vma, the locking doesn't quite work for
586 * that - an rmap_item is assigned to the stable tree after inserting ksm 671 * that - an rmap_item is assigned to the stable tree after inserting ksm
587 * page and upping mmap_sem. Nor does it fit with the way we skip dup'ing 672 * page and upping mmap_sem. Nor does it fit with the way we skip dup'ing
@@ -614,6 +699,71 @@ static int unmerge_ksm_pages(struct vm_area_struct *vma,
614/* 699/*
615 * Only called through the sysfs control interface: 700 * Only called through the sysfs control interface:
616 */ 701 */
702static int remove_stable_node(struct stable_node *stable_node)
703{
704 struct page *page;
705 int err;
706
707 page = get_ksm_page(stable_node, true);
708 if (!page) {
709 /*
710 * get_ksm_page did remove_node_from_stable_tree itself.
711 */
712 return 0;
713 }
714
715 if (WARN_ON_ONCE(page_mapped(page))) {
716 /*
717 * This should not happen: but if it does, just refuse to let
718 * merge_across_nodes be switched - there is no need to panic.
719 */
720 err = -EBUSY;
721 } else {
722 /*
723 * The stable node did not yet appear stale to get_ksm_page(),
724 * since that allows for an unmapped ksm page to be recognized
725 * right up until it is freed; but the node is safe to remove.
726 * This page might be in a pagevec waiting to be freed,
727 * or it might be PageSwapCache (perhaps under writeback),
728 * or it might have been removed from swapcache a moment ago.
729 */
730 set_page_stable_node(page, NULL);
731 remove_node_from_stable_tree(stable_node);
732 err = 0;
733 }
734
735 unlock_page(page);
736 put_page(page);
737 return err;
738}
739
740static int remove_all_stable_nodes(void)
741{
742 struct stable_node *stable_node;
743 struct list_head *this, *next;
744 int nid;
745 int err = 0;
746
747 for (nid = 0; nid < ksm_nr_node_ids; nid++) {
748 while (root_stable_tree[nid].rb_node) {
749 stable_node = rb_entry(root_stable_tree[nid].rb_node,
750 struct stable_node, node);
751 if (remove_stable_node(stable_node)) {
752 err = -EBUSY;
753 break; /* proceed to next nid */
754 }
755 cond_resched();
756 }
757 }
758 list_for_each_safe(this, next, &migrate_nodes) {
759 stable_node = list_entry(this, struct stable_node, list);
760 if (remove_stable_node(stable_node))
761 err = -EBUSY;
762 cond_resched();
763 }
764 return err;
765}
766
617static int unmerge_and_remove_all_rmap_items(void) 767static int unmerge_and_remove_all_rmap_items(void)
618{ 768{
619 struct mm_slot *mm_slot; 769 struct mm_slot *mm_slot;
@@ -647,7 +797,7 @@ static int unmerge_and_remove_all_rmap_items(void)
647 ksm_scan.mm_slot = list_entry(mm_slot->mm_list.next, 797 ksm_scan.mm_slot = list_entry(mm_slot->mm_list.next,
648 struct mm_slot, mm_list); 798 struct mm_slot, mm_list);
649 if (ksm_test_exit(mm)) { 799 if (ksm_test_exit(mm)) {
650 hlist_del(&mm_slot->link); 800 hash_del(&mm_slot->link);
651 list_del(&mm_slot->mm_list); 801 list_del(&mm_slot->mm_list);
652 spin_unlock(&ksm_mmlist_lock); 802 spin_unlock(&ksm_mmlist_lock);
653 803
@@ -661,6 +811,8 @@ static int unmerge_and_remove_all_rmap_items(void)
661 } 811 }
662 } 812 }
663 813
814 /* Clean up stable nodes, but don't worry if some are still busy */
815 remove_all_stable_nodes();
664 ksm_scan.seqnr = 0; 816 ksm_scan.seqnr = 0;
665 return 0; 817 return 0;
666 818
@@ -946,6 +1098,9 @@ static int try_to_merge_with_ksm_page(struct rmap_item *rmap_item,
946 if (err) 1098 if (err)
947 goto out; 1099 goto out;
948 1100
1101 /* Unstable nid is in union with stable anon_vma: remove first */
1102 remove_rmap_item_from_tree(rmap_item);
1103
949 /* Must get reference to anon_vma while still holding mmap_sem */ 1104 /* Must get reference to anon_vma while still holding mmap_sem */
950 rmap_item->anon_vma = vma->anon_vma; 1105 rmap_item->anon_vma = vma->anon_vma;
951 get_anon_vma(vma->anon_vma); 1106 get_anon_vma(vma->anon_vma);
@@ -996,42 +1151,99 @@ static struct page *try_to_merge_two_pages(struct rmap_item *rmap_item,
996 */ 1151 */
997static struct page *stable_tree_search(struct page *page) 1152static struct page *stable_tree_search(struct page *page)
998{ 1153{
999 struct rb_node *node = root_stable_tree.rb_node; 1154 int nid;
1155 struct rb_root *root;
1156 struct rb_node **new;
1157 struct rb_node *parent;
1000 struct stable_node *stable_node; 1158 struct stable_node *stable_node;
1159 struct stable_node *page_node;
1001 1160
1002 stable_node = page_stable_node(page); 1161 page_node = page_stable_node(page);
1003 if (stable_node) { /* ksm page forked */ 1162 if (page_node && page_node->head != &migrate_nodes) {
1163 /* ksm page forked */
1004 get_page(page); 1164 get_page(page);
1005 return page; 1165 return page;
1006 } 1166 }
1007 1167
1008 while (node) { 1168 nid = get_kpfn_nid(page_to_pfn(page));
1169 root = root_stable_tree + nid;
1170again:
1171 new = &root->rb_node;
1172 parent = NULL;
1173
1174 while (*new) {
1009 struct page *tree_page; 1175 struct page *tree_page;
1010 int ret; 1176 int ret;
1011 1177
1012 cond_resched(); 1178 cond_resched();
1013 stable_node = rb_entry(node, struct stable_node, node); 1179 stable_node = rb_entry(*new, struct stable_node, node);
1014 tree_page = get_ksm_page(stable_node); 1180 tree_page = get_ksm_page(stable_node, false);
1015 if (!tree_page) 1181 if (!tree_page)
1016 return NULL; 1182 return NULL;
1017 1183
1018 ret = memcmp_pages(page, tree_page); 1184 ret = memcmp_pages(page, tree_page);
1185 put_page(tree_page);
1019 1186
1020 if (ret < 0) { 1187 parent = *new;
1021 put_page(tree_page); 1188 if (ret < 0)
1022 node = node->rb_left; 1189 new = &parent->rb_left;
1023 } else if (ret > 0) { 1190 else if (ret > 0)
1024 put_page(tree_page); 1191 new = &parent->rb_right;
1025 node = node->rb_right; 1192 else {
1026 } else 1193 /*
1027 return tree_page; 1194 * Lock and unlock the stable_node's page (which
1195 * might already have been migrated) so that page
1196 * migration is sure to notice its raised count.
1197 * It would be more elegant to return stable_node
1198 * than kpage, but that involves more changes.
1199 */
1200 tree_page = get_ksm_page(stable_node, true);
1201 if (tree_page) {
1202 unlock_page(tree_page);
1203 if (get_kpfn_nid(stable_node->kpfn) !=
1204 NUMA(stable_node->nid)) {
1205 put_page(tree_page);
1206 goto replace;
1207 }
1208 return tree_page;
1209 }
1210 /*
1211 * There is now a place for page_node, but the tree may
1212 * have been rebalanced, so re-evaluate parent and new.
1213 */
1214 if (page_node)
1215 goto again;
1216 return NULL;
1217 }
1028 } 1218 }
1029 1219
1030 return NULL; 1220 if (!page_node)
1221 return NULL;
1222
1223 list_del(&page_node->list);
1224 DO_NUMA(page_node->nid = nid);
1225 rb_link_node(&page_node->node, parent, new);
1226 rb_insert_color(&page_node->node, root);
1227 get_page(page);
1228 return page;
1229
1230replace:
1231 if (page_node) {
1232 list_del(&page_node->list);
1233 DO_NUMA(page_node->nid = nid);
1234 rb_replace_node(&stable_node->node, &page_node->node, root);
1235 get_page(page);
1236 } else {
1237 rb_erase(&stable_node->node, root);
1238 page = NULL;
1239 }
1240 stable_node->head = &migrate_nodes;
1241 list_add(&stable_node->list, stable_node->head);
1242 return page;
1031} 1243}
1032 1244
1033/* 1245/*
1034 * stable_tree_insert - insert rmap_item pointing to new ksm page 1246 * stable_tree_insert - insert stable tree node pointing to new ksm page
1035 * into the stable tree. 1247 * into the stable tree.
1036 * 1248 *
1037 * This function returns the stable tree node just allocated on success, 1249 * This function returns the stable tree node just allocated on success,
@@ -1039,17 +1251,25 @@ static struct page *stable_tree_search(struct page *page)
1039 */ 1251 */
1040static struct stable_node *stable_tree_insert(struct page *kpage) 1252static struct stable_node *stable_tree_insert(struct page *kpage)
1041{ 1253{
1042 struct rb_node **new = &root_stable_tree.rb_node; 1254 int nid;
1255 unsigned long kpfn;
1256 struct rb_root *root;
1257 struct rb_node **new;
1043 struct rb_node *parent = NULL; 1258 struct rb_node *parent = NULL;
1044 struct stable_node *stable_node; 1259 struct stable_node *stable_node;
1045 1260
1261 kpfn = page_to_pfn(kpage);
1262 nid = get_kpfn_nid(kpfn);
1263 root = root_stable_tree + nid;
1264 new = &root->rb_node;
1265
1046 while (*new) { 1266 while (*new) {
1047 struct page *tree_page; 1267 struct page *tree_page;
1048 int ret; 1268 int ret;
1049 1269
1050 cond_resched(); 1270 cond_resched();
1051 stable_node = rb_entry(*new, struct stable_node, node); 1271 stable_node = rb_entry(*new, struct stable_node, node);
1052 tree_page = get_ksm_page(stable_node); 1272 tree_page = get_ksm_page(stable_node, false);
1053 if (!tree_page) 1273 if (!tree_page)
1054 return NULL; 1274 return NULL;
1055 1275
@@ -1075,13 +1295,12 @@ static struct stable_node *stable_tree_insert(struct page *kpage)
1075 if (!stable_node) 1295 if (!stable_node)
1076 return NULL; 1296 return NULL;
1077 1297
1078 rb_link_node(&stable_node->node, parent, new);
1079 rb_insert_color(&stable_node->node, &root_stable_tree);
1080
1081 INIT_HLIST_HEAD(&stable_node->hlist); 1298 INIT_HLIST_HEAD(&stable_node->hlist);
1082 1299 stable_node->kpfn = kpfn;
1083 stable_node->kpfn = page_to_pfn(kpage);
1084 set_page_stable_node(kpage, stable_node); 1300 set_page_stable_node(kpage, stable_node);
1301 DO_NUMA(stable_node->nid = nid);
1302 rb_link_node(&stable_node->node, parent, new);
1303 rb_insert_color(&stable_node->node, root);
1085 1304
1086 return stable_node; 1305 return stable_node;
1087} 1306}
@@ -1104,10 +1323,15 @@ static
1104struct rmap_item *unstable_tree_search_insert(struct rmap_item *rmap_item, 1323struct rmap_item *unstable_tree_search_insert(struct rmap_item *rmap_item,
1105 struct page *page, 1324 struct page *page,
1106 struct page **tree_pagep) 1325 struct page **tree_pagep)
1107
1108{ 1326{
1109 struct rb_node **new = &root_unstable_tree.rb_node; 1327 struct rb_node **new;
1328 struct rb_root *root;
1110 struct rb_node *parent = NULL; 1329 struct rb_node *parent = NULL;
1330 int nid;
1331
1332 nid = get_kpfn_nid(page_to_pfn(page));
1333 root = root_unstable_tree + nid;
1334 new = &root->rb_node;
1111 1335
1112 while (*new) { 1336 while (*new) {
1113 struct rmap_item *tree_rmap_item; 1337 struct rmap_item *tree_rmap_item;
@@ -1137,6 +1361,15 @@ struct rmap_item *unstable_tree_search_insert(struct rmap_item *rmap_item,
1137 } else if (ret > 0) { 1361 } else if (ret > 0) {
1138 put_page(tree_page); 1362 put_page(tree_page);
1139 new = &parent->rb_right; 1363 new = &parent->rb_right;
1364 } else if (!ksm_merge_across_nodes &&
1365 page_to_nid(tree_page) != nid) {
1366 /*
1367 * If tree_page has been migrated to another NUMA node,
1368 * it will be flushed out and put in the right unstable
1369 * tree next time: only merge with it when across_nodes.
1370 */
1371 put_page(tree_page);
1372 return NULL;
1140 } else { 1373 } else {
1141 *tree_pagep = tree_page; 1374 *tree_pagep = tree_page;
1142 return tree_rmap_item; 1375 return tree_rmap_item;
@@ -1145,8 +1378,9 @@ struct rmap_item *unstable_tree_search_insert(struct rmap_item *rmap_item,
1145 1378
1146 rmap_item->address |= UNSTABLE_FLAG; 1379 rmap_item->address |= UNSTABLE_FLAG;
1147 rmap_item->address |= (ksm_scan.seqnr & SEQNR_MASK); 1380 rmap_item->address |= (ksm_scan.seqnr & SEQNR_MASK);
1381 DO_NUMA(rmap_item->nid = nid);
1148 rb_link_node(&rmap_item->node, parent, new); 1382 rb_link_node(&rmap_item->node, parent, new);
1149 rb_insert_color(&rmap_item->node, &root_unstable_tree); 1383 rb_insert_color(&rmap_item->node, root);
1150 1384
1151 ksm_pages_unshared++; 1385 ksm_pages_unshared++;
1152 return NULL; 1386 return NULL;
@@ -1188,10 +1422,29 @@ static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item)
1188 unsigned int checksum; 1422 unsigned int checksum;
1189 int err; 1423 int err;
1190 1424
1191 remove_rmap_item_from_tree(rmap_item); 1425 stable_node = page_stable_node(page);
1426 if (stable_node) {
1427 if (stable_node->head != &migrate_nodes &&
1428 get_kpfn_nid(stable_node->kpfn) != NUMA(stable_node->nid)) {
1429 rb_erase(&stable_node->node,
1430 root_stable_tree + NUMA(stable_node->nid));
1431 stable_node->head = &migrate_nodes;
1432 list_add(&stable_node->list, stable_node->head);
1433 }
1434 if (stable_node->head != &migrate_nodes &&
1435 rmap_item->head == stable_node)
1436 return;
1437 }
1192 1438
1193 /* We first start with searching the page inside the stable tree */ 1439 /* We first start with searching the page inside the stable tree */
1194 kpage = stable_tree_search(page); 1440 kpage = stable_tree_search(page);
1441 if (kpage == page && rmap_item->head == stable_node) {
1442 put_page(kpage);
1443 return;
1444 }
1445
1446 remove_rmap_item_from_tree(rmap_item);
1447
1195 if (kpage) { 1448 if (kpage) {
1196 err = try_to_merge_with_ksm_page(rmap_item, page, kpage); 1449 err = try_to_merge_with_ksm_page(rmap_item, page, kpage);
1197 if (!err) { 1450 if (!err) {
@@ -1225,14 +1478,11 @@ static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item)
1225 kpage = try_to_merge_two_pages(rmap_item, page, 1478 kpage = try_to_merge_two_pages(rmap_item, page,
1226 tree_rmap_item, tree_page); 1479 tree_rmap_item, tree_page);
1227 put_page(tree_page); 1480 put_page(tree_page);
1228 /*
1229 * As soon as we merge this page, we want to remove the
1230 * rmap_item of the page we have merged with from the unstable
1231 * tree, and insert it instead as new node in the stable tree.
1232 */
1233 if (kpage) { 1481 if (kpage) {
1234 remove_rmap_item_from_tree(tree_rmap_item); 1482 /*
1235 1483 * The pages were successfully merged: insert new
1484 * node in the stable tree and add both rmap_items.
1485 */
1236 lock_page(kpage); 1486 lock_page(kpage);
1237 stable_node = stable_tree_insert(kpage); 1487 stable_node = stable_tree_insert(kpage);
1238 if (stable_node) { 1488 if (stable_node) {
@@ -1289,6 +1539,7 @@ static struct rmap_item *scan_get_next_rmap_item(struct page **page)
1289 struct mm_slot *slot; 1539 struct mm_slot *slot;
1290 struct vm_area_struct *vma; 1540 struct vm_area_struct *vma;
1291 struct rmap_item *rmap_item; 1541 struct rmap_item *rmap_item;
1542 int nid;
1292 1543
1293 if (list_empty(&ksm_mm_head.mm_list)) 1544 if (list_empty(&ksm_mm_head.mm_list))
1294 return NULL; 1545 return NULL;
@@ -1307,7 +1558,29 @@ static struct rmap_item *scan_get_next_rmap_item(struct page **page)
1307 */ 1558 */
1308 lru_add_drain_all(); 1559 lru_add_drain_all();
1309 1560
1310 root_unstable_tree = RB_ROOT; 1561 /*
1562 * Whereas stale stable_nodes on the stable_tree itself
1563 * get pruned in the regular course of stable_tree_search(),
1564 * those moved out to the migrate_nodes list can accumulate:
1565 * so prune them once before each full scan.
1566 */
1567 if (!ksm_merge_across_nodes) {
1568 struct stable_node *stable_node;
1569 struct list_head *this, *next;
1570 struct page *page;
1571
1572 list_for_each_safe(this, next, &migrate_nodes) {
1573 stable_node = list_entry(this,
1574 struct stable_node, list);
1575 page = get_ksm_page(stable_node, false);
1576 if (page)
1577 put_page(page);
1578 cond_resched();
1579 }
1580 }
1581
1582 for (nid = 0; nid < ksm_nr_node_ids; nid++)
1583 root_unstable_tree[nid] = RB_ROOT;
1311 1584
1312 spin_lock(&ksm_mmlist_lock); 1585 spin_lock(&ksm_mmlist_lock);
1313 slot = list_entry(slot->mm_list.next, struct mm_slot, mm_list); 1586 slot = list_entry(slot->mm_list.next, struct mm_slot, mm_list);
@@ -1392,7 +1665,7 @@ next_mm:
1392 * or when all VM_MERGEABLE areas have been unmapped (and 1665 * or when all VM_MERGEABLE areas have been unmapped (and
1393 * mmap_sem then protects against race with MADV_MERGEABLE). 1666 * mmap_sem then protects against race with MADV_MERGEABLE).
1394 */ 1667 */
1395 hlist_del(&slot->link); 1668 hash_del(&slot->link);
1396 list_del(&slot->mm_list); 1669 list_del(&slot->mm_list);
1397 spin_unlock(&ksm_mmlist_lock); 1670 spin_unlock(&ksm_mmlist_lock);
1398 1671
@@ -1428,8 +1701,7 @@ static void ksm_do_scan(unsigned int scan_npages)
1428 rmap_item = scan_get_next_rmap_item(&page); 1701 rmap_item = scan_get_next_rmap_item(&page);
1429 if (!rmap_item) 1702 if (!rmap_item)
1430 return; 1703 return;
1431 if (!PageKsm(page) || !in_stable_tree(rmap_item)) 1704 cmp_and_merge_page(page, rmap_item);
1432 cmp_and_merge_page(page, rmap_item);
1433 put_page(page); 1705 put_page(page);
1434 } 1706 }
1435} 1707}
@@ -1446,6 +1718,7 @@ static int ksm_scan_thread(void *nothing)
1446 1718
1447 while (!kthread_should_stop()) { 1719 while (!kthread_should_stop()) {
1448 mutex_lock(&ksm_thread_mutex); 1720 mutex_lock(&ksm_thread_mutex);
1721 wait_while_offlining();
1449 if (ksmd_should_run()) 1722 if (ksmd_should_run())
1450 ksm_do_scan(ksm_thread_pages_to_scan); 1723 ksm_do_scan(ksm_thread_pages_to_scan);
1451 mutex_unlock(&ksm_thread_mutex); 1724 mutex_unlock(&ksm_thread_mutex);
@@ -1525,11 +1798,19 @@ int __ksm_enter(struct mm_struct *mm)
1525 spin_lock(&ksm_mmlist_lock); 1798 spin_lock(&ksm_mmlist_lock);
1526 insert_to_mm_slots_hash(mm, mm_slot); 1799 insert_to_mm_slots_hash(mm, mm_slot);
1527 /* 1800 /*
1528 * Insert just behind the scanning cursor, to let the area settle 1801 * When KSM_RUN_MERGE (or KSM_RUN_STOP),
1802 * insert just behind the scanning cursor, to let the area settle
1529 * down a little; when fork is followed by immediate exec, we don't 1803 * down a little; when fork is followed by immediate exec, we don't
1530 * want ksmd to waste time setting up and tearing down an rmap_list. 1804 * want ksmd to waste time setting up and tearing down an rmap_list.
1805 *
1806 * But when KSM_RUN_UNMERGE, it's important to insert ahead of its
1807 * scanning cursor, otherwise KSM pages in newly forked mms will be
1808 * missed: then we might as well insert at the end of the list.
1531 */ 1809 */
1532 list_add_tail(&mm_slot->mm_list, &ksm_scan.mm_slot->mm_list); 1810 if (ksm_run & KSM_RUN_UNMERGE)
1811 list_add_tail(&mm_slot->mm_list, &ksm_mm_head.mm_list);
1812 else
1813 list_add_tail(&mm_slot->mm_list, &ksm_scan.mm_slot->mm_list);
1533 spin_unlock(&ksm_mmlist_lock); 1814 spin_unlock(&ksm_mmlist_lock);
1534 1815
1535 set_bit(MMF_VM_MERGEABLE, &mm->flags); 1816 set_bit(MMF_VM_MERGEABLE, &mm->flags);
@@ -1559,7 +1840,7 @@ void __ksm_exit(struct mm_struct *mm)
1559 mm_slot = get_mm_slot(mm); 1840 mm_slot = get_mm_slot(mm);
1560 if (mm_slot && ksm_scan.mm_slot != mm_slot) { 1841 if (mm_slot && ksm_scan.mm_slot != mm_slot) {
1561 if (!mm_slot->rmap_list) { 1842 if (!mm_slot->rmap_list) {
1562 hlist_del(&mm_slot->link); 1843 hash_del(&mm_slot->link);
1563 list_del(&mm_slot->mm_list); 1844 list_del(&mm_slot->mm_list);
1564 easy_to_free = 1; 1845 easy_to_free = 1;
1565 } else { 1846 } else {
@@ -1579,24 +1860,32 @@ void __ksm_exit(struct mm_struct *mm)
1579 } 1860 }
1580} 1861}
1581 1862
1582struct page *ksm_does_need_to_copy(struct page *page, 1863struct page *ksm_might_need_to_copy(struct page *page,
1583 struct vm_area_struct *vma, unsigned long address) 1864 struct vm_area_struct *vma, unsigned long address)
1584{ 1865{
1866 struct anon_vma *anon_vma = page_anon_vma(page);
1585 struct page *new_page; 1867 struct page *new_page;
1586 1868
1869 if (PageKsm(page)) {
1870 if (page_stable_node(page) &&
1871 !(ksm_run & KSM_RUN_UNMERGE))
1872 return page; /* no need to copy it */
1873 } else if (!anon_vma) {
1874 return page; /* no need to copy it */
1875 } else if (anon_vma->root == vma->anon_vma->root &&
1876 page->index == linear_page_index(vma, address)) {
1877 return page; /* still no need to copy it */
1878 }
1879 if (!PageUptodate(page))
1880 return page; /* let do_swap_page report the error */
1881
1587 new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); 1882 new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
1588 if (new_page) { 1883 if (new_page) {
1589 copy_user_highpage(new_page, page, address, vma); 1884 copy_user_highpage(new_page, page, address, vma);
1590 1885
1591 SetPageDirty(new_page); 1886 SetPageDirty(new_page);
1592 __SetPageUptodate(new_page); 1887 __SetPageUptodate(new_page);
1593 SetPageSwapBacked(new_page);
1594 __set_page_locked(new_page); 1888 __set_page_locked(new_page);
1595
1596 if (!mlocked_vma_newpage(vma, new_page))
1597 lru_cache_add_lru(new_page, LRU_ACTIVE_ANON);
1598 else
1599 add_page_to_unevictable_list(new_page);
1600 } 1889 }
1601 1890
1602 return new_page; 1891 return new_page;
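With the rename from ksm_does_need_to_copy() to ksm_might_need_to_copy(), the decision about whether a private copy is required moves into the helper: it returns the original page when no copy is needed, a freshly allocated copy when one is, or NULL if that allocation fails. A hedged sketch of the calling contract (prepare_swapped_page is an illustrative wrapper, not the actual mm/memory.c hunk from this series):

	static struct page *prepare_swapped_page(struct page *page,
						 struct vm_area_struct *vma,
						 unsigned long address)
	{
		struct page *use = ksm_might_need_to_copy(page, vma, address);

		if (!use)		/* a copy was needed but allocation failed */
			return NULL;	/* caller treats this as VM_FAULT_OOM */
		if (use != page) {
			/* a fresh anon copy: map this one instead of the original */
		}
		return use;
	}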
@@ -1607,7 +1896,6 @@ int page_referenced_ksm(struct page *page, struct mem_cgroup *memcg,
1607{ 1896{
1608 struct stable_node *stable_node; 1897 struct stable_node *stable_node;
1609 struct rmap_item *rmap_item; 1898 struct rmap_item *rmap_item;
1610 struct hlist_node *hlist;
1611 unsigned int mapcount = page_mapcount(page); 1899 unsigned int mapcount = page_mapcount(page);
1612 int referenced = 0; 1900 int referenced = 0;
1613 int search_new_forks = 0; 1901 int search_new_forks = 0;
@@ -1619,7 +1907,7 @@ int page_referenced_ksm(struct page *page, struct mem_cgroup *memcg,
1619 if (!stable_node) 1907 if (!stable_node)
1620 return 0; 1908 return 0;
1621again: 1909again:
1622 hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) { 1910 hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) {
1623 struct anon_vma *anon_vma = rmap_item->anon_vma; 1911 struct anon_vma *anon_vma = rmap_item->anon_vma;
1624 struct anon_vma_chain *vmac; 1912 struct anon_vma_chain *vmac;
1625 struct vm_area_struct *vma; 1913 struct vm_area_struct *vma;
@@ -1661,7 +1949,6 @@ out:
1661int try_to_unmap_ksm(struct page *page, enum ttu_flags flags) 1949int try_to_unmap_ksm(struct page *page, enum ttu_flags flags)
1662{ 1950{
1663 struct stable_node *stable_node; 1951 struct stable_node *stable_node;
1664 struct hlist_node *hlist;
1665 struct rmap_item *rmap_item; 1952 struct rmap_item *rmap_item;
1666 int ret = SWAP_AGAIN; 1953 int ret = SWAP_AGAIN;
1667 int search_new_forks = 0; 1954 int search_new_forks = 0;
@@ -1673,7 +1960,7 @@ int try_to_unmap_ksm(struct page *page, enum ttu_flags flags)
1673 if (!stable_node) 1960 if (!stable_node)
1674 return SWAP_FAIL; 1961 return SWAP_FAIL;
1675again: 1962again:
1676 hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) { 1963 hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) {
1677 struct anon_vma *anon_vma = rmap_item->anon_vma; 1964 struct anon_vma *anon_vma = rmap_item->anon_vma;
1678 struct anon_vma_chain *vmac; 1965 struct anon_vma_chain *vmac;
1679 struct vm_area_struct *vma; 1966 struct vm_area_struct *vma;
@@ -1714,7 +2001,6 @@ int rmap_walk_ksm(struct page *page, int (*rmap_one)(struct page *,
1714 struct vm_area_struct *, unsigned long, void *), void *arg) 2001 struct vm_area_struct *, unsigned long, void *), void *arg)
1715{ 2002{
1716 struct stable_node *stable_node; 2003 struct stable_node *stable_node;
1717 struct hlist_node *hlist;
1718 struct rmap_item *rmap_item; 2004 struct rmap_item *rmap_item;
1719 int ret = SWAP_AGAIN; 2005 int ret = SWAP_AGAIN;
1720 int search_new_forks = 0; 2006 int search_new_forks = 0;
@@ -1726,7 +2012,7 @@ int rmap_walk_ksm(struct page *page, int (*rmap_one)(struct page *,
1726 if (!stable_node) 2012 if (!stable_node)
1727 return ret; 2013 return ret;
1728again: 2014again:
1729 hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) { 2015 hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) {
1730 struct anon_vma *anon_vma = rmap_item->anon_vma; 2016 struct anon_vma *anon_vma = rmap_item->anon_vma;
1731 struct anon_vma_chain *vmac; 2017 struct anon_vma_chain *vmac;
1732 struct vm_area_struct *vma; 2018 struct vm_area_struct *vma;
@@ -1773,64 +2059,115 @@ void ksm_migrate_page(struct page *newpage, struct page *oldpage)
1773 if (stable_node) { 2059 if (stable_node) {
1774 VM_BUG_ON(stable_node->kpfn != page_to_pfn(oldpage)); 2060 VM_BUG_ON(stable_node->kpfn != page_to_pfn(oldpage));
1775 stable_node->kpfn = page_to_pfn(newpage); 2061 stable_node->kpfn = page_to_pfn(newpage);
2062 /*
2063 * newpage->mapping was set in advance; now we need smp_wmb()
2064 * to make sure that the new stable_node->kpfn is visible
2065 * to get_ksm_page() before it can see that oldpage->mapping
2066 * has gone stale (or that PageSwapCache has been cleared).
2067 */
2068 smp_wmb();
2069 set_page_stable_node(oldpage, NULL);
1776 } 2070 }
1777} 2071}
1778#endif /* CONFIG_MIGRATION */ 2072#endif /* CONFIG_MIGRATION */
1779 2073
1780#ifdef CONFIG_MEMORY_HOTREMOVE 2074#ifdef CONFIG_MEMORY_HOTREMOVE
1781static struct stable_node *ksm_check_stable_tree(unsigned long start_pfn, 2075static int just_wait(void *word)
1782 unsigned long end_pfn)
1783{ 2076{
1784 struct rb_node *node; 2077 schedule();
2078 return 0;
2079}
1785 2080
1786 for (node = rb_first(&root_stable_tree); node; node = rb_next(node)) { 2081static void wait_while_offlining(void)
1787 struct stable_node *stable_node; 2082{
2083 while (ksm_run & KSM_RUN_OFFLINE) {
2084 mutex_unlock(&ksm_thread_mutex);
2085 wait_on_bit(&ksm_run, ilog2(KSM_RUN_OFFLINE),
2086 just_wait, TASK_UNINTERRUPTIBLE);
2087 mutex_lock(&ksm_thread_mutex);
2088 }
2089}
1788 2090
1789 stable_node = rb_entry(node, struct stable_node, node); 2091static void ksm_check_stable_tree(unsigned long start_pfn,
2092 unsigned long end_pfn)
2093{
2094 struct stable_node *stable_node;
2095 struct list_head *this, *next;
2096 struct rb_node *node;
2097 int nid;
2098
2099 for (nid = 0; nid < ksm_nr_node_ids; nid++) {
2100 node = rb_first(root_stable_tree + nid);
2101 while (node) {
2102 stable_node = rb_entry(node, struct stable_node, node);
2103 if (stable_node->kpfn >= start_pfn &&
2104 stable_node->kpfn < end_pfn) {
2105 /*
2106 * Don't get_ksm_page, page has already gone:
2107 * which is why we keep kpfn instead of page*
2108 */
2109 remove_node_from_stable_tree(stable_node);
2110 node = rb_first(root_stable_tree + nid);
2111 } else
2112 node = rb_next(node);
2113 cond_resched();
2114 }
2115 }
2116 list_for_each_safe(this, next, &migrate_nodes) {
2117 stable_node = list_entry(this, struct stable_node, list);
1790 if (stable_node->kpfn >= start_pfn && 2118 if (stable_node->kpfn >= start_pfn &&
1791 stable_node->kpfn < end_pfn) 2119 stable_node->kpfn < end_pfn)
1792 return stable_node; 2120 remove_node_from_stable_tree(stable_node);
2121 cond_resched();
1793 } 2122 }
1794 return NULL;
1795} 2123}
1796 2124
1797static int ksm_memory_callback(struct notifier_block *self, 2125static int ksm_memory_callback(struct notifier_block *self,
1798 unsigned long action, void *arg) 2126 unsigned long action, void *arg)
1799{ 2127{
1800 struct memory_notify *mn = arg; 2128 struct memory_notify *mn = arg;
1801 struct stable_node *stable_node;
1802 2129
1803 switch (action) { 2130 switch (action) {
1804 case MEM_GOING_OFFLINE: 2131 case MEM_GOING_OFFLINE:
1805 /* 2132 /*
1806 * Keep it very simple for now: just lock out ksmd and 2133 * Prevent ksm_do_scan(), unmerge_and_remove_all_rmap_items()
1807 * MADV_UNMERGEABLE while any memory is going offline. 2134 * and remove_all_stable_nodes() while memory is going offline:
1808 * mutex_lock_nested() is necessary because lockdep was alarmed 2135 * it is unsafe for them to touch the stable tree at this time.
1809 * that here we take ksm_thread_mutex inside notifier chain 2136 * But unmerge_ksm_pages(), rmap lookups and other entry points
1810 * mutex, and later take notifier chain mutex inside 2137 * which do not need the ksm_thread_mutex are all safe.
1811 * ksm_thread_mutex to unlock it. But that's safe because both
1812 * are inside mem_hotplug_mutex.
1813 */ 2138 */
1814 mutex_lock_nested(&ksm_thread_mutex, SINGLE_DEPTH_NESTING); 2139 mutex_lock(&ksm_thread_mutex);
2140 ksm_run |= KSM_RUN_OFFLINE;
2141 mutex_unlock(&ksm_thread_mutex);
1815 break; 2142 break;
1816 2143
1817 case MEM_OFFLINE: 2144 case MEM_OFFLINE:
1818 /* 2145 /*
1819 * Most of the work is done by page migration; but there might 2146 * Most of the work is done by page migration; but there might
1820 * be a few stable_nodes left over, still pointing to struct 2147 * be a few stable_nodes left over, still pointing to struct
1821 * pages which have been offlined: prune those from the tree. 2148 * pages which have been offlined: prune those from the tree,
2149 * otherwise get_ksm_page() might later try to access a
2150 * non-existent struct page.
1822 */ 2151 */
1823 while ((stable_node = ksm_check_stable_tree(mn->start_pfn, 2152 ksm_check_stable_tree(mn->start_pfn,
1824 mn->start_pfn + mn->nr_pages)) != NULL) 2153 mn->start_pfn + mn->nr_pages);
1825 remove_node_from_stable_tree(stable_node);
1826 /* fallthrough */ 2154 /* fallthrough */
1827 2155
1828 case MEM_CANCEL_OFFLINE: 2156 case MEM_CANCEL_OFFLINE:
2157 mutex_lock(&ksm_thread_mutex);
2158 ksm_run &= ~KSM_RUN_OFFLINE;
1829 mutex_unlock(&ksm_thread_mutex); 2159 mutex_unlock(&ksm_thread_mutex);
2160
2161 smp_mb(); /* wake_up_bit advises this */
2162 wake_up_bit(&ksm_run, ilog2(KSM_RUN_OFFLINE));
1830 break; 2163 break;
1831 } 2164 }
1832 return NOTIFY_OK; 2165 return NOTIFY_OK;
1833} 2166}
2167#else
2168static void wait_while_offlining(void)
2169{
2170}
1834#endif /* CONFIG_MEMORY_HOTREMOVE */ 2171#endif /* CONFIG_MEMORY_HOTREMOVE */
1835 2172
1836#ifdef CONFIG_SYSFS 2173#ifdef CONFIG_SYSFS
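The hotremove rework above drops the mutex_lock_nested() trick in favour of a flag bit: MEM_GOING_OFFLINE sets KSM_RUN_OFFLINE in ksm_run under ksm_thread_mutex, and ksmd plus the sysfs store handlers park in wait_while_offlining() until MEM_OFFLINE or MEM_CANCEL_OFFLINE clears the bit and wakes the bit-waiters. Reduced to its bones the gate looks like this; a sketch only, using the 3.x wait_on_bit() form with an action callback, and with flags, FLAG_OFFLINE, wait_while_gated and open_gate as illustrative stand-ins:

	#include <linux/log2.h>
	#include <linux/mutex.h>
	#include <linux/sched.h>
	#include <linux/wait.h>

	static unsigned long flags;		/* stand-in for ksm_run */
	#define FLAG_OFFLINE	4		/* stand-in for KSM_RUN_OFFLINE */

	static int just_sleep(void *word)
	{
		schedule();
		return 0;
	}

	/* waiter side: drop the mutex while the bit is set */
	static void wait_while_gated(struct mutex *lock)
	{
		while (flags & FLAG_OFFLINE) {
			mutex_unlock(lock);
			wait_on_bit(&flags, ilog2(FLAG_OFFLINE),
				    just_sleep, TASK_UNINTERRUPTIBLE);
			mutex_lock(lock);
		}
	}

	/* release side: clear the bit, then wake the bit-waiters */
	static void open_gate(struct mutex *lock)
	{
		mutex_lock(lock);
		flags &= ~FLAG_OFFLINE;
		mutex_unlock(lock);

		smp_mb();			/* as wake_up_bit() advises */
		wake_up_bit(&flags, ilog2(FLAG_OFFLINE));
	}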
@@ -1893,7 +2230,7 @@ KSM_ATTR(pages_to_scan);
1893static ssize_t run_show(struct kobject *kobj, struct kobj_attribute *attr, 2230static ssize_t run_show(struct kobject *kobj, struct kobj_attribute *attr,
1894 char *buf) 2231 char *buf)
1895{ 2232{
1896 return sprintf(buf, "%u\n", ksm_run); 2233 return sprintf(buf, "%lu\n", ksm_run);
1897} 2234}
1898 2235
1899static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr, 2236static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr,
@@ -1916,6 +2253,7 @@ static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr,
1916 */ 2253 */
1917 2254
1918 mutex_lock(&ksm_thread_mutex); 2255 mutex_lock(&ksm_thread_mutex);
2256 wait_while_offlining();
1919 if (ksm_run != flags) { 2257 if (ksm_run != flags) {
1920 ksm_run = flags; 2258 ksm_run = flags;
1921 if (flags & KSM_RUN_UNMERGE) { 2259 if (flags & KSM_RUN_UNMERGE) {
@@ -1937,6 +2275,64 @@ static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr,
1937} 2275}
1938KSM_ATTR(run); 2276KSM_ATTR(run);
1939 2277
2278#ifdef CONFIG_NUMA
2279static ssize_t merge_across_nodes_show(struct kobject *kobj,
2280 struct kobj_attribute *attr, char *buf)
2281{
2282 return sprintf(buf, "%u\n", ksm_merge_across_nodes);
2283}
2284
2285static ssize_t merge_across_nodes_store(struct kobject *kobj,
2286 struct kobj_attribute *attr,
2287 const char *buf, size_t count)
2288{
2289 int err;
2290 unsigned long knob;
2291
2292 err = kstrtoul(buf, 10, &knob);
2293 if (err)
2294 return err;
2295 if (knob > 1)
2296 return -EINVAL;
2297
2298 mutex_lock(&ksm_thread_mutex);
2299 wait_while_offlining();
2300 if (ksm_merge_across_nodes != knob) {
2301 if (ksm_pages_shared || remove_all_stable_nodes())
2302 err = -EBUSY;
2303 else if (root_stable_tree == one_stable_tree) {
2304 struct rb_root *buf;
2305 /*
2306 * This is the first time that we switch away from the
2307 * default of merging across nodes: must now allocate
2308 * a buffer to hold as many roots as may be needed.
2309 * Allocate stable and unstable together:
2310 * MAXSMP NODES_SHIFT 10 will use 16kB.
2311 */
2312 buf = kcalloc(nr_node_ids + nr_node_ids,
2313 sizeof(*buf), GFP_KERNEL | __GFP_ZERO);
2314 /* Assume that a zero-filled rb_root is equivalent to RB_ROOT */
2315 if (!buf)
2316 err = -ENOMEM;
2317 else {
2318 root_stable_tree = buf;
2319 root_unstable_tree = buf + nr_node_ids;
2320 /* Stable tree is empty but not the unstable */
2321 root_unstable_tree[0] = one_unstable_tree[0];
2322 }
2323 }
2324 if (!err) {
2325 ksm_merge_across_nodes = knob;
2326 ksm_nr_node_ids = knob ? 1 : nr_node_ids;
2327 }
2328 }
2329 mutex_unlock(&ksm_thread_mutex);
2330
2331 return err ? err : count;
2332}
2333KSM_ATTR(merge_across_nodes);
2334#endif
2335
1940static ssize_t pages_shared_show(struct kobject *kobj, 2336static ssize_t pages_shared_show(struct kobject *kobj,
1941 struct kobj_attribute *attr, char *buf) 2337 struct kobj_attribute *attr, char *buf)
1942{ 2338{
@@ -1991,6 +2387,9 @@ static struct attribute *ksm_attrs[] = {
1991 &pages_unshared_attr.attr, 2387 &pages_unshared_attr.attr,
1992 &pages_volatile_attr.attr, 2388 &pages_volatile_attr.attr,
1993 &full_scans_attr.attr, 2389 &full_scans_attr.attr,
2390#ifdef CONFIG_NUMA
2391 &merge_across_nodes_attr.attr,
2392#endif
1994 NULL, 2393 NULL,
1995}; 2394};
1996 2395
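From userspace the new knob appears alongside the existing KSM controls in /sys/kernel/mm/ksm/. A small example of driving it; this assumes root, a kernel with CONFIG_KSM and CONFIG_NUMA, and that the stable tree can actually be emptied first, since the store above refuses with -EBUSY while pages_shared is non-zero (write_knob is an illustrative helper):

	#include <fcntl.h>
	#include <stdio.h>
	#include <string.h>
	#include <unistd.h>

	static int write_knob(const char *path, const char *val)
	{
		int fd = open(path, O_WRONLY);

		if (fd < 0 || write(fd, val, strlen(val)) < 0) {
			perror(path);	/* e.g. EBUSY while pages are still shared */
			if (fd >= 0)
				close(fd);
			return -1;
		}
		close(fd);
		return 0;
	}

	int main(void)
	{
		/* unmerge everything first: the knob only flips on an empty stable tree */
		write_knob("/sys/kernel/mm/ksm/run", "2");
		write_knob("/sys/kernel/mm/ksm/merge_across_nodes", "0");
		write_knob("/sys/kernel/mm/ksm/run", "1");	/* resume merging per-node */
		return 0;
	}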
@@ -2029,10 +2428,7 @@ static int __init ksm_init(void)
2029#endif /* CONFIG_SYSFS */ 2428#endif /* CONFIG_SYSFS */
2030 2429
2031#ifdef CONFIG_MEMORY_HOTREMOVE 2430#ifdef CONFIG_MEMORY_HOTREMOVE
2032 /* 2431 /* There is no significance to this priority 100 */
2033 * Choose a high priority since the callback takes ksm_thread_mutex:
2034 * later callbacks could only be taking locks which nest within that.
2035 */
2036 hotplug_memory_notifier(ksm_memory_callback, 100); 2432 hotplug_memory_notifier(ksm_memory_callback, 100);
2037#endif 2433#endif
2038 return 0; 2434 return 0;
diff --git a/mm/madvise.c b/mm/madvise.c
index 03dfa5c7adb3..c58c94b56c3d 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -16,6 +16,9 @@
16#include <linux/ksm.h> 16#include <linux/ksm.h>
17#include <linux/fs.h> 17#include <linux/fs.h>
18#include <linux/file.h> 18#include <linux/file.h>
19#include <linux/blkdev.h>
20#include <linux/swap.h>
21#include <linux/swapops.h>
19 22
20/* 23/*
21 * Any behaviour which results in changes to the vma->vm_flags needs to 24 * Any behaviour which results in changes to the vma->vm_flags needs to
@@ -131,6 +134,84 @@ out:
131 return error; 134 return error;
132} 135}
133 136
137#ifdef CONFIG_SWAP
138static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start,
139 unsigned long end, struct mm_walk *walk)
140{
141 pte_t *orig_pte;
142 struct vm_area_struct *vma = walk->private;
143 unsigned long index;
144
145 if (pmd_none_or_trans_huge_or_clear_bad(pmd))
146 return 0;
147
148 for (index = start; index != end; index += PAGE_SIZE) {
149 pte_t pte;
150 swp_entry_t entry;
151 struct page *page;
152 spinlock_t *ptl;
153
154 orig_pte = pte_offset_map_lock(vma->vm_mm, pmd, start, &ptl);
155 pte = *(orig_pte + ((index - start) / PAGE_SIZE));
156 pte_unmap_unlock(orig_pte, ptl);
157
158 if (pte_present(pte) || pte_none(pte) || pte_file(pte))
159 continue;
160 entry = pte_to_swp_entry(pte);
161 if (unlikely(non_swap_entry(entry)))
162 continue;
163
164 page = read_swap_cache_async(entry, GFP_HIGHUSER_MOVABLE,
165 vma, index);
166 if (page)
167 page_cache_release(page);
168 }
169
170 return 0;
171}
172
173static void force_swapin_readahead(struct vm_area_struct *vma,
174 unsigned long start, unsigned long end)
175{
176 struct mm_walk walk = {
177 .mm = vma->vm_mm,
178 .pmd_entry = swapin_walk_pmd_entry,
179 .private = vma,
180 };
181
182 walk_page_range(start, end, &walk);
183
184 lru_add_drain(); /* Push any new pages onto the LRU now */
185}
186
187static void force_shm_swapin_readahead(struct vm_area_struct *vma,
188 unsigned long start, unsigned long end,
189 struct address_space *mapping)
190{
191 pgoff_t index;
192 struct page *page;
193 swp_entry_t swap;
194
195 for (; start < end; start += PAGE_SIZE) {
196 index = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
197
198 page = find_get_page(mapping, index);
199 if (!radix_tree_exceptional_entry(page)) {
200 if (page)
201 page_cache_release(page);
202 continue;
203 }
204 swap = radix_to_swp_entry(page);
205 page = read_swap_cache_async(swap, GFP_HIGHUSER_MOVABLE,
206 NULL, 0);
207 if (page)
208 page_cache_release(page);
209 }
210
211 lru_add_drain(); /* Push any new pages onto the LRU now */
212}
213#endif /* CONFIG_SWAP */
214
134/* 215/*
135 * Schedule all required I/O operations. Do not wait for completion. 216 * Schedule all required I/O operations. Do not wait for completion.
136 */ 217 */
@@ -140,6 +221,18 @@ static long madvise_willneed(struct vm_area_struct * vma,
140{ 221{
141 struct file *file = vma->vm_file; 222 struct file *file = vma->vm_file;
142 223
224#ifdef CONFIG_SWAP
225 if (!file || mapping_cap_swap_backed(file->f_mapping)) {
226 *prev = vma;
227 if (!file)
228 force_swapin_readahead(vma, start, end);
229 else
230 force_shm_swapin_readahead(vma, start, end,
231 file->f_mapping);
232 return 0;
233 }
234#endif
235
143 if (!file) 236 if (!file)
144 return -EBADF; 237 return -EBADF;
145 238
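With the hunk above, MADV_WILLNEED on an anonymous or tmpfs/shmem mapping no longer fails with -EBADF: it walks the range and starts asynchronous swap-in for any entries that are out on swap, so later faults are minor rather than major. A minimal userspace illustration of the intended use, assuming part of the buffer has been pushed out to swap by memory pressure:

	#include <stdio.h>
	#include <stdlib.h>
	#include <string.h>
	#include <sys/mman.h>

	#define LEN	(64UL << 20)		/* 64 MB of anonymous memory */

	int main(void)
	{
		char *buf = mmap(NULL, LEN, PROT_READ | PROT_WRITE,
				 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
		if (buf == MAP_FAILED)
			return 1;
		memset(buf, 0x5a, LEN);		/* populate; may later be swapped out */

		/*
		 * Hint that the whole range is about to be used again: on this
		 * kernel the call queues swap readahead instead of returning
		 * EBADF for an anonymous vma.
		 */
		if (madvise(buf, LEN, MADV_WILLNEED))
			perror("madvise");

		for (size_t i = 0; i < LEN; i += 4096)
			(void)buf[i];		/* touch; ideally already resident */
		return 0;
	}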
@@ -371,6 +464,7 @@ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
371 int error = -EINVAL; 464 int error = -EINVAL;
372 int write; 465 int write;
373 size_t len; 466 size_t len;
467 struct blk_plug plug;
374 468
375#ifdef CONFIG_MEMORY_FAILURE 469#ifdef CONFIG_MEMORY_FAILURE
376 if (behavior == MADV_HWPOISON || behavior == MADV_SOFT_OFFLINE) 470 if (behavior == MADV_HWPOISON || behavior == MADV_SOFT_OFFLINE)
@@ -410,18 +504,19 @@ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
410 if (vma && start > vma->vm_start) 504 if (vma && start > vma->vm_start)
411 prev = vma; 505 prev = vma;
412 506
507 blk_start_plug(&plug);
413 for (;;) { 508 for (;;) {
414 /* Still start < end. */ 509 /* Still start < end. */
415 error = -ENOMEM; 510 error = -ENOMEM;
416 if (!vma) 511 if (!vma)
417 goto out; 512 goto out_plug;
418 513
419 /* Here start < (end|vma->vm_end). */ 514 /* Here start < (end|vma->vm_end). */
420 if (start < vma->vm_start) { 515 if (start < vma->vm_start) {
421 unmapped_error = -ENOMEM; 516 unmapped_error = -ENOMEM;
422 start = vma->vm_start; 517 start = vma->vm_start;
423 if (start >= end) 518 if (start >= end)
424 goto out; 519 goto out_plug;
425 } 520 }
426 521
427 /* Here vma->vm_start <= start < (end|vma->vm_end) */ 522 /* Here vma->vm_start <= start < (end|vma->vm_end) */
@@ -432,18 +527,20 @@ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
432 /* Here vma->vm_start <= start < tmp <= (end|vma->vm_end). */ 527 /* Here vma->vm_start <= start < tmp <= (end|vma->vm_end). */
433 error = madvise_vma(vma, &prev, start, tmp, behavior); 528 error = madvise_vma(vma, &prev, start, tmp, behavior);
434 if (error) 529 if (error)
435 goto out; 530 goto out_plug;
436 start = tmp; 531 start = tmp;
437 if (prev && start < prev->vm_end) 532 if (prev && start < prev->vm_end)
438 start = prev->vm_end; 533 start = prev->vm_end;
439 error = unmapped_error; 534 error = unmapped_error;
440 if (start >= end) 535 if (start >= end)
441 goto out; 536 goto out_plug;
442 if (prev) 537 if (prev)
443 vma = prev->vm_next; 538 vma = prev->vm_next;
444 else /* madvise_remove dropped mmap_sem */ 539 else /* madvise_remove dropped mmap_sem */
445 vma = find_vma(current->mm, start); 540 vma = find_vma(current->mm, start);
446 } 541 }
542out_plug:
543 blk_finish_plug(&plug);
447out: 544out:
448 if (write) 545 if (write)
449 up_write(&current->mm->mmap_sem); 546 up_write(&current->mm->mmap_sem);
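
The madvise_willneed() hunk above extends MADV_WILLNEED to anonymous and shmem-backed mappings, so swapped-out pages are read back asynchronously, and the whole madvise() walk now runs under one block plug so the readahead I/O is submitted in a batch. A minimal userspace sketch of the caller side, using nothing beyond the standard madvise(2) interface:

#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

int main(void)
{
	size_t len = 64 << 20;	/* 64 MiB of anonymous memory */
	char *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (buf == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
	memset(buf, 0x5a, len);	/* touch it so it can later be swapped out */

	/* ... later, after memory pressure may have pushed it to swap ... */

	/* Hint that the range is needed soon: with the change above, the
	 * swap-backed pages are read back asynchronously instead of being
	 * faulted in one at a time. */
	if (madvise(buf, len, MADV_WILLNEED))
		perror("madvise");

	return 0;
}
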
diff --git a/mm/memblock.c b/mm/memblock.c
index 625905523c2a..1bcd9b970564 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -92,9 +92,58 @@ static long __init_memblock memblock_overlaps_region(struct memblock_type *type,
92 * 92 *
93 * Find @size free area aligned to @align in the specified range and node. 93 * Find @size free area aligned to @align in the specified range and node.
94 * 94 *
 95 * If we have CONFIG_HAVE_MEMBLOCK_NODE_MAP defined, we need to check that
 96 * the memory we found is not in hotpluggable ranges.
97 *
95 * RETURNS: 98 * RETURNS:
96 * Found address on success, %0 on failure. 99 * Found address on success, %0 on failure.
97 */ 100 */
101#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
102phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t start,
103 phys_addr_t end, phys_addr_t size,
104 phys_addr_t align, int nid)
105{
106 phys_addr_t this_start, this_end, cand;
107 u64 i;
108 int curr = movablemem_map.nr_map - 1;
109
110 /* pump up @end */
111 if (end == MEMBLOCK_ALLOC_ACCESSIBLE)
112 end = memblock.current_limit;
113
114 /* avoid allocating the first page */
115 start = max_t(phys_addr_t, start, PAGE_SIZE);
116 end = max(start, end);
117
118 for_each_free_mem_range_reverse(i, nid, &this_start, &this_end, NULL) {
119 this_start = clamp(this_start, start, end);
120 this_end = clamp(this_end, start, end);
121
122restart:
123 if (this_end <= this_start || this_end < size)
124 continue;
125
126 for (; curr >= 0; curr--) {
127 if ((movablemem_map.map[curr].start_pfn << PAGE_SHIFT)
128 < this_end)
129 break;
130 }
131
132 cand = round_down(this_end - size, align);
133 if (curr >= 0 &&
134 cand < movablemem_map.map[curr].end_pfn << PAGE_SHIFT) {
135 this_end = movablemem_map.map[curr].start_pfn
136 << PAGE_SHIFT;
137 goto restart;
138 }
139
140 if (cand >= this_start)
141 return cand;
142 }
143
144 return 0;
145}
146#else /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
98phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t start, 147phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t start,
99 phys_addr_t end, phys_addr_t size, 148 phys_addr_t end, phys_addr_t size,
100 phys_addr_t align, int nid) 149 phys_addr_t align, int nid)
@@ -123,6 +172,7 @@ phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t start,
123 } 172 }
124 return 0; 173 return 0;
125} 174}
175#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
126 176
127/** 177/**
128 * memblock_find_in_range - find free area in given range 178 * memblock_find_in_range - find free area in given range
@@ -314,7 +364,8 @@ static void __init_memblock memblock_merge_regions(struct memblock_type *type)
314 } 364 }
315 365
316 this->size += next->size; 366 this->size += next->size;
317 memmove(next, next + 1, (type->cnt - (i + 1)) * sizeof(*next)); 367 /* move the regions after 'next' forward; they start at index i + 2 */
368 memmove(next, next + 1, (type->cnt - (i + 2)) * sizeof(*next));
318 type->cnt--; 369 type->cnt--;
319 } 370 }
320} 371}
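
The memmove() count fix above is easiest to check with concrete indices: once regions[i] has absorbed regions[i + 1], only the cnt - (i + 2) entries starting at index i + 2 still have to slide down by one slot; the old cnt - (i + 1) count copied one stale element past the end of the live array. A standalone sketch of the same merge step (plain C, not kernel code):

#include <stdio.h>
#include <string.h>

struct region { unsigned long base, size; };

static void merge_at(struct region *r, int *cnt, int i)
{
	r[i].size += r[i + 1].size;
	/* slide down the regions that follow the one we just absorbed */
	memmove(&r[i + 1], &r[i + 2], (*cnt - (i + 2)) * sizeof(*r));
	(*cnt)--;
}

int main(void)
{
	struct region r[4] = { {0, 16}, {16, 16}, {32, 16}, {64, 16} };
	int cnt = 4;

	merge_at(r, &cnt, 0);	/* {0,16} + {16,16} -> {0,32} */
	for (int i = 0; i < cnt; i++)
		printf("region %d: base=%lu size=%lu\n", i, r[i].base, r[i].size);
	return 0;
}
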
@@ -827,6 +878,23 @@ phys_addr_t __init memblock_phys_mem_size(void)
827 return memblock.memory.total_size; 878 return memblock.memory.total_size;
828} 879}
829 880
881phys_addr_t __init memblock_mem_size(unsigned long limit_pfn)
882{
883 unsigned long pages = 0;
884 struct memblock_region *r;
885 unsigned long start_pfn, end_pfn;
886
887 for_each_memblock(memory, r) {
888 start_pfn = memblock_region_memory_base_pfn(r);
889 end_pfn = memblock_region_memory_end_pfn(r);
890 start_pfn = min_t(unsigned long, start_pfn, limit_pfn);
891 end_pfn = min_t(unsigned long, end_pfn, limit_pfn);
892 pages += end_pfn - start_pfn;
893 }
894
895 return (phys_addr_t)pages << PAGE_SHIFT;
896}
897
830/* lowest address */ 898/* lowest address */
831phys_addr_t __init_memblock memblock_start_of_DRAM(void) 899phys_addr_t __init_memblock memblock_start_of_DRAM(void)
832{ 900{
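
For reference, the new memblock_mem_size() above just clamps every memory region to limit_pfn and adds up whatever is left below the limit. A rough userspace model of that calculation, with made-up region values:

#include <stdio.h>

#define PAGE_SHIFT	12	/* assume 4 KiB pages */

struct range { unsigned long start_pfn, end_pfn; };

static unsigned long long mem_size_below(const struct range *r, int nr,
					 unsigned long limit_pfn)
{
	unsigned long long pages = 0;

	for (int i = 0; i < nr; i++) {
		unsigned long s = r[i].start_pfn < limit_pfn ? r[i].start_pfn : limit_pfn;
		unsigned long e = r[i].end_pfn < limit_pfn ? r[i].end_pfn : limit_pfn;

		pages += e - s;	/* regions entirely above the limit add 0 */
	}
	return pages << PAGE_SHIFT;	/* bytes */
}

int main(void)
{
	/* two regions: 0-1 GiB and 4-6 GiB, expressed as page frame numbers */
	struct range r[] = { { 0x0, 0x40000 }, { 0x100000, 0x180000 } };

	/* limit at 5 GiB (pfn 0x140000): 1 GiB + 1 GiB = 2 GiB reported */
	printf("%llu bytes below the limit\n", mem_size_below(r, 2, 0x140000));
	return 0;
}
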
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 09255ec8159c..53b8201b31eb 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -120,6 +120,14 @@ static const char * const mem_cgroup_events_names[] = {
120 "pgmajfault", 120 "pgmajfault",
121}; 121};
122 122
123static const char * const mem_cgroup_lru_names[] = {
124 "inactive_anon",
125 "active_anon",
126 "inactive_file",
127 "active_file",
128 "unevictable",
129};
130
123/* 131/*
124 * Per memcg event counter is incremented at every pagein/pageout. With THP, 132 * Per memcg event counter is incremented at every pagein/pageout. With THP,
125 * it will be incremated by the number of pages. This counter is used for 133 * it will be incremated by the number of pages. This counter is used for
@@ -172,7 +180,7 @@ struct mem_cgroup_per_node {
172}; 180};
173 181
174struct mem_cgroup_lru_info { 182struct mem_cgroup_lru_info {
175 struct mem_cgroup_per_node *nodeinfo[MAX_NUMNODES]; 183 struct mem_cgroup_per_node *nodeinfo[0];
176}; 184};
177 185
178/* 186/*
@@ -276,17 +284,6 @@ struct mem_cgroup {
276 */ 284 */
277 struct res_counter kmem; 285 struct res_counter kmem;
278 /* 286 /*
279 * Per cgroup active and inactive list, similar to the
280 * per zone LRU lists.
281 */
282 struct mem_cgroup_lru_info info;
283 int last_scanned_node;
284#if MAX_NUMNODES > 1
285 nodemask_t scan_nodes;
286 atomic_t numainfo_events;
287 atomic_t numainfo_updating;
288#endif
289 /*
290 * Should the accounting and control be hierarchical, per subtree? 287 * Should the accounting and control be hierarchical, per subtree?
291 */ 288 */
292 bool use_hierarchy; 289 bool use_hierarchy;
@@ -349,8 +346,29 @@ struct mem_cgroup {
349 /* Index in the kmem_cache->memcg_params->memcg_caches array */ 346 /* Index in the kmem_cache->memcg_params->memcg_caches array */
350 int kmemcg_id; 347 int kmemcg_id;
351#endif 348#endif
349
350 int last_scanned_node;
351#if MAX_NUMNODES > 1
352 nodemask_t scan_nodes;
353 atomic_t numainfo_events;
354 atomic_t numainfo_updating;
355#endif
356 /*
357 * Per cgroup active and inactive list, similar to the
358 * per zone LRU lists.
359 *
360 * WARNING: This has to be the last element of the struct. Don't
361 * add new fields after this point.
362 */
363 struct mem_cgroup_lru_info info;
352}; 364};
353 365
366static size_t memcg_size(void)
367{
368 return sizeof(struct mem_cgroup) +
369 nr_node_ids * sizeof(struct mem_cgroup_per_node);
370}
371
354/* internal only representation about the status of kmem accounting. */ 372/* internal only representation about the status of kmem accounting. */
355enum { 373enum {
356 KMEM_ACCOUNTED_ACTIVE = 0, /* accounted by this cgroup itself */ 374 KMEM_ACCOUNTED_ACTIVE = 0, /* accounted by this cgroup itself */
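
The struct reshuffle above leans on a common C trick: the per-node array becomes a zero-length member at the very end of struct mem_cgroup, and memcg_size() sizes the allocation for the nr_node_ids nodes that actually exist rather than for MAX_NUMNODES. A standalone sketch of the pattern, with hypothetical names:

#include <stdio.h>
#include <stdlib.h>

struct per_node { int stats; };

struct container {
	long fixed_state;
	/* must stay the last member, like the lru info above */
	struct per_node *nodeinfo[0];
};

static size_t container_size(int nr_node_ids)
{
	return sizeof(struct container) +
	       nr_node_ids * sizeof(struct per_node *);
}

int main(void)
{
	int nr_node_ids = 2;	/* e.g. a two-node machine */
	struct container *c = calloc(1, container_size(nr_node_ids));

	if (!c)
		return 1;
	for (int n = 0; n < nr_node_ids; n++)
		c->nodeinfo[n] = calloc(1, sizeof(struct per_node));

	printf("allocated %zu bytes instead of sizing for every possible node\n",
	       container_size(nr_node_ids));
	return 0;
}
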
@@ -398,8 +416,8 @@ static bool memcg_kmem_test_and_clear_dead(struct mem_cgroup *memcg)
398 416
399/* Stuffs for move charges at task migration. */ 417/* Stuffs for move charges at task migration. */
400/* 418/*
401 * Types of charges to be moved. "move_charge_at_immitgrate" is treated as a 419 * Types of charges to be moved. "move_charge_at_immitgrate" and
402 * left-shifted bitmap of these types. 420 * "immigrate_flags" are treated as a left-shifted bitmap of these types.
403 */ 421 */
404enum move_type { 422enum move_type {
405 MOVE_CHARGE_TYPE_ANON, /* private anonymous page and swap of it */ 423 MOVE_CHARGE_TYPE_ANON, /* private anonymous page and swap of it */
@@ -412,6 +430,7 @@ static struct move_charge_struct {
412 spinlock_t lock; /* for from, to */ 430 spinlock_t lock; /* for from, to */
413 struct mem_cgroup *from; 431 struct mem_cgroup *from;
414 struct mem_cgroup *to; 432 struct mem_cgroup *to;
433 unsigned long immigrate_flags;
415 unsigned long precharge; 434 unsigned long precharge;
416 unsigned long moved_charge; 435 unsigned long moved_charge;
417 unsigned long moved_swap; 436 unsigned long moved_swap;
@@ -424,14 +443,12 @@ static struct move_charge_struct {
424 443
425static bool move_anon(void) 444static bool move_anon(void)
426{ 445{
427 return test_bit(MOVE_CHARGE_TYPE_ANON, 446 return test_bit(MOVE_CHARGE_TYPE_ANON, &mc.immigrate_flags);
428 &mc.to->move_charge_at_immigrate);
429} 447}
430 448
431static bool move_file(void) 449static bool move_file(void)
432{ 450{
433 return test_bit(MOVE_CHARGE_TYPE_FILE, 451 return test_bit(MOVE_CHARGE_TYPE_FILE, &mc.immigrate_flags);
434 &mc.to->move_charge_at_immigrate);
435} 452}
436 453
437/* 454/*
@@ -471,6 +488,13 @@ enum res_type {
471#define MEM_CGROUP_RECLAIM_SHRINK_BIT 0x1 488#define MEM_CGROUP_RECLAIM_SHRINK_BIT 0x1
472#define MEM_CGROUP_RECLAIM_SHRINK (1 << MEM_CGROUP_RECLAIM_SHRINK_BIT) 489#define MEM_CGROUP_RECLAIM_SHRINK (1 << MEM_CGROUP_RECLAIM_SHRINK_BIT)
473 490
491/*
492 * The memcg_create_mutex will be held whenever a new cgroup is created.
493 * As a consequence, any change that needs to protect against new child cgroups
494 * appearing has to hold it as well.
495 */
496static DEFINE_MUTEX(memcg_create_mutex);
497
474static void mem_cgroup_get(struct mem_cgroup *memcg); 498static void mem_cgroup_get(struct mem_cgroup *memcg);
475static void mem_cgroup_put(struct mem_cgroup *memcg); 499static void mem_cgroup_put(struct mem_cgroup *memcg);
476 500
@@ -627,6 +651,7 @@ static void drain_all_stock_async(struct mem_cgroup *memcg);
627static struct mem_cgroup_per_zone * 651static struct mem_cgroup_per_zone *
628mem_cgroup_zoneinfo(struct mem_cgroup *memcg, int nid, int zid) 652mem_cgroup_zoneinfo(struct mem_cgroup *memcg, int nid, int zid)
629{ 653{
654 VM_BUG_ON((unsigned)nid >= nr_node_ids);
630 return &memcg->info.nodeinfo[nid]->zoneinfo[zid]; 655 return &memcg->info.nodeinfo[nid]->zoneinfo[zid];
631} 656}
632 657
@@ -1371,17 +1396,6 @@ int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec)
1371 return inactive * inactive_ratio < active; 1396 return inactive * inactive_ratio < active;
1372} 1397}
1373 1398
1374int mem_cgroup_inactive_file_is_low(struct lruvec *lruvec)
1375{
1376 unsigned long active;
1377 unsigned long inactive;
1378
1379 inactive = mem_cgroup_get_lru_size(lruvec, LRU_INACTIVE_FILE);
1380 active = mem_cgroup_get_lru_size(lruvec, LRU_ACTIVE_FILE);
1381
1382 return (active > inactive);
1383}
1384
1385#define mem_cgroup_from_res_counter(counter, member) \ 1399#define mem_cgroup_from_res_counter(counter, member) \
1386 container_of(counter, struct mem_cgroup, member) 1400 container_of(counter, struct mem_cgroup, member)
1387 1401
@@ -1524,8 +1538,9 @@ static void move_unlock_mem_cgroup(struct mem_cgroup *memcg,
1524 spin_unlock_irqrestore(&memcg->move_lock, *flags); 1538 spin_unlock_irqrestore(&memcg->move_lock, *flags);
1525} 1539}
1526 1540
1541#define K(x) ((x) << (PAGE_SHIFT-10))
1527/** 1542/**
1528 * mem_cgroup_print_oom_info: Called from OOM with tasklist_lock held in read mode. 1543 * mem_cgroup_print_oom_info: Print OOM information relevant to memory controller.
1529 * @memcg: The memory cgroup that went over limit 1544 * @memcg: The memory cgroup that went over limit
1530 * @p: Task that is going to be killed 1545 * @p: Task that is going to be killed
1531 * 1546 *
@@ -1543,8 +1558,10 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
1543 */ 1558 */
1544 static char memcg_name[PATH_MAX]; 1559 static char memcg_name[PATH_MAX];
1545 int ret; 1560 int ret;
1561 struct mem_cgroup *iter;
1562 unsigned int i;
1546 1563
1547 if (!memcg || !p) 1564 if (!p)
1548 return; 1565 return;
1549 1566
1550 rcu_read_lock(); 1567 rcu_read_lock();
@@ -1563,7 +1580,7 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
1563 } 1580 }
1564 rcu_read_unlock(); 1581 rcu_read_unlock();
1565 1582
1566 printk(KERN_INFO "Task in %s killed", memcg_name); 1583 pr_info("Task in %s killed", memcg_name);
1567 1584
1568 rcu_read_lock(); 1585 rcu_read_lock();
1569 ret = cgroup_path(mem_cgrp, memcg_name, PATH_MAX); 1586 ret = cgroup_path(mem_cgrp, memcg_name, PATH_MAX);
@@ -1576,22 +1593,45 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
1576 /* 1593 /*
1577 * Continues from above, so we don't need an KERN_ level 1594 * Continues from above, so we don't need an KERN_ level
1578 */ 1595 */
1579 printk(KERN_CONT " as a result of limit of %s\n", memcg_name); 1596 pr_cont(" as a result of limit of %s\n", memcg_name);
1580done: 1597done:
1581 1598
1582 printk(KERN_INFO "memory: usage %llukB, limit %llukB, failcnt %llu\n", 1599 pr_info("memory: usage %llukB, limit %llukB, failcnt %llu\n",
1583 res_counter_read_u64(&memcg->res, RES_USAGE) >> 10, 1600 res_counter_read_u64(&memcg->res, RES_USAGE) >> 10,
1584 res_counter_read_u64(&memcg->res, RES_LIMIT) >> 10, 1601 res_counter_read_u64(&memcg->res, RES_LIMIT) >> 10,
1585 res_counter_read_u64(&memcg->res, RES_FAILCNT)); 1602 res_counter_read_u64(&memcg->res, RES_FAILCNT));
1586 printk(KERN_INFO "memory+swap: usage %llukB, limit %llukB, " 1603 pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %llu\n",
1587 "failcnt %llu\n",
1588 res_counter_read_u64(&memcg->memsw, RES_USAGE) >> 10, 1604 res_counter_read_u64(&memcg->memsw, RES_USAGE) >> 10,
1589 res_counter_read_u64(&memcg->memsw, RES_LIMIT) >> 10, 1605 res_counter_read_u64(&memcg->memsw, RES_LIMIT) >> 10,
1590 res_counter_read_u64(&memcg->memsw, RES_FAILCNT)); 1606 res_counter_read_u64(&memcg->memsw, RES_FAILCNT));
1591 printk(KERN_INFO "kmem: usage %llukB, limit %llukB, failcnt %llu\n", 1607 pr_info("kmem: usage %llukB, limit %llukB, failcnt %llu\n",
1592 res_counter_read_u64(&memcg->kmem, RES_USAGE) >> 10, 1608 res_counter_read_u64(&memcg->kmem, RES_USAGE) >> 10,
1593 res_counter_read_u64(&memcg->kmem, RES_LIMIT) >> 10, 1609 res_counter_read_u64(&memcg->kmem, RES_LIMIT) >> 10,
1594 res_counter_read_u64(&memcg->kmem, RES_FAILCNT)); 1610 res_counter_read_u64(&memcg->kmem, RES_FAILCNT));
1611
1612 for_each_mem_cgroup_tree(iter, memcg) {
1613 pr_info("Memory cgroup stats");
1614
1615 rcu_read_lock();
1616 ret = cgroup_path(iter->css.cgroup, memcg_name, PATH_MAX);
1617 if (!ret)
1618 pr_cont(" for %s", memcg_name);
1619 rcu_read_unlock();
1620 pr_cont(":");
1621
1622 for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
1623 if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account)
1624 continue;
1625 pr_cont(" %s:%ldKB", mem_cgroup_stat_names[i],
1626 K(mem_cgroup_read_stat(iter, i)));
1627 }
1628
1629 for (i = 0; i < NR_LRU_LISTS; i++)
1630 pr_cont(" %s:%luKB", mem_cgroup_lru_names[i],
1631 K(mem_cgroup_nr_lru_pages(iter, BIT(i))));
1632
1633 pr_cont("\n");
1634 }
1595} 1635}
1596 1636
1597/* 1637/*
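
The K() helper added for the OOM dump converts a page count to kilobytes with a shift instead of a multiply. A quick arithmetic check, assuming 4 KiB pages (PAGE_SHIFT = 12, so K(x) == x * 4):

#include <stdio.h>

#define PAGE_SHIFT	12
#define K(x)		((x) << (PAGE_SHIFT - 10))

int main(void)
{
	unsigned long nr_pages = 2560;

	/* 2560 pages * 4 KB per page = 10240 KB */
	printf("%lu pages = %lu KB\n", nr_pages, K(nr_pages));
	return 0;
}
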
@@ -2256,6 +2296,17 @@ static void drain_local_stock(struct work_struct *dummy)
2256 clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags); 2296 clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);
2257} 2297}
2258 2298
2299static void __init memcg_stock_init(void)
2300{
2301 int cpu;
2302
2303 for_each_possible_cpu(cpu) {
2304 struct memcg_stock_pcp *stock =
2305 &per_cpu(memcg_stock, cpu);
2306 INIT_WORK(&stock->work, drain_local_stock);
2307 }
2308}
2309
2259/* 2310/*
2260 * Cache charges(val) which is from res_counter, to local per_cpu area. 2311 * Cache charges(val) which is from res_counter, to local per_cpu area.
2261 * This will be consumed by consume_stock() function, later. 2312 * This will be consumed by consume_stock() function, later.
@@ -3030,7 +3081,9 @@ int memcg_register_cache(struct mem_cgroup *memcg, struct kmem_cache *s,
3030 if (memcg) { 3081 if (memcg) {
3031 s->memcg_params->memcg = memcg; 3082 s->memcg_params->memcg = memcg;
3032 s->memcg_params->root_cache = root_cache; 3083 s->memcg_params->root_cache = root_cache;
3033 } 3084 } else
3085 s->memcg_params->is_root_cache = true;
3086
3034 return 0; 3087 return 0;
3035} 3088}
3036 3089
@@ -4389,8 +4442,8 @@ void mem_cgroup_print_bad_page(struct page *page)
4389 4442
4390 pc = lookup_page_cgroup_used(page); 4443 pc = lookup_page_cgroup_used(page);
4391 if (pc) { 4444 if (pc) {
4392 printk(KERN_ALERT "pc:%p pc->flags:%lx pc->mem_cgroup:%p\n", 4445 pr_alert("pc:%p pc->flags:%lx pc->mem_cgroup:%p\n",
4393 pc, pc->flags, pc->mem_cgroup); 4446 pc, pc->flags, pc->mem_cgroup);
4394 } 4447 }
4395} 4448}
4396#endif 4449#endif
@@ -4717,6 +4770,33 @@ static void mem_cgroup_reparent_charges(struct mem_cgroup *memcg)
4717} 4770}
4718 4771
4719/* 4772/*
4773 * This mainly exists for tests during the setting of use_hierarchy.
4774 * Since this is the very setting we are changing, the current hierarchy value
4775 * is meaningless.
4776 */
4777static inline bool __memcg_has_children(struct mem_cgroup *memcg)
4778{
4779 struct cgroup *pos;
4780
4781 /* bounce at first found */
4782 cgroup_for_each_child(pos, memcg->css.cgroup)
4783 return true;
4784 return false;
4785}
4786
4787/*
4788 * Must be called with memcg_create_mutex held, unless the cgroup is guaranteed
4789 * to be already dead (as in mem_cgroup_force_empty, for instance). This is
4790 * different from mem_cgroup_count_children(), in the sense that we don't really care how
4791 * many children we have; we only need to know if we have any. It also counts
4792 * any memcg without hierarchy as infertile.
4793 */
4794static inline bool memcg_has_children(struct mem_cgroup *memcg)
4795{
4796 return memcg->use_hierarchy && __memcg_has_children(memcg);
4797}
4798
4799/*
4720 * Reclaims as many pages from the given memcg as possible and moves 4800 * Reclaims as many pages from the given memcg as possible and moves
4721 * the rest to the parent. 4801 * the rest to the parent.
4722 * 4802 *
@@ -4786,7 +4866,7 @@ static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft,
4786 if (parent) 4866 if (parent)
4787 parent_memcg = mem_cgroup_from_cont(parent); 4867 parent_memcg = mem_cgroup_from_cont(parent);
4788 4868
4789 cgroup_lock(); 4869 mutex_lock(&memcg_create_mutex);
4790 4870
4791 if (memcg->use_hierarchy == val) 4871 if (memcg->use_hierarchy == val)
4792 goto out; 4872 goto out;
@@ -4801,7 +4881,7 @@ static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft,
4801 */ 4881 */
4802 if ((!parent_memcg || !parent_memcg->use_hierarchy) && 4882 if ((!parent_memcg || !parent_memcg->use_hierarchy) &&
4803 (val == 1 || val == 0)) { 4883 (val == 1 || val == 0)) {
4804 if (list_empty(&cont->children)) 4884 if (!__memcg_has_children(memcg))
4805 memcg->use_hierarchy = val; 4885 memcg->use_hierarchy = val;
4806 else 4886 else
4807 retval = -EBUSY; 4887 retval = -EBUSY;
@@ -4809,7 +4889,7 @@ static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft,
4809 retval = -EINVAL; 4889 retval = -EINVAL;
4810 4890
4811out: 4891out:
4812 cgroup_unlock(); 4892 mutex_unlock(&memcg_create_mutex);
4813 4893
4814 return retval; 4894 return retval;
4815} 4895}
@@ -4894,8 +4974,6 @@ static int memcg_update_kmem_limit(struct cgroup *cont, u64 val)
4894{ 4974{
4895 int ret = -EINVAL; 4975 int ret = -EINVAL;
4896#ifdef CONFIG_MEMCG_KMEM 4976#ifdef CONFIG_MEMCG_KMEM
4897 bool must_inc_static_branch = false;
4898
4899 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); 4977 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
4900 /* 4978 /*
4901 * For simplicity, we won't allow this to be disabled. It also can't 4979 * For simplicity, we won't allow this to be disabled. It also can't
@@ -4908,18 +4986,11 @@ static int memcg_update_kmem_limit(struct cgroup *cont, u64 val)
4908 * 4986 *
4909 * After it first became limited, changes in the value of the limit are 4987 * After it first became limited, changes in the value of the limit are
4910 * of course permitted. 4988 * of course permitted.
4911 *
4912 * Taking the cgroup_lock is really offensive, but it is so far the only
4913 * way to guarantee that no children will appear. There are plenty of
4914 * other offenders, and they should all go away. Fine grained locking
4915 * is probably the way to go here. When we are fully hierarchical, we
4916 * can also get rid of the use_hierarchy check.
4917 */ 4989 */
4918 cgroup_lock(); 4990 mutex_lock(&memcg_create_mutex);
4919 mutex_lock(&set_limit_mutex); 4991 mutex_lock(&set_limit_mutex);
4920 if (!memcg->kmem_account_flags && val != RESOURCE_MAX) { 4992 if (!memcg->kmem_account_flags && val != RESOURCE_MAX) {
4921 if (cgroup_task_count(cont) || (memcg->use_hierarchy && 4993 if (cgroup_task_count(cont) || memcg_has_children(memcg)) {
4922 !list_empty(&cont->children))) {
4923 ret = -EBUSY; 4994 ret = -EBUSY;
4924 goto out; 4995 goto out;
4925 } 4996 }
@@ -4931,7 +5002,13 @@ static int memcg_update_kmem_limit(struct cgroup *cont, u64 val)
4931 res_counter_set_limit(&memcg->kmem, RESOURCE_MAX); 5002 res_counter_set_limit(&memcg->kmem, RESOURCE_MAX);
4932 goto out; 5003 goto out;
4933 } 5004 }
4934 must_inc_static_branch = true; 5005 static_key_slow_inc(&memcg_kmem_enabled_key);
5006 /*
5007 * setting the active bit after the inc will guarantee no one
5008 * starts accounting before all call sites are patched
5009 */
5010 memcg_kmem_set_active(memcg);
5011
4935 /* 5012 /*
4936 * kmem charges can outlive the cgroup. In the case of slab 5013 * kmem charges can outlive the cgroup. In the case of slab
4937 * pages, for instance, a page contain objects from various 5014 * pages, for instance, a page contain objects from various
@@ -4943,32 +5020,12 @@ static int memcg_update_kmem_limit(struct cgroup *cont, u64 val)
4943 ret = res_counter_set_limit(&memcg->kmem, val); 5020 ret = res_counter_set_limit(&memcg->kmem, val);
4944out: 5021out:
4945 mutex_unlock(&set_limit_mutex); 5022 mutex_unlock(&set_limit_mutex);
4946 cgroup_unlock(); 5023 mutex_unlock(&memcg_create_mutex);
4947
4948 /*
4949 * We are by now familiar with the fact that we can't inc the static
4950 * branch inside cgroup_lock. See disarm functions for details. A
4951 * worker here is overkill, but also wrong: After the limit is set, we
4952 * must start accounting right away. Since this operation can't fail,
4953 * we can safely defer it to here - no rollback will be needed.
4954 *
4955 * The boolean used to control this is also safe, because
4956 * KMEM_ACCOUNTED_ACTIVATED guarantees that only one process will be
4957 * able to set it to true;
4958 */
4959 if (must_inc_static_branch) {
4960 static_key_slow_inc(&memcg_kmem_enabled_key);
4961 /*
4962 * setting the active bit after the inc will guarantee no one
4963 * starts accounting before all call sites are patched
4964 */
4965 memcg_kmem_set_active(memcg);
4966 }
4967
4968#endif 5024#endif
4969 return ret; 5025 return ret;
4970} 5026}
4971 5027
5028#ifdef CONFIG_MEMCG_KMEM
4972static int memcg_propagate_kmem(struct mem_cgroup *memcg) 5029static int memcg_propagate_kmem(struct mem_cgroup *memcg)
4973{ 5030{
4974 int ret = 0; 5031 int ret = 0;
@@ -4977,7 +5034,6 @@ static int memcg_propagate_kmem(struct mem_cgroup *memcg)
4977 goto out; 5034 goto out;
4978 5035
4979 memcg->kmem_account_flags = parent->kmem_account_flags; 5036 memcg->kmem_account_flags = parent->kmem_account_flags;
4980#ifdef CONFIG_MEMCG_KMEM
4981 /* 5037 /*
4982 * When that happen, we need to disable the static branch only on those 5038 * When that happen, we need to disable the static branch only on those
4983 * memcgs that enabled it. To achieve this, we would be forced to 5039 * memcgs that enabled it. To achieve this, we would be forced to
@@ -5003,10 +5059,10 @@ static int memcg_propagate_kmem(struct mem_cgroup *memcg)
5003 mutex_lock(&set_limit_mutex); 5059 mutex_lock(&set_limit_mutex);
5004 ret = memcg_update_cache_sizes(memcg); 5060 ret = memcg_update_cache_sizes(memcg);
5005 mutex_unlock(&set_limit_mutex); 5061 mutex_unlock(&set_limit_mutex);
5006#endif
5007out: 5062out:
5008 return ret; 5063 return ret;
5009} 5064}
5065#endif /* CONFIG_MEMCG_KMEM */
5010 5066
5011/* 5067/*
5012 * The user of this function is... 5068 * The user of this function is...
@@ -5146,15 +5202,14 @@ static int mem_cgroup_move_charge_write(struct cgroup *cgrp,
5146 5202
5147 if (val >= (1 << NR_MOVE_TYPE)) 5203 if (val >= (1 << NR_MOVE_TYPE))
5148 return -EINVAL; 5204 return -EINVAL;
5205
5149 /* 5206 /*
5150 * We check this value several times in both in can_attach() and 5207 * No kind of locking is needed in here, because ->can_attach() will
5151 * attach(), so we need cgroup lock to prevent this value from being 5208 * check this value once in the beginning of the process, and then carry
5152 * inconsistent. 5209 * on with stale data. This means that changes to this value will only
5210 * affect task migrations starting after the change.
5153 */ 5211 */
5154 cgroup_lock();
5155 memcg->move_charge_at_immigrate = val; 5212 memcg->move_charge_at_immigrate = val;
5156 cgroup_unlock();
5157
5158 return 0; 5213 return 0;
5159} 5214}
5160#else 5215#else
@@ -5212,14 +5267,6 @@ static int memcg_numa_stat_show(struct cgroup *cont, struct cftype *cft,
5212} 5267}
5213#endif /* CONFIG_NUMA */ 5268#endif /* CONFIG_NUMA */
5214 5269
5215static const char * const mem_cgroup_lru_names[] = {
5216 "inactive_anon",
5217 "active_anon",
5218 "inactive_file",
5219 "active_file",
5220 "unevictable",
5221};
5222
5223static inline void mem_cgroup_lru_names_not_uptodate(void) 5270static inline void mem_cgroup_lru_names_not_uptodate(void)
5224{ 5271{
5225 BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS); 5272 BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS);
@@ -5333,18 +5380,17 @@ static int mem_cgroup_swappiness_write(struct cgroup *cgrp, struct cftype *cft,
5333 5380
5334 parent = mem_cgroup_from_cont(cgrp->parent); 5381 parent = mem_cgroup_from_cont(cgrp->parent);
5335 5382
5336 cgroup_lock(); 5383 mutex_lock(&memcg_create_mutex);
5337 5384
5338 /* If under hierarchy, only empty-root can set this value */ 5385 /* If under hierarchy, only empty-root can set this value */
5339 if ((parent->use_hierarchy) || 5386 if ((parent->use_hierarchy) || memcg_has_children(memcg)) {
5340 (memcg->use_hierarchy && !list_empty(&cgrp->children))) { 5387 mutex_unlock(&memcg_create_mutex);
5341 cgroup_unlock();
5342 return -EINVAL; 5388 return -EINVAL;
5343 } 5389 }
5344 5390
5345 memcg->swappiness = val; 5391 memcg->swappiness = val;
5346 5392
5347 cgroup_unlock(); 5393 mutex_unlock(&memcg_create_mutex);
5348 5394
5349 return 0; 5395 return 0;
5350} 5396}
@@ -5670,17 +5716,16 @@ static int mem_cgroup_oom_control_write(struct cgroup *cgrp,
5670 5716
5671 parent = mem_cgroup_from_cont(cgrp->parent); 5717 parent = mem_cgroup_from_cont(cgrp->parent);
5672 5718
5673 cgroup_lock(); 5719 mutex_lock(&memcg_create_mutex);
5674 /* oom-kill-disable is a flag for subhierarchy. */ 5720 /* oom-kill-disable is a flag for subhierarchy. */
5675 if ((parent->use_hierarchy) || 5721 if ((parent->use_hierarchy) || memcg_has_children(memcg)) {
5676 (memcg->use_hierarchy && !list_empty(&cgrp->children))) { 5722 mutex_unlock(&memcg_create_mutex);
5677 cgroup_unlock();
5678 return -EINVAL; 5723 return -EINVAL;
5679 } 5724 }
5680 memcg->oom_kill_disable = val; 5725 memcg->oom_kill_disable = val;
5681 if (!val) 5726 if (!val)
5682 memcg_oom_recover(memcg); 5727 memcg_oom_recover(memcg);
5683 cgroup_unlock(); 5728 mutex_unlock(&memcg_create_mutex);
5684 return 0; 5729 return 0;
5685} 5730}
5686 5731
@@ -5795,33 +5840,6 @@ static struct cftype mem_cgroup_files[] = {
5795 .read_seq_string = memcg_numa_stat_show, 5840 .read_seq_string = memcg_numa_stat_show,
5796 }, 5841 },
5797#endif 5842#endif
5798#ifdef CONFIG_MEMCG_SWAP
5799 {
5800 .name = "memsw.usage_in_bytes",
5801 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
5802 .read = mem_cgroup_read,
5803 .register_event = mem_cgroup_usage_register_event,
5804 .unregister_event = mem_cgroup_usage_unregister_event,
5805 },
5806 {
5807 .name = "memsw.max_usage_in_bytes",
5808 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE),
5809 .trigger = mem_cgroup_reset,
5810 .read = mem_cgroup_read,
5811 },
5812 {
5813 .name = "memsw.limit_in_bytes",
5814 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT),
5815 .write_string = mem_cgroup_write,
5816 .read = mem_cgroup_read,
5817 },
5818 {
5819 .name = "memsw.failcnt",
5820 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT),
5821 .trigger = mem_cgroup_reset,
5822 .read = mem_cgroup_read,
5823 },
5824#endif
5825#ifdef CONFIG_MEMCG_KMEM 5843#ifdef CONFIG_MEMCG_KMEM
5826 { 5844 {
5827 .name = "kmem.limit_in_bytes", 5845 .name = "kmem.limit_in_bytes",
@@ -5856,6 +5874,36 @@ static struct cftype mem_cgroup_files[] = {
5856 { }, /* terminate */ 5874 { }, /* terminate */
5857}; 5875};
5858 5876
5877#ifdef CONFIG_MEMCG_SWAP
5878static struct cftype memsw_cgroup_files[] = {
5879 {
5880 .name = "memsw.usage_in_bytes",
5881 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
5882 .read = mem_cgroup_read,
5883 .register_event = mem_cgroup_usage_register_event,
5884 .unregister_event = mem_cgroup_usage_unregister_event,
5885 },
5886 {
5887 .name = "memsw.max_usage_in_bytes",
5888 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE),
5889 .trigger = mem_cgroup_reset,
5890 .read = mem_cgroup_read,
5891 },
5892 {
5893 .name = "memsw.limit_in_bytes",
5894 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT),
5895 .write_string = mem_cgroup_write,
5896 .read = mem_cgroup_read,
5897 },
5898 {
5899 .name = "memsw.failcnt",
5900 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT),
5901 .trigger = mem_cgroup_reset,
5902 .read = mem_cgroup_read,
5903 },
5904 { }, /* terminate */
5905};
5906#endif
5859static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node) 5907static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
5860{ 5908{
5861 struct mem_cgroup_per_node *pn; 5909 struct mem_cgroup_per_node *pn;
@@ -5894,9 +5942,9 @@ static void free_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
5894static struct mem_cgroup *mem_cgroup_alloc(void) 5942static struct mem_cgroup *mem_cgroup_alloc(void)
5895{ 5943{
5896 struct mem_cgroup *memcg; 5944 struct mem_cgroup *memcg;
5897 int size = sizeof(struct mem_cgroup); 5945 size_t size = memcg_size();
5898 5946
5899 /* Can be very big if MAX_NUMNODES is very big */ 5947 /* Can be very big if nr_node_ids is very big */
5900 if (size < PAGE_SIZE) 5948 if (size < PAGE_SIZE)
5901 memcg = kzalloc(size, GFP_KERNEL); 5949 memcg = kzalloc(size, GFP_KERNEL);
5902 else 5950 else
@@ -5933,7 +5981,7 @@ out_free:
5933static void __mem_cgroup_free(struct mem_cgroup *memcg) 5981static void __mem_cgroup_free(struct mem_cgroup *memcg)
5934{ 5982{
5935 int node; 5983 int node;
5936 int size = sizeof(struct mem_cgroup); 5984 size_t size = memcg_size();
5937 5985
5938 mem_cgroup_remove_from_trees(memcg); 5986 mem_cgroup_remove_from_trees(memcg);
5939 free_css_id(&mem_cgroup_subsys, &memcg->css); 5987 free_css_id(&mem_cgroup_subsys, &memcg->css);
@@ -6015,19 +6063,7 @@ struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg)
6015} 6063}
6016EXPORT_SYMBOL(parent_mem_cgroup); 6064EXPORT_SYMBOL(parent_mem_cgroup);
6017 6065
6018#ifdef CONFIG_MEMCG_SWAP 6066static void __init mem_cgroup_soft_limit_tree_init(void)
6019static void __init enable_swap_cgroup(void)
6020{
6021 if (!mem_cgroup_disabled() && really_do_swap_account)
6022 do_swap_account = 1;
6023}
6024#else
6025static void __init enable_swap_cgroup(void)
6026{
6027}
6028#endif
6029
6030static int mem_cgroup_soft_limit_tree_init(void)
6031{ 6067{
6032 struct mem_cgroup_tree_per_node *rtpn; 6068 struct mem_cgroup_tree_per_node *rtpn;
6033 struct mem_cgroup_tree_per_zone *rtpz; 6069 struct mem_cgroup_tree_per_zone *rtpz;
@@ -6038,8 +6074,7 @@ static int mem_cgroup_soft_limit_tree_init(void)
6038 if (!node_state(node, N_NORMAL_MEMORY)) 6074 if (!node_state(node, N_NORMAL_MEMORY))
6039 tmp = -1; 6075 tmp = -1;
6040 rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, tmp); 6076 rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, tmp);
6041 if (!rtpn) 6077 BUG_ON(!rtpn);
6042 goto err_cleanup;
6043 6078
6044 soft_limit_tree.rb_tree_per_node[node] = rtpn; 6079 soft_limit_tree.rb_tree_per_node[node] = rtpn;
6045 6080
@@ -6049,23 +6084,12 @@ static int mem_cgroup_soft_limit_tree_init(void)
6049 spin_lock_init(&rtpz->lock); 6084 spin_lock_init(&rtpz->lock);
6050 } 6085 }
6051 } 6086 }
6052 return 0;
6053
6054err_cleanup:
6055 for_each_node(node) {
6056 if (!soft_limit_tree.rb_tree_per_node[node])
6057 break;
6058 kfree(soft_limit_tree.rb_tree_per_node[node]);
6059 soft_limit_tree.rb_tree_per_node[node] = NULL;
6060 }
6061 return 1;
6062
6063} 6087}
6064 6088
6065static struct cgroup_subsys_state * __ref 6089static struct cgroup_subsys_state * __ref
6066mem_cgroup_css_alloc(struct cgroup *cont) 6090mem_cgroup_css_alloc(struct cgroup *cont)
6067{ 6091{
6068 struct mem_cgroup *memcg, *parent; 6092 struct mem_cgroup *memcg;
6069 long error = -ENOMEM; 6093 long error = -ENOMEM;
6070 int node; 6094 int node;
6071 6095
@@ -6079,24 +6103,44 @@ mem_cgroup_css_alloc(struct cgroup *cont)
6079 6103
6080 /* root ? */ 6104 /* root ? */
6081 if (cont->parent == NULL) { 6105 if (cont->parent == NULL) {
6082 int cpu;
6083 enable_swap_cgroup();
6084 parent = NULL;
6085 if (mem_cgroup_soft_limit_tree_init())
6086 goto free_out;
6087 root_mem_cgroup = memcg; 6106 root_mem_cgroup = memcg;
6088 for_each_possible_cpu(cpu) { 6107 res_counter_init(&memcg->res, NULL);
6089 struct memcg_stock_pcp *stock = 6108 res_counter_init(&memcg->memsw, NULL);
6090 &per_cpu(memcg_stock, cpu); 6109 res_counter_init(&memcg->kmem, NULL);
6091 INIT_WORK(&stock->work, drain_local_stock);
6092 }
6093 } else {
6094 parent = mem_cgroup_from_cont(cont->parent);
6095 memcg->use_hierarchy = parent->use_hierarchy;
6096 memcg->oom_kill_disable = parent->oom_kill_disable;
6097 } 6110 }
6098 6111
6099 if (parent && parent->use_hierarchy) { 6112 memcg->last_scanned_node = MAX_NUMNODES;
6113 INIT_LIST_HEAD(&memcg->oom_notify);
6114 atomic_set(&memcg->refcnt, 1);
6115 memcg->move_charge_at_immigrate = 0;
6116 mutex_init(&memcg->thresholds_lock);
6117 spin_lock_init(&memcg->move_lock);
6118
6119 return &memcg->css;
6120
6121free_out:
6122 __mem_cgroup_free(memcg);
6123 return ERR_PTR(error);
6124}
6125
6126static int
6127mem_cgroup_css_online(struct cgroup *cont)
6128{
6129 struct mem_cgroup *memcg, *parent;
6130 int error = 0;
6131
6132 if (!cont->parent)
6133 return 0;
6134
6135 mutex_lock(&memcg_create_mutex);
6136 memcg = mem_cgroup_from_cont(cont);
6137 parent = mem_cgroup_from_cont(cont->parent);
6138
6139 memcg->use_hierarchy = parent->use_hierarchy;
6140 memcg->oom_kill_disable = parent->oom_kill_disable;
6141 memcg->swappiness = mem_cgroup_swappiness(parent);
6142
6143 if (parent->use_hierarchy) {
6100 res_counter_init(&memcg->res, &parent->res); 6144 res_counter_init(&memcg->res, &parent->res);
6101 res_counter_init(&memcg->memsw, &parent->memsw); 6145 res_counter_init(&memcg->memsw, &parent->memsw);
6102 res_counter_init(&memcg->kmem, &parent->kmem); 6146 res_counter_init(&memcg->kmem, &parent->kmem);
@@ -6117,20 +6161,12 @@ mem_cgroup_css_alloc(struct cgroup *cont)
6117 * much sense so let cgroup subsystem know about this 6161 * much sense so let cgroup subsystem know about this
6118 * unfortunate state in our controller. 6162 * unfortunate state in our controller.
6119 */ 6163 */
6120 if (parent && parent != root_mem_cgroup) 6164 if (parent != root_mem_cgroup)
6121 mem_cgroup_subsys.broken_hierarchy = true; 6165 mem_cgroup_subsys.broken_hierarchy = true;
6122 } 6166 }
6123 memcg->last_scanned_node = MAX_NUMNODES;
6124 INIT_LIST_HEAD(&memcg->oom_notify);
6125
6126 if (parent)
6127 memcg->swappiness = mem_cgroup_swappiness(parent);
6128 atomic_set(&memcg->refcnt, 1);
6129 memcg->move_charge_at_immigrate = 0;
6130 mutex_init(&memcg->thresholds_lock);
6131 spin_lock_init(&memcg->move_lock);
6132 6167
6133 error = memcg_init_kmem(memcg, &mem_cgroup_subsys); 6168 error = memcg_init_kmem(memcg, &mem_cgroup_subsys);
6169 mutex_unlock(&memcg_create_mutex);
6134 if (error) { 6170 if (error) {
6135 /* 6171 /*
6136 * We call put now because our (and parent's) refcnts 6172 * We call put now because our (and parent's) refcnts
@@ -6138,12 +6174,10 @@ mem_cgroup_css_alloc(struct cgroup *cont)
6138 * call __mem_cgroup_free, so return directly 6174 * call __mem_cgroup_free, so return directly
6139 */ 6175 */
6140 mem_cgroup_put(memcg); 6176 mem_cgroup_put(memcg);
6141 return ERR_PTR(error); 6177 if (parent->use_hierarchy)
6178 mem_cgroup_put(parent);
6142 } 6179 }
6143 return &memcg->css; 6180 return error;
6144free_out:
6145 __mem_cgroup_free(memcg);
6146 return ERR_PTR(error);
6147} 6181}
6148 6182
6149static void mem_cgroup_css_offline(struct cgroup *cont) 6183static void mem_cgroup_css_offline(struct cgroup *cont)
@@ -6279,7 +6313,7 @@ static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
6279 * Because lookup_swap_cache() updates some statistics counter, 6313 * Because lookup_swap_cache() updates some statistics counter,
6280 * we call find_get_page() with swapper_space directly. 6314 * we call find_get_page() with swapper_space directly.
6281 */ 6315 */
6282 page = find_get_page(&swapper_space, ent.val); 6316 page = find_get_page(swap_address_space(ent), ent.val);
6283 if (do_swap_account) 6317 if (do_swap_account)
6284 entry->val = ent.val; 6318 entry->val = ent.val;
6285 6319
@@ -6320,7 +6354,7 @@ static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
6320 swp_entry_t swap = radix_to_swp_entry(page); 6354 swp_entry_t swap = radix_to_swp_entry(page);
6321 if (do_swap_account) 6355 if (do_swap_account)
6322 *entry = swap; 6356 *entry = swap;
6323 page = find_get_page(&swapper_space, swap.val); 6357 page = find_get_page(swap_address_space(swap), swap.val);
6324 } 6358 }
6325#endif 6359#endif
6326 return page; 6360 return page;
@@ -6530,8 +6564,15 @@ static int mem_cgroup_can_attach(struct cgroup *cgroup,
6530 struct task_struct *p = cgroup_taskset_first(tset); 6564 struct task_struct *p = cgroup_taskset_first(tset);
6531 int ret = 0; 6565 int ret = 0;
6532 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgroup); 6566 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgroup);
6567 unsigned long move_charge_at_immigrate;
6533 6568
6534 if (memcg->move_charge_at_immigrate) { 6569 /*
6570 * We are now committed to this value, whatever it is. Changes in this
6571 * tunable will only affect upcoming migrations, not the current one.
6572 * So we need to save it, and keep it going.
6573 */
6574 move_charge_at_immigrate = memcg->move_charge_at_immigrate;
6575 if (move_charge_at_immigrate) {
6535 struct mm_struct *mm; 6576 struct mm_struct *mm;
6536 struct mem_cgroup *from = mem_cgroup_from_task(p); 6577 struct mem_cgroup *from = mem_cgroup_from_task(p);
6537 6578
@@ -6551,6 +6592,7 @@ static int mem_cgroup_can_attach(struct cgroup *cgroup,
6551 spin_lock(&mc.lock); 6592 spin_lock(&mc.lock);
6552 mc.from = from; 6593 mc.from = from;
6553 mc.to = memcg; 6594 mc.to = memcg;
6595 mc.immigrate_flags = move_charge_at_immigrate;
6554 spin_unlock(&mc.lock); 6596 spin_unlock(&mc.lock);
6555 /* We set mc.moving_task later */ 6597 /* We set mc.moving_task later */
6556 6598
@@ -6745,6 +6787,7 @@ struct cgroup_subsys mem_cgroup_subsys = {
6745 .name = "memory", 6787 .name = "memory",
6746 .subsys_id = mem_cgroup_subsys_id, 6788 .subsys_id = mem_cgroup_subsys_id,
6747 .css_alloc = mem_cgroup_css_alloc, 6789 .css_alloc = mem_cgroup_css_alloc,
6790 .css_online = mem_cgroup_css_online,
6748 .css_offline = mem_cgroup_css_offline, 6791 .css_offline = mem_cgroup_css_offline,
6749 .css_free = mem_cgroup_css_free, 6792 .css_free = mem_cgroup_css_free,
6750 .can_attach = mem_cgroup_can_attach, 6793 .can_attach = mem_cgroup_can_attach,
@@ -6755,19 +6798,6 @@ struct cgroup_subsys mem_cgroup_subsys = {
6755 .use_id = 1, 6798 .use_id = 1,
6756}; 6799};
6757 6800
6758/*
6759 * The rest of init is performed during ->css_alloc() for root css which
6760 * happens before initcalls. hotcpu_notifier() can't be done together as
6761 * it would introduce circular locking by adding cgroup_lock -> cpu hotplug
6762 * dependency. Do it from a subsys_initcall().
6763 */
6764static int __init mem_cgroup_init(void)
6765{
6766 hotcpu_notifier(memcg_cpu_hotplug_callback, 0);
6767 return 0;
6768}
6769subsys_initcall(mem_cgroup_init);
6770
6771#ifdef CONFIG_MEMCG_SWAP 6801#ifdef CONFIG_MEMCG_SWAP
6772static int __init enable_swap_account(char *s) 6802static int __init enable_swap_account(char *s)
6773{ 6803{
@@ -6780,4 +6810,39 @@ static int __init enable_swap_account(char *s)
6780} 6810}
6781__setup("swapaccount=", enable_swap_account); 6811__setup("swapaccount=", enable_swap_account);
6782 6812
6813static void __init memsw_file_init(void)
6814{
6815 WARN_ON(cgroup_add_cftypes(&mem_cgroup_subsys, memsw_cgroup_files));
6816}
6817
6818static void __init enable_swap_cgroup(void)
6819{
6820 if (!mem_cgroup_disabled() && really_do_swap_account) {
6821 do_swap_account = 1;
6822 memsw_file_init();
6823 }
6824}
6825
6826#else
6827static void __init enable_swap_cgroup(void)
6828{
6829}
6783#endif 6830#endif
6831
6832/*
6833 * subsys_initcall() for memory controller.
6834 *
6835 * Some parts like hotcpu_notifier() have to be initialized from this context
6836 * because of lock dependencies (cgroup_lock -> cpu hotplug) but basically
6837 * everything that doesn't depend on a specific mem_cgroup structure should
6838 * be initialized from here.
6839 */
6840static int __init mem_cgroup_init(void)
6841{
6842 hotcpu_notifier(memcg_cpu_hotplug_callback, 0);
6843 enable_swap_cgroup();
6844 mem_cgroup_soft_limit_tree_init();
6845 memcg_stock_init();
6846 return 0;
6847}
6848subsys_initcall(mem_cgroup_init);
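
One user-visible consequence of moving the memsw.* entries into memsw_cgroup_files is that they are registered only when swap accounting is actually enabled, so the files may be legitimately absent. A small userspace sketch of probing for them; the mount point and group name below are assumptions about a typical cgroup v1 setup:

#include <errno.h>
#include <stdio.h>

int main(void)
{
	/* hypothetical path: adjust to your memory controller mount and group */
	const char *path =
		"/sys/fs/cgroup/memory/mygroup/memory.memsw.usage_in_bytes";
	FILE *f = fopen(path, "r");
	unsigned long long usage;

	if (!f) {
		if (errno == ENOENT)
			printf("swap accounting disabled, no memsw files\n");
		else
			perror("fopen");
		return 0;
	}
	if (fscanf(f, "%llu", &usage) == 1)
		printf("mem+swap usage: %llu bytes\n", usage);
	fclose(f);
	return 0;
}
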
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index c6e4dd3e1c08..df0694c6adef 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -61,7 +61,7 @@ int sysctl_memory_failure_early_kill __read_mostly = 0;
61 61
62int sysctl_memory_failure_recovery __read_mostly = 1; 62int sysctl_memory_failure_recovery __read_mostly = 1;
63 63
64atomic_long_t mce_bad_pages __read_mostly = ATOMIC_LONG_INIT(0); 64atomic_long_t num_poisoned_pages __read_mostly = ATOMIC_LONG_INIT(0);
65 65
66#if defined(CONFIG_HWPOISON_INJECT) || defined(CONFIG_HWPOISON_INJECT_MODULE) 66#if defined(CONFIG_HWPOISON_INJECT) || defined(CONFIG_HWPOISON_INJECT_MODULE)
67 67
@@ -784,12 +784,12 @@ static struct page_state {
784 { sc|dirty, sc|dirty, "dirty swapcache", me_swapcache_dirty }, 784 { sc|dirty, sc|dirty, "dirty swapcache", me_swapcache_dirty },
785 { sc|dirty, sc, "clean swapcache", me_swapcache_clean }, 785 { sc|dirty, sc, "clean swapcache", me_swapcache_clean },
786 786
787 { unevict|dirty, unevict|dirty, "dirty unevictable LRU", me_pagecache_dirty },
788 { unevict, unevict, "clean unevictable LRU", me_pagecache_clean },
789
790 { mlock|dirty, mlock|dirty, "dirty mlocked LRU", me_pagecache_dirty }, 787 { mlock|dirty, mlock|dirty, "dirty mlocked LRU", me_pagecache_dirty },
791 { mlock, mlock, "clean mlocked LRU", me_pagecache_clean }, 788 { mlock, mlock, "clean mlocked LRU", me_pagecache_clean },
792 789
790 { unevict|dirty, unevict|dirty, "dirty unevictable LRU", me_pagecache_dirty },
791 { unevict, unevict, "clean unevictable LRU", me_pagecache_clean },
792
793 { lru|dirty, lru|dirty, "dirty LRU", me_pagecache_dirty }, 793 { lru|dirty, lru|dirty, "dirty LRU", me_pagecache_dirty },
794 { lru|dirty, lru, "clean LRU", me_pagecache_clean }, 794 { lru|dirty, lru, "clean LRU", me_pagecache_clean },
795 795
@@ -1021,6 +1021,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
1021 struct page *hpage; 1021 struct page *hpage;
1022 int res; 1022 int res;
1023 unsigned int nr_pages; 1023 unsigned int nr_pages;
1024 unsigned long page_flags;
1024 1025
1025 if (!sysctl_memory_failure_recovery) 1026 if (!sysctl_memory_failure_recovery)
1026 panic("Memory failure from trap %d on page %lx", trapno, pfn); 1027 panic("Memory failure from trap %d on page %lx", trapno, pfn);
@@ -1039,8 +1040,18 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
1039 return 0; 1040 return 0;
1040 } 1041 }
1041 1042
1042 nr_pages = 1 << compound_trans_order(hpage); 1043 /*
1043 atomic_long_add(nr_pages, &mce_bad_pages); 1044 * Currently errors on hugetlbfs pages are measured in hugepage units,
1045 * so nr_pages should be 1 << compound_order. OTOH when errors are on
1046 * transparent hugepages, they are supposed to be split and error
1047 * measurement is done in normal page units. So nr_pages should be one
1048 * in this case.
1049 */
1050 if (PageHuge(p))
1051 nr_pages = 1 << compound_order(hpage);
1052 else /* normal page or thp */
1053 nr_pages = 1;
1054 atomic_long_add(nr_pages, &num_poisoned_pages);
1044 1055
1045 /* 1056 /*
1046 * We need/can do nothing about count=0 pages. 1057 * We need/can do nothing about count=0 pages.
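
The nr_pages distinction above is only about the accounting unit: errors on hugetlbfs pages are counted per hugepage, while errors on transparent hugepages are counted per base page because the THP is split before handling. With the usual 4 KiB base pages and 2 MiB hugepages that works out as:

#include <stdio.h>

int main(void)
{
	unsigned int compound_order = 9;			/* 2 MiB / 4 KiB */
	unsigned int hugetlb_nr_pages = 1u << compound_order;	/* 512 pages */
	unsigned int thp_nr_pages = 1;				/* split first */

	printf("hugetlbfs error: %u pages, thp error: %u page\n",
	       hugetlb_nr_pages, thp_nr_pages);
	return 0;
}
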
@@ -1070,7 +1081,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
1070 if (!PageHWPoison(hpage) 1081 if (!PageHWPoison(hpage)
1071 || (hwpoison_filter(p) && TestClearPageHWPoison(p)) 1082 || (hwpoison_filter(p) && TestClearPageHWPoison(p))
1072 || (p != hpage && TestSetPageHWPoison(hpage))) { 1083 || (p != hpage && TestSetPageHWPoison(hpage))) {
1073 atomic_long_sub(nr_pages, &mce_bad_pages); 1084 atomic_long_sub(nr_pages, &num_poisoned_pages);
1074 return 0; 1085 return 0;
1075 } 1086 }
1076 set_page_hwpoison_huge_page(hpage); 1087 set_page_hwpoison_huge_page(hpage);
@@ -1119,6 +1130,15 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
1119 lock_page(hpage); 1130 lock_page(hpage);
1120 1131
1121 /* 1132 /*
1133 * We use page flags to determine what action should be taken, but
1134 * the flags can be modified by the error containment action. One
1135 * example is an mlocked page, where PG_mlocked is cleared by
1136 * page_remove_rmap() in try_to_unmap_one(). So to determine page status
1137 * correctly, we save a copy of the page flags at this time.
1138 */
1139 page_flags = p->flags;
1140
1141 /*
1122 * unpoison always clear PG_hwpoison inside page lock 1142 * unpoison always clear PG_hwpoison inside page lock
1123 */ 1143 */
1124 if (!PageHWPoison(p)) { 1144 if (!PageHWPoison(p)) {
@@ -1128,7 +1148,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
1128 } 1148 }
1129 if (hwpoison_filter(p)) { 1149 if (hwpoison_filter(p)) {
1130 if (TestClearPageHWPoison(p)) 1150 if (TestClearPageHWPoison(p))
1131 atomic_long_sub(nr_pages, &mce_bad_pages); 1151 atomic_long_sub(nr_pages, &num_poisoned_pages);
1132 unlock_page(hpage); 1152 unlock_page(hpage);
1133 put_page(hpage); 1153 put_page(hpage);
1134 return 0; 1154 return 0;
@@ -1176,12 +1196,19 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
1176 } 1196 }
1177 1197
1178 res = -EBUSY; 1198 res = -EBUSY;
1179 for (ps = error_states;; ps++) { 1199 /*
1180 if ((p->flags & ps->mask) == ps->res) { 1200 * The first check uses the current page flags which may not have any
1181 res = page_action(ps, p, pfn); 1201 * relevant information. The second check with the saved page flagss is
1202 * carried out only if the first check can't determine the page status.
1203 */
1204 for (ps = error_states;; ps++)
1205 if ((p->flags & ps->mask) == ps->res)
1182 break; 1206 break;
1183 } 1207 if (!ps->mask)
1184 } 1208 for (ps = error_states;; ps++)
1209 if ((page_flags & ps->mask) == ps->res)
1210 break;
1211 res = page_action(ps, p, pfn);
1185out: 1212out:
1186 unlock_page(hpage); 1213 unlock_page(hpage);
1187 return res; 1214 return res;
@@ -1323,7 +1350,7 @@ int unpoison_memory(unsigned long pfn)
1323 return 0; 1350 return 0;
1324 } 1351 }
1325 if (TestClearPageHWPoison(p)) 1352 if (TestClearPageHWPoison(p))
1326 atomic_long_sub(nr_pages, &mce_bad_pages); 1353 atomic_long_sub(nr_pages, &num_poisoned_pages);
1327 pr_info("MCE: Software-unpoisoned free page %#lx\n", pfn); 1354 pr_info("MCE: Software-unpoisoned free page %#lx\n", pfn);
1328 return 0; 1355 return 0;
1329 } 1356 }
@@ -1337,7 +1364,7 @@ int unpoison_memory(unsigned long pfn)
1337 */ 1364 */
1338 if (TestClearPageHWPoison(page)) { 1365 if (TestClearPageHWPoison(page)) {
1339 pr_info("MCE: Software-unpoisoned page %#lx\n", pfn); 1366 pr_info("MCE: Software-unpoisoned page %#lx\n", pfn);
1340 atomic_long_sub(nr_pages, &mce_bad_pages); 1367 atomic_long_sub(nr_pages, &num_poisoned_pages);
1341 freeit = 1; 1368 freeit = 1;
1342 if (PageHuge(page)) 1369 if (PageHuge(page))
1343 clear_page_hwpoison_huge_page(page); 1370 clear_page_hwpoison_huge_page(page);
@@ -1368,7 +1395,7 @@ static struct page *new_page(struct page *p, unsigned long private, int **x)
1368 * that is not free, and 1 for any other page type. 1395 * that is not free, and 1 for any other page type.
1369 * For 1 the page is returned with increased page count, otherwise not. 1396 * For 1 the page is returned with increased page count, otherwise not.
1370 */ 1397 */
1371static int get_any_page(struct page *p, unsigned long pfn, int flags) 1398static int __get_any_page(struct page *p, unsigned long pfn, int flags)
1372{ 1399{
1373 int ret; 1400 int ret;
1374 1401
@@ -1393,11 +1420,9 @@ static int get_any_page(struct page *p, unsigned long pfn, int flags)
1393 if (!get_page_unless_zero(compound_head(p))) { 1420 if (!get_page_unless_zero(compound_head(p))) {
1394 if (PageHuge(p)) { 1421 if (PageHuge(p)) {
1395 pr_info("%s: %#lx free huge page\n", __func__, pfn); 1422 pr_info("%s: %#lx free huge page\n", __func__, pfn);
1396 ret = dequeue_hwpoisoned_huge_page(compound_head(p)); 1423 ret = 0;
1397 } else if (is_free_buddy_page(p)) { 1424 } else if (is_free_buddy_page(p)) {
1398 pr_info("%s: %#lx free buddy page\n", __func__, pfn); 1425 pr_info("%s: %#lx free buddy page\n", __func__, pfn);
1399 /* Set hwpoison bit while page is still isolated */
1400 SetPageHWPoison(p);
1401 ret = 0; 1426 ret = 0;
1402 } else { 1427 } else {
1403 pr_info("%s: %#lx: unknown zero refcount page type %lx\n", 1428 pr_info("%s: %#lx: unknown zero refcount page type %lx\n",
@@ -1413,43 +1438,68 @@ static int get_any_page(struct page *p, unsigned long pfn, int flags)
1413 return ret; 1438 return ret;
1414} 1439}
1415 1440
1441static int get_any_page(struct page *page, unsigned long pfn, int flags)
1442{
1443 int ret = __get_any_page(page, pfn, flags);
1444
1445 if (ret == 1 && !PageHuge(page) && !PageLRU(page)) {
1446 /*
1447 * Try to free it.
1448 */
1449 put_page(page);
1450 shake_page(page, 1);
1451
1452 /*
1453 * Did it turn free?
1454 */
1455 ret = __get_any_page(page, pfn, 0);
1456 if (!PageLRU(page)) {
1457 pr_info("soft_offline: %#lx: unknown non LRU page type %lx\n",
1458 pfn, page->flags);
1459 return -EIO;
1460 }
1461 }
1462 return ret;
1463}
1464
1416static int soft_offline_huge_page(struct page *page, int flags) 1465static int soft_offline_huge_page(struct page *page, int flags)
1417{ 1466{
1418 int ret; 1467 int ret;
1419 unsigned long pfn = page_to_pfn(page); 1468 unsigned long pfn = page_to_pfn(page);
1420 struct page *hpage = compound_head(page); 1469 struct page *hpage = compound_head(page);
1421 1470
1422 ret = get_any_page(page, pfn, flags); 1471 /*
1423 if (ret < 0) 1472 * This double-check of PageHWPoison is to avoid the race with
1424 return ret; 1473 * memory_failure(). See also comment in __soft_offline_page().
1425 if (ret == 0) 1474 */
1426 goto done; 1475 lock_page(hpage);
1427
1428 if (PageHWPoison(hpage)) { 1476 if (PageHWPoison(hpage)) {
1477 unlock_page(hpage);
1429 put_page(hpage); 1478 put_page(hpage);
1430 pr_info("soft offline: %#lx hugepage already poisoned\n", pfn); 1479 pr_info("soft offline: %#lx hugepage already poisoned\n", pfn);
1431 return -EBUSY; 1480 return -EBUSY;
1432 } 1481 }
1482 unlock_page(hpage);
1433 1483
1434 /* Keep page count to indicate a given hugepage is isolated. */ 1484 /* Keep page count to indicate a given hugepage is isolated. */
1435 ret = migrate_huge_page(hpage, new_page, MPOL_MF_MOVE_ALL, false, 1485 ret = migrate_huge_page(hpage, new_page, MPOL_MF_MOVE_ALL,
1436 MIGRATE_SYNC); 1486 MIGRATE_SYNC);
1437 put_page(hpage); 1487 put_page(hpage);
1438 if (ret) { 1488 if (ret) {
1439 pr_info("soft offline: %#lx: migration failed %d, type %lx\n", 1489 pr_info("soft offline: %#lx: migration failed %d, type %lx\n",
1440 pfn, ret, page->flags); 1490 pfn, ret, page->flags);
1441 return ret; 1491 } else {
1442 } 1492 set_page_hwpoison_huge_page(hpage);
1443done: 1493 dequeue_hwpoisoned_huge_page(hpage);
1444 if (!PageHWPoison(hpage))
1445 atomic_long_add(1 << compound_trans_order(hpage), 1494 atomic_long_add(1 << compound_trans_order(hpage),
1446 &mce_bad_pages); 1495 &num_poisoned_pages);
1447 set_page_hwpoison_huge_page(hpage); 1496 }
1448 dequeue_hwpoisoned_huge_page(hpage);
1449 /* keep elevated page count for bad page */ 1497 /* keep elevated page count for bad page */
1450 return ret; 1498 return ret;
1451} 1499}
1452 1500
1501static int __soft_offline_page(struct page *page, int flags);
1502
1453/** 1503/**
1454 * soft_offline_page - Soft offline a page. 1504 * soft_offline_page - Soft offline a page.
1455 * @page: page to offline 1505 * @page: page to offline
@@ -1478,9 +1528,11 @@ int soft_offline_page(struct page *page, int flags)
1478 unsigned long pfn = page_to_pfn(page); 1528 unsigned long pfn = page_to_pfn(page);
1479 struct page *hpage = compound_trans_head(page); 1529 struct page *hpage = compound_trans_head(page);
1480 1530
1481 if (PageHuge(page)) 1531 if (PageHWPoison(page)) {
1482 return soft_offline_huge_page(page, flags); 1532 pr_info("soft offline: %#lx page already poisoned\n", pfn);
1483 if (PageTransHuge(hpage)) { 1533 return -EBUSY;
1534 }
1535 if (!PageHuge(page) && PageTransHuge(hpage)) {
1484 if (PageAnon(hpage) && unlikely(split_huge_page(hpage))) { 1536 if (PageAnon(hpage) && unlikely(split_huge_page(hpage))) {
1485 pr_info("soft offline: %#lx: failed to split THP\n", 1537 pr_info("soft offline: %#lx: failed to split THP\n",
1486 pfn); 1538 pfn);
@@ -1491,47 +1543,45 @@ int soft_offline_page(struct page *page, int flags)
1491 ret = get_any_page(page, pfn, flags); 1543 ret = get_any_page(page, pfn, flags);
1492 if (ret < 0) 1544 if (ret < 0)
1493 return ret; 1545 return ret;
1494 if (ret == 0) 1546 if (ret) { /* for in-use pages */
1495 goto done; 1547 if (PageHuge(page))
1496 1548 ret = soft_offline_huge_page(page, flags);
1497 /* 1549 else
1498 * Page cache page we can handle? 1550 ret = __soft_offline_page(page, flags);
1499 */ 1551 } else { /* for free pages */
1500 if (!PageLRU(page)) { 1552 if (PageHuge(page)) {
1501 /* 1553 set_page_hwpoison_huge_page(hpage);
1502 * Try to free it. 1554 dequeue_hwpoisoned_huge_page(hpage);
1503 */ 1555 atomic_long_add(1 << compound_trans_order(hpage),
1504 put_page(page); 1556 &num_poisoned_pages);
1505 shake_page(page, 1); 1557 } else {
1506 1558 SetPageHWPoison(page);
1507 /* 1559 atomic_long_inc(&num_poisoned_pages);
1508 * Did it turn free? 1560 }
1509 */
1510 ret = get_any_page(page, pfn, 0);
1511 if (ret < 0)
1512 return ret;
1513 if (ret == 0)
1514 goto done;
1515 }
1516 if (!PageLRU(page)) {
1517 pr_info("soft_offline: %#lx: unknown non LRU page type %lx\n",
1518 pfn, page->flags);
1519 return -EIO;
1520 } 1561 }
1562 /* keep elevated page count for bad page */
1563 return ret;
1564}
1521 1565
1522 lock_page(page); 1566static int __soft_offline_page(struct page *page, int flags)
1523 wait_on_page_writeback(page); 1567{
1568 int ret;
1569 unsigned long pfn = page_to_pfn(page);
1524 1570
1525 /* 1571 /*
1526 * Synchronized using the page lock with memory_failure() 1572 * Check PageHWPoison again inside page lock because PageHWPoison
1573 * is set by memory_failure() outside page lock. Note that
1574 * memory_failure() also double-checks PageHWPoison inside page lock,
1575 * so there's no race between soft_offline_page() and memory_failure().
1527 */ 1576 */
1577 lock_page(page);
1578 wait_on_page_writeback(page);
1528 if (PageHWPoison(page)) { 1579 if (PageHWPoison(page)) {
1529 unlock_page(page); 1580 unlock_page(page);
1530 put_page(page); 1581 put_page(page);
1531 pr_info("soft offline: %#lx page already poisoned\n", pfn); 1582 pr_info("soft offline: %#lx page already poisoned\n", pfn);
1532 return -EBUSY; 1583 return -EBUSY;
1533 } 1584 }
1534
1535 /* 1585 /*
1536 * Try to invalidate first. This should work for 1586 * Try to invalidate first. This should work for
1537 * non dirty unmapped page cache pages. 1587 * non dirty unmapped page cache pages.
@@ -1544,9 +1594,10 @@ int soft_offline_page(struct page *page, int flags)
1544 */ 1594 */
1545 if (ret == 1) { 1595 if (ret == 1) {
1546 put_page(page); 1596 put_page(page);
1547 ret = 0;
1548 pr_info("soft_offline: %#lx: invalidated\n", pfn); 1597 pr_info("soft_offline: %#lx: invalidated\n", pfn);
1549 goto done; 1598 SetPageHWPoison(page);
1599 atomic_long_inc(&num_poisoned_pages);
1600 return 0;
1550 } 1601 }
1551 1602
1552 /* 1603 /*
@@ -1563,28 +1614,23 @@ int soft_offline_page(struct page *page, int flags)
1563 if (!ret) { 1614 if (!ret) {
1564 LIST_HEAD(pagelist); 1615 LIST_HEAD(pagelist);
1565 inc_zone_page_state(page, NR_ISOLATED_ANON + 1616 inc_zone_page_state(page, NR_ISOLATED_ANON +
1566 page_is_file_cache(page)); 1617 page_is_file_cache(page));
1567 list_add(&page->lru, &pagelist); 1618 list_add(&page->lru, &pagelist);
1568 ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 1619 ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL,
1569 false, MIGRATE_SYNC, 1620 MIGRATE_SYNC, MR_MEMORY_FAILURE);
1570 MR_MEMORY_FAILURE);
1571 if (ret) { 1621 if (ret) {
1572 putback_lru_pages(&pagelist); 1622 putback_lru_pages(&pagelist);
1573 pr_info("soft offline: %#lx: migration failed %d, type %lx\n", 1623 pr_info("soft offline: %#lx: migration failed %d, type %lx\n",
1574 pfn, ret, page->flags); 1624 pfn, ret, page->flags);
1575 if (ret > 0) 1625 if (ret > 0)
1576 ret = -EIO; 1626 ret = -EIO;
1627 } else {
1628 SetPageHWPoison(page);
1629 atomic_long_inc(&num_poisoned_pages);
1577 } 1630 }
1578 } else { 1631 } else {
1579 pr_info("soft offline: %#lx: isolation failed: %d, page count %d, type %lx\n", 1632 pr_info("soft offline: %#lx: isolation failed: %d, page count %d, type %lx\n",
1580 pfn, ret, page_count(page), page->flags); 1633 pfn, ret, page_count(page), page->flags);
1581 } 1634 }
1582 if (ret)
1583 return ret;
1584
1585done:
1586 atomic_long_add(1, &mce_bad_pages);
1587 SetPageHWPoison(page);
1588 /* keep elevated page count for bad page */
1589 return ret; 1635 return ret;
1590} 1636}
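
The hunk above splits soft-offline handling in two: soft_offline_page() only classifies the target via get_any_page() -- a positive return means an in-use page that gets migrated (soft_offline_huge_page() or __soft_offline_page()), zero means an already-free page that is simply marked poisoned -- while the invalidate/migrate work moves into __soft_offline_page(). A minimal userspace sketch of that dispatch shape, with every kernel helper replaced by an invented stub (nothing below is kernel code):

#include <stdio.h>

/* toy stand-ins for the kernel helpers used by the hunk above */
static int get_any_page(unsigned long pfn)        { return pfn & 1; } /* 1 = in use, 0 = free */
static int page_is_huge(unsigned long pfn)        { (void)pfn; return 0; }
static int soft_offline_huge(unsigned long pfn)   { (void)pfn; return 0; }
static int soft_offline_in_use(unsigned long pfn) { printf("migrate %#lx\n", pfn); return 0; }
static void mark_poisoned(unsigned long pfn)      { printf("poison %#lx\n", pfn); }

static int soft_offline_sketch(unsigned long pfn)
{
	int ret = get_any_page(pfn);

	if (ret < 0)
		return ret;
	if (ret) {			/* in-use page: migrate it away */
		if (page_is_huge(pfn))
			ret = soft_offline_huge(pfn);
		else
			ret = soft_offline_in_use(pfn);
	} else {			/* free page: just mark it poisoned */
		mark_poisoned(pfn);
	}
	return ret;
}

int main(void)
{
	soft_offline_sketch(0x1000);	/* even pfn: "free" in this toy model */
	soft_offline_sketch(0x1001);	/* odd pfn: "in use" */
	return 0;
}
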
diff --git a/mm/memory.c b/mm/memory.c
index bb1369f7b9b4..494526ae024a 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -69,6 +69,10 @@
69 69
70#include "internal.h" 70#include "internal.h"
71 71
72#ifdef LAST_NID_NOT_IN_PAGE_FLAGS
73#warning Unfortunate NUMA and NUMA Balancing config, growing page-frame for last_nid.
74#endif
75
72#ifndef CONFIG_NEED_MULTIPLE_NODES 76#ifndef CONFIG_NEED_MULTIPLE_NODES
73/* use the per-pgdat data instead for discontigmem - mbligh */ 77/* use the per-pgdat data instead for discontigmem - mbligh */
74unsigned long max_mapnr; 78unsigned long max_mapnr;
@@ -716,7 +720,7 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
716 print_symbol(KERN_ALERT "vma->vm_file->f_op->mmap: %s\n", 720 print_symbol(KERN_ALERT "vma->vm_file->f_op->mmap: %s\n",
717 (unsigned long)vma->vm_file->f_op->mmap); 721 (unsigned long)vma->vm_file->f_op->mmap);
718 dump_stack(); 722 dump_stack();
719 add_taint(TAINT_BAD_PAGE); 723 add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
720} 724}
721 725
722static inline bool is_cow_mapping(vm_flags_t flags) 726static inline bool is_cow_mapping(vm_flags_t flags)
@@ -1458,10 +1462,11 @@ int zap_vma_ptes(struct vm_area_struct *vma, unsigned long address,
1458EXPORT_SYMBOL_GPL(zap_vma_ptes); 1462EXPORT_SYMBOL_GPL(zap_vma_ptes);
1459 1463
1460/** 1464/**
1461 * follow_page - look up a page descriptor from a user-virtual address 1465 * follow_page_mask - look up a page descriptor from a user-virtual address
1462 * @vma: vm_area_struct mapping @address 1466 * @vma: vm_area_struct mapping @address
1463 * @address: virtual address to look up 1467 * @address: virtual address to look up
1464 * @flags: flags modifying lookup behaviour 1468 * @flags: flags modifying lookup behaviour
1469 * @page_mask: on output, *page_mask is set according to the size of the page
1465 * 1470 *
1466 * @flags can have FOLL_ flags set, defined in <linux/mm.h> 1471 * @flags can have FOLL_ flags set, defined in <linux/mm.h>
1467 * 1472 *
@@ -1469,8 +1474,9 @@ EXPORT_SYMBOL_GPL(zap_vma_ptes);
1469 * an error pointer if there is a mapping to something not represented 1474 * an error pointer if there is a mapping to something not represented
1470 * by a page descriptor (see also vm_normal_page()). 1475 * by a page descriptor (see also vm_normal_page()).
1471 */ 1476 */
1472struct page *follow_page(struct vm_area_struct *vma, unsigned long address, 1477struct page *follow_page_mask(struct vm_area_struct *vma,
1473 unsigned int flags) 1478 unsigned long address, unsigned int flags,
1479 unsigned int *page_mask)
1474{ 1480{
1475 pgd_t *pgd; 1481 pgd_t *pgd;
1476 pud_t *pud; 1482 pud_t *pud;
@@ -1480,6 +1486,8 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
1480 struct page *page; 1486 struct page *page;
1481 struct mm_struct *mm = vma->vm_mm; 1487 struct mm_struct *mm = vma->vm_mm;
1482 1488
1489 *page_mask = 0;
1490
1483 page = follow_huge_addr(mm, address, flags & FOLL_WRITE); 1491 page = follow_huge_addr(mm, address, flags & FOLL_WRITE);
1484 if (!IS_ERR(page)) { 1492 if (!IS_ERR(page)) {
1485 BUG_ON(flags & FOLL_GET); 1493 BUG_ON(flags & FOLL_GET);
@@ -1526,6 +1534,7 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
1526 page = follow_trans_huge_pmd(vma, address, 1534 page = follow_trans_huge_pmd(vma, address,
1527 pmd, flags); 1535 pmd, flags);
1528 spin_unlock(&mm->page_table_lock); 1536 spin_unlock(&mm->page_table_lock);
1537 *page_mask = HPAGE_PMD_NR - 1;
1529 goto out; 1538 goto out;
1530 } 1539 }
1531 } else 1540 } else
@@ -1539,8 +1548,24 @@ split_fallthrough:
1539 ptep = pte_offset_map_lock(mm, pmd, address, &ptl); 1548 ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
1540 1549
1541 pte = *ptep; 1550 pte = *ptep;
1542 if (!pte_present(pte)) 1551 if (!pte_present(pte)) {
1543 goto no_page; 1552 swp_entry_t entry;
1553 /*
1554 * KSM's break_ksm() relies upon recognizing a ksm page
1555 * even while it is being migrated, so for that case we
1556 * need migration_entry_wait().
1557 */
1558 if (likely(!(flags & FOLL_MIGRATION)))
1559 goto no_page;
1560 if (pte_none(pte) || pte_file(pte))
1561 goto no_page;
1562 entry = pte_to_swp_entry(pte);
1563 if (!is_migration_entry(entry))
1564 goto no_page;
1565 pte_unmap_unlock(ptep, ptl);
1566 migration_entry_wait(mm, pmd, address);
1567 goto split_fallthrough;
1568 }
1544 if ((flags & FOLL_NUMA) && pte_numa(pte)) 1569 if ((flags & FOLL_NUMA) && pte_numa(pte))
1545 goto no_page; 1570 goto no_page;
1546 if ((flags & FOLL_WRITE) && !pte_write(pte)) 1571 if ((flags & FOLL_WRITE) && !pte_write(pte))
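
The new FOLL_MIGRATION handling above lets a follow_page_mask() caller wait out a migration entry instead of failing: a non-present pte is decoded, and if it is a migration entry the page table lock is dropped, migration_entry_wait() sleeps, and the lookup restarts at split_fallthrough. A toy model of that retry loop, with an invented struct toy_pte standing in for the pte/swap-entry decoding (the FOLL_MIGRATION value below is arbitrary, not the kernel's):

#include <stdbool.h>
#include <stdio.h>

#define FOLL_MIGRATION 0x1	/* toy flag mirroring the new semantics */

struct toy_pte {
	bool present;
	bool migration_entry;	/* stands in for is_migration_entry(pte_to_swp_entry()) */
};

/* A not-present pte that is a migration entry is waited on (here: flipped to
 * present) and the lookup retried, instead of failing outright. */
static const char *toy_follow(struct toy_pte *pte, unsigned int flags)
{
retry:
	if (!pte->present) {
		if (!(flags & FOLL_MIGRATION))
			return "no page";
		if (!pte->migration_entry)
			return "no page";
		/* migration_entry_wait() would sleep here until migration finishes */
		pte->present = true;
		pte->migration_entry = false;
		goto retry;
	}
	return "page";
}

int main(void)
{
	struct toy_pte under_migration = { .present = false, .migration_entry = true };

	printf("without FOLL_MIGRATION: %s\n", toy_follow(&under_migration, 0));
	under_migration = (struct toy_pte){ false, true };
	printf("with FOLL_MIGRATION:    %s\n", toy_follow(&under_migration, FOLL_MIGRATION));
	return 0;
}
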
@@ -1673,15 +1698,16 @@ static inline int stack_guard_page(struct vm_area_struct *vma, unsigned long add
1673 * instead of __get_user_pages. __get_user_pages should be used only if 1698 * instead of __get_user_pages. __get_user_pages should be used only if
1674 * you need some special @gup_flags. 1699 * you need some special @gup_flags.
1675 */ 1700 */
1676int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, 1701long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1677 unsigned long start, int nr_pages, unsigned int gup_flags, 1702 unsigned long start, unsigned long nr_pages,
1678 struct page **pages, struct vm_area_struct **vmas, 1703 unsigned int gup_flags, struct page **pages,
1679 int *nonblocking) 1704 struct vm_area_struct **vmas, int *nonblocking)
1680{ 1705{
1681 int i; 1706 long i;
1682 unsigned long vm_flags; 1707 unsigned long vm_flags;
1708 unsigned int page_mask;
1683 1709
1684 if (nr_pages <= 0) 1710 if (!nr_pages)
1685 return 0; 1711 return 0;
1686 1712
1687 VM_BUG_ON(!!pages != !!(gup_flags & FOLL_GET)); 1713 VM_BUG_ON(!!pages != !!(gup_flags & FOLL_GET));
@@ -1757,6 +1783,7 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1757 get_page(page); 1783 get_page(page);
1758 } 1784 }
1759 pte_unmap(pte); 1785 pte_unmap(pte);
1786 page_mask = 0;
1760 goto next_page; 1787 goto next_page;
1761 } 1788 }
1762 1789
@@ -1774,6 +1801,7 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1774 do { 1801 do {
1775 struct page *page; 1802 struct page *page;
1776 unsigned int foll_flags = gup_flags; 1803 unsigned int foll_flags = gup_flags;
1804 unsigned int page_increm;
1777 1805
1778 /* 1806 /*
1779 * If we have a pending SIGKILL, don't keep faulting 1807 * If we have a pending SIGKILL, don't keep faulting
@@ -1783,7 +1811,8 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1783 return i ? i : -ERESTARTSYS; 1811 return i ? i : -ERESTARTSYS;
1784 1812
1785 cond_resched(); 1813 cond_resched();
1786 while (!(page = follow_page(vma, start, foll_flags))) { 1814 while (!(page = follow_page_mask(vma, start,
1815 foll_flags, &page_mask))) {
1787 int ret; 1816 int ret;
1788 unsigned int fault_flags = 0; 1817 unsigned int fault_flags = 0;
1789 1818
@@ -1857,13 +1886,19 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1857 1886
1858 flush_anon_page(vma, page, start); 1887 flush_anon_page(vma, page, start);
1859 flush_dcache_page(page); 1888 flush_dcache_page(page);
1889 page_mask = 0;
1860 } 1890 }
1861next_page: 1891next_page:
1862 if (vmas) 1892 if (vmas) {
1863 vmas[i] = vma; 1893 vmas[i] = vma;
1864 i++; 1894 page_mask = 0;
1865 start += PAGE_SIZE; 1895 }
1866 nr_pages--; 1896 page_increm = 1 + (~(start >> PAGE_SHIFT) & page_mask);
1897 if (page_increm > nr_pages)
1898 page_increm = nr_pages;
1899 i += page_increm;
1900 start += page_increm * PAGE_SIZE;
1901 nr_pages -= page_increm;
1867 } while (nr_pages && start < vma->vm_end); 1902 } while (nr_pages && start < vma->vm_end);
1868 } while (nr_pages); 1903 } while (nr_pages);
1869 return i; 1904 return i;
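
The page_increm arithmetic above advances the gup cursor over a whole huge page in one step: page_increm = 1 + (~(start >> PAGE_SHIFT) & page_mask) is the number of base pages from the current one to the end of the huge page, and collapses to 1 when page_mask is 0. A small standalone check of that formula, assuming 4K pages and a 512-page PMD-sized THP:

#include <stdio.h>

#define PAGE_SHIFT	12	/* assumed 4K pages */
#define HPAGE_PMD_NR	512	/* assumed 2M transparent huge page */

/* Number of base pages the caller may skip in one go: everything from the
 * current page to the end of the huge page. */
static unsigned long page_increm(unsigned long start, unsigned long page_mask)
{
	return 1 + (~(start >> PAGE_SHIFT) & page_mask);
}

int main(void)
{
	unsigned long page_mask = HPAGE_PMD_NR - 1;	/* 511 for a PMD-sized THP */
	unsigned long base = 0x200000;			/* 2M-aligned start of the THP */

	/* at the head page: all 512 subpages are covered at once */
	printf("%lu\n", page_increm(base, page_mask));			/* 512 */
	/* starting 4 pages into the THP: 508 subpages remain */
	printf("%lu\n", page_increm(base + 4 * 4096, page_mask));	/* 508 */
	/* page_mask == 0 (normal page): advance one page, as before */
	printf("%lu\n", page_increm(base, 0));				/* 1 */
	return 0;
}
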
@@ -1977,9 +2012,9 @@ int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
1977 * 2012 *
1978 * See also get_user_pages_fast, for performance critical applications. 2013 * See also get_user_pages_fast, for performance critical applications.
1979 */ 2014 */
1980int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, 2015long get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1981 unsigned long start, int nr_pages, int write, int force, 2016 unsigned long start, unsigned long nr_pages, int write,
1982 struct page **pages, struct vm_area_struct **vmas) 2017 int force, struct page **pages, struct vm_area_struct **vmas)
1983{ 2018{
1984 int flags = FOLL_TOUCH; 2019 int flags = FOLL_TOUCH;
1985 2020
@@ -2919,7 +2954,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
2919 unsigned int flags, pte_t orig_pte) 2954 unsigned int flags, pte_t orig_pte)
2920{ 2955{
2921 spinlock_t *ptl; 2956 spinlock_t *ptl;
2922 struct page *page, *swapcache = NULL; 2957 struct page *page, *swapcache;
2923 swp_entry_t entry; 2958 swp_entry_t entry;
2924 pte_t pte; 2959 pte_t pte;
2925 int locked; 2960 int locked;
@@ -2970,9 +3005,11 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
2970 */ 3005 */
2971 ret = VM_FAULT_HWPOISON; 3006 ret = VM_FAULT_HWPOISON;
2972 delayacct_clear_flag(DELAYACCT_PF_SWAPIN); 3007 delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
3008 swapcache = page;
2973 goto out_release; 3009 goto out_release;
2974 } 3010 }
2975 3011
3012 swapcache = page;
2976 locked = lock_page_or_retry(page, mm, flags); 3013 locked = lock_page_or_retry(page, mm, flags);
2977 3014
2978 delayacct_clear_flag(DELAYACCT_PF_SWAPIN); 3015 delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
@@ -2990,16 +3027,11 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
2990 if (unlikely(!PageSwapCache(page) || page_private(page) != entry.val)) 3027 if (unlikely(!PageSwapCache(page) || page_private(page) != entry.val))
2991 goto out_page; 3028 goto out_page;
2992 3029
2993 if (ksm_might_need_to_copy(page, vma, address)) { 3030 page = ksm_might_need_to_copy(page, vma, address);
2994 swapcache = page; 3031 if (unlikely(!page)) {
2995 page = ksm_does_need_to_copy(page, vma, address); 3032 ret = VM_FAULT_OOM;
2996 3033 page = swapcache;
2997 if (unlikely(!page)) { 3034 goto out_page;
2998 ret = VM_FAULT_OOM;
2999 page = swapcache;
3000 swapcache = NULL;
3001 goto out_page;
3002 }
3003 } 3035 }
3004 3036
3005 if (mem_cgroup_try_charge_swapin(mm, page, GFP_KERNEL, &ptr)) { 3037 if (mem_cgroup_try_charge_swapin(mm, page, GFP_KERNEL, &ptr)) {
@@ -3044,7 +3076,10 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
3044 } 3076 }
3045 flush_icache_page(vma, page); 3077 flush_icache_page(vma, page);
3046 set_pte_at(mm, address, page_table, pte); 3078 set_pte_at(mm, address, page_table, pte);
3047 do_page_add_anon_rmap(page, vma, address, exclusive); 3079 if (page == swapcache)
3080 do_page_add_anon_rmap(page, vma, address, exclusive);
3081 else /* ksm created a completely new copy */
3082 page_add_new_anon_rmap(page, vma, address);
3048 /* It's better to call commit-charge after rmap is established */ 3083 /* It's better to call commit-charge after rmap is established */
3049 mem_cgroup_commit_charge_swapin(page, ptr); 3084 mem_cgroup_commit_charge_swapin(page, ptr);
3050 3085
@@ -3052,7 +3087,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
3052 if (vm_swap_full() || (vma->vm_flags & VM_LOCKED) || PageMlocked(page)) 3087 if (vm_swap_full() || (vma->vm_flags & VM_LOCKED) || PageMlocked(page))
3053 try_to_free_swap(page); 3088 try_to_free_swap(page);
3054 unlock_page(page); 3089 unlock_page(page);
3055 if (swapcache) { 3090 if (page != swapcache) {
3056 /* 3091 /*
3057 * Hold the lock to avoid the swap entry to be reused 3092 * Hold the lock to avoid the swap entry to be reused
3058 * until we take the PT lock for the pte_same() check 3093 * until we take the PT lock for the pte_same() check
@@ -3085,7 +3120,7 @@ out_page:
3085 unlock_page(page); 3120 unlock_page(page);
3086out_release: 3121out_release:
3087 page_cache_release(page); 3122 page_cache_release(page);
3088 if (swapcache) { 3123 if (page != swapcache) {
3089 unlock_page(swapcache); 3124 unlock_page(swapcache);
3090 page_cache_release(swapcache); 3125 page_cache_release(swapcache);
3091 } 3126 }
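
The do_swap_page() changes above replace the separate swapcache-tracking variable with a calling convention: ksm_might_need_to_copy() now returns either the page it was given or a freshly allocated copy (NULL on OOM), and the caller tells the two apart with a plain pointer comparison (page != swapcache). A toy userspace illustration of that convention; might_need_to_copy() below is an invented stand-in, not the kernel helper:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Either hands back the page it was given or allocates a private copy
 * (NULL on allocation failure). */
static char *might_need_to_copy(char *page, int needs_copy)
{
	char *copy;

	if (!needs_copy)
		return page;
	copy = malloc(strlen(page) + 1);
	if (!copy)
		return NULL;
	strcpy(copy, page);
	return copy;
}

int main(void)
{
	char swapcache_data[] = "swapcache page";
	char *swapcache = swapcache_data;
	char *page = might_need_to_copy(swapcache, 1);

	if (!page) {		/* the OOM path restores page = swapcache and bails out */
		page = swapcache;
		return 1;
	}
	/* no separate flag: a pointer comparison says whether a copy was made */
	if (page != swapcache) {
		printf("got a private copy, release the swapcache reference\n");
		free(page);
	} else {
		printf("still using the swapcache page\n");
	}
	return 0;
}
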
@@ -3821,30 +3856,6 @@ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
3821} 3856}
3822#endif /* __PAGETABLE_PMD_FOLDED */ 3857#endif /* __PAGETABLE_PMD_FOLDED */
3823 3858
3824int make_pages_present(unsigned long addr, unsigned long end)
3825{
3826 int ret, len, write;
3827 struct vm_area_struct * vma;
3828
3829 vma = find_vma(current->mm, addr);
3830 if (!vma)
3831 return -ENOMEM;
3832 /*
3833 * We want to touch writable mappings with a write fault in order
3834 * to break COW, except for shared mappings because these don't COW
3835 * and we would not want to dirty them for nothing.
3836 */
3837 write = (vma->vm_flags & (VM_WRITE | VM_SHARED)) == VM_WRITE;
3838 BUG_ON(addr >= end);
3839 BUG_ON(end > vma->vm_end);
3840 len = DIV_ROUND_UP(end, PAGE_SIZE) - addr/PAGE_SIZE;
3841 ret = get_user_pages(current, current->mm, addr,
3842 len, write, 0, NULL, NULL);
3843 if (ret < 0)
3844 return ret;
3845 return ret == len ? 0 : -EFAULT;
3846}
3847
3848#if !defined(__HAVE_ARCH_GATE_AREA) 3859#if !defined(__HAVE_ARCH_GATE_AREA)
3849 3860
3850#if defined(AT_SYSINFO_EHDR) 3861#if defined(AT_SYSINFO_EHDR)
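
The removed make_pages_present() shows the page-count arithmetic its callers now have to do themselves before calling get_user_pages(): round the end address up to a page boundary and subtract the index of the start page. A small standalone check of that DIV_ROUND_UP expression, assuming 4K pages:

#include <stdio.h>

#define PAGE_SIZE 4096UL
#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

/* Page count the removed helper passed to get_user_pages() for [addr, end). */
static unsigned long pages_spanned(unsigned long addr, unsigned long end)
{
	return DIV_ROUND_UP(end, PAGE_SIZE) - addr / PAGE_SIZE;
}

int main(void)
{
	/* one byte past a page boundary still counts the extra page */
	printf("%lu\n", pages_spanned(0x1000, 0x3001));	/* 3 */
	/* exactly page-aligned range */
	printf("%lu\n", pages_spanned(0x1000, 0x3000));	/* 2 */
	return 0;
}
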
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index d04ed87bfacb..b81a367b9f39 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -29,6 +29,7 @@
29#include <linux/suspend.h> 29#include <linux/suspend.h>
30#include <linux/mm_inline.h> 30#include <linux/mm_inline.h>
31#include <linux/firmware-map.h> 31#include <linux/firmware-map.h>
32#include <linux/stop_machine.h>
32 33
33#include <asm/tlbflush.h> 34#include <asm/tlbflush.h>
34 35
@@ -91,9 +92,8 @@ static void release_memory_resource(struct resource *res)
91} 92}
92 93
93#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE 94#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
94#ifndef CONFIG_SPARSEMEM_VMEMMAP 95void get_page_bootmem(unsigned long info, struct page *page,
95static void get_page_bootmem(unsigned long info, struct page *page, 96 unsigned long type)
96 unsigned long type)
97{ 97{
98 page->lru.next = (struct list_head *) type; 98 page->lru.next = (struct list_head *) type;
99 SetPagePrivate(page); 99 SetPagePrivate(page);
@@ -124,10 +124,13 @@ void __ref put_page_bootmem(struct page *page)
124 mutex_lock(&ppb_lock); 124 mutex_lock(&ppb_lock);
125 __free_pages_bootmem(page, 0); 125 __free_pages_bootmem(page, 0);
126 mutex_unlock(&ppb_lock); 126 mutex_unlock(&ppb_lock);
127 totalram_pages++;
127 } 128 }
128 129
129} 130}
130 131
132#ifdef CONFIG_HAVE_BOOTMEM_INFO_NODE
133#ifndef CONFIG_SPARSEMEM_VMEMMAP
131static void register_page_bootmem_info_section(unsigned long start_pfn) 134static void register_page_bootmem_info_section(unsigned long start_pfn)
132{ 135{
133 unsigned long *usemap, mapsize, section_nr, i; 136 unsigned long *usemap, mapsize, section_nr, i;
@@ -161,6 +164,32 @@ static void register_page_bootmem_info_section(unsigned long start_pfn)
161 get_page_bootmem(section_nr, page, MIX_SECTION_INFO); 164 get_page_bootmem(section_nr, page, MIX_SECTION_INFO);
162 165
163} 166}
167#else /* CONFIG_SPARSEMEM_VMEMMAP */
168static void register_page_bootmem_info_section(unsigned long start_pfn)
169{
170 unsigned long *usemap, mapsize, section_nr, i;
171 struct mem_section *ms;
172 struct page *page, *memmap;
173
174 if (!pfn_valid(start_pfn))
175 return;
176
177 section_nr = pfn_to_section_nr(start_pfn);
178 ms = __nr_to_section(section_nr);
179
180 memmap = sparse_decode_mem_map(ms->section_mem_map, section_nr);
181
182 register_page_bootmem_memmap(section_nr, memmap, PAGES_PER_SECTION);
183
184 usemap = __nr_to_section(section_nr)->pageblock_flags;
185 page = virt_to_page(usemap);
186
187 mapsize = PAGE_ALIGN(usemap_size()) >> PAGE_SHIFT;
188
189 for (i = 0; i < mapsize; i++, page++)
190 get_page_bootmem(section_nr, page, MIX_SECTION_INFO);
191}
192#endif /* !CONFIG_SPARSEMEM_VMEMMAP */
164 193
165void register_page_bootmem_info_node(struct pglist_data *pgdat) 194void register_page_bootmem_info_node(struct pglist_data *pgdat)
166{ 195{
@@ -189,7 +218,7 @@ void register_page_bootmem_info_node(struct pglist_data *pgdat)
189 } 218 }
190 219
191 pfn = pgdat->node_start_pfn; 220 pfn = pgdat->node_start_pfn;
192 end_pfn = pfn + pgdat->node_spanned_pages; 221 end_pfn = pgdat_end_pfn(pgdat);
193 222
194 /* register_section info */ 223 /* register_section info */
195 for (; pfn < end_pfn; pfn += PAGES_PER_SECTION) { 224 for (; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
@@ -203,7 +232,7 @@ void register_page_bootmem_info_node(struct pglist_data *pgdat)
203 register_page_bootmem_info_section(pfn); 232 register_page_bootmem_info_section(pfn);
204 } 233 }
205} 234}
206#endif /* !CONFIG_SPARSEMEM_VMEMMAP */ 235#endif /* CONFIG_HAVE_BOOTMEM_INFO_NODE */
207 236
208static void grow_zone_span(struct zone *zone, unsigned long start_pfn, 237static void grow_zone_span(struct zone *zone, unsigned long start_pfn,
209 unsigned long end_pfn) 238 unsigned long end_pfn)
@@ -253,6 +282,17 @@ static void fix_zone_id(struct zone *zone, unsigned long start_pfn,
253 set_page_links(pfn_to_page(pfn), zid, nid, pfn); 282 set_page_links(pfn_to_page(pfn), zid, nid, pfn);
254} 283}
255 284
285/* Can fail with -ENOMEM from allocating a wait table with vmalloc() or
286 * alloc_bootmem_node_nopanic() */
287static int __ref ensure_zone_is_initialized(struct zone *zone,
288 unsigned long start_pfn, unsigned long num_pages)
289{
290 if (!zone_is_initialized(zone))
291 return init_currently_empty_zone(zone, start_pfn, num_pages,
292 MEMMAP_HOTPLUG);
293 return 0;
294}
295
256static int __meminit move_pfn_range_left(struct zone *z1, struct zone *z2, 296static int __meminit move_pfn_range_left(struct zone *z1, struct zone *z2,
257 unsigned long start_pfn, unsigned long end_pfn) 297 unsigned long start_pfn, unsigned long end_pfn)
258{ 298{
@@ -260,17 +300,14 @@ static int __meminit move_pfn_range_left(struct zone *z1, struct zone *z2,
260 unsigned long flags; 300 unsigned long flags;
261 unsigned long z1_start_pfn; 301 unsigned long z1_start_pfn;
262 302
263 if (!z1->wait_table) { 303 ret = ensure_zone_is_initialized(z1, start_pfn, end_pfn - start_pfn);
264 ret = init_currently_empty_zone(z1, start_pfn, 304 if (ret)
265 end_pfn - start_pfn, MEMMAP_HOTPLUG); 305 return ret;
266 if (ret)
267 return ret;
268 }
269 306
270 pgdat_resize_lock(z1->zone_pgdat, &flags); 307 pgdat_resize_lock(z1->zone_pgdat, &flags);
271 308
272 /* can't move pfns which are higher than @z2 */ 309 /* can't move pfns which are higher than @z2 */
273 if (end_pfn > z2->zone_start_pfn + z2->spanned_pages) 310 if (end_pfn > zone_end_pfn(z2))
274 goto out_fail; 311 goto out_fail;
275 /* the move out part mast at the left most of @z2 */ 312 /* the move out part mast at the left most of @z2 */
276 if (start_pfn > z2->zone_start_pfn) 313 if (start_pfn > z2->zone_start_pfn)
@@ -286,7 +323,7 @@ static int __meminit move_pfn_range_left(struct zone *z1, struct zone *z2,
286 z1_start_pfn = start_pfn; 323 z1_start_pfn = start_pfn;
287 324
288 resize_zone(z1, z1_start_pfn, end_pfn); 325 resize_zone(z1, z1_start_pfn, end_pfn);
289 resize_zone(z2, end_pfn, z2->zone_start_pfn + z2->spanned_pages); 326 resize_zone(z2, end_pfn, zone_end_pfn(z2));
290 327
291 pgdat_resize_unlock(z1->zone_pgdat, &flags); 328 pgdat_resize_unlock(z1->zone_pgdat, &flags);
292 329
@@ -305,12 +342,9 @@ static int __meminit move_pfn_range_right(struct zone *z1, struct zone *z2,
305 unsigned long flags; 342 unsigned long flags;
306 unsigned long z2_end_pfn; 343 unsigned long z2_end_pfn;
307 344
308 if (!z2->wait_table) { 345 ret = ensure_zone_is_initialized(z2, start_pfn, end_pfn - start_pfn);
309 ret = init_currently_empty_zone(z2, start_pfn, 346 if (ret)
310 end_pfn - start_pfn, MEMMAP_HOTPLUG); 347 return ret;
311 if (ret)
312 return ret;
313 }
314 348
315 pgdat_resize_lock(z1->zone_pgdat, &flags); 349 pgdat_resize_lock(z1->zone_pgdat, &flags);
316 350
@@ -318,15 +352,15 @@ static int __meminit move_pfn_range_right(struct zone *z1, struct zone *z2,
318 if (z1->zone_start_pfn > start_pfn) 352 if (z1->zone_start_pfn > start_pfn)
319 goto out_fail; 353 goto out_fail;
320 /* the move out part mast at the right most of @z1 */ 354 /* the move out part mast at the right most of @z1 */
321 if (z1->zone_start_pfn + z1->spanned_pages > end_pfn) 355 if (zone_end_pfn(z1) > end_pfn)
322 goto out_fail; 356 goto out_fail;
323 /* must included/overlap */ 357 /* must included/overlap */
324 if (start_pfn >= z1->zone_start_pfn + z1->spanned_pages) 358 if (start_pfn >= zone_end_pfn(z1))
325 goto out_fail; 359 goto out_fail;
326 360
327 /* use end_pfn for z2's end_pfn if z2 is empty */ 361 /* use end_pfn for z2's end_pfn if z2 is empty */
328 if (z2->spanned_pages) 362 if (z2->spanned_pages)
329 z2_end_pfn = z2->zone_start_pfn + z2->spanned_pages; 363 z2_end_pfn = zone_end_pfn(z2);
330 else 364 else
331 z2_end_pfn = end_pfn; 365 z2_end_pfn = end_pfn;
332 366
@@ -363,16 +397,13 @@ static int __meminit __add_zone(struct zone *zone, unsigned long phys_start_pfn)
363 int nid = pgdat->node_id; 397 int nid = pgdat->node_id;
364 int zone_type; 398 int zone_type;
365 unsigned long flags; 399 unsigned long flags;
400 int ret;
366 401
367 zone_type = zone - pgdat->node_zones; 402 zone_type = zone - pgdat->node_zones;
368 if (!zone->wait_table) { 403 ret = ensure_zone_is_initialized(zone, phys_start_pfn, nr_pages);
369 int ret; 404 if (ret)
405 return ret;
370 406
371 ret = init_currently_empty_zone(zone, phys_start_pfn,
372 nr_pages, MEMMAP_HOTPLUG);
373 if (ret)
374 return ret;
375 }
376 pgdat_resize_lock(zone->zone_pgdat, &flags); 407 pgdat_resize_lock(zone->zone_pgdat, &flags);
377 grow_zone_span(zone, phys_start_pfn, phys_start_pfn + nr_pages); 408 grow_zone_span(zone, phys_start_pfn, phys_start_pfn + nr_pages);
378 grow_pgdat_span(zone->zone_pgdat, phys_start_pfn, 409 grow_pgdat_span(zone->zone_pgdat, phys_start_pfn,
@@ -405,20 +436,211 @@ static int __meminit __add_section(int nid, struct zone *zone,
405 return register_new_memory(nid, __pfn_to_section(phys_start_pfn)); 436 return register_new_memory(nid, __pfn_to_section(phys_start_pfn));
406} 437}
407 438
408#ifdef CONFIG_SPARSEMEM_VMEMMAP 439/* find the smallest valid pfn in the range [start_pfn, end_pfn) */
409static int __remove_section(struct zone *zone, struct mem_section *ms) 440static int find_smallest_section_pfn(int nid, struct zone *zone,
441 unsigned long start_pfn,
442 unsigned long end_pfn)
443{
444 struct mem_section *ms;
445
446 for (; start_pfn < end_pfn; start_pfn += PAGES_PER_SECTION) {
447 ms = __pfn_to_section(start_pfn);
448
449 if (unlikely(!valid_section(ms)))
450 continue;
451
452 if (unlikely(pfn_to_nid(start_pfn) != nid))
453 continue;
454
455 if (zone && zone != page_zone(pfn_to_page(start_pfn)))
456 continue;
457
458 return start_pfn;
459 }
460
461 return 0;
462}
463
464/* find the biggest valid pfn in the range [start_pfn, end_pfn). */
465static int find_biggest_section_pfn(int nid, struct zone *zone,
466 unsigned long start_pfn,
467 unsigned long end_pfn)
468{
469 struct mem_section *ms;
470 unsigned long pfn;
471
472 /* pfn is the end pfn of a memory section. */
473 pfn = end_pfn - 1;
474 for (; pfn >= start_pfn; pfn -= PAGES_PER_SECTION) {
475 ms = __pfn_to_section(pfn);
476
477 if (unlikely(!valid_section(ms)))
478 continue;
479
480 if (unlikely(pfn_to_nid(pfn) != nid))
481 continue;
482
483 if (zone && zone != page_zone(pfn_to_page(pfn)))
484 continue;
485
486 return pfn;
487 }
488
489 return 0;
490}
491
492static void shrink_zone_span(struct zone *zone, unsigned long start_pfn,
493 unsigned long end_pfn)
410{ 494{
495 unsigned long zone_start_pfn = zone->zone_start_pfn;
496 unsigned long zone_end_pfn = zone->zone_start_pfn + zone->spanned_pages;
497 unsigned long pfn;
498 struct mem_section *ms;
499 int nid = zone_to_nid(zone);
500
501 zone_span_writelock(zone);
502 if (zone_start_pfn == start_pfn) {
503 /*
504 * If the section is the smallest section in the zone, we need to
505 * shrink zone->zone_start_pfn and zone->spanned_pages.
506 * In this case, we find the second smallest valid mem_section
507 * for shrinking the zone.
508 */
509 pfn = find_smallest_section_pfn(nid, zone, end_pfn,
510 zone_end_pfn);
511 if (pfn) {
512 zone->zone_start_pfn = pfn;
513 zone->spanned_pages = zone_end_pfn - pfn;
514 }
515 } else if (zone_end_pfn == end_pfn) {
516 /*
517 * If the section is the biggest section in the zone, we need to
518 * shrink zone->spanned_pages.
519 * In this case, we find the second biggest valid mem_section for
520 * shrinking the zone.
521 */
522 pfn = find_biggest_section_pfn(nid, zone, zone_start_pfn,
523 start_pfn);
524 if (pfn)
525 zone->spanned_pages = pfn - zone_start_pfn + 1;
526 }
527
411 /* 528 /*
412 * XXX: Freeing memmap with vmemmap is not implement yet. 529 * The section is not biggest or smallest mem_section in the zone, it
413 * This should be removed later. 530 * only creates a hole in the zone. So in this case, we need not
531 * change the zone. But perhaps, the zone has only hole data. Thus
532 * it checks whether the zone has only holes or not.
414 */ 533 */
415 return -EBUSY; 534 pfn = zone_start_pfn;
535 for (; pfn < zone_end_pfn; pfn += PAGES_PER_SECTION) {
536 ms = __pfn_to_section(pfn);
537
538 if (unlikely(!valid_section(ms)))
539 continue;
540
541 if (page_zone(pfn_to_page(pfn)) != zone)
542 continue;
543
544 /* If the section is current section, it continues the loop */
545 if (start_pfn == pfn)
546 continue;
547
548 /* If we find valid section, we have nothing to do */
549 zone_span_writeunlock(zone);
550 return;
551 }
552
553 /* The zone has no valid section */
554 zone->zone_start_pfn = 0;
555 zone->spanned_pages = 0;
556 zone_span_writeunlock(zone);
416} 557}
417#else 558
418static int __remove_section(struct zone *zone, struct mem_section *ms) 559static void shrink_pgdat_span(struct pglist_data *pgdat,
560 unsigned long start_pfn, unsigned long end_pfn)
561{
562 unsigned long pgdat_start_pfn = pgdat->node_start_pfn;
563 unsigned long pgdat_end_pfn =
564 pgdat->node_start_pfn + pgdat->node_spanned_pages;
565 unsigned long pfn;
566 struct mem_section *ms;
567 int nid = pgdat->node_id;
568
569 if (pgdat_start_pfn == start_pfn) {
570 /*
571 * If the section is the smallest section in the pgdat, we need to
572 * shrink pgdat->node_start_pfn and pgdat->node_spanned_pages.
573 * In this case, we find the second smallest valid mem_section
574 * for shrinking the pgdat.
575 */
576 pfn = find_smallest_section_pfn(nid, NULL, end_pfn,
577 pgdat_end_pfn);
578 if (pfn) {
579 pgdat->node_start_pfn = pfn;
580 pgdat->node_spanned_pages = pgdat_end_pfn - pfn;
581 }
582 } else if (pgdat_end_pfn == end_pfn) {
583 /*
584 * If the section is the biggest section in the pgdat, we need to
585 * shrink pgdat->node_spanned_pages.
586 * In this case, we find the second biggest valid mem_section for
587 * shrinking the pgdat.
588 */
589 pfn = find_biggest_section_pfn(nid, NULL, pgdat_start_pfn,
590 start_pfn);
591 if (pfn)
592 pgdat->node_spanned_pages = pfn - pgdat_start_pfn + 1;
593 }
594
595 /*
596 * If the section is neither the biggest nor the smallest mem_section
597 * in the pgdat, it only creates a hole in the pgdat, so we need not
598 * change the pgdat's span.
599 * But perhaps the pgdat consists of holes only, so check whether
600 * any valid section remains.
601 */
602 pfn = pgdat_start_pfn;
603 for (; pfn < pgdat_end_pfn; pfn += PAGES_PER_SECTION) {
604 ms = __pfn_to_section(pfn);
605
606 if (unlikely(!valid_section(ms)))
607 continue;
608
609 if (pfn_to_nid(pfn) != nid)
610 continue;
611
612 /* If the section is current section, it continues the loop */
613 if (start_pfn == pfn)
614 continue;
615
616 /* If we find valid section, we have nothing to do */
617 return;
618 }
619
620 /* The pgdat has no valid section */
621 pgdat->node_start_pfn = 0;
622 pgdat->node_spanned_pages = 0;
623}
624
625static void __remove_zone(struct zone *zone, unsigned long start_pfn)
419{ 626{
420 unsigned long flags;
421 struct pglist_data *pgdat = zone->zone_pgdat; 627 struct pglist_data *pgdat = zone->zone_pgdat;
628 int nr_pages = PAGES_PER_SECTION;
629 int zone_type;
630 unsigned long flags;
631
632 zone_type = zone - pgdat->node_zones;
633
634 pgdat_resize_lock(zone->zone_pgdat, &flags);
635 shrink_zone_span(zone, start_pfn, start_pfn + nr_pages);
636 shrink_pgdat_span(pgdat, start_pfn, start_pfn + nr_pages);
637 pgdat_resize_unlock(zone->zone_pgdat, &flags);
638}
639
640static int __remove_section(struct zone *zone, struct mem_section *ms)
641{
642 unsigned long start_pfn;
643 int scn_nr;
422 int ret = -EINVAL; 644 int ret = -EINVAL;
423 645
424 if (!valid_section(ms)) 646 if (!valid_section(ms))
@@ -428,12 +650,13 @@ static int __remove_section(struct zone *zone, struct mem_section *ms)
428 if (ret) 650 if (ret)
429 return ret; 651 return ret;
430 652
431 pgdat_resize_lock(pgdat, &flags); 653 scn_nr = __section_nr(ms);
654 start_pfn = section_nr_to_pfn(scn_nr);
655 __remove_zone(zone, start_pfn);
656
432 sparse_remove_one_section(zone, ms); 657 sparse_remove_one_section(zone, ms);
433 pgdat_resize_unlock(pgdat, &flags);
434 return 0; 658 return 0;
435} 659}
436#endif
437 660
438/* 661/*
439 * Reasonably generic function for adding memory. It is 662 * Reasonably generic function for adding memory. It is
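
The shrink_zone_span()/shrink_pgdat_span() helpers added above adjust a span in one of three ways when a section is removed: move the start up if it was the first section, trim the length if it was the last one, or leave the span alone (a hole) otherwise, falling back to an empty span when no valid section remains. A toy sketch of the first three cases, assuming every remaining section is valid so the next valid section is simply the adjacent one (the real code searches with find_smallest/biggest_section_pfn and may collapse the span entirely):

#include <stdio.h>

#define SECTION_PAGES 32768UL	/* assumed: 128M sections with 4K pages */

struct toy_zone {
	unsigned long start_pfn;
	unsigned long spanned;
};

/* Mimics the three shrink_zone_span() cases for a toy zone in which every
 * remaining section is valid. */
static void toy_shrink(struct toy_zone *z, unsigned long section_start_pfn)
{
	unsigned long end_pfn = z->start_pfn + z->spanned;

	if (section_start_pfn == z->start_pfn) {
		/* removing the first section: start moves up, span shrinks */
		z->start_pfn += SECTION_PAGES;
		z->spanned = end_pfn - z->start_pfn;
	} else if (section_start_pfn + SECTION_PAGES == end_pfn) {
		/* removing the last section: only the span shrinks */
		z->spanned -= SECTION_PAGES;
	}
	/* removing a middle section only punches a hole: start/span untouched */
}

int main(void)
{
	struct toy_zone z = { .start_pfn = 0, .spanned = 4 * SECTION_PAGES };

	toy_shrink(&z, 2 * SECTION_PAGES);	/* middle section */
	printf("start=%lu spanned=%lu\n", z.start_pfn, z.spanned);	/* 0, 131072 */
	toy_shrink(&z, 0);			/* first section */
	printf("start=%lu spanned=%lu\n", z.start_pfn, z.spanned);	/* 32768, 98304 */
	toy_shrink(&z, 3 * SECTION_PAGES);	/* last section */
	printf("start=%lu spanned=%lu\n", z.start_pfn, z.spanned);	/* 32768, 65536 */
	return 0;
}
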
@@ -797,11 +1020,14 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start)
797 unsigned long zholes_size[MAX_NR_ZONES] = {0}; 1020 unsigned long zholes_size[MAX_NR_ZONES] = {0};
798 unsigned long start_pfn = start >> PAGE_SHIFT; 1021 unsigned long start_pfn = start >> PAGE_SHIFT;
799 1022
800 pgdat = arch_alloc_nodedata(nid); 1023 pgdat = NODE_DATA(nid);
801 if (!pgdat) 1024 if (!pgdat) {
802 return NULL; 1025 pgdat = arch_alloc_nodedata(nid);
1026 if (!pgdat)
1027 return NULL;
803 1028
804 arch_refresh_nodedata(nid, pgdat); 1029 arch_refresh_nodedata(nid, pgdat);
1030 }
805 1031
806 /* we can use NODE_DATA(nid) from here */ 1032 /* we can use NODE_DATA(nid) from here */
807 1033
@@ -854,7 +1080,8 @@ out:
854int __ref add_memory(int nid, u64 start, u64 size) 1080int __ref add_memory(int nid, u64 start, u64 size)
855{ 1081{
856 pg_data_t *pgdat = NULL; 1082 pg_data_t *pgdat = NULL;
857 int new_pgdat = 0; 1083 bool new_pgdat;
1084 bool new_node;
858 struct resource *res; 1085 struct resource *res;
859 int ret; 1086 int ret;
860 1087
@@ -865,12 +1092,16 @@ int __ref add_memory(int nid, u64 start, u64 size)
865 if (!res) 1092 if (!res)
866 goto out; 1093 goto out;
867 1094
868 if (!node_online(nid)) { 1095 { /* Stupid hack to suppress address-never-null warning */
1096 void *p = NODE_DATA(nid);
1097 new_pgdat = !p;
1098 }
1099 new_node = !node_online(nid);
1100 if (new_node) {
869 pgdat = hotadd_new_pgdat(nid, start); 1101 pgdat = hotadd_new_pgdat(nid, start);
870 ret = -ENOMEM; 1102 ret = -ENOMEM;
871 if (!pgdat) 1103 if (!pgdat)
872 goto error; 1104 goto error;
873 new_pgdat = 1;
874 } 1105 }
875 1106
876 /* call arch's memory hotadd */ 1107 /* call arch's memory hotadd */
@@ -882,7 +1113,7 @@ int __ref add_memory(int nid, u64 start, u64 size)
882 /* we online node here. we can't roll back from here. */ 1113 /* we online node here. we can't roll back from here. */
883 node_set_online(nid); 1114 node_set_online(nid);
884 1115
885 if (new_pgdat) { 1116 if (new_node) {
886 ret = register_one_node(nid); 1117 ret = register_one_node(nid);
887 /* 1118 /*
888 * If sysfs file of new node can't create, cpu on the node 1119 * If sysfs file of new node can't create, cpu on the node
@@ -901,8 +1132,7 @@ error:
901 /* rollback pgdat allocation and others */ 1132 /* rollback pgdat allocation and others */
902 if (new_pgdat) 1133 if (new_pgdat)
903 rollback_node_hotadd(nid, pgdat); 1134 rollback_node_hotadd(nid, pgdat);
904 if (res) 1135 release_memory_resource(res);
905 release_memory_resource(res);
906 1136
907out: 1137out:
908 unlock_memory_hotplug(); 1138 unlock_memory_hotplug();
@@ -1058,8 +1288,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
1058 * migrate_pages returns # of failed pages. 1288 * migrate_pages returns # of failed pages.
1059 */ 1289 */
1060 ret = migrate_pages(&source, alloc_migrate_target, 0, 1290 ret = migrate_pages(&source, alloc_migrate_target, 0,
1061 true, MIGRATE_SYNC, 1291 MIGRATE_SYNC, MR_MEMORY_HOTPLUG);
1062 MR_MEMORY_HOTPLUG);
1063 if (ret) 1292 if (ret)
1064 putback_lru_pages(&source); 1293 putback_lru_pages(&source);
1065 } 1294 }
@@ -1381,17 +1610,26 @@ int offline_pages(unsigned long start_pfn, unsigned long nr_pages)
1381 return __offline_pages(start_pfn, start_pfn + nr_pages, 120 * HZ); 1610 return __offline_pages(start_pfn, start_pfn + nr_pages, 120 * HZ);
1382} 1611}
1383 1612
1384int remove_memory(u64 start, u64 size) 1613/**
1614 * walk_memory_range - walks through all mem sections in [start_pfn, end_pfn)
1615 * @start_pfn: start pfn of the memory range
1616 * @end_pfn: end pfn of the memory range
1617 * @arg: argument passed to func
1618 * @func: callback for each memory section walked
1619 *
1620 * This function walks through all present mem sections in the range
1621 * [start_pfn, end_pfn) and calls func on each mem section.
1622 *
1623 * Returns the return value of func.
1624 */
1625static int walk_memory_range(unsigned long start_pfn, unsigned long end_pfn,
1626 void *arg, int (*func)(struct memory_block *, void *))
1385{ 1627{
1386 struct memory_block *mem = NULL; 1628 struct memory_block *mem = NULL;
1387 struct mem_section *section; 1629 struct mem_section *section;
1388 unsigned long start_pfn, end_pfn;
1389 unsigned long pfn, section_nr; 1630 unsigned long pfn, section_nr;
1390 int ret; 1631 int ret;
1391 1632
1392 start_pfn = PFN_DOWN(start);
1393 end_pfn = start_pfn + PFN_DOWN(size);
1394
1395 for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) { 1633 for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
1396 section_nr = pfn_to_section_nr(pfn); 1634 section_nr = pfn_to_section_nr(pfn);
1397 if (!present_section_nr(section_nr)) 1635 if (!present_section_nr(section_nr))
@@ -1408,7 +1646,7 @@ int remove_memory(u64 start, u64 size)
1408 if (!mem) 1646 if (!mem)
1409 continue; 1647 continue;
1410 1648
1411 ret = offline_memory_block(mem); 1649 ret = func(mem, arg);
1412 if (ret) { 1650 if (ret) {
1413 kobject_put(&mem->dev.kobj); 1651 kobject_put(&mem->dev.kobj);
1414 return ret; 1652 return ret;
@@ -1420,12 +1658,209 @@ int remove_memory(u64 start, u64 size)
1420 1658
1421 return 0; 1659 return 0;
1422} 1660}
1661
1662/**
1663 * offline_memory_block_cb - callback function for offlining memory block
1664 * @mem: the memory block to be offlined
1665 * @arg: buffer to hold error msg
1666 *
1667 * Always returns 0; any error code is stored in @arg.
1668 */
1669static int offline_memory_block_cb(struct memory_block *mem, void *arg)
1670{
1671 int *ret = arg;
1672 int error = offline_memory_block(mem);
1673
1674 if (error != 0 && *ret == 0)
1675 *ret = error;
1676
1677 return 0;
1678}
1679
1680static int is_memblock_offlined_cb(struct memory_block *mem, void *arg)
1681{
1682 int ret = !is_memblock_offlined(mem);
1683
1684 if (unlikely(ret))
1685 pr_warn("removing memory fails because memory range "
1686 "[%#010llx-%#010llx] is still online\n",
1687 PFN_PHYS(section_nr_to_pfn(mem->start_section_nr)),
1688 PFN_PHYS(section_nr_to_pfn(mem->end_section_nr + 1))-1);
1689
1690 return ret;
1691}
1692
1693static int check_cpu_on_node(void *data)
1694{
1695 struct pglist_data *pgdat = data;
1696 int cpu;
1697
1698 for_each_present_cpu(cpu) {
1699 if (cpu_to_node(cpu) == pgdat->node_id)
1700 /*
1701 * the cpu on this node isn't removed, and we can't
1702 * offline this node.
1703 */
1704 return -EBUSY;
1705 }
1706
1707 return 0;
1708}
1709
1710static void unmap_cpu_on_node(void *data)
1711{
1712#ifdef CONFIG_ACPI_NUMA
1713 struct pglist_data *pgdat = data;
1714 int cpu;
1715
1716 for_each_possible_cpu(cpu)
1717 if (cpu_to_node(cpu) == pgdat->node_id)
1718 numa_clear_node(cpu);
1719#endif
1720}
1721
1722static int check_and_unmap_cpu_on_node(void *data)
1723{
1724 int ret = check_cpu_on_node(data);
1725
1726 if (ret)
1727 return ret;
1728
1729 /*
1730 * the node will be offlined when we come here, so we can clear
1731 * the cpu_to_node() now.
1732 */
1733
1734 unmap_cpu_on_node(data);
1735 return 0;
1736}
1737
1738/* offline the node if all memory sections of this node are removed */
1739void try_offline_node(int nid)
1740{
1741 pg_data_t *pgdat = NODE_DATA(nid);
1742 unsigned long start_pfn = pgdat->node_start_pfn;
1743 unsigned long end_pfn = start_pfn + pgdat->node_spanned_pages;
1744 unsigned long pfn;
1745 struct page *pgdat_page = virt_to_page(pgdat);
1746 int i;
1747
1748 for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
1749 unsigned long section_nr = pfn_to_section_nr(pfn);
1750
1751 if (!present_section_nr(section_nr))
1752 continue;
1753
1754 if (pfn_to_nid(pfn) != nid)
1755 continue;
1756
1757 /*
1758 * some memory sections of this node are not removed, and we
1759 * can't offline node now.
1760 */
1761 return;
1762 }
1763
1764 if (stop_machine(check_and_unmap_cpu_on_node, pgdat, NULL))
1765 return;
1766
1767 /*
1768 * all memory/cpu of this node are removed, we can offline this
1769 * node now.
1770 */
1771 node_set_offline(nid);
1772 unregister_one_node(nid);
1773
1774 if (!PageSlab(pgdat_page) && !PageCompound(pgdat_page))
1775 /* node data is allocated from boot memory */
1776 return;
1777
1778 /* free waittable in each zone */
1779 for (i = 0; i < MAX_NR_ZONES; i++) {
1780 struct zone *zone = pgdat->node_zones + i;
1781
1782 if (zone->wait_table)
1783 vfree(zone->wait_table);
1784 }
1785
1786 /*
1787 * Since there is no way to guarantee the address of pgdat/zone is not
1788 * on the stack of any kernel thread or used by other kernel objects
1789 * without reference counting or another synchronizing method, do not
1790 * reset node_data and free pgdat here. Just reset it to 0 and reuse
1791 * the memory when the node is onlined again.
1792 */
1793 memset(pgdat, 0, sizeof(*pgdat));
1794}
1795EXPORT_SYMBOL(try_offline_node);
1796
1797int __ref remove_memory(int nid, u64 start, u64 size)
1798{
1799 unsigned long start_pfn, end_pfn;
1800 int ret = 0;
1801 int retry = 1;
1802
1803 start_pfn = PFN_DOWN(start);
1804 end_pfn = start_pfn + PFN_DOWN(size);
1805
1806 /*
1807 * When CONFIG_MEMCG is on, one memory block may be used by other
1808 * blocks to store page cgroup when onlining pages. But we don't know
1809 * in what order pages are onlined. So we iterate twice to offline
1810 * memory:
1811 * 1st pass: offline every non-primary memory block.
1812 * 2nd pass: offline the primary (i.e. first added) memory block.
1813 */
1814repeat:
1815 walk_memory_range(start_pfn, end_pfn, &ret,
1816 offline_memory_block_cb);
1817 if (ret) {
1818 if (!retry)
1819 return ret;
1820
1821 retry = 0;
1822 ret = 0;
1823 goto repeat;
1824 }
1825
1826 lock_memory_hotplug();
1827
1828 /*
1829 * we have offlined all memory blocks like this:
1830 * 1. lock memory hotplug
1831 * 2. offline a memory block
1832 * 3. unlock memory hotplug
1833 *
1834 * repeat steps 1-3 to offline every memory block. All memory blocks
1835 * must be offlined before removing memory. But we do not hold the
1836 * lock across the whole operation, so we should check whether all
1837 * memory blocks are offlined.
1838 */
1839
1840 ret = walk_memory_range(start_pfn, end_pfn, NULL,
1841 is_memblock_offlined_cb);
1842 if (ret) {
1843 unlock_memory_hotplug();
1844 return ret;
1845 }
1846
1847 /* remove memmap entry */
1848 firmware_map_remove(start, start + size, "System RAM");
1849
1850 arch_remove_memory(start, size);
1851
1852 try_offline_node(nid);
1853
1854 unlock_memory_hotplug();
1855
1856 return 0;
1857}
1423#else 1858#else
1424int offline_pages(unsigned long start_pfn, unsigned long nr_pages) 1859int offline_pages(unsigned long start_pfn, unsigned long nr_pages)
1425{ 1860{
1426 return -EINVAL; 1861 return -EINVAL;
1427} 1862}
1428int remove_memory(u64 start, u64 size) 1863int remove_memory(int nid, u64 start, u64 size)
1429{ 1864{
1430 return -EINVAL; 1865 return -EINVAL;
1431} 1866}
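
remove_memory() above is built on walk_memory_range(): one walk with offline_memory_block_cb(), which records the first error but keeps walking, a single retry of that walk, then a second walk with is_memblock_offlined_cb() under the hotplug lock to confirm every block really went offline. A toy userspace version of that walk-plus-callbacks pattern; struct toy_block and the error value below are invented for the example:

#include <stdio.h>

struct toy_block { int id; int online; };

/* Visit each "memory block" in a range and hand it to a callback, stopping on
 * the first non-zero return -- the shape of walk_memory_range(). */
static int toy_walk(struct toy_block *blocks, int nr,
		    void *arg, int (*func)(struct toy_block *, void *))
{
	int i, ret;

	for (i = 0; i < nr; i++) {
		ret = func(&blocks[i], arg);
		if (ret)
			return ret;
	}
	return 0;
}

/* First walk: try to offline every block, remembering the first error in *arg
 * but always returning 0 so the walk keeps going. */
static int offline_cb(struct toy_block *b, void *arg)
{
	int *err = arg;

	if (b->id == 1 && b->online == 2) {	/* pretend block 1 fails on the first pass */
		b->online--;
		if (*err == 0)
			*err = -16;		/* -EBUSY */
		return 0;
	}
	b->online = 0;
	return 0;
}

/* Second walk: verify everything really went offline. */
static int check_cb(struct toy_block *b, void *arg)
{
	(void)arg;
	return b->online != 0;
}

int main(void)
{
	struct toy_block blocks[] = { {0, 1}, {1, 2}, {2, 1} };
	int err, retry = 1;

repeat:
	err = 0;
	toy_walk(blocks, 3, &err, offline_cb);
	if (err) {
		if (!retry)
			return 1;
		retry = 0;
		goto repeat;	/* second pass, mirroring remove_memory() */
	}
	printf("all offline: %s\n", toy_walk(blocks, 3, NULL, check_cb) ? "no" : "yes");
	return 0;
}
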
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index e2df1c1fb41f..31d26637b658 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -26,7 +26,7 @@
26 * the allocation to memory nodes instead 26 * the allocation to memory nodes instead
27 * 27 *
28 * preferred Try a specific node first before normal fallback. 28 * preferred Try a specific node first before normal fallback.
29 * As a special case node -1 here means do the allocation 29 * As a special case NUMA_NO_NODE here means do the allocation
30 * on the local CPU. This is normally identical to default, 30 * on the local CPU. This is normally identical to default,
31 * but useful to set in a VMA when you have a non default 31 * but useful to set in a VMA when you have a non default
32 * process policy. 32 * process policy.
@@ -127,7 +127,7 @@ static struct mempolicy *get_task_policy(struct task_struct *p)
127 127
128 if (!pol) { 128 if (!pol) {
129 node = numa_node_id(); 129 node = numa_node_id();
130 if (node != -1) 130 if (node != NUMA_NO_NODE)
131 pol = &preferred_node_policy[node]; 131 pol = &preferred_node_policy[node];
132 132
133 /* preferred_node_policy is not initialised early in boot */ 133 /* preferred_node_policy is not initialised early in boot */
@@ -161,19 +161,7 @@ static const struct mempolicy_operations {
161/* Check that the nodemask contains at least one populated zone */ 161/* Check that the nodemask contains at least one populated zone */
162static int is_valid_nodemask(const nodemask_t *nodemask) 162static int is_valid_nodemask(const nodemask_t *nodemask)
163{ 163{
164 int nd, k; 164 return nodes_intersects(*nodemask, node_states[N_MEMORY]);
165
166 for_each_node_mask(nd, *nodemask) {
167 struct zone *z;
168
169 for (k = 0; k <= policy_zone; k++) {
170 z = &NODE_DATA(nd)->node_zones[k];
171 if (z->present_pages > 0)
172 return 1;
173 }
174 }
175
176 return 0;
177} 165}
178 166
179static inline int mpol_store_user_nodemask(const struct mempolicy *pol) 167static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
@@ -270,7 +258,7 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
270 struct mempolicy *policy; 258 struct mempolicy *policy;
271 259
272 pr_debug("setting mode %d flags %d nodes[0] %lx\n", 260 pr_debug("setting mode %d flags %d nodes[0] %lx\n",
273 mode, flags, nodes ? nodes_addr(*nodes)[0] : -1); 261 mode, flags, nodes ? nodes_addr(*nodes)[0] : NUMA_NO_NODE);
274 262
275 if (mode == MPOL_DEFAULT) { 263 if (mode == MPOL_DEFAULT) {
276 if (nodes && !nodes_empty(*nodes)) 264 if (nodes && !nodes_empty(*nodes))
@@ -508,9 +496,8 @@ static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
508 /* 496 /*
509 * vm_normal_page() filters out zero pages, but there might 497 * vm_normal_page() filters out zero pages, but there might
510 * still be PageReserved pages to skip, perhaps in a VDSO. 498 * still be PageReserved pages to skip, perhaps in a VDSO.
511 * And we cannot move PageKsm pages sensibly or safely yet.
512 */ 499 */
513 if (PageReserved(page) || PageKsm(page)) 500 if (PageReserved(page))
514 continue; 501 continue;
515 nid = page_to_nid(page); 502 nid = page_to_nid(page);
516 if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT)) 503 if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
@@ -1027,8 +1014,7 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest,
1027 1014
1028 if (!list_empty(&pagelist)) { 1015 if (!list_empty(&pagelist)) {
1029 err = migrate_pages(&pagelist, new_node_page, dest, 1016 err = migrate_pages(&pagelist, new_node_page, dest,
1030 false, MIGRATE_SYNC, 1017 MIGRATE_SYNC, MR_SYSCALL);
1031 MR_SYSCALL);
1032 if (err) 1018 if (err)
1033 putback_lru_pages(&pagelist); 1019 putback_lru_pages(&pagelist);
1034 } 1020 }
@@ -1235,7 +1221,7 @@ static long do_mbind(unsigned long start, unsigned long len,
1235 1221
1236 pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n", 1222 pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n",
1237 start, start + len, mode, mode_flags, 1223 start, start + len, mode, mode_flags,
1238 nmask ? nodes_addr(*nmask)[0] : -1); 1224 nmask ? nodes_addr(*nmask)[0] : NUMA_NO_NODE);
1239 1225
1240 if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) { 1226 if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
1241 1227
@@ -1272,9 +1258,8 @@ static long do_mbind(unsigned long start, unsigned long len,
1272 if (!list_empty(&pagelist)) { 1258 if (!list_empty(&pagelist)) {
1273 WARN_ON_ONCE(flags & MPOL_MF_LAZY); 1259 WARN_ON_ONCE(flags & MPOL_MF_LAZY);
1274 nr_failed = migrate_pages(&pagelist, new_vma_page, 1260 nr_failed = migrate_pages(&pagelist, new_vma_page,
1275 (unsigned long)vma, 1261 (unsigned long)vma,
1276 false, MIGRATE_SYNC, 1262 MIGRATE_SYNC, MR_MEMPOLICY_MBIND);
1277 MR_MEMPOLICY_MBIND);
1278 if (nr_failed) 1263 if (nr_failed)
1279 putback_lru_pages(&pagelist); 1264 putback_lru_pages(&pagelist);
1280 } 1265 }
@@ -1644,6 +1629,26 @@ struct mempolicy *get_vma_policy(struct task_struct *task,
1644 return pol; 1629 return pol;
1645} 1630}
1646 1631
1632static int apply_policy_zone(struct mempolicy *policy, enum zone_type zone)
1633{
1634 enum zone_type dynamic_policy_zone = policy_zone;
1635
1636 BUG_ON(dynamic_policy_zone == ZONE_MOVABLE);
1637
1638 /*
1639 * If policy->v.nodes has only movable memory, we apply the policy
1640 * only when gfp_zone(gfp) is ZONE_MOVABLE.
1641 *
1642 * policy->v.nodes is intersected with node_states[N_MEMORY] earlier,
1643 * so if the following test fails, it implies that
1644 * policy->v.nodes has movable memory only.
1645 */
1646 if (!nodes_intersects(policy->v.nodes, node_states[N_HIGH_MEMORY]))
1647 dynamic_policy_zone = ZONE_MOVABLE;
1648
1649 return zone >= dynamic_policy_zone;
1650}
1651
1647/* 1652/*
1648 * Return a nodemask representing a mempolicy for filtering nodes for 1653 * Return a nodemask representing a mempolicy for filtering nodes for
1649 * page allocation 1654 * page allocation
@@ -1652,7 +1657,7 @@ static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
1652{ 1657{
1653 /* Lower zones don't get a nodemask applied for MPOL_BIND */ 1658 /* Lower zones don't get a nodemask applied for MPOL_BIND */
1654 if (unlikely(policy->mode == MPOL_BIND) && 1659 if (unlikely(policy->mode == MPOL_BIND) &&
1655 gfp_zone(gfp) >= policy_zone && 1660 apply_policy_zone(policy, gfp_zone(gfp)) &&
1656 cpuset_nodemask_valid_mems_allowed(&policy->v.nodes)) 1661 cpuset_nodemask_valid_mems_allowed(&policy->v.nodes))
1657 return &policy->v.nodes; 1662 return &policy->v.nodes;
1658 1663
@@ -2308,7 +2313,7 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long
2308 * it less likely we act on an unlikely task<->page 2313 * it less likely we act on an unlikely task<->page
2309 * relation. 2314 * relation.
2310 */ 2315 */
2311 last_nid = page_xchg_last_nid(page, polnid); 2316 last_nid = page_nid_xchg_last(page, polnid);
2312 if (last_nid != polnid) 2317 if (last_nid != polnid)
2313 goto out; 2318 goto out;
2314 } 2319 }
@@ -2483,7 +2488,7 @@ int mpol_set_shared_policy(struct shared_policy *info,
2483 vma->vm_pgoff, 2488 vma->vm_pgoff,
2484 sz, npol ? npol->mode : -1, 2489 sz, npol ? npol->mode : -1,
2485 npol ? npol->flags : -1, 2490 npol ? npol->flags : -1,
2486 npol ? nodes_addr(npol->v.nodes)[0] : -1); 2491 npol ? nodes_addr(npol->v.nodes)[0] : NUMA_NO_NODE);
2487 2492
2488 if (npol) { 2493 if (npol) {
2489 new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol); 2494 new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
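
apply_policy_zone() above makes the MPOL_BIND nodemask conditional: normally it applies from policy_zone upwards, but if the policy's nodes intersect nothing in N_HIGH_MEMORY (i.e. they carry movable memory only) the threshold is raised to ZONE_MOVABLE, so only ZONE_MOVABLE allocations are constrained. A toy truth-table of that decision; the zone numbering below is invented for the example and only the ordering matters:

#include <stdbool.h>
#include <stdio.h>

/* toy zone ordering, mirroring enum zone_type just enough for the example */
enum { TOY_ZONE_NORMAL = 2, TOY_ZONE_MOVABLE = 3 };
static const int policy_zone = TOY_ZONE_NORMAL;	/* assumed: highest "regular" zone */

/* Returns true when the MPOL_BIND nodemask should constrain this allocation,
 * following the shape of apply_policy_zone() above. */
static bool toy_apply_policy_zone(bool nodes_have_regular_memory, int request_zone)
{
	int dynamic_policy_zone = policy_zone;

	/* a policy whose nodes carry only movable memory can only be honoured
	 * for ZONE_MOVABLE requests; anything lower must fall back */
	if (!nodes_have_regular_memory)
		dynamic_policy_zone = TOY_ZONE_MOVABLE;

	return request_zone >= dynamic_policy_zone;
}

int main(void)
{
	printf("regular nodes, NORMAL request:  %d\n",
	       toy_apply_policy_zone(true, TOY_ZONE_NORMAL));	/* 1: apply nodemask */
	printf("movable-only,  NORMAL request:  %d\n",
	       toy_apply_policy_zone(false, TOY_ZONE_NORMAL));	/* 0: ignore nodemask */
	printf("movable-only,  MOVABLE request: %d\n",
	       toy_apply_policy_zone(false, TOY_ZONE_MOVABLE));	/* 1: apply nodemask */
	return 0;
}
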
diff --git a/mm/migrate.c b/mm/migrate.c
index 3b676b0c5c3e..3bbaf5d230b0 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -160,8 +160,10 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma,
160 if (is_write_migration_entry(entry)) 160 if (is_write_migration_entry(entry))
161 pte = pte_mkwrite(pte); 161 pte = pte_mkwrite(pte);
162#ifdef CONFIG_HUGETLB_PAGE 162#ifdef CONFIG_HUGETLB_PAGE
163 if (PageHuge(new)) 163 if (PageHuge(new)) {
164 pte = pte_mkhuge(pte); 164 pte = pte_mkhuge(pte);
165 pte = arch_make_huge_pte(pte, vma, new, 0);
166 }
165#endif 167#endif
166 flush_cache_page(vma, addr, pte_pfn(pte)); 168 flush_cache_page(vma, addr, pte_pfn(pte));
167 set_pte_at(mm, addr, ptep, pte); 169 set_pte_at(mm, addr, ptep, pte);
@@ -462,7 +464,10 @@ void migrate_page_copy(struct page *newpage, struct page *page)
462 464
463 mlock_migrate_page(newpage, page); 465 mlock_migrate_page(newpage, page);
464 ksm_migrate_page(newpage, page); 466 ksm_migrate_page(newpage, page);
465 467 /*
468 * Please do not reorder this without considering how mm/ksm.c's
469 * get_ksm_page() depends upon ksm_migrate_page() and PageSwapCache().
470 */
466 ClearPageSwapCache(page); 471 ClearPageSwapCache(page);
467 ClearPagePrivate(page); 472 ClearPagePrivate(page);
468 set_page_private(page, 0); 473 set_page_private(page, 0);
@@ -696,7 +701,7 @@ static int move_to_new_page(struct page *newpage, struct page *page,
696} 701}
697 702
698static int __unmap_and_move(struct page *page, struct page *newpage, 703static int __unmap_and_move(struct page *page, struct page *newpage,
699 int force, bool offlining, enum migrate_mode mode) 704 int force, enum migrate_mode mode)
700{ 705{
701 int rc = -EAGAIN; 706 int rc = -EAGAIN;
702 int remap_swapcache = 1; 707 int remap_swapcache = 1;
@@ -726,20 +731,6 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
726 lock_page(page); 731 lock_page(page);
727 } 732 }
728 733
729 /*
730 * Only memory hotplug's offline_pages() caller has locked out KSM,
731 * and can safely migrate a KSM page. The other cases have skipped
732 * PageKsm along with PageReserved - but it is only now when we have
733 * the page lock that we can be certain it will not go KSM beneath us
734 * (KSM will not upgrade a page from PageAnon to PageKsm when it sees
735 * its pagecount raised, but only here do we take the page lock which
736 * serializes that).
737 */
738 if (PageKsm(page) && !offlining) {
739 rc = -EBUSY;
740 goto unlock;
741 }
742
743 /* charge against new page */ 734 /* charge against new page */
744 mem_cgroup_prepare_migration(page, newpage, &mem); 735 mem_cgroup_prepare_migration(page, newpage, &mem);
745 736
@@ -766,7 +757,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
766 * File Caches may use write_page() or lock_page() in migration, then, 757 * File Caches may use write_page() or lock_page() in migration, then,
767 * just care Anon page here. 758 * just care Anon page here.
768 */ 759 */
769 if (PageAnon(page)) { 760 if (PageAnon(page) && !PageKsm(page)) {
770 /* 761 /*
771 * Only page_lock_anon_vma_read() understands the subtleties of 762 * Only page_lock_anon_vma_read() understands the subtleties of
772 * getting a hold on an anon_vma from outside one of its mms. 763 * getting a hold on an anon_vma from outside one of its mms.
@@ -846,7 +837,6 @@ uncharge:
846 mem_cgroup_end_migration(mem, page, newpage, 837 mem_cgroup_end_migration(mem, page, newpage,
847 (rc == MIGRATEPAGE_SUCCESS || 838 (rc == MIGRATEPAGE_SUCCESS ||
848 rc == MIGRATEPAGE_BALLOON_SUCCESS)); 839 rc == MIGRATEPAGE_BALLOON_SUCCESS));
849unlock:
850 unlock_page(page); 840 unlock_page(page);
851out: 841out:
852 return rc; 842 return rc;
@@ -857,8 +847,7 @@ out:
857 * to the newly allocated page in newpage. 847 * to the newly allocated page in newpage.
858 */ 848 */
859static int unmap_and_move(new_page_t get_new_page, unsigned long private, 849static int unmap_and_move(new_page_t get_new_page, unsigned long private,
860 struct page *page, int force, bool offlining, 850 struct page *page, int force, enum migrate_mode mode)
861 enum migrate_mode mode)
862{ 851{
863 int rc = 0; 852 int rc = 0;
864 int *result = NULL; 853 int *result = NULL;
@@ -876,7 +865,7 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
876 if (unlikely(split_huge_page(page))) 865 if (unlikely(split_huge_page(page)))
877 goto out; 866 goto out;
878 867
879 rc = __unmap_and_move(page, newpage, force, offlining, mode); 868 rc = __unmap_and_move(page, newpage, force, mode);
880 869
881 if (unlikely(rc == MIGRATEPAGE_BALLOON_SUCCESS)) { 870 if (unlikely(rc == MIGRATEPAGE_BALLOON_SUCCESS)) {
882 /* 871 /*
@@ -936,8 +925,7 @@ out:
936 */ 925 */
937static int unmap_and_move_huge_page(new_page_t get_new_page, 926static int unmap_and_move_huge_page(new_page_t get_new_page,
938 unsigned long private, struct page *hpage, 927 unsigned long private, struct page *hpage,
939 int force, bool offlining, 928 int force, enum migrate_mode mode)
940 enum migrate_mode mode)
941{ 929{
942 int rc = 0; 930 int rc = 0;
943 int *result = NULL; 931 int *result = NULL;
@@ -999,9 +987,8 @@ out:
999 * 987 *
1000 * Return: Number of pages not migrated or error code. 988 * Return: Number of pages not migrated or error code.
1001 */ 989 */
1002int migrate_pages(struct list_head *from, 990int migrate_pages(struct list_head *from, new_page_t get_new_page,
1003 new_page_t get_new_page, unsigned long private, bool offlining, 991 unsigned long private, enum migrate_mode mode, int reason)
1004 enum migrate_mode mode, int reason)
1005{ 992{
1006 int retry = 1; 993 int retry = 1;
1007 int nr_failed = 0; 994 int nr_failed = 0;
@@ -1022,8 +1009,7 @@ int migrate_pages(struct list_head *from,
1022 cond_resched(); 1009 cond_resched();
1023 1010
1024 rc = unmap_and_move(get_new_page, private, 1011 rc = unmap_and_move(get_new_page, private,
1025 page, pass > 2, offlining, 1012 page, pass > 2, mode);
1026 mode);
1027 1013
1028 switch(rc) { 1014 switch(rc) {
1029 case -ENOMEM: 1015 case -ENOMEM:
@@ -1056,15 +1042,13 @@ out:
1056} 1042}
1057 1043
1058int migrate_huge_page(struct page *hpage, new_page_t get_new_page, 1044int migrate_huge_page(struct page *hpage, new_page_t get_new_page,
1059 unsigned long private, bool offlining, 1045 unsigned long private, enum migrate_mode mode)
1060 enum migrate_mode mode)
1061{ 1046{
1062 int pass, rc; 1047 int pass, rc;
1063 1048
1064 for (pass = 0; pass < 10; pass++) { 1049 for (pass = 0; pass < 10; pass++) {
1065 rc = unmap_and_move_huge_page(get_new_page, 1050 rc = unmap_and_move_huge_page(get_new_page, private,
1066 private, hpage, pass > 2, offlining, 1051 hpage, pass > 2, mode);
1067 mode);
1068 switch (rc) { 1052 switch (rc) {
1069 case -ENOMEM: 1053 case -ENOMEM:
1070 goto out; 1054 goto out;
@@ -1150,7 +1134,7 @@ static int do_move_page_to_node_array(struct mm_struct *mm,
1150 goto set_status; 1134 goto set_status;
1151 1135
1152 /* Use PageReserved to check for zero page */ 1136 /* Use PageReserved to check for zero page */
1153 if (PageReserved(page) || PageKsm(page)) 1137 if (PageReserved(page))
1154 goto put_and_set; 1138 goto put_and_set;
1155 1139
1156 pp->page = page; 1140 pp->page = page;
@@ -1187,8 +1171,7 @@ set_status:
1187 err = 0; 1171 err = 0;
1188 if (!list_empty(&pagelist)) { 1172 if (!list_empty(&pagelist)) {
1189 err = migrate_pages(&pagelist, new_page_node, 1173 err = migrate_pages(&pagelist, new_page_node,
1190 (unsigned long)pm, 0, MIGRATE_SYNC, 1174 (unsigned long)pm, MIGRATE_SYNC, MR_SYSCALL);
1191 MR_SYSCALL);
1192 if (err) 1175 if (err)
1193 putback_lru_pages(&pagelist); 1176 putback_lru_pages(&pagelist);
1194 } 1177 }
@@ -1312,7 +1295,7 @@ static void do_pages_stat_array(struct mm_struct *mm, unsigned long nr_pages,
1312 1295
1313 err = -ENOENT; 1296 err = -ENOENT;
1314 /* Use PageReserved to check for zero page */ 1297 /* Use PageReserved to check for zero page */
1315 if (!page || PageReserved(page) || PageKsm(page)) 1298 if (!page || PageReserved(page))
1316 goto set_status; 1299 goto set_status;
1317 1300
1318 err = page_to_nid(page); 1301 err = page_to_nid(page);
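
Note: the two hunks above back the move_pages(2) syscall; with the PageKsm() checks dropped, KSM pages are no longer silently skipped when userspace queries their node or asks for migration. A minimal userspace sketch of the syscall, assuming a NUMA-capable kernel and libnuma's <numaif.h> wrapper (build with -lnuma); with a NULL node array it only reports where each page currently lives.

#include <numaif.h>	/* move_pages(); link with -lnuma */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	long page_size = sysconf(_SC_PAGESIZE);
	void *buf;

	/* One page of anonymous memory, touched so it is actually allocated. */
	if (posix_memalign(&buf, page_size, page_size))
		return 1;
	memset(buf, 0xaa, page_size);

	void *pages[1] = { buf };
	int status[1] = { -1 };

	/*
	 * With a NULL node array, move_pages() moves nothing and instead
	 * fills status[] with the node each page resides on (or -errno).
	 */
	if (move_pages(0 /* self */, 1, pages, NULL, status, 0) == -1) {
		perror("move_pages");
		return 1;
	}
	printf("page at %p is on node %d\n", buf, status[0]);

	free(buf);
	return 0;
}
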
@@ -1459,7 +1442,7 @@ int migrate_vmas(struct mm_struct *mm, const nodemask_t *to,
 1459 * pages. Currently it only checks the watermarks, which is crude 1442 * pages. Currently it only checks the watermarks, which is crude
1460 */ 1443 */
1461static bool migrate_balanced_pgdat(struct pglist_data *pgdat, 1444static bool migrate_balanced_pgdat(struct pglist_data *pgdat,
1462 int nr_migrate_pages) 1445 unsigned long nr_migrate_pages)
1463{ 1446{
1464 int z; 1447 int z;
1465 for (z = pgdat->nr_zones - 1; z >= 0; z--) { 1448 for (z = pgdat->nr_zones - 1; z >= 0; z--) {
@@ -1495,7 +1478,7 @@ static struct page *alloc_misplaced_dst_page(struct page *page,
1495 __GFP_NOWARN) & 1478 __GFP_NOWARN) &
1496 ~GFP_IOFS, 0); 1479 ~GFP_IOFS, 0);
1497 if (newpage) 1480 if (newpage)
1498 page_xchg_last_nid(newpage, page_last_nid(page)); 1481 page_nid_xchg_last(newpage, page_nid_last(page));
1499 1482
1500 return newpage; 1483 return newpage;
1501} 1484}
@@ -1555,39 +1538,40 @@ bool numamigrate_update_ratelimit(pg_data_t *pgdat, unsigned long nr_pages)
1555 1538
1556int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page) 1539int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page)
1557{ 1540{
1558 int ret = 0; 1541 int page_lru;
1542
1543 VM_BUG_ON(compound_order(page) && !PageTransHuge(page));
1559 1544
1560 /* Avoid migrating to a node that is nearly full */ 1545 /* Avoid migrating to a node that is nearly full */
1561 if (migrate_balanced_pgdat(pgdat, 1)) { 1546 if (!migrate_balanced_pgdat(pgdat, 1UL << compound_order(page)))
1562 int page_lru; 1547 return 0;
1563 1548
1564 if (isolate_lru_page(page)) { 1549 if (isolate_lru_page(page))
1565 put_page(page); 1550 return 0;
1566 return 0;
1567 }
1568 1551
1569 /* Page is isolated */ 1552 /*
1570 ret = 1; 1553 * migrate_misplaced_transhuge_page() skips page migration's usual
1571 page_lru = page_is_file_cache(page); 1554 * check on page_count(), so we must do it here, now that the page
1572 if (!PageTransHuge(page)) 1555 * has been isolated: a GUP pin, or any other pin, prevents migration.
1573 inc_zone_page_state(page, NR_ISOLATED_ANON + page_lru); 1556 * The expected page count is 3: 1 for page's mapcount and 1 for the
1574 else 1557 * caller's pin and 1 for the reference taken by isolate_lru_page().
1575 mod_zone_page_state(page_zone(page), 1558 */
1576 NR_ISOLATED_ANON + page_lru, 1559 if (PageTransHuge(page) && page_count(page) != 3) {
1577 HPAGE_PMD_NR); 1560 putback_lru_page(page);
1561 return 0;
1578 } 1562 }
1579 1563
1564 page_lru = page_is_file_cache(page);
1565 mod_zone_page_state(page_zone(page), NR_ISOLATED_ANON + page_lru,
1566 hpage_nr_pages(page));
1567
1580 /* 1568 /*
1581 * Page is either isolated or there is not enough space on the target 1569 * Isolating the page has taken another reference, so the
1582 * node. If isolated, then it has taken a reference count and the 1570 * caller's reference can be safely dropped without the page
1583 * callers reference can be safely dropped without the page 1571 * disappearing underneath us during migration.
1584 * disappearing underneath us during migration. Otherwise the page is
1585 * not to be migrated but the callers reference should still be
1586 * dropped so it does not leak.
1587 */ 1572 */
1588 put_page(page); 1573 put_page(page);
1589 1574 return 1;
1590 return ret;
1591} 1575}
1592 1576
1593/* 1577/*
@@ -1598,7 +1582,7 @@ int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page)
1598int migrate_misplaced_page(struct page *page, int node) 1582int migrate_misplaced_page(struct page *page, int node)
1599{ 1583{
1600 pg_data_t *pgdat = NODE_DATA(node); 1584 pg_data_t *pgdat = NODE_DATA(node);
1601 int isolated = 0; 1585 int isolated;
1602 int nr_remaining; 1586 int nr_remaining;
1603 LIST_HEAD(migratepages); 1587 LIST_HEAD(migratepages);
1604 1588
@@ -1606,42 +1590,43 @@ int migrate_misplaced_page(struct page *page, int node)
1606 * Don't migrate pages that are mapped in multiple processes. 1590 * Don't migrate pages that are mapped in multiple processes.
1607 * TODO: Handle false sharing detection instead of this hammer 1591 * TODO: Handle false sharing detection instead of this hammer
1608 */ 1592 */
1609 if (page_mapcount(page) != 1) { 1593 if (page_mapcount(page) != 1)
1610 put_page(page);
1611 goto out; 1594 goto out;
1612 }
1613 1595
1614 /* 1596 /*
1615 * Rate-limit the amount of data that is being migrated to a node. 1597 * Rate-limit the amount of data that is being migrated to a node.
1616 * Optimal placement is no good if the memory bus is saturated and 1598 * Optimal placement is no good if the memory bus is saturated and
1617 * all the time is being spent migrating! 1599 * all the time is being spent migrating!
1618 */ 1600 */
1619 if (numamigrate_update_ratelimit(pgdat, 1)) { 1601 if (numamigrate_update_ratelimit(pgdat, 1))
1620 put_page(page);
1621 goto out; 1602 goto out;
1622 }
1623 1603
1624 isolated = numamigrate_isolate_page(pgdat, page); 1604 isolated = numamigrate_isolate_page(pgdat, page);
1625 if (!isolated) 1605 if (!isolated)
1626 goto out; 1606 goto out;
1627 1607
1628 list_add(&page->lru, &migratepages); 1608 list_add(&page->lru, &migratepages);
1629 nr_remaining = migrate_pages(&migratepages, 1609 nr_remaining = migrate_pages(&migratepages, alloc_misplaced_dst_page,
1630 alloc_misplaced_dst_page, 1610 node, MIGRATE_ASYNC, MR_NUMA_MISPLACED);
1631 node, false, MIGRATE_ASYNC,
1632 MR_NUMA_MISPLACED);
1633 if (nr_remaining) { 1611 if (nr_remaining) {
1634 putback_lru_pages(&migratepages); 1612 putback_lru_pages(&migratepages);
1635 isolated = 0; 1613 isolated = 0;
1636 } else 1614 } else
1637 count_vm_numa_event(NUMA_PAGE_MIGRATE); 1615 count_vm_numa_event(NUMA_PAGE_MIGRATE);
1638 BUG_ON(!list_empty(&migratepages)); 1616 BUG_ON(!list_empty(&migratepages));
1639out:
1640 return isolated; 1617 return isolated;
1618
1619out:
1620 put_page(page);
1621 return 0;
1641} 1622}
1642#endif /* CONFIG_NUMA_BALANCING */ 1623#endif /* CONFIG_NUMA_BALANCING */
1643 1624
1644#if defined(CONFIG_NUMA_BALANCING) && defined(CONFIG_TRANSPARENT_HUGEPAGE) 1625#if defined(CONFIG_NUMA_BALANCING) && defined(CONFIG_TRANSPARENT_HUGEPAGE)
1626/*
1627 * Migrates a THP to a given target node. page must be locked and is unlocked
1628 * before returning.
1629 */
1645int migrate_misplaced_transhuge_page(struct mm_struct *mm, 1630int migrate_misplaced_transhuge_page(struct mm_struct *mm,
1646 struct vm_area_struct *vma, 1631 struct vm_area_struct *vma,
1647 pmd_t *pmd, pmd_t entry, 1632 pmd_t *pmd, pmd_t entry,
@@ -1672,17 +1657,15 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
1672 1657
1673 new_page = alloc_pages_node(node, 1658 new_page = alloc_pages_node(node,
1674 (GFP_TRANSHUGE | GFP_THISNODE) & ~__GFP_WAIT, HPAGE_PMD_ORDER); 1659 (GFP_TRANSHUGE | GFP_THISNODE) & ~__GFP_WAIT, HPAGE_PMD_ORDER);
1675 if (!new_page) { 1660 if (!new_page)
1676 count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR); 1661 goto out_fail;
1677 goto out_dropref; 1662
1678 } 1663 page_nid_xchg_last(new_page, page_nid_last(page));
1679 page_xchg_last_nid(new_page, page_last_nid(page));
1680 1664
1681 isolated = numamigrate_isolate_page(pgdat, page); 1665 isolated = numamigrate_isolate_page(pgdat, page);
1682 if (!isolated) { 1666 if (!isolated) {
1683 count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR);
1684 put_page(new_page); 1667 put_page(new_page);
1685 goto out_keep_locked; 1668 goto out_fail;
1686 } 1669 }
1687 1670
1688 /* Prepare a page as a migration target */ 1671 /* Prepare a page as a migration target */
@@ -1714,6 +1697,7 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
1714 putback_lru_page(page); 1697 putback_lru_page(page);
1715 1698
1716 count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR); 1699 count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR);
1700 isolated = 0;
1717 goto out; 1701 goto out;
1718 } 1702 }
1719 1703
@@ -1758,9 +1742,11 @@ out:
1758 -HPAGE_PMD_NR); 1742 -HPAGE_PMD_NR);
1759 return isolated; 1743 return isolated;
1760 1744
1745out_fail:
1746 count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR);
1761out_dropref: 1747out_dropref:
1748 unlock_page(page);
1762 put_page(page); 1749 put_page(page);
1763out_keep_locked:
1764 return 0; 1750 return 0;
1765} 1751}
1766#endif /* CONFIG_NUMA_BALANCING */ 1752#endif /* CONFIG_NUMA_BALANCING */
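
Note: the PGMIGRATE_FAIL / NUMA_PAGE_MIGRATE accounting reshuffled above is visible from userspace through /proc/vmstat. A small sketch that dumps the migration-related counters; the counter names below (pgmigrate_success, pgmigrate_fail, numa_pages_migrated, ...) are assumed to match kernels built with CONFIG_NUMA_BALANCING and may be absent on other configurations.

#include <stdio.h>
#include <string.h>

int main(void)
{
	/* Counter names assumed from a CONFIG_NUMA_BALANCING kernel. */
	static const char *keys[] = {
		"pgmigrate_success", "pgmigrate_fail",
		"numa_pte_updates", "numa_hint_faults",
		"numa_pages_migrated",
	};
	char line[256];
	FILE *f = fopen("/proc/vmstat", "r");

	if (!f) {
		perror("/proc/vmstat");
		return 1;
	}
	while (fgets(line, sizeof(line), f)) {
		for (size_t i = 0; i < sizeof(keys) / sizeof(keys[0]); i++) {
			size_t n = strlen(keys[i]);

			if (!strncmp(line, keys[i], n) && line[n] == ' ')
				fputs(line, stdout);
		}
	}
	fclose(f);
	return 0;
}
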
diff --git a/mm/mincore.c b/mm/mincore.c
index 936b4cee8cb1..da2be56a7b8f 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -75,7 +75,7 @@ static unsigned char mincore_page(struct address_space *mapping, pgoff_t pgoff)
75 /* shmem/tmpfs may return swap: account for swapcache page too. */ 75 /* shmem/tmpfs may return swap: account for swapcache page too. */
76 if (radix_tree_exceptional_entry(page)) { 76 if (radix_tree_exceptional_entry(page)) {
77 swp_entry_t swap = radix_to_swp_entry(page); 77 swp_entry_t swap = radix_to_swp_entry(page);
78 page = find_get_page(&swapper_space, swap.val); 78 page = find_get_page(swap_address_space(swap), swap.val);
79 } 79 }
80#endif 80#endif
81 if (page) { 81 if (page) {
@@ -135,7 +135,8 @@ static void mincore_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
135 } else { 135 } else {
136#ifdef CONFIG_SWAP 136#ifdef CONFIG_SWAP
137 pgoff = entry.val; 137 pgoff = entry.val;
138 *vec = mincore_page(&swapper_space, pgoff); 138 *vec = mincore_page(swap_address_space(entry),
139 pgoff);
139#else 140#else
140 WARN_ON(1); 141 WARN_ON(1);
141 *vec = 1; 142 *vec = 1;
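
Note: mincore(2) is the user-visible consumer of this path; with per-type swapper spaces, the swap-cache lookup simply moves from the single global swapper_space to swap_address_space(entry), and the syscall semantics are unchanged. A minimal sketch of the syscall itself:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	long psz = sysconf(_SC_PAGESIZE);
	size_t len = 8 * psz;
	unsigned char vec[8];

	char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (p == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
	/* Touch only the first half so half the vector reads back as resident. */
	memset(p, 1, len / 2);

	if (mincore(p, len, vec) == -1) {
		perror("mincore");
		return 1;
	}
	for (size_t i = 0; i < sizeof(vec); i++)
		printf("page %zu: %s\n", i, (vec[i] & 1) ? "resident" : "not resident");

	munmap(p, len);
	return 0;
}
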
diff --git a/mm/mlock.c b/mm/mlock.c
index f0b9ce572fc7..1c5e33fce639 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -102,13 +102,16 @@ void mlock_vma_page(struct page *page)
102 * can't isolate the page, we leave it for putback_lru_page() and vmscan 102 * can't isolate the page, we leave it for putback_lru_page() and vmscan
103 * [page_referenced()/try_to_unmap()] to deal with. 103 * [page_referenced()/try_to_unmap()] to deal with.
104 */ 104 */
105void munlock_vma_page(struct page *page) 105unsigned int munlock_vma_page(struct page *page)
106{ 106{
107 unsigned int page_mask = 0;
108
107 BUG_ON(!PageLocked(page)); 109 BUG_ON(!PageLocked(page));
108 110
109 if (TestClearPageMlocked(page)) { 111 if (TestClearPageMlocked(page)) {
110 mod_zone_page_state(page_zone(page), NR_MLOCK, 112 unsigned int nr_pages = hpage_nr_pages(page);
111 -hpage_nr_pages(page)); 113 mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages);
114 page_mask = nr_pages - 1;
112 if (!isolate_lru_page(page)) { 115 if (!isolate_lru_page(page)) {
113 int ret = SWAP_AGAIN; 116 int ret = SWAP_AGAIN;
114 117
@@ -141,6 +144,8 @@ void munlock_vma_page(struct page *page)
141 count_vm_event(UNEVICTABLE_PGMUNLOCKED); 144 count_vm_event(UNEVICTABLE_PGMUNLOCKED);
142 } 145 }
143 } 146 }
147
148 return page_mask;
144} 149}
145 150
146/** 151/**
@@ -155,13 +160,11 @@ void munlock_vma_page(struct page *page)
155 * 160 *
156 * vma->vm_mm->mmap_sem must be held for at least read. 161 * vma->vm_mm->mmap_sem must be held for at least read.
157 */ 162 */
158static long __mlock_vma_pages_range(struct vm_area_struct *vma, 163long __mlock_vma_pages_range(struct vm_area_struct *vma,
159 unsigned long start, unsigned long end, 164 unsigned long start, unsigned long end, int *nonblocking)
160 int *nonblocking)
161{ 165{
162 struct mm_struct *mm = vma->vm_mm; 166 struct mm_struct *mm = vma->vm_mm;
163 unsigned long addr = start; 167 unsigned long nr_pages = (end - start) / PAGE_SIZE;
164 int nr_pages = (end - start) / PAGE_SIZE;
165 int gup_flags; 168 int gup_flags;
166 169
167 VM_BUG_ON(start & ~PAGE_MASK); 170 VM_BUG_ON(start & ~PAGE_MASK);
@@ -186,7 +189,11 @@ static long __mlock_vma_pages_range(struct vm_area_struct *vma,
186 if (vma->vm_flags & (VM_READ | VM_WRITE | VM_EXEC)) 189 if (vma->vm_flags & (VM_READ | VM_WRITE | VM_EXEC))
187 gup_flags |= FOLL_FORCE; 190 gup_flags |= FOLL_FORCE;
188 191
189 return __get_user_pages(current, mm, addr, nr_pages, gup_flags, 192 /*
193 * We made sure addr is within a VMA, so the following will
194 * not result in a stack expansion that recurses back here.
195 */
196 return __get_user_pages(current, mm, start, nr_pages, gup_flags,
190 NULL, NULL, nonblocking); 197 NULL, NULL, nonblocking);
191} 198}
192 199
@@ -202,56 +209,6 @@ static int __mlock_posix_error_return(long retval)
202 return retval; 209 return retval;
203} 210}
204 211
205/**
206 * mlock_vma_pages_range() - mlock pages in specified vma range.
207 * @vma - the vma containing the specfied address range
208 * @start - starting address in @vma to mlock
209 * @end - end address [+1] in @vma to mlock
210 *
211 * For mmap()/mremap()/expansion of mlocked vma.
212 *
213 * return 0 on success for "normal" vmas.
214 *
215 * return number of pages [> 0] to be removed from locked_vm on success
216 * of "special" vmas.
217 */
218long mlock_vma_pages_range(struct vm_area_struct *vma,
219 unsigned long start, unsigned long end)
220{
221 int nr_pages = (end - start) / PAGE_SIZE;
222 BUG_ON(!(vma->vm_flags & VM_LOCKED));
223
224 /*
225 * filter unlockable vmas
226 */
227 if (vma->vm_flags & (VM_IO | VM_PFNMAP))
228 goto no_mlock;
229
230 if (!((vma->vm_flags & VM_DONTEXPAND) ||
231 is_vm_hugetlb_page(vma) ||
232 vma == get_gate_vma(current->mm))) {
233
234 __mlock_vma_pages_range(vma, start, end, NULL);
235
236 /* Hide errors from mmap() and other callers */
237 return 0;
238 }
239
240 /*
241 * User mapped kernel pages or huge pages:
242 * make these pages present to populate the ptes, but
243 * fall thru' to reset VM_LOCKED--no need to unlock, and
244 * return nr_pages so these don't get counted against task's
245 * locked limit. huge pages are already counted against
246 * locked vm limit.
247 */
248 make_pages_present(start, end);
249
250no_mlock:
251 vma->vm_flags &= ~VM_LOCKED; /* and don't come back! */
252 return nr_pages; /* error or pages NOT mlocked */
253}
254
255/* 212/*
 256 * munlock_vma_pages_range() - munlock all pages in the vma range. 213 * munlock_vma_pages_range() - munlock all pages in the vma range.
257 * @vma - vma containing range to be munlock()ed. 214 * @vma - vma containing range to be munlock()ed.
@@ -273,13 +230,12 @@ no_mlock:
273void munlock_vma_pages_range(struct vm_area_struct *vma, 230void munlock_vma_pages_range(struct vm_area_struct *vma,
274 unsigned long start, unsigned long end) 231 unsigned long start, unsigned long end)
275{ 232{
276 unsigned long addr;
277
278 lru_add_drain();
279 vma->vm_flags &= ~VM_LOCKED; 233 vma->vm_flags &= ~VM_LOCKED;
280 234
281 for (addr = start; addr < end; addr += PAGE_SIZE) { 235 while (start < end) {
282 struct page *page; 236 struct page *page;
237 unsigned int page_mask, page_increm;
238
283 /* 239 /*
284 * Although FOLL_DUMP is intended for get_dump_page(), 240 * Although FOLL_DUMP is intended for get_dump_page(),
285 * it just so happens that its special treatment of the 241 * it just so happens that its special treatment of the
@@ -287,13 +243,22 @@ void munlock_vma_pages_range(struct vm_area_struct *vma,
287 * suits munlock very well (and if somehow an abnormal page 243 * suits munlock very well (and if somehow an abnormal page
288 * has sneaked into the range, we won't oops here: great). 244 * has sneaked into the range, we won't oops here: great).
289 */ 245 */
290 page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP); 246 page = follow_page_mask(vma, start, FOLL_GET | FOLL_DUMP,
247 &page_mask);
291 if (page && !IS_ERR(page)) { 248 if (page && !IS_ERR(page)) {
292 lock_page(page); 249 lock_page(page);
293 munlock_vma_page(page); 250 lru_add_drain();
251 /*
252 * Any THP page found by follow_page_mask() may have
253 * gotten split before reaching munlock_vma_page(),
254 * so we need to recompute the page_mask here.
255 */
256 page_mask = munlock_vma_page(page);
294 unlock_page(page); 257 unlock_page(page);
295 put_page(page); 258 put_page(page);
296 } 259 }
260 page_increm = 1 + (~(start >> PAGE_SHIFT) & page_mask);
261 start += page_increm * PAGE_SIZE;
297 cond_resched(); 262 cond_resched();
298 } 263 }
299} 264}
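
Note: the stepping arithmetic above is worth unpacking. page_increm = 1 + (~(start >> PAGE_SHIFT) & page_mask) advances start to the first byte after the current (possibly huge) page, whatever offset inside that huge page the loop happened to land on. A standalone worked example of just that arithmetic; PAGE_SHIFT and the THP size are assumed x86-64 values, not read from any kernel.

#include <stdio.h>

#define PAGE_SHIFT	12		/* assumed: 4 KiB base pages */
#define HPAGE_PMD_NR	512		/* assumed: 2 MiB THP */

static unsigned long step(unsigned long start, unsigned int page_mask)
{
	unsigned long page_increm = 1 + (~(start >> PAGE_SHIFT) & page_mask);

	return start + (page_increm << PAGE_SHIFT);
}

int main(void)
{
	unsigned int thp_mask = HPAGE_PMD_NR - 1;	/* what follow_page_mask() reports */

	/* Small page: always advances by exactly one page. */
	printf("%#lx -> %#lx\n", 0x700000003000UL, step(0x700000003000UL, 0));

	/* Start aligned to a THP: skips all 512 subpages at once. */
	printf("%#lx -> %#lx\n", 0x700000200000UL, step(0x700000200000UL, thp_mask));

	/* Start in the middle of a THP: advances to the end of that THP. */
	printf("%#lx -> %#lx\n", 0x700000212000UL, step(0x700000212000UL, thp_mask));

	return 0;
}
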
@@ -303,7 +268,7 @@ void munlock_vma_pages_range(struct vm_area_struct *vma,
303 * 268 *
304 * Filters out "special" vmas -- VM_LOCKED never gets set for these, and 269 * Filters out "special" vmas -- VM_LOCKED never gets set for these, and
305 * munlock is a no-op. However, for some special vmas, we go ahead and 270 * munlock is a no-op. However, for some special vmas, we go ahead and
306 * populate the ptes via make_pages_present(). 271 * populate the ptes.
307 * 272 *
308 * For vmas that pass the filters, merge/split as appropriate. 273 * For vmas that pass the filters, merge/split as appropriate.
309 */ 274 */
@@ -391,9 +356,9 @@ static int do_mlock(unsigned long start, size_t len, int on)
391 356
392 /* Here we know that vma->vm_start <= nstart < vma->vm_end. */ 357 /* Here we know that vma->vm_start <= nstart < vma->vm_end. */
393 358
394 newflags = vma->vm_flags | VM_LOCKED; 359 newflags = vma->vm_flags & ~VM_LOCKED;
395 if (!on) 360 if (on)
396 newflags &= ~VM_LOCKED; 361 newflags |= VM_LOCKED | VM_POPULATE;
397 362
398 tmp = vma->vm_end; 363 tmp = vma->vm_end;
399 if (tmp > end) 364 if (tmp > end)
@@ -416,13 +381,20 @@ static int do_mlock(unsigned long start, size_t len, int on)
416 return error; 381 return error;
417} 382}
418 383
419static int do_mlock_pages(unsigned long start, size_t len, int ignore_errors) 384/*
385 * __mm_populate - populate and/or mlock pages within a range of address space.
386 *
387 * This is used to implement mlock() and the MAP_POPULATE / MAP_LOCKED mmap
388 * flags. VMAs must be already marked with the desired vm_flags, and
389 * mmap_sem must not be held.
390 */
391int __mm_populate(unsigned long start, unsigned long len, int ignore_errors)
420{ 392{
421 struct mm_struct *mm = current->mm; 393 struct mm_struct *mm = current->mm;
422 unsigned long end, nstart, nend; 394 unsigned long end, nstart, nend;
423 struct vm_area_struct *vma = NULL; 395 struct vm_area_struct *vma = NULL;
424 int locked = 0; 396 int locked = 0;
425 int ret = 0; 397 long ret = 0;
426 398
427 VM_BUG_ON(start & ~PAGE_MASK); 399 VM_BUG_ON(start & ~PAGE_MASK);
428 VM_BUG_ON(len != PAGE_ALIGN(len)); 400 VM_BUG_ON(len != PAGE_ALIGN(len));
@@ -446,7 +418,8 @@ static int do_mlock_pages(unsigned long start, size_t len, int ignore_errors)
446 * range with the first VMA. Also, skip undesirable VMA types. 418 * range with the first VMA. Also, skip undesirable VMA types.
447 */ 419 */
448 nend = min(end, vma->vm_end); 420 nend = min(end, vma->vm_end);
449 if (vma->vm_flags & (VM_IO | VM_PFNMAP)) 421 if ((vma->vm_flags & (VM_IO | VM_PFNMAP | VM_POPULATE)) !=
422 VM_POPULATE)
450 continue; 423 continue;
451 if (nstart < vma->vm_start) 424 if (nstart < vma->vm_start)
452 nstart = vma->vm_start; 425 nstart = vma->vm_start;
@@ -498,7 +471,7 @@ SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len)
498 error = do_mlock(start, len, 1); 471 error = do_mlock(start, len, 1);
499 up_write(&current->mm->mmap_sem); 472 up_write(&current->mm->mmap_sem);
500 if (!error) 473 if (!error)
501 error = do_mlock_pages(start, len, 0); 474 error = __mm_populate(start, len, 0);
502 return error; 475 return error;
503} 476}
504 477
@@ -517,20 +490,20 @@ SYSCALL_DEFINE2(munlock, unsigned long, start, size_t, len)
517static int do_mlockall(int flags) 490static int do_mlockall(int flags)
518{ 491{
519 struct vm_area_struct * vma, * prev = NULL; 492 struct vm_area_struct * vma, * prev = NULL;
520 unsigned int def_flags = 0;
521 493
522 if (flags & MCL_FUTURE) 494 if (flags & MCL_FUTURE)
523 def_flags = VM_LOCKED; 495 current->mm->def_flags |= VM_LOCKED | VM_POPULATE;
524 current->mm->def_flags = def_flags; 496 else
497 current->mm->def_flags &= ~(VM_LOCKED | VM_POPULATE);
525 if (flags == MCL_FUTURE) 498 if (flags == MCL_FUTURE)
526 goto out; 499 goto out;
527 500
528 for (vma = current->mm->mmap; vma ; vma = prev->vm_next) { 501 for (vma = current->mm->mmap; vma ; vma = prev->vm_next) {
529 vm_flags_t newflags; 502 vm_flags_t newflags;
530 503
531 newflags = vma->vm_flags | VM_LOCKED; 504 newflags = vma->vm_flags & ~VM_LOCKED;
532 if (!(flags & MCL_CURRENT)) 505 if (flags & MCL_CURRENT)
533 newflags &= ~VM_LOCKED; 506 newflags |= VM_LOCKED | VM_POPULATE;
534 507
535 /* Ignore errors */ 508 /* Ignore errors */
536 mlock_fixup(vma, &prev, vma->vm_start, vma->vm_end, newflags); 509 mlock_fixup(vma, &prev, vma->vm_start, vma->vm_end, newflags);
@@ -564,10 +537,8 @@ SYSCALL_DEFINE1(mlockall, int, flags)
564 capable(CAP_IPC_LOCK)) 537 capable(CAP_IPC_LOCK))
565 ret = do_mlockall(flags); 538 ret = do_mlockall(flags);
566 up_write(&current->mm->mmap_sem); 539 up_write(&current->mm->mmap_sem);
567 if (!ret && (flags & MCL_CURRENT)) { 540 if (!ret && (flags & MCL_CURRENT))
568 /* Ignore errors */ 541 mm_populate(0, TASK_SIZE);
569 do_mlock_pages(0, TASK_SIZE, 1);
570 }
571out: 542out:
572 return ret; 543 return ret;
573} 544}
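
Note: with do_mlock_pages() replaced by __mm_populate(), mlock() only marks the VMAs (VM_LOCKED | VM_POPULATE) under mmap_sem and faults the pages in afterwards; the userspace contract is unchanged. A quick sketch that locks a buffer and reads the resulting accounting back out of /proc/self/status (the VmLck field name is assumed stable; mlock may need CAP_IPC_LOCK or a raised RLIMIT_MEMLOCK).

#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

static void show_vmlck(const char *tag)
{
	char line[128];
	FILE *f = fopen("/proc/self/status", "r");

	if (!f)
		return;
	while (fgets(line, sizeof(line), f))
		if (!strncmp(line, "VmLck:", 6))
			printf("%s: %s", tag, line);
	fclose(f);
}

int main(void)
{
	size_t len = 4 * 1024 * 1024;
	void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
	show_vmlck("before mlock");
	if (mlock(p, len)) {
		perror("mlock");
		return 1;
	}
	show_vmlck("after mlock");
	munlock(p, len);
	munmap(p, len);
	return 0;
}
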
diff --git a/mm/mm_init.c b/mm/mm_init.c
index 1ffd97ae26d7..c280a02ea11e 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -69,34 +69,41 @@ void __init mminit_verify_pageflags_layout(void)
69 unsigned long or_mask, add_mask; 69 unsigned long or_mask, add_mask;
70 70
71 shift = 8 * sizeof(unsigned long); 71 shift = 8 * sizeof(unsigned long);
72 width = shift - SECTIONS_WIDTH - NODES_WIDTH - ZONES_WIDTH; 72 width = shift - SECTIONS_WIDTH - NODES_WIDTH - ZONES_WIDTH - LAST_NID_SHIFT;
73 mminit_dprintk(MMINIT_TRACE, "pageflags_layout_widths", 73 mminit_dprintk(MMINIT_TRACE, "pageflags_layout_widths",
74 "Section %d Node %d Zone %d Flags %d\n", 74 "Section %d Node %d Zone %d Lastnid %d Flags %d\n",
75 SECTIONS_WIDTH, 75 SECTIONS_WIDTH,
76 NODES_WIDTH, 76 NODES_WIDTH,
77 ZONES_WIDTH, 77 ZONES_WIDTH,
78 LAST_NID_WIDTH,
78 NR_PAGEFLAGS); 79 NR_PAGEFLAGS);
79 mminit_dprintk(MMINIT_TRACE, "pageflags_layout_shifts", 80 mminit_dprintk(MMINIT_TRACE, "pageflags_layout_shifts",
80 "Section %d Node %d Zone %d\n", 81 "Section %d Node %d Zone %d Lastnid %d\n",
81 SECTIONS_SHIFT, 82 SECTIONS_SHIFT,
82 NODES_SHIFT, 83 NODES_SHIFT,
83 ZONES_SHIFT); 84 ZONES_SHIFT,
84 mminit_dprintk(MMINIT_TRACE, "pageflags_layout_offsets", 85 LAST_NID_SHIFT);
85 "Section %lu Node %lu Zone %lu\n", 86 mminit_dprintk(MMINIT_TRACE, "pageflags_layout_pgshifts",
87 "Section %lu Node %lu Zone %lu Lastnid %lu\n",
86 (unsigned long)SECTIONS_PGSHIFT, 88 (unsigned long)SECTIONS_PGSHIFT,
87 (unsigned long)NODES_PGSHIFT, 89 (unsigned long)NODES_PGSHIFT,
88 (unsigned long)ZONES_PGSHIFT); 90 (unsigned long)ZONES_PGSHIFT,
89 mminit_dprintk(MMINIT_TRACE, "pageflags_layout_zoneid", 91 (unsigned long)LAST_NID_PGSHIFT);
90 "Zone ID: %lu -> %lu\n", 92 mminit_dprintk(MMINIT_TRACE, "pageflags_layout_nodezoneid",
91 (unsigned long)ZONEID_PGOFF, 93 "Node/Zone ID: %lu -> %lu\n",
92 (unsigned long)(ZONEID_PGOFF + ZONEID_SHIFT)); 94 (unsigned long)(ZONEID_PGOFF + ZONEID_SHIFT),
95 (unsigned long)ZONEID_PGOFF);
93 mminit_dprintk(MMINIT_TRACE, "pageflags_layout_usage", 96 mminit_dprintk(MMINIT_TRACE, "pageflags_layout_usage",
94 "location: %d -> %d unused %d -> %d flags %d -> %d\n", 97 "location: %d -> %d layout %d -> %d unused %d -> %d page-flags\n",
95 shift, width, width, NR_PAGEFLAGS, NR_PAGEFLAGS, 0); 98 shift, width, width, NR_PAGEFLAGS, NR_PAGEFLAGS, 0);
96#ifdef NODE_NOT_IN_PAGE_FLAGS 99#ifdef NODE_NOT_IN_PAGE_FLAGS
97 mminit_dprintk(MMINIT_TRACE, "pageflags_layout_nodeflags", 100 mminit_dprintk(MMINIT_TRACE, "pageflags_layout_nodeflags",
98 "Node not in page flags"); 101 "Node not in page flags");
99#endif 102#endif
103#ifdef LAST_NID_NOT_IN_PAGE_FLAGS
104 mminit_dprintk(MMINIT_TRACE, "pageflags_layout_nodeflags",
105 "Last nid not in page flags");
106#endif
100 107
101 if (SECTIONS_WIDTH) { 108 if (SECTIONS_WIDTH) {
102 shift -= SECTIONS_WIDTH; 109 shift -= SECTIONS_WIDTH;
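
Note: the widened debug output above reflects that a last-NUMA-node field now competes for page->flags bits alongside the section, node and zone fields. A back-of-the-envelope sketch of the packing; the field widths below are illustrative assumptions for a 64-bit SPARSEMEM_VMEMMAP configuration, not values taken from any kernel.

#include <stdio.h>

int main(void)
{
	/* Assumed example widths; a real kernel derives these from Kconfig. */
	const int bits_per_long  = 64;
	const int sections_width = 0;	/* vmemmap: no section number in flags */
	const int nodes_width    = 10;
	const int zones_width    = 2;
	const int last_nid_width = 10;
	const int nr_pageflags   = 25;

	int used = sections_width + nodes_width + zones_width +
		   last_nid_width + nr_pageflags;

	printf("Section %d Node %d Zone %d Lastnid %d Flags %d => %d/%d bits used\n",
	       sections_width, nodes_width, zones_width, last_nid_width,
	       nr_pageflags, used, bits_per_long);
	printf("last_nid %s in page flags\n",
	       used <= bits_per_long ? "fits" : "does NOT fit");
	return 0;
}
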
diff --git a/mm/mmap.c b/mm/mmap.c
index f54b235f29a9..2664a47cec93 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -32,6 +32,7 @@
32#include <linux/khugepaged.h> 32#include <linux/khugepaged.h>
33#include <linux/uprobes.h> 33#include <linux/uprobes.h>
34#include <linux/rbtree_augmented.h> 34#include <linux/rbtree_augmented.h>
35#include <linux/sched/sysctl.h>
35 36
36#include <asm/uaccess.h> 37#include <asm/uaccess.h>
37#include <asm/cacheflush.h> 38#include <asm/cacheflush.h>
@@ -143,7 +144,7 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
143 */ 144 */
144 free -= global_page_state(NR_SHMEM); 145 free -= global_page_state(NR_SHMEM);
145 146
146 free += nr_swap_pages; 147 free += get_nr_swap_pages();
147 148
148 /* 149 /*
149 * Any slabs which are created with the 150 * Any slabs which are created with the
@@ -202,7 +203,7 @@ static void __remove_shared_vm_struct(struct vm_area_struct *vma,
202 struct file *file, struct address_space *mapping) 203 struct file *file, struct address_space *mapping)
203{ 204{
204 if (vma->vm_flags & VM_DENYWRITE) 205 if (vma->vm_flags & VM_DENYWRITE)
205 atomic_inc(&file->f_path.dentry->d_inode->i_writecount); 206 atomic_inc(&file_inode(file)->i_writecount);
206 if (vma->vm_flags & VM_SHARED) 207 if (vma->vm_flags & VM_SHARED)
207 mapping->i_mmap_writable--; 208 mapping->i_mmap_writable--;
208 209
@@ -255,6 +256,7 @@ SYSCALL_DEFINE1(brk, unsigned long, brk)
255 unsigned long newbrk, oldbrk; 256 unsigned long newbrk, oldbrk;
256 struct mm_struct *mm = current->mm; 257 struct mm_struct *mm = current->mm;
257 unsigned long min_brk; 258 unsigned long min_brk;
259 bool populate;
258 260
259 down_write(&mm->mmap_sem); 261 down_write(&mm->mmap_sem);
260 262
@@ -304,8 +306,15 @@ SYSCALL_DEFINE1(brk, unsigned long, brk)
304 /* Ok, looks good - let it rip. */ 306 /* Ok, looks good - let it rip. */
305 if (do_brk(oldbrk, newbrk-oldbrk) != oldbrk) 307 if (do_brk(oldbrk, newbrk-oldbrk) != oldbrk)
306 goto out; 308 goto out;
309
307set_brk: 310set_brk:
308 mm->brk = brk; 311 mm->brk = brk;
312 populate = newbrk > oldbrk && (mm->def_flags & VM_LOCKED) != 0;
313 up_write(&mm->mmap_sem);
314 if (populate)
315 mm_populate(oldbrk, newbrk - oldbrk);
316 return brk;
317
309out: 318out:
310 retval = mm->brk; 319 retval = mm->brk;
311 up_write(&mm->mmap_sem); 320 up_write(&mm->mmap_sem);
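
Note: brk() now releases mmap_sem before faulting in the newly granted range, and only does so when the process asked for future mappings to be locked. From userspace that corresponds to growing the heap after mlockall(MCL_FUTURE); a hedged sketch using the old but still available sbrk() wrapper (mlockall may need CAP_IPC_LOCK or a raised RLIMIT_MEMLOCK).

#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	/* Lock every future mapping, including heap growth via brk(). */
	if (mlockall(MCL_FUTURE)) {
		perror("mlockall");
		return 1;
	}

	void *old_end = sbrk(0);
	if (sbrk(1 << 20) == (void *)-1) {	/* grow the heap by 1 MiB */
		perror("sbrk");
		return 1;
	}
	printf("heap grew from %p to %p; the new range is prefaulted and locked\n",
	       old_end, sbrk(0));

	munlockall();
	return 0;
}
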
@@ -567,7 +576,7 @@ static void __vma_link_file(struct vm_area_struct *vma)
567 struct address_space *mapping = file->f_mapping; 576 struct address_space *mapping = file->f_mapping;
568 577
569 if (vma->vm_flags & VM_DENYWRITE) 578 if (vma->vm_flags & VM_DENYWRITE)
570 atomic_dec(&file->f_path.dentry->d_inode->i_writecount); 579 atomic_dec(&file_inode(file)->i_writecount);
571 if (vma->vm_flags & VM_SHARED) 580 if (vma->vm_flags & VM_SHARED)
572 mapping->i_mmap_writable++; 581 mapping->i_mmap_writable++;
573 582
@@ -800,7 +809,7 @@ again: remove_next = 1 + (end > next->vm_end);
800 anon_vma_interval_tree_post_update_vma(vma); 809 anon_vma_interval_tree_post_update_vma(vma);
801 if (adjust_next) 810 if (adjust_next)
802 anon_vma_interval_tree_post_update_vma(next); 811 anon_vma_interval_tree_post_update_vma(next);
803 anon_vma_unlock(anon_vma); 812 anon_vma_unlock_write(anon_vma);
804 } 813 }
805 if (mapping) 814 if (mapping)
806 mutex_unlock(&mapping->i_mmap_mutex); 815 mutex_unlock(&mapping->i_mmap_mutex);
@@ -1153,12 +1162,15 @@ static inline unsigned long round_hint_to_min(unsigned long hint)
1153 1162
1154unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, 1163unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
1155 unsigned long len, unsigned long prot, 1164 unsigned long len, unsigned long prot,
1156 unsigned long flags, unsigned long pgoff) 1165 unsigned long flags, unsigned long pgoff,
1166 unsigned long *populate)
1157{ 1167{
1158 struct mm_struct * mm = current->mm; 1168 struct mm_struct * mm = current->mm;
1159 struct inode *inode; 1169 struct inode *inode;
1160 vm_flags_t vm_flags; 1170 vm_flags_t vm_flags;
1161 1171
1172 *populate = 0;
1173
1162 /* 1174 /*
1163 * Does the application expect PROT_READ to imply PROT_EXEC? 1175 * Does the application expect PROT_READ to imply PROT_EXEC?
1164 * 1176 *
@@ -1217,7 +1229,7 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
1217 return -EAGAIN; 1229 return -EAGAIN;
1218 } 1230 }
1219 1231
1220 inode = file ? file->f_path.dentry->d_inode : NULL; 1232 inode = file ? file_inode(file) : NULL;
1221 1233
1222 if (file) { 1234 if (file) {
1223 switch (flags & MAP_TYPE) { 1235 switch (flags & MAP_TYPE) {
@@ -1279,7 +1291,24 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
1279 } 1291 }
1280 } 1292 }
1281 1293
1282 return mmap_region(file, addr, len, flags, vm_flags, pgoff); 1294 /*
1295 * Set 'VM_NORESERVE' if we should not account for the
1296 * memory use of this mapping.
1297 */
1298 if (flags & MAP_NORESERVE) {
1299 /* We honor MAP_NORESERVE if allowed to overcommit */
1300 if (sysctl_overcommit_memory != OVERCOMMIT_NEVER)
1301 vm_flags |= VM_NORESERVE;
1302
1303 /* hugetlb applies strict overcommit unless MAP_NORESERVE */
1304 if (file && is_file_hugepages(file))
1305 vm_flags |= VM_NORESERVE;
1306 }
1307
1308 addr = mmap_region(file, addr, len, vm_flags, pgoff);
1309 if (!IS_ERR_VALUE(addr) && (vm_flags & VM_POPULATE))
1310 *populate = len;
1311 return addr;
1283} 1312}
1284 1313
1285SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len, 1314SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
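
Note: with population moved out of mmap_region(), MAP_POPULATE (and, in this series, MAP_LOCKED via VM_POPULATE) is reported back through *populate and serviced by mm_populate() after mmap_sem is dropped. The userspace semantics stay the same; a sketch that checks them with mincore():

#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

static size_t resident_pages(void *p, size_t len, long psz)
{
	unsigned char vec[1024];
	size_t n = len / psz, count = 0;

	if (n > sizeof(vec) || mincore(p, len, vec))
		return 0;
	for (size_t i = 0; i < n; i++)
		count += vec[i] & 1;
	return count;
}

int main(void)
{
	long psz = sysconf(_SC_PAGESIZE);
	size_t len = 64 * psz;

	void *lazy = mmap(NULL, len, PROT_READ | PROT_WRITE,
			  MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	void *eager = mmap(NULL, len, PROT_READ | PROT_WRITE,
			   MAP_PRIVATE | MAP_ANONYMOUS | MAP_POPULATE, -1, 0);

	if (lazy == MAP_FAILED || eager == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
	printf("without MAP_POPULATE: %zu/%zu pages resident\n",
	       resident_pages(lazy, len, psz), len / psz);
	printf("with    MAP_POPULATE: %zu/%zu pages resident\n",
	       resident_pages(eager, len, psz), len / psz);
	return 0;
}
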
@@ -1394,8 +1423,7 @@ static inline int accountable_mapping(struct file *file, vm_flags_t vm_flags)
1394} 1423}
1395 1424
1396unsigned long mmap_region(struct file *file, unsigned long addr, 1425unsigned long mmap_region(struct file *file, unsigned long addr,
1397 unsigned long len, unsigned long flags, 1426 unsigned long len, vm_flags_t vm_flags, unsigned long pgoff)
1398 vm_flags_t vm_flags, unsigned long pgoff)
1399{ 1427{
1400 struct mm_struct *mm = current->mm; 1428 struct mm_struct *mm = current->mm;
1401 struct vm_area_struct *vma, *prev; 1429 struct vm_area_struct *vma, *prev;
@@ -1403,7 +1431,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
1403 int error; 1431 int error;
1404 struct rb_node **rb_link, *rb_parent; 1432 struct rb_node **rb_link, *rb_parent;
1405 unsigned long charged = 0; 1433 unsigned long charged = 0;
1406 struct inode *inode = file ? file->f_path.dentry->d_inode : NULL; 1434 struct inode *inode = file ? file_inode(file) : NULL;
1407 1435
1408 /* Clear old maps */ 1436 /* Clear old maps */
1409 error = -ENOMEM; 1437 error = -ENOMEM;
@@ -1419,20 +1447,6 @@ munmap_back:
1419 return -ENOMEM; 1447 return -ENOMEM;
1420 1448
1421 /* 1449 /*
1422 * Set 'VM_NORESERVE' if we should not account for the
1423 * memory use of this mapping.
1424 */
1425 if ((flags & MAP_NORESERVE)) {
1426 /* We honor MAP_NORESERVE if allowed to overcommit */
1427 if (sysctl_overcommit_memory != OVERCOMMIT_NEVER)
1428 vm_flags |= VM_NORESERVE;
1429
1430 /* hugetlb applies strict overcommit unless MAP_NORESERVE */
1431 if (file && is_file_hugepages(file))
1432 vm_flags |= VM_NORESERVE;
1433 }
1434
1435 /*
1436 * Private writable mapping: check memory availability 1450 * Private writable mapping: check memory availability
1437 */ 1451 */
1438 if (accountable_mapping(file, vm_flags)) { 1452 if (accountable_mapping(file, vm_flags)) {
@@ -1530,10 +1544,12 @@ out:
1530 1544
1531 vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT); 1545 vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT);
1532 if (vm_flags & VM_LOCKED) { 1546 if (vm_flags & VM_LOCKED) {
1533 if (!mlock_vma_pages_range(vma, addr, addr + len)) 1547 if (!((vm_flags & VM_SPECIAL) || is_vm_hugetlb_page(vma) ||
1548 vma == get_gate_vma(current->mm)))
1534 mm->locked_vm += (len >> PAGE_SHIFT); 1549 mm->locked_vm += (len >> PAGE_SHIFT);
1535 } else if ((flags & MAP_POPULATE) && !(flags & MAP_NONBLOCK)) 1550 else
1536 make_pages_present(addr, addr + len); 1551 vma->vm_flags &= ~VM_LOCKED;
1552 }
1537 1553
1538 if (file) 1554 if (file)
1539 uprobe_mmap(vma); 1555 uprobe_mmap(vma);
@@ -2169,9 +2185,28 @@ int expand_downwards(struct vm_area_struct *vma,
2169 return error; 2185 return error;
2170} 2186}
2171 2187
2188/*
2189 * Note how expand_stack() refuses to expand the stack all the way to
2190 * abut the next virtual mapping, *unless* that mapping itself is also
2191 * a stack mapping. We want to leave room for a guard page, after all
2192 * (the guard page itself is not added here, that is done by the
2193 * actual page faulting logic)
2194 *
2195 * This matches the behavior of the guard page logic (see mm/memory.c:
2196 * check_stack_guard_page()), which only allows the guard page to be
2197 * removed under these circumstances.
2198 */
2172#ifdef CONFIG_STACK_GROWSUP 2199#ifdef CONFIG_STACK_GROWSUP
2173int expand_stack(struct vm_area_struct *vma, unsigned long address) 2200int expand_stack(struct vm_area_struct *vma, unsigned long address)
2174{ 2201{
2202 struct vm_area_struct *next;
2203
2204 address &= PAGE_MASK;
2205 next = vma->vm_next;
2206 if (next && next->vm_start == address + PAGE_SIZE) {
2207 if (!(next->vm_flags & VM_GROWSUP))
2208 return -ENOMEM;
2209 }
2175 return expand_upwards(vma, address); 2210 return expand_upwards(vma, address);
2176} 2211}
2177 2212
@@ -2186,14 +2221,21 @@ find_extend_vma(struct mm_struct *mm, unsigned long addr)
2186 return vma; 2221 return vma;
2187 if (!prev || expand_stack(prev, addr)) 2222 if (!prev || expand_stack(prev, addr))
2188 return NULL; 2223 return NULL;
2189 if (prev->vm_flags & VM_LOCKED) { 2224 if (prev->vm_flags & VM_LOCKED)
2190 mlock_vma_pages_range(prev, addr, prev->vm_end); 2225 __mlock_vma_pages_range(prev, addr, prev->vm_end, NULL);
2191 }
2192 return prev; 2226 return prev;
2193} 2227}
2194#else 2228#else
2195int expand_stack(struct vm_area_struct *vma, unsigned long address) 2229int expand_stack(struct vm_area_struct *vma, unsigned long address)
2196{ 2230{
2231 struct vm_area_struct *prev;
2232
2233 address &= PAGE_MASK;
2234 prev = vma->vm_prev;
2235 if (prev && prev->vm_end == address) {
2236 if (!(prev->vm_flags & VM_GROWSDOWN))
2237 return -ENOMEM;
2238 }
2197 return expand_downwards(vma, address); 2239 return expand_downwards(vma, address);
2198} 2240}
2199 2241
@@ -2214,9 +2256,8 @@ find_extend_vma(struct mm_struct * mm, unsigned long addr)
2214 start = vma->vm_start; 2256 start = vma->vm_start;
2215 if (expand_stack(vma, addr)) 2257 if (expand_stack(vma, addr))
2216 return NULL; 2258 return NULL;
2217 if (vma->vm_flags & VM_LOCKED) { 2259 if (vma->vm_flags & VM_LOCKED)
2218 mlock_vma_pages_range(vma, addr, start); 2260 __mlock_vma_pages_range(vma, addr, start, NULL);
2219 }
2220 return vma; 2261 return vma;
2221} 2262}
2222#endif 2263#endif
@@ -2589,10 +2630,8 @@ static unsigned long do_brk(unsigned long addr, unsigned long len)
2589out: 2630out:
2590 perf_event_mmap(vma); 2631 perf_event_mmap(vma);
2591 mm->total_vm += len >> PAGE_SHIFT; 2632 mm->total_vm += len >> PAGE_SHIFT;
2592 if (flags & VM_LOCKED) { 2633 if (flags & VM_LOCKED)
2593 if (!mlock_vma_pages_range(vma, addr, addr + len)) 2634 mm->locked_vm += (len >> PAGE_SHIFT);
2594 mm->locked_vm += (len >> PAGE_SHIFT);
2595 }
2596 return addr; 2635 return addr;
2597} 2636}
2598 2637
@@ -2600,10 +2639,14 @@ unsigned long vm_brk(unsigned long addr, unsigned long len)
2600{ 2639{
2601 struct mm_struct *mm = current->mm; 2640 struct mm_struct *mm = current->mm;
2602 unsigned long ret; 2641 unsigned long ret;
2642 bool populate;
2603 2643
2604 down_write(&mm->mmap_sem); 2644 down_write(&mm->mmap_sem);
2605 ret = do_brk(addr, len); 2645 ret = do_brk(addr, len);
2646 populate = ((mm->def_flags & VM_LOCKED) != 0);
2606 up_write(&mm->mmap_sem); 2647 up_write(&mm->mmap_sem);
2648 if (populate)
2649 mm_populate(addr, len);
2607 return ret; 2650 return ret;
2608} 2651}
2609EXPORT_SYMBOL(vm_brk); 2652EXPORT_SYMBOL(vm_brk);
@@ -2886,7 +2929,7 @@ static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma)
2886 * The LSB of head.next can't change from under us 2929 * The LSB of head.next can't change from under us
2887 * because we hold the mm_all_locks_mutex. 2930 * because we hold the mm_all_locks_mutex.
2888 */ 2931 */
2889 down_write(&anon_vma->root->rwsem); 2932 down_write_nest_lock(&anon_vma->root->rwsem, &mm->mmap_sem);
2890 /* 2933 /*
2891 * We can safely modify head.next after taking the 2934 * We can safely modify head.next after taking the
2892 * anon_vma->root->rwsem. If some other vma in this mm shares 2935 * anon_vma->root->rwsem. If some other vma in this mm shares
@@ -2943,7 +2986,7 @@ static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping)
2943 * vma in this mm is backed by the same anon_vma or address_space. 2986 * vma in this mm is backed by the same anon_vma or address_space.
2944 * 2987 *
2945 * We can take all the locks in random order because the VM code 2988 * We can take all the locks in random order because the VM code
2946 * taking i_mmap_mutex or anon_vma->mutex outside the mmap_sem never 2989 * taking i_mmap_mutex or anon_vma->rwsem outside the mmap_sem never
2947 * takes more than one of them in a row. Secondly we're protected 2990 * takes more than one of them in a row. Secondly we're protected
2948 * against a concurrent mm_take_all_locks() by the mm_all_locks_mutex. 2991 * against a concurrent mm_take_all_locks() by the mm_all_locks_mutex.
2949 * 2992 *
@@ -3001,7 +3044,7 @@ static void vm_unlock_anon_vma(struct anon_vma *anon_vma)
3001 if (!__test_and_clear_bit(0, (unsigned long *) 3044 if (!__test_and_clear_bit(0, (unsigned long *)
3002 &anon_vma->root->rb_root.rb_node)) 3045 &anon_vma->root->rb_root.rb_node))
3003 BUG(); 3046 BUG();
3004 anon_vma_unlock(anon_vma); 3047 anon_vma_unlock_write(anon_vma);
3005 } 3048 }
3006} 3049}
3007 3050
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
index 8a5ac8c686b0..be04122fb277 100644
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -37,49 +37,51 @@ static struct srcu_struct srcu;
37void __mmu_notifier_release(struct mm_struct *mm) 37void __mmu_notifier_release(struct mm_struct *mm)
38{ 38{
39 struct mmu_notifier *mn; 39 struct mmu_notifier *mn;
40 struct hlist_node *n;
41 int id; 40 int id;
42 41
43 /* 42 /*
44 * SRCU here will block mmu_notifier_unregister until 43 * srcu_read_lock() here will block synchronize_srcu() in
45 * ->release returns. 44 * mmu_notifier_unregister() until all registered
45 * ->release() callouts this function makes have
46 * returned.
46 */ 47 */
47 id = srcu_read_lock(&srcu); 48 id = srcu_read_lock(&srcu);
48 hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist)
49 /*
50 * if ->release runs before mmu_notifier_unregister it
51 * must be handled as it's the only way for the driver
52 * to flush all existing sptes and stop the driver
53 * from establishing any more sptes before all the
54 * pages in the mm are freed.
55 */
56 if (mn->ops->release)
57 mn->ops->release(mn, mm);
58 srcu_read_unlock(&srcu, id);
59
60 spin_lock(&mm->mmu_notifier_mm->lock); 49 spin_lock(&mm->mmu_notifier_mm->lock);
61 while (unlikely(!hlist_empty(&mm->mmu_notifier_mm->list))) { 50 while (unlikely(!hlist_empty(&mm->mmu_notifier_mm->list))) {
62 mn = hlist_entry(mm->mmu_notifier_mm->list.first, 51 mn = hlist_entry(mm->mmu_notifier_mm->list.first,
63 struct mmu_notifier, 52 struct mmu_notifier,
64 hlist); 53 hlist);
54
65 /* 55 /*
66 * We arrived before mmu_notifier_unregister so 56 * Unlink. This will prevent mmu_notifier_unregister()
67 * mmu_notifier_unregister will do nothing other than 57 * from also making the ->release() callout.
68 * to wait ->release to finish and
69 * mmu_notifier_unregister to return.
70 */ 58 */
71 hlist_del_init_rcu(&mn->hlist); 59 hlist_del_init_rcu(&mn->hlist);
60 spin_unlock(&mm->mmu_notifier_mm->lock);
61
62 /*
63 * Clear sptes. (see 'release' description in mmu_notifier.h)
64 */
65 if (mn->ops->release)
66 mn->ops->release(mn, mm);
67
68 spin_lock(&mm->mmu_notifier_mm->lock);
72 } 69 }
73 spin_unlock(&mm->mmu_notifier_mm->lock); 70 spin_unlock(&mm->mmu_notifier_mm->lock);
74 71
75 /* 72 /*
76 * synchronize_srcu here prevents mmu_notifier_release to 73 * All callouts to ->release() which we have done are complete.
77 * return to exit_mmap (which would proceed freeing all pages 74 * Allow synchronize_srcu() in mmu_notifier_unregister() to complete
78 * in the mm) until the ->release method returns, if it was 75 */
79 * invoked by mmu_notifier_unregister. 76 srcu_read_unlock(&srcu, id);
80 * 77
81 * The mmu_notifier_mm can't go away from under us because one 78 /*
82 * mm_count is hold by exit_mmap. 79 * mmu_notifier_unregister() may have unlinked a notifier and may
80 * still be calling out to it. Additionally, other notifiers
81 * may have been active via vmtruncate() et. al. Block here
82 * to ensure that all notifier callouts for this mm have been
83 * completed and the sptes are really cleaned up before returning
84 * to exit_mmap().
83 */ 85 */
84 synchronize_srcu(&srcu); 86 synchronize_srcu(&srcu);
85} 87}
@@ -93,11 +95,10 @@ int __mmu_notifier_clear_flush_young(struct mm_struct *mm,
93 unsigned long address) 95 unsigned long address)
94{ 96{
95 struct mmu_notifier *mn; 97 struct mmu_notifier *mn;
96 struct hlist_node *n;
97 int young = 0, id; 98 int young = 0, id;
98 99
99 id = srcu_read_lock(&srcu); 100 id = srcu_read_lock(&srcu);
100 hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { 101 hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) {
101 if (mn->ops->clear_flush_young) 102 if (mn->ops->clear_flush_young)
102 young |= mn->ops->clear_flush_young(mn, mm, address); 103 young |= mn->ops->clear_flush_young(mn, mm, address);
103 } 104 }
@@ -110,11 +111,10 @@ int __mmu_notifier_test_young(struct mm_struct *mm,
110 unsigned long address) 111 unsigned long address)
111{ 112{
112 struct mmu_notifier *mn; 113 struct mmu_notifier *mn;
113 struct hlist_node *n;
114 int young = 0, id; 114 int young = 0, id;
115 115
116 id = srcu_read_lock(&srcu); 116 id = srcu_read_lock(&srcu);
117 hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { 117 hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) {
118 if (mn->ops->test_young) { 118 if (mn->ops->test_young) {
119 young = mn->ops->test_young(mn, mm, address); 119 young = mn->ops->test_young(mn, mm, address);
120 if (young) 120 if (young)
@@ -130,11 +130,10 @@ void __mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address,
130 pte_t pte) 130 pte_t pte)
131{ 131{
132 struct mmu_notifier *mn; 132 struct mmu_notifier *mn;
133 struct hlist_node *n;
134 int id; 133 int id;
135 134
136 id = srcu_read_lock(&srcu); 135 id = srcu_read_lock(&srcu);
137 hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { 136 hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) {
138 if (mn->ops->change_pte) 137 if (mn->ops->change_pte)
139 mn->ops->change_pte(mn, mm, address, pte); 138 mn->ops->change_pte(mn, mm, address, pte);
140 } 139 }
@@ -145,11 +144,10 @@ void __mmu_notifier_invalidate_page(struct mm_struct *mm,
145 unsigned long address) 144 unsigned long address)
146{ 145{
147 struct mmu_notifier *mn; 146 struct mmu_notifier *mn;
148 struct hlist_node *n;
149 int id; 147 int id;
150 148
151 id = srcu_read_lock(&srcu); 149 id = srcu_read_lock(&srcu);
152 hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { 150 hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) {
153 if (mn->ops->invalidate_page) 151 if (mn->ops->invalidate_page)
154 mn->ops->invalidate_page(mn, mm, address); 152 mn->ops->invalidate_page(mn, mm, address);
155 } 153 }
@@ -160,31 +158,31 @@ void __mmu_notifier_invalidate_range_start(struct mm_struct *mm,
160 unsigned long start, unsigned long end) 158 unsigned long start, unsigned long end)
161{ 159{
162 struct mmu_notifier *mn; 160 struct mmu_notifier *mn;
163 struct hlist_node *n;
164 int id; 161 int id;
165 162
166 id = srcu_read_lock(&srcu); 163 id = srcu_read_lock(&srcu);
167 hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { 164 hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) {
168 if (mn->ops->invalidate_range_start) 165 if (mn->ops->invalidate_range_start)
169 mn->ops->invalidate_range_start(mn, mm, start, end); 166 mn->ops->invalidate_range_start(mn, mm, start, end);
170 } 167 }
171 srcu_read_unlock(&srcu, id); 168 srcu_read_unlock(&srcu, id);
172} 169}
170EXPORT_SYMBOL_GPL(__mmu_notifier_invalidate_range_start);
173 171
174void __mmu_notifier_invalidate_range_end(struct mm_struct *mm, 172void __mmu_notifier_invalidate_range_end(struct mm_struct *mm,
175 unsigned long start, unsigned long end) 173 unsigned long start, unsigned long end)
176{ 174{
177 struct mmu_notifier *mn; 175 struct mmu_notifier *mn;
178 struct hlist_node *n;
179 int id; 176 int id;
180 177
181 id = srcu_read_lock(&srcu); 178 id = srcu_read_lock(&srcu);
182 hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { 179 hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) {
183 if (mn->ops->invalidate_range_end) 180 if (mn->ops->invalidate_range_end)
184 mn->ops->invalidate_range_end(mn, mm, start, end); 181 mn->ops->invalidate_range_end(mn, mm, start, end);
185 } 182 }
186 srcu_read_unlock(&srcu, id); 183 srcu_read_unlock(&srcu, id);
187} 184}
185EXPORT_SYMBOL_GPL(__mmu_notifier_invalidate_range_end);
188 186
189static int do_mmu_notifier_register(struct mmu_notifier *mn, 187static int do_mmu_notifier_register(struct mmu_notifier *mn,
190 struct mm_struct *mm, 188 struct mm_struct *mm,
@@ -294,31 +292,31 @@ void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm)
294{ 292{
295 BUG_ON(atomic_read(&mm->mm_count) <= 0); 293 BUG_ON(atomic_read(&mm->mm_count) <= 0);
296 294
295 spin_lock(&mm->mmu_notifier_mm->lock);
297 if (!hlist_unhashed(&mn->hlist)) { 296 if (!hlist_unhashed(&mn->hlist)) {
298 /*
299 * SRCU here will force exit_mmap to wait ->release to finish
300 * before freeing the pages.
301 */
302 int id; 297 int id;
303 298
304 id = srcu_read_lock(&srcu);
305 /* 299 /*
306 * exit_mmap will block in mmu_notifier_release to 300 * Ensure we synchronize up with __mmu_notifier_release().
307 * guarantee ->release is called before freeing the
308 * pages.
309 */ 301 */
302 id = srcu_read_lock(&srcu);
303
304 hlist_del_rcu(&mn->hlist);
305 spin_unlock(&mm->mmu_notifier_mm->lock);
306
310 if (mn->ops->release) 307 if (mn->ops->release)
311 mn->ops->release(mn, mm); 308 mn->ops->release(mn, mm);
312 srcu_read_unlock(&srcu, id);
313 309
314 spin_lock(&mm->mmu_notifier_mm->lock); 310 /*
315 hlist_del_rcu(&mn->hlist); 311 * Allow __mmu_notifier_release() to complete.
312 */
313 srcu_read_unlock(&srcu, id);
314 } else
316 spin_unlock(&mm->mmu_notifier_mm->lock); 315 spin_unlock(&mm->mmu_notifier_mm->lock);
317 }
318 316
319 /* 317 /*
320 * Wait any running method to finish, of course including 318 * Wait for any running method to finish, including ->release() if it
321 * ->release if it was run by mmu_notifier_relase instead of us. 319 * was run by __mmu_notifier_release() instead of us.
322 */ 320 */
323 synchronize_srcu(&srcu); 321 synchronize_srcu(&srcu);
324 322
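
Note: the reworked release/unregister paths converge on one pattern: unlink the notifier from the list while holding the spinlock, drop the lock before making the ->release() callout, and rely on SRCU read sections plus synchronize_srcu() so neither side returns while the other may still be calling out. A loose userspace analogue of the "unlink under the lock, call out without it" half, using pthreads; this shows only the locking shape, not the SRCU grace-period machinery (build with -pthread).

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct notifier {
	struct notifier *next;
	void (*release)(struct notifier *n);
};

static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;
static struct notifier *head;

/* Analogue of __mmu_notifier_release(): pop entries one at a time. */
static void release_all(void)
{
	pthread_mutex_lock(&list_lock);
	while (head) {
		struct notifier *n = head;

		head = n->next;			/* unlink under the lock ... */
		pthread_mutex_unlock(&list_lock);

		if (n->release)			/* ... call out without it */
			n->release(n);

		pthread_mutex_lock(&list_lock);
	}
	pthread_mutex_unlock(&list_lock);
}

static void say_bye(struct notifier *n)
{
	printf("released %p\n", (void *)n);
	free(n);
}

int main(void)
{
	for (int i = 0; i < 3; i++) {
		struct notifier *n = malloc(sizeof(*n));

		n->release = say_bye;
		pthread_mutex_lock(&list_lock);
		n->next = head;
		head = n;
		pthread_mutex_unlock(&list_lock);
	}
	release_all();
	return 0;
}
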
diff --git a/mm/mmzone.c b/mm/mmzone.c
index 4596d81b89b1..2ac0afbd68f3 100644
--- a/mm/mmzone.c
+++ b/mm/mmzone.c
@@ -1,7 +1,7 @@
1/* 1/*
2 * linux/mm/mmzone.c 2 * linux/mm/mmzone.c
3 * 3 *
4 * management codes for pgdats and zones. 4 * management codes for pgdats, zones and page flags
5 */ 5 */
6 6
7 7
@@ -96,3 +96,21 @@ void lruvec_init(struct lruvec *lruvec)
96 for_each_lru(lru) 96 for_each_lru(lru)
97 INIT_LIST_HEAD(&lruvec->lists[lru]); 97 INIT_LIST_HEAD(&lruvec->lists[lru]);
98} 98}
99
100#if defined(CONFIG_NUMA_BALANCING) && !defined(LAST_NID_NOT_IN_PAGE_FLAGS)
101int page_nid_xchg_last(struct page *page, int nid)
102{
103 unsigned long old_flags, flags;
104 int last_nid;
105
106 do {
107 old_flags = flags = page->flags;
108 last_nid = page_nid_last(page);
109
110 flags &= ~(LAST_NID_MASK << LAST_NID_PGSHIFT);
111 flags |= (nid & LAST_NID_MASK) << LAST_NID_PGSHIFT;
112 } while (unlikely(cmpxchg(&page->flags, old_flags, flags) != old_flags));
113
114 return last_nid;
115}
116#endif
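
Note: page_nid_xchg_last() above is a classic lock-free read-modify-write on a shared flags word: rebuild the word from a snapshot and retry whenever cmpxchg() observes that someone else changed it in the meantime. A userspace analogue with the GCC/Clang atomic builtins; the shift and mask values are made up for illustration and are not the kernel's LAST_NID_* constants.

#include <stdio.h>

#define NID_SHIFT	8
#define NID_MASK	0x3ffUL		/* illustrative 10-bit field */

static unsigned long flags_word = (42UL & NID_MASK) << NID_SHIFT;

/* Atomically replace the nid field, returning the previous value. */
static int nid_xchg_last(unsigned long *word, int nid)
{
	unsigned long old_flags, new_flags;
	int last_nid;

	do {
		old_flags = __atomic_load_n(word, __ATOMIC_RELAXED);
		last_nid = (old_flags >> NID_SHIFT) & NID_MASK;

		new_flags = old_flags & ~(NID_MASK << NID_SHIFT);
		new_flags |= ((unsigned long)nid & NID_MASK) << NID_SHIFT;
	} while (!__atomic_compare_exchange_n(word, &old_flags, new_flags, 0,
					      __ATOMIC_RELAXED, __ATOMIC_RELAXED));

	return last_nid;
}

int main(void)
{
	printf("previous nid: %d\n", nid_xchg_last(&flags_word, 7));	/* 42 */
	printf("previous nid: %d\n", nid_xchg_last(&flags_word, 3));	/* 7 */
	return 0;
}
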
diff --git a/mm/mremap.c b/mm/mremap.c
index e1031e1f6a61..463a25705ac6 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -19,6 +19,7 @@
19#include <linux/security.h> 19#include <linux/security.h>
20#include <linux/syscalls.h> 20#include <linux/syscalls.h>
21#include <linux/mmu_notifier.h> 21#include <linux/mmu_notifier.h>
22#include <linux/sched/sysctl.h>
22 23
23#include <asm/uaccess.h> 24#include <asm/uaccess.h>
24#include <asm/cacheflush.h> 25#include <asm/cacheflush.h>
@@ -134,7 +135,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
134 pte_unmap(new_pte - 1); 135 pte_unmap(new_pte - 1);
135 pte_unmap_unlock(old_pte - 1, old_ptl); 136 pte_unmap_unlock(old_pte - 1, old_ptl);
136 if (anon_vma) 137 if (anon_vma)
137 anon_vma_unlock(anon_vma); 138 anon_vma_unlock_write(anon_vma);
138 if (mapping) 139 if (mapping)
139 mutex_unlock(&mapping->i_mmap_mutex); 140 mutex_unlock(&mapping->i_mmap_mutex);
140} 141}
@@ -208,7 +209,7 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
208 209
209static unsigned long move_vma(struct vm_area_struct *vma, 210static unsigned long move_vma(struct vm_area_struct *vma,
210 unsigned long old_addr, unsigned long old_len, 211 unsigned long old_addr, unsigned long old_len,
211 unsigned long new_len, unsigned long new_addr) 212 unsigned long new_len, unsigned long new_addr, bool *locked)
212{ 213{
213 struct mm_struct *mm = vma->vm_mm; 214 struct mm_struct *mm = vma->vm_mm;
214 struct vm_area_struct *new_vma; 215 struct vm_area_struct *new_vma;
@@ -299,9 +300,7 @@ static unsigned long move_vma(struct vm_area_struct *vma,
299 300
300 if (vm_flags & VM_LOCKED) { 301 if (vm_flags & VM_LOCKED) {
301 mm->locked_vm += new_len >> PAGE_SHIFT; 302 mm->locked_vm += new_len >> PAGE_SHIFT;
302 if (new_len > old_len) 303 *locked = true;
303 mlock_vma_pages_range(new_vma, new_addr + old_len,
304 new_addr + new_len);
305 } 304 }
306 305
307 return new_addr; 306 return new_addr;
@@ -366,9 +365,8 @@ Eagain:
366 return ERR_PTR(-EAGAIN); 365 return ERR_PTR(-EAGAIN);
367} 366}
368 367
369static unsigned long mremap_to(unsigned long addr, 368static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
370 unsigned long old_len, unsigned long new_addr, 369 unsigned long new_addr, unsigned long new_len, bool *locked)
371 unsigned long new_len)
372{ 370{
373 struct mm_struct *mm = current->mm; 371 struct mm_struct *mm = current->mm;
374 struct vm_area_struct *vma; 372 struct vm_area_struct *vma;
@@ -418,7 +416,7 @@ static unsigned long mremap_to(unsigned long addr,
418 if (ret & ~PAGE_MASK) 416 if (ret & ~PAGE_MASK)
419 goto out1; 417 goto out1;
420 418
421 ret = move_vma(vma, addr, old_len, new_len, new_addr); 419 ret = move_vma(vma, addr, old_len, new_len, new_addr, locked);
422 if (!(ret & ~PAGE_MASK)) 420 if (!(ret & ~PAGE_MASK))
423 goto out; 421 goto out;
424out1: 422out1:
@@ -456,6 +454,7 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
456 struct vm_area_struct *vma; 454 struct vm_area_struct *vma;
457 unsigned long ret = -EINVAL; 455 unsigned long ret = -EINVAL;
458 unsigned long charged = 0; 456 unsigned long charged = 0;
457 bool locked = false;
459 458
460 down_write(&current->mm->mmap_sem); 459 down_write(&current->mm->mmap_sem);
461 460
@@ -478,7 +477,8 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
478 477
479 if (flags & MREMAP_FIXED) { 478 if (flags & MREMAP_FIXED) {
480 if (flags & MREMAP_MAYMOVE) 479 if (flags & MREMAP_MAYMOVE)
481 ret = mremap_to(addr, old_len, new_addr, new_len); 480 ret = mremap_to(addr, old_len, new_addr, new_len,
481 &locked);
482 goto out; 482 goto out;
483 } 483 }
484 484
@@ -520,8 +520,8 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
520 vm_stat_account(mm, vma->vm_flags, vma->vm_file, pages); 520 vm_stat_account(mm, vma->vm_flags, vma->vm_file, pages);
521 if (vma->vm_flags & VM_LOCKED) { 521 if (vma->vm_flags & VM_LOCKED) {
522 mm->locked_vm += pages; 522 mm->locked_vm += pages;
523 mlock_vma_pages_range(vma, addr + old_len, 523 locked = true;
524 addr + new_len); 524 new_addr = addr;
525 } 525 }
526 ret = addr; 526 ret = addr;
527 goto out; 527 goto out;
@@ -547,11 +547,13 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
547 goto out; 547 goto out;
548 } 548 }
549 549
550 ret = move_vma(vma, addr, old_len, new_len, new_addr); 550 ret = move_vma(vma, addr, old_len, new_len, new_addr, &locked);
551 } 551 }
552out: 552out:
553 if (ret & ~PAGE_MASK) 553 if (ret & ~PAGE_MASK)
554 vm_unacct_memory(charged); 554 vm_unacct_memory(charged);
555 up_write(&current->mm->mmap_sem); 555 up_write(&current->mm->mmap_sem);
556 if (locked && new_len > old_len)
557 mm_populate(new_addr + old_len, new_len - old_len);
556 return ret; 558 return ret;
557} 559}
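
Note: move_vma() now just reports *locked back to the mremap() syscall, which calls mm_populate() on the grown tail after releasing mmap_sem. From userspace this is the familiar behaviour that growing an mlocked mapping keeps the new part locked and prefaulted; a sketch (mlock may need RLIMIT_MEMLOCK raised):

#define _GNU_SOURCE
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	long psz = sysconf(_SC_PAGESIZE);
	size_t old_len = 16 * psz, new_len = 64 * psz;

	void *p = mmap(NULL, old_len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (p == MAP_FAILED || mlock(p, old_len)) {
		perror("mmap/mlock");
		return 1;
	}

	/* Grow the locked mapping; the kernel prefaults the new tail. */
	void *q = mremap(p, old_len, new_len, MREMAP_MAYMOVE);
	if (q == MAP_FAILED) {
		perror("mremap");
		return 1;
	}
	printf("grew locked mapping from %zu to %zu bytes at %p\n",
	       old_len, new_len, q);

	munlock(q, new_len);
	munmap(q, new_len);
	return 0;
}
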
diff --git a/mm/nobootmem.c b/mm/nobootmem.c
index b8294fc03df8..5e07d36e381e 100644
--- a/mm/nobootmem.c
+++ b/mm/nobootmem.c
@@ -154,21 +154,6 @@ static void reset_node_lowmem_managed_pages(pg_data_t *pgdat)
154} 154}
155 155
156/** 156/**
157 * free_all_bootmem_node - release a node's free pages to the buddy allocator
158 * @pgdat: node to be released
159 *
160 * Returns the number of pages actually released.
161 */
162unsigned long __init free_all_bootmem_node(pg_data_t *pgdat)
163{
164 register_page_bootmem_info_node(pgdat);
165 reset_node_lowmem_managed_pages(pgdat);
166
167 /* free_low_memory_core_early(MAX_NUMNODES) will be called later */
168 return 0;
169}
170
171/**
172 * free_all_bootmem - release free pages to the buddy allocator 157 * free_all_bootmem - release free pages to the buddy allocator
173 * 158 *
174 * Returns the number of pages actually released. 159 * Returns the number of pages actually released.
@@ -406,6 +391,14 @@ void * __init __alloc_bootmem_low(unsigned long size, unsigned long align,
406 return ___alloc_bootmem(size, align, goal, ARCH_LOW_ADDRESS_LIMIT); 391 return ___alloc_bootmem(size, align, goal, ARCH_LOW_ADDRESS_LIMIT);
407} 392}
408 393
394void * __init __alloc_bootmem_low_nopanic(unsigned long size,
395 unsigned long align,
396 unsigned long goal)
397{
398 return ___alloc_bootmem_nopanic(size, align, goal,
399 ARCH_LOW_ADDRESS_LIMIT);
400}
401
409/** 402/**
410 * __alloc_bootmem_low_node - allocate low boot memory from a specific node 403 * __alloc_bootmem_low_node - allocate low boot memory from a specific node
411 * @pgdat: node to allocate from 404 * @pgdat: node to allocate from
diff --git a/mm/nommu.c b/mm/nommu.c
index 79c3cac87afa..e19328087534 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -29,6 +29,7 @@
29#include <linux/security.h> 29#include <linux/security.h>
30#include <linux/syscalls.h> 30#include <linux/syscalls.h>
31#include <linux/audit.h> 31#include <linux/audit.h>
32#include <linux/sched/sysctl.h>
32 33
33#include <asm/uaccess.h> 34#include <asm/uaccess.h>
34#include <asm/tlb.h> 35#include <asm/tlb.h>
@@ -139,10 +140,10 @@ unsigned int kobjsize(const void *objp)
139 return PAGE_SIZE << compound_order(page); 140 return PAGE_SIZE << compound_order(page);
140} 141}
141 142
142int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, 143long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
143 unsigned long start, int nr_pages, unsigned int foll_flags, 144 unsigned long start, unsigned long nr_pages,
144 struct page **pages, struct vm_area_struct **vmas, 145 unsigned int foll_flags, struct page **pages,
145 int *retry) 146 struct vm_area_struct **vmas, int *nonblocking)
146{ 147{
147 struct vm_area_struct *vma; 148 struct vm_area_struct *vma;
148 unsigned long vm_flags; 149 unsigned long vm_flags;
@@ -189,9 +190,10 @@ finish_or_fault:
189 * slab page or a secondary page from a compound page 190 * slab page or a secondary page from a compound page
190 * - don't permit access to VMAs that don't support it, such as I/O mappings 191 * - don't permit access to VMAs that don't support it, such as I/O mappings
191 */ 192 */
192int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, 193long get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
193 unsigned long start, int nr_pages, int write, int force, 194 unsigned long start, unsigned long nr_pages,
194 struct page **pages, struct vm_area_struct **vmas) 195 int write, int force, struct page **pages,
196 struct vm_area_struct **vmas)
195{ 197{
196 int flags = 0; 198 int flags = 0;
197 199
@@ -941,7 +943,7 @@ static int validate_mmap_request(struct file *file,
941 */ 943 */
942 mapping = file->f_mapping; 944 mapping = file->f_mapping;
943 if (!mapping) 945 if (!mapping)
944 mapping = file->f_path.dentry->d_inode->i_mapping; 946 mapping = file_inode(file)->i_mapping;
945 947
946 capabilities = 0; 948 capabilities = 0;
947 if (mapping && mapping->backing_dev_info) 949 if (mapping && mapping->backing_dev_info)
@@ -950,7 +952,7 @@ static int validate_mmap_request(struct file *file,
950 if (!capabilities) { 952 if (!capabilities) {
951 /* no explicit capabilities set, so assume some 953 /* no explicit capabilities set, so assume some
952 * defaults */ 954 * defaults */
953 switch (file->f_path.dentry->d_inode->i_mode & S_IFMT) { 955 switch (file_inode(file)->i_mode & S_IFMT) {
954 case S_IFREG: 956 case S_IFREG:
955 case S_IFBLK: 957 case S_IFBLK:
956 capabilities = BDI_CAP_MAP_COPY; 958 capabilities = BDI_CAP_MAP_COPY;
@@ -985,11 +987,11 @@ static int validate_mmap_request(struct file *file,
985 !(file->f_mode & FMODE_WRITE)) 987 !(file->f_mode & FMODE_WRITE))
986 return -EACCES; 988 return -EACCES;
987 989
988 if (IS_APPEND(file->f_path.dentry->d_inode) && 990 if (IS_APPEND(file_inode(file)) &&
989 (file->f_mode & FMODE_WRITE)) 991 (file->f_mode & FMODE_WRITE))
990 return -EACCES; 992 return -EACCES;
991 993
992 if (locks_verify_locked(file->f_path.dentry->d_inode)) 994 if (locks_verify_locked(file_inode(file)))
993 return -EAGAIN; 995 return -EAGAIN;
994 996
995 if (!(capabilities & BDI_CAP_MAP_DIRECT)) 997 if (!(capabilities & BDI_CAP_MAP_DIRECT))
@@ -1249,7 +1251,8 @@ unsigned long do_mmap_pgoff(struct file *file,
1249 unsigned long len, 1251 unsigned long len,
1250 unsigned long prot, 1252 unsigned long prot,
1251 unsigned long flags, 1253 unsigned long flags,
1252 unsigned long pgoff) 1254 unsigned long pgoff,
1255 unsigned long *populate)
1253{ 1256{
1254 struct vm_area_struct *vma; 1257 struct vm_area_struct *vma;
1255 struct vm_region *region; 1258 struct vm_region *region;
@@ -1259,6 +1262,8 @@ unsigned long do_mmap_pgoff(struct file *file,
1259 1262
1260 kenter(",%lx,%lx,%lx,%lx,%lx", addr, len, prot, flags, pgoff); 1263 kenter(",%lx,%lx,%lx,%lx,%lx", addr, len, prot, flags, pgoff);
1261 1264
1265 *populate = 0;
1266
1262 /* decide whether we should attempt the mapping, and if so what sort of 1267 /* decide whether we should attempt the mapping, and if so what sort of
1263 * mapping */ 1268 * mapping */
1264 ret = validate_mmap_request(file, addr, len, prot, flags, pgoff, 1269 ret = validate_mmap_request(file, addr, len, prot, flags, pgoff,
@@ -1322,8 +1327,8 @@ unsigned long do_mmap_pgoff(struct file *file,
1322 continue; 1327 continue;
1323 1328
1324 /* search for overlapping mappings on the same file */ 1329 /* search for overlapping mappings on the same file */
1325 if (pregion->vm_file->f_path.dentry->d_inode != 1330 if (file_inode(pregion->vm_file) !=
1326 file->f_path.dentry->d_inode) 1331 file_inode(file))
1327 continue; 1332 continue;
1328 1333
1329 if (pregion->vm_pgoff >= pgend) 1334 if (pregion->vm_pgoff >= pgend)
@@ -1814,9 +1819,11 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
1814 return ret; 1819 return ret;
1815} 1820}
1816 1821
1817struct page *follow_page(struct vm_area_struct *vma, unsigned long address, 1822struct page *follow_page_mask(struct vm_area_struct *vma,
1818 unsigned int foll_flags) 1823 unsigned long address, unsigned int flags,
1824 unsigned int *page_mask)
1819{ 1825{
1826 *page_mask = 0;
1820 return NULL; 1827 return NULL;
1821} 1828}
1822 1829
@@ -1903,7 +1910,7 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
1903 */ 1910 */
1904 free -= global_page_state(NR_SHMEM); 1911 free -= global_page_state(NR_SHMEM);
1905 1912
1906 free += nr_swap_pages; 1913 free += get_nr_swap_pages();
1907 1914
1908 /* 1915 /*
1909 * Any slabs which are created with the 1916 * Any slabs which are created with the
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 0399f146ae49..79e451a78c9e 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -386,8 +386,10 @@ static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order,
386 cpuset_print_task_mems_allowed(current); 386 cpuset_print_task_mems_allowed(current);
387 task_unlock(current); 387 task_unlock(current);
388 dump_stack(); 388 dump_stack();
389 mem_cgroup_print_oom_info(memcg, p); 389 if (memcg)
390 show_mem(SHOW_MEM_FILTER_NODES); 390 mem_cgroup_print_oom_info(memcg, p);
391 else
392 show_mem(SHOW_MEM_FILTER_NODES);
391 if (sysctl_oom_dump_tasks) 393 if (sysctl_oom_dump_tasks)
392 dump_tasks(memcg, nodemask); 394 dump_tasks(memcg, nodemask);
393} 395}
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 3734cefd4de4..742c40583159 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -35,6 +35,7 @@
35#include <linux/buffer_head.h> /* __set_page_dirty_buffers */ 35#include <linux/buffer_head.h> /* __set_page_dirty_buffers */
36#include <linux/pagevec.h> 36#include <linux/pagevec.h>
37#include <linux/timer.h> 37#include <linux/timer.h>
38#include <linux/sched/rt.h>
38#include <trace/events/writeback.h> 39#include <trace/events/writeback.h>
39 40
40/* 41/*
@@ -240,6 +241,9 @@ static unsigned long global_dirtyable_memory(void)
240 if (!vm_highmem_is_dirtyable) 241 if (!vm_highmem_is_dirtyable)
241 x -= highmem_dirtyable_memory(x); 242 x -= highmem_dirtyable_memory(x);
242 243
244 /* Subtract min_free_kbytes */
245 x -= min_t(unsigned long, x, min_free_kbytes >> (PAGE_SHIFT - 10));
246
243 return x + 1; /* Ensure that we never return 0 */ 247 return x + 1; /* Ensure that we never return 0 */
244} 248}
245 249
@@ -2291,3 +2295,27 @@ int mapping_tagged(struct address_space *mapping, int tag)
2291 return radix_tree_tagged(&mapping->page_tree, tag); 2295 return radix_tree_tagged(&mapping->page_tree, tag);
2292} 2296}
2293EXPORT_SYMBOL(mapping_tagged); 2297EXPORT_SYMBOL(mapping_tagged);
2298
2299/**
2300 * wait_for_stable_page() - wait for writeback to finish, if necessary.
2301 * @page: The page to wait on.
2302 *
2303 * This function determines if the given page is related to a backing device
2304 * that requires page contents to be held stable during writeback. If so, then
2305 * it will wait for any pending writeback to complete.
2306 */
2307void wait_for_stable_page(struct page *page)
2308{
2309 struct address_space *mapping = page_mapping(page);
2310 struct backing_dev_info *bdi = mapping->backing_dev_info;
2311
2312 if (!bdi_cap_stable_pages_required(bdi))
2313 return;
2314#ifdef CONFIG_NEED_BOUNCE_POOL
2315 if (mapping->host->i_sb->s_flags & MS_SNAP_STABLE)
2316 return;
2317#endif /* CONFIG_NEED_BOUNCE_POOL */
2318
2319 wait_on_page_writeback(page);
2320}
2321EXPORT_SYMBOL_GPL(wait_for_stable_page);
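
A typical consumer of the new helper would be a page_mkwrite-style fault handler, which must not let userspace redirty a page that a stable-pages backing device is still writing out. The sketch below is illustrative only, assuming the 3.9-era ->page_mkwrite prototype; the handler name and the surrounding locking are assumptions, not code from this patch:

#include <linux/mm.h>
#include <linux/pagemap.h>

/*
 * Illustrative only: a ->page_mkwrite handler that honours stable pages.
 * "example_page_mkwrite" is a hypothetical name, not part of this patch.
 */
static int example_page_mkwrite(struct vm_area_struct *vma,
				struct vm_fault *vmf)
{
	struct page *page = vmf->page;

	lock_page(page);
	set_page_dirty(page);
	/*
	 * If the backing device needs stable page contents, this blocks
	 * until any in-flight writeback of the page has completed;
	 * otherwise it returns immediately.
	 */
	wait_for_stable_page(page);
	return VM_FAULT_LOCKED;
}
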
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index bc6cc0e913bd..0dade3f18f7d 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -58,6 +58,7 @@
58#include <linux/prefetch.h> 58#include <linux/prefetch.h>
59#include <linux/migrate.h> 59#include <linux/migrate.h>
60#include <linux/page-debug-flags.h> 60#include <linux/page-debug-flags.h>
61#include <linux/sched/rt.h>
61 62
62#include <asm/tlbflush.h> 63#include <asm/tlbflush.h>
63#include <asm/div64.h> 64#include <asm/div64.h>
@@ -201,11 +202,18 @@ static unsigned long __meminitdata nr_all_pages;
201static unsigned long __meminitdata dma_reserve; 202static unsigned long __meminitdata dma_reserve;
202 203
203#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP 204#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
205/* Movable memory ranges, will also be used by memblock subsystem. */
206struct movablemem_map movablemem_map = {
207 .acpi = false,
208 .nr_map = 0,
209};
210
204static unsigned long __meminitdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES]; 211static unsigned long __meminitdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES];
205static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES]; 212static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES];
206static unsigned long __initdata required_kernelcore; 213static unsigned long __initdata required_kernelcore;
207static unsigned long __initdata required_movablecore; 214static unsigned long __initdata required_movablecore;
208static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES]; 215static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES];
216static unsigned long __meminitdata zone_movable_limit[MAX_NUMNODES];
209 217
210/* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */ 218/* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */
211int movable_zone; 219int movable_zone;
@@ -239,15 +247,20 @@ static int page_outside_zone_boundaries(struct zone *zone, struct page *page)
239 int ret = 0; 247 int ret = 0;
240 unsigned seq; 248 unsigned seq;
241 unsigned long pfn = page_to_pfn(page); 249 unsigned long pfn = page_to_pfn(page);
250 unsigned long sp, start_pfn;
242 251
243 do { 252 do {
244 seq = zone_span_seqbegin(zone); 253 seq = zone_span_seqbegin(zone);
245 if (pfn >= zone->zone_start_pfn + zone->spanned_pages) 254 start_pfn = zone->zone_start_pfn;
246 ret = 1; 255 sp = zone->spanned_pages;
247 else if (pfn < zone->zone_start_pfn) 256 if (!zone_spans_pfn(zone, pfn))
248 ret = 1; 257 ret = 1;
249 } while (zone_span_seqretry(zone, seq)); 258 } while (zone_span_seqretry(zone, seq));
250 259
260 if (ret)
261 pr_err("page %lu outside zone [ %lu - %lu ]\n",
262 pfn, start_pfn, start_pfn + sp);
263
251 return ret; 264 return ret;
252} 265}
253 266
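
The open-coded start/span comparisons above are folded into zone_spans_pfn(). A minimal standalone model of the presumed helper semantics (the struct zone here is a mock with only the two relevant fields, not the kernel's definition):

#include <stdbool.h>
#include <stdio.h>

/* Mock of the two struct zone fields the check needs (illustration only). */
struct zone {
	unsigned long zone_start_pfn;
	unsigned long spanned_pages;
};

/* Presumed semantics: start_pfn <= pfn < start_pfn + spanned_pages. */
static bool zone_spans_pfn(const struct zone *zone, unsigned long pfn)
{
	return pfn >= zone->zone_start_pfn &&
	       pfn < zone->zone_start_pfn + zone->spanned_pages;
}

int main(void)
{
	struct zone z = { .zone_start_pfn = 0x1000, .spanned_pages = 0x800 };

	printf("%d %d %d\n",
	       zone_spans_pfn(&z, 0x0fff),	/* 0: just below the zone   */
	       zone_spans_pfn(&z, 0x1000),	/* 1: first pfn of the zone */
	       zone_spans_pfn(&z, 0x1800));	/* 0: one past the last pfn */
	return 0;
}
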
@@ -287,7 +300,7 @@ static void bad_page(struct page *page)
287 300
288 /* Don't complain about poisoned pages */ 301 /* Don't complain about poisoned pages */
289 if (PageHWPoison(page)) { 302 if (PageHWPoison(page)) {
290 reset_page_mapcount(page); /* remove PageBuddy */ 303 page_mapcount_reset(page); /* remove PageBuddy */
291 return; 304 return;
292 } 305 }
293 306
@@ -319,8 +332,8 @@ static void bad_page(struct page *page)
319 dump_stack(); 332 dump_stack();
320out: 333out:
321 /* Leave bad fields for debug, except PageBuddy could make trouble */ 334 /* Leave bad fields for debug, except PageBuddy could make trouble */
322 reset_page_mapcount(page); /* remove PageBuddy */ 335 page_mapcount_reset(page); /* remove PageBuddy */
323 add_taint(TAINT_BAD_PAGE); 336 add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
324} 337}
325 338
326/* 339/*
@@ -532,6 +545,8 @@ static inline void __free_one_page(struct page *page,
532 unsigned long uninitialized_var(buddy_idx); 545 unsigned long uninitialized_var(buddy_idx);
533 struct page *buddy; 546 struct page *buddy;
534 547
548 VM_BUG_ON(!zone_is_initialized(zone));
549
535 if (unlikely(PageCompound(page))) 550 if (unlikely(PageCompound(page)))
536 if (unlikely(destroy_compound_page(page, order))) 551 if (unlikely(destroy_compound_page(page, order)))
537 return; 552 return;
@@ -605,7 +620,7 @@ static inline int free_pages_check(struct page *page)
605 bad_page(page); 620 bad_page(page);
606 return 1; 621 return 1;
607 } 622 }
608 reset_page_last_nid(page); 623 page_nid_reset_last(page);
609 if (page->flags & PAGE_FLAGS_CHECK_AT_PREP) 624 if (page->flags & PAGE_FLAGS_CHECK_AT_PREP)
610 page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; 625 page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
611 return 0; 626 return 0;
@@ -665,7 +680,7 @@ static void free_pcppages_bulk(struct zone *zone, int count,
665 /* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */ 680 /* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */
666 __free_one_page(page, zone, 0, mt); 681 __free_one_page(page, zone, 0, mt);
667 trace_mm_page_pcpu_drain(page, 0, mt); 682 trace_mm_page_pcpu_drain(page, 0, mt);
668 if (likely(get_pageblock_migratetype(page) != MIGRATE_ISOLATE)) { 683 if (likely(!is_migrate_isolate_page(page))) {
669 __mod_zone_page_state(zone, NR_FREE_PAGES, 1); 684 __mod_zone_page_state(zone, NR_FREE_PAGES, 1);
670 if (is_migrate_cma(mt)) 685 if (is_migrate_cma(mt))
671 __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, 1); 686 __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, 1);
@@ -683,7 +698,7 @@ static void free_one_page(struct zone *zone, struct page *page, int order,
683 zone->pages_scanned = 0; 698 zone->pages_scanned = 0;
684 699
685 __free_one_page(page, zone, order, migratetype); 700 __free_one_page(page, zone, order, migratetype);
686 if (unlikely(migratetype != MIGRATE_ISOLATE)) 701 if (unlikely(!is_migrate_isolate(migratetype)))
687 __mod_zone_freepage_state(zone, 1 << order, migratetype); 702 __mod_zone_freepage_state(zone, 1 << order, migratetype);
688 spin_unlock(&zone->lock); 703 spin_unlock(&zone->lock);
689} 704}
@@ -773,6 +788,10 @@ void __init init_cma_reserved_pageblock(struct page *page)
773 set_pageblock_migratetype(page, MIGRATE_CMA); 788 set_pageblock_migratetype(page, MIGRATE_CMA);
774 __free_pages(page, pageblock_order); 789 __free_pages(page, pageblock_order);
775 totalram_pages += pageblock_nr_pages; 790 totalram_pages += pageblock_nr_pages;
791#ifdef CONFIG_HIGHMEM
792 if (PageHighMem(page))
793 totalhigh_pages += pageblock_nr_pages;
794#endif
776} 795}
777#endif 796#endif
778 797
@@ -911,7 +930,9 @@ static int fallbacks[MIGRATE_TYPES][4] = {
911 [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE }, 930 [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE },
912#endif 931#endif
913 [MIGRATE_RESERVE] = { MIGRATE_RESERVE }, /* Never used */ 932 [MIGRATE_RESERVE] = { MIGRATE_RESERVE }, /* Never used */
933#ifdef CONFIG_MEMORY_ISOLATION
914 [MIGRATE_ISOLATE] = { MIGRATE_RESERVE }, /* Never used */ 934 [MIGRATE_ISOLATE] = { MIGRATE_RESERVE }, /* Never used */
935#endif
915}; 936};
916 937
917/* 938/*
@@ -976,9 +997,9 @@ int move_freepages_block(struct zone *zone, struct page *page,
976 end_pfn = start_pfn + pageblock_nr_pages - 1; 997 end_pfn = start_pfn + pageblock_nr_pages - 1;
977 998
978 /* Do not cross zone boundaries */ 999 /* Do not cross zone boundaries */
979 if (start_pfn < zone->zone_start_pfn) 1000 if (!zone_spans_pfn(zone, start_pfn))
980 start_page = page; 1001 start_page = page;
981 if (end_pfn >= zone->zone_start_pfn + zone->spanned_pages) 1002 if (!zone_spans_pfn(zone, end_pfn))
982 return 0; 1003 return 0;
983 1004
984 return move_freepages(zone, start_page, end_page, migratetype); 1005 return move_freepages(zone, start_page, end_page, migratetype);
@@ -1137,7 +1158,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
1137 list_add_tail(&page->lru, list); 1158 list_add_tail(&page->lru, list);
1138 if (IS_ENABLED(CONFIG_CMA)) { 1159 if (IS_ENABLED(CONFIG_CMA)) {
1139 mt = get_pageblock_migratetype(page); 1160 mt = get_pageblock_migratetype(page);
1140 if (!is_migrate_cma(mt) && mt != MIGRATE_ISOLATE) 1161 if (!is_migrate_cma(mt) && !is_migrate_isolate(mt))
1141 mt = migratetype; 1162 mt = migratetype;
1142 } 1163 }
1143 set_freepage_migratetype(page, mt); 1164 set_freepage_migratetype(page, mt);
@@ -1272,7 +1293,7 @@ void mark_free_pages(struct zone *zone)
1272 1293
1273 spin_lock_irqsave(&zone->lock, flags); 1294 spin_lock_irqsave(&zone->lock, flags);
1274 1295
1275 max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; 1296 max_zone_pfn = zone_end_pfn(zone);
1276 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) 1297 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
1277 if (pfn_valid(pfn)) { 1298 if (pfn_valid(pfn)) {
1278 struct page *page = pfn_to_page(pfn); 1299 struct page *page = pfn_to_page(pfn);
@@ -1321,7 +1342,7 @@ void free_hot_cold_page(struct page *page, int cold)
1321 * excessively into the page allocator 1342 * excessively into the page allocator
1322 */ 1343 */
1323 if (migratetype >= MIGRATE_PCPTYPES) { 1344 if (migratetype >= MIGRATE_PCPTYPES) {
1324 if (unlikely(migratetype == MIGRATE_ISOLATE)) { 1345 if (unlikely(is_migrate_isolate(migratetype))) {
1325 free_one_page(zone, page, 0, migratetype); 1346 free_one_page(zone, page, 0, migratetype);
1326 goto out; 1347 goto out;
1327 } 1348 }
@@ -1384,14 +1405,8 @@ void split_page(struct page *page, unsigned int order)
1384 set_page_refcounted(page + i); 1405 set_page_refcounted(page + i);
1385} 1406}
1386 1407
1387/* 1408static int __isolate_free_page(struct page *page, unsigned int order)
1388 * Similar to the split_page family of functions except that the page
1389 * required at the given order and being isolated now to prevent races
1390 * with parallel allocators
1391 */
1392int capture_free_page(struct page *page, int alloc_order, int migratetype)
1393{ 1409{
1394 unsigned int order;
1395 unsigned long watermark; 1410 unsigned long watermark;
1396 struct zone *zone; 1411 struct zone *zone;
1397 int mt; 1412 int mt;
@@ -1399,16 +1414,15 @@ int capture_free_page(struct page *page, int alloc_order, int migratetype)
1399 BUG_ON(!PageBuddy(page)); 1414 BUG_ON(!PageBuddy(page));
1400 1415
1401 zone = page_zone(page); 1416 zone = page_zone(page);
1402 order = page_order(page);
1403 mt = get_pageblock_migratetype(page); 1417 mt = get_pageblock_migratetype(page);
1404 1418
1405 if (mt != MIGRATE_ISOLATE) { 1419 if (!is_migrate_isolate(mt)) {
1406 /* Obey watermarks as if the page was being allocated */ 1420 /* Obey watermarks as if the page was being allocated */
1407 watermark = low_wmark_pages(zone) + (1 << order); 1421 watermark = low_wmark_pages(zone) + (1 << order);
1408 if (!zone_watermark_ok(zone, 0, watermark, 0, 0)) 1422 if (!zone_watermark_ok(zone, 0, watermark, 0, 0))
1409 return 0; 1423 return 0;
1410 1424
1411 __mod_zone_freepage_state(zone, -(1UL << alloc_order), mt); 1425 __mod_zone_freepage_state(zone, -(1UL << order), mt);
1412 } 1426 }
1413 1427
1414 /* Remove page from free list */ 1428 /* Remove page from free list */
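
The watermark test above only lets a free page be isolated if doing so would not dip the zone below its low watermark, treating the 2^order pages as if they were being allocated. A standalone sketch of that threshold arithmetic with made-up numbers (the real zone_watermark_ok() also inspects per-order free lists, which is omitted here):

#include <stdio.h>

int main(void)
{
	unsigned long low_wmark = 1000;		/* mock low watermark, pages  */
	unsigned long free_pages = 1005;	/* mock NR_FREE_PAGES in zone */
	unsigned int order = 3;			/* a 2^3 = 8 page buddy block */

	/* Obey the watermark as if the 2^order pages were being allocated. */
	unsigned long watermark = low_wmark + (1UL << order);

	if (free_pages < watermark)
		printf("refuse: only %lu free, need %lu\n",
		       free_pages, watermark);
	else
		printf("isolate %lu pages\n", 1UL << order);
	return 0;
}
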
@@ -1416,22 +1430,18 @@ int capture_free_page(struct page *page, int alloc_order, int migratetype)
1416 zone->free_area[order].nr_free--; 1430 zone->free_area[order].nr_free--;
1417 rmv_page_order(page); 1431 rmv_page_order(page);
1418 1432
1419 if (alloc_order != order) 1433 /* Set the pageblock if the isolated page is at least a pageblock */
1420 expand(zone, page, alloc_order, order,
1421 &zone->free_area[order], migratetype);
1422
1423 /* Set the pageblock if the captured page is at least a pageblock */
1424 if (order >= pageblock_order - 1) { 1434 if (order >= pageblock_order - 1) {
1425 struct page *endpage = page + (1 << order) - 1; 1435 struct page *endpage = page + (1 << order) - 1;
1426 for (; page < endpage; page += pageblock_nr_pages) { 1436 for (; page < endpage; page += pageblock_nr_pages) {
1427 int mt = get_pageblock_migratetype(page); 1437 int mt = get_pageblock_migratetype(page);
1428 if (mt != MIGRATE_ISOLATE && !is_migrate_cma(mt)) 1438 if (!is_migrate_isolate(mt) && !is_migrate_cma(mt))
1429 set_pageblock_migratetype(page, 1439 set_pageblock_migratetype(page,
1430 MIGRATE_MOVABLE); 1440 MIGRATE_MOVABLE);
1431 } 1441 }
1432 } 1442 }
1433 1443
1434 return 1UL << alloc_order; 1444 return 1UL << order;
1435} 1445}
1436 1446
1437/* 1447/*
@@ -1449,10 +1459,9 @@ int split_free_page(struct page *page)
1449 unsigned int order; 1459 unsigned int order;
1450 int nr_pages; 1460 int nr_pages;
1451 1461
1452 BUG_ON(!PageBuddy(page));
1453 order = page_order(page); 1462 order = page_order(page);
1454 1463
1455 nr_pages = capture_free_page(page, order, 0); 1464 nr_pages = __isolate_free_page(page, order);
1456 if (!nr_pages) 1465 if (!nr_pages)
1457 return 0; 1466 return 0;
1458 1467
@@ -2136,8 +2145,6 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
2136 bool *contended_compaction, bool *deferred_compaction, 2145 bool *contended_compaction, bool *deferred_compaction,
2137 unsigned long *did_some_progress) 2146 unsigned long *did_some_progress)
2138{ 2147{
2139 struct page *page = NULL;
2140
2141 if (!order) 2148 if (!order)
2142 return NULL; 2149 return NULL;
2143 2150
@@ -2149,16 +2156,12 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
2149 current->flags |= PF_MEMALLOC; 2156 current->flags |= PF_MEMALLOC;
2150 *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask, 2157 *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask,
2151 nodemask, sync_migration, 2158 nodemask, sync_migration,
2152 contended_compaction, &page); 2159 contended_compaction);
2153 current->flags &= ~PF_MEMALLOC; 2160 current->flags &= ~PF_MEMALLOC;
2154 2161
2155 /* If compaction captured a page, prep and use it */
2156 if (page) {
2157 prep_new_page(page, order, gfp_mask);
2158 goto got_page;
2159 }
2160
2161 if (*did_some_progress != COMPACT_SKIPPED) { 2162 if (*did_some_progress != COMPACT_SKIPPED) {
2163 struct page *page;
2164
2162 /* Page migration frees to the PCP lists but we want merging */ 2165 /* Page migration frees to the PCP lists but we want merging */
2163 drain_pages(get_cpu()); 2166 drain_pages(get_cpu());
2164 put_cpu(); 2167 put_cpu();
@@ -2168,7 +2171,6 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
2168 alloc_flags & ~ALLOC_NO_WATERMARKS, 2171 alloc_flags & ~ALLOC_NO_WATERMARKS,
2169 preferred_zone, migratetype); 2172 preferred_zone, migratetype);
2170 if (page) { 2173 if (page) {
2171got_page:
2172 preferred_zone->compact_blockskip_flush = false; 2174 preferred_zone->compact_blockskip_flush = false;
2173 preferred_zone->compact_considered = 0; 2175 preferred_zone->compact_considered = 0;
2174 preferred_zone->compact_defer_shift = 0; 2176 preferred_zone->compact_defer_shift = 0;
@@ -2629,10 +2631,17 @@ retry_cpuset:
2629 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, 2631 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
2630 zonelist, high_zoneidx, alloc_flags, 2632 zonelist, high_zoneidx, alloc_flags,
2631 preferred_zone, migratetype); 2633 preferred_zone, migratetype);
2632 if (unlikely(!page)) 2634 if (unlikely(!page)) {
2635 /*
2636 * Runtime PM, block IO and its error handling path
2637 * can deadlock because I/O on the device might not
2638 * complete.
2639 */
2640 gfp_mask = memalloc_noio_flags(gfp_mask);
2633 page = __alloc_pages_slowpath(gfp_mask, order, 2641 page = __alloc_pages_slowpath(gfp_mask, order,
2634 zonelist, high_zoneidx, nodemask, 2642 zonelist, high_zoneidx, nodemask,
2635 preferred_zone, migratetype); 2643 preferred_zone, migratetype);
2644 }
2636 2645
2637 trace_mm_page_alloc(page, order, gfp_mask, migratetype); 2646 trace_mm_page_alloc(page, order, gfp_mask, migratetype);
2638 2647
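
The slow path now passes the gfp mask through memalloc_noio_flags() so that allocations issued from runtime-PM and block-I/O error paths cannot recurse into I/O and deadlock. A simplified standalone model of that idea follows; the bit values and the task-flag check are illustrative stand-ins, not the kernel's implementation:

#include <stdio.h>

/* Illustrative bit values and flag, not the kernel's definitions. */
#define __GFP_IO	0x40u
#define __GFP_FS	0x80u
#define GFP_KERNEL	(0x10u | __GFP_IO | __GFP_FS)

static int task_in_noio_context;	/* stand-in for a per-task flag */

/*
 * Presumed effect: when the current task has marked itself as running in
 * a context where I/O must not be issued, strip the bits that would let
 * reclaim enter the filesystem or block layer.
 */
static unsigned int memalloc_noio_flags(unsigned int gfp_mask)
{
	if (task_in_noio_context)
		gfp_mask &= ~(__GFP_IO | __GFP_FS);
	return gfp_mask;
}

int main(void)
{
	task_in_noio_context = 1;
	printf("%#x -> %#x\n", GFP_KERNEL, memalloc_noio_flags(GFP_KERNEL));
	return 0;
}
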
@@ -2804,18 +2813,27 @@ void free_pages_exact(void *virt, size_t size)
2804} 2813}
2805EXPORT_SYMBOL(free_pages_exact); 2814EXPORT_SYMBOL(free_pages_exact);
2806 2815
2807static unsigned int nr_free_zone_pages(int offset) 2816/**
2817 * nr_free_zone_pages - count number of pages beyond high watermark
2818 * @offset: The zone index of the highest zone
2819 *
 2820 * nr_free_zone_pages() counts the number of pages which are beyond the
2821 * high watermark within all zones at or below a given zone index. For each
2822 * zone, the number of pages is calculated as:
 2823 * managed_pages - high_pages
2824 */
2825static unsigned long nr_free_zone_pages(int offset)
2808{ 2826{
2809 struct zoneref *z; 2827 struct zoneref *z;
2810 struct zone *zone; 2828 struct zone *zone;
2811 2829
2812 /* Just pick one node, since fallback list is circular */ 2830 /* Just pick one node, since fallback list is circular */
2813 unsigned int sum = 0; 2831 unsigned long sum = 0;
2814 2832
2815 struct zonelist *zonelist = node_zonelist(numa_node_id(), GFP_KERNEL); 2833 struct zonelist *zonelist = node_zonelist(numa_node_id(), GFP_KERNEL);
2816 2834
2817 for_each_zone_zonelist(zone, z, zonelist, offset) { 2835 for_each_zone_zonelist(zone, z, zonelist, offset) {
2818 unsigned long size = zone->present_pages; 2836 unsigned long size = zone->managed_pages;
2819 unsigned long high = high_wmark_pages(zone); 2837 unsigned long high = high_wmark_pages(zone);
2820 if (size > high) 2838 if (size > high)
2821 sum += size - high; 2839 sum += size - high;
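
A standalone sketch of the per-zone sum documented above, using mock zone sizes and high watermarks and the same managed_pages-based accounting as the loop in nr_free_zone_pages():

#include <stdio.h>

struct zone_stat {			/* mock per-zone numbers, in pages */
	unsigned long managed_pages;
	unsigned long high_wmark;
};

int main(void)
{
	struct zone_stat zones[] = {
		{ .managed_pages = 4096,    .high_wmark = 128  },	/* DMA    */
		{ .managed_pages = 1048576, .high_wmark = 8192 },	/* Normal */
	};
	unsigned long sum = 0;
	unsigned int i;

	/* For each zone, count only the pages above its high watermark. */
	for (i = 0; i < sizeof(zones) / sizeof(zones[0]); i++)
		if (zones[i].managed_pages > zones[i].high_wmark)
			sum += zones[i].managed_pages - zones[i].high_wmark;

	printf("pages beyond the high watermark: %lu\n", sum);
	return 0;
}
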
@@ -2824,19 +2842,25 @@ static unsigned int nr_free_zone_pages(int offset)
2824 return sum; 2842 return sum;
2825} 2843}
2826 2844
2827/* 2845/**
2828 * Amount of free RAM allocatable within ZONE_DMA and ZONE_NORMAL 2846 * nr_free_buffer_pages - count number of pages beyond high watermark
2847 *
2848 * nr_free_buffer_pages() counts the number of pages which are beyond the high
2849 * watermark within ZONE_DMA and ZONE_NORMAL.
2829 */ 2850 */
2830unsigned int nr_free_buffer_pages(void) 2851unsigned long nr_free_buffer_pages(void)
2831{ 2852{
2832 return nr_free_zone_pages(gfp_zone(GFP_USER)); 2853 return nr_free_zone_pages(gfp_zone(GFP_USER));
2833} 2854}
2834EXPORT_SYMBOL_GPL(nr_free_buffer_pages); 2855EXPORT_SYMBOL_GPL(nr_free_buffer_pages);
2835 2856
2836/* 2857/**
2837 * Amount of free RAM allocatable within all zones 2858 * nr_free_pagecache_pages - count number of pages beyond high watermark
2859 *
2860 * nr_free_pagecache_pages() counts the number of pages which are beyond the
2861 * high watermark within all zones.
2838 */ 2862 */
2839unsigned int nr_free_pagecache_pages(void) 2863unsigned long nr_free_pagecache_pages(void)
2840{ 2864{
2841 return nr_free_zone_pages(gfp_zone(GFP_HIGHUSER_MOVABLE)); 2865 return nr_free_zone_pages(gfp_zone(GFP_HIGHUSER_MOVABLE));
2842} 2866}
@@ -2868,7 +2892,7 @@ void si_meminfo_node(struct sysinfo *val, int nid)
2868 val->totalram = pgdat->node_present_pages; 2892 val->totalram = pgdat->node_present_pages;
2869 val->freeram = node_page_state(nid, NR_FREE_PAGES); 2893 val->freeram = node_page_state(nid, NR_FREE_PAGES);
2870#ifdef CONFIG_HIGHMEM 2894#ifdef CONFIG_HIGHMEM
2871 val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].present_pages; 2895 val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].managed_pages;
2872 val->freehigh = zone_page_state(&pgdat->node_zones[ZONE_HIGHMEM], 2896 val->freehigh = zone_page_state(&pgdat->node_zones[ZONE_HIGHMEM],
2873 NR_FREE_PAGES); 2897 NR_FREE_PAGES);
2874#else 2898#else
@@ -2911,7 +2935,9 @@ static void show_migration_types(unsigned char type)
2911#ifdef CONFIG_CMA 2935#ifdef CONFIG_CMA
2912 [MIGRATE_CMA] = 'C', 2936 [MIGRATE_CMA] = 'C',
2913#endif 2937#endif
2938#ifdef CONFIG_MEMORY_ISOLATION
2914 [MIGRATE_ISOLATE] = 'I', 2939 [MIGRATE_ISOLATE] = 'I',
2940#endif
2915 }; 2941 };
2916 char tmp[MIGRATE_TYPES + 1]; 2942 char tmp[MIGRATE_TYPES + 1];
2917 char *p = tmp; 2943 char *p = tmp;
@@ -3250,7 +3276,7 @@ static int find_next_best_node(int node, nodemask_t *used_node_mask)
3250{ 3276{
3251 int n, val; 3277 int n, val;
3252 int min_val = INT_MAX; 3278 int min_val = INT_MAX;
3253 int best_node = -1; 3279 int best_node = NUMA_NO_NODE;
3254 const struct cpumask *tmp = cpumask_of_node(0); 3280 const struct cpumask *tmp = cpumask_of_node(0);
3255 3281
3256 /* Use the local node if we haven't already */ 3282 /* Use the local node if we haven't already */
@@ -3794,7 +3820,7 @@ static void setup_zone_migrate_reserve(struct zone *zone)
3794 * the block. 3820 * the block.
3795 */ 3821 */
3796 start_pfn = zone->zone_start_pfn; 3822 start_pfn = zone->zone_start_pfn;
3797 end_pfn = start_pfn + zone->spanned_pages; 3823 end_pfn = zone_end_pfn(zone);
3798 start_pfn = roundup(start_pfn, pageblock_nr_pages); 3824 start_pfn = roundup(start_pfn, pageblock_nr_pages);
3799 reserve = roundup(min_wmark_pages(zone), pageblock_nr_pages) >> 3825 reserve = roundup(min_wmark_pages(zone), pageblock_nr_pages) >>
3800 pageblock_order; 3826 pageblock_order;
@@ -3890,8 +3916,8 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
3890 set_page_links(page, zone, nid, pfn); 3916 set_page_links(page, zone, nid, pfn);
3891 mminit_verify_page_links(page, zone, nid, pfn); 3917 mminit_verify_page_links(page, zone, nid, pfn);
3892 init_page_count(page); 3918 init_page_count(page);
3893 reset_page_mapcount(page); 3919 page_mapcount_reset(page);
3894 reset_page_last_nid(page); 3920 page_nid_reset_last(page);
3895 SetPageReserved(page); 3921 SetPageReserved(page);
3896 /* 3922 /*
3897 * Mark the block movable so that blocks are reserved for 3923 * Mark the block movable so that blocks are reserved for
@@ -3908,7 +3934,7 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
3908 * pfn out of zone. 3934 * pfn out of zone.
3909 */ 3935 */
3910 if ((z->zone_start_pfn <= pfn) 3936 if ((z->zone_start_pfn <= pfn)
3911 && (pfn < z->zone_start_pfn + z->spanned_pages) 3937 && (pfn < zone_end_pfn(z))
3912 && !(pfn & (pageblock_nr_pages - 1))) 3938 && !(pfn & (pageblock_nr_pages - 1)))
3913 set_pageblock_migratetype(page, MIGRATE_MOVABLE); 3939 set_pageblock_migratetype(page, MIGRATE_MOVABLE);
3914 3940
@@ -3946,7 +3972,7 @@ static int __meminit zone_batchsize(struct zone *zone)
3946 * 3972 *
3947 * OK, so we don't know how big the cache is. So guess. 3973 * OK, so we don't know how big the cache is. So guess.
3948 */ 3974 */
3949 batch = zone->present_pages / 1024; 3975 batch = zone->managed_pages / 1024;
3950 if (batch * PAGE_SIZE > 512 * 1024) 3976 if (batch * PAGE_SIZE > 512 * 1024)
3951 batch = (512 * 1024) / PAGE_SIZE; 3977 batch = (512 * 1024) / PAGE_SIZE;
3952 batch /= 4; /* We effectively *= 4 below */ 3978 batch /= 4; /* We effectively *= 4 below */
@@ -4030,7 +4056,7 @@ static void __meminit setup_zone_pageset(struct zone *zone)
4030 4056
4031 if (percpu_pagelist_fraction) 4057 if (percpu_pagelist_fraction)
4032 setup_pagelist_highmark(pcp, 4058 setup_pagelist_highmark(pcp,
4033 (zone->present_pages / 4059 (zone->managed_pages /
4034 percpu_pagelist_fraction)); 4060 percpu_pagelist_fraction));
4035 } 4061 }
4036} 4062}
@@ -4386,6 +4412,77 @@ static unsigned long __meminit zone_absent_pages_in_node(int nid,
4386 return __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn); 4412 return __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn);
4387} 4413}
4388 4414
4415/**
4416 * sanitize_zone_movable_limit - Sanitize the zone_movable_limit array.
4417 *
 4418 * zone_movable_limit is initialized to 0. This function will try to get
 4419 * the first ZONE_MOVABLE pfn of each node from movablemem_map, and
 4420 * assign them to zone_movable_limit.
4421 * zone_movable_limit[nid] == 0 means no limit for the node.
4422 *
4423 * Note: Each range is represented as [start_pfn, end_pfn)
4424 */
4425static void __meminit sanitize_zone_movable_limit(void)
4426{
4427 int map_pos = 0, i, nid;
4428 unsigned long start_pfn, end_pfn;
4429
4430 if (!movablemem_map.nr_map)
4431 return;
4432
4433 /* Iterate all ranges from minimum to maximum */
4434 for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) {
4435 /*
 4436 * If we have already found the lowest ZONE_MOVABLE pfn of the node
 4437 * specified by the user, just go on to check the next range.
4438 */
4439 if (zone_movable_limit[nid])
4440 continue;
4441
4442#ifdef CONFIG_ZONE_DMA
4443 /* Skip DMA memory. */
4444 if (start_pfn < arch_zone_highest_possible_pfn[ZONE_DMA])
4445 start_pfn = arch_zone_highest_possible_pfn[ZONE_DMA];
4446#endif
4447
4448#ifdef CONFIG_ZONE_DMA32
4449 /* Skip DMA32 memory. */
4450 if (start_pfn < arch_zone_highest_possible_pfn[ZONE_DMA32])
4451 start_pfn = arch_zone_highest_possible_pfn[ZONE_DMA32];
4452#endif
4453
4454#ifdef CONFIG_HIGHMEM
4455 /* Skip lowmem if ZONE_MOVABLE is highmem. */
4456 if (zone_movable_is_highmem() &&
4457 start_pfn < arch_zone_lowest_possible_pfn[ZONE_HIGHMEM])
4458 start_pfn = arch_zone_lowest_possible_pfn[ZONE_HIGHMEM];
4459#endif
4460
4461 if (start_pfn >= end_pfn)
4462 continue;
4463
4464 while (map_pos < movablemem_map.nr_map) {
4465 if (end_pfn <= movablemem_map.map[map_pos].start_pfn)
4466 break;
4467
4468 if (start_pfn >= movablemem_map.map[map_pos].end_pfn) {
4469 map_pos++;
4470 continue;
4471 }
4472
4473 /*
4474 * The start_pfn of ZONE_MOVABLE is either the minimum
4475 * pfn specified by movablemem_map, or 0, which means
4476 * the node has no ZONE_MOVABLE.
4477 */
4478 zone_movable_limit[nid] = max(start_pfn,
4479 movablemem_map.map[map_pos].start_pfn);
4480
4481 break;
4482 }
4483 }
4484}
4485
4389#else /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ 4486#else /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
4390static inline unsigned long __meminit zone_spanned_pages_in_node(int nid, 4487static inline unsigned long __meminit zone_spanned_pages_in_node(int nid,
4391 unsigned long zone_type, 4488 unsigned long zone_type,
@@ -4403,7 +4500,6 @@ static inline unsigned long __meminit zone_absent_pages_in_node(int nid,
4403 4500
4404 return zholes_size[zone_type]; 4501 return zholes_size[zone_type];
4405} 4502}
4406
4407#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ 4503#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
4408 4504
4409static void __meminit calculate_node_totalpages(struct pglist_data *pgdat, 4505static void __meminit calculate_node_totalpages(struct pglist_data *pgdat,
@@ -4435,10 +4531,11 @@ static void __meminit calculate_node_totalpages(struct pglist_data *pgdat,
4435 * round what is now in bits to nearest long in bits, then return it in 4531 * round what is now in bits to nearest long in bits, then return it in
4436 * bytes. 4532 * bytes.
4437 */ 4533 */
4438static unsigned long __init usemap_size(unsigned long zonesize) 4534static unsigned long __init usemap_size(unsigned long zone_start_pfn, unsigned long zonesize)
4439{ 4535{
4440 unsigned long usemapsize; 4536 unsigned long usemapsize;
4441 4537
4538 zonesize += zone_start_pfn & (pageblock_nr_pages-1);
4442 usemapsize = roundup(zonesize, pageblock_nr_pages); 4539 usemapsize = roundup(zonesize, pageblock_nr_pages);
4443 usemapsize = usemapsize >> pageblock_order; 4540 usemapsize = usemapsize >> pageblock_order;
4444 usemapsize *= NR_PAGEBLOCK_BITS; 4541 usemapsize *= NR_PAGEBLOCK_BITS;
@@ -4448,17 +4545,19 @@ static unsigned long __init usemap_size(unsigned long zonesize)
4448} 4545}
4449 4546
4450static void __init setup_usemap(struct pglist_data *pgdat, 4547static void __init setup_usemap(struct pglist_data *pgdat,
4451 struct zone *zone, unsigned long zonesize) 4548 struct zone *zone,
4549 unsigned long zone_start_pfn,
4550 unsigned long zonesize)
4452{ 4551{
4453 unsigned long usemapsize = usemap_size(zonesize); 4552 unsigned long usemapsize = usemap_size(zone_start_pfn, zonesize);
4454 zone->pageblock_flags = NULL; 4553 zone->pageblock_flags = NULL;
4455 if (usemapsize) 4554 if (usemapsize)
4456 zone->pageblock_flags = alloc_bootmem_node_nopanic(pgdat, 4555 zone->pageblock_flags = alloc_bootmem_node_nopanic(pgdat,
4457 usemapsize); 4556 usemapsize);
4458} 4557}
4459#else 4558#else
4460static inline void setup_usemap(struct pglist_data *pgdat, 4559static inline void setup_usemap(struct pglist_data *pgdat, struct zone *zone,
4461 struct zone *zone, unsigned long zonesize) {} 4560 unsigned long zone_start_pfn, unsigned long zonesize) {}
4462#endif /* CONFIG_SPARSEMEM */ 4561#endif /* CONFIG_SPARSEMEM */
4463 4562
4464#ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE 4563#ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
@@ -4584,7 +4683,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
4584 nr_all_pages += freesize; 4683 nr_all_pages += freesize;
4585 4684
4586 zone->spanned_pages = size; 4685 zone->spanned_pages = size;
4587 zone->present_pages = freesize; 4686 zone->present_pages = realsize;
4588 /* 4687 /*
4589 * Set an approximate value for lowmem here, it will be adjusted 4688 * Set an approximate value for lowmem here, it will be adjusted
4590 * when the bootmem allocator frees pages into the buddy system. 4689 * when the bootmem allocator frees pages into the buddy system.
@@ -4609,7 +4708,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
4609 continue; 4708 continue;
4610 4709
4611 set_pageblock_order(); 4710 set_pageblock_order();
4612 setup_usemap(pgdat, zone, size); 4711 setup_usemap(pgdat, zone, zone_start_pfn, size);
4613 ret = init_currently_empty_zone(zone, zone_start_pfn, 4712 ret = init_currently_empty_zone(zone, zone_start_pfn,
4614 size, MEMMAP_EARLY); 4713 size, MEMMAP_EARLY);
4615 BUG_ON(ret); 4714 BUG_ON(ret);
@@ -4636,7 +4735,7 @@ static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat)
4636 * for the buddy allocator to function correctly. 4735 * for the buddy allocator to function correctly.
4637 */ 4736 */
4638 start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1); 4737 start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1);
4639 end = pgdat->node_start_pfn + pgdat->node_spanned_pages; 4738 end = pgdat_end_pfn(pgdat);
4640 end = ALIGN(end, MAX_ORDER_NR_PAGES); 4739 end = ALIGN(end, MAX_ORDER_NR_PAGES);
4641 size = (end - start) * sizeof(struct page); 4740 size = (end - start) * sizeof(struct page);
4642 map = alloc_remap(pgdat->node_id, size); 4741 map = alloc_remap(pgdat->node_id, size);
@@ -4842,12 +4941,19 @@ static void __init find_zone_movable_pfns_for_nodes(void)
4842 required_kernelcore = max(required_kernelcore, corepages); 4941 required_kernelcore = max(required_kernelcore, corepages);
4843 } 4942 }
4844 4943
4845 /* If kernelcore was not specified, there is no ZONE_MOVABLE */ 4944 /*
4846 if (!required_kernelcore) 4945 * If neither kernelcore/movablecore nor movablemem_map is specified,
4946 * there is no ZONE_MOVABLE. But if movablemem_map is specified, the
4947 * start pfn of ZONE_MOVABLE has been stored in zone_movable_limit[].
4948 */
4949 if (!required_kernelcore) {
4950 if (movablemem_map.nr_map)
4951 memcpy(zone_movable_pfn, zone_movable_limit,
4952 sizeof(zone_movable_pfn));
4847 goto out; 4953 goto out;
4954 }
4848 4955
4849 /* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */ 4956 /* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */
4850 find_usable_zone_for_movable();
4851 usable_startpfn = arch_zone_lowest_possible_pfn[movable_zone]; 4957 usable_startpfn = arch_zone_lowest_possible_pfn[movable_zone];
4852 4958
4853restart: 4959restart:
@@ -4875,10 +4981,24 @@ restart:
4875 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) { 4981 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
4876 unsigned long size_pages; 4982 unsigned long size_pages;
4877 4983
4984 /*
4985 * Find more memory for kernelcore in
4986 * [zone_movable_pfn[nid], zone_movable_limit[nid]).
4987 */
4878 start_pfn = max(start_pfn, zone_movable_pfn[nid]); 4988 start_pfn = max(start_pfn, zone_movable_pfn[nid]);
4879 if (start_pfn >= end_pfn) 4989 if (start_pfn >= end_pfn)
4880 continue; 4990 continue;
4881 4991
4992 if (zone_movable_limit[nid]) {
4993 end_pfn = min(end_pfn, zone_movable_limit[nid]);
4994 /* No range left for kernelcore in this node */
4995 if (start_pfn >= end_pfn) {
4996 zone_movable_pfn[nid] =
4997 zone_movable_limit[nid];
4998 break;
4999 }
5000 }
5001
4882 /* Account for what is only usable for kernelcore */ 5002 /* Account for what is only usable for kernelcore */
4883 if (start_pfn < usable_startpfn) { 5003 if (start_pfn < usable_startpfn) {
4884 unsigned long kernel_pages; 5004 unsigned long kernel_pages;
@@ -4938,12 +5058,12 @@ restart:
4938 if (usable_nodes && required_kernelcore > usable_nodes) 5058 if (usable_nodes && required_kernelcore > usable_nodes)
4939 goto restart; 5059 goto restart;
4940 5060
5061out:
4941 /* Align start of ZONE_MOVABLE on all nids to MAX_ORDER_NR_PAGES */ 5062 /* Align start of ZONE_MOVABLE on all nids to MAX_ORDER_NR_PAGES */
4942 for (nid = 0; nid < MAX_NUMNODES; nid++) 5063 for (nid = 0; nid < MAX_NUMNODES; nid++)
4943 zone_movable_pfn[nid] = 5064 zone_movable_pfn[nid] =
4944 roundup(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES); 5065 roundup(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES);
4945 5066
4946out:
4947 /* restore the node_state */ 5067 /* restore the node_state */
4948 node_states[N_MEMORY] = saved_node_state; 5068 node_states[N_MEMORY] = saved_node_state;
4949} 5069}
@@ -5006,6 +5126,8 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)
5006 5126
5007 /* Find the PFNs that ZONE_MOVABLE begins at in each node */ 5127 /* Find the PFNs that ZONE_MOVABLE begins at in each node */
5008 memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn)); 5128 memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn));
5129 find_usable_zone_for_movable();
5130 sanitize_zone_movable_limit();
5009 find_zone_movable_pfns_for_nodes(); 5131 find_zone_movable_pfns_for_nodes();
5010 5132
5011 /* Print out the zone ranges */ 5133 /* Print out the zone ranges */
@@ -5089,6 +5211,181 @@ static int __init cmdline_parse_movablecore(char *p)
5089early_param("kernelcore", cmdline_parse_kernelcore); 5211early_param("kernelcore", cmdline_parse_kernelcore);
5090early_param("movablecore", cmdline_parse_movablecore); 5212early_param("movablecore", cmdline_parse_movablecore);
5091 5213
5214/**
5215 * movablemem_map_overlap() - Check if a range overlaps movablemem_map.map[].
5216 * @start_pfn: start pfn of the range to be checked
5217 * @end_pfn: end pfn of the range to be checked (exclusive)
5218 *
5219 * This function checks if a given memory range [start_pfn, end_pfn) overlaps
5220 * the movablemem_map.map[] array.
5221 *
 5222 * Return: index of the first overlapping element in movablemem_map.map[],
 5223 * or -1 if the range does not overlap any element.
5224 */
5225int __init movablemem_map_overlap(unsigned long start_pfn,
5226 unsigned long end_pfn)
5227{
5228 int overlap;
5229
5230 if (!movablemem_map.nr_map)
5231 return -1;
5232
5233 for (overlap = 0; overlap < movablemem_map.nr_map; overlap++)
5234 if (start_pfn < movablemem_map.map[overlap].end_pfn)
5235 break;
5236
5237 if (overlap == movablemem_map.nr_map ||
5238 end_pfn <= movablemem_map.map[overlap].start_pfn)
5239 return -1;
5240
5241 return overlap;
5242}
5243
5244/**
5245 * insert_movablemem_map - Insert a memory range in to movablemem_map.map.
5246 * @start_pfn: start pfn of the range
5247 * @end_pfn: end pfn of the range
5248 *
 5249 * This function also merges overlapping ranges, and keeps the array sorted
 5250 * by start_pfn in monotonically increasing order.
5251 */
5252void __init insert_movablemem_map(unsigned long start_pfn,
5253 unsigned long end_pfn)
5254{
5255 int pos, overlap;
5256
5257 /*
 5258 * pos ends up at the first overlapping range, or at the position
 5259 * where the new element should be inserted.
5260 */
5261 for (pos = 0; pos < movablemem_map.nr_map; pos++)
5262 if (start_pfn <= movablemem_map.map[pos].end_pfn)
5263 break;
5264
 5265 /* If there is no overlapping range, just insert the element. */
5266 if (pos == movablemem_map.nr_map ||
5267 end_pfn < movablemem_map.map[pos].start_pfn) {
5268 /*
 5269 * If pos is not at the end of the array, move all the
 5270 * remaining elements backward.
5271 */
5272 if (pos < movablemem_map.nr_map)
5273 memmove(&movablemem_map.map[pos+1],
5274 &movablemem_map.map[pos],
5275 sizeof(struct movablemem_entry) *
5276 (movablemem_map.nr_map - pos));
5277 movablemem_map.map[pos].start_pfn = start_pfn;
5278 movablemem_map.map[pos].end_pfn = end_pfn;
5279 movablemem_map.nr_map++;
5280 return;
5281 }
5282
5283 /* overlap will be at the last overlapped range */
5284 for (overlap = pos + 1; overlap < movablemem_map.nr_map; overlap++)
5285 if (end_pfn < movablemem_map.map[overlap].start_pfn)
5286 break;
5287
5288 /*
 5289 * If more ranges overlap, we need to merge them and move
 5290 * the remaining elements forward.
5291 */
5292 overlap--;
5293 movablemem_map.map[pos].start_pfn = min(start_pfn,
5294 movablemem_map.map[pos].start_pfn);
5295 movablemem_map.map[pos].end_pfn = max(end_pfn,
5296 movablemem_map.map[overlap].end_pfn);
5297
5298 if (pos != overlap && overlap + 1 != movablemem_map.nr_map)
5299 memmove(&movablemem_map.map[pos+1],
5300 &movablemem_map.map[overlap+1],
5301 sizeof(struct movablemem_entry) *
5302 (movablemem_map.nr_map - overlap - 1));
5303
5304 movablemem_map.nr_map -= overlap - pos;
5305}
5306
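
Taken together, movablemem_map_overlap() and insert_movablemem_map() maintain a small array of pfn ranges that is kept sorted and merged. The standalone model below mirrors that behaviour with a fixed-size array; it is a simplified illustration, not the kernel code:

#include <stdio.h>
#include <string.h>

#define MAX_RANGES	32

struct range {				/* [start, end) in pfns */
	unsigned long start;
	unsigned long end;
};

static struct range map[MAX_RANGES];
static int nr_map;

/* Insert [start, end), merging with anything it touches, keeping order. */
static void insert_range(unsigned long start, unsigned long end)
{
	int pos, last;

	/* First entry whose end is not strictly below the new range. */
	for (pos = 0; pos < nr_map; pos++)
		if (start <= map[pos].end)
			break;

	if (pos == nr_map || end < map[pos].start) {
		/* No overlap: shift the tail and insert a fresh entry. */
		memmove(&map[pos + 1], &map[pos],
			(nr_map - pos) * sizeof(map[0]));
		map[pos].start = start;
		map[pos].end = end;
		nr_map++;
		return;
	}

	/* Find the last entry the new range reaches, then merge the run. */
	for (last = pos; last + 1 < nr_map; last++)
		if (end < map[last + 1].start)
			break;

	map[pos].start = start < map[pos].start ? start : map[pos].start;
	map[pos].end = end > map[last].end ? end : map[last].end;
	memmove(&map[pos + 1], &map[last + 1],
		(nr_map - last - 1) * sizeof(map[0]));
	nr_map -= last - pos;
}

/* Index of the first entry overlapping [start, end), or -1 if none. */
static int overlap(unsigned long start, unsigned long end)
{
	int i;

	for (i = 0; i < nr_map; i++)
		if (start < map[i].end)
			return end > map[i].start ? i : -1;
	return -1;
}

int main(void)
{
	int i;

	insert_range(0x1000, 0x2000);
	insert_range(0x3000, 0x4000);
	insert_range(0x1800, 0x3800);	/* bridges and merges both entries */

	for (i = 0; i < nr_map; i++)
		printf("[%#lx, %#lx)\n", map[i].start, map[i].end);
	printf("overlap(0x2000, 0x2100) = %d\n", overlap(0x2000, 0x2100));
	printf("overlap(0x5000, 0x6000) = %d\n", overlap(0x5000, 0x6000));
	return 0;
}
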
5307/**
5308 * movablemem_map_add_region - Add a memory range into movablemem_map.
5309 * @start: physical start address of range
 5310 * @size: size of the range in bytes
5311 *
 5312 * This function transforms the physical addresses into pfns, and then adds
 5313 * the range to movablemem_map by calling insert_movablemem_map().
5314 */
5315static void __init movablemem_map_add_region(u64 start, u64 size)
5316{
5317 unsigned long start_pfn, end_pfn;
5318
5319 /* In case size == 0 or start + size overflows */
5320 if (start + size <= start)
5321 return;
5322
5323 if (movablemem_map.nr_map >= ARRAY_SIZE(movablemem_map.map)) {
5324 pr_err("movablemem_map: too many entries;"
5325 " ignoring [mem %#010llx-%#010llx]\n",
5326 (unsigned long long) start,
5327 (unsigned long long) (start + size - 1));
5328 return;
5329 }
5330
5331 start_pfn = PFN_DOWN(start);
5332 end_pfn = PFN_UP(start + size);
5333 insert_movablemem_map(start_pfn, end_pfn);
5334}
5335
5336/*
5337 * cmdline_parse_movablemem_map - Parse boot option movablemem_map.
5338 * @p: The boot option of the following format:
5339 * movablemem_map=nn[KMG]@ss[KMG]
5340 *
5341 * This option sets the memory range [ss, ss+nn) to be used as movable memory.
5342 *
5343 * Return: 0 on success or -EINVAL on failure.
5344 */
5345static int __init cmdline_parse_movablemem_map(char *p)
5346{
5347 char *oldp;
5348 u64 start_at, mem_size;
5349
5350 if (!p)
5351 goto err;
5352
5353 if (!strcmp(p, "acpi"))
5354 movablemem_map.acpi = true;
5355
5356 /*
 5357 * If the user decides to use info from the BIOS, all the other
 5358 * user-specified ranges will be ignored.
5359 */
5360 if (movablemem_map.acpi) {
5361 if (movablemem_map.nr_map) {
5362 memset(movablemem_map.map, 0,
5363 sizeof(struct movablemem_entry)
5364 * movablemem_map.nr_map);
5365 movablemem_map.nr_map = 0;
5366 }
5367 return 0;
5368 }
5369
5370 oldp = p;
5371 mem_size = memparse(p, &p);
5372 if (p == oldp)
5373 goto err;
5374
5375 if (*p == '@') {
5376 oldp = ++p;
5377 start_at = memparse(p, &p);
5378 if (p == oldp || *p != '\0')
5379 goto err;
5380
5381 movablemem_map_add_region(start_at, mem_size);
5382 return 0;
5383 }
5384err:
5385 return -EINVAL;
5386}
5387early_param("movablemem_map", cmdline_parse_movablemem_map);
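
As a worked example of the option format parsed above, movablemem_map=2G@4G asks for [4G, 6G) to be usable only as ZONE_MOVABLE; with 4 KiB pages, the PFN_DOWN/PFN_UP conversion in movablemem_map_add_region() yields the range computed below (PAGE_SHIFT=12 is an assumption for the example):

#include <stdio.h>

#define PAGE_SHIFT	12ULL		/* assuming 4 KiB pages */

int main(void)
{
	unsigned long long start = 4ULL << 30;	/* ...@4G */
	unsigned long long size  = 2ULL << 30;	/* 2G@... */

	/* PFN_DOWN(start) and PFN_UP(start + size), written out by hand. */
	unsigned long long start_pfn = start >> PAGE_SHIFT;
	unsigned long long end_pfn =
		(start + size + (1ULL << PAGE_SHIFT) - 1) >> PAGE_SHIFT;

	printf("movablemem_map=2G@4G -> pfns [%llu, %llu)\n",
	       start_pfn, end_pfn);
	return 0;
}
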
5388
5092#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ 5389#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
5093 5390
5094/** 5391/**
@@ -5171,8 +5468,8 @@ static void calculate_totalreserve_pages(void)
5171 /* we treat the high watermark as reserved pages. */ 5468 /* we treat the high watermark as reserved pages. */
5172 max += high_wmark_pages(zone); 5469 max += high_wmark_pages(zone);
5173 5470
5174 if (max > zone->present_pages) 5471 if (max > zone->managed_pages)
5175 max = zone->present_pages; 5472 max = zone->managed_pages;
5176 reserve_pages += max; 5473 reserve_pages += max;
5177 /* 5474 /*
5178 * Lowmem reserves are not available to 5475 * Lowmem reserves are not available to
@@ -5204,7 +5501,7 @@ static void setup_per_zone_lowmem_reserve(void)
5204 for_each_online_pgdat(pgdat) { 5501 for_each_online_pgdat(pgdat) {
5205 for (j = 0; j < MAX_NR_ZONES; j++) { 5502 for (j = 0; j < MAX_NR_ZONES; j++) {
5206 struct zone *zone = pgdat->node_zones + j; 5503 struct zone *zone = pgdat->node_zones + j;
5207 unsigned long present_pages = zone->present_pages; 5504 unsigned long managed_pages = zone->managed_pages;
5208 5505
5209 zone->lowmem_reserve[j] = 0; 5506 zone->lowmem_reserve[j] = 0;
5210 5507
@@ -5218,9 +5515,9 @@ static void setup_per_zone_lowmem_reserve(void)
5218 sysctl_lowmem_reserve_ratio[idx] = 1; 5515 sysctl_lowmem_reserve_ratio[idx] = 1;
5219 5516
5220 lower_zone = pgdat->node_zones + idx; 5517 lower_zone = pgdat->node_zones + idx;
5221 lower_zone->lowmem_reserve[j] = present_pages / 5518 lower_zone->lowmem_reserve[j] = managed_pages /
5222 sysctl_lowmem_reserve_ratio[idx]; 5519 sysctl_lowmem_reserve_ratio[idx];
5223 present_pages += lower_zone->present_pages; 5520 managed_pages += lower_zone->managed_pages;
5224 } 5521 }
5225 } 5522 }
5226 } 5523 }
@@ -5239,14 +5536,14 @@ static void __setup_per_zone_wmarks(void)
5239 /* Calculate total number of !ZONE_HIGHMEM pages */ 5536 /* Calculate total number of !ZONE_HIGHMEM pages */
5240 for_each_zone(zone) { 5537 for_each_zone(zone) {
5241 if (!is_highmem(zone)) 5538 if (!is_highmem(zone))
5242 lowmem_pages += zone->present_pages; 5539 lowmem_pages += zone->managed_pages;
5243 } 5540 }
5244 5541
5245 for_each_zone(zone) { 5542 for_each_zone(zone) {
5246 u64 tmp; 5543 u64 tmp;
5247 5544
5248 spin_lock_irqsave(&zone->lock, flags); 5545 spin_lock_irqsave(&zone->lock, flags);
5249 tmp = (u64)pages_min * zone->present_pages; 5546 tmp = (u64)pages_min * zone->managed_pages;
5250 do_div(tmp, lowmem_pages); 5547 do_div(tmp, lowmem_pages);
5251 if (is_highmem(zone)) { 5548 if (is_highmem(zone)) {
5252 /* 5549 /*
@@ -5258,13 +5555,10 @@ static void __setup_per_zone_wmarks(void)
5258 * deltas controls asynch page reclaim, and so should 5555 * deltas controls asynch page reclaim, and so should
5259 * not be capped for highmem. 5556 * not be capped for highmem.
5260 */ 5557 */
5261 int min_pages; 5558 unsigned long min_pages;
5262 5559
5263 min_pages = zone->present_pages / 1024; 5560 min_pages = zone->managed_pages / 1024;
5264 if (min_pages < SWAP_CLUSTER_MAX) 5561 min_pages = clamp(min_pages, SWAP_CLUSTER_MAX, 128UL);
5265 min_pages = SWAP_CLUSTER_MAX;
5266 if (min_pages > 128)
5267 min_pages = 128;
5268 zone->watermark[WMARK_MIN] = min_pages; 5562 zone->watermark[WMARK_MIN] = min_pages;
5269 } else { 5563 } else {
5270 /* 5564 /*
@@ -5325,7 +5619,7 @@ static void __meminit calculate_zone_inactive_ratio(struct zone *zone)
5325 unsigned int gb, ratio; 5619 unsigned int gb, ratio;
5326 5620
5327 /* Zone size in gigabytes */ 5621 /* Zone size in gigabytes */
5328 gb = zone->present_pages >> (30 - PAGE_SHIFT); 5622 gb = zone->managed_pages >> (30 - PAGE_SHIFT);
5329 if (gb) 5623 if (gb)
5330 ratio = int_sqrt(10 * gb); 5624 ratio = int_sqrt(10 * gb);
5331 else 5625 else
@@ -5411,7 +5705,7 @@ int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write,
5411 return rc; 5705 return rc;
5412 5706
5413 for_each_zone(zone) 5707 for_each_zone(zone)
5414 zone->min_unmapped_pages = (zone->present_pages * 5708 zone->min_unmapped_pages = (zone->managed_pages *
5415 sysctl_min_unmapped_ratio) / 100; 5709 sysctl_min_unmapped_ratio) / 100;
5416 return 0; 5710 return 0;
5417} 5711}
@@ -5427,7 +5721,7 @@ int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write,
5427 return rc; 5721 return rc;
5428 5722
5429 for_each_zone(zone) 5723 for_each_zone(zone)
5430 zone->min_slab_pages = (zone->present_pages * 5724 zone->min_slab_pages = (zone->managed_pages *
5431 sysctl_min_slab_ratio) / 100; 5725 sysctl_min_slab_ratio) / 100;
5432 return 0; 5726 return 0;
5433} 5727}
@@ -5469,7 +5763,7 @@ int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write,
5469 for_each_populated_zone(zone) { 5763 for_each_populated_zone(zone) {
5470 for_each_possible_cpu(cpu) { 5764 for_each_possible_cpu(cpu) {
5471 unsigned long high; 5765 unsigned long high;
5472 high = zone->present_pages / percpu_pagelist_fraction; 5766 high = zone->managed_pages / percpu_pagelist_fraction;
5473 setup_pagelist_highmark( 5767 setup_pagelist_highmark(
5474 per_cpu_ptr(zone->pageset, cpu), high); 5768 per_cpu_ptr(zone->pageset, cpu), high);
5475 } 5769 }
@@ -5604,7 +5898,7 @@ static inline int pfn_to_bitidx(struct zone *zone, unsigned long pfn)
5604 pfn &= (PAGES_PER_SECTION-1); 5898 pfn &= (PAGES_PER_SECTION-1);
5605 return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS; 5899 return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
5606#else 5900#else
5607 pfn = pfn - zone->zone_start_pfn; 5901 pfn = pfn - round_down(zone->zone_start_pfn, pageblock_nr_pages);
5608 return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS; 5902 return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
5609#endif /* CONFIG_SPARSEMEM */ 5903#endif /* CONFIG_SPARSEMEM */
5610} 5904}
@@ -5656,8 +5950,7 @@ void set_pageblock_flags_group(struct page *page, unsigned long flags,
5656 pfn = page_to_pfn(page); 5950 pfn = page_to_pfn(page);
5657 bitmap = get_pageblock_bitmap(zone, pfn); 5951 bitmap = get_pageblock_bitmap(zone, pfn);
5658 bitidx = pfn_to_bitidx(zone, pfn); 5952 bitidx = pfn_to_bitidx(zone, pfn);
5659 VM_BUG_ON(pfn < zone->zone_start_pfn); 5953 VM_BUG_ON(!zone_spans_pfn(zone, pfn));
5660 VM_BUG_ON(pfn >= zone->zone_start_pfn + zone->spanned_pages);
5661 5954
5662 for (; start_bitidx <= end_bitidx; start_bitidx++, value <<= 1) 5955 for (; start_bitidx <= end_bitidx; start_bitidx++, value <<= 1)
5663 if (flags & value) 5956 if (flags & value)
@@ -5755,8 +6048,7 @@ bool is_pageblock_removable_nolock(struct page *page)
5755 6048
5756 zone = page_zone(page); 6049 zone = page_zone(page);
5757 pfn = page_to_pfn(page); 6050 pfn = page_to_pfn(page);
5758 if (zone->zone_start_pfn > pfn || 6051 if (!zone_spans_pfn(zone, pfn))
5759 zone->zone_start_pfn + zone->spanned_pages <= pfn)
5760 return false; 6052 return false;
5761 6053
5762 return !has_unmovable_pages(zone, page, 0, true); 6054 return !has_unmovable_pages(zone, page, 0, true);
@@ -5812,14 +6104,14 @@ static int __alloc_contig_migrate_range(struct compact_control *cc,
5812 &cc->migratepages); 6104 &cc->migratepages);
5813 cc->nr_migratepages -= nr_reclaimed; 6105 cc->nr_migratepages -= nr_reclaimed;
5814 6106
5815 ret = migrate_pages(&cc->migratepages, 6107 ret = migrate_pages(&cc->migratepages, alloc_migrate_target,
5816 alloc_migrate_target, 6108 0, MIGRATE_SYNC, MR_CMA);
5817 0, false, MIGRATE_SYNC,
5818 MR_CMA);
5819 } 6109 }
5820 6110 if (ret < 0) {
5821 putback_movable_pages(&cc->migratepages); 6111 putback_movable_pages(&cc->migratepages);
5822 return ret > 0 ? 0 : ret; 6112 return ret;
6113 }
6114 return 0;
5823} 6115}
5824 6116
5825/** 6117/**
diff --git a/mm/rmap.c b/mm/rmap.c
index 2c78f8cadc95..807c96bf0dc6 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -105,7 +105,7 @@ static inline void anon_vma_free(struct anon_vma *anon_vma)
105 */ 105 */
106 if (rwsem_is_locked(&anon_vma->root->rwsem)) { 106 if (rwsem_is_locked(&anon_vma->root->rwsem)) {
107 anon_vma_lock_write(anon_vma); 107 anon_vma_lock_write(anon_vma);
108 anon_vma_unlock(anon_vma); 108 anon_vma_unlock_write(anon_vma);
109 } 109 }
110 110
111 kmem_cache_free(anon_vma_cachep, anon_vma); 111 kmem_cache_free(anon_vma_cachep, anon_vma);
@@ -191,7 +191,7 @@ int anon_vma_prepare(struct vm_area_struct *vma)
191 avc = NULL; 191 avc = NULL;
192 } 192 }
193 spin_unlock(&mm->page_table_lock); 193 spin_unlock(&mm->page_table_lock);
194 anon_vma_unlock(anon_vma); 194 anon_vma_unlock_write(anon_vma);
195 195
196 if (unlikely(allocated)) 196 if (unlikely(allocated))
197 put_anon_vma(allocated); 197 put_anon_vma(allocated);
@@ -308,7 +308,7 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
308 vma->anon_vma = anon_vma; 308 vma->anon_vma = anon_vma;
309 anon_vma_lock_write(anon_vma); 309 anon_vma_lock_write(anon_vma);
310 anon_vma_chain_link(vma, avc, anon_vma); 310 anon_vma_chain_link(vma, avc, anon_vma);
311 anon_vma_unlock(anon_vma); 311 anon_vma_unlock_write(anon_vma);
312 312
313 return 0; 313 return 0;
314 314
@@ -1126,7 +1126,6 @@ void page_add_file_rmap(struct page *page)
1126 */ 1126 */
1127void page_remove_rmap(struct page *page) 1127void page_remove_rmap(struct page *page)
1128{ 1128{
1129 struct address_space *mapping = page_mapping(page);
1130 bool anon = PageAnon(page); 1129 bool anon = PageAnon(page);
1131 bool locked; 1130 bool locked;
1132 unsigned long flags; 1131 unsigned long flags;
@@ -1144,29 +1143,6 @@ void page_remove_rmap(struct page *page)
1144 goto out; 1143 goto out;
1145 1144
1146 /* 1145 /*
1147 * Now that the last pte has gone, s390 must transfer dirty
1148 * flag from storage key to struct page. We can usually skip
1149 * this if the page is anon, so about to be freed; but perhaps
1150 * not if it's in swapcache - there might be another pte slot
1151 * containing the swap entry, but page not yet written to swap.
1152 *
1153 * And we can skip it on file pages, so long as the filesystem
1154 * participates in dirty tracking (note that this is not only an
1155 * optimization but also solves problems caused by dirty flag in
1156 * storage key getting set by a write from inside kernel); but need to
1157 * catch shm and tmpfs and ramfs pages which have been modified since
1158 * creation by read fault.
1159 *
1160 * Note that mapping must be decided above, before decrementing
1161 * mapcount (which luckily provides a barrier): once page is unmapped,
1162 * it could be truncated and page->mapping reset to NULL at any moment.
1163 * Note also that we are relying on page_mapping(page) to set mapping
1164 * to &swapper_space when PageSwapCache(page).
1165 */
1166 if (mapping && !mapping_cap_account_dirty(mapping) &&
1167 page_test_and_clear_dirty(page_to_pfn(page), 1))
1168 set_page_dirty(page);
1169 /*
1170 * Hugepages are not counted in NR_ANON_PAGES nor NR_FILE_MAPPED 1146 * Hugepages are not counted in NR_ANON_PAGES nor NR_FILE_MAPPED
1171 * and not charged by memcg for now. 1147 * and not charged by memcg for now.
1172 */ 1148 */
diff --git a/mm/shmem.c b/mm/shmem.c
index 5dd56f6efdbd..ed2befb4952e 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -335,19 +335,19 @@ static unsigned shmem_find_get_pages_and_swap(struct address_space *mapping,
335 pgoff_t start, unsigned int nr_pages, 335 pgoff_t start, unsigned int nr_pages,
336 struct page **pages, pgoff_t *indices) 336 struct page **pages, pgoff_t *indices)
337{ 337{
338 unsigned int i; 338 void **slot;
339 unsigned int ret; 339 unsigned int ret = 0;
340 unsigned int nr_found; 340 struct radix_tree_iter iter;
341
342 if (!nr_pages)
343 return 0;
341 344
342 rcu_read_lock(); 345 rcu_read_lock();
343restart: 346restart:
344 nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree, 347 radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) {
345 (void ***)pages, indices, start, nr_pages);
346 ret = 0;
347 for (i = 0; i < nr_found; i++) {
348 struct page *page; 348 struct page *page;
349repeat: 349repeat:
350 page = radix_tree_deref_slot((void **)pages[i]); 350 page = radix_tree_deref_slot(slot);
351 if (unlikely(!page)) 351 if (unlikely(!page))
352 continue; 352 continue;
353 if (radix_tree_exception(page)) { 353 if (radix_tree_exception(page)) {
@@ -364,17 +364,16 @@ repeat:
364 goto repeat; 364 goto repeat;
365 365
366 /* Has the page moved? */ 366 /* Has the page moved? */
367 if (unlikely(page != *((void **)pages[i]))) { 367 if (unlikely(page != *slot)) {
368 page_cache_release(page); 368 page_cache_release(page);
369 goto repeat; 369 goto repeat;
370 } 370 }
371export: 371export:
372 indices[ret] = indices[i]; 372 indices[ret] = iter.index;
373 pages[ret] = page; 373 pages[ret] = page;
374 ret++; 374 if (++ret == nr_pages)
375 break;
375 } 376 }
376 if (unlikely(!ret && nr_found))
377 goto restart;
378 rcu_read_unlock(); 377 rcu_read_unlock();
379 return ret; 378 return ret;
380} 379}
@@ -1295,7 +1294,7 @@ unlock:
1295 1294
1296static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf) 1295static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
1297{ 1296{
1298 struct inode *inode = vma->vm_file->f_path.dentry->d_inode; 1297 struct inode *inode = file_inode(vma->vm_file);
1299 int error; 1298 int error;
1300 int ret = VM_FAULT_LOCKED; 1299 int ret = VM_FAULT_LOCKED;
1301 1300
@@ -1313,14 +1312,14 @@ static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
1313#ifdef CONFIG_NUMA 1312#ifdef CONFIG_NUMA
1314static int shmem_set_policy(struct vm_area_struct *vma, struct mempolicy *mpol) 1313static int shmem_set_policy(struct vm_area_struct *vma, struct mempolicy *mpol)
1315{ 1314{
1316 struct inode *inode = vma->vm_file->f_path.dentry->d_inode; 1315 struct inode *inode = file_inode(vma->vm_file);
1317 return mpol_set_shared_policy(&SHMEM_I(inode)->policy, vma, mpol); 1316 return mpol_set_shared_policy(&SHMEM_I(inode)->policy, vma, mpol);
1318} 1317}
1319 1318
1320static struct mempolicy *shmem_get_policy(struct vm_area_struct *vma, 1319static struct mempolicy *shmem_get_policy(struct vm_area_struct *vma,
1321 unsigned long addr) 1320 unsigned long addr)
1322{ 1321{
1323 struct inode *inode = vma->vm_file->f_path.dentry->d_inode; 1322 struct inode *inode = file_inode(vma->vm_file);
1324 pgoff_t index; 1323 pgoff_t index;
1325 1324
1326 index = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; 1325 index = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
@@ -1330,7 +1329,7 @@ static struct mempolicy *shmem_get_policy(struct vm_area_struct *vma,
1330 1329
1331int shmem_lock(struct file *file, int lock, struct user_struct *user) 1330int shmem_lock(struct file *file, int lock, struct user_struct *user)
1332{ 1331{
1333 struct inode *inode = file->f_path.dentry->d_inode; 1332 struct inode *inode = file_inode(file);
1334 struct shmem_inode_info *info = SHMEM_I(inode); 1333 struct shmem_inode_info *info = SHMEM_I(inode);
1335 int retval = -ENOMEM; 1334 int retval = -ENOMEM;
1336 1335
@@ -1465,7 +1464,7 @@ shmem_write_end(struct file *file, struct address_space *mapping,
1465 1464
1466static void do_shmem_file_read(struct file *filp, loff_t *ppos, read_descriptor_t *desc, read_actor_t actor) 1465static void do_shmem_file_read(struct file *filp, loff_t *ppos, read_descriptor_t *desc, read_actor_t actor)
1467{ 1466{
1468 struct inode *inode = filp->f_path.dentry->d_inode; 1467 struct inode *inode = file_inode(filp);
1469 struct address_space *mapping = inode->i_mapping; 1468 struct address_space *mapping = inode->i_mapping;
1470 pgoff_t index; 1469 pgoff_t index;
1471 unsigned long offset; 1470 unsigned long offset;
@@ -1808,7 +1807,7 @@ static loff_t shmem_file_llseek(struct file *file, loff_t offset, int whence)
1808static long shmem_fallocate(struct file *file, int mode, loff_t offset, 1807static long shmem_fallocate(struct file *file, int mode, loff_t offset,
1809 loff_t len) 1808 loff_t len)
1810{ 1809{
1811 struct inode *inode = file->f_path.dentry->d_inode; 1810 struct inode *inode = file_inode(file);
1812 struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); 1811 struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
1813 struct shmem_falloc shmem_falloc; 1812 struct shmem_falloc shmem_falloc;
1814 pgoff_t start, index, end; 1813 pgoff_t start, index, end;
@@ -2351,7 +2350,7 @@ static int shmem_encode_fh(struct inode *inode, __u32 *fh, int *len,
2351{ 2350{
2352 if (*len < 3) { 2351 if (*len < 3) {
2353 *len = 3; 2352 *len = 3;
2354 return 255; 2353 return FILEID_INVALID;
2355 } 2354 }
2356 2355
2357 if (inode_unhashed(inode)) { 2356 if (inode_unhashed(inode)) {
@@ -2386,6 +2385,7 @@ static int shmem_parse_options(char *options, struct shmem_sb_info *sbinfo,
2386 bool remount) 2385 bool remount)
2387{ 2386{
2388 char *this_char, *value, *rest; 2387 char *this_char, *value, *rest;
2388 struct mempolicy *mpol = NULL;
2389 uid_t uid; 2389 uid_t uid;
2390 gid_t gid; 2390 gid_t gid;
2391 2391
@@ -2414,7 +2414,7 @@ static int shmem_parse_options(char *options, struct shmem_sb_info *sbinfo,
2414 printk(KERN_ERR 2414 printk(KERN_ERR
2415 "tmpfs: No value for mount option '%s'\n", 2415 "tmpfs: No value for mount option '%s'\n",
2416 this_char); 2416 this_char);
2417 return 1; 2417 goto error;
2418 } 2418 }
2419 2419
2420 if (!strcmp(this_char,"size")) { 2420 if (!strcmp(this_char,"size")) {
@@ -2463,19 +2463,24 @@ static int shmem_parse_options(char *options, struct shmem_sb_info *sbinfo,
2463 if (!gid_valid(sbinfo->gid)) 2463 if (!gid_valid(sbinfo->gid))
2464 goto bad_val; 2464 goto bad_val;
2465 } else if (!strcmp(this_char,"mpol")) { 2465 } else if (!strcmp(this_char,"mpol")) {
2466 if (mpol_parse_str(value, &sbinfo->mpol)) 2466 mpol_put(mpol);
2467 mpol = NULL;
2468 if (mpol_parse_str(value, &mpol))
2467 goto bad_val; 2469 goto bad_val;
2468 } else { 2470 } else {
2469 printk(KERN_ERR "tmpfs: Bad mount option %s\n", 2471 printk(KERN_ERR "tmpfs: Bad mount option %s\n",
2470 this_char); 2472 this_char);
2471 return 1; 2473 goto error;
2472 } 2474 }
2473 } 2475 }
2476 sbinfo->mpol = mpol;
2474 return 0; 2477 return 0;
2475 2478
2476bad_val: 2479bad_val:
2477 printk(KERN_ERR "tmpfs: Bad value '%s' for mount option '%s'\n", 2480 printk(KERN_ERR "tmpfs: Bad value '%s' for mount option '%s'\n",
2478 value, this_char); 2481 value, this_char);
2482error:
2483 mpol_put(mpol);
2479 return 1; 2484 return 1;
2480 2485
2481} 2486}
@@ -2487,6 +2492,7 @@ static int shmem_remount_fs(struct super_block *sb, int *flags, char *data)
2487 unsigned long inodes; 2492 unsigned long inodes;
2488 int error = -EINVAL; 2493 int error = -EINVAL;
2489 2494
2495 config.mpol = NULL;
2490 if (shmem_parse_options(data, &config, true)) 2496 if (shmem_parse_options(data, &config, true))
2491 return error; 2497 return error;
2492 2498
@@ -2511,8 +2517,13 @@ static int shmem_remount_fs(struct super_block *sb, int *flags, char *data)
2511 sbinfo->max_inodes = config.max_inodes; 2517 sbinfo->max_inodes = config.max_inodes;
2512 sbinfo->free_inodes = config.max_inodes - inodes; 2518 sbinfo->free_inodes = config.max_inodes - inodes;
2513 2519
2514 mpol_put(sbinfo->mpol); 2520 /*
2515 sbinfo->mpol = config.mpol; /* transfers initial ref */ 2521 * Preserve previous mempolicy unless mpol remount option was specified.
2522 */
2523 if (config.mpol) {
2524 mpol_put(sbinfo->mpol);
2525 sbinfo->mpol = config.mpol; /* transfers initial ref */
2526 }
2516out: 2527out:
2517 spin_unlock(&sbinfo->stat_lock); 2528 spin_unlock(&sbinfo->stat_lock);
2518 return error; 2529 return error;
@@ -2545,6 +2556,7 @@ static void shmem_put_super(struct super_block *sb)
2545 struct shmem_sb_info *sbinfo = SHMEM_SB(sb); 2556 struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
2546 2557
2547 percpu_counter_destroy(&sbinfo->used_blocks); 2558 percpu_counter_destroy(&sbinfo->used_blocks);
2559 mpol_put(sbinfo->mpol);
2548 kfree(sbinfo); 2560 kfree(sbinfo);
2549 sb->s_fs_info = NULL; 2561 sb->s_fs_info = NULL;
2550} 2562}
@@ -2766,6 +2778,7 @@ static struct file_system_type shmem_fs_type = {
2766 .name = "tmpfs", 2778 .name = "tmpfs",
2767 .mount = shmem_mount, 2779 .mount = shmem_mount,
2768 .kill_sb = kill_litter_super, 2780 .kill_sb = kill_litter_super,
2781 .fs_flags = FS_USERNS_MOUNT,
2769}; 2782};
2770 2783
2771int __init shmem_init(void) 2784int __init shmem_init(void)
@@ -2823,6 +2836,7 @@ static struct file_system_type shmem_fs_type = {
2823 .name = "tmpfs", 2836 .name = "tmpfs",
2824 .mount = ramfs_mount, 2837 .mount = ramfs_mount,
2825 .kill_sb = kill_litter_super, 2838 .kill_sb = kill_litter_super,
2839 .fs_flags = FS_USERNS_MOUNT,
2826}; 2840};
2827 2841
2828int __init shmem_init(void) 2842int __init shmem_init(void)
@@ -2865,6 +2879,16 @@ EXPORT_SYMBOL_GPL(shmem_truncate_range);
2865 2879
2866/* common code */ 2880/* common code */
2867 2881
2882static char *shmem_dname(struct dentry *dentry, char *buffer, int buflen)
2883{
2884 return dynamic_dname(dentry, buffer, buflen, "/%s (deleted)",
2885 dentry->d_name.name);
2886}
2887
2888static struct dentry_operations anon_ops = {
2889 .d_dname = shmem_dname
2890};
2891
2868/** 2892/**
2869 * shmem_file_setup - get an unlinked file living in tmpfs 2893 * shmem_file_setup - get an unlinked file living in tmpfs
2870 * @name: name for dentry (to be seen in /proc/<pid>/maps 2894 * @name: name for dentry (to be seen in /proc/<pid>/maps
@@ -2873,15 +2897,14 @@ EXPORT_SYMBOL_GPL(shmem_truncate_range);
2873 */ 2897 */
2874struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags) 2898struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags)
2875{ 2899{
2876 int error; 2900 struct file *res;
2877 struct file *file;
2878 struct inode *inode; 2901 struct inode *inode;
2879 struct path path; 2902 struct path path;
2880 struct dentry *root; 2903 struct super_block *sb;
2881 struct qstr this; 2904 struct qstr this;
2882 2905
2883 if (IS_ERR(shm_mnt)) 2906 if (IS_ERR(shm_mnt))
2884 return (void *)shm_mnt; 2907 return ERR_CAST(shm_mnt);
2885 2908
2886 if (size < 0 || size > MAX_LFS_FILESIZE) 2909 if (size < 0 || size > MAX_LFS_FILESIZE)
2887 return ERR_PTR(-EINVAL); 2910 return ERR_PTR(-EINVAL);
@@ -2889,18 +2912,19 @@ struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags
2889 if (shmem_acct_size(flags, size)) 2912 if (shmem_acct_size(flags, size))
2890 return ERR_PTR(-ENOMEM); 2913 return ERR_PTR(-ENOMEM);
2891 2914
2892 error = -ENOMEM; 2915 res = ERR_PTR(-ENOMEM);
2893 this.name = name; 2916 this.name = name;
2894 this.len = strlen(name); 2917 this.len = strlen(name);
2895 this.hash = 0; /* will go */ 2918 this.hash = 0; /* will go */
2896 root = shm_mnt->mnt_root; 2919 sb = shm_mnt->mnt_sb;
2897 path.dentry = d_alloc(root, &this); 2920 path.dentry = d_alloc_pseudo(sb, &this);
2898 if (!path.dentry) 2921 if (!path.dentry)
2899 goto put_memory; 2922 goto put_memory;
2923 d_set_d_op(path.dentry, &anon_ops);
2900 path.mnt = mntget(shm_mnt); 2924 path.mnt = mntget(shm_mnt);
2901 2925
2902 error = -ENOSPC; 2926 res = ERR_PTR(-ENOSPC);
2903 inode = shmem_get_inode(root->d_sb, NULL, S_IFREG | S_IRWXUGO, 0, flags); 2927 inode = shmem_get_inode(sb, NULL, S_IFREG | S_IRWXUGO, 0, flags);
2904 if (!inode) 2928 if (!inode)
2905 goto put_dentry; 2929 goto put_dentry;
2906 2930
@@ -2909,23 +2933,23 @@ struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags
2909 clear_nlink(inode); /* It is unlinked */ 2933 clear_nlink(inode); /* It is unlinked */
2910#ifndef CONFIG_MMU 2934#ifndef CONFIG_MMU
2911 error = ramfs_nommu_expand_for_mapping(inode, size); 2935 error = ramfs_nommu_expand_for_mapping(inode, size);
2936 res = ERR_PTR(error);
2912 if (error) 2937 if (error)
2913 goto put_dentry; 2938 goto put_dentry;
2914#endif 2939#endif
2915 2940
2916 error = -ENFILE; 2941 res = alloc_file(&path, FMODE_WRITE | FMODE_READ,
2917 file = alloc_file(&path, FMODE_WRITE | FMODE_READ,
2918 &shmem_file_operations); 2942 &shmem_file_operations);
2919 if (!file) 2943 if (IS_ERR(res))
2920 goto put_dentry; 2944 goto put_dentry;
2921 2945
2922 return file; 2946 return res;
2923 2947
2924put_dentry: 2948put_dentry:
2925 path_put(&path); 2949 path_put(&path);
2926put_memory: 2950put_memory:
2927 shmem_unacct_size(flags, size); 2951 shmem_unacct_size(flags, size);
2928 return ERR_PTR(error); 2952 return res;
2929} 2953}
2930EXPORT_SYMBOL_GPL(shmem_file_setup); 2954EXPORT_SYMBOL_GPL(shmem_file_setup);
2931 2955
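
The shmem_file_setup() hunks above replace the separate `error` integer with a single `res` pointer threaded through every exit path, so failures return ERR_PTR-encoded errno values (and ERR_CAST() for a broken shm_mnt) while success returns the file itself. Below is a compilable userspace sketch of that error-pointer style, assuming made-up err_ptr()/is_err()/ptr_err() stand-ins for the kernel helpers and a hypothetical demo_setup(); it is an illustration of the pattern, not the kernel implementation.

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/*
 * Userspace stand-ins for the kernel's ERR_PTR()/IS_ERR()/PTR_ERR() helpers:
 * a small negative errno is folded into the top page of the address range,
 * so one pointer return can carry either a valid object or an error code.
 */
#define MAX_ERRNO 4095UL

static void *err_ptr(long error)      { return (void *)error; }
static long ptr_err(const void *ptr)  { return (long)ptr; }
static int is_err(const void *ptr)
{
	return (unsigned long)ptr >= -MAX_ERRNO;
}

/* made-up setup function written in the same single-`res` style as the hunk */
static char *demo_setup(const char *name)
{
	char *buf;
	char *res;

	res = err_ptr(-EINVAL);
	if (!name || !*name)
		goto out;		/* every failure path just returns res */

	res = err_ptr(-ENOMEM);
	buf = strdup(name);
	if (!buf)
		goto out;

	res = buf;			/* success: hand back the real pointer */
out:
	return res;
}

int main(void)
{
	char *ok = demo_setup("tmpfile");
	char *bad = demo_setup("");

	if (!is_err(ok)) {
		printf("ok: \"%s\"\n", ok);
		free(ok);
	}
	if (is_err(bad))
		printf("bad: error %ld\n", ptr_err(bad));
	return 0;
}
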
diff --git a/mm/slab.c b/mm/slab.c
index e7667a3584bc..856e4a192d25 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -812,7 +812,7 @@ static void __slab_error(const char *function, struct kmem_cache *cachep,
812 printk(KERN_ERR "slab error in %s(): cache `%s': %s\n", 812 printk(KERN_ERR "slab error in %s(): cache `%s': %s\n",
813 function, cachep->name, msg); 813 function, cachep->name, msg);
814 dump_stack(); 814 dump_stack();
815 add_taint(TAINT_BAD_PAGE); 815 add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
816} 816}
817#endif 817#endif
818 818
diff --git a/mm/slob.c b/mm/slob.c
index a99fdf7a0907..eeed4a05a2ef 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -360,7 +360,7 @@ static void slob_free(void *block, int size)
360 clear_slob_page_free(sp); 360 clear_slob_page_free(sp);
361 spin_unlock_irqrestore(&slob_lock, flags); 361 spin_unlock_irqrestore(&slob_lock, flags);
362 __ClearPageSlab(sp); 362 __ClearPageSlab(sp);
363 reset_page_mapcount(sp); 363 page_mapcount_reset(sp);
364 slob_free_pages(b, 0); 364 slob_free_pages(b, 0);
365 return; 365 return;
366 } 366 }
diff --git a/mm/slub.c b/mm/slub.c
index ba2ca53f6c3a..4aec53705e4f 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -562,7 +562,7 @@ static void slab_bug(struct kmem_cache *s, char *fmt, ...)
562 printk(KERN_ERR "----------------------------------------" 562 printk(KERN_ERR "----------------------------------------"
563 "-------------------------------------\n\n"); 563 "-------------------------------------\n\n");
564 564
565 add_taint(TAINT_BAD_PAGE); 565 add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
566} 566}
567 567
568static void slab_fix(struct kmem_cache *s, char *fmt, ...) 568static void slab_fix(struct kmem_cache *s, char *fmt, ...)
@@ -1408,7 +1408,7 @@ static void __free_slab(struct kmem_cache *s, struct page *page)
1408 __ClearPageSlab(page); 1408 __ClearPageSlab(page);
1409 1409
1410 memcg_release_pages(s, order); 1410 memcg_release_pages(s, order);
1411 reset_page_mapcount(page); 1411 page_mapcount_reset(page);
1412 if (current->reclaim_state) 1412 if (current->reclaim_state)
1413 current->reclaim_state->reclaimed_slab += pages; 1413 current->reclaim_state->reclaimed_slab += pages;
1414 __free_memcg_kmem_pages(page, order); 1414 __free_memcg_kmem_pages(page, order);
diff --git a/mm/sparse.c b/mm/sparse.c
index 6b5fb762e2ca..7ca6dc847947 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -615,10 +615,11 @@ static inline struct page *kmalloc_section_memmap(unsigned long pnum, int nid,
615} 615}
616static void __kfree_section_memmap(struct page *memmap, unsigned long nr_pages) 616static void __kfree_section_memmap(struct page *memmap, unsigned long nr_pages)
617{ 617{
618 return; /* XXX: Not implemented yet */ 618 vmemmap_free(memmap, nr_pages);
619} 619}
620static void free_map_bootmem(struct page *memmap, unsigned long nr_pages) 620static void free_map_bootmem(struct page *memmap, unsigned long nr_pages)
621{ 621{
622 vmemmap_free(memmap, nr_pages);
622} 623}
623#else 624#else
624static struct page *__kmalloc_section_memmap(unsigned long nr_pages) 625static struct page *__kmalloc_section_memmap(unsigned long nr_pages)
@@ -697,7 +698,7 @@ static void free_section_usemap(struct page *memmap, unsigned long *usemap)
697 /* 698 /*
698 * Check to see if allocation came from hot-plug-add 699 * Check to see if allocation came from hot-plug-add
699 */ 700 */
700 if (PageSlab(usemap_page)) { 701 if (PageSlab(usemap_page) || PageCompound(usemap_page)) {
701 kfree(usemap); 702 kfree(usemap);
702 if (memmap) 703 if (memmap)
703 __kfree_section_memmap(memmap, PAGES_PER_SECTION); 704 __kfree_section_memmap(memmap, PAGES_PER_SECTION);
@@ -782,7 +783,7 @@ static void clear_hwpoisoned_pages(struct page *memmap, int nr_pages)
782 783
783 for (i = 0; i < PAGES_PER_SECTION; i++) { 784 for (i = 0; i < PAGES_PER_SECTION; i++) {
784 if (PageHWPoison(&memmap[i])) { 785 if (PageHWPoison(&memmap[i])) {
785 atomic_long_sub(1, &mce_bad_pages); 786 atomic_long_sub(1, &num_poisoned_pages);
786 ClearPageHWPoison(&memmap[i]); 787 ClearPageHWPoison(&memmap[i]);
787 } 788 }
788 } 789 }
@@ -796,8 +797,10 @@ static inline void clear_hwpoisoned_pages(struct page *memmap, int nr_pages)
796void sparse_remove_one_section(struct zone *zone, struct mem_section *ms) 797void sparse_remove_one_section(struct zone *zone, struct mem_section *ms)
797{ 798{
798 struct page *memmap = NULL; 799 struct page *memmap = NULL;
799 unsigned long *usemap = NULL; 800 unsigned long *usemap = NULL, flags;
801 struct pglist_data *pgdat = zone->zone_pgdat;
800 802
803 pgdat_resize_lock(pgdat, &flags);
801 if (ms->section_mem_map) { 804 if (ms->section_mem_map) {
802 usemap = ms->pageblock_flags; 805 usemap = ms->pageblock_flags;
803 memmap = sparse_decode_mem_map(ms->section_mem_map, 806 memmap = sparse_decode_mem_map(ms->section_mem_map,
@@ -805,6 +808,7 @@ void sparse_remove_one_section(struct zone *zone, struct mem_section *ms)
805 ms->section_mem_map = 0; 808 ms->section_mem_map = 0;
806 ms->pageblock_flags = NULL; 809 ms->pageblock_flags = NULL;
807 } 810 }
811 pgdat_resize_unlock(pgdat, &flags);
808 812
809 clear_hwpoisoned_pages(memmap, PAGES_PER_SECTION); 813 clear_hwpoisoned_pages(memmap, PAGES_PER_SECTION);
810 free_section_usemap(memmap, usemap); 814 free_section_usemap(memmap, usemap);
diff --git a/mm/swap.c b/mm/swap.c
index 6310dc2008ff..8a529a01e8fc 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -855,9 +855,14 @@ EXPORT_SYMBOL(pagevec_lookup_tag);
855void __init swap_setup(void) 855void __init swap_setup(void)
856{ 856{
857 unsigned long megs = totalram_pages >> (20 - PAGE_SHIFT); 857 unsigned long megs = totalram_pages >> (20 - PAGE_SHIFT);
858
859#ifdef CONFIG_SWAP 858#ifdef CONFIG_SWAP
860 bdi_init(swapper_space.backing_dev_info); 859 int i;
860
861 bdi_init(swapper_spaces[0].backing_dev_info);
862 for (i = 0; i < MAX_SWAPFILES; i++) {
863 spin_lock_init(&swapper_spaces[i].tree_lock);
864 INIT_LIST_HEAD(&swapper_spaces[i].i_mmap_nonlinear);
865 }
861#endif 866#endif
862 867
863 /* Use a smaller cluster for small-memory machines */ 868 /* Use a smaller cluster for small-memory machines */
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 0cb36fb1f61c..7efcf1525921 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -36,12 +36,12 @@ static struct backing_dev_info swap_backing_dev_info = {
36 .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_SWAP_BACKED, 36 .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_SWAP_BACKED,
37}; 37};
38 38
39struct address_space swapper_space = { 39struct address_space swapper_spaces[MAX_SWAPFILES] = {
40 .page_tree = RADIX_TREE_INIT(GFP_ATOMIC|__GFP_NOWARN), 40 [0 ... MAX_SWAPFILES - 1] = {
41 .tree_lock = __SPIN_LOCK_UNLOCKED(swapper_space.tree_lock), 41 .page_tree = RADIX_TREE_INIT(GFP_ATOMIC|__GFP_NOWARN),
42 .a_ops = &swap_aops, 42 .a_ops = &swap_aops,
43 .i_mmap_nonlinear = LIST_HEAD_INIT(swapper_space.i_mmap_nonlinear), 43 .backing_dev_info = &swap_backing_dev_info,
44 .backing_dev_info = &swap_backing_dev_info, 44 }
45}; 45};
46 46
47#define INC_CACHE_INFO(x) do { swap_cache_info.x++; } while (0) 47#define INC_CACHE_INFO(x) do { swap_cache_info.x++; } while (0)
@@ -53,13 +53,24 @@ static struct {
53 unsigned long find_total; 53 unsigned long find_total;
54} swap_cache_info; 54} swap_cache_info;
55 55
56unsigned long total_swapcache_pages(void)
57{
58 int i;
59 unsigned long ret = 0;
60
61 for (i = 0; i < MAX_SWAPFILES; i++)
62 ret += swapper_spaces[i].nrpages;
63 return ret;
64}
65
56void show_swap_cache_info(void) 66void show_swap_cache_info(void)
57{ 67{
58 printk("%lu pages in swap cache\n", total_swapcache_pages); 68 printk("%lu pages in swap cache\n", total_swapcache_pages());
59 printk("Swap cache stats: add %lu, delete %lu, find %lu/%lu\n", 69 printk("Swap cache stats: add %lu, delete %lu, find %lu/%lu\n",
60 swap_cache_info.add_total, swap_cache_info.del_total, 70 swap_cache_info.add_total, swap_cache_info.del_total,
61 swap_cache_info.find_success, swap_cache_info.find_total); 71 swap_cache_info.find_success, swap_cache_info.find_total);
62 printk("Free swap = %ldkB\n", nr_swap_pages << (PAGE_SHIFT - 10)); 72 printk("Free swap = %ldkB\n",
73 get_nr_swap_pages() << (PAGE_SHIFT - 10));
63 printk("Total swap = %lukB\n", total_swap_pages << (PAGE_SHIFT - 10)); 74 printk("Total swap = %lukB\n", total_swap_pages << (PAGE_SHIFT - 10));
64} 75}
65 76
@@ -70,6 +81,7 @@ void show_swap_cache_info(void)
70static int __add_to_swap_cache(struct page *page, swp_entry_t entry) 81static int __add_to_swap_cache(struct page *page, swp_entry_t entry)
71{ 82{
72 int error; 83 int error;
84 struct address_space *address_space;
73 85
74 VM_BUG_ON(!PageLocked(page)); 86 VM_BUG_ON(!PageLocked(page));
75 VM_BUG_ON(PageSwapCache(page)); 87 VM_BUG_ON(PageSwapCache(page));
@@ -79,14 +91,16 @@ static int __add_to_swap_cache(struct page *page, swp_entry_t entry)
79 SetPageSwapCache(page); 91 SetPageSwapCache(page);
80 set_page_private(page, entry.val); 92 set_page_private(page, entry.val);
81 93
82 spin_lock_irq(&swapper_space.tree_lock); 94 address_space = swap_address_space(entry);
83 error = radix_tree_insert(&swapper_space.page_tree, entry.val, page); 95 spin_lock_irq(&address_space->tree_lock);
96 error = radix_tree_insert(&address_space->page_tree,
97 entry.val, page);
84 if (likely(!error)) { 98 if (likely(!error)) {
85 total_swapcache_pages++; 99 address_space->nrpages++;
86 __inc_zone_page_state(page, NR_FILE_PAGES); 100 __inc_zone_page_state(page, NR_FILE_PAGES);
87 INC_CACHE_INFO(add_total); 101 INC_CACHE_INFO(add_total);
88 } 102 }
89 spin_unlock_irq(&swapper_space.tree_lock); 103 spin_unlock_irq(&address_space->tree_lock);
90 104
91 if (unlikely(error)) { 105 if (unlikely(error)) {
92 /* 106 /*
@@ -122,14 +136,19 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask)
122 */ 136 */
123void __delete_from_swap_cache(struct page *page) 137void __delete_from_swap_cache(struct page *page)
124{ 138{
139 swp_entry_t entry;
140 struct address_space *address_space;
141
125 VM_BUG_ON(!PageLocked(page)); 142 VM_BUG_ON(!PageLocked(page));
126 VM_BUG_ON(!PageSwapCache(page)); 143 VM_BUG_ON(!PageSwapCache(page));
127 VM_BUG_ON(PageWriteback(page)); 144 VM_BUG_ON(PageWriteback(page));
128 145
129 radix_tree_delete(&swapper_space.page_tree, page_private(page)); 146 entry.val = page_private(page);
147 address_space = swap_address_space(entry);
148 radix_tree_delete(&address_space->page_tree, page_private(page));
130 set_page_private(page, 0); 149 set_page_private(page, 0);
131 ClearPageSwapCache(page); 150 ClearPageSwapCache(page);
132 total_swapcache_pages--; 151 address_space->nrpages--;
133 __dec_zone_page_state(page, NR_FILE_PAGES); 152 __dec_zone_page_state(page, NR_FILE_PAGES);
134 INC_CACHE_INFO(del_total); 153 INC_CACHE_INFO(del_total);
135} 154}
@@ -195,12 +214,14 @@ int add_to_swap(struct page *page)
195void delete_from_swap_cache(struct page *page) 214void delete_from_swap_cache(struct page *page)
196{ 215{
197 swp_entry_t entry; 216 swp_entry_t entry;
217 struct address_space *address_space;
198 218
199 entry.val = page_private(page); 219 entry.val = page_private(page);
200 220
201 spin_lock_irq(&swapper_space.tree_lock); 221 address_space = swap_address_space(entry);
222 spin_lock_irq(&address_space->tree_lock);
202 __delete_from_swap_cache(page); 223 __delete_from_swap_cache(page);
203 spin_unlock_irq(&swapper_space.tree_lock); 224 spin_unlock_irq(&address_space->tree_lock);
204 225
205 swapcache_free(entry, page); 226 swapcache_free(entry, page);
206 page_cache_release(page); 227 page_cache_release(page);
@@ -263,7 +284,7 @@ struct page * lookup_swap_cache(swp_entry_t entry)
263{ 284{
264 struct page *page; 285 struct page *page;
265 286
266 page = find_get_page(&swapper_space, entry.val); 287 page = find_get_page(swap_address_space(entry), entry.val);
267 288
268 if (page) 289 if (page)
269 INC_CACHE_INFO(find_success); 290 INC_CACHE_INFO(find_success);
@@ -290,7 +311,8 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
290 * called after lookup_swap_cache() failed, re-calling 311 * called after lookup_swap_cache() failed, re-calling
291 * that would confuse statistics. 312 * that would confuse statistics.
292 */ 313 */
293 found_page = find_get_page(&swapper_space, entry.val); 314 found_page = find_get_page(swap_address_space(entry),
315 entry.val);
294 if (found_page) 316 if (found_page)
295 break; 317 break;
296 318
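
The mm/swap_state.c hunks above split the single swapper_space into a swapper_spaces[MAX_SWAPFILES] array: each swap type gets its own radix tree, tree_lock and nrpages count, swap_address_space(entry) selects the address space for a given entry, and total_swapcache_pages() becomes a sum over the array. A minimal userspace sketch of that shard-by-type shape follows, assuming a made-up entry layout; demo_space, space_for(), demo_add() and demo_total() are illustrative names, not kernel APIs. Build with `cc -pthread`.

#include <pthread.h>
#include <stdio.h>

#define DEMO_TYPES	4	/* stands in for MAX_SWAPFILES */
#define TYPE_SHIFT	8	/* hypothetical: swap type kept above the offset bits */

struct demo_space {			/* one slot of a swapper_spaces[]-like array */
	pthread_mutex_t lock;		/* per-type lock, like address_space->tree_lock */
	unsigned long nrpages;		/* per-type swap cache page count */
};

static struct demo_space spaces[DEMO_TYPES] = {
	[0 ... DEMO_TYPES - 1] = { PTHREAD_MUTEX_INITIALIZER, 0 }
};

/* analogue of swap_address_space(entry): pick the space from the entry's type */
static struct demo_space *space_for(unsigned long entry)
{
	return &spaces[(entry >> TYPE_SHIFT) % DEMO_TYPES];
}

/* analogue of __add_to_swap_cache(): touch only the owning space, under its lock */
static void demo_add(unsigned long entry)
{
	struct demo_space *s = space_for(entry);

	pthread_mutex_lock(&s->lock);
	s->nrpages++;
	pthread_mutex_unlock(&s->lock);
}

/* analogue of total_swapcache_pages(): the global count is now a sum over types */
static unsigned long demo_total(void)
{
	unsigned long total = 0;
	int i;

	for (i = 0; i < DEMO_TYPES; i++)
		total += spaces[i].nrpages;
	return total;
}

int main(void)
{
	demo_add((1UL << TYPE_SHIFT) | 5);	/* type 1, offset 5 */
	demo_add((1UL << TYPE_SHIFT) | 9);	/* type 1, offset 9 */
	demo_add((3UL << TYPE_SHIFT) | 2);	/* type 3, offset 2 */
	printf("swap cache pages: %lu\n", demo_total());
	return 0;
}
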
diff --git a/mm/swapfile.c b/mm/swapfile.c
index e97a0e5aea91..a1f7772a01fc 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -47,9 +47,11 @@ static sector_t map_swap_entry(swp_entry_t, struct block_device**);
47 47
48DEFINE_SPINLOCK(swap_lock); 48DEFINE_SPINLOCK(swap_lock);
49static unsigned int nr_swapfiles; 49static unsigned int nr_swapfiles;
50long nr_swap_pages; 50atomic_long_t nr_swap_pages;
51/* protected with swap_lock. reading in vm_swap_full() doesn't need lock */
51long total_swap_pages; 52long total_swap_pages;
52static int least_priority; 53static int least_priority;
54static atomic_t highest_priority_index = ATOMIC_INIT(-1);
53 55
54static const char Bad_file[] = "Bad swap file entry "; 56static const char Bad_file[] = "Bad swap file entry ";
55static const char Unused_file[] = "Unused swap file entry "; 57static const char Unused_file[] = "Unused swap file entry ";
@@ -79,7 +81,7 @@ __try_to_reclaim_swap(struct swap_info_struct *si, unsigned long offset)
79 struct page *page; 81 struct page *page;
80 int ret = 0; 82 int ret = 0;
81 83
82 page = find_get_page(&swapper_space, entry.val); 84 page = find_get_page(swap_address_space(entry), entry.val);
83 if (!page) 85 if (!page)
84 return 0; 86 return 0;
85 /* 87 /*
@@ -223,7 +225,7 @@ static unsigned long scan_swap_map(struct swap_info_struct *si,
223 si->lowest_alloc = si->max; 225 si->lowest_alloc = si->max;
224 si->highest_alloc = 0; 226 si->highest_alloc = 0;
225 } 227 }
226 spin_unlock(&swap_lock); 228 spin_unlock(&si->lock);
227 229
228 /* 230 /*
229 * If seek is expensive, start searching for new cluster from 231 * If seek is expensive, start searching for new cluster from
@@ -242,7 +244,7 @@ static unsigned long scan_swap_map(struct swap_info_struct *si,
242 if (si->swap_map[offset]) 244 if (si->swap_map[offset])
243 last_in_cluster = offset + SWAPFILE_CLUSTER; 245 last_in_cluster = offset + SWAPFILE_CLUSTER;
244 else if (offset == last_in_cluster) { 246 else if (offset == last_in_cluster) {
245 spin_lock(&swap_lock); 247 spin_lock(&si->lock);
246 offset -= SWAPFILE_CLUSTER - 1; 248 offset -= SWAPFILE_CLUSTER - 1;
247 si->cluster_next = offset; 249 si->cluster_next = offset;
248 si->cluster_nr = SWAPFILE_CLUSTER - 1; 250 si->cluster_nr = SWAPFILE_CLUSTER - 1;
@@ -263,7 +265,7 @@ static unsigned long scan_swap_map(struct swap_info_struct *si,
263 if (si->swap_map[offset]) 265 if (si->swap_map[offset])
264 last_in_cluster = offset + SWAPFILE_CLUSTER; 266 last_in_cluster = offset + SWAPFILE_CLUSTER;
265 else if (offset == last_in_cluster) { 267 else if (offset == last_in_cluster) {
266 spin_lock(&swap_lock); 268 spin_lock(&si->lock);
267 offset -= SWAPFILE_CLUSTER - 1; 269 offset -= SWAPFILE_CLUSTER - 1;
268 si->cluster_next = offset; 270 si->cluster_next = offset;
269 si->cluster_nr = SWAPFILE_CLUSTER - 1; 271 si->cluster_nr = SWAPFILE_CLUSTER - 1;
@@ -277,7 +279,7 @@ static unsigned long scan_swap_map(struct swap_info_struct *si,
277 } 279 }
278 280
279 offset = scan_base; 281 offset = scan_base;
280 spin_lock(&swap_lock); 282 spin_lock(&si->lock);
281 si->cluster_nr = SWAPFILE_CLUSTER - 1; 283 si->cluster_nr = SWAPFILE_CLUSTER - 1;
282 si->lowest_alloc = 0; 284 si->lowest_alloc = 0;
283 } 285 }
@@ -293,9 +295,9 @@ checks:
293 /* reuse swap entry of cache-only swap if not busy. */ 295 /* reuse swap entry of cache-only swap if not busy. */
294 if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) { 296 if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
295 int swap_was_freed; 297 int swap_was_freed;
296 spin_unlock(&swap_lock); 298 spin_unlock(&si->lock);
297 swap_was_freed = __try_to_reclaim_swap(si, offset); 299 swap_was_freed = __try_to_reclaim_swap(si, offset);
298 spin_lock(&swap_lock); 300 spin_lock(&si->lock);
299 /* entry was freed successfully, try to use this again */ 301 /* entry was freed successfully, try to use this again */
300 if (swap_was_freed) 302 if (swap_was_freed)
301 goto checks; 303 goto checks;
@@ -335,13 +337,13 @@ checks:
335 si->lowest_alloc <= last_in_cluster) 337 si->lowest_alloc <= last_in_cluster)
336 last_in_cluster = si->lowest_alloc - 1; 338 last_in_cluster = si->lowest_alloc - 1;
337 si->flags |= SWP_DISCARDING; 339 si->flags |= SWP_DISCARDING;
338 spin_unlock(&swap_lock); 340 spin_unlock(&si->lock);
339 341
340 if (offset < last_in_cluster) 342 if (offset < last_in_cluster)
341 discard_swap_cluster(si, offset, 343 discard_swap_cluster(si, offset,
342 last_in_cluster - offset + 1); 344 last_in_cluster - offset + 1);
343 345
344 spin_lock(&swap_lock); 346 spin_lock(&si->lock);
345 si->lowest_alloc = 0; 347 si->lowest_alloc = 0;
346 si->flags &= ~SWP_DISCARDING; 348 si->flags &= ~SWP_DISCARDING;
347 349
@@ -355,10 +357,10 @@ checks:
355 * could defer that delay until swap_writepage, 357 * could defer that delay until swap_writepage,
356 * but it's easier to keep this self-contained. 358 * but it's easier to keep this self-contained.
357 */ 359 */
358 spin_unlock(&swap_lock); 360 spin_unlock(&si->lock);
359 wait_on_bit(&si->flags, ilog2(SWP_DISCARDING), 361 wait_on_bit(&si->flags, ilog2(SWP_DISCARDING),
360 wait_for_discard, TASK_UNINTERRUPTIBLE); 362 wait_for_discard, TASK_UNINTERRUPTIBLE);
361 spin_lock(&swap_lock); 363 spin_lock(&si->lock);
362 } else { 364 } else {
363 /* 365 /*
364 * Note pages allocated by racing tasks while 366 * Note pages allocated by racing tasks while
@@ -374,14 +376,14 @@ checks:
374 return offset; 376 return offset;
375 377
376scan: 378scan:
377 spin_unlock(&swap_lock); 379 spin_unlock(&si->lock);
378 while (++offset <= si->highest_bit) { 380 while (++offset <= si->highest_bit) {
379 if (!si->swap_map[offset]) { 381 if (!si->swap_map[offset]) {
380 spin_lock(&swap_lock); 382 spin_lock(&si->lock);
381 goto checks; 383 goto checks;
382 } 384 }
383 if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) { 385 if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
384 spin_lock(&swap_lock); 386 spin_lock(&si->lock);
385 goto checks; 387 goto checks;
386 } 388 }
387 if (unlikely(--latency_ration < 0)) { 389 if (unlikely(--latency_ration < 0)) {
@@ -392,11 +394,11 @@ scan:
392 offset = si->lowest_bit; 394 offset = si->lowest_bit;
393 while (++offset < scan_base) { 395 while (++offset < scan_base) {
394 if (!si->swap_map[offset]) { 396 if (!si->swap_map[offset]) {
395 spin_lock(&swap_lock); 397 spin_lock(&si->lock);
396 goto checks; 398 goto checks;
397 } 399 }
398 if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) { 400 if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
399 spin_lock(&swap_lock); 401 spin_lock(&si->lock);
400 goto checks; 402 goto checks;
401 } 403 }
402 if (unlikely(--latency_ration < 0)) { 404 if (unlikely(--latency_ration < 0)) {
@@ -404,7 +406,7 @@ scan:
404 latency_ration = LATENCY_LIMIT; 406 latency_ration = LATENCY_LIMIT;
405 } 407 }
406 } 408 }
407 spin_lock(&swap_lock); 409 spin_lock(&si->lock);
408 410
409no_page: 411no_page:
410 si->flags -= SWP_SCANNING; 412 si->flags -= SWP_SCANNING;
@@ -417,13 +419,34 @@ swp_entry_t get_swap_page(void)
417 pgoff_t offset; 419 pgoff_t offset;
418 int type, next; 420 int type, next;
419 int wrapped = 0; 421 int wrapped = 0;
422 int hp_index;
420 423
421 spin_lock(&swap_lock); 424 spin_lock(&swap_lock);
422 if (nr_swap_pages <= 0) 425 if (atomic_long_read(&nr_swap_pages) <= 0)
423 goto noswap; 426 goto noswap;
424 nr_swap_pages--; 427 atomic_long_dec(&nr_swap_pages);
425 428
426 for (type = swap_list.next; type >= 0 && wrapped < 2; type = next) { 429 for (type = swap_list.next; type >= 0 && wrapped < 2; type = next) {
430 hp_index = atomic_xchg(&highest_priority_index, -1);
431 /*
432 * highest_priority_index records current highest priority swap
433 * type which just frees swap entries. If its priority is
434 * higher than that of swap_list.next swap type, we use it. It
435 * isn't protected by swap_lock, so it can be an invalid value
436 * if the corresponding swap type is swapoff. We double check
437 * the flags here. It's even possible the swap type is swapoff
438 * and swapon again and its priority is changed. In such rare
 439 * case, low priority swap type might be used, but eventually
440 * high priority swap will be used after several rounds of
441 * swap.
442 */
443 if (hp_index != -1 && hp_index != type &&
444 swap_info[type]->prio < swap_info[hp_index]->prio &&
445 (swap_info[hp_index]->flags & SWP_WRITEOK)) {
446 type = hp_index;
447 swap_list.next = type;
448 }
449
427 si = swap_info[type]; 450 si = swap_info[type];
428 next = si->next; 451 next = si->next;
429 if (next < 0 || 452 if (next < 0 ||
@@ -432,22 +455,29 @@ swp_entry_t get_swap_page(void)
432 wrapped++; 455 wrapped++;
433 } 456 }
434 457
435 if (!si->highest_bit) 458 spin_lock(&si->lock);
459 if (!si->highest_bit) {
460 spin_unlock(&si->lock);
436 continue; 461 continue;
437 if (!(si->flags & SWP_WRITEOK)) 462 }
463 if (!(si->flags & SWP_WRITEOK)) {
464 spin_unlock(&si->lock);
438 continue; 465 continue;
466 }
439 467
440 swap_list.next = next; 468 swap_list.next = next;
469
470 spin_unlock(&swap_lock);
441 /* This is called for allocating swap entry for cache */ 471 /* This is called for allocating swap entry for cache */
442 offset = scan_swap_map(si, SWAP_HAS_CACHE); 472 offset = scan_swap_map(si, SWAP_HAS_CACHE);
443 if (offset) { 473 spin_unlock(&si->lock);
444 spin_unlock(&swap_lock); 474 if (offset)
445 return swp_entry(type, offset); 475 return swp_entry(type, offset);
446 } 476 spin_lock(&swap_lock);
447 next = swap_list.next; 477 next = swap_list.next;
448 } 478 }
449 479
450 nr_swap_pages++; 480 atomic_long_inc(&nr_swap_pages);
451noswap: 481noswap:
452 spin_unlock(&swap_lock); 482 spin_unlock(&swap_lock);
453 return (swp_entry_t) {0}; 483 return (swp_entry_t) {0};
@@ -459,19 +489,19 @@ swp_entry_t get_swap_page_of_type(int type)
459 struct swap_info_struct *si; 489 struct swap_info_struct *si;
460 pgoff_t offset; 490 pgoff_t offset;
461 491
462 spin_lock(&swap_lock);
463 si = swap_info[type]; 492 si = swap_info[type];
493 spin_lock(&si->lock);
464 if (si && (si->flags & SWP_WRITEOK)) { 494 if (si && (si->flags & SWP_WRITEOK)) {
465 nr_swap_pages--; 495 atomic_long_dec(&nr_swap_pages);
466 /* This is called for allocating swap entry, not cache */ 496 /* This is called for allocating swap entry, not cache */
467 offset = scan_swap_map(si, 1); 497 offset = scan_swap_map(si, 1);
468 if (offset) { 498 if (offset) {
469 spin_unlock(&swap_lock); 499 spin_unlock(&si->lock);
470 return swp_entry(type, offset); 500 return swp_entry(type, offset);
471 } 501 }
472 nr_swap_pages++; 502 atomic_long_inc(&nr_swap_pages);
473 } 503 }
474 spin_unlock(&swap_lock); 504 spin_unlock(&si->lock);
475 return (swp_entry_t) {0}; 505 return (swp_entry_t) {0};
476} 506}
477 507
@@ -493,7 +523,7 @@ static struct swap_info_struct *swap_info_get(swp_entry_t entry)
493 goto bad_offset; 523 goto bad_offset;
494 if (!p->swap_map[offset]) 524 if (!p->swap_map[offset])
495 goto bad_free; 525 goto bad_free;
496 spin_lock(&swap_lock); 526 spin_lock(&p->lock);
497 return p; 527 return p;
498 528
499bad_free: 529bad_free:
@@ -511,6 +541,27 @@ out:
511 return NULL; 541 return NULL;
512} 542}
513 543
544/*
545 * This swap type frees swap entry, check if it is the highest priority swap
546 * type which just frees swap entry. get_swap_page() uses
547 * highest_priority_index to search highest priority swap type. The
548 * swap_info_struct.lock can't protect us if there are multiple swap types
549 * active, so we use atomic_cmpxchg.
550 */
551static void set_highest_priority_index(int type)
552{
553 int old_hp_index, new_hp_index;
554
555 do {
556 old_hp_index = atomic_read(&highest_priority_index);
557 if (old_hp_index != -1 &&
558 swap_info[old_hp_index]->prio >= swap_info[type]->prio)
559 break;
560 new_hp_index = type;
561 } while (atomic_cmpxchg(&highest_priority_index,
562 old_hp_index, new_hp_index) != old_hp_index);
563}
564
514static unsigned char swap_entry_free(struct swap_info_struct *p, 565static unsigned char swap_entry_free(struct swap_info_struct *p,
515 swp_entry_t entry, unsigned char usage) 566 swp_entry_t entry, unsigned char usage)
516{ 567{
@@ -553,10 +604,8 @@ static unsigned char swap_entry_free(struct swap_info_struct *p,
553 p->lowest_bit = offset; 604 p->lowest_bit = offset;
554 if (offset > p->highest_bit) 605 if (offset > p->highest_bit)
555 p->highest_bit = offset; 606 p->highest_bit = offset;
556 if (swap_list.next >= 0 && 607 set_highest_priority_index(p->type);
557 p->prio > swap_info[swap_list.next]->prio) 608 atomic_long_inc(&nr_swap_pages);
558 swap_list.next = p->type;
559 nr_swap_pages++;
560 p->inuse_pages--; 609 p->inuse_pages--;
561 frontswap_invalidate_page(p->type, offset); 610 frontswap_invalidate_page(p->type, offset);
562 if (p->flags & SWP_BLKDEV) { 611 if (p->flags & SWP_BLKDEV) {
@@ -581,7 +630,7 @@ void swap_free(swp_entry_t entry)
581 p = swap_info_get(entry); 630 p = swap_info_get(entry);
582 if (p) { 631 if (p) {
583 swap_entry_free(p, entry, 1); 632 swap_entry_free(p, entry, 1);
584 spin_unlock(&swap_lock); 633 spin_unlock(&p->lock);
585 } 634 }
586} 635}
587 636
@@ -598,7 +647,7 @@ void swapcache_free(swp_entry_t entry, struct page *page)
598 count = swap_entry_free(p, entry, SWAP_HAS_CACHE); 647 count = swap_entry_free(p, entry, SWAP_HAS_CACHE);
599 if (page) 648 if (page)
600 mem_cgroup_uncharge_swapcache(page, entry, count != 0); 649 mem_cgroup_uncharge_swapcache(page, entry, count != 0);
601 spin_unlock(&swap_lock); 650 spin_unlock(&p->lock);
602 } 651 }
603} 652}
604 653
@@ -617,7 +666,7 @@ int page_swapcount(struct page *page)
617 p = swap_info_get(entry); 666 p = swap_info_get(entry);
618 if (p) { 667 if (p) {
619 count = swap_count(p->swap_map[swp_offset(entry)]); 668 count = swap_count(p->swap_map[swp_offset(entry)]);
620 spin_unlock(&swap_lock); 669 spin_unlock(&p->lock);
621 } 670 }
622 return count; 671 return count;
623} 672}
@@ -699,13 +748,14 @@ int free_swap_and_cache(swp_entry_t entry)
699 p = swap_info_get(entry); 748 p = swap_info_get(entry);
700 if (p) { 749 if (p) {
701 if (swap_entry_free(p, entry, 1) == SWAP_HAS_CACHE) { 750 if (swap_entry_free(p, entry, 1) == SWAP_HAS_CACHE) {
702 page = find_get_page(&swapper_space, entry.val); 751 page = find_get_page(swap_address_space(entry),
752 entry.val);
703 if (page && !trylock_page(page)) { 753 if (page && !trylock_page(page)) {
704 page_cache_release(page); 754 page_cache_release(page);
705 page = NULL; 755 page = NULL;
706 } 756 }
707 } 757 }
708 spin_unlock(&swap_lock); 758 spin_unlock(&p->lock);
709 } 759 }
710 if (page) { 760 if (page) {
711 /* 761 /*
@@ -803,11 +853,13 @@ unsigned int count_swap_pages(int type, int free)
803 if ((unsigned int)type < nr_swapfiles) { 853 if ((unsigned int)type < nr_swapfiles) {
804 struct swap_info_struct *sis = swap_info[type]; 854 struct swap_info_struct *sis = swap_info[type];
805 855
856 spin_lock(&sis->lock);
806 if (sis->flags & SWP_WRITEOK) { 857 if (sis->flags & SWP_WRITEOK) {
807 n = sis->pages; 858 n = sis->pages;
808 if (free) 859 if (free)
809 n -= sis->inuse_pages; 860 n -= sis->inuse_pages;
810 } 861 }
862 spin_unlock(&sis->lock);
811 } 863 }
812 spin_unlock(&swap_lock); 864 spin_unlock(&swap_lock);
813 return n; 865 return n;
@@ -822,11 +874,17 @@ unsigned int count_swap_pages(int type, int free)
822static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, 874static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
823 unsigned long addr, swp_entry_t entry, struct page *page) 875 unsigned long addr, swp_entry_t entry, struct page *page)
824{ 876{
877 struct page *swapcache;
825 struct mem_cgroup *memcg; 878 struct mem_cgroup *memcg;
826 spinlock_t *ptl; 879 spinlock_t *ptl;
827 pte_t *pte; 880 pte_t *pte;
828 int ret = 1; 881 int ret = 1;
829 882
883 swapcache = page;
884 page = ksm_might_need_to_copy(page, vma, addr);
885 if (unlikely(!page))
886 return -ENOMEM;
887
830 if (mem_cgroup_try_charge_swapin(vma->vm_mm, page, 888 if (mem_cgroup_try_charge_swapin(vma->vm_mm, page,
831 GFP_KERNEL, &memcg)) { 889 GFP_KERNEL, &memcg)) {
832 ret = -ENOMEM; 890 ret = -ENOMEM;
@@ -845,7 +903,10 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
845 get_page(page); 903 get_page(page);
846 set_pte_at(vma->vm_mm, addr, pte, 904 set_pte_at(vma->vm_mm, addr, pte,
847 pte_mkold(mk_pte(page, vma->vm_page_prot))); 905 pte_mkold(mk_pte(page, vma->vm_page_prot)));
848 page_add_anon_rmap(page, vma, addr); 906 if (page == swapcache)
907 page_add_anon_rmap(page, vma, addr);
908 else /* ksm created a completely new copy */
909 page_add_new_anon_rmap(page, vma, addr);
849 mem_cgroup_commit_charge_swapin(page, memcg); 910 mem_cgroup_commit_charge_swapin(page, memcg);
850 swap_free(entry); 911 swap_free(entry);
851 /* 912 /*
@@ -856,6 +917,10 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
856out: 917out:
857 pte_unmap_unlock(pte, ptl); 918 pte_unmap_unlock(pte, ptl);
858out_nolock: 919out_nolock:
920 if (page != swapcache) {
921 unlock_page(page);
922 put_page(page);
923 }
859 return ret; 924 return ret;
860} 925}
861 926
@@ -1456,7 +1521,7 @@ static void _enable_swap_info(struct swap_info_struct *p, int prio,
1456 p->swap_map = swap_map; 1521 p->swap_map = swap_map;
1457 frontswap_map_set(p, frontswap_map); 1522 frontswap_map_set(p, frontswap_map);
1458 p->flags |= SWP_WRITEOK; 1523 p->flags |= SWP_WRITEOK;
1459 nr_swap_pages += p->pages; 1524 atomic_long_add(p->pages, &nr_swap_pages);
1460 total_swap_pages += p->pages; 1525 total_swap_pages += p->pages;
1461 1526
1462 /* insert swap space into swap_list: */ 1527 /* insert swap space into swap_list: */
@@ -1478,15 +1543,19 @@ static void enable_swap_info(struct swap_info_struct *p, int prio,
1478 unsigned long *frontswap_map) 1543 unsigned long *frontswap_map)
1479{ 1544{
1480 spin_lock(&swap_lock); 1545 spin_lock(&swap_lock);
1546 spin_lock(&p->lock);
1481 _enable_swap_info(p, prio, swap_map, frontswap_map); 1547 _enable_swap_info(p, prio, swap_map, frontswap_map);
1482 frontswap_init(p->type); 1548 frontswap_init(p->type);
1549 spin_unlock(&p->lock);
1483 spin_unlock(&swap_lock); 1550 spin_unlock(&swap_lock);
1484} 1551}
1485 1552
1486static void reinsert_swap_info(struct swap_info_struct *p) 1553static void reinsert_swap_info(struct swap_info_struct *p)
1487{ 1554{
1488 spin_lock(&swap_lock); 1555 spin_lock(&swap_lock);
1556 spin_lock(&p->lock);
1489 _enable_swap_info(p, p->prio, p->swap_map, frontswap_map_get(p)); 1557 _enable_swap_info(p, p->prio, p->swap_map, frontswap_map_get(p));
1558 spin_unlock(&p->lock);
1490 spin_unlock(&swap_lock); 1559 spin_unlock(&swap_lock);
1491} 1560}
1492 1561
@@ -1546,14 +1615,16 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
1546 /* just pick something that's safe... */ 1615 /* just pick something that's safe... */
1547 swap_list.next = swap_list.head; 1616 swap_list.next = swap_list.head;
1548 } 1617 }
1618 spin_lock(&p->lock);
1549 if (p->prio < 0) { 1619 if (p->prio < 0) {
1550 for (i = p->next; i >= 0; i = swap_info[i]->next) 1620 for (i = p->next; i >= 0; i = swap_info[i]->next)
1551 swap_info[i]->prio = p->prio--; 1621 swap_info[i]->prio = p->prio--;
1552 least_priority++; 1622 least_priority++;
1553 } 1623 }
1554 nr_swap_pages -= p->pages; 1624 atomic_long_sub(p->pages, &nr_swap_pages);
1555 total_swap_pages -= p->pages; 1625 total_swap_pages -= p->pages;
1556 p->flags &= ~SWP_WRITEOK; 1626 p->flags &= ~SWP_WRITEOK;
1627 spin_unlock(&p->lock);
1557 spin_unlock(&swap_lock); 1628 spin_unlock(&swap_lock);
1558 1629
1559 set_current_oom_origin(); 1630 set_current_oom_origin();
@@ -1572,14 +1643,17 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
1572 1643
1573 mutex_lock(&swapon_mutex); 1644 mutex_lock(&swapon_mutex);
1574 spin_lock(&swap_lock); 1645 spin_lock(&swap_lock);
1646 spin_lock(&p->lock);
1575 drain_mmlist(); 1647 drain_mmlist();
1576 1648
1577 /* wait for anyone still in scan_swap_map */ 1649 /* wait for anyone still in scan_swap_map */
1578 p->highest_bit = 0; /* cuts scans short */ 1650 p->highest_bit = 0; /* cuts scans short */
1579 while (p->flags >= SWP_SCANNING) { 1651 while (p->flags >= SWP_SCANNING) {
1652 spin_unlock(&p->lock);
1580 spin_unlock(&swap_lock); 1653 spin_unlock(&swap_lock);
1581 schedule_timeout_uninterruptible(1); 1654 schedule_timeout_uninterruptible(1);
1582 spin_lock(&swap_lock); 1655 spin_lock(&swap_lock);
1656 spin_lock(&p->lock);
1583 } 1657 }
1584 1658
1585 swap_file = p->swap_file; 1659 swap_file = p->swap_file;
@@ -1589,6 +1663,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
1589 p->swap_map = NULL; 1663 p->swap_map = NULL;
1590 p->flags = 0; 1664 p->flags = 0;
1591 frontswap_invalidate_area(type); 1665 frontswap_invalidate_area(type);
1666 spin_unlock(&p->lock);
1592 spin_unlock(&swap_lock); 1667 spin_unlock(&swap_lock);
1593 mutex_unlock(&swapon_mutex); 1668 mutex_unlock(&swapon_mutex);
1594 vfree(swap_map); 1669 vfree(swap_map);
@@ -1699,7 +1774,7 @@ static int swap_show(struct seq_file *swap, void *v)
1699 len = seq_path(swap, &file->f_path, " \t\n\\"); 1774 len = seq_path(swap, &file->f_path, " \t\n\\");
1700 seq_printf(swap, "%*s%s\t%u\t%u\t%d\n", 1775 seq_printf(swap, "%*s%s\t%u\t%u\t%d\n",
1701 len < 40 ? 40 - len : 1, " ", 1776 len < 40 ? 40 - len : 1, " ",
1702 S_ISBLK(file->f_path.dentry->d_inode->i_mode) ? 1777 S_ISBLK(file_inode(file)->i_mode) ?
1703 "partition" : "file\t", 1778 "partition" : "file\t",
1704 si->pages << (PAGE_SHIFT - 10), 1779 si->pages << (PAGE_SHIFT - 10),
1705 si->inuse_pages << (PAGE_SHIFT - 10), 1780 si->inuse_pages << (PAGE_SHIFT - 10),
@@ -1794,6 +1869,7 @@ static struct swap_info_struct *alloc_swap_info(void)
1794 p->flags = SWP_USED; 1869 p->flags = SWP_USED;
1795 p->next = -1; 1870 p->next = -1;
1796 spin_unlock(&swap_lock); 1871 spin_unlock(&swap_lock);
1872 spin_lock_init(&p->lock);
1797 1873
1798 return p; 1874 return p;
1799} 1875}
@@ -2116,7 +2192,7 @@ void si_swapinfo(struct sysinfo *val)
2116 if ((si->flags & SWP_USED) && !(si->flags & SWP_WRITEOK)) 2192 if ((si->flags & SWP_USED) && !(si->flags & SWP_WRITEOK))
2117 nr_to_be_unused += si->inuse_pages; 2193 nr_to_be_unused += si->inuse_pages;
2118 } 2194 }
2119 val->freeswap = nr_swap_pages + nr_to_be_unused; 2195 val->freeswap = atomic_long_read(&nr_swap_pages) + nr_to_be_unused;
2120 val->totalswap = total_swap_pages + nr_to_be_unused; 2196 val->totalswap = total_swap_pages + nr_to_be_unused;
2121 spin_unlock(&swap_lock); 2197 spin_unlock(&swap_lock);
2122} 2198}
@@ -2149,7 +2225,7 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage)
2149 p = swap_info[type]; 2225 p = swap_info[type];
2150 offset = swp_offset(entry); 2226 offset = swp_offset(entry);
2151 2227
2152 spin_lock(&swap_lock); 2228 spin_lock(&p->lock);
2153 if (unlikely(offset >= p->max)) 2229 if (unlikely(offset >= p->max))
2154 goto unlock_out; 2230 goto unlock_out;
2155 2231
@@ -2184,7 +2260,7 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage)
2184 p->swap_map[offset] = count | has_cache; 2260 p->swap_map[offset] = count | has_cache;
2185 2261
2186unlock_out: 2262unlock_out:
2187 spin_unlock(&swap_lock); 2263 spin_unlock(&p->lock);
2188out: 2264out:
2189 return err; 2265 return err;
2190 2266
@@ -2309,7 +2385,7 @@ int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask)
2309 } 2385 }
2310 2386
2311 if (!page) { 2387 if (!page) {
2312 spin_unlock(&swap_lock); 2388 spin_unlock(&si->lock);
2313 return -ENOMEM; 2389 return -ENOMEM;
2314 } 2390 }
2315 2391
@@ -2357,7 +2433,7 @@ int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask)
2357 list_add_tail(&page->lru, &head->lru); 2433 list_add_tail(&page->lru, &head->lru);
2358 page = NULL; /* now it's attached, don't free it */ 2434 page = NULL; /* now it's attached, don't free it */
2359out: 2435out:
2360 spin_unlock(&swap_lock); 2436 spin_unlock(&si->lock);
2361outer: 2437outer:
2362 if (page) 2438 if (page)
2363 __free_page(page); 2439 __free_page(page);
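
In the mm/swapfile.c hunks above, swap_entry_free() stops rewinding swap_list.next directly and instead publishes the freeing swap type through highest_priority_index with an atomic_cmpxchg() loop, which get_swap_page() later consumes via atomic_xchg(). The following compilable C11 sketch shows that lock-free publish/consume hint in isolation; the prio[] table, publish_free() and consume_hint() are hypothetical stand-ins for swap_info[]->prio and the kernel functions, not their actual implementation.

#include <stdatomic.h>
#include <stdio.h>

static int prio[] = { -2, 5, 3, 10 };		/* hypothetical per-type priorities */
static atomic_int highest_priority_index = -1;	/* -1 means "no hint published" */

/* analogue of set_highest_priority_index(type) */
static void publish_free(int type)
{
	int old;

	do {
		old = atomic_load(&highest_priority_index);
		/* keep the existing hint if it already has higher (or equal) priority */
		if (old != -1 && prio[old] >= prio[type])
			break;
	} while (!atomic_compare_exchange_weak(&highest_priority_index, &old, type));
}

/* analogue of the consumer in get_swap_page(): take the hint and clear it */
static int consume_hint(void)
{
	return atomic_exchange(&highest_priority_index, -1);
}

int main(void)
{
	publish_free(2);			/* prio 3 published */
	publish_free(1);			/* prio 5 replaces it */
	publish_free(2);			/* prio 3 loses, hint stays at type 1 */
	printf("hint = %d\n", consume_hint());	/* prints 1 */
	printf("hint = %d\n", consume_hint());	/* prints -1: already consumed */
	return 0;
}
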
diff --git a/mm/util.c b/mm/util.c
index c55e26b17d93..ab1424dbe2e6 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -5,6 +5,8 @@
5#include <linux/err.h> 5#include <linux/err.h>
6#include <linux/sched.h> 6#include <linux/sched.h>
7#include <linux/security.h> 7#include <linux/security.h>
8#include <linux/swap.h>
9#include <linux/swapops.h>
8#include <asm/uaccess.h> 10#include <asm/uaccess.h>
9 11
10#include "internal.h" 12#include "internal.h"
@@ -355,12 +357,16 @@ unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr,
355{ 357{
356 unsigned long ret; 358 unsigned long ret;
357 struct mm_struct *mm = current->mm; 359 struct mm_struct *mm = current->mm;
360 unsigned long populate;
358 361
359 ret = security_mmap_file(file, prot, flag); 362 ret = security_mmap_file(file, prot, flag);
360 if (!ret) { 363 if (!ret) {
361 down_write(&mm->mmap_sem); 364 down_write(&mm->mmap_sem);
362 ret = do_mmap_pgoff(file, addr, len, prot, flag, pgoff); 365 ret = do_mmap_pgoff(file, addr, len, prot, flag, pgoff,
366 &populate);
363 up_write(&mm->mmap_sem); 367 up_write(&mm->mmap_sem);
368 if (populate)
369 mm_populate(ret, populate);
364 } 370 }
365 return ret; 371 return ret;
366} 372}
@@ -378,6 +384,24 @@ unsigned long vm_mmap(struct file *file, unsigned long addr,
378} 384}
379EXPORT_SYMBOL(vm_mmap); 385EXPORT_SYMBOL(vm_mmap);
380 386
387struct address_space *page_mapping(struct page *page)
388{
389 struct address_space *mapping = page->mapping;
390
391 VM_BUG_ON(PageSlab(page));
392#ifdef CONFIG_SWAP
393 if (unlikely(PageSwapCache(page))) {
394 swp_entry_t entry;
395
396 entry.val = page_private(page);
397 mapping = swap_address_space(entry);
398 } else
399#endif
400 if ((unsigned long)mapping & PAGE_MAPPING_ANON)
401 mapping = NULL;
402 return mapping;
403}
404
381/* Tracepoints definitions. */ 405/* Tracepoints definitions. */
382EXPORT_TRACEPOINT_SYMBOL(kmalloc); 406EXPORT_TRACEPOINT_SYMBOL(kmalloc);
383EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc); 407EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc);
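
vm_mmap_pgoff() above now receives the amount to pre-fault from do_mmap_pgoff() through a &populate out-parameter and calls mm_populate() only after up_write(&mm->mmap_sem), keeping the slow pre-faulting outside the write lock. A small pthread sketch of that decide-under-the-lock, work-after-unlock shape follows; do_map_locked() and prefault() are made-up stand-ins for the kernel functions. Build with `cc -pthread`.

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t map_lock = PTHREAD_MUTEX_INITIALIZER;

/* stand-in for do_mmap_pgoff(): set up the mapping, report how much to pre-fault */
static unsigned long do_map_locked(unsigned long len, int want_populate,
				   unsigned long *populate)
{
	*populate = want_populate ? len : 0;
	return 0x100000;			/* pretend start address */
}

/* stand-in for mm_populate(): the slow part, run without the lock held */
static void prefault(unsigned long addr, unsigned long len)
{
	printf("pre-faulting %lu bytes at %#lx\n", len, addr);
}

static unsigned long demo_mmap(unsigned long len, int want_populate)
{
	unsigned long addr, populate;

	pthread_mutex_lock(&map_lock);		/* like down_write(&mm->mmap_sem) */
	addr = do_map_locked(len, want_populate, &populate);
	pthread_mutex_unlock(&map_lock);	/* drop the lock before the slow work */

	if (populate)
		prefault(addr, populate);
	return addr;
}

int main(void)
{
	demo_mmap(4096, 1);
	demo_mmap(8192, 0);
	return 0;
}
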
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 5123a169ab7b..0f751f2068c3 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -1376,8 +1376,8 @@ static struct vm_struct *__get_vm_area_node(unsigned long size,
1376struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags, 1376struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags,
1377 unsigned long start, unsigned long end) 1377 unsigned long start, unsigned long end)
1378{ 1378{
1379 return __get_vm_area_node(size, 1, flags, start, end, -1, GFP_KERNEL, 1379 return __get_vm_area_node(size, 1, flags, start, end, NUMA_NO_NODE,
1380 __builtin_return_address(0)); 1380 GFP_KERNEL, __builtin_return_address(0));
1381} 1381}
1382EXPORT_SYMBOL_GPL(__get_vm_area); 1382EXPORT_SYMBOL_GPL(__get_vm_area);
1383 1383
@@ -1385,8 +1385,8 @@ struct vm_struct *__get_vm_area_caller(unsigned long size, unsigned long flags,
1385 unsigned long start, unsigned long end, 1385 unsigned long start, unsigned long end,
1386 const void *caller) 1386 const void *caller)
1387{ 1387{
1388 return __get_vm_area_node(size, 1, flags, start, end, -1, GFP_KERNEL, 1388 return __get_vm_area_node(size, 1, flags, start, end, NUMA_NO_NODE,
1389 caller); 1389 GFP_KERNEL, caller);
1390} 1390}
1391 1391
1392/** 1392/**
@@ -1401,14 +1401,15 @@ struct vm_struct *__get_vm_area_caller(unsigned long size, unsigned long flags,
1401struct vm_struct *get_vm_area(unsigned long size, unsigned long flags) 1401struct vm_struct *get_vm_area(unsigned long size, unsigned long flags)
1402{ 1402{
1403 return __get_vm_area_node(size, 1, flags, VMALLOC_START, VMALLOC_END, 1403 return __get_vm_area_node(size, 1, flags, VMALLOC_START, VMALLOC_END,
1404 -1, GFP_KERNEL, __builtin_return_address(0)); 1404 NUMA_NO_NODE, GFP_KERNEL,
1405 __builtin_return_address(0));
1405} 1406}
1406 1407
1407struct vm_struct *get_vm_area_caller(unsigned long size, unsigned long flags, 1408struct vm_struct *get_vm_area_caller(unsigned long size, unsigned long flags,
1408 const void *caller) 1409 const void *caller)
1409{ 1410{
1410 return __get_vm_area_node(size, 1, flags, VMALLOC_START, VMALLOC_END, 1411 return __get_vm_area_node(size, 1, flags, VMALLOC_START, VMALLOC_END,
1411 -1, GFP_KERNEL, caller); 1412 NUMA_NO_NODE, GFP_KERNEL, caller);
1412} 1413}
1413 1414
1414/** 1415/**
@@ -1650,7 +1651,7 @@ fail:
1650 * @end: vm area range end 1651 * @end: vm area range end
1651 * @gfp_mask: flags for the page level allocator 1652 * @gfp_mask: flags for the page level allocator
1652 * @prot: protection mask for the allocated pages 1653 * @prot: protection mask for the allocated pages
1653 * @node: node to use for allocation or -1 1654 * @node: node to use for allocation or NUMA_NO_NODE
1654 * @caller: caller's return address 1655 * @caller: caller's return address
1655 * 1656 *
1656 * Allocate enough pages to cover @size from the page level 1657 * Allocate enough pages to cover @size from the page level
@@ -1706,7 +1707,7 @@ fail:
1706 * @align: desired alignment 1707 * @align: desired alignment
1707 * @gfp_mask: flags for the page level allocator 1708 * @gfp_mask: flags for the page level allocator
1708 * @prot: protection mask for the allocated pages 1709 * @prot: protection mask for the allocated pages
1709 * @node: node to use for allocation or -1 1710 * @node: node to use for allocation or NUMA_NO_NODE
1710 * @caller: caller's return address 1711 * @caller: caller's return address
1711 * 1712 *
1712 * Allocate enough pages to cover @size from the page level 1713 * Allocate enough pages to cover @size from the page level
@@ -1723,7 +1724,7 @@ static void *__vmalloc_node(unsigned long size, unsigned long align,
1723 1724
1724void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot) 1725void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot)
1725{ 1726{
1726 return __vmalloc_node(size, 1, gfp_mask, prot, -1, 1727 return __vmalloc_node(size, 1, gfp_mask, prot, NUMA_NO_NODE,
1727 __builtin_return_address(0)); 1728 __builtin_return_address(0));
1728} 1729}
1729EXPORT_SYMBOL(__vmalloc); 1730EXPORT_SYMBOL(__vmalloc);
@@ -1746,7 +1747,8 @@ static inline void *__vmalloc_node_flags(unsigned long size,
1746 */ 1747 */
1747void *vmalloc(unsigned long size) 1748void *vmalloc(unsigned long size)
1748{ 1749{
1749 return __vmalloc_node_flags(size, -1, GFP_KERNEL | __GFP_HIGHMEM); 1750 return __vmalloc_node_flags(size, NUMA_NO_NODE,
1751 GFP_KERNEL | __GFP_HIGHMEM);
1750} 1752}
1751EXPORT_SYMBOL(vmalloc); 1753EXPORT_SYMBOL(vmalloc);
1752 1754
@@ -1762,7 +1764,7 @@ EXPORT_SYMBOL(vmalloc);
1762 */ 1764 */
1763void *vzalloc(unsigned long size) 1765void *vzalloc(unsigned long size)
1764{ 1766{
1765 return __vmalloc_node_flags(size, -1, 1767 return __vmalloc_node_flags(size, NUMA_NO_NODE,
1766 GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO); 1768 GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO);
1767} 1769}
1768EXPORT_SYMBOL(vzalloc); 1770EXPORT_SYMBOL(vzalloc);
@@ -1781,7 +1783,8 @@ void *vmalloc_user(unsigned long size)
1781 1783
1782 ret = __vmalloc_node(size, SHMLBA, 1784 ret = __vmalloc_node(size, SHMLBA,
1783 GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO, 1785 GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO,
1784 PAGE_KERNEL, -1, __builtin_return_address(0)); 1786 PAGE_KERNEL, NUMA_NO_NODE,
1787 __builtin_return_address(0));
1785 if (ret) { 1788 if (ret) {
1786 area = find_vm_area(ret); 1789 area = find_vm_area(ret);
1787 area->flags |= VM_USERMAP; 1790 area->flags |= VM_USERMAP;
@@ -1846,7 +1849,7 @@ EXPORT_SYMBOL(vzalloc_node);
1846void *vmalloc_exec(unsigned long size) 1849void *vmalloc_exec(unsigned long size)
1847{ 1850{
1848 return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL_EXEC, 1851 return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL_EXEC,
1849 -1, __builtin_return_address(0)); 1852 NUMA_NO_NODE, __builtin_return_address(0));
1850} 1853}
1851 1854
1852#if defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA32) 1855#if defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA32)
@@ -1867,7 +1870,7 @@ void *vmalloc_exec(unsigned long size)
1867void *vmalloc_32(unsigned long size) 1870void *vmalloc_32(unsigned long size)
1868{ 1871{
1869 return __vmalloc_node(size, 1, GFP_VMALLOC32, PAGE_KERNEL, 1872 return __vmalloc_node(size, 1, GFP_VMALLOC32, PAGE_KERNEL,
1870 -1, __builtin_return_address(0)); 1873 NUMA_NO_NODE, __builtin_return_address(0));
1871} 1874}
1872EXPORT_SYMBOL(vmalloc_32); 1875EXPORT_SYMBOL(vmalloc_32);
1873 1876
@@ -1884,7 +1887,7 @@ void *vmalloc_32_user(unsigned long size)
1884 void *ret; 1887 void *ret;
1885 1888
1886 ret = __vmalloc_node(size, 1, GFP_VMALLOC32 | __GFP_ZERO, PAGE_KERNEL, 1889 ret = __vmalloc_node(size, 1, GFP_VMALLOC32 | __GFP_ZERO, PAGE_KERNEL,
1887 -1, __builtin_return_address(0)); 1890 NUMA_NO_NODE, __builtin_return_address(0));
1888 if (ret) { 1891 if (ret) {
1889 area = find_vm_area(ret); 1892 area = find_vm_area(ret);
1890 area->flags |= VM_USERMAP; 1893 area->flags |= VM_USERMAP;
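
[Editor's note] The vmalloc.c hunks above are a mechanical conversion: every caller that passed a literal -1 as the preferred node now passes NUMA_NO_NODE, which has the same value but documents the intent. A minimal user-space sketch of the idea (the allocator stub and names below are illustrative assumptions, not the kernel API):

#include <stdio.h>
#include <stddef.h>

#define NUMA_NO_NODE    (-1)    /* named sentinel: "no node preference" */

/* Toy stand-in for a node-aware allocator: it resolves the sentinel to a
 * concrete node instead of comparing against a bare -1 at every call site. */
static void *toy_alloc_node(size_t size, int node)
{
        if (node == NUMA_NO_NODE)
                node = 0;               /* fall back to a default node */
        printf("allocating %zu bytes on node %d\n", size, node);
        return NULL;                    /* allocation elided in this sketch */
}

int main(void)
{
        toy_alloc_node(4096, NUMA_NO_NODE);     /* was: toy_alloc_node(4096, -1) */
        return 0;
}
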
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 196709f5ee58..88c5fed8b9a4 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -128,7 +128,7 @@ struct scan_control {
128 * From 0 .. 100. Higher means more swappy. 128 * From 0 .. 100. Higher means more swappy.
129 */ 129 */
130int vm_swappiness = 60; 130int vm_swappiness = 60;
131long vm_total_pages; /* The total number of pages which the VM controls */ 131unsigned long vm_total_pages; /* The total number of pages which the VM controls */
132 132
133static LIST_HEAD(shrinker_list); 133static LIST_HEAD(shrinker_list);
134static DECLARE_RWSEM(shrinker_rwsem); 134static DECLARE_RWSEM(shrinker_rwsem);
@@ -1579,16 +1579,6 @@ static inline int inactive_anon_is_low(struct lruvec *lruvec)
1579} 1579}
1580#endif 1580#endif
1581 1581
1582static int inactive_file_is_low_global(struct zone *zone)
1583{
1584 unsigned long active, inactive;
1585
1586 active = zone_page_state(zone, NR_ACTIVE_FILE);
1587 inactive = zone_page_state(zone, NR_INACTIVE_FILE);
1588
1589 return (active > inactive);
1590}
1591
1592/** 1582/**
1593 * inactive_file_is_low - check if file pages need to be deactivated 1583 * inactive_file_is_low - check if file pages need to be deactivated
1594 * @lruvec: LRU vector to check 1584 * @lruvec: LRU vector to check
@@ -1605,10 +1595,13 @@ static int inactive_file_is_low_global(struct zone *zone)
1605 */ 1595 */
1606static int inactive_file_is_low(struct lruvec *lruvec) 1596static int inactive_file_is_low(struct lruvec *lruvec)
1607{ 1597{
1608 if (!mem_cgroup_disabled()) 1598 unsigned long inactive;
1609 return mem_cgroup_inactive_file_is_low(lruvec); 1599 unsigned long active;
1600
1601 inactive = get_lru_size(lruvec, LRU_INACTIVE_FILE);
1602 active = get_lru_size(lruvec, LRU_ACTIVE_FILE);
1610 1603
1611 return inactive_file_is_low_global(lruvec_zone(lruvec)); 1604 return active > inactive;
1612} 1605}
1613 1606
1614static int inactive_list_is_low(struct lruvec *lruvec, enum lru_list lru) 1607static int inactive_list_is_low(struct lruvec *lruvec, enum lru_list lru)
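
[Editor's note] With inactive_file_is_low_global() removed above, a single comparison of the per-lruvec LRU sizes now serves both global and memcg reclaim. A rough user-space sketch of the check, with a simplified lruvec and get_lru_size() standing in for the kernel's:

#include <stdbool.h>
#include <stdio.h>

enum lru_list { LRU_INACTIVE_FILE, LRU_ACTIVE_FILE, NR_LRU_LISTS };

struct lruvec {
        unsigned long lru_size[NR_LRU_LISTS];   /* page counts per LRU list */
};

static unsigned long get_lru_size(struct lruvec *lruvec, enum lru_list lru)
{
        return lruvec->lru_size[lru];
}

/* Deactivate active file pages only while they outnumber inactive ones. */
static bool inactive_file_is_low(struct lruvec *lruvec)
{
        unsigned long inactive = get_lru_size(lruvec, LRU_INACTIVE_FILE);
        unsigned long active = get_lru_size(lruvec, LRU_ACTIVE_FILE);

        return active > inactive;
}

int main(void)
{
        struct lruvec v = { .lru_size = { [LRU_INACTIVE_FILE] = 100,
                                          [LRU_ACTIVE_FILE]   = 300 } };
        printf("inactive file low: %d\n", inactive_file_is_low(&v));
        return 0;
}
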
@@ -1638,6 +1631,13 @@ static int vmscan_swappiness(struct scan_control *sc)
1638 return mem_cgroup_swappiness(sc->target_mem_cgroup); 1631 return mem_cgroup_swappiness(sc->target_mem_cgroup);
1639} 1632}
1640 1633
1634enum scan_balance {
1635 SCAN_EQUAL,
1636 SCAN_FRACT,
1637 SCAN_ANON,
1638 SCAN_FILE,
1639};
1640
1641/* 1641/*
1642 * Determine how aggressively the anon and file LRU lists should be 1642 * Determine how aggressively the anon and file LRU lists should be
1643 * scanned. The relative value of each set of LRU lists is determined 1643 * scanned. The relative value of each set of LRU lists is determined
@@ -1650,15 +1650,16 @@ static int vmscan_swappiness(struct scan_control *sc)
1650static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, 1650static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
1651 unsigned long *nr) 1651 unsigned long *nr)
1652{ 1652{
1653 unsigned long anon, file, free; 1653 struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
1654 u64 fraction[2];
1655 u64 denominator = 0; /* gcc */
1656 struct zone *zone = lruvec_zone(lruvec);
1654 unsigned long anon_prio, file_prio; 1657 unsigned long anon_prio, file_prio;
1658 enum scan_balance scan_balance;
1659 unsigned long anon, file, free;
1660 bool force_scan = false;
1655 unsigned long ap, fp; 1661 unsigned long ap, fp;
1656 struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
1657 u64 fraction[2], denominator;
1658 enum lru_list lru; 1662 enum lru_list lru;
1659 int noswap = 0;
1660 bool force_scan = false;
1661 struct zone *zone = lruvec_zone(lruvec);
1662 1663
1663 /* 1664 /*
1664 * If the zone or memcg is small, nr[l] can be 0. This 1665 * If the zone or memcg is small, nr[l] can be 0. This
@@ -1676,11 +1677,30 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
1676 force_scan = true; 1677 force_scan = true;
1677 1678
1678 /* If we have no swap space, do not bother scanning anon pages. */ 1679 /* If we have no swap space, do not bother scanning anon pages. */
1679 if (!sc->may_swap || (nr_swap_pages <= 0)) { 1680 if (!sc->may_swap || (get_nr_swap_pages() <= 0)) {
1680 noswap = 1; 1681 scan_balance = SCAN_FILE;
1681 fraction[0] = 0; 1682 goto out;
1682 fraction[1] = 1; 1683 }
1683 denominator = 1; 1684
1685 /*
1686 * Global reclaim will swap to prevent OOM even with no
1687 * swappiness, but memcg users want to use this knob to
1688 * disable swapping for individual groups completely when
1689 * using the memory controller's swap limit feature would be
1690 * too expensive.
1691 */
1692 if (!global_reclaim(sc) && !vmscan_swappiness(sc)) {
1693 scan_balance = SCAN_FILE;
1694 goto out;
1695 }
1696
1697 /*
1698 * Do not apply any pressure balancing cleverness when the
1699 * system is close to OOM, scan both anon and file equally
1700 * (unless the swappiness setting disagrees with swapping).
1701 */
1702 if (!sc->priority && vmscan_swappiness(sc)) {
1703 scan_balance = SCAN_EQUAL;
1684 goto out; 1704 goto out;
1685 } 1705 }
1686 1706
@@ -1689,30 +1709,32 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
1689 file = get_lru_size(lruvec, LRU_ACTIVE_FILE) + 1709 file = get_lru_size(lruvec, LRU_ACTIVE_FILE) +
1690 get_lru_size(lruvec, LRU_INACTIVE_FILE); 1710 get_lru_size(lruvec, LRU_INACTIVE_FILE);
1691 1711
1712 /*
1713 * If it's foreseeable that reclaiming the file cache won't be
1714 * enough to get the zone back into a desirable shape, we have
1715 * to swap. Better start now and leave the - probably heavily
1716 * thrashing - remaining file pages alone.
1717 */
1692 if (global_reclaim(sc)) { 1718 if (global_reclaim(sc)) {
1693 free = zone_page_state(zone, NR_FREE_PAGES); 1719 free = zone_page_state(zone, NR_FREE_PAGES);
1694 if (unlikely(file + free <= high_wmark_pages(zone))) { 1720 if (unlikely(file + free <= high_wmark_pages(zone))) {
1695 /* 1721 scan_balance = SCAN_ANON;
1696 * If we have very few page cache pages, force-scan
1697 * anon pages.
1698 */
1699 fraction[0] = 1;
1700 fraction[1] = 0;
1701 denominator = 1;
1702 goto out;
1703 } else if (!inactive_file_is_low_global(zone)) {
1704 /*
1705 * There is enough inactive page cache, do not
1706 * reclaim anything from the working set right now.
1707 */
1708 fraction[0] = 0;
1709 fraction[1] = 1;
1710 denominator = 1;
1711 goto out; 1722 goto out;
1712 } 1723 }
1713 } 1724 }
1714 1725
1715 /* 1726 /*
1727 * There is enough inactive page cache, do not reclaim
1728 * anything from the anonymous working set right now.
1729 */
1730 if (!inactive_file_is_low(lruvec)) {
1731 scan_balance = SCAN_FILE;
1732 goto out;
1733 }
1734
1735 scan_balance = SCAN_FRACT;
1736
1737 /*
1716 * With swappiness at 100, anonymous and file have the same priority. 1738 * With swappiness at 100, anonymous and file have the same priority.
1717 * This scanning priority is essentially the inverse of IO cost. 1739 * This scanning priority is essentially the inverse of IO cost.
1718 */ 1740 */
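
[Editor's note] Taken together, the hunks above turn get_scan_count() into a two-step routine: first classify the situation into one scan_balance mode, then compute the per-LRU targets. A condensed user-space sketch of the classification ladder (the context struct and its fields are toy stand-ins for scan_control and the zone statistics):

#include <stdbool.h>
#include <stdio.h>

enum scan_balance { SCAN_EQUAL, SCAN_FRACT, SCAN_ANON, SCAN_FILE };

struct toy_ctx {
        bool may_swap;                  /* caller allows swapping at all */
        long nr_swap_pages;             /* free swap slots */
        bool global_reclaim;            /* kswapd/direct vs. memcg limit reclaim */
        int swappiness;                 /* 0..100 */
        int priority;                   /* 0 == most desperate */
        unsigned long file_lru;         /* file pages on the LRUs */
        unsigned long free;             /* free pages in the zone */
        unsigned long high_wmark;       /* zone high watermark */
        bool inactive_file_low;         /* active file > inactive file */
};

static enum scan_balance classify(const struct toy_ctx *c)
{
        if (!c->may_swap || c->nr_swap_pages <= 0)
                return SCAN_FILE;       /* no swap: anon is not reclaimable */
        if (!c->global_reclaim && !c->swappiness)
                return SCAN_FILE;       /* memcg asked for no swapping at all */
        if (!c->priority && c->swappiness)
                return SCAN_EQUAL;      /* near-OOM: scan both lists flat out */
        if (c->global_reclaim && c->file_lru + c->free <= c->high_wmark)
                return SCAN_ANON;       /* file cache alone cannot fix the zone */
        if (!c->inactive_file_low)
                return SCAN_FILE;       /* plenty of inactive file cache left */
        return SCAN_FRACT;              /* otherwise balance by swappiness */
}

int main(void)
{
        struct toy_ctx c = { .may_swap = true, .nr_swap_pages = 1000,
                             .global_reclaim = true, .swappiness = 60,
                             .priority = 12, .file_lru = 5000, .free = 2000,
                             .high_wmark = 1000, .inactive_file_low = true };
        printf("scan_balance = %d\n", classify(&c));
        return 0;
}
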
@@ -1759,19 +1781,92 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
1759out: 1781out:
1760 for_each_evictable_lru(lru) { 1782 for_each_evictable_lru(lru) {
1761 int file = is_file_lru(lru); 1783 int file = is_file_lru(lru);
1784 unsigned long size;
1762 unsigned long scan; 1785 unsigned long scan;
1763 1786
1764 scan = get_lru_size(lruvec, lru); 1787 size = get_lru_size(lruvec, lru);
1765 if (sc->priority || noswap || !vmscan_swappiness(sc)) { 1788 scan = size >> sc->priority;
1766 scan >>= sc->priority; 1789
1767 if (!scan && force_scan) 1790 if (!scan && force_scan)
1768 scan = SWAP_CLUSTER_MAX; 1791 scan = min(size, SWAP_CLUSTER_MAX);
1792
1793 switch (scan_balance) {
1794 case SCAN_EQUAL:
1795 /* Scan lists relative to size */
1796 break;
1797 case SCAN_FRACT:
1798 /*
1799 * Scan types proportional to swappiness and
1800 * their relative recent reclaim efficiency.
1801 */
1769 scan = div64_u64(scan * fraction[file], denominator); 1802 scan = div64_u64(scan * fraction[file], denominator);
1803 break;
1804 case SCAN_FILE:
1805 case SCAN_ANON:
1806 /* Scan one type exclusively */
1807 if ((scan_balance == SCAN_FILE) != file)
1808 scan = 0;
1809 break;
1810 default:
1811 /* Look ma, no brain */
1812 BUG();
1770 } 1813 }
1771 nr[lru] = scan; 1814 nr[lru] = scan;
1772 } 1815 }
1773} 1816}
1774 1817
1818/*
1819 * This is a basic per-zone page freer. Used by both kswapd and direct reclaim.
1820 */
1821static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
1822{
1823 unsigned long nr[NR_LRU_LISTS];
1824 unsigned long nr_to_scan;
1825 enum lru_list lru;
1826 unsigned long nr_reclaimed = 0;
1827 unsigned long nr_to_reclaim = sc->nr_to_reclaim;
1828 struct blk_plug plug;
1829
1830 get_scan_count(lruvec, sc, nr);
1831
1832 blk_start_plug(&plug);
1833 while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
1834 nr[LRU_INACTIVE_FILE]) {
1835 for_each_evictable_lru(lru) {
1836 if (nr[lru]) {
1837 nr_to_scan = min(nr[lru], SWAP_CLUSTER_MAX);
1838 nr[lru] -= nr_to_scan;
1839
1840 nr_reclaimed += shrink_list(lru, nr_to_scan,
1841 lruvec, sc);
1842 }
1843 }
1844 /*
1845 * On large memory systems, scan >> priority can become
1846 * really large. This is fine for the starting priority;
1847 * we want to put equal scanning pressure on each zone.
1848 * However, if the VM has a harder time of freeing pages,
1849 * with multiple processes reclaiming pages, the total
1850 * freeing target can get unreasonably large.
1851 */
1852 if (nr_reclaimed >= nr_to_reclaim &&
1853 sc->priority < DEF_PRIORITY)
1854 break;
1855 }
1856 blk_finish_plug(&plug);
1857 sc->nr_reclaimed += nr_reclaimed;
1858
1859 /*
1860 * Even if we did not try to evict anon pages at all, we want to
1861 * rebalance the anon lru active/inactive ratio.
1862 */
1863 if (inactive_anon_is_low(lruvec))
1864 shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
1865 sc, LRU_ACTIVE_ANON);
1866
1867 throttle_vm_writeout(sc->gfp_mask);
1868}
1869
1775/* Use reclaim/compaction for costly allocs or under memory pressure */ 1870/* Use reclaim/compaction for costly allocs or under memory pressure */
1776static bool in_reclaim_compaction(struct scan_control *sc) 1871static bool in_reclaim_compaction(struct scan_control *sc)
1777{ 1872{
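
[Editor's note] Once the mode is fixed, the out: loop in the hunk above applies it per LRU list: shift the list size by the reclaim priority, then either keep it (SCAN_EQUAL), weight it by the swappiness-derived fraction (SCAN_FRACT), or zero out the type that is not being scanned. A small sketch of that switch; the fraction/denominator values are invented for the example and plain 64-bit division stands in for div64_u64():

#include <stdio.h>

enum scan_balance { SCAN_EQUAL, SCAN_FRACT, SCAN_ANON, SCAN_FILE };

#define SWAP_CLUSTER_MAX 32UL

static unsigned long min_ul(unsigned long a, unsigned long b)
{
        return a < b ? a : b;
}

static unsigned long scan_target(unsigned long size, int priority, int is_file,
                                 enum scan_balance mode,
                                 const unsigned long long fraction[2],
                                 unsigned long long denominator, int force_scan)
{
        unsigned long scan = size >> priority;

        if (!scan && force_scan)
                scan = min_ul(size, SWAP_CLUSTER_MAX);

        switch (mode) {
        case SCAN_EQUAL:
                break;                          /* proportional to list size */
        case SCAN_FRACT:
                scan = scan * fraction[is_file] / denominator;
                break;                          /* swappiness-weighted split */
        case SCAN_FILE:
        case SCAN_ANON:
                if ((mode == SCAN_FILE) != is_file)
                        scan = 0;               /* scan one type exclusively */
                break;
        }
        return scan;
}

int main(void)
{
        unsigned long long fraction[2] = { 60, 140 };   /* anon, file weights */

        printf("file target: %lu\n",
               scan_target(1UL << 20, 12, 1, SCAN_FRACT, fraction, 200, 0));
        return 0;
}
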
@@ -1790,7 +1885,7 @@ static bool in_reclaim_compaction(struct scan_control *sc)
1790 * calls try_to_compact_zone() that it will have enough free pages to succeed. 1885 * calls try_to_compact_zone() that it will have enough free pages to succeed.
1791 * It will give up earlier than that if there is difficulty reclaiming pages. 1886 * It will give up earlier than that if there is difficulty reclaiming pages.
1792 */ 1887 */
1793static inline bool should_continue_reclaim(struct lruvec *lruvec, 1888static inline bool should_continue_reclaim(struct zone *zone,
1794 unsigned long nr_reclaimed, 1889 unsigned long nr_reclaimed,
1795 unsigned long nr_scanned, 1890 unsigned long nr_scanned,
1796 struct scan_control *sc) 1891 struct scan_control *sc)
@@ -1830,15 +1925,15 @@ static inline bool should_continue_reclaim(struct lruvec *lruvec,
1830 * inactive lists are large enough, continue reclaiming 1925 * inactive lists are large enough, continue reclaiming
1831 */ 1926 */
1832 pages_for_compaction = (2UL << sc->order); 1927 pages_for_compaction = (2UL << sc->order);
1833 inactive_lru_pages = get_lru_size(lruvec, LRU_INACTIVE_FILE); 1928 inactive_lru_pages = zone_page_state(zone, NR_INACTIVE_FILE);
1834 if (nr_swap_pages > 0) 1929 if (get_nr_swap_pages() > 0)
1835 inactive_lru_pages += get_lru_size(lruvec, LRU_INACTIVE_ANON); 1930 inactive_lru_pages += zone_page_state(zone, NR_INACTIVE_ANON);
1836 if (sc->nr_reclaimed < pages_for_compaction && 1931 if (sc->nr_reclaimed < pages_for_compaction &&
1837 inactive_lru_pages > pages_for_compaction) 1932 inactive_lru_pages > pages_for_compaction)
1838 return true; 1933 return true;
1839 1934
1840 /* If compaction would go ahead or the allocation would succeed, stop */ 1935 /* If compaction would go ahead or the allocation would succeed, stop */
1841 switch (compaction_suitable(lruvec_zone(lruvec), sc->order)) { 1936 switch (compaction_suitable(zone, sc->order)) {
1842 case COMPACT_PARTIAL: 1937 case COMPACT_PARTIAL:
1843 case COMPACT_CONTINUE: 1938 case COMPACT_CONTINUE:
1844 return false; 1939 return false;
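
[Editor's note] Since reclaim/compaction restarts now happen at shrink_zone() level, should_continue_reclaim() reads zone-wide inactive counters rather than a single lruvec. A compressed sketch of the "keep reclaiming so compaction can succeed" test, with the compaction verdict folded into one boolean parameter:

#include <stdbool.h>
#include <stdio.h>

/* Keep reclaiming while an order-`order` compaction attempt still lacks both
 * reclaimed pages and inactive pages to migrate into. */
static bool continue_for_compaction(int order, long nr_swap_pages,
                                    unsigned long nr_reclaimed,
                                    unsigned long inactive_file,
                                    unsigned long inactive_anon,
                                    bool compaction_would_succeed)
{
        unsigned long pages_for_compaction = 2UL << order;
        unsigned long inactive = inactive_file;

        if (nr_swap_pages > 0)          /* anon only helps if it can be swapped */
                inactive += inactive_anon;

        if (nr_reclaimed < pages_for_compaction &&
            inactive > pages_for_compaction)
                return true;            /* not enough freed yet, but room to try */

        return !compaction_would_succeed;       /* stop once compaction is viable */
}

int main(void)
{
        printf("continue: %d\n",
               continue_for_compaction(9, 1, 100, 4096, 4096, false));
        return 0;
}
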
@@ -1847,98 +1942,48 @@ static inline bool should_continue_reclaim(struct lruvec *lruvec,
1847 } 1942 }
1848} 1943}
1849 1944
1850/* 1945static void shrink_zone(struct zone *zone, struct scan_control *sc)
1851 * This is a basic per-zone page freer. Used by both kswapd and direct reclaim.
1852 */
1853static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
1854{ 1946{
1855 unsigned long nr[NR_LRU_LISTS];
1856 unsigned long nr_to_scan;
1857 enum lru_list lru;
1858 unsigned long nr_reclaimed, nr_scanned; 1947 unsigned long nr_reclaimed, nr_scanned;
1859 unsigned long nr_to_reclaim = sc->nr_to_reclaim;
1860 struct blk_plug plug;
1861
1862restart:
1863 nr_reclaimed = 0;
1864 nr_scanned = sc->nr_scanned;
1865 get_scan_count(lruvec, sc, nr);
1866
1867 blk_start_plug(&plug);
1868 while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
1869 nr[LRU_INACTIVE_FILE]) {
1870 for_each_evictable_lru(lru) {
1871 if (nr[lru]) {
1872 nr_to_scan = min_t(unsigned long,
1873 nr[lru], SWAP_CLUSTER_MAX);
1874 nr[lru] -= nr_to_scan;
1875
1876 nr_reclaimed += shrink_list(lru, nr_to_scan,
1877 lruvec, sc);
1878 }
1879 }
1880 /*
1881 * On large memory systems, scan >> priority can become
1882 * really large. This is fine for the starting priority;
1883 * we want to put equal scanning pressure on each zone.
1884 * However, if the VM has a harder time of freeing pages,
1885 * with multiple processes reclaiming pages, the total
1886 * freeing target can get unreasonably large.
1887 */
1888 if (nr_reclaimed >= nr_to_reclaim &&
1889 sc->priority < DEF_PRIORITY)
1890 break;
1891 }
1892 blk_finish_plug(&plug);
1893 sc->nr_reclaimed += nr_reclaimed;
1894 1948
1895 /* 1949 do {
1896 * Even if we did not try to evict anon pages at all, we want to 1950 struct mem_cgroup *root = sc->target_mem_cgroup;
1897 * rebalance the anon lru active/inactive ratio. 1951 struct mem_cgroup_reclaim_cookie reclaim = {
1898 */ 1952 .zone = zone,
1899 if (inactive_anon_is_low(lruvec)) 1953 .priority = sc->priority,
1900 shrink_active_list(SWAP_CLUSTER_MAX, lruvec, 1954 };
1901 sc, LRU_ACTIVE_ANON); 1955 struct mem_cgroup *memcg;
1902
1903 /* reclaim/compaction might need reclaim to continue */
1904 if (should_continue_reclaim(lruvec, nr_reclaimed,
1905 sc->nr_scanned - nr_scanned, sc))
1906 goto restart;
1907 1956
1908 throttle_vm_writeout(sc->gfp_mask); 1957 nr_reclaimed = sc->nr_reclaimed;
1909} 1958 nr_scanned = sc->nr_scanned;
1910 1959
1911static void shrink_zone(struct zone *zone, struct scan_control *sc) 1960 memcg = mem_cgroup_iter(root, NULL, &reclaim);
1912{ 1961 do {
1913 struct mem_cgroup *root = sc->target_mem_cgroup; 1962 struct lruvec *lruvec;
1914 struct mem_cgroup_reclaim_cookie reclaim = {
1915 .zone = zone,
1916 .priority = sc->priority,
1917 };
1918 struct mem_cgroup *memcg;
1919 1963
1920 memcg = mem_cgroup_iter(root, NULL, &reclaim); 1964 lruvec = mem_cgroup_zone_lruvec(zone, memcg);
1921 do {
1922 struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg);
1923 1965
1924 shrink_lruvec(lruvec, sc); 1966 shrink_lruvec(lruvec, sc);
1925 1967
1926 /* 1968 /*
1927 * Limit reclaim has historically picked one memcg and 1969 * Direct reclaim and kswapd have to scan all memory
1928 * scanned it with decreasing priority levels until 1970 * cgroups to fulfill the overall scan target for the
1929 * nr_to_reclaim had been reclaimed. This priority 1971 * zone.
1930 * cycle is thus over after a single memcg. 1972 *
1931 * 1973 * Limit reclaim, on the other hand, only cares about
1932 * Direct reclaim and kswapd, on the other hand, have 1974 * nr_to_reclaim pages to be reclaimed and it will
1933 * to scan all memory cgroups to fulfill the overall 1975 * retry with decreasing priority if one round over the
1934 * scan target for the zone. 1976 * whole hierarchy is not sufficient.
1935 */ 1977 */
1936 if (!global_reclaim(sc)) { 1978 if (!global_reclaim(sc) &&
1937 mem_cgroup_iter_break(root, memcg); 1979 sc->nr_reclaimed >= sc->nr_to_reclaim) {
1938 break; 1980 mem_cgroup_iter_break(root, memcg);
1939 } 1981 break;
1940 memcg = mem_cgroup_iter(root, memcg, &reclaim); 1982 }
1941 } while (memcg); 1983 memcg = mem_cgroup_iter(root, memcg, &reclaim);
1984 } while (memcg);
1985 } while (should_continue_reclaim(zone, sc->nr_reclaimed - nr_reclaimed,
1986 sc->nr_scanned - nr_scanned, sc));
1942} 1987}
1943 1988
1944/* Returns true if compaction should go ahead for a high-order request */ 1989/* Returns true if compaction should go ahead for a high-order request */
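
[Editor's note] The restructured shrink_zone() above nests the memcg walk inside a loop driven by should_continue_reclaim(), so restarts cover the whole zone, while limit reclaim still bails out as soon as its target is met. A structural sketch of the nesting; the iterator, the per-lruvec shrinking and the continuation test are all stubbed:

#include <stdbool.h>
#include <stdio.h>

struct sc {
        unsigned long nr_reclaimed, nr_to_reclaim;
        bool global;            /* global reclaim vs. memcg limit reclaim */
        int rounds_left;        /* stand-in for should_continue_reclaim() state */
};

/* Stubs standing in for shrink_lruvec(), the memcg hierarchy iterator and the
 * should_continue_reclaim() predicate. */
static void shrink_one_group(struct sc *sc)     { sc->nr_reclaimed += 16; }
static int  next_group(int memcg)               { return memcg < 3 ? memcg + 1 : 0; }
static bool should_continue(struct sc *sc)      { return --sc->rounds_left > 0; }

static void shrink_zone(struct sc *sc)
{
        do {
                int memcg = next_group(0);      /* first cgroup in the hierarchy */

                do {
                        shrink_one_group(sc);   /* shrink_lruvec() per cgroup */

                        /* Limit reclaim stops at its target; global reclaim
                         * must visit every group to keep pressure even. */
                        if (!sc->global &&
                            sc->nr_reclaimed >= sc->nr_to_reclaim)
                                break;

                        memcg = next_group(memcg);
                } while (memcg);
        } while (should_continue(sc));          /* reclaim/compaction restart */
}

int main(void)
{
        struct sc sc = { .nr_to_reclaim = 32, .global = false, .rounds_left = 2 };

        shrink_zone(&sc);
        printf("reclaimed %lu pages\n", sc.nr_reclaimed);
        return 0;
}
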
@@ -1958,7 +2003,7 @@ static inline bool compaction_ready(struct zone *zone, struct scan_control *sc)
1958 * a reasonable chance of completing and allocating the page 2003 * a reasonable chance of completing and allocating the page
1959 */ 2004 */
1960 balance_gap = min(low_wmark_pages(zone), 2005 balance_gap = min(low_wmark_pages(zone),
1961 (zone->present_pages + KSWAPD_ZONE_BALANCE_GAP_RATIO-1) / 2006 (zone->managed_pages + KSWAPD_ZONE_BALANCE_GAP_RATIO-1) /
1962 KSWAPD_ZONE_BALANCE_GAP_RATIO); 2007 KSWAPD_ZONE_BALANCE_GAP_RATIO);
1963 watermark = high_wmark_pages(zone) + balance_gap + (2UL << sc->order); 2008 watermark = high_wmark_pages(zone) + balance_gap + (2UL << sc->order);
1964 watermark_ok = zone_watermark_ok_safe(zone, 0, watermark, 0, 0); 2009 watermark_ok = zone_watermark_ok_safe(zone, 0, watermark, 0, 0);
@@ -2150,6 +2195,13 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
2150 goto out; 2195 goto out;
2151 2196
2152 /* 2197 /*
2198 * If we're getting trouble reclaiming, start doing
2199 * writepage even in laptop mode.
2200 */
2201 if (sc->priority < DEF_PRIORITY - 2)
2202 sc->may_writepage = 1;
2203
2204 /*
2153 * Try to write back as many pages as we just scanned. This 2205 * Try to write back as many pages as we just scanned. This
2154 * tends to cause slow streaming writers to write data to the 2206 * tends to cause slow streaming writers to write data to the
2155 * disk smoothly, at the dirtying rate, which is nice. But 2207 * disk smoothly, at the dirtying rate, which is nice. But
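
[Editor's note] Both here and in balance_pgdat() further down, the old scanned-vs-reclaimed heuristic for enabling writepage is replaced by a plain priority check. A trivial sketch of the new trigger (DEF_PRIORITY is 12 in this kernel; the laptop_mode handling is simplified):

#include <stdbool.h>
#include <stdio.h>

#define DEF_PRIORITY 12         /* starting (gentlest) reclaim priority */

/* Once reclaim has had to drop the priority a couple of times, allow writing
 * dirty pages even in laptop mode. */
static bool allow_writepage(int priority, bool laptop_mode)
{
        if (priority < DEF_PRIORITY - 2)
                return true;
        return !laptop_mode;
}

int main(void)
{
        printf("prio 11, laptop: %d\n", allow_writepage(11, true));
        printf("prio  9, laptop: %d\n", allow_writepage(9, true));
        return 0;
}
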
@@ -2300,7 +2352,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
2300{ 2352{
2301 unsigned long nr_reclaimed; 2353 unsigned long nr_reclaimed;
2302 struct scan_control sc = { 2354 struct scan_control sc = {
2303 .gfp_mask = gfp_mask, 2355 .gfp_mask = (gfp_mask = memalloc_noio_flags(gfp_mask)),
2304 .may_writepage = !laptop_mode, 2356 .may_writepage = !laptop_mode,
2305 .nr_to_reclaim = SWAP_CLUSTER_MAX, 2357 .nr_to_reclaim = SWAP_CLUSTER_MAX,
2306 .may_unmap = 1, 2358 .may_unmap = 1,
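
[Editor's note] The .gfp_mask initializer above nests an assignment on purpose: the local gfp_mask used later in try_to_free_pages() and the scan_control field both end up holding the mask filtered by memalloc_noio_flags(). A small sketch of the idiom, with a toy flag set and filter standing in for the real GFP machinery:

#include <stdio.h>

#define __GFP_IO        0x40u
#define __GFP_FS        0x80u
#define GFP_KERNEL      (0x10u | __GFP_IO | __GFP_FS)   /* toy values */

struct toy_sc { unsigned int gfp_mask; };

/* Stand-in for memalloc_noio_flags(): strip I/O-triggering flags when the
 * current task is in a "no I/O" section. */
static unsigned int noio_filter(unsigned int gfp_mask)
{
        return gfp_mask & ~(__GFP_IO | __GFP_FS);
}

int main(void)
{
        unsigned int gfp_mask = GFP_KERNEL;

        /* One expression updates the local *and* initializes the struct. */
        struct toy_sc sc = {
                .gfp_mask = (gfp_mask = noio_filter(gfp_mask)),
        };

        printf("local %#x, sc %#x\n", gfp_mask, sc.gfp_mask);
        return 0;
}
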
@@ -2473,7 +2525,7 @@ static bool zone_balanced(struct zone *zone, int order,
2473 */ 2525 */
2474static bool pgdat_balanced(pg_data_t *pgdat, int order, int classzone_idx) 2526static bool pgdat_balanced(pg_data_t *pgdat, int order, int classzone_idx)
2475{ 2527{
2476 unsigned long present_pages = 0; 2528 unsigned long managed_pages = 0;
2477 unsigned long balanced_pages = 0; 2529 unsigned long balanced_pages = 0;
2478 int i; 2530 int i;
2479 2531
@@ -2484,7 +2536,7 @@ static bool pgdat_balanced(pg_data_t *pgdat, int order, int classzone_idx)
2484 if (!populated_zone(zone)) 2536 if (!populated_zone(zone))
2485 continue; 2537 continue;
2486 2538
2487 present_pages += zone->present_pages; 2539 managed_pages += zone->managed_pages;
2488 2540
2489 /* 2541 /*
2490 * A special case here: 2542 * A special case here:
@@ -2494,18 +2546,18 @@ static bool pgdat_balanced(pg_data_t *pgdat, int order, int classzone_idx)
2494 * they must be considered balanced here as well! 2546 * they must be considered balanced here as well!
2495 */ 2547 */
2496 if (zone->all_unreclaimable) { 2548 if (zone->all_unreclaimable) {
2497 balanced_pages += zone->present_pages; 2549 balanced_pages += zone->managed_pages;
2498 continue; 2550 continue;
2499 } 2551 }
2500 2552
2501 if (zone_balanced(zone, order, 0, i)) 2553 if (zone_balanced(zone, order, 0, i))
2502 balanced_pages += zone->present_pages; 2554 balanced_pages += zone->managed_pages;
2503 else if (!order) 2555 else if (!order)
2504 return false; 2556 return false;
2505 } 2557 }
2506 2558
2507 if (order) 2559 if (order)
2508 return balanced_pages >= (present_pages >> 2); 2560 return balanced_pages >= (managed_pages >> 2);
2509 else 2561 else
2510 return true; 2562 return true;
2511} 2563}
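
[Editor's note] Switching pgdat_balanced() to managed_pages keeps the 25% rule from being skewed by memory that is present but reserved and never handed to the buddy allocator. A sketch of the balance test over managed pages; the zone array and its fields are invented for the example:

#include <stdbool.h>
#include <stdio.h>

struct toy_zone {
        unsigned long managed_pages;    /* pages handled by the buddy allocator */
        bool populated;
        bool all_unreclaimable;
        bool meets_watermark;           /* zone_balanced() for this order */
};

static bool pgdat_balanced(const struct toy_zone *zones, int nr, int order)
{
        unsigned long managed = 0, balanced = 0;

        for (int i = 0; i < nr; i++) {
                if (!zones[i].populated)
                        continue;
                managed += zones[i].managed_pages;

                if (zones[i].all_unreclaimable || zones[i].meets_watermark)
                        balanced += zones[i].managed_pages;
                else if (!order)
                        return false;   /* order-0: every zone must pass */
        }
        /* order > 0: a quarter of the node's managed memory must be balanced */
        return order ? balanced >= (managed >> 2) : true;
}

int main(void)
{
        struct toy_zone zones[] = {
                { .managed_pages = 1000, .populated = true, .meets_watermark = true },
                { .managed_pages = 6000, .populated = true, .meets_watermark = false },
        };
        printf("balanced: %d\n", pgdat_balanced(zones, 2, 3));
        return 0;
}
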
@@ -2564,7 +2616,7 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining,
2564static unsigned long balance_pgdat(pg_data_t *pgdat, int order, 2616static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
2565 int *classzone_idx) 2617 int *classzone_idx)
2566{ 2618{
2567 struct zone *unbalanced_zone; 2619 bool pgdat_is_balanced = false;
2568 int i; 2620 int i;
2569 int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */ 2621 int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */
2570 unsigned long total_scanned; 2622 unsigned long total_scanned;
@@ -2595,9 +2647,6 @@ loop_again:
2595 2647
2596 do { 2648 do {
2597 unsigned long lru_pages = 0; 2649 unsigned long lru_pages = 0;
2598 int has_under_min_watermark_zone = 0;
2599
2600 unbalanced_zone = NULL;
2601 2650
2602 /* 2651 /*
2603 * Scan in the highmem->dma direction for the highest 2652 * Scan in the highmem->dma direction for the highest
@@ -2638,8 +2687,11 @@ loop_again:
2638 zone_clear_flag(zone, ZONE_CONGESTED); 2687 zone_clear_flag(zone, ZONE_CONGESTED);
2639 } 2688 }
2640 } 2689 }
2641 if (i < 0) 2690
2691 if (i < 0) {
2692 pgdat_is_balanced = true;
2642 goto out; 2693 goto out;
2694 }
2643 2695
2644 for (i = 0; i <= end_zone; i++) { 2696 for (i = 0; i <= end_zone; i++) {
2645 struct zone *zone = pgdat->node_zones + i; 2697 struct zone *zone = pgdat->node_zones + i;
@@ -2689,7 +2741,7 @@ loop_again:
2689 * of the zone, whichever is smaller. 2741 * of the zone, whichever is smaller.
2690 */ 2742 */
2691 balance_gap = min(low_wmark_pages(zone), 2743 balance_gap = min(low_wmark_pages(zone),
2692 (zone->present_pages + 2744 (zone->managed_pages +
2693 KSWAPD_ZONE_BALANCE_GAP_RATIO-1) / 2745 KSWAPD_ZONE_BALANCE_GAP_RATIO-1) /
2694 KSWAPD_ZONE_BALANCE_GAP_RATIO); 2746 KSWAPD_ZONE_BALANCE_GAP_RATIO);
2695 /* 2747 /*
@@ -2720,12 +2772,10 @@ loop_again:
2720 } 2772 }
2721 2773
2722 /* 2774 /*
2723 * If we've done a decent amount of scanning and 2775 * If we're getting trouble reclaiming, start doing
2724 * the reclaim ratio is low, start doing writepage 2776 * writepage even in laptop mode.
2725 * even in laptop mode
2726 */ 2777 */
2727 if (total_scanned > SWAP_CLUSTER_MAX * 2 && 2778 if (sc.priority < DEF_PRIORITY - 2)
2728 total_scanned > sc.nr_reclaimed + sc.nr_reclaimed / 2)
2729 sc.may_writepage = 1; 2779 sc.may_writepage = 1;
2730 2780
2731 if (zone->all_unreclaimable) { 2781 if (zone->all_unreclaimable) {
@@ -2734,17 +2784,7 @@ loop_again:
2734 continue; 2784 continue;
2735 } 2785 }
2736 2786
2737 if (!zone_balanced(zone, testorder, 0, end_zone)) { 2787 if (zone_balanced(zone, testorder, 0, end_zone))
2738 unbalanced_zone = zone;
2739 /*
2740 * We are still under min water mark. This
2741 * means that we have a GFP_ATOMIC allocation
2742 * failure risk. Hurry up!
2743 */
2744 if (!zone_watermark_ok_safe(zone, order,
2745 min_wmark_pages(zone), end_zone, 0))
2746 has_under_min_watermark_zone = 1;
2747 } else {
2748 /* 2788 /*
2749 * If a zone reaches its high watermark, 2789 * If a zone reaches its high watermark,
2750 * consider it to be no longer congested. It's 2790 * consider it to be no longer congested. It's
@@ -2753,8 +2793,6 @@ loop_again:
2753 * speculatively avoid congestion waits 2793 * speculatively avoid congestion waits
2754 */ 2794 */
2755 zone_clear_flag(zone, ZONE_CONGESTED); 2795 zone_clear_flag(zone, ZONE_CONGESTED);
2756 }
2757
2758 } 2796 }
2759 2797
2760 /* 2798 /*
@@ -2766,17 +2804,9 @@ loop_again:
2766 pfmemalloc_watermark_ok(pgdat)) 2804 pfmemalloc_watermark_ok(pgdat))
2767 wake_up(&pgdat->pfmemalloc_wait); 2805 wake_up(&pgdat->pfmemalloc_wait);
2768 2806
2769 if (pgdat_balanced(pgdat, order, *classzone_idx)) 2807 if (pgdat_balanced(pgdat, order, *classzone_idx)) {
2808 pgdat_is_balanced = true;
2770 break; /* kswapd: all done */ 2809 break; /* kswapd: all done */
2771 /*
2772 * OK, kswapd is getting into trouble. Take a nap, then take
2773 * another pass across the zones.
2774 */
2775 if (total_scanned && (sc.priority < DEF_PRIORITY - 2)) {
2776 if (has_under_min_watermark_zone)
2777 count_vm_event(KSWAPD_SKIP_CONGESTION_WAIT);
2778 else if (unbalanced_zone)
2779 wait_iff_congested(unbalanced_zone, BLK_RW_ASYNC, HZ/10);
2780 } 2810 }
2781 2811
2782 /* 2812 /*
@@ -2788,9 +2818,9 @@ loop_again:
2788 if (sc.nr_reclaimed >= SWAP_CLUSTER_MAX) 2818 if (sc.nr_reclaimed >= SWAP_CLUSTER_MAX)
2789 break; 2819 break;
2790 } while (--sc.priority >= 0); 2820 } while (--sc.priority >= 0);
2791out:
2792 2821
2793 if (!pgdat_balanced(pgdat, order, *classzone_idx)) { 2822out:
2823 if (!pgdat_is_balanced) {
2794 cond_resched(); 2824 cond_resched();
2795 2825
2796 try_to_freeze(); 2826 try_to_freeze();
@@ -3053,7 +3083,7 @@ unsigned long global_reclaimable_pages(void)
3053 nr = global_page_state(NR_ACTIVE_FILE) + 3083 nr = global_page_state(NR_ACTIVE_FILE) +
3054 global_page_state(NR_INACTIVE_FILE); 3084 global_page_state(NR_INACTIVE_FILE);
3055 3085
3056 if (nr_swap_pages > 0) 3086 if (get_nr_swap_pages() > 0)
3057 nr += global_page_state(NR_ACTIVE_ANON) + 3087 nr += global_page_state(NR_ACTIVE_ANON) +
3058 global_page_state(NR_INACTIVE_ANON); 3088 global_page_state(NR_INACTIVE_ANON);
3059 3089
@@ -3067,7 +3097,7 @@ unsigned long zone_reclaimable_pages(struct zone *zone)
3067 nr = zone_page_state(zone, NR_ACTIVE_FILE) + 3097 nr = zone_page_state(zone, NR_ACTIVE_FILE) +
3068 zone_page_state(zone, NR_INACTIVE_FILE); 3098 zone_page_state(zone, NR_INACTIVE_FILE);
3069 3099
3070 if (nr_swap_pages > 0) 3100 if (get_nr_swap_pages() > 0)
3071 nr += zone_page_state(zone, NR_ACTIVE_ANON) + 3101 nr += zone_page_state(zone, NR_ACTIVE_ANON) +
3072 zone_page_state(zone, NR_INACTIVE_ANON); 3102 zone_page_state(zone, NR_INACTIVE_ANON);
3073 3103
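
[Editor's note] Both reclaimable-pages helpers now go through get_nr_swap_pages(): the anon LRUs only count as reclaimable when there is swap to move them to. A compact sketch of that accounting with example counters:

#include <stdio.h>

struct toy_counters {
        unsigned long active_file, inactive_file;
        unsigned long active_anon, inactive_anon;
};

static unsigned long reclaimable_pages(const struct toy_counters *c,
                                       long nr_swap_pages)
{
        unsigned long nr = c->active_file + c->inactive_file;

        if (nr_swap_pages > 0)          /* anon counts only if it can be swapped */
                nr += c->active_anon + c->inactive_anon;

        return nr;
}

int main(void)
{
        struct toy_counters c = { 100, 200, 300, 400 };

        printf("with swap:    %lu\n", reclaimable_pages(&c, 1L << 20));
        printf("without swap: %lu\n", reclaimable_pages(&c, 0));
        return 0;
}
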
@@ -3280,9 +3310,8 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
3280 .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE), 3310 .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE),
3281 .may_unmap = !!(zone_reclaim_mode & RECLAIM_SWAP), 3311 .may_unmap = !!(zone_reclaim_mode & RECLAIM_SWAP),
3282 .may_swap = 1, 3312 .may_swap = 1,
3283 .nr_to_reclaim = max_t(unsigned long, nr_pages, 3313 .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX),
3284 SWAP_CLUSTER_MAX), 3314 .gfp_mask = (gfp_mask = memalloc_noio_flags(gfp_mask)),
3285 .gfp_mask = gfp_mask,
3286 .order = order, 3315 .order = order,
3287 .priority = ZONE_RECLAIM_PRIORITY, 3316 .priority = ZONE_RECLAIM_PRIORITY,
3288 }; 3317 };
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 9800306c8195..e1d8ed172c42 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -142,7 +142,7 @@ int calculate_normal_threshold(struct zone *zone)
142 * 125 1024 10 16-32 GB 9 142 * 125 1024 10 16-32 GB 9
143 */ 143 */
144 144
145 mem = zone->present_pages >> (27 - PAGE_SHIFT); 145 mem = zone->managed_pages >> (27 - PAGE_SHIFT);
146 146
147 threshold = 2 * fls(num_online_cpus()) * (1 + fls(mem)); 147 threshold = 2 * fls(num_online_cpus()) * (1 + fls(mem));
148 148
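
[Editor's note] The stat threshold is derived from the zone size expressed in 128MB units (managed_pages >> (27 - PAGE_SHIFT), since 2^27 bytes is 128MB) and the online CPU count, now based on managed rather than present pages. A worked sketch of the arithmetic; fls_ul() is a portable stand-in for fls() and the inputs are examples:

#include <stdio.h>

#define PAGE_SHIFT 12                   /* 4 KB pages in this example */

static int fls_ul(unsigned long x)      /* highest set bit, 1-based; 0 for 0 */
{
        int r = 0;

        while (x) {
                x >>= 1;
                r++;
        }
        return r;
}

static int normal_threshold(unsigned long managed_pages, int online_cpus)
{
        /* Zone size in units of 128 MB (2^27 bytes). */
        unsigned long mem = managed_pages >> (27 - PAGE_SHIFT);
        int threshold = 2 * fls_ul(online_cpus) * (1 + fls_ul(mem));

        return threshold > 125 ? 125 : threshold;       /* capped, per the table */
}

int main(void)
{
        /* 16 GB as 4 KB pages (4M pages), 8 CPUs: mem = 128, threshold = 2*4*9 = 72 */
        printf("threshold = %d\n", normal_threshold(4UL << 20, 8));
        return 0;
}
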
@@ -628,7 +628,9 @@ static char * const migratetype_names[MIGRATE_TYPES] = {
628#ifdef CONFIG_CMA 628#ifdef CONFIG_CMA
629 "CMA", 629 "CMA",
630#endif 630#endif
631#ifdef CONFIG_MEMORY_ISOLATION
631 "Isolate", 632 "Isolate",
633#endif
632}; 634};
633 635
634static void *frag_start(struct seq_file *m, loff_t *pos) 636static void *frag_start(struct seq_file *m, loff_t *pos)
@@ -768,7 +770,6 @@ const char * const vmstat_text[] = {
768 "kswapd_inodesteal", 770 "kswapd_inodesteal",
769 "kswapd_low_wmark_hit_quickly", 771 "kswapd_low_wmark_hit_quickly",
770 "kswapd_high_wmark_hit_quickly", 772 "kswapd_high_wmark_hit_quickly",
771 "kswapd_skip_congestion_wait",
772 "pageoutrun", 773 "pageoutrun",
773 "allocstall", 774 "allocstall",
774 775
@@ -890,7 +891,7 @@ static void pagetypeinfo_showblockcount_print(struct seq_file *m,
890 int mtype; 891 int mtype;
891 unsigned long pfn; 892 unsigned long pfn;
892 unsigned long start_pfn = zone->zone_start_pfn; 893 unsigned long start_pfn = zone->zone_start_pfn;
893 unsigned long end_pfn = start_pfn + zone->spanned_pages; 894 unsigned long end_pfn = zone_end_pfn(zone);
894 unsigned long count[MIGRATE_TYPES] = { 0, }; 895 unsigned long count[MIGRATE_TYPES] = { 0, };
895 896
896 for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) { 897 for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) {
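
[Editor's note] The final hunk replaces the open-coded start_pfn + spanned_pages with the zone_end_pfn() helper while stepping through the zone one pageblock at a time. A minimal sketch of that walk with toy zone geometry:

#include <stdio.h>

#define pageblock_nr_pages 512UL        /* toy pageblock size */

struct toy_zone {
        unsigned long zone_start_pfn;
        unsigned long spanned_pages;
};

/* Helper equivalent to start + span, kept in one place. */
static unsigned long zone_end_pfn(const struct toy_zone *zone)
{
        return zone->zone_start_pfn + zone->spanned_pages;
}

int main(void)
{
        struct toy_zone zone = { .zone_start_pfn = 4096, .spanned_pages = 2048 };
        unsigned long pfn, blocks = 0;

        for (pfn = zone.zone_start_pfn; pfn < zone_end_pfn(&zone);
             pfn += pageblock_nr_pages)
                blocks++;               /* one iteration per pageblock */

        printf("%lu pageblocks\n", blocks);
        return 0;
}
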