Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig            |  25
-rw-r--r--  mm/backing-dev.c      |  11
-rw-r--r--  mm/bootmem.c          |  32
-rw-r--r--  mm/bounce.c           |  48
-rw-r--r--  mm/compaction.c       | 133
-rw-r--r--  mm/fadvise.c          |  18
-rw-r--r--  mm/filemap.c          |   5
-rw-r--r--  mm/fremap.c           |  51
-rw-r--r--  mm/huge_memory.c      | 112
-rw-r--r--  mm/hugetlb.c          |  35
-rw-r--r--  mm/internal.h         |   5
-rw-r--r--  mm/kmemleak.c         |   5
-rw-r--r--  mm/ksm.c              | 657
-rw-r--r--  mm/madvise.c          | 105
-rw-r--r--  mm/memblock.c         |  70
-rw-r--r--  mm/memcontrol.c       | 477
-rw-r--r--  mm/memory-failure.c   | 202
-rw-r--r--  mm/memory.c           | 140
-rw-r--r--  mm/memory_hotplug.c   | 553
-rw-r--r--  mm/mempolicy.c        | 189
-rw-r--r--  mm/migrate.c          | 154
-rw-r--r--  mm/mincore.c          |   5
-rw-r--r--  mm/mlock.c            | 103
-rw-r--r--  mm/mm_init.c          |  31
-rw-r--r--  mm/mmap.c             |  88
-rw-r--r--  mm/mmu_notifier.c     |  84
-rw-r--r--  mm/mmzone.c           |  20
-rw-r--r--  mm/mremap.c           |  28
-rw-r--r--  mm/nobootmem.c        |  23
-rw-r--r--  mm/nommu.c            |  29
-rw-r--r--  mm/oom_kill.c         |   6
-rw-r--r--  mm/page-writeback.c   |  28
-rw-r--r--  mm/page_alloc.c       | 525
-rw-r--r--  mm/page_isolation.c   |  26
-rw-r--r--  mm/rmap.c             |  30
-rw-r--r--  mm/shmem.c            |  50
-rw-r--r--  mm/slab.c             |   2
-rw-r--r--  mm/slob.c             |   2
-rw-r--r--  mm/slub.c             |   4
-rw-r--r--  mm/sparse.c           |  12
-rw-r--r--  mm/swap.c             |   9
-rw-r--r--  mm/swap_state.c       |  58
-rw-r--r--  mm/swapfile.c         | 174
-rw-r--r--  mm/util.c             |  26
-rw-r--r--  mm/vmalloc.c          |  33
-rw-r--r--  mm/vmscan.c           | 494
-rw-r--r--  mm/vmstat.c           |   7
47 files changed, 3244 insertions(+), 1680 deletions(-)
diff --git a/mm/Kconfig b/mm/Kconfig
index 278e3ab1f169..2c7aea7106f9 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -1,6 +1,6 @@
1config SELECT_MEMORY_MODEL 1config SELECT_MEMORY_MODEL
2 def_bool y 2 def_bool y
3 depends on EXPERIMENTAL || ARCH_SELECT_MEMORY_MODEL 3 depends on ARCH_SELECT_MEMORY_MODEL
4 4
5choice 5choice
6 prompt "Memory model" 6 prompt "Memory model"
@@ -162,10 +162,16 @@ config MOVABLE_NODE
162 Say Y here if you want to hotplug a whole node. 162 Say Y here if you want to hotplug a whole node.
163 Say N here if you want kernel to use memory on all nodes evenly. 163 Say N here if you want kernel to use memory on all nodes evenly.
164 164
165#
166# Only be set on architectures that have completely implemented memory hotplug
167# feature. If you are not sure, don't touch it.
168#
169config HAVE_BOOTMEM_INFO_NODE
170 def_bool n
171
165# eventually, we can have this option just 'select SPARSEMEM' 172# eventually, we can have this option just 'select SPARSEMEM'
166config MEMORY_HOTPLUG 173config MEMORY_HOTPLUG
167 bool "Allow for memory hot-add" 174 bool "Allow for memory hot-add"
168 select MEMORY_ISOLATION
169 depends on SPARSEMEM || X86_64_ACPI_NUMA 175 depends on SPARSEMEM || X86_64_ACPI_NUMA
170 depends on HOTPLUG && ARCH_ENABLE_MEMORY_HOTPLUG 176 depends on HOTPLUG && ARCH_ENABLE_MEMORY_HOTPLUG
171 depends on (IA64 || X86 || PPC_BOOK3S_64 || SUPERH || S390) 177 depends on (IA64 || X86 || PPC_BOOK3S_64 || SUPERH || S390)
@@ -176,6 +182,8 @@ config MEMORY_HOTPLUG_SPARSE
176 182
177config MEMORY_HOTREMOVE 183config MEMORY_HOTREMOVE
178 bool "Allow for memory hot remove" 184 bool "Allow for memory hot remove"
185 select MEMORY_ISOLATION
186 select HAVE_BOOTMEM_INFO_NODE if X86_64
179 depends on MEMORY_HOTPLUG && ARCH_ENABLE_MEMORY_HOTREMOVE 187 depends on MEMORY_HOTPLUG && ARCH_ENABLE_MEMORY_HOTREMOVE
180 depends on MIGRATION 188 depends on MIGRATION
181 189
@@ -258,6 +266,19 @@ config BOUNCE
258 def_bool y 266 def_bool y
259 depends on BLOCK && MMU && (ZONE_DMA || HIGHMEM) 267 depends on BLOCK && MMU && (ZONE_DMA || HIGHMEM)
260 268
269# On the 'tile' arch, USB OHCI needs the bounce pool since tilegx will often
270# have more than 4GB of memory, but we don't currently use the IOTLB to present
271# a 32-bit address to OHCI. So we need to use a bounce pool instead.
272#
273# We also use the bounce pool to provide stable page writes for jbd. jbd
274# initiates buffer writeback without locking the page or setting PG_writeback,
275# and fixing that behavior (a second time; jbd2 doesn't have this problem) is
276# a major rework effort. Instead, use the bounce buffer to snapshot pages
277# (until jbd goes away). The only jbd user is ext3.
278config NEED_BOUNCE_POOL
279 bool
280 default y if (TILE && USB_OHCI_HCD) || (BLK_DEV_INTEGRITY && JBD)
281
261config NR_QUICK 282config NR_QUICK
262 int 283 int
263 depends on QUICKLIST 284 depends on QUICKLIST
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index d3ca2b3ee176..41733c5dc820 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -221,12 +221,23 @@ static ssize_t max_ratio_store(struct device *dev,
221} 221}
222BDI_SHOW(max_ratio, bdi->max_ratio) 222BDI_SHOW(max_ratio, bdi->max_ratio)
223 223
224static ssize_t stable_pages_required_show(struct device *dev,
225 struct device_attribute *attr,
226 char *page)
227{
228 struct backing_dev_info *bdi = dev_get_drvdata(dev);
229
230 return snprintf(page, PAGE_SIZE-1, "%d\n",
231 bdi_cap_stable_pages_required(bdi) ? 1 : 0);
232}
233
224#define __ATTR_RW(attr) __ATTR(attr, 0644, attr##_show, attr##_store) 234#define __ATTR_RW(attr) __ATTR(attr, 0644, attr##_show, attr##_store)
225 235
226static struct device_attribute bdi_dev_attrs[] = { 236static struct device_attribute bdi_dev_attrs[] = {
227 __ATTR_RW(read_ahead_kb), 237 __ATTR_RW(read_ahead_kb),
228 __ATTR_RW(min_ratio), 238 __ATTR_RW(min_ratio),
229 __ATTR_RW(max_ratio), 239 __ATTR_RW(max_ratio),
240 __ATTR_RO(stable_pages_required),
230 __ATTR_NULL, 241 __ATTR_NULL,
231}; 242};
232 243
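
The new read-only attribute exposes per-BDI stable-page behaviour to userspace. As a quick way to poke at it, here is a minimal reader (not part of the patch); the /sys/class/bdi/8:0 path is only an example device, so substitute whichever BDI exists on your system.

#include <stdio.h>

int main(void)
{
	/* example BDI; substitute the device present on your system */
	const char *path = "/sys/class/bdi/8:0/stable_pages_required";
	char buf[8];
	FILE *f = fopen(path, "r");

	if (!f) {
		perror(path);
		return 1;
	}
	if (fgets(buf, sizeof(buf), f))
		printf("stable_pages_required: %s", buf);
	fclose(f);
	return 0;
}
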
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 1324cd74faec..2b0bcb019ec2 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -185,10 +185,23 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
185 185
186 while (start < end) { 186 while (start < end) {
187 unsigned long *map, idx, vec; 187 unsigned long *map, idx, vec;
188 unsigned shift;
188 189
189 map = bdata->node_bootmem_map; 190 map = bdata->node_bootmem_map;
190 idx = start - bdata->node_min_pfn; 191 idx = start - bdata->node_min_pfn;
192 shift = idx & (BITS_PER_LONG - 1);
193 /*
194 * vec holds at most BITS_PER_LONG map bits,
195 * bit 0 corresponds to start.
196 */
191 vec = ~map[idx / BITS_PER_LONG]; 197 vec = ~map[idx / BITS_PER_LONG];
198
199 if (shift) {
200 vec >>= shift;
201 if (end - start >= BITS_PER_LONG)
202 vec |= ~map[idx / BITS_PER_LONG + 1] <<
203 (BITS_PER_LONG - shift);
204 }
192 /* 205 /*
193 * If we have a properly aligned and fully unreserved 206 * If we have a properly aligned and fully unreserved
194 * BITS_PER_LONG block of pages in front of us, free 207 * BITS_PER_LONG block of pages in front of us, free
@@ -201,19 +214,18 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
201 count += BITS_PER_LONG; 214 count += BITS_PER_LONG;
202 start += BITS_PER_LONG; 215 start += BITS_PER_LONG;
203 } else { 216 } else {
204 unsigned long off = 0; 217 unsigned long cur = start;
205 218
206 vec >>= start & (BITS_PER_LONG - 1); 219 start = ALIGN(start + 1, BITS_PER_LONG);
207 while (vec) { 220 while (vec && cur != start) {
208 if (vec & 1) { 221 if (vec & 1) {
209 page = pfn_to_page(start + off); 222 page = pfn_to_page(cur);
210 __free_pages_bootmem(page, 0); 223 __free_pages_bootmem(page, 0);
211 count++; 224 count++;
212 } 225 }
213 vec >>= 1; 226 vec >>= 1;
214 off++; 227 ++cur;
215 } 228 }
216 start = ALIGN(start + 1, BITS_PER_LONG);
217 } 229 }
218 } 230 }
219 231
@@ -821,6 +833,14 @@ void * __init __alloc_bootmem_low(unsigned long size, unsigned long align,
821 return ___alloc_bootmem(size, align, goal, ARCH_LOW_ADDRESS_LIMIT); 833 return ___alloc_bootmem(size, align, goal, ARCH_LOW_ADDRESS_LIMIT);
822} 834}
823 835
836void * __init __alloc_bootmem_low_nopanic(unsigned long size,
837 unsigned long align,
838 unsigned long goal)
839{
840 return ___alloc_bootmem_nopanic(size, align, goal,
841 ARCH_LOW_ADDRESS_LIMIT);
842}
843
824/** 844/**
825 * __alloc_bootmem_low_node - allocate low boot memory from a specific node 845 * __alloc_bootmem_low_node - allocate low boot memory from a specific node
826 * @pgdat: node to allocate from 846 * @pgdat: node to allocate from
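
The shift handling added to free_all_bootmem_core() above splices two bitmap words together so that bit 0 of vec always corresponds to the pfn in start, even when start is not BITS_PER_LONG aligned. A standalone userspace sketch of that bit arithmetic follows; the function name and the test values in main() are illustrative only, not kernel code.

#include <stdio.h>

#define BITS_PER_LONG (8 * sizeof(unsigned long))

/* build the "vec" used by free_all_bootmem_core(): bit 0 == page at idx */
static unsigned long bits_from(const unsigned long *map,
			       unsigned long idx, unsigned long nr_left)
{
	unsigned long shift = idx & (BITS_PER_LONG - 1);
	unsigned long vec = ~map[idx / BITS_PER_LONG];	/* set bit = free page */

	if (shift) {
		vec >>= shift;
		if (nr_left >= BITS_PER_LONG)
			vec |= ~map[idx / BITS_PER_LONG + 1] <<
						(BITS_PER_LONG - shift);
	}
	return vec;
}

int main(void)
{
	/* last page of word 0 and first two pages of word 1 are unreserved */
	unsigned long map[2] = { ~(1UL << (BITS_PER_LONG - 1)), ~3UL };

	/* expect 0x7: three consecutive free pages starting at the odd index */
	printf("vec = %#lx\n",
	       bits_from(map, BITS_PER_LONG - 1, BITS_PER_LONG + 1));
	return 0;
}
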
diff --git a/mm/bounce.c b/mm/bounce.c
index 042086775561..5f8901768602 100644
--- a/mm/bounce.c
+++ b/mm/bounce.c
@@ -178,8 +178,45 @@ static void bounce_end_io_read_isa(struct bio *bio, int err)
178 __bounce_end_io_read(bio, isa_page_pool, err); 178 __bounce_end_io_read(bio, isa_page_pool, err);
179} 179}
180 180
181#ifdef CONFIG_NEED_BOUNCE_POOL
182static int must_snapshot_stable_pages(struct request_queue *q, struct bio *bio)
183{
184 struct page *page;
185 struct backing_dev_info *bdi;
186 struct address_space *mapping;
187 struct bio_vec *from;
188 int i;
189
190 if (bio_data_dir(bio) != WRITE)
191 return 0;
192
193 if (!bdi_cap_stable_pages_required(&q->backing_dev_info))
194 return 0;
195
196 /*
197 * Based on the first page that has a valid mapping, decide whether or
198 * not we have to employ bounce buffering to guarantee stable pages.
199 */
200 bio_for_each_segment(from, bio, i) {
201 page = from->bv_page;
202 mapping = page_mapping(page);
203 if (!mapping)
204 continue;
205 bdi = mapping->backing_dev_info;
206 return mapping->host->i_sb->s_flags & MS_SNAP_STABLE;
207 }
208
209 return 0;
210}
211#else
212static int must_snapshot_stable_pages(struct request_queue *q, struct bio *bio)
213{
214 return 0;
215}
216#endif /* CONFIG_NEED_BOUNCE_POOL */
217
181static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig, 218static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig,
182 mempool_t *pool) 219 mempool_t *pool, int force)
183{ 220{
184 struct page *page; 221 struct page *page;
185 struct bio *bio = NULL; 222 struct bio *bio = NULL;
@@ -192,7 +229,7 @@ static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig,
192 /* 229 /*
193 * is destination page below bounce pfn? 230 * is destination page below bounce pfn?
194 */ 231 */
195 if (page_to_pfn(page) <= queue_bounce_pfn(q)) 232 if (page_to_pfn(page) <= queue_bounce_pfn(q) && !force)
196 continue; 233 continue;
197 234
198 /* 235 /*
@@ -270,6 +307,7 @@ static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig,
270 307
271void blk_queue_bounce(struct request_queue *q, struct bio **bio_orig) 308void blk_queue_bounce(struct request_queue *q, struct bio **bio_orig)
272{ 309{
310 int must_bounce;
273 mempool_t *pool; 311 mempool_t *pool;
274 312
275 /* 313 /*
@@ -278,13 +316,15 @@ void blk_queue_bounce(struct request_queue *q, struct bio **bio_orig)
278 if (!bio_has_data(*bio_orig)) 316 if (!bio_has_data(*bio_orig))
279 return; 317 return;
280 318
319 must_bounce = must_snapshot_stable_pages(q, *bio_orig);
320
281 /* 321 /*
282 * for non-isa bounce case, just check if the bounce pfn is equal 322 * for non-isa bounce case, just check if the bounce pfn is equal
283 * to or bigger than the highest pfn in the system -- in that case, 323 * to or bigger than the highest pfn in the system -- in that case,
284 * don't waste time iterating over bio segments 324 * don't waste time iterating over bio segments
285 */ 325 */
286 if (!(q->bounce_gfp & GFP_DMA)) { 326 if (!(q->bounce_gfp & GFP_DMA)) {
287 if (queue_bounce_pfn(q) >= blk_max_pfn) 327 if (queue_bounce_pfn(q) >= blk_max_pfn && !must_bounce)
288 return; 328 return;
289 pool = page_pool; 329 pool = page_pool;
290 } else { 330 } else {
@@ -295,7 +335,7 @@ void blk_queue_bounce(struct request_queue *q, struct bio **bio_orig)
295 /* 335 /*
296 * slow path 336 * slow path
297 */ 337 */
298 __blk_queue_bounce(q, bio_orig, pool); 338 __blk_queue_bounce(q, bio_orig, pool, must_bounce);
299} 339}
300 340
301EXPORT_SYMBOL(blk_queue_bounce); 341EXPORT_SYMBOL(blk_queue_bounce);
diff --git a/mm/compaction.c b/mm/compaction.c
index 6b807e466497..05ccb4cc0bdb 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -15,6 +15,7 @@
15#include <linux/sysctl.h> 15#include <linux/sysctl.h>
16#include <linux/sysfs.h> 16#include <linux/sysfs.h>
17#include <linux/balloon_compaction.h> 17#include <linux/balloon_compaction.h>
18#include <linux/page-isolation.h>
18#include "internal.h" 19#include "internal.h"
19 20
20#ifdef CONFIG_COMPACTION 21#ifdef CONFIG_COMPACTION
@@ -85,7 +86,7 @@ static inline bool isolation_suitable(struct compact_control *cc,
85static void __reset_isolation_suitable(struct zone *zone) 86static void __reset_isolation_suitable(struct zone *zone)
86{ 87{
87 unsigned long start_pfn = zone->zone_start_pfn; 88 unsigned long start_pfn = zone->zone_start_pfn;
88 unsigned long end_pfn = zone->zone_start_pfn + zone->spanned_pages; 89 unsigned long end_pfn = zone_end_pfn(zone);
89 unsigned long pfn; 90 unsigned long pfn;
90 91
91 zone->compact_cached_migrate_pfn = start_pfn; 92 zone->compact_cached_migrate_pfn = start_pfn;
@@ -215,7 +216,10 @@ static bool suitable_migration_target(struct page *page)
215 int migratetype = get_pageblock_migratetype(page); 216 int migratetype = get_pageblock_migratetype(page);
216 217
217 /* Don't interfere with memory hot-remove or the min_free_kbytes blocks */ 218 /* Don't interfere with memory hot-remove or the min_free_kbytes blocks */
218 if (migratetype == MIGRATE_ISOLATE || migratetype == MIGRATE_RESERVE) 219 if (migratetype == MIGRATE_RESERVE)
220 return false;
221
222 if (is_migrate_isolate(migratetype))
219 return false; 223 return false;
220 224
221 /* If the page is a large free page, then allow migration */ 225 /* If the page is a large free page, then allow migration */
@@ -611,8 +615,7 @@ check_compact_cluster:
611 continue; 615 continue;
612 616
613next_pageblock: 617next_pageblock:
614 low_pfn += pageblock_nr_pages; 618 low_pfn = ALIGN(low_pfn + 1, pageblock_nr_pages) - 1;
615 low_pfn = ALIGN(low_pfn, pageblock_nr_pages) - 1;
616 last_pageblock_nr = pageblock_nr; 619 last_pageblock_nr = pageblock_nr;
617 } 620 }
618 621
@@ -644,7 +647,7 @@ static void isolate_freepages(struct zone *zone,
644 struct compact_control *cc) 647 struct compact_control *cc)
645{ 648{
646 struct page *page; 649 struct page *page;
647 unsigned long high_pfn, low_pfn, pfn, zone_end_pfn, end_pfn; 650 unsigned long high_pfn, low_pfn, pfn, z_end_pfn, end_pfn;
648 int nr_freepages = cc->nr_freepages; 651 int nr_freepages = cc->nr_freepages;
649 struct list_head *freelist = &cc->freepages; 652 struct list_head *freelist = &cc->freepages;
650 653
@@ -663,7 +666,7 @@ static void isolate_freepages(struct zone *zone,
663 */ 666 */
664 high_pfn = min(low_pfn, pfn); 667 high_pfn = min(low_pfn, pfn);
665 668
666 zone_end_pfn = zone->zone_start_pfn + zone->spanned_pages; 669 z_end_pfn = zone_end_pfn(zone);
667 670
668 /* 671 /*
669 * Isolate free pages until enough are available to migrate the 672 * Isolate free pages until enough are available to migrate the
@@ -706,7 +709,7 @@ static void isolate_freepages(struct zone *zone,
706 * only scans within a pageblock 709 * only scans within a pageblock
707 */ 710 */
708 end_pfn = ALIGN(pfn + 1, pageblock_nr_pages); 711 end_pfn = ALIGN(pfn + 1, pageblock_nr_pages);
709 end_pfn = min(end_pfn, zone_end_pfn); 712 end_pfn = min(end_pfn, z_end_pfn);
710 isolated = isolate_freepages_block(cc, pfn, end_pfn, 713 isolated = isolate_freepages_block(cc, pfn, end_pfn,
711 freelist, false); 714 freelist, false);
712 nr_freepages += isolated; 715 nr_freepages += isolated;
@@ -795,7 +798,7 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
795 low_pfn = max(cc->migrate_pfn, zone->zone_start_pfn); 798 low_pfn = max(cc->migrate_pfn, zone->zone_start_pfn);
796 799
797 /* Only scan within a pageblock boundary */ 800 /* Only scan within a pageblock boundary */
798 end_pfn = ALIGN(low_pfn + pageblock_nr_pages, pageblock_nr_pages); 801 end_pfn = ALIGN(low_pfn + 1, pageblock_nr_pages);
799 802
800 /* Do not cross the free scanner or scan within a memory hole */ 803 /* Do not cross the free scanner or scan within a memory hole */
801 if (end_pfn > cc->free_pfn || !pfn_valid(low_pfn)) { 804 if (end_pfn > cc->free_pfn || !pfn_valid(low_pfn)) {
@@ -816,6 +819,7 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
816static int compact_finished(struct zone *zone, 819static int compact_finished(struct zone *zone,
817 struct compact_control *cc) 820 struct compact_control *cc)
818{ 821{
822 unsigned int order;
819 unsigned long watermark; 823 unsigned long watermark;
820 824
821 if (fatal_signal_pending(current)) 825 if (fatal_signal_pending(current))
@@ -850,22 +854,16 @@ static int compact_finished(struct zone *zone,
850 return COMPACT_CONTINUE; 854 return COMPACT_CONTINUE;
851 855
852 /* Direct compactor: Is a suitable page free? */ 856 /* Direct compactor: Is a suitable page free? */
853 if (cc->page) { 857 for (order = cc->order; order < MAX_ORDER; order++) {
854 /* Was a suitable page captured? */ 858 struct free_area *area = &zone->free_area[order];
855 if (*cc->page) 859
860 /* Job done if page is free of the right migratetype */
861 if (!list_empty(&area->free_list[cc->migratetype]))
862 return COMPACT_PARTIAL;
863
864 /* Job done if allocation would set block type */
865 if (cc->order >= pageblock_order && area->nr_free)
856 return COMPACT_PARTIAL; 866 return COMPACT_PARTIAL;
857 } else {
858 unsigned int order;
859 for (order = cc->order; order < MAX_ORDER; order++) {
860 struct free_area *area = &zone->free_area[cc->order];
861 /* Job done if page is free of the right migratetype */
862 if (!list_empty(&area->free_list[cc->migratetype]))
863 return COMPACT_PARTIAL;
864
865 /* Job done if allocation would set block type */
866 if (cc->order >= pageblock_order && area->nr_free)
867 return COMPACT_PARTIAL;
868 }
869 } 867 }
870 868
871 return COMPACT_CONTINUE; 869 return COMPACT_CONTINUE;
@@ -921,65 +919,11 @@ unsigned long compaction_suitable(struct zone *zone, int order)
921 return COMPACT_CONTINUE; 919 return COMPACT_CONTINUE;
922} 920}
923 921
924static void compact_capture_page(struct compact_control *cc)
925{
926 unsigned long flags;
927 int mtype, mtype_low, mtype_high;
928
929 if (!cc->page || *cc->page)
930 return;
931
932 /*
933 * For MIGRATE_MOVABLE allocations we capture a suitable page ASAP
934 * regardless of the migratetype of the freelist is is captured from.
935 * This is fine because the order for a high-order MIGRATE_MOVABLE
936 * allocation is typically at least a pageblock size and overall
937 * fragmentation is not impaired. Other allocation types must
938 * capture pages from their own migratelist because otherwise they
939 * could pollute other pageblocks like MIGRATE_MOVABLE with
940 * difficult to move pages and making fragmentation worse overall.
941 */
942 if (cc->migratetype == MIGRATE_MOVABLE) {
943 mtype_low = 0;
944 mtype_high = MIGRATE_PCPTYPES;
945 } else {
946 mtype_low = cc->migratetype;
947 mtype_high = cc->migratetype + 1;
948 }
949
950 /* Speculatively examine the free lists without zone lock */
951 for (mtype = mtype_low; mtype < mtype_high; mtype++) {
952 int order;
953 for (order = cc->order; order < MAX_ORDER; order++) {
954 struct page *page;
955 struct free_area *area;
956 area = &(cc->zone->free_area[order]);
957 if (list_empty(&area->free_list[mtype]))
958 continue;
959
960 /* Take the lock and attempt capture of the page */
961 if (!compact_trylock_irqsave(&cc->zone->lock, &flags, cc))
962 return;
963 if (!list_empty(&area->free_list[mtype])) {
964 page = list_entry(area->free_list[mtype].next,
965 struct page, lru);
966 if (capture_free_page(page, cc->order, mtype)) {
967 spin_unlock_irqrestore(&cc->zone->lock,
968 flags);
969 *cc->page = page;
970 return;
971 }
972 }
973 spin_unlock_irqrestore(&cc->zone->lock, flags);
974 }
975 }
976}
977
978static int compact_zone(struct zone *zone, struct compact_control *cc) 922static int compact_zone(struct zone *zone, struct compact_control *cc)
979{ 923{
980 int ret; 924 int ret;
981 unsigned long start_pfn = zone->zone_start_pfn; 925 unsigned long start_pfn = zone->zone_start_pfn;
982 unsigned long end_pfn = zone->zone_start_pfn + zone->spanned_pages; 926 unsigned long end_pfn = zone_end_pfn(zone);
983 927
984 ret = compaction_suitable(zone, cc->order); 928 ret = compaction_suitable(zone, cc->order);
985 switch (ret) { 929 switch (ret) {
@@ -1036,7 +980,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
1036 980
1037 nr_migrate = cc->nr_migratepages; 981 nr_migrate = cc->nr_migratepages;
1038 err = migrate_pages(&cc->migratepages, compaction_alloc, 982 err = migrate_pages(&cc->migratepages, compaction_alloc,
1039 (unsigned long)cc, false, 983 (unsigned long)cc,
1040 cc->sync ? MIGRATE_SYNC_LIGHT : MIGRATE_ASYNC, 984 cc->sync ? MIGRATE_SYNC_LIGHT : MIGRATE_ASYNC,
1041 MR_COMPACTION); 985 MR_COMPACTION);
1042 update_nr_listpages(cc); 986 update_nr_listpages(cc);
@@ -1054,9 +998,6 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
1054 goto out; 998 goto out;
1055 } 999 }
1056 } 1000 }
1057
1058 /* Capture a page now if it is a suitable size */
1059 compact_capture_page(cc);
1060 } 1001 }
1061 1002
1062out: 1003out:
@@ -1069,8 +1010,7 @@ out:
1069 1010
1070static unsigned long compact_zone_order(struct zone *zone, 1011static unsigned long compact_zone_order(struct zone *zone,
1071 int order, gfp_t gfp_mask, 1012 int order, gfp_t gfp_mask,
1072 bool sync, bool *contended, 1013 bool sync, bool *contended)
1073 struct page **page)
1074{ 1014{
1075 unsigned long ret; 1015 unsigned long ret;
1076 struct compact_control cc = { 1016 struct compact_control cc = {
@@ -1080,7 +1020,6 @@ static unsigned long compact_zone_order(struct zone *zone,
1080 .migratetype = allocflags_to_migratetype(gfp_mask), 1020 .migratetype = allocflags_to_migratetype(gfp_mask),
1081 .zone = zone, 1021 .zone = zone,
1082 .sync = sync, 1022 .sync = sync,
1083 .page = page,
1084 }; 1023 };
1085 INIT_LIST_HEAD(&cc.freepages); 1024 INIT_LIST_HEAD(&cc.freepages);
1086 INIT_LIST_HEAD(&cc.migratepages); 1025 INIT_LIST_HEAD(&cc.migratepages);
@@ -1110,7 +1049,7 @@ int sysctl_extfrag_threshold = 500;
1110 */ 1049 */
1111unsigned long try_to_compact_pages(struct zonelist *zonelist, 1050unsigned long try_to_compact_pages(struct zonelist *zonelist,
1112 int order, gfp_t gfp_mask, nodemask_t *nodemask, 1051 int order, gfp_t gfp_mask, nodemask_t *nodemask,
1113 bool sync, bool *contended, struct page **page) 1052 bool sync, bool *contended)
1114{ 1053{
1115 enum zone_type high_zoneidx = gfp_zone(gfp_mask); 1054 enum zone_type high_zoneidx = gfp_zone(gfp_mask);
1116 int may_enter_fs = gfp_mask & __GFP_FS; 1055 int may_enter_fs = gfp_mask & __GFP_FS;
@@ -1136,7 +1075,7 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist,
1136 int status; 1075 int status;
1137 1076
1138 status = compact_zone_order(zone, order, gfp_mask, sync, 1077 status = compact_zone_order(zone, order, gfp_mask, sync,
1139 contended, page); 1078 contended);
1140 rc = max(status, rc); 1079 rc = max(status, rc);
1141 1080
1142 /* If a normal allocation would succeed, stop compacting */ 1081 /* If a normal allocation would succeed, stop compacting */
@@ -1150,7 +1089,7 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist,
1150 1089
1151 1090
1152/* Compact all zones within a node */ 1091/* Compact all zones within a node */
1153static int __compact_pgdat(pg_data_t *pgdat, struct compact_control *cc) 1092static void __compact_pgdat(pg_data_t *pgdat, struct compact_control *cc)
1154{ 1093{
1155 int zoneid; 1094 int zoneid;
1156 struct zone *zone; 1095 struct zone *zone;
@@ -1183,34 +1122,30 @@ static int __compact_pgdat(pg_data_t *pgdat, struct compact_control *cc)
1183 VM_BUG_ON(!list_empty(&cc->freepages)); 1122 VM_BUG_ON(!list_empty(&cc->freepages));
1184 VM_BUG_ON(!list_empty(&cc->migratepages)); 1123 VM_BUG_ON(!list_empty(&cc->migratepages));
1185 } 1124 }
1186
1187 return 0;
1188} 1125}
1189 1126
1190int compact_pgdat(pg_data_t *pgdat, int order) 1127void compact_pgdat(pg_data_t *pgdat, int order)
1191{ 1128{
1192 struct compact_control cc = { 1129 struct compact_control cc = {
1193 .order = order, 1130 .order = order,
1194 .sync = false, 1131 .sync = false,
1195 .page = NULL,
1196 }; 1132 };
1197 1133
1198 return __compact_pgdat(pgdat, &cc); 1134 __compact_pgdat(pgdat, &cc);
1199} 1135}
1200 1136
1201static int compact_node(int nid) 1137static void compact_node(int nid)
1202{ 1138{
1203 struct compact_control cc = { 1139 struct compact_control cc = {
1204 .order = -1, 1140 .order = -1,
1205 .sync = true, 1141 .sync = true,
1206 .page = NULL,
1207 }; 1142 };
1208 1143
1209 return __compact_pgdat(NODE_DATA(nid), &cc); 1144 __compact_pgdat(NODE_DATA(nid), &cc);
1210} 1145}
1211 1146
1212/* Compact all nodes in the system */ 1147/* Compact all nodes in the system */
1213static int compact_nodes(void) 1148static void compact_nodes(void)
1214{ 1149{
1215 int nid; 1150 int nid;
1216 1151
@@ -1219,8 +1154,6 @@ static int compact_nodes(void)
1219 1154
1220 for_each_online_node(nid) 1155 for_each_online_node(nid)
1221 compact_node(nid); 1156 compact_node(nid);
1222
1223 return COMPACT_COMPLETE;
1224} 1157}
1225 1158
1226/* The written value is actually unused, all memory is compacted */ 1159/* The written value is actually unused, all memory is compacted */
@@ -1231,7 +1164,7 @@ int sysctl_compaction_handler(struct ctl_table *table, int write,
1231 void __user *buffer, size_t *length, loff_t *ppos) 1164 void __user *buffer, size_t *length, loff_t *ppos)
1232{ 1165{
1233 if (write) 1166 if (write)
1234 return compact_nodes(); 1167 compact_nodes();
1235 1168
1236 return 0; 1169 return 0;
1237} 1170}
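
compact_finished() now always walks the free areas from the requested order upward instead of relying on the removed page-capture path (note that the deleted branch also indexed free_area[cc->order] inside the loop rather than free_area[order]). A toy sketch of that termination check, with simplified stand-ins for struct free_area and the migratetype lists rather than the real kernel structures:

#include <stdbool.h>
#include <stdio.h>

#define MAX_ORDER	11
#define PAGEBLOCK_ORDER	9
#define NR_MIGRATETYPES	3

struct toy_free_area {
	unsigned long nr_free_of_type[NR_MIGRATETYPES];
	unsigned long nr_free;
};

/* mirror of the rewritten check: stop compacting once a fitting block exists */
static bool suitable_page_free(const struct toy_free_area *area,
			       int req_order, int migratetype)
{
	int order;

	for (order = req_order; order < MAX_ORDER; order++) {
		/* job done if a block of the right migratetype is free */
		if (area[order].nr_free_of_type[migratetype])
			return true;
		/* or if the allocation would claim a whole pageblock anyway */
		if (req_order >= PAGEBLOCK_ORDER && area[order].nr_free)
			return true;
	}
	return false;
}

int main(void)
{
	struct toy_free_area areas[MAX_ORDER] = { 0 };

	areas[4].nr_free_of_type[1] = 2;
	areas[4].nr_free = 2;
	printf("%d %d\n", suitable_page_free(areas, 3, 1),
			  suitable_page_free(areas, 3, 0));
	return 0;
}
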
diff --git a/mm/fadvise.c b/mm/fadvise.c
index a47f0f50c89f..909ec558625c 100644
--- a/mm/fadvise.c
+++ b/mm/fadvise.c
@@ -17,6 +17,7 @@
17#include <linux/fadvise.h> 17#include <linux/fadvise.h>
18#include <linux/writeback.h> 18#include <linux/writeback.h>
19#include <linux/syscalls.h> 19#include <linux/syscalls.h>
20#include <linux/swap.h>
20 21
21#include <asm/unistd.h> 22#include <asm/unistd.h>
22 23
@@ -120,9 +121,22 @@ SYSCALL_DEFINE(fadvise64_64)(int fd, loff_t offset, loff_t len, int advice)
120 start_index = (offset+(PAGE_CACHE_SIZE-1)) >> PAGE_CACHE_SHIFT; 121 start_index = (offset+(PAGE_CACHE_SIZE-1)) >> PAGE_CACHE_SHIFT;
121 end_index = (endbyte >> PAGE_CACHE_SHIFT); 122 end_index = (endbyte >> PAGE_CACHE_SHIFT);
122 123
123 if (end_index >= start_index) 124 if (end_index >= start_index) {
124 invalidate_mapping_pages(mapping, start_index, 125 unsigned long count = invalidate_mapping_pages(mapping,
126 start_index, end_index);
127
128 /*
129 * If fewer pages were invalidated than expected then
130 * it is possible that some of the pages were on
131 * a per-cpu pagevec for a remote CPU. Drain all
132 * pagevecs and try again.
133 */
134 if (count < (end_index - start_index + 1)) {
135 lru_add_drain_all();
136 invalidate_mapping_pages(mapping, start_index,
125 end_index); 137 end_index);
138 }
139 }
126 break; 140 break;
127 default: 141 default:
128 ret = -EINVAL; 142 ret = -EINVAL;
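
The DONTNEED branch now retries after lru_add_drain_all() when invalidate_mapping_pages() dropped fewer pages than the range contains, catching pages still parked on remote CPUs' pagevecs. From userspace this is simply the POSIX_FADV_DONTNEED path; a minimal caller for exercising it:

#define _XOPEN_SOURCE 600
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	int fd, ret;

	if (argc < 2) {
		fprintf(stderr, "usage: %s <file>\n", argv[0]);
		return 1;
	}
	fd = open(argv[1], O_RDONLY);
	if (fd < 0) {
		perror("open");
		return 1;
	}

	/* drop cached pages for the whole file (len 0 means "to EOF") */
	ret = posix_fadvise(fd, 0, 0, POSIX_FADV_DONTNEED);
	if (ret)
		fprintf(stderr, "posix_fadvise: error %d\n", ret);

	close(fd);
	return ret ? 1 : 0;
}
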
diff --git a/mm/filemap.c b/mm/filemap.c
index 83efee76a5c0..c610076c30e1 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1728,6 +1728,7 @@ int filemap_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
1728 * see the dirty page and writeprotect it again. 1728 * see the dirty page and writeprotect it again.
1729 */ 1729 */
1730 set_page_dirty(page); 1730 set_page_dirty(page);
1731 wait_for_stable_page(page);
1731out: 1732out:
1732 sb_end_pagefault(inode->i_sb); 1733 sb_end_pagefault(inode->i_sb);
1733 return ret; 1734 return ret;
@@ -2056,7 +2057,7 @@ EXPORT_SYMBOL(iov_iter_fault_in_readable);
2056/* 2057/*
2057 * Return the count of just the current iov_iter segment. 2058 * Return the count of just the current iov_iter segment.
2058 */ 2059 */
2059size_t iov_iter_single_seg_count(struct iov_iter *i) 2060size_t iov_iter_single_seg_count(const struct iov_iter *i)
2060{ 2061{
2061 const struct iovec *iov = i->iov; 2062 const struct iovec *iov = i->iov;
2062 if (i->nr_segs == 1) 2063 if (i->nr_segs == 1)
@@ -2274,7 +2275,7 @@ repeat:
2274 return NULL; 2275 return NULL;
2275 } 2276 }
2276found: 2277found:
2277 wait_on_page_writeback(page); 2278 wait_for_stable_page(page);
2278 return page; 2279 return page;
2279} 2280}
2280EXPORT_SYMBOL(grab_cache_page_write_begin); 2281EXPORT_SYMBOL(grab_cache_page_write_begin);
diff --git a/mm/fremap.c b/mm/fremap.c
index a0aaf0e56800..0cd4c11488ed 100644
--- a/mm/fremap.c
+++ b/mm/fremap.c
@@ -129,6 +129,7 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
129 struct vm_area_struct *vma; 129 struct vm_area_struct *vma;
130 int err = -EINVAL; 130 int err = -EINVAL;
131 int has_write_lock = 0; 131 int has_write_lock = 0;
132 vm_flags_t vm_flags;
132 133
133 if (prot) 134 if (prot)
134 return err; 135 return err;
@@ -160,15 +161,11 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
160 /* 161 /*
161 * Make sure the vma is shared, that it supports prefaulting, 162 * Make sure the vma is shared, that it supports prefaulting,
162 * and that the remapped range is valid and fully within 163 * and that the remapped range is valid and fully within
163 * the single existing vma. vm_private_data is used as a 164 * the single existing vma.
164 * swapout cursor in a VM_NONLINEAR vma.
165 */ 165 */
166 if (!vma || !(vma->vm_flags & VM_SHARED)) 166 if (!vma || !(vma->vm_flags & VM_SHARED))
167 goto out; 167 goto out;
168 168
169 if (vma->vm_private_data && !(vma->vm_flags & VM_NONLINEAR))
170 goto out;
171
172 if (!vma->vm_ops || !vma->vm_ops->remap_pages) 169 if (!vma->vm_ops || !vma->vm_ops->remap_pages)
173 goto out; 170 goto out;
174 171
@@ -177,6 +174,13 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
177 174
178 /* Must set VM_NONLINEAR before any pages are populated. */ 175 /* Must set VM_NONLINEAR before any pages are populated. */
179 if (!(vma->vm_flags & VM_NONLINEAR)) { 176 if (!(vma->vm_flags & VM_NONLINEAR)) {
177 /*
178 * vm_private_data is used as a swapout cursor
179 * in a VM_NONLINEAR vma.
180 */
181 if (vma->vm_private_data)
182 goto out;
183
180 /* Don't need a nonlinear mapping, exit success */ 184 /* Don't need a nonlinear mapping, exit success */
181 if (pgoff == linear_page_index(vma, start)) { 185 if (pgoff == linear_page_index(vma, start)) {
182 err = 0; 186 err = 0;
@@ -184,6 +188,7 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
184 } 188 }
185 189
186 if (!has_write_lock) { 190 if (!has_write_lock) {
191get_write_lock:
187 up_read(&mm->mmap_sem); 192 up_read(&mm->mmap_sem);
188 down_write(&mm->mmap_sem); 193 down_write(&mm->mmap_sem);
189 has_write_lock = 1; 194 has_write_lock = 1;
@@ -199,9 +204,10 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
199 unsigned long addr; 204 unsigned long addr;
200 struct file *file = get_file(vma->vm_file); 205 struct file *file = get_file(vma->vm_file);
201 206
202 flags &= MAP_NONBLOCK; 207 vm_flags = vma->vm_flags;
203 addr = mmap_region(file, start, size, 208 if (!(flags & MAP_NONBLOCK))
204 flags, vma->vm_flags, pgoff); 209 vm_flags |= VM_POPULATE;
210 addr = mmap_region(file, start, size, vm_flags, pgoff);
205 fput(file); 211 fput(file);
206 if (IS_ERR_VALUE(addr)) { 212 if (IS_ERR_VALUE(addr)) {
207 err = addr; 213 err = addr;
@@ -220,32 +226,26 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
220 mutex_unlock(&mapping->i_mmap_mutex); 226 mutex_unlock(&mapping->i_mmap_mutex);
221 } 227 }
222 228
229 if (!(flags & MAP_NONBLOCK) && !(vma->vm_flags & VM_POPULATE)) {
230 if (!has_write_lock)
231 goto get_write_lock;
232 vma->vm_flags |= VM_POPULATE;
233 }
234
223 if (vma->vm_flags & VM_LOCKED) { 235 if (vma->vm_flags & VM_LOCKED) {
224 /* 236 /*
225 * drop PG_Mlocked flag for over-mapped range 237 * drop PG_Mlocked flag for over-mapped range
226 */ 238 */
227 vm_flags_t saved_flags = vma->vm_flags; 239 if (!has_write_lock)
240 goto get_write_lock;
241 vm_flags = vma->vm_flags;
228 munlock_vma_pages_range(vma, start, start + size); 242 munlock_vma_pages_range(vma, start, start + size);
229 vma->vm_flags = saved_flags; 243 vma->vm_flags = vm_flags;
230 } 244 }
231 245
232 mmu_notifier_invalidate_range_start(mm, start, start + size); 246 mmu_notifier_invalidate_range_start(mm, start, start + size);
233 err = vma->vm_ops->remap_pages(vma, start, size, pgoff); 247 err = vma->vm_ops->remap_pages(vma, start, size, pgoff);
234 mmu_notifier_invalidate_range_end(mm, start, start + size); 248 mmu_notifier_invalidate_range_end(mm, start, start + size);
235 if (!err && !(flags & MAP_NONBLOCK)) {
236 if (vma->vm_flags & VM_LOCKED) {
237 /*
238 * might be mapping previously unmapped range of file
239 */
240 mlock_vma_pages_range(vma, start, start + size);
241 } else {
242 if (unlikely(has_write_lock)) {
243 downgrade_write(&mm->mmap_sem);
244 has_write_lock = 0;
245 }
246 make_pages_present(start, start+size);
247 }
248 }
249 249
250 /* 250 /*
251 * We can't clear VM_NONLINEAR because we'd have to do 251 * We can't clear VM_NONLINEAR because we'd have to do
@@ -254,10 +254,13 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
254 */ 254 */
255 255
256out: 256out:
257 vm_flags = vma->vm_flags;
257 if (likely(!has_write_lock)) 258 if (likely(!has_write_lock))
258 up_read(&mm->mmap_sem); 259 up_read(&mm->mmap_sem);
259 else 260 else
260 up_write(&mm->mmap_sem); 261 up_write(&mm->mmap_sem);
262 if (!err && ((vm_flags & VM_LOCKED) || !(flags & MAP_NONBLOCK)))
263 mm_populate(start, size);
261 264
262 return err; 265 return err;
263} 266}
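
The rework above routes deferred population through mm_populate() once the locks are dropped. The syscall being serviced is remap_file_pages(2); a small illustrative caller, assuming the file given on the command line is MAP_SHARED-mappable and at least two pages long:

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	long page = sysconf(_SC_PAGESIZE);
	char *map;
	int fd;

	if (argc < 2) {
		fprintf(stderr, "usage: %s <file of at least two pages>\n", argv[0]);
		return 1;
	}
	fd = open(argv[1], O_RDWR);
	if (fd < 0) {
		perror("open");
		return 1;
	}

	map = mmap(NULL, 2 * page, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (map == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	/* make the first window of the mapping show file page 1, not page 0 */
	if (remap_file_pages(map, page, 0, 1, 0)) {
		perror("remap_file_pages");
		return 1;
	}

	printf("map[0] now reads file offset %ld: 0x%02x\n",
	       page, (unsigned char)map[0]);
	return 0;
}
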
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 9e894edc7811..bfa142e67b1c 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -20,6 +20,7 @@
20#include <linux/mman.h> 20#include <linux/mman.h>
21#include <linux/pagemap.h> 21#include <linux/pagemap.h>
22#include <linux/migrate.h> 22#include <linux/migrate.h>
23#include <linux/hashtable.h>
23 24
24#include <asm/tlb.h> 25#include <asm/tlb.h>
25#include <asm/pgalloc.h> 26#include <asm/pgalloc.h>
@@ -62,12 +63,11 @@ static DECLARE_WAIT_QUEUE_HEAD(khugepaged_wait);
62static unsigned int khugepaged_max_ptes_none __read_mostly = HPAGE_PMD_NR-1; 63static unsigned int khugepaged_max_ptes_none __read_mostly = HPAGE_PMD_NR-1;
63 64
64static int khugepaged(void *none); 65static int khugepaged(void *none);
65static int mm_slots_hash_init(void);
66static int khugepaged_slab_init(void); 66static int khugepaged_slab_init(void);
67static void khugepaged_slab_free(void);
68 67
69#define MM_SLOTS_HASH_HEADS 1024 68#define MM_SLOTS_HASH_BITS 10
70static struct hlist_head *mm_slots_hash __read_mostly; 69static __read_mostly DEFINE_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS);
70
71static struct kmem_cache *mm_slot_cache __read_mostly; 71static struct kmem_cache *mm_slot_cache __read_mostly;
72 72
73/** 73/**
@@ -105,7 +105,6 @@ static int set_recommended_min_free_kbytes(void)
105 struct zone *zone; 105 struct zone *zone;
106 int nr_zones = 0; 106 int nr_zones = 0;
107 unsigned long recommended_min; 107 unsigned long recommended_min;
108 extern int min_free_kbytes;
109 108
110 if (!khugepaged_enabled()) 109 if (!khugepaged_enabled())
111 return 0; 110 return 0;
@@ -634,12 +633,6 @@ static int __init hugepage_init(void)
634 if (err) 633 if (err)
635 goto out; 634 goto out;
636 635
637 err = mm_slots_hash_init();
638 if (err) {
639 khugepaged_slab_free();
640 goto out;
641 }
642
643 register_shrinker(&huge_zero_page_shrinker); 636 register_shrinker(&huge_zero_page_shrinker);
644 637
645 /* 638 /*
@@ -1257,6 +1250,10 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
1257 if (flags & FOLL_WRITE && !pmd_write(*pmd)) 1250 if (flags & FOLL_WRITE && !pmd_write(*pmd))
1258 goto out; 1251 goto out;
1259 1252
1253 /* Avoid dumping huge zero page */
1254 if ((flags & FOLL_DUMP) && is_huge_zero_pmd(*pmd))
1255 return ERR_PTR(-EFAULT);
1256
1260 page = pmd_page(*pmd); 1257 page = pmd_page(*pmd);
1261 VM_BUG_ON(!PageHead(page)); 1258 VM_BUG_ON(!PageHead(page));
1262 if (flags & FOLL_TOUCH) { 1259 if (flags & FOLL_TOUCH) {
@@ -1298,7 +1295,6 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
1298 int target_nid; 1295 int target_nid;
1299 int current_nid = -1; 1296 int current_nid = -1;
1300 bool migrated; 1297 bool migrated;
1301 bool page_locked = false;
1302 1298
1303 spin_lock(&mm->page_table_lock); 1299 spin_lock(&mm->page_table_lock);
1304 if (unlikely(!pmd_same(pmd, *pmdp))) 1300 if (unlikely(!pmd_same(pmd, *pmdp)))
@@ -1320,7 +1316,6 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
1320 /* Acquire the page lock to serialise THP migrations */ 1316 /* Acquire the page lock to serialise THP migrations */
1321 spin_unlock(&mm->page_table_lock); 1317 spin_unlock(&mm->page_table_lock);
1322 lock_page(page); 1318 lock_page(page);
1323 page_locked = true;
1324 1319
1325 /* Confirm the PTE did not while locked */ 1320 /* Confirm the PTE did not while locked */
1326 spin_lock(&mm->page_table_lock); 1321 spin_lock(&mm->page_table_lock);
@@ -1333,34 +1328,26 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
1333 1328
1334 /* Migrate the THP to the requested node */ 1329 /* Migrate the THP to the requested node */
1335 migrated = migrate_misplaced_transhuge_page(mm, vma, 1330 migrated = migrate_misplaced_transhuge_page(mm, vma,
1336 pmdp, pmd, addr, 1331 pmdp, pmd, addr, page, target_nid);
1337 page, target_nid); 1332 if (!migrated)
1338 if (migrated) 1333 goto check_same;
1339 current_nid = target_nid;
1340 else {
1341 spin_lock(&mm->page_table_lock);
1342 if (unlikely(!pmd_same(pmd, *pmdp))) {
1343 unlock_page(page);
1344 goto out_unlock;
1345 }
1346 goto clear_pmdnuma;
1347 }
1348 1334
1349 task_numa_fault(current_nid, HPAGE_PMD_NR, migrated); 1335 task_numa_fault(target_nid, HPAGE_PMD_NR, true);
1350 return 0; 1336 return 0;
1351 1337
1338check_same:
1339 spin_lock(&mm->page_table_lock);
1340 if (unlikely(!pmd_same(pmd, *pmdp)))
1341 goto out_unlock;
1352clear_pmdnuma: 1342clear_pmdnuma:
1353 pmd = pmd_mknonnuma(pmd); 1343 pmd = pmd_mknonnuma(pmd);
1354 set_pmd_at(mm, haddr, pmdp, pmd); 1344 set_pmd_at(mm, haddr, pmdp, pmd);
1355 VM_BUG_ON(pmd_numa(*pmdp)); 1345 VM_BUG_ON(pmd_numa(*pmdp));
1356 update_mmu_cache_pmd(vma, addr, pmdp); 1346 update_mmu_cache_pmd(vma, addr, pmdp);
1357 if (page_locked)
1358 unlock_page(page);
1359
1360out_unlock: 1347out_unlock:
1361 spin_unlock(&mm->page_table_lock); 1348 spin_unlock(&mm->page_table_lock);
1362 if (current_nid != -1) 1349 if (current_nid != -1)
1363 task_numa_fault(current_nid, HPAGE_PMD_NR, migrated); 1350 task_numa_fault(current_nid, HPAGE_PMD_NR, false);
1364 return 0; 1351 return 0;
1365} 1352}
1366 1353
@@ -1652,7 +1639,7 @@ static void __split_huge_page_refcount(struct page *page)
1652 page_tail->mapping = page->mapping; 1639 page_tail->mapping = page->mapping;
1653 1640
1654 page_tail->index = page->index + i; 1641 page_tail->index = page->index + i;
1655 page_xchg_last_nid(page_tail, page_last_nid(page)); 1642 page_nid_xchg_last(page_tail, page_nid_last(page));
1656 1643
1657 BUG_ON(!PageAnon(page_tail)); 1644 BUG_ON(!PageAnon(page_tail));
1658 BUG_ON(!PageUptodate(page_tail)); 1645 BUG_ON(!PageUptodate(page_tail));
@@ -1819,9 +1806,19 @@ int split_huge_page(struct page *page)
1819 1806
1820 BUG_ON(is_huge_zero_pfn(page_to_pfn(page))); 1807 BUG_ON(is_huge_zero_pfn(page_to_pfn(page)));
1821 BUG_ON(!PageAnon(page)); 1808 BUG_ON(!PageAnon(page));
1822 anon_vma = page_lock_anon_vma_read(page); 1809
1810 /*
1811 * The caller does not necessarily hold an mmap_sem that would prevent
1812 * the anon_vma disappearing so we first we take a reference to it
1813 * and then lock the anon_vma for write. This is similar to
1814 * page_lock_anon_vma_read except the write lock is taken to serialise
1815 * against parallel split or collapse operations.
1816 */
1817 anon_vma = page_get_anon_vma(page);
1823 if (!anon_vma) 1818 if (!anon_vma)
1824 goto out; 1819 goto out;
1820 anon_vma_lock_write(anon_vma);
1821
1825 ret = 0; 1822 ret = 0;
1826 if (!PageCompound(page)) 1823 if (!PageCompound(page))
1827 goto out_unlock; 1824 goto out_unlock;
@@ -1832,7 +1829,8 @@ int split_huge_page(struct page *page)
1832 1829
1833 BUG_ON(PageCompound(page)); 1830 BUG_ON(PageCompound(page));
1834out_unlock: 1831out_unlock:
1835 page_unlock_anon_vma_read(anon_vma); 1832 anon_vma_unlock_write(anon_vma);
1833 put_anon_vma(anon_vma);
1836out: 1834out:
1837 return ret; 1835 return ret;
1838} 1836}
@@ -1893,12 +1891,6 @@ static int __init khugepaged_slab_init(void)
1893 return 0; 1891 return 0;
1894} 1892}
1895 1893
1896static void __init khugepaged_slab_free(void)
1897{
1898 kmem_cache_destroy(mm_slot_cache);
1899 mm_slot_cache = NULL;
1900}
1901
1902static inline struct mm_slot *alloc_mm_slot(void) 1894static inline struct mm_slot *alloc_mm_slot(void)
1903{ 1895{
1904 if (!mm_slot_cache) /* initialization failed */ 1896 if (!mm_slot_cache) /* initialization failed */
@@ -1911,47 +1903,23 @@ static inline void free_mm_slot(struct mm_slot *mm_slot)
1911 kmem_cache_free(mm_slot_cache, mm_slot); 1903 kmem_cache_free(mm_slot_cache, mm_slot);
1912} 1904}
1913 1905
1914static int __init mm_slots_hash_init(void)
1915{
1916 mm_slots_hash = kzalloc(MM_SLOTS_HASH_HEADS * sizeof(struct hlist_head),
1917 GFP_KERNEL);
1918 if (!mm_slots_hash)
1919 return -ENOMEM;
1920 return 0;
1921}
1922
1923#if 0
1924static void __init mm_slots_hash_free(void)
1925{
1926 kfree(mm_slots_hash);
1927 mm_slots_hash = NULL;
1928}
1929#endif
1930
1931static struct mm_slot *get_mm_slot(struct mm_struct *mm) 1906static struct mm_slot *get_mm_slot(struct mm_struct *mm)
1932{ 1907{
1933 struct mm_slot *mm_slot; 1908 struct mm_slot *mm_slot;
1934 struct hlist_head *bucket;
1935 struct hlist_node *node; 1909 struct hlist_node *node;
1936 1910
1937 bucket = &mm_slots_hash[((unsigned long)mm / sizeof(struct mm_struct)) 1911 hash_for_each_possible(mm_slots_hash, mm_slot, node, hash, (unsigned long)mm)
1938 % MM_SLOTS_HASH_HEADS];
1939 hlist_for_each_entry(mm_slot, node, bucket, hash) {
1940 if (mm == mm_slot->mm) 1912 if (mm == mm_slot->mm)
1941 return mm_slot; 1913 return mm_slot;
1942 } 1914
1943 return NULL; 1915 return NULL;
1944} 1916}
1945 1917
1946static void insert_to_mm_slots_hash(struct mm_struct *mm, 1918static void insert_to_mm_slots_hash(struct mm_struct *mm,
1947 struct mm_slot *mm_slot) 1919 struct mm_slot *mm_slot)
1948{ 1920{
1949 struct hlist_head *bucket;
1950
1951 bucket = &mm_slots_hash[((unsigned long)mm / sizeof(struct mm_struct))
1952 % MM_SLOTS_HASH_HEADS];
1953 mm_slot->mm = mm; 1921 mm_slot->mm = mm;
1954 hlist_add_head(&mm_slot->hash, bucket); 1922 hash_add(mm_slots_hash, &mm_slot->hash, (long)mm);
1955} 1923}
1956 1924
1957static inline int khugepaged_test_exit(struct mm_struct *mm) 1925static inline int khugepaged_test_exit(struct mm_struct *mm)
@@ -2020,7 +1988,7 @@ void __khugepaged_exit(struct mm_struct *mm)
2020 spin_lock(&khugepaged_mm_lock); 1988 spin_lock(&khugepaged_mm_lock);
2021 mm_slot = get_mm_slot(mm); 1989 mm_slot = get_mm_slot(mm);
2022 if (mm_slot && khugepaged_scan.mm_slot != mm_slot) { 1990 if (mm_slot && khugepaged_scan.mm_slot != mm_slot) {
2023 hlist_del(&mm_slot->hash); 1991 hash_del(&mm_slot->hash);
2024 list_del(&mm_slot->mm_node); 1992 list_del(&mm_slot->mm_node);
2025 free = 1; 1993 free = 1;
2026 } 1994 }
@@ -2353,7 +2321,7 @@ static void collapse_huge_page(struct mm_struct *mm,
2353 BUG_ON(!pmd_none(*pmd)); 2321 BUG_ON(!pmd_none(*pmd));
2354 set_pmd_at(mm, address, pmd, _pmd); 2322 set_pmd_at(mm, address, pmd, _pmd);
2355 spin_unlock(&mm->page_table_lock); 2323 spin_unlock(&mm->page_table_lock);
2356 anon_vma_unlock(vma->anon_vma); 2324 anon_vma_unlock_write(vma->anon_vma);
2357 goto out; 2325 goto out;
2358 } 2326 }
2359 2327
@@ -2361,7 +2329,7 @@ static void collapse_huge_page(struct mm_struct *mm,
2361 * All pages are isolated and locked so anon_vma rmap 2329 * All pages are isolated and locked so anon_vma rmap
2362 * can't run anymore. 2330 * can't run anymore.
2363 */ 2331 */
2364 anon_vma_unlock(vma->anon_vma); 2332 anon_vma_unlock_write(vma->anon_vma);
2365 2333
2366 __collapse_huge_page_copy(pte, new_page, vma, address, ptl); 2334 __collapse_huge_page_copy(pte, new_page, vma, address, ptl);
2367 pte_unmap(pte); 2335 pte_unmap(pte);
@@ -2408,7 +2376,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
2408 struct page *page; 2376 struct page *page;
2409 unsigned long _address; 2377 unsigned long _address;
2410 spinlock_t *ptl; 2378 spinlock_t *ptl;
2411 int node = -1; 2379 int node = NUMA_NO_NODE;
2412 2380
2413 VM_BUG_ON(address & ~HPAGE_PMD_MASK); 2381 VM_BUG_ON(address & ~HPAGE_PMD_MASK);
2414 2382
@@ -2438,7 +2406,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
2438 * be more sophisticated and look at more pages, 2406 * be more sophisticated and look at more pages,
2439 * but isn't for now. 2407 * but isn't for now.
2440 */ 2408 */
2441 if (node == -1) 2409 if (node == NUMA_NO_NODE)
2442 node = page_to_nid(page); 2410 node = page_to_nid(page);
2443 VM_BUG_ON(PageCompound(page)); 2411 VM_BUG_ON(PageCompound(page));
2444 if (!PageLRU(page) || PageLocked(page) || !PageAnon(page)) 2412 if (!PageLRU(page) || PageLocked(page) || !PageAnon(page))
@@ -2469,7 +2437,7 @@ static void collect_mm_slot(struct mm_slot *mm_slot)
2469 2437
2470 if (khugepaged_test_exit(mm)) { 2438 if (khugepaged_test_exit(mm)) {
2471 /* free mm_slot */ 2439 /* free mm_slot */
2472 hlist_del(&mm_slot->hash); 2440 hash_del(&mm_slot->hash);
2473 list_del(&mm_slot->mm_node); 2441 list_del(&mm_slot->mm_node);
2474 2442
2475 /* 2443 /*
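
The khugepaged mm_slots hash now uses the generic DEFINE_HASHTABLE()/hash_add()/hash_for_each_possible() helpers instead of an open-coded, kzalloc'ed bucket array. Below is a userspace analog of the pattern those helpers wrap (a fixed power-of-two bucket array of chained slots keyed by hashing the mm pointer); the structures and hash function are simplified stand-ins, not the kernel API.

#include <stdio.h>
#include <stdint.h>

#define HASH_BITS 10
#define HASH_SIZE (1U << HASH_BITS)

struct mm_slot {
	struct mm_slot *next;	/* hash chain */
	void *mm;		/* key */
};

static struct mm_slot *buckets[HASH_SIZE];

static unsigned int hash_mm(const void *mm)
{
	/* cheap pointer hash; the kernel uses hash_long() for this */
	return ((uintptr_t)mm >> 4) & (HASH_SIZE - 1);
}

static void slot_insert(struct mm_slot *slot, void *mm)
{
	unsigned int b = hash_mm(mm);

	slot->mm = mm;
	slot->next = buckets[b];
	buckets[b] = slot;
}

static struct mm_slot *slot_lookup(void *mm)
{
	struct mm_slot *s;

	for (s = buckets[hash_mm(mm)]; s; s = s->next)
		if (s->mm == mm)
			return s;
	return NULL;
}

int main(void)
{
	struct mm_slot a = { 0 }, b = { 0 };
	int k1, k2;

	slot_insert(&a, &k1);
	slot_insert(&b, &k2);
	printf("%d %d\n", slot_lookup(&k1) == &a, slot_lookup(&k2) == &b);
	return 0;
}
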
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 4f3ea0b1e57c..cdb64e4d238a 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1293,8 +1293,7 @@ static void __init report_hugepages(void)
1293 1293
1294 for_each_hstate(h) { 1294 for_each_hstate(h) {
1295 char buf[32]; 1295 char buf[32];
1296 printk(KERN_INFO "HugeTLB registered %s page size, " 1296 pr_info("HugeTLB registered %s page size, pre-allocated %ld pages\n",
1297 "pre-allocated %ld pages\n",
1298 memfmt(buf, huge_page_size(h)), 1297 memfmt(buf, huge_page_size(h)),
1299 h->free_huge_pages); 1298 h->free_huge_pages);
1300 } 1299 }
@@ -1702,8 +1701,7 @@ static void __init hugetlb_sysfs_init(void)
1702 err = hugetlb_sysfs_add_hstate(h, hugepages_kobj, 1701 err = hugetlb_sysfs_add_hstate(h, hugepages_kobj,
1703 hstate_kobjs, &hstate_attr_group); 1702 hstate_kobjs, &hstate_attr_group);
1704 if (err) 1703 if (err)
1705 printk(KERN_ERR "Hugetlb: Unable to add hstate %s", 1704 pr_err("Hugetlb: Unable to add hstate %s", h->name);
1706 h->name);
1707 } 1705 }
1708} 1706}
1709 1707
@@ -1826,9 +1824,8 @@ void hugetlb_register_node(struct node *node)
1826 nhs->hstate_kobjs, 1824 nhs->hstate_kobjs,
1827 &per_node_hstate_attr_group); 1825 &per_node_hstate_attr_group);
1828 if (err) { 1826 if (err) {
1829 printk(KERN_ERR "Hugetlb: Unable to add hstate %s" 1827 pr_err("Hugetlb: Unable to add hstate %s for node %d\n",
1830 " for node %d\n", 1828 h->name, node->dev.id);
1831 h->name, node->dev.id);
1832 hugetlb_unregister_node(node); 1829 hugetlb_unregister_node(node);
1833 break; 1830 break;
1834 } 1831 }
@@ -1924,7 +1921,7 @@ void __init hugetlb_add_hstate(unsigned order)
1924 unsigned long i; 1921 unsigned long i;
1925 1922
1926 if (size_to_hstate(PAGE_SIZE << order)) { 1923 if (size_to_hstate(PAGE_SIZE << order)) {
1927 printk(KERN_WARNING "hugepagesz= specified twice, ignoring\n"); 1924 pr_warning("hugepagesz= specified twice, ignoring\n");
1928 return; 1925 return;
1929 } 1926 }
1930 BUG_ON(hugetlb_max_hstate >= HUGE_MAX_HSTATE); 1927 BUG_ON(hugetlb_max_hstate >= HUGE_MAX_HSTATE);
@@ -1960,8 +1957,8 @@ static int __init hugetlb_nrpages_setup(char *s)
1960 mhp = &parsed_hstate->max_huge_pages; 1957 mhp = &parsed_hstate->max_huge_pages;
1961 1958
1962 if (mhp == last_mhp) { 1959 if (mhp == last_mhp) {
1963 printk(KERN_WARNING "hugepages= specified twice without " 1960 pr_warning("hugepages= specified twice without "
1964 "interleaving hugepagesz=, ignoring\n"); 1961 "interleaving hugepagesz=, ignoring\n");
1965 return 1; 1962 return 1;
1966 } 1963 }
1967 1964
@@ -2692,9 +2689,8 @@ static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
2692 * COW. Warn that such a situation has occurred as it may not be obvious 2689 * COW. Warn that such a situation has occurred as it may not be obvious
2693 */ 2690 */
2694 if (is_vma_resv_set(vma, HPAGE_RESV_UNMAPPED)) { 2691 if (is_vma_resv_set(vma, HPAGE_RESV_UNMAPPED)) {
2695 printk(KERN_WARNING 2692 pr_warning("PID %d killed due to inadequate hugepage pool\n",
2696 "PID %d killed due to inadequate hugepage pool\n", 2693 current->pid);
2697 current->pid);
2698 return ret; 2694 return ret;
2699 } 2695 }
2700 2696
@@ -2924,14 +2920,14 @@ follow_huge_pud(struct mm_struct *mm, unsigned long address,
2924 return NULL; 2920 return NULL;
2925} 2921}
2926 2922
2927int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, 2923long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
2928 struct page **pages, struct vm_area_struct **vmas, 2924 struct page **pages, struct vm_area_struct **vmas,
2929 unsigned long *position, int *length, int i, 2925 unsigned long *position, unsigned long *nr_pages,
2930 unsigned int flags) 2926 long i, unsigned int flags)
2931{ 2927{
2932 unsigned long pfn_offset; 2928 unsigned long pfn_offset;
2933 unsigned long vaddr = *position; 2929 unsigned long vaddr = *position;
2934 int remainder = *length; 2930 unsigned long remainder = *nr_pages;
2935 struct hstate *h = hstate_vma(vma); 2931 struct hstate *h = hstate_vma(vma);
2936 2932
2937 spin_lock(&mm->page_table_lock); 2933 spin_lock(&mm->page_table_lock);
@@ -3001,7 +2997,7 @@ same_page:
3001 } 2997 }
3002 } 2998 }
3003 spin_unlock(&mm->page_table_lock); 2999 spin_unlock(&mm->page_table_lock);
3004 *length = remainder; 3000 *nr_pages = remainder;
3005 *position = vaddr; 3001 *position = vaddr;
3006 3002
3007 return i ? i : -EFAULT; 3003 return i ? i : -EFAULT;
@@ -3033,6 +3029,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
3033 if (!huge_pte_none(huge_ptep_get(ptep))) { 3029 if (!huge_pte_none(huge_ptep_get(ptep))) {
3034 pte = huge_ptep_get_and_clear(mm, address, ptep); 3030 pte = huge_ptep_get_and_clear(mm, address, ptep);
3035 pte = pte_mkhuge(pte_modify(pte, newprot)); 3031 pte = pte_mkhuge(pte_modify(pte, newprot));
3032 pte = arch_make_huge_pte(pte, vma, NULL, 0);
3036 set_huge_pte_at(mm, address, ptep, pte); 3033 set_huge_pte_at(mm, address, ptep, pte);
3037 pages++; 3034 pages++;
3038 } 3035 }
diff --git a/mm/internal.h b/mm/internal.h
index d597f94cc205..1c0c4cc0fcf7 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -135,7 +135,6 @@ struct compact_control {
135 int migratetype; /* MOVABLE, RECLAIMABLE etc */ 135 int migratetype; /* MOVABLE, RECLAIMABLE etc */
136 struct zone *zone; 136 struct zone *zone;
137 bool contended; /* True if a lock was contended */ 137 bool contended; /* True if a lock was contended */
138 struct page **page; /* Page captured of requested size */
139}; 138};
140 139
141unsigned long 140unsigned long
@@ -163,8 +162,8 @@ void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma,
163 struct vm_area_struct *prev, struct rb_node *rb_parent); 162 struct vm_area_struct *prev, struct rb_node *rb_parent);
164 163
165#ifdef CONFIG_MMU 164#ifdef CONFIG_MMU
166extern long mlock_vma_pages_range(struct vm_area_struct *vma, 165extern long __mlock_vma_pages_range(struct vm_area_struct *vma,
167 unsigned long start, unsigned long end); 166 unsigned long start, unsigned long end, int *nonblocking);
168extern void munlock_vma_pages_range(struct vm_area_struct *vma, 167extern void munlock_vma_pages_range(struct vm_area_struct *vma,
169 unsigned long start, unsigned long end); 168 unsigned long start, unsigned long end);
170static inline void munlock_vma_pages_all(struct vm_area_struct *vma) 169static inline void munlock_vma_pages_all(struct vm_area_struct *vma)
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index 752a705c77c2..83dd5fbf5e60 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -1300,9 +1300,8 @@ static void kmemleak_scan(void)
1300 */ 1300 */
1301 lock_memory_hotplug(); 1301 lock_memory_hotplug();
1302 for_each_online_node(i) { 1302 for_each_online_node(i) {
1303 pg_data_t *pgdat = NODE_DATA(i); 1303 unsigned long start_pfn = node_start_pfn(i);
1304 unsigned long start_pfn = pgdat->node_start_pfn; 1304 unsigned long end_pfn = node_end_pfn(i);
1305 unsigned long end_pfn = start_pfn + pgdat->node_spanned_pages;
1306 unsigned long pfn; 1305 unsigned long pfn;
1307 1306
1308 for (pfn = start_pfn; pfn < end_pfn; pfn++) { 1307 for (pfn = start_pfn; pfn < end_pfn; pfn++) {
diff --git a/mm/ksm.c b/mm/ksm.c
index 51573858938d..ab2ba9ad3c59 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -33,13 +33,22 @@
33#include <linux/mmu_notifier.h> 33#include <linux/mmu_notifier.h>
34#include <linux/swap.h> 34#include <linux/swap.h>
35#include <linux/ksm.h> 35#include <linux/ksm.h>
36#include <linux/hash.h> 36#include <linux/hashtable.h>
37#include <linux/freezer.h> 37#include <linux/freezer.h>
38#include <linux/oom.h> 38#include <linux/oom.h>
39#include <linux/numa.h>
39 40
40#include <asm/tlbflush.h> 41#include <asm/tlbflush.h>
41#include "internal.h" 42#include "internal.h"
42 43
44#ifdef CONFIG_NUMA
45#define NUMA(x) (x)
46#define DO_NUMA(x) do { (x); } while (0)
47#else
48#define NUMA(x) (0)
49#define DO_NUMA(x) do { } while (0)
50#endif
51
43/* 52/*
44 * A few notes about the KSM scanning process, 53 * A few notes about the KSM scanning process,
45 * to make it easier to understand the data structures below: 54 * to make it easier to understand the data structures below:
@@ -78,6 +87,9 @@
78 * take 10 attempts to find a page in the unstable tree, once it is found, 87 * take 10 attempts to find a page in the unstable tree, once it is found,
79 * it is secured in the stable tree. (When we scan a new page, we first 88 * it is secured in the stable tree. (When we scan a new page, we first
80 * compare it against the stable tree, and then against the unstable tree.) 89 * compare it against the stable tree, and then against the unstable tree.)
90 *
91 * If the merge_across_nodes tunable is unset, then KSM maintains multiple
92 * stable trees and multiple unstable trees: one of each for each NUMA node.
81 */ 93 */
82 94
83/** 95/**
@@ -113,19 +125,32 @@ struct ksm_scan {
113/** 125/**
114 * struct stable_node - node of the stable rbtree 126 * struct stable_node - node of the stable rbtree
115 * @node: rb node of this ksm page in the stable tree 127 * @node: rb node of this ksm page in the stable tree
128 * @head: (overlaying parent) &migrate_nodes indicates temporarily on that list
129 * @list: linked into migrate_nodes, pending placement in the proper node tree
116 * @hlist: hlist head of rmap_items using this ksm page 130 * @hlist: hlist head of rmap_items using this ksm page
117 * @kpfn: page frame number of this ksm page 131 * @kpfn: page frame number of this ksm page (perhaps temporarily on wrong nid)
132 * @nid: NUMA node id of stable tree in which linked (may not match kpfn)
118 */ 133 */
119struct stable_node { 134struct stable_node {
120 struct rb_node node; 135 union {
136 struct rb_node node; /* when node of stable tree */
137 struct { /* when listed for migration */
138 struct list_head *head;
139 struct list_head list;
140 };
141 };
121 struct hlist_head hlist; 142 struct hlist_head hlist;
122 unsigned long kpfn; 143 unsigned long kpfn;
144#ifdef CONFIG_NUMA
145 int nid;
146#endif
123}; 147};
124 148
125/** 149/**
126 * struct rmap_item - reverse mapping item for virtual addresses 150 * struct rmap_item - reverse mapping item for virtual addresses
127 * @rmap_list: next rmap_item in mm_slot's singly-linked rmap_list 151 * @rmap_list: next rmap_item in mm_slot's singly-linked rmap_list
128 * @anon_vma: pointer to anon_vma for this mm,address, when in stable tree 152 * @anon_vma: pointer to anon_vma for this mm,address, when in stable tree
153 * @nid: NUMA node id of unstable tree in which linked (may not match page)
129 * @mm: the memory structure this rmap_item is pointing into 154 * @mm: the memory structure this rmap_item is pointing into
130 * @address: the virtual address this rmap_item tracks (+ flags in low bits) 155 * @address: the virtual address this rmap_item tracks (+ flags in low bits)
131 * @oldchecksum: previous checksum of the page at that virtual address 156 * @oldchecksum: previous checksum of the page at that virtual address
@@ -135,7 +160,12 @@ struct stable_node {
135 */ 160 */
136struct rmap_item { 161struct rmap_item {
137 struct rmap_item *rmap_list; 162 struct rmap_item *rmap_list;
138 struct anon_vma *anon_vma; /* when stable */ 163 union {
164 struct anon_vma *anon_vma; /* when stable */
165#ifdef CONFIG_NUMA
166 int nid; /* when node of unstable tree */
167#endif
168 };
139 struct mm_struct *mm; 169 struct mm_struct *mm;
140 unsigned long address; /* + low bits used for flags below */ 170 unsigned long address; /* + low bits used for flags below */
141 unsigned int oldchecksum; /* when unstable */ 171 unsigned int oldchecksum; /* when unstable */
@@ -153,12 +183,16 @@ struct rmap_item {
153#define STABLE_FLAG 0x200 /* is listed from the stable tree */ 183#define STABLE_FLAG 0x200 /* is listed from the stable tree */
154 184
155/* The stable and unstable tree heads */ 185/* The stable and unstable tree heads */
156static struct rb_root root_stable_tree = RB_ROOT; 186static struct rb_root one_stable_tree[1] = { RB_ROOT };
157static struct rb_root root_unstable_tree = RB_ROOT; 187static struct rb_root one_unstable_tree[1] = { RB_ROOT };
188static struct rb_root *root_stable_tree = one_stable_tree;
189static struct rb_root *root_unstable_tree = one_unstable_tree;
158 190
159#define MM_SLOTS_HASH_SHIFT 10 191/* Recently migrated nodes of stable tree, pending proper placement */
160#define MM_SLOTS_HASH_HEADS (1 << MM_SLOTS_HASH_SHIFT) 192static LIST_HEAD(migrate_nodes);
161static struct hlist_head mm_slots_hash[MM_SLOTS_HASH_HEADS]; 193
194#define MM_SLOTS_HASH_BITS 10
195static DEFINE_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS);
162 196
163static struct mm_slot ksm_mm_head = { 197static struct mm_slot ksm_mm_head = {
164 .mm_list = LIST_HEAD_INIT(ksm_mm_head.mm_list), 198 .mm_list = LIST_HEAD_INIT(ksm_mm_head.mm_list),
@@ -189,10 +223,21 @@ static unsigned int ksm_thread_pages_to_scan = 100;
189/* Milliseconds ksmd should sleep between batches */ 223/* Milliseconds ksmd should sleep between batches */
190static unsigned int ksm_thread_sleep_millisecs = 20; 224static unsigned int ksm_thread_sleep_millisecs = 20;
191 225
226#ifdef CONFIG_NUMA
227/* Zeroed when merging across nodes is not allowed */
228static unsigned int ksm_merge_across_nodes = 1;
229static int ksm_nr_node_ids = 1;
230#else
231#define ksm_merge_across_nodes 1U
232#define ksm_nr_node_ids 1
233#endif
234
192#define KSM_RUN_STOP 0 235#define KSM_RUN_STOP 0
193#define KSM_RUN_MERGE 1 236#define KSM_RUN_MERGE 1
194#define KSM_RUN_UNMERGE 2 237#define KSM_RUN_UNMERGE 2
195static unsigned int ksm_run = KSM_RUN_STOP; 238#define KSM_RUN_OFFLINE 4
239static unsigned long ksm_run = KSM_RUN_STOP;
240static void wait_while_offlining(void);
196 241
197static DECLARE_WAIT_QUEUE_HEAD(ksm_thread_wait); 242static DECLARE_WAIT_QUEUE_HEAD(ksm_thread_wait);
198static DEFINE_MUTEX(ksm_thread_mutex); 243static DEFINE_MUTEX(ksm_thread_mutex);
@@ -275,31 +320,21 @@ static inline void free_mm_slot(struct mm_slot *mm_slot)
275 320
276static struct mm_slot *get_mm_slot(struct mm_struct *mm) 321static struct mm_slot *get_mm_slot(struct mm_struct *mm)
277{ 322{
278 struct mm_slot *mm_slot;
279 struct hlist_head *bucket;
280 struct hlist_node *node; 323 struct hlist_node *node;
324 struct mm_slot *slot;
325
326 hash_for_each_possible(mm_slots_hash, slot, node, link, (unsigned long)mm)
327 if (slot->mm == mm)
328 return slot;
281 329
282 bucket = &mm_slots_hash[hash_ptr(mm, MM_SLOTS_HASH_SHIFT)];
283 hlist_for_each_entry(mm_slot, node, bucket, link) {
284 if (mm == mm_slot->mm)
285 return mm_slot;
286 }
287 return NULL; 330 return NULL;
288} 331}
289 332
290static void insert_to_mm_slots_hash(struct mm_struct *mm, 333static void insert_to_mm_slots_hash(struct mm_struct *mm,
291 struct mm_slot *mm_slot) 334 struct mm_slot *mm_slot)
292{ 335{
293 struct hlist_head *bucket;
294
295 bucket = &mm_slots_hash[hash_ptr(mm, MM_SLOTS_HASH_SHIFT)];
296 mm_slot->mm = mm; 336 mm_slot->mm = mm;
297 hlist_add_head(&mm_slot->link, bucket); 337 hash_add(mm_slots_hash, &mm_slot->link, (unsigned long)mm);
298}
299
300static inline int in_stable_tree(struct rmap_item *rmap_item)
301{
302 return rmap_item->address & STABLE_FLAG;
303} 338}
304 339
305/* 340/*
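This hunk drops the open-coded bucket array in favour of the generic helpers from <linux/hashtable.h>: the table is declared with DEFINE_HASHTABLE(), entries go in with hash_add() keyed on the mm pointer value, lookups walk only the matching bucket with hash_for_each_possible(), and removal (further down in the patch) is hash_del(). A toy sketch of that pattern with this kernel's iterator signature; the struct and function names below are illustrative, not part of the patch:

#include <linux/hashtable.h>

struct thing {
	unsigned long key;
	struct hlist_node link;
};

static DEFINE_HASHTABLE(thing_hash, 10);	/* 2^10 buckets */

static void thing_insert(struct thing *t)
{
	hash_add(thing_hash, &t->link, t->key);
}

static struct thing *thing_lookup(unsigned long key)
{
	struct hlist_node *node;	/* bucket iterator, as this API expects */
	struct thing *t;

	hash_for_each_possible(thing_hash, t, node, link, key)
		if (t->key == key)
			return t;
	return NULL;
}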
@@ -333,7 +368,7 @@ static int break_ksm(struct vm_area_struct *vma, unsigned long addr)
333 368
334 do { 369 do {
335 cond_resched(); 370 cond_resched();
336 page = follow_page(vma, addr, FOLL_GET); 371 page = follow_page(vma, addr, FOLL_GET | FOLL_MIGRATION);
337 if (IS_ERR_OR_NULL(page)) 372 if (IS_ERR_OR_NULL(page))
338 break; 373 break;
339 if (PageKsm(page)) 374 if (PageKsm(page))
@@ -447,6 +482,17 @@ out: page = NULL;
447 return page; 482 return page;
448} 483}
449 484
485/*
486 * This helper is used for getting right index into array of tree roots.
487 * When merge_across_nodes knob is set to 1, there are only two rb-trees for
488 * stable and unstable pages from all nodes with roots in index 0. Otherwise,
489 * every node has its own stable and unstable tree.
490 */
491static inline int get_kpfn_nid(unsigned long kpfn)
492{
493 return ksm_merge_across_nodes ? 0 : pfn_to_nid(kpfn);
494}
495
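With merge_across_nodes set (the default) get_kpfn_nid() always returns 0, so both root arrays degenerate to the old single stable/unstable tree; with it clear, the page frame's node id selects the tree. Every lookup or insert below therefore starts by picking its root, roughly as in the sketch here (the helper name is illustrative, mirroring stable_tree_search() and unstable_tree_search_insert() later in this patch):

/* Sketch: selecting the per-node tree root for a page. */
static struct rb_root *stable_root_of(struct page *page)
{
	int nid = get_kpfn_nid(page_to_pfn(page));

	return root_stable_tree + nid;
}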
450static void remove_node_from_stable_tree(struct stable_node *stable_node) 496static void remove_node_from_stable_tree(struct stable_node *stable_node)
451{ 497{
452 struct rmap_item *rmap_item; 498 struct rmap_item *rmap_item;
@@ -462,7 +508,11 @@ static void remove_node_from_stable_tree(struct stable_node *stable_node)
462 cond_resched(); 508 cond_resched();
463 } 509 }
464 510
465 rb_erase(&stable_node->node, &root_stable_tree); 511 if (stable_node->head == &migrate_nodes)
512 list_del(&stable_node->list);
513 else
514 rb_erase(&stable_node->node,
515 root_stable_tree + NUMA(stable_node->nid));
466 free_stable_node(stable_node); 516 free_stable_node(stable_node);
467} 517}
468 518
@@ -472,6 +522,7 @@ static void remove_node_from_stable_tree(struct stable_node *stable_node)
472 * In which case we can trust the content of the page, and it 522 * In which case we can trust the content of the page, and it
473 * returns the gotten page; but if the page has now been zapped, 523 * returns the gotten page; but if the page has now been zapped,
474 * remove the stale node from the stable tree and return NULL. 524 * remove the stale node from the stable tree and return NULL.
525 * But beware, the stable node's page might be being migrated.
475 * 526 *
476 * You would expect the stable_node to hold a reference to the ksm page. 527 * You would expect the stable_node to hold a reference to the ksm page.
477 * But if it increments the page's count, swapping out has to wait for 528 * But if it increments the page's count, swapping out has to wait for
@@ -482,40 +533,77 @@ static void remove_node_from_stable_tree(struct stable_node *stable_node)
482 * pointing back to this stable node. This relies on freeing a PageAnon 533 * pointing back to this stable node. This relies on freeing a PageAnon
483 * page to reset its page->mapping to NULL, and relies on no other use of 534 * page to reset its page->mapping to NULL, and relies on no other use of
484 * a page to put something that might look like our key in page->mapping. 535 * a page to put something that might look like our key in page->mapping.
485 *
486 * include/linux/pagemap.h page_cache_get_speculative() is a good reference,
487 * but this is different - made simpler by ksm_thread_mutex being held, but
488 * interesting for assuming that no other use of the struct page could ever
489 * put our expected_mapping into page->mapping (or a field of the union which
490 * coincides with page->mapping). The RCU calls are not for KSM at all, but
491 * to keep the page_count protocol described with page_cache_get_speculative.
492 *
493 * Note: it is possible that get_ksm_page() will return NULL one moment,
494 * then page the next, if the page is in between page_freeze_refs() and
495 * page_unfreeze_refs(): this shouldn't be a problem anywhere, the page
496 * is on its way to being freed; but it is an anomaly to bear in mind. 536 * is on its way to being freed; but it is an anomaly to bear in mind.
497 */ 537 */
498static struct page *get_ksm_page(struct stable_node *stable_node) 538static struct page *get_ksm_page(struct stable_node *stable_node, bool lock_it)
499{ 539{
500 struct page *page; 540 struct page *page;
501 void *expected_mapping; 541 void *expected_mapping;
542 unsigned long kpfn;
502 543
503 page = pfn_to_page(stable_node->kpfn);
504 expected_mapping = (void *)stable_node + 544 expected_mapping = (void *)stable_node +
505 (PAGE_MAPPING_ANON | PAGE_MAPPING_KSM); 545 (PAGE_MAPPING_ANON | PAGE_MAPPING_KSM);
506 rcu_read_lock(); 546again:
507 if (page->mapping != expected_mapping) 547 kpfn = ACCESS_ONCE(stable_node->kpfn);
508 goto stale; 548 page = pfn_to_page(kpfn);
509 if (!get_page_unless_zero(page)) 549
550 /*
551 * page is computed from kpfn, so on most architectures reading
552 * page->mapping is naturally ordered after reading node->kpfn,
553 * but on Alpha we need to be more careful.
554 */
555 smp_read_barrier_depends();
556 if (ACCESS_ONCE(page->mapping) != expected_mapping)
510 goto stale; 557 goto stale;
511 if (page->mapping != expected_mapping) { 558
559 /*
560 * We cannot do anything with the page while its refcount is 0.
561 * Usually 0 means free, or tail of a higher-order page: in which
562 * case this node is no longer referenced, and should be freed;
563 * however, it might mean that the page is under page_freeze_refs().
564 * The __remove_mapping() case is easy, again the node is now stale;
565 * but if page is swapcache in migrate_page_move_mapping(), it might
566 * still be our page, in which case it's essential to keep the node.
567 */
568 while (!get_page_unless_zero(page)) {
569 /*
570 * Another check for page->mapping != expected_mapping would
571 * work here too. We have chosen the !PageSwapCache test to
572 * optimize the common case, when the page is or is about to
573 * be freed: PageSwapCache is cleared (under spin_lock_irq)
574 * in the freeze_refs section of __remove_mapping(); but Anon
575 * page->mapping reset to NULL later, in free_pages_prepare().
576 */
577 if (!PageSwapCache(page))
578 goto stale;
579 cpu_relax();
580 }
581
582 if (ACCESS_ONCE(page->mapping) != expected_mapping) {
512 put_page(page); 583 put_page(page);
513 goto stale; 584 goto stale;
514 } 585 }
515 rcu_read_unlock(); 586
587 if (lock_it) {
588 lock_page(page);
589 if (ACCESS_ONCE(page->mapping) != expected_mapping) {
590 unlock_page(page);
591 put_page(page);
592 goto stale;
593 }
594 }
516 return page; 595 return page;
596
517stale: 597stale:
518 rcu_read_unlock(); 598 /*
599 * We come here from above when page->mapping or !PageSwapCache
600 * suggests that the node is stale; but it might be under migration.
601 * We need smp_rmb(), matching the smp_wmb() in ksm_migrate_page(),
602 * before checking whether node->kpfn has been changed.
603 */
604 smp_rmb();
605 if (ACCESS_ONCE(stable_node->kpfn) != kpfn)
606 goto again;
519 remove_node_from_stable_tree(stable_node); 607 remove_node_from_stable_tree(stable_node);
520 return NULL; 608 return NULL;
521} 609}
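The retry at the stale: label only works because ksm_migrate_page() (later in this patch) publishes the new kpfn before the old page is allowed to look stale. Stripped of the surrounding detail, the pairing is roughly:

/* Writer, ksm_migrate_page(), simplified: */
stable_node->kpfn = page_to_pfn(newpage);
smp_wmb();				/* publish new kpfn first */
set_page_stable_node(oldpage, NULL);	/* only then let oldpage go stale */

/* Reader, get_ksm_page(), simplified: */
kpfn = ACCESS_ONCE(stable_node->kpfn);
/* ... page->mapping check fails: node looks stale ... */
smp_rmb();				/* pairs with the smp_wmb() above */
if (ACCESS_ONCE(stable_node->kpfn) != kpfn)
	goto again;			/* it was migration, not freeing */

So a reader that races with migration re-reads kpfn and retries against the new page instead of wrongly pruning the node.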
@@ -531,11 +619,10 @@ static void remove_rmap_item_from_tree(struct rmap_item *rmap_item)
531 struct page *page; 619 struct page *page;
532 620
533 stable_node = rmap_item->head; 621 stable_node = rmap_item->head;
534 page = get_ksm_page(stable_node); 622 page = get_ksm_page(stable_node, true);
535 if (!page) 623 if (!page)
536 goto out; 624 goto out;
537 625
538 lock_page(page);
539 hlist_del(&rmap_item->hlist); 626 hlist_del(&rmap_item->hlist);
540 unlock_page(page); 627 unlock_page(page);
541 put_page(page); 628 put_page(page);
@@ -560,8 +647,8 @@ static void remove_rmap_item_from_tree(struct rmap_item *rmap_item)
560 age = (unsigned char)(ksm_scan.seqnr - rmap_item->address); 647 age = (unsigned char)(ksm_scan.seqnr - rmap_item->address);
561 BUG_ON(age > 1); 648 BUG_ON(age > 1);
562 if (!age) 649 if (!age)
563 rb_erase(&rmap_item->node, &root_unstable_tree); 650 rb_erase(&rmap_item->node,
564 651 root_unstable_tree + NUMA(rmap_item->nid));
565 ksm_pages_unshared--; 652 ksm_pages_unshared--;
566 rmap_item->address &= PAGE_MASK; 653 rmap_item->address &= PAGE_MASK;
567 } 654 }
@@ -581,7 +668,7 @@ static void remove_trailing_rmap_items(struct mm_slot *mm_slot,
581} 668}
582 669
583/* 670/*
584 * Though it's very tempting to unmerge in_stable_tree(rmap_item)s rather 671 * Though it's very tempting to unmerge rmap_items from stable tree rather
585 * than check every pte of a given vma, the locking doesn't quite work for 672 * than check every pte of a given vma, the locking doesn't quite work for
586 * that - an rmap_item is assigned to the stable tree after inserting ksm 673 * that - an rmap_item is assigned to the stable tree after inserting ksm
587 * page and upping mmap_sem. Nor does it fit with the way we skip dup'ing 674 * page and upping mmap_sem. Nor does it fit with the way we skip dup'ing
@@ -614,6 +701,71 @@ static int unmerge_ksm_pages(struct vm_area_struct *vma,
614/* 701/*
615 * Only called through the sysfs control interface: 702 * Only called through the sysfs control interface:
616 */ 703 */
704static int remove_stable_node(struct stable_node *stable_node)
705{
706 struct page *page;
707 int err;
708
709 page = get_ksm_page(stable_node, true);
710 if (!page) {
711 /*
712 * get_ksm_page did remove_node_from_stable_tree itself.
713 */
714 return 0;
715 }
716
717 if (WARN_ON_ONCE(page_mapped(page))) {
718 /*
719 * This should not happen: but if it does, just refuse to let
720 * merge_across_nodes be switched - there is no need to panic.
721 */
722 err = -EBUSY;
723 } else {
724 /*
725 * The stable node did not yet appear stale to get_ksm_page(),
726 * since that allows for an unmapped ksm page to be recognized
727 * right up until it is freed; but the node is safe to remove.
728 * This page might be in a pagevec waiting to be freed,
729 * or it might be PageSwapCache (perhaps under writeback),
730 * or it might have been removed from swapcache a moment ago.
731 */
732 set_page_stable_node(page, NULL);
733 remove_node_from_stable_tree(stable_node);
734 err = 0;
735 }
736
737 unlock_page(page);
738 put_page(page);
739 return err;
740}
741
742static int remove_all_stable_nodes(void)
743{
744 struct stable_node *stable_node;
745 struct list_head *this, *next;
746 int nid;
747 int err = 0;
748
749 for (nid = 0; nid < ksm_nr_node_ids; nid++) {
750 while (root_stable_tree[nid].rb_node) {
751 stable_node = rb_entry(root_stable_tree[nid].rb_node,
752 struct stable_node, node);
753 if (remove_stable_node(stable_node)) {
754 err = -EBUSY;
755 break; /* proceed to next nid */
756 }
757 cond_resched();
758 }
759 }
760 list_for_each_safe(this, next, &migrate_nodes) {
761 stable_node = list_entry(this, struct stable_node, list);
762 if (remove_stable_node(stable_node))
763 err = -EBUSY;
764 cond_resched();
765 }
766 return err;
767}
768
617static int unmerge_and_remove_all_rmap_items(void) 769static int unmerge_and_remove_all_rmap_items(void)
618{ 770{
619 struct mm_slot *mm_slot; 771 struct mm_slot *mm_slot;
@@ -647,7 +799,7 @@ static int unmerge_and_remove_all_rmap_items(void)
647 ksm_scan.mm_slot = list_entry(mm_slot->mm_list.next, 799 ksm_scan.mm_slot = list_entry(mm_slot->mm_list.next,
648 struct mm_slot, mm_list); 800 struct mm_slot, mm_list);
649 if (ksm_test_exit(mm)) { 801 if (ksm_test_exit(mm)) {
650 hlist_del(&mm_slot->link); 802 hash_del(&mm_slot->link);
651 list_del(&mm_slot->mm_list); 803 list_del(&mm_slot->mm_list);
652 spin_unlock(&ksm_mmlist_lock); 804 spin_unlock(&ksm_mmlist_lock);
653 805
@@ -661,6 +813,8 @@ static int unmerge_and_remove_all_rmap_items(void)
661 } 813 }
662 } 814 }
663 815
816 /* Clean up stable nodes, but don't worry if some are still busy */
817 remove_all_stable_nodes();
664 ksm_scan.seqnr = 0; 818 ksm_scan.seqnr = 0;
665 return 0; 819 return 0;
666 820
@@ -946,6 +1100,9 @@ static int try_to_merge_with_ksm_page(struct rmap_item *rmap_item,
946 if (err) 1100 if (err)
947 goto out; 1101 goto out;
948 1102
1103 /* Unstable nid is in union with stable anon_vma: remove first */
1104 remove_rmap_item_from_tree(rmap_item);
1105
949 /* Must get reference to anon_vma while still holding mmap_sem */ 1106 /* Must get reference to anon_vma while still holding mmap_sem */
950 rmap_item->anon_vma = vma->anon_vma; 1107 rmap_item->anon_vma = vma->anon_vma;
951 get_anon_vma(vma->anon_vma); 1108 get_anon_vma(vma->anon_vma);
@@ -996,42 +1153,99 @@ static struct page *try_to_merge_two_pages(struct rmap_item *rmap_item,
996 */ 1153 */
997static struct page *stable_tree_search(struct page *page) 1154static struct page *stable_tree_search(struct page *page)
998{ 1155{
999 struct rb_node *node = root_stable_tree.rb_node; 1156 int nid;
1157 struct rb_root *root;
1158 struct rb_node **new;
1159 struct rb_node *parent;
1000 struct stable_node *stable_node; 1160 struct stable_node *stable_node;
1161 struct stable_node *page_node;
1001 1162
1002 stable_node = page_stable_node(page); 1163 page_node = page_stable_node(page);
1003 if (stable_node) { /* ksm page forked */ 1164 if (page_node && page_node->head != &migrate_nodes) {
1165 /* ksm page forked */
1004 get_page(page); 1166 get_page(page);
1005 return page; 1167 return page;
1006 } 1168 }
1007 1169
1008 while (node) { 1170 nid = get_kpfn_nid(page_to_pfn(page));
1171 root = root_stable_tree + nid;
1172again:
1173 new = &root->rb_node;
1174 parent = NULL;
1175
1176 while (*new) {
1009 struct page *tree_page; 1177 struct page *tree_page;
1010 int ret; 1178 int ret;
1011 1179
1012 cond_resched(); 1180 cond_resched();
1013 stable_node = rb_entry(node, struct stable_node, node); 1181 stable_node = rb_entry(*new, struct stable_node, node);
1014 tree_page = get_ksm_page(stable_node); 1182 tree_page = get_ksm_page(stable_node, false);
1015 if (!tree_page) 1183 if (!tree_page)
1016 return NULL; 1184 return NULL;
1017 1185
1018 ret = memcmp_pages(page, tree_page); 1186 ret = memcmp_pages(page, tree_page);
1187 put_page(tree_page);
1019 1188
1020 if (ret < 0) { 1189 parent = *new;
1021 put_page(tree_page); 1190 if (ret < 0)
1022 node = node->rb_left; 1191 new = &parent->rb_left;
1023 } else if (ret > 0) { 1192 else if (ret > 0)
1024 put_page(tree_page); 1193 new = &parent->rb_right;
1025 node = node->rb_right; 1194 else {
1026 } else 1195 /*
1027 return tree_page; 1196 * Lock and unlock the stable_node's page (which
1197 * might already have been migrated) so that page
1198 * migration is sure to notice its raised count.
1199 * It would be more elegant to return stable_node
1200 * than kpage, but that involves more changes.
1201 */
1202 tree_page = get_ksm_page(stable_node, true);
1203 if (tree_page) {
1204 unlock_page(tree_page);
1205 if (get_kpfn_nid(stable_node->kpfn) !=
1206 NUMA(stable_node->nid)) {
1207 put_page(tree_page);
1208 goto replace;
1209 }
1210 return tree_page;
1211 }
1212 /*
1213 * There is now a place for page_node, but the tree may
1214 * have been rebalanced, so re-evaluate parent and new.
1215 */
1216 if (page_node)
1217 goto again;
1218 return NULL;
1219 }
1028 } 1220 }
1029 1221
1030 return NULL; 1222 if (!page_node)
1223 return NULL;
1224
1225 list_del(&page_node->list);
1226 DO_NUMA(page_node->nid = nid);
1227 rb_link_node(&page_node->node, parent, new);
1228 rb_insert_color(&page_node->node, root);
1229 get_page(page);
1230 return page;
1231
1232replace:
1233 if (page_node) {
1234 list_del(&page_node->list);
1235 DO_NUMA(page_node->nid = nid);
1236 rb_replace_node(&stable_node->node, &page_node->node, root);
1237 get_page(page);
1238 } else {
1239 rb_erase(&stable_node->node, root);
1240 page = NULL;
1241 }
1242 stable_node->head = &migrate_nodes;
1243 list_add(&stable_node->list, stable_node->head);
1244 return page;
1031} 1245}
1032 1246
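Both the search above and stable_tree_insert() below walk the tree with the standard kernel rbtree idiom: descend via a struct rb_node ** cursor, remember the parent, and when the walk falls off the tree link the new node at that slot with rb_link_node() plus rb_insert_color(). A stripped-down sketch of the idiom; struct item and item_cmp() are placeholders, not code from this file:

#include <linux/rbtree.h>

static void item_insert(struct rb_root *root, struct item *item)
{
	struct rb_node **new = &root->rb_node, *parent = NULL;

	while (*new) {
		parent = *new;
		if (item_cmp(item, rb_entry(parent, struct item, node)) < 0)
			new = &parent->rb_left;
		else
			new = &parent->rb_right;
	}
	rb_link_node(&item->node, parent, new);
	rb_insert_color(&item->node, root);	/* rebalance/recolour */
}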
1033/* 1247/*
1034 * stable_tree_insert - insert rmap_item pointing to new ksm page 1248 * stable_tree_insert - insert stable tree node pointing to new ksm page
1035 * into the stable tree. 1249 * into the stable tree.
1036 * 1250 *
1037 * This function returns the stable tree node just allocated on success, 1251 * This function returns the stable tree node just allocated on success,
@@ -1039,17 +1253,25 @@ static struct page *stable_tree_search(struct page *page)
1039 */ 1253 */
1040static struct stable_node *stable_tree_insert(struct page *kpage) 1254static struct stable_node *stable_tree_insert(struct page *kpage)
1041{ 1255{
1042 struct rb_node **new = &root_stable_tree.rb_node; 1256 int nid;
1257 unsigned long kpfn;
1258 struct rb_root *root;
1259 struct rb_node **new;
1043 struct rb_node *parent = NULL; 1260 struct rb_node *parent = NULL;
1044 struct stable_node *stable_node; 1261 struct stable_node *stable_node;
1045 1262
1263 kpfn = page_to_pfn(kpage);
1264 nid = get_kpfn_nid(kpfn);
1265 root = root_stable_tree + nid;
1266 new = &root->rb_node;
1267
1046 while (*new) { 1268 while (*new) {
1047 struct page *tree_page; 1269 struct page *tree_page;
1048 int ret; 1270 int ret;
1049 1271
1050 cond_resched(); 1272 cond_resched();
1051 stable_node = rb_entry(*new, struct stable_node, node); 1273 stable_node = rb_entry(*new, struct stable_node, node);
1052 tree_page = get_ksm_page(stable_node); 1274 tree_page = get_ksm_page(stable_node, false);
1053 if (!tree_page) 1275 if (!tree_page)
1054 return NULL; 1276 return NULL;
1055 1277
@@ -1075,13 +1297,12 @@ static struct stable_node *stable_tree_insert(struct page *kpage)
1075 if (!stable_node) 1297 if (!stable_node)
1076 return NULL; 1298 return NULL;
1077 1299
1078 rb_link_node(&stable_node->node, parent, new);
1079 rb_insert_color(&stable_node->node, &root_stable_tree);
1080
1081 INIT_HLIST_HEAD(&stable_node->hlist); 1300 INIT_HLIST_HEAD(&stable_node->hlist);
1082 1301 stable_node->kpfn = kpfn;
1083 stable_node->kpfn = page_to_pfn(kpage);
1084 set_page_stable_node(kpage, stable_node); 1302 set_page_stable_node(kpage, stable_node);
1303 DO_NUMA(stable_node->nid = nid);
1304 rb_link_node(&stable_node->node, parent, new);
1305 rb_insert_color(&stable_node->node, root);
1085 1306
1086 return stable_node; 1307 return stable_node;
1087} 1308}
@@ -1104,10 +1325,15 @@ static
1104struct rmap_item *unstable_tree_search_insert(struct rmap_item *rmap_item, 1325struct rmap_item *unstable_tree_search_insert(struct rmap_item *rmap_item,
1105 struct page *page, 1326 struct page *page,
1106 struct page **tree_pagep) 1327 struct page **tree_pagep)
1107
1108{ 1328{
1109 struct rb_node **new = &root_unstable_tree.rb_node; 1329 struct rb_node **new;
1330 struct rb_root *root;
1110 struct rb_node *parent = NULL; 1331 struct rb_node *parent = NULL;
1332 int nid;
1333
1334 nid = get_kpfn_nid(page_to_pfn(page));
1335 root = root_unstable_tree + nid;
1336 new = &root->rb_node;
1111 1337
1112 while (*new) { 1338 while (*new) {
1113 struct rmap_item *tree_rmap_item; 1339 struct rmap_item *tree_rmap_item;
@@ -1137,6 +1363,15 @@ struct rmap_item *unstable_tree_search_insert(struct rmap_item *rmap_item,
1137 } else if (ret > 0) { 1363 } else if (ret > 0) {
1138 put_page(tree_page); 1364 put_page(tree_page);
1139 new = &parent->rb_right; 1365 new = &parent->rb_right;
1366 } else if (!ksm_merge_across_nodes &&
1367 page_to_nid(tree_page) != nid) {
1368 /*
1369 * If tree_page has been migrated to another NUMA node,
1370 * it will be flushed out and put in the right unstable
1371 * tree next time: only merge with it when across_nodes.
1372 */
1373 put_page(tree_page);
1374 return NULL;
1140 } else { 1375 } else {
1141 *tree_pagep = tree_page; 1376 *tree_pagep = tree_page;
1142 return tree_rmap_item; 1377 return tree_rmap_item;
@@ -1145,8 +1380,9 @@ struct rmap_item *unstable_tree_search_insert(struct rmap_item *rmap_item,
1145 1380
1146 rmap_item->address |= UNSTABLE_FLAG; 1381 rmap_item->address |= UNSTABLE_FLAG;
1147 rmap_item->address |= (ksm_scan.seqnr & SEQNR_MASK); 1382 rmap_item->address |= (ksm_scan.seqnr & SEQNR_MASK);
1383 DO_NUMA(rmap_item->nid = nid);
1148 rb_link_node(&rmap_item->node, parent, new); 1384 rb_link_node(&rmap_item->node, parent, new);
1149 rb_insert_color(&rmap_item->node, &root_unstable_tree); 1385 rb_insert_color(&rmap_item->node, root);
1150 1386
1151 ksm_pages_unshared++; 1387 ksm_pages_unshared++;
1152 return NULL; 1388 return NULL;
@@ -1188,10 +1424,29 @@ static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item)
1188 unsigned int checksum; 1424 unsigned int checksum;
1189 int err; 1425 int err;
1190 1426
1191 remove_rmap_item_from_tree(rmap_item); 1427 stable_node = page_stable_node(page);
1428 if (stable_node) {
1429 if (stable_node->head != &migrate_nodes &&
1430 get_kpfn_nid(stable_node->kpfn) != NUMA(stable_node->nid)) {
1431 rb_erase(&stable_node->node,
1432 root_stable_tree + NUMA(stable_node->nid));
1433 stable_node->head = &migrate_nodes;
1434 list_add(&stable_node->list, stable_node->head);
1435 }
1436 if (stable_node->head != &migrate_nodes &&
1437 rmap_item->head == stable_node)
1438 return;
1439 }
1192 1440
1193 /* We first start with searching the page inside the stable tree */ 1441 /* We first start with searching the page inside the stable tree */
1194 kpage = stable_tree_search(page); 1442 kpage = stable_tree_search(page);
1443 if (kpage == page && rmap_item->head == stable_node) {
1444 put_page(kpage);
1445 return;
1446 }
1447
1448 remove_rmap_item_from_tree(rmap_item);
1449
1195 if (kpage) { 1450 if (kpage) {
1196 err = try_to_merge_with_ksm_page(rmap_item, page, kpage); 1451 err = try_to_merge_with_ksm_page(rmap_item, page, kpage);
1197 if (!err) { 1452 if (!err) {
@@ -1225,14 +1480,11 @@ static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item)
1225 kpage = try_to_merge_two_pages(rmap_item, page, 1480 kpage = try_to_merge_two_pages(rmap_item, page,
1226 tree_rmap_item, tree_page); 1481 tree_rmap_item, tree_page);
1227 put_page(tree_page); 1482 put_page(tree_page);
1228 /*
1229 * As soon as we merge this page, we want to remove the
1230 * rmap_item of the page we have merged with from the unstable
1231 * tree, and insert it instead as new node in the stable tree.
1232 */
1233 if (kpage) { 1483 if (kpage) {
1234 remove_rmap_item_from_tree(tree_rmap_item); 1484 /*
1235 1485 * The pages were successfully merged: insert new
1486 * node in the stable tree and add both rmap_items.
1487 */
1236 lock_page(kpage); 1488 lock_page(kpage);
1237 stable_node = stable_tree_insert(kpage); 1489 stable_node = stable_tree_insert(kpage);
1238 if (stable_node) { 1490 if (stable_node) {
@@ -1289,6 +1541,7 @@ static struct rmap_item *scan_get_next_rmap_item(struct page **page)
1289 struct mm_slot *slot; 1541 struct mm_slot *slot;
1290 struct vm_area_struct *vma; 1542 struct vm_area_struct *vma;
1291 struct rmap_item *rmap_item; 1543 struct rmap_item *rmap_item;
1544 int nid;
1292 1545
1293 if (list_empty(&ksm_mm_head.mm_list)) 1546 if (list_empty(&ksm_mm_head.mm_list))
1294 return NULL; 1547 return NULL;
@@ -1307,7 +1560,29 @@ static struct rmap_item *scan_get_next_rmap_item(struct page **page)
1307 */ 1560 */
1308 lru_add_drain_all(); 1561 lru_add_drain_all();
1309 1562
1310 root_unstable_tree = RB_ROOT; 1563 /*
1564 * Whereas stale stable_nodes on the stable_tree itself
1565 * get pruned in the regular course of stable_tree_search(),
1566 * those moved out to the migrate_nodes list can accumulate:
1567 * so prune them once before each full scan.
1568 */
1569 if (!ksm_merge_across_nodes) {
1570 struct stable_node *stable_node;
1571 struct list_head *this, *next;
1572 struct page *page;
1573
1574 list_for_each_safe(this, next, &migrate_nodes) {
1575 stable_node = list_entry(this,
1576 struct stable_node, list);
1577 page = get_ksm_page(stable_node, false);
1578 if (page)
1579 put_page(page);
1580 cond_resched();
1581 }
1582 }
1583
1584 for (nid = 0; nid < ksm_nr_node_ids; nid++)
1585 root_unstable_tree[nid] = RB_ROOT;
1311 1586
1312 spin_lock(&ksm_mmlist_lock); 1587 spin_lock(&ksm_mmlist_lock);
1313 slot = list_entry(slot->mm_list.next, struct mm_slot, mm_list); 1588 slot = list_entry(slot->mm_list.next, struct mm_slot, mm_list);
@@ -1392,7 +1667,7 @@ next_mm:
1392 * or when all VM_MERGEABLE areas have been unmapped (and 1667 * or when all VM_MERGEABLE areas have been unmapped (and
1393 * mmap_sem then protects against race with MADV_MERGEABLE). 1668 * mmap_sem then protects against race with MADV_MERGEABLE).
1394 */ 1669 */
1395 hlist_del(&slot->link); 1670 hash_del(&slot->link);
1396 list_del(&slot->mm_list); 1671 list_del(&slot->mm_list);
1397 spin_unlock(&ksm_mmlist_lock); 1672 spin_unlock(&ksm_mmlist_lock);
1398 1673
@@ -1428,8 +1703,7 @@ static void ksm_do_scan(unsigned int scan_npages)
1428 rmap_item = scan_get_next_rmap_item(&page); 1703 rmap_item = scan_get_next_rmap_item(&page);
1429 if (!rmap_item) 1704 if (!rmap_item)
1430 return; 1705 return;
1431 if (!PageKsm(page) || !in_stable_tree(rmap_item)) 1706 cmp_and_merge_page(page, rmap_item);
1432 cmp_and_merge_page(page, rmap_item);
1433 put_page(page); 1707 put_page(page);
1434 } 1708 }
1435} 1709}
@@ -1446,6 +1720,7 @@ static int ksm_scan_thread(void *nothing)
1446 1720
1447 while (!kthread_should_stop()) { 1721 while (!kthread_should_stop()) {
1448 mutex_lock(&ksm_thread_mutex); 1722 mutex_lock(&ksm_thread_mutex);
1723 wait_while_offlining();
1449 if (ksmd_should_run()) 1724 if (ksmd_should_run())
1450 ksm_do_scan(ksm_thread_pages_to_scan); 1725 ksm_do_scan(ksm_thread_pages_to_scan);
1451 mutex_unlock(&ksm_thread_mutex); 1726 mutex_unlock(&ksm_thread_mutex);
@@ -1525,11 +1800,19 @@ int __ksm_enter(struct mm_struct *mm)
1525 spin_lock(&ksm_mmlist_lock); 1800 spin_lock(&ksm_mmlist_lock);
1526 insert_to_mm_slots_hash(mm, mm_slot); 1801 insert_to_mm_slots_hash(mm, mm_slot);
1527 /* 1802 /*
1528 * Insert just behind the scanning cursor, to let the area settle 1803 * When KSM_RUN_MERGE (or KSM_RUN_STOP),
1804 * insert just behind the scanning cursor, to let the area settle
1529 * down a little; when fork is followed by immediate exec, we don't 1805 * down a little; when fork is followed by immediate exec, we don't
1530 * want ksmd to waste time setting up and tearing down an rmap_list. 1806 * want ksmd to waste time setting up and tearing down an rmap_list.
1807 *
1808 * But when KSM_RUN_UNMERGE, it's important to insert ahead of its
1809 * scanning cursor, otherwise KSM pages in newly forked mms will be
1810 * missed: then we might as well insert at the end of the list.
1531 */ 1811 */
1532 list_add_tail(&mm_slot->mm_list, &ksm_scan.mm_slot->mm_list); 1812 if (ksm_run & KSM_RUN_UNMERGE)
1813 list_add_tail(&mm_slot->mm_list, &ksm_mm_head.mm_list);
1814 else
1815 list_add_tail(&mm_slot->mm_list, &ksm_scan.mm_slot->mm_list);
1533 spin_unlock(&ksm_mmlist_lock); 1816 spin_unlock(&ksm_mmlist_lock);
1534 1817
1535 set_bit(MMF_VM_MERGEABLE, &mm->flags); 1818 set_bit(MMF_VM_MERGEABLE, &mm->flags);
@@ -1559,7 +1842,7 @@ void __ksm_exit(struct mm_struct *mm)
1559 mm_slot = get_mm_slot(mm); 1842 mm_slot = get_mm_slot(mm);
1560 if (mm_slot && ksm_scan.mm_slot != mm_slot) { 1843 if (mm_slot && ksm_scan.mm_slot != mm_slot) {
1561 if (!mm_slot->rmap_list) { 1844 if (!mm_slot->rmap_list) {
1562 hlist_del(&mm_slot->link); 1845 hash_del(&mm_slot->link);
1563 list_del(&mm_slot->mm_list); 1846 list_del(&mm_slot->mm_list);
1564 easy_to_free = 1; 1847 easy_to_free = 1;
1565 } else { 1848 } else {
@@ -1579,24 +1862,32 @@ void __ksm_exit(struct mm_struct *mm)
1579 } 1862 }
1580} 1863}
1581 1864
1582struct page *ksm_does_need_to_copy(struct page *page, 1865struct page *ksm_might_need_to_copy(struct page *page,
1583 struct vm_area_struct *vma, unsigned long address) 1866 struct vm_area_struct *vma, unsigned long address)
1584{ 1867{
1868 struct anon_vma *anon_vma = page_anon_vma(page);
1585 struct page *new_page; 1869 struct page *new_page;
1586 1870
1871 if (PageKsm(page)) {
1872 if (page_stable_node(page) &&
1873 !(ksm_run & KSM_RUN_UNMERGE))
1874 return page; /* no need to copy it */
1875 } else if (!anon_vma) {
1876 return page; /* no need to copy it */
1877 } else if (anon_vma->root == vma->anon_vma->root &&
1878 page->index == linear_page_index(vma, address)) {
1879 return page; /* still no need to copy it */
1880 }
1881 if (!PageUptodate(page))
1882 return page; /* let do_swap_page report the error */
1883
1587 new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); 1884 new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
1588 if (new_page) { 1885 if (new_page) {
1589 copy_user_highpage(new_page, page, address, vma); 1886 copy_user_highpage(new_page, page, address, vma);
1590 1887
1591 SetPageDirty(new_page); 1888 SetPageDirty(new_page);
1592 __SetPageUptodate(new_page); 1889 __SetPageUptodate(new_page);
1593 SetPageSwapBacked(new_page);
1594 __set_page_locked(new_page); 1890 __set_page_locked(new_page);
1595
1596 if (!mlocked_vma_newpage(vma, new_page))
1597 lru_cache_add_lru(new_page, LRU_ACTIVE_ANON);
1598 else
1599 add_page_to_unevictable_list(new_page);
1600 } 1891 }
1601 1892
1602 return new_page; 1893 return new_page;
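The rename from ksm_does_need_to_copy() reflects that the decision now lives inside the helper: callers on the swap-in path no longer test PageKsm() themselves, they just pass every page through. A caller would look roughly like this sketch (the VM_FAULT_OOM handling shown is illustrative, not quoted from mm/memory.c):

	page = ksm_might_need_to_copy(page, vma, address);
	if (!page) {
		/* a copy was needed but allocation failed */
		ret = VM_FAULT_OOM;
		goto out;
	}
	/* otherwise page is either the original or a private copy */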
@@ -1773,64 +2064,115 @@ void ksm_migrate_page(struct page *newpage, struct page *oldpage)
1773 if (stable_node) { 2064 if (stable_node) {
1774 VM_BUG_ON(stable_node->kpfn != page_to_pfn(oldpage)); 2065 VM_BUG_ON(stable_node->kpfn != page_to_pfn(oldpage));
1775 stable_node->kpfn = page_to_pfn(newpage); 2066 stable_node->kpfn = page_to_pfn(newpage);
2067 /*
2068 * newpage->mapping was set in advance; now we need smp_wmb()
2069 * to make sure that the new stable_node->kpfn is visible
2070 * to get_ksm_page() before it can see that oldpage->mapping
2071 * has gone stale (or that PageSwapCache has been cleared).
2072 */
2073 smp_wmb();
2074 set_page_stable_node(oldpage, NULL);
1776 } 2075 }
1777} 2076}
1778#endif /* CONFIG_MIGRATION */ 2077#endif /* CONFIG_MIGRATION */
1779 2078
1780#ifdef CONFIG_MEMORY_HOTREMOVE 2079#ifdef CONFIG_MEMORY_HOTREMOVE
1781static struct stable_node *ksm_check_stable_tree(unsigned long start_pfn, 2080static int just_wait(void *word)
1782 unsigned long end_pfn)
1783{ 2081{
1784 struct rb_node *node; 2082 schedule();
2083 return 0;
2084}
1785 2085
1786 for (node = rb_first(&root_stable_tree); node; node = rb_next(node)) { 2086static void wait_while_offlining(void)
1787 struct stable_node *stable_node; 2087{
2088 while (ksm_run & KSM_RUN_OFFLINE) {
2089 mutex_unlock(&ksm_thread_mutex);
2090 wait_on_bit(&ksm_run, ilog2(KSM_RUN_OFFLINE),
2091 just_wait, TASK_UNINTERRUPTIBLE);
2092 mutex_lock(&ksm_thread_mutex);
2093 }
2094}
1788 2095
1789 stable_node = rb_entry(node, struct stable_node, node); 2096static void ksm_check_stable_tree(unsigned long start_pfn,
2097 unsigned long end_pfn)
2098{
2099 struct stable_node *stable_node;
2100 struct list_head *this, *next;
2101 struct rb_node *node;
2102 int nid;
2103
2104 for (nid = 0; nid < ksm_nr_node_ids; nid++) {
2105 node = rb_first(root_stable_tree + nid);
2106 while (node) {
2107 stable_node = rb_entry(node, struct stable_node, node);
2108 if (stable_node->kpfn >= start_pfn &&
2109 stable_node->kpfn < end_pfn) {
2110 /*
2111 * Don't get_ksm_page, page has already gone:
2112 * which is why we keep kpfn instead of page*
2113 */
2114 remove_node_from_stable_tree(stable_node);
2115 node = rb_first(root_stable_tree + nid);
2116 } else
2117 node = rb_next(node);
2118 cond_resched();
2119 }
2120 }
2121 list_for_each_safe(this, next, &migrate_nodes) {
2122 stable_node = list_entry(this, struct stable_node, list);
1790 if (stable_node->kpfn >= start_pfn && 2123 if (stable_node->kpfn >= start_pfn &&
1791 stable_node->kpfn < end_pfn) 2124 stable_node->kpfn < end_pfn)
1792 return stable_node; 2125 remove_node_from_stable_tree(stable_node);
2126 cond_resched();
1793 } 2127 }
1794 return NULL;
1795} 2128}
1796 2129
1797static int ksm_memory_callback(struct notifier_block *self, 2130static int ksm_memory_callback(struct notifier_block *self,
1798 unsigned long action, void *arg) 2131 unsigned long action, void *arg)
1799{ 2132{
1800 struct memory_notify *mn = arg; 2133 struct memory_notify *mn = arg;
1801 struct stable_node *stable_node;
1802 2134
1803 switch (action) { 2135 switch (action) {
1804 case MEM_GOING_OFFLINE: 2136 case MEM_GOING_OFFLINE:
1805 /* 2137 /*
1806 * Keep it very simple for now: just lock out ksmd and 2138 * Prevent ksm_do_scan(), unmerge_and_remove_all_rmap_items()
1807 * MADV_UNMERGEABLE while any memory is going offline. 2139 * and remove_all_stable_nodes() while memory is going offline:
1808 * mutex_lock_nested() is necessary because lockdep was alarmed 2140 * it is unsafe for them to touch the stable tree at this time.
1809 * that here we take ksm_thread_mutex inside notifier chain 2141 * But unmerge_ksm_pages(), rmap lookups and other entry points
1810 * mutex, and later take notifier chain mutex inside 2142 * which do not need the ksm_thread_mutex are all safe.
1811 * ksm_thread_mutex to unlock it. But that's safe because both
1812 * are inside mem_hotplug_mutex.
1813 */ 2143 */
1814 mutex_lock_nested(&ksm_thread_mutex, SINGLE_DEPTH_NESTING); 2144 mutex_lock(&ksm_thread_mutex);
2145 ksm_run |= KSM_RUN_OFFLINE;
2146 mutex_unlock(&ksm_thread_mutex);
1815 break; 2147 break;
1816 2148
1817 case MEM_OFFLINE: 2149 case MEM_OFFLINE:
1818 /* 2150 /*
1819 * Most of the work is done by page migration; but there might 2151 * Most of the work is done by page migration; but there might
1820 * be a few stable_nodes left over, still pointing to struct 2152 * be a few stable_nodes left over, still pointing to struct
1821 * pages which have been offlined: prune those from the tree. 2153 * pages which have been offlined: prune those from the tree,
2154 * otherwise get_ksm_page() might later try to access a
2155 * non-existent struct page.
1822 */ 2156 */
1823 while ((stable_node = ksm_check_stable_tree(mn->start_pfn, 2157 ksm_check_stable_tree(mn->start_pfn,
1824 mn->start_pfn + mn->nr_pages)) != NULL) 2158 mn->start_pfn + mn->nr_pages);
1825 remove_node_from_stable_tree(stable_node);
1826 /* fallthrough */ 2159 /* fallthrough */
1827 2160
1828 case MEM_CANCEL_OFFLINE: 2161 case MEM_CANCEL_OFFLINE:
2162 mutex_lock(&ksm_thread_mutex);
2163 ksm_run &= ~KSM_RUN_OFFLINE;
1829 mutex_unlock(&ksm_thread_mutex); 2164 mutex_unlock(&ksm_thread_mutex);
2165
2166 smp_mb(); /* wake_up_bit advises this */
2167 wake_up_bit(&ksm_run, ilog2(KSM_RUN_OFFLINE));
1830 break; 2168 break;
1831 } 2169 }
1832 return NOTIFY_OK; 2170 return NOTIFY_OK;
1833} 2171}
2172#else
2173static void wait_while_offlining(void)
2174{
2175}
1834#endif /* CONFIG_MEMORY_HOTREMOVE */ 2176#endif /* CONFIG_MEMORY_HOTREMOVE */
1835 2177
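The memory hot-remove path no longer holds ksm_thread_mutex across the whole offline operation, which is what the deleted mutex_lock_nested() comment was working around. Instead MEM_GOING_OFFLINE sets KSM_RUN_OFFLINE under the mutex, ksmd and the sysfs writers park in wait_while_offlining(), and MEM_OFFLINE / MEM_CANCEL_OFFLINE clear the bit and wake them. This is also why ksm_run became an unsigned long earlier in the patch: wait_on_bit()/wake_up_bit() address a bit within a word. Condensed:

/* notifier, MEM_GOING_OFFLINE (under ksm_thread_mutex): */
ksm_run |= KSM_RUN_OFFLINE;

/* ksmd and sysfs writers, before touching the trees: */
wait_on_bit(&ksm_run, ilog2(KSM_RUN_OFFLINE),
	    just_wait, TASK_UNINTERRUPTIBLE);

/* notifier, MEM_OFFLINE or MEM_CANCEL_OFFLINE: */
ksm_run &= ~KSM_RUN_OFFLINE;
smp_mb();
wake_up_bit(&ksm_run, ilog2(KSM_RUN_OFFLINE));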
1836#ifdef CONFIG_SYSFS 2178#ifdef CONFIG_SYSFS
@@ -1893,7 +2235,7 @@ KSM_ATTR(pages_to_scan);
1893static ssize_t run_show(struct kobject *kobj, struct kobj_attribute *attr, 2235static ssize_t run_show(struct kobject *kobj, struct kobj_attribute *attr,
1894 char *buf) 2236 char *buf)
1895{ 2237{
1896 return sprintf(buf, "%u\n", ksm_run); 2238 return sprintf(buf, "%lu\n", ksm_run);
1897} 2239}
1898 2240
1899static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr, 2241static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr,
@@ -1916,6 +2258,7 @@ static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr,
1916 */ 2258 */
1917 2259
1918 mutex_lock(&ksm_thread_mutex); 2260 mutex_lock(&ksm_thread_mutex);
2261 wait_while_offlining();
1919 if (ksm_run != flags) { 2262 if (ksm_run != flags) {
1920 ksm_run = flags; 2263 ksm_run = flags;
1921 if (flags & KSM_RUN_UNMERGE) { 2264 if (flags & KSM_RUN_UNMERGE) {
@@ -1937,6 +2280,64 @@ static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr,
1937} 2280}
1938KSM_ATTR(run); 2281KSM_ATTR(run);
1939 2282
2283#ifdef CONFIG_NUMA
2284static ssize_t merge_across_nodes_show(struct kobject *kobj,
2285 struct kobj_attribute *attr, char *buf)
2286{
2287 return sprintf(buf, "%u\n", ksm_merge_across_nodes);
2288}
2289
2290static ssize_t merge_across_nodes_store(struct kobject *kobj,
2291 struct kobj_attribute *attr,
2292 const char *buf, size_t count)
2293{
2294 int err;
2295 unsigned long knob;
2296
2297 err = kstrtoul(buf, 10, &knob);
2298 if (err)
2299 return err;
2300 if (knob > 1)
2301 return -EINVAL;
2302
2303 mutex_lock(&ksm_thread_mutex);
2304 wait_while_offlining();
2305 if (ksm_merge_across_nodes != knob) {
2306 if (ksm_pages_shared || remove_all_stable_nodes())
2307 err = -EBUSY;
2308 else if (root_stable_tree == one_stable_tree) {
2309 struct rb_root *buf;
2310 /*
2311 * This is the first time that we switch away from the
2312 * default of merging across nodes: must now allocate
2313 * a buffer to hold as many roots as may be needed.
2314 * Allocate stable and unstable together:
2315 * MAXSMP NODES_SHIFT 10 will use 16kB.
2316 */
2317 buf = kcalloc(nr_node_ids + nr_node_ids,
2318 sizeof(*buf), GFP_KERNEL | __GFP_ZERO);
 2319 /* Let us assume that an all-zeroes rb_root is equivalent to RB_ROOT */
2320 if (!buf)
2321 err = -ENOMEM;
2322 else {
2323 root_stable_tree = buf;
2324 root_unstable_tree = buf + nr_node_ids;
2325 /* Stable tree is empty but not the unstable */
2326 root_unstable_tree[0] = one_unstable_tree[0];
2327 }
2328 }
2329 if (!err) {
2330 ksm_merge_across_nodes = knob;
2331 ksm_nr_node_ids = knob ? 1 : nr_node_ids;
2332 }
2333 }
2334 mutex_unlock(&ksm_thread_mutex);
2335
2336 return err ? err : count;
2337}
2338KSM_ATTR(merge_across_nodes);
2339#endif
2340
1940static ssize_t pages_shared_show(struct kobject *kobj, 2341static ssize_t pages_shared_show(struct kobject *kobj,
1941 struct kobj_attribute *attr, char *buf) 2342 struct kobj_attribute *attr, char *buf)
1942{ 2343{
@@ -1991,6 +2392,9 @@ static struct attribute *ksm_attrs[] = {
1991 &pages_unshared_attr.attr, 2392 &pages_unshared_attr.attr,
1992 &pages_volatile_attr.attr, 2393 &pages_volatile_attr.attr,
1993 &full_scans_attr.attr, 2394 &full_scans_attr.attr,
2395#ifdef CONFIG_NUMA
2396 &merge_across_nodes_attr.attr,
2397#endif
1994 NULL, 2398 NULL,
1995}; 2399};
1996 2400
@@ -2029,10 +2433,7 @@ static int __init ksm_init(void)
2029#endif /* CONFIG_SYSFS */ 2433#endif /* CONFIG_SYSFS */
2030 2434
2031#ifdef CONFIG_MEMORY_HOTREMOVE 2435#ifdef CONFIG_MEMORY_HOTREMOVE
2032 /* 2436 /* There is no significance to this priority 100 */
2033 * Choose a high priority since the callback takes ksm_thread_mutex:
2034 * later callbacks could only be taking locks which nest within that.
2035 */
2036 hotplug_memory_notifier(ksm_memory_callback, 100); 2437 hotplug_memory_notifier(ksm_memory_callback, 100);
2037#endif 2438#endif
2038 return 0; 2439 return 0;
diff --git a/mm/madvise.c b/mm/madvise.c
index 03dfa5c7adb3..c58c94b56c3d 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -16,6 +16,9 @@
16#include <linux/ksm.h> 16#include <linux/ksm.h>
17#include <linux/fs.h> 17#include <linux/fs.h>
18#include <linux/file.h> 18#include <linux/file.h>
19#include <linux/blkdev.h>
20#include <linux/swap.h>
21#include <linux/swapops.h>
19 22
20/* 23/*
21 * Any behaviour which results in changes to the vma->vm_flags needs to 24 * Any behaviour which results in changes to the vma->vm_flags needs to
@@ -131,6 +134,84 @@ out:
131 return error; 134 return error;
132} 135}
133 136
137#ifdef CONFIG_SWAP
138static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start,
139 unsigned long end, struct mm_walk *walk)
140{
141 pte_t *orig_pte;
142 struct vm_area_struct *vma = walk->private;
143 unsigned long index;
144
145 if (pmd_none_or_trans_huge_or_clear_bad(pmd))
146 return 0;
147
148 for (index = start; index != end; index += PAGE_SIZE) {
149 pte_t pte;
150 swp_entry_t entry;
151 struct page *page;
152 spinlock_t *ptl;
153
154 orig_pte = pte_offset_map_lock(vma->vm_mm, pmd, start, &ptl);
155 pte = *(orig_pte + ((index - start) / PAGE_SIZE));
156 pte_unmap_unlock(orig_pte, ptl);
157
158 if (pte_present(pte) || pte_none(pte) || pte_file(pte))
159 continue;
160 entry = pte_to_swp_entry(pte);
161 if (unlikely(non_swap_entry(entry)))
162 continue;
163
164 page = read_swap_cache_async(entry, GFP_HIGHUSER_MOVABLE,
165 vma, index);
166 if (page)
167 page_cache_release(page);
168 }
169
170 return 0;
171}
172
173static void force_swapin_readahead(struct vm_area_struct *vma,
174 unsigned long start, unsigned long end)
175{
176 struct mm_walk walk = {
177 .mm = vma->vm_mm,
178 .pmd_entry = swapin_walk_pmd_entry,
179 .private = vma,
180 };
181
182 walk_page_range(start, end, &walk);
183
184 lru_add_drain(); /* Push any new pages onto the LRU now */
185}
186
187static void force_shm_swapin_readahead(struct vm_area_struct *vma,
188 unsigned long start, unsigned long end,
189 struct address_space *mapping)
190{
191 pgoff_t index;
192 struct page *page;
193 swp_entry_t swap;
194
195 for (; start < end; start += PAGE_SIZE) {
196 index = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
197
198 page = find_get_page(mapping, index);
199 if (!radix_tree_exceptional_entry(page)) {
200 if (page)
201 page_cache_release(page);
202 continue;
203 }
204 swap = radix_to_swp_entry(page);
205 page = read_swap_cache_async(swap, GFP_HIGHUSER_MOVABLE,
206 NULL, 0);
207 if (page)
208 page_cache_release(page);
209 }
210
211 lru_add_drain(); /* Push any new pages onto the LRU now */
212}
213#endif /* CONFIG_SWAP */
214
134/* 215/*
135 * Schedule all required I/O operations. Do not wait for completion. 216 * Schedule all required I/O operations. Do not wait for completion.
136 */ 217 */
@@ -140,6 +221,18 @@ static long madvise_willneed(struct vm_area_struct * vma,
140{ 221{
141 struct file *file = vma->vm_file; 222 struct file *file = vma->vm_file;
142 223
224#ifdef CONFIG_SWAP
225 if (!file || mapping_cap_swap_backed(file->f_mapping)) {
226 *prev = vma;
227 if (!file)
228 force_swapin_readahead(vma, start, end);
229 else
230 force_shm_swapin_readahead(vma, start, end,
231 file->f_mapping);
232 return 0;
233 }
234#endif
235
143 if (!file) 236 if (!file)
144 return -EBADF; 237 return -EBADF;
145 238
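With this hook, MADV_WILLNEED now starts asynchronous swap-in readahead for anonymous memory (which previously just failed with -EBADF) and for shmem/tmpfs mappings whose pages sit in swap. The userspace side is unchanged; a minimal, hypothetical use:

#include <sys/mman.h>
#include <stdio.h>

/* Hint that [addr, addr + len) will be needed soon: on a swapped-out
 * anonymous or tmpfs region this now queues swap readahead instead of
 * doing nothing useful. */
static int prefetch_region(void *addr, size_t len)
{
	if (madvise(addr, len, MADV_WILLNEED) != 0) {
		perror("madvise(MADV_WILLNEED)");
		return -1;
	}
	return 0;
}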
@@ -371,6 +464,7 @@ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
371 int error = -EINVAL; 464 int error = -EINVAL;
372 int write; 465 int write;
373 size_t len; 466 size_t len;
467 struct blk_plug plug;
374 468
375#ifdef CONFIG_MEMORY_FAILURE 469#ifdef CONFIG_MEMORY_FAILURE
376 if (behavior == MADV_HWPOISON || behavior == MADV_SOFT_OFFLINE) 470 if (behavior == MADV_HWPOISON || behavior == MADV_SOFT_OFFLINE)
@@ -410,18 +504,19 @@ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
410 if (vma && start > vma->vm_start) 504 if (vma && start > vma->vm_start)
411 prev = vma; 505 prev = vma;
412 506
507 blk_start_plug(&plug);
413 for (;;) { 508 for (;;) {
414 /* Still start < end. */ 509 /* Still start < end. */
415 error = -ENOMEM; 510 error = -ENOMEM;
416 if (!vma) 511 if (!vma)
417 goto out; 512 goto out_plug;
418 513
419 /* Here start < (end|vma->vm_end). */ 514 /* Here start < (end|vma->vm_end). */
420 if (start < vma->vm_start) { 515 if (start < vma->vm_start) {
421 unmapped_error = -ENOMEM; 516 unmapped_error = -ENOMEM;
422 start = vma->vm_start; 517 start = vma->vm_start;
423 if (start >= end) 518 if (start >= end)
424 goto out; 519 goto out_plug;
425 } 520 }
426 521
427 /* Here vma->vm_start <= start < (end|vma->vm_end) */ 522 /* Here vma->vm_start <= start < (end|vma->vm_end) */
@@ -432,18 +527,20 @@ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
432 /* Here vma->vm_start <= start < tmp <= (end|vma->vm_end). */ 527 /* Here vma->vm_start <= start < tmp <= (end|vma->vm_end). */
433 error = madvise_vma(vma, &prev, start, tmp, behavior); 528 error = madvise_vma(vma, &prev, start, tmp, behavior);
434 if (error) 529 if (error)
435 goto out; 530 goto out_plug;
436 start = tmp; 531 start = tmp;
437 if (prev && start < prev->vm_end) 532 if (prev && start < prev->vm_end)
438 start = prev->vm_end; 533 start = prev->vm_end;
439 error = unmapped_error; 534 error = unmapped_error;
440 if (start >= end) 535 if (start >= end)
441 goto out; 536 goto out_plug;
442 if (prev) 537 if (prev)
443 vma = prev->vm_next; 538 vma = prev->vm_next;
444 else /* madvise_remove dropped mmap_sem */ 539 else /* madvise_remove dropped mmap_sem */
445 vma = find_vma(current->mm, start); 540 vma = find_vma(current->mm, start);
446 } 541 }
542out_plug:
543 blk_finish_plug(&plug);
447out: 544out:
448 if (write) 545 if (write)
449 up_write(&current->mm->mmap_sem); 546 up_write(&current->mm->mmap_sem);
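The plug around the vma loop (using the <linux/blkdev.h> include added above) batches the block requests that the new readahead paths queue, so many small swap reads can be merged and dispatched together when the plug is released rather than trickling out one by one. The pattern in isolation:

	struct blk_plug plug;

	blk_start_plug(&plug);
	/* ... submit many small reads, e.g. via read_swap_cache_async() ... */
	blk_finish_plug(&plug);	/* flush the batched requests to the block layer */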
diff --git a/mm/memblock.c b/mm/memblock.c
index 625905523c2a..1bcd9b970564 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -92,9 +92,58 @@ static long __init_memblock memblock_overlaps_region(struct memblock_type *type,
92 * 92 *
93 * Find @size free area aligned to @align in the specified range and node. 93 * Find @size free area aligned to @align in the specified range and node.
94 * 94 *
 95 * If we have CONFIG_HAVE_MEMBLOCK_NODE_MAP defined, we need to check that the
 96 * memory we found is not in hotpluggable ranges.
97 *
95 * RETURNS: 98 * RETURNS:
96 * Found address on success, %0 on failure. 99 * Found address on success, %0 on failure.
97 */ 100 */
101#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
102phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t start,
103 phys_addr_t end, phys_addr_t size,
104 phys_addr_t align, int nid)
105{
106 phys_addr_t this_start, this_end, cand;
107 u64 i;
108 int curr = movablemem_map.nr_map - 1;
109
110 /* pump up @end */
111 if (end == MEMBLOCK_ALLOC_ACCESSIBLE)
112 end = memblock.current_limit;
113
114 /* avoid allocating the first page */
115 start = max_t(phys_addr_t, start, PAGE_SIZE);
116 end = max(start, end);
117
118 for_each_free_mem_range_reverse(i, nid, &this_start, &this_end, NULL) {
119 this_start = clamp(this_start, start, end);
120 this_end = clamp(this_end, start, end);
121
122restart:
123 if (this_end <= this_start || this_end < size)
124 continue;
125
126 for (; curr >= 0; curr--) {
127 if ((movablemem_map.map[curr].start_pfn << PAGE_SHIFT)
128 < this_end)
129 break;
130 }
131
132 cand = round_down(this_end - size, align);
133 if (curr >= 0 &&
134 cand < movablemem_map.map[curr].end_pfn << PAGE_SHIFT) {
135 this_end = movablemem_map.map[curr].start_pfn
136 << PAGE_SHIFT;
137 goto restart;
138 }
139
140 if (cand >= this_start)
141 return cand;
142 }
143
144 return 0;
145}
146#else /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
98phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t start, 147phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t start,
99 phys_addr_t end, phys_addr_t size, 148 phys_addr_t end, phys_addr_t size,
100 phys_addr_t align, int nid) 149 phys_addr_t align, int nid)
@@ -123,6 +172,7 @@ phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t start,
123 } 172 }
124 return 0; 173 return 0;
125} 174}
175#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
126 176
127/** 177/**
128 * memblock_find_in_range - find free area in given range 178 * memblock_find_in_range - find free area in given range
@@ -314,7 +364,8 @@ static void __init_memblock memblock_merge_regions(struct memblock_type *type)
314 } 364 }
315 365
316 this->size += next->size; 366 this->size += next->size;
317 memmove(next, next + 1, (type->cnt - (i + 1)) * sizeof(*next)); 367 /* move forward from next + 1, index of which is i + 2 */
368 memmove(next, next + 1, (type->cnt - (i + 2)) * sizeof(*next));
318 type->cnt--; 369 type->cnt--;
319 } 370 }
320} 371}
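The memmove fix is an off-by-one in the element count: after merging region i with next (index i + 1), the entries that still need to shift down are those at indices i + 2 .. cnt - 1, which is cnt - (i + 2) of them; the old cnt - (i + 1) copied one element too many and read past the last in-use slot. Worked case: with cnt = 5 and i = 1, only the regions at indices 3 and 4 move, and cnt - (i + 2) = 5 - 3 = 2, whereas the old count would have been 3.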
@@ -827,6 +878,23 @@ phys_addr_t __init memblock_phys_mem_size(void)
827 return memblock.memory.total_size; 878 return memblock.memory.total_size;
828} 879}
829 880
881phys_addr_t __init memblock_mem_size(unsigned long limit_pfn)
882{
883 unsigned long pages = 0;
884 struct memblock_region *r;
885 unsigned long start_pfn, end_pfn;
886
887 for_each_memblock(memory, r) {
888 start_pfn = memblock_region_memory_base_pfn(r);
889 end_pfn = memblock_region_memory_end_pfn(r);
890 start_pfn = min_t(unsigned long, start_pfn, limit_pfn);
891 end_pfn = min_t(unsigned long, end_pfn, limit_pfn);
892 pages += end_pfn - start_pfn;
893 }
894
895 return (phys_addr_t)pages << PAGE_SHIFT;
896}
897
830/* lowest address */ 898/* lowest address */
831phys_addr_t __init_memblock memblock_start_of_DRAM(void) 899phys_addr_t __init_memblock memblock_start_of_DRAM(void)
832{ 900{
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 09255ec8159c..53b8201b31eb 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -120,6 +120,14 @@ static const char * const mem_cgroup_events_names[] = {
120 "pgmajfault", 120 "pgmajfault",
121}; 121};
122 122
123static const char * const mem_cgroup_lru_names[] = {
124 "inactive_anon",
125 "active_anon",
126 "inactive_file",
127 "active_file",
128 "unevictable",
129};
130
123/* 131/*
124 * Per memcg event counter is incremented at every pagein/pageout. With THP, 132 * Per memcg event counter is incremented at every pagein/pageout. With THP,
124 * Per memcg event counter is incremented at every pagein/pageout. With THP, 132 * Per memcg event counter is incremented at every pagein/pageout. With THP,
125 * it will be incremented by the number of pages. This counter is used for 133 * it will be incremented by the number of pages. This counter is used for
@@ -172,7 +180,7 @@ struct mem_cgroup_per_node {
172}; 180};
173 181
174struct mem_cgroup_lru_info { 182struct mem_cgroup_lru_info {
175 struct mem_cgroup_per_node *nodeinfo[MAX_NUMNODES]; 183 struct mem_cgroup_per_node *nodeinfo[0];
176}; 184};
177 185
178/* 186/*
@@ -276,17 +284,6 @@ struct mem_cgroup {
276 */ 284 */
277 struct res_counter kmem; 285 struct res_counter kmem;
278 /* 286 /*
279 * Per cgroup active and inactive list, similar to the
280 * per zone LRU lists.
281 */
282 struct mem_cgroup_lru_info info;
283 int last_scanned_node;
284#if MAX_NUMNODES > 1
285 nodemask_t scan_nodes;
286 atomic_t numainfo_events;
287 atomic_t numainfo_updating;
288#endif
289 /*
290 * Should the accounting and control be hierarchical, per subtree? 287 * Should the accounting and control be hierarchical, per subtree?
291 */ 288 */
292 bool use_hierarchy; 289 bool use_hierarchy;
@@ -349,8 +346,29 @@ struct mem_cgroup {
349 /* Index in the kmem_cache->memcg_params->memcg_caches array */ 346 /* Index in the kmem_cache->memcg_params->memcg_caches array */
350 int kmemcg_id; 347 int kmemcg_id;
351#endif 348#endif
349
350 int last_scanned_node;
351#if MAX_NUMNODES > 1
352 nodemask_t scan_nodes;
353 atomic_t numainfo_events;
354 atomic_t numainfo_updating;
355#endif
356 /*
357 * Per cgroup active and inactive list, similar to the
358 * per zone LRU lists.
359 *
360 * WARNING: This has to be the last element of the struct. Don't
361 * add new fields after this point.
362 */
363 struct mem_cgroup_lru_info info;
352}; 364};
353 365
366static size_t memcg_size(void)
367{
368 return sizeof(struct mem_cgroup) +
369 nr_node_ids * sizeof(struct mem_cgroup_per_node);
370}
371
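Shrinking nodeinfo[] to a zero-length array and moving info to the tail of struct mem_cgroup turns the per-node pointers into a runtime-sized flexible member, so an allocation covers nr_node_ids slots instead of MAX_NUMNODES; memcg_size() returns the full footprint, and the WARNING above exists because any field added after info would be overlaid by those slots. Allocation then looks roughly like this sketch (the actual allocator in memcontrol.c also falls back to vzalloc() for large sizes and is not shown in this hunk):

	struct mem_cgroup *memcg;

	memcg = kzalloc(memcg_size(), GFP_KERNEL);	/* struct + nr_node_ids tail entries */
	if (!memcg)
		return NULL;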
354/* internal only representation about the status of kmem accounting. */ 372/* internal only representation about the status of kmem accounting. */
355enum { 373enum {
356 KMEM_ACCOUNTED_ACTIVE = 0, /* accounted by this cgroup itself */ 374 KMEM_ACCOUNTED_ACTIVE = 0, /* accounted by this cgroup itself */
@@ -398,8 +416,8 @@ static bool memcg_kmem_test_and_clear_dead(struct mem_cgroup *memcg)
398 416
399/* Stuffs for move charges at task migration. */ 417/* Stuffs for move charges at task migration. */
400/* 418/*
401 * Types of charges to be moved. "move_charge_at_immigrate" is treated as a 419 * Types of charges to be moved. "move_charge_at_immigrate" and
402 * left-shifted bitmap of these types. 420 * "immigrate_flags" are treated as a left-shifted bitmap of these types.
403 */ 421 */
404enum move_type { 422enum move_type {
405 MOVE_CHARGE_TYPE_ANON, /* private anonymous page and swap of it */ 423 MOVE_CHARGE_TYPE_ANON, /* private anonymous page and swap of it */
@@ -412,6 +430,7 @@ static struct move_charge_struct {
412 spinlock_t lock; /* for from, to */ 430 spinlock_t lock; /* for from, to */
413 struct mem_cgroup *from; 431 struct mem_cgroup *from;
414 struct mem_cgroup *to; 432 struct mem_cgroup *to;
433 unsigned long immigrate_flags;
415 unsigned long precharge; 434 unsigned long precharge;
416 unsigned long moved_charge; 435 unsigned long moved_charge;
417 unsigned long moved_swap; 436 unsigned long moved_swap;
@@ -424,14 +443,12 @@ static struct move_charge_struct {
424 443
425static bool move_anon(void) 444static bool move_anon(void)
426{ 445{
427 return test_bit(MOVE_CHARGE_TYPE_ANON, 446 return test_bit(MOVE_CHARGE_TYPE_ANON, &mc.immigrate_flags);
428 &mc.to->move_charge_at_immigrate);
429} 447}
430 448
431static bool move_file(void) 449static bool move_file(void)
432{ 450{
433 return test_bit(MOVE_CHARGE_TYPE_FILE, 451 return test_bit(MOVE_CHARGE_TYPE_FILE, &mc.immigrate_flags);
434 &mc.to->move_charge_at_immigrate);
435} 452}
436 453
437/* 454/*
@@ -471,6 +488,13 @@ enum res_type {
471#define MEM_CGROUP_RECLAIM_SHRINK_BIT 0x1 488#define MEM_CGROUP_RECLAIM_SHRINK_BIT 0x1
472#define MEM_CGROUP_RECLAIM_SHRINK (1 << MEM_CGROUP_RECLAIM_SHRINK_BIT) 489#define MEM_CGROUP_RECLAIM_SHRINK (1 << MEM_CGROUP_RECLAIM_SHRINK_BIT)
473 490
491/*
492 * The memcg_create_mutex will be held whenever a new cgroup is created.
493 * As a consequence, any change that needs to protect against new child cgroups
494 * appearing has to hold it as well.
495 */
496static DEFINE_MUTEX(memcg_create_mutex);
497
474static void mem_cgroup_get(struct mem_cgroup *memcg); 498static void mem_cgroup_get(struct mem_cgroup *memcg);
475static void mem_cgroup_put(struct mem_cgroup *memcg); 499static void mem_cgroup_put(struct mem_cgroup *memcg);
476 500
@@ -627,6 +651,7 @@ static void drain_all_stock_async(struct mem_cgroup *memcg);
627static struct mem_cgroup_per_zone * 651static struct mem_cgroup_per_zone *
628mem_cgroup_zoneinfo(struct mem_cgroup *memcg, int nid, int zid) 652mem_cgroup_zoneinfo(struct mem_cgroup *memcg, int nid, int zid)
629{ 653{
654 VM_BUG_ON((unsigned)nid >= nr_node_ids);
630 return &memcg->info.nodeinfo[nid]->zoneinfo[zid]; 655 return &memcg->info.nodeinfo[nid]->zoneinfo[zid];
631} 656}
632 657
@@ -1371,17 +1396,6 @@ int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec)
1371 return inactive * inactive_ratio < active; 1396 return inactive * inactive_ratio < active;
1372} 1397}
1373 1398
1374int mem_cgroup_inactive_file_is_low(struct lruvec *lruvec)
1375{
1376 unsigned long active;
1377 unsigned long inactive;
1378
1379 inactive = mem_cgroup_get_lru_size(lruvec, LRU_INACTIVE_FILE);
1380 active = mem_cgroup_get_lru_size(lruvec, LRU_ACTIVE_FILE);
1381
1382 return (active > inactive);
1383}
1384
1385#define mem_cgroup_from_res_counter(counter, member) \ 1399#define mem_cgroup_from_res_counter(counter, member) \
1386 container_of(counter, struct mem_cgroup, member) 1400 container_of(counter, struct mem_cgroup, member)
1387 1401
@@ -1524,8 +1538,9 @@ static void move_unlock_mem_cgroup(struct mem_cgroup *memcg,
1524 spin_unlock_irqrestore(&memcg->move_lock, *flags); 1538 spin_unlock_irqrestore(&memcg->move_lock, *flags);
1525} 1539}
1526 1540
1541#define K(x) ((x) << (PAGE_SHIFT-10))
1527/** 1542/**
1528 * mem_cgroup_print_oom_info: Called from OOM with tasklist_lock held in read mode. 1543 * mem_cgroup_print_oom_info: Print OOM information relevant to memory controller.
1529 * @memcg: The memory cgroup that went over limit 1544 * @memcg: The memory cgroup that went over limit
1530 * @p: Task that is going to be killed 1545 * @p: Task that is going to be killed
1531 * 1546 *
@@ -1543,8 +1558,10 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
1543 */ 1558 */
1544 static char memcg_name[PATH_MAX]; 1559 static char memcg_name[PATH_MAX];
1545 int ret; 1560 int ret;
1561 struct mem_cgroup *iter;
1562 unsigned int i;
1546 1563
1547 if (!memcg || !p) 1564 if (!p)
1548 return; 1565 return;
1549 1566
1550 rcu_read_lock(); 1567 rcu_read_lock();
@@ -1563,7 +1580,7 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
1563 } 1580 }
1564 rcu_read_unlock(); 1581 rcu_read_unlock();
1565 1582
1566 printk(KERN_INFO "Task in %s killed", memcg_name); 1583 pr_info("Task in %s killed", memcg_name);
1567 1584
1568 rcu_read_lock(); 1585 rcu_read_lock();
1569 ret = cgroup_path(mem_cgrp, memcg_name, PATH_MAX); 1586 ret = cgroup_path(mem_cgrp, memcg_name, PATH_MAX);
@@ -1576,22 +1593,45 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
1576 /* 1593 /*
1577 * Continues from above, so we don't need an KERN_ level 1594 * Continues from above, so we don't need an KERN_ level
1578 */ 1595 */
1579 printk(KERN_CONT " as a result of limit of %s\n", memcg_name); 1596 pr_cont(" as a result of limit of %s\n", memcg_name);
1580done: 1597done:
1581 1598
1582 printk(KERN_INFO "memory: usage %llukB, limit %llukB, failcnt %llu\n", 1599 pr_info("memory: usage %llukB, limit %llukB, failcnt %llu\n",
1583 res_counter_read_u64(&memcg->res, RES_USAGE) >> 10, 1600 res_counter_read_u64(&memcg->res, RES_USAGE) >> 10,
1584 res_counter_read_u64(&memcg->res, RES_LIMIT) >> 10, 1601 res_counter_read_u64(&memcg->res, RES_LIMIT) >> 10,
1585 res_counter_read_u64(&memcg->res, RES_FAILCNT)); 1602 res_counter_read_u64(&memcg->res, RES_FAILCNT));
1586 printk(KERN_INFO "memory+swap: usage %llukB, limit %llukB, " 1603 pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %llu\n",
1587 "failcnt %llu\n",
1588 res_counter_read_u64(&memcg->memsw, RES_USAGE) >> 10, 1604 res_counter_read_u64(&memcg->memsw, RES_USAGE) >> 10,
1589 res_counter_read_u64(&memcg->memsw, RES_LIMIT) >> 10, 1605 res_counter_read_u64(&memcg->memsw, RES_LIMIT) >> 10,
1590 res_counter_read_u64(&memcg->memsw, RES_FAILCNT)); 1606 res_counter_read_u64(&memcg->memsw, RES_FAILCNT));
1591 printk(KERN_INFO "kmem: usage %llukB, limit %llukB, failcnt %llu\n", 1607 pr_info("kmem: usage %llukB, limit %llukB, failcnt %llu\n",
1592 res_counter_read_u64(&memcg->kmem, RES_USAGE) >> 10, 1608 res_counter_read_u64(&memcg->kmem, RES_USAGE) >> 10,
1593 res_counter_read_u64(&memcg->kmem, RES_LIMIT) >> 10, 1609 res_counter_read_u64(&memcg->kmem, RES_LIMIT) >> 10,
1594 res_counter_read_u64(&memcg->kmem, RES_FAILCNT)); 1610 res_counter_read_u64(&memcg->kmem, RES_FAILCNT));
1611
1612 for_each_mem_cgroup_tree(iter, memcg) {
1613 pr_info("Memory cgroup stats");
1614
1615 rcu_read_lock();
1616 ret = cgroup_path(iter->css.cgroup, memcg_name, PATH_MAX);
1617 if (!ret)
1618 pr_cont(" for %s", memcg_name);
1619 rcu_read_unlock();
1620 pr_cont(":");
1621
1622 for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
1623 if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account)
1624 continue;
1625 pr_cont(" %s:%ldKB", mem_cgroup_stat_names[i],
1626 K(mem_cgroup_read_stat(iter, i)));
1627 }
1628
1629 for (i = 0; i < NR_LRU_LISTS; i++)
1630 pr_cont(" %s:%luKB", mem_cgroup_lru_names[i],
1631 K(mem_cgroup_nr_lru_pages(iter, BIT(i))));
1632
1633 pr_cont("\n");
1634 }
1595} 1635}
1596 1636
1597/* 1637/*
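The K() macro introduced just before mem_cgroup_print_oom_info() converts a page count to kilobytes by shifting with PAGE_SHIFT - 10, and the new per-memcg stats dump above prints every counter through it. A standalone illustration, assuming 4 KiB pages (PAGE_SHIFT hard-coded to 12 for the demo only):

#include <stdio.h>

#define DEMO_PAGE_SHIFT 12
#define K(x) ((x) << (DEMO_PAGE_SHIFT - 10))

int main(void)
{
	unsigned long pages = 300;

	/* with 4 KiB pages, K(x) == x << 2, so 300 pages print as 1200KB */
	printf("%lu pages -> %luKB\n", pages, K(pages));
	return 0;
}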
@@ -2256,6 +2296,17 @@ static void drain_local_stock(struct work_struct *dummy)
2256 clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags); 2296 clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);
2257} 2297}
2258 2298
2299static void __init memcg_stock_init(void)
2300{
2301 int cpu;
2302
2303 for_each_possible_cpu(cpu) {
2304 struct memcg_stock_pcp *stock =
2305 &per_cpu(memcg_stock, cpu);
2306 INIT_WORK(&stock->work, drain_local_stock);
2307 }
2308}
2309
2259/* 2310/*
2260 * Cache charges(val) which is from res_counter, to local per_cpu area. 2311 * Cache charges(val) which is from res_counter, to local per_cpu area.
2261 * This will be consumed by consume_stock() function, later. 2312 * This will be consumed by consume_stock() function, later.
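memcg_stock_init() above walks every possible CPU once during boot and attaches the drain callback, so later draining never races with an uninitialized work item. Below is a user-space analogue of that one-shot initialization; the plain array stands in for the per-CPU variable and the CPU count is assumed for the demo.

#include <stdio.h>

#define NR_POSSIBLE_CPUS 8

struct demo_stock {
	unsigned long cached;
	void (*drain)(struct demo_stock *);
};

static void demo_drain(struct demo_stock *stock)
{
	stock->cached = 0;
}

static struct demo_stock demo_stocks[NR_POSSIBLE_CPUS];

/* run once at startup, like the subsys_initcall path in this patch */
static void demo_stock_init(void)
{
	for (int cpu = 0; cpu < NR_POSSIBLE_CPUS; cpu++)
		demo_stocks[cpu].drain = demo_drain;
}

int main(void)
{
	demo_stock_init();
	demo_stocks[3].cached = 5;
	demo_stocks[3].drain(&demo_stocks[3]);
	printf("cpu3 cached after drain: %lu\n", demo_stocks[3].cached);
	return 0;
}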
@@ -3030,7 +3081,9 @@ int memcg_register_cache(struct mem_cgroup *memcg, struct kmem_cache *s,
3030 if (memcg) { 3081 if (memcg) {
3031 s->memcg_params->memcg = memcg; 3082 s->memcg_params->memcg = memcg;
3032 s->memcg_params->root_cache = root_cache; 3083 s->memcg_params->root_cache = root_cache;
3033 } 3084 } else
3085 s->memcg_params->is_root_cache = true;
3086
3034 return 0; 3087 return 0;
3035} 3088}
3036 3089
@@ -4389,8 +4442,8 @@ void mem_cgroup_print_bad_page(struct page *page)
4389 4442
4390 pc = lookup_page_cgroup_used(page); 4443 pc = lookup_page_cgroup_used(page);
4391 if (pc) { 4444 if (pc) {
4392 printk(KERN_ALERT "pc:%p pc->flags:%lx pc->mem_cgroup:%p\n", 4445 pr_alert("pc:%p pc->flags:%lx pc->mem_cgroup:%p\n",
4393 pc, pc->flags, pc->mem_cgroup); 4446 pc, pc->flags, pc->mem_cgroup);
4394 } 4447 }
4395} 4448}
4396#endif 4449#endif
@@ -4717,6 +4770,33 @@ static void mem_cgroup_reparent_charges(struct mem_cgroup *memcg)
4717} 4770}
4718 4771
4719/* 4772/*
4773 * This mainly exists for tests during the setting of use_hierarchy.
4774 * Since this is the very setting we are changing, the current hierarchy value
4775 * is meaningless.
4776 */
4777static inline bool __memcg_has_children(struct mem_cgroup *memcg)
4778{
4779 struct cgroup *pos;
4780
4781 /* bounce at first found */
4782 cgroup_for_each_child(pos, memcg->css.cgroup)
4783 return true;
4784 return false;
4785}
4786
4787/*
4788 * Must be called with memcg_create_mutex held, unless the cgroup is guaranteed
4789 * to be already dead (as in mem_cgroup_force_empty, for instance). This is
4790 * from mem_cgroup_count_children(), in the sense that we don't really care how
4791 * many children we have; we only need to know if we have any. It also counts
4792 * any memcg without hierarchy as infertile.
4793 */
4794static inline bool memcg_has_children(struct mem_cgroup *memcg)
4795{
4796 return memcg->use_hierarchy && __memcg_has_children(memcg);
4797}
4798
4799/*
4720 * Reclaims as many pages from the given memcg as possible and moves 4800 * Reclaims as many pages from the given memcg as possible and moves
4721 * the rest to the parent. 4801 * the rest to the parent.
4722 * 4802 *
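The helpers above carry the central idea of this series' move away from cgroup_lock(): take memcg_create_mutex, check whether any child exists, and only then flip a setting, so no child can appear between the check and the update. A rough user-space sketch of that check-then-set pattern using pthreads; demo_group and the return convention are illustrative only (build with -pthread).

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t demo_create_mutex = PTHREAD_MUTEX_INITIALIZER;

struct demo_group {
	int nr_children;	/* creation also holds demo_create_mutex */
	bool use_hierarchy;
};

/* returns 0 on success, -1 (EBUSY-like) if the group already has children */
static int demo_set_hierarchy(struct demo_group *g, bool val)
{
	int ret = 0;

	pthread_mutex_lock(&demo_create_mutex);
	if (g->nr_children == 0)
		g->use_hierarchy = val;
	else
		ret = -1;
	pthread_mutex_unlock(&demo_create_mutex);
	return ret;
}

int main(void)
{
	struct demo_group g = { .nr_children = 0 };
	int ret = demo_set_hierarchy(&g, true);

	printf("set returned %d, use_hierarchy is now %d\n", ret, g.use_hierarchy);
	return 0;
}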
@@ -4786,7 +4866,7 @@ static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft,
4786 if (parent) 4866 if (parent)
4787 parent_memcg = mem_cgroup_from_cont(parent); 4867 parent_memcg = mem_cgroup_from_cont(parent);
4788 4868
4789 cgroup_lock(); 4869 mutex_lock(&memcg_create_mutex);
4790 4870
4791 if (memcg->use_hierarchy == val) 4871 if (memcg->use_hierarchy == val)
4792 goto out; 4872 goto out;
@@ -4801,7 +4881,7 @@ static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft,
4801 */ 4881 */
4802 if ((!parent_memcg || !parent_memcg->use_hierarchy) && 4882 if ((!parent_memcg || !parent_memcg->use_hierarchy) &&
4803 (val == 1 || val == 0)) { 4883 (val == 1 || val == 0)) {
4804 if (list_empty(&cont->children)) 4884 if (!__memcg_has_children(memcg))
4805 memcg->use_hierarchy = val; 4885 memcg->use_hierarchy = val;
4806 else 4886 else
4807 retval = -EBUSY; 4887 retval = -EBUSY;
@@ -4809,7 +4889,7 @@ static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft,
4809 retval = -EINVAL; 4889 retval = -EINVAL;
4810 4890
4811out: 4891out:
4812 cgroup_unlock(); 4892 mutex_unlock(&memcg_create_mutex);
4813 4893
4814 return retval; 4894 return retval;
4815} 4895}
@@ -4894,8 +4974,6 @@ static int memcg_update_kmem_limit(struct cgroup *cont, u64 val)
4894{ 4974{
4895 int ret = -EINVAL; 4975 int ret = -EINVAL;
4896#ifdef CONFIG_MEMCG_KMEM 4976#ifdef CONFIG_MEMCG_KMEM
4897 bool must_inc_static_branch = false;
4898
4899 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); 4977 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
4900 /* 4978 /*
4901 * For simplicity, we won't allow this to be disabled. It also can't 4979 * For simplicity, we won't allow this to be disabled. It also can't
@@ -4908,18 +4986,11 @@ static int memcg_update_kmem_limit(struct cgroup *cont, u64 val)
4908 * 4986 *
4909 * After it first became limited, changes in the value of the limit are 4987 * After it first became limited, changes in the value of the limit are
4910 * of course permitted. 4988 * of course permitted.
4911 *
4912 * Taking the cgroup_lock is really offensive, but it is so far the only
4913 * way to guarantee that no children will appear. There are plenty of
4914 * other offenders, and they should all go away. Fine grained locking
4915 * is probably the way to go here. When we are fully hierarchical, we
4916 * can also get rid of the use_hierarchy check.
4917 */ 4989 */
4918 cgroup_lock(); 4990 mutex_lock(&memcg_create_mutex);
4919 mutex_lock(&set_limit_mutex); 4991 mutex_lock(&set_limit_mutex);
4920 if (!memcg->kmem_account_flags && val != RESOURCE_MAX) { 4992 if (!memcg->kmem_account_flags && val != RESOURCE_MAX) {
4921 if (cgroup_task_count(cont) || (memcg->use_hierarchy && 4993 if (cgroup_task_count(cont) || memcg_has_children(memcg)) {
4922 !list_empty(&cont->children))) {
4923 ret = -EBUSY; 4994 ret = -EBUSY;
4924 goto out; 4995 goto out;
4925 } 4996 }
@@ -4931,7 +5002,13 @@ static int memcg_update_kmem_limit(struct cgroup *cont, u64 val)
4931 res_counter_set_limit(&memcg->kmem, RESOURCE_MAX); 5002 res_counter_set_limit(&memcg->kmem, RESOURCE_MAX);
4932 goto out; 5003 goto out;
4933 } 5004 }
4934 must_inc_static_branch = true; 5005 static_key_slow_inc(&memcg_kmem_enabled_key);
5006 /*
5007 * setting the active bit after the inc will guarantee no one
5008 * starts accounting before all call sites are patched
5009 */
5010 memcg_kmem_set_active(memcg);
5011
4935 /* 5012 /*
4936 * kmem charges can outlive the cgroup. In the case of slab 5013 * kmem charges can outlive the cgroup. In the case of slab
4937 * pages, for instance, a page contain objects from various 5014 * pages, for instance, a page contain objects from various
@@ -4943,32 +5020,12 @@ static int memcg_update_kmem_limit(struct cgroup *cont, u64 val)
4943 ret = res_counter_set_limit(&memcg->kmem, val); 5020 ret = res_counter_set_limit(&memcg->kmem, val);
4944out: 5021out:
4945 mutex_unlock(&set_limit_mutex); 5022 mutex_unlock(&set_limit_mutex);
4946 cgroup_unlock(); 5023 mutex_unlock(&memcg_create_mutex);
4947
4948 /*
4949 * We are by now familiar with the fact that we can't inc the static
4950 * branch inside cgroup_lock. See disarm functions for details. A
4951 * worker here is overkill, but also wrong: After the limit is set, we
4952 * must start accounting right away. Since this operation can't fail,
4953 * we can safely defer it to here - no rollback will be needed.
4954 *
4955 * The boolean used to control this is also safe, because
4956 * KMEM_ACCOUNTED_ACTIVATED guarantees that only one process will be
4957 * able to set it to true;
4958 */
4959 if (must_inc_static_branch) {
4960 static_key_slow_inc(&memcg_kmem_enabled_key);
4961 /*
4962 * setting the active bit after the inc will guarantee no one
4963 * starts accounting before all call sites are patched
4964 */
4965 memcg_kmem_set_active(memcg);
4966 }
4967
4968#endif 5024#endif
4969 return ret; 5025 return ret;
4970} 5026}
4971 5027
5028#ifdef CONFIG_MEMCG_KMEM
4972static int memcg_propagate_kmem(struct mem_cgroup *memcg) 5029static int memcg_propagate_kmem(struct mem_cgroup *memcg)
4973{ 5030{
4974 int ret = 0; 5031 int ret = 0;
@@ -4977,7 +5034,6 @@ static int memcg_propagate_kmem(struct mem_cgroup *memcg)
4977 goto out; 5034 goto out;
4978 5035
4979 memcg->kmem_account_flags = parent->kmem_account_flags; 5036 memcg->kmem_account_flags = parent->kmem_account_flags;
4980#ifdef CONFIG_MEMCG_KMEM
4981 /* 5037 /*
4982 * When that happen, we need to disable the static branch only on those 5038 * When that happen, we need to disable the static branch only on those
4983 * memcgs that enabled it. To achieve this, we would be forced to 5039 * memcgs that enabled it. To achieve this, we would be forced to
@@ -5003,10 +5059,10 @@ static int memcg_propagate_kmem(struct mem_cgroup *memcg)
5003 mutex_lock(&set_limit_mutex); 5059 mutex_lock(&set_limit_mutex);
5004 ret = memcg_update_cache_sizes(memcg); 5060 ret = memcg_update_cache_sizes(memcg);
5005 mutex_unlock(&set_limit_mutex); 5061 mutex_unlock(&set_limit_mutex);
5006#endif
5007out: 5062out:
5008 return ret; 5063 return ret;
5009} 5064}
5065#endif /* CONFIG_MEMCG_KMEM */
5010 5066
5011/* 5067/*
5012 * The user of this function is... 5068 * The user of this function is...
@@ -5146,15 +5202,14 @@ static int mem_cgroup_move_charge_write(struct cgroup *cgrp,
5146 5202
5147 if (val >= (1 << NR_MOVE_TYPE)) 5203 if (val >= (1 << NR_MOVE_TYPE))
5148 return -EINVAL; 5204 return -EINVAL;
5205
5149 /* 5206 /*
5150 * We check this value several times in both in can_attach() and 5207 * No kind of locking is needed in here, because ->can_attach() will
5151 * attach(), so we need cgroup lock to prevent this value from being 5208 * check this value once in the beginning of the process, and then carry
5152 * inconsistent. 5209 * on with stale data. This means that changes to this value will only
5210 * affect task migrations starting after the change.
5153 */ 5211 */
5154 cgroup_lock();
5155 memcg->move_charge_at_immigrate = val; 5212 memcg->move_charge_at_immigrate = val;
5156 cgroup_unlock();
5157
5158 return 0; 5213 return 0;
5159} 5214}
5160#else 5215#else
@@ -5212,14 +5267,6 @@ static int memcg_numa_stat_show(struct cgroup *cont, struct cftype *cft,
5212} 5267}
5213#endif /* CONFIG_NUMA */ 5268#endif /* CONFIG_NUMA */
5214 5269
5215static const char * const mem_cgroup_lru_names[] = {
5216 "inactive_anon",
5217 "active_anon",
5218 "inactive_file",
5219 "active_file",
5220 "unevictable",
5221};
5222
5223static inline void mem_cgroup_lru_names_not_uptodate(void) 5270static inline void mem_cgroup_lru_names_not_uptodate(void)
5224{ 5271{
5225 BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS); 5272 BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS);
@@ -5333,18 +5380,17 @@ static int mem_cgroup_swappiness_write(struct cgroup *cgrp, struct cftype *cft,
5333 5380
5334 parent = mem_cgroup_from_cont(cgrp->parent); 5381 parent = mem_cgroup_from_cont(cgrp->parent);
5335 5382
5336 cgroup_lock(); 5383 mutex_lock(&memcg_create_mutex);
5337 5384
5338 /* If under hierarchy, only empty-root can set this value */ 5385 /* If under hierarchy, only empty-root can set this value */
5339 if ((parent->use_hierarchy) || 5386 if ((parent->use_hierarchy) || memcg_has_children(memcg)) {
5340 (memcg->use_hierarchy && !list_empty(&cgrp->children))) { 5387 mutex_unlock(&memcg_create_mutex);
5341 cgroup_unlock();
5342 return -EINVAL; 5388 return -EINVAL;
5343 } 5389 }
5344 5390
5345 memcg->swappiness = val; 5391 memcg->swappiness = val;
5346 5392
5347 cgroup_unlock(); 5393 mutex_unlock(&memcg_create_mutex);
5348 5394
5349 return 0; 5395 return 0;
5350} 5396}
@@ -5670,17 +5716,16 @@ static int mem_cgroup_oom_control_write(struct cgroup *cgrp,
5670 5716
5671 parent = mem_cgroup_from_cont(cgrp->parent); 5717 parent = mem_cgroup_from_cont(cgrp->parent);
5672 5718
5673 cgroup_lock(); 5719 mutex_lock(&memcg_create_mutex);
5674 /* oom-kill-disable is a flag for subhierarchy. */ 5720 /* oom-kill-disable is a flag for subhierarchy. */
5675 if ((parent->use_hierarchy) || 5721 if ((parent->use_hierarchy) || memcg_has_children(memcg)) {
5676 (memcg->use_hierarchy && !list_empty(&cgrp->children))) { 5722 mutex_unlock(&memcg_create_mutex);
5677 cgroup_unlock();
5678 return -EINVAL; 5723 return -EINVAL;
5679 } 5724 }
5680 memcg->oom_kill_disable = val; 5725 memcg->oom_kill_disable = val;
5681 if (!val) 5726 if (!val)
5682 memcg_oom_recover(memcg); 5727 memcg_oom_recover(memcg);
5683 cgroup_unlock(); 5728 mutex_unlock(&memcg_create_mutex);
5684 return 0; 5729 return 0;
5685} 5730}
5686 5731
@@ -5795,33 +5840,6 @@ static struct cftype mem_cgroup_files[] = {
5795 .read_seq_string = memcg_numa_stat_show, 5840 .read_seq_string = memcg_numa_stat_show,
5796 }, 5841 },
5797#endif 5842#endif
5798#ifdef CONFIG_MEMCG_SWAP
5799 {
5800 .name = "memsw.usage_in_bytes",
5801 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
5802 .read = mem_cgroup_read,
5803 .register_event = mem_cgroup_usage_register_event,
5804 .unregister_event = mem_cgroup_usage_unregister_event,
5805 },
5806 {
5807 .name = "memsw.max_usage_in_bytes",
5808 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE),
5809 .trigger = mem_cgroup_reset,
5810 .read = mem_cgroup_read,
5811 },
5812 {
5813 .name = "memsw.limit_in_bytes",
5814 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT),
5815 .write_string = mem_cgroup_write,
5816 .read = mem_cgroup_read,
5817 },
5818 {
5819 .name = "memsw.failcnt",
5820 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT),
5821 .trigger = mem_cgroup_reset,
5822 .read = mem_cgroup_read,
5823 },
5824#endif
5825#ifdef CONFIG_MEMCG_KMEM 5843#ifdef CONFIG_MEMCG_KMEM
5826 { 5844 {
5827 .name = "kmem.limit_in_bytes", 5845 .name = "kmem.limit_in_bytes",
@@ -5856,6 +5874,36 @@ static struct cftype mem_cgroup_files[] = {
5856 { }, /* terminate */ 5874 { }, /* terminate */
5857}; 5875};
5858 5876
5877#ifdef CONFIG_MEMCG_SWAP
5878static struct cftype memsw_cgroup_files[] = {
5879 {
5880 .name = "memsw.usage_in_bytes",
5881 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
5882 .read = mem_cgroup_read,
5883 .register_event = mem_cgroup_usage_register_event,
5884 .unregister_event = mem_cgroup_usage_unregister_event,
5885 },
5886 {
5887 .name = "memsw.max_usage_in_bytes",
5888 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE),
5889 .trigger = mem_cgroup_reset,
5890 .read = mem_cgroup_read,
5891 },
5892 {
5893 .name = "memsw.limit_in_bytes",
5894 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT),
5895 .write_string = mem_cgroup_write,
5896 .read = mem_cgroup_read,
5897 },
5898 {
5899 .name = "memsw.failcnt",
5900 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT),
5901 .trigger = mem_cgroup_reset,
5902 .read = mem_cgroup_read,
5903 },
5904 { }, /* terminate */
5905};
5906#endif
5859static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node) 5907static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
5860{ 5908{
5861 struct mem_cgroup_per_node *pn; 5909 struct mem_cgroup_per_node *pn;
@@ -5894,9 +5942,9 @@ static void free_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
5894static struct mem_cgroup *mem_cgroup_alloc(void) 5942static struct mem_cgroup *mem_cgroup_alloc(void)
5895{ 5943{
5896 struct mem_cgroup *memcg; 5944 struct mem_cgroup *memcg;
5897 int size = sizeof(struct mem_cgroup); 5945 size_t size = memcg_size();
5898 5946
5899 /* Can be very big if MAX_NUMNODES is very big */ 5947 /* Can be very big if nr_node_ids is very big */
5900 if (size < PAGE_SIZE) 5948 if (size < PAGE_SIZE)
5901 memcg = kzalloc(size, GFP_KERNEL); 5949 memcg = kzalloc(size, GFP_KERNEL);
5902 else 5950 else
@@ -5933,7 +5981,7 @@ out_free:
5933static void __mem_cgroup_free(struct mem_cgroup *memcg) 5981static void __mem_cgroup_free(struct mem_cgroup *memcg)
5934{ 5982{
5935 int node; 5983 int node;
5936 int size = sizeof(struct mem_cgroup); 5984 size_t size = memcg_size();
5937 5985
5938 mem_cgroup_remove_from_trees(memcg); 5986 mem_cgroup_remove_from_trees(memcg);
5939 free_css_id(&mem_cgroup_subsys, &memcg->css); 5987 free_css_id(&mem_cgroup_subsys, &memcg->css);
@@ -6015,19 +6063,7 @@ struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg)
6015} 6063}
6016EXPORT_SYMBOL(parent_mem_cgroup); 6064EXPORT_SYMBOL(parent_mem_cgroup);
6017 6065
6018#ifdef CONFIG_MEMCG_SWAP 6066static void __init mem_cgroup_soft_limit_tree_init(void)
6019static void __init enable_swap_cgroup(void)
6020{
6021 if (!mem_cgroup_disabled() && really_do_swap_account)
6022 do_swap_account = 1;
6023}
6024#else
6025static void __init enable_swap_cgroup(void)
6026{
6027}
6028#endif
6029
6030static int mem_cgroup_soft_limit_tree_init(void)
6031{ 6067{
6032 struct mem_cgroup_tree_per_node *rtpn; 6068 struct mem_cgroup_tree_per_node *rtpn;
6033 struct mem_cgroup_tree_per_zone *rtpz; 6069 struct mem_cgroup_tree_per_zone *rtpz;
@@ -6038,8 +6074,7 @@ static int mem_cgroup_soft_limit_tree_init(void)
6038 if (!node_state(node, N_NORMAL_MEMORY)) 6074 if (!node_state(node, N_NORMAL_MEMORY))
6039 tmp = -1; 6075 tmp = -1;
6040 rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, tmp); 6076 rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, tmp);
6041 if (!rtpn) 6077 BUG_ON(!rtpn);
6042 goto err_cleanup;
6043 6078
6044 soft_limit_tree.rb_tree_per_node[node] = rtpn; 6079 soft_limit_tree.rb_tree_per_node[node] = rtpn;
6045 6080
@@ -6049,23 +6084,12 @@ static int mem_cgroup_soft_limit_tree_init(void)
6049 spin_lock_init(&rtpz->lock); 6084 spin_lock_init(&rtpz->lock);
6050 } 6085 }
6051 } 6086 }
6052 return 0;
6053
6054err_cleanup:
6055 for_each_node(node) {
6056 if (!soft_limit_tree.rb_tree_per_node[node])
6057 break;
6058 kfree(soft_limit_tree.rb_tree_per_node[node]);
6059 soft_limit_tree.rb_tree_per_node[node] = NULL;
6060 }
6061 return 1;
6062
6063} 6087}
6064 6088
6065static struct cgroup_subsys_state * __ref 6089static struct cgroup_subsys_state * __ref
6066mem_cgroup_css_alloc(struct cgroup *cont) 6090mem_cgroup_css_alloc(struct cgroup *cont)
6067{ 6091{
6068 struct mem_cgroup *memcg, *parent; 6092 struct mem_cgroup *memcg;
6069 long error = -ENOMEM; 6093 long error = -ENOMEM;
6070 int node; 6094 int node;
6071 6095
@@ -6079,24 +6103,44 @@ mem_cgroup_css_alloc(struct cgroup *cont)
6079 6103
6080 /* root ? */ 6104 /* root ? */
6081 if (cont->parent == NULL) { 6105 if (cont->parent == NULL) {
6082 int cpu;
6083 enable_swap_cgroup();
6084 parent = NULL;
6085 if (mem_cgroup_soft_limit_tree_init())
6086 goto free_out;
6087 root_mem_cgroup = memcg; 6106 root_mem_cgroup = memcg;
6088 for_each_possible_cpu(cpu) { 6107 res_counter_init(&memcg->res, NULL);
6089 struct memcg_stock_pcp *stock = 6108 res_counter_init(&memcg->memsw, NULL);
6090 &per_cpu(memcg_stock, cpu); 6109 res_counter_init(&memcg->kmem, NULL);
6091 INIT_WORK(&stock->work, drain_local_stock);
6092 }
6093 } else {
6094 parent = mem_cgroup_from_cont(cont->parent);
6095 memcg->use_hierarchy = parent->use_hierarchy;
6096 memcg->oom_kill_disable = parent->oom_kill_disable;
6097 } 6110 }
6098 6111
6099 if (parent && parent->use_hierarchy) { 6112 memcg->last_scanned_node = MAX_NUMNODES;
6113 INIT_LIST_HEAD(&memcg->oom_notify);
6114 atomic_set(&memcg->refcnt, 1);
6115 memcg->move_charge_at_immigrate = 0;
6116 mutex_init(&memcg->thresholds_lock);
6117 spin_lock_init(&memcg->move_lock);
6118
6119 return &memcg->css;
6120
6121free_out:
6122 __mem_cgroup_free(memcg);
6123 return ERR_PTR(error);
6124}
6125
6126static int
6127mem_cgroup_css_online(struct cgroup *cont)
6128{
6129 struct mem_cgroup *memcg, *parent;
6130 int error = 0;
6131
6132 if (!cont->parent)
6133 return 0;
6134
6135 mutex_lock(&memcg_create_mutex);
6136 memcg = mem_cgroup_from_cont(cont);
6137 parent = mem_cgroup_from_cont(cont->parent);
6138
6139 memcg->use_hierarchy = parent->use_hierarchy;
6140 memcg->oom_kill_disable = parent->oom_kill_disable;
6141 memcg->swappiness = mem_cgroup_swappiness(parent);
6142
6143 if (parent->use_hierarchy) {
6100 res_counter_init(&memcg->res, &parent->res); 6144 res_counter_init(&memcg->res, &parent->res);
6101 res_counter_init(&memcg->memsw, &parent->memsw); 6145 res_counter_init(&memcg->memsw, &parent->memsw);
6102 res_counter_init(&memcg->kmem, &parent->kmem); 6146 res_counter_init(&memcg->kmem, &parent->kmem);
@@ -6117,20 +6161,12 @@ mem_cgroup_css_alloc(struct cgroup *cont)
6117 * much sense so let cgroup subsystem know about this 6161 * much sense so let cgroup subsystem know about this
6118 * unfortunate state in our controller. 6162 * unfortunate state in our controller.
6119 */ 6163 */
6120 if (parent && parent != root_mem_cgroup) 6164 if (parent != root_mem_cgroup)
6121 mem_cgroup_subsys.broken_hierarchy = true; 6165 mem_cgroup_subsys.broken_hierarchy = true;
6122 } 6166 }
6123 memcg->last_scanned_node = MAX_NUMNODES;
6124 INIT_LIST_HEAD(&memcg->oom_notify);
6125
6126 if (parent)
6127 memcg->swappiness = mem_cgroup_swappiness(parent);
6128 atomic_set(&memcg->refcnt, 1);
6129 memcg->move_charge_at_immigrate = 0;
6130 mutex_init(&memcg->thresholds_lock);
6131 spin_lock_init(&memcg->move_lock);
6132 6167
6133 error = memcg_init_kmem(memcg, &mem_cgroup_subsys); 6168 error = memcg_init_kmem(memcg, &mem_cgroup_subsys);
6169 mutex_unlock(&memcg_create_mutex);
6134 if (error) { 6170 if (error) {
6135 /* 6171 /*
6136 * We call put now because our (and parent's) refcnts 6172 * We call put now because our (and parent's) refcnts
@@ -6138,12 +6174,10 @@ mem_cgroup_css_alloc(struct cgroup *cont)
6138 * call __mem_cgroup_free, so return directly 6174 * call __mem_cgroup_free, so return directly
6139 */ 6175 */
6140 mem_cgroup_put(memcg); 6176 mem_cgroup_put(memcg);
6141 return ERR_PTR(error); 6177 if (parent->use_hierarchy)
6178 mem_cgroup_put(parent);
6142 } 6179 }
6143 return &memcg->css; 6180 return error;
6144free_out:
6145 __mem_cgroup_free(memcg);
6146 return ERR_PTR(error);
6147} 6181}
6148 6182
6149static void mem_cgroup_css_offline(struct cgroup *cont) 6183static void mem_cgroup_css_offline(struct cgroup *cont)
@@ -6279,7 +6313,7 @@ static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
6279 * Because lookup_swap_cache() updates some statistics counter, 6313 * Because lookup_swap_cache() updates some statistics counter,
6280 * we call find_get_page() with swapper_space directly. 6314 * we call find_get_page() with swapper_space directly.
6281 */ 6315 */
6282 page = find_get_page(&swapper_space, ent.val); 6316 page = find_get_page(swap_address_space(ent), ent.val);
6283 if (do_swap_account) 6317 if (do_swap_account)
6284 entry->val = ent.val; 6318 entry->val = ent.val;
6285 6319
@@ -6320,7 +6354,7 @@ static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
6320 swp_entry_t swap = radix_to_swp_entry(page); 6354 swp_entry_t swap = radix_to_swp_entry(page);
6321 if (do_swap_account) 6355 if (do_swap_account)
6322 *entry = swap; 6356 *entry = swap;
6323 page = find_get_page(&swapper_space, swap.val); 6357 page = find_get_page(swap_address_space(swap), swap.val);
6324 } 6358 }
6325#endif 6359#endif
6326 return page; 6360 return page;
@@ -6530,8 +6564,15 @@ static int mem_cgroup_can_attach(struct cgroup *cgroup,
6530 struct task_struct *p = cgroup_taskset_first(tset); 6564 struct task_struct *p = cgroup_taskset_first(tset);
6531 int ret = 0; 6565 int ret = 0;
6532 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgroup); 6566 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgroup);
6567 unsigned long move_charge_at_immigrate;
6533 6568
6534 if (memcg->move_charge_at_immigrate) { 6569 /*
6570 * We are now committed to this value whatever it is. Changes in this
6571 * tunable will only affect upcoming migrations, not the current one.
6572 * So we need to save it, and keep it going.
6573 */
6574 move_charge_at_immigrate = memcg->move_charge_at_immigrate;
6575 if (move_charge_at_immigrate) {
6535 struct mm_struct *mm; 6576 struct mm_struct *mm;
6536 struct mem_cgroup *from = mem_cgroup_from_task(p); 6577 struct mem_cgroup *from = mem_cgroup_from_task(p);
6537 6578
@@ -6551,6 +6592,7 @@ static int mem_cgroup_can_attach(struct cgroup *cgroup,
6551 spin_lock(&mc.lock); 6592 spin_lock(&mc.lock);
6552 mc.from = from; 6593 mc.from = from;
6553 mc.to = memcg; 6594 mc.to = memcg;
6595 mc.immigrate_flags = move_charge_at_immigrate;
6554 spin_unlock(&mc.lock); 6596 spin_unlock(&mc.lock);
6555 /* We set mc.moving_task later */ 6597 /* We set mc.moving_task later */
6556 6598
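mem_cgroup_can_attach() now samples move_charge_at_immigrate once and stashes the snapshot in mc.immigrate_flags, so a migration that has already started is unaffected by later writes to the tunable. A small user-space sketch of that read-once idiom; the DEMO_* bit names only mirror MOVE_CHARGE_TYPE_ANON/FILE by analogy.

#include <stdbool.h>
#include <stdio.h>

#define DEMO_MOVE_ANON 0
#define DEMO_MOVE_FILE 1

/* the tunable another thread may change at any time */
static volatile unsigned long demo_move_charge_at_immigrate = 1UL << DEMO_MOVE_ANON;

struct demo_move_ctx {
	unsigned long immigrate_flags;	/* snapshot, like mc.immigrate_flags */
};

static bool demo_move_anon(const struct demo_move_ctx *mc)
{
	return mc->immigrate_flags & (1UL << DEMO_MOVE_ANON);
}

int main(void)
{
	struct demo_move_ctx mc = {
		/* sample the tunable once; later writers cannot affect this run */
		.immigrate_flags = demo_move_charge_at_immigrate,
	};

	demo_move_charge_at_immigrate = 0;	/* simulated concurrent write */
	printf("move anon pages this migration? %d\n", demo_move_anon(&mc));
	return 0;
}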
@@ -6745,6 +6787,7 @@ struct cgroup_subsys mem_cgroup_subsys = {
6745 .name = "memory", 6787 .name = "memory",
6746 .subsys_id = mem_cgroup_subsys_id, 6788 .subsys_id = mem_cgroup_subsys_id,
6747 .css_alloc = mem_cgroup_css_alloc, 6789 .css_alloc = mem_cgroup_css_alloc,
6790 .css_online = mem_cgroup_css_online,
6748 .css_offline = mem_cgroup_css_offline, 6791 .css_offline = mem_cgroup_css_offline,
6749 .css_free = mem_cgroup_css_free, 6792 .css_free = mem_cgroup_css_free,
6750 .can_attach = mem_cgroup_can_attach, 6793 .can_attach = mem_cgroup_can_attach,
@@ -6755,19 +6798,6 @@ struct cgroup_subsys mem_cgroup_subsys = {
6755 .use_id = 1, 6798 .use_id = 1,
6756}; 6799};
6757 6800
6758/*
6759 * The rest of init is performed during ->css_alloc() for root css which
6760 * happens before initcalls. hotcpu_notifier() can't be done together as
6761 * it would introduce circular locking by adding cgroup_lock -> cpu hotplug
6762 * dependency. Do it from a subsys_initcall().
6763 */
6764static int __init mem_cgroup_init(void)
6765{
6766 hotcpu_notifier(memcg_cpu_hotplug_callback, 0);
6767 return 0;
6768}
6769subsys_initcall(mem_cgroup_init);
6770
6771#ifdef CONFIG_MEMCG_SWAP 6801#ifdef CONFIG_MEMCG_SWAP
6772static int __init enable_swap_account(char *s) 6802static int __init enable_swap_account(char *s)
6773{ 6803{
@@ -6780,4 +6810,39 @@ static int __init enable_swap_account(char *s)
6780} 6810}
6781__setup("swapaccount=", enable_swap_account); 6811__setup("swapaccount=", enable_swap_account);
6782 6812
6813static void __init memsw_file_init(void)
6814{
6815 WARN_ON(cgroup_add_cftypes(&mem_cgroup_subsys, memsw_cgroup_files));
6816}
6817
6818static void __init enable_swap_cgroup(void)
6819{
6820 if (!mem_cgroup_disabled() && really_do_swap_account) {
6821 do_swap_account = 1;
6822 memsw_file_init();
6823 }
6824}
6825
6826#else
6827static void __init enable_swap_cgroup(void)
6828{
6829}
6783#endif 6830#endif
6831
6832/*
6833 * subsys_initcall() for memory controller.
6834 *
6835 * Some parts like hotcpu_notifier() have to be initialized from this context
6836 * because of lock dependencies (cgroup_lock -> cpu hotplug) but basically
6837 * everything that doesn't depend on a specific mem_cgroup structure should
6838 * be initialized from here.
6839 */
6840static int __init mem_cgroup_init(void)
6841{
6842 hotcpu_notifier(memcg_cpu_hotplug_callback, 0);
6843 enable_swap_cgroup();
6844 mem_cgroup_soft_limit_tree_init();
6845 memcg_stock_init();
6846 return 0;
6847}
6848subsys_initcall(mem_cgroup_init);
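With the memsw.* control files split into their own cftype array, mem_cgroup_init() now registers them only when swap accounting ends up enabled. A toy illustration of that conditional registration; the file lists and the registry function below are made up for the example.

#include <stdbool.h>
#include <stdio.h>

static const char *base_files[] = { "usage_in_bytes", "limit_in_bytes", "failcnt" };
static const char *memsw_files[] = { "memsw.usage_in_bytes", "memsw.limit_in_bytes", "memsw.failcnt" };

static void demo_register(const char **files, int n)
{
	for (int i = 0; i < n; i++)
		printf("registered %s\n", files[i]);
}

int main(void)
{
	bool do_swap_account = true;	/* stand-in for the swapaccount= outcome */

	demo_register(base_files, 3);
	if (do_swap_account)		/* only then expose the memsw.* set */
		demo_register(memsw_files, 3);
	return 0;
}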
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index c6e4dd3e1c08..df0694c6adef 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -61,7 +61,7 @@ int sysctl_memory_failure_early_kill __read_mostly = 0;
61 61
62int sysctl_memory_failure_recovery __read_mostly = 1; 62int sysctl_memory_failure_recovery __read_mostly = 1;
63 63
64atomic_long_t mce_bad_pages __read_mostly = ATOMIC_LONG_INIT(0); 64atomic_long_t num_poisoned_pages __read_mostly = ATOMIC_LONG_INIT(0);
65 65
66#if defined(CONFIG_HWPOISON_INJECT) || defined(CONFIG_HWPOISON_INJECT_MODULE) 66#if defined(CONFIG_HWPOISON_INJECT) || defined(CONFIG_HWPOISON_INJECT_MODULE)
67 67
@@ -784,12 +784,12 @@ static struct page_state {
784 { sc|dirty, sc|dirty, "dirty swapcache", me_swapcache_dirty }, 784 { sc|dirty, sc|dirty, "dirty swapcache", me_swapcache_dirty },
785 { sc|dirty, sc, "clean swapcache", me_swapcache_clean }, 785 { sc|dirty, sc, "clean swapcache", me_swapcache_clean },
786 786
787 { unevict|dirty, unevict|dirty, "dirty unevictable LRU", me_pagecache_dirty },
788 { unevict, unevict, "clean unevictable LRU", me_pagecache_clean },
789
790 { mlock|dirty, mlock|dirty, "dirty mlocked LRU", me_pagecache_dirty }, 787 { mlock|dirty, mlock|dirty, "dirty mlocked LRU", me_pagecache_dirty },
791 { mlock, mlock, "clean mlocked LRU", me_pagecache_clean }, 788 { mlock, mlock, "clean mlocked LRU", me_pagecache_clean },
792 789
790 { unevict|dirty, unevict|dirty, "dirty unevictable LRU", me_pagecache_dirty },
791 { unevict, unevict, "clean unevictable LRU", me_pagecache_clean },
792
793 { lru|dirty, lru|dirty, "dirty LRU", me_pagecache_dirty }, 793 { lru|dirty, lru|dirty, "dirty LRU", me_pagecache_dirty },
794 { lru|dirty, lru, "clean LRU", me_pagecache_clean }, 794 { lru|dirty, lru, "clean LRU", me_pagecache_clean },
795 795
@@ -1021,6 +1021,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
1021 struct page *hpage; 1021 struct page *hpage;
1022 int res; 1022 int res;
1023 unsigned int nr_pages; 1023 unsigned int nr_pages;
1024 unsigned long page_flags;
1024 1025
1025 if (!sysctl_memory_failure_recovery) 1026 if (!sysctl_memory_failure_recovery)
1026 panic("Memory failure from trap %d on page %lx", trapno, pfn); 1027 panic("Memory failure from trap %d on page %lx", trapno, pfn);
@@ -1039,8 +1040,18 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
1039 return 0; 1040 return 0;
1040 } 1041 }
1041 1042
1042 nr_pages = 1 << compound_trans_order(hpage); 1043 /*
1043 atomic_long_add(nr_pages, &mce_bad_pages); 1044 * Currently errors on hugetlbfs pages are measured in hugepage units,
1045 * so nr_pages should be 1 << compound_order. OTOH when errors are on
1046 * transparent hugepages, they are supposed to be split and error
1047 * measurement is done in normal page units. So nr_pages should be one
1048 * in this case.
1049 */
1050 if (PageHuge(p))
1051 nr_pages = 1 << compound_order(hpage);
1052 else /* normal page or thp */
1053 nr_pages = 1;
1054 atomic_long_add(nr_pages, &num_poisoned_pages);
1044 1055
1045 /* 1056 /*
1046 * We need/can do nothing about count=0 pages. 1057 * We need/can do nothing about count=0 pages.
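The comment above spells out the new accounting rule: hugetlbfs errors are counted in hugepage units (1 << compound_order), while raw pages and THP count as a single page. A tiny worked example, assuming order-9 huge pages (2 MB with a 4 KiB base page); the helper name is made up for the demo.

#include <stdbool.h>
#include <stdio.h>

static unsigned int demo_nr_pages(bool is_hugetlb, unsigned int order)
{
	return is_hugetlb ? 1u << order : 1;
}

int main(void)
{
	printf("hugetlb order-9: %u pages poisoned\n", demo_nr_pages(true, 9));		/* 512 */
	printf("thp/normal:      %u page poisoned\n", demo_nr_pages(false, 9));		/* 1 */
	return 0;
}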
@@ -1070,7 +1081,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
1070 if (!PageHWPoison(hpage) 1081 if (!PageHWPoison(hpage)
1071 || (hwpoison_filter(p) && TestClearPageHWPoison(p)) 1082 || (hwpoison_filter(p) && TestClearPageHWPoison(p))
1072 || (p != hpage && TestSetPageHWPoison(hpage))) { 1083 || (p != hpage && TestSetPageHWPoison(hpage))) {
1073 atomic_long_sub(nr_pages, &mce_bad_pages); 1084 atomic_long_sub(nr_pages, &num_poisoned_pages);
1074 return 0; 1085 return 0;
1075 } 1086 }
1076 set_page_hwpoison_huge_page(hpage); 1087 set_page_hwpoison_huge_page(hpage);
@@ -1119,6 +1130,15 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
1119 lock_page(hpage); 1130 lock_page(hpage);
1120 1131
1121 /* 1132 /*
1133 * We use page flags to determine what action should be taken, but
1134 * the flags can be modified by the error containment action. One
1135 * example is an mlocked page, where PG_mlocked is cleared by
1136 * page_remove_rmap() in try_to_unmap_one(). So to determine page status
1137 * correctly, we save a copy of the page flags at this time.
1138 */
1139 page_flags = p->flags;
1140
1141 /*
1122 * unpoison always clear PG_hwpoison inside page lock 1142 * unpoison always clear PG_hwpoison inside page lock
1123 */ 1143 */
1124 if (!PageHWPoison(p)) { 1144 if (!PageHWPoison(p)) {
@@ -1128,7 +1148,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
1128 } 1148 }
1129 if (hwpoison_filter(p)) { 1149 if (hwpoison_filter(p)) {
1130 if (TestClearPageHWPoison(p)) 1150 if (TestClearPageHWPoison(p))
1131 atomic_long_sub(nr_pages, &mce_bad_pages); 1151 atomic_long_sub(nr_pages, &num_poisoned_pages);
1132 unlock_page(hpage); 1152 unlock_page(hpage);
1133 put_page(hpage); 1153 put_page(hpage);
1134 return 0; 1154 return 0;
@@ -1176,12 +1196,19 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
1176 } 1196 }
1177 1197
1178 res = -EBUSY; 1198 res = -EBUSY;
1179 for (ps = error_states;; ps++) { 1199 /*
1180 if ((p->flags & ps->mask) == ps->res) { 1200 * The first check uses the current page flags which may not have any
1181 res = page_action(ps, p, pfn); 1201 * relevant information. The second check with the saved page flags is
1202 * carried out only if the first check can't determine the page status.
1203 */
1204 for (ps = error_states;; ps++)
1205 if ((p->flags & ps->mask) == ps->res)
1182 break; 1206 break;
1183 } 1207 if (!ps->mask)
1184 } 1208 for (ps = error_states;; ps++)
1209 if ((page_flags & ps->mask) == ps->res)
1210 break;
1211 res = page_action(ps, p, pfn);
1185out: 1212out:
1186 unlock_page(hpage); 1213 unlock_page(hpage);
1187 return res; 1214 return res;
@@ -1323,7 +1350,7 @@ int unpoison_memory(unsigned long pfn)
1323 return 0; 1350 return 0;
1324 } 1351 }
1325 if (TestClearPageHWPoison(p)) 1352 if (TestClearPageHWPoison(p))
1326 atomic_long_sub(nr_pages, &mce_bad_pages); 1353 atomic_long_sub(nr_pages, &num_poisoned_pages);
1327 pr_info("MCE: Software-unpoisoned free page %#lx\n", pfn); 1354 pr_info("MCE: Software-unpoisoned free page %#lx\n", pfn);
1328 return 0; 1355 return 0;
1329 } 1356 }
@@ -1337,7 +1364,7 @@ int unpoison_memory(unsigned long pfn)
1337 */ 1364 */
1338 if (TestClearPageHWPoison(page)) { 1365 if (TestClearPageHWPoison(page)) {
1339 pr_info("MCE: Software-unpoisoned page %#lx\n", pfn); 1366 pr_info("MCE: Software-unpoisoned page %#lx\n", pfn);
1340 atomic_long_sub(nr_pages, &mce_bad_pages); 1367 atomic_long_sub(nr_pages, &num_poisoned_pages);
1341 freeit = 1; 1368 freeit = 1;
1342 if (PageHuge(page)) 1369 if (PageHuge(page))
1343 clear_page_hwpoison_huge_page(page); 1370 clear_page_hwpoison_huge_page(page);
@@ -1368,7 +1395,7 @@ static struct page *new_page(struct page *p, unsigned long private, int **x)
1368 * that is not free, and 1 for any other page type. 1395 * that is not free, and 1 for any other page type.
1369 * For 1 the page is returned with increased page count, otherwise not. 1396 * For 1 the page is returned with increased page count, otherwise not.
1370 */ 1397 */
1371static int get_any_page(struct page *p, unsigned long pfn, int flags) 1398static int __get_any_page(struct page *p, unsigned long pfn, int flags)
1372{ 1399{
1373 int ret; 1400 int ret;
1374 1401
@@ -1393,11 +1420,9 @@ static int get_any_page(struct page *p, unsigned long pfn, int flags)
1393 if (!get_page_unless_zero(compound_head(p))) { 1420 if (!get_page_unless_zero(compound_head(p))) {
1394 if (PageHuge(p)) { 1421 if (PageHuge(p)) {
1395 pr_info("%s: %#lx free huge page\n", __func__, pfn); 1422 pr_info("%s: %#lx free huge page\n", __func__, pfn);
1396 ret = dequeue_hwpoisoned_huge_page(compound_head(p)); 1423 ret = 0;
1397 } else if (is_free_buddy_page(p)) { 1424 } else if (is_free_buddy_page(p)) {
1398 pr_info("%s: %#lx free buddy page\n", __func__, pfn); 1425 pr_info("%s: %#lx free buddy page\n", __func__, pfn);
1399 /* Set hwpoison bit while page is still isolated */
1400 SetPageHWPoison(p);
1401 ret = 0; 1426 ret = 0;
1402 } else { 1427 } else {
1403 pr_info("%s: %#lx: unknown zero refcount page type %lx\n", 1428 pr_info("%s: %#lx: unknown zero refcount page type %lx\n",
@@ -1413,43 +1438,68 @@ static int get_any_page(struct page *p, unsigned long pfn, int flags)
1413 return ret; 1438 return ret;
1414} 1439}
1415 1440
1441static int get_any_page(struct page *page, unsigned long pfn, int flags)
1442{
1443 int ret = __get_any_page(page, pfn, flags);
1444
1445 if (ret == 1 && !PageHuge(page) && !PageLRU(page)) {
1446 /*
1447 * Try to free it.
1448 */
1449 put_page(page);
1450 shake_page(page, 1);
1451
1452 /*
1453 * Did it turn free?
1454 */
1455 ret = __get_any_page(page, pfn, 0);
1456 if (!PageLRU(page)) {
1457 pr_info("soft_offline: %#lx: unknown non LRU page type %lx\n",
1458 pfn, page->flags);
1459 return -EIO;
1460 }
1461 }
1462 return ret;
1463}
1464
1416static int soft_offline_huge_page(struct page *page, int flags) 1465static int soft_offline_huge_page(struct page *page, int flags)
1417{ 1466{
1418 int ret; 1467 int ret;
1419 unsigned long pfn = page_to_pfn(page); 1468 unsigned long pfn = page_to_pfn(page);
1420 struct page *hpage = compound_head(page); 1469 struct page *hpage = compound_head(page);
1421 1470
1422 ret = get_any_page(page, pfn, flags); 1471 /*
1423 if (ret < 0) 1472 * This double-check of PageHWPoison is to avoid the race with
1424 return ret; 1473 * memory_failure(). See also comment in __soft_offline_page().
1425 if (ret == 0) 1474 */
1426 goto done; 1475 lock_page(hpage);
1427
1428 if (PageHWPoison(hpage)) { 1476 if (PageHWPoison(hpage)) {
1477 unlock_page(hpage);
1429 put_page(hpage); 1478 put_page(hpage);
1430 pr_info("soft offline: %#lx hugepage already poisoned\n", pfn); 1479 pr_info("soft offline: %#lx hugepage already poisoned\n", pfn);
1431 return -EBUSY; 1480 return -EBUSY;
1432 } 1481 }
1482 unlock_page(hpage);
1433 1483
1434 /* Keep page count to indicate a given hugepage is isolated. */ 1484 /* Keep page count to indicate a given hugepage is isolated. */
1435 ret = migrate_huge_page(hpage, new_page, MPOL_MF_MOVE_ALL, false, 1485 ret = migrate_huge_page(hpage, new_page, MPOL_MF_MOVE_ALL,
1436 MIGRATE_SYNC); 1486 MIGRATE_SYNC);
1437 put_page(hpage); 1487 put_page(hpage);
1438 if (ret) { 1488 if (ret) {
1439 pr_info("soft offline: %#lx: migration failed %d, type %lx\n", 1489 pr_info("soft offline: %#lx: migration failed %d, type %lx\n",
1440 pfn, ret, page->flags); 1490 pfn, ret, page->flags);
1441 return ret; 1491 } else {
1442 } 1492 set_page_hwpoison_huge_page(hpage);
1443done: 1493 dequeue_hwpoisoned_huge_page(hpage);
1444 if (!PageHWPoison(hpage))
1445 atomic_long_add(1 << compound_trans_order(hpage), 1494 atomic_long_add(1 << compound_trans_order(hpage),
1446 &mce_bad_pages); 1495 &num_poisoned_pages);
1447 set_page_hwpoison_huge_page(hpage); 1496 }
1448 dequeue_hwpoisoned_huge_page(hpage);
1449 /* keep elevated page count for bad page */ 1497 /* keep elevated page count for bad page */
1450 return ret; 1498 return ret;
1451} 1499}
1452 1500
1501static int __soft_offline_page(struct page *page, int flags);
1502
1453/** 1503/**
1454 * soft_offline_page - Soft offline a page. 1504 * soft_offline_page - Soft offline a page.
1455 * @page: page to offline 1505 * @page: page to offline
@@ -1478,9 +1528,11 @@ int soft_offline_page(struct page *page, int flags)
1478 unsigned long pfn = page_to_pfn(page); 1528 unsigned long pfn = page_to_pfn(page);
1479 struct page *hpage = compound_trans_head(page); 1529 struct page *hpage = compound_trans_head(page);
1480 1530
1481 if (PageHuge(page)) 1531 if (PageHWPoison(page)) {
1482 return soft_offline_huge_page(page, flags); 1532 pr_info("soft offline: %#lx page already poisoned\n", pfn);
1483 if (PageTransHuge(hpage)) { 1533 return -EBUSY;
1534 }
1535 if (!PageHuge(page) && PageTransHuge(hpage)) {
1484 if (PageAnon(hpage) && unlikely(split_huge_page(hpage))) { 1536 if (PageAnon(hpage) && unlikely(split_huge_page(hpage))) {
1485 pr_info("soft offline: %#lx: failed to split THP\n", 1537 pr_info("soft offline: %#lx: failed to split THP\n",
1486 pfn); 1538 pfn);
@@ -1491,47 +1543,45 @@ int soft_offline_page(struct page *page, int flags)
1491 ret = get_any_page(page, pfn, flags); 1543 ret = get_any_page(page, pfn, flags);
1492 if (ret < 0) 1544 if (ret < 0)
1493 return ret; 1545 return ret;
1494 if (ret == 0) 1546 if (ret) { /* for in-use pages */
1495 goto done; 1547 if (PageHuge(page))
1496 1548 ret = soft_offline_huge_page(page, flags);
1497 /* 1549 else
1498 * Page cache page we can handle? 1550 ret = __soft_offline_page(page, flags);
1499 */ 1551 } else { /* for free pages */
1500 if (!PageLRU(page)) { 1552 if (PageHuge(page)) {
1501 /* 1553 set_page_hwpoison_huge_page(hpage);
1502 * Try to free it. 1554 dequeue_hwpoisoned_huge_page(hpage);
1503 */ 1555 atomic_long_add(1 << compound_trans_order(hpage),
1504 put_page(page); 1556 &num_poisoned_pages);
1505 shake_page(page, 1); 1557 } else {
1506 1558 SetPageHWPoison(page);
1507 /* 1559 atomic_long_inc(&num_poisoned_pages);
1508 * Did it turn free? 1560 }
1509 */
1510 ret = get_any_page(page, pfn, 0);
1511 if (ret < 0)
1512 return ret;
1513 if (ret == 0)
1514 goto done;
1515 }
1516 if (!PageLRU(page)) {
1517 pr_info("soft_offline: %#lx: unknown non LRU page type %lx\n",
1518 pfn, page->flags);
1519 return -EIO;
1520 } 1561 }
1562 /* keep elevated page count for bad page */
1563 return ret;
1564}
1521 1565
1522 lock_page(page); 1566static int __soft_offline_page(struct page *page, int flags)
1523 wait_on_page_writeback(page); 1567{
1568 int ret;
1569 unsigned long pfn = page_to_pfn(page);
1524 1570
1525 /* 1571 /*
1526 * Synchronized using the page lock with memory_failure() 1572 * Check PageHWPoison again inside page lock because PageHWPoison
1573 * is set by memory_failure() outside page lock. Note that
1574 * memory_failure() also double-checks PageHWPoison inside page lock,
1575 * so there's no race between soft_offline_page() and memory_failure().
1527 */ 1576 */
1577 lock_page(page);
1578 wait_on_page_writeback(page);
1528 if (PageHWPoison(page)) { 1579 if (PageHWPoison(page)) {
1529 unlock_page(page); 1580 unlock_page(page);
1530 put_page(page); 1581 put_page(page);
1531 pr_info("soft offline: %#lx page already poisoned\n", pfn); 1582 pr_info("soft offline: %#lx page already poisoned\n", pfn);
1532 return -EBUSY; 1583 return -EBUSY;
1533 } 1584 }
1534
1535 /* 1585 /*
1536 * Try to invalidate first. This should work for 1586 * Try to invalidate first. This should work for
1537 * non dirty unmapped page cache pages. 1587 * non dirty unmapped page cache pages.
@@ -1544,9 +1594,10 @@ int soft_offline_page(struct page *page, int flags)
1544 */ 1594 */
1545 if (ret == 1) { 1595 if (ret == 1) {
1546 put_page(page); 1596 put_page(page);
1547 ret = 0;
1548 pr_info("soft_offline: %#lx: invalidated\n", pfn); 1597 pr_info("soft_offline: %#lx: invalidated\n", pfn);
1549 goto done; 1598 SetPageHWPoison(page);
1599 atomic_long_inc(&num_poisoned_pages);
1600 return 0;
1550 } 1601 }
1551 1602
1552 /* 1603 /*
@@ -1563,28 +1614,23 @@ int soft_offline_page(struct page *page, int flags)
1563 if (!ret) { 1614 if (!ret) {
1564 LIST_HEAD(pagelist); 1615 LIST_HEAD(pagelist);
1565 inc_zone_page_state(page, NR_ISOLATED_ANON + 1616 inc_zone_page_state(page, NR_ISOLATED_ANON +
1566 page_is_file_cache(page)); 1617 page_is_file_cache(page));
1567 list_add(&page->lru, &pagelist); 1618 list_add(&page->lru, &pagelist);
1568 ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 1619 ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL,
1569 false, MIGRATE_SYNC, 1620 MIGRATE_SYNC, MR_MEMORY_FAILURE);
1570 MR_MEMORY_FAILURE);
1571 if (ret) { 1621 if (ret) {
1572 putback_lru_pages(&pagelist); 1622 putback_lru_pages(&pagelist);
1573 pr_info("soft offline: %#lx: migration failed %d, type %lx\n", 1623 pr_info("soft offline: %#lx: migration failed %d, type %lx\n",
1574 pfn, ret, page->flags); 1624 pfn, ret, page->flags);
1575 if (ret > 0) 1625 if (ret > 0)
1576 ret = -EIO; 1626 ret = -EIO;
1627 } else {
1628 SetPageHWPoison(page);
1629 atomic_long_inc(&num_poisoned_pages);
1577 } 1630 }
1578 } else { 1631 } else {
1579 pr_info("soft offline: %#lx: isolation failed: %d, page count %d, type %lx\n", 1632 pr_info("soft offline: %#lx: isolation failed: %d, page count %d, type %lx\n",
1580 pfn, ret, page_count(page), page->flags); 1633 pfn, ret, page_count(page), page->flags);
1581 } 1634 }
1582 if (ret)
1583 return ret;
1584
1585done:
1586 atomic_long_add(1, &mce_bad_pages);
1587 SetPageHWPoison(page);
1588 /* keep elevated page count for bad page */
1589 return ret; 1635 return ret;
1590} 1636}
diff --git a/mm/memory.c b/mm/memory.c
index e0a9b0ce4f10..494526ae024a 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -69,6 +69,10 @@
69 69
70#include "internal.h" 70#include "internal.h"
71 71
72#ifdef LAST_NID_NOT_IN_PAGE_FLAGS
73#warning Unfortunate NUMA and NUMA Balancing config, growing page-frame for last_nid.
74#endif
75
72#ifndef CONFIG_NEED_MULTIPLE_NODES 76#ifndef CONFIG_NEED_MULTIPLE_NODES
73/* use the per-pgdat data instead for discontigmem - mbligh */ 77/* use the per-pgdat data instead for discontigmem - mbligh */
74unsigned long max_mapnr; 78unsigned long max_mapnr;
@@ -184,10 +188,14 @@ static int tlb_next_batch(struct mmu_gather *tlb)
184 return 1; 188 return 1;
185 } 189 }
186 190
191 if (tlb->batch_count == MAX_GATHER_BATCH_COUNT)
192 return 0;
193
187 batch = (void *)__get_free_pages(GFP_NOWAIT | __GFP_NOWARN, 0); 194 batch = (void *)__get_free_pages(GFP_NOWAIT | __GFP_NOWARN, 0);
188 if (!batch) 195 if (!batch)
189 return 0; 196 return 0;
190 197
198 tlb->batch_count++;
191 batch->next = NULL; 199 batch->next = NULL;
192 batch->nr = 0; 200 batch->nr = 0;
193 batch->max = MAX_GATHER_BATCH; 201 batch->max = MAX_GATHER_BATCH;
@@ -216,6 +224,7 @@ void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, bool fullmm)
216 tlb->local.nr = 0; 224 tlb->local.nr = 0;
217 tlb->local.max = ARRAY_SIZE(tlb->__pages); 225 tlb->local.max = ARRAY_SIZE(tlb->__pages);
218 tlb->active = &tlb->local; 226 tlb->active = &tlb->local;
227 tlb->batch_count = 0;
219 228
220#ifdef CONFIG_HAVE_RCU_TABLE_FREE 229#ifdef CONFIG_HAVE_RCU_TABLE_FREE
221 tlb->batch = NULL; 230 tlb->batch = NULL;
@@ -711,7 +720,7 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
711 print_symbol(KERN_ALERT "vma->vm_file->f_op->mmap: %s\n", 720 print_symbol(KERN_ALERT "vma->vm_file->f_op->mmap: %s\n",
712 (unsigned long)vma->vm_file->f_op->mmap); 721 (unsigned long)vma->vm_file->f_op->mmap);
713 dump_stack(); 722 dump_stack();
714 add_taint(TAINT_BAD_PAGE); 723 add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
715} 724}
716 725
717static inline bool is_cow_mapping(vm_flags_t flags) 726static inline bool is_cow_mapping(vm_flags_t flags)
@@ -1453,10 +1462,11 @@ int zap_vma_ptes(struct vm_area_struct *vma, unsigned long address,
1453EXPORT_SYMBOL_GPL(zap_vma_ptes); 1462EXPORT_SYMBOL_GPL(zap_vma_ptes);
1454 1463
1455/** 1464/**
1456 * follow_page - look up a page descriptor from a user-virtual address 1465 * follow_page_mask - look up a page descriptor from a user-virtual address
1457 * @vma: vm_area_struct mapping @address 1466 * @vma: vm_area_struct mapping @address
1458 * @address: virtual address to look up 1467 * @address: virtual address to look up
1459 * @flags: flags modifying lookup behaviour 1468 * @flags: flags modifying lookup behaviour
1469 * @page_mask: on output, *page_mask is set according to the size of the page
1460 * 1470 *
1461 * @flags can have FOLL_ flags set, defined in <linux/mm.h> 1471 * @flags can have FOLL_ flags set, defined in <linux/mm.h>
1462 * 1472 *
@@ -1464,8 +1474,9 @@ EXPORT_SYMBOL_GPL(zap_vma_ptes);
1464 * an error pointer if there is a mapping to something not represented 1474 * an error pointer if there is a mapping to something not represented
1465 * by a page descriptor (see also vm_normal_page()). 1475 * by a page descriptor (see also vm_normal_page()).
1466 */ 1476 */
1467struct page *follow_page(struct vm_area_struct *vma, unsigned long address, 1477struct page *follow_page_mask(struct vm_area_struct *vma,
1468 unsigned int flags) 1478 unsigned long address, unsigned int flags,
1479 unsigned int *page_mask)
1469{ 1480{
1470 pgd_t *pgd; 1481 pgd_t *pgd;
1471 pud_t *pud; 1482 pud_t *pud;
@@ -1475,6 +1486,8 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
1475 struct page *page; 1486 struct page *page;
1476 struct mm_struct *mm = vma->vm_mm; 1487 struct mm_struct *mm = vma->vm_mm;
1477 1488
1489 *page_mask = 0;
1490
1478 page = follow_huge_addr(mm, address, flags & FOLL_WRITE); 1491 page = follow_huge_addr(mm, address, flags & FOLL_WRITE);
1479 if (!IS_ERR(page)) { 1492 if (!IS_ERR(page)) {
1480 BUG_ON(flags & FOLL_GET); 1493 BUG_ON(flags & FOLL_GET);
@@ -1521,6 +1534,7 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
1521 page = follow_trans_huge_pmd(vma, address, 1534 page = follow_trans_huge_pmd(vma, address,
1522 pmd, flags); 1535 pmd, flags);
1523 spin_unlock(&mm->page_table_lock); 1536 spin_unlock(&mm->page_table_lock);
1537 *page_mask = HPAGE_PMD_NR - 1;
1524 goto out; 1538 goto out;
1525 } 1539 }
1526 } else 1540 } else
@@ -1534,8 +1548,24 @@ split_fallthrough:
1534 ptep = pte_offset_map_lock(mm, pmd, address, &ptl); 1548 ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
1535 1549
1536 pte = *ptep; 1550 pte = *ptep;
1537 if (!pte_present(pte)) 1551 if (!pte_present(pte)) {
1538 goto no_page; 1552 swp_entry_t entry;
1553 /*
1554 * KSM's break_ksm() relies upon recognizing a ksm page
1555 * even while it is being migrated, so for that case we
1556 * need migration_entry_wait().
1557 */
1558 if (likely(!(flags & FOLL_MIGRATION)))
1559 goto no_page;
1560 if (pte_none(pte) || pte_file(pte))
1561 goto no_page;
1562 entry = pte_to_swp_entry(pte);
1563 if (!is_migration_entry(entry))
1564 goto no_page;
1565 pte_unmap_unlock(ptep, ptl);
1566 migration_entry_wait(mm, pmd, address);
1567 goto split_fallthrough;
1568 }
1539 if ((flags & FOLL_NUMA) && pte_numa(pte)) 1569 if ((flags & FOLL_NUMA) && pte_numa(pte))
1540 goto no_page; 1570 goto no_page;
1541 if ((flags & FOLL_WRITE) && !pte_write(pte)) 1571 if ((flags & FOLL_WRITE) && !pte_write(pte))
@@ -1668,15 +1698,16 @@ static inline int stack_guard_page(struct vm_area_struct *vma, unsigned long add
1668 * instead of __get_user_pages. __get_user_pages should be used only if 1698 * instead of __get_user_pages. __get_user_pages should be used only if
1669 * you need some special @gup_flags. 1699 * you need some special @gup_flags.
1670 */ 1700 */
1671int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, 1701long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1672 unsigned long start, int nr_pages, unsigned int gup_flags, 1702 unsigned long start, unsigned long nr_pages,
1673 struct page **pages, struct vm_area_struct **vmas, 1703 unsigned int gup_flags, struct page **pages,
1674 int *nonblocking) 1704 struct vm_area_struct **vmas, int *nonblocking)
1675{ 1705{
1676 int i; 1706 long i;
1677 unsigned long vm_flags; 1707 unsigned long vm_flags;
1708 unsigned int page_mask;
1678 1709
1679 if (nr_pages <= 0) 1710 if (!nr_pages)
1680 return 0; 1711 return 0;
1681 1712
1682 VM_BUG_ON(!!pages != !!(gup_flags & FOLL_GET)); 1713 VM_BUG_ON(!!pages != !!(gup_flags & FOLL_GET));
@@ -1752,6 +1783,7 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1752 get_page(page); 1783 get_page(page);
1753 } 1784 }
1754 pte_unmap(pte); 1785 pte_unmap(pte);
1786 page_mask = 0;
1755 goto next_page; 1787 goto next_page;
1756 } 1788 }
1757 1789
@@ -1769,6 +1801,7 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1769 do { 1801 do {
1770 struct page *page; 1802 struct page *page;
1771 unsigned int foll_flags = gup_flags; 1803 unsigned int foll_flags = gup_flags;
1804 unsigned int page_increm;
1772 1805
1773 /* 1806 /*
1774 * If we have a pending SIGKILL, don't keep faulting 1807 * If we have a pending SIGKILL, don't keep faulting
@@ -1778,7 +1811,8 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1778 return i ? i : -ERESTARTSYS; 1811 return i ? i : -ERESTARTSYS;
1779 1812
1780 cond_resched(); 1813 cond_resched();
1781 while (!(page = follow_page(vma, start, foll_flags))) { 1814 while (!(page = follow_page_mask(vma, start,
1815 foll_flags, &page_mask))) {
1782 int ret; 1816 int ret;
1783 unsigned int fault_flags = 0; 1817 unsigned int fault_flags = 0;
1784 1818
@@ -1852,13 +1886,19 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1852 1886
1853 flush_anon_page(vma, page, start); 1887 flush_anon_page(vma, page, start);
1854 flush_dcache_page(page); 1888 flush_dcache_page(page);
1889 page_mask = 0;
1855 } 1890 }
1856next_page: 1891next_page:
1857 if (vmas) 1892 if (vmas) {
1858 vmas[i] = vma; 1893 vmas[i] = vma;
1859 i++; 1894 page_mask = 0;
1860 start += PAGE_SIZE; 1895 }
1861 nr_pages--; 1896 page_increm = 1 + (~(start >> PAGE_SHIFT) & page_mask);
1897 if (page_increm > nr_pages)
1898 page_increm = nr_pages;
1899 i += page_increm;
1900 start += page_increm * PAGE_SIZE;
1901 nr_pages -= page_increm;
1862 } while (nr_pages && start < vma->vm_end); 1902 } while (nr_pages && start < vma->vm_end);
1863 } while (nr_pages); 1903 } while (nr_pages);
1864 return i; 1904 return i;
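The batching added above hinges on page_increm = 1 + (~(start >> PAGE_SHIFT) & page_mask): for an ordinary page the mask is 0 and the step is 1, while for a transparent huge page the mask is HPAGE_PMD_NR - 1 and the step is the number of subpages left before the next huge-page boundary. A standalone sketch of that arithmetic (PAGE_SHIFT and HPAGE_PMD_NR use x86-64 values purely for illustration):

#include <stdio.h>

#define PAGE_SHIFT   12             /* x86-64 values, for illustration only */
#define HPAGE_PMD_NR 512            /* 2MB huge page = 512 base pages */

static unsigned long page_increm(unsigned long start, unsigned int page_mask)
{
	/* base pages from 'start' up to and including the end of the current
	 * naturally aligned block described by page_mask */
	return 1 + (~(start >> PAGE_SHIFT) & page_mask);
}

int main(void)
{
	unsigned int thp_mask = HPAGE_PMD_NR - 1;                  /* 511 */

	printf("%lu\n", page_increm(0x200000, thp_mask));          /* 512: at a 2MB boundary */
	printf("%lu\n", page_increm(0x200000 + 4096, thp_mask));   /* 511: one subpage in    */
	printf("%lu\n", page_increm(0x3ff000, thp_mask));          /* 1:   last subpage      */
	printf("%lu\n", page_increm(0x123000, 0));                 /* 1:   ordinary page     */
	return 0;
}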
@@ -1972,9 +2012,9 @@ int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
1972 * 2012 *
1973 * See also get_user_pages_fast, for performance critical applications. 2013 * See also get_user_pages_fast, for performance critical applications.
1974 */ 2014 */
1975int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, 2015long get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1976 unsigned long start, int nr_pages, int write, int force, 2016 unsigned long start, unsigned long nr_pages, int write,
1977 struct page **pages, struct vm_area_struct **vmas) 2017 int force, struct page **pages, struct vm_area_struct **vmas)
1978{ 2018{
1979 int flags = FOLL_TOUCH; 2019 int flags = FOLL_TOUCH;
1980 2020
@@ -2914,7 +2954,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
2914 unsigned int flags, pte_t orig_pte) 2954 unsigned int flags, pte_t orig_pte)
2915{ 2955{
2916 spinlock_t *ptl; 2956 spinlock_t *ptl;
2917 struct page *page, *swapcache = NULL; 2957 struct page *page, *swapcache;
2918 swp_entry_t entry; 2958 swp_entry_t entry;
2919 pte_t pte; 2959 pte_t pte;
2920 int locked; 2960 int locked;
@@ -2965,9 +3005,11 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
2965 */ 3005 */
2966 ret = VM_FAULT_HWPOISON; 3006 ret = VM_FAULT_HWPOISON;
2967 delayacct_clear_flag(DELAYACCT_PF_SWAPIN); 3007 delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
3008 swapcache = page;
2968 goto out_release; 3009 goto out_release;
2969 } 3010 }
2970 3011
3012 swapcache = page;
2971 locked = lock_page_or_retry(page, mm, flags); 3013 locked = lock_page_or_retry(page, mm, flags);
2972 3014
2973 delayacct_clear_flag(DELAYACCT_PF_SWAPIN); 3015 delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
@@ -2985,16 +3027,11 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
2985 if (unlikely(!PageSwapCache(page) || page_private(page) != entry.val)) 3027 if (unlikely(!PageSwapCache(page) || page_private(page) != entry.val))
2986 goto out_page; 3028 goto out_page;
2987 3029
2988 if (ksm_might_need_to_copy(page, vma, address)) { 3030 page = ksm_might_need_to_copy(page, vma, address);
2989 swapcache = page; 3031 if (unlikely(!page)) {
2990 page = ksm_does_need_to_copy(page, vma, address); 3032 ret = VM_FAULT_OOM;
2991 3033 page = swapcache;
2992 if (unlikely(!page)) { 3034 goto out_page;
2993 ret = VM_FAULT_OOM;
2994 page = swapcache;
2995 swapcache = NULL;
2996 goto out_page;
2997 }
2998 } 3035 }
2999 3036
3000 if (mem_cgroup_try_charge_swapin(mm, page, GFP_KERNEL, &ptr)) { 3037 if (mem_cgroup_try_charge_swapin(mm, page, GFP_KERNEL, &ptr)) {
@@ -3039,7 +3076,10 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
3039 } 3076 }
3040 flush_icache_page(vma, page); 3077 flush_icache_page(vma, page);
3041 set_pte_at(mm, address, page_table, pte); 3078 set_pte_at(mm, address, page_table, pte);
3042 do_page_add_anon_rmap(page, vma, address, exclusive); 3079 if (page == swapcache)
3080 do_page_add_anon_rmap(page, vma, address, exclusive);
3081 else /* ksm created a completely new copy */
3082 page_add_new_anon_rmap(page, vma, address);
3043 /* It's better to call commit-charge after rmap is established */ 3083 /* It's better to call commit-charge after rmap is established */
3044 mem_cgroup_commit_charge_swapin(page, ptr); 3084 mem_cgroup_commit_charge_swapin(page, ptr);
3045 3085
@@ -3047,7 +3087,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
3047 if (vm_swap_full() || (vma->vm_flags & VM_LOCKED) || PageMlocked(page)) 3087 if (vm_swap_full() || (vma->vm_flags & VM_LOCKED) || PageMlocked(page))
3048 try_to_free_swap(page); 3088 try_to_free_swap(page);
3049 unlock_page(page); 3089 unlock_page(page);
3050 if (swapcache) { 3090 if (page != swapcache) {
3051 /* 3091 /*
3052 * Hold the lock to avoid the swap entry to be reused 3092 * Hold the lock to avoid the swap entry to be reused
3053 * until we take the PT lock for the pte_same() check 3093 * until we take the PT lock for the pte_same() check
@@ -3080,7 +3120,7 @@ out_page:
3080 unlock_page(page); 3120 unlock_page(page);
3081out_release: 3121out_release:
3082 page_cache_release(page); 3122 page_cache_release(page);
3083 if (swapcache) { 3123 if (page != swapcache) {
3084 unlock_page(swapcache); 3124 unlock_page(swapcache);
3085 page_cache_release(swapcache); 3125 page_cache_release(swapcache);
3086 } 3126 }
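With ksm_might_need_to_copy() now returning either the original page or a fresh copy, do_swap_page() distinguishes the two purely by pointer comparison: page == swapcache means the swapcache page itself is being mapped, while page != swapcache means a private KSM copy needs its own rmap and the original still has to be unlocked and released. A toy sketch of that pointer convention using plain heap objects (maybe_copy() is an invented stand-in, not the kernel helper):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct buf { char data[16]; };

/* invented stand-in for ksm_might_need_to_copy(): hand back either the
 * original object or a freshly allocated copy (NULL models VM_FAULT_OOM) */
static struct buf *maybe_copy(struct buf *orig, int need_copy)
{
	struct buf *copy;

	if (!need_copy)
		return orig;
	copy = malloc(sizeof(*copy));
	if (!copy)
		return NULL;
	memcpy(copy, orig, sizeof(*copy));
	return copy;
}

int main(void)
{
	struct buf original = { "swapcache" };
	struct buf *swapcache = &original;
	struct buf *page = maybe_copy(swapcache, 1);

	if (!page)
		return 1;                         /* the OOM path */
	if (page != swapcache) {
		/* mirrors the "ksm created a completely new copy" branch: the
		 * copy gets its own mapping, the original is released separately */
		printf("private copy: release the swapcache page as well\n");
		free(page);
	} else {
		printf("still mapping the swapcache page\n");
	}
	return 0;
}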
@@ -3706,6 +3746,14 @@ retry:
3706 if (pmd_trans_huge(orig_pmd)) { 3746 if (pmd_trans_huge(orig_pmd)) {
3707 unsigned int dirty = flags & FAULT_FLAG_WRITE; 3747 unsigned int dirty = flags & FAULT_FLAG_WRITE;
3708 3748
3749 /*
 3750 * If the pmd is splitting, return and retry the fault.
 3751 * Alternative: wait until the split
3752 * is done, and goto retry.
3753 */
3754 if (pmd_trans_splitting(orig_pmd))
3755 return 0;
3756
3709 if (pmd_numa(orig_pmd)) 3757 if (pmd_numa(orig_pmd))
3710 return do_huge_pmd_numa_page(mm, vma, address, 3758 return do_huge_pmd_numa_page(mm, vma, address,
3711 orig_pmd, pmd); 3759 orig_pmd, pmd);
@@ -3808,30 +3856,6 @@ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
3808} 3856}
3809#endif /* __PAGETABLE_PMD_FOLDED */ 3857#endif /* __PAGETABLE_PMD_FOLDED */
3810 3858
3811int make_pages_present(unsigned long addr, unsigned long end)
3812{
3813 int ret, len, write;
3814 struct vm_area_struct * vma;
3815
3816 vma = find_vma(current->mm, addr);
3817 if (!vma)
3818 return -ENOMEM;
3819 /*
3820 * We want to touch writable mappings with a write fault in order
3821 * to break COW, except for shared mappings because these don't COW
3822 * and we would not want to dirty them for nothing.
3823 */
3824 write = (vma->vm_flags & (VM_WRITE | VM_SHARED)) == VM_WRITE;
3825 BUG_ON(addr >= end);
3826 BUG_ON(end > vma->vm_end);
3827 len = DIV_ROUND_UP(end, PAGE_SIZE) - addr/PAGE_SIZE;
3828 ret = get_user_pages(current, current->mm, addr,
3829 len, write, 0, NULL, NULL);
3830 if (ret < 0)
3831 return ret;
3832 return ret == len ? 0 : -EFAULT;
3833}
3834
3835#if !defined(__HAVE_ARCH_GATE_AREA) 3859#if !defined(__HAVE_ARCH_GATE_AREA)
3836 3860
3837#if defined(AT_SYSINFO_EHDR) 3861#if defined(AT_SYSINFO_EHDR)
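The deleted make_pages_present() only computed how many pages the range spans and handed that count to get_user_pages() with a write fault for private writable mappings; its callers now do the equivalent themselves. A standalone sketch of just the page-count arithmetic it used (PAGE_SIZE assumed to be 4096 for illustration):

#include <stdio.h>

#define PAGE_SIZE 4096UL            /* assumed page size, for illustration only */

/* number of pages spanned by [addr, end): the DIV_ROUND_UP(end, PAGE_SIZE)
 * minus addr/PAGE_SIZE arithmetic the removed wrapper performed before
 * handing the range to get_user_pages() */
static long pages_in_range(unsigned long addr, unsigned long end)
{
	return (long)((end + PAGE_SIZE - 1) / PAGE_SIZE - addr / PAGE_SIZE);
}

int main(void)
{
	printf("%ld\n", pages_in_range(0x1000, 0x3000));   /* 2 */
	printf("%ld\n", pages_in_range(0x1000, 0x3001));   /* 3: partial last page counts */
	printf("%ld\n", pages_in_range(0x1800, 0x1900));   /* 1: sub-page range, one page */
	return 0;
}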
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index d04ed87bfacb..b81a367b9f39 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -29,6 +29,7 @@
29#include <linux/suspend.h> 29#include <linux/suspend.h>
30#include <linux/mm_inline.h> 30#include <linux/mm_inline.h>
31#include <linux/firmware-map.h> 31#include <linux/firmware-map.h>
32#include <linux/stop_machine.h>
32 33
33#include <asm/tlbflush.h> 34#include <asm/tlbflush.h>
34 35
@@ -91,9 +92,8 @@ static void release_memory_resource(struct resource *res)
91} 92}
92 93
93#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE 94#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
94#ifndef CONFIG_SPARSEMEM_VMEMMAP 95void get_page_bootmem(unsigned long info, struct page *page,
95static void get_page_bootmem(unsigned long info, struct page *page, 96 unsigned long type)
96 unsigned long type)
97{ 97{
98 page->lru.next = (struct list_head *) type; 98 page->lru.next = (struct list_head *) type;
99 SetPagePrivate(page); 99 SetPagePrivate(page);
@@ -124,10 +124,13 @@ void __ref put_page_bootmem(struct page *page)
124 mutex_lock(&ppb_lock); 124 mutex_lock(&ppb_lock);
125 __free_pages_bootmem(page, 0); 125 __free_pages_bootmem(page, 0);
126 mutex_unlock(&ppb_lock); 126 mutex_unlock(&ppb_lock);
127 totalram_pages++;
127 } 128 }
128 129
129} 130}
130 131
132#ifdef CONFIG_HAVE_BOOTMEM_INFO_NODE
133#ifndef CONFIG_SPARSEMEM_VMEMMAP
131static void register_page_bootmem_info_section(unsigned long start_pfn) 134static void register_page_bootmem_info_section(unsigned long start_pfn)
132{ 135{
133 unsigned long *usemap, mapsize, section_nr, i; 136 unsigned long *usemap, mapsize, section_nr, i;
@@ -161,6 +164,32 @@ static void register_page_bootmem_info_section(unsigned long start_pfn)
161 get_page_bootmem(section_nr, page, MIX_SECTION_INFO); 164 get_page_bootmem(section_nr, page, MIX_SECTION_INFO);
162 165
163} 166}
167#else /* CONFIG_SPARSEMEM_VMEMMAP */
168static void register_page_bootmem_info_section(unsigned long start_pfn)
169{
170 unsigned long *usemap, mapsize, section_nr, i;
171 struct mem_section *ms;
172 struct page *page, *memmap;
173
174 if (!pfn_valid(start_pfn))
175 return;
176
177 section_nr = pfn_to_section_nr(start_pfn);
178 ms = __nr_to_section(section_nr);
179
180 memmap = sparse_decode_mem_map(ms->section_mem_map, section_nr);
181
182 register_page_bootmem_memmap(section_nr, memmap, PAGES_PER_SECTION);
183
184 usemap = __nr_to_section(section_nr)->pageblock_flags;
185 page = virt_to_page(usemap);
186
187 mapsize = PAGE_ALIGN(usemap_size()) >> PAGE_SHIFT;
188
189 for (i = 0; i < mapsize; i++, page++)
190 get_page_bootmem(section_nr, page, MIX_SECTION_INFO);
191}
192#endif /* !CONFIG_SPARSEMEM_VMEMMAP */
164 193
165void register_page_bootmem_info_node(struct pglist_data *pgdat) 194void register_page_bootmem_info_node(struct pglist_data *pgdat)
166{ 195{
@@ -189,7 +218,7 @@ void register_page_bootmem_info_node(struct pglist_data *pgdat)
189 } 218 }
190 219
191 pfn = pgdat->node_start_pfn; 220 pfn = pgdat->node_start_pfn;
192 end_pfn = pfn + pgdat->node_spanned_pages; 221 end_pfn = pgdat_end_pfn(pgdat);
193 222
194 /* register_section info */ 223 /* register_section info */
195 for (; pfn < end_pfn; pfn += PAGES_PER_SECTION) { 224 for (; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
@@ -203,7 +232,7 @@ void register_page_bootmem_info_node(struct pglist_data *pgdat)
203 register_page_bootmem_info_section(pfn); 232 register_page_bootmem_info_section(pfn);
204 } 233 }
205} 234}
206#endif /* !CONFIG_SPARSEMEM_VMEMMAP */ 235#endif /* CONFIG_HAVE_BOOTMEM_INFO_NODE */
207 236
208static void grow_zone_span(struct zone *zone, unsigned long start_pfn, 237static void grow_zone_span(struct zone *zone, unsigned long start_pfn,
209 unsigned long end_pfn) 238 unsigned long end_pfn)
@@ -253,6 +282,17 @@ static void fix_zone_id(struct zone *zone, unsigned long start_pfn,
253 set_page_links(pfn_to_page(pfn), zid, nid, pfn); 282 set_page_links(pfn_to_page(pfn), zid, nid, pfn);
254} 283}
255 284
285/* Can fail with -ENOMEM from allocating a wait table with vmalloc() or
286 * alloc_bootmem_node_nopanic() */
287static int __ref ensure_zone_is_initialized(struct zone *zone,
288 unsigned long start_pfn, unsigned long num_pages)
289{
290 if (!zone_is_initialized(zone))
291 return init_currently_empty_zone(zone, start_pfn, num_pages,
292 MEMMAP_HOTPLUG);
293 return 0;
294}
295
256static int __meminit move_pfn_range_left(struct zone *z1, struct zone *z2, 296static int __meminit move_pfn_range_left(struct zone *z1, struct zone *z2,
257 unsigned long start_pfn, unsigned long end_pfn) 297 unsigned long start_pfn, unsigned long end_pfn)
258{ 298{
@@ -260,17 +300,14 @@ static int __meminit move_pfn_range_left(struct zone *z1, struct zone *z2,
260 unsigned long flags; 300 unsigned long flags;
261 unsigned long z1_start_pfn; 301 unsigned long z1_start_pfn;
262 302
263 if (!z1->wait_table) { 303 ret = ensure_zone_is_initialized(z1, start_pfn, end_pfn - start_pfn);
264 ret = init_currently_empty_zone(z1, start_pfn, 304 if (ret)
265 end_pfn - start_pfn, MEMMAP_HOTPLUG); 305 return ret;
266 if (ret)
267 return ret;
268 }
269 306
270 pgdat_resize_lock(z1->zone_pgdat, &flags); 307 pgdat_resize_lock(z1->zone_pgdat, &flags);
271 308
272 /* can't move pfns which are higher than @z2 */ 309 /* can't move pfns which are higher than @z2 */
273 if (end_pfn > z2->zone_start_pfn + z2->spanned_pages) 310 if (end_pfn > zone_end_pfn(z2))
274 goto out_fail; 311 goto out_fail;
275 /* the move out part mast at the left most of @z2 */ 312 /* the move out part mast at the left most of @z2 */
276 if (start_pfn > z2->zone_start_pfn) 313 if (start_pfn > z2->zone_start_pfn)
@@ -286,7 +323,7 @@ static int __meminit move_pfn_range_left(struct zone *z1, struct zone *z2,
286 z1_start_pfn = start_pfn; 323 z1_start_pfn = start_pfn;
287 324
288 resize_zone(z1, z1_start_pfn, end_pfn); 325 resize_zone(z1, z1_start_pfn, end_pfn);
289 resize_zone(z2, end_pfn, z2->zone_start_pfn + z2->spanned_pages); 326 resize_zone(z2, end_pfn, zone_end_pfn(z2));
290 327
291 pgdat_resize_unlock(z1->zone_pgdat, &flags); 328 pgdat_resize_unlock(z1->zone_pgdat, &flags);
292 329
@@ -305,12 +342,9 @@ static int __meminit move_pfn_range_right(struct zone *z1, struct zone *z2,
305 unsigned long flags; 342 unsigned long flags;
306 unsigned long z2_end_pfn; 343 unsigned long z2_end_pfn;
307 344
308 if (!z2->wait_table) { 345 ret = ensure_zone_is_initialized(z2, start_pfn, end_pfn - start_pfn);
309 ret = init_currently_empty_zone(z2, start_pfn, 346 if (ret)
310 end_pfn - start_pfn, MEMMAP_HOTPLUG); 347 return ret;
311 if (ret)
312 return ret;
313 }
314 348
315 pgdat_resize_lock(z1->zone_pgdat, &flags); 349 pgdat_resize_lock(z1->zone_pgdat, &flags);
316 350
@@ -318,15 +352,15 @@ static int __meminit move_pfn_range_right(struct zone *z1, struct zone *z2,
318 if (z1->zone_start_pfn > start_pfn) 352 if (z1->zone_start_pfn > start_pfn)
319 goto out_fail; 353 goto out_fail;
320 /* the move out part mast at the right most of @z1 */ 354 /* the move out part mast at the right most of @z1 */
321 if (z1->zone_start_pfn + z1->spanned_pages > end_pfn) 355 if (zone_end_pfn(z1) > end_pfn)
322 goto out_fail; 356 goto out_fail;
323 /* must included/overlap */ 357 /* must included/overlap */
324 if (start_pfn >= z1->zone_start_pfn + z1->spanned_pages) 358 if (start_pfn >= zone_end_pfn(z1))
325 goto out_fail; 359 goto out_fail;
326 360
327 /* use end_pfn for z2's end_pfn if z2 is empty */ 361 /* use end_pfn for z2's end_pfn if z2 is empty */
328 if (z2->spanned_pages) 362 if (z2->spanned_pages)
329 z2_end_pfn = z2->zone_start_pfn + z2->spanned_pages; 363 z2_end_pfn = zone_end_pfn(z2);
330 else 364 else
331 z2_end_pfn = end_pfn; 365 z2_end_pfn = end_pfn;
332 366
@@ -363,16 +397,13 @@ static int __meminit __add_zone(struct zone *zone, unsigned long phys_start_pfn)
363 int nid = pgdat->node_id; 397 int nid = pgdat->node_id;
364 int zone_type; 398 int zone_type;
365 unsigned long flags; 399 unsigned long flags;
400 int ret;
366 401
367 zone_type = zone - pgdat->node_zones; 402 zone_type = zone - pgdat->node_zones;
368 if (!zone->wait_table) { 403 ret = ensure_zone_is_initialized(zone, phys_start_pfn, nr_pages);
369 int ret; 404 if (ret)
405 return ret;
370 406
371 ret = init_currently_empty_zone(zone, phys_start_pfn,
372 nr_pages, MEMMAP_HOTPLUG);
373 if (ret)
374 return ret;
375 }
376 pgdat_resize_lock(zone->zone_pgdat, &flags); 407 pgdat_resize_lock(zone->zone_pgdat, &flags);
377 grow_zone_span(zone, phys_start_pfn, phys_start_pfn + nr_pages); 408 grow_zone_span(zone, phys_start_pfn, phys_start_pfn + nr_pages);
378 grow_pgdat_span(zone->zone_pgdat, phys_start_pfn, 409 grow_pgdat_span(zone->zone_pgdat, phys_start_pfn,
@@ -405,20 +436,211 @@ static int __meminit __add_section(int nid, struct zone *zone,
405 return register_new_memory(nid, __pfn_to_section(phys_start_pfn)); 436 return register_new_memory(nid, __pfn_to_section(phys_start_pfn));
406} 437}
407 438
408#ifdef CONFIG_SPARSEMEM_VMEMMAP 439/* find the smallest valid pfn in the range [start_pfn, end_pfn) */
409static int __remove_section(struct zone *zone, struct mem_section *ms) 440static int find_smallest_section_pfn(int nid, struct zone *zone,
441 unsigned long start_pfn,
442 unsigned long end_pfn)
443{
444 struct mem_section *ms;
445
446 for (; start_pfn < end_pfn; start_pfn += PAGES_PER_SECTION) {
447 ms = __pfn_to_section(start_pfn);
448
449 if (unlikely(!valid_section(ms)))
450 continue;
451
452 if (unlikely(pfn_to_nid(start_pfn) != nid))
453 continue;
454
455 if (zone && zone != page_zone(pfn_to_page(start_pfn)))
456 continue;
457
458 return start_pfn;
459 }
460
461 return 0;
462}
463
464/* find the biggest valid pfn in the range [start_pfn, end_pfn). */
465static int find_biggest_section_pfn(int nid, struct zone *zone,
466 unsigned long start_pfn,
467 unsigned long end_pfn)
468{
469 struct mem_section *ms;
470 unsigned long pfn;
471
472 /* pfn is the end pfn of a memory section. */
473 pfn = end_pfn - 1;
474 for (; pfn >= start_pfn; pfn -= PAGES_PER_SECTION) {
475 ms = __pfn_to_section(pfn);
476
477 if (unlikely(!valid_section(ms)))
478 continue;
479
480 if (unlikely(pfn_to_nid(pfn) != nid))
481 continue;
482
483 if (zone && zone != page_zone(pfn_to_page(pfn)))
484 continue;
485
486 return pfn;
487 }
488
489 return 0;
490}
491
492static void shrink_zone_span(struct zone *zone, unsigned long start_pfn,
493 unsigned long end_pfn)
410{ 494{
495 unsigned long zone_start_pfn = zone->zone_start_pfn;
496 unsigned long zone_end_pfn = zone->zone_start_pfn + zone->spanned_pages;
497 unsigned long pfn;
498 struct mem_section *ms;
499 int nid = zone_to_nid(zone);
500
501 zone_span_writelock(zone);
502 if (zone_start_pfn == start_pfn) {
503 /*
 504 * If the section is the smallest section in the zone, we need to
 505 * shrink zone->zone_start_pfn and zone->spanned_pages.
 506 * In this case, we find the second smallest valid mem_section
 507 * for shrinking the zone.
508 */
509 pfn = find_smallest_section_pfn(nid, zone, end_pfn,
510 zone_end_pfn);
511 if (pfn) {
512 zone->zone_start_pfn = pfn;
513 zone->spanned_pages = zone_end_pfn - pfn;
514 }
515 } else if (zone_end_pfn == end_pfn) {
516 /*
 517 * If the section is the biggest section in the zone, we need to
 518 * shrink zone->spanned_pages.
 519 * In this case, we find the second biggest valid mem_section for
 520 * shrinking the zone.
521 */
522 pfn = find_biggest_section_pfn(nid, zone, zone_start_pfn,
523 start_pfn);
524 if (pfn)
525 zone->spanned_pages = pfn - zone_start_pfn + 1;
526 }
527
411 /* 528 /*
412 * XXX: Freeing memmap with vmemmap is not implement yet. 529 * The section is not biggest or smallest mem_section in the zone, it
413 * This should be removed later. 530 * only creates a hole in the zone. So in this case, we need not
 531 * change the zone. But perhaps the zone contains only holes, so
 532 * we check whether the zone still has any valid section.
414 */ 533 */
415 return -EBUSY; 534 pfn = zone_start_pfn;
535 for (; pfn < zone_end_pfn; pfn += PAGES_PER_SECTION) {
536 ms = __pfn_to_section(pfn);
537
538 if (unlikely(!valid_section(ms)))
539 continue;
540
541 if (page_zone(pfn_to_page(pfn)) != zone)
542 continue;
543
 544 /* If this is the section being removed, continue the loop */
545 if (start_pfn == pfn)
546 continue;
547
548 /* If we find valid section, we have nothing to do */
549 zone_span_writeunlock(zone);
550 return;
551 }
552
553 /* The zone has no valid section */
554 zone->zone_start_pfn = 0;
555 zone->spanned_pages = 0;
556 zone_span_writeunlock(zone);
416} 557}
417#else 558
418static int __remove_section(struct zone *zone, struct mem_section *ms) 559static void shrink_pgdat_span(struct pglist_data *pgdat,
560 unsigned long start_pfn, unsigned long end_pfn)
561{
562 unsigned long pgdat_start_pfn = pgdat->node_start_pfn;
563 unsigned long pgdat_end_pfn =
564 pgdat->node_start_pfn + pgdat->node_spanned_pages;
565 unsigned long pfn;
566 struct mem_section *ms;
567 int nid = pgdat->node_id;
568
569 if (pgdat_start_pfn == start_pfn) {
570 /*
571 * If the section is smallest section in the pgdat, it need
572 * shrink pgdat->node_start_pfn and pgdat->node_spanned_pages.
573 * In this case, we find second smallest valid mem_section
574 * for shrinking zone.
575 */
576 pfn = find_smallest_section_pfn(nid, NULL, end_pfn,
577 pgdat_end_pfn);
578 if (pfn) {
579 pgdat->node_start_pfn = pfn;
580 pgdat->node_spanned_pages = pgdat_end_pfn - pfn;
581 }
582 } else if (pgdat_end_pfn == end_pfn) {
583 /*
 584 * If the section is the biggest section in the pgdat, we need to
 585 * shrink pgdat->node_spanned_pages.
 586 * In this case, we find the second biggest valid mem_section for
 587 * shrinking the pgdat.
588 */
589 pfn = find_biggest_section_pfn(nid, NULL, pgdat_start_pfn,
590 start_pfn);
591 if (pfn)
592 pgdat->node_spanned_pages = pfn - pgdat_start_pfn + 1;
593 }
594
595 /*
 596 * If the section is not the biggest or smallest mem_section in the
 597 * pgdat, it only creates a hole in the pgdat. So in this case, we
 598 * need not change the pgdat.
 599 * But perhaps the pgdat contains only holes, so we check whether the
 600 * pgdat still has any valid section.
601 */
602 pfn = pgdat_start_pfn;
603 for (; pfn < pgdat_end_pfn; pfn += PAGES_PER_SECTION) {
604 ms = __pfn_to_section(pfn);
605
606 if (unlikely(!valid_section(ms)))
607 continue;
608
609 if (pfn_to_nid(pfn) != nid)
610 continue;
611
 613 /* If this is the section being removed, continue the loop */
613 if (start_pfn == pfn)
614 continue;
615
616 /* If we find valid section, we have nothing to do */
617 return;
618 }
619
620 /* The pgdat has no valid section */
621 pgdat->node_start_pfn = 0;
622 pgdat->node_spanned_pages = 0;
623}
624
625static void __remove_zone(struct zone *zone, unsigned long start_pfn)
419{ 626{
420 unsigned long flags;
421 struct pglist_data *pgdat = zone->zone_pgdat; 627 struct pglist_data *pgdat = zone->zone_pgdat;
628 int nr_pages = PAGES_PER_SECTION;
629 int zone_type;
630 unsigned long flags;
631
632 zone_type = zone - pgdat->node_zones;
633
634 pgdat_resize_lock(zone->zone_pgdat, &flags);
635 shrink_zone_span(zone, start_pfn, start_pfn + nr_pages);
636 shrink_pgdat_span(pgdat, start_pfn, start_pfn + nr_pages);
637 pgdat_resize_unlock(zone->zone_pgdat, &flags);
638}
639
640static int __remove_section(struct zone *zone, struct mem_section *ms)
641{
642 unsigned long start_pfn;
643 int scn_nr;
422 int ret = -EINVAL; 644 int ret = -EINVAL;
423 645
424 if (!valid_section(ms)) 646 if (!valid_section(ms))
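The helpers above shrink a zone or pgdat span by scanning section-aligned pfns for the first (or last) section that is still valid and still belongs to the right node and zone. A userspace model of that scan, with an invented validity table in place of the kernel's mem_section array and an assumed PAGES_PER_SECTION; find_biggest() iterates section numbers with a signed index to sidestep unsigned wrap in the sketch:

#include <stdio.h>
#include <stdbool.h>

#define PAGES_PER_SECTION 32768UL   /* 128MB sections with 4KB pages, x86-64 assumption */
#define NR_SECTIONS 8

/* stub: which sections still hold valid memory after a removal (made up) */
static const bool section_valid[NR_SECTIONS] = {
	false, false, true, true, false, true, false, false
};

static bool valid_section_pfn(unsigned long pfn)
{
	unsigned long nr = pfn / PAGES_PER_SECTION;
	return nr < NR_SECTIONS && section_valid[nr];
}

/* first valid section-aligned pfn in [start_pfn, end_pfn); 0 means "none",
 * the same convention the kernel helpers use */
static unsigned long find_smallest(unsigned long start_pfn, unsigned long end_pfn)
{
	for (; start_pfn < end_pfn; start_pfn += PAGES_PER_SECTION)
		if (valid_section_pfn(start_pfn))
			return start_pfn;
	return 0;
}

/* pfn of the last valid section in [start_pfn, end_pfn) */
static unsigned long find_biggest(unsigned long start_pfn, unsigned long end_pfn)
{
	long nr;

	for (nr = (long)((end_pfn - 1) / PAGES_PER_SECTION);
	     nr >= (long)(start_pfn / PAGES_PER_SECTION); nr--) {
		unsigned long pfn = (unsigned long)nr * PAGES_PER_SECTION;
		if (valid_section_pfn(pfn))
			return pfn;
	}
	return 0;
}

int main(void)
{
	unsigned long end = NR_SECTIONS * PAGES_PER_SECTION;

	printf("smallest valid section: %lu\n", find_smallest(0, end) / PAGES_PER_SECTION); /* 2 */
	printf("biggest valid section:  %lu\n", find_biggest(0, end) / PAGES_PER_SECTION);  /* 5 */
	return 0;
}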
@@ -428,12 +650,13 @@ static int __remove_section(struct zone *zone, struct mem_section *ms)
428 if (ret) 650 if (ret)
429 return ret; 651 return ret;
430 652
431 pgdat_resize_lock(pgdat, &flags); 653 scn_nr = __section_nr(ms);
654 start_pfn = section_nr_to_pfn(scn_nr);
655 __remove_zone(zone, start_pfn);
656
432 sparse_remove_one_section(zone, ms); 657 sparse_remove_one_section(zone, ms);
433 pgdat_resize_unlock(pgdat, &flags);
434 return 0; 658 return 0;
435} 659}
436#endif
437 660
438/* 661/*
439 * Reasonably generic function for adding memory. It is 662 * Reasonably generic function for adding memory. It is
@@ -797,11 +1020,14 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start)
797 unsigned long zholes_size[MAX_NR_ZONES] = {0}; 1020 unsigned long zholes_size[MAX_NR_ZONES] = {0};
798 unsigned long start_pfn = start >> PAGE_SHIFT; 1021 unsigned long start_pfn = start >> PAGE_SHIFT;
799 1022
800 pgdat = arch_alloc_nodedata(nid); 1023 pgdat = NODE_DATA(nid);
801 if (!pgdat) 1024 if (!pgdat) {
802 return NULL; 1025 pgdat = arch_alloc_nodedata(nid);
1026 if (!pgdat)
1027 return NULL;
803 1028
804 arch_refresh_nodedata(nid, pgdat); 1029 arch_refresh_nodedata(nid, pgdat);
1030 }
805 1031
806 /* we can use NODE_DATA(nid) from here */ 1032 /* we can use NODE_DATA(nid) from here */
807 1033
@@ -854,7 +1080,8 @@ out:
854int __ref add_memory(int nid, u64 start, u64 size) 1080int __ref add_memory(int nid, u64 start, u64 size)
855{ 1081{
856 pg_data_t *pgdat = NULL; 1082 pg_data_t *pgdat = NULL;
857 int new_pgdat = 0; 1083 bool new_pgdat;
1084 bool new_node;
858 struct resource *res; 1085 struct resource *res;
859 int ret; 1086 int ret;
860 1087
@@ -865,12 +1092,16 @@ int __ref add_memory(int nid, u64 start, u64 size)
865 if (!res) 1092 if (!res)
866 goto out; 1093 goto out;
867 1094
868 if (!node_online(nid)) { 1095 { /* Stupid hack to suppress address-never-null warning */
1096 void *p = NODE_DATA(nid);
1097 new_pgdat = !p;
1098 }
1099 new_node = !node_online(nid);
1100 if (new_node) {
869 pgdat = hotadd_new_pgdat(nid, start); 1101 pgdat = hotadd_new_pgdat(nid, start);
870 ret = -ENOMEM; 1102 ret = -ENOMEM;
871 if (!pgdat) 1103 if (!pgdat)
872 goto error; 1104 goto error;
873 new_pgdat = 1;
874 } 1105 }
875 1106
876 /* call arch's memory hotadd */ 1107 /* call arch's memory hotadd */
@@ -882,7 +1113,7 @@ int __ref add_memory(int nid, u64 start, u64 size)
882 /* we online node here. we can't roll back from here. */ 1113 /* we online node here. we can't roll back from here. */
883 node_set_online(nid); 1114 node_set_online(nid);
884 1115
885 if (new_pgdat) { 1116 if (new_node) {
886 ret = register_one_node(nid); 1117 ret = register_one_node(nid);
887 /* 1118 /*
888 * If sysfs file of new node can't create, cpu on the node 1119 * If sysfs file of new node can't create, cpu on the node
@@ -901,8 +1132,7 @@ error:
901 /* rollback pgdat allocation and others */ 1132 /* rollback pgdat allocation and others */
902 if (new_pgdat) 1133 if (new_pgdat)
903 rollback_node_hotadd(nid, pgdat); 1134 rollback_node_hotadd(nid, pgdat);
904 if (res) 1135 release_memory_resource(res);
905 release_memory_resource(res);
906 1136
907out: 1137out:
908 unlock_memory_hotplug(); 1138 unlock_memory_hotplug();
@@ -1058,8 +1288,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
1058 * migrate_pages returns # of failed pages. 1288 * migrate_pages returns # of failed pages.
1059 */ 1289 */
1060 ret = migrate_pages(&source, alloc_migrate_target, 0, 1290 ret = migrate_pages(&source, alloc_migrate_target, 0,
1061 true, MIGRATE_SYNC, 1291 MIGRATE_SYNC, MR_MEMORY_HOTPLUG);
1062 MR_MEMORY_HOTPLUG);
1063 if (ret) 1292 if (ret)
1064 putback_lru_pages(&source); 1293 putback_lru_pages(&source);
1065 } 1294 }
@@ -1381,17 +1610,26 @@ int offline_pages(unsigned long start_pfn, unsigned long nr_pages)
1381 return __offline_pages(start_pfn, start_pfn + nr_pages, 120 * HZ); 1610 return __offline_pages(start_pfn, start_pfn + nr_pages, 120 * HZ);
1382} 1611}
1383 1612
1384int remove_memory(u64 start, u64 size) 1613/**
1614 * walk_memory_range - walks through all mem sections in [start_pfn, end_pfn)
1615 * @start_pfn: start pfn of the memory range
 1616 * @end_pfn: end pfn of the memory range
1617 * @arg: argument passed to func
1618 * @func: callback for each memory section walked
1619 *
 1620 * This function walks through all present mem sections in the range
 1621 * [start_pfn, end_pfn) and calls func on each mem section.
1622 *
1623 * Returns the return value of func.
1624 */
1625static int walk_memory_range(unsigned long start_pfn, unsigned long end_pfn,
1626 void *arg, int (*func)(struct memory_block *, void *))
1385{ 1627{
1386 struct memory_block *mem = NULL; 1628 struct memory_block *mem = NULL;
1387 struct mem_section *section; 1629 struct mem_section *section;
1388 unsigned long start_pfn, end_pfn;
1389 unsigned long pfn, section_nr; 1630 unsigned long pfn, section_nr;
1390 int ret; 1631 int ret;
1391 1632
1392 start_pfn = PFN_DOWN(start);
1393 end_pfn = start_pfn + PFN_DOWN(size);
1394
1395 for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) { 1633 for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
1396 section_nr = pfn_to_section_nr(pfn); 1634 section_nr = pfn_to_section_nr(pfn);
1397 if (!present_section_nr(section_nr)) 1635 if (!present_section_nr(section_nr))
@@ -1408,7 +1646,7 @@ int remove_memory(u64 start, u64 size)
1408 if (!mem) 1646 if (!mem)
1409 continue; 1647 continue;
1410 1648
1411 ret = offline_memory_block(mem); 1649 ret = func(mem, arg);
1412 if (ret) { 1650 if (ret) {
1413 kobject_put(&mem->dev.kobj); 1651 kobject_put(&mem->dev.kobj);
1414 return ret; 1652 return ret;
@@ -1420,12 +1658,209 @@ int remove_memory(u64 start, u64 size)
1420 1658
1421 return 0; 1659 return 0;
1422} 1660}
1661
1662/**
1663 * offline_memory_block_cb - callback function for offlining memory block
1664 * @mem: the memory block to be offlined
1665 * @arg: buffer to hold error msg
1666 *
1667 * Always return 0, and put the error msg in arg if any.
1668 */
1669static int offline_memory_block_cb(struct memory_block *mem, void *arg)
1670{
1671 int *ret = arg;
1672 int error = offline_memory_block(mem);
1673
1674 if (error != 0 && *ret == 0)
1675 *ret = error;
1676
1677 return 0;
1678}
1679
1680static int is_memblock_offlined_cb(struct memory_block *mem, void *arg)
1681{
1682 int ret = !is_memblock_offlined(mem);
1683
1684 if (unlikely(ret))
1685 pr_warn("removing memory fails, because memory "
1686 "[%#010llx-%#010llx] is onlined\n",
1687 PFN_PHYS(section_nr_to_pfn(mem->start_section_nr)),
1688 PFN_PHYS(section_nr_to_pfn(mem->end_section_nr + 1))-1);
1689
1690 return ret;
1691}
1692
1693static int check_cpu_on_node(void *data)
1694{
1695 struct pglist_data *pgdat = data;
1696 int cpu;
1697
1698 for_each_present_cpu(cpu) {
1699 if (cpu_to_node(cpu) == pgdat->node_id)
1700 /*
1701 * the cpu on this node isn't removed, and we can't
1702 * offline this node.
1703 */
1704 return -EBUSY;
1705 }
1706
1707 return 0;
1708}
1709
1710static void unmap_cpu_on_node(void *data)
1711{
1712#ifdef CONFIG_ACPI_NUMA
1713 struct pglist_data *pgdat = data;
1714 int cpu;
1715
1716 for_each_possible_cpu(cpu)
1717 if (cpu_to_node(cpu) == pgdat->node_id)
1718 numa_clear_node(cpu);
1719#endif
1720}
1721
1722static int check_and_unmap_cpu_on_node(void *data)
1723{
1724 int ret = check_cpu_on_node(data);
1725
1726 if (ret)
1727 return ret;
1728
1729 /*
1730 * the node will be offlined when we come here, so we can clear
1731 * the cpu_to_node() now.
1732 */
1733
1734 unmap_cpu_on_node(data);
1735 return 0;
1736}
1737
1738/* offline the node if all memory sections of this node are removed */
1739void try_offline_node(int nid)
1740{
1741 pg_data_t *pgdat = NODE_DATA(nid);
1742 unsigned long start_pfn = pgdat->node_start_pfn;
1743 unsigned long end_pfn = start_pfn + pgdat->node_spanned_pages;
1744 unsigned long pfn;
1745 struct page *pgdat_page = virt_to_page(pgdat);
1746 int i;
1747
1748 for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
1749 unsigned long section_nr = pfn_to_section_nr(pfn);
1750
1751 if (!present_section_nr(section_nr))
1752 continue;
1753
1754 if (pfn_to_nid(pfn) != nid)
1755 continue;
1756
1757 /*
1758 * some memory sections of this node are not removed, and we
1759 * can't offline node now.
1760 */
1761 return;
1762 }
1763
1764 if (stop_machine(check_and_unmap_cpu_on_node, pgdat, NULL))
1765 return;
1766
1767 /*
1768 * all memory/cpu of this node are removed, we can offline this
1769 * node now.
1770 */
1771 node_set_offline(nid);
1772 unregister_one_node(nid);
1773
1774 if (!PageSlab(pgdat_page) && !PageCompound(pgdat_page))
1775 /* node data is allocated from boot memory */
1776 return;
1777
1778 /* free waittable in each zone */
1779 for (i = 0; i < MAX_NR_ZONES; i++) {
1780 struct zone *zone = pgdat->node_zones + i;
1781
1782 if (zone->wait_table)
1783 vfree(zone->wait_table);
1784 }
1785
1786 /*
 1787 * Since there is no way to guarantee the address of pgdat/zone is not
 1788 * on the stack of any kernel threads or used by other kernel objects
 1789 * without reference counting or another synchronizing method, do not
1790 * reset node_data and free pgdat here. Just reset it to 0 and reuse
1791 * the memory when the node is online again.
1792 */
1793 memset(pgdat, 0, sizeof(*pgdat));
1794}
1795EXPORT_SYMBOL(try_offline_node);
1796
1797int __ref remove_memory(int nid, u64 start, u64 size)
1798{
1799 unsigned long start_pfn, end_pfn;
1800 int ret = 0;
1801 int retry = 1;
1802
1803 start_pfn = PFN_DOWN(start);
1804 end_pfn = start_pfn + PFN_DOWN(size);
1805
1806 /*
1807 * When CONFIG_MEMCG is on, one memory block may be used by other
1808 * blocks to store page cgroup when onlining pages. But we don't know
1809 * in what order pages are onlined. So we iterate twice to offline
1810 * memory:
1811 * 1st iterate: offline every non primary memory block.
1812 * 2nd iterate: offline primary (i.e. first added) memory block.
1813 */
1814repeat:
1815 walk_memory_range(start_pfn, end_pfn, &ret,
1816 offline_memory_block_cb);
1817 if (ret) {
1818 if (!retry)
1819 return ret;
1820
1821 retry = 0;
1822 ret = 0;
1823 goto repeat;
1824 }
1825
1826 lock_memory_hotplug();
1827
1828 /*
1829 * we have offlined all memory blocks like this:
1830 * 1. lock memory hotplug
1831 * 2. offline a memory block
1832 * 3. unlock memory hotplug
1833 *
 1834 * repeat steps 1-3 for each memory block. All memory blocks
1835 * must be offlined before removing memory. But we don't hold the
1836 * lock in the whole operation. So we should check whether all
1837 * memory blocks are offlined.
1838 */
1839
1840 ret = walk_memory_range(start_pfn, end_pfn, NULL,
1841 is_memblock_offlined_cb);
1842 if (ret) {
1843 unlock_memory_hotplug();
1844 return ret;
1845 }
1846
1847 /* remove memmap entry */
1848 firmware_map_remove(start, start + size, "System RAM");
1849
1850 arch_remove_memory(start, size);
1851
1852 try_offline_node(nid);
1853
1854 unlock_memory_hotplug();
1855
1856 return 0;
1857}
1423#else 1858#else
1424int offline_pages(unsigned long start_pfn, unsigned long nr_pages) 1859int offline_pages(unsigned long start_pfn, unsigned long nr_pages)
1425{ 1860{
1426 return -EINVAL; 1861 return -EINVAL;
1427} 1862}
1428int remove_memory(u64 start, u64 size) 1863int remove_memory(int nid, u64 start, u64 size)
1429{ 1864{
1430 return -EINVAL; 1865 return -EINVAL;
1431} 1866}
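remove_memory() above drives everything through walk_memory_range(), which steps the range one memory section at a time and hands each present section's memory block to a callback; offline_memory_block_cb() records the first error but keeps walking, while is_memblock_offlined_cb() aborts the walk as soon as it meets a block that is still online. A compact userspace sketch of that walk-and-callback shape (the block array, the one-block-per-section mapping, and the PAGES_PER_SECTION value are simplifying stand-ins):

#include <stdio.h>

#define PAGES_PER_SECTION 32768UL   /* illustrative value only */
#define NR_BLOCKS 4

struct memory_block { int id; int offlined; };

/* walk [start_pfn, end_pfn) one section at a time, handing each block to
 * func; the first nonzero callback result aborts the walk, as in
 * walk_memory_range() */
static int walk_range(unsigned long start_pfn, unsigned long end_pfn,
		      struct memory_block *blocks, void *arg,
		      int (*func)(struct memory_block *, void *))
{
	unsigned long pfn;

	for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
		int ret = func(&blocks[pfn / PAGES_PER_SECTION], arg);
		if (ret)
			return ret;
	}
	return 0;
}

/* mirrors is_memblock_offlined_cb(): nonzero means "still online, give up" */
static int check_offlined_cb(struct memory_block *mem, void *arg)
{
	(void)arg;
	if (!mem->offlined) {
		printf("memory block %d is still online\n", mem->id);
		return 1;
	}
	return 0;
}

int main(void)
{
	struct memory_block blocks[NR_BLOCKS] = { {0, 1}, {1, 1}, {2, 0}, {3, 1} };
	int busy = walk_range(0, NR_BLOCKS * PAGES_PER_SECTION, blocks, NULL,
			      check_offlined_cb);

	printf("%s\n", busy ? "abort removal" : "safe to remove");
	return 0;
}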
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index d1b315e98627..31d26637b658 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -26,7 +26,7 @@
26 * the allocation to memory nodes instead 26 * the allocation to memory nodes instead
27 * 27 *
28 * preferred Try a specific node first before normal fallback. 28 * preferred Try a specific node first before normal fallback.
29 * As a special case node -1 here means do the allocation 29 * As a special case NUMA_NO_NODE here means do the allocation
30 * on the local CPU. This is normally identical to default, 30 * on the local CPU. This is normally identical to default,
31 * but useful to set in a VMA when you have a non default 31 * but useful to set in a VMA when you have a non default
32 * process policy. 32 * process policy.
@@ -127,7 +127,7 @@ static struct mempolicy *get_task_policy(struct task_struct *p)
127 127
128 if (!pol) { 128 if (!pol) {
129 node = numa_node_id(); 129 node = numa_node_id();
130 if (node != -1) 130 if (node != NUMA_NO_NODE)
131 pol = &preferred_node_policy[node]; 131 pol = &preferred_node_policy[node];
132 132
133 /* preferred_node_policy is not initialised early in boot */ 133 /* preferred_node_policy is not initialised early in boot */
@@ -161,19 +161,7 @@ static const struct mempolicy_operations {
161/* Check that the nodemask contains at least one populated zone */ 161/* Check that the nodemask contains at least one populated zone */
162static int is_valid_nodemask(const nodemask_t *nodemask) 162static int is_valid_nodemask(const nodemask_t *nodemask)
163{ 163{
164 int nd, k; 164 return nodes_intersects(*nodemask, node_states[N_MEMORY]);
165
166 for_each_node_mask(nd, *nodemask) {
167 struct zone *z;
168
169 for (k = 0; k <= policy_zone; k++) {
170 z = &NODE_DATA(nd)->node_zones[k];
171 if (z->present_pages > 0)
172 return 1;
173 }
174 }
175
176 return 0;
177} 165}
178 166
179static inline int mpol_store_user_nodemask(const struct mempolicy *pol) 167static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
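The rewritten is_valid_nodemask() is a single intersection test against node_states[N_MEMORY] instead of a walk over every zone of every requested node. A toy model of that bit-level intersection (the mask type is a simplified stand-in for nodemask_t, and the bit layouts are made up for the example):

#include <stdio.h>
#include <stdbool.h>

#define NODE_WORDS 2                /* enough bits for the example's node mask */

struct nodemask_model { unsigned long bits[NODE_WORDS]; };

/* the spirit of nodes_intersects(): is any node set in both masks? */
static bool masks_intersect(const struct nodemask_model *a,
			    const struct nodemask_model *b)
{
	int i;

	for (i = 0; i < NODE_WORDS; i++)
		if (a->bits[i] & b->bits[i])
			return true;
	return false;
}

int main(void)
{
	struct nodemask_model requested  = { { 0x5, 0 } };  /* nodes 0 and 2 */
	struct nodemask_model has_memory = { { 0x2, 0 } };  /* only node 1   */

	printf("%s\n", masks_intersect(&requested, &has_memory)
	       ? "valid nodemask" : "invalid nodemask");    /* invalid nodemask */
	return 0;
}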
@@ -270,7 +258,7 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
270 struct mempolicy *policy; 258 struct mempolicy *policy;
271 259
272 pr_debug("setting mode %d flags %d nodes[0] %lx\n", 260 pr_debug("setting mode %d flags %d nodes[0] %lx\n",
273 mode, flags, nodes ? nodes_addr(*nodes)[0] : -1); 261 mode, flags, nodes ? nodes_addr(*nodes)[0] : NUMA_NO_NODE);
274 262
275 if (mode == MPOL_DEFAULT) { 263 if (mode == MPOL_DEFAULT) {
276 if (nodes && !nodes_empty(*nodes)) 264 if (nodes && !nodes_empty(*nodes))
@@ -508,9 +496,8 @@ static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
508 /* 496 /*
509 * vm_normal_page() filters out zero pages, but there might 497 * vm_normal_page() filters out zero pages, but there might
510 * still be PageReserved pages to skip, perhaps in a VDSO. 498 * still be PageReserved pages to skip, perhaps in a VDSO.
511 * And we cannot move PageKsm pages sensibly or safely yet.
512 */ 499 */
513 if (PageReserved(page) || PageKsm(page)) 500 if (PageReserved(page))
514 continue; 501 continue;
515 nid = page_to_nid(page); 502 nid = page_to_nid(page);
516 if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT)) 503 if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
@@ -1027,8 +1014,7 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest,
1027 1014
1028 if (!list_empty(&pagelist)) { 1015 if (!list_empty(&pagelist)) {
1029 err = migrate_pages(&pagelist, new_node_page, dest, 1016 err = migrate_pages(&pagelist, new_node_page, dest,
1030 false, MIGRATE_SYNC, 1017 MIGRATE_SYNC, MR_SYSCALL);
1031 MR_SYSCALL);
1032 if (err) 1018 if (err)
1033 putback_lru_pages(&pagelist); 1019 putback_lru_pages(&pagelist);
1034 } 1020 }
@@ -1235,7 +1221,7 @@ static long do_mbind(unsigned long start, unsigned long len,
1235 1221
1236 pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n", 1222 pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n",
1237 start, start + len, mode, mode_flags, 1223 start, start + len, mode, mode_flags,
1238 nmask ? nodes_addr(*nmask)[0] : -1); 1224 nmask ? nodes_addr(*nmask)[0] : NUMA_NO_NODE);
1239 1225
1240 if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) { 1226 if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
1241 1227
@@ -1272,9 +1258,8 @@ static long do_mbind(unsigned long start, unsigned long len,
1272 if (!list_empty(&pagelist)) { 1258 if (!list_empty(&pagelist)) {
1273 WARN_ON_ONCE(flags & MPOL_MF_LAZY); 1259 WARN_ON_ONCE(flags & MPOL_MF_LAZY);
1274 nr_failed = migrate_pages(&pagelist, new_vma_page, 1260 nr_failed = migrate_pages(&pagelist, new_vma_page,
1275 (unsigned long)vma, 1261 (unsigned long)vma,
1276 false, MIGRATE_SYNC, 1262 MIGRATE_SYNC, MR_MEMPOLICY_MBIND);
1277 MR_MEMPOLICY_MBIND);
1278 if (nr_failed) 1263 if (nr_failed)
1279 putback_lru_pages(&pagelist); 1264 putback_lru_pages(&pagelist);
1280 } 1265 }
@@ -1644,6 +1629,26 @@ struct mempolicy *get_vma_policy(struct task_struct *task,
1644 return pol; 1629 return pol;
1645} 1630}
1646 1631
1632static int apply_policy_zone(struct mempolicy *policy, enum zone_type zone)
1633{
1634 enum zone_type dynamic_policy_zone = policy_zone;
1635
1636 BUG_ON(dynamic_policy_zone == ZONE_MOVABLE);
1637
1638 /*
1639 * if policy->v.nodes has movable memory only,
1640 * we apply policy when gfp_zone(gfp) = ZONE_MOVABLE only.
1641 *
1642 * policy->v.nodes is intersect with node_states[N_MEMORY].
1643 * so if the following test faile, it implies
1644 * policy->v.nodes has movable memory only.
1645 */
1646 if (!nodes_intersects(policy->v.nodes, node_states[N_HIGH_MEMORY]))
1647 dynamic_policy_zone = ZONE_MOVABLE;
1648
1649 return zone >= dynamic_policy_zone;
1650}
1651
1647/* 1652/*
1648 * Return a nodemask representing a mempolicy for filtering nodes for 1653 * Return a nodemask representing a mempolicy for filtering nodes for
1649 * page allocation 1654 * page allocation
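apply_policy_zone(), added just above, decides whether an MPOL_BIND nodemask should constrain a given allocation: if the policy's nodes contain nothing but movable memory, only ZONE_MOVABLE allocations honour the mask. A simplified model of that comparison (the trimmed zone enum and the boolean input are stand-ins for policy_zone and the nodes_intersects() test):

#include <stdio.h>
#include <stdbool.h>

/* trimmed zone ordering, standing in for enum zone_type */
enum zone { ZONE_DMA, ZONE_NORMAL, ZONE_MOVABLE };

/* If the policy's nodes hold nothing but movable memory, only allocations
 * that may use ZONE_MOVABLE should be constrained by the MPOL_BIND mask. */
static bool apply_policy_zone_model(bool policy_has_regular_memory,
				    enum zone gfp_zone)
{
	enum zone dynamic_policy_zone =
		policy_has_regular_memory ? ZONE_NORMAL : ZONE_MOVABLE;

	return gfp_zone >= dynamic_policy_zone;
}

int main(void)
{
	/* movable-only policy: a ZONE_NORMAL allocation ignores the nodemask */
	printf("%d\n", apply_policy_zone_model(false, ZONE_NORMAL));    /* 0 */
	/* ...but a ZONE_MOVABLE allocation still honours it */
	printf("%d\n", apply_policy_zone_model(false, ZONE_MOVABLE));   /* 1 */
	/* policy with regular memory: applied as before */
	printf("%d\n", apply_policy_zone_model(true, ZONE_NORMAL));     /* 1 */
	return 0;
}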
@@ -1652,7 +1657,7 @@ static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
1652{ 1657{
1653 /* Lower zones don't get a nodemask applied for MPOL_BIND */ 1658 /* Lower zones don't get a nodemask applied for MPOL_BIND */
1654 if (unlikely(policy->mode == MPOL_BIND) && 1659 if (unlikely(policy->mode == MPOL_BIND) &&
1655 gfp_zone(gfp) >= policy_zone && 1660 apply_policy_zone(policy, gfp_zone(gfp)) &&
1656 cpuset_nodemask_valid_mems_allowed(&policy->v.nodes)) 1661 cpuset_nodemask_valid_mems_allowed(&policy->v.nodes))
1657 return &policy->v.nodes; 1662 return &policy->v.nodes;
1658 1663
@@ -2132,7 +2137,7 @@ bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
2132 */ 2137 */
2133 2138
2134/* lookup first element intersecting start-end */ 2139/* lookup first element intersecting start-end */
2135/* Caller holds sp->mutex */ 2140/* Caller holds sp->lock */
2136static struct sp_node * 2141static struct sp_node *
2137sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end) 2142sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
2138{ 2143{
@@ -2196,13 +2201,13 @@ mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
2196 2201
2197 if (!sp->root.rb_node) 2202 if (!sp->root.rb_node)
2198 return NULL; 2203 return NULL;
2199 mutex_lock(&sp->mutex); 2204 spin_lock(&sp->lock);
2200 sn = sp_lookup(sp, idx, idx+1); 2205 sn = sp_lookup(sp, idx, idx+1);
2201 if (sn) { 2206 if (sn) {
2202 mpol_get(sn->policy); 2207 mpol_get(sn->policy);
2203 pol = sn->policy; 2208 pol = sn->policy;
2204 } 2209 }
2205 mutex_unlock(&sp->mutex); 2210 spin_unlock(&sp->lock);
2206 return pol; 2211 return pol;
2207} 2212}
2208 2213
@@ -2308,7 +2313,7 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long
2308 * it less likely we act on an unlikely task<->page 2313 * it less likely we act on an unlikely task<->page
2309 * relation. 2314 * relation.
2310 */ 2315 */
2311 last_nid = page_xchg_last_nid(page, polnid); 2316 last_nid = page_nid_xchg_last(page, polnid);
2312 if (last_nid != polnid) 2317 if (last_nid != polnid)
2313 goto out; 2318 goto out;
2314 } 2319 }
@@ -2328,6 +2333,14 @@ static void sp_delete(struct shared_policy *sp, struct sp_node *n)
2328 sp_free(n); 2333 sp_free(n);
2329} 2334}
2330 2335
2336static void sp_node_init(struct sp_node *node, unsigned long start,
2337 unsigned long end, struct mempolicy *pol)
2338{
2339 node->start = start;
2340 node->end = end;
2341 node->policy = pol;
2342}
2343
2331static struct sp_node *sp_alloc(unsigned long start, unsigned long end, 2344static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
2332 struct mempolicy *pol) 2345 struct mempolicy *pol)
2333{ 2346{
@@ -2344,10 +2357,7 @@ static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
2344 return NULL; 2357 return NULL;
2345 } 2358 }
2346 newpol->flags |= MPOL_F_SHARED; 2359 newpol->flags |= MPOL_F_SHARED;
2347 2360 sp_node_init(n, start, end, newpol);
2348 n->start = start;
2349 n->end = end;
2350 n->policy = newpol;
2351 2361
2352 return n; 2362 return n;
2353} 2363}
@@ -2357,9 +2367,12 @@ static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
2357 unsigned long end, struct sp_node *new) 2367 unsigned long end, struct sp_node *new)
2358{ 2368{
2359 struct sp_node *n; 2369 struct sp_node *n;
2370 struct sp_node *n_new = NULL;
2371 struct mempolicy *mpol_new = NULL;
2360 int ret = 0; 2372 int ret = 0;
2361 2373
2362 mutex_lock(&sp->mutex); 2374restart:
2375 spin_lock(&sp->lock);
2363 n = sp_lookup(sp, start, end); 2376 n = sp_lookup(sp, start, end);
2364 /* Take care of old policies in the same range. */ 2377 /* Take care of old policies in the same range. */
2365 while (n && n->start < end) { 2378 while (n && n->start < end) {
@@ -2372,14 +2385,16 @@ static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
2372 } else { 2385 } else {
2373 /* Old policy spanning whole new range. */ 2386 /* Old policy spanning whole new range. */
2374 if (n->end > end) { 2387 if (n->end > end) {
2375 struct sp_node *new2; 2388 if (!n_new)
2376 new2 = sp_alloc(end, n->end, n->policy); 2389 goto alloc_new;
2377 if (!new2) { 2390
2378 ret = -ENOMEM; 2391 *mpol_new = *n->policy;
2379 goto out; 2392 atomic_set(&mpol_new->refcnt, 1);
2380 } 2393 sp_node_init(n_new, n->end, end, mpol_new);
2394 sp_insert(sp, n_new);
2381 n->end = start; 2395 n->end = start;
2382 sp_insert(sp, new2); 2396 n_new = NULL;
2397 mpol_new = NULL;
2383 break; 2398 break;
2384 } else 2399 } else
2385 n->end = start; 2400 n->end = start;
@@ -2390,9 +2405,27 @@ static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
2390 } 2405 }
2391 if (new) 2406 if (new)
2392 sp_insert(sp, new); 2407 sp_insert(sp, new);
2393out: 2408 spin_unlock(&sp->lock);
2394 mutex_unlock(&sp->mutex); 2409 ret = 0;
2410
2411err_out:
2412 if (mpol_new)
2413 mpol_put(mpol_new);
2414 if (n_new)
2415 kmem_cache_free(sn_cache, n_new);
2416
2395 return ret; 2417 return ret;
2418
2419alloc_new:
2420 spin_unlock(&sp->lock);
2421 ret = -ENOMEM;
2422 n_new = kmem_cache_alloc(sn_cache, GFP_KERNEL);
2423 if (!n_new)
2424 goto err_out;
2425 mpol_new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
2426 if (!mpol_new)
2427 goto err_out;
2428 goto restart;
2396} 2429}
2397 2430
2398/** 2431/**
@@ -2410,7 +2443,7 @@ void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
2410 int ret; 2443 int ret;
2411 2444
2412 sp->root = RB_ROOT; /* empty tree == default mempolicy */ 2445 sp->root = RB_ROOT; /* empty tree == default mempolicy */
2413 mutex_init(&sp->mutex); 2446 spin_lock_init(&sp->lock);
2414 2447
2415 if (mpol) { 2448 if (mpol) {
2416 struct vm_area_struct pvma; 2449 struct vm_area_struct pvma;
@@ -2455,7 +2488,7 @@ int mpol_set_shared_policy(struct shared_policy *info,
2455 vma->vm_pgoff, 2488 vma->vm_pgoff,
2456 sz, npol ? npol->mode : -1, 2489 sz, npol ? npol->mode : -1,
2457 npol ? npol->flags : -1, 2490 npol ? npol->flags : -1,
2458 npol ? nodes_addr(npol->v.nodes)[0] : -1); 2491 npol ? nodes_addr(npol->v.nodes)[0] : NUMA_NO_NODE);
2459 2492
2460 if (npol) { 2493 if (npol) {
2461 new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol); 2494 new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
@@ -2476,14 +2509,14 @@ void mpol_free_shared_policy(struct shared_policy *p)
2476 2509
2477 if (!p->root.rb_node) 2510 if (!p->root.rb_node)
2478 return; 2511 return;
2479 mutex_lock(&p->mutex); 2512 spin_lock(&p->lock);
2480 next = rb_first(&p->root); 2513 next = rb_first(&p->root);
2481 while (next) { 2514 while (next) {
2482 n = rb_entry(next, struct sp_node, nd); 2515 n = rb_entry(next, struct sp_node, nd);
2483 next = rb_next(&n->nd); 2516 next = rb_next(&n->nd);
2484 sp_delete(p, n); 2517 sp_delete(p, n);
2485 } 2518 }
2486 mutex_unlock(&p->mutex); 2519 spin_unlock(&p->lock);
2487} 2520}
2488 2521
2489#ifdef CONFIG_NUMA_BALANCING 2522#ifdef CONFIG_NUMA_BALANCING
@@ -2595,8 +2628,7 @@ void numa_default_policy(void)
2595 */ 2628 */
2596 2629
2597/* 2630/*
2598 * "local" is pseudo-policy: MPOL_PREFERRED with MPOL_F_LOCAL flag 2631 * "local" is implemented internally by MPOL_PREFERRED with MPOL_F_LOCAL flag.
2599 * Used only for mpol_parse_str() and mpol_to_str()
2600 */ 2632 */
2601static const char * const policy_modes[] = 2633static const char * const policy_modes[] =
2602{ 2634{
@@ -2610,28 +2642,20 @@ static const char * const policy_modes[] =
2610 2642
2611#ifdef CONFIG_TMPFS 2643#ifdef CONFIG_TMPFS
2612/** 2644/**
2613 * mpol_parse_str - parse string to mempolicy 2645 * mpol_parse_str - parse string to mempolicy, for tmpfs mpol mount option.
2614 * @str: string containing mempolicy to parse 2646 * @str: string containing mempolicy to parse
2615 * @mpol: pointer to struct mempolicy pointer, returned on success. 2647 * @mpol: pointer to struct mempolicy pointer, returned on success.
2616 * @no_context: flag whether to "contextualize" the mempolicy
2617 * 2648 *
2618 * Format of input: 2649 * Format of input:
2619 * <mode>[=<flags>][:<nodelist>] 2650 * <mode>[=<flags>][:<nodelist>]
2620 * 2651 *
2621 * if @no_context is true, save the input nodemask in w.user_nodemask in
2622 * the returned mempolicy. This will be used to "clone" the mempolicy in
2623 * a specific context [cpuset] at a later time. Used to parse tmpfs mpol
2624 * mount option. Note that if 'static' or 'relative' mode flags were
2625 * specified, the input nodemask will already have been saved. Saving
2626 * it again is redundant, but safe.
2627 *
2628 * On success, returns 0, else 1 2652 * On success, returns 0, else 1
2629 */ 2653 */
2630int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context) 2654int mpol_parse_str(char *str, struct mempolicy **mpol)
2631{ 2655{
2632 struct mempolicy *new = NULL; 2656 struct mempolicy *new = NULL;
2633 unsigned short mode; 2657 unsigned short mode;
2634 unsigned short uninitialized_var(mode_flags); 2658 unsigned short mode_flags;
2635 nodemask_t nodes; 2659 nodemask_t nodes;
2636 char *nodelist = strchr(str, ':'); 2660 char *nodelist = strchr(str, ':');
2637 char *flags = strchr(str, '='); 2661 char *flags = strchr(str, '=');
@@ -2719,24 +2743,23 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
2719 if (IS_ERR(new)) 2743 if (IS_ERR(new))
2720 goto out; 2744 goto out;
2721 2745
2722 if (no_context) { 2746 /*
2723 /* save for contextualization */ 2747 * Save nodes for mpol_to_str() to show the tmpfs mount options
2724 new->w.user_nodemask = nodes; 2748 * for /proc/mounts, /proc/pid/mounts and /proc/pid/mountinfo.
2725 } else { 2749 */
2726 int ret; 2750 if (mode != MPOL_PREFERRED)
2727 NODEMASK_SCRATCH(scratch); 2751 new->v.nodes = nodes;
2728 if (scratch) { 2752 else if (nodelist)
2729 task_lock(current); 2753 new->v.preferred_node = first_node(nodes);
2730 ret = mpol_set_nodemask(new, &nodes, scratch); 2754 else
2731 task_unlock(current); 2755 new->flags |= MPOL_F_LOCAL;
2732 } else 2756
2733 ret = -ENOMEM; 2757 /*
2734 NODEMASK_SCRATCH_FREE(scratch); 2758 * Save nodes for contextualization: this will be used to "clone"
2735 if (ret) { 2759 * the mempolicy in a specific context [cpuset] at a later time.
2736 mpol_put(new); 2760 */
2737 goto out; 2761 new->w.user_nodemask = nodes;
2738 } 2762
2739 }
2740 err = 0; 2763 err = 0;
2741 2764
2742out: 2765out:
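mpol_parse_str() documents its input as <mode>[=<flags>][:<nodelist>] and peels the optional parts off with strchr() before matching the mode against policy_modes[]. A standalone sketch of just that splitting step; the mode table here is a trimmed copy kept only for the lookup, and error handling is reduced to the same 0/1 convention:

#include <stdio.h>
#include <string.h>

/* trimmed copy of the documented mode names, kept only for the lookup */
static const char * const modes[] = {
	"default", "prefer", "bind", "interleave", "local"
};

/* split "<mode>[=<flags>][:<nodelist>]" in place; 0 on success, 1 otherwise */
static int parse_mpol(char *str, const char **mode,
		      const char **flags, const char **nodelist)
{
	char *colon = strchr(str, ':');
	char *eq = strchr(str, '=');
	size_t i;

	*flags = *nodelist = NULL;
	if (colon) {
		*colon = '\0';
		*nodelist = colon + 1;          /* the "<nodelist>" part */
	}
	if (eq && (!colon || eq < colon)) {
		*eq = '\0';
		*flags = eq + 1;                /* the "<flags>" part    */
	}
	*mode = str;
	for (i = 0; i < sizeof(modes) / sizeof(modes[0]); i++)
		if (!strcmp(str, modes[i]))
			return 0;
	return 1;
}

int main(void)
{
	char input[] = "bind=static:0-3";
	const char *mode, *flags, *nodelist;
	int err = parse_mpol(input, &mode, &flags, &nodelist);

	printf("err=%d mode=%s flags=%s nodes=%s\n", err, mode,
	       flags ? flags : "-", nodelist ? nodelist : "-");
	return 0;
}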
@@ -2756,13 +2779,12 @@ out:
2756 * @buffer: to contain formatted mempolicy string 2779 * @buffer: to contain formatted mempolicy string
2757 * @maxlen: length of @buffer 2780 * @maxlen: length of @buffer
2758 * @pol: pointer to mempolicy to be formatted 2781 * @pol: pointer to mempolicy to be formatted
2759 * @no_context: "context free" mempolicy - use nodemask in w.user_nodemask
2760 * 2782 *
2761 * Convert a mempolicy into a string. 2783 * Convert a mempolicy into a string.
2762 * Returns the number of characters in buffer (if positive) 2784 * Returns the number of characters in buffer (if positive)
2763 * or an error (negative) 2785 * or an error (negative)
2764 */ 2786 */
2765int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol, int no_context) 2787int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
2766{ 2788{
2767 char *p = buffer; 2789 char *p = buffer;
2768 int l; 2790 int l;
@@ -2788,7 +2810,7 @@ int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol, int no_context)
2788 case MPOL_PREFERRED: 2810 case MPOL_PREFERRED:
2789 nodes_clear(nodes); 2811 nodes_clear(nodes);
2790 if (flags & MPOL_F_LOCAL) 2812 if (flags & MPOL_F_LOCAL)
2791 mode = MPOL_LOCAL; /* pseudo-policy */ 2813 mode = MPOL_LOCAL;
2792 else 2814 else
2793 node_set(pol->v.preferred_node, nodes); 2815 node_set(pol->v.preferred_node, nodes);
2794 break; 2816 break;
@@ -2796,10 +2818,7 @@ int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol, int no_context)
2796 case MPOL_BIND: 2818 case MPOL_BIND:
2797 /* Fall through */ 2819 /* Fall through */
2798 case MPOL_INTERLEAVE: 2820 case MPOL_INTERLEAVE:
2799 if (no_context) 2821 nodes = pol->v.nodes;
2800 nodes = pol->w.user_nodemask;
2801 else
2802 nodes = pol->v.nodes;
2803 break; 2822 break;
2804 2823
2805 default: 2824 default:
diff --git a/mm/migrate.c b/mm/migrate.c
index 3b676b0c5c3e..3bbaf5d230b0 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -160,8 +160,10 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma,
160 if (is_write_migration_entry(entry)) 160 if (is_write_migration_entry(entry))
161 pte = pte_mkwrite(pte); 161 pte = pte_mkwrite(pte);
162#ifdef CONFIG_HUGETLB_PAGE 162#ifdef CONFIG_HUGETLB_PAGE
163 if (PageHuge(new)) 163 if (PageHuge(new)) {
164 pte = pte_mkhuge(pte); 164 pte = pte_mkhuge(pte);
165 pte = arch_make_huge_pte(pte, vma, new, 0);
166 }
165#endif 167#endif
166 flush_cache_page(vma, addr, pte_pfn(pte)); 168 flush_cache_page(vma, addr, pte_pfn(pte));
167 set_pte_at(mm, addr, ptep, pte); 169 set_pte_at(mm, addr, ptep, pte);
@@ -462,7 +464,10 @@ void migrate_page_copy(struct page *newpage, struct page *page)
462 464
463 mlock_migrate_page(newpage, page); 465 mlock_migrate_page(newpage, page);
464 ksm_migrate_page(newpage, page); 466 ksm_migrate_page(newpage, page);
465 467 /*
468 * Please do not reorder this without considering how mm/ksm.c's
469 * get_ksm_page() depends upon ksm_migrate_page() and PageSwapCache().
470 */
466 ClearPageSwapCache(page); 471 ClearPageSwapCache(page);
467 ClearPagePrivate(page); 472 ClearPagePrivate(page);
468 set_page_private(page, 0); 473 set_page_private(page, 0);
@@ -696,7 +701,7 @@ static int move_to_new_page(struct page *newpage, struct page *page,
696} 701}
697 702
698static int __unmap_and_move(struct page *page, struct page *newpage, 703static int __unmap_and_move(struct page *page, struct page *newpage,
699 int force, bool offlining, enum migrate_mode mode) 704 int force, enum migrate_mode mode)
700{ 705{
701 int rc = -EAGAIN; 706 int rc = -EAGAIN;
702 int remap_swapcache = 1; 707 int remap_swapcache = 1;
@@ -726,20 +731,6 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
726 lock_page(page); 731 lock_page(page);
727 } 732 }
728 733
729 /*
730 * Only memory hotplug's offline_pages() caller has locked out KSM,
731 * and can safely migrate a KSM page. The other cases have skipped
732 * PageKsm along with PageReserved - but it is only now when we have
733 * the page lock that we can be certain it will not go KSM beneath us
734 * (KSM will not upgrade a page from PageAnon to PageKsm when it sees
735 * its pagecount raised, but only here do we take the page lock which
736 * serializes that).
737 */
738 if (PageKsm(page) && !offlining) {
739 rc = -EBUSY;
740 goto unlock;
741 }
742
743 /* charge against new page */ 734 /* charge against new page */
744 mem_cgroup_prepare_migration(page, newpage, &mem); 735 mem_cgroup_prepare_migration(page, newpage, &mem);
745 736
@@ -766,7 +757,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
766 * File Caches may use write_page() or lock_page() in migration, then, 757 * File Caches may use write_page() or lock_page() in migration, then,
767 * just care Anon page here. 758 * just care Anon page here.
768 */ 759 */
769 if (PageAnon(page)) { 760 if (PageAnon(page) && !PageKsm(page)) {
770 /* 761 /*
771 * Only page_lock_anon_vma_read() understands the subtleties of 762 * Only page_lock_anon_vma_read() understands the subtleties of
772 * getting a hold on an anon_vma from outside one of its mms. 763 * getting a hold on an anon_vma from outside one of its mms.
@@ -846,7 +837,6 @@ uncharge:
846 mem_cgroup_end_migration(mem, page, newpage, 837 mem_cgroup_end_migration(mem, page, newpage,
847 (rc == MIGRATEPAGE_SUCCESS || 838 (rc == MIGRATEPAGE_SUCCESS ||
848 rc == MIGRATEPAGE_BALLOON_SUCCESS)); 839 rc == MIGRATEPAGE_BALLOON_SUCCESS));
849unlock:
850 unlock_page(page); 840 unlock_page(page);
851out: 841out:
852 return rc; 842 return rc;
@@ -857,8 +847,7 @@ out:
857 * to the newly allocated page in newpage. 847 * to the newly allocated page in newpage.
858 */ 848 */
859static int unmap_and_move(new_page_t get_new_page, unsigned long private, 849static int unmap_and_move(new_page_t get_new_page, unsigned long private,
860 struct page *page, int force, bool offlining, 850 struct page *page, int force, enum migrate_mode mode)
861 enum migrate_mode mode)
862{ 851{
863 int rc = 0; 852 int rc = 0;
864 int *result = NULL; 853 int *result = NULL;
@@ -876,7 +865,7 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
876 if (unlikely(split_huge_page(page))) 865 if (unlikely(split_huge_page(page)))
877 goto out; 866 goto out;
878 867
879 rc = __unmap_and_move(page, newpage, force, offlining, mode); 868 rc = __unmap_and_move(page, newpage, force, mode);
880 869
881 if (unlikely(rc == MIGRATEPAGE_BALLOON_SUCCESS)) { 870 if (unlikely(rc == MIGRATEPAGE_BALLOON_SUCCESS)) {
882 /* 871 /*
@@ -936,8 +925,7 @@ out:
936 */ 925 */
937static int unmap_and_move_huge_page(new_page_t get_new_page, 926static int unmap_and_move_huge_page(new_page_t get_new_page,
938 unsigned long private, struct page *hpage, 927 unsigned long private, struct page *hpage,
939 int force, bool offlining, 928 int force, enum migrate_mode mode)
940 enum migrate_mode mode)
941{ 929{
942 int rc = 0; 930 int rc = 0;
943 int *result = NULL; 931 int *result = NULL;
@@ -999,9 +987,8 @@ out:
999 * 987 *
1000 * Return: Number of pages not migrated or error code. 988 * Return: Number of pages not migrated or error code.
1001 */ 989 */
1002int migrate_pages(struct list_head *from, 990int migrate_pages(struct list_head *from, new_page_t get_new_page,
1003 new_page_t get_new_page, unsigned long private, bool offlining, 991 unsigned long private, enum migrate_mode mode, int reason)
1004 enum migrate_mode mode, int reason)
1005{ 992{
1006 int retry = 1; 993 int retry = 1;
1007 int nr_failed = 0; 994 int nr_failed = 0;
@@ -1022,8 +1009,7 @@ int migrate_pages(struct list_head *from,
1022 cond_resched(); 1009 cond_resched();
1023 1010
1024 rc = unmap_and_move(get_new_page, private, 1011 rc = unmap_and_move(get_new_page, private,
1025 page, pass > 2, offlining, 1012 page, pass > 2, mode);
1026 mode);
1027 1013
1028 switch(rc) { 1014 switch(rc) {
1029 case -ENOMEM: 1015 case -ENOMEM:
@@ -1056,15 +1042,13 @@ out:
1056} 1042}
1057 1043
1058int migrate_huge_page(struct page *hpage, new_page_t get_new_page, 1044int migrate_huge_page(struct page *hpage, new_page_t get_new_page,
1059 unsigned long private, bool offlining, 1045 unsigned long private, enum migrate_mode mode)
1060 enum migrate_mode mode)
1061{ 1046{
1062 int pass, rc; 1047 int pass, rc;
1063 1048
1064 for (pass = 0; pass < 10; pass++) { 1049 for (pass = 0; pass < 10; pass++) {
1065 rc = unmap_and_move_huge_page(get_new_page, 1050 rc = unmap_and_move_huge_page(get_new_page, private,
1066 private, hpage, pass > 2, offlining, 1051 hpage, pass > 2, mode);
1067 mode);
1068 switch (rc) { 1052 switch (rc) {
1069 case -ENOMEM: 1053 case -ENOMEM:
1070 goto out; 1054 goto out;
@@ -1150,7 +1134,7 @@ static int do_move_page_to_node_array(struct mm_struct *mm,
1150 goto set_status; 1134 goto set_status;
1151 1135
1152 /* Use PageReserved to check for zero page */ 1136 /* Use PageReserved to check for zero page */
1153 if (PageReserved(page) || PageKsm(page)) 1137 if (PageReserved(page))
1154 goto put_and_set; 1138 goto put_and_set;
1155 1139
1156 pp->page = page; 1140 pp->page = page;
@@ -1187,8 +1171,7 @@ set_status:
1187 err = 0; 1171 err = 0;
1188 if (!list_empty(&pagelist)) { 1172 if (!list_empty(&pagelist)) {
1189 err = migrate_pages(&pagelist, new_page_node, 1173 err = migrate_pages(&pagelist, new_page_node,
1190 (unsigned long)pm, 0, MIGRATE_SYNC, 1174 (unsigned long)pm, MIGRATE_SYNC, MR_SYSCALL);
1191 MR_SYSCALL);
1192 if (err) 1175 if (err)
1193 putback_lru_pages(&pagelist); 1176 putback_lru_pages(&pagelist);
1194 } 1177 }
@@ -1312,7 +1295,7 @@ static void do_pages_stat_array(struct mm_struct *mm, unsigned long nr_pages,
1312 1295
1313 err = -ENOENT; 1296 err = -ENOENT;
1314 /* Use PageReserved to check for zero page */ 1297 /* Use PageReserved to check for zero page */
1315 if (!page || PageReserved(page) || PageKsm(page)) 1298 if (!page || PageReserved(page))
1316 goto set_status; 1299 goto set_status;
1317 1300
1318 err = page_to_nid(page); 1301 err = page_to_nid(page);
@@ -1459,7 +1442,7 @@ int migrate_vmas(struct mm_struct *mm, const nodemask_t *to,
1459 * pages. Currently it only checks the watermarks, which is a crude check. 1442 * pages. Currently it only checks the watermarks, which is a crude check.
1460 */ 1443 */
1461static bool migrate_balanced_pgdat(struct pglist_data *pgdat, 1444static bool migrate_balanced_pgdat(struct pglist_data *pgdat,
1462 int nr_migrate_pages) 1445 unsigned long nr_migrate_pages)
1463{ 1446{
1464 int z; 1447 int z;
1465 for (z = pgdat->nr_zones - 1; z >= 0; z--) { 1448 for (z = pgdat->nr_zones - 1; z >= 0; z--) {
@@ -1495,7 +1478,7 @@ static struct page *alloc_misplaced_dst_page(struct page *page,
1495 __GFP_NOWARN) & 1478 __GFP_NOWARN) &
1496 ~GFP_IOFS, 0); 1479 ~GFP_IOFS, 0);
1497 if (newpage) 1480 if (newpage)
1498 page_xchg_last_nid(newpage, page_last_nid(page)); 1481 page_nid_xchg_last(newpage, page_nid_last(page));
1499 1482
1500 return newpage; 1483 return newpage;
1501} 1484}
@@ -1555,39 +1538,40 @@ bool numamigrate_update_ratelimit(pg_data_t *pgdat, unsigned long nr_pages)
1555 1538
1556int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page) 1539int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page)
1557{ 1540{
1558 int ret = 0; 1541 int page_lru;
1542
1543 VM_BUG_ON(compound_order(page) && !PageTransHuge(page));
1559 1544
1560 /* Avoid migrating to a node that is nearly full */ 1545 /* Avoid migrating to a node that is nearly full */
1561 if (migrate_balanced_pgdat(pgdat, 1)) { 1546 if (!migrate_balanced_pgdat(pgdat, 1UL << compound_order(page)))
1562 int page_lru; 1547 return 0;
1563 1548
1564 if (isolate_lru_page(page)) { 1549 if (isolate_lru_page(page))
1565 put_page(page); 1550 return 0;
1566 return 0;
1567 }
1568 1551
1569 /* Page is isolated */ 1552 /*
1570 ret = 1; 1553 * migrate_misplaced_transhuge_page() skips page migration's usual
1571 page_lru = page_is_file_cache(page); 1554 * check on page_count(), so we must do it here, now that the page
1572 if (!PageTransHuge(page)) 1555 * has been isolated: a GUP pin, or any other pin, prevents migration.
1573 inc_zone_page_state(page, NR_ISOLATED_ANON + page_lru); 1556 * The expected page count is 3: 1 for page's mapcount and 1 for the
1574 else 1557 * caller's pin and 1 for the reference taken by isolate_lru_page().
1575 mod_zone_page_state(page_zone(page), 1558 */
1576 NR_ISOLATED_ANON + page_lru, 1559 if (PageTransHuge(page) && page_count(page) != 3) {
1577 HPAGE_PMD_NR); 1560 putback_lru_page(page);
1561 return 0;
1578 } 1562 }
1579 1563
1564 page_lru = page_is_file_cache(page);
1565 mod_zone_page_state(page_zone(page), NR_ISOLATED_ANON + page_lru,
1566 hpage_nr_pages(page));
1567
1580 /* 1568 /*
1581 * Page is either isolated or there is not enough space on the target 1569 * Isolating the page has taken another reference, so the
1582 * node. If isolated, then it has taken a reference count and the 1570 * caller's reference can be safely dropped without the page
1583 * callers reference can be safely dropped without the page 1571 * disappearing underneath us during migration.
1584 * disappearing underneath us during migration. Otherwise the page is
1585 * not to be migrated but the callers reference should still be
1586 * dropped so it does not leak.
1587 */ 1572 */
1588 put_page(page); 1573 put_page(page);
1589 1574 return 1;
1590 return ret;
1591} 1575}
1592 1576
1593/* 1577/*
@@ -1598,7 +1582,7 @@ int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page)
1598int migrate_misplaced_page(struct page *page, int node) 1582int migrate_misplaced_page(struct page *page, int node)
1599{ 1583{
1600 pg_data_t *pgdat = NODE_DATA(node); 1584 pg_data_t *pgdat = NODE_DATA(node);
1601 int isolated = 0; 1585 int isolated;
1602 int nr_remaining; 1586 int nr_remaining;
1603 LIST_HEAD(migratepages); 1587 LIST_HEAD(migratepages);
1604 1588
@@ -1606,42 +1590,43 @@ int migrate_misplaced_page(struct page *page, int node)
1606 * Don't migrate pages that are mapped in multiple processes. 1590 * Don't migrate pages that are mapped in multiple processes.
1607 * TODO: Handle false sharing detection instead of this hammer 1591 * TODO: Handle false sharing detection instead of this hammer
1608 */ 1592 */
1609 if (page_mapcount(page) != 1) { 1593 if (page_mapcount(page) != 1)
1610 put_page(page);
1611 goto out; 1594 goto out;
1612 }
1613 1595
1614 /* 1596 /*
1615 * Rate-limit the amount of data that is being migrated to a node. 1597 * Rate-limit the amount of data that is being migrated to a node.
1616 * Optimal placement is no good if the memory bus is saturated and 1598 * Optimal placement is no good if the memory bus is saturated and
1617 * all the time is being spent migrating! 1599 * all the time is being spent migrating!
1618 */ 1600 */
1619 if (numamigrate_update_ratelimit(pgdat, 1)) { 1601 if (numamigrate_update_ratelimit(pgdat, 1))
1620 put_page(page);
1621 goto out; 1602 goto out;
1622 }
1623 1603
1624 isolated = numamigrate_isolate_page(pgdat, page); 1604 isolated = numamigrate_isolate_page(pgdat, page);
1625 if (!isolated) 1605 if (!isolated)
1626 goto out; 1606 goto out;
1627 1607
1628 list_add(&page->lru, &migratepages); 1608 list_add(&page->lru, &migratepages);
1629 nr_remaining = migrate_pages(&migratepages, 1609 nr_remaining = migrate_pages(&migratepages, alloc_misplaced_dst_page,
1630 alloc_misplaced_dst_page, 1610 node, MIGRATE_ASYNC, MR_NUMA_MISPLACED);
1631 node, false, MIGRATE_ASYNC,
1632 MR_NUMA_MISPLACED);
1633 if (nr_remaining) { 1611 if (nr_remaining) {
1634 putback_lru_pages(&migratepages); 1612 putback_lru_pages(&migratepages);
1635 isolated = 0; 1613 isolated = 0;
1636 } else 1614 } else
1637 count_vm_numa_event(NUMA_PAGE_MIGRATE); 1615 count_vm_numa_event(NUMA_PAGE_MIGRATE);
1638 BUG_ON(!list_empty(&migratepages)); 1616 BUG_ON(!list_empty(&migratepages));
1639out:
1640 return isolated; 1617 return isolated;
1618
1619out:
1620 put_page(page);
1621 return 0;
1641} 1622}
1642#endif /* CONFIG_NUMA_BALANCING */ 1623#endif /* CONFIG_NUMA_BALANCING */
1643 1624
1644#if defined(CONFIG_NUMA_BALANCING) && defined(CONFIG_TRANSPARENT_HUGEPAGE) 1625#if defined(CONFIG_NUMA_BALANCING) && defined(CONFIG_TRANSPARENT_HUGEPAGE)
1626/*
1627 * Migrates a THP to a given target node. page must be locked and is unlocked
1628 * before returning.
1629 */
1645int migrate_misplaced_transhuge_page(struct mm_struct *mm, 1630int migrate_misplaced_transhuge_page(struct mm_struct *mm,
1646 struct vm_area_struct *vma, 1631 struct vm_area_struct *vma,
1647 pmd_t *pmd, pmd_t entry, 1632 pmd_t *pmd, pmd_t entry,
@@ -1672,17 +1657,15 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
1672 1657
1673 new_page = alloc_pages_node(node, 1658 new_page = alloc_pages_node(node,
1674 (GFP_TRANSHUGE | GFP_THISNODE) & ~__GFP_WAIT, HPAGE_PMD_ORDER); 1659 (GFP_TRANSHUGE | GFP_THISNODE) & ~__GFP_WAIT, HPAGE_PMD_ORDER);
1675 if (!new_page) { 1660 if (!new_page)
1676 count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR); 1661 goto out_fail;
1677 goto out_dropref; 1662
1678 } 1663 page_nid_xchg_last(new_page, page_nid_last(page));
1679 page_xchg_last_nid(new_page, page_last_nid(page));
1680 1664
1681 isolated = numamigrate_isolate_page(pgdat, page); 1665 isolated = numamigrate_isolate_page(pgdat, page);
1682 if (!isolated) { 1666 if (!isolated) {
1683 count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR);
1684 put_page(new_page); 1667 put_page(new_page);
1685 goto out_keep_locked; 1668 goto out_fail;
1686 } 1669 }
1687 1670
1688 /* Prepare a page as a migration target */ 1671 /* Prepare a page as a migration target */
@@ -1714,6 +1697,7 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
1714 putback_lru_page(page); 1697 putback_lru_page(page);
1715 1698
1716 count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR); 1699 count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR);
1700 isolated = 0;
1717 goto out; 1701 goto out;
1718 } 1702 }
1719 1703
@@ -1758,9 +1742,11 @@ out:
1758 -HPAGE_PMD_NR); 1742 -HPAGE_PMD_NR);
1759 return isolated; 1743 return isolated;
1760 1744
1745out_fail:
1746 count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR);
1761out_dropref: 1747out_dropref:
1748 unlock_page(page);
1762 put_page(page); 1749 put_page(page);
1763out_keep_locked:
1764 return 0; 1750 return 0;
1765} 1751}
1766#endif /* CONFIG_NUMA_BALANCING */ 1752#endif /* CONFIG_NUMA_BALANCING */
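
The migrate.c hunks above drop the "offlining" argument from every migration entry point. The sketch below shows a caller of the new five-argument migrate_pages(); example_new_page() and example_migrate_list() are hypothetical names and the allocation policy is only illustrative, but the signatures follow the patched code.

static struct page *example_new_page(struct page *page, unsigned long private,
				     int **result)
{
	/* 'private' carries the target node id in this sketch */
	return alloc_pages_node((int)private, GFP_HIGHUSER_MOVABLE, 0);
}

static int example_migrate_list(struct list_head *pagelist, int target_nid)
{
	int err;

	if (list_empty(pagelist))
		return 0;

	/* no 'offlining' flag any more; KSM pages are handled internally */
	err = migrate_pages(pagelist, example_new_page,
			    (unsigned long)target_nid,
			    MIGRATE_SYNC, MR_SYSCALL);
	if (err)
		putback_lru_pages(pagelist);	/* return stragglers to the LRU */
	return err;
}
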
diff --git a/mm/mincore.c b/mm/mincore.c
index 936b4cee8cb1..da2be56a7b8f 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -75,7 +75,7 @@ static unsigned char mincore_page(struct address_space *mapping, pgoff_t pgoff)
75 /* shmem/tmpfs may return swap: account for swapcache page too. */ 75 /* shmem/tmpfs may return swap: account for swapcache page too. */
76 if (radix_tree_exceptional_entry(page)) { 76 if (radix_tree_exceptional_entry(page)) {
77 swp_entry_t swap = radix_to_swp_entry(page); 77 swp_entry_t swap = radix_to_swp_entry(page);
78 page = find_get_page(&swapper_space, swap.val); 78 page = find_get_page(swap_address_space(swap), swap.val);
79 } 79 }
80#endif 80#endif
81 if (page) { 81 if (page) {
@@ -135,7 +135,8 @@ static void mincore_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
135 } else { 135 } else {
136#ifdef CONFIG_SWAP 136#ifdef CONFIG_SWAP
137 pgoff = entry.val; 137 pgoff = entry.val;
138 *vec = mincore_page(&swapper_space, pgoff); 138 *vec = mincore_page(swap_address_space(entry),
139 pgoff);
139#else 140#else
140 WARN_ON(1); 141 WARN_ON(1);
141 *vec = 1; 142 *vec = 1;
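
With the swap cache split across per-type address spaces, lookups are keyed by the swap entry rather than the old global swapper_space. A one-function sketch of the lookup pattern used above (the helper name is hypothetical):

static struct page *example_lookup_swapcache(swp_entry_t entry)
{
	/* select the address space backing this swap entry, then look up */
	return find_get_page(swap_address_space(entry), entry.val);
}
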
diff --git a/mm/mlock.c b/mm/mlock.c
index f0b9ce572fc7..e6638f565d42 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -155,13 +155,12 @@ void munlock_vma_page(struct page *page)
155 * 155 *
156 * vma->vm_mm->mmap_sem must be held for at least read. 156 * vma->vm_mm->mmap_sem must be held for at least read.
157 */ 157 */
158static long __mlock_vma_pages_range(struct vm_area_struct *vma, 158long __mlock_vma_pages_range(struct vm_area_struct *vma,
159 unsigned long start, unsigned long end, 159 unsigned long start, unsigned long end, int *nonblocking)
160 int *nonblocking)
161{ 160{
162 struct mm_struct *mm = vma->vm_mm; 161 struct mm_struct *mm = vma->vm_mm;
163 unsigned long addr = start; 162 unsigned long addr = start;
164 int nr_pages = (end - start) / PAGE_SIZE; 163 unsigned long nr_pages = (end - start) / PAGE_SIZE;
165 int gup_flags; 164 int gup_flags;
166 165
167 VM_BUG_ON(start & ~PAGE_MASK); 166 VM_BUG_ON(start & ~PAGE_MASK);
@@ -186,6 +185,10 @@ static long __mlock_vma_pages_range(struct vm_area_struct *vma,
186 if (vma->vm_flags & (VM_READ | VM_WRITE | VM_EXEC)) 185 if (vma->vm_flags & (VM_READ | VM_WRITE | VM_EXEC))
187 gup_flags |= FOLL_FORCE; 186 gup_flags |= FOLL_FORCE;
188 187
188 /*
189 * We made sure addr is within a VMA, so the following will
190 * not result in a stack expansion that recurses back here.
191 */
189 return __get_user_pages(current, mm, addr, nr_pages, gup_flags, 192 return __get_user_pages(current, mm, addr, nr_pages, gup_flags,
190 NULL, NULL, nonblocking); 193 NULL, NULL, nonblocking);
191} 194}
@@ -202,56 +205,6 @@ static int __mlock_posix_error_return(long retval)
202 return retval; 205 return retval;
203} 206}
204 207
205/**
206 * mlock_vma_pages_range() - mlock pages in specified vma range.
207 * @vma - the vma containing the specfied address range
208 * @start - starting address in @vma to mlock
209 * @end - end address [+1] in @vma to mlock
210 *
211 * For mmap()/mremap()/expansion of mlocked vma.
212 *
213 * return 0 on success for "normal" vmas.
214 *
215 * return number of pages [> 0] to be removed from locked_vm on success
216 * of "special" vmas.
217 */
218long mlock_vma_pages_range(struct vm_area_struct *vma,
219 unsigned long start, unsigned long end)
220{
221 int nr_pages = (end - start) / PAGE_SIZE;
222 BUG_ON(!(vma->vm_flags & VM_LOCKED));
223
224 /*
225 * filter unlockable vmas
226 */
227 if (vma->vm_flags & (VM_IO | VM_PFNMAP))
228 goto no_mlock;
229
230 if (!((vma->vm_flags & VM_DONTEXPAND) ||
231 is_vm_hugetlb_page(vma) ||
232 vma == get_gate_vma(current->mm))) {
233
234 __mlock_vma_pages_range(vma, start, end, NULL);
235
236 /* Hide errors from mmap() and other callers */
237 return 0;
238 }
239
240 /*
241 * User mapped kernel pages or huge pages:
242 * make these pages present to populate the ptes, but
243 * fall thru' to reset VM_LOCKED--no need to unlock, and
244 * return nr_pages so these don't get counted against task's
245 * locked limit. huge pages are already counted against
246 * locked vm limit.
247 */
248 make_pages_present(start, end);
249
250no_mlock:
251 vma->vm_flags &= ~VM_LOCKED; /* and don't come back! */
252 return nr_pages; /* error or pages NOT mlocked */
253}
254
255/* 208/*
256 * munlock_vma_pages_range() - munlock all pages in the vma range. 209 * munlock_vma_pages_range() - munlock all pages in the vma range.
257 * @vma - vma containing range to be munlock()ed. 210 * @vma - vma containing range to be munlock()ed.
@@ -303,7 +256,7 @@ void munlock_vma_pages_range(struct vm_area_struct *vma,
303 * 256 *
304 * Filters out "special" vmas -- VM_LOCKED never gets set for these, and 257 * Filters out "special" vmas -- VM_LOCKED never gets set for these, and
305 * munlock is a no-op. However, for some special vmas, we go ahead and 258 * munlock is a no-op. However, for some special vmas, we go ahead and
306 * populate the ptes via make_pages_present(). 259 * populate the ptes.
307 * 260 *
308 * For vmas that pass the filters, merge/split as appropriate. 261 * For vmas that pass the filters, merge/split as appropriate.
309 */ 262 */
@@ -391,9 +344,9 @@ static int do_mlock(unsigned long start, size_t len, int on)
391 344
392 /* Here we know that vma->vm_start <= nstart < vma->vm_end. */ 345 /* Here we know that vma->vm_start <= nstart < vma->vm_end. */
393 346
394 newflags = vma->vm_flags | VM_LOCKED; 347 newflags = vma->vm_flags & ~VM_LOCKED;
395 if (!on) 348 if (on)
396 newflags &= ~VM_LOCKED; 349 newflags |= VM_LOCKED | VM_POPULATE;
397 350
398 tmp = vma->vm_end; 351 tmp = vma->vm_end;
399 if (tmp > end) 352 if (tmp > end)
@@ -416,13 +369,20 @@ static int do_mlock(unsigned long start, size_t len, int on)
416 return error; 369 return error;
417} 370}
418 371
419static int do_mlock_pages(unsigned long start, size_t len, int ignore_errors) 372/*
373 * __mm_populate - populate and/or mlock pages within a range of address space.
374 *
375 * This is used to implement mlock() and the MAP_POPULATE / MAP_LOCKED mmap
376 * flags. VMAs must be already marked with the desired vm_flags, and
377 * mmap_sem must not be held.
378 */
379int __mm_populate(unsigned long start, unsigned long len, int ignore_errors)
420{ 380{
421 struct mm_struct *mm = current->mm; 381 struct mm_struct *mm = current->mm;
422 unsigned long end, nstart, nend; 382 unsigned long end, nstart, nend;
423 struct vm_area_struct *vma = NULL; 383 struct vm_area_struct *vma = NULL;
424 int locked = 0; 384 int locked = 0;
425 int ret = 0; 385 long ret = 0;
426 386
427 VM_BUG_ON(start & ~PAGE_MASK); 387 VM_BUG_ON(start & ~PAGE_MASK);
428 VM_BUG_ON(len != PAGE_ALIGN(len)); 388 VM_BUG_ON(len != PAGE_ALIGN(len));
@@ -446,7 +406,8 @@ static int do_mlock_pages(unsigned long start, size_t len, int ignore_errors)
446 * range with the first VMA. Also, skip undesirable VMA types. 406 * range with the first VMA. Also, skip undesirable VMA types.
447 */ 407 */
448 nend = min(end, vma->vm_end); 408 nend = min(end, vma->vm_end);
449 if (vma->vm_flags & (VM_IO | VM_PFNMAP)) 409 if ((vma->vm_flags & (VM_IO | VM_PFNMAP | VM_POPULATE)) !=
410 VM_POPULATE)
450 continue; 411 continue;
451 if (nstart < vma->vm_start) 412 if (nstart < vma->vm_start)
452 nstart = vma->vm_start; 413 nstart = vma->vm_start;
@@ -498,7 +459,7 @@ SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len)
498 error = do_mlock(start, len, 1); 459 error = do_mlock(start, len, 1);
499 up_write(&current->mm->mmap_sem); 460 up_write(&current->mm->mmap_sem);
500 if (!error) 461 if (!error)
501 error = do_mlock_pages(start, len, 0); 462 error = __mm_populate(start, len, 0);
502 return error; 463 return error;
503} 464}
504 465
@@ -517,20 +478,20 @@ SYSCALL_DEFINE2(munlock, unsigned long, start, size_t, len)
517static int do_mlockall(int flags) 478static int do_mlockall(int flags)
518{ 479{
519 struct vm_area_struct * vma, * prev = NULL; 480 struct vm_area_struct * vma, * prev = NULL;
520 unsigned int def_flags = 0;
521 481
522 if (flags & MCL_FUTURE) 482 if (flags & MCL_FUTURE)
523 def_flags = VM_LOCKED; 483 current->mm->def_flags |= VM_LOCKED | VM_POPULATE;
524 current->mm->def_flags = def_flags; 484 else
485 current->mm->def_flags &= ~(VM_LOCKED | VM_POPULATE);
525 if (flags == MCL_FUTURE) 486 if (flags == MCL_FUTURE)
526 goto out; 487 goto out;
527 488
528 for (vma = current->mm->mmap; vma ; vma = prev->vm_next) { 489 for (vma = current->mm->mmap; vma ; vma = prev->vm_next) {
529 vm_flags_t newflags; 490 vm_flags_t newflags;
530 491
531 newflags = vma->vm_flags | VM_LOCKED; 492 newflags = vma->vm_flags & ~VM_LOCKED;
532 if (!(flags & MCL_CURRENT)) 493 if (flags & MCL_CURRENT)
533 newflags &= ~VM_LOCKED; 494 newflags |= VM_LOCKED | VM_POPULATE;
534 495
535 /* Ignore errors */ 496 /* Ignore errors */
536 mlock_fixup(vma, &prev, vma->vm_start, vma->vm_end, newflags); 497 mlock_fixup(vma, &prev, vma->vm_start, vma->vm_end, newflags);
@@ -564,10 +525,8 @@ SYSCALL_DEFINE1(mlockall, int, flags)
564 capable(CAP_IPC_LOCK)) 525 capable(CAP_IPC_LOCK))
565 ret = do_mlockall(flags); 526 ret = do_mlockall(flags);
566 up_write(&current->mm->mmap_sem); 527 up_write(&current->mm->mmap_sem);
567 if (!ret && (flags & MCL_CURRENT)) { 528 if (!ret && (flags & MCL_CURRENT))
568 /* Ignore errors */ 529 mm_populate(0, TASK_SIZE);
569 do_mlock_pages(0, TASK_SIZE, 1);
570 }
571out: 530out:
572 return ret; 531 return ret;
573} 532}
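
The mlock() path now tags VMAs with VM_LOCKED | VM_POPULATE under mmap_sem and faults the pages in only after the semaphore is dropped. A sketch of that ordering, mirroring the patched sys_mlock(); example_mlock_range() is a hypothetical wrapper around the file-local do_mlock(), and rlimit/capability checks are omitted.

static int example_mlock_range(unsigned long start, size_t len)
{
	int error;

	len = PAGE_ALIGN(len + (start & ~PAGE_MASK));
	start &= PAGE_MASK;

	down_write(&current->mm->mmap_sem);
	error = do_mlock(start, len, 1);	/* sets VM_LOCKED | VM_POPULATE */
	up_write(&current->mm->mmap_sem);

	/* __mm_populate() must be called without mmap_sem held */
	if (!error)
		error = __mm_populate(start, len, 0);
	return error;
}
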
diff --git a/mm/mm_init.c b/mm/mm_init.c
index 1ffd97ae26d7..c280a02ea11e 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -69,34 +69,41 @@ void __init mminit_verify_pageflags_layout(void)
69 unsigned long or_mask, add_mask; 69 unsigned long or_mask, add_mask;
70 70
71 shift = 8 * sizeof(unsigned long); 71 shift = 8 * sizeof(unsigned long);
72 width = shift - SECTIONS_WIDTH - NODES_WIDTH - ZONES_WIDTH; 72 width = shift - SECTIONS_WIDTH - NODES_WIDTH - ZONES_WIDTH - LAST_NID_SHIFT;
73 mminit_dprintk(MMINIT_TRACE, "pageflags_layout_widths", 73 mminit_dprintk(MMINIT_TRACE, "pageflags_layout_widths",
74 "Section %d Node %d Zone %d Flags %d\n", 74 "Section %d Node %d Zone %d Lastnid %d Flags %d\n",
75 SECTIONS_WIDTH, 75 SECTIONS_WIDTH,
76 NODES_WIDTH, 76 NODES_WIDTH,
77 ZONES_WIDTH, 77 ZONES_WIDTH,
78 LAST_NID_WIDTH,
78 NR_PAGEFLAGS); 79 NR_PAGEFLAGS);
79 mminit_dprintk(MMINIT_TRACE, "pageflags_layout_shifts", 80 mminit_dprintk(MMINIT_TRACE, "pageflags_layout_shifts",
80 "Section %d Node %d Zone %d\n", 81 "Section %d Node %d Zone %d Lastnid %d\n",
81 SECTIONS_SHIFT, 82 SECTIONS_SHIFT,
82 NODES_SHIFT, 83 NODES_SHIFT,
83 ZONES_SHIFT); 84 ZONES_SHIFT,
84 mminit_dprintk(MMINIT_TRACE, "pageflags_layout_offsets", 85 LAST_NID_SHIFT);
85 "Section %lu Node %lu Zone %lu\n", 86 mminit_dprintk(MMINIT_TRACE, "pageflags_layout_pgshifts",
87 "Section %lu Node %lu Zone %lu Lastnid %lu\n",
86 (unsigned long)SECTIONS_PGSHIFT, 88 (unsigned long)SECTIONS_PGSHIFT,
87 (unsigned long)NODES_PGSHIFT, 89 (unsigned long)NODES_PGSHIFT,
88 (unsigned long)ZONES_PGSHIFT); 90 (unsigned long)ZONES_PGSHIFT,
89 mminit_dprintk(MMINIT_TRACE, "pageflags_layout_zoneid", 91 (unsigned long)LAST_NID_PGSHIFT);
90 "Zone ID: %lu -> %lu\n", 92 mminit_dprintk(MMINIT_TRACE, "pageflags_layout_nodezoneid",
91 (unsigned long)ZONEID_PGOFF, 93 "Node/Zone ID: %lu -> %lu\n",
92 (unsigned long)(ZONEID_PGOFF + ZONEID_SHIFT)); 94 (unsigned long)(ZONEID_PGOFF + ZONEID_SHIFT),
95 (unsigned long)ZONEID_PGOFF);
93 mminit_dprintk(MMINIT_TRACE, "pageflags_layout_usage", 96 mminit_dprintk(MMINIT_TRACE, "pageflags_layout_usage",
94 "location: %d -> %d unused %d -> %d flags %d -> %d\n", 97 "location: %d -> %d layout %d -> %d unused %d -> %d page-flags\n",
95 shift, width, width, NR_PAGEFLAGS, NR_PAGEFLAGS, 0); 98 shift, width, width, NR_PAGEFLAGS, NR_PAGEFLAGS, 0);
96#ifdef NODE_NOT_IN_PAGE_FLAGS 99#ifdef NODE_NOT_IN_PAGE_FLAGS
97 mminit_dprintk(MMINIT_TRACE, "pageflags_layout_nodeflags", 100 mminit_dprintk(MMINIT_TRACE, "pageflags_layout_nodeflags",
98 "Node not in page flags"); 101 "Node not in page flags");
99#endif 102#endif
103#ifdef LAST_NID_NOT_IN_PAGE_FLAGS
104 mminit_dprintk(MMINIT_TRACE, "pageflags_layout_nodeflags",
105 "Last nid not in page flags");
106#endif
100 107
101 if (SECTIONS_WIDTH) { 108 if (SECTIONS_WIDTH) {
102 shift -= SECTIONS_WIDTH; 109 shift -= SECTIONS_WIDTH;
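
For reference, the last-nid value reported in the layout dump above is packed into page->flags next to the section/node/zone fields. The accessor shape this implies is sketched below as an assumption based on the mm/mmzone.c hunk later in this patch; the real definitions live in the mm.h changes of this series.

static inline int example_page_nid_last(struct page *page)
{
	return (page->flags >> LAST_NID_PGSHIFT) & LAST_NID_MASK;
}
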
diff --git a/mm/mmap.c b/mm/mmap.c
index f54b235f29a9..318e121affda 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -32,6 +32,7 @@
32#include <linux/khugepaged.h> 32#include <linux/khugepaged.h>
33#include <linux/uprobes.h> 33#include <linux/uprobes.h>
34#include <linux/rbtree_augmented.h> 34#include <linux/rbtree_augmented.h>
35#include <linux/sched/sysctl.h>
35 36
36#include <asm/uaccess.h> 37#include <asm/uaccess.h>
37#include <asm/cacheflush.h> 38#include <asm/cacheflush.h>
@@ -143,7 +144,7 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
143 */ 144 */
144 free -= global_page_state(NR_SHMEM); 145 free -= global_page_state(NR_SHMEM);
145 146
146 free += nr_swap_pages; 147 free += get_nr_swap_pages();
147 148
148 /* 149 /*
149 * Any slabs which are created with the 150 * Any slabs which are created with the
@@ -255,6 +256,7 @@ SYSCALL_DEFINE1(brk, unsigned long, brk)
255 unsigned long newbrk, oldbrk; 256 unsigned long newbrk, oldbrk;
256 struct mm_struct *mm = current->mm; 257 struct mm_struct *mm = current->mm;
257 unsigned long min_brk; 258 unsigned long min_brk;
259 bool populate;
258 260
259 down_write(&mm->mmap_sem); 261 down_write(&mm->mmap_sem);
260 262
@@ -304,8 +306,15 @@ SYSCALL_DEFINE1(brk, unsigned long, brk)
304 /* Ok, looks good - let it rip. */ 306 /* Ok, looks good - let it rip. */
305 if (do_brk(oldbrk, newbrk-oldbrk) != oldbrk) 307 if (do_brk(oldbrk, newbrk-oldbrk) != oldbrk)
306 goto out; 308 goto out;
309
307set_brk: 310set_brk:
308 mm->brk = brk; 311 mm->brk = brk;
312 populate = newbrk > oldbrk && (mm->def_flags & VM_LOCKED) != 0;
313 up_write(&mm->mmap_sem);
314 if (populate)
315 mm_populate(oldbrk, newbrk - oldbrk);
316 return brk;
317
309out: 318out:
310 retval = mm->brk; 319 retval = mm->brk;
311 up_write(&mm->mmap_sem); 320 up_write(&mm->mmap_sem);
@@ -800,7 +809,7 @@ again: remove_next = 1 + (end > next->vm_end);
800 anon_vma_interval_tree_post_update_vma(vma); 809 anon_vma_interval_tree_post_update_vma(vma);
801 if (adjust_next) 810 if (adjust_next)
802 anon_vma_interval_tree_post_update_vma(next); 811 anon_vma_interval_tree_post_update_vma(next);
803 anon_vma_unlock(anon_vma); 812 anon_vma_unlock_write(anon_vma);
804 } 813 }
805 if (mapping) 814 if (mapping)
806 mutex_unlock(&mapping->i_mmap_mutex); 815 mutex_unlock(&mapping->i_mmap_mutex);
@@ -1153,12 +1162,15 @@ static inline unsigned long round_hint_to_min(unsigned long hint)
1153 1162
1154unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, 1163unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
1155 unsigned long len, unsigned long prot, 1164 unsigned long len, unsigned long prot,
1156 unsigned long flags, unsigned long pgoff) 1165 unsigned long flags, unsigned long pgoff,
1166 unsigned long *populate)
1157{ 1167{
1158 struct mm_struct * mm = current->mm; 1168 struct mm_struct * mm = current->mm;
1159 struct inode *inode; 1169 struct inode *inode;
1160 vm_flags_t vm_flags; 1170 vm_flags_t vm_flags;
1161 1171
1172 *populate = 0;
1173
1162 /* 1174 /*
1163 * Does the application expect PROT_READ to imply PROT_EXEC? 1175 * Does the application expect PROT_READ to imply PROT_EXEC?
1164 * 1176 *
@@ -1279,7 +1291,24 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
1279 } 1291 }
1280 } 1292 }
1281 1293
1282 return mmap_region(file, addr, len, flags, vm_flags, pgoff); 1294 /*
1295 * Set 'VM_NORESERVE' if we should not account for the
1296 * memory use of this mapping.
1297 */
1298 if (flags & MAP_NORESERVE) {
1299 /* We honor MAP_NORESERVE if allowed to overcommit */
1300 if (sysctl_overcommit_memory != OVERCOMMIT_NEVER)
1301 vm_flags |= VM_NORESERVE;
1302
1303 /* hugetlb applies strict overcommit unless MAP_NORESERVE */
1304 if (file && is_file_hugepages(file))
1305 vm_flags |= VM_NORESERVE;
1306 }
1307
1308 addr = mmap_region(file, addr, len, vm_flags, pgoff);
1309 if (!IS_ERR_VALUE(addr) && (vm_flags & VM_POPULATE))
1310 *populate = len;
1311 return addr;
1283} 1312}
1284 1313
1285SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len, 1314SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
@@ -1394,8 +1423,7 @@ static inline int accountable_mapping(struct file *file, vm_flags_t vm_flags)
1394} 1423}
1395 1424
1396unsigned long mmap_region(struct file *file, unsigned long addr, 1425unsigned long mmap_region(struct file *file, unsigned long addr,
1397 unsigned long len, unsigned long flags, 1426 unsigned long len, vm_flags_t vm_flags, unsigned long pgoff)
1398 vm_flags_t vm_flags, unsigned long pgoff)
1399{ 1427{
1400 struct mm_struct *mm = current->mm; 1428 struct mm_struct *mm = current->mm;
1401 struct vm_area_struct *vma, *prev; 1429 struct vm_area_struct *vma, *prev;
@@ -1419,20 +1447,6 @@ munmap_back:
1419 return -ENOMEM; 1447 return -ENOMEM;
1420 1448
1421 /* 1449 /*
1422 * Set 'VM_NORESERVE' if we should not account for the
1423 * memory use of this mapping.
1424 */
1425 if ((flags & MAP_NORESERVE)) {
1426 /* We honor MAP_NORESERVE if allowed to overcommit */
1427 if (sysctl_overcommit_memory != OVERCOMMIT_NEVER)
1428 vm_flags |= VM_NORESERVE;
1429
1430 /* hugetlb applies strict overcommit unless MAP_NORESERVE */
1431 if (file && is_file_hugepages(file))
1432 vm_flags |= VM_NORESERVE;
1433 }
1434
1435 /*
1436 * Private writable mapping: check memory availability 1450 * Private writable mapping: check memory availability
1437 */ 1451 */
1438 if (accountable_mapping(file, vm_flags)) { 1452 if (accountable_mapping(file, vm_flags)) {
@@ -1530,10 +1544,12 @@ out:
1530 1544
1531 vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT); 1545 vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT);
1532 if (vm_flags & VM_LOCKED) { 1546 if (vm_flags & VM_LOCKED) {
1533 if (!mlock_vma_pages_range(vma, addr, addr + len)) 1547 if (!((vm_flags & VM_SPECIAL) || is_vm_hugetlb_page(vma) ||
1548 vma == get_gate_vma(current->mm)))
1534 mm->locked_vm += (len >> PAGE_SHIFT); 1549 mm->locked_vm += (len >> PAGE_SHIFT);
1535 } else if ((flags & MAP_POPULATE) && !(flags & MAP_NONBLOCK)) 1550 else
1536 make_pages_present(addr, addr + len); 1551 vma->vm_flags &= ~VM_LOCKED;
1552 }
1537 1553
1538 if (file) 1554 if (file)
1539 uprobe_mmap(vma); 1555 uprobe_mmap(vma);
@@ -2186,9 +2202,8 @@ find_extend_vma(struct mm_struct *mm, unsigned long addr)
2186 return vma; 2202 return vma;
2187 if (!prev || expand_stack(prev, addr)) 2203 if (!prev || expand_stack(prev, addr))
2188 return NULL; 2204 return NULL;
2189 if (prev->vm_flags & VM_LOCKED) { 2205 if (prev->vm_flags & VM_LOCKED)
2190 mlock_vma_pages_range(prev, addr, prev->vm_end); 2206 __mlock_vma_pages_range(prev, addr, prev->vm_end, NULL);
2191 }
2192 return prev; 2207 return prev;
2193} 2208}
2194#else 2209#else
@@ -2214,9 +2229,8 @@ find_extend_vma(struct mm_struct * mm, unsigned long addr)
2214 start = vma->vm_start; 2229 start = vma->vm_start;
2215 if (expand_stack(vma, addr)) 2230 if (expand_stack(vma, addr))
2216 return NULL; 2231 return NULL;
2217 if (vma->vm_flags & VM_LOCKED) { 2232 if (vma->vm_flags & VM_LOCKED)
2218 mlock_vma_pages_range(vma, addr, start); 2233 __mlock_vma_pages_range(vma, addr, start, NULL);
2219 }
2220 return vma; 2234 return vma;
2221} 2235}
2222#endif 2236#endif
@@ -2589,10 +2603,8 @@ static unsigned long do_brk(unsigned long addr, unsigned long len)
2589out: 2603out:
2590 perf_event_mmap(vma); 2604 perf_event_mmap(vma);
2591 mm->total_vm += len >> PAGE_SHIFT; 2605 mm->total_vm += len >> PAGE_SHIFT;
2592 if (flags & VM_LOCKED) { 2606 if (flags & VM_LOCKED)
2593 if (!mlock_vma_pages_range(vma, addr, addr + len)) 2607 mm->locked_vm += (len >> PAGE_SHIFT);
2594 mm->locked_vm += (len >> PAGE_SHIFT);
2595 }
2596 return addr; 2608 return addr;
2597} 2609}
2598 2610
@@ -2600,10 +2612,14 @@ unsigned long vm_brk(unsigned long addr, unsigned long len)
2600{ 2612{
2601 struct mm_struct *mm = current->mm; 2613 struct mm_struct *mm = current->mm;
2602 unsigned long ret; 2614 unsigned long ret;
2615 bool populate;
2603 2616
2604 down_write(&mm->mmap_sem); 2617 down_write(&mm->mmap_sem);
2605 ret = do_brk(addr, len); 2618 ret = do_brk(addr, len);
2619 populate = ((mm->def_flags & VM_LOCKED) != 0);
2606 up_write(&mm->mmap_sem); 2620 up_write(&mm->mmap_sem);
2621 if (populate)
2622 mm_populate(addr, len);
2607 return ret; 2623 return ret;
2608} 2624}
2609EXPORT_SYMBOL(vm_brk); 2625EXPORT_SYMBOL(vm_brk);
@@ -2886,7 +2902,7 @@ static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma)
2886 * The LSB of head.next can't change from under us 2902 * The LSB of head.next can't change from under us
2887 * because we hold the mm_all_locks_mutex. 2903 * because we hold the mm_all_locks_mutex.
2888 */ 2904 */
2889 down_write(&anon_vma->root->rwsem); 2905 down_write_nest_lock(&anon_vma->root->rwsem, &mm->mmap_sem);
2890 /* 2906 /*
2891 * We can safely modify head.next after taking the 2907 * We can safely modify head.next after taking the
2892 * anon_vma->root->rwsem. If some other vma in this mm shares 2908 * anon_vma->root->rwsem. If some other vma in this mm shares
@@ -2943,7 +2959,7 @@ static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping)
2943 * vma in this mm is backed by the same anon_vma or address_space. 2959 * vma in this mm is backed by the same anon_vma or address_space.
2944 * 2960 *
2945 * We can take all the locks in random order because the VM code 2961 * We can take all the locks in random order because the VM code
2946 * taking i_mmap_mutex or anon_vma->mutex outside the mmap_sem never 2962 * taking i_mmap_mutex or anon_vma->rwsem outside the mmap_sem never
2947 * takes more than one of them in a row. Secondly we're protected 2963 * takes more than one of them in a row. Secondly we're protected
2948 * against a concurrent mm_take_all_locks() by the mm_all_locks_mutex. 2964 * against a concurrent mm_take_all_locks() by the mm_all_locks_mutex.
2949 * 2965 *
@@ -3001,7 +3017,7 @@ static void vm_unlock_anon_vma(struct anon_vma *anon_vma)
3001 if (!__test_and_clear_bit(0, (unsigned long *) 3017 if (!__test_and_clear_bit(0, (unsigned long *)
3002 &anon_vma->root->rb_root.rb_node)) 3018 &anon_vma->root->rb_root.rb_node))
3003 BUG(); 3019 BUG();
3004 anon_vma_unlock(anon_vma); 3020 anon_vma_unlock_write(anon_vma);
3005 } 3021 }
3006} 3022}
3007 3023
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
index 8a5ac8c686b0..2175fb0d501c 100644
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -37,49 +37,51 @@ static struct srcu_struct srcu;
37void __mmu_notifier_release(struct mm_struct *mm) 37void __mmu_notifier_release(struct mm_struct *mm)
38{ 38{
39 struct mmu_notifier *mn; 39 struct mmu_notifier *mn;
40 struct hlist_node *n;
41 int id; 40 int id;
42 41
43 /* 42 /*
44 * SRCU here will block mmu_notifier_unregister until 43 * srcu_read_lock() here will block synchronize_srcu() in
45 * ->release returns. 44 * mmu_notifier_unregister() until all registered
45 * ->release() callouts this function makes have
46 * returned.
46 */ 47 */
47 id = srcu_read_lock(&srcu); 48 id = srcu_read_lock(&srcu);
48 hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist)
49 /*
50 * if ->release runs before mmu_notifier_unregister it
51 * must be handled as it's the only way for the driver
52 * to flush all existing sptes and stop the driver
53 * from establishing any more sptes before all the
54 * pages in the mm are freed.
55 */
56 if (mn->ops->release)
57 mn->ops->release(mn, mm);
58 srcu_read_unlock(&srcu, id);
59
60 spin_lock(&mm->mmu_notifier_mm->lock); 49 spin_lock(&mm->mmu_notifier_mm->lock);
61 while (unlikely(!hlist_empty(&mm->mmu_notifier_mm->list))) { 50 while (unlikely(!hlist_empty(&mm->mmu_notifier_mm->list))) {
62 mn = hlist_entry(mm->mmu_notifier_mm->list.first, 51 mn = hlist_entry(mm->mmu_notifier_mm->list.first,
63 struct mmu_notifier, 52 struct mmu_notifier,
64 hlist); 53 hlist);
54
65 /* 55 /*
66 * We arrived before mmu_notifier_unregister so 56 * Unlink. This will prevent mmu_notifier_unregister()
67 * mmu_notifier_unregister will do nothing other than 57 * from also making the ->release() callout.
68 * to wait ->release to finish and
69 * mmu_notifier_unregister to return.
70 */ 58 */
71 hlist_del_init_rcu(&mn->hlist); 59 hlist_del_init_rcu(&mn->hlist);
60 spin_unlock(&mm->mmu_notifier_mm->lock);
61
62 /*
63 * Clear sptes. (see 'release' description in mmu_notifier.h)
64 */
65 if (mn->ops->release)
66 mn->ops->release(mn, mm);
67
68 spin_lock(&mm->mmu_notifier_mm->lock);
72 } 69 }
73 spin_unlock(&mm->mmu_notifier_mm->lock); 70 spin_unlock(&mm->mmu_notifier_mm->lock);
74 71
75 /* 72 /*
76 * synchronize_srcu here prevents mmu_notifier_release to 73 * All callouts to ->release() which we have done are complete.
77 * return to exit_mmap (which would proceed freeing all pages 74 * Allow synchronize_srcu() in mmu_notifier_unregister() to complete
78 * in the mm) until the ->release method returns, if it was 75 */
79 * invoked by mmu_notifier_unregister. 76 srcu_read_unlock(&srcu, id);
80 * 77
81 * The mmu_notifier_mm can't go away from under us because one 78 /*
82 * mm_count is hold by exit_mmap. 79 * mmu_notifier_unregister() may have unlinked a notifier and may
80 * still be calling out to it. Additionally, other notifiers
81 * may have been active via vmtruncate() et. al. Block here
82 * to ensure that all notifier callouts for this mm have been
83 * completed and the sptes are really cleaned up before returning
84 * to exit_mmap().
83 */ 85 */
84 synchronize_srcu(&srcu); 86 synchronize_srcu(&srcu);
85} 87}
@@ -170,6 +172,7 @@ void __mmu_notifier_invalidate_range_start(struct mm_struct *mm,
170 } 172 }
171 srcu_read_unlock(&srcu, id); 173 srcu_read_unlock(&srcu, id);
172} 174}
175EXPORT_SYMBOL_GPL(__mmu_notifier_invalidate_range_start);
173 176
174void __mmu_notifier_invalidate_range_end(struct mm_struct *mm, 177void __mmu_notifier_invalidate_range_end(struct mm_struct *mm,
175 unsigned long start, unsigned long end) 178 unsigned long start, unsigned long end)
@@ -185,6 +188,7 @@ void __mmu_notifier_invalidate_range_end(struct mm_struct *mm,
185 } 188 }
186 srcu_read_unlock(&srcu, id); 189 srcu_read_unlock(&srcu, id);
187} 190}
191EXPORT_SYMBOL_GPL(__mmu_notifier_invalidate_range_end);
188 192
189static int do_mmu_notifier_register(struct mmu_notifier *mn, 193static int do_mmu_notifier_register(struct mmu_notifier *mn,
190 struct mm_struct *mm, 194 struct mm_struct *mm,
@@ -294,31 +298,31 @@ void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm)
294{ 298{
295 BUG_ON(atomic_read(&mm->mm_count) <= 0); 299 BUG_ON(atomic_read(&mm->mm_count) <= 0);
296 300
301 spin_lock(&mm->mmu_notifier_mm->lock);
297 if (!hlist_unhashed(&mn->hlist)) { 302 if (!hlist_unhashed(&mn->hlist)) {
298 /*
299 * SRCU here will force exit_mmap to wait ->release to finish
300 * before freeing the pages.
301 */
302 int id; 303 int id;
303 304
304 id = srcu_read_lock(&srcu);
305 /* 305 /*
306 * exit_mmap will block in mmu_notifier_release to 306 * Ensure we synchronize up with __mmu_notifier_release().
307 * guarantee ->release is called before freeing the
308 * pages.
309 */ 307 */
308 id = srcu_read_lock(&srcu);
309
310 hlist_del_rcu(&mn->hlist);
311 spin_unlock(&mm->mmu_notifier_mm->lock);
312
310 if (mn->ops->release) 313 if (mn->ops->release)
311 mn->ops->release(mn, mm); 314 mn->ops->release(mn, mm);
312 srcu_read_unlock(&srcu, id);
313 315
314 spin_lock(&mm->mmu_notifier_mm->lock); 316 /*
315 hlist_del_rcu(&mn->hlist); 317 * Allow __mmu_notifier_release() to complete.
318 */
319 srcu_read_unlock(&srcu, id);
320 } else
316 spin_unlock(&mm->mmu_notifier_mm->lock); 321 spin_unlock(&mm->mmu_notifier_mm->lock);
317 }
318 322
319 /* 323 /*
320 * Wait any running method to finish, of course including 324 * Wait for any running method to finish, including ->release() if it
321 * ->release if it was run by mmu_notifier_relase instead of us. 325 * was run by __mmu_notifier_release() instead of us.
322 */ 326 */
323 synchronize_srcu(&srcu); 327 synchronize_srcu(&srcu);
324 328
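
After this rework, a registered notifier's ->release() is called exactly once, either from __mmu_notifier_release() at exit_mmap() or from mmu_notifier_unregister(), because the hlist entry is unlinked under mmu_notifier_mm->lock before the callout is made. A minimal user sketch (all example_* names are hypothetical):

struct example_mn {
	struct mmu_notifier mn;
	/* driver state that must not be touched after ->release() */
};

static void example_release(struct mmu_notifier *mn, struct mm_struct *mm)
{
	/* stop establishing sptes and flush whatever is still mapped */
}

static const struct mmu_notifier_ops example_mn_ops = {
	.release = example_release,
};

static int example_register(struct example_mn *emn, struct mm_struct *mm)
{
	emn->mn.ops = &example_mn_ops;
	return mmu_notifier_register(&emn->mn, mm); /* takes mmap_sem itself */
}
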
diff --git a/mm/mmzone.c b/mm/mmzone.c
index 4596d81b89b1..2ac0afbd68f3 100644
--- a/mm/mmzone.c
+++ b/mm/mmzone.c
@@ -1,7 +1,7 @@
1/* 1/*
2 * linux/mm/mmzone.c 2 * linux/mm/mmzone.c
3 * 3 *
4 * management codes for pgdats and zones. 4 * management codes for pgdats, zones and page flags
5 */ 5 */
6 6
7 7
@@ -96,3 +96,21 @@ void lruvec_init(struct lruvec *lruvec)
96 for_each_lru(lru) 96 for_each_lru(lru)
97 INIT_LIST_HEAD(&lruvec->lists[lru]); 97 INIT_LIST_HEAD(&lruvec->lists[lru]);
98} 98}
99
100#if defined(CONFIG_NUMA_BALANCING) && !defined(LAST_NID_NOT_IN_PAGE_FLAGS)
101int page_nid_xchg_last(struct page *page, int nid)
102{
103 unsigned long old_flags, flags;
104 int last_nid;
105
106 do {
107 old_flags = flags = page->flags;
108 last_nid = page_nid_last(page);
109
110 flags &= ~(LAST_NID_MASK << LAST_NID_PGSHIFT);
111 flags |= (nid & LAST_NID_MASK) << LAST_NID_PGSHIFT;
112 } while (unlikely(cmpxchg(&page->flags, old_flags, flags) != old_flags));
113
114 return last_nid;
115}
116#endif
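
Usage sketch for the helper added above: when NUMA balancing copies a page to a new location, the last-NUMA-node hint in page->flags is carried over with the cmpxchg-based helper, exactly as alloc_misplaced_dst_page() does in the mm/migrate.c hunk earlier in this patch (the wrapper name below is hypothetical).

static void example_copy_last_nid(struct page *newpage, struct page *oldpage)
{
	/* the previous last-nid value of newpage is returned and ignored here */
	page_nid_xchg_last(newpage, page_nid_last(oldpage));
}
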
diff --git a/mm/mremap.c b/mm/mremap.c
index e1031e1f6a61..463a25705ac6 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -19,6 +19,7 @@
19#include <linux/security.h> 19#include <linux/security.h>
20#include <linux/syscalls.h> 20#include <linux/syscalls.h>
21#include <linux/mmu_notifier.h> 21#include <linux/mmu_notifier.h>
22#include <linux/sched/sysctl.h>
22 23
23#include <asm/uaccess.h> 24#include <asm/uaccess.h>
24#include <asm/cacheflush.h> 25#include <asm/cacheflush.h>
@@ -134,7 +135,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
134 pte_unmap(new_pte - 1); 135 pte_unmap(new_pte - 1);
135 pte_unmap_unlock(old_pte - 1, old_ptl); 136 pte_unmap_unlock(old_pte - 1, old_ptl);
136 if (anon_vma) 137 if (anon_vma)
137 anon_vma_unlock(anon_vma); 138 anon_vma_unlock_write(anon_vma);
138 if (mapping) 139 if (mapping)
139 mutex_unlock(&mapping->i_mmap_mutex); 140 mutex_unlock(&mapping->i_mmap_mutex);
140} 141}
@@ -208,7 +209,7 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
208 209
209static unsigned long move_vma(struct vm_area_struct *vma, 210static unsigned long move_vma(struct vm_area_struct *vma,
210 unsigned long old_addr, unsigned long old_len, 211 unsigned long old_addr, unsigned long old_len,
211 unsigned long new_len, unsigned long new_addr) 212 unsigned long new_len, unsigned long new_addr, bool *locked)
212{ 213{
213 struct mm_struct *mm = vma->vm_mm; 214 struct mm_struct *mm = vma->vm_mm;
214 struct vm_area_struct *new_vma; 215 struct vm_area_struct *new_vma;
@@ -299,9 +300,7 @@ static unsigned long move_vma(struct vm_area_struct *vma,
299 300
300 if (vm_flags & VM_LOCKED) { 301 if (vm_flags & VM_LOCKED) {
301 mm->locked_vm += new_len >> PAGE_SHIFT; 302 mm->locked_vm += new_len >> PAGE_SHIFT;
302 if (new_len > old_len) 303 *locked = true;
303 mlock_vma_pages_range(new_vma, new_addr + old_len,
304 new_addr + new_len);
305 } 304 }
306 305
307 return new_addr; 306 return new_addr;
@@ -366,9 +365,8 @@ Eagain:
366 return ERR_PTR(-EAGAIN); 365 return ERR_PTR(-EAGAIN);
367} 366}
368 367
369static unsigned long mremap_to(unsigned long addr, 368static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
370 unsigned long old_len, unsigned long new_addr, 369 unsigned long new_addr, unsigned long new_len, bool *locked)
371 unsigned long new_len)
372{ 370{
373 struct mm_struct *mm = current->mm; 371 struct mm_struct *mm = current->mm;
374 struct vm_area_struct *vma; 372 struct vm_area_struct *vma;
@@ -418,7 +416,7 @@ static unsigned long mremap_to(unsigned long addr,
418 if (ret & ~PAGE_MASK) 416 if (ret & ~PAGE_MASK)
419 goto out1; 417 goto out1;
420 418
421 ret = move_vma(vma, addr, old_len, new_len, new_addr); 419 ret = move_vma(vma, addr, old_len, new_len, new_addr, locked);
422 if (!(ret & ~PAGE_MASK)) 420 if (!(ret & ~PAGE_MASK))
423 goto out; 421 goto out;
424out1: 422out1:
@@ -456,6 +454,7 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
456 struct vm_area_struct *vma; 454 struct vm_area_struct *vma;
457 unsigned long ret = -EINVAL; 455 unsigned long ret = -EINVAL;
458 unsigned long charged = 0; 456 unsigned long charged = 0;
457 bool locked = false;
459 458
460 down_write(&current->mm->mmap_sem); 459 down_write(&current->mm->mmap_sem);
461 460
@@ -478,7 +477,8 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
478 477
479 if (flags & MREMAP_FIXED) { 478 if (flags & MREMAP_FIXED) {
480 if (flags & MREMAP_MAYMOVE) 479 if (flags & MREMAP_MAYMOVE)
481 ret = mremap_to(addr, old_len, new_addr, new_len); 480 ret = mremap_to(addr, old_len, new_addr, new_len,
481 &locked);
482 goto out; 482 goto out;
483 } 483 }
484 484
@@ -520,8 +520,8 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
520 vm_stat_account(mm, vma->vm_flags, vma->vm_file, pages); 520 vm_stat_account(mm, vma->vm_flags, vma->vm_file, pages);
521 if (vma->vm_flags & VM_LOCKED) { 521 if (vma->vm_flags & VM_LOCKED) {
522 mm->locked_vm += pages; 522 mm->locked_vm += pages;
523 mlock_vma_pages_range(vma, addr + old_len, 523 locked = true;
524 addr + new_len); 524 new_addr = addr;
525 } 525 }
526 ret = addr; 526 ret = addr;
527 goto out; 527 goto out;
@@ -547,11 +547,13 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
547 goto out; 547 goto out;
548 } 548 }
549 549
550 ret = move_vma(vma, addr, old_len, new_len, new_addr); 550 ret = move_vma(vma, addr, old_len, new_len, new_addr, &locked);
551 } 551 }
552out: 552out:
553 if (ret & ~PAGE_MASK) 553 if (ret & ~PAGE_MASK)
554 vm_unacct_memory(charged); 554 vm_unacct_memory(charged);
555 up_write(&current->mm->mmap_sem); 555 up_write(&current->mm->mmap_sem);
556 if (locked && new_len > old_len)
557 mm_populate(new_addr + old_len, new_len - old_len);
556 return ret; 558 return ret;
557} 559}
diff --git a/mm/nobootmem.c b/mm/nobootmem.c
index b8294fc03df8..5e07d36e381e 100644
--- a/mm/nobootmem.c
+++ b/mm/nobootmem.c
@@ -154,21 +154,6 @@ static void reset_node_lowmem_managed_pages(pg_data_t *pgdat)
154} 154}
155 155
156/** 156/**
157 * free_all_bootmem_node - release a node's free pages to the buddy allocator
158 * @pgdat: node to be released
159 *
160 * Returns the number of pages actually released.
161 */
162unsigned long __init free_all_bootmem_node(pg_data_t *pgdat)
163{
164 register_page_bootmem_info_node(pgdat);
165 reset_node_lowmem_managed_pages(pgdat);
166
167 /* free_low_memory_core_early(MAX_NUMNODES) will be called later */
168 return 0;
169}
170
171/**
172 * free_all_bootmem - release free pages to the buddy allocator 157 * free_all_bootmem - release free pages to the buddy allocator
173 * 158 *
174 * Returns the number of pages actually released. 159 * Returns the number of pages actually released.
@@ -406,6 +391,14 @@ void * __init __alloc_bootmem_low(unsigned long size, unsigned long align,
406 return ___alloc_bootmem(size, align, goal, ARCH_LOW_ADDRESS_LIMIT); 391 return ___alloc_bootmem(size, align, goal, ARCH_LOW_ADDRESS_LIMIT);
407} 392}
408 393
394void * __init __alloc_bootmem_low_nopanic(unsigned long size,
395 unsigned long align,
396 unsigned long goal)
397{
398 return ___alloc_bootmem_nopanic(size, align, goal,
399 ARCH_LOW_ADDRESS_LIMIT);
400}
401
409/** 402/**
410 * __alloc_bootmem_low_node - allocate low boot memory from a specific node 403 * __alloc_bootmem_low_node - allocate low boot memory from a specific node
411 * @pgdat: node to allocate from 404 * @pgdat: node to allocate from
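
The new __alloc_bootmem_low_nopanic() behaves like __alloc_bootmem_low() but returns NULL on failure instead of panicking. A sketch of an early-boot caller (the helper name and its use are assumptions):

static void * __init example_reserve_low(unsigned long size)
{
	/* low memory (below ARCH_LOW_ADDRESS_LIMIT), page aligned, no goal */
	return __alloc_bootmem_low_nopanic(size, PAGE_SIZE, 0);
}
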
diff --git a/mm/nommu.c b/mm/nommu.c
index 79c3cac87afa..da0d210fd403 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -29,6 +29,7 @@
29#include <linux/security.h> 29#include <linux/security.h>
30#include <linux/syscalls.h> 30#include <linux/syscalls.h>
31#include <linux/audit.h> 31#include <linux/audit.h>
32#include <linux/sched/sysctl.h>
32 33
33#include <asm/uaccess.h> 34#include <asm/uaccess.h>
34#include <asm/tlb.h> 35#include <asm/tlb.h>
@@ -139,10 +140,10 @@ unsigned int kobjsize(const void *objp)
139 return PAGE_SIZE << compound_order(page); 140 return PAGE_SIZE << compound_order(page);
140} 141}
141 142
142int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, 143long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
143 unsigned long start, int nr_pages, unsigned int foll_flags, 144 unsigned long start, unsigned long nr_pages,
144 struct page **pages, struct vm_area_struct **vmas, 145 unsigned int foll_flags, struct page **pages,
145 int *retry) 146 struct vm_area_struct **vmas, int *nonblocking)
146{ 147{
147 struct vm_area_struct *vma; 148 struct vm_area_struct *vma;
148 unsigned long vm_flags; 149 unsigned long vm_flags;
@@ -189,9 +190,10 @@ finish_or_fault:
189 * slab page or a secondary page from a compound page 190 * slab page or a secondary page from a compound page
190 * - don't permit access to VMAs that don't support it, such as I/O mappings 191 * - don't permit access to VMAs that don't support it, such as I/O mappings
191 */ 192 */
192int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, 193long get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
193 unsigned long start, int nr_pages, int write, int force, 194 unsigned long start, unsigned long nr_pages,
194 struct page **pages, struct vm_area_struct **vmas) 195 int write, int force, struct page **pages,
196 struct vm_area_struct **vmas)
195{ 197{
196 int flags = 0; 198 int flags = 0;
197 199
@@ -1249,7 +1251,8 @@ unsigned long do_mmap_pgoff(struct file *file,
1249 unsigned long len, 1251 unsigned long len,
1250 unsigned long prot, 1252 unsigned long prot,
1251 unsigned long flags, 1253 unsigned long flags,
1252 unsigned long pgoff) 1254 unsigned long pgoff,
1255 unsigned long *populate)
1253{ 1256{
1254 struct vm_area_struct *vma; 1257 struct vm_area_struct *vma;
1255 struct vm_region *region; 1258 struct vm_region *region;
@@ -1259,6 +1262,8 @@ unsigned long do_mmap_pgoff(struct file *file,
1259 1262
1260 kenter(",%lx,%lx,%lx,%lx,%lx", addr, len, prot, flags, pgoff); 1263 kenter(",%lx,%lx,%lx,%lx,%lx", addr, len, prot, flags, pgoff);
1261 1264
1265 *populate = 0;
1266
1262 /* decide whether we should attempt the mapping, and if so what sort of 1267 /* decide whether we should attempt the mapping, and if so what sort of
1263 * mapping */ 1268 * mapping */
1264 ret = validate_mmap_request(file, addr, len, prot, flags, pgoff, 1269 ret = validate_mmap_request(file, addr, len, prot, flags, pgoff,
@@ -1814,9 +1819,11 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
1814 return ret; 1819 return ret;
1815} 1820}
1816 1821
1817struct page *follow_page(struct vm_area_struct *vma, unsigned long address, 1822struct page *follow_page_mask(struct vm_area_struct *vma,
1818 unsigned int foll_flags) 1823 unsigned long address, unsigned int flags,
1824 unsigned int *page_mask)
1819{ 1825{
1826 *page_mask = 0;
1820 return NULL; 1827 return NULL;
1821} 1828}
1822 1829
@@ -1903,7 +1910,7 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
1903 */ 1910 */
1904 free -= global_page_state(NR_SHMEM); 1911 free -= global_page_state(NR_SHMEM);
1905 1912
1906 free += nr_swap_pages; 1913 free += get_nr_swap_pages();
1907 1914
1908 /* 1915 /*
1909 * Any slabs which are created with the 1916 * Any slabs which are created with the
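
get_user_pages() and __get_user_pages() now take and return long counts. A pinning sketch under the widened signature; example_pin_pages() is hypothetical, and handling of a partial pin is left to the caller.

static long example_pin_pages(unsigned long start, unsigned long nr_pages,
			      struct page **pages)
{
	long pinned;

	down_read(&current->mm->mmap_sem);
	pinned = get_user_pages(current, current->mm, start, nr_pages,
				1 /* write */, 0 /* force */, pages, NULL);
	up_read(&current->mm->mmap_sem);

	return pinned;	/* number of pages actually pinned, or -errno */
}
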
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 0399f146ae49..79e451a78c9e 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -386,8 +386,10 @@ static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order,
386 cpuset_print_task_mems_allowed(current); 386 cpuset_print_task_mems_allowed(current);
387 task_unlock(current); 387 task_unlock(current);
388 dump_stack(); 388 dump_stack();
389 mem_cgroup_print_oom_info(memcg, p); 389 if (memcg)
390 show_mem(SHOW_MEM_FILTER_NODES); 390 mem_cgroup_print_oom_info(memcg, p);
391 else
392 show_mem(SHOW_MEM_FILTER_NODES);
391 if (sysctl_oom_dump_tasks) 393 if (sysctl_oom_dump_tasks)
392 dump_tasks(memcg, nodemask); 394 dump_tasks(memcg, nodemask);
393} 395}
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 0713bfbf0954..cdc377c456c0 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -35,6 +35,7 @@
35#include <linux/buffer_head.h> /* __set_page_dirty_buffers */ 35#include <linux/buffer_head.h> /* __set_page_dirty_buffers */
36#include <linux/pagevec.h> 36#include <linux/pagevec.h>
37#include <linux/timer.h> 37#include <linux/timer.h>
38#include <linux/sched/rt.h>
38#include <trace/events/writeback.h> 39#include <trace/events/writeback.h>
39 40
40/* 41/*
@@ -240,6 +241,9 @@ static unsigned long global_dirtyable_memory(void)
240 if (!vm_highmem_is_dirtyable) 241 if (!vm_highmem_is_dirtyable)
241 x -= highmem_dirtyable_memory(x); 242 x -= highmem_dirtyable_memory(x);
242 243
244 /* Subtract min_free_kbytes */
245 x -= min_t(unsigned long, x, min_free_kbytes >> (PAGE_SHIFT - 10));
246
243 return x + 1; /* Ensure that we never return 0 */ 247 return x + 1; /* Ensure that we never return 0 */
244} 248}
245 249
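
A worked example of the subtraction added above, with assumed numbers: min_free_kbytes is in KiB, so shifting right by PAGE_SHIFT - 10 (= 2 for 4 KiB pages) converts it to pages before it is removed from the dirtyable total.

static unsigned long example_dirtyable(unsigned long x_pages,
				       unsigned long min_free_kb)
{
	/* e.g. x_pages = 1000000, min_free_kb = 65536 -> 1000000 - 16384 */
	x_pages -= min_t(unsigned long, x_pages,
			 min_free_kb >> (PAGE_SHIFT - 10));
	return x_pages + 1;	/* never return 0, as in the patched helper */
}
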
@@ -2289,3 +2293,27 @@ int mapping_tagged(struct address_space *mapping, int tag)
2289 return radix_tree_tagged(&mapping->page_tree, tag); 2293 return radix_tree_tagged(&mapping->page_tree, tag);
2290} 2294}
2291EXPORT_SYMBOL(mapping_tagged); 2295EXPORT_SYMBOL(mapping_tagged);
2296
2297/**
2298 * wait_for_stable_page() - wait for writeback to finish, if necessary.
2299 * @page: The page to wait on.
2300 *
2301 * This function determines if the given page is related to a backing device
2302 * that requires page contents to be held stable during writeback. If so, then
2303 * it will wait for any pending writeback to complete.
2304 */
2305void wait_for_stable_page(struct page *page)
2306{
2307 struct address_space *mapping = page_mapping(page);
2308 struct backing_dev_info *bdi = mapping->backing_dev_info;
2309
2310 if (!bdi_cap_stable_pages_required(bdi))
2311 return;
2312#ifdef CONFIG_NEED_BOUNCE_POOL
2313 if (mapping->host->i_sb->s_flags & MS_SNAP_STABLE)
2314 return;
2315#endif /* CONFIG_NEED_BOUNCE_POOL */
2316
2317 wait_on_page_writeback(page);
2318}
2319EXPORT_SYMBOL_GPL(wait_for_stable_page);
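
Typical consumer sketch for the new helper: a filesystem's ->page_mkwrite() blocks on in-flight writeback only when the backing device actually requires stable pages. example_page_mkwrite() is hypothetical and elides the usual block allocation and timestamp handling.

static int example_page_mkwrite(struct vm_area_struct *vma,
				struct vm_fault *vmf)
{
	struct page *page = vmf->page;

	lock_page(page);
	if (page->mapping != vma->vm_file->f_mapping) {
		unlock_page(page);
		return VM_FAULT_NOPAGE;	/* page was truncated underneath us */
	}

	/* block here only if the bdi needs page contents held stable */
	wait_for_stable_page(page);

	return VM_FAULT_LOCKED;
}
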
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 4ba5e37127fc..0dade3f18f7d 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -58,6 +58,7 @@
58#include <linux/prefetch.h> 58#include <linux/prefetch.h>
59#include <linux/migrate.h> 59#include <linux/migrate.h>
60#include <linux/page-debug-flags.h> 60#include <linux/page-debug-flags.h>
61#include <linux/sched/rt.h>
61 62
62#include <asm/tlbflush.h> 63#include <asm/tlbflush.h>
63#include <asm/div64.h> 64#include <asm/div64.h>
@@ -201,11 +202,18 @@ static unsigned long __meminitdata nr_all_pages;
201static unsigned long __meminitdata dma_reserve; 202static unsigned long __meminitdata dma_reserve;
202 203
203#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP 204#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
205/* Movable memory ranges, will also be used by memblock subsystem. */
206struct movablemem_map movablemem_map = {
207 .acpi = false,
208 .nr_map = 0,
209};
210
204static unsigned long __meminitdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES]; 211static unsigned long __meminitdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES];
205static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES]; 212static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES];
206static unsigned long __initdata required_kernelcore; 213static unsigned long __initdata required_kernelcore;
207static unsigned long __initdata required_movablecore; 214static unsigned long __initdata required_movablecore;
208static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES]; 215static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES];
216static unsigned long __meminitdata zone_movable_limit[MAX_NUMNODES];
209 217
210/* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */ 218/* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */
211int movable_zone; 219int movable_zone;
@@ -221,11 +229,6 @@ EXPORT_SYMBOL(nr_online_nodes);
221 229
222int page_group_by_mobility_disabled __read_mostly; 230int page_group_by_mobility_disabled __read_mostly;
223 231
224/*
225 * NOTE:
226 * Don't use set_pageblock_migratetype(page, MIGRATE_ISOLATE) directly.
227 * Instead, use {un}set_pageblock_isolate.
228 */
229void set_pageblock_migratetype(struct page *page, int migratetype) 232void set_pageblock_migratetype(struct page *page, int migratetype)
230{ 233{
231 234
@@ -244,15 +247,20 @@ static int page_outside_zone_boundaries(struct zone *zone, struct page *page)
244 int ret = 0; 247 int ret = 0;
245 unsigned seq; 248 unsigned seq;
246 unsigned long pfn = page_to_pfn(page); 249 unsigned long pfn = page_to_pfn(page);
250 unsigned long sp, start_pfn;
247 251
248 do { 252 do {
249 seq = zone_span_seqbegin(zone); 253 seq = zone_span_seqbegin(zone);
250 if (pfn >= zone->zone_start_pfn + zone->spanned_pages) 254 start_pfn = zone->zone_start_pfn;
251 ret = 1; 255 sp = zone->spanned_pages;
252 else if (pfn < zone->zone_start_pfn) 256 if (!zone_spans_pfn(zone, pfn))
253 ret = 1; 257 ret = 1;
254 } while (zone_span_seqretry(zone, seq)); 258 } while (zone_span_seqretry(zone, seq));
255 259
260 if (ret)
261 pr_err("page %lu outside zone [ %lu - %lu ]\n",
262 pfn, start_pfn, start_pfn + sp);
263
256 return ret; 264 return ret;
257} 265}
258 266
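
The hunk above replaces the open-coded boundary test with zone_spans_pfn() and snapshots the span inside a seqcount retry loop so the new diagnostic prints a consistent range. A standalone sketch of that read-retry pattern, single-threaded and with illustrative names (zone_t, span_seq_begin/span_seq_retry are not the kernel's helpers):

#include <stdbool.h>
#include <stdio.h>

/* Illustrative stand-in for the zone span seqcount. */
struct zone_t {
	unsigned seq;              /* bumped (odd while updating) by the writer */
	unsigned long start_pfn, spanned;
};

static unsigned span_seq_begin(const struct zone_t *z) { return z->seq; }
static bool span_seq_retry(const struct zone_t *z, unsigned seq)
{
	return z->seq != seq || (seq & 1);
}

static bool zone_spans_pfn(const struct zone_t *z, unsigned long pfn)
{
	return pfn >= z->start_pfn && pfn < z->start_pfn + z->spanned;
}

static bool page_outside_zone(const struct zone_t *z, unsigned long pfn)
{
	unsigned long sp, start;
	unsigned seq;
	bool outside;

	do {                       /* re-read if the span changed under us */
		seq = span_seq_begin(z);
		start = z->start_pfn;
		sp = z->spanned;
		outside = !zone_spans_pfn(z, pfn);
	} while (span_seq_retry(z, seq));

	if (outside)
		fprintf(stderr, "pfn %lu outside zone [ %lu - %lu ]\n",
			pfn, start, start + sp);
	return outside;
}

int main(void)
{
	struct zone_t z = { .seq = 0, .start_pfn = 4096, .spanned = 1024 };

	page_outside_zone(&z, 100);    /* prints the diagnostic */
	page_outside_zone(&z, 4500);   /* silently in range */
	return 0;
}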
@@ -292,7 +300,7 @@ static void bad_page(struct page *page)
292 300
293 /* Don't complain about poisoned pages */ 301 /* Don't complain about poisoned pages */
294 if (PageHWPoison(page)) { 302 if (PageHWPoison(page)) {
295 reset_page_mapcount(page); /* remove PageBuddy */ 303 page_mapcount_reset(page); /* remove PageBuddy */
296 return; 304 return;
297 } 305 }
298 306
@@ -324,8 +332,8 @@ static void bad_page(struct page *page)
324 dump_stack(); 332 dump_stack();
325out: 333out:
326 /* Leave bad fields for debug, except PageBuddy could make trouble */ 334 /* Leave bad fields for debug, except PageBuddy could make trouble */
327 reset_page_mapcount(page); /* remove PageBuddy */ 335 page_mapcount_reset(page); /* remove PageBuddy */
328 add_taint(TAINT_BAD_PAGE); 336 add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
329} 337}
330 338
331/* 339/*
@@ -537,6 +545,8 @@ static inline void __free_one_page(struct page *page,
537 unsigned long uninitialized_var(buddy_idx); 545 unsigned long uninitialized_var(buddy_idx);
538 struct page *buddy; 546 struct page *buddy;
539 547
548 VM_BUG_ON(!zone_is_initialized(zone));
549
540 if (unlikely(PageCompound(page))) 550 if (unlikely(PageCompound(page)))
541 if (unlikely(destroy_compound_page(page, order))) 551 if (unlikely(destroy_compound_page(page, order)))
542 return; 552 return;
@@ -610,7 +620,7 @@ static inline int free_pages_check(struct page *page)
610 bad_page(page); 620 bad_page(page);
611 return 1; 621 return 1;
612 } 622 }
613 reset_page_last_nid(page); 623 page_nid_reset_last(page);
614 if (page->flags & PAGE_FLAGS_CHECK_AT_PREP) 624 if (page->flags & PAGE_FLAGS_CHECK_AT_PREP)
615 page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; 625 page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
616 return 0; 626 return 0;
@@ -670,7 +680,7 @@ static void free_pcppages_bulk(struct zone *zone, int count,
670 /* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */ 680 /* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */
671 __free_one_page(page, zone, 0, mt); 681 __free_one_page(page, zone, 0, mt);
672 trace_mm_page_pcpu_drain(page, 0, mt); 682 trace_mm_page_pcpu_drain(page, 0, mt);
673 if (likely(get_pageblock_migratetype(page) != MIGRATE_ISOLATE)) { 683 if (likely(!is_migrate_isolate_page(page))) {
674 __mod_zone_page_state(zone, NR_FREE_PAGES, 1); 684 __mod_zone_page_state(zone, NR_FREE_PAGES, 1);
675 if (is_migrate_cma(mt)) 685 if (is_migrate_cma(mt))
676 __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, 1); 686 __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, 1);
@@ -688,7 +698,7 @@ static void free_one_page(struct zone *zone, struct page *page, int order,
688 zone->pages_scanned = 0; 698 zone->pages_scanned = 0;
689 699
690 __free_one_page(page, zone, order, migratetype); 700 __free_one_page(page, zone, order, migratetype);
691 if (unlikely(migratetype != MIGRATE_ISOLATE)) 701 if (unlikely(!is_migrate_isolate(migratetype)))
692 __mod_zone_freepage_state(zone, 1 << order, migratetype); 702 __mod_zone_freepage_state(zone, 1 << order, migratetype);
693 spin_unlock(&zone->lock); 703 spin_unlock(&zone->lock);
694} 704}
@@ -778,6 +788,10 @@ void __init init_cma_reserved_pageblock(struct page *page)
778 set_pageblock_migratetype(page, MIGRATE_CMA); 788 set_pageblock_migratetype(page, MIGRATE_CMA);
779 __free_pages(page, pageblock_order); 789 __free_pages(page, pageblock_order);
780 totalram_pages += pageblock_nr_pages; 790 totalram_pages += pageblock_nr_pages;
791#ifdef CONFIG_HIGHMEM
792 if (PageHighMem(page))
793 totalhigh_pages += pageblock_nr_pages;
794#endif
781} 795}
782#endif 796#endif
783 797
@@ -916,7 +930,9 @@ static int fallbacks[MIGRATE_TYPES][4] = {
916 [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE }, 930 [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE },
917#endif 931#endif
918 [MIGRATE_RESERVE] = { MIGRATE_RESERVE }, /* Never used */ 932 [MIGRATE_RESERVE] = { MIGRATE_RESERVE }, /* Never used */
933#ifdef CONFIG_MEMORY_ISOLATION
919 [MIGRATE_ISOLATE] = { MIGRATE_RESERVE }, /* Never used */ 934 [MIGRATE_ISOLATE] = { MIGRATE_RESERVE }, /* Never used */
935#endif
920}; 936};
921 937
922/* 938/*
@@ -981,9 +997,9 @@ int move_freepages_block(struct zone *zone, struct page *page,
981 end_pfn = start_pfn + pageblock_nr_pages - 1; 997 end_pfn = start_pfn + pageblock_nr_pages - 1;
982 998
983 /* Do not cross zone boundaries */ 999 /* Do not cross zone boundaries */
984 if (start_pfn < zone->zone_start_pfn) 1000 if (!zone_spans_pfn(zone, start_pfn))
985 start_page = page; 1001 start_page = page;
986 if (end_pfn >= zone->zone_start_pfn + zone->spanned_pages) 1002 if (!zone_spans_pfn(zone, end_pfn))
987 return 0; 1003 return 0;
988 1004
989 return move_freepages(zone, start_page, end_page, migratetype); 1005 return move_freepages(zone, start_page, end_page, migratetype);
@@ -1142,7 +1158,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
1142 list_add_tail(&page->lru, list); 1158 list_add_tail(&page->lru, list);
1143 if (IS_ENABLED(CONFIG_CMA)) { 1159 if (IS_ENABLED(CONFIG_CMA)) {
1144 mt = get_pageblock_migratetype(page); 1160 mt = get_pageblock_migratetype(page);
1145 if (!is_migrate_cma(mt) && mt != MIGRATE_ISOLATE) 1161 if (!is_migrate_cma(mt) && !is_migrate_isolate(mt))
1146 mt = migratetype; 1162 mt = migratetype;
1147 } 1163 }
1148 set_freepage_migratetype(page, mt); 1164 set_freepage_migratetype(page, mt);
@@ -1277,7 +1293,7 @@ void mark_free_pages(struct zone *zone)
1277 1293
1278 spin_lock_irqsave(&zone->lock, flags); 1294 spin_lock_irqsave(&zone->lock, flags);
1279 1295
1280 max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; 1296 max_zone_pfn = zone_end_pfn(zone);
1281 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) 1297 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
1282 if (pfn_valid(pfn)) { 1298 if (pfn_valid(pfn)) {
1283 struct page *page = pfn_to_page(pfn); 1299 struct page *page = pfn_to_page(pfn);
@@ -1326,7 +1342,7 @@ void free_hot_cold_page(struct page *page, int cold)
1326 * excessively into the page allocator 1342 * excessively into the page allocator
1327 */ 1343 */
1328 if (migratetype >= MIGRATE_PCPTYPES) { 1344 if (migratetype >= MIGRATE_PCPTYPES) {
1329 if (unlikely(migratetype == MIGRATE_ISOLATE)) { 1345 if (unlikely(is_migrate_isolate(migratetype))) {
1330 free_one_page(zone, page, 0, migratetype); 1346 free_one_page(zone, page, 0, migratetype);
1331 goto out; 1347 goto out;
1332 } 1348 }
@@ -1389,14 +1405,8 @@ void split_page(struct page *page, unsigned int order)
1389 set_page_refcounted(page + i); 1405 set_page_refcounted(page + i);
1390} 1406}
1391 1407
1392/* 1408static int __isolate_free_page(struct page *page, unsigned int order)
1393 * Similar to the split_page family of functions except that the page
1394 * required at the given order and being isolated now to prevent races
1395 * with parallel allocators
1396 */
1397int capture_free_page(struct page *page, int alloc_order, int migratetype)
1398{ 1409{
1399 unsigned int order;
1400 unsigned long watermark; 1410 unsigned long watermark;
1401 struct zone *zone; 1411 struct zone *zone;
1402 int mt; 1412 int mt;
@@ -1404,16 +1414,15 @@ int capture_free_page(struct page *page, int alloc_order, int migratetype)
1404 BUG_ON(!PageBuddy(page)); 1414 BUG_ON(!PageBuddy(page));
1405 1415
1406 zone = page_zone(page); 1416 zone = page_zone(page);
1407 order = page_order(page);
1408 mt = get_pageblock_migratetype(page); 1417 mt = get_pageblock_migratetype(page);
1409 1418
1410 if (mt != MIGRATE_ISOLATE) { 1419 if (!is_migrate_isolate(mt)) {
1411 /* Obey watermarks as if the page was being allocated */ 1420 /* Obey watermarks as if the page was being allocated */
1412 watermark = low_wmark_pages(zone) + (1 << order); 1421 watermark = low_wmark_pages(zone) + (1 << order);
1413 if (!zone_watermark_ok(zone, 0, watermark, 0, 0)) 1422 if (!zone_watermark_ok(zone, 0, watermark, 0, 0))
1414 return 0; 1423 return 0;
1415 1424
1416 __mod_zone_freepage_state(zone, -(1UL << alloc_order), mt); 1425 __mod_zone_freepage_state(zone, -(1UL << order), mt);
1417 } 1426 }
1418 1427
1419 /* Remove page from free list */ 1428 /* Remove page from free list */
@@ -1421,22 +1430,18 @@ int capture_free_page(struct page *page, int alloc_order, int migratetype)
1421 zone->free_area[order].nr_free--; 1430 zone->free_area[order].nr_free--;
1422 rmv_page_order(page); 1431 rmv_page_order(page);
1423 1432
1424 if (alloc_order != order) 1433 /* Set the pageblock if the isolated page is at least a pageblock */
1425 expand(zone, page, alloc_order, order,
1426 &zone->free_area[order], migratetype);
1427
1428 /* Set the pageblock if the captured page is at least a pageblock */
1429 if (order >= pageblock_order - 1) { 1434 if (order >= pageblock_order - 1) {
1430 struct page *endpage = page + (1 << order) - 1; 1435 struct page *endpage = page + (1 << order) - 1;
1431 for (; page < endpage; page += pageblock_nr_pages) { 1436 for (; page < endpage; page += pageblock_nr_pages) {
1432 int mt = get_pageblock_migratetype(page); 1437 int mt = get_pageblock_migratetype(page);
1433 if (mt != MIGRATE_ISOLATE && !is_migrate_cma(mt)) 1438 if (!is_migrate_isolate(mt) && !is_migrate_cma(mt))
1434 set_pageblock_migratetype(page, 1439 set_pageblock_migratetype(page,
1435 MIGRATE_MOVABLE); 1440 MIGRATE_MOVABLE);
1436 } 1441 }
1437 } 1442 }
1438 1443
1439 return 1UL << alloc_order; 1444 return 1UL << order;
1440} 1445}
1441 1446
1442/* 1447/*
@@ -1454,10 +1459,9 @@ int split_free_page(struct page *page)
1454 unsigned int order; 1459 unsigned int order;
1455 int nr_pages; 1460 int nr_pages;
1456 1461
1457 BUG_ON(!PageBuddy(page));
1458 order = page_order(page); 1462 order = page_order(page);
1459 1463
1460 nr_pages = capture_free_page(page, order, 0); 1464 nr_pages = __isolate_free_page(page, order);
1461 if (!nr_pages) 1465 if (!nr_pages)
1462 return 0; 1466 return 0;
1463 1467
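
The watermark test kept in __isolate_free_page() above is plain arithmetic: treat the isolation like an allocation of 2^order pages and refuse it if the zone would drop below its low watermark plus that allocation. A hedged, self-contained illustration; the real zone_watermark_ok() also discounts higher-order free lists, and the names free_pages, low_wmark and may_isolate are invented for the sketch.

#include <stdbool.h>
#include <stdio.h>

/* Would isolating a 2^order block leave the zone above its low watermark? */
static bool may_isolate(unsigned long free_pages, unsigned long low_wmark,
			unsigned int order)
{
	unsigned long request = 1UL << order;
	/* Obey watermarks as if the page was being allocated. */
	unsigned long watermark = low_wmark + request;

	return free_pages >= watermark;
}

int main(void)
{
	printf("%d\n", may_isolate(10000, 512, 9)); /* 1: room for a 512-page block */
	printf("%d\n", may_isolate(900, 512, 9));   /* 0: would dip below the watermark */
	return 0;
}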
@@ -1655,20 +1659,6 @@ static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
1655 return true; 1659 return true;
1656} 1660}
1657 1661
1658#ifdef CONFIG_MEMORY_ISOLATION
1659static inline unsigned long nr_zone_isolate_freepages(struct zone *zone)
1660{
1661 if (unlikely(zone->nr_pageblock_isolate))
1662 return zone->nr_pageblock_isolate * pageblock_nr_pages;
1663 return 0;
1664}
1665#else
1666static inline unsigned long nr_zone_isolate_freepages(struct zone *zone)
1667{
1668 return 0;
1669}
1670#endif
1671
1672bool zone_watermark_ok(struct zone *z, int order, unsigned long mark, 1662bool zone_watermark_ok(struct zone *z, int order, unsigned long mark,
1673 int classzone_idx, int alloc_flags) 1663 int classzone_idx, int alloc_flags)
1674{ 1664{
@@ -1684,14 +1674,6 @@ bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark,
1684 if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark) 1674 if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark)
1685 free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES); 1675 free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES);
1686 1676
1687 /*
1688 * If the zone has MIGRATE_ISOLATE type free pages, we should consider
1689 * it. nr_zone_isolate_freepages is never accurate so kswapd might not
1690 * sleep although it could do so. But this is more desirable for memory
1691 * hotplug than sleeping which can cause a livelock in the direct
1692 * reclaim path.
1693 */
1694 free_pages -= nr_zone_isolate_freepages(z);
1695 return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags, 1677 return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
1696 free_pages); 1678 free_pages);
1697} 1679}
@@ -2163,8 +2145,6 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
2163 bool *contended_compaction, bool *deferred_compaction, 2145 bool *contended_compaction, bool *deferred_compaction,
2164 unsigned long *did_some_progress) 2146 unsigned long *did_some_progress)
2165{ 2147{
2166 struct page *page = NULL;
2167
2168 if (!order) 2148 if (!order)
2169 return NULL; 2149 return NULL;
2170 2150
@@ -2176,16 +2156,12 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
2176 current->flags |= PF_MEMALLOC; 2156 current->flags |= PF_MEMALLOC;
2177 *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask, 2157 *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask,
2178 nodemask, sync_migration, 2158 nodemask, sync_migration,
2179 contended_compaction, &page); 2159 contended_compaction);
2180 current->flags &= ~PF_MEMALLOC; 2160 current->flags &= ~PF_MEMALLOC;
2181 2161
2182 /* If compaction captured a page, prep and use it */
2183 if (page) {
2184 prep_new_page(page, order, gfp_mask);
2185 goto got_page;
2186 }
2187
2188 if (*did_some_progress != COMPACT_SKIPPED) { 2162 if (*did_some_progress != COMPACT_SKIPPED) {
2163 struct page *page;
2164
2189 /* Page migration frees to the PCP lists but we want merging */ 2165 /* Page migration frees to the PCP lists but we want merging */
2190 drain_pages(get_cpu()); 2166 drain_pages(get_cpu());
2191 put_cpu(); 2167 put_cpu();
@@ -2195,7 +2171,6 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
2195 alloc_flags & ~ALLOC_NO_WATERMARKS, 2171 alloc_flags & ~ALLOC_NO_WATERMARKS,
2196 preferred_zone, migratetype); 2172 preferred_zone, migratetype);
2197 if (page) { 2173 if (page) {
2198got_page:
2199 preferred_zone->compact_blockskip_flush = false; 2174 preferred_zone->compact_blockskip_flush = false;
2200 preferred_zone->compact_considered = 0; 2175 preferred_zone->compact_considered = 0;
2201 preferred_zone->compact_defer_shift = 0; 2176 preferred_zone->compact_defer_shift = 0;
@@ -2656,10 +2631,17 @@ retry_cpuset:
2656 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, 2631 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
2657 zonelist, high_zoneidx, alloc_flags, 2632 zonelist, high_zoneidx, alloc_flags,
2658 preferred_zone, migratetype); 2633 preferred_zone, migratetype);
2659 if (unlikely(!page)) 2634 if (unlikely(!page)) {
2635 /*
2636 * Runtime PM, block IO and its error handling path
2637 * can deadlock because I/O on the device might not
2638 * complete.
2639 */
2640 gfp_mask = memalloc_noio_flags(gfp_mask);
2660 page = __alloc_pages_slowpath(gfp_mask, order, 2641 page = __alloc_pages_slowpath(gfp_mask, order,
2661 zonelist, high_zoneidx, nodemask, 2642 zonelist, high_zoneidx, nodemask,
2662 preferred_zone, migratetype); 2643 preferred_zone, migratetype);
2644 }
2663 2645
2664 trace_mm_page_alloc(page, order, gfp_mask, migratetype); 2646 trace_mm_page_alloc(page, order, gfp_mask, migratetype);
2665 2647
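
The new branch above strips I/O-related GFP flags before entering the slow path so that runtime PM and block-layer error handling cannot end up waiting on I/O that will never complete. The helper is roughly a conditional mask; a userspace sketch of that idea follows, with made-up flag values and a task_noio parameter standing in for the PF_MEMALLOC_NOIO check (none of these constants are the kernel's exact definitions).

#include <stdio.h>

/* Illustrative flag bits; the kernel's __GFP_* values differ. */
#define GFP_IO   (1u << 0)
#define GFP_FS   (1u << 1)
#define GFP_WAIT (1u << 2)

/* Roughly what memalloc_noio_flags() is expected to do when the task has
 * marked itself as "no I/O allowed from reclaim". */
static unsigned int memalloc_noio_mask(unsigned int gfp, int task_noio)
{
	if (task_noio)
		gfp &= ~(GFP_IO | GFP_FS); /* reclaim may not start or wait on I/O */
	return gfp;
}

int main(void)
{
	unsigned int gfp = GFP_IO | GFP_FS | GFP_WAIT;

	printf("normal task:  %#x\n", memalloc_noio_mask(gfp, 0));
	printf("noio context: %#x\n", memalloc_noio_mask(gfp, 1));
	return 0;
}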
@@ -2831,18 +2813,27 @@ void free_pages_exact(void *virt, size_t size)
2831} 2813}
2832EXPORT_SYMBOL(free_pages_exact); 2814EXPORT_SYMBOL(free_pages_exact);
2833 2815
2834static unsigned int nr_free_zone_pages(int offset) 2816/**
2817 * nr_free_zone_pages - count number of pages beyond high watermark
2818 * @offset: The zone index of the highest zone
2819 *
 2820 * nr_free_zone_pages() counts the number of pages which are beyond the
 2820 * nr_free_zone_pages() counts the number of pages which are beyond the
2821 * high watermark within all zones at or below a given zone index. For each
2822 * zone, the number of pages is calculated as:
2823 * present_pages - high_pages
2824 */
2825static unsigned long nr_free_zone_pages(int offset)
2835{ 2826{
2836 struct zoneref *z; 2827 struct zoneref *z;
2837 struct zone *zone; 2828 struct zone *zone;
2838 2829
2839 /* Just pick one node, since fallback list is circular */ 2830 /* Just pick one node, since fallback list is circular */
2840 unsigned int sum = 0; 2831 unsigned long sum = 0;
2841 2832
2842 struct zonelist *zonelist = node_zonelist(numa_node_id(), GFP_KERNEL); 2833 struct zonelist *zonelist = node_zonelist(numa_node_id(), GFP_KERNEL);
2843 2834
2844 for_each_zone_zonelist(zone, z, zonelist, offset) { 2835 for_each_zone_zonelist(zone, z, zonelist, offset) {
2845 unsigned long size = zone->present_pages; 2836 unsigned long size = zone->managed_pages;
2846 unsigned long high = high_wmark_pages(zone); 2837 unsigned long high = high_wmark_pages(zone);
2847 if (size > high) 2838 if (size > high)
2848 sum += size - high; 2839 sum += size - high;
@@ -2851,19 +2842,25 @@ static unsigned int nr_free_zone_pages(int offset)
2851 return sum; 2842 return sum;
2852} 2843}
2853 2844
2854/* 2845/**
2855 * Amount of free RAM allocatable within ZONE_DMA and ZONE_NORMAL 2846 * nr_free_buffer_pages - count number of pages beyond high watermark
2847 *
2848 * nr_free_buffer_pages() counts the number of pages which are beyond the high
2849 * watermark within ZONE_DMA and ZONE_NORMAL.
2856 */ 2850 */
2857unsigned int nr_free_buffer_pages(void) 2851unsigned long nr_free_buffer_pages(void)
2858{ 2852{
2859 return nr_free_zone_pages(gfp_zone(GFP_USER)); 2853 return nr_free_zone_pages(gfp_zone(GFP_USER));
2860} 2854}
2861EXPORT_SYMBOL_GPL(nr_free_buffer_pages); 2855EXPORT_SYMBOL_GPL(nr_free_buffer_pages);
2862 2856
2863/* 2857/**
2864 * Amount of free RAM allocatable within all zones 2858 * nr_free_pagecache_pages - count number of pages beyond high watermark
2859 *
2860 * nr_free_pagecache_pages() counts the number of pages which are beyond the
2861 * high watermark within all zones.
2865 */ 2862 */
2866unsigned int nr_free_pagecache_pages(void) 2863unsigned long nr_free_pagecache_pages(void)
2867{ 2864{
2868 return nr_free_zone_pages(gfp_zone(GFP_HIGHUSER_MOVABLE)); 2865 return nr_free_zone_pages(gfp_zone(GFP_HIGHUSER_MOVABLE));
2869} 2866}
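
The kernel-doc added above boils down to one sum: for every zone at or below the given index, count how far the zone's page count (managed_pages after this patch) sits above its high watermark. A standalone sketch of that calculation over an array of zones; struct zone_info and the sample numbers are invented for illustration.

#include <stdio.h>

struct zone_info {
	const char *name;
	unsigned long managed_pages;
	unsigned long high_wmark;
};

/* Sum of pages beyond the high watermark in every zone up to max_idx. */
static unsigned long nr_free_zone_pages_sketch(const struct zone_info *zones,
						int max_idx)
{
	unsigned long sum = 0;

	for (int i = 0; i <= max_idx; i++)
		if (zones[i].managed_pages > zones[i].high_wmark)
			sum += zones[i].managed_pages - zones[i].high_wmark;
	return sum;
}

int main(void)
{
	struct zone_info zones[] = {
		{ "DMA",     4000,   128 },
		{ "Normal",  200000, 2048 },
		{ "HighMem", 50000,  512 },
	};

	/* Analogue of nr_free_buffer_pages(): DMA + Normal only. */
	printf("buffer pages:    %lu\n", nr_free_zone_pages_sketch(zones, 1));
	/* Analogue of nr_free_pagecache_pages(): all zones. */
	printf("pagecache pages: %lu\n", nr_free_zone_pages_sketch(zones, 2));
	return 0;
}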
@@ -2895,7 +2892,7 @@ void si_meminfo_node(struct sysinfo *val, int nid)
2895 val->totalram = pgdat->node_present_pages; 2892 val->totalram = pgdat->node_present_pages;
2896 val->freeram = node_page_state(nid, NR_FREE_PAGES); 2893 val->freeram = node_page_state(nid, NR_FREE_PAGES);
2897#ifdef CONFIG_HIGHMEM 2894#ifdef CONFIG_HIGHMEM
2898 val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].present_pages; 2895 val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].managed_pages;
2899 val->freehigh = zone_page_state(&pgdat->node_zones[ZONE_HIGHMEM], 2896 val->freehigh = zone_page_state(&pgdat->node_zones[ZONE_HIGHMEM],
2900 NR_FREE_PAGES); 2897 NR_FREE_PAGES);
2901#else 2898#else
@@ -2938,7 +2935,9 @@ static void show_migration_types(unsigned char type)
2938#ifdef CONFIG_CMA 2935#ifdef CONFIG_CMA
2939 [MIGRATE_CMA] = 'C', 2936 [MIGRATE_CMA] = 'C',
2940#endif 2937#endif
2938#ifdef CONFIG_MEMORY_ISOLATION
2941 [MIGRATE_ISOLATE] = 'I', 2939 [MIGRATE_ISOLATE] = 'I',
2940#endif
2942 }; 2941 };
2943 char tmp[MIGRATE_TYPES + 1]; 2942 char tmp[MIGRATE_TYPES + 1];
2944 char *p = tmp; 2943 char *p = tmp;
@@ -3277,7 +3276,7 @@ static int find_next_best_node(int node, nodemask_t *used_node_mask)
3277{ 3276{
3278 int n, val; 3277 int n, val;
3279 int min_val = INT_MAX; 3278 int min_val = INT_MAX;
3280 int best_node = -1; 3279 int best_node = NUMA_NO_NODE;
3281 const struct cpumask *tmp = cpumask_of_node(0); 3280 const struct cpumask *tmp = cpumask_of_node(0);
3282 3281
3283 /* Use the local node if we haven't already */ 3282 /* Use the local node if we haven't already */
@@ -3821,7 +3820,7 @@ static void setup_zone_migrate_reserve(struct zone *zone)
3821 * the block. 3820 * the block.
3822 */ 3821 */
3823 start_pfn = zone->zone_start_pfn; 3822 start_pfn = zone->zone_start_pfn;
3824 end_pfn = start_pfn + zone->spanned_pages; 3823 end_pfn = zone_end_pfn(zone);
3825 start_pfn = roundup(start_pfn, pageblock_nr_pages); 3824 start_pfn = roundup(start_pfn, pageblock_nr_pages);
3826 reserve = roundup(min_wmark_pages(zone), pageblock_nr_pages) >> 3825 reserve = roundup(min_wmark_pages(zone), pageblock_nr_pages) >>
3827 pageblock_order; 3826 pageblock_order;
@@ -3917,8 +3916,8 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
3917 set_page_links(page, zone, nid, pfn); 3916 set_page_links(page, zone, nid, pfn);
3918 mminit_verify_page_links(page, zone, nid, pfn); 3917 mminit_verify_page_links(page, zone, nid, pfn);
3919 init_page_count(page); 3918 init_page_count(page);
3920 reset_page_mapcount(page); 3919 page_mapcount_reset(page);
3921 reset_page_last_nid(page); 3920 page_nid_reset_last(page);
3922 SetPageReserved(page); 3921 SetPageReserved(page);
3923 /* 3922 /*
3924 * Mark the block movable so that blocks are reserved for 3923 * Mark the block movable so that blocks are reserved for
@@ -3935,7 +3934,7 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
3935 * pfn out of zone. 3934 * pfn out of zone.
3936 */ 3935 */
3937 if ((z->zone_start_pfn <= pfn) 3936 if ((z->zone_start_pfn <= pfn)
3938 && (pfn < z->zone_start_pfn + z->spanned_pages) 3937 && (pfn < zone_end_pfn(z))
3939 && !(pfn & (pageblock_nr_pages - 1))) 3938 && !(pfn & (pageblock_nr_pages - 1)))
3940 set_pageblock_migratetype(page, MIGRATE_MOVABLE); 3939 set_pageblock_migratetype(page, MIGRATE_MOVABLE);
3941 3940
@@ -3973,7 +3972,7 @@ static int __meminit zone_batchsize(struct zone *zone)
3973 * 3972 *
3974 * OK, so we don't know how big the cache is. So guess. 3973 * OK, so we don't know how big the cache is. So guess.
3975 */ 3974 */
3976 batch = zone->present_pages / 1024; 3975 batch = zone->managed_pages / 1024;
3977 if (batch * PAGE_SIZE > 512 * 1024) 3976 if (batch * PAGE_SIZE > 512 * 1024)
3978 batch = (512 * 1024) / PAGE_SIZE; 3977 batch = (512 * 1024) / PAGE_SIZE;
3979 batch /= 4; /* We effectively *= 4 below */ 3978 batch /= 4; /* We effectively *= 4 below */
@@ -4057,7 +4056,7 @@ static void __meminit setup_zone_pageset(struct zone *zone)
4057 4056
4058 if (percpu_pagelist_fraction) 4057 if (percpu_pagelist_fraction)
4059 setup_pagelist_highmark(pcp, 4058 setup_pagelist_highmark(pcp,
4060 (zone->present_pages / 4059 (zone->managed_pages /
4061 percpu_pagelist_fraction)); 4060 percpu_pagelist_fraction));
4062 } 4061 }
4063} 4062}
@@ -4413,6 +4412,77 @@ static unsigned long __meminit zone_absent_pages_in_node(int nid,
4413 return __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn); 4412 return __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn);
4414} 4413}
4415 4414
4415/**
4416 * sanitize_zone_movable_limit - Sanitize the zone_movable_limit array.
4417 *
4418 * zone_movable_limit is initialized as 0. This function will try to get
4419 * the first ZONE_MOVABLE pfn of each node from movablemem_map, and
4420 * assigne them to zone_movable_limit.
4421 * zone_movable_limit[nid] == 0 means no limit for the node.
4422 *
4423 * Note: Each range is represented as [start_pfn, end_pfn)
4424 */
4425static void __meminit sanitize_zone_movable_limit(void)
4426{
4427 int map_pos = 0, i, nid;
4428 unsigned long start_pfn, end_pfn;
4429
4430 if (!movablemem_map.nr_map)
4431 return;
4432
4433 /* Iterate all ranges from minimum to maximum */
4434 for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) {
4435 /*
4436 * If we have found lowest pfn of ZONE_MOVABLE of the node
4437 * specified by user, just go on to check next range.
4438 */
4439 if (zone_movable_limit[nid])
4440 continue;
4441
4442#ifdef CONFIG_ZONE_DMA
4443 /* Skip DMA memory. */
4444 if (start_pfn < arch_zone_highest_possible_pfn[ZONE_DMA])
4445 start_pfn = arch_zone_highest_possible_pfn[ZONE_DMA];
4446#endif
4447
4448#ifdef CONFIG_ZONE_DMA32
4449 /* Skip DMA32 memory. */
4450 if (start_pfn < arch_zone_highest_possible_pfn[ZONE_DMA32])
4451 start_pfn = arch_zone_highest_possible_pfn[ZONE_DMA32];
4452#endif
4453
4454#ifdef CONFIG_HIGHMEM
4455 /* Skip lowmem if ZONE_MOVABLE is highmem. */
4456 if (zone_movable_is_highmem() &&
4457 start_pfn < arch_zone_lowest_possible_pfn[ZONE_HIGHMEM])
4458 start_pfn = arch_zone_lowest_possible_pfn[ZONE_HIGHMEM];
4459#endif
4460
4461 if (start_pfn >= end_pfn)
4462 continue;
4463
4464 while (map_pos < movablemem_map.nr_map) {
4465 if (end_pfn <= movablemem_map.map[map_pos].start_pfn)
4466 break;
4467
4468 if (start_pfn >= movablemem_map.map[map_pos].end_pfn) {
4469 map_pos++;
4470 continue;
4471 }
4472
4473 /*
4474 * The start_pfn of ZONE_MOVABLE is either the minimum
4475 * pfn specified by movablemem_map, or 0, which means
4476 * the node has no ZONE_MOVABLE.
4477 */
4478 zone_movable_limit[nid] = max(start_pfn,
4479 movablemem_map.map[map_pos].start_pfn);
4480
4481 break;
4482 }
4483 }
4484}
4485
4416#else /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ 4486#else /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
4417static inline unsigned long __meminit zone_spanned_pages_in_node(int nid, 4487static inline unsigned long __meminit zone_spanned_pages_in_node(int nid,
4418 unsigned long zone_type, 4488 unsigned long zone_type,
@@ -4430,7 +4500,6 @@ static inline unsigned long __meminit zone_absent_pages_in_node(int nid,
4430 4500
4431 return zholes_size[zone_type]; 4501 return zholes_size[zone_type];
4432} 4502}
4433
4434#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ 4503#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
4435 4504
4436static void __meminit calculate_node_totalpages(struct pglist_data *pgdat, 4505static void __meminit calculate_node_totalpages(struct pglist_data *pgdat,
@@ -4462,10 +4531,11 @@ static void __meminit calculate_node_totalpages(struct pglist_data *pgdat,
4462 * round what is now in bits to nearest long in bits, then return it in 4531 * round what is now in bits to nearest long in bits, then return it in
4463 * bytes. 4532 * bytes.
4464 */ 4533 */
4465static unsigned long __init usemap_size(unsigned long zonesize) 4534static unsigned long __init usemap_size(unsigned long zone_start_pfn, unsigned long zonesize)
4466{ 4535{
4467 unsigned long usemapsize; 4536 unsigned long usemapsize;
4468 4537
4538 zonesize += zone_start_pfn & (pageblock_nr_pages-1);
4469 usemapsize = roundup(zonesize, pageblock_nr_pages); 4539 usemapsize = roundup(zonesize, pageblock_nr_pages);
4470 usemapsize = usemapsize >> pageblock_order; 4540 usemapsize = usemapsize >> pageblock_order;
4471 usemapsize *= NR_PAGEBLOCK_BITS; 4541 usemapsize *= NR_PAGEBLOCK_BITS;
@@ -4475,17 +4545,19 @@ static unsigned long __init usemap_size(unsigned long zonesize)
4475} 4545}
4476 4546
4477static void __init setup_usemap(struct pglist_data *pgdat, 4547static void __init setup_usemap(struct pglist_data *pgdat,
4478 struct zone *zone, unsigned long zonesize) 4548 struct zone *zone,
4549 unsigned long zone_start_pfn,
4550 unsigned long zonesize)
4479{ 4551{
4480 unsigned long usemapsize = usemap_size(zonesize); 4552 unsigned long usemapsize = usemap_size(zone_start_pfn, zonesize);
4481 zone->pageblock_flags = NULL; 4553 zone->pageblock_flags = NULL;
4482 if (usemapsize) 4554 if (usemapsize)
4483 zone->pageblock_flags = alloc_bootmem_node_nopanic(pgdat, 4555 zone->pageblock_flags = alloc_bootmem_node_nopanic(pgdat,
4484 usemapsize); 4556 usemapsize);
4485} 4557}
4486#else 4558#else
4487static inline void setup_usemap(struct pglist_data *pgdat, 4559static inline void setup_usemap(struct pglist_data *pgdat, struct zone *zone,
4488 struct zone *zone, unsigned long zonesize) {} 4560 unsigned long zone_start_pfn, unsigned long zonesize) {}
4489#endif /* CONFIG_SPARSEMEM */ 4561#endif /* CONFIG_SPARSEMEM */
4490 4562
4491#ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE 4563#ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
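
The usemap_size() change above accounts for zones that no longer start on a pageblock boundary: the zone's offset inside its first pageblock is folded into the size before rounding, so the bitmap still covers every pageblock the zone touches. A small arithmetic sketch with assumed constants (a pageblock_order of 9 and 4 bits per pageblock are typical values, not derived from this patch):

#include <stdio.h>

#define PAGEBLOCK_ORDER    9
#define PAGEBLOCK_NR_PAGES (1UL << PAGEBLOCK_ORDER)
#define NR_PAGEBLOCK_BITS  4
#define BITS_PER_LONG      (8 * sizeof(long))

static unsigned long roundup_ul(unsigned long x, unsigned long to)
{
	return ((x + to - 1) / to) * to;
}

/* Bytes of pageblock-flags bitmap needed for a zone, following the patched
 * usemap_size(): the start offset inside the first pageblock is added first. */
static unsigned long usemap_size_sketch(unsigned long zone_start_pfn,
					unsigned long zonesize)
{
	unsigned long usemapsize;

	zonesize += zone_start_pfn & (PAGEBLOCK_NR_PAGES - 1);
	usemapsize = roundup_ul(zonesize, PAGEBLOCK_NR_PAGES);
	usemapsize >>= PAGEBLOCK_ORDER;
	usemapsize *= NR_PAGEBLOCK_BITS;
	usemapsize = roundup_ul(usemapsize, BITS_PER_LONG);
	return usemapsize / 8;
}

int main(void)
{
	/* A zone starting 100 pages into a pageblock needs one extra block covered. */
	printf("%lu bytes\n", usemap_size_sketch(0, 1 << 20));
	printf("%lu bytes\n", usemap_size_sketch(100, 1 << 20));
	return 0;
}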
@@ -4611,7 +4683,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
4611 nr_all_pages += freesize; 4683 nr_all_pages += freesize;
4612 4684
4613 zone->spanned_pages = size; 4685 zone->spanned_pages = size;
4614 zone->present_pages = freesize; 4686 zone->present_pages = realsize;
4615 /* 4687 /*
4616 * Set an approximate value for lowmem here, it will be adjusted 4688 * Set an approximate value for lowmem here, it will be adjusted
4617 * when the bootmem allocator frees pages into the buddy system. 4689 * when the bootmem allocator frees pages into the buddy system.
@@ -4636,7 +4708,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
4636 continue; 4708 continue;
4637 4709
4638 set_pageblock_order(); 4710 set_pageblock_order();
4639 setup_usemap(pgdat, zone, size); 4711 setup_usemap(pgdat, zone, zone_start_pfn, size);
4640 ret = init_currently_empty_zone(zone, zone_start_pfn, 4712 ret = init_currently_empty_zone(zone, zone_start_pfn,
4641 size, MEMMAP_EARLY); 4713 size, MEMMAP_EARLY);
4642 BUG_ON(ret); 4714 BUG_ON(ret);
@@ -4663,7 +4735,7 @@ static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat)
4663 * for the buddy allocator to function correctly. 4735 * for the buddy allocator to function correctly.
4664 */ 4736 */
4665 start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1); 4737 start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1);
4666 end = pgdat->node_start_pfn + pgdat->node_spanned_pages; 4738 end = pgdat_end_pfn(pgdat);
4667 end = ALIGN(end, MAX_ORDER_NR_PAGES); 4739 end = ALIGN(end, MAX_ORDER_NR_PAGES);
4668 size = (end - start) * sizeof(struct page); 4740 size = (end - start) * sizeof(struct page);
4669 map = alloc_remap(pgdat->node_id, size); 4741 map = alloc_remap(pgdat->node_id, size);
@@ -4869,12 +4941,19 @@ static void __init find_zone_movable_pfns_for_nodes(void)
4869 required_kernelcore = max(required_kernelcore, corepages); 4941 required_kernelcore = max(required_kernelcore, corepages);
4870 } 4942 }
4871 4943
4872 /* If kernelcore was not specified, there is no ZONE_MOVABLE */ 4944 /*
4873 if (!required_kernelcore) 4945 * If neither kernelcore/movablecore nor movablemem_map is specified,
4946 * there is no ZONE_MOVABLE. But if movablemem_map is specified, the
4947 * start pfn of ZONE_MOVABLE has been stored in zone_movable_limit[].
4948 */
4949 if (!required_kernelcore) {
4950 if (movablemem_map.nr_map)
4951 memcpy(zone_movable_pfn, zone_movable_limit,
4952 sizeof(zone_movable_pfn));
4874 goto out; 4953 goto out;
4954 }
4875 4955
4876 /* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */ 4956 /* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */
4877 find_usable_zone_for_movable();
4878 usable_startpfn = arch_zone_lowest_possible_pfn[movable_zone]; 4957 usable_startpfn = arch_zone_lowest_possible_pfn[movable_zone];
4879 4958
4880restart: 4959restart:
@@ -4902,10 +4981,24 @@ restart:
4902 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) { 4981 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
4903 unsigned long size_pages; 4982 unsigned long size_pages;
4904 4983
4984 /*
4985 * Find more memory for kernelcore in
4986 * [zone_movable_pfn[nid], zone_movable_limit[nid]).
4987 */
4905 start_pfn = max(start_pfn, zone_movable_pfn[nid]); 4988 start_pfn = max(start_pfn, zone_movable_pfn[nid]);
4906 if (start_pfn >= end_pfn) 4989 if (start_pfn >= end_pfn)
4907 continue; 4990 continue;
4908 4991
4992 if (zone_movable_limit[nid]) {
4993 end_pfn = min(end_pfn, zone_movable_limit[nid]);
4994 /* No range left for kernelcore in this node */
4995 if (start_pfn >= end_pfn) {
4996 zone_movable_pfn[nid] =
4997 zone_movable_limit[nid];
4998 break;
4999 }
5000 }
5001
4909 /* Account for what is only usable for kernelcore */ 5002 /* Account for what is only usable for kernelcore */
4910 if (start_pfn < usable_startpfn) { 5003 if (start_pfn < usable_startpfn) {
4911 unsigned long kernel_pages; 5004 unsigned long kernel_pages;
@@ -4965,12 +5058,12 @@ restart:
4965 if (usable_nodes && required_kernelcore > usable_nodes) 5058 if (usable_nodes && required_kernelcore > usable_nodes)
4966 goto restart; 5059 goto restart;
4967 5060
5061out:
4968 /* Align start of ZONE_MOVABLE on all nids to MAX_ORDER_NR_PAGES */ 5062 /* Align start of ZONE_MOVABLE on all nids to MAX_ORDER_NR_PAGES */
4969 for (nid = 0; nid < MAX_NUMNODES; nid++) 5063 for (nid = 0; nid < MAX_NUMNODES; nid++)
4970 zone_movable_pfn[nid] = 5064 zone_movable_pfn[nid] =
4971 roundup(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES); 5065 roundup(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES);
4972 5066
4973out:
4974 /* restore the node_state */ 5067 /* restore the node_state */
4975 node_states[N_MEMORY] = saved_node_state; 5068 node_states[N_MEMORY] = saved_node_state;
4976} 5069}
@@ -5033,6 +5126,8 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)
5033 5126
5034 /* Find the PFNs that ZONE_MOVABLE begins at in each node */ 5127 /* Find the PFNs that ZONE_MOVABLE begins at in each node */
5035 memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn)); 5128 memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn));
5129 find_usable_zone_for_movable();
5130 sanitize_zone_movable_limit();
5036 find_zone_movable_pfns_for_nodes(); 5131 find_zone_movable_pfns_for_nodes();
5037 5132
5038 /* Print out the zone ranges */ 5133 /* Print out the zone ranges */
@@ -5116,6 +5211,181 @@ static int __init cmdline_parse_movablecore(char *p)
5116early_param("kernelcore", cmdline_parse_kernelcore); 5211early_param("kernelcore", cmdline_parse_kernelcore);
5117early_param("movablecore", cmdline_parse_movablecore); 5212early_param("movablecore", cmdline_parse_movablecore);
5118 5213
5214/**
5215 * movablemem_map_overlap() - Check if a range overlaps movablemem_map.map[].
5216 * @start_pfn: start pfn of the range to be checked
5217 * @end_pfn: end pfn of the range to be checked (exclusive)
5218 *
5219 * This function checks if a given memory range [start_pfn, end_pfn) overlaps
5220 * the movablemem_map.map[] array.
5221 *
5222 * Return: index of the first overlapped element in movablemem_map.map[]
5223 * or -1 if they don't overlap each other.
5224 */
5225int __init movablemem_map_overlap(unsigned long start_pfn,
5226 unsigned long end_pfn)
5227{
5228 int overlap;
5229
5230 if (!movablemem_map.nr_map)
5231 return -1;
5232
5233 for (overlap = 0; overlap < movablemem_map.nr_map; overlap++)
5234 if (start_pfn < movablemem_map.map[overlap].end_pfn)
5235 break;
5236
5237 if (overlap == movablemem_map.nr_map ||
5238 end_pfn <= movablemem_map.map[overlap].start_pfn)
5239 return -1;
5240
5241 return overlap;
5242}
5243
5244/**
 5245 * insert_movablemem_map - Insert a memory range into movablemem_map.map.
5246 * @start_pfn: start pfn of the range
5247 * @end_pfn: end pfn of the range
5248 *
5249 * This function will also merge the overlapped ranges, and sort the array
5250 * by start_pfn in monotonic increasing order.
5251 */
5252void __init insert_movablemem_map(unsigned long start_pfn,
5253 unsigned long end_pfn)
5254{
5255 int pos, overlap;
5256
5257 /*
5258 * pos will be at the 1st overlapped range, or the position
5259 * where the element should be inserted.
5260 */
5261 for (pos = 0; pos < movablemem_map.nr_map; pos++)
5262 if (start_pfn <= movablemem_map.map[pos].end_pfn)
5263 break;
5264
5265 /* If there is no overlapped range, just insert the element. */
5266 if (pos == movablemem_map.nr_map ||
5267 end_pfn < movablemem_map.map[pos].start_pfn) {
5268 /*
 5269 * If pos is not at the end of the array, we need to move all
 5270 * the remaining elements backward.
5271 */
5272 if (pos < movablemem_map.nr_map)
5273 memmove(&movablemem_map.map[pos+1],
5274 &movablemem_map.map[pos],
5275 sizeof(struct movablemem_entry) *
5276 (movablemem_map.nr_map - pos));
5277 movablemem_map.map[pos].start_pfn = start_pfn;
5278 movablemem_map.map[pos].end_pfn = end_pfn;
5279 movablemem_map.nr_map++;
5280 return;
5281 }
5282
5283 /* overlap will be at the last overlapped range */
5284 for (overlap = pos + 1; overlap < movablemem_map.nr_map; overlap++)
5285 if (end_pfn < movablemem_map.map[overlap].start_pfn)
5286 break;
5287
5288 /*
 5289 * If more ranges overlap, we need to merge them,
 5290 * and move the remaining elements forward.
5291 */
5292 overlap--;
5293 movablemem_map.map[pos].start_pfn = min(start_pfn,
5294 movablemem_map.map[pos].start_pfn);
5295 movablemem_map.map[pos].end_pfn = max(end_pfn,
5296 movablemem_map.map[overlap].end_pfn);
5297
5298 if (pos != overlap && overlap + 1 != movablemem_map.nr_map)
5299 memmove(&movablemem_map.map[pos+1],
5300 &movablemem_map.map[overlap+1],
5301 sizeof(struct movablemem_entry) *
5302 (movablemem_map.nr_map - overlap - 1));
5303
5304 movablemem_map.nr_map -= overlap - pos;
5305}
5306
5307/**
5308 * movablemem_map_add_region - Add a memory range into movablemem_map.
5309 * @start: physical start address of range
 5310 * @size: size of the range in bytes
5311 *
 5312 * This function transforms the physical addresses into pfns, and then adds the
5313 * range into movablemem_map by calling insert_movablemem_map().
5314 */
5315static void __init movablemem_map_add_region(u64 start, u64 size)
5316{
5317 unsigned long start_pfn, end_pfn;
5318
5319 /* In case size == 0 or start + size overflows */
5320 if (start + size <= start)
5321 return;
5322
5323 if (movablemem_map.nr_map >= ARRAY_SIZE(movablemem_map.map)) {
5324 pr_err("movablemem_map: too many entries;"
5325 " ignoring [mem %#010llx-%#010llx]\n",
5326 (unsigned long long) start,
5327 (unsigned long long) (start + size - 1));
5328 return;
5329 }
5330
5331 start_pfn = PFN_DOWN(start);
5332 end_pfn = PFN_UP(start + size);
5333 insert_movablemem_map(start_pfn, end_pfn);
5334}
5335
5336/*
5337 * cmdline_parse_movablemem_map - Parse boot option movablemem_map.
5338 * @p: The boot option of the following format:
5339 * movablemem_map=nn[KMG]@ss[KMG]
5340 *
5341 * This option sets the memory range [ss, ss+nn) to be used as movable memory.
5342 *
5343 * Return: 0 on success or -EINVAL on failure.
5344 */
5345static int __init cmdline_parse_movablemem_map(char *p)
5346{
5347 char *oldp;
5348 u64 start_at, mem_size;
5349
5350 if (!p)
5351 goto err;
5352
5353 if (!strcmp(p, "acpi"))
5354 movablemem_map.acpi = true;
5355
5356 /*
 5357 * If the user decides to use info from the BIOS, all the other
 5358 * user-specified ranges will be ignored.
5359 */
5360 if (movablemem_map.acpi) {
5361 if (movablemem_map.nr_map) {
5362 memset(movablemem_map.map, 0,
5363 sizeof(struct movablemem_entry)
5364 * movablemem_map.nr_map);
5365 movablemem_map.nr_map = 0;
5366 }
5367 return 0;
5368 }
5369
5370 oldp = p;
5371 mem_size = memparse(p, &p);
5372 if (p == oldp)
5373 goto err;
5374
5375 if (*p == '@') {
5376 oldp = ++p;
5377 start_at = memparse(p, &p);
5378 if (p == oldp || *p != '\0')
5379 goto err;
5380
5381 movablemem_map_add_region(start_at, mem_size);
5382 return 0;
5383 }
5384err:
5385 return -EINVAL;
5386}
5387early_param("movablemem_map", cmdline_parse_movablemem_map);
5388
5119#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ 5389#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
5120 5390
5121/** 5391/**
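
insert_movablemem_map() above is an ordinary sorted-interval insert with merging: find the first slot whose end is not below the new start, then either splice the range in or fold every overlapping neighbour into one entry. A self-contained sketch of the same algorithm; the 32-entry limit and the range_t/insert_range names are assumptions of the sketch, not the kernel's layout.

#include <stdio.h>
#include <string.h>

#define MAX_RANGES 32

struct range_t { unsigned long start, end; };   /* [start, end) */

static struct range_t map[MAX_RANGES];
static int nr_map;

/* Insert [start, end), merging with any ranges it touches and keeping the
 * array sorted by start. Mirrors the structure of insert_movablemem_map(). */
static void insert_range(unsigned long start, unsigned long end)
{
	int pos, overlap;

	/* pos is the first overlapped range, or the insertion point. */
	for (pos = 0; pos < nr_map; pos++)
		if (start <= map[pos].end)
			break;

	/* No overlap: shift the tail and slot the new range in. */
	if (pos == nr_map || end < map[pos].start) {
		if (nr_map >= MAX_RANGES)
			return;                 /* sketch: silently drop */
		memmove(&map[pos + 1], &map[pos],
			sizeof(map[0]) * (nr_map - pos));
		map[pos].start = start;
		map[pos].end = end;
		nr_map++;
		return;
	}

	/* Find the last range the new one still touches, then merge them all. */
	for (overlap = pos + 1; overlap < nr_map; overlap++)
		if (end < map[overlap].start)
			break;
	overlap--;

	map[pos].start = map[pos].start < start ? map[pos].start : start;
	map[pos].end = map[overlap].end > end ? map[overlap].end : end;
	memmove(&map[pos + 1], &map[overlap + 1],
		sizeof(map[0]) * (nr_map - overlap - 1));
	nr_map -= overlap - pos;
}

int main(void)
{
	insert_range(100, 200);
	insert_range(400, 500);
	insert_range(150, 450);                 /* bridges the first two ranges */

	for (int i = 0; i < nr_map; i++)
		printf("[%lu, %lu)\n", map[i].start, map[i].end);
	return 0;
}

The boot-option side (movablemem_map=nn[KMG]@ss[KMG]) then only converts the parsed size and offset into a pfn range and calls this insert.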
@@ -5198,8 +5468,8 @@ static void calculate_totalreserve_pages(void)
5198 /* we treat the high watermark as reserved pages. */ 5468 /* we treat the high watermark as reserved pages. */
5199 max += high_wmark_pages(zone); 5469 max += high_wmark_pages(zone);
5200 5470
5201 if (max > zone->present_pages) 5471 if (max > zone->managed_pages)
5202 max = zone->present_pages; 5472 max = zone->managed_pages;
5203 reserve_pages += max; 5473 reserve_pages += max;
5204 /* 5474 /*
5205 * Lowmem reserves are not available to 5475 * Lowmem reserves are not available to
@@ -5231,7 +5501,7 @@ static void setup_per_zone_lowmem_reserve(void)
5231 for_each_online_pgdat(pgdat) { 5501 for_each_online_pgdat(pgdat) {
5232 for (j = 0; j < MAX_NR_ZONES; j++) { 5502 for (j = 0; j < MAX_NR_ZONES; j++) {
5233 struct zone *zone = pgdat->node_zones + j; 5503 struct zone *zone = pgdat->node_zones + j;
5234 unsigned long present_pages = zone->present_pages; 5504 unsigned long managed_pages = zone->managed_pages;
5235 5505
5236 zone->lowmem_reserve[j] = 0; 5506 zone->lowmem_reserve[j] = 0;
5237 5507
@@ -5245,9 +5515,9 @@ static void setup_per_zone_lowmem_reserve(void)
5245 sysctl_lowmem_reserve_ratio[idx] = 1; 5515 sysctl_lowmem_reserve_ratio[idx] = 1;
5246 5516
5247 lower_zone = pgdat->node_zones + idx; 5517 lower_zone = pgdat->node_zones + idx;
5248 lower_zone->lowmem_reserve[j] = present_pages / 5518 lower_zone->lowmem_reserve[j] = managed_pages /
5249 sysctl_lowmem_reserve_ratio[idx]; 5519 sysctl_lowmem_reserve_ratio[idx];
5250 present_pages += lower_zone->present_pages; 5520 managed_pages += lower_zone->managed_pages;
5251 } 5521 }
5252 } 5522 }
5253 } 5523 }
@@ -5266,14 +5536,14 @@ static void __setup_per_zone_wmarks(void)
5266 /* Calculate total number of !ZONE_HIGHMEM pages */ 5536 /* Calculate total number of !ZONE_HIGHMEM pages */
5267 for_each_zone(zone) { 5537 for_each_zone(zone) {
5268 if (!is_highmem(zone)) 5538 if (!is_highmem(zone))
5269 lowmem_pages += zone->present_pages; 5539 lowmem_pages += zone->managed_pages;
5270 } 5540 }
5271 5541
5272 for_each_zone(zone) { 5542 for_each_zone(zone) {
5273 u64 tmp; 5543 u64 tmp;
5274 5544
5275 spin_lock_irqsave(&zone->lock, flags); 5545 spin_lock_irqsave(&zone->lock, flags);
5276 tmp = (u64)pages_min * zone->present_pages; 5546 tmp = (u64)pages_min * zone->managed_pages;
5277 do_div(tmp, lowmem_pages); 5547 do_div(tmp, lowmem_pages);
5278 if (is_highmem(zone)) { 5548 if (is_highmem(zone)) {
5279 /* 5549 /*
@@ -5285,13 +5555,10 @@ static void __setup_per_zone_wmarks(void)
5285 * deltas controls asynch page reclaim, and so should 5555 * deltas controls asynch page reclaim, and so should
5286 * not be capped for highmem. 5556 * not be capped for highmem.
5287 */ 5557 */
5288 int min_pages; 5558 unsigned long min_pages;
5289 5559
5290 min_pages = zone->present_pages / 1024; 5560 min_pages = zone->managed_pages / 1024;
5291 if (min_pages < SWAP_CLUSTER_MAX) 5561 min_pages = clamp(min_pages, SWAP_CLUSTER_MAX, 128UL);
5292 min_pages = SWAP_CLUSTER_MAX;
5293 if (min_pages > 128)
5294 min_pages = 128;
5295 zone->watermark[WMARK_MIN] = min_pages; 5562 zone->watermark[WMARK_MIN] = min_pages;
5296 } else { 5563 } else {
5297 /* 5564 /*
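
The rewritten highmem branch above replaces two hand-rolled comparisons with a single clamp of managed_pages/1024 to [SWAP_CLUSTER_MAX, 128]. The equivalent computation, with the lower bound taken as the usual value of 32 (an assumption of the sketch):

#include <stdio.h>

#define SWAP_CLUSTER_MAX 32UL   /* typical kernel value; assumed here */

static unsigned long clamp_ul(unsigned long v, unsigned long lo, unsigned long hi)
{
	if (v < lo)
		return lo;
	if (v > hi)
		return hi;
	return v;
}

/* WMARK_MIN for a highmem zone: a token amount, bounded on both sides. */
static unsigned long highmem_min_pages(unsigned long managed_pages)
{
	return clamp_ul(managed_pages / 1024, SWAP_CLUSTER_MAX, 128UL);
}

int main(void)
{
	printf("%lu\n", highmem_min_pages(8192));    /* 32: small zone hits the floor */
	printf("%lu\n", highmem_min_pages(65536));   /* 64 */
	printf("%lu\n", highmem_min_pages(1 << 20)); /* 128: large zone hits the cap */
	return 0;
}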
@@ -5352,7 +5619,7 @@ static void __meminit calculate_zone_inactive_ratio(struct zone *zone)
5352 unsigned int gb, ratio; 5619 unsigned int gb, ratio;
5353 5620
5354 /* Zone size in gigabytes */ 5621 /* Zone size in gigabytes */
5355 gb = zone->present_pages >> (30 - PAGE_SHIFT); 5622 gb = zone->managed_pages >> (30 - PAGE_SHIFT);
5356 if (gb) 5623 if (gb)
5357 ratio = int_sqrt(10 * gb); 5624 ratio = int_sqrt(10 * gb);
5358 else 5625 else
@@ -5438,7 +5705,7 @@ int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write,
5438 return rc; 5705 return rc;
5439 5706
5440 for_each_zone(zone) 5707 for_each_zone(zone)
5441 zone->min_unmapped_pages = (zone->present_pages * 5708 zone->min_unmapped_pages = (zone->managed_pages *
5442 sysctl_min_unmapped_ratio) / 100; 5709 sysctl_min_unmapped_ratio) / 100;
5443 return 0; 5710 return 0;
5444} 5711}
@@ -5454,7 +5721,7 @@ int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write,
5454 return rc; 5721 return rc;
5455 5722
5456 for_each_zone(zone) 5723 for_each_zone(zone)
5457 zone->min_slab_pages = (zone->present_pages * 5724 zone->min_slab_pages = (zone->managed_pages *
5458 sysctl_min_slab_ratio) / 100; 5725 sysctl_min_slab_ratio) / 100;
5459 return 0; 5726 return 0;
5460} 5727}
@@ -5496,7 +5763,7 @@ int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write,
5496 for_each_populated_zone(zone) { 5763 for_each_populated_zone(zone) {
5497 for_each_possible_cpu(cpu) { 5764 for_each_possible_cpu(cpu) {
5498 unsigned long high; 5765 unsigned long high;
5499 high = zone->present_pages / percpu_pagelist_fraction; 5766 high = zone->managed_pages / percpu_pagelist_fraction;
5500 setup_pagelist_highmark( 5767 setup_pagelist_highmark(
5501 per_cpu_ptr(zone->pageset, cpu), high); 5768 per_cpu_ptr(zone->pageset, cpu), high);
5502 } 5769 }
@@ -5631,7 +5898,7 @@ static inline int pfn_to_bitidx(struct zone *zone, unsigned long pfn)
5631 pfn &= (PAGES_PER_SECTION-1); 5898 pfn &= (PAGES_PER_SECTION-1);
5632 return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS; 5899 return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
5633#else 5900#else
5634 pfn = pfn - zone->zone_start_pfn; 5901 pfn = pfn - round_down(zone->zone_start_pfn, pageblock_nr_pages);
5635 return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS; 5902 return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
5636#endif /* CONFIG_SPARSEMEM */ 5903#endif /* CONFIG_SPARSEMEM */
5637} 5904}
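
The one-line fix above matters once a zone can start in the middle of a pageblock: the bit index must be computed from the pageblock-aligned zone start, otherwise every lookup shifts by a partial block. A small demonstration of the difference, again with an assumed pageblock_order of 9 and 4 bits per pageblock:

#include <stdio.h>

#define PAGEBLOCK_ORDER    9
#define PAGEBLOCK_NR_PAGES (1UL << PAGEBLOCK_ORDER)
#define NR_PAGEBLOCK_BITS  4

static unsigned long round_down_ul(unsigned long x, unsigned long to)
{
	return x - (x % to);
}

/* Old behaviour: offset from the raw zone start. */
static unsigned long bitidx_old(unsigned long zone_start_pfn, unsigned long pfn)
{
	pfn -= zone_start_pfn;
	return (pfn >> PAGEBLOCK_ORDER) * NR_PAGEBLOCK_BITS;
}

/* Patched behaviour: offset from the pageblock-aligned zone start. */
static unsigned long bitidx_new(unsigned long zone_start_pfn, unsigned long pfn)
{
	pfn -= round_down_ul(zone_start_pfn, PAGEBLOCK_NR_PAGES);
	return (pfn >> PAGEBLOCK_ORDER) * NR_PAGEBLOCK_BITS;
}

int main(void)
{
	unsigned long zone_start = 100;  /* zone begins 100 pages into a pageblock */
	unsigned long pfn = 512;         /* first pfn of the second pageblock */

	/* old: 0 (wrong block), new: 4 (second block's flags) */
	printf("old: %lu  new: %lu\n", bitidx_old(zone_start, pfn),
	       bitidx_new(zone_start, pfn));
	return 0;
}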
@@ -5683,8 +5950,7 @@ void set_pageblock_flags_group(struct page *page, unsigned long flags,
5683 pfn = page_to_pfn(page); 5950 pfn = page_to_pfn(page);
5684 bitmap = get_pageblock_bitmap(zone, pfn); 5951 bitmap = get_pageblock_bitmap(zone, pfn);
5685 bitidx = pfn_to_bitidx(zone, pfn); 5952 bitidx = pfn_to_bitidx(zone, pfn);
5686 VM_BUG_ON(pfn < zone->zone_start_pfn); 5953 VM_BUG_ON(!zone_spans_pfn(zone, pfn));
5687 VM_BUG_ON(pfn >= zone->zone_start_pfn + zone->spanned_pages);
5688 5954
5689 for (; start_bitidx <= end_bitidx; start_bitidx++, value <<= 1) 5955 for (; start_bitidx <= end_bitidx; start_bitidx++, value <<= 1)
5690 if (flags & value) 5956 if (flags & value)
@@ -5782,8 +6048,7 @@ bool is_pageblock_removable_nolock(struct page *page)
5782 6048
5783 zone = page_zone(page); 6049 zone = page_zone(page);
5784 pfn = page_to_pfn(page); 6050 pfn = page_to_pfn(page);
5785 if (zone->zone_start_pfn > pfn || 6051 if (!zone_spans_pfn(zone, pfn))
5786 zone->zone_start_pfn + zone->spanned_pages <= pfn)
5787 return false; 6052 return false;
5788 6053
5789 return !has_unmovable_pages(zone, page, 0, true); 6054 return !has_unmovable_pages(zone, page, 0, true);
@@ -5839,14 +6104,14 @@ static int __alloc_contig_migrate_range(struct compact_control *cc,
5839 &cc->migratepages); 6104 &cc->migratepages);
5840 cc->nr_migratepages -= nr_reclaimed; 6105 cc->nr_migratepages -= nr_reclaimed;
5841 6106
5842 ret = migrate_pages(&cc->migratepages, 6107 ret = migrate_pages(&cc->migratepages, alloc_migrate_target,
5843 alloc_migrate_target, 6108 0, MIGRATE_SYNC, MR_CMA);
5844 0, false, MIGRATE_SYNC,
5845 MR_CMA);
5846 } 6109 }
5847 6110 if (ret < 0) {
5848 putback_movable_pages(&cc->migratepages); 6111 putback_movable_pages(&cc->migratepages);
5849 return ret > 0 ? 0 : ret; 6112 return ret;
6113 }
6114 return 0;
5850} 6115}
5851 6116
5852/** 6117/**
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index 9d2264ea4606..383bdbb98b04 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -8,28 +8,6 @@
8#include <linux/memory.h> 8#include <linux/memory.h>
9#include "internal.h" 9#include "internal.h"
10 10
11/* called while holding zone->lock */
12static void set_pageblock_isolate(struct page *page)
13{
14 if (get_pageblock_migratetype(page) == MIGRATE_ISOLATE)
15 return;
16
17 set_pageblock_migratetype(page, MIGRATE_ISOLATE);
18 page_zone(page)->nr_pageblock_isolate++;
19}
20
21/* called while holding zone->lock */
22static void restore_pageblock_isolate(struct page *page, int migratetype)
23{
24 struct zone *zone = page_zone(page);
25 if (WARN_ON(get_pageblock_migratetype(page) != MIGRATE_ISOLATE))
26 return;
27
28 BUG_ON(zone->nr_pageblock_isolate <= 0);
29 set_pageblock_migratetype(page, migratetype);
30 zone->nr_pageblock_isolate--;
31}
32
33int set_migratetype_isolate(struct page *page, bool skip_hwpoisoned_pages) 11int set_migratetype_isolate(struct page *page, bool skip_hwpoisoned_pages)
34{ 12{
35 struct zone *zone; 13 struct zone *zone;
@@ -80,7 +58,7 @@ out:
80 unsigned long nr_pages; 58 unsigned long nr_pages;
81 int migratetype = get_pageblock_migratetype(page); 59 int migratetype = get_pageblock_migratetype(page);
82 60
83 set_pageblock_isolate(page); 61 set_pageblock_migratetype(page, MIGRATE_ISOLATE);
84 nr_pages = move_freepages_block(zone, page, MIGRATE_ISOLATE); 62 nr_pages = move_freepages_block(zone, page, MIGRATE_ISOLATE);
85 63
86 __mod_zone_freepage_state(zone, -nr_pages, migratetype); 64 __mod_zone_freepage_state(zone, -nr_pages, migratetype);
@@ -103,7 +81,7 @@ void unset_migratetype_isolate(struct page *page, unsigned migratetype)
103 goto out; 81 goto out;
104 nr_pages = move_freepages_block(zone, page, migratetype); 82 nr_pages = move_freepages_block(zone, page, migratetype);
105 __mod_zone_freepage_state(zone, nr_pages, migratetype); 83 __mod_zone_freepage_state(zone, nr_pages, migratetype);
106 restore_pageblock_isolate(page, migratetype); 84 set_pageblock_migratetype(page, migratetype);
107out: 85out:
108 spin_unlock_irqrestore(&zone->lock, flags); 86 spin_unlock_irqrestore(&zone->lock, flags);
109} 87}
diff --git a/mm/rmap.c b/mm/rmap.c
index 2c78f8cadc95..807c96bf0dc6 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -105,7 +105,7 @@ static inline void anon_vma_free(struct anon_vma *anon_vma)
105 */ 105 */
106 if (rwsem_is_locked(&anon_vma->root->rwsem)) { 106 if (rwsem_is_locked(&anon_vma->root->rwsem)) {
107 anon_vma_lock_write(anon_vma); 107 anon_vma_lock_write(anon_vma);
108 anon_vma_unlock(anon_vma); 108 anon_vma_unlock_write(anon_vma);
109 } 109 }
110 110
111 kmem_cache_free(anon_vma_cachep, anon_vma); 111 kmem_cache_free(anon_vma_cachep, anon_vma);
@@ -191,7 +191,7 @@ int anon_vma_prepare(struct vm_area_struct *vma)
191 avc = NULL; 191 avc = NULL;
192 } 192 }
193 spin_unlock(&mm->page_table_lock); 193 spin_unlock(&mm->page_table_lock);
194 anon_vma_unlock(anon_vma); 194 anon_vma_unlock_write(anon_vma);
195 195
196 if (unlikely(allocated)) 196 if (unlikely(allocated))
197 put_anon_vma(allocated); 197 put_anon_vma(allocated);
@@ -308,7 +308,7 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
308 vma->anon_vma = anon_vma; 308 vma->anon_vma = anon_vma;
309 anon_vma_lock_write(anon_vma); 309 anon_vma_lock_write(anon_vma);
310 anon_vma_chain_link(vma, avc, anon_vma); 310 anon_vma_chain_link(vma, avc, anon_vma);
311 anon_vma_unlock(anon_vma); 311 anon_vma_unlock_write(anon_vma);
312 312
313 return 0; 313 return 0;
314 314
@@ -1126,7 +1126,6 @@ void page_add_file_rmap(struct page *page)
1126 */ 1126 */
1127void page_remove_rmap(struct page *page) 1127void page_remove_rmap(struct page *page)
1128{ 1128{
1129 struct address_space *mapping = page_mapping(page);
1130 bool anon = PageAnon(page); 1129 bool anon = PageAnon(page);
1131 bool locked; 1130 bool locked;
1132 unsigned long flags; 1131 unsigned long flags;
@@ -1144,29 +1143,6 @@ void page_remove_rmap(struct page *page)
1144 goto out; 1143 goto out;
1145 1144
1146 /* 1145 /*
1147 * Now that the last pte has gone, s390 must transfer dirty
1148 * flag from storage key to struct page. We can usually skip
1149 * this if the page is anon, so about to be freed; but perhaps
1150 * not if it's in swapcache - there might be another pte slot
1151 * containing the swap entry, but page not yet written to swap.
1152 *
1153 * And we can skip it on file pages, so long as the filesystem
1154 * participates in dirty tracking (note that this is not only an
1155 * optimization but also solves problems caused by dirty flag in
1156 * storage key getting set by a write from inside kernel); but need to
1157 * catch shm and tmpfs and ramfs pages which have been modified since
1158 * creation by read fault.
1159 *
1160 * Note that mapping must be decided above, before decrementing
1161 * mapcount (which luckily provides a barrier): once page is unmapped,
1162 * it could be truncated and page->mapping reset to NULL at any moment.
1163 * Note also that we are relying on page_mapping(page) to set mapping
1164 * to &swapper_space when PageSwapCache(page).
1165 */
1166 if (mapping && !mapping_cap_account_dirty(mapping) &&
1167 page_test_and_clear_dirty(page_to_pfn(page), 1))
1168 set_page_dirty(page);
1169 /*
1170 * Hugepages are not counted in NR_ANON_PAGES nor NR_FILE_MAPPED 1146 * Hugepages are not counted in NR_ANON_PAGES nor NR_FILE_MAPPED
1171 * and not charged by memcg for now. 1147 * and not charged by memcg for now.
1172 */ 1148 */
diff --git a/mm/shmem.c b/mm/shmem.c
index 197ca5eccbae..39de1d6a077a 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -335,19 +335,19 @@ static unsigned shmem_find_get_pages_and_swap(struct address_space *mapping,
335 pgoff_t start, unsigned int nr_pages, 335 pgoff_t start, unsigned int nr_pages,
336 struct page **pages, pgoff_t *indices) 336 struct page **pages, pgoff_t *indices)
337{ 337{
338 unsigned int i; 338 void **slot;
339 unsigned int ret; 339 unsigned int ret = 0;
340 unsigned int nr_found; 340 struct radix_tree_iter iter;
341
342 if (!nr_pages)
343 return 0;
341 344
342 rcu_read_lock(); 345 rcu_read_lock();
343restart: 346restart:
344 nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree, 347 radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) {
345 (void ***)pages, indices, start, nr_pages);
346 ret = 0;
347 for (i = 0; i < nr_found; i++) {
348 struct page *page; 348 struct page *page;
349repeat: 349repeat:
350 page = radix_tree_deref_slot((void **)pages[i]); 350 page = radix_tree_deref_slot(slot);
351 if (unlikely(!page)) 351 if (unlikely(!page))
352 continue; 352 continue;
353 if (radix_tree_exception(page)) { 353 if (radix_tree_exception(page)) {
@@ -364,17 +364,16 @@ repeat:
364 goto repeat; 364 goto repeat;
365 365
366 /* Has the page moved? */ 366 /* Has the page moved? */
367 if (unlikely(page != *((void **)pages[i]))) { 367 if (unlikely(page != *slot)) {
368 page_cache_release(page); 368 page_cache_release(page);
369 goto repeat; 369 goto repeat;
370 } 370 }
371export: 371export:
372 indices[ret] = indices[i]; 372 indices[ret] = iter.index;
373 pages[ret] = page; 373 pages[ret] = page;
374 ret++; 374 if (++ret == nr_pages)
375 break;
375 } 376 }
376 if (unlikely(!ret && nr_found))
377 goto restart;
378 rcu_read_unlock(); 377 rcu_read_unlock();
379 return ret; 378 return ret;
380} 379}
@@ -889,7 +888,7 @@ static void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol)
889 if (!mpol || mpol->mode == MPOL_DEFAULT) 888 if (!mpol || mpol->mode == MPOL_DEFAULT)
890 return; /* show nothing */ 889 return; /* show nothing */
891 890
892 mpol_to_str(buffer, sizeof(buffer), mpol, 1); 891 mpol_to_str(buffer, sizeof(buffer), mpol);
893 892
894 seq_printf(seq, ",mpol=%s", buffer); 893 seq_printf(seq, ",mpol=%s", buffer);
895} 894}
@@ -2386,6 +2385,7 @@ static int shmem_parse_options(char *options, struct shmem_sb_info *sbinfo,
2386 bool remount) 2385 bool remount)
2387{ 2386{
2388 char *this_char, *value, *rest; 2387 char *this_char, *value, *rest;
2388 struct mempolicy *mpol = NULL;
2389 uid_t uid; 2389 uid_t uid;
2390 gid_t gid; 2390 gid_t gid;
2391 2391
@@ -2414,7 +2414,7 @@ static int shmem_parse_options(char *options, struct shmem_sb_info *sbinfo,
2414 printk(KERN_ERR 2414 printk(KERN_ERR
2415 "tmpfs: No value for mount option '%s'\n", 2415 "tmpfs: No value for mount option '%s'\n",
2416 this_char); 2416 this_char);
2417 return 1; 2417 goto error;
2418 } 2418 }
2419 2419
2420 if (!strcmp(this_char,"size")) { 2420 if (!strcmp(this_char,"size")) {
@@ -2463,19 +2463,24 @@ static int shmem_parse_options(char *options, struct shmem_sb_info *sbinfo,
2463 if (!gid_valid(sbinfo->gid)) 2463 if (!gid_valid(sbinfo->gid))
2464 goto bad_val; 2464 goto bad_val;
2465 } else if (!strcmp(this_char,"mpol")) { 2465 } else if (!strcmp(this_char,"mpol")) {
2466 if (mpol_parse_str(value, &sbinfo->mpol, 1)) 2466 mpol_put(mpol);
2467 mpol = NULL;
2468 if (mpol_parse_str(value, &mpol))
2467 goto bad_val; 2469 goto bad_val;
2468 } else { 2470 } else {
2469 printk(KERN_ERR "tmpfs: Bad mount option %s\n", 2471 printk(KERN_ERR "tmpfs: Bad mount option %s\n",
2470 this_char); 2472 this_char);
2471 return 1; 2473 goto error;
2472 } 2474 }
2473 } 2475 }
2476 sbinfo->mpol = mpol;
2474 return 0; 2477 return 0;
2475 2478
2476bad_val: 2479bad_val:
2477 printk(KERN_ERR "tmpfs: Bad value '%s' for mount option '%s'\n", 2480 printk(KERN_ERR "tmpfs: Bad value '%s' for mount option '%s'\n",
2478 value, this_char); 2481 value, this_char);
2482error:
2483 mpol_put(mpol);
2479 return 1; 2484 return 1;
2480 2485
2481} 2486}
@@ -2487,6 +2492,7 @@ static int shmem_remount_fs(struct super_block *sb, int *flags, char *data)
2487 unsigned long inodes; 2492 unsigned long inodes;
2488 int error = -EINVAL; 2493 int error = -EINVAL;
2489 2494
2495 config.mpol = NULL;
2490 if (shmem_parse_options(data, &config, true)) 2496 if (shmem_parse_options(data, &config, true))
2491 return error; 2497 return error;
2492 2498
@@ -2511,8 +2517,13 @@ static int shmem_remount_fs(struct super_block *sb, int *flags, char *data)
2511 sbinfo->max_inodes = config.max_inodes; 2517 sbinfo->max_inodes = config.max_inodes;
2512 sbinfo->free_inodes = config.max_inodes - inodes; 2518 sbinfo->free_inodes = config.max_inodes - inodes;
2513 2519
2514 mpol_put(sbinfo->mpol); 2520 /*
2515 sbinfo->mpol = config.mpol; /* transfers initial ref */ 2521 * Preserve previous mempolicy unless mpol remount option was specified.
2522 */
2523 if (config.mpol) {
2524 mpol_put(sbinfo->mpol);
2525 sbinfo->mpol = config.mpol; /* transfers initial ref */
2526 }
2516out: 2527out:
2517 spin_unlock(&sbinfo->stat_lock); 2528 spin_unlock(&sbinfo->stat_lock);
2518 return error; 2529 return error;
@@ -2545,6 +2556,7 @@ static void shmem_put_super(struct super_block *sb)
2545 struct shmem_sb_info *sbinfo = SHMEM_SB(sb); 2556 struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
2546 2557
2547 percpu_counter_destroy(&sbinfo->used_blocks); 2558 percpu_counter_destroy(&sbinfo->used_blocks);
2559 mpol_put(sbinfo->mpol);
2548 kfree(sbinfo); 2560 kfree(sbinfo);
2549 sb->s_fs_info = NULL; 2561 sb->s_fs_info = NULL;
2550} 2562}
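
The shmem_parse_options() rework above parses an mpol= value into a local mempolicy and only hands it to sbinfo once the whole option string has been accepted; every error path drops the temporary, and shmem_put_super() now releases the superblock's policy as well, so nothing leaks on a bad mount string or at unmount. A rough user-space sketch of that parse-into-a-temporary, commit-only-on-success pattern (the toy struct policy, policy_parse() and parse_options() below are made up for illustration, not shmem or mempolicy APIs):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct policy { char mode[16]; };            /* stand-in for struct mempolicy */

static struct policy *policy_parse(const char *s)
{
        struct policy *p = malloc(sizeof(*p));

        if (!p || strlen(s) >= sizeof(p->mode)) {
                free(p);
                return NULL;
        }
        strcpy(p->mode, s);
        return p;
}

/* Parse "key=value,...": commit the policy only if everything parsed. */
static int parse_options(char *options, struct policy **committed)
{
        struct policy *tmp = NULL;
        char *opt;

        for (opt = strtok(options, ","); opt; opt = strtok(NULL, ",")) {
                if (!strncmp(opt, "mpol=", 5)) {
                        free(tmp);           /* a later mpol= replaces an earlier one */
                        tmp = policy_parse(opt + 5);
                        if (!tmp)
                                goto error;
                } else {
                        goto error;          /* unknown option */
                }
        }
        *committed = tmp;                    /* transfer the reference on success */
        return 0;
error:
        free(tmp);                           /* nothing leaks on failure */
        return 1;
}

int main(void)
{
        char opts[] = "mpol=interleave";
        struct policy *mpol = NULL;

        if (!parse_options(opts, &mpol))
                printf("mpol=%s\n", mpol ? mpol->mode : "none");
        free(mpol);
        return 0;
}

shmem_remount_fs() applies the same idea one level up: the freshly parsed policy only replaces sbinfo->mpol when an mpol= option was actually given, otherwise the previous policy is preserved.
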
diff --git a/mm/slab.c b/mm/slab.c
index e7667a3584bc..856e4a192d25 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -812,7 +812,7 @@ static void __slab_error(const char *function, struct kmem_cache *cachep,
812 printk(KERN_ERR "slab error in %s(): cache `%s': %s\n", 812 printk(KERN_ERR "slab error in %s(): cache `%s': %s\n",
813 function, cachep->name, msg); 813 function, cachep->name, msg);
814 dump_stack(); 814 dump_stack();
815 add_taint(TAINT_BAD_PAGE); 815 add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
816} 816}
817#endif 817#endif
818 818
diff --git a/mm/slob.c b/mm/slob.c
index a99fdf7a0907..eeed4a05a2ef 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -360,7 +360,7 @@ static void slob_free(void *block, int size)
360 clear_slob_page_free(sp); 360 clear_slob_page_free(sp);
361 spin_unlock_irqrestore(&slob_lock, flags); 361 spin_unlock_irqrestore(&slob_lock, flags);
362 __ClearPageSlab(sp); 362 __ClearPageSlab(sp);
363 reset_page_mapcount(sp); 363 page_mapcount_reset(sp);
364 slob_free_pages(b, 0); 364 slob_free_pages(b, 0);
365 return; 365 return;
366 } 366 }
diff --git a/mm/slub.c b/mm/slub.c
index ba2ca53f6c3a..4aec53705e4f 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -562,7 +562,7 @@ static void slab_bug(struct kmem_cache *s, char *fmt, ...)
562 printk(KERN_ERR "----------------------------------------" 562 printk(KERN_ERR "----------------------------------------"
563 "-------------------------------------\n\n"); 563 "-------------------------------------\n\n");
564 564
565 add_taint(TAINT_BAD_PAGE); 565 add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
566} 566}
567 567
568static void slab_fix(struct kmem_cache *s, char *fmt, ...) 568static void slab_fix(struct kmem_cache *s, char *fmt, ...)
@@ -1408,7 +1408,7 @@ static void __free_slab(struct kmem_cache *s, struct page *page)
1408 __ClearPageSlab(page); 1408 __ClearPageSlab(page);
1409 1409
1410 memcg_release_pages(s, order); 1410 memcg_release_pages(s, order);
1411 reset_page_mapcount(page); 1411 page_mapcount_reset(page);
1412 if (current->reclaim_state) 1412 if (current->reclaim_state)
1413 current->reclaim_state->reclaimed_slab += pages; 1413 current->reclaim_state->reclaimed_slab += pages;
1414 __free_memcg_kmem_pages(page, order); 1414 __free_memcg_kmem_pages(page, order);
diff --git a/mm/sparse.c b/mm/sparse.c
index 6b5fb762e2ca..7ca6dc847947 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -615,10 +615,11 @@ static inline struct page *kmalloc_section_memmap(unsigned long pnum, int nid,
615} 615}
616static void __kfree_section_memmap(struct page *memmap, unsigned long nr_pages) 616static void __kfree_section_memmap(struct page *memmap, unsigned long nr_pages)
617{ 617{
618 return; /* XXX: Not implemented yet */ 618 vmemmap_free(memmap, nr_pages);
619} 619}
620static void free_map_bootmem(struct page *memmap, unsigned long nr_pages) 620static void free_map_bootmem(struct page *memmap, unsigned long nr_pages)
621{ 621{
622 vmemmap_free(memmap, nr_pages);
622} 623}
623#else 624#else
624static struct page *__kmalloc_section_memmap(unsigned long nr_pages) 625static struct page *__kmalloc_section_memmap(unsigned long nr_pages)
@@ -697,7 +698,7 @@ static void free_section_usemap(struct page *memmap, unsigned long *usemap)
697 /* 698 /*
698 * Check to see if allocation came from hot-plug-add 699 * Check to see if allocation came from hot-plug-add
699 */ 700 */
700 if (PageSlab(usemap_page)) { 701 if (PageSlab(usemap_page) || PageCompound(usemap_page)) {
701 kfree(usemap); 702 kfree(usemap);
702 if (memmap) 703 if (memmap)
703 __kfree_section_memmap(memmap, PAGES_PER_SECTION); 704 __kfree_section_memmap(memmap, PAGES_PER_SECTION);
@@ -782,7 +783,7 @@ static void clear_hwpoisoned_pages(struct page *memmap, int nr_pages)
782 783
783 for (i = 0; i < PAGES_PER_SECTION; i++) { 784 for (i = 0; i < PAGES_PER_SECTION; i++) {
784 if (PageHWPoison(&memmap[i])) { 785 if (PageHWPoison(&memmap[i])) {
785 atomic_long_sub(1, &mce_bad_pages); 786 atomic_long_sub(1, &num_poisoned_pages);
786 ClearPageHWPoison(&memmap[i]); 787 ClearPageHWPoison(&memmap[i]);
787 } 788 }
788 } 789 }
@@ -796,8 +797,10 @@ static inline void clear_hwpoisoned_pages(struct page *memmap, int nr_pages)
796void sparse_remove_one_section(struct zone *zone, struct mem_section *ms) 797void sparse_remove_one_section(struct zone *zone, struct mem_section *ms)
797{ 798{
798 struct page *memmap = NULL; 799 struct page *memmap = NULL;
799 unsigned long *usemap = NULL; 800 unsigned long *usemap = NULL, flags;
801 struct pglist_data *pgdat = zone->zone_pgdat;
800 802
803 pgdat_resize_lock(pgdat, &flags);
801 if (ms->section_mem_map) { 804 if (ms->section_mem_map) {
802 usemap = ms->pageblock_flags; 805 usemap = ms->pageblock_flags;
803 memmap = sparse_decode_mem_map(ms->section_mem_map, 806 memmap = sparse_decode_mem_map(ms->section_mem_map,
@@ -805,6 +808,7 @@ void sparse_remove_one_section(struct zone *zone, struct mem_section *ms)
805 ms->section_mem_map = 0; 808 ms->section_mem_map = 0;
806 ms->pageblock_flags = NULL; 809 ms->pageblock_flags = NULL;
807 } 810 }
811 pgdat_resize_unlock(pgdat, &flags);
808 812
809 clear_hwpoisoned_pages(memmap, PAGES_PER_SECTION); 813 clear_hwpoisoned_pages(memmap, PAGES_PER_SECTION);
810 free_section_usemap(memmap, usemap); 814 free_section_usemap(memmap, usemap);
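
sparse_remove_one_section() above now detaches the section's memmap and usemap pointers while holding the node's pgdat resize lock and frees them only after dropping it, so concurrent walkers never see a half-torn-down section while the potentially heavyweight freeing stays outside the lock. A small pthread sketch of that unlink-under-the-lock, free-outside-it pattern (struct section and the resize_lock below are illustrative stand-ins, not the kernel types):

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct section {                     /* stand-in for struct mem_section */
        void *memmap;
        void *usemap;
};

static pthread_mutex_t resize_lock = PTHREAD_MUTEX_INITIALIZER;

static void remove_section(struct section *ms)
{
        void *memmap, *usemap;

        /* Detach the pointers while concurrent lookups are excluded. */
        pthread_mutex_lock(&resize_lock);
        memmap = ms->memmap;
        usemap = ms->usemap;
        ms->memmap = NULL;
        ms->usemap = NULL;
        pthread_mutex_unlock(&resize_lock);

        /* The actual freeing can be slow, so do it after unlocking. */
        free(memmap);
        free(usemap);
}

int main(void)
{
        struct section ms = { malloc(64), malloc(64) };

        remove_section(&ms);
        printf("memmap=%p usemap=%p after removal\n", ms.memmap, ms.usemap);
        return 0;
}
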
diff --git a/mm/swap.c b/mm/swap.c
index 6310dc2008ff..8a529a01e8fc 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -855,9 +855,14 @@ EXPORT_SYMBOL(pagevec_lookup_tag);
855void __init swap_setup(void) 855void __init swap_setup(void)
856{ 856{
857 unsigned long megs = totalram_pages >> (20 - PAGE_SHIFT); 857 unsigned long megs = totalram_pages >> (20 - PAGE_SHIFT);
858
859#ifdef CONFIG_SWAP 858#ifdef CONFIG_SWAP
860 bdi_init(swapper_space.backing_dev_info); 859 int i;
860
861 bdi_init(swapper_spaces[0].backing_dev_info);
862 for (i = 0; i < MAX_SWAPFILES; i++) {
863 spin_lock_init(&swapper_spaces[i].tree_lock);
864 INIT_LIST_HEAD(&swapper_spaces[i].i_mmap_nonlinear);
865 }
861#endif 866#endif
862 867
863 /* Use a smaller cluster for small-memory machines */ 868 /* Use a smaller cluster for small-memory machines */
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 0cb36fb1f61c..7efcf1525921 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -36,12 +36,12 @@ static struct backing_dev_info swap_backing_dev_info = {
36 .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_SWAP_BACKED, 36 .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_SWAP_BACKED,
37}; 37};
38 38
39struct address_space swapper_space = { 39struct address_space swapper_spaces[MAX_SWAPFILES] = {
40 .page_tree = RADIX_TREE_INIT(GFP_ATOMIC|__GFP_NOWARN), 40 [0 ... MAX_SWAPFILES - 1] = {
41 .tree_lock = __SPIN_LOCK_UNLOCKED(swapper_space.tree_lock), 41 .page_tree = RADIX_TREE_INIT(GFP_ATOMIC|__GFP_NOWARN),
42 .a_ops = &swap_aops, 42 .a_ops = &swap_aops,
43 .i_mmap_nonlinear = LIST_HEAD_INIT(swapper_space.i_mmap_nonlinear), 43 .backing_dev_info = &swap_backing_dev_info,
44 .backing_dev_info = &swap_backing_dev_info, 44 }
45}; 45};
46 46
47#define INC_CACHE_INFO(x) do { swap_cache_info.x++; } while (0) 47#define INC_CACHE_INFO(x) do { swap_cache_info.x++; } while (0)
@@ -53,13 +53,24 @@ static struct {
53 unsigned long find_total; 53 unsigned long find_total;
54} swap_cache_info; 54} swap_cache_info;
55 55
56unsigned long total_swapcache_pages(void)
57{
58 int i;
59 unsigned long ret = 0;
60
61 for (i = 0; i < MAX_SWAPFILES; i++)
62 ret += swapper_spaces[i].nrpages;
63 return ret;
64}
65
56void show_swap_cache_info(void) 66void show_swap_cache_info(void)
57{ 67{
58 printk("%lu pages in swap cache\n", total_swapcache_pages); 68 printk("%lu pages in swap cache\n", total_swapcache_pages());
59 printk("Swap cache stats: add %lu, delete %lu, find %lu/%lu\n", 69 printk("Swap cache stats: add %lu, delete %lu, find %lu/%lu\n",
60 swap_cache_info.add_total, swap_cache_info.del_total, 70 swap_cache_info.add_total, swap_cache_info.del_total,
61 swap_cache_info.find_success, swap_cache_info.find_total); 71 swap_cache_info.find_success, swap_cache_info.find_total);
62 printk("Free swap = %ldkB\n", nr_swap_pages << (PAGE_SHIFT - 10)); 72 printk("Free swap = %ldkB\n",
73 get_nr_swap_pages() << (PAGE_SHIFT - 10));
63 printk("Total swap = %lukB\n", total_swap_pages << (PAGE_SHIFT - 10)); 74 printk("Total swap = %lukB\n", total_swap_pages << (PAGE_SHIFT - 10));
64} 75}
65 76
@@ -70,6 +81,7 @@ void show_swap_cache_info(void)
70static int __add_to_swap_cache(struct page *page, swp_entry_t entry) 81static int __add_to_swap_cache(struct page *page, swp_entry_t entry)
71{ 82{
72 int error; 83 int error;
84 struct address_space *address_space;
73 85
74 VM_BUG_ON(!PageLocked(page)); 86 VM_BUG_ON(!PageLocked(page));
75 VM_BUG_ON(PageSwapCache(page)); 87 VM_BUG_ON(PageSwapCache(page));
@@ -79,14 +91,16 @@ static int __add_to_swap_cache(struct page *page, swp_entry_t entry)
79 SetPageSwapCache(page); 91 SetPageSwapCache(page);
80 set_page_private(page, entry.val); 92 set_page_private(page, entry.val);
81 93
82 spin_lock_irq(&swapper_space.tree_lock); 94 address_space = swap_address_space(entry);
83 error = radix_tree_insert(&swapper_space.page_tree, entry.val, page); 95 spin_lock_irq(&address_space->tree_lock);
96 error = radix_tree_insert(&address_space->page_tree,
97 entry.val, page);
84 if (likely(!error)) { 98 if (likely(!error)) {
85 total_swapcache_pages++; 99 address_space->nrpages++;
86 __inc_zone_page_state(page, NR_FILE_PAGES); 100 __inc_zone_page_state(page, NR_FILE_PAGES);
87 INC_CACHE_INFO(add_total); 101 INC_CACHE_INFO(add_total);
88 } 102 }
89 spin_unlock_irq(&swapper_space.tree_lock); 103 spin_unlock_irq(&address_space->tree_lock);
90 104
91 if (unlikely(error)) { 105 if (unlikely(error)) {
92 /* 106 /*
@@ -122,14 +136,19 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask)
122 */ 136 */
123void __delete_from_swap_cache(struct page *page) 137void __delete_from_swap_cache(struct page *page)
124{ 138{
139 swp_entry_t entry;
140 struct address_space *address_space;
141
125 VM_BUG_ON(!PageLocked(page)); 142 VM_BUG_ON(!PageLocked(page));
126 VM_BUG_ON(!PageSwapCache(page)); 143 VM_BUG_ON(!PageSwapCache(page));
127 VM_BUG_ON(PageWriteback(page)); 144 VM_BUG_ON(PageWriteback(page));
128 145
129 radix_tree_delete(&swapper_space.page_tree, page_private(page)); 146 entry.val = page_private(page);
147 address_space = swap_address_space(entry);
148 radix_tree_delete(&address_space->page_tree, page_private(page));
130 set_page_private(page, 0); 149 set_page_private(page, 0);
131 ClearPageSwapCache(page); 150 ClearPageSwapCache(page);
132 total_swapcache_pages--; 151 address_space->nrpages--;
133 __dec_zone_page_state(page, NR_FILE_PAGES); 152 __dec_zone_page_state(page, NR_FILE_PAGES);
134 INC_CACHE_INFO(del_total); 153 INC_CACHE_INFO(del_total);
135} 154}
@@ -195,12 +214,14 @@ int add_to_swap(struct page *page)
195void delete_from_swap_cache(struct page *page) 214void delete_from_swap_cache(struct page *page)
196{ 215{
197 swp_entry_t entry; 216 swp_entry_t entry;
217 struct address_space *address_space;
198 218
199 entry.val = page_private(page); 219 entry.val = page_private(page);
200 220
201 spin_lock_irq(&swapper_space.tree_lock); 221 address_space = swap_address_space(entry);
222 spin_lock_irq(&address_space->tree_lock);
202 __delete_from_swap_cache(page); 223 __delete_from_swap_cache(page);
203 spin_unlock_irq(&swapper_space.tree_lock); 224 spin_unlock_irq(&address_space->tree_lock);
204 225
205 swapcache_free(entry, page); 226 swapcache_free(entry, page);
206 page_cache_release(page); 227 page_cache_release(page);
@@ -263,7 +284,7 @@ struct page * lookup_swap_cache(swp_entry_t entry)
263{ 284{
264 struct page *page; 285 struct page *page;
265 286
266 page = find_get_page(&swapper_space, entry.val); 287 page = find_get_page(swap_address_space(entry), entry.val);
267 288
268 if (page) 289 if (page)
269 INC_CACHE_INFO(find_success); 290 INC_CACHE_INFO(find_success);
@@ -290,7 +311,8 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
290 * called after lookup_swap_cache() failed, re-calling 311 * called after lookup_swap_cache() failed, re-calling
291 * that would confuse statistics. 312 * that would confuse statistics.
292 */ 313 */
293 found_page = find_get_page(&swapper_space, entry.val); 314 found_page = find_get_page(swap_address_space(entry),
315 entry.val);
294 if (found_page) 316 if (found_page)
295 break; 317 break;
296 318
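
The swap_state.c changes above split the single swapper_space into one address space per swap file: swap_address_space(entry) picks the space from the entry's type, each space carries its own tree_lock and nrpages count, and total_swapcache_pages() sums nrpages over all of them, spreading swap-cache lock contention across devices. A self-contained sketch of that indexing idea, packing a type and an offset into one entry value (the bit split, the MAX_SWAPFILES value and struct space below are illustrative, not the kernel's swp_entry_t encoding):

#include <stdio.h>

#define MAX_SWAPFILES 32                 /* illustrative; not the kernel's value */
#define TYPE_SHIFT    24                 /* illustrative split of the entry bits */

struct space { unsigned long nrpages; }; /* stand-in for struct address_space */

static struct space swapper_spaces[MAX_SWAPFILES];

static unsigned long make_entry(unsigned int type, unsigned long offset)
{
        return ((unsigned long)type << TYPE_SHIFT) | offset;
}

static struct space *swap_space(unsigned long entry)
{
        return &swapper_spaces[entry >> TYPE_SHIFT];
}

static unsigned long total_swapcache_pages(void)
{
        unsigned long total = 0;
        int i;

        for (i = 0; i < MAX_SWAPFILES; i++)
                total += swapper_spaces[i].nrpages;
        return total;
}

int main(void)
{
        unsigned long e = make_entry(3, 1234);

        swap_space(e)->nrpages++;        /* "add to swap cache" */
        printf("entry type %lu, %lu page(s) in swap cache\n",
               e >> TYPE_SHIFT, total_swapcache_pages());
        swap_space(e)->nrpages--;        /* "delete from swap cache" */
        return 0;
}
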
diff --git a/mm/swapfile.c b/mm/swapfile.c
index e97a0e5aea91..c72c648f750c 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -47,9 +47,11 @@ static sector_t map_swap_entry(swp_entry_t, struct block_device**);
47 47
48DEFINE_SPINLOCK(swap_lock); 48DEFINE_SPINLOCK(swap_lock);
49static unsigned int nr_swapfiles; 49static unsigned int nr_swapfiles;
50long nr_swap_pages; 50atomic_long_t nr_swap_pages;
51/* protected with swap_lock. reading in vm_swap_full() doesn't need lock */
51long total_swap_pages; 52long total_swap_pages;
52static int least_priority; 53static int least_priority;
54static atomic_t highest_priority_index = ATOMIC_INIT(-1);
53 55
54static const char Bad_file[] = "Bad swap file entry "; 56static const char Bad_file[] = "Bad swap file entry ";
55static const char Unused_file[] = "Unused swap file entry "; 57static const char Unused_file[] = "Unused swap file entry ";
@@ -79,7 +81,7 @@ __try_to_reclaim_swap(struct swap_info_struct *si, unsigned long offset)
79 struct page *page; 81 struct page *page;
80 int ret = 0; 82 int ret = 0;
81 83
82 page = find_get_page(&swapper_space, entry.val); 84 page = find_get_page(swap_address_space(entry), entry.val);
83 if (!page) 85 if (!page)
84 return 0; 86 return 0;
85 /* 87 /*
@@ -223,7 +225,7 @@ static unsigned long scan_swap_map(struct swap_info_struct *si,
223 si->lowest_alloc = si->max; 225 si->lowest_alloc = si->max;
224 si->highest_alloc = 0; 226 si->highest_alloc = 0;
225 } 227 }
226 spin_unlock(&swap_lock); 228 spin_unlock(&si->lock);
227 229
228 /* 230 /*
229 * If seek is expensive, start searching for new cluster from 231 * If seek is expensive, start searching for new cluster from
@@ -242,7 +244,7 @@ static unsigned long scan_swap_map(struct swap_info_struct *si,
242 if (si->swap_map[offset]) 244 if (si->swap_map[offset])
243 last_in_cluster = offset + SWAPFILE_CLUSTER; 245 last_in_cluster = offset + SWAPFILE_CLUSTER;
244 else if (offset == last_in_cluster) { 246 else if (offset == last_in_cluster) {
245 spin_lock(&swap_lock); 247 spin_lock(&si->lock);
246 offset -= SWAPFILE_CLUSTER - 1; 248 offset -= SWAPFILE_CLUSTER - 1;
247 si->cluster_next = offset; 249 si->cluster_next = offset;
248 si->cluster_nr = SWAPFILE_CLUSTER - 1; 250 si->cluster_nr = SWAPFILE_CLUSTER - 1;
@@ -263,7 +265,7 @@ static unsigned long scan_swap_map(struct swap_info_struct *si,
263 if (si->swap_map[offset]) 265 if (si->swap_map[offset])
264 last_in_cluster = offset + SWAPFILE_CLUSTER; 266 last_in_cluster = offset + SWAPFILE_CLUSTER;
265 else if (offset == last_in_cluster) { 267 else if (offset == last_in_cluster) {
266 spin_lock(&swap_lock); 268 spin_lock(&si->lock);
267 offset -= SWAPFILE_CLUSTER - 1; 269 offset -= SWAPFILE_CLUSTER - 1;
268 si->cluster_next = offset; 270 si->cluster_next = offset;
269 si->cluster_nr = SWAPFILE_CLUSTER - 1; 271 si->cluster_nr = SWAPFILE_CLUSTER - 1;
@@ -277,7 +279,7 @@ static unsigned long scan_swap_map(struct swap_info_struct *si,
277 } 279 }
278 280
279 offset = scan_base; 281 offset = scan_base;
280 spin_lock(&swap_lock); 282 spin_lock(&si->lock);
281 si->cluster_nr = SWAPFILE_CLUSTER - 1; 283 si->cluster_nr = SWAPFILE_CLUSTER - 1;
282 si->lowest_alloc = 0; 284 si->lowest_alloc = 0;
283 } 285 }
@@ -293,9 +295,9 @@ checks:
293 /* reuse swap entry of cache-only swap if not busy. */ 295 /* reuse swap entry of cache-only swap if not busy. */
294 if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) { 296 if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
295 int swap_was_freed; 297 int swap_was_freed;
296 spin_unlock(&swap_lock); 298 spin_unlock(&si->lock);
297 swap_was_freed = __try_to_reclaim_swap(si, offset); 299 swap_was_freed = __try_to_reclaim_swap(si, offset);
298 spin_lock(&swap_lock); 300 spin_lock(&si->lock);
299 /* entry was freed successfully, try to use this again */ 301 /* entry was freed successfully, try to use this again */
300 if (swap_was_freed) 302 if (swap_was_freed)
301 goto checks; 303 goto checks;
@@ -335,13 +337,13 @@ checks:
335 si->lowest_alloc <= last_in_cluster) 337 si->lowest_alloc <= last_in_cluster)
336 last_in_cluster = si->lowest_alloc - 1; 338 last_in_cluster = si->lowest_alloc - 1;
337 si->flags |= SWP_DISCARDING; 339 si->flags |= SWP_DISCARDING;
338 spin_unlock(&swap_lock); 340 spin_unlock(&si->lock);
339 341
340 if (offset < last_in_cluster) 342 if (offset < last_in_cluster)
341 discard_swap_cluster(si, offset, 343 discard_swap_cluster(si, offset,
342 last_in_cluster - offset + 1); 344 last_in_cluster - offset + 1);
343 345
344 spin_lock(&swap_lock); 346 spin_lock(&si->lock);
345 si->lowest_alloc = 0; 347 si->lowest_alloc = 0;
346 si->flags &= ~SWP_DISCARDING; 348 si->flags &= ~SWP_DISCARDING;
347 349
@@ -355,10 +357,10 @@ checks:
355 * could defer that delay until swap_writepage, 357 * could defer that delay until swap_writepage,
356 * but it's easier to keep this self-contained. 358 * but it's easier to keep this self-contained.
357 */ 359 */
358 spin_unlock(&swap_lock); 360 spin_unlock(&si->lock);
359 wait_on_bit(&si->flags, ilog2(SWP_DISCARDING), 361 wait_on_bit(&si->flags, ilog2(SWP_DISCARDING),
360 wait_for_discard, TASK_UNINTERRUPTIBLE); 362 wait_for_discard, TASK_UNINTERRUPTIBLE);
361 spin_lock(&swap_lock); 363 spin_lock(&si->lock);
362 } else { 364 } else {
363 /* 365 /*
364 * Note pages allocated by racing tasks while 366 * Note pages allocated by racing tasks while
@@ -374,14 +376,14 @@ checks:
374 return offset; 376 return offset;
375 377
376scan: 378scan:
377 spin_unlock(&swap_lock); 379 spin_unlock(&si->lock);
378 while (++offset <= si->highest_bit) { 380 while (++offset <= si->highest_bit) {
379 if (!si->swap_map[offset]) { 381 if (!si->swap_map[offset]) {
380 spin_lock(&swap_lock); 382 spin_lock(&si->lock);
381 goto checks; 383 goto checks;
382 } 384 }
383 if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) { 385 if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
384 spin_lock(&swap_lock); 386 spin_lock(&si->lock);
385 goto checks; 387 goto checks;
386 } 388 }
387 if (unlikely(--latency_ration < 0)) { 389 if (unlikely(--latency_ration < 0)) {
@@ -392,11 +394,11 @@ scan:
392 offset = si->lowest_bit; 394 offset = si->lowest_bit;
393 while (++offset < scan_base) { 395 while (++offset < scan_base) {
394 if (!si->swap_map[offset]) { 396 if (!si->swap_map[offset]) {
395 spin_lock(&swap_lock); 397 spin_lock(&si->lock);
396 goto checks; 398 goto checks;
397 } 399 }
398 if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) { 400 if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
399 spin_lock(&swap_lock); 401 spin_lock(&si->lock);
400 goto checks; 402 goto checks;
401 } 403 }
402 if (unlikely(--latency_ration < 0)) { 404 if (unlikely(--latency_ration < 0)) {
@@ -404,7 +406,7 @@ scan:
404 latency_ration = LATENCY_LIMIT; 406 latency_ration = LATENCY_LIMIT;
405 } 407 }
406 } 408 }
407 spin_lock(&swap_lock); 409 spin_lock(&si->lock);
408 410
409no_page: 411no_page:
410 si->flags -= SWP_SCANNING; 412 si->flags -= SWP_SCANNING;
@@ -417,13 +419,34 @@ swp_entry_t get_swap_page(void)
417 pgoff_t offset; 419 pgoff_t offset;
418 int type, next; 420 int type, next;
419 int wrapped = 0; 421 int wrapped = 0;
422 int hp_index;
420 423
421 spin_lock(&swap_lock); 424 spin_lock(&swap_lock);
422 if (nr_swap_pages <= 0) 425 if (atomic_long_read(&nr_swap_pages) <= 0)
423 goto noswap; 426 goto noswap;
424 nr_swap_pages--; 427 atomic_long_dec(&nr_swap_pages);
425 428
426 for (type = swap_list.next; type >= 0 && wrapped < 2; type = next) { 429 for (type = swap_list.next; type >= 0 && wrapped < 2; type = next) {
430 hp_index = atomic_xchg(&highest_priority_index, -1);
431 /*
432 * highest_priority_index records current highest priority swap
433 * type which just frees swap entries. If its priority is
434 * higher than that of swap_list.next swap type, we use it. It
435 * isn't protected by swap_lock, so it can be an invalid value
436 * if the corresponding swap type is swapoff. We double check
437 * the flags here. It's even possible the swap type is swapoff
438 * and swapon again and its priority is changed. In such a rare
439 * case, a low priority swap type might be used, but eventually
440 * high priority swap will be used after several rounds of
441 * swap.
442 */
443 if (hp_index != -1 && hp_index != type &&
444 swap_info[type]->prio < swap_info[hp_index]->prio &&
445 (swap_info[hp_index]->flags & SWP_WRITEOK)) {
446 type = hp_index;
447 swap_list.next = type;
448 }
449
427 si = swap_info[type]; 450 si = swap_info[type];
428 next = si->next; 451 next = si->next;
429 if (next < 0 || 452 if (next < 0 ||
@@ -432,22 +455,29 @@ swp_entry_t get_swap_page(void)
432 wrapped++; 455 wrapped++;
433 } 456 }
434 457
435 if (!si->highest_bit) 458 spin_lock(&si->lock);
459 if (!si->highest_bit) {
460 spin_unlock(&si->lock);
436 continue; 461 continue;
437 if (!(si->flags & SWP_WRITEOK)) 462 }
463 if (!(si->flags & SWP_WRITEOK)) {
464 spin_unlock(&si->lock);
438 continue; 465 continue;
466 }
439 467
440 swap_list.next = next; 468 swap_list.next = next;
469
470 spin_unlock(&swap_lock);
441 /* This is called for allocating swap entry for cache */ 471 /* This is called for allocating swap entry for cache */
442 offset = scan_swap_map(si, SWAP_HAS_CACHE); 472 offset = scan_swap_map(si, SWAP_HAS_CACHE);
443 if (offset) { 473 spin_unlock(&si->lock);
444 spin_unlock(&swap_lock); 474 if (offset)
445 return swp_entry(type, offset); 475 return swp_entry(type, offset);
446 } 476 spin_lock(&swap_lock);
447 next = swap_list.next; 477 next = swap_list.next;
448 } 478 }
449 479
450 nr_swap_pages++; 480 atomic_long_inc(&nr_swap_pages);
451noswap: 481noswap:
452 spin_unlock(&swap_lock); 482 spin_unlock(&swap_lock);
453 return (swp_entry_t) {0}; 483 return (swp_entry_t) {0};
@@ -459,19 +489,19 @@ swp_entry_t get_swap_page_of_type(int type)
459 struct swap_info_struct *si; 489 struct swap_info_struct *si;
460 pgoff_t offset; 490 pgoff_t offset;
461 491
462 spin_lock(&swap_lock);
463 si = swap_info[type]; 492 si = swap_info[type];
493 spin_lock(&si->lock);
464 if (si && (si->flags & SWP_WRITEOK)) { 494 if (si && (si->flags & SWP_WRITEOK)) {
465 nr_swap_pages--; 495 atomic_long_dec(&nr_swap_pages);
466 /* This is called for allocating swap entry, not cache */ 496 /* This is called for allocating swap entry, not cache */
467 offset = scan_swap_map(si, 1); 497 offset = scan_swap_map(si, 1);
468 if (offset) { 498 if (offset) {
469 spin_unlock(&swap_lock); 499 spin_unlock(&si->lock);
470 return swp_entry(type, offset); 500 return swp_entry(type, offset);
471 } 501 }
472 nr_swap_pages++; 502 atomic_long_inc(&nr_swap_pages);
473 } 503 }
474 spin_unlock(&swap_lock); 504 spin_unlock(&si->lock);
475 return (swp_entry_t) {0}; 505 return (swp_entry_t) {0};
476} 506}
477 507
@@ -493,7 +523,7 @@ static struct swap_info_struct *swap_info_get(swp_entry_t entry)
493 goto bad_offset; 523 goto bad_offset;
494 if (!p->swap_map[offset]) 524 if (!p->swap_map[offset])
495 goto bad_free; 525 goto bad_free;
496 spin_lock(&swap_lock); 526 spin_lock(&p->lock);
497 return p; 527 return p;
498 528
499bad_free: 529bad_free:
@@ -511,6 +541,27 @@ out:
511 return NULL; 541 return NULL;
512} 542}
513 543
544/*
545 * This swap type frees swap entry, check if it is the highest priority swap
546 * type which just frees swap entry. get_swap_page() uses
547 * highest_priority_index to search highest priority swap type. The
548 * swap_info_struct.lock can't protect us if there are multiple swap types
549 * active, so we use atomic_cmpxchg.
550 */
551static void set_highest_priority_index(int type)
552{
553 int old_hp_index, new_hp_index;
554
555 do {
556 old_hp_index = atomic_read(&highest_priority_index);
557 if (old_hp_index != -1 &&
558 swap_info[old_hp_index]->prio >= swap_info[type]->prio)
559 break;
560 new_hp_index = type;
561 } while (atomic_cmpxchg(&highest_priority_index,
562 old_hp_index, new_hp_index) != old_hp_index);
563}
564
514static unsigned char swap_entry_free(struct swap_info_struct *p, 565static unsigned char swap_entry_free(struct swap_info_struct *p,
515 swp_entry_t entry, unsigned char usage) 566 swp_entry_t entry, unsigned char usage)
516{ 567{
@@ -553,10 +604,8 @@ static unsigned char swap_entry_free(struct swap_info_struct *p,
553 p->lowest_bit = offset; 604 p->lowest_bit = offset;
554 if (offset > p->highest_bit) 605 if (offset > p->highest_bit)
555 p->highest_bit = offset; 606 p->highest_bit = offset;
556 if (swap_list.next >= 0 && 607 set_highest_priority_index(p->type);
557 p->prio > swap_info[swap_list.next]->prio) 608 atomic_long_inc(&nr_swap_pages);
558 swap_list.next = p->type;
559 nr_swap_pages++;
560 p->inuse_pages--; 609 p->inuse_pages--;
561 frontswap_invalidate_page(p->type, offset); 610 frontswap_invalidate_page(p->type, offset);
562 if (p->flags & SWP_BLKDEV) { 611 if (p->flags & SWP_BLKDEV) {
@@ -581,7 +630,7 @@ void swap_free(swp_entry_t entry)
581 p = swap_info_get(entry); 630 p = swap_info_get(entry);
582 if (p) { 631 if (p) {
583 swap_entry_free(p, entry, 1); 632 swap_entry_free(p, entry, 1);
584 spin_unlock(&swap_lock); 633 spin_unlock(&p->lock);
585 } 634 }
586} 635}
587 636
@@ -598,7 +647,7 @@ void swapcache_free(swp_entry_t entry, struct page *page)
598 count = swap_entry_free(p, entry, SWAP_HAS_CACHE); 647 count = swap_entry_free(p, entry, SWAP_HAS_CACHE);
599 if (page) 648 if (page)
600 mem_cgroup_uncharge_swapcache(page, entry, count != 0); 649 mem_cgroup_uncharge_swapcache(page, entry, count != 0);
601 spin_unlock(&swap_lock); 650 spin_unlock(&p->lock);
602 } 651 }
603} 652}
604 653
@@ -617,7 +666,7 @@ int page_swapcount(struct page *page)
617 p = swap_info_get(entry); 666 p = swap_info_get(entry);
618 if (p) { 667 if (p) {
619 count = swap_count(p->swap_map[swp_offset(entry)]); 668 count = swap_count(p->swap_map[swp_offset(entry)]);
620 spin_unlock(&swap_lock); 669 spin_unlock(&p->lock);
621 } 670 }
622 return count; 671 return count;
623} 672}
@@ -699,13 +748,14 @@ int free_swap_and_cache(swp_entry_t entry)
699 p = swap_info_get(entry); 748 p = swap_info_get(entry);
700 if (p) { 749 if (p) {
701 if (swap_entry_free(p, entry, 1) == SWAP_HAS_CACHE) { 750 if (swap_entry_free(p, entry, 1) == SWAP_HAS_CACHE) {
702 page = find_get_page(&swapper_space, entry.val); 751 page = find_get_page(swap_address_space(entry),
752 entry.val);
703 if (page && !trylock_page(page)) { 753 if (page && !trylock_page(page)) {
704 page_cache_release(page); 754 page_cache_release(page);
705 page = NULL; 755 page = NULL;
706 } 756 }
707 } 757 }
708 spin_unlock(&swap_lock); 758 spin_unlock(&p->lock);
709 } 759 }
710 if (page) { 760 if (page) {
711 /* 761 /*
@@ -803,11 +853,13 @@ unsigned int count_swap_pages(int type, int free)
803 if ((unsigned int)type < nr_swapfiles) { 853 if ((unsigned int)type < nr_swapfiles) {
804 struct swap_info_struct *sis = swap_info[type]; 854 struct swap_info_struct *sis = swap_info[type];
805 855
856 spin_lock(&sis->lock);
806 if (sis->flags & SWP_WRITEOK) { 857 if (sis->flags & SWP_WRITEOK) {
807 n = sis->pages; 858 n = sis->pages;
808 if (free) 859 if (free)
809 n -= sis->inuse_pages; 860 n -= sis->inuse_pages;
810 } 861 }
862 spin_unlock(&sis->lock);
811 } 863 }
812 spin_unlock(&swap_lock); 864 spin_unlock(&swap_lock);
813 return n; 865 return n;
@@ -822,11 +874,17 @@ unsigned int count_swap_pages(int type, int free)
822static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, 874static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
823 unsigned long addr, swp_entry_t entry, struct page *page) 875 unsigned long addr, swp_entry_t entry, struct page *page)
824{ 876{
877 struct page *swapcache;
825 struct mem_cgroup *memcg; 878 struct mem_cgroup *memcg;
826 spinlock_t *ptl; 879 spinlock_t *ptl;
827 pte_t *pte; 880 pte_t *pte;
828 int ret = 1; 881 int ret = 1;
829 882
883 swapcache = page;
884 page = ksm_might_need_to_copy(page, vma, addr);
885 if (unlikely(!page))
886 return -ENOMEM;
887
830 if (mem_cgroup_try_charge_swapin(vma->vm_mm, page, 888 if (mem_cgroup_try_charge_swapin(vma->vm_mm, page,
831 GFP_KERNEL, &memcg)) { 889 GFP_KERNEL, &memcg)) {
832 ret = -ENOMEM; 890 ret = -ENOMEM;
@@ -845,7 +903,10 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
845 get_page(page); 903 get_page(page);
846 set_pte_at(vma->vm_mm, addr, pte, 904 set_pte_at(vma->vm_mm, addr, pte,
847 pte_mkold(mk_pte(page, vma->vm_page_prot))); 905 pte_mkold(mk_pte(page, vma->vm_page_prot)));
848 page_add_anon_rmap(page, vma, addr); 906 if (page == swapcache)
907 page_add_anon_rmap(page, vma, addr);
908 else /* ksm created a completely new copy */
909 page_add_new_anon_rmap(page, vma, addr);
849 mem_cgroup_commit_charge_swapin(page, memcg); 910 mem_cgroup_commit_charge_swapin(page, memcg);
850 swap_free(entry); 911 swap_free(entry);
851 /* 912 /*
@@ -856,6 +917,10 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
856out: 917out:
857 pte_unmap_unlock(pte, ptl); 918 pte_unmap_unlock(pte, ptl);
858out_nolock: 919out_nolock:
920 if (page != swapcache) {
921 unlock_page(page);
922 put_page(page);
923 }
859 return ret; 924 return ret;
860} 925}
861 926
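
In the unuse_pte() hunk above, the page taken from the swap cache may be a KSM page that cannot simply be mapped back into this address space, so ksm_might_need_to_copy() may hand back a freshly allocated private copy; whether the original or the copy ends up mapped then decides between page_add_anon_rmap() and page_add_new_anon_rmap(), and a copy is unlocked and released on the way out. A loose user-space analogue of "use the shared buffer, or substitute a private copy first" (the refcount test below is a made-up stand-in for the real KSM check):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct buf {
        int refs;                 /* >1 means shared, like a KSM page */
        char data[64];
};

/* Return the buffer itself if it is private, otherwise a private copy. */
static struct buf *might_need_to_copy(struct buf *b)
{
        struct buf *copy;

        if (b->refs == 1)
                return b;
        copy = malloc(sizeof(*copy));
        if (!copy)
                return NULL;
        memcpy(copy->data, b->data, sizeof(copy->data));
        copy->refs = 1;
        return copy;
}

int main(void)
{
        struct buf shared = { .refs = 2, .data = "shared contents" };
        struct buf *page = might_need_to_copy(&shared);

        if (!page)
                return 1;                                  /* -ENOMEM in the kernel version */
        if (page == &shared)
                printf("mapping the existing buffer\n");   /* page_add_anon_rmap case */
        else
                printf("mapping a new private copy\n");    /* page_add_new_anon_rmap case */
        if (page != &shared)
                free(page);                                /* mirror of the unlock/put on the copy */
        return 0;
}
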
@@ -1456,7 +1521,7 @@ static void _enable_swap_info(struct swap_info_struct *p, int prio,
1456 p->swap_map = swap_map; 1521 p->swap_map = swap_map;
1457 frontswap_map_set(p, frontswap_map); 1522 frontswap_map_set(p, frontswap_map);
1458 p->flags |= SWP_WRITEOK; 1523 p->flags |= SWP_WRITEOK;
1459 nr_swap_pages += p->pages; 1524 atomic_long_add(p->pages, &nr_swap_pages);
1460 total_swap_pages += p->pages; 1525 total_swap_pages += p->pages;
1461 1526
1462 /* insert swap space into swap_list: */ 1527 /* insert swap space into swap_list: */
@@ -1478,15 +1543,19 @@ static void enable_swap_info(struct swap_info_struct *p, int prio,
1478 unsigned long *frontswap_map) 1543 unsigned long *frontswap_map)
1479{ 1544{
1480 spin_lock(&swap_lock); 1545 spin_lock(&swap_lock);
1546 spin_lock(&p->lock);
1481 _enable_swap_info(p, prio, swap_map, frontswap_map); 1547 _enable_swap_info(p, prio, swap_map, frontswap_map);
1482 frontswap_init(p->type); 1548 frontswap_init(p->type);
1549 spin_unlock(&p->lock);
1483 spin_unlock(&swap_lock); 1550 spin_unlock(&swap_lock);
1484} 1551}
1485 1552
1486static void reinsert_swap_info(struct swap_info_struct *p) 1553static void reinsert_swap_info(struct swap_info_struct *p)
1487{ 1554{
1488 spin_lock(&swap_lock); 1555 spin_lock(&swap_lock);
1556 spin_lock(&p->lock);
1489 _enable_swap_info(p, p->prio, p->swap_map, frontswap_map_get(p)); 1557 _enable_swap_info(p, p->prio, p->swap_map, frontswap_map_get(p));
1558 spin_unlock(&p->lock);
1490 spin_unlock(&swap_lock); 1559 spin_unlock(&swap_lock);
1491} 1560}
1492 1561
@@ -1546,14 +1615,16 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
1546 /* just pick something that's safe... */ 1615 /* just pick something that's safe... */
1547 swap_list.next = swap_list.head; 1616 swap_list.next = swap_list.head;
1548 } 1617 }
1618 spin_lock(&p->lock);
1549 if (p->prio < 0) { 1619 if (p->prio < 0) {
1550 for (i = p->next; i >= 0; i = swap_info[i]->next) 1620 for (i = p->next; i >= 0; i = swap_info[i]->next)
1551 swap_info[i]->prio = p->prio--; 1621 swap_info[i]->prio = p->prio--;
1552 least_priority++; 1622 least_priority++;
1553 } 1623 }
1554 nr_swap_pages -= p->pages; 1624 atomic_long_sub(p->pages, &nr_swap_pages);
1555 total_swap_pages -= p->pages; 1625 total_swap_pages -= p->pages;
1556 p->flags &= ~SWP_WRITEOK; 1626 p->flags &= ~SWP_WRITEOK;
1627 spin_unlock(&p->lock);
1557 spin_unlock(&swap_lock); 1628 spin_unlock(&swap_lock);
1558 1629
1559 set_current_oom_origin(); 1630 set_current_oom_origin();
@@ -1572,14 +1643,17 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
1572 1643
1573 mutex_lock(&swapon_mutex); 1644 mutex_lock(&swapon_mutex);
1574 spin_lock(&swap_lock); 1645 spin_lock(&swap_lock);
1646 spin_lock(&p->lock);
1575 drain_mmlist(); 1647 drain_mmlist();
1576 1648
1577 /* wait for anyone still in scan_swap_map */ 1649 /* wait for anyone still in scan_swap_map */
1578 p->highest_bit = 0; /* cuts scans short */ 1650 p->highest_bit = 0; /* cuts scans short */
1579 while (p->flags >= SWP_SCANNING) { 1651 while (p->flags >= SWP_SCANNING) {
1652 spin_unlock(&p->lock);
1580 spin_unlock(&swap_lock); 1653 spin_unlock(&swap_lock);
1581 schedule_timeout_uninterruptible(1); 1654 schedule_timeout_uninterruptible(1);
1582 spin_lock(&swap_lock); 1655 spin_lock(&swap_lock);
1656 spin_lock(&p->lock);
1583 } 1657 }
1584 1658
1585 swap_file = p->swap_file; 1659 swap_file = p->swap_file;
@@ -1589,6 +1663,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
1589 p->swap_map = NULL; 1663 p->swap_map = NULL;
1590 p->flags = 0; 1664 p->flags = 0;
1591 frontswap_invalidate_area(type); 1665 frontswap_invalidate_area(type);
1666 spin_unlock(&p->lock);
1592 spin_unlock(&swap_lock); 1667 spin_unlock(&swap_lock);
1593 mutex_unlock(&swapon_mutex); 1668 mutex_unlock(&swapon_mutex);
1594 vfree(swap_map); 1669 vfree(swap_map);
@@ -1794,6 +1869,7 @@ static struct swap_info_struct *alloc_swap_info(void)
1794 p->flags = SWP_USED; 1869 p->flags = SWP_USED;
1795 p->next = -1; 1870 p->next = -1;
1796 spin_unlock(&swap_lock); 1871 spin_unlock(&swap_lock);
1872 spin_lock_init(&p->lock);
1797 1873
1798 return p; 1874 return p;
1799} 1875}
@@ -2116,7 +2192,7 @@ void si_swapinfo(struct sysinfo *val)
2116 if ((si->flags & SWP_USED) && !(si->flags & SWP_WRITEOK)) 2192 if ((si->flags & SWP_USED) && !(si->flags & SWP_WRITEOK))
2117 nr_to_be_unused += si->inuse_pages; 2193 nr_to_be_unused += si->inuse_pages;
2118 } 2194 }
2119 val->freeswap = nr_swap_pages + nr_to_be_unused; 2195 val->freeswap = atomic_long_read(&nr_swap_pages) + nr_to_be_unused;
2120 val->totalswap = total_swap_pages + nr_to_be_unused; 2196 val->totalswap = total_swap_pages + nr_to_be_unused;
2121 spin_unlock(&swap_lock); 2197 spin_unlock(&swap_lock);
2122} 2198}
@@ -2149,7 +2225,7 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage)
2149 p = swap_info[type]; 2225 p = swap_info[type];
2150 offset = swp_offset(entry); 2226 offset = swp_offset(entry);
2151 2227
2152 spin_lock(&swap_lock); 2228 spin_lock(&p->lock);
2153 if (unlikely(offset >= p->max)) 2229 if (unlikely(offset >= p->max))
2154 goto unlock_out; 2230 goto unlock_out;
2155 2231
@@ -2184,7 +2260,7 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage)
2184 p->swap_map[offset] = count | has_cache; 2260 p->swap_map[offset] = count | has_cache;
2185 2261
2186unlock_out: 2262unlock_out:
2187 spin_unlock(&swap_lock); 2263 spin_unlock(&p->lock);
2188out: 2264out:
2189 return err; 2265 return err;
2190 2266
@@ -2309,7 +2385,7 @@ int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask)
2309 } 2385 }
2310 2386
2311 if (!page) { 2387 if (!page) {
2312 spin_unlock(&swap_lock); 2388 spin_unlock(&si->lock);
2313 return -ENOMEM; 2389 return -ENOMEM;
2314 } 2390 }
2315 2391
@@ -2357,7 +2433,7 @@ int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask)
2357 list_add_tail(&page->lru, &head->lru); 2433 list_add_tail(&page->lru, &head->lru);
2358 page = NULL; /* now it's attached, don't free it */ 2434 page = NULL; /* now it's attached, don't free it */
2359out: 2435out:
2360 spin_unlock(&swap_lock); 2436 spin_unlock(&si->lock);
2361outer: 2437outer:
2362 if (page) 2438 if (page)
2363 __free_page(page); 2439 __free_page(page);
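
The swapfile.c hunks above move most per-device state from the global swap_lock under a new per-device swap_info_struct lock and keep a lock-free hint, highest_priority_index, which swap_entry_free() updates with a cmpxchg loop so get_swap_page() can steer back to a higher-priority device that has just freed entries; the hint is only advisory and is re-validated under the proper locks, as the comment in get_swap_page() explains. A minimal C11 sketch of that compare-and-swap update (the four-entry priority table is made up for illustration):

#include <stdatomic.h>
#include <stdio.h>

static int prio[4] = { 10, 5, 20, 1 };          /* made-up per-device priorities */
static atomic_int highest_priority_index = -1;  /* -1: no hint recorded yet */

/* Record @type as the hint unless a higher-priority device is already there. */
static void set_highest_priority_index(int type)
{
        int old;

        do {
                old = atomic_load(&highest_priority_index);
                if (old != -1 && prio[old] >= prio[type])
                        break;          /* existing hint is at least as good */
        } while (!atomic_compare_exchange_weak(&highest_priority_index,
                                               &old, type));
}

int main(void)
{
        set_highest_priority_index(1);  /* prio 5  -> recorded */
        set_highest_priority_index(2);  /* prio 20 -> replaces the hint */
        set_highest_priority_index(0);  /* prio 10 -> ignored, 20 is better */
        printf("hint = %d\n", atomic_load(&highest_priority_index));
        return 0;
}
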
diff --git a/mm/util.c b/mm/util.c
index c55e26b17d93..ab1424dbe2e6 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -5,6 +5,8 @@
5#include <linux/err.h> 5#include <linux/err.h>
6#include <linux/sched.h> 6#include <linux/sched.h>
7#include <linux/security.h> 7#include <linux/security.h>
8#include <linux/swap.h>
9#include <linux/swapops.h>
8#include <asm/uaccess.h> 10#include <asm/uaccess.h>
9 11
10#include "internal.h" 12#include "internal.h"
@@ -355,12 +357,16 @@ unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr,
355{ 357{
356 unsigned long ret; 358 unsigned long ret;
357 struct mm_struct *mm = current->mm; 359 struct mm_struct *mm = current->mm;
360 unsigned long populate;
358 361
359 ret = security_mmap_file(file, prot, flag); 362 ret = security_mmap_file(file, prot, flag);
360 if (!ret) { 363 if (!ret) {
361 down_write(&mm->mmap_sem); 364 down_write(&mm->mmap_sem);
362 ret = do_mmap_pgoff(file, addr, len, prot, flag, pgoff); 365 ret = do_mmap_pgoff(file, addr, len, prot, flag, pgoff,
366 &populate);
363 up_write(&mm->mmap_sem); 367 up_write(&mm->mmap_sem);
368 if (populate)
369 mm_populate(ret, populate);
364 } 370 }
365 return ret; 371 return ret;
366} 372}
@@ -378,6 +384,24 @@ unsigned long vm_mmap(struct file *file, unsigned long addr,
378} 384}
379EXPORT_SYMBOL(vm_mmap); 385EXPORT_SYMBOL(vm_mmap);
380 386
387struct address_space *page_mapping(struct page *page)
388{
389 struct address_space *mapping = page->mapping;
390
391 VM_BUG_ON(PageSlab(page));
392#ifdef CONFIG_SWAP
393 if (unlikely(PageSwapCache(page))) {
394 swp_entry_t entry;
395
396 entry.val = page_private(page);
397 mapping = swap_address_space(entry);
398 } else
399#endif
400 if ((unsigned long)mapping & PAGE_MAPPING_ANON)
401 mapping = NULL;
402 return mapping;
403}
404
381/* Tracepoints definitions. */ 405/* Tracepoints definitions. */
382EXPORT_TRACEPOINT_SYMBOL(kmalloc); 406EXPORT_TRACEPOINT_SYMBOL(kmalloc);
383EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc); 407EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc);
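
The new page_mapping() helper in util.c above picks the right address space for a page: swap-cache pages are routed to their per-type swap address space, anonymous pages, whose mapping pointer carries the PAGE_MAPPING_ANON tag in its low bit, report no mapping at all, and everything else returns page->mapping unchanged. A tiny sketch of just the tagged-pointer test (struct page and the flag value below are simplified stand-ins):

#include <stdio.h>
#include <stdint.h>

#define MAPPING_ANON 0x1UL     /* low bit marks an anonymous mapping */

struct page { uintptr_t mapping; };

/* NULL for anonymous pages, the untagged mapping pointer otherwise. */
static void *page_mapping(struct page *page)
{
        if (page->mapping & MAPPING_ANON)
                return NULL;
        return (void *)page->mapping;
}

int main(void)
{
        long backing;                                       /* stand-in object */
        struct page file_page = { (uintptr_t)&backing };
        struct page anon_page = { (uintptr_t)&backing | MAPPING_ANON };

        printf("file-backed: %p\n", page_mapping(&file_page));
        printf("anonymous:   %p\n", page_mapping(&anon_page));
        return 0;
}
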
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 5123a169ab7b..0f751f2068c3 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -1376,8 +1376,8 @@ static struct vm_struct *__get_vm_area_node(unsigned long size,
1376struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags, 1376struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags,
1377 unsigned long start, unsigned long end) 1377 unsigned long start, unsigned long end)
1378{ 1378{
1379 return __get_vm_area_node(size, 1, flags, start, end, -1, GFP_KERNEL, 1379 return __get_vm_area_node(size, 1, flags, start, end, NUMA_NO_NODE,
1380 __builtin_return_address(0)); 1380 GFP_KERNEL, __builtin_return_address(0));
1381} 1381}
1382EXPORT_SYMBOL_GPL(__get_vm_area); 1382EXPORT_SYMBOL_GPL(__get_vm_area);
1383 1383
@@ -1385,8 +1385,8 @@ struct vm_struct *__get_vm_area_caller(unsigned long size, unsigned long flags,
1385 unsigned long start, unsigned long end, 1385 unsigned long start, unsigned long end,
1386 const void *caller) 1386 const void *caller)
1387{ 1387{
1388 return __get_vm_area_node(size, 1, flags, start, end, -1, GFP_KERNEL, 1388 return __get_vm_area_node(size, 1, flags, start, end, NUMA_NO_NODE,
1389 caller); 1389 GFP_KERNEL, caller);
1390} 1390}
1391 1391
1392/** 1392/**
@@ -1401,14 +1401,15 @@ struct vm_struct *__get_vm_area_caller(unsigned long size, unsigned long flags,
1401struct vm_struct *get_vm_area(unsigned long size, unsigned long flags) 1401struct vm_struct *get_vm_area(unsigned long size, unsigned long flags)
1402{ 1402{
1403 return __get_vm_area_node(size, 1, flags, VMALLOC_START, VMALLOC_END, 1403 return __get_vm_area_node(size, 1, flags, VMALLOC_START, VMALLOC_END,
1404 -1, GFP_KERNEL, __builtin_return_address(0)); 1404 NUMA_NO_NODE, GFP_KERNEL,
1405 __builtin_return_address(0));
1405} 1406}
1406 1407
1407struct vm_struct *get_vm_area_caller(unsigned long size, unsigned long flags, 1408struct vm_struct *get_vm_area_caller(unsigned long size, unsigned long flags,
1408 const void *caller) 1409 const void *caller)
1409{ 1410{
1410 return __get_vm_area_node(size, 1, flags, VMALLOC_START, VMALLOC_END, 1411 return __get_vm_area_node(size, 1, flags, VMALLOC_START, VMALLOC_END,
1411 -1, GFP_KERNEL, caller); 1412 NUMA_NO_NODE, GFP_KERNEL, caller);
1412} 1413}
1413 1414
1414/** 1415/**
@@ -1650,7 +1651,7 @@ fail:
1650 * @end: vm area range end 1651 * @end: vm area range end
1651 * @gfp_mask: flags for the page level allocator 1652 * @gfp_mask: flags for the page level allocator
1652 * @prot: protection mask for the allocated pages 1653 * @prot: protection mask for the allocated pages
1653 * @node: node to use for allocation or -1 1654 * @node: node to use for allocation or NUMA_NO_NODE
1654 * @caller: caller's return address 1655 * @caller: caller's return address
1655 * 1656 *
1656 * Allocate enough pages to cover @size from the page level 1657 * Allocate enough pages to cover @size from the page level
@@ -1706,7 +1707,7 @@ fail:
1706 * @align: desired alignment 1707 * @align: desired alignment
1707 * @gfp_mask: flags for the page level allocator 1708 * @gfp_mask: flags for the page level allocator
1708 * @prot: protection mask for the allocated pages 1709 * @prot: protection mask for the allocated pages
1709 * @node: node to use for allocation or -1 1710 * @node: node to use for allocation or NUMA_NO_NODE
1710 * @caller: caller's return address 1711 * @caller: caller's return address
1711 * 1712 *
1712 * Allocate enough pages to cover @size from the page level 1713 * Allocate enough pages to cover @size from the page level
@@ -1723,7 +1724,7 @@ static void *__vmalloc_node(unsigned long size, unsigned long align,
1723 1724
1724void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot) 1725void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot)
1725{ 1726{
1726 return __vmalloc_node(size, 1, gfp_mask, prot, -1, 1727 return __vmalloc_node(size, 1, gfp_mask, prot, NUMA_NO_NODE,
1727 __builtin_return_address(0)); 1728 __builtin_return_address(0));
1728} 1729}
1729EXPORT_SYMBOL(__vmalloc); 1730EXPORT_SYMBOL(__vmalloc);
@@ -1746,7 +1747,8 @@ static inline void *__vmalloc_node_flags(unsigned long size,
1746 */ 1747 */
1747void *vmalloc(unsigned long size) 1748void *vmalloc(unsigned long size)
1748{ 1749{
1749 return __vmalloc_node_flags(size, -1, GFP_KERNEL | __GFP_HIGHMEM); 1750 return __vmalloc_node_flags(size, NUMA_NO_NODE,
1751 GFP_KERNEL | __GFP_HIGHMEM);
1750} 1752}
1751EXPORT_SYMBOL(vmalloc); 1753EXPORT_SYMBOL(vmalloc);
1752 1754
@@ -1762,7 +1764,7 @@ EXPORT_SYMBOL(vmalloc);
1762 */ 1764 */
1763void *vzalloc(unsigned long size) 1765void *vzalloc(unsigned long size)
1764{ 1766{
1765 return __vmalloc_node_flags(size, -1, 1767 return __vmalloc_node_flags(size, NUMA_NO_NODE,
1766 GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO); 1768 GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO);
1767} 1769}
1768EXPORT_SYMBOL(vzalloc); 1770EXPORT_SYMBOL(vzalloc);
@@ -1781,7 +1783,8 @@ void *vmalloc_user(unsigned long size)
1781 1783
1782 ret = __vmalloc_node(size, SHMLBA, 1784 ret = __vmalloc_node(size, SHMLBA,
1783 GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO, 1785 GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO,
1784 PAGE_KERNEL, -1, __builtin_return_address(0)); 1786 PAGE_KERNEL, NUMA_NO_NODE,
1787 __builtin_return_address(0));
1785 if (ret) { 1788 if (ret) {
1786 area = find_vm_area(ret); 1789 area = find_vm_area(ret);
1787 area->flags |= VM_USERMAP; 1790 area->flags |= VM_USERMAP;
@@ -1846,7 +1849,7 @@ EXPORT_SYMBOL(vzalloc_node);
1846void *vmalloc_exec(unsigned long size) 1849void *vmalloc_exec(unsigned long size)
1847{ 1850{
1848 return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL_EXEC, 1851 return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL_EXEC,
1849 -1, __builtin_return_address(0)); 1852 NUMA_NO_NODE, __builtin_return_address(0));
1850} 1853}
1851 1854
1852#if defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA32) 1855#if defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA32)
@@ -1867,7 +1870,7 @@ void *vmalloc_exec(unsigned long size)
1867void *vmalloc_32(unsigned long size) 1870void *vmalloc_32(unsigned long size)
1868{ 1871{
1869 return __vmalloc_node(size, 1, GFP_VMALLOC32, PAGE_KERNEL, 1872 return __vmalloc_node(size, 1, GFP_VMALLOC32, PAGE_KERNEL,
1870 -1, __builtin_return_address(0)); 1873 NUMA_NO_NODE, __builtin_return_address(0));
1871} 1874}
1872EXPORT_SYMBOL(vmalloc_32); 1875EXPORT_SYMBOL(vmalloc_32);
1873 1876
@@ -1884,7 +1887,7 @@ void *vmalloc_32_user(unsigned long size)
1884 void *ret; 1887 void *ret;
1885 1888
1886 ret = __vmalloc_node(size, 1, GFP_VMALLOC32 | __GFP_ZERO, PAGE_KERNEL, 1889 ret = __vmalloc_node(size, 1, GFP_VMALLOC32 | __GFP_ZERO, PAGE_KERNEL,
1887 -1, __builtin_return_address(0)); 1890 NUMA_NO_NODE, __builtin_return_address(0));
1888 if (ret) { 1891 if (ret) {
1889 area = find_vm_area(ret); 1892 area = find_vm_area(ret);
1890 area->flags |= VM_USERMAP; 1893 area->flags |= VM_USERMAP;
diff --git a/mm/vmscan.c b/mm/vmscan.c
index adc7e9058181..88c5fed8b9a4 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -128,7 +128,7 @@ struct scan_control {
128 * From 0 .. 100. Higher means more swappy. 128 * From 0 .. 100. Higher means more swappy.
129 */ 129 */
130int vm_swappiness = 60; 130int vm_swappiness = 60;
131long vm_total_pages; /* The total number of pages which the VM controls */ 131unsigned long vm_total_pages; /* The total number of pages which the VM controls */
132 132
133static LIST_HEAD(shrinker_list); 133static LIST_HEAD(shrinker_list);
134static DECLARE_RWSEM(shrinker_rwsem); 134static DECLARE_RWSEM(shrinker_rwsem);
@@ -1579,16 +1579,6 @@ static inline int inactive_anon_is_low(struct lruvec *lruvec)
1579} 1579}
1580#endif 1580#endif
1581 1581
1582static int inactive_file_is_low_global(struct zone *zone)
1583{
1584 unsigned long active, inactive;
1585
1586 active = zone_page_state(zone, NR_ACTIVE_FILE);
1587 inactive = zone_page_state(zone, NR_INACTIVE_FILE);
1588
1589 return (active > inactive);
1590}
1591
1592/** 1582/**
1593 * inactive_file_is_low - check if file pages need to be deactivated 1583 * inactive_file_is_low - check if file pages need to be deactivated
1594 * @lruvec: LRU vector to check 1584 * @lruvec: LRU vector to check
@@ -1605,10 +1595,13 @@ static int inactive_file_is_low_global(struct zone *zone)
1605 */ 1595 */
1606static int inactive_file_is_low(struct lruvec *lruvec) 1596static int inactive_file_is_low(struct lruvec *lruvec)
1607{ 1597{
1608 if (!mem_cgroup_disabled()) 1598 unsigned long inactive;
1609 return mem_cgroup_inactive_file_is_low(lruvec); 1599 unsigned long active;
1600
1601 inactive = get_lru_size(lruvec, LRU_INACTIVE_FILE);
1602 active = get_lru_size(lruvec, LRU_ACTIVE_FILE);
1610 1603
1611 return inactive_file_is_low_global(lruvec_zone(lruvec)); 1604 return active > inactive;
1612} 1605}
1613 1606
1614static int inactive_list_is_low(struct lruvec *lruvec, enum lru_list lru) 1607static int inactive_list_is_low(struct lruvec *lruvec, enum lru_list lru)
@@ -1638,6 +1631,13 @@ static int vmscan_swappiness(struct scan_control *sc)
1638 return mem_cgroup_swappiness(sc->target_mem_cgroup); 1631 return mem_cgroup_swappiness(sc->target_mem_cgroup);
1639} 1632}
1640 1633
1634enum scan_balance {
1635 SCAN_EQUAL,
1636 SCAN_FRACT,
1637 SCAN_ANON,
1638 SCAN_FILE,
1639};
1640
1641/* 1641/*
1642 * Determine how aggressively the anon and file LRU lists should be 1642 * Determine how aggressively the anon and file LRU lists should be
1643 * scanned. The relative value of each set of LRU lists is determined 1643 * scanned. The relative value of each set of LRU lists is determined
@@ -1650,15 +1650,16 @@ static int vmscan_swappiness(struct scan_control *sc)
1650static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, 1650static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
1651 unsigned long *nr) 1651 unsigned long *nr)
1652{ 1652{
1653 unsigned long anon, file, free; 1653 struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
1654 u64 fraction[2];
1655 u64 denominator = 0; /* gcc */
1656 struct zone *zone = lruvec_zone(lruvec);
1654 unsigned long anon_prio, file_prio; 1657 unsigned long anon_prio, file_prio;
1658 enum scan_balance scan_balance;
1659 unsigned long anon, file, free;
1660 bool force_scan = false;
1655 unsigned long ap, fp; 1661 unsigned long ap, fp;
1656 struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
1657 u64 fraction[2], denominator;
1658 enum lru_list lru; 1662 enum lru_list lru;
1659 int noswap = 0;
1660 bool force_scan = false;
1661 struct zone *zone = lruvec_zone(lruvec);
1662 1663
1663 /* 1664 /*
1664 * If the zone or memcg is small, nr[l] can be 0. This 1665 * If the zone or memcg is small, nr[l] can be 0. This
@@ -1676,11 +1677,30 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
1676 force_scan = true; 1677 force_scan = true;
1677 1678
1678 /* If we have no swap space, do not bother scanning anon pages. */ 1679 /* If we have no swap space, do not bother scanning anon pages. */
1679 if (!sc->may_swap || (nr_swap_pages <= 0)) { 1680 if (!sc->may_swap || (get_nr_swap_pages() <= 0)) {
1680 noswap = 1; 1681 scan_balance = SCAN_FILE;
1681 fraction[0] = 0; 1682 goto out;
1682 fraction[1] = 1; 1683 }
1683 denominator = 1; 1684
1685 /*
1686 * Global reclaim will swap to prevent OOM even with no
1687 * swappiness, but memcg users want to use this knob to
1688 * disable swapping for individual groups completely when
1689 * using the memory controller's swap limit feature would be
1690 * too expensive.
1691 */
1692 if (!global_reclaim(sc) && !vmscan_swappiness(sc)) {
1693 scan_balance = SCAN_FILE;
1694 goto out;
1695 }
1696
1697 /*
1698 * Do not apply any pressure balancing cleverness when the
1699 * system is close to OOM, scan both anon and file equally
1700 * (unless the swappiness setting disagrees with swapping).
1701 */
1702 if (!sc->priority && vmscan_swappiness(sc)) {
1703 scan_balance = SCAN_EQUAL;
1684 goto out; 1704 goto out;
1685 } 1705 }
1686 1706
@@ -1689,30 +1709,32 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
1689 file = get_lru_size(lruvec, LRU_ACTIVE_FILE) + 1709 file = get_lru_size(lruvec, LRU_ACTIVE_FILE) +
1690 get_lru_size(lruvec, LRU_INACTIVE_FILE); 1710 get_lru_size(lruvec, LRU_INACTIVE_FILE);
1691 1711
1712 /*
1713 * If it's foreseeable that reclaiming the file cache won't be
1714 * enough to get the zone back into a desirable shape, we have
1715 * to swap. Better start now and leave the - probably heavily
1716 * thrashing - remaining file pages alone.
1717 */
1692 if (global_reclaim(sc)) { 1718 if (global_reclaim(sc)) {
1693 free = zone_page_state(zone, NR_FREE_PAGES); 1719 free = zone_page_state(zone, NR_FREE_PAGES);
1694 if (unlikely(file + free <= high_wmark_pages(zone))) { 1720 if (unlikely(file + free <= high_wmark_pages(zone))) {
1695 /* 1721 scan_balance = SCAN_ANON;
1696 * If we have very few page cache pages, force-scan
1697 * anon pages.
1698 */
1699 fraction[0] = 1;
1700 fraction[1] = 0;
1701 denominator = 1;
1702 goto out;
1703 } else if (!inactive_file_is_low_global(zone)) {
1704 /*
1705 * There is enough inactive page cache, do not
1706 * reclaim anything from the working set right now.
1707 */
1708 fraction[0] = 0;
1709 fraction[1] = 1;
1710 denominator = 1;
1711 goto out; 1722 goto out;
1712 } 1723 }
1713 } 1724 }
1714 1725
1715 /* 1726 /*
1727 * There is enough inactive page cache, do not reclaim
1728 * anything from the anonymous working set right now.
1729 */
1730 if (!inactive_file_is_low(lruvec)) {
1731 scan_balance = SCAN_FILE;
1732 goto out;
1733 }
1734
1735 scan_balance = SCAN_FRACT;
1736
1737 /*
1716 * With swappiness at 100, anonymous and file have the same priority. 1738 * With swappiness at 100, anonymous and file have the same priority.
1717 * This scanning priority is essentially the inverse of IO cost. 1739 * This scanning priority is essentially the inverse of IO cost.
1718 */ 1740 */
@@ -1759,19 +1781,92 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
1759out: 1781out:
1760 for_each_evictable_lru(lru) { 1782 for_each_evictable_lru(lru) {
1761 int file = is_file_lru(lru); 1783 int file = is_file_lru(lru);
1784 unsigned long size;
1762 unsigned long scan; 1785 unsigned long scan;
1763 1786
1764 scan = get_lru_size(lruvec, lru); 1787 size = get_lru_size(lruvec, lru);
1765 if (sc->priority || noswap || !vmscan_swappiness(sc)) { 1788 scan = size >> sc->priority;
1766 scan >>= sc->priority; 1789
1767 if (!scan && force_scan) 1790 if (!scan && force_scan)
1768 scan = SWAP_CLUSTER_MAX; 1791 scan = min(size, SWAP_CLUSTER_MAX);
1792
1793 switch (scan_balance) {
1794 case SCAN_EQUAL:
1795 /* Scan lists relative to size */
1796 break;
1797 case SCAN_FRACT:
1798 /*
1799 * Scan types proportional to swappiness and
1800 * their relative recent reclaim efficiency.
1801 */
1769 scan = div64_u64(scan * fraction[file], denominator); 1802 scan = div64_u64(scan * fraction[file], denominator);
1803 break;
1804 case SCAN_FILE:
1805 case SCAN_ANON:
1806 /* Scan one type exclusively */
1807 if ((scan_balance == SCAN_FILE) != file)
1808 scan = 0;
1809 break;
1810 default:
1811 /* Look ma, no brain */
1812 BUG();
1770 } 1813 }
1771 nr[lru] = scan; 1814 nr[lru] = scan;
1772 } 1815 }
1773} 1816}
1774 1817
1818/*
1819 * This is a basic per-zone page freer. Used by both kswapd and direct reclaim.
1820 */
1821static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
1822{
1823 unsigned long nr[NR_LRU_LISTS];
1824 unsigned long nr_to_scan;
1825 enum lru_list lru;
1826 unsigned long nr_reclaimed = 0;
1827 unsigned long nr_to_reclaim = sc->nr_to_reclaim;
1828 struct blk_plug plug;
1829
1830 get_scan_count(lruvec, sc, nr);
1831
1832 blk_start_plug(&plug);
1833 while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
1834 nr[LRU_INACTIVE_FILE]) {
1835 for_each_evictable_lru(lru) {
1836 if (nr[lru]) {
1837 nr_to_scan = min(nr[lru], SWAP_CLUSTER_MAX);
1838 nr[lru] -= nr_to_scan;
1839
1840 nr_reclaimed += shrink_list(lru, nr_to_scan,
1841 lruvec, sc);
1842 }
1843 }
1844 /*
1845 * On large memory systems, scan >> priority can become
1846 * really large. This is fine for the starting priority;
1847 * we want to put equal scanning pressure on each zone.
1848 * However, if the VM has a harder time of freeing pages,
1849 * with multiple processes reclaiming pages, the total
1850 * freeing target can get unreasonably large.
1851 */
1852 if (nr_reclaimed >= nr_to_reclaim &&
1853 sc->priority < DEF_PRIORITY)
1854 break;
1855 }
1856 blk_finish_plug(&plug);
1857 sc->nr_reclaimed += nr_reclaimed;
1858
1859 /*
1860 * Even if we did not try to evict anon pages at all, we want to
1861 * rebalance the anon lru active/inactive ratio.
1862 */
1863 if (inactive_anon_is_low(lruvec))
1864 shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
1865 sc, LRU_ACTIVE_ANON);
1866
1867 throttle_vm_writeout(sc->gfp_mask);
1868}
1869
1775/* Use reclaim/compaction for costly allocs or under memory pressure */ 1870/* Use reclaim/compaction for costly allocs or under memory pressure */
1776static bool in_reclaim_compaction(struct scan_control *sc) 1871static bool in_reclaim_compaction(struct scan_control *sc)
1777{ 1872{
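
The reworked get_scan_count() above first classifies the situation as SCAN_FILE, SCAN_ANON, SCAN_EQUAL or SCAN_FRACT and then derives each LRU's scan target from its size, the reclaim priority and, in the fractional case, the swappiness-based fraction[]/denominator ratio, with force_scan guaranteeing a minimum batch on very small lists. A rough self-contained sketch of that final per-list computation (the sizes, fractions and priority fed in below are made-up example inputs):

#include <stdio.h>

enum scan_balance { SCAN_EQUAL, SCAN_FRACT, SCAN_ANON, SCAN_FILE };

#define SWAP_CLUSTER_MAX 32UL

static unsigned long min_ul(unsigned long a, unsigned long b)
{
        return a < b ? a : b;
}

/* Scan target for one LRU list; @file says whether it is a file LRU. */
static unsigned long scan_target(enum scan_balance balance, int file,
                                 unsigned long size, int priority,
                                 const unsigned long long fraction[2],
                                 unsigned long long denominator,
                                 int force_scan)
{
        unsigned long scan = size >> priority;

        if (!scan && force_scan)
                scan = min_ul(size, SWAP_CLUSTER_MAX);

        switch (balance) {
        case SCAN_EQUAL:        /* scan both types relative to their size */
                break;
        case SCAN_FRACT:        /* proportional to swappiness and rotation */
                scan = scan * fraction[file] / denominator;
                break;
        case SCAN_FILE:
        case SCAN_ANON:         /* scan one type exclusively */
                if ((balance == SCAN_FILE) != file)
                        scan = 0;
                break;
        }
        return scan;
}

int main(void)
{
        const unsigned long long fraction[2] = { 60, 140 };    /* anon, file */

        printf("anon target: %lu pages\n",
               scan_target(SCAN_FRACT, 0, 1UL << 20, 12, fraction, 200, 0));
        printf("file target: %lu pages\n",
               scan_target(SCAN_FRACT, 1, 1UL << 20, 12, fraction, 200, 0));
        return 0;
}
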
@@ -1790,7 +1885,7 @@ static bool in_reclaim_compaction(struct scan_control *sc)
1790 * calls try_to_compact_zone() that it will have enough free pages to succeed. 1885 * calls try_to_compact_zone() that it will have enough free pages to succeed.
1791 * It will give up earlier than that if there is difficulty reclaiming pages. 1886 * It will give up earlier than that if there is difficulty reclaiming pages.
1792 */ 1887 */
1793static inline bool should_continue_reclaim(struct lruvec *lruvec, 1888static inline bool should_continue_reclaim(struct zone *zone,
1794 unsigned long nr_reclaimed, 1889 unsigned long nr_reclaimed,
1795 unsigned long nr_scanned, 1890 unsigned long nr_scanned,
1796 struct scan_control *sc) 1891 struct scan_control *sc)
@@ -1830,15 +1925,15 @@ static inline bool should_continue_reclaim(struct lruvec *lruvec,
1830 * inactive lists are large enough, continue reclaiming 1925 * inactive lists are large enough, continue reclaiming
1831 */ 1926 */
1832 pages_for_compaction = (2UL << sc->order); 1927 pages_for_compaction = (2UL << sc->order);
1833 inactive_lru_pages = get_lru_size(lruvec, LRU_INACTIVE_FILE); 1928 inactive_lru_pages = zone_page_state(zone, NR_INACTIVE_FILE);
1834 if (nr_swap_pages > 0) 1929 if (get_nr_swap_pages() > 0)
1835 inactive_lru_pages += get_lru_size(lruvec, LRU_INACTIVE_ANON); 1930 inactive_lru_pages += zone_page_state(zone, NR_INACTIVE_ANON);
1836 if (sc->nr_reclaimed < pages_for_compaction && 1931 if (sc->nr_reclaimed < pages_for_compaction &&
1837 inactive_lru_pages > pages_for_compaction) 1932 inactive_lru_pages > pages_for_compaction)
1838 return true; 1933 return true;
1839 1934
1840 /* If compaction would go ahead or the allocation would succeed, stop */ 1935 /* If compaction would go ahead or the allocation would succeed, stop */
1841 switch (compaction_suitable(lruvec_zone(lruvec), sc->order)) { 1936 switch (compaction_suitable(zone, sc->order)) {
1842 case COMPACT_PARTIAL: 1937 case COMPACT_PARTIAL:
1843 case COMPACT_CONTINUE: 1938 case COMPACT_CONTINUE:
1844 return false; 1939 return false;
@@ -1847,98 +1942,48 @@ static inline bool should_continue_reclaim(struct lruvec *lruvec,
1847 } 1942 }
1848} 1943}
1849 1944
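
The heart of should_continue_reclaim() above (now keyed on a zone rather than a lruvec) is: keep reclaiming while fewer than 2^(order+1) pages have been reclaimed and the inactive lists still hold more than that. A hedged user-space rendering of just this arithmetic; the compaction_suitable() escape hatch is omitted.

#include <stdbool.h>
#include <stdio.h>

/*
 * Sketch of the "keep reclaiming so compaction can succeed" check.
 * 'have_swap' decides whether inactive anon pages count as reclaimable.
 */
static bool model_should_continue_reclaim(int order,
					  unsigned long nr_reclaimed,
					  unsigned long inactive_file,
					  unsigned long inactive_anon,
					  bool have_swap)
{
	unsigned long pages_for_compaction = 2UL << order;
	unsigned long inactive_lru_pages = inactive_file;

	if (have_swap)
		inactive_lru_pages += inactive_anon;

	return nr_reclaimed < pages_for_compaction &&
	       inactive_lru_pages > pages_for_compaction;
}

int main(void)
{
	/* order-3 request: keep going until 16 pages have been reclaimed */
	printf("%d\n", model_should_continue_reclaim(3, 10, 4096, 1024, true));
	printf("%d\n", model_should_continue_reclaim(3, 20, 4096, 1024, true));
	return 0;
}
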
1850/* 1945static void shrink_zone(struct zone *zone, struct scan_control *sc)
1851 * This is a basic per-zone page freer. Used by both kswapd and direct reclaim.
1852 */
1853static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
1854{ 1946{
1855 unsigned long nr[NR_LRU_LISTS];
1856 unsigned long nr_to_scan;
1857 enum lru_list lru;
1858 unsigned long nr_reclaimed, nr_scanned; 1947 unsigned long nr_reclaimed, nr_scanned;
1859 unsigned long nr_to_reclaim = sc->nr_to_reclaim;
1860 struct blk_plug plug;
1861 1948
1862restart: 1949 do {
1863 nr_reclaimed = 0; 1950 struct mem_cgroup *root = sc->target_mem_cgroup;
1864 nr_scanned = sc->nr_scanned; 1951 struct mem_cgroup_reclaim_cookie reclaim = {
1865 get_scan_count(lruvec, sc, nr); 1952 .zone = zone,
1866 1953 .priority = sc->priority,
1867 blk_start_plug(&plug); 1954 };
1868 while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || 1955 struct mem_cgroup *memcg;
1869 nr[LRU_INACTIVE_FILE]) {
1870 for_each_evictable_lru(lru) {
1871 if (nr[lru]) {
1872 nr_to_scan = min_t(unsigned long,
1873 nr[lru], SWAP_CLUSTER_MAX);
1874 nr[lru] -= nr_to_scan;
1875
1876 nr_reclaimed += shrink_list(lru, nr_to_scan,
1877 lruvec, sc);
1878 }
1879 }
1880 /*
1881 * On large memory systems, scan >> priority can become
1882 * really large. This is fine for the starting priority;
1883 * we want to put equal scanning pressure on each zone.
1884 * However, if the VM has a harder time of freeing pages,
1885 * with multiple processes reclaiming pages, the total
1886 * freeing target can get unreasonably large.
1887 */
1888 if (nr_reclaimed >= nr_to_reclaim &&
1889 sc->priority < DEF_PRIORITY)
1890 break;
1891 }
1892 blk_finish_plug(&plug);
1893 sc->nr_reclaimed += nr_reclaimed;
1894
1895 /*
1896 * Even if we did not try to evict anon pages at all, we want to
1897 * rebalance the anon lru active/inactive ratio.
1898 */
1899 if (inactive_anon_is_low(lruvec))
1900 shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
1901 sc, LRU_ACTIVE_ANON);
1902
1903 /* reclaim/compaction might need reclaim to continue */
1904 if (should_continue_reclaim(lruvec, nr_reclaimed,
1905 sc->nr_scanned - nr_scanned, sc))
1906 goto restart;
1907 1956
1908 throttle_vm_writeout(sc->gfp_mask); 1957 nr_reclaimed = sc->nr_reclaimed;
1909} 1958 nr_scanned = sc->nr_scanned;
1910 1959
1911static void shrink_zone(struct zone *zone, struct scan_control *sc) 1960 memcg = mem_cgroup_iter(root, NULL, &reclaim);
1912{ 1961 do {
1913 struct mem_cgroup *root = sc->target_mem_cgroup; 1962 struct lruvec *lruvec;
1914 struct mem_cgroup_reclaim_cookie reclaim = {
1915 .zone = zone,
1916 .priority = sc->priority,
1917 };
1918 struct mem_cgroup *memcg;
1919 1963
1920 memcg = mem_cgroup_iter(root, NULL, &reclaim); 1964 lruvec = mem_cgroup_zone_lruvec(zone, memcg);
1921 do {
1922 struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg);
1923 1965
1924 shrink_lruvec(lruvec, sc); 1966 shrink_lruvec(lruvec, sc);
1925 1967
1926 /* 1968 /*
1927 * Limit reclaim has historically picked one memcg and 1969 * Direct reclaim and kswapd have to scan all memory
1928 * scanned it with decreasing priority levels until 1970 * cgroups to fulfill the overall scan target for the
1929 * nr_to_reclaim had been reclaimed. This priority 1971 * zone.
1930 * cycle is thus over after a single memcg. 1972 *
1931 * 1973 * Limit reclaim, on the other hand, only cares about
1932 * Direct reclaim and kswapd, on the other hand, have 1974 * nr_to_reclaim pages to be reclaimed and it will
1933 * to scan all memory cgroups to fulfill the overall 1975 * retry with decreasing priority if one round over the
1934 * scan target for the zone. 1976 * whole hierarchy is not sufficient.
1935 */ 1977 */
1936 if (!global_reclaim(sc)) { 1978 if (!global_reclaim(sc) &&
1937 mem_cgroup_iter_break(root, memcg); 1979 sc->nr_reclaimed >= sc->nr_to_reclaim) {
1938 break; 1980 mem_cgroup_iter_break(root, memcg);
1939 } 1981 break;
1940 memcg = mem_cgroup_iter(root, memcg, &reclaim); 1982 }
1941 } while (memcg); 1983 memcg = mem_cgroup_iter(root, memcg, &reclaim);
1984 } while (memcg);
1985 } while (should_continue_reclaim(zone, sc->nr_reclaimed - nr_reclaimed,
1986 sc->nr_scanned - nr_scanned, sc));
1942} 1987}
1943 1988
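
The rewritten shrink_zone() above wraps the whole memcg walk in a do/while driven by should_continue_reclaim(), and limit reclaim now keeps walking sibling groups until nr_to_reclaim is met instead of stopping after the first one. Below is a schematic user-space model of that structure; the memcg iterator and lruvec lookup are reduced to an array walk, and both fake_* helpers are invented stand-ins.

#include <stdbool.h>
#include <stdio.h>

#define NR_GROUPS	3

struct model_sc {
	bool global_reclaim;		/* kswapd/direct reclaim vs. limit reclaim */
	unsigned long nr_to_reclaim;
	unsigned long nr_reclaimed;
};

/* stand-in for shrink_lruvec() on one memcg's lruvec for this zone */
static void fake_shrink_lruvec(int memcg, struct model_sc *sc)
{
	sc->nr_reclaimed += 8;	/* pretend each pass frees a few pages */
	printf("shrunk memcg %d, total reclaimed %lu\n",
	       memcg, sc->nr_reclaimed);
}

/* stand-in for should_continue_reclaim(): one extra round, then stop */
static bool fake_should_continue(int rounds_done)
{
	return rounds_done < 2;
}

static void model_shrink_zone(struct model_sc *sc)
{
	int rounds = 0;

	do {
		int memcg;

		for (memcg = 0; memcg < NR_GROUPS; memcg++) {
			fake_shrink_lruvec(memcg, sc);
			/*
			 * Limit reclaim only cares about nr_to_reclaim;
			 * global reclaim must visit every group to spread
			 * the zone's scan target fairly.
			 */
			if (!sc->global_reclaim &&
			    sc->nr_reclaimed >= sc->nr_to_reclaim)
				break;
		}
		rounds++;
	} while (fake_should_continue(rounds));
}

int main(void)
{
	struct model_sc sc = { .global_reclaim = false, .nr_to_reclaim = 16 };

	model_shrink_zone(&sc);
	return 0;
}
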
1944/* Returns true if compaction should go ahead for a high-order request */ 1989/* Returns true if compaction should go ahead for a high-order request */
@@ -1958,7 +2003,7 @@ static inline bool compaction_ready(struct zone *zone, struct scan_control *sc)
1958 * a reasonable chance of completing and allocating the page 2003 * a reasonable chance of completing and allocating the page
1959 */ 2004 */
1960 balance_gap = min(low_wmark_pages(zone), 2005 balance_gap = min(low_wmark_pages(zone),
1961 (zone->present_pages + KSWAPD_ZONE_BALANCE_GAP_RATIO-1) / 2006 (zone->managed_pages + KSWAPD_ZONE_BALANCE_GAP_RATIO-1) /
1962 KSWAPD_ZONE_BALANCE_GAP_RATIO); 2007 KSWAPD_ZONE_BALANCE_GAP_RATIO);
1963 watermark = high_wmark_pages(zone) + balance_gap + (2UL << sc->order); 2008 watermark = high_wmark_pages(zone) + balance_gap + (2UL << sc->order);
1964 watermark_ok = zone_watermark_ok_safe(zone, 0, watermark, 0, 0); 2009 watermark_ok = zone_watermark_ok_safe(zone, 0, watermark, 0, 0);
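
The hunk above switches compaction_ready()'s balance gap from present_pages to managed_pages (pages not consumed by bootmem); the tested watermark is the zone's high watermark plus that gap plus 2^(order+1) pages. A stand-alone illustration of the arithmetic, assuming KSWAPD_ZONE_BALANCE_GAP_RATIO is 100 as defined elsewhere in vmscan.c of this era.

#include <stdio.h>

#define KSWAPD_ZONE_BALANCE_GAP_RATIO	100	/* assumed value, see lead-in */

static unsigned long min_ul(unsigned long a, unsigned long b)
{
	return a < b ? a : b;
}

/* watermark that compaction_ready() checks against, per the hunk above */
static unsigned long compaction_watermark(unsigned long high_wmark,
					  unsigned long low_wmark,
					  unsigned long managed_pages,
					  int order)
{
	unsigned long balance_gap;

	/* roughly 1% of the zone, but never more than the low watermark */
	balance_gap = min_ul(low_wmark,
			     (managed_pages + KSWAPD_ZONE_BALANCE_GAP_RATIO - 1) /
			     KSWAPD_ZONE_BALANCE_GAP_RATIO);

	return high_wmark + balance_gap + (2UL << order);
}

int main(void)
{
	/* a ~4GB zone with made-up watermarks, order-9 (THP-sized) request */
	unsigned long wm = compaction_watermark(12288, 8192, 1048576, 9);

	printf("watermark = %lu pages\n", wm);	/* 12288 + 8192 + 1024 */
	return 0;
}
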
@@ -2150,6 +2195,13 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
2150 goto out; 2195 goto out;
2151 2196
2152 /* 2197 /*
2198 * If we're getting trouble reclaiming, start doing
2199 * writepage even in laptop mode.
2200 */
2201 if (sc->priority < DEF_PRIORITY - 2)
2202 sc->may_writepage = 1;
2203
2204 /*
2153 * Try to write back as many pages as we just scanned. This 2205 * Try to write back as many pages as we just scanned. This
2154 * tends to cause slow streaming writers to write data to the 2206 * tends to cause slow streaming writers to write data to the
2155 * disk smoothly, at the dirtying rate, which is nice. But 2207 * disk smoothly, at the dirtying rate, which is nice. But
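
The check added above replaces the old scanned-versus-reclaimed ratio with a plain priority threshold: assuming DEF_PRIORITY is 12, writeback from direct reclaim is only forced once the priority has dropped below 10, laptop mode or not. A tiny sketch of how that gate lines up with the shrinking scan divisor:

#include <stdio.h>

#define DEF_PRIORITY	12

int main(void)
{
	unsigned long lru_pages = 1UL << 20;	/* say, a million LRU pages */
	int priority;

	for (priority = DEF_PRIORITY; priority >= 0; priority--) {
		int may_writepage = priority < DEF_PRIORITY - 2;

		printf("priority %2d: scan target %7lu pages, may_writepage=%d\n",
		       priority, lru_pages >> priority, may_writepage);
	}
	return 0;
}
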
@@ -2300,7 +2352,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
2300{ 2352{
2301 unsigned long nr_reclaimed; 2353 unsigned long nr_reclaimed;
2302 struct scan_control sc = { 2354 struct scan_control sc = {
2303 .gfp_mask = gfp_mask, 2355 .gfp_mask = (gfp_mask = memalloc_noio_flags(gfp_mask)),
2304 .may_writepage = !laptop_mode, 2356 .may_writepage = !laptop_mode,
2305 .nr_to_reclaim = SWAP_CLUSTER_MAX, 2357 .nr_to_reclaim = SWAP_CLUSTER_MAX,
2306 .may_unmap = 1, 2358 .may_unmap = 1,
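
try_to_free_pages() now filters its gfp_mask through memalloc_noio_flags(). The helper, introduced alongside this series, is understood here to strip __GFP_IO and __GFP_FS when the calling task has set PF_MEMALLOC_NOIO, so reclaim entered through this path cannot recurse into block I/O or filesystems. A user-space model of that masking, with the kernel bits replaced by local stand-in constants:

#include <stdio.h>

/* local stand-ins for the kernel's gfp and task-flag bits */
#define MODEL_GFP_IO		0x40u
#define MODEL_GFP_FS		0x80u
#define MODEL_PF_MEMALLOC_NOIO	0x080000u

/* sketch of memalloc_noio_flags(): drop I/O capabilities when asked to */
static unsigned int model_memalloc_noio_flags(unsigned int gfp_mask,
					      unsigned int task_flags)
{
	if (task_flags & MODEL_PF_MEMALLOC_NOIO)
		gfp_mask &= ~(MODEL_GFP_IO | MODEL_GFP_FS);
	return gfp_mask;
}

int main(void)
{
	unsigned int gfp = MODEL_GFP_IO | MODEL_GFP_FS | 0x10u;

	printf("normal task:  %#x\n", model_memalloc_noio_flags(gfp, 0));
	printf("noio context: %#x\n",
	       model_memalloc_noio_flags(gfp, MODEL_PF_MEMALLOC_NOIO));
	return 0;
}
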
@@ -2452,12 +2504,16 @@ static bool zone_balanced(struct zone *zone, int order,
2452} 2504}
2453 2505
2454/* 2506/*
2455 * pgdat_balanced is used when checking if a node is balanced for high-order 2507 * pgdat_balanced() is used when checking if a node is balanced.
2456 * allocations. Only zones that meet watermarks and are in a zone allowed 2508 *
2457 * by the callers classzone_idx are added to balanced_pages. The total of 2509 * For order-0, all zones must be balanced!
2458 * balanced pages must be at least 25% of the zones allowed by classzone_idx 2510 *
2459 * for the node to be considered balanced. Forcing all zones to be balanced 2511 * For high-order allocations only zones that meet watermarks and are in a
2460 * for high orders can cause excessive reclaim when there are imbalanced zones. 2512 * zone allowed by the callers classzone_idx are added to balanced_pages. The
2513 * total of balanced pages must be at least 25% of the zones allowed by
2514 * classzone_idx for the node to be considered balanced. Forcing all zones to
2515 * be balanced for high orders can cause excessive reclaim when there are
2516 * imbalanced zones.
2461 * The choice of 25% is due to 2517 * The choice of 25% is due to
2462 * o a 16M DMA zone that is balanced will not balance a zone on any 2518 * o a 16M DMA zone that is balanced will not balance a zone on any
2463 * reasonable sized machine 2519 * reasonable sized machine
@@ -2467,17 +2523,43 @@ static bool zone_balanced(struct zone *zone, int order,
2467 * Similarly, on x86-64 the Normal zone would need to be at least 1G 2523 * Similarly, on x86-64 the Normal zone would need to be at least 1G
2468 * to balance a node on its own. These seemed like reasonable ratios. 2524 * to balance a node on its own. These seemed like reasonable ratios.
2469 */ 2525 */
2470static bool pgdat_balanced(pg_data_t *pgdat, unsigned long balanced_pages, 2526static bool pgdat_balanced(pg_data_t *pgdat, int order, int classzone_idx)
2471 int classzone_idx)
2472{ 2527{
2473 unsigned long present_pages = 0; 2528 unsigned long managed_pages = 0;
2529 unsigned long balanced_pages = 0;
2474 int i; 2530 int i;
2475 2531
2476 for (i = 0; i <= classzone_idx; i++) 2532 /* Check the watermark levels */
2477 present_pages += pgdat->node_zones[i].present_pages; 2533 for (i = 0; i <= classzone_idx; i++) {
2534 struct zone *zone = pgdat->node_zones + i;
2535
2536 if (!populated_zone(zone))
2537 continue;
2478 2538
2479 /* A special case here: if zone has no page, we think it's balanced */ 2539 managed_pages += zone->managed_pages;
2480 return balanced_pages >= (present_pages >> 2); 2540
2541 /*
2542 * A special case here:
2543 *
2544 * balance_pgdat() skips over all_unreclaimable after
2545 * DEF_PRIORITY. Effectively, it considers them balanced so
2546 * they must be considered balanced here as well!
2547 */
2548 if (zone->all_unreclaimable) {
2549 balanced_pages += zone->managed_pages;
2550 continue;
2551 }
2552
2553 if (zone_balanced(zone, order, 0, i))
2554 balanced_pages += zone->managed_pages;
2555 else if (!order)
2556 return false;
2557 }
2558
2559 if (order)
2560 return balanced_pages >= (managed_pages >> 2);
2561 else
2562 return true;
2481} 2563}
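
pgdat_balanced() above now does all the bookkeeping itself: every populated zone up to classzone_idx contributes its managed_pages, all_unreclaimable zones count as balanced by definition, order-0 demands that every zone pass zone_balanced(), and high-order requests only need 25% of the managed pages to sit in balanced zones. A compact user-space model of the rule, with zone_balanced() reduced to a per-zone flag:

#include <stdbool.h>
#include <stdio.h>

struct model_zone {
	unsigned long managed_pages;
	bool populated;
	bool all_unreclaimable;
	bool meets_watermark;	/* stand-in for zone_balanced() */
};

static bool model_pgdat_balanced(const struct model_zone *zones,
				 int classzone_idx, int order)
{
	unsigned long managed_pages = 0, balanced_pages = 0;
	int i;

	for (i = 0; i <= classzone_idx; i++) {
		const struct model_zone *z = &zones[i];

		if (!z->populated)
			continue;

		managed_pages += z->managed_pages;

		/* unreclaimable zones are treated as balanced, as in the patch */
		if (z->all_unreclaimable || z->meets_watermark) {
			balanced_pages += z->managed_pages;
			continue;
		}

		/* order-0: a single unbalanced zone is enough to fail */
		if (!order)
			return false;
	}

	return order ? balanced_pages >= (managed_pages >> 2) : true;
}

int main(void)
{
	struct model_zone zones[] = {
		{ .managed_pages = 4096,    .populated = true, .meets_watermark = false },
		{ .managed_pages = 1048576, .populated = true, .meets_watermark = true },
	};

	printf("order-0 balanced: %d\n", model_pgdat_balanced(zones, 1, 0));
	printf("order-9 balanced: %d\n", model_pgdat_balanced(zones, 1, 9));
	return 0;
}

In this example a large balanced zone satisfies the high-order check even though a small unbalanced zone fails the order-0 one, which is exactly the imbalance the 25% rule is meant to tolerate.
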
2482 2564
2483/* 2565/*
@@ -2489,10 +2571,6 @@ static bool pgdat_balanced(pg_data_t *pgdat, unsigned long balanced_pages,
2489static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining, 2571static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining,
2490 int classzone_idx) 2572 int classzone_idx)
2491{ 2573{
2492 int i;
2493 unsigned long balanced = 0;
2494 bool all_zones_ok = true;
2495
2496 /* If a direct reclaimer woke kswapd within HZ/10, it's premature */ 2574 /* If a direct reclaimer woke kswapd within HZ/10, it's premature */
2497 if (remaining) 2575 if (remaining)
2498 return false; 2576 return false;
@@ -2511,39 +2589,7 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining,
2511 return false; 2589 return false;
2512 } 2590 }
2513 2591
2514 /* Check the watermark levels */ 2592 return pgdat_balanced(pgdat, order, classzone_idx);
2515 for (i = 0; i <= classzone_idx; i++) {
2516 struct zone *zone = pgdat->node_zones + i;
2517
2518 if (!populated_zone(zone))
2519 continue;
2520
2521 /*
2522 * balance_pgdat() skips over all_unreclaimable after
2523 * DEF_PRIORITY. Effectively, it considers them balanced so
2524 * they must be considered balanced here as well if kswapd
2525 * is to sleep
2526 */
2527 if (zone->all_unreclaimable) {
2528 balanced += zone->present_pages;
2529 continue;
2530 }
2531
2532 if (!zone_balanced(zone, order, 0, i))
2533 all_zones_ok = false;
2534 else
2535 balanced += zone->present_pages;
2536 }
2537
2538 /*
2539 * For high-order requests, the balanced zones must contain at least
2540 * 25% of the nodes pages for kswapd to sleep. For order-0, all zones
2541 * must be balanced
2542 */
2543 if (order)
2544 return pgdat_balanced(pgdat, balanced, classzone_idx);
2545 else
2546 return all_zones_ok;
2547} 2593}
2548 2594
2549/* 2595/*
@@ -2570,8 +2616,7 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining,
2570static unsigned long balance_pgdat(pg_data_t *pgdat, int order, 2616static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
2571 int *classzone_idx) 2617 int *classzone_idx)
2572{ 2618{
2573 struct zone *unbalanced_zone; 2619 bool pgdat_is_balanced = false;
2574 unsigned long balanced;
2575 int i; 2620 int i;
2576 int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */ 2621 int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */
2577 unsigned long total_scanned; 2622 unsigned long total_scanned;
@@ -2602,10 +2647,6 @@ loop_again:
2602 2647
2603 do { 2648 do {
2604 unsigned long lru_pages = 0; 2649 unsigned long lru_pages = 0;
2605 int has_under_min_watermark_zone = 0;
2606
2607 unbalanced_zone = NULL;
2608 balanced = 0;
2609 2650
2610 /* 2651 /*
2611 * Scan in the highmem->dma direction for the highest 2652 * Scan in the highmem->dma direction for the highest
@@ -2646,8 +2687,11 @@ loop_again:
2646 zone_clear_flag(zone, ZONE_CONGESTED); 2687 zone_clear_flag(zone, ZONE_CONGESTED);
2647 } 2688 }
2648 } 2689 }
2649 if (i < 0) 2690
2691 if (i < 0) {
2692 pgdat_is_balanced = true;
2650 goto out; 2693 goto out;
2694 }
2651 2695
2652 for (i = 0; i <= end_zone; i++) { 2696 for (i = 0; i <= end_zone; i++) {
2653 struct zone *zone = pgdat->node_zones + i; 2697 struct zone *zone = pgdat->node_zones + i;
@@ -2697,7 +2741,7 @@ loop_again:
2697 * of the zone, whichever is smaller. 2741 * of the zone, whichever is smaller.
2698 */ 2742 */
2699 balance_gap = min(low_wmark_pages(zone), 2743 balance_gap = min(low_wmark_pages(zone),
2700 (zone->present_pages + 2744 (zone->managed_pages +
2701 KSWAPD_ZONE_BALANCE_GAP_RATIO-1) / 2745 KSWAPD_ZONE_BALANCE_GAP_RATIO-1) /
2702 KSWAPD_ZONE_BALANCE_GAP_RATIO); 2746 KSWAPD_ZONE_BALANCE_GAP_RATIO);
2703 /* 2747 /*
@@ -2728,12 +2772,10 @@ loop_again:
2728 } 2772 }
2729 2773
2730 /* 2774 /*
2731 * If we've done a decent amount of scanning and 2775 * If we're getting trouble reclaiming, start doing
2732 * the reclaim ratio is low, start doing writepage 2776 * writepage even in laptop mode.
2733 * even in laptop mode
2734 */ 2777 */
2735 if (total_scanned > SWAP_CLUSTER_MAX * 2 && 2778 if (sc.priority < DEF_PRIORITY - 2)
2736 total_scanned > sc.nr_reclaimed + sc.nr_reclaimed / 2)
2737 sc.may_writepage = 1; 2779 sc.may_writepage = 1;
2738 2780
2739 if (zone->all_unreclaimable) { 2781 if (zone->all_unreclaimable) {
@@ -2742,17 +2784,7 @@ loop_again:
2742 continue; 2784 continue;
2743 } 2785 }
2744 2786
2745 if (!zone_balanced(zone, testorder, 0, end_zone)) { 2787 if (zone_balanced(zone, testorder, 0, end_zone))
2746 unbalanced_zone = zone;
2747 /*
2748 * We are still under min water mark. This
2749 * means that we have a GFP_ATOMIC allocation
2750 * failure risk. Hurry up!
2751 */
2752 if (!zone_watermark_ok_safe(zone, order,
2753 min_wmark_pages(zone), end_zone, 0))
2754 has_under_min_watermark_zone = 1;
2755 } else {
2756 /* 2788 /*
2757 * If a zone reaches its high watermark, 2789 * If a zone reaches its high watermark,
2758 * consider it to be no longer congested. It's 2790 * consider it to be no longer congested. It's
@@ -2761,10 +2793,6 @@ loop_again:
2761 * speculatively avoid congestion waits 2793 * speculatively avoid congestion waits
2762 */ 2794 */
2763 zone_clear_flag(zone, ZONE_CONGESTED); 2795 zone_clear_flag(zone, ZONE_CONGESTED);
2764 if (i <= *classzone_idx)
2765 balanced += zone->present_pages;
2766 }
2767
2768 } 2796 }
2769 2797
2770 /* 2798 /*
@@ -2776,17 +2804,9 @@ loop_again:
2776 pfmemalloc_watermark_ok(pgdat)) 2804 pfmemalloc_watermark_ok(pgdat))
2777 wake_up(&pgdat->pfmemalloc_wait); 2805 wake_up(&pgdat->pfmemalloc_wait);
2778 2806
2779 if (!unbalanced_zone || (order && pgdat_balanced(pgdat, balanced, *classzone_idx))) 2807 if (pgdat_balanced(pgdat, order, *classzone_idx)) {
2808 pgdat_is_balanced = true;
2780 break; /* kswapd: all done */ 2809 break; /* kswapd: all done */
2781 /*
2782 * OK, kswapd is getting into trouble. Take a nap, then take
2783 * another pass across the zones.
2784 */
2785 if (total_scanned && (sc.priority < DEF_PRIORITY - 2)) {
2786 if (has_under_min_watermark_zone)
2787 count_vm_event(KSWAPD_SKIP_CONGESTION_WAIT);
2788 else
2789 wait_iff_congested(unbalanced_zone, BLK_RW_ASYNC, HZ/10);
2790 } 2810 }
2791 2811
2792 /* 2812 /*
@@ -2798,14 +2818,9 @@ loop_again:
2798 if (sc.nr_reclaimed >= SWAP_CLUSTER_MAX) 2818 if (sc.nr_reclaimed >= SWAP_CLUSTER_MAX)
2799 break; 2819 break;
2800 } while (--sc.priority >= 0); 2820 } while (--sc.priority >= 0);
2801out:
2802 2821
2803 /* 2822out:
2804 * order-0: All zones must meet high watermark for a balanced node 2823 if (!pgdat_is_balanced) {
2805 * high-order: Balanced zones must make up at least 25% of the node
2806 * for the node to be balanced
2807 */
2808 if (unbalanced_zone && (!order || !pgdat_balanced(pgdat, balanced, *classzone_idx))) {
2809 cond_resched(); 2824 cond_resched();
2810 2825
2811 try_to_freeze(); 2826 try_to_freeze();
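
With unbalanced_zone, the balanced counter and the congestion-wait heuristics gone, balance_pgdat()'s outer loop reduces to: scan at decreasing priority, stop as soon as pgdat_balanced() reports success or SWAP_CLUSTER_MAX pages were reclaimed, and retry from the top only if the node is still unbalanced (what happens after try_to_freeze() lies outside the hunk). A condensed user-space model of that flow; both fake_* helpers are invented so the model terminates.

#include <stdbool.h>
#include <stdio.h>

#define DEF_PRIORITY		12
#define SWAP_CLUSTER_MAX	32UL

/* invented stand-in for one pass over the node's zones at a given priority */
static unsigned long fake_reclaim_pass(int priority)
{
	return (unsigned long)(DEF_PRIORITY - priority) * 4;
}

/* invented stand-in for pgdat_balanced(): balanced once enough was freed */
static bool fake_pgdat_balanced(unsigned long freed_total)
{
	return freed_total >= 64;
}

static void model_balance_pgdat(void)
{
	unsigned long freed_total = 0;
	int attempt = 0;

	for (;;) {			/* the loop_again path */
		bool pgdat_is_balanced = false;
		unsigned long nr_reclaimed = 0;
		int priority;

		attempt++;
		for (priority = DEF_PRIORITY; priority >= 0; priority--) {
			unsigned long got = fake_reclaim_pass(priority);

			nr_reclaimed += got;
			freed_total += got;

			if (fake_pgdat_balanced(freed_total)) {
				pgdat_is_balanced = true;
				break;	/* kswapd: all done */
			}
			/* enough progress: no need to raise priority further */
			if (nr_reclaimed >= SWAP_CLUSTER_MAX)
				break;
		}

		if (pgdat_is_balanced) {
			printf("node balanced after %d attempt(s)\n", attempt);
			return;
		}
		/* still unbalanced: the real code reschedules and retries */
	}
}

int main(void)
{
	model_balance_pgdat();
	return 0;
}
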
@@ -3068,7 +3083,7 @@ unsigned long global_reclaimable_pages(void)
3068 nr = global_page_state(NR_ACTIVE_FILE) + 3083 nr = global_page_state(NR_ACTIVE_FILE) +
3069 global_page_state(NR_INACTIVE_FILE); 3084 global_page_state(NR_INACTIVE_FILE);
3070 3085
3071 if (nr_swap_pages > 0) 3086 if (get_nr_swap_pages() > 0)
3072 nr += global_page_state(NR_ACTIVE_ANON) + 3087 nr += global_page_state(NR_ACTIVE_ANON) +
3073 global_page_state(NR_INACTIVE_ANON); 3088 global_page_state(NR_INACTIVE_ANON);
3074 3089
@@ -3082,7 +3097,7 @@ unsigned long zone_reclaimable_pages(struct zone *zone)
3082 nr = zone_page_state(zone, NR_ACTIVE_FILE) + 3097 nr = zone_page_state(zone, NR_ACTIVE_FILE) +
3083 zone_page_state(zone, NR_INACTIVE_FILE); 3098 zone_page_state(zone, NR_INACTIVE_FILE);
3084 3099
3085 if (nr_swap_pages > 0) 3100 if (get_nr_swap_pages() > 0)
3086 nr += zone_page_state(zone, NR_ACTIVE_ANON) + 3101 nr += zone_page_state(zone, NR_ACTIVE_ANON) +
3087 zone_page_state(zone, NR_INACTIVE_ANON); 3102 zone_page_state(zone, NR_INACTIVE_ANON);
3088 3103
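
Both helpers above now read the swap total through get_nr_swap_pages() rather than the bare nr_swap_pages counter, which this series converts to an atomic. The accounting itself is unchanged: file LRU pages always count as reclaimable, anon LRU pages only while swap space exists. As a trivial model:

#include <stdio.h>

/* sketch of zone_reclaimable_pages(): anon only counts if swap is available */
static unsigned long model_reclaimable_pages(unsigned long active_file,
					     unsigned long inactive_file,
					     unsigned long active_anon,
					     unsigned long inactive_anon,
					     unsigned long nr_swap_pages)
{
	unsigned long nr = active_file + inactive_file;

	if (nr_swap_pages > 0)
		nr += active_anon + inactive_anon;

	return nr;
}

int main(void)
{
	printf("with swap:    %lu\n",
	       model_reclaimable_pages(1000, 3000, 2000, 500, 1UL << 18));
	printf("without swap: %lu\n",
	       model_reclaimable_pages(1000, 3000, 2000, 500, 0));
	return 0;
}
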
@@ -3137,8 +3152,8 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
3137 not required for correctness. So if the last cpu in a node goes 3152 not required for correctness. So if the last cpu in a node goes
3138 away, we get changed to run anywhere: as the first one comes back, 3153 away, we get changed to run anywhere: as the first one comes back,
3139 restore their cpu bindings. */ 3154 restore their cpu bindings. */
3140static int __devinit cpu_callback(struct notifier_block *nfb, 3155static int cpu_callback(struct notifier_block *nfb, unsigned long action,
3141 unsigned long action, void *hcpu) 3156 void *hcpu)
3142{ 3157{
3143 int nid; 3158 int nid;
3144 3159
@@ -3295,9 +3310,8 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
3295 .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE), 3310 .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE),
3296 .may_unmap = !!(zone_reclaim_mode & RECLAIM_SWAP), 3311 .may_unmap = !!(zone_reclaim_mode & RECLAIM_SWAP),
3297 .may_swap = 1, 3312 .may_swap = 1,
3298 .nr_to_reclaim = max_t(unsigned long, nr_pages, 3313 .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX),
3299 SWAP_CLUSTER_MAX), 3314 .gfp_mask = (gfp_mask = memalloc_noio_flags(gfp_mask)),
3300 .gfp_mask = gfp_mask,
3301 .order = order, 3315 .order = order,
3302 .priority = ZONE_RECLAIM_PRIORITY, 3316 .priority = ZONE_RECLAIM_PRIORITY,
3303 }; 3317 };
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 9800306c8195..e1d8ed172c42 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -142,7 +142,7 @@ int calculate_normal_threshold(struct zone *zone)
142 * 125 1024 10 16-32 GB 9 142 * 125 1024 10 16-32 GB 9
143 */ 143 */
144 144
145 mem = zone->present_pages >> (27 - PAGE_SHIFT); 145 mem = zone->managed_pages >> (27 - PAGE_SHIFT);
146 146
147 threshold = 2 * fls(num_online_cpus()) * (1 + fls(mem)); 147 threshold = 2 * fls(num_online_cpus()) * (1 + fls(mem));
148 148
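
The vmstat drift threshold is now sized from managed_pages instead of present_pages; the formula itself is untouched: memory is expressed in 128MB units via the >> (27 - PAGE_SHIFT), and the threshold grows with the log2 of both the online CPU count and the zone size. Below is a worked user-space version of the visible arithmetic, assuming 4K pages; the full function also clamps the result, which lies outside this hunk.

#include <stdio.h>

#define PAGE_SHIFT	12	/* assume 4K pages for the example */

/* find last (most significant) set bit, 1-based; 0 for x == 0 */
static int fls_model(unsigned long x)
{
	int r = 0;

	while (x) {
		x >>= 1;
		r++;
	}
	return r;
}

/* the visible part of calculate_normal_threshold() */
static int model_threshold(unsigned long managed_pages, int online_cpus)
{
	unsigned long mem = managed_pages >> (27 - PAGE_SHIFT);	/* 128MB units */

	return 2 * fls_model((unsigned long)online_cpus) * (1 + fls_model(mem));
}

int main(void)
{
	/* a 16GB zone (4M x 4K pages) on an 8-CPU box */
	printf("threshold = %d\n", model_threshold(4UL << 20, 8));
	return 0;
}
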
@@ -628,7 +628,9 @@ static char * const migratetype_names[MIGRATE_TYPES] = {
628#ifdef CONFIG_CMA 628#ifdef CONFIG_CMA
629 "CMA", 629 "CMA",
630#endif 630#endif
631#ifdef CONFIG_MEMORY_ISOLATION
631 "Isolate", 632 "Isolate",
633#endif
632}; 634};
633 635
634static void *frag_start(struct seq_file *m, loff_t *pos) 636static void *frag_start(struct seq_file *m, loff_t *pos)
@@ -768,7 +770,6 @@ const char * const vmstat_text[] = {
768 "kswapd_inodesteal", 770 "kswapd_inodesteal",
769 "kswapd_low_wmark_hit_quickly", 771 "kswapd_low_wmark_hit_quickly",
770 "kswapd_high_wmark_hit_quickly", 772 "kswapd_high_wmark_hit_quickly",
771 "kswapd_skip_congestion_wait",
772 "pageoutrun", 773 "pageoutrun",
773 "allocstall", 774 "allocstall",
774 775
@@ -890,7 +891,7 @@ static void pagetypeinfo_showblockcount_print(struct seq_file *m,
890 int mtype; 891 int mtype;
891 unsigned long pfn; 892 unsigned long pfn;
892 unsigned long start_pfn = zone->zone_start_pfn; 893 unsigned long start_pfn = zone->zone_start_pfn;
893 unsigned long end_pfn = start_pfn + zone->spanned_pages; 894 unsigned long end_pfn = zone_end_pfn(zone);
894 unsigned long count[MIGRATE_TYPES] = { 0, }; 895 unsigned long count[MIGRATE_TYPES] = { 0, };
895 896
896 for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) { 897 for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) {