Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig           |  33
-rw-r--r--  mm/Makefile          |   3
-rw-r--r--  mm/backing-dev.c     |   2
-rw-r--r--  mm/bounce.c          | 287
-rw-r--r--  mm/compaction.c      | 249
-rw-r--r--  mm/dmapool.c         |  31
-rw-r--r--  mm/filemap.c         | 248
-rw-r--r--  mm/fremap.c          |   7
-rw-r--r--  mm/frontswap.c       |  13
-rw-r--r--  mm/gup.c             | 662
-rw-r--r--  mm/huge_memory.c     |  34
-rw-r--r--  mm/hugetlb.c         | 363
-rw-r--r--  mm/internal.h        |  36
-rw-r--r--  mm/kmemleak.c        |   4
-rw-r--r--  mm/madvise.c         |   2
-rw-r--r--  mm/memblock.c        | 231
-rw-r--r--  mm/memcontrol.c      | 410
-rw-r--r--  mm/memory-failure.c  | 113
-rw-r--r--  mm/memory.c          | 746
-rw-r--r--  mm/memory_hotplug.c  | 148
-rw-r--r--  mm/mempolicy.c       |  30
-rw-r--r--  mm/mempool.c         |   2
-rw-r--r--  mm/migrate.c         |  63
-rw-r--r--  mm/mmap.c            |   9
-rw-r--r--  mm/msync.c           |   8
-rw-r--r--  mm/page-writeback.c  |  22
-rw-r--r--  mm/page_alloc.c      | 394
-rw-r--r--  mm/page_io.c         |  21
-rw-r--r--  mm/rmap.c            |  55
-rw-r--r--  mm/shmem.c           |   8
-rw-r--r--  mm/slab.c            |  45
-rw-r--r--  mm/slab.h            |  48
-rw-r--r--  mm/slab_common.c     |  95
-rw-r--r--  mm/slob.c            |   3
-rw-r--r--  mm/slub.c            | 225
-rw-r--r--  mm/swap.c            | 238
-rw-r--r--  mm/swap_state.c      |   2
-rw-r--r--  mm/swapfile.c        | 253
-rw-r--r--  mm/vmacache.c        |  22
-rw-r--r--  mm/vmalloc.c         |  13
-rw-r--r--  mm/vmscan.c          | 184
-rw-r--r--  mm/vmstat.c          |  12
-rw-r--r--  mm/zbud.c            |   4
-rw-r--r--  mm/zsmalloc.c        |   4
-rw-r--r--  mm/zswap.c           |   2
45 files changed, 2789 insertions(+), 2595 deletions(-)
diff --git a/mm/Kconfig b/mm/Kconfig
index ebe5880c29d6..3e9977a9d657 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -134,6 +134,9 @@ config HAVE_MEMBLOCK
134config HAVE_MEMBLOCK_NODE_MAP 134config HAVE_MEMBLOCK_NODE_MAP
135 boolean 135 boolean
136 136
137config HAVE_MEMBLOCK_PHYS_MAP
138 boolean
139
137config ARCH_DISCARD_MEMBLOCK 140config ARCH_DISCARD_MEMBLOCK
138 boolean 141 boolean
139 142
@@ -264,6 +267,9 @@ config MIGRATION
264 pages as migration can relocate pages to satisfy a huge page 267 pages as migration can relocate pages to satisfy a huge page
265 allocation instead of reclaiming. 268 allocation instead of reclaiming.
266 269
270config ARCH_ENABLE_HUGEPAGE_MIGRATION
271 boolean
272
267config PHYS_ADDR_T_64BIT 273config PHYS_ADDR_T_64BIT
268 def_bool 64BIT || ARCH_PHYS_ADDR_T_64BIT 274 def_bool 64BIT || ARCH_PHYS_ADDR_T_64BIT
269 275
@@ -430,16 +436,6 @@ choice
430 benefit. 436 benefit.
431endchoice 437endchoice
432 438
433config CROSS_MEMORY_ATTACH
434 bool "Cross Memory Support"
435 depends on MMU
436 default y
437 help
438 Enabling this option adds the system calls process_vm_readv and
439 process_vm_writev which allow a process with the correct privileges
440 to directly read from or write to to another process's address space.
441 See the man page for more details.
442
443# 439#
444# UP and nommu archs use km based percpu allocator 440# UP and nommu archs use km based percpu allocator
445# 441#
@@ -555,7 +551,7 @@ config MEM_SOFT_DIRTY
555 See Documentation/vm/soft-dirty.txt for more details. 551 See Documentation/vm/soft-dirty.txt for more details.
556 552
557config ZSMALLOC 553config ZSMALLOC
558 bool "Memory allocator for compressed pages" 554 tristate "Memory allocator for compressed pages"
559 depends on MMU 555 depends on MMU
560 default n 556 default n
561 help 557 help
@@ -581,3 +577,18 @@ config PGTABLE_MAPPING
581 577
582config GENERIC_EARLY_IOREMAP 578config GENERIC_EARLY_IOREMAP
583 bool 579 bool
580
581config MAX_STACK_SIZE_MB
582 int "Maximum user stack size for 32-bit processes (MB)"
583 default 80
584 range 8 256 if METAG
585 range 8 2048
586 depends on STACK_GROWSUP && (!64BIT || COMPAT)
587 help
588 This is the maximum stack size in Megabytes in the VM layout of 32-bit
589 user processes when the stack grows upwards (currently only on parisc
590 and metag arch). The stack will be located at the highest memory
591 address minus the given value, unless the RLIMIT_STACK hard limit is
592 changed to a smaller value in which case that is used.
593
594 A sane initial value is 80 MB.
diff --git a/mm/Makefile b/mm/Makefile
index b484452dac57..4064f3ec145e 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -3,7 +3,7 @@
3# 3#
4 4
5mmu-y := nommu.o 5mmu-y := nommu.o
6mmu-$(CONFIG_MMU) := fremap.o highmem.o madvise.o memory.o mincore.o \ 6mmu-$(CONFIG_MMU) := fremap.o gup.o highmem.o madvise.o memory.o mincore.o \
7 mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \ 7 mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \
8 vmalloc.o pagewalk.o pgtable-generic.o 8 vmalloc.o pagewalk.o pgtable-generic.o
9 9
@@ -30,7 +30,6 @@ endif
30 30
31obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o 31obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o
32 32
33obj-$(CONFIG_BOUNCE) += bounce.o
34obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o 33obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o
35obj-$(CONFIG_FRONTSWAP) += frontswap.o 34obj-$(CONFIG_FRONTSWAP) += frontswap.o
36obj-$(CONFIG_ZSWAP) += zswap.o 35obj-$(CONFIG_ZSWAP) += zswap.o
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 09d9591b7708..1706cbbdf5f0 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -557,7 +557,7 @@ void clear_bdi_congested(struct backing_dev_info *bdi, int sync)
557 bit = sync ? BDI_sync_congested : BDI_async_congested; 557 bit = sync ? BDI_sync_congested : BDI_async_congested;
558 if (test_and_clear_bit(bit, &bdi->state)) 558 if (test_and_clear_bit(bit, &bdi->state))
559 atomic_dec(&nr_bdi_congested[sync]); 559 atomic_dec(&nr_bdi_congested[sync]);
560 smp_mb__after_clear_bit(); 560 smp_mb__after_atomic();
561 if (waitqueue_active(wqh)) 561 if (waitqueue_active(wqh))
562 wake_up(wqh); 562 wake_up(wqh);
563} 563}
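
The backing-dev.c hunk above is one instance of the tree-wide rename of smp_mb__after_clear_bit() to smp_mb__after_atomic(); only the name changes, not the ordering guarantee. A commented restatement of the clear-then-wake pattern the barrier protects (same identifiers as the hunk, comments added here as a sketch, not taken verbatim from the patch):

        if (test_and_clear_bit(bit, &bdi->state))
                atomic_dec(&nr_bdi_congested[sync]);
        /*
         * Full barrier after the atomic op: the cleared congestion bit must be
         * visible before the waitqueue is sampled, so a task that is about to
         * sleep on wqh cannot miss the wakeup.
         */
        smp_mb__after_atomic();
        if (waitqueue_active(wqh))
                wake_up(wqh);
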
diff --git a/mm/bounce.c b/mm/bounce.c
deleted file mode 100644
index 523918b8c6dc..000000000000
--- a/mm/bounce.c
+++ /dev/null
@@ -1,287 +0,0 @@
1/* bounce buffer handling for block devices
2 *
3 * - Split from highmem.c
4 */
5
6#include <linux/mm.h>
7#include <linux/export.h>
8#include <linux/swap.h>
9#include <linux/gfp.h>
10#include <linux/bio.h>
11#include <linux/pagemap.h>
12#include <linux/mempool.h>
13#include <linux/blkdev.h>
14#include <linux/init.h>
15#include <linux/hash.h>
16#include <linux/highmem.h>
17#include <linux/bootmem.h>
18#include <asm/tlbflush.h>
19
20#include <trace/events/block.h>
21
22#define POOL_SIZE 64
23#define ISA_POOL_SIZE 16
24
25static mempool_t *page_pool, *isa_page_pool;
26
27#if defined(CONFIG_HIGHMEM) || defined(CONFIG_NEED_BOUNCE_POOL)
28static __init int init_emergency_pool(void)
29{
30#if defined(CONFIG_HIGHMEM) && !defined(CONFIG_MEMORY_HOTPLUG)
31 if (max_pfn <= max_low_pfn)
32 return 0;
33#endif
34
35 page_pool = mempool_create_page_pool(POOL_SIZE, 0);
36 BUG_ON(!page_pool);
37 printk("bounce pool size: %d pages\n", POOL_SIZE);
38
39 return 0;
40}
41
42__initcall(init_emergency_pool);
43#endif
44
45#ifdef CONFIG_HIGHMEM
46/*
47 * highmem version, map in to vec
48 */
49static void bounce_copy_vec(struct bio_vec *to, unsigned char *vfrom)
50{
51 unsigned long flags;
52 unsigned char *vto;
53
54 local_irq_save(flags);
55 vto = kmap_atomic(to->bv_page);
56 memcpy(vto + to->bv_offset, vfrom, to->bv_len);
57 kunmap_atomic(vto);
58 local_irq_restore(flags);
59}
60
61#else /* CONFIG_HIGHMEM */
62
63#define bounce_copy_vec(to, vfrom) \
64 memcpy(page_address((to)->bv_page) + (to)->bv_offset, vfrom, (to)->bv_len)
65
66#endif /* CONFIG_HIGHMEM */
67
68/*
69 * allocate pages in the DMA region for the ISA pool
70 */
71static void *mempool_alloc_pages_isa(gfp_t gfp_mask, void *data)
72{
73 return mempool_alloc_pages(gfp_mask | GFP_DMA, data);
74}
75
76/*
77 * gets called "every" time someone init's a queue with BLK_BOUNCE_ISA
78 * as the max address, so check if the pool has already been created.
79 */
80int init_emergency_isa_pool(void)
81{
82 if (isa_page_pool)
83 return 0;
84
85 isa_page_pool = mempool_create(ISA_POOL_SIZE, mempool_alloc_pages_isa,
86 mempool_free_pages, (void *) 0);
87 BUG_ON(!isa_page_pool);
88
89 printk("isa bounce pool size: %d pages\n", ISA_POOL_SIZE);
90 return 0;
91}
92
93/*
94 * Simple bounce buffer support for highmem pages. Depending on the
95 * queue gfp mask set, *to may or may not be a highmem page. kmap it
96 * always, it will do the Right Thing
97 */
98static void copy_to_high_bio_irq(struct bio *to, struct bio *from)
99{
100 unsigned char *vfrom;
101 struct bio_vec tovec, *fromvec = from->bi_io_vec;
102 struct bvec_iter iter;
103
104 bio_for_each_segment(tovec, to, iter) {
105 if (tovec.bv_page != fromvec->bv_page) {
106 /*
107 * fromvec->bv_offset and fromvec->bv_len might have
108 * been modified by the block layer, so use the original
109 * copy, bounce_copy_vec already uses tovec->bv_len
110 */
111 vfrom = page_address(fromvec->bv_page) +
112 tovec.bv_offset;
113
114 bounce_copy_vec(&tovec, vfrom);
115 flush_dcache_page(tovec.bv_page);
116 }
117
118 fromvec++;
119 }
120}
121
122static void bounce_end_io(struct bio *bio, mempool_t *pool, int err)
123{
124 struct bio *bio_orig = bio->bi_private;
125 struct bio_vec *bvec, *org_vec;
126 int i;
127
128 if (test_bit(BIO_EOPNOTSUPP, &bio->bi_flags))
129 set_bit(BIO_EOPNOTSUPP, &bio_orig->bi_flags);
130
131 /*
132 * free up bounce indirect pages used
133 */
134 bio_for_each_segment_all(bvec, bio, i) {
135 org_vec = bio_orig->bi_io_vec + i;
136 if (bvec->bv_page == org_vec->bv_page)
137 continue;
138
139 dec_zone_page_state(bvec->bv_page, NR_BOUNCE);
140 mempool_free(bvec->bv_page, pool);
141 }
142
143 bio_endio(bio_orig, err);
144 bio_put(bio);
145}
146
147static void bounce_end_io_write(struct bio *bio, int err)
148{
149 bounce_end_io(bio, page_pool, err);
150}
151
152static void bounce_end_io_write_isa(struct bio *bio, int err)
153{
154
155 bounce_end_io(bio, isa_page_pool, err);
156}
157
158static void __bounce_end_io_read(struct bio *bio, mempool_t *pool, int err)
159{
160 struct bio *bio_orig = bio->bi_private;
161
162 if (test_bit(BIO_UPTODATE, &bio->bi_flags))
163 copy_to_high_bio_irq(bio_orig, bio);
164
165 bounce_end_io(bio, pool, err);
166}
167
168static void bounce_end_io_read(struct bio *bio, int err)
169{
170 __bounce_end_io_read(bio, page_pool, err);
171}
172
173static void bounce_end_io_read_isa(struct bio *bio, int err)
174{
175 __bounce_end_io_read(bio, isa_page_pool, err);
176}
177
178#ifdef CONFIG_NEED_BOUNCE_POOL
179static int must_snapshot_stable_pages(struct request_queue *q, struct bio *bio)
180{
181 if (bio_data_dir(bio) != WRITE)
182 return 0;
183
184 if (!bdi_cap_stable_pages_required(&q->backing_dev_info))
185 return 0;
186
187 return test_bit(BIO_SNAP_STABLE, &bio->bi_flags);
188}
189#else
190static int must_snapshot_stable_pages(struct request_queue *q, struct bio *bio)
191{
192 return 0;
193}
194#endif /* CONFIG_NEED_BOUNCE_POOL */
195
196static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig,
197 mempool_t *pool, int force)
198{
199 struct bio *bio;
200 int rw = bio_data_dir(*bio_orig);
201 struct bio_vec *to, from;
202 struct bvec_iter iter;
203 unsigned i;
204
205 if (force)
206 goto bounce;
207 bio_for_each_segment(from, *bio_orig, iter)
208 if (page_to_pfn(from.bv_page) > queue_bounce_pfn(q))
209 goto bounce;
210
211 return;
212bounce:
213 bio = bio_clone_bioset(*bio_orig, GFP_NOIO, fs_bio_set);
214
215 bio_for_each_segment_all(to, bio, i) {
216 struct page *page = to->bv_page;
217
218 if (page_to_pfn(page) <= queue_bounce_pfn(q) && !force)
219 continue;
220
221 inc_zone_page_state(to->bv_page, NR_BOUNCE);
222 to->bv_page = mempool_alloc(pool, q->bounce_gfp);
223
224 if (rw == WRITE) {
225 char *vto, *vfrom;
226
227 flush_dcache_page(page);
228
229 vto = page_address(to->bv_page) + to->bv_offset;
230 vfrom = kmap_atomic(page) + to->bv_offset;
231 memcpy(vto, vfrom, to->bv_len);
232 kunmap_atomic(vfrom);
233 }
234 }
235
236 trace_block_bio_bounce(q, *bio_orig);
237
238 bio->bi_flags |= (1 << BIO_BOUNCED);
239
240 if (pool == page_pool) {
241 bio->bi_end_io = bounce_end_io_write;
242 if (rw == READ)
243 bio->bi_end_io = bounce_end_io_read;
244 } else {
245 bio->bi_end_io = bounce_end_io_write_isa;
246 if (rw == READ)
247 bio->bi_end_io = bounce_end_io_read_isa;
248 }
249
250 bio->bi_private = *bio_orig;
251 *bio_orig = bio;
252}
253
254void blk_queue_bounce(struct request_queue *q, struct bio **bio_orig)
255{
256 int must_bounce;
257 mempool_t *pool;
258
259 /*
260 * Data-less bio, nothing to bounce
261 */
262 if (!bio_has_data(*bio_orig))
263 return;
264
265 must_bounce = must_snapshot_stable_pages(q, *bio_orig);
266
267 /*
268 * for non-isa bounce case, just check if the bounce pfn is equal
269 * to or bigger than the highest pfn in the system -- in that case,
270 * don't waste time iterating over bio segments
271 */
272 if (!(q->bounce_gfp & GFP_DMA)) {
273 if (queue_bounce_pfn(q) >= blk_max_pfn && !must_bounce)
274 return;
275 pool = page_pool;
276 } else {
277 BUG_ON(!isa_page_pool);
278 pool = isa_page_pool;
279 }
280
281 /*
282 * slow path
283 */
284 __blk_queue_bounce(q, bio_orig, pool, must_bounce);
285}
286
287EXPORT_SYMBOL(blk_queue_bounce);
diff --git a/mm/compaction.c b/mm/compaction.c
index 627dc2e4320f..21bf292b642a 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -89,7 +89,8 @@ static void __reset_isolation_suitable(struct zone *zone)
89 unsigned long end_pfn = zone_end_pfn(zone); 89 unsigned long end_pfn = zone_end_pfn(zone);
90 unsigned long pfn; 90 unsigned long pfn;
91 91
92 zone->compact_cached_migrate_pfn = start_pfn; 92 zone->compact_cached_migrate_pfn[0] = start_pfn;
93 zone->compact_cached_migrate_pfn[1] = start_pfn;
93 zone->compact_cached_free_pfn = end_pfn; 94 zone->compact_cached_free_pfn = end_pfn;
94 zone->compact_blockskip_flush = false; 95 zone->compact_blockskip_flush = false;
95 96
@@ -131,9 +132,10 @@ void reset_isolation_suitable(pg_data_t *pgdat)
131 */ 132 */
132static void update_pageblock_skip(struct compact_control *cc, 133static void update_pageblock_skip(struct compact_control *cc,
133 struct page *page, unsigned long nr_isolated, 134 struct page *page, unsigned long nr_isolated,
134 bool migrate_scanner) 135 bool set_unsuitable, bool migrate_scanner)
135{ 136{
136 struct zone *zone = cc->zone; 137 struct zone *zone = cc->zone;
138 unsigned long pfn;
137 139
138 if (cc->ignore_skip_hint) 140 if (cc->ignore_skip_hint)
139 return; 141 return;
@@ -141,20 +143,32 @@ static void update_pageblock_skip(struct compact_control *cc,
141 if (!page) 143 if (!page)
142 return; 144 return;
143 145
144 if (!nr_isolated) { 146 if (nr_isolated)
145 unsigned long pfn = page_to_pfn(page); 147 return;
148
149 /*
150 * Only skip pageblocks when all forms of compaction will be known to
151 * fail in the near future.
152 */
153 if (set_unsuitable)
146 set_pageblock_skip(page); 154 set_pageblock_skip(page);
147 155
148 /* Update where compaction should restart */ 156 pfn = page_to_pfn(page);
149 if (migrate_scanner) { 157
150 if (!cc->finished_update_migrate && 158 /* Update where async and sync compaction should restart */
151 pfn > zone->compact_cached_migrate_pfn) 159 if (migrate_scanner) {
152 zone->compact_cached_migrate_pfn = pfn; 160 if (cc->finished_update_migrate)
153 } else { 161 return;
154 if (!cc->finished_update_free && 162 if (pfn > zone->compact_cached_migrate_pfn[0])
155 pfn < zone->compact_cached_free_pfn) 163 zone->compact_cached_migrate_pfn[0] = pfn;
156 zone->compact_cached_free_pfn = pfn; 164 if (cc->mode != MIGRATE_ASYNC &&
157 } 165 pfn > zone->compact_cached_migrate_pfn[1])
166 zone->compact_cached_migrate_pfn[1] = pfn;
167 } else {
168 if (cc->finished_update_free)
169 return;
170 if (pfn < zone->compact_cached_free_pfn)
171 zone->compact_cached_free_pfn = pfn;
158 } 172 }
159} 173}
160#else 174#else
@@ -166,7 +180,7 @@ static inline bool isolation_suitable(struct compact_control *cc,
166 180
167static void update_pageblock_skip(struct compact_control *cc, 181static void update_pageblock_skip(struct compact_control *cc,
168 struct page *page, unsigned long nr_isolated, 182 struct page *page, unsigned long nr_isolated,
169 bool migrate_scanner) 183 bool set_unsuitable, bool migrate_scanner)
170{ 184{
171} 185}
172#endif /* CONFIG_COMPACTION */ 186#endif /* CONFIG_COMPACTION */
@@ -195,7 +209,7 @@ static bool compact_checklock_irqsave(spinlock_t *lock, unsigned long *flags,
195 } 209 }
196 210
197 /* async aborts if taking too long or contended */ 211 /* async aborts if taking too long or contended */
198 if (!cc->sync) { 212 if (cc->mode == MIGRATE_ASYNC) {
199 cc->contended = true; 213 cc->contended = true;
200 return false; 214 return false;
201 } 215 }
@@ -208,10 +222,28 @@ static bool compact_checklock_irqsave(spinlock_t *lock, unsigned long *flags,
208 return true; 222 return true;
209} 223}
210 224
211static inline bool compact_trylock_irqsave(spinlock_t *lock, 225/*
212 unsigned long *flags, struct compact_control *cc) 226 * Aside from avoiding lock contention, compaction also periodically checks
227 * need_resched() and either schedules in sync compaction or aborts async
228 * compaction. This is similar to what compact_checklock_irqsave() does, but
229 * is used where no lock is concerned.
230 *
231 * Returns false when no scheduling was needed, or sync compaction scheduled.
232 * Returns true when async compaction should abort.
233 */
234static inline bool compact_should_abort(struct compact_control *cc)
213{ 235{
214 return compact_checklock_irqsave(lock, flags, false, cc); 236 /* async compaction aborts if contended */
237 if (need_resched()) {
238 if (cc->mode == MIGRATE_ASYNC) {
239 cc->contended = true;
240 return true;
241 }
242
243 cond_resched();
244 }
245
246 return false;
215} 247}
216 248
217/* Returns true if the page is within a block suitable for migration to */ 249/* Returns true if the page is within a block suitable for migration to */
@@ -329,7 +361,8 @@ isolate_fail:
329 361
330 /* Update the pageblock-skip if the whole pageblock was scanned */ 362 /* Update the pageblock-skip if the whole pageblock was scanned */
331 if (blockpfn == end_pfn) 363 if (blockpfn == end_pfn)
332 update_pageblock_skip(cc, valid_page, total_isolated, false); 364 update_pageblock_skip(cc, valid_page, total_isolated, true,
365 false);
333 366
334 count_compact_events(COMPACTFREE_SCANNED, nr_scanned); 367 count_compact_events(COMPACTFREE_SCANNED, nr_scanned);
335 if (total_isolated) 368 if (total_isolated)
@@ -464,8 +497,9 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
464 unsigned long flags; 497 unsigned long flags;
465 bool locked = false; 498 bool locked = false;
466 struct page *page = NULL, *valid_page = NULL; 499 struct page *page = NULL, *valid_page = NULL;
467 bool skipped_async_unsuitable = false; 500 bool set_unsuitable = true;
468 const isolate_mode_t mode = (!cc->sync ? ISOLATE_ASYNC_MIGRATE : 0) | 501 const isolate_mode_t mode = (cc->mode == MIGRATE_ASYNC ?
502 ISOLATE_ASYNC_MIGRATE : 0) |
469 (unevictable ? ISOLATE_UNEVICTABLE : 0); 503 (unevictable ? ISOLATE_UNEVICTABLE : 0);
470 504
471 /* 505 /*
@@ -475,7 +509,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
475 */ 509 */
476 while (unlikely(too_many_isolated(zone))) { 510 while (unlikely(too_many_isolated(zone))) {
477 /* async migration should just abort */ 511 /* async migration should just abort */
478 if (!cc->sync) 512 if (cc->mode == MIGRATE_ASYNC)
479 return 0; 513 return 0;
480 514
481 congestion_wait(BLK_RW_ASYNC, HZ/10); 515 congestion_wait(BLK_RW_ASYNC, HZ/10);
@@ -484,8 +518,10 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
484 return 0; 518 return 0;
485 } 519 }
486 520
521 if (compact_should_abort(cc))
522 return 0;
523
487 /* Time to isolate some pages for migration */ 524 /* Time to isolate some pages for migration */
488 cond_resched();
489 for (; low_pfn < end_pfn; low_pfn++) { 525 for (; low_pfn < end_pfn; low_pfn++) {
490 /* give a chance to irqs before checking need_resched() */ 526 /* give a chance to irqs before checking need_resched() */
491 if (locked && !(low_pfn % SWAP_CLUSTER_MAX)) { 527 if (locked && !(low_pfn % SWAP_CLUSTER_MAX)) {
@@ -540,9 +576,9 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
540 * the minimum amount of work satisfies the allocation 576 * the minimum amount of work satisfies the allocation
541 */ 577 */
542 mt = get_pageblock_migratetype(page); 578 mt = get_pageblock_migratetype(page);
543 if (!cc->sync && !migrate_async_suitable(mt)) { 579 if (cc->mode == MIGRATE_ASYNC &&
544 cc->finished_update_migrate = true; 580 !migrate_async_suitable(mt)) {
545 skipped_async_unsuitable = true; 581 set_unsuitable = false;
546 goto next_pageblock; 582 goto next_pageblock;
547 } 583 }
548 } 584 }
@@ -646,11 +682,10 @@ next_pageblock:
646 /* 682 /*
647 * Update the pageblock-skip information and cached scanner pfn, 683 * Update the pageblock-skip information and cached scanner pfn,
648 * if the whole pageblock was scanned without isolating any page. 684 * if the whole pageblock was scanned without isolating any page.
649 * This is not done when pageblock was skipped due to being unsuitable
650 * for async compaction, so that eventual sync compaction can try.
651 */ 685 */
652 if (low_pfn == end_pfn && !skipped_async_unsuitable) 686 if (low_pfn == end_pfn)
653 update_pageblock_skip(cc, valid_page, nr_isolated, true); 687 update_pageblock_skip(cc, valid_page, nr_isolated,
688 set_unsuitable, true);
654 689
655 trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated); 690 trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated);
656 691
@@ -671,7 +706,9 @@ static void isolate_freepages(struct zone *zone,
671 struct compact_control *cc) 706 struct compact_control *cc)
672{ 707{
673 struct page *page; 708 struct page *page;
674 unsigned long high_pfn, low_pfn, pfn, z_end_pfn; 709 unsigned long block_start_pfn; /* start of current pageblock */
710 unsigned long block_end_pfn; /* end of current pageblock */
711 unsigned long low_pfn; /* lowest pfn scanner is able to scan */
675 int nr_freepages = cc->nr_freepages; 712 int nr_freepages = cc->nr_freepages;
676 struct list_head *freelist = &cc->freepages; 713 struct list_head *freelist = &cc->freepages;
677 714
@@ -679,41 +716,38 @@ static void isolate_freepages(struct zone *zone,
679 * Initialise the free scanner. The starting point is where we last 716 * Initialise the free scanner. The starting point is where we last
680 * successfully isolated from, zone-cached value, or the end of the 717 * successfully isolated from, zone-cached value, or the end of the
681 * zone when isolating for the first time. We need this aligned to 718 * zone when isolating for the first time. We need this aligned to
682 * the pageblock boundary, because we do pfn -= pageblock_nr_pages 719 * the pageblock boundary, because we do
683 * in the for loop. 720 * block_start_pfn -= pageblock_nr_pages in the for loop.
721 * For ending point, take care when isolating in last pageblock of a
722 * a zone which ends in the middle of a pageblock.
684 * The low boundary is the end of the pageblock the migration scanner 723 * The low boundary is the end of the pageblock the migration scanner
685 * is using. 724 * is using.
686 */ 725 */
687 pfn = cc->free_pfn & ~(pageblock_nr_pages-1); 726 block_start_pfn = cc->free_pfn & ~(pageblock_nr_pages-1);
727 block_end_pfn = min(block_start_pfn + pageblock_nr_pages,
728 zone_end_pfn(zone));
688 low_pfn = ALIGN(cc->migrate_pfn + 1, pageblock_nr_pages); 729 low_pfn = ALIGN(cc->migrate_pfn + 1, pageblock_nr_pages);
689 730
690 /* 731 /*
691 * Take care that if the migration scanner is at the end of the zone
692 * that the free scanner does not accidentally move to the next zone
693 * in the next isolation cycle.
694 */
695 high_pfn = min(low_pfn, pfn);
696
697 z_end_pfn = zone_end_pfn(zone);
698
699 /*
700 * Isolate free pages until enough are available to migrate the 732 * Isolate free pages until enough are available to migrate the
701 * pages on cc->migratepages. We stop searching if the migrate 733 * pages on cc->migratepages. We stop searching if the migrate
702 * and free page scanners meet or enough free pages are isolated. 734 * and free page scanners meet or enough free pages are isolated.
703 */ 735 */
704 for (; pfn >= low_pfn && cc->nr_migratepages > nr_freepages; 736 for (; block_start_pfn >= low_pfn && cc->nr_migratepages > nr_freepages;
705 pfn -= pageblock_nr_pages) { 737 block_end_pfn = block_start_pfn,
738 block_start_pfn -= pageblock_nr_pages) {
706 unsigned long isolated; 739 unsigned long isolated;
707 unsigned long end_pfn;
708 740
709 /* 741 /*
710 * This can iterate a massively long zone without finding any 742 * This can iterate a massively long zone without finding any
711 * suitable migration targets, so periodically check if we need 743 * suitable migration targets, so periodically check if we need
712 * to schedule. 744 * to schedule, or even abort async compaction.
713 */ 745 */
714 cond_resched(); 746 if (!(block_start_pfn % (SWAP_CLUSTER_MAX * pageblock_nr_pages))
747 && compact_should_abort(cc))
748 break;
715 749
716 if (!pfn_valid(pfn)) 750 if (!pfn_valid(block_start_pfn))
717 continue; 751 continue;
718 752
719 /* 753 /*
@@ -723,7 +757,7 @@ static void isolate_freepages(struct zone *zone,
723 * i.e. it's possible that all pages within a zones range of 757 * i.e. it's possible that all pages within a zones range of
724 * pages do not belong to a single zone. 758 * pages do not belong to a single zone.
725 */ 759 */
726 page = pfn_to_page(pfn); 760 page = pfn_to_page(block_start_pfn);
727 if (page_zone(page) != zone) 761 if (page_zone(page) != zone)
728 continue; 762 continue;
729 763
@@ -736,26 +770,26 @@ static void isolate_freepages(struct zone *zone,
736 continue; 770 continue;
737 771
738 /* Found a block suitable for isolating free pages from */ 772 /* Found a block suitable for isolating free pages from */
739 isolated = 0; 773 cc->free_pfn = block_start_pfn;
774 isolated = isolate_freepages_block(cc, block_start_pfn,
775 block_end_pfn, freelist, false);
776 nr_freepages += isolated;
740 777
741 /* 778 /*
742 * Take care when isolating in last pageblock of a zone which 779 * Set a flag that we successfully isolated in this pageblock.
743 * ends in the middle of a pageblock. 780 * In the next loop iteration, zone->compact_cached_free_pfn
781 * will not be updated and thus it will effectively contain the
782 * highest pageblock we isolated pages from.
744 */ 783 */
745 end_pfn = min(pfn + pageblock_nr_pages, z_end_pfn); 784 if (isolated)
746 isolated = isolate_freepages_block(cc, pfn, end_pfn, 785 cc->finished_update_free = true;
747 freelist, false);
748 nr_freepages += isolated;
749 786
750 /* 787 /*
751 * Record the highest PFN we isolated pages from. When next 788 * isolate_freepages_block() might have aborted due to async
752 * looking for free pages, the search will restart here as 789 * compaction being contended
753 * page migration may have returned some pages to the allocator
754 */ 790 */
755 if (isolated) { 791 if (cc->contended)
756 cc->finished_update_free = true; 792 break;
757 high_pfn = max(high_pfn, pfn);
758 }
759 } 793 }
760 794
761 /* split_free_page does not map the pages */ 795 /* split_free_page does not map the pages */
@@ -765,10 +799,9 @@ static void isolate_freepages(struct zone *zone,
765 * If we crossed the migrate scanner, we want to keep it that way 799 * If we crossed the migrate scanner, we want to keep it that way
766 * so that compact_finished() may detect this 800 * so that compact_finished() may detect this
767 */ 801 */
768 if (pfn < low_pfn) 802 if (block_start_pfn < low_pfn)
769 cc->free_pfn = max(pfn, zone->zone_start_pfn); 803 cc->free_pfn = cc->migrate_pfn;
770 else 804
771 cc->free_pfn = high_pfn;
772 cc->nr_freepages = nr_freepages; 805 cc->nr_freepages = nr_freepages;
773} 806}
774 807
@@ -783,9 +816,13 @@ static struct page *compaction_alloc(struct page *migratepage,
783 struct compact_control *cc = (struct compact_control *)data; 816 struct compact_control *cc = (struct compact_control *)data;
784 struct page *freepage; 817 struct page *freepage;
785 818
786 /* Isolate free pages if necessary */ 819 /*
820 * Isolate free pages if necessary, and if we are not aborting due to
821 * contention.
822 */
787 if (list_empty(&cc->freepages)) { 823 if (list_empty(&cc->freepages)) {
788 isolate_freepages(cc->zone, cc); 824 if (!cc->contended)
825 isolate_freepages(cc->zone, cc);
789 826
790 if (list_empty(&cc->freepages)) 827 if (list_empty(&cc->freepages))
791 return NULL; 828 return NULL;
@@ -799,23 +836,16 @@ static struct page *compaction_alloc(struct page *migratepage,
799} 836}
800 837
801/* 838/*
802 * We cannot control nr_migratepages and nr_freepages fully when migration is 839 * This is a migrate-callback that "frees" freepages back to the isolated
803 * running as migrate_pages() has no knowledge of compact_control. When 840 * freelist. All pages on the freelist are from the same zone, so there is no
804 * migration is complete, we count the number of pages on the lists by hand. 841 * special handling needed for NUMA.
805 */ 842 */
806static void update_nr_listpages(struct compact_control *cc) 843static void compaction_free(struct page *page, unsigned long data)
807{ 844{
808 int nr_migratepages = 0; 845 struct compact_control *cc = (struct compact_control *)data;
809 int nr_freepages = 0;
810 struct page *page;
811
812 list_for_each_entry(page, &cc->migratepages, lru)
813 nr_migratepages++;
814 list_for_each_entry(page, &cc->freepages, lru)
815 nr_freepages++;
816 846
817 cc->nr_migratepages = nr_migratepages; 847 list_add(&page->lru, &cc->freepages);
818 cc->nr_freepages = nr_freepages; 848 cc->nr_freepages++;
819} 849}
820 850
821/* possible outcome of isolate_migratepages */ 851/* possible outcome of isolate_migratepages */
@@ -862,13 +892,14 @@ static int compact_finished(struct zone *zone,
862 unsigned int order; 892 unsigned int order;
863 unsigned long watermark; 893 unsigned long watermark;
864 894
865 if (fatal_signal_pending(current)) 895 if (cc->contended || fatal_signal_pending(current))
866 return COMPACT_PARTIAL; 896 return COMPACT_PARTIAL;
867 897
868 /* Compaction run completes if the migrate and free scanner meet */ 898 /* Compaction run completes if the migrate and free scanner meet */
869 if (cc->free_pfn <= cc->migrate_pfn) { 899 if (cc->free_pfn <= cc->migrate_pfn) {
870 /* Let the next compaction start anew. */ 900 /* Let the next compaction start anew. */
871 zone->compact_cached_migrate_pfn = zone->zone_start_pfn; 901 zone->compact_cached_migrate_pfn[0] = zone->zone_start_pfn;
902 zone->compact_cached_migrate_pfn[1] = zone->zone_start_pfn;
872 zone->compact_cached_free_pfn = zone_end_pfn(zone); 903 zone->compact_cached_free_pfn = zone_end_pfn(zone);
873 904
874 /* 905 /*
@@ -968,6 +999,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
968 int ret; 999 int ret;
969 unsigned long start_pfn = zone->zone_start_pfn; 1000 unsigned long start_pfn = zone->zone_start_pfn;
970 unsigned long end_pfn = zone_end_pfn(zone); 1001 unsigned long end_pfn = zone_end_pfn(zone);
1002 const bool sync = cc->mode != MIGRATE_ASYNC;
971 1003
972 ret = compaction_suitable(zone, cc->order); 1004 ret = compaction_suitable(zone, cc->order);
973 switch (ret) { 1005 switch (ret) {
@@ -993,7 +1025,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
993 * information on where the scanners should start but check that it 1025 * information on where the scanners should start but check that it
994 * is initialised by ensuring the values are within zone boundaries. 1026 * is initialised by ensuring the values are within zone boundaries.
995 */ 1027 */
996 cc->migrate_pfn = zone->compact_cached_migrate_pfn; 1028 cc->migrate_pfn = zone->compact_cached_migrate_pfn[sync];
997 cc->free_pfn = zone->compact_cached_free_pfn; 1029 cc->free_pfn = zone->compact_cached_free_pfn;
998 if (cc->free_pfn < start_pfn || cc->free_pfn > end_pfn) { 1030 if (cc->free_pfn < start_pfn || cc->free_pfn > end_pfn) {
999 cc->free_pfn = end_pfn & ~(pageblock_nr_pages-1); 1031 cc->free_pfn = end_pfn & ~(pageblock_nr_pages-1);
@@ -1001,7 +1033,8 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
1001 } 1033 }
1002 if (cc->migrate_pfn < start_pfn || cc->migrate_pfn > end_pfn) { 1034 if (cc->migrate_pfn < start_pfn || cc->migrate_pfn > end_pfn) {
1003 cc->migrate_pfn = start_pfn; 1035 cc->migrate_pfn = start_pfn;
1004 zone->compact_cached_migrate_pfn = cc->migrate_pfn; 1036 zone->compact_cached_migrate_pfn[0] = cc->migrate_pfn;
1037 zone->compact_cached_migrate_pfn[1] = cc->migrate_pfn;
1005 } 1038 }
1006 1039
1007 trace_mm_compaction_begin(start_pfn, cc->migrate_pfn, cc->free_pfn, end_pfn); 1040 trace_mm_compaction_begin(start_pfn, cc->migrate_pfn, cc->free_pfn, end_pfn);
@@ -1009,7 +1042,6 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
1009 migrate_prep_local(); 1042 migrate_prep_local();
1010 1043
1011 while ((ret = compact_finished(zone, cc)) == COMPACT_CONTINUE) { 1044 while ((ret = compact_finished(zone, cc)) == COMPACT_CONTINUE) {
1012 unsigned long nr_migrate, nr_remaining;
1013 int err; 1045 int err;
1014 1046
1015 switch (isolate_migratepages(zone, cc)) { 1047 switch (isolate_migratepages(zone, cc)) {
@@ -1024,21 +1056,20 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
1024 ; 1056 ;
1025 } 1057 }
1026 1058
1027 nr_migrate = cc->nr_migratepages; 1059 if (!cc->nr_migratepages)
1060 continue;
1061
1028 err = migrate_pages(&cc->migratepages, compaction_alloc, 1062 err = migrate_pages(&cc->migratepages, compaction_alloc,
1029 (unsigned long)cc, 1063 compaction_free, (unsigned long)cc, cc->mode,
1030 cc->sync ? MIGRATE_SYNC_LIGHT : MIGRATE_ASYNC,
1031 MR_COMPACTION); 1064 MR_COMPACTION);
1032 update_nr_listpages(cc);
1033 nr_remaining = cc->nr_migratepages;
1034 1065
1035 trace_mm_compaction_migratepages(nr_migrate - nr_remaining, 1066 trace_mm_compaction_migratepages(cc->nr_migratepages, err,
1036 nr_remaining); 1067 &cc->migratepages);
1037 1068
1038 /* Release isolated pages not migrated */ 1069 /* All pages were either migrated or will be released */
1070 cc->nr_migratepages = 0;
1039 if (err) { 1071 if (err) {
1040 putback_movable_pages(&cc->migratepages); 1072 putback_movable_pages(&cc->migratepages);
1041 cc->nr_migratepages = 0;
1042 /* 1073 /*
1043 * migrate_pages() may return -ENOMEM when scanners meet 1074 * migrate_pages() may return -ENOMEM when scanners meet
1044 * and we want compact_finished() to detect it 1075 * and we want compact_finished() to detect it
@@ -1060,9 +1091,8 @@ out:
1060 return ret; 1091 return ret;
1061} 1092}
1062 1093
1063static unsigned long compact_zone_order(struct zone *zone, 1094static unsigned long compact_zone_order(struct zone *zone, int order,
1064 int order, gfp_t gfp_mask, 1095 gfp_t gfp_mask, enum migrate_mode mode, bool *contended)
1065 bool sync, bool *contended)
1066{ 1096{
1067 unsigned long ret; 1097 unsigned long ret;
1068 struct compact_control cc = { 1098 struct compact_control cc = {
@@ -1071,7 +1101,7 @@ static unsigned long compact_zone_order(struct zone *zone,
1071 .order = order, 1101 .order = order,
1072 .migratetype = allocflags_to_migratetype(gfp_mask), 1102 .migratetype = allocflags_to_migratetype(gfp_mask),
1073 .zone = zone, 1103 .zone = zone,
1074 .sync = sync, 1104 .mode = mode,
1075 }; 1105 };
1076 INIT_LIST_HEAD(&cc.freepages); 1106 INIT_LIST_HEAD(&cc.freepages);
1077 INIT_LIST_HEAD(&cc.migratepages); 1107 INIT_LIST_HEAD(&cc.migratepages);
@@ -1093,7 +1123,7 @@ int sysctl_extfrag_threshold = 500;
1093 * @order: The order of the current allocation 1123 * @order: The order of the current allocation
1094 * @gfp_mask: The GFP mask of the current allocation 1124 * @gfp_mask: The GFP mask of the current allocation
1095 * @nodemask: The allowed nodes to allocate from 1125 * @nodemask: The allowed nodes to allocate from
1096 * @sync: Whether migration is synchronous or not 1126 * @mode: The migration mode for async, sync light, or sync migration
1097 * @contended: Return value that is true if compaction was aborted due to lock contention 1127 * @contended: Return value that is true if compaction was aborted due to lock contention
1098 * @page: Optionally capture a free page of the requested order during compaction 1128 * @page: Optionally capture a free page of the requested order during compaction
1099 * 1129 *
@@ -1101,7 +1131,7 @@ int sysctl_extfrag_threshold = 500;
1101 */ 1131 */
1102unsigned long try_to_compact_pages(struct zonelist *zonelist, 1132unsigned long try_to_compact_pages(struct zonelist *zonelist,
1103 int order, gfp_t gfp_mask, nodemask_t *nodemask, 1133 int order, gfp_t gfp_mask, nodemask_t *nodemask,
1104 bool sync, bool *contended) 1134 enum migrate_mode mode, bool *contended)
1105{ 1135{
1106 enum zone_type high_zoneidx = gfp_zone(gfp_mask); 1136 enum zone_type high_zoneidx = gfp_zone(gfp_mask);
1107 int may_enter_fs = gfp_mask & __GFP_FS; 1137 int may_enter_fs = gfp_mask & __GFP_FS;
@@ -1126,7 +1156,7 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist,
1126 nodemask) { 1156 nodemask) {
1127 int status; 1157 int status;
1128 1158
1129 status = compact_zone_order(zone, order, gfp_mask, sync, 1159 status = compact_zone_order(zone, order, gfp_mask, mode,
1130 contended); 1160 contended);
1131 rc = max(status, rc); 1161 rc = max(status, rc);
1132 1162
@@ -1165,9 +1195,6 @@ static void __compact_pgdat(pg_data_t *pgdat, struct compact_control *cc)
1165 if (zone_watermark_ok(zone, cc->order, 1195 if (zone_watermark_ok(zone, cc->order,
1166 low_wmark_pages(zone), 0, 0)) 1196 low_wmark_pages(zone), 0, 0))
1167 compaction_defer_reset(zone, cc->order, false); 1197 compaction_defer_reset(zone, cc->order, false);
1168 /* Currently async compaction is never deferred. */
1169 else if (cc->sync)
1170 defer_compaction(zone, cc->order);
1171 } 1198 }
1172 1199
1173 VM_BUG_ON(!list_empty(&cc->freepages)); 1200 VM_BUG_ON(!list_empty(&cc->freepages));
@@ -1179,7 +1206,7 @@ void compact_pgdat(pg_data_t *pgdat, int order)
1179{ 1206{
1180 struct compact_control cc = { 1207 struct compact_control cc = {
1181 .order = order, 1208 .order = order,
1182 .sync = false, 1209 .mode = MIGRATE_ASYNC,
1183 }; 1210 };
1184 1211
1185 if (!order) 1212 if (!order)
@@ -1192,7 +1219,7 @@ static void compact_node(int nid)
1192{ 1219{
1193 struct compact_control cc = { 1220 struct compact_control cc = {
1194 .order = -1, 1221 .order = -1,
1195 .sync = true, 1222 .mode = MIGRATE_SYNC,
1196 .ignore_skip_hint = true, 1223 .ignore_skip_hint = true,
1197 }; 1224 };
1198 1225
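
Taken together, the compaction.c hunks above replace the bool sync in struct compact_control with an enum migrate_mode, let the scanners abort contended async compaction via compact_should_abort(), and retire update_nr_listpages() by handing migrate_pages() a free callback. A condensed sketch of that callback contract, using only identifiers that appear in the hunks (not a verbatim excerpt):

        /*
         * compaction_free() is the put-back counterpart of compaction_alloc():
         * a target page that migration did not consume goes straight back onto
         * the per-compaction freelist, so the list counts stay exact and no
         * recount is needed after migrate_pages() returns.
         */
        static void compaction_free(struct page *page, unsigned long data)
        {
                struct compact_control *cc = (struct compact_control *)data;

                list_add(&page->lru, &cc->freepages);
                cc->nr_freepages++;
        }

        /* Caller side in compact_zone(); cc->mode replaces the old bool sync. */
        err = migrate_pages(&cc->migratepages, compaction_alloc,
                            compaction_free, (unsigned long)cc, cc->mode,
                            MR_COMPACTION);
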
diff --git a/mm/dmapool.c b/mm/dmapool.c
index c69781e97cf9..306baa594f95 100644
--- a/mm/dmapool.c
+++ b/mm/dmapool.c
@@ -170,24 +170,16 @@ struct dma_pool *dma_pool_create(const char *name, struct device *dev,
170 retval->boundary = boundary; 170 retval->boundary = boundary;
171 retval->allocation = allocation; 171 retval->allocation = allocation;
172 172
173 if (dev) { 173 INIT_LIST_HEAD(&retval->pools);
174 int ret;
175 174
176 mutex_lock(&pools_lock); 175 mutex_lock(&pools_lock);
177 if (list_empty(&dev->dma_pools)) 176 if (list_empty(&dev->dma_pools) &&
178 ret = device_create_file(dev, &dev_attr_pools); 177 device_create_file(dev, &dev_attr_pools)) {
179 else 178 kfree(retval);
180 ret = 0; 179 return NULL;
181 /* note: not currently insisting "name" be unique */
182 if (!ret)
183 list_add(&retval->pools, &dev->dma_pools);
184 else {
185 kfree(retval);
186 retval = NULL;
187 }
188 mutex_unlock(&pools_lock);
189 } else 180 } else
190 INIT_LIST_HEAD(&retval->pools); 181 list_add(&retval->pools, &dev->dma_pools);
182 mutex_unlock(&pools_lock);
191 183
192 return retval; 184 return retval;
193} 185}
@@ -341,10 +333,10 @@ void *dma_pool_alloc(struct dma_pool *pool, gfp_t mem_flags,
341 continue; 333 continue;
342 if (pool->dev) 334 if (pool->dev)
343 dev_err(pool->dev, 335 dev_err(pool->dev,
344 "dma_pool_alloc %s, %p (corruped)\n", 336 "dma_pool_alloc %s, %p (corrupted)\n",
345 pool->name, retval); 337 pool->name, retval);
346 else 338 else
347 pr_err("dma_pool_alloc %s, %p (corruped)\n", 339 pr_err("dma_pool_alloc %s, %p (corrupted)\n",
348 pool->name, retval); 340 pool->name, retval);
349 341
350 /* 342 /*
@@ -508,7 +500,6 @@ void dmam_pool_destroy(struct dma_pool *pool)
508{ 500{
509 struct device *dev = pool->dev; 501 struct device *dev = pool->dev;
510 502
511 WARN_ON(devres_destroy(dev, dmam_pool_release, dmam_pool_match, pool)); 503 WARN_ON(devres_release(dev, dmam_pool_release, dmam_pool_match, pool));
512 dma_pool_destroy(pool);
513} 504}
514EXPORT_SYMBOL(dmam_pool_destroy); 505EXPORT_SYMBOL(dmam_pool_destroy);
diff --git a/mm/filemap.c b/mm/filemap.c
index 000a220e2a41..7fadf1c62838 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -257,9 +257,11 @@ static int filemap_check_errors(struct address_space *mapping)
257{ 257{
258 int ret = 0; 258 int ret = 0;
259 /* Check for outstanding write errors */ 259 /* Check for outstanding write errors */
260 if (test_and_clear_bit(AS_ENOSPC, &mapping->flags)) 260 if (test_bit(AS_ENOSPC, &mapping->flags) &&
261 test_and_clear_bit(AS_ENOSPC, &mapping->flags))
261 ret = -ENOSPC; 262 ret = -ENOSPC;
262 if (test_and_clear_bit(AS_EIO, &mapping->flags)) 263 if (test_bit(AS_EIO, &mapping->flags) &&
264 test_and_clear_bit(AS_EIO, &mapping->flags))
263 ret = -EIO; 265 ret = -EIO;
264 return ret; 266 return ret;
265} 267}
@@ -740,7 +742,7 @@ void unlock_page(struct page *page)
740{ 742{
741 VM_BUG_ON_PAGE(!PageLocked(page), page); 743 VM_BUG_ON_PAGE(!PageLocked(page), page);
742 clear_bit_unlock(PG_locked, &page->flags); 744 clear_bit_unlock(PG_locked, &page->flags);
743 smp_mb__after_clear_bit(); 745 smp_mb__after_atomic();
744 wake_up_page(page, PG_locked); 746 wake_up_page(page, PG_locked);
745} 747}
746EXPORT_SYMBOL(unlock_page); 748EXPORT_SYMBOL(unlock_page);
@@ -751,17 +753,51 @@ EXPORT_SYMBOL(unlock_page);
751 */ 753 */
752void end_page_writeback(struct page *page) 754void end_page_writeback(struct page *page)
753{ 755{
754 if (TestClearPageReclaim(page)) 756 /*
757 * TestClearPageReclaim could be used here but it is an atomic
758 * operation and overkill in this particular case. Failing to
759 * shuffle a page marked for immediate reclaim is too mild to
760 * justify taking an atomic operation penalty at the end of
761 * ever page writeback.
762 */
763 if (PageReclaim(page)) {
764 ClearPageReclaim(page);
755 rotate_reclaimable_page(page); 765 rotate_reclaimable_page(page);
766 }
756 767
757 if (!test_clear_page_writeback(page)) 768 if (!test_clear_page_writeback(page))
758 BUG(); 769 BUG();
759 770
760 smp_mb__after_clear_bit(); 771 smp_mb__after_atomic();
761 wake_up_page(page, PG_writeback); 772 wake_up_page(page, PG_writeback);
762} 773}
763EXPORT_SYMBOL(end_page_writeback); 774EXPORT_SYMBOL(end_page_writeback);
764 775
776/*
777 * After completing I/O on a page, call this routine to update the page
778 * flags appropriately
779 */
780void page_endio(struct page *page, int rw, int err)
781{
782 if (rw == READ) {
783 if (!err) {
784 SetPageUptodate(page);
785 } else {
786 ClearPageUptodate(page);
787 SetPageError(page);
788 }
789 unlock_page(page);
790 } else { /* rw == WRITE */
791 if (err) {
792 SetPageError(page);
793 if (page->mapping)
794 mapping_set_error(page->mapping, err);
795 }
796 end_page_writeback(page);
797 }
798}
799EXPORT_SYMBOL_GPL(page_endio);
800
765/** 801/**
766 * __lock_page - get a lock on the page, assuming we need to sleep to get it 802 * __lock_page - get a lock on the page, assuming we need to sleep to get it
767 * @page: the page to lock 803 * @page: the page to lock
@@ -955,26 +991,6 @@ out:
955EXPORT_SYMBOL(find_get_entry); 991EXPORT_SYMBOL(find_get_entry);
956 992
957/** 993/**
958 * find_get_page - find and get a page reference
959 * @mapping: the address_space to search
960 * @offset: the page index
961 *
962 * Looks up the page cache slot at @mapping & @offset. If there is a
963 * page cache page, it is returned with an increased refcount.
964 *
965 * Otherwise, %NULL is returned.
966 */
967struct page *find_get_page(struct address_space *mapping, pgoff_t offset)
968{
969 struct page *page = find_get_entry(mapping, offset);
970
971 if (radix_tree_exceptional_entry(page))
972 page = NULL;
973 return page;
974}
975EXPORT_SYMBOL(find_get_page);
976
977/**
978 * find_lock_entry - locate, pin and lock a page cache entry 994 * find_lock_entry - locate, pin and lock a page cache entry
979 * @mapping: the address_space to search 995 * @mapping: the address_space to search
980 * @offset: the page cache index 996 * @offset: the page cache index
@@ -1011,66 +1027,84 @@ repeat:
1011EXPORT_SYMBOL(find_lock_entry); 1027EXPORT_SYMBOL(find_lock_entry);
1012 1028
1013/** 1029/**
1014 * find_lock_page - locate, pin and lock a pagecache page 1030 * pagecache_get_page - find and get a page reference
1015 * @mapping: the address_space to search 1031 * @mapping: the address_space to search
1016 * @offset: the page index 1032 * @offset: the page index
1033 * @fgp_flags: PCG flags
1034 * @gfp_mask: gfp mask to use if a page is to be allocated
1017 * 1035 *
1018 * Looks up the page cache slot at @mapping & @offset. If there is a 1036 * Looks up the page cache slot at @mapping & @offset.
1019 * page cache page, it is returned locked and with an increased
1020 * refcount.
1021 *
1022 * Otherwise, %NULL is returned.
1023 *
1024 * find_lock_page() may sleep.
1025 */
1026struct page *find_lock_page(struct address_space *mapping, pgoff_t offset)
1027{
1028 struct page *page = find_lock_entry(mapping, offset);
1029
1030 if (radix_tree_exceptional_entry(page))
1031 page = NULL;
1032 return page;
1033}
1034EXPORT_SYMBOL(find_lock_page);
1035
1036/**
1037 * find_or_create_page - locate or add a pagecache page
1038 * @mapping: the page's address_space
1039 * @index: the page's index into the mapping
1040 * @gfp_mask: page allocation mode
1041 * 1037 *
1042 * Looks up the page cache slot at @mapping & @offset. If there is a 1038 * PCG flags modify how the page is returned
1043 * page cache page, it is returned locked and with an increased
1044 * refcount.
1045 * 1039 *
1046 * If the page is not present, a new page is allocated using @gfp_mask 1040 * FGP_ACCESSED: the page will be marked accessed
1047 * and added to the page cache and the VM's LRU list. The page is 1041 * FGP_LOCK: Page is return locked
1048 * returned locked and with an increased refcount. 1042 * FGP_CREAT: If page is not present then a new page is allocated using
1043 * @gfp_mask and added to the page cache and the VM's LRU
1044 * list. The page is returned locked and with an increased
1045 * refcount. Otherwise, %NULL is returned.
1049 * 1046 *
1050 * On memory exhaustion, %NULL is returned. 1047 * If FGP_LOCK or FGP_CREAT are specified then the function may sleep even
1048 * if the GFP flags specified for FGP_CREAT are atomic.
1051 * 1049 *
1052 * find_or_create_page() may sleep, even if @gfp_flags specifies an 1050 * If there is a page cache page, it is returned with an increased refcount.
1053 * atomic allocation!
1054 */ 1051 */
1055struct page *find_or_create_page(struct address_space *mapping, 1052struct page *pagecache_get_page(struct address_space *mapping, pgoff_t offset,
1056 pgoff_t index, gfp_t gfp_mask) 1053 int fgp_flags, gfp_t cache_gfp_mask, gfp_t radix_gfp_mask)
1057{ 1054{
1058 struct page *page; 1055 struct page *page;
1059 int err; 1056
1060repeat: 1057repeat:
1061 page = find_lock_page(mapping, index); 1058 page = find_get_entry(mapping, offset);
1062 if (!page) { 1059 if (radix_tree_exceptional_entry(page))
1063 page = __page_cache_alloc(gfp_mask); 1060 page = NULL;
1061 if (!page)
1062 goto no_page;
1063
1064 if (fgp_flags & FGP_LOCK) {
1065 if (fgp_flags & FGP_NOWAIT) {
1066 if (!trylock_page(page)) {
1067 page_cache_release(page);
1068 return NULL;
1069 }
1070 } else {
1071 lock_page(page);
1072 }
1073
1074 /* Has the page been truncated? */
1075 if (unlikely(page->mapping != mapping)) {
1076 unlock_page(page);
1077 page_cache_release(page);
1078 goto repeat;
1079 }
1080 VM_BUG_ON_PAGE(page->index != offset, page);
1081 }
1082
1083 if (page && (fgp_flags & FGP_ACCESSED))
1084 mark_page_accessed(page);
1085
1086no_page:
1087 if (!page && (fgp_flags & FGP_CREAT)) {
1088 int err;
1089 if ((fgp_flags & FGP_WRITE) && mapping_cap_account_dirty(mapping))
1090 cache_gfp_mask |= __GFP_WRITE;
1091 if (fgp_flags & FGP_NOFS) {
1092 cache_gfp_mask &= ~__GFP_FS;
1093 radix_gfp_mask &= ~__GFP_FS;
1094 }
1095
1096 page = __page_cache_alloc(cache_gfp_mask);
1064 if (!page) 1097 if (!page)
1065 return NULL; 1098 return NULL;
1066 /* 1099
1067 * We want a regular kernel memory (not highmem or DMA etc) 1100 if (WARN_ON_ONCE(!(fgp_flags & FGP_LOCK)))
1068 * allocation for the radix tree nodes, but we need to honour 1101 fgp_flags |= FGP_LOCK;
1069 * the context-specific requirements the caller has asked for. 1102
1070 * GFP_RECLAIM_MASK collects those requirements. 1103 /* Init accessed so avoit atomic mark_page_accessed later */
1071 */ 1104 if (fgp_flags & FGP_ACCESSED)
1072 err = add_to_page_cache_lru(page, mapping, index, 1105 init_page_accessed(page);
1073 (gfp_mask & GFP_RECLAIM_MASK)); 1106
1107 err = add_to_page_cache_lru(page, mapping, offset, radix_gfp_mask);
1074 if (unlikely(err)) { 1108 if (unlikely(err)) {
1075 page_cache_release(page); 1109 page_cache_release(page);
1076 page = NULL; 1110 page = NULL;
@@ -1078,9 +1112,10 @@ repeat:
1078 goto repeat; 1112 goto repeat;
1079 } 1113 }
1080 } 1114 }
1115
1081 return page; 1116 return page;
1082} 1117}
1083EXPORT_SYMBOL(find_or_create_page); 1118EXPORT_SYMBOL(pagecache_get_page);
1084 1119
1085/** 1120/**
1086 * find_get_entries - gang pagecache lookup 1121 * find_get_entries - gang pagecache lookup
@@ -1377,39 +1412,6 @@ repeat:
1377} 1412}
1378EXPORT_SYMBOL(find_get_pages_tag); 1413EXPORT_SYMBOL(find_get_pages_tag);
1379 1414
1380/**
1381 * grab_cache_page_nowait - returns locked page at given index in given cache
1382 * @mapping: target address_space
1383 * @index: the page index
1384 *
1385 * Same as grab_cache_page(), but do not wait if the page is unavailable.
1386 * This is intended for speculative data generators, where the data can
1387 * be regenerated if the page couldn't be grabbed. This routine should
1388 * be safe to call while holding the lock for another page.
1389 *
1390 * Clear __GFP_FS when allocating the page to avoid recursion into the fs
1391 * and deadlock against the caller's locked page.
1392 */
1393struct page *
1394grab_cache_page_nowait(struct address_space *mapping, pgoff_t index)
1395{
1396 struct page *page = find_get_page(mapping, index);
1397
1398 if (page) {
1399 if (trylock_page(page))
1400 return page;
1401 page_cache_release(page);
1402 return NULL;
1403 }
1404 page = __page_cache_alloc(mapping_gfp_mask(mapping) & ~__GFP_FS);
1405 if (page && add_to_page_cache_lru(page, mapping, index, GFP_NOFS)) {
1406 page_cache_release(page);
1407 page = NULL;
1408 }
1409 return page;
1410}
1411EXPORT_SYMBOL(grab_cache_page_nowait);
1412
1413/* 1415/*
1414 * CD/DVDs are error prone. When a medium error occurs, the driver may fail 1416 * CD/DVDs are error prone. When a medium error occurs, the driver may fail
1415 * a _large_ part of the i/o request. Imagine the worst scenario: 1417 * a _large_ part of the i/o request. Imagine the worst scenario:
@@ -2379,7 +2381,6 @@ int pagecache_write_end(struct file *file, struct address_space *mapping,
2379{ 2381{
2380 const struct address_space_operations *aops = mapping->a_ops; 2382 const struct address_space_operations *aops = mapping->a_ops;
2381 2383
2382 mark_page_accessed(page);
2383 return aops->write_end(file, mapping, pos, len, copied, page, fsdata); 2384 return aops->write_end(file, mapping, pos, len, copied, page, fsdata);
2384} 2385}
2385EXPORT_SYMBOL(pagecache_write_end); 2386EXPORT_SYMBOL(pagecache_write_end);
@@ -2461,34 +2462,18 @@ EXPORT_SYMBOL(generic_file_direct_write);
2461struct page *grab_cache_page_write_begin(struct address_space *mapping, 2462struct page *grab_cache_page_write_begin(struct address_space *mapping,
2462 pgoff_t index, unsigned flags) 2463 pgoff_t index, unsigned flags)
2463{ 2464{
2464 int status;
2465 gfp_t gfp_mask;
2466 struct page *page; 2465 struct page *page;
2467 gfp_t gfp_notmask = 0; 2466 int fgp_flags = FGP_LOCK|FGP_ACCESSED|FGP_WRITE|FGP_CREAT;
2468 2467
2469 gfp_mask = mapping_gfp_mask(mapping);
2470 if (mapping_cap_account_dirty(mapping))
2471 gfp_mask |= __GFP_WRITE;
2472 if (flags & AOP_FLAG_NOFS) 2468 if (flags & AOP_FLAG_NOFS)
2473 gfp_notmask = __GFP_FS; 2469 fgp_flags |= FGP_NOFS;
2474repeat: 2470
2475 page = find_lock_page(mapping, index); 2471 page = pagecache_get_page(mapping, index, fgp_flags,
2472 mapping_gfp_mask(mapping),
2473 GFP_KERNEL);
2476 if (page) 2474 if (page)
2477 goto found; 2475 wait_for_stable_page(page);
2478 2476
2479 page = __page_cache_alloc(gfp_mask & ~gfp_notmask);
2480 if (!page)
2481 return NULL;
2482 status = add_to_page_cache_lru(page, mapping, index,
2483 GFP_KERNEL & ~gfp_notmask);
2484 if (unlikely(status)) {
2485 page_cache_release(page);
2486 if (status == -EEXIST)
2487 goto repeat;
2488 return NULL;
2489 }
2490found:
2491 wait_for_stable_page(page);
2492 return page; 2477 return page;
2493} 2478}
2494EXPORT_SYMBOL(grab_cache_page_write_begin); 2479EXPORT_SYMBOL(grab_cache_page_write_begin);
@@ -2537,7 +2522,7 @@ again:
2537 2522
2538 status = a_ops->write_begin(file, mapping, pos, bytes, flags, 2523 status = a_ops->write_begin(file, mapping, pos, bytes, flags,
2539 &page, &fsdata); 2524 &page, &fsdata);
2540 if (unlikely(status)) 2525 if (unlikely(status < 0))
2541 break; 2526 break;
2542 2527
2543 if (mapping_writably_mapped(mapping)) 2528 if (mapping_writably_mapped(mapping))
@@ -2546,7 +2531,6 @@ again:
2546 copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes); 2531 copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes);
2547 flush_dcache_page(page); 2532 flush_dcache_page(page);
2548 2533
2549 mark_page_accessed(page);
2550 status = a_ops->write_end(file, mapping, pos, bytes, copied, 2534 status = a_ops->write_end(file, mapping, pos, bytes, copied,
2551 page, fsdata); 2535 page, fsdata);
2552 if (unlikely(status < 0)) 2536 if (unlikely(status < 0))
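
The filemap.c hunks above fold find_get_page(), find_lock_page(), find_or_create_page() and grab_cache_page_nowait() into a single pagecache_get_page() driven by FGP_* flags. As a hedged sketch of how callers map onto the new interface (the equivalences below are inferred from the behaviour shown in the hunks, not quoted from this diff):

        /* Plain lookup (old find_get_page): the gfp arguments are only
         * consulted when FGP_CREAT is set, so 0 is fine here. */
        page = pagecache_get_page(mapping, offset, 0, 0, 0);

        /* Lookup and lock (old find_lock_page). */
        page = pagecache_get_page(mapping, offset, FGP_LOCK, 0, 0);

        /* Lookup or allocate-and-add (old find_or_create_page); the
         * GFP_RECLAIM_MASK restriction mirrors the old radix-tree insert. */
        page = pagecache_get_page(mapping, offset,
                                  FGP_LOCK | FGP_ACCESSED | FGP_CREAT,
                                  gfp_mask, gfp_mask & GFP_RECLAIM_MASK);

grab_cache_page_write_begin() in the hunk above shows the same pattern with FGP_WRITE and, for AOP_FLAG_NOFS callers, FGP_NOFS added on top.
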
diff --git a/mm/fremap.c b/mm/fremap.c
index 34feba60a17e..2c5646f11f41 100644
--- a/mm/fremap.c
+++ b/mm/fremap.c
@@ -82,13 +82,10 @@ static int install_file_pte(struct mm_struct *mm, struct vm_area_struct *vma,
82 82
83 ptfile = pgoff_to_pte(pgoff); 83 ptfile = pgoff_to_pte(pgoff);
84 84
85 if (!pte_none(*pte)) { 85 if (!pte_none(*pte))
86 if (pte_present(*pte) && pte_soft_dirty(*pte))
87 pte_file_mksoft_dirty(ptfile);
88 zap_pte(mm, vma, addr, pte); 86 zap_pte(mm, vma, addr, pte);
89 }
90 87
91 set_pte_at(mm, addr, pte, ptfile); 88 set_pte_at(mm, addr, pte, pte_file_mksoft_dirty(ptfile));
92 /* 89 /*
93 * We don't need to run update_mmu_cache() here because the "file pte" 90 * We don't need to run update_mmu_cache() here because the "file pte"
94 * being installed by install_file_pte() is not a real pte - it's a 91 * being installed by install_file_pte() is not a real pte - it's a
diff --git a/mm/frontswap.c b/mm/frontswap.c
index 1b24bdcb3197..c30eec536f03 100644
--- a/mm/frontswap.c
+++ b/mm/frontswap.c
@@ -327,15 +327,12 @@ EXPORT_SYMBOL(__frontswap_invalidate_area);
327 327
328static unsigned long __frontswap_curr_pages(void) 328static unsigned long __frontswap_curr_pages(void)
329{ 329{
330 int type;
331 unsigned long totalpages = 0; 330 unsigned long totalpages = 0;
332 struct swap_info_struct *si = NULL; 331 struct swap_info_struct *si = NULL;
333 332
334 assert_spin_locked(&swap_lock); 333 assert_spin_locked(&swap_lock);
335 for (type = swap_list.head; type >= 0; type = si->next) { 334 plist_for_each_entry(si, &swap_active_head, list)
336 si = swap_info[type];
337 totalpages += atomic_read(&si->frontswap_pages); 335 totalpages += atomic_read(&si->frontswap_pages);
338 }
339 return totalpages; 336 return totalpages;
340} 337}
341 338
@@ -347,11 +344,9 @@ static int __frontswap_unuse_pages(unsigned long total, unsigned long *unused,
347 int si_frontswap_pages; 344 int si_frontswap_pages;
348 unsigned long total_pages_to_unuse = total; 345 unsigned long total_pages_to_unuse = total;
349 unsigned long pages = 0, pages_to_unuse = 0; 346 unsigned long pages = 0, pages_to_unuse = 0;
350 int type;
351 347
352 assert_spin_locked(&swap_lock); 348 assert_spin_locked(&swap_lock);
353 for (type = swap_list.head; type >= 0; type = si->next) { 349 plist_for_each_entry(si, &swap_active_head, list) {
354 si = swap_info[type];
355 si_frontswap_pages = atomic_read(&si->frontswap_pages); 350 si_frontswap_pages = atomic_read(&si->frontswap_pages);
356 if (total_pages_to_unuse < si_frontswap_pages) { 351 if (total_pages_to_unuse < si_frontswap_pages) {
357 pages = pages_to_unuse = total_pages_to_unuse; 352 pages = pages_to_unuse = total_pages_to_unuse;
@@ -366,7 +361,7 @@ static int __frontswap_unuse_pages(unsigned long total, unsigned long *unused,
366 } 361 }
367 vm_unacct_memory(pages); 362 vm_unacct_memory(pages);
368 *unused = pages_to_unuse; 363 *unused = pages_to_unuse;
369 *swapid = type; 364 *swapid = si->type;
370 ret = 0; 365 ret = 0;
371 break; 366 break;
372 } 367 }
@@ -413,7 +408,7 @@ void frontswap_shrink(unsigned long target_pages)
413 /* 408 /*
414 * we don't want to hold swap_lock while doing a very 409 * we don't want to hold swap_lock while doing a very
415 * lengthy try_to_unuse, but swap_list may change 410 * lengthy try_to_unuse, but swap_list may change
416 * so restart scan from swap_list.head each time 411 * so restart scan from swap_active_head each time
417 */ 412 */
418 spin_lock(&swap_lock); 413 spin_lock(&swap_lock);
419 ret = __frontswap_shrink(target_pages, &pages_to_unuse, &type); 414 ret = __frontswap_shrink(target_pages, &pages_to_unuse, &type);
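The hunks above replace the old index-chained walk over swap_info[] (swap_list.head / si->next) with the priority-list iterator from <linux/plist.h>, so frontswap now visits the active swap devices in the same priority order as the swap allocator. A minimal sketch of the new iteration pattern, assuming the swap_active_head plist and the swap_info_struct 'list'/'type' members introduced by the swapfile.c changes in this series:

	struct swap_info_struct *si;
	unsigned long total = 0;

	assert_spin_locked(&swap_lock);
	/* walk active swap devices in priority order */
	plist_for_each_entry(si, &swap_active_head, list)
		total += atomic_read(&si->frontswap_pages);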
diff --git a/mm/gup.c b/mm/gup.c
new file mode 100644
index 000000000000..cc5a9e7adea7
--- /dev/null
+++ b/mm/gup.c
@@ -0,0 +1,662 @@
1#include <linux/kernel.h>
2#include <linux/errno.h>
3#include <linux/err.h>
4#include <linux/spinlock.h>
5
6#include <linux/hugetlb.h>
7#include <linux/mm.h>
8#include <linux/pagemap.h>
9#include <linux/rmap.h>
10#include <linux/swap.h>
11#include <linux/swapops.h>
12
13#include "internal.h"
14
15static struct page *no_page_table(struct vm_area_struct *vma,
16 unsigned int flags)
17{
18 /*
19 * When core dumping an enormous anonymous area that nobody
20 * has touched so far, we don't want to allocate unnecessary pages or
21 * page tables. Return error instead of NULL to skip handle_mm_fault,
22 * then get_dump_page() will return NULL to leave a hole in the dump.
23 * But we can only make this optimization where a hole would surely
24 * be zero-filled if handle_mm_fault() actually did handle it.
25 */
26 if ((flags & FOLL_DUMP) && (!vma->vm_ops || !vma->vm_ops->fault))
27 return ERR_PTR(-EFAULT);
28 return NULL;
29}
30
31static struct page *follow_page_pte(struct vm_area_struct *vma,
32 unsigned long address, pmd_t *pmd, unsigned int flags)
33{
34 struct mm_struct *mm = vma->vm_mm;
35 struct page *page;
36 spinlock_t *ptl;
37 pte_t *ptep, pte;
38
39retry:
40 if (unlikely(pmd_bad(*pmd)))
41 return no_page_table(vma, flags);
42
43 ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
44 pte = *ptep;
45 if (!pte_present(pte)) {
46 swp_entry_t entry;
47 /*
48 * KSM's break_ksm() relies upon recognizing a ksm page
49 * even while it is being migrated, so for that case we
50 * need migration_entry_wait().
51 */
52 if (likely(!(flags & FOLL_MIGRATION)))
53 goto no_page;
54 if (pte_none(pte) || pte_file(pte))
55 goto no_page;
56 entry = pte_to_swp_entry(pte);
57 if (!is_migration_entry(entry))
58 goto no_page;
59 pte_unmap_unlock(ptep, ptl);
60 migration_entry_wait(mm, pmd, address);
61 goto retry;
62 }
63 if ((flags & FOLL_NUMA) && pte_numa(pte))
64 goto no_page;
65 if ((flags & FOLL_WRITE) && !pte_write(pte)) {
66 pte_unmap_unlock(ptep, ptl);
67 return NULL;
68 }
69
70 page = vm_normal_page(vma, address, pte);
71 if (unlikely(!page)) {
72 if ((flags & FOLL_DUMP) ||
73 !is_zero_pfn(pte_pfn(pte)))
74 goto bad_page;
75 page = pte_page(pte);
76 }
77
78 if (flags & FOLL_GET)
79 get_page_foll(page);
80 if (flags & FOLL_TOUCH) {
81 if ((flags & FOLL_WRITE) &&
82 !pte_dirty(pte) && !PageDirty(page))
83 set_page_dirty(page);
84 /*
85 * pte_mkyoung() would be more correct here, but atomic care
86 * is needed to avoid losing the dirty bit: it is easier to use
87 * mark_page_accessed().
88 */
89 mark_page_accessed(page);
90 }
91 if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
92 /*
93 * The preliminary mapping check is mainly to avoid the
94 * pointless overhead of lock_page on the ZERO_PAGE
95 * which might bounce very badly if there is contention.
96 *
97 * If the page is already locked, we don't need to
98 * handle it now - vmscan will handle it later if and
99 * when it attempts to reclaim the page.
100 */
101 if (page->mapping && trylock_page(page)) {
102 lru_add_drain(); /* push cached pages to LRU */
103 /*
104 * Because we lock page here, and migration is
105 * blocked by the pte's page reference, and we
106 * know the page is still mapped, we don't even
107 * need to check for file-cache page truncation.
108 */
109 mlock_vma_page(page);
110 unlock_page(page);
111 }
112 }
113 pte_unmap_unlock(ptep, ptl);
114 return page;
115bad_page:
116 pte_unmap_unlock(ptep, ptl);
117 return ERR_PTR(-EFAULT);
118
119no_page:
120 pte_unmap_unlock(ptep, ptl);
121 if (!pte_none(pte))
122 return NULL;
123 return no_page_table(vma, flags);
124}
125
126/**
127 * follow_page_mask - look up a page descriptor from a user-virtual address
128 * @vma: vm_area_struct mapping @address
129 * @address: virtual address to look up
130 * @flags: flags modifying lookup behaviour
131 * @page_mask: on output, *page_mask is set according to the size of the page
132 *
133 * @flags can have FOLL_ flags set, defined in <linux/mm.h>
134 *
135 * Returns the mapped (struct page *), %NULL if no mapping exists, or
136 * an error pointer if there is a mapping to something not represented
137 * by a page descriptor (see also vm_normal_page()).
138 */
139struct page *follow_page_mask(struct vm_area_struct *vma,
140 unsigned long address, unsigned int flags,
141 unsigned int *page_mask)
142{
143 pgd_t *pgd;
144 pud_t *pud;
145 pmd_t *pmd;
146 spinlock_t *ptl;
147 struct page *page;
148 struct mm_struct *mm = vma->vm_mm;
149
150 *page_mask = 0;
151
152 page = follow_huge_addr(mm, address, flags & FOLL_WRITE);
153 if (!IS_ERR(page)) {
154 BUG_ON(flags & FOLL_GET);
155 return page;
156 }
157
158 pgd = pgd_offset(mm, address);
159 if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
160 return no_page_table(vma, flags);
161
162 pud = pud_offset(pgd, address);
163 if (pud_none(*pud))
164 return no_page_table(vma, flags);
165 if (pud_huge(*pud) && vma->vm_flags & VM_HUGETLB) {
166 if (flags & FOLL_GET)
167 return NULL;
168 page = follow_huge_pud(mm, address, pud, flags & FOLL_WRITE);
169 return page;
170 }
171 if (unlikely(pud_bad(*pud)))
172 return no_page_table(vma, flags);
173
174 pmd = pmd_offset(pud, address);
175 if (pmd_none(*pmd))
176 return no_page_table(vma, flags);
177 if (pmd_huge(*pmd) && vma->vm_flags & VM_HUGETLB) {
178 page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE);
179 if (flags & FOLL_GET) {
180 /*
 181 * Refcounts on tail pages are not well-defined and
182 * shouldn't be taken. The caller should handle a NULL
183 * return when trying to follow tail pages.
184 */
185 if (PageHead(page))
186 get_page(page);
187 else
188 page = NULL;
189 }
190 return page;
191 }
192 if ((flags & FOLL_NUMA) && pmd_numa(*pmd))
193 return no_page_table(vma, flags);
194 if (pmd_trans_huge(*pmd)) {
195 if (flags & FOLL_SPLIT) {
196 split_huge_page_pmd(vma, address, pmd);
197 return follow_page_pte(vma, address, pmd, flags);
198 }
199 ptl = pmd_lock(mm, pmd);
200 if (likely(pmd_trans_huge(*pmd))) {
201 if (unlikely(pmd_trans_splitting(*pmd))) {
202 spin_unlock(ptl);
203 wait_split_huge_page(vma->anon_vma, pmd);
204 } else {
205 page = follow_trans_huge_pmd(vma, address,
206 pmd, flags);
207 spin_unlock(ptl);
208 *page_mask = HPAGE_PMD_NR - 1;
209 return page;
210 }
211 } else
212 spin_unlock(ptl);
213 }
214 return follow_page_pte(vma, address, pmd, flags);
215}
216
217static int get_gate_page(struct mm_struct *mm, unsigned long address,
218 unsigned int gup_flags, struct vm_area_struct **vma,
219 struct page **page)
220{
221 pgd_t *pgd;
222 pud_t *pud;
223 pmd_t *pmd;
224 pte_t *pte;
225 int ret = -EFAULT;
226
227 /* user gate pages are read-only */
228 if (gup_flags & FOLL_WRITE)
229 return -EFAULT;
230 if (address > TASK_SIZE)
231 pgd = pgd_offset_k(address);
232 else
233 pgd = pgd_offset_gate(mm, address);
234 BUG_ON(pgd_none(*pgd));
235 pud = pud_offset(pgd, address);
236 BUG_ON(pud_none(*pud));
237 pmd = pmd_offset(pud, address);
238 if (pmd_none(*pmd))
239 return -EFAULT;
240 VM_BUG_ON(pmd_trans_huge(*pmd));
241 pte = pte_offset_map(pmd, address);
242 if (pte_none(*pte))
243 goto unmap;
244 *vma = get_gate_vma(mm);
245 if (!page)
246 goto out;
247 *page = vm_normal_page(*vma, address, *pte);
248 if (!*page) {
249 if ((gup_flags & FOLL_DUMP) || !is_zero_pfn(pte_pfn(*pte)))
250 goto unmap;
251 *page = pte_page(*pte);
252 }
253 get_page(*page);
254out:
255 ret = 0;
256unmap:
257 pte_unmap(pte);
258 return ret;
259}
260
261static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma,
262 unsigned long address, unsigned int *flags, int *nonblocking)
263{
264 struct mm_struct *mm = vma->vm_mm;
265 unsigned int fault_flags = 0;
266 int ret;
267
268 /* For mlock, just skip the stack guard page. */
269 if ((*flags & FOLL_MLOCK) &&
270 (stack_guard_page_start(vma, address) ||
271 stack_guard_page_end(vma, address + PAGE_SIZE)))
272 return -ENOENT;
273 if (*flags & FOLL_WRITE)
274 fault_flags |= FAULT_FLAG_WRITE;
275 if (nonblocking)
276 fault_flags |= FAULT_FLAG_ALLOW_RETRY;
277 if (*flags & FOLL_NOWAIT)
278 fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT;
279
280 ret = handle_mm_fault(mm, vma, address, fault_flags);
281 if (ret & VM_FAULT_ERROR) {
282 if (ret & VM_FAULT_OOM)
283 return -ENOMEM;
284 if (ret & (VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE))
285 return *flags & FOLL_HWPOISON ? -EHWPOISON : -EFAULT;
286 if (ret & VM_FAULT_SIGBUS)
287 return -EFAULT;
288 BUG();
289 }
290
291 if (tsk) {
292 if (ret & VM_FAULT_MAJOR)
293 tsk->maj_flt++;
294 else
295 tsk->min_flt++;
296 }
297
298 if (ret & VM_FAULT_RETRY) {
299 if (nonblocking)
300 *nonblocking = 0;
301 return -EBUSY;
302 }
303
304 /*
305 * The VM_FAULT_WRITE bit tells us that do_wp_page has broken COW when
306 * necessary, even if maybe_mkwrite decided not to set pte_write. We
307 * can thus safely do subsequent page lookups as if they were reads.
308 * But only do so when looping for pte_write is futile: in some cases
309 * userspace may also be wanting to write to the gotten user page,
310 * which a read fault here might prevent (a readonly page might get
311 * reCOWed by userspace write).
312 */
313 if ((ret & VM_FAULT_WRITE) && !(vma->vm_flags & VM_WRITE))
314 *flags &= ~FOLL_WRITE;
315 return 0;
316}
317
318static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags)
319{
320 vm_flags_t vm_flags = vma->vm_flags;
321
322 if (vm_flags & (VM_IO | VM_PFNMAP))
323 return -EFAULT;
324
325 if (gup_flags & FOLL_WRITE) {
326 if (!(vm_flags & VM_WRITE)) {
327 if (!(gup_flags & FOLL_FORCE))
328 return -EFAULT;
329 /*
330 * We used to let the write,force case do COW in a
331 * VM_MAYWRITE VM_SHARED !VM_WRITE vma, so ptrace could
332 * set a breakpoint in a read-only mapping of an
333 * executable, without corrupting the file (yet only
334 * when that file had been opened for writing!).
335 * Anon pages in shared mappings are surprising: now
336 * just reject it.
337 */
338 if (!is_cow_mapping(vm_flags)) {
339 WARN_ON_ONCE(vm_flags & VM_MAYWRITE);
340 return -EFAULT;
341 }
342 }
343 } else if (!(vm_flags & VM_READ)) {
344 if (!(gup_flags & FOLL_FORCE))
345 return -EFAULT;
346 /*
347 * Is there actually any vma we can reach here which does not
348 * have VM_MAYREAD set?
349 */
350 if (!(vm_flags & VM_MAYREAD))
351 return -EFAULT;
352 }
353 return 0;
354}
355
356/**
357 * __get_user_pages() - pin user pages in memory
358 * @tsk: task_struct of target task
359 * @mm: mm_struct of target mm
360 * @start: starting user address
361 * @nr_pages: number of pages from start to pin
362 * @gup_flags: flags modifying pin behaviour
363 * @pages: array that receives pointers to the pages pinned.
364 * Should be at least nr_pages long. Or NULL, if caller
365 * only intends to ensure the pages are faulted in.
366 * @vmas: array of pointers to vmas corresponding to each page.
367 * Or NULL if the caller does not require them.
368 * @nonblocking: whether waiting for disk IO or mmap_sem contention
369 *
370 * Returns number of pages pinned. This may be fewer than the number
371 * requested. If nr_pages is 0 or negative, returns 0. If no pages
372 * were pinned, returns -errno. Each page returned must be released
373 * with a put_page() call when it is finished with. vmas will only
374 * remain valid while mmap_sem is held.
375 *
376 * Must be called with mmap_sem held for read or write.
377 *
378 * __get_user_pages walks a process's page tables and takes a reference to
379 * each struct page that each user address corresponds to at a given
380 * instant. That is, it takes the page that would be accessed if a user
381 * thread accesses the given user virtual address at that instant.
382 *
383 * This does not guarantee that the page exists in the user mappings when
384 * __get_user_pages returns, and there may even be a completely different
385 * page there in some cases (eg. if mmapped pagecache has been invalidated
 386 * and subsequently re-faulted). However it does guarantee that the page
387 * won't be freed completely. And mostly callers simply care that the page
388 * contains data that was valid *at some point in time*. Typically, an IO
389 * or similar operation cannot guarantee anything stronger anyway because
390 * locks can't be held over the syscall boundary.
391 *
392 * If @gup_flags & FOLL_WRITE == 0, the page must not be written to. If
393 * the page is written to, set_page_dirty (or set_page_dirty_lock, as
394 * appropriate) must be called after the page is finished with, and
395 * before put_page is called.
396 *
397 * If @nonblocking != NULL, __get_user_pages will not wait for disk IO
398 * or mmap_sem contention, and if waiting is needed to pin all pages,
399 * *@nonblocking will be set to 0.
400 *
401 * In most cases, get_user_pages or get_user_pages_fast should be used
402 * instead of __get_user_pages. __get_user_pages should be used only if
403 * you need some special @gup_flags.
404 */
405long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
406 unsigned long start, unsigned long nr_pages,
407 unsigned int gup_flags, struct page **pages,
408 struct vm_area_struct **vmas, int *nonblocking)
409{
410 long i = 0;
411 unsigned int page_mask;
412 struct vm_area_struct *vma = NULL;
413
414 if (!nr_pages)
415 return 0;
416
417 VM_BUG_ON(!!pages != !!(gup_flags & FOLL_GET));
418
419 /*
420 * If FOLL_FORCE is set then do not force a full fault as the hinting
421 * fault information is unrelated to the reference behaviour of a task
422 * using the address space
423 */
424 if (!(gup_flags & FOLL_FORCE))
425 gup_flags |= FOLL_NUMA;
426
427 do {
428 struct page *page;
429 unsigned int foll_flags = gup_flags;
430 unsigned int page_increm;
431
432 /* first iteration or cross vma bound */
433 if (!vma || start >= vma->vm_end) {
434 vma = find_extend_vma(mm, start);
435 if (!vma && in_gate_area(mm, start)) {
436 int ret;
437 ret = get_gate_page(mm, start & PAGE_MASK,
438 gup_flags, &vma,
439 pages ? &pages[i] : NULL);
440 if (ret)
441 return i ? : ret;
442 page_mask = 0;
443 goto next_page;
444 }
445
446 if (!vma || check_vma_flags(vma, gup_flags))
447 return i ? : -EFAULT;
448 if (is_vm_hugetlb_page(vma)) {
449 i = follow_hugetlb_page(mm, vma, pages, vmas,
450 &start, &nr_pages, i,
451 gup_flags);
452 continue;
453 }
454 }
455retry:
456 /*
457 * If we have a pending SIGKILL, don't keep faulting pages and
458 * potentially allocating memory.
459 */
460 if (unlikely(fatal_signal_pending(current)))
461 return i ? i : -ERESTARTSYS;
462 cond_resched();
463 page = follow_page_mask(vma, start, foll_flags, &page_mask);
464 if (!page) {
465 int ret;
466 ret = faultin_page(tsk, vma, start, &foll_flags,
467 nonblocking);
468 switch (ret) {
469 case 0:
470 goto retry;
471 case -EFAULT:
472 case -ENOMEM:
473 case -EHWPOISON:
474 return i ? i : ret;
475 case -EBUSY:
476 return i;
477 case -ENOENT:
478 goto next_page;
479 }
480 BUG();
481 }
482 if (IS_ERR(page))
483 return i ? i : PTR_ERR(page);
484 if (pages) {
485 pages[i] = page;
486 flush_anon_page(vma, page, start);
487 flush_dcache_page(page);
488 page_mask = 0;
489 }
490next_page:
491 if (vmas) {
492 vmas[i] = vma;
493 page_mask = 0;
494 }
495 page_increm = 1 + (~(start >> PAGE_SHIFT) & page_mask);
496 if (page_increm > nr_pages)
497 page_increm = nr_pages;
498 i += page_increm;
499 start += page_increm * PAGE_SIZE;
500 nr_pages -= page_increm;
501 } while (nr_pages);
502 return i;
503}
504EXPORT_SYMBOL(__get_user_pages);
505
506/*
507 * fixup_user_fault() - manually resolve a user page fault
508 * @tsk: the task_struct to use for page fault accounting, or
509 * NULL if faults are not to be recorded.
510 * @mm: mm_struct of target mm
511 * @address: user address
 512 * @fault_flags: flags to pass down to handle_mm_fault()
513 *
 514 * This is meant to be called in the specific scenario where, for locking
 515 * reasons, we try to access user memory in atomic context (within a
 516 * pagefault_disable() section); that access returns -EFAULT, and we want
 517 * to resolve the user fault before trying again.
518 *
519 * Typically this is meant to be used by the futex code.
520 *
521 * The main difference with get_user_pages() is that this function will
522 * unconditionally call handle_mm_fault() which will in turn perform all the
523 * necessary SW fixup of the dirty and young bits in the PTE, while
 524 * get_user_pages() only guarantees to update these in the struct page.
525 *
526 * This is important for some architectures where those bits also gate the
527 * access permission to the page because they are maintained in software. On
528 * such architectures, gup() will not be enough to make a subsequent access
529 * succeed.
530 *
 531 * This should be called with the mmap_sem held for read.
532 */
533int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
534 unsigned long address, unsigned int fault_flags)
535{
536 struct vm_area_struct *vma;
537 vm_flags_t vm_flags;
538 int ret;
539
540 vma = find_extend_vma(mm, address);
541 if (!vma || address < vma->vm_start)
542 return -EFAULT;
543
544 vm_flags = (fault_flags & FAULT_FLAG_WRITE) ? VM_WRITE : VM_READ;
545 if (!(vm_flags & vma->vm_flags))
546 return -EFAULT;
547
548 ret = handle_mm_fault(mm, vma, address, fault_flags);
549 if (ret & VM_FAULT_ERROR) {
550 if (ret & VM_FAULT_OOM)
551 return -ENOMEM;
552 if (ret & (VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE))
553 return -EHWPOISON;
554 if (ret & VM_FAULT_SIGBUS)
555 return -EFAULT;
556 BUG();
557 }
558 if (tsk) {
559 if (ret & VM_FAULT_MAJOR)
560 tsk->maj_flt++;
561 else
562 tsk->min_flt++;
563 }
564 return 0;
565}
566
567/*
568 * get_user_pages() - pin user pages in memory
569 * @tsk: the task_struct to use for page fault accounting, or
570 * NULL if faults are not to be recorded.
571 * @mm: mm_struct of target mm
572 * @start: starting user address
573 * @nr_pages: number of pages from start to pin
574 * @write: whether pages will be written to by the caller
575 * @force: whether to force access even when user mapping is currently
576 * protected (but never forces write access to shared mapping).
577 * @pages: array that receives pointers to the pages pinned.
578 * Should be at least nr_pages long. Or NULL, if caller
579 * only intends to ensure the pages are faulted in.
580 * @vmas: array of pointers to vmas corresponding to each page.
581 * Or NULL if the caller does not require them.
582 *
583 * Returns number of pages pinned. This may be fewer than the number
584 * requested. If nr_pages is 0 or negative, returns 0. If no pages
585 * were pinned, returns -errno. Each page returned must be released
586 * with a put_page() call when it is finished with. vmas will only
587 * remain valid while mmap_sem is held.
588 *
589 * Must be called with mmap_sem held for read or write.
590 *
591 * get_user_pages walks a process's page tables and takes a reference to
592 * each struct page that each user address corresponds to at a given
593 * instant. That is, it takes the page that would be accessed if a user
594 * thread accesses the given user virtual address at that instant.
595 *
596 * This does not guarantee that the page exists in the user mappings when
597 * get_user_pages returns, and there may even be a completely different
598 * page there in some cases (eg. if mmapped pagecache has been invalidated
 599 * and subsequently re-faulted). However it does guarantee that the page
600 * won't be freed completely. And mostly callers simply care that the page
601 * contains data that was valid *at some point in time*. Typically, an IO
602 * or similar operation cannot guarantee anything stronger anyway because
603 * locks can't be held over the syscall boundary.
604 *
605 * If write=0, the page must not be written to. If the page is written to,
606 * set_page_dirty (or set_page_dirty_lock, as appropriate) must be called
607 * after the page is finished with, and before put_page is called.
608 *
609 * get_user_pages is typically used for fewer-copy IO operations, to get a
610 * handle on the memory by some means other than accesses via the user virtual
611 * addresses. The pages may be submitted for DMA to devices or accessed via
612 * their kernel linear mapping (via the kmap APIs). Care should be taken to
613 * use the correct cache flushing APIs.
614 *
615 * See also get_user_pages_fast, for performance critical applications.
616 */
617long get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
618 unsigned long start, unsigned long nr_pages, int write,
619 int force, struct page **pages, struct vm_area_struct **vmas)
620{
621 int flags = FOLL_TOUCH;
622
623 if (pages)
624 flags |= FOLL_GET;
625 if (write)
626 flags |= FOLL_WRITE;
627 if (force)
628 flags |= FOLL_FORCE;
629
630 return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas,
631 NULL);
632}
633EXPORT_SYMBOL(get_user_pages);
634
635/**
636 * get_dump_page() - pin user page in memory while writing it to core dump
637 * @addr: user address
638 *
639 * Returns struct page pointer of user page pinned for dump,
640 * to be freed afterwards by page_cache_release() or put_page().
641 *
642 * Returns NULL on any kind of failure - a hole must then be inserted into
643 * the corefile, to preserve alignment with its headers; and also returns
644 * NULL wherever the ZERO_PAGE, or an anonymous pte_none, has been found -
645 * allowing a hole to be left in the corefile to save diskspace.
646 *
647 * Called without mmap_sem, but after all other threads have been killed.
648 */
649#ifdef CONFIG_ELF_CORE
650struct page *get_dump_page(unsigned long addr)
651{
652 struct vm_area_struct *vma;
653 struct page *page;
654
655 if (__get_user_pages(current, current->mm, addr, 1,
656 FOLL_FORCE | FOLL_DUMP | FOLL_GET, &page, &vma,
657 NULL) < 1)
658 return NULL;
659 flush_cache_page(vma, addr, page_to_pfn(page));
660 return page;
661}
662#endif /* CONFIG_ELF_CORE */
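The kernel-doc above spells out the get_user_pages() contract: mmap_sem held for read or write, a reference taken on every returned page, and set_page_dirty_lock() before put_page() when the caller wrote to the memory. A hypothetical caller, sketched against the 3.16-era signature shown in this file (pin_user_buffer()/release_user_buffer() are illustrative names, not part of the patch):

	static long pin_user_buffer(unsigned long uaddr, unsigned long nr,
				    struct page **pages)
	{
		struct mm_struct *mm = current->mm;
		long got;

		down_read(&mm->mmap_sem);	/* mmap_sem held for read */
		got = get_user_pages(current, mm, uaddr, nr,
				     1 /* write */, 0 /* force */, pages, NULL);
		up_read(&mm->mmap_sem);
		return got;			/* may be < nr, or -errno */
	}

	static void release_user_buffer(struct page **pages, long got)
	{
		long i;

		for (i = 0; i < got; i++) {
			set_page_dirty_lock(pages[i]);	/* we wrote into the page */
			put_page(pages[i]);		/* drop the gup reference */
		}
	}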
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index b4b1feba6472..e60837dc785c 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -5,6 +5,8 @@
5 * the COPYING file in the top-level directory. 5 * the COPYING file in the top-level directory.
6 */ 6 */
7 7
8#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
9
8#include <linux/mm.h> 10#include <linux/mm.h>
9#include <linux/sched.h> 11#include <linux/sched.h>
10#include <linux/highmem.h> 12#include <linux/highmem.h>
@@ -151,8 +153,7 @@ static int start_khugepaged(void)
151 khugepaged_thread = kthread_run(khugepaged, NULL, 153 khugepaged_thread = kthread_run(khugepaged, NULL,
152 "khugepaged"); 154 "khugepaged");
153 if (unlikely(IS_ERR(khugepaged_thread))) { 155 if (unlikely(IS_ERR(khugepaged_thread))) {
154 printk(KERN_ERR 156 pr_err("khugepaged: kthread_run(khugepaged) failed\n");
155 "khugepaged: kthread_run(khugepaged) failed\n");
156 err = PTR_ERR(khugepaged_thread); 157 err = PTR_ERR(khugepaged_thread);
157 khugepaged_thread = NULL; 158 khugepaged_thread = NULL;
158 } 159 }
@@ -584,19 +585,19 @@ static int __init hugepage_init_sysfs(struct kobject **hugepage_kobj)
584 585
585 *hugepage_kobj = kobject_create_and_add("transparent_hugepage", mm_kobj); 586 *hugepage_kobj = kobject_create_and_add("transparent_hugepage", mm_kobj);
586 if (unlikely(!*hugepage_kobj)) { 587 if (unlikely(!*hugepage_kobj)) {
587 printk(KERN_ERR "hugepage: failed to create transparent hugepage kobject\n"); 588 pr_err("failed to create transparent hugepage kobject\n");
588 return -ENOMEM; 589 return -ENOMEM;
589 } 590 }
590 591
591 err = sysfs_create_group(*hugepage_kobj, &hugepage_attr_group); 592 err = sysfs_create_group(*hugepage_kobj, &hugepage_attr_group);
592 if (err) { 593 if (err) {
593 printk(KERN_ERR "hugepage: failed to register transparent hugepage group\n"); 594 pr_err("failed to register transparent hugepage group\n");
594 goto delete_obj; 595 goto delete_obj;
595 } 596 }
596 597
597 err = sysfs_create_group(*hugepage_kobj, &khugepaged_attr_group); 598 err = sysfs_create_group(*hugepage_kobj, &khugepaged_attr_group);
598 if (err) { 599 if (err) {
599 printk(KERN_ERR "hugepage: failed to register transparent hugepage group\n"); 600 pr_err("failed to register transparent hugepage group\n");
600 goto remove_hp_group; 601 goto remove_hp_group;
601 } 602 }
602 603
@@ -689,8 +690,7 @@ static int __init setup_transparent_hugepage(char *str)
689 } 690 }
690out: 691out:
691 if (!ret) 692 if (!ret)
692 printk(KERN_WARNING 693 pr_warn("transparent_hugepage= cannot parse, ignored\n");
693 "transparent_hugepage= cannot parse, ignored\n");
694 return ret; 694 return ret;
695} 695}
696__setup("transparent_hugepage=", setup_transparent_hugepage); 696__setup("transparent_hugepage=", setup_transparent_hugepage);
@@ -1830,10 +1830,11 @@ static void __split_huge_page(struct page *page,
1830 * the newly established pmd of the child later during the 1830 * the newly established pmd of the child later during the
1831 * walk, to be able to set it as pmd_trans_splitting too. 1831 * walk, to be able to set it as pmd_trans_splitting too.
1832 */ 1832 */
1833 if (mapcount != page_mapcount(page)) 1833 if (mapcount != page_mapcount(page)) {
1834 printk(KERN_ERR "mapcount %d page_mapcount %d\n", 1834 pr_err("mapcount %d page_mapcount %d\n",
1835 mapcount, page_mapcount(page)); 1835 mapcount, page_mapcount(page));
1836 BUG_ON(mapcount != page_mapcount(page)); 1836 BUG();
1837 }
1837 1838
1838 __split_huge_page_refcount(page, list); 1839 __split_huge_page_refcount(page, list);
1839 1840
@@ -1844,10 +1845,11 @@ static void __split_huge_page(struct page *page,
1844 BUG_ON(is_vma_temporary_stack(vma)); 1845 BUG_ON(is_vma_temporary_stack(vma));
1845 mapcount2 += __split_huge_page_map(page, vma, addr); 1846 mapcount2 += __split_huge_page_map(page, vma, addr);
1846 } 1847 }
1847 if (mapcount != mapcount2) 1848 if (mapcount != mapcount2) {
1848 printk(KERN_ERR "mapcount %d mapcount2 %d page_mapcount %d\n", 1849 pr_err("mapcount %d mapcount2 %d page_mapcount %d\n",
1849 mapcount, mapcount2, page_mapcount(page)); 1850 mapcount, mapcount2, page_mapcount(page));
1850 BUG_ON(mapcount != mapcount2); 1851 BUG();
1852 }
1851} 1853}
1852 1854
1853/* 1855/*
@@ -2740,7 +2742,7 @@ static int khugepaged(void *none)
2740 struct mm_slot *mm_slot; 2742 struct mm_slot *mm_slot;
2741 2743
2742 set_freezable(); 2744 set_freezable();
2743 set_user_nice(current, 19); 2745 set_user_nice(current, MAX_NICE);
2744 2746
2745 while (!kthread_should_stop()) { 2747 while (!kthread_should_stop()) {
2746 khugepaged_do_scan(); 2748 khugepaged_do_scan();
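The logging hunks above rely on the pr_fmt() convention: once the file defines pr_fmt() before the printk headers are pulled in, every pr_err()/pr_warn() call picks up the prefix automatically, which is why the hand-written "hugepage: " strings are dropped. A rough sketch of how the prefix composes (KBUILD_MODNAME is supplied by the build system, "huge_memory" for this file; the function name is illustrative):

	#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

	#include <linux/printk.h>
	#include <linux/errno.h>

	static int report_sysfs_failure(void)
	{
		/* logs roughly: "huge_memory: failed to register transparent hugepage group" */
		pr_err("failed to register transparent hugepage group\n");
		return -ENOMEM;
	}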
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index c82290b9c1fc..226910cb7c9b 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -544,7 +544,7 @@ static struct page *dequeue_huge_page_node(struct hstate *h, int nid)
544/* Movability of hugepages depends on migration support. */ 544/* Movability of hugepages depends on migration support. */
545static inline gfp_t htlb_alloc_mask(struct hstate *h) 545static inline gfp_t htlb_alloc_mask(struct hstate *h)
546{ 546{
547 if (hugepages_treat_as_movable || hugepage_migration_support(h)) 547 if (hugepages_treat_as_movable || hugepage_migration_supported(h))
548 return GFP_HIGHUSER_MOVABLE; 548 return GFP_HIGHUSER_MOVABLE;
549 else 549 else
550 return GFP_HIGHUSER; 550 return GFP_HIGHUSER;
@@ -607,25 +607,242 @@ err:
607 return NULL; 607 return NULL;
608} 608}
609 609
610/*
611 * common helper functions for hstate_next_node_to_{alloc|free}.
612 * We may have allocated or freed a huge page based on a different
613 * nodes_allowed previously, so h->next_node_to_{alloc|free} might
614 * be outside of *nodes_allowed. Ensure that we use an allowed
615 * node for alloc or free.
616 */
617static int next_node_allowed(int nid, nodemask_t *nodes_allowed)
618{
619 nid = next_node(nid, *nodes_allowed);
620 if (nid == MAX_NUMNODES)
621 nid = first_node(*nodes_allowed);
622 VM_BUG_ON(nid >= MAX_NUMNODES);
623
624 return nid;
625}
626
627static int get_valid_node_allowed(int nid, nodemask_t *nodes_allowed)
628{
629 if (!node_isset(nid, *nodes_allowed))
630 nid = next_node_allowed(nid, nodes_allowed);
631 return nid;
632}
633
634/*
635 * returns the previously saved node ["this node"] from which to
636 * allocate a persistent huge page for the pool and advance the
637 * next node from which to allocate, handling wrap at end of node
638 * mask.
639 */
640static int hstate_next_node_to_alloc(struct hstate *h,
641 nodemask_t *nodes_allowed)
642{
643 int nid;
644
645 VM_BUG_ON(!nodes_allowed);
646
647 nid = get_valid_node_allowed(h->next_nid_to_alloc, nodes_allowed);
648 h->next_nid_to_alloc = next_node_allowed(nid, nodes_allowed);
649
650 return nid;
651}
652
653/*
654 * helper for free_pool_huge_page() - return the previously saved
655 * node ["this node"] from which to free a huge page. Advance the
656 * next node id whether or not we find a free huge page to free so
657 * that the next attempt to free addresses the next node.
658 */
659static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed)
660{
661 int nid;
662
663 VM_BUG_ON(!nodes_allowed);
664
665 nid = get_valid_node_allowed(h->next_nid_to_free, nodes_allowed);
666 h->next_nid_to_free = next_node_allowed(nid, nodes_allowed);
667
668 return nid;
669}
670
671#define for_each_node_mask_to_alloc(hs, nr_nodes, node, mask) \
672 for (nr_nodes = nodes_weight(*mask); \
673 nr_nodes > 0 && \
674 ((node = hstate_next_node_to_alloc(hs, mask)) || 1); \
675 nr_nodes--)
676
677#define for_each_node_mask_to_free(hs, nr_nodes, node, mask) \
678 for (nr_nodes = nodes_weight(*mask); \
679 nr_nodes > 0 && \
680 ((node = hstate_next_node_to_free(hs, mask)) || 1); \
681 nr_nodes--)
682
683#if defined(CONFIG_CMA) && defined(CONFIG_X86_64)
684static void destroy_compound_gigantic_page(struct page *page,
685 unsigned long order)
686{
687 int i;
688 int nr_pages = 1 << order;
689 struct page *p = page + 1;
690
691 for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) {
692 __ClearPageTail(p);
693 set_page_refcounted(p);
694 p->first_page = NULL;
695 }
696
697 set_compound_order(page, 0);
698 __ClearPageHead(page);
699}
700
701static void free_gigantic_page(struct page *page, unsigned order)
702{
703 free_contig_range(page_to_pfn(page), 1 << order);
704}
705
706static int __alloc_gigantic_page(unsigned long start_pfn,
707 unsigned long nr_pages)
708{
709 unsigned long end_pfn = start_pfn + nr_pages;
710 return alloc_contig_range(start_pfn, end_pfn, MIGRATE_MOVABLE);
711}
712
713static bool pfn_range_valid_gigantic(unsigned long start_pfn,
714 unsigned long nr_pages)
715{
716 unsigned long i, end_pfn = start_pfn + nr_pages;
717 struct page *page;
718
719 for (i = start_pfn; i < end_pfn; i++) {
720 if (!pfn_valid(i))
721 return false;
722
723 page = pfn_to_page(i);
724
725 if (PageReserved(page))
726 return false;
727
728 if (page_count(page) > 0)
729 return false;
730
731 if (PageHuge(page))
732 return false;
733 }
734
735 return true;
736}
737
738static bool zone_spans_last_pfn(const struct zone *zone,
739 unsigned long start_pfn, unsigned long nr_pages)
740{
741 unsigned long last_pfn = start_pfn + nr_pages - 1;
742 return zone_spans_pfn(zone, last_pfn);
743}
744
745static struct page *alloc_gigantic_page(int nid, unsigned order)
746{
747 unsigned long nr_pages = 1 << order;
748 unsigned long ret, pfn, flags;
749 struct zone *z;
750
751 z = NODE_DATA(nid)->node_zones;
752 for (; z - NODE_DATA(nid)->node_zones < MAX_NR_ZONES; z++) {
753 spin_lock_irqsave(&z->lock, flags);
754
755 pfn = ALIGN(z->zone_start_pfn, nr_pages);
756 while (zone_spans_last_pfn(z, pfn, nr_pages)) {
757 if (pfn_range_valid_gigantic(pfn, nr_pages)) {
758 /*
759 * We release the zone lock here because
760 * alloc_contig_range() will also lock the zone
761 * at some point. If there's an allocation
762 * spinning on this lock, it may win the race
763 * and cause alloc_contig_range() to fail...
764 */
765 spin_unlock_irqrestore(&z->lock, flags);
766 ret = __alloc_gigantic_page(pfn, nr_pages);
767 if (!ret)
768 return pfn_to_page(pfn);
769 spin_lock_irqsave(&z->lock, flags);
770 }
771 pfn += nr_pages;
772 }
773
774 spin_unlock_irqrestore(&z->lock, flags);
775 }
776
777 return NULL;
778}
779
780static void prep_new_huge_page(struct hstate *h, struct page *page, int nid);
781static void prep_compound_gigantic_page(struct page *page, unsigned long order);
782
783static struct page *alloc_fresh_gigantic_page_node(struct hstate *h, int nid)
784{
785 struct page *page;
786
787 page = alloc_gigantic_page(nid, huge_page_order(h));
788 if (page) {
789 prep_compound_gigantic_page(page, huge_page_order(h));
790 prep_new_huge_page(h, page, nid);
791 }
792
793 return page;
794}
795
796static int alloc_fresh_gigantic_page(struct hstate *h,
797 nodemask_t *nodes_allowed)
798{
799 struct page *page = NULL;
800 int nr_nodes, node;
801
802 for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) {
803 page = alloc_fresh_gigantic_page_node(h, node);
804 if (page)
805 return 1;
806 }
807
808 return 0;
809}
810
811static inline bool gigantic_page_supported(void) { return true; }
812#else
813static inline bool gigantic_page_supported(void) { return false; }
814static inline void free_gigantic_page(struct page *page, unsigned order) { }
815static inline void destroy_compound_gigantic_page(struct page *page,
816 unsigned long order) { }
817static inline int alloc_fresh_gigantic_page(struct hstate *h,
818 nodemask_t *nodes_allowed) { return 0; }
819#endif
820
610static void update_and_free_page(struct hstate *h, struct page *page) 821static void update_and_free_page(struct hstate *h, struct page *page)
611{ 822{
612 int i; 823 int i;
613 824
614 VM_BUG_ON(h->order >= MAX_ORDER); 825 if (hstate_is_gigantic(h) && !gigantic_page_supported())
826 return;
615 827
616 h->nr_huge_pages--; 828 h->nr_huge_pages--;
617 h->nr_huge_pages_node[page_to_nid(page)]--; 829 h->nr_huge_pages_node[page_to_nid(page)]--;
618 for (i = 0; i < pages_per_huge_page(h); i++) { 830 for (i = 0; i < pages_per_huge_page(h); i++) {
619 page[i].flags &= ~(1 << PG_locked | 1 << PG_error | 831 page[i].flags &= ~(1 << PG_locked | 1 << PG_error |
620 1 << PG_referenced | 1 << PG_dirty | 832 1 << PG_referenced | 1 << PG_dirty |
621 1 << PG_active | 1 << PG_reserved | 833 1 << PG_active | 1 << PG_private |
622 1 << PG_private | 1 << PG_writeback); 834 1 << PG_writeback);
623 } 835 }
624 VM_BUG_ON_PAGE(hugetlb_cgroup_from_page(page), page); 836 VM_BUG_ON_PAGE(hugetlb_cgroup_from_page(page), page);
625 set_compound_page_dtor(page, NULL); 837 set_compound_page_dtor(page, NULL);
626 set_page_refcounted(page); 838 set_page_refcounted(page);
627 arch_release_hugepage(page); 839 if (hstate_is_gigantic(h)) {
628 __free_pages(page, huge_page_order(h)); 840 destroy_compound_gigantic_page(page, huge_page_order(h));
841 free_gigantic_page(page, huge_page_order(h));
842 } else {
843 arch_release_hugepage(page);
844 __free_pages(page, huge_page_order(h));
845 }
629} 846}
630 847
631struct hstate *size_to_hstate(unsigned long size) 848struct hstate *size_to_hstate(unsigned long size)
@@ -664,7 +881,7 @@ static void free_huge_page(struct page *page)
664 if (restore_reserve) 881 if (restore_reserve)
665 h->resv_huge_pages++; 882 h->resv_huge_pages++;
666 883
667 if (h->surplus_huge_pages_node[nid] && huge_page_order(h) < MAX_ORDER) { 884 if (h->surplus_huge_pages_node[nid]) {
668 /* remove the page from active list */ 885 /* remove the page from active list */
669 list_del(&page->lru); 886 list_del(&page->lru);
670 update_and_free_page(h, page); 887 update_and_free_page(h, page);
@@ -690,8 +907,7 @@ static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
690 put_page(page); /* free it into the hugepage allocator */ 907 put_page(page); /* free it into the hugepage allocator */
691} 908}
692 909
693static void __init prep_compound_gigantic_page(struct page *page, 910static void prep_compound_gigantic_page(struct page *page, unsigned long order)
694 unsigned long order)
695{ 911{
696 int i; 912 int i;
697 int nr_pages = 1 << order; 913 int nr_pages = 1 << order;
@@ -769,9 +985,6 @@ static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid)
769{ 985{
770 struct page *page; 986 struct page *page;
771 987
772 if (h->order >= MAX_ORDER)
773 return NULL;
774
775 page = alloc_pages_exact_node(nid, 988 page = alloc_pages_exact_node(nid,
776 htlb_alloc_mask(h)|__GFP_COMP|__GFP_THISNODE| 989 htlb_alloc_mask(h)|__GFP_COMP|__GFP_THISNODE|
777 __GFP_REPEAT|__GFP_NOWARN, 990 __GFP_REPEAT|__GFP_NOWARN,
@@ -787,79 +1000,6 @@ static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid)
787 return page; 1000 return page;
788} 1001}
789 1002
790/*
791 * common helper functions for hstate_next_node_to_{alloc|free}.
792 * We may have allocated or freed a huge page based on a different
793 * nodes_allowed previously, so h->next_node_to_{alloc|free} might
794 * be outside of *nodes_allowed. Ensure that we use an allowed
795 * node for alloc or free.
796 */
797static int next_node_allowed(int nid, nodemask_t *nodes_allowed)
798{
799 nid = next_node(nid, *nodes_allowed);
800 if (nid == MAX_NUMNODES)
801 nid = first_node(*nodes_allowed);
802 VM_BUG_ON(nid >= MAX_NUMNODES);
803
804 return nid;
805}
806
807static int get_valid_node_allowed(int nid, nodemask_t *nodes_allowed)
808{
809 if (!node_isset(nid, *nodes_allowed))
810 nid = next_node_allowed(nid, nodes_allowed);
811 return nid;
812}
813
814/*
815 * returns the previously saved node ["this node"] from which to
816 * allocate a persistent huge page for the pool and advance the
817 * next node from which to allocate, handling wrap at end of node
818 * mask.
819 */
820static int hstate_next_node_to_alloc(struct hstate *h,
821 nodemask_t *nodes_allowed)
822{
823 int nid;
824
825 VM_BUG_ON(!nodes_allowed);
826
827 nid = get_valid_node_allowed(h->next_nid_to_alloc, nodes_allowed);
828 h->next_nid_to_alloc = next_node_allowed(nid, nodes_allowed);
829
830 return nid;
831}
832
833/*
834 * helper for free_pool_huge_page() - return the previously saved
835 * node ["this node"] from which to free a huge page. Advance the
836 * next node id whether or not we find a free huge page to free so
837 * that the next attempt to free addresses the next node.
838 */
839static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed)
840{
841 int nid;
842
843 VM_BUG_ON(!nodes_allowed);
844
845 nid = get_valid_node_allowed(h->next_nid_to_free, nodes_allowed);
846 h->next_nid_to_free = next_node_allowed(nid, nodes_allowed);
847
848 return nid;
849}
850
851#define for_each_node_mask_to_alloc(hs, nr_nodes, node, mask) \
852 for (nr_nodes = nodes_weight(*mask); \
853 nr_nodes > 0 && \
854 ((node = hstate_next_node_to_alloc(hs, mask)) || 1); \
855 nr_nodes--)
856
857#define for_each_node_mask_to_free(hs, nr_nodes, node, mask) \
858 for (nr_nodes = nodes_weight(*mask); \
859 nr_nodes > 0 && \
860 ((node = hstate_next_node_to_free(hs, mask)) || 1); \
861 nr_nodes--)
862
863static int alloc_fresh_huge_page(struct hstate *h, nodemask_t *nodes_allowed) 1003static int alloc_fresh_huge_page(struct hstate *h, nodemask_t *nodes_allowed)
864{ 1004{
865 struct page *page; 1005 struct page *page;
@@ -963,7 +1103,7 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, int nid)
963 struct page *page; 1103 struct page *page;
964 unsigned int r_nid; 1104 unsigned int r_nid;
965 1105
966 if (h->order >= MAX_ORDER) 1106 if (hstate_is_gigantic(h))
967 return NULL; 1107 return NULL;
968 1108
969 /* 1109 /*
@@ -1156,7 +1296,7 @@ static void return_unused_surplus_pages(struct hstate *h,
1156 h->resv_huge_pages -= unused_resv_pages; 1296 h->resv_huge_pages -= unused_resv_pages;
1157 1297
1158 /* Cannot return gigantic pages currently */ 1298 /* Cannot return gigantic pages currently */
1159 if (h->order >= MAX_ORDER) 1299 if (hstate_is_gigantic(h))
1160 return; 1300 return;
1161 1301
1162 nr_pages = min(unused_resv_pages, h->surplus_huge_pages); 1302 nr_pages = min(unused_resv_pages, h->surplus_huge_pages);
@@ -1246,24 +1386,17 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
1246 return ERR_PTR(-ENOSPC); 1386 return ERR_PTR(-ENOSPC);
1247 1387
1248 ret = hugetlb_cgroup_charge_cgroup(idx, pages_per_huge_page(h), &h_cg); 1388 ret = hugetlb_cgroup_charge_cgroup(idx, pages_per_huge_page(h), &h_cg);
1249 if (ret) { 1389 if (ret)
1250 if (chg || avoid_reserve) 1390 goto out_subpool_put;
1251 hugepage_subpool_put_pages(spool, 1); 1391
1252 return ERR_PTR(-ENOSPC);
1253 }
1254 spin_lock(&hugetlb_lock); 1392 spin_lock(&hugetlb_lock);
1255 page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve, chg); 1393 page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve, chg);
1256 if (!page) { 1394 if (!page) {
1257 spin_unlock(&hugetlb_lock); 1395 spin_unlock(&hugetlb_lock);
1258 page = alloc_buddy_huge_page(h, NUMA_NO_NODE); 1396 page = alloc_buddy_huge_page(h, NUMA_NO_NODE);
1259 if (!page) { 1397 if (!page)
1260 hugetlb_cgroup_uncharge_cgroup(idx, 1398 goto out_uncharge_cgroup;
1261 pages_per_huge_page(h), 1399
1262 h_cg);
1263 if (chg || avoid_reserve)
1264 hugepage_subpool_put_pages(spool, 1);
1265 return ERR_PTR(-ENOSPC);
1266 }
1267 spin_lock(&hugetlb_lock); 1400 spin_lock(&hugetlb_lock);
1268 list_move(&page->lru, &h->hugepage_activelist); 1401 list_move(&page->lru, &h->hugepage_activelist);
1269 /* Fall through */ 1402 /* Fall through */
@@ -1275,6 +1408,13 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
1275 1408
1276 vma_commit_reservation(h, vma, addr); 1409 vma_commit_reservation(h, vma, addr);
1277 return page; 1410 return page;
1411
1412out_uncharge_cgroup:
1413 hugetlb_cgroup_uncharge_cgroup(idx, pages_per_huge_page(h), h_cg);
1414out_subpool_put:
1415 if (chg || avoid_reserve)
1416 hugepage_subpool_put_pages(spool, 1);
1417 return ERR_PTR(-ENOSPC);
1278} 1418}
1279 1419
1280/* 1420/*
@@ -1356,7 +1496,7 @@ static void __init gather_bootmem_prealloc(void)
 1356 * fix confusing memory reports from free(1) and other 1496 * fix confusing memory reports from free(1) and other
1357 * side-effects, like CommitLimit going negative. 1497 * side-effects, like CommitLimit going negative.
1358 */ 1498 */
1359 if (h->order > (MAX_ORDER - 1)) 1499 if (hstate_is_gigantic(h))
1360 adjust_managed_page_count(page, 1 << h->order); 1500 adjust_managed_page_count(page, 1 << h->order);
1361 } 1501 }
1362} 1502}
@@ -1366,7 +1506,7 @@ static void __init hugetlb_hstate_alloc_pages(struct hstate *h)
1366 unsigned long i; 1506 unsigned long i;
1367 1507
1368 for (i = 0; i < h->max_huge_pages; ++i) { 1508 for (i = 0; i < h->max_huge_pages; ++i) {
1369 if (h->order >= MAX_ORDER) { 1509 if (hstate_is_gigantic(h)) {
1370 if (!alloc_bootmem_huge_page(h)) 1510 if (!alloc_bootmem_huge_page(h))
1371 break; 1511 break;
1372 } else if (!alloc_fresh_huge_page(h, 1512 } else if (!alloc_fresh_huge_page(h,
@@ -1382,7 +1522,7 @@ static void __init hugetlb_init_hstates(void)
1382 1522
1383 for_each_hstate(h) { 1523 for_each_hstate(h) {
1384 /* oversize hugepages were init'ed in early boot */ 1524 /* oversize hugepages were init'ed in early boot */
1385 if (h->order < MAX_ORDER) 1525 if (!hstate_is_gigantic(h))
1386 hugetlb_hstate_alloc_pages(h); 1526 hugetlb_hstate_alloc_pages(h);
1387 } 1527 }
1388} 1528}
@@ -1416,7 +1556,7 @@ static void try_to_free_low(struct hstate *h, unsigned long count,
1416{ 1556{
1417 int i; 1557 int i;
1418 1558
1419 if (h->order >= MAX_ORDER) 1559 if (hstate_is_gigantic(h))
1420 return; 1560 return;
1421 1561
1422 for_each_node_mask(i, *nodes_allowed) { 1562 for_each_node_mask(i, *nodes_allowed) {
@@ -1479,7 +1619,7 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count,
1479{ 1619{
1480 unsigned long min_count, ret; 1620 unsigned long min_count, ret;
1481 1621
1482 if (h->order >= MAX_ORDER) 1622 if (hstate_is_gigantic(h) && !gigantic_page_supported())
1483 return h->max_huge_pages; 1623 return h->max_huge_pages;
1484 1624
1485 /* 1625 /*
@@ -1506,7 +1646,10 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count,
1506 * and reducing the surplus. 1646 * and reducing the surplus.
1507 */ 1647 */
1508 spin_unlock(&hugetlb_lock); 1648 spin_unlock(&hugetlb_lock);
1509 ret = alloc_fresh_huge_page(h, nodes_allowed); 1649 if (hstate_is_gigantic(h))
1650 ret = alloc_fresh_gigantic_page(h, nodes_allowed);
1651 else
1652 ret = alloc_fresh_huge_page(h, nodes_allowed);
1510 spin_lock(&hugetlb_lock); 1653 spin_lock(&hugetlb_lock);
1511 if (!ret) 1654 if (!ret)
1512 goto out; 1655 goto out;
@@ -1606,7 +1749,7 @@ static ssize_t nr_hugepages_store_common(bool obey_mempolicy,
1606 goto out; 1749 goto out;
1607 1750
1608 h = kobj_to_hstate(kobj, &nid); 1751 h = kobj_to_hstate(kobj, &nid);
1609 if (h->order >= MAX_ORDER) { 1752 if (hstate_is_gigantic(h) && !gigantic_page_supported()) {
1610 err = -EINVAL; 1753 err = -EINVAL;
1611 goto out; 1754 goto out;
1612 } 1755 }
@@ -1689,7 +1832,7 @@ static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj,
1689 unsigned long input; 1832 unsigned long input;
1690 struct hstate *h = kobj_to_hstate(kobj, NULL); 1833 struct hstate *h = kobj_to_hstate(kobj, NULL);
1691 1834
1692 if (h->order >= MAX_ORDER) 1835 if (hstate_is_gigantic(h))
1693 return -EINVAL; 1836 return -EINVAL;
1694 1837
1695 err = kstrtoul(buf, 10, &input); 1838 err = kstrtoul(buf, 10, &input);
@@ -2113,7 +2256,7 @@ static int hugetlb_sysctl_handler_common(bool obey_mempolicy,
2113 2256
2114 tmp = h->max_huge_pages; 2257 tmp = h->max_huge_pages;
2115 2258
2116 if (write && h->order >= MAX_ORDER) 2259 if (write && hstate_is_gigantic(h) && !gigantic_page_supported())
2117 return -EINVAL; 2260 return -EINVAL;
2118 2261
2119 table->data = &tmp; 2262 table->data = &tmp;
@@ -2169,7 +2312,7 @@ int hugetlb_overcommit_handler(struct ctl_table *table, int write,
2169 2312
2170 tmp = h->nr_overcommit_huge_pages; 2313 tmp = h->nr_overcommit_huge_pages;
2171 2314
2172 if (write && h->order >= MAX_ORDER) 2315 if (write && hstate_is_gigantic(h))
2173 return -EINVAL; 2316 return -EINVAL;
2174 2317
2175 table->data = &tmp; 2318 table->data = &tmp;
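Every hunk above that used to compare h->order against MAX_ORDER now goes through hstate_is_gigantic(), and the runtime allocation path gains alloc_fresh_gigantic_page() gated on gigantic_page_supported(). The helper itself is added to include/linux/hugetlb.h by the same series rather than in this file; its presumed shape, shown only for context:

	/* assumption: defined in include/linux/hugetlb.h, not in this diff */
	static inline bool hstate_is_gigantic(struct hstate *h)
	{
		return huge_page_order(h) >= MAX_ORDER;
	}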
diff --git a/mm/internal.h b/mm/internal.h
index 07b67361a40a..7f22a11fcc66 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -134,7 +134,7 @@ struct compact_control {
134 unsigned long nr_migratepages; /* Number of pages to migrate */ 134 unsigned long nr_migratepages; /* Number of pages to migrate */
135 unsigned long free_pfn; /* isolate_freepages search base */ 135 unsigned long free_pfn; /* isolate_freepages search base */
136 unsigned long migrate_pfn; /* isolate_migratepages search base */ 136 unsigned long migrate_pfn; /* isolate_migratepages search base */
137 bool sync; /* Synchronous migration */ 137 enum migrate_mode mode; /* Async or sync migration mode */
138 bool ignore_skip_hint; /* Scan blocks even if marked skip */ 138 bool ignore_skip_hint; /* Scan blocks even if marked skip */
139 bool finished_update_free; /* True when the zone cached pfns are 139 bool finished_update_free; /* True when the zone cached pfns are
140 * no longer being updated 140 * no longer being updated
@@ -144,7 +144,10 @@ struct compact_control {
144 int order; /* order a direct compactor needs */ 144 int order; /* order a direct compactor needs */
145 int migratetype; /* MOVABLE, RECLAIMABLE etc */ 145 int migratetype; /* MOVABLE, RECLAIMABLE etc */
146 struct zone *zone; 146 struct zone *zone;
147 bool contended; /* True if a lock was contended */ 147 bool contended; /* True if a lock was contended, or
148 * need_resched() true during async
149 * compaction
150 */
148}; 151};
149 152
150unsigned long 153unsigned long
@@ -169,6 +172,11 @@ static inline unsigned long page_order(struct page *page)
169 return page_private(page); 172 return page_private(page);
170} 173}
171 174
175static inline bool is_cow_mapping(vm_flags_t flags)
176{
177 return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
178}
179
172/* mm/util.c */ 180/* mm/util.c */
173void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma, 181void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma,
174 struct vm_area_struct *prev, struct rb_node *rb_parent); 182 struct vm_area_struct *prev, struct rb_node *rb_parent);
@@ -184,26 +192,6 @@ static inline void munlock_vma_pages_all(struct vm_area_struct *vma)
184} 192}
185 193
186/* 194/*
187 * Called only in fault path, to determine if a new page is being
188 * mapped into a LOCKED vma. If it is, mark page as mlocked.
189 */
190static inline int mlocked_vma_newpage(struct vm_area_struct *vma,
191 struct page *page)
192{
193 VM_BUG_ON_PAGE(PageLRU(page), page);
194
195 if (likely((vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) != VM_LOCKED))
196 return 0;
197
198 if (!TestSetPageMlocked(page)) {
199 mod_zone_page_state(page_zone(page), NR_MLOCK,
200 hpage_nr_pages(page));
201 count_vm_event(UNEVICTABLE_PGMLOCKED);
202 }
203 return 1;
204}
205
206/*
207 * must be called with vma's mmap_sem held for read or write, and page locked. 195 * must be called with vma's mmap_sem held for read or write, and page locked.
208 */ 196 */
209extern void mlock_vma_page(struct page *page); 197extern void mlock_vma_page(struct page *page);
@@ -245,10 +233,6 @@ extern unsigned long vma_address(struct page *page,
245 struct vm_area_struct *vma); 233 struct vm_area_struct *vma);
246#endif 234#endif
247#else /* !CONFIG_MMU */ 235#else /* !CONFIG_MMU */
248static inline int mlocked_vma_newpage(struct vm_area_struct *v, struct page *p)
249{
250 return 0;
251}
252static inline void clear_page_mlock(struct page *page) { } 236static inline void clear_page_mlock(struct page *page) { }
253static inline void mlock_vma_page(struct page *page) { } 237static inline void mlock_vma_page(struct page *page) { }
254static inline void mlock_migrate_page(struct page *new, struct page *old) { } 238static inline void mlock_migrate_page(struct page *new, struct page *old) { }
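is_cow_mapping() is added to mm/internal.h so that check_vma_flags() in the new mm/gup.c can share it: a mapping is copy-on-write when it is private (VM_SHARED clear) yet allowed to become writable (VM_MAYWRITE set); shared mappings never COW. A quick truth table for the expression, using the flag values from <linux/mm.h>:

	/* (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE
	 *
	 *   VM_MAYWRITE only          -> true   (private, may be written: COW)
	 *   VM_SHARED | VM_MAYWRITE   -> false  (shared mappings never COW)
	 *   neither flag set          -> false  (private but never writable)
	 */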
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index 8d2fcdfeff7f..736ade31d1dc 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -1300,7 +1300,7 @@ static void kmemleak_scan(void)
1300 /* 1300 /*
1301 * Struct page scanning for each node. 1301 * Struct page scanning for each node.
1302 */ 1302 */
1303 lock_memory_hotplug(); 1303 get_online_mems();
1304 for_each_online_node(i) { 1304 for_each_online_node(i) {
1305 unsigned long start_pfn = node_start_pfn(i); 1305 unsigned long start_pfn = node_start_pfn(i);
1306 unsigned long end_pfn = node_end_pfn(i); 1306 unsigned long end_pfn = node_end_pfn(i);
@@ -1318,7 +1318,7 @@ static void kmemleak_scan(void)
1318 scan_block(page, page + 1, NULL, 1); 1318 scan_block(page, page + 1, NULL, 1);
1319 } 1319 }
1320 } 1320 }
1321 unlock_memory_hotplug(); 1321 put_online_mems();
1322 1322
1323 /* 1323 /*
1324 * Scanning the task stacks (may introduce false negatives). 1324 * Scanning the task stacks (may introduce false negatives).
diff --git a/mm/madvise.c b/mm/madvise.c
index 539eeb96b323..a402f8fdc68e 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -195,7 +195,7 @@ static void force_shm_swapin_readahead(struct vm_area_struct *vma,
195 for (; start < end; start += PAGE_SIZE) { 195 for (; start < end; start += PAGE_SIZE) {
196 index = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; 196 index = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
197 197
198 page = find_get_page(mapping, index); 198 page = find_get_entry(mapping, index);
199 if (!radix_tree_exceptional_entry(page)) { 199 if (!radix_tree_exceptional_entry(page)) {
200 if (page) 200 if (page)
201 page_cache_release(page); 201 page_cache_release(page);
diff --git a/mm/memblock.c b/mm/memblock.c
index e9d6ca9a01a9..0aa0d2b07624 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -27,6 +27,9 @@
27 27
28static struct memblock_region memblock_memory_init_regions[INIT_MEMBLOCK_REGIONS] __initdata_memblock; 28static struct memblock_region memblock_memory_init_regions[INIT_MEMBLOCK_REGIONS] __initdata_memblock;
29static struct memblock_region memblock_reserved_init_regions[INIT_MEMBLOCK_REGIONS] __initdata_memblock; 29static struct memblock_region memblock_reserved_init_regions[INIT_MEMBLOCK_REGIONS] __initdata_memblock;
30#ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP
31static struct memblock_region memblock_physmem_init_regions[INIT_PHYSMEM_REGIONS] __initdata_memblock;
32#endif
30 33
31struct memblock memblock __initdata_memblock = { 34struct memblock memblock __initdata_memblock = {
32 .memory.regions = memblock_memory_init_regions, 35 .memory.regions = memblock_memory_init_regions,
@@ -37,6 +40,12 @@ struct memblock memblock __initdata_memblock = {
37 .reserved.cnt = 1, /* empty dummy entry */ 40 .reserved.cnt = 1, /* empty dummy entry */
38 .reserved.max = INIT_MEMBLOCK_REGIONS, 41 .reserved.max = INIT_MEMBLOCK_REGIONS,
39 42
43#ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP
44 .physmem.regions = memblock_physmem_init_regions,
45 .physmem.cnt = 1, /* empty dummy entry */
46 .physmem.max = INIT_PHYSMEM_REGIONS,
47#endif
48
40 .bottom_up = false, 49 .bottom_up = false,
41 .current_limit = MEMBLOCK_ALLOC_ANYWHERE, 50 .current_limit = MEMBLOCK_ALLOC_ANYWHERE,
42}; 51};
@@ -472,7 +481,7 @@ static void __init_memblock memblock_insert_region(struct memblock_type *type,
472} 481}
473 482
474/** 483/**
475 * memblock_add_region - add new memblock region 484 * memblock_add_range - add new memblock region
476 * @type: memblock type to add new region into 485 * @type: memblock type to add new region into
477 * @base: base address of the new region 486 * @base: base address of the new region
478 * @size: size of the new region 487 * @size: size of the new region
@@ -487,7 +496,7 @@ static void __init_memblock memblock_insert_region(struct memblock_type *type,
487 * RETURNS: 496 * RETURNS:
488 * 0 on success, -errno on failure. 497 * 0 on success, -errno on failure.
489 */ 498 */
490static int __init_memblock memblock_add_region(struct memblock_type *type, 499int __init_memblock memblock_add_range(struct memblock_type *type,
491 phys_addr_t base, phys_addr_t size, 500 phys_addr_t base, phys_addr_t size,
492 int nid, unsigned long flags) 501 int nid, unsigned long flags)
493{ 502{
@@ -569,12 +578,12 @@ repeat:
569int __init_memblock memblock_add_node(phys_addr_t base, phys_addr_t size, 578int __init_memblock memblock_add_node(phys_addr_t base, phys_addr_t size,
570 int nid) 579 int nid)
571{ 580{
572 return memblock_add_region(&memblock.memory, base, size, nid, 0); 581 return memblock_add_range(&memblock.memory, base, size, nid, 0);
573} 582}
574 583
575int __init_memblock memblock_add(phys_addr_t base, phys_addr_t size) 584int __init_memblock memblock_add(phys_addr_t base, phys_addr_t size)
576{ 585{
577 return memblock_add_region(&memblock.memory, base, size, 586 return memblock_add_range(&memblock.memory, base, size,
578 MAX_NUMNODES, 0); 587 MAX_NUMNODES, 0);
579} 588}
580 589
@@ -654,8 +663,8 @@ static int __init_memblock memblock_isolate_range(struct memblock_type *type,
654 return 0; 663 return 0;
655} 664}
656 665
657static int __init_memblock __memblock_remove(struct memblock_type *type, 666int __init_memblock memblock_remove_range(struct memblock_type *type,
658 phys_addr_t base, phys_addr_t size) 667 phys_addr_t base, phys_addr_t size)
659{ 668{
660 int start_rgn, end_rgn; 669 int start_rgn, end_rgn;
661 int i, ret; 670 int i, ret;
@@ -671,9 +680,10 @@ static int __init_memblock __memblock_remove(struct memblock_type *type,
671 680
672int __init_memblock memblock_remove(phys_addr_t base, phys_addr_t size) 681int __init_memblock memblock_remove(phys_addr_t base, phys_addr_t size)
673{ 682{
674 return __memblock_remove(&memblock.memory, base, size); 683 return memblock_remove_range(&memblock.memory, base, size);
675} 684}
676 685
686
677int __init_memblock memblock_free(phys_addr_t base, phys_addr_t size) 687int __init_memblock memblock_free(phys_addr_t base, phys_addr_t size)
678{ 688{
679 memblock_dbg(" memblock_free: [%#016llx-%#016llx] %pF\n", 689 memblock_dbg(" memblock_free: [%#016llx-%#016llx] %pF\n",
@@ -681,7 +691,7 @@ int __init_memblock memblock_free(phys_addr_t base, phys_addr_t size)
681 (unsigned long long)base + size - 1, 691 (unsigned long long)base + size - 1,
682 (void *)_RET_IP_); 692 (void *)_RET_IP_);
683 693
684 return __memblock_remove(&memblock.reserved, base, size); 694 return memblock_remove_range(&memblock.reserved, base, size);
685} 695}
686 696
687static int __init_memblock memblock_reserve_region(phys_addr_t base, 697static int __init_memblock memblock_reserve_region(phys_addr_t base,
@@ -696,7 +706,7 @@ static int __init_memblock memblock_reserve_region(phys_addr_t base,
696 (unsigned long long)base + size - 1, 706 (unsigned long long)base + size - 1,
697 flags, (void *)_RET_IP_); 707 flags, (void *)_RET_IP_);
698 708
699 return memblock_add_region(_rgn, base, size, nid, flags); 709 return memblock_add_range(_rgn, base, size, nid, flags);
700} 710}
701 711
702int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size) 712int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size)
@@ -758,17 +768,19 @@ int __init_memblock memblock_clear_hotplug(phys_addr_t base, phys_addr_t size)
758} 768}
759 769
760/** 770/**
761 * __next_free_mem_range - next function for for_each_free_mem_range() 771 * __next_mem_range - next function for for_each_free_mem_range() etc.
762 * @idx: pointer to u64 loop variable 772 * @idx: pointer to u64 loop variable
763 * @nid: node selector, %NUMA_NO_NODE for all nodes 773 * @nid: node selector, %NUMA_NO_NODE for all nodes
774 * @type_a: pointer to memblock_type from where the range is taken
775 * @type_b: pointer to memblock_type which excludes memory from being taken
764 * @out_start: ptr to phys_addr_t for start address of the range, can be %NULL 776 * @out_start: ptr to phys_addr_t for start address of the range, can be %NULL
765 * @out_end: ptr to phys_addr_t for end address of the range, can be %NULL 777 * @out_end: ptr to phys_addr_t for end address of the range, can be %NULL
766 * @out_nid: ptr to int for nid of the range, can be %NULL 778 * @out_nid: ptr to int for nid of the range, can be %NULL
767 * 779 *
768 * Find the first free area from *@idx which matches @nid, fill the out 780 * Find the first area from *@idx which matches @nid, fill the out
769 * parameters, and update *@idx for the next iteration. The lower 32bit of 781 * parameters, and update *@idx for the next iteration. The lower 32bit of
770 * *@idx contains index into memory region and the upper 32bit indexes the 782 * *@idx contains index into type_a and the upper 32bit indexes the
771 * areas before each reserved region. For example, if reserved regions 783 * areas before each region in type_b. For example, if type_b regions
772 * look like the following, 784 * look like the following,
773 * 785 *
774 * 0:[0-16), 1:[32-48), 2:[128-130) 786 * 0:[0-16), 1:[32-48), 2:[128-130)
@@ -780,53 +792,77 @@ int __init_memblock memblock_clear_hotplug(phys_addr_t base, phys_addr_t size)
780 * As both region arrays are sorted, the function advances the two indices 792 * As both region arrays are sorted, the function advances the two indices
781 * in lockstep and returns each intersection. 793 * in lockstep and returns each intersection.
782 */ 794 */
783void __init_memblock __next_free_mem_range(u64 *idx, int nid, 795void __init_memblock __next_mem_range(u64 *idx, int nid,
784 phys_addr_t *out_start, 796 struct memblock_type *type_a,
785 phys_addr_t *out_end, int *out_nid) 797 struct memblock_type *type_b,
798 phys_addr_t *out_start,
799 phys_addr_t *out_end, int *out_nid)
786{ 800{
787 struct memblock_type *mem = &memblock.memory; 801 int idx_a = *idx & 0xffffffff;
788 struct memblock_type *rsv = &memblock.reserved; 802 int idx_b = *idx >> 32;
789 int mi = *idx & 0xffffffff;
790 int ri = *idx >> 32;
791 803
792 if (WARN_ONCE(nid == MAX_NUMNODES, "Usage of MAX_NUMNODES is deprecated. Use NUMA_NO_NODE instead\n")) 804 if (WARN_ONCE(nid == MAX_NUMNODES,
805 "Usage of MAX_NUMNODES is deprecated. Use NUMA_NO_NODE instead\n"))
793 nid = NUMA_NO_NODE; 806 nid = NUMA_NO_NODE;
794 807
795 for ( ; mi < mem->cnt; mi++) { 808 for (; idx_a < type_a->cnt; idx_a++) {
796 struct memblock_region *m = &mem->regions[mi]; 809 struct memblock_region *m = &type_a->regions[idx_a];
810
797 phys_addr_t m_start = m->base; 811 phys_addr_t m_start = m->base;
798 phys_addr_t m_end = m->base + m->size; 812 phys_addr_t m_end = m->base + m->size;
813 int m_nid = memblock_get_region_node(m);
799 814
800 /* only memory regions are associated with nodes, check it */ 815 /* only memory regions are associated with nodes, check it */
801 if (nid != NUMA_NO_NODE && nid != memblock_get_region_node(m)) 816 if (nid != NUMA_NO_NODE && nid != m_nid)
802 continue; 817 continue;
803 818
804 /* scan areas before each reservation for intersection */ 819 if (!type_b) {
805 for ( ; ri < rsv->cnt + 1; ri++) { 820 if (out_start)
806 struct memblock_region *r = &rsv->regions[ri]; 821 *out_start = m_start;
807 phys_addr_t r_start = ri ? r[-1].base + r[-1].size : 0; 822 if (out_end)
808 phys_addr_t r_end = ri < rsv->cnt ? r->base : ULLONG_MAX; 823 *out_end = m_end;
824 if (out_nid)
825 *out_nid = m_nid;
826 idx_a++;
827 *idx = (u32)idx_a | (u64)idx_b << 32;
828 return;
829 }
830
831 /* scan areas before each reservation */
832 for (; idx_b < type_b->cnt + 1; idx_b++) {
833 struct memblock_region *r;
834 phys_addr_t r_start;
835 phys_addr_t r_end;
809 836
810 /* if ri advanced past mi, break out to advance mi */ 837 r = &type_b->regions[idx_b];
838 r_start = idx_b ? r[-1].base + r[-1].size : 0;
839 r_end = idx_b < type_b->cnt ?
840 r->base : ULLONG_MAX;
841
842 /*
843 * if idx_b advanced past idx_a,
844 * break out to advance idx_a
845 */
811 if (r_start >= m_end) 846 if (r_start >= m_end)
812 break; 847 break;
813 /* if the two regions intersect, we're done */ 848 /* if the two regions intersect, we're done */
814 if (m_start < r_end) { 849 if (m_start < r_end) {
815 if (out_start) 850 if (out_start)
816 *out_start = max(m_start, r_start); 851 *out_start =
852 max(m_start, r_start);
817 if (out_end) 853 if (out_end)
818 *out_end = min(m_end, r_end); 854 *out_end = min(m_end, r_end);
819 if (out_nid) 855 if (out_nid)
820 *out_nid = memblock_get_region_node(m); 856 *out_nid = m_nid;
821 /* 857 /*
822 * The region which ends first is advanced 858 * The region which ends first is
823 * for the next iteration. 859 * advanced for the next iteration.
824 */ 860 */
825 if (m_end <= r_end) 861 if (m_end <= r_end)
826 mi++; 862 idx_a++;
827 else 863 else
828 ri++; 864 idx_b++;
829 *idx = (u32)mi | (u64)ri << 32; 865 *idx = (u32)idx_a | (u64)idx_b << 32;
830 return; 866 return;
831 } 867 }
832 } 868 }
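The encoding described in the comment above is worth seeing in isolation: the 64-bit cookie carries the position in type_a in its low half and the gap index for type_b in its high half. A minimal sketch of that split (illustrative helpers only, not part of the patch; __next_mem_range() open-codes the same operations):

/* Illustrative only: how the iterator cookie used by __next_mem_range()
 * packs the two array positions.  idx_a indexes type_a (e.g. memory),
 * idx_b indexes the areas before each type_b region (e.g. reserved). */
static inline void mb_iter_unpack(u64 idx, int *idx_a, int *idx_b)
{
	*idx_a = idx & 0xffffffff;
	*idx_b = idx >> 32;
}

static inline u64 mb_iter_pack(int idx_a, int idx_b)
{
	return (u32)idx_a | (u64)idx_b << 32;
}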
@@ -837,57 +873,80 @@ void __init_memblock __next_free_mem_range(u64 *idx, int nid,
837} 873}
838 874
839/** 875/**
840 * __next_free_mem_range_rev - next function for for_each_free_mem_range_reverse() 876 * __next_mem_range_rev - generic next function for for_each_*_range_rev()
877 *
878 * Finds the next range from type_a which is not marked as unsuitable
879 * in type_b.
880 *
841 * @idx: pointer to u64 loop variable 881 * @idx: pointer to u64 loop variable
842 * @nid: nid: node selector, %NUMA_NO_NODE for all nodes 882 * @nid: nid: node selector, %NUMA_NO_NODE for all nodes
883 * @type_a: pointer to memblock_type from where the range is taken
884 * @type_b: pointer to memblock_type which excludes memory from being taken
843 * @out_start: ptr to phys_addr_t for start address of the range, can be %NULL 885 * @out_start: ptr to phys_addr_t for start address of the range, can be %NULL
844 * @out_end: ptr to phys_addr_t for end address of the range, can be %NULL 886 * @out_end: ptr to phys_addr_t for end address of the range, can be %NULL
845 * @out_nid: ptr to int for nid of the range, can be %NULL 887 * @out_nid: ptr to int for nid of the range, can be %NULL
846 * 888 *
847 * Reverse of __next_free_mem_range(). 889 * Reverse of __next_mem_range().
848 *
849 * Linux kernel cannot migrate pages used by itself. Memory hotplug users won't
850 * be able to hot-remove hotpluggable memory used by the kernel. So this
851 * function skip hotpluggable regions if needed when allocating memory for the
852 * kernel.
853 */ 890 */
854void __init_memblock __next_free_mem_range_rev(u64 *idx, int nid, 891void __init_memblock __next_mem_range_rev(u64 *idx, int nid,
855 phys_addr_t *out_start, 892 struct memblock_type *type_a,
856 phys_addr_t *out_end, int *out_nid) 893 struct memblock_type *type_b,
894 phys_addr_t *out_start,
895 phys_addr_t *out_end, int *out_nid)
857{ 896{
858 struct memblock_type *mem = &memblock.memory; 897 int idx_a = *idx & 0xffffffff;
859 struct memblock_type *rsv = &memblock.reserved; 898 int idx_b = *idx >> 32;
860 int mi = *idx & 0xffffffff;
861 int ri = *idx >> 32;
862 899
863 if (WARN_ONCE(nid == MAX_NUMNODES, "Usage of MAX_NUMNODES is deprecated. Use NUMA_NO_NODE instead\n")) 900 if (WARN_ONCE(nid == MAX_NUMNODES, "Usage of MAX_NUMNODES is deprecated. Use NUMA_NO_NODE instead\n"))
864 nid = NUMA_NO_NODE; 901 nid = NUMA_NO_NODE;
865 902
866 if (*idx == (u64)ULLONG_MAX) { 903 if (*idx == (u64)ULLONG_MAX) {
867 mi = mem->cnt - 1; 904 idx_a = type_a->cnt - 1;
868 ri = rsv->cnt; 905 idx_b = type_b->cnt;
869 } 906 }
870 907
871 for ( ; mi >= 0; mi--) { 908 for (; idx_a >= 0; idx_a--) {
872 struct memblock_region *m = &mem->regions[mi]; 909 struct memblock_region *m = &type_a->regions[idx_a];
910
873 phys_addr_t m_start = m->base; 911 phys_addr_t m_start = m->base;
874 phys_addr_t m_end = m->base + m->size; 912 phys_addr_t m_end = m->base + m->size;
913 int m_nid = memblock_get_region_node(m);
875 914
876 /* only memory regions are associated with nodes, check it */ 915 /* only memory regions are associated with nodes, check it */
877 if (nid != NUMA_NO_NODE && nid != memblock_get_region_node(m)) 916 if (nid != NUMA_NO_NODE && nid != m_nid)
878 continue; 917 continue;
879 918
880 /* skip hotpluggable memory regions if needed */ 919 /* skip hotpluggable memory regions if needed */
881 if (movable_node_is_enabled() && memblock_is_hotpluggable(m)) 920 if (movable_node_is_enabled() && memblock_is_hotpluggable(m))
882 continue; 921 continue;
883 922
884 /* scan areas before each reservation for intersection */ 923 if (!type_b) {
885 for ( ; ri >= 0; ri--) { 924 if (out_start)
886 struct memblock_region *r = &rsv->regions[ri]; 925 *out_start = m_start;
887 phys_addr_t r_start = ri ? r[-1].base + r[-1].size : 0; 926 if (out_end)
888 phys_addr_t r_end = ri < rsv->cnt ? r->base : ULLONG_MAX; 927 *out_end = m_end;
928 if (out_nid)
929 *out_nid = m_nid;
930 idx_a--;
931 *idx = (u32)idx_a | (u64)idx_b << 32;
932 return;
933 }
934
935 /* scan areas before each reservation */
936 for (; idx_b >= 0; idx_b--) {
937 struct memblock_region *r;
938 phys_addr_t r_start;
939 phys_addr_t r_end;
940
941 r = &type_b->regions[idx_b];
942 r_start = idx_b ? r[-1].base + r[-1].size : 0;
943 r_end = idx_b < type_b->cnt ?
944 r->base : ULLONG_MAX;
945 /*
946 * if idx_b advanced past idx_a,
947 * break out to advance idx_a
948 */
889 949
890 /* if ri advanced past mi, break out to advance mi */
891 if (r_end <= m_start) 950 if (r_end <= m_start)
892 break; 951 break;
893 /* if the two regions intersect, we're done */ 952 /* if the two regions intersect, we're done */
@@ -897,18 +956,17 @@ void __init_memblock __next_free_mem_range_rev(u64 *idx, int nid,
897 if (out_end) 956 if (out_end)
898 *out_end = min(m_end, r_end); 957 *out_end = min(m_end, r_end);
899 if (out_nid) 958 if (out_nid)
900 *out_nid = memblock_get_region_node(m); 959 *out_nid = m_nid;
901
902 if (m_start >= r_start) 960 if (m_start >= r_start)
903 mi--; 961 idx_a--;
904 else 962 else
905 ri--; 963 idx_b--;
906 *idx = (u32)mi | (u64)ri << 32; 964 *idx = (u32)idx_a | (u64)idx_b << 32;
907 return; 965 return;
908 } 966 }
909 } 967 }
910 } 968 }
911 969 /* signal end of iteration */
912 *idx = ULLONG_MAX; 970 *idx = ULLONG_MAX;
913} 971}
914 972
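Both walkers follow the same protocol: the cookie starts at 0 for the forward walk (ULLONG_MAX for the reverse one) and is set to ULLONG_MAX once the arrays are exhausted. A hedged sketch of a direct caller follows; in practice this is hidden behind the for_each_free_mem_range()-style macros, and the function below is illustrative only:

/* Illustrative only, not part of the patch: walk every free range,
 * i.e. memory not covered by a reserved region. */
static void __init dump_free_ranges(void)
{
	phys_addr_t start, end;
	u64 i = 0;
	int nid;

	__next_mem_range(&i, NUMA_NO_NODE, &memblock.memory,
			 &memblock.reserved, &start, &end, &nid);
	while (i != (u64)ULLONG_MAX) {
		pr_info("free range [%pa-%pa) on node %d\n",
			&start, &end, nid);
		__next_mem_range(&i, NUMA_NO_NODE, &memblock.memory,
				 &memblock.reserved, &start, &end, &nid);
	}
}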
@@ -975,22 +1033,35 @@ int __init_memblock memblock_set_node(phys_addr_t base, phys_addr_t size,
975} 1033}
976#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ 1034#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
977 1035
978static phys_addr_t __init memblock_alloc_base_nid(phys_addr_t size, 1036static phys_addr_t __init memblock_alloc_range_nid(phys_addr_t size,
979 phys_addr_t align, phys_addr_t max_addr, 1037 phys_addr_t align, phys_addr_t start,
980 int nid) 1038 phys_addr_t end, int nid)
981{ 1039{
982 phys_addr_t found; 1040 phys_addr_t found;
983 1041
984 if (!align) 1042 if (!align)
985 align = SMP_CACHE_BYTES; 1043 align = SMP_CACHE_BYTES;
986 1044
987 found = memblock_find_in_range_node(size, align, 0, max_addr, nid); 1045 found = memblock_find_in_range_node(size, align, start, end, nid);
988 if (found && !memblock_reserve(found, size)) 1046 if (found && !memblock_reserve(found, size))
989 return found; 1047 return found;
990 1048
991 return 0; 1049 return 0;
992} 1050}
993 1051
1052phys_addr_t __init memblock_alloc_range(phys_addr_t size, phys_addr_t align,
1053 phys_addr_t start, phys_addr_t end)
1054{
1055 return memblock_alloc_range_nid(size, align, start, end, NUMA_NO_NODE);
1056}
1057
1058static phys_addr_t __init memblock_alloc_base_nid(phys_addr_t size,
1059 phys_addr_t align, phys_addr_t max_addr,
1060 int nid)
1061{
1062 return memblock_alloc_range_nid(size, align, 0, max_addr, nid);
1063}
1064
994phys_addr_t __init memblock_alloc_nid(phys_addr_t size, phys_addr_t align, int nid) 1065phys_addr_t __init memblock_alloc_nid(phys_addr_t size, phys_addr_t align, int nid)
995{ 1066{
996 return memblock_alloc_base_nid(size, align, MEMBLOCK_ALLOC_ACCESSIBLE, nid); 1067 return memblock_alloc_base_nid(size, align, MEMBLOCK_ALLOC_ACCESSIBLE, nid);
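memblock_alloc_range() is the new entry point for callers that need physical memory inside an explicit [start, end) window rather than just below a limit; like the other allocators it reserves what it finds and returns 0 on failure. A hedged usage sketch (the window values are made up for illustration):

/* Illustrative only: carve 16 MiB, 1 MiB aligned, out of a fixed window. */
static void __init carve_example_region(void)
{
	phys_addr_t window_base = 0x80000000;	/* made-up window start */
	phys_addr_t window_end  = 0xc0000000;	/* made-up window end   */
	phys_addr_t addr;

	addr = memblock_alloc_range(16UL << 20, 1UL << 20,
				    window_base, window_end);
	if (!addr)
		pr_warn("no free 16 MiB range in [%pa-%pa)\n",
			&window_base, &window_end);
}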
@@ -1201,7 +1272,7 @@ void __init __memblock_free_early(phys_addr_t base, phys_addr_t size)
1201 __func__, (u64)base, (u64)base + size - 1, 1272 __func__, (u64)base, (u64)base + size - 1,
1202 (void *)_RET_IP_); 1273 (void *)_RET_IP_);
1203 kmemleak_free_part(__va(base), size); 1274 kmemleak_free_part(__va(base), size);
1204 __memblock_remove(&memblock.reserved, base, size); 1275 memblock_remove_range(&memblock.reserved, base, size);
1205} 1276}
1206 1277
1207/* 1278/*
@@ -1287,8 +1358,10 @@ void __init memblock_enforce_memory_limit(phys_addr_t limit)
1287 } 1358 }
1288 1359
1289 /* truncate both memory and reserved regions */ 1360 /* truncate both memory and reserved regions */
1290 __memblock_remove(&memblock.memory, max_addr, (phys_addr_t)ULLONG_MAX); 1361 memblock_remove_range(&memblock.memory, max_addr,
1291 __memblock_remove(&memblock.reserved, max_addr, (phys_addr_t)ULLONG_MAX); 1362 (phys_addr_t)ULLONG_MAX);
1363 memblock_remove_range(&memblock.reserved, max_addr,
1364 (phys_addr_t)ULLONG_MAX);
1292} 1365}
1293 1366
1294static int __init_memblock memblock_search(struct memblock_type *type, phys_addr_t addr) 1367static int __init_memblock memblock_search(struct memblock_type *type, phys_addr_t addr)
@@ -1329,9 +1402,8 @@ int __init_memblock memblock_search_pfn_nid(unsigned long pfn,
1329 if (mid == -1) 1402 if (mid == -1)
1330 return -1; 1403 return -1;
1331 1404
1332 *start_pfn = type->regions[mid].base >> PAGE_SHIFT; 1405 *start_pfn = PFN_DOWN(type->regions[mid].base);
1333 *end_pfn = (type->regions[mid].base + type->regions[mid].size) 1406 *end_pfn = PFN_DOWN(type->regions[mid].base + type->regions[mid].size);
1334 >> PAGE_SHIFT;
1335 1407
1336 return type->regions[mid].nid; 1408 return type->regions[mid].nid;
1337} 1409}
@@ -1502,6 +1574,9 @@ static int __init memblock_init_debugfs(void)
1502 return -ENXIO; 1574 return -ENXIO;
1503 debugfs_create_file("memory", S_IRUGO, root, &memblock.memory, &memblock_debug_fops); 1575 debugfs_create_file("memory", S_IRUGO, root, &memblock.memory, &memblock_debug_fops);
1504 debugfs_create_file("reserved", S_IRUGO, root, &memblock.reserved, &memblock_debug_fops); 1576 debugfs_create_file("reserved", S_IRUGO, root, &memblock.reserved, &memblock_debug_fops);
1577#ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP
1578 debugfs_create_file("physmem", S_IRUGO, root, &memblock.physmem, &memblock_debug_fops);
1579#endif
1505 1580
1506 return 0; 1581 return 0;
1507} 1582}
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index c47dffdcb246..a500cb0594c4 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -80,7 +80,7 @@ int do_swap_account __read_mostly;
80#ifdef CONFIG_MEMCG_SWAP_ENABLED 80#ifdef CONFIG_MEMCG_SWAP_ENABLED
81static int really_do_swap_account __initdata = 1; 81static int really_do_swap_account __initdata = 1;
82#else 82#else
83static int really_do_swap_account __initdata = 0; 83static int really_do_swap_account __initdata;
84#endif 84#endif
85 85
86#else 86#else
@@ -357,10 +357,9 @@ struct mem_cgroup {
357 struct cg_proto tcp_mem; 357 struct cg_proto tcp_mem;
358#endif 358#endif
359#if defined(CONFIG_MEMCG_KMEM) 359#if defined(CONFIG_MEMCG_KMEM)
360 /* analogous to slab_common's slab_caches list. per-memcg */ 360 /* analogous to slab_common's slab_caches list, but per-memcg;
361 * protected by memcg_slab_mutex */
361 struct list_head memcg_slab_caches; 362 struct list_head memcg_slab_caches;
362 /* Not a spinlock, we can take a lot of time walking the list */
363 struct mutex slab_caches_mutex;
364 /* Index in the kmem_cache->memcg_params->memcg_caches array */ 363 /* Index in the kmem_cache->memcg_params->memcg_caches array */
365 int kmemcg_id; 364 int kmemcg_id;
366#endif 365#endif
@@ -1077,9 +1076,18 @@ static struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)
1077 1076
1078 rcu_read_lock(); 1077 rcu_read_lock();
1079 do { 1078 do {
1080 memcg = mem_cgroup_from_task(rcu_dereference(mm->owner)); 1079 /*
1081 if (unlikely(!memcg)) 1080 * Page cache insertions can happen without an
1081 * actual mm context, e.g. during disk probing
1082 * on boot, loopback IO, acct() writes etc.
1083 */
1084 if (unlikely(!mm))
1082 memcg = root_mem_cgroup; 1085 memcg = root_mem_cgroup;
1086 else {
1087 memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
1088 if (unlikely(!memcg))
1089 memcg = root_mem_cgroup;
1090 }
1083 } while (!css_tryget(&memcg->css)); 1091 } while (!css_tryget(&memcg->css));
1084 rcu_read_unlock(); 1092 rcu_read_unlock();
1085 return memcg; 1093 return memcg;
@@ -1586,23 +1594,12 @@ static void mem_cgroup_end_move(struct mem_cgroup *memcg)
1586} 1594}
1587 1595
1588/* 1596/*
1589 * 2 routines for checking "mem" is under move_account() or not. 1597 * A routine for checking "mem" is under move_account() or not.
1590 *
1591 * mem_cgroup_stolen() - checking whether a cgroup is mc.from or not. This
1592 * is used for avoiding races in accounting. If true,
1593 * pc->mem_cgroup may be overwritten.
1594 * 1598 *
1595 * mem_cgroup_under_move() - checking a cgroup is mc.from or mc.to or 1599 * Checking a cgroup is mc.from or mc.to or under hierarchy of
1596 * under hierarchy of moving cgroups. This is for 1600 * moving cgroups. This is for waiting at high-memory pressure
1597 * waiting at hith-memory prressure caused by "move". 1601 * caused by "move".
1598 */ 1602 */
1599
1600static bool mem_cgroup_stolen(struct mem_cgroup *memcg)
1601{
1602 VM_BUG_ON(!rcu_read_lock_held());
1603 return atomic_read(&memcg->moving_account) > 0;
1604}
1605
1606static bool mem_cgroup_under_move(struct mem_cgroup *memcg) 1603static bool mem_cgroup_under_move(struct mem_cgroup *memcg)
1607{ 1604{
1608 struct mem_cgroup *from; 1605 struct mem_cgroup *from;
@@ -1645,7 +1642,6 @@ static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg)
1645 * Take this lock when 1642 * Take this lock when
1646 * - a code tries to modify page's memcg while it's USED. 1643 * - a code tries to modify page's memcg while it's USED.
1647 * - a code tries to modify page state accounting in a memcg. 1644 * - a code tries to modify page state accounting in a memcg.
1648 * see mem_cgroup_stolen(), too.
1649 */ 1645 */
1650static void move_lock_mem_cgroup(struct mem_cgroup *memcg, 1646static void move_lock_mem_cgroup(struct mem_cgroup *memcg,
1651 unsigned long *flags) 1647 unsigned long *flags)
@@ -2280,12 +2276,11 @@ cleanup:
2280} 2276}
2281 2277
2282/* 2278/*
2283 * Currently used to update mapped file statistics, but the routine can be 2279 * Used to update mapped file or writeback or other statistics.
2284 * generalized to update other statistics as well.
2285 * 2280 *
2286 * Notes: Race condition 2281 * Notes: Race condition
2287 * 2282 *
2288 * We usually use page_cgroup_lock() for accessing page_cgroup member but 2283 * We usually use lock_page_cgroup() for accessing page_cgroup member but
2289 * it tends to be costly. But considering some conditions, we doesn't need 2284 * it tends to be costly. But considering some conditions, we doesn't need
2290 * to do so _always_. 2285 * to do so _always_.
2291 * 2286 *
@@ -2299,8 +2294,8 @@ cleanup:
2299 * by flags. 2294 * by flags.
2300 * 2295 *
2301 * Considering "move", this is an only case we see a race. To make the race 2296 * Considering "move", this is an only case we see a race. To make the race
2302 * small, we check mm->moving_account and detect there are possibility of race 2297 * small, we check memcg->moving_account and detect there are possibility
2303 * If there is, we take a lock. 2298 * of race or not. If there is, we take a lock.
2304 */ 2299 */
2305 2300
2306void __mem_cgroup_begin_update_page_stat(struct page *page, 2301void __mem_cgroup_begin_update_page_stat(struct page *page,
@@ -2318,9 +2313,10 @@ again:
2318 * If this memory cgroup is not under account moving, we don't 2313 * If this memory cgroup is not under account moving, we don't
2319 * need to take move_lock_mem_cgroup(). Because we already hold 2314 * need to take move_lock_mem_cgroup(). Because we already hold
2320 * rcu_read_lock(), any calls to move_account will be delayed until 2315 * rcu_read_lock(), any calls to move_account will be delayed until
2321 * rcu_read_unlock() if mem_cgroup_stolen() == true. 2316 * rcu_read_unlock().
2322 */ 2317 */
2323 if (!mem_cgroup_stolen(memcg)) 2318 VM_BUG_ON(!rcu_read_lock_held());
2319 if (atomic_read(&memcg->moving_account) <= 0)
2324 return; 2320 return;
2325 2321
2326 move_lock_mem_cgroup(memcg, flags); 2322 move_lock_mem_cgroup(memcg, flags);
@@ -2428,7 +2424,7 @@ static void drain_stock(struct memcg_stock_pcp *stock)
2428 */ 2424 */
2429static void drain_local_stock(struct work_struct *dummy) 2425static void drain_local_stock(struct work_struct *dummy)
2430{ 2426{
2431 struct memcg_stock_pcp *stock = &__get_cpu_var(memcg_stock); 2427 struct memcg_stock_pcp *stock = this_cpu_ptr(&memcg_stock);
2432 drain_stock(stock); 2428 drain_stock(stock);
2433 clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags); 2429 clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);
2434} 2430}
@@ -2675,7 +2671,8 @@ static int mem_cgroup_try_charge(struct mem_cgroup *memcg,
2675 * free their memory. 2671 * free their memory.
2676 */ 2672 */
2677 if (unlikely(test_thread_flag(TIF_MEMDIE) || 2673 if (unlikely(test_thread_flag(TIF_MEMDIE) ||
2678 fatal_signal_pending(current))) 2674 fatal_signal_pending(current) ||
2675 current->flags & PF_EXITING))
2679 goto bypass; 2676 goto bypass;
2680 2677
2681 if (unlikely(task_in_memcg_oom(current))) 2678 if (unlikely(task_in_memcg_oom(current)))
@@ -2903,6 +2900,12 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
2903static DEFINE_MUTEX(set_limit_mutex); 2900static DEFINE_MUTEX(set_limit_mutex);
2904 2901
2905#ifdef CONFIG_MEMCG_KMEM 2902#ifdef CONFIG_MEMCG_KMEM
2903/*
2904 * The memcg_slab_mutex is held whenever a per memcg kmem cache is created or
2905 * destroyed. It protects memcg_caches arrays and memcg_slab_caches lists.
2906 */
2907static DEFINE_MUTEX(memcg_slab_mutex);
2908
2906static DEFINE_MUTEX(activate_kmem_mutex); 2909static DEFINE_MUTEX(activate_kmem_mutex);
2907 2910
2908static inline bool memcg_can_account_kmem(struct mem_cgroup *memcg) 2911static inline bool memcg_can_account_kmem(struct mem_cgroup *memcg)
@@ -2935,10 +2938,10 @@ static int mem_cgroup_slabinfo_read(struct seq_file *m, void *v)
2935 2938
2936 print_slabinfo_header(m); 2939 print_slabinfo_header(m);
2937 2940
2938 mutex_lock(&memcg->slab_caches_mutex); 2941 mutex_lock(&memcg_slab_mutex);
2939 list_for_each_entry(params, &memcg->memcg_slab_caches, list) 2942 list_for_each_entry(params, &memcg->memcg_slab_caches, list)
2940 cache_show(memcg_params_to_cache(params), m); 2943 cache_show(memcg_params_to_cache(params), m);
2941 mutex_unlock(&memcg->slab_caches_mutex); 2944 mutex_unlock(&memcg_slab_mutex);
2942 2945
2943 return 0; 2946 return 0;
2944} 2947}
@@ -3040,8 +3043,6 @@ void memcg_update_array_size(int num)
3040 memcg_limited_groups_array_size = memcg_caches_array_size(num); 3043 memcg_limited_groups_array_size = memcg_caches_array_size(num);
3041} 3044}
3042 3045
3043static void kmem_cache_destroy_work_func(struct work_struct *w);
3044
3045int memcg_update_cache_size(struct kmem_cache *s, int num_groups) 3046int memcg_update_cache_size(struct kmem_cache *s, int num_groups)
3046{ 3047{
3047 struct memcg_cache_params *cur_params = s->memcg_params; 3048 struct memcg_cache_params *cur_params = s->memcg_params;
@@ -3094,29 +3095,6 @@ int memcg_update_cache_size(struct kmem_cache *s, int num_groups)
3094 return 0; 3095 return 0;
3095} 3096}
3096 3097
3097char *memcg_create_cache_name(struct mem_cgroup *memcg,
3098 struct kmem_cache *root_cache)
3099{
3100 static char *buf = NULL;
3101
3102 /*
3103 * We need a mutex here to protect the shared buffer. Since this is
3104 * expected to be called only on cache creation, we can employ the
3105 * slab_mutex for that purpose.
3106 */
3107 lockdep_assert_held(&slab_mutex);
3108
3109 if (!buf) {
3110 buf = kmalloc(NAME_MAX + 1, GFP_KERNEL);
3111 if (!buf)
3112 return NULL;
3113 }
3114
3115 cgroup_name(memcg->css.cgroup, buf, NAME_MAX + 1);
3116 return kasprintf(GFP_KERNEL, "%s(%d:%s)", root_cache->name,
3117 memcg_cache_id(memcg), buf);
3118}
3119
3120int memcg_alloc_cache_params(struct mem_cgroup *memcg, struct kmem_cache *s, 3098int memcg_alloc_cache_params(struct mem_cgroup *memcg, struct kmem_cache *s,
3121 struct kmem_cache *root_cache) 3099 struct kmem_cache *root_cache)
3122{ 3100{
@@ -3138,8 +3116,6 @@ int memcg_alloc_cache_params(struct mem_cgroup *memcg, struct kmem_cache *s,
3138 if (memcg) { 3116 if (memcg) {
3139 s->memcg_params->memcg = memcg; 3117 s->memcg_params->memcg = memcg;
3140 s->memcg_params->root_cache = root_cache; 3118 s->memcg_params->root_cache = root_cache;
3141 INIT_WORK(&s->memcg_params->destroy,
3142 kmem_cache_destroy_work_func);
3143 css_get(&memcg->css); 3119 css_get(&memcg->css);
3144 } else 3120 } else
3145 s->memcg_params->is_root_cache = true; 3121 s->memcg_params->is_root_cache = true;
@@ -3156,24 +3132,37 @@ void memcg_free_cache_params(struct kmem_cache *s)
3156 kfree(s->memcg_params); 3132 kfree(s->memcg_params);
3157} 3133}
3158 3134
3159void memcg_register_cache(struct kmem_cache *s) 3135static void memcg_register_cache(struct mem_cgroup *memcg,
3136 struct kmem_cache *root_cache)
3160{ 3137{
3161 struct kmem_cache *root; 3138 static char memcg_name_buf[NAME_MAX + 1]; /* protected by
3162 struct mem_cgroup *memcg; 3139 memcg_slab_mutex */
3140 struct kmem_cache *cachep;
3163 int id; 3141 int id;
3164 3142
3165 if (is_root_cache(s)) 3143 lockdep_assert_held(&memcg_slab_mutex);
3144
3145 id = memcg_cache_id(memcg);
3146
3147 /*
3148 * Since per-memcg caches are created asynchronously on first
3149 * allocation (see memcg_kmem_get_cache()), several threads can try to
3150 * create the same cache, but only one of them may succeed.
3151 */
3152 if (cache_from_memcg_idx(root_cache, id))
3166 return; 3153 return;
3167 3154
3155 cgroup_name(memcg->css.cgroup, memcg_name_buf, NAME_MAX + 1);
3156 cachep = memcg_create_kmem_cache(memcg, root_cache, memcg_name_buf);
3168 /* 3157 /*
3169 * Holding the slab_mutex assures nobody will touch the memcg_caches 3158 * If we could not create a memcg cache, do not complain, because
3170 * array while we are modifying it. 3159 * that's not critical at all as we can always proceed with the root
3160 * cache.
3171 */ 3161 */
3172 lockdep_assert_held(&slab_mutex); 3162 if (!cachep)
3163 return;
3173 3164
3174 root = s->memcg_params->root_cache; 3165 list_add(&cachep->memcg_params->list, &memcg->memcg_slab_caches);
3175 memcg = s->memcg_params->memcg;
3176 id = memcg_cache_id(memcg);
3177 3166
3178 /* 3167 /*
3179 * Since readers won't lock (see cache_from_memcg_idx()), we need a 3168 * Since readers won't lock (see cache_from_memcg_idx()), we need a
@@ -3182,49 +3171,30 @@ void memcg_register_cache(struct kmem_cache *s)
3182 */ 3171 */
3183 smp_wmb(); 3172 smp_wmb();
3184 3173
3185 /* 3174 BUG_ON(root_cache->memcg_params->memcg_caches[id]);
3186 * Initialize the pointer to this cache in its parent's memcg_params 3175 root_cache->memcg_params->memcg_caches[id] = cachep;
3187 * before adding it to the memcg_slab_caches list, otherwise we can
3188 * fail to convert memcg_params_to_cache() while traversing the list.
3189 */
3190 VM_BUG_ON(root->memcg_params->memcg_caches[id]);
3191 root->memcg_params->memcg_caches[id] = s;
3192
3193 mutex_lock(&memcg->slab_caches_mutex);
3194 list_add(&s->memcg_params->list, &memcg->memcg_slab_caches);
3195 mutex_unlock(&memcg->slab_caches_mutex);
3196} 3176}
3197 3177
3198void memcg_unregister_cache(struct kmem_cache *s) 3178static void memcg_unregister_cache(struct kmem_cache *cachep)
3199{ 3179{
3200 struct kmem_cache *root; 3180 struct kmem_cache *root_cache;
3201 struct mem_cgroup *memcg; 3181 struct mem_cgroup *memcg;
3202 int id; 3182 int id;
3203 3183
3204 if (is_root_cache(s)) 3184 lockdep_assert_held(&memcg_slab_mutex);
3205 return;
3206 3185
3207 /* 3186 BUG_ON(is_root_cache(cachep));
3208 * Holding the slab_mutex assures nobody will touch the memcg_caches
3209 * array while we are modifying it.
3210 */
3211 lockdep_assert_held(&slab_mutex);
3212 3187
3213 root = s->memcg_params->root_cache; 3188 root_cache = cachep->memcg_params->root_cache;
3214 memcg = s->memcg_params->memcg; 3189 memcg = cachep->memcg_params->memcg;
3215 id = memcg_cache_id(memcg); 3190 id = memcg_cache_id(memcg);
3216 3191
3217 mutex_lock(&memcg->slab_caches_mutex); 3192 BUG_ON(root_cache->memcg_params->memcg_caches[id] != cachep);
3218 list_del(&s->memcg_params->list); 3193 root_cache->memcg_params->memcg_caches[id] = NULL;
3219 mutex_unlock(&memcg->slab_caches_mutex);
3220 3194
3221 /* 3195 list_del(&cachep->memcg_params->list);
3222 * Clear the pointer to this cache in its parent's memcg_params only 3196
3223 * after removing it from the memcg_slab_caches list, otherwise we can 3197 kmem_cache_destroy(cachep);
3224 * fail to convert memcg_params_to_cache() while traversing the list.
3225 */
3226 VM_BUG_ON(root->memcg_params->memcg_caches[id] != s);
3227 root->memcg_params->memcg_caches[id] = NULL;
3228} 3198}
3229 3199
3230/* 3200/*
@@ -3258,144 +3228,61 @@ static inline void memcg_resume_kmem_account(void)
3258 current->memcg_kmem_skip_account--; 3228 current->memcg_kmem_skip_account--;
3259} 3229}
3260 3230
3261static void kmem_cache_destroy_work_func(struct work_struct *w) 3231int __memcg_cleanup_cache_params(struct kmem_cache *s)
3262{
3263 struct kmem_cache *cachep;
3264 struct memcg_cache_params *p;
3265
3266 p = container_of(w, struct memcg_cache_params, destroy);
3267
3268 cachep = memcg_params_to_cache(p);
3269
3270 /*
3271 * If we get down to 0 after shrink, we could delete right away.
3272 * However, memcg_release_pages() already puts us back in the workqueue
3273 * in that case. If we proceed deleting, we'll get a dangling
3274 * reference, and removing the object from the workqueue in that case
3275 * is unnecessary complication. We are not a fast path.
3276 *
3277 * Note that this case is fundamentally different from racing with
3278 * shrink_slab(): if memcg_cgroup_destroy_cache() is called in
3279 * kmem_cache_shrink, not only we would be reinserting a dead cache
3280 * into the queue, but doing so from inside the worker racing to
3281 * destroy it.
3282 *
3283 * So if we aren't down to zero, we'll just schedule a worker and try
3284 * again
3285 */
3286 if (atomic_read(&cachep->memcg_params->nr_pages) != 0)
3287 kmem_cache_shrink(cachep);
3288 else
3289 kmem_cache_destroy(cachep);
3290}
3291
3292void mem_cgroup_destroy_cache(struct kmem_cache *cachep)
3293{
3294 if (!cachep->memcg_params->dead)
3295 return;
3296
3297 /*
3298 * There are many ways in which we can get here.
3299 *
3300 * We can get to a memory-pressure situation while the delayed work is
3301 * still pending to run. The vmscan shrinkers can then release all
3302 * cache memory and get us to destruction. If this is the case, we'll
3303 * be executed twice, which is a bug (the second time will execute over
3304 * bogus data). In this case, cancelling the work should be fine.
3305 *
3306 * But we can also get here from the worker itself, if
3307 * kmem_cache_shrink is enough to shake all the remaining objects and
3308 * get the page count to 0. In this case, we'll deadlock if we try to
3309 * cancel the work (the worker runs with an internal lock held, which
3310 * is the same lock we would hold for cancel_work_sync().)
3311 *
3312 * Since we can't possibly know who got us here, just refrain from
3313 * running if there is already work pending
3314 */
3315 if (work_pending(&cachep->memcg_params->destroy))
3316 return;
3317 /*
3318 * We have to defer the actual destroying to a workqueue, because
3319 * we might currently be in a context that cannot sleep.
3320 */
3321 schedule_work(&cachep->memcg_params->destroy);
3322}
3323
3324int __kmem_cache_destroy_memcg_children(struct kmem_cache *s)
3325{ 3232{
3326 struct kmem_cache *c; 3233 struct kmem_cache *c;
3327 int i, failed = 0; 3234 int i, failed = 0;
3328 3235
3329 /* 3236 mutex_lock(&memcg_slab_mutex);
3330 * If the cache is being destroyed, we trust that there is no one else
3331 * requesting objects from it. Even if there are, the sanity checks in
3332 * kmem_cache_destroy should caught this ill-case.
3333 *
3334 * Still, we don't want anyone else freeing memcg_caches under our
3335 * noses, which can happen if a new memcg comes to life. As usual,
3336 * we'll take the activate_kmem_mutex to protect ourselves against
3337 * this.
3338 */
3339 mutex_lock(&activate_kmem_mutex);
3340 for_each_memcg_cache_index(i) { 3237 for_each_memcg_cache_index(i) {
3341 c = cache_from_memcg_idx(s, i); 3238 c = cache_from_memcg_idx(s, i);
3342 if (!c) 3239 if (!c)
3343 continue; 3240 continue;
3344 3241
3345 /* 3242 memcg_unregister_cache(c);
3346 * We will now manually delete the caches, so to avoid races
3347 * we need to cancel all pending destruction workers and
3348 * proceed with destruction ourselves.
3349 *
3350 * kmem_cache_destroy() will call kmem_cache_shrink internally,
3351 * and that could spawn the workers again: it is likely that
3352 * the cache still have active pages until this very moment.
3353 * This would lead us back to mem_cgroup_destroy_cache.
3354 *
3355 * But that will not execute at all if the "dead" flag is not
3356 * set, so flip it down to guarantee we are in control.
3357 */
3358 c->memcg_params->dead = false;
3359 cancel_work_sync(&c->memcg_params->destroy);
3360 kmem_cache_destroy(c);
3361 3243
3362 if (cache_from_memcg_idx(s, i)) 3244 if (cache_from_memcg_idx(s, i))
3363 failed++; 3245 failed++;
3364 } 3246 }
3365 mutex_unlock(&activate_kmem_mutex); 3247 mutex_unlock(&memcg_slab_mutex);
3366 return failed; 3248 return failed;
3367} 3249}
3368 3250
3369static void mem_cgroup_destroy_all_caches(struct mem_cgroup *memcg) 3251static void memcg_unregister_all_caches(struct mem_cgroup *memcg)
3370{ 3252{
3371 struct kmem_cache *cachep; 3253 struct kmem_cache *cachep;
3372 struct memcg_cache_params *params; 3254 struct memcg_cache_params *params, *tmp;
3373 3255
3374 if (!memcg_kmem_is_active(memcg)) 3256 if (!memcg_kmem_is_active(memcg))
3375 return; 3257 return;
3376 3258
3377 mutex_lock(&memcg->slab_caches_mutex); 3259 mutex_lock(&memcg_slab_mutex);
3378 list_for_each_entry(params, &memcg->memcg_slab_caches, list) { 3260 list_for_each_entry_safe(params, tmp, &memcg->memcg_slab_caches, list) {
3379 cachep = memcg_params_to_cache(params); 3261 cachep = memcg_params_to_cache(params);
3380 cachep->memcg_params->dead = true; 3262 kmem_cache_shrink(cachep);
3381 schedule_work(&cachep->memcg_params->destroy); 3263 if (atomic_read(&cachep->memcg_params->nr_pages) == 0)
3264 memcg_unregister_cache(cachep);
3382 } 3265 }
3383 mutex_unlock(&memcg->slab_caches_mutex); 3266 mutex_unlock(&memcg_slab_mutex);
3384} 3267}
3385 3268
3386struct create_work { 3269struct memcg_register_cache_work {
3387 struct mem_cgroup *memcg; 3270 struct mem_cgroup *memcg;
3388 struct kmem_cache *cachep; 3271 struct kmem_cache *cachep;
3389 struct work_struct work; 3272 struct work_struct work;
3390}; 3273};
3391 3274
3392static void memcg_create_cache_work_func(struct work_struct *w) 3275static void memcg_register_cache_func(struct work_struct *w)
3393{ 3276{
3394 struct create_work *cw = container_of(w, struct create_work, work); 3277 struct memcg_register_cache_work *cw =
3278 container_of(w, struct memcg_register_cache_work, work);
3395 struct mem_cgroup *memcg = cw->memcg; 3279 struct mem_cgroup *memcg = cw->memcg;
3396 struct kmem_cache *cachep = cw->cachep; 3280 struct kmem_cache *cachep = cw->cachep;
3397 3281
3398 kmem_cache_create_memcg(memcg, cachep); 3282 mutex_lock(&memcg_slab_mutex);
3283 memcg_register_cache(memcg, cachep);
3284 mutex_unlock(&memcg_slab_mutex);
3285
3399 css_put(&memcg->css); 3286 css_put(&memcg->css);
3400 kfree(cw); 3287 kfree(cw);
3401} 3288}
@@ -3403,12 +3290,12 @@ static void memcg_create_cache_work_func(struct work_struct *w)
3403/* 3290/*
3404 * Enqueue the creation of a per-memcg kmem_cache. 3291 * Enqueue the creation of a per-memcg kmem_cache.
3405 */ 3292 */
3406static void __memcg_create_cache_enqueue(struct mem_cgroup *memcg, 3293static void __memcg_schedule_register_cache(struct mem_cgroup *memcg,
3407 struct kmem_cache *cachep) 3294 struct kmem_cache *cachep)
3408{ 3295{
3409 struct create_work *cw; 3296 struct memcg_register_cache_work *cw;
3410 3297
3411 cw = kmalloc(sizeof(struct create_work), GFP_NOWAIT); 3298 cw = kmalloc(sizeof(*cw), GFP_NOWAIT);
3412 if (cw == NULL) { 3299 if (cw == NULL) {
3413 css_put(&memcg->css); 3300 css_put(&memcg->css);
3414 return; 3301 return;
@@ -3417,17 +3304,17 @@ static void __memcg_create_cache_enqueue(struct mem_cgroup *memcg,
3417 cw->memcg = memcg; 3304 cw->memcg = memcg;
3418 cw->cachep = cachep; 3305 cw->cachep = cachep;
3419 3306
3420 INIT_WORK(&cw->work, memcg_create_cache_work_func); 3307 INIT_WORK(&cw->work, memcg_register_cache_func);
3421 schedule_work(&cw->work); 3308 schedule_work(&cw->work);
3422} 3309}
3423 3310
3424static void memcg_create_cache_enqueue(struct mem_cgroup *memcg, 3311static void memcg_schedule_register_cache(struct mem_cgroup *memcg,
3425 struct kmem_cache *cachep) 3312 struct kmem_cache *cachep)
3426{ 3313{
3427 /* 3314 /*
3428 * We need to stop accounting when we kmalloc, because if the 3315 * We need to stop accounting when we kmalloc, because if the
3429 * corresponding kmalloc cache is not yet created, the first allocation 3316 * corresponding kmalloc cache is not yet created, the first allocation
3430 * in __memcg_create_cache_enqueue will recurse. 3317 * in __memcg_schedule_register_cache will recurse.
3431 * 3318 *
3432 * However, it is better to enclose the whole function. Depending on 3319 * However, it is better to enclose the whole function. Depending on
3433 * the debugging options enabled, INIT_WORK(), for instance, can 3320 * the debugging options enabled, INIT_WORK(), for instance, can
@@ -3436,9 +3323,27 @@ static void memcg_create_cache_enqueue(struct mem_cgroup *memcg,
3436 * the safest choice is to do it like this, wrapping the whole function. 3323 * the safest choice is to do it like this, wrapping the whole function.
3437 */ 3324 */
3438 memcg_stop_kmem_account(); 3325 memcg_stop_kmem_account();
3439 __memcg_create_cache_enqueue(memcg, cachep); 3326 __memcg_schedule_register_cache(memcg, cachep);
3440 memcg_resume_kmem_account(); 3327 memcg_resume_kmem_account();
3441} 3328}
3329
3330int __memcg_charge_slab(struct kmem_cache *cachep, gfp_t gfp, int order)
3331{
3332 int res;
3333
3334 res = memcg_charge_kmem(cachep->memcg_params->memcg, gfp,
3335 PAGE_SIZE << order);
3336 if (!res)
3337 atomic_add(1 << order, &cachep->memcg_params->nr_pages);
3338 return res;
3339}
3340
3341void __memcg_uncharge_slab(struct kmem_cache *cachep, int order)
3342{
3343 memcg_uncharge_kmem(cachep->memcg_params->memcg, PAGE_SIZE << order);
3344 atomic_sub(1 << order, &cachep->memcg_params->nr_pages);
3345}
3346
3442/* 3347/*
3443 * Return the kmem_cache we're supposed to use for a slab allocation. 3348 * Return the kmem_cache we're supposed to use for a slab allocation.
3444 * We try to use the current memcg's version of the cache. 3349 * We try to use the current memcg's version of the cache.
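__memcg_charge_slab()/__memcg_uncharge_slab() give the slab allocators one pair of hooks for accounting the pages behind a per-memcg cache; the nr_pages counter they maintain is what memcg_unregister_all_caches() above checks before destroying a cache. A hedged sketch of how an allocator might pair them (the wrapper names are made up; only the two hooks and is_root_cache() come from the tree):

/* Illustrative pairing only.  Root caches are never accounted, so they
 * bypass the hooks entirely. */
static int charge_slab_page(struct kmem_cache *s, gfp_t gfp, int order)
{
	if (is_root_cache(s))
		return 0;
	return __memcg_charge_slab(s, gfp, order);
}

static void uncharge_slab_page(struct kmem_cache *s, int order)
{
	if (!is_root_cache(s))
		__memcg_uncharge_slab(s, order);
}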
@@ -3489,22 +3394,16 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep,
3489 * 3394 *
3490 * However, there are some clashes that can arrive from locking. 3395 * However, there are some clashes that can arrive from locking.
3491 * For instance, because we acquire the slab_mutex while doing 3396 * For instance, because we acquire the slab_mutex while doing
3492 * kmem_cache_dup, this means no further allocation could happen 3397 * memcg_create_kmem_cache, this means no further allocation
3493 * with the slab_mutex held. 3398 * could happen with the slab_mutex held. So it's better to
3494 * 3399 * defer everything.
3495 * Also, because cache creation issue get_online_cpus(), this
3496 * creates a lock chain: memcg_slab_mutex -> cpu_hotplug_mutex,
3497 * that ends up reversed during cpu hotplug. (cpuset allocates
3498 * a bunch of GFP_KERNEL memory during cpuup). Due to all that,
3499 * better to defer everything.
3500 */ 3400 */
3501 memcg_create_cache_enqueue(memcg, cachep); 3401 memcg_schedule_register_cache(memcg, cachep);
3502 return cachep; 3402 return cachep;
3503out: 3403out:
3504 rcu_read_unlock(); 3404 rcu_read_unlock();
3505 return cachep; 3405 return cachep;
3506} 3406}
3507EXPORT_SYMBOL(__memcg_kmem_get_cache);
3508 3407
3509/* 3408/*
3510 * We need to verify if the allocation against current->mm->owner's memcg is 3409 * We need to verify if the allocation against current->mm->owner's memcg is
@@ -3531,11 +3430,12 @@ __memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **_memcg, int order)
3531 /* 3430 /*
3532 * Disabling accounting is only relevant for some specific memcg 3431 * Disabling accounting is only relevant for some specific memcg
3533 * internal allocations. Therefore we would initially not have such 3432 * internal allocations. Therefore we would initially not have such
3534 * check here, since direct calls to the page allocator that are marked 3433 * check here, since direct calls to the page allocator that are
3535 * with GFP_KMEMCG only happen outside memcg core. We are mostly 3434 * accounted to kmemcg (alloc_kmem_pages and friends) only happen
3536 * concerned with cache allocations, and by having this test at 3435 * outside memcg core. We are mostly concerned with cache allocations,
3537 * memcg_kmem_get_cache, we are already able to relay the allocation to 3436 * and by having this test at memcg_kmem_get_cache, we are already able
3538 * the root cache and bypass the memcg cache altogether. 3437 * to relay the allocation to the root cache and bypass the memcg cache
3438 * altogether.
3539 * 3439 *
3540 * There is one exception, though: the SLUB allocator does not create 3440 * There is one exception, though: the SLUB allocator does not create
3541 * large order caches, but rather service large kmallocs directly from 3441 * large order caches, but rather service large kmallocs directly from
@@ -3622,7 +3522,7 @@ void __memcg_kmem_uncharge_pages(struct page *page, int order)
3622 memcg_uncharge_kmem(memcg, PAGE_SIZE << order); 3522 memcg_uncharge_kmem(memcg, PAGE_SIZE << order);
3623} 3523}
3624#else 3524#else
3625static inline void mem_cgroup_destroy_all_caches(struct mem_cgroup *memcg) 3525static inline void memcg_unregister_all_caches(struct mem_cgroup *memcg)
3626{ 3526{
3627} 3527}
3628#endif /* CONFIG_MEMCG_KMEM */ 3528#endif /* CONFIG_MEMCG_KMEM */
@@ -3958,17 +3858,9 @@ int mem_cgroup_charge_file(struct page *page, struct mm_struct *mm,
3958 return 0; 3858 return 0;
3959 } 3859 }
3960 3860
3961 /* 3861 memcg = mem_cgroup_try_charge_mm(mm, gfp_mask, 1, true);
3962 * Page cache insertions can happen without an actual mm 3862 if (!memcg)
3963 * context, e.g. during disk probing on boot. 3863 return -ENOMEM;
3964 */
3965 if (unlikely(!mm))
3966 memcg = root_mem_cgroup;
3967 else {
3968 memcg = mem_cgroup_try_charge_mm(mm, gfp_mask, 1, true);
3969 if (!memcg)
3970 return -ENOMEM;
3971 }
3972 __mem_cgroup_commit_charge(memcg, page, 1, type, false); 3864 __mem_cgroup_commit_charge(memcg, page, 1, type, false);
3973 return 0; 3865 return 0;
3974} 3866}
@@ -4783,9 +4675,9 @@ static void mem_cgroup_force_empty_list(struct mem_cgroup *memcg,
4783 if (mem_cgroup_move_parent(page, pc, memcg)) { 4675 if (mem_cgroup_move_parent(page, pc, memcg)) {
4784 /* found lock contention or "pc" is obsolete. */ 4676 /* found lock contention or "pc" is obsolete. */
4785 busy = page; 4677 busy = page;
4786 cond_resched();
4787 } else 4678 } else
4788 busy = NULL; 4679 busy = NULL;
4680 cond_resched();
4789 } while (!list_empty(list)); 4681 } while (!list_empty(list));
4790} 4682}
4791 4683
@@ -5061,13 +4953,14 @@ static int __memcg_activate_kmem(struct mem_cgroup *memcg,
5061 * Make sure we have enough space for this cgroup in each root cache's 4953 * Make sure we have enough space for this cgroup in each root cache's
5062 * memcg_params. 4954 * memcg_params.
5063 */ 4955 */
4956 mutex_lock(&memcg_slab_mutex);
5064 err = memcg_update_all_caches(memcg_id + 1); 4957 err = memcg_update_all_caches(memcg_id + 1);
4958 mutex_unlock(&memcg_slab_mutex);
5065 if (err) 4959 if (err)
5066 goto out_rmid; 4960 goto out_rmid;
5067 4961
5068 memcg->kmemcg_id = memcg_id; 4962 memcg->kmemcg_id = memcg_id;
5069 INIT_LIST_HEAD(&memcg->memcg_slab_caches); 4963 INIT_LIST_HEAD(&memcg->memcg_slab_caches);
5070 mutex_init(&memcg->slab_caches_mutex);
5071 4964
5072 /* 4965 /*
5073 * We couldn't have accounted to this cgroup, because it hasn't got the 4966 * We couldn't have accounted to this cgroup, because it hasn't got the
@@ -5442,22 +5335,14 @@ static int mem_cgroup_swappiness_write(struct cgroup_subsys_state *css,
5442 struct cftype *cft, u64 val) 5335 struct cftype *cft, u64 val)
5443{ 5336{
5444 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 5337 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5445 struct mem_cgroup *parent = mem_cgroup_from_css(css_parent(&memcg->css));
5446 5338
5447 if (val > 100 || !parent) 5339 if (val > 100)
5448 return -EINVAL; 5340 return -EINVAL;
5449 5341
5450 mutex_lock(&memcg_create_mutex); 5342 if (css_parent(css))
5451 5343 memcg->swappiness = val;
5452 /* If under hierarchy, only empty-root can set this value */ 5344 else
5453 if ((parent->use_hierarchy) || memcg_has_children(memcg)) { 5345 vm_swappiness = val;
5454 mutex_unlock(&memcg_create_mutex);
5455 return -EINVAL;
5456 }
5457
5458 memcg->swappiness = val;
5459
5460 mutex_unlock(&memcg_create_mutex);
5461 5346
5462 return 0; 5347 return 0;
5463} 5348}
@@ -5789,22 +5674,15 @@ static int mem_cgroup_oom_control_write(struct cgroup_subsys_state *css,
5789 struct cftype *cft, u64 val) 5674 struct cftype *cft, u64 val)
5790{ 5675{
5791 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 5676 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5792 struct mem_cgroup *parent = mem_cgroup_from_css(css_parent(&memcg->css));
5793 5677
5794 /* cannot set to root cgroup and only 0 and 1 are allowed */ 5678 /* cannot set to root cgroup and only 0 and 1 are allowed */
5795 if (!parent || !((val == 0) || (val == 1))) 5679 if (!css_parent(css) || !((val == 0) || (val == 1)))
5796 return -EINVAL; 5680 return -EINVAL;
5797 5681
5798 mutex_lock(&memcg_create_mutex);
5799 /* oom-kill-disable is a flag for subhierarchy. */
5800 if ((parent->use_hierarchy) || memcg_has_children(memcg)) {
5801 mutex_unlock(&memcg_create_mutex);
5802 return -EINVAL;
5803 }
5804 memcg->oom_kill_disable = val; 5682 memcg->oom_kill_disable = val;
5805 if (!val) 5683 if (!val)
5806 memcg_oom_recover(memcg); 5684 memcg_oom_recover(memcg);
5807 mutex_unlock(&memcg_create_mutex); 5685
5808 return 0; 5686 return 0;
5809} 5687}
5810 5688
@@ -6490,7 +6368,7 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
6490 css_for_each_descendant_post(iter, css) 6368 css_for_each_descendant_post(iter, css)
6491 mem_cgroup_reparent_charges(mem_cgroup_from_css(iter)); 6369 mem_cgroup_reparent_charges(mem_cgroup_from_css(iter));
6492 6370
6493 mem_cgroup_destroy_all_caches(memcg); 6371 memcg_unregister_all_caches(memcg);
6494 vmpressure_cleanup(&memcg->vmpressure); 6372 vmpressure_cleanup(&memcg->vmpressure);
6495} 6373}
6496 6374
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 35ef28acf137..cd8989c1027e 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -204,9 +204,9 @@ static int kill_proc(struct task_struct *t, unsigned long addr, int trapno,
204#endif 204#endif
205 si.si_addr_lsb = compound_order(compound_head(page)) + PAGE_SHIFT; 205 si.si_addr_lsb = compound_order(compound_head(page)) + PAGE_SHIFT;
206 206
207 if ((flags & MF_ACTION_REQUIRED) && t == current) { 207 if ((flags & MF_ACTION_REQUIRED) && t->mm == current->mm) {
208 si.si_code = BUS_MCEERR_AR; 208 si.si_code = BUS_MCEERR_AR;
209 ret = force_sig_info(SIGBUS, &si, t); 209 ret = force_sig_info(SIGBUS, &si, current);
210 } else { 210 } else {
211 /* 211 /*
212 * Don't use force here, it's convenient if the signal 212 * Don't use force here, it's convenient if the signal
@@ -380,20 +380,51 @@ static void kill_procs(struct list_head *to_kill, int forcekill, int trapno,
380 } 380 }
381} 381}
382 382
383static int task_early_kill(struct task_struct *tsk) 383/*
384 * Find a dedicated thread which is supposed to handle SIGBUS(BUS_MCEERR_AO)
385 * on behalf of the thread group. Return task_struct of the (first found)
386 * dedicated thread if found, and return NULL otherwise.
387 *
388 * We already hold read_lock(&tasklist_lock) in the caller, so we don't
389 * have to call rcu_read_lock/unlock() in this function.
390 */
391static struct task_struct *find_early_kill_thread(struct task_struct *tsk)
384{ 392{
393 struct task_struct *t;
394
395 for_each_thread(tsk, t)
396 if ((t->flags & PF_MCE_PROCESS) && (t->flags & PF_MCE_EARLY))
397 return t;
398 return NULL;
399}
400
401/*
402 * Determine whether a given process is "early kill" process which expects
403 * to be signaled when some page under the process is hwpoisoned.
404 * Return task_struct of the dedicated thread (main thread unless explicitly
405 * specified) if the process is "early kill," and otherwise returns NULL.
406 */
407static struct task_struct *task_early_kill(struct task_struct *tsk,
408 int force_early)
409{
410 struct task_struct *t;
385 if (!tsk->mm) 411 if (!tsk->mm)
386 return 0; 412 return NULL;
387 if (tsk->flags & PF_MCE_PROCESS) 413 if (force_early)
388 return !!(tsk->flags & PF_MCE_EARLY); 414 return tsk;
389 return sysctl_memory_failure_early_kill; 415 t = find_early_kill_thread(tsk);
416 if (t)
417 return t;
418 if (sysctl_memory_failure_early_kill)
419 return tsk;
420 return NULL;
390} 421}
391 422
392/* 423/*
393 * Collect processes when the error hit an anonymous page. 424 * Collect processes when the error hit an anonymous page.
394 */ 425 */
395static void collect_procs_anon(struct page *page, struct list_head *to_kill, 426static void collect_procs_anon(struct page *page, struct list_head *to_kill,
396 struct to_kill **tkc) 427 struct to_kill **tkc, int force_early)
397{ 428{
398 struct vm_area_struct *vma; 429 struct vm_area_struct *vma;
399 struct task_struct *tsk; 430 struct task_struct *tsk;
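task_early_kill()/find_early_kill_thread() let one thread of a process volunteer to receive the BUS_MCEERR_AO notification for the whole thread group; the PF_MCE_PROCESS/PF_MCE_EARLY flags they test are the ones a thread sets on itself with prctl(PR_MCE_KILL). A hedged userspace sketch of that side of the handshake (illustrative, not part of the patch):

/* Illustrative only: run in the thread that should own BUS_MCEERR_AO. */
#include <stdio.h>
#include <sys/prctl.h>

static void become_early_kill_thread(void)
{
	/* Marks the calling thread with PF_MCE_PROCESS | PF_MCE_EARLY. */
	if (prctl(PR_MCE_KILL, PR_MCE_KILL_SET, PR_MCE_KILL_EARLY, 0, 0))
		perror("prctl(PR_MCE_KILL)");
	/* ...then install a SIGBUS handler that checks si_code == BUS_MCEERR_AO. */
}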
@@ -408,16 +439,17 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill,
408 read_lock(&tasklist_lock); 439 read_lock(&tasklist_lock);
409 for_each_process (tsk) { 440 for_each_process (tsk) {
410 struct anon_vma_chain *vmac; 441 struct anon_vma_chain *vmac;
442 struct task_struct *t = task_early_kill(tsk, force_early);
411 443
412 if (!task_early_kill(tsk)) 444 if (!t)
413 continue; 445 continue;
414 anon_vma_interval_tree_foreach(vmac, &av->rb_root, 446 anon_vma_interval_tree_foreach(vmac, &av->rb_root,
415 pgoff, pgoff) { 447 pgoff, pgoff) {
416 vma = vmac->vma; 448 vma = vmac->vma;
417 if (!page_mapped_in_vma(page, vma)) 449 if (!page_mapped_in_vma(page, vma))
418 continue; 450 continue;
419 if (vma->vm_mm == tsk->mm) 451 if (vma->vm_mm == t->mm)
420 add_to_kill(tsk, page, vma, to_kill, tkc); 452 add_to_kill(t, page, vma, to_kill, tkc);
421 } 453 }
422 } 454 }
423 read_unlock(&tasklist_lock); 455 read_unlock(&tasklist_lock);
@@ -428,7 +460,7 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill,
428 * Collect processes when the error hit a file mapped page. 460 * Collect processes when the error hit a file mapped page.
429 */ 461 */
430static void collect_procs_file(struct page *page, struct list_head *to_kill, 462static void collect_procs_file(struct page *page, struct list_head *to_kill,
431 struct to_kill **tkc) 463 struct to_kill **tkc, int force_early)
432{ 464{
433 struct vm_area_struct *vma; 465 struct vm_area_struct *vma;
434 struct task_struct *tsk; 466 struct task_struct *tsk;
@@ -438,10 +470,10 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill,
438 read_lock(&tasklist_lock); 470 read_lock(&tasklist_lock);
439 for_each_process(tsk) { 471 for_each_process(tsk) {
440 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); 472 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
473 struct task_struct *t = task_early_kill(tsk, force_early);
441 474
442 if (!task_early_kill(tsk)) 475 if (!t)
443 continue; 476 continue;
444
445 vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, 477 vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff,
446 pgoff) { 478 pgoff) {
447 /* 479 /*
@@ -451,8 +483,8 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill,
451 * Assume applications who requested early kill want 483 * Assume applications who requested early kill want
452 * to be informed of all such data corruptions. 484 * to be informed of all such data corruptions.
453 */ 485 */
454 if (vma->vm_mm == tsk->mm) 486 if (vma->vm_mm == t->mm)
455 add_to_kill(tsk, page, vma, to_kill, tkc); 487 add_to_kill(t, page, vma, to_kill, tkc);
456 } 488 }
457 } 489 }
458 read_unlock(&tasklist_lock); 490 read_unlock(&tasklist_lock);
@@ -465,7 +497,8 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill,
465 * First preallocate one tokill structure outside the spin locks, 497 * First preallocate one tokill structure outside the spin locks,
466 * so that we can kill at least one process reasonably reliable. 498 * so that we can kill at least one process reasonably reliable.
467 */ 499 */
468static void collect_procs(struct page *page, struct list_head *tokill) 500static void collect_procs(struct page *page, struct list_head *tokill,
501 int force_early)
469{ 502{
470 struct to_kill *tk; 503 struct to_kill *tk;
471 504
@@ -476,9 +509,9 @@ static void collect_procs(struct page *page, struct list_head *tokill)
476 if (!tk) 509 if (!tk)
477 return; 510 return;
478 if (PageAnon(page)) 511 if (PageAnon(page))
479 collect_procs_anon(page, tokill, &tk); 512 collect_procs_anon(page, tokill, &tk, force_early);
480 else 513 else
481 collect_procs_file(page, tokill, &tk); 514 collect_procs_file(page, tokill, &tk, force_early);
482 kfree(tk); 515 kfree(tk);
483} 516}
484 517
@@ -963,7 +996,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
963 * there's nothing that can be done. 996 * there's nothing that can be done.
964 */ 997 */
965 if (kill) 998 if (kill)
966 collect_procs(ppage, &tokill); 999 collect_procs(ppage, &tokill, flags & MF_ACTION_REQUIRED);
967 1000
968 ret = try_to_unmap(ppage, ttu); 1001 ret = try_to_unmap(ppage, ttu);
969 if (ret != SWAP_SUCCESS) 1002 if (ret != SWAP_SUCCESS)
@@ -1081,15 +1114,16 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
1081 return 0; 1114 return 0;
1082 } else if (PageHuge(hpage)) { 1115 } else if (PageHuge(hpage)) {
1083 /* 1116 /*
1084 * Check "just unpoisoned", "filter hit", and 1117 * Check "filter hit" and "race with other subpage."
1085 * "race with other subpage."
1086 */ 1118 */
1087 lock_page(hpage); 1119 lock_page(hpage);
1088 if (!PageHWPoison(hpage) 1120 if (PageHWPoison(hpage)) {
1089 || (hwpoison_filter(p) && TestClearPageHWPoison(p)) 1121 if ((hwpoison_filter(p) && TestClearPageHWPoison(p))
1090 || (p != hpage && TestSetPageHWPoison(hpage))) { 1122 || (p != hpage && TestSetPageHWPoison(hpage))) {
1091 atomic_long_sub(nr_pages, &num_poisoned_pages); 1123 atomic_long_sub(nr_pages, &num_poisoned_pages);
1092 return 0; 1124 unlock_page(hpage);
1125 return 0;
1126 }
1093 } 1127 }
1094 set_page_hwpoison_huge_page(hpage); 1128 set_page_hwpoison_huge_page(hpage);
1095 res = dequeue_hwpoisoned_huge_page(hpage); 1129 res = dequeue_hwpoisoned_huge_page(hpage);
@@ -1131,11 +1165,6 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
1131 } 1165 }
1132 } 1166 }
1133 1167
1134 /*
1135 * Lock the page and wait for writeback to finish.
1136 * It's very difficult to mess with pages currently under IO
1137 * and in many cases impossible, so we just avoid it here.
1138 */
1139 lock_page(hpage); 1168 lock_page(hpage);
1140 1169
1141 /* 1170 /*
@@ -1152,6 +1181,8 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
1152 */ 1181 */
1153 if (!PageHWPoison(p)) { 1182 if (!PageHWPoison(p)) {
1154 printk(KERN_ERR "MCE %#lx: just unpoisoned\n", pfn); 1183 printk(KERN_ERR "MCE %#lx: just unpoisoned\n", pfn);
1184 atomic_long_sub(nr_pages, &num_poisoned_pages);
1185 put_page(hpage);
1155 res = 0; 1186 res = 0;
1156 goto out; 1187 goto out;
1157 } 1188 }
@@ -1183,6 +1214,10 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
1183 if (PageHuge(p)) 1214 if (PageHuge(p))
1184 set_page_hwpoison_huge_page(hpage); 1215 set_page_hwpoison_huge_page(hpage);
1185 1216
1217 /*
1218 * It's very difficult to mess with pages currently under IO
1219 * and in many cases impossible, so we just avoid it here.
1220 */
1186 wait_on_page_writeback(p); 1221 wait_on_page_writeback(p);
1187 1222
1188 /* 1223 /*
@@ -1295,7 +1330,7 @@ static void memory_failure_work_func(struct work_struct *work)
1295 unsigned long proc_flags; 1330 unsigned long proc_flags;
1296 int gotten; 1331 int gotten;
1297 1332
1298 mf_cpu = &__get_cpu_var(memory_failure_cpu); 1333 mf_cpu = this_cpu_ptr(&memory_failure_cpu);
1299 for (;;) { 1334 for (;;) {
1300 spin_lock_irqsave(&mf_cpu->lock, proc_flags); 1335 spin_lock_irqsave(&mf_cpu->lock, proc_flags);
1301 gotten = kfifo_get(&mf_cpu->fifo, &entry); 1336 gotten = kfifo_get(&mf_cpu->fifo, &entry);
@@ -1500,7 +1535,7 @@ static int soft_offline_huge_page(struct page *page, int flags)
1500 1535
1501 /* Keep page count to indicate a given hugepage is isolated. */ 1536 /* Keep page count to indicate a given hugepage is isolated. */
1502 list_move(&hpage->lru, &pagelist); 1537 list_move(&hpage->lru, &pagelist);
1503 ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 1538 ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL,
1504 MIGRATE_SYNC, MR_MEMORY_FAILURE); 1539 MIGRATE_SYNC, MR_MEMORY_FAILURE);
1505 if (ret) { 1540 if (ret) {
1506 pr_info("soft offline: %#lx: migration failed %d, type %lx\n", 1541 pr_info("soft offline: %#lx: migration failed %d, type %lx\n",
@@ -1581,7 +1616,7 @@ static int __soft_offline_page(struct page *page, int flags)
1581 inc_zone_page_state(page, NR_ISOLATED_ANON + 1616 inc_zone_page_state(page, NR_ISOLATED_ANON +
1582 page_is_file_cache(page)); 1617 page_is_file_cache(page));
1583 list_add(&page->lru, &pagelist); 1618 list_add(&page->lru, &pagelist);
1584 ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 1619 ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL,
1585 MIGRATE_SYNC, MR_MEMORY_FAILURE); 1620 MIGRATE_SYNC, MR_MEMORY_FAILURE);
1586 if (ret) { 1621 if (ret) {
1587 if (!list_empty(&pagelist)) { 1622 if (!list_empty(&pagelist)) {
@@ -1661,11 +1696,7 @@ int soft_offline_page(struct page *page, int flags)
1661 } 1696 }
1662 } 1697 }
1663 1698
1664 /* 1699 get_online_mems();
1665 * The lock_memory_hotplug prevents a race with memory hotplug.
1666 * This is a big hammer, a better would be nicer.
1667 */
1668 lock_memory_hotplug();
1669 1700
1670 /* 1701 /*
1671 * Isolate the page, so that it doesn't get reallocated if it 1702 * Isolate the page, so that it doesn't get reallocated if it
@@ -1676,7 +1707,7 @@ int soft_offline_page(struct page *page, int flags)
1676 set_migratetype_isolate(page, true); 1707 set_migratetype_isolate(page, true);
1677 1708
1678 ret = get_any_page(page, pfn, flags); 1709 ret = get_any_page(page, pfn, flags);
1679 unlock_memory_hotplug(); 1710 put_online_mems();
1680 if (ret > 0) { /* for in-use pages */ 1711 if (ret > 0) { /* for in-use pages */
1681 if (PageHuge(page)) 1712 if (PageHuge(page))
1682 ret = soft_offline_huge_page(page, flags); 1713 ret = soft_offline_huge_page(page, flags);
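In soft_offline_page() above, the removed comment calls the old lock_memory_hotplug() mutex a big hammer; the replacement get_online_mems()/put_online_mems() pair (implemented later in this diff, in mm/memory_hotplug.c) is a reader-side section that only excludes a concurrent hotplug writer, so independent soft-offline requests no longer serialize against each other. A condensed, hedged sketch of the resulting flow follows; the wrapper name is invented and the body is trimmed to the locking-relevant calls shown in the hunk.

/*
 * Sketch of the flow above (wrapper name invented); get_any_page() and
 * set_migratetype_isolate() are the same static helpers used in the hunk.
 */
static int soft_offline_one_pfn(struct page *page, unsigned long pfn, int flags)
{
        int ret;

        get_online_mems();      /* readers nest and run concurrently */

        /*
         * Isolate the page so it cannot be handed out again while we look
         * at it; the section only has to keep a hotplug writer from
         * offlining the range, it no longer excludes other readers.
         */
        set_migratetype_isolate(page, true);
        ret = get_any_page(page, pfn, flags);

        put_online_mems();      /* cheap: a short mutex hold, no global lock */
        return ret;
}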
diff --git a/mm/memory.c b/mm/memory.c
index 037b812a9531..d67fd9fcf1f2 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -698,11 +698,6 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
698 add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); 698 add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
699} 699}
700 700
701static inline bool is_cow_mapping(vm_flags_t flags)
702{
703 return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
704}
705
706/* 701/*
707 * vm_normal_page -- This function gets the "struct page" associated with a pte. 702 * vm_normal_page -- This function gets the "struct page" associated with a pte.
708 * 703 *
@@ -756,7 +751,7 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
756 unsigned long pfn = pte_pfn(pte); 751 unsigned long pfn = pte_pfn(pte);
757 752
758 if (HAVE_PTE_SPECIAL) { 753 if (HAVE_PTE_SPECIAL) {
759 if (likely(!pte_special(pte))) 754 if (likely(!pte_special(pte) || pte_numa(pte)))
760 goto check_pfn; 755 goto check_pfn;
761 if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP)) 756 if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))
762 return NULL; 757 return NULL;
@@ -782,14 +777,15 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
782 } 777 }
783 } 778 }
784 779
785 if (is_zero_pfn(pfn))
786 return NULL;
787check_pfn: 780check_pfn:
788 if (unlikely(pfn > highest_memmap_pfn)) { 781 if (unlikely(pfn > highest_memmap_pfn)) {
789 print_bad_pte(vma, addr, pte, NULL); 782 print_bad_pte(vma, addr, pte, NULL);
790 return NULL; 783 return NULL;
791 } 784 }
792 785
786 if (is_zero_pfn(pfn))
787 return NULL;
788
793 /* 789 /*
794 * NOTE! We still have PageReserved() pages in the page tables. 790 * NOTE! We still have PageReserved() pages in the page tables.
795 * eg. VDSO mappings can cause them to exist. 791 * eg. VDSO mappings can cause them to exist.
@@ -1457,646 +1453,6 @@ int zap_vma_ptes(struct vm_area_struct *vma, unsigned long address,
1457} 1453}
1458EXPORT_SYMBOL_GPL(zap_vma_ptes); 1454EXPORT_SYMBOL_GPL(zap_vma_ptes);
1459 1455
1460/**
1461 * follow_page_mask - look up a page descriptor from a user-virtual address
1462 * @vma: vm_area_struct mapping @address
1463 * @address: virtual address to look up
1464 * @flags: flags modifying lookup behaviour
1465 * @page_mask: on output, *page_mask is set according to the size of the page
1466 *
1467 * @flags can have FOLL_ flags set, defined in <linux/mm.h>
1468 *
1469 * Returns the mapped (struct page *), %NULL if no mapping exists, or
1470 * an error pointer if there is a mapping to something not represented
1471 * by a page descriptor (see also vm_normal_page()).
1472 */
1473struct page *follow_page_mask(struct vm_area_struct *vma,
1474 unsigned long address, unsigned int flags,
1475 unsigned int *page_mask)
1476{
1477 pgd_t *pgd;
1478 pud_t *pud;
1479 pmd_t *pmd;
1480 pte_t *ptep, pte;
1481 spinlock_t *ptl;
1482 struct page *page;
1483 struct mm_struct *mm = vma->vm_mm;
1484
1485 *page_mask = 0;
1486
1487 page = follow_huge_addr(mm, address, flags & FOLL_WRITE);
1488 if (!IS_ERR(page)) {
1489 BUG_ON(flags & FOLL_GET);
1490 goto out;
1491 }
1492
1493 page = NULL;
1494 pgd = pgd_offset(mm, address);
1495 if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
1496 goto no_page_table;
1497
1498 pud = pud_offset(pgd, address);
1499 if (pud_none(*pud))
1500 goto no_page_table;
1501 if (pud_huge(*pud) && vma->vm_flags & VM_HUGETLB) {
1502 if (flags & FOLL_GET)
1503 goto out;
1504 page = follow_huge_pud(mm, address, pud, flags & FOLL_WRITE);
1505 goto out;
1506 }
1507 if (unlikely(pud_bad(*pud)))
1508 goto no_page_table;
1509
1510 pmd = pmd_offset(pud, address);
1511 if (pmd_none(*pmd))
1512 goto no_page_table;
1513 if (pmd_huge(*pmd) && vma->vm_flags & VM_HUGETLB) {
1514 page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE);
1515 if (flags & FOLL_GET) {
1516 /*
1517 * Refcount on tail pages are not well-defined and
1518 * shouldn't be taken. The caller should handle a NULL
1519 * return when trying to follow tail pages.
1520 */
1521 if (PageHead(page))
1522 get_page(page);
1523 else {
1524 page = NULL;
1525 goto out;
1526 }
1527 }
1528 goto out;
1529 }
1530 if ((flags & FOLL_NUMA) && pmd_numa(*pmd))
1531 goto no_page_table;
1532 if (pmd_trans_huge(*pmd)) {
1533 if (flags & FOLL_SPLIT) {
1534 split_huge_page_pmd(vma, address, pmd);
1535 goto split_fallthrough;
1536 }
1537 ptl = pmd_lock(mm, pmd);
1538 if (likely(pmd_trans_huge(*pmd))) {
1539 if (unlikely(pmd_trans_splitting(*pmd))) {
1540 spin_unlock(ptl);
1541 wait_split_huge_page(vma->anon_vma, pmd);
1542 } else {
1543 page = follow_trans_huge_pmd(vma, address,
1544 pmd, flags);
1545 spin_unlock(ptl);
1546 *page_mask = HPAGE_PMD_NR - 1;
1547 goto out;
1548 }
1549 } else
1550 spin_unlock(ptl);
1551 /* fall through */
1552 }
1553split_fallthrough:
1554 if (unlikely(pmd_bad(*pmd)))
1555 goto no_page_table;
1556
1557 ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
1558
1559 pte = *ptep;
1560 if (!pte_present(pte)) {
1561 swp_entry_t entry;
1562 /*
1563 * KSM's break_ksm() relies upon recognizing a ksm page
1564 * even while it is being migrated, so for that case we
1565 * need migration_entry_wait().
1566 */
1567 if (likely(!(flags & FOLL_MIGRATION)))
1568 goto no_page;
1569 if (pte_none(pte) || pte_file(pte))
1570 goto no_page;
1571 entry = pte_to_swp_entry(pte);
1572 if (!is_migration_entry(entry))
1573 goto no_page;
1574 pte_unmap_unlock(ptep, ptl);
1575 migration_entry_wait(mm, pmd, address);
1576 goto split_fallthrough;
1577 }
1578 if ((flags & FOLL_NUMA) && pte_numa(pte))
1579 goto no_page;
1580 if ((flags & FOLL_WRITE) && !pte_write(pte))
1581 goto unlock;
1582
1583 page = vm_normal_page(vma, address, pte);
1584 if (unlikely(!page)) {
1585 if ((flags & FOLL_DUMP) ||
1586 !is_zero_pfn(pte_pfn(pte)))
1587 goto bad_page;
1588 page = pte_page(pte);
1589 }
1590
1591 if (flags & FOLL_GET)
1592 get_page_foll(page);
1593 if (flags & FOLL_TOUCH) {
1594 if ((flags & FOLL_WRITE) &&
1595 !pte_dirty(pte) && !PageDirty(page))
1596 set_page_dirty(page);
1597 /*
1598 * pte_mkyoung() would be more correct here, but atomic care
1599 * is needed to avoid losing the dirty bit: it is easier to use
1600 * mark_page_accessed().
1601 */
1602 mark_page_accessed(page);
1603 }
1604 if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
1605 /*
1606 * The preliminary mapping check is mainly to avoid the
1607 * pointless overhead of lock_page on the ZERO_PAGE
1608 * which might bounce very badly if there is contention.
1609 *
1610 * If the page is already locked, we don't need to
1611 * handle it now - vmscan will handle it later if and
1612 * when it attempts to reclaim the page.
1613 */
1614 if (page->mapping && trylock_page(page)) {
1615 lru_add_drain(); /* push cached pages to LRU */
1616 /*
1617 * Because we lock page here, and migration is
1618 * blocked by the pte's page reference, and we
1619 * know the page is still mapped, we don't even
1620 * need to check for file-cache page truncation.
1621 */
1622 mlock_vma_page(page);
1623 unlock_page(page);
1624 }
1625 }
1626unlock:
1627 pte_unmap_unlock(ptep, ptl);
1628out:
1629 return page;
1630
1631bad_page:
1632 pte_unmap_unlock(ptep, ptl);
1633 return ERR_PTR(-EFAULT);
1634
1635no_page:
1636 pte_unmap_unlock(ptep, ptl);
1637 if (!pte_none(pte))
1638 return page;
1639
1640no_page_table:
1641 /*
1642 * When core dumping an enormous anonymous area that nobody
1643 * has touched so far, we don't want to allocate unnecessary pages or
1644 * page tables. Return error instead of NULL to skip handle_mm_fault,
1645 * then get_dump_page() will return NULL to leave a hole in the dump.
1646 * But we can only make this optimization where a hole would surely
1647 * be zero-filled if handle_mm_fault() actually did handle it.
1648 */
1649 if ((flags & FOLL_DUMP) &&
1650 (!vma->vm_ops || !vma->vm_ops->fault))
1651 return ERR_PTR(-EFAULT);
1652 return page;
1653}
1654
1655static inline int stack_guard_page(struct vm_area_struct *vma, unsigned long addr)
1656{
1657 return stack_guard_page_start(vma, addr) ||
1658 stack_guard_page_end(vma, addr+PAGE_SIZE);
1659}
1660
1661/**
1662 * __get_user_pages() - pin user pages in memory
1663 * @tsk: task_struct of target task
1664 * @mm: mm_struct of target mm
1665 * @start: starting user address
1666 * @nr_pages: number of pages from start to pin
1667 * @gup_flags: flags modifying pin behaviour
1668 * @pages: array that receives pointers to the pages pinned.
1669 * Should be at least nr_pages long. Or NULL, if caller
1670 * only intends to ensure the pages are faulted in.
1671 * @vmas: array of pointers to vmas corresponding to each page.
1672 * Or NULL if the caller does not require them.
1673 * @nonblocking: whether waiting for disk IO or mmap_sem contention
1674 *
1675 * Returns number of pages pinned. This may be fewer than the number
1676 * requested. If nr_pages is 0 or negative, returns 0. If no pages
1677 * were pinned, returns -errno. Each page returned must be released
1678 * with a put_page() call when it is finished with. vmas will only
1679 * remain valid while mmap_sem is held.
1680 *
1681 * Must be called with mmap_sem held for read or write.
1682 *
1683 * __get_user_pages walks a process's page tables and takes a reference to
1684 * each struct page that each user address corresponds to at a given
1685 * instant. That is, it takes the page that would be accessed if a user
1686 * thread accesses the given user virtual address at that instant.
1687 *
1688 * This does not guarantee that the page exists in the user mappings when
1689 * __get_user_pages returns, and there may even be a completely different
1690 * page there in some cases (eg. if mmapped pagecache has been invalidated
1691 * and subsequently re-faulted). However, it does guarantee that the page
1692 * won't be freed completely. And mostly callers simply care that the page
1693 * contains data that was valid *at some point in time*. Typically, an IO
1694 * or similar operation cannot guarantee anything stronger anyway because
1695 * locks can't be held over the syscall boundary.
1696 *
1697 * If @gup_flags & FOLL_WRITE == 0, the page must not be written to. If
1698 * the page is written to, set_page_dirty (or set_page_dirty_lock, as
1699 * appropriate) must be called after the page is finished with, and
1700 * before put_page is called.
1701 *
1702 * If @nonblocking != NULL, __get_user_pages will not wait for disk IO
1703 * or mmap_sem contention, and if waiting is needed to pin all pages,
1704 * *@nonblocking will be set to 0.
1705 *
1706 * In most cases, get_user_pages or get_user_pages_fast should be used
1707 * instead of __get_user_pages. __get_user_pages should be used only if
1708 * you need some special @gup_flags.
1709 */
1710long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1711 unsigned long start, unsigned long nr_pages,
1712 unsigned int gup_flags, struct page **pages,
1713 struct vm_area_struct **vmas, int *nonblocking)
1714{
1715 long i;
1716 unsigned long vm_flags;
1717 unsigned int page_mask;
1718
1719 if (!nr_pages)
1720 return 0;
1721
1722 VM_BUG_ON(!!pages != !!(gup_flags & FOLL_GET));
1723
1724 /*
1725 * If FOLL_FORCE and FOLL_NUMA are both set, handle_mm_fault
1726 * would be called on PROT_NONE ranges. We must never invoke
1727 * handle_mm_fault on PROT_NONE ranges or the NUMA hinting
1728 * page faults would unprotect the PROT_NONE ranges if
1729 * _PAGE_NUMA and _PAGE_PROTNONE are sharing the same pte/pmd
1730 * bitflag. So to avoid that, don't set FOLL_NUMA if
1731 * FOLL_FORCE is set.
1732 */
1733 if (!(gup_flags & FOLL_FORCE))
1734 gup_flags |= FOLL_NUMA;
1735
1736 i = 0;
1737
1738 do {
1739 struct vm_area_struct *vma;
1740
1741 vma = find_extend_vma(mm, start);
1742 if (!vma && in_gate_area(mm, start)) {
1743 unsigned long pg = start & PAGE_MASK;
1744 pgd_t *pgd;
1745 pud_t *pud;
1746 pmd_t *pmd;
1747 pte_t *pte;
1748
1749 /* user gate pages are read-only */
1750 if (gup_flags & FOLL_WRITE)
1751 goto efault;
1752 if (pg > TASK_SIZE)
1753 pgd = pgd_offset_k(pg);
1754 else
1755 pgd = pgd_offset_gate(mm, pg);
1756 BUG_ON(pgd_none(*pgd));
1757 pud = pud_offset(pgd, pg);
1758 BUG_ON(pud_none(*pud));
1759 pmd = pmd_offset(pud, pg);
1760 if (pmd_none(*pmd))
1761 goto efault;
1762 VM_BUG_ON(pmd_trans_huge(*pmd));
1763 pte = pte_offset_map(pmd, pg);
1764 if (pte_none(*pte)) {
1765 pte_unmap(pte);
1766 goto efault;
1767 }
1768 vma = get_gate_vma(mm);
1769 if (pages) {
1770 struct page *page;
1771
1772 page = vm_normal_page(vma, start, *pte);
1773 if (!page) {
1774 if (!(gup_flags & FOLL_DUMP) &&
1775 is_zero_pfn(pte_pfn(*pte)))
1776 page = pte_page(*pte);
1777 else {
1778 pte_unmap(pte);
1779 goto efault;
1780 }
1781 }
1782 pages[i] = page;
1783 get_page(page);
1784 }
1785 pte_unmap(pte);
1786 page_mask = 0;
1787 goto next_page;
1788 }
1789
1790 if (!vma)
1791 goto efault;
1792 vm_flags = vma->vm_flags;
1793 if (vm_flags & (VM_IO | VM_PFNMAP))
1794 goto efault;
1795
1796 if (gup_flags & FOLL_WRITE) {
1797 if (!(vm_flags & VM_WRITE)) {
1798 if (!(gup_flags & FOLL_FORCE))
1799 goto efault;
1800 /*
1801 * We used to let the write,force case do COW
1802 * in a VM_MAYWRITE VM_SHARED !VM_WRITE vma, so
1803 * ptrace could set a breakpoint in a read-only
1804 * mapping of an executable, without corrupting
1805 * the file (yet only when that file had been
1806 * opened for writing!). Anon pages in shared
1807 * mappings are surprising: now just reject it.
1808 */
1809 if (!is_cow_mapping(vm_flags)) {
1810 WARN_ON_ONCE(vm_flags & VM_MAYWRITE);
1811 goto efault;
1812 }
1813 }
1814 } else {
1815 if (!(vm_flags & VM_READ)) {
1816 if (!(gup_flags & FOLL_FORCE))
1817 goto efault;
1818 /*
1819 * Is there actually any vma we can reach here
1820 * which does not have VM_MAYREAD set?
1821 */
1822 if (!(vm_flags & VM_MAYREAD))
1823 goto efault;
1824 }
1825 }
1826
1827 if (is_vm_hugetlb_page(vma)) {
1828 i = follow_hugetlb_page(mm, vma, pages, vmas,
1829 &start, &nr_pages, i, gup_flags);
1830 continue;
1831 }
1832
1833 do {
1834 struct page *page;
1835 unsigned int foll_flags = gup_flags;
1836 unsigned int page_increm;
1837
1838 /*
1839 * If we have a pending SIGKILL, don't keep faulting
1840 * pages and potentially allocating memory.
1841 */
1842 if (unlikely(fatal_signal_pending(current)))
1843 return i ? i : -ERESTARTSYS;
1844
1845 cond_resched();
1846 while (!(page = follow_page_mask(vma, start,
1847 foll_flags, &page_mask))) {
1848 int ret;
1849 unsigned int fault_flags = 0;
1850
1851 /* For mlock, just skip the stack guard page. */
1852 if (foll_flags & FOLL_MLOCK) {
1853 if (stack_guard_page(vma, start))
1854 goto next_page;
1855 }
1856 if (foll_flags & FOLL_WRITE)
1857 fault_flags |= FAULT_FLAG_WRITE;
1858 if (nonblocking)
1859 fault_flags |= FAULT_FLAG_ALLOW_RETRY;
1860 if (foll_flags & FOLL_NOWAIT)
1861 fault_flags |= (FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT);
1862
1863 ret = handle_mm_fault(mm, vma, start,
1864 fault_flags);
1865
1866 if (ret & VM_FAULT_ERROR) {
1867 if (ret & VM_FAULT_OOM)
1868 return i ? i : -ENOMEM;
1869 if (ret & (VM_FAULT_HWPOISON |
1870 VM_FAULT_HWPOISON_LARGE)) {
1871 if (i)
1872 return i;
1873 else if (gup_flags & FOLL_HWPOISON)
1874 return -EHWPOISON;
1875 else
1876 return -EFAULT;
1877 }
1878 if (ret & VM_FAULT_SIGBUS)
1879 goto efault;
1880 BUG();
1881 }
1882
1883 if (tsk) {
1884 if (ret & VM_FAULT_MAJOR)
1885 tsk->maj_flt++;
1886 else
1887 tsk->min_flt++;
1888 }
1889
1890 if (ret & VM_FAULT_RETRY) {
1891 if (nonblocking)
1892 *nonblocking = 0;
1893 return i;
1894 }
1895
1896 /*
1897 * The VM_FAULT_WRITE bit tells us that
1898 * do_wp_page has broken COW when necessary,
1899 * even if maybe_mkwrite decided not to set
1900 * pte_write. We can thus safely do subsequent
1901 * page lookups as if they were reads. But only
1902 * do so when looping for pte_write is futile:
1903 * in some cases userspace may also be wanting
1904 * to write to the gotten user page, which a
1905 * read fault here might prevent (a readonly
1906 * page might get reCOWed by userspace write).
1907 */
1908 if ((ret & VM_FAULT_WRITE) &&
1909 !(vma->vm_flags & VM_WRITE))
1910 foll_flags &= ~FOLL_WRITE;
1911
1912 cond_resched();
1913 }
1914 if (IS_ERR(page))
1915 return i ? i : PTR_ERR(page);
1916 if (pages) {
1917 pages[i] = page;
1918
1919 flush_anon_page(vma, page, start);
1920 flush_dcache_page(page);
1921 page_mask = 0;
1922 }
1923next_page:
1924 if (vmas) {
1925 vmas[i] = vma;
1926 page_mask = 0;
1927 }
1928 page_increm = 1 + (~(start >> PAGE_SHIFT) & page_mask);
1929 if (page_increm > nr_pages)
1930 page_increm = nr_pages;
1931 i += page_increm;
1932 start += page_increm * PAGE_SIZE;
1933 nr_pages -= page_increm;
1934 } while (nr_pages && start < vma->vm_end);
1935 } while (nr_pages);
1936 return i;
1937efault:
1938 return i ? : -EFAULT;
1939}
1940EXPORT_SYMBOL(__get_user_pages);
1941
1942/*
1943 * fixup_user_fault() - manually resolve a user page fault
1944 * @tsk: the task_struct to use for page fault accounting, or
1945 * NULL if faults are not to be recorded.
1946 * @mm: mm_struct of target mm
1947 * @address: user address
1948 * @fault_flags:flags to pass down to handle_mm_fault()
1949 *
1950 * This is meant to be called in the specific scenario where, for locking reasons,
1951 * we try to access user memory in atomic context (within a pagefault_disable()
1952 * section); the access then returns -EFAULT, and we want to resolve the user
1953 * fault before trying again.
1954 *
1955 * Typically this is meant to be used by the futex code.
1956 *
1957 * The main difference with get_user_pages() is that this function will
1958 * unconditionally call handle_mm_fault() which will in turn perform all the
1959 * necessary SW fixup of the dirty and young bits in the PTE, while
1960 * get_user_pages() only guarantees to update these in the struct page.
1961 *
1962 * This is important for some architectures where those bits also gate the
1963 * access permission to the page because they are maintained in software. On
1964 * such architectures, gup() will not be enough to make a subsequent access
1965 * succeed.
1966 *
1967 * This should be called with the mmap_sem held for read.
1968 */
1969int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
1970 unsigned long address, unsigned int fault_flags)
1971{
1972 struct vm_area_struct *vma;
1973 vm_flags_t vm_flags;
1974 int ret;
1975
1976 vma = find_extend_vma(mm, address);
1977 if (!vma || address < vma->vm_start)
1978 return -EFAULT;
1979
1980 vm_flags = (fault_flags & FAULT_FLAG_WRITE) ? VM_WRITE : VM_READ;
1981 if (!(vm_flags & vma->vm_flags))
1982 return -EFAULT;
1983
1984 ret = handle_mm_fault(mm, vma, address, fault_flags);
1985 if (ret & VM_FAULT_ERROR) {
1986 if (ret & VM_FAULT_OOM)
1987 return -ENOMEM;
1988 if (ret & (VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE))
1989 return -EHWPOISON;
1990 if (ret & VM_FAULT_SIGBUS)
1991 return -EFAULT;
1992 BUG();
1993 }
1994 if (tsk) {
1995 if (ret & VM_FAULT_MAJOR)
1996 tsk->maj_flt++;
1997 else
1998 tsk->min_flt++;
1999 }
2000 return 0;
2001}
2002
2003/*
2004 * get_user_pages() - pin user pages in memory
2005 * @tsk: the task_struct to use for page fault accounting, or
2006 * NULL if faults are not to be recorded.
2007 * @mm: mm_struct of target mm
2008 * @start: starting user address
2009 * @nr_pages: number of pages from start to pin
2010 * @write: whether pages will be written to by the caller
2011 * @force: whether to force access even when user mapping is currently
2012 * protected (but never forces write access to shared mapping).
2013 * @pages: array that receives pointers to the pages pinned.
2014 * Should be at least nr_pages long. Or NULL, if caller
2015 * only intends to ensure the pages are faulted in.
2016 * @vmas: array of pointers to vmas corresponding to each page.
2017 * Or NULL if the caller does not require them.
2018 *
2019 * Returns number of pages pinned. This may be fewer than the number
2020 * requested. If nr_pages is 0 or negative, returns 0. If no pages
2021 * were pinned, returns -errno. Each page returned must be released
2022 * with a put_page() call when it is finished with. vmas will only
2023 * remain valid while mmap_sem is held.
2024 *
2025 * Must be called with mmap_sem held for read or write.
2026 *
2027 * get_user_pages walks a process's page tables and takes a reference to
2028 * each struct page that each user address corresponds to at a given
2029 * instant. That is, it takes the page that would be accessed if a user
2030 * thread accesses the given user virtual address at that instant.
2031 *
2032 * This does not guarantee that the page exists in the user mappings when
2033 * get_user_pages returns, and there may even be a completely different
2034 * page there in some cases (eg. if mmapped pagecache has been invalidated
2035 * and subsequently re-faulted). However, it does guarantee that the page
2036 * won't be freed completely. And mostly callers simply care that the page
2037 * contains data that was valid *at some point in time*. Typically, an IO
2038 * or similar operation cannot guarantee anything stronger anyway because
2039 * locks can't be held over the syscall boundary.
2040 *
2041 * If write=0, the page must not be written to. If the page is written to,
2042 * set_page_dirty (or set_page_dirty_lock, as appropriate) must be called
2043 * after the page is finished with, and before put_page is called.
2044 *
2045 * get_user_pages is typically used for fewer-copy IO operations, to get a
2046 * handle on the memory by some means other than accesses via the user virtual
2047 * addresses. The pages may be submitted for DMA to devices or accessed via
2048 * their kernel linear mapping (via the kmap APIs). Care should be taken to
2049 * use the correct cache flushing APIs.
2050 *
2051 * See also get_user_pages_fast, for performance critical applications.
2052 */
2053long get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
2054 unsigned long start, unsigned long nr_pages, int write,
2055 int force, struct page **pages, struct vm_area_struct **vmas)
2056{
2057 int flags = FOLL_TOUCH;
2058
2059 if (pages)
2060 flags |= FOLL_GET;
2061 if (write)
2062 flags |= FOLL_WRITE;
2063 if (force)
2064 flags |= FOLL_FORCE;
2065
2066 return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas,
2067 NULL);
2068}
2069EXPORT_SYMBOL(get_user_pages);
2070
2071/**
2072 * get_dump_page() - pin user page in memory while writing it to core dump
2073 * @addr: user address
2074 *
2075 * Returns struct page pointer of user page pinned for dump,
2076 * to be freed afterwards by page_cache_release() or put_page().
2077 *
2078 * Returns NULL on any kind of failure - a hole must then be inserted into
2079 * the corefile, to preserve alignment with its headers; and also returns
2080 * NULL wherever the ZERO_PAGE, or an anonymous pte_none, has been found -
2081 * allowing a hole to be left in the corefile to save diskspace.
2082 *
2083 * Called without mmap_sem, but after all other threads have been killed.
2084 */
2085#ifdef CONFIG_ELF_CORE
2086struct page *get_dump_page(unsigned long addr)
2087{
2088 struct vm_area_struct *vma;
2089 struct page *page;
2090
2091 if (__get_user_pages(current, current->mm, addr, 1,
2092 FOLL_FORCE | FOLL_DUMP | FOLL_GET, &page, &vma,
2093 NULL) < 1)
2094 return NULL;
2095 flush_cache_page(vma, addr, page_to_pfn(page));
2096 return page;
2097}
2098#endif /* CONFIG_ELF_CORE */
2099
2100pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr, 1456pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr,
2101 spinlock_t **ptl) 1457 spinlock_t **ptl)
2102{ 1458{
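Everything deleted in the hunk above (follow_page_mask(), __get_user_pages(), fixup_user_fault(), get_user_pages(), get_dump_page()) moves to the new mm/gup.c listed in the diffstat; the calling contract spelled out in the comments is unchanged. Below is a hedged caller sketch of that contract, using the six-argument get_user_pages() signature shown above; the wrapper function and the byte-poke scenario are made up, and real code would also range-check uaddr.

#include <linux/mm.h>
#include <linux/highmem.h>
#include <linux/rwsem.h>
#include <linux/sched.h>

/*
 * Illustrative only: pin one user page, write a byte through the kernel
 * mapping, then release it the way the comment above prescribes
 * (set_page_dirty_lock() after the write, before put_page()).
 */
static int poke_user_byte(unsigned long uaddr, unsigned char val)
{
        struct mm_struct *mm = current->mm;
        struct page *page;
        void *kaddr;
        long got;

        down_read(&mm->mmap_sem);       /* get_user_pages() requires mmap_sem */
        got = get_user_pages(current, mm, uaddr & PAGE_MASK, 1,
                             1 /* write */, 0 /* force */, &page, NULL);
        up_read(&mm->mmap_sem);         /* the page reference outlives this */
        if (got != 1)
                return got < 0 ? got : -EFAULT;

        kaddr = kmap(page);
        *((unsigned char *)kaddr + (uaddr & ~PAGE_MASK)) = val;
        kunmap(page);

        set_page_dirty_lock(page);      /* we dirtied it via the kernel mapping */
        put_page(page);                 /* drop the reference taken by GUP */
        return 0;
}

Per the comment above, set_page_dirty_lock() must come after the write and before put_page(), and the pinned page stays usable after mmap_sem is dropped because it carries its own reference; only the vmas array would become stale.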
@@ -3402,65 +2758,76 @@ void do_set_pte(struct vm_area_struct *vma, unsigned long address,
3402 update_mmu_cache(vma, address, pte); 2758 update_mmu_cache(vma, address, pte);
3403} 2759}
3404 2760
3405#define FAULT_AROUND_ORDER 4 2761static unsigned long fault_around_bytes = 65536;
3406 2762
3407#ifdef CONFIG_DEBUG_FS 2763/*
3408static unsigned int fault_around_order = FAULT_AROUND_ORDER; 2764 * fault_around_pages() and fault_around_mask() round down fault_around_bytes
2765 * to the nearest page order, which is what do_fault_around() expects to see.
2766 */
2767static inline unsigned long fault_around_pages(void)
2768{
2769 return rounddown_pow_of_two(fault_around_bytes) / PAGE_SIZE;
2770}
2771
2772static inline unsigned long fault_around_mask(void)
2773{
2774 return ~(rounddown_pow_of_two(fault_around_bytes) - 1) & PAGE_MASK;
2775}
3409 2776
3410static int fault_around_order_get(void *data, u64 *val) 2777
2778#ifdef CONFIG_DEBUG_FS
2779static int fault_around_bytes_get(void *data, u64 *val)
3411{ 2780{
3412 *val = fault_around_order; 2781 *val = fault_around_bytes;
3413 return 0; 2782 return 0;
3414} 2783}
3415 2784
3416static int fault_around_order_set(void *data, u64 val) 2785static int fault_around_bytes_set(void *data, u64 val)
3417{ 2786{
3418 BUILD_BUG_ON((1UL << FAULT_AROUND_ORDER) > PTRS_PER_PTE); 2787 if (val / PAGE_SIZE > PTRS_PER_PTE)
3419 if (1UL << val > PTRS_PER_PTE)
3420 return -EINVAL; 2788 return -EINVAL;
3421 fault_around_order = val; 2789 fault_around_bytes = val;
3422 return 0; 2790 return 0;
3423} 2791}
3424DEFINE_SIMPLE_ATTRIBUTE(fault_around_order_fops, 2792DEFINE_SIMPLE_ATTRIBUTE(fault_around_bytes_fops,
3425 fault_around_order_get, fault_around_order_set, "%llu\n"); 2793 fault_around_bytes_get, fault_around_bytes_set, "%llu\n");
3426 2794
3427static int __init fault_around_debugfs(void) 2795static int __init fault_around_debugfs(void)
3428{ 2796{
3429 void *ret; 2797 void *ret;
3430 2798
3431 ret = debugfs_create_file("fault_around_order", 0644, NULL, NULL, 2799 ret = debugfs_create_file("fault_around_bytes", 0644, NULL, NULL,
3432 &fault_around_order_fops); 2800 &fault_around_bytes_fops);
3433 if (!ret) 2801 if (!ret)
3434 pr_warn("Failed to create fault_around_order in debugfs"); 2802 pr_warn("Failed to create fault_around_bytes in debugfs");
3435 return 0; 2803 return 0;
3436} 2804}
3437late_initcall(fault_around_debugfs); 2805late_initcall(fault_around_debugfs);
3438
3439static inline unsigned long fault_around_pages(void)
3440{
3441 return 1UL << fault_around_order;
3442}
3443
3444static inline unsigned long fault_around_mask(void)
3445{
3446 return ~((1UL << (PAGE_SHIFT + fault_around_order)) - 1);
3447}
3448#else
3449static inline unsigned long fault_around_pages(void)
3450{
3451 unsigned long nr_pages;
3452
3453 nr_pages = 1UL << FAULT_AROUND_ORDER;
3454 BUILD_BUG_ON(nr_pages > PTRS_PER_PTE);
3455 return nr_pages;
3456}
3457
3458static inline unsigned long fault_around_mask(void)
3459{
3460 return ~((1UL << (PAGE_SHIFT + FAULT_AROUND_ORDER)) - 1);
3461}
3462#endif 2806#endif
3463 2807
2808/*
2809 * do_fault_around() tries to map a few pages around the fault address. The hope
2810 * is that the pages will be needed soon and this will lower the number of
2811 * faults to handle.
2812 *
2813 * It uses vm_ops->map_pages() to map the pages, which skips the page if it's
2814 * not ready to be mapped: not up-to-date, locked, etc.
2815 *
2816 * This function is called with the page table lock taken. In the split ptlock
2817 * case the page table lock protects only those entries which belong to
2818 * the page table corresponding to the fault address.
2819 *
2820 * This function doesn't cross the VMA boundaries, in order to call map_pages()
2821 * only once.
2822 *
2823 * fault_around_pages() defines how many pages we'll try to map.
2824 * do_fault_around() expects it to return a power of two less than or equal to
2825 * PTRS_PER_PTE.
2826 *
2827 * The virtual address of the area that we map is naturally aligned to the
2828 * fault_around_pages() value (and therefore to page order). This way it's
2829 * easier to guarantee that we don't cross page table boundaries.
2830 */
3464static void do_fault_around(struct vm_area_struct *vma, unsigned long address, 2831static void do_fault_around(struct vm_area_struct *vma, unsigned long address,
3465 pte_t *pte, pgoff_t pgoff, unsigned int flags) 2832 pte_t *pte, pgoff_t pgoff, unsigned int flags)
3466{ 2833{
@@ -3476,7 +2843,7 @@ static void do_fault_around(struct vm_area_struct *vma, unsigned long address,
3476 2843
3477 /* 2844 /*
3478 * max_pgoff is either end of page table or end of vma 2845 * max_pgoff is either end of page table or end of vma
3479 * or fault_around_pages() from pgoff, depending what is neast. 2846 * or fault_around_pages() from pgoff, depending what is nearest.
3480 */ 2847 */
3481 max_pgoff = pgoff - ((start_addr >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) + 2848 max_pgoff = pgoff - ((start_addr >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) +
3482 PTRS_PER_PTE - 1; 2849 PTRS_PER_PTE - 1;
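The new fault_around_bytes knob replaces fault_around_order, and the helpers introduced above round it down to a power-of-two number of pages so do_fault_around() can work with a simple address mask. Here is a self-contained model of that arithmetic, assuming 4 KiB pages for the sample values; rounddown_pow_of_two() is open-coded only because this is a user-space sketch.

#include <stdio.h>

#define PAGE_SHIFT      12
#define PAGE_SIZE       (1UL << PAGE_SHIFT)
#define PAGE_MASK       (~(PAGE_SIZE - 1))

/* Round down to the highest power of two <= x (x > 0). */
static unsigned long rounddown_pow_of_two(unsigned long x)
{
        while (x & (x - 1))
                x &= x - 1;
        return x;
}

static unsigned long fault_around_pages(unsigned long fault_around_bytes)
{
        return rounddown_pow_of_two(fault_around_bytes) / PAGE_SIZE;
}

static unsigned long fault_around_mask(unsigned long fault_around_bytes)
{
        return ~(rounddown_pow_of_two(fault_around_bytes) - 1) & PAGE_MASK;
}

int main(void)
{
        unsigned long bytes = 65536;            /* the new default set above */
        unsigned long addr  = 0x7f3a12345678UL;

        printf("pages: %lu\n", fault_around_pages(bytes));        /* 16 */
        printf("start: %#lx\n", addr & fault_around_mask(bytes)); /* 64K-aligned */
        return 0;
}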
@@ -3515,7 +2882,7 @@ static int do_read_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3515 * if page by the offset is not ready to be mapped (cold cache or 2882 * if page by the offset is not ready to be mapped (cold cache or
3516 * something). 2883 * something).
3517 */ 2884 */
3518 if (vma->vm_ops->map_pages) { 2885 if (vma->vm_ops->map_pages && fault_around_pages() > 1) {
3519 pte = pte_offset_map_lock(mm, pmd, address, &ptl); 2886 pte = pte_offset_map_lock(mm, pmd, address, &ptl);
3520 do_fault_around(vma, address, pte, pgoff, flags); 2887 do_fault_around(vma, address, pte, pgoff, flags);
3521 if (!pte_same(*pte, orig_pte)) 2888 if (!pte_same(*pte, orig_pte))
@@ -3920,9 +3287,6 @@ static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3920 } 3287 }
3921 } 3288 }
3922 3289
3923 /* THP should already have been handled */
3924 BUG_ON(pmd_numa(*pmd));
3925
3926 /* 3290 /*
3927 * Use __pte_alloc instead of pte_alloc_map, because we can't 3291 * Use __pte_alloc instead of pte_alloc_map, because we can't
3928 * run pte_offset_map on the pmd, if an huge pmd could 3292 * run pte_offset_map on the pmd, if an huge pmd could
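fixup_user_fault(), deleted above along with the rest of the GUP code, is documented as the futex-style fallback: try the access with page faults disabled, and only on failure take mmap_sem and resolve the fault by hand before retrying. A hedged sketch of that calling pattern follows; the wrapper name and the -EAGAIN retry convention are invented, and real callers (e.g. the futex code) add their own bounds and alignment checks.

#include <linux/mm.h>
#include <linux/sched.h>
#include <linux/uaccess.h>
#include <linux/errno.h>

/*
 * Illustrative only: read a u32 from user space from a context that must
 * not fault inside the access itself.
 */
static int read_user_u32(u32 __user *uaddr, u32 *dest)
{
        struct mm_struct *mm = current->mm;
        int ret;

        pagefault_disable();
        ret = __copy_from_user_inatomic(dest, uaddr, sizeof(*dest));
        pagefault_enable();
        if (!ret)
                return 0;               /* fast path: page was present */

        /* Slow path: fault the page in by hand, then ask the caller to retry. */
        down_read(&mm->mmap_sem);
        ret = fixup_user_fault(current, mm, (unsigned long)uaddr,
                               0 /* read fault; FAULT_FLAG_WRITE for stores */);
        up_read(&mm->mmap_sem);

        return ret ? ret : -EAGAIN;     /* caller loops on -EAGAIN */
}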
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index a650db29606f..469bbf505f85 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -46,19 +46,84 @@
46static void generic_online_page(struct page *page); 46static void generic_online_page(struct page *page);
47 47
48static online_page_callback_t online_page_callback = generic_online_page; 48static online_page_callback_t online_page_callback = generic_online_page;
49static DEFINE_MUTEX(online_page_callback_lock);
49 50
50DEFINE_MUTEX(mem_hotplug_mutex); 51/* The same as the cpu_hotplug lock, but for memory hotplug. */
52static struct {
53 struct task_struct *active_writer;
54 struct mutex lock; /* Synchronizes accesses to refcount, */
55 /*
56 * Also blocks the new readers during
57 * an ongoing mem hotplug operation.
58 */
59 int refcount;
60
61#ifdef CONFIG_DEBUG_LOCK_ALLOC
62 struct lockdep_map dep_map;
63#endif
64} mem_hotplug = {
65 .active_writer = NULL,
66 .lock = __MUTEX_INITIALIZER(mem_hotplug.lock),
67 .refcount = 0,
68#ifdef CONFIG_DEBUG_LOCK_ALLOC
69 .dep_map = {.name = "mem_hotplug.lock" },
70#endif
71};
72
73/* Lockdep annotations for get/put_online_mems() and mem_hotplug_begin/end() */
74#define memhp_lock_acquire_read() lock_map_acquire_read(&mem_hotplug.dep_map)
75#define memhp_lock_acquire() lock_map_acquire(&mem_hotplug.dep_map)
76#define memhp_lock_release() lock_map_release(&mem_hotplug.dep_map)
77
78void get_online_mems(void)
79{
80 might_sleep();
81 if (mem_hotplug.active_writer == current)
82 return;
83 memhp_lock_acquire_read();
84 mutex_lock(&mem_hotplug.lock);
85 mem_hotplug.refcount++;
86 mutex_unlock(&mem_hotplug.lock);
87
88}
51 89
52void lock_memory_hotplug(void) 90void put_online_mems(void)
53{ 91{
54 mutex_lock(&mem_hotplug_mutex); 92 if (mem_hotplug.active_writer == current)
93 return;
94 mutex_lock(&mem_hotplug.lock);
95
96 if (WARN_ON(!mem_hotplug.refcount))
97 mem_hotplug.refcount++; /* try to fix things up */
98
99 if (!--mem_hotplug.refcount && unlikely(mem_hotplug.active_writer))
100 wake_up_process(mem_hotplug.active_writer);
101 mutex_unlock(&mem_hotplug.lock);
102 memhp_lock_release();
103
55} 104}
56 105
57void unlock_memory_hotplug(void) 106static void mem_hotplug_begin(void)
58{ 107{
59 mutex_unlock(&mem_hotplug_mutex); 108 mem_hotplug.active_writer = current;
109
110 memhp_lock_acquire();
111 for (;;) {
112 mutex_lock(&mem_hotplug.lock);
113 if (likely(!mem_hotplug.refcount))
114 break;
115 __set_current_state(TASK_UNINTERRUPTIBLE);
116 mutex_unlock(&mem_hotplug.lock);
117 schedule();
118 }
60} 119}
61 120
121static void mem_hotplug_done(void)
122{
123 mem_hotplug.active_writer = NULL;
124 mutex_unlock(&mem_hotplug.lock);
125 memhp_lock_release();
126}
62 127
63/* add this memory to iomem resource */ 128/* add this memory to iomem resource */
64static struct resource *register_memory_resource(u64 start, u64 size) 129static struct resource *register_memory_resource(u64 start, u64 size)
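The mem_hotplug structure above copies the cpu_hotplug scheme: readers bump a refcount under a short-lived mutex, while the single writer records itself as active_writer and sleeps until the refcount drains, which also lets the writer re-enter read-side paths without deadlocking. Below is a self-contained pthread model of the same idea; the condition variable, field names and main() exercise are mine, not the kernel's (the kernel open-codes the wait with schedule()/wake_up_process()).

#include <pthread.h>
#include <stdio.h>

static struct {
        pthread_t active_writer;        /* meaningful only while writer_active */
        int writer_active;
        int refcount;                   /* number of readers inside the section */
        pthread_mutex_t lock;
        pthread_cond_t drained;         /* signalled when refcount drops to 0 */
} mem_hotplug = {
        .lock    = PTHREAD_MUTEX_INITIALIZER,
        .drained = PTHREAD_COND_INITIALIZER,
};

/* Unlocked check, as in the kernel: only the thread that stored itself matches. */
static int i_am_the_writer(void)
{
        return mem_hotplug.writer_active &&
               pthread_equal(mem_hotplug.active_writer, pthread_self());
}

static void get_online_mems(void)
{
        if (i_am_the_writer())          /* writer may re-enter read paths */
                return;
        pthread_mutex_lock(&mem_hotplug.lock);
        mem_hotplug.refcount++;
        pthread_mutex_unlock(&mem_hotplug.lock);
}

static void put_online_mems(void)
{
        if (i_am_the_writer())
                return;
        pthread_mutex_lock(&mem_hotplug.lock);
        if (--mem_hotplug.refcount == 0 && mem_hotplug.writer_active)
                pthread_cond_signal(&mem_hotplug.drained);
        pthread_mutex_unlock(&mem_hotplug.lock);
}

static void mem_hotplug_begin(void)
{
        pthread_mutex_lock(&mem_hotplug.lock);
        mem_hotplug.active_writer = pthread_self();
        mem_hotplug.writer_active = 1;
        while (mem_hotplug.refcount)    /* wait for current readers to drain */
                pthread_cond_wait(&mem_hotplug.drained, &mem_hotplug.lock);
        /* Mutex stays held: new readers now block until mem_hotplug_done(). */
}

static void mem_hotplug_done(void)
{
        mem_hotplug.writer_active = 0;
        pthread_mutex_unlock(&mem_hotplug.lock);
}

int main(void)
{
        get_online_mems();              /* plain reader section */
        put_online_mems();

        mem_hotplug_begin();            /* exclusive section */
        get_online_mems();              /* nested read side: no self-deadlock */
        put_online_mems();
        mem_hotplug_done();

        puts("ok");
        return 0;
}

As in the kernel version, readers that arrive while the writer is still waiting can slip in and delay it; exclusion only becomes absolute once the refcount reaches zero and the writer keeps the mutex for the duration of the operation.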
@@ -727,14 +792,16 @@ int set_online_page_callback(online_page_callback_t callback)
727{ 792{
728 int rc = -EINVAL; 793 int rc = -EINVAL;
729 794
730 lock_memory_hotplug(); 795 get_online_mems();
796 mutex_lock(&online_page_callback_lock);
731 797
732 if (online_page_callback == generic_online_page) { 798 if (online_page_callback == generic_online_page) {
733 online_page_callback = callback; 799 online_page_callback = callback;
734 rc = 0; 800 rc = 0;
735 } 801 }
736 802
737 unlock_memory_hotplug(); 803 mutex_unlock(&online_page_callback_lock);
804 put_online_mems();
738 805
739 return rc; 806 return rc;
740} 807}
@@ -744,14 +811,16 @@ int restore_online_page_callback(online_page_callback_t callback)
744{ 811{
745 int rc = -EINVAL; 812 int rc = -EINVAL;
746 813
747 lock_memory_hotplug(); 814 get_online_mems();
815 mutex_lock(&online_page_callback_lock);
748 816
749 if (online_page_callback == callback) { 817 if (online_page_callback == callback) {
750 online_page_callback = generic_online_page; 818 online_page_callback = generic_online_page;
751 rc = 0; 819 rc = 0;
752 } 820 }
753 821
754 unlock_memory_hotplug(); 822 mutex_unlock(&online_page_callback_lock);
823 put_online_mems();
755 824
756 return rc; 825 return rc;
757} 826}
@@ -899,7 +968,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ
899 int ret; 968 int ret;
900 struct memory_notify arg; 969 struct memory_notify arg;
901 970
902 lock_memory_hotplug(); 971 mem_hotplug_begin();
903 /* 972 /*
904 * This doesn't need a lock to do pfn_to_page(). 973 * This doesn't need a lock to do pfn_to_page().
905 * The section can't be removed here because of the 974 * The section can't be removed here because of the
@@ -907,23 +976,18 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ
907 */ 976 */
908 zone = page_zone(pfn_to_page(pfn)); 977 zone = page_zone(pfn_to_page(pfn));
909 978
979 ret = -EINVAL;
910 if ((zone_idx(zone) > ZONE_NORMAL || online_type == ONLINE_MOVABLE) && 980 if ((zone_idx(zone) > ZONE_NORMAL || online_type == ONLINE_MOVABLE) &&
911 !can_online_high_movable(zone)) { 981 !can_online_high_movable(zone))
912 unlock_memory_hotplug(); 982 goto out;
913 return -EINVAL;
914 }
915 983
916 if (online_type == ONLINE_KERNEL && zone_idx(zone) == ZONE_MOVABLE) { 984 if (online_type == ONLINE_KERNEL && zone_idx(zone) == ZONE_MOVABLE) {
917 if (move_pfn_range_left(zone - 1, zone, pfn, pfn + nr_pages)) { 985 if (move_pfn_range_left(zone - 1, zone, pfn, pfn + nr_pages))
918 unlock_memory_hotplug(); 986 goto out;
919 return -EINVAL;
920 }
921 } 987 }
922 if (online_type == ONLINE_MOVABLE && zone_idx(zone) == ZONE_MOVABLE - 1) { 988 if (online_type == ONLINE_MOVABLE && zone_idx(zone) == ZONE_MOVABLE - 1) {
923 if (move_pfn_range_right(zone, zone + 1, pfn, pfn + nr_pages)) { 989 if (move_pfn_range_right(zone, zone + 1, pfn, pfn + nr_pages))
924 unlock_memory_hotplug(); 990 goto out;
925 return -EINVAL;
926 }
927 } 991 }
928 992
929 /* Previous code may changed the zone of the pfn range */ 993 /* Previous code may changed the zone of the pfn range */
@@ -939,8 +1003,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ
939 ret = notifier_to_errno(ret); 1003 ret = notifier_to_errno(ret);
940 if (ret) { 1004 if (ret) {
941 memory_notify(MEM_CANCEL_ONLINE, &arg); 1005 memory_notify(MEM_CANCEL_ONLINE, &arg);
942 unlock_memory_hotplug(); 1006 goto out;
943 return ret;
944 } 1007 }
945 /* 1008 /*
946 * If this zone is not populated, then it is not in zonelist. 1009 * If this zone is not populated, then it is not in zonelist.
@@ -964,8 +1027,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ
964 (((unsigned long long) pfn + nr_pages) 1027 (((unsigned long long) pfn + nr_pages)
965 << PAGE_SHIFT) - 1); 1028 << PAGE_SHIFT) - 1);
966 memory_notify(MEM_CANCEL_ONLINE, &arg); 1029 memory_notify(MEM_CANCEL_ONLINE, &arg);
967 unlock_memory_hotplug(); 1030 goto out;
968 return ret;
969 } 1031 }
970 1032
971 zone->present_pages += onlined_pages; 1033 zone->present_pages += onlined_pages;
@@ -995,9 +1057,9 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ
995 1057
996 if (onlined_pages) 1058 if (onlined_pages)
997 memory_notify(MEM_ONLINE, &arg); 1059 memory_notify(MEM_ONLINE, &arg);
998 unlock_memory_hotplug(); 1060out:
999 1061 mem_hotplug_done();
1000 return 0; 1062 return ret;
1001} 1063}
1002#endif /* CONFIG_MEMORY_HOTPLUG_SPARSE */ 1064#endif /* CONFIG_MEMORY_HOTPLUG_SPARSE */
1003 1065
@@ -1007,7 +1069,7 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start)
1007 struct pglist_data *pgdat; 1069 struct pglist_data *pgdat;
1008 unsigned long zones_size[MAX_NR_ZONES] = {0}; 1070 unsigned long zones_size[MAX_NR_ZONES] = {0};
1009 unsigned long zholes_size[MAX_NR_ZONES] = {0}; 1071 unsigned long zholes_size[MAX_NR_ZONES] = {0};
1010 unsigned long start_pfn = start >> PAGE_SHIFT; 1072 unsigned long start_pfn = PFN_DOWN(start);
1011 1073
1012 pgdat = NODE_DATA(nid); 1074 pgdat = NODE_DATA(nid);
1013 if (!pgdat) { 1075 if (!pgdat) {
@@ -1055,7 +1117,7 @@ int try_online_node(int nid)
1055 if (node_online(nid)) 1117 if (node_online(nid))
1056 return 0; 1118 return 0;
1057 1119
1058 lock_memory_hotplug(); 1120 mem_hotplug_begin();
1059 pgdat = hotadd_new_pgdat(nid, 0); 1121 pgdat = hotadd_new_pgdat(nid, 0);
1060 if (!pgdat) { 1122 if (!pgdat) {
1061 pr_err("Cannot online node %d due to NULL pgdat\n", nid); 1123 pr_err("Cannot online node %d due to NULL pgdat\n", nid);
@@ -1073,13 +1135,13 @@ int try_online_node(int nid)
1073 } 1135 }
1074 1136
1075out: 1137out:
1076 unlock_memory_hotplug(); 1138 mem_hotplug_done();
1077 return ret; 1139 return ret;
1078} 1140}
1079 1141
1080static int check_hotplug_memory_range(u64 start, u64 size) 1142static int check_hotplug_memory_range(u64 start, u64 size)
1081{ 1143{
1082 u64 start_pfn = start >> PAGE_SHIFT; 1144 u64 start_pfn = PFN_DOWN(start);
1083 u64 nr_pages = size >> PAGE_SHIFT; 1145 u64 nr_pages = size >> PAGE_SHIFT;
1084 1146
1085 /* Memory range must be aligned with section */ 1147 /* Memory range must be aligned with section */
@@ -1117,7 +1179,7 @@ int __ref add_memory(int nid, u64 start, u64 size)
1117 new_pgdat = !p; 1179 new_pgdat = !p;
1118 } 1180 }
1119 1181
1120 lock_memory_hotplug(); 1182 mem_hotplug_begin();
1121 1183
1122 new_node = !node_online(nid); 1184 new_node = !node_online(nid);
1123 if (new_node) { 1185 if (new_node) {
@@ -1158,7 +1220,7 @@ error:
1158 release_memory_resource(res); 1220 release_memory_resource(res);
1159 1221
1160out: 1222out:
1161 unlock_memory_hotplug(); 1223 mem_hotplug_done();
1162 return ret; 1224 return ret;
1163} 1225}
1164EXPORT_SYMBOL_GPL(add_memory); 1226EXPORT_SYMBOL_GPL(add_memory);
@@ -1332,7 +1394,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
1332 * alloc_migrate_target should be improooooved!! 1394 * alloc_migrate_target should be improooooved!!
1333 * migrate_pages returns # of failed pages. 1395 * migrate_pages returns # of failed pages.
1334 */ 1396 */
1335 ret = migrate_pages(&source, alloc_migrate_target, 0, 1397 ret = migrate_pages(&source, alloc_migrate_target, NULL, 0,
1336 MIGRATE_SYNC, MR_MEMORY_HOTPLUG); 1398 MIGRATE_SYNC, MR_MEMORY_HOTPLUG);
1337 if (ret) 1399 if (ret)
1338 putback_movable_pages(&source); 1400 putback_movable_pages(&source);
@@ -1565,7 +1627,7 @@ static int __ref __offline_pages(unsigned long start_pfn,
1565 if (!test_pages_in_a_zone(start_pfn, end_pfn)) 1627 if (!test_pages_in_a_zone(start_pfn, end_pfn))
1566 return -EINVAL; 1628 return -EINVAL;
1567 1629
1568 lock_memory_hotplug(); 1630 mem_hotplug_begin();
1569 1631
1570 zone = page_zone(pfn_to_page(start_pfn)); 1632 zone = page_zone(pfn_to_page(start_pfn));
1571 node = zone_to_nid(zone); 1633 node = zone_to_nid(zone);
@@ -1672,7 +1734,7 @@ repeat:
1672 writeback_set_ratelimit(); 1734 writeback_set_ratelimit();
1673 1735
1674 memory_notify(MEM_OFFLINE, &arg); 1736 memory_notify(MEM_OFFLINE, &arg);
1675 unlock_memory_hotplug(); 1737 mem_hotplug_done();
1676 return 0; 1738 return 0;
1677 1739
1678failed_removal: 1740failed_removal:
@@ -1684,7 +1746,7 @@ failed_removal:
1684 undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE); 1746 undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE);
1685 1747
1686out: 1748out:
1687 unlock_memory_hotplug(); 1749 mem_hotplug_done();
1688 return ret; 1750 return ret;
1689} 1751}
1690 1752
@@ -1888,7 +1950,7 @@ void __ref remove_memory(int nid, u64 start, u64 size)
1888 1950
1889 BUG_ON(check_hotplug_memory_range(start, size)); 1951 BUG_ON(check_hotplug_memory_range(start, size));
1890 1952
1891 lock_memory_hotplug(); 1953 mem_hotplug_begin();
1892 1954
1893 /* 1955 /*
1894 * All memory blocks must be offlined before removing memory. Check 1956 * All memory blocks must be offlined before removing memory. Check
@@ -1897,10 +1959,8 @@ void __ref remove_memory(int nid, u64 start, u64 size)
1897 */ 1959 */
1898 ret = walk_memory_range(PFN_DOWN(start), PFN_UP(start + size - 1), NULL, 1960 ret = walk_memory_range(PFN_DOWN(start), PFN_UP(start + size - 1), NULL,
1899 check_memblock_offlined_cb); 1961 check_memblock_offlined_cb);
1900 if (ret) { 1962 if (ret)
1901 unlock_memory_hotplug();
1902 BUG(); 1963 BUG();
1903 }
1904 1964
1905 /* remove memmap entry */ 1965 /* remove memmap entry */
1906 firmware_map_remove(start, start + size, "System RAM"); 1966 firmware_map_remove(start, start + size, "System RAM");
@@ -1909,7 +1969,7 @@ void __ref remove_memory(int nid, u64 start, u64 size)
1909 1969
1910 try_offline_node(nid); 1970 try_offline_node(nid);
1911 1971
1912 unlock_memory_hotplug(); 1972 mem_hotplug_done();
1913} 1973}
1914EXPORT_SYMBOL_GPL(remove_memory); 1974EXPORT_SYMBOL_GPL(remove_memory);
1915#endif /* CONFIG_MEMORY_HOTREMOVE */ 1975#endif /* CONFIG_MEMORY_HOTREMOVE */
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 78e1472933ea..16bc9fa42998 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1028,7 +1028,7 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest,
1028 flags | MPOL_MF_DISCONTIG_OK, &pagelist); 1028 flags | MPOL_MF_DISCONTIG_OK, &pagelist);
1029 1029
1030 if (!list_empty(&pagelist)) { 1030 if (!list_empty(&pagelist)) {
1031 err = migrate_pages(&pagelist, new_node_page, dest, 1031 err = migrate_pages(&pagelist, new_node_page, NULL, dest,
1032 MIGRATE_SYNC, MR_SYSCALL); 1032 MIGRATE_SYNC, MR_SYSCALL);
1033 if (err) 1033 if (err)
1034 putback_movable_pages(&pagelist); 1034 putback_movable_pages(&pagelist);
@@ -1277,7 +1277,7 @@ static long do_mbind(unsigned long start, unsigned long len,
1277 if (!list_empty(&pagelist)) { 1277 if (!list_empty(&pagelist)) {
1278 WARN_ON_ONCE(flags & MPOL_MF_LAZY); 1278 WARN_ON_ONCE(flags & MPOL_MF_LAZY);
1279 nr_failed = migrate_pages(&pagelist, new_vma_page, 1279 nr_failed = migrate_pages(&pagelist, new_vma_page,
1280 (unsigned long)vma, 1280 NULL, (unsigned long)vma,
1281 MIGRATE_SYNC, MR_MEMPOLICY_MBIND); 1281 MIGRATE_SYNC, MR_MEMPOLICY_MBIND);
1282 if (nr_failed) 1282 if (nr_failed)
1283 putback_movable_pages(&pagelist); 1283 putback_movable_pages(&pagelist);
@@ -1362,7 +1362,7 @@ static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
1362} 1362}
1363 1363
1364SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len, 1364SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len,
1365 unsigned long, mode, unsigned long __user *, nmask, 1365 unsigned long, mode, const unsigned long __user *, nmask,
1366 unsigned long, maxnode, unsigned, flags) 1366 unsigned long, maxnode, unsigned, flags)
1367{ 1367{
1368 nodemask_t nodes; 1368 nodemask_t nodes;
@@ -1383,7 +1383,7 @@ SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len,
1383} 1383}
1384 1384
1385/* Set the process memory policy */ 1385/* Set the process memory policy */
1386SYSCALL_DEFINE3(set_mempolicy, int, mode, unsigned long __user *, nmask, 1386SYSCALL_DEFINE3(set_mempolicy, int, mode, const unsigned long __user *, nmask,
1387 unsigned long, maxnode) 1387 unsigned long, maxnode)
1388{ 1388{
1389 int err; 1389 int err;
@@ -1606,9 +1606,9 @@ COMPAT_SYSCALL_DEFINE6(mbind, compat_ulong_t, start, compat_ulong_t, len,
1606 1606
1607/* 1607/*
1608 * get_vma_policy(@task, @vma, @addr) 1608 * get_vma_policy(@task, @vma, @addr)
1609 * @task - task for fallback if vma policy == default 1609 * @task: task for fallback if vma policy == default
1610 * @vma - virtual memory area whose policy is sought 1610 * @vma: virtual memory area whose policy is sought
1611 * @addr - address in @vma for shared policy lookup 1611 * @addr: address in @vma for shared policy lookup
1612 * 1612 *
1613 * Returns effective policy for a VMA at specified address. 1613 * Returns effective policy for a VMA at specified address.
1614 * Falls back to @task or system default policy, as necessary. 1614 * Falls back to @task or system default policy, as necessary.
@@ -1854,11 +1854,11 @@ int node_random(const nodemask_t *maskp)
1854#ifdef CONFIG_HUGETLBFS 1854#ifdef CONFIG_HUGETLBFS
1855/* 1855/*
1856 * huge_zonelist(@vma, @addr, @gfp_flags, @mpol) 1856 * huge_zonelist(@vma, @addr, @gfp_flags, @mpol)
1857 * @vma = virtual memory area whose policy is sought 1857 * @vma: virtual memory area whose policy is sought
1858 * @addr = address in @vma for shared policy lookup and interleave policy 1858 * @addr: address in @vma for shared policy lookup and interleave policy
1859 * @gfp_flags = for requested zone 1859 * @gfp_flags: for requested zone
1860 * @mpol = pointer to mempolicy pointer for reference counted mempolicy 1860 * @mpol: pointer to mempolicy pointer for reference counted mempolicy
1861 * @nodemask = pointer to nodemask pointer for MPOL_BIND nodemask 1861 * @nodemask: pointer to nodemask pointer for MPOL_BIND nodemask
1862 * 1862 *
1863 * Returns a zonelist suitable for a huge page allocation and a pointer 1863 * Returns a zonelist suitable for a huge page allocation and a pointer
1864 * to the struct mempolicy for conditional unref after allocation. 1864 * to the struct mempolicy for conditional unref after allocation.
@@ -2270,9 +2270,9 @@ static void sp_free(struct sp_node *n)
2270/** 2270/**
2271 * mpol_misplaced - check whether current page node is valid in policy 2271 * mpol_misplaced - check whether current page node is valid in policy
2272 * 2272 *
2273 * @page - page to be checked 2273 * @page: page to be checked
2274 * @vma - vm area where page mapped 2274 * @vma: vm area where page mapped
2275 * @addr - virtual address where page mapped 2275 * @addr: virtual address where page mapped
2276 * 2276 *
2277 * Lookup current policy node id for vma,addr and "compare to" page's 2277 * Lookup current policy node id for vma,addr and "compare to" page's
2278 * node id. 2278 * node id.
diff --git a/mm/mempool.c b/mm/mempool.c
index 905434f18c97..455d468c3a5d 100644
--- a/mm/mempool.c
+++ b/mm/mempool.c
@@ -192,6 +192,7 @@ EXPORT_SYMBOL(mempool_resize);
192 * returns NULL. Note that due to preallocation, this function 192 * returns NULL. Note that due to preallocation, this function
193 * *never* fails when called from process contexts. (it might 193 * *never* fails when called from process contexts. (it might
194 * fail if called from an IRQ context.) 194 * fail if called from an IRQ context.)
195 * Note: using __GFP_ZERO is not supported.
195 */ 196 */
196void * mempool_alloc(mempool_t *pool, gfp_t gfp_mask) 197void * mempool_alloc(mempool_t *pool, gfp_t gfp_mask)
197{ 198{
@@ -200,6 +201,7 @@ void * mempool_alloc(mempool_t *pool, gfp_t gfp_mask)
200 wait_queue_t wait; 201 wait_queue_t wait;
201 gfp_t gfp_temp; 202 gfp_t gfp_temp;
202 203
204 VM_WARN_ON_ONCE(gfp_mask & __GFP_ZERO);
203 might_sleep_if(gfp_mask & __GFP_WAIT); 205 might_sleep_if(gfp_mask & __GFP_WAIT);
204 206
205 gfp_mask |= __GFP_NOMEMALLOC; /* don't allocate emergency reserves */ 207 gfp_mask |= __GFP_NOMEMALLOC; /* don't allocate emergency reserves */
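The mempool hunk documents a real pitfall: elements returned by mempool_free() are recycled verbatim by the next mempool_alloc(), so __GFP_ZERO cannot be honoured and now triggers a warning. Below is a hedged usage sketch of the safe pattern; the io_ctx structure, cache and pool names are invented, and only the mempool/slab calls themselves are standard API.

#include <linux/mempool.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/errno.h>

/* Hypothetical element type; zero it explicitly, never via __GFP_ZERO. */
struct io_ctx {
        int state;
        char payload[120];
};

static struct kmem_cache *io_ctx_cache;
static mempool_t *io_ctx_pool;

static int io_ctx_pool_init(void)
{
        io_ctx_cache = kmem_cache_create("io_ctx", sizeof(struct io_ctx),
                                         0, 0, NULL);
        if (!io_ctx_cache)
                return -ENOMEM;
        /* Keep at least four elements in reserve for forward progress. */
        io_ctx_pool = mempool_create_slab_pool(4, io_ctx_cache);
        if (!io_ctx_pool) {
                kmem_cache_destroy(io_ctx_cache);
                return -ENOMEM;
        }
        return 0;
}

static struct io_ctx *io_ctx_get(void)
{
        /* Never fails in process context, per the comment above ... */
        struct io_ctx *ctx = mempool_alloc(io_ctx_pool, GFP_KERNEL);

        /* ... but may hand back a recycled element, so clear it here. */
        if (ctx)
                memset(ctx, 0, sizeof(*ctx));
        return ctx;
}

static void io_ctx_put(struct io_ctx *ctx)
{
        mempool_free(ctx, io_ctx_pool);
}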
diff --git a/mm/migrate.c b/mm/migrate.c
index bed48809e5d0..63f0cd559999 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -938,8 +938,9 @@ out:
938 * Obtain the lock on page, remove all ptes and migrate the page 938 * Obtain the lock on page, remove all ptes and migrate the page
939 * to the newly allocated page in newpage. 939 * to the newly allocated page in newpage.
940 */ 940 */
941static int unmap_and_move(new_page_t get_new_page, unsigned long private, 941static int unmap_and_move(new_page_t get_new_page, free_page_t put_new_page,
942 struct page *page, int force, enum migrate_mode mode) 942 unsigned long private, struct page *page, int force,
943 enum migrate_mode mode)
943{ 944{
944 int rc = 0; 945 int rc = 0;
945 int *result = NULL; 946 int *result = NULL;
@@ -983,11 +984,17 @@ out:
983 page_is_file_cache(page)); 984 page_is_file_cache(page));
984 putback_lru_page(page); 985 putback_lru_page(page);
985 } 986 }
987
986 /* 988 /*
987 * Move the new page to the LRU. If migration was not successful 989 * If migration was not successful and there's a freeing callback, use
988 * then this will free the page. 990 * it. Otherwise, putback_lru_page() will drop the reference grabbed
991 * during isolation.
989 */ 992 */
990 putback_lru_page(newpage); 993 if (rc != MIGRATEPAGE_SUCCESS && put_new_page)
994 put_new_page(newpage, private);
995 else
996 putback_lru_page(newpage);
997
991 if (result) { 998 if (result) {
992 if (rc) 999 if (rc)
993 *result = rc; 1000 *result = rc;
@@ -1016,8 +1023,9 @@ out:
1016 * will wait in the page fault for migration to complete. 1023 * will wait in the page fault for migration to complete.
1017 */ 1024 */
1018static int unmap_and_move_huge_page(new_page_t get_new_page, 1025static int unmap_and_move_huge_page(new_page_t get_new_page,
1019 unsigned long private, struct page *hpage, 1026 free_page_t put_new_page, unsigned long private,
1020 int force, enum migrate_mode mode) 1027 struct page *hpage, int force,
1028 enum migrate_mode mode)
1021{ 1029{
1022 int rc = 0; 1030 int rc = 0;
1023 int *result = NULL; 1031 int *result = NULL;
@@ -1031,7 +1039,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
1031 * tables or check whether the hugepage is pmd-based or not before 1039 * tables or check whether the hugepage is pmd-based or not before
1032 * kicking migration. 1040 * kicking migration.
1033 */ 1041 */
1034 if (!hugepage_migration_support(page_hstate(hpage))) { 1042 if (!hugepage_migration_supported(page_hstate(hpage))) {
1035 putback_active_hugepage(hpage); 1043 putback_active_hugepage(hpage);
1036 return -ENOSYS; 1044 return -ENOSYS;
1037 } 1045 }
@@ -1056,20 +1064,30 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
1056 if (!page_mapped(hpage)) 1064 if (!page_mapped(hpage))
1057 rc = move_to_new_page(new_hpage, hpage, 1, mode); 1065 rc = move_to_new_page(new_hpage, hpage, 1, mode);
1058 1066
1059 if (rc) 1067 if (rc != MIGRATEPAGE_SUCCESS)
1060 remove_migration_ptes(hpage, hpage); 1068 remove_migration_ptes(hpage, hpage);
1061 1069
1062 if (anon_vma) 1070 if (anon_vma)
1063 put_anon_vma(anon_vma); 1071 put_anon_vma(anon_vma);
1064 1072
1065 if (!rc) 1073 if (rc == MIGRATEPAGE_SUCCESS)
1066 hugetlb_cgroup_migrate(hpage, new_hpage); 1074 hugetlb_cgroup_migrate(hpage, new_hpage);
1067 1075
1068 unlock_page(hpage); 1076 unlock_page(hpage);
1069out: 1077out:
1070 if (rc != -EAGAIN) 1078 if (rc != -EAGAIN)
1071 putback_active_hugepage(hpage); 1079 putback_active_hugepage(hpage);
1072 put_page(new_hpage); 1080
1081 /*
1082 * If migration was not successful and there's a freeing callback, use
1083 * it. Otherwise, put_page() will drop the reference grabbed during
1084 * isolation.
1085 */
1086 if (rc != MIGRATEPAGE_SUCCESS && put_new_page)
1087 put_new_page(new_hpage, private);
1088 else
1089 put_page(new_hpage);
1090
1073 if (result) { 1091 if (result) {
1074 if (rc) 1092 if (rc)
1075 *result = rc; 1093 *result = rc;
@@ -1086,6 +1104,8 @@ out:
1086 * @from: The list of pages to be migrated. 1104 * @from: The list of pages to be migrated.
1087 * @get_new_page: The function used to allocate free pages to be used 1105 * @get_new_page: The function used to allocate free pages to be used
1088 * as the target of the page migration. 1106 * as the target of the page migration.
1107 * @put_new_page: The function used to free target pages if migration
1108 * fails, or NULL if no special handling is necessary.
1089 * @private: Private data to be passed on to get_new_page() 1109 * @private: Private data to be passed on to get_new_page()
1090 * @mode: The migration mode that specifies the constraints for 1110 * @mode: The migration mode that specifies the constraints for
1091 * page migration, if any. 1111 * page migration, if any.
@@ -1099,7 +1119,8 @@ out:
1099 * Returns the number of pages that were not migrated, or an error code. 1119 * Returns the number of pages that were not migrated, or an error code.
1100 */ 1120 */
1101int migrate_pages(struct list_head *from, new_page_t get_new_page, 1121int migrate_pages(struct list_head *from, new_page_t get_new_page,
1102 unsigned long private, enum migrate_mode mode, int reason) 1122 free_page_t put_new_page, unsigned long private,
1123 enum migrate_mode mode, int reason)
1103{ 1124{
1104 int retry = 1; 1125 int retry = 1;
1105 int nr_failed = 0; 1126 int nr_failed = 0;
@@ -1121,10 +1142,11 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page,
1121 1142
1122 if (PageHuge(page)) 1143 if (PageHuge(page))
1123 rc = unmap_and_move_huge_page(get_new_page, 1144 rc = unmap_and_move_huge_page(get_new_page,
1124 private, page, pass > 2, mode); 1145 put_new_page, private, page,
1146 pass > 2, mode);
1125 else 1147 else
1126 rc = unmap_and_move(get_new_page, private, 1148 rc = unmap_and_move(get_new_page, put_new_page,
1127 page, pass > 2, mode); 1149 private, page, pass > 2, mode);
1128 1150
1129 switch(rc) { 1151 switch(rc) {
1130 case -ENOMEM: 1152 case -ENOMEM:
@@ -1273,7 +1295,7 @@ set_status:
1273 1295
1274 err = 0; 1296 err = 0;
1275 if (!list_empty(&pagelist)) { 1297 if (!list_empty(&pagelist)) {
1276 err = migrate_pages(&pagelist, new_page_node, 1298 err = migrate_pages(&pagelist, new_page_node, NULL,
1277 (unsigned long)pm, MIGRATE_SYNC, MR_SYSCALL); 1299 (unsigned long)pm, MIGRATE_SYNC, MR_SYSCALL);
1278 if (err) 1300 if (err)
1279 putback_movable_pages(&pagelist); 1301 putback_movable_pages(&pagelist);
@@ -1729,7 +1751,8 @@ int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma,
1729 1751
1730 list_add(&page->lru, &migratepages); 1752 list_add(&page->lru, &migratepages);
1731 nr_remaining = migrate_pages(&migratepages, alloc_misplaced_dst_page, 1753 nr_remaining = migrate_pages(&migratepages, alloc_misplaced_dst_page,
1732 node, MIGRATE_ASYNC, MR_NUMA_MISPLACED); 1754 NULL, node, MIGRATE_ASYNC,
1755 MR_NUMA_MISPLACED);
1733 if (nr_remaining) { 1756 if (nr_remaining) {
1734 if (!list_empty(&migratepages)) { 1757 if (!list_empty(&migratepages)) {
1735 list_del(&page->lru); 1758 list_del(&page->lru);
@@ -1852,7 +1875,7 @@ fail_putback:
1852 * guarantee the copy is visible before the pagetable update. 1875 * guarantee the copy is visible before the pagetable update.
1853 */ 1876 */
1854 flush_cache_range(vma, mmun_start, mmun_end); 1877 flush_cache_range(vma, mmun_start, mmun_end);
1855 page_add_new_anon_rmap(new_page, vma, mmun_start); 1878 page_add_anon_rmap(new_page, vma, mmun_start);
1856 pmdp_clear_flush(vma, mmun_start, pmd); 1879 pmdp_clear_flush(vma, mmun_start, pmd);
1857 set_pmd_at(mm, mmun_start, pmd, entry); 1880 set_pmd_at(mm, mmun_start, pmd, entry);
1858 flush_tlb_range(vma, mmun_start, mmun_end); 1881 flush_tlb_range(vma, mmun_start, mmun_end);
@@ -1877,6 +1900,10 @@ fail_putback:
1877 spin_unlock(ptl); 1900 spin_unlock(ptl);
1878 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); 1901 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
1879 1902
1903 /* Take an "isolate" reference and put new page on the LRU. */
1904 get_page(new_page);
1905 putback_lru_page(new_page);
1906
1880 unlock_page(new_page); 1907 unlock_page(new_page);
1881 unlock_page(page); 1908 unlock_page(page);
1882 put_page(page); /* Drop the rmap reference */ 1909 put_page(page); /* Drop the rmap reference */
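
The migrate.c changes thread a free_page_t callback (put_new_page) alongside the allocation callback, so a target page that a failed migration never consumed goes back to whoever allocated it rather than through the generic putback path. A rough userspace sketch of that ownership rule follows; process_all, alloc_cb and free_cb are invented names, and the "migration" outcome is simulated.

#include <stdio.h>
#include <stdlib.h>

typedef void *(*alloc_cb_t)(void *ctx);
typedef void (*free_cb_t)(void *item, void *ctx);

/* Try to "migrate" n items.  Each attempt gets a freshly allocated target
 * from alloc_cb; if the attempt fails and the caller supplied free_cb,
 * the unused target is returned through that callback (mirroring
 * put_new_page), otherwise it is released the generic way. */
static int process_all(int n, alloc_cb_t alloc_cb, free_cb_t free_cb, void *ctx)
{
	int failed = 0;

	for (int i = 0; i < n; i++) {
		void *target = alloc_cb(ctx);
		int ok = (i % 2 == 0);          /* pretend every other attempt fails */

		if (!ok) {
			failed++;
			if (free_cb)
				free_cb(target, ctx);   /* caller reclaims its own target */
			else
				free(target);           /* generic release */
		} else {
			free(target);   /* demo only: drop the "used" target so the sketch does not leak */
		}
	}
	return failed;
}

static void *my_alloc(void *ctx) { (void)ctx; return malloc(32); }
static void my_free(void *item, void *ctx) { (void)ctx; free(item); }

int main(void)
{
	printf("failed: %d\n", process_all(4, my_alloc, my_free, NULL));
	printf("failed: %d\n", process_all(4, my_alloc, NULL, NULL));
	return 0;
}
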
diff --git a/mm/mmap.c b/mm/mmap.c
index b1202cf81f4b..8a56d39df4ed 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -640,11 +640,10 @@ static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma,
640{ 640{
641 struct address_space *mapping = NULL; 641 struct address_space *mapping = NULL;
642 642
643 if (vma->vm_file) 643 if (vma->vm_file) {
644 mapping = vma->vm_file->f_mapping; 644 mapping = vma->vm_file->f_mapping;
645
646 if (mapping)
647 mutex_lock(&mapping->i_mmap_mutex); 645 mutex_lock(&mapping->i_mmap_mutex);
646 }
648 647
649 __vma_link(mm, vma, prev, rb_link, rb_parent); 648 __vma_link(mm, vma, prev, rb_link, rb_parent);
650 __vma_link_file(vma); 649 __vma_link_file(vma);
@@ -2965,9 +2964,7 @@ int install_special_mapping(struct mm_struct *mm,
2965 struct vm_area_struct *vma = _install_special_mapping(mm, 2964 struct vm_area_struct *vma = _install_special_mapping(mm,
2966 addr, len, vm_flags, pages); 2965 addr, len, vm_flags, pages);
2967 2966
2968 if (IS_ERR(vma)) 2967 return PTR_ERR_OR_ZERO(vma);
2969 return PTR_ERR(vma);
2970 return 0;
2971} 2968}
2972 2969
2973static DEFINE_MUTEX(mm_all_locks_mutex); 2970static DEFINE_MUTEX(mm_all_locks_mutex);
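
install_special_mapping() collapses the IS_ERR()/PTR_ERR()/return 0 tail into a single PTR_ERR_OR_ZERO(). The idiom rests on error pointers: small negative errno values encoded in the top of the pointer range. The sketch below is a simplified, standalone rendering of that encoding; the lower-case helpers mimic the kernel macros but are local to this example.

#include <errno.h>
#include <stdint.h>
#include <stdio.h>

#define MAX_ERRNO 4095

/* Encode a negative errno in the top 4095 values of the address space and
 * decode it again: a cut-down copy of the ERR_PTR/IS_ERR/PTR_ERR_OR_ZERO
 * idiom, for illustration only. */
static inline void *err_ptr(long error)      { return (void *)(intptr_t)error; }
static inline int   is_err(const void *ptr)  { return (uintptr_t)ptr >= (uintptr_t)-MAX_ERRNO; }
static inline long  ptr_err(const void *ptr) { return (long)(intptr_t)ptr; }
static inline long  ptr_err_or_zero(const void *ptr) { return is_err(ptr) ? ptr_err(ptr) : 0; }

static void *make_mapping(int fail)
{
	static char vma;                 /* stands in for a real object */
	return fail ? err_ptr(-ENOMEM) : &vma;
}

int main(void)
{
	/* The two-branch form ... */
	void *v = make_mapping(1);
	long rc = is_err(v) ? ptr_err(v) : 0;

	/* ... and the single-expression form the hunk switches to. */
	long rc2 = ptr_err_or_zero(make_mapping(1));

	printf("%ld %ld\n", rc, rc2);    /* both print the same negative errno */
	return 0;
}
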
diff --git a/mm/msync.c b/mm/msync.c
index 632df4527c01..a5c673669ca6 100644
--- a/mm/msync.c
+++ b/mm/msync.c
@@ -58,6 +58,7 @@ SYSCALL_DEFINE3(msync, unsigned long, start, size_t, len, int, flags)
58 vma = find_vma(mm, start); 58 vma = find_vma(mm, start);
59 for (;;) { 59 for (;;) {
60 struct file *file; 60 struct file *file;
61 loff_t fstart, fend;
61 62
62 /* Still start < end. */ 63 /* Still start < end. */
63 error = -ENOMEM; 64 error = -ENOMEM;
@@ -77,12 +78,17 @@ SYSCALL_DEFINE3(msync, unsigned long, start, size_t, len, int, flags)
77 goto out_unlock; 78 goto out_unlock;
78 } 79 }
79 file = vma->vm_file; 80 file = vma->vm_file;
81 fstart = start + ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
82 fend = fstart + (min(end, vma->vm_end) - start) - 1;
80 start = vma->vm_end; 83 start = vma->vm_end;
81 if ((flags & MS_SYNC) && file && 84 if ((flags & MS_SYNC) && file &&
82 (vma->vm_flags & VM_SHARED)) { 85 (vma->vm_flags & VM_SHARED)) {
83 get_file(file); 86 get_file(file);
84 up_read(&mm->mmap_sem); 87 up_read(&mm->mmap_sem);
85 error = vfs_fsync(file, 0); 88 if (vma->vm_flags & VM_NONLINEAR)
89 error = vfs_fsync(file, 1);
90 else
91 error = vfs_fsync_range(file, fstart, fend, 1);
86 fput(file); 92 fput(file);
87 if (error || start >= end) 93 if (error || start >= end)
88 goto out; 94 goto out;
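
With MS_SYNC, the msync hunk now calls vfs_fsync_range() on just the byte span covered by the request instead of syncing the whole file (the trailing 1 selects the datasync behaviour), and the range is computed before start is advanced past the VMA. The standalone sketch below mirrors the same fstart/fend arithmetic on a made-up mapping descriptor; struct mapping is illustrative, not the kernel's vm_area_struct.

#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT 12

struct mapping {
	uint64_t vm_end;     /* one past the last mapped address */
	uint64_t vm_pgoff;   /* offset recorded for the mapping, in pages */
};

/* Mirror the fstart/fend computation in the hunk above: fend is the last
 * byte that still falls inside both the request [start, end) and this
 * mapping. */
static void msync_range(const struct mapping *m, uint64_t start, uint64_t end,
			uint64_t *fstart, uint64_t *fend)
{
	uint64_t vend = end < m->vm_end ? end : m->vm_end;   /* min(end, vm_end) */

	*fstart = start + (m->vm_pgoff << PAGE_SHIFT);
	*fend = *fstart + (vend - start) - 1;
}

int main(void)
{
	struct mapping m = { .vm_end = 0x7000, .vm_pgoff = 3 };
	uint64_t fs, fe;

	msync_range(&m, 0x5000, 0x9000, &fs, &fe);
	printf("fsync range: [0x%llx, 0x%llx]\n",
	       (unsigned long long)fs, (unsigned long long)fe);
	return 0;
}
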
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index a4317da60532..533fa60c9ac1 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -156,24 +156,6 @@ static unsigned long writeout_period_time = 0;
156#define VM_COMPLETIONS_PERIOD_LEN (3*HZ) 156#define VM_COMPLETIONS_PERIOD_LEN (3*HZ)
157 157
158/* 158/*
159 * Work out the current dirty-memory clamping and background writeout
160 * thresholds.
161 *
162 * The main aim here is to lower them aggressively if there is a lot of mapped
163 * memory around. To avoid stressing page reclaim with lots of unreclaimable
164 * pages. It is better to clamp down on writers than to start swapping, and
165 * performing lots of scanning.
166 *
167 * We only allow 1/2 of the currently-unmapped memory to be dirtied.
168 *
169 * We don't permit the clamping level to fall below 5% - that is getting rather
170 * excessive.
171 *
172 * We make sure that the background writeout level is below the adjusted
173 * clamping level.
174 */
175
176/*
177 * In a memory zone, there is a certain amount of pages we consider 159 * In a memory zone, there is a certain amount of pages we consider
178 * available for the page cache, which is essentially the number of 160 * available for the page cache, which is essentially the number of
179 * free and reclaimable pages, minus some zone reserves to protect 161 * free and reclaimable pages, minus some zone reserves to protect
@@ -1623,7 +1605,7 @@ void balance_dirty_pages_ratelimited(struct address_space *mapping)
1623 * 1000+ tasks, all of them start dirtying pages at exactly the same 1605 * 1000+ tasks, all of them start dirtying pages at exactly the same
1624 * time, hence all honoured too large initial task->nr_dirtied_pause. 1606 * time, hence all honoured too large initial task->nr_dirtied_pause.
1625 */ 1607 */
1626 p = &__get_cpu_var(bdp_ratelimits); 1608 p = this_cpu_ptr(&bdp_ratelimits);
1627 if (unlikely(current->nr_dirtied >= ratelimit)) 1609 if (unlikely(current->nr_dirtied >= ratelimit))
1628 *p = 0; 1610 *p = 0;
1629 else if (unlikely(*p >= ratelimit_pages)) { 1611 else if (unlikely(*p >= ratelimit_pages)) {
@@ -1635,7 +1617,7 @@ void balance_dirty_pages_ratelimited(struct address_space *mapping)
1635 * short-lived tasks (eg. gcc invocations in a kernel build) escaping 1617 * short-lived tasks (eg. gcc invocations in a kernel build) escaping
1636 * the dirty throttling and livelock other long-run dirtiers. 1618 * the dirty throttling and livelock other long-run dirtiers.
1637 */ 1619 */
1638 p = &__get_cpu_var(dirty_throttle_leaks); 1620 p = this_cpu_ptr(&dirty_throttle_leaks);
1639 if (*p > 0 && current->nr_dirtied < ratelimit) { 1621 if (*p > 0 && current->nr_dirtied < ratelimit) {
1640 unsigned long nr_pages_dirtied; 1622 unsigned long nr_pages_dirtied;
1641 nr_pages_dirtied = min(*p, ratelimit - current->nr_dirtied); 1623 nr_pages_dirtied = min(*p, ratelimit - current->nr_dirtied);
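
Both page-writeback.c hunks are a mechanical switch from &__get_cpu_var(x) to this_cpu_ptr(&x) for the per-CPU ratelimit counters. The surrounding logic lets a task consume previously "leaked" dirty-page credit while it is still under its own ratelimit, so short-lived dirtiers do not each escape throttling. Below is a loose userspace approximation that uses a thread-local counter in place of a per-CPU one; the names and numbers are invented, and the credit bookkeeping is inferred from the visible condition and min() line.

#include <stdio.h>

/* Thread-local stand-in for a per-CPU counter of "leaked" dirty pages. */
static _Thread_local unsigned long dirty_throttle_leaks;

static unsigned long nr_dirtied;            /* pages this task dirtied so far */
static const unsigned long ratelimit = 32;  /* invented threshold */

/* Consume leaked credit only while the task is still under its ratelimit,
 * and never more than the distance to that limit. */
static void account_dirtied(unsigned long pages)
{
	if (dirty_throttle_leaks > 0 && nr_dirtied < ratelimit) {
		unsigned long take = dirty_throttle_leaks;

		if (take > ratelimit - nr_dirtied)
			take = ratelimit - nr_dirtied;
		dirty_throttle_leaks -= take;
		nr_dirtied += take;
	}
	nr_dirtied += pages;
}

int main(void)
{
	dirty_throttle_leaks = 10;
	account_dirtied(4);
	printf("nr_dirtied=%lu leaks=%lu\n", nr_dirtied, dirty_throttle_leaks);
	return 0;
}
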
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 5dba2933c9c0..a59bdb653958 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -261,8 +261,9 @@ static int page_outside_zone_boundaries(struct zone *zone, struct page *page)
261 } while (zone_span_seqretry(zone, seq)); 261 } while (zone_span_seqretry(zone, seq));
262 262
263 if (ret) 263 if (ret)
264 pr_err("page %lu outside zone [ %lu - %lu ]\n", 264 pr_err("page 0x%lx outside node %d zone %s [ 0x%lx - 0x%lx ]\n",
265 pfn, start_pfn, start_pfn + sp); 265 pfn, zone_to_nid(zone), zone->name,
266 start_pfn, start_pfn + sp);
266 267
267 return ret; 268 return ret;
268} 269}
@@ -408,7 +409,8 @@ static int destroy_compound_page(struct page *page, unsigned long order)
408 return bad; 409 return bad;
409} 410}
410 411
411static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags) 412static inline void prep_zero_page(struct page *page, unsigned int order,
413 gfp_t gfp_flags)
412{ 414{
413 int i; 415 int i;
414 416
@@ -452,7 +454,7 @@ static inline void set_page_guard_flag(struct page *page) { }
452static inline void clear_page_guard_flag(struct page *page) { } 454static inline void clear_page_guard_flag(struct page *page) { }
453#endif 455#endif
454 456
455static inline void set_page_order(struct page *page, int order) 457static inline void set_page_order(struct page *page, unsigned int order)
456{ 458{
457 set_page_private(page, order); 459 set_page_private(page, order);
458 __SetPageBuddy(page); 460 __SetPageBuddy(page);
@@ -503,21 +505,31 @@ __find_buddy_index(unsigned long page_idx, unsigned int order)
503 * For recording page's order, we use page_private(page). 505 * For recording page's order, we use page_private(page).
504 */ 506 */
505static inline int page_is_buddy(struct page *page, struct page *buddy, 507static inline int page_is_buddy(struct page *page, struct page *buddy,
506 int order) 508 unsigned int order)
507{ 509{
508 if (!pfn_valid_within(page_to_pfn(buddy))) 510 if (!pfn_valid_within(page_to_pfn(buddy)))
509 return 0; 511 return 0;
510 512
511 if (page_zone_id(page) != page_zone_id(buddy))
512 return 0;
513
514 if (page_is_guard(buddy) && page_order(buddy) == order) { 513 if (page_is_guard(buddy) && page_order(buddy) == order) {
515 VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy); 514 VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy);
515
516 if (page_zone_id(page) != page_zone_id(buddy))
517 return 0;
518
516 return 1; 519 return 1;
517 } 520 }
518 521
519 if (PageBuddy(buddy) && page_order(buddy) == order) { 522 if (PageBuddy(buddy) && page_order(buddy) == order) {
520 VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy); 523 VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy);
524
525 /*
526 * zone check is done late to avoid uselessly
527 * calculating zone/node ids for pages that could
528 * never merge.
529 */
530 if (page_zone_id(page) != page_zone_id(buddy))
531 return 0;
532
521 return 1; 533 return 1;
522 } 534 }
523 return 0; 535 return 0;
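
The reordering in page_is_buddy() defers the page_zone_id() comparison until the candidate already looks like a free block of the right order, so the common non-matching case never pays for the zone/node lookup. A compact sketch of buddy pairing with that late, more expensive check follows; find_buddy_index is the standard XOR pairing and is_same_zone is a stand-in for the zone-id comparison.

#include <stdbool.h>
#include <stdio.h>

struct fake_page {
	bool buddy_flag;        /* set while the block sits in the free lists */
	unsigned int order;     /* order recorded for a free block */
	int zone_id;            /* which zone the page belongs to */
};

/* Classic buddy pairing: flipping bit 'order' of the index yields the buddy. */
static unsigned long find_buddy_index(unsigned long page_idx, unsigned int order)
{
	return page_idx ^ (1UL << order);
}

static bool is_same_zone(const struct fake_page *a, const struct fake_page *b)
{
	return a->zone_id == b->zone_id;   /* pretend this is comparatively costly */
}

/* Cheap tests first; only a candidate that is free and of matching order is
 * worth the zone comparison.  This is the ordering the hunk adopts. */
static bool page_is_buddy(const struct fake_page *page,
			  const struct fake_page *buddy, unsigned int order)
{
	if (!buddy->buddy_flag || buddy->order != order)
		return false;
	return is_same_zone(page, buddy);
}

int main(void)
{
	struct fake_page page  = { .zone_id = 0 };
	struct fake_page buddy = { .buddy_flag = true, .order = 3, .zone_id = 0 };

	printf("buddy index of 8 at order 3: %lu\n", find_buddy_index(8, 3));
	printf("mergeable: %d\n", page_is_buddy(&page, &buddy, 3));
	return 0;
}
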
@@ -549,6 +561,7 @@ static inline int page_is_buddy(struct page *page, struct page *buddy,
549 */ 561 */
550 562
551static inline void __free_one_page(struct page *page, 563static inline void __free_one_page(struct page *page,
564 unsigned long pfn,
552 struct zone *zone, unsigned int order, 565 struct zone *zone, unsigned int order,
553 int migratetype) 566 int migratetype)
554{ 567{
@@ -565,7 +578,7 @@ static inline void __free_one_page(struct page *page,
565 578
566 VM_BUG_ON(migratetype == -1); 579 VM_BUG_ON(migratetype == -1);
567 580
568 page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1); 581 page_idx = pfn & ((1 << MAX_ORDER) - 1);
569 582
570 VM_BUG_ON_PAGE(page_idx & ((1 << order) - 1), page); 583 VM_BUG_ON_PAGE(page_idx & ((1 << order) - 1), page);
571 VM_BUG_ON_PAGE(bad_range(zone, page), page); 584 VM_BUG_ON_PAGE(bad_range(zone, page), page);
@@ -700,7 +713,7 @@ static void free_pcppages_bulk(struct zone *zone, int count,
700 list_del(&page->lru); 713 list_del(&page->lru);
701 mt = get_freepage_migratetype(page); 714 mt = get_freepage_migratetype(page);
702 /* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */ 715 /* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */
703 __free_one_page(page, zone, 0, mt); 716 __free_one_page(page, page_to_pfn(page), zone, 0, mt);
704 trace_mm_page_pcpu_drain(page, 0, mt); 717 trace_mm_page_pcpu_drain(page, 0, mt);
705 if (likely(!is_migrate_isolate_page(page))) { 718 if (likely(!is_migrate_isolate_page(page))) {
706 __mod_zone_page_state(zone, NR_FREE_PAGES, 1); 719 __mod_zone_page_state(zone, NR_FREE_PAGES, 1);
@@ -712,13 +725,15 @@ static void free_pcppages_bulk(struct zone *zone, int count,
712 spin_unlock(&zone->lock); 725 spin_unlock(&zone->lock);
713} 726}
714 727
715static void free_one_page(struct zone *zone, struct page *page, int order, 728static void free_one_page(struct zone *zone,
729 struct page *page, unsigned long pfn,
730 unsigned int order,
716 int migratetype) 731 int migratetype)
717{ 732{
718 spin_lock(&zone->lock); 733 spin_lock(&zone->lock);
719 zone->pages_scanned = 0; 734 zone->pages_scanned = 0;
720 735
721 __free_one_page(page, zone, order, migratetype); 736 __free_one_page(page, pfn, zone, order, migratetype);
722 if (unlikely(!is_migrate_isolate(migratetype))) 737 if (unlikely(!is_migrate_isolate(migratetype)))
723 __mod_zone_freepage_state(zone, 1 << order, migratetype); 738 __mod_zone_freepage_state(zone, 1 << order, migratetype);
724 spin_unlock(&zone->lock); 739 spin_unlock(&zone->lock);
@@ -755,15 +770,16 @@ static void __free_pages_ok(struct page *page, unsigned int order)
755{ 770{
756 unsigned long flags; 771 unsigned long flags;
757 int migratetype; 772 int migratetype;
773 unsigned long pfn = page_to_pfn(page);
758 774
759 if (!free_pages_prepare(page, order)) 775 if (!free_pages_prepare(page, order))
760 return; 776 return;
761 777
778 migratetype = get_pfnblock_migratetype(page, pfn);
762 local_irq_save(flags); 779 local_irq_save(flags);
763 __count_vm_events(PGFREE, 1 << order); 780 __count_vm_events(PGFREE, 1 << order);
764 migratetype = get_pageblock_migratetype(page);
765 set_freepage_migratetype(page, migratetype); 781 set_freepage_migratetype(page, migratetype);
766 free_one_page(page_zone(page), page, order, migratetype); 782 free_one_page(page_zone(page), page, pfn, order, migratetype);
767 local_irq_restore(flags); 783 local_irq_restore(flags);
768} 784}
769 785
@@ -882,7 +898,7 @@ static inline int check_new_page(struct page *page)
882 return 0; 898 return 0;
883} 899}
884 900
885static int prep_new_page(struct page *page, int order, gfp_t gfp_flags) 901static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags)
886{ 902{
887 int i; 903 int i;
888 904
@@ -931,6 +947,7 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
931 rmv_page_order(page); 947 rmv_page_order(page);
932 area->nr_free--; 948 area->nr_free--;
933 expand(zone, page, order, current_order, area, migratetype); 949 expand(zone, page, order, current_order, area, migratetype);
950 set_freepage_migratetype(page, migratetype);
934 return page; 951 return page;
935 } 952 }
936 953
@@ -1057,7 +1074,9 @@ static int try_to_steal_freepages(struct zone *zone, struct page *page,
1057 1074
1058 /* 1075 /*
1059 * When borrowing from MIGRATE_CMA, we need to release the excess 1076 * When borrowing from MIGRATE_CMA, we need to release the excess
1060 * buddy pages to CMA itself. 1077 * buddy pages to CMA itself. We also ensure the freepage_migratetype
1078 * is set to CMA so it is returned to the correct freelist in case
1079 * the page ends up being not actually allocated from the pcp lists.
1061 */ 1080 */
1062 if (is_migrate_cma(fallback_type)) 1081 if (is_migrate_cma(fallback_type))
1063 return fallback_type; 1082 return fallback_type;
@@ -1090,16 +1109,17 @@ static int try_to_steal_freepages(struct zone *zone, struct page *page,
1090 1109
1091/* Remove an element from the buddy allocator from the fallback list */ 1110/* Remove an element from the buddy allocator from the fallback list */
1092static inline struct page * 1111static inline struct page *
1093__rmqueue_fallback(struct zone *zone, int order, int start_migratetype) 1112__rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype)
1094{ 1113{
1095 struct free_area *area; 1114 struct free_area *area;
1096 int current_order; 1115 unsigned int current_order;
1097 struct page *page; 1116 struct page *page;
1098 int migratetype, new_type, i; 1117 int migratetype, new_type, i;
1099 1118
1100 /* Find the largest possible block of pages in the other list */ 1119 /* Find the largest possible block of pages in the other list */
1101 for (current_order = MAX_ORDER-1; current_order >= order; 1120 for (current_order = MAX_ORDER-1;
1102 --current_order) { 1121 current_order >= order && current_order <= MAX_ORDER-1;
1122 --current_order) {
1103 for (i = 0;; i++) { 1123 for (i = 0;; i++) {
1104 migratetype = fallbacks[start_migratetype][i]; 1124 migratetype = fallbacks[start_migratetype][i];
1105 1125
@@ -1125,6 +1145,12 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype)
1125 1145
1126 expand(zone, page, order, current_order, area, 1146 expand(zone, page, order, current_order, area,
1127 new_type); 1147 new_type);
1148 /* The freepage_migratetype may differ from pageblock's
1149 * migratetype depending on the decisions in
1150 * try_to_steal_freepages. This is OK as long as it does
1151 * not differ for MIGRATE_CMA type.
1152 */
1153 set_freepage_migratetype(page, new_type);
1128 1154
1129 trace_mm_page_alloc_extfrag(page, order, current_order, 1155 trace_mm_page_alloc_extfrag(page, order, current_order,
1130 start_migratetype, migratetype, new_type); 1156 start_migratetype, migratetype, new_type);
@@ -1173,9 +1199,9 @@ retry_reserve:
1173 */ 1199 */
1174static int rmqueue_bulk(struct zone *zone, unsigned int order, 1200static int rmqueue_bulk(struct zone *zone, unsigned int order,
1175 unsigned long count, struct list_head *list, 1201 unsigned long count, struct list_head *list,
1176 int migratetype, int cold) 1202 int migratetype, bool cold)
1177{ 1203{
1178 int mt = migratetype, i; 1204 int i;
1179 1205
1180 spin_lock(&zone->lock); 1206 spin_lock(&zone->lock);
1181 for (i = 0; i < count; ++i) { 1207 for (i = 0; i < count; ++i) {
@@ -1192,18 +1218,12 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
1192 * merge IO requests if the physical pages are ordered 1218 * merge IO requests if the physical pages are ordered
1193 * properly. 1219 * properly.
1194 */ 1220 */
1195 if (likely(cold == 0)) 1221 if (likely(!cold))
1196 list_add(&page->lru, list); 1222 list_add(&page->lru, list);
1197 else 1223 else
1198 list_add_tail(&page->lru, list); 1224 list_add_tail(&page->lru, list);
1199 if (IS_ENABLED(CONFIG_CMA)) {
1200 mt = get_pageblock_migratetype(page);
1201 if (!is_migrate_cma(mt) && !is_migrate_isolate(mt))
1202 mt = migratetype;
1203 }
1204 set_freepage_migratetype(page, mt);
1205 list = &page->lru; 1225 list = &page->lru;
1206 if (is_migrate_cma(mt)) 1226 if (is_migrate_cma(get_freepage_migratetype(page)))
1207 __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, 1227 __mod_zone_page_state(zone, NR_FREE_CMA_PAGES,
1208 -(1 << order)); 1228 -(1 << order));
1209 } 1229 }
@@ -1327,7 +1347,7 @@ void mark_free_pages(struct zone *zone)
1327{ 1347{
1328 unsigned long pfn, max_zone_pfn; 1348 unsigned long pfn, max_zone_pfn;
1329 unsigned long flags; 1349 unsigned long flags;
1330 int order, t; 1350 unsigned int order, t;
1331 struct list_head *curr; 1351 struct list_head *curr;
1332 1352
1333 if (zone_is_empty(zone)) 1353 if (zone_is_empty(zone))
@@ -1359,19 +1379,20 @@ void mark_free_pages(struct zone *zone)
1359 1379
1360/* 1380/*
1361 * Free a 0-order page 1381 * Free a 0-order page
1362 * cold == 1 ? free a cold page : free a hot page 1382 * cold == true ? free a cold page : free a hot page
1363 */ 1383 */
1364void free_hot_cold_page(struct page *page, int cold) 1384void free_hot_cold_page(struct page *page, bool cold)
1365{ 1385{
1366 struct zone *zone = page_zone(page); 1386 struct zone *zone = page_zone(page);
1367 struct per_cpu_pages *pcp; 1387 struct per_cpu_pages *pcp;
1368 unsigned long flags; 1388 unsigned long flags;
1389 unsigned long pfn = page_to_pfn(page);
1369 int migratetype; 1390 int migratetype;
1370 1391
1371 if (!free_pages_prepare(page, 0)) 1392 if (!free_pages_prepare(page, 0))
1372 return; 1393 return;
1373 1394
1374 migratetype = get_pageblock_migratetype(page); 1395 migratetype = get_pfnblock_migratetype(page, pfn);
1375 set_freepage_migratetype(page, migratetype); 1396 set_freepage_migratetype(page, migratetype);
1376 local_irq_save(flags); 1397 local_irq_save(flags);
1377 __count_vm_event(PGFREE); 1398 __count_vm_event(PGFREE);
@@ -1385,17 +1406,17 @@ void free_hot_cold_page(struct page *page, int cold)
1385 */ 1406 */
1386 if (migratetype >= MIGRATE_PCPTYPES) { 1407 if (migratetype >= MIGRATE_PCPTYPES) {
1387 if (unlikely(is_migrate_isolate(migratetype))) { 1408 if (unlikely(is_migrate_isolate(migratetype))) {
1388 free_one_page(zone, page, 0, migratetype); 1409 free_one_page(zone, page, pfn, 0, migratetype);
1389 goto out; 1410 goto out;
1390 } 1411 }
1391 migratetype = MIGRATE_MOVABLE; 1412 migratetype = MIGRATE_MOVABLE;
1392 } 1413 }
1393 1414
1394 pcp = &this_cpu_ptr(zone->pageset)->pcp; 1415 pcp = &this_cpu_ptr(zone->pageset)->pcp;
1395 if (cold) 1416 if (!cold)
1396 list_add_tail(&page->lru, &pcp->lists[migratetype]);
1397 else
1398 list_add(&page->lru, &pcp->lists[migratetype]); 1417 list_add(&page->lru, &pcp->lists[migratetype]);
1418 else
1419 list_add_tail(&page->lru, &pcp->lists[migratetype]);
1399 pcp->count++; 1420 pcp->count++;
1400 if (pcp->count >= pcp->high) { 1421 if (pcp->count >= pcp->high) {
1401 unsigned long batch = ACCESS_ONCE(pcp->batch); 1422 unsigned long batch = ACCESS_ONCE(pcp->batch);
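
free_hot_cold_page() now takes a bool: hot pages are pushed to the head of the per-cpu list and cold pages to the tail, so the next allocation reuses the most cache-warm page first. A small double-ended list sketch of that placement policy follows; the list type and names are made up.

#include <stdbool.h>
#include <stdio.h>

struct node { int id; struct node *next; };

struct pcp_list {
	struct node *head, *tail;   /* allocation always pops from the head */
};

/* Hot (recently used, likely cache-warm) pages go to the front so they are
 * handed out again first; cold pages queue at the back. */
static void free_page_to_list(struct pcp_list *l, struct node *n, bool cold)
{
	n->next = NULL;
	if (!l->head) {
		l->head = l->tail = n;
	} else if (!cold) {
		n->next = l->head;          /* hot: push front */
		l->head = n;
	} else {
		l->tail->next = n;          /* cold: append */
		l->tail = n;
	}
}

static struct node *alloc_page_from_list(struct pcp_list *l)
{
	struct node *n = l->head;
	if (n) {
		l->head = n->next;
		if (!l->head)
			l->tail = NULL;
	}
	return n;
}

int main(void)
{
	struct pcp_list l = { 0 };
	struct node a = { .id = 1 }, b = { .id = 2 }, c = { .id = 3 };

	free_page_to_list(&l, &a, true);    /* cold */
	free_page_to_list(&l, &b, false);   /* hot  */
	free_page_to_list(&l, &c, true);    /* cold */
	printf("first reuse: %d\n", alloc_page_from_list(&l)->id);  /* prints 2 */
	return 0;
}
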
@@ -1410,7 +1431,7 @@ out:
1410/* 1431/*
1411 * Free a list of 0-order pages 1432 * Free a list of 0-order pages
1412 */ 1433 */
1413void free_hot_cold_page_list(struct list_head *list, int cold) 1434void free_hot_cold_page_list(struct list_head *list, bool cold)
1414{ 1435{
1415 struct page *page, *next; 1436 struct page *page, *next;
1416 1437
@@ -1522,12 +1543,12 @@ int split_free_page(struct page *page)
1522 */ 1543 */
1523static inline 1544static inline
1524struct page *buffered_rmqueue(struct zone *preferred_zone, 1545struct page *buffered_rmqueue(struct zone *preferred_zone,
1525 struct zone *zone, int order, gfp_t gfp_flags, 1546 struct zone *zone, unsigned int order,
1526 int migratetype) 1547 gfp_t gfp_flags, int migratetype)
1527{ 1548{
1528 unsigned long flags; 1549 unsigned long flags;
1529 struct page *page; 1550 struct page *page;
1530 int cold = !!(gfp_flags & __GFP_COLD); 1551 bool cold = ((gfp_flags & __GFP_COLD) != 0);
1531 1552
1532again: 1553again:
1533 if (likely(order == 0)) { 1554 if (likely(order == 0)) {
@@ -1572,7 +1593,7 @@ again:
1572 if (!page) 1593 if (!page)
1573 goto failed; 1594 goto failed;
1574 __mod_zone_freepage_state(zone, -(1 << order), 1595 __mod_zone_freepage_state(zone, -(1 << order),
1575 get_pageblock_migratetype(page)); 1596 get_freepage_migratetype(page));
1576 } 1597 }
1577 1598
1578 __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order)); 1599 __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order));
@@ -1672,8 +1693,9 @@ static inline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
1672 * Return true if free pages are above 'mark'. This takes into account the order 1693 * Return true if free pages are above 'mark'. This takes into account the order
1673 * of the allocation. 1694 * of the allocation.
1674 */ 1695 */
1675static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark, 1696static bool __zone_watermark_ok(struct zone *z, unsigned int order,
1676 int classzone_idx, int alloc_flags, long free_pages) 1697 unsigned long mark, int classzone_idx, int alloc_flags,
1698 long free_pages)
1677{ 1699{
1678 /* free_pages my go negative - that's OK */ 1700 /* free_pages my go negative - that's OK */
1679 long min = mark; 1701 long min = mark;
@@ -1707,15 +1729,15 @@ static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
1707 return true; 1729 return true;
1708} 1730}
1709 1731
1710bool zone_watermark_ok(struct zone *z, int order, unsigned long mark, 1732bool zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
1711 int classzone_idx, int alloc_flags) 1733 int classzone_idx, int alloc_flags)
1712{ 1734{
1713 return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags, 1735 return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
1714 zone_page_state(z, NR_FREE_PAGES)); 1736 zone_page_state(z, NR_FREE_PAGES));
1715} 1737}
1716 1738
1717bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark, 1739bool zone_watermark_ok_safe(struct zone *z, unsigned int order,
1718 int classzone_idx, int alloc_flags) 1740 unsigned long mark, int classzone_idx, int alloc_flags)
1719{ 1741{
1720 long free_pages = zone_page_state(z, NR_FREE_PAGES); 1742 long free_pages = zone_page_state(z, NR_FREE_PAGES);
1721 1743
@@ -1850,18 +1872,8 @@ static bool zone_local(struct zone *local_zone, struct zone *zone)
1850 1872
1851static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone) 1873static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
1852{ 1874{
1853 return node_isset(local_zone->node, zone->zone_pgdat->reclaim_nodes); 1875 return node_distance(zone_to_nid(local_zone), zone_to_nid(zone)) <
1854} 1876 RECLAIM_DISTANCE;
1855
1856static void __paginginit init_zone_allows_reclaim(int nid)
1857{
1858 int i;
1859
1860 for_each_node_state(i, N_MEMORY)
1861 if (node_distance(nid, i) <= RECLAIM_DISTANCE)
1862 node_set(i, NODE_DATA(nid)->reclaim_nodes);
1863 else
1864 zone_reclaim_mode = 1;
1865} 1877}
1866 1878
1867#else /* CONFIG_NUMA */ 1879#else /* CONFIG_NUMA */
@@ -1895,9 +1907,6 @@ static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
1895 return true; 1907 return true;
1896} 1908}
1897 1909
1898static inline void init_zone_allows_reclaim(int nid)
1899{
1900}
1901#endif /* CONFIG_NUMA */ 1910#endif /* CONFIG_NUMA */
1902 1911
1903/* 1912/*
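
zone_allows_reclaim() now answers directly from the NUMA distance table, node_distance() < RECLAIM_DISTANCE, instead of consulting a reclaim_nodes mask precomputed per node at boot; init_zone_allows_reclaim() goes away with it. Below is a toy version over an explicit distance matrix; the matrix values and the RECLAIM_DISTANCE threshold are illustrative.

#include <stdbool.h>
#include <stdio.h>

#define NR_NODES 3
#define RECLAIM_DISTANCE 30      /* illustrative threshold */

/* A made-up NUMA distance table: distance[i][j] between nodes i and j. */
static const int distance[NR_NODES][NR_NODES] = {
	{ 10, 20, 40 },
	{ 20, 10, 40 },
	{ 40, 40, 10 },
};

/* Decide on the fly whether 'remote' is close enough to 'local' to be worth
 * reclaiming on, instead of looking the answer up in a mask filled at boot. */
static bool node_allows_reclaim(int local, int remote)
{
	return distance[local][remote] < RECLAIM_DISTANCE;
}

int main(void)
{
	printf("node 0 -> node 1: %d\n", node_allows_reclaim(0, 1));  /* 1 */
	printf("node 0 -> node 2: %d\n", node_allows_reclaim(0, 2));  /* 0 */
	return 0;
}
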
@@ -1907,17 +1916,17 @@ static inline void init_zone_allows_reclaim(int nid)
1907static struct page * 1916static struct page *
1908get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order, 1917get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order,
1909 struct zonelist *zonelist, int high_zoneidx, int alloc_flags, 1918 struct zonelist *zonelist, int high_zoneidx, int alloc_flags,
1910 struct zone *preferred_zone, int migratetype) 1919 struct zone *preferred_zone, int classzone_idx, int migratetype)
1911{ 1920{
1912 struct zoneref *z; 1921 struct zoneref *z;
1913 struct page *page = NULL; 1922 struct page *page = NULL;
1914 int classzone_idx;
1915 struct zone *zone; 1923 struct zone *zone;
1916 nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */ 1924 nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */
1917 int zlc_active = 0; /* set if using zonelist_cache */ 1925 int zlc_active = 0; /* set if using zonelist_cache */
1918 int did_zlc_setup = 0; /* just call zlc_setup() one time */ 1926 int did_zlc_setup = 0; /* just call zlc_setup() one time */
1927 bool consider_zone_dirty = (alloc_flags & ALLOC_WMARK_LOW) &&
1928 (gfp_mask & __GFP_WRITE);
1919 1929
1920 classzone_idx = zone_idx(preferred_zone);
1921zonelist_scan: 1930zonelist_scan:
1922 /* 1931 /*
1923 * Scan zonelist, looking for a zone with enough free. 1932 * Scan zonelist, looking for a zone with enough free.
@@ -1930,12 +1939,10 @@ zonelist_scan:
1930 if (IS_ENABLED(CONFIG_NUMA) && zlc_active && 1939 if (IS_ENABLED(CONFIG_NUMA) && zlc_active &&
1931 !zlc_zone_worth_trying(zonelist, z, allowednodes)) 1940 !zlc_zone_worth_trying(zonelist, z, allowednodes))
1932 continue; 1941 continue;
1933 if ((alloc_flags & ALLOC_CPUSET) && 1942 if (cpusets_enabled() &&
1943 (alloc_flags & ALLOC_CPUSET) &&
1934 !cpuset_zone_allowed_softwall(zone, gfp_mask)) 1944 !cpuset_zone_allowed_softwall(zone, gfp_mask))
1935 continue; 1945 continue;
1936 BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK);
1937 if (unlikely(alloc_flags & ALLOC_NO_WATERMARKS))
1938 goto try_this_zone;
1939 /* 1946 /*
1940 * Distribute pages in proportion to the individual 1947 * Distribute pages in proportion to the individual
1941 * zone size to ensure fair page aging. The zone a 1948 * zone size to ensure fair page aging. The zone a
@@ -1974,15 +1981,19 @@ zonelist_scan:
1974 * will require awareness of zones in the 1981 * will require awareness of zones in the
1975 * dirty-throttling and the flusher threads. 1982 * dirty-throttling and the flusher threads.
1976 */ 1983 */
1977 if ((alloc_flags & ALLOC_WMARK_LOW) && 1984 if (consider_zone_dirty && !zone_dirty_ok(zone))
1978 (gfp_mask & __GFP_WRITE) && !zone_dirty_ok(zone)) 1985 continue;
1979 goto this_zone_full;
1980 1986
1981 mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK]; 1987 mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK];
1982 if (!zone_watermark_ok(zone, order, mark, 1988 if (!zone_watermark_ok(zone, order, mark,
1983 classzone_idx, alloc_flags)) { 1989 classzone_idx, alloc_flags)) {
1984 int ret; 1990 int ret;
1985 1991
1992 /* Checked here to keep the fast path fast */
1993 BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK);
1994 if (alloc_flags & ALLOC_NO_WATERMARKS)
1995 goto try_this_zone;
1996
1986 if (IS_ENABLED(CONFIG_NUMA) && 1997 if (IS_ENABLED(CONFIG_NUMA) &&
1987 !did_zlc_setup && nr_online_nodes > 1) { 1998 !did_zlc_setup && nr_online_nodes > 1) {
1988 /* 1999 /*
@@ -2044,7 +2055,7 @@ try_this_zone:
2044 if (page) 2055 if (page)
2045 break; 2056 break;
2046this_zone_full: 2057this_zone_full:
2047 if (IS_ENABLED(CONFIG_NUMA)) 2058 if (IS_ENABLED(CONFIG_NUMA) && zlc_active)
2048 zlc_mark_zone_full(zonelist, z); 2059 zlc_mark_zone_full(zonelist, z);
2049 } 2060 }
2050 2061
@@ -2173,7 +2184,7 @@ static inline struct page *
2173__alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, 2184__alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
2174 struct zonelist *zonelist, enum zone_type high_zoneidx, 2185 struct zonelist *zonelist, enum zone_type high_zoneidx,
2175 nodemask_t *nodemask, struct zone *preferred_zone, 2186 nodemask_t *nodemask, struct zone *preferred_zone,
2176 int migratetype) 2187 int classzone_idx, int migratetype)
2177{ 2188{
2178 struct page *page; 2189 struct page *page;
2179 2190
@@ -2191,7 +2202,7 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
2191 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, 2202 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask,
2192 order, zonelist, high_zoneidx, 2203 order, zonelist, high_zoneidx,
2193 ALLOC_WMARK_HIGH|ALLOC_CPUSET, 2204 ALLOC_WMARK_HIGH|ALLOC_CPUSET,
2194 preferred_zone, migratetype); 2205 preferred_zone, classzone_idx, migratetype);
2195 if (page) 2206 if (page)
2196 goto out; 2207 goto out;
2197 2208
@@ -2226,7 +2237,7 @@ static struct page *
2226__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, 2237__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
2227 struct zonelist *zonelist, enum zone_type high_zoneidx, 2238 struct zonelist *zonelist, enum zone_type high_zoneidx,
2228 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, 2239 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
2229 int migratetype, bool sync_migration, 2240 int classzone_idx, int migratetype, enum migrate_mode mode,
2230 bool *contended_compaction, bool *deferred_compaction, 2241 bool *contended_compaction, bool *deferred_compaction,
2231 unsigned long *did_some_progress) 2242 unsigned long *did_some_progress)
2232{ 2243{
@@ -2240,7 +2251,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
2240 2251
2241 current->flags |= PF_MEMALLOC; 2252 current->flags |= PF_MEMALLOC;
2242 *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask, 2253 *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask,
2243 nodemask, sync_migration, 2254 nodemask, mode,
2244 contended_compaction); 2255 contended_compaction);
2245 current->flags &= ~PF_MEMALLOC; 2256 current->flags &= ~PF_MEMALLOC;
2246 2257
@@ -2254,7 +2265,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
2254 page = get_page_from_freelist(gfp_mask, nodemask, 2265 page = get_page_from_freelist(gfp_mask, nodemask,
2255 order, zonelist, high_zoneidx, 2266 order, zonelist, high_zoneidx,
2256 alloc_flags & ~ALLOC_NO_WATERMARKS, 2267 alloc_flags & ~ALLOC_NO_WATERMARKS,
2257 preferred_zone, migratetype); 2268 preferred_zone, classzone_idx, migratetype);
2258 if (page) { 2269 if (page) {
2259 preferred_zone->compact_blockskip_flush = false; 2270 preferred_zone->compact_blockskip_flush = false;
2260 compaction_defer_reset(preferred_zone, order, true); 2271 compaction_defer_reset(preferred_zone, order, true);
@@ -2273,7 +2284,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
2273 * As async compaction considers a subset of pageblocks, only 2284 * As async compaction considers a subset of pageblocks, only
2274 * defer if the failure was a sync compaction failure. 2285 * defer if the failure was a sync compaction failure.
2275 */ 2286 */
2276 if (sync_migration) 2287 if (mode != MIGRATE_ASYNC)
2277 defer_compaction(preferred_zone, order); 2288 defer_compaction(preferred_zone, order);
2278 2289
2279 cond_resched(); 2290 cond_resched();
@@ -2286,9 +2297,9 @@ static inline struct page *
2286__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, 2297__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
2287 struct zonelist *zonelist, enum zone_type high_zoneidx, 2298 struct zonelist *zonelist, enum zone_type high_zoneidx,
2288 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, 2299 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
2289 int migratetype, bool sync_migration, 2300 int classzone_idx, int migratetype,
2290 bool *contended_compaction, bool *deferred_compaction, 2301 enum migrate_mode mode, bool *contended_compaction,
2291 unsigned long *did_some_progress) 2302 bool *deferred_compaction, unsigned long *did_some_progress)
2292{ 2303{
2293 return NULL; 2304 return NULL;
2294} 2305}
@@ -2327,7 +2338,7 @@ static inline struct page *
2327__alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, 2338__alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
2328 struct zonelist *zonelist, enum zone_type high_zoneidx, 2339 struct zonelist *zonelist, enum zone_type high_zoneidx,
2329 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, 2340 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
2330 int migratetype, unsigned long *did_some_progress) 2341 int classzone_idx, int migratetype, unsigned long *did_some_progress)
2331{ 2342{
2332 struct page *page = NULL; 2343 struct page *page = NULL;
2333 bool drained = false; 2344 bool drained = false;
@@ -2345,7 +2356,8 @@ retry:
2345 page = get_page_from_freelist(gfp_mask, nodemask, order, 2356 page = get_page_from_freelist(gfp_mask, nodemask, order,
2346 zonelist, high_zoneidx, 2357 zonelist, high_zoneidx,
2347 alloc_flags & ~ALLOC_NO_WATERMARKS, 2358 alloc_flags & ~ALLOC_NO_WATERMARKS,
2348 preferred_zone, migratetype); 2359 preferred_zone, classzone_idx,
2360 migratetype);
2349 2361
2350 /* 2362 /*
2351 * If an allocation failed after direct reclaim, it could be because 2363 * If an allocation failed after direct reclaim, it could be because
@@ -2368,14 +2380,14 @@ static inline struct page *
2368__alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order, 2380__alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order,
2369 struct zonelist *zonelist, enum zone_type high_zoneidx, 2381 struct zonelist *zonelist, enum zone_type high_zoneidx,
2370 nodemask_t *nodemask, struct zone *preferred_zone, 2382 nodemask_t *nodemask, struct zone *preferred_zone,
2371 int migratetype) 2383 int classzone_idx, int migratetype)
2372{ 2384{
2373 struct page *page; 2385 struct page *page;
2374 2386
2375 do { 2387 do {
2376 page = get_page_from_freelist(gfp_mask, nodemask, order, 2388 page = get_page_from_freelist(gfp_mask, nodemask, order,
2377 zonelist, high_zoneidx, ALLOC_NO_WATERMARKS, 2389 zonelist, high_zoneidx, ALLOC_NO_WATERMARKS,
2378 preferred_zone, migratetype); 2390 preferred_zone, classzone_idx, migratetype);
2379 2391
2380 if (!page && gfp_mask & __GFP_NOFAIL) 2392 if (!page && gfp_mask & __GFP_NOFAIL)
2381 wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50); 2393 wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50);
@@ -2476,14 +2488,14 @@ static inline struct page *
2476__alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, 2488__alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
2477 struct zonelist *zonelist, enum zone_type high_zoneidx, 2489 struct zonelist *zonelist, enum zone_type high_zoneidx,
2478 nodemask_t *nodemask, struct zone *preferred_zone, 2490 nodemask_t *nodemask, struct zone *preferred_zone,
2479 int migratetype) 2491 int classzone_idx, int migratetype)
2480{ 2492{
2481 const gfp_t wait = gfp_mask & __GFP_WAIT; 2493 const gfp_t wait = gfp_mask & __GFP_WAIT;
2482 struct page *page = NULL; 2494 struct page *page = NULL;
2483 int alloc_flags; 2495 int alloc_flags;
2484 unsigned long pages_reclaimed = 0; 2496 unsigned long pages_reclaimed = 0;
2485 unsigned long did_some_progress; 2497 unsigned long did_some_progress;
2486 bool sync_migration = false; 2498 enum migrate_mode migration_mode = MIGRATE_ASYNC;
2487 bool deferred_compaction = false; 2499 bool deferred_compaction = false;
2488 bool contended_compaction = false; 2500 bool contended_compaction = false;
2489 2501
@@ -2525,15 +2537,18 @@ restart:
2525 * Find the true preferred zone if the allocation is unconstrained by 2537 * Find the true preferred zone if the allocation is unconstrained by
2526 * cpusets. 2538 * cpusets.
2527 */ 2539 */
2528 if (!(alloc_flags & ALLOC_CPUSET) && !nodemask) 2540 if (!(alloc_flags & ALLOC_CPUSET) && !nodemask) {
2529 first_zones_zonelist(zonelist, high_zoneidx, NULL, 2541 struct zoneref *preferred_zoneref;
2530 &preferred_zone); 2542 preferred_zoneref = first_zones_zonelist(zonelist, high_zoneidx,
2543 NULL, &preferred_zone);
2544 classzone_idx = zonelist_zone_idx(preferred_zoneref);
2545 }
2531 2546
2532rebalance: 2547rebalance:
2533 /* This is the last chance, in general, before the goto nopage. */ 2548 /* This is the last chance, in general, before the goto nopage. */
2534 page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist, 2549 page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist,
2535 high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS, 2550 high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS,
2536 preferred_zone, migratetype); 2551 preferred_zone, classzone_idx, migratetype);
2537 if (page) 2552 if (page)
2538 goto got_pg; 2553 goto got_pg;
2539 2554
@@ -2548,7 +2563,7 @@ rebalance:
2548 2563
2549 page = __alloc_pages_high_priority(gfp_mask, order, 2564 page = __alloc_pages_high_priority(gfp_mask, order,
2550 zonelist, high_zoneidx, nodemask, 2565 zonelist, high_zoneidx, nodemask,
2551 preferred_zone, migratetype); 2566 preferred_zone, classzone_idx, migratetype);
2552 if (page) { 2567 if (page) {
2553 goto got_pg; 2568 goto got_pg;
2554 } 2569 }
@@ -2577,17 +2592,23 @@ rebalance:
2577 * Try direct compaction. The first pass is asynchronous. Subsequent 2592 * Try direct compaction. The first pass is asynchronous. Subsequent
2578 * attempts after direct reclaim are synchronous 2593 * attempts after direct reclaim are synchronous
2579 */ 2594 */
2580 page = __alloc_pages_direct_compact(gfp_mask, order, 2595 page = __alloc_pages_direct_compact(gfp_mask, order, zonelist,
2581 zonelist, high_zoneidx, 2596 high_zoneidx, nodemask, alloc_flags,
2582 nodemask, 2597 preferred_zone,
2583 alloc_flags, preferred_zone, 2598 classzone_idx, migratetype,
2584 migratetype, sync_migration, 2599 migration_mode, &contended_compaction,
2585 &contended_compaction,
2586 &deferred_compaction, 2600 &deferred_compaction,
2587 &did_some_progress); 2601 &did_some_progress);
2588 if (page) 2602 if (page)
2589 goto got_pg; 2603 goto got_pg;
2590 sync_migration = true; 2604
2605 /*
2606 * It can become very expensive to allocate transparent hugepages at
2607 * fault, so use asynchronous memory compaction for THP unless it is
2608 * khugepaged trying to collapse.
2609 */
2610 if (!(gfp_mask & __GFP_NO_KSWAPD) || (current->flags & PF_KTHREAD))
2611 migration_mode = MIGRATE_SYNC_LIGHT;
2591 2612
2592 /* 2613 /*
2593 * If compaction is deferred for high-order allocations, it is because 2614 * If compaction is deferred for high-order allocations, it is because
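
__alloc_pages_slowpath() replaces the sync_migration bool with enum migrate_mode: the first compaction pass stays MIGRATE_ASYNC, and the mode is only raised to MIGRATE_SYNC_LIGHT when the allocation is not a userspace THP page fault, or when the caller is a kernel thread such as khugepaged. A sketch of that escalation decision follows, with invented flag names standing in for __GFP_NO_KSWAPD and PF_KTHREAD.

#include <stdio.h>

enum migrate_mode { MODE_ASYNC, MODE_SYNC_LIGHT, MODE_SYNC };

#define FLAG_NO_KSWAPD  0x1   /* stand-in for __GFP_NO_KSWAPD (set by THP faults) */
#define TASK_KTHREAD    0x2   /* stand-in for PF_KTHREAD */

/* After a failed async pass, decide whether the retry may block a little:
 * everything except a userspace THP fault is allowed to escalate. */
static enum migrate_mode next_mode(unsigned alloc_flags, unsigned task_flags)
{
	if (!(alloc_flags & FLAG_NO_KSWAPD) || (task_flags & TASK_KTHREAD))
		return MODE_SYNC_LIGHT;
	return MODE_ASYNC;
}

int main(void)
{
	/* Regular high-order allocation: escalates. */
	printf("%d\n", next_mode(0, 0));                         /* 1 */
	/* THP fault from userspace: stays async. */
	printf("%d\n", next_mode(FLAG_NO_KSWAPD, 0));            /* 0 */
	/* Kernel thread collapsing THP: escalates. */
	printf("%d\n", next_mode(FLAG_NO_KSWAPD, TASK_KTHREAD)); /* 1 */
	return 0;
}
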
@@ -2604,7 +2625,8 @@ rebalance:
2604 zonelist, high_zoneidx, 2625 zonelist, high_zoneidx,
2605 nodemask, 2626 nodemask,
2606 alloc_flags, preferred_zone, 2627 alloc_flags, preferred_zone,
2607 migratetype, &did_some_progress); 2628 classzone_idx, migratetype,
2629 &did_some_progress);
2608 if (page) 2630 if (page)
2609 goto got_pg; 2631 goto got_pg;
2610 2632
@@ -2623,7 +2645,7 @@ rebalance:
2623 page = __alloc_pages_may_oom(gfp_mask, order, 2645 page = __alloc_pages_may_oom(gfp_mask, order,
2624 zonelist, high_zoneidx, 2646 zonelist, high_zoneidx,
2625 nodemask, preferred_zone, 2647 nodemask, preferred_zone,
2626 migratetype); 2648 classzone_idx, migratetype);
2627 if (page) 2649 if (page)
2628 goto got_pg; 2650 goto got_pg;
2629 2651
@@ -2662,12 +2684,11 @@ rebalance:
2662 * direct reclaim and reclaim/compaction depends on compaction 2684 * direct reclaim and reclaim/compaction depends on compaction
2663 * being called after reclaim so call directly if necessary 2685 * being called after reclaim so call directly if necessary
2664 */ 2686 */
2665 page = __alloc_pages_direct_compact(gfp_mask, order, 2687 page = __alloc_pages_direct_compact(gfp_mask, order, zonelist,
2666 zonelist, high_zoneidx, 2688 high_zoneidx, nodemask, alloc_flags,
2667 nodemask, 2689 preferred_zone,
2668 alloc_flags, preferred_zone, 2690 classzone_idx, migratetype,
2669 migratetype, sync_migration, 2691 migration_mode, &contended_compaction,
2670 &contended_compaction,
2671 &deferred_compaction, 2692 &deferred_compaction,
2672 &did_some_progress); 2693 &did_some_progress);
2673 if (page) 2694 if (page)
@@ -2693,11 +2714,12 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
2693{ 2714{
2694 enum zone_type high_zoneidx = gfp_zone(gfp_mask); 2715 enum zone_type high_zoneidx = gfp_zone(gfp_mask);
2695 struct zone *preferred_zone; 2716 struct zone *preferred_zone;
2717 struct zoneref *preferred_zoneref;
2696 struct page *page = NULL; 2718 struct page *page = NULL;
2697 int migratetype = allocflags_to_migratetype(gfp_mask); 2719 int migratetype = allocflags_to_migratetype(gfp_mask);
2698 unsigned int cpuset_mems_cookie; 2720 unsigned int cpuset_mems_cookie;
2699 int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET|ALLOC_FAIR; 2721 int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET|ALLOC_FAIR;
2700 struct mem_cgroup *memcg = NULL; 2722 int classzone_idx;
2701 2723
2702 gfp_mask &= gfp_allowed_mask; 2724 gfp_mask &= gfp_allowed_mask;
2703 2725
@@ -2716,22 +2738,16 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
2716 if (unlikely(!zonelist->_zonerefs->zone)) 2738 if (unlikely(!zonelist->_zonerefs->zone))
2717 return NULL; 2739 return NULL;
2718 2740
2719 /*
2720 * Will only have any effect when __GFP_KMEMCG is set. This is
2721 * verified in the (always inline) callee
2722 */
2723 if (!memcg_kmem_newpage_charge(gfp_mask, &memcg, order))
2724 return NULL;
2725
2726retry_cpuset: 2741retry_cpuset:
2727 cpuset_mems_cookie = read_mems_allowed_begin(); 2742 cpuset_mems_cookie = read_mems_allowed_begin();
2728 2743
2729 /* The preferred zone is used for statistics later */ 2744 /* The preferred zone is used for statistics later */
2730 first_zones_zonelist(zonelist, high_zoneidx, 2745 preferred_zoneref = first_zones_zonelist(zonelist, high_zoneidx,
2731 nodemask ? : &cpuset_current_mems_allowed, 2746 nodemask ? : &cpuset_current_mems_allowed,
2732 &preferred_zone); 2747 &preferred_zone);
2733 if (!preferred_zone) 2748 if (!preferred_zone)
2734 goto out; 2749 goto out;
2750 classzone_idx = zonelist_zone_idx(preferred_zoneref);
2735 2751
2736#ifdef CONFIG_CMA 2752#ifdef CONFIG_CMA
2737 if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE) 2753 if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
@@ -2741,7 +2757,7 @@ retry:
2741 /* First allocation attempt */ 2757 /* First allocation attempt */
2742 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, 2758 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
2743 zonelist, high_zoneidx, alloc_flags, 2759 zonelist, high_zoneidx, alloc_flags,
2744 preferred_zone, migratetype); 2760 preferred_zone, classzone_idx, migratetype);
2745 if (unlikely(!page)) { 2761 if (unlikely(!page)) {
2746 /* 2762 /*
2747 * The first pass makes sure allocations are spread 2763 * The first pass makes sure allocations are spread
@@ -2767,7 +2783,7 @@ retry:
2767 gfp_mask = memalloc_noio_flags(gfp_mask); 2783 gfp_mask = memalloc_noio_flags(gfp_mask);
2768 page = __alloc_pages_slowpath(gfp_mask, order, 2784 page = __alloc_pages_slowpath(gfp_mask, order,
2769 zonelist, high_zoneidx, nodemask, 2785 zonelist, high_zoneidx, nodemask,
2770 preferred_zone, migratetype); 2786 preferred_zone, classzone_idx, migratetype);
2771 } 2787 }
2772 2788
2773 trace_mm_page_alloc(page, order, gfp_mask, migratetype); 2789 trace_mm_page_alloc(page, order, gfp_mask, migratetype);
@@ -2782,8 +2798,6 @@ out:
2782 if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie))) 2798 if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
2783 goto retry_cpuset; 2799 goto retry_cpuset;
2784 2800
2785 memcg_kmem_commit_charge(page, memcg, order);
2786
2787 return page; 2801 return page;
2788} 2802}
2789EXPORT_SYMBOL(__alloc_pages_nodemask); 2803EXPORT_SYMBOL(__alloc_pages_nodemask);
@@ -2818,7 +2832,7 @@ void __free_pages(struct page *page, unsigned int order)
2818{ 2832{
2819 if (put_page_testzero(page)) { 2833 if (put_page_testzero(page)) {
2820 if (order == 0) 2834 if (order == 0)
2821 free_hot_cold_page(page, 0); 2835 free_hot_cold_page(page, false);
2822 else 2836 else
2823 __free_pages_ok(page, order); 2837 __free_pages_ok(page, order);
2824 } 2838 }
@@ -2837,27 +2851,51 @@ void free_pages(unsigned long addr, unsigned int order)
2837EXPORT_SYMBOL(free_pages); 2851EXPORT_SYMBOL(free_pages);
2838 2852
2839/* 2853/*
2840 * __free_memcg_kmem_pages and free_memcg_kmem_pages will free 2854 * alloc_kmem_pages charges newly allocated pages to the kmem resource counter
2841 * pages allocated with __GFP_KMEMCG. 2855 * of the current memory cgroup.
2842 *
2843 * Those pages are accounted to a particular memcg, embedded in the
2844 * corresponding page_cgroup. To avoid adding a hit in the allocator to search
2845 * for that information only to find out that it is NULL for users who have no
2846 * interest in that whatsoever, we provide these functions.
2847 * 2856 *
2848 * The caller knows better which flags it relies on. 2857 * It should be used when the caller would like to use kmalloc, but since the
2858 * allocation is large, it has to fall back to the page allocator.
2849 */ 2859 */
2850void __free_memcg_kmem_pages(struct page *page, unsigned int order) 2860struct page *alloc_kmem_pages(gfp_t gfp_mask, unsigned int order)
2861{
2862 struct page *page;
2863 struct mem_cgroup *memcg = NULL;
2864
2865 if (!memcg_kmem_newpage_charge(gfp_mask, &memcg, order))
2866 return NULL;
2867 page = alloc_pages(gfp_mask, order);
2868 memcg_kmem_commit_charge(page, memcg, order);
2869 return page;
2870}
2871
2872struct page *alloc_kmem_pages_node(int nid, gfp_t gfp_mask, unsigned int order)
2873{
2874 struct page *page;
2875 struct mem_cgroup *memcg = NULL;
2876
2877 if (!memcg_kmem_newpage_charge(gfp_mask, &memcg, order))
2878 return NULL;
2879 page = alloc_pages_node(nid, gfp_mask, order);
2880 memcg_kmem_commit_charge(page, memcg, order);
2881 return page;
2882}
2883
2884/*
2885 * __free_kmem_pages and free_kmem_pages will free pages allocated with
2886 * alloc_kmem_pages.
2887 */
2888void __free_kmem_pages(struct page *page, unsigned int order)
2851{ 2889{
2852 memcg_kmem_uncharge_pages(page, order); 2890 memcg_kmem_uncharge_pages(page, order);
2853 __free_pages(page, order); 2891 __free_pages(page, order);
2854} 2892}
2855 2893
2856void free_memcg_kmem_pages(unsigned long addr, unsigned int order) 2894void free_kmem_pages(unsigned long addr, unsigned int order)
2857{ 2895{
2858 if (addr != 0) { 2896 if (addr != 0) {
2859 VM_BUG_ON(!virt_addr_valid((void *)addr)); 2897 VM_BUG_ON(!virt_addr_valid((void *)addr));
2860 __free_memcg_kmem_pages(virt_to_page((void *)addr), order); 2898 __free_kmem_pages(virt_to_page((void *)addr), order);
2861 } 2899 }
2862} 2900}
2863 2901
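
The kmemcg accounting that used to sit inside __alloc_pages_nodemask() (charge before allocation, commit afterwards) moves into the dedicated alloc_kmem_pages()/free_kmem_pages() wrappers, so the common allocation path no longer carries that check at all. The sketch below is a generic userspace rendering of the charge/allocate/commit-or-cancel shape, with a plain byte counter standing in for the memcg machinery; all names and limits are invented.

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

static long charged_bytes;                 /* toy replacement for a memcg counter */
static const long limit_bytes = 1 << 20;   /* invented 1 MiB limit */

static bool charge(size_t size)
{
	if (charged_bytes + (long)size > limit_bytes)
		return false;                  /* over limit: refuse up front */
	charged_bytes += size;
	return true;
}

static void uncharge(size_t size) { charged_bytes -= size; }

/* Accounted allocation: charge first, then allocate, and roll the charge
 * back if the allocation itself fails; this is the same shape as the new
 * alloc_kmem_pages()/__free_kmem_pages() pair. */
static void *alloc_accounted(size_t size)
{
	if (!charge(size))
		return NULL;
	void *p = malloc(size);
	if (!p)
		uncharge(size);                /* cancel the charge on failure */
	return p;
}

static void free_accounted(void *p, size_t size)
{
	if (p) {
		uncharge(size);
		free(p);
	}
}

int main(void)
{
	void *p = alloc_accounted(4096);
	printf("charged after alloc: %ld\n", charged_bytes);
	free_accounted(p, 4096);
	printf("charged after free:  %ld\n", charged_bytes);
	return 0;
}
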
@@ -4095,7 +4133,7 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
4095 4133
4096static void __meminit zone_init_free_lists(struct zone *zone) 4134static void __meminit zone_init_free_lists(struct zone *zone)
4097{ 4135{
4098 int order, t; 4136 unsigned int order, t;
4099 for_each_migratetype_order(order, t) { 4137 for_each_migratetype_order(order, t) {
4100 INIT_LIST_HEAD(&zone->free_area[order].free_list[t]); 4138 INIT_LIST_HEAD(&zone->free_area[order].free_list[t]);
4101 zone->free_area[order].nr_free = 0; 4139 zone->free_area[order].nr_free = 0;
@@ -4349,9 +4387,6 @@ int __meminit init_currently_empty_zone(struct zone *zone,
4349#ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID 4387#ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID
4350/* 4388/*
4351 * Required by SPARSEMEM. Given a PFN, return what node the PFN is on. 4389 * Required by SPARSEMEM. Given a PFN, return what node the PFN is on.
4352 * Architectures may implement their own version but if add_active_range()
4353 * was used and there are no special requirements, this is a convenient
4354 * alternative
4355 */ 4390 */
4356int __meminit __early_pfn_to_nid(unsigned long pfn) 4391int __meminit __early_pfn_to_nid(unsigned long pfn)
4357{ 4392{
@@ -4406,10 +4441,9 @@ bool __meminit early_pfn_in_nid(unsigned long pfn, int node)
4406 * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed. 4441 * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed.
4407 * @max_low_pfn: The highest PFN that will be passed to memblock_free_early_nid 4442 * @max_low_pfn: The highest PFN that will be passed to memblock_free_early_nid
4408 * 4443 *
4409 * If an architecture guarantees that all ranges registered with 4444 * If an architecture guarantees that all ranges registered contain no holes
4410 * add_active_ranges() contain no holes and may be freed, this 4445 * and may be freed, this this function may be used instead of calling
4411 * this function may be used instead of calling memblock_free_early_nid() 4446 * memblock_free_early_nid() manually.
4412 * manually.
4413 */ 4447 */
4414void __init free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn) 4448void __init free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn)
4415{ 4449{
@@ -4431,9 +4465,8 @@ void __init free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn)
4431 * sparse_memory_present_with_active_regions - Call memory_present for each active range 4465 * sparse_memory_present_with_active_regions - Call memory_present for each active range
4432 * @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used. 4466 * @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used.
4433 * 4467 *
4434 * If an architecture guarantees that all ranges registered with 4468 * If an architecture guarantees that all ranges registered contain no holes and may
4435 * add_active_ranges() contain no holes and may be freed, this 4469 * be freed, this function may be used instead of calling memory_present() manually.
4436 * function may be used instead of calling memory_present() manually.
4437 */ 4470 */
4438void __init sparse_memory_present_with_active_regions(int nid) 4471void __init sparse_memory_present_with_active_regions(int nid)
4439{ 4472{
@@ -4451,7 +4484,7 @@ void __init sparse_memory_present_with_active_regions(int nid)
4451 * @end_pfn: Passed by reference. On return, it will have the node end_pfn. 4484 * @end_pfn: Passed by reference. On return, it will have the node end_pfn.
4452 * 4485 *
4453 * It returns the start and end page frame of a node based on information 4486 * It returns the start and end page frame of a node based on information
4454 * provided by an arch calling add_active_range(). If called for a node 4487 * provided by memblock_set_node(). If called for a node
4455 * with no available memory, a warning is printed and the start and end 4488 * with no available memory, a warning is printed and the start and end
4456 * PFNs will be 0. 4489 * PFNs will be 0.
4457 */ 4490 */
@@ -4921,8 +4954,6 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
4921 4954
4922 pgdat->node_id = nid; 4955 pgdat->node_id = nid;
4923 pgdat->node_start_pfn = node_start_pfn; 4956 pgdat->node_start_pfn = node_start_pfn;
4924 if (node_state(nid, N_MEMORY))
4925 init_zone_allows_reclaim(nid);
4926#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP 4957#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
4927 get_pfn_range_for_nid(nid, &start_pfn, &end_pfn); 4958 get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
4928#endif 4959#endif
@@ -5030,7 +5061,7 @@ static unsigned long __init find_min_pfn_for_node(int nid)
5030 * find_min_pfn_with_active_regions - Find the minimum PFN registered 5061 * find_min_pfn_with_active_regions - Find the minimum PFN registered
5031 * 5062 *
5032 * It returns the minimum PFN based on information provided via 5063 * It returns the minimum PFN based on information provided via
5033 * add_active_range(). 5064 * memblock_set_node().
5034 */ 5065 */
5035unsigned long __init find_min_pfn_with_active_regions(void) 5066unsigned long __init find_min_pfn_with_active_regions(void)
5036{ 5067{
@@ -5251,7 +5282,7 @@ static void check_for_memory(pg_data_t *pgdat, int nid)
5251 * @max_zone_pfn: an array of max PFNs for each zone 5282 * @max_zone_pfn: an array of max PFNs for each zone
5252 * 5283 *
5253 * This will call free_area_init_node() for each active node in the system. 5284 * This will call free_area_init_node() for each active node in the system.
5254 * Using the page ranges provided by add_active_range(), the size of each 5285 * Using the page ranges provided by memblock_set_node(), the size of each
5255 * zone in each node and their holes is calculated. If the maximum PFN 5286 * zone in each node and their holes is calculated. If the maximum PFN
5256 * between two adjacent zones match, it is assumed that the zone is empty. 5287 * between two adjacent zones match, it is assumed that the zone is empty.
5257 * For example, if arch_max_dma_pfn == arch_max_dma32_pfn, it is assumed 5288 * For example, if arch_max_dma_pfn == arch_max_dma32_pfn, it is assumed
@@ -6009,53 +6040,64 @@ static inline int pfn_to_bitidx(struct zone *zone, unsigned long pfn)
6009 * @end_bitidx: The last bit of interest 6040 * @end_bitidx: The last bit of interest
6010 * returns pageblock_bits flags 6041 * returns pageblock_bits flags
6011 */ 6042 */
6012unsigned long get_pageblock_flags_group(struct page *page, 6043unsigned long get_pfnblock_flags_mask(struct page *page, unsigned long pfn,
6013 int start_bitidx, int end_bitidx) 6044 unsigned long end_bitidx,
6045 unsigned long mask)
6014{ 6046{
6015 struct zone *zone; 6047 struct zone *zone;
6016 unsigned long *bitmap; 6048 unsigned long *bitmap;
6017 unsigned long pfn, bitidx; 6049 unsigned long bitidx, word_bitidx;
6018 unsigned long flags = 0; 6050 unsigned long word;
6019 unsigned long value = 1;
6020 6051
6021 zone = page_zone(page); 6052 zone = page_zone(page);
6022 pfn = page_to_pfn(page);
6023 bitmap = get_pageblock_bitmap(zone, pfn); 6053 bitmap = get_pageblock_bitmap(zone, pfn);
6024 bitidx = pfn_to_bitidx(zone, pfn); 6054 bitidx = pfn_to_bitidx(zone, pfn);
6055 word_bitidx = bitidx / BITS_PER_LONG;
6056 bitidx &= (BITS_PER_LONG-1);
6025 6057
6026 for (; start_bitidx <= end_bitidx; start_bitidx++, value <<= 1) 6058 word = bitmap[word_bitidx];
6027 if (test_bit(bitidx + start_bitidx, bitmap)) 6059 bitidx += end_bitidx;
6028 flags |= value; 6060 return (word >> (BITS_PER_LONG - bitidx - 1)) & mask;
6029
6030 return flags;
6031} 6061}
6032 6062
6033/** 6063/**
6034 * set_pageblock_flags_group - Set the requested group of flags for a pageblock_nr_pages block of pages 6064 * set_pfnblock_flags_mask - Set the requested group of flags for a pageblock_nr_pages block of pages
6035 * @page: The page within the block of interest 6065 * @page: The page within the block of interest
6036 * @start_bitidx: The first bit of interest 6066 * @start_bitidx: The first bit of interest
6037 * @end_bitidx: The last bit of interest 6067 * @end_bitidx: The last bit of interest
6038 * @flags: The flags to set 6068 * @flags: The flags to set
6039 */ 6069 */
6040void set_pageblock_flags_group(struct page *page, unsigned long flags, 6070void set_pfnblock_flags_mask(struct page *page, unsigned long flags,
6041 int start_bitidx, int end_bitidx) 6071 unsigned long pfn,
6072 unsigned long end_bitidx,
6073 unsigned long mask)
6042{ 6074{
6043 struct zone *zone; 6075 struct zone *zone;
6044 unsigned long *bitmap; 6076 unsigned long *bitmap;
6045 unsigned long pfn, bitidx; 6077 unsigned long bitidx, word_bitidx;
6046 unsigned long value = 1; 6078 unsigned long old_word, word;
6079
6080 BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 4);
6047 6081
6048 zone = page_zone(page); 6082 zone = page_zone(page);
6049 pfn = page_to_pfn(page);
6050 bitmap = get_pageblock_bitmap(zone, pfn); 6083 bitmap = get_pageblock_bitmap(zone, pfn);
6051 bitidx = pfn_to_bitidx(zone, pfn); 6084 bitidx = pfn_to_bitidx(zone, pfn);
6085 word_bitidx = bitidx / BITS_PER_LONG;
6086 bitidx &= (BITS_PER_LONG-1);
6087
6052 VM_BUG_ON_PAGE(!zone_spans_pfn(zone, pfn), page); 6088 VM_BUG_ON_PAGE(!zone_spans_pfn(zone, pfn), page);
6053 6089
6054 for (; start_bitidx <= end_bitidx; start_bitidx++, value <<= 1) 6090 bitidx += end_bitidx;
6055 if (flags & value) 6091 mask <<= (BITS_PER_LONG - bitidx - 1);
6056 __set_bit(bitidx + start_bitidx, bitmap); 6092 flags <<= (BITS_PER_LONG - bitidx - 1);
6057 else 6093
6058 __clear_bit(bitidx + start_bitidx, bitmap); 6094 word = ACCESS_ONCE(bitmap[word_bitidx]);
6095 for (;;) {
6096 old_word = cmpxchg(&bitmap[word_bitidx], word, (word & ~mask) | flags);
6097 if (word == old_word)
6098 break;
6099 word = old_word;
6100 }
6059} 6101}
6060 6102
6061/* 6103/*
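
An aside on the hunk above: set_pfnblock_flags_mask() now performs a single read-modify-write of the affected bitmap word and retries it with cmpxchg(), so two CPUs updating different pageblocks that happen to share a word cannot lose each other's bits. Below is a minimal sketch of that retry pattern using C11 atomics rather than the kernel's ACCESS_ONCE()/cmpxchg() helpers; it is illustrative only and not part of the patch.

#include <stdatomic.h>
#include <stdio.h>

/* Replace the bits selected by 'mask' with 'flags' (already shifted into
 * position) without a lock and without clobbering concurrent updates to
 * the other bits of the same word. */
static void set_bits_masked(_Atomic unsigned long *word,
                            unsigned long mask, unsigned long flags)
{
        unsigned long old = atomic_load(word);

        for (;;) {
                unsigned long new = (old & ~mask) | flags;

                /* On failure 'old' is refreshed with the current value,
                 * mirroring the word = old_word step in the kernel loop. */
                if (atomic_compare_exchange_weak(word, &old, new))
                        break;
        }
}

int main(void)
{
        _Atomic unsigned long word = 0xf0f0UL;

        set_bits_masked(&word, 0xffUL, 0x5aUL);
        printf("0x%lx\n", atomic_load(&word));  /* prints 0xf05a */
        return 0;
}
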
@@ -6215,7 +6257,7 @@ static int __alloc_contig_migrate_range(struct compact_control *cc,
6215 cc->nr_migratepages -= nr_reclaimed; 6257 cc->nr_migratepages -= nr_reclaimed;
6216 6258
6217 ret = migrate_pages(&cc->migratepages, alloc_migrate_target, 6259 ret = migrate_pages(&cc->migratepages, alloc_migrate_target,
6218 0, MIGRATE_SYNC, MR_CMA); 6260 NULL, 0, cc->mode, MR_CMA);
6219 } 6261 }
6220 if (ret < 0) { 6262 if (ret < 0) {
6221 putback_movable_pages(&cc->migratepages); 6263 putback_movable_pages(&cc->migratepages);
@@ -6254,7 +6296,7 @@ int alloc_contig_range(unsigned long start, unsigned long end,
6254 .nr_migratepages = 0, 6296 .nr_migratepages = 0,
6255 .order = -1, 6297 .order = -1,
6256 .zone = page_zone(pfn_to_page(start)), 6298 .zone = page_zone(pfn_to_page(start)),
6257 .sync = true, 6299 .mode = MIGRATE_SYNC,
6258 .ignore_skip_hint = true, 6300 .ignore_skip_hint = true,
6259 }; 6301 };
6260 INIT_LIST_HEAD(&cc.migratepages); 6302 INIT_LIST_HEAD(&cc.migratepages);
@@ -6409,7 +6451,7 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
6409{ 6451{
6410 struct page *page; 6452 struct page *page;
6411 struct zone *zone; 6453 struct zone *zone;
6412 int order, i; 6454 unsigned int order, i;
6413 unsigned long pfn; 6455 unsigned long pfn;
6414 unsigned long flags; 6456 unsigned long flags;
6415 /* find the first valid pfn */ 6457 /* find the first valid pfn */
@@ -6461,7 +6503,7 @@ bool is_free_buddy_page(struct page *page)
6461 struct zone *zone = page_zone(page); 6503 struct zone *zone = page_zone(page);
6462 unsigned long pfn = page_to_pfn(page); 6504 unsigned long pfn = page_to_pfn(page);
6463 unsigned long flags; 6505 unsigned long flags;
6464 int order; 6506 unsigned int order;
6465 6507
6466 spin_lock_irqsave(&zone->lock, flags); 6508 spin_lock_irqsave(&zone->lock, flags);
6467 for (order = 0; order < MAX_ORDER; order++) { 6509 for (order = 0; order < MAX_ORDER; order++) {
diff --git a/mm/page_io.c b/mm/page_io.c
index 7c59ef681381..58b50d2901fe 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -248,11 +248,16 @@ out:
248 return ret; 248 return ret;
249} 249}
250 250
251static sector_t swap_page_sector(struct page *page)
252{
253 return (sector_t)__page_file_index(page) << (PAGE_CACHE_SHIFT - 9);
254}
255
251int __swap_writepage(struct page *page, struct writeback_control *wbc, 256int __swap_writepage(struct page *page, struct writeback_control *wbc,
252 void (*end_write_func)(struct bio *, int)) 257 void (*end_write_func)(struct bio *, int))
253{ 258{
254 struct bio *bio; 259 struct bio *bio;
255 int ret = 0, rw = WRITE; 260 int ret, rw = WRITE;
256 struct swap_info_struct *sis = page_swap_info(page); 261 struct swap_info_struct *sis = page_swap_info(page);
257 262
258 if (sis->flags & SWP_FILE) { 263 if (sis->flags & SWP_FILE) {
@@ -297,6 +302,13 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc,
297 return ret; 302 return ret;
298 } 303 }
299 304
305 ret = bdev_write_page(sis->bdev, swap_page_sector(page), page, wbc);
306 if (!ret) {
307 count_vm_event(PSWPOUT);
308 return 0;
309 }
310
311 ret = 0;
300 bio = get_swap_bio(GFP_NOIO, page, end_write_func); 312 bio = get_swap_bio(GFP_NOIO, page, end_write_func);
301 if (bio == NULL) { 313 if (bio == NULL) {
302 set_page_dirty(page); 314 set_page_dirty(page);
@@ -338,6 +350,13 @@ int swap_readpage(struct page *page)
338 return ret; 350 return ret;
339 } 351 }
340 352
353 ret = bdev_read_page(sis->bdev, swap_page_sector(page), page);
354 if (!ret) {
355 count_vm_event(PSWPIN);
356 return 0;
357 }
358
359 ret = 0;
341 bio = get_swap_bio(GFP_KERNEL, page, end_swap_bio_read); 360 bio = get_swap_bio(GFP_KERNEL, page, end_swap_bio_read);
342 if (bio == NULL) { 361 if (bio == NULL) {
343 unlock_page(page); 362 unlock_page(page);
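
An aside on the page_io.c hunks above: bdev_write_page()/bdev_read_page() let block drivers that implement the rw_page operation complete swap I/O without building a bio, and the new swap_page_sector() helper only has to turn the page's index within the swap area into a 512-byte sector number. A small sketch of that shift arithmetic, assuming the common case of 4 KiB pages (PAGE_CACHE_SHIFT == 12); the names are local to the sketch.

#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT   12   /* assumed 4 KiB pages */
#define SECTOR_SHIFT 9    /* 512-byte sectors */

/* Mirror of the swap_page_sector() computation: each page spans
 * 2^(PAGE_SHIFT - SECTOR_SHIFT) sectors, i.e. eight with these values. */
static uint64_t page_index_to_sector(uint64_t index)
{
        return index << (PAGE_SHIFT - SECTOR_SHIFT);
}

int main(void)
{
        printf("page 3 -> sector %llu\n",
               (unsigned long long)page_index_to_sector(3));  /* 24 */
        return 0;
}
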
diff --git a/mm/rmap.c b/mm/rmap.c
index 9c3e77396d1a..ea8e20d75b29 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -103,6 +103,7 @@ static inline void anon_vma_free(struct anon_vma *anon_vma)
103 * LOCK should suffice since the actual taking of the lock must 103 * LOCK should suffice since the actual taking of the lock must
104 * happen _before_ what follows. 104 * happen _before_ what follows.
105 */ 105 */
106 might_sleep();
106 if (rwsem_is_locked(&anon_vma->root->rwsem)) { 107 if (rwsem_is_locked(&anon_vma->root->rwsem)) {
107 anon_vma_lock_write(anon_vma); 108 anon_vma_lock_write(anon_vma);
108 anon_vma_unlock_write(anon_vma); 109 anon_vma_unlock_write(anon_vma);
@@ -426,8 +427,9 @@ struct anon_vma *page_get_anon_vma(struct page *page)
426 * above cannot corrupt). 427 * above cannot corrupt).
427 */ 428 */
428 if (!page_mapped(page)) { 429 if (!page_mapped(page)) {
430 rcu_read_unlock();
429 put_anon_vma(anon_vma); 431 put_anon_vma(anon_vma);
430 anon_vma = NULL; 432 return NULL;
431 } 433 }
432out: 434out:
433 rcu_read_unlock(); 435 rcu_read_unlock();
@@ -477,9 +479,9 @@ struct anon_vma *page_lock_anon_vma_read(struct page *page)
477 } 479 }
478 480
479 if (!page_mapped(page)) { 481 if (!page_mapped(page)) {
482 rcu_read_unlock();
480 put_anon_vma(anon_vma); 483 put_anon_vma(anon_vma);
481 anon_vma = NULL; 484 return NULL;
482 goto out;
483 } 485 }
484 486
485 /* we pinned the anon_vma, its safe to sleep */ 487 /* we pinned the anon_vma, its safe to sleep */
@@ -669,7 +671,7 @@ struct page_referenced_arg {
669/* 671/*
670 * arg: page_referenced_arg will be passed 672 * arg: page_referenced_arg will be passed
671 */ 673 */
672int page_referenced_one(struct page *page, struct vm_area_struct *vma, 674static int page_referenced_one(struct page *page, struct vm_area_struct *vma,
673 unsigned long address, void *arg) 675 unsigned long address, void *arg)
674{ 676{
675 struct mm_struct *mm = vma->vm_mm; 677 struct mm_struct *mm = vma->vm_mm;
@@ -986,6 +988,12 @@ void do_page_add_anon_rmap(struct page *page,
986{ 988{
987 int first = atomic_inc_and_test(&page->_mapcount); 989 int first = atomic_inc_and_test(&page->_mapcount);
988 if (first) { 990 if (first) {
991 /*
992 * We use the irq-unsafe __{inc|mod}_zone_page_stat because
993 * these counters are not modified in interrupt context, and
994 * pte lock(a spinlock) is held, which implies preemption
995 * disabled.
996 */
989 if (PageTransHuge(page)) 997 if (PageTransHuge(page))
990 __inc_zone_page_state(page, 998 __inc_zone_page_state(page,
991 NR_ANON_TRANSPARENT_HUGEPAGES); 999 NR_ANON_TRANSPARENT_HUGEPAGES);
@@ -1024,11 +1032,25 @@ void page_add_new_anon_rmap(struct page *page,
1024 __mod_zone_page_state(page_zone(page), NR_ANON_PAGES, 1032 __mod_zone_page_state(page_zone(page), NR_ANON_PAGES,
1025 hpage_nr_pages(page)); 1033 hpage_nr_pages(page));
1026 __page_set_anon_rmap(page, vma, address, 1); 1034 __page_set_anon_rmap(page, vma, address, 1);
1027 if (!mlocked_vma_newpage(vma, page)) { 1035
1036 VM_BUG_ON_PAGE(PageLRU(page), page);
1037 if (likely((vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) != VM_LOCKED)) {
1028 SetPageActive(page); 1038 SetPageActive(page);
1029 lru_cache_add(page); 1039 lru_cache_add(page);
1030 } else 1040 return;
1031 add_page_to_unevictable_list(page); 1041 }
1042
1043 if (!TestSetPageMlocked(page)) {
1044 /*
1045 * We use the irq-unsafe __mod_zone_page_stat because this
1046 * counter is not modified from interrupt context, and the pte
1047 * lock is held(spinlock), which implies preemption disabled.
1048 */
1049 __mod_zone_page_state(page_zone(page), NR_MLOCK,
1050 hpage_nr_pages(page));
1051 count_vm_event(UNEVICTABLE_PGMLOCKED);
1052 }
1053 add_page_to_unevictable_list(page);
1032} 1054}
1033 1055
1034/** 1056/**
@@ -1077,6 +1099,11 @@ void page_remove_rmap(struct page *page)
1077 /* 1099 /*
1078 * Hugepages are not counted in NR_ANON_PAGES nor NR_FILE_MAPPED 1100 * Hugepages are not counted in NR_ANON_PAGES nor NR_FILE_MAPPED
1079 * and not charged by memcg for now. 1101 * and not charged by memcg for now.
1102 *
1103 * We use the irq-unsafe __{inc|mod}_zone_page_stat because
1104 * these counters are not modified in interrupt context, and
1106 * pte lock(a spinlock) is held, which implies preemption disabled.
1080 */ 1107 */
1081 if (unlikely(PageHuge(page))) 1108 if (unlikely(PageHuge(page)))
1082 goto out; 1109 goto out;
@@ -1112,7 +1139,7 @@ out:
1112/* 1139/*
1113 * @arg: enum ttu_flags will be passed to this argument 1140 * @arg: enum ttu_flags will be passed to this argument
1114 */ 1141 */
1115int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, 1142static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
1116 unsigned long address, void *arg) 1143 unsigned long address, void *arg)
1117{ 1144{
1118 struct mm_struct *mm = vma->vm_mm; 1145 struct mm_struct *mm = vma->vm_mm;
@@ -1135,7 +1162,7 @@ int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
1135 if (vma->vm_flags & VM_LOCKED) 1162 if (vma->vm_flags & VM_LOCKED)
1136 goto out_mlock; 1163 goto out_mlock;
1137 1164
1138 if (TTU_ACTION(flags) == TTU_MUNLOCK) 1165 if (flags & TTU_MUNLOCK)
1139 goto out_unmap; 1166 goto out_unmap;
1140 } 1167 }
1141 if (!(flags & TTU_IGNORE_ACCESS)) { 1168 if (!(flags & TTU_IGNORE_ACCESS)) {
@@ -1203,7 +1230,7 @@ int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
1203 * pte. do_swap_page() will wait until the migration 1230 * pte. do_swap_page() will wait until the migration
1204 * pte is removed and then restart fault handling. 1231 * pte is removed and then restart fault handling.
1205 */ 1232 */
1206 BUG_ON(TTU_ACTION(flags) != TTU_MIGRATION); 1233 BUG_ON(!(flags & TTU_MIGRATION));
1207 entry = make_migration_entry(page, pte_write(pteval)); 1234 entry = make_migration_entry(page, pte_write(pteval));
1208 } 1235 }
1209 swp_pte = swp_entry_to_pte(entry); 1236 swp_pte = swp_entry_to_pte(entry);
@@ -1212,7 +1239,7 @@ int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
1212 set_pte_at(mm, address, pte, swp_pte); 1239 set_pte_at(mm, address, pte, swp_pte);
1213 BUG_ON(pte_file(*pte)); 1240 BUG_ON(pte_file(*pte));
1214 } else if (IS_ENABLED(CONFIG_MIGRATION) && 1241 } else if (IS_ENABLED(CONFIG_MIGRATION) &&
1215 (TTU_ACTION(flags) == TTU_MIGRATION)) { 1242 (flags & TTU_MIGRATION)) {
1216 /* Establish migration entry for a file page */ 1243 /* Establish migration entry for a file page */
1217 swp_entry_t entry; 1244 swp_entry_t entry;
1218 entry = make_migration_entry(page, pte_write(pteval)); 1245 entry = make_migration_entry(page, pte_write(pteval));
@@ -1225,7 +1252,7 @@ int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
1225 1252
1226out_unmap: 1253out_unmap:
1227 pte_unmap_unlock(pte, ptl); 1254 pte_unmap_unlock(pte, ptl);
1228 if (ret != SWAP_FAIL) 1255 if (ret != SWAP_FAIL && !(flags & TTU_MUNLOCK))
1229 mmu_notifier_invalidate_page(mm, address); 1256 mmu_notifier_invalidate_page(mm, address);
1230out: 1257out:
1231 return ret; 1258 return ret;
@@ -1359,7 +1386,7 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount,
1359 if (page->index != linear_page_index(vma, address)) { 1386 if (page->index != linear_page_index(vma, address)) {
1360 pte_t ptfile = pgoff_to_pte(page->index); 1387 pte_t ptfile = pgoff_to_pte(page->index);
1361 if (pte_soft_dirty(pteval)) 1388 if (pte_soft_dirty(pteval))
1362 pte_file_mksoft_dirty(ptfile); 1389 ptfile = pte_file_mksoft_dirty(ptfile);
1363 set_pte_at(mm, address, pte, ptfile); 1390 set_pte_at(mm, address, pte, ptfile);
1364 } 1391 }
1365 1392
@@ -1512,7 +1539,7 @@ int try_to_unmap(struct page *page, enum ttu_flags flags)
1512 * locking requirements of exec(), migration skips 1539 * locking requirements of exec(), migration skips
1513 * temporary VMAs until after exec() completes. 1540 * temporary VMAs until after exec() completes.
1514 */ 1541 */
1515 if (flags & TTU_MIGRATION && !PageKsm(page) && PageAnon(page)) 1542 if ((flags & TTU_MIGRATION) && !PageKsm(page) && PageAnon(page))
1516 rwc.invalid_vma = invalid_migration_vma; 1543 rwc.invalid_vma = invalid_migration_vma;
1517 1544
1518 ret = rmap_walk(page, &rwc); 1545 ret = rmap_walk(page, &rwc);
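
An aside on the rmap.c hunks above: turning the TTU_ACTION(flags) comparisons into plain bitmask tests is what allows checks such as the new "ret != SWAP_FAIL && !(flags & TTU_MUNLOCK)" guard before mmu_notifier_invalidate_page(). A toy illustration of the difference follows; the enum values here belong to this sketch, not to the kernel's enum ttu_flags.

#include <stdio.h>

enum ttu_flags {
        TTU_UNMAP     = 1 << 0,
        TTU_MIGRATION = 1 << 1,
        TTU_MUNLOCK   = 1 << 2,
};

int main(void)
{
        enum ttu_flags flags = TTU_UNMAP | TTU_MIGRATION;

        /* Bit tests compose: several actions can be requested at once and
         * each interested site checks only the bit it cares about. */
        if (flags & TTU_MIGRATION)
                printf("establish a migration entry\n");
        if (!(flags & TTU_MUNLOCK))
                printf("invalidate the mmu notifier range\n");
        return 0;
}
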
diff --git a/mm/shmem.c b/mm/shmem.c
index 9f70e02111c6..5402481c28d1 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1132,7 +1132,7 @@ repeat:
1132 goto decused; 1132 goto decused;
1133 } 1133 }
1134 1134
1135 SetPageSwapBacked(page); 1135 __SetPageSwapBacked(page);
1136 __set_page_locked(page); 1136 __set_page_locked(page);
1137 error = mem_cgroup_charge_file(page, current->mm, 1137 error = mem_cgroup_charge_file(page, current->mm,
1138 gfp & GFP_RECLAIM_MASK); 1138 gfp & GFP_RECLAIM_MASK);
@@ -1372,9 +1372,13 @@ shmem_write_begin(struct file *file, struct address_space *mapping,
1372 loff_t pos, unsigned len, unsigned flags, 1372 loff_t pos, unsigned len, unsigned flags,
1373 struct page **pagep, void **fsdata) 1373 struct page **pagep, void **fsdata)
1374{ 1374{
1375 int ret;
1375 struct inode *inode = mapping->host; 1376 struct inode *inode = mapping->host;
1376 pgoff_t index = pos >> PAGE_CACHE_SHIFT; 1377 pgoff_t index = pos >> PAGE_CACHE_SHIFT;
1377 return shmem_getpage(inode, index, pagep, SGP_WRITE, NULL); 1378 ret = shmem_getpage(inode, index, pagep, SGP_WRITE, NULL);
1379 if (ret == 0 && *pagep)
1380 init_page_accessed(*pagep);
1381 return ret;
1378} 1382}
1379 1383
1380static int 1384static int
diff --git a/mm/slab.c b/mm/slab.c
index 19d92181ce24..9ca3b87edabc 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -1621,10 +1621,16 @@ __initcall(cpucache_init);
1621static noinline void 1621static noinline void
1622slab_out_of_memory(struct kmem_cache *cachep, gfp_t gfpflags, int nodeid) 1622slab_out_of_memory(struct kmem_cache *cachep, gfp_t gfpflags, int nodeid)
1623{ 1623{
1624#if DEBUG
1624 struct kmem_cache_node *n; 1625 struct kmem_cache_node *n;
1625 struct page *page; 1626 struct page *page;
1626 unsigned long flags; 1627 unsigned long flags;
1627 int node; 1628 int node;
1629 static DEFINE_RATELIMIT_STATE(slab_oom_rs, DEFAULT_RATELIMIT_INTERVAL,
1630 DEFAULT_RATELIMIT_BURST);
1631
1632 if ((gfpflags & __GFP_NOWARN) || !__ratelimit(&slab_oom_rs))
1633 return;
1628 1634
1629 printk(KERN_WARNING 1635 printk(KERN_WARNING
1630 "SLAB: Unable to allocate memory on node %d (gfp=0x%x)\n", 1636 "SLAB: Unable to allocate memory on node %d (gfp=0x%x)\n",
@@ -1662,6 +1668,7 @@ slab_out_of_memory(struct kmem_cache *cachep, gfp_t gfpflags, int nodeid)
1662 node, active_slabs, num_slabs, active_objs, num_objs, 1668 node, active_slabs, num_slabs, active_objs, num_objs,
1663 free_objects); 1669 free_objects);
1664 } 1670 }
1671#endif
1665} 1672}
1666 1673
1667/* 1674/*
@@ -1681,10 +1688,13 @@ static struct page *kmem_getpages(struct kmem_cache *cachep, gfp_t flags,
1681 if (cachep->flags & SLAB_RECLAIM_ACCOUNT) 1688 if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
1682 flags |= __GFP_RECLAIMABLE; 1689 flags |= __GFP_RECLAIMABLE;
1683 1690
1691 if (memcg_charge_slab(cachep, flags, cachep->gfporder))
1692 return NULL;
1693
1684 page = alloc_pages_exact_node(nodeid, flags | __GFP_NOTRACK, cachep->gfporder); 1694 page = alloc_pages_exact_node(nodeid, flags | __GFP_NOTRACK, cachep->gfporder);
1685 if (!page) { 1695 if (!page) {
1686 if (!(flags & __GFP_NOWARN) && printk_ratelimit()) 1696 memcg_uncharge_slab(cachep, cachep->gfporder);
1687 slab_out_of_memory(cachep, flags, nodeid); 1697 slab_out_of_memory(cachep, flags, nodeid);
1688 return NULL; 1698 return NULL;
1689 } 1699 }
1690 1700
@@ -1702,7 +1712,6 @@ static struct page *kmem_getpages(struct kmem_cache *cachep, gfp_t flags,
1702 __SetPageSlab(page); 1712 __SetPageSlab(page);
1703 if (page->pfmemalloc) 1713 if (page->pfmemalloc)
1704 SetPageSlabPfmemalloc(page); 1714 SetPageSlabPfmemalloc(page);
1705 memcg_bind_pages(cachep, cachep->gfporder);
1706 1715
1707 if (kmemcheck_enabled && !(cachep->flags & SLAB_NOTRACK)) { 1716 if (kmemcheck_enabled && !(cachep->flags & SLAB_NOTRACK)) {
1708 kmemcheck_alloc_shadow(page, cachep->gfporder, flags, nodeid); 1717 kmemcheck_alloc_shadow(page, cachep->gfporder, flags, nodeid);
@@ -1738,10 +1747,10 @@ static void kmem_freepages(struct kmem_cache *cachep, struct page *page)
1738 page_mapcount_reset(page); 1747 page_mapcount_reset(page);
1739 page->mapping = NULL; 1748 page->mapping = NULL;
1740 1749
1741 memcg_release_pages(cachep, cachep->gfporder);
1742 if (current->reclaim_state) 1750 if (current->reclaim_state)
1743 current->reclaim_state->reclaimed_slab += nr_freed; 1751 current->reclaim_state->reclaimed_slab += nr_freed;
1744 __free_memcg_kmem_pages(page, cachep->gfporder); 1752 __free_pages(page, cachep->gfporder);
1753 memcg_uncharge_slab(cachep, cachep->gfporder);
1745} 1754}
1746 1755
1747static void kmem_rcu_free(struct rcu_head *head) 1756static void kmem_rcu_free(struct rcu_head *head)
@@ -2469,8 +2478,7 @@ out:
2469 return nr_freed; 2478 return nr_freed;
2470} 2479}
2471 2480
2472/* Called with slab_mutex held to protect against cpu hotplug */ 2481int __kmem_cache_shrink(struct kmem_cache *cachep)
2473static int __cache_shrink(struct kmem_cache *cachep)
2474{ 2482{
2475 int ret = 0, i = 0; 2483 int ret = 0, i = 0;
2476 struct kmem_cache_node *n; 2484 struct kmem_cache_node *n;
@@ -2491,32 +2499,11 @@ static int __cache_shrink(struct kmem_cache *cachep)
2491 return (ret ? 1 : 0); 2499 return (ret ? 1 : 0);
2492} 2500}
2493 2501
2494/**
2495 * kmem_cache_shrink - Shrink a cache.
2496 * @cachep: The cache to shrink.
2497 *
2498 * Releases as many slabs as possible for a cache.
2499 * To help debugging, a zero exit status indicates all slabs were released.
2500 */
2501int kmem_cache_shrink(struct kmem_cache *cachep)
2502{
2503 int ret;
2504 BUG_ON(!cachep || in_interrupt());
2505
2506 get_online_cpus();
2507 mutex_lock(&slab_mutex);
2508 ret = __cache_shrink(cachep);
2509 mutex_unlock(&slab_mutex);
2510 put_online_cpus();
2511 return ret;
2512}
2513EXPORT_SYMBOL(kmem_cache_shrink);
2514
2515int __kmem_cache_shutdown(struct kmem_cache *cachep) 2502int __kmem_cache_shutdown(struct kmem_cache *cachep)
2516{ 2503{
2517 int i; 2504 int i;
2518 struct kmem_cache_node *n; 2505 struct kmem_cache_node *n;
2519 int rc = __cache_shrink(cachep); 2506 int rc = __kmem_cache_shrink(cachep);
2520 2507
2521 if (rc) 2508 if (rc)
2522 return rc; 2509 return rc;
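
An aside on the slab.c hunks above: slab_out_of_memory() now rate-limits its own output with a static DEFINE_RATELIMIT_STATE()/__ratelimit() pair and honours __GFP_NOWARN, so kmem_getpages() no longer wraps the call in printk_ratelimit(). The kernel helper tracks both an interval and a burst; the sketch below shows only the simpler interval idea and is not the ratelimit API itself.

#include <stdbool.h>
#include <stdio.h>
#include <time.h>

struct ratelimit {
        time_t last;
        int interval;           /* minimum seconds between messages */
};

/* Return true if a message may be printed now, false to suppress it. */
static bool ratelimit_ok(struct ratelimit *rl)
{
        time_t now = time(NULL);

        if (rl->last && now - rl->last < rl->interval)
                return false;
        rl->last = now;
        return true;
}

int main(void)
{
        static struct ratelimit oom_rs = { .interval = 5 };

        for (int i = 0; i < 3; i++)
                if (ratelimit_ok(&oom_rs))
                        printf("allocation failure warning %d\n", i);
        return 0;               /* only the first message prints */
}
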
diff --git a/mm/slab.h b/mm/slab.h
index 6bd4c353704f..961a3fb1f5a2 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -91,6 +91,7 @@ __kmem_cache_alias(const char *name, size_t size, size_t align,
91#define CACHE_CREATE_MASK (SLAB_CORE_FLAGS | SLAB_DEBUG_FLAGS | SLAB_CACHE_FLAGS) 91#define CACHE_CREATE_MASK (SLAB_CORE_FLAGS | SLAB_DEBUG_FLAGS | SLAB_CACHE_FLAGS)
92 92
93int __kmem_cache_shutdown(struct kmem_cache *); 93int __kmem_cache_shutdown(struct kmem_cache *);
94int __kmem_cache_shrink(struct kmem_cache *);
94void slab_kmem_cache_release(struct kmem_cache *); 95void slab_kmem_cache_release(struct kmem_cache *);
95 96
96struct seq_file; 97struct seq_file;
@@ -120,21 +121,6 @@ static inline bool is_root_cache(struct kmem_cache *s)
120 return !s->memcg_params || s->memcg_params->is_root_cache; 121 return !s->memcg_params || s->memcg_params->is_root_cache;
121} 122}
122 123
123static inline void memcg_bind_pages(struct kmem_cache *s, int order)
124{
125 if (!is_root_cache(s))
126 atomic_add(1 << order, &s->memcg_params->nr_pages);
127}
128
129static inline void memcg_release_pages(struct kmem_cache *s, int order)
130{
131 if (is_root_cache(s))
132 return;
133
134 if (atomic_sub_and_test((1 << order), &s->memcg_params->nr_pages))
135 mem_cgroup_destroy_cache(s);
136}
137
138static inline bool slab_equal_or_root(struct kmem_cache *s, 124static inline bool slab_equal_or_root(struct kmem_cache *s,
139 struct kmem_cache *p) 125 struct kmem_cache *p)
140{ 126{
@@ -192,18 +178,29 @@ static inline struct kmem_cache *memcg_root_cache(struct kmem_cache *s)
192 return s; 178 return s;
193 return s->memcg_params->root_cache; 179 return s->memcg_params->root_cache;
194} 180}
195#else 181
196static inline bool is_root_cache(struct kmem_cache *s) 182static __always_inline int memcg_charge_slab(struct kmem_cache *s,
183 gfp_t gfp, int order)
197{ 184{
198 return true; 185 if (!memcg_kmem_enabled())
186 return 0;
187 if (is_root_cache(s))
188 return 0;
189 return __memcg_charge_slab(s, gfp, order);
199} 190}
200 191
201static inline void memcg_bind_pages(struct kmem_cache *s, int order) 192static __always_inline void memcg_uncharge_slab(struct kmem_cache *s, int order)
202{ 193{
194 if (!memcg_kmem_enabled())
195 return;
196 if (is_root_cache(s))
197 return;
198 __memcg_uncharge_slab(s, order);
203} 199}
204 200#else
205static inline void memcg_release_pages(struct kmem_cache *s, int order) 201static inline bool is_root_cache(struct kmem_cache *s)
206{ 202{
203 return true;
207} 204}
208 205
209static inline bool slab_equal_or_root(struct kmem_cache *s, 206static inline bool slab_equal_or_root(struct kmem_cache *s,
@@ -227,6 +224,15 @@ static inline struct kmem_cache *memcg_root_cache(struct kmem_cache *s)
227{ 224{
228 return s; 225 return s;
229} 226}
227
228static inline int memcg_charge_slab(struct kmem_cache *s, gfp_t gfp, int order)
229{
230 return 0;
231}
232
233static inline void memcg_uncharge_slab(struct kmem_cache *s, int order)
234{
235}
230#endif 236#endif
231 237
232static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x) 238static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x)
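
An aside on the slab.h hunk above: memcg_charge_slab()/memcg_uncharge_slab() are inline wrappers that fall through when kmemcg is disabled or the cache is a root cache; with them, both SLAB and SLUB charge the memory cgroup before allocating a slab page and drop the charge again when the page is freed or the allocation fails. A schematic of that ordering with hypothetical helper names, not the kernel API:

#include <stdbool.h>
#include <stdlib.h>

/* Stand-ins for the accounting layer; they always succeed in this sketch. */
static bool charge(size_t bytes)   { (void)bytes; return true; }
static void uncharge(size_t bytes) { (void)bytes; }

/* Charge first; if the real allocation then fails, hand the charge back. */
static void *alloc_charged(size_t bytes)
{
        if (!charge(bytes))
                return NULL;     /* over budget: fail before allocating */

        void *p = malloc(bytes);
        if (!p)
                uncharge(bytes); /* allocation failed: roll back */
        return p;
}

static void free_charged(void *p, size_t bytes)
{
        free(p);
        uncharge(bytes);
}

int main(void)
{
        void *p = alloc_charged(4096);

        free_charged(p, 4096);  /* free(NULL) is harmless if alloc failed */
        return 0;
}
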
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 102cc6fca3d3..735e01a0db6f 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -160,7 +160,6 @@ do_kmem_cache_create(char *name, size_t object_size, size_t size, size_t align,
160 160
161 s->refcount = 1; 161 s->refcount = 1;
162 list_add(&s->list, &slab_caches); 162 list_add(&s->list, &slab_caches);
163 memcg_register_cache(s);
164out: 163out:
165 if (err) 164 if (err)
166 return ERR_PTR(err); 165 return ERR_PTR(err);
@@ -205,6 +204,8 @@ kmem_cache_create(const char *name, size_t size, size_t align,
205 int err; 204 int err;
206 205
207 get_online_cpus(); 206 get_online_cpus();
207 get_online_mems();
208
208 mutex_lock(&slab_mutex); 209 mutex_lock(&slab_mutex);
209 210
210 err = kmem_cache_sanity_check(name, size); 211 err = kmem_cache_sanity_check(name, size);
@@ -239,6 +240,8 @@ kmem_cache_create(const char *name, size_t size, size_t align,
239 240
240out_unlock: 241out_unlock:
241 mutex_unlock(&slab_mutex); 242 mutex_unlock(&slab_mutex);
243
244 put_online_mems();
242 put_online_cpus(); 245 put_online_cpus();
243 246
244 if (err) { 247 if (err) {
@@ -258,31 +261,29 @@ EXPORT_SYMBOL(kmem_cache_create);
258 261
259#ifdef CONFIG_MEMCG_KMEM 262#ifdef CONFIG_MEMCG_KMEM
260/* 263/*
261 * kmem_cache_create_memcg - Create a cache for a memory cgroup. 264 * memcg_create_kmem_cache - Create a cache for a memory cgroup.
262 * @memcg: The memory cgroup the new cache is for. 265 * @memcg: The memory cgroup the new cache is for.
263 * @root_cache: The parent of the new cache. 266 * @root_cache: The parent of the new cache.
267 * @memcg_name: The name of the memory cgroup (used for naming the new cache).
264 * 268 *
265 * This function attempts to create a kmem cache that will serve allocation 269 * This function attempts to create a kmem cache that will serve allocation
266 * requests going from @memcg to @root_cache. The new cache inherits properties 270 * requests going from @memcg to @root_cache. The new cache inherits properties
267 * from its parent. 271 * from its parent.
268 */ 272 */
269void kmem_cache_create_memcg(struct mem_cgroup *memcg, struct kmem_cache *root_cache) 273struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg,
274 struct kmem_cache *root_cache,
275 const char *memcg_name)
270{ 276{
271 struct kmem_cache *s; 277 struct kmem_cache *s = NULL;
272 char *cache_name; 278 char *cache_name;
273 279
274 get_online_cpus(); 280 get_online_cpus();
275 mutex_lock(&slab_mutex); 281 get_online_mems();
276 282
277 /* 283 mutex_lock(&slab_mutex);
278 * Since per-memcg caches are created asynchronously on first
279 * allocation (see memcg_kmem_get_cache()), several threads can try to
280 * create the same cache, but only one of them may succeed.
281 */
282 if (cache_from_memcg_idx(root_cache, memcg_cache_id(memcg)))
283 goto out_unlock;
284 284
285 cache_name = memcg_create_cache_name(memcg, root_cache); 285 cache_name = kasprintf(GFP_KERNEL, "%s(%d:%s)", root_cache->name,
286 memcg_cache_id(memcg), memcg_name);
286 if (!cache_name) 287 if (!cache_name)
287 goto out_unlock; 288 goto out_unlock;
288 289
@@ -292,17 +293,19 @@ void kmem_cache_create_memcg(struct mem_cgroup *memcg, struct kmem_cache *root_c
292 memcg, root_cache); 293 memcg, root_cache);
293 if (IS_ERR(s)) { 294 if (IS_ERR(s)) {
294 kfree(cache_name); 295 kfree(cache_name);
295 goto out_unlock; 296 s = NULL;
296 } 297 }
297 298
298 s->allocflags |= __GFP_KMEMCG;
299
300out_unlock: 299out_unlock:
301 mutex_unlock(&slab_mutex); 300 mutex_unlock(&slab_mutex);
301
302 put_online_mems();
302 put_online_cpus(); 303 put_online_cpus();
304
305 return s;
303} 306}
304 307
305static int kmem_cache_destroy_memcg_children(struct kmem_cache *s) 308static int memcg_cleanup_cache_params(struct kmem_cache *s)
306{ 309{
307 int rc; 310 int rc;
308 311
@@ -311,13 +314,13 @@ static int kmem_cache_destroy_memcg_children(struct kmem_cache *s)
311 return 0; 314 return 0;
312 315
313 mutex_unlock(&slab_mutex); 316 mutex_unlock(&slab_mutex);
314 rc = __kmem_cache_destroy_memcg_children(s); 317 rc = __memcg_cleanup_cache_params(s);
315 mutex_lock(&slab_mutex); 318 mutex_lock(&slab_mutex);
316 319
317 return rc; 320 return rc;
318} 321}
319#else 322#else
320static int kmem_cache_destroy_memcg_children(struct kmem_cache *s) 323static int memcg_cleanup_cache_params(struct kmem_cache *s)
321{ 324{
322 return 0; 325 return 0;
323} 326}
@@ -332,27 +335,26 @@ void slab_kmem_cache_release(struct kmem_cache *s)
332void kmem_cache_destroy(struct kmem_cache *s) 335void kmem_cache_destroy(struct kmem_cache *s)
333{ 336{
334 get_online_cpus(); 337 get_online_cpus();
338 get_online_mems();
339
335 mutex_lock(&slab_mutex); 340 mutex_lock(&slab_mutex);
336 341
337 s->refcount--; 342 s->refcount--;
338 if (s->refcount) 343 if (s->refcount)
339 goto out_unlock; 344 goto out_unlock;
340 345
341 if (kmem_cache_destroy_memcg_children(s) != 0) 346 if (memcg_cleanup_cache_params(s) != 0)
342 goto out_unlock; 347 goto out_unlock;
343 348
344 list_del(&s->list);
345 memcg_unregister_cache(s);
346
347 if (__kmem_cache_shutdown(s) != 0) { 349 if (__kmem_cache_shutdown(s) != 0) {
348 list_add(&s->list, &slab_caches);
349 memcg_register_cache(s);
350 printk(KERN_ERR "kmem_cache_destroy %s: " 350 printk(KERN_ERR "kmem_cache_destroy %s: "
351 "Slab cache still has objects\n", s->name); 351 "Slab cache still has objects\n", s->name);
352 dump_stack(); 352 dump_stack();
353 goto out_unlock; 353 goto out_unlock;
354 } 354 }
355 355
356 list_del(&s->list);
357
356 mutex_unlock(&slab_mutex); 358 mutex_unlock(&slab_mutex);
357 if (s->flags & SLAB_DESTROY_BY_RCU) 359 if (s->flags & SLAB_DESTROY_BY_RCU)
358 rcu_barrier(); 360 rcu_barrier();
@@ -363,15 +365,36 @@ void kmem_cache_destroy(struct kmem_cache *s)
363#else 365#else
364 slab_kmem_cache_release(s); 366 slab_kmem_cache_release(s);
365#endif 367#endif
366 goto out_put_cpus; 368 goto out;
367 369
368out_unlock: 370out_unlock:
369 mutex_unlock(&slab_mutex); 371 mutex_unlock(&slab_mutex);
370out_put_cpus: 372out:
373 put_online_mems();
371 put_online_cpus(); 374 put_online_cpus();
372} 375}
373EXPORT_SYMBOL(kmem_cache_destroy); 376EXPORT_SYMBOL(kmem_cache_destroy);
374 377
378/**
379 * kmem_cache_shrink - Shrink a cache.
380 * @cachep: The cache to shrink.
381 *
382 * Releases as many slabs as possible for a cache.
383 * To help debugging, a zero exit status indicates all slabs were released.
384 */
385int kmem_cache_shrink(struct kmem_cache *cachep)
386{
387 int ret;
388
389 get_online_cpus();
390 get_online_mems();
391 ret = __kmem_cache_shrink(cachep);
392 put_online_mems();
393 put_online_cpus();
394 return ret;
395}
396EXPORT_SYMBOL(kmem_cache_shrink);
397
375int slab_is_available(void) 398int slab_is_available(void)
376{ 399{
377 return slab_state >= UP; 400 return slab_state >= UP;
@@ -586,6 +609,24 @@ void __init create_kmalloc_caches(unsigned long flags)
586} 609}
587#endif /* !CONFIG_SLOB */ 610#endif /* !CONFIG_SLOB */
588 611
612/*
613 * To avoid unnecessary overhead, we pass through large allocation requests
614 * directly to the page allocator. We use __GFP_COMP, because we will need to
615 * know the allocation order to free the pages properly in kfree.
616 */
617void *kmalloc_order(size_t size, gfp_t flags, unsigned int order)
618{
619 void *ret;
620 struct page *page;
621
622 flags |= __GFP_COMP;
623 page = alloc_kmem_pages(flags, order);
624 ret = page ? page_address(page) : NULL;
625 kmemleak_alloc(ret, size, 1, flags);
626 return ret;
627}
628EXPORT_SYMBOL(kmalloc_order);
629
589#ifdef CONFIG_TRACING 630#ifdef CONFIG_TRACING
590void *kmalloc_order_trace(size_t size, gfp_t flags, unsigned int order) 631void *kmalloc_order_trace(size_t size, gfp_t flags, unsigned int order)
591{ 632{
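
An aside on the slab_common.c hunk above: kmalloc_order() is now the shared large-allocation path; the request bypasses the slab caches and goes straight to the page allocator with __GFP_COMP, so kfree() can later recover the order from the compound page head. The sketch below only shows how a byte count maps to an allocation order, assuming 4 KiB pages; it is not the kernel's get_order() implementation.

#include <stdio.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE  (1UL << PAGE_SHIFT)

/* Smallest order such that (1 << order) pages cover 'size' bytes. */
static unsigned int size_to_order(unsigned long size)
{
        unsigned long pages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
        unsigned int order = 0;

        while ((1UL << order) < pages)
                order++;
        return order;
}

int main(void)
{
        printf("16 KiB -> order %u\n", size_to_order(16 * 1024)); /* 2 */
        printf("20 KiB -> order %u\n", size_to_order(20 * 1024)); /* 3 */
        return 0;
}
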
diff --git a/mm/slob.c b/mm/slob.c
index 730cad45d4be..21980e0f39a8 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -620,11 +620,10 @@ int __kmem_cache_shutdown(struct kmem_cache *c)
620 return 0; 620 return 0;
621} 621}
622 622
623int kmem_cache_shrink(struct kmem_cache *d) 623int __kmem_cache_shrink(struct kmem_cache *d)
624{ 624{
625 return 0; 625 return 0;
626} 626}
627EXPORT_SYMBOL(kmem_cache_shrink);
628 627
629struct kmem_cache kmem_cache_boot = { 628struct kmem_cache kmem_cache_boot = {
630 .name = "kmem_cache", 629 .name = "kmem_cache",
diff --git a/mm/slub.c b/mm/slub.c
index 2b1ce697fc4b..fdf0fe4da9a9 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -403,7 +403,7 @@ static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct page *page
403 stat(s, CMPXCHG_DOUBLE_FAIL); 403 stat(s, CMPXCHG_DOUBLE_FAIL);
404 404
405#ifdef SLUB_DEBUG_CMPXCHG 405#ifdef SLUB_DEBUG_CMPXCHG
406 printk(KERN_INFO "%s %s: cmpxchg double redo ", n, s->name); 406 pr_info("%s %s: cmpxchg double redo ", n, s->name);
407#endif 407#endif
408 408
409 return 0; 409 return 0;
@@ -444,7 +444,7 @@ static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page,
444 stat(s, CMPXCHG_DOUBLE_FAIL); 444 stat(s, CMPXCHG_DOUBLE_FAIL);
445 445
446#ifdef SLUB_DEBUG_CMPXCHG 446#ifdef SLUB_DEBUG_CMPXCHG
447 printk(KERN_INFO "%s %s: cmpxchg double redo ", n, s->name); 447 pr_info("%s %s: cmpxchg double redo ", n, s->name);
448#endif 448#endif
449 449
450 return 0; 450 return 0;
@@ -546,14 +546,14 @@ static void print_track(const char *s, struct track *t)
546 if (!t->addr) 546 if (!t->addr)
547 return; 547 return;
548 548
549 printk(KERN_ERR "INFO: %s in %pS age=%lu cpu=%u pid=%d\n", 549 pr_err("INFO: %s in %pS age=%lu cpu=%u pid=%d\n",
550 s, (void *)t->addr, jiffies - t->when, t->cpu, t->pid); 550 s, (void *)t->addr, jiffies - t->when, t->cpu, t->pid);
551#ifdef CONFIG_STACKTRACE 551#ifdef CONFIG_STACKTRACE
552 { 552 {
553 int i; 553 int i;
554 for (i = 0; i < TRACK_ADDRS_COUNT; i++) 554 for (i = 0; i < TRACK_ADDRS_COUNT; i++)
555 if (t->addrs[i]) 555 if (t->addrs[i])
556 printk(KERN_ERR "\t%pS\n", (void *)t->addrs[i]); 556 pr_err("\t%pS\n", (void *)t->addrs[i]);
557 else 557 else
558 break; 558 break;
559 } 559 }
@@ -571,38 +571,37 @@ static void print_tracking(struct kmem_cache *s, void *object)
571 571
572static void print_page_info(struct page *page) 572static void print_page_info(struct page *page)
573{ 573{
574 printk(KERN_ERR 574 pr_err("INFO: Slab 0x%p objects=%u used=%u fp=0x%p flags=0x%04lx\n",
575 "INFO: Slab 0x%p objects=%u used=%u fp=0x%p flags=0x%04lx\n",
576 page, page->objects, page->inuse, page->freelist, page->flags); 575 page, page->objects, page->inuse, page->freelist, page->flags);
577 576
578} 577}
579 578
580static void slab_bug(struct kmem_cache *s, char *fmt, ...) 579static void slab_bug(struct kmem_cache *s, char *fmt, ...)
581{ 580{
581 struct va_format vaf;
582 va_list args; 582 va_list args;
583 char buf[100];
584 583
585 va_start(args, fmt); 584 va_start(args, fmt);
586 vsnprintf(buf, sizeof(buf), fmt, args); 585 vaf.fmt = fmt;
587 va_end(args); 586 vaf.va = &args;
588 printk(KERN_ERR "========================================" 587 pr_err("=============================================================================\n");
589 "=====================================\n"); 588 pr_err("BUG %s (%s): %pV\n", s->name, print_tainted(), &vaf);
590 printk(KERN_ERR "BUG %s (%s): %s\n", s->name, print_tainted(), buf); 589 pr_err("-----------------------------------------------------------------------------\n\n");
591 printk(KERN_ERR "----------------------------------------"
592 "-------------------------------------\n\n");
593 590
594 add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); 591 add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
592 va_end(args);
595} 593}
596 594
597static void slab_fix(struct kmem_cache *s, char *fmt, ...) 595static void slab_fix(struct kmem_cache *s, char *fmt, ...)
598{ 596{
597 struct va_format vaf;
599 va_list args; 598 va_list args;
600 char buf[100];
601 599
602 va_start(args, fmt); 600 va_start(args, fmt);
603 vsnprintf(buf, sizeof(buf), fmt, args); 601 vaf.fmt = fmt;
602 vaf.va = &args;
603 pr_err("FIX %s: %pV\n", s->name, &vaf);
604 va_end(args); 604 va_end(args);
605 printk(KERN_ERR "FIX %s: %s\n", s->name, buf);
606} 605}
607 606
608static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p) 607static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p)
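
An aside on the slub.c hunk above: converting slab_bug()/slab_fix() to %pV and struct va_format hands the format string and va_list straight through to a single pr_err() call, instead of pre-formatting into a 100-byte stack buffer that could truncate the message. A userspace sketch of the same forwarding idea, with vfprintf() standing in for pr_err() and %pV:

#include <stdarg.h>
#include <stdio.h>

/* Forward a variadic format to one printing call, with no intermediate
 * fixed-size buffer. */
static void slab_bug_like(const char *cache, const char *fmt, ...)
{
        va_list args;

        va_start(args, fmt);
        fprintf(stderr, "BUG %s: ", cache);
        vfprintf(stderr, fmt, args);
        fputc('\n', stderr);
        va_end(args);
}

int main(void)
{
        slab_bug_like("kmalloc-64", "%s overwritten at offset %d",
                      "Redzone", 48);
        return 0;
}
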
@@ -614,8 +613,8 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p)
614 613
615 print_page_info(page); 614 print_page_info(page);
616 615
617 printk(KERN_ERR "INFO: Object 0x%p @offset=%tu fp=0x%p\n\n", 616 pr_err("INFO: Object 0x%p @offset=%tu fp=0x%p\n\n",
618 p, p - addr, get_freepointer(s, p)); 617 p, p - addr, get_freepointer(s, p));
619 618
620 if (p > addr + 16) 619 if (p > addr + 16)
621 print_section("Bytes b4 ", p - 16, 16); 620 print_section("Bytes b4 ", p - 16, 16);
@@ -698,7 +697,7 @@ static int check_bytes_and_report(struct kmem_cache *s, struct page *page,
698 end--; 697 end--;
699 698
700 slab_bug(s, "%s overwritten", what); 699 slab_bug(s, "%s overwritten", what);
701 printk(KERN_ERR "INFO: 0x%p-0x%p. First byte 0x%x instead of 0x%x\n", 700 pr_err("INFO: 0x%p-0x%p. First byte 0x%x instead of 0x%x\n",
702 fault, end - 1, fault[0], value); 701 fault, end - 1, fault[0], value);
703 print_trailer(s, page, object); 702 print_trailer(s, page, object);
704 703
@@ -931,7 +930,7 @@ static void trace(struct kmem_cache *s, struct page *page, void *object,
931 int alloc) 930 int alloc)
932{ 931{
933 if (s->flags & SLAB_TRACE) { 932 if (s->flags & SLAB_TRACE) {
934 printk(KERN_INFO "TRACE %s %s 0x%p inuse=%d fp=0x%p\n", 933 pr_info("TRACE %s %s 0x%p inuse=%d fp=0x%p\n",
935 s->name, 934 s->name,
936 alloc ? "alloc" : "free", 935 alloc ? "alloc" : "free",
937 object, page->inuse, 936 object, page->inuse,
@@ -1134,9 +1133,8 @@ static noinline struct kmem_cache_node *free_debug_processing(
1134 slab_err(s, page, "Attempt to free object(0x%p) " 1133 slab_err(s, page, "Attempt to free object(0x%p) "
1135 "outside of slab", object); 1134 "outside of slab", object);
1136 } else if (!page->slab_cache) { 1135 } else if (!page->slab_cache) {
1137 printk(KERN_ERR 1136 pr_err("SLUB <none>: no slab for object 0x%p.\n",
1138 "SLUB <none>: no slab for object 0x%p.\n", 1137 object);
1139 object);
1140 dump_stack(); 1138 dump_stack();
1141 } else 1139 } else
1142 object_err(s, page, object, 1140 object_err(s, page, object,
@@ -1219,8 +1217,8 @@ static int __init setup_slub_debug(char *str)
1219 slub_debug |= SLAB_FAILSLAB; 1217 slub_debug |= SLAB_FAILSLAB;
1220 break; 1218 break;
1221 default: 1219 default:
1222 printk(KERN_ERR "slub_debug option '%c' " 1220 pr_err("slub_debug option '%c' unknown. skipped\n",
1223 "unknown. skipped\n", *str); 1221 *str);
1224 } 1222 }
1225 } 1223 }
1226 1224
@@ -1314,17 +1312,26 @@ static inline void slab_free_hook(struct kmem_cache *s, void *x)
1314/* 1312/*
1315 * Slab allocation and freeing 1313 * Slab allocation and freeing
1316 */ 1314 */
1317static inline struct page *alloc_slab_page(gfp_t flags, int node, 1315static inline struct page *alloc_slab_page(struct kmem_cache *s,
1318 struct kmem_cache_order_objects oo) 1316 gfp_t flags, int node, struct kmem_cache_order_objects oo)
1319{ 1317{
1318 struct page *page;
1320 int order = oo_order(oo); 1319 int order = oo_order(oo);
1321 1320
1322 flags |= __GFP_NOTRACK; 1321 flags |= __GFP_NOTRACK;
1323 1322
1323 if (memcg_charge_slab(s, flags, order))
1324 return NULL;
1325
1324 if (node == NUMA_NO_NODE) 1326 if (node == NUMA_NO_NODE)
1325 return alloc_pages(flags, order); 1327 page = alloc_pages(flags, order);
1326 else 1328 else
1327 return alloc_pages_exact_node(node, flags, order); 1329 page = alloc_pages_exact_node(node, flags, order);
1330
1331 if (!page)
1332 memcg_uncharge_slab(s, order);
1333
1334 return page;
1328} 1335}
1329 1336
1330static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) 1337static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
@@ -1346,7 +1353,7 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
1346 */ 1353 */
1347 alloc_gfp = (flags | __GFP_NOWARN | __GFP_NORETRY) & ~__GFP_NOFAIL; 1354 alloc_gfp = (flags | __GFP_NOWARN | __GFP_NORETRY) & ~__GFP_NOFAIL;
1348 1355
1349 page = alloc_slab_page(alloc_gfp, node, oo); 1356 page = alloc_slab_page(s, alloc_gfp, node, oo);
1350 if (unlikely(!page)) { 1357 if (unlikely(!page)) {
1351 oo = s->min; 1358 oo = s->min;
1352 alloc_gfp = flags; 1359 alloc_gfp = flags;
@@ -1354,7 +1361,7 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
1354 * Allocation may have failed due to fragmentation. 1361 * Allocation may have failed due to fragmentation.
1355 * Try a lower order alloc if possible 1362 * Try a lower order alloc if possible
1356 */ 1363 */
1357 page = alloc_slab_page(alloc_gfp, node, oo); 1364 page = alloc_slab_page(s, alloc_gfp, node, oo);
1358 1365
1359 if (page) 1366 if (page)
1360 stat(s, ORDER_FALLBACK); 1367 stat(s, ORDER_FALLBACK);
@@ -1415,7 +1422,6 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
1415 1422
1416 order = compound_order(page); 1423 order = compound_order(page);
1417 inc_slabs_node(s, page_to_nid(page), page->objects); 1424 inc_slabs_node(s, page_to_nid(page), page->objects);
1418 memcg_bind_pages(s, order);
1419 page->slab_cache = s; 1425 page->slab_cache = s;
1420 __SetPageSlab(page); 1426 __SetPageSlab(page);
1421 if (page->pfmemalloc) 1427 if (page->pfmemalloc)
@@ -1466,11 +1472,11 @@ static void __free_slab(struct kmem_cache *s, struct page *page)
1466 __ClearPageSlabPfmemalloc(page); 1472 __ClearPageSlabPfmemalloc(page);
1467 __ClearPageSlab(page); 1473 __ClearPageSlab(page);
1468 1474
1469 memcg_release_pages(s, order);
1470 page_mapcount_reset(page); 1475 page_mapcount_reset(page);
1471 if (current->reclaim_state) 1476 if (current->reclaim_state)
1472 current->reclaim_state->reclaimed_slab += pages; 1477 current->reclaim_state->reclaimed_slab += pages;
1473 __free_memcg_kmem_pages(page, order); 1478 __free_pages(page, order);
1479 memcg_uncharge_slab(s, order);
1474} 1480}
1475 1481
1476#define need_reserve_slab_rcu \ 1482#define need_reserve_slab_rcu \
@@ -1770,19 +1776,19 @@ static inline void note_cmpxchg_failure(const char *n,
1770#ifdef SLUB_DEBUG_CMPXCHG 1776#ifdef SLUB_DEBUG_CMPXCHG
1771 unsigned long actual_tid = __this_cpu_read(s->cpu_slab->tid); 1777 unsigned long actual_tid = __this_cpu_read(s->cpu_slab->tid);
1772 1778
1773 printk(KERN_INFO "%s %s: cmpxchg redo ", n, s->name); 1779 pr_info("%s %s: cmpxchg redo ", n, s->name);
1774 1780
1775#ifdef CONFIG_PREEMPT 1781#ifdef CONFIG_PREEMPT
1776 if (tid_to_cpu(tid) != tid_to_cpu(actual_tid)) 1782 if (tid_to_cpu(tid) != tid_to_cpu(actual_tid))
1777 printk("due to cpu change %d -> %d\n", 1783 pr_warn("due to cpu change %d -> %d\n",
1778 tid_to_cpu(tid), tid_to_cpu(actual_tid)); 1784 tid_to_cpu(tid), tid_to_cpu(actual_tid));
1779 else 1785 else
1780#endif 1786#endif
1781 if (tid_to_event(tid) != tid_to_event(actual_tid)) 1787 if (tid_to_event(tid) != tid_to_event(actual_tid))
1782 printk("due to cpu running other code. Event %ld->%ld\n", 1788 pr_warn("due to cpu running other code. Event %ld->%ld\n",
1783 tid_to_event(tid), tid_to_event(actual_tid)); 1789 tid_to_event(tid), tid_to_event(actual_tid));
1784 else 1790 else
1785 printk("for unknown reason: actual=%lx was=%lx target=%lx\n", 1791 pr_warn("for unknown reason: actual=%lx was=%lx target=%lx\n",
1786 actual_tid, tid, next_tid(tid)); 1792 actual_tid, tid, next_tid(tid));
1787#endif 1793#endif
1788 stat(s, CMPXCHG_DOUBLE_CPU_FAIL); 1794 stat(s, CMPXCHG_DOUBLE_CPU_FAIL);
@@ -2121,11 +2127,19 @@ static inline int node_match(struct page *page, int node)
2121 return 1; 2127 return 1;
2122} 2128}
2123 2129
2130#ifdef CONFIG_SLUB_DEBUG
2124static int count_free(struct page *page) 2131static int count_free(struct page *page)
2125{ 2132{
2126 return page->objects - page->inuse; 2133 return page->objects - page->inuse;
2127} 2134}
2128 2135
2136static inline unsigned long node_nr_objs(struct kmem_cache_node *n)
2137{
2138 return atomic_long_read(&n->total_objects);
2139}
2140#endif /* CONFIG_SLUB_DEBUG */
2141
2142#if defined(CONFIG_SLUB_DEBUG) || defined(CONFIG_SYSFS)
2129static unsigned long count_partial(struct kmem_cache_node *n, 2143static unsigned long count_partial(struct kmem_cache_node *n,
2130 int (*get_count)(struct page *)) 2144 int (*get_count)(struct page *))
2131{ 2145{
@@ -2139,31 +2153,28 @@ static unsigned long count_partial(struct kmem_cache_node *n,
2139 spin_unlock_irqrestore(&n->list_lock, flags); 2153 spin_unlock_irqrestore(&n->list_lock, flags);
2140 return x; 2154 return x;
2141} 2155}
2142 2156#endif /* CONFIG_SLUB_DEBUG || CONFIG_SYSFS */
2143static inline unsigned long node_nr_objs(struct kmem_cache_node *n)
2144{
2145#ifdef CONFIG_SLUB_DEBUG
2146 return atomic_long_read(&n->total_objects);
2147#else
2148 return 0;
2149#endif
2150}
2151 2157
2152static noinline void 2158static noinline void
2153slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid) 2159slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid)
2154{ 2160{
2161#ifdef CONFIG_SLUB_DEBUG
2162 static DEFINE_RATELIMIT_STATE(slub_oom_rs, DEFAULT_RATELIMIT_INTERVAL,
2163 DEFAULT_RATELIMIT_BURST);
2155 int node; 2164 int node;
2156 2165
2157 printk(KERN_WARNING 2166 if ((gfpflags & __GFP_NOWARN) || !__ratelimit(&slub_oom_rs))
2158 "SLUB: Unable to allocate memory on node %d (gfp=0x%x)\n", 2167 return;
2168
2169 pr_warn("SLUB: Unable to allocate memory on node %d (gfp=0x%x)\n",
2159 nid, gfpflags); 2170 nid, gfpflags);
2160 printk(KERN_WARNING " cache: %s, object size: %d, buffer size: %d, " 2171 pr_warn(" cache: %s, object size: %d, buffer size: %d, default order: %d, min order: %d\n",
2161 "default order: %d, min order: %d\n", s->name, s->object_size, 2172 s->name, s->object_size, s->size, oo_order(s->oo),
2162 s->size, oo_order(s->oo), oo_order(s->min)); 2173 oo_order(s->min));
2163 2174
2164 if (oo_order(s->min) > get_order(s->object_size)) 2175 if (oo_order(s->min) > get_order(s->object_size))
2165 printk(KERN_WARNING " %s debugging increased min order, use " 2176 pr_warn(" %s debugging increased min order, use slub_debug=O to disable.\n",
2166 "slub_debug=O to disable.\n", s->name); 2177 s->name);
2167 2178
2168 for_each_online_node(node) { 2179 for_each_online_node(node) {
2169 struct kmem_cache_node *n = get_node(s, node); 2180 struct kmem_cache_node *n = get_node(s, node);
@@ -2178,10 +2189,10 @@ slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid)
2178 nr_slabs = node_nr_slabs(n); 2189 nr_slabs = node_nr_slabs(n);
2179 nr_objs = node_nr_objs(n); 2190 nr_objs = node_nr_objs(n);
2180 2191
2181 printk(KERN_WARNING 2192 pr_warn(" node %d: slabs: %ld, objs: %ld, free: %ld\n",
2182 " node %d: slabs: %ld, objs: %ld, free: %ld\n",
2183 node, nr_slabs, nr_objs, nr_free); 2193 node, nr_slabs, nr_objs, nr_free);
2184 } 2194 }
2195#endif
2185} 2196}
2186 2197
2187static inline void *new_slab_objects(struct kmem_cache *s, gfp_t flags, 2198static inline void *new_slab_objects(struct kmem_cache *s, gfp_t flags,
@@ -2198,7 +2209,7 @@ static inline void *new_slab_objects(struct kmem_cache *s, gfp_t flags,
2198 2209
2199 page = new_slab(s, flags, node); 2210 page = new_slab(s, flags, node);
2200 if (page) { 2211 if (page) {
2201 c = __this_cpu_ptr(s->cpu_slab); 2212 c = raw_cpu_ptr(s->cpu_slab);
2202 if (c->page) 2213 if (c->page)
2203 flush_slab(s, c); 2214 flush_slab(s, c);
2204 2215
@@ -2323,8 +2334,6 @@ redo:
2323 if (freelist) 2334 if (freelist)
2324 goto load_freelist; 2335 goto load_freelist;
2325 2336
2326 stat(s, ALLOC_SLOWPATH);
2327
2328 freelist = get_freelist(s, page); 2337 freelist = get_freelist(s, page);
2329 2338
2330 if (!freelist) { 2339 if (!freelist) {
@@ -2360,9 +2369,7 @@ new_slab:
2360 freelist = new_slab_objects(s, gfpflags, node, &c); 2369 freelist = new_slab_objects(s, gfpflags, node, &c);
2361 2370
2362 if (unlikely(!freelist)) { 2371 if (unlikely(!freelist)) {
2363 if (!(gfpflags & __GFP_NOWARN) && printk_ratelimit()) 2372 slab_out_of_memory(s, gfpflags, node);
2364 slab_out_of_memory(s, gfpflags, node);
2365
2366 local_irq_restore(flags); 2373 local_irq_restore(flags);
2367 return NULL; 2374 return NULL;
2368 } 2375 }
@@ -2418,7 +2425,7 @@ redo:
2418 * and the retrieval of the tid. 2425 * and the retrieval of the tid.
2419 */ 2426 */
2420 preempt_disable(); 2427 preempt_disable();
2421 c = __this_cpu_ptr(s->cpu_slab); 2428 c = this_cpu_ptr(s->cpu_slab);
2422 2429
2423 /* 2430 /*
2424 * The transaction ids are globally unique per cpu and per operation on 2431 * The transaction ids are globally unique per cpu and per operation on
@@ -2431,10 +2438,10 @@ redo:
2431 2438
2432 object = c->freelist; 2439 object = c->freelist;
2433 page = c->page; 2440 page = c->page;
2434 if (unlikely(!object || !node_match(page, node))) 2441 if (unlikely(!object || !node_match(page, node))) {
2435 object = __slab_alloc(s, gfpflags, node, addr, c); 2442 object = __slab_alloc(s, gfpflags, node, addr, c);
2436 2443 stat(s, ALLOC_SLOWPATH);
2437 else { 2444 } else {
2438 void *next_object = get_freepointer_safe(s, object); 2445 void *next_object = get_freepointer_safe(s, object);
2439 2446
2440 /* 2447 /*
@@ -2674,7 +2681,7 @@ redo:
2674 * during the cmpxchg then the free will succeed. 2681 * during the cmpxchg then the free will succeed.
2675 */ 2682 */
2676 preempt_disable(); 2683 preempt_disable();
2677 c = __this_cpu_ptr(s->cpu_slab); 2684 c = this_cpu_ptr(s->cpu_slab);
2678 2685
2679 tid = c->tid; 2686 tid = c->tid;
2680 preempt_enable(); 2687 preempt_enable();
@@ -2894,10 +2901,8 @@ static void early_kmem_cache_node_alloc(int node)
2894 2901
2895 BUG_ON(!page); 2902 BUG_ON(!page);
2896 if (page_to_nid(page) != node) { 2903 if (page_to_nid(page) != node) {
2897 printk(KERN_ERR "SLUB: Unable to allocate memory from " 2904 pr_err("SLUB: Unable to allocate memory from node %d\n", node);
2898 "node %d\n", node); 2905 pr_err("SLUB: Allocating a useless per node structure in order to be able to continue\n");
2899 printk(KERN_ERR "SLUB: Allocating a useless per node structure "
2900 "in order to be able to continue\n");
2901 } 2906 }
2902 2907
2903 n = page->freelist; 2908 n = page->freelist;
@@ -3182,8 +3187,7 @@ static void list_slab_objects(struct kmem_cache *s, struct page *page,
3182 for_each_object(p, s, addr, page->objects) { 3187 for_each_object(p, s, addr, page->objects) {
3183 3188
3184 if (!test_bit(slab_index(p, s, addr), map)) { 3189 if (!test_bit(slab_index(p, s, addr), map)) {
3185 printk(KERN_ERR "INFO: Object 0x%p @offset=%tu\n", 3190 pr_err("INFO: Object 0x%p @offset=%tu\n", p, p - addr);
3186 p, p - addr);
3187 print_tracking(s, p); 3191 print_tracking(s, p);
3188 } 3192 }
3189 } 3193 }
@@ -3305,8 +3309,8 @@ static void *kmalloc_large_node(size_t size, gfp_t flags, int node)
3305 struct page *page; 3309 struct page *page;
3306 void *ptr = NULL; 3310 void *ptr = NULL;
3307 3311
3308 flags |= __GFP_COMP | __GFP_NOTRACK | __GFP_KMEMCG; 3312 flags |= __GFP_COMP | __GFP_NOTRACK;
3309 page = alloc_pages_node(node, flags, get_order(size)); 3313 page = alloc_kmem_pages_node(node, flags, get_order(size));
3310 if (page) 3314 if (page)
3311 ptr = page_address(page); 3315 ptr = page_address(page);
3312 3316
@@ -3375,7 +3379,7 @@ void kfree(const void *x)
3375 if (unlikely(!PageSlab(page))) { 3379 if (unlikely(!PageSlab(page))) {
3376 BUG_ON(!PageCompound(page)); 3380 BUG_ON(!PageCompound(page));
3377 kfree_hook(x); 3381 kfree_hook(x);
3378 __free_memcg_kmem_pages(page, compound_order(page)); 3382 __free_kmem_pages(page, compound_order(page));
3379 return; 3383 return;
3380 } 3384 }
3381 slab_free(page->slab_cache, page, object, _RET_IP_); 3385 slab_free(page->slab_cache, page, object, _RET_IP_);
@@ -3392,7 +3396,7 @@ EXPORT_SYMBOL(kfree);
3392 * being allocated from last increasing the chance that the last objects 3396 * being allocated from last increasing the chance that the last objects
3393 * are freed in them. 3397 * are freed in them.
3394 */ 3398 */
3395int kmem_cache_shrink(struct kmem_cache *s) 3399int __kmem_cache_shrink(struct kmem_cache *s)
3396{ 3400{
3397 int node; 3401 int node;
3398 int i; 3402 int i;
@@ -3448,7 +3452,6 @@ int kmem_cache_shrink(struct kmem_cache *s)
3448 kfree(slabs_by_inuse); 3452 kfree(slabs_by_inuse);
3449 return 0; 3453 return 0;
3450} 3454}
3451EXPORT_SYMBOL(kmem_cache_shrink);
3452 3455
3453static int slab_mem_going_offline_callback(void *arg) 3456static int slab_mem_going_offline_callback(void *arg)
3454{ 3457{
@@ -3456,7 +3459,7 @@ static int slab_mem_going_offline_callback(void *arg)
3456 3459
3457 mutex_lock(&slab_mutex); 3460 mutex_lock(&slab_mutex);
3458 list_for_each_entry(s, &slab_caches, list) 3461 list_for_each_entry(s, &slab_caches, list)
3459 kmem_cache_shrink(s); 3462 __kmem_cache_shrink(s);
3460 mutex_unlock(&slab_mutex); 3463 mutex_unlock(&slab_mutex);
3461 3464
3462 return 0; 3465 return 0;
@@ -3650,9 +3653,7 @@ void __init kmem_cache_init(void)
3650 register_cpu_notifier(&slab_notifier); 3653 register_cpu_notifier(&slab_notifier);
3651#endif 3654#endif
3652 3655
3653 printk(KERN_INFO 3656 pr_info("SLUB: HWalign=%d, Order=%d-%d, MinObjects=%d, CPUs=%d, Nodes=%d\n",
3654 "SLUB: HWalign=%d, Order=%d-%d, MinObjects=%d,"
3655 " CPUs=%d, Nodes=%d\n",
3656 cache_line_size(), 3657 cache_line_size(),
3657 slub_min_order, slub_max_order, slub_min_objects, 3658 slub_min_order, slub_max_order, slub_min_objects,
3658 nr_cpu_ids, nr_node_ids); 3659 nr_cpu_ids, nr_node_ids);
@@ -3934,8 +3935,8 @@ static int validate_slab_node(struct kmem_cache *s,
3934 count++; 3935 count++;
3935 } 3936 }
3936 if (count != n->nr_partial) 3937 if (count != n->nr_partial)
3937 printk(KERN_ERR "SLUB %s: %ld partial slabs counted but " 3938 pr_err("SLUB %s: %ld partial slabs counted but counter=%ld\n",
3938 "counter=%ld\n", s->name, count, n->nr_partial); 3939 s->name, count, n->nr_partial);
3939 3940
3940 if (!(s->flags & SLAB_STORE_USER)) 3941 if (!(s->flags & SLAB_STORE_USER))
3941 goto out; 3942 goto out;
@@ -3945,9 +3946,8 @@ static int validate_slab_node(struct kmem_cache *s,
3945 count++; 3946 count++;
3946 } 3947 }
3947 if (count != atomic_long_read(&n->nr_slabs)) 3948 if (count != atomic_long_read(&n->nr_slabs))
3948 printk(KERN_ERR "SLUB: %s %ld slabs counted but " 3949 pr_err("SLUB: %s %ld slabs counted but counter=%ld\n",
3949 "counter=%ld\n", s->name, count, 3950 s->name, count, atomic_long_read(&n->nr_slabs));
3950 atomic_long_read(&n->nr_slabs));
3951 3951
3952out: 3952out:
3953 spin_unlock_irqrestore(&n->list_lock, flags); 3953 spin_unlock_irqrestore(&n->list_lock, flags);
@@ -4211,53 +4211,50 @@ static void resiliency_test(void)
4211 4211
4212 BUILD_BUG_ON(KMALLOC_MIN_SIZE > 16 || KMALLOC_SHIFT_HIGH < 10); 4212 BUILD_BUG_ON(KMALLOC_MIN_SIZE > 16 || KMALLOC_SHIFT_HIGH < 10);
4213 4213
4214 printk(KERN_ERR "SLUB resiliency testing\n"); 4214 pr_err("SLUB resiliency testing\n");
4215 printk(KERN_ERR "-----------------------\n"); 4215 pr_err("-----------------------\n");
4216 printk(KERN_ERR "A. Corruption after allocation\n"); 4216 pr_err("A. Corruption after allocation\n");
4217 4217
4218 p = kzalloc(16, GFP_KERNEL); 4218 p = kzalloc(16, GFP_KERNEL);
4219 p[16] = 0x12; 4219 p[16] = 0x12;
4220 printk(KERN_ERR "\n1. kmalloc-16: Clobber Redzone/next pointer" 4220 pr_err("\n1. kmalloc-16: Clobber Redzone/next pointer 0x12->0x%p\n\n",
4221 " 0x12->0x%p\n\n", p + 16); 4221 p + 16);
4222 4222
4223 validate_slab_cache(kmalloc_caches[4]); 4223 validate_slab_cache(kmalloc_caches[4]);
4224 4224
4225 /* Hmmm... The next two are dangerous */ 4225 /* Hmmm... The next two are dangerous */
4226 p = kzalloc(32, GFP_KERNEL); 4226 p = kzalloc(32, GFP_KERNEL);
4227 p[32 + sizeof(void *)] = 0x34; 4227 p[32 + sizeof(void *)] = 0x34;
4228 printk(KERN_ERR "\n2. kmalloc-32: Clobber next pointer/next slab" 4228 pr_err("\n2. kmalloc-32: Clobber next pointer/next slab 0x34 -> -0x%p\n",
4229 " 0x34 -> -0x%p\n", p); 4229 p);
4230 printk(KERN_ERR 4230 pr_err("If allocated object is overwritten then not detectable\n\n");
4231 "If allocated object is overwritten then not detectable\n\n");
4232 4231
4233 validate_slab_cache(kmalloc_caches[5]); 4232 validate_slab_cache(kmalloc_caches[5]);
4234 p = kzalloc(64, GFP_KERNEL); 4233 p = kzalloc(64, GFP_KERNEL);
4235 p += 64 + (get_cycles() & 0xff) * sizeof(void *); 4234 p += 64 + (get_cycles() & 0xff) * sizeof(void *);
4236 *p = 0x56; 4235 *p = 0x56;
4237 printk(KERN_ERR "\n3. kmalloc-64: corrupting random byte 0x56->0x%p\n", 4236 pr_err("\n3. kmalloc-64: corrupting random byte 0x56->0x%p\n",
4238 p); 4237 p);
4239 printk(KERN_ERR 4238 pr_err("If allocated object is overwritten then not detectable\n\n");
4240 "If allocated object is overwritten then not detectable\n\n");
4241 validate_slab_cache(kmalloc_caches[6]); 4239 validate_slab_cache(kmalloc_caches[6]);
4242 4240
4243 printk(KERN_ERR "\nB. Corruption after free\n"); 4241 pr_err("\nB. Corruption after free\n");
4244 p = kzalloc(128, GFP_KERNEL); 4242 p = kzalloc(128, GFP_KERNEL);
4245 kfree(p); 4243 kfree(p);
4246 *p = 0x78; 4244 *p = 0x78;
4247 printk(KERN_ERR "1. kmalloc-128: Clobber first word 0x78->0x%p\n\n", p); 4245 pr_err("1. kmalloc-128: Clobber first word 0x78->0x%p\n\n", p);
4248 validate_slab_cache(kmalloc_caches[7]); 4246 validate_slab_cache(kmalloc_caches[7]);
4249 4247
4250 p = kzalloc(256, GFP_KERNEL); 4248 p = kzalloc(256, GFP_KERNEL);
4251 kfree(p); 4249 kfree(p);
4252 p[50] = 0x9a; 4250 p[50] = 0x9a;
4253 printk(KERN_ERR "\n2. kmalloc-256: Clobber 50th byte 0x9a->0x%p\n\n", 4251 pr_err("\n2. kmalloc-256: Clobber 50th byte 0x9a->0x%p\n\n", p);
4254 p);
4255 validate_slab_cache(kmalloc_caches[8]); 4252 validate_slab_cache(kmalloc_caches[8]);
4256 4253
4257 p = kzalloc(512, GFP_KERNEL); 4254 p = kzalloc(512, GFP_KERNEL);
4258 kfree(p); 4255 kfree(p);
4259 p[512] = 0xab; 4256 p[512] = 0xab;
4260 printk(KERN_ERR "\n3. kmalloc-512: Clobber redzone 0xab->0x%p\n\n", p); 4257 pr_err("\n3. kmalloc-512: Clobber redzone 0xab->0x%p\n\n", p);
4261 validate_slab_cache(kmalloc_caches[9]); 4258 validate_slab_cache(kmalloc_caches[9]);
4262} 4259}
4263#else 4260#else
@@ -4332,7 +4329,7 @@ static ssize_t show_slab_objects(struct kmem_cache *s,
4332 } 4329 }
4333 } 4330 }
4334 4331
4335 lock_memory_hotplug(); 4332 get_online_mems();
4336#ifdef CONFIG_SLUB_DEBUG 4333#ifdef CONFIG_SLUB_DEBUG
4337 if (flags & SO_ALL) { 4334 if (flags & SO_ALL) {
4338 for_each_node_state(node, N_NORMAL_MEMORY) { 4335 for_each_node_state(node, N_NORMAL_MEMORY) {
@@ -4372,7 +4369,7 @@ static ssize_t show_slab_objects(struct kmem_cache *s,
4372 x += sprintf(buf + x, " N%d=%lu", 4369 x += sprintf(buf + x, " N%d=%lu",
4373 node, nodes[node]); 4370 node, nodes[node]);
4374#endif 4371#endif
4375 unlock_memory_hotplug(); 4372 put_online_mems();
4376 kfree(nodes); 4373 kfree(nodes);
4377 return x + sprintf(buf + x, "\n"); 4374 return x + sprintf(buf + x, "\n");
4378} 4375}
@@ -5303,7 +5300,7 @@ static int __init slab_sysfs_init(void)
5303 slab_kset = kset_create_and_add("slab", &slab_uevent_ops, kernel_kobj); 5300 slab_kset = kset_create_and_add("slab", &slab_uevent_ops, kernel_kobj);
5304 if (!slab_kset) { 5301 if (!slab_kset) {
5305 mutex_unlock(&slab_mutex); 5302 mutex_unlock(&slab_mutex);
5306 printk(KERN_ERR "Cannot register slab subsystem.\n"); 5303 pr_err("Cannot register slab subsystem.\n");
5307 return -ENOSYS; 5304 return -ENOSYS;
5308 } 5305 }
5309 5306
@@ -5312,8 +5309,8 @@ static int __init slab_sysfs_init(void)
5312 list_for_each_entry(s, &slab_caches, list) { 5309 list_for_each_entry(s, &slab_caches, list) {
5313 err = sysfs_slab_add(s); 5310 err = sysfs_slab_add(s);
5314 if (err) 5311 if (err)
5315 printk(KERN_ERR "SLUB: Unable to add boot slab %s" 5312 pr_err("SLUB: Unable to add boot slab %s to sysfs\n",
5316 " to sysfs\n", s->name); 5313 s->name);
5317 } 5314 }
5318 5315
5319 while (alias_list) { 5316 while (alias_list) {
@@ -5322,8 +5319,8 @@ static int __init slab_sysfs_init(void)
5322 alias_list = alias_list->next; 5319 alias_list = alias_list->next;
5323 err = sysfs_slab_alias(al->s, al->name); 5320 err = sysfs_slab_alias(al->s, al->name);
5324 if (err) 5321 if (err)
5325 printk(KERN_ERR "SLUB: Unable to add boot slab alias" 5322 pr_err("SLUB: Unable to add boot slab alias %s to sysfs\n",
5326 " %s to sysfs\n", al->name); 5323 al->name);
5327 kfree(al); 5324 kfree(al);
5328 } 5325 }
5329 5326
diff --git a/mm/swap.c b/mm/swap.c
index 9ce43ba4498b..9e8e3472248b 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -67,7 +67,7 @@ static void __page_cache_release(struct page *page)
67static void __put_single_page(struct page *page) 67static void __put_single_page(struct page *page)
68{ 68{
69 __page_cache_release(page); 69 __page_cache_release(page);
70 free_hot_cold_page(page, 0); 70 free_hot_cold_page(page, false);
71} 71}
72 72
73static void __put_compound_page(struct page *page) 73static void __put_compound_page(struct page *page)
@@ -79,95 +79,88 @@ static void __put_compound_page(struct page *page)
79 (*dtor)(page); 79 (*dtor)(page);
80} 80}
81 81
82static void put_compound_page(struct page *page) 82/**
83 * Two special cases here: we could avoid taking compound_lock_irqsave
 84 * and could skip the tail refcounting (in _mapcount).
85 *
86 * 1. Hugetlbfs page:
87 *
88 * PageHeadHuge will remain true until the compound page
89 * is released and enters the buddy allocator, and it could
90 * not be split by __split_huge_page_refcount().
91 *
92 * So if we see PageHeadHuge set, and we have the tail page pin,
93 * then we could safely put head page.
94 *
95 * 2. Slab THP page:
96 *
97 * PG_slab is cleared before the slab frees the head page, and
98 * tail pin cannot be the last reference left on the head page,
99 * because the slab code is free to reuse the compound page
100 * after a kfree/kmem_cache_free without having to check if
 101 * there's any tail pin left. In turn all tail pins must always be
102 * released while the head is still pinned by the slab code
 103 * and so we know PG_slab will still be set too.
104 *
105 * So if we see PageSlab set, and we have the tail page pin,
106 * then we could safely put head page.
107 */
108static __always_inline
109void put_unrefcounted_compound_page(struct page *page_head, struct page *page)
83{ 110{
84 struct page *page_head;
85
86 if (likely(!PageTail(page))) {
87 if (put_page_testzero(page)) {
88 /*
89 * By the time all refcounts have been released
90 * split_huge_page cannot run anymore from under us.
91 */
92 if (PageHead(page))
93 __put_compound_page(page);
94 else
95 __put_single_page(page);
96 }
97 return;
98 }
99
100 /* __split_huge_page_refcount can run under us */
101 page_head = compound_head(page);
102
103 /* 111 /*
104 * THP can not break up slab pages so avoid taking 112 * If @page is a THP tail, we must read the tail page
105 * compound_lock() and skip the tail page refcounting (in 113 * flags after the head page flags. The
106 * _mapcount) too. Slab performs non-atomic bit ops on 114 * __split_huge_page_refcount side enforces write memory barriers
107 * page->flags for better performance. In particular 115 * between clearing PageTail and before the head page
108 * slab_unlock() in slub used to be a hot path. It is still 116 * can be freed and reallocated.
109 * hot on arches that do not support
110 * this_cpu_cmpxchg_double().
111 *
112 * If "page" is part of a slab or hugetlbfs page it cannot be
113 * splitted and the head page cannot change from under us. And
114 * if "page" is part of a THP page under splitting, if the
115 * head page pointed by the THP tail isn't a THP head anymore,
116 * we'll find PageTail clear after smp_rmb() and we'll treat
117 * it as a single page.
118 */ 117 */
119 if (!__compound_tail_refcounted(page_head)) { 118 smp_rmb();
119 if (likely(PageTail(page))) {
120 /* 120 /*
121 * If "page" is a THP tail, we must read the tail page 121 * __split_huge_page_refcount cannot race
122 * flags after the head page flags. The 122 * here, see the comment above this function.
123 * split_huge_page side enforces write memory barriers
124 * between clearing PageTail and before the head page
125 * can be freed and reallocated.
126 */ 123 */
127 smp_rmb(); 124 VM_BUG_ON_PAGE(!PageHead(page_head), page_head);
128 if (likely(PageTail(page))) { 125 VM_BUG_ON_PAGE(page_mapcount(page) != 0, page);
129 /* 126 if (put_page_testzero(page_head)) {
130 * __split_huge_page_refcount cannot race
131 * here.
132 */
133 VM_BUG_ON_PAGE(!PageHead(page_head), page_head);
134 VM_BUG_ON_PAGE(page_mapcount(page) != 0, page);
135 if (put_page_testzero(page_head)) {
136 /*
137 * If this is the tail of a slab
138 * compound page, the tail pin must
139 * not be the last reference held on
140 * the page, because the PG_slab
141 * cannot be cleared before all tail
142 * pins (which skips the _mapcount
143 * tail refcounting) have been
144 * released. For hugetlbfs the tail
145 * pin may be the last reference on
146 * the page instead, because
147 * PageHeadHuge will not go away until
148 * the compound page enters the buddy
149 * allocator.
150 */
151 VM_BUG_ON_PAGE(PageSlab(page_head), page_head);
152 __put_compound_page(page_head);
153 }
154 return;
155 } else
156 /* 127 /*
157 * __split_huge_page_refcount run before us, 128 * If this is the tail of a slab THP page,
158 * "page" was a THP tail. The split page_head 129 * the tail pin must not be the last reference
159 * has been freed and reallocated as slab or 130 * held on the page, because the PG_slab cannot
160 * hugetlbfs page of smaller order (only 131 * be cleared before all tail pins (which skips
161 * possible if reallocated as slab on x86). 132 * the _mapcount tail refcounting) have been
133 * released.
134 *
135 * If this is the tail of a hugetlbfs page,
136 * the tail pin may be the last reference on
137 * the page instead, because PageHeadHuge will
138 * not go away until the compound page enters
139 * the buddy allocator.
162 */ 140 */
163 goto out_put_single; 141 VM_BUG_ON_PAGE(PageSlab(page_head), page_head);
164 } 142 __put_compound_page(page_head);
143 }
144 } else
145 /*
146 * __split_huge_page_refcount run before us,
147 * @page was a THP tail. The split @page_head
148 * has been freed and reallocated as slab or
149 * hugetlbfs page of smaller order (only
150 * possible if reallocated as slab on x86).
151 */
152 if (put_page_testzero(page))
153 __put_single_page(page);
154}
165 155
156static __always_inline
157void put_refcounted_compound_page(struct page *page_head, struct page *page)
158{
166 if (likely(page != page_head && get_page_unless_zero(page_head))) { 159 if (likely(page != page_head && get_page_unless_zero(page_head))) {
167 unsigned long flags; 160 unsigned long flags;
168 161
169 /* 162 /*
170 * page_head wasn't a dangling pointer but it may not 163 * @page_head wasn't a dangling pointer but it may not
171 * be a head page anymore by the time we obtain the 164 * be a head page anymore by the time we obtain the
172 * lock. That is ok as long as it can't be freed from 165 * lock. That is ok as long as it can't be freed from
173 * under us. 166 * under us.
@@ -178,7 +171,7 @@ static void put_compound_page(struct page *page)
178 compound_unlock_irqrestore(page_head, flags); 171 compound_unlock_irqrestore(page_head, flags);
179 if (put_page_testzero(page_head)) { 172 if (put_page_testzero(page_head)) {
180 /* 173 /*
181 * The head page may have been freed 174 * The @page_head may have been freed
182 * and reallocated as a compound page 175 * and reallocated as a compound page
183 * of smaller order and then freed 176 * of smaller order and then freed
184 * again. All we know is that it 177 * again. All we know is that it
@@ -222,12 +215,51 @@ out_put_single:
222 __put_single_page(page_head); 215 __put_single_page(page_head);
223 } 216 }
224 } else { 217 } else {
225 /* page_head is a dangling pointer */ 218 /* @page_head is a dangling pointer */
226 VM_BUG_ON_PAGE(PageTail(page), page); 219 VM_BUG_ON_PAGE(PageTail(page), page);
227 goto out_put_single; 220 goto out_put_single;
228 } 221 }
229} 222}
230 223
224static void put_compound_page(struct page *page)
225{
226 struct page *page_head;
227
228 /*
 229 * We see the PageCompound set and PageTail not set, so @page may be:
230 * 1. hugetlbfs head page, or
231 * 2. THP head page.
232 */
233 if (likely(!PageTail(page))) {
234 if (put_page_testzero(page)) {
235 /*
236 * By the time all refcounts have been released
237 * split_huge_page cannot run anymore from under us.
238 */
239 if (PageHead(page))
240 __put_compound_page(page);
241 else
242 __put_single_page(page);
243 }
244 return;
245 }
246
247 /*
 248 * We see the PageCompound set and PageTail set, so @page may be:
249 * 1. a tail hugetlbfs page, or
250 * 2. a tail THP page, or
251 * 3. a split THP page.
252 *
253 * Case 3 is possible, as we may race with
254 * __split_huge_page_refcount tearing down a THP page.
255 */
256 page_head = compound_head_by_tail(page);
257 if (!__compound_tail_refcounted(page_head))
258 put_unrefcounted_compound_page(page_head, page);
259 else
260 put_refcounted_compound_page(page_head, page);
261}
262
231void put_page(struct page *page) 263void put_page(struct page *page)
232{ 264{
233 if (unlikely(PageCompound(page))) 265 if (unlikely(PageCompound(page)))
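
The rewritten put_compound_page() above only classifies the page and delegates: a non-tail page is dropped directly, a tail of a non-refcounted compound (hugetlbfs or slab) goes through put_unrefcounted_compound_page(), and everything else takes the locked put_refcounted_compound_page() path. A stand-alone model of that dispatch, with toy types and helpers rather than the kernel's struct page:

#include <stdbool.h>
#include <stdio.h>

/* Toy stand-in for struct page; not the kernel layout. */
struct toy_page {
        int refcount;
        bool tail;             /* PageTail()                       */
        bool head;             /* PageHead()                       */
        bool tail_refcounted;  /* __compound_tail_refcounted(head) */
        struct toy_page *compound_head;
};

static bool put_testzero(struct toy_page *p)
{
        return --p->refcount == 0;
}

static void toy_put_compound_page(struct toy_page *page)
{
        if (!page->tail) {
                /* Head (or order-0) page: drop the reference directly. */
                if (put_testzero(page))
                        printf("free %s page\n", page->head ? "compound" : "single");
                return;
        }
        /* Tail page: cheap path for hugetlbfs/slab, locked path for THP. */
        if (!page->compound_head->tail_refcounted)
                printf("put_unrefcounted_compound_page(): hugetlbfs or slab tail\n");
        else
                printf("put_refcounted_compound_page(): THP tail, takes compound_lock\n");
}

int main(void)
{
        struct toy_page head = { .refcount = 1, .head = true };
        struct toy_page tail = { .refcount = 1, .tail = true, .compound_head = &head };

        toy_put_compound_page(&head);
        toy_put_compound_page(&tail);
        return 0;
}
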
@@ -441,7 +473,7 @@ void rotate_reclaimable_page(struct page *page)
441 473
442 page_cache_get(page); 474 page_cache_get(page);
443 local_irq_save(flags); 475 local_irq_save(flags);
444 pvec = &__get_cpu_var(lru_rotate_pvecs); 476 pvec = this_cpu_ptr(&lru_rotate_pvecs);
445 if (!pagevec_add(pvec, page)) 477 if (!pagevec_add(pvec, page))
446 pagevec_move_tail(pvec); 478 pagevec_move_tail(pvec);
447 local_irq_restore(flags); 479 local_irq_restore(flags);
@@ -583,12 +615,17 @@ void mark_page_accessed(struct page *page)
583EXPORT_SYMBOL(mark_page_accessed); 615EXPORT_SYMBOL(mark_page_accessed);
584 616
585/* 617/*
 586 * Queue the page for addition to the LRU via pagevec. The decision on whether 618 * Used to mark_page_accessed() a page that is not visible yet, while it is
 587 * to add the page to the [in]active [file|anon] list is deferred until the 619 * still safe to use non-atomic ops
588 * pagevec is drained. This gives a chance for the caller of __lru_cache_add()
589 * have the page added to the active list using mark_page_accessed().
590 */ 620 */
591void __lru_cache_add(struct page *page) 621void init_page_accessed(struct page *page)
622{
623 if (!PageReferenced(page))
624 __SetPageReferenced(page);
625}
626EXPORT_SYMBOL(init_page_accessed);
627
628static void __lru_cache_add(struct page *page)
592{ 629{
593 struct pagevec *pvec = &get_cpu_var(lru_add_pvec); 630 struct pagevec *pvec = &get_cpu_var(lru_add_pvec);
594 631
@@ -598,11 +635,34 @@ void __lru_cache_add(struct page *page)
598 pagevec_add(pvec, page); 635 pagevec_add(pvec, page);
599 put_cpu_var(lru_add_pvec); 636 put_cpu_var(lru_add_pvec);
600} 637}
601EXPORT_SYMBOL(__lru_cache_add); 638
639/**
640 * lru_cache_add: add a page to the page lists
641 * @page: the page to add
642 */
643void lru_cache_add_anon(struct page *page)
644{
645 if (PageActive(page))
646 ClearPageActive(page);
647 __lru_cache_add(page);
648}
649
650void lru_cache_add_file(struct page *page)
651{
652 if (PageActive(page))
653 ClearPageActive(page);
654 __lru_cache_add(page);
655}
656EXPORT_SYMBOL(lru_cache_add_file);
602 657
603/** 658/**
604 * lru_cache_add - add a page to a page list 659 * lru_cache_add - add a page to a page list
605 * @page: the page to be added to the LRU. 660 * @page: the page to be added to the LRU.
661 *
662 * Queue the page for addition to the LRU via pagevec. The decision on whether
663 * to add the page to the [in]active [file|anon] list is deferred until the
664 * pagevec is drained. This gives a chance for the caller of lru_cache_add()
 665 * to have the page added to the active list using mark_page_accessed().
606 */ 666 */
607void lru_cache_add(struct page *page) 667void lru_cache_add(struct page *page)
608{ 668{
@@ -813,7 +873,7 @@ void lru_add_drain_all(void)
813 * grabbed the page via the LRU. If it did, give up: shrink_inactive_list() 873 * grabbed the page via the LRU. If it did, give up: shrink_inactive_list()
814 * will free it. 874 * will free it.
815 */ 875 */
816void release_pages(struct page **pages, int nr, int cold) 876void release_pages(struct page **pages, int nr, bool cold)
817{ 877{
818 int i; 878 int i;
819 LIST_HEAD(pages_to_free); 879 LIST_HEAD(pages_to_free);
@@ -854,7 +914,7 @@ void release_pages(struct page **pages, int nr, int cold)
854 } 914 }
855 915
856 /* Clear Active bit in case of parallel mark_page_accessed */ 916 /* Clear Active bit in case of parallel mark_page_accessed */
857 ClearPageActive(page); 917 __ClearPageActive(page);
858 918
859 list_add(&page->lru, &pages_to_free); 919 list_add(&page->lru, &pages_to_free);
860 } 920 }
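
Two of the hunks above switch from atomic page-flag operations to their non-atomic double-underscore variants: init_page_accessed() uses __SetPageReferenced() on a page that is not yet visible to other CPUs, and release_pages() uses __ClearPageActive() on a page whose last reference is already gone. The general rule is that a plain read-modify-write is fine whenever the flags word cannot be touched concurrently. A small stand-alone illustration with a toy flag word (not the kernel's page->flags):

#include <stdatomic.h>
#include <stdio.h>

#define TOY_PG_ACTIVE     (1ul << 0)
#define TOY_PG_REFERENCED (1ul << 1)

/* Atomic variant: needed while other CPUs may update the same word. */
static void set_flag_atomic(atomic_ulong *flags, unsigned long bit)
{
        atomic_fetch_or(flags, bit);
}

/*
 * Non-atomic variant: only safe while the caller exclusively owns the word,
 * e.g. the page is not yet published or its last reference has been dropped.
 */
static void set_flag(unsigned long *flags, unsigned long bit)
{
        *flags |= bit;
}

int main(void)
{
        atomic_ulong shared = 0;
        unsigned long private = 0;

        set_flag_atomic(&shared, TOY_PG_ACTIVE);   /* page visible on the LRU */
        set_flag(&private, TOY_PG_REFERENCED);     /* page still being set up */

        printf("shared=%#lx private=%#lx\n", atomic_load(&shared), private);
        return 0;
}
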
diff --git a/mm/swap_state.c b/mm/swap_state.c
index e76ace30d436..2972eee184a4 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -270,7 +270,7 @@ void free_pages_and_swap_cache(struct page **pages, int nr)
270 270
271 for (i = 0; i < todo; i++) 271 for (i = 0; i < todo; i++)
272 free_swap_cache(pagep[i]); 272 free_swap_cache(pagep[i]);
273 release_pages(pagep, todo, 0); 273 release_pages(pagep, todo, false);
274 pagep += todo; 274 pagep += todo;
275 nr -= todo; 275 nr -= todo;
276 } 276 }
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 4a7f7e6992b6..4c524f7bd0bf 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -51,14 +51,32 @@ atomic_long_t nr_swap_pages;
51/* protected with swap_lock. reading in vm_swap_full() doesn't need lock */ 51/* protected with swap_lock. reading in vm_swap_full() doesn't need lock */
52long total_swap_pages; 52long total_swap_pages;
53static int least_priority; 53static int least_priority;
54static atomic_t highest_priority_index = ATOMIC_INIT(-1);
55 54
56static const char Bad_file[] = "Bad swap file entry "; 55static const char Bad_file[] = "Bad swap file entry ";
57static const char Unused_file[] = "Unused swap file entry "; 56static const char Unused_file[] = "Unused swap file entry ";
58static const char Bad_offset[] = "Bad swap offset entry "; 57static const char Bad_offset[] = "Bad swap offset entry ";
59static const char Unused_offset[] = "Unused swap offset entry "; 58static const char Unused_offset[] = "Unused swap offset entry ";
60 59
61struct swap_list_t swap_list = {-1, -1}; 60/*
61 * all active swap_info_structs
62 * protected with swap_lock, and ordered by priority.
63 */
64PLIST_HEAD(swap_active_head);
65
66/*
67 * all available (active, not full) swap_info_structs
68 * protected with swap_avail_lock, ordered by priority.
69 * This is used by get_swap_page() instead of swap_active_head
70 * because swap_active_head includes all swap_info_structs,
71 * but get_swap_page() doesn't need to look at full ones.
72 * This uses its own lock instead of swap_lock because when a
73 * swap_info_struct changes between not-full/full, it needs to
74 * add/remove itself to/from this list, but the swap_info_struct->lock
75 * is held and the locking order requires swap_lock to be taken
76 * before any swap_info_struct->lock.
77 */
78static PLIST_HEAD(swap_avail_head);
79static DEFINE_SPINLOCK(swap_avail_lock);
62 80
63struct swap_info_struct *swap_info[MAX_SWAPFILES]; 81struct swap_info_struct *swap_info[MAX_SWAPFILES];
64 82
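
The old swap_list.next bookkeeping is replaced by two kernel plists: swap_active_head (every device, under swap_lock) and swap_avail_head (only non-full devices, under its own swap_avail_lock). Because a plist sorts ascending while a bigger swap priority should win, the node priority is stored negated, as the comment in _enable_swap_info() further down notes. A stand-alone model of that ordering trick in plain C (not the kernel plist API):

#include <stdio.h>
#include <stdlib.h>

struct toy_swap {
        const char *name;
        int prio;      /* swap priority: higher is preferred     */
        int node_prio; /* what would go into the plist: -prio    */
};

static int by_node_prio(const void *a, const void *b)
{
        const struct toy_swap *x = a, *y = b;

        return x->node_prio - y->node_prio; /* ascending, like a plist */
}

int main(void)
{
        struct toy_swap swaps[] = {
                { "sda2",  10,  -10 },
                { "zram0", 100, -100 },
                { "sdb2",  10,  -10 },
        };
        size_t i, n = sizeof(swaps) / sizeof(swaps[0]);

        /* Ascending order on the negated value == descending swap priority. */
        qsort(swaps, n, sizeof(swaps[0]), by_node_prio);
        for (i = 0; i < n; i++)
                printf("%s (prio %d)\n", swaps[i].name, swaps[i].prio);
        return 0;
}
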
@@ -505,13 +523,10 @@ static unsigned long scan_swap_map(struct swap_info_struct *si,
505 /* 523 /*
506 * If seek is expensive, start searching for new cluster from 524 * If seek is expensive, start searching for new cluster from
507 * start of partition, to minimize the span of allocated swap. 525 * start of partition, to minimize the span of allocated swap.
508 * But if seek is cheap, search from our current position, so 526 * If seek is cheap, that is the SWP_SOLIDSTATE si->cluster_info
509 * that swap is allocated from all over the partition: if the 527 * case, just handled by scan_swap_map_try_ssd_cluster() above.
510 * Flash Translation Layer only remaps within limited zones,
511 * we don't want to wear out the first zone too quickly.
512 */ 528 */
513 if (!(si->flags & SWP_SOLIDSTATE)) 529 scan_base = offset = si->lowest_bit;
514 scan_base = offset = si->lowest_bit;
515 last_in_cluster = offset + SWAPFILE_CLUSTER - 1; 530 last_in_cluster = offset + SWAPFILE_CLUSTER - 1;
516 531
517 /* Locate the first empty (unaligned) cluster */ 532 /* Locate the first empty (unaligned) cluster */
@@ -531,26 +546,6 @@ static unsigned long scan_swap_map(struct swap_info_struct *si,
531 } 546 }
532 } 547 }
533 548
534 offset = si->lowest_bit;
535 last_in_cluster = offset + SWAPFILE_CLUSTER - 1;
536
537 /* Locate the first empty (unaligned) cluster */
538 for (; last_in_cluster < scan_base; offset++) {
539 if (si->swap_map[offset])
540 last_in_cluster = offset + SWAPFILE_CLUSTER;
541 else if (offset == last_in_cluster) {
542 spin_lock(&si->lock);
543 offset -= SWAPFILE_CLUSTER - 1;
544 si->cluster_next = offset;
545 si->cluster_nr = SWAPFILE_CLUSTER - 1;
546 goto checks;
547 }
548 if (unlikely(--latency_ration < 0)) {
549 cond_resched();
550 latency_ration = LATENCY_LIMIT;
551 }
552 }
553
554 offset = scan_base; 549 offset = scan_base;
555 spin_lock(&si->lock); 550 spin_lock(&si->lock);
556 si->cluster_nr = SWAPFILE_CLUSTER - 1; 551 si->cluster_nr = SWAPFILE_CLUSTER - 1;
@@ -591,6 +586,9 @@ checks:
591 if (si->inuse_pages == si->pages) { 586 if (si->inuse_pages == si->pages) {
592 si->lowest_bit = si->max; 587 si->lowest_bit = si->max;
593 si->highest_bit = 0; 588 si->highest_bit = 0;
589 spin_lock(&swap_avail_lock);
590 plist_del(&si->avail_list, &swap_avail_head);
591 spin_unlock(&swap_avail_lock);
594 } 592 }
595 si->swap_map[offset] = usage; 593 si->swap_map[offset] = usage;
596 inc_cluster_info_page(si, si->cluster_info, offset); 594 inc_cluster_info_page(si, si->cluster_info, offset);
@@ -640,71 +638,65 @@ no_page:
640 638
641swp_entry_t get_swap_page(void) 639swp_entry_t get_swap_page(void)
642{ 640{
643 struct swap_info_struct *si; 641 struct swap_info_struct *si, *next;
644 pgoff_t offset; 642 pgoff_t offset;
645 int type, next;
646 int wrapped = 0;
647 int hp_index;
648 643
649 spin_lock(&swap_lock);
650 if (atomic_long_read(&nr_swap_pages) <= 0) 644 if (atomic_long_read(&nr_swap_pages) <= 0)
651 goto noswap; 645 goto noswap;
652 atomic_long_dec(&nr_swap_pages); 646 atomic_long_dec(&nr_swap_pages);
653 647
654 for (type = swap_list.next; type >= 0 && wrapped < 2; type = next) { 648 spin_lock(&swap_avail_lock);
655 hp_index = atomic_xchg(&highest_priority_index, -1);
656 /*
657 * highest_priority_index records current highest priority swap
658 * type which just frees swap entries. If its priority is
659 * higher than that of swap_list.next swap type, we use it. It
660 * isn't protected by swap_lock, so it can be an invalid value
661 * if the corresponding swap type is swapoff. We double check
662 * the flags here. It's even possible the swap type is swapoff
663 * and swapon again and its priority is changed. In such rare
664 * case, low prority swap type might be used, but eventually
665 * high priority swap will be used after several rounds of
666 * swap.
667 */
668 if (hp_index != -1 && hp_index != type &&
669 swap_info[type]->prio < swap_info[hp_index]->prio &&
670 (swap_info[hp_index]->flags & SWP_WRITEOK)) {
671 type = hp_index;
672 swap_list.next = type;
673 }
674
675 si = swap_info[type];
676 next = si->next;
677 if (next < 0 ||
678 (!wrapped && si->prio != swap_info[next]->prio)) {
679 next = swap_list.head;
680 wrapped++;
681 }
682 649
650start_over:
651 plist_for_each_entry_safe(si, next, &swap_avail_head, avail_list) {
652 /* requeue si to after same-priority siblings */
653 plist_requeue(&si->avail_list, &swap_avail_head);
654 spin_unlock(&swap_avail_lock);
683 spin_lock(&si->lock); 655 spin_lock(&si->lock);
684 if (!si->highest_bit) { 656 if (!si->highest_bit || !(si->flags & SWP_WRITEOK)) {
685 spin_unlock(&si->lock); 657 spin_lock(&swap_avail_lock);
686 continue; 658 if (plist_node_empty(&si->avail_list)) {
687 } 659 spin_unlock(&si->lock);
688 if (!(si->flags & SWP_WRITEOK)) { 660 goto nextsi;
661 }
662 WARN(!si->highest_bit,
663 "swap_info %d in list but !highest_bit\n",
664 si->type);
665 WARN(!(si->flags & SWP_WRITEOK),
666 "swap_info %d in list but !SWP_WRITEOK\n",
667 si->type);
668 plist_del(&si->avail_list, &swap_avail_head);
689 spin_unlock(&si->lock); 669 spin_unlock(&si->lock);
690 continue; 670 goto nextsi;
691 } 671 }
692 672
693 swap_list.next = next;
694
695 spin_unlock(&swap_lock);
696 /* This is called for allocating swap entry for cache */ 673 /* This is called for allocating swap entry for cache */
697 offset = scan_swap_map(si, SWAP_HAS_CACHE); 674 offset = scan_swap_map(si, SWAP_HAS_CACHE);
698 spin_unlock(&si->lock); 675 spin_unlock(&si->lock);
699 if (offset) 676 if (offset)
700 return swp_entry(type, offset); 677 return swp_entry(si->type, offset);
701 spin_lock(&swap_lock); 678 pr_debug("scan_swap_map of si %d failed to find offset\n",
702 next = swap_list.next; 679 si->type);
680 spin_lock(&swap_avail_lock);
681nextsi:
682 /*
683 * if we got here, it's likely that si was almost full before,
684 * and since scan_swap_map() can drop the si->lock, multiple
685 * callers probably all tried to get a page from the same si
686 * and it filled up before we could get one; or, the si filled
687 * up between us dropping swap_avail_lock and taking si->lock.
688 * Since we dropped the swap_avail_lock, the swap_avail_head
689 * list may have been modified; so if next is still in the
690 * swap_avail_head list then try it, otherwise start over.
691 */
692 if (plist_node_empty(&next->avail_list))
693 goto start_over;
703 } 694 }
704 695
696 spin_unlock(&swap_avail_lock);
697
705 atomic_long_inc(&nr_swap_pages); 698 atomic_long_inc(&nr_swap_pages);
706noswap: 699noswap:
707 spin_unlock(&swap_lock);
708 return (swp_entry_t) {0}; 700 return (swp_entry_t) {0};
709} 701}
710 702
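
get_swap_page() now walks swap_avail_head with plist_for_each_entry_safe() and immediately requeues the chosen device behind its same-priority siblings, so equal-priority swap devices are used round-robin while higher-priority devices are still tried first. A stand-alone model of the requeue-behind-equal-priority step, using an array instead of a plist:

#include <stdio.h>

#define NDEV 3

/*
 * Toy available-device list: all entries share one priority, so the
 * allocator should rotate through them, one allocation per turn.
 */
static const char *avail[NDEV] = { "sda2", "sdb2", "sdc2" };

static const char *pick_and_requeue(void)
{
        const char *chosen = avail[0];
        int i;

        /* Requeue the chosen device after its equal-priority siblings. */
        for (i = 0; i < NDEV - 1; i++)
                avail[i] = avail[i + 1];
        avail[NDEV - 1] = chosen;
        return chosen;
}

int main(void)
{
        int i;

        for (i = 0; i < 6; i++)
                printf("allocation %d -> %s\n", i, pick_and_requeue());
        return 0;
}
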
@@ -766,27 +758,6 @@ out:
766 return NULL; 758 return NULL;
767} 759}
768 760
769/*
770 * This swap type frees swap entry, check if it is the highest priority swap
771 * type which just frees swap entry. get_swap_page() uses
772 * highest_priority_index to search highest priority swap type. The
773 * swap_info_struct.lock can't protect us if there are multiple swap types
774 * active, so we use atomic_cmpxchg.
775 */
776static void set_highest_priority_index(int type)
777{
778 int old_hp_index, new_hp_index;
779
780 do {
781 old_hp_index = atomic_read(&highest_priority_index);
782 if (old_hp_index != -1 &&
783 swap_info[old_hp_index]->prio >= swap_info[type]->prio)
784 break;
785 new_hp_index = type;
786 } while (atomic_cmpxchg(&highest_priority_index,
787 old_hp_index, new_hp_index) != old_hp_index);
788}
789
790static unsigned char swap_entry_free(struct swap_info_struct *p, 761static unsigned char swap_entry_free(struct swap_info_struct *p,
791 swp_entry_t entry, unsigned char usage) 762 swp_entry_t entry, unsigned char usage)
792{ 763{
@@ -828,9 +799,18 @@ static unsigned char swap_entry_free(struct swap_info_struct *p,
828 dec_cluster_info_page(p, p->cluster_info, offset); 799 dec_cluster_info_page(p, p->cluster_info, offset);
829 if (offset < p->lowest_bit) 800 if (offset < p->lowest_bit)
830 p->lowest_bit = offset; 801 p->lowest_bit = offset;
831 if (offset > p->highest_bit) 802 if (offset > p->highest_bit) {
803 bool was_full = !p->highest_bit;
832 p->highest_bit = offset; 804 p->highest_bit = offset;
833 set_highest_priority_index(p->type); 805 if (was_full && (p->flags & SWP_WRITEOK)) {
806 spin_lock(&swap_avail_lock);
807 WARN_ON(!plist_node_empty(&p->avail_list));
808 if (plist_node_empty(&p->avail_list))
809 plist_add(&p->avail_list,
810 &swap_avail_head);
811 spin_unlock(&swap_avail_lock);
812 }
813 }
834 atomic_long_inc(&nr_swap_pages); 814 atomic_long_inc(&nr_swap_pages);
835 p->inuse_pages--; 815 p->inuse_pages--;
836 frontswap_invalidate_page(p->type, offset); 816 frontswap_invalidate_page(p->type, offset);
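
When swap_entry_free() frees the first entry on a device that had filled up (highest_bit was zero), it must put the device back on swap_avail_head so get_swap_page() can see it again; scan_swap_map() performs the matching removal when the device fills. A stand-alone sketch of that edge-triggered transition, with toy fields rather than the kernel structures:

#include <stdbool.h>
#include <stdio.h>

struct toy_si {
        unsigned long highest_bit; /* 0 means "no free slot", i.e. full */
        bool on_avail_list;
};

/* Called when slot @offset becomes free on @si. */
static void toy_free_slot(struct toy_si *si, unsigned long offset)
{
        bool was_full = !si->highest_bit;

        if (offset > si->highest_bit)
                si->highest_bit = offset;
        /* Edge trigger: only re-add on the full -> not-full transition. */
        if (was_full && !si->on_avail_list) {
                si->on_avail_list = true;
                printf("device back on the available list\n");
        }
}

int main(void)
{
        struct toy_si si = { .highest_bit = 0, .on_avail_list = false };

        toy_free_slot(&si, 42); /* re-added          */
        toy_free_slot(&si, 99); /* already available */
        return 0;
}
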
@@ -1765,30 +1745,37 @@ static void _enable_swap_info(struct swap_info_struct *p, int prio,
1765 unsigned char *swap_map, 1745 unsigned char *swap_map,
1766 struct swap_cluster_info *cluster_info) 1746 struct swap_cluster_info *cluster_info)
1767{ 1747{
1768 int i, prev;
1769
1770 if (prio >= 0) 1748 if (prio >= 0)
1771 p->prio = prio; 1749 p->prio = prio;
1772 else 1750 else
1773 p->prio = --least_priority; 1751 p->prio = --least_priority;
1752 /*
1753 * the plist prio is negated because plist ordering is
1754 * low-to-high, while swap ordering is high-to-low
1755 */
1756 p->list.prio = -p->prio;
1757 p->avail_list.prio = -p->prio;
1774 p->swap_map = swap_map; 1758 p->swap_map = swap_map;
1775 p->cluster_info = cluster_info; 1759 p->cluster_info = cluster_info;
1776 p->flags |= SWP_WRITEOK; 1760 p->flags |= SWP_WRITEOK;
1777 atomic_long_add(p->pages, &nr_swap_pages); 1761 atomic_long_add(p->pages, &nr_swap_pages);
1778 total_swap_pages += p->pages; 1762 total_swap_pages += p->pages;
1779 1763
1780 /* insert swap space into swap_list: */ 1764 assert_spin_locked(&swap_lock);
1781 prev = -1; 1765 /*
1782 for (i = swap_list.head; i >= 0; i = swap_info[i]->next) { 1766 * both lists are plists, and thus priority ordered.
1783 if (p->prio >= swap_info[i]->prio) 1767 * swap_active_head needs to be priority ordered for swapoff(),
1784 break; 1768 * which on removal of any swap_info_struct with an auto-assigned
1785 prev = i; 1769 * (i.e. negative) priority increments the auto-assigned priority
1786 } 1770 * of any lower-priority swap_info_structs.
1787 p->next = i; 1771 * swap_avail_head needs to be priority ordered for get_swap_page(),
1788 if (prev < 0) 1772 * which allocates swap pages from the highest available priority
1789 swap_list.head = swap_list.next = p->type; 1773 * swap_info_struct.
1790 else 1774 */
1791 swap_info[prev]->next = p->type; 1775 plist_add(&p->list, &swap_active_head);
1776 spin_lock(&swap_avail_lock);
1777 plist_add(&p->avail_list, &swap_avail_head);
1778 spin_unlock(&swap_avail_lock);
1792} 1779}
1793 1780
1794static void enable_swap_info(struct swap_info_struct *p, int prio, 1781static void enable_swap_info(struct swap_info_struct *p, int prio,
@@ -1823,8 +1810,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
1823 struct address_space *mapping; 1810 struct address_space *mapping;
1824 struct inode *inode; 1811 struct inode *inode;
1825 struct filename *pathname; 1812 struct filename *pathname;
1826 int i, type, prev; 1813 int err, found = 0;
1827 int err;
1828 unsigned int old_block_size; 1814 unsigned int old_block_size;
1829 1815
1830 if (!capable(CAP_SYS_ADMIN)) 1816 if (!capable(CAP_SYS_ADMIN))
@@ -1842,17 +1828,16 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
1842 goto out; 1828 goto out;
1843 1829
1844 mapping = victim->f_mapping; 1830 mapping = victim->f_mapping;
1845 prev = -1;
1846 spin_lock(&swap_lock); 1831 spin_lock(&swap_lock);
1847 for (type = swap_list.head; type >= 0; type = swap_info[type]->next) { 1832 plist_for_each_entry(p, &swap_active_head, list) {
1848 p = swap_info[type];
1849 if (p->flags & SWP_WRITEOK) { 1833 if (p->flags & SWP_WRITEOK) {
1850 if (p->swap_file->f_mapping == mapping) 1834 if (p->swap_file->f_mapping == mapping) {
1835 found = 1;
1851 break; 1836 break;
1837 }
1852 } 1838 }
1853 prev = type;
1854 } 1839 }
1855 if (type < 0) { 1840 if (!found) {
1856 err = -EINVAL; 1841 err = -EINVAL;
1857 spin_unlock(&swap_lock); 1842 spin_unlock(&swap_lock);
1858 goto out_dput; 1843 goto out_dput;
@@ -1864,20 +1849,21 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
1864 spin_unlock(&swap_lock); 1849 spin_unlock(&swap_lock);
1865 goto out_dput; 1850 goto out_dput;
1866 } 1851 }
1867 if (prev < 0) 1852 spin_lock(&swap_avail_lock);
1868 swap_list.head = p->next; 1853 plist_del(&p->avail_list, &swap_avail_head);
1869 else 1854 spin_unlock(&swap_avail_lock);
1870 swap_info[prev]->next = p->next;
1871 if (type == swap_list.next) {
1872 /* just pick something that's safe... */
1873 swap_list.next = swap_list.head;
1874 }
1875 spin_lock(&p->lock); 1855 spin_lock(&p->lock);
1876 if (p->prio < 0) { 1856 if (p->prio < 0) {
1877 for (i = p->next; i >= 0; i = swap_info[i]->next) 1857 struct swap_info_struct *si = p;
1878 swap_info[i]->prio = p->prio--; 1858
1859 plist_for_each_entry_continue(si, &swap_active_head, list) {
1860 si->prio++;
1861 si->list.prio--;
1862 si->avail_list.prio--;
1863 }
1879 least_priority++; 1864 least_priority++;
1880 } 1865 }
1866 plist_del(&p->list, &swap_active_head);
1881 atomic_long_sub(p->pages, &nr_swap_pages); 1867 atomic_long_sub(p->pages, &nr_swap_pages);
1882 total_swap_pages -= p->pages; 1868 total_swap_pages -= p->pages;
1883 p->flags &= ~SWP_WRITEOK; 1869 p->flags &= ~SWP_WRITEOK;
@@ -1885,7 +1871,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
1885 spin_unlock(&swap_lock); 1871 spin_unlock(&swap_lock);
1886 1872
1887 set_current_oom_origin(); 1873 set_current_oom_origin();
1888 err = try_to_unuse(type, false, 0); /* force all pages to be unused */ 1874 err = try_to_unuse(p->type, false, 0); /* force unuse all pages */
1889 clear_current_oom_origin(); 1875 clear_current_oom_origin();
1890 1876
1891 if (err) { 1877 if (err) {
@@ -1926,7 +1912,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
1926 frontswap_map = frontswap_map_get(p); 1912 frontswap_map = frontswap_map_get(p);
1927 spin_unlock(&p->lock); 1913 spin_unlock(&p->lock);
1928 spin_unlock(&swap_lock); 1914 spin_unlock(&swap_lock);
1929 frontswap_invalidate_area(type); 1915 frontswap_invalidate_area(p->type);
1930 frontswap_map_set(p, NULL); 1916 frontswap_map_set(p, NULL);
1931 mutex_unlock(&swapon_mutex); 1917 mutex_unlock(&swapon_mutex);
1932 free_percpu(p->percpu_cluster); 1918 free_percpu(p->percpu_cluster);
@@ -1935,7 +1921,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
1935 vfree(cluster_info); 1921 vfree(cluster_info);
1936 vfree(frontswap_map); 1922 vfree(frontswap_map);
1937 /* Destroy swap account information */ 1923 /* Destroy swap account information */
1938 swap_cgroup_swapoff(type); 1924 swap_cgroup_swapoff(p->type);
1939 1925
1940 inode = mapping->host; 1926 inode = mapping->host;
1941 if (S_ISBLK(inode->i_mode)) { 1927 if (S_ISBLK(inode->i_mode)) {
@@ -2142,8 +2128,9 @@ static struct swap_info_struct *alloc_swap_info(void)
2142 */ 2128 */
2143 } 2129 }
2144 INIT_LIST_HEAD(&p->first_swap_extent.list); 2130 INIT_LIST_HEAD(&p->first_swap_extent.list);
2131 plist_node_init(&p->list, 0);
2132 plist_node_init(&p->avail_list, 0);
2145 p->flags = SWP_USED; 2133 p->flags = SWP_USED;
2146 p->next = -1;
2147 spin_unlock(&swap_lock); 2134 spin_unlock(&swap_lock);
2148 spin_lock_init(&p->lock); 2135 spin_lock_init(&p->lock);
2149 2136
diff --git a/mm/vmacache.c b/mm/vmacache.c
index 1037a3bab505..9f25af825dec 100644
--- a/mm/vmacache.c
+++ b/mm/vmacache.c
@@ -17,6 +17,16 @@ void vmacache_flush_all(struct mm_struct *mm)
17{ 17{
18 struct task_struct *g, *p; 18 struct task_struct *g, *p;
19 19
20 /*
21 * Single threaded tasks need not iterate the entire
22 * list of process. We can avoid the flushing as well
23 * since the mm's seqnum was increased and don't have
24 * to worry about other threads' seqnum. Current's
25 * flush will occur upon the next lookup.
26 */
27 if (atomic_read(&mm->mm_users) == 1)
28 return;
29
20 rcu_read_lock(); 30 rcu_read_lock();
21 for_each_process_thread(g, p) { 31 for_each_process_thread(g, p) {
22 /* 32 /*
@@ -78,6 +88,8 @@ struct vm_area_struct *vmacache_find(struct mm_struct *mm, unsigned long addr)
78 if (!vmacache_valid(mm)) 88 if (!vmacache_valid(mm))
79 return NULL; 89 return NULL;
80 90
91 count_vm_vmacache_event(VMACACHE_FIND_CALLS);
92
81 for (i = 0; i < VMACACHE_SIZE; i++) { 93 for (i = 0; i < VMACACHE_SIZE; i++) {
82 struct vm_area_struct *vma = current->vmacache[i]; 94 struct vm_area_struct *vma = current->vmacache[i];
83 95
@@ -85,8 +97,10 @@ struct vm_area_struct *vmacache_find(struct mm_struct *mm, unsigned long addr)
85 continue; 97 continue;
86 if (WARN_ON_ONCE(vma->vm_mm != mm)) 98 if (WARN_ON_ONCE(vma->vm_mm != mm))
87 break; 99 break;
88 if (vma->vm_start <= addr && vma->vm_end > addr) 100 if (vma->vm_start <= addr && vma->vm_end > addr) {
101 count_vm_vmacache_event(VMACACHE_FIND_HITS);
89 return vma; 102 return vma;
103 }
90 } 104 }
91 105
92 return NULL; 106 return NULL;
@@ -102,11 +116,15 @@ struct vm_area_struct *vmacache_find_exact(struct mm_struct *mm,
102 if (!vmacache_valid(mm)) 116 if (!vmacache_valid(mm))
103 return NULL; 117 return NULL;
104 118
119 count_vm_vmacache_event(VMACACHE_FIND_CALLS);
120
105 for (i = 0; i < VMACACHE_SIZE; i++) { 121 for (i = 0; i < VMACACHE_SIZE; i++) {
106 struct vm_area_struct *vma = current->vmacache[i]; 122 struct vm_area_struct *vma = current->vmacache[i];
107 123
108 if (vma && vma->vm_start == start && vma->vm_end == end) 124 if (vma && vma->vm_start == start && vma->vm_end == end) {
125 count_vm_vmacache_event(VMACACHE_FIND_HITS);
109 return vma; 126 return vma;
127 }
110 } 128 }
111 129
112 return NULL; 130 return NULL;
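
The vmacache change above skips the cross-task flush when mm_users == 1: bumping the mm's sequence number is enough, because the only thread that could hold stale entries will notice the mismatch on its next lookup. A stand-alone model of that sequence-number invalidation scheme, with invented names (the kernel's field names and cache size differ):

#include <stdio.h>

#define TOY_CACHE_SIZE 4

struct toy_mm   { unsigned int seqnum; };
struct toy_task {
        unsigned int seqnum;        /* generation the cache was filled at */
        int cache[TOY_CACHE_SIZE];  /* toy "vma" slots                    */
};

/* Invalidate every task's cache just by bumping the generation counter. */
static void toy_flush(struct toy_mm *mm) { mm->seqnum++; }

static int toy_lookup(struct toy_task *t, struct toy_mm *mm, int idx)
{
        if (t->seqnum != mm->seqnum) {
                /* Stale generation: drop everything and resync. */
                for (int i = 0; i < TOY_CACHE_SIZE; i++)
                        t->cache[i] = -1;
                t->seqnum = mm->seqnum;
        }
        return t->cache[idx];
}

int main(void)
{
        struct toy_mm mm = { .seqnum = 0 };
        struct toy_task t = { .seqnum = 0, .cache = { 7, 8, 9, 10 } };

        printf("before flush: %d\n", toy_lookup(&t, &mm, 0)); /* hit: 7   */
        toy_flush(&mm);                                       /* cheap: a counter bump */
        printf("after flush:  %d\n", toy_lookup(&t, &mm, 0)); /* miss: -1 */
        return 0;
}
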
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index bf233b283319..f64632b67196 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -1268,6 +1268,7 @@ void unmap_kernel_range(unsigned long addr, unsigned long size)
1268 vunmap_page_range(addr, end); 1268 vunmap_page_range(addr, end);
1269 flush_tlb_kernel_range(addr, end); 1269 flush_tlb_kernel_range(addr, end);
1270} 1270}
1271EXPORT_SYMBOL_GPL(unmap_kernel_range);
1271 1272
1272int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page ***pages) 1273int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page ***pages)
1273{ 1274{
@@ -1496,7 +1497,7 @@ void vfree(const void *addr)
1496 if (!addr) 1497 if (!addr)
1497 return; 1498 return;
1498 if (unlikely(in_interrupt())) { 1499 if (unlikely(in_interrupt())) {
1499 struct vfree_deferred *p = &__get_cpu_var(vfree_deferred); 1500 struct vfree_deferred *p = this_cpu_ptr(&vfree_deferred);
1500 if (llist_add((struct llist_node *)addr, &p->list)) 1501 if (llist_add((struct llist_node *)addr, &p->list))
1501 schedule_work(&p->wq); 1502 schedule_work(&p->wq);
1502 } else 1503 } else
@@ -2619,19 +2620,19 @@ static int s_show(struct seq_file *m, void *p)
2619 seq_printf(m, " phys=%llx", (unsigned long long)v->phys_addr); 2620 seq_printf(m, " phys=%llx", (unsigned long long)v->phys_addr);
2620 2621
2621 if (v->flags & VM_IOREMAP) 2622 if (v->flags & VM_IOREMAP)
2622 seq_printf(m, " ioremap"); 2623 seq_puts(m, " ioremap");
2623 2624
2624 if (v->flags & VM_ALLOC) 2625 if (v->flags & VM_ALLOC)
2625 seq_printf(m, " vmalloc"); 2626 seq_puts(m, " vmalloc");
2626 2627
2627 if (v->flags & VM_MAP) 2628 if (v->flags & VM_MAP)
2628 seq_printf(m, " vmap"); 2629 seq_puts(m, " vmap");
2629 2630
2630 if (v->flags & VM_USERMAP) 2631 if (v->flags & VM_USERMAP)
2631 seq_printf(m, " user"); 2632 seq_puts(m, " user");
2632 2633
2633 if (v->flags & VM_VPAGES) 2634 if (v->flags & VM_VPAGES)
2634 seq_printf(m, " vpages"); 2635 seq_puts(m, " vpages");
2635 2636
2636 show_numa_info(m, v); 2637 show_numa_info(m, v);
2637 seq_putc(m, '\n'); 2638 seq_putc(m, '\n');
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 32c661d66a45..9149444f947d 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -324,7 +324,7 @@ shrink_slab_node(struct shrink_control *shrinkctl, struct shrinker *shrinker,
324 else 324 else
325 new_nr = atomic_long_read(&shrinker->nr_deferred[nid]); 325 new_nr = atomic_long_read(&shrinker->nr_deferred[nid]);
326 326
327 trace_mm_shrink_slab_end(shrinker, freed, nr, new_nr); 327 trace_mm_shrink_slab_end(shrinker, nid, freed, nr, new_nr, total_scan);
328 return freed; 328 return freed;
329} 329}
330 330
@@ -1121,7 +1121,7 @@ keep:
1121 VM_BUG_ON_PAGE(PageLRU(page) || PageUnevictable(page), page); 1121 VM_BUG_ON_PAGE(PageLRU(page) || PageUnevictable(page), page);
1122 } 1122 }
1123 1123
1124 free_hot_cold_page_list(&free_pages, 1); 1124 free_hot_cold_page_list(&free_pages, true);
1125 1125
1126 list_splice(&ret_pages, page_list); 1126 list_splice(&ret_pages, page_list);
1127 count_vm_events(PGACTIVATE, pgactivate); 1127 count_vm_events(PGACTIVATE, pgactivate);
@@ -1439,6 +1439,19 @@ putback_inactive_pages(struct lruvec *lruvec, struct list_head *page_list)
1439} 1439}
1440 1440
1441/* 1441/*
1442 * If a kernel thread (such as nfsd for loop-back mounts) services
1443 * a backing device by writing to the page cache it sets PF_LESS_THROTTLE.
1444 * In that case we should only throttle if the backing device it is
1445 * writing to is congested. In other cases it is safe to throttle.
1446 */
1447static int current_may_throttle(void)
1448{
1449 return !(current->flags & PF_LESS_THROTTLE) ||
1450 current->backing_dev_info == NULL ||
1451 bdi_write_congested(current->backing_dev_info);
1452}
1453
1454/*
1442 * shrink_inactive_list() is a helper for shrink_zone(). It returns the number 1455 * shrink_inactive_list() is a helper for shrink_zone(). It returns the number
1443 * of reclaimed pages 1456 * of reclaimed pages
1444 */ 1457 */
@@ -1519,7 +1532,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
1519 1532
1520 spin_unlock_irq(&zone->lru_lock); 1533 spin_unlock_irq(&zone->lru_lock);
1521 1534
1522 free_hot_cold_page_list(&page_list, 1); 1535 free_hot_cold_page_list(&page_list, true);
1523 1536
1524 /* 1537 /*
1525 * If reclaim is isolating dirty pages under writeback, it implies 1538 * If reclaim is isolating dirty pages under writeback, it implies
@@ -1566,7 +1579,8 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
1566 * implies that pages are cycling through the LRU faster than 1579 * implies that pages are cycling through the LRU faster than
1567 * they are written so also forcibly stall. 1580 * they are written so also forcibly stall.
1568 */ 1581 */
1569 if (nr_unqueued_dirty == nr_taken || nr_immediate) 1582 if ((nr_unqueued_dirty == nr_taken || nr_immediate) &&
1583 current_may_throttle())
1570 congestion_wait(BLK_RW_ASYNC, HZ/10); 1584 congestion_wait(BLK_RW_ASYNC, HZ/10);
1571 } 1585 }
1572 1586
@@ -1575,7 +1589,8 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
1575 * is congested. Allow kswapd to continue until it starts encountering 1589 * is congested. Allow kswapd to continue until it starts encountering
1576 * unqueued dirty pages or cycling through the LRU too quickly. 1590 * unqueued dirty pages or cycling through the LRU too quickly.
1577 */ 1591 */
1578 if (!sc->hibernation_mode && !current_is_kswapd()) 1592 if (!sc->hibernation_mode && !current_is_kswapd() &&
1593 current_may_throttle())
1579 wait_iff_congested(zone, BLK_RW_ASYNC, HZ/10); 1594 wait_iff_congested(zone, BLK_RW_ASYNC, HZ/10);
1580 1595
1581 trace_mm_vmscan_lru_shrink_inactive(zone->zone_pgdat->node_id, 1596 trace_mm_vmscan_lru_shrink_inactive(zone->zone_pgdat->node_id,
@@ -1740,7 +1755,7 @@ static void shrink_active_list(unsigned long nr_to_scan,
1740 __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken); 1755 __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken);
1741 spin_unlock_irq(&zone->lru_lock); 1756 spin_unlock_irq(&zone->lru_lock);
1742 1757
1743 free_hot_cold_page_list(&l_hold, 1); 1758 free_hot_cold_page_list(&l_hold, true);
1744} 1759}
1745 1760
1746#ifdef CONFIG_SWAP 1761#ifdef CONFIG_SWAP
@@ -1866,6 +1881,8 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
1866 bool force_scan = false; 1881 bool force_scan = false;
1867 unsigned long ap, fp; 1882 unsigned long ap, fp;
1868 enum lru_list lru; 1883 enum lru_list lru;
1884 bool some_scanned;
1885 int pass;
1869 1886
1870 /* 1887 /*
1871 * If the zone or memcg is small, nr[l] can be 0. This 1888 * If the zone or memcg is small, nr[l] can be 0. This
@@ -1989,39 +2006,49 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
1989 fraction[1] = fp; 2006 fraction[1] = fp;
1990 denominator = ap + fp + 1; 2007 denominator = ap + fp + 1;
1991out: 2008out:
1992 for_each_evictable_lru(lru) { 2009 some_scanned = false;
1993 int file = is_file_lru(lru); 2010 /* Only use force_scan on second pass. */
1994 unsigned long size; 2011 for (pass = 0; !some_scanned && pass < 2; pass++) {
1995 unsigned long scan; 2012 for_each_evictable_lru(lru) {
2013 int file = is_file_lru(lru);
2014 unsigned long size;
2015 unsigned long scan;
1996 2016
1997 size = get_lru_size(lruvec, lru); 2017 size = get_lru_size(lruvec, lru);
1998 scan = size >> sc->priority; 2018 scan = size >> sc->priority;
1999 2019
2000 if (!scan && force_scan) 2020 if (!scan && pass && force_scan)
2001 scan = min(size, SWAP_CLUSTER_MAX); 2021 scan = min(size, SWAP_CLUSTER_MAX);
2002 2022
2003 switch (scan_balance) { 2023 switch (scan_balance) {
2004 case SCAN_EQUAL: 2024 case SCAN_EQUAL:
2005 /* Scan lists relative to size */ 2025 /* Scan lists relative to size */
2006 break; 2026 break;
2007 case SCAN_FRACT: 2027 case SCAN_FRACT:
2028 /*
2029 * Scan types proportional to swappiness and
2030 * their relative recent reclaim efficiency.
2031 */
2032 scan = div64_u64(scan * fraction[file],
2033 denominator);
2034 break;
2035 case SCAN_FILE:
2036 case SCAN_ANON:
2037 /* Scan one type exclusively */
2038 if ((scan_balance == SCAN_FILE) != file)
2039 scan = 0;
2040 break;
2041 default:
2042 /* Look ma, no brain */
2043 BUG();
2044 }
2045 nr[lru] = scan;
2008 /* 2046 /*
2009 * Scan types proportional to swappiness and 2047 * Skip the second pass and don't force_scan,
2010 * their relative recent reclaim efficiency. 2048 * if we found something to scan.
2011 */ 2049 */
2012 scan = div64_u64(scan * fraction[file], denominator); 2050 some_scanned |= !!scan;
2013 break;
2014 case SCAN_FILE:
2015 case SCAN_ANON:
2016 /* Scan one type exclusively */
2017 if ((scan_balance == SCAN_FILE) != file)
2018 scan = 0;
2019 break;
2020 default:
2021 /* Look ma, no brain */
2022 BUG();
2023 } 2051 }
2024 nr[lru] = scan;
2025 } 2052 }
2026} 2053}
2027 2054
@@ -2037,13 +2064,27 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
2037 unsigned long nr_reclaimed = 0; 2064 unsigned long nr_reclaimed = 0;
2038 unsigned long nr_to_reclaim = sc->nr_to_reclaim; 2065 unsigned long nr_to_reclaim = sc->nr_to_reclaim;
2039 struct blk_plug plug; 2066 struct blk_plug plug;
2040 bool scan_adjusted = false; 2067 bool scan_adjusted;
2041 2068
2042 get_scan_count(lruvec, sc, nr); 2069 get_scan_count(lruvec, sc, nr);
2043 2070
2044 /* Record the original scan target for proportional adjustments later */ 2071 /* Record the original scan target for proportional adjustments later */
2045 memcpy(targets, nr, sizeof(nr)); 2072 memcpy(targets, nr, sizeof(nr));
2046 2073
2074 /*
2075 * Global reclaiming within direct reclaim at DEF_PRIORITY is a normal
2076 * event that can occur when there is little memory pressure e.g.
2077 * multiple streaming readers/writers. Hence, we do not abort scanning
2078 * when the requested number of pages are reclaimed when scanning at
2079 * DEF_PRIORITY on the assumption that the fact we are direct
2080 * reclaiming implies that kswapd is not keeping up and it is best to
2081 * do a batch of work at once. For memcg reclaim one check is made to
2082 * abort proportional reclaim if either the file or anon lru has already
2083 * dropped to zero at the first pass.
2084 */
2085 scan_adjusted = (global_reclaim(sc) && !current_is_kswapd() &&
2086 sc->priority == DEF_PRIORITY);
2087
2047 blk_start_plug(&plug); 2088 blk_start_plug(&plug);
2048 while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || 2089 while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
2049 nr[LRU_INACTIVE_FILE]) { 2090 nr[LRU_INACTIVE_FILE]) {
@@ -2064,17 +2105,8 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
2064 continue; 2105 continue;
2065 2106
2066 /* 2107 /*
2067 * For global direct reclaim, reclaim only the number of pages
2068 * requested. Less care is taken to scan proportionally as it
2069 * is more important to minimise direct reclaim stall latency
2070 * than it is to properly age the LRU lists.
2071 */
2072 if (global_reclaim(sc) && !current_is_kswapd())
2073 break;
2074
2075 /*
2076 * For kswapd and memcg, reclaim at least the number of pages 2108 * For kswapd and memcg, reclaim at least the number of pages
2077 * requested. Ensure that the anon and file LRUs shrink 2109 * requested. Ensure that the anon and file LRUs are scanned
2078 * proportionally what was requested by get_scan_count(). We 2110 * proportionally what was requested by get_scan_count(). We
2079 * stop reclaiming one LRU and reduce the amount scanning 2111 * stop reclaiming one LRU and reduce the amount scanning
2080 * proportional to the original scan target. 2112 * proportional to the original scan target.
@@ -2082,6 +2114,15 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
2082 nr_file = nr[LRU_INACTIVE_FILE] + nr[LRU_ACTIVE_FILE]; 2114 nr_file = nr[LRU_INACTIVE_FILE] + nr[LRU_ACTIVE_FILE];
2083 nr_anon = nr[LRU_INACTIVE_ANON] + nr[LRU_ACTIVE_ANON]; 2115 nr_anon = nr[LRU_INACTIVE_ANON] + nr[LRU_ACTIVE_ANON];
2084 2116
2117 /*
2118 * It's just vindictive to attack the larger once the smaller
2119 * has gone to zero. And given the way we stop scanning the
2120 * smaller below, this makes sure that we only make one nudge
2121 * towards proportionality once we've got nr_to_reclaim.
2122 */
2123 if (!nr_file || !nr_anon)
2124 break;
2125
2085 if (nr_file > nr_anon) { 2126 if (nr_file > nr_anon) {
2086 unsigned long scan_target = targets[LRU_INACTIVE_ANON] + 2127 unsigned long scan_target = targets[LRU_INACTIVE_ANON] +
2087 targets[LRU_ACTIVE_ANON] + 1; 2128 targets[LRU_ACTIVE_ANON] + 1;
@@ -2268,9 +2309,8 @@ static inline bool compaction_ready(struct zone *zone, struct scan_control *sc)
2268 * there is a buffer of free pages available to give compaction 2309 * there is a buffer of free pages available to give compaction
2269 * a reasonable chance of completing and allocating the page 2310 * a reasonable chance of completing and allocating the page
2270 */ 2311 */
2271 balance_gap = min(low_wmark_pages(zone), 2312 balance_gap = min(low_wmark_pages(zone), DIV_ROUND_UP(
2272 (zone->managed_pages + KSWAPD_ZONE_BALANCE_GAP_RATIO-1) / 2313 zone->managed_pages, KSWAPD_ZONE_BALANCE_GAP_RATIO));
2273 KSWAPD_ZONE_BALANCE_GAP_RATIO);
2274 watermark = high_wmark_pages(zone) + balance_gap + (2UL << sc->order); 2314 watermark = high_wmark_pages(zone) + balance_gap + (2UL << sc->order);
2275 watermark_ok = zone_watermark_ok_safe(zone, 0, watermark, 0, 0); 2315 watermark_ok = zone_watermark_ok_safe(zone, 0, watermark, 0, 0);
2276 2316
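
The balance_gap computation is rewritten with DIV_ROUND_UP(), which is the usual (n + d - 1) / d ceiling division that the old expression spelled out by hand. A quick stand-alone check of the equivalence, assuming the balance-gap ratio of 100 (1% of the zone, per the related comment further down):

#include <assert.h>
#include <stdio.h>

#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

int main(void)
{
        unsigned long managed_pages = 123456;
        unsigned long ratio = 100;

        unsigned long old_style = (managed_pages + ratio - 1) / ratio;
        unsigned long new_style = DIV_ROUND_UP(managed_pages, ratio);

        assert(old_style == new_style);
        printf("balance gap candidate: %lu pages\n", new_style);
        return 0;
}
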
@@ -2525,10 +2565,17 @@ static bool pfmemalloc_watermark_ok(pg_data_t *pgdat)
2525 2565
2526 for (i = 0; i <= ZONE_NORMAL; i++) { 2566 for (i = 0; i <= ZONE_NORMAL; i++) {
2527 zone = &pgdat->node_zones[i]; 2567 zone = &pgdat->node_zones[i];
2568 if (!populated_zone(zone))
2569 continue;
2570
2528 pfmemalloc_reserve += min_wmark_pages(zone); 2571 pfmemalloc_reserve += min_wmark_pages(zone);
2529 free_pages += zone_page_state(zone, NR_FREE_PAGES); 2572 free_pages += zone_page_state(zone, NR_FREE_PAGES);
2530 } 2573 }
2531 2574
2575 /* If there are no reserves (unexpected config) then do not throttle */
2576 if (!pfmemalloc_reserve)
2577 return true;
2578
2532 wmark_ok = free_pages > pfmemalloc_reserve / 2; 2579 wmark_ok = free_pages > pfmemalloc_reserve / 2;
2533 2580
2534 /* kswapd must be awake if processes are being throttled */ 2581 /* kswapd must be awake if processes are being throttled */
@@ -2553,9 +2600,9 @@ static bool pfmemalloc_watermark_ok(pg_data_t *pgdat)
2553static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist, 2600static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
2554 nodemask_t *nodemask) 2601 nodemask_t *nodemask)
2555{ 2602{
2603 struct zoneref *z;
2556 struct zone *zone; 2604 struct zone *zone;
2557 int high_zoneidx = gfp_zone(gfp_mask); 2605 pg_data_t *pgdat = NULL;
2558 pg_data_t *pgdat;
2559 2606
2560 /* 2607 /*
2561 * Kernel threads should not be throttled as they may be indirectly 2608 * Kernel threads should not be throttled as they may be indirectly
@@ -2574,10 +2621,34 @@ static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
2574 if (fatal_signal_pending(current)) 2621 if (fatal_signal_pending(current))
2575 goto out; 2622 goto out;
2576 2623
2577 /* Check if the pfmemalloc reserves are ok */ 2624 /*
2578 first_zones_zonelist(zonelist, high_zoneidx, NULL, &zone); 2625 * Check if the pfmemalloc reserves are ok by finding the first node
2579 pgdat = zone->zone_pgdat; 2626 * with a usable ZONE_NORMAL or lower zone. The expectation is that
2580 if (pfmemalloc_watermark_ok(pgdat)) 2627 * GFP_KERNEL will be required for allocating network buffers when
2628 * swapping over the network so ZONE_HIGHMEM is unusable.
2629 *
2630 * Throttling is based on the first usable node and throttled processes
2631 * wait on a queue until kswapd makes progress and wakes them. There
2632 * is an affinity then between processes waking up and where reclaim
2633 * progress has been made assuming the process wakes on the same node.
2634 * More importantly, processes running on remote nodes will not compete
2635 * for remote pfmemalloc reserves and processes on different nodes
2636 * should make reasonable progress.
2637 */
2638 for_each_zone_zonelist_nodemask(zone, z, zonelist,
2639 gfp_mask, nodemask) {
2640 if (zone_idx(zone) > ZONE_NORMAL)
2641 continue;
2642
2643 /* Throttle based on the first usable node */
2644 pgdat = zone->zone_pgdat;
2645 if (pfmemalloc_watermark_ok(pgdat))
2646 goto out;
2647 break;
2648 }
2649
2650 /* If no zone was usable by the allocation flags then do not throttle */
2651 if (!pgdat)
2581 goto out; 2652 goto out;
2582 2653
2583 /* Account for the throttling */ 2654 /* Account for the throttling */
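
throttle_direct_reclaim() now walks the zonelist in allocation order, ignores zones above ZONE_NORMAL, and bases the throttling decision on the pgdat of the first usable zone; if the allocation cannot use any such zone, it never throttles. A stand-alone model of that selection rule, with a toy zone list and invented indices (the real code additionally checks the pfmemalloc watermark before throttling):

#include <stdio.h>

enum toy_zone_idx { TOY_ZONE_DMA, TOY_ZONE_NORMAL, TOY_ZONE_HIGHMEM };

struct toy_zone { enum toy_zone_idx idx; int node; };

/* Return the node to throttle on, or -1 for "do not throttle". */
static int pick_throttle_node(const struct toy_zone *zl, int nr)
{
        for (int i = 0; i < nr; i++) {
                if (zl[i].idx > TOY_ZONE_NORMAL)
                        continue;      /* highmem is no use for GFP_KERNEL */
                return zl[i].node;     /* throttle on the first usable node */
        }
        return -1;                     /* nothing usable: never throttle */
}

int main(void)
{
        struct toy_zone zonelist[] = {
                { TOY_ZONE_HIGHMEM, 0 },
                { TOY_ZONE_NORMAL,  0 },
                { TOY_ZONE_NORMAL,  1 },
        };

        printf("throttle on node %d\n", pick_throttle_node(zonelist, 3));
        return 0;
}
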
@@ -2891,9 +2962,8 @@ static bool kswapd_shrink_zone(struct zone *zone,
2891 * high wmark plus a "gap" where the gap is either the low 2962 * high wmark plus a "gap" where the gap is either the low
2892 * watermark or 1% of the zone, whichever is smaller. 2963 * watermark or 1% of the zone, whichever is smaller.
2893 */ 2964 */
2894 balance_gap = min(low_wmark_pages(zone), 2965 balance_gap = min(low_wmark_pages(zone), DIV_ROUND_UP(
2895 (zone->managed_pages + KSWAPD_ZONE_BALANCE_GAP_RATIO-1) / 2966 zone->managed_pages, KSWAPD_ZONE_BALANCE_GAP_RATIO));
2896 KSWAPD_ZONE_BALANCE_GAP_RATIO);
2897 2967
2898 /* 2968 /*
2899 * If there is no low memory pressure or the zone is balanced then no 2969 * If there is no low memory pressure or the zone is balanced then no
@@ -3422,7 +3492,7 @@ int kswapd_run(int nid)
3422 3492
3423/* 3493/*
3424 * Called by memory hotplug when all memory in a node is offlined. Caller must 3494 * Called by memory hotplug when all memory in a node is offlined. Caller must
3425 * hold lock_memory_hotplug(). 3495 * hold mem_hotplug_begin/end().
3426 */ 3496 */
3427void kswapd_stop(int nid) 3497void kswapd_stop(int nid)
3428{ 3498{
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 302dd076b8bf..b37bd49bfd55 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -207,7 +207,9 @@ void set_pgdat_percpu_threshold(pg_data_t *pgdat,
207} 207}
208 208
209/* 209/*
210 * For use when we know that interrupts are disabled. 210 * For use when we know that interrupts are disabled,
211 * or when we know that preemption is disabled and that
212 * particular counter cannot be updated from interrupt context.
211 */ 213 */
212void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item, 214void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
213 int delta) 215 int delta)
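The widened comment spells out the rule for the double-underscore stat helpers: they perform a plain read-modify-write, so they are only safe with interrupts disabled, or with preemption disabled when no interrupt handler can update the same counter; otherwise callers use the irq-safe mod_zone_page_state() wrapper. A hedged, self-contained sketch of the two calling conventions; the types and macros below are userspace stand-ins, not the <linux/vmstat.h> API:

#include <stdio.h>

/* Userspace stand-ins so the calling pattern compiles and runs. */
enum zone_stat_item { NR_FILE_PAGES, NR_ZONE_STAT_ITEMS };
struct zone { long vm_stat[NR_ZONE_STAT_ITEMS]; };

#define local_irq_save(flags)		((void)((flags) = 1))
#define local_irq_restore(flags)	((void)(flags))

/* Non-atomic read-modify-write: safe only when nothing can interleave. */
static void __mod_zone_page_state(struct zone *z, enum zone_stat_item item, int delta)
{
	z->vm_stat[item] += delta;
}

/* The pattern the kernel's irq-safe mod_zone_page_state() wrapper follows. */
static void mod_zone_page_state(struct zone *z, enum zone_stat_item item, int delta)
{
	unsigned long flags;

	local_irq_save(flags);
	__mod_zone_page_state(z, item, delta);
	local_irq_restore(flags);
}

int main(void)
{
	struct zone zone = { { 0 } };

	mod_zone_page_state(&zone, NR_FILE_PAGES, 3);	/* callable from any context */
	__mod_zone_page_state(&zone, NR_FILE_PAGES, 1);	/* irqs or preemption already off */
	printf("NR_FILE_PAGES = %ld\n", zone.vm_stat[NR_FILE_PAGES]);
	return 0;
}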
@@ -489,7 +491,7 @@ static void refresh_cpu_vm_stats(void)
489 continue; 491 continue;
490 492
491 if (__this_cpu_read(p->pcp.count)) 493 if (__this_cpu_read(p->pcp.count))
492 drain_zone_pages(zone, __this_cpu_ptr(&p->pcp)); 494 drain_zone_pages(zone, this_cpu_ptr(&p->pcp));
493#endif 495#endif
494 } 496 }
495 fold_diff(global_diff); 497 fold_diff(global_diff);
@@ -866,6 +868,10 @@ const char * const vmstat_text[] = {
866 "nr_tlb_local_flush_one", 868 "nr_tlb_local_flush_one",
867#endif /* CONFIG_DEBUG_TLBFLUSH */ 869#endif /* CONFIG_DEBUG_TLBFLUSH */
868 870
871#ifdef CONFIG_DEBUG_VM_VMACACHE
872 "vmacache_find_calls",
873 "vmacache_find_hits",
874#endif
869#endif /* CONFIG_VM_EVENTS_COUNTERS */ 875#endif /* CONFIG_VM_EVENTS_COUNTERS */
870}; 876};
871#endif /* CONFIG_PROC_FS || CONFIG_SYSFS || CONFIG_NUMA */ 877#endif /* CONFIG_PROC_FS || CONFIG_SYSFS || CONFIG_NUMA */
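The two new strings are exported through /proc/vmstat when CONFIG_DEBUG_VM_VMACACHE is enabled, so the per-thread VMA-cache hit rate can be derived from them. A small sketch that reads the counters back and prints the ratio; the parsing below is an illustration, not kernel code:

#include <stdio.h>
#include <string.h>

int main(void)
{
	FILE *f = fopen("/proc/vmstat", "r");
	char name[64];
	unsigned long long value, calls = 0, hits = 0;

	if (!f) {
		perror("/proc/vmstat");
		return 1;
	}
	while (fscanf(f, "%63s %llu", name, &value) == 2) {
		if (!strcmp(name, "vmacache_find_calls"))
			calls = value;
		else if (!strcmp(name, "vmacache_find_hits"))
			hits = value;
	}
	fclose(f);

	if (calls)	/* the fields only exist with CONFIG_DEBUG_VM_VMACACHE=y */
		printf("vmacache hit rate: %.1f%%\n", 100.0 * hits / calls);
	else
		printf("vmacache counters not exported on this kernel\n");
	return 0;
}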
@@ -1226,7 +1232,7 @@ int sysctl_stat_interval __read_mostly = HZ;
1226static void vmstat_update(struct work_struct *w) 1232static void vmstat_update(struct work_struct *w)
1227{ 1233{
1228 refresh_cpu_vm_stats(); 1234 refresh_cpu_vm_stats();
1229 schedule_delayed_work(&__get_cpu_var(vmstat_work), 1235 schedule_delayed_work(this_cpu_ptr(&vmstat_work),
1230 round_jiffies_relative(sysctl_stat_interval)); 1236 round_jiffies_relative(sysctl_stat_interval));
1231} 1237}
1232 1238
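Both conversions in this file follow the same mechanical rule: the old __get_cpu_var(var) accessor yields an lvalue for this CPU's copy, so taking its address becomes this_cpu_ptr(&var). A tiny stand-alone illustration of the rewrite; the macros below are single-CPU userspace stand-ins, not the <linux/percpu.h> implementation:

#include <stdio.h>

/* Single-CPU stand-ins; the real macros do per-CPU offset arithmetic. */
#define DEFINE_PER_CPU(type, name)	type name
#define __get_cpu_var(var)		(var)	/* old accessor: lvalue of this CPU's copy */
#define this_cpu_ptr(ptr)		(ptr)	/* new accessor: pointer to this CPU's copy */

struct delayed_work { int pending; };
static DEFINE_PER_CPU(struct delayed_work, vmstat_work);

int main(void)
{
	struct delayed_work *old_style = &__get_cpu_var(vmstat_work);
	struct delayed_work *new_style = this_cpu_ptr(&vmstat_work);

	/* Both expressions name the same per-CPU object, which is why the
	 * patch can substitute one for the other without behaviour change. */
	printf("same object: %s\n", old_style == new_style ? "yes" : "no");
	return 0;
}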
diff --git a/mm/zbud.c b/mm/zbud.c
index 9451361e6aa7..01df13a7e2e1 100644
--- a/mm/zbud.c
+++ b/mm/zbud.c
@@ -247,7 +247,7 @@ void zbud_destroy_pool(struct zbud_pool *pool)
247 * gfp arguments are invalid or -ENOMEM if the pool was unable to allocate 247 * gfp arguments are invalid or -ENOMEM if the pool was unable to allocate
248 * a new page. 248 * a new page.
249 */ 249 */
250int zbud_alloc(struct zbud_pool *pool, int size, gfp_t gfp, 250int zbud_alloc(struct zbud_pool *pool, unsigned int size, gfp_t gfp,
251 unsigned long *handle) 251 unsigned long *handle)
252{ 252{
253 int chunks, i, freechunks; 253 int chunks, i, freechunks;
@@ -255,7 +255,7 @@ int zbud_alloc(struct zbud_pool *pool, int size, gfp_t gfp,
255 enum buddy bud; 255 enum buddy bud;
256 struct page *page; 256 struct page *page;
257 257
258 if (size <= 0 || gfp & __GFP_HIGHMEM) 258 if (!size || (gfp & __GFP_HIGHMEM))
259 return -EINVAL; 259 return -EINVAL;
260 if (size > PAGE_SIZE - ZHDR_SIZE_ALIGNED - CHUNK_SIZE) 260 if (size > PAGE_SIZE - ZHDR_SIZE_ALIGNED - CHUNK_SIZE)
261 return -ENOSPC; 261 return -ENOSPC;
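With size now unsigned, the old `size <= 0` test could never see a negative value, so the hunk keeps only the meaningful zero check, and requests too large to share a page keep returning -ENOSPC. A standalone sketch of just that validation (the __GFP_HIGHMEM test is left out, and the header and chunk sizes are placeholders; the real values come from CHUNK_SIZE and struct zbud_header in mm/zbud.c):

#include <errno.h>
#include <stdio.h>

#define PAGE_SIZE		4096UL
#define CHUNK_SIZE		64UL		/* placeholder: zbud carves a page into chunks */
#define ZHDR_SIZE_ALIGNED	CHUNK_SIZE	/* placeholder for the aligned header size */

/* Mirrors only the argument validation from zbud_alloc() after the change. */
static int zbud_alloc_check(unsigned int size)
{
	if (!size)
		return -EINVAL;
	if (size > PAGE_SIZE - ZHDR_SIZE_ALIGNED - CHUNK_SIZE)
		return -ENOSPC;
	return 0;
}

int main(void)
{
	printf("size 0    -> %d\n", zbud_alloc_check(0));	/* -EINVAL */
	printf("size 2048 -> %d\n", zbud_alloc_check(2048));	/* fits */
	printf("size 4095 -> %d\n", zbud_alloc_check(4095));	/* -ENOSPC */
	return 0;
}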
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index 36b4591a7a2d..fe78189624cf 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -141,7 +141,7 @@
141#define ZS_MAX_ALLOC_SIZE PAGE_SIZE 141#define ZS_MAX_ALLOC_SIZE PAGE_SIZE
142 142
143/* 143/*
144 * On systems with 4K page size, this gives 254 size classes! There is a 144 * On systems with 4K page size, this gives 255 size classes! There is a
145 * trade-off here: 145 * trade-off here:
146 * - Large number of size classes is potentially wasteful as free pages are 146 * - Large number of size classes is potentially wasteful as free pages are
147 * spread across these classes 147 * spread across these classes
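The corrected comment matches the arithmetic of the class table: with 4K pages, objects range from a 32-byte minimum to PAGE_SIZE in 16-byte steps, giving (4096 - 32) / 16 + 1 = 255 classes. A quick check using those values; ZS_MIN_ALLOC_SIZE and ZS_SIZE_CLASS_DELTA are quoted here from mm/zsmalloc.c of that era and should be treated as assumptions:

#include <stdio.h>

#define PAGE_SIZE		4096	/* assumption: 4K pages, as in the comment */
#define ZS_MIN_ALLOC_SIZE	32	/* assumption: smallest zsmalloc object */
#define ZS_MAX_ALLOC_SIZE	PAGE_SIZE
#define ZS_SIZE_CLASS_DELTA	16	/* assumption: spacing between classes */
#define ZS_SIZE_CLASSES		((ZS_MAX_ALLOC_SIZE - ZS_MIN_ALLOC_SIZE) / \
				 ZS_SIZE_CLASS_DELTA + 1)

int main(void)
{
	/* (4096 - 32) / 16 = 254 steps, plus the class at the minimum size. */
	printf("size classes: %d\n", ZS_SIZE_CLASSES);	/* prints 255 */
	return 0;
}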
@@ -1082,7 +1082,7 @@ void zs_unmap_object(struct zs_pool *pool, unsigned long handle)
1082 class = &pool->size_class[class_idx]; 1082 class = &pool->size_class[class_idx];
1083 off = obj_idx_to_offset(page, obj_idx, class->size); 1083 off = obj_idx_to_offset(page, obj_idx, class->size);
1084 1084
1085 area = &__get_cpu_var(zs_map_area); 1085 area = this_cpu_ptr(&zs_map_area);
1086 if (off + class->size <= PAGE_SIZE) 1086 if (off + class->size <= PAGE_SIZE)
1087 kunmap_atomic(area->vm_addr); 1087 kunmap_atomic(area->vm_addr);
1088 else { 1088 else {
diff --git a/mm/zswap.c b/mm/zswap.c
index aeaef0fb5624..008388fe7b0f 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -347,7 +347,7 @@ static int __zswap_cpu_notifier(unsigned long action, unsigned long cpu)
347 return NOTIFY_BAD; 347 return NOTIFY_BAD;
348 } 348 }
349 *per_cpu_ptr(zswap_comp_pcpu_tfms, cpu) = tfm; 349 *per_cpu_ptr(zswap_comp_pcpu_tfms, cpu) = tfm;
350 dst = kmalloc(PAGE_SIZE * 2, GFP_KERNEL); 350 dst = kmalloc_node(PAGE_SIZE * 2, GFP_KERNEL, cpu_to_node(cpu));
351 if (!dst) { 351 if (!dst) {
352 pr_err("can't allocate compressor buffer\n"); 352 pr_err("can't allocate compressor buffer\n");
353 crypto_free_comp(tfm); 353 crypto_free_comp(tfm);