diff options
Diffstat (limited to 'mm')
-rw-r--r-- | mm/Kconfig | 33 | ||||
-rw-r--r-- | mm/Makefile | 3 | ||||
-rw-r--r-- | mm/backing-dev.c | 2 | ||||
-rw-r--r-- | mm/bounce.c | 287 | ||||
-rw-r--r-- | mm/compaction.c | 249 | ||||
-rw-r--r-- | mm/dmapool.c | 31 | ||||
-rw-r--r-- | mm/filemap.c | 248 | ||||
-rw-r--r-- | mm/fremap.c | 7 | ||||
-rw-r--r-- | mm/frontswap.c | 13 | ||||
-rw-r--r-- | mm/gup.c | 662 | ||||
-rw-r--r-- | mm/huge_memory.c | 34 | ||||
-rw-r--r-- | mm/hugetlb.c | 363 | ||||
-rw-r--r-- | mm/internal.h | 36 | ||||
-rw-r--r-- | mm/kmemleak.c | 4 | ||||
-rw-r--r-- | mm/madvise.c | 2 | ||||
-rw-r--r-- | mm/memblock.c | 231 | ||||
-rw-r--r-- | mm/memcontrol.c | 410 | ||||
-rw-r--r-- | mm/memory-failure.c | 113 | ||||
-rw-r--r-- | mm/memory.c | 746 | ||||
-rw-r--r-- | mm/memory_hotplug.c | 148 | ||||
-rw-r--r-- | mm/mempolicy.c | 30 | ||||
-rw-r--r-- | mm/mempool.c | 2 | ||||
-rw-r--r-- | mm/migrate.c | 63 | ||||
-rw-r--r-- | mm/mmap.c | 9 | ||||
-rw-r--r-- | mm/msync.c | 8 | ||||
-rw-r--r-- | mm/page-writeback.c | 22 | ||||
-rw-r--r-- | mm/page_alloc.c | 394 | ||||
-rw-r--r-- | mm/page_io.c | 21 | ||||
-rw-r--r-- | mm/rmap.c | 55 | ||||
-rw-r--r-- | mm/shmem.c | 8 | ||||
-rw-r--r-- | mm/slab.c | 45 | ||||
-rw-r--r-- | mm/slab.h | 48 | ||||
-rw-r--r-- | mm/slab_common.c | 95 | ||||
-rw-r--r-- | mm/slob.c | 3 | ||||
-rw-r--r-- | mm/slub.c | 225 | ||||
-rw-r--r-- | mm/swap.c | 238 | ||||
-rw-r--r-- | mm/swap_state.c | 2 | ||||
-rw-r--r-- | mm/swapfile.c | 253 | ||||
-rw-r--r-- | mm/vmacache.c | 22 | ||||
-rw-r--r-- | mm/vmalloc.c | 13 | ||||
-rw-r--r-- | mm/vmscan.c | 184 | ||||
-rw-r--r-- | mm/vmstat.c | 12 | ||||
-rw-r--r-- | mm/zbud.c | 4 | ||||
-rw-r--r-- | mm/zsmalloc.c | 4 | ||||
-rw-r--r-- | mm/zswap.c | 2 |
45 files changed, 2789 insertions, 2595 deletions
diff --git a/mm/Kconfig b/mm/Kconfig index ebe5880c29d6..3e9977a9d657 100644 --- a/mm/Kconfig +++ b/mm/Kconfig | |||
@@ -134,6 +134,9 @@ config HAVE_MEMBLOCK | |||
134 | config HAVE_MEMBLOCK_NODE_MAP | 134 | config HAVE_MEMBLOCK_NODE_MAP |
135 | boolean | 135 | boolean |
136 | 136 | ||
137 | config HAVE_MEMBLOCK_PHYS_MAP | ||
138 | boolean | ||
139 | |||
137 | config ARCH_DISCARD_MEMBLOCK | 140 | config ARCH_DISCARD_MEMBLOCK |
138 | boolean | 141 | boolean |
139 | 142 | ||
@@ -264,6 +267,9 @@ config MIGRATION | |||
264 | pages as migration can relocate pages to satisfy a huge page | 267 | pages as migration can relocate pages to satisfy a huge page |
265 | allocation instead of reclaiming. | 268 | allocation instead of reclaiming. |
266 | 269 | ||
270 | config ARCH_ENABLE_HUGEPAGE_MIGRATION | ||
271 | boolean | ||
272 | |||
267 | config PHYS_ADDR_T_64BIT | 273 | config PHYS_ADDR_T_64BIT |
268 | def_bool 64BIT || ARCH_PHYS_ADDR_T_64BIT | 274 | def_bool 64BIT || ARCH_PHYS_ADDR_T_64BIT |
269 | 275 | ||
@@ -430,16 +436,6 @@ choice | |||
430 | benefit. | 436 | benefit. |
431 | endchoice | 437 | endchoice |
432 | 438 | ||
433 | config CROSS_MEMORY_ATTACH | ||
434 | bool "Cross Memory Support" | ||
435 | depends on MMU | ||
436 | default y | ||
437 | help | ||
438 | Enabling this option adds the system calls process_vm_readv and | ||
439 | process_vm_writev which allow a process with the correct privileges | ||
440 | to directly read from or write to to another process's address space. | ||
441 | See the man page for more details. | ||
442 | |||
443 | # | 439 | # |
444 | # UP and nommu archs use km based percpu allocator | 440 | # UP and nommu archs use km based percpu allocator |
445 | # | 441 | # |
@@ -555,7 +551,7 @@ config MEM_SOFT_DIRTY | |||
555 | See Documentation/vm/soft-dirty.txt for more details. | 551 | See Documentation/vm/soft-dirty.txt for more details. |
556 | 552 | ||
557 | config ZSMALLOC | 553 | config ZSMALLOC |
558 | bool "Memory allocator for compressed pages" | 554 | tristate "Memory allocator for compressed pages" |
559 | depends on MMU | 555 | depends on MMU |
560 | default n | 556 | default n |
561 | help | 557 | help |
@@ -581,3 +577,18 @@ config PGTABLE_MAPPING | |||
581 | 577 | ||
582 | config GENERIC_EARLY_IOREMAP | 578 | config GENERIC_EARLY_IOREMAP |
583 | bool | 579 | bool |
580 | |||
581 | config MAX_STACK_SIZE_MB | ||
582 | int "Maximum user stack size for 32-bit processes (MB)" | ||
583 | default 80 | ||
584 | range 8 256 if METAG | ||
585 | range 8 2048 | ||
586 | depends on STACK_GROWSUP && (!64BIT || COMPAT) | ||
587 | help | ||
588 | This is the maximum stack size in Megabytes in the VM layout of 32-bit | ||
589 | user processes when the stack grows upwards (currently only on parisc | ||
590 | and metag arch). The stack will be located at the highest memory | ||
591 | address minus the given value, unless the RLIMIT_STACK hard limit is | ||
592 | changed to a smaller value in which case that is used. | ||
593 | |||
594 | A sane initial value is 80 MB. | ||
diff --git a/mm/Makefile b/mm/Makefile index b484452dac57..4064f3ec145e 100644 --- a/mm/Makefile +++ b/mm/Makefile | |||
@@ -3,7 +3,7 @@ | |||
3 | # | 3 | # |
4 | 4 | ||
5 | mmu-y := nommu.o | 5 | mmu-y := nommu.o |
6 | mmu-$(CONFIG_MMU) := fremap.o highmem.o madvise.o memory.o mincore.o \ | 6 | mmu-$(CONFIG_MMU) := fremap.o gup.o highmem.o madvise.o memory.o mincore.o \ |
7 | mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \ | 7 | mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \ |
8 | vmalloc.o pagewalk.o pgtable-generic.o | 8 | vmalloc.o pagewalk.o pgtable-generic.o |
9 | 9 | ||
@@ -30,7 +30,6 @@ endif | |||
30 | 30 | ||
31 | obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o | 31 | obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o |
32 | 32 | ||
33 | obj-$(CONFIG_BOUNCE) += bounce.o | ||
34 | obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o | 33 | obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o |
35 | obj-$(CONFIG_FRONTSWAP) += frontswap.o | 34 | obj-$(CONFIG_FRONTSWAP) += frontswap.o |
36 | obj-$(CONFIG_ZSWAP) += zswap.o | 35 | obj-$(CONFIG_ZSWAP) += zswap.o |
diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 09d9591b7708..1706cbbdf5f0 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c | |||
@@ -557,7 +557,7 @@ void clear_bdi_congested(struct backing_dev_info *bdi, int sync) | |||
557 | bit = sync ? BDI_sync_congested : BDI_async_congested; | 557 | bit = sync ? BDI_sync_congested : BDI_async_congested; |
558 | if (test_and_clear_bit(bit, &bdi->state)) | 558 | if (test_and_clear_bit(bit, &bdi->state)) |
559 | atomic_dec(&nr_bdi_congested[sync]); | 559 | atomic_dec(&nr_bdi_congested[sync]); |
560 | smp_mb__after_clear_bit(); | 560 | smp_mb__after_atomic(); |
561 | if (waitqueue_active(wqh)) | 561 | if (waitqueue_active(wqh)) |
562 | wake_up(wqh); | 562 | wake_up(wqh); |
563 | } | 563 | } |
diff --git a/mm/bounce.c b/mm/bounce.c deleted file mode 100644 index 523918b8c6dc..000000000000 --- a/mm/bounce.c +++ /dev/null | |||
@@ -1,287 +0,0 @@ | |||
1 | /* bounce buffer handling for block devices | ||
2 | * | ||
3 | * - Split from highmem.c | ||
4 | */ | ||
5 | |||
6 | #include <linux/mm.h> | ||
7 | #include <linux/export.h> | ||
8 | #include <linux/swap.h> | ||
9 | #include <linux/gfp.h> | ||
10 | #include <linux/bio.h> | ||
11 | #include <linux/pagemap.h> | ||
12 | #include <linux/mempool.h> | ||
13 | #include <linux/blkdev.h> | ||
14 | #include <linux/init.h> | ||
15 | #include <linux/hash.h> | ||
16 | #include <linux/highmem.h> | ||
17 | #include <linux/bootmem.h> | ||
18 | #include <asm/tlbflush.h> | ||
19 | |||
20 | #include <trace/events/block.h> | ||
21 | |||
22 | #define POOL_SIZE 64 | ||
23 | #define ISA_POOL_SIZE 16 | ||
24 | |||
25 | static mempool_t *page_pool, *isa_page_pool; | ||
26 | |||
27 | #if defined(CONFIG_HIGHMEM) || defined(CONFIG_NEED_BOUNCE_POOL) | ||
28 | static __init int init_emergency_pool(void) | ||
29 | { | ||
30 | #if defined(CONFIG_HIGHMEM) && !defined(CONFIG_MEMORY_HOTPLUG) | ||
31 | if (max_pfn <= max_low_pfn) | ||
32 | return 0; | ||
33 | #endif | ||
34 | |||
35 | page_pool = mempool_create_page_pool(POOL_SIZE, 0); | ||
36 | BUG_ON(!page_pool); | ||
37 | printk("bounce pool size: %d pages\n", POOL_SIZE); | ||
38 | |||
39 | return 0; | ||
40 | } | ||
41 | |||
42 | __initcall(init_emergency_pool); | ||
43 | #endif | ||
44 | |||
45 | #ifdef CONFIG_HIGHMEM | ||
46 | /* | ||
47 | * highmem version, map in to vec | ||
48 | */ | ||
49 | static void bounce_copy_vec(struct bio_vec *to, unsigned char *vfrom) | ||
50 | { | ||
51 | unsigned long flags; | ||
52 | unsigned char *vto; | ||
53 | |||
54 | local_irq_save(flags); | ||
55 | vto = kmap_atomic(to->bv_page); | ||
56 | memcpy(vto + to->bv_offset, vfrom, to->bv_len); | ||
57 | kunmap_atomic(vto); | ||
58 | local_irq_restore(flags); | ||
59 | } | ||
60 | |||
61 | #else /* CONFIG_HIGHMEM */ | ||
62 | |||
63 | #define bounce_copy_vec(to, vfrom) \ | ||
64 | memcpy(page_address((to)->bv_page) + (to)->bv_offset, vfrom, (to)->bv_len) | ||
65 | |||
66 | #endif /* CONFIG_HIGHMEM */ | ||
67 | |||
68 | /* | ||
69 | * allocate pages in the DMA region for the ISA pool | ||
70 | */ | ||
71 | static void *mempool_alloc_pages_isa(gfp_t gfp_mask, void *data) | ||
72 | { | ||
73 | return mempool_alloc_pages(gfp_mask | GFP_DMA, data); | ||
74 | } | ||
75 | |||
76 | /* | ||
77 | * gets called "every" time someone init's a queue with BLK_BOUNCE_ISA | ||
78 | * as the max address, so check if the pool has already been created. | ||
79 | */ | ||
80 | int init_emergency_isa_pool(void) | ||
81 | { | ||
82 | if (isa_page_pool) | ||
83 | return 0; | ||
84 | |||
85 | isa_page_pool = mempool_create(ISA_POOL_SIZE, mempool_alloc_pages_isa, | ||
86 | mempool_free_pages, (void *) 0); | ||
87 | BUG_ON(!isa_page_pool); | ||
88 | |||
89 | printk("isa bounce pool size: %d pages\n", ISA_POOL_SIZE); | ||
90 | return 0; | ||
91 | } | ||
92 | |||
93 | /* | ||
94 | * Simple bounce buffer support for highmem pages. Depending on the | ||
95 | * queue gfp mask set, *to may or may not be a highmem page. kmap it | ||
96 | * always, it will do the Right Thing | ||
97 | */ | ||
98 | static void copy_to_high_bio_irq(struct bio *to, struct bio *from) | ||
99 | { | ||
100 | unsigned char *vfrom; | ||
101 | struct bio_vec tovec, *fromvec = from->bi_io_vec; | ||
102 | struct bvec_iter iter; | ||
103 | |||
104 | bio_for_each_segment(tovec, to, iter) { | ||
105 | if (tovec.bv_page != fromvec->bv_page) { | ||
106 | /* | ||
107 | * fromvec->bv_offset and fromvec->bv_len might have | ||
108 | * been modified by the block layer, so use the original | ||
109 | * copy, bounce_copy_vec already uses tovec->bv_len | ||
110 | */ | ||
111 | vfrom = page_address(fromvec->bv_page) + | ||
112 | tovec.bv_offset; | ||
113 | |||
114 | bounce_copy_vec(&tovec, vfrom); | ||
115 | flush_dcache_page(tovec.bv_page); | ||
116 | } | ||
117 | |||
118 | fromvec++; | ||
119 | } | ||
120 | } | ||
121 | |||
122 | static void bounce_end_io(struct bio *bio, mempool_t *pool, int err) | ||
123 | { | ||
124 | struct bio *bio_orig = bio->bi_private; | ||
125 | struct bio_vec *bvec, *org_vec; | ||
126 | int i; | ||
127 | |||
128 | if (test_bit(BIO_EOPNOTSUPP, &bio->bi_flags)) | ||
129 | set_bit(BIO_EOPNOTSUPP, &bio_orig->bi_flags); | ||
130 | |||
131 | /* | ||
132 | * free up bounce indirect pages used | ||
133 | */ | ||
134 | bio_for_each_segment_all(bvec, bio, i) { | ||
135 | org_vec = bio_orig->bi_io_vec + i; | ||
136 | if (bvec->bv_page == org_vec->bv_page) | ||
137 | continue; | ||
138 | |||
139 | dec_zone_page_state(bvec->bv_page, NR_BOUNCE); | ||
140 | mempool_free(bvec->bv_page, pool); | ||
141 | } | ||
142 | |||
143 | bio_endio(bio_orig, err); | ||
144 | bio_put(bio); | ||
145 | } | ||
146 | |||
147 | static void bounce_end_io_write(struct bio *bio, int err) | ||
148 | { | ||
149 | bounce_end_io(bio, page_pool, err); | ||
150 | } | ||
151 | |||
152 | static void bounce_end_io_write_isa(struct bio *bio, int err) | ||
153 | { | ||
154 | |||
155 | bounce_end_io(bio, isa_page_pool, err); | ||
156 | } | ||
157 | |||
158 | static void __bounce_end_io_read(struct bio *bio, mempool_t *pool, int err) | ||
159 | { | ||
160 | struct bio *bio_orig = bio->bi_private; | ||
161 | |||
162 | if (test_bit(BIO_UPTODATE, &bio->bi_flags)) | ||
163 | copy_to_high_bio_irq(bio_orig, bio); | ||
164 | |||
165 | bounce_end_io(bio, pool, err); | ||
166 | } | ||
167 | |||
168 | static void bounce_end_io_read(struct bio *bio, int err) | ||
169 | { | ||
170 | __bounce_end_io_read(bio, page_pool, err); | ||
171 | } | ||
172 | |||
173 | static void bounce_end_io_read_isa(struct bio *bio, int err) | ||
174 | { | ||
175 | __bounce_end_io_read(bio, isa_page_pool, err); | ||
176 | } | ||
177 | |||
178 | #ifdef CONFIG_NEED_BOUNCE_POOL | ||
179 | static int must_snapshot_stable_pages(struct request_queue *q, struct bio *bio) | ||
180 | { | ||
181 | if (bio_data_dir(bio) != WRITE) | ||
182 | return 0; | ||
183 | |||
184 | if (!bdi_cap_stable_pages_required(&q->backing_dev_info)) | ||
185 | return 0; | ||
186 | |||
187 | return test_bit(BIO_SNAP_STABLE, &bio->bi_flags); | ||
188 | } | ||
189 | #else | ||
190 | static int must_snapshot_stable_pages(struct request_queue *q, struct bio *bio) | ||
191 | { | ||
192 | return 0; | ||
193 | } | ||
194 | #endif /* CONFIG_NEED_BOUNCE_POOL */ | ||
195 | |||
196 | static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig, | ||
197 | mempool_t *pool, int force) | ||
198 | { | ||
199 | struct bio *bio; | ||
200 | int rw = bio_data_dir(*bio_orig); | ||
201 | struct bio_vec *to, from; | ||
202 | struct bvec_iter iter; | ||
203 | unsigned i; | ||
204 | |||
205 | if (force) | ||
206 | goto bounce; | ||
207 | bio_for_each_segment(from, *bio_orig, iter) | ||
208 | if (page_to_pfn(from.bv_page) > queue_bounce_pfn(q)) | ||
209 | goto bounce; | ||
210 | |||
211 | return; | ||
212 | bounce: | ||
213 | bio = bio_clone_bioset(*bio_orig, GFP_NOIO, fs_bio_set); | ||
214 | |||
215 | bio_for_each_segment_all(to, bio, i) { | ||
216 | struct page *page = to->bv_page; | ||
217 | |||
218 | if (page_to_pfn(page) <= queue_bounce_pfn(q) && !force) | ||
219 | continue; | ||
220 | |||
221 | inc_zone_page_state(to->bv_page, NR_BOUNCE); | ||
222 | to->bv_page = mempool_alloc(pool, q->bounce_gfp); | ||
223 | |||
224 | if (rw == WRITE) { | ||
225 | char *vto, *vfrom; | ||
226 | |||
227 | flush_dcache_page(page); | ||
228 | |||
229 | vto = page_address(to->bv_page) + to->bv_offset; | ||
230 | vfrom = kmap_atomic(page) + to->bv_offset; | ||
231 | memcpy(vto, vfrom, to->bv_len); | ||
232 | kunmap_atomic(vfrom); | ||
233 | } | ||
234 | } | ||
235 | |||
236 | trace_block_bio_bounce(q, *bio_orig); | ||
237 | |||
238 | bio->bi_flags |= (1 << BIO_BOUNCED); | ||
239 | |||
240 | if (pool == page_pool) { | ||
241 | bio->bi_end_io = bounce_end_io_write; | ||
242 | if (rw == READ) | ||
243 | bio->bi_end_io = bounce_end_io_read; | ||
244 | } else { | ||
245 | bio->bi_end_io = bounce_end_io_write_isa; | ||
246 | if (rw == READ) | ||
247 | bio->bi_end_io = bounce_end_io_read_isa; | ||
248 | } | ||
249 | |||
250 | bio->bi_private = *bio_orig; | ||
251 | *bio_orig = bio; | ||
252 | } | ||
253 | |||
254 | void blk_queue_bounce(struct request_queue *q, struct bio **bio_orig) | ||
255 | { | ||
256 | int must_bounce; | ||
257 | mempool_t *pool; | ||
258 | |||
259 | /* | ||
260 | * Data-less bio, nothing to bounce | ||
261 | */ | ||
262 | if (!bio_has_data(*bio_orig)) | ||
263 | return; | ||
264 | |||
265 | must_bounce = must_snapshot_stable_pages(q, *bio_orig); | ||
266 | |||
267 | /* | ||
268 | * for non-isa bounce case, just check if the bounce pfn is equal | ||
269 | * to or bigger than the highest pfn in the system -- in that case, | ||
270 | * don't waste time iterating over bio segments | ||
271 | */ | ||
272 | if (!(q->bounce_gfp & GFP_DMA)) { | ||
273 | if (queue_bounce_pfn(q) >= blk_max_pfn && !must_bounce) | ||
274 | return; | ||
275 | pool = page_pool; | ||
276 | } else { | ||
277 | BUG_ON(!isa_page_pool); | ||
278 | pool = isa_page_pool; | ||
279 | } | ||
280 | |||
281 | /* | ||
282 | * slow path | ||
283 | */ | ||
284 | __blk_queue_bounce(q, bio_orig, pool, must_bounce); | ||
285 | } | ||
286 | |||
287 | EXPORT_SYMBOL(blk_queue_bounce); | ||
diff --git a/mm/compaction.c b/mm/compaction.c index 627dc2e4320f..21bf292b642a 100644 --- a/mm/compaction.c +++ b/mm/compaction.c | |||
@@ -89,7 +89,8 @@ static void __reset_isolation_suitable(struct zone *zone) | |||
89 | unsigned long end_pfn = zone_end_pfn(zone); | 89 | unsigned long end_pfn = zone_end_pfn(zone); |
90 | unsigned long pfn; | 90 | unsigned long pfn; |
91 | 91 | ||
92 | zone->compact_cached_migrate_pfn = start_pfn; | 92 | zone->compact_cached_migrate_pfn[0] = start_pfn; |
93 | zone->compact_cached_migrate_pfn[1] = start_pfn; | ||
93 | zone->compact_cached_free_pfn = end_pfn; | 94 | zone->compact_cached_free_pfn = end_pfn; |
94 | zone->compact_blockskip_flush = false; | 95 | zone->compact_blockskip_flush = false; |
95 | 96 | ||
@@ -131,9 +132,10 @@ void reset_isolation_suitable(pg_data_t *pgdat) | |||
131 | */ | 132 | */ |
132 | static void update_pageblock_skip(struct compact_control *cc, | 133 | static void update_pageblock_skip(struct compact_control *cc, |
133 | struct page *page, unsigned long nr_isolated, | 134 | struct page *page, unsigned long nr_isolated, |
134 | bool migrate_scanner) | 135 | bool set_unsuitable, bool migrate_scanner) |
135 | { | 136 | { |
136 | struct zone *zone = cc->zone; | 137 | struct zone *zone = cc->zone; |
138 | unsigned long pfn; | ||
137 | 139 | ||
138 | if (cc->ignore_skip_hint) | 140 | if (cc->ignore_skip_hint) |
139 | return; | 141 | return; |
@@ -141,20 +143,32 @@ static void update_pageblock_skip(struct compact_control *cc, | |||
141 | if (!page) | 143 | if (!page) |
142 | return; | 144 | return; |
143 | 145 | ||
144 | if (!nr_isolated) { | 146 | if (nr_isolated) |
145 | unsigned long pfn = page_to_pfn(page); | 147 | return; |
148 | |||
149 | /* | ||
150 | * Only skip pageblocks when all forms of compaction will be known to | ||
151 | * fail in the near future. | ||
152 | */ | ||
153 | if (set_unsuitable) | ||
146 | set_pageblock_skip(page); | 154 | set_pageblock_skip(page); |
147 | 155 | ||
148 | /* Update where compaction should restart */ | 156 | pfn = page_to_pfn(page); |
149 | if (migrate_scanner) { | 157 | |
150 | if (!cc->finished_update_migrate && | 158 | /* Update where async and sync compaction should restart */ |
151 | pfn > zone->compact_cached_migrate_pfn) | 159 | if (migrate_scanner) { |
152 | zone->compact_cached_migrate_pfn = pfn; | 160 | if (cc->finished_update_migrate) |
153 | } else { | 161 | return; |
154 | if (!cc->finished_update_free && | 162 | if (pfn > zone->compact_cached_migrate_pfn[0]) |
155 | pfn < zone->compact_cached_free_pfn) | 163 | zone->compact_cached_migrate_pfn[0] = pfn; |
156 | zone->compact_cached_free_pfn = pfn; | 164 | if (cc->mode != MIGRATE_ASYNC && |
157 | } | 165 | pfn > zone->compact_cached_migrate_pfn[1]) |
166 | zone->compact_cached_migrate_pfn[1] = pfn; | ||
167 | } else { | ||
168 | if (cc->finished_update_free) | ||
169 | return; | ||
170 | if (pfn < zone->compact_cached_free_pfn) | ||
171 | zone->compact_cached_free_pfn = pfn; | ||
158 | } | 172 | } |
159 | } | 173 | } |
160 | #else | 174 | #else |
@@ -166,7 +180,7 @@ static inline bool isolation_suitable(struct compact_control *cc, | |||
166 | 180 | ||
167 | static void update_pageblock_skip(struct compact_control *cc, | 181 | static void update_pageblock_skip(struct compact_control *cc, |
168 | struct page *page, unsigned long nr_isolated, | 182 | struct page *page, unsigned long nr_isolated, |
169 | bool migrate_scanner) | 183 | bool set_unsuitable, bool migrate_scanner) |
170 | { | 184 | { |
171 | } | 185 | } |
172 | #endif /* CONFIG_COMPACTION */ | 186 | #endif /* CONFIG_COMPACTION */ |
@@ -195,7 +209,7 @@ static bool compact_checklock_irqsave(spinlock_t *lock, unsigned long *flags, | |||
195 | } | 209 | } |
196 | 210 | ||
197 | /* async aborts if taking too long or contended */ | 211 | /* async aborts if taking too long or contended */ |
198 | if (!cc->sync) { | 212 | if (cc->mode == MIGRATE_ASYNC) { |
199 | cc->contended = true; | 213 | cc->contended = true; |
200 | return false; | 214 | return false; |
201 | } | 215 | } |
@@ -208,10 +222,28 @@ static bool compact_checklock_irqsave(spinlock_t *lock, unsigned long *flags, | |||
208 | return true; | 222 | return true; |
209 | } | 223 | } |
210 | 224 | ||
211 | static inline bool compact_trylock_irqsave(spinlock_t *lock, | 225 | /* |
212 | unsigned long *flags, struct compact_control *cc) | 226 | * Aside from avoiding lock contention, compaction also periodically checks |
227 | * need_resched() and either schedules in sync compaction or aborts async | ||
228 | * compaction. This is similar to what compact_checklock_irqsave() does, but | ||
229 | * is used where no lock is concerned. | ||
230 | * | ||
231 | * Returns false when no scheduling was needed, or sync compaction scheduled. | ||
232 | * Returns true when async compaction should abort. | ||
233 | */ | ||
234 | static inline bool compact_should_abort(struct compact_control *cc) | ||
213 | { | 235 | { |
214 | return compact_checklock_irqsave(lock, flags, false, cc); | 236 | /* async compaction aborts if contended */ |
237 | if (need_resched()) { | ||
238 | if (cc->mode == MIGRATE_ASYNC) { | ||
239 | cc->contended = true; | ||
240 | return true; | ||
241 | } | ||
242 | |||
243 | cond_resched(); | ||
244 | } | ||
245 | |||
246 | return false; | ||
215 | } | 247 | } |
216 | 248 | ||
217 | /* Returns true if the page is within a block suitable for migration to */ | 249 | /* Returns true if the page is within a block suitable for migration to */ |
@@ -329,7 +361,8 @@ isolate_fail: | |||
329 | 361 | ||
330 | /* Update the pageblock-skip if the whole pageblock was scanned */ | 362 | /* Update the pageblock-skip if the whole pageblock was scanned */ |
331 | if (blockpfn == end_pfn) | 363 | if (blockpfn == end_pfn) |
332 | update_pageblock_skip(cc, valid_page, total_isolated, false); | 364 | update_pageblock_skip(cc, valid_page, total_isolated, true, |
365 | false); | ||
333 | 366 | ||
334 | count_compact_events(COMPACTFREE_SCANNED, nr_scanned); | 367 | count_compact_events(COMPACTFREE_SCANNED, nr_scanned); |
335 | if (total_isolated) | 368 | if (total_isolated) |
@@ -464,8 +497,9 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, | |||
464 | unsigned long flags; | 497 | unsigned long flags; |
465 | bool locked = false; | 498 | bool locked = false; |
466 | struct page *page = NULL, *valid_page = NULL; | 499 | struct page *page = NULL, *valid_page = NULL; |
467 | bool skipped_async_unsuitable = false; | 500 | bool set_unsuitable = true; |
468 | const isolate_mode_t mode = (!cc->sync ? ISOLATE_ASYNC_MIGRATE : 0) | | 501 | const isolate_mode_t mode = (cc->mode == MIGRATE_ASYNC ? |
502 | ISOLATE_ASYNC_MIGRATE : 0) | | ||
469 | (unevictable ? ISOLATE_UNEVICTABLE : 0); | 503 | (unevictable ? ISOLATE_UNEVICTABLE : 0); |
470 | 504 | ||
471 | /* | 505 | /* |
@@ -475,7 +509,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, | |||
475 | */ | 509 | */ |
476 | while (unlikely(too_many_isolated(zone))) { | 510 | while (unlikely(too_many_isolated(zone))) { |
477 | /* async migration should just abort */ | 511 | /* async migration should just abort */ |
478 | if (!cc->sync) | 512 | if (cc->mode == MIGRATE_ASYNC) |
479 | return 0; | 513 | return 0; |
480 | 514 | ||
481 | congestion_wait(BLK_RW_ASYNC, HZ/10); | 515 | congestion_wait(BLK_RW_ASYNC, HZ/10); |
@@ -484,8 +518,10 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, | |||
484 | return 0; | 518 | return 0; |
485 | } | 519 | } |
486 | 520 | ||
521 | if (compact_should_abort(cc)) | ||
522 | return 0; | ||
523 | |||
487 | /* Time to isolate some pages for migration */ | 524 | /* Time to isolate some pages for migration */ |
488 | cond_resched(); | ||
489 | for (; low_pfn < end_pfn; low_pfn++) { | 525 | for (; low_pfn < end_pfn; low_pfn++) { |
490 | /* give a chance to irqs before checking need_resched() */ | 526 | /* give a chance to irqs before checking need_resched() */ |
491 | if (locked && !(low_pfn % SWAP_CLUSTER_MAX)) { | 527 | if (locked && !(low_pfn % SWAP_CLUSTER_MAX)) { |
@@ -540,9 +576,9 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, | |||
540 | * the minimum amount of work satisfies the allocation | 576 | * the minimum amount of work satisfies the allocation |
541 | */ | 577 | */ |
542 | mt = get_pageblock_migratetype(page); | 578 | mt = get_pageblock_migratetype(page); |
543 | if (!cc->sync && !migrate_async_suitable(mt)) { | 579 | if (cc->mode == MIGRATE_ASYNC && |
544 | cc->finished_update_migrate = true; | 580 | !migrate_async_suitable(mt)) { |
545 | skipped_async_unsuitable = true; | 581 | set_unsuitable = false; |
546 | goto next_pageblock; | 582 | goto next_pageblock; |
547 | } | 583 | } |
548 | } | 584 | } |
@@ -646,11 +682,10 @@ next_pageblock: | |||
646 | /* | 682 | /* |
647 | * Update the pageblock-skip information and cached scanner pfn, | 683 | * Update the pageblock-skip information and cached scanner pfn, |
648 | * if the whole pageblock was scanned without isolating any page. | 684 | * if the whole pageblock was scanned without isolating any page. |
649 | * This is not done when pageblock was skipped due to being unsuitable | ||
650 | * for async compaction, so that eventual sync compaction can try. | ||
651 | */ | 685 | */ |
652 | if (low_pfn == end_pfn && !skipped_async_unsuitable) | 686 | if (low_pfn == end_pfn) |
653 | update_pageblock_skip(cc, valid_page, nr_isolated, true); | 687 | update_pageblock_skip(cc, valid_page, nr_isolated, |
688 | set_unsuitable, true); | ||
654 | 689 | ||
655 | trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated); | 690 | trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated); |
656 | 691 | ||
@@ -671,7 +706,9 @@ static void isolate_freepages(struct zone *zone, | |||
671 | struct compact_control *cc) | 706 | struct compact_control *cc) |
672 | { | 707 | { |
673 | struct page *page; | 708 | struct page *page; |
674 | unsigned long high_pfn, low_pfn, pfn, z_end_pfn; | 709 | unsigned long block_start_pfn; /* start of current pageblock */ |
710 | unsigned long block_end_pfn; /* end of current pageblock */ | ||
711 | unsigned long low_pfn; /* lowest pfn scanner is able to scan */ | ||
675 | int nr_freepages = cc->nr_freepages; | 712 | int nr_freepages = cc->nr_freepages; |
676 | struct list_head *freelist = &cc->freepages; | 713 | struct list_head *freelist = &cc->freepages; |
677 | 714 | ||
@@ -679,41 +716,38 @@ static void isolate_freepages(struct zone *zone, | |||
679 | * Initialise the free scanner. The starting point is where we last | 716 | * Initialise the free scanner. The starting point is where we last |
680 | * successfully isolated from, zone-cached value, or the end of the | 717 | * successfully isolated from, zone-cached value, or the end of the |
681 | * zone when isolating for the first time. We need this aligned to | 718 | * zone when isolating for the first time. We need this aligned to |
682 | * the pageblock boundary, because we do pfn -= pageblock_nr_pages | 719 | * the pageblock boundary, because we do |
683 | * in the for loop. | 720 | * block_start_pfn -= pageblock_nr_pages in the for loop. |
721 | * For ending point, take care when isolating in last pageblock of a | ||
722 | * a zone which ends in the middle of a pageblock. | ||
684 | * The low boundary is the end of the pageblock the migration scanner | 723 | * The low boundary is the end of the pageblock the migration scanner |
685 | * is using. | 724 | * is using. |
686 | */ | 725 | */ |
687 | pfn = cc->free_pfn & ~(pageblock_nr_pages-1); | 726 | block_start_pfn = cc->free_pfn & ~(pageblock_nr_pages-1); |
727 | block_end_pfn = min(block_start_pfn + pageblock_nr_pages, | ||
728 | zone_end_pfn(zone)); | ||
688 | low_pfn = ALIGN(cc->migrate_pfn + 1, pageblock_nr_pages); | 729 | low_pfn = ALIGN(cc->migrate_pfn + 1, pageblock_nr_pages); |
689 | 730 | ||
690 | /* | 731 | /* |
691 | * Take care that if the migration scanner is at the end of the zone | ||
692 | * that the free scanner does not accidentally move to the next zone | ||
693 | * in the next isolation cycle. | ||
694 | */ | ||
695 | high_pfn = min(low_pfn, pfn); | ||
696 | |||
697 | z_end_pfn = zone_end_pfn(zone); | ||
698 | |||
699 | /* | ||
700 | * Isolate free pages until enough are available to migrate the | 732 | * Isolate free pages until enough are available to migrate the |
701 | * pages on cc->migratepages. We stop searching if the migrate | 733 | * pages on cc->migratepages. We stop searching if the migrate |
702 | * and free page scanners meet or enough free pages are isolated. | 734 | * and free page scanners meet or enough free pages are isolated. |
703 | */ | 735 | */ |
704 | for (; pfn >= low_pfn && cc->nr_migratepages > nr_freepages; | 736 | for (; block_start_pfn >= low_pfn && cc->nr_migratepages > nr_freepages; |
705 | pfn -= pageblock_nr_pages) { | 737 | block_end_pfn = block_start_pfn, |
738 | block_start_pfn -= pageblock_nr_pages) { | ||
706 | unsigned long isolated; | 739 | unsigned long isolated; |
707 | unsigned long end_pfn; | ||
708 | 740 | ||
709 | /* | 741 | /* |
710 | * This can iterate a massively long zone without finding any | 742 | * This can iterate a massively long zone without finding any |
711 | * suitable migration targets, so periodically check if we need | 743 | * suitable migration targets, so periodically check if we need |
712 | * to schedule. | 744 | * to schedule, or even abort async compaction. |
713 | */ | 745 | */ |
714 | cond_resched(); | 746 | if (!(block_start_pfn % (SWAP_CLUSTER_MAX * pageblock_nr_pages)) |
747 | && compact_should_abort(cc)) | ||
748 | break; | ||
715 | 749 | ||
716 | if (!pfn_valid(pfn)) | 750 | if (!pfn_valid(block_start_pfn)) |
717 | continue; | 751 | continue; |
718 | 752 | ||
719 | /* | 753 | /* |
@@ -723,7 +757,7 @@ static void isolate_freepages(struct zone *zone, | |||
723 | * i.e. it's possible that all pages within a zones range of | 757 | * i.e. it's possible that all pages within a zones range of |
724 | * pages do not belong to a single zone. | 758 | * pages do not belong to a single zone. |
725 | */ | 759 | */ |
726 | page = pfn_to_page(pfn); | 760 | page = pfn_to_page(block_start_pfn); |
727 | if (page_zone(page) != zone) | 761 | if (page_zone(page) != zone) |
728 | continue; | 762 | continue; |
729 | 763 | ||
@@ -736,26 +770,26 @@ static void isolate_freepages(struct zone *zone, | |||
736 | continue; | 770 | continue; |
737 | 771 | ||
738 | /* Found a block suitable for isolating free pages from */ | 772 | /* Found a block suitable for isolating free pages from */ |
739 | isolated = 0; | 773 | cc->free_pfn = block_start_pfn; |
774 | isolated = isolate_freepages_block(cc, block_start_pfn, | ||
775 | block_end_pfn, freelist, false); | ||
776 | nr_freepages += isolated; | ||
740 | 777 | ||
741 | /* | 778 | /* |
742 | * Take care when isolating in last pageblock of a zone which | 779 | * Set a flag that we successfully isolated in this pageblock. |
743 | * ends in the middle of a pageblock. | 780 | * In the next loop iteration, zone->compact_cached_free_pfn |
781 | * will not be updated and thus it will effectively contain the | ||
782 | * highest pageblock we isolated pages from. | ||
744 | */ | 783 | */ |
745 | end_pfn = min(pfn + pageblock_nr_pages, z_end_pfn); | 784 | if (isolated) |
746 | isolated = isolate_freepages_block(cc, pfn, end_pfn, | 785 | cc->finished_update_free = true; |
747 | freelist, false); | ||
748 | nr_freepages += isolated; | ||
749 | 786 | ||
750 | /* | 787 | /* |
751 | * Record the highest PFN we isolated pages from. When next | 788 | * isolate_freepages_block() might have aborted due to async |
752 | * looking for free pages, the search will restart here as | 789 | * compaction being contended |
753 | * page migration may have returned some pages to the allocator | ||
754 | */ | 790 | */ |
755 | if (isolated) { | 791 | if (cc->contended) |
756 | cc->finished_update_free = true; | 792 | break; |
757 | high_pfn = max(high_pfn, pfn); | ||
758 | } | ||
759 | } | 793 | } |
760 | 794 | ||
761 | /* split_free_page does not map the pages */ | 795 | /* split_free_page does not map the pages */ |
@@ -765,10 +799,9 @@ static void isolate_freepages(struct zone *zone, | |||
765 | * If we crossed the migrate scanner, we want to keep it that way | 799 | * If we crossed the migrate scanner, we want to keep it that way |
766 | * so that compact_finished() may detect this | 800 | * so that compact_finished() may detect this |
767 | */ | 801 | */ |
768 | if (pfn < low_pfn) | 802 | if (block_start_pfn < low_pfn) |
769 | cc->free_pfn = max(pfn, zone->zone_start_pfn); | 803 | cc->free_pfn = cc->migrate_pfn; |
770 | else | 804 | |
771 | cc->free_pfn = high_pfn; | ||
772 | cc->nr_freepages = nr_freepages; | 805 | cc->nr_freepages = nr_freepages; |
773 | } | 806 | } |
774 | 807 | ||
@@ -783,9 +816,13 @@ static struct page *compaction_alloc(struct page *migratepage, | |||
783 | struct compact_control *cc = (struct compact_control *)data; | 816 | struct compact_control *cc = (struct compact_control *)data; |
784 | struct page *freepage; | 817 | struct page *freepage; |
785 | 818 | ||
786 | /* Isolate free pages if necessary */ | 819 | /* |
820 | * Isolate free pages if necessary, and if we are not aborting due to | ||
821 | * contention. | ||
822 | */ | ||
787 | if (list_empty(&cc->freepages)) { | 823 | if (list_empty(&cc->freepages)) { |
788 | isolate_freepages(cc->zone, cc); | 824 | if (!cc->contended) |
825 | isolate_freepages(cc->zone, cc); | ||
789 | 826 | ||
790 | if (list_empty(&cc->freepages)) | 827 | if (list_empty(&cc->freepages)) |
791 | return NULL; | 828 | return NULL; |
@@ -799,23 +836,16 @@ static struct page *compaction_alloc(struct page *migratepage, | |||
799 | } | 836 | } |
800 | 837 | ||
801 | /* | 838 | /* |
802 | * We cannot control nr_migratepages and nr_freepages fully when migration is | 839 | * This is a migrate-callback that "frees" freepages back to the isolated |
803 | * running as migrate_pages() has no knowledge of compact_control. When | 840 | * freelist. All pages on the freelist are from the same zone, so there is no |
804 | * migration is complete, we count the number of pages on the lists by hand. | 841 | * special handling needed for NUMA. |
805 | */ | 842 | */ |
806 | static void update_nr_listpages(struct compact_control *cc) | 843 | static void compaction_free(struct page *page, unsigned long data) |
807 | { | 844 | { |
808 | int nr_migratepages = 0; | 845 | struct compact_control *cc = (struct compact_control *)data; |
809 | int nr_freepages = 0; | ||
810 | struct page *page; | ||
811 | |||
812 | list_for_each_entry(page, &cc->migratepages, lru) | ||
813 | nr_migratepages++; | ||
814 | list_for_each_entry(page, &cc->freepages, lru) | ||
815 | nr_freepages++; | ||
816 | 846 | ||
817 | cc->nr_migratepages = nr_migratepages; | 847 | list_add(&page->lru, &cc->freepages); |
818 | cc->nr_freepages = nr_freepages; | 848 | cc->nr_freepages++; |
819 | } | 849 | } |
820 | 850 | ||
821 | /* possible outcome of isolate_migratepages */ | 851 | /* possible outcome of isolate_migratepages */ |
@@ -862,13 +892,14 @@ static int compact_finished(struct zone *zone, | |||
862 | unsigned int order; | 892 | unsigned int order; |
863 | unsigned long watermark; | 893 | unsigned long watermark; |
864 | 894 | ||
865 | if (fatal_signal_pending(current)) | 895 | if (cc->contended || fatal_signal_pending(current)) |
866 | return COMPACT_PARTIAL; | 896 | return COMPACT_PARTIAL; |
867 | 897 | ||
868 | /* Compaction run completes if the migrate and free scanner meet */ | 898 | /* Compaction run completes if the migrate and free scanner meet */ |
869 | if (cc->free_pfn <= cc->migrate_pfn) { | 899 | if (cc->free_pfn <= cc->migrate_pfn) { |
870 | /* Let the next compaction start anew. */ | 900 | /* Let the next compaction start anew. */ |
871 | zone->compact_cached_migrate_pfn = zone->zone_start_pfn; | 901 | zone->compact_cached_migrate_pfn[0] = zone->zone_start_pfn; |
902 | zone->compact_cached_migrate_pfn[1] = zone->zone_start_pfn; | ||
872 | zone->compact_cached_free_pfn = zone_end_pfn(zone); | 903 | zone->compact_cached_free_pfn = zone_end_pfn(zone); |
873 | 904 | ||
874 | /* | 905 | /* |
@@ -968,6 +999,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) | |||
968 | int ret; | 999 | int ret; |
969 | unsigned long start_pfn = zone->zone_start_pfn; | 1000 | unsigned long start_pfn = zone->zone_start_pfn; |
970 | unsigned long end_pfn = zone_end_pfn(zone); | 1001 | unsigned long end_pfn = zone_end_pfn(zone); |
1002 | const bool sync = cc->mode != MIGRATE_ASYNC; | ||
971 | 1003 | ||
972 | ret = compaction_suitable(zone, cc->order); | 1004 | ret = compaction_suitable(zone, cc->order); |
973 | switch (ret) { | 1005 | switch (ret) { |
@@ -993,7 +1025,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) | |||
993 | * information on where the scanners should start but check that it | 1025 | * information on where the scanners should start but check that it |
994 | * is initialised by ensuring the values are within zone boundaries. | 1026 | * is initialised by ensuring the values are within zone boundaries. |
995 | */ | 1027 | */ |
996 | cc->migrate_pfn = zone->compact_cached_migrate_pfn; | 1028 | cc->migrate_pfn = zone->compact_cached_migrate_pfn[sync]; |
997 | cc->free_pfn = zone->compact_cached_free_pfn; | 1029 | cc->free_pfn = zone->compact_cached_free_pfn; |
998 | if (cc->free_pfn < start_pfn || cc->free_pfn > end_pfn) { | 1030 | if (cc->free_pfn < start_pfn || cc->free_pfn > end_pfn) { |
999 | cc->free_pfn = end_pfn & ~(pageblock_nr_pages-1); | 1031 | cc->free_pfn = end_pfn & ~(pageblock_nr_pages-1); |
@@ -1001,7 +1033,8 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) | |||
1001 | } | 1033 | } |
1002 | if (cc->migrate_pfn < start_pfn || cc->migrate_pfn > end_pfn) { | 1034 | if (cc->migrate_pfn < start_pfn || cc->migrate_pfn > end_pfn) { |
1003 | cc->migrate_pfn = start_pfn; | 1035 | cc->migrate_pfn = start_pfn; |
1004 | zone->compact_cached_migrate_pfn = cc->migrate_pfn; | 1036 | zone->compact_cached_migrate_pfn[0] = cc->migrate_pfn; |
1037 | zone->compact_cached_migrate_pfn[1] = cc->migrate_pfn; | ||
1005 | } | 1038 | } |
1006 | 1039 | ||
1007 | trace_mm_compaction_begin(start_pfn, cc->migrate_pfn, cc->free_pfn, end_pfn); | 1040 | trace_mm_compaction_begin(start_pfn, cc->migrate_pfn, cc->free_pfn, end_pfn); |
@@ -1009,7 +1042,6 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) | |||
1009 | migrate_prep_local(); | 1042 | migrate_prep_local(); |
1010 | 1043 | ||
1011 | while ((ret = compact_finished(zone, cc)) == COMPACT_CONTINUE) { | 1044 | while ((ret = compact_finished(zone, cc)) == COMPACT_CONTINUE) { |
1012 | unsigned long nr_migrate, nr_remaining; | ||
1013 | int err; | 1045 | int err; |
1014 | 1046 | ||
1015 | switch (isolate_migratepages(zone, cc)) { | 1047 | switch (isolate_migratepages(zone, cc)) { |
@@ -1024,21 +1056,20 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) | |||
1024 | ; | 1056 | ; |
1025 | } | 1057 | } |
1026 | 1058 | ||
1027 | nr_migrate = cc->nr_migratepages; | 1059 | if (!cc->nr_migratepages) |
1060 | continue; | ||
1061 | |||
1028 | err = migrate_pages(&cc->migratepages, compaction_alloc, | 1062 | err = migrate_pages(&cc->migratepages, compaction_alloc, |
1029 | (unsigned long)cc, | 1063 | compaction_free, (unsigned long)cc, cc->mode, |
1030 | cc->sync ? MIGRATE_SYNC_LIGHT : MIGRATE_ASYNC, | ||
1031 | MR_COMPACTION); | 1064 | MR_COMPACTION); |
1032 | update_nr_listpages(cc); | ||
1033 | nr_remaining = cc->nr_migratepages; | ||
1034 | 1065 | ||
1035 | trace_mm_compaction_migratepages(nr_migrate - nr_remaining, | 1066 | trace_mm_compaction_migratepages(cc->nr_migratepages, err, |
1036 | nr_remaining); | 1067 | &cc->migratepages); |
1037 | 1068 | ||
1038 | /* Release isolated pages not migrated */ | 1069 | /* All pages were either migrated or will be released */ |
1070 | cc->nr_migratepages = 0; | ||
1039 | if (err) { | 1071 | if (err) { |
1040 | putback_movable_pages(&cc->migratepages); | 1072 | putback_movable_pages(&cc->migratepages); |
1041 | cc->nr_migratepages = 0; | ||
1042 | /* | 1073 | /* |
1043 | * migrate_pages() may return -ENOMEM when scanners meet | 1074 | * migrate_pages() may return -ENOMEM when scanners meet |
1044 | * and we want compact_finished() to detect it | 1075 | * and we want compact_finished() to detect it |
@@ -1060,9 +1091,8 @@ out: | |||
1060 | return ret; | 1091 | return ret; |
1061 | } | 1092 | } |
1062 | 1093 | ||
1063 | static unsigned long compact_zone_order(struct zone *zone, | 1094 | static unsigned long compact_zone_order(struct zone *zone, int order, |
1064 | int order, gfp_t gfp_mask, | 1095 | gfp_t gfp_mask, enum migrate_mode mode, bool *contended) |
1065 | bool sync, bool *contended) | ||
1066 | { | 1096 | { |
1067 | unsigned long ret; | 1097 | unsigned long ret; |
1068 | struct compact_control cc = { | 1098 | struct compact_control cc = { |
@@ -1071,7 +1101,7 @@ static unsigned long compact_zone_order(struct zone *zone, | |||
1071 | .order = order, | 1101 | .order = order, |
1072 | .migratetype = allocflags_to_migratetype(gfp_mask), | 1102 | .migratetype = allocflags_to_migratetype(gfp_mask), |
1073 | .zone = zone, | 1103 | .zone = zone, |
1074 | .sync = sync, | 1104 | .mode = mode, |
1075 | }; | 1105 | }; |
1076 | INIT_LIST_HEAD(&cc.freepages); | 1106 | INIT_LIST_HEAD(&cc.freepages); |
1077 | INIT_LIST_HEAD(&cc.migratepages); | 1107 | INIT_LIST_HEAD(&cc.migratepages); |
@@ -1093,7 +1123,7 @@ int sysctl_extfrag_threshold = 500; | |||
1093 | * @order: The order of the current allocation | 1123 | * @order: The order of the current allocation |
1094 | * @gfp_mask: The GFP mask of the current allocation | 1124 | * @gfp_mask: The GFP mask of the current allocation |
1095 | * @nodemask: The allowed nodes to allocate from | 1125 | * @nodemask: The allowed nodes to allocate from |
1096 | * @sync: Whether migration is synchronous or not | 1126 | * @mode: The migration mode for async, sync light, or sync migration |
1097 | * @contended: Return value that is true if compaction was aborted due to lock contention | 1127 | * @contended: Return value that is true if compaction was aborted due to lock contention |
1098 | * @page: Optionally capture a free page of the requested order during compaction | 1128 | * @page: Optionally capture a free page of the requested order during compaction |
1099 | * | 1129 | * |
@@ -1101,7 +1131,7 @@ int sysctl_extfrag_threshold = 500; | |||
1101 | */ | 1131 | */ |
1102 | unsigned long try_to_compact_pages(struct zonelist *zonelist, | 1132 | unsigned long try_to_compact_pages(struct zonelist *zonelist, |
1103 | int order, gfp_t gfp_mask, nodemask_t *nodemask, | 1133 | int order, gfp_t gfp_mask, nodemask_t *nodemask, |
1104 | bool sync, bool *contended) | 1134 | enum migrate_mode mode, bool *contended) |
1105 | { | 1135 | { |
1106 | enum zone_type high_zoneidx = gfp_zone(gfp_mask); | 1136 | enum zone_type high_zoneidx = gfp_zone(gfp_mask); |
1107 | int may_enter_fs = gfp_mask & __GFP_FS; | 1137 | int may_enter_fs = gfp_mask & __GFP_FS; |
@@ -1126,7 +1156,7 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist, | |||
1126 | nodemask) { | 1156 | nodemask) { |
1127 | int status; | 1157 | int status; |
1128 | 1158 | ||
1129 | status = compact_zone_order(zone, order, gfp_mask, sync, | 1159 | status = compact_zone_order(zone, order, gfp_mask, mode, |
1130 | contended); | 1160 | contended); |
1131 | rc = max(status, rc); | 1161 | rc = max(status, rc); |
1132 | 1162 | ||
@@ -1165,9 +1195,6 @@ static void __compact_pgdat(pg_data_t *pgdat, struct compact_control *cc) | |||
1165 | if (zone_watermark_ok(zone, cc->order, | 1195 | if (zone_watermark_ok(zone, cc->order, |
1166 | low_wmark_pages(zone), 0, 0)) | 1196 | low_wmark_pages(zone), 0, 0)) |
1167 | compaction_defer_reset(zone, cc->order, false); | 1197 | compaction_defer_reset(zone, cc->order, false); |
1168 | /* Currently async compaction is never deferred. */ | ||
1169 | else if (cc->sync) | ||
1170 | defer_compaction(zone, cc->order); | ||
1171 | } | 1198 | } |
1172 | 1199 | ||
1173 | VM_BUG_ON(!list_empty(&cc->freepages)); | 1200 | VM_BUG_ON(!list_empty(&cc->freepages)); |
@@ -1179,7 +1206,7 @@ void compact_pgdat(pg_data_t *pgdat, int order) | |||
1179 | { | 1206 | { |
1180 | struct compact_control cc = { | 1207 | struct compact_control cc = { |
1181 | .order = order, | 1208 | .order = order, |
1182 | .sync = false, | 1209 | .mode = MIGRATE_ASYNC, |
1183 | }; | 1210 | }; |
1184 | 1211 | ||
1185 | if (!order) | 1212 | if (!order) |
@@ -1192,7 +1219,7 @@ static void compact_node(int nid) | |||
1192 | { | 1219 | { |
1193 | struct compact_control cc = { | 1220 | struct compact_control cc = { |
1194 | .order = -1, | 1221 | .order = -1, |
1195 | .sync = true, | 1222 | .mode = MIGRATE_SYNC, |
1196 | .ignore_skip_hint = true, | 1223 | .ignore_skip_hint = true, |
1197 | }; | 1224 | }; |
1198 | 1225 | ||
diff --git a/mm/dmapool.c b/mm/dmapool.c index c69781e97cf9..306baa594f95 100644 --- a/mm/dmapool.c +++ b/mm/dmapool.c | |||
@@ -170,24 +170,16 @@ struct dma_pool *dma_pool_create(const char *name, struct device *dev, | |||
170 | retval->boundary = boundary; | 170 | retval->boundary = boundary; |
171 | retval->allocation = allocation; | 171 | retval->allocation = allocation; |
172 | 172 | ||
173 | if (dev) { | 173 | INIT_LIST_HEAD(&retval->pools); |
174 | int ret; | ||
175 | 174 | ||
176 | mutex_lock(&pools_lock); | 175 | mutex_lock(&pools_lock); |
177 | if (list_empty(&dev->dma_pools)) | 176 | if (list_empty(&dev->dma_pools) && |
178 | ret = device_create_file(dev, &dev_attr_pools); | 177 | device_create_file(dev, &dev_attr_pools)) { |
179 | else | 178 | kfree(retval); |
180 | ret = 0; | 179 | return NULL; |
181 | /* note: not currently insisting "name" be unique */ | ||
182 | if (!ret) | ||
183 | list_add(&retval->pools, &dev->dma_pools); | ||
184 | else { | ||
185 | kfree(retval); | ||
186 | retval = NULL; | ||
187 | } | ||
188 | mutex_unlock(&pools_lock); | ||
189 | } else | 180 | } else |
190 | INIT_LIST_HEAD(&retval->pools); | 181 | list_add(&retval->pools, &dev->dma_pools); |
182 | mutex_unlock(&pools_lock); | ||
191 | 183 | ||
192 | return retval; | 184 | return retval; |
193 | } | 185 | } |
@@ -341,10 +333,10 @@ void *dma_pool_alloc(struct dma_pool *pool, gfp_t mem_flags, | |||
341 | continue; | 333 | continue; |
342 | if (pool->dev) | 334 | if (pool->dev) |
343 | dev_err(pool->dev, | 335 | dev_err(pool->dev, |
344 | "dma_pool_alloc %s, %p (corruped)\n", | 336 | "dma_pool_alloc %s, %p (corrupted)\n", |
345 | pool->name, retval); | 337 | pool->name, retval); |
346 | else | 338 | else |
347 | pr_err("dma_pool_alloc %s, %p (corruped)\n", | 339 | pr_err("dma_pool_alloc %s, %p (corrupted)\n", |
348 | pool->name, retval); | 340 | pool->name, retval); |
349 | 341 | ||
350 | /* | 342 | /* |
@@ -508,7 +500,6 @@ void dmam_pool_destroy(struct dma_pool *pool) | |||
508 | { | 500 | { |
509 | struct device *dev = pool->dev; | 501 | struct device *dev = pool->dev; |
510 | 502 | ||
511 | WARN_ON(devres_destroy(dev, dmam_pool_release, dmam_pool_match, pool)); | 503 | WARN_ON(devres_release(dev, dmam_pool_release, dmam_pool_match, pool)); |
512 | dma_pool_destroy(pool); | ||
513 | } | 504 | } |
514 | EXPORT_SYMBOL(dmam_pool_destroy); | 505 | EXPORT_SYMBOL(dmam_pool_destroy); |
diff --git a/mm/filemap.c b/mm/filemap.c index 000a220e2a41..7fadf1c62838 100644 --- a/mm/filemap.c +++ b/mm/filemap.c | |||
@@ -257,9 +257,11 @@ static int filemap_check_errors(struct address_space *mapping) | |||
257 | { | 257 | { |
258 | int ret = 0; | 258 | int ret = 0; |
259 | /* Check for outstanding write errors */ | 259 | /* Check for outstanding write errors */ |
260 | if (test_and_clear_bit(AS_ENOSPC, &mapping->flags)) | 260 | if (test_bit(AS_ENOSPC, &mapping->flags) && |
261 | test_and_clear_bit(AS_ENOSPC, &mapping->flags)) | ||
261 | ret = -ENOSPC; | 262 | ret = -ENOSPC; |
262 | if (test_and_clear_bit(AS_EIO, &mapping->flags)) | 263 | if (test_bit(AS_EIO, &mapping->flags) && |
264 | test_and_clear_bit(AS_EIO, &mapping->flags)) | ||
263 | ret = -EIO; | 265 | ret = -EIO; |
264 | return ret; | 266 | return ret; |
265 | } | 267 | } |
@@ -740,7 +742,7 @@ void unlock_page(struct page *page) | |||
740 | { | 742 | { |
741 | VM_BUG_ON_PAGE(!PageLocked(page), page); | 743 | VM_BUG_ON_PAGE(!PageLocked(page), page); |
742 | clear_bit_unlock(PG_locked, &page->flags); | 744 | clear_bit_unlock(PG_locked, &page->flags); |
743 | smp_mb__after_clear_bit(); | 745 | smp_mb__after_atomic(); |
744 | wake_up_page(page, PG_locked); | 746 | wake_up_page(page, PG_locked); |
745 | } | 747 | } |
746 | EXPORT_SYMBOL(unlock_page); | 748 | EXPORT_SYMBOL(unlock_page); |
@@ -751,17 +753,51 @@ EXPORT_SYMBOL(unlock_page); | |||
751 | */ | 753 | */ |
752 | void end_page_writeback(struct page *page) | 754 | void end_page_writeback(struct page *page) |
753 | { | 755 | { |
754 | if (TestClearPageReclaim(page)) | 756 | /* |
757 | * TestClearPageReclaim could be used here but it is an atomic | ||
758 | * operation and overkill in this particular case. Failing to | ||
759 | * shuffle a page marked for immediate reclaim is too mild to | ||
760 | * justify taking an atomic operation penalty at the end of | ||
761 | * ever page writeback. | ||
762 | */ | ||
763 | if (PageReclaim(page)) { | ||
764 | ClearPageReclaim(page); | ||
755 | rotate_reclaimable_page(page); | 765 | rotate_reclaimable_page(page); |
766 | } | ||
756 | 767 | ||
757 | if (!test_clear_page_writeback(page)) | 768 | if (!test_clear_page_writeback(page)) |
758 | BUG(); | 769 | BUG(); |
759 | 770 | ||
760 | smp_mb__after_clear_bit(); | 771 | smp_mb__after_atomic(); |
761 | wake_up_page(page, PG_writeback); | 772 | wake_up_page(page, PG_writeback); |
762 | } | 773 | } |
763 | EXPORT_SYMBOL(end_page_writeback); | 774 | EXPORT_SYMBOL(end_page_writeback); |
764 | 775 | ||
776 | /* | ||
777 | * After completing I/O on a page, call this routine to update the page | ||
778 | * flags appropriately | ||
779 | */ | ||
780 | void page_endio(struct page *page, int rw, int err) | ||
781 | { | ||
782 | if (rw == READ) { | ||
783 | if (!err) { | ||
784 | SetPageUptodate(page); | ||
785 | } else { | ||
786 | ClearPageUptodate(page); | ||
787 | SetPageError(page); | ||
788 | } | ||
789 | unlock_page(page); | ||
790 | } else { /* rw == WRITE */ | ||
791 | if (err) { | ||
792 | SetPageError(page); | ||
793 | if (page->mapping) | ||
794 | mapping_set_error(page->mapping, err); | ||
795 | } | ||
796 | end_page_writeback(page); | ||
797 | } | ||
798 | } | ||
799 | EXPORT_SYMBOL_GPL(page_endio); | ||
800 | |||
765 | /** | 801 | /** |
766 | * __lock_page - get a lock on the page, assuming we need to sleep to get it | 802 | * __lock_page - get a lock on the page, assuming we need to sleep to get it |
767 | * @page: the page to lock | 803 | * @page: the page to lock |
@@ -955,26 +991,6 @@ out: | |||
955 | EXPORT_SYMBOL(find_get_entry); | 991 | EXPORT_SYMBOL(find_get_entry); |
956 | 992 | ||
957 | /** | 993 | /** |
958 | * find_get_page - find and get a page reference | ||
959 | * @mapping: the address_space to search | ||
960 | * @offset: the page index | ||
961 | * | ||
962 | * Looks up the page cache slot at @mapping & @offset. If there is a | ||
963 | * page cache page, it is returned with an increased refcount. | ||
964 | * | ||
965 | * Otherwise, %NULL is returned. | ||
966 | */ | ||
967 | struct page *find_get_page(struct address_space *mapping, pgoff_t offset) | ||
968 | { | ||
969 | struct page *page = find_get_entry(mapping, offset); | ||
970 | |||
971 | if (radix_tree_exceptional_entry(page)) | ||
972 | page = NULL; | ||
973 | return page; | ||
974 | } | ||
975 | EXPORT_SYMBOL(find_get_page); | ||
976 | |||
977 | /** | ||
978 | * find_lock_entry - locate, pin and lock a page cache entry | 994 | * find_lock_entry - locate, pin and lock a page cache entry |
979 | * @mapping: the address_space to search | 995 | * @mapping: the address_space to search |
980 | * @offset: the page cache index | 996 | * @offset: the page cache index |
@@ -1011,66 +1027,84 @@ repeat: | |||
1011 | EXPORT_SYMBOL(find_lock_entry); | 1027 | EXPORT_SYMBOL(find_lock_entry); |
1012 | 1028 | ||
1013 | /** | 1029 | /** |
1014 | * find_lock_page - locate, pin and lock a pagecache page | 1030 | * pagecache_get_page - find and get a page reference |
1015 | * @mapping: the address_space to search | 1031 | * @mapping: the address_space to search |
1016 | * @offset: the page index | 1032 | * @offset: the page index |
1033 | * @fgp_flags: PCG flags | ||
1034 | * @gfp_mask: gfp mask to use if a page is to be allocated | ||
1017 | * | 1035 | * |
1018 | * Looks up the page cache slot at @mapping & @offset. If there is a | 1036 | * Looks up the page cache slot at @mapping & @offset. |
1019 | * page cache page, it is returned locked and with an increased | ||
1020 | * refcount. | ||
1021 | * | ||
1022 | * Otherwise, %NULL is returned. | ||
1023 | * | ||
1024 | * find_lock_page() may sleep. | ||
1025 | */ | ||
1026 | struct page *find_lock_page(struct address_space *mapping, pgoff_t offset) | ||
1027 | { | ||
1028 | struct page *page = find_lock_entry(mapping, offset); | ||
1029 | |||
1030 | if (radix_tree_exceptional_entry(page)) | ||
1031 | page = NULL; | ||
1032 | return page; | ||
1033 | } | ||
1034 | EXPORT_SYMBOL(find_lock_page); | ||
1035 | |||
1036 | /** | ||
1037 | * find_or_create_page - locate or add a pagecache page | ||
1038 | * @mapping: the page's address_space | ||
1039 | * @index: the page's index into the mapping | ||
1040 | * @gfp_mask: page allocation mode | ||
1041 | * | 1037 | * |
1042 | * Looks up the page cache slot at @mapping & @offset. If there is a | 1038 | * PCG flags modify how the page is returned |
1043 | * page cache page, it is returned locked and with an increased | ||
1044 | * refcount. | ||
1045 | * | 1039 | * |
1046 | * If the page is not present, a new page is allocated using @gfp_mask | 1040 | * FGP_ACCESSED: the page will be marked accessed |
1047 | * and added to the page cache and the VM's LRU list. The page is | 1041 | * FGP_LOCK: Page is return locked |
1048 | * returned locked and with an increased refcount. | 1042 | * FGP_CREAT: If page is not present then a new page is allocated using |
1043 | * @gfp_mask and added to the page cache and the VM's LRU | ||
1044 | * list. The page is returned locked and with an increased | ||
1045 | * refcount. Otherwise, %NULL is returned. | ||
1049 | * | 1046 | * |
1050 | * On memory exhaustion, %NULL is returned. | 1047 | * If FGP_LOCK or FGP_CREAT are specified then the function may sleep even |
1048 | * if the GFP flags specified for FGP_CREAT are atomic. | ||
1051 | * | 1049 | * |
1052 | * find_or_create_page() may sleep, even if @gfp_flags specifies an | 1050 | * If there is a page cache page, it is returned with an increased refcount. |
1053 | * atomic allocation! | ||
1054 | */ | 1051 | */ |
1055 | struct page *find_or_create_page(struct address_space *mapping, | 1052 | struct page *pagecache_get_page(struct address_space *mapping, pgoff_t offset, |
1056 | pgoff_t index, gfp_t gfp_mask) | 1053 | int fgp_flags, gfp_t cache_gfp_mask, gfp_t radix_gfp_mask) |
1057 | { | 1054 | { |
1058 | struct page *page; | 1055 | struct page *page; |
1059 | int err; | 1056 | |
1060 | repeat: | 1057 | repeat: |
1061 | page = find_lock_page(mapping, index); | 1058 | page = find_get_entry(mapping, offset); |
1062 | if (!page) { | 1059 | if (radix_tree_exceptional_entry(page)) |
1063 | page = __page_cache_alloc(gfp_mask); | 1060 | page = NULL; |
1061 | if (!page) | ||
1062 | goto no_page; | ||
1063 | |||
1064 | if (fgp_flags & FGP_LOCK) { | ||
1065 | if (fgp_flags & FGP_NOWAIT) { | ||
1066 | if (!trylock_page(page)) { | ||
1067 | page_cache_release(page); | ||
1068 | return NULL; | ||
1069 | } | ||
1070 | } else { | ||
1071 | lock_page(page); | ||
1072 | } | ||
1073 | |||
1074 | /* Has the page been truncated? */ | ||
1075 | if (unlikely(page->mapping != mapping)) { | ||
1076 | unlock_page(page); | ||
1077 | page_cache_release(page); | ||
1078 | goto repeat; | ||
1079 | } | ||
1080 | VM_BUG_ON_PAGE(page->index != offset, page); | ||
1081 | } | ||
1082 | |||
1083 | if (page && (fgp_flags & FGP_ACCESSED)) | ||
1084 | mark_page_accessed(page); | ||
1085 | |||
1086 | no_page: | ||
1087 | if (!page && (fgp_flags & FGP_CREAT)) { | ||
1088 | int err; | ||
1089 | if ((fgp_flags & FGP_WRITE) && mapping_cap_account_dirty(mapping)) | ||
1090 | cache_gfp_mask |= __GFP_WRITE; | ||
1091 | if (fgp_flags & FGP_NOFS) { | ||
1092 | cache_gfp_mask &= ~__GFP_FS; | ||
1093 | radix_gfp_mask &= ~__GFP_FS; | ||
1094 | } | ||
1095 | |||
1096 | page = __page_cache_alloc(cache_gfp_mask); | ||
1064 | if (!page) | 1097 | if (!page) |
1065 | return NULL; | 1098 | return NULL; |
1066 | /* | 1099 | |
1067 | * We want a regular kernel memory (not highmem or DMA etc) | 1100 | if (WARN_ON_ONCE(!(fgp_flags & FGP_LOCK))) |
1068 | * allocation for the radix tree nodes, but we need to honour | 1101 | fgp_flags |= FGP_LOCK; |
1069 | * the context-specific requirements the caller has asked for. | 1102 | |
1070 | * GFP_RECLAIM_MASK collects those requirements. | 1103 | /* Init accessed so avoid atomic mark_page_accessed later */ |
1071 | */ | 1104 | if (fgp_flags & FGP_ACCESSED) |
1072 | err = add_to_page_cache_lru(page, mapping, index, | 1105 | init_page_accessed(page); |
1073 | (gfp_mask & GFP_RECLAIM_MASK)); | 1106 | |
1107 | err = add_to_page_cache_lru(page, mapping, offset, radix_gfp_mask); | ||
1074 | if (unlikely(err)) { | 1108 | if (unlikely(err)) { |
1075 | page_cache_release(page); | 1109 | page_cache_release(page); |
1076 | page = NULL; | 1110 | page = NULL; |
@@ -1078,9 +1112,10 @@ repeat: | |||
1078 | goto repeat; | 1112 | goto repeat; |
1079 | } | 1113 | } |
1080 | } | 1114 | } |
1115 | |||
1081 | return page; | 1116 | return page; |
1082 | } | 1117 | } |
1083 | EXPORT_SYMBOL(find_or_create_page); | 1118 | EXPORT_SYMBOL(pagecache_get_page); |
1084 | 1119 | ||
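Usage sketch (not part of this patch; the wrapper name is made up): the FGP_* flags above fold the older helpers into one entry point, so the old find_or_create_page() behaviour can be expressed as a thin wrapper. GFP_RECLAIM_MASK comes from mm/internal.h, so this sketch assumes it lives inside mm/, as filemap.c does.

	#include <linux/pagemap.h>
	#include "internal.h"		/* for GFP_RECLAIM_MASK, as in mm/filemap.c */

	/* Illustrative only: lock the page, mark it accessed, create it if absent. */
	static inline struct page *
	demo_find_or_create_page(struct address_space *mapping, pgoff_t index,
				 gfp_t gfp_mask)
	{
		return pagecache_get_page(mapping, index,
					  FGP_LOCK | FGP_ACCESSED | FGP_CREAT,
					  gfp_mask,
					  gfp_mask & GFP_RECLAIM_MASK);
	}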
1085 | /** | 1120 | /** |
1086 | * find_get_entries - gang pagecache lookup | 1121 | * find_get_entries - gang pagecache lookup |
@@ -1377,39 +1412,6 @@ repeat: | |||
1377 | } | 1412 | } |
1378 | EXPORT_SYMBOL(find_get_pages_tag); | 1413 | EXPORT_SYMBOL(find_get_pages_tag); |
1379 | 1414 | ||
1380 | /** | ||
1381 | * grab_cache_page_nowait - returns locked page at given index in given cache | ||
1382 | * @mapping: target address_space | ||
1383 | * @index: the page index | ||
1384 | * | ||
1385 | * Same as grab_cache_page(), but do not wait if the page is unavailable. | ||
1386 | * This is intended for speculative data generators, where the data can | ||
1387 | * be regenerated if the page couldn't be grabbed. This routine should | ||
1388 | * be safe to call while holding the lock for another page. | ||
1389 | * | ||
1390 | * Clear __GFP_FS when allocating the page to avoid recursion into the fs | ||
1391 | * and deadlock against the caller's locked page. | ||
1392 | */ | ||
1393 | struct page * | ||
1394 | grab_cache_page_nowait(struct address_space *mapping, pgoff_t index) | ||
1395 | { | ||
1396 | struct page *page = find_get_page(mapping, index); | ||
1397 | |||
1398 | if (page) { | ||
1399 | if (trylock_page(page)) | ||
1400 | return page; | ||
1401 | page_cache_release(page); | ||
1402 | return NULL; | ||
1403 | } | ||
1404 | page = __page_cache_alloc(mapping_gfp_mask(mapping) & ~__GFP_FS); | ||
1405 | if (page && add_to_page_cache_lru(page, mapping, index, GFP_NOFS)) { | ||
1406 | page_cache_release(page); | ||
1407 | page = NULL; | ||
1408 | } | ||
1409 | return page; | ||
1410 | } | ||
1411 | EXPORT_SYMBOL(grab_cache_page_nowait); | ||
1412 | |||
1413 | /* | 1415 | /* |
1414 | * CD/DVDs are error prone. When a medium error occurs, the driver may fail | 1416 | * CD/DVDs are error prone. When a medium error occurs, the driver may fail |
1415 | * a _large_ part of the i/o request. Imagine the worst scenario: | 1417 | * a _large_ part of the i/o request. Imagine the worst scenario: |
@@ -2379,7 +2381,6 @@ int pagecache_write_end(struct file *file, struct address_space *mapping, | |||
2379 | { | 2381 | { |
2380 | const struct address_space_operations *aops = mapping->a_ops; | 2382 | const struct address_space_operations *aops = mapping->a_ops; |
2381 | 2383 | ||
2382 | mark_page_accessed(page); | ||
2383 | return aops->write_end(file, mapping, pos, len, copied, page, fsdata); | 2384 | return aops->write_end(file, mapping, pos, len, copied, page, fsdata); |
2384 | } | 2385 | } |
2385 | EXPORT_SYMBOL(pagecache_write_end); | 2386 | EXPORT_SYMBOL(pagecache_write_end); |
@@ -2461,34 +2462,18 @@ EXPORT_SYMBOL(generic_file_direct_write); | |||
2461 | struct page *grab_cache_page_write_begin(struct address_space *mapping, | 2462 | struct page *grab_cache_page_write_begin(struct address_space *mapping, |
2462 | pgoff_t index, unsigned flags) | 2463 | pgoff_t index, unsigned flags) |
2463 | { | 2464 | { |
2464 | int status; | ||
2465 | gfp_t gfp_mask; | ||
2466 | struct page *page; | 2465 | struct page *page; |
2467 | gfp_t gfp_notmask = 0; | 2466 | int fgp_flags = FGP_LOCK|FGP_ACCESSED|FGP_WRITE|FGP_CREAT; |
2468 | 2467 | ||
2469 | gfp_mask = mapping_gfp_mask(mapping); | ||
2470 | if (mapping_cap_account_dirty(mapping)) | ||
2471 | gfp_mask |= __GFP_WRITE; | ||
2472 | if (flags & AOP_FLAG_NOFS) | 2468 | if (flags & AOP_FLAG_NOFS) |
2473 | gfp_notmask = __GFP_FS; | 2469 | fgp_flags |= FGP_NOFS; |
2474 | repeat: | 2470 | |
2475 | page = find_lock_page(mapping, index); | 2471 | page = pagecache_get_page(mapping, index, fgp_flags, |
2472 | mapping_gfp_mask(mapping), | ||
2473 | GFP_KERNEL); | ||
2476 | if (page) | 2474 | if (page) |
2477 | goto found; | 2475 | wait_for_stable_page(page); |
2478 | 2476 | ||
2479 | page = __page_cache_alloc(gfp_mask & ~gfp_notmask); | ||
2480 | if (!page) | ||
2481 | return NULL; | ||
2482 | status = add_to_page_cache_lru(page, mapping, index, | ||
2483 | GFP_KERNEL & ~gfp_notmask); | ||
2484 | if (unlikely(status)) { | ||
2485 | page_cache_release(page); | ||
2486 | if (status == -EEXIST) | ||
2487 | goto repeat; | ||
2488 | return NULL; | ||
2489 | } | ||
2490 | found: | ||
2491 | wait_for_stable_page(page); | ||
2492 | return page; | 2477 | return page; |
2493 | } | 2478 | } |
2494 | EXPORT_SYMBOL(grab_cache_page_write_begin); | 2479 | EXPORT_SYMBOL(grab_cache_page_write_begin); |
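For comparison, a minimal ->write_begin() caller (hypothetical sketch, not from this patch) now only forwards its AOP flags; the NOFS masking and dirty-write accounting happen inside pagecache_get_page().

	static int demo_write_begin(struct file *file, struct address_space *mapping,
				    loff_t pos, unsigned len, unsigned flags,
				    struct page **pagep, void **fsdata)
	{
		pgoff_t index = pos >> PAGE_CACHE_SHIFT;
		struct page *page;

		/* Locked, accessed, write-accounted page; NULL on allocation failure. */
		page = grab_cache_page_write_begin(mapping, index, flags);
		if (!page)
			return -ENOMEM;
		*pagep = page;
		return 0;
	}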
@@ -2537,7 +2522,7 @@ again: | |||
2537 | 2522 | ||
2538 | status = a_ops->write_begin(file, mapping, pos, bytes, flags, | 2523 | status = a_ops->write_begin(file, mapping, pos, bytes, flags, |
2539 | &page, &fsdata); | 2524 | &page, &fsdata); |
2540 | if (unlikely(status)) | 2525 | if (unlikely(status < 0)) |
2541 | break; | 2526 | break; |
2542 | 2527 | ||
2543 | if (mapping_writably_mapped(mapping)) | 2528 | if (mapping_writably_mapped(mapping)) |
@@ -2546,7 +2531,6 @@ again: | |||
2546 | copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes); | 2531 | copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes); |
2547 | flush_dcache_page(page); | 2532 | flush_dcache_page(page); |
2548 | 2533 | ||
2549 | mark_page_accessed(page); | ||
2550 | status = a_ops->write_end(file, mapping, pos, bytes, copied, | 2534 | status = a_ops->write_end(file, mapping, pos, bytes, copied, |
2551 | page, fsdata); | 2535 | page, fsdata); |
2552 | if (unlikely(status < 0)) | 2536 | if (unlikely(status < 0)) |
diff --git a/mm/fremap.c b/mm/fremap.c index 34feba60a17e..2c5646f11f41 100644 --- a/mm/fremap.c +++ b/mm/fremap.c | |||
@@ -82,13 +82,10 @@ static int install_file_pte(struct mm_struct *mm, struct vm_area_struct *vma, | |||
82 | 82 | ||
83 | ptfile = pgoff_to_pte(pgoff); | 83 | ptfile = pgoff_to_pte(pgoff); |
84 | 84 | ||
85 | if (!pte_none(*pte)) { | 85 | if (!pte_none(*pte)) |
86 | if (pte_present(*pte) && pte_soft_dirty(*pte)) | ||
87 | pte_file_mksoft_dirty(ptfile); | ||
88 | zap_pte(mm, vma, addr, pte); | 86 | zap_pte(mm, vma, addr, pte); |
89 | } | ||
90 | 87 | ||
91 | set_pte_at(mm, addr, pte, ptfile); | 88 | set_pte_at(mm, addr, pte, pte_file_mksoft_dirty(ptfile)); |
92 | /* | 89 | /* |
93 | * We don't need to run update_mmu_cache() here because the "file pte" | 90 | * We don't need to run update_mmu_cache() here because the "file pte" |
94 | * being installed by install_file_pte() is not a real pte - it's a | 91 | * being installed by install_file_pte() is not a real pte - it's a |
diff --git a/mm/frontswap.c b/mm/frontswap.c index 1b24bdcb3197..c30eec536f03 100644 --- a/mm/frontswap.c +++ b/mm/frontswap.c | |||
@@ -327,15 +327,12 @@ EXPORT_SYMBOL(__frontswap_invalidate_area); | |||
327 | 327 | ||
328 | static unsigned long __frontswap_curr_pages(void) | 328 | static unsigned long __frontswap_curr_pages(void) |
329 | { | 329 | { |
330 | int type; | ||
331 | unsigned long totalpages = 0; | 330 | unsigned long totalpages = 0; |
332 | struct swap_info_struct *si = NULL; | 331 | struct swap_info_struct *si = NULL; |
333 | 332 | ||
334 | assert_spin_locked(&swap_lock); | 333 | assert_spin_locked(&swap_lock); |
335 | for (type = swap_list.head; type >= 0; type = si->next) { | 334 | plist_for_each_entry(si, &swap_active_head, list) |
336 | si = swap_info[type]; | ||
337 | totalpages += atomic_read(&si->frontswap_pages); | 335 | totalpages += atomic_read(&si->frontswap_pages); |
338 | } | ||
339 | return totalpages; | 336 | return totalpages; |
340 | } | 337 | } |
341 | 338 | ||
@@ -347,11 +344,9 @@ static int __frontswap_unuse_pages(unsigned long total, unsigned long *unused, | |||
347 | int si_frontswap_pages; | 344 | int si_frontswap_pages; |
348 | unsigned long total_pages_to_unuse = total; | 345 | unsigned long total_pages_to_unuse = total; |
349 | unsigned long pages = 0, pages_to_unuse = 0; | 346 | unsigned long pages = 0, pages_to_unuse = 0; |
350 | int type; | ||
351 | 347 | ||
352 | assert_spin_locked(&swap_lock); | 348 | assert_spin_locked(&swap_lock); |
353 | for (type = swap_list.head; type >= 0; type = si->next) { | 349 | plist_for_each_entry(si, &swap_active_head, list) { |
354 | si = swap_info[type]; | ||
355 | si_frontswap_pages = atomic_read(&si->frontswap_pages); | 350 | si_frontswap_pages = atomic_read(&si->frontswap_pages); |
356 | if (total_pages_to_unuse < si_frontswap_pages) { | 351 | if (total_pages_to_unuse < si_frontswap_pages) { |
357 | pages = pages_to_unuse = total_pages_to_unuse; | 352 | pages = pages_to_unuse = total_pages_to_unuse; |
@@ -366,7 +361,7 @@ static int __frontswap_unuse_pages(unsigned long total, unsigned long *unused, | |||
366 | } | 361 | } |
367 | vm_unacct_memory(pages); | 362 | vm_unacct_memory(pages); |
368 | *unused = pages_to_unuse; | 363 | *unused = pages_to_unuse; |
369 | *swapid = type; | 364 | *swapid = si->type; |
370 | ret = 0; | 365 | ret = 0; |
371 | break; | 366 | break; |
372 | } | 367 | } |
@@ -413,7 +408,7 @@ void frontswap_shrink(unsigned long target_pages) | |||
413 | /* | 408 | /* |
414 | * we don't want to hold swap_lock while doing a very | 409 | * we don't want to hold swap_lock while doing a very |
415 | * lengthy try_to_unuse, but swap_list may change | 410 | * lengthy try_to_unuse, but swap_list may change |
416 | * so restart scan from swap_list.head each time | 411 | * so restart scan from swap_active_head each time |
417 | */ | 412 | */ |
418 | spin_lock(&swap_lock); | 413 | spin_lock(&swap_lock); |
419 | ret = __frontswap_shrink(target_pages, &pages_to_unuse, &type); | 414 | ret = __frontswap_shrink(target_pages, &pages_to_unuse, &type); |
diff --git a/mm/gup.c b/mm/gup.c new file mode 100644 index 000000000000..cc5a9e7adea7 --- /dev/null +++ b/mm/gup.c | |||
@@ -0,0 +1,662 @@ | |||
1 | #include <linux/kernel.h> | ||
2 | #include <linux/errno.h> | ||
3 | #include <linux/err.h> | ||
4 | #include <linux/spinlock.h> | ||
5 | |||
6 | #include <linux/hugetlb.h> | ||
7 | #include <linux/mm.h> | ||
8 | #include <linux/pagemap.h> | ||
9 | #include <linux/rmap.h> | ||
10 | #include <linux/swap.h> | ||
11 | #include <linux/swapops.h> | ||
12 | |||
13 | #include "internal.h" | ||
14 | |||
15 | static struct page *no_page_table(struct vm_area_struct *vma, | ||
16 | unsigned int flags) | ||
17 | { | ||
18 | /* | ||
19 | * When core dumping an enormous anonymous area that nobody | ||
20 | * has touched so far, we don't want to allocate unnecessary pages or | ||
21 | * page tables. Return error instead of NULL to skip handle_mm_fault, | ||
22 | * then get_dump_page() will return NULL to leave a hole in the dump. | ||
23 | * But we can only make this optimization where a hole would surely | ||
24 | * be zero-filled if handle_mm_fault() actually did handle it. | ||
25 | */ | ||
26 | if ((flags & FOLL_DUMP) && (!vma->vm_ops || !vma->vm_ops->fault)) | ||
27 | return ERR_PTR(-EFAULT); | ||
28 | return NULL; | ||
29 | } | ||
30 | |||
31 | static struct page *follow_page_pte(struct vm_area_struct *vma, | ||
32 | unsigned long address, pmd_t *pmd, unsigned int flags) | ||
33 | { | ||
34 | struct mm_struct *mm = vma->vm_mm; | ||
35 | struct page *page; | ||
36 | spinlock_t *ptl; | ||
37 | pte_t *ptep, pte; | ||
38 | |||
39 | retry: | ||
40 | if (unlikely(pmd_bad(*pmd))) | ||
41 | return no_page_table(vma, flags); | ||
42 | |||
43 | ptep = pte_offset_map_lock(mm, pmd, address, &ptl); | ||
44 | pte = *ptep; | ||
45 | if (!pte_present(pte)) { | ||
46 | swp_entry_t entry; | ||
47 | /* | ||
48 | * KSM's break_ksm() relies upon recognizing a ksm page | ||
49 | * even while it is being migrated, so for that case we | ||
50 | * need migration_entry_wait(). | ||
51 | */ | ||
52 | if (likely(!(flags & FOLL_MIGRATION))) | ||
53 | goto no_page; | ||
54 | if (pte_none(pte) || pte_file(pte)) | ||
55 | goto no_page; | ||
56 | entry = pte_to_swp_entry(pte); | ||
57 | if (!is_migration_entry(entry)) | ||
58 | goto no_page; | ||
59 | pte_unmap_unlock(ptep, ptl); | ||
60 | migration_entry_wait(mm, pmd, address); | ||
61 | goto retry; | ||
62 | } | ||
63 | if ((flags & FOLL_NUMA) && pte_numa(pte)) | ||
64 | goto no_page; | ||
65 | if ((flags & FOLL_WRITE) && !pte_write(pte)) { | ||
66 | pte_unmap_unlock(ptep, ptl); | ||
67 | return NULL; | ||
68 | } | ||
69 | |||
70 | page = vm_normal_page(vma, address, pte); | ||
71 | if (unlikely(!page)) { | ||
72 | if ((flags & FOLL_DUMP) || | ||
73 | !is_zero_pfn(pte_pfn(pte))) | ||
74 | goto bad_page; | ||
75 | page = pte_page(pte); | ||
76 | } | ||
77 | |||
78 | if (flags & FOLL_GET) | ||
79 | get_page_foll(page); | ||
80 | if (flags & FOLL_TOUCH) { | ||
81 | if ((flags & FOLL_WRITE) && | ||
82 | !pte_dirty(pte) && !PageDirty(page)) | ||
83 | set_page_dirty(page); | ||
84 | /* | ||
85 | * pte_mkyoung() would be more correct here, but atomic care | ||
86 | * is needed to avoid losing the dirty bit: it is easier to use | ||
87 | * mark_page_accessed(). | ||
88 | */ | ||
89 | mark_page_accessed(page); | ||
90 | } | ||
91 | if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) { | ||
92 | /* | ||
93 | * The preliminary mapping check is mainly to avoid the | ||
94 | * pointless overhead of lock_page on the ZERO_PAGE | ||
95 | * which might bounce very badly if there is contention. | ||
96 | * | ||
97 | * If the page is already locked, we don't need to | ||
98 | * handle it now - vmscan will handle it later if and | ||
99 | * when it attempts to reclaim the page. | ||
100 | */ | ||
101 | if (page->mapping && trylock_page(page)) { | ||
102 | lru_add_drain(); /* push cached pages to LRU */ | ||
103 | /* | ||
104 | * Because we lock page here, and migration is | ||
105 | * blocked by the pte's page reference, and we | ||
106 | * know the page is still mapped, we don't even | ||
107 | * need to check for file-cache page truncation. | ||
108 | */ | ||
109 | mlock_vma_page(page); | ||
110 | unlock_page(page); | ||
111 | } | ||
112 | } | ||
113 | pte_unmap_unlock(ptep, ptl); | ||
114 | return page; | ||
115 | bad_page: | ||
116 | pte_unmap_unlock(ptep, ptl); | ||
117 | return ERR_PTR(-EFAULT); | ||
118 | |||
119 | no_page: | ||
120 | pte_unmap_unlock(ptep, ptl); | ||
121 | if (!pte_none(pte)) | ||
122 | return NULL; | ||
123 | return no_page_table(vma, flags); | ||
124 | } | ||
125 | |||
126 | /** | ||
127 | * follow_page_mask - look up a page descriptor from a user-virtual address | ||
128 | * @vma: vm_area_struct mapping @address | ||
129 | * @address: virtual address to look up | ||
130 | * @flags: flags modifying lookup behaviour | ||
131 | * @page_mask: on output, *page_mask is set according to the size of the page | ||
132 | * | ||
133 | * @flags can have FOLL_ flags set, defined in <linux/mm.h> | ||
134 | * | ||
135 | * Returns the mapped (struct page *), %NULL if no mapping exists, or | ||
136 | * an error pointer if there is a mapping to something not represented | ||
137 | * by a page descriptor (see also vm_normal_page()). | ||
138 | */ | ||
139 | struct page *follow_page_mask(struct vm_area_struct *vma, | ||
140 | unsigned long address, unsigned int flags, | ||
141 | unsigned int *page_mask) | ||
142 | { | ||
143 | pgd_t *pgd; | ||
144 | pud_t *pud; | ||
145 | pmd_t *pmd; | ||
146 | spinlock_t *ptl; | ||
147 | struct page *page; | ||
148 | struct mm_struct *mm = vma->vm_mm; | ||
149 | |||
150 | *page_mask = 0; | ||
151 | |||
152 | page = follow_huge_addr(mm, address, flags & FOLL_WRITE); | ||
153 | if (!IS_ERR(page)) { | ||
154 | BUG_ON(flags & FOLL_GET); | ||
155 | return page; | ||
156 | } | ||
157 | |||
158 | pgd = pgd_offset(mm, address); | ||
159 | if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd))) | ||
160 | return no_page_table(vma, flags); | ||
161 | |||
162 | pud = pud_offset(pgd, address); | ||
163 | if (pud_none(*pud)) | ||
164 | return no_page_table(vma, flags); | ||
165 | if (pud_huge(*pud) && vma->vm_flags & VM_HUGETLB) { | ||
166 | if (flags & FOLL_GET) | ||
167 | return NULL; | ||
168 | page = follow_huge_pud(mm, address, pud, flags & FOLL_WRITE); | ||
169 | return page; | ||
170 | } | ||
171 | if (unlikely(pud_bad(*pud))) | ||
172 | return no_page_table(vma, flags); | ||
173 | |||
174 | pmd = pmd_offset(pud, address); | ||
175 | if (pmd_none(*pmd)) | ||
176 | return no_page_table(vma, flags); | ||
177 | if (pmd_huge(*pmd) && vma->vm_flags & VM_HUGETLB) { | ||
178 | page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE); | ||
179 | if (flags & FOLL_GET) { | ||
180 | /* | ||
181 | * Refcounts on tail pages are not well-defined and | ||
182 | * shouldn't be taken. The caller should handle a NULL | ||
183 | * return when trying to follow tail pages. | ||
184 | */ | ||
185 | if (PageHead(page)) | ||
186 | get_page(page); | ||
187 | else | ||
188 | page = NULL; | ||
189 | } | ||
190 | return page; | ||
191 | } | ||
192 | if ((flags & FOLL_NUMA) && pmd_numa(*pmd)) | ||
193 | return no_page_table(vma, flags); | ||
194 | if (pmd_trans_huge(*pmd)) { | ||
195 | if (flags & FOLL_SPLIT) { | ||
196 | split_huge_page_pmd(vma, address, pmd); | ||
197 | return follow_page_pte(vma, address, pmd, flags); | ||
198 | } | ||
199 | ptl = pmd_lock(mm, pmd); | ||
200 | if (likely(pmd_trans_huge(*pmd))) { | ||
201 | if (unlikely(pmd_trans_splitting(*pmd))) { | ||
202 | spin_unlock(ptl); | ||
203 | wait_split_huge_page(vma->anon_vma, pmd); | ||
204 | } else { | ||
205 | page = follow_trans_huge_pmd(vma, address, | ||
206 | pmd, flags); | ||
207 | spin_unlock(ptl); | ||
208 | *page_mask = HPAGE_PMD_NR - 1; | ||
209 | return page; | ||
210 | } | ||
211 | } else | ||
212 | spin_unlock(ptl); | ||
213 | } | ||
214 | return follow_page_pte(vma, address, pmd, flags); | ||
215 | } | ||
216 | |||
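A caller of follow_page_mask() (illustrative sketch only, with mmap_sem assumed held for read) has to distinguish the three documented outcomes and can use *page_mask to step over huge pages:

	static struct page *demo_follow(struct vm_area_struct *vma, unsigned long addr)
	{
		unsigned int page_mask;
		struct page *page;

		page = follow_page_mask(vma, addr, FOLL_GET, &page_mask);
		if (IS_ERR(page))	/* mapped, but not backed by a struct page */
			return NULL;
		if (!page)		/* nothing mapped: fall back to handle_mm_fault() */
			return NULL;
		/*
		 * For a THP, page_mask is HPAGE_PMD_NR - 1 and the caller may
		 * advance by (page_mask + 1) pages; drop the FOLL_GET reference
		 * with put_page() when done.
		 */
		return page;
	}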
217 | static int get_gate_page(struct mm_struct *mm, unsigned long address, | ||
218 | unsigned int gup_flags, struct vm_area_struct **vma, | ||
219 | struct page **page) | ||
220 | { | ||
221 | pgd_t *pgd; | ||
222 | pud_t *pud; | ||
223 | pmd_t *pmd; | ||
224 | pte_t *pte; | ||
225 | int ret = -EFAULT; | ||
226 | |||
227 | /* user gate pages are read-only */ | ||
228 | if (gup_flags & FOLL_WRITE) | ||
229 | return -EFAULT; | ||
230 | if (address > TASK_SIZE) | ||
231 | pgd = pgd_offset_k(address); | ||
232 | else | ||
233 | pgd = pgd_offset_gate(mm, address); | ||
234 | BUG_ON(pgd_none(*pgd)); | ||
235 | pud = pud_offset(pgd, address); | ||
236 | BUG_ON(pud_none(*pud)); | ||
237 | pmd = pmd_offset(pud, address); | ||
238 | if (pmd_none(*pmd)) | ||
239 | return -EFAULT; | ||
240 | VM_BUG_ON(pmd_trans_huge(*pmd)); | ||
241 | pte = pte_offset_map(pmd, address); | ||
242 | if (pte_none(*pte)) | ||
243 | goto unmap; | ||
244 | *vma = get_gate_vma(mm); | ||
245 | if (!page) | ||
246 | goto out; | ||
247 | *page = vm_normal_page(*vma, address, *pte); | ||
248 | if (!*page) { | ||
249 | if ((gup_flags & FOLL_DUMP) || !is_zero_pfn(pte_pfn(*pte))) | ||
250 | goto unmap; | ||
251 | *page = pte_page(*pte); | ||
252 | } | ||
253 | get_page(*page); | ||
254 | out: | ||
255 | ret = 0; | ||
256 | unmap: | ||
257 | pte_unmap(pte); | ||
258 | return ret; | ||
259 | } | ||
260 | |||
261 | static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma, | ||
262 | unsigned long address, unsigned int *flags, int *nonblocking) | ||
263 | { | ||
264 | struct mm_struct *mm = vma->vm_mm; | ||
265 | unsigned int fault_flags = 0; | ||
266 | int ret; | ||
267 | |||
268 | /* For mlock, just skip the stack guard page. */ | ||
269 | if ((*flags & FOLL_MLOCK) && | ||
270 | (stack_guard_page_start(vma, address) || | ||
271 | stack_guard_page_end(vma, address + PAGE_SIZE))) | ||
272 | return -ENOENT; | ||
273 | if (*flags & FOLL_WRITE) | ||
274 | fault_flags |= FAULT_FLAG_WRITE; | ||
275 | if (nonblocking) | ||
276 | fault_flags |= FAULT_FLAG_ALLOW_RETRY; | ||
277 | if (*flags & FOLL_NOWAIT) | ||
278 | fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT; | ||
279 | |||
280 | ret = handle_mm_fault(mm, vma, address, fault_flags); | ||
281 | if (ret & VM_FAULT_ERROR) { | ||
282 | if (ret & VM_FAULT_OOM) | ||
283 | return -ENOMEM; | ||
284 | if (ret & (VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE)) | ||
285 | return *flags & FOLL_HWPOISON ? -EHWPOISON : -EFAULT; | ||
286 | if (ret & VM_FAULT_SIGBUS) | ||
287 | return -EFAULT; | ||
288 | BUG(); | ||
289 | } | ||
290 | |||
291 | if (tsk) { | ||
292 | if (ret & VM_FAULT_MAJOR) | ||
293 | tsk->maj_flt++; | ||
294 | else | ||
295 | tsk->min_flt++; | ||
296 | } | ||
297 | |||
298 | if (ret & VM_FAULT_RETRY) { | ||
299 | if (nonblocking) | ||
300 | *nonblocking = 0; | ||
301 | return -EBUSY; | ||
302 | } | ||
303 | |||
304 | /* | ||
305 | * The VM_FAULT_WRITE bit tells us that do_wp_page has broken COW when | ||
306 | * necessary, even if maybe_mkwrite decided not to set pte_write. We | ||
307 | * can thus safely do subsequent page lookups as if they were reads. | ||
308 | * But only do so when looping for pte_write is futile: in some cases | ||
309 | * userspace may also be wanting to write to the gotten user page, | ||
310 | * which a read fault here might prevent (a readonly page might get | ||
311 | * reCOWed by userspace write). | ||
312 | */ | ||
313 | if ((ret & VM_FAULT_WRITE) && !(vma->vm_flags & VM_WRITE)) | ||
314 | *flags &= ~FOLL_WRITE; | ||
315 | return 0; | ||
316 | } | ||
317 | |||
318 | static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags) | ||
319 | { | ||
320 | vm_flags_t vm_flags = vma->vm_flags; | ||
321 | |||
322 | if (vm_flags & (VM_IO | VM_PFNMAP)) | ||
323 | return -EFAULT; | ||
324 | |||
325 | if (gup_flags & FOLL_WRITE) { | ||
326 | if (!(vm_flags & VM_WRITE)) { | ||
327 | if (!(gup_flags & FOLL_FORCE)) | ||
328 | return -EFAULT; | ||
329 | /* | ||
330 | * We used to let the write,force case do COW in a | ||
331 | * VM_MAYWRITE VM_SHARED !VM_WRITE vma, so ptrace could | ||
332 | * set a breakpoint in a read-only mapping of an | ||
333 | * executable, without corrupting the file (yet only | ||
334 | * when that file had been opened for writing!). | ||
335 | * Anon pages in shared mappings are surprising: now | ||
336 | * just reject it. | ||
337 | */ | ||
338 | if (!is_cow_mapping(vm_flags)) { | ||
339 | WARN_ON_ONCE(vm_flags & VM_MAYWRITE); | ||
340 | return -EFAULT; | ||
341 | } | ||
342 | } | ||
343 | } else if (!(vm_flags & VM_READ)) { | ||
344 | if (!(gup_flags & FOLL_FORCE)) | ||
345 | return -EFAULT; | ||
346 | /* | ||
347 | * Is there actually any vma we can reach here which does not | ||
348 | * have VM_MAYREAD set? | ||
349 | */ | ||
350 | if (!(vm_flags & VM_MAYREAD)) | ||
351 | return -EFAULT; | ||
352 | } | ||
353 | return 0; | ||
354 | } | ||
355 | |||
356 | /** | ||
357 | * __get_user_pages() - pin user pages in memory | ||
358 | * @tsk: task_struct of target task | ||
359 | * @mm: mm_struct of target mm | ||
360 | * @start: starting user address | ||
361 | * @nr_pages: number of pages from start to pin | ||
362 | * @gup_flags: flags modifying pin behaviour | ||
363 | * @pages: array that receives pointers to the pages pinned. | ||
364 | * Should be at least nr_pages long. Or NULL, if caller | ||
365 | * only intends to ensure the pages are faulted in. | ||
366 | * @vmas: array of pointers to vmas corresponding to each page. | ||
367 | * Or NULL if the caller does not require them. | ||
368 | * @nonblocking: whether waiting for disk IO or mmap_sem contention | ||
369 | * | ||
370 | * Returns number of pages pinned. This may be fewer than the number | ||
371 | * requested. If nr_pages is 0 or negative, returns 0. If no pages | ||
372 | * were pinned, returns -errno. Each page returned must be released | ||
373 | * with a put_page() call when it is finished with. vmas will only | ||
374 | * remain valid while mmap_sem is held. | ||
375 | * | ||
376 | * Must be called with mmap_sem held for read or write. | ||
377 | * | ||
378 | * __get_user_pages walks a process's page tables and takes a reference to | ||
379 | * each struct page that each user address corresponds to at a given | ||
380 | * instant. That is, it takes the page that would be accessed if a user | ||
381 | * thread accesses the given user virtual address at that instant. | ||
382 | * | ||
383 | * This does not guarantee that the page exists in the user mappings when | ||
384 | * __get_user_pages returns, and there may even be a completely different | ||
385 | * page there in some cases (eg. if mmapped pagecache has been invalidated | ||
386 | * and subsequently re faulted). However it does guarantee that the page | ||
387 | * won't be freed completely. And mostly callers simply care that the page | ||
388 | * contains data that was valid *at some point in time*. Typically, an IO | ||
389 | * or similar operation cannot guarantee anything stronger anyway because | ||
390 | * locks can't be held over the syscall boundary. | ||
391 | * | ||
392 | * If @gup_flags & FOLL_WRITE == 0, the page must not be written to. If | ||
393 | * the page is written to, set_page_dirty (or set_page_dirty_lock, as | ||
394 | * appropriate) must be called after the page is finished with, and | ||
395 | * before put_page is called. | ||
396 | * | ||
397 | * If @nonblocking != NULL, __get_user_pages will not wait for disk IO | ||
398 | * or mmap_sem contention, and if waiting is needed to pin all pages, | ||
399 | * *@nonblocking will be set to 0. | ||
400 | * | ||
401 | * In most cases, get_user_pages or get_user_pages_fast should be used | ||
402 | * instead of __get_user_pages. __get_user_pages should be used only if | ||
403 | * you need some special @gup_flags. | ||
404 | */ | ||
405 | long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | ||
406 | unsigned long start, unsigned long nr_pages, | ||
407 | unsigned int gup_flags, struct page **pages, | ||
408 | struct vm_area_struct **vmas, int *nonblocking) | ||
409 | { | ||
410 | long i = 0; | ||
411 | unsigned int page_mask; | ||
412 | struct vm_area_struct *vma = NULL; | ||
413 | |||
414 | if (!nr_pages) | ||
415 | return 0; | ||
416 | |||
417 | VM_BUG_ON(!!pages != !!(gup_flags & FOLL_GET)); | ||
418 | |||
419 | /* | ||
420 | * If FOLL_FORCE is set then do not force a full fault as the hinting | ||
421 | * fault information is unrelated to the reference behaviour of a task | ||
422 | * using the address space | ||
423 | */ | ||
424 | if (!(gup_flags & FOLL_FORCE)) | ||
425 | gup_flags |= FOLL_NUMA; | ||
426 | |||
427 | do { | ||
428 | struct page *page; | ||
429 | unsigned int foll_flags = gup_flags; | ||
430 | unsigned int page_increm; | ||
431 | |||
432 | /* first iteration or cross vma bound */ | ||
433 | if (!vma || start >= vma->vm_end) { | ||
434 | vma = find_extend_vma(mm, start); | ||
435 | if (!vma && in_gate_area(mm, start)) { | ||
436 | int ret; | ||
437 | ret = get_gate_page(mm, start & PAGE_MASK, | ||
438 | gup_flags, &vma, | ||
439 | pages ? &pages[i] : NULL); | ||
440 | if (ret) | ||
441 | return i ? : ret; | ||
442 | page_mask = 0; | ||
443 | goto next_page; | ||
444 | } | ||
445 | |||
446 | if (!vma || check_vma_flags(vma, gup_flags)) | ||
447 | return i ? : -EFAULT; | ||
448 | if (is_vm_hugetlb_page(vma)) { | ||
449 | i = follow_hugetlb_page(mm, vma, pages, vmas, | ||
450 | &start, &nr_pages, i, | ||
451 | gup_flags); | ||
452 | continue; | ||
453 | } | ||
454 | } | ||
455 | retry: | ||
456 | /* | ||
457 | * If we have a pending SIGKILL, don't keep faulting pages and | ||
458 | * potentially allocating memory. | ||
459 | */ | ||
460 | if (unlikely(fatal_signal_pending(current))) | ||
461 | return i ? i : -ERESTARTSYS; | ||
462 | cond_resched(); | ||
463 | page = follow_page_mask(vma, start, foll_flags, &page_mask); | ||
464 | if (!page) { | ||
465 | int ret; | ||
466 | ret = faultin_page(tsk, vma, start, &foll_flags, | ||
467 | nonblocking); | ||
468 | switch (ret) { | ||
469 | case 0: | ||
470 | goto retry; | ||
471 | case -EFAULT: | ||
472 | case -ENOMEM: | ||
473 | case -EHWPOISON: | ||
474 | return i ? i : ret; | ||
475 | case -EBUSY: | ||
476 | return i; | ||
477 | case -ENOENT: | ||
478 | goto next_page; | ||
479 | } | ||
480 | BUG(); | ||
481 | } | ||
482 | if (IS_ERR(page)) | ||
483 | return i ? i : PTR_ERR(page); | ||
484 | if (pages) { | ||
485 | pages[i] = page; | ||
486 | flush_anon_page(vma, page, start); | ||
487 | flush_dcache_page(page); | ||
488 | page_mask = 0; | ||
489 | } | ||
490 | next_page: | ||
491 | if (vmas) { | ||
492 | vmas[i] = vma; | ||
493 | page_mask = 0; | ||
494 | } | ||
495 | page_increm = 1 + (~(start >> PAGE_SHIFT) & page_mask); | ||
496 | if (page_increm > nr_pages) | ||
497 | page_increm = nr_pages; | ||
498 | i += page_increm; | ||
499 | start += page_increm * PAGE_SIZE; | ||
500 | nr_pages -= page_increm; | ||
501 | } while (nr_pages); | ||
502 | return i; | ||
503 | } | ||
504 | EXPORT_SYMBOL(__get_user_pages); | ||
505 | |||
506 | /* | ||
507 | * fixup_user_fault() - manually resolve a user page fault | ||
508 | * @tsk: the task_struct to use for page fault accounting, or | ||
509 | * NULL if faults are not to be recorded. | ||
510 | * @mm: mm_struct of target mm | ||
511 | * @address: user address | ||
512 | * @fault_flags:flags to pass down to handle_mm_fault() | ||
513 | * | ||
514 | * This is meant to be called in the specific scenario where for locking reasons | ||
515 | * we try to access user memory in atomic context (within a pagefault_disable() | ||
516 | * section), this returns -EFAULT, and we want to resolve the user fault before | ||
517 | * trying again. | ||
518 | * | ||
519 | * Typically this is meant to be used by the futex code. | ||
520 | * | ||
521 | * The main difference with get_user_pages() is that this function will | ||
522 | * unconditionally call handle_mm_fault() which will in turn perform all the | ||
523 | * necessary SW fixup of the dirty and young bits in the PTE, while | ||
524 | * get_user_pages() only guarantees to update these in the struct page. | ||
525 | * | ||
526 | * This is important for some architectures where those bits also gate the | ||
527 | * access permission to the page because they are maintained in software. On | ||
528 | * such architectures, gup() will not be enough to make a subsequent access | ||
529 | * succeed. | ||
530 | * | ||
531 | * This should be called with the mmap_sem held for read. | ||
532 | */ | ||
533 | int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm, | ||
534 | unsigned long address, unsigned int fault_flags) | ||
535 | { | ||
536 | struct vm_area_struct *vma; | ||
537 | vm_flags_t vm_flags; | ||
538 | int ret; | ||
539 | |||
540 | vma = find_extend_vma(mm, address); | ||
541 | if (!vma || address < vma->vm_start) | ||
542 | return -EFAULT; | ||
543 | |||
544 | vm_flags = (fault_flags & FAULT_FLAG_WRITE) ? VM_WRITE : VM_READ; | ||
545 | if (!(vm_flags & vma->vm_flags)) | ||
546 | return -EFAULT; | ||
547 | |||
548 | ret = handle_mm_fault(mm, vma, address, fault_flags); | ||
549 | if (ret & VM_FAULT_ERROR) { | ||
550 | if (ret & VM_FAULT_OOM) | ||
551 | return -ENOMEM; | ||
552 | if (ret & (VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE)) | ||
553 | return -EHWPOISON; | ||
554 | if (ret & VM_FAULT_SIGBUS) | ||
555 | return -EFAULT; | ||
556 | BUG(); | ||
557 | } | ||
558 | if (tsk) { | ||
559 | if (ret & VM_FAULT_MAJOR) | ||
560 | tsk->maj_flt++; | ||
561 | else | ||
562 | tsk->min_flt++; | ||
563 | } | ||
564 | return 0; | ||
565 | } | ||
566 | |||
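The intended pattern (a simplified, futex-style sketch; the function name is hypothetical) is to attempt the access with page faults disabled and fall back to fixup_user_fault() before retrying:

	#include <linux/uaccess.h>
	#include <linux/sched.h>
	#include <linux/mm.h>

	static int demo_get_user_u32(u32 __user *uaddr, u32 *val)
	{
		struct mm_struct *mm = current->mm;
		int err;

		for (;;) {
			pagefault_disable();
			err = __copy_from_user_inatomic(val, uaddr, sizeof(*val));
			pagefault_enable();
			if (!err)
				return 0;

			/* Fault the page in (and fix up young/dirty in the pte),
			 * then retry the atomic access. */
			down_read(&mm->mmap_sem);
			err = fixup_user_fault(current, mm, (unsigned long)uaddr, 0);
			up_read(&mm->mmap_sem);
			if (err)
				return err;
		}
	}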
567 | /* | ||
568 | * get_user_pages() - pin user pages in memory | ||
569 | * @tsk: the task_struct to use for page fault accounting, or | ||
570 | * NULL if faults are not to be recorded. | ||
571 | * @mm: mm_struct of target mm | ||
572 | * @start: starting user address | ||
573 | * @nr_pages: number of pages from start to pin | ||
574 | * @write: whether pages will be written to by the caller | ||
575 | * @force: whether to force access even when user mapping is currently | ||
576 | * protected (but never forces write access to shared mapping). | ||
577 | * @pages: array that receives pointers to the pages pinned. | ||
578 | * Should be at least nr_pages long. Or NULL, if caller | ||
579 | * only intends to ensure the pages are faulted in. | ||
580 | * @vmas: array of pointers to vmas corresponding to each page. | ||
581 | * Or NULL if the caller does not require them. | ||
582 | * | ||
583 | * Returns number of pages pinned. This may be fewer than the number | ||
584 | * requested. If nr_pages is 0 or negative, returns 0. If no pages | ||
585 | * were pinned, returns -errno. Each page returned must be released | ||
586 | * with a put_page() call when it is finished with. vmas will only | ||
587 | * remain valid while mmap_sem is held. | ||
588 | * | ||
589 | * Must be called with mmap_sem held for read or write. | ||
590 | * | ||
591 | * get_user_pages walks a process's page tables and takes a reference to | ||
592 | * each struct page that each user address corresponds to at a given | ||
593 | * instant. That is, it takes the page that would be accessed if a user | ||
594 | * thread accesses the given user virtual address at that instant. | ||
595 | * | ||
596 | * This does not guarantee that the page exists in the user mappings when | ||
597 | * get_user_pages returns, and there may even be a completely different | ||
598 | * page there in some cases (eg. if mmapped pagecache has been invalidated | ||
599 | * and subsequently re faulted). However it does guarantee that the page | ||
600 | * won't be freed completely. And mostly callers simply care that the page | ||
601 | * contains data that was valid *at some point in time*. Typically, an IO | ||
602 | * or similar operation cannot guarantee anything stronger anyway because | ||
603 | * locks can't be held over the syscall boundary. | ||
604 | * | ||
605 | * If write=0, the page must not be written to. If the page is written to, | ||
606 | * set_page_dirty (or set_page_dirty_lock, as appropriate) must be called | ||
607 | * after the page is finished with, and before put_page is called. | ||
608 | * | ||
609 | * get_user_pages is typically used for fewer-copy IO operations, to get a | ||
610 | * handle on the memory by some means other than accesses via the user virtual | ||
611 | * addresses. The pages may be submitted for DMA to devices or accessed via | ||
612 | * their kernel linear mapping (via the kmap APIs). Care should be taken to | ||
613 | * use the correct cache flushing APIs. | ||
614 | * | ||
615 | * See also get_user_pages_fast, for performance critical applications. | ||
616 | */ | ||
617 | long get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | ||
618 | unsigned long start, unsigned long nr_pages, int write, | ||
619 | int force, struct page **pages, struct vm_area_struct **vmas) | ||
620 | { | ||
621 | int flags = FOLL_TOUCH; | ||
622 | |||
623 | if (pages) | ||
624 | flags |= FOLL_GET; | ||
625 | if (write) | ||
626 | flags |= FOLL_WRITE; | ||
627 | if (force) | ||
628 | flags |= FOLL_FORCE; | ||
629 | |||
630 | return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas, | ||
631 | NULL); | ||
632 | } | ||
633 | EXPORT_SYMBOL(get_user_pages); | ||
634 | |||
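A typical driver-style caller (hedged sketch, not from this patch) pins with mmap_sem held for read, uses the pages, then dirties and releases them as the comment above requires:

	static int demo_pin_and_dirty(unsigned long start, unsigned long nr_pages,
				      struct page **pages)
	{
		long i, pinned;

		down_read(&current->mm->mmap_sem);
		pinned = get_user_pages(current, current->mm, start, nr_pages,
					1 /* write */, 0 /* force */, pages, NULL);
		up_read(&current->mm->mmap_sem);
		if (pinned <= 0)
			return pinned ? (int)pinned : -EFAULT;

		/* ... submit pages[0..pinned-1] for DMA or access them via kmap() ... */

		for (i = 0; i < pinned; i++) {
			set_page_dirty_lock(pages[i]);	/* we asked for write access */
			put_page(pages[i]);
		}
		return (unsigned long)pinned == nr_pages ? 0 : -EFAULT;
	}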
635 | /** | ||
636 | * get_dump_page() - pin user page in memory while writing it to core dump | ||
637 | * @addr: user address | ||
638 | * | ||
639 | * Returns struct page pointer of user page pinned for dump, | ||
640 | * to be freed afterwards by page_cache_release() or put_page(). | ||
641 | * | ||
642 | * Returns NULL on any kind of failure - a hole must then be inserted into | ||
643 | * the corefile, to preserve alignment with its headers; and also returns | ||
644 | * NULL wherever the ZERO_PAGE, or an anonymous pte_none, has been found - | ||
645 | * allowing a hole to be left in the corefile to save diskspace. | ||
646 | * | ||
647 | * Called without mmap_sem, but after all other threads have been killed. | ||
648 | */ | ||
649 | #ifdef CONFIG_ELF_CORE | ||
650 | struct page *get_dump_page(unsigned long addr) | ||
651 | { | ||
652 | struct vm_area_struct *vma; | ||
653 | struct page *page; | ||
654 | |||
655 | if (__get_user_pages(current, current->mm, addr, 1, | ||
656 | FOLL_FORCE | FOLL_DUMP | FOLL_GET, &page, &vma, | ||
657 | NULL) < 1) | ||
658 | return NULL; | ||
659 | flush_cache_page(vma, addr, page_to_pfn(page)); | ||
660 | return page; | ||
661 | } | ||
662 | #endif /* CONFIG_ELF_CORE */ | ||
diff --git a/mm/huge_memory.c b/mm/huge_memory.c index b4b1feba6472..e60837dc785c 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c | |||
@@ -5,6 +5,8 @@ | |||
5 | * the COPYING file in the top-level directory. | 5 | * the COPYING file in the top-level directory. |
6 | */ | 6 | */ |
7 | 7 | ||
8 | #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt | ||
9 | |||
8 | #include <linux/mm.h> | 10 | #include <linux/mm.h> |
9 | #include <linux/sched.h> | 11 | #include <linux/sched.h> |
10 | #include <linux/highmem.h> | 12 | #include <linux/highmem.h> |
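The pr_fmt() definition above is what lets the later hunks drop their hand-written "hugepage: " prefixes: pr_err()/pr_warn() expand their format through pr_fmt(), so (approximately, assuming KBUILD_MODNAME is "huge_memory" for this file):

	pr_err("failed to register transparent hugepage group\n");
	/* effectively becomes */
	printk(KERN_ERR "huge_memory" ": " "failed to register transparent hugepage group\n");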
@@ -151,8 +153,7 @@ static int start_khugepaged(void) | |||
151 | khugepaged_thread = kthread_run(khugepaged, NULL, | 153 | khugepaged_thread = kthread_run(khugepaged, NULL, |
152 | "khugepaged"); | 154 | "khugepaged"); |
153 | if (unlikely(IS_ERR(khugepaged_thread))) { | 155 | if (unlikely(IS_ERR(khugepaged_thread))) { |
154 | printk(KERN_ERR | 156 | pr_err("khugepaged: kthread_run(khugepaged) failed\n"); |
155 | "khugepaged: kthread_run(khugepaged) failed\n"); | ||
156 | err = PTR_ERR(khugepaged_thread); | 157 | err = PTR_ERR(khugepaged_thread); |
157 | khugepaged_thread = NULL; | 158 | khugepaged_thread = NULL; |
158 | } | 159 | } |
@@ -584,19 +585,19 @@ static int __init hugepage_init_sysfs(struct kobject **hugepage_kobj) | |||
584 | 585 | ||
585 | *hugepage_kobj = kobject_create_and_add("transparent_hugepage", mm_kobj); | 586 | *hugepage_kobj = kobject_create_and_add("transparent_hugepage", mm_kobj); |
586 | if (unlikely(!*hugepage_kobj)) { | 587 | if (unlikely(!*hugepage_kobj)) { |
587 | printk(KERN_ERR "hugepage: failed to create transparent hugepage kobject\n"); | 588 | pr_err("failed to create transparent hugepage kobject\n"); |
588 | return -ENOMEM; | 589 | return -ENOMEM; |
589 | } | 590 | } |
590 | 591 | ||
591 | err = sysfs_create_group(*hugepage_kobj, &hugepage_attr_group); | 592 | err = sysfs_create_group(*hugepage_kobj, &hugepage_attr_group); |
592 | if (err) { | 593 | if (err) { |
593 | printk(KERN_ERR "hugepage: failed to register transparent hugepage group\n"); | 594 | pr_err("failed to register transparent hugepage group\n"); |
594 | goto delete_obj; | 595 | goto delete_obj; |
595 | } | 596 | } |
596 | 597 | ||
597 | err = sysfs_create_group(*hugepage_kobj, &khugepaged_attr_group); | 598 | err = sysfs_create_group(*hugepage_kobj, &khugepaged_attr_group); |
598 | if (err) { | 599 | if (err) { |
599 | printk(KERN_ERR "hugepage: failed to register transparent hugepage group\n"); | 600 | pr_err("failed to register transparent hugepage group\n"); |
600 | goto remove_hp_group; | 601 | goto remove_hp_group; |
601 | } | 602 | } |
602 | 603 | ||
@@ -689,8 +690,7 @@ static int __init setup_transparent_hugepage(char *str) | |||
689 | } | 690 | } |
690 | out: | 691 | out: |
691 | if (!ret) | 692 | if (!ret) |
692 | printk(KERN_WARNING | 693 | pr_warn("transparent_hugepage= cannot parse, ignored\n"); |
693 | "transparent_hugepage= cannot parse, ignored\n"); | ||
694 | return ret; | 694 | return ret; |
695 | } | 695 | } |
696 | __setup("transparent_hugepage=", setup_transparent_hugepage); | 696 | __setup("transparent_hugepage=", setup_transparent_hugepage); |
@@ -1830,10 +1830,11 @@ static void __split_huge_page(struct page *page, | |||
1830 | * the newly established pmd of the child later during the | 1830 | * the newly established pmd of the child later during the |
1831 | * walk, to be able to set it as pmd_trans_splitting too. | 1831 | * walk, to be able to set it as pmd_trans_splitting too. |
1832 | */ | 1832 | */ |
1833 | if (mapcount != page_mapcount(page)) | 1833 | if (mapcount != page_mapcount(page)) { |
1834 | printk(KERN_ERR "mapcount %d page_mapcount %d\n", | 1834 | pr_err("mapcount %d page_mapcount %d\n", |
1835 | mapcount, page_mapcount(page)); | 1835 | mapcount, page_mapcount(page)); |
1836 | BUG_ON(mapcount != page_mapcount(page)); | 1836 | BUG(); |
1837 | } | ||
1837 | 1838 | ||
1838 | __split_huge_page_refcount(page, list); | 1839 | __split_huge_page_refcount(page, list); |
1839 | 1840 | ||
@@ -1844,10 +1845,11 @@ static void __split_huge_page(struct page *page, | |||
1844 | BUG_ON(is_vma_temporary_stack(vma)); | 1845 | BUG_ON(is_vma_temporary_stack(vma)); |
1845 | mapcount2 += __split_huge_page_map(page, vma, addr); | 1846 | mapcount2 += __split_huge_page_map(page, vma, addr); |
1846 | } | 1847 | } |
1847 | if (mapcount != mapcount2) | 1848 | if (mapcount != mapcount2) { |
1848 | printk(KERN_ERR "mapcount %d mapcount2 %d page_mapcount %d\n", | 1849 | pr_err("mapcount %d mapcount2 %d page_mapcount %d\n", |
1849 | mapcount, mapcount2, page_mapcount(page)); | 1850 | mapcount, mapcount2, page_mapcount(page)); |
1850 | BUG_ON(mapcount != mapcount2); | 1851 | BUG(); |
1852 | } | ||
1851 | } | 1853 | } |
1852 | 1854 | ||
1853 | /* | 1855 | /* |
@@ -2740,7 +2742,7 @@ static int khugepaged(void *none) | |||
2740 | struct mm_slot *mm_slot; | 2742 | struct mm_slot *mm_slot; |
2741 | 2743 | ||
2742 | set_freezable(); | 2744 | set_freezable(); |
2743 | set_user_nice(current, 19); | 2745 | set_user_nice(current, MAX_NICE); |
2744 | 2746 | ||
2745 | while (!kthread_should_stop()) { | 2747 | while (!kthread_should_stop()) { |
2746 | khugepaged_do_scan(); | 2748 | khugepaged_do_scan(); |
diff --git a/mm/hugetlb.c b/mm/hugetlb.c index c82290b9c1fc..226910cb7c9b 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c | |||
@@ -544,7 +544,7 @@ static struct page *dequeue_huge_page_node(struct hstate *h, int nid) | |||
544 | /* Movability of hugepages depends on migration support. */ | 544 | /* Movability of hugepages depends on migration support. */ |
545 | static inline gfp_t htlb_alloc_mask(struct hstate *h) | 545 | static inline gfp_t htlb_alloc_mask(struct hstate *h) |
546 | { | 546 | { |
547 | if (hugepages_treat_as_movable || hugepage_migration_support(h)) | 547 | if (hugepages_treat_as_movable || hugepage_migration_supported(h)) |
548 | return GFP_HIGHUSER_MOVABLE; | 548 | return GFP_HIGHUSER_MOVABLE; |
549 | else | 549 | else |
550 | return GFP_HIGHUSER; | 550 | return GFP_HIGHUSER; |
@@ -607,25 +607,242 @@ err: | |||
607 | return NULL; | 607 | return NULL; |
608 | } | 608 | } |
609 | 609 | ||
610 | /* | ||
611 | * common helper functions for hstate_next_node_to_{alloc|free}. | ||
612 | * We may have allocated or freed a huge page based on a different | ||
613 | * nodes_allowed previously, so h->next_node_to_{alloc|free} might | ||
614 | * be outside of *nodes_allowed. Ensure that we use an allowed | ||
615 | * node for alloc or free. | ||
616 | */ | ||
617 | static int next_node_allowed(int nid, nodemask_t *nodes_allowed) | ||
618 | { | ||
619 | nid = next_node(nid, *nodes_allowed); | ||
620 | if (nid == MAX_NUMNODES) | ||
621 | nid = first_node(*nodes_allowed); | ||
622 | VM_BUG_ON(nid >= MAX_NUMNODES); | ||
623 | |||
624 | return nid; | ||
625 | } | ||
626 | |||
627 | static int get_valid_node_allowed(int nid, nodemask_t *nodes_allowed) | ||
628 | { | ||
629 | if (!node_isset(nid, *nodes_allowed)) | ||
630 | nid = next_node_allowed(nid, nodes_allowed); | ||
631 | return nid; | ||
632 | } | ||
633 | |||
634 | /* | ||
635 | * returns the previously saved node ["this node"] from which to | ||
636 | * allocate a persistent huge page for the pool and advance the | ||
637 | * next node from which to allocate, handling wrap at end of node | ||
638 | * mask. | ||
639 | */ | ||
640 | static int hstate_next_node_to_alloc(struct hstate *h, | ||
641 | nodemask_t *nodes_allowed) | ||
642 | { | ||
643 | int nid; | ||
644 | |||
645 | VM_BUG_ON(!nodes_allowed); | ||
646 | |||
647 | nid = get_valid_node_allowed(h->next_nid_to_alloc, nodes_allowed); | ||
648 | h->next_nid_to_alloc = next_node_allowed(nid, nodes_allowed); | ||
649 | |||
650 | return nid; | ||
651 | } | ||
652 | |||
653 | /* | ||
654 | * helper for free_pool_huge_page() - return the previously saved | ||
655 | * node ["this node"] from which to free a huge page. Advance the | ||
656 | * next node id whether or not we find a free huge page to free so | ||
657 | * that the next attempt to free addresses the next node. | ||
658 | */ | ||
659 | static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed) | ||
660 | { | ||
661 | int nid; | ||
662 | |||
663 | VM_BUG_ON(!nodes_allowed); | ||
664 | |||
665 | nid = get_valid_node_allowed(h->next_nid_to_free, nodes_allowed); | ||
666 | h->next_nid_to_free = next_node_allowed(nid, nodes_allowed); | ||
667 | |||
668 | return nid; | ||
669 | } | ||
670 | |||
671 | #define for_each_node_mask_to_alloc(hs, nr_nodes, node, mask) \ | ||
672 | for (nr_nodes = nodes_weight(*mask); \ | ||
673 | nr_nodes > 0 && \ | ||
674 | ((node = hstate_next_node_to_alloc(hs, mask)) || 1); \ | ||
675 | nr_nodes--) | ||
676 | |||
677 | #define for_each_node_mask_to_free(hs, nr_nodes, node, mask) \ | ||
678 | for (nr_nodes = nodes_weight(*mask); \ | ||
679 | nr_nodes > 0 && \ | ||
680 | ((node = hstate_next_node_to_free(hs, mask)) || 1); \ | ||
681 | nr_nodes--) | ||
682 | |||
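The "|| 1" in both macros above keeps the loop condition true even when the selected node id happens to be 0; only nr_nodes terminates the walk. A simplified caller (sketch only, modelled loosely on the free path inside mm/hugetlb.c) round-robins across the allowed nodes like this:

	static int demo_pick_node_with_free_page(struct hstate *h,
						 nodemask_t *nodes_allowed)
	{
		int nr_nodes, node;

		for_each_node_mask_to_free(h, nr_nodes, node, nodes_allowed) {
			/* stop at the first allowed node that still has a free huge page */
			if (!list_empty(&h->hugepage_freelists[node]))
				return node;
		}
		return NUMA_NO_NODE;
	}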
683 | #if defined(CONFIG_CMA) && defined(CONFIG_X86_64) | ||
684 | static void destroy_compound_gigantic_page(struct page *page, | ||
685 | unsigned long order) | ||
686 | { | ||
687 | int i; | ||
688 | int nr_pages = 1 << order; | ||
689 | struct page *p = page + 1; | ||
690 | |||
691 | for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) { | ||
692 | __ClearPageTail(p); | ||
693 | set_page_refcounted(p); | ||
694 | p->first_page = NULL; | ||
695 | } | ||
696 | |||
697 | set_compound_order(page, 0); | ||
698 | __ClearPageHead(page); | ||
699 | } | ||
700 | |||
701 | static void free_gigantic_page(struct page *page, unsigned order) | ||
702 | { | ||
703 | free_contig_range(page_to_pfn(page), 1 << order); | ||
704 | } | ||
705 | |||
706 | static int __alloc_gigantic_page(unsigned long start_pfn, | ||
707 | unsigned long nr_pages) | ||
708 | { | ||
709 | unsigned long end_pfn = start_pfn + nr_pages; | ||
710 | return alloc_contig_range(start_pfn, end_pfn, MIGRATE_MOVABLE); | ||
711 | } | ||
712 | |||
713 | static bool pfn_range_valid_gigantic(unsigned long start_pfn, | ||
714 | unsigned long nr_pages) | ||
715 | { | ||
716 | unsigned long i, end_pfn = start_pfn + nr_pages; | ||
717 | struct page *page; | ||
718 | |||
719 | for (i = start_pfn; i < end_pfn; i++) { | ||
720 | if (!pfn_valid(i)) | ||
721 | return false; | ||
722 | |||
723 | page = pfn_to_page(i); | ||
724 | |||
725 | if (PageReserved(page)) | ||
726 | return false; | ||
727 | |||
728 | if (page_count(page) > 0) | ||
729 | return false; | ||
730 | |||
731 | if (PageHuge(page)) | ||
732 | return false; | ||
733 | } | ||
734 | |||
735 | return true; | ||
736 | } | ||
737 | |||
738 | static bool zone_spans_last_pfn(const struct zone *zone, | ||
739 | unsigned long start_pfn, unsigned long nr_pages) | ||
740 | { | ||
741 | unsigned long last_pfn = start_pfn + nr_pages - 1; | ||
742 | return zone_spans_pfn(zone, last_pfn); | ||
743 | } | ||
744 | |||
745 | static struct page *alloc_gigantic_page(int nid, unsigned order) | ||
746 | { | ||
747 | unsigned long nr_pages = 1 << order; | ||
748 | unsigned long ret, pfn, flags; | ||
749 | struct zone *z; | ||
750 | |||
751 | z = NODE_DATA(nid)->node_zones; | ||
752 | for (; z - NODE_DATA(nid)->node_zones < MAX_NR_ZONES; z++) { | ||
753 | spin_lock_irqsave(&z->lock, flags); | ||
754 | |||
755 | pfn = ALIGN(z->zone_start_pfn, nr_pages); | ||
756 | while (zone_spans_last_pfn(z, pfn, nr_pages)) { | ||
757 | if (pfn_range_valid_gigantic(pfn, nr_pages)) { | ||
758 | /* | ||
759 | * We release the zone lock here because | ||
760 | * alloc_contig_range() will also lock the zone | ||
761 | * at some point. If there's an allocation | ||
762 | * spinning on this lock, it may win the race | ||
763 | * and cause alloc_contig_range() to fail... | ||
764 | */ | ||
765 | spin_unlock_irqrestore(&z->lock, flags); | ||
766 | ret = __alloc_gigantic_page(pfn, nr_pages); | ||
767 | if (!ret) | ||
768 | return pfn_to_page(pfn); | ||
769 | spin_lock_irqsave(&z->lock, flags); | ||
770 | } | ||
771 | pfn += nr_pages; | ||
772 | } | ||
773 | |||
774 | spin_unlock_irqrestore(&z->lock, flags); | ||
775 | } | ||
776 | |||
777 | return NULL; | ||
778 | } | ||
779 | |||
780 | static void prep_new_huge_page(struct hstate *h, struct page *page, int nid); | ||
781 | static void prep_compound_gigantic_page(struct page *page, unsigned long order); | ||
782 | |||
783 | static struct page *alloc_fresh_gigantic_page_node(struct hstate *h, int nid) | ||
784 | { | ||
785 | struct page *page; | ||
786 | |||
787 | page = alloc_gigantic_page(nid, huge_page_order(h)); | ||
788 | if (page) { | ||
789 | prep_compound_gigantic_page(page, huge_page_order(h)); | ||
790 | prep_new_huge_page(h, page, nid); | ||
791 | } | ||
792 | |||
793 | return page; | ||
794 | } | ||
795 | |||
796 | static int alloc_fresh_gigantic_page(struct hstate *h, | ||
797 | nodemask_t *nodes_allowed) | ||
798 | { | ||
799 | struct page *page = NULL; | ||
800 | int nr_nodes, node; | ||
801 | |||
802 | for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) { | ||
803 | page = alloc_fresh_gigantic_page_node(h, node); | ||
804 | if (page) | ||
805 | return 1; | ||
806 | } | ||
807 | |||
808 | return 0; | ||
809 | } | ||
810 | |||
811 | static inline bool gigantic_page_supported(void) { return true; } | ||
812 | #else | ||
813 | static inline bool gigantic_page_supported(void) { return false; } | ||
814 | static inline void free_gigantic_page(struct page *page, unsigned order) { } | ||
815 | static inline void destroy_compound_gigantic_page(struct page *page, | ||
816 | unsigned long order) { } | ||
817 | static inline int alloc_fresh_gigantic_page(struct hstate *h, | ||
818 | nodemask_t *nodes_allowed) { return 0; } | ||
819 | #endif | ||
820 | |||
610 | static void update_and_free_page(struct hstate *h, struct page *page) | 821 | static void update_and_free_page(struct hstate *h, struct page *page) |
611 | { | 822 | { |
612 | int i; | 823 | int i; |
613 | 824 | ||
614 | VM_BUG_ON(h->order >= MAX_ORDER); | 825 | if (hstate_is_gigantic(h) && !gigantic_page_supported()) |
826 | return; | ||
615 | 827 | ||
616 | h->nr_huge_pages--; | 828 | h->nr_huge_pages--; |
617 | h->nr_huge_pages_node[page_to_nid(page)]--; | 829 | h->nr_huge_pages_node[page_to_nid(page)]--; |
618 | for (i = 0; i < pages_per_huge_page(h); i++) { | 830 | for (i = 0; i < pages_per_huge_page(h); i++) { |
619 | page[i].flags &= ~(1 << PG_locked | 1 << PG_error | | 831 | page[i].flags &= ~(1 << PG_locked | 1 << PG_error | |
620 | 1 << PG_referenced | 1 << PG_dirty | | 832 | 1 << PG_referenced | 1 << PG_dirty | |
621 | 1 << PG_active | 1 << PG_reserved | | 833 | 1 << PG_active | 1 << PG_private | |
622 | 1 << PG_private | 1 << PG_writeback); | 834 | 1 << PG_writeback); |
623 | } | 835 | } |
624 | VM_BUG_ON_PAGE(hugetlb_cgroup_from_page(page), page); | 836 | VM_BUG_ON_PAGE(hugetlb_cgroup_from_page(page), page); |
625 | set_compound_page_dtor(page, NULL); | 837 | set_compound_page_dtor(page, NULL); |
626 | set_page_refcounted(page); | 838 | set_page_refcounted(page); |
627 | arch_release_hugepage(page); | 839 | if (hstate_is_gigantic(h)) { |
628 | __free_pages(page, huge_page_order(h)); | 840 | destroy_compound_gigantic_page(page, huge_page_order(h)); |
841 | free_gigantic_page(page, huge_page_order(h)); | ||
842 | } else { | ||
843 | arch_release_hugepage(page); | ||
844 | __free_pages(page, huge_page_order(h)); | ||
845 | } | ||
629 | } | 846 | } |
630 | 847 | ||
631 | struct hstate *size_to_hstate(unsigned long size) | 848 | struct hstate *size_to_hstate(unsigned long size) |
@@ -664,7 +881,7 @@ static void free_huge_page(struct page *page) | |||
664 | if (restore_reserve) | 881 | if (restore_reserve) |
665 | h->resv_huge_pages++; | 882 | h->resv_huge_pages++; |
666 | 883 | ||
667 | if (h->surplus_huge_pages_node[nid] && huge_page_order(h) < MAX_ORDER) { | 884 | if (h->surplus_huge_pages_node[nid]) { |
668 | /* remove the page from active list */ | 885 | /* remove the page from active list */ |
669 | list_del(&page->lru); | 886 | list_del(&page->lru); |
670 | update_and_free_page(h, page); | 887 | update_and_free_page(h, page); |
@@ -690,8 +907,7 @@ static void prep_new_huge_page(struct hstate *h, struct page *page, int nid) | |||
690 | put_page(page); /* free it into the hugepage allocator */ | 907 | put_page(page); /* free it into the hugepage allocator */ |
691 | } | 908 | } |
692 | 909 | ||
693 | static void __init prep_compound_gigantic_page(struct page *page, | 910 | static void prep_compound_gigantic_page(struct page *page, unsigned long order) |
694 | unsigned long order) | ||
695 | { | 911 | { |
696 | int i; | 912 | int i; |
697 | int nr_pages = 1 << order; | 913 | int nr_pages = 1 << order; |
@@ -769,9 +985,6 @@ static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid) | |||
769 | { | 985 | { |
770 | struct page *page; | 986 | struct page *page; |
771 | 987 | ||
772 | if (h->order >= MAX_ORDER) | ||
773 | return NULL; | ||
774 | |||
775 | page = alloc_pages_exact_node(nid, | 988 | page = alloc_pages_exact_node(nid, |
776 | htlb_alloc_mask(h)|__GFP_COMP|__GFP_THISNODE| | 989 | htlb_alloc_mask(h)|__GFP_COMP|__GFP_THISNODE| |
777 | __GFP_REPEAT|__GFP_NOWARN, | 990 | __GFP_REPEAT|__GFP_NOWARN, |
@@ -787,79 +1000,6 @@ static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid) | |||
787 | return page; | 1000 | return page; |
788 | } | 1001 | } |
789 | 1002 | ||
790 | /* | ||
791 | * common helper functions for hstate_next_node_to_{alloc|free}. | ||
792 | * We may have allocated or freed a huge page based on a different | ||
793 | * nodes_allowed previously, so h->next_node_to_{alloc|free} might | ||
794 | * be outside of *nodes_allowed. Ensure that we use an allowed | ||
795 | * node for alloc or free. | ||
796 | */ | ||
797 | static int next_node_allowed(int nid, nodemask_t *nodes_allowed) | ||
798 | { | ||
799 | nid = next_node(nid, *nodes_allowed); | ||
800 | if (nid == MAX_NUMNODES) | ||
801 | nid = first_node(*nodes_allowed); | ||
802 | VM_BUG_ON(nid >= MAX_NUMNODES); | ||
803 | |||
804 | return nid; | ||
805 | } | ||
806 | |||
807 | static int get_valid_node_allowed(int nid, nodemask_t *nodes_allowed) | ||
808 | { | ||
809 | if (!node_isset(nid, *nodes_allowed)) | ||
810 | nid = next_node_allowed(nid, nodes_allowed); | ||
811 | return nid; | ||
812 | } | ||
813 | |||
814 | /* | ||
815 | * returns the previously saved node ["this node"] from which to | ||
816 | * allocate a persistent huge page for the pool and advance the | ||
817 | * next node from which to allocate, handling wrap at end of node | ||
818 | * mask. | ||
819 | */ | ||
820 | static int hstate_next_node_to_alloc(struct hstate *h, | ||
821 | nodemask_t *nodes_allowed) | ||
822 | { | ||
823 | int nid; | ||
824 | |||
825 | VM_BUG_ON(!nodes_allowed); | ||
826 | |||
827 | nid = get_valid_node_allowed(h->next_nid_to_alloc, nodes_allowed); | ||
828 | h->next_nid_to_alloc = next_node_allowed(nid, nodes_allowed); | ||
829 | |||
830 | return nid; | ||
831 | } | ||
832 | |||
833 | /* | ||
834 | * helper for free_pool_huge_page() - return the previously saved | ||
835 | * node ["this node"] from which to free a huge page. Advance the | ||
836 | * next node id whether or not we find a free huge page to free so | ||
837 | * that the next attempt to free addresses the next node. | ||
838 | */ | ||
839 | static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed) | ||
840 | { | ||
841 | int nid; | ||
842 | |||
843 | VM_BUG_ON(!nodes_allowed); | ||
844 | |||
845 | nid = get_valid_node_allowed(h->next_nid_to_free, nodes_allowed); | ||
846 | h->next_nid_to_free = next_node_allowed(nid, nodes_allowed); | ||
847 | |||
848 | return nid; | ||
849 | } | ||
850 | |||
851 | #define for_each_node_mask_to_alloc(hs, nr_nodes, node, mask) \ | ||
852 | for (nr_nodes = nodes_weight(*mask); \ | ||
853 | nr_nodes > 0 && \ | ||
854 | ((node = hstate_next_node_to_alloc(hs, mask)) || 1); \ | ||
855 | nr_nodes--) | ||
856 | |||
857 | #define for_each_node_mask_to_free(hs, nr_nodes, node, mask) \ | ||
858 | for (nr_nodes = nodes_weight(*mask); \ | ||
859 | nr_nodes > 0 && \ | ||
860 | ((node = hstate_next_node_to_free(hs, mask)) || 1); \ | ||
861 | nr_nodes--) | ||
862 | |||
863 | static int alloc_fresh_huge_page(struct hstate *h, nodemask_t *nodes_allowed) | 1003 | static int alloc_fresh_huge_page(struct hstate *h, nodemask_t *nodes_allowed) |
864 | { | 1004 | { |
865 | struct page *page; | 1005 | struct page *page; |
@@ -963,7 +1103,7 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, int nid) | |||
963 | struct page *page; | 1103 | struct page *page; |
964 | unsigned int r_nid; | 1104 | unsigned int r_nid; |
965 | 1105 | ||
966 | if (h->order >= MAX_ORDER) | 1106 | if (hstate_is_gigantic(h)) |
967 | return NULL; | 1107 | return NULL; |
968 | 1108 | ||
969 | /* | 1109 | /* |
@@ -1156,7 +1296,7 @@ static void return_unused_surplus_pages(struct hstate *h, | |||
1156 | h->resv_huge_pages -= unused_resv_pages; | 1296 | h->resv_huge_pages -= unused_resv_pages; |
1157 | 1297 | ||
1158 | /* Cannot return gigantic pages currently */ | 1298 | /* Cannot return gigantic pages currently */ |
1159 | if (h->order >= MAX_ORDER) | 1299 | if (hstate_is_gigantic(h)) |
1160 | return; | 1300 | return; |
1161 | 1301 | ||
1162 | nr_pages = min(unused_resv_pages, h->surplus_huge_pages); | 1302 | nr_pages = min(unused_resv_pages, h->surplus_huge_pages); |
@@ -1246,24 +1386,17 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma, | |||
1246 | return ERR_PTR(-ENOSPC); | 1386 | return ERR_PTR(-ENOSPC); |
1247 | 1387 | ||
1248 | ret = hugetlb_cgroup_charge_cgroup(idx, pages_per_huge_page(h), &h_cg); | 1388 | ret = hugetlb_cgroup_charge_cgroup(idx, pages_per_huge_page(h), &h_cg); |
1249 | if (ret) { | 1389 | if (ret) |
1250 | if (chg || avoid_reserve) | 1390 | goto out_subpool_put; |
1251 | hugepage_subpool_put_pages(spool, 1); | 1391 | |
1252 | return ERR_PTR(-ENOSPC); | ||
1253 | } | ||
1254 | spin_lock(&hugetlb_lock); | 1392 | spin_lock(&hugetlb_lock); |
1255 | page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve, chg); | 1393 | page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve, chg); |
1256 | if (!page) { | 1394 | if (!page) { |
1257 | spin_unlock(&hugetlb_lock); | 1395 | spin_unlock(&hugetlb_lock); |
1258 | page = alloc_buddy_huge_page(h, NUMA_NO_NODE); | 1396 | page = alloc_buddy_huge_page(h, NUMA_NO_NODE); |
1259 | if (!page) { | 1397 | if (!page) |
1260 | hugetlb_cgroup_uncharge_cgroup(idx, | 1398 | goto out_uncharge_cgroup; |
1261 | pages_per_huge_page(h), | 1399 | |
1262 | h_cg); | ||
1263 | if (chg || avoid_reserve) | ||
1264 | hugepage_subpool_put_pages(spool, 1); | ||
1265 | return ERR_PTR(-ENOSPC); | ||
1266 | } | ||
1267 | spin_lock(&hugetlb_lock); | 1400 | spin_lock(&hugetlb_lock); |
1268 | list_move(&page->lru, &h->hugepage_activelist); | 1401 | list_move(&page->lru, &h->hugepage_activelist); |
1269 | /* Fall through */ | 1402 | /* Fall through */ |
@@ -1275,6 +1408,13 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma, | |||
1275 | 1408 | ||
1276 | vma_commit_reservation(h, vma, addr); | 1409 | vma_commit_reservation(h, vma, addr); |
1277 | return page; | 1410 | return page; |
1411 | |||
1412 | out_uncharge_cgroup: | ||
1413 | hugetlb_cgroup_uncharge_cgroup(idx, pages_per_huge_page(h), h_cg); | ||
1414 | out_subpool_put: | ||
1415 | if (chg || avoid_reserve) | ||
1416 | hugepage_subpool_put_pages(spool, 1); | ||
1417 | return ERR_PTR(-ENOSPC); | ||
1278 | } | 1418 | } |
1279 | 1419 | ||
1280 | /* | 1420 | /* |
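The alloc_huge_page() hunk above collapses the duplicated cleanup into goto-based unwind labels (out_uncharge_cgroup, out_subpool_put). Below is a minimal userspace sketch of that unwind pattern; the helper names are invented stand-ins for the subpool reservation and cgroup charge, not kernel APIs:

    #include <stdio.h>
    #include <stdlib.h>

    /* stand-ins for the subpool reservation and the cgroup charge */
    static int take_reservation(void)  { return 0; }            /* 0 = success */
    static void drop_reservation(void) { puts("reservation dropped"); }
    static int charge_cgroup(void)     { return 0; }
    static void uncharge_cgroup(void)  { puts("cgroup uncharged"); }

    static void *alloc_with_unwind(void)
    {
        void *page;

        if (take_reservation())
            return NULL;
        if (charge_cgroup())
            goto out_put_reservation;   /* undo step 1 only */

        page = malloc(4096);            /* the "huge page" */
        if (!page)
            goto out_uncharge;          /* undo step 2, then step 1 */

        return page;                    /* success: no cleanup runs */

    out_uncharge:
        uncharge_cgroup();
    out_put_reservation:
        drop_reservation();
        return NULL;
    }

    int main(void)
    {
        free(alloc_with_unwind());
        return 0;
    }

Each failure point jumps to the label that undoes exactly the steps completed so far, which is why the labels appear in reverse setup order.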
@@ -1356,7 +1496,7 @@ static void __init gather_bootmem_prealloc(void) | |||
1356 | * fix confusing memory reports from free(1) and other | 1496 | * fix confusing memory reports from free(1) and other |
1357 | * side effects, like CommitLimit going negative. | 1497 | * side effects, like CommitLimit going negative. |
1358 | */ | 1498 | */ |
1359 | if (h->order > (MAX_ORDER - 1)) | 1499 | if (hstate_is_gigantic(h)) |
1360 | adjust_managed_page_count(page, 1 << h->order); | 1500 | adjust_managed_page_count(page, 1 << h->order); |
1361 | } | 1501 | } |
1362 | } | 1502 | } |
@@ -1366,7 +1506,7 @@ static void __init hugetlb_hstate_alloc_pages(struct hstate *h) | |||
1366 | unsigned long i; | 1506 | unsigned long i; |
1367 | 1507 | ||
1368 | for (i = 0; i < h->max_huge_pages; ++i) { | 1508 | for (i = 0; i < h->max_huge_pages; ++i) { |
1369 | if (h->order >= MAX_ORDER) { | 1509 | if (hstate_is_gigantic(h)) { |
1370 | if (!alloc_bootmem_huge_page(h)) | 1510 | if (!alloc_bootmem_huge_page(h)) |
1371 | break; | 1511 | break; |
1372 | } else if (!alloc_fresh_huge_page(h, | 1512 | } else if (!alloc_fresh_huge_page(h, |
@@ -1382,7 +1522,7 @@ static void __init hugetlb_init_hstates(void) | |||
1382 | 1522 | ||
1383 | for_each_hstate(h) { | 1523 | for_each_hstate(h) { |
1384 | /* oversize hugepages were init'ed in early boot */ | 1524 | /* oversize hugepages were init'ed in early boot */ |
1385 | if (h->order < MAX_ORDER) | 1525 | if (!hstate_is_gigantic(h)) |
1386 | hugetlb_hstate_alloc_pages(h); | 1526 | hugetlb_hstate_alloc_pages(h); |
1387 | } | 1527 | } |
1388 | } | 1528 | } |
@@ -1416,7 +1556,7 @@ static void try_to_free_low(struct hstate *h, unsigned long count, | |||
1416 | { | 1556 | { |
1417 | int i; | 1557 | int i; |
1418 | 1558 | ||
1419 | if (h->order >= MAX_ORDER) | 1559 | if (hstate_is_gigantic(h)) |
1420 | return; | 1560 | return; |
1421 | 1561 | ||
1422 | for_each_node_mask(i, *nodes_allowed) { | 1562 | for_each_node_mask(i, *nodes_allowed) { |
@@ -1479,7 +1619,7 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count, | |||
1479 | { | 1619 | { |
1480 | unsigned long min_count, ret; | 1620 | unsigned long min_count, ret; |
1481 | 1621 | ||
1482 | if (h->order >= MAX_ORDER) | 1622 | if (hstate_is_gigantic(h) && !gigantic_page_supported()) |
1483 | return h->max_huge_pages; | 1623 | return h->max_huge_pages; |
1484 | 1624 | ||
1485 | /* | 1625 | /* |
@@ -1506,7 +1646,10 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count, | |||
1506 | * and reducing the surplus. | 1646 | * and reducing the surplus. |
1507 | */ | 1647 | */ |
1508 | spin_unlock(&hugetlb_lock); | 1648 | spin_unlock(&hugetlb_lock); |
1509 | ret = alloc_fresh_huge_page(h, nodes_allowed); | 1649 | if (hstate_is_gigantic(h)) |
1650 | ret = alloc_fresh_gigantic_page(h, nodes_allowed); | ||
1651 | else | ||
1652 | ret = alloc_fresh_huge_page(h, nodes_allowed); | ||
1510 | spin_lock(&hugetlb_lock); | 1653 | spin_lock(&hugetlb_lock); |
1511 | if (!ret) | 1654 | if (!ret) |
1512 | goto out; | 1655 | goto out; |
@@ -1606,7 +1749,7 @@ static ssize_t nr_hugepages_store_common(bool obey_mempolicy, | |||
1606 | goto out; | 1749 | goto out; |
1607 | 1750 | ||
1608 | h = kobj_to_hstate(kobj, &nid); | 1751 | h = kobj_to_hstate(kobj, &nid); |
1609 | if (h->order >= MAX_ORDER) { | 1752 | if (hstate_is_gigantic(h) && !gigantic_page_supported()) { |
1610 | err = -EINVAL; | 1753 | err = -EINVAL; |
1611 | goto out; | 1754 | goto out; |
1612 | } | 1755 | } |
@@ -1689,7 +1832,7 @@ static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj, | |||
1689 | unsigned long input; | 1832 | unsigned long input; |
1690 | struct hstate *h = kobj_to_hstate(kobj, NULL); | 1833 | struct hstate *h = kobj_to_hstate(kobj, NULL); |
1691 | 1834 | ||
1692 | if (h->order >= MAX_ORDER) | 1835 | if (hstate_is_gigantic(h)) |
1693 | return -EINVAL; | 1836 | return -EINVAL; |
1694 | 1837 | ||
1695 | err = kstrtoul(buf, 10, &input); | 1838 | err = kstrtoul(buf, 10, &input); |
@@ -2113,7 +2256,7 @@ static int hugetlb_sysctl_handler_common(bool obey_mempolicy, | |||
2113 | 2256 | ||
2114 | tmp = h->max_huge_pages; | 2257 | tmp = h->max_huge_pages; |
2115 | 2258 | ||
2116 | if (write && h->order >= MAX_ORDER) | 2259 | if (write && hstate_is_gigantic(h) && !gigantic_page_supported()) |
2117 | return -EINVAL; | 2260 | return -EINVAL; |
2118 | 2261 | ||
2119 | table->data = &tmp; | 2262 | table->data = &tmp; |
@@ -2169,7 +2312,7 @@ int hugetlb_overcommit_handler(struct ctl_table *table, int write, | |||
2169 | 2312 | ||
2170 | tmp = h->nr_overcommit_huge_pages; | 2313 | tmp = h->nr_overcommit_huge_pages; |
2171 | 2314 | ||
2172 | if (write && h->order >= MAX_ORDER) | 2315 | if (write && hstate_is_gigantic(h)) |
2173 | return -EINVAL; | 2316 | return -EINVAL; |
2174 | 2317 | ||
2175 | table->data = &tmp; | 2318 | table->data = &tmp; |
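Throughout the hugetlb.c hunks, the open-coded test h->order >= MAX_ORDER is replaced by the named predicate hstate_is_gigantic(). A self-contained sketch of the idea follows; the MAX_ORDER value and the order numbers are illustrative assumptions, not taken from this patch:

    #include <stdbool.h>
    #include <stdio.h>

    #define MAX_ORDER 11                    /* assumed buddy-allocator limit */

    struct hstate { unsigned int order; };  /* reduced stand-in for the kernel struct */

    /* one named predicate instead of scattered "order >= MAX_ORDER" comparisons */
    static inline bool hstate_is_gigantic(const struct hstate *h)
    {
        return h->order >= MAX_ORDER;
    }

    int main(void)
    {
        struct hstate huge_2m = { .order = 9 };     /* e.g. 2 MB pages on x86-64 */
        struct hstate huge_1g = { .order = 18 };    /* e.g. 1 GB pages on x86-64 */

        printf("2M gigantic? %d\n", hstate_is_gigantic(&huge_2m));  /* prints 0 */
        printf("1G gigantic? %d\n", hstate_is_gigantic(&huge_1g));  /* prints 1 */
        return 0;
    }

Naming the predicate lets the gigantic-page call sites (sysctl handlers, pool resizing, surplus accounting) read as policy checks rather than order arithmetic.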
diff --git a/mm/internal.h b/mm/internal.h index 07b67361a40a..7f22a11fcc66 100644 --- a/mm/internal.h +++ b/mm/internal.h | |||
@@ -134,7 +134,7 @@ struct compact_control { | |||
134 | unsigned long nr_migratepages; /* Number of pages to migrate */ | 134 | unsigned long nr_migratepages; /* Number of pages to migrate */ |
135 | unsigned long free_pfn; /* isolate_freepages search base */ | 135 | unsigned long free_pfn; /* isolate_freepages search base */ |
136 | unsigned long migrate_pfn; /* isolate_migratepages search base */ | 136 | unsigned long migrate_pfn; /* isolate_migratepages search base */ |
137 | bool sync; /* Synchronous migration */ | 137 | enum migrate_mode mode; /* Async or sync migration mode */ |
138 | bool ignore_skip_hint; /* Scan blocks even if marked skip */ | 138 | bool ignore_skip_hint; /* Scan blocks even if marked skip */ |
139 | bool finished_update_free; /* True when the zone cached pfns are | 139 | bool finished_update_free; /* True when the zone cached pfns are |
140 | * no longer being updated | 140 | * no longer being updated |
@@ -144,7 +144,10 @@ struct compact_control { | |||
144 | int order; /* order a direct compactor needs */ | 144 | int order; /* order a direct compactor needs */ |
145 | int migratetype; /* MOVABLE, RECLAIMABLE etc */ | 145 | int migratetype; /* MOVABLE, RECLAIMABLE etc */ |
146 | struct zone *zone; | 146 | struct zone *zone; |
147 | bool contended; /* True if a lock was contended */ | 147 | bool contended; /* True if a lock was contended, or |
148 | * need_resched() true during async | ||
149 | * compaction | ||
150 | */ | ||
148 | }; | 151 | }; |
149 | 152 | ||
150 | unsigned long | 153 | unsigned long |
@@ -169,6 +172,11 @@ static inline unsigned long page_order(struct page *page) | |||
169 | return page_private(page); | 172 | return page_private(page); |
170 | } | 173 | } |
171 | 174 | ||
175 | static inline bool is_cow_mapping(vm_flags_t flags) | ||
176 | { | ||
177 | return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; | ||
178 | } | ||
179 | |||
172 | /* mm/util.c */ | 180 | /* mm/util.c */ |
173 | void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma, | 181 | void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma, |
174 | struct vm_area_struct *prev, struct rb_node *rb_parent); | 182 | struct vm_area_struct *prev, struct rb_node *rb_parent); |
@@ -184,26 +192,6 @@ static inline void munlock_vma_pages_all(struct vm_area_struct *vma) | |||
184 | } | 192 | } |
185 | 193 | ||
186 | /* | 194 | /* |
187 | * Called only in fault path, to determine if a new page is being | ||
188 | * mapped into a LOCKED vma. If it is, mark page as mlocked. | ||
189 | */ | ||
190 | static inline int mlocked_vma_newpage(struct vm_area_struct *vma, | ||
191 | struct page *page) | ||
192 | { | ||
193 | VM_BUG_ON_PAGE(PageLRU(page), page); | ||
194 | |||
195 | if (likely((vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) != VM_LOCKED)) | ||
196 | return 0; | ||
197 | |||
198 | if (!TestSetPageMlocked(page)) { | ||
199 | mod_zone_page_state(page_zone(page), NR_MLOCK, | ||
200 | hpage_nr_pages(page)); | ||
201 | count_vm_event(UNEVICTABLE_PGMLOCKED); | ||
202 | } | ||
203 | return 1; | ||
204 | } | ||
205 | |||
206 | /* | ||
207 | * must be called with vma's mmap_sem held for read or write, and page locked. | 195 | * must be called with vma's mmap_sem held for read or write, and page locked. |
208 | */ | 196 | */ |
209 | extern void mlock_vma_page(struct page *page); | 197 | extern void mlock_vma_page(struct page *page); |
@@ -245,10 +233,6 @@ extern unsigned long vma_address(struct page *page, | |||
245 | struct vm_area_struct *vma); | 233 | struct vm_area_struct *vma); |
246 | #endif | 234 | #endif |
247 | #else /* !CONFIG_MMU */ | 235 | #else /* !CONFIG_MMU */ |
248 | static inline int mlocked_vma_newpage(struct vm_area_struct *v, struct page *p) | ||
249 | { | ||
250 | return 0; | ||
251 | } | ||
252 | static inline void clear_page_mlock(struct page *page) { } | 236 | static inline void clear_page_mlock(struct page *page) { } |
253 | static inline void mlock_vma_page(struct page *page) { } | 237 | static inline void mlock_vma_page(struct page *page) { } |
254 | static inline void mlock_migrate_page(struct page *new, struct page *old) { } | 238 | static inline void mlock_migrate_page(struct page *new, struct page *old) { } |
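The internal.h hunk also adds is_cow_mapping(): a mapping is copy-on-write when it may be written but is not shared. A userspace sketch of the flag test; the constants mirror common kernel values but should be treated as assumptions here:

    #include <stdbool.h>
    #include <stdio.h>

    typedef unsigned long vm_flags_t;
    #define VM_SHARED   0x00000008UL    /* illustrative values */
    #define VM_MAYWRITE 0x00000020UL

    /* private (not shared) but writable => pages are copied on first write */
    static inline bool is_cow_mapping(vm_flags_t flags)
    {
        return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
    }

    int main(void)
    {
        printf("private writable: %d\n", is_cow_mapping(VM_MAYWRITE));              /* 1 */
        printf("shared writable : %d\n", is_cow_mapping(VM_SHARED | VM_MAYWRITE));  /* 0 */
        printf("private readonly: %d\n", is_cow_mapping(0));                        /* 0 */
        return 0;
    }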
diff --git a/mm/kmemleak.c b/mm/kmemleak.c index 8d2fcdfeff7f..736ade31d1dc 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c | |||
@@ -1300,7 +1300,7 @@ static void kmemleak_scan(void) | |||
1300 | /* | 1300 | /* |
1301 | * Struct page scanning for each node. | 1301 | * Struct page scanning for each node. |
1302 | */ | 1302 | */ |
1303 | lock_memory_hotplug(); | 1303 | get_online_mems(); |
1304 | for_each_online_node(i) { | 1304 | for_each_online_node(i) { |
1305 | unsigned long start_pfn = node_start_pfn(i); | 1305 | unsigned long start_pfn = node_start_pfn(i); |
1306 | unsigned long end_pfn = node_end_pfn(i); | 1306 | unsigned long end_pfn = node_end_pfn(i); |
@@ -1318,7 +1318,7 @@ static void kmemleak_scan(void) | |||
1318 | scan_block(page, page + 1, NULL, 1); | 1318 | scan_block(page, page + 1, NULL, 1); |
1319 | } | 1319 | } |
1320 | } | 1320 | } |
1321 | unlock_memory_hotplug(); | 1321 | put_online_mems(); |
1322 | 1322 | ||
1323 | /* | 1323 | /* |
1324 | * Scanning the task stacks (may introduce false negatives). | 1324 | * Scanning the task stacks (may introduce false negatives). |
diff --git a/mm/madvise.c b/mm/madvise.c index 539eeb96b323..a402f8fdc68e 100644 --- a/mm/madvise.c +++ b/mm/madvise.c | |||
@@ -195,7 +195,7 @@ static void force_shm_swapin_readahead(struct vm_area_struct *vma, | |||
195 | for (; start < end; start += PAGE_SIZE) { | 195 | for (; start < end; start += PAGE_SIZE) { |
196 | index = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; | 196 | index = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; |
197 | 197 | ||
198 | page = find_get_page(mapping, index); | 198 | page = find_get_entry(mapping, index); |
199 | if (!radix_tree_exceptional_entry(page)) { | 199 | if (!radix_tree_exceptional_entry(page)) { |
200 | if (page) | 200 | if (page) |
201 | page_cache_release(page); | 201 | page_cache_release(page); |
diff --git a/mm/memblock.c b/mm/memblock.c index e9d6ca9a01a9..0aa0d2b07624 100644 --- a/mm/memblock.c +++ b/mm/memblock.c | |||
@@ -27,6 +27,9 @@ | |||
27 | 27 | ||
28 | static struct memblock_region memblock_memory_init_regions[INIT_MEMBLOCK_REGIONS] __initdata_memblock; | 28 | static struct memblock_region memblock_memory_init_regions[INIT_MEMBLOCK_REGIONS] __initdata_memblock; |
29 | static struct memblock_region memblock_reserved_init_regions[INIT_MEMBLOCK_REGIONS] __initdata_memblock; | 29 | static struct memblock_region memblock_reserved_init_regions[INIT_MEMBLOCK_REGIONS] __initdata_memblock; |
30 | #ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP | ||
31 | static struct memblock_region memblock_physmem_init_regions[INIT_PHYSMEM_REGIONS] __initdata_memblock; | ||
32 | #endif | ||
30 | 33 | ||
31 | struct memblock memblock __initdata_memblock = { | 34 | struct memblock memblock __initdata_memblock = { |
32 | .memory.regions = memblock_memory_init_regions, | 35 | .memory.regions = memblock_memory_init_regions, |
@@ -37,6 +40,12 @@ struct memblock memblock __initdata_memblock = { | |||
37 | .reserved.cnt = 1, /* empty dummy entry */ | 40 | .reserved.cnt = 1, /* empty dummy entry */ |
38 | .reserved.max = INIT_MEMBLOCK_REGIONS, | 41 | .reserved.max = INIT_MEMBLOCK_REGIONS, |
39 | 42 | ||
43 | #ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP | ||
44 | .physmem.regions = memblock_physmem_init_regions, | ||
45 | .physmem.cnt = 1, /* empty dummy entry */ | ||
46 | .physmem.max = INIT_PHYSMEM_REGIONS, | ||
47 | #endif | ||
48 | |||
40 | .bottom_up = false, | 49 | .bottom_up = false, |
41 | .current_limit = MEMBLOCK_ALLOC_ANYWHERE, | 50 | .current_limit = MEMBLOCK_ALLOC_ANYWHERE, |
42 | }; | 51 | }; |
@@ -472,7 +481,7 @@ static void __init_memblock memblock_insert_region(struct memblock_type *type, | |||
472 | } | 481 | } |
473 | 482 | ||
474 | /** | 483 | /** |
475 | * memblock_add_region - add new memblock region | 484 | * memblock_add_range - add new memblock region |
476 | * @type: memblock type to add new region into | 485 | * @type: memblock type to add new region into |
477 | * @base: base address of the new region | 486 | * @base: base address of the new region |
478 | * @size: size of the new region | 487 | * @size: size of the new region |
@@ -487,7 +496,7 @@ static void __init_memblock memblock_insert_region(struct memblock_type *type, | |||
487 | * RETURNS: | 496 | * RETURNS: |
488 | * 0 on success, -errno on failure. | 497 | * 0 on success, -errno on failure. |
489 | */ | 498 | */ |
490 | static int __init_memblock memblock_add_region(struct memblock_type *type, | 499 | int __init_memblock memblock_add_range(struct memblock_type *type, |
491 | phys_addr_t base, phys_addr_t size, | 500 | phys_addr_t base, phys_addr_t size, |
492 | int nid, unsigned long flags) | 501 | int nid, unsigned long flags) |
493 | { | 502 | { |
@@ -569,12 +578,12 @@ repeat: | |||
569 | int __init_memblock memblock_add_node(phys_addr_t base, phys_addr_t size, | 578 | int __init_memblock memblock_add_node(phys_addr_t base, phys_addr_t size, |
570 | int nid) | 579 | int nid) |
571 | { | 580 | { |
572 | return memblock_add_region(&memblock.memory, base, size, nid, 0); | 581 | return memblock_add_range(&memblock.memory, base, size, nid, 0); |
573 | } | 582 | } |
574 | 583 | ||
575 | int __init_memblock memblock_add(phys_addr_t base, phys_addr_t size) | 584 | int __init_memblock memblock_add(phys_addr_t base, phys_addr_t size) |
576 | { | 585 | { |
577 | return memblock_add_region(&memblock.memory, base, size, | 586 | return memblock_add_range(&memblock.memory, base, size, |
578 | MAX_NUMNODES, 0); | 587 | MAX_NUMNODES, 0); |
579 | } | 588 | } |
580 | 589 | ||
@@ -654,8 +663,8 @@ static int __init_memblock memblock_isolate_range(struct memblock_type *type, | |||
654 | return 0; | 663 | return 0; |
655 | } | 664 | } |
656 | 665 | ||
657 | static int __init_memblock __memblock_remove(struct memblock_type *type, | 666 | int __init_memblock memblock_remove_range(struct memblock_type *type, |
658 | phys_addr_t base, phys_addr_t size) | 667 | phys_addr_t base, phys_addr_t size) |
659 | { | 668 | { |
660 | int start_rgn, end_rgn; | 669 | int start_rgn, end_rgn; |
661 | int i, ret; | 670 | int i, ret; |
@@ -671,9 +680,10 @@ static int __init_memblock __memblock_remove(struct memblock_type *type, | |||
671 | 680 | ||
672 | int __init_memblock memblock_remove(phys_addr_t base, phys_addr_t size) | 681 | int __init_memblock memblock_remove(phys_addr_t base, phys_addr_t size) |
673 | { | 682 | { |
674 | return __memblock_remove(&memblock.memory, base, size); | 683 | return memblock_remove_range(&memblock.memory, base, size); |
675 | } | 684 | } |
676 | 685 | ||
686 | |||
677 | int __init_memblock memblock_free(phys_addr_t base, phys_addr_t size) | 687 | int __init_memblock memblock_free(phys_addr_t base, phys_addr_t size) |
678 | { | 688 | { |
679 | memblock_dbg(" memblock_free: [%#016llx-%#016llx] %pF\n", | 689 | memblock_dbg(" memblock_free: [%#016llx-%#016llx] %pF\n", |
@@ -681,7 +691,7 @@ int __init_memblock memblock_free(phys_addr_t base, phys_addr_t size) | |||
681 | (unsigned long long)base + size - 1, | 691 | (unsigned long long)base + size - 1, |
682 | (void *)_RET_IP_); | 692 | (void *)_RET_IP_); |
683 | 693 | ||
684 | return __memblock_remove(&memblock.reserved, base, size); | 694 | return memblock_remove_range(&memblock.reserved, base, size); |
685 | } | 695 | } |
686 | 696 | ||
687 | static int __init_memblock memblock_reserve_region(phys_addr_t base, | 697 | static int __init_memblock memblock_reserve_region(phys_addr_t base, |
@@ -696,7 +706,7 @@ static int __init_memblock memblock_reserve_region(phys_addr_t base, | |||
696 | (unsigned long long)base + size - 1, | 706 | (unsigned long long)base + size - 1, |
697 | flags, (void *)_RET_IP_); | 707 | flags, (void *)_RET_IP_); |
698 | 708 | ||
699 | return memblock_add_region(_rgn, base, size, nid, flags); | 709 | return memblock_add_range(_rgn, base, size, nid, flags); |
700 | } | 710 | } |
701 | 711 | ||
702 | int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size) | 712 | int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size) |
@@ -758,17 +768,19 @@ int __init_memblock memblock_clear_hotplug(phys_addr_t base, phys_addr_t size) | |||
758 | } | 768 | } |
759 | 769 | ||
760 | /** | 770 | /** |
761 | * __next_free_mem_range - next function for for_each_free_mem_range() | 771 | * __next__mem_range - next function for for_each_free_mem_range() etc. |
762 | * @idx: pointer to u64 loop variable | 772 | * @idx: pointer to u64 loop variable |
763 | * @nid: node selector, %NUMA_NO_NODE for all nodes | 773 | * @nid: node selector, %NUMA_NO_NODE for all nodes |
774 | * @type_a: pointer to memblock_type from where the range is taken | ||
775 | * @type_b: pointer to memblock_type which excludes memory from being taken | ||
764 | * @out_start: ptr to phys_addr_t for start address of the range, can be %NULL | 776 | * @out_start: ptr to phys_addr_t for start address of the range, can be %NULL |
765 | * @out_end: ptr to phys_addr_t for end address of the range, can be %NULL | 777 | * @out_end: ptr to phys_addr_t for end address of the range, can be %NULL |
766 | * @out_nid: ptr to int for nid of the range, can be %NULL | 778 | * @out_nid: ptr to int for nid of the range, can be %NULL |
767 | * | 779 | * |
768 | * Find the first free area from *@idx which matches @nid, fill the out | 780 | * Find the first area from *@idx which matches @nid, fill the out |
769 | * parameters, and update *@idx for the next iteration. The lower 32bit of | 781 | * parameters, and update *@idx for the next iteration. The lower 32bit of |
770 | * *@idx contains index into memory region and the upper 32bit indexes the | 782 | * *@idx contains index into type_a and the upper 32bit indexes the |
771 | * areas before each reserved region. For example, if reserved regions | 783 | * areas before each region in type_b. For example, if type_b regions |
772 | * look like the following, | 784 | * look like the following, |
773 | * | 785 | * |
774 | * 0:[0-16), 1:[32-48), 2:[128-130) | 786 | * 0:[0-16), 1:[32-48), 2:[128-130) |
@@ -780,53 +792,77 @@ int __init_memblock memblock_clear_hotplug(phys_addr_t base, phys_addr_t size) | |||
780 | * As both region arrays are sorted, the function advances the two indices | 792 | * As both region arrays are sorted, the function advances the two indices |
781 | * in lockstep and returns each intersection. | 793 | * in lockstep and returns each intersection. |
782 | */ | 794 | */ |
783 | void __init_memblock __next_free_mem_range(u64 *idx, int nid, | 795 | void __init_memblock __next_mem_range(u64 *idx, int nid, |
784 | phys_addr_t *out_start, | 796 | struct memblock_type *type_a, |
785 | phys_addr_t *out_end, int *out_nid) | 797 | struct memblock_type *type_b, |
798 | phys_addr_t *out_start, | ||
799 | phys_addr_t *out_end, int *out_nid) | ||
786 | { | 800 | { |
787 | struct memblock_type *mem = &memblock.memory; | 801 | int idx_a = *idx & 0xffffffff; |
788 | struct memblock_type *rsv = &memblock.reserved; | 802 | int idx_b = *idx >> 32; |
789 | int mi = *idx & 0xffffffff; | ||
790 | int ri = *idx >> 32; | ||
791 | 803 | ||
792 | if (WARN_ONCE(nid == MAX_NUMNODES, "Usage of MAX_NUMNODES is deprecated. Use NUMA_NO_NODE instead\n")) | 804 | if (WARN_ONCE(nid == MAX_NUMNODES, |
805 | "Usage of MAX_NUMNODES is deprecated. Use NUMA_NO_NODE instead\n")) | ||
793 | nid = NUMA_NO_NODE; | 806 | nid = NUMA_NO_NODE; |
794 | 807 | ||
795 | for ( ; mi < mem->cnt; mi++) { | 808 | for (; idx_a < type_a->cnt; idx_a++) { |
796 | struct memblock_region *m = &mem->regions[mi]; | 809 | struct memblock_region *m = &type_a->regions[idx_a]; |
810 | |||
797 | phys_addr_t m_start = m->base; | 811 | phys_addr_t m_start = m->base; |
798 | phys_addr_t m_end = m->base + m->size; | 812 | phys_addr_t m_end = m->base + m->size; |
813 | int m_nid = memblock_get_region_node(m); | ||
799 | 814 | ||
800 | /* only memory regions are associated with nodes, check it */ | 815 | /* only memory regions are associated with nodes, check it */ |
801 | if (nid != NUMA_NO_NODE && nid != memblock_get_region_node(m)) | 816 | if (nid != NUMA_NO_NODE && nid != m_nid) |
802 | continue; | 817 | continue; |
803 | 818 | ||
804 | /* scan areas before each reservation for intersection */ | 819 | if (!type_b) { |
805 | for ( ; ri < rsv->cnt + 1; ri++) { | 820 | if (out_start) |
806 | struct memblock_region *r = &rsv->regions[ri]; | 821 | *out_start = m_start; |
807 | phys_addr_t r_start = ri ? r[-1].base + r[-1].size : 0; | 822 | if (out_end) |
808 | phys_addr_t r_end = ri < rsv->cnt ? r->base : ULLONG_MAX; | 823 | *out_end = m_end; |
824 | if (out_nid) | ||
825 | *out_nid = m_nid; | ||
826 | idx_a++; | ||
827 | *idx = (u32)idx_a | (u64)idx_b << 32; | ||
828 | return; | ||
829 | } | ||
830 | |||
831 | /* scan areas before each reservation */ | ||
832 | for (; idx_b < type_b->cnt + 1; idx_b++) { | ||
833 | struct memblock_region *r; | ||
834 | phys_addr_t r_start; | ||
835 | phys_addr_t r_end; | ||
809 | 836 | ||
810 | /* if ri advanced past mi, break out to advance mi */ | 837 | r = &type_b->regions[idx_b]; |
838 | r_start = idx_b ? r[-1].base + r[-1].size : 0; | ||
839 | r_end = idx_b < type_b->cnt ? | ||
840 | r->base : ULLONG_MAX; | ||
841 | |||
842 | /* | ||
843 | * if idx_b advanced past idx_a, | ||
844 | * break out to advance idx_a | ||
845 | */ | ||
811 | if (r_start >= m_end) | 846 | if (r_start >= m_end) |
812 | break; | 847 | break; |
813 | /* if the two regions intersect, we're done */ | 848 | /* if the two regions intersect, we're done */ |
814 | if (m_start < r_end) { | 849 | if (m_start < r_end) { |
815 | if (out_start) | 850 | if (out_start) |
816 | *out_start = max(m_start, r_start); | 851 | *out_start = |
852 | max(m_start, r_start); | ||
817 | if (out_end) | 853 | if (out_end) |
818 | *out_end = min(m_end, r_end); | 854 | *out_end = min(m_end, r_end); |
819 | if (out_nid) | 855 | if (out_nid) |
820 | *out_nid = memblock_get_region_node(m); | 856 | *out_nid = m_nid; |
821 | /* | 857 | /* |
822 | * The region which ends first is advanced | 858 | * The region which ends first is |
823 | * for the next iteration. | 859 | * advanced for the next iteration. |
824 | */ | 860 | */ |
825 | if (m_end <= r_end) | 861 | if (m_end <= r_end) |
826 | mi++; | 862 | idx_a++; |
827 | else | 863 | else |
828 | ri++; | 864 | idx_b++; |
829 | *idx = (u32)mi | (u64)ri << 32; | 865 | *idx = (u32)idx_a | (u64)idx_b << 32; |
830 | return; | 866 | return; |
831 | } | 867 | } |
832 | } | 868 | } |
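The rewritten iterator keeps walking two sorted region arrays with a single u64 cursor, as the comment above describes: the low 32 bits index type_a and the high 32 bits index the gaps before each type_b region. The userspace model below reproduces that lockstep walk over invented region data:

    #include <stdint.h>
    #include <stdio.h>

    struct region { uint64_t base, size; };

    /* toy "memory" (type_a) and "reserved" (type_b) arrays, sorted, non-overlapping */
    static const struct region memory[]   = { { 0, 130 } };
    static const struct region reserved[] = { { 16, 16 }, { 48, 80 } };
    #define NMEM (sizeof(memory) / sizeof(memory[0]))
    #define NRSV (sizeof(reserved) / sizeof(reserved[0]))

    /*
     * One u64 cursor packs both positions: low 32 bits walk memory[],
     * high 32 bits walk the gaps before/after each reserved[] entry.
     * Returns 1 and fills [*start, *end) with the next free range, 0 when done.
     */
    static int next_free_range(uint64_t *idx, uint64_t *start, uint64_t *end)
    {
        uint32_t ia = (uint32_t)*idx;
        uint32_t ib = (uint32_t)(*idx >> 32);

        for (; ia < NMEM; ia++) {
            uint64_t m_start = memory[ia].base;
            uint64_t m_end   = m_start + memory[ia].size;

            for (; ib < NRSV + 1; ib++) {
                uint64_t r_start = ib ? reserved[ib - 1].base + reserved[ib - 1].size : 0;
                uint64_t r_end   = ib < NRSV ? reserved[ib].base : UINT64_MAX;

                if (r_start >= m_end)
                    break;                      /* gap starts past this memory region */
                if (m_start < r_end) {          /* the two ranges intersect */
                    *start = m_start > r_start ? m_start : r_start;
                    *end   = m_end   < r_end   ? m_end   : r_end;
                    if (m_end <= r_end)
                        ia++;                   /* memory region exhausted first */
                    else
                        ib++;                   /* gap exhausted first */
                    *idx = (uint64_t)ia | ((uint64_t)ib << 32);
                    return 1;
                }
            }
        }
        *idx = UINT64_MAX;                      /* signal end of iteration */
        return 0;
    }

    int main(void)
    {
        uint64_t idx = 0, s, e;

        while (next_free_range(&idx, &s, &e))   /* prints [0-16) [32-48) [128-130) */
            printf("free: [%llu-%llu)\n", (unsigned long long)s, (unsigned long long)e);
        return 0;
    }

The generalisation in the patch is that type_a and type_b become parameters (with a NULL type_b meaning "no exclusions"), so the same walker serves both the free-range and the physmem iterators.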
@@ -837,57 +873,80 @@ void __init_memblock __next_free_mem_range(u64 *idx, int nid, | |||
837 | } | 873 | } |
838 | 874 | ||
839 | /** | 875 | /** |
840 | * __next_free_mem_range_rev - next function for for_each_free_mem_range_reverse() | 876 | * __next_mem_range_rev - generic next function for for_each_*_range_rev() |
877 | * | ||
878 | * Finds the next range from type_a which is not marked as unsuitable | ||
879 | * in type_b. | ||
880 | * | ||
841 | * @idx: pointer to u64 loop variable | 881 | * @idx: pointer to u64 loop variable |
842 | * @nid: node selector, %NUMA_NO_NODE for all nodes | 882 | * @nid: node selector, %NUMA_NO_NODE for all nodes |
883 | * @type_a: pointer to memblock_type from where the range is taken | ||
884 | * @type_b: pointer to memblock_type which excludes memory from being taken | ||
843 | * @out_start: ptr to phys_addr_t for start address of the range, can be %NULL | 885 | * @out_start: ptr to phys_addr_t for start address of the range, can be %NULL |
844 | * @out_end: ptr to phys_addr_t for end address of the range, can be %NULL | 886 | * @out_end: ptr to phys_addr_t for end address of the range, can be %NULL |
845 | * @out_nid: ptr to int for nid of the range, can be %NULL | 887 | * @out_nid: ptr to int for nid of the range, can be %NULL |
846 | * | 888 | * |
847 | * Reverse of __next_free_mem_range(). | 889 | * Reverse of __next_mem_range(). |
848 | * | ||
849 | * Linux kernel cannot migrate pages used by itself. Memory hotplug users won't | ||
850 | * be able to hot-remove hotpluggable memory used by the kernel. So this | ||
851 | * function skip hotpluggable regions if needed when allocating memory for the | ||
852 | * kernel. | ||
853 | */ | 890 | */ |
854 | void __init_memblock __next_free_mem_range_rev(u64 *idx, int nid, | 891 | void __init_memblock __next_mem_range_rev(u64 *idx, int nid, |
855 | phys_addr_t *out_start, | 892 | struct memblock_type *type_a, |
856 | phys_addr_t *out_end, int *out_nid) | 893 | struct memblock_type *type_b, |
894 | phys_addr_t *out_start, | ||
895 | phys_addr_t *out_end, int *out_nid) | ||
857 | { | 896 | { |
858 | struct memblock_type *mem = &memblock.memory; | 897 | int idx_a = *idx & 0xffffffff; |
859 | struct memblock_type *rsv = &memblock.reserved; | 898 | int idx_b = *idx >> 32; |
860 | int mi = *idx & 0xffffffff; | ||
861 | int ri = *idx >> 32; | ||
862 | 899 | ||
863 | if (WARN_ONCE(nid == MAX_NUMNODES, "Usage of MAX_NUMNODES is deprecated. Use NUMA_NO_NODE instead\n")) | 900 | if (WARN_ONCE(nid == MAX_NUMNODES, "Usage of MAX_NUMNODES is deprecated. Use NUMA_NO_NODE instead\n")) |
864 | nid = NUMA_NO_NODE; | 901 | nid = NUMA_NO_NODE; |
865 | 902 | ||
866 | if (*idx == (u64)ULLONG_MAX) { | 903 | if (*idx == (u64)ULLONG_MAX) { |
867 | mi = mem->cnt - 1; | 904 | idx_a = type_a->cnt - 1; |
868 | ri = rsv->cnt; | 905 | idx_b = type_b->cnt; |
869 | } | 906 | } |
870 | 907 | ||
871 | for ( ; mi >= 0; mi--) { | 908 | for (; idx_a >= 0; idx_a--) { |
872 | struct memblock_region *m = &mem->regions[mi]; | 909 | struct memblock_region *m = &type_a->regions[idx_a]; |
910 | |||
873 | phys_addr_t m_start = m->base; | 911 | phys_addr_t m_start = m->base; |
874 | phys_addr_t m_end = m->base + m->size; | 912 | phys_addr_t m_end = m->base + m->size; |
913 | int m_nid = memblock_get_region_node(m); | ||
875 | 914 | ||
876 | /* only memory regions are associated with nodes, check it */ | 915 | /* only memory regions are associated with nodes, check it */ |
877 | if (nid != NUMA_NO_NODE && nid != memblock_get_region_node(m)) | 916 | if (nid != NUMA_NO_NODE && nid != m_nid) |
878 | continue; | 917 | continue; |
879 | 918 | ||
880 | /* skip hotpluggable memory regions if needed */ | 919 | /* skip hotpluggable memory regions if needed */ |
881 | if (movable_node_is_enabled() && memblock_is_hotpluggable(m)) | 920 | if (movable_node_is_enabled() && memblock_is_hotpluggable(m)) |
882 | continue; | 921 | continue; |
883 | 922 | ||
884 | /* scan areas before each reservation for intersection */ | 923 | if (!type_b) { |
885 | for ( ; ri >= 0; ri--) { | 924 | if (out_start) |
886 | struct memblock_region *r = &rsv->regions[ri]; | 925 | *out_start = m_start; |
887 | phys_addr_t r_start = ri ? r[-1].base + r[-1].size : 0; | 926 | if (out_end) |
888 | phys_addr_t r_end = ri < rsv->cnt ? r->base : ULLONG_MAX; | 927 | *out_end = m_end; |
928 | if (out_nid) | ||
929 | *out_nid = m_nid; | ||
930 | idx_a++; | ||
931 | *idx = (u32)idx_a | (u64)idx_b << 32; | ||
932 | return; | ||
933 | } | ||
934 | |||
935 | /* scan areas before each reservation */ | ||
936 | for (; idx_b >= 0; idx_b--) { | ||
937 | struct memblock_region *r; | ||
938 | phys_addr_t r_start; | ||
939 | phys_addr_t r_end; | ||
940 | |||
941 | r = &type_b->regions[idx_b]; | ||
942 | r_start = idx_b ? r[-1].base + r[-1].size : 0; | ||
943 | r_end = idx_b < type_b->cnt ? | ||
944 | r->base : ULLONG_MAX; | ||
945 | /* | ||
946 | * if idx_b advanced past idx_a, | ||
947 | * break out to advance idx_a | ||
948 | */ | ||
889 | 949 | ||
890 | /* if ri advanced past mi, break out to advance mi */ | ||
891 | if (r_end <= m_start) | 950 | if (r_end <= m_start) |
892 | break; | 951 | break; |
893 | /* if the two regions intersect, we're done */ | 952 | /* if the two regions intersect, we're done */ |
@@ -897,18 +956,17 @@ void __init_memblock __next_free_mem_range_rev(u64 *idx, int nid, | |||
897 | if (out_end) | 956 | if (out_end) |
898 | *out_end = min(m_end, r_end); | 957 | *out_end = min(m_end, r_end); |
899 | if (out_nid) | 958 | if (out_nid) |
900 | *out_nid = memblock_get_region_node(m); | 959 | *out_nid = m_nid; |
901 | |||
902 | if (m_start >= r_start) | 960 | if (m_start >= r_start) |
903 | mi--; | 961 | idx_a--; |
904 | else | 962 | else |
905 | ri--; | 963 | idx_b--; |
906 | *idx = (u32)mi | (u64)ri << 32; | 964 | *idx = (u32)idx_a | (u64)idx_b << 32; |
907 | return; | 965 | return; |
908 | } | 966 | } |
909 | } | 967 | } |
910 | } | 968 | } |
911 | 969 | /* signal end of iteration */ | |
912 | *idx = ULLONG_MAX; | 970 | *idx = ULLONG_MAX; |
913 | } | 971 | } |
914 | 972 | ||
@@ -975,22 +1033,35 @@ int __init_memblock memblock_set_node(phys_addr_t base, phys_addr_t size, | |||
975 | } | 1033 | } |
976 | #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ | 1034 | #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ |
977 | 1035 | ||
978 | static phys_addr_t __init memblock_alloc_base_nid(phys_addr_t size, | 1036 | static phys_addr_t __init memblock_alloc_range_nid(phys_addr_t size, |
979 | phys_addr_t align, phys_addr_t max_addr, | 1037 | phys_addr_t align, phys_addr_t start, |
980 | int nid) | 1038 | phys_addr_t end, int nid) |
981 | { | 1039 | { |
982 | phys_addr_t found; | 1040 | phys_addr_t found; |
983 | 1041 | ||
984 | if (!align) | 1042 | if (!align) |
985 | align = SMP_CACHE_BYTES; | 1043 | align = SMP_CACHE_BYTES; |
986 | 1044 | ||
987 | found = memblock_find_in_range_node(size, align, 0, max_addr, nid); | 1045 | found = memblock_find_in_range_node(size, align, start, end, nid); |
988 | if (found && !memblock_reserve(found, size)) | 1046 | if (found && !memblock_reserve(found, size)) |
989 | return found; | 1047 | return found; |
990 | 1048 | ||
991 | return 0; | 1049 | return 0; |
992 | } | 1050 | } |
993 | 1051 | ||
1052 | phys_addr_t __init memblock_alloc_range(phys_addr_t size, phys_addr_t align, | ||
1053 | phys_addr_t start, phys_addr_t end) | ||
1054 | { | ||
1055 | return memblock_alloc_range_nid(size, align, start, end, NUMA_NO_NODE); | ||
1056 | } | ||
1057 | |||
1058 | static phys_addr_t __init memblock_alloc_base_nid(phys_addr_t size, | ||
1059 | phys_addr_t align, phys_addr_t max_addr, | ||
1060 | int nid) | ||
1061 | { | ||
1062 | return memblock_alloc_range_nid(size, align, 0, max_addr, nid); | ||
1063 | } | ||
1064 | |||
994 | phys_addr_t __init memblock_alloc_nid(phys_addr_t size, phys_addr_t align, int nid) | 1065 | phys_addr_t __init memblock_alloc_nid(phys_addr_t size, phys_addr_t align, int nid) |
995 | { | 1066 | { |
996 | return memblock_alloc_base_nid(size, align, MEMBLOCK_ALLOC_ACCESSIBLE, nid); | 1067 | return memblock_alloc_base_nid(size, align, MEMBLOCK_ALLOC_ACCESSIBLE, nid); |
@@ -1201,7 +1272,7 @@ void __init __memblock_free_early(phys_addr_t base, phys_addr_t size) | |||
1201 | __func__, (u64)base, (u64)base + size - 1, | 1272 | __func__, (u64)base, (u64)base + size - 1, |
1202 | (void *)_RET_IP_); | 1273 | (void *)_RET_IP_); |
1203 | kmemleak_free_part(__va(base), size); | 1274 | kmemleak_free_part(__va(base), size); |
1204 | __memblock_remove(&memblock.reserved, base, size); | 1275 | memblock_remove_range(&memblock.reserved, base, size); |
1205 | } | 1276 | } |
1206 | 1277 | ||
1207 | /* | 1278 | /* |
@@ -1287,8 +1358,10 @@ void __init memblock_enforce_memory_limit(phys_addr_t limit) | |||
1287 | } | 1358 | } |
1288 | 1359 | ||
1289 | /* truncate both memory and reserved regions */ | 1360 | /* truncate both memory and reserved regions */ |
1290 | __memblock_remove(&memblock.memory, max_addr, (phys_addr_t)ULLONG_MAX); | 1361 | memblock_remove_range(&memblock.memory, max_addr, |
1291 | __memblock_remove(&memblock.reserved, max_addr, (phys_addr_t)ULLONG_MAX); | 1362 | (phys_addr_t)ULLONG_MAX); |
1363 | memblock_remove_range(&memblock.reserved, max_addr, | ||
1364 | (phys_addr_t)ULLONG_MAX); | ||
1292 | } | 1365 | } |
1293 | 1366 | ||
1294 | static int __init_memblock memblock_search(struct memblock_type *type, phys_addr_t addr) | 1367 | static int __init_memblock memblock_search(struct memblock_type *type, phys_addr_t addr) |
@@ -1329,9 +1402,8 @@ int __init_memblock memblock_search_pfn_nid(unsigned long pfn, | |||
1329 | if (mid == -1) | 1402 | if (mid == -1) |
1330 | return -1; | 1403 | return -1; |
1331 | 1404 | ||
1332 | *start_pfn = type->regions[mid].base >> PAGE_SHIFT; | 1405 | *start_pfn = PFN_DOWN(type->regions[mid].base); |
1333 | *end_pfn = (type->regions[mid].base + type->regions[mid].size) | 1406 | *end_pfn = PFN_DOWN(type->regions[mid].base + type->regions[mid].size); |
1334 | >> PAGE_SHIFT; | ||
1335 | 1407 | ||
1336 | return type->regions[mid].nid; | 1408 | return type->regions[mid].nid; |
1337 | } | 1409 | } |
@@ -1502,6 +1574,9 @@ static int __init memblock_init_debugfs(void) | |||
1502 | return -ENXIO; | 1574 | return -ENXIO; |
1503 | debugfs_create_file("memory", S_IRUGO, root, &memblock.memory, &memblock_debug_fops); | 1575 | debugfs_create_file("memory", S_IRUGO, root, &memblock.memory, &memblock_debug_fops); |
1504 | debugfs_create_file("reserved", S_IRUGO, root, &memblock.reserved, &memblock_debug_fops); | 1576 | debugfs_create_file("reserved", S_IRUGO, root, &memblock.reserved, &memblock_debug_fops); |
1577 | #ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP | ||
1578 | debugfs_create_file("physmem", S_IRUGO, root, &memblock.physmem, &memblock_debug_fops); | ||
1579 | #endif | ||
1505 | 1580 | ||
1506 | return 0; | 1581 | return 0; |
1507 | } | 1582 | } |
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index c47dffdcb246..a500cb0594c4 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c | |||
@@ -80,7 +80,7 @@ int do_swap_account __read_mostly; | |||
80 | #ifdef CONFIG_MEMCG_SWAP_ENABLED | 80 | #ifdef CONFIG_MEMCG_SWAP_ENABLED |
81 | static int really_do_swap_account __initdata = 1; | 81 | static int really_do_swap_account __initdata = 1; |
82 | #else | 82 | #else |
83 | static int really_do_swap_account __initdata = 0; | 83 | static int really_do_swap_account __initdata; |
84 | #endif | 84 | #endif |
85 | 85 | ||
86 | #else | 86 | #else |
@@ -357,10 +357,9 @@ struct mem_cgroup { | |||
357 | struct cg_proto tcp_mem; | 357 | struct cg_proto tcp_mem; |
358 | #endif | 358 | #endif |
359 | #if defined(CONFIG_MEMCG_KMEM) | 359 | #if defined(CONFIG_MEMCG_KMEM) |
360 | /* analogous to slab_common's slab_caches list. per-memcg */ | 360 | /* analogous to slab_common's slab_caches list, but per-memcg; |
361 | * protected by memcg_slab_mutex */ | ||
361 | struct list_head memcg_slab_caches; | 362 | struct list_head memcg_slab_caches; |
362 | /* Not a spinlock, we can take a lot of time walking the list */ | ||
363 | struct mutex slab_caches_mutex; | ||
364 | /* Index in the kmem_cache->memcg_params->memcg_caches array */ | 363 | /* Index in the kmem_cache->memcg_params->memcg_caches array */ |
365 | int kmemcg_id; | 364 | int kmemcg_id; |
366 | #endif | 365 | #endif |
@@ -1077,9 +1076,18 @@ static struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm) | |||
1077 | 1076 | ||
1078 | rcu_read_lock(); | 1077 | rcu_read_lock(); |
1079 | do { | 1078 | do { |
1080 | memcg = mem_cgroup_from_task(rcu_dereference(mm->owner)); | 1079 | /* |
1081 | if (unlikely(!memcg)) | 1080 | * Page cache insertions can happen withou an |
1081 | * actual mm context, e.g. during disk probing | ||
1082 | * on boot, loopback IO, acct() writes etc. | ||
1083 | */ | ||
1084 | if (unlikely(!mm)) | ||
1082 | memcg = root_mem_cgroup; | 1085 | memcg = root_mem_cgroup; |
1086 | else { | ||
1087 | memcg = mem_cgroup_from_task(rcu_dereference(mm->owner)); | ||
1088 | if (unlikely(!memcg)) | ||
1089 | memcg = root_mem_cgroup; | ||
1090 | } | ||
1083 | } while (!css_tryget(&memcg->css)); | 1091 | } while (!css_tryget(&memcg->css)); |
1084 | rcu_read_unlock(); | 1092 | rcu_read_unlock(); |
1085 | return memcg; | 1093 | return memcg; |
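The hunk above makes get_mem_cgroup_from_mm() tolerate a NULL mm by charging the root cgroup, while keeping the retry loop around css_tryget() for a target that is concurrently going away. A simplified userspace model of that selection logic; the struct names and the tryget stub are invented for illustration:

    #include <stdbool.h>
    #include <stddef.h>
    #include <stdio.h>

    struct group { const char *name; int refs; bool dying; };
    struct task  { struct group *memcg; };

    static struct group root_group = { "root", 1, false };

    /* models css_tryget(): fails once the group is being torn down */
    static bool group_tryget(struct group *g)
    {
        if (g->dying)
            return false;
        g->refs++;
        return true;
    }

    /*
     * Pick the charge target: with no task context (the NULL-mm case in the
     * hunk above: disk probing at boot, loopback IO, acct() writes, ...)
     * fall back to the root group; otherwise use the task's group, again
     * falling back to root if none is set. Retry until a reference sticks;
     * in the kernel the candidate can change between retries under RCU.
     */
    static struct group *get_group_from_task(const struct task *t)
    {
        struct group *g;

        do {
            if (!t || !t->memcg)
                g = &root_group;
            else
                g = t->memcg;
        } while (!group_tryget(g));

        return g;
    }

    int main(void)
    {
        struct group web = { "web", 1, false };
        struct task  t   = { &web };

        printf("%s\n", get_group_from_task(&t)->name);   /* web  */
        printf("%s\n", get_group_from_task(NULL)->name); /* root */
        return 0;
    }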
@@ -1586,23 +1594,12 @@ static void mem_cgroup_end_move(struct mem_cgroup *memcg) | |||
1586 | } | 1594 | } |
1587 | 1595 | ||
1588 | /* | 1596 | /* |
1589 | * 2 routines for checking "mem" is under move_account() or not. | 1597 | * A routine for checking "mem" is under move_account() or not. |
1590 | * | ||
1591 | * mem_cgroup_stolen() - checking whether a cgroup is mc.from or not. This | ||
1592 | * is used for avoiding races in accounting. If true, | ||
1593 | * pc->mem_cgroup may be overwritten. | ||
1594 | * | 1598 | * |
1595 | * mem_cgroup_under_move() - checking a cgroup is mc.from or mc.to or | 1599 | * Checking a cgroup is mc.from or mc.to or under hierarchy of |
1596 | * under hierarchy of moving cgroups. This is for | 1600 | * moving cgroups. This is for waiting at high-memory pressure |
1597 | * waiting at hith-memory prressure caused by "move". | 1601 | * caused by "move". |
1598 | */ | 1602 | */ |
1599 | |||
1600 | static bool mem_cgroup_stolen(struct mem_cgroup *memcg) | ||
1601 | { | ||
1602 | VM_BUG_ON(!rcu_read_lock_held()); | ||
1603 | return atomic_read(&memcg->moving_account) > 0; | ||
1604 | } | ||
1605 | |||
1606 | static bool mem_cgroup_under_move(struct mem_cgroup *memcg) | 1603 | static bool mem_cgroup_under_move(struct mem_cgroup *memcg) |
1607 | { | 1604 | { |
1608 | struct mem_cgroup *from; | 1605 | struct mem_cgroup *from; |
@@ -1645,7 +1642,6 @@ static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg) | |||
1645 | * Take this lock when | 1642 | * Take this lock when |
1646 | * - a code tries to modify page's memcg while it's USED. | 1643 | * - a code tries to modify page's memcg while it's USED. |
1647 | * - a code tries to modify page state accounting in a memcg. | 1644 | * - a code tries to modify page state accounting in a memcg. |
1648 | * see mem_cgroup_stolen(), too. | ||
1649 | */ | 1645 | */ |
1650 | static void move_lock_mem_cgroup(struct mem_cgroup *memcg, | 1646 | static void move_lock_mem_cgroup(struct mem_cgroup *memcg, |
1651 | unsigned long *flags) | 1647 | unsigned long *flags) |
@@ -2280,12 +2276,11 @@ cleanup: | |||
2280 | } | 2276 | } |
2281 | 2277 | ||
2282 | /* | 2278 | /* |
2283 | * Currently used to update mapped file statistics, but the routine can be | 2279 | * Used to update mapped file or writeback or other statistics. |
2284 | * generalized to update other statistics as well. | ||
2285 | * | 2280 | * |
2286 | * Notes: Race condition | 2281 | * Notes: Race condition |
2287 | * | 2282 | * |
2288 | * We usually use page_cgroup_lock() for accessing page_cgroup member but | 2283 | * We usually use lock_page_cgroup() for accessing page_cgroup member but |
2289 | * it tends to be costly. But considering some conditions, we don't need | 2284 | * it tends to be costly. But considering some conditions, we don't need |
2290 | * to do so _always_. | 2285 | * to do so _always_. |
2291 | * | 2286 | * |
@@ -2299,8 +2294,8 @@ cleanup: | |||
2299 | * by flags. | 2294 | * by flags. |
2300 | * | 2295 | * |
2301 | * Considering "move", this is an only case we see a race. To make the race | 2296 | * Considering "move", this is an only case we see a race. To make the race |
2302 | * small, we check mm->moving_account and detect there are possibility of race | 2297 | * small, we check memcg->moving_account and detect there are possibility |
2303 | * If there is, we take a lock. | 2298 | * of race or not. If there is, we take a lock. |
2304 | */ | 2299 | */ |
2305 | 2300 | ||
2306 | void __mem_cgroup_begin_update_page_stat(struct page *page, | 2301 | void __mem_cgroup_begin_update_page_stat(struct page *page, |
@@ -2318,9 +2313,10 @@ again: | |||
2318 | * If this memory cgroup is not under account moving, we don't | 2313 | * If this memory cgroup is not under account moving, we don't |
2319 | * need to take move_lock_mem_cgroup(). Because we already hold | 2314 | * need to take move_lock_mem_cgroup(). Because we already hold |
2320 | * rcu_read_lock(), any calls to move_account will be delayed until | 2315 | * rcu_read_lock(), any calls to move_account will be delayed until |
2321 | * rcu_read_unlock() if mem_cgroup_stolen() == true. | 2316 | * rcu_read_unlock(). |
2322 | */ | 2317 | */ |
2323 | if (!mem_cgroup_stolen(memcg)) | 2318 | VM_BUG_ON(!rcu_read_lock_held()); |
2319 | if (atomic_read(&memcg->moving_account) <= 0) | ||
2324 | return; | 2320 | return; |
2325 | 2321 | ||
2326 | move_lock_mem_cgroup(memcg, flags); | 2322 | move_lock_mem_cgroup(memcg, flags); |
@@ -2428,7 +2424,7 @@ static void drain_stock(struct memcg_stock_pcp *stock) | |||
2428 | */ | 2424 | */ |
2429 | static void drain_local_stock(struct work_struct *dummy) | 2425 | static void drain_local_stock(struct work_struct *dummy) |
2430 | { | 2426 | { |
2431 | struct memcg_stock_pcp *stock = &__get_cpu_var(memcg_stock); | 2427 | struct memcg_stock_pcp *stock = this_cpu_ptr(&memcg_stock); |
2432 | drain_stock(stock); | 2428 | drain_stock(stock); |
2433 | clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags); | 2429 | clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags); |
2434 | } | 2430 | } |
@@ -2675,7 +2671,8 @@ static int mem_cgroup_try_charge(struct mem_cgroup *memcg, | |||
2675 | * free their memory. | 2671 | * free their memory. |
2676 | */ | 2672 | */ |
2677 | if (unlikely(test_thread_flag(TIF_MEMDIE) || | 2673 | if (unlikely(test_thread_flag(TIF_MEMDIE) || |
2678 | fatal_signal_pending(current))) | 2674 | fatal_signal_pending(current) || |
2675 | current->flags & PF_EXITING)) | ||
2679 | goto bypass; | 2676 | goto bypass; |
2680 | 2677 | ||
2681 | if (unlikely(task_in_memcg_oom(current))) | 2678 | if (unlikely(task_in_memcg_oom(current))) |
@@ -2903,6 +2900,12 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg, | |||
2903 | static DEFINE_MUTEX(set_limit_mutex); | 2900 | static DEFINE_MUTEX(set_limit_mutex); |
2904 | 2901 | ||
2905 | #ifdef CONFIG_MEMCG_KMEM | 2902 | #ifdef CONFIG_MEMCG_KMEM |
2903 | /* | ||
2904 | * The memcg_slab_mutex is held whenever a per memcg kmem cache is created or | ||
2905 | * destroyed. It protects memcg_caches arrays and memcg_slab_caches lists. | ||
2906 | */ | ||
2907 | static DEFINE_MUTEX(memcg_slab_mutex); | ||
2908 | |||
2906 | static DEFINE_MUTEX(activate_kmem_mutex); | 2909 | static DEFINE_MUTEX(activate_kmem_mutex); |
2907 | 2910 | ||
2908 | static inline bool memcg_can_account_kmem(struct mem_cgroup *memcg) | 2911 | static inline bool memcg_can_account_kmem(struct mem_cgroup *memcg) |
@@ -2935,10 +2938,10 @@ static int mem_cgroup_slabinfo_read(struct seq_file *m, void *v) | |||
2935 | 2938 | ||
2936 | print_slabinfo_header(m); | 2939 | print_slabinfo_header(m); |
2937 | 2940 | ||
2938 | mutex_lock(&memcg->slab_caches_mutex); | 2941 | mutex_lock(&memcg_slab_mutex); |
2939 | list_for_each_entry(params, &memcg->memcg_slab_caches, list) | 2942 | list_for_each_entry(params, &memcg->memcg_slab_caches, list) |
2940 | cache_show(memcg_params_to_cache(params), m); | 2943 | cache_show(memcg_params_to_cache(params), m); |
2941 | mutex_unlock(&memcg->slab_caches_mutex); | 2944 | mutex_unlock(&memcg_slab_mutex); |
2942 | 2945 | ||
2943 | return 0; | 2946 | return 0; |
2944 | } | 2947 | } |
@@ -3040,8 +3043,6 @@ void memcg_update_array_size(int num) | |||
3040 | memcg_limited_groups_array_size = memcg_caches_array_size(num); | 3043 | memcg_limited_groups_array_size = memcg_caches_array_size(num); |
3041 | } | 3044 | } |
3042 | 3045 | ||
3043 | static void kmem_cache_destroy_work_func(struct work_struct *w); | ||
3044 | |||
3045 | int memcg_update_cache_size(struct kmem_cache *s, int num_groups) | 3046 | int memcg_update_cache_size(struct kmem_cache *s, int num_groups) |
3046 | { | 3047 | { |
3047 | struct memcg_cache_params *cur_params = s->memcg_params; | 3048 | struct memcg_cache_params *cur_params = s->memcg_params; |
@@ -3094,29 +3095,6 @@ int memcg_update_cache_size(struct kmem_cache *s, int num_groups) | |||
3094 | return 0; | 3095 | return 0; |
3095 | } | 3096 | } |
3096 | 3097 | ||
3097 | char *memcg_create_cache_name(struct mem_cgroup *memcg, | ||
3098 | struct kmem_cache *root_cache) | ||
3099 | { | ||
3100 | static char *buf = NULL; | ||
3101 | |||
3102 | /* | ||
3103 | * We need a mutex here to protect the shared buffer. Since this is | ||
3104 | * expected to be called only on cache creation, we can employ the | ||
3105 | * slab_mutex for that purpose. | ||
3106 | */ | ||
3107 | lockdep_assert_held(&slab_mutex); | ||
3108 | |||
3109 | if (!buf) { | ||
3110 | buf = kmalloc(NAME_MAX + 1, GFP_KERNEL); | ||
3111 | if (!buf) | ||
3112 | return NULL; | ||
3113 | } | ||
3114 | |||
3115 | cgroup_name(memcg->css.cgroup, buf, NAME_MAX + 1); | ||
3116 | return kasprintf(GFP_KERNEL, "%s(%d:%s)", root_cache->name, | ||
3117 | memcg_cache_id(memcg), buf); | ||
3118 | } | ||
3119 | |||
3120 | int memcg_alloc_cache_params(struct mem_cgroup *memcg, struct kmem_cache *s, | 3098 | int memcg_alloc_cache_params(struct mem_cgroup *memcg, struct kmem_cache *s, |
3121 | struct kmem_cache *root_cache) | 3099 | struct kmem_cache *root_cache) |
3122 | { | 3100 | { |
@@ -3138,8 +3116,6 @@ int memcg_alloc_cache_params(struct mem_cgroup *memcg, struct kmem_cache *s, | |||
3138 | if (memcg) { | 3116 | if (memcg) { |
3139 | s->memcg_params->memcg = memcg; | 3117 | s->memcg_params->memcg = memcg; |
3140 | s->memcg_params->root_cache = root_cache; | 3118 | s->memcg_params->root_cache = root_cache; |
3141 | INIT_WORK(&s->memcg_params->destroy, | ||
3142 | kmem_cache_destroy_work_func); | ||
3143 | css_get(&memcg->css); | 3119 | css_get(&memcg->css); |
3144 | } else | 3120 | } else |
3145 | s->memcg_params->is_root_cache = true; | 3121 | s->memcg_params->is_root_cache = true; |
@@ -3156,24 +3132,37 @@ void memcg_free_cache_params(struct kmem_cache *s) | |||
3156 | kfree(s->memcg_params); | 3132 | kfree(s->memcg_params); |
3157 | } | 3133 | } |
3158 | 3134 | ||
3159 | void memcg_register_cache(struct kmem_cache *s) | 3135 | static void memcg_register_cache(struct mem_cgroup *memcg, |
3136 | struct kmem_cache *root_cache) | ||
3160 | { | 3137 | { |
3161 | struct kmem_cache *root; | 3138 | static char memcg_name_buf[NAME_MAX + 1]; /* protected by |
3162 | struct mem_cgroup *memcg; | 3139 | memcg_slab_mutex */ |
3140 | struct kmem_cache *cachep; | ||
3163 | int id; | 3141 | int id; |
3164 | 3142 | ||
3165 | if (is_root_cache(s)) | 3143 | lockdep_assert_held(&memcg_slab_mutex); |
3144 | |||
3145 | id = memcg_cache_id(memcg); | ||
3146 | |||
3147 | /* | ||
3148 | * Since per-memcg caches are created asynchronously on first | ||
3149 | * allocation (see memcg_kmem_get_cache()), several threads can try to | ||
3150 | * create the same cache, but only one of them may succeed. | ||
3151 | */ | ||
3152 | if (cache_from_memcg_idx(root_cache, id)) | ||
3166 | return; | 3153 | return; |
3167 | 3154 | ||
3155 | cgroup_name(memcg->css.cgroup, memcg_name_buf, NAME_MAX + 1); | ||
3156 | cachep = memcg_create_kmem_cache(memcg, root_cache, memcg_name_buf); | ||
3168 | /* | 3157 | /* |
3169 | * Holding the slab_mutex assures nobody will touch the memcg_caches | 3158 | * If we could not create a memcg cache, do not complain, because |
3170 | * array while we are modifying it. | 3159 | * that's not critical at all as we can always proceed with the root |
3160 | * cache. | ||
3171 | */ | 3161 | */ |
3172 | lockdep_assert_held(&slab_mutex); | 3162 | if (!cachep) |
3163 | return; | ||
3173 | 3164 | ||
3174 | root = s->memcg_params->root_cache; | 3165 | list_add(&cachep->memcg_params->list, &memcg->memcg_slab_caches); |
3175 | memcg = s->memcg_params->memcg; | ||
3176 | id = memcg_cache_id(memcg); | ||
3177 | 3166 | ||
3178 | /* | 3167 | /* |
3179 | * Since readers won't lock (see cache_from_memcg_idx()), we need a | 3168 | * Since readers won't lock (see cache_from_memcg_idx()), we need a |
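The new memcg_register_cache() runs entirely under memcg_slab_mutex, uses a static name buffer that the same mutex protects, and treats both a lost creation race and a failed creation as harmless no-ops. A compact pthread sketch of that registration pattern; the names are illustrative, with only the "%s(%d:%s)" name format taken from the removed memcg_create_cache_name():

    #include <pthread.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    #define MAX_GROUPS 4

    struct cache { char name[64]; };

    static struct cache *per_group_cache[MAX_GROUPS];
    static pthread_mutex_t slab_mutex = PTHREAD_MUTEX_INITIALIZER;
    static char name_buf[64];               /* shared scratch buffer, guarded by slab_mutex */

    /* register a per-group clone of "base"; racing or repeated callers are no-ops */
    static void register_group_cache(int id, const char *base, const char *group)
    {
        struct cache *c;

        pthread_mutex_lock(&slab_mutex);
        if (per_group_cache[id])            /* someone else already registered it */
            goto out;

        snprintf(name_buf, sizeof(name_buf), "%s(%d:%s)", base, id, group);
        c = malloc(sizeof(*c));
        if (!c)                             /* creation failure is not fatal: keep using base */
            goto out;
        strcpy(c->name, name_buf);
        per_group_cache[id] = c;
    out:
        pthread_mutex_unlock(&slab_mutex);
    }

    int main(void)                          /* build with: cc -pthread example.c */
    {
        register_group_cache(1, "dentry", "web");
        register_group_cache(1, "dentry", "web");   /* duplicate attempt: ignored */
        printf("%s\n", per_group_cache[1]->name);   /* dentry(1:web) */
        return 0;
    }

Serialising creation and teardown on one mutex is what lets the patch drop the per-memcg slab_caches_mutex and the shared-buffer dance that previously piggybacked on slab_mutex.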
@@ -3182,49 +3171,30 @@ void memcg_register_cache(struct kmem_cache *s) | |||
3182 | */ | 3171 | */ |
3183 | smp_wmb(); | 3172 | smp_wmb(); |
3184 | 3173 | ||
3185 | /* | 3174 | BUG_ON(root_cache->memcg_params->memcg_caches[id]); |
3186 | * Initialize the pointer to this cache in its parent's memcg_params | 3175 | root_cache->memcg_params->memcg_caches[id] = cachep; |
3187 | * before adding it to the memcg_slab_caches list, otherwise we can | ||
3188 | * fail to convert memcg_params_to_cache() while traversing the list. | ||
3189 | */ | ||
3190 | VM_BUG_ON(root->memcg_params->memcg_caches[id]); | ||
3191 | root->memcg_params->memcg_caches[id] = s; | ||
3192 | |||
3193 | mutex_lock(&memcg->slab_caches_mutex); | ||
3194 | list_add(&s->memcg_params->list, &memcg->memcg_slab_caches); | ||
3195 | mutex_unlock(&memcg->slab_caches_mutex); | ||
3196 | } | 3176 | } |
3197 | 3177 | ||
3198 | void memcg_unregister_cache(struct kmem_cache *s) | 3178 | static void memcg_unregister_cache(struct kmem_cache *cachep) |
3199 | { | 3179 | { |
3200 | struct kmem_cache *root; | 3180 | struct kmem_cache *root_cache; |
3201 | struct mem_cgroup *memcg; | 3181 | struct mem_cgroup *memcg; |
3202 | int id; | 3182 | int id; |
3203 | 3183 | ||
3204 | if (is_root_cache(s)) | 3184 | lockdep_assert_held(&memcg_slab_mutex); |
3205 | return; | ||
3206 | 3185 | ||
3207 | /* | 3186 | BUG_ON(is_root_cache(cachep)); |
3208 | * Holding the slab_mutex assures nobody will touch the memcg_caches | ||
3209 | * array while we are modifying it. | ||
3210 | */ | ||
3211 | lockdep_assert_held(&slab_mutex); | ||
3212 | 3187 | ||
3213 | root = s->memcg_params->root_cache; | 3188 | root_cache = cachep->memcg_params->root_cache; |
3214 | memcg = s->memcg_params->memcg; | 3189 | memcg = cachep->memcg_params->memcg; |
3215 | id = memcg_cache_id(memcg); | 3190 | id = memcg_cache_id(memcg); |
3216 | 3191 | ||
3217 | mutex_lock(&memcg->slab_caches_mutex); | 3192 | BUG_ON(root_cache->memcg_params->memcg_caches[id] != cachep); |
3218 | list_del(&s->memcg_params->list); | 3193 | root_cache->memcg_params->memcg_caches[id] = NULL; |
3219 | mutex_unlock(&memcg->slab_caches_mutex); | ||
3220 | 3194 | ||
3221 | /* | 3195 | list_del(&cachep->memcg_params->list); |
3222 | * Clear the pointer to this cache in its parent's memcg_params only | 3196 | |
3223 | * after removing it from the memcg_slab_caches list, otherwise we can | 3197 | kmem_cache_destroy(cachep); |
3224 | * fail to convert memcg_params_to_cache() while traversing the list. | ||
3225 | */ | ||
3226 | VM_BUG_ON(root->memcg_params->memcg_caches[id] != s); | ||
3227 | root->memcg_params->memcg_caches[id] = NULL; | ||
3228 | } | 3198 | } |
3229 | 3199 | ||
3230 | /* | 3200 | /* |
@@ -3258,144 +3228,61 @@ static inline void memcg_resume_kmem_account(void) | |||
3258 | current->memcg_kmem_skip_account--; | 3228 | current->memcg_kmem_skip_account--; |
3259 | } | 3229 | } |
3260 | 3230 | ||
3261 | static void kmem_cache_destroy_work_func(struct work_struct *w) | 3231 | int __memcg_cleanup_cache_params(struct kmem_cache *s) |
3262 | { | ||
3263 | struct kmem_cache *cachep; | ||
3264 | struct memcg_cache_params *p; | ||
3265 | |||
3266 | p = container_of(w, struct memcg_cache_params, destroy); | ||
3267 | |||
3268 | cachep = memcg_params_to_cache(p); | ||
3269 | |||
3270 | /* | ||
3271 | * If we get down to 0 after shrink, we could delete right away. | ||
3272 | * However, memcg_release_pages() already puts us back in the workqueue | ||
3273 | * in that case. If we proceed deleting, we'll get a dangling | ||
3274 | * reference, and removing the object from the workqueue in that case | ||
3275 | * is unnecessary complication. We are not a fast path. | ||
3276 | * | ||
3277 | * Note that this case is fundamentally different from racing with | ||
3278 | * shrink_slab(): if memcg_cgroup_destroy_cache() is called in | ||
3279 | * kmem_cache_shrink, not only we would be reinserting a dead cache | ||
3280 | * into the queue, but doing so from inside the worker racing to | ||
3281 | * destroy it. | ||
3282 | * | ||
3283 | * So if we aren't down to zero, we'll just schedule a worker and try | ||
3284 | * again | ||
3285 | */ | ||
3286 | if (atomic_read(&cachep->memcg_params->nr_pages) != 0) | ||
3287 | kmem_cache_shrink(cachep); | ||
3288 | else | ||
3289 | kmem_cache_destroy(cachep); | ||
3290 | } | ||
3291 | |||
3292 | void mem_cgroup_destroy_cache(struct kmem_cache *cachep) | ||
3293 | { | ||
3294 | if (!cachep->memcg_params->dead) | ||
3295 | return; | ||
3296 | |||
3297 | /* | ||
3298 | * There are many ways in which we can get here. | ||
3299 | * | ||
3300 | * We can get to a memory-pressure situation while the delayed work is | ||
3301 | * still pending to run. The vmscan shrinkers can then release all | ||
3302 | * cache memory and get us to destruction. If this is the case, we'll | ||
3303 | * be executed twice, which is a bug (the second time will execute over | ||
3304 | * bogus data). In this case, cancelling the work should be fine. | ||
3305 | * | ||
3306 | * But we can also get here from the worker itself, if | ||
3307 | * kmem_cache_shrink is enough to shake all the remaining objects and | ||
3308 | * get the page count to 0. In this case, we'll deadlock if we try to | ||
3309 | * cancel the work (the worker runs with an internal lock held, which | ||
3310 | * is the same lock we would hold for cancel_work_sync().) | ||
3311 | * | ||
3312 | * Since we can't possibly know who got us here, just refrain from | ||
3313 | * running if there is already work pending | ||
3314 | */ | ||
3315 | if (work_pending(&cachep->memcg_params->destroy)) | ||
3316 | return; | ||
3317 | /* | ||
3318 | * We have to defer the actual destroying to a workqueue, because | ||
3319 | * we might currently be in a context that cannot sleep. | ||
3320 | */ | ||
3321 | schedule_work(&cachep->memcg_params->destroy); | ||
3322 | } | ||
3323 | |||
3324 | int __kmem_cache_destroy_memcg_children(struct kmem_cache *s) | ||
3325 | { | 3232 | { |
3326 | struct kmem_cache *c; | 3233 | struct kmem_cache *c; |
3327 | int i, failed = 0; | 3234 | int i, failed = 0; |
3328 | 3235 | ||
3329 | /* | 3236 | mutex_lock(&memcg_slab_mutex); |
3330 | * If the cache is being destroyed, we trust that there is no one else | ||
3331 | * requesting objects from it. Even if there are, the sanity checks in | ||
3332 | * kmem_cache_destroy should have caught this ill case. | ||
3333 | * | ||
3334 | * Still, we don't want anyone else freeing memcg_caches under our | ||
3335 | * noses, which can happen if a new memcg comes to life. As usual, | ||
3336 | * we'll take the activate_kmem_mutex to protect ourselves against | ||
3337 | * this. | ||
3338 | */ | ||
3339 | mutex_lock(&activate_kmem_mutex); | ||
3340 | for_each_memcg_cache_index(i) { | 3237 | for_each_memcg_cache_index(i) { |
3341 | c = cache_from_memcg_idx(s, i); | 3238 | c = cache_from_memcg_idx(s, i); |
3342 | if (!c) | 3239 | if (!c) |
3343 | continue; | 3240 | continue; |
3344 | 3241 | ||
3345 | /* | 3242 | memcg_unregister_cache(c); |
3346 | * We will now manually delete the caches, so to avoid races | ||
3347 | * we need to cancel all pending destruction workers and | ||
3348 | * proceed with destruction ourselves. | ||
3349 | * | ||
3350 | * kmem_cache_destroy() will call kmem_cache_shrink internally, | ||
3351 | * and that could spawn the workers again: it is likely that | ||
3352 | * the cache still has active pages until this very moment. | ||
3353 | * This would lead us back to mem_cgroup_destroy_cache. | ||
3354 | * | ||
3355 | * But that will not execute at all if the "dead" flag is not | ||
3356 | * set, so flip it down to guarantee we are in control. | ||
3357 | */ | ||
3358 | c->memcg_params->dead = false; | ||
3359 | cancel_work_sync(&c->memcg_params->destroy); | ||
3360 | kmem_cache_destroy(c); | ||
3361 | 3243 | ||
3362 | if (cache_from_memcg_idx(s, i)) | 3244 | if (cache_from_memcg_idx(s, i)) |
3363 | failed++; | 3245 | failed++; |
3364 | } | 3246 | } |
3365 | mutex_unlock(&activate_kmem_mutex); | 3247 | mutex_unlock(&memcg_slab_mutex); |
3366 | return failed; | 3248 | return failed; |
3367 | } | 3249 | } |
3368 | 3250 | ||
3369 | static void mem_cgroup_destroy_all_caches(struct mem_cgroup *memcg) | 3251 | static void memcg_unregister_all_caches(struct mem_cgroup *memcg) |
3370 | { | 3252 | { |
3371 | struct kmem_cache *cachep; | 3253 | struct kmem_cache *cachep; |
3372 | struct memcg_cache_params *params; | 3254 | struct memcg_cache_params *params, *tmp; |
3373 | 3255 | ||
3374 | if (!memcg_kmem_is_active(memcg)) | 3256 | if (!memcg_kmem_is_active(memcg)) |
3375 | return; | 3257 | return; |
3376 | 3258 | ||
3377 | mutex_lock(&memcg->slab_caches_mutex); | 3259 | mutex_lock(&memcg_slab_mutex); |
3378 | list_for_each_entry(params, &memcg->memcg_slab_caches, list) { | 3260 | list_for_each_entry_safe(params, tmp, &memcg->memcg_slab_caches, list) { |
3379 | cachep = memcg_params_to_cache(params); | 3261 | cachep = memcg_params_to_cache(params); |
3380 | cachep->memcg_params->dead = true; | 3262 | kmem_cache_shrink(cachep); |
3381 | schedule_work(&cachep->memcg_params->destroy); | 3263 | if (atomic_read(&cachep->memcg_params->nr_pages) == 0) |
3264 | memcg_unregister_cache(cachep); | ||
3382 | } | 3265 | } |
3383 | mutex_unlock(&memcg->slab_caches_mutex); | 3266 | mutex_unlock(&memcg_slab_mutex); |
3384 | } | 3267 | } |
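
memcg_unregister_all_caches() switches to list_for_each_entry_safe() because memcg_unregister_cache() may delete the very entry that is currently being visited. A small self-contained C sketch of why the "safe" form, which saves the next node before the current one is touched, is needed when freeing during a walk; struct node and destroy_all() are illustrative, not the kernel's list helpers.

#include <stdlib.h>

struct node {
        int val;
        struct node *next;
};

/*
 * Free every node while walking the list.  The next pointer must be
 * saved *before* the current node is freed -- this extra cursor is
 * exactly what list_for_each_entry_safe() provides in the kernel.
 */
static void destroy_all(struct node **head)
{
        struct node *n = *head, *tmp;

        while (n) {
                tmp = n->next;        /* safe cursor, taken before free() */
                free(n);
                n = tmp;
        }
        *head = NULL;
}

int main(void)
{
        struct node *head = NULL;

        for (int i = 0; i < 4; i++) {
                struct node *n = malloc(sizeof(*n));
                n->val = i;
                n->next = head;
                head = n;
        }
        destroy_all(&head);
        return 0;
}
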
3385 | 3268 | ||
3386 | struct create_work { | 3269 | struct memcg_register_cache_work { |
3387 | struct mem_cgroup *memcg; | 3270 | struct mem_cgroup *memcg; |
3388 | struct kmem_cache *cachep; | 3271 | struct kmem_cache *cachep; |
3389 | struct work_struct work; | 3272 | struct work_struct work; |
3390 | }; | 3273 | }; |
3391 | 3274 | ||
3392 | static void memcg_create_cache_work_func(struct work_struct *w) | 3275 | static void memcg_register_cache_func(struct work_struct *w) |
3393 | { | 3276 | { |
3394 | struct create_work *cw = container_of(w, struct create_work, work); | 3277 | struct memcg_register_cache_work *cw = |
3278 | container_of(w, struct memcg_register_cache_work, work); | ||
3395 | struct mem_cgroup *memcg = cw->memcg; | 3279 | struct mem_cgroup *memcg = cw->memcg; |
3396 | struct kmem_cache *cachep = cw->cachep; | 3280 | struct kmem_cache *cachep = cw->cachep; |
3397 | 3281 | ||
3398 | kmem_cache_create_memcg(memcg, cachep); | 3282 | mutex_lock(&memcg_slab_mutex); |
3283 | memcg_register_cache(memcg, cachep); | ||
3284 | mutex_unlock(&memcg_slab_mutex); | ||
3285 | |||
3399 | css_put(&memcg->css); | 3286 | css_put(&memcg->css); |
3400 | kfree(cw); | 3287 | kfree(cw); |
3401 | } | 3288 | } |
@@ -3403,12 +3290,12 @@ static void memcg_create_cache_work_func(struct work_struct *w) | |||
3403 | /* | 3290 | /* |
3404 | * Enqueue the creation of a per-memcg kmem_cache. | 3291 | * Enqueue the creation of a per-memcg kmem_cache. |
3405 | */ | 3292 | */ |
3406 | static void __memcg_create_cache_enqueue(struct mem_cgroup *memcg, | 3293 | static void __memcg_schedule_register_cache(struct mem_cgroup *memcg, |
3407 | struct kmem_cache *cachep) | 3294 | struct kmem_cache *cachep) |
3408 | { | 3295 | { |
3409 | struct create_work *cw; | 3296 | struct memcg_register_cache_work *cw; |
3410 | 3297 | ||
3411 | cw = kmalloc(sizeof(struct create_work), GFP_NOWAIT); | 3298 | cw = kmalloc(sizeof(*cw), GFP_NOWAIT); |
3412 | if (cw == NULL) { | 3299 | if (cw == NULL) { |
3413 | css_put(&memcg->css); | 3300 | css_put(&memcg->css); |
3414 | return; | 3301 | return; |
@@ -3417,17 +3304,17 @@ static void __memcg_create_cache_enqueue(struct mem_cgroup *memcg, | |||
3417 | cw->memcg = memcg; | 3304 | cw->memcg = memcg; |
3418 | cw->cachep = cachep; | 3305 | cw->cachep = cachep; |
3419 | 3306 | ||
3420 | INIT_WORK(&cw->work, memcg_create_cache_work_func); | 3307 | INIT_WORK(&cw->work, memcg_register_cache_func); |
3421 | schedule_work(&cw->work); | 3308 | schedule_work(&cw->work); |
3422 | } | 3309 | } |
3423 | 3310 | ||
3424 | static void memcg_create_cache_enqueue(struct mem_cgroup *memcg, | 3311 | static void memcg_schedule_register_cache(struct mem_cgroup *memcg, |
3425 | struct kmem_cache *cachep) | 3312 | struct kmem_cache *cachep) |
3426 | { | 3313 | { |
3427 | /* | 3314 | /* |
3428 | * We need to stop accounting when we kmalloc, because if the | 3315 | * We need to stop accounting when we kmalloc, because if the |
3429 | * corresponding kmalloc cache is not yet created, the first allocation | 3316 | * corresponding kmalloc cache is not yet created, the first allocation |
3430 | * in __memcg_create_cache_enqueue will recurse. | 3317 | * in __memcg_schedule_register_cache will recurse. |
3431 | * | 3318 | * |
3432 | * However, it is better to enclose the whole function. Depending on | 3319 | * However, it is better to enclose the whole function. Depending on |
3433 | * the debugging options enabled, INIT_WORK(), for instance, can | 3320 | * the debugging options enabled, INIT_WORK(), for instance, can |
@@ -3436,9 +3323,27 @@ static void memcg_create_cache_enqueue(struct mem_cgroup *memcg, | |||
3436 | * the safest choice is to do it like this, wrapping the whole function. | 3323 | * the safest choice is to do it like this, wrapping the whole function. |
3437 | */ | 3324 | */ |
3438 | memcg_stop_kmem_account(); | 3325 | memcg_stop_kmem_account(); |
3439 | __memcg_create_cache_enqueue(memcg, cachep); | 3326 | __memcg_schedule_register_cache(memcg, cachep); |
3440 | memcg_resume_kmem_account(); | 3327 | memcg_resume_kmem_account(); |
3441 | } | 3328 | } |
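
memcg_schedule_register_cache() cannot do the registration inline (it is reached from the allocation path with kmem accounting suspended), so the arguments are packed into a small GFP_NOWAIT-allocated work item and handed to a workqueue, as memcg_register_cache_work does above. A hedged kernel-style sketch of that defer-to-workqueue shape follows; deferred_ctx and do_heavy_setup() are hypothetical, while struct work_struct, INIT_WORK(), schedule_work() and container_of() are the real interfaces being illustrated (a real module would need <linux/workqueue.h> and <linux/slab.h>).

struct deferred_ctx {                         /* hypothetical payload */
        int arg;
        struct work_struct work;
};

static void deferred_func(struct work_struct *w)
{
        struct deferred_ctx *ctx = container_of(w, struct deferred_ctx, work);

        do_heavy_setup(ctx->arg);             /* hypothetical helper; may sleep here */
        kfree(ctx);
}

static void schedule_deferred(int arg)
{
        struct deferred_ctx *ctx = kmalloc(sizeof(*ctx), GFP_NOWAIT);

        if (!ctx)
                return;                       /* best effort, as in the memcg path */
        ctx->arg = arg;
        INIT_WORK(&ctx->work, deferred_func);
        schedule_work(&ctx->work);            /* runs later from a kworker thread */
}
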
3329 | |||
3330 | int __memcg_charge_slab(struct kmem_cache *cachep, gfp_t gfp, int order) | ||
3331 | { | ||
3332 | int res; | ||
3333 | |||
3334 | res = memcg_charge_kmem(cachep->memcg_params->memcg, gfp, | ||
3335 | PAGE_SIZE << order); | ||
3336 | if (!res) | ||
3337 | atomic_add(1 << order, &cachep->memcg_params->nr_pages); | ||
3338 | return res; | ||
3339 | } | ||
3340 | |||
3341 | void __memcg_uncharge_slab(struct kmem_cache *cachep, int order) | ||
3342 | { | ||
3343 | memcg_uncharge_kmem(cachep->memcg_params->memcg, PAGE_SIZE << order); | ||
3344 | atomic_sub(1 << order, &cachep->memcg_params->nr_pages); | ||
3345 | } | ||
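
The new __memcg_charge_slab()/__memcg_uncharge_slab() pair above keeps the per-cache nr_pages counter in lock-step with the charge: 1 << order pages are added only after a successful charge and subtracted again on uncharge, so a counter of zero means the memcg copy owns no slab pages. A rough userspace sketch of that bookkeeping with C11 atomics; try_charge() is a stand-in for memcg_charge_kmem() and simply always succeeds here.

#include <stdatomic.h>
#include <stdbool.h>

#define PAGE_SIZE 4096UL

static atomic_long nr_pages;                  /* pages currently charged to this cache */

static bool try_charge(unsigned long bytes)   /* stand-in for memcg_charge_kmem() */
{
        (void)bytes;
        return true;                          /* pretend the charge always fits */
}

static int charge_slab(int order)
{
        if (!try_charge(PAGE_SIZE << order))
                return -1;                    /* nothing accounted, nothing to undo */
        atomic_fetch_add(&nr_pages, 1L << order);
        return 0;
}

static void uncharge_slab(int order)
{
        atomic_fetch_sub(&nr_pages, 1L << order);
}

int main(void)
{
        if (charge_slab(1) == 0)              /* charge and release 2 pages */
                uncharge_slab(1);
        return 0;
}
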
3346 | |||
3442 | /* | 3347 | /* |
3443 | * Return the kmem_cache we're supposed to use for a slab allocation. | 3348 | * Return the kmem_cache we're supposed to use for a slab allocation. |
3444 | * We try to use the current memcg's version of the cache. | 3349 | * We try to use the current memcg's version of the cache. |
@@ -3489,22 +3394,16 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep, | |||
3489 | * | 3394 | * |
3490 | * However, there are some clashes that can arrive from locking. | 3395 | * However, there are some clashes that can arrive from locking. |
3491 | * For instance, because we acquire the slab_mutex while doing | 3396 | * For instance, because we acquire the slab_mutex while doing |
3492 | * kmem_cache_dup, this means no further allocation could happen | 3397 | * memcg_create_kmem_cache, this means no further allocation |
3493 | * with the slab_mutex held. | 3398 | * could happen with the slab_mutex held. So it's better to |
3494 | * | 3399 | * defer everything. |
3495 | * Also, because cache creation issue get_online_cpus(), this | ||
3496 | * creates a lock chain: memcg_slab_mutex -> cpu_hotplug_mutex, | ||
3497 | * that ends up reversed during cpu hotplug. (cpuset allocates | ||
3498 | * a bunch of GFP_KERNEL memory during cpuup). Due to all that, | ||
3499 | * better to defer everything. | ||
3500 | */ | 3400 | */ |
3501 | memcg_create_cache_enqueue(memcg, cachep); | 3401 | memcg_schedule_register_cache(memcg, cachep); |
3502 | return cachep; | 3402 | return cachep; |
3503 | out: | 3403 | out: |
3504 | rcu_read_unlock(); | 3404 | rcu_read_unlock(); |
3505 | return cachep; | 3405 | return cachep; |
3506 | } | 3406 | } |
3507 | EXPORT_SYMBOL(__memcg_kmem_get_cache); | ||
3508 | 3407 | ||
3509 | /* | 3408 | /* |
3510 | * We need to verify if the allocation against current->mm->owner's memcg is | 3409 | * We need to verify if the allocation against current->mm->owner's memcg is |
@@ -3531,11 +3430,12 @@ __memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **_memcg, int order) | |||
3531 | /* | 3430 | /* |
3532 | * Disabling accounting is only relevant for some specific memcg | 3431 | * Disabling accounting is only relevant for some specific memcg |
3533 | * internal allocations. Therefore we would initially not have such | 3432 | * internal allocations. Therefore we would initially not have such |
3534 | * check here, since direct calls to the page allocator that are marked | 3433 | * check here, since direct calls to the page allocator that are |
3535 | * with GFP_KMEMCG only happen outside memcg core. We are mostly | 3434 | * accounted to kmemcg (alloc_kmem_pages and friends) only happen |
3536 | * concerned with cache allocations, and by having this test at | 3435 | * outside memcg core. We are mostly concerned with cache allocations, |
3537 | * memcg_kmem_get_cache, we are already able to relay the allocation to | 3436 | * and by having this test at memcg_kmem_get_cache, we are already able |
3538 | * the root cache and bypass the memcg cache altogether. | 3437 | * to relay the allocation to the root cache and bypass the memcg cache |
3438 | * altogether. | ||
3539 | * | 3439 | * |
3540 | * There is one exception, though: the SLUB allocator does not create | 3440 | * There is one exception, though: the SLUB allocator does not create |
3541 | * large order caches, but rather service large kmallocs directly from | 3441 | * large order caches, but rather service large kmallocs directly from |
@@ -3622,7 +3522,7 @@ void __memcg_kmem_uncharge_pages(struct page *page, int order) | |||
3622 | memcg_uncharge_kmem(memcg, PAGE_SIZE << order); | 3522 | memcg_uncharge_kmem(memcg, PAGE_SIZE << order); |
3623 | } | 3523 | } |
3624 | #else | 3524 | #else |
3625 | static inline void mem_cgroup_destroy_all_caches(struct mem_cgroup *memcg) | 3525 | static inline void memcg_unregister_all_caches(struct mem_cgroup *memcg) |
3626 | { | 3526 | { |
3627 | } | 3527 | } |
3628 | #endif /* CONFIG_MEMCG_KMEM */ | 3528 | #endif /* CONFIG_MEMCG_KMEM */ |
@@ -3958,17 +3858,9 @@ int mem_cgroup_charge_file(struct page *page, struct mm_struct *mm, | |||
3958 | return 0; | 3858 | return 0; |
3959 | } | 3859 | } |
3960 | 3860 | ||
3961 | /* | 3861 | memcg = mem_cgroup_try_charge_mm(mm, gfp_mask, 1, true); |
3962 | * Page cache insertions can happen without an actual mm | 3862 | if (!memcg) |
3963 | * context, e.g. during disk probing on boot. | 3863 | return -ENOMEM; |
3964 | */ | ||
3965 | if (unlikely(!mm)) | ||
3966 | memcg = root_mem_cgroup; | ||
3967 | else { | ||
3968 | memcg = mem_cgroup_try_charge_mm(mm, gfp_mask, 1, true); | ||
3969 | if (!memcg) | ||
3970 | return -ENOMEM; | ||
3971 | } | ||
3972 | __mem_cgroup_commit_charge(memcg, page, 1, type, false); | 3864 | __mem_cgroup_commit_charge(memcg, page, 1, type, false); |
3973 | return 0; | 3865 | return 0; |
3974 | } | 3866 | } |
@@ -4783,9 +4675,9 @@ static void mem_cgroup_force_empty_list(struct mem_cgroup *memcg, | |||
4783 | if (mem_cgroup_move_parent(page, pc, memcg)) { | 4675 | if (mem_cgroup_move_parent(page, pc, memcg)) { |
4784 | /* found lock contention or "pc" is obsolete. */ | 4676 | /* found lock contention or "pc" is obsolete. */ |
4785 | busy = page; | 4677 | busy = page; |
4786 | cond_resched(); | ||
4787 | } else | 4678 | } else |
4788 | busy = NULL; | 4679 | busy = NULL; |
4680 | cond_resched(); | ||
4789 | } while (!list_empty(list)); | 4681 | } while (!list_empty(list)); |
4790 | } | 4682 | } |
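
The hunk above moves cond_resched() out of the "moved successfully" branch so the loop yields on every pass, including when it keeps retrying a busy page. A minimal sketch of the intended shape, with process_one() and requeue_tail() as hypothetical helpers:

        do {
                if (!process_one(list))       /* hypothetical: may fail on a busy item */
                        requeue_tail(list);   /* hypothetical: retry it later */
                cond_resched();               /* yield every iteration, even with no progress */
        } while (!list_empty(list));
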
4791 | 4683 | ||
@@ -5061,13 +4953,14 @@ static int __memcg_activate_kmem(struct mem_cgroup *memcg, | |||
5061 | * Make sure we have enough space for this cgroup in each root cache's | 4953 | * Make sure we have enough space for this cgroup in each root cache's |
5062 | * memcg_params. | 4954 | * memcg_params. |
5063 | */ | 4955 | */ |
4956 | mutex_lock(&memcg_slab_mutex); | ||
5064 | err = memcg_update_all_caches(memcg_id + 1); | 4957 | err = memcg_update_all_caches(memcg_id + 1); |
4958 | mutex_unlock(&memcg_slab_mutex); | ||
5065 | if (err) | 4959 | if (err) |
5066 | goto out_rmid; | 4960 | goto out_rmid; |
5067 | 4961 | ||
5068 | memcg->kmemcg_id = memcg_id; | 4962 | memcg->kmemcg_id = memcg_id; |
5069 | INIT_LIST_HEAD(&memcg->memcg_slab_caches); | 4963 | INIT_LIST_HEAD(&memcg->memcg_slab_caches); |
5070 | mutex_init(&memcg->slab_caches_mutex); | ||
5071 | 4964 | ||
5072 | /* | 4965 | /* |
5073 | * We couldn't have accounted to this cgroup, because it hasn't got the | 4966 | * We couldn't have accounted to this cgroup, because it hasn't got the |
@@ -5442,22 +5335,14 @@ static int mem_cgroup_swappiness_write(struct cgroup_subsys_state *css, | |||
5442 | struct cftype *cft, u64 val) | 5335 | struct cftype *cft, u64 val) |
5443 | { | 5336 | { |
5444 | struct mem_cgroup *memcg = mem_cgroup_from_css(css); | 5337 | struct mem_cgroup *memcg = mem_cgroup_from_css(css); |
5445 | struct mem_cgroup *parent = mem_cgroup_from_css(css_parent(&memcg->css)); | ||
5446 | 5338 | ||
5447 | if (val > 100 || !parent) | 5339 | if (val > 100) |
5448 | return -EINVAL; | 5340 | return -EINVAL; |
5449 | 5341 | ||
5450 | mutex_lock(&memcg_create_mutex); | 5342 | if (css_parent(css)) |
5451 | 5343 | memcg->swappiness = val; | |
5452 | /* If under hierarchy, only empty-root can set this value */ | 5344 | else |
5453 | if ((parent->use_hierarchy) || memcg_has_children(memcg)) { | 5345 | vm_swappiness = val; |
5454 | mutex_unlock(&memcg_create_mutex); | ||
5455 | return -EINVAL; | ||
5456 | } | ||
5457 | |||
5458 | memcg->swappiness = val; | ||
5459 | |||
5460 | mutex_unlock(&memcg_create_mutex); | ||
5461 | 5346 | ||
5462 | return 0; | 5347 | return 0; |
5463 | } | 5348 | } |
@@ -5789,22 +5674,15 @@ static int mem_cgroup_oom_control_write(struct cgroup_subsys_state *css, | |||
5789 | struct cftype *cft, u64 val) | 5674 | struct cftype *cft, u64 val) |
5790 | { | 5675 | { |
5791 | struct mem_cgroup *memcg = mem_cgroup_from_css(css); | 5676 | struct mem_cgroup *memcg = mem_cgroup_from_css(css); |
5792 | struct mem_cgroup *parent = mem_cgroup_from_css(css_parent(&memcg->css)); | ||
5793 | 5677 | ||
5794 | /* cannot set to root cgroup and only 0 and 1 are allowed */ | 5678 | /* cannot set to root cgroup and only 0 and 1 are allowed */ |
5795 | if (!parent || !((val == 0) || (val == 1))) | 5679 | if (!css_parent(css) || !((val == 0) || (val == 1))) |
5796 | return -EINVAL; | 5680 | return -EINVAL; |
5797 | 5681 | ||
5798 | mutex_lock(&memcg_create_mutex); | ||
5799 | /* oom-kill-disable is a flag for subhierarchy. */ | ||
5800 | if ((parent->use_hierarchy) || memcg_has_children(memcg)) { | ||
5801 | mutex_unlock(&memcg_create_mutex); | ||
5802 | return -EINVAL; | ||
5803 | } | ||
5804 | memcg->oom_kill_disable = val; | 5682 | memcg->oom_kill_disable = val; |
5805 | if (!val) | 5683 | if (!val) |
5806 | memcg_oom_recover(memcg); | 5684 | memcg_oom_recover(memcg); |
5807 | mutex_unlock(&memcg_create_mutex); | 5685 | |
5808 | return 0; | 5686 | return 0; |
5809 | } | 5687 | } |
5810 | 5688 | ||
@@ -6490,7 +6368,7 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) | |||
6490 | css_for_each_descendant_post(iter, css) | 6368 | css_for_each_descendant_post(iter, css) |
6491 | mem_cgroup_reparent_charges(mem_cgroup_from_css(iter)); | 6369 | mem_cgroup_reparent_charges(mem_cgroup_from_css(iter)); |
6492 | 6370 | ||
6493 | mem_cgroup_destroy_all_caches(memcg); | 6371 | memcg_unregister_all_caches(memcg); |
6494 | vmpressure_cleanup(&memcg->vmpressure); | 6372 | vmpressure_cleanup(&memcg->vmpressure); |
6495 | } | 6373 | } |
6496 | 6374 | ||
diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 35ef28acf137..cd8989c1027e 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c | |||
@@ -204,9 +204,9 @@ static int kill_proc(struct task_struct *t, unsigned long addr, int trapno, | |||
204 | #endif | 204 | #endif |
205 | si.si_addr_lsb = compound_order(compound_head(page)) + PAGE_SHIFT; | 205 | si.si_addr_lsb = compound_order(compound_head(page)) + PAGE_SHIFT; |
206 | 206 | ||
207 | if ((flags & MF_ACTION_REQUIRED) && t == current) { | 207 | if ((flags & MF_ACTION_REQUIRED) && t->mm == current->mm) { |
208 | si.si_code = BUS_MCEERR_AR; | 208 | si.si_code = BUS_MCEERR_AR; |
209 | ret = force_sig_info(SIGBUS, &si, t); | 209 | ret = force_sig_info(SIGBUS, &si, current); |
210 | } else { | 210 | } else { |
211 | /* | 211 | /* |
212 | * Don't use force here, it's convenient if the signal | 212 | * Don't use force here, it's convenient if the signal |
@@ -380,20 +380,51 @@ static void kill_procs(struct list_head *to_kill, int forcekill, int trapno, | |||
380 | } | 380 | } |
381 | } | 381 | } |
382 | 382 | ||
383 | static int task_early_kill(struct task_struct *tsk) | 383 | /* |
384 | * Find a dedicated thread which is supposed to handle SIGBUS(BUS_MCEERR_AO) | ||
385 | * on behalf of the thread group. Return task_struct of the (first found) | ||
386 | * dedicated thread if found, and return NULL otherwise. | ||
387 | * | ||
388 | * We already hold read_lock(&tasklist_lock) in the caller, so we don't | ||
389 | * have to call rcu_read_lock/unlock() in this function. | ||
390 | */ | ||
391 | static struct task_struct *find_early_kill_thread(struct task_struct *tsk) | ||
384 | { | 392 | { |
393 | struct task_struct *t; | ||
394 | |||
395 | for_each_thread(tsk, t) | ||
396 | if ((t->flags & PF_MCE_PROCESS) && (t->flags & PF_MCE_EARLY)) | ||
397 | return t; | ||
398 | return NULL; | ||
399 | } | ||
400 | |||
401 | /* | ||
402 | * Determine whether a given process is "early kill" process which expects | ||
403 | * to be signaled when some page under the process is hwpoisoned. | ||
404 | * Return task_struct of the dedicated thread (main thread unless explicitly | ||
405 | * specified) if the process is "early kill," and otherwise returns NULL. | ||
406 | */ | ||
407 | static struct task_struct *task_early_kill(struct task_struct *tsk, | ||
408 | int force_early) | ||
409 | { | ||
410 | struct task_struct *t; | ||
385 | if (!tsk->mm) | 411 | if (!tsk->mm) |
386 | return 0; | 412 | return NULL; |
387 | if (tsk->flags & PF_MCE_PROCESS) | 413 | if (force_early) |
388 | return !!(tsk->flags & PF_MCE_EARLY); | 414 | return tsk; |
389 | return sysctl_memory_failure_early_kill; | 415 | t = find_early_kill_thread(tsk); |
416 | if (t) | ||
417 | return t; | ||
418 | if (sysctl_memory_failure_early_kill) | ||
419 | return tsk; | ||
420 | return NULL; | ||
390 | } | 421 | } |
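
task_early_kill() now returns a task_struct rather than a boolean: the dedicated sibling thread flagged with both PF_MCE_PROCESS and PF_MCE_EARLY if one exists, the process itself when the action is required or the early-kill sysctl is set, and NULL otherwise. A simplified userspace model of that selection order follows; the struct, the flag values and pick_victim() are illustrative only.

#include <stddef.h>

#define PF_MCE_PROCESS 0x1                    /* illustrative values, not the kernel's */
#define PF_MCE_EARLY   0x2

struct task {                                 /* toy stand-in for task_struct */
        unsigned int flags;
        struct task *next_thread;             /* NULL-terminated thread list here */
};

static int sysctl_early_kill;

static struct task *pick_victim(struct task *group_leader, int force_early)
{
        struct task *t;

        if (force_early)                      /* MF_ACTION_REQUIRED: signal the process */
                return group_leader;
        for (t = group_leader; t; t = t->next_thread)
                if ((t->flags & PF_MCE_PROCESS) && (t->flags & PF_MCE_EARLY))
                        return t;             /* dedicated handler thread found */
        if (sysctl_early_kill)
                return group_leader;          /* global early kill: fall back to main thread */
        return NULL;                          /* not an early-kill process */
}
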
391 | 422 | ||
392 | /* | 423 | /* |
393 | * Collect processes when the error hit an anonymous page. | 424 | * Collect processes when the error hit an anonymous page. |
394 | */ | 425 | */ |
395 | static void collect_procs_anon(struct page *page, struct list_head *to_kill, | 426 | static void collect_procs_anon(struct page *page, struct list_head *to_kill, |
396 | struct to_kill **tkc) | 427 | struct to_kill **tkc, int force_early) |
397 | { | 428 | { |
398 | struct vm_area_struct *vma; | 429 | struct vm_area_struct *vma; |
399 | struct task_struct *tsk; | 430 | struct task_struct *tsk; |
@@ -408,16 +439,17 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill, | |||
408 | read_lock(&tasklist_lock); | 439 | read_lock(&tasklist_lock); |
409 | for_each_process (tsk) { | 440 | for_each_process (tsk) { |
410 | struct anon_vma_chain *vmac; | 441 | struct anon_vma_chain *vmac; |
442 | struct task_struct *t = task_early_kill(tsk, force_early); | ||
411 | 443 | ||
412 | if (!task_early_kill(tsk)) | 444 | if (!t) |
413 | continue; | 445 | continue; |
414 | anon_vma_interval_tree_foreach(vmac, &av->rb_root, | 446 | anon_vma_interval_tree_foreach(vmac, &av->rb_root, |
415 | pgoff, pgoff) { | 447 | pgoff, pgoff) { |
416 | vma = vmac->vma; | 448 | vma = vmac->vma; |
417 | if (!page_mapped_in_vma(page, vma)) | 449 | if (!page_mapped_in_vma(page, vma)) |
418 | continue; | 450 | continue; |
419 | if (vma->vm_mm == tsk->mm) | 451 | if (vma->vm_mm == t->mm) |
420 | add_to_kill(tsk, page, vma, to_kill, tkc); | 452 | add_to_kill(t, page, vma, to_kill, tkc); |
421 | } | 453 | } |
422 | } | 454 | } |
423 | read_unlock(&tasklist_lock); | 455 | read_unlock(&tasklist_lock); |
@@ -428,7 +460,7 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill, | |||
428 | * Collect processes when the error hit a file mapped page. | 460 | * Collect processes when the error hit a file mapped page. |
429 | */ | 461 | */ |
430 | static void collect_procs_file(struct page *page, struct list_head *to_kill, | 462 | static void collect_procs_file(struct page *page, struct list_head *to_kill, |
431 | struct to_kill **tkc) | 463 | struct to_kill **tkc, int force_early) |
432 | { | 464 | { |
433 | struct vm_area_struct *vma; | 465 | struct vm_area_struct *vma; |
434 | struct task_struct *tsk; | 466 | struct task_struct *tsk; |
@@ -438,10 +470,10 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill, | |||
438 | read_lock(&tasklist_lock); | 470 | read_lock(&tasklist_lock); |
439 | for_each_process(tsk) { | 471 | for_each_process(tsk) { |
440 | pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); | 472 | pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); |
473 | struct task_struct *t = task_early_kill(tsk, force_early); | ||
441 | 474 | ||
442 | if (!task_early_kill(tsk)) | 475 | if (!t) |
443 | continue; | 476 | continue; |
444 | |||
445 | vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, | 477 | vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, |
446 | pgoff) { | 478 | pgoff) { |
447 | /* | 479 | /* |
@@ -451,8 +483,8 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill, | |||
451 | * Assume applications who requested early kill want | 483 | * Assume applications who requested early kill want |
452 | * to be informed of all such data corruptions. | 484 | * to be informed of all such data corruptions. |
453 | */ | 485 | */ |
454 | if (vma->vm_mm == tsk->mm) | 486 | if (vma->vm_mm == t->mm) |
455 | add_to_kill(tsk, page, vma, to_kill, tkc); | 487 | add_to_kill(t, page, vma, to_kill, tkc); |
456 | } | 488 | } |
457 | } | 489 | } |
458 | read_unlock(&tasklist_lock); | 490 | read_unlock(&tasklist_lock); |
@@ -465,7 +497,8 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill, | |||
465 | * First preallocate one tokill structure outside the spin locks, | 497 | * First preallocate one tokill structure outside the spin locks, |
466 | * so that we can kill at least one process reasonably reliably. | 498 | * so that we can kill at least one process reasonably reliably. |
467 | */ | 499 | */ |
468 | static void collect_procs(struct page *page, struct list_head *tokill) | 500 | static void collect_procs(struct page *page, struct list_head *tokill, |
501 | int force_early) | ||
469 | { | 502 | { |
470 | struct to_kill *tk; | 503 | struct to_kill *tk; |
471 | 504 | ||
@@ -476,9 +509,9 @@ static void collect_procs(struct page *page, struct list_head *tokill) | |||
476 | if (!tk) | 509 | if (!tk) |
477 | return; | 510 | return; |
478 | if (PageAnon(page)) | 511 | if (PageAnon(page)) |
479 | collect_procs_anon(page, tokill, &tk); | 512 | collect_procs_anon(page, tokill, &tk, force_early); |
480 | else | 513 | else |
481 | collect_procs_file(page, tokill, &tk); | 514 | collect_procs_file(page, tokill, &tk, force_early); |
482 | kfree(tk); | 515 | kfree(tk); |
483 | } | 516 | } |
484 | 517 | ||
@@ -963,7 +996,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, | |||
963 | * there's nothing that can be done. | 996 | * there's nothing that can be done. |
964 | */ | 997 | */ |
965 | if (kill) | 998 | if (kill) |
966 | collect_procs(ppage, &tokill); | 999 | collect_procs(ppage, &tokill, flags & MF_ACTION_REQUIRED); |
967 | 1000 | ||
968 | ret = try_to_unmap(ppage, ttu); | 1001 | ret = try_to_unmap(ppage, ttu); |
969 | if (ret != SWAP_SUCCESS) | 1002 | if (ret != SWAP_SUCCESS) |
@@ -1081,15 +1114,16 @@ int memory_failure(unsigned long pfn, int trapno, int flags) | |||
1081 | return 0; | 1114 | return 0; |
1082 | } else if (PageHuge(hpage)) { | 1115 | } else if (PageHuge(hpage)) { |
1083 | /* | 1116 | /* |
1084 | * Check "just unpoisoned", "filter hit", and | 1117 | * Check "filter hit" and "race with other subpage." |
1085 | * "race with other subpage." | ||
1086 | */ | 1118 | */ |
1087 | lock_page(hpage); | 1119 | lock_page(hpage); |
1088 | if (!PageHWPoison(hpage) | 1120 | if (PageHWPoison(hpage)) { |
1089 | || (hwpoison_filter(p) && TestClearPageHWPoison(p)) | 1121 | if ((hwpoison_filter(p) && TestClearPageHWPoison(p)) |
1090 | || (p != hpage && TestSetPageHWPoison(hpage))) { | 1122 | || (p != hpage && TestSetPageHWPoison(hpage))) { |
1091 | atomic_long_sub(nr_pages, &num_poisoned_pages); | 1123 | atomic_long_sub(nr_pages, &num_poisoned_pages); |
1092 | return 0; | 1124 | unlock_page(hpage); |
1125 | return 0; | ||
1126 | } | ||
1093 | } | 1127 | } |
1094 | set_page_hwpoison_huge_page(hpage); | 1128 | set_page_hwpoison_huge_page(hpage); |
1095 | res = dequeue_hwpoisoned_huge_page(hpage); | 1129 | res = dequeue_hwpoisoned_huge_page(hpage); |
@@ -1131,11 +1165,6 @@ int memory_failure(unsigned long pfn, int trapno, int flags) | |||
1131 | } | 1165 | } |
1132 | } | 1166 | } |
1133 | 1167 | ||
1134 | /* | ||
1135 | * Lock the page and wait for writeback to finish. | ||
1136 | * It's very difficult to mess with pages currently under IO | ||
1137 | * and in many cases impossible, so we just avoid it here. | ||
1138 | */ | ||
1139 | lock_page(hpage); | 1168 | lock_page(hpage); |
1140 | 1169 | ||
1141 | /* | 1170 | /* |
@@ -1152,6 +1181,8 @@ int memory_failure(unsigned long pfn, int trapno, int flags) | |||
1152 | */ | 1181 | */ |
1153 | if (!PageHWPoison(p)) { | 1182 | if (!PageHWPoison(p)) { |
1154 | printk(KERN_ERR "MCE %#lx: just unpoisoned\n", pfn); | 1183 | printk(KERN_ERR "MCE %#lx: just unpoisoned\n", pfn); |
1184 | atomic_long_sub(nr_pages, &num_poisoned_pages); | ||
1185 | put_page(hpage); | ||
1155 | res = 0; | 1186 | res = 0; |
1156 | goto out; | 1187 | goto out; |
1157 | } | 1188 | } |
@@ -1183,6 +1214,10 @@ int memory_failure(unsigned long pfn, int trapno, int flags) | |||
1183 | if (PageHuge(p)) | 1214 | if (PageHuge(p)) |
1184 | set_page_hwpoison_huge_page(hpage); | 1215 | set_page_hwpoison_huge_page(hpage); |
1185 | 1216 | ||
1217 | /* | ||
1218 | * It's very difficult to mess with pages currently under IO | ||
1219 | * and in many cases impossible, so we just avoid it here. | ||
1220 | */ | ||
1186 | wait_on_page_writeback(p); | 1221 | wait_on_page_writeback(p); |
1187 | 1222 | ||
1188 | /* | 1223 | /* |
@@ -1295,7 +1330,7 @@ static void memory_failure_work_func(struct work_struct *work) | |||
1295 | unsigned long proc_flags; | 1330 | unsigned long proc_flags; |
1296 | int gotten; | 1331 | int gotten; |
1297 | 1332 | ||
1298 | mf_cpu = &__get_cpu_var(memory_failure_cpu); | 1333 | mf_cpu = this_cpu_ptr(&memory_failure_cpu); |
1299 | for (;;) { | 1334 | for (;;) { |
1300 | spin_lock_irqsave(&mf_cpu->lock, proc_flags); | 1335 | spin_lock_irqsave(&mf_cpu->lock, proc_flags); |
1301 | gotten = kfifo_get(&mf_cpu->fifo, &entry); | 1336 | gotten = kfifo_get(&mf_cpu->fifo, &entry); |
@@ -1500,7 +1535,7 @@ static int soft_offline_huge_page(struct page *page, int flags) | |||
1500 | 1535 | ||
1501 | /* Keep page count to indicate a given hugepage is isolated. */ | 1536 | /* Keep page count to indicate a given hugepage is isolated. */ |
1502 | list_move(&hpage->lru, &pagelist); | 1537 | list_move(&hpage->lru, &pagelist); |
1503 | ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, | 1538 | ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL, |
1504 | MIGRATE_SYNC, MR_MEMORY_FAILURE); | 1539 | MIGRATE_SYNC, MR_MEMORY_FAILURE); |
1505 | if (ret) { | 1540 | if (ret) { |
1506 | pr_info("soft offline: %#lx: migration failed %d, type %lx\n", | 1541 | pr_info("soft offline: %#lx: migration failed %d, type %lx\n", |
@@ -1581,7 +1616,7 @@ static int __soft_offline_page(struct page *page, int flags) | |||
1581 | inc_zone_page_state(page, NR_ISOLATED_ANON + | 1616 | inc_zone_page_state(page, NR_ISOLATED_ANON + |
1582 | page_is_file_cache(page)); | 1617 | page_is_file_cache(page)); |
1583 | list_add(&page->lru, &pagelist); | 1618 | list_add(&page->lru, &pagelist); |
1584 | ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, | 1619 | ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL, |
1585 | MIGRATE_SYNC, MR_MEMORY_FAILURE); | 1620 | MIGRATE_SYNC, MR_MEMORY_FAILURE); |
1586 | if (ret) { | 1621 | if (ret) { |
1587 | if (!list_empty(&pagelist)) { | 1622 | if (!list_empty(&pagelist)) { |
@@ -1661,11 +1696,7 @@ int soft_offline_page(struct page *page, int flags) | |||
1661 | } | 1696 | } |
1662 | } | 1697 | } |
1663 | 1698 | ||
1664 | /* | 1699 | get_online_mems(); |
1665 | * The lock_memory_hotplug prevents a race with memory hotplug. | ||
1666 | * This is a big hammer, a better way would be nicer. | ||
1667 | */ | ||
1668 | lock_memory_hotplug(); | ||
1669 | 1700 | ||
1670 | /* | 1701 | /* |
1671 | * Isolate the page, so that it doesn't get reallocated if it | 1702 | * Isolate the page, so that it doesn't get reallocated if it |
@@ -1676,7 +1707,7 @@ int soft_offline_page(struct page *page, int flags) | |||
1676 | set_migratetype_isolate(page, true); | 1707 | set_migratetype_isolate(page, true); |
1677 | 1708 | ||
1678 | ret = get_any_page(page, pfn, flags); | 1709 | ret = get_any_page(page, pfn, flags); |
1679 | unlock_memory_hotplug(); | 1710 | put_online_mems(); |
1680 | if (ret > 0) { /* for in-use pages */ | 1711 | if (ret > 0) { /* for in-use pages */ |
1681 | if (PageHuge(page)) | 1712 | if (PageHuge(page)) |
1682 | ret = soft_offline_huge_page(page, flags); | 1713 | ret = soft_offline_huge_page(page, flags); |
diff --git a/mm/memory.c b/mm/memory.c index 037b812a9531..d67fd9fcf1f2 100644 --- a/mm/memory.c +++ b/mm/memory.c | |||
@@ -698,11 +698,6 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr, | |||
698 | add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); | 698 | add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); |
699 | } | 699 | } |
700 | 700 | ||
701 | static inline bool is_cow_mapping(vm_flags_t flags) | ||
702 | { | ||
703 | return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; | ||
704 | } | ||
705 | |||
706 | /* | 701 | /* |
707 | * vm_normal_page -- This function gets the "struct page" associated with a pte. | 702 | * vm_normal_page -- This function gets the "struct page" associated with a pte. |
708 | * | 703 | * |
@@ -756,7 +751,7 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, | |||
756 | unsigned long pfn = pte_pfn(pte); | 751 | unsigned long pfn = pte_pfn(pte); |
757 | 752 | ||
758 | if (HAVE_PTE_SPECIAL) { | 753 | if (HAVE_PTE_SPECIAL) { |
759 | if (likely(!pte_special(pte))) | 754 | if (likely(!pte_special(pte) || pte_numa(pte))) |
760 | goto check_pfn; | 755 | goto check_pfn; |
761 | if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP)) | 756 | if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP)) |
762 | return NULL; | 757 | return NULL; |
@@ -782,14 +777,15 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, | |||
782 | } | 777 | } |
783 | } | 778 | } |
784 | 779 | ||
785 | if (is_zero_pfn(pfn)) | ||
786 | return NULL; | ||
787 | check_pfn: | 780 | check_pfn: |
788 | if (unlikely(pfn > highest_memmap_pfn)) { | 781 | if (unlikely(pfn > highest_memmap_pfn)) { |
789 | print_bad_pte(vma, addr, pte, NULL); | 782 | print_bad_pte(vma, addr, pte, NULL); |
790 | return NULL; | 783 | return NULL; |
791 | } | 784 | } |
792 | 785 | ||
786 | if (is_zero_pfn(pfn)) | ||
787 | return NULL; | ||
788 | |||
793 | /* | 789 | /* |
794 | * NOTE! We still have PageReserved() pages in the page tables. | 790 | * NOTE! We still have PageReserved() pages in the page tables. |
795 | * eg. VDSO mappings can cause them to exist. | 791 | * eg. VDSO mappings can cause them to exist. |
@@ -1457,646 +1453,6 @@ int zap_vma_ptes(struct vm_area_struct *vma, unsigned long address, | |||
1457 | } | 1453 | } |
1458 | EXPORT_SYMBOL_GPL(zap_vma_ptes); | 1454 | EXPORT_SYMBOL_GPL(zap_vma_ptes); |
1459 | 1455 | ||
1460 | /** | ||
1461 | * follow_page_mask - look up a page descriptor from a user-virtual address | ||
1462 | * @vma: vm_area_struct mapping @address | ||
1463 | * @address: virtual address to look up | ||
1464 | * @flags: flags modifying lookup behaviour | ||
1465 | * @page_mask: on output, *page_mask is set according to the size of the page | ||
1466 | * | ||
1467 | * @flags can have FOLL_ flags set, defined in <linux/mm.h> | ||
1468 | * | ||
1469 | * Returns the mapped (struct page *), %NULL if no mapping exists, or | ||
1470 | * an error pointer if there is a mapping to something not represented | ||
1471 | * by a page descriptor (see also vm_normal_page()). | ||
1472 | */ | ||
1473 | struct page *follow_page_mask(struct vm_area_struct *vma, | ||
1474 | unsigned long address, unsigned int flags, | ||
1475 | unsigned int *page_mask) | ||
1476 | { | ||
1477 | pgd_t *pgd; | ||
1478 | pud_t *pud; | ||
1479 | pmd_t *pmd; | ||
1480 | pte_t *ptep, pte; | ||
1481 | spinlock_t *ptl; | ||
1482 | struct page *page; | ||
1483 | struct mm_struct *mm = vma->vm_mm; | ||
1484 | |||
1485 | *page_mask = 0; | ||
1486 | |||
1487 | page = follow_huge_addr(mm, address, flags & FOLL_WRITE); | ||
1488 | if (!IS_ERR(page)) { | ||
1489 | BUG_ON(flags & FOLL_GET); | ||
1490 | goto out; | ||
1491 | } | ||
1492 | |||
1493 | page = NULL; | ||
1494 | pgd = pgd_offset(mm, address); | ||
1495 | if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd))) | ||
1496 | goto no_page_table; | ||
1497 | |||
1498 | pud = pud_offset(pgd, address); | ||
1499 | if (pud_none(*pud)) | ||
1500 | goto no_page_table; | ||
1501 | if (pud_huge(*pud) && vma->vm_flags & VM_HUGETLB) { | ||
1502 | if (flags & FOLL_GET) | ||
1503 | goto out; | ||
1504 | page = follow_huge_pud(mm, address, pud, flags & FOLL_WRITE); | ||
1505 | goto out; | ||
1506 | } | ||
1507 | if (unlikely(pud_bad(*pud))) | ||
1508 | goto no_page_table; | ||
1509 | |||
1510 | pmd = pmd_offset(pud, address); | ||
1511 | if (pmd_none(*pmd)) | ||
1512 | goto no_page_table; | ||
1513 | if (pmd_huge(*pmd) && vma->vm_flags & VM_HUGETLB) { | ||
1514 | page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE); | ||
1515 | if (flags & FOLL_GET) { | ||
1516 | /* | ||
1517 | * Refcount on tail pages are not well-defined and | ||
1518 | * shouldn't be taken. The caller should handle a NULL | ||
1519 | * return when trying to follow tail pages. | ||
1520 | */ | ||
1521 | if (PageHead(page)) | ||
1522 | get_page(page); | ||
1523 | else { | ||
1524 | page = NULL; | ||
1525 | goto out; | ||
1526 | } | ||
1527 | } | ||
1528 | goto out; | ||
1529 | } | ||
1530 | if ((flags & FOLL_NUMA) && pmd_numa(*pmd)) | ||
1531 | goto no_page_table; | ||
1532 | if (pmd_trans_huge(*pmd)) { | ||
1533 | if (flags & FOLL_SPLIT) { | ||
1534 | split_huge_page_pmd(vma, address, pmd); | ||
1535 | goto split_fallthrough; | ||
1536 | } | ||
1537 | ptl = pmd_lock(mm, pmd); | ||
1538 | if (likely(pmd_trans_huge(*pmd))) { | ||
1539 | if (unlikely(pmd_trans_splitting(*pmd))) { | ||
1540 | spin_unlock(ptl); | ||
1541 | wait_split_huge_page(vma->anon_vma, pmd); | ||
1542 | } else { | ||
1543 | page = follow_trans_huge_pmd(vma, address, | ||
1544 | pmd, flags); | ||
1545 | spin_unlock(ptl); | ||
1546 | *page_mask = HPAGE_PMD_NR - 1; | ||
1547 | goto out; | ||
1548 | } | ||
1549 | } else | ||
1550 | spin_unlock(ptl); | ||
1551 | /* fall through */ | ||
1552 | } | ||
1553 | split_fallthrough: | ||
1554 | if (unlikely(pmd_bad(*pmd))) | ||
1555 | goto no_page_table; | ||
1556 | |||
1557 | ptep = pte_offset_map_lock(mm, pmd, address, &ptl); | ||
1558 | |||
1559 | pte = *ptep; | ||
1560 | if (!pte_present(pte)) { | ||
1561 | swp_entry_t entry; | ||
1562 | /* | ||
1563 | * KSM's break_ksm() relies upon recognizing a ksm page | ||
1564 | * even while it is being migrated, so for that case we | ||
1565 | * need migration_entry_wait(). | ||
1566 | */ | ||
1567 | if (likely(!(flags & FOLL_MIGRATION))) | ||
1568 | goto no_page; | ||
1569 | if (pte_none(pte) || pte_file(pte)) | ||
1570 | goto no_page; | ||
1571 | entry = pte_to_swp_entry(pte); | ||
1572 | if (!is_migration_entry(entry)) | ||
1573 | goto no_page; | ||
1574 | pte_unmap_unlock(ptep, ptl); | ||
1575 | migration_entry_wait(mm, pmd, address); | ||
1576 | goto split_fallthrough; | ||
1577 | } | ||
1578 | if ((flags & FOLL_NUMA) && pte_numa(pte)) | ||
1579 | goto no_page; | ||
1580 | if ((flags & FOLL_WRITE) && !pte_write(pte)) | ||
1581 | goto unlock; | ||
1582 | |||
1583 | page = vm_normal_page(vma, address, pte); | ||
1584 | if (unlikely(!page)) { | ||
1585 | if ((flags & FOLL_DUMP) || | ||
1586 | !is_zero_pfn(pte_pfn(pte))) | ||
1587 | goto bad_page; | ||
1588 | page = pte_page(pte); | ||
1589 | } | ||
1590 | |||
1591 | if (flags & FOLL_GET) | ||
1592 | get_page_foll(page); | ||
1593 | if (flags & FOLL_TOUCH) { | ||
1594 | if ((flags & FOLL_WRITE) && | ||
1595 | !pte_dirty(pte) && !PageDirty(page)) | ||
1596 | set_page_dirty(page); | ||
1597 | /* | ||
1598 | * pte_mkyoung() would be more correct here, but atomic care | ||
1599 | * is needed to avoid losing the dirty bit: it is easier to use | ||
1600 | * mark_page_accessed(). | ||
1601 | */ | ||
1602 | mark_page_accessed(page); | ||
1603 | } | ||
1604 | if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) { | ||
1605 | /* | ||
1606 | * The preliminary mapping check is mainly to avoid the | ||
1607 | * pointless overhead of lock_page on the ZERO_PAGE | ||
1608 | * which might bounce very badly if there is contention. | ||
1609 | * | ||
1610 | * If the page is already locked, we don't need to | ||
1611 | * handle it now - vmscan will handle it later if and | ||
1612 | * when it attempts to reclaim the page. | ||
1613 | */ | ||
1614 | if (page->mapping && trylock_page(page)) { | ||
1615 | lru_add_drain(); /* push cached pages to LRU */ | ||
1616 | /* | ||
1617 | * Because we lock page here, and migration is | ||
1618 | * blocked by the pte's page reference, and we | ||
1619 | * know the page is still mapped, we don't even | ||
1620 | * need to check for file-cache page truncation. | ||
1621 | */ | ||
1622 | mlock_vma_page(page); | ||
1623 | unlock_page(page); | ||
1624 | } | ||
1625 | } | ||
1626 | unlock: | ||
1627 | pte_unmap_unlock(ptep, ptl); | ||
1628 | out: | ||
1629 | return page; | ||
1630 | |||
1631 | bad_page: | ||
1632 | pte_unmap_unlock(ptep, ptl); | ||
1633 | return ERR_PTR(-EFAULT); | ||
1634 | |||
1635 | no_page: | ||
1636 | pte_unmap_unlock(ptep, ptl); | ||
1637 | if (!pte_none(pte)) | ||
1638 | return page; | ||
1639 | |||
1640 | no_page_table: | ||
1641 | /* | ||
1642 | * When core dumping an enormous anonymous area that nobody | ||
1643 | * has touched so far, we don't want to allocate unnecessary pages or | ||
1644 | * page tables. Return error instead of NULL to skip handle_mm_fault, | ||
1645 | * then get_dump_page() will return NULL to leave a hole in the dump. | ||
1646 | * But we can only make this optimization where a hole would surely | ||
1647 | * be zero-filled if handle_mm_fault() actually did handle it. | ||
1648 | */ | ||
1649 | if ((flags & FOLL_DUMP) && | ||
1650 | (!vma->vm_ops || !vma->vm_ops->fault)) | ||
1651 | return ERR_PTR(-EFAULT); | ||
1652 | return page; | ||
1653 | } | ||
1654 | |||
1655 | static inline int stack_guard_page(struct vm_area_struct *vma, unsigned long addr) | ||
1656 | { | ||
1657 | return stack_guard_page_start(vma, addr) || | ||
1658 | stack_guard_page_end(vma, addr+PAGE_SIZE); | ||
1659 | } | ||
1660 | |||
1661 | /** | ||
1662 | * __get_user_pages() - pin user pages in memory | ||
1663 | * @tsk: task_struct of target task | ||
1664 | * @mm: mm_struct of target mm | ||
1665 | * @start: starting user address | ||
1666 | * @nr_pages: number of pages from start to pin | ||
1667 | * @gup_flags: flags modifying pin behaviour | ||
1668 | * @pages: array that receives pointers to the pages pinned. | ||
1669 | * Should be at least nr_pages long. Or NULL, if caller | ||
1670 | * only intends to ensure the pages are faulted in. | ||
1671 | * @vmas: array of pointers to vmas corresponding to each page. | ||
1672 | * Or NULL if the caller does not require them. | ||
1673 | * @nonblocking: whether waiting for disk IO or mmap_sem contention | ||
1674 | * | ||
1675 | * Returns number of pages pinned. This may be fewer than the number | ||
1676 | * requested. If nr_pages is 0 or negative, returns 0. If no pages | ||
1677 | * were pinned, returns -errno. Each page returned must be released | ||
1678 | * with a put_page() call when it is finished with. vmas will only | ||
1679 | * remain valid while mmap_sem is held. | ||
1680 | * | ||
1681 | * Must be called with mmap_sem held for read or write. | ||
1682 | * | ||
1683 | * __get_user_pages walks a process's page tables and takes a reference to | ||
1684 | * each struct page that each user address corresponds to at a given | ||
1685 | * instant. That is, it takes the page that would be accessed if a user | ||
1686 | * thread accesses the given user virtual address at that instant. | ||
1687 | * | ||
1688 | * This does not guarantee that the page exists in the user mappings when | ||
1689 | * __get_user_pages returns, and there may even be a completely different | ||
1690 | * page there in some cases (eg. if mmapped pagecache has been invalidated | ||
1691 | * and subsequently re faulted). However it does guarantee that the page | ||
1692 | * won't be freed completely. And mostly callers simply care that the page | ||
1693 | * contains data that was valid *at some point in time*. Typically, an IO | ||
1694 | * or similar operation cannot guarantee anything stronger anyway because | ||
1695 | * locks can't be held over the syscall boundary. | ||
1696 | * | ||
1697 | * If @gup_flags & FOLL_WRITE == 0, the page must not be written to. If | ||
1698 | * the page is written to, set_page_dirty (or set_page_dirty_lock, as | ||
1699 | * appropriate) must be called after the page is finished with, and | ||
1700 | * before put_page is called. | ||
1701 | * | ||
1702 | * If @nonblocking != NULL, __get_user_pages will not wait for disk IO | ||
1703 | * or mmap_sem contention, and if waiting is needed to pin all pages, | ||
1704 | * *@nonblocking will be set to 0. | ||
1705 | * | ||
1706 | * In most cases, get_user_pages or get_user_pages_fast should be used | ||
1707 | * instead of __get_user_pages. __get_user_pages should be used only if | ||
1708 | * you need some special @gup_flags. | ||
1709 | */ | ||
1710 | long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | ||
1711 | unsigned long start, unsigned long nr_pages, | ||
1712 | unsigned int gup_flags, struct page **pages, | ||
1713 | struct vm_area_struct **vmas, int *nonblocking) | ||
1714 | { | ||
1715 | long i; | ||
1716 | unsigned long vm_flags; | ||
1717 | unsigned int page_mask; | ||
1718 | |||
1719 | if (!nr_pages) | ||
1720 | return 0; | ||
1721 | |||
1722 | VM_BUG_ON(!!pages != !!(gup_flags & FOLL_GET)); | ||
1723 | |||
1724 | /* | ||
1725 | * If FOLL_FORCE and FOLL_NUMA are both set, handle_mm_fault | ||
1726 | * would be called on PROT_NONE ranges. We must never invoke | ||
1727 | * handle_mm_fault on PROT_NONE ranges or the NUMA hinting | ||
1728 | * page faults would unprotect the PROT_NONE ranges if | ||
1729 | * _PAGE_NUMA and _PAGE_PROTNONE are sharing the same pte/pmd | ||
1730 | * bitflag. So to avoid that, don't set FOLL_NUMA if | ||
1731 | * FOLL_FORCE is set. | ||
1732 | */ | ||
1733 | if (!(gup_flags & FOLL_FORCE)) | ||
1734 | gup_flags |= FOLL_NUMA; | ||
1735 | |||
1736 | i = 0; | ||
1737 | |||
1738 | do { | ||
1739 | struct vm_area_struct *vma; | ||
1740 | |||
1741 | vma = find_extend_vma(mm, start); | ||
1742 | if (!vma && in_gate_area(mm, start)) { | ||
1743 | unsigned long pg = start & PAGE_MASK; | ||
1744 | pgd_t *pgd; | ||
1745 | pud_t *pud; | ||
1746 | pmd_t *pmd; | ||
1747 | pte_t *pte; | ||
1748 | |||
1749 | /* user gate pages are read-only */ | ||
1750 | if (gup_flags & FOLL_WRITE) | ||
1751 | goto efault; | ||
1752 | if (pg > TASK_SIZE) | ||
1753 | pgd = pgd_offset_k(pg); | ||
1754 | else | ||
1755 | pgd = pgd_offset_gate(mm, pg); | ||
1756 | BUG_ON(pgd_none(*pgd)); | ||
1757 | pud = pud_offset(pgd, pg); | ||
1758 | BUG_ON(pud_none(*pud)); | ||
1759 | pmd = pmd_offset(pud, pg); | ||
1760 | if (pmd_none(*pmd)) | ||
1761 | goto efault; | ||
1762 | VM_BUG_ON(pmd_trans_huge(*pmd)); | ||
1763 | pte = pte_offset_map(pmd, pg); | ||
1764 | if (pte_none(*pte)) { | ||
1765 | pte_unmap(pte); | ||
1766 | goto efault; | ||
1767 | } | ||
1768 | vma = get_gate_vma(mm); | ||
1769 | if (pages) { | ||
1770 | struct page *page; | ||
1771 | |||
1772 | page = vm_normal_page(vma, start, *pte); | ||
1773 | if (!page) { | ||
1774 | if (!(gup_flags & FOLL_DUMP) && | ||
1775 | is_zero_pfn(pte_pfn(*pte))) | ||
1776 | page = pte_page(*pte); | ||
1777 | else { | ||
1778 | pte_unmap(pte); | ||
1779 | goto efault; | ||
1780 | } | ||
1781 | } | ||
1782 | pages[i] = page; | ||
1783 | get_page(page); | ||
1784 | } | ||
1785 | pte_unmap(pte); | ||
1786 | page_mask = 0; | ||
1787 | goto next_page; | ||
1788 | } | ||
1789 | |||
1790 | if (!vma) | ||
1791 | goto efault; | ||
1792 | vm_flags = vma->vm_flags; | ||
1793 | if (vm_flags & (VM_IO | VM_PFNMAP)) | ||
1794 | goto efault; | ||
1795 | |||
1796 | if (gup_flags & FOLL_WRITE) { | ||
1797 | if (!(vm_flags & VM_WRITE)) { | ||
1798 | if (!(gup_flags & FOLL_FORCE)) | ||
1799 | goto efault; | ||
1800 | /* | ||
1801 | * We used to let the write,force case do COW | ||
1802 | * in a VM_MAYWRITE VM_SHARED !VM_WRITE vma, so | ||
1803 | * ptrace could set a breakpoint in a read-only | ||
1804 | * mapping of an executable, without corrupting | ||
1805 | * the file (yet only when that file had been | ||
1806 | * opened for writing!). Anon pages in shared | ||
1807 | * mappings are surprising: now just reject it. | ||
1808 | */ | ||
1809 | if (!is_cow_mapping(vm_flags)) { | ||
1810 | WARN_ON_ONCE(vm_flags & VM_MAYWRITE); | ||
1811 | goto efault; | ||
1812 | } | ||
1813 | } | ||
1814 | } else { | ||
1815 | if (!(vm_flags & VM_READ)) { | ||
1816 | if (!(gup_flags & FOLL_FORCE)) | ||
1817 | goto efault; | ||
1818 | /* | ||
1819 | * Is there actually any vma we can reach here | ||
1820 | * which does not have VM_MAYREAD set? | ||
1821 | */ | ||
1822 | if (!(vm_flags & VM_MAYREAD)) | ||
1823 | goto efault; | ||
1824 | } | ||
1825 | } | ||
1826 | |||
1827 | if (is_vm_hugetlb_page(vma)) { | ||
1828 | i = follow_hugetlb_page(mm, vma, pages, vmas, | ||
1829 | &start, &nr_pages, i, gup_flags); | ||
1830 | continue; | ||
1831 | } | ||
1832 | |||
1833 | do { | ||
1834 | struct page *page; | ||
1835 | unsigned int foll_flags = gup_flags; | ||
1836 | unsigned int page_increm; | ||
1837 | |||
1838 | /* | ||
1839 | * If we have a pending SIGKILL, don't keep faulting | ||
1840 | * pages and potentially allocating memory. | ||
1841 | */ | ||
1842 | if (unlikely(fatal_signal_pending(current))) | ||
1843 | return i ? i : -ERESTARTSYS; | ||
1844 | |||
1845 | cond_resched(); | ||
1846 | while (!(page = follow_page_mask(vma, start, | ||
1847 | foll_flags, &page_mask))) { | ||
1848 | int ret; | ||
1849 | unsigned int fault_flags = 0; | ||
1850 | |||
1851 | /* For mlock, just skip the stack guard page. */ | ||
1852 | if (foll_flags & FOLL_MLOCK) { | ||
1853 | if (stack_guard_page(vma, start)) | ||
1854 | goto next_page; | ||
1855 | } | ||
1856 | if (foll_flags & FOLL_WRITE) | ||
1857 | fault_flags |= FAULT_FLAG_WRITE; | ||
1858 | if (nonblocking) | ||
1859 | fault_flags |= FAULT_FLAG_ALLOW_RETRY; | ||
1860 | if (foll_flags & FOLL_NOWAIT) | ||
1861 | fault_flags |= (FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT); | ||
1862 | |||
1863 | ret = handle_mm_fault(mm, vma, start, | ||
1864 | fault_flags); | ||
1865 | |||
1866 | if (ret & VM_FAULT_ERROR) { | ||
1867 | if (ret & VM_FAULT_OOM) | ||
1868 | return i ? i : -ENOMEM; | ||
1869 | if (ret & (VM_FAULT_HWPOISON | | ||
1870 | VM_FAULT_HWPOISON_LARGE)) { | ||
1871 | if (i) | ||
1872 | return i; | ||
1873 | else if (gup_flags & FOLL_HWPOISON) | ||
1874 | return -EHWPOISON; | ||
1875 | else | ||
1876 | return -EFAULT; | ||
1877 | } | ||
1878 | if (ret & VM_FAULT_SIGBUS) | ||
1879 | goto efault; | ||
1880 | BUG(); | ||
1881 | } | ||
1882 | |||
1883 | if (tsk) { | ||
1884 | if (ret & VM_FAULT_MAJOR) | ||
1885 | tsk->maj_flt++; | ||
1886 | else | ||
1887 | tsk->min_flt++; | ||
1888 | } | ||
1889 | |||
1890 | if (ret & VM_FAULT_RETRY) { | ||
1891 | if (nonblocking) | ||
1892 | *nonblocking = 0; | ||
1893 | return i; | ||
1894 | } | ||
1895 | |||
1896 | /* | ||
1897 | * The VM_FAULT_WRITE bit tells us that | ||
1898 | * do_wp_page has broken COW when necessary, | ||
1899 | * even if maybe_mkwrite decided not to set | ||
1900 | * pte_write. We can thus safely do subsequent | ||
1901 | * page lookups as if they were reads. But only | ||
1902 | * do so when looping for pte_write is futile: | ||
1903 | * in some cases userspace may also be wanting | ||
1904 | * to write to the gotten user page, which a | ||
1905 | * read fault here might prevent (a readonly | ||
1906 | * page might get reCOWed by userspace write). | ||
1907 | */ | ||
1908 | if ((ret & VM_FAULT_WRITE) && | ||
1909 | !(vma->vm_flags & VM_WRITE)) | ||
1910 | foll_flags &= ~FOLL_WRITE; | ||
1911 | |||
1912 | cond_resched(); | ||
1913 | } | ||
1914 | if (IS_ERR(page)) | ||
1915 | return i ? i : PTR_ERR(page); | ||
1916 | if (pages) { | ||
1917 | pages[i] = page; | ||
1918 | |||
1919 | flush_anon_page(vma, page, start); | ||
1920 | flush_dcache_page(page); | ||
1921 | page_mask = 0; | ||
1922 | } | ||
1923 | next_page: | ||
1924 | if (vmas) { | ||
1925 | vmas[i] = vma; | ||
1926 | page_mask = 0; | ||
1927 | } | ||
1928 | page_increm = 1 + (~(start >> PAGE_SHIFT) & page_mask); | ||
1929 | if (page_increm > nr_pages) | ||
1930 | page_increm = nr_pages; | ||
1931 | i += page_increm; | ||
1932 | start += page_increm * PAGE_SIZE; | ||
1933 | nr_pages -= page_increm; | ||
1934 | } while (nr_pages && start < vma->vm_end); | ||
1935 | } while (nr_pages); | ||
1936 | return i; | ||
1937 | efault: | ||
1938 | return i ? : -EFAULT; | ||
1939 | } | ||
1940 | EXPORT_SYMBOL(__get_user_pages); | ||
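
For context, this is roughly how a kernel caller uses the interface that is being moved out of this file (to mm/gup.c): take mmap_sem for read, pin with get_user_pages(), then drop every page reference once the data has been used. A hedged fragment only; uaddr and the fixed count of 16 are illustrative, and error handling is abbreviated.

        struct page *pages[16];
        long i, got;

        down_read(&current->mm->mmap_sem);
        got = get_user_pages(current, current->mm, uaddr & PAGE_MASK,
                             16, 1 /* write */, 0 /* force */, pages, NULL);
        up_read(&current->mm->mmap_sem);

        for (i = 0; i < got; i++) {
                set_page_dirty_lock(pages[i]);        /* the pages were written to */
                put_page(pages[i]);                   /* release the pin taken above */
        }
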
1941 | |||
1942 | /* | ||
1943 | * fixup_user_fault() - manually resolve a user page fault | ||
1944 | * @tsk: the task_struct to use for page fault accounting, or | ||
1945 | * NULL if faults are not to be recorded. | ||
1946 | * @mm: mm_struct of target mm | ||
1947 | * @address: user address | ||
1948 | * @fault_flags:flags to pass down to handle_mm_fault() | ||
1949 | * | ||
1950 | * This is meant to be called in the specific scenario where for locking reasons | ||
1951 | * we try to access user memory in atomic context (within a pagefault_disable() | ||
1952 | * section), this returns -EFAULT, and we want to resolve the user fault before | ||
1953 | * trying again. | ||
1954 | * | ||
1955 | * Typically this is meant to be used by the futex code. | ||
1956 | * | ||
1957 | * The main difference with get_user_pages() is that this function will | ||
1958 | * unconditionally call handle_mm_fault() which will in turn perform all the | ||
1959 | * necessary SW fixup of the dirty and young bits in the PTE, while | ||
1960 | * handle_mm_fault() only guarantees to update these in the struct page. | ||
1961 | * | ||
1962 | * This is important for some architectures where those bits also gate the | ||
1963 | * access permission to the page because they are maintained in software. On | ||
1964 | * such architectures, gup() will not be enough to make a subsequent access | ||
1965 | * succeed. | ||
1966 | * | ||
1967 | * This should be called with the mm_sem held for read. | ||
1968 | */ | ||
1969 | int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm, | ||
1970 | unsigned long address, unsigned int fault_flags) | ||
1971 | { | ||
1972 | struct vm_area_struct *vma; | ||
1973 | vm_flags_t vm_flags; | ||
1974 | int ret; | ||
1975 | |||
1976 | vma = find_extend_vma(mm, address); | ||
1977 | if (!vma || address < vma->vm_start) | ||
1978 | return -EFAULT; | ||
1979 | |||
1980 | vm_flags = (fault_flags & FAULT_FLAG_WRITE) ? VM_WRITE : VM_READ; | ||
1981 | if (!(vm_flags & vma->vm_flags)) | ||
1982 | return -EFAULT; | ||
1983 | |||
1984 | ret = handle_mm_fault(mm, vma, address, fault_flags); | ||
1985 | if (ret & VM_FAULT_ERROR) { | ||
1986 | if (ret & VM_FAULT_OOM) | ||
1987 | return -ENOMEM; | ||
1988 | if (ret & (VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE)) | ||
1989 | return -EHWPOISON; | ||
1990 | if (ret & VM_FAULT_SIGBUS) | ||
1991 | return -EFAULT; | ||
1992 | BUG(); | ||
1993 | } | ||
1994 | if (tsk) { | ||
1995 | if (ret & VM_FAULT_MAJOR) | ||
1996 | tsk->maj_flt++; | ||
1997 | else | ||
1998 | tsk->min_flt++; | ||
1999 | } | ||
2000 | return 0; | ||
2001 | } | ||
2002 | |||
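A minimal sketch of the calling pattern described above — fail inside a pagefault_disable() section, then resolve the fault under mmap_sem and retry. This is an illustration only; read_user_word() is a hypothetical caller and the real futex loop differs in detail.

	/* Hypothetical caller: retry a faulting user access outside the atomic section. */
	static int read_user_word(u32 __user *uaddr, u32 *dest)
	{
		int ret;

		do {
			pagefault_disable();
			ret = __copy_from_user_inatomic(dest, uaddr, sizeof(*dest));
			pagefault_enable();
			if (!ret)
				return 0;

			/* Fault the page in with mmap_sem held for read, then retry. */
			down_read(&current->mm->mmap_sem);
			ret = fixup_user_fault(current, current->mm,
					       (unsigned long)uaddr, 0);
			up_read(&current->mm->mmap_sem);
		} while (!ret);

		return ret;
	}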
2003 | /* | ||
2004 | * get_user_pages() - pin user pages in memory | ||
2005 | * @tsk: the task_struct to use for page fault accounting, or | ||
2006 | * NULL if faults are not to be recorded. | ||
2007 | * @mm: mm_struct of target mm | ||
2008 | * @start: starting user address | ||
2009 | * @nr_pages: number of pages from start to pin | ||
2010 | * @write: whether pages will be written to by the caller | ||
2011 | * @force: whether to force access even when user mapping is currently | ||
2012 | * protected (but never forces write access to shared mapping). | ||
2013 | * @pages: array that receives pointers to the pages pinned. | ||
2014 | * Should be at least nr_pages long. Or NULL, if caller | ||
2015 | * only intends to ensure the pages are faulted in. | ||
2016 | * @vmas: array of pointers to vmas corresponding to each page. | ||
2017 | * Or NULL if the caller does not require them. | ||
2018 | * | ||
2019 | * Returns number of pages pinned. This may be fewer than the number | ||
2020 | * requested. If nr_pages is 0 or negative, returns 0. If no pages | ||
2021 | * were pinned, returns -errno. Each page returned must be released | ||
2022 | * with a put_page() call when it is finished with. vmas will only | ||
2023 | * remain valid while mmap_sem is held. | ||
2024 | * | ||
2025 | * Must be called with mmap_sem held for read or write. | ||
2026 | * | ||
2027 | * get_user_pages walks a process's page tables and takes a reference to | ||
2028 | * each struct page that each user address corresponds to at a given | ||
2029 | * instant. That is, it takes the page that would be accessed if a user | ||
2030 | * thread accesses the given user virtual address at that instant. | ||
2031 | * | ||
2032 | * This does not guarantee that the page exists in the user mappings when | ||
2033 | * get_user_pages returns, and there may even be a completely different | ||
2034 | * page there in some cases (eg. if mmapped pagecache has been invalidated | ||
2035 | * and subsequently re-faulted). However, it does guarantee that the page | ||
2036 | * won't be freed completely. And mostly callers simply care that the page | ||
2037 | * contains data that was valid *at some point in time*. Typically, an IO | ||
2038 | * or similar operation cannot guarantee anything stronger anyway because | ||
2039 | * locks can't be held over the syscall boundary. | ||
2040 | * | ||
2041 | * If write=0, the page must not be written to. If the page is written to, | ||
2042 | * set_page_dirty (or set_page_dirty_lock, as appropriate) must be called | ||
2043 | * after the page is finished with, and before put_page is called. | ||
2044 | * | ||
2045 | * get_user_pages is typically used for fewer-copy IO operations, to get a | ||
2046 | * handle on the memory by some means other than accesses via the user virtual | ||
2047 | * addresses. The pages may be submitted for DMA to devices or accessed via | ||
2048 | * their kernel linear mapping (via the kmap APIs). Care should be taken to | ||
2049 | * use the correct cache flushing APIs. | ||
2050 | * | ||
2051 | * See also get_user_pages_fast, for performance critical applications. | ||
2052 | */ | ||
2053 | long get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | ||
2054 | unsigned long start, unsigned long nr_pages, int write, | ||
2055 | int force, struct page **pages, struct vm_area_struct **vmas) | ||
2056 | { | ||
2057 | int flags = FOLL_TOUCH; | ||
2058 | |||
2059 | if (pages) | ||
2060 | flags |= FOLL_GET; | ||
2061 | if (write) | ||
2062 | flags |= FOLL_WRITE; | ||
2063 | if (force) | ||
2064 | flags |= FOLL_FORCE; | ||
2065 | |||
2066 | return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas, | ||
2067 | NULL); | ||
2068 | } | ||
2069 | EXPORT_SYMBOL(get_user_pages); | ||
2070 | |||
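A hedged sketch of the pin/access/dirty/release discipline spelled out in the comment above; pin_and_zero_user_page() and the single-page count are illustrative, not code from this tree.

	/* Illustrative only: pin one user page for writing, zero it, release it. */
	static int pin_and_zero_user_page(unsigned long uaddr)
	{
		struct page *page;
		long ret;

		down_read(&current->mm->mmap_sem);
		ret = get_user_pages(current, current->mm, uaddr, 1,
				     1 /* write */, 0 /* force */, &page, NULL);
		up_read(&current->mm->mmap_sem);
		if (ret < 1)
			return ret < 0 ? ret : -EFAULT;

		memset(kmap(page), 0, PAGE_SIZE);	/* access via the kernel mapping */
		kunmap(page);
		flush_dcache_page(page);

		set_page_dirty_lock(page);		/* we wrote to the page */
		put_page(page);				/* drop the gup reference */
		return 0;
	}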
2071 | /** | ||
2072 | * get_dump_page() - pin user page in memory while writing it to core dump | ||
2073 | * @addr: user address | ||
2074 | * | ||
2075 | * Returns struct page pointer of user page pinned for dump, | ||
2076 | * to be freed afterwards by page_cache_release() or put_page(). | ||
2077 | * | ||
2078 | * Returns NULL on any kind of failure - a hole must then be inserted into | ||
2079 | * the corefile, to preserve alignment with its headers; and also returns | ||
2080 | * NULL wherever the ZERO_PAGE, or an anonymous pte_none, has been found - | ||
2081 | * allowing a hole to be left in the corefile to save diskspace. | ||
2082 | * | ||
2083 | * Called without mmap_sem, but after all other threads have been killed. | ||
2084 | */ | ||
2085 | #ifdef CONFIG_ELF_CORE | ||
2086 | struct page *get_dump_page(unsigned long addr) | ||
2087 | { | ||
2088 | struct vm_area_struct *vma; | ||
2089 | struct page *page; | ||
2090 | |||
2091 | if (__get_user_pages(current, current->mm, addr, 1, | ||
2092 | FOLL_FORCE | FOLL_DUMP | FOLL_GET, &page, &vma, | ||
2093 | NULL) < 1) | ||
2094 | return NULL; | ||
2095 | flush_cache_page(vma, addr, page_to_pfn(page)); | ||
2096 | return page; | ||
2097 | } | ||
2098 | #endif /* CONFIG_ELF_CORE */ | ||
2099 | |||
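A rough sketch of the coredump loop this helper serves; the emit/skip callbacks stand in for the real core-file writer and are not from this tree.

	/* Sketch: dump each page of a vma, leaving holes where get_dump_page() says so. */
	static int dump_vma_pages(struct vm_area_struct *vma,
				  int (*emit)(void *buf, size_t len),
				  int (*skip)(size_t len))
	{
		unsigned long addr;

		for (addr = vma->vm_start; addr < vma->vm_end; addr += PAGE_SIZE) {
			struct page *page = get_dump_page(addr);
			int ok;

			if (page) {
				void *kaddr = kmap(page);

				ok = emit(kaddr, PAGE_SIZE);
				kunmap(page);
				page_cache_release(page);	/* drop the pin */
			} else {
				ok = skip(PAGE_SIZE);		/* hole in the corefile */
			}
			if (!ok)
				return -EIO;
		}
		return 0;
	}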
2100 | pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr, | 1456 | pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr, |
2101 | spinlock_t **ptl) | 1457 | spinlock_t **ptl) |
2102 | { | 1458 | { |
@@ -3402,65 +2758,76 @@ void do_set_pte(struct vm_area_struct *vma, unsigned long address, | |||
3402 | update_mmu_cache(vma, address, pte); | 2758 | update_mmu_cache(vma, address, pte); |
3403 | } | 2759 | } |
3404 | 2760 | ||
3405 | #define FAULT_AROUND_ORDER 4 | 2761 | static unsigned long fault_around_bytes = 65536; |
3406 | 2762 | ||
3407 | #ifdef CONFIG_DEBUG_FS | 2763 | /* |
3408 | static unsigned int fault_around_order = FAULT_AROUND_ORDER; | 2764 | * fault_around_pages() and fault_around_mask() round down fault_around_bytes |
2765 | * to nearest page order. It's what do_fault_around() expects to see. | ||
2766 | */ | ||
2767 | static inline unsigned long fault_around_pages(void) | ||
2768 | { | ||
2769 | return rounddown_pow_of_two(fault_around_bytes) / PAGE_SIZE; | ||
2770 | } | ||
2771 | |||
2772 | static inline unsigned long fault_around_mask(void) | ||
2773 | { | ||
2774 | return ~(rounddown_pow_of_two(fault_around_bytes) - 1) & PAGE_MASK; | ||
2775 | } | ||
3409 | 2776 | ||
3410 | static int fault_around_order_get(void *data, u64 *val) | 2777 | |
2778 | #ifdef CONFIG_DEBUG_FS | ||
2779 | static int fault_around_bytes_get(void *data, u64 *val) | ||
3411 | { | 2780 | { |
3412 | *val = fault_around_order; | 2781 | *val = fault_around_bytes; |
3413 | return 0; | 2782 | return 0; |
3414 | } | 2783 | } |
3415 | 2784 | ||
3416 | static int fault_around_order_set(void *data, u64 val) | 2785 | static int fault_around_bytes_set(void *data, u64 val) |
3417 | { | 2786 | { |
3418 | BUILD_BUG_ON((1UL << FAULT_AROUND_ORDER) > PTRS_PER_PTE); | 2787 | if (val / PAGE_SIZE > PTRS_PER_PTE) |
3419 | if (1UL << val > PTRS_PER_PTE) | ||
3420 | return -EINVAL; | 2788 | return -EINVAL; |
3421 | fault_around_order = val; | 2789 | fault_around_bytes = val; |
3422 | return 0; | 2790 | return 0; |
3423 | } | 2791 | } |
3424 | DEFINE_SIMPLE_ATTRIBUTE(fault_around_order_fops, | 2792 | DEFINE_SIMPLE_ATTRIBUTE(fault_around_bytes_fops, |
3425 | fault_around_order_get, fault_around_order_set, "%llu\n"); | 2793 | fault_around_bytes_get, fault_around_bytes_set, "%llu\n"); |
3426 | 2794 | ||
3427 | static int __init fault_around_debugfs(void) | 2795 | static int __init fault_around_debugfs(void) |
3428 | { | 2796 | { |
3429 | void *ret; | 2797 | void *ret; |
3430 | 2798 | ||
3431 | ret = debugfs_create_file("fault_around_order", 0644, NULL, NULL, | 2799 | ret = debugfs_create_file("fault_around_bytes", 0644, NULL, NULL, |
3432 | &fault_around_order_fops); | 2800 | &fault_around_bytes_fops); |
3433 | if (!ret) | 2801 | if (!ret) |
3434 | pr_warn("Failed to create fault_around_order in debugfs"); | 2802 | pr_warn("Failed to create fault_around_bytes in debugfs"); |
3435 | return 0; | 2803 | return 0; |
3436 | } | 2804 | } |
3437 | late_initcall(fault_around_debugfs); | 2805 | late_initcall(fault_around_debugfs); |
3438 | |||
3439 | static inline unsigned long fault_around_pages(void) | ||
3440 | { | ||
3441 | return 1UL << fault_around_order; | ||
3442 | } | ||
3443 | |||
3444 | static inline unsigned long fault_around_mask(void) | ||
3445 | { | ||
3446 | return ~((1UL << (PAGE_SHIFT + fault_around_order)) - 1); | ||
3447 | } | ||
3448 | #else | ||
3449 | static inline unsigned long fault_around_pages(void) | ||
3450 | { | ||
3451 | unsigned long nr_pages; | ||
3452 | |||
3453 | nr_pages = 1UL << FAULT_AROUND_ORDER; | ||
3454 | BUILD_BUG_ON(nr_pages > PTRS_PER_PTE); | ||
3455 | return nr_pages; | ||
3456 | } | ||
3457 | |||
3458 | static inline unsigned long fault_around_mask(void) | ||
3459 | { | ||
3460 | return ~((1UL << (PAGE_SHIFT + FAULT_AROUND_ORDER)) - 1); | ||
3461 | } | ||
3462 | #endif | 2806 | #endif |
3463 | 2807 | ||
2808 | /* | ||
2809 | * do_fault_around() tries to map a few pages around the fault address. The hope | ||
2810 | * is that the pages will be needed soon and this will lower the number of | ||
2811 | * faults to handle. | ||
2812 | * | ||
2813 | * It uses vm_ops->map_pages() to map the pages, which skips the page if it's | ||
2814 | * not ready to be mapped: not up-to-date, locked, etc. | ||
2815 | * | ||
2816 | * This function is called with the page table lock taken. In the split ptlock | ||
2817 | * case the page table lock protects only those entries which belong to | ||
2818 | * the page table corresponding to the fault address. | ||
2819 | * | ||
2820 | * This function doesn't cross the VMA boundaries, in order to call map_pages() | ||
2821 | * only once. | ||
2822 | * | ||
2823 | * fault_around_pages() defines how many pages we'll try to map. | ||
2824 | * do_fault_around() expects it to return a power of two less than or equal to | ||
2825 | * PTRS_PER_PTE. | ||
2826 | * | ||
2827 | * The virtual address of the area that we map is naturally aligned to the | ||
2828 | * fault_around_pages() value (and therefore to page order). This way it's | ||
2829 | * easier to guarantee that we don't cross page table boundaries. | ||
2830 | */ | ||
3464 | static void do_fault_around(struct vm_area_struct *vma, unsigned long address, | 2831 | static void do_fault_around(struct vm_area_struct *vma, unsigned long address, |
3465 | pte_t *pte, pgoff_t pgoff, unsigned int flags) | 2832 | pte_t *pte, pgoff_t pgoff, unsigned int flags) |
3466 | { | 2833 | { |
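For concreteness, the rounding works out as follows with the default value and 4 KiB pages; the knob can be tuned at runtime through the fault_around_bytes file created above in the debugfs root. The figures below are a worked example, not code from the patch.

	/* Worked example, assuming PAGE_SIZE == 4096 (PAGE_SHIFT == 12):          */
	/*   fault_around_bytes   = 65536 (default)                                */
	/*   fault_around_pages() = rounddown_pow_of_two(65536) / 4096 = 16        */
	/*   fault_around_mask()  = ~(65536 - 1) & PAGE_MASK = ~0xffffUL           */
	/* so do_fault_around() starts at max(address & ~0xffffUL, vma->vm_start)  */
	/* and asks ->map_pages() for at most 16 ptes within one page table.       */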
@@ -3476,7 +2843,7 @@ static void do_fault_around(struct vm_area_struct *vma, unsigned long address, | |||
3476 | 2843 | ||
3477 | /* | 2844 | /* |
3478 | * max_pgoff is either end of page table or end of vma | 2845 | * max_pgoff is either end of page table or end of vma |
3479 | * or fault_around_pages() from pgoff, depending what is neast. | 2846 | * or fault_around_pages() from pgoff, depending what is nearest. |
3480 | */ | 2847 | */ |
3481 | max_pgoff = pgoff - ((start_addr >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) + | 2848 | max_pgoff = pgoff - ((start_addr >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) + |
3482 | PTRS_PER_PTE - 1; | 2849 | PTRS_PER_PTE - 1; |
@@ -3515,7 +2882,7 @@ static int do_read_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
3515 | * if page by the offset is not ready to be mapped (cold cache or | 2882 | * if page by the offset is not ready to be mapped (cold cache or |
3516 | * something). | 2883 | * something). |
3517 | */ | 2884 | */ |
3518 | if (vma->vm_ops->map_pages) { | 2885 | if (vma->vm_ops->map_pages && fault_around_pages() > 1) { |
3519 | pte = pte_offset_map_lock(mm, pmd, address, &ptl); | 2886 | pte = pte_offset_map_lock(mm, pmd, address, &ptl); |
3520 | do_fault_around(vma, address, pte, pgoff, flags); | 2887 | do_fault_around(vma, address, pte, pgoff, flags); |
3521 | if (!pte_same(*pte, orig_pte)) | 2888 | if (!pte_same(*pte, orig_pte)) |
@@ -3920,9 +3287,6 @@ static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
3920 | } | 3287 | } |
3921 | } | 3288 | } |
3922 | 3289 | ||
3923 | /* THP should already have been handled */ | ||
3924 | BUG_ON(pmd_numa(*pmd)); | ||
3925 | |||
3926 | /* | 3290 | /* |
3927 | * Use __pte_alloc instead of pte_alloc_map, because we can't | 3291 | * Use __pte_alloc instead of pte_alloc_map, because we can't |
3928 | * run pte_offset_map on the pmd, if an huge pmd could | 3292 | * run pte_offset_map on the pmd, if an huge pmd could |
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index a650db29606f..469bbf505f85 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c | |||
@@ -46,19 +46,84 @@ | |||
46 | static void generic_online_page(struct page *page); | 46 | static void generic_online_page(struct page *page); |
47 | 47 | ||
48 | static online_page_callback_t online_page_callback = generic_online_page; | 48 | static online_page_callback_t online_page_callback = generic_online_page; |
49 | static DEFINE_MUTEX(online_page_callback_lock); | ||
49 | 50 | ||
50 | DEFINE_MUTEX(mem_hotplug_mutex); | 51 | /* The same as the cpu_hotplug lock, but for memory hotplug. */ |
52 | static struct { | ||
53 | struct task_struct *active_writer; | ||
54 | struct mutex lock; /* Synchronizes accesses to refcount, */ | ||
55 | /* | ||
56 | * Also blocks the new readers during | ||
57 | * an ongoing mem hotplug operation. | ||
58 | */ | ||
59 | int refcount; | ||
60 | |||
61 | #ifdef CONFIG_DEBUG_LOCK_ALLOC | ||
62 | struct lockdep_map dep_map; | ||
63 | #endif | ||
64 | } mem_hotplug = { | ||
65 | .active_writer = NULL, | ||
66 | .lock = __MUTEX_INITIALIZER(mem_hotplug.lock), | ||
67 | .refcount = 0, | ||
68 | #ifdef CONFIG_DEBUG_LOCK_ALLOC | ||
69 | .dep_map = {.name = "mem_hotplug.lock" }, | ||
70 | #endif | ||
71 | }; | ||
72 | |||
73 | /* Lockdep annotations for get/put_online_mems() and mem_hotplug_begin/end() */ | ||
74 | #define memhp_lock_acquire_read() lock_map_acquire_read(&mem_hotplug.dep_map) | ||
75 | #define memhp_lock_acquire() lock_map_acquire(&mem_hotplug.dep_map) | ||
76 | #define memhp_lock_release() lock_map_release(&mem_hotplug.dep_map) | ||
77 | |||
78 | void get_online_mems(void) | ||
79 | { | ||
80 | might_sleep(); | ||
81 | if (mem_hotplug.active_writer == current) | ||
82 | return; | ||
83 | memhp_lock_acquire_read(); | ||
84 | mutex_lock(&mem_hotplug.lock); | ||
85 | mem_hotplug.refcount++; | ||
86 | mutex_unlock(&mem_hotplug.lock); | ||
87 | |||
88 | } | ||
51 | 89 | ||
52 | void lock_memory_hotplug(void) | 90 | void put_online_mems(void) |
53 | { | 91 | { |
54 | mutex_lock(&mem_hotplug_mutex); | 92 | if (mem_hotplug.active_writer == current) |
93 | return; | ||
94 | mutex_lock(&mem_hotplug.lock); | ||
95 | |||
96 | if (WARN_ON(!mem_hotplug.refcount)) | ||
97 | mem_hotplug.refcount++; /* try to fix things up */ | ||
98 | |||
99 | if (!--mem_hotplug.refcount && unlikely(mem_hotplug.active_writer)) | ||
100 | wake_up_process(mem_hotplug.active_writer); | ||
101 | mutex_unlock(&mem_hotplug.lock); | ||
102 | memhp_lock_release(); | ||
103 | |||
55 | } | 104 | } |
56 | 105 | ||
57 | void unlock_memory_hotplug(void) | 106 | static void mem_hotplug_begin(void) |
58 | { | 107 | { |
59 | mutex_unlock(&mem_hotplug_mutex); | 108 | mem_hotplug.active_writer = current; |
109 | |||
110 | memhp_lock_acquire(); | ||
111 | for (;;) { | ||
112 | mutex_lock(&mem_hotplug.lock); | ||
113 | if (likely(!mem_hotplug.refcount)) | ||
114 | break; | ||
115 | __set_current_state(TASK_UNINTERRUPTIBLE); | ||
116 | mutex_unlock(&mem_hotplug.lock); | ||
117 | schedule(); | ||
118 | } | ||
60 | } | 119 | } |
61 | 120 | ||
121 | static void mem_hotplug_done(void) | ||
122 | { | ||
123 | mem_hotplug.active_writer = NULL; | ||
124 | mutex_unlock(&mem_hotplug.lock); | ||
125 | memhp_lock_release(); | ||
126 | } | ||
62 | 127 | ||
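A short sketch of how the two sides pair up, in the same spirit as get/put_online_cpus(); the reader function below is hypothetical, while the writer pattern is the one used by online_pages() and add_memory() later in this file.

	/* Reader side: hold off hot-add/remove while we look at the memory layout. */
	static void inspect_memory_layout(void)
	{
		get_online_mems();
		/* ... walk zones/sections; no hotplug can run concurrently ... */
		put_online_mems();
	}

	/* Writer side (the hotplug paths below):
	 *	mem_hotplug_begin();
	 *	...online/offline pages, grow or shrink the node...
	 *	mem_hotplug_done();
	 */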
63 | /* add this memory to iomem resource */ | 128 | /* add this memory to iomem resource */ |
64 | static struct resource *register_memory_resource(u64 start, u64 size) | 129 | static struct resource *register_memory_resource(u64 start, u64 size) |
@@ -727,14 +792,16 @@ int set_online_page_callback(online_page_callback_t callback) | |||
727 | { | 792 | { |
728 | int rc = -EINVAL; | 793 | int rc = -EINVAL; |
729 | 794 | ||
730 | lock_memory_hotplug(); | 795 | get_online_mems(); |
796 | mutex_lock(&online_page_callback_lock); | ||
731 | 797 | ||
732 | if (online_page_callback == generic_online_page) { | 798 | if (online_page_callback == generic_online_page) { |
733 | online_page_callback = callback; | 799 | online_page_callback = callback; |
734 | rc = 0; | 800 | rc = 0; |
735 | } | 801 | } |
736 | 802 | ||
737 | unlock_memory_hotplug(); | 803 | mutex_unlock(&online_page_callback_lock); |
804 | put_online_mems(); | ||
738 | 805 | ||
739 | return rc; | 806 | return rc; |
740 | } | 807 | } |
@@ -744,14 +811,16 @@ int restore_online_page_callback(online_page_callback_t callback) | |||
744 | { | 811 | { |
745 | int rc = -EINVAL; | 812 | int rc = -EINVAL; |
746 | 813 | ||
747 | lock_memory_hotplug(); | 814 | get_online_mems(); |
815 | mutex_lock(&online_page_callback_lock); | ||
748 | 816 | ||
749 | if (online_page_callback == callback) { | 817 | if (online_page_callback == callback) { |
750 | online_page_callback = generic_online_page; | 818 | online_page_callback = generic_online_page; |
751 | rc = 0; | 819 | rc = 0; |
752 | } | 820 | } |
753 | 821 | ||
754 | unlock_memory_hotplug(); | 822 | mutex_unlock(&online_page_callback_lock); |
823 | put_online_mems(); | ||
755 | 824 | ||
756 | return rc; | 825 | return rc; |
757 | } | 826 | } |
@@ -899,7 +968,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ | |||
899 | int ret; | 968 | int ret; |
900 | struct memory_notify arg; | 969 | struct memory_notify arg; |
901 | 970 | ||
902 | lock_memory_hotplug(); | 971 | mem_hotplug_begin(); |
903 | /* | 972 | /* |
904 | * This doesn't need a lock to do pfn_to_page(). | 973 | * This doesn't need a lock to do pfn_to_page(). |
905 | * The section can't be removed here because of the | 974 | * The section can't be removed here because of the |
@@ -907,23 +976,18 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ | |||
907 | */ | 976 | */ |
908 | zone = page_zone(pfn_to_page(pfn)); | 977 | zone = page_zone(pfn_to_page(pfn)); |
909 | 978 | ||
979 | ret = -EINVAL; | ||
910 | if ((zone_idx(zone) > ZONE_NORMAL || online_type == ONLINE_MOVABLE) && | 980 | if ((zone_idx(zone) > ZONE_NORMAL || online_type == ONLINE_MOVABLE) && |
911 | !can_online_high_movable(zone)) { | 981 | !can_online_high_movable(zone)) |
912 | unlock_memory_hotplug(); | 982 | goto out; |
913 | return -EINVAL; | ||
914 | } | ||
915 | 983 | ||
916 | if (online_type == ONLINE_KERNEL && zone_idx(zone) == ZONE_MOVABLE) { | 984 | if (online_type == ONLINE_KERNEL && zone_idx(zone) == ZONE_MOVABLE) { |
917 | if (move_pfn_range_left(zone - 1, zone, pfn, pfn + nr_pages)) { | 985 | if (move_pfn_range_left(zone - 1, zone, pfn, pfn + nr_pages)) |
918 | unlock_memory_hotplug(); | 986 | goto out; |
919 | return -EINVAL; | ||
920 | } | ||
921 | } | 987 | } |
922 | if (online_type == ONLINE_MOVABLE && zone_idx(zone) == ZONE_MOVABLE - 1) { | 988 | if (online_type == ONLINE_MOVABLE && zone_idx(zone) == ZONE_MOVABLE - 1) { |
923 | if (move_pfn_range_right(zone, zone + 1, pfn, pfn + nr_pages)) { | 989 | if (move_pfn_range_right(zone, zone + 1, pfn, pfn + nr_pages)) |
924 | unlock_memory_hotplug(); | 990 | goto out; |
925 | return -EINVAL; | ||
926 | } | ||
927 | } | 991 | } |
928 | 992 | ||
929 | /* Previous code may have changed the zone of the pfn range */ | 993 | /* Previous code may have changed the zone of the pfn range */
@@ -939,8 +1003,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ | |||
939 | ret = notifier_to_errno(ret); | 1003 | ret = notifier_to_errno(ret); |
940 | if (ret) { | 1004 | if (ret) { |
941 | memory_notify(MEM_CANCEL_ONLINE, &arg); | 1005 | memory_notify(MEM_CANCEL_ONLINE, &arg); |
942 | unlock_memory_hotplug(); | 1006 | goto out; |
943 | return ret; | ||
944 | } | 1007 | } |
945 | /* | 1008 | /* |
946 | * If this zone is not populated, then it is not in zonelist. | 1009 | * If this zone is not populated, then it is not in zonelist. |
@@ -964,8 +1027,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ | |||
964 | (((unsigned long long) pfn + nr_pages) | 1027 | (((unsigned long long) pfn + nr_pages) |
965 | << PAGE_SHIFT) - 1); | 1028 | << PAGE_SHIFT) - 1); |
966 | memory_notify(MEM_CANCEL_ONLINE, &arg); | 1029 | memory_notify(MEM_CANCEL_ONLINE, &arg); |
967 | unlock_memory_hotplug(); | 1030 | goto out; |
968 | return ret; | ||
969 | } | 1031 | } |
970 | 1032 | ||
971 | zone->present_pages += onlined_pages; | 1033 | zone->present_pages += onlined_pages; |
@@ -995,9 +1057,9 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ | |||
995 | 1057 | ||
996 | if (onlined_pages) | 1058 | if (onlined_pages) |
997 | memory_notify(MEM_ONLINE, &arg); | 1059 | memory_notify(MEM_ONLINE, &arg); |
998 | unlock_memory_hotplug(); | 1060 | out: |
999 | 1061 | mem_hotplug_done(); | |
1000 | return 0; | 1062 | return ret; |
1001 | } | 1063 | } |
1002 | #endif /* CONFIG_MEMORY_HOTPLUG_SPARSE */ | 1064 | #endif /* CONFIG_MEMORY_HOTPLUG_SPARSE */ |
1003 | 1065 | ||
@@ -1007,7 +1069,7 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start) | |||
1007 | struct pglist_data *pgdat; | 1069 | struct pglist_data *pgdat; |
1008 | unsigned long zones_size[MAX_NR_ZONES] = {0}; | 1070 | unsigned long zones_size[MAX_NR_ZONES] = {0}; |
1009 | unsigned long zholes_size[MAX_NR_ZONES] = {0}; | 1071 | unsigned long zholes_size[MAX_NR_ZONES] = {0}; |
1010 | unsigned long start_pfn = start >> PAGE_SHIFT; | 1072 | unsigned long start_pfn = PFN_DOWN(start); |
1011 | 1073 | ||
1012 | pgdat = NODE_DATA(nid); | 1074 | pgdat = NODE_DATA(nid); |
1013 | if (!pgdat) { | 1075 | if (!pgdat) { |
@@ -1055,7 +1117,7 @@ int try_online_node(int nid) | |||
1055 | if (node_online(nid)) | 1117 | if (node_online(nid)) |
1056 | return 0; | 1118 | return 0; |
1057 | 1119 | ||
1058 | lock_memory_hotplug(); | 1120 | mem_hotplug_begin(); |
1059 | pgdat = hotadd_new_pgdat(nid, 0); | 1121 | pgdat = hotadd_new_pgdat(nid, 0); |
1060 | if (!pgdat) { | 1122 | if (!pgdat) { |
1061 | pr_err("Cannot online node %d due to NULL pgdat\n", nid); | 1123 | pr_err("Cannot online node %d due to NULL pgdat\n", nid); |
@@ -1073,13 +1135,13 @@ int try_online_node(int nid) | |||
1073 | } | 1135 | } |
1074 | 1136 | ||
1075 | out: | 1137 | out: |
1076 | unlock_memory_hotplug(); | 1138 | mem_hotplug_done(); |
1077 | return ret; | 1139 | return ret; |
1078 | } | 1140 | } |
1079 | 1141 | ||
1080 | static int check_hotplug_memory_range(u64 start, u64 size) | 1142 | static int check_hotplug_memory_range(u64 start, u64 size) |
1081 | { | 1143 | { |
1082 | u64 start_pfn = start >> PAGE_SHIFT; | 1144 | u64 start_pfn = PFN_DOWN(start); |
1083 | u64 nr_pages = size >> PAGE_SHIFT; | 1145 | u64 nr_pages = size >> PAGE_SHIFT; |
1084 | 1146 | ||
1085 | /* Memory range must be aligned with section */ | 1147 | /* Memory range must be aligned with section */ |
@@ -1117,7 +1179,7 @@ int __ref add_memory(int nid, u64 start, u64 size) | |||
1117 | new_pgdat = !p; | 1179 | new_pgdat = !p; |
1118 | } | 1180 | } |
1119 | 1181 | ||
1120 | lock_memory_hotplug(); | 1182 | mem_hotplug_begin(); |
1121 | 1183 | ||
1122 | new_node = !node_online(nid); | 1184 | new_node = !node_online(nid); |
1123 | if (new_node) { | 1185 | if (new_node) { |
@@ -1158,7 +1220,7 @@ error: | |||
1158 | release_memory_resource(res); | 1220 | release_memory_resource(res); |
1159 | 1221 | ||
1160 | out: | 1222 | out: |
1161 | unlock_memory_hotplug(); | 1223 | mem_hotplug_done(); |
1162 | return ret; | 1224 | return ret; |
1163 | } | 1225 | } |
1164 | EXPORT_SYMBOL_GPL(add_memory); | 1226 | EXPORT_SYMBOL_GPL(add_memory); |
@@ -1332,7 +1394,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) | |||
1332 | * alloc_migrate_target should be improooooved!! | 1394 | * alloc_migrate_target should be improooooved!! |
1333 | * migrate_pages returns # of failed pages. | 1395 | * migrate_pages returns # of failed pages. |
1334 | */ | 1396 | */ |
1335 | ret = migrate_pages(&source, alloc_migrate_target, 0, | 1397 | ret = migrate_pages(&source, alloc_migrate_target, NULL, 0, |
1336 | MIGRATE_SYNC, MR_MEMORY_HOTPLUG); | 1398 | MIGRATE_SYNC, MR_MEMORY_HOTPLUG); |
1337 | if (ret) | 1399 | if (ret) |
1338 | putback_movable_pages(&source); | 1400 | putback_movable_pages(&source); |
@@ -1565,7 +1627,7 @@ static int __ref __offline_pages(unsigned long start_pfn, | |||
1565 | if (!test_pages_in_a_zone(start_pfn, end_pfn)) | 1627 | if (!test_pages_in_a_zone(start_pfn, end_pfn)) |
1566 | return -EINVAL; | 1628 | return -EINVAL; |
1567 | 1629 | ||
1568 | lock_memory_hotplug(); | 1630 | mem_hotplug_begin(); |
1569 | 1631 | ||
1570 | zone = page_zone(pfn_to_page(start_pfn)); | 1632 | zone = page_zone(pfn_to_page(start_pfn)); |
1571 | node = zone_to_nid(zone); | 1633 | node = zone_to_nid(zone); |
@@ -1672,7 +1734,7 @@ repeat: | |||
1672 | writeback_set_ratelimit(); | 1734 | writeback_set_ratelimit(); |
1673 | 1735 | ||
1674 | memory_notify(MEM_OFFLINE, &arg); | 1736 | memory_notify(MEM_OFFLINE, &arg); |
1675 | unlock_memory_hotplug(); | 1737 | mem_hotplug_done(); |
1676 | return 0; | 1738 | return 0; |
1677 | 1739 | ||
1678 | failed_removal: | 1740 | failed_removal: |
@@ -1684,7 +1746,7 @@ failed_removal: | |||
1684 | undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE); | 1746 | undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE); |
1685 | 1747 | ||
1686 | out: | 1748 | out: |
1687 | unlock_memory_hotplug(); | 1749 | mem_hotplug_done(); |
1688 | return ret; | 1750 | return ret; |
1689 | } | 1751 | } |
1690 | 1752 | ||
@@ -1888,7 +1950,7 @@ void __ref remove_memory(int nid, u64 start, u64 size) | |||
1888 | 1950 | ||
1889 | BUG_ON(check_hotplug_memory_range(start, size)); | 1951 | BUG_ON(check_hotplug_memory_range(start, size)); |
1890 | 1952 | ||
1891 | lock_memory_hotplug(); | 1953 | mem_hotplug_begin(); |
1892 | 1954 | ||
1893 | /* | 1955 | /* |
1894 | * All memory blocks must be offlined before removing memory. Check | 1956 | * All memory blocks must be offlined before removing memory. Check |
@@ -1897,10 +1959,8 @@ void __ref remove_memory(int nid, u64 start, u64 size) | |||
1897 | */ | 1959 | */ |
1898 | ret = walk_memory_range(PFN_DOWN(start), PFN_UP(start + size - 1), NULL, | 1960 | ret = walk_memory_range(PFN_DOWN(start), PFN_UP(start + size - 1), NULL, |
1899 | check_memblock_offlined_cb); | 1961 | check_memblock_offlined_cb); |
1900 | if (ret) { | 1962 | if (ret) |
1901 | unlock_memory_hotplug(); | ||
1902 | BUG(); | 1963 | BUG(); |
1903 | } | ||
1904 | 1964 | ||
1905 | /* remove memmap entry */ | 1965 | /* remove memmap entry */ |
1906 | firmware_map_remove(start, start + size, "System RAM"); | 1966 | firmware_map_remove(start, start + size, "System RAM"); |
@@ -1909,7 +1969,7 @@ void __ref remove_memory(int nid, u64 start, u64 size) | |||
1909 | 1969 | ||
1910 | try_offline_node(nid); | 1970 | try_offline_node(nid); |
1911 | 1971 | ||
1912 | unlock_memory_hotplug(); | 1972 | mem_hotplug_done(); |
1913 | } | 1973 | } |
1914 | EXPORT_SYMBOL_GPL(remove_memory); | 1974 | EXPORT_SYMBOL_GPL(remove_memory); |
1915 | #endif /* CONFIG_MEMORY_HOTREMOVE */ | 1975 | #endif /* CONFIG_MEMORY_HOTREMOVE */ |
diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 78e1472933ea..16bc9fa42998 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c | |||
@@ -1028,7 +1028,7 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest, | |||
1028 | flags | MPOL_MF_DISCONTIG_OK, &pagelist); | 1028 | flags | MPOL_MF_DISCONTIG_OK, &pagelist); |
1029 | 1029 | ||
1030 | if (!list_empty(&pagelist)) { | 1030 | if (!list_empty(&pagelist)) { |
1031 | err = migrate_pages(&pagelist, new_node_page, dest, | 1031 | err = migrate_pages(&pagelist, new_node_page, NULL, dest, |
1032 | MIGRATE_SYNC, MR_SYSCALL); | 1032 | MIGRATE_SYNC, MR_SYSCALL); |
1033 | if (err) | 1033 | if (err) |
1034 | putback_movable_pages(&pagelist); | 1034 | putback_movable_pages(&pagelist); |
@@ -1277,7 +1277,7 @@ static long do_mbind(unsigned long start, unsigned long len, | |||
1277 | if (!list_empty(&pagelist)) { | 1277 | if (!list_empty(&pagelist)) { |
1278 | WARN_ON_ONCE(flags & MPOL_MF_LAZY); | 1278 | WARN_ON_ONCE(flags & MPOL_MF_LAZY); |
1279 | nr_failed = migrate_pages(&pagelist, new_vma_page, | 1279 | nr_failed = migrate_pages(&pagelist, new_vma_page, |
1280 | (unsigned long)vma, | 1280 | NULL, (unsigned long)vma, |
1281 | MIGRATE_SYNC, MR_MEMPOLICY_MBIND); | 1281 | MIGRATE_SYNC, MR_MEMPOLICY_MBIND); |
1282 | if (nr_failed) | 1282 | if (nr_failed) |
1283 | putback_movable_pages(&pagelist); | 1283 | putback_movable_pages(&pagelist); |
@@ -1362,7 +1362,7 @@ static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode, | |||
1362 | } | 1362 | } |
1363 | 1363 | ||
1364 | SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len, | 1364 | SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len, |
1365 | unsigned long, mode, unsigned long __user *, nmask, | 1365 | unsigned long, mode, const unsigned long __user *, nmask, |
1366 | unsigned long, maxnode, unsigned, flags) | 1366 | unsigned long, maxnode, unsigned, flags) |
1367 | { | 1367 | { |
1368 | nodemask_t nodes; | 1368 | nodemask_t nodes; |
@@ -1383,7 +1383,7 @@ SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len, | |||
1383 | } | 1383 | } |
1384 | 1384 | ||
1385 | /* Set the process memory policy */ | 1385 | /* Set the process memory policy */ |
1386 | SYSCALL_DEFINE3(set_mempolicy, int, mode, unsigned long __user *, nmask, | 1386 | SYSCALL_DEFINE3(set_mempolicy, int, mode, const unsigned long __user *, nmask, |
1387 | unsigned long, maxnode) | 1387 | unsigned long, maxnode) |
1388 | { | 1388 | { |
1389 | int err; | 1389 | int err; |
@@ -1606,9 +1606,9 @@ COMPAT_SYSCALL_DEFINE6(mbind, compat_ulong_t, start, compat_ulong_t, len, | |||
1606 | 1606 | ||
1607 | /* | 1607 | /* |
1608 | * get_vma_policy(@task, @vma, @addr) | 1608 | * get_vma_policy(@task, @vma, @addr) |
1609 | * @task - task for fallback if vma policy == default | 1609 | * @task: task for fallback if vma policy == default |
1610 | * @vma - virtual memory area whose policy is sought | 1610 | * @vma: virtual memory area whose policy is sought |
1611 | * @addr - address in @vma for shared policy lookup | 1611 | * @addr: address in @vma for shared policy lookup |
1612 | * | 1612 | * |
1613 | * Returns effective policy for a VMA at specified address. | 1613 | * Returns effective policy for a VMA at specified address. |
1614 | * Falls back to @task or system default policy, as necessary. | 1614 | * Falls back to @task or system default policy, as necessary. |
@@ -1854,11 +1854,11 @@ int node_random(const nodemask_t *maskp) | |||
1854 | #ifdef CONFIG_HUGETLBFS | 1854 | #ifdef CONFIG_HUGETLBFS |
1855 | /* | 1855 | /* |
1856 | * huge_zonelist(@vma, @addr, @gfp_flags, @mpol) | 1856 | * huge_zonelist(@vma, @addr, @gfp_flags, @mpol) |
1857 | * @vma = virtual memory area whose policy is sought | 1857 | * @vma: virtual memory area whose policy is sought |
1858 | * @addr = address in @vma for shared policy lookup and interleave policy | 1858 | * @addr: address in @vma for shared policy lookup and interleave policy |
1859 | * @gfp_flags = for requested zone | 1859 | * @gfp_flags: for requested zone |
1860 | * @mpol = pointer to mempolicy pointer for reference counted mempolicy | 1860 | * @mpol: pointer to mempolicy pointer for reference counted mempolicy |
1861 | * @nodemask = pointer to nodemask pointer for MPOL_BIND nodemask | 1861 | * @nodemask: pointer to nodemask pointer for MPOL_BIND nodemask |
1862 | * | 1862 | * |
1863 | * Returns a zonelist suitable for a huge page allocation and a pointer | 1863 | * Returns a zonelist suitable for a huge page allocation and a pointer |
1864 | * to the struct mempolicy for conditional unref after allocation. | 1864 | * to the struct mempolicy for conditional unref after allocation. |
@@ -2270,9 +2270,9 @@ static void sp_free(struct sp_node *n) | |||
2270 | /** | 2270 | /** |
2271 | * mpol_misplaced - check whether current page node is valid in policy | 2271 | * mpol_misplaced - check whether current page node is valid in policy |
2272 | * | 2272 | * |
2273 | * @page - page to be checked | 2273 | * @page: page to be checked |
2274 | * @vma - vm area where page mapped | 2274 | * @vma: vm area where page mapped |
2275 | * @addr - virtual address where page mapped | 2275 | * @addr: virtual address where page mapped |
2276 | * | 2276 | * |
2277 | * Lookup current policy node id for vma,addr and "compare to" page's | 2277 | * Lookup current policy node id for vma,addr and "compare to" page's |
2278 | * node id. | 2278 | * node id. |
diff --git a/mm/mempool.c b/mm/mempool.c index 905434f18c97..455d468c3a5d 100644 --- a/mm/mempool.c +++ b/mm/mempool.c | |||
@@ -192,6 +192,7 @@ EXPORT_SYMBOL(mempool_resize); | |||
192 | * returns NULL. Note that due to preallocation, this function | 192 | * returns NULL. Note that due to preallocation, this function |
193 | * *never* fails when called from process contexts. (it might | 193 | * *never* fails when called from process contexts. (it might |
194 | * fail if called from an IRQ context.) | 194 | * fail if called from an IRQ context.) |
195 | * Note: using __GFP_ZERO is not supported. | ||
195 | */ | 196 | */ |
196 | void * mempool_alloc(mempool_t *pool, gfp_t gfp_mask) | 197 | void * mempool_alloc(mempool_t *pool, gfp_t gfp_mask) |
197 | { | 198 | { |
@@ -200,6 +201,7 @@ void * mempool_alloc(mempool_t *pool, gfp_t gfp_mask) | |||
200 | wait_queue_t wait; | 201 | wait_queue_t wait; |
201 | gfp_t gfp_temp; | 202 | gfp_t gfp_temp; |
202 | 203 | ||
204 | VM_WARN_ON_ONCE(gfp_mask & __GFP_ZERO); | ||
203 | might_sleep_if(gfp_mask & __GFP_WAIT); | 205 | might_sleep_if(gfp_mask & __GFP_WAIT); |
204 | 206 | ||
205 | gfp_mask |= __GFP_NOMEMALLOC; /* don't allocate emergency reserves */ | 207 | gfp_mask |= __GFP_NOMEMALLOC; /* don't allocate emergency reserves */ |
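A tiny sketch of the rule the new warning enforces: callers that need zeroed elements should clear them after mempool_alloc() rather than passing __GFP_ZERO. The pool and element type here are hypothetical.

	/* Hypothetical element type and pool; __GFP_ZERO would trip the VM_WARN_ON_ONCE() above. */
	struct foo_elem *elem = mempool_alloc(foo_pool, GFP_NOIO);
	if (elem)
		memset(elem, 0, sizeof(*elem));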
diff --git a/mm/migrate.c b/mm/migrate.c index bed48809e5d0..63f0cd559999 100644 --- a/mm/migrate.c +++ b/mm/migrate.c | |||
@@ -938,8 +938,9 @@ out: | |||
938 | * Obtain the lock on page, remove all ptes and migrate the page | 938 | * Obtain the lock on page, remove all ptes and migrate the page |
939 | * to the newly allocated page in newpage. | 939 | * to the newly allocated page in newpage. |
940 | */ | 940 | */ |
941 | static int unmap_and_move(new_page_t get_new_page, unsigned long private, | 941 | static int unmap_and_move(new_page_t get_new_page, free_page_t put_new_page, |
942 | struct page *page, int force, enum migrate_mode mode) | 942 | unsigned long private, struct page *page, int force, |
943 | enum migrate_mode mode) | ||
943 | { | 944 | { |
944 | int rc = 0; | 945 | int rc = 0; |
945 | int *result = NULL; | 946 | int *result = NULL; |
@@ -983,11 +984,17 @@ out: | |||
983 | page_is_file_cache(page)); | 984 | page_is_file_cache(page)); |
984 | putback_lru_page(page); | 985 | putback_lru_page(page); |
985 | } | 986 | } |
987 | |||
986 | /* | 988 | /* |
987 | * Move the new page to the LRU. If migration was not successful | 989 | * If migration was not successful and there's a freeing callback, use |
988 | * then this will free the page. | 990 | * it. Otherwise, putback_lru_page() will drop the reference grabbed |
991 | * during isolation. | ||
989 | */ | 992 | */ |
990 | putback_lru_page(newpage); | 993 | if (rc != MIGRATEPAGE_SUCCESS && put_new_page) |
994 | put_new_page(newpage, private); | ||
995 | else | ||
996 | putback_lru_page(newpage); | ||
997 | |||
991 | if (result) { | 998 | if (result) { |
992 | if (rc) | 999 | if (rc) |
993 | *result = rc; | 1000 | *result = rc; |
@@ -1016,8 +1023,9 @@ out: | |||
1016 | * will wait in the page fault for migration to complete. | 1023 | * will wait in the page fault for migration to complete. |
1017 | */ | 1024 | */ |
1018 | static int unmap_and_move_huge_page(new_page_t get_new_page, | 1025 | static int unmap_and_move_huge_page(new_page_t get_new_page, |
1019 | unsigned long private, struct page *hpage, | 1026 | free_page_t put_new_page, unsigned long private, |
1020 | int force, enum migrate_mode mode) | 1027 | struct page *hpage, int force, |
1028 | enum migrate_mode mode) | ||
1021 | { | 1029 | { |
1022 | int rc = 0; | 1030 | int rc = 0; |
1023 | int *result = NULL; | 1031 | int *result = NULL; |
@@ -1031,7 +1039,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, | |||
1031 | * tables or check whether the hugepage is pmd-based or not before | 1039 | * tables or check whether the hugepage is pmd-based or not before |
1032 | * kicking migration. | 1040 | * kicking migration. |
1033 | */ | 1041 | */ |
1034 | if (!hugepage_migration_support(page_hstate(hpage))) { | 1042 | if (!hugepage_migration_supported(page_hstate(hpage))) { |
1035 | putback_active_hugepage(hpage); | 1043 | putback_active_hugepage(hpage); |
1036 | return -ENOSYS; | 1044 | return -ENOSYS; |
1037 | } | 1045 | } |
@@ -1056,20 +1064,30 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, | |||
1056 | if (!page_mapped(hpage)) | 1064 | if (!page_mapped(hpage)) |
1057 | rc = move_to_new_page(new_hpage, hpage, 1, mode); | 1065 | rc = move_to_new_page(new_hpage, hpage, 1, mode); |
1058 | 1066 | ||
1059 | if (rc) | 1067 | if (rc != MIGRATEPAGE_SUCCESS) |
1060 | remove_migration_ptes(hpage, hpage); | 1068 | remove_migration_ptes(hpage, hpage); |
1061 | 1069 | ||
1062 | if (anon_vma) | 1070 | if (anon_vma) |
1063 | put_anon_vma(anon_vma); | 1071 | put_anon_vma(anon_vma); |
1064 | 1072 | ||
1065 | if (!rc) | 1073 | if (rc == MIGRATEPAGE_SUCCESS) |
1066 | hugetlb_cgroup_migrate(hpage, new_hpage); | 1074 | hugetlb_cgroup_migrate(hpage, new_hpage); |
1067 | 1075 | ||
1068 | unlock_page(hpage); | 1076 | unlock_page(hpage); |
1069 | out: | 1077 | out: |
1070 | if (rc != -EAGAIN) | 1078 | if (rc != -EAGAIN) |
1071 | putback_active_hugepage(hpage); | 1079 | putback_active_hugepage(hpage); |
1072 | put_page(new_hpage); | 1080 | |
1081 | /* | ||
1082 | * If migration was not successful and there's a freeing callback, use | ||
1083 | * it. Otherwise, put_page() will drop the reference grabbed during | ||
1084 | * isolation. | ||
1085 | */ | ||
1086 | if (rc != MIGRATEPAGE_SUCCESS && put_new_page) | ||
1087 | put_new_page(new_hpage, private); | ||
1088 | else | ||
1089 | put_page(new_hpage); | ||
1090 | |||
1073 | if (result) { | 1091 | if (result) { |
1074 | if (rc) | 1092 | if (rc) |
1075 | *result = rc; | 1093 | *result = rc; |
@@ -1086,6 +1104,8 @@ out: | |||
1086 | * @from: The list of pages to be migrated. | 1104 | * @from: The list of pages to be migrated. |
1087 | * @get_new_page: The function used to allocate free pages to be used | 1105 | * @get_new_page: The function used to allocate free pages to be used |
1088 | * as the target of the page migration. | 1106 | * as the target of the page migration. |
1107 | * @put_new_page: The function used to free target pages if migration | ||
1108 | * fails, or NULL if no special handling is necessary. | ||
1089 | * @private: Private data to be passed on to get_new_page() | 1109 | * @private: Private data to be passed on to get_new_page() |
1090 | * @mode: The migration mode that specifies the constraints for | 1110 | * @mode: The migration mode that specifies the constraints for |
1091 | * page migration, if any. | 1111 | * page migration, if any. |
@@ -1099,7 +1119,8 @@ out: | |||
1099 | * Returns the number of pages that were not migrated, or an error code. | 1119 | * Returns the number of pages that were not migrated, or an error code. |
1100 | */ | 1120 | */ |
1101 | int migrate_pages(struct list_head *from, new_page_t get_new_page, | 1121 | int migrate_pages(struct list_head *from, new_page_t get_new_page, |
1102 | unsigned long private, enum migrate_mode mode, int reason) | 1122 | free_page_t put_new_page, unsigned long private, |
1123 | enum migrate_mode mode, int reason) | ||
1103 | { | 1124 | { |
1104 | int retry = 1; | 1125 | int retry = 1; |
1105 | int nr_failed = 0; | 1126 | int nr_failed = 0; |
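To illustrate the new callback pair: a caller now supplies both an allocator and, optionally, a matching free routine for target pages that end up unused. The stand-ins below are hypothetical, not taken from this series.

	/* Stand-in new_page_t / free_page_t pair for a migrate_pages() caller. */
	static struct page *demo_alloc_target(struct page *page,
					      unsigned long private, int **result)
	{
		return alloc_page(GFP_HIGHUSER_MOVABLE);
	}

	static void demo_free_target(struct page *page, unsigned long private)
	{
		__free_page(page);	/* target was never used; give it back */
	}

	/* err = migrate_pages(&pagelist, demo_alloc_target, demo_free_target,
	 *		       0, MIGRATE_SYNC, MR_MEMORY_HOTPLUG);
	 */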
@@ -1121,10 +1142,11 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page, | |||
1121 | 1142 | ||
1122 | if (PageHuge(page)) | 1143 | if (PageHuge(page)) |
1123 | rc = unmap_and_move_huge_page(get_new_page, | 1144 | rc = unmap_and_move_huge_page(get_new_page, |
1124 | private, page, pass > 2, mode); | 1145 | put_new_page, private, page, |
1146 | pass > 2, mode); | ||
1125 | else | 1147 | else |
1126 | rc = unmap_and_move(get_new_page, private, | 1148 | rc = unmap_and_move(get_new_page, put_new_page, |
1127 | page, pass > 2, mode); | 1149 | private, page, pass > 2, mode); |
1128 | 1150 | ||
1129 | switch(rc) { | 1151 | switch(rc) { |
1130 | case -ENOMEM: | 1152 | case -ENOMEM: |
@@ -1273,7 +1295,7 @@ set_status: | |||
1273 | 1295 | ||
1274 | err = 0; | 1296 | err = 0; |
1275 | if (!list_empty(&pagelist)) { | 1297 | if (!list_empty(&pagelist)) { |
1276 | err = migrate_pages(&pagelist, new_page_node, | 1298 | err = migrate_pages(&pagelist, new_page_node, NULL, |
1277 | (unsigned long)pm, MIGRATE_SYNC, MR_SYSCALL); | 1299 | (unsigned long)pm, MIGRATE_SYNC, MR_SYSCALL); |
1278 | if (err) | 1300 | if (err) |
1279 | putback_movable_pages(&pagelist); | 1301 | putback_movable_pages(&pagelist); |
@@ -1729,7 +1751,8 @@ int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma, | |||
1729 | 1751 | ||
1730 | list_add(&page->lru, &migratepages); | 1752 | list_add(&page->lru, &migratepages); |
1731 | nr_remaining = migrate_pages(&migratepages, alloc_misplaced_dst_page, | 1753 | nr_remaining = migrate_pages(&migratepages, alloc_misplaced_dst_page, |
1732 | node, MIGRATE_ASYNC, MR_NUMA_MISPLACED); | 1754 | NULL, node, MIGRATE_ASYNC, |
1755 | MR_NUMA_MISPLACED); | ||
1733 | if (nr_remaining) { | 1756 | if (nr_remaining) { |
1734 | if (!list_empty(&migratepages)) { | 1757 | if (!list_empty(&migratepages)) { |
1735 | list_del(&page->lru); | 1758 | list_del(&page->lru); |
@@ -1852,7 +1875,7 @@ fail_putback: | |||
1852 | * guarantee the copy is visible before the pagetable update. | 1875 | * guarantee the copy is visible before the pagetable update. |
1853 | */ | 1876 | */ |
1854 | flush_cache_range(vma, mmun_start, mmun_end); | 1877 | flush_cache_range(vma, mmun_start, mmun_end); |
1855 | page_add_new_anon_rmap(new_page, vma, mmun_start); | 1878 | page_add_anon_rmap(new_page, vma, mmun_start); |
1856 | pmdp_clear_flush(vma, mmun_start, pmd); | 1879 | pmdp_clear_flush(vma, mmun_start, pmd); |
1857 | set_pmd_at(mm, mmun_start, pmd, entry); | 1880 | set_pmd_at(mm, mmun_start, pmd, entry); |
1858 | flush_tlb_range(vma, mmun_start, mmun_end); | 1881 | flush_tlb_range(vma, mmun_start, mmun_end); |
@@ -1877,6 +1900,10 @@ fail_putback: | |||
1877 | spin_unlock(ptl); | 1900 | spin_unlock(ptl); |
1878 | mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); | 1901 | mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); |
1879 | 1902 | ||
1903 | /* Take an "isolate" reference and put new page on the LRU. */ | ||
1904 | get_page(new_page); | ||
1905 | putback_lru_page(new_page); | ||
1906 | |||
1880 | unlock_page(new_page); | 1907 | unlock_page(new_page); |
1881 | unlock_page(page); | 1908 | unlock_page(page); |
1882 | put_page(page); /* Drop the rmap reference */ | 1909 | put_page(page); /* Drop the rmap reference */ |
@@ -640,11 +640,10 @@ static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma, | |||
640 | { | 640 | { |
641 | struct address_space *mapping = NULL; | 641 | struct address_space *mapping = NULL; |
642 | 642 | ||
643 | if (vma->vm_file) | 643 | if (vma->vm_file) { |
644 | mapping = vma->vm_file->f_mapping; | 644 | mapping = vma->vm_file->f_mapping; |
645 | |||
646 | if (mapping) | ||
647 | mutex_lock(&mapping->i_mmap_mutex); | 645 | mutex_lock(&mapping->i_mmap_mutex); |
646 | } | ||
648 | 647 | ||
649 | __vma_link(mm, vma, prev, rb_link, rb_parent); | 648 | __vma_link(mm, vma, prev, rb_link, rb_parent); |
650 | __vma_link_file(vma); | 649 | __vma_link_file(vma); |
@@ -2965,9 +2964,7 @@ int install_special_mapping(struct mm_struct *mm, | |||
2965 | struct vm_area_struct *vma = _install_special_mapping(mm, | 2964 | struct vm_area_struct *vma = _install_special_mapping(mm, |
2966 | addr, len, vm_flags, pages); | 2965 | addr, len, vm_flags, pages); |
2967 | 2966 | ||
2968 | if (IS_ERR(vma)) | 2967 | return PTR_ERR_OR_ZERO(vma); |
2969 | return PTR_ERR(vma); | ||
2970 | return 0; | ||
2971 | } | 2968 | } |
2972 | 2969 | ||
2973 | static DEFINE_MUTEX(mm_all_locks_mutex); | 2970 | static DEFINE_MUTEX(mm_all_locks_mutex); |
diff --git a/mm/msync.c b/mm/msync.c index 632df4527c01..a5c673669ca6 100644 --- a/mm/msync.c +++ b/mm/msync.c | |||
@@ -58,6 +58,7 @@ SYSCALL_DEFINE3(msync, unsigned long, start, size_t, len, int, flags) | |||
58 | vma = find_vma(mm, start); | 58 | vma = find_vma(mm, start); |
59 | for (;;) { | 59 | for (;;) { |
60 | struct file *file; | 60 | struct file *file; |
61 | loff_t fstart, fend; | ||
61 | 62 | ||
62 | /* Still start < end. */ | 63 | /* Still start < end. */ |
63 | error = -ENOMEM; | 64 | error = -ENOMEM; |
@@ -77,12 +78,17 @@ SYSCALL_DEFINE3(msync, unsigned long, start, size_t, len, int, flags) | |||
77 | goto out_unlock; | 78 | goto out_unlock; |
78 | } | 79 | } |
79 | file = vma->vm_file; | 80 | file = vma->vm_file; |
81 | fstart = start + ((loff_t)vma->vm_pgoff << PAGE_SHIFT); | ||
82 | fend = fstart + (min(end, vma->vm_end) - start) - 1; | ||
80 | start = vma->vm_end; | 83 | start = vma->vm_end; |
81 | if ((flags & MS_SYNC) && file && | 84 | if ((flags & MS_SYNC) && file && |
82 | (vma->vm_flags & VM_SHARED)) { | 85 | (vma->vm_flags & VM_SHARED)) { |
83 | get_file(file); | 86 | get_file(file); |
84 | up_read(&mm->mmap_sem); | 87 | up_read(&mm->mmap_sem); |
85 | error = vfs_fsync(file, 0); | 88 | if (vma->vm_flags & VM_NONLINEAR) |
89 | error = vfs_fsync(file, 1); | ||
90 | else | ||
91 | error = vfs_fsync_range(file, fstart, fend, 1); | ||
86 | fput(file); | 92 | fput(file); |
87 | if (error || start >= end) | 93 | if (error || start >= end) |
88 | goto out; | 94 | goto out; |
diff --git a/mm/page-writeback.c b/mm/page-writeback.c index a4317da60532..533fa60c9ac1 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c | |||
@@ -156,24 +156,6 @@ static unsigned long writeout_period_time = 0; | |||
156 | #define VM_COMPLETIONS_PERIOD_LEN (3*HZ) | 156 | #define VM_COMPLETIONS_PERIOD_LEN (3*HZ) |
157 | 157 | ||
158 | /* | 158 | /* |
159 | * Work out the current dirty-memory clamping and background writeout | ||
160 | * thresholds. | ||
161 | * | ||
162 | * The main aim here is to lower them aggressively if there is a lot of mapped | ||
163 | * memory around. To avoid stressing page reclaim with lots of unreclaimable | ||
164 | * pages. It is better to clamp down on writers than to start swapping, and | ||
165 | * performing lots of scanning. | ||
166 | * | ||
167 | * We only allow 1/2 of the currently-unmapped memory to be dirtied. | ||
168 | * | ||
169 | * We don't permit the clamping level to fall below 5% - that is getting rather | ||
170 | * excessive. | ||
171 | * | ||
172 | * We make sure that the background writeout level is below the adjusted | ||
173 | * clamping level. | ||
174 | */ | ||
175 | |||
176 | /* | ||
177 | * In a memory zone, there is a certain amount of pages we consider | 159 | * In a memory zone, there is a certain amount of pages we consider |
178 | * available for the page cache, which is essentially the number of | 160 | * available for the page cache, which is essentially the number of |
179 | * free and reclaimable pages, minus some zone reserves to protect | 161 | * free and reclaimable pages, minus some zone reserves to protect |
@@ -1623,7 +1605,7 @@ void balance_dirty_pages_ratelimited(struct address_space *mapping) | |||
1623 | * 1000+ tasks, all of them start dirtying pages at exactly the same | 1605 | * 1000+ tasks, all of them start dirtying pages at exactly the same |
1624 | * time, hence all honoured too large initial task->nr_dirtied_pause. | 1606 | * time, hence all honoured too large initial task->nr_dirtied_pause. |
1625 | */ | 1607 | */ |
1626 | p = &__get_cpu_var(bdp_ratelimits); | 1608 | p = this_cpu_ptr(&bdp_ratelimits); |
1627 | if (unlikely(current->nr_dirtied >= ratelimit)) | 1609 | if (unlikely(current->nr_dirtied >= ratelimit)) |
1628 | *p = 0; | 1610 | *p = 0; |
1629 | else if (unlikely(*p >= ratelimit_pages)) { | 1611 | else if (unlikely(*p >= ratelimit_pages)) { |
@@ -1635,7 +1617,7 @@ void balance_dirty_pages_ratelimited(struct address_space *mapping) | |||
1635 | * short-lived tasks (eg. gcc invocations in a kernel build) escaping | 1617 | * short-lived tasks (eg. gcc invocations in a kernel build) escaping |
1636 | * the dirty throttling and livelock other long-run dirtiers. | 1618 | * the dirty throttling and livelock other long-run dirtiers. |
1637 | */ | 1619 | */ |
1638 | p = &__get_cpu_var(dirty_throttle_leaks); | 1620 | p = this_cpu_ptr(&dirty_throttle_leaks); |
1639 | if (*p > 0 && current->nr_dirtied < ratelimit) { | 1621 | if (*p > 0 && current->nr_dirtied < ratelimit) { |
1640 | unsigned long nr_pages_dirtied; | 1622 | unsigned long nr_pages_dirtied; |
1641 | nr_pages_dirtied = min(*p, ratelimit - current->nr_dirtied); | 1623 | nr_pages_dirtied = min(*p, ratelimit - current->nr_dirtied); |
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 5dba2933c9c0..a59bdb653958 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c | |||
@@ -261,8 +261,9 @@ static int page_outside_zone_boundaries(struct zone *zone, struct page *page) | |||
261 | } while (zone_span_seqretry(zone, seq)); | 261 | } while (zone_span_seqretry(zone, seq)); |
262 | 262 | ||
263 | if (ret) | 263 | if (ret) |
264 | pr_err("page %lu outside zone [ %lu - %lu ]\n", | 264 | pr_err("page 0x%lx outside node %d zone %s [ 0x%lx - 0x%lx ]\n", |
265 | pfn, start_pfn, start_pfn + sp); | 265 | pfn, zone_to_nid(zone), zone->name, |
266 | start_pfn, start_pfn + sp); | ||
266 | 267 | ||
267 | return ret; | 268 | return ret; |
268 | } | 269 | } |
@@ -408,7 +409,8 @@ static int destroy_compound_page(struct page *page, unsigned long order) | |||
408 | return bad; | 409 | return bad; |
409 | } | 410 | } |
410 | 411 | ||
411 | static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags) | 412 | static inline void prep_zero_page(struct page *page, unsigned int order, |
413 | gfp_t gfp_flags) | ||
412 | { | 414 | { |
413 | int i; | 415 | int i; |
414 | 416 | ||
@@ -452,7 +454,7 @@ static inline void set_page_guard_flag(struct page *page) { } | |||
452 | static inline void clear_page_guard_flag(struct page *page) { } | 454 | static inline void clear_page_guard_flag(struct page *page) { } |
453 | #endif | 455 | #endif |
454 | 456 | ||
455 | static inline void set_page_order(struct page *page, int order) | 457 | static inline void set_page_order(struct page *page, unsigned int order) |
456 | { | 458 | { |
457 | set_page_private(page, order); | 459 | set_page_private(page, order); |
458 | __SetPageBuddy(page); | 460 | __SetPageBuddy(page); |
@@ -503,21 +505,31 @@ __find_buddy_index(unsigned long page_idx, unsigned int order) | |||
503 | * For recording page's order, we use page_private(page). | 505 | * For recording page's order, we use page_private(page). |
504 | */ | 506 | */ |
505 | static inline int page_is_buddy(struct page *page, struct page *buddy, | 507 | static inline int page_is_buddy(struct page *page, struct page *buddy, |
506 | int order) | 508 | unsigned int order) |
507 | { | 509 | { |
508 | if (!pfn_valid_within(page_to_pfn(buddy))) | 510 | if (!pfn_valid_within(page_to_pfn(buddy))) |
509 | return 0; | 511 | return 0; |
510 | 512 | ||
511 | if (page_zone_id(page) != page_zone_id(buddy)) | ||
512 | return 0; | ||
513 | |||
514 | if (page_is_guard(buddy) && page_order(buddy) == order) { | 513 | if (page_is_guard(buddy) && page_order(buddy) == order) { |
515 | VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy); | 514 | VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy); |
515 | |||
516 | if (page_zone_id(page) != page_zone_id(buddy)) | ||
517 | return 0; | ||
518 | |||
516 | return 1; | 519 | return 1; |
517 | } | 520 | } |
518 | 521 | ||
519 | if (PageBuddy(buddy) && page_order(buddy) == order) { | 522 | if (PageBuddy(buddy) && page_order(buddy) == order) { |
520 | VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy); | 523 | VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy); |
524 | |||
525 | /* | ||
526 | * zone check is done late to avoid uselessly | ||
527 | * calculating zone/node ids for pages that could | ||
528 | * never merge. | ||
529 | */ | ||
530 | if (page_zone_id(page) != page_zone_id(buddy)) | ||
531 | return 0; | ||
532 | |||
521 | return 1; | 533 | return 1; |
522 | } | 534 | } |
523 | return 0; | 535 | return 0; |
@@ -549,6 +561,7 @@ static inline int page_is_buddy(struct page *page, struct page *buddy, | |||
549 | */ | 561 | */ |
550 | 562 | ||
551 | static inline void __free_one_page(struct page *page, | 563 | static inline void __free_one_page(struct page *page, |
564 | unsigned long pfn, | ||
552 | struct zone *zone, unsigned int order, | 565 | struct zone *zone, unsigned int order, |
553 | int migratetype) | 566 | int migratetype) |
554 | { | 567 | { |
@@ -565,7 +578,7 @@ static inline void __free_one_page(struct page *page, | |||
565 | 578 | ||
566 | VM_BUG_ON(migratetype == -1); | 579 | VM_BUG_ON(migratetype == -1); |
567 | 580 | ||
568 | page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1); | 581 | page_idx = pfn & ((1 << MAX_ORDER) - 1); |
569 | 582 | ||
570 | VM_BUG_ON_PAGE(page_idx & ((1 << order) - 1), page); | 583 | VM_BUG_ON_PAGE(page_idx & ((1 << order) - 1), page); |
571 | VM_BUG_ON_PAGE(bad_range(zone, page), page); | 584 | VM_BUG_ON_PAGE(bad_range(zone, page), page); |
@@ -700,7 +713,7 @@ static void free_pcppages_bulk(struct zone *zone, int count, | |||
700 | list_del(&page->lru); | 713 | list_del(&page->lru); |
701 | mt = get_freepage_migratetype(page); | 714 | mt = get_freepage_migratetype(page); |
702 | /* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */ | 715 | /* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */ |
703 | __free_one_page(page, zone, 0, mt); | 716 | __free_one_page(page, page_to_pfn(page), zone, 0, mt); |
704 | trace_mm_page_pcpu_drain(page, 0, mt); | 717 | trace_mm_page_pcpu_drain(page, 0, mt); |
705 | if (likely(!is_migrate_isolate_page(page))) { | 718 | if (likely(!is_migrate_isolate_page(page))) { |
706 | __mod_zone_page_state(zone, NR_FREE_PAGES, 1); | 719 | __mod_zone_page_state(zone, NR_FREE_PAGES, 1); |
@@ -712,13 +725,15 @@ static void free_pcppages_bulk(struct zone *zone, int count, | |||
712 | spin_unlock(&zone->lock); | 725 | spin_unlock(&zone->lock); |
713 | } | 726 | } |
714 | 727 | ||
715 | static void free_one_page(struct zone *zone, struct page *page, int order, | 728 | static void free_one_page(struct zone *zone, |
729 | struct page *page, unsigned long pfn, | ||
730 | unsigned int order, | ||
716 | int migratetype) | 731 | int migratetype) |
717 | { | 732 | { |
718 | spin_lock(&zone->lock); | 733 | spin_lock(&zone->lock); |
719 | zone->pages_scanned = 0; | 734 | zone->pages_scanned = 0; |
720 | 735 | ||
721 | __free_one_page(page, zone, order, migratetype); | 736 | __free_one_page(page, pfn, zone, order, migratetype); |
722 | if (unlikely(!is_migrate_isolate(migratetype))) | 737 | if (unlikely(!is_migrate_isolate(migratetype))) |
723 | __mod_zone_freepage_state(zone, 1 << order, migratetype); | 738 | __mod_zone_freepage_state(zone, 1 << order, migratetype); |
724 | spin_unlock(&zone->lock); | 739 | spin_unlock(&zone->lock); |
@@ -755,15 +770,16 @@ static void __free_pages_ok(struct page *page, unsigned int order) | |||
755 | { | 770 | { |
756 | unsigned long flags; | 771 | unsigned long flags; |
757 | int migratetype; | 772 | int migratetype; |
773 | unsigned long pfn = page_to_pfn(page); | ||
758 | 774 | ||
759 | if (!free_pages_prepare(page, order)) | 775 | if (!free_pages_prepare(page, order)) |
760 | return; | 776 | return; |
761 | 777 | ||
778 | migratetype = get_pfnblock_migratetype(page, pfn); | ||
762 | local_irq_save(flags); | 779 | local_irq_save(flags); |
763 | __count_vm_events(PGFREE, 1 << order); | 780 | __count_vm_events(PGFREE, 1 << order); |
764 | migratetype = get_pageblock_migratetype(page); | ||
765 | set_freepage_migratetype(page, migratetype); | 781 | set_freepage_migratetype(page, migratetype); |
766 | free_one_page(page_zone(page), page, order, migratetype); | 782 | free_one_page(page_zone(page), page, pfn, order, migratetype); |
767 | local_irq_restore(flags); | 783 | local_irq_restore(flags); |
768 | } | 784 | } |
769 | 785 | ||
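The hunks above thread a pre-computed page frame number through the free path: __free_pages_ok() resolves the pfn once, derives the migratetype with the new pfn-based helper, and hands the pfn down through free_one_page() to __free_one_page(), so page_to_pfn() is no longer re-evaluated at every level. A minimal sketch of the resulting calling convention, written as if it lived inside mm/page_alloc.c (IRQ handling, vm event counting and free_pages_prepare() omitted; example_free_path() is a made-up name, not part of the patch):

static void example_free_path(struct page *page, unsigned int order)
{
	unsigned long pfn = page_to_pfn(page);	/* resolved exactly once */
	int migratetype = get_pfnblock_migratetype(page, pfn);

	set_freepage_migratetype(page, migratetype);
	/* every callee below reuses the cached pfn instead of re-deriving it */
	free_one_page(page_zone(page), page, pfn, order, migratetype);
}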
@@ -882,7 +898,7 @@ static inline int check_new_page(struct page *page) | |||
882 | return 0; | 898 | return 0; |
883 | } | 899 | } |
884 | 900 | ||
885 | static int prep_new_page(struct page *page, int order, gfp_t gfp_flags) | 901 | static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags) |
886 | { | 902 | { |
887 | int i; | 903 | int i; |
888 | 904 | ||
@@ -931,6 +947,7 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order, | |||
931 | rmv_page_order(page); | 947 | rmv_page_order(page); |
932 | area->nr_free--; | 948 | area->nr_free--; |
933 | expand(zone, page, order, current_order, area, migratetype); | 949 | expand(zone, page, order, current_order, area, migratetype); |
950 | set_freepage_migratetype(page, migratetype); | ||
934 | return page; | 951 | return page; |
935 | } | 952 | } |
936 | 953 | ||
@@ -1057,7 +1074,9 @@ static int try_to_steal_freepages(struct zone *zone, struct page *page, | |||
1057 | 1074 | ||
1058 | /* | 1075 | /* |
1059 | * When borrowing from MIGRATE_CMA, we need to release the excess | 1076 | * When borrowing from MIGRATE_CMA, we need to release the excess |
1060 | * buddy pages to CMA itself. | 1077 | * buddy pages to CMA itself. We also ensure the freepage_migratetype |
1078 | * is set to CMA so it is returned to the correct freelist in case | ||
1079 | * the page ends up being not actually allocated from the pcp lists. | ||
1061 | */ | 1080 | */ |
1062 | if (is_migrate_cma(fallback_type)) | 1081 | if (is_migrate_cma(fallback_type)) |
1063 | return fallback_type; | 1082 | return fallback_type; |
@@ -1090,16 +1109,17 @@ static int try_to_steal_freepages(struct zone *zone, struct page *page, | |||
1090 | 1109 | ||
1091 | /* Remove an element from the buddy allocator from the fallback list */ | 1110 | /* Remove an element from the buddy allocator from the fallback list */ |
1092 | static inline struct page * | 1111 | static inline struct page * |
1093 | __rmqueue_fallback(struct zone *zone, int order, int start_migratetype) | 1112 | __rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype) |
1094 | { | 1113 | { |
1095 | struct free_area *area; | 1114 | struct free_area *area; |
1096 | int current_order; | 1115 | unsigned int current_order; |
1097 | struct page *page; | 1116 | struct page *page; |
1098 | int migratetype, new_type, i; | 1117 | int migratetype, new_type, i; |
1099 | 1118 | ||
1100 | /* Find the largest possible block of pages in the other list */ | 1119 | /* Find the largest possible block of pages in the other list */ |
1101 | for (current_order = MAX_ORDER-1; current_order >= order; | 1120 | for (current_order = MAX_ORDER-1; |
1102 | --current_order) { | 1121 | current_order >= order && current_order <= MAX_ORDER-1; |
1122 | --current_order) { | ||
1103 | for (i = 0;; i++) { | 1123 | for (i = 0;; i++) { |
1104 | migratetype = fallbacks[start_migratetype][i]; | 1124 | migratetype = fallbacks[start_migratetype][i]; |
1105 | 1125 | ||
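The reworked loop bound above exists because current_order is now unsigned: decrementing an unsigned int past zero wraps around to UINT_MAX instead of going negative, so the old "current_order >= order" test alone could never terminate once order is 0. A standalone illustration of the wraparound (not kernel code; MAX_ORDER is hard-coded to the common default of 11 just for the example):

#include <limits.h>
#include <stdio.h>

#define MAX_ORDER 11	/* illustrative value, matching the usual kernel default */

int main(void)
{
	unsigned int order = 0;
	unsigned int current_order;

	/* Without the extra "current_order <= MAX_ORDER-1" bound, order == 0
	 * would loop forever: when current_order reaches 0, --current_order
	 * wraps to UINT_MAX, which still satisfies the >= order test. */
	for (current_order = MAX_ORDER - 1;
	     current_order >= order && current_order <= MAX_ORDER - 1;
	     --current_order)
		printf("trying order %u\n", current_order);

	printf("0u - 1 wraps to %u (UINT_MAX = %u)\n", 0u - 1, UINT_MAX);
	return 0;
}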
@@ -1125,6 +1145,12 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype) | |||
1125 | 1145 | ||
1126 | expand(zone, page, order, current_order, area, | 1146 | expand(zone, page, order, current_order, area, |
1127 | new_type); | 1147 | new_type); |
1148 | /* The freepage_migratetype may differ from pageblock's | ||
1149 | * migratetype depending on the decisions in | ||
1150 | * try_to_steal_freepages. This is OK as long as it does | ||
1151 | * not differ for MIGRATE_CMA type. | ||
1152 | */ | ||
1153 | set_freepage_migratetype(page, new_type); | ||
1128 | 1154 | ||
1129 | trace_mm_page_alloc_extfrag(page, order, current_order, | 1155 | trace_mm_page_alloc_extfrag(page, order, current_order, |
1130 | start_migratetype, migratetype, new_type); | 1156 | start_migratetype, migratetype, new_type); |
@@ -1173,9 +1199,9 @@ retry_reserve: | |||
1173 | */ | 1199 | */ |
1174 | static int rmqueue_bulk(struct zone *zone, unsigned int order, | 1200 | static int rmqueue_bulk(struct zone *zone, unsigned int order, |
1175 | unsigned long count, struct list_head *list, | 1201 | unsigned long count, struct list_head *list, |
1176 | int migratetype, int cold) | 1202 | int migratetype, bool cold) |
1177 | { | 1203 | { |
1178 | int mt = migratetype, i; | 1204 | int i; |
1179 | 1205 | ||
1180 | spin_lock(&zone->lock); | 1206 | spin_lock(&zone->lock); |
1181 | for (i = 0; i < count; ++i) { | 1207 | for (i = 0; i < count; ++i) { |
@@ -1192,18 +1218,12 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, | |||
1192 | * merge IO requests if the physical pages are ordered | 1218 | * merge IO requests if the physical pages are ordered |
1193 | * properly. | 1219 | * properly. |
1194 | */ | 1220 | */ |
1195 | if (likely(cold == 0)) | 1221 | if (likely(!cold)) |
1196 | list_add(&page->lru, list); | 1222 | list_add(&page->lru, list); |
1197 | else | 1223 | else |
1198 | list_add_tail(&page->lru, list); | 1224 | list_add_tail(&page->lru, list); |
1199 | if (IS_ENABLED(CONFIG_CMA)) { | ||
1200 | mt = get_pageblock_migratetype(page); | ||
1201 | if (!is_migrate_cma(mt) && !is_migrate_isolate(mt)) | ||
1202 | mt = migratetype; | ||
1203 | } | ||
1204 | set_freepage_migratetype(page, mt); | ||
1205 | list = &page->lru; | 1225 | list = &page->lru; |
1206 | if (is_migrate_cma(mt)) | 1226 | if (is_migrate_cma(get_freepage_migratetype(page))) |
1207 | __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, | 1227 | __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, |
1208 | -(1 << order)); | 1228 | -(1 << order)); |
1209 | } | 1229 | } |
@@ -1327,7 +1347,7 @@ void mark_free_pages(struct zone *zone) | |||
1327 | { | 1347 | { |
1328 | unsigned long pfn, max_zone_pfn; | 1348 | unsigned long pfn, max_zone_pfn; |
1329 | unsigned long flags; | 1349 | unsigned long flags; |
1330 | int order, t; | 1350 | unsigned int order, t; |
1331 | struct list_head *curr; | 1351 | struct list_head *curr; |
1332 | 1352 | ||
1333 | if (zone_is_empty(zone)) | 1353 | if (zone_is_empty(zone)) |
@@ -1359,19 +1379,20 @@ void mark_free_pages(struct zone *zone) | |||
1359 | 1379 | ||
1360 | /* | 1380 | /* |
1361 | * Free a 0-order page | 1381 | * Free a 0-order page |
1362 | * cold == 1 ? free a cold page : free a hot page | 1382 | * cold == true ? free a cold page : free a hot page |
1363 | */ | 1383 | */ |
1364 | void free_hot_cold_page(struct page *page, int cold) | 1384 | void free_hot_cold_page(struct page *page, bool cold) |
1365 | { | 1385 | { |
1366 | struct zone *zone = page_zone(page); | 1386 | struct zone *zone = page_zone(page); |
1367 | struct per_cpu_pages *pcp; | 1387 | struct per_cpu_pages *pcp; |
1368 | unsigned long flags; | 1388 | unsigned long flags; |
1389 | unsigned long pfn = page_to_pfn(page); | ||
1369 | int migratetype; | 1390 | int migratetype; |
1370 | 1391 | ||
1371 | if (!free_pages_prepare(page, 0)) | 1392 | if (!free_pages_prepare(page, 0)) |
1372 | return; | 1393 | return; |
1373 | 1394 | ||
1374 | migratetype = get_pageblock_migratetype(page); | 1395 | migratetype = get_pfnblock_migratetype(page, pfn); |
1375 | set_freepage_migratetype(page, migratetype); | 1396 | set_freepage_migratetype(page, migratetype); |
1376 | local_irq_save(flags); | 1397 | local_irq_save(flags); |
1377 | __count_vm_event(PGFREE); | 1398 | __count_vm_event(PGFREE); |
@@ -1385,17 +1406,17 @@ void free_hot_cold_page(struct page *page, int cold) | |||
1385 | */ | 1406 | */ |
1386 | if (migratetype >= MIGRATE_PCPTYPES) { | 1407 | if (migratetype >= MIGRATE_PCPTYPES) { |
1387 | if (unlikely(is_migrate_isolate(migratetype))) { | 1408 | if (unlikely(is_migrate_isolate(migratetype))) { |
1388 | free_one_page(zone, page, 0, migratetype); | 1409 | free_one_page(zone, page, pfn, 0, migratetype); |
1389 | goto out; | 1410 | goto out; |
1390 | } | 1411 | } |
1391 | migratetype = MIGRATE_MOVABLE; | 1412 | migratetype = MIGRATE_MOVABLE; |
1392 | } | 1413 | } |
1393 | 1414 | ||
1394 | pcp = &this_cpu_ptr(zone->pageset)->pcp; | 1415 | pcp = &this_cpu_ptr(zone->pageset)->pcp; |
1395 | if (cold) | 1416 | if (!cold) |
1396 | list_add_tail(&page->lru, &pcp->lists[migratetype]); | ||
1397 | else | ||
1398 | list_add(&page->lru, &pcp->lists[migratetype]); | 1417 | list_add(&page->lru, &pcp->lists[migratetype]); |
1418 | else | ||
1419 | list_add_tail(&page->lru, &pcp->lists[migratetype]); | ||
1399 | pcp->count++; | 1420 | pcp->count++; |
1400 | if (pcp->count >= pcp->high) { | 1421 | if (pcp->count >= pcp->high) { |
1401 | unsigned long batch = ACCESS_ONCE(pcp->batch); | 1422 | unsigned long batch = ACCESS_ONCE(pcp->batch); |
@@ -1410,7 +1431,7 @@ out: | |||
1410 | /* | 1431 | /* |
1411 | * Free a list of 0-order pages | 1432 | * Free a list of 0-order pages |
1412 | */ | 1433 | */ |
1413 | void free_hot_cold_page_list(struct list_head *list, int cold) | 1434 | void free_hot_cold_page_list(struct list_head *list, bool cold) |
1414 | { | 1435 | { |
1415 | struct page *page, *next; | 1436 | struct page *page, *next; |
1416 | 1437 | ||
@@ -1522,12 +1543,12 @@ int split_free_page(struct page *page) | |||
1522 | */ | 1543 | */ |
1523 | static inline | 1544 | static inline |
1524 | struct page *buffered_rmqueue(struct zone *preferred_zone, | 1545 | struct page *buffered_rmqueue(struct zone *preferred_zone, |
1525 | struct zone *zone, int order, gfp_t gfp_flags, | 1546 | struct zone *zone, unsigned int order, |
1526 | int migratetype) | 1547 | gfp_t gfp_flags, int migratetype) |
1527 | { | 1548 | { |
1528 | unsigned long flags; | 1549 | unsigned long flags; |
1529 | struct page *page; | 1550 | struct page *page; |
1530 | int cold = !!(gfp_flags & __GFP_COLD); | 1551 | bool cold = ((gfp_flags & __GFP_COLD) != 0); |
1531 | 1552 | ||
1532 | again: | 1553 | again: |
1533 | if (likely(order == 0)) { | 1554 | if (likely(order == 0)) { |
@@ -1572,7 +1593,7 @@ again: | |||
1572 | if (!page) | 1593 | if (!page) |
1573 | goto failed; | 1594 | goto failed; |
1574 | __mod_zone_freepage_state(zone, -(1 << order), | 1595 | __mod_zone_freepage_state(zone, -(1 << order), |
1575 | get_pageblock_migratetype(page)); | 1596 | get_freepage_migratetype(page)); |
1576 | } | 1597 | } |
1577 | 1598 | ||
1578 | __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order)); | 1599 | __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order)); |
@@ -1672,8 +1693,9 @@ static inline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) | |||
1672 | * Return true if free pages are above 'mark'. This takes into account the order | 1693 | * Return true if free pages are above 'mark'. This takes into account the order |
1673 | * of the allocation. | 1694 | * of the allocation. |
1674 | */ | 1695 | */ |
1675 | static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark, | 1696 | static bool __zone_watermark_ok(struct zone *z, unsigned int order, |
1676 | int classzone_idx, int alloc_flags, long free_pages) | 1697 | unsigned long mark, int classzone_idx, int alloc_flags, |
1698 | long free_pages) | ||
1677 | { | 1699 | { |
1678 | /* free_pages my go negative - that's OK */ | 1700 | /* free_pages my go negative - that's OK */ |
1679 | long min = mark; | 1701 | long min = mark; |
@@ -1707,15 +1729,15 @@ static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark, | |||
1707 | return true; | 1729 | return true; |
1708 | } | 1730 | } |
1709 | 1731 | ||
1710 | bool zone_watermark_ok(struct zone *z, int order, unsigned long mark, | 1732 | bool zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, |
1711 | int classzone_idx, int alloc_flags) | 1733 | int classzone_idx, int alloc_flags) |
1712 | { | 1734 | { |
1713 | return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags, | 1735 | return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags, |
1714 | zone_page_state(z, NR_FREE_PAGES)); | 1736 | zone_page_state(z, NR_FREE_PAGES)); |
1715 | } | 1737 | } |
1716 | 1738 | ||
1717 | bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark, | 1739 | bool zone_watermark_ok_safe(struct zone *z, unsigned int order, |
1718 | int classzone_idx, int alloc_flags) | 1740 | unsigned long mark, int classzone_idx, int alloc_flags) |
1719 | { | 1741 | { |
1720 | long free_pages = zone_page_state(z, NR_FREE_PAGES); | 1742 | long free_pages = zone_page_state(z, NR_FREE_PAGES); |
1721 | 1743 | ||
@@ -1850,18 +1872,8 @@ static bool zone_local(struct zone *local_zone, struct zone *zone) | |||
1850 | 1872 | ||
1851 | static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone) | 1873 | static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone) |
1852 | { | 1874 | { |
1853 | return node_isset(local_zone->node, zone->zone_pgdat->reclaim_nodes); | 1875 | return node_distance(zone_to_nid(local_zone), zone_to_nid(zone)) < |
1854 | } | 1876 | RECLAIM_DISTANCE; |
1855 | |||
1856 | static void __paginginit init_zone_allows_reclaim(int nid) | ||
1857 | { | ||
1858 | int i; | ||
1859 | |||
1860 | for_each_node_state(i, N_MEMORY) | ||
1861 | if (node_distance(nid, i) <= RECLAIM_DISTANCE) | ||
1862 | node_set(i, NODE_DATA(nid)->reclaim_nodes); | ||
1863 | else | ||
1864 | zone_reclaim_mode = 1; | ||
1865 | } | 1877 | } |
1866 | 1878 | ||
1867 | #else /* CONFIG_NUMA */ | 1879 | #else /* CONFIG_NUMA */ |
@@ -1895,9 +1907,6 @@ static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone) | |||
1895 | return true; | 1907 | return true; |
1896 | } | 1908 | } |
1897 | 1909 | ||
1898 | static inline void init_zone_allows_reclaim(int nid) | ||
1899 | { | ||
1900 | } | ||
1901 | #endif /* CONFIG_NUMA */ | 1910 | #endif /* CONFIG_NUMA */ |
1902 | 1911 | ||
1903 | /* | 1912 | /* |
@@ -1907,17 +1916,17 @@ static inline void init_zone_allows_reclaim(int nid) | |||
1907 | static struct page * | 1916 | static struct page * |
1908 | get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order, | 1917 | get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order, |
1909 | struct zonelist *zonelist, int high_zoneidx, int alloc_flags, | 1918 | struct zonelist *zonelist, int high_zoneidx, int alloc_flags, |
1910 | struct zone *preferred_zone, int migratetype) | 1919 | struct zone *preferred_zone, int classzone_idx, int migratetype) |
1911 | { | 1920 | { |
1912 | struct zoneref *z; | 1921 | struct zoneref *z; |
1913 | struct page *page = NULL; | 1922 | struct page *page = NULL; |
1914 | int classzone_idx; | ||
1915 | struct zone *zone; | 1923 | struct zone *zone; |
1916 | nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */ | 1924 | nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */ |
1917 | int zlc_active = 0; /* set if using zonelist_cache */ | 1925 | int zlc_active = 0; /* set if using zonelist_cache */ |
1918 | int did_zlc_setup = 0; /* just call zlc_setup() one time */ | 1926 | int did_zlc_setup = 0; /* just call zlc_setup() one time */ |
1927 | bool consider_zone_dirty = (alloc_flags & ALLOC_WMARK_LOW) && | ||
1928 | (gfp_mask & __GFP_WRITE); | ||
1919 | 1929 | ||
1920 | classzone_idx = zone_idx(preferred_zone); | ||
1921 | zonelist_scan: | 1930 | zonelist_scan: |
1922 | /* | 1931 | /* |
1923 | * Scan zonelist, looking for a zone with enough free. | 1932 | * Scan zonelist, looking for a zone with enough free. |
@@ -1930,12 +1939,10 @@ zonelist_scan: | |||
1930 | if (IS_ENABLED(CONFIG_NUMA) && zlc_active && | 1939 | if (IS_ENABLED(CONFIG_NUMA) && zlc_active && |
1931 | !zlc_zone_worth_trying(zonelist, z, allowednodes)) | 1940 | !zlc_zone_worth_trying(zonelist, z, allowednodes)) |
1932 | continue; | 1941 | continue; |
1933 | if ((alloc_flags & ALLOC_CPUSET) && | 1942 | if (cpusets_enabled() && |
1943 | (alloc_flags & ALLOC_CPUSET) && | ||
1934 | !cpuset_zone_allowed_softwall(zone, gfp_mask)) | 1944 | !cpuset_zone_allowed_softwall(zone, gfp_mask)) |
1935 | continue; | 1945 | continue; |
1936 | BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK); | ||
1937 | if (unlikely(alloc_flags & ALLOC_NO_WATERMARKS)) | ||
1938 | goto try_this_zone; | ||
1939 | /* | 1946 | /* |
1940 | * Distribute pages in proportion to the individual | 1947 | * Distribute pages in proportion to the individual |
1941 | * zone size to ensure fair page aging. The zone a | 1948 | * zone size to ensure fair page aging. The zone a |
@@ -1974,15 +1981,19 @@ zonelist_scan: | |||
1974 | * will require awareness of zones in the | 1981 | * will require awareness of zones in the |
1975 | * dirty-throttling and the flusher threads. | 1982 | * dirty-throttling and the flusher threads. |
1976 | */ | 1983 | */ |
1977 | if ((alloc_flags & ALLOC_WMARK_LOW) && | 1984 | if (consider_zone_dirty && !zone_dirty_ok(zone)) |
1978 | (gfp_mask & __GFP_WRITE) && !zone_dirty_ok(zone)) | 1985 | continue; |
1979 | goto this_zone_full; | ||
1980 | 1986 | ||
1981 | mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK]; | 1987 | mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK]; |
1982 | if (!zone_watermark_ok(zone, order, mark, | 1988 | if (!zone_watermark_ok(zone, order, mark, |
1983 | classzone_idx, alloc_flags)) { | 1989 | classzone_idx, alloc_flags)) { |
1984 | int ret; | 1990 | int ret; |
1985 | 1991 | ||
1992 | /* Checked here to keep the fast path fast */ | ||
1993 | BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK); | ||
1994 | if (alloc_flags & ALLOC_NO_WATERMARKS) | ||
1995 | goto try_this_zone; | ||
1996 | |||
1986 | if (IS_ENABLED(CONFIG_NUMA) && | 1997 | if (IS_ENABLED(CONFIG_NUMA) && |
1987 | !did_zlc_setup && nr_online_nodes > 1) { | 1998 | !did_zlc_setup && nr_online_nodes > 1) { |
1988 | /* | 1999 | /* |
@@ -2044,7 +2055,7 @@ try_this_zone: | |||
2044 | if (page) | 2055 | if (page) |
2045 | break; | 2056 | break; |
2046 | this_zone_full: | 2057 | this_zone_full: |
2047 | if (IS_ENABLED(CONFIG_NUMA)) | 2058 | if (IS_ENABLED(CONFIG_NUMA) && zlc_active) |
2048 | zlc_mark_zone_full(zonelist, z); | 2059 | zlc_mark_zone_full(zonelist, z); |
2049 | } | 2060 | } |
2050 | 2061 | ||
@@ -2173,7 +2184,7 @@ static inline struct page * | |||
2173 | __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, | 2184 | __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, |
2174 | struct zonelist *zonelist, enum zone_type high_zoneidx, | 2185 | struct zonelist *zonelist, enum zone_type high_zoneidx, |
2175 | nodemask_t *nodemask, struct zone *preferred_zone, | 2186 | nodemask_t *nodemask, struct zone *preferred_zone, |
2176 | int migratetype) | 2187 | int classzone_idx, int migratetype) |
2177 | { | 2188 | { |
2178 | struct page *page; | 2189 | struct page *page; |
2179 | 2190 | ||
@@ -2191,7 +2202,7 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, | |||
2191 | page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, | 2202 | page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, |
2192 | order, zonelist, high_zoneidx, | 2203 | order, zonelist, high_zoneidx, |
2193 | ALLOC_WMARK_HIGH|ALLOC_CPUSET, | 2204 | ALLOC_WMARK_HIGH|ALLOC_CPUSET, |
2194 | preferred_zone, migratetype); | 2205 | preferred_zone, classzone_idx, migratetype); |
2195 | if (page) | 2206 | if (page) |
2196 | goto out; | 2207 | goto out; |
2197 | 2208 | ||
@@ -2226,7 +2237,7 @@ static struct page * | |||
2226 | __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, | 2237 | __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, |
2227 | struct zonelist *zonelist, enum zone_type high_zoneidx, | 2238 | struct zonelist *zonelist, enum zone_type high_zoneidx, |
2228 | nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, | 2239 | nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, |
2229 | int migratetype, bool sync_migration, | 2240 | int classzone_idx, int migratetype, enum migrate_mode mode, |
2230 | bool *contended_compaction, bool *deferred_compaction, | 2241 | bool *contended_compaction, bool *deferred_compaction, |
2231 | unsigned long *did_some_progress) | 2242 | unsigned long *did_some_progress) |
2232 | { | 2243 | { |
@@ -2240,7 +2251,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, | |||
2240 | 2251 | ||
2241 | current->flags |= PF_MEMALLOC; | 2252 | current->flags |= PF_MEMALLOC; |
2242 | *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask, | 2253 | *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask, |
2243 | nodemask, sync_migration, | 2254 | nodemask, mode, |
2244 | contended_compaction); | 2255 | contended_compaction); |
2245 | current->flags &= ~PF_MEMALLOC; | 2256 | current->flags &= ~PF_MEMALLOC; |
2246 | 2257 | ||
@@ -2254,7 +2265,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, | |||
2254 | page = get_page_from_freelist(gfp_mask, nodemask, | 2265 | page = get_page_from_freelist(gfp_mask, nodemask, |
2255 | order, zonelist, high_zoneidx, | 2266 | order, zonelist, high_zoneidx, |
2256 | alloc_flags & ~ALLOC_NO_WATERMARKS, | 2267 | alloc_flags & ~ALLOC_NO_WATERMARKS, |
2257 | preferred_zone, migratetype); | 2268 | preferred_zone, classzone_idx, migratetype); |
2258 | if (page) { | 2269 | if (page) { |
2259 | preferred_zone->compact_blockskip_flush = false; | 2270 | preferred_zone->compact_blockskip_flush = false; |
2260 | compaction_defer_reset(preferred_zone, order, true); | 2271 | compaction_defer_reset(preferred_zone, order, true); |
@@ -2273,7 +2284,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, | |||
2273 | * As async compaction considers a subset of pageblocks, only | 2284 | * As async compaction considers a subset of pageblocks, only |
2274 | * defer if the failure was a sync compaction failure. | 2285 | * defer if the failure was a sync compaction failure. |
2275 | */ | 2286 | */ |
2276 | if (sync_migration) | 2287 | if (mode != MIGRATE_ASYNC) |
2277 | defer_compaction(preferred_zone, order); | 2288 | defer_compaction(preferred_zone, order); |
2278 | 2289 | ||
2279 | cond_resched(); | 2290 | cond_resched(); |
@@ -2286,9 +2297,9 @@ static inline struct page * | |||
2286 | __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, | 2297 | __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, |
2287 | struct zonelist *zonelist, enum zone_type high_zoneidx, | 2298 | struct zonelist *zonelist, enum zone_type high_zoneidx, |
2288 | nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, | 2299 | nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, |
2289 | int migratetype, bool sync_migration, | 2300 | int classzone_idx, int migratetype, |
2290 | bool *contended_compaction, bool *deferred_compaction, | 2301 | enum migrate_mode mode, bool *contended_compaction, |
2291 | unsigned long *did_some_progress) | 2302 | bool *deferred_compaction, unsigned long *did_some_progress) |
2292 | { | 2303 | { |
2293 | return NULL; | 2304 | return NULL; |
2294 | } | 2305 | } |
@@ -2327,7 +2338,7 @@ static inline struct page * | |||
2327 | __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, | 2338 | __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, |
2328 | struct zonelist *zonelist, enum zone_type high_zoneidx, | 2339 | struct zonelist *zonelist, enum zone_type high_zoneidx, |
2329 | nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, | 2340 | nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, |
2330 | int migratetype, unsigned long *did_some_progress) | 2341 | int classzone_idx, int migratetype, unsigned long *did_some_progress) |
2331 | { | 2342 | { |
2332 | struct page *page = NULL; | 2343 | struct page *page = NULL; |
2333 | bool drained = false; | 2344 | bool drained = false; |
@@ -2345,7 +2356,8 @@ retry: | |||
2345 | page = get_page_from_freelist(gfp_mask, nodemask, order, | 2356 | page = get_page_from_freelist(gfp_mask, nodemask, order, |
2346 | zonelist, high_zoneidx, | 2357 | zonelist, high_zoneidx, |
2347 | alloc_flags & ~ALLOC_NO_WATERMARKS, | 2358 | alloc_flags & ~ALLOC_NO_WATERMARKS, |
2348 | preferred_zone, migratetype); | 2359 | preferred_zone, classzone_idx, |
2360 | migratetype); | ||
2349 | 2361 | ||
2350 | /* | 2362 | /* |
2351 | * If an allocation failed after direct reclaim, it could be because | 2363 | * If an allocation failed after direct reclaim, it could be because |
@@ -2368,14 +2380,14 @@ static inline struct page * | |||
2368 | __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order, | 2380 | __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order, |
2369 | struct zonelist *zonelist, enum zone_type high_zoneidx, | 2381 | struct zonelist *zonelist, enum zone_type high_zoneidx, |
2370 | nodemask_t *nodemask, struct zone *preferred_zone, | 2382 | nodemask_t *nodemask, struct zone *preferred_zone, |
2371 | int migratetype) | 2383 | int classzone_idx, int migratetype) |
2372 | { | 2384 | { |
2373 | struct page *page; | 2385 | struct page *page; |
2374 | 2386 | ||
2375 | do { | 2387 | do { |
2376 | page = get_page_from_freelist(gfp_mask, nodemask, order, | 2388 | page = get_page_from_freelist(gfp_mask, nodemask, order, |
2377 | zonelist, high_zoneidx, ALLOC_NO_WATERMARKS, | 2389 | zonelist, high_zoneidx, ALLOC_NO_WATERMARKS, |
2378 | preferred_zone, migratetype); | 2390 | preferred_zone, classzone_idx, migratetype); |
2379 | 2391 | ||
2380 | if (!page && gfp_mask & __GFP_NOFAIL) | 2392 | if (!page && gfp_mask & __GFP_NOFAIL) |
2381 | wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50); | 2393 | wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50); |
@@ -2476,14 +2488,14 @@ static inline struct page * | |||
2476 | __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, | 2488 | __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, |
2477 | struct zonelist *zonelist, enum zone_type high_zoneidx, | 2489 | struct zonelist *zonelist, enum zone_type high_zoneidx, |
2478 | nodemask_t *nodemask, struct zone *preferred_zone, | 2490 | nodemask_t *nodemask, struct zone *preferred_zone, |
2479 | int migratetype) | 2491 | int classzone_idx, int migratetype) |
2480 | { | 2492 | { |
2481 | const gfp_t wait = gfp_mask & __GFP_WAIT; | 2493 | const gfp_t wait = gfp_mask & __GFP_WAIT; |
2482 | struct page *page = NULL; | 2494 | struct page *page = NULL; |
2483 | int alloc_flags; | 2495 | int alloc_flags; |
2484 | unsigned long pages_reclaimed = 0; | 2496 | unsigned long pages_reclaimed = 0; |
2485 | unsigned long did_some_progress; | 2497 | unsigned long did_some_progress; |
2486 | bool sync_migration = false; | 2498 | enum migrate_mode migration_mode = MIGRATE_ASYNC; |
2487 | bool deferred_compaction = false; | 2499 | bool deferred_compaction = false; |
2488 | bool contended_compaction = false; | 2500 | bool contended_compaction = false; |
2489 | 2501 | ||
@@ -2525,15 +2537,18 @@ restart: | |||
2525 | * Find the true preferred zone if the allocation is unconstrained by | 2537 | * Find the true preferred zone if the allocation is unconstrained by |
2526 | * cpusets. | 2538 | * cpusets. |
2527 | */ | 2539 | */ |
2528 | if (!(alloc_flags & ALLOC_CPUSET) && !nodemask) | 2540 | if (!(alloc_flags & ALLOC_CPUSET) && !nodemask) { |
2529 | first_zones_zonelist(zonelist, high_zoneidx, NULL, | 2541 | struct zoneref *preferred_zoneref; |
2530 | &preferred_zone); | 2542 | preferred_zoneref = first_zones_zonelist(zonelist, high_zoneidx, |
2543 | NULL, &preferred_zone); | ||
2544 | classzone_idx = zonelist_zone_idx(preferred_zoneref); | ||
2545 | } | ||
2531 | 2546 | ||
2532 | rebalance: | 2547 | rebalance: |
2533 | /* This is the last chance, in general, before the goto nopage. */ | 2548 | /* This is the last chance, in general, before the goto nopage. */ |
2534 | page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist, | 2549 | page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist, |
2535 | high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS, | 2550 | high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS, |
2536 | preferred_zone, migratetype); | 2551 | preferred_zone, classzone_idx, migratetype); |
2537 | if (page) | 2552 | if (page) |
2538 | goto got_pg; | 2553 | goto got_pg; |
2539 | 2554 | ||
@@ -2548,7 +2563,7 @@ rebalance: | |||
2548 | 2563 | ||
2549 | page = __alloc_pages_high_priority(gfp_mask, order, | 2564 | page = __alloc_pages_high_priority(gfp_mask, order, |
2550 | zonelist, high_zoneidx, nodemask, | 2565 | zonelist, high_zoneidx, nodemask, |
2551 | preferred_zone, migratetype); | 2566 | preferred_zone, classzone_idx, migratetype); |
2552 | if (page) { | 2567 | if (page) { |
2553 | goto got_pg; | 2568 | goto got_pg; |
2554 | } | 2569 | } |
@@ -2577,17 +2592,23 @@ rebalance: | |||
2577 | * Try direct compaction. The first pass is asynchronous. Subsequent | 2592 | * Try direct compaction. The first pass is asynchronous. Subsequent |
2578 | * attempts after direct reclaim are synchronous | 2593 | * attempts after direct reclaim are synchronous |
2579 | */ | 2594 | */ |
2580 | page = __alloc_pages_direct_compact(gfp_mask, order, | 2595 | page = __alloc_pages_direct_compact(gfp_mask, order, zonelist, |
2581 | zonelist, high_zoneidx, | 2596 | high_zoneidx, nodemask, alloc_flags, |
2582 | nodemask, | 2597 | preferred_zone, |
2583 | alloc_flags, preferred_zone, | 2598 | classzone_idx, migratetype, |
2584 | migratetype, sync_migration, | 2599 | migration_mode, &contended_compaction, |
2585 | &contended_compaction, | ||
2586 | &deferred_compaction, | 2600 | &deferred_compaction, |
2587 | &did_some_progress); | 2601 | &did_some_progress); |
2588 | if (page) | 2602 | if (page) |
2589 | goto got_pg; | 2603 | goto got_pg; |
2590 | sync_migration = true; | 2604 | |
2605 | /* | ||
2606 | * It can become very expensive to allocate transparent hugepages at | ||
2607 | * fault, so use asynchronous memory compaction for THP unless it is | ||
2608 | * khugepaged trying to collapse. | ||
2609 | */ | ||
2610 | if (!(gfp_mask & __GFP_NO_KSWAPD) || (current->flags & PF_KTHREAD)) | ||
2611 | migration_mode = MIGRATE_SYNC_LIGHT; | ||
2591 | 2612 | ||
2592 | /* | 2613 | /* |
2593 | * If compaction is deferred for high-order allocations, it is because | 2614 | * If compaction is deferred for high-order allocations, it is because |
@@ -2604,7 +2625,8 @@ rebalance: | |||
2604 | zonelist, high_zoneidx, | 2625 | zonelist, high_zoneidx, |
2605 | nodemask, | 2626 | nodemask, |
2606 | alloc_flags, preferred_zone, | 2627 | alloc_flags, preferred_zone, |
2607 | migratetype, &did_some_progress); | 2628 | classzone_idx, migratetype, |
2629 | &did_some_progress); | ||
2608 | if (page) | 2630 | if (page) |
2609 | goto got_pg; | 2631 | goto got_pg; |
2610 | 2632 | ||
@@ -2623,7 +2645,7 @@ rebalance: | |||
2623 | page = __alloc_pages_may_oom(gfp_mask, order, | 2645 | page = __alloc_pages_may_oom(gfp_mask, order, |
2624 | zonelist, high_zoneidx, | 2646 | zonelist, high_zoneidx, |
2625 | nodemask, preferred_zone, | 2647 | nodemask, preferred_zone, |
2626 | migratetype); | 2648 | classzone_idx, migratetype); |
2627 | if (page) | 2649 | if (page) |
2628 | goto got_pg; | 2650 | goto got_pg; |
2629 | 2651 | ||
@@ -2662,12 +2684,11 @@ rebalance: | |||
2662 | * direct reclaim and reclaim/compaction depends on compaction | 2684 | * direct reclaim and reclaim/compaction depends on compaction |
2663 | * being called after reclaim so call directly if necessary | 2685 | * being called after reclaim so call directly if necessary |
2664 | */ | 2686 | */ |
2665 | page = __alloc_pages_direct_compact(gfp_mask, order, | 2687 | page = __alloc_pages_direct_compact(gfp_mask, order, zonelist, |
2666 | zonelist, high_zoneidx, | 2688 | high_zoneidx, nodemask, alloc_flags, |
2667 | nodemask, | 2689 | preferred_zone, |
2668 | alloc_flags, preferred_zone, | 2690 | classzone_idx, migratetype, |
2669 | migratetype, sync_migration, | 2691 | migration_mode, &contended_compaction, |
2670 | &contended_compaction, | ||
2671 | &deferred_compaction, | 2692 | &deferred_compaction, |
2672 | &did_some_progress); | 2693 | &did_some_progress); |
2673 | if (page) | 2694 | if (page) |
@@ -2693,11 +2714,12 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, | |||
2693 | { | 2714 | { |
2694 | enum zone_type high_zoneidx = gfp_zone(gfp_mask); | 2715 | enum zone_type high_zoneidx = gfp_zone(gfp_mask); |
2695 | struct zone *preferred_zone; | 2716 | struct zone *preferred_zone; |
2717 | struct zoneref *preferred_zoneref; | ||
2696 | struct page *page = NULL; | 2718 | struct page *page = NULL; |
2697 | int migratetype = allocflags_to_migratetype(gfp_mask); | 2719 | int migratetype = allocflags_to_migratetype(gfp_mask); |
2698 | unsigned int cpuset_mems_cookie; | 2720 | unsigned int cpuset_mems_cookie; |
2699 | int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET|ALLOC_FAIR; | 2721 | int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET|ALLOC_FAIR; |
2700 | struct mem_cgroup *memcg = NULL; | 2722 | int classzone_idx; |
2701 | 2723 | ||
2702 | gfp_mask &= gfp_allowed_mask; | 2724 | gfp_mask &= gfp_allowed_mask; |
2703 | 2725 | ||
@@ -2716,22 +2738,16 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, | |||
2716 | if (unlikely(!zonelist->_zonerefs->zone)) | 2738 | if (unlikely(!zonelist->_zonerefs->zone)) |
2717 | return NULL; | 2739 | return NULL; |
2718 | 2740 | ||
2719 | /* | ||
2720 | * Will only have any effect when __GFP_KMEMCG is set. This is | ||
2721 | * verified in the (always inline) callee | ||
2722 | */ | ||
2723 | if (!memcg_kmem_newpage_charge(gfp_mask, &memcg, order)) | ||
2724 | return NULL; | ||
2725 | |||
2726 | retry_cpuset: | 2741 | retry_cpuset: |
2727 | cpuset_mems_cookie = read_mems_allowed_begin(); | 2742 | cpuset_mems_cookie = read_mems_allowed_begin(); |
2728 | 2743 | ||
2729 | /* The preferred zone is used for statistics later */ | 2744 | /* The preferred zone is used for statistics later */ |
2730 | first_zones_zonelist(zonelist, high_zoneidx, | 2745 | preferred_zoneref = first_zones_zonelist(zonelist, high_zoneidx, |
2731 | nodemask ? : &cpuset_current_mems_allowed, | 2746 | nodemask ? : &cpuset_current_mems_allowed, |
2732 | &preferred_zone); | 2747 | &preferred_zone); |
2733 | if (!preferred_zone) | 2748 | if (!preferred_zone) |
2734 | goto out; | 2749 | goto out; |
2750 | classzone_idx = zonelist_zone_idx(preferred_zoneref); | ||
2735 | 2751 | ||
2736 | #ifdef CONFIG_CMA | 2752 | #ifdef CONFIG_CMA |
2737 | if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE) | 2753 | if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE) |
@@ -2741,7 +2757,7 @@ retry: | |||
2741 | /* First allocation attempt */ | 2757 | /* First allocation attempt */ |
2742 | page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, | 2758 | page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, |
2743 | zonelist, high_zoneidx, alloc_flags, | 2759 | zonelist, high_zoneidx, alloc_flags, |
2744 | preferred_zone, migratetype); | 2760 | preferred_zone, classzone_idx, migratetype); |
2745 | if (unlikely(!page)) { | 2761 | if (unlikely(!page)) { |
2746 | /* | 2762 | /* |
2747 | * The first pass makes sure allocations are spread | 2763 | * The first pass makes sure allocations are spread |
@@ -2767,7 +2783,7 @@ retry: | |||
2767 | gfp_mask = memalloc_noio_flags(gfp_mask); | 2783 | gfp_mask = memalloc_noio_flags(gfp_mask); |
2768 | page = __alloc_pages_slowpath(gfp_mask, order, | 2784 | page = __alloc_pages_slowpath(gfp_mask, order, |
2769 | zonelist, high_zoneidx, nodemask, | 2785 | zonelist, high_zoneidx, nodemask, |
2770 | preferred_zone, migratetype); | 2786 | preferred_zone, classzone_idx, migratetype); |
2771 | } | 2787 | } |
2772 | 2788 | ||
2773 | trace_mm_page_alloc(page, order, gfp_mask, migratetype); | 2789 | trace_mm_page_alloc(page, order, gfp_mask, migratetype); |
@@ -2782,8 +2798,6 @@ out: | |||
2782 | if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie))) | 2798 | if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie))) |
2783 | goto retry_cpuset; | 2799 | goto retry_cpuset; |
2784 | 2800 | ||
2785 | memcg_kmem_commit_charge(page, memcg, order); | ||
2786 | |||
2787 | return page; | 2801 | return page; |
2788 | } | 2802 | } |
2789 | EXPORT_SYMBOL(__alloc_pages_nodemask); | 2803 | EXPORT_SYMBOL(__alloc_pages_nodemask); |
@@ -2818,7 +2832,7 @@ void __free_pages(struct page *page, unsigned int order) | |||
2818 | { | 2832 | { |
2819 | if (put_page_testzero(page)) { | 2833 | if (put_page_testzero(page)) { |
2820 | if (order == 0) | 2834 | if (order == 0) |
2821 | free_hot_cold_page(page, 0); | 2835 | free_hot_cold_page(page, false); |
2822 | else | 2836 | else |
2823 | __free_pages_ok(page, order); | 2837 | __free_pages_ok(page, order); |
2824 | } | 2838 | } |
@@ -2837,27 +2851,51 @@ void free_pages(unsigned long addr, unsigned int order) | |||
2837 | EXPORT_SYMBOL(free_pages); | 2851 | EXPORT_SYMBOL(free_pages); |
2838 | 2852 | ||
2839 | /* | 2853 | /* |
2840 | * __free_memcg_kmem_pages and free_memcg_kmem_pages will free | 2854 | * alloc_kmem_pages charges newly allocated pages to the kmem resource counter |
2841 | * pages allocated with __GFP_KMEMCG. | 2855 | * of the current memory cgroup. |
2842 | * | ||
2843 | * Those pages are accounted to a particular memcg, embedded in the | ||
2844 | * corresponding page_cgroup. To avoid adding a hit in the allocator to search | ||
2845 | * for that information only to find out that it is NULL for users who have no | ||
2846 | * interest in that whatsoever, we provide these functions. | ||
2847 | * | 2856 | * |
2848 | * The caller knows better which flags it relies on. | 2857 | * It should be used when the caller would like to use kmalloc, but since the |
2858 | * allocation is large, it has to fall back to the page allocator. | ||
2849 | */ | 2859 | */ |
2850 | void __free_memcg_kmem_pages(struct page *page, unsigned int order) | 2860 | struct page *alloc_kmem_pages(gfp_t gfp_mask, unsigned int order) |
2861 | { | ||
2862 | struct page *page; | ||
2863 | struct mem_cgroup *memcg = NULL; | ||
2864 | |||
2865 | if (!memcg_kmem_newpage_charge(gfp_mask, &memcg, order)) | ||
2866 | return NULL; | ||
2867 | page = alloc_pages(gfp_mask, order); | ||
2868 | memcg_kmem_commit_charge(page, memcg, order); | ||
2869 | return page; | ||
2870 | } | ||
2871 | |||
2872 | struct page *alloc_kmem_pages_node(int nid, gfp_t gfp_mask, unsigned int order) | ||
2873 | { | ||
2874 | struct page *page; | ||
2875 | struct mem_cgroup *memcg = NULL; | ||
2876 | |||
2877 | if (!memcg_kmem_newpage_charge(gfp_mask, &memcg, order)) | ||
2878 | return NULL; | ||
2879 | page = alloc_pages_node(nid, gfp_mask, order); | ||
2880 | memcg_kmem_commit_charge(page, memcg, order); | ||
2881 | return page; | ||
2882 | } | ||
2883 | |||
2884 | /* | ||
2885 | * __free_kmem_pages and free_kmem_pages will free pages allocated with | ||
2886 | * alloc_kmem_pages. | ||
2887 | */ | ||
2888 | void __free_kmem_pages(struct page *page, unsigned int order) | ||
2851 | { | 2889 | { |
2852 | memcg_kmem_uncharge_pages(page, order); | 2890 | memcg_kmem_uncharge_pages(page, order); |
2853 | __free_pages(page, order); | 2891 | __free_pages(page, order); |
2854 | } | 2892 | } |
2855 | 2893 | ||
2856 | void free_memcg_kmem_pages(unsigned long addr, unsigned int order) | 2894 | void free_kmem_pages(unsigned long addr, unsigned int order) |
2857 | { | 2895 | { |
2858 | if (addr != 0) { | 2896 | if (addr != 0) { |
2859 | VM_BUG_ON(!virt_addr_valid((void *)addr)); | 2897 | VM_BUG_ON(!virt_addr_valid((void *)addr)); |
2860 | __free_memcg_kmem_pages(virt_to_page((void *)addr), order); | 2898 | __free_kmem_pages(virt_to_page((void *)addr), order); |
2861 | } | 2899 | } |
2862 | } | 2900 | } |
2863 | 2901 | ||
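A hedged usage sketch for the two helpers introduced above: a caller that wants a large, kmemcg-charged buffer charges and allocates in one step and releases it the same way. The wrapper names below are made up, and the use of get_order()/page_address() is just one plausible way to glue them together:

#include <linux/gfp.h>
#include <linux/mm.h>

static void *large_charged_alloc(size_t size, gfp_t flags)
{
	struct page *page;

	/* charges the current memcg's kmem counter before allocating */
	page = alloc_kmem_pages(flags, get_order(size));
	return page ? page_address(page) : NULL;
}

static void large_charged_free(void *ptr, size_t size)
{
	if (ptr)	/* uncharges the memcg and frees the pages in one call */
		free_kmem_pages((unsigned long)ptr, get_order(size));
}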
@@ -4095,7 +4133,7 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, | |||
4095 | 4133 | ||
4096 | static void __meminit zone_init_free_lists(struct zone *zone) | 4134 | static void __meminit zone_init_free_lists(struct zone *zone) |
4097 | { | 4135 | { |
4098 | int order, t; | 4136 | unsigned int order, t; |
4099 | for_each_migratetype_order(order, t) { | 4137 | for_each_migratetype_order(order, t) { |
4100 | INIT_LIST_HEAD(&zone->free_area[order].free_list[t]); | 4138 | INIT_LIST_HEAD(&zone->free_area[order].free_list[t]); |
4101 | zone->free_area[order].nr_free = 0; | 4139 | zone->free_area[order].nr_free = 0; |
@@ -4349,9 +4387,6 @@ int __meminit init_currently_empty_zone(struct zone *zone, | |||
4349 | #ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID | 4387 | #ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID |
4350 | /* | 4388 | /* |
4351 | * Required by SPARSEMEM. Given a PFN, return what node the PFN is on. | 4389 | * Required by SPARSEMEM. Given a PFN, return what node the PFN is on. |
4352 | * Architectures may implement their own version but if add_active_range() | ||
4353 | * was used and there are no special requirements, this is a convenient | ||
4354 | * alternative | ||
4355 | */ | 4390 | */ |
4356 | int __meminit __early_pfn_to_nid(unsigned long pfn) | 4391 | int __meminit __early_pfn_to_nid(unsigned long pfn) |
4357 | { | 4392 | { |
@@ -4406,10 +4441,9 @@ bool __meminit early_pfn_in_nid(unsigned long pfn, int node) | |||
4406 | * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed. | 4441 | * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed. |
4407 | * @max_low_pfn: The highest PFN that will be passed to memblock_free_early_nid | 4442 | * @max_low_pfn: The highest PFN that will be passed to memblock_free_early_nid |
4408 | * | 4443 | * |
4409 | * If an architecture guarantees that all ranges registered with | 4444 | * If an architecture guarantees that all ranges registered contain no holes |
4410 | * add_active_ranges() contain no holes and may be freed, this | 4445 | * and may be freed, this function may be used instead of calling |
4411 | * this function may be used instead of calling memblock_free_early_nid() | 4446 | * memblock_free_early_nid() manually. |
4412 | * manually. | ||
4413 | */ | 4447 | */ |
4414 | void __init free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn) | 4448 | void __init free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn) |
4415 | { | 4449 | { |
@@ -4431,9 +4465,8 @@ void __init free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn) | |||
4431 | * sparse_memory_present_with_active_regions - Call memory_present for each active range | 4465 | * sparse_memory_present_with_active_regions - Call memory_present for each active range |
4432 | * @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used. | 4466 | * @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used. |
4433 | * | 4467 | * |
4434 | * If an architecture guarantees that all ranges registered with | 4468 | * If an architecture guarantees that all ranges registered contain no holes and may |
4435 | * add_active_ranges() contain no holes and may be freed, this | 4469 | * be freed, this function may be used instead of calling memory_present() manually. |
4436 | * function may be used instead of calling memory_present() manually. | ||
4437 | */ | 4470 | */ |
4438 | void __init sparse_memory_present_with_active_regions(int nid) | 4471 | void __init sparse_memory_present_with_active_regions(int nid) |
4439 | { | 4472 | { |
@@ -4451,7 +4484,7 @@ void __init sparse_memory_present_with_active_regions(int nid) | |||
4451 | * @end_pfn: Passed by reference. On return, it will have the node end_pfn. | 4484 | * @end_pfn: Passed by reference. On return, it will have the node end_pfn. |
4452 | * | 4485 | * |
4453 | * It returns the start and end page frame of a node based on information | 4486 | * It returns the start and end page frame of a node based on information |
4454 | * provided by an arch calling add_active_range(). If called for a node | 4487 | * provided by memblock_set_node(). If called for a node |
4455 | * with no available memory, a warning is printed and the start and end | 4488 | * with no available memory, a warning is printed and the start and end |
4456 | * PFNs will be 0. | 4489 | * PFNs will be 0. |
4457 | */ | 4490 | */ |
@@ -4921,8 +4954,6 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size, | |||
4921 | 4954 | ||
4922 | pgdat->node_id = nid; | 4955 | pgdat->node_id = nid; |
4923 | pgdat->node_start_pfn = node_start_pfn; | 4956 | pgdat->node_start_pfn = node_start_pfn; |
4924 | if (node_state(nid, N_MEMORY)) | ||
4925 | init_zone_allows_reclaim(nid); | ||
4926 | #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP | 4957 | #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP |
4927 | get_pfn_range_for_nid(nid, &start_pfn, &end_pfn); | 4958 | get_pfn_range_for_nid(nid, &start_pfn, &end_pfn); |
4928 | #endif | 4959 | #endif |
@@ -5030,7 +5061,7 @@ static unsigned long __init find_min_pfn_for_node(int nid) | |||
5030 | * find_min_pfn_with_active_regions - Find the minimum PFN registered | 5061 | * find_min_pfn_with_active_regions - Find the minimum PFN registered |
5031 | * | 5062 | * |
5032 | * It returns the minimum PFN based on information provided via | 5063 | * It returns the minimum PFN based on information provided via |
5033 | * add_active_range(). | 5064 | * memblock_set_node(). |
5034 | */ | 5065 | */ |
5035 | unsigned long __init find_min_pfn_with_active_regions(void) | 5066 | unsigned long __init find_min_pfn_with_active_regions(void) |
5036 | { | 5067 | { |
@@ -5251,7 +5282,7 @@ static void check_for_memory(pg_data_t *pgdat, int nid) | |||
5251 | * @max_zone_pfn: an array of max PFNs for each zone | 5282 | * @max_zone_pfn: an array of max PFNs for each zone |
5252 | * | 5283 | * |
5253 | * This will call free_area_init_node() for each active node in the system. | 5284 | * This will call free_area_init_node() for each active node in the system. |
5254 | * Using the page ranges provided by add_active_range(), the size of each | 5285 | * Using the page ranges provided by memblock_set_node(), the size of each |
5255 | * zone in each node and their holes is calculated. If the maximum PFN | 5286 | * zone in each node and their holes is calculated. If the maximum PFN |
5256 | * between two adjacent zones match, it is assumed that the zone is empty. | 5287 | * between two adjacent zones match, it is assumed that the zone is empty. |
5257 | * For example, if arch_max_dma_pfn == arch_max_dma32_pfn, it is assumed | 5288 | * For example, if arch_max_dma_pfn == arch_max_dma32_pfn, it is assumed |
@@ -6009,53 +6040,64 @@ static inline int pfn_to_bitidx(struct zone *zone, unsigned long pfn) | |||
6009 | * @end_bitidx: The last bit of interest | 6040 | * @end_bitidx: The last bit of interest |
6010 | * returns pageblock_bits flags | 6041 | * returns pageblock_bits flags |
6011 | */ | 6042 | */ |
6012 | unsigned long get_pageblock_flags_group(struct page *page, | 6043 | unsigned long get_pfnblock_flags_mask(struct page *page, unsigned long pfn, |
6013 | int start_bitidx, int end_bitidx) | 6044 | unsigned long end_bitidx, |
6045 | unsigned long mask) | ||
6014 | { | 6046 | { |
6015 | struct zone *zone; | 6047 | struct zone *zone; |
6016 | unsigned long *bitmap; | 6048 | unsigned long *bitmap; |
6017 | unsigned long pfn, bitidx; | 6049 | unsigned long bitidx, word_bitidx; |
6018 | unsigned long flags = 0; | 6050 | unsigned long word; |
6019 | unsigned long value = 1; | ||
6020 | 6051 | ||
6021 | zone = page_zone(page); | 6052 | zone = page_zone(page); |
6022 | pfn = page_to_pfn(page); | ||
6023 | bitmap = get_pageblock_bitmap(zone, pfn); | 6053 | bitmap = get_pageblock_bitmap(zone, pfn); |
6024 | bitidx = pfn_to_bitidx(zone, pfn); | 6054 | bitidx = pfn_to_bitidx(zone, pfn); |
6055 | word_bitidx = bitidx / BITS_PER_LONG; | ||
6056 | bitidx &= (BITS_PER_LONG-1); | ||
6025 | 6057 | ||
6026 | for (; start_bitidx <= end_bitidx; start_bitidx++, value <<= 1) | 6058 | word = bitmap[word_bitidx]; |
6027 | if (test_bit(bitidx + start_bitidx, bitmap)) | 6059 | bitidx += end_bitidx; |
6028 | flags |= value; | 6060 | return (word >> (BITS_PER_LONG - bitidx - 1)) & mask; |
6029 | |||
6030 | return flags; | ||
6031 | } | 6061 | } |
6032 | 6062 | ||
6033 | /** | 6063 | /** |
6034 | * set_pageblock_flags_group - Set the requested group of flags for a pageblock_nr_pages block of pages | 6064 | * set_pfnblock_flags_mask - Set the requested group of flags for a pageblock_nr_pages block of pages |
6035 | * @page: The page within the block of interest | 6065 | * @page: The page within the block of interest |
6036 | * @start_bitidx: The first bit of interest | 6066 | * @start_bitidx: The first bit of interest |
6037 | * @end_bitidx: The last bit of interest | 6067 | * @end_bitidx: The last bit of interest |
6038 | * @flags: The flags to set | 6068 | * @flags: The flags to set |
6039 | */ | 6069 | */ |
6040 | void set_pageblock_flags_group(struct page *page, unsigned long flags, | 6070 | void set_pfnblock_flags_mask(struct page *page, unsigned long flags, |
6041 | int start_bitidx, int end_bitidx) | 6071 | unsigned long pfn, |
6072 | unsigned long end_bitidx, | ||
6073 | unsigned long mask) | ||
6042 | { | 6074 | { |
6043 | struct zone *zone; | 6075 | struct zone *zone; |
6044 | unsigned long *bitmap; | 6076 | unsigned long *bitmap; |
6045 | unsigned long pfn, bitidx; | 6077 | unsigned long bitidx, word_bitidx; |
6046 | unsigned long value = 1; | 6078 | unsigned long old_word, word; |
6079 | |||
6080 | BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 4); | ||
6047 | 6081 | ||
6048 | zone = page_zone(page); | 6082 | zone = page_zone(page); |
6049 | pfn = page_to_pfn(page); | ||
6050 | bitmap = get_pageblock_bitmap(zone, pfn); | 6083 | bitmap = get_pageblock_bitmap(zone, pfn); |
6051 | bitidx = pfn_to_bitidx(zone, pfn); | 6084 | bitidx = pfn_to_bitidx(zone, pfn); |
6085 | word_bitidx = bitidx / BITS_PER_LONG; | ||
6086 | bitidx &= (BITS_PER_LONG-1); | ||
6087 | |||
6052 | VM_BUG_ON_PAGE(!zone_spans_pfn(zone, pfn), page); | 6088 | VM_BUG_ON_PAGE(!zone_spans_pfn(zone, pfn), page); |
6053 | 6089 | ||
6054 | for (; start_bitidx <= end_bitidx; start_bitidx++, value <<= 1) | 6090 | bitidx += end_bitidx; |
6055 | if (flags & value) | 6091 | mask <<= (BITS_PER_LONG - bitidx - 1); |
6056 | __set_bit(bitidx + start_bitidx, bitmap); | 6092 | flags <<= (BITS_PER_LONG - bitidx - 1); |
6057 | else | 6093 | |
6058 | __clear_bit(bitidx + start_bitidx, bitmap); | 6094 | word = ACCESS_ONCE(bitmap[word_bitidx]); |
6095 | for (;;) { | ||
6096 | old_word = cmpxchg(&bitmap[word_bitidx], word, (word & ~mask) | flags); | ||
6097 | if (word == old_word) | ||
6098 | break; | ||
6099 | word = old_word; | ||
6100 | } | ||
6059 | } | 6101 | } |
6060 | 6102 | ||
6061 | /* | 6103 | /* |
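The rewrite above replaces the per-bit test_bit()/__set_bit() loops with a single word-sized load plus shift-and-mask on the read side and a cmpxchg() retry loop on the write side, so two CPUs updating adjacent pageblock fields can no longer clobber each other's bits. A standalone sketch of the same read/modify pattern (the 3-bit width and bit position are arbitrary and this does not reproduce the kernel's exact pageblock bit layout; the GCC/Clang __atomic builtins stand in for cmpxchg()):

#include <stdio.h>

#define FIELD_BITS	3UL
#define FIELD_MASK	((1UL << FIELD_BITS) - 1)

/* Read a packed 3-bit field with one load, one shift and one mask. */
static unsigned long get_field(const unsigned long *word, unsigned int shift)
{
	return (*word >> shift) & FIELD_MASK;
}

/* Atomically replace the field, retrying if another writer raced with us. */
static void set_field(unsigned long *word, unsigned int shift, unsigned long val)
{
	unsigned long mask = FIELD_MASK << shift;
	unsigned long flags = (val & FIELD_MASK) << shift;
	unsigned long old = __atomic_load_n(word, __ATOMIC_RELAXED);

	/* on failure the builtin refreshes 'old' with the current word value */
	while (!__atomic_compare_exchange_n(word, &old, (old & ~mask) | flags,
					    0, __ATOMIC_RELAXED, __ATOMIC_RELAXED))
		;
}

int main(void)
{
	unsigned long bitmap = 0;

	set_field(&bitmap, 4, 5);	/* e.g. store a migratetype of 5 at bit 4 */
	printf("field = %lu\n", get_field(&bitmap, 4));
	return 0;
}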
@@ -6215,7 +6257,7 @@ static int __alloc_contig_migrate_range(struct compact_control *cc, | |||
6215 | cc->nr_migratepages -= nr_reclaimed; | 6257 | cc->nr_migratepages -= nr_reclaimed; |
6216 | 6258 | ||
6217 | ret = migrate_pages(&cc->migratepages, alloc_migrate_target, | 6259 | ret = migrate_pages(&cc->migratepages, alloc_migrate_target, |
6218 | 0, MIGRATE_SYNC, MR_CMA); | 6260 | NULL, 0, cc->mode, MR_CMA); |
6219 | } | 6261 | } |
6220 | if (ret < 0) { | 6262 | if (ret < 0) { |
6221 | putback_movable_pages(&cc->migratepages); | 6263 | putback_movable_pages(&cc->migratepages); |
@@ -6254,7 +6296,7 @@ int alloc_contig_range(unsigned long start, unsigned long end, | |||
6254 | .nr_migratepages = 0, | 6296 | .nr_migratepages = 0, |
6255 | .order = -1, | 6297 | .order = -1, |
6256 | .zone = page_zone(pfn_to_page(start)), | 6298 | .zone = page_zone(pfn_to_page(start)), |
6257 | .sync = true, | 6299 | .mode = MIGRATE_SYNC, |
6258 | .ignore_skip_hint = true, | 6300 | .ignore_skip_hint = true, |
6259 | }; | 6301 | }; |
6260 | INIT_LIST_HEAD(&cc.migratepages); | 6302 | INIT_LIST_HEAD(&cc.migratepages); |
@@ -6409,7 +6451,7 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) | |||
6409 | { | 6451 | { |
6410 | struct page *page; | 6452 | struct page *page; |
6411 | struct zone *zone; | 6453 | struct zone *zone; |
6412 | int order, i; | 6454 | unsigned int order, i; |
6413 | unsigned long pfn; | 6455 | unsigned long pfn; |
6414 | unsigned long flags; | 6456 | unsigned long flags; |
6415 | /* find the first valid pfn */ | 6457 | /* find the first valid pfn */ |
@@ -6461,7 +6503,7 @@ bool is_free_buddy_page(struct page *page) | |||
6461 | struct zone *zone = page_zone(page); | 6503 | struct zone *zone = page_zone(page); |
6462 | unsigned long pfn = page_to_pfn(page); | 6504 | unsigned long pfn = page_to_pfn(page); |
6463 | unsigned long flags; | 6505 | unsigned long flags; |
6464 | int order; | 6506 | unsigned int order; |
6465 | 6507 | ||
6466 | spin_lock_irqsave(&zone->lock, flags); | 6508 | spin_lock_irqsave(&zone->lock, flags); |
6467 | for (order = 0; order < MAX_ORDER; order++) { | 6509 | for (order = 0; order < MAX_ORDER; order++) { |
diff --git a/mm/page_io.c b/mm/page_io.c index 7c59ef681381..58b50d2901fe 100644 --- a/mm/page_io.c +++ b/mm/page_io.c | |||
@@ -248,11 +248,16 @@ out: | |||
248 | return ret; | 248 | return ret; |
249 | } | 249 | } |
250 | 250 | ||
251 | static sector_t swap_page_sector(struct page *page) | ||
252 | { | ||
253 | return (sector_t)__page_file_index(page) << (PAGE_CACHE_SHIFT - 9); | ||
254 | } | ||
255 | |||
251 | int __swap_writepage(struct page *page, struct writeback_control *wbc, | 256 | int __swap_writepage(struct page *page, struct writeback_control *wbc, |
252 | void (*end_write_func)(struct bio *, int)) | 257 | void (*end_write_func)(struct bio *, int)) |
253 | { | 258 | { |
254 | struct bio *bio; | 259 | struct bio *bio; |
255 | int ret = 0, rw = WRITE; | 260 | int ret, rw = WRITE; |
256 | struct swap_info_struct *sis = page_swap_info(page); | 261 | struct swap_info_struct *sis = page_swap_info(page); |
257 | 262 | ||
258 | if (sis->flags & SWP_FILE) { | 263 | if (sis->flags & SWP_FILE) { |
@@ -297,6 +302,13 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc, | |||
297 | return ret; | 302 | return ret; |
298 | } | 303 | } |
299 | 304 | ||
305 | ret = bdev_write_page(sis->bdev, swap_page_sector(page), page, wbc); | ||
306 | if (!ret) { | ||
307 | count_vm_event(PSWPOUT); | ||
308 | return 0; | ||
309 | } | ||
310 | |||
311 | ret = 0; | ||
300 | bio = get_swap_bio(GFP_NOIO, page, end_write_func); | 312 | bio = get_swap_bio(GFP_NOIO, page, end_write_func); |
301 | if (bio == NULL) { | 313 | if (bio == NULL) { |
302 | set_page_dirty(page); | 314 | set_page_dirty(page); |
@@ -338,6 +350,13 @@ int swap_readpage(struct page *page) | |||
338 | return ret; | 350 | return ret; |
339 | } | 351 | } |
340 | 352 | ||
353 | ret = bdev_read_page(sis->bdev, swap_page_sector(page), page); | ||
354 | if (!ret) { | ||
355 | count_vm_event(PSWPIN); | ||
356 | return 0; | ||
357 | } | ||
358 | |||
359 | ret = 0; | ||
341 | bio = get_swap_bio(GFP_KERNEL, page, end_swap_bio_read); | 360 | bio = get_swap_bio(GFP_KERNEL, page, end_swap_bio_read); |
342 | if (bio == NULL) { | 361 | if (bio == NULL) { |
343 | unlock_page(page); | 362 | unlock_page(page); |
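Both hunks above add a fast path: when the backing block driver supports page-based I/O, bdev_write_page()/bdev_read_page() hand the page to it directly, the PSWPOUT/PSWPIN event is counted and the bio path is skipped; a non-zero return simply falls through to the existing bio code with ret reset to 0. swap_page_sector() converts the swap slot's page index into a 512-byte sector number by shifting left by PAGE_CACHE_SHIFT - 9. A standalone arithmetic check of that conversion (assumes 4 KB pages, i.e. PAGE_CACHE_SHIFT == 12):

#include <stdio.h>
#include <stdint.h>

#define PAGE_CACHE_SHIFT 12	/* 4096-byte pages assumed for this example */

static uint64_t page_index_to_sector(uint64_t page_index)
{
	return page_index << (PAGE_CACHE_SHIFT - 9);	/* one page == 8 sectors */
}

int main(void)
{
	/* swap slot 3 starts at byte 3 * 4096 = 12288, i.e. sector 24 */
	printf("slot 3 -> sector %llu\n",
	       (unsigned long long)page_index_to_sector(3));
	return 0;
}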
diff --git a/mm/rmap.c b/mm/rmap.c --- a/mm/rmap.c +++ b/mm/rmap.c | |||
@@ -103,6 +103,7 @@ static inline void anon_vma_free(struct anon_vma *anon_vma) | |||
103 | * LOCK should suffice since the actual taking of the lock must | 103 | * LOCK should suffice since the actual taking of the lock must |
104 | * happen _before_ what follows. | 104 | * happen _before_ what follows. |
105 | */ | 105 | */ |
106 | might_sleep(); | ||
106 | if (rwsem_is_locked(&anon_vma->root->rwsem)) { | 107 | if (rwsem_is_locked(&anon_vma->root->rwsem)) { |
107 | anon_vma_lock_write(anon_vma); | 108 | anon_vma_lock_write(anon_vma); |
108 | anon_vma_unlock_write(anon_vma); | 109 | anon_vma_unlock_write(anon_vma); |
@@ -426,8 +427,9 @@ struct anon_vma *page_get_anon_vma(struct page *page) | |||
426 | * above cannot corrupt). | 427 | * above cannot corrupt). |
427 | */ | 428 | */ |
428 | if (!page_mapped(page)) { | 429 | if (!page_mapped(page)) { |
430 | rcu_read_unlock(); | ||
429 | put_anon_vma(anon_vma); | 431 | put_anon_vma(anon_vma); |
430 | anon_vma = NULL; | 432 | return NULL; |
431 | } | 433 | } |
432 | out: | 434 | out: |
433 | rcu_read_unlock(); | 435 | rcu_read_unlock(); |
@@ -477,9 +479,9 @@ struct anon_vma *page_lock_anon_vma_read(struct page *page) | |||
477 | } | 479 | } |
478 | 480 | ||
479 | if (!page_mapped(page)) { | 481 | if (!page_mapped(page)) { |
482 | rcu_read_unlock(); | ||
480 | put_anon_vma(anon_vma); | 483 | put_anon_vma(anon_vma); |
481 | anon_vma = NULL; | 484 | return NULL; |
482 | goto out; | ||
483 | } | 485 | } |
484 | 486 | ||
485 | /* we pinned the anon_vma, its safe to sleep */ | 487 | /* we pinned the anon_vma, its safe to sleep */ |
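With might_sleep() now asserted on the anon_vma_free() path, dropping the last anon_vma reference may sleep on the root rwsem, which is why both hunks above move the final put_anon_vma() outside the RCU read-side critical section. A hedged sketch of the resulting ordering (example_try_get_anon_vma() is a hypothetical helper condensing page_get_anon_vma()'s flow, not code from this patch):

#include <linux/mm.h>
#include <linux/rmap.h>
#include <linux/rcupdate.h>

static struct anon_vma *example_try_get_anon_vma(struct page *page,
						 struct anon_vma *anon_vma)
{
	rcu_read_lock();
	if (!atomic_inc_not_zero(&anon_vma->refcount)) {
		rcu_read_unlock();
		return NULL;
	}
	if (!page_mapped(page)) {
		/* leave the RCU read section first: this put may drop the
		 * last reference and sleep inside anon_vma_free() */
		rcu_read_unlock();
		put_anon_vma(anon_vma);
		return NULL;
	}
	rcu_read_unlock();
	return anon_vma;
}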
@@ -669,7 +671,7 @@ struct page_referenced_arg { | |||
669 | /* | 671 | /* |
670 | * arg: page_referenced_arg will be passed | 672 | * arg: page_referenced_arg will be passed |
671 | */ | 673 | */ |
672 | int page_referenced_one(struct page *page, struct vm_area_struct *vma, | 674 | static int page_referenced_one(struct page *page, struct vm_area_struct *vma, |
673 | unsigned long address, void *arg) | 675 | unsigned long address, void *arg) |
674 | { | 676 | { |
675 | struct mm_struct *mm = vma->vm_mm; | 677 | struct mm_struct *mm = vma->vm_mm; |
@@ -986,6 +988,12 @@ void do_page_add_anon_rmap(struct page *page, | |||
986 | { | 988 | { |
987 | int first = atomic_inc_and_test(&page->_mapcount); | 989 | int first = atomic_inc_and_test(&page->_mapcount); |
988 | if (first) { | 990 | if (first) { |
991 | /* | ||
992 | * We use the irq-unsafe __{inc|mod}_zone_page_stat because | ||
993 | * these counters are not modified in interrupt context, and | ||
994 | * pte lock (a spinlock) is held, which implies preemption | ||
995 | * disabled. | ||
996 | */ | ||
989 | if (PageTransHuge(page)) | 997 | if (PageTransHuge(page)) |
990 | __inc_zone_page_state(page, | 998 | __inc_zone_page_state(page, |
991 | NR_ANON_TRANSPARENT_HUGEPAGES); | 999 | NR_ANON_TRANSPARENT_HUGEPAGES); |
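The comment added above records why the cheap, irq-unsafe statistics helpers are safe in rmap: as a simplified model, inc_zone_page_state() differs from __inc_zone_page_state() mainly in disabling interrupts around the per-cpu update, so the __ variant is fine whenever the counter is never touched from interrupt context and preemption is already off, which the pte spinlock guarantees here. Illustrative fragment, not the mm/rmap.c code:

	/*
	 * The __ helper skips the irq disabling because the caller
	 * guarantees no interrupt updates this counter and preemption
	 * is disabled (here via the pte spinlock).
	 */
	spin_lock(ptl);					/* pte lock held */
	__inc_zone_page_state(page, NR_ANON_PAGES);	/* irq-unsafe, but safe here */
	spin_unlock(ptl);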
@@ -1024,11 +1032,25 @@ void page_add_new_anon_rmap(struct page *page, | |||
1024 | __mod_zone_page_state(page_zone(page), NR_ANON_PAGES, | 1032 | __mod_zone_page_state(page_zone(page), NR_ANON_PAGES, |
1025 | hpage_nr_pages(page)); | 1033 | hpage_nr_pages(page)); |
1026 | __page_set_anon_rmap(page, vma, address, 1); | 1034 | __page_set_anon_rmap(page, vma, address, 1); |
1027 | if (!mlocked_vma_newpage(vma, page)) { | 1035 | |
1036 | VM_BUG_ON_PAGE(PageLRU(page), page); | ||
1037 | if (likely((vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) != VM_LOCKED)) { | ||
1028 | SetPageActive(page); | 1038 | SetPageActive(page); |
1029 | lru_cache_add(page); | 1039 | lru_cache_add(page); |
1030 | } else | 1040 | return; |
1031 | add_page_to_unevictable_list(page); | 1041 | } |
1042 | |||
1043 | if (!TestSetPageMlocked(page)) { | ||
1044 | /* | ||
1045 | * We use the irq-unsafe __mod_zone_page_stat because this | ||
1046 | * counter is not modified from interrupt context, and the pte | ||
1047 | * lock is held (a spinlock), which implies preemption disabled. | ||
1048 | */ | ||
1049 | __mod_zone_page_state(page_zone(page), NR_MLOCK, | ||
1050 | hpage_nr_pages(page)); | ||
1051 | count_vm_event(UNEVICTABLE_PGMLOCKED); | ||
1052 | } | ||
1053 | add_page_to_unevictable_list(page); | ||
1032 | } | 1054 | } |
1033 | 1055 | ||
1034 | /** | 1056 | /** |
@@ -1077,6 +1099,11 @@ void page_remove_rmap(struct page *page) | |||
1077 | /* | 1099 | /* |
1078 | * Hugepages are not counted in NR_ANON_PAGES nor NR_FILE_MAPPED | 1100 | * Hugepages are not counted in NR_ANON_PAGES nor NR_FILE_MAPPED |
1079 | * and not charged by memcg for now. | 1101 | * and not charged by memcg for now. |
1102 | * | ||
1103 | * We use the irq-unsafe __{inc|mod}_zone_page_stat because | ||
1104 | * these counters are not modified in interrupt context, and | ||
1106 | * pte lock (a spinlock) is held, which implies preemption disabled. | ||
1080 | */ | 1107 | */ |
1081 | if (unlikely(PageHuge(page))) | 1108 | if (unlikely(PageHuge(page))) |
1082 | goto out; | 1109 | goto out; |
@@ -1112,7 +1139,7 @@ out: | |||
1112 | /* | 1139 | /* |
1113 | * @arg: enum ttu_flags will be passed to this argument | 1140 | * @arg: enum ttu_flags will be passed to this argument |
1114 | */ | 1141 | */ |
1115 | int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, | 1142 | static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, |
1116 | unsigned long address, void *arg) | 1143 | unsigned long address, void *arg) |
1117 | { | 1144 | { |
1118 | struct mm_struct *mm = vma->vm_mm; | 1145 | struct mm_struct *mm = vma->vm_mm; |
@@ -1135,7 +1162,7 @@ int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, | |||
1135 | if (vma->vm_flags & VM_LOCKED) | 1162 | if (vma->vm_flags & VM_LOCKED) |
1136 | goto out_mlock; | 1163 | goto out_mlock; |
1137 | 1164 | ||
1138 | if (TTU_ACTION(flags) == TTU_MUNLOCK) | 1165 | if (flags & TTU_MUNLOCK) |
1139 | goto out_unmap; | 1166 | goto out_unmap; |
1140 | } | 1167 | } |
1141 | if (!(flags & TTU_IGNORE_ACCESS)) { | 1168 | if (!(flags & TTU_IGNORE_ACCESS)) { |
@@ -1203,7 +1230,7 @@ int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, | |||
1203 | * pte. do_swap_page() will wait until the migration | 1230 | * pte. do_swap_page() will wait until the migration |
1204 | * pte is removed and then restart fault handling. | 1231 | * pte is removed and then restart fault handling. |
1205 | */ | 1232 | */ |
1206 | BUG_ON(TTU_ACTION(flags) != TTU_MIGRATION); | 1233 | BUG_ON(!(flags & TTU_MIGRATION)); |
1207 | entry = make_migration_entry(page, pte_write(pteval)); | 1234 | entry = make_migration_entry(page, pte_write(pteval)); |
1208 | } | 1235 | } |
1209 | swp_pte = swp_entry_to_pte(entry); | 1236 | swp_pte = swp_entry_to_pte(entry); |
@@ -1212,7 +1239,7 @@ int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, | |||
1212 | set_pte_at(mm, address, pte, swp_pte); | 1239 | set_pte_at(mm, address, pte, swp_pte); |
1213 | BUG_ON(pte_file(*pte)); | 1240 | BUG_ON(pte_file(*pte)); |
1214 | } else if (IS_ENABLED(CONFIG_MIGRATION) && | 1241 | } else if (IS_ENABLED(CONFIG_MIGRATION) && |
1215 | (TTU_ACTION(flags) == TTU_MIGRATION)) { | 1242 | (flags & TTU_MIGRATION)) { |
1216 | /* Establish migration entry for a file page */ | 1243 | /* Establish migration entry for a file page */ |
1217 | swp_entry_t entry; | 1244 | swp_entry_t entry; |
1218 | entry = make_migration_entry(page, pte_write(pteval)); | 1245 | entry = make_migration_entry(page, pte_write(pteval)); |
@@ -1225,7 +1252,7 @@ int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, | |||
1225 | 1252 | ||
1226 | out_unmap: | 1253 | out_unmap: |
1227 | pte_unmap_unlock(pte, ptl); | 1254 | pte_unmap_unlock(pte, ptl); |
1228 | if (ret != SWAP_FAIL) | 1255 | if (ret != SWAP_FAIL && !(flags & TTU_MUNLOCK)) |
1229 | mmu_notifier_invalidate_page(mm, address); | 1256 | mmu_notifier_invalidate_page(mm, address); |
1230 | out: | 1257 | out: |
1231 | return ret; | 1258 | return ret; |
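The TTU_ACTION() conversions above treat ttu_flags as a plain bitmask: an action is now tested with a bitwise AND rather than an equality check, so it composes with modifier bits, and the mmu notifier call at out_unmap can be skipped for the munlock-only pass. A small sketch of that test style, as an illustration rather than the try_to_unmap_one() logic:

	static bool sketch_munlock_only(enum ttu_flags flags,
					struct vm_area_struct *vma)
	{
		/* bitmask test: composes with any other TTU_* modifier */
		return (vma->vm_flags & VM_LOCKED) && (flags & TTU_MUNLOCK);
	}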
@@ -1359,7 +1386,7 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount, | |||
1359 | if (page->index != linear_page_index(vma, address)) { | 1386 | if (page->index != linear_page_index(vma, address)) { |
1360 | pte_t ptfile = pgoff_to_pte(page->index); | 1387 | pte_t ptfile = pgoff_to_pte(page->index); |
1361 | if (pte_soft_dirty(pteval)) | 1388 | if (pte_soft_dirty(pteval)) |
1362 | pte_file_mksoft_dirty(ptfile); | 1389 | ptfile = pte_file_mksoft_dirty(ptfile); |
1363 | set_pte_at(mm, address, pte, ptfile); | 1390 | set_pte_at(mm, address, pte, ptfile); |
1364 | } | 1391 | } |
1365 | 1392 | ||
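The one-line fix above addresses a dropped return value: pte helpers such as pte_file_mksoft_dirty() take a pte by value and hand back the modified value, so ignoring the result silently loses the soft-dirty bit. The corrected usage, roughly:

	pte_t ptfile = pgoff_to_pte(page->index);

	if (pte_soft_dirty(pteval))
		ptfile = pte_file_mksoft_dirty(ptfile);	/* keep the returned pte */
	set_pte_at(mm, address, pte, ptfile);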
@@ -1512,7 +1539,7 @@ int try_to_unmap(struct page *page, enum ttu_flags flags) | |||
1512 | * locking requirements of exec(), migration skips | 1539 | * locking requirements of exec(), migration skips |
1513 | * temporary VMAs until after exec() completes. | 1540 | * temporary VMAs until after exec() completes. |
1514 | */ | 1541 | */ |
1515 | if (flags & TTU_MIGRATION && !PageKsm(page) && PageAnon(page)) | 1542 | if ((flags & TTU_MIGRATION) && !PageKsm(page) && PageAnon(page)) |
1516 | rwc.invalid_vma = invalid_migration_vma; | 1543 | rwc.invalid_vma = invalid_migration_vma; |
1517 | 1544 | ||
1518 | ret = rmap_walk(page, &rwc); | 1545 | ret = rmap_walk(page, &rwc); |
diff --git a/mm/shmem.c b/mm/shmem.c index 9f70e02111c6..5402481c28d1 100644 --- a/mm/shmem.c +++ b/mm/shmem.c | |||
@@ -1132,7 +1132,7 @@ repeat: | |||
1132 | goto decused; | 1132 | goto decused; |
1133 | } | 1133 | } |
1134 | 1134 | ||
1135 | SetPageSwapBacked(page); | 1135 | __SetPageSwapBacked(page); |
1136 | __set_page_locked(page); | 1136 | __set_page_locked(page); |
1137 | error = mem_cgroup_charge_file(page, current->mm, | 1137 | error = mem_cgroup_charge_file(page, current->mm, |
1138 | gfp & GFP_RECLAIM_MASK); | 1138 | gfp & GFP_RECLAIM_MASK); |
@@ -1372,9 +1372,13 @@ shmem_write_begin(struct file *file, struct address_space *mapping, | |||
1372 | loff_t pos, unsigned len, unsigned flags, | 1372 | loff_t pos, unsigned len, unsigned flags, |
1373 | struct page **pagep, void **fsdata) | 1373 | struct page **pagep, void **fsdata) |
1374 | { | 1374 | { |
1375 | int ret; | ||
1375 | struct inode *inode = mapping->host; | 1376 | struct inode *inode = mapping->host; |
1376 | pgoff_t index = pos >> PAGE_CACHE_SHIFT; | 1377 | pgoff_t index = pos >> PAGE_CACHE_SHIFT; |
1377 | return shmem_getpage(inode, index, pagep, SGP_WRITE, NULL); | 1378 | ret = shmem_getpage(inode, index, pagep, SGP_WRITE, NULL); |
1379 | if (ret == 0 && *pagep) | ||
1380 | init_page_accessed(*pagep); | ||
1381 | return ret; | ||
1378 | } | 1382 | } |
1379 | 1383 | ||
1380 | static int | 1384 | static int |
@@ -1621,10 +1621,16 @@ __initcall(cpucache_init); | |||
1621 | static noinline void | 1621 | static noinline void |
1622 | slab_out_of_memory(struct kmem_cache *cachep, gfp_t gfpflags, int nodeid) | 1622 | slab_out_of_memory(struct kmem_cache *cachep, gfp_t gfpflags, int nodeid) |
1623 | { | 1623 | { |
1624 | #if DEBUG | ||
1624 | struct kmem_cache_node *n; | 1625 | struct kmem_cache_node *n; |
1625 | struct page *page; | 1626 | struct page *page; |
1626 | unsigned long flags; | 1627 | unsigned long flags; |
1627 | int node; | 1628 | int node; |
1629 | static DEFINE_RATELIMIT_STATE(slab_oom_rs, DEFAULT_RATELIMIT_INTERVAL, | ||
1630 | DEFAULT_RATELIMIT_BURST); | ||
1631 | |||
1632 | if ((gfpflags & __GFP_NOWARN) || !__ratelimit(&slab_oom_rs)) | ||
1633 | return; | ||
1628 | 1634 | ||
1629 | printk(KERN_WARNING | 1635 | printk(KERN_WARNING |
1630 | "SLAB: Unable to allocate memory on node %d (gfp=0x%x)\n", | 1636 | "SLAB: Unable to allocate memory on node %d (gfp=0x%x)\n", |
@@ -1662,6 +1668,7 @@ slab_out_of_memory(struct kmem_cache *cachep, gfp_t gfpflags, int nodeid) | |||
1662 | node, active_slabs, num_slabs, active_objs, num_objs, | 1668 | node, active_slabs, num_slabs, active_objs, num_objs, |
1663 | free_objects); | 1669 | free_objects); |
1664 | } | 1670 | } |
1671 | #endif | ||
1665 | } | 1672 | } |
1666 | 1673 | ||
1667 | /* | 1674 | /* |
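The hunk above makes the SLAB out-of-memory report self-limiting: the body is compiled only with DEBUG, printing honours __GFP_NOWARN, and a static ratelimit state throttles repeats, so callers such as kmem_getpages() no longer need their own printk_ratelimit() guard. A minimal sketch of the gating pattern (illustrative helper, not the mm/slab.c function):

	static void sketch_oom_warning(gfp_t gfpflags, int nodeid)
	{
		static DEFINE_RATELIMIT_STATE(oom_rs, DEFAULT_RATELIMIT_INTERVAL,
					      DEFAULT_RATELIMIT_BURST);

		if ((gfpflags & __GFP_NOWARN) || !__ratelimit(&oom_rs))
			return;		/* caller wants silence, or too chatty */

		pr_warn("sketch: allocation failed on node %d (gfp=0x%x)\n",
			nodeid, gfpflags);
	}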
@@ -1681,10 +1688,13 @@ static struct page *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, | |||
1681 | if (cachep->flags & SLAB_RECLAIM_ACCOUNT) | 1688 | if (cachep->flags & SLAB_RECLAIM_ACCOUNT) |
1682 | flags |= __GFP_RECLAIMABLE; | 1689 | flags |= __GFP_RECLAIMABLE; |
1683 | 1690 | ||
1691 | if (memcg_charge_slab(cachep, flags, cachep->gfporder)) | ||
1692 | return NULL; | ||
1693 | |||
1684 | page = alloc_pages_exact_node(nodeid, flags | __GFP_NOTRACK, cachep->gfporder); | 1694 | page = alloc_pages_exact_node(nodeid, flags | __GFP_NOTRACK, cachep->gfporder); |
1685 | if (!page) { | 1695 | if (!page) { |
1686 | if (!(flags & __GFP_NOWARN) && printk_ratelimit()) | 1696 | memcg_uncharge_slab(cachep, cachep->gfporder); |
1687 | slab_out_of_memory(cachep, flags, nodeid); | 1697 | slab_out_of_memory(cachep, flags, nodeid); |
1688 | return NULL; | 1698 | return NULL; |
1689 | } | 1699 | } |
1690 | 1700 | ||
@@ -1702,7 +1712,6 @@ static struct page *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, | |||
1702 | __SetPageSlab(page); | 1712 | __SetPageSlab(page); |
1703 | if (page->pfmemalloc) | 1713 | if (page->pfmemalloc) |
1704 | SetPageSlabPfmemalloc(page); | 1714 | SetPageSlabPfmemalloc(page); |
1705 | memcg_bind_pages(cachep, cachep->gfporder); | ||
1706 | 1715 | ||
1707 | if (kmemcheck_enabled && !(cachep->flags & SLAB_NOTRACK)) { | 1716 | if (kmemcheck_enabled && !(cachep->flags & SLAB_NOTRACK)) { |
1708 | kmemcheck_alloc_shadow(page, cachep->gfporder, flags, nodeid); | 1717 | kmemcheck_alloc_shadow(page, cachep->gfporder, flags, nodeid); |
@@ -1738,10 +1747,10 @@ static void kmem_freepages(struct kmem_cache *cachep, struct page *page) | |||
1738 | page_mapcount_reset(page); | 1747 | page_mapcount_reset(page); |
1739 | page->mapping = NULL; | 1748 | page->mapping = NULL; |
1740 | 1749 | ||
1741 | memcg_release_pages(cachep, cachep->gfporder); | ||
1742 | if (current->reclaim_state) | 1750 | if (current->reclaim_state) |
1743 | current->reclaim_state->reclaimed_slab += nr_freed; | 1751 | current->reclaim_state->reclaimed_slab += nr_freed; |
1744 | __free_memcg_kmem_pages(page, cachep->gfporder); | 1752 | __free_pages(page, cachep->gfporder); |
1753 | memcg_uncharge_slab(cachep, cachep->gfporder); | ||
1745 | } | 1754 | } |
1746 | 1755 | ||
1747 | static void kmem_rcu_free(struct rcu_head *head) | 1756 | static void kmem_rcu_free(struct rcu_head *head) |
@@ -2469,8 +2478,7 @@ out: | |||
2469 | return nr_freed; | 2478 | return nr_freed; |
2470 | } | 2479 | } |
2471 | 2480 | ||
2472 | /* Called with slab_mutex held to protect against cpu hotplug */ | 2481 | int __kmem_cache_shrink(struct kmem_cache *cachep) |
2473 | static int __cache_shrink(struct kmem_cache *cachep) | ||
2474 | { | 2482 | { |
2475 | int ret = 0, i = 0; | 2483 | int ret = 0, i = 0; |
2476 | struct kmem_cache_node *n; | 2484 | struct kmem_cache_node *n; |
@@ -2491,32 +2499,11 @@ static int __cache_shrink(struct kmem_cache *cachep) | |||
2491 | return (ret ? 1 : 0); | 2499 | return (ret ? 1 : 0); |
2492 | } | 2500 | } |
2493 | 2501 | ||
2494 | /** | ||
2495 | * kmem_cache_shrink - Shrink a cache. | ||
2496 | * @cachep: The cache to shrink. | ||
2497 | * | ||
2498 | * Releases as many slabs as possible for a cache. | ||
2499 | * To help debugging, a zero exit status indicates all slabs were released. | ||
2500 | */ | ||
2501 | int kmem_cache_shrink(struct kmem_cache *cachep) | ||
2502 | { | ||
2503 | int ret; | ||
2504 | BUG_ON(!cachep || in_interrupt()); | ||
2505 | |||
2506 | get_online_cpus(); | ||
2507 | mutex_lock(&slab_mutex); | ||
2508 | ret = __cache_shrink(cachep); | ||
2509 | mutex_unlock(&slab_mutex); | ||
2510 | put_online_cpus(); | ||
2511 | return ret; | ||
2512 | } | ||
2513 | EXPORT_SYMBOL(kmem_cache_shrink); | ||
2514 | |||
2515 | int __kmem_cache_shutdown(struct kmem_cache *cachep) | 2502 | int __kmem_cache_shutdown(struct kmem_cache *cachep) |
2516 | { | 2503 | { |
2517 | int i; | 2504 | int i; |
2518 | struct kmem_cache_node *n; | 2505 | struct kmem_cache_node *n; |
2519 | int rc = __cache_shrink(cachep); | 2506 | int rc = __kmem_cache_shrink(cachep); |
2520 | 2507 | ||
2521 | if (rc) | 2508 | if (rc) |
2522 | return rc; | 2509 | return rc; |
@@ -91,6 +91,7 @@ __kmem_cache_alias(const char *name, size_t size, size_t align, | |||
91 | #define CACHE_CREATE_MASK (SLAB_CORE_FLAGS | SLAB_DEBUG_FLAGS | SLAB_CACHE_FLAGS) | 91 | #define CACHE_CREATE_MASK (SLAB_CORE_FLAGS | SLAB_DEBUG_FLAGS | SLAB_CACHE_FLAGS) |
92 | 92 | ||
93 | int __kmem_cache_shutdown(struct kmem_cache *); | 93 | int __kmem_cache_shutdown(struct kmem_cache *); |
94 | int __kmem_cache_shrink(struct kmem_cache *); | ||
94 | void slab_kmem_cache_release(struct kmem_cache *); | 95 | void slab_kmem_cache_release(struct kmem_cache *); |
95 | 96 | ||
96 | struct seq_file; | 97 | struct seq_file; |
@@ -120,21 +121,6 @@ static inline bool is_root_cache(struct kmem_cache *s) | |||
120 | return !s->memcg_params || s->memcg_params->is_root_cache; | 121 | return !s->memcg_params || s->memcg_params->is_root_cache; |
121 | } | 122 | } |
122 | 123 | ||
123 | static inline void memcg_bind_pages(struct kmem_cache *s, int order) | ||
124 | { | ||
125 | if (!is_root_cache(s)) | ||
126 | atomic_add(1 << order, &s->memcg_params->nr_pages); | ||
127 | } | ||
128 | |||
129 | static inline void memcg_release_pages(struct kmem_cache *s, int order) | ||
130 | { | ||
131 | if (is_root_cache(s)) | ||
132 | return; | ||
133 | |||
134 | if (atomic_sub_and_test((1 << order), &s->memcg_params->nr_pages)) | ||
135 | mem_cgroup_destroy_cache(s); | ||
136 | } | ||
137 | |||
138 | static inline bool slab_equal_or_root(struct kmem_cache *s, | 124 | static inline bool slab_equal_or_root(struct kmem_cache *s, |
139 | struct kmem_cache *p) | 125 | struct kmem_cache *p) |
140 | { | 126 | { |
@@ -192,18 +178,29 @@ static inline struct kmem_cache *memcg_root_cache(struct kmem_cache *s) | |||
192 | return s; | 178 | return s; |
193 | return s->memcg_params->root_cache; | 179 | return s->memcg_params->root_cache; |
194 | } | 180 | } |
195 | #else | 181 | |
196 | static inline bool is_root_cache(struct kmem_cache *s) | 182 | static __always_inline int memcg_charge_slab(struct kmem_cache *s, |
183 | gfp_t gfp, int order) | ||
197 | { | 184 | { |
198 | return true; | 185 | if (!memcg_kmem_enabled()) |
186 | return 0; | ||
187 | if (is_root_cache(s)) | ||
188 | return 0; | ||
189 | return __memcg_charge_slab(s, gfp, order); | ||
199 | } | 190 | } |
200 | 191 | ||
201 | static inline void memcg_bind_pages(struct kmem_cache *s, int order) | 192 | static __always_inline void memcg_uncharge_slab(struct kmem_cache *s, int order) |
202 | { | 193 | { |
194 | if (!memcg_kmem_enabled()) | ||
195 | return; | ||
196 | if (is_root_cache(s)) | ||
197 | return; | ||
198 | __memcg_uncharge_slab(s, order); | ||
203 | } | 199 | } |
204 | 200 | #else | |
205 | static inline void memcg_release_pages(struct kmem_cache *s, int order) | 201 | static inline bool is_root_cache(struct kmem_cache *s) |
206 | { | 202 | { |
203 | return true; | ||
207 | } | 204 | } |
208 | 205 | ||
209 | static inline bool slab_equal_or_root(struct kmem_cache *s, | 206 | static inline bool slab_equal_or_root(struct kmem_cache *s, |
@@ -227,6 +224,15 @@ static inline struct kmem_cache *memcg_root_cache(struct kmem_cache *s) | |||
227 | { | 224 | { |
228 | return s; | 225 | return s; |
229 | } | 226 | } |
227 | |||
228 | static inline int memcg_charge_slab(struct kmem_cache *s, gfp_t gfp, int order) | ||
229 | { | ||
230 | return 0; | ||
231 | } | ||
232 | |||
233 | static inline void memcg_uncharge_slab(struct kmem_cache *s, int order) | ||
234 | { | ||
235 | } | ||
230 | #endif | 236 | #endif |
231 | 237 | ||
232 | static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x) | 238 | static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x) |
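memcg_charge_slab()/memcg_uncharge_slab() above give both slab allocators one accounting entry point: charge the owning memcg before allocating backing pages and undo the charge on allocation failure or slab teardown, while memcg_kmem_enabled() and is_root_cache() keep the common case free of overhead. A rough sketch of the calling convention (hypothetical function, not the mm/slab.c or mm/slub.c code):

	static struct page *sketch_alloc_slab_pages(struct kmem_cache *s,
						    gfp_t flags, int order)
	{
		struct page *page;

		if (memcg_charge_slab(s, flags, order))
			return NULL;			/* over the memcg limit */

		page = alloc_pages(flags, order);
		if (!page)
			memcg_uncharge_slab(s, order);	/* hand the charge back */
		return page;
	}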
diff --git a/mm/slab_common.c b/mm/slab_common.c index 102cc6fca3d3..735e01a0db6f 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c | |||
@@ -160,7 +160,6 @@ do_kmem_cache_create(char *name, size_t object_size, size_t size, size_t align, | |||
160 | 160 | ||
161 | s->refcount = 1; | 161 | s->refcount = 1; |
162 | list_add(&s->list, &slab_caches); | 162 | list_add(&s->list, &slab_caches); |
163 | memcg_register_cache(s); | ||
164 | out: | 163 | out: |
165 | if (err) | 164 | if (err) |
166 | return ERR_PTR(err); | 165 | return ERR_PTR(err); |
@@ -205,6 +204,8 @@ kmem_cache_create(const char *name, size_t size, size_t align, | |||
205 | int err; | 204 | int err; |
206 | 205 | ||
207 | get_online_cpus(); | 206 | get_online_cpus(); |
207 | get_online_mems(); | ||
208 | |||
208 | mutex_lock(&slab_mutex); | 209 | mutex_lock(&slab_mutex); |
209 | 210 | ||
210 | err = kmem_cache_sanity_check(name, size); | 211 | err = kmem_cache_sanity_check(name, size); |
@@ -239,6 +240,8 @@ kmem_cache_create(const char *name, size_t size, size_t align, | |||
239 | 240 | ||
240 | out_unlock: | 241 | out_unlock: |
241 | mutex_unlock(&slab_mutex); | 242 | mutex_unlock(&slab_mutex); |
243 | |||
244 | put_online_mems(); | ||
242 | put_online_cpus(); | 245 | put_online_cpus(); |
243 | 246 | ||
244 | if (err) { | 247 | if (err) { |
@@ -258,31 +261,29 @@ EXPORT_SYMBOL(kmem_cache_create); | |||
258 | 261 | ||
259 | #ifdef CONFIG_MEMCG_KMEM | 262 | #ifdef CONFIG_MEMCG_KMEM |
260 | /* | 263 | /* |
261 | * kmem_cache_create_memcg - Create a cache for a memory cgroup. | 264 | * memcg_create_kmem_cache - Create a cache for a memory cgroup. |
262 | * @memcg: The memory cgroup the new cache is for. | 265 | * @memcg: The memory cgroup the new cache is for. |
263 | * @root_cache: The parent of the new cache. | 266 | * @root_cache: The parent of the new cache. |
267 | * @memcg_name: The name of the memory cgroup (used for naming the new cache). | ||
264 | * | 268 | * |
265 | * This function attempts to create a kmem cache that will serve allocation | 269 | * This function attempts to create a kmem cache that will serve allocation |
266 | * requests going from @memcg to @root_cache. The new cache inherits properties | 270 | * requests going from @memcg to @root_cache. The new cache inherits properties |
267 | * from its parent. | 271 | * from its parent. |
268 | */ | 272 | */ |
269 | void kmem_cache_create_memcg(struct mem_cgroup *memcg, struct kmem_cache *root_cache) | 273 | struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg, |
274 | struct kmem_cache *root_cache, | ||
275 | const char *memcg_name) | ||
270 | { | 276 | { |
271 | struct kmem_cache *s; | 277 | struct kmem_cache *s = NULL; |
272 | char *cache_name; | 278 | char *cache_name; |
273 | 279 | ||
274 | get_online_cpus(); | 280 | get_online_cpus(); |
275 | mutex_lock(&slab_mutex); | 281 | get_online_mems(); |
276 | 282 | ||
277 | /* | 283 | mutex_lock(&slab_mutex); |
278 | * Since per-memcg caches are created asynchronously on first | ||
279 | * allocation (see memcg_kmem_get_cache()), several threads can try to | ||
280 | * create the same cache, but only one of them may succeed. | ||
281 | */ | ||
282 | if (cache_from_memcg_idx(root_cache, memcg_cache_id(memcg))) | ||
283 | goto out_unlock; | ||
284 | 284 | ||
285 | cache_name = memcg_create_cache_name(memcg, root_cache); | 285 | cache_name = kasprintf(GFP_KERNEL, "%s(%d:%s)", root_cache->name, |
286 | memcg_cache_id(memcg), memcg_name); | ||
286 | if (!cache_name) | 287 | if (!cache_name) |
287 | goto out_unlock; | 288 | goto out_unlock; |
288 | 289 | ||
@@ -292,17 +293,19 @@ void kmem_cache_create_memcg(struct mem_cgroup *memcg, struct kmem_cache *root_c | |||
292 | memcg, root_cache); | 293 | memcg, root_cache); |
293 | if (IS_ERR(s)) { | 294 | if (IS_ERR(s)) { |
294 | kfree(cache_name); | 295 | kfree(cache_name); |
295 | goto out_unlock; | 296 | s = NULL; |
296 | } | 297 | } |
297 | 298 | ||
298 | s->allocflags |= __GFP_KMEMCG; | ||
299 | |||
300 | out_unlock: | 299 | out_unlock: |
301 | mutex_unlock(&slab_mutex); | 300 | mutex_unlock(&slab_mutex); |
301 | |||
302 | put_online_mems(); | ||
302 | put_online_cpus(); | 303 | put_online_cpus(); |
304 | |||
305 | return s; | ||
303 | } | 306 | } |
304 | 307 | ||
305 | static int kmem_cache_destroy_memcg_children(struct kmem_cache *s) | 308 | static int memcg_cleanup_cache_params(struct kmem_cache *s) |
306 | { | 309 | { |
307 | int rc; | 310 | int rc; |
308 | 311 | ||
@@ -311,13 +314,13 @@ static int kmem_cache_destroy_memcg_children(struct kmem_cache *s) | |||
311 | return 0; | 314 | return 0; |
312 | 315 | ||
313 | mutex_unlock(&slab_mutex); | 316 | mutex_unlock(&slab_mutex); |
314 | rc = __kmem_cache_destroy_memcg_children(s); | 317 | rc = __memcg_cleanup_cache_params(s); |
315 | mutex_lock(&slab_mutex); | 318 | mutex_lock(&slab_mutex); |
316 | 319 | ||
317 | return rc; | 320 | return rc; |
318 | } | 321 | } |
319 | #else | 322 | #else |
320 | static int kmem_cache_destroy_memcg_children(struct kmem_cache *s) | 323 | static int memcg_cleanup_cache_params(struct kmem_cache *s) |
321 | { | 324 | { |
322 | return 0; | 325 | return 0; |
323 | } | 326 | } |
@@ -332,27 +335,26 @@ void slab_kmem_cache_release(struct kmem_cache *s) | |||
332 | void kmem_cache_destroy(struct kmem_cache *s) | 335 | void kmem_cache_destroy(struct kmem_cache *s) |
333 | { | 336 | { |
334 | get_online_cpus(); | 337 | get_online_cpus(); |
338 | get_online_mems(); | ||
339 | |||
335 | mutex_lock(&slab_mutex); | 340 | mutex_lock(&slab_mutex); |
336 | 341 | ||
337 | s->refcount--; | 342 | s->refcount--; |
338 | if (s->refcount) | 343 | if (s->refcount) |
339 | goto out_unlock; | 344 | goto out_unlock; |
340 | 345 | ||
341 | if (kmem_cache_destroy_memcg_children(s) != 0) | 346 | if (memcg_cleanup_cache_params(s) != 0) |
342 | goto out_unlock; | 347 | goto out_unlock; |
343 | 348 | ||
344 | list_del(&s->list); | ||
345 | memcg_unregister_cache(s); | ||
346 | |||
347 | if (__kmem_cache_shutdown(s) != 0) { | 349 | if (__kmem_cache_shutdown(s) != 0) { |
348 | list_add(&s->list, &slab_caches); | ||
349 | memcg_register_cache(s); | ||
350 | printk(KERN_ERR "kmem_cache_destroy %s: " | 350 | printk(KERN_ERR "kmem_cache_destroy %s: " |
351 | "Slab cache still has objects\n", s->name); | 351 | "Slab cache still has objects\n", s->name); |
352 | dump_stack(); | 352 | dump_stack(); |
353 | goto out_unlock; | 353 | goto out_unlock; |
354 | } | 354 | } |
355 | 355 | ||
356 | list_del(&s->list); | ||
357 | |||
356 | mutex_unlock(&slab_mutex); | 358 | mutex_unlock(&slab_mutex); |
357 | if (s->flags & SLAB_DESTROY_BY_RCU) | 359 | if (s->flags & SLAB_DESTROY_BY_RCU) |
358 | rcu_barrier(); | 360 | rcu_barrier(); |
@@ -363,15 +365,36 @@ void kmem_cache_destroy(struct kmem_cache *s) | |||
363 | #else | 365 | #else |
364 | slab_kmem_cache_release(s); | 366 | slab_kmem_cache_release(s); |
365 | #endif | 367 | #endif |
366 | goto out_put_cpus; | 368 | goto out; |
367 | 369 | ||
368 | out_unlock: | 370 | out_unlock: |
369 | mutex_unlock(&slab_mutex); | 371 | mutex_unlock(&slab_mutex); |
370 | out_put_cpus: | 372 | out: |
373 | put_online_mems(); | ||
371 | put_online_cpus(); | 374 | put_online_cpus(); |
372 | } | 375 | } |
373 | EXPORT_SYMBOL(kmem_cache_destroy); | 376 | EXPORT_SYMBOL(kmem_cache_destroy); |
374 | 377 | ||
378 | /** | ||
379 | * kmem_cache_shrink - Shrink a cache. | ||
380 | * @cachep: The cache to shrink. | ||
381 | * | ||
382 | * Releases as many slabs as possible for a cache. | ||
383 | * To help debugging, a zero exit status indicates all slabs were released. | ||
384 | */ | ||
385 | int kmem_cache_shrink(struct kmem_cache *cachep) | ||
386 | { | ||
387 | int ret; | ||
388 | |||
389 | get_online_cpus(); | ||
390 | get_online_mems(); | ||
391 | ret = __kmem_cache_shrink(cachep); | ||
392 | put_online_mems(); | ||
393 | put_online_cpus(); | ||
394 | return ret; | ||
395 | } | ||
396 | EXPORT_SYMBOL(kmem_cache_shrink); | ||
397 | |||
375 | int slab_is_available(void) | 398 | int slab_is_available(void) |
376 | { | 399 | { |
377 | return slab_state >= UP; | 400 | return slab_state >= UP; |
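kmem_cache_shrink() now lives in slab_common.c and, like kmem_cache_create() and kmem_cache_destroy() above, brackets the per-allocator __kmem_cache_shrink() with both CPU and memory hotplug pins. A sketch of the bracketing order used throughout this file (hypothetical walker, assuming the usual slab_mutex/slab_caches globals):

	static void sketch_for_each_cache(void (*fn)(struct kmem_cache *))
	{
		struct kmem_cache *s;

		get_online_cpus();		/* pin CPU hotplug first */
		get_online_mems();		/* then memory hotplug */
		mutex_lock(&slab_mutex);

		list_for_each_entry(s, &slab_caches, list)
			fn(s);

		mutex_unlock(&slab_mutex);
		put_online_mems();		/* release in reverse order */
		put_online_cpus();
	}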
@@ -586,6 +609,24 @@ void __init create_kmalloc_caches(unsigned long flags) | |||
586 | } | 609 | } |
587 | #endif /* !CONFIG_SLOB */ | 610 | #endif /* !CONFIG_SLOB */ |
588 | 611 | ||
612 | /* | ||
613 | * To avoid unnecessary overhead, we pass through large allocation requests | ||
614 | * directly to the page allocator. We use __GFP_COMP, because we will need to | ||
615 | * know the allocation order to free the pages properly in kfree. | ||
616 | */ | ||
617 | void *kmalloc_order(size_t size, gfp_t flags, unsigned int order) | ||
618 | { | ||
619 | void *ret; | ||
620 | struct page *page; | ||
621 | |||
622 | flags |= __GFP_COMP; | ||
623 | page = alloc_kmem_pages(flags, order); | ||
624 | ret = page ? page_address(page) : NULL; | ||
625 | kmemleak_alloc(ret, size, 1, flags); | ||
626 | return ret; | ||
627 | } | ||
628 | EXPORT_SYMBOL(kmalloc_order); | ||
629 | |||
589 | #ifdef CONFIG_TRACING | 630 | #ifdef CONFIG_TRACING |
590 | void *kmalloc_order_trace(size_t size, gfp_t flags, unsigned int order) | 631 | void *kmalloc_order_trace(size_t size, gfp_t flags, unsigned int order) |
591 | { | 632 | { |
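kmalloc_order() above is the out-of-line path for requests bigger than the largest kmalloc cache: the pages come straight from the page allocator with __GFP_COMP so kfree() can later read the order off the compound head. Callers do not invoke it directly; kmalloc() routes oversized sizes to it (through kmalloc_large()), roughly like this simplified sketch (not the include/linux/slab.h source):

	static inline void *sketch_kmalloc(size_t size, gfp_t flags)
	{
		if (size > KMALLOC_MAX_CACHE_SIZE)
			return kmalloc_order(size, flags, get_order(size));

		return kmalloc(size, flags);	/* normal slab-cache path */
	}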
@@ -620,11 +620,10 @@ int __kmem_cache_shutdown(struct kmem_cache *c) | |||
620 | return 0; | 620 | return 0; |
621 | } | 621 | } |
622 | 622 | ||
623 | int kmem_cache_shrink(struct kmem_cache *d) | 623 | int __kmem_cache_shrink(struct kmem_cache *d) |
624 | { | 624 | { |
625 | return 0; | 625 | return 0; |
626 | } | 626 | } |
627 | EXPORT_SYMBOL(kmem_cache_shrink); | ||
628 | 627 | ||
629 | struct kmem_cache kmem_cache_boot = { | 628 | struct kmem_cache kmem_cache_boot = { |
630 | .name = "kmem_cache", | 629 | .name = "kmem_cache", |
@@ -403,7 +403,7 @@ static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct page *page | |||
403 | stat(s, CMPXCHG_DOUBLE_FAIL); | 403 | stat(s, CMPXCHG_DOUBLE_FAIL); |
404 | 404 | ||
405 | #ifdef SLUB_DEBUG_CMPXCHG | 405 | #ifdef SLUB_DEBUG_CMPXCHG |
406 | printk(KERN_INFO "%s %s: cmpxchg double redo ", n, s->name); | 406 | pr_info("%s %s: cmpxchg double redo ", n, s->name); |
407 | #endif | 407 | #endif |
408 | 408 | ||
409 | return 0; | 409 | return 0; |
@@ -444,7 +444,7 @@ static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page, | |||
444 | stat(s, CMPXCHG_DOUBLE_FAIL); | 444 | stat(s, CMPXCHG_DOUBLE_FAIL); |
445 | 445 | ||
446 | #ifdef SLUB_DEBUG_CMPXCHG | 446 | #ifdef SLUB_DEBUG_CMPXCHG |
447 | printk(KERN_INFO "%s %s: cmpxchg double redo ", n, s->name); | 447 | pr_info("%s %s: cmpxchg double redo ", n, s->name); |
448 | #endif | 448 | #endif |
449 | 449 | ||
450 | return 0; | 450 | return 0; |
@@ -546,14 +546,14 @@ static void print_track(const char *s, struct track *t) | |||
546 | if (!t->addr) | 546 | if (!t->addr) |
547 | return; | 547 | return; |
548 | 548 | ||
549 | printk(KERN_ERR "INFO: %s in %pS age=%lu cpu=%u pid=%d\n", | 549 | pr_err("INFO: %s in %pS age=%lu cpu=%u pid=%d\n", |
550 | s, (void *)t->addr, jiffies - t->when, t->cpu, t->pid); | 550 | s, (void *)t->addr, jiffies - t->when, t->cpu, t->pid); |
551 | #ifdef CONFIG_STACKTRACE | 551 | #ifdef CONFIG_STACKTRACE |
552 | { | 552 | { |
553 | int i; | 553 | int i; |
554 | for (i = 0; i < TRACK_ADDRS_COUNT; i++) | 554 | for (i = 0; i < TRACK_ADDRS_COUNT; i++) |
555 | if (t->addrs[i]) | 555 | if (t->addrs[i]) |
556 | printk(KERN_ERR "\t%pS\n", (void *)t->addrs[i]); | 556 | pr_err("\t%pS\n", (void *)t->addrs[i]); |
557 | else | 557 | else |
558 | break; | 558 | break; |
559 | } | 559 | } |
@@ -571,38 +571,37 @@ static void print_tracking(struct kmem_cache *s, void *object) | |||
571 | 571 | ||
572 | static void print_page_info(struct page *page) | 572 | static void print_page_info(struct page *page) |
573 | { | 573 | { |
574 | printk(KERN_ERR | 574 | pr_err("INFO: Slab 0x%p objects=%u used=%u fp=0x%p flags=0x%04lx\n", |
575 | "INFO: Slab 0x%p objects=%u used=%u fp=0x%p flags=0x%04lx\n", | ||
576 | page, page->objects, page->inuse, page->freelist, page->flags); | 575 | page, page->objects, page->inuse, page->freelist, page->flags); |
577 | 576 | ||
578 | } | 577 | } |
579 | 578 | ||
580 | static void slab_bug(struct kmem_cache *s, char *fmt, ...) | 579 | static void slab_bug(struct kmem_cache *s, char *fmt, ...) |
581 | { | 580 | { |
581 | struct va_format vaf; | ||
582 | va_list args; | 582 | va_list args; |
583 | char buf[100]; | ||
584 | 583 | ||
585 | va_start(args, fmt); | 584 | va_start(args, fmt); |
586 | vsnprintf(buf, sizeof(buf), fmt, args); | 585 | vaf.fmt = fmt; |
587 | va_end(args); | 586 | vaf.va = &args; |
588 | printk(KERN_ERR "========================================" | 587 | pr_err("=============================================================================\n"); |
589 | "=====================================\n"); | 588 | pr_err("BUG %s (%s): %pV\n", s->name, print_tainted(), &vaf); |
590 | printk(KERN_ERR "BUG %s (%s): %s\n", s->name, print_tainted(), buf); | 589 | pr_err("-----------------------------------------------------------------------------\n\n"); |
591 | printk(KERN_ERR "----------------------------------------" | ||
592 | "-------------------------------------\n\n"); | ||
593 | 590 | ||
594 | add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); | 591 | add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); |
592 | va_end(args); | ||
595 | } | 593 | } |
596 | 594 | ||
597 | static void slab_fix(struct kmem_cache *s, char *fmt, ...) | 595 | static void slab_fix(struct kmem_cache *s, char *fmt, ...) |
598 | { | 596 | { |
597 | struct va_format vaf; | ||
599 | va_list args; | 598 | va_list args; |
600 | char buf[100]; | ||
601 | 599 | ||
602 | va_start(args, fmt); | 600 | va_start(args, fmt); |
603 | vsnprintf(buf, sizeof(buf), fmt, args); | 601 | vaf.fmt = fmt; |
602 | vaf.va = &args; | ||
603 | pr_err("FIX %s: %pV\n", s->name, &vaf); | ||
604 | va_end(args); | 604 | va_end(args); |
605 | printk(KERN_ERR "FIX %s: %s\n", s->name, buf); | ||
606 | } | 605 | } |
607 | 606 | ||
608 | static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p) | 607 | static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p) |
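slab_bug() and slab_fix() above drop the fixed 100-byte vsnprintf() buffer in favour of the kernel's %pV mechanism: a struct va_format bundles the format string with the va_list so printk formats the message once, with no risk of truncation. The same pattern in isolation (illustrative wrapper, not the mm/slub.c functions):

	static void sketch_report(const char *prefix, const char *fmt, ...)
	{
		struct va_format vaf;
		va_list args;

		va_start(args, fmt);
		vaf.fmt = fmt;
		vaf.va = &args;			/* consumed by the %pV below */
		pr_err("%s: %pV\n", prefix, &vaf);
		va_end(args);
	}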
@@ -614,8 +613,8 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p) | |||
614 | 613 | ||
615 | print_page_info(page); | 614 | print_page_info(page); |
616 | 615 | ||
617 | printk(KERN_ERR "INFO: Object 0x%p @offset=%tu fp=0x%p\n\n", | 616 | pr_err("INFO: Object 0x%p @offset=%tu fp=0x%p\n\n", |
618 | p, p - addr, get_freepointer(s, p)); | 617 | p, p - addr, get_freepointer(s, p)); |
619 | 618 | ||
620 | if (p > addr + 16) | 619 | if (p > addr + 16) |
621 | print_section("Bytes b4 ", p - 16, 16); | 620 | print_section("Bytes b4 ", p - 16, 16); |
@@ -698,7 +697,7 @@ static int check_bytes_and_report(struct kmem_cache *s, struct page *page, | |||
698 | end--; | 697 | end--; |
699 | 698 | ||
700 | slab_bug(s, "%s overwritten", what); | 699 | slab_bug(s, "%s overwritten", what); |
701 | printk(KERN_ERR "INFO: 0x%p-0x%p. First byte 0x%x instead of 0x%x\n", | 700 | pr_err("INFO: 0x%p-0x%p. First byte 0x%x instead of 0x%x\n", |
702 | fault, end - 1, fault[0], value); | 701 | fault, end - 1, fault[0], value); |
703 | print_trailer(s, page, object); | 702 | print_trailer(s, page, object); |
704 | 703 | ||
@@ -931,7 +930,7 @@ static void trace(struct kmem_cache *s, struct page *page, void *object, | |||
931 | int alloc) | 930 | int alloc) |
932 | { | 931 | { |
933 | if (s->flags & SLAB_TRACE) { | 932 | if (s->flags & SLAB_TRACE) { |
934 | printk(KERN_INFO "TRACE %s %s 0x%p inuse=%d fp=0x%p\n", | 933 | pr_info("TRACE %s %s 0x%p inuse=%d fp=0x%p\n", |
935 | s->name, | 934 | s->name, |
936 | alloc ? "alloc" : "free", | 935 | alloc ? "alloc" : "free", |
937 | object, page->inuse, | 936 | object, page->inuse, |
@@ -1134,9 +1133,8 @@ static noinline struct kmem_cache_node *free_debug_processing( | |||
1134 | slab_err(s, page, "Attempt to free object(0x%p) " | 1133 | slab_err(s, page, "Attempt to free object(0x%p) " |
1135 | "outside of slab", object); | 1134 | "outside of slab", object); |
1136 | } else if (!page->slab_cache) { | 1135 | } else if (!page->slab_cache) { |
1137 | printk(KERN_ERR | 1136 | pr_err("SLUB <none>: no slab for object 0x%p.\n", |
1138 | "SLUB <none>: no slab for object 0x%p.\n", | 1137 | object); |
1139 | object); | ||
1140 | dump_stack(); | 1138 | dump_stack(); |
1141 | } else | 1139 | } else |
1142 | object_err(s, page, object, | 1140 | object_err(s, page, object, |
@@ -1219,8 +1217,8 @@ static int __init setup_slub_debug(char *str) | |||
1219 | slub_debug |= SLAB_FAILSLAB; | 1217 | slub_debug |= SLAB_FAILSLAB; |
1220 | break; | 1218 | break; |
1221 | default: | 1219 | default: |
1222 | printk(KERN_ERR "slub_debug option '%c' " | 1220 | pr_err("slub_debug option '%c' unknown. skipped\n", |
1223 | "unknown. skipped\n", *str); | 1221 | *str); |
1224 | } | 1222 | } |
1225 | } | 1223 | } |
1226 | 1224 | ||
@@ -1314,17 +1312,26 @@ static inline void slab_free_hook(struct kmem_cache *s, void *x) | |||
1314 | /* | 1312 | /* |
1315 | * Slab allocation and freeing | 1313 | * Slab allocation and freeing |
1316 | */ | 1314 | */ |
1317 | static inline struct page *alloc_slab_page(gfp_t flags, int node, | 1315 | static inline struct page *alloc_slab_page(struct kmem_cache *s, |
1318 | struct kmem_cache_order_objects oo) | 1316 | gfp_t flags, int node, struct kmem_cache_order_objects oo) |
1319 | { | 1317 | { |
1318 | struct page *page; | ||
1320 | int order = oo_order(oo); | 1319 | int order = oo_order(oo); |
1321 | 1320 | ||
1322 | flags |= __GFP_NOTRACK; | 1321 | flags |= __GFP_NOTRACK; |
1323 | 1322 | ||
1323 | if (memcg_charge_slab(s, flags, order)) | ||
1324 | return NULL; | ||
1325 | |||
1324 | if (node == NUMA_NO_NODE) | 1326 | if (node == NUMA_NO_NODE) |
1325 | return alloc_pages(flags, order); | 1327 | page = alloc_pages(flags, order); |
1326 | else | 1328 | else |
1327 | return alloc_pages_exact_node(node, flags, order); | 1329 | page = alloc_pages_exact_node(node, flags, order); |
1330 | |||
1331 | if (!page) | ||
1332 | memcg_uncharge_slab(s, order); | ||
1333 | |||
1334 | return page; | ||
1328 | } | 1335 | } |
1329 | 1336 | ||
1330 | static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) | 1337 | static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) |
@@ -1346,7 +1353,7 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) | |||
1346 | */ | 1353 | */ |
1347 | alloc_gfp = (flags | __GFP_NOWARN | __GFP_NORETRY) & ~__GFP_NOFAIL; | 1354 | alloc_gfp = (flags | __GFP_NOWARN | __GFP_NORETRY) & ~__GFP_NOFAIL; |
1348 | 1355 | ||
1349 | page = alloc_slab_page(alloc_gfp, node, oo); | 1356 | page = alloc_slab_page(s, alloc_gfp, node, oo); |
1350 | if (unlikely(!page)) { | 1357 | if (unlikely(!page)) { |
1351 | oo = s->min; | 1358 | oo = s->min; |
1352 | alloc_gfp = flags; | 1359 | alloc_gfp = flags; |
@@ -1354,7 +1361,7 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) | |||
1354 | * Allocation may have failed due to fragmentation. | 1361 | * Allocation may have failed due to fragmentation. |
1355 | * Try a lower order alloc if possible | 1362 | * Try a lower order alloc if possible |
1356 | */ | 1363 | */ |
1357 | page = alloc_slab_page(alloc_gfp, node, oo); | 1364 | page = alloc_slab_page(s, alloc_gfp, node, oo); |
1358 | 1365 | ||
1359 | if (page) | 1366 | if (page) |
1360 | stat(s, ORDER_FALLBACK); | 1367 | stat(s, ORDER_FALLBACK); |
@@ -1415,7 +1422,6 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) | |||
1415 | 1422 | ||
1416 | order = compound_order(page); | 1423 | order = compound_order(page); |
1417 | inc_slabs_node(s, page_to_nid(page), page->objects); | 1424 | inc_slabs_node(s, page_to_nid(page), page->objects); |
1418 | memcg_bind_pages(s, order); | ||
1419 | page->slab_cache = s; | 1425 | page->slab_cache = s; |
1420 | __SetPageSlab(page); | 1426 | __SetPageSlab(page); |
1421 | if (page->pfmemalloc) | 1427 | if (page->pfmemalloc) |
@@ -1466,11 +1472,11 @@ static void __free_slab(struct kmem_cache *s, struct page *page) | |||
1466 | __ClearPageSlabPfmemalloc(page); | 1472 | __ClearPageSlabPfmemalloc(page); |
1467 | __ClearPageSlab(page); | 1473 | __ClearPageSlab(page); |
1468 | 1474 | ||
1469 | memcg_release_pages(s, order); | ||
1470 | page_mapcount_reset(page); | 1475 | page_mapcount_reset(page); |
1471 | if (current->reclaim_state) | 1476 | if (current->reclaim_state) |
1472 | current->reclaim_state->reclaimed_slab += pages; | 1477 | current->reclaim_state->reclaimed_slab += pages; |
1473 | __free_memcg_kmem_pages(page, order); | 1478 | __free_pages(page, order); |
1479 | memcg_uncharge_slab(s, order); | ||
1474 | } | 1480 | } |
1475 | 1481 | ||
1476 | #define need_reserve_slab_rcu \ | 1482 | #define need_reserve_slab_rcu \ |
@@ -1770,19 +1776,19 @@ static inline void note_cmpxchg_failure(const char *n, | |||
1770 | #ifdef SLUB_DEBUG_CMPXCHG | 1776 | #ifdef SLUB_DEBUG_CMPXCHG |
1771 | unsigned long actual_tid = __this_cpu_read(s->cpu_slab->tid); | 1777 | unsigned long actual_tid = __this_cpu_read(s->cpu_slab->tid); |
1772 | 1778 | ||
1773 | printk(KERN_INFO "%s %s: cmpxchg redo ", n, s->name); | 1779 | pr_info("%s %s: cmpxchg redo ", n, s->name); |
1774 | 1780 | ||
1775 | #ifdef CONFIG_PREEMPT | 1781 | #ifdef CONFIG_PREEMPT |
1776 | if (tid_to_cpu(tid) != tid_to_cpu(actual_tid)) | 1782 | if (tid_to_cpu(tid) != tid_to_cpu(actual_tid)) |
1777 | printk("due to cpu change %d -> %d\n", | 1783 | pr_warn("due to cpu change %d -> %d\n", |
1778 | tid_to_cpu(tid), tid_to_cpu(actual_tid)); | 1784 | tid_to_cpu(tid), tid_to_cpu(actual_tid)); |
1779 | else | 1785 | else |
1780 | #endif | 1786 | #endif |
1781 | if (tid_to_event(tid) != tid_to_event(actual_tid)) | 1787 | if (tid_to_event(tid) != tid_to_event(actual_tid)) |
1782 | printk("due to cpu running other code. Event %ld->%ld\n", | 1788 | pr_warn("due to cpu running other code. Event %ld->%ld\n", |
1783 | tid_to_event(tid), tid_to_event(actual_tid)); | 1789 | tid_to_event(tid), tid_to_event(actual_tid)); |
1784 | else | 1790 | else |
1785 | printk("for unknown reason: actual=%lx was=%lx target=%lx\n", | 1791 | pr_warn("for unknown reason: actual=%lx was=%lx target=%lx\n", |
1786 | actual_tid, tid, next_tid(tid)); | 1792 | actual_tid, tid, next_tid(tid)); |
1787 | #endif | 1793 | #endif |
1788 | stat(s, CMPXCHG_DOUBLE_CPU_FAIL); | 1794 | stat(s, CMPXCHG_DOUBLE_CPU_FAIL); |
@@ -2121,11 +2127,19 @@ static inline int node_match(struct page *page, int node) | |||
2121 | return 1; | 2127 | return 1; |
2122 | } | 2128 | } |
2123 | 2129 | ||
2130 | #ifdef CONFIG_SLUB_DEBUG | ||
2124 | static int count_free(struct page *page) | 2131 | static int count_free(struct page *page) |
2125 | { | 2132 | { |
2126 | return page->objects - page->inuse; | 2133 | return page->objects - page->inuse; |
2127 | } | 2134 | } |
2128 | 2135 | ||
2136 | static inline unsigned long node_nr_objs(struct kmem_cache_node *n) | ||
2137 | { | ||
2138 | return atomic_long_read(&n->total_objects); | ||
2139 | } | ||
2140 | #endif /* CONFIG_SLUB_DEBUG */ | ||
2141 | |||
2142 | #if defined(CONFIG_SLUB_DEBUG) || defined(CONFIG_SYSFS) | ||
2129 | static unsigned long count_partial(struct kmem_cache_node *n, | 2143 | static unsigned long count_partial(struct kmem_cache_node *n, |
2130 | int (*get_count)(struct page *)) | 2144 | int (*get_count)(struct page *)) |
2131 | { | 2145 | { |
@@ -2139,31 +2153,28 @@ static unsigned long count_partial(struct kmem_cache_node *n, | |||
2139 | spin_unlock_irqrestore(&n->list_lock, flags); | 2153 | spin_unlock_irqrestore(&n->list_lock, flags); |
2140 | return x; | 2154 | return x; |
2141 | } | 2155 | } |
2142 | 2156 | #endif /* CONFIG_SLUB_DEBUG || CONFIG_SYSFS */ | |
2143 | static inline unsigned long node_nr_objs(struct kmem_cache_node *n) | ||
2144 | { | ||
2145 | #ifdef CONFIG_SLUB_DEBUG | ||
2146 | return atomic_long_read(&n->total_objects); | ||
2147 | #else | ||
2148 | return 0; | ||
2149 | #endif | ||
2150 | } | ||
2151 | 2157 | ||
2152 | static noinline void | 2158 | static noinline void |
2153 | slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid) | 2159 | slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid) |
2154 | { | 2160 | { |
2161 | #ifdef CONFIG_SLUB_DEBUG | ||
2162 | static DEFINE_RATELIMIT_STATE(slub_oom_rs, DEFAULT_RATELIMIT_INTERVAL, | ||
2163 | DEFAULT_RATELIMIT_BURST); | ||
2155 | int node; | 2164 | int node; |
2156 | 2165 | ||
2157 | printk(KERN_WARNING | 2166 | if ((gfpflags & __GFP_NOWARN) || !__ratelimit(&slub_oom_rs)) |
2158 | "SLUB: Unable to allocate memory on node %d (gfp=0x%x)\n", | 2167 | return; |
2168 | |||
2169 | pr_warn("SLUB: Unable to allocate memory on node %d (gfp=0x%x)\n", | ||
2159 | nid, gfpflags); | 2170 | nid, gfpflags); |
2160 | printk(KERN_WARNING " cache: %s, object size: %d, buffer size: %d, " | 2171 | pr_warn(" cache: %s, object size: %d, buffer size: %d, default order: %d, min order: %d\n", |
2161 | "default order: %d, min order: %d\n", s->name, s->object_size, | 2172 | s->name, s->object_size, s->size, oo_order(s->oo), |
2162 | s->size, oo_order(s->oo), oo_order(s->min)); | 2173 | oo_order(s->min)); |
2163 | 2174 | ||
2164 | if (oo_order(s->min) > get_order(s->object_size)) | 2175 | if (oo_order(s->min) > get_order(s->object_size)) |
2165 | printk(KERN_WARNING " %s debugging increased min order, use " | 2176 | pr_warn(" %s debugging increased min order, use slub_debug=O to disable.\n", |
2166 | "slub_debug=O to disable.\n", s->name); | 2177 | s->name); |
2167 | 2178 | ||
2168 | for_each_online_node(node) { | 2179 | for_each_online_node(node) { |
2169 | struct kmem_cache_node *n = get_node(s, node); | 2180 | struct kmem_cache_node *n = get_node(s, node); |
@@ -2178,10 +2189,10 @@ slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid) | |||
2178 | nr_slabs = node_nr_slabs(n); | 2189 | nr_slabs = node_nr_slabs(n); |
2179 | nr_objs = node_nr_objs(n); | 2190 | nr_objs = node_nr_objs(n); |
2180 | 2191 | ||
2181 | printk(KERN_WARNING | 2192 | pr_warn(" node %d: slabs: %ld, objs: %ld, free: %ld\n", |
2182 | " node %d: slabs: %ld, objs: %ld, free: %ld\n", | ||
2183 | node, nr_slabs, nr_objs, nr_free); | 2193 | node, nr_slabs, nr_objs, nr_free); |
2184 | } | 2194 | } |
2195 | #endif | ||
2185 | } | 2196 | } |
2186 | 2197 | ||
2187 | static inline void *new_slab_objects(struct kmem_cache *s, gfp_t flags, | 2198 | static inline void *new_slab_objects(struct kmem_cache *s, gfp_t flags, |
@@ -2198,7 +2209,7 @@ static inline void *new_slab_objects(struct kmem_cache *s, gfp_t flags, | |||
2198 | 2209 | ||
2199 | page = new_slab(s, flags, node); | 2210 | page = new_slab(s, flags, node); |
2200 | if (page) { | 2211 | if (page) { |
2201 | c = __this_cpu_ptr(s->cpu_slab); | 2212 | c = raw_cpu_ptr(s->cpu_slab); |
2202 | if (c->page) | 2213 | if (c->page) |
2203 | flush_slab(s, c); | 2214 | flush_slab(s, c); |
2204 | 2215 | ||
@@ -2323,8 +2334,6 @@ redo: | |||
2323 | if (freelist) | 2334 | if (freelist) |
2324 | goto load_freelist; | 2335 | goto load_freelist; |
2325 | 2336 | ||
2326 | stat(s, ALLOC_SLOWPATH); | ||
2327 | |||
2328 | freelist = get_freelist(s, page); | 2337 | freelist = get_freelist(s, page); |
2329 | 2338 | ||
2330 | if (!freelist) { | 2339 | if (!freelist) { |
@@ -2360,9 +2369,7 @@ new_slab: | |||
2360 | freelist = new_slab_objects(s, gfpflags, node, &c); | 2369 | freelist = new_slab_objects(s, gfpflags, node, &c); |
2361 | 2370 | ||
2362 | if (unlikely(!freelist)) { | 2371 | if (unlikely(!freelist)) { |
2363 | if (!(gfpflags & __GFP_NOWARN) && printk_ratelimit()) | 2372 | slab_out_of_memory(s, gfpflags, node); |
2364 | slab_out_of_memory(s, gfpflags, node); | ||
2365 | |||
2366 | local_irq_restore(flags); | 2373 | local_irq_restore(flags); |
2367 | return NULL; | 2374 | return NULL; |
2368 | } | 2375 | } |
@@ -2418,7 +2425,7 @@ redo: | |||
2418 | * and the retrieval of the tid. | 2425 | * and the retrieval of the tid. |
2419 | */ | 2426 | */ |
2420 | preempt_disable(); | 2427 | preempt_disable(); |
2421 | c = __this_cpu_ptr(s->cpu_slab); | 2428 | c = this_cpu_ptr(s->cpu_slab); |
2422 | 2429 | ||
2423 | /* | 2430 | /* |
2424 | * The transaction ids are globally unique per cpu and per operation on | 2431 | * The transaction ids are globally unique per cpu and per operation on |
@@ -2431,10 +2438,10 @@ redo: | |||
2431 | 2438 | ||
2432 | object = c->freelist; | 2439 | object = c->freelist; |
2433 | page = c->page; | 2440 | page = c->page; |
2434 | if (unlikely(!object || !node_match(page, node))) | 2441 | if (unlikely(!object || !node_match(page, node))) { |
2435 | object = __slab_alloc(s, gfpflags, node, addr, c); | 2442 | object = __slab_alloc(s, gfpflags, node, addr, c); |
2436 | 2443 | stat(s, ALLOC_SLOWPATH); | |
2437 | else { | 2444 | } else { |
2438 | void *next_object = get_freepointer_safe(s, object); | 2445 | void *next_object = get_freepointer_safe(s, object); |
2439 | 2446 | ||
2440 | /* | 2447 | /* |
@@ -2674,7 +2681,7 @@ redo: | |||
2674 | * during the cmpxchg then the free will succeed. | 2681 |
2675 | */ | 2682 | */ |
2676 | preempt_disable(); | 2683 | preempt_disable(); |
2677 | c = __this_cpu_ptr(s->cpu_slab); | 2684 | c = this_cpu_ptr(s->cpu_slab); |
2678 | 2685 | ||
2679 | tid = c->tid; | 2686 | tid = c->tid; |
2680 | preempt_enable(); | 2687 | preempt_enable(); |
@@ -2894,10 +2901,8 @@ static void early_kmem_cache_node_alloc(int node) | |||
2894 | 2901 | ||
2895 | BUG_ON(!page); | 2902 | BUG_ON(!page); |
2896 | if (page_to_nid(page) != node) { | 2903 | if (page_to_nid(page) != node) { |
2897 | printk(KERN_ERR "SLUB: Unable to allocate memory from " | 2904 | pr_err("SLUB: Unable to allocate memory from node %d\n", node); |
2898 | "node %d\n", node); | 2905 | pr_err("SLUB: Allocating a useless per node structure in order to be able to continue\n"); |
2899 | printk(KERN_ERR "SLUB: Allocating a useless per node structure " | ||
2900 | "in order to be able to continue\n"); | ||
2901 | } | 2906 | } |
2902 | 2907 | ||
2903 | n = page->freelist; | 2908 | n = page->freelist; |
@@ -3182,8 +3187,7 @@ static void list_slab_objects(struct kmem_cache *s, struct page *page, | |||
3182 | for_each_object(p, s, addr, page->objects) { | 3187 | for_each_object(p, s, addr, page->objects) { |
3183 | 3188 | ||
3184 | if (!test_bit(slab_index(p, s, addr), map)) { | 3189 | if (!test_bit(slab_index(p, s, addr), map)) { |
3185 | printk(KERN_ERR "INFO: Object 0x%p @offset=%tu\n", | 3190 | pr_err("INFO: Object 0x%p @offset=%tu\n", p, p - addr); |
3186 | p, p - addr); | ||
3187 | print_tracking(s, p); | 3191 | print_tracking(s, p); |
3188 | } | 3192 | } |
3189 | } | 3193 | } |
@@ -3305,8 +3309,8 @@ static void *kmalloc_large_node(size_t size, gfp_t flags, int node) | |||
3305 | struct page *page; | 3309 | struct page *page; |
3306 | void *ptr = NULL; | 3310 | void *ptr = NULL; |
3307 | 3311 | ||
3308 | flags |= __GFP_COMP | __GFP_NOTRACK | __GFP_KMEMCG; | 3312 | flags |= __GFP_COMP | __GFP_NOTRACK; |
3309 | page = alloc_pages_node(node, flags, get_order(size)); | 3313 | page = alloc_kmem_pages_node(node, flags, get_order(size)); |
3310 | if (page) | 3314 | if (page) |
3311 | ptr = page_address(page); | 3315 | ptr = page_address(page); |
3312 | 3316 | ||
@@ -3375,7 +3379,7 @@ void kfree(const void *x) | |||
3375 | if (unlikely(!PageSlab(page))) { | 3379 | if (unlikely(!PageSlab(page))) { |
3376 | BUG_ON(!PageCompound(page)); | 3380 | BUG_ON(!PageCompound(page)); |
3377 | kfree_hook(x); | 3381 | kfree_hook(x); |
3378 | __free_memcg_kmem_pages(page, compound_order(page)); | 3382 | __free_kmem_pages(page, compound_order(page)); |
3379 | return; | 3383 | return; |
3380 | } | 3384 | } |
3381 | slab_free(page->slab_cache, page, object, _RET_IP_); | 3385 | slab_free(page->slab_cache, page, object, _RET_IP_); |
@@ -3392,7 +3396,7 @@ EXPORT_SYMBOL(kfree); | |||
3392 | * being allocated from last increasing the chance that the last objects | 3396 | * being allocated from last increasing the chance that the last objects |
3393 | * are freed in them. | 3397 | * are freed in them. |
3394 | */ | 3398 | */ |
3395 | int kmem_cache_shrink(struct kmem_cache *s) | 3399 | int __kmem_cache_shrink(struct kmem_cache *s) |
3396 | { | 3400 | { |
3397 | int node; | 3401 | int node; |
3398 | int i; | 3402 | int i; |
@@ -3448,7 +3452,6 @@ int kmem_cache_shrink(struct kmem_cache *s) | |||
3448 | kfree(slabs_by_inuse); | 3452 | kfree(slabs_by_inuse); |
3449 | return 0; | 3453 | return 0; |
3450 | } | 3454 | } |
3451 | EXPORT_SYMBOL(kmem_cache_shrink); | ||
3452 | 3455 | ||
3453 | static int slab_mem_going_offline_callback(void *arg) | 3456 | static int slab_mem_going_offline_callback(void *arg) |
3454 | { | 3457 | { |
@@ -3456,7 +3459,7 @@ static int slab_mem_going_offline_callback(void *arg) | |||
3456 | 3459 | ||
3457 | mutex_lock(&slab_mutex); | 3460 | mutex_lock(&slab_mutex); |
3458 | list_for_each_entry(s, &slab_caches, list) | 3461 | list_for_each_entry(s, &slab_caches, list) |
3459 | kmem_cache_shrink(s); | 3462 | __kmem_cache_shrink(s); |
3460 | mutex_unlock(&slab_mutex); | 3463 | mutex_unlock(&slab_mutex); |
3461 | 3464 | ||
3462 | return 0; | 3465 | return 0; |
@@ -3650,9 +3653,7 @@ void __init kmem_cache_init(void) | |||
3650 | register_cpu_notifier(&slab_notifier); | 3653 | register_cpu_notifier(&slab_notifier); |
3651 | #endif | 3654 | #endif |
3652 | 3655 | ||
3653 | printk(KERN_INFO | 3656 | pr_info("SLUB: HWalign=%d, Order=%d-%d, MinObjects=%d, CPUs=%d, Nodes=%d\n", |
3654 | "SLUB: HWalign=%d, Order=%d-%d, MinObjects=%d," | ||
3655 | " CPUs=%d, Nodes=%d\n", | ||
3656 | cache_line_size(), | 3657 | cache_line_size(), |
3657 | slub_min_order, slub_max_order, slub_min_objects, | 3658 | slub_min_order, slub_max_order, slub_min_objects, |
3658 | nr_cpu_ids, nr_node_ids); | 3659 | nr_cpu_ids, nr_node_ids); |
@@ -3934,8 +3935,8 @@ static int validate_slab_node(struct kmem_cache *s, | |||
3934 | count++; | 3935 | count++; |
3935 | } | 3936 | } |
3936 | if (count != n->nr_partial) | 3937 | if (count != n->nr_partial) |
3937 | printk(KERN_ERR "SLUB %s: %ld partial slabs counted but " | 3938 | pr_err("SLUB %s: %ld partial slabs counted but counter=%ld\n", |
3938 | "counter=%ld\n", s->name, count, n->nr_partial); | 3939 | s->name, count, n->nr_partial); |
3939 | 3940 | ||
3940 | if (!(s->flags & SLAB_STORE_USER)) | 3941 | if (!(s->flags & SLAB_STORE_USER)) |
3941 | goto out; | 3942 | goto out; |
@@ -3945,9 +3946,8 @@ static int validate_slab_node(struct kmem_cache *s, | |||
3945 | count++; | 3946 | count++; |
3946 | } | 3947 | } |
3947 | if (count != atomic_long_read(&n->nr_slabs)) | 3948 | if (count != atomic_long_read(&n->nr_slabs)) |
3948 | printk(KERN_ERR "SLUB: %s %ld slabs counted but " | 3949 | pr_err("SLUB: %s %ld slabs counted but counter=%ld\n", |
3949 | "counter=%ld\n", s->name, count, | 3950 | s->name, count, atomic_long_read(&n->nr_slabs)); |
3950 | atomic_long_read(&n->nr_slabs)); | ||
3951 | 3951 | ||
3952 | out: | 3952 | out: |
3953 | spin_unlock_irqrestore(&n->list_lock, flags); | 3953 | spin_unlock_irqrestore(&n->list_lock, flags); |
@@ -4211,53 +4211,50 @@ static void resiliency_test(void) | |||
4211 | 4211 | ||
4212 | BUILD_BUG_ON(KMALLOC_MIN_SIZE > 16 || KMALLOC_SHIFT_HIGH < 10); | 4212 | BUILD_BUG_ON(KMALLOC_MIN_SIZE > 16 || KMALLOC_SHIFT_HIGH < 10); |
4213 | 4213 | ||
4214 | printk(KERN_ERR "SLUB resiliency testing\n"); | 4214 | pr_err("SLUB resiliency testing\n"); |
4215 | printk(KERN_ERR "-----------------------\n"); | 4215 | pr_err("-----------------------\n"); |
4216 | printk(KERN_ERR "A. Corruption after allocation\n"); | 4216 | pr_err("A. Corruption after allocation\n"); |
4217 | 4217 | ||
4218 | p = kzalloc(16, GFP_KERNEL); | 4218 | p = kzalloc(16, GFP_KERNEL); |
4219 | p[16] = 0x12; | 4219 | p[16] = 0x12; |
4220 | printk(KERN_ERR "\n1. kmalloc-16: Clobber Redzone/next pointer" | 4220 | pr_err("\n1. kmalloc-16: Clobber Redzone/next pointer 0x12->0x%p\n\n", |
4221 | " 0x12->0x%p\n\n", p + 16); | 4221 | p + 16); |
4222 | 4222 | ||
4223 | validate_slab_cache(kmalloc_caches[4]); | 4223 | validate_slab_cache(kmalloc_caches[4]); |
4224 | 4224 | ||
4225 | /* Hmmm... The next two are dangerous */ | 4225 | /* Hmmm... The next two are dangerous */ |
4226 | p = kzalloc(32, GFP_KERNEL); | 4226 | p = kzalloc(32, GFP_KERNEL); |
4227 | p[32 + sizeof(void *)] = 0x34; | 4227 | p[32 + sizeof(void *)] = 0x34; |
4228 | printk(KERN_ERR "\n2. kmalloc-32: Clobber next pointer/next slab" | 4228 | pr_err("\n2. kmalloc-32: Clobber next pointer/next slab 0x34 -> -0x%p\n", |
4229 | " 0x34 -> -0x%p\n", p); | 4229 | p); |
4230 | printk(KERN_ERR | 4230 | pr_err("If allocated object is overwritten then not detectable\n\n"); |
4231 | "If allocated object is overwritten then not detectable\n\n"); | ||
4232 | 4231 | ||
4233 | validate_slab_cache(kmalloc_caches[5]); | 4232 | validate_slab_cache(kmalloc_caches[5]); |
4234 | p = kzalloc(64, GFP_KERNEL); | 4233 | p = kzalloc(64, GFP_KERNEL); |
4235 | p += 64 + (get_cycles() & 0xff) * sizeof(void *); | 4234 | p += 64 + (get_cycles() & 0xff) * sizeof(void *); |
4236 | *p = 0x56; | 4235 | *p = 0x56; |
4237 | printk(KERN_ERR "\n3. kmalloc-64: corrupting random byte 0x56->0x%p\n", | 4236 | pr_err("\n3. kmalloc-64: corrupting random byte 0x56->0x%p\n", |
4238 | p); | 4237 | p); |
4239 | printk(KERN_ERR | 4238 | pr_err("If allocated object is overwritten then not detectable\n\n"); |
4240 | "If allocated object is overwritten then not detectable\n\n"); | ||
4241 | validate_slab_cache(kmalloc_caches[6]); | 4239 | validate_slab_cache(kmalloc_caches[6]); |
4242 | 4240 | ||
4243 | printk(KERN_ERR "\nB. Corruption after free\n"); | 4241 | pr_err("\nB. Corruption after free\n"); |
4244 | p = kzalloc(128, GFP_KERNEL); | 4242 | p = kzalloc(128, GFP_KERNEL); |
4245 | kfree(p); | 4243 | kfree(p); |
4246 | *p = 0x78; | 4244 | *p = 0x78; |
4247 | printk(KERN_ERR "1. kmalloc-128: Clobber first word 0x78->0x%p\n\n", p); | 4245 | pr_err("1. kmalloc-128: Clobber first word 0x78->0x%p\n\n", p); |
4248 | validate_slab_cache(kmalloc_caches[7]); | 4246 | validate_slab_cache(kmalloc_caches[7]); |
4249 | 4247 | ||
4250 | p = kzalloc(256, GFP_KERNEL); | 4248 | p = kzalloc(256, GFP_KERNEL); |
4251 | kfree(p); | 4249 | kfree(p); |
4252 | p[50] = 0x9a; | 4250 | p[50] = 0x9a; |
4253 | printk(KERN_ERR "\n2. kmalloc-256: Clobber 50th byte 0x9a->0x%p\n\n", | 4251 | pr_err("\n2. kmalloc-256: Clobber 50th byte 0x9a->0x%p\n\n", p); |
4254 | p); | ||
4255 | validate_slab_cache(kmalloc_caches[8]); | 4252 | validate_slab_cache(kmalloc_caches[8]); |
4256 | 4253 | ||
4257 | p = kzalloc(512, GFP_KERNEL); | 4254 | p = kzalloc(512, GFP_KERNEL); |
4258 | kfree(p); | 4255 | kfree(p); |
4259 | p[512] = 0xab; | 4256 | p[512] = 0xab; |
4260 | printk(KERN_ERR "\n3. kmalloc-512: Clobber redzone 0xab->0x%p\n\n", p); | 4257 | pr_err("\n3. kmalloc-512: Clobber redzone 0xab->0x%p\n\n", p); |
4261 | validate_slab_cache(kmalloc_caches[9]); | 4258 | validate_slab_cache(kmalloc_caches[9]); |
4262 | } | 4259 | } |
4263 | #else | 4260 | #else |
@@ -4332,7 +4329,7 @@ static ssize_t show_slab_objects(struct kmem_cache *s, | |||
4332 | } | 4329 | } |
4333 | } | 4330 | } |
4334 | 4331 | ||
4335 | lock_memory_hotplug(); | 4332 | get_online_mems(); |
4336 | #ifdef CONFIG_SLUB_DEBUG | 4333 | #ifdef CONFIG_SLUB_DEBUG |
4337 | if (flags & SO_ALL) { | 4334 | if (flags & SO_ALL) { |
4338 | for_each_node_state(node, N_NORMAL_MEMORY) { | 4335 | for_each_node_state(node, N_NORMAL_MEMORY) { |
@@ -4372,7 +4369,7 @@ static ssize_t show_slab_objects(struct kmem_cache *s, | |||
4372 | x += sprintf(buf + x, " N%d=%lu", | 4369 | x += sprintf(buf + x, " N%d=%lu", |
4373 | node, nodes[node]); | 4370 | node, nodes[node]); |
4374 | #endif | 4371 | #endif |
4375 | unlock_memory_hotplug(); | 4372 | put_online_mems(); |
4376 | kfree(nodes); | 4373 | kfree(nodes); |
4377 | return x + sprintf(buf + x, "\n"); | 4374 | return x + sprintf(buf + x, "\n"); |
4378 | } | 4375 | } |
@@ -5303,7 +5300,7 @@ static int __init slab_sysfs_init(void) | |||
5303 | slab_kset = kset_create_and_add("slab", &slab_uevent_ops, kernel_kobj); | 5300 | slab_kset = kset_create_and_add("slab", &slab_uevent_ops, kernel_kobj); |
5304 | if (!slab_kset) { | 5301 | if (!slab_kset) { |
5305 | mutex_unlock(&slab_mutex); | 5302 | mutex_unlock(&slab_mutex); |
5306 | printk(KERN_ERR "Cannot register slab subsystem.\n"); | 5303 | pr_err("Cannot register slab subsystem.\n"); |
5307 | return -ENOSYS; | 5304 | return -ENOSYS; |
5308 | } | 5305 | } |
5309 | 5306 | ||
@@ -5312,8 +5309,8 @@ static int __init slab_sysfs_init(void) | |||
5312 | list_for_each_entry(s, &slab_caches, list) { | 5309 | list_for_each_entry(s, &slab_caches, list) { |
5313 | err = sysfs_slab_add(s); | 5310 | err = sysfs_slab_add(s); |
5314 | if (err) | 5311 | if (err) |
5315 | printk(KERN_ERR "SLUB: Unable to add boot slab %s" | 5312 | pr_err("SLUB: Unable to add boot slab %s to sysfs\n", |
5316 | " to sysfs\n", s->name); | 5313 | s->name); |
5317 | } | 5314 | } |
5318 | 5315 | ||
5319 | while (alias_list) { | 5316 | while (alias_list) { |
@@ -5322,8 +5319,8 @@ static int __init slab_sysfs_init(void) | |||
5322 | alias_list = alias_list->next; | 5319 | alias_list = alias_list->next; |
5323 | err = sysfs_slab_alias(al->s, al->name); | 5320 | err = sysfs_slab_alias(al->s, al->name); |
5324 | if (err) | 5321 | if (err) |
5325 | printk(KERN_ERR "SLUB: Unable to add boot slab alias" | 5322 | pr_err("SLUB: Unable to add boot slab alias %s to sysfs\n", |
5326 | " %s to sysfs\n", al->name); | 5323 | al->name); |
5327 | kfree(al); | 5324 | kfree(al); |
5328 | } | 5325 | } |
5329 | 5326 | ||
@@ -67,7 +67,7 @@ static void __page_cache_release(struct page *page) | |||
67 | static void __put_single_page(struct page *page) | 67 | static void __put_single_page(struct page *page) |
68 | { | 68 | { |
69 | __page_cache_release(page); | 69 | __page_cache_release(page); |
70 | free_hot_cold_page(page, 0); | 70 | free_hot_cold_page(page, false); |
71 | } | 71 | } |
72 | 72 | ||
73 | static void __put_compound_page(struct page *page) | 73 | static void __put_compound_page(struct page *page) |
@@ -79,95 +79,88 @@ static void __put_compound_page(struct page *page) | |||
79 | (*dtor)(page); | 79 | (*dtor)(page); |
80 | } | 80 | } |
81 | 81 | ||
82 | static void put_compound_page(struct page *page) | 82 | /** |
83 | * Two special cases here: we could avoid taking compound_lock_irqsave | ||
84 | * and could skip the tail refcounting (in _mapcount). | ||
85 | * | ||
86 | * 1. Hugetlbfs page: | ||
87 | * | ||
88 | * PageHeadHuge will remain true until the compound page | ||
89 | * is released and enters the buddy allocator, and it cannot | ||
90 | * be split by __split_huge_page_refcount(). | ||
91 | * | ||
92 | * So if we see PageHeadHuge set, and we have the tail page pin, | ||
93 | * then we can safely put the head page. | ||
94 | * | ||
95 | * 2. Slab THP page: | ||
96 | * | ||
97 | * PG_slab is cleared before the slab frees the head page, and | ||
98 | * tail pin cannot be the last reference left on the head page, | ||
99 | * because the slab code is free to reuse the compound page | ||
100 | * after a kfree/kmem_cache_free without having to check if | ||
101 | * there's any tail pin left. In turn all tail pins must always be | ||
102 | * released while the head is still pinned by the slab code | ||
103 | * and so we know PG_slab will still be set too. | ||
104 | * | ||
105 | * So if we see PageSlab set, and we have the tail page pin, | ||
106 | * then we can safely put the head page. | ||
107 | */ | ||
108 | static __always_inline | ||
109 | void put_unrefcounted_compound_page(struct page *page_head, struct page *page) | ||
83 | { | 110 | { |
84 | struct page *page_head; | ||
85 | |||
86 | if (likely(!PageTail(page))) { | ||
87 | if (put_page_testzero(page)) { | ||
88 | /* | ||
89 | * By the time all refcounts have been released | ||
90 | * split_huge_page cannot run anymore from under us. | ||
91 | */ | ||
92 | if (PageHead(page)) | ||
93 | __put_compound_page(page); | ||
94 | else | ||
95 | __put_single_page(page); | ||
96 | } | ||
97 | return; | ||
98 | } | ||
99 | |||
100 | /* __split_huge_page_refcount can run under us */ | ||
101 | page_head = compound_head(page); | ||
102 | |||
103 | /* | 111 | /* |
104 | * THP can not break up slab pages so avoid taking | 112 | * If @page is a THP tail, we must read the tail page |
105 | * compound_lock() and skip the tail page refcounting (in | 113 | * flags after the head page flags. The |
106 | * _mapcount) too. Slab performs non-atomic bit ops on | 114 | * __split_huge_page_refcount side enforces write memory barriers |
107 | * page->flags for better performance. In particular | 115 | * between clearing PageTail and before the head page |
108 | * slab_unlock() in slub used to be a hot path. It is still | 116 | * can be freed and reallocated. |
109 | * hot on arches that do not support | ||
110 | * this_cpu_cmpxchg_double(). | ||
111 | * | ||
112 | * If "page" is part of a slab or hugetlbfs page it cannot be | ||
113 | * split and the head page cannot change from under us. And | ||
114 | * if "page" is part of a THP page under splitting, if the | ||
115 | * head page pointed by the THP tail isn't a THP head anymore, | ||
116 | * we'll find PageTail clear after smp_rmb() and we'll treat | ||
117 | * it as a single page. | ||
118 | */ | 117 | */ |
119 | if (!__compound_tail_refcounted(page_head)) { | 118 | smp_rmb(); |
119 | if (likely(PageTail(page))) { | ||
120 | /* | 120 | /* |
121 | * If "page" is a THP tail, we must read the tail page | 121 | * __split_huge_page_refcount cannot race |
122 | * flags after the head page flags. The | 122 | * here, see the comment above this function. |
123 | * split_huge_page side enforces write memory barriers | ||
124 | * between clearing PageTail and before the head page | ||
125 | * can be freed and reallocated. | ||
126 | */ | 123 | */ |
127 | smp_rmb(); | 124 | VM_BUG_ON_PAGE(!PageHead(page_head), page_head); |
128 | if (likely(PageTail(page))) { | 125 | VM_BUG_ON_PAGE(page_mapcount(page) != 0, page); |
129 | /* | 126 | if (put_page_testzero(page_head)) { |
130 | * __split_huge_page_refcount cannot race | ||
131 | * here. | ||
132 | */ | ||
133 | VM_BUG_ON_PAGE(!PageHead(page_head), page_head); | ||
134 | VM_BUG_ON_PAGE(page_mapcount(page) != 0, page); | ||
135 | if (put_page_testzero(page_head)) { | ||
136 | /* | ||
137 | * If this is the tail of a slab | ||
138 | * compound page, the tail pin must | ||
139 | * not be the last reference held on | ||
140 | * the page, because the PG_slab | ||
141 | * cannot be cleared before all tail | ||
142 | * pins (which skips the _mapcount | ||
143 | * tail refcounting) have been | ||
144 | * released. For hugetlbfs the tail | ||
145 | * pin may be the last reference on | ||
146 | * the page instead, because | ||
147 | * PageHeadHuge will not go away until | ||
148 | * the compound page enters the buddy | ||
149 | * allocator. | ||
150 | */ | ||
151 | VM_BUG_ON_PAGE(PageSlab(page_head), page_head); | ||
152 | __put_compound_page(page_head); | ||
153 | } | ||
154 | return; | ||
155 | } else | ||
156 | /* | 127 | /* |
157 | * __split_huge_page_refcount run before us, | 128 | * If this is the tail of a slab THP page, |
158 | * "page" was a THP tail. The split page_head | 129 | * the tail pin must not be the last reference |
159 | * has been freed and reallocated as slab or | 130 | * held on the page, because the PG_slab cannot |
160 | * hugetlbfs page of smaller order (only | 131 | * be cleared before all tail pins (which skips |
161 | * possible if reallocated as slab on x86). | 132 | * the _mapcount tail refcounting) have been |
133 | * released. | ||
134 | * | ||
135 | * If this is the tail of a hugetlbfs page, | ||
136 | * the tail pin may be the last reference on | ||
137 | * the page instead, because PageHeadHuge will | ||
138 | * not go away until the compound page enters | ||
139 | * the buddy allocator. | ||
162 | */ | 140 | */ |
163 | goto out_put_single; | 141 | VM_BUG_ON_PAGE(PageSlab(page_head), page_head); |
164 | } | 142 | __put_compound_page(page_head); |
143 | } | ||
144 | } else | ||
145 | /* | ||
146 | * __split_huge_page_refcount run before us, | ||
147 | * @page was a THP tail. The split @page_head | ||
148 | * has been freed and reallocated as slab or | ||
149 | * hugetlbfs page of smaller order (only | ||
150 | * possible if reallocated as slab on x86). | ||
151 | */ | ||
152 | if (put_page_testzero(page)) | ||
153 | __put_single_page(page); | ||
154 | } | ||
165 | 155 | ||
156 | static __always_inline | ||
157 | void put_refcounted_compound_page(struct page *page_head, struct page *page) | ||
158 | { | ||
166 | if (likely(page != page_head && get_page_unless_zero(page_head))) { | 159 | if (likely(page != page_head && get_page_unless_zero(page_head))) { |
167 | unsigned long flags; | 160 | unsigned long flags; |
168 | 161 | ||
169 | /* | 162 | /* |
170 | * page_head wasn't a dangling pointer but it may not | 163 | * @page_head wasn't a dangling pointer but it may not |
171 | * be a head page anymore by the time we obtain the | 164 | * be a head page anymore by the time we obtain the |
172 | * lock. That is ok as long as it can't be freed from | 165 | * lock. That is ok as long as it can't be freed from |
173 | * under us. | 166 | * under us. |
@@ -178,7 +171,7 @@ static void put_compound_page(struct page *page) | |||
178 | compound_unlock_irqrestore(page_head, flags); | 171 | compound_unlock_irqrestore(page_head, flags); |
179 | if (put_page_testzero(page_head)) { | 172 | if (put_page_testzero(page_head)) { |
180 | /* | 173 | /* |
181 | * The head page may have been freed | 174 | * The @page_head may have been freed |
182 | * and reallocated as a compound page | 175 | * and reallocated as a compound page |
183 | * of smaller order and then freed | 176 | * of smaller order and then freed |
184 | * again. All we know is that it | 177 | * again. All we know is that it |
@@ -222,12 +215,51 @@ out_put_single: | |||
222 | __put_single_page(page_head); | 215 | __put_single_page(page_head); |
223 | } | 216 | } |
224 | } else { | 217 | } else { |
225 | /* page_head is a dangling pointer */ | 218 | /* @page_head is a dangling pointer */ |
226 | VM_BUG_ON_PAGE(PageTail(page), page); | 219 | VM_BUG_ON_PAGE(PageTail(page), page); |
227 | goto out_put_single; | 220 | goto out_put_single; |
228 | } | 221 | } |
229 | } | 222 | } |
230 | 223 | ||
224 | static void put_compound_page(struct page *page) | ||
225 | { | ||
226 | struct page *page_head; | ||
227 | |||
228 | /* | ||
229 | * We see the PageCompound set and PageTail not set, so @page maybe: | ||
230 | * 1. hugetlbfs head page, or | ||
231 | * 2. THP head page. | ||
232 | */ | ||
233 | if (likely(!PageTail(page))) { | ||
234 | if (put_page_testzero(page)) { | ||
235 | /* | ||
236 | * By the time all refcounts have been released | ||
237 | * split_huge_page cannot run anymore from under us. | ||
238 | */ | ||
239 | if (PageHead(page)) | ||
240 | __put_compound_page(page); | ||
241 | else | ||
242 | __put_single_page(page); | ||
243 | } | ||
244 | return; | ||
245 | } | ||
246 | |||
247 | /* | ||
248 | * We see the PageCompound set and PageTail set, so @page maybe: | ||
249 | * 1. a tail hugetlbfs page, or | ||
250 | * 2. a tail THP page, or | ||
251 | * 3. a split THP page. | ||
252 | * | ||
253 | * Case 3 is possible, as we may race with | ||
254 | * __split_huge_page_refcount tearing down a THP page. | ||
255 | */ | ||
256 | page_head = compound_head_by_tail(page); | ||
257 | if (!__compound_tail_refcounted(page_head)) | ||
258 | put_unrefcounted_compound_page(page_head, page); | ||
259 | else | ||
260 | put_refcounted_compound_page(page_head, page); | ||
261 | } | ||
262 | |||
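The dispatch above hinges on __compound_tail_refcounted(). As a rough mental model only (a simplified sketch, not the exact include/linux/mm.h definition), the head-page test the two helpers key off looks like this:

/* Simplified sketch: slab and hugetlbfs compound pages do not use the
 * _mapcount tail refcounting; everything else (i.e. anon THP) does. */
static inline bool tail_refcounting_used(struct page *page_head)
{
	return !PageSlab(page_head) && !PageHeadHuge(page_head);
}

Only the refcounted path then needs compound_lock_irqsave(), which keeps tail puts on hugetlbfs and slab compound pages cheap.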
231 | void put_page(struct page *page) | 263 | void put_page(struct page *page) |
232 | { | 264 | { |
233 | if (unlikely(PageCompound(page))) | 265 | if (unlikely(PageCompound(page))) |
@@ -441,7 +473,7 @@ void rotate_reclaimable_page(struct page *page) | |||
441 | 473 | ||
442 | page_cache_get(page); | 474 | page_cache_get(page); |
443 | local_irq_save(flags); | 475 | local_irq_save(flags); |
444 | pvec = &__get_cpu_var(lru_rotate_pvecs); | 476 | pvec = this_cpu_ptr(&lru_rotate_pvecs); |
445 | if (!pagevec_add(pvec, page)) | 477 | if (!pagevec_add(pvec, page)) |
446 | pagevec_move_tail(pvec); | 478 | pagevec_move_tail(pvec); |
447 | local_irq_restore(flags); | 479 | local_irq_restore(flags); |
@@ -583,12 +615,17 @@ void mark_page_accessed(struct page *page) | |||
583 | EXPORT_SYMBOL(mark_page_accessed); | 615 | EXPORT_SYMBOL(mark_page_accessed); |
584 | 616 | ||
585 | /* | 617 | /* |
586 | * Queue the page for addition to the LRU via pagevec. The decision on whether | 618 | * Used to mark_page_accessed(page) on a page that is not yet visible and |
587 | * to add the page to the [in]active [file|anon] list is deferred until the | 619 | * while it is still safe to use non-atomic ops |
588 | * pagevec is drained. This gives a chance for the caller of __lru_cache_add() | ||
589 | * have the page added to the active list using mark_page_accessed(). | ||
590 | */ | 620 | */ |
591 | void __lru_cache_add(struct page *page) | 621 | void init_page_accessed(struct page *page) |
622 | { | ||
623 | if (!PageReferenced(page)) | ||
624 | __SetPageReferenced(page); | ||
625 | } | ||
626 | EXPORT_SYMBOL(init_page_accessed); | ||
627 | |||
628 | static void __lru_cache_add(struct page *page) | ||
592 | { | 629 | { |
593 | struct pagevec *pvec = &get_cpu_var(lru_add_pvec); | 630 | struct pagevec *pvec = &get_cpu_var(lru_add_pvec); |
594 | 631 | ||
@@ -598,11 +635,34 @@ void __lru_cache_add(struct page *page) | |||
598 | pagevec_add(pvec, page); | 635 | pagevec_add(pvec, page); |
599 | put_cpu_var(lru_add_pvec); | 636 | put_cpu_var(lru_add_pvec); |
600 | } | 637 | } |
601 | EXPORT_SYMBOL(__lru_cache_add); | 638 | |
639 | /** | ||
640 | * lru_cache_add_anon - add a page to the page lists | ||
641 | * @page: the page to add | ||
642 | */ | ||
643 | void lru_cache_add_anon(struct page *page) | ||
644 | { | ||
645 | if (PageActive(page)) | ||
646 | ClearPageActive(page); | ||
647 | __lru_cache_add(page); | ||
648 | } | ||
649 | |||
650 | void lru_cache_add_file(struct page *page) | ||
651 | { | ||
652 | if (PageActive(page)) | ||
653 | ClearPageActive(page); | ||
654 | __lru_cache_add(page); | ||
655 | } | ||
656 | EXPORT_SYMBOL(lru_cache_add_file); | ||
602 | 657 | ||
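A hypothetical caller of the new helpers might look like the sketch below. The function name and call site are illustrative only (the real users are page-cache allocation paths), but it shows the intended pairing of the non-atomic referenced bit with the pagevec-based LRU add:

/* Illustrative only: a freshly allocated page, not yet visible to other
 * CPUs, is marked referenced without atomics and then queued for the LRU;
 * the pagevec is drained to the real LRU lists later. */
static void example_add_new_page(struct page *page)
{
	init_page_accessed(page);	/* non-atomic __SetPageReferenced() */
	lru_cache_add_file(page);	/* deferred LRU insertion via pagevec */
}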
603 | /** | 658 | /** |
604 | * lru_cache_add - add a page to a page list | 659 | * lru_cache_add - add a page to a page list |
605 | * @page: the page to be added to the LRU. | 660 | * @page: the page to be added to the LRU. |
661 | * | ||
662 | * Queue the page for addition to the LRU via pagevec. The decision on whether | ||
663 | * to add the page to the [in]active [file|anon] list is deferred until the | ||
664 | * pagevec is drained. This gives a chance for the caller of lru_cache_add() | ||
665 | * to have the page added to the active list using mark_page_accessed(). | ||
606 | */ | 666 | */ |
607 | void lru_cache_add(struct page *page) | 667 | void lru_cache_add(struct page *page) |
608 | { | 668 | { |
@@ -813,7 +873,7 @@ void lru_add_drain_all(void) | |||
813 | * grabbed the page via the LRU. If it did, give up: shrink_inactive_list() | 873 | * grabbed the page via the LRU. If it did, give up: shrink_inactive_list() |
814 | * will free it. | 874 | * will free it. |
815 | */ | 875 | */ |
816 | void release_pages(struct page **pages, int nr, int cold) | 876 | void release_pages(struct page **pages, int nr, bool cold) |
817 | { | 877 | { |
818 | int i; | 878 | int i; |
819 | LIST_HEAD(pages_to_free); | 879 | LIST_HEAD(pages_to_free); |
@@ -854,7 +914,7 @@ void release_pages(struct page **pages, int nr, int cold) | |||
854 | } | 914 | } |
855 | 915 | ||
856 | /* Clear Active bit in case of parallel mark_page_accessed */ | 916 | /* Clear Active bit in case of parallel mark_page_accessed */ |
857 | ClearPageActive(page); | 917 | __ClearPageActive(page); |
858 | 918 | ||
859 | list_add(&page->lru, &pages_to_free); | 919 | list_add(&page->lru, &pages_to_free); |
860 | } | 920 | } |
diff --git a/mm/swap_state.c b/mm/swap_state.c index e76ace30d436..2972eee184a4 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c | |||
@@ -270,7 +270,7 @@ void free_pages_and_swap_cache(struct page **pages, int nr) | |||
270 | 270 | ||
271 | for (i = 0; i < todo; i++) | 271 | for (i = 0; i < todo; i++) |
272 | free_swap_cache(pagep[i]); | 272 | free_swap_cache(pagep[i]); |
273 | release_pages(pagep, todo, 0); | 273 | release_pages(pagep, todo, false); |
274 | pagep += todo; | 274 | pagep += todo; |
275 | nr -= todo; | 275 | nr -= todo; |
276 | } | 276 | } |
diff --git a/mm/swapfile.c b/mm/swapfile.c index 4a7f7e6992b6..4c524f7bd0bf 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c | |||
@@ -51,14 +51,32 @@ atomic_long_t nr_swap_pages; | |||
51 | /* protected with swap_lock. reading in vm_swap_full() doesn't need lock */ | 51 | /* protected with swap_lock. reading in vm_swap_full() doesn't need lock */ |
52 | long total_swap_pages; | 52 | long total_swap_pages; |
53 | static int least_priority; | 53 | static int least_priority; |
54 | static atomic_t highest_priority_index = ATOMIC_INIT(-1); | ||
55 | 54 | ||
56 | static const char Bad_file[] = "Bad swap file entry "; | 55 | static const char Bad_file[] = "Bad swap file entry "; |
57 | static const char Unused_file[] = "Unused swap file entry "; | 56 | static const char Unused_file[] = "Unused swap file entry "; |
58 | static const char Bad_offset[] = "Bad swap offset entry "; | 57 | static const char Bad_offset[] = "Bad swap offset entry "; |
59 | static const char Unused_offset[] = "Unused swap offset entry "; | 58 | static const char Unused_offset[] = "Unused swap offset entry "; |
60 | 59 | ||
61 | struct swap_list_t swap_list = {-1, -1}; | 60 | /* |
61 | * all active swap_info_structs | ||
62 | * protected with swap_lock, and ordered by priority. | ||
63 | */ | ||
64 | PLIST_HEAD(swap_active_head); | ||
65 | |||
66 | /* | ||
67 | * all available (active, not full) swap_info_structs | ||
68 | * protected with swap_avail_lock, ordered by priority. | ||
69 | * This is used by get_swap_page() instead of swap_active_head | ||
70 | * because swap_active_head includes all swap_info_structs, | ||
71 | * but get_swap_page() doesn't need to look at full ones. | ||
72 | * This uses its own lock instead of swap_lock because when a | ||
73 | * swap_info_struct changes between not-full/full, it needs to | ||
74 | * add/remove itself to/from this list, but the swap_info_struct->lock | ||
75 | * is held and the locking order requires swap_lock to be taken | ||
76 | * before any swap_info_struct->lock. | ||
77 | */ | ||
78 | static PLIST_HEAD(swap_avail_head); | ||
79 | static DEFINE_SPINLOCK(swap_avail_lock); | ||
62 | 80 | ||
63 | struct swap_info_struct *swap_info[MAX_SWAPFILES]; | 81 | struct swap_info_struct *swap_info[MAX_SWAPFILES]; |
64 | 82 | ||
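To make the intended life cycle of the two lists explicit, the state transitions (as assumed from the code in this patch) can be summarised in a short sketch:

/*
 * Assumed life cycle of a swap_info_struct on the two plists:
 *
 *	swapon			plist_add() to swap_active_head and to
 *				swap_avail_head (under swap_lock and
 *				swap_avail_lock respectively)
 *	device becomes full	plist_del() from swap_avail_head only
 *	entry freed on a	plist_add() back onto swap_avail_head
 *	previously full device
 *	swapoff			plist_del() from both lists
 */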
@@ -505,13 +523,10 @@ static unsigned long scan_swap_map(struct swap_info_struct *si, | |||
505 | /* | 523 | /* |
506 | * If seek is expensive, start searching for new cluster from | 524 | * If seek is expensive, start searching for new cluster from |
507 | * start of partition, to minimize the span of allocated swap. | 525 | * start of partition, to minimize the span of allocated swap. |
508 | * But if seek is cheap, search from our current position, so | 526 | * If seek is cheap, that is the SWP_SOLIDSTATE si->cluster_info |
509 | * that swap is allocated from all over the partition: if the | 527 | * case, just handled by scan_swap_map_try_ssd_cluster() above. |
510 | * Flash Translation Layer only remaps within limited zones, | ||
511 | * we don't want to wear out the first zone too quickly. | ||
512 | */ | 528 | */ |
513 | if (!(si->flags & SWP_SOLIDSTATE)) | 529 | scan_base = offset = si->lowest_bit; |
514 | scan_base = offset = si->lowest_bit; | ||
515 | last_in_cluster = offset + SWAPFILE_CLUSTER - 1; | 530 | last_in_cluster = offset + SWAPFILE_CLUSTER - 1; |
516 | 531 | ||
517 | /* Locate the first empty (unaligned) cluster */ | 532 | /* Locate the first empty (unaligned) cluster */ |
@@ -531,26 +546,6 @@ static unsigned long scan_swap_map(struct swap_info_struct *si, | |||
531 | } | 546 | } |
532 | } | 547 | } |
533 | 548 | ||
534 | offset = si->lowest_bit; | ||
535 | last_in_cluster = offset + SWAPFILE_CLUSTER - 1; | ||
536 | |||
537 | /* Locate the first empty (unaligned) cluster */ | ||
538 | for (; last_in_cluster < scan_base; offset++) { | ||
539 | if (si->swap_map[offset]) | ||
540 | last_in_cluster = offset + SWAPFILE_CLUSTER; | ||
541 | else if (offset == last_in_cluster) { | ||
542 | spin_lock(&si->lock); | ||
543 | offset -= SWAPFILE_CLUSTER - 1; | ||
544 | si->cluster_next = offset; | ||
545 | si->cluster_nr = SWAPFILE_CLUSTER - 1; | ||
546 | goto checks; | ||
547 | } | ||
548 | if (unlikely(--latency_ration < 0)) { | ||
549 | cond_resched(); | ||
550 | latency_ration = LATENCY_LIMIT; | ||
551 | } | ||
552 | } | ||
553 | |||
554 | offset = scan_base; | 549 | offset = scan_base; |
555 | spin_lock(&si->lock); | 550 | spin_lock(&si->lock); |
556 | si->cluster_nr = SWAPFILE_CLUSTER - 1; | 551 | si->cluster_nr = SWAPFILE_CLUSTER - 1; |
@@ -591,6 +586,9 @@ checks: | |||
591 | if (si->inuse_pages == si->pages) { | 586 | if (si->inuse_pages == si->pages) { |
592 | si->lowest_bit = si->max; | 587 | si->lowest_bit = si->max; |
593 | si->highest_bit = 0; | 588 | si->highest_bit = 0; |
589 | spin_lock(&swap_avail_lock); | ||
590 | plist_del(&si->avail_list, &swap_avail_head); | ||
591 | spin_unlock(&swap_avail_lock); | ||
594 | } | 592 | } |
595 | si->swap_map[offset] = usage; | 593 | si->swap_map[offset] = usage; |
596 | inc_cluster_info_page(si, si->cluster_info, offset); | 594 | inc_cluster_info_page(si, si->cluster_info, offset); |
@@ -640,71 +638,65 @@ no_page: | |||
640 | 638 | ||
641 | swp_entry_t get_swap_page(void) | 639 | swp_entry_t get_swap_page(void) |
642 | { | 640 | { |
643 | struct swap_info_struct *si; | 641 | struct swap_info_struct *si, *next; |
644 | pgoff_t offset; | 642 | pgoff_t offset; |
645 | int type, next; | ||
646 | int wrapped = 0; | ||
647 | int hp_index; | ||
648 | 643 | ||
649 | spin_lock(&swap_lock); | ||
650 | if (atomic_long_read(&nr_swap_pages) <= 0) | 644 | if (atomic_long_read(&nr_swap_pages) <= 0) |
651 | goto noswap; | 645 | goto noswap; |
652 | atomic_long_dec(&nr_swap_pages); | 646 | atomic_long_dec(&nr_swap_pages); |
653 | 647 | ||
654 | for (type = swap_list.next; type >= 0 && wrapped < 2; type = next) { | 648 | spin_lock(&swap_avail_lock); |
655 | hp_index = atomic_xchg(&highest_priority_index, -1); | ||
656 | /* | ||
657 | * highest_priority_index records current highest priority swap | ||
658 | * type which just frees swap entries. If its priority is | ||
659 | * higher than that of swap_list.next swap type, we use it. It | ||
660 | * isn't protected by swap_lock, so it can be an invalid value | ||
661 | * if the corresponding swap type is swapoff. We double check | ||
662 | * the flags here. It's even possible the swap type is swapoff | ||
663 | * and swapon again and its priority is changed. In such rare | ||
664 | * case, a low priority swap type might be used, but eventually | ||
665 | * high priority swap will be used after several rounds of | ||
666 | * swap. | ||
667 | */ | ||
668 | if (hp_index != -1 && hp_index != type && | ||
669 | swap_info[type]->prio < swap_info[hp_index]->prio && | ||
670 | (swap_info[hp_index]->flags & SWP_WRITEOK)) { | ||
671 | type = hp_index; | ||
672 | swap_list.next = type; | ||
673 | } | ||
674 | |||
675 | si = swap_info[type]; | ||
676 | next = si->next; | ||
677 | if (next < 0 || | ||
678 | (!wrapped && si->prio != swap_info[next]->prio)) { | ||
679 | next = swap_list.head; | ||
680 | wrapped++; | ||
681 | } | ||
682 | 649 | ||
650 | start_over: | ||
651 | plist_for_each_entry_safe(si, next, &swap_avail_head, avail_list) { | ||
652 | /* requeue si to after same-priority siblings */ | ||
653 | plist_requeue(&si->avail_list, &swap_avail_head); | ||
654 | spin_unlock(&swap_avail_lock); | ||
683 | spin_lock(&si->lock); | 655 | spin_lock(&si->lock); |
684 | if (!si->highest_bit) { | 656 | if (!si->highest_bit || !(si->flags & SWP_WRITEOK)) { |
685 | spin_unlock(&si->lock); | 657 | spin_lock(&swap_avail_lock); |
686 | continue; | 658 | if (plist_node_empty(&si->avail_list)) { |
687 | } | 659 | spin_unlock(&si->lock); |
688 | if (!(si->flags & SWP_WRITEOK)) { | 660 | goto nextsi; |
661 | } | ||
662 | WARN(!si->highest_bit, | ||
663 | "swap_info %d in list but !highest_bit\n", | ||
664 | si->type); | ||
665 | WARN(!(si->flags & SWP_WRITEOK), | ||
666 | "swap_info %d in list but !SWP_WRITEOK\n", | ||
667 | si->type); | ||
668 | plist_del(&si->avail_list, &swap_avail_head); | ||
689 | spin_unlock(&si->lock); | 669 | spin_unlock(&si->lock); |
690 | continue; | 670 | goto nextsi; |
691 | } | 671 | } |
692 | 672 | ||
693 | swap_list.next = next; | ||
694 | |||
695 | spin_unlock(&swap_lock); | ||
696 | /* This is called for allocating swap entry for cache */ | 673 | /* This is called for allocating swap entry for cache */ |
697 | offset = scan_swap_map(si, SWAP_HAS_CACHE); | 674 | offset = scan_swap_map(si, SWAP_HAS_CACHE); |
698 | spin_unlock(&si->lock); | 675 | spin_unlock(&si->lock); |
699 | if (offset) | 676 | if (offset) |
700 | return swp_entry(type, offset); | 677 | return swp_entry(si->type, offset); |
701 | spin_lock(&swap_lock); | 678 | pr_debug("scan_swap_map of si %d failed to find offset\n", |
702 | next = swap_list.next; | 679 | si->type); |
680 | spin_lock(&swap_avail_lock); | ||
681 | nextsi: | ||
682 | /* | ||
683 | * if we got here, it's likely that si was almost full before, | ||
684 | * and since scan_swap_map() can drop the si->lock, multiple | ||
685 | * callers probably all tried to get a page from the same si | ||
686 | * and it filled up before we could get one; or, the si filled | ||
687 | * up between us dropping swap_avail_lock and taking si->lock. | ||
688 | * Since we dropped the swap_avail_lock, the swap_avail_head | ||
689 | * list may have been modified; so if next is still in the | ||
690 | * swap_avail_head list then try it, otherwise start over. | ||
691 | */ | ||
692 | if (plist_node_empty(&next->avail_list)) | ||
693 | goto start_over; | ||
703 | } | 694 | } |
704 | 695 | ||
696 | spin_unlock(&swap_avail_lock); | ||
697 | |||
705 | atomic_long_inc(&nr_swap_pages); | 698 | atomic_long_inc(&nr_swap_pages); |
706 | noswap: | 699 | noswap: |
707 | spin_unlock(&swap_lock); | ||
708 | return (swp_entry_t) {0}; | 700 | return (swp_entry_t) {0}; |
709 | } | 701 | } |
710 | 702 | ||
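The plist_requeue() call gives round-robin behaviour among same-priority devices while still preferring higher priorities. A toy userspace model (not kernel code; the selection rule is an assumption distilled from the loop above) shows the resulting interleaving:

#include <stdio.h>

struct dev { const char *name; int prio; int used; };

int main(void)
{
	/* two same-priority devices plus a lower-priority fallback */
	struct dev devs[] = { {"sda2", 10, 0}, {"sdb2", 10, 0}, {"swapfile", 5, 0} };
	int i, pick, n = 3, round;

	for (round = 0; round < 6; round++) {
		pick = 0;
		for (i = 1; i < n; i++)
			if (devs[i].prio > devs[pick].prio ||
			    (devs[i].prio == devs[pick].prio &&
			     devs[i].used < devs[pick].used))
				pick = i;
		devs[pick].used++;	/* models plist_requeue() to the tail */
		printf("allocation %d -> %s\n", round, devs[pick].name);
	}
	return 0;	/* prints sda2, sdb2, sda2, sdb2, ...; swapfile stays unused */
}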
@@ -766,27 +758,6 @@ out: | |||
766 | return NULL; | 758 | return NULL; |
767 | } | 759 | } |
768 | 760 | ||
769 | /* | ||
770 | * This swap type frees swap entry, check if it is the highest priority swap | ||
771 | * type which just frees swap entry. get_swap_page() uses | ||
772 | * highest_priority_index to search highest priority swap type. The | ||
773 | * swap_info_struct.lock can't protect us if there are multiple swap types | ||
774 | * active, so we use atomic_cmpxchg. | ||
775 | */ | ||
776 | static void set_highest_priority_index(int type) | ||
777 | { | ||
778 | int old_hp_index, new_hp_index; | ||
779 | |||
780 | do { | ||
781 | old_hp_index = atomic_read(&highest_priority_index); | ||
782 | if (old_hp_index != -1 && | ||
783 | swap_info[old_hp_index]->prio >= swap_info[type]->prio) | ||
784 | break; | ||
785 | new_hp_index = type; | ||
786 | } while (atomic_cmpxchg(&highest_priority_index, | ||
787 | old_hp_index, new_hp_index) != old_hp_index); | ||
788 | } | ||
789 | |||
790 | static unsigned char swap_entry_free(struct swap_info_struct *p, | 761 | static unsigned char swap_entry_free(struct swap_info_struct *p, |
791 | swp_entry_t entry, unsigned char usage) | 762 | swp_entry_t entry, unsigned char usage) |
792 | { | 763 | { |
@@ -828,9 +799,18 @@ static unsigned char swap_entry_free(struct swap_info_struct *p, | |||
828 | dec_cluster_info_page(p, p->cluster_info, offset); | 799 | dec_cluster_info_page(p, p->cluster_info, offset); |
829 | if (offset < p->lowest_bit) | 800 | if (offset < p->lowest_bit) |
830 | p->lowest_bit = offset; | 801 | p->lowest_bit = offset; |
831 | if (offset > p->highest_bit) | 802 | if (offset > p->highest_bit) { |
803 | bool was_full = !p->highest_bit; | ||
832 | p->highest_bit = offset; | 804 | p->highest_bit = offset; |
833 | set_highest_priority_index(p->type); | 805 | if (was_full && (p->flags & SWP_WRITEOK)) { |
806 | spin_lock(&swap_avail_lock); | ||
807 | WARN_ON(!plist_node_empty(&p->avail_list)); | ||
808 | if (plist_node_empty(&p->avail_list)) | ||
809 | plist_add(&p->avail_list, | ||
810 | &swap_avail_head); | ||
811 | spin_unlock(&swap_avail_lock); | ||
812 | } | ||
813 | } | ||
834 | atomic_long_inc(&nr_swap_pages); | 814 | atomic_long_inc(&nr_swap_pages); |
835 | p->inuse_pages--; | 815 | p->inuse_pages--; |
836 | frontswap_invalidate_page(p->type, offset); | 816 | frontswap_invalidate_page(p->type, offset); |
@@ -1765,30 +1745,37 @@ static void _enable_swap_info(struct swap_info_struct *p, int prio, | |||
1765 | unsigned char *swap_map, | 1745 | unsigned char *swap_map, |
1766 | struct swap_cluster_info *cluster_info) | 1746 | struct swap_cluster_info *cluster_info) |
1767 | { | 1747 | { |
1768 | int i, prev; | ||
1769 | |||
1770 | if (prio >= 0) | 1748 | if (prio >= 0) |
1771 | p->prio = prio; | 1749 | p->prio = prio; |
1772 | else | 1750 | else |
1773 | p->prio = --least_priority; | 1751 | p->prio = --least_priority; |
1752 | /* | ||
1753 | * the plist prio is negated because plist ordering is | ||
1754 | * low-to-high, while swap ordering is high-to-low | ||
1755 | */ | ||
1756 | p->list.prio = -p->prio; | ||
1757 | p->avail_list.prio = -p->prio; | ||
1774 | p->swap_map = swap_map; | 1758 | p->swap_map = swap_map; |
1775 | p->cluster_info = cluster_info; | 1759 | p->cluster_info = cluster_info; |
1776 | p->flags |= SWP_WRITEOK; | 1760 | p->flags |= SWP_WRITEOK; |
1777 | atomic_long_add(p->pages, &nr_swap_pages); | 1761 | atomic_long_add(p->pages, &nr_swap_pages); |
1778 | total_swap_pages += p->pages; | 1762 | total_swap_pages += p->pages; |
1779 | 1763 | ||
1780 | /* insert swap space into swap_list: */ | 1764 | assert_spin_locked(&swap_lock); |
1781 | prev = -1; | 1765 | /* |
1782 | for (i = swap_list.head; i >= 0; i = swap_info[i]->next) { | 1766 | * both lists are plists, and thus priority ordered. |
1783 | if (p->prio >= swap_info[i]->prio) | 1767 | * swap_active_head needs to be priority ordered for swapoff(), |
1784 | break; | 1768 | * which on removal of any swap_info_struct with an auto-assigned |
1785 | prev = i; | 1769 | * (i.e. negative) priority increments the auto-assigned priority |
1786 | } | 1770 | * of any lower-priority swap_info_structs. |
1787 | p->next = i; | 1771 | * swap_avail_head needs to be priority ordered for get_swap_page(), |
1788 | if (prev < 0) | 1772 | * which allocates swap pages from the highest available priority |
1789 | swap_list.head = swap_list.next = p->type; | 1773 | * swap_info_struct. |
1790 | else | 1774 | */ |
1791 | swap_info[prev]->next = p->type; | 1775 | plist_add(&p->list, &swap_active_head); |
1776 | spin_lock(&swap_avail_lock); | ||
1777 | plist_add(&p->avail_list, &swap_avail_head); | ||
1778 | spin_unlock(&swap_avail_lock); | ||
1792 | } | 1779 | } |
1793 | 1780 | ||
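The sign flip on the plist priorities is worth a numeric illustration. A small standalone C example (a toy model, not kernel code) shows that negating the swap priority turns plist's ascending order into highest-swap-priority-first order, with auto-assigned negative priorities sorting last:

#include <stdio.h>
#include <stdlib.h>

static int cmp(const void *a, const void *b)
{
	return *(const int *)a - *(const int *)b;
}

int main(void)
{
	int swap_prio[] = { 5, -1, 10, -2 };	/* -1, -2: auto-assigned by swapon */
	int node_prio[4], i;

	for (i = 0; i < 4; i++)
		node_prio[i] = -swap_prio[i];	/* p->list.prio = -p->prio */
	qsort(node_prio, 4, sizeof(int), cmp);	/* a plist keeps ascending order */
	for (i = 0; i < 4; i++)
		printf("plist prio %3d -> swap priority %3d\n",
		       node_prio[i], -node_prio[i]);
	return 0;	/* walks 10, 5, -1, -2: highest swap priority first */
}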
1794 | static void enable_swap_info(struct swap_info_struct *p, int prio, | 1781 | static void enable_swap_info(struct swap_info_struct *p, int prio, |
@@ -1823,8 +1810,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) | |||
1823 | struct address_space *mapping; | 1810 | struct address_space *mapping; |
1824 | struct inode *inode; | 1811 | struct inode *inode; |
1825 | struct filename *pathname; | 1812 | struct filename *pathname; |
1826 | int i, type, prev; | 1813 | int err, found = 0; |
1827 | int err; | ||
1828 | unsigned int old_block_size; | 1814 | unsigned int old_block_size; |
1829 | 1815 | ||
1830 | if (!capable(CAP_SYS_ADMIN)) | 1816 | if (!capable(CAP_SYS_ADMIN)) |
@@ -1842,17 +1828,16 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) | |||
1842 | goto out; | 1828 | goto out; |
1843 | 1829 | ||
1844 | mapping = victim->f_mapping; | 1830 | mapping = victim->f_mapping; |
1845 | prev = -1; | ||
1846 | spin_lock(&swap_lock); | 1831 | spin_lock(&swap_lock); |
1847 | for (type = swap_list.head; type >= 0; type = swap_info[type]->next) { | 1832 | plist_for_each_entry(p, &swap_active_head, list) { |
1848 | p = swap_info[type]; | ||
1849 | if (p->flags & SWP_WRITEOK) { | 1833 | if (p->flags & SWP_WRITEOK) { |
1850 | if (p->swap_file->f_mapping == mapping) | 1834 | if (p->swap_file->f_mapping == mapping) { |
1835 | found = 1; | ||
1851 | break; | 1836 | break; |
1837 | } | ||
1852 | } | 1838 | } |
1853 | prev = type; | ||
1854 | } | 1839 | } |
1855 | if (type < 0) { | 1840 | if (!found) { |
1856 | err = -EINVAL; | 1841 | err = -EINVAL; |
1857 | spin_unlock(&swap_lock); | 1842 | spin_unlock(&swap_lock); |
1858 | goto out_dput; | 1843 | goto out_dput; |
@@ -1864,20 +1849,21 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) | |||
1864 | spin_unlock(&swap_lock); | 1849 | spin_unlock(&swap_lock); |
1865 | goto out_dput; | 1850 | goto out_dput; |
1866 | } | 1851 | } |
1867 | if (prev < 0) | 1852 | spin_lock(&swap_avail_lock); |
1868 | swap_list.head = p->next; | 1853 | plist_del(&p->avail_list, &swap_avail_head); |
1869 | else | 1854 | spin_unlock(&swap_avail_lock); |
1870 | swap_info[prev]->next = p->next; | ||
1871 | if (type == swap_list.next) { | ||
1872 | /* just pick something that's safe... */ | ||
1873 | swap_list.next = swap_list.head; | ||
1874 | } | ||
1875 | spin_lock(&p->lock); | 1855 | spin_lock(&p->lock); |
1876 | if (p->prio < 0) { | 1856 | if (p->prio < 0) { |
1877 | for (i = p->next; i >= 0; i = swap_info[i]->next) | 1857 | struct swap_info_struct *si = p; |
1878 | swap_info[i]->prio = p->prio--; | 1858 | |
1859 | plist_for_each_entry_continue(si, &swap_active_head, list) { | ||
1860 | si->prio++; | ||
1861 | si->list.prio--; | ||
1862 | si->avail_list.prio--; | ||
1863 | } | ||
1879 | least_priority++; | 1864 | least_priority++; |
1880 | } | 1865 | } |
1866 | plist_del(&p->list, &swap_active_head); | ||
1881 | atomic_long_sub(p->pages, &nr_swap_pages); | 1867 | atomic_long_sub(p->pages, &nr_swap_pages); |
1882 | total_swap_pages -= p->pages; | 1868 | total_swap_pages -= p->pages; |
1883 | p->flags &= ~SWP_WRITEOK; | 1869 | p->flags &= ~SWP_WRITEOK; |
@@ -1885,7 +1871,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) | |||
1885 | spin_unlock(&swap_lock); | 1871 | spin_unlock(&swap_lock); |
1886 | 1872 | ||
1887 | set_current_oom_origin(); | 1873 | set_current_oom_origin(); |
1888 | err = try_to_unuse(type, false, 0); /* force all pages to be unused */ | 1874 | err = try_to_unuse(p->type, false, 0); /* force unuse all pages */ |
1889 | clear_current_oom_origin(); | 1875 | clear_current_oom_origin(); |
1890 | 1876 | ||
1891 | if (err) { | 1877 | if (err) { |
@@ -1926,7 +1912,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) | |||
1926 | frontswap_map = frontswap_map_get(p); | 1912 | frontswap_map = frontswap_map_get(p); |
1927 | spin_unlock(&p->lock); | 1913 | spin_unlock(&p->lock); |
1928 | spin_unlock(&swap_lock); | 1914 | spin_unlock(&swap_lock); |
1929 | frontswap_invalidate_area(type); | 1915 | frontswap_invalidate_area(p->type); |
1930 | frontswap_map_set(p, NULL); | 1916 | frontswap_map_set(p, NULL); |
1931 | mutex_unlock(&swapon_mutex); | 1917 | mutex_unlock(&swapon_mutex); |
1932 | free_percpu(p->percpu_cluster); | 1918 | free_percpu(p->percpu_cluster); |
@@ -1935,7 +1921,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) | |||
1935 | vfree(cluster_info); | 1921 | vfree(cluster_info); |
1936 | vfree(frontswap_map); | 1922 | vfree(frontswap_map); |
1937 | /* Destroy swap account information */ | 1923 | /* Destroy swap account information */ |
1938 | swap_cgroup_swapoff(type); | 1924 | swap_cgroup_swapoff(p->type); |
1939 | 1925 | ||
1940 | inode = mapping->host; | 1926 | inode = mapping->host; |
1941 | if (S_ISBLK(inode->i_mode)) { | 1927 | if (S_ISBLK(inode->i_mode)) { |
@@ -2142,8 +2128,9 @@ static struct swap_info_struct *alloc_swap_info(void) | |||
2142 | */ | 2128 | */ |
2143 | } | 2129 | } |
2144 | INIT_LIST_HEAD(&p->first_swap_extent.list); | 2130 | INIT_LIST_HEAD(&p->first_swap_extent.list); |
2131 | plist_node_init(&p->list, 0); | ||
2132 | plist_node_init(&p->avail_list, 0); | ||
2145 | p->flags = SWP_USED; | 2133 | p->flags = SWP_USED; |
2146 | p->next = -1; | ||
2147 | spin_unlock(&swap_lock); | 2134 | spin_unlock(&swap_lock); |
2148 | spin_lock_init(&p->lock); | 2135 | spin_lock_init(&p->lock); |
2149 | 2136 | ||
diff --git a/mm/vmacache.c b/mm/vmacache.c index 1037a3bab505..9f25af825dec 100644 --- a/mm/vmacache.c +++ b/mm/vmacache.c | |||
@@ -17,6 +17,16 @@ void vmacache_flush_all(struct mm_struct *mm) | |||
17 | { | 17 | { |
18 | struct task_struct *g, *p; | 18 | struct task_struct *g, *p; |
19 | 19 | ||
20 | /* | ||
21 | * Single threaded tasks need not iterate the entire | ||
22 | * list of processes. We can avoid the flushing as well | ||
23 | * since the mm's seqnum was increased and we don't have | ||
24 | * to worry about other threads' seqnums. Current's | ||
25 | * flush will occur upon the next lookup. | ||
26 | */ | ||
27 | if (atomic_read(&mm->mm_users) == 1) | ||
28 | return; | ||
29 | |||
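The shortcut relies on the lazy, seqnum-based invalidation that every lookup already performs. Conceptually (a simplified sketch assumed from the surrounding code, not the exact helper), the per-task cache is dropped on the next lookup anyway:

/* Simplified sketch: a lookup first compares the task's cached seqnum with
 * the mm's; on mismatch the per-task cache is wiped and the caller falls
 * back to the rb-tree walk. */
static bool vmacache_still_valid(struct mm_struct *mm)
{
	if (mm != current->mm)
		return false;
	if (current->vmacache_seqnum != mm->vmacache_seqnum) {
		current->vmacache_seqnum = mm->vmacache_seqnum;
		memset(current->vmacache, 0, sizeof(current->vmacache));
		return false;
	}
	return true;
}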
20 | rcu_read_lock(); | 30 | rcu_read_lock(); |
21 | for_each_process_thread(g, p) { | 31 | for_each_process_thread(g, p) { |
22 | /* | 32 | /* |
@@ -78,6 +88,8 @@ struct vm_area_struct *vmacache_find(struct mm_struct *mm, unsigned long addr) | |||
78 | if (!vmacache_valid(mm)) | 88 | if (!vmacache_valid(mm)) |
79 | return NULL; | 89 | return NULL; |
80 | 90 | ||
91 | count_vm_vmacache_event(VMACACHE_FIND_CALLS); | ||
92 | |||
81 | for (i = 0; i < VMACACHE_SIZE; i++) { | 93 | for (i = 0; i < VMACACHE_SIZE; i++) { |
82 | struct vm_area_struct *vma = current->vmacache[i]; | 94 | struct vm_area_struct *vma = current->vmacache[i]; |
83 | 95 | ||
@@ -85,8 +97,10 @@ struct vm_area_struct *vmacache_find(struct mm_struct *mm, unsigned long addr) | |||
85 | continue; | 97 | continue; |
86 | if (WARN_ON_ONCE(vma->vm_mm != mm)) | 98 | if (WARN_ON_ONCE(vma->vm_mm != mm)) |
87 | break; | 99 | break; |
88 | if (vma->vm_start <= addr && vma->vm_end > addr) | 100 | if (vma->vm_start <= addr && vma->vm_end > addr) { |
101 | count_vm_vmacache_event(VMACACHE_FIND_HITS); | ||
89 | return vma; | 102 | return vma; |
103 | } | ||
90 | } | 104 | } |
91 | 105 | ||
92 | return NULL; | 106 | return NULL; |
@@ -102,11 +116,15 @@ struct vm_area_struct *vmacache_find_exact(struct mm_struct *mm, | |||
102 | if (!vmacache_valid(mm)) | 116 | if (!vmacache_valid(mm)) |
103 | return NULL; | 117 | return NULL; |
104 | 118 | ||
119 | count_vm_vmacache_event(VMACACHE_FIND_CALLS); | ||
120 | |||
105 | for (i = 0; i < VMACACHE_SIZE; i++) { | 121 | for (i = 0; i < VMACACHE_SIZE; i++) { |
106 | struct vm_area_struct *vma = current->vmacache[i]; | 122 | struct vm_area_struct *vma = current->vmacache[i]; |
107 | 123 | ||
108 | if (vma && vma->vm_start == start && vma->vm_end == end) | 124 | if (vma && vma->vm_start == start && vma->vm_end == end) { |
125 | count_vm_vmacache_event(VMACACHE_FIND_HITS); | ||
109 | return vma; | 126 | return vma; |
127 | } | ||
110 | } | 128 | } |
111 | 129 | ||
112 | return NULL; | 130 | return NULL; |
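With the new VMACACHE_FIND_CALLS/VMACACHE_FIND_HITS events, the cache hit rate can be derived from /proc/vmstat. A small userspace reader (assuming the counters are exported under the names vmacache_find_calls and vmacache_find_hits) could look like:

#include <stdio.h>
#include <string.h>

int main(void)
{
	char name[64];
	unsigned long long val, calls = 0, hits = 0;
	FILE *f = fopen("/proc/vmstat", "r");

	if (!f)
		return 1;
	while (fscanf(f, "%63s %llu", name, &val) == 2) {
		if (!strcmp(name, "vmacache_find_calls"))
			calls = val;
		else if (!strcmp(name, "vmacache_find_hits"))
			hits = val;
	}
	fclose(f);
	if (calls)
		printf("vmacache hit rate: %.1f%%\n", 100.0 * hits / calls);
	return 0;
}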
diff --git a/mm/vmalloc.c b/mm/vmalloc.c index bf233b283319..f64632b67196 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c | |||
@@ -1268,6 +1268,7 @@ void unmap_kernel_range(unsigned long addr, unsigned long size) | |||
1268 | vunmap_page_range(addr, end); | 1268 | vunmap_page_range(addr, end); |
1269 | flush_tlb_kernel_range(addr, end); | 1269 | flush_tlb_kernel_range(addr, end); |
1270 | } | 1270 | } |
1271 | EXPORT_SYMBOL_GPL(unmap_kernel_range); | ||
1271 | 1272 | ||
1272 | int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page ***pages) | 1273 | int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page ***pages) |
1273 | { | 1274 | { |
@@ -1496,7 +1497,7 @@ void vfree(const void *addr) | |||
1496 | if (!addr) | 1497 | if (!addr) |
1497 | return; | 1498 | return; |
1498 | if (unlikely(in_interrupt())) { | 1499 | if (unlikely(in_interrupt())) { |
1499 | struct vfree_deferred *p = &__get_cpu_var(vfree_deferred); | 1500 | struct vfree_deferred *p = this_cpu_ptr(&vfree_deferred); |
1500 | if (llist_add((struct llist_node *)addr, &p->list)) | 1501 | if (llist_add((struct llist_node *)addr, &p->list)) |
1501 | schedule_work(&p->wq); | 1502 | schedule_work(&p->wq); |
1502 | } else | 1503 | } else |
@@ -2619,19 +2620,19 @@ static int s_show(struct seq_file *m, void *p) | |||
2619 | seq_printf(m, " phys=%llx", (unsigned long long)v->phys_addr); | 2620 | seq_printf(m, " phys=%llx", (unsigned long long)v->phys_addr); |
2620 | 2621 | ||
2621 | if (v->flags & VM_IOREMAP) | 2622 | if (v->flags & VM_IOREMAP) |
2622 | seq_printf(m, " ioremap"); | 2623 | seq_puts(m, " ioremap"); |
2623 | 2624 | ||
2624 | if (v->flags & VM_ALLOC) | 2625 | if (v->flags & VM_ALLOC) |
2625 | seq_printf(m, " vmalloc"); | 2626 | seq_puts(m, " vmalloc"); |
2626 | 2627 | ||
2627 | if (v->flags & VM_MAP) | 2628 | if (v->flags & VM_MAP) |
2628 | seq_printf(m, " vmap"); | 2629 | seq_puts(m, " vmap"); |
2629 | 2630 | ||
2630 | if (v->flags & VM_USERMAP) | 2631 | if (v->flags & VM_USERMAP) |
2631 | seq_printf(m, " user"); | 2632 | seq_puts(m, " user"); |
2632 | 2633 | ||
2633 | if (v->flags & VM_VPAGES) | 2634 | if (v->flags & VM_VPAGES) |
2634 | seq_printf(m, " vpages"); | 2635 | seq_puts(m, " vpages"); |
2635 | 2636 | ||
2636 | show_numa_info(m, v); | 2637 | show_numa_info(m, v); |
2637 | seq_putc(m, '\n'); | 2638 | seq_putc(m, '\n'); |
diff --git a/mm/vmscan.c b/mm/vmscan.c index 32c661d66a45..9149444f947d 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c | |||
@@ -324,7 +324,7 @@ shrink_slab_node(struct shrink_control *shrinkctl, struct shrinker *shrinker, | |||
324 | else | 324 | else |
325 | new_nr = atomic_long_read(&shrinker->nr_deferred[nid]); | 325 | new_nr = atomic_long_read(&shrinker->nr_deferred[nid]); |
326 | 326 | ||
327 | trace_mm_shrink_slab_end(shrinker, freed, nr, new_nr); | 327 | trace_mm_shrink_slab_end(shrinker, nid, freed, nr, new_nr, total_scan); |
328 | return freed; | 328 | return freed; |
329 | } | 329 | } |
330 | 330 | ||
@@ -1121,7 +1121,7 @@ keep: | |||
1121 | VM_BUG_ON_PAGE(PageLRU(page) || PageUnevictable(page), page); | 1121 | VM_BUG_ON_PAGE(PageLRU(page) || PageUnevictable(page), page); |
1122 | } | 1122 | } |
1123 | 1123 | ||
1124 | free_hot_cold_page_list(&free_pages, 1); | 1124 | free_hot_cold_page_list(&free_pages, true); |
1125 | 1125 | ||
1126 | list_splice(&ret_pages, page_list); | 1126 | list_splice(&ret_pages, page_list); |
1127 | count_vm_events(PGACTIVATE, pgactivate); | 1127 | count_vm_events(PGACTIVATE, pgactivate); |
@@ -1439,6 +1439,19 @@ putback_inactive_pages(struct lruvec *lruvec, struct list_head *page_list) | |||
1439 | } | 1439 | } |
1440 | 1440 | ||
1441 | /* | 1441 | /* |
1442 | * If a kernel thread (such as nfsd for loop-back mounts) services | ||
1443 | * a backing device by writing to the page cache it sets PF_LESS_THROTTLE. | ||
1444 | * In that case we should only throttle if the backing device it is | ||
1445 | * writing to is congested. In other cases it is safe to throttle. | ||
1446 | */ | ||
1447 | static int current_may_throttle(void) | ||
1448 | { | ||
1449 | return !(current->flags & PF_LESS_THROTTLE) || | ||
1450 | current->backing_dev_info == NULL || | ||
1451 | bdi_write_congested(current->backing_dev_info); | ||
1452 | } | ||
1453 | |||
1454 | /* | ||
1442 | * shrink_inactive_list() is a helper for shrink_zone(). It returns the number | 1455 | * shrink_inactive_list() is a helper for shrink_zone(). It returns the number |
1443 | * of reclaimed pages | 1456 | * of reclaimed pages |
1444 | */ | 1457 | */ |
@@ -1519,7 +1532,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, | |||
1519 | 1532 | ||
1520 | spin_unlock_irq(&zone->lru_lock); | 1533 | spin_unlock_irq(&zone->lru_lock); |
1521 | 1534 | ||
1522 | free_hot_cold_page_list(&page_list, 1); | 1535 | free_hot_cold_page_list(&page_list, true); |
1523 | 1536 | ||
1524 | /* | 1537 | /* |
1525 | * If reclaim is isolating dirty pages under writeback, it implies | 1538 | * If reclaim is isolating dirty pages under writeback, it implies |
@@ -1566,7 +1579,8 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, | |||
1566 | * implies that pages are cycling through the LRU faster than | 1579 | * implies that pages are cycling through the LRU faster than |
1567 | * they are written so also forcibly stall. | 1580 | * they are written so also forcibly stall. |
1568 | */ | 1581 | */ |
1569 | if (nr_unqueued_dirty == nr_taken || nr_immediate) | 1582 | if ((nr_unqueued_dirty == nr_taken || nr_immediate) && |
1583 | current_may_throttle()) | ||
1570 | congestion_wait(BLK_RW_ASYNC, HZ/10); | 1584 | congestion_wait(BLK_RW_ASYNC, HZ/10); |
1571 | } | 1585 | } |
1572 | 1586 | ||
@@ -1575,7 +1589,8 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, | |||
1575 | * is congested. Allow kswapd to continue until it starts encountering | 1589 | * is congested. Allow kswapd to continue until it starts encountering |
1576 | * unqueued dirty pages or cycling through the LRU too quickly. | 1590 | * unqueued dirty pages or cycling through the LRU too quickly. |
1577 | */ | 1591 | */ |
1578 | if (!sc->hibernation_mode && !current_is_kswapd()) | 1592 | if (!sc->hibernation_mode && !current_is_kswapd() && |
1593 | current_may_throttle()) | ||
1579 | wait_iff_congested(zone, BLK_RW_ASYNC, HZ/10); | 1594 | wait_iff_congested(zone, BLK_RW_ASYNC, HZ/10); |
1580 | 1595 | ||
1581 | trace_mm_vmscan_lru_shrink_inactive(zone->zone_pgdat->node_id, | 1596 | trace_mm_vmscan_lru_shrink_inactive(zone->zone_pgdat->node_id, |
@@ -1740,7 +1755,7 @@ static void shrink_active_list(unsigned long nr_to_scan, | |||
1740 | __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken); | 1755 | __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken); |
1741 | spin_unlock_irq(&zone->lru_lock); | 1756 | spin_unlock_irq(&zone->lru_lock); |
1742 | 1757 | ||
1743 | free_hot_cold_page_list(&l_hold, 1); | 1758 | free_hot_cold_page_list(&l_hold, true); |
1744 | } | 1759 | } |
1745 | 1760 | ||
1746 | #ifdef CONFIG_SWAP | 1761 | #ifdef CONFIG_SWAP |
@@ -1866,6 +1881,8 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, | |||
1866 | bool force_scan = false; | 1881 | bool force_scan = false; |
1867 | unsigned long ap, fp; | 1882 | unsigned long ap, fp; |
1868 | enum lru_list lru; | 1883 | enum lru_list lru; |
1884 | bool some_scanned; | ||
1885 | int pass; | ||
1869 | 1886 | ||
1870 | /* | 1887 | /* |
1871 | * If the zone or memcg is small, nr[l] can be 0. This | 1888 | * If the zone or memcg is small, nr[l] can be 0. This |
@@ -1989,39 +2006,49 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, | |||
1989 | fraction[1] = fp; | 2006 | fraction[1] = fp; |
1990 | denominator = ap + fp + 1; | 2007 | denominator = ap + fp + 1; |
1991 | out: | 2008 | out: |
1992 | for_each_evictable_lru(lru) { | 2009 | some_scanned = false; |
1993 | int file = is_file_lru(lru); | 2010 | /* Only use force_scan on second pass. */ |
1994 | unsigned long size; | 2011 | for (pass = 0; !some_scanned && pass < 2; pass++) { |
1995 | unsigned long scan; | 2012 | for_each_evictable_lru(lru) { |
2013 | int file = is_file_lru(lru); | ||
2014 | unsigned long size; | ||
2015 | unsigned long scan; | ||
1996 | 2016 | ||
1997 | size = get_lru_size(lruvec, lru); | 2017 | size = get_lru_size(lruvec, lru); |
1998 | scan = size >> sc->priority; | 2018 | scan = size >> sc->priority; |
1999 | 2019 | ||
2000 | if (!scan && force_scan) | 2020 | if (!scan && pass && force_scan) |
2001 | scan = min(size, SWAP_CLUSTER_MAX); | 2021 | scan = min(size, SWAP_CLUSTER_MAX); |
2002 | 2022 | ||
2003 | switch (scan_balance) { | 2023 | switch (scan_balance) { |
2004 | case SCAN_EQUAL: | 2024 | case SCAN_EQUAL: |
2005 | /* Scan lists relative to size */ | 2025 | /* Scan lists relative to size */ |
2006 | break; | 2026 | break; |
2007 | case SCAN_FRACT: | 2027 | case SCAN_FRACT: |
2028 | /* | ||
2029 | * Scan types proportional to swappiness and | ||
2030 | * their relative recent reclaim efficiency. | ||
2031 | */ | ||
2032 | scan = div64_u64(scan * fraction[file], | ||
2033 | denominator); | ||
2034 | break; | ||
2035 | case SCAN_FILE: | ||
2036 | case SCAN_ANON: | ||
2037 | /* Scan one type exclusively */ | ||
2038 | if ((scan_balance == SCAN_FILE) != file) | ||
2039 | scan = 0; | ||
2040 | break; | ||
2041 | default: | ||
2042 | /* Look ma, no brain */ | ||
2043 | BUG(); | ||
2044 | } | ||
2045 | nr[lru] = scan; | ||
2008 | /* | 2046 | /* |
2009 | * Scan types proportional to swappiness and | 2047 | * Skip the second pass and don't force_scan, |
2010 | * their relative recent reclaim efficiency. | 2048 | * if we found something to scan. |
2011 | */ | 2049 | */ |
2012 | scan = div64_u64(scan * fraction[file], denominator); | 2050 | some_scanned |= !!scan; |
2013 | break; | ||
2014 | case SCAN_FILE: | ||
2015 | case SCAN_ANON: | ||
2016 | /* Scan one type exclusively */ | ||
2017 | if ((scan_balance == SCAN_FILE) != file) | ||
2018 | scan = 0; | ||
2019 | break; | ||
2020 | default: | ||
2021 | /* Look ma, no brain */ | ||
2022 | BUG(); | ||
2023 | } | 2051 | } |
2024 | nr[lru] = scan; | ||
2025 | } | 2052 | } |
2026 | } | 2053 | } |
2027 | 2054 | ||
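The effect of deferring force_scan to a second pass is easiest to see with numbers: a small LRU is only bumped up to SWAP_CLUSTER_MAX when the first pass found nothing at all to scan. A standalone illustration (SWAP_CLUSTER_MAX = 32 and DEF_PRIORITY = 12 are assumed upstream values):

#include <stdio.h>

#define SWAP_CLUSTER_MAX 32UL	/* assumed upstream value */

int main(void)
{
	unsigned long size = 2000, scan;
	int priority = 12;		/* DEF_PRIORITY (assumed) */

	scan = size >> priority;	/* 0: too small to scan at this priority */
	if (!scan)			/* second pass with force_scan set */
		scan = size < SWAP_CLUSTER_MAX ? size : SWAP_CLUSTER_MAX;
	printf("scan = %lu\n", scan);	/* prints 32 */
	return 0;
}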
@@ -2037,13 +2064,27 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) | |||
2037 | unsigned long nr_reclaimed = 0; | 2064 | unsigned long nr_reclaimed = 0; |
2038 | unsigned long nr_to_reclaim = sc->nr_to_reclaim; | 2065 | unsigned long nr_to_reclaim = sc->nr_to_reclaim; |
2039 | struct blk_plug plug; | 2066 | struct blk_plug plug; |
2040 | bool scan_adjusted = false; | 2067 | bool scan_adjusted; |
2041 | 2068 | ||
2042 | get_scan_count(lruvec, sc, nr); | 2069 | get_scan_count(lruvec, sc, nr); |
2043 | 2070 | ||
2044 | /* Record the original scan target for proportional adjustments later */ | 2071 | /* Record the original scan target for proportional adjustments later */ |
2045 | memcpy(targets, nr, sizeof(nr)); | 2072 | memcpy(targets, nr, sizeof(nr)); |
2046 | 2073 | ||
2074 | /* | ||
2075 | * Global reclaiming within direct reclaim at DEF_PRIORITY is a normal | ||
2076 | * event that can occur when there is little memory pressure e.g. | ||
2077 | * multiple streaming readers/writers. Hence, we do not abort scanning | ||
2078 | * once the requested number of pages has been reclaimed when scanning at | ||
2079 | * DEF_PRIORITY on the assumption that the fact we are direct | ||
2080 | * reclaiming implies that kswapd is not keeping up and it is best to | ||
2081 | * do a batch of work at once. For memcg reclaim one check is made to | ||
2082 | * abort proportional reclaim if either the file or anon lru has already | ||
2083 | * dropped to zero at the first pass. | ||
2084 | */ | ||
2085 | scan_adjusted = (global_reclaim(sc) && !current_is_kswapd() && | ||
2086 | sc->priority == DEF_PRIORITY); | ||
2087 | |||
2047 | blk_start_plug(&plug); | 2088 | blk_start_plug(&plug); |
2048 | while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || | 2089 | while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || |
2049 | nr[LRU_INACTIVE_FILE]) { | 2090 | nr[LRU_INACTIVE_FILE]) { |
@@ -2064,17 +2105,8 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) | |||
2064 | continue; | 2105 | continue; |
2065 | 2106 | ||
2066 | /* | 2107 | /* |
2067 | * For global direct reclaim, reclaim only the number of pages | ||
2068 | * requested. Less care is taken to scan proportionally as it | ||
2069 | * is more important to minimise direct reclaim stall latency | ||
2070 | * than it is to properly age the LRU lists. | ||
2071 | */ | ||
2072 | if (global_reclaim(sc) && !current_is_kswapd()) | ||
2073 | break; | ||
2074 | |||
2075 | /* | ||
2076 | * For kswapd and memcg, reclaim at least the number of pages | 2108 | * For kswapd and memcg, reclaim at least the number of pages |
2077 | * requested. Ensure that the anon and file LRUs shrink | 2109 | * requested. Ensure that the anon and file LRUs are scanned |
2078 | * proportionally to what was requested by get_scan_count(). We | 2110 | * proportionally to what was requested by get_scan_count(). We |
2079 | * stop reclaiming one LRU and reduce the amount of scanning | 2111 | * stop reclaiming one LRU and reduce the amount of scanning |
2080 | * proportional to the original scan target. | 2112 | * proportional to the original scan target. |
@@ -2082,6 +2114,15 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) | |||
2082 | nr_file = nr[LRU_INACTIVE_FILE] + nr[LRU_ACTIVE_FILE]; | 2114 | nr_file = nr[LRU_INACTIVE_FILE] + nr[LRU_ACTIVE_FILE]; |
2083 | nr_anon = nr[LRU_INACTIVE_ANON] + nr[LRU_ACTIVE_ANON]; | 2115 | nr_anon = nr[LRU_INACTIVE_ANON] + nr[LRU_ACTIVE_ANON]; |
2084 | 2116 | ||
2117 | /* | ||
2118 | * It's just vindictive to attack the larger once the smaller | ||
2119 | * has gone to zero. And given the way we stop scanning the | ||
2120 | * smaller below, this makes sure that we only make one nudge | ||
2121 | * towards proportionality once we've got nr_to_reclaim. | ||
2122 | */ | ||
2123 | if (!nr_file || !nr_anon) | ||
2124 | break; | ||
2125 | |||
2085 | if (nr_file > nr_anon) { | 2126 | if (nr_file > nr_anon) { |
2086 | unsigned long scan_target = targets[LRU_INACTIVE_ANON] + | 2127 | unsigned long scan_target = targets[LRU_INACTIVE_ANON] + |
2087 | targets[LRU_ACTIVE_ANON] + 1; | 2128 | targets[LRU_ACTIVE_ANON] + 1; |
@@ -2268,9 +2309,8 @@ static inline bool compaction_ready(struct zone *zone, struct scan_control *sc) | |||
2268 | * there is a buffer of free pages available to give compaction | 2309 | * there is a buffer of free pages available to give compaction |
2269 | * a reasonable chance of completing and allocating the page | 2310 | * a reasonable chance of completing and allocating the page |
2270 | */ | 2311 | */ |
2271 | balance_gap = min(low_wmark_pages(zone), | 2312 | balance_gap = min(low_wmark_pages(zone), DIV_ROUND_UP( |
2272 | (zone->managed_pages + KSWAPD_ZONE_BALANCE_GAP_RATIO-1) / | 2313 | zone->managed_pages, KSWAPD_ZONE_BALANCE_GAP_RATIO)); |
2273 | KSWAPD_ZONE_BALANCE_GAP_RATIO); | ||
2274 | watermark = high_wmark_pages(zone) + balance_gap + (2UL << sc->order); | 2314 | watermark = high_wmark_pages(zone) + balance_gap + (2UL << sc->order); |
2275 | watermark_ok = zone_watermark_ok_safe(zone, 0, watermark, 0, 0); | 2315 | watermark_ok = zone_watermark_ok_safe(zone, 0, watermark, 0, 0); |
2276 | 2316 | ||
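The DIV_ROUND_UP() conversion is purely cosmetic; both spellings round the per-zone gap up. A tiny standalone check (the ratio value of 100 is an assumption about KSWAPD_ZONE_BALANCE_GAP_RATIO):

#include <stdio.h>

#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))

int main(void)
{
	unsigned long managed_pages = 1000003;	/* arbitrary example */
	unsigned long ratio = 100;		/* KSWAPD_ZONE_BALANCE_GAP_RATIO (assumed) */

	/* old open-coded form and the new macro compute the same value */
	printf("%lu %lu\n",
	       (managed_pages + ratio - 1) / ratio,
	       DIV_ROUND_UP(managed_pages, ratio));	/* both print 10001 */
	return 0;
}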
@@ -2525,10 +2565,17 @@ static bool pfmemalloc_watermark_ok(pg_data_t *pgdat) | |||
2525 | 2565 | ||
2526 | for (i = 0; i <= ZONE_NORMAL; i++) { | 2566 | for (i = 0; i <= ZONE_NORMAL; i++) { |
2527 | zone = &pgdat->node_zones[i]; | 2567 | zone = &pgdat->node_zones[i]; |
2568 | if (!populated_zone(zone)) | ||
2569 | continue; | ||
2570 | |||
2528 | pfmemalloc_reserve += min_wmark_pages(zone); | 2571 | pfmemalloc_reserve += min_wmark_pages(zone); |
2529 | free_pages += zone_page_state(zone, NR_FREE_PAGES); | 2572 | free_pages += zone_page_state(zone, NR_FREE_PAGES); |
2530 | } | 2573 | } |
2531 | 2574 | ||
2575 | /* If there are no reserves (unexpected config) then do not throttle */ | ||
2576 | if (!pfmemalloc_reserve) | ||
2577 | return true; | ||
2578 | |||
2532 | wmark_ok = free_pages > pfmemalloc_reserve / 2; | 2579 | wmark_ok = free_pages > pfmemalloc_reserve / 2; |
2533 | 2580 | ||
2534 | /* kswapd must be awake if processes are being throttled */ | 2581 | /* kswapd must be awake if processes are being throttled */ |
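The two additions above make pfmemalloc_watermark_ok() skip unpopulated zones and report success when the node carries no reserves at all, so callers are not throttled against a watermark that cannot exist. A compact sketch of the resulting check; struct toy_zone, watermark_ok() and the numbers are invented for illustration:

#include <stdbool.h>
#include <stdio.h>

struct toy_zone {
	unsigned long present_pages;	/* 0 means the zone is not populated */
	unsigned long min_wmark;
	unsigned long free_pages;
};

/* Skip empty zones; never throttle when there is no reserve to compare against. */
static bool watermark_ok(const struct toy_zone *zones, int nr)
{
	unsigned long reserve = 0, free = 0;
	int i;

	for (i = 0; i < nr; i++) {
		if (!zones[i].present_pages)
			continue;		/* unpopulated: contributes nothing */
		reserve += zones[i].min_wmark;
		free += zones[i].free_pages;
	}

	if (!reserve)				/* unexpected config: do not throttle */
		return true;

	return free > reserve / 2;
}

int main(void)
{
	struct toy_zone node[] = {
		{ .present_pages = 0 },		/* empty low zone */
		{ .present_pages = 1000, .min_wmark = 50, .free_pages = 40 },
	};

	printf("throttle? %s\n", watermark_ok(node, 2) ? "no" : "yes");
	return 0;
}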
@@ -2553,9 +2600,9 @@ static bool pfmemalloc_watermark_ok(pg_data_t *pgdat) | |||
2553 | static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist, | 2600 | static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist, |
2554 | nodemask_t *nodemask) | 2601 | nodemask_t *nodemask) |
2555 | { | 2602 | { |
2603 | struct zoneref *z; | ||
2556 | struct zone *zone; | 2604 | struct zone *zone; |
2557 | int high_zoneidx = gfp_zone(gfp_mask); | 2605 | pg_data_t *pgdat = NULL; |
2558 | pg_data_t *pgdat; | ||
2559 | 2606 | ||
2560 | /* | 2607 | /* |
2561 | * Kernel threads should not be throttled as they may be indirectly | 2608 | * Kernel threads should not be throttled as they may be indirectly |
@@ -2574,10 +2621,34 @@ static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist, | |||
2574 | if (fatal_signal_pending(current)) | 2621 | if (fatal_signal_pending(current)) |
2575 | goto out; | 2622 | goto out; |
2576 | 2623 | ||
2577 | /* Check if the pfmemalloc reserves are ok */ | 2624 | /* |
2578 | first_zones_zonelist(zonelist, high_zoneidx, NULL, &zone); | 2625 | * Check if the pfmemalloc reserves are ok by finding the first node |
2579 | pgdat = zone->zone_pgdat; | 2626 | * with a usable ZONE_NORMAL or lower zone. The expectation is that |
2580 | if (pfmemalloc_watermark_ok(pgdat)) | 2627 | * GFP_KERNEL will be required for allocating network buffers when |
2628 | * swapping over the network so ZONE_HIGHMEM is unusable. | ||
2629 | * | ||
2630 | * Throttling is based on the first usable node and throttled processes | ||
2631 | * wait on a queue until kswapd makes progress and wakes them. There | ||
2632 | * is an affinity then between processes waking up and where reclaim | ||
2633 | * progress has been made assuming the process wakes on the same node. | ||
2634 | * More importantly, processes running on remote nodes will not compete | ||
2635 | * for remote pfmemalloc reserves and processes on different nodes | ||
2636 | * should make reasonable progress. | ||
2637 | */ | ||
2638 | for_each_zone_zonelist_nodemask(zone, z, zonelist, | ||
2639 | gfp_mask, nodemask) { | ||
2640 | if (zone_idx(zone) > ZONE_NORMAL) | ||
2641 | continue; | ||
2642 | |||
2643 | /* Throttle based on the first usable node */ | ||
2644 | pgdat = zone->zone_pgdat; | ||
2645 | if (pfmemalloc_watermark_ok(pgdat)) | ||
2646 | goto out; | ||
2647 | break; | ||
2648 | } | ||
2649 | |||
2650 | /* If no zone was usable by the allocation flags then do not throttle */ | ||
2651 | if (!pgdat) | ||
2581 | goto out; | 2652 | goto out; |
2582 | 2653 | ||
2583 | /* Account for the throttling */ | 2654 | /* Account for the throttling */ |
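The rewritten throttle_direct_reclaim() check walks the allocation's zonelist instead of trusting its first zone: zones above ZONE_NORMAL are ignored (swap-over-network needs GFP_KERNEL memory), the first usable node decides whether to throttle, and with no usable zone the caller is not throttled at all. A toy rendering of that selection loop; struct toy_zone and its fields stand in for the real zone/pgdat structures and are not the kernel API:

#include <stdbool.h>
#include <stdio.h>

enum toy_zone_type { ZONE_DMA, ZONE_NORMAL, ZONE_HIGHMEM };

struct toy_zone {
	enum toy_zone_type idx;
	int node;		/* stand-in for zone->zone_pgdat */
	bool reserves_ok;	/* stand-in for pfmemalloc_watermark_ok() */
};

int main(void)
{
	/* A zonelist as the allocator would walk it, highest zones first. */
	struct toy_zone zonelist[] = {
		{ ZONE_HIGHMEM, 0, true  },	/* skipped: above ZONE_NORMAL */
		{ ZONE_NORMAL,  0, false },	/* first usable node decides  */
		{ ZONE_NORMAL,  1, true  },	/* never reached              */
	};
	int nr = sizeof(zonelist) / sizeof(zonelist[0]);
	int chosen_node = -1;
	bool throttle = false;
	int i;

	for (i = 0; i < nr; i++) {
		if (zonelist[i].idx > ZONE_NORMAL)
			continue;
		chosen_node = zonelist[i].node;
		throttle = !zonelist[i].reserves_ok;
		break;			/* throttle based on the first usable node */
	}

	if (chosen_node < 0)
		throttle = false;	/* no usable zone: do not throttle */

	printf("node %d, throttle=%d\n", chosen_node, throttle);
	return 0;
}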
@@ -2891,9 +2962,8 @@ static bool kswapd_shrink_zone(struct zone *zone, | |||
2891 | * high wmark plus a "gap" where the gap is either the low | 2962 | * high wmark plus a "gap" where the gap is either the low |
2892 | * watermark or 1% of the zone, whichever is smaller. | 2963 | * watermark or 1% of the zone, whichever is smaller. |
2893 | */ | 2964 | */ |
2894 | balance_gap = min(low_wmark_pages(zone), | 2965 | balance_gap = min(low_wmark_pages(zone), DIV_ROUND_UP( |
2895 | (zone->managed_pages + KSWAPD_ZONE_BALANCE_GAP_RATIO-1) / | 2966 | zone->managed_pages, KSWAPD_ZONE_BALANCE_GAP_RATIO)); |
2896 | KSWAPD_ZONE_BALANCE_GAP_RATIO); | ||
2897 | 2967 | ||
2898 | /* | 2968 | /* |
2899 | * If there is no low memory pressure or the zone is balanced then no | 2969 | * If there is no low memory pressure or the zone is balanced then no |
@@ -3422,7 +3492,7 @@ int kswapd_run(int nid) | |||
3422 | 3492 | ||
3423 | /* | 3493 | /* |
3424 | * Called by memory hotplug when all memory in a node is offlined. Caller must | 3494 | * Called by memory hotplug when all memory in a node is offlined. Caller must |
3425 | * hold lock_memory_hotplug(). | 3495 | * hold mem_hotplug_begin/end(). |
3426 | */ | 3496 | */ |
3427 | void kswapd_stop(int nid) | 3497 | void kswapd_stop(int nid) |
3428 | { | 3498 | { |
diff --git a/mm/vmstat.c b/mm/vmstat.c index 302dd076b8bf..b37bd49bfd55 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c | |||
@@ -207,7 +207,9 @@ void set_pgdat_percpu_threshold(pg_data_t *pgdat, | |||
207 | } | 207 | } |
208 | 208 | ||
209 | /* | 209 | /* |
210 | * For use when we know that interrupts are disabled. | 210 | * For use when we know that interrupts are disabled, |
211 | * or when we know that preemption is disabled and that | ||
212 | * particular counter cannot be updated from interrupt context. | ||
211 | */ | 213 | */ |
212 | void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item, | 214 | void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item, |
213 | int delta) | 215 | int delta) |
@@ -489,7 +491,7 @@ static void refresh_cpu_vm_stats(void) | |||
489 | continue; | 491 | continue; |
490 | 492 | ||
491 | if (__this_cpu_read(p->pcp.count)) | 493 | if (__this_cpu_read(p->pcp.count)) |
492 | drain_zone_pages(zone, __this_cpu_ptr(&p->pcp)); | 494 | drain_zone_pages(zone, this_cpu_ptr(&p->pcp)); |
493 | #endif | 495 | #endif |
494 | } | 496 | } |
495 | fold_diff(global_diff); | 497 | fold_diff(global_diff); |
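This hunk, like the vmstat_update() and zs_unmap_object() hunks later in the page, is part of the conversion from the deprecated __get_cpu_var()/__this_cpu_ptr() accessors to this_cpu_ptr(). A minimal kernel-module sketch of the new spelling; the module and its demo_counter variable are invented for illustration:

#include <linux/module.h>
#include <linux/percpu.h>
#include <linux/preempt.h>

static DEFINE_PER_CPU(int, demo_counter);

static int __init percpu_demo_init(void)
{
	int *p;

	preempt_disable();
	/* this_cpu_ptr(&var) replaces the old &__get_cpu_var(var) spelling */
	p = this_cpu_ptr(&demo_counter);
	(*p)++;
	pr_info("demo_counter on this CPU: %d\n", *p);
	preempt_enable();

	return 0;
}

static void __exit percpu_demo_exit(void)
{
}

module_init(percpu_demo_init);
module_exit(percpu_demo_exit);
MODULE_LICENSE("GPL");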
@@ -866,6 +868,10 @@ const char * const vmstat_text[] = { | |||
866 | "nr_tlb_local_flush_one", | 868 | "nr_tlb_local_flush_one", |
867 | #endif /* CONFIG_DEBUG_TLBFLUSH */ | 869 | #endif /* CONFIG_DEBUG_TLBFLUSH */ |
868 | 870 | ||
871 | #ifdef CONFIG_DEBUG_VM_VMACACHE | ||
872 | "vmacache_find_calls", | ||
873 | "vmacache_find_hits", | ||
874 | #endif | ||
869 | #endif /* CONFIG_VM_EVENTS_COUNTERS */ | 875 | #endif /* CONFIG_VM_EVENTS_COUNTERS */ |
870 | }; | 876 | }; |
871 | #endif /* CONFIG_PROC_FS || CONFIG_SYSFS || CONFIG_NUMA */ | 877 | #endif /* CONFIG_PROC_FS || CONFIG_SYSFS || CONFIG_NUMA */ |
@@ -1226,7 +1232,7 @@ int sysctl_stat_interval __read_mostly = HZ; | |||
1226 | static void vmstat_update(struct work_struct *w) | 1232 | static void vmstat_update(struct work_struct *w) |
1227 | { | 1233 | { |
1228 | refresh_cpu_vm_stats(); | 1234 | refresh_cpu_vm_stats(); |
1229 | schedule_delayed_work(&__get_cpu_var(vmstat_work), | 1235 | schedule_delayed_work(this_cpu_ptr(&vmstat_work), |
1230 | round_jiffies_relative(sysctl_stat_interval)); | 1236 | round_jiffies_relative(sysctl_stat_interval)); |
1231 | } | 1237 | } |
1232 | 1238 | ||
diff --git a/mm/zbud.c b/mm/zbud.c --- a/mm/zbud.c +++ b/mm/zbud.c | |||
@@ -247,7 +247,7 @@ void zbud_destroy_pool(struct zbud_pool *pool) | |||
247 | * gfp arguments are invalid or -ENOMEM if the pool was unable to allocate | 247 | * gfp arguments are invalid or -ENOMEM if the pool was unable to allocate |
248 | * a new page. | 248 | * a new page. |
249 | */ | 249 | */ |
250 | int zbud_alloc(struct zbud_pool *pool, int size, gfp_t gfp, | 250 | int zbud_alloc(struct zbud_pool *pool, unsigned int size, gfp_t gfp, |
251 | unsigned long *handle) | 251 | unsigned long *handle) |
252 | { | 252 | { |
253 | int chunks, i, freechunks; | 253 | int chunks, i, freechunks; |
@@ -255,7 +255,7 @@ int zbud_alloc(struct zbud_pool *pool, int size, gfp_t gfp, | |||
255 | enum buddy bud; | 255 | enum buddy bud; |
256 | struct page *page; | 256 | struct page *page; |
257 | 257 | ||
258 | if (size <= 0 || gfp & __GFP_HIGHMEM) | 258 | if (!size || (gfp & __GFP_HIGHMEM)) |
259 | return -EINVAL; | 259 | return -EINVAL; |
260 | if (size > PAGE_SIZE - ZHDR_SIZE_ALIGNED - CHUNK_SIZE) | 260 | if (size > PAGE_SIZE - ZHDR_SIZE_ALIGNED - CHUNK_SIZE) |
261 | return -ENOSPC; | 261 | return -ENOSPC; |
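With size now unsigned, the only remaining "too small" case is zero, so the signed size <= 0 test collapses to !size while the upper bound still rejects oversized requests. A standalone sketch of the reworked argument check; the ZHDR_SIZE_ALIGNED and CHUNK_SIZE values here are placeholders, not the real zbud constants:

#include <stdio.h>

#define PAGE_SIZE 4096UL
#define ZHDR_SIZE_ALIGNED 64UL	/* placeholder; derived from struct zbud_header */
#define CHUNK_SIZE 64UL		/* placeholder; derived from CHUNK_SHIFT */

/* Reject empty requests and requests that cannot fit in a zbud page. */
static int check_size(unsigned int size)
{
	if (!size)
		return -1;	/* -EINVAL in the kernel */
	if (size > PAGE_SIZE - ZHDR_SIZE_ALIGNED - CHUNK_SIZE)
		return -2;	/* -ENOSPC in the kernel */
	return 0;
}

int main(void)
{
	printf("size 0      -> %d\n", check_size(0));
	printf("size 100    -> %d\n", check_size(100));
	printf("size 100000 -> %d\n", check_size(100000));
	return 0;
}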
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 36b4591a7a2d..fe78189624cf 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c | |||
@@ -141,7 +141,7 @@ | |||
141 | #define ZS_MAX_ALLOC_SIZE PAGE_SIZE | 141 | #define ZS_MAX_ALLOC_SIZE PAGE_SIZE |
142 | 142 | ||
143 | /* | 143 | /* |
144 | * On systems with 4K page size, this gives 254 size classes! There is a | 144 | * On systems with 4K page size, this gives 255 size classes! There is a |
145 | * trade-off here: | 145 | * trade-off here: |
146 | * - Large number of size classes is potentially wasteful as free pages are | 146 | * - Large number of size classes is potentially wasteful as free pages are |
147 | * spread across these classes | 147 | * spread across these classes |
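The comment fix matches the arithmetic: assuming zsmalloc's usual constants for 4K pages (32-byte minimum object, class step of PAGE_SIZE >> 8 = 16 bytes), (max - min) / step + 1 gives 255 classes, not 254. A one-off check:

#include <stdio.h>

int main(void)
{
	/* Assumed to match zsmalloc's defaults for 4K pages. */
	unsigned long page_size = 4096;
	unsigned long zs_min_alloc_size = 32;
	unsigned long zs_max_alloc_size = page_size;
	unsigned long zs_size_class_delta = page_size >> 8;	/* 16 bytes */

	unsigned long classes = (zs_max_alloc_size - zs_min_alloc_size) /
				zs_size_class_delta + 1;

	printf("%lu size classes\n", classes);	/* prints 255 */
	return 0;
}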
@@ -1082,7 +1082,7 @@ void zs_unmap_object(struct zs_pool *pool, unsigned long handle) | |||
1082 | class = &pool->size_class[class_idx]; | 1082 | class = &pool->size_class[class_idx]; |
1083 | off = obj_idx_to_offset(page, obj_idx, class->size); | 1083 | off = obj_idx_to_offset(page, obj_idx, class->size); |
1084 | 1084 | ||
1085 | area = &__get_cpu_var(zs_map_area); | 1085 | area = this_cpu_ptr(&zs_map_area); |
1086 | if (off + class->size <= PAGE_SIZE) | 1086 | if (off + class->size <= PAGE_SIZE) |
1087 | kunmap_atomic(area->vm_addr); | 1087 | kunmap_atomic(area->vm_addr); |
1088 | else { | 1088 | else { |
diff --git a/mm/zswap.c b/mm/zswap.c index aeaef0fb5624..008388fe7b0f 100644 --- a/mm/zswap.c +++ b/mm/zswap.c | |||
@@ -347,7 +347,7 @@ static int __zswap_cpu_notifier(unsigned long action, unsigned long cpu) | |||
347 | return NOTIFY_BAD; | 347 | return NOTIFY_BAD; |
348 | } | 348 | } |
349 | *per_cpu_ptr(zswap_comp_pcpu_tfms, cpu) = tfm; | 349 | *per_cpu_ptr(zswap_comp_pcpu_tfms, cpu) = tfm; |
350 | dst = kmalloc(PAGE_SIZE * 2, GFP_KERNEL); | 350 | dst = kmalloc_node(PAGE_SIZE * 2, GFP_KERNEL, cpu_to_node(cpu)); |
351 | if (!dst) { | 351 | if (!dst) { |
352 | pr_err("can't allocate compressor buffer\n"); | 352 | pr_err("can't allocate compressor buffer\n"); |
353 | crypto_free_comp(tfm); | 353 | crypto_free_comp(tfm); |
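The zswap change allocates each CPU's compressor scratch buffer on that CPU's own NUMA node rather than on whichever node happens to run the notifier. A small kernel-module sketch of the same node-local allocation pattern; the module name and PAGE_SIZE-sized buffer are invented for illustration (zswap itself uses PAGE_SIZE * 2):

#include <linux/module.h>
#include <linux/slab.h>
#include <linux/mm.h>
#include <linux/cpumask.h>
#include <linux/topology.h>

static int __init node_local_demo_init(void)
{
	int cpu;

	/* Give each CPU a scratch buffer on its own node, as the zswap
	 * notifier now does for its per-CPU compressor buffers. */
	for_each_possible_cpu(cpu) {
		void *buf = kmalloc_node(PAGE_SIZE, GFP_KERNEL,
					 cpu_to_node(cpu));

		if (!buf)
			return -ENOMEM;
		pr_info("cpu %d: buffer on node %d\n", cpu, cpu_to_node(cpu));
		kfree(buf);
	}
	return 0;
}

static void __exit node_local_demo_exit(void)
{
}

module_init(node_local_demo_init);
module_exit(node_local_demo_exit);
MODULE_LICENSE("GPL");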