Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig           |  27
-rw-r--r--  mm/backing-dev.c     |  11
-rw-r--r--  mm/bootmem.c         |  32
-rw-r--r--  mm/bounce.c          |  48
-rw-r--r--  mm/cleancache.c      |   2
-rw-r--r--  mm/compaction.c      | 133
-rw-r--r--  mm/fadvise.c         |  20
-rw-r--r--  mm/filemap.c         |   7
-rw-r--r--  mm/fremap.c          |  51
-rw-r--r--  mm/huge_memory.c     | 113
-rw-r--r--  mm/hugetlb.c         |  39
-rw-r--r--  mm/internal.h        |   7
-rw-r--r--  mm/kmemleak.c        |  14
-rw-r--r--  mm/ksm.c             | 670
-rw-r--r--  mm/madvise.c         | 105
-rw-r--r--  mm/memblock.c        |  70
-rw-r--r--  mm/memcontrol.c      | 477
-rw-r--r--  mm/memory-failure.c  | 202
-rw-r--r--  mm/memory.c          | 127
-rw-r--r--  mm/memory_hotplug.c  | 553
-rw-r--r--  mm/mempolicy.c       |  59
-rw-r--r--  mm/migrate.c         | 154
-rw-r--r--  mm/mincore.c         |   5
-rw-r--r--  mm/mlock.c           | 137
-rw-r--r--  mm/mm_init.c         |  31
-rw-r--r--  mm/mmap.c            | 123
-rw-r--r--  mm/mmu_notifier.c    | 102
-rw-r--r--  mm/mmzone.c          |  20
-rw-r--r--  mm/mremap.c          |  28
-rw-r--r--  mm/nobootmem.c       |  23
-rw-r--r--  mm/nommu.c           |  41
-rw-r--r--  mm/oom_kill.c        |   6
-rw-r--r--  mm/page-writeback.c  |  28
-rw-r--r--  mm/page_alloc.c      | 498
-rw-r--r--  mm/rmap.c            |  30
-rw-r--r--  mm/shmem.c           | 102
-rw-r--r--  mm/slab.c            |   2
-rw-r--r--  mm/slob.c            |   2
-rw-r--r--  mm/slub.c            |   4
-rw-r--r--  mm/sparse.c          |  12
-rw-r--r--  mm/swap.c            |   9
-rw-r--r--  mm/swap_state.c      |  58
-rw-r--r--  mm/swapfile.c        | 176
-rw-r--r--  mm/util.c            |  26
-rw-r--r--  mm/vmalloc.c         |  33
-rw-r--r--  mm/vmscan.c          | 397
-rw-r--r--  mm/vmstat.c          |   7
47 files changed, 3230 insertions, 1591 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 278e3ab1f169..ae55c1e04d10 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -1,6 +1,6 @@ | |||
1 | config SELECT_MEMORY_MODEL | 1 | config SELECT_MEMORY_MODEL |
2 | def_bool y | 2 | def_bool y |
3 | depends on EXPERIMENTAL || ARCH_SELECT_MEMORY_MODEL | 3 | depends on ARCH_SELECT_MEMORY_MODEL |
4 | 4 | ||
5 | choice | 5 | choice |
6 | prompt "Memory model" | 6 | prompt "Memory model" |
@@ -162,10 +162,16 @@ config MOVABLE_NODE | |||
162 | Say Y here if you want to hotplug a whole node. | 162 | Say Y here if you want to hotplug a whole node. |
163 | Say N here if you want kernel to use memory on all nodes evenly. | 163 | Say N here if you want kernel to use memory on all nodes evenly. |
164 | 164 | ||
165 | # | ||
166 | # Only be set on architectures that have completely implemented memory hotplug | ||
167 | # feature. If you are not sure, don't touch it. | ||
168 | # | ||
169 | config HAVE_BOOTMEM_INFO_NODE | ||
170 | def_bool n | ||
171 | |||
165 | # eventually, we can have this option just 'select SPARSEMEM' | 172 | # eventually, we can have this option just 'select SPARSEMEM' |
166 | config MEMORY_HOTPLUG | 173 | config MEMORY_HOTPLUG |
167 | bool "Allow for memory hot-add" | 174 | bool "Allow for memory hot-add" |
168 | select MEMORY_ISOLATION | ||
169 | depends on SPARSEMEM || X86_64_ACPI_NUMA | 175 | depends on SPARSEMEM || X86_64_ACPI_NUMA |
170 | depends on HOTPLUG && ARCH_ENABLE_MEMORY_HOTPLUG | 176 | depends on HOTPLUG && ARCH_ENABLE_MEMORY_HOTPLUG |
171 | depends on (IA64 || X86 || PPC_BOOK3S_64 || SUPERH || S390) | 177 | depends on (IA64 || X86 || PPC_BOOK3S_64 || SUPERH || S390) |
@@ -176,6 +182,8 @@ config MEMORY_HOTPLUG_SPARSE | |||
176 | 182 | ||
177 | config MEMORY_HOTREMOVE | 183 | config MEMORY_HOTREMOVE |
178 | bool "Allow for memory hot remove" | 184 | bool "Allow for memory hot remove" |
185 | select MEMORY_ISOLATION | ||
186 | select HAVE_BOOTMEM_INFO_NODE if X86_64 | ||
179 | depends on MEMORY_HOTPLUG && ARCH_ENABLE_MEMORY_HOTREMOVE | 187 | depends on MEMORY_HOTPLUG && ARCH_ENABLE_MEMORY_HOTREMOVE |
180 | depends on MIGRATION | 188 | depends on MIGRATION |
181 | 189 | ||
@@ -258,6 +266,19 @@ config BOUNCE | |||
258 | def_bool y | 266 | def_bool y |
259 | depends on BLOCK && MMU && (ZONE_DMA || HIGHMEM) | 267 | depends on BLOCK && MMU && (ZONE_DMA || HIGHMEM) |
260 | 268 | ||
269 | # On the 'tile' arch, USB OHCI needs the bounce pool since tilegx will often | ||
270 | # have more than 4GB of memory, but we don't currently use the IOTLB to present | ||
271 | # a 32-bit address to OHCI. So we need to use a bounce pool instead. | ||
272 | # | ||
273 | # We also use the bounce pool to provide stable page writes for jbd. jbd | ||
274 | # initiates buffer writeback without locking the page or setting PG_writeback, | ||
275 | # and fixing that behavior (a second time; jbd2 doesn't have this problem) is | ||
276 | # a major rework effort. Instead, use the bounce buffer to snapshot pages | ||
277 | # (until jbd goes away). The only jbd user is ext3. | ||
278 | config NEED_BOUNCE_POOL | ||
279 | bool | ||
280 | default y if (TILE && USB_OHCI_HCD) || (BLK_DEV_INTEGRITY && JBD) | ||
281 | |||
261 | config NR_QUICK | 282 | config NR_QUICK |
262 | int | 283 | int |
263 | depends on QUICKLIST | 284 | depends on QUICKLIST |
@@ -266,7 +287,7 @@ config NR_QUICK | |||
266 | 287 | ||
267 | config VIRT_TO_BUS | 288 | config VIRT_TO_BUS |
268 | def_bool y | 289 | def_bool y |
269 | depends on !ARCH_NO_VIRT_TO_BUS | 290 | depends on HAVE_VIRT_TO_BUS |
270 | 291 | ||
271 | config MMU_NOTIFIER | 292 | config MMU_NOTIFIER |
272 | bool | 293 | bool |
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index d3ca2b3ee176..41733c5dc820 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -221,12 +221,23 @@ static ssize_t max_ratio_store(struct device *dev, | |||
221 | } | 221 | } |
222 | BDI_SHOW(max_ratio, bdi->max_ratio) | 222 | BDI_SHOW(max_ratio, bdi->max_ratio) |
223 | 223 | ||
224 | static ssize_t stable_pages_required_show(struct device *dev, | ||
225 | struct device_attribute *attr, | ||
226 | char *page) | ||
227 | { | ||
228 | struct backing_dev_info *bdi = dev_get_drvdata(dev); | ||
229 | |||
230 | return snprintf(page, PAGE_SIZE-1, "%d\n", | ||
231 | bdi_cap_stable_pages_required(bdi) ? 1 : 0); | ||
232 | } | ||
233 | |||
224 | #define __ATTR_RW(attr) __ATTR(attr, 0644, attr##_show, attr##_store) | 234 | #define __ATTR_RW(attr) __ATTR(attr, 0644, attr##_show, attr##_store) |
225 | 235 | ||
226 | static struct device_attribute bdi_dev_attrs[] = { | 236 | static struct device_attribute bdi_dev_attrs[] = { |
227 | __ATTR_RW(read_ahead_kb), | 237 | __ATTR_RW(read_ahead_kb), |
228 | __ATTR_RW(min_ratio), | 238 | __ATTR_RW(min_ratio), |
229 | __ATTR_RW(max_ratio), | 239 | __ATTR_RW(max_ratio), |
240 | __ATTR_RO(stable_pages_required), | ||
230 | __ATTR_NULL, | 241 | __ATTR_NULL, |
231 | }; | 242 | }; |
232 | 243 | ||
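
The mm/backing-dev.c hunk above exports the new per-BDI flag through sysfs. As a quick illustration only (not part of the patch), userspace can read the attribute back; the /sys/class/bdi/<major>:<minor>/ directory layout is the existing BDI sysfs convention, and 8:0 below is just a placeholder device:

/* Illustrative userspace reader for the new attribute (device path is a
 * placeholder, adjust to a real block device's major:minor).
 */
#include <stdio.h>

int main(void)
{
	const char *path = "/sys/class/bdi/8:0/stable_pages_required";
	FILE *f = fopen(path, "r");
	int val;

	if (!f) {
		perror(path);
		return 1;
	}
	if (fscanf(f, "%d", &val) == 1)
		printf("stable_pages_required = %d\n", val);
	fclose(f);
	return 0;
}
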
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 1324cd74faec..2b0bcb019ec2 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -185,10 +185,23 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) | |||
185 | 185 | ||
186 | while (start < end) { | 186 | while (start < end) { |
187 | unsigned long *map, idx, vec; | 187 | unsigned long *map, idx, vec; |
188 | unsigned shift; | ||
188 | 189 | ||
189 | map = bdata->node_bootmem_map; | 190 | map = bdata->node_bootmem_map; |
190 | idx = start - bdata->node_min_pfn; | 191 | idx = start - bdata->node_min_pfn; |
192 | shift = idx & (BITS_PER_LONG - 1); | ||
193 | /* | ||
194 | * vec holds at most BITS_PER_LONG map bits, | ||
195 | * bit 0 corresponds to start. | ||
196 | */ | ||
191 | vec = ~map[idx / BITS_PER_LONG]; | 197 | vec = ~map[idx / BITS_PER_LONG]; |
198 | |||
199 | if (shift) { | ||
200 | vec >>= shift; | ||
201 | if (end - start >= BITS_PER_LONG) | ||
202 | vec |= ~map[idx / BITS_PER_LONG + 1] << | ||
203 | (BITS_PER_LONG - shift); | ||
204 | } | ||
192 | /* | 205 | /* |
193 | * If we have a properly aligned and fully unreserved | 206 | * If we have a properly aligned and fully unreserved |
194 | * BITS_PER_LONG block of pages in front of us, free | 207 | * BITS_PER_LONG block of pages in front of us, free |
@@ -201,19 +214,18 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) | |||
201 | count += BITS_PER_LONG; | 214 | count += BITS_PER_LONG; |
202 | start += BITS_PER_LONG; | 215 | start += BITS_PER_LONG; |
203 | } else { | 216 | } else { |
204 | unsigned long off = 0; | 217 | unsigned long cur = start; |
205 | 218 | ||
206 | vec >>= start & (BITS_PER_LONG - 1); | 219 | start = ALIGN(start + 1, BITS_PER_LONG); |
207 | while (vec) { | 220 | while (vec && cur != start) { |
208 | if (vec & 1) { | 221 | if (vec & 1) { |
209 | page = pfn_to_page(start + off); | 222 | page = pfn_to_page(cur); |
210 | __free_pages_bootmem(page, 0); | 223 | __free_pages_bootmem(page, 0); |
211 | count++; | 224 | count++; |
212 | } | 225 | } |
213 | vec >>= 1; | 226 | vec >>= 1; |
214 | off++; | 227 | ++cur; |
215 | } | 228 | } |
216 | start = ALIGN(start + 1, BITS_PER_LONG); | ||
217 | } | 229 | } |
218 | } | 230 | } |
219 | 231 | ||
@@ -821,6 +833,14 @@ void * __init __alloc_bootmem_low(unsigned long size, unsigned long align, | |||
821 | return ___alloc_bootmem(size, align, goal, ARCH_LOW_ADDRESS_LIMIT); | 833 | return ___alloc_bootmem(size, align, goal, ARCH_LOW_ADDRESS_LIMIT); |
822 | } | 834 | } |
823 | 835 | ||
836 | void * __init __alloc_bootmem_low_nopanic(unsigned long size, | ||
837 | unsigned long align, | ||
838 | unsigned long goal) | ||
839 | { | ||
840 | return ___alloc_bootmem_nopanic(size, align, goal, | ||
841 | ARCH_LOW_ADDRESS_LIMIT); | ||
842 | } | ||
843 | |||
824 | /** | 844 | /** |
825 | * __alloc_bootmem_low_node - allocate low boot memory from a specific node | 845 | * __alloc_bootmem_low_node - allocate low boot memory from a specific node |
826 | * @pgdat: node to allocate from | 846 | * @pgdat: node to allocate from |
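
The core of the mm/bootmem.c change is building a BITS_PER_LONG-wide window of free-page bits that may start in the middle of one map word and spill into the next. A standalone sketch of that bit manipulation, written purely for illustration (the map contents and the free_window() helper are made up; only the shift/merge arithmetic mirrors the hunk):

/* Sketch of the unaligned-start handling: invert the current map word
 * (a set bit means "reserved", clear means "free"), shift out the bits
 * below 'idx', and pull the missing high bits in from the next word.
 */
#include <stdio.h>

#define BITS_PER_LONG (8 * sizeof(unsigned long))

static unsigned long free_window(const unsigned long *map,
				 unsigned long idx, unsigned long nbits)
{
	unsigned long shift = idx & (BITS_PER_LONG - 1);
	unsigned long vec = ~map[idx / BITS_PER_LONG];

	if (shift) {
		vec >>= shift;
		if (nbits >= BITS_PER_LONG)
			vec |= ~map[idx / BITS_PER_LONG + 1] <<
				(BITS_PER_LONG - shift);
	}
	return vec;	/* bit 0 now corresponds to page 'idx' */
}

int main(void)
{
	/* Two made-up map words for demonstration. */
	unsigned long map[2] = { 0xff00ff00UL, 0x0fUL };

	printf("window at 4: %#lx\n",
	       free_window(map, 4, 2 * BITS_PER_LONG));
	return 0;
}
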
diff --git a/mm/bounce.c b/mm/bounce.c
index 042086775561..5f8901768602 100644
--- a/mm/bounce.c
+++ b/mm/bounce.c
@@ -178,8 +178,45 @@ static void bounce_end_io_read_isa(struct bio *bio, int err) | |||
178 | __bounce_end_io_read(bio, isa_page_pool, err); | 178 | __bounce_end_io_read(bio, isa_page_pool, err); |
179 | } | 179 | } |
180 | 180 | ||
181 | #ifdef CONFIG_NEED_BOUNCE_POOL | ||
182 | static int must_snapshot_stable_pages(struct request_queue *q, struct bio *bio) | ||
183 | { | ||
184 | struct page *page; | ||
185 | struct backing_dev_info *bdi; | ||
186 | struct address_space *mapping; | ||
187 | struct bio_vec *from; | ||
188 | int i; | ||
189 | |||
190 | if (bio_data_dir(bio) != WRITE) | ||
191 | return 0; | ||
192 | |||
193 | if (!bdi_cap_stable_pages_required(&q->backing_dev_info)) | ||
194 | return 0; | ||
195 | |||
196 | /* | ||
197 | * Based on the first page that has a valid mapping, decide whether or | ||
198 | * not we have to employ bounce buffering to guarantee stable pages. | ||
199 | */ | ||
200 | bio_for_each_segment(from, bio, i) { | ||
201 | page = from->bv_page; | ||
202 | mapping = page_mapping(page); | ||
203 | if (!mapping) | ||
204 | continue; | ||
205 | bdi = mapping->backing_dev_info; | ||
206 | return mapping->host->i_sb->s_flags & MS_SNAP_STABLE; | ||
207 | } | ||
208 | |||
209 | return 0; | ||
210 | } | ||
211 | #else | ||
212 | static int must_snapshot_stable_pages(struct request_queue *q, struct bio *bio) | ||
213 | { | ||
214 | return 0; | ||
215 | } | ||
216 | #endif /* CONFIG_NEED_BOUNCE_POOL */ | ||
217 | |||
181 | static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig, | 218 | static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig, |
182 | mempool_t *pool) | 219 | mempool_t *pool, int force) |
183 | { | 220 | { |
184 | struct page *page; | 221 | struct page *page; |
185 | struct bio *bio = NULL; | 222 | struct bio *bio = NULL; |
@@ -192,7 +229,7 @@ static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig, | |||
192 | /* | 229 | /* |
193 | * is destination page below bounce pfn? | 230 | * is destination page below bounce pfn? |
194 | */ | 231 | */ |
195 | if (page_to_pfn(page) <= queue_bounce_pfn(q)) | 232 | if (page_to_pfn(page) <= queue_bounce_pfn(q) && !force) |
196 | continue; | 233 | continue; |
197 | 234 | ||
198 | /* | 235 | /* |
@@ -270,6 +307,7 @@ static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig, | |||
270 | 307 | ||
271 | void blk_queue_bounce(struct request_queue *q, struct bio **bio_orig) | 308 | void blk_queue_bounce(struct request_queue *q, struct bio **bio_orig) |
272 | { | 309 | { |
310 | int must_bounce; | ||
273 | mempool_t *pool; | 311 | mempool_t *pool; |
274 | 312 | ||
275 | /* | 313 | /* |
@@ -278,13 +316,15 @@ void blk_queue_bounce(struct request_queue *q, struct bio **bio_orig) | |||
278 | if (!bio_has_data(*bio_orig)) | 316 | if (!bio_has_data(*bio_orig)) |
279 | return; | 317 | return; |
280 | 318 | ||
319 | must_bounce = must_snapshot_stable_pages(q, *bio_orig); | ||
320 | |||
281 | /* | 321 | /* |
282 | * for non-isa bounce case, just check if the bounce pfn is equal | 322 | * for non-isa bounce case, just check if the bounce pfn is equal |
283 | * to or bigger than the highest pfn in the system -- in that case, | 323 | * to or bigger than the highest pfn in the system -- in that case, |
284 | * don't waste time iterating over bio segments | 324 | * don't waste time iterating over bio segments |
285 | */ | 325 | */ |
286 | if (!(q->bounce_gfp & GFP_DMA)) { | 326 | if (!(q->bounce_gfp & GFP_DMA)) { |
287 | if (queue_bounce_pfn(q) >= blk_max_pfn) | 327 | if (queue_bounce_pfn(q) >= blk_max_pfn && !must_bounce) |
288 | return; | 328 | return; |
289 | pool = page_pool; | 329 | pool = page_pool; |
290 | } else { | 330 | } else { |
@@ -295,7 +335,7 @@ void blk_queue_bounce(struct request_queue *q, struct bio **bio_orig) | |||
295 | /* | 335 | /* |
296 | * slow path | 336 | * slow path |
297 | */ | 337 | */ |
298 | __blk_queue_bounce(q, bio_orig, pool); | 338 | __blk_queue_bounce(q, bio_orig, pool, must_bounce); |
299 | } | 339 | } |
300 | 340 | ||
301 | EXPORT_SYMBOL(blk_queue_bounce); | 341 | EXPORT_SYMBOL(blk_queue_bounce); |
diff --git a/mm/cleancache.c b/mm/cleancache.c
index 32e6f4136fa2..d76ba74be2d0 100644
--- a/mm/cleancache.c
+++ b/mm/cleancache.c
@@ -89,7 +89,7 @@ static int cleancache_get_key(struct inode *inode, | |||
89 | fhfn = sb->s_export_op->encode_fh; | 89 | fhfn = sb->s_export_op->encode_fh; |
90 | if (fhfn) { | 90 | if (fhfn) { |
91 | len = (*fhfn)(inode, &key->u.fh[0], &maxlen, NULL); | 91 | len = (*fhfn)(inode, &key->u.fh[0], &maxlen, NULL); |
92 | if (len <= 0 || len == 255) | 92 | if (len <= FILEID_ROOT || len == FILEID_INVALID) |
93 | return -1; | 93 | return -1; |
94 | if (maxlen > CLEANCACHE_KEY_MAX) | 94 | if (maxlen > CLEANCACHE_KEY_MAX) |
95 | return -1; | 95 | return -1; |
diff --git a/mm/compaction.c b/mm/compaction.c
index 6b807e466497..05ccb4cc0bdb 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -15,6 +15,7 @@ | |||
15 | #include <linux/sysctl.h> | 15 | #include <linux/sysctl.h> |
16 | #include <linux/sysfs.h> | 16 | #include <linux/sysfs.h> |
17 | #include <linux/balloon_compaction.h> | 17 | #include <linux/balloon_compaction.h> |
18 | #include <linux/page-isolation.h> | ||
18 | #include "internal.h" | 19 | #include "internal.h" |
19 | 20 | ||
20 | #ifdef CONFIG_COMPACTION | 21 | #ifdef CONFIG_COMPACTION |
@@ -85,7 +86,7 @@ static inline bool isolation_suitable(struct compact_control *cc, | |||
85 | static void __reset_isolation_suitable(struct zone *zone) | 86 | static void __reset_isolation_suitable(struct zone *zone) |
86 | { | 87 | { |
87 | unsigned long start_pfn = zone->zone_start_pfn; | 88 | unsigned long start_pfn = zone->zone_start_pfn; |
88 | unsigned long end_pfn = zone->zone_start_pfn + zone->spanned_pages; | 89 | unsigned long end_pfn = zone_end_pfn(zone); |
89 | unsigned long pfn; | 90 | unsigned long pfn; |
90 | 91 | ||
91 | zone->compact_cached_migrate_pfn = start_pfn; | 92 | zone->compact_cached_migrate_pfn = start_pfn; |
@@ -215,7 +216,10 @@ static bool suitable_migration_target(struct page *page) | |||
215 | int migratetype = get_pageblock_migratetype(page); | 216 | int migratetype = get_pageblock_migratetype(page); |
216 | 217 | ||
217 | /* Don't interfere with memory hot-remove or the min_free_kbytes blocks */ | 218 | /* Don't interfere with memory hot-remove or the min_free_kbytes blocks */ |
218 | if (migratetype == MIGRATE_ISOLATE || migratetype == MIGRATE_RESERVE) | 219 | if (migratetype == MIGRATE_RESERVE) |
220 | return false; | ||
221 | |||
222 | if (is_migrate_isolate(migratetype)) | ||
219 | return false; | 223 | return false; |
220 | 224 | ||
221 | /* If the page is a large free page, then allow migration */ | 225 | /* If the page is a large free page, then allow migration */ |
@@ -611,8 +615,7 @@ check_compact_cluster: | |||
611 | continue; | 615 | continue; |
612 | 616 | ||
613 | next_pageblock: | 617 | next_pageblock: |
614 | low_pfn += pageblock_nr_pages; | 618 | low_pfn = ALIGN(low_pfn + 1, pageblock_nr_pages) - 1; |
615 | low_pfn = ALIGN(low_pfn, pageblock_nr_pages) - 1; | ||
616 | last_pageblock_nr = pageblock_nr; | 619 | last_pageblock_nr = pageblock_nr; |
617 | } | 620 | } |
618 | 621 | ||
@@ -644,7 +647,7 @@ static void isolate_freepages(struct zone *zone, | |||
644 | struct compact_control *cc) | 647 | struct compact_control *cc) |
645 | { | 648 | { |
646 | struct page *page; | 649 | struct page *page; |
647 | unsigned long high_pfn, low_pfn, pfn, zone_end_pfn, end_pfn; | 650 | unsigned long high_pfn, low_pfn, pfn, z_end_pfn, end_pfn; |
648 | int nr_freepages = cc->nr_freepages; | 651 | int nr_freepages = cc->nr_freepages; |
649 | struct list_head *freelist = &cc->freepages; | 652 | struct list_head *freelist = &cc->freepages; |
650 | 653 | ||
@@ -663,7 +666,7 @@ static void isolate_freepages(struct zone *zone, | |||
663 | */ | 666 | */ |
664 | high_pfn = min(low_pfn, pfn); | 667 | high_pfn = min(low_pfn, pfn); |
665 | 668 | ||
666 | zone_end_pfn = zone->zone_start_pfn + zone->spanned_pages; | 669 | z_end_pfn = zone_end_pfn(zone); |
667 | 670 | ||
668 | /* | 671 | /* |
669 | * Isolate free pages until enough are available to migrate the | 672 | * Isolate free pages until enough are available to migrate the |
@@ -706,7 +709,7 @@ static void isolate_freepages(struct zone *zone, | |||
706 | * only scans within a pageblock | 709 | * only scans within a pageblock |
707 | */ | 710 | */ |
708 | end_pfn = ALIGN(pfn + 1, pageblock_nr_pages); | 711 | end_pfn = ALIGN(pfn + 1, pageblock_nr_pages); |
709 | end_pfn = min(end_pfn, zone_end_pfn); | 712 | end_pfn = min(end_pfn, z_end_pfn); |
710 | isolated = isolate_freepages_block(cc, pfn, end_pfn, | 713 | isolated = isolate_freepages_block(cc, pfn, end_pfn, |
711 | freelist, false); | 714 | freelist, false); |
712 | nr_freepages += isolated; | 715 | nr_freepages += isolated; |
@@ -795,7 +798,7 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, | |||
795 | low_pfn = max(cc->migrate_pfn, zone->zone_start_pfn); | 798 | low_pfn = max(cc->migrate_pfn, zone->zone_start_pfn); |
796 | 799 | ||
797 | /* Only scan within a pageblock boundary */ | 800 | /* Only scan within a pageblock boundary */ |
798 | end_pfn = ALIGN(low_pfn + pageblock_nr_pages, pageblock_nr_pages); | 801 | end_pfn = ALIGN(low_pfn + 1, pageblock_nr_pages); |
799 | 802 | ||
800 | /* Do not cross the free scanner or scan within a memory hole */ | 803 | /* Do not cross the free scanner or scan within a memory hole */ |
801 | if (end_pfn > cc->free_pfn || !pfn_valid(low_pfn)) { | 804 | if (end_pfn > cc->free_pfn || !pfn_valid(low_pfn)) { |
@@ -816,6 +819,7 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, | |||
816 | static int compact_finished(struct zone *zone, | 819 | static int compact_finished(struct zone *zone, |
817 | struct compact_control *cc) | 820 | struct compact_control *cc) |
818 | { | 821 | { |
822 | unsigned int order; | ||
819 | unsigned long watermark; | 823 | unsigned long watermark; |
820 | 824 | ||
821 | if (fatal_signal_pending(current)) | 825 | if (fatal_signal_pending(current)) |
@@ -850,22 +854,16 @@ static int compact_finished(struct zone *zone, | |||
850 | return COMPACT_CONTINUE; | 854 | return COMPACT_CONTINUE; |
851 | 855 | ||
852 | /* Direct compactor: Is a suitable page free? */ | 856 | /* Direct compactor: Is a suitable page free? */ |
853 | if (cc->page) { | 857 | for (order = cc->order; order < MAX_ORDER; order++) { |
854 | /* Was a suitable page captured? */ | 858 | struct free_area *area = &zone->free_area[order]; |
855 | if (*cc->page) | 859 | |
860 | /* Job done if page is free of the right migratetype */ | ||
861 | if (!list_empty(&area->free_list[cc->migratetype])) | ||
862 | return COMPACT_PARTIAL; | ||
863 | |||
864 | /* Job done if allocation would set block type */ | ||
865 | if (cc->order >= pageblock_order && area->nr_free) | ||
856 | return COMPACT_PARTIAL; | 866 | return COMPACT_PARTIAL; |
857 | } else { | ||
858 | unsigned int order; | ||
859 | for (order = cc->order; order < MAX_ORDER; order++) { | ||
860 | struct free_area *area = &zone->free_area[cc->order]; | ||
861 | /* Job done if page is free of the right migratetype */ | ||
862 | if (!list_empty(&area->free_list[cc->migratetype])) | ||
863 | return COMPACT_PARTIAL; | ||
864 | |||
865 | /* Job done if allocation would set block type */ | ||
866 | if (cc->order >= pageblock_order && area->nr_free) | ||
867 | return COMPACT_PARTIAL; | ||
868 | } | ||
869 | } | 867 | } |
870 | 868 | ||
871 | return COMPACT_CONTINUE; | 869 | return COMPACT_CONTINUE; |
@@ -921,65 +919,11 @@ unsigned long compaction_suitable(struct zone *zone, int order) | |||
921 | return COMPACT_CONTINUE; | 919 | return COMPACT_CONTINUE; |
922 | } | 920 | } |
923 | 921 | ||
924 | static void compact_capture_page(struct compact_control *cc) | ||
925 | { | ||
926 | unsigned long flags; | ||
927 | int mtype, mtype_low, mtype_high; | ||
928 | |||
929 | if (!cc->page || *cc->page) | ||
930 | return; | ||
931 | |||
932 | /* | ||
933 | * For MIGRATE_MOVABLE allocations we capture a suitable page ASAP | ||
934 | * regardless of the migratetype of the freelist is is captured from. | ||
935 | * This is fine because the order for a high-order MIGRATE_MOVABLE | ||
936 | * allocation is typically at least a pageblock size and overall | ||
937 | * fragmentation is not impaired. Other allocation types must | ||
938 | * capture pages from their own migratelist because otherwise they | ||
939 | * could pollute other pageblocks like MIGRATE_MOVABLE with | ||
940 | * difficult to move pages and making fragmentation worse overall. | ||
941 | */ | ||
942 | if (cc->migratetype == MIGRATE_MOVABLE) { | ||
943 | mtype_low = 0; | ||
944 | mtype_high = MIGRATE_PCPTYPES; | ||
945 | } else { | ||
946 | mtype_low = cc->migratetype; | ||
947 | mtype_high = cc->migratetype + 1; | ||
948 | } | ||
949 | |||
950 | /* Speculatively examine the free lists without zone lock */ | ||
951 | for (mtype = mtype_low; mtype < mtype_high; mtype++) { | ||
952 | int order; | ||
953 | for (order = cc->order; order < MAX_ORDER; order++) { | ||
954 | struct page *page; | ||
955 | struct free_area *area; | ||
956 | area = &(cc->zone->free_area[order]); | ||
957 | if (list_empty(&area->free_list[mtype])) | ||
958 | continue; | ||
959 | |||
960 | /* Take the lock and attempt capture of the page */ | ||
961 | if (!compact_trylock_irqsave(&cc->zone->lock, &flags, cc)) | ||
962 | return; | ||
963 | if (!list_empty(&area->free_list[mtype])) { | ||
964 | page = list_entry(area->free_list[mtype].next, | ||
965 | struct page, lru); | ||
966 | if (capture_free_page(page, cc->order, mtype)) { | ||
967 | spin_unlock_irqrestore(&cc->zone->lock, | ||
968 | flags); | ||
969 | *cc->page = page; | ||
970 | return; | ||
971 | } | ||
972 | } | ||
973 | spin_unlock_irqrestore(&cc->zone->lock, flags); | ||
974 | } | ||
975 | } | ||
976 | } | ||
977 | |||
978 | static int compact_zone(struct zone *zone, struct compact_control *cc) | 922 | static int compact_zone(struct zone *zone, struct compact_control *cc) |
979 | { | 923 | { |
980 | int ret; | 924 | int ret; |
981 | unsigned long start_pfn = zone->zone_start_pfn; | 925 | unsigned long start_pfn = zone->zone_start_pfn; |
982 | unsigned long end_pfn = zone->zone_start_pfn + zone->spanned_pages; | 926 | unsigned long end_pfn = zone_end_pfn(zone); |
983 | 927 | ||
984 | ret = compaction_suitable(zone, cc->order); | 928 | ret = compaction_suitable(zone, cc->order); |
985 | switch (ret) { | 929 | switch (ret) { |
@@ -1036,7 +980,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) | |||
1036 | 980 | ||
1037 | nr_migrate = cc->nr_migratepages; | 981 | nr_migrate = cc->nr_migratepages; |
1038 | err = migrate_pages(&cc->migratepages, compaction_alloc, | 982 | err = migrate_pages(&cc->migratepages, compaction_alloc, |
1039 | (unsigned long)cc, false, | 983 | (unsigned long)cc, |
1040 | cc->sync ? MIGRATE_SYNC_LIGHT : MIGRATE_ASYNC, | 984 | cc->sync ? MIGRATE_SYNC_LIGHT : MIGRATE_ASYNC, |
1041 | MR_COMPACTION); | 985 | MR_COMPACTION); |
1042 | update_nr_listpages(cc); | 986 | update_nr_listpages(cc); |
@@ -1054,9 +998,6 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) | |||
1054 | goto out; | 998 | goto out; |
1055 | } | 999 | } |
1056 | } | 1000 | } |
1057 | |||
1058 | /* Capture a page now if it is a suitable size */ | ||
1059 | compact_capture_page(cc); | ||
1060 | } | 1001 | } |
1061 | 1002 | ||
1062 | out: | 1003 | out: |
@@ -1069,8 +1010,7 @@ out: | |||
1069 | 1010 | ||
1070 | static unsigned long compact_zone_order(struct zone *zone, | 1011 | static unsigned long compact_zone_order(struct zone *zone, |
1071 | int order, gfp_t gfp_mask, | 1012 | int order, gfp_t gfp_mask, |
1072 | bool sync, bool *contended, | 1013 | bool sync, bool *contended) |
1073 | struct page **page) | ||
1074 | { | 1014 | { |
1075 | unsigned long ret; | 1015 | unsigned long ret; |
1076 | struct compact_control cc = { | 1016 | struct compact_control cc = { |
@@ -1080,7 +1020,6 @@ static unsigned long compact_zone_order(struct zone *zone, | |||
1080 | .migratetype = allocflags_to_migratetype(gfp_mask), | 1020 | .migratetype = allocflags_to_migratetype(gfp_mask), |
1081 | .zone = zone, | 1021 | .zone = zone, |
1082 | .sync = sync, | 1022 | .sync = sync, |
1083 | .page = page, | ||
1084 | }; | 1023 | }; |
1085 | INIT_LIST_HEAD(&cc.freepages); | 1024 | INIT_LIST_HEAD(&cc.freepages); |
1086 | INIT_LIST_HEAD(&cc.migratepages); | 1025 | INIT_LIST_HEAD(&cc.migratepages); |
@@ -1110,7 +1049,7 @@ int sysctl_extfrag_threshold = 500; | |||
1110 | */ | 1049 | */ |
1111 | unsigned long try_to_compact_pages(struct zonelist *zonelist, | 1050 | unsigned long try_to_compact_pages(struct zonelist *zonelist, |
1112 | int order, gfp_t gfp_mask, nodemask_t *nodemask, | 1051 | int order, gfp_t gfp_mask, nodemask_t *nodemask, |
1113 | bool sync, bool *contended, struct page **page) | 1052 | bool sync, bool *contended) |
1114 | { | 1053 | { |
1115 | enum zone_type high_zoneidx = gfp_zone(gfp_mask); | 1054 | enum zone_type high_zoneidx = gfp_zone(gfp_mask); |
1116 | int may_enter_fs = gfp_mask & __GFP_FS; | 1055 | int may_enter_fs = gfp_mask & __GFP_FS; |
@@ -1136,7 +1075,7 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist, | |||
1136 | int status; | 1075 | int status; |
1137 | 1076 | ||
1138 | status = compact_zone_order(zone, order, gfp_mask, sync, | 1077 | status = compact_zone_order(zone, order, gfp_mask, sync, |
1139 | contended, page); | 1078 | contended); |
1140 | rc = max(status, rc); | 1079 | rc = max(status, rc); |
1141 | 1080 | ||
1142 | /* If a normal allocation would succeed, stop compacting */ | 1081 | /* If a normal allocation would succeed, stop compacting */ |
@@ -1150,7 +1089,7 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist, | |||
1150 | 1089 | ||
1151 | 1090 | ||
1152 | /* Compact all zones within a node */ | 1091 | /* Compact all zones within a node */ |
1153 | static int __compact_pgdat(pg_data_t *pgdat, struct compact_control *cc) | 1092 | static void __compact_pgdat(pg_data_t *pgdat, struct compact_control *cc) |
1154 | { | 1093 | { |
1155 | int zoneid; | 1094 | int zoneid; |
1156 | struct zone *zone; | 1095 | struct zone *zone; |
@@ -1183,34 +1122,30 @@ static int __compact_pgdat(pg_data_t *pgdat, struct compact_control *cc) | |||
1183 | VM_BUG_ON(!list_empty(&cc->freepages)); | 1122 | VM_BUG_ON(!list_empty(&cc->freepages)); |
1184 | VM_BUG_ON(!list_empty(&cc->migratepages)); | 1123 | VM_BUG_ON(!list_empty(&cc->migratepages)); |
1185 | } | 1124 | } |
1186 | |||
1187 | return 0; | ||
1188 | } | 1125 | } |
1189 | 1126 | ||
1190 | int compact_pgdat(pg_data_t *pgdat, int order) | 1127 | void compact_pgdat(pg_data_t *pgdat, int order) |
1191 | { | 1128 | { |
1192 | struct compact_control cc = { | 1129 | struct compact_control cc = { |
1193 | .order = order, | 1130 | .order = order, |
1194 | .sync = false, | 1131 | .sync = false, |
1195 | .page = NULL, | ||
1196 | }; | 1132 | }; |
1197 | 1133 | ||
1198 | return __compact_pgdat(pgdat, &cc); | 1134 | __compact_pgdat(pgdat, &cc); |
1199 | } | 1135 | } |
1200 | 1136 | ||
1201 | static int compact_node(int nid) | 1137 | static void compact_node(int nid) |
1202 | { | 1138 | { |
1203 | struct compact_control cc = { | 1139 | struct compact_control cc = { |
1204 | .order = -1, | 1140 | .order = -1, |
1205 | .sync = true, | 1141 | .sync = true, |
1206 | .page = NULL, | ||
1207 | }; | 1142 | }; |
1208 | 1143 | ||
1209 | return __compact_pgdat(NODE_DATA(nid), &cc); | 1144 | __compact_pgdat(NODE_DATA(nid), &cc); |
1210 | } | 1145 | } |
1211 | 1146 | ||
1212 | /* Compact all nodes in the system */ | 1147 | /* Compact all nodes in the system */ |
1213 | static int compact_nodes(void) | 1148 | static void compact_nodes(void) |
1214 | { | 1149 | { |
1215 | int nid; | 1150 | int nid; |
1216 | 1151 | ||
@@ -1219,8 +1154,6 @@ static int compact_nodes(void) | |||
1219 | 1154 | ||
1220 | for_each_online_node(nid) | 1155 | for_each_online_node(nid) |
1221 | compact_node(nid); | 1156 | compact_node(nid); |
1222 | |||
1223 | return COMPACT_COMPLETE; | ||
1224 | } | 1157 | } |
1225 | 1158 | ||
1226 | /* The written value is actually unused, all memory is compacted */ | 1159 | /* The written value is actually unused, all memory is compacted */ |
@@ -1231,7 +1164,7 @@ int sysctl_compaction_handler(struct ctl_table *table, int write, | |||
1231 | void __user *buffer, size_t *length, loff_t *ppos) | 1164 | void __user *buffer, size_t *length, loff_t *ppos) |
1232 | { | 1165 | { |
1233 | if (write) | 1166 | if (write) |
1234 | return compact_nodes(); | 1167 | compact_nodes(); |
1235 | 1168 | ||
1236 | return 0; | 1169 | return 0; |
1237 | } | 1170 | } |
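
Several mm/compaction.c hunks replace the open-coded zone->zone_start_pfn + zone->spanned_pages with zone_end_pfn(), a helper added elsewhere in this series. Its presumed shape is nothing more than a named version of that arithmetic (kernel-context sketch; struct zone comes from the kernel headers):

/* Presumed definition of the helper used above. */
static inline unsigned long zone_end_pfn(const struct zone *zone)
{
	return zone->zone_start_pfn + zone->spanned_pages;
}

Naming the bound also frees the identifier for other uses, which is why the function-local variable in isolate_freepages() is renamed to z_end_pfn in the same hunk.
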
diff --git a/mm/fadvise.c b/mm/fadvise.c
index a47f0f50c89f..7e092689a12a 100644
--- a/mm/fadvise.c
+++ b/mm/fadvise.c
@@ -17,6 +17,7 @@ | |||
17 | #include <linux/fadvise.h> | 17 | #include <linux/fadvise.h> |
18 | #include <linux/writeback.h> | 18 | #include <linux/writeback.h> |
19 | #include <linux/syscalls.h> | 19 | #include <linux/syscalls.h> |
20 | #include <linux/swap.h> | ||
20 | 21 | ||
21 | #include <asm/unistd.h> | 22 | #include <asm/unistd.h> |
22 | 23 | ||
@@ -38,7 +39,7 @@ SYSCALL_DEFINE(fadvise64_64)(int fd, loff_t offset, loff_t len, int advice) | |||
38 | if (!f.file) | 39 | if (!f.file) |
39 | return -EBADF; | 40 | return -EBADF; |
40 | 41 | ||
41 | if (S_ISFIFO(f.file->f_path.dentry->d_inode->i_mode)) { | 42 | if (S_ISFIFO(file_inode(f.file)->i_mode)) { |
42 | ret = -ESPIPE; | 43 | ret = -ESPIPE; |
43 | goto out; | 44 | goto out; |
44 | } | 45 | } |
@@ -120,9 +121,22 @@ SYSCALL_DEFINE(fadvise64_64)(int fd, loff_t offset, loff_t len, int advice) | |||
120 | start_index = (offset+(PAGE_CACHE_SIZE-1)) >> PAGE_CACHE_SHIFT; | 121 | start_index = (offset+(PAGE_CACHE_SIZE-1)) >> PAGE_CACHE_SHIFT; |
121 | end_index = (endbyte >> PAGE_CACHE_SHIFT); | 122 | end_index = (endbyte >> PAGE_CACHE_SHIFT); |
122 | 123 | ||
123 | if (end_index >= start_index) | 124 | if (end_index >= start_index) { |
124 | invalidate_mapping_pages(mapping, start_index, | 125 | unsigned long count = invalidate_mapping_pages(mapping, |
126 | start_index, end_index); | ||
127 | |||
128 | /* | ||
129 | * If fewer pages were invalidated than expected then | ||
130 | * it is possible that some of the pages were on | ||
131 | * a per-cpu pagevec for a remote CPU. Drain all | ||
132 | * pagevecs and try again. | ||
133 | */ | ||
134 | if (count < (end_index - start_index + 1)) { | ||
135 | lru_add_drain_all(); | ||
136 | invalidate_mapping_pages(mapping, start_index, | ||
125 | end_index); | 137 | end_index); |
138 | } | ||
139 | } | ||
126 | break; | 140 | break; |
127 | default: | 141 | default: |
128 | ret = -EINVAL; | 142 | ret = -EINVAL; |
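
The mm/fadvise.c hunk makes POSIX_FADV_DONTNEED retry after draining per-CPU pagevecs whenever fewer pages were invalidated than requested. The userspace side of that path is plain POSIX; a minimal caller, shown only for orientation, looks like this:

/* Write a file, then ask the kernel to drop its clean page cache.
 * Nothing here is specific to the patch; it merely exercises the
 * syscall whose backend the hunk changes.
 */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	const char buf[] = "some cached data\n";
	int fd = open("/tmp/fadvise-demo", O_CREAT | O_RDWR, 0600);
	int err;

	if (fd < 0 || write(fd, buf, strlen(buf)) < 0) {
		perror("setup");
		return 1;
	}
	fsync(fd);	/* DONTNEED only drops clean, unmapped pages */

	err = posix_fadvise(fd, 0, 0, POSIX_FADV_DONTNEED);
	if (err)
		fprintf(stderr, "posix_fadvise: %s\n", strerror(err));
	close(fd);
	return 0;
}
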
diff --git a/mm/filemap.c b/mm/filemap.c
index 83efee76a5c0..e1979fdca805 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1711,7 +1711,7 @@ EXPORT_SYMBOL(filemap_fault); | |||
1711 | int filemap_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) | 1711 | int filemap_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) |
1712 | { | 1712 | { |
1713 | struct page *page = vmf->page; | 1713 | struct page *page = vmf->page; |
1714 | struct inode *inode = vma->vm_file->f_path.dentry->d_inode; | 1714 | struct inode *inode = file_inode(vma->vm_file); |
1715 | int ret = VM_FAULT_LOCKED; | 1715 | int ret = VM_FAULT_LOCKED; |
1716 | 1716 | ||
1717 | sb_start_pagefault(inode->i_sb); | 1717 | sb_start_pagefault(inode->i_sb); |
@@ -1728,6 +1728,7 @@ int filemap_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) | |||
1728 | * see the dirty page and writeprotect it again. | 1728 | * see the dirty page and writeprotect it again. |
1729 | */ | 1729 | */ |
1730 | set_page_dirty(page); | 1730 | set_page_dirty(page); |
1731 | wait_for_stable_page(page); | ||
1731 | out: | 1732 | out: |
1732 | sb_end_pagefault(inode->i_sb); | 1733 | sb_end_pagefault(inode->i_sb); |
1733 | return ret; | 1734 | return ret; |
@@ -2056,7 +2057,7 @@ EXPORT_SYMBOL(iov_iter_fault_in_readable); | |||
2056 | /* | 2057 | /* |
2057 | * Return the count of just the current iov_iter segment. | 2058 | * Return the count of just the current iov_iter segment. |
2058 | */ | 2059 | */ |
2059 | size_t iov_iter_single_seg_count(struct iov_iter *i) | 2060 | size_t iov_iter_single_seg_count(const struct iov_iter *i) |
2060 | { | 2061 | { |
2061 | const struct iovec *iov = i->iov; | 2062 | const struct iovec *iov = i->iov; |
2062 | if (i->nr_segs == 1) | 2063 | if (i->nr_segs == 1) |
@@ -2274,7 +2275,7 @@ repeat: | |||
2274 | return NULL; | 2275 | return NULL; |
2275 | } | 2276 | } |
2276 | found: | 2277 | found: |
2277 | wait_on_page_writeback(page); | 2278 | wait_for_stable_page(page); |
2278 | return page; | 2279 | return page; |
2279 | } | 2280 | } |
2280 | EXPORT_SYMBOL(grab_cache_page_write_begin); | 2281 | EXPORT_SYMBOL(grab_cache_page_write_begin); |
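
filemap_page_mkwrite() is reached when a process dirties a shared file mapping, and the hunk above makes it wait for a stable page before the write proceeds on devices that require it. A minimal userspace trigger for that path (illustrative only; file name is arbitrary):

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/tmp/mkwrite-demo", O_CREAT | O_RDWR, 0600);
	char *p;

	if (fd < 0 || ftruncate(fd, 4096) < 0) {
		perror("setup");
		return 1;
	}
	p = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (p == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
	/* First store into the shared mapping goes through page_mkwrite. */
	strcpy(p, "dirtying a shared mapping");
	msync(p, 4096, MS_SYNC);
	munmap(p, 4096);
	close(fd);
	return 0;
}
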
diff --git a/mm/fremap.c b/mm/fremap.c
index a0aaf0e56800..0cd4c11488ed 100644
--- a/mm/fremap.c
+++ b/mm/fremap.c
@@ -129,6 +129,7 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, | |||
129 | struct vm_area_struct *vma; | 129 | struct vm_area_struct *vma; |
130 | int err = -EINVAL; | 130 | int err = -EINVAL; |
131 | int has_write_lock = 0; | 131 | int has_write_lock = 0; |
132 | vm_flags_t vm_flags; | ||
132 | 133 | ||
133 | if (prot) | 134 | if (prot) |
134 | return err; | 135 | return err; |
@@ -160,15 +161,11 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, | |||
160 | /* | 161 | /* |
161 | * Make sure the vma is shared, that it supports prefaulting, | 162 | * Make sure the vma is shared, that it supports prefaulting, |
162 | * and that the remapped range is valid and fully within | 163 | * and that the remapped range is valid and fully within |
163 | * the single existing vma. vm_private_data is used as a | 164 | * the single existing vma. |
164 | * swapout cursor in a VM_NONLINEAR vma. | ||
165 | */ | 165 | */ |
166 | if (!vma || !(vma->vm_flags & VM_SHARED)) | 166 | if (!vma || !(vma->vm_flags & VM_SHARED)) |
167 | goto out; | 167 | goto out; |
168 | 168 | ||
169 | if (vma->vm_private_data && !(vma->vm_flags & VM_NONLINEAR)) | ||
170 | goto out; | ||
171 | |||
172 | if (!vma->vm_ops || !vma->vm_ops->remap_pages) | 169 | if (!vma->vm_ops || !vma->vm_ops->remap_pages) |
173 | goto out; | 170 | goto out; |
174 | 171 | ||
@@ -177,6 +174,13 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, | |||
177 | 174 | ||
178 | /* Must set VM_NONLINEAR before any pages are populated. */ | 175 | /* Must set VM_NONLINEAR before any pages are populated. */ |
179 | if (!(vma->vm_flags & VM_NONLINEAR)) { | 176 | if (!(vma->vm_flags & VM_NONLINEAR)) { |
177 | /* | ||
178 | * vm_private_data is used as a swapout cursor | ||
179 | * in a VM_NONLINEAR vma. | ||
180 | */ | ||
181 | if (vma->vm_private_data) | ||
182 | goto out; | ||
183 | |||
180 | /* Don't need a nonlinear mapping, exit success */ | 184 | /* Don't need a nonlinear mapping, exit success */ |
181 | if (pgoff == linear_page_index(vma, start)) { | 185 | if (pgoff == linear_page_index(vma, start)) { |
182 | err = 0; | 186 | err = 0; |
@@ -184,6 +188,7 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, | |||
184 | } | 188 | } |
185 | 189 | ||
186 | if (!has_write_lock) { | 190 | if (!has_write_lock) { |
191 | get_write_lock: | ||
187 | up_read(&mm->mmap_sem); | 192 | up_read(&mm->mmap_sem); |
188 | down_write(&mm->mmap_sem); | 193 | down_write(&mm->mmap_sem); |
189 | has_write_lock = 1; | 194 | has_write_lock = 1; |
@@ -199,9 +204,10 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, | |||
199 | unsigned long addr; | 204 | unsigned long addr; |
200 | struct file *file = get_file(vma->vm_file); | 205 | struct file *file = get_file(vma->vm_file); |
201 | 206 | ||
202 | flags &= MAP_NONBLOCK; | 207 | vm_flags = vma->vm_flags; |
203 | addr = mmap_region(file, start, size, | 208 | if (!(flags & MAP_NONBLOCK)) |
204 | flags, vma->vm_flags, pgoff); | 209 | vm_flags |= VM_POPULATE; |
210 | addr = mmap_region(file, start, size, vm_flags, pgoff); | ||
205 | fput(file); | 211 | fput(file); |
206 | if (IS_ERR_VALUE(addr)) { | 212 | if (IS_ERR_VALUE(addr)) { |
207 | err = addr; | 213 | err = addr; |
@@ -220,32 +226,26 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, | |||
220 | mutex_unlock(&mapping->i_mmap_mutex); | 226 | mutex_unlock(&mapping->i_mmap_mutex); |
221 | } | 227 | } |
222 | 228 | ||
229 | if (!(flags & MAP_NONBLOCK) && !(vma->vm_flags & VM_POPULATE)) { | ||
230 | if (!has_write_lock) | ||
231 | goto get_write_lock; | ||
232 | vma->vm_flags |= VM_POPULATE; | ||
233 | } | ||
234 | |||
223 | if (vma->vm_flags & VM_LOCKED) { | 235 | if (vma->vm_flags & VM_LOCKED) { |
224 | /* | 236 | /* |
225 | * drop PG_Mlocked flag for over-mapped range | 237 | * drop PG_Mlocked flag for over-mapped range |
226 | */ | 238 | */ |
227 | vm_flags_t saved_flags = vma->vm_flags; | 239 | if (!has_write_lock) |
240 | goto get_write_lock; | ||
241 | vm_flags = vma->vm_flags; | ||
228 | munlock_vma_pages_range(vma, start, start + size); | 242 | munlock_vma_pages_range(vma, start, start + size); |
229 | vma->vm_flags = saved_flags; | 243 | vma->vm_flags = vm_flags; |
230 | } | 244 | } |
231 | 245 | ||
232 | mmu_notifier_invalidate_range_start(mm, start, start + size); | 246 | mmu_notifier_invalidate_range_start(mm, start, start + size); |
233 | err = vma->vm_ops->remap_pages(vma, start, size, pgoff); | 247 | err = vma->vm_ops->remap_pages(vma, start, size, pgoff); |
234 | mmu_notifier_invalidate_range_end(mm, start, start + size); | 248 | mmu_notifier_invalidate_range_end(mm, start, start + size); |
235 | if (!err && !(flags & MAP_NONBLOCK)) { | ||
236 | if (vma->vm_flags & VM_LOCKED) { | ||
237 | /* | ||
238 | * might be mapping previously unmapped range of file | ||
239 | */ | ||
240 | mlock_vma_pages_range(vma, start, start + size); | ||
241 | } else { | ||
242 | if (unlikely(has_write_lock)) { | ||
243 | downgrade_write(&mm->mmap_sem); | ||
244 | has_write_lock = 0; | ||
245 | } | ||
246 | make_pages_present(start, start+size); | ||
247 | } | ||
248 | } | ||
249 | 249 | ||
250 | /* | 250 | /* |
251 | * We can't clear VM_NONLINEAR because we'd have to do | 251 | * We can't clear VM_NONLINEAR because we'd have to do |
@@ -254,10 +254,13 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, | |||
254 | */ | 254 | */ |
255 | 255 | ||
256 | out: | 256 | out: |
257 | vm_flags = vma->vm_flags; | ||
257 | if (likely(!has_write_lock)) | 258 | if (likely(!has_write_lock)) |
258 | up_read(&mm->mmap_sem); | 259 | up_read(&mm->mmap_sem); |
259 | else | 260 | else |
260 | up_write(&mm->mmap_sem); | 261 | up_write(&mm->mmap_sem); |
262 | if (!err && ((vm_flags & VM_LOCKED) || !(flags & MAP_NONBLOCK))) | ||
263 | mm_populate(start, size); | ||
261 | 264 | ||
262 | return err; | 265 | return err; |
263 | } | 266 | } |
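
The mm/fremap.c rework defers population to a single mm_populate() call after the locks are dropped, instead of mlocking or faulting pages inline. The syscall it implements is remap_file_pages(2); a minimal, purely illustrative caller that rearranges one page of a shared mapping:

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	long page = sysconf(_SC_PAGESIZE);
	int fd = open("/tmp/remap-demo", O_CREAT | O_RDWR, 0600);
	char *p;

	if (fd < 0 || ftruncate(fd, 2 * page) < 0) {
		perror("setup");
		return 1;
	}
	p = mmap(NULL, 2 * page, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (p == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
	/* Show file page 1 at offset 0 of the VMA (prot must be 0). */
	if (remap_file_pages(p, page, 0, 1, 0))
		perror("remap_file_pages");
	munmap(p, 2 * page);
	close(fd);
	return 0;
}
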
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 9e894edc7811..e2f7f5aaaafb 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -20,6 +20,7 @@ | |||
20 | #include <linux/mman.h> | 20 | #include <linux/mman.h> |
21 | #include <linux/pagemap.h> | 21 | #include <linux/pagemap.h> |
22 | #include <linux/migrate.h> | 22 | #include <linux/migrate.h> |
23 | #include <linux/hashtable.h> | ||
23 | 24 | ||
24 | #include <asm/tlb.h> | 25 | #include <asm/tlb.h> |
25 | #include <asm/pgalloc.h> | 26 | #include <asm/pgalloc.h> |
@@ -62,12 +63,11 @@ static DECLARE_WAIT_QUEUE_HEAD(khugepaged_wait); | |||
62 | static unsigned int khugepaged_max_ptes_none __read_mostly = HPAGE_PMD_NR-1; | 63 | static unsigned int khugepaged_max_ptes_none __read_mostly = HPAGE_PMD_NR-1; |
63 | 64 | ||
64 | static int khugepaged(void *none); | 65 | static int khugepaged(void *none); |
65 | static int mm_slots_hash_init(void); | ||
66 | static int khugepaged_slab_init(void); | 66 | static int khugepaged_slab_init(void); |
67 | static void khugepaged_slab_free(void); | ||
68 | 67 | ||
69 | #define MM_SLOTS_HASH_HEADS 1024 | 68 | #define MM_SLOTS_HASH_BITS 10 |
70 | static struct hlist_head *mm_slots_hash __read_mostly; | 69 | static __read_mostly DEFINE_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS); |
70 | |||
71 | static struct kmem_cache *mm_slot_cache __read_mostly; | 71 | static struct kmem_cache *mm_slot_cache __read_mostly; |
72 | 72 | ||
73 | /** | 73 | /** |
@@ -105,7 +105,6 @@ static int set_recommended_min_free_kbytes(void) | |||
105 | struct zone *zone; | 105 | struct zone *zone; |
106 | int nr_zones = 0; | 106 | int nr_zones = 0; |
107 | unsigned long recommended_min; | 107 | unsigned long recommended_min; |
108 | extern int min_free_kbytes; | ||
109 | 108 | ||
110 | if (!khugepaged_enabled()) | 109 | if (!khugepaged_enabled()) |
111 | return 0; | 110 | return 0; |
@@ -634,12 +633,6 @@ static int __init hugepage_init(void) | |||
634 | if (err) | 633 | if (err) |
635 | goto out; | 634 | goto out; |
636 | 635 | ||
637 | err = mm_slots_hash_init(); | ||
638 | if (err) { | ||
639 | khugepaged_slab_free(); | ||
640 | goto out; | ||
641 | } | ||
642 | |||
643 | register_shrinker(&huge_zero_page_shrinker); | 636 | register_shrinker(&huge_zero_page_shrinker); |
644 | 637 | ||
645 | /* | 638 | /* |
@@ -1257,6 +1250,10 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, | |||
1257 | if (flags & FOLL_WRITE && !pmd_write(*pmd)) | 1250 | if (flags & FOLL_WRITE && !pmd_write(*pmd)) |
1258 | goto out; | 1251 | goto out; |
1259 | 1252 | ||
1253 | /* Avoid dumping huge zero page */ | ||
1254 | if ((flags & FOLL_DUMP) && is_huge_zero_pmd(*pmd)) | ||
1255 | return ERR_PTR(-EFAULT); | ||
1256 | |||
1260 | page = pmd_page(*pmd); | 1257 | page = pmd_page(*pmd); |
1261 | VM_BUG_ON(!PageHead(page)); | 1258 | VM_BUG_ON(!PageHead(page)); |
1262 | if (flags & FOLL_TOUCH) { | 1259 | if (flags & FOLL_TOUCH) { |
@@ -1298,7 +1295,6 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
1298 | int target_nid; | 1295 | int target_nid; |
1299 | int current_nid = -1; | 1296 | int current_nid = -1; |
1300 | bool migrated; | 1297 | bool migrated; |
1301 | bool page_locked = false; | ||
1302 | 1298 | ||
1303 | spin_lock(&mm->page_table_lock); | 1299 | spin_lock(&mm->page_table_lock); |
1304 | if (unlikely(!pmd_same(pmd, *pmdp))) | 1300 | if (unlikely(!pmd_same(pmd, *pmdp))) |
@@ -1320,7 +1316,6 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
1320 | /* Acquire the page lock to serialise THP migrations */ | 1316 | /* Acquire the page lock to serialise THP migrations */ |
1321 | spin_unlock(&mm->page_table_lock); | 1317 | spin_unlock(&mm->page_table_lock); |
1322 | lock_page(page); | 1318 | lock_page(page); |
1323 | page_locked = true; | ||
1324 | 1319 | ||
1325 | /* Confirm the PTE did not while locked */ | 1320 | /* Confirm the PTE did not while locked */ |
1326 | spin_lock(&mm->page_table_lock); | 1321 | spin_lock(&mm->page_table_lock); |
@@ -1333,34 +1328,26 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
1333 | 1328 | ||
1334 | /* Migrate the THP to the requested node */ | 1329 | /* Migrate the THP to the requested node */ |
1335 | migrated = migrate_misplaced_transhuge_page(mm, vma, | 1330 | migrated = migrate_misplaced_transhuge_page(mm, vma, |
1336 | pmdp, pmd, addr, | 1331 | pmdp, pmd, addr, page, target_nid); |
1337 | page, target_nid); | 1332 | if (!migrated) |
1338 | if (migrated) | 1333 | goto check_same; |
1339 | current_nid = target_nid; | ||
1340 | else { | ||
1341 | spin_lock(&mm->page_table_lock); | ||
1342 | if (unlikely(!pmd_same(pmd, *pmdp))) { | ||
1343 | unlock_page(page); | ||
1344 | goto out_unlock; | ||
1345 | } | ||
1346 | goto clear_pmdnuma; | ||
1347 | } | ||
1348 | 1334 | ||
1349 | task_numa_fault(current_nid, HPAGE_PMD_NR, migrated); | 1335 | task_numa_fault(target_nid, HPAGE_PMD_NR, true); |
1350 | return 0; | 1336 | return 0; |
1351 | 1337 | ||
1338 | check_same: | ||
1339 | spin_lock(&mm->page_table_lock); | ||
1340 | if (unlikely(!pmd_same(pmd, *pmdp))) | ||
1341 | goto out_unlock; | ||
1352 | clear_pmdnuma: | 1342 | clear_pmdnuma: |
1353 | pmd = pmd_mknonnuma(pmd); | 1343 | pmd = pmd_mknonnuma(pmd); |
1354 | set_pmd_at(mm, haddr, pmdp, pmd); | 1344 | set_pmd_at(mm, haddr, pmdp, pmd); |
1355 | VM_BUG_ON(pmd_numa(*pmdp)); | 1345 | VM_BUG_ON(pmd_numa(*pmdp)); |
1356 | update_mmu_cache_pmd(vma, addr, pmdp); | 1346 | update_mmu_cache_pmd(vma, addr, pmdp); |
1357 | if (page_locked) | ||
1358 | unlock_page(page); | ||
1359 | |||
1360 | out_unlock: | 1347 | out_unlock: |
1361 | spin_unlock(&mm->page_table_lock); | 1348 | spin_unlock(&mm->page_table_lock); |
1362 | if (current_nid != -1) | 1349 | if (current_nid != -1) |
1363 | task_numa_fault(current_nid, HPAGE_PMD_NR, migrated); | 1350 | task_numa_fault(current_nid, HPAGE_PMD_NR, false); |
1364 | return 0; | 1351 | return 0; |
1365 | } | 1352 | } |
1366 | 1353 | ||
@@ -1652,7 +1639,7 @@ static void __split_huge_page_refcount(struct page *page) | |||
1652 | page_tail->mapping = page->mapping; | 1639 | page_tail->mapping = page->mapping; |
1653 | 1640 | ||
1654 | page_tail->index = page->index + i; | 1641 | page_tail->index = page->index + i; |
1655 | page_xchg_last_nid(page_tail, page_last_nid(page)); | 1642 | page_nid_xchg_last(page_tail, page_nid_last(page)); |
1656 | 1643 | ||
1657 | BUG_ON(!PageAnon(page_tail)); | 1644 | BUG_ON(!PageAnon(page_tail)); |
1658 | BUG_ON(!PageUptodate(page_tail)); | 1645 | BUG_ON(!PageUptodate(page_tail)); |
@@ -1819,9 +1806,19 @@ int split_huge_page(struct page *page) | |||
1819 | 1806 | ||
1820 | BUG_ON(is_huge_zero_pfn(page_to_pfn(page))); | 1807 | BUG_ON(is_huge_zero_pfn(page_to_pfn(page))); |
1821 | BUG_ON(!PageAnon(page)); | 1808 | BUG_ON(!PageAnon(page)); |
1822 | anon_vma = page_lock_anon_vma_read(page); | 1809 | |
1810 | /* | ||
1811 | * The caller does not necessarily hold an mmap_sem that would prevent | ||
1812 | * the anon_vma disappearing so we first we take a reference to it | ||
1813 | * and then lock the anon_vma for write. This is similar to | ||
1814 | * page_lock_anon_vma_read except the write lock is taken to serialise | ||
1815 | * against parallel split or collapse operations. | ||
1816 | */ | ||
1817 | anon_vma = page_get_anon_vma(page); | ||
1823 | if (!anon_vma) | 1818 | if (!anon_vma) |
1824 | goto out; | 1819 | goto out; |
1820 | anon_vma_lock_write(anon_vma); | ||
1821 | |||
1825 | ret = 0; | 1822 | ret = 0; |
1826 | if (!PageCompound(page)) | 1823 | if (!PageCompound(page)) |
1827 | goto out_unlock; | 1824 | goto out_unlock; |
@@ -1832,7 +1829,8 @@ int split_huge_page(struct page *page) | |||
1832 | 1829 | ||
1833 | BUG_ON(PageCompound(page)); | 1830 | BUG_ON(PageCompound(page)); |
1834 | out_unlock: | 1831 | out_unlock: |
1835 | page_unlock_anon_vma_read(anon_vma); | 1832 | anon_vma_unlock_write(anon_vma); |
1833 | put_anon_vma(anon_vma); | ||
1836 | out: | 1834 | out: |
1837 | return ret; | 1835 | return ret; |
1838 | } | 1836 | } |
@@ -1893,12 +1891,6 @@ static int __init khugepaged_slab_init(void) | |||
1893 | return 0; | 1891 | return 0; |
1894 | } | 1892 | } |
1895 | 1893 | ||
1896 | static void __init khugepaged_slab_free(void) | ||
1897 | { | ||
1898 | kmem_cache_destroy(mm_slot_cache); | ||
1899 | mm_slot_cache = NULL; | ||
1900 | } | ||
1901 | |||
1902 | static inline struct mm_slot *alloc_mm_slot(void) | 1894 | static inline struct mm_slot *alloc_mm_slot(void) |
1903 | { | 1895 | { |
1904 | if (!mm_slot_cache) /* initialization failed */ | 1896 | if (!mm_slot_cache) /* initialization failed */ |
@@ -1911,47 +1903,22 @@ static inline void free_mm_slot(struct mm_slot *mm_slot) | |||
1911 | kmem_cache_free(mm_slot_cache, mm_slot); | 1903 | kmem_cache_free(mm_slot_cache, mm_slot); |
1912 | } | 1904 | } |
1913 | 1905 | ||
1914 | static int __init mm_slots_hash_init(void) | ||
1915 | { | ||
1916 | mm_slots_hash = kzalloc(MM_SLOTS_HASH_HEADS * sizeof(struct hlist_head), | ||
1917 | GFP_KERNEL); | ||
1918 | if (!mm_slots_hash) | ||
1919 | return -ENOMEM; | ||
1920 | return 0; | ||
1921 | } | ||
1922 | |||
1923 | #if 0 | ||
1924 | static void __init mm_slots_hash_free(void) | ||
1925 | { | ||
1926 | kfree(mm_slots_hash); | ||
1927 | mm_slots_hash = NULL; | ||
1928 | } | ||
1929 | #endif | ||
1930 | |||
1931 | static struct mm_slot *get_mm_slot(struct mm_struct *mm) | 1906 | static struct mm_slot *get_mm_slot(struct mm_struct *mm) |
1932 | { | 1907 | { |
1933 | struct mm_slot *mm_slot; | 1908 | struct mm_slot *mm_slot; |
1934 | struct hlist_head *bucket; | ||
1935 | struct hlist_node *node; | ||
1936 | 1909 | ||
1937 | bucket = &mm_slots_hash[((unsigned long)mm / sizeof(struct mm_struct)) | 1910 | hash_for_each_possible(mm_slots_hash, mm_slot, hash, (unsigned long)mm) |
1938 | % MM_SLOTS_HASH_HEADS]; | ||
1939 | hlist_for_each_entry(mm_slot, node, bucket, hash) { | ||
1940 | if (mm == mm_slot->mm) | 1911 | if (mm == mm_slot->mm) |
1941 | return mm_slot; | 1912 | return mm_slot; |
1942 | } | 1913 | |
1943 | return NULL; | 1914 | return NULL; |
1944 | } | 1915 | } |
1945 | 1916 | ||
1946 | static void insert_to_mm_slots_hash(struct mm_struct *mm, | 1917 | static void insert_to_mm_slots_hash(struct mm_struct *mm, |
1947 | struct mm_slot *mm_slot) | 1918 | struct mm_slot *mm_slot) |
1948 | { | 1919 | { |
1949 | struct hlist_head *bucket; | ||
1950 | |||
1951 | bucket = &mm_slots_hash[((unsigned long)mm / sizeof(struct mm_struct)) | ||
1952 | % MM_SLOTS_HASH_HEADS]; | ||
1953 | mm_slot->mm = mm; | 1920 | mm_slot->mm = mm; |
1954 | hlist_add_head(&mm_slot->hash, bucket); | 1921 | hash_add(mm_slots_hash, &mm_slot->hash, (long)mm); |
1955 | } | 1922 | } |
1956 | 1923 | ||
1957 | static inline int khugepaged_test_exit(struct mm_struct *mm) | 1924 | static inline int khugepaged_test_exit(struct mm_struct *mm) |
@@ -2020,7 +1987,7 @@ void __khugepaged_exit(struct mm_struct *mm) | |||
2020 | spin_lock(&khugepaged_mm_lock); | 1987 | spin_lock(&khugepaged_mm_lock); |
2021 | mm_slot = get_mm_slot(mm); | 1988 | mm_slot = get_mm_slot(mm); |
2022 | if (mm_slot && khugepaged_scan.mm_slot != mm_slot) { | 1989 | if (mm_slot && khugepaged_scan.mm_slot != mm_slot) { |
2023 | hlist_del(&mm_slot->hash); | 1990 | hash_del(&mm_slot->hash); |
2024 | list_del(&mm_slot->mm_node); | 1991 | list_del(&mm_slot->mm_node); |
2025 | free = 1; | 1992 | free = 1; |
2026 | } | 1993 | } |
@@ -2353,7 +2320,7 @@ static void collapse_huge_page(struct mm_struct *mm, | |||
2353 | BUG_ON(!pmd_none(*pmd)); | 2320 | BUG_ON(!pmd_none(*pmd)); |
2354 | set_pmd_at(mm, address, pmd, _pmd); | 2321 | set_pmd_at(mm, address, pmd, _pmd); |
2355 | spin_unlock(&mm->page_table_lock); | 2322 | spin_unlock(&mm->page_table_lock); |
2356 | anon_vma_unlock(vma->anon_vma); | 2323 | anon_vma_unlock_write(vma->anon_vma); |
2357 | goto out; | 2324 | goto out; |
2358 | } | 2325 | } |
2359 | 2326 | ||
@@ -2361,7 +2328,7 @@ static void collapse_huge_page(struct mm_struct *mm, | |||
2361 | * All pages are isolated and locked so anon_vma rmap | 2328 | * All pages are isolated and locked so anon_vma rmap |
2362 | * can't run anymore. | 2329 | * can't run anymore. |
2363 | */ | 2330 | */ |
2364 | anon_vma_unlock(vma->anon_vma); | 2331 | anon_vma_unlock_write(vma->anon_vma); |
2365 | 2332 | ||
2366 | __collapse_huge_page_copy(pte, new_page, vma, address, ptl); | 2333 | __collapse_huge_page_copy(pte, new_page, vma, address, ptl); |
2367 | pte_unmap(pte); | 2334 | pte_unmap(pte); |
@@ -2408,7 +2375,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, | |||
2408 | struct page *page; | 2375 | struct page *page; |
2409 | unsigned long _address; | 2376 | unsigned long _address; |
2410 | spinlock_t *ptl; | 2377 | spinlock_t *ptl; |
2411 | int node = -1; | 2378 | int node = NUMA_NO_NODE; |
2412 | 2379 | ||
2413 | VM_BUG_ON(address & ~HPAGE_PMD_MASK); | 2380 | VM_BUG_ON(address & ~HPAGE_PMD_MASK); |
2414 | 2381 | ||
@@ -2438,7 +2405,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, | |||
2438 | * be more sophisticated and look at more pages, | 2405 | * be more sophisticated and look at more pages, |
2439 | * but isn't for now. | 2406 | * but isn't for now. |
2440 | */ | 2407 | */ |
2441 | if (node == -1) | 2408 | if (node == NUMA_NO_NODE) |
2442 | node = page_to_nid(page); | 2409 | node = page_to_nid(page); |
2443 | VM_BUG_ON(PageCompound(page)); | 2410 | VM_BUG_ON(PageCompound(page)); |
2444 | if (!PageLRU(page) || PageLocked(page) || !PageAnon(page)) | 2411 | if (!PageLRU(page) || PageLocked(page) || !PageAnon(page)) |
@@ -2469,7 +2436,7 @@ static void collect_mm_slot(struct mm_slot *mm_slot) | |||
2469 | 2436 | ||
2470 | if (khugepaged_test_exit(mm)) { | 2437 | if (khugepaged_test_exit(mm)) { |
2471 | /* free mm_slot */ | 2438 | /* free mm_slot */ |
2472 | hlist_del(&mm_slot->hash); | 2439 | hash_del(&mm_slot->hash); |
2473 | list_del(&mm_slot->mm_node); | 2440 | list_del(&mm_slot->mm_node); |
2474 | 2441 | ||
2475 | /* | 2442 | /* |
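
The khugepaged changes above drop the hand-rolled mm_slots bucket array in favour of the generic <linux/hashtable.h> helpers. A condensed kernel-style sketch of that pattern, mirroring the calls used in the hunks (it is not compilable outside the kernel tree, and the lookup()/insert() names are mine):

#include <linux/hashtable.h>

struct mm_slot {
	struct hlist_node hash;
	struct mm_struct *mm;
	/* ... */
};

#define MM_SLOTS_HASH_BITS 10
static DEFINE_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS);

static struct mm_slot *lookup(struct mm_struct *mm)
{
	struct mm_slot *slot;

	/* Walks only the bucket that hash_add() filed 'mm' under. */
	hash_for_each_possible(mm_slots_hash, slot, hash, (unsigned long)mm)
		if (slot->mm == mm)
			return slot;
	return NULL;
}

static void insert(struct mm_slot *slot, struct mm_struct *mm)
{
	slot->mm = mm;
	hash_add(mm_slots_hash, &slot->hash, (unsigned long)mm);
}
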
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 4f3ea0b1e57c..0a0be33bb199 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -127,7 +127,7 @@ static inline struct hugepage_subpool *subpool_inode(struct inode *inode) | |||
127 | 127 | ||
128 | static inline struct hugepage_subpool *subpool_vma(struct vm_area_struct *vma) | 128 | static inline struct hugepage_subpool *subpool_vma(struct vm_area_struct *vma) |
129 | { | 129 | { |
130 | return subpool_inode(vma->vm_file->f_dentry->d_inode); | 130 | return subpool_inode(file_inode(vma->vm_file)); |
131 | } | 131 | } |
132 | 132 | ||
133 | /* | 133 | /* |
@@ -1293,8 +1293,7 @@ static void __init report_hugepages(void) | |||
1293 | 1293 | ||
1294 | for_each_hstate(h) { | 1294 | for_each_hstate(h) { |
1295 | char buf[32]; | 1295 | char buf[32]; |
1296 | printk(KERN_INFO "HugeTLB registered %s page size, " | 1296 | pr_info("HugeTLB registered %s page size, pre-allocated %ld pages\n", |
1297 | "pre-allocated %ld pages\n", | ||
1298 | memfmt(buf, huge_page_size(h)), | 1297 | memfmt(buf, huge_page_size(h)), |
1299 | h->free_huge_pages); | 1298 | h->free_huge_pages); |
1300 | } | 1299 | } |
@@ -1702,8 +1701,7 @@ static void __init hugetlb_sysfs_init(void) | |||
1702 | err = hugetlb_sysfs_add_hstate(h, hugepages_kobj, | 1701 | err = hugetlb_sysfs_add_hstate(h, hugepages_kobj, |
1703 | hstate_kobjs, &hstate_attr_group); | 1702 | hstate_kobjs, &hstate_attr_group); |
1704 | if (err) | 1703 | if (err) |
1705 | printk(KERN_ERR "Hugetlb: Unable to add hstate %s", | 1704 | pr_err("Hugetlb: Unable to add hstate %s", h->name); |
1706 | h->name); | ||
1707 | } | 1705 | } |
1708 | } | 1706 | } |
1709 | 1707 | ||
@@ -1826,9 +1824,8 @@ void hugetlb_register_node(struct node *node) | |||
1826 | nhs->hstate_kobjs, | 1824 | nhs->hstate_kobjs, |
1827 | &per_node_hstate_attr_group); | 1825 | &per_node_hstate_attr_group); |
1828 | if (err) { | 1826 | if (err) { |
1829 | printk(KERN_ERR "Hugetlb: Unable to add hstate %s" | 1827 | pr_err("Hugetlb: Unable to add hstate %s for node %d\n", |
1830 | " for node %d\n", | 1828 | h->name, node->dev.id); |
1831 | h->name, node->dev.id); | ||
1832 | hugetlb_unregister_node(node); | 1829 | hugetlb_unregister_node(node); |
1833 | break; | 1830 | break; |
1834 | } | 1831 | } |
@@ -1924,7 +1921,7 @@ void __init hugetlb_add_hstate(unsigned order) | |||
1924 | unsigned long i; | 1921 | unsigned long i; |
1925 | 1922 | ||
1926 | if (size_to_hstate(PAGE_SIZE << order)) { | 1923 | if (size_to_hstate(PAGE_SIZE << order)) { |
1927 | printk(KERN_WARNING "hugepagesz= specified twice, ignoring\n"); | 1924 | pr_warning("hugepagesz= specified twice, ignoring\n"); |
1928 | return; | 1925 | return; |
1929 | } | 1926 | } |
1930 | BUG_ON(hugetlb_max_hstate >= HUGE_MAX_HSTATE); | 1927 | BUG_ON(hugetlb_max_hstate >= HUGE_MAX_HSTATE); |
@@ -1960,8 +1957,8 @@ static int __init hugetlb_nrpages_setup(char *s) | |||
1960 | mhp = &parsed_hstate->max_huge_pages; | 1957 | mhp = &parsed_hstate->max_huge_pages; |
1961 | 1958 | ||
1962 | if (mhp == last_mhp) { | 1959 | if (mhp == last_mhp) { |
1963 | printk(KERN_WARNING "hugepages= specified twice without " | 1960 | pr_warning("hugepages= specified twice without " |
1964 | "interleaving hugepagesz=, ignoring\n"); | 1961 | "interleaving hugepagesz=, ignoring\n"); |
1965 | return 1; | 1962 | return 1; |
1966 | } | 1963 | } |
1967 | 1964 | ||
@@ -2482,7 +2479,7 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2482 | address = address & huge_page_mask(h); | 2479 | address = address & huge_page_mask(h); |
2483 | pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) + | 2480 | pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) + |
2484 | vma->vm_pgoff; | 2481 | vma->vm_pgoff; |
2485 | mapping = vma->vm_file->f_dentry->d_inode->i_mapping; | 2482 | mapping = file_inode(vma->vm_file)->i_mapping; |
2486 | 2483 | ||
2487 | /* | 2484 | /* |
2488 | * Take the mapping lock for the duration of the table walk. As | 2485 | * Take the mapping lock for the duration of the table walk. As |
@@ -2692,9 +2689,8 @@ static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2692 | * COW. Warn that such a situation has occurred as it may not be obvious | 2689 | * COW. Warn that such a situation has occurred as it may not be obvious |
2693 | */ | 2690 | */ |
2694 | if (is_vma_resv_set(vma, HPAGE_RESV_UNMAPPED)) { | 2691 | if (is_vma_resv_set(vma, HPAGE_RESV_UNMAPPED)) { |
2695 | printk(KERN_WARNING | 2692 | pr_warning("PID %d killed due to inadequate hugepage pool\n", |
2696 | "PID %d killed due to inadequate hugepage pool\n", | 2693 | current->pid); |
2697 | current->pid); | ||
2698 | return ret; | 2694 | return ret; |
2699 | } | 2695 | } |
2700 | 2696 | ||
@@ -2924,14 +2920,14 @@ follow_huge_pud(struct mm_struct *mm, unsigned long address, | |||
2924 | return NULL; | 2920 | return NULL; |
2925 | } | 2921 | } |
2926 | 2922 | ||
2927 | int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, | 2923 | long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, |
2928 | struct page **pages, struct vm_area_struct **vmas, | 2924 | struct page **pages, struct vm_area_struct **vmas, |
2929 | unsigned long *position, int *length, int i, | 2925 | unsigned long *position, unsigned long *nr_pages, |
2930 | unsigned int flags) | 2926 | long i, unsigned int flags) |
2931 | { | 2927 | { |
2932 | unsigned long pfn_offset; | 2928 | unsigned long pfn_offset; |
2933 | unsigned long vaddr = *position; | 2929 | unsigned long vaddr = *position; |
2934 | int remainder = *length; | 2930 | unsigned long remainder = *nr_pages; |
2935 | struct hstate *h = hstate_vma(vma); | 2931 | struct hstate *h = hstate_vma(vma); |
2936 | 2932 | ||
2937 | spin_lock(&mm->page_table_lock); | 2933 | spin_lock(&mm->page_table_lock); |
@@ -3001,7 +2997,7 @@ same_page: | |||
3001 | } | 2997 | } |
3002 | } | 2998 | } |
3003 | spin_unlock(&mm->page_table_lock); | 2999 | spin_unlock(&mm->page_table_lock); |
3004 | *length = remainder; | 3000 | *nr_pages = remainder; |
3005 | *position = vaddr; | 3001 | *position = vaddr; |
3006 | 3002 | ||
3007 | return i ? i : -EFAULT; | 3003 | return i ? i : -EFAULT; |
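The follow_hugetlb_page() signature change above (int -> long for the return value and index, int * -> unsigned long * for the page count) widens the types so that page counts cannot overflow on very large mappings. A tiny standalone illustration of the arithmetic (not kernel code): with 4 KiB pages, an int count already runs out around 8 TiB.

        #include <limits.h>
        #include <stdio.h>

        int main(void)
        {
                unsigned long long bytes = 16ULL << 40; /* a hypothetical 16 TiB mapping */
                unsigned long long pages = bytes >> 12; /* 4 KiB pages -> 2^32 pages */

                printf("pages = %llu, INT_MAX = %d\n", pages, INT_MAX);
                printf("fits in int? %s\n",
                       pages <= (unsigned long long)INT_MAX ? "yes" : "no");
                return 0;
        }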
@@ -3033,6 +3029,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma, | |||
3033 | if (!huge_pte_none(huge_ptep_get(ptep))) { | 3029 | if (!huge_pte_none(huge_ptep_get(ptep))) { |
3034 | pte = huge_ptep_get_and_clear(mm, address, ptep); | 3030 | pte = huge_ptep_get_and_clear(mm, address, ptep); |
3035 | pte = pte_mkhuge(pte_modify(pte, newprot)); | 3031 | pte = pte_mkhuge(pte_modify(pte, newprot)); |
3032 | pte = arch_make_huge_pte(pte, vma, NULL, 0); | ||
3036 | set_huge_pte_at(mm, address, ptep, pte); | 3033 | set_huge_pte_at(mm, address, ptep, pte); |
3037 | pages++; | 3034 | pages++; |
3038 | } | 3035 | } |
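The new arch_make_huge_pte() call in hugetlb_change_protection() gives architectures a hook to encode the huge page size into the PTE before it is written back. For architectures that do not need this, the fallback is, roughly, an identity function along the following lines (a sketch of the idea, not necessarily the exact upstream definition):

        /*
         * Generic fallback: architectures with variable huge page sizes
         * override this to fold the size into the PTE encoding.
         */
        #ifndef arch_make_huge_pte
        static inline pte_t arch_make_huge_pte(pte_t entry, struct vm_area_struct *vma,
                                               struct page *page, int writable)
        {
                return entry;
        }
        #endif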
diff --git a/mm/internal.h b/mm/internal.h index d597f94cc205..8562de0a5197 100644 --- a/mm/internal.h +++ b/mm/internal.h | |||
@@ -135,7 +135,6 @@ struct compact_control { | |||
135 | int migratetype; /* MOVABLE, RECLAIMABLE etc */ | 135 | int migratetype; /* MOVABLE, RECLAIMABLE etc */ |
136 | struct zone *zone; | 136 | struct zone *zone; |
137 | bool contended; /* True if a lock was contended */ | 137 | bool contended; /* True if a lock was contended */ |
138 | struct page **page; /* Page captured of requested size */ | ||
139 | }; | 138 | }; |
140 | 139 | ||
141 | unsigned long | 140 | unsigned long |
@@ -163,8 +162,8 @@ void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma, | |||
163 | struct vm_area_struct *prev, struct rb_node *rb_parent); | 162 | struct vm_area_struct *prev, struct rb_node *rb_parent); |
164 | 163 | ||
165 | #ifdef CONFIG_MMU | 164 | #ifdef CONFIG_MMU |
166 | extern long mlock_vma_pages_range(struct vm_area_struct *vma, | 165 | extern long __mlock_vma_pages_range(struct vm_area_struct *vma, |
167 | unsigned long start, unsigned long end); | 166 | unsigned long start, unsigned long end, int *nonblocking); |
168 | extern void munlock_vma_pages_range(struct vm_area_struct *vma, | 167 | extern void munlock_vma_pages_range(struct vm_area_struct *vma, |
169 | unsigned long start, unsigned long end); | 168 | unsigned long start, unsigned long end); |
170 | static inline void munlock_vma_pages_all(struct vm_area_struct *vma) | 169 | static inline void munlock_vma_pages_all(struct vm_area_struct *vma) |
@@ -196,7 +195,7 @@ static inline int mlocked_vma_newpage(struct vm_area_struct *vma, | |||
196 | * must be called with vma's mmap_sem held for read or write, and page locked. | 195 | * must be called with vma's mmap_sem held for read or write, and page locked. |
197 | */ | 196 | */ |
198 | extern void mlock_vma_page(struct page *page); | 197 | extern void mlock_vma_page(struct page *page); |
199 | extern void munlock_vma_page(struct page *page); | 198 | extern unsigned int munlock_vma_page(struct page *page); |
200 | 199 | ||
201 | /* | 200 | /* |
202 | * Clear the page's PageMlocked(). This can be useful in a situation where | 201 | * Clear the page's PageMlocked(). This can be useful in a situation where |
diff --git a/mm/kmemleak.c b/mm/kmemleak.c index 752a705c77c2..c8d7f3110fd0 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c | |||
@@ -436,7 +436,7 @@ static int get_object(struct kmemleak_object *object) | |||
436 | */ | 436 | */ |
437 | static void free_object_rcu(struct rcu_head *rcu) | 437 | static void free_object_rcu(struct rcu_head *rcu) |
438 | { | 438 | { |
439 | struct hlist_node *elem, *tmp; | 439 | struct hlist_node *tmp; |
440 | struct kmemleak_scan_area *area; | 440 | struct kmemleak_scan_area *area; |
441 | struct kmemleak_object *object = | 441 | struct kmemleak_object *object = |
442 | container_of(rcu, struct kmemleak_object, rcu); | 442 | container_of(rcu, struct kmemleak_object, rcu); |
@@ -445,8 +445,8 @@ static void free_object_rcu(struct rcu_head *rcu) | |||
445 | * Once use_count is 0 (guaranteed by put_object), there is no other | 445 | * Once use_count is 0 (guaranteed by put_object), there is no other |
446 | * code accessing this object, hence no need for locking. | 446 | * code accessing this object, hence no need for locking. |
447 | */ | 447 | */ |
448 | hlist_for_each_entry_safe(area, elem, tmp, &object->area_list, node) { | 448 | hlist_for_each_entry_safe(area, tmp, &object->area_list, node) { |
449 | hlist_del(elem); | 449 | hlist_del(&area->node); |
450 | kmem_cache_free(scan_area_cache, area); | 450 | kmem_cache_free(scan_area_cache, area); |
451 | } | 451 | } |
452 | kmem_cache_free(object_cache, object); | 452 | kmem_cache_free(object_cache, object); |
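The kmemleak hunks here (and the similar hlist edits throughout this diff) follow the tree-wide change that drops the separate struct hlist_node * cursor from the hlist iterators; the loop variable is now the containing structure itself. A hedged before/after sketch with a hypothetical item type:

        #include <linux/list.h>
        #include <linux/slab.h>

        struct item {
                struct hlist_node node;
                int value;
        };

        static void drain(struct hlist_head *head)
        {
                struct item *pos;
                struct hlist_node *tmp;

                /*
                 * New form: no extra hlist_node cursor; 'pos' is the item.
                 * The old form took one more argument:
                 *   hlist_for_each_entry_safe(pos, elem, tmp, head, node)
                 */
                hlist_for_each_entry_safe(pos, tmp, head, node) {
                        hlist_del(&pos->node);
                        kfree(pos);
                }
        }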
@@ -1177,7 +1177,6 @@ static void scan_block(void *_start, void *_end, | |||
1177 | static void scan_object(struct kmemleak_object *object) | 1177 | static void scan_object(struct kmemleak_object *object) |
1178 | { | 1178 | { |
1179 | struct kmemleak_scan_area *area; | 1179 | struct kmemleak_scan_area *area; |
1180 | struct hlist_node *elem; | ||
1181 | unsigned long flags; | 1180 | unsigned long flags; |
1182 | 1181 | ||
1183 | /* | 1182 | /* |
@@ -1205,7 +1204,7 @@ static void scan_object(struct kmemleak_object *object) | |||
1205 | spin_lock_irqsave(&object->lock, flags); | 1204 | spin_lock_irqsave(&object->lock, flags); |
1206 | } | 1205 | } |
1207 | } else | 1206 | } else |
1208 | hlist_for_each_entry(area, elem, &object->area_list, node) | 1207 | hlist_for_each_entry(area, &object->area_list, node) |
1209 | scan_block((void *)area->start, | 1208 | scan_block((void *)area->start, |
1210 | (void *)(area->start + area->size), | 1209 | (void *)(area->start + area->size), |
1211 | object, 0); | 1210 | object, 0); |
@@ -1300,9 +1299,8 @@ static void kmemleak_scan(void) | |||
1300 | */ | 1299 | */ |
1301 | lock_memory_hotplug(); | 1300 | lock_memory_hotplug(); |
1302 | for_each_online_node(i) { | 1301 | for_each_online_node(i) { |
1303 | pg_data_t *pgdat = NODE_DATA(i); | 1302 | unsigned long start_pfn = node_start_pfn(i); |
1304 | unsigned long start_pfn = pgdat->node_start_pfn; | 1303 | unsigned long end_pfn = node_end_pfn(i); |
1305 | unsigned long end_pfn = start_pfn + pgdat->node_spanned_pages; | ||
1306 | unsigned long pfn; | 1304 | unsigned long pfn; |
1307 | 1305 | ||
1308 | for (pfn = start_pfn; pfn < end_pfn; pfn++) { | 1306 | for (pfn = start_pfn; pfn < end_pfn; pfn++) { |
diff --git a/mm/ksm.c b/mm/ksm.c --- a/mm/ksm.c +++ b/mm/ksm.c | |||
@@ -33,13 +33,22 @@ | |||
33 | #include <linux/mmu_notifier.h> | 33 | #include <linux/mmu_notifier.h> |
34 | #include <linux/swap.h> | 34 | #include <linux/swap.h> |
35 | #include <linux/ksm.h> | 35 | #include <linux/ksm.h> |
36 | #include <linux/hash.h> | 36 | #include <linux/hashtable.h> |
37 | #include <linux/freezer.h> | 37 | #include <linux/freezer.h> |
38 | #include <linux/oom.h> | 38 | #include <linux/oom.h> |
39 | #include <linux/numa.h> | ||
39 | 40 | ||
40 | #include <asm/tlbflush.h> | 41 | #include <asm/tlbflush.h> |
41 | #include "internal.h" | 42 | #include "internal.h" |
42 | 43 | ||
44 | #ifdef CONFIG_NUMA | ||
45 | #define NUMA(x) (x) | ||
46 | #define DO_NUMA(x) do { (x); } while (0) | ||
47 | #else | ||
48 | #define NUMA(x) (0) | ||
49 | #define DO_NUMA(x) do { } while (0) | ||
50 | #endif | ||
51 | |||
43 | /* | 52 | /* |
44 | * A few notes about the KSM scanning process, | 53 | * A few notes about the KSM scanning process, |
45 | * to make it easier to understand the data structures below: | 54 | * to make it easier to understand the data structures below: |
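The NUMA()/DO_NUMA() helpers added above let the NUMA-only bookkeeping (the nid fields and the per-node tree indexing later in this file) compile away entirely on !CONFIG_NUMA builds, without sprinkling #ifdefs through the code. A minimal standalone illustration of the idiom, with a hypothetical config flag and struct:

        #include <stdio.h>

        /* #define CONFIG_NUMA 1 */            /* toggle to see both variants */

        #ifdef CONFIG_NUMA
        #define NUMA(x)         (x)
        #define DO_NUMA(x)      do { (x); } while (0)
        #else
        #define NUMA(x)         (0)
        #define DO_NUMA(x)      do { } while (0)
        #endif

        struct node_data {
        #ifdef CONFIG_NUMA
                int nid;
        #endif
                int payload;
        };

        int roots[4];   /* stand-in for the per-node tree roots */

        int main(void)
        {
                struct node_data d = { .payload = 42 };

                DO_NUMA(d.nid = 3);     /* whole statement vanishes without CONFIG_NUMA */
                /* Always index 0 on !CONFIG_NUMA, so only roots[0] is ever used. */
                roots[NUMA(d.nid)] = d.payload;
                printf("root[%d] = %d\n", NUMA(d.nid), roots[NUMA(d.nid)]);
                return 0;
        }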
@@ -78,6 +87,9 @@ | |||
78 | * take 10 attempts to find a page in the unstable tree, once it is found, | 87 | * take 10 attempts to find a page in the unstable tree, once it is found, |
79 | * it is secured in the stable tree. (When we scan a new page, we first | 88 | * it is secured in the stable tree. (When we scan a new page, we first |
80 | * compare it against the stable tree, and then against the unstable tree.) | 89 | * compare it against the stable tree, and then against the unstable tree.) |
90 | * | ||
91 | * If the merge_across_nodes tunable is unset, then KSM maintains multiple | ||
92 | * stable trees and multiple unstable trees: one of each for each NUMA node. | ||
81 | */ | 93 | */ |
82 | 94 | ||
83 | /** | 95 | /** |
@@ -113,19 +125,32 @@ struct ksm_scan { | |||
113 | /** | 125 | /** |
114 | * struct stable_node - node of the stable rbtree | 126 | * struct stable_node - node of the stable rbtree |
115 | * @node: rb node of this ksm page in the stable tree | 127 | * @node: rb node of this ksm page in the stable tree |
128 | * @head: (overlaying parent) &migrate_nodes indicates temporarily on that list | ||
129 | * @list: linked into migrate_nodes, pending placement in the proper node tree | ||
116 | * @hlist: hlist head of rmap_items using this ksm page | 130 | * @hlist: hlist head of rmap_items using this ksm page |
117 | * @kpfn: page frame number of this ksm page | 131 | * @kpfn: page frame number of this ksm page (perhaps temporarily on wrong nid) |
132 | * @nid: NUMA node id of stable tree in which linked (may not match kpfn) | ||
118 | */ | 133 | */ |
119 | struct stable_node { | 134 | struct stable_node { |
120 | struct rb_node node; | 135 | union { |
136 | struct rb_node node; /* when node of stable tree */ | ||
137 | struct { /* when listed for migration */ | ||
138 | struct list_head *head; | ||
139 | struct list_head list; | ||
140 | }; | ||
141 | }; | ||
121 | struct hlist_head hlist; | 142 | struct hlist_head hlist; |
122 | unsigned long kpfn; | 143 | unsigned long kpfn; |
144 | #ifdef CONFIG_NUMA | ||
145 | int nid; | ||
146 | #endif | ||
123 | }; | 147 | }; |
124 | 148 | ||
125 | /** | 149 | /** |
126 | * struct rmap_item - reverse mapping item for virtual addresses | 150 | * struct rmap_item - reverse mapping item for virtual addresses |
127 | * @rmap_list: next rmap_item in mm_slot's singly-linked rmap_list | 151 | * @rmap_list: next rmap_item in mm_slot's singly-linked rmap_list |
128 | * @anon_vma: pointer to anon_vma for this mm,address, when in stable tree | 152 | * @anon_vma: pointer to anon_vma for this mm,address, when in stable tree |
153 | * @nid: NUMA node id of unstable tree in which linked (may not match page) | ||
129 | * @mm: the memory structure this rmap_item is pointing into | 154 | * @mm: the memory structure this rmap_item is pointing into |
130 | * @address: the virtual address this rmap_item tracks (+ flags in low bits) | 155 | * @address: the virtual address this rmap_item tracks (+ flags in low bits) |
131 | * @oldchecksum: previous checksum of the page at that virtual address | 156 | * @oldchecksum: previous checksum of the page at that virtual address |
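struct stable_node now overlays its rb-tree linkage with list linkage, using the head pointer as the discriminator: head == &migrate_nodes means "queued on the migrate list, not in any stable tree". A small standalone C sketch of the same overlay idiom, with stub types standing in for the kernel ones:

        #include <stdio.h>

        struct list_head { struct list_head *next, *prev; };
        struct rb_node_stub { void *parent, *left, *right; };

        static struct list_head migrate_nodes = { &migrate_nodes, &migrate_nodes };

        struct stable_node_stub {
                union {
                        struct rb_node_stub node;       /* when linked in a stable tree */
                        struct {                        /* when queued for re-placement */
                                struct list_head *head; /* == &migrate_nodes */
                                struct list_head list;
                        };
                };
                unsigned long kpfn;
        };

        static int on_migrate_list(struct stable_node_stub *s)
        {
                /* The first word of the union doubles as the tag. */
                return s->head == &migrate_nodes;
        }

        int main(void)
        {
                struct stable_node_stub s = { .kpfn = 1234 };

                s.head = &migrate_nodes;        /* "move" it off the tree */
                printf("on migrate list: %d\n", on_migrate_list(&s));
                return 0;
        }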
@@ -135,7 +160,12 @@ struct stable_node { | |||
135 | */ | 160 | */ |
136 | struct rmap_item { | 161 | struct rmap_item { |
137 | struct rmap_item *rmap_list; | 162 | struct rmap_item *rmap_list; |
138 | struct anon_vma *anon_vma; /* when stable */ | 163 | union { |
164 | struct anon_vma *anon_vma; /* when stable */ | ||
165 | #ifdef CONFIG_NUMA | ||
166 | int nid; /* when node of unstable tree */ | ||
167 | #endif | ||
168 | }; | ||
139 | struct mm_struct *mm; | 169 | struct mm_struct *mm; |
140 | unsigned long address; /* + low bits used for flags below */ | 170 | unsigned long address; /* + low bits used for flags below */ |
141 | unsigned int oldchecksum; /* when unstable */ | 171 | unsigned int oldchecksum; /* when unstable */ |
@@ -153,12 +183,16 @@ struct rmap_item { | |||
153 | #define STABLE_FLAG 0x200 /* is listed from the stable tree */ | 183 | #define STABLE_FLAG 0x200 /* is listed from the stable tree */ |
154 | 184 | ||
155 | /* The stable and unstable tree heads */ | 185 | /* The stable and unstable tree heads */ |
156 | static struct rb_root root_stable_tree = RB_ROOT; | 186 | static struct rb_root one_stable_tree[1] = { RB_ROOT }; |
157 | static struct rb_root root_unstable_tree = RB_ROOT; | 187 | static struct rb_root one_unstable_tree[1] = { RB_ROOT }; |
188 | static struct rb_root *root_stable_tree = one_stable_tree; | ||
189 | static struct rb_root *root_unstable_tree = one_unstable_tree; | ||
158 | 190 | ||
159 | #define MM_SLOTS_HASH_SHIFT 10 | 191 | /* Recently migrated nodes of stable tree, pending proper placement */ |
160 | #define MM_SLOTS_HASH_HEADS (1 << MM_SLOTS_HASH_SHIFT) | 192 | static LIST_HEAD(migrate_nodes); |
161 | static struct hlist_head mm_slots_hash[MM_SLOTS_HASH_HEADS]; | 193 | |
194 | #define MM_SLOTS_HASH_BITS 10 | ||
195 | static DEFINE_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS); | ||
162 | 196 | ||
163 | static struct mm_slot ksm_mm_head = { | 197 | static struct mm_slot ksm_mm_head = { |
164 | .mm_list = LIST_HEAD_INIT(ksm_mm_head.mm_list), | 198 | .mm_list = LIST_HEAD_INIT(ksm_mm_head.mm_list), |
@@ -189,10 +223,21 @@ static unsigned int ksm_thread_pages_to_scan = 100; | |||
189 | /* Milliseconds ksmd should sleep between batches */ | 223 | /* Milliseconds ksmd should sleep between batches */ |
190 | static unsigned int ksm_thread_sleep_millisecs = 20; | 224 | static unsigned int ksm_thread_sleep_millisecs = 20; |
191 | 225 | ||
226 | #ifdef CONFIG_NUMA | ||
227 | /* Zeroed when merging across nodes is not allowed */ | ||
228 | static unsigned int ksm_merge_across_nodes = 1; | ||
229 | static int ksm_nr_node_ids = 1; | ||
230 | #else | ||
231 | #define ksm_merge_across_nodes 1U | ||
232 | #define ksm_nr_node_ids 1 | ||
233 | #endif | ||
234 | |||
192 | #define KSM_RUN_STOP 0 | 235 | #define KSM_RUN_STOP 0 |
193 | #define KSM_RUN_MERGE 1 | 236 | #define KSM_RUN_MERGE 1 |
194 | #define KSM_RUN_UNMERGE 2 | 237 | #define KSM_RUN_UNMERGE 2 |
195 | static unsigned int ksm_run = KSM_RUN_STOP; | 238 | #define KSM_RUN_OFFLINE 4 |
239 | static unsigned long ksm_run = KSM_RUN_STOP; | ||
240 | static void wait_while_offlining(void); | ||
196 | 241 | ||
197 | static DECLARE_WAIT_QUEUE_HEAD(ksm_thread_wait); | 242 | static DECLARE_WAIT_QUEUE_HEAD(ksm_thread_wait); |
198 | static DEFINE_MUTEX(ksm_thread_mutex); | 243 | static DEFINE_MUTEX(ksm_thread_mutex); |
@@ -275,31 +320,20 @@ static inline void free_mm_slot(struct mm_slot *mm_slot) | |||
275 | 320 | ||
276 | static struct mm_slot *get_mm_slot(struct mm_struct *mm) | 321 | static struct mm_slot *get_mm_slot(struct mm_struct *mm) |
277 | { | 322 | { |
278 | struct mm_slot *mm_slot; | 323 | struct mm_slot *slot; |
279 | struct hlist_head *bucket; | 324 | |
280 | struct hlist_node *node; | 325 | hash_for_each_possible(mm_slots_hash, slot, link, (unsigned long)mm) |
326 | if (slot->mm == mm) | ||
327 | return slot; | ||
281 | 328 | ||
282 | bucket = &mm_slots_hash[hash_ptr(mm, MM_SLOTS_HASH_SHIFT)]; | ||
283 | hlist_for_each_entry(mm_slot, node, bucket, link) { | ||
284 | if (mm == mm_slot->mm) | ||
285 | return mm_slot; | ||
286 | } | ||
287 | return NULL; | 329 | return NULL; |
288 | } | 330 | } |
289 | 331 | ||
290 | static void insert_to_mm_slots_hash(struct mm_struct *mm, | 332 | static void insert_to_mm_slots_hash(struct mm_struct *mm, |
291 | struct mm_slot *mm_slot) | 333 | struct mm_slot *mm_slot) |
292 | { | 334 | { |
293 | struct hlist_head *bucket; | ||
294 | |||
295 | bucket = &mm_slots_hash[hash_ptr(mm, MM_SLOTS_HASH_SHIFT)]; | ||
296 | mm_slot->mm = mm; | 335 | mm_slot->mm = mm; |
297 | hlist_add_head(&mm_slot->link, bucket); | 336 | hash_add(mm_slots_hash, &mm_slot->link, (unsigned long)mm); |
298 | } | ||
299 | |||
300 | static inline int in_stable_tree(struct rmap_item *rmap_item) | ||
301 | { | ||
302 | return rmap_item->address & STABLE_FLAG; | ||
303 | } | 337 | } |
304 | 338 | ||
305 | /* | 339 | /* |
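The get_mm_slot()/insert_to_mm_slots_hash() rework above replaces the open-coded bucket array (hash_ptr() plus an explicit hlist walk) with the <linux/hashtable.h> helpers, keyed on the mm pointer cast to unsigned long. A condensed, hedged sketch of the whole pattern with hypothetical names:

        #include <linux/hashtable.h>

        struct mm_slot_demo {
                struct hlist_node link;
                struct mm_struct *mm;
        };

        static DEFINE_HASHTABLE(demo_hash, 10); /* 1024 buckets, like MM_SLOTS_HASH_BITS */

        static void demo_insert(struct mm_slot_demo *slot, struct mm_struct *mm)
        {
                slot->mm = mm;
                hash_add(demo_hash, &slot->link, (unsigned long)mm);
        }

        static struct mm_slot_demo *demo_lookup(struct mm_struct *mm)
        {
                struct mm_slot_demo *slot;

                /*
                 * Only the bucket derived from 'mm' is walked; the explicit
                 * slot->mm == mm test filters hash collisions.
                 */
                hash_for_each_possible(demo_hash, slot, link, (unsigned long)mm)
                        if (slot->mm == mm)
                                return slot;
                return NULL;
        }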
@@ -333,7 +367,7 @@ static int break_ksm(struct vm_area_struct *vma, unsigned long addr) | |||
333 | 367 | ||
334 | do { | 368 | do { |
335 | cond_resched(); | 369 | cond_resched(); |
336 | page = follow_page(vma, addr, FOLL_GET); | 370 | page = follow_page(vma, addr, FOLL_GET | FOLL_MIGRATION); |
337 | if (IS_ERR_OR_NULL(page)) | 371 | if (IS_ERR_OR_NULL(page)) |
338 | break; | 372 | break; |
339 | if (PageKsm(page)) | 373 | if (PageKsm(page)) |
@@ -447,12 +481,22 @@ out: page = NULL; | |||
447 | return page; | 481 | return page; |
448 | } | 482 | } |
449 | 483 | ||
484 | /* | ||
485 | * This helper is used for getting right index into array of tree roots. | ||
486 | * When merge_across_nodes knob is set to 1, there are only two rb-trees for | ||
487 | * stable and unstable pages from all nodes with roots in index 0. Otherwise, | ||
488 | * every node has its own stable and unstable tree. | ||
489 | */ | ||
490 | static inline int get_kpfn_nid(unsigned long kpfn) | ||
491 | { | ||
492 | return ksm_merge_across_nodes ? 0 : pfn_to_nid(kpfn); | ||
493 | } | ||
494 | |||
450 | static void remove_node_from_stable_tree(struct stable_node *stable_node) | 495 | static void remove_node_from_stable_tree(struct stable_node *stable_node) |
451 | { | 496 | { |
452 | struct rmap_item *rmap_item; | 497 | struct rmap_item *rmap_item; |
453 | struct hlist_node *hlist; | ||
454 | 498 | ||
455 | hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) { | 499 | hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) { |
456 | if (rmap_item->hlist.next) | 500 | if (rmap_item->hlist.next) |
457 | ksm_pages_sharing--; | 501 | ksm_pages_sharing--; |
458 | else | 502 | else |
@@ -462,7 +506,11 @@ static void remove_node_from_stable_tree(struct stable_node *stable_node) | |||
462 | cond_resched(); | 506 | cond_resched(); |
463 | } | 507 | } |
464 | 508 | ||
465 | rb_erase(&stable_node->node, &root_stable_tree); | 509 | if (stable_node->head == &migrate_nodes) |
510 | list_del(&stable_node->list); | ||
511 | else | ||
512 | rb_erase(&stable_node->node, | ||
513 | root_stable_tree + NUMA(stable_node->nid)); | ||
466 | free_stable_node(stable_node); | 514 | free_stable_node(stable_node); |
467 | } | 515 | } |
468 | 516 | ||
@@ -472,6 +520,7 @@ static void remove_node_from_stable_tree(struct stable_node *stable_node) | |||
472 | * In which case we can trust the content of the page, and it | 520 | * In which case we can trust the content of the page, and it |
473 | * returns the gotten page; but if the page has now been zapped, | 521 | * returns the gotten page; but if the page has now been zapped, |
474 | * remove the stale node from the stable tree and return NULL. | 522 | * remove the stale node from the stable tree and return NULL. |
523 | * But beware, the stable node's page might be being migrated. | ||
475 | * | 524 | * |
476 | * You would expect the stable_node to hold a reference to the ksm page. | 525 | * You would expect the stable_node to hold a reference to the ksm page. |
477 | * But if it increments the page's count, swapping out has to wait for | 526 | * But if it increments the page's count, swapping out has to wait for |
@@ -482,40 +531,77 @@ static void remove_node_from_stable_tree(struct stable_node *stable_node) | |||
482 | * pointing back to this stable node. This relies on freeing a PageAnon | 531 | * pointing back to this stable node. This relies on freeing a PageAnon |
483 | * page to reset its page->mapping to NULL, and relies on no other use of | 532 | * page to reset its page->mapping to NULL, and relies on no other use of |
484 | * a page to put something that might look like our key in page->mapping. | 533 | * a page to put something that might look like our key in page->mapping. |
485 | * | ||
486 | * include/linux/pagemap.h page_cache_get_speculative() is a good reference, | ||
487 | * but this is different - made simpler by ksm_thread_mutex being held, but | ||
488 | * interesting for assuming that no other use of the struct page could ever | ||
489 | * put our expected_mapping into page->mapping (or a field of the union which | ||
490 | * coincides with page->mapping). The RCU calls are not for KSM at all, but | ||
491 | * to keep the page_count protocol described with page_cache_get_speculative. | ||
492 | * | ||
493 | * Note: it is possible that get_ksm_page() will return NULL one moment, | ||
494 | * then page the next, if the page is in between page_freeze_refs() and | ||
495 | * page_unfreeze_refs(): this shouldn't be a problem anywhere, the page | ||
496 | * is on its way to being freed; but it is an anomaly to bear in mind. | 534 | * is on its way to being freed; but it is an anomaly to bear in mind. |
497 | */ | 535 | */ |
498 | static struct page *get_ksm_page(struct stable_node *stable_node) | 536 | static struct page *get_ksm_page(struct stable_node *stable_node, bool lock_it) |
499 | { | 537 | { |
500 | struct page *page; | 538 | struct page *page; |
501 | void *expected_mapping; | 539 | void *expected_mapping; |
540 | unsigned long kpfn; | ||
502 | 541 | ||
503 | page = pfn_to_page(stable_node->kpfn); | ||
504 | expected_mapping = (void *)stable_node + | 542 | expected_mapping = (void *)stable_node + |
505 | (PAGE_MAPPING_ANON | PAGE_MAPPING_KSM); | 543 | (PAGE_MAPPING_ANON | PAGE_MAPPING_KSM); |
506 | rcu_read_lock(); | 544 | again: |
507 | if (page->mapping != expected_mapping) | 545 | kpfn = ACCESS_ONCE(stable_node->kpfn); |
508 | goto stale; | 546 | page = pfn_to_page(kpfn); |
509 | if (!get_page_unless_zero(page)) | 547 | |
548 | /* | ||
549 | * page is computed from kpfn, so on most architectures reading | ||
550 | * page->mapping is naturally ordered after reading node->kpfn, | ||
551 | * but on Alpha we need to be more careful. | ||
552 | */ | ||
553 | smp_read_barrier_depends(); | ||
554 | if (ACCESS_ONCE(page->mapping) != expected_mapping) | ||
510 | goto stale; | 555 | goto stale; |
511 | if (page->mapping != expected_mapping) { | 556 | |
557 | /* | ||
558 | * We cannot do anything with the page while its refcount is 0. | ||
559 | * Usually 0 means free, or tail of a higher-order page: in which | ||
560 | * case this node is no longer referenced, and should be freed; | ||
561 | * however, it might mean that the page is under page_freeze_refs(). | ||
562 | * The __remove_mapping() case is easy, again the node is now stale; | ||
563 | * but if page is swapcache in migrate_page_move_mapping(), it might | ||
564 | * still be our page, in which case it's essential to keep the node. | ||
565 | */ | ||
566 | while (!get_page_unless_zero(page)) { | ||
567 | /* | ||
568 | * Another check for page->mapping != expected_mapping would | ||
569 | * work here too. We have chosen the !PageSwapCache test to | ||
570 | * optimize the common case, when the page is or is about to | ||
571 | * be freed: PageSwapCache is cleared (under spin_lock_irq) | ||
572 | * in the freeze_refs section of __remove_mapping(); but Anon | ||
573 | * page->mapping reset to NULL later, in free_pages_prepare(). | ||
574 | */ | ||
575 | if (!PageSwapCache(page)) | ||
576 | goto stale; | ||
577 | cpu_relax(); | ||
578 | } | ||
579 | |||
580 | if (ACCESS_ONCE(page->mapping) != expected_mapping) { | ||
512 | put_page(page); | 581 | put_page(page); |
513 | goto stale; | 582 | goto stale; |
514 | } | 583 | } |
515 | rcu_read_unlock(); | 584 | |
585 | if (lock_it) { | ||
586 | lock_page(page); | ||
587 | if (ACCESS_ONCE(page->mapping) != expected_mapping) { | ||
588 | unlock_page(page); | ||
589 | put_page(page); | ||
590 | goto stale; | ||
591 | } | ||
592 | } | ||
516 | return page; | 593 | return page; |
594 | |||
517 | stale: | 595 | stale: |
518 | rcu_read_unlock(); | 596 | /* |
597 | * We come here from above when page->mapping or !PageSwapCache | ||
598 | * suggests that the node is stale; but it might be under migration. | ||
599 | * We need smp_rmb(), matching the smp_wmb() in ksm_migrate_page(), | ||
600 | * before checking whether node->kpfn has been changed. | ||
601 | */ | ||
602 | smp_rmb(); | ||
603 | if (ACCESS_ONCE(stable_node->kpfn) != kpfn) | ||
604 | goto again; | ||
519 | remove_node_from_stable_tree(stable_node); | 605 | remove_node_from_stable_tree(stable_node); |
520 | return NULL; | 606 | return NULL; |
521 | } | 607 | } |
@@ -531,11 +617,10 @@ static void remove_rmap_item_from_tree(struct rmap_item *rmap_item) | |||
531 | struct page *page; | 617 | struct page *page; |
532 | 618 | ||
533 | stable_node = rmap_item->head; | 619 | stable_node = rmap_item->head; |
534 | page = get_ksm_page(stable_node); | 620 | page = get_ksm_page(stable_node, true); |
535 | if (!page) | 621 | if (!page) |
536 | goto out; | 622 | goto out; |
537 | 623 | ||
538 | lock_page(page); | ||
539 | hlist_del(&rmap_item->hlist); | 624 | hlist_del(&rmap_item->hlist); |
540 | unlock_page(page); | 625 | unlock_page(page); |
541 | put_page(page); | 626 | put_page(page); |
@@ -560,8 +645,8 @@ static void remove_rmap_item_from_tree(struct rmap_item *rmap_item) | |||
560 | age = (unsigned char)(ksm_scan.seqnr - rmap_item->address); | 645 | age = (unsigned char)(ksm_scan.seqnr - rmap_item->address); |
561 | BUG_ON(age > 1); | 646 | BUG_ON(age > 1); |
562 | if (!age) | 647 | if (!age) |
563 | rb_erase(&rmap_item->node, &root_unstable_tree); | 648 | rb_erase(&rmap_item->node, |
564 | 649 | root_unstable_tree + NUMA(rmap_item->nid)); | |
565 | ksm_pages_unshared--; | 650 | ksm_pages_unshared--; |
566 | rmap_item->address &= PAGE_MASK; | 651 | rmap_item->address &= PAGE_MASK; |
567 | } | 652 | } |
@@ -581,7 +666,7 @@ static void remove_trailing_rmap_items(struct mm_slot *mm_slot, | |||
581 | } | 666 | } |
582 | 667 | ||
583 | /* | 668 | /* |
584 | * Though it's very tempting to unmerge in_stable_tree(rmap_item)s rather | 669 | * Though it's very tempting to unmerge rmap_items from stable tree rather |
585 | * than check every pte of a given vma, the locking doesn't quite work for | 670 | * than check every pte of a given vma, the locking doesn't quite work for |
586 | * that - an rmap_item is assigned to the stable tree after inserting ksm | 671 | * that - an rmap_item is assigned to the stable tree after inserting ksm |
587 | * page and upping mmap_sem. Nor does it fit with the way we skip dup'ing | 672 | * page and upping mmap_sem. Nor does it fit with the way we skip dup'ing |
@@ -614,6 +699,71 @@ static int unmerge_ksm_pages(struct vm_area_struct *vma, | |||
614 | /* | 699 | /* |
615 | * Only called through the sysfs control interface: | 700 | * Only called through the sysfs control interface: |
616 | */ | 701 | */ |
702 | static int remove_stable_node(struct stable_node *stable_node) | ||
703 | { | ||
704 | struct page *page; | ||
705 | int err; | ||
706 | |||
707 | page = get_ksm_page(stable_node, true); | ||
708 | if (!page) { | ||
709 | /* | ||
710 | * get_ksm_page did remove_node_from_stable_tree itself. | ||
711 | */ | ||
712 | return 0; | ||
713 | } | ||
714 | |||
715 | if (WARN_ON_ONCE(page_mapped(page))) { | ||
716 | /* | ||
717 | * This should not happen: but if it does, just refuse to let | ||
718 | * merge_across_nodes be switched - there is no need to panic. | ||
719 | */ | ||
720 | err = -EBUSY; | ||
721 | } else { | ||
722 | /* | ||
723 | * The stable node did not yet appear stale to get_ksm_page(), | ||
724 | * since that allows for an unmapped ksm page to be recognized | ||
725 | * right up until it is freed; but the node is safe to remove. | ||
726 | * This page might be in a pagevec waiting to be freed, | ||
727 | * or it might be PageSwapCache (perhaps under writeback), | ||
728 | * or it might have been removed from swapcache a moment ago. | ||
729 | */ | ||
730 | set_page_stable_node(page, NULL); | ||
731 | remove_node_from_stable_tree(stable_node); | ||
732 | err = 0; | ||
733 | } | ||
734 | |||
735 | unlock_page(page); | ||
736 | put_page(page); | ||
737 | return err; | ||
738 | } | ||
739 | |||
740 | static int remove_all_stable_nodes(void) | ||
741 | { | ||
742 | struct stable_node *stable_node; | ||
743 | struct list_head *this, *next; | ||
744 | int nid; | ||
745 | int err = 0; | ||
746 | |||
747 | for (nid = 0; nid < ksm_nr_node_ids; nid++) { | ||
748 | while (root_stable_tree[nid].rb_node) { | ||
749 | stable_node = rb_entry(root_stable_tree[nid].rb_node, | ||
750 | struct stable_node, node); | ||
751 | if (remove_stable_node(stable_node)) { | ||
752 | err = -EBUSY; | ||
753 | break; /* proceed to next nid */ | ||
754 | } | ||
755 | cond_resched(); | ||
756 | } | ||
757 | } | ||
758 | list_for_each_safe(this, next, &migrate_nodes) { | ||
759 | stable_node = list_entry(this, struct stable_node, list); | ||
760 | if (remove_stable_node(stable_node)) | ||
761 | err = -EBUSY; | ||
762 | cond_resched(); | ||
763 | } | ||
764 | return err; | ||
765 | } | ||
766 | |||
617 | static int unmerge_and_remove_all_rmap_items(void) | 767 | static int unmerge_and_remove_all_rmap_items(void) |
618 | { | 768 | { |
619 | struct mm_slot *mm_slot; | 769 | struct mm_slot *mm_slot; |
@@ -647,7 +797,7 @@ static int unmerge_and_remove_all_rmap_items(void) | |||
647 | ksm_scan.mm_slot = list_entry(mm_slot->mm_list.next, | 797 | ksm_scan.mm_slot = list_entry(mm_slot->mm_list.next, |
648 | struct mm_slot, mm_list); | 798 | struct mm_slot, mm_list); |
649 | if (ksm_test_exit(mm)) { | 799 | if (ksm_test_exit(mm)) { |
650 | hlist_del(&mm_slot->link); | 800 | hash_del(&mm_slot->link); |
651 | list_del(&mm_slot->mm_list); | 801 | list_del(&mm_slot->mm_list); |
652 | spin_unlock(&ksm_mmlist_lock); | 802 | spin_unlock(&ksm_mmlist_lock); |
653 | 803 | ||
@@ -661,6 +811,8 @@ static int unmerge_and_remove_all_rmap_items(void) | |||
661 | } | 811 | } |
662 | } | 812 | } |
663 | 813 | ||
814 | /* Clean up stable nodes, but don't worry if some are still busy */ | ||
815 | remove_all_stable_nodes(); | ||
664 | ksm_scan.seqnr = 0; | 816 | ksm_scan.seqnr = 0; |
665 | return 0; | 817 | return 0; |
666 | 818 | ||
@@ -946,6 +1098,9 @@ static int try_to_merge_with_ksm_page(struct rmap_item *rmap_item, | |||
946 | if (err) | 1098 | if (err) |
947 | goto out; | 1099 | goto out; |
948 | 1100 | ||
1101 | /* Unstable nid is in union with stable anon_vma: remove first */ | ||
1102 | remove_rmap_item_from_tree(rmap_item); | ||
1103 | |||
949 | /* Must get reference to anon_vma while still holding mmap_sem */ | 1104 | /* Must get reference to anon_vma while still holding mmap_sem */ |
950 | rmap_item->anon_vma = vma->anon_vma; | 1105 | rmap_item->anon_vma = vma->anon_vma; |
951 | get_anon_vma(vma->anon_vma); | 1106 | get_anon_vma(vma->anon_vma); |
@@ -996,42 +1151,99 @@ static struct page *try_to_merge_two_pages(struct rmap_item *rmap_item, | |||
996 | */ | 1151 | */ |
997 | static struct page *stable_tree_search(struct page *page) | 1152 | static struct page *stable_tree_search(struct page *page) |
998 | { | 1153 | { |
999 | struct rb_node *node = root_stable_tree.rb_node; | 1154 | int nid; |
1155 | struct rb_root *root; | ||
1156 | struct rb_node **new; | ||
1157 | struct rb_node *parent; | ||
1000 | struct stable_node *stable_node; | 1158 | struct stable_node *stable_node; |
1159 | struct stable_node *page_node; | ||
1001 | 1160 | ||
1002 | stable_node = page_stable_node(page); | 1161 | page_node = page_stable_node(page); |
1003 | if (stable_node) { /* ksm page forked */ | 1162 | if (page_node && page_node->head != &migrate_nodes) { |
1163 | /* ksm page forked */ | ||
1004 | get_page(page); | 1164 | get_page(page); |
1005 | return page; | 1165 | return page; |
1006 | } | 1166 | } |
1007 | 1167 | ||
1008 | while (node) { | 1168 | nid = get_kpfn_nid(page_to_pfn(page)); |
1169 | root = root_stable_tree + nid; | ||
1170 | again: | ||
1171 | new = &root->rb_node; | ||
1172 | parent = NULL; | ||
1173 | |||
1174 | while (*new) { | ||
1009 | struct page *tree_page; | 1175 | struct page *tree_page; |
1010 | int ret; | 1176 | int ret; |
1011 | 1177 | ||
1012 | cond_resched(); | 1178 | cond_resched(); |
1013 | stable_node = rb_entry(node, struct stable_node, node); | 1179 | stable_node = rb_entry(*new, struct stable_node, node); |
1014 | tree_page = get_ksm_page(stable_node); | 1180 | tree_page = get_ksm_page(stable_node, false); |
1015 | if (!tree_page) | 1181 | if (!tree_page) |
1016 | return NULL; | 1182 | return NULL; |
1017 | 1183 | ||
1018 | ret = memcmp_pages(page, tree_page); | 1184 | ret = memcmp_pages(page, tree_page); |
1185 | put_page(tree_page); | ||
1019 | 1186 | ||
1020 | if (ret < 0) { | 1187 | parent = *new; |
1021 | put_page(tree_page); | 1188 | if (ret < 0) |
1022 | node = node->rb_left; | 1189 | new = &parent->rb_left; |
1023 | } else if (ret > 0) { | 1190 | else if (ret > 0) |
1024 | put_page(tree_page); | 1191 | new = &parent->rb_right; |
1025 | node = node->rb_right; | 1192 | else { |
1026 | } else | 1193 | /* |
1027 | return tree_page; | 1194 | * Lock and unlock the stable_node's page (which |
1195 | * might already have been migrated) so that page | ||
1196 | * migration is sure to notice its raised count. | ||
1197 | * It would be more elegant to return stable_node | ||
1198 | * than kpage, but that involves more changes. | ||
1199 | */ | ||
1200 | tree_page = get_ksm_page(stable_node, true); | ||
1201 | if (tree_page) { | ||
1202 | unlock_page(tree_page); | ||
1203 | if (get_kpfn_nid(stable_node->kpfn) != | ||
1204 | NUMA(stable_node->nid)) { | ||
1205 | put_page(tree_page); | ||
1206 | goto replace; | ||
1207 | } | ||
1208 | return tree_page; | ||
1209 | } | ||
1210 | /* | ||
1211 | * There is now a place for page_node, but the tree may | ||
1212 | * have been rebalanced, so re-evaluate parent and new. | ||
1213 | */ | ||
1214 | if (page_node) | ||
1215 | goto again; | ||
1216 | return NULL; | ||
1217 | } | ||
1028 | } | 1218 | } |
1029 | 1219 | ||
1030 | return NULL; | 1220 | if (!page_node) |
1221 | return NULL; | ||
1222 | |||
1223 | list_del(&page_node->list); | ||
1224 | DO_NUMA(page_node->nid = nid); | ||
1225 | rb_link_node(&page_node->node, parent, new); | ||
1226 | rb_insert_color(&page_node->node, root); | ||
1227 | get_page(page); | ||
1228 | return page; | ||
1229 | |||
1230 | replace: | ||
1231 | if (page_node) { | ||
1232 | list_del(&page_node->list); | ||
1233 | DO_NUMA(page_node->nid = nid); | ||
1234 | rb_replace_node(&stable_node->node, &page_node->node, root); | ||
1235 | get_page(page); | ||
1236 | } else { | ||
1237 | rb_erase(&stable_node->node, root); | ||
1238 | page = NULL; | ||
1239 | } | ||
1240 | stable_node->head = &migrate_nodes; | ||
1241 | list_add(&stable_node->list, stable_node->head); | ||
1242 | return page; | ||
1031 | } | 1243 | } |
1032 | 1244 | ||
1033 | /* | 1245 | /* |
1034 | * stable_tree_insert - insert rmap_item pointing to new ksm page | 1246 | * stable_tree_insert - insert stable tree node pointing to new ksm page |
1035 | * into the stable tree. | 1247 | * into the stable tree. |
1036 | * | 1248 | * |
1037 | * This function returns the stable tree node just allocated on success, | 1249 | * This function returns the stable tree node just allocated on success, |
@@ -1039,17 +1251,25 @@ static struct page *stable_tree_search(struct page *page) | |||
1039 | */ | 1251 | */ |
1040 | static struct stable_node *stable_tree_insert(struct page *kpage) | 1252 | static struct stable_node *stable_tree_insert(struct page *kpage) |
1041 | { | 1253 | { |
1042 | struct rb_node **new = &root_stable_tree.rb_node; | 1254 | int nid; |
1255 | unsigned long kpfn; | ||
1256 | struct rb_root *root; | ||
1257 | struct rb_node **new; | ||
1043 | struct rb_node *parent = NULL; | 1258 | struct rb_node *parent = NULL; |
1044 | struct stable_node *stable_node; | 1259 | struct stable_node *stable_node; |
1045 | 1260 | ||
1261 | kpfn = page_to_pfn(kpage); | ||
1262 | nid = get_kpfn_nid(kpfn); | ||
1263 | root = root_stable_tree + nid; | ||
1264 | new = &root->rb_node; | ||
1265 | |||
1046 | while (*new) { | 1266 | while (*new) { |
1047 | struct page *tree_page; | 1267 | struct page *tree_page; |
1048 | int ret; | 1268 | int ret; |
1049 | 1269 | ||
1050 | cond_resched(); | 1270 | cond_resched(); |
1051 | stable_node = rb_entry(*new, struct stable_node, node); | 1271 | stable_node = rb_entry(*new, struct stable_node, node); |
1052 | tree_page = get_ksm_page(stable_node); | 1272 | tree_page = get_ksm_page(stable_node, false); |
1053 | if (!tree_page) | 1273 | if (!tree_page) |
1054 | return NULL; | 1274 | return NULL; |
1055 | 1275 | ||
@@ -1075,13 +1295,12 @@ static struct stable_node *stable_tree_insert(struct page *kpage) | |||
1075 | if (!stable_node) | 1295 | if (!stable_node) |
1076 | return NULL; | 1296 | return NULL; |
1077 | 1297 | ||
1078 | rb_link_node(&stable_node->node, parent, new); | ||
1079 | rb_insert_color(&stable_node->node, &root_stable_tree); | ||
1080 | |||
1081 | INIT_HLIST_HEAD(&stable_node->hlist); | 1298 | INIT_HLIST_HEAD(&stable_node->hlist); |
1082 | 1299 | stable_node->kpfn = kpfn; | |
1083 | stable_node->kpfn = page_to_pfn(kpage); | ||
1084 | set_page_stable_node(kpage, stable_node); | 1300 | set_page_stable_node(kpage, stable_node); |
1301 | DO_NUMA(stable_node->nid = nid); | ||
1302 | rb_link_node(&stable_node->node, parent, new); | ||
1303 | rb_insert_color(&stable_node->node, root); | ||
1085 | 1304 | ||
1086 | return stable_node; | 1305 | return stable_node; |
1087 | } | 1306 | } |
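stable_tree_insert() above (and the reworked stable_tree_search() before it) follow the usual kernel rb-tree insertion idiom: walk the struct rb_node ** links while remembering the parent, then attach with rb_link_node() and rebalance with rb_insert_color() once the leaf slot is found. A generic hedged sketch of that idiom, with a hypothetical keyed node:

        #include <linux/rbtree.h>

        struct knode {
                struct rb_node rb;
                unsigned long key;
        };

        static void knode_insert(struct rb_root *root, struct knode *new_node)
        {
                struct rb_node **link = &root->rb_node;
                struct rb_node *parent = NULL;

                while (*link) {
                        struct knode *entry = rb_entry(*link, struct knode, rb);

                        parent = *link;
                        if (new_node->key < entry->key)
                                link = &parent->rb_left;
                        else
                                link = &parent->rb_right;
                }
                /* Attach at the found leaf position, then rebalance/recolour. */
                rb_link_node(&new_node->rb, parent, link);
                rb_insert_color(&new_node->rb, root);
        }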
@@ -1104,10 +1323,15 @@ static | |||
1104 | struct rmap_item *unstable_tree_search_insert(struct rmap_item *rmap_item, | 1323 | struct rmap_item *unstable_tree_search_insert(struct rmap_item *rmap_item, |
1105 | struct page *page, | 1324 | struct page *page, |
1106 | struct page **tree_pagep) | 1325 | struct page **tree_pagep) |
1107 | |||
1108 | { | 1326 | { |
1109 | struct rb_node **new = &root_unstable_tree.rb_node; | 1327 | struct rb_node **new; |
1328 | struct rb_root *root; | ||
1110 | struct rb_node *parent = NULL; | 1329 | struct rb_node *parent = NULL; |
1330 | int nid; | ||
1331 | |||
1332 | nid = get_kpfn_nid(page_to_pfn(page)); | ||
1333 | root = root_unstable_tree + nid; | ||
1334 | new = &root->rb_node; | ||
1111 | 1335 | ||
1112 | while (*new) { | 1336 | while (*new) { |
1113 | struct rmap_item *tree_rmap_item; | 1337 | struct rmap_item *tree_rmap_item; |
@@ -1137,6 +1361,15 @@ struct rmap_item *unstable_tree_search_insert(struct rmap_item *rmap_item, | |||
1137 | } else if (ret > 0) { | 1361 | } else if (ret > 0) { |
1138 | put_page(tree_page); | 1362 | put_page(tree_page); |
1139 | new = &parent->rb_right; | 1363 | new = &parent->rb_right; |
1364 | } else if (!ksm_merge_across_nodes && | ||
1365 | page_to_nid(tree_page) != nid) { | ||
1366 | /* | ||
1367 | * If tree_page has been migrated to another NUMA node, | ||
1368 | * it will be flushed out and put in the right unstable | ||
1369 | * tree next time: only merge with it when across_nodes. | ||
1370 | */ | ||
1371 | put_page(tree_page); | ||
1372 | return NULL; | ||
1140 | } else { | 1373 | } else { |
1141 | *tree_pagep = tree_page; | 1374 | *tree_pagep = tree_page; |
1142 | return tree_rmap_item; | 1375 | return tree_rmap_item; |
@@ -1145,8 +1378,9 @@ struct rmap_item *unstable_tree_search_insert(struct rmap_item *rmap_item, | |||
1145 | 1378 | ||
1146 | rmap_item->address |= UNSTABLE_FLAG; | 1379 | rmap_item->address |= UNSTABLE_FLAG; |
1147 | rmap_item->address |= (ksm_scan.seqnr & SEQNR_MASK); | 1380 | rmap_item->address |= (ksm_scan.seqnr & SEQNR_MASK); |
1381 | DO_NUMA(rmap_item->nid = nid); | ||
1148 | rb_link_node(&rmap_item->node, parent, new); | 1382 | rb_link_node(&rmap_item->node, parent, new); |
1149 | rb_insert_color(&rmap_item->node, &root_unstable_tree); | 1383 | rb_insert_color(&rmap_item->node, root); |
1150 | 1384 | ||
1151 | ksm_pages_unshared++; | 1385 | ksm_pages_unshared++; |
1152 | return NULL; | 1386 | return NULL; |
@@ -1188,10 +1422,29 @@ static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item) | |||
1188 | unsigned int checksum; | 1422 | unsigned int checksum; |
1189 | int err; | 1423 | int err; |
1190 | 1424 | ||
1191 | remove_rmap_item_from_tree(rmap_item); | 1425 | stable_node = page_stable_node(page); |
1426 | if (stable_node) { | ||
1427 | if (stable_node->head != &migrate_nodes && | ||
1428 | get_kpfn_nid(stable_node->kpfn) != NUMA(stable_node->nid)) { | ||
1429 | rb_erase(&stable_node->node, | ||
1430 | root_stable_tree + NUMA(stable_node->nid)); | ||
1431 | stable_node->head = &migrate_nodes; | ||
1432 | list_add(&stable_node->list, stable_node->head); | ||
1433 | } | ||
1434 | if (stable_node->head != &migrate_nodes && | ||
1435 | rmap_item->head == stable_node) | ||
1436 | return; | ||
1437 | } | ||
1192 | 1438 | ||
1193 | /* We first start with searching the page inside the stable tree */ | 1439 | /* We first start with searching the page inside the stable tree */ |
1194 | kpage = stable_tree_search(page); | 1440 | kpage = stable_tree_search(page); |
1441 | if (kpage == page && rmap_item->head == stable_node) { | ||
1442 | put_page(kpage); | ||
1443 | return; | ||
1444 | } | ||
1445 | |||
1446 | remove_rmap_item_from_tree(rmap_item); | ||
1447 | |||
1195 | if (kpage) { | 1448 | if (kpage) { |
1196 | err = try_to_merge_with_ksm_page(rmap_item, page, kpage); | 1449 | err = try_to_merge_with_ksm_page(rmap_item, page, kpage); |
1197 | if (!err) { | 1450 | if (!err) { |
@@ -1225,14 +1478,11 @@ static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item) | |||
1225 | kpage = try_to_merge_two_pages(rmap_item, page, | 1478 | kpage = try_to_merge_two_pages(rmap_item, page, |
1226 | tree_rmap_item, tree_page); | 1479 | tree_rmap_item, tree_page); |
1227 | put_page(tree_page); | 1480 | put_page(tree_page); |
1228 | /* | ||
1229 | * As soon as we merge this page, we want to remove the | ||
1230 | * rmap_item of the page we have merged with from the unstable | ||
1231 | * tree, and insert it instead as new node in the stable tree. | ||
1232 | */ | ||
1233 | if (kpage) { | 1481 | if (kpage) { |
1234 | remove_rmap_item_from_tree(tree_rmap_item); | 1482 | /* |
1235 | 1483 | * The pages were successfully merged: insert new | |
1484 | * node in the stable tree and add both rmap_items. | ||
1485 | */ | ||
1236 | lock_page(kpage); | 1486 | lock_page(kpage); |
1237 | stable_node = stable_tree_insert(kpage); | 1487 | stable_node = stable_tree_insert(kpage); |
1238 | if (stable_node) { | 1488 | if (stable_node) { |
@@ -1289,6 +1539,7 @@ static struct rmap_item *scan_get_next_rmap_item(struct page **page) | |||
1289 | struct mm_slot *slot; | 1539 | struct mm_slot *slot; |
1290 | struct vm_area_struct *vma; | 1540 | struct vm_area_struct *vma; |
1291 | struct rmap_item *rmap_item; | 1541 | struct rmap_item *rmap_item; |
1542 | int nid; | ||
1292 | 1543 | ||
1293 | if (list_empty(&ksm_mm_head.mm_list)) | 1544 | if (list_empty(&ksm_mm_head.mm_list)) |
1294 | return NULL; | 1545 | return NULL; |
@@ -1307,7 +1558,29 @@ static struct rmap_item *scan_get_next_rmap_item(struct page **page) | |||
1307 | */ | 1558 | */ |
1308 | lru_add_drain_all(); | 1559 | lru_add_drain_all(); |
1309 | 1560 | ||
1310 | root_unstable_tree = RB_ROOT; | 1561 | /* |
1562 | * Whereas stale stable_nodes on the stable_tree itself | ||
1563 | * get pruned in the regular course of stable_tree_search(), | ||
1564 | * those moved out to the migrate_nodes list can accumulate: | ||
1565 | * so prune them once before each full scan. | ||
1566 | */ | ||
1567 | if (!ksm_merge_across_nodes) { | ||
1568 | struct stable_node *stable_node; | ||
1569 | struct list_head *this, *next; | ||
1570 | struct page *page; | ||
1571 | |||
1572 | list_for_each_safe(this, next, &migrate_nodes) { | ||
1573 | stable_node = list_entry(this, | ||
1574 | struct stable_node, list); | ||
1575 | page = get_ksm_page(stable_node, false); | ||
1576 | if (page) | ||
1577 | put_page(page); | ||
1578 | cond_resched(); | ||
1579 | } | ||
1580 | } | ||
1581 | |||
1582 | for (nid = 0; nid < ksm_nr_node_ids; nid++) | ||
1583 | root_unstable_tree[nid] = RB_ROOT; | ||
1311 | 1584 | ||
1312 | spin_lock(&ksm_mmlist_lock); | 1585 | spin_lock(&ksm_mmlist_lock); |
1313 | slot = list_entry(slot->mm_list.next, struct mm_slot, mm_list); | 1586 | slot = list_entry(slot->mm_list.next, struct mm_slot, mm_list); |
@@ -1392,7 +1665,7 @@ next_mm: | |||
1392 | * or when all VM_MERGEABLE areas have been unmapped (and | 1665 | * or when all VM_MERGEABLE areas have been unmapped (and |
1393 | * mmap_sem then protects against race with MADV_MERGEABLE). | 1666 | * mmap_sem then protects against race with MADV_MERGEABLE). |
1394 | */ | 1667 | */ |
1395 | hlist_del(&slot->link); | 1668 | hash_del(&slot->link); |
1396 | list_del(&slot->mm_list); | 1669 | list_del(&slot->mm_list); |
1397 | spin_unlock(&ksm_mmlist_lock); | 1670 | spin_unlock(&ksm_mmlist_lock); |
1398 | 1671 | ||
@@ -1428,8 +1701,7 @@ static void ksm_do_scan(unsigned int scan_npages) | |||
1428 | rmap_item = scan_get_next_rmap_item(&page); | 1701 | rmap_item = scan_get_next_rmap_item(&page); |
1429 | if (!rmap_item) | 1702 | if (!rmap_item) |
1430 | return; | 1703 | return; |
1431 | if (!PageKsm(page) || !in_stable_tree(rmap_item)) | 1704 | cmp_and_merge_page(page, rmap_item); |
1432 | cmp_and_merge_page(page, rmap_item); | ||
1433 | put_page(page); | 1705 | put_page(page); |
1434 | } | 1706 | } |
1435 | } | 1707 | } |
@@ -1446,6 +1718,7 @@ static int ksm_scan_thread(void *nothing) | |||
1446 | 1718 | ||
1447 | while (!kthread_should_stop()) { | 1719 | while (!kthread_should_stop()) { |
1448 | mutex_lock(&ksm_thread_mutex); | 1720 | mutex_lock(&ksm_thread_mutex); |
1721 | wait_while_offlining(); | ||
1449 | if (ksmd_should_run()) | 1722 | if (ksmd_should_run()) |
1450 | ksm_do_scan(ksm_thread_pages_to_scan); | 1723 | ksm_do_scan(ksm_thread_pages_to_scan); |
1451 | mutex_unlock(&ksm_thread_mutex); | 1724 | mutex_unlock(&ksm_thread_mutex); |
@@ -1525,11 +1798,19 @@ int __ksm_enter(struct mm_struct *mm) | |||
1525 | spin_lock(&ksm_mmlist_lock); | 1798 | spin_lock(&ksm_mmlist_lock); |
1526 | insert_to_mm_slots_hash(mm, mm_slot); | 1799 | insert_to_mm_slots_hash(mm, mm_slot); |
1527 | /* | 1800 | /* |
1528 | * Insert just behind the scanning cursor, to let the area settle | 1801 | * When KSM_RUN_MERGE (or KSM_RUN_STOP), |
1802 | * insert just behind the scanning cursor, to let the area settle | ||
1529 | * down a little; when fork is followed by immediate exec, we don't | 1803 | * down a little; when fork is followed by immediate exec, we don't |
1530 | * want ksmd to waste time setting up and tearing down an rmap_list. | 1804 | * want ksmd to waste time setting up and tearing down an rmap_list. |
1805 | * | ||
1806 | * But when KSM_RUN_UNMERGE, it's important to insert ahead of its | ||
1807 | * scanning cursor, otherwise KSM pages in newly forked mms will be | ||
1808 | * missed: then we might as well insert at the end of the list. | ||
1531 | */ | 1809 | */ |
1532 | list_add_tail(&mm_slot->mm_list, &ksm_scan.mm_slot->mm_list); | 1810 | if (ksm_run & KSM_RUN_UNMERGE) |
1811 | list_add_tail(&mm_slot->mm_list, &ksm_mm_head.mm_list); | ||
1812 | else | ||
1813 | list_add_tail(&mm_slot->mm_list, &ksm_scan.mm_slot->mm_list); | ||
1533 | spin_unlock(&ksm_mmlist_lock); | 1814 | spin_unlock(&ksm_mmlist_lock); |
1534 | 1815 | ||
1535 | set_bit(MMF_VM_MERGEABLE, &mm->flags); | 1816 | set_bit(MMF_VM_MERGEABLE, &mm->flags); |
@@ -1559,7 +1840,7 @@ void __ksm_exit(struct mm_struct *mm) | |||
1559 | mm_slot = get_mm_slot(mm); | 1840 | mm_slot = get_mm_slot(mm); |
1560 | if (mm_slot && ksm_scan.mm_slot != mm_slot) { | 1841 | if (mm_slot && ksm_scan.mm_slot != mm_slot) { |
1561 | if (!mm_slot->rmap_list) { | 1842 | if (!mm_slot->rmap_list) { |
1562 | hlist_del(&mm_slot->link); | 1843 | hash_del(&mm_slot->link); |
1563 | list_del(&mm_slot->mm_list); | 1844 | list_del(&mm_slot->mm_list); |
1564 | easy_to_free = 1; | 1845 | easy_to_free = 1; |
1565 | } else { | 1846 | } else { |
@@ -1579,24 +1860,32 @@ void __ksm_exit(struct mm_struct *mm) | |||
1579 | } | 1860 | } |
1580 | } | 1861 | } |
1581 | 1862 | ||
1582 | struct page *ksm_does_need_to_copy(struct page *page, | 1863 | struct page *ksm_might_need_to_copy(struct page *page, |
1583 | struct vm_area_struct *vma, unsigned long address) | 1864 | struct vm_area_struct *vma, unsigned long address) |
1584 | { | 1865 | { |
1866 | struct anon_vma *anon_vma = page_anon_vma(page); | ||
1585 | struct page *new_page; | 1867 | struct page *new_page; |
1586 | 1868 | ||
1869 | if (PageKsm(page)) { | ||
1870 | if (page_stable_node(page) && | ||
1871 | !(ksm_run & KSM_RUN_UNMERGE)) | ||
1872 | return page; /* no need to copy it */ | ||
1873 | } else if (!anon_vma) { | ||
1874 | return page; /* no need to copy it */ | ||
1875 | } else if (anon_vma->root == vma->anon_vma->root && | ||
1876 | page->index == linear_page_index(vma, address)) { | ||
1877 | return page; /* still no need to copy it */ | ||
1878 | } | ||
1879 | if (!PageUptodate(page)) | ||
1880 | return page; /* let do_swap_page report the error */ | ||
1881 | |||
1587 | new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); | 1882 | new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); |
1588 | if (new_page) { | 1883 | if (new_page) { |
1589 | copy_user_highpage(new_page, page, address, vma); | 1884 | copy_user_highpage(new_page, page, address, vma); |
1590 | 1885 | ||
1591 | SetPageDirty(new_page); | 1886 | SetPageDirty(new_page); |
1592 | __SetPageUptodate(new_page); | 1887 | __SetPageUptodate(new_page); |
1593 | SetPageSwapBacked(new_page); | ||
1594 | __set_page_locked(new_page); | 1888 | __set_page_locked(new_page); |
1595 | |||
1596 | if (!mlocked_vma_newpage(vma, new_page)) | ||
1597 | lru_cache_add_lru(new_page, LRU_ACTIVE_ANON); | ||
1598 | else | ||
1599 | add_page_to_unevictable_list(new_page); | ||
1600 | } | 1889 | } |
1601 | 1890 | ||
1602 | return new_page; | 1891 | return new_page; |
@@ -1607,7 +1896,6 @@ int page_referenced_ksm(struct page *page, struct mem_cgroup *memcg, | |||
1607 | { | 1896 | { |
1608 | struct stable_node *stable_node; | 1897 | struct stable_node *stable_node; |
1609 | struct rmap_item *rmap_item; | 1898 | struct rmap_item *rmap_item; |
1610 | struct hlist_node *hlist; | ||
1611 | unsigned int mapcount = page_mapcount(page); | 1899 | unsigned int mapcount = page_mapcount(page); |
1612 | int referenced = 0; | 1900 | int referenced = 0; |
1613 | int search_new_forks = 0; | 1901 | int search_new_forks = 0; |
@@ -1619,7 +1907,7 @@ int page_referenced_ksm(struct page *page, struct mem_cgroup *memcg, | |||
1619 | if (!stable_node) | 1907 | if (!stable_node) |
1620 | return 0; | 1908 | return 0; |
1621 | again: | 1909 | again: |
1622 | hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) { | 1910 | hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) { |
1623 | struct anon_vma *anon_vma = rmap_item->anon_vma; | 1911 | struct anon_vma *anon_vma = rmap_item->anon_vma; |
1624 | struct anon_vma_chain *vmac; | 1912 | struct anon_vma_chain *vmac; |
1625 | struct vm_area_struct *vma; | 1913 | struct vm_area_struct *vma; |
@@ -1661,7 +1949,6 @@ out: | |||
1661 | int try_to_unmap_ksm(struct page *page, enum ttu_flags flags) | 1949 | int try_to_unmap_ksm(struct page *page, enum ttu_flags flags) |
1662 | { | 1950 | { |
1663 | struct stable_node *stable_node; | 1951 | struct stable_node *stable_node; |
1664 | struct hlist_node *hlist; | ||
1665 | struct rmap_item *rmap_item; | 1952 | struct rmap_item *rmap_item; |
1666 | int ret = SWAP_AGAIN; | 1953 | int ret = SWAP_AGAIN; |
1667 | int search_new_forks = 0; | 1954 | int search_new_forks = 0; |
@@ -1673,7 +1960,7 @@ int try_to_unmap_ksm(struct page *page, enum ttu_flags flags) | |||
1673 | if (!stable_node) | 1960 | if (!stable_node) |
1674 | return SWAP_FAIL; | 1961 | return SWAP_FAIL; |
1675 | again: | 1962 | again: |
1676 | hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) { | 1963 | hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) { |
1677 | struct anon_vma *anon_vma = rmap_item->anon_vma; | 1964 | struct anon_vma *anon_vma = rmap_item->anon_vma; |
1678 | struct anon_vma_chain *vmac; | 1965 | struct anon_vma_chain *vmac; |
1679 | struct vm_area_struct *vma; | 1966 | struct vm_area_struct *vma; |
@@ -1714,7 +2001,6 @@ int rmap_walk_ksm(struct page *page, int (*rmap_one)(struct page *, | |||
1714 | struct vm_area_struct *, unsigned long, void *), void *arg) | 2001 | struct vm_area_struct *, unsigned long, void *), void *arg) |
1715 | { | 2002 | { |
1716 | struct stable_node *stable_node; | 2003 | struct stable_node *stable_node; |
1717 | struct hlist_node *hlist; | ||
1718 | struct rmap_item *rmap_item; | 2004 | struct rmap_item *rmap_item; |
1719 | int ret = SWAP_AGAIN; | 2005 | int ret = SWAP_AGAIN; |
1720 | int search_new_forks = 0; | 2006 | int search_new_forks = 0; |
@@ -1726,7 +2012,7 @@ int rmap_walk_ksm(struct page *page, int (*rmap_one)(struct page *, | |||
1726 | if (!stable_node) | 2012 | if (!stable_node) |
1727 | return ret; | 2013 | return ret; |
1728 | again: | 2014 | again: |
1729 | hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) { | 2015 | hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) { |
1730 | struct anon_vma *anon_vma = rmap_item->anon_vma; | 2016 | struct anon_vma *anon_vma = rmap_item->anon_vma; |
1731 | struct anon_vma_chain *vmac; | 2017 | struct anon_vma_chain *vmac; |
1732 | struct vm_area_struct *vma; | 2018 | struct vm_area_struct *vma; |
@@ -1773,64 +2059,115 @@ void ksm_migrate_page(struct page *newpage, struct page *oldpage) | |||
1773 | if (stable_node) { | 2059 | if (stable_node) { |
1774 | VM_BUG_ON(stable_node->kpfn != page_to_pfn(oldpage)); | 2060 | VM_BUG_ON(stable_node->kpfn != page_to_pfn(oldpage)); |
1775 | stable_node->kpfn = page_to_pfn(newpage); | 2061 | stable_node->kpfn = page_to_pfn(newpage); |
2062 | /* | ||
2063 | * newpage->mapping was set in advance; now we need smp_wmb() | ||
2064 | * to make sure that the new stable_node->kpfn is visible | ||
2065 | * to get_ksm_page() before it can see that oldpage->mapping | ||
2066 | * has gone stale (or that PageSwapCache has been cleared). | ||
2067 | */ | ||
2068 | smp_wmb(); | ||
2069 | set_page_stable_node(oldpage, NULL); | ||
1776 | } | 2070 | } |
1777 | } | 2071 | } |
1778 | #endif /* CONFIG_MIGRATION */ | 2072 | #endif /* CONFIG_MIGRATION */ |
1779 | 2073 | ||
1780 | #ifdef CONFIG_MEMORY_HOTREMOVE | 2074 | #ifdef CONFIG_MEMORY_HOTREMOVE |
1781 | static struct stable_node *ksm_check_stable_tree(unsigned long start_pfn, | 2075 | static int just_wait(void *word) |
1782 | unsigned long end_pfn) | ||
1783 | { | 2076 | { |
1784 | struct rb_node *node; | 2077 | schedule(); |
2078 | return 0; | ||
2079 | } | ||
1785 | 2080 | ||
1786 | for (node = rb_first(&root_stable_tree); node; node = rb_next(node)) { | 2081 | static void wait_while_offlining(void) |
1787 | struct stable_node *stable_node; | 2082 | { |
2083 | while (ksm_run & KSM_RUN_OFFLINE) { | ||
2084 | mutex_unlock(&ksm_thread_mutex); | ||
2085 | wait_on_bit(&ksm_run, ilog2(KSM_RUN_OFFLINE), | ||
2086 | just_wait, TASK_UNINTERRUPTIBLE); | ||
2087 | mutex_lock(&ksm_thread_mutex); | ||
2088 | } | ||
2089 | } | ||
1788 | 2090 | ||
1789 | stable_node = rb_entry(node, struct stable_node, node); | 2091 | static void ksm_check_stable_tree(unsigned long start_pfn, |
2092 | unsigned long end_pfn) | ||
2093 | { | ||
2094 | struct stable_node *stable_node; | ||
2095 | struct list_head *this, *next; | ||
2096 | struct rb_node *node; | ||
2097 | int nid; | ||
2098 | |||
2099 | for (nid = 0; nid < ksm_nr_node_ids; nid++) { | ||
2100 | node = rb_first(root_stable_tree + nid); | ||
2101 | while (node) { | ||
2102 | stable_node = rb_entry(node, struct stable_node, node); | ||
2103 | if (stable_node->kpfn >= start_pfn && | ||
2104 | stable_node->kpfn < end_pfn) { | ||
2105 | /* | ||
2106 | * Don't get_ksm_page, page has already gone: | ||
2107 | * which is why we keep kpfn instead of page* | ||
2108 | */ | ||
2109 | remove_node_from_stable_tree(stable_node); | ||
2110 | node = rb_first(root_stable_tree + nid); | ||
2111 | } else | ||
2112 | node = rb_next(node); | ||
2113 | cond_resched(); | ||
2114 | } | ||
2115 | } | ||
2116 | list_for_each_safe(this, next, &migrate_nodes) { | ||
2117 | stable_node = list_entry(this, struct stable_node, list); | ||
1790 | if (stable_node->kpfn >= start_pfn && | 2118 | if (stable_node->kpfn >= start_pfn && |
1791 | stable_node->kpfn < end_pfn) | 2119 | stable_node->kpfn < end_pfn) |
1792 | return stable_node; | 2120 | remove_node_from_stable_tree(stable_node); |
2121 | cond_resched(); | ||
1793 | } | 2122 | } |
1794 | return NULL; | ||
1795 | } | 2123 | } |
1796 | 2124 | ||
1797 | static int ksm_memory_callback(struct notifier_block *self, | 2125 | static int ksm_memory_callback(struct notifier_block *self, |
1798 | unsigned long action, void *arg) | 2126 | unsigned long action, void *arg) |
1799 | { | 2127 | { |
1800 | struct memory_notify *mn = arg; | 2128 | struct memory_notify *mn = arg; |
1801 | struct stable_node *stable_node; | ||
1802 | 2129 | ||
1803 | switch (action) { | 2130 | switch (action) { |
1804 | case MEM_GOING_OFFLINE: | 2131 | case MEM_GOING_OFFLINE: |
1805 | /* | 2132 | /* |
1806 | * Keep it very simple for now: just lock out ksmd and | 2133 | * Prevent ksm_do_scan(), unmerge_and_remove_all_rmap_items() |
1807 | * MADV_UNMERGEABLE while any memory is going offline. | 2134 | * and remove_all_stable_nodes() while memory is going offline: |
1808 | * mutex_lock_nested() is necessary because lockdep was alarmed | 2135 | * it is unsafe for them to touch the stable tree at this time. |
1809 | * that here we take ksm_thread_mutex inside notifier chain | 2136 | * But unmerge_ksm_pages(), rmap lookups and other entry points |
1810 | * mutex, and later take notifier chain mutex inside | 2137 | * which do not need the ksm_thread_mutex are all safe. |
1811 | * ksm_thread_mutex to unlock it. But that's safe because both | ||
1812 | * are inside mem_hotplug_mutex. | ||
1813 | */ | 2138 | */ |
1814 | mutex_lock_nested(&ksm_thread_mutex, SINGLE_DEPTH_NESTING); | 2139 | mutex_lock(&ksm_thread_mutex); |
2140 | ksm_run |= KSM_RUN_OFFLINE; | ||
2141 | mutex_unlock(&ksm_thread_mutex); | ||
1815 | break; | 2142 | break; |
1816 | 2143 | ||
1817 | case MEM_OFFLINE: | 2144 | case MEM_OFFLINE: |
1818 | /* | 2145 | /* |
1819 | * Most of the work is done by page migration; but there might | 2146 | * Most of the work is done by page migration; but there might |
1820 | * be a few stable_nodes left over, still pointing to struct | 2147 | * be a few stable_nodes left over, still pointing to struct |
1821 | * pages which have been offlined: prune those from the tree. | 2148 | * pages which have been offlined: prune those from the tree, |
2149 | * otherwise get_ksm_page() might later try to access a | ||
2150 | * non-existent struct page. | ||
1822 | */ | 2151 | */ |
1823 | while ((stable_node = ksm_check_stable_tree(mn->start_pfn, | 2152 | ksm_check_stable_tree(mn->start_pfn, |
1824 | mn->start_pfn + mn->nr_pages)) != NULL) | 2153 | mn->start_pfn + mn->nr_pages); |
1825 | remove_node_from_stable_tree(stable_node); | ||
1826 | /* fallthrough */ | 2154 | /* fallthrough */ |
1827 | 2155 | ||
1828 | case MEM_CANCEL_OFFLINE: | 2156 | case MEM_CANCEL_OFFLINE: |
2157 | mutex_lock(&ksm_thread_mutex); | ||
2158 | ksm_run &= ~KSM_RUN_OFFLINE; | ||
1829 | mutex_unlock(&ksm_thread_mutex); | 2159 | mutex_unlock(&ksm_thread_mutex); |
2160 | |||
2161 | smp_mb(); /* wake_up_bit advises this */ | ||
2162 | wake_up_bit(&ksm_run, ilog2(KSM_RUN_OFFLINE)); | ||
1830 | break; | 2163 | break; |
1831 | } | 2164 | } |
1832 | return NOTIFY_OK; | 2165 | return NOTIFY_OK; |
1833 | } | 2166 | } |
2167 | #else | ||
2168 | static void wait_while_offlining(void) | ||
2169 | { | ||
2170 | } | ||
1834 | #endif /* CONFIG_MEMORY_HOTREMOVE */ | 2171 | #endif /* CONFIG_MEMORY_HOTREMOVE */ |
1835 | 2172 | ||
1836 | #ifdef CONFIG_SYSFS | 2173 | #ifdef CONFIG_SYSFS |
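The KSM_RUN_OFFLINE handling above is the generic bit-wait handshake: the notifier sets the flag bit under ksm_thread_mutex, waiters drop the mutex and sleep on the bit with wait_on_bit(), and the path that clears the bit issues a barrier before wake_up_bit(). A stand-alone sketch with invented names (my_flags, MY_BUSY_BIT), assuming the wait_on_bit() signature of this era that takes an action callback:

#include <linux/bitops.h>
#include <linux/sched.h>
#include <linux/wait.h>

#define MY_BUSY_BIT	0

static unsigned long my_flags;

static int my_wait_action(void *word)
{
	schedule();
	return 0;
}

static void my_wait_until_idle(void)
{
	wait_on_bit(&my_flags, MY_BUSY_BIT, my_wait_action,
		    TASK_UNINTERRUPTIBLE);
}

static void my_finish_and_wake(void)
{
	clear_bit(MY_BUSY_BIT, &my_flags);
	smp_mb();	/* make the cleared bit visible before waking */
	wake_up_bit(&my_flags, MY_BUSY_BIT);
}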
@@ -1893,7 +2230,7 @@ KSM_ATTR(pages_to_scan); | |||
1893 | static ssize_t run_show(struct kobject *kobj, struct kobj_attribute *attr, | 2230 | static ssize_t run_show(struct kobject *kobj, struct kobj_attribute *attr, |
1894 | char *buf) | 2231 | char *buf) |
1895 | { | 2232 | { |
1896 | return sprintf(buf, "%u\n", ksm_run); | 2233 | return sprintf(buf, "%lu\n", ksm_run); |
1897 | } | 2234 | } |
1898 | 2235 | ||
1899 | static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr, | 2236 | static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr, |
@@ -1916,6 +2253,7 @@ static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr, | |||
1916 | */ | 2253 | */ |
1917 | 2254 | ||
1918 | mutex_lock(&ksm_thread_mutex); | 2255 | mutex_lock(&ksm_thread_mutex); |
2256 | wait_while_offlining(); | ||
1919 | if (ksm_run != flags) { | 2257 | if (ksm_run != flags) { |
1920 | ksm_run = flags; | 2258 | ksm_run = flags; |
1921 | if (flags & KSM_RUN_UNMERGE) { | 2259 | if (flags & KSM_RUN_UNMERGE) { |
@@ -1937,6 +2275,64 @@ static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr, | |||
1937 | } | 2275 | } |
1938 | KSM_ATTR(run); | 2276 | KSM_ATTR(run); |
1939 | 2277 | ||
2278 | #ifdef CONFIG_NUMA | ||
2279 | static ssize_t merge_across_nodes_show(struct kobject *kobj, | ||
2280 | struct kobj_attribute *attr, char *buf) | ||
2281 | { | ||
2282 | return sprintf(buf, "%u\n", ksm_merge_across_nodes); | ||
2283 | } | ||
2284 | |||
2285 | static ssize_t merge_across_nodes_store(struct kobject *kobj, | ||
2286 | struct kobj_attribute *attr, | ||
2287 | const char *buf, size_t count) | ||
2288 | { | ||
2289 | int err; | ||
2290 | unsigned long knob; | ||
2291 | |||
2292 | err = kstrtoul(buf, 10, &knob); | ||
2293 | if (err) | ||
2294 | return err; | ||
2295 | if (knob > 1) | ||
2296 | return -EINVAL; | ||
2297 | |||
2298 | mutex_lock(&ksm_thread_mutex); | ||
2299 | wait_while_offlining(); | ||
2300 | if (ksm_merge_across_nodes != knob) { | ||
2301 | if (ksm_pages_shared || remove_all_stable_nodes()) | ||
2302 | err = -EBUSY; | ||
2303 | else if (root_stable_tree == one_stable_tree) { | ||
2304 | struct rb_root *buf; | ||
2305 | /* | ||
2306 | * This is the first time that we switch away from the | ||
2307 | * default of merging across nodes: must now allocate | ||
2308 | * a buffer to hold as many roots as may be needed. | ||
2309 | * Allocate stable and unstable together: | ||
2310 | * MAXSMP NODES_SHIFT 10 will use 16kB. | ||
2311 | */ | ||
2312 | buf = kcalloc(nr_node_ids + nr_node_ids, | ||
2313 | sizeof(*buf), GFP_KERNEL | __GFP_ZERO); | ||
2314 | /* Let us assume that RB_ROOT, i.e. a NULL node pointer, is all zeroes */ | ||
2315 | if (!buf) | ||
2316 | err = -ENOMEM; | ||
2317 | else { | ||
2318 | root_stable_tree = buf; | ||
2319 | root_unstable_tree = buf + nr_node_ids; | ||
2320 | /* Stable tree is empty but not the unstable */ | ||
2321 | root_unstable_tree[0] = one_unstable_tree[0]; | ||
2322 | } | ||
2323 | } | ||
2324 | if (!err) { | ||
2325 | ksm_merge_across_nodes = knob; | ||
2326 | ksm_nr_node_ids = knob ? 1 : nr_node_ids; | ||
2327 | } | ||
2328 | } | ||
2329 | mutex_unlock(&ksm_thread_mutex); | ||
2330 | |||
2331 | return err ? err : count; | ||
2332 | } | ||
2333 | KSM_ATTR(merge_across_nodes); | ||
2334 | #endif | ||
2335 | |||
1940 | static ssize_t pages_shared_show(struct kobject *kobj, | 2336 | static ssize_t pages_shared_show(struct kobject *kobj, |
1941 | struct kobj_attribute *attr, char *buf) | 2337 | struct kobj_attribute *attr, char *buf) |
1942 | { | 2338 | { |
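merge_across_nodes_store() above follows the usual sysfs knob shape: parse with kstrtoul(), reject out-of-range values, and only then apply the change under ksm_thread_mutex. A trimmed-down sketch of the parse-and-validate part with an invented attribute (my_knob); the locking and the KSM-specific tree reallocation are left out:

#include <linux/kernel.h>
#include <linux/kobject.h>
#include <linux/sysfs.h>

static unsigned long my_knob;

static ssize_t my_knob_show(struct kobject *kobj,
			    struct kobj_attribute *attr, char *buf)
{
	return sprintf(buf, "%lu\n", my_knob);
}

static ssize_t my_knob_store(struct kobject *kobj,
			     struct kobj_attribute *attr,
			     const char *buf, size_t count)
{
	unsigned long val;
	int err = kstrtoul(buf, 10, &val);

	if (err)
		return err;
	if (val > 1)
		return -EINVAL;		/* only 0 and 1 are meaningful */

	my_knob = val;
	return count;
}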
@@ -1991,6 +2387,9 @@ static struct attribute *ksm_attrs[] = { | |||
1991 | &pages_unshared_attr.attr, | 2387 | &pages_unshared_attr.attr, |
1992 | &pages_volatile_attr.attr, | 2388 | &pages_volatile_attr.attr, |
1993 | &full_scans_attr.attr, | 2389 | &full_scans_attr.attr, |
2390 | #ifdef CONFIG_NUMA | ||
2391 | &merge_across_nodes_attr.attr, | ||
2392 | #endif | ||
1994 | NULL, | 2393 | NULL, |
1995 | }; | 2394 | }; |
1996 | 2395 | ||
@@ -2029,10 +2428,7 @@ static int __init ksm_init(void) | |||
2029 | #endif /* CONFIG_SYSFS */ | 2428 | #endif /* CONFIG_SYSFS */ |
2030 | 2429 | ||
2031 | #ifdef CONFIG_MEMORY_HOTREMOVE | 2430 | #ifdef CONFIG_MEMORY_HOTREMOVE |
2032 | /* | 2431 | /* There is no significance to this priority 100 */ |
2033 | * Choose a high priority since the callback takes ksm_thread_mutex: | ||
2034 | * later callbacks could only be taking locks which nest within that. | ||
2035 | */ | ||
2036 | hotplug_memory_notifier(ksm_memory_callback, 100); | 2432 | hotplug_memory_notifier(ksm_memory_callback, 100); |
2037 | #endif | 2433 | #endif |
2038 | return 0; | 2434 | return 0; |
diff --git a/mm/madvise.c b/mm/madvise.c index 03dfa5c7adb3..c58c94b56c3d 100644 --- a/mm/madvise.c +++ b/mm/madvise.c | |||
@@ -16,6 +16,9 @@ | |||
16 | #include <linux/ksm.h> | 16 | #include <linux/ksm.h> |
17 | #include <linux/fs.h> | 17 | #include <linux/fs.h> |
18 | #include <linux/file.h> | 18 | #include <linux/file.h> |
19 | #include <linux/blkdev.h> | ||
20 | #include <linux/swap.h> | ||
21 | #include <linux/swapops.h> | ||
19 | 22 | ||
20 | /* | 23 | /* |
21 | * Any behaviour which results in changes to the vma->vm_flags needs to | 24 | * Any behaviour which results in changes to the vma->vm_flags needs to |
@@ -131,6 +134,84 @@ out: | |||
131 | return error; | 134 | return error; |
132 | } | 135 | } |
133 | 136 | ||
137 | #ifdef CONFIG_SWAP | ||
138 | static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start, | ||
139 | unsigned long end, struct mm_walk *walk) | ||
140 | { | ||
141 | pte_t *orig_pte; | ||
142 | struct vm_area_struct *vma = walk->private; | ||
143 | unsigned long index; | ||
144 | |||
145 | if (pmd_none_or_trans_huge_or_clear_bad(pmd)) | ||
146 | return 0; | ||
147 | |||
148 | for (index = start; index != end; index += PAGE_SIZE) { | ||
149 | pte_t pte; | ||
150 | swp_entry_t entry; | ||
151 | struct page *page; | ||
152 | spinlock_t *ptl; | ||
153 | |||
154 | orig_pte = pte_offset_map_lock(vma->vm_mm, pmd, start, &ptl); | ||
155 | pte = *(orig_pte + ((index - start) / PAGE_SIZE)); | ||
156 | pte_unmap_unlock(orig_pte, ptl); | ||
157 | |||
158 | if (pte_present(pte) || pte_none(pte) || pte_file(pte)) | ||
159 | continue; | ||
160 | entry = pte_to_swp_entry(pte); | ||
161 | if (unlikely(non_swap_entry(entry))) | ||
162 | continue; | ||
163 | |||
164 | page = read_swap_cache_async(entry, GFP_HIGHUSER_MOVABLE, | ||
165 | vma, index); | ||
166 | if (page) | ||
167 | page_cache_release(page); | ||
168 | } | ||
169 | |||
170 | return 0; | ||
171 | } | ||
172 | |||
173 | static void force_swapin_readahead(struct vm_area_struct *vma, | ||
174 | unsigned long start, unsigned long end) | ||
175 | { | ||
176 | struct mm_walk walk = { | ||
177 | .mm = vma->vm_mm, | ||
178 | .pmd_entry = swapin_walk_pmd_entry, | ||
179 | .private = vma, | ||
180 | }; | ||
181 | |||
182 | walk_page_range(start, end, &walk); | ||
183 | |||
184 | lru_add_drain(); /* Push any new pages onto the LRU now */ | ||
185 | } | ||
186 | |||
187 | static void force_shm_swapin_readahead(struct vm_area_struct *vma, | ||
188 | unsigned long start, unsigned long end, | ||
189 | struct address_space *mapping) | ||
190 | { | ||
191 | pgoff_t index; | ||
192 | struct page *page; | ||
193 | swp_entry_t swap; | ||
194 | |||
195 | for (; start < end; start += PAGE_SIZE) { | ||
196 | index = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; | ||
197 | |||
198 | page = find_get_page(mapping, index); | ||
199 | if (!radix_tree_exceptional_entry(page)) { | ||
200 | if (page) | ||
201 | page_cache_release(page); | ||
202 | continue; | ||
203 | } | ||
204 | swap = radix_to_swp_entry(page); | ||
205 | page = read_swap_cache_async(swap, GFP_HIGHUSER_MOVABLE, | ||
206 | NULL, 0); | ||
207 | if (page) | ||
208 | page_cache_release(page); | ||
209 | } | ||
210 | |||
211 | lru_add_drain(); /* Push any new pages onto the LRU now */ | ||
212 | } | ||
213 | #endif /* CONFIG_SWAP */ | ||
214 | |||
134 | /* | 215 | /* |
135 | * Schedule all required I/O operations. Do not wait for completion. | 216 | * Schedule all required I/O operations. Do not wait for completion. |
136 | */ | 217 | */ |
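From user space, the effect of the new walk is that MADV_WILLNEED now also schedules swap-in readahead for anonymous and shmem/tmpfs mappings, not just file readahead. An illustrative (userspace, not kernel) usage sketch:

#include <sys/mman.h>
#include <stddef.h>

int prefetch_region(void *addr, size_t len)
{
	/* asynchronously queues swap-in of any swapped-out pages in
	 * [addr, addr + len); returns without waiting for the I/O */
	return madvise(addr, len, MADV_WILLNEED);
}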
@@ -140,6 +221,18 @@ static long madvise_willneed(struct vm_area_struct * vma, | |||
140 | { | 221 | { |
141 | struct file *file = vma->vm_file; | 222 | struct file *file = vma->vm_file; |
142 | 223 | ||
224 | #ifdef CONFIG_SWAP | ||
225 | if (!file || mapping_cap_swap_backed(file->f_mapping)) { | ||
226 | *prev = vma; | ||
227 | if (!file) | ||
228 | force_swapin_readahead(vma, start, end); | ||
229 | else | ||
230 | force_shm_swapin_readahead(vma, start, end, | ||
231 | file->f_mapping); | ||
232 | return 0; | ||
233 | } | ||
234 | #endif | ||
235 | |||
143 | if (!file) | 236 | if (!file) |
144 | return -EBADF; | 237 | return -EBADF; |
145 | 238 | ||
@@ -371,6 +464,7 @@ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior) | |||
371 | int error = -EINVAL; | 464 | int error = -EINVAL; |
372 | int write; | 465 | int write; |
373 | size_t len; | 466 | size_t len; |
467 | struct blk_plug plug; | ||
374 | 468 | ||
375 | #ifdef CONFIG_MEMORY_FAILURE | 469 | #ifdef CONFIG_MEMORY_FAILURE |
376 | if (behavior == MADV_HWPOISON || behavior == MADV_SOFT_OFFLINE) | 470 | if (behavior == MADV_HWPOISON || behavior == MADV_SOFT_OFFLINE) |
@@ -410,18 +504,19 @@ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior) | |||
410 | if (vma && start > vma->vm_start) | 504 | if (vma && start > vma->vm_start) |
411 | prev = vma; | 505 | prev = vma; |
412 | 506 | ||
507 | blk_start_plug(&plug); | ||
413 | for (;;) { | 508 | for (;;) { |
414 | /* Still start < end. */ | 509 | /* Still start < end. */ |
415 | error = -ENOMEM; | 510 | error = -ENOMEM; |
416 | if (!vma) | 511 | if (!vma) |
417 | goto out; | 512 | goto out_plug; |
418 | 513 | ||
419 | /* Here start < (end|vma->vm_end). */ | 514 | /* Here start < (end|vma->vm_end). */ |
420 | if (start < vma->vm_start) { | 515 | if (start < vma->vm_start) { |
421 | unmapped_error = -ENOMEM; | 516 | unmapped_error = -ENOMEM; |
422 | start = vma->vm_start; | 517 | start = vma->vm_start; |
423 | if (start >= end) | 518 | if (start >= end) |
424 | goto out; | 519 | goto out_plug; |
425 | } | 520 | } |
426 | 521 | ||
427 | /* Here vma->vm_start <= start < (end|vma->vm_end) */ | 522 | /* Here vma->vm_start <= start < (end|vma->vm_end) */ |
@@ -432,18 +527,20 @@ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior) | |||
432 | /* Here vma->vm_start <= start < tmp <= (end|vma->vm_end). */ | 527 | /* Here vma->vm_start <= start < tmp <= (end|vma->vm_end). */ |
433 | error = madvise_vma(vma, &prev, start, tmp, behavior); | 528 | error = madvise_vma(vma, &prev, start, tmp, behavior); |
434 | if (error) | 529 | if (error) |
435 | goto out; | 530 | goto out_plug; |
436 | start = tmp; | 531 | start = tmp; |
437 | if (prev && start < prev->vm_end) | 532 | if (prev && start < prev->vm_end) |
438 | start = prev->vm_end; | 533 | start = prev->vm_end; |
439 | error = unmapped_error; | 534 | error = unmapped_error; |
440 | if (start >= end) | 535 | if (start >= end) |
441 | goto out; | 536 | goto out_plug; |
442 | if (prev) | 537 | if (prev) |
443 | vma = prev->vm_next; | 538 | vma = prev->vm_next; |
444 | else /* madvise_remove dropped mmap_sem */ | 539 | else /* madvise_remove dropped mmap_sem */ |
445 | vma = find_vma(current->mm, start); | 540 | vma = find_vma(current->mm, start); |
446 | } | 541 | } |
542 | out_plug: | ||
543 | blk_finish_plug(&plug); | ||
447 | out: | 544 | out: |
448 | if (write) | 545 | if (write) |
449 | up_write(¤t->mm->mmap_sem); | 546 | up_write(¤t->mm->mmap_sem); |
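The plug wrapped around the vma loop is the standard block-layer batching idiom: requests submitted between blk_start_plug() and blk_finish_plug() can be merged before dispatch, which matters when MADV_WILLNEED queues many small swap-in reads. A generic sketch of the idiom:

#include <linux/blkdev.h>

static void my_batched_io(void)
{
	struct blk_plug plug;

	blk_start_plug(&plug);
	/* ... issue many small reads here, e.g. readahead or swap-in ... */
	blk_finish_plug(&plug);		/* unplug: flush the batched requests */
}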
diff --git a/mm/memblock.c b/mm/memblock.c index 625905523c2a..1bcd9b970564 100644 --- a/mm/memblock.c +++ b/mm/memblock.c | |||
@@ -92,9 +92,58 @@ static long __init_memblock memblock_overlaps_region(struct memblock_type *type, | |||
92 | * | 92 | * |
93 | * Find @size free area aligned to @align in the specified range and node. | 93 | * Find @size free area aligned to @align in the specified range and node. |
94 | * | 94 | * |
95 | * If we have CONFIG_HAVE_MEMBLOCK_NODE_MAP defined, we need to check that the | ||
96 | * memory we found is not in hotpluggable ranges. | ||
97 | * | ||
95 | * RETURNS: | 98 | * RETURNS: |
96 | * Found address on success, %0 on failure. | 99 | * Found address on success, %0 on failure. |
97 | */ | 100 | */ |
101 | #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP | ||
102 | phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t start, | ||
103 | phys_addr_t end, phys_addr_t size, | ||
104 | phys_addr_t align, int nid) | ||
105 | { | ||
106 | phys_addr_t this_start, this_end, cand; | ||
107 | u64 i; | ||
108 | int curr = movablemem_map.nr_map - 1; | ||
109 | |||
110 | /* pump up @end */ | ||
111 | if (end == MEMBLOCK_ALLOC_ACCESSIBLE) | ||
112 | end = memblock.current_limit; | ||
113 | |||
114 | /* avoid allocating the first page */ | ||
115 | start = max_t(phys_addr_t, start, PAGE_SIZE); | ||
116 | end = max(start, end); | ||
117 | |||
118 | for_each_free_mem_range_reverse(i, nid, &this_start, &this_end, NULL) { | ||
119 | this_start = clamp(this_start, start, end); | ||
120 | this_end = clamp(this_end, start, end); | ||
121 | |||
122 | restart: | ||
123 | if (this_end <= this_start || this_end < size) | ||
124 | continue; | ||
125 | |||
126 | for (; curr >= 0; curr--) { | ||
127 | if ((movablemem_map.map[curr].start_pfn << PAGE_SHIFT) | ||
128 | < this_end) | ||
129 | break; | ||
130 | } | ||
131 | |||
132 | cand = round_down(this_end - size, align); | ||
133 | if (curr >= 0 && | ||
134 | cand < movablemem_map.map[curr].end_pfn << PAGE_SHIFT) { | ||
135 | this_end = movablemem_map.map[curr].start_pfn | ||
136 | << PAGE_SHIFT; | ||
137 | goto restart; | ||
138 | } | ||
139 | |||
140 | if (cand >= this_start) | ||
141 | return cand; | ||
142 | } | ||
143 | |||
144 | return 0; | ||
145 | } | ||
146 | #else /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ | ||
98 | phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t start, | 147 | phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t start, |
99 | phys_addr_t end, phys_addr_t size, | 148 | phys_addr_t end, phys_addr_t size, |
100 | phys_addr_t align, int nid) | 149 | phys_addr_t align, int nid) |
@@ -123,6 +172,7 @@ phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t start, | |||
123 | } | 172 | } |
124 | return 0; | 173 | return 0; |
125 | } | 174 | } |
175 | #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ | ||
126 | 176 | ||
127 | /** | 177 | /** |
128 | * memblock_find_in_range - find free area in given range | 178 | * memblock_find_in_range - find free area in given range |
@@ -314,7 +364,8 @@ static void __init_memblock memblock_merge_regions(struct memblock_type *type) | |||
314 | } | 364 | } |
315 | 365 | ||
316 | this->size += next->size; | 366 | this->size += next->size; |
317 | memmove(next, next + 1, (type->cnt - (i + 1)) * sizeof(*next)); | 367 | /* move forward from next + 1, index of which is i + 2 */ |
368 | memmove(next, next + 1, (type->cnt - (i + 2)) * sizeof(*next)); | ||
318 | type->cnt--; | 369 | type->cnt--; |
319 | } | 370 | } |
320 | } | 371 | } |
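The corrected memmove() count is easiest to check with concrete numbers; a hypothetical walk-through:

/*
 * Suppose type->cnt == 5 and region i == 1 merges with next == &regions[2].
 * After the merge only regions[3] and regions[4] still have to slide down
 * one slot, i.e. type->cnt - (i + 2) == 5 - 3 == 2 entries.  The old count,
 * type->cnt - (i + 1) == 3, also copied the stale slot regions[5], one past
 * the last live entry.
 */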
@@ -827,6 +878,23 @@ phys_addr_t __init memblock_phys_mem_size(void) | |||
827 | return memblock.memory.total_size; | 878 | return memblock.memory.total_size; |
828 | } | 879 | } |
829 | 880 | ||
881 | phys_addr_t __init memblock_mem_size(unsigned long limit_pfn) | ||
882 | { | ||
883 | unsigned long pages = 0; | ||
884 | struct memblock_region *r; | ||
885 | unsigned long start_pfn, end_pfn; | ||
886 | |||
887 | for_each_memblock(memory, r) { | ||
888 | start_pfn = memblock_region_memory_base_pfn(r); | ||
889 | end_pfn = memblock_region_memory_end_pfn(r); | ||
890 | start_pfn = min_t(unsigned long, start_pfn, limit_pfn); | ||
891 | end_pfn = min_t(unsigned long, end_pfn, limit_pfn); | ||
892 | pages += end_pfn - start_pfn; | ||
893 | } | ||
894 | |||
895 | return (phys_addr_t)pages << PAGE_SHIFT; | ||
896 | } | ||
897 | |||
830 | /* lowest address */ | 898 | /* lowest address */ |
831 | phys_addr_t __init_memblock memblock_start_of_DRAM(void) | 899 | phys_addr_t __init_memblock memblock_start_of_DRAM(void) |
832 | { | 900 | { |
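memblock_mem_size() added above clamps each region to limit_pfn before summing pages; a quick worked example with made-up numbers:

/*
 * A region spanning pfns [0x100, 0x300) with limit_pfn == 0x200 contributes
 * min(0x300, 0x200) - min(0x100, 0x200) == 0x100 pages (1 MB with 4 kB
 * pages); a region that starts above the limit contributes
 * limit_pfn - limit_pfn == 0.
 */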
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 09255ec8159c..53b8201b31eb 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c | |||
@@ -120,6 +120,14 @@ static const char * const mem_cgroup_events_names[] = { | |||
120 | "pgmajfault", | 120 | "pgmajfault", |
121 | }; | 121 | }; |
122 | 122 | ||
123 | static const char * const mem_cgroup_lru_names[] = { | ||
124 | "inactive_anon", | ||
125 | "active_anon", | ||
126 | "inactive_file", | ||
127 | "active_file", | ||
128 | "unevictable", | ||
129 | }; | ||
130 | |||
123 | /* | 131 | /* |
124 | * Per memcg event counter is incremented at every pagein/pageout. With THP, | 132 | * Per memcg event counter is incremented at every pagein/pageout. With THP, |
125 | * it will be incremated by the number of pages. This counter is used for | 133 | * it will be incremated by the number of pages. This counter is used for |
@@ -172,7 +180,7 @@ struct mem_cgroup_per_node { | |||
172 | }; | 180 | }; |
173 | 181 | ||
174 | struct mem_cgroup_lru_info { | 182 | struct mem_cgroup_lru_info { |
175 | struct mem_cgroup_per_node *nodeinfo[MAX_NUMNODES]; | 183 | struct mem_cgroup_per_node *nodeinfo[0]; |
176 | }; | 184 | }; |
177 | 185 | ||
178 | /* | 186 | /* |
@@ -276,17 +284,6 @@ struct mem_cgroup { | |||
276 | */ | 284 | */ |
277 | struct res_counter kmem; | 285 | struct res_counter kmem; |
278 | /* | 286 | /* |
279 | * Per cgroup active and inactive list, similar to the | ||
280 | * per zone LRU lists. | ||
281 | */ | ||
282 | struct mem_cgroup_lru_info info; | ||
283 | int last_scanned_node; | ||
284 | #if MAX_NUMNODES > 1 | ||
285 | nodemask_t scan_nodes; | ||
286 | atomic_t numainfo_events; | ||
287 | atomic_t numainfo_updating; | ||
288 | #endif | ||
289 | /* | ||
290 | * Should the accounting and control be hierarchical, per subtree? | 287 | * Should the accounting and control be hierarchical, per subtree? |
291 | */ | 288 | */ |
292 | bool use_hierarchy; | 289 | bool use_hierarchy; |
@@ -349,8 +346,29 @@ struct mem_cgroup { | |||
349 | /* Index in the kmem_cache->memcg_params->memcg_caches array */ | 346 | /* Index in the kmem_cache->memcg_params->memcg_caches array */ |
350 | int kmemcg_id; | 347 | int kmemcg_id; |
351 | #endif | 348 | #endif |
349 | |||
350 | int last_scanned_node; | ||
351 | #if MAX_NUMNODES > 1 | ||
352 | nodemask_t scan_nodes; | ||
353 | atomic_t numainfo_events; | ||
354 | atomic_t numainfo_updating; | ||
355 | #endif | ||
356 | /* | ||
357 | * Per cgroup active and inactive list, similar to the | ||
358 | * per zone LRU lists. | ||
359 | * | ||
360 | * WARNING: This has to be the last element of the struct. Don't | ||
361 | * add new fields after this point. | ||
362 | */ | ||
363 | struct mem_cgroup_lru_info info; | ||
352 | }; | 364 | }; |
353 | 365 | ||
366 | static size_t memcg_size(void) | ||
367 | { | ||
368 | return sizeof(struct mem_cgroup) + | ||
369 | nr_node_ids * sizeof(struct mem_cgroup_per_node); | ||
370 | } | ||
371 | |||
354 | /* internal only representation about the status of kmem accounting. */ | 372 | /* internal only representation about the status of kmem accounting. */ |
355 | enum { | 373 | enum { |
356 | KMEM_ACCOUNTED_ACTIVE = 0, /* accounted by this cgroup itself */ | 374 | KMEM_ACCOUNTED_ACTIVE = 0, /* accounted by this cgroup itself */ |
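Moving info to the tail of struct mem_cgroup and sizing allocations with memcg_size() is the classic trailing-array trick: declare a zero-length array as the last member and decide its real length at allocation time from nr_node_ids. A generic sketch with invented names (my_stats, my_obj):

#include <linux/slab.h>

struct my_stats {
	long count;
};

struct my_obj {
	int common;
	struct my_stats node[0];	/* must stay the last member */
};

static struct my_obj *my_obj_alloc(int nr_nodes)
{
	/* one allocation covers the header plus nr_nodes trailing entries */
	return kzalloc(sizeof(struct my_obj) +
		       nr_nodes * sizeof(struct my_stats), GFP_KERNEL);
}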
@@ -398,8 +416,8 @@ static bool memcg_kmem_test_and_clear_dead(struct mem_cgroup *memcg) | |||
398 | 416 | ||
399 | /* Stuffs for move charges at task migration. */ | 417 | /* Stuffs for move charges at task migration. */ |
400 | /* | 418 | /* |
401 | * Types of charges to be moved. "move_charge_at_immigrate" is treated as a | 419 | * Types of charges to be moved. "move_charge_at_immigrate" and |
402 | * left-shifted bitmap of these types. | 420 | * "immigrate_flags" are treated as a left-shifted bitmap of these types. |
403 | */ | 421 | */ |
404 | enum move_type { | 422 | enum move_type { |
405 | MOVE_CHARGE_TYPE_ANON, /* private anonymous page and swap of it */ | 423 | MOVE_CHARGE_TYPE_ANON, /* private anonymous page and swap of it */ |
@@ -412,6 +430,7 @@ static struct move_charge_struct { | |||
412 | spinlock_t lock; /* for from, to */ | 430 | spinlock_t lock; /* for from, to */ |
413 | struct mem_cgroup *from; | 431 | struct mem_cgroup *from; |
414 | struct mem_cgroup *to; | 432 | struct mem_cgroup *to; |
433 | unsigned long immigrate_flags; | ||
415 | unsigned long precharge; | 434 | unsigned long precharge; |
416 | unsigned long moved_charge; | 435 | unsigned long moved_charge; |
417 | unsigned long moved_swap; | 436 | unsigned long moved_swap; |
@@ -424,14 +443,12 @@ static struct move_charge_struct { | |||
424 | 443 | ||
425 | static bool move_anon(void) | 444 | static bool move_anon(void) |
426 | { | 445 | { |
427 | return test_bit(MOVE_CHARGE_TYPE_ANON, | 446 | return test_bit(MOVE_CHARGE_TYPE_ANON, &mc.immigrate_flags); |
428 | &mc.to->move_charge_at_immigrate); | ||
429 | } | 447 | } |
430 | 448 | ||
431 | static bool move_file(void) | 449 | static bool move_file(void) |
432 | { | 450 | { |
433 | return test_bit(MOVE_CHARGE_TYPE_FILE, | 451 | return test_bit(MOVE_CHARGE_TYPE_FILE, &mc.immigrate_flags); |
434 | &mc.to->move_charge_at_immigrate); | ||
435 | } | 452 | } |
436 | 453 | ||
437 | /* | 454 | /* |
@@ -471,6 +488,13 @@ enum res_type { | |||
471 | #define MEM_CGROUP_RECLAIM_SHRINK_BIT 0x1 | 488 | #define MEM_CGROUP_RECLAIM_SHRINK_BIT 0x1 |
472 | #define MEM_CGROUP_RECLAIM_SHRINK (1 << MEM_CGROUP_RECLAIM_SHRINK_BIT) | 489 | #define MEM_CGROUP_RECLAIM_SHRINK (1 << MEM_CGROUP_RECLAIM_SHRINK_BIT) |
473 | 490 | ||
491 | /* | ||
492 | * The memcg_create_mutex will be held whenever a new cgroup is created. | ||
493 | * As a consequence, any change that needs to protect against new child cgroups | ||
494 | * appearing has to hold it as well. | ||
495 | */ | ||
496 | static DEFINE_MUTEX(memcg_create_mutex); | ||
497 | |||
474 | static void mem_cgroup_get(struct mem_cgroup *memcg); | 498 | static void mem_cgroup_get(struct mem_cgroup *memcg); |
475 | static void mem_cgroup_put(struct mem_cgroup *memcg); | 499 | static void mem_cgroup_put(struct mem_cgroup *memcg); |
476 | 500 | ||
@@ -627,6 +651,7 @@ static void drain_all_stock_async(struct mem_cgroup *memcg); | |||
627 | static struct mem_cgroup_per_zone * | 651 | static struct mem_cgroup_per_zone * |
628 | mem_cgroup_zoneinfo(struct mem_cgroup *memcg, int nid, int zid) | 652 | mem_cgroup_zoneinfo(struct mem_cgroup *memcg, int nid, int zid) |
629 | { | 653 | { |
654 | VM_BUG_ON((unsigned)nid >= nr_node_ids); | ||
630 | return &memcg->info.nodeinfo[nid]->zoneinfo[zid]; | 655 | return &memcg->info.nodeinfo[nid]->zoneinfo[zid]; |
631 | } | 656 | } |
632 | 657 | ||
@@ -1371,17 +1396,6 @@ int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec) | |||
1371 | return inactive * inactive_ratio < active; | 1396 | return inactive * inactive_ratio < active; |
1372 | } | 1397 | } |
1373 | 1398 | ||
1374 | int mem_cgroup_inactive_file_is_low(struct lruvec *lruvec) | ||
1375 | { | ||
1376 | unsigned long active; | ||
1377 | unsigned long inactive; | ||
1378 | |||
1379 | inactive = mem_cgroup_get_lru_size(lruvec, LRU_INACTIVE_FILE); | ||
1380 | active = mem_cgroup_get_lru_size(lruvec, LRU_ACTIVE_FILE); | ||
1381 | |||
1382 | return (active > inactive); | ||
1383 | } | ||
1384 | |||
1385 | #define mem_cgroup_from_res_counter(counter, member) \ | 1399 | #define mem_cgroup_from_res_counter(counter, member) \ |
1386 | container_of(counter, struct mem_cgroup, member) | 1400 | container_of(counter, struct mem_cgroup, member) |
1387 | 1401 | ||
@@ -1524,8 +1538,9 @@ static void move_unlock_mem_cgroup(struct mem_cgroup *memcg, | |||
1524 | spin_unlock_irqrestore(&memcg->move_lock, *flags); | 1538 | spin_unlock_irqrestore(&memcg->move_lock, *flags); |
1525 | } | 1539 | } |
1526 | 1540 | ||
1541 | #define K(x) ((x) << (PAGE_SHIFT-10)) | ||
1527 | /** | 1542 | /** |
1528 | * mem_cgroup_print_oom_info: Called from OOM with tasklist_lock held in read mode. | 1543 | * mem_cgroup_print_oom_info: Print OOM information relevant to memory controller. |
1529 | * @memcg: The memory cgroup that went over limit | 1544 | * @memcg: The memory cgroup that went over limit |
1530 | * @p: Task that is going to be killed | 1545 | * @p: Task that is going to be killed |
1531 | * | 1546 | * |
@@ -1543,8 +1558,10 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) | |||
1543 | */ | 1558 | */ |
1544 | static char memcg_name[PATH_MAX]; | 1559 | static char memcg_name[PATH_MAX]; |
1545 | int ret; | 1560 | int ret; |
1561 | struct mem_cgroup *iter; | ||
1562 | unsigned int i; | ||
1546 | 1563 | ||
1547 | if (!memcg || !p) | 1564 | if (!p) |
1548 | return; | 1565 | return; |
1549 | 1566 | ||
1550 | rcu_read_lock(); | 1567 | rcu_read_lock(); |
@@ -1563,7 +1580,7 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) | |||
1563 | } | 1580 | } |
1564 | rcu_read_unlock(); | 1581 | rcu_read_unlock(); |
1565 | 1582 | ||
1566 | printk(KERN_INFO "Task in %s killed", memcg_name); | 1583 | pr_info("Task in %s killed", memcg_name); |
1567 | 1584 | ||
1568 | rcu_read_lock(); | 1585 | rcu_read_lock(); |
1569 | ret = cgroup_path(mem_cgrp, memcg_name, PATH_MAX); | 1586 | ret = cgroup_path(mem_cgrp, memcg_name, PATH_MAX); |
@@ -1576,22 +1593,45 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) | |||
1576 | /* | 1593 | /* |
1577 | * Continues from above, so we don't need a KERN_ level | 1594 | * Continues from above, so we don't need a KERN_ level |
1578 | */ | 1595 | */ |
1579 | printk(KERN_CONT " as a result of limit of %s\n", memcg_name); | 1596 | pr_cont(" as a result of limit of %s\n", memcg_name); |
1580 | done: | 1597 | done: |
1581 | 1598 | ||
1582 | printk(KERN_INFO "memory: usage %llukB, limit %llukB, failcnt %llu\n", | 1599 | pr_info("memory: usage %llukB, limit %llukB, failcnt %llu\n", |
1583 | res_counter_read_u64(&memcg->res, RES_USAGE) >> 10, | 1600 | res_counter_read_u64(&memcg->res, RES_USAGE) >> 10, |
1584 | res_counter_read_u64(&memcg->res, RES_LIMIT) >> 10, | 1601 | res_counter_read_u64(&memcg->res, RES_LIMIT) >> 10, |
1585 | res_counter_read_u64(&memcg->res, RES_FAILCNT)); | 1602 | res_counter_read_u64(&memcg->res, RES_FAILCNT)); |
1586 | printk(KERN_INFO "memory+swap: usage %llukB, limit %llukB, " | 1603 | pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %llu\n", |
1587 | "failcnt %llu\n", | ||
1588 | res_counter_read_u64(&memcg->memsw, RES_USAGE) >> 10, | 1604 | res_counter_read_u64(&memcg->memsw, RES_USAGE) >> 10, |
1589 | res_counter_read_u64(&memcg->memsw, RES_LIMIT) >> 10, | 1605 | res_counter_read_u64(&memcg->memsw, RES_LIMIT) >> 10, |
1590 | res_counter_read_u64(&memcg->memsw, RES_FAILCNT)); | 1606 | res_counter_read_u64(&memcg->memsw, RES_FAILCNT)); |
1591 | printk(KERN_INFO "kmem: usage %llukB, limit %llukB, failcnt %llu\n", | 1607 | pr_info("kmem: usage %llukB, limit %llukB, failcnt %llu\n", |
1592 | res_counter_read_u64(&memcg->kmem, RES_USAGE) >> 10, | 1608 | res_counter_read_u64(&memcg->kmem, RES_USAGE) >> 10, |
1593 | res_counter_read_u64(&memcg->kmem, RES_LIMIT) >> 10, | 1609 | res_counter_read_u64(&memcg->kmem, RES_LIMIT) >> 10, |
1594 | res_counter_read_u64(&memcg->kmem, RES_FAILCNT)); | 1610 | res_counter_read_u64(&memcg->kmem, RES_FAILCNT)); |
1611 | |||
1612 | for_each_mem_cgroup_tree(iter, memcg) { | ||
1613 | pr_info("Memory cgroup stats"); | ||
1614 | |||
1615 | rcu_read_lock(); | ||
1616 | ret = cgroup_path(iter->css.cgroup, memcg_name, PATH_MAX); | ||
1617 | if (!ret) | ||
1618 | pr_cont(" for %s", memcg_name); | ||
1619 | rcu_read_unlock(); | ||
1620 | pr_cont(":"); | ||
1621 | |||
1622 | for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) { | ||
1623 | if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account) | ||
1624 | continue; | ||
1625 | pr_cont(" %s:%ldKB", mem_cgroup_stat_names[i], | ||
1626 | K(mem_cgroup_read_stat(iter, i))); | ||
1627 | } | ||
1628 | |||
1629 | for (i = 0; i < NR_LRU_LISTS; i++) | ||
1630 | pr_cont(" %s:%luKB", mem_cgroup_lru_names[i], | ||
1631 | K(mem_cgroup_nr_lru_pages(iter, BIT(i)))); | ||
1632 | |||
1633 | pr_cont("\n"); | ||
1634 | } | ||
1595 | } | 1635 | } |
1596 | 1636 | ||
1597 | /* | 1637 | /* |
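The K() helper defined above converts page counts to kilobytes for the new per-memcg OOM dump; a quick worked example assuming 4 kB pages:

/*
 * With PAGE_SHIFT == 12, K(x) == x << (12 - 10) == x * 4,
 * so an LRU list of 300 pages is printed as "1200KB".
 */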
@@ -2256,6 +2296,17 @@ static void drain_local_stock(struct work_struct *dummy) | |||
2256 | clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags); | 2296 | clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags); |
2257 | } | 2297 | } |
2258 | 2298 | ||
2299 | static void __init memcg_stock_init(void) | ||
2300 | { | ||
2301 | int cpu; | ||
2302 | |||
2303 | for_each_possible_cpu(cpu) { | ||
2304 | struct memcg_stock_pcp *stock = | ||
2305 | &per_cpu(memcg_stock, cpu); | ||
2306 | INIT_WORK(&stock->work, drain_local_stock); | ||
2307 | } | ||
2308 | } | ||
2309 | |||
2259 | /* | 2310 | /* |
2260 | * Cache charges(val) which is from res_counter, to local per_cpu area. | 2311 | * Cache charges(val) which is from res_counter, to local per_cpu area. |
2261 | * This will be consumed by consume_stock() function, later. | 2312 | * This will be consumed by consume_stock() function, later. |
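memcg_stock_init() pulls the per-cpu work initialisation out of css_alloc and into a boot-time helper. The underlying idiom, sketched with invented names (my_pcp, my_work_fn):

#include <linux/cpumask.h>
#include <linux/init.h>
#include <linux/percpu.h>
#include <linux/workqueue.h>

struct my_pcp {
	struct work_struct work;
};

static DEFINE_PER_CPU(struct my_pcp, my_pcp);

static void my_work_fn(struct work_struct *work)
{
	/* deferred per-cpu work goes here */
}

static void __init my_pcp_init(void)
{
	int cpu;

	for_each_possible_cpu(cpu)
		INIT_WORK(&per_cpu(my_pcp, cpu).work, my_work_fn);
}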
@@ -3030,7 +3081,9 @@ int memcg_register_cache(struct mem_cgroup *memcg, struct kmem_cache *s, | |||
3030 | if (memcg) { | 3081 | if (memcg) { |
3031 | s->memcg_params->memcg = memcg; | 3082 | s->memcg_params->memcg = memcg; |
3032 | s->memcg_params->root_cache = root_cache; | 3083 | s->memcg_params->root_cache = root_cache; |
3033 | } | 3084 | } else |
3085 | s->memcg_params->is_root_cache = true; | ||
3086 | |||
3034 | return 0; | 3087 | return 0; |
3035 | } | 3088 | } |
3036 | 3089 | ||
@@ -4389,8 +4442,8 @@ void mem_cgroup_print_bad_page(struct page *page) | |||
4389 | 4442 | ||
4390 | pc = lookup_page_cgroup_used(page); | 4443 | pc = lookup_page_cgroup_used(page); |
4391 | if (pc) { | 4444 | if (pc) { |
4392 | printk(KERN_ALERT "pc:%p pc->flags:%lx pc->mem_cgroup:%p\n", | 4445 | pr_alert("pc:%p pc->flags:%lx pc->mem_cgroup:%p\n", |
4393 | pc, pc->flags, pc->mem_cgroup); | 4446 | pc, pc->flags, pc->mem_cgroup); |
4394 | } | 4447 | } |
4395 | } | 4448 | } |
4396 | #endif | 4449 | #endif |
@@ -4717,6 +4770,33 @@ static void mem_cgroup_reparent_charges(struct mem_cgroup *memcg) | |||
4717 | } | 4770 | } |
4718 | 4771 | ||
4719 | /* | 4772 | /* |
4773 | * This mainly exists for tests during the setting of use_hierarchy. | ||
4774 | * Since this is the very setting we are changing, the current hierarchy value | ||
4775 | * is meaningless | ||
4776 | */ | ||
4777 | static inline bool __memcg_has_children(struct mem_cgroup *memcg) | ||
4778 | { | ||
4779 | struct cgroup *pos; | ||
4780 | |||
4781 | /* bounce at first found */ | ||
4782 | cgroup_for_each_child(pos, memcg->css.cgroup) | ||
4783 | return true; | ||
4784 | return false; | ||
4785 | } | ||
4786 | |||
4787 | /* | ||
4788 | * Must be called with memcg_create_mutex held, unless the cgroup is guaranteed | ||
4789 | * to be already dead (as in mem_cgroup_force_empty, for instance). This is | ||
4790 | * different from mem_cgroup_count_children(), in the sense that we don't really care how | ||
4791 | * many children we have; we only need to know if we have any. It also counts | ||
4792 | * any memcg without hierarchy as infertile. | ||
4793 | */ | ||
4794 | static inline bool memcg_has_children(struct mem_cgroup *memcg) | ||
4795 | { | ||
4796 | return memcg->use_hierarchy && __memcg_has_children(memcg); | ||
4797 | } | ||
4798 | |||
4799 | /* | ||
4720 | * Reclaims as many pages from the given memcg as possible and moves | 4800 | * Reclaims as many pages from the given memcg as possible and moves |
4721 | * the rest to the parent. | 4801 | * the rest to the parent. |
4722 | * | 4802 | * |
@@ -4786,7 +4866,7 @@ static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft, | |||
4786 | if (parent) | 4866 | if (parent) |
4787 | parent_memcg = mem_cgroup_from_cont(parent); | 4867 | parent_memcg = mem_cgroup_from_cont(parent); |
4788 | 4868 | ||
4789 | cgroup_lock(); | 4869 | mutex_lock(&memcg_create_mutex); |
4790 | 4870 | ||
4791 | if (memcg->use_hierarchy == val) | 4871 | if (memcg->use_hierarchy == val) |
4792 | goto out; | 4872 | goto out; |
@@ -4801,7 +4881,7 @@ static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft, | |||
4801 | */ | 4881 | */ |
4802 | if ((!parent_memcg || !parent_memcg->use_hierarchy) && | 4882 | if ((!parent_memcg || !parent_memcg->use_hierarchy) && |
4803 | (val == 1 || val == 0)) { | 4883 | (val == 1 || val == 0)) { |
4804 | if (list_empty(&cont->children)) | 4884 | if (!__memcg_has_children(memcg)) |
4805 | memcg->use_hierarchy = val; | 4885 | memcg->use_hierarchy = val; |
4806 | else | 4886 | else |
4807 | retval = -EBUSY; | 4887 | retval = -EBUSY; |
@@ -4809,7 +4889,7 @@ static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft, | |||
4809 | retval = -EINVAL; | 4889 | retval = -EINVAL; |
4810 | 4890 | ||
4811 | out: | 4891 | out: |
4812 | cgroup_unlock(); | 4892 | mutex_unlock(&memcg_create_mutex); |
4813 | 4893 | ||
4814 | return retval; | 4894 | return retval; |
4815 | } | 4895 | } |
@@ -4894,8 +4974,6 @@ static int memcg_update_kmem_limit(struct cgroup *cont, u64 val) | |||
4894 | { | 4974 | { |
4895 | int ret = -EINVAL; | 4975 | int ret = -EINVAL; |
4896 | #ifdef CONFIG_MEMCG_KMEM | 4976 | #ifdef CONFIG_MEMCG_KMEM |
4897 | bool must_inc_static_branch = false; | ||
4898 | |||
4899 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); | 4977 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); |
4900 | /* | 4978 | /* |
4901 | * For simplicity, we won't allow this to be disabled. It also can't | 4979 | * For simplicity, we won't allow this to be disabled. It also can't |
@@ -4908,18 +4986,11 @@ static int memcg_update_kmem_limit(struct cgroup *cont, u64 val) | |||
4908 | * | 4986 | * |
4909 | * After it first became limited, changes in the value of the limit are | 4987 | * After it first became limited, changes in the value of the limit are |
4910 | * of course permitted. | 4988 | * of course permitted. |
4911 | * | ||
4912 | * Taking the cgroup_lock is really offensive, but it is so far the only | ||
4913 | * way to guarantee that no children will appear. There are plenty of | ||
4914 | * other offenders, and they should all go away. Fine grained locking | ||
4915 | * is probably the way to go here. When we are fully hierarchical, we | ||
4916 | * can also get rid of the use_hierarchy check. | ||
4917 | */ | 4989 | */ |
4918 | cgroup_lock(); | 4990 | mutex_lock(&memcg_create_mutex); |
4919 | mutex_lock(&set_limit_mutex); | 4991 | mutex_lock(&set_limit_mutex); |
4920 | if (!memcg->kmem_account_flags && val != RESOURCE_MAX) { | 4992 | if (!memcg->kmem_account_flags && val != RESOURCE_MAX) { |
4921 | if (cgroup_task_count(cont) || (memcg->use_hierarchy && | 4993 | if (cgroup_task_count(cont) || memcg_has_children(memcg)) { |
4922 | !list_empty(&cont->children))) { | ||
4923 | ret = -EBUSY; | 4994 | ret = -EBUSY; |
4924 | goto out; | 4995 | goto out; |
4925 | } | 4996 | } |
@@ -4931,7 +5002,13 @@ static int memcg_update_kmem_limit(struct cgroup *cont, u64 val) | |||
4931 | res_counter_set_limit(&memcg->kmem, RESOURCE_MAX); | 5002 | res_counter_set_limit(&memcg->kmem, RESOURCE_MAX); |
4932 | goto out; | 5003 | goto out; |
4933 | } | 5004 | } |
4934 | must_inc_static_branch = true; | 5005 | static_key_slow_inc(&memcg_kmem_enabled_key); |
5006 | /* | ||
5007 | * setting the active bit after the inc will guarantee no one | ||
5008 | * starts accounting before all call sites are patched | ||
5009 | */ | ||
5010 | memcg_kmem_set_active(memcg); | ||
5011 | |||
4935 | /* | 5012 | /* |
4936 | * kmem charges can outlive the cgroup. In the case of slab | 5013 | * kmem charges can outlive the cgroup. In the case of slab |
4937 | * pages, for instance, a page contain objects from various | 5014 | * pages, for instance, a page contain objects from various |
@@ -4943,32 +5020,12 @@ static int memcg_update_kmem_limit(struct cgroup *cont, u64 val) | |||
4943 | ret = res_counter_set_limit(&memcg->kmem, val); | 5020 | ret = res_counter_set_limit(&memcg->kmem, val); |
4944 | out: | 5021 | out: |
4945 | mutex_unlock(&set_limit_mutex); | 5022 | mutex_unlock(&set_limit_mutex); |
4946 | cgroup_unlock(); | 5023 | mutex_unlock(&memcg_create_mutex); |
4947 | |||
4948 | /* | ||
4949 | * We are by now familiar with the fact that we can't inc the static | ||
4950 | * branch inside cgroup_lock. See disarm functions for details. A | ||
4951 | * worker here is overkill, but also wrong: After the limit is set, we | ||
4952 | * must start accounting right away. Since this operation can't fail, | ||
4953 | * we can safely defer it to here - no rollback will be needed. | ||
4954 | * | ||
4955 | * The boolean used to control this is also safe, because | ||
4956 | * KMEM_ACCOUNTED_ACTIVATED guarantees that only one process will be | ||
4957 | * able to set it to true; | ||
4958 | */ | ||
4959 | if (must_inc_static_branch) { | ||
4960 | static_key_slow_inc(&memcg_kmem_enabled_key); | ||
4961 | /* | ||
4962 | * setting the active bit after the inc will guarantee no one | ||
4963 | * starts accounting before all call sites are patched | ||
4964 | */ | ||
4965 | memcg_kmem_set_active(memcg); | ||
4966 | } | ||
4967 | |||
4968 | #endif | 5024 | #endif |
4969 | return ret; | 5025 | return ret; |
4970 | } | 5026 | } |
4971 | 5027 | ||
5028 | #ifdef CONFIG_MEMCG_KMEM | ||
4972 | static int memcg_propagate_kmem(struct mem_cgroup *memcg) | 5029 | static int memcg_propagate_kmem(struct mem_cgroup *memcg) |
4973 | { | 5030 | { |
4974 | int ret = 0; | 5031 | int ret = 0; |
@@ -4977,7 +5034,6 @@ static int memcg_propagate_kmem(struct mem_cgroup *memcg) | |||
4977 | goto out; | 5034 | goto out; |
4978 | 5035 | ||
4979 | memcg->kmem_account_flags = parent->kmem_account_flags; | 5036 | memcg->kmem_account_flags = parent->kmem_account_flags; |
4980 | #ifdef CONFIG_MEMCG_KMEM | ||
4981 | /* | 5037 | /* |
4982 | * When that happens, we need to disable the static branch only on those | 5038 | * When that happens, we need to disable the static branch only on those |
4983 | * memcgs that enabled it. To achieve this, we would be forced to | 5039 | * memcgs that enabled it. To achieve this, we would be forced to |
@@ -5003,10 +5059,10 @@ static int memcg_propagate_kmem(struct mem_cgroup *memcg) | |||
5003 | mutex_lock(&set_limit_mutex); | 5059 | mutex_lock(&set_limit_mutex); |
5004 | ret = memcg_update_cache_sizes(memcg); | 5060 | ret = memcg_update_cache_sizes(memcg); |
5005 | mutex_unlock(&set_limit_mutex); | 5061 | mutex_unlock(&set_limit_mutex); |
5006 | #endif | ||
5007 | out: | 5062 | out: |
5008 | return ret; | 5063 | return ret; |
5009 | } | 5064 | } |
5065 | #endif /* CONFIG_MEMCG_KMEM */ | ||
5010 | 5066 | ||
5011 | /* | 5067 | /* |
5012 | * The user of this function is... | 5068 | * The user of this function is... |
@@ -5146,15 +5202,14 @@ static int mem_cgroup_move_charge_write(struct cgroup *cgrp, | |||
5146 | 5202 | ||
5147 | if (val >= (1 << NR_MOVE_TYPE)) | 5203 | if (val >= (1 << NR_MOVE_TYPE)) |
5148 | return -EINVAL; | 5204 | return -EINVAL; |
5205 | |||
5149 | /* | 5206 | /* |
5150 | * We check this value several times in both in can_attach() and | 5207 | * No kind of locking is needed in here, because ->can_attach() will |
5151 | * attach(), so we need cgroup lock to prevent this value from being | 5208 | * check this value once in the beginning of the process, and then carry |
5152 | * inconsistent. | 5209 | * on with stale data. This means that changes to this value will only |
5210 | * affect task migrations starting after the change. | ||
5153 | */ | 5211 | */ |
5154 | cgroup_lock(); | ||
5155 | memcg->move_charge_at_immigrate = val; | 5212 | memcg->move_charge_at_immigrate = val; |
5156 | cgroup_unlock(); | ||
5157 | |||
5158 | return 0; | 5213 | return 0; |
5159 | } | 5214 | } |
5160 | #else | 5215 | #else |
@@ -5212,14 +5267,6 @@ static int memcg_numa_stat_show(struct cgroup *cont, struct cftype *cft, | |||
5212 | } | 5267 | } |
5213 | #endif /* CONFIG_NUMA */ | 5268 | #endif /* CONFIG_NUMA */ |
5214 | 5269 | ||
5215 | static const char * const mem_cgroup_lru_names[] = { | ||
5216 | "inactive_anon", | ||
5217 | "active_anon", | ||
5218 | "inactive_file", | ||
5219 | "active_file", | ||
5220 | "unevictable", | ||
5221 | }; | ||
5222 | |||
5223 | static inline void mem_cgroup_lru_names_not_uptodate(void) | 5270 | static inline void mem_cgroup_lru_names_not_uptodate(void) |
5224 | { | 5271 | { |
5225 | BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS); | 5272 | BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS); |
@@ -5333,18 +5380,17 @@ static int mem_cgroup_swappiness_write(struct cgroup *cgrp, struct cftype *cft, | |||
5333 | 5380 | ||
5334 | parent = mem_cgroup_from_cont(cgrp->parent); | 5381 | parent = mem_cgroup_from_cont(cgrp->parent); |
5335 | 5382 | ||
5336 | cgroup_lock(); | 5383 | mutex_lock(&memcg_create_mutex); |
5337 | 5384 | ||
5338 | /* If under hierarchy, only empty-root can set this value */ | 5385 | /* If under hierarchy, only empty-root can set this value */ |
5339 | if ((parent->use_hierarchy) || | 5386 | if ((parent->use_hierarchy) || memcg_has_children(memcg)) { |
5340 | (memcg->use_hierarchy && !list_empty(&cgrp->children))) { | 5387 | mutex_unlock(&memcg_create_mutex); |
5341 | cgroup_unlock(); | ||
5342 | return -EINVAL; | 5388 | return -EINVAL; |
5343 | } | 5389 | } |
5344 | 5390 | ||
5345 | memcg->swappiness = val; | 5391 | memcg->swappiness = val; |
5346 | 5392 | ||
5347 | cgroup_unlock(); | 5393 | mutex_unlock(&memcg_create_mutex); |
5348 | 5394 | ||
5349 | return 0; | 5395 | return 0; |
5350 | } | 5396 | } |
@@ -5670,17 +5716,16 @@ static int mem_cgroup_oom_control_write(struct cgroup *cgrp, | |||
5670 | 5716 | ||
5671 | parent = mem_cgroup_from_cont(cgrp->parent); | 5717 | parent = mem_cgroup_from_cont(cgrp->parent); |
5672 | 5718 | ||
5673 | cgroup_lock(); | 5719 | mutex_lock(&memcg_create_mutex); |
5674 | /* oom-kill-disable is a flag for subhierarchy. */ | 5720 | /* oom-kill-disable is a flag for subhierarchy. */ |
5675 | if ((parent->use_hierarchy) || | 5721 | if ((parent->use_hierarchy) || memcg_has_children(memcg)) { |
5676 | (memcg->use_hierarchy && !list_empty(&cgrp->children))) { | 5722 | mutex_unlock(&memcg_create_mutex); |
5677 | cgroup_unlock(); | ||
5678 | return -EINVAL; | 5723 | return -EINVAL; |
5679 | } | 5724 | } |
5680 | memcg->oom_kill_disable = val; | 5725 | memcg->oom_kill_disable = val; |
5681 | if (!val) | 5726 | if (!val) |
5682 | memcg_oom_recover(memcg); | 5727 | memcg_oom_recover(memcg); |
5683 | cgroup_unlock(); | 5728 | mutex_unlock(&memcg_create_mutex); |
5684 | return 0; | 5729 | return 0; |
5685 | } | 5730 | } |
5686 | 5731 | ||
@@ -5795,33 +5840,6 @@ static struct cftype mem_cgroup_files[] = { | |||
5795 | .read_seq_string = memcg_numa_stat_show, | 5840 | .read_seq_string = memcg_numa_stat_show, |
5796 | }, | 5841 | }, |
5797 | #endif | 5842 | #endif |
5798 | #ifdef CONFIG_MEMCG_SWAP | ||
5799 | { | ||
5800 | .name = "memsw.usage_in_bytes", | ||
5801 | .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE), | ||
5802 | .read = mem_cgroup_read, | ||
5803 | .register_event = mem_cgroup_usage_register_event, | ||
5804 | .unregister_event = mem_cgroup_usage_unregister_event, | ||
5805 | }, | ||
5806 | { | ||
5807 | .name = "memsw.max_usage_in_bytes", | ||
5808 | .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE), | ||
5809 | .trigger = mem_cgroup_reset, | ||
5810 | .read = mem_cgroup_read, | ||
5811 | }, | ||
5812 | { | ||
5813 | .name = "memsw.limit_in_bytes", | ||
5814 | .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT), | ||
5815 | .write_string = mem_cgroup_write, | ||
5816 | .read = mem_cgroup_read, | ||
5817 | }, | ||
5818 | { | ||
5819 | .name = "memsw.failcnt", | ||
5820 | .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT), | ||
5821 | .trigger = mem_cgroup_reset, | ||
5822 | .read = mem_cgroup_read, | ||
5823 | }, | ||
5824 | #endif | ||
5825 | #ifdef CONFIG_MEMCG_KMEM | 5843 | #ifdef CONFIG_MEMCG_KMEM |
5826 | { | 5844 | { |
5827 | .name = "kmem.limit_in_bytes", | 5845 | .name = "kmem.limit_in_bytes", |
@@ -5856,6 +5874,36 @@ static struct cftype mem_cgroup_files[] = { | |||
5856 | { }, /* terminate */ | 5874 | { }, /* terminate */ |
5857 | }; | 5875 | }; |
5858 | 5876 | ||
5877 | #ifdef CONFIG_MEMCG_SWAP | ||
5878 | static struct cftype memsw_cgroup_files[] = { | ||
5879 | { | ||
5880 | .name = "memsw.usage_in_bytes", | ||
5881 | .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE), | ||
5882 | .read = mem_cgroup_read, | ||
5883 | .register_event = mem_cgroup_usage_register_event, | ||
5884 | .unregister_event = mem_cgroup_usage_unregister_event, | ||
5885 | }, | ||
5886 | { | ||
5887 | .name = "memsw.max_usage_in_bytes", | ||
5888 | .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE), | ||
5889 | .trigger = mem_cgroup_reset, | ||
5890 | .read = mem_cgroup_read, | ||
5891 | }, | ||
5892 | { | ||
5893 | .name = "memsw.limit_in_bytes", | ||
5894 | .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT), | ||
5895 | .write_string = mem_cgroup_write, | ||
5896 | .read = mem_cgroup_read, | ||
5897 | }, | ||
5898 | { | ||
5899 | .name = "memsw.failcnt", | ||
5900 | .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT), | ||
5901 | .trigger = mem_cgroup_reset, | ||
5902 | .read = mem_cgroup_read, | ||
5903 | }, | ||
5904 | { }, /* terminate */ | ||
5905 | }; | ||
5906 | #endif | ||
5859 | static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node) | 5907 | static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node) |
5860 | { | 5908 | { |
5861 | struct mem_cgroup_per_node *pn; | 5909 | struct mem_cgroup_per_node *pn; |
@@ -5894,9 +5942,9 @@ static void free_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node) | |||
5894 | static struct mem_cgroup *mem_cgroup_alloc(void) | 5942 | static struct mem_cgroup *mem_cgroup_alloc(void) |
5895 | { | 5943 | { |
5896 | struct mem_cgroup *memcg; | 5944 | struct mem_cgroup *memcg; |
5897 | int size = sizeof(struct mem_cgroup); | 5945 | size_t size = memcg_size(); |
5898 | 5946 | ||
5899 | /* Can be very big if MAX_NUMNODES is very big */ | 5947 | /* Can be very big if nr_node_ids is very big */ |
5900 | if (size < PAGE_SIZE) | 5948 | if (size < PAGE_SIZE) |
5901 | memcg = kzalloc(size, GFP_KERNEL); | 5949 | memcg = kzalloc(size, GFP_KERNEL); |
5902 | else | 5950 | else |
@@ -5933,7 +5981,7 @@ out_free: | |||
5933 | static void __mem_cgroup_free(struct mem_cgroup *memcg) | 5981 | static void __mem_cgroup_free(struct mem_cgroup *memcg) |
5934 | { | 5982 | { |
5935 | int node; | 5983 | int node; |
5936 | int size = sizeof(struct mem_cgroup); | 5984 | size_t size = memcg_size(); |
5937 | 5985 | ||
5938 | mem_cgroup_remove_from_trees(memcg); | 5986 | mem_cgroup_remove_from_trees(memcg); |
5939 | free_css_id(&mem_cgroup_subsys, &memcg->css); | 5987 | free_css_id(&mem_cgroup_subsys, &memcg->css); |
@@ -6015,19 +6063,7 @@ struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg) | |||
6015 | } | 6063 | } |
6016 | EXPORT_SYMBOL(parent_mem_cgroup); | 6064 | EXPORT_SYMBOL(parent_mem_cgroup); |
6017 | 6065 | ||
6018 | #ifdef CONFIG_MEMCG_SWAP | 6066 | static void __init mem_cgroup_soft_limit_tree_init(void) |
6019 | static void __init enable_swap_cgroup(void) | ||
6020 | { | ||
6021 | if (!mem_cgroup_disabled() && really_do_swap_account) | ||
6022 | do_swap_account = 1; | ||
6023 | } | ||
6024 | #else | ||
6025 | static void __init enable_swap_cgroup(void) | ||
6026 | { | ||
6027 | } | ||
6028 | #endif | ||
6029 | |||
6030 | static int mem_cgroup_soft_limit_tree_init(void) | ||
6031 | { | 6067 | { |
6032 | struct mem_cgroup_tree_per_node *rtpn; | 6068 | struct mem_cgroup_tree_per_node *rtpn; |
6033 | struct mem_cgroup_tree_per_zone *rtpz; | 6069 | struct mem_cgroup_tree_per_zone *rtpz; |
@@ -6038,8 +6074,7 @@ static int mem_cgroup_soft_limit_tree_init(void) | |||
6038 | if (!node_state(node, N_NORMAL_MEMORY)) | 6074 | if (!node_state(node, N_NORMAL_MEMORY)) |
6039 | tmp = -1; | 6075 | tmp = -1; |
6040 | rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, tmp); | 6076 | rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, tmp); |
6041 | if (!rtpn) | 6077 | BUG_ON(!rtpn); |
6042 | goto err_cleanup; | ||
6043 | 6078 | ||
6044 | soft_limit_tree.rb_tree_per_node[node] = rtpn; | 6079 | soft_limit_tree.rb_tree_per_node[node] = rtpn; |
6045 | 6080 | ||
@@ -6049,23 +6084,12 @@ static int mem_cgroup_soft_limit_tree_init(void) | |||
6049 | spin_lock_init(&rtpz->lock); | 6084 | spin_lock_init(&rtpz->lock); |
6050 | } | 6085 | } |
6051 | } | 6086 | } |
6052 | return 0; | ||
6053 | |||
6054 | err_cleanup: | ||
6055 | for_each_node(node) { | ||
6056 | if (!soft_limit_tree.rb_tree_per_node[node]) | ||
6057 | break; | ||
6058 | kfree(soft_limit_tree.rb_tree_per_node[node]); | ||
6059 | soft_limit_tree.rb_tree_per_node[node] = NULL; | ||
6060 | } | ||
6061 | return 1; | ||
6062 | |||
6063 | } | 6087 | } |
6064 | 6088 | ||
6065 | static struct cgroup_subsys_state * __ref | 6089 | static struct cgroup_subsys_state * __ref |
6066 | mem_cgroup_css_alloc(struct cgroup *cont) | 6090 | mem_cgroup_css_alloc(struct cgroup *cont) |
6067 | { | 6091 | { |
6068 | struct mem_cgroup *memcg, *parent; | 6092 | struct mem_cgroup *memcg; |
6069 | long error = -ENOMEM; | 6093 | long error = -ENOMEM; |
6070 | int node; | 6094 | int node; |
6071 | 6095 | ||
@@ -6079,24 +6103,44 @@ mem_cgroup_css_alloc(struct cgroup *cont) | |||
6079 | 6103 | ||
6080 | /* root ? */ | 6104 | /* root ? */ |
6081 | if (cont->parent == NULL) { | 6105 | if (cont->parent == NULL) { |
6082 | int cpu; | ||
6083 | enable_swap_cgroup(); | ||
6084 | parent = NULL; | ||
6085 | if (mem_cgroup_soft_limit_tree_init()) | ||
6086 | goto free_out; | ||
6087 | root_mem_cgroup = memcg; | 6106 | root_mem_cgroup = memcg; |
6088 | for_each_possible_cpu(cpu) { | 6107 | res_counter_init(&memcg->res, NULL); |
6089 | struct memcg_stock_pcp *stock = | 6108 | res_counter_init(&memcg->memsw, NULL); |
6090 | &per_cpu(memcg_stock, cpu); | 6109 | res_counter_init(&memcg->kmem, NULL); |
6091 | INIT_WORK(&stock->work, drain_local_stock); | ||
6092 | } | ||
6093 | } else { | ||
6094 | parent = mem_cgroup_from_cont(cont->parent); | ||
6095 | memcg->use_hierarchy = parent->use_hierarchy; | ||
6096 | memcg->oom_kill_disable = parent->oom_kill_disable; | ||
6097 | } | 6110 | } |
6098 | 6111 | ||
6099 | if (parent && parent->use_hierarchy) { | 6112 | memcg->last_scanned_node = MAX_NUMNODES; |
6113 | INIT_LIST_HEAD(&memcg->oom_notify); | ||
6114 | atomic_set(&memcg->refcnt, 1); | ||
6115 | memcg->move_charge_at_immigrate = 0; | ||
6116 | mutex_init(&memcg->thresholds_lock); | ||
6117 | spin_lock_init(&memcg->move_lock); | ||
6118 | |||
6119 | return &memcg->css; | ||
6120 | |||
6121 | free_out: | ||
6122 | __mem_cgroup_free(memcg); | ||
6123 | return ERR_PTR(error); | ||
6124 | } | ||
6125 | |||
6126 | static int | ||
6127 | mem_cgroup_css_online(struct cgroup *cont) | ||
6128 | { | ||
6129 | struct mem_cgroup *memcg, *parent; | ||
6130 | int error = 0; | ||
6131 | |||
6132 | if (!cont->parent) | ||
6133 | return 0; | ||
6134 | |||
6135 | mutex_lock(&memcg_create_mutex); | ||
6136 | memcg = mem_cgroup_from_cont(cont); | ||
6137 | parent = mem_cgroup_from_cont(cont->parent); | ||
6138 | |||
6139 | memcg->use_hierarchy = parent->use_hierarchy; | ||
6140 | memcg->oom_kill_disable = parent->oom_kill_disable; | ||
6141 | memcg->swappiness = mem_cgroup_swappiness(parent); | ||
6142 | |||
6143 | if (parent->use_hierarchy) { | ||
6100 | res_counter_init(&memcg->res, &parent->res); | 6144 | res_counter_init(&memcg->res, &parent->res); |
6101 | res_counter_init(&memcg->memsw, &parent->memsw); | 6145 | res_counter_init(&memcg->memsw, &parent->memsw); |
6102 | res_counter_init(&memcg->kmem, &parent->kmem); | 6146 | res_counter_init(&memcg->kmem, &parent->kmem); |
@@ -6117,20 +6161,12 @@ mem_cgroup_css_alloc(struct cgroup *cont) | |||
6117 | * much sense so let cgroup subsystem know about this | 6161 | * much sense so let cgroup subsystem know about this |
6118 | * unfortunate state in our controller. | 6162 | * unfortunate state in our controller. |
6119 | */ | 6163 | */ |
6120 | if (parent && parent != root_mem_cgroup) | 6164 | if (parent != root_mem_cgroup) |
6121 | mem_cgroup_subsys.broken_hierarchy = true; | 6165 | mem_cgroup_subsys.broken_hierarchy = true; |
6122 | } | 6166 | } |
6123 | memcg->last_scanned_node = MAX_NUMNODES; | ||
6124 | INIT_LIST_HEAD(&memcg->oom_notify); | ||
6125 | |||
6126 | if (parent) | ||
6127 | memcg->swappiness = mem_cgroup_swappiness(parent); | ||
6128 | atomic_set(&memcg->refcnt, 1); | ||
6129 | memcg->move_charge_at_immigrate = 0; | ||
6130 | mutex_init(&memcg->thresholds_lock); | ||
6131 | spin_lock_init(&memcg->move_lock); | ||
6132 | 6167 | ||
6133 | error = memcg_init_kmem(memcg, &mem_cgroup_subsys); | 6168 | error = memcg_init_kmem(memcg, &mem_cgroup_subsys); |
6169 | mutex_unlock(&memcg_create_mutex); | ||
6134 | if (error) { | 6170 | if (error) { |
6135 | /* | 6171 | /* |
6136 | * We call put now because our (and parent's) refcnts | 6172 | * We call put now because our (and parent's) refcnts |
@@ -6138,12 +6174,10 @@ mem_cgroup_css_alloc(struct cgroup *cont) | |||
6138 | * call __mem_cgroup_free, so return directly | 6174 | * call __mem_cgroup_free, so return directly |
6139 | */ | 6175 | */ |
6140 | mem_cgroup_put(memcg); | 6176 | mem_cgroup_put(memcg); |
6141 | return ERR_PTR(error); | 6177 | if (parent->use_hierarchy) |
6178 | mem_cgroup_put(parent); | ||
6142 | } | 6179 | } |
6143 | return &memcg->css; | 6180 | return error; |
6144 | free_out: | ||
6145 | __mem_cgroup_free(memcg); | ||
6146 | return ERR_PTR(error); | ||
6147 | } | 6181 | } |
6148 | 6182 | ||
6149 | static void mem_cgroup_css_offline(struct cgroup *cont) | 6183 | static void mem_cgroup_css_offline(struct cgroup *cont) |
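The hunk above splits memcg initialisation into two phases: ->css_alloc() now sets up only state that does not depend on the cgroup's position in the hierarchy, while the new ->css_online() callback, which runs once the parent is known, copies the inherited settings (use_hierarchy, oom_kill_disable, swappiness) and parents the res_counters under memcg_create_mutex. A minimal user-space sketch of the same two-phase pattern; the structures and function names below are illustrative, not the kernel API:

#include <stdlib.h>

struct counter {
    long limit;
    struct counter *parent;
};

struct group {
    struct counter res;
    int use_hierarchy;
    int oom_kill_disable;
};

/* phase 1: allocation; only parent-independent defaults are set */
static struct group *group_alloc(void)
{
    struct group *g = calloc(1, sizeof(*g));

    if (!g)
        return NULL;
    g->res.limit = -1;          /* "unlimited" until onlined */
    return g;
}

/* phase 2: the group has been linked into the tree; inherit from the parent */
static void group_online(struct group *g, struct group *parent)
{
    if (!parent)
        return;                 /* the root keeps its own defaults */
    g->use_hierarchy = parent->use_hierarchy;
    g->oom_kill_disable = parent->oom_kill_disable;
    if (parent->use_hierarchy)
        g->res.parent = &parent->res;
}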
@@ -6279,7 +6313,7 @@ static struct page *mc_handle_swap_pte(struct vm_area_struct *vma, | |||
6279 | * Because lookup_swap_cache() updates some statistics counter, | 6313 | * Because lookup_swap_cache() updates some statistics counter, |
6280 | * we call find_get_page() with swapper_space directly. | 6314 | * we call find_get_page() with swapper_space directly. |
6281 | */ | 6315 | */ |
6282 | page = find_get_page(&swapper_space, ent.val); | 6316 | page = find_get_page(swap_address_space(ent), ent.val); |
6283 | if (do_swap_account) | 6317 | if (do_swap_account) |
6284 | entry->val = ent.val; | 6318 | entry->val = ent.val; |
6285 | 6319 | ||
@@ -6320,7 +6354,7 @@ static struct page *mc_handle_file_pte(struct vm_area_struct *vma, | |||
6320 | swp_entry_t swap = radix_to_swp_entry(page); | 6354 | swp_entry_t swap = radix_to_swp_entry(page); |
6321 | if (do_swap_account) | 6355 | if (do_swap_account) |
6322 | *entry = swap; | 6356 | *entry = swap; |
6323 | page = find_get_page(&swapper_space, swap.val); | 6357 | page = find_get_page(swap_address_space(swap), swap.val); |
6324 | } | 6358 | } |
6325 | #endif | 6359 | #endif |
6326 | return page; | 6360 | return page; |
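Both lookups above stop going through the single global swapper_space and instead ask swap_address_space() for the address_space that caches this particular swap entry, so the swap cache can be spread over more than one mapping. A rough user-space analogy of keying the cache by the entry's type field; the array size, the bit split and all names below are assumptions made for the illustration only:

#define MAX_SWAP_AREAS 32UL             /* illustrative limit */

struct space_like {                     /* stands in for struct address_space */
    void *tree_root;
    /* per-space lock, statistics, ... */
};

static struct space_like swap_spaces[MAX_SWAP_AREAS];

struct entry_like { unsigned long val; };

/* high bits identify the swap area, low bits the offset within it */
static unsigned long area_of(struct entry_like e)
{
    return (e.val >> 26) & (MAX_SWAP_AREAS - 1);
}

static struct space_like *space_for(struct entry_like e)
{
    return &swap_spaces[area_of(e)];    /* lookups then stay per-area */
}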
@@ -6530,8 +6564,15 @@ static int mem_cgroup_can_attach(struct cgroup *cgroup, | |||
6530 | struct task_struct *p = cgroup_taskset_first(tset); | 6564 | struct task_struct *p = cgroup_taskset_first(tset); |
6531 | int ret = 0; | 6565 | int ret = 0; |
6532 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cgroup); | 6566 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cgroup); |
6567 | unsigned long move_charge_at_immigrate; | ||
6533 | 6568 | ||
6534 | if (memcg->move_charge_at_immigrate) { | 6569 | /* |
6570 | * We are now committed to this value, whatever it is. Changes in this | ||

6571 | * tunable will only affect upcoming migrations, not the current one. | ||
6572 | * So we need to save it, and keep it going. | ||
6573 | */ | ||
6574 | move_charge_at_immigrate = memcg->move_charge_at_immigrate; | ||
6575 | if (move_charge_at_immigrate) { | ||
6535 | struct mm_struct *mm; | 6576 | struct mm_struct *mm; |
6536 | struct mem_cgroup *from = mem_cgroup_from_task(p); | 6577 | struct mem_cgroup *from = mem_cgroup_from_task(p); |
6537 | 6578 | ||
@@ -6551,6 +6592,7 @@ static int mem_cgroup_can_attach(struct cgroup *cgroup, | |||
6551 | spin_lock(&mc.lock); | 6592 | spin_lock(&mc.lock); |
6552 | mc.from = from; | 6593 | mc.from = from; |
6553 | mc.to = memcg; | 6594 | mc.to = memcg; |
6595 | mc.immigrate_flags = move_charge_at_immigrate; | ||
6554 | spin_unlock(&mc.lock); | 6596 | spin_unlock(&mc.lock); |
6555 | /* We set mc.moving_task later */ | 6597 | /* We set mc.moving_task later */ |
6556 | 6598 | ||
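mem_cgroup_can_attach() now samples move_charge_at_immigrate once and stores the copy in mc.immigrate_flags, so a write to the tunable while the attach is in flight can only affect later migrations. A small sketch of that snapshot-then-use pattern; the names are illustrative, not the memcg structures:

#include <pthread.h>
#include <stdatomic.h>

static _Atomic unsigned long move_flags;    /* updated from a control file */

struct move_context {
    pthread_mutex_t lock;
    unsigned long immigrate_flags;          /* frozen for one migration */
};

static void move_begin(struct move_context *mc)
{
    /* commit to whatever the tunable says right now */
    unsigned long snap = atomic_load(&move_flags);

    pthread_mutex_lock(&mc->lock);
    mc->immigrate_flags = snap;             /* later steps read only this copy */
    pthread_mutex_unlock(&mc->lock);
}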
@@ -6745,6 +6787,7 @@ struct cgroup_subsys mem_cgroup_subsys = { | |||
6745 | .name = "memory", | 6787 | .name = "memory", |
6746 | .subsys_id = mem_cgroup_subsys_id, | 6788 | .subsys_id = mem_cgroup_subsys_id, |
6747 | .css_alloc = mem_cgroup_css_alloc, | 6789 | .css_alloc = mem_cgroup_css_alloc, |
6790 | .css_online = mem_cgroup_css_online, | ||
6748 | .css_offline = mem_cgroup_css_offline, | 6791 | .css_offline = mem_cgroup_css_offline, |
6749 | .css_free = mem_cgroup_css_free, | 6792 | .css_free = mem_cgroup_css_free, |
6750 | .can_attach = mem_cgroup_can_attach, | 6793 | .can_attach = mem_cgroup_can_attach, |
@@ -6755,19 +6798,6 @@ struct cgroup_subsys mem_cgroup_subsys = { | |||
6755 | .use_id = 1, | 6798 | .use_id = 1, |
6756 | }; | 6799 | }; |
6757 | 6800 | ||
6758 | /* | ||
6759 | * The rest of init is performed during ->css_alloc() for root css which | ||
6760 | * happens before initcalls. hotcpu_notifier() can't be done together as | ||
6761 | * it would introduce circular locking by adding cgroup_lock -> cpu hotplug | ||
6762 | * dependency. Do it from a subsys_initcall(). | ||
6763 | */ | ||
6764 | static int __init mem_cgroup_init(void) | ||
6765 | { | ||
6766 | hotcpu_notifier(memcg_cpu_hotplug_callback, 0); | ||
6767 | return 0; | ||
6768 | } | ||
6769 | subsys_initcall(mem_cgroup_init); | ||
6770 | |||
6771 | #ifdef CONFIG_MEMCG_SWAP | 6801 | #ifdef CONFIG_MEMCG_SWAP |
6772 | static int __init enable_swap_account(char *s) | 6802 | static int __init enable_swap_account(char *s) |
6773 | { | 6803 | { |
@@ -6780,4 +6810,39 @@ static int __init enable_swap_account(char *s) | |||
6780 | } | 6810 | } |
6781 | __setup("swapaccount=", enable_swap_account); | 6811 | __setup("swapaccount=", enable_swap_account); |
6782 | 6812 | ||
6813 | static void __init memsw_file_init(void) | ||
6814 | { | ||
6815 | WARN_ON(cgroup_add_cftypes(&mem_cgroup_subsys, memsw_cgroup_files)); | ||
6816 | } | ||
6817 | |||
6818 | static void __init enable_swap_cgroup(void) | ||
6819 | { | ||
6820 | if (!mem_cgroup_disabled() && really_do_swap_account) { | ||
6821 | do_swap_account = 1; | ||
6822 | memsw_file_init(); | ||
6823 | } | ||
6824 | } | ||
6825 | |||
6826 | #else | ||
6827 | static void __init enable_swap_cgroup(void) | ||
6828 | { | ||
6829 | } | ||
6783 | #endif | 6830 | #endif |
6831 | |||
6832 | /* | ||
6833 | * subsys_initcall() for memory controller. | ||
6834 | * | ||
6835 | * Some parts like hotcpu_notifier() have to be initialized from this context | ||
6836 | * because of lock dependencies (cgroup_lock -> cpu hotplug) but basically | ||
6837 | * everything that doesn't depend on a specific mem_cgroup structure should | ||
6838 | * be initialized from here. | ||
6839 | */ | ||
6840 | static int __init mem_cgroup_init(void) | ||
6841 | { | ||
6842 | hotcpu_notifier(memcg_cpu_hotplug_callback, 0); | ||
6843 | enable_swap_cgroup(); | ||
6844 | mem_cgroup_soft_limit_tree_init(); | ||
6845 | memcg_stock_init(); | ||
6846 | return 0; | ||
6847 | } | ||
6848 | subsys_initcall(mem_cgroup_init); | ||
diff --git a/mm/memory-failure.c b/mm/memory-failure.c index c6e4dd3e1c08..df0694c6adef 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c | |||
@@ -61,7 +61,7 @@ int sysctl_memory_failure_early_kill __read_mostly = 0; | |||
61 | 61 | ||
62 | int sysctl_memory_failure_recovery __read_mostly = 1; | 62 | int sysctl_memory_failure_recovery __read_mostly = 1; |
63 | 63 | ||
64 | atomic_long_t mce_bad_pages __read_mostly = ATOMIC_LONG_INIT(0); | 64 | atomic_long_t num_poisoned_pages __read_mostly = ATOMIC_LONG_INIT(0); |
65 | 65 | ||
66 | #if defined(CONFIG_HWPOISON_INJECT) || defined(CONFIG_HWPOISON_INJECT_MODULE) | 66 | #if defined(CONFIG_HWPOISON_INJECT) || defined(CONFIG_HWPOISON_INJECT_MODULE) |
67 | 67 | ||
@@ -784,12 +784,12 @@ static struct page_state { | |||
784 | { sc|dirty, sc|dirty, "dirty swapcache", me_swapcache_dirty }, | 784 | { sc|dirty, sc|dirty, "dirty swapcache", me_swapcache_dirty }, |
785 | { sc|dirty, sc, "clean swapcache", me_swapcache_clean }, | 785 | { sc|dirty, sc, "clean swapcache", me_swapcache_clean }, |
786 | 786 | ||
787 | { unevict|dirty, unevict|dirty, "dirty unevictable LRU", me_pagecache_dirty }, | ||
788 | { unevict, unevict, "clean unevictable LRU", me_pagecache_clean }, | ||
789 | |||
790 | { mlock|dirty, mlock|dirty, "dirty mlocked LRU", me_pagecache_dirty }, | 787 | { mlock|dirty, mlock|dirty, "dirty mlocked LRU", me_pagecache_dirty }, |
791 | { mlock, mlock, "clean mlocked LRU", me_pagecache_clean }, | 788 | { mlock, mlock, "clean mlocked LRU", me_pagecache_clean }, |
792 | 789 | ||
790 | { unevict|dirty, unevict|dirty, "dirty unevictable LRU", me_pagecache_dirty }, | ||
791 | { unevict, unevict, "clean unevictable LRU", me_pagecache_clean }, | ||
792 | |||
793 | { lru|dirty, lru|dirty, "dirty LRU", me_pagecache_dirty }, | 793 | { lru|dirty, lru|dirty, "dirty LRU", me_pagecache_dirty }, |
794 | { lru|dirty, lru, "clean LRU", me_pagecache_clean }, | 794 | { lru|dirty, lru, "clean LRU", me_pagecache_clean }, |
795 | 795 | ||
@@ -1021,6 +1021,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags) | |||
1021 | struct page *hpage; | 1021 | struct page *hpage; |
1022 | int res; | 1022 | int res; |
1023 | unsigned int nr_pages; | 1023 | unsigned int nr_pages; |
1024 | unsigned long page_flags; | ||
1024 | 1025 | ||
1025 | if (!sysctl_memory_failure_recovery) | 1026 | if (!sysctl_memory_failure_recovery) |
1026 | panic("Memory failure from trap %d on page %lx", trapno, pfn); | 1027 | panic("Memory failure from trap %d on page %lx", trapno, pfn); |
@@ -1039,8 +1040,18 @@ int memory_failure(unsigned long pfn, int trapno, int flags) | |||
1039 | return 0; | 1040 | return 0; |
1040 | } | 1041 | } |
1041 | 1042 | ||
1042 | nr_pages = 1 << compound_trans_order(hpage); | 1043 | /* |
1043 | atomic_long_add(nr_pages, &mce_bad_pages); | 1044 | * Currently errors on hugetlbfs pages are measured in hugepage units, |
1045 | * so nr_pages should be 1 << compound_order. OTOH when errors are on | ||
1046 | * transparent hugepages, they are supposed to be split and error | ||
1047 | * measurement is done in normal page units. So nr_pages should be one | ||
1048 | * in this case. | ||
1049 | */ | ||
1050 | if (PageHuge(p)) | ||
1051 | nr_pages = 1 << compound_order(hpage); | ||
1052 | else /* normal page or thp */ | ||
1053 | nr_pages = 1; | ||
1054 | atomic_long_add(nr_pages, &num_poisoned_pages); | ||
1044 | 1055 | ||
1045 | /* | 1056 | /* |
1046 | * We need/can do nothing about count=0 pages. | 1057 | * We need/can do nothing about count=0 pages. |
@@ -1070,7 +1081,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags) | |||
1070 | if (!PageHWPoison(hpage) | 1081 | if (!PageHWPoison(hpage) |
1071 | || (hwpoison_filter(p) && TestClearPageHWPoison(p)) | 1082 | || (hwpoison_filter(p) && TestClearPageHWPoison(p)) |
1072 | || (p != hpage && TestSetPageHWPoison(hpage))) { | 1083 | || (p != hpage && TestSetPageHWPoison(hpage))) { |
1073 | atomic_long_sub(nr_pages, &mce_bad_pages); | 1084 | atomic_long_sub(nr_pages, &num_poisoned_pages); |
1074 | return 0; | 1085 | return 0; |
1075 | } | 1086 | } |
1076 | set_page_hwpoison_huge_page(hpage); | 1087 | set_page_hwpoison_huge_page(hpage); |
@@ -1119,6 +1130,15 @@ int memory_failure(unsigned long pfn, int trapno, int flags) | |||
1119 | lock_page(hpage); | 1130 | lock_page(hpage); |
1120 | 1131 | ||
1121 | /* | 1132 | /* |
1133 | * We use page flags to determine what action should be taken, but | ||
1134 | * the flags can be modified by the error containment action. One | ||
1135 | * example is an mlocked page, where PG_mlocked is cleared by | ||
1136 | * page_remove_rmap() in try_to_unmap_one(). So to determine page status | ||
1137 | * correctly, we save a copy of the page flags at this time. | ||
1138 | */ | ||
1139 | page_flags = p->flags; | ||
1140 | |||
1141 | /* | ||
1122 | * unpoison always clear PG_hwpoison inside page lock | 1142 | * unpoison always clear PG_hwpoison inside page lock |
1123 | */ | 1143 | */ |
1124 | if (!PageHWPoison(p)) { | 1144 | if (!PageHWPoison(p)) { |
@@ -1128,7 +1148,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags) | |||
1128 | } | 1148 | } |
1129 | if (hwpoison_filter(p)) { | 1149 | if (hwpoison_filter(p)) { |
1130 | if (TestClearPageHWPoison(p)) | 1150 | if (TestClearPageHWPoison(p)) |
1131 | atomic_long_sub(nr_pages, &mce_bad_pages); | 1151 | atomic_long_sub(nr_pages, &num_poisoned_pages); |
1132 | unlock_page(hpage); | 1152 | unlock_page(hpage); |
1133 | put_page(hpage); | 1153 | put_page(hpage); |
1134 | return 0; | 1154 | return 0; |
@@ -1176,12 +1196,19 @@ int memory_failure(unsigned long pfn, int trapno, int flags) | |||
1176 | } | 1196 | } |
1177 | 1197 | ||
1178 | res = -EBUSY; | 1198 | res = -EBUSY; |
1179 | for (ps = error_states;; ps++) { | 1199 | /* |
1180 | if ((p->flags & ps->mask) == ps->res) { | 1200 | * The first check uses the current page flags which may not have any |
1181 | res = page_action(ps, p, pfn); | 1201 | * relevant information. The second check with the saved page flags is |
1202 | * carried out only if the first check can't determine the page status. | ||
1203 | */ | ||
1204 | for (ps = error_states;; ps++) | ||
1205 | if ((p->flags & ps->mask) == ps->res) | ||
1182 | break; | 1206 | break; |
1183 | } | 1207 | if (!ps->mask) |
1184 | } | 1208 | for (ps = error_states;; ps++) |
1209 | if ((page_flags & ps->mask) == ps->res) | ||
1210 | break; | ||
1211 | res = page_action(ps, p, pfn); | ||
1185 | out: | 1212 | out: |
1186 | unlock_page(hpage); | 1213 | unlock_page(hpage); |
1187 | return res; | 1214 | return res; |
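memory_failure() now matches the page against error_states twice: first with the live p->flags and, if that scan only reaches the catch-all entry at the end of the table (mask == 0), again with the page_flags snapshot taken before the containment actions could clear bits such as PG_mlocked. A compact sketch of the two-pass table walk; the state table here is invented for illustration and is not the kernel's:

struct page_state_like {
    unsigned long mask;         /* bits that must be examined */
    unsigned long res;          /* value those bits must have */
    const char *msg;
};

/* the last entry has mask == 0 and therefore matches anything */
static const struct page_state_like states[] = {
    { 0x5, 0x5, "dirty LRU" },
    { 0x1, 0x1, "clean LRU" },
    { 0x0, 0x0, "unknown page state" },
};

static const struct page_state_like *
classify(unsigned long live_flags, unsigned long saved_flags)
{
    const struct page_state_like *ps;

    for (ps = states; ; ps++)
        if ((live_flags & ps->mask) == ps->res)
            break;
    if (!ps->mask)                      /* only the catch-all matched */
        for (ps = states; ; ps++)
            if ((saved_flags & ps->mask) == ps->res)
                break;
    return ps;
}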
@@ -1323,7 +1350,7 @@ int unpoison_memory(unsigned long pfn) | |||
1323 | return 0; | 1350 | return 0; |
1324 | } | 1351 | } |
1325 | if (TestClearPageHWPoison(p)) | 1352 | if (TestClearPageHWPoison(p)) |
1326 | atomic_long_sub(nr_pages, &mce_bad_pages); | 1353 | atomic_long_sub(nr_pages, &num_poisoned_pages); |
1327 | pr_info("MCE: Software-unpoisoned free page %#lx\n", pfn); | 1354 | pr_info("MCE: Software-unpoisoned free page %#lx\n", pfn); |
1328 | return 0; | 1355 | return 0; |
1329 | } | 1356 | } |
@@ -1337,7 +1364,7 @@ int unpoison_memory(unsigned long pfn) | |||
1337 | */ | 1364 | */ |
1338 | if (TestClearPageHWPoison(page)) { | 1365 | if (TestClearPageHWPoison(page)) { |
1339 | pr_info("MCE: Software-unpoisoned page %#lx\n", pfn); | 1366 | pr_info("MCE: Software-unpoisoned page %#lx\n", pfn); |
1340 | atomic_long_sub(nr_pages, &mce_bad_pages); | 1367 | atomic_long_sub(nr_pages, &num_poisoned_pages); |
1341 | freeit = 1; | 1368 | freeit = 1; |
1342 | if (PageHuge(page)) | 1369 | if (PageHuge(page)) |
1343 | clear_page_hwpoison_huge_page(page); | 1370 | clear_page_hwpoison_huge_page(page); |
@@ -1368,7 +1395,7 @@ static struct page *new_page(struct page *p, unsigned long private, int **x) | |||
1368 | * that is not free, and 1 for any other page type. | 1395 | * that is not free, and 1 for any other page type. |
1369 | * For 1 the page is returned with increased page count, otherwise not. | 1396 | * For 1 the page is returned with increased page count, otherwise not. |
1370 | */ | 1397 | */ |
1371 | static int get_any_page(struct page *p, unsigned long pfn, int flags) | 1398 | static int __get_any_page(struct page *p, unsigned long pfn, int flags) |
1372 | { | 1399 | { |
1373 | int ret; | 1400 | int ret; |
1374 | 1401 | ||
@@ -1393,11 +1420,9 @@ static int get_any_page(struct page *p, unsigned long pfn, int flags) | |||
1393 | if (!get_page_unless_zero(compound_head(p))) { | 1420 | if (!get_page_unless_zero(compound_head(p))) { |
1394 | if (PageHuge(p)) { | 1421 | if (PageHuge(p)) { |
1395 | pr_info("%s: %#lx free huge page\n", __func__, pfn); | 1422 | pr_info("%s: %#lx free huge page\n", __func__, pfn); |
1396 | ret = dequeue_hwpoisoned_huge_page(compound_head(p)); | 1423 | ret = 0; |
1397 | } else if (is_free_buddy_page(p)) { | 1424 | } else if (is_free_buddy_page(p)) { |
1398 | pr_info("%s: %#lx free buddy page\n", __func__, pfn); | 1425 | pr_info("%s: %#lx free buddy page\n", __func__, pfn); |
1399 | /* Set hwpoison bit while page is still isolated */ | ||
1400 | SetPageHWPoison(p); | ||
1401 | ret = 0; | 1426 | ret = 0; |
1402 | } else { | 1427 | } else { |
1403 | pr_info("%s: %#lx: unknown zero refcount page type %lx\n", | 1428 | pr_info("%s: %#lx: unknown zero refcount page type %lx\n", |
@@ -1413,43 +1438,68 @@ static int get_any_page(struct page *p, unsigned long pfn, int flags) | |||
1413 | return ret; | 1438 | return ret; |
1414 | } | 1439 | } |
1415 | 1440 | ||
1441 | static int get_any_page(struct page *page, unsigned long pfn, int flags) | ||
1442 | { | ||
1443 | int ret = __get_any_page(page, pfn, flags); | ||
1444 | |||
1445 | if (ret == 1 && !PageHuge(page) && !PageLRU(page)) { | ||
1446 | /* | ||
1447 | * Try to free it. | ||
1448 | */ | ||
1449 | put_page(page); | ||
1450 | shake_page(page, 1); | ||
1451 | |||
1452 | /* | ||
1453 | * Did it turn free? | ||
1454 | */ | ||
1455 | ret = __get_any_page(page, pfn, 0); | ||
1456 | if (!PageLRU(page)) { | ||
1457 | pr_info("soft_offline: %#lx: unknown non LRU page type %lx\n", | ||
1458 | pfn, page->flags); | ||
1459 | return -EIO; | ||
1460 | } | ||
1461 | } | ||
1462 | return ret; | ||
1463 | } | ||
1464 | |||
1416 | static int soft_offline_huge_page(struct page *page, int flags) | 1465 | static int soft_offline_huge_page(struct page *page, int flags) |
1417 | { | 1466 | { |
1418 | int ret; | 1467 | int ret; |
1419 | unsigned long pfn = page_to_pfn(page); | 1468 | unsigned long pfn = page_to_pfn(page); |
1420 | struct page *hpage = compound_head(page); | 1469 | struct page *hpage = compound_head(page); |
1421 | 1470 | ||
1422 | ret = get_any_page(page, pfn, flags); | 1471 | /* |
1423 | if (ret < 0) | 1472 | * This double-check of PageHWPoison is to avoid the race with |
1424 | return ret; | 1473 | * memory_failure(). See also comment in __soft_offline_page(). |
1425 | if (ret == 0) | 1474 | */ |
1426 | goto done; | 1475 | lock_page(hpage); |
1427 | |||
1428 | if (PageHWPoison(hpage)) { | 1476 | if (PageHWPoison(hpage)) { |
1477 | unlock_page(hpage); | ||
1429 | put_page(hpage); | 1478 | put_page(hpage); |
1430 | pr_info("soft offline: %#lx hugepage already poisoned\n", pfn); | 1479 | pr_info("soft offline: %#lx hugepage already poisoned\n", pfn); |
1431 | return -EBUSY; | 1480 | return -EBUSY; |
1432 | } | 1481 | } |
1482 | unlock_page(hpage); | ||
1433 | 1483 | ||
1434 | /* Keep page count to indicate a given hugepage is isolated. */ | 1484 | /* Keep page count to indicate a given hugepage is isolated. */ |
1435 | ret = migrate_huge_page(hpage, new_page, MPOL_MF_MOVE_ALL, false, | 1485 | ret = migrate_huge_page(hpage, new_page, MPOL_MF_MOVE_ALL, |
1436 | MIGRATE_SYNC); | 1486 | MIGRATE_SYNC); |
1437 | put_page(hpage); | 1487 | put_page(hpage); |
1438 | if (ret) { | 1488 | if (ret) { |
1439 | pr_info("soft offline: %#lx: migration failed %d, type %lx\n", | 1489 | pr_info("soft offline: %#lx: migration failed %d, type %lx\n", |
1440 | pfn, ret, page->flags); | 1490 | pfn, ret, page->flags); |
1441 | return ret; | 1491 | } else { |
1442 | } | 1492 | set_page_hwpoison_huge_page(hpage); |
1443 | done: | 1493 | dequeue_hwpoisoned_huge_page(hpage); |
1444 | if (!PageHWPoison(hpage)) | ||
1445 | atomic_long_add(1 << compound_trans_order(hpage), | 1494 | atomic_long_add(1 << compound_trans_order(hpage), |
1446 | &mce_bad_pages); | 1495 | &num_poisoned_pages); |
1447 | set_page_hwpoison_huge_page(hpage); | 1496 | } |
1448 | dequeue_hwpoisoned_huge_page(hpage); | ||
1449 | /* keep elevated page count for bad page */ | 1497 | /* keep elevated page count for bad page */ |
1450 | return ret; | 1498 | return ret; |
1451 | } | 1499 | } |
1452 | 1500 | ||
1501 | static int __soft_offline_page(struct page *page, int flags); | ||
1502 | |||
1453 | /** | 1503 | /** |
1454 | * soft_offline_page - Soft offline a page. | 1504 | * soft_offline_page - Soft offline a page. |
1455 | * @page: page to offline | 1505 | * @page: page to offline |
@@ -1478,9 +1528,11 @@ int soft_offline_page(struct page *page, int flags) | |||
1478 | unsigned long pfn = page_to_pfn(page); | 1528 | unsigned long pfn = page_to_pfn(page); |
1479 | struct page *hpage = compound_trans_head(page); | 1529 | struct page *hpage = compound_trans_head(page); |
1480 | 1530 | ||
1481 | if (PageHuge(page)) | 1531 | if (PageHWPoison(page)) { |
1482 | return soft_offline_huge_page(page, flags); | 1532 | pr_info("soft offline: %#lx page already poisoned\n", pfn); |
1483 | if (PageTransHuge(hpage)) { | 1533 | return -EBUSY; |
1534 | } | ||
1535 | if (!PageHuge(page) && PageTransHuge(hpage)) { | ||
1484 | if (PageAnon(hpage) && unlikely(split_huge_page(hpage))) { | 1536 | if (PageAnon(hpage) && unlikely(split_huge_page(hpage))) { |
1485 | pr_info("soft offline: %#lx: failed to split THP\n", | 1537 | pr_info("soft offline: %#lx: failed to split THP\n", |
1486 | pfn); | 1538 | pfn); |
@@ -1491,47 +1543,45 @@ int soft_offline_page(struct page *page, int flags) | |||
1491 | ret = get_any_page(page, pfn, flags); | 1543 | ret = get_any_page(page, pfn, flags); |
1492 | if (ret < 0) | 1544 | if (ret < 0) |
1493 | return ret; | 1545 | return ret; |
1494 | if (ret == 0) | 1546 | if (ret) { /* for in-use pages */ |
1495 | goto done; | 1547 | if (PageHuge(page)) |
1496 | 1548 | ret = soft_offline_huge_page(page, flags); | |
1497 | /* | 1549 | else |
1498 | * Page cache page we can handle? | 1550 | ret = __soft_offline_page(page, flags); |
1499 | */ | 1551 | } else { /* for free pages */ |
1500 | if (!PageLRU(page)) { | 1552 | if (PageHuge(page)) { |
1501 | /* | 1553 | set_page_hwpoison_huge_page(hpage); |
1502 | * Try to free it. | 1554 | dequeue_hwpoisoned_huge_page(hpage); |
1503 | */ | 1555 | atomic_long_add(1 << compound_trans_order(hpage), |
1504 | put_page(page); | 1556 | &num_poisoned_pages); |
1505 | shake_page(page, 1); | 1557 | } else { |
1506 | 1558 | SetPageHWPoison(page); | |
1507 | /* | 1559 | atomic_long_inc(&num_poisoned_pages); |
1508 | * Did it turn free? | 1560 | } |
1509 | */ | ||
1510 | ret = get_any_page(page, pfn, 0); | ||
1511 | if (ret < 0) | ||
1512 | return ret; | ||
1513 | if (ret == 0) | ||
1514 | goto done; | ||
1515 | } | ||
1516 | if (!PageLRU(page)) { | ||
1517 | pr_info("soft_offline: %#lx: unknown non LRU page type %lx\n", | ||
1518 | pfn, page->flags); | ||
1519 | return -EIO; | ||
1520 | } | 1561 | } |
1562 | /* keep elevated page count for bad page */ | ||
1563 | return ret; | ||
1564 | } | ||
1521 | 1565 | ||
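soft_offline_page() now rejects already-poisoned pages up front and then branches on what get_any_page() reported: an in-use page is handed to soft_offline_huge_page() or __soft_offline_page() for migration, while a page that was already free is simply marked HWPoison and counted, since there is nothing to migrate. The control flow in miniature; the types and predicates are placeholders for the kernel checks, and the poisoning step is collapsed into one place for brevity:

#include <errno.h>
#include <stdbool.h>

enum page_kind { FREE_PAGE, IN_USE_PAGE };

struct fake_page {
    bool hwpoison;
    enum page_kind kind;
};

static int migrate_away(struct fake_page *p)    /* stands in for the migration path */
{
    return 0;                                   /* pretend migration succeeded */
}

static int soft_offline(struct fake_page *p)
{
    if (p->hwpoison)
        return -EBUSY;                          /* already poisoned */

    if (p->kind == IN_USE_PAGE) {
        int ret = migrate_away(p);
        if (ret)
            return ret;                         /* could not evacuate the data */
    }
    /* free page, or in-use page whose contents were migrated away */
    p->hwpoison = true;                         /* SetPageHWPoison(); counter bumped */
    return 0;
}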
1522 | lock_page(page); | 1566 | static int __soft_offline_page(struct page *page, int flags) |
1523 | wait_on_page_writeback(page); | 1567 | { |
1568 | int ret; | ||
1569 | unsigned long pfn = page_to_pfn(page); | ||
1524 | 1570 | ||
1525 | /* | 1571 | /* |
1526 | * Synchronized using the page lock with memory_failure() | 1572 | * Check PageHWPoison again inside page lock because PageHWPoison |
1573 | * is set by memory_failure() outside page lock. Note that | ||
1574 | * memory_failure() also double-checks PageHWPoison inside page lock, | ||
1575 | * so there's no race between soft_offline_page() and memory_failure(). | ||
1527 | */ | 1576 | */ |
1577 | lock_page(page); | ||
1578 | wait_on_page_writeback(page); | ||
1528 | if (PageHWPoison(page)) { | 1579 | if (PageHWPoison(page)) { |
1529 | unlock_page(page); | 1580 | unlock_page(page); |
1530 | put_page(page); | 1581 | put_page(page); |
1531 | pr_info("soft offline: %#lx page already poisoned\n", pfn); | 1582 | pr_info("soft offline: %#lx page already poisoned\n", pfn); |
1532 | return -EBUSY; | 1583 | return -EBUSY; |
1533 | } | 1584 | } |
1534 | |||
1535 | /* | 1585 | /* |
1536 | * Try to invalidate first. This should work for | 1586 | * Try to invalidate first. This should work for |
1537 | * non dirty unmapped page cache pages. | 1587 | * non dirty unmapped page cache pages. |
@@ -1544,9 +1594,10 @@ int soft_offline_page(struct page *page, int flags) | |||
1544 | */ | 1594 | */ |
1545 | if (ret == 1) { | 1595 | if (ret == 1) { |
1546 | put_page(page); | 1596 | put_page(page); |
1547 | ret = 0; | ||
1548 | pr_info("soft_offline: %#lx: invalidated\n", pfn); | 1597 | pr_info("soft_offline: %#lx: invalidated\n", pfn); |
1549 | goto done; | 1598 | SetPageHWPoison(page); |
1599 | atomic_long_inc(&num_poisoned_pages); | ||
1600 | return 0; | ||
1550 | } | 1601 | } |
1551 | 1602 | ||
1552 | /* | 1603 | /* |
@@ -1563,28 +1614,23 @@ int soft_offline_page(struct page *page, int flags) | |||
1563 | if (!ret) { | 1614 | if (!ret) { |
1564 | LIST_HEAD(pagelist); | 1615 | LIST_HEAD(pagelist); |
1565 | inc_zone_page_state(page, NR_ISOLATED_ANON + | 1616 | inc_zone_page_state(page, NR_ISOLATED_ANON + |
1566 | page_is_file_cache(page)); | 1617 | page_is_file_cache(page)); |
1567 | list_add(&page->lru, &pagelist); | 1618 | list_add(&page->lru, &pagelist); |
1568 | ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, | 1619 | ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, |
1569 | false, MIGRATE_SYNC, | 1620 | MIGRATE_SYNC, MR_MEMORY_FAILURE); |
1570 | MR_MEMORY_FAILURE); | ||
1571 | if (ret) { | 1621 | if (ret) { |
1572 | putback_lru_pages(&pagelist); | 1622 | putback_lru_pages(&pagelist); |
1573 | pr_info("soft offline: %#lx: migration failed %d, type %lx\n", | 1623 | pr_info("soft offline: %#lx: migration failed %d, type %lx\n", |
1574 | pfn, ret, page->flags); | 1624 | pfn, ret, page->flags); |
1575 | if (ret > 0) | 1625 | if (ret > 0) |
1576 | ret = -EIO; | 1626 | ret = -EIO; |
1627 | } else { | ||
1628 | SetPageHWPoison(page); | ||
1629 | atomic_long_inc(&num_poisoned_pages); | ||
1577 | } | 1630 | } |
1578 | } else { | 1631 | } else { |
1579 | pr_info("soft offline: %#lx: isolation failed: %d, page count %d, type %lx\n", | 1632 | pr_info("soft offline: %#lx: isolation failed: %d, page count %d, type %lx\n", |
1580 | pfn, ret, page_count(page), page->flags); | 1633 | pfn, ret, page_count(page), page->flags); |
1581 | } | 1634 | } |
1582 | if (ret) | ||
1583 | return ret; | ||
1584 | |||
1585 | done: | ||
1586 | atomic_long_add(1, &mce_bad_pages); | ||
1587 | SetPageHWPoison(page); | ||
1588 | /* keep elevated page count for bad page */ | ||
1589 | return ret; | 1635 | return ret; |
1590 | } | 1636 | } |
diff --git a/mm/memory.c b/mm/memory.c index bb1369f7b9b4..494526ae024a 100644 --- a/mm/memory.c +++ b/mm/memory.c | |||
@@ -69,6 +69,10 @@ | |||
69 | 69 | ||
70 | #include "internal.h" | 70 | #include "internal.h" |
71 | 71 | ||
72 | #ifdef LAST_NID_NOT_IN_PAGE_FLAGS | ||
73 | #warning Unfortunate NUMA and NUMA Balancing config, growing page-frame for last_nid. | ||
74 | #endif | ||
75 | |||
72 | #ifndef CONFIG_NEED_MULTIPLE_NODES | 76 | #ifndef CONFIG_NEED_MULTIPLE_NODES |
73 | /* use the per-pgdat data instead for discontigmem - mbligh */ | 77 | /* use the per-pgdat data instead for discontigmem - mbligh */ |
74 | unsigned long max_mapnr; | 78 | unsigned long max_mapnr; |
@@ -716,7 +720,7 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr, | |||
716 | print_symbol(KERN_ALERT "vma->vm_file->f_op->mmap: %s\n", | 720 | print_symbol(KERN_ALERT "vma->vm_file->f_op->mmap: %s\n", |
717 | (unsigned long)vma->vm_file->f_op->mmap); | 721 | (unsigned long)vma->vm_file->f_op->mmap); |
718 | dump_stack(); | 722 | dump_stack(); |
719 | add_taint(TAINT_BAD_PAGE); | 723 | add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); |
720 | } | 724 | } |
721 | 725 | ||
722 | static inline bool is_cow_mapping(vm_flags_t flags) | 726 | static inline bool is_cow_mapping(vm_flags_t flags) |
@@ -1458,10 +1462,11 @@ int zap_vma_ptes(struct vm_area_struct *vma, unsigned long address, | |||
1458 | EXPORT_SYMBOL_GPL(zap_vma_ptes); | 1462 | EXPORT_SYMBOL_GPL(zap_vma_ptes); |
1459 | 1463 | ||
1460 | /** | 1464 | /** |
1461 | * follow_page - look up a page descriptor from a user-virtual address | 1465 | * follow_page_mask - look up a page descriptor from a user-virtual address |
1462 | * @vma: vm_area_struct mapping @address | 1466 | * @vma: vm_area_struct mapping @address |
1463 | * @address: virtual address to look up | 1467 | * @address: virtual address to look up |
1464 | * @flags: flags modifying lookup behaviour | 1468 | * @flags: flags modifying lookup behaviour |
1469 | * @page_mask: on output, *page_mask is set according to the size of the page | ||
1465 | * | 1470 | * |
1466 | * @flags can have FOLL_ flags set, defined in <linux/mm.h> | 1471 | * @flags can have FOLL_ flags set, defined in <linux/mm.h> |
1467 | * | 1472 | * |
@@ -1469,8 +1474,9 @@ EXPORT_SYMBOL_GPL(zap_vma_ptes); | |||
1469 | * an error pointer if there is a mapping to something not represented | 1474 | * an error pointer if there is a mapping to something not represented |
1470 | * by a page descriptor (see also vm_normal_page()). | 1475 | * by a page descriptor (see also vm_normal_page()). |
1471 | */ | 1476 | */ |
1472 | struct page *follow_page(struct vm_area_struct *vma, unsigned long address, | 1477 | struct page *follow_page_mask(struct vm_area_struct *vma, |
1473 | unsigned int flags) | 1478 | unsigned long address, unsigned int flags, |
1479 | unsigned int *page_mask) | ||
1474 | { | 1480 | { |
1475 | pgd_t *pgd; | 1481 | pgd_t *pgd; |
1476 | pud_t *pud; | 1482 | pud_t *pud; |
@@ -1480,6 +1486,8 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address, | |||
1480 | struct page *page; | 1486 | struct page *page; |
1481 | struct mm_struct *mm = vma->vm_mm; | 1487 | struct mm_struct *mm = vma->vm_mm; |
1482 | 1488 | ||
1489 | *page_mask = 0; | ||
1490 | |||
1483 | page = follow_huge_addr(mm, address, flags & FOLL_WRITE); | 1491 | page = follow_huge_addr(mm, address, flags & FOLL_WRITE); |
1484 | if (!IS_ERR(page)) { | 1492 | if (!IS_ERR(page)) { |
1485 | BUG_ON(flags & FOLL_GET); | 1493 | BUG_ON(flags & FOLL_GET); |
@@ -1526,6 +1534,7 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address, | |||
1526 | page = follow_trans_huge_pmd(vma, address, | 1534 | page = follow_trans_huge_pmd(vma, address, |
1527 | pmd, flags); | 1535 | pmd, flags); |
1528 | spin_unlock(&mm->page_table_lock); | 1536 | spin_unlock(&mm->page_table_lock); |
1537 | *page_mask = HPAGE_PMD_NR - 1; | ||
1529 | goto out; | 1538 | goto out; |
1530 | } | 1539 | } |
1531 | } else | 1540 | } else |
@@ -1539,8 +1548,24 @@ split_fallthrough: | |||
1539 | ptep = pte_offset_map_lock(mm, pmd, address, &ptl); | 1548 | ptep = pte_offset_map_lock(mm, pmd, address, &ptl); |
1540 | 1549 | ||
1541 | pte = *ptep; | 1550 | pte = *ptep; |
1542 | if (!pte_present(pte)) | 1551 | if (!pte_present(pte)) { |
1543 | goto no_page; | 1552 | swp_entry_t entry; |
1553 | /* | ||
1554 | * KSM's break_ksm() relies upon recognizing a ksm page | ||
1555 | * even while it is being migrated, so for that case we | ||
1556 | * need migration_entry_wait(). | ||
1557 | */ | ||
1558 | if (likely(!(flags & FOLL_MIGRATION))) | ||
1559 | goto no_page; | ||
1560 | if (pte_none(pte) || pte_file(pte)) | ||
1561 | goto no_page; | ||
1562 | entry = pte_to_swp_entry(pte); | ||
1563 | if (!is_migration_entry(entry)) | ||
1564 | goto no_page; | ||
1565 | pte_unmap_unlock(ptep, ptl); | ||
1566 | migration_entry_wait(mm, pmd, address); | ||
1567 | goto split_fallthrough; | ||
1568 | } | ||
1544 | if ((flags & FOLL_NUMA) && pte_numa(pte)) | 1569 | if ((flags & FOLL_NUMA) && pte_numa(pte)) |
1545 | goto no_page; | 1570 | goto no_page; |
1546 | if ((flags & FOLL_WRITE) && !pte_write(pte)) | 1571 | if ((flags & FOLL_WRITE) && !pte_write(pte)) |
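The new FOLL_MIGRATION handling keeps KSM's break_ksm() from failing on a page that is merely in transit: when the pte holds a migration entry, follow_page_mask() drops the pte lock, sleeps in migration_entry_wait() until the migration finishes, and retries the lookup from split_fallthrough. The same wait-and-retry shape reduced to a user-space sketch, with a flag plus condition variable standing in for the migration entry:

#include <pthread.h>
#include <stdbool.h>

struct slot {
    pthread_mutex_t lock;
    pthread_cond_t done;
    bool migrating;         /* stands in for "the pte is a migration entry" */
    void *page;
};

static void *lookup(struct slot *s, bool may_wait)
{
    void *page;

    pthread_mutex_lock(&s->lock);
retry:
    if (s->migrating) {
        if (!may_wait) {                    /* the plain lookup just fails */
            pthread_mutex_unlock(&s->lock);
            return NULL;
        }
        /* like migration_entry_wait(): sleep, then look again */
        pthread_cond_wait(&s->done, &s->lock);
        goto retry;
    }
    page = s->page;
    pthread_mutex_unlock(&s->lock);
    return page;
}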
@@ -1673,15 +1698,16 @@ static inline int stack_guard_page(struct vm_area_struct *vma, unsigned long add | |||
1673 | * instead of __get_user_pages. __get_user_pages should be used only if | 1698 | * instead of __get_user_pages. __get_user_pages should be used only if |
1674 | * you need some special @gup_flags. | 1699 | * you need some special @gup_flags. |
1675 | */ | 1700 | */ |
1676 | int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | 1701 | long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, |
1677 | unsigned long start, int nr_pages, unsigned int gup_flags, | 1702 | unsigned long start, unsigned long nr_pages, |
1678 | struct page **pages, struct vm_area_struct **vmas, | 1703 | unsigned int gup_flags, struct page **pages, |
1679 | int *nonblocking) | 1704 | struct vm_area_struct **vmas, int *nonblocking) |
1680 | { | 1705 | { |
1681 | int i; | 1706 | long i; |
1682 | unsigned long vm_flags; | 1707 | unsigned long vm_flags; |
1708 | unsigned int page_mask; | ||
1683 | 1709 | ||
1684 | if (nr_pages <= 0) | 1710 | if (!nr_pages) |
1685 | return 0; | 1711 | return 0; |
1686 | 1712 | ||
1687 | VM_BUG_ON(!!pages != !!(gup_flags & FOLL_GET)); | 1713 | VM_BUG_ON(!!pages != !!(gup_flags & FOLL_GET)); |
@@ -1757,6 +1783,7 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | |||
1757 | get_page(page); | 1783 | get_page(page); |
1758 | } | 1784 | } |
1759 | pte_unmap(pte); | 1785 | pte_unmap(pte); |
1786 | page_mask = 0; | ||
1760 | goto next_page; | 1787 | goto next_page; |
1761 | } | 1788 | } |
1762 | 1789 | ||
@@ -1774,6 +1801,7 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | |||
1774 | do { | 1801 | do { |
1775 | struct page *page; | 1802 | struct page *page; |
1776 | unsigned int foll_flags = gup_flags; | 1803 | unsigned int foll_flags = gup_flags; |
1804 | unsigned int page_increm; | ||
1777 | 1805 | ||
1778 | /* | 1806 | /* |
1779 | * If we have a pending SIGKILL, don't keep faulting | 1807 | * If we have a pending SIGKILL, don't keep faulting |
@@ -1783,7 +1811,8 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | |||
1783 | return i ? i : -ERESTARTSYS; | 1811 | return i ? i : -ERESTARTSYS; |
1784 | 1812 | ||
1785 | cond_resched(); | 1813 | cond_resched(); |
1786 | while (!(page = follow_page(vma, start, foll_flags))) { | 1814 | while (!(page = follow_page_mask(vma, start, |
1815 | foll_flags, &page_mask))) { | ||
1787 | int ret; | 1816 | int ret; |
1788 | unsigned int fault_flags = 0; | 1817 | unsigned int fault_flags = 0; |
1789 | 1818 | ||
@@ -1857,13 +1886,19 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | |||
1857 | 1886 | ||
1858 | flush_anon_page(vma, page, start); | 1887 | flush_anon_page(vma, page, start); |
1859 | flush_dcache_page(page); | 1888 | flush_dcache_page(page); |
1889 | page_mask = 0; | ||
1860 | } | 1890 | } |
1861 | next_page: | 1891 | next_page: |
1862 | if (vmas) | 1892 | if (vmas) { |
1863 | vmas[i] = vma; | 1893 | vmas[i] = vma; |
1864 | i++; | 1894 | page_mask = 0; |
1865 | start += PAGE_SIZE; | 1895 | } |
1866 | nr_pages--; | 1896 | page_increm = 1 + (~(start >> PAGE_SHIFT) & page_mask); |
1897 | if (page_increm > nr_pages) | ||
1898 | page_increm = nr_pages; | ||
1899 | i += page_increm; | ||
1900 | start += page_increm * PAGE_SIZE; | ||
1901 | nr_pages -= page_increm; | ||
1867 | } while (nr_pages && start < vma->vm_end); | 1902 | } while (nr_pages && start < vma->vm_end); |
1868 | } while (nr_pages); | 1903 | } while (nr_pages); |
1869 | return i; | 1904 | return i; |
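The bookkeeping change at the bottom of the loop is what lets __get_user_pages() step over an entire huge page per iteration: follow_page_mask() reports the page's coverage in page_mask (HPAGE_PMD_NR - 1 for a THP, 0 otherwise, and the loop resets it to 0 whenever vmas[] must be filled per base page), and page_increm = 1 + (~(start >> PAGE_SHIFT) & page_mask) is the number of base pages from start up to the end of that huge page. A worked example assuming the usual 4 KiB base pages and 2 MiB THPs, so HPAGE_PMD_NR == 512:

#include <assert.h>
#include <stdio.h>

#define PAGE_SHIFT      12
#define HPAGE_PMD_NR    512UL           /* 2 MiB / 4 KiB */

int main(void)
{
    unsigned long page_mask = HPAGE_PMD_NR - 1;     /* 511 */
    /* a user address 100 base pages into the fourth 2 MiB huge page */
    unsigned long start = (0x200000UL * 3) + (100UL << PAGE_SHIFT);

    unsigned long page_increm =
        1 + (~(start >> PAGE_SHIFT) & page_mask);

    /* 512 - 100 = 412 base pages remain up to the huge page's end */
    assert(page_increm == 412);

    /* with page_mask == 0 (a base page) we still advance by exactly one */
    assert(1 + (~(start >> PAGE_SHIFT) & 0UL) == 1);

    printf("advance by %lu pages\n", page_increm);
    return 0;
}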
@@ -1977,9 +2012,9 @@ int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm, | |||
1977 | * | 2012 | * |
1978 | * See also get_user_pages_fast, for performance critical applications. | 2013 | * See also get_user_pages_fast, for performance critical applications. |
1979 | */ | 2014 | */ |
1980 | int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | 2015 | long get_user_pages(struct task_struct *tsk, struct mm_struct *mm, |
1981 | unsigned long start, int nr_pages, int write, int force, | 2016 | unsigned long start, unsigned long nr_pages, int write, |
1982 | struct page **pages, struct vm_area_struct **vmas) | 2017 | int force, struct page **pages, struct vm_area_struct **vmas) |
1983 | { | 2018 | { |
1984 | int flags = FOLL_TOUCH; | 2019 | int flags = FOLL_TOUCH; |
1985 | 2020 | ||
@@ -2919,7 +2954,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2919 | unsigned int flags, pte_t orig_pte) | 2954 | unsigned int flags, pte_t orig_pte) |
2920 | { | 2955 | { |
2921 | spinlock_t *ptl; | 2956 | spinlock_t *ptl; |
2922 | struct page *page, *swapcache = NULL; | 2957 | struct page *page, *swapcache; |
2923 | swp_entry_t entry; | 2958 | swp_entry_t entry; |
2924 | pte_t pte; | 2959 | pte_t pte; |
2925 | int locked; | 2960 | int locked; |
@@ -2970,9 +3005,11 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2970 | */ | 3005 | */ |
2971 | ret = VM_FAULT_HWPOISON; | 3006 | ret = VM_FAULT_HWPOISON; |
2972 | delayacct_clear_flag(DELAYACCT_PF_SWAPIN); | 3007 | delayacct_clear_flag(DELAYACCT_PF_SWAPIN); |
3008 | swapcache = page; | ||
2973 | goto out_release; | 3009 | goto out_release; |
2974 | } | 3010 | } |
2975 | 3011 | ||
3012 | swapcache = page; | ||
2976 | locked = lock_page_or_retry(page, mm, flags); | 3013 | locked = lock_page_or_retry(page, mm, flags); |
2977 | 3014 | ||
2978 | delayacct_clear_flag(DELAYACCT_PF_SWAPIN); | 3015 | delayacct_clear_flag(DELAYACCT_PF_SWAPIN); |
@@ -2990,16 +3027,11 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2990 | if (unlikely(!PageSwapCache(page) || page_private(page) != entry.val)) | 3027 | if (unlikely(!PageSwapCache(page) || page_private(page) != entry.val)) |
2991 | goto out_page; | 3028 | goto out_page; |
2992 | 3029 | ||
2993 | if (ksm_might_need_to_copy(page, vma, address)) { | 3030 | page = ksm_might_need_to_copy(page, vma, address); |
2994 | swapcache = page; | 3031 | if (unlikely(!page)) { |
2995 | page = ksm_does_need_to_copy(page, vma, address); | 3032 | ret = VM_FAULT_OOM; |
2996 | 3033 | page = swapcache; | |
2997 | if (unlikely(!page)) { | 3034 | goto out_page; |
2998 | ret = VM_FAULT_OOM; | ||
2999 | page = swapcache; | ||
3000 | swapcache = NULL; | ||
3001 | goto out_page; | ||
3002 | } | ||
3003 | } | 3035 | } |
3004 | 3036 | ||
3005 | if (mem_cgroup_try_charge_swapin(mm, page, GFP_KERNEL, &ptr)) { | 3037 | if (mem_cgroup_try_charge_swapin(mm, page, GFP_KERNEL, &ptr)) { |
@@ -3044,7 +3076,10 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
3044 | } | 3076 | } |
3045 | flush_icache_page(vma, page); | 3077 | flush_icache_page(vma, page); |
3046 | set_pte_at(mm, address, page_table, pte); | 3078 | set_pte_at(mm, address, page_table, pte); |
3047 | do_page_add_anon_rmap(page, vma, address, exclusive); | 3079 | if (page == swapcache) |
3080 | do_page_add_anon_rmap(page, vma, address, exclusive); | ||
3081 | else /* ksm created a completely new copy */ | ||
3082 | page_add_new_anon_rmap(page, vma, address); | ||
3048 | /* It's better to call commit-charge after rmap is established */ | 3083 | /* It's better to call commit-charge after rmap is established */ |
3049 | mem_cgroup_commit_charge_swapin(page, ptr); | 3084 | mem_cgroup_commit_charge_swapin(page, ptr); |
3050 | 3085 | ||
@@ -3052,7 +3087,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
3052 | if (vm_swap_full() || (vma->vm_flags & VM_LOCKED) || PageMlocked(page)) | 3087 | if (vm_swap_full() || (vma->vm_flags & VM_LOCKED) || PageMlocked(page)) |
3053 | try_to_free_swap(page); | 3088 | try_to_free_swap(page); |
3054 | unlock_page(page); | 3089 | unlock_page(page); |
3055 | if (swapcache) { | 3090 | if (page != swapcache) { |
3056 | /* | 3091 | /* |
3057 | * Hold the lock to avoid the swap entry to be reused | 3092 | * Hold the lock to avoid the swap entry to be reused |
3058 | * until we take the PT lock for the pte_same() check | 3093 | * until we take the PT lock for the pte_same() check |
@@ -3085,7 +3120,7 @@ out_page: | |||
3085 | unlock_page(page); | 3120 | unlock_page(page); |
3086 | out_release: | 3121 | out_release: |
3087 | page_cache_release(page); | 3122 | page_cache_release(page); |
3088 | if (swapcache) { | 3123 | if (page != swapcache) { |
3089 | unlock_page(swapcache); | 3124 | unlock_page(swapcache); |
3090 | page_cache_release(swapcache); | 3125 | page_cache_release(swapcache); |
3091 | } | 3126 | } |
@@ -3821,30 +3856,6 @@ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address) | |||
3821 | } | 3856 | } |
3822 | #endif /* __PAGETABLE_PMD_FOLDED */ | 3857 | #endif /* __PAGETABLE_PMD_FOLDED */ |
3823 | 3858 | ||
3824 | int make_pages_present(unsigned long addr, unsigned long end) | ||
3825 | { | ||
3826 | int ret, len, write; | ||
3827 | struct vm_area_struct * vma; | ||
3828 | |||
3829 | vma = find_vma(current->mm, addr); | ||
3830 | if (!vma) | ||
3831 | return -ENOMEM; | ||
3832 | /* | ||
3833 | * We want to touch writable mappings with a write fault in order | ||
3834 | * to break COW, except for shared mappings because these don't COW | ||
3835 | * and we would not want to dirty them for nothing. | ||
3836 | */ | ||
3837 | write = (vma->vm_flags & (VM_WRITE | VM_SHARED)) == VM_WRITE; | ||
3838 | BUG_ON(addr >= end); | ||
3839 | BUG_ON(end > vma->vm_end); | ||
3840 | len = DIV_ROUND_UP(end, PAGE_SIZE) - addr/PAGE_SIZE; | ||
3841 | ret = get_user_pages(current, current->mm, addr, | ||
3842 | len, write, 0, NULL, NULL); | ||
3843 | if (ret < 0) | ||
3844 | return ret; | ||
3845 | return ret == len ? 0 : -EFAULT; | ||
3846 | } | ||
3847 | |||
3848 | #if !defined(__HAVE_ARCH_GATE_AREA) | 3859 | #if !defined(__HAVE_ARCH_GATE_AREA) |
3849 | 3860 | ||
3850 | #if defined(AT_SYSINFO_EHDR) | 3861 | #if defined(AT_SYSINFO_EHDR) |
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index d04ed87bfacb..b81a367b9f39 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c | |||
@@ -29,6 +29,7 @@ | |||
29 | #include <linux/suspend.h> | 29 | #include <linux/suspend.h> |
30 | #include <linux/mm_inline.h> | 30 | #include <linux/mm_inline.h> |
31 | #include <linux/firmware-map.h> | 31 | #include <linux/firmware-map.h> |
32 | #include <linux/stop_machine.h> | ||
32 | 33 | ||
33 | #include <asm/tlbflush.h> | 34 | #include <asm/tlbflush.h> |
34 | 35 | ||
@@ -91,9 +92,8 @@ static void release_memory_resource(struct resource *res) | |||
91 | } | 92 | } |
92 | 93 | ||
93 | #ifdef CONFIG_MEMORY_HOTPLUG_SPARSE | 94 | #ifdef CONFIG_MEMORY_HOTPLUG_SPARSE |
94 | #ifndef CONFIG_SPARSEMEM_VMEMMAP | 95 | void get_page_bootmem(unsigned long info, struct page *page, |
95 | static void get_page_bootmem(unsigned long info, struct page *page, | 96 | unsigned long type) |
96 | unsigned long type) | ||
97 | { | 97 | { |
98 | page->lru.next = (struct list_head *) type; | 98 | page->lru.next = (struct list_head *) type; |
99 | SetPagePrivate(page); | 99 | SetPagePrivate(page); |
@@ -124,10 +124,13 @@ void __ref put_page_bootmem(struct page *page) | |||
124 | mutex_lock(&ppb_lock); | 124 | mutex_lock(&ppb_lock); |
125 | __free_pages_bootmem(page, 0); | 125 | __free_pages_bootmem(page, 0); |
126 | mutex_unlock(&ppb_lock); | 126 | mutex_unlock(&ppb_lock); |
127 | totalram_pages++; | ||
127 | } | 128 | } |
128 | 129 | ||
129 | } | 130 | } |
130 | 131 | ||
132 | #ifdef CONFIG_HAVE_BOOTMEM_INFO_NODE | ||
133 | #ifndef CONFIG_SPARSEMEM_VMEMMAP | ||
131 | static void register_page_bootmem_info_section(unsigned long start_pfn) | 134 | static void register_page_bootmem_info_section(unsigned long start_pfn) |
132 | { | 135 | { |
133 | unsigned long *usemap, mapsize, section_nr, i; | 136 | unsigned long *usemap, mapsize, section_nr, i; |
@@ -161,6 +164,32 @@ static void register_page_bootmem_info_section(unsigned long start_pfn) | |||
161 | get_page_bootmem(section_nr, page, MIX_SECTION_INFO); | 164 | get_page_bootmem(section_nr, page, MIX_SECTION_INFO); |
162 | 165 | ||
163 | } | 166 | } |
167 | #else /* CONFIG_SPARSEMEM_VMEMMAP */ | ||
168 | static void register_page_bootmem_info_section(unsigned long start_pfn) | ||
169 | { | ||
170 | unsigned long *usemap, mapsize, section_nr, i; | ||
171 | struct mem_section *ms; | ||
172 | struct page *page, *memmap; | ||
173 | |||
174 | if (!pfn_valid(start_pfn)) | ||
175 | return; | ||
176 | |||
177 | section_nr = pfn_to_section_nr(start_pfn); | ||
178 | ms = __nr_to_section(section_nr); | ||
179 | |||
180 | memmap = sparse_decode_mem_map(ms->section_mem_map, section_nr); | ||
181 | |||
182 | register_page_bootmem_memmap(section_nr, memmap, PAGES_PER_SECTION); | ||
183 | |||
184 | usemap = __nr_to_section(section_nr)->pageblock_flags; | ||
185 | page = virt_to_page(usemap); | ||
186 | |||
187 | mapsize = PAGE_ALIGN(usemap_size()) >> PAGE_SHIFT; | ||
188 | |||
189 | for (i = 0; i < mapsize; i++, page++) | ||
190 | get_page_bootmem(section_nr, page, MIX_SECTION_INFO); | ||
191 | } | ||
192 | #endif /* !CONFIG_SPARSEMEM_VMEMMAP */ | ||
164 | 193 | ||
165 | void register_page_bootmem_info_node(struct pglist_data *pgdat) | 194 | void register_page_bootmem_info_node(struct pglist_data *pgdat) |
166 | { | 195 | { |
@@ -189,7 +218,7 @@ void register_page_bootmem_info_node(struct pglist_data *pgdat) | |||
189 | } | 218 | } |
190 | 219 | ||
191 | pfn = pgdat->node_start_pfn; | 220 | pfn = pgdat->node_start_pfn; |
192 | end_pfn = pfn + pgdat->node_spanned_pages; | 221 | end_pfn = pgdat_end_pfn(pgdat); |
193 | 222 | ||
194 | /* register_section info */ | 223 | /* register_section info */ |
195 | for (; pfn < end_pfn; pfn += PAGES_PER_SECTION) { | 224 | for (; pfn < end_pfn; pfn += PAGES_PER_SECTION) { |
@@ -203,7 +232,7 @@ void register_page_bootmem_info_node(struct pglist_data *pgdat) | |||
203 | register_page_bootmem_info_section(pfn); | 232 | register_page_bootmem_info_section(pfn); |
204 | } | 233 | } |
205 | } | 234 | } |
206 | #endif /* !CONFIG_SPARSEMEM_VMEMMAP */ | 235 | #endif /* CONFIG_HAVE_BOOTMEM_INFO_NODE */ |
207 | 236 | ||
208 | static void grow_zone_span(struct zone *zone, unsigned long start_pfn, | 237 | static void grow_zone_span(struct zone *zone, unsigned long start_pfn, |
209 | unsigned long end_pfn) | 238 | unsigned long end_pfn) |
@@ -253,6 +282,17 @@ static void fix_zone_id(struct zone *zone, unsigned long start_pfn, | |||
253 | set_page_links(pfn_to_page(pfn), zid, nid, pfn); | 282 | set_page_links(pfn_to_page(pfn), zid, nid, pfn); |
254 | } | 283 | } |
255 | 284 | ||
285 | /* Can fail with -ENOMEM from allocating a wait table with vmalloc() or | ||
286 | * alloc_bootmem_node_nopanic() */ | ||
287 | static int __ref ensure_zone_is_initialized(struct zone *zone, | ||
288 | unsigned long start_pfn, unsigned long num_pages) | ||
289 | { | ||
290 | if (!zone_is_initialized(zone)) | ||
291 | return init_currently_empty_zone(zone, start_pfn, num_pages, | ||
292 | MEMMAP_HOTPLUG); | ||
293 | return 0; | ||
294 | } | ||
295 | |||
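ensure_zone_is_initialized() folds the three open-coded copies of "initialize the zone if it is still empty" into one helper and switches the test from zone->wait_table to zone_is_initialized(). A generic sketch of that lazy, idempotent initialisation pattern; the types and names are invented for the example:

#include <stdbool.h>

struct lazy_zone {
    bool initialized;
    unsigned long start_pfn;
    unsigned long nr_pages;
};

static int zone_init(struct lazy_zone *z, unsigned long start_pfn,
                     unsigned long nr_pages)
{
    /* the real init can fail allocating its wait table (-ENOMEM) */
    z->start_pfn = start_pfn;
    z->nr_pages = nr_pages;
    z->initialized = true;
    return 0;
}

/* every hot-add path calls this instead of open-coding the check */
static int ensure_initialized(struct lazy_zone *z, unsigned long start_pfn,
                              unsigned long nr_pages)
{
    if (!z->initialized)
        return zone_init(z, start_pfn, nr_pages);
    return 0;
}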
256 | static int __meminit move_pfn_range_left(struct zone *z1, struct zone *z2, | 296 | static int __meminit move_pfn_range_left(struct zone *z1, struct zone *z2, |
257 | unsigned long start_pfn, unsigned long end_pfn) | 297 | unsigned long start_pfn, unsigned long end_pfn) |
258 | { | 298 | { |
@@ -260,17 +300,14 @@ static int __meminit move_pfn_range_left(struct zone *z1, struct zone *z2, | |||
260 | unsigned long flags; | 300 | unsigned long flags; |
261 | unsigned long z1_start_pfn; | 301 | unsigned long z1_start_pfn; |
262 | 302 | ||
263 | if (!z1->wait_table) { | 303 | ret = ensure_zone_is_initialized(z1, start_pfn, end_pfn - start_pfn); |
264 | ret = init_currently_empty_zone(z1, start_pfn, | 304 | if (ret) |
265 | end_pfn - start_pfn, MEMMAP_HOTPLUG); | 305 | return ret; |
266 | if (ret) | ||
267 | return ret; | ||
268 | } | ||
269 | 306 | ||
270 | pgdat_resize_lock(z1->zone_pgdat, &flags); | 307 | pgdat_resize_lock(z1->zone_pgdat, &flags); |
271 | 308 | ||
272 | /* can't move pfns which are higher than @z2 */ | 309 | /* can't move pfns which are higher than @z2 */ |
273 | if (end_pfn > z2->zone_start_pfn + z2->spanned_pages) | 310 | if (end_pfn > zone_end_pfn(z2)) |
274 | goto out_fail; | 311 | goto out_fail; |
275 | /* the move-out part must be at the leftmost of @z2 */ | 312 | /* the move-out part must be at the leftmost of @z2 */ |
276 | if (start_pfn > z2->zone_start_pfn) | 313 | if (start_pfn > z2->zone_start_pfn) |
@@ -286,7 +323,7 @@ static int __meminit move_pfn_range_left(struct zone *z1, struct zone *z2, | |||
286 | z1_start_pfn = start_pfn; | 323 | z1_start_pfn = start_pfn; |
287 | 324 | ||
288 | resize_zone(z1, z1_start_pfn, end_pfn); | 325 | resize_zone(z1, z1_start_pfn, end_pfn); |
289 | resize_zone(z2, end_pfn, z2->zone_start_pfn + z2->spanned_pages); | 326 | resize_zone(z2, end_pfn, zone_end_pfn(z2)); |
290 | 327 | ||
291 | pgdat_resize_unlock(z1->zone_pgdat, &flags); | 328 | pgdat_resize_unlock(z1->zone_pgdat, &flags); |
292 | 329 | ||
@@ -305,12 +342,9 @@ static int __meminit move_pfn_range_right(struct zone *z1, struct zone *z2, | |||
305 | unsigned long flags; | 342 | unsigned long flags; |
306 | unsigned long z2_end_pfn; | 343 | unsigned long z2_end_pfn; |
307 | 344 | ||
308 | if (!z2->wait_table) { | 345 | ret = ensure_zone_is_initialized(z2, start_pfn, end_pfn - start_pfn); |
309 | ret = init_currently_empty_zone(z2, start_pfn, | 346 | if (ret) |
310 | end_pfn - start_pfn, MEMMAP_HOTPLUG); | 347 | return ret; |
311 | if (ret) | ||
312 | return ret; | ||
313 | } | ||
314 | 348 | ||
315 | pgdat_resize_lock(z1->zone_pgdat, &flags); | 349 | pgdat_resize_lock(z1->zone_pgdat, &flags); |
316 | 350 | ||
@@ -318,15 +352,15 @@ static int __meminit move_pfn_range_right(struct zone *z1, struct zone *z2, | |||
318 | if (z1->zone_start_pfn > start_pfn) | 352 | if (z1->zone_start_pfn > start_pfn) |
319 | goto out_fail; | 353 | goto out_fail; |
321 | /* the move-out part must be at the rightmost of @z1 */ | 354 | /* the move-out part must be at the rightmost of @z1 */ |
321 | if (z1->zone_start_pfn + z1->spanned_pages > end_pfn) | 355 | if (zone_end_pfn(z1) > end_pfn) |
322 | goto out_fail; | 356 | goto out_fail; |
324 | /* the moved range must overlap @z1 */ | 357 | /* the moved range must overlap @z1 */ |
324 | if (start_pfn >= z1->zone_start_pfn + z1->spanned_pages) | 358 | if (start_pfn >= zone_end_pfn(z1)) |
325 | goto out_fail; | 359 | goto out_fail; |
326 | 360 | ||
327 | /* use end_pfn for z2's end_pfn if z2 is empty */ | 361 | /* use end_pfn for z2's end_pfn if z2 is empty */ |
328 | if (z2->spanned_pages) | 362 | if (z2->spanned_pages) |
329 | z2_end_pfn = z2->zone_start_pfn + z2->spanned_pages; | 363 | z2_end_pfn = zone_end_pfn(z2); |
330 | else | 364 | else |
331 | z2_end_pfn = end_pfn; | 365 | z2_end_pfn = end_pfn; |
332 | 366 | ||
@@ -363,16 +397,13 @@ static int __meminit __add_zone(struct zone *zone, unsigned long phys_start_pfn) | |||
363 | int nid = pgdat->node_id; | 397 | int nid = pgdat->node_id; |
364 | int zone_type; | 398 | int zone_type; |
365 | unsigned long flags; | 399 | unsigned long flags; |
400 | int ret; | ||
366 | 401 | ||
367 | zone_type = zone - pgdat->node_zones; | 402 | zone_type = zone - pgdat->node_zones; |
368 | if (!zone->wait_table) { | 403 | ret = ensure_zone_is_initialized(zone, phys_start_pfn, nr_pages); |
369 | int ret; | 404 | if (ret) |
405 | return ret; | ||
370 | 406 | ||
371 | ret = init_currently_empty_zone(zone, phys_start_pfn, | ||
372 | nr_pages, MEMMAP_HOTPLUG); | ||
373 | if (ret) | ||
374 | return ret; | ||
375 | } | ||
376 | pgdat_resize_lock(zone->zone_pgdat, &flags); | 407 | pgdat_resize_lock(zone->zone_pgdat, &flags); |
377 | grow_zone_span(zone, phys_start_pfn, phys_start_pfn + nr_pages); | 408 | grow_zone_span(zone, phys_start_pfn, phys_start_pfn + nr_pages); |
378 | grow_pgdat_span(zone->zone_pgdat, phys_start_pfn, | 409 | grow_pgdat_span(zone->zone_pgdat, phys_start_pfn, |
@@ -405,20 +436,211 @@ static int __meminit __add_section(int nid, struct zone *zone, | |||
405 | return register_new_memory(nid, __pfn_to_section(phys_start_pfn)); | 436 | return register_new_memory(nid, __pfn_to_section(phys_start_pfn)); |
406 | } | 437 | } |
407 | 438 | ||
408 | #ifdef CONFIG_SPARSEMEM_VMEMMAP | 439 | /* find the smallest valid pfn in the range [start_pfn, end_pfn) */ |
409 | static int __remove_section(struct zone *zone, struct mem_section *ms) | 440 | static int find_smallest_section_pfn(int nid, struct zone *zone, |
441 | unsigned long start_pfn, | ||
442 | unsigned long end_pfn) | ||
443 | { | ||
444 | struct mem_section *ms; | ||
445 | |||
446 | for (; start_pfn < end_pfn; start_pfn += PAGES_PER_SECTION) { | ||
447 | ms = __pfn_to_section(start_pfn); | ||
448 | |||
449 | if (unlikely(!valid_section(ms))) | ||
450 | continue; | ||
451 | |||
452 | if (unlikely(pfn_to_nid(start_pfn) != nid)) | ||
453 | continue; | ||
454 | |||
455 | if (zone && zone != page_zone(pfn_to_page(start_pfn))) | ||
456 | continue; | ||
457 | |||
458 | return start_pfn; | ||
459 | } | ||
460 | |||
461 | return 0; | ||
462 | } | ||
463 | |||
464 | /* find the biggest valid pfn in the range [start_pfn, end_pfn). */ | ||
465 | static int find_biggest_section_pfn(int nid, struct zone *zone, | ||
466 | unsigned long start_pfn, | ||
467 | unsigned long end_pfn) | ||
468 | { | ||
469 | struct mem_section *ms; | ||
470 | unsigned long pfn; | ||
471 | |||
472 | /* start from the last pfn of the range [start_pfn, end_pfn) */ | ||
473 | pfn = end_pfn - 1; | ||
474 | for (; pfn >= start_pfn; pfn -= PAGES_PER_SECTION) { | ||
475 | ms = __pfn_to_section(pfn); | ||
476 | |||
477 | if (unlikely(!valid_section(ms))) | ||
478 | continue; | ||
479 | |||
480 | if (unlikely(pfn_to_nid(pfn) != nid)) | ||
481 | continue; | ||
482 | |||
483 | if (zone && zone != page_zone(pfn_to_page(pfn))) | ||
484 | continue; | ||
485 | |||
486 | return pfn; | ||
487 | } | ||
488 | |||
489 | return 0; | ||
490 | } | ||
491 | |||
492 | static void shrink_zone_span(struct zone *zone, unsigned long start_pfn, | ||
493 | unsigned long end_pfn) | ||
410 | { | 494 | { |
495 | unsigned long zone_start_pfn = zone->zone_start_pfn; | ||
496 | unsigned long zone_end_pfn = zone->zone_start_pfn + zone->spanned_pages; | ||
497 | unsigned long pfn; | ||
498 | struct mem_section *ms; | ||
499 | int nid = zone_to_nid(zone); | ||
500 | |||
501 | zone_span_writelock(zone); | ||
502 | if (zone_start_pfn == start_pfn) { | ||
503 | /* | ||
504 | * If the section is the smallest section in the zone, we need to | ||
505 | * shrink zone->zone_start_pfn and zone->spanned_pages. | ||
506 | * In this case, we find the next smallest valid mem_section and | ||
507 | * shrink the zone to start there. | ||
508 | */ | ||
509 | pfn = find_smallest_section_pfn(nid, zone, end_pfn, | ||
510 | zone_end_pfn); | ||
511 | if (pfn) { | ||
512 | zone->zone_start_pfn = pfn; | ||
513 | zone->spanned_pages = zone_end_pfn - pfn; | ||
514 | } | ||
515 | } else if (zone_end_pfn == end_pfn) { | ||
516 | * If the section is the biggest section in the zone, we only need | ||
517 | * to shrink zone->spanned_pages. | ||
518 | * In this case, we find the next biggest valid mem_section and | ||
519 | * shrink the zone to end there. | ||
520 | * shrinking zone. | ||
521 | */ | ||
522 | pfn = find_biggest_section_pfn(nid, zone, zone_start_pfn, | ||
523 | start_pfn); | ||
524 | if (pfn) | ||
525 | zone->spanned_pages = pfn - zone_start_pfn + 1; | ||
526 | } | ||
527 | |||
411 | /* | 528 | /* |
412 | * XXX: Freeing memmap with vmemmap is not implement yet. | 529 | * The section is neither the biggest nor the smallest mem_section |
413 | * This should be removed later. | 530 | * in the zone: it only creates a hole in the zone, so the zone's |
531 | * span need not change. But the zone may now contain only holes, | ||
532 | * so check whether any valid section remains. | ||
414 | */ | 533 | */ |
415 | return -EBUSY; | 534 | pfn = zone_start_pfn; |
535 | for (; pfn < zone_end_pfn; pfn += PAGES_PER_SECTION) { | ||
536 | ms = __pfn_to_section(pfn); | ||
537 | |||
538 | if (unlikely(!valid_section(ms))) | ||
539 | continue; | ||
540 | |||
541 | if (page_zone(pfn_to_page(pfn)) != zone) | ||
542 | continue; | ||
543 | |||
544 | /* If this is the section being removed, continue the loop */ | ||
545 | if (start_pfn == pfn) | ||
546 | continue; | ||
547 | |||
548 | /* If we find a valid section, we have nothing to do */ | ||
549 | zone_span_writeunlock(zone); | ||
550 | return; | ||
551 | } | ||
552 | |||
553 | /* The zone has no valid section */ | ||
554 | zone->zone_start_pfn = 0; | ||
555 | zone->spanned_pages = 0; | ||
556 | zone_span_writeunlock(zone); | ||
416 | } | 557 | } |
417 | #else | 558 | |
418 | static int __remove_section(struct zone *zone, struct mem_section *ms) | 559 | static void shrink_pgdat_span(struct pglist_data *pgdat, |
560 | unsigned long start_pfn, unsigned long end_pfn) | ||
561 | { | ||
562 | unsigned long pgdat_start_pfn = pgdat->node_start_pfn; | ||
563 | unsigned long pgdat_end_pfn = | ||
564 | pgdat->node_start_pfn + pgdat->node_spanned_pages; | ||
565 | unsigned long pfn; | ||
566 | struct mem_section *ms; | ||
567 | int nid = pgdat->node_id; | ||
568 | |||
569 | if (pgdat_start_pfn == start_pfn) { | ||
570 | /* | ||
571 | * If the section is the smallest section in the pgdat, we need to | ||
572 | * shrink pgdat->node_start_pfn and pgdat->node_spanned_pages. | ||
573 | * In this case, we find the next smallest valid mem_section and | ||
574 | * shrink the pgdat to start there. | ||
575 | */ | ||
576 | pfn = find_smallest_section_pfn(nid, NULL, end_pfn, | ||
577 | pgdat_end_pfn); | ||
578 | if (pfn) { | ||
579 | pgdat->node_start_pfn = pfn; | ||
580 | pgdat->node_spanned_pages = pgdat_end_pfn - pfn; | ||
581 | } | ||
582 | } else if (pgdat_end_pfn == end_pfn) { | ||
583 | /* | ||
584 | * If the section is the biggest section in the pgdat, we only need | ||
585 | * to shrink pgdat->node_spanned_pages. | ||
586 | * In this case, we find the next biggest valid mem_section and | ||
587 | * shrink the pgdat to end there. | ||
588 | */ | ||
589 | pfn = find_biggest_section_pfn(nid, NULL, pgdat_start_pfn, | ||
590 | start_pfn); | ||
591 | if (pfn) | ||
592 | pgdat->node_spanned_pages = pfn - pgdat_start_pfn + 1; | ||
593 | } | ||
594 | |||
595 | /* | ||
596 | * If the section is neither the biggest nor the smallest mem_section | ||
597 | * in the pgdat, it only creates a hole in the pgdat, so the pgdat's | ||
598 | * span need not change. | ||
599 | * But the pgdat may now contain only holes, so check whether any | ||
600 | * valid section remains. | ||
601 | */ | ||
602 | pfn = pgdat_start_pfn; | ||
603 | for (; pfn < pgdat_end_pfn; pfn += PAGES_PER_SECTION) { | ||
604 | ms = __pfn_to_section(pfn); | ||
605 | |||
606 | if (unlikely(!valid_section(ms))) | ||
607 | continue; | ||
608 | |||
609 | if (pfn_to_nid(pfn) != nid) | ||
610 | continue; | ||
611 | |||
612 | /* If this is the section being removed, continue the loop */ | ||
613 | if (start_pfn == pfn) | ||
614 | continue; | ||
615 | |||
616 | /* If we find a valid section, we have nothing to do */ | ||
617 | return; | ||
618 | } | ||
619 | |||
620 | /* The pgdat has no valid section */ | ||
621 | pgdat->node_start_pfn = 0; | ||
622 | pgdat->node_spanned_pages = 0; | ||
623 | } | ||
624 | |||
625 | static void __remove_zone(struct zone *zone, unsigned long start_pfn) | ||
419 | { | 626 | { |
420 | unsigned long flags; | ||
421 | struct pglist_data *pgdat = zone->zone_pgdat; | 627 | struct pglist_data *pgdat = zone->zone_pgdat; |
628 | int nr_pages = PAGES_PER_SECTION; | ||
629 | int zone_type; | ||
630 | unsigned long flags; | ||
631 | |||
632 | zone_type = zone - pgdat->node_zones; | ||
633 | |||
634 | pgdat_resize_lock(zone->zone_pgdat, &flags); | ||
635 | shrink_zone_span(zone, start_pfn, start_pfn + nr_pages); | ||
636 | shrink_pgdat_span(pgdat, start_pfn, start_pfn + nr_pages); | ||
637 | pgdat_resize_unlock(zone->zone_pgdat, &flags); | ||
638 | } | ||
639 | |||
640 | static int __remove_section(struct zone *zone, struct mem_section *ms) | ||
641 | { | ||
642 | unsigned long start_pfn; | ||
643 | int scn_nr; | ||
422 | int ret = -EINVAL; | 644 | int ret = -EINVAL; |
423 | 645 | ||
424 | if (!valid_section(ms)) | 646 | if (!valid_section(ms)) |
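The helpers added above (find_smallest_section_pfn(), find_biggest_section_pfn(), shrink_zone_span(), shrink_pgdat_span()) share one idea: scan section-aligned pfns for sections that are still valid and belong to the right node/zone, then either advance the start of the span, pull its end back, or collapse it to empty when nothing valid remains. Below is a self-contained user-space model of that logic; the toy PAGES_PER_SECTION and the plain bool array stand in for the mem_section/nid/zone checks, and the values are purely illustrative.

#include <stdio.h>
#include <stdbool.h>

#define PAGES_PER_SECTION 8UL		/* toy value, not the kernel's */
#define NSECTIONS	  5UL

/* stand-in for valid_section() + pfn_to_nid() + page_zone() filtering */
static bool section_ok(const bool *valid, unsigned long pfn)
{
	return valid[pfn / PAGES_PER_SECTION];
}

/* smallest valid pfn in [start_pfn, end_pfn), 0 if none (as in the kernel) */
static unsigned long find_smallest(const bool *valid,
				   unsigned long start_pfn, unsigned long end_pfn)
{
	for (; start_pfn < end_pfn; start_pfn += PAGES_PER_SECTION)
		if (section_ok(valid, start_pfn))
			return start_pfn;
	return 0;
}

/* biggest valid pfn in [start_pfn, end_pfn), 0 if none */
static unsigned long find_biggest(const bool *valid,
				  unsigned long start_pfn, unsigned long end_pfn)
{
	unsigned long pfn = end_pfn;

	while (pfn > start_pfn) {	/* walk back without unsigned wrap-around */
		pfn -= PAGES_PER_SECTION;
		if (section_ok(valid, pfn))
			return pfn + PAGES_PER_SECTION - 1;	/* last pfn of that section */
	}
	return 0;
}

/* shrink a [start, start+pages) span after removing section [rm_start, rm_end) */
static void shrink_span(unsigned long *start, unsigned long *pages,
			const bool *valid,
			unsigned long rm_start, unsigned long rm_end)
{
	unsigned long end = *start + *pages, pfn;

	if (*start == rm_start) {		/* removed the first section */
		pfn = find_smallest(valid, rm_end, end);
		if (pfn) {
			*start = pfn;
			*pages = end - pfn;
		}
	} else if (end == rm_end) {		/* removed the last section */
		pfn = find_biggest(valid, *start, rm_start);
		if (pfn)
			*pages = pfn - *start + 1;
	}

	/* a middle removal only punches a hole; but if no valid section is
	 * left anywhere, collapse the span to empty, like the kernel code */
	for (pfn = *start; pfn < end; pfn += PAGES_PER_SECTION)
		if (pfn != rm_start && section_ok(valid, pfn))
			return;
	*start = 0;
	*pages = 0;
}

int main(void)
{
	bool valid[NSECTIONS] = { false, true, true, true, false };
	unsigned long start = 1 * PAGES_PER_SECTION;	/* span covers sections 1-3 */
	unsigned long pages = 3 * PAGES_PER_SECTION;

	/* remove section 1: the span should now start at section 2 */
	valid[1] = false;
	shrink_span(&start, &pages, valid, 1 * PAGES_PER_SECTION,
		    2 * PAGES_PER_SECTION);
	printf("start pfn %lu, %lu pages\n", start, pages);
	return 0;
}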
@@ -428,12 +650,13 @@ static int __remove_section(struct zone *zone, struct mem_section *ms) | |||
428 | if (ret) | 650 | if (ret) |
429 | return ret; | 651 | return ret; |
430 | 652 | ||
431 | pgdat_resize_lock(pgdat, &flags); | 653 | scn_nr = __section_nr(ms); |
654 | start_pfn = section_nr_to_pfn(scn_nr); | ||
655 | __remove_zone(zone, start_pfn); | ||
656 | |||
432 | sparse_remove_one_section(zone, ms); | 657 | sparse_remove_one_section(zone, ms); |
433 | pgdat_resize_unlock(pgdat, &flags); | ||
434 | return 0; | 658 | return 0; |
435 | } | 659 | } |
436 | #endif | ||
437 | 660 | ||
438 | /* | 661 | /* |
439 | * Reasonably generic function for adding memory. It is | 662 | * Reasonably generic function for adding memory. It is |
@@ -797,11 +1020,14 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start) | |||
797 | unsigned long zholes_size[MAX_NR_ZONES] = {0}; | 1020 | unsigned long zholes_size[MAX_NR_ZONES] = {0}; |
798 | unsigned long start_pfn = start >> PAGE_SHIFT; | 1021 | unsigned long start_pfn = start >> PAGE_SHIFT; |
799 | 1022 | ||
800 | pgdat = arch_alloc_nodedata(nid); | 1023 | pgdat = NODE_DATA(nid); |
801 | if (!pgdat) | 1024 | if (!pgdat) { |
802 | return NULL; | 1025 | pgdat = arch_alloc_nodedata(nid); |
1026 | if (!pgdat) | ||
1027 | return NULL; | ||
803 | 1028 | ||
804 | arch_refresh_nodedata(nid, pgdat); | 1029 | arch_refresh_nodedata(nid, pgdat); |
1030 | } | ||
805 | 1031 | ||
806 | /* we can use NODE_DATA(nid) from here */ | 1032 | /* we can use NODE_DATA(nid) from here */ |
807 | 1033 | ||
@@ -854,7 +1080,8 @@ out: | |||
854 | int __ref add_memory(int nid, u64 start, u64 size) | 1080 | int __ref add_memory(int nid, u64 start, u64 size) |
855 | { | 1081 | { |
856 | pg_data_t *pgdat = NULL; | 1082 | pg_data_t *pgdat = NULL; |
857 | int new_pgdat = 0; | 1083 | bool new_pgdat; |
1084 | bool new_node; | ||
858 | struct resource *res; | 1085 | struct resource *res; |
859 | int ret; | 1086 | int ret; |
860 | 1087 | ||
@@ -865,12 +1092,16 @@ int __ref add_memory(int nid, u64 start, u64 size) | |||
865 | if (!res) | 1092 | if (!res) |
866 | goto out; | 1093 | goto out; |
867 | 1094 | ||
868 | if (!node_online(nid)) { | 1095 | { /* Stupid hack to suppress address-never-null warning */ |
1096 | void *p = NODE_DATA(nid); | ||
1097 | new_pgdat = !p; | ||
1098 | } | ||
1099 | new_node = !node_online(nid); | ||
1100 | if (new_node) { | ||
869 | pgdat = hotadd_new_pgdat(nid, start); | 1101 | pgdat = hotadd_new_pgdat(nid, start); |
870 | ret = -ENOMEM; | 1102 | ret = -ENOMEM; |
871 | if (!pgdat) | 1103 | if (!pgdat) |
872 | goto error; | 1104 | goto error; |
873 | new_pgdat = 1; | ||
874 | } | 1105 | } |
875 | 1106 | ||
876 | /* call arch's memory hotadd */ | 1107 | /* call arch's memory hotadd */ |
@@ -882,7 +1113,7 @@ int __ref add_memory(int nid, u64 start, u64 size) | |||
882 | /* we online node here. we can't roll back from here. */ | 1113 | /* we online node here. we can't roll back from here. */ |
883 | node_set_online(nid); | 1114 | node_set_online(nid); |
884 | 1115 | ||
885 | if (new_pgdat) { | 1116 | if (new_node) { |
886 | ret = register_one_node(nid); | 1117 | ret = register_one_node(nid); |
887 | /* | 1118 | /* |
888 | * If sysfs file of new node can't create, cpu on the node | 1119 | * If sysfs file of new node can't create, cpu on the node |
@@ -901,8 +1132,7 @@ error: | |||
901 | /* rollback pgdat allocation and others */ | 1132 | /* rollback pgdat allocation and others */ |
902 | if (new_pgdat) | 1133 | if (new_pgdat) |
903 | rollback_node_hotadd(nid, pgdat); | 1134 | rollback_node_hotadd(nid, pgdat); |
904 | if (res) | 1135 | release_memory_resource(res); |
905 | release_memory_resource(res); | ||
906 | 1136 | ||
907 | out: | 1137 | out: |
908 | unlock_memory_hotplug(); | 1138 | unlock_memory_hotplug(); |
@@ -1058,8 +1288,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) | |||
1058 | * migrate_pages returns # of failed pages. | 1288 | * migrate_pages returns # of failed pages. |
1059 | */ | 1289 | */ |
1060 | ret = migrate_pages(&source, alloc_migrate_target, 0, | 1290 | ret = migrate_pages(&source, alloc_migrate_target, 0, |
1061 | true, MIGRATE_SYNC, | 1291 | MIGRATE_SYNC, MR_MEMORY_HOTPLUG); |
1062 | MR_MEMORY_HOTPLUG); | ||
1063 | if (ret) | 1292 | if (ret) |
1064 | putback_lru_pages(&source); | 1293 | putback_lru_pages(&source); |
1065 | } | 1294 | } |
@@ -1381,17 +1610,26 @@ int offline_pages(unsigned long start_pfn, unsigned long nr_pages) | |||
1381 | return __offline_pages(start_pfn, start_pfn + nr_pages, 120 * HZ); | 1610 | return __offline_pages(start_pfn, start_pfn + nr_pages, 120 * HZ); |
1382 | } | 1611 | } |
1383 | 1612 | ||
1384 | int remove_memory(u64 start, u64 size) | 1613 | /** |
1614 | * walk_memory_range - walks through all mem sections in [start_pfn, end_pfn) | ||
1615 | * @start_pfn: start pfn of the memory range | ||
1616 | * @end_pfn: end pfn of the memory range | ||
1617 | * @arg: argument passed to func | ||
1618 | * @func: callback for each memory section walked | ||
1619 | * | ||
1620 | * This function walks through all present mem sections in range | ||
1621 | * [start_pfn, end_pfn) and calls func on each mem section. | ||
1622 | * | ||
1623 | * Returns the return value of func. | ||
1624 | */ | ||
1625 | static int walk_memory_range(unsigned long start_pfn, unsigned long end_pfn, | ||
1626 | void *arg, int (*func)(struct memory_block *, void *)) | ||
1385 | { | 1627 | { |
1386 | struct memory_block *mem = NULL; | 1628 | struct memory_block *mem = NULL; |
1387 | struct mem_section *section; | 1629 | struct mem_section *section; |
1388 | unsigned long start_pfn, end_pfn; | ||
1389 | unsigned long pfn, section_nr; | 1630 | unsigned long pfn, section_nr; |
1390 | int ret; | 1631 | int ret; |
1391 | 1632 | ||
1392 | start_pfn = PFN_DOWN(start); | ||
1393 | end_pfn = start_pfn + PFN_DOWN(size); | ||
1394 | |||
1395 | for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) { | 1633 | for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) { |
1396 | section_nr = pfn_to_section_nr(pfn); | 1634 | section_nr = pfn_to_section_nr(pfn); |
1397 | if (!present_section_nr(section_nr)) | 1635 | if (!present_section_nr(section_nr)) |
@@ -1408,7 +1646,7 @@ int remove_memory(u64 start, u64 size) | |||
1408 | if (!mem) | 1646 | if (!mem) |
1409 | continue; | 1647 | continue; |
1410 | 1648 | ||
1411 | ret = offline_memory_block(mem); | 1649 | ret = func(mem, arg); |
1412 | if (ret) { | 1650 | if (ret) { |
1413 | kobject_put(&mem->dev.kobj); | 1651 | kobject_put(&mem->dev.kobj); |
1414 | return ret; | 1652 | return ret; |
@@ -1420,12 +1658,209 @@ int remove_memory(u64 start, u64 size) | |||
1420 | 1658 | ||
1421 | return 0; | 1659 | return 0; |
1422 | } | 1660 | } |
1661 | |||
1662 | /** | ||
1663 | * offline_memory_block_cb - callback function for offlining memory block | ||
1664 | * @mem: the memory block to be offlined | ||
1665 | * @arg: pointer used to return the first error encountered | ||
1666 | * | ||
1667 | * Always returns 0; any error is reported through @arg instead. | ||
1668 | */ | ||
1669 | static int offline_memory_block_cb(struct memory_block *mem, void *arg) | ||
1670 | { | ||
1671 | int *ret = arg; | ||
1672 | int error = offline_memory_block(mem); | ||
1673 | |||
1674 | if (error != 0 && *ret == 0) | ||
1675 | *ret = error; | ||
1676 | |||
1677 | return 0; | ||
1678 | } | ||
1679 | |||
1680 | static int is_memblock_offlined_cb(struct memory_block *mem, void *arg) | ||
1681 | { | ||
1682 | int ret = !is_memblock_offlined(mem); | ||
1683 | |||
1684 | if (unlikely(ret)) | ||
1685 | pr_warn("removing memory fails, because memory " | ||
1686 | "[%#010llx-%#010llx] is onlined\n", | ||
1687 | PFN_PHYS(section_nr_to_pfn(mem->start_section_nr)), | ||
1688 | PFN_PHYS(section_nr_to_pfn(mem->end_section_nr + 1))-1); | ||
1689 | |||
1690 | return ret; | ||
1691 | } | ||
1692 | |||
1693 | static int check_cpu_on_node(void *data) | ||
1694 | { | ||
1695 | struct pglist_data *pgdat = data; | ||
1696 | int cpu; | ||
1697 | |||
1698 | for_each_present_cpu(cpu) { | ||
1699 | if (cpu_to_node(cpu) == pgdat->node_id) | ||
1700 | /* | ||
1701 | * A cpu on this node has not been removed, so we | ||
1702 | * cannot offline this node. | ||
1703 | */ | ||
1704 | return -EBUSY; | ||
1705 | } | ||
1706 | |||
1707 | return 0; | ||
1708 | } | ||
1709 | |||
1710 | static void unmap_cpu_on_node(void *data) | ||
1711 | { | ||
1712 | #ifdef CONFIG_ACPI_NUMA | ||
1713 | struct pglist_data *pgdat = data; | ||
1714 | int cpu; | ||
1715 | |||
1716 | for_each_possible_cpu(cpu) | ||
1717 | if (cpu_to_node(cpu) == pgdat->node_id) | ||
1718 | numa_clear_node(cpu); | ||
1719 | #endif | ||
1720 | } | ||
1721 | |||
1722 | static int check_and_unmap_cpu_on_node(void *data) | ||
1723 | { | ||
1724 | int ret = check_cpu_on_node(data); | ||
1725 | |||
1726 | if (ret) | ||
1727 | return ret; | ||
1728 | |||
1729 | /* | ||
1730 | * the node will be offlined when we come here, so we can clear | ||
1731 | * the cpu_to_node() now. | ||
1732 | */ | ||
1733 | |||
1734 | unmap_cpu_on_node(data); | ||
1735 | return 0; | ||
1736 | } | ||
1737 | |||
1738 | /* offline the node if all memory sections of this node are removed */ | ||
1739 | void try_offline_node(int nid) | ||
1740 | { | ||
1741 | pg_data_t *pgdat = NODE_DATA(nid); | ||
1742 | unsigned long start_pfn = pgdat->node_start_pfn; | ||
1743 | unsigned long end_pfn = start_pfn + pgdat->node_spanned_pages; | ||
1744 | unsigned long pfn; | ||
1745 | struct page *pgdat_page = virt_to_page(pgdat); | ||
1746 | int i; | ||
1747 | |||
1748 | for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) { | ||
1749 | unsigned long section_nr = pfn_to_section_nr(pfn); | ||
1750 | |||
1751 | if (!present_section_nr(section_nr)) | ||
1752 | continue; | ||
1753 | |||
1754 | if (pfn_to_nid(pfn) != nid) | ||
1755 | continue; | ||
1756 | |||
1757 | /* | ||
1758 | * Some memory sections of this node have not been removed, | ||
1759 | * so we cannot offline the node yet. | ||
1760 | */ | ||
1761 | return; | ||
1762 | } | ||
1763 | |||
1764 | if (stop_machine(check_and_unmap_cpu_on_node, pgdat, NULL)) | ||
1765 | return; | ||
1766 | |||
1767 | /* | ||
1768 | * All memory and cpus of this node have been removed, so we | ||
1769 | * can offline this node now. | ||
1770 | */ | ||
1771 | node_set_offline(nid); | ||
1772 | unregister_one_node(nid); | ||
1773 | |||
1774 | if (!PageSlab(pgdat_page) && !PageCompound(pgdat_page)) | ||
1775 | /* node data is allocated from boot memory */ | ||
1776 | return; | ||
1777 | |||
1778 | /* free waittable in each zone */ | ||
1779 | for (i = 0; i < MAX_NR_ZONES; i++) { | ||
1780 | struct zone *zone = pgdat->node_zones + i; | ||
1781 | |||
1782 | if (zone->wait_table) | ||
1783 | vfree(zone->wait_table); | ||
1784 | } | ||
1785 | |||
1786 | /* | ||
1787 | * Since there is no way to guarantee that the address of pgdat/zone | ||
1788 | * is not on the stack of any kernel thread or used by other kernel | ||
1789 | * objects without reference counting or another synchronizing | ||
1790 | * method, do not reset node_data and free pgdat here. Just reset it | ||
1791 | * to 0 and reuse the memory when the node is onlined again. | ||
1792 | */ | ||
1793 | memset(pgdat, 0, sizeof(*pgdat)); | ||
1794 | } | ||
1795 | EXPORT_SYMBOL(try_offline_node); | ||
1796 | |||
1797 | int __ref remove_memory(int nid, u64 start, u64 size) | ||
1798 | { | ||
1799 | unsigned long start_pfn, end_pfn; | ||
1800 | int ret = 0; | ||
1801 | int retry = 1; | ||
1802 | |||
1803 | start_pfn = PFN_DOWN(start); | ||
1804 | end_pfn = start_pfn + PFN_DOWN(size); | ||
1805 | |||
1806 | /* | ||
1807 | * When CONFIG_MEMCG is on, one memory block may be used by other | ||
1808 | * blocks to store page cgroups when onlining pages. But we don't | ||
1809 | * know in what order pages are onlined, so we iterate twice to | ||
1810 | * offline the memory: | ||
1811 | * 1st pass: offline every non-primary memory block. | ||
1812 | * 2nd pass: offline the primary (i.e. first added) memory block. | ||
1813 | */ | ||
1814 | repeat: | ||
1815 | walk_memory_range(start_pfn, end_pfn, &ret, | ||
1816 | offline_memory_block_cb); | ||
1817 | if (ret) { | ||
1818 | if (!retry) | ||
1819 | return ret; | ||
1820 | |||
1821 | retry = 0; | ||
1822 | ret = 0; | ||
1823 | goto repeat; | ||
1824 | } | ||
1825 | |||
1826 | lock_memory_hotplug(); | ||
1827 | |||
1828 | /* | ||
1829 | * we have offlined all memory blocks like this: | ||
1830 | * 1. lock memory hotplug | ||
1831 | * 2. offline a memory block | ||
1832 | * 3. unlock memory hotplug | ||
1833 | * | ||
1834 | * Steps 1-3 are repeated for each memory block. All memory blocks | ||
1835 | * must be offlined before removing memory, but we don't hold the | ||
1836 | * lock across the whole operation, so we must check here whether | ||
1837 | * all memory blocks are still offlined. | ||
1838 | */ | ||
1839 | |||
1840 | ret = walk_memory_range(start_pfn, end_pfn, NULL, | ||
1841 | is_memblock_offlined_cb); | ||
1842 | if (ret) { | ||
1843 | unlock_memory_hotplug(); | ||
1844 | return ret; | ||
1845 | } | ||
1846 | |||
1847 | /* remove memmap entry */ | ||
1848 | firmware_map_remove(start, start + size, "System RAM"); | ||
1849 | |||
1850 | arch_remove_memory(start, size); | ||
1851 | |||
1852 | try_offline_node(nid); | ||
1853 | |||
1854 | unlock_memory_hotplug(); | ||
1855 | |||
1856 | return 0; | ||
1857 | } | ||
1423 | #else | 1858 | #else |
1424 | int offline_pages(unsigned long start_pfn, unsigned long nr_pages) | 1859 | int offline_pages(unsigned long start_pfn, unsigned long nr_pages) |
1425 | { | 1860 | { |
1426 | return -EINVAL; | 1861 | return -EINVAL; |
1427 | } | 1862 | } |
1428 | int remove_memory(u64 start, u64 size) | 1863 | int remove_memory(int nid, u64 start, u64 size) |
1429 | { | 1864 | { |
1430 | return -EINVAL; | 1865 | return -EINVAL; |
1431 | } | 1866 | } |
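remove_memory() above leans on two patterns: walk_memory_range() applying a caller-supplied callback to every present block, and a two-pass offline where the first callback (offline_memory_block_cb) swallows errors into *arg so every block gets an attempt, while the verification callback (is_memblock_offlined_cb) aborts the walk on the first still-online block. The following is a self-contained user-space model of that control flow; the toy struct block and its fields are invented for illustration and are not kernel structures.

#include <stdio.h>

#define NBLOCKS 4

struct block {
	int id;
	int online;
	int stubborn;	/* refuses to go offline on the first attempt */
};

/* walk every block and call func; stop early if func returns non-zero */
static int walk_blocks(struct block *blk, int n, void *arg,
		       int (*func)(struct block *, void *))
{
	int i, ret;

	for (i = 0; i < n; i++) {
		ret = func(&blk[i], arg);
		if (ret)
			return ret;
	}
	return 0;
}

/* offline pass: record the first error in *arg but always return 0,
 * so the remaining blocks are still attempted */
static int offline_cb(struct block *b, void *arg)
{
	int *ret = arg;

	if (b->stubborn) {
		b->stubborn = 0;	/* will succeed on the next pass */
		if (*ret == 0)
			*ret = -1;
	} else {
		b->online = 0;
	}
	return 0;
}

/* verification pass: abort as soon as an online block is found */
static int is_offlined_cb(struct block *b, void *arg)
{
	(void)arg;
	if (b->online) {
		printf("block %d is still online\n", b->id);
		return 1;
	}
	return 0;
}

int main(void)
{
	struct block blk[NBLOCKS] = {
		{ 0, 1, 0 }, { 1, 1, 1 }, { 2, 1, 0 }, { 3, 1, 0 },
	};
	int ret = 0, retry = 1;

repeat:
	walk_blocks(blk, NBLOCKS, &ret, offline_cb);
	if (ret && retry) {		/* same retry-once shape as remove_memory() */
		retry = 0;
		ret = 0;
		goto repeat;
	}

	ret = walk_blocks(blk, NBLOCKS, NULL, is_offlined_cb);
	printf(ret ? "cannot remove memory\n" : "all blocks offlined\n");
	return 0;
}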
diff --git a/mm/mempolicy.c b/mm/mempolicy.c index e2df1c1fb41f..31d26637b658 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c | |||
@@ -26,7 +26,7 @@ | |||
26 | * the allocation to memory nodes instead | 26 | * the allocation to memory nodes instead |
27 | * | 27 | * |
28 | * preferred Try a specific node first before normal fallback. | 28 | * preferred Try a specific node first before normal fallback. |
29 | * As a special case node -1 here means do the allocation | 29 | * As a special case NUMA_NO_NODE here means do the allocation |
30 | * on the local CPU. This is normally identical to default, | 30 | * on the local CPU. This is normally identical to default, |
31 | * but useful to set in a VMA when you have a non default | 31 | * but useful to set in a VMA when you have a non default |
32 | * process policy. | 32 | * process policy. |
@@ -127,7 +127,7 @@ static struct mempolicy *get_task_policy(struct task_struct *p) | |||
127 | 127 | ||
128 | if (!pol) { | 128 | if (!pol) { |
129 | node = numa_node_id(); | 129 | node = numa_node_id(); |
130 | if (node != -1) | 130 | if (node != NUMA_NO_NODE) |
131 | pol = &preferred_node_policy[node]; | 131 | pol = &preferred_node_policy[node]; |
132 | 132 | ||
133 | /* preferred_node_policy is not initialised early in boot */ | 133 | /* preferred_node_policy is not initialised early in boot */ |
@@ -161,19 +161,7 @@ static const struct mempolicy_operations { | |||
161 | /* Check that the nodemask contains at least one populated zone */ | 161 | /* Check that the nodemask contains at least one populated zone */ |
162 | static int is_valid_nodemask(const nodemask_t *nodemask) | 162 | static int is_valid_nodemask(const nodemask_t *nodemask) |
163 | { | 163 | { |
164 | int nd, k; | 164 | return nodes_intersects(*nodemask, node_states[N_MEMORY]); |
165 | |||
166 | for_each_node_mask(nd, *nodemask) { | ||
167 | struct zone *z; | ||
168 | |||
169 | for (k = 0; k <= policy_zone; k++) { | ||
170 | z = &NODE_DATA(nd)->node_zones[k]; | ||
171 | if (z->present_pages > 0) | ||
172 | return 1; | ||
173 | } | ||
174 | } | ||
175 | |||
176 | return 0; | ||
177 | } | 165 | } |
178 | 166 | ||
179 | static inline int mpol_store_user_nodemask(const struct mempolicy *pol) | 167 | static inline int mpol_store_user_nodemask(const struct mempolicy *pol) |
@@ -270,7 +258,7 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags, | |||
270 | struct mempolicy *policy; | 258 | struct mempolicy *policy; |
271 | 259 | ||
272 | pr_debug("setting mode %d flags %d nodes[0] %lx\n", | 260 | pr_debug("setting mode %d flags %d nodes[0] %lx\n", |
273 | mode, flags, nodes ? nodes_addr(*nodes)[0] : -1); | 261 | mode, flags, nodes ? nodes_addr(*nodes)[0] : NUMA_NO_NODE); |
274 | 262 | ||
275 | if (mode == MPOL_DEFAULT) { | 263 | if (mode == MPOL_DEFAULT) { |
276 | if (nodes && !nodes_empty(*nodes)) | 264 | if (nodes && !nodes_empty(*nodes)) |
@@ -508,9 +496,8 @@ static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd, | |||
508 | /* | 496 | /* |
509 | * vm_normal_page() filters out zero pages, but there might | 497 | * vm_normal_page() filters out zero pages, but there might |
510 | * still be PageReserved pages to skip, perhaps in a VDSO. | 498 | * still be PageReserved pages to skip, perhaps in a VDSO. |
511 | * And we cannot move PageKsm pages sensibly or safely yet. | ||
512 | */ | 499 | */ |
513 | if (PageReserved(page) || PageKsm(page)) | 500 | if (PageReserved(page)) |
514 | continue; | 501 | continue; |
515 | nid = page_to_nid(page); | 502 | nid = page_to_nid(page); |
516 | if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT)) | 503 | if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT)) |
@@ -1027,8 +1014,7 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest, | |||
1027 | 1014 | ||
1028 | if (!list_empty(&pagelist)) { | 1015 | if (!list_empty(&pagelist)) { |
1029 | err = migrate_pages(&pagelist, new_node_page, dest, | 1016 | err = migrate_pages(&pagelist, new_node_page, dest, |
1030 | false, MIGRATE_SYNC, | 1017 | MIGRATE_SYNC, MR_SYSCALL); |
1031 | MR_SYSCALL); | ||
1032 | if (err) | 1018 | if (err) |
1033 | putback_lru_pages(&pagelist); | 1019 | putback_lru_pages(&pagelist); |
1034 | } | 1020 | } |
@@ -1235,7 +1221,7 @@ static long do_mbind(unsigned long start, unsigned long len, | |||
1235 | 1221 | ||
1236 | pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n", | 1222 | pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n", |
1237 | start, start + len, mode, mode_flags, | 1223 | start, start + len, mode, mode_flags, |
1238 | nmask ? nodes_addr(*nmask)[0] : -1); | 1224 | nmask ? nodes_addr(*nmask)[0] : NUMA_NO_NODE); |
1239 | 1225 | ||
1240 | if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) { | 1226 | if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) { |
1241 | 1227 | ||
@@ -1272,9 +1258,8 @@ static long do_mbind(unsigned long start, unsigned long len, | |||
1272 | if (!list_empty(&pagelist)) { | 1258 | if (!list_empty(&pagelist)) { |
1273 | WARN_ON_ONCE(flags & MPOL_MF_LAZY); | 1259 | WARN_ON_ONCE(flags & MPOL_MF_LAZY); |
1274 | nr_failed = migrate_pages(&pagelist, new_vma_page, | 1260 | nr_failed = migrate_pages(&pagelist, new_vma_page, |
1275 | (unsigned long)vma, | 1261 | (unsigned long)vma, |
1276 | false, MIGRATE_SYNC, | 1262 | MIGRATE_SYNC, MR_MEMPOLICY_MBIND); |
1277 | MR_MEMPOLICY_MBIND); | ||
1278 | if (nr_failed) | 1263 | if (nr_failed) |
1279 | putback_lru_pages(&pagelist); | 1264 | putback_lru_pages(&pagelist); |
1280 | } | 1265 | } |
@@ -1644,6 +1629,26 @@ struct mempolicy *get_vma_policy(struct task_struct *task, | |||
1644 | return pol; | 1629 | return pol; |
1645 | } | 1630 | } |
1646 | 1631 | ||
1632 | static int apply_policy_zone(struct mempolicy *policy, enum zone_type zone) | ||
1633 | { | ||
1634 | enum zone_type dynamic_policy_zone = policy_zone; | ||
1635 | |||
1636 | BUG_ON(dynamic_policy_zone == ZONE_MOVABLE); | ||
1637 | |||
1638 | /* | ||
1639 | * If policy->v.nodes has only movable memory, we apply the policy | ||
1640 | * only when gfp_zone(gfp) is ZONE_MOVABLE. | ||
1641 | * | ||
1642 | * policy->v.nodes intersects with node_states[N_MEMORY], so if | ||
1643 | * the following test fails, it implies that policy->v.nodes has | ||
1644 | * movable memory only. | ||
1645 | */ | ||
1646 | if (!nodes_intersects(policy->v.nodes, node_states[N_HIGH_MEMORY])) | ||
1647 | dynamic_policy_zone = ZONE_MOVABLE; | ||
1648 | |||
1649 | return zone >= dynamic_policy_zone; | ||
1650 | } | ||
1651 | |||
1647 | /* | 1652 | /* |
1648 | * Return a nodemask representing a mempolicy for filtering nodes for | 1653 | * Return a nodemask representing a mempolicy for filtering nodes for |
1649 | * page allocation | 1654 | * page allocation |
@@ -1652,7 +1657,7 @@ static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy) | |||
1652 | { | 1657 | { |
1653 | /* Lower zones don't get a nodemask applied for MPOL_BIND */ | 1658 | /* Lower zones don't get a nodemask applied for MPOL_BIND */ |
1654 | if (unlikely(policy->mode == MPOL_BIND) && | 1659 | if (unlikely(policy->mode == MPOL_BIND) && |
1655 | gfp_zone(gfp) >= policy_zone && | 1660 | apply_policy_zone(policy, gfp_zone(gfp)) && |
1656 | cpuset_nodemask_valid_mems_allowed(&policy->v.nodes)) | 1661 | cpuset_nodemask_valid_mems_allowed(&policy->v.nodes)) |
1657 | return &policy->v.nodes; | 1662 | return &policy->v.nodes; |
1658 | 1663 | ||
@@ -2308,7 +2313,7 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long | |||
2308 | * it less likely we act on an unlikely task<->page | 2313 | * it less likely we act on an unlikely task<->page |
2309 | * relation. | 2314 | * relation. |
2310 | */ | 2315 | */ |
2311 | last_nid = page_xchg_last_nid(page, polnid); | 2316 | last_nid = page_nid_xchg_last(page, polnid); |
2312 | if (last_nid != polnid) | 2317 | if (last_nid != polnid) |
2313 | goto out; | 2318 | goto out; |
2314 | } | 2319 | } |
@@ -2483,7 +2488,7 @@ int mpol_set_shared_policy(struct shared_policy *info, | |||
2483 | vma->vm_pgoff, | 2488 | vma->vm_pgoff, |
2484 | sz, npol ? npol->mode : -1, | 2489 | sz, npol ? npol->mode : -1, |
2485 | npol ? npol->flags : -1, | 2490 | npol ? npol->flags : -1, |
2486 | npol ? nodes_addr(npol->v.nodes)[0] : -1); | 2491 | npol ? nodes_addr(npol->v.nodes)[0] : NUMA_NO_NODE); |
2487 | 2492 | ||
2488 | if (npol) { | 2493 | if (npol) { |
2489 | new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol); | 2494 | new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol); |
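The mempolicy changes above replace a per-zone scan with a single nodes_intersects() against node_states[N_MEMORY], and apply_policy_zone() infers "movable memory only" from the absence of any intersection with N_HIGH_MEMORY. A tiny user-space model of both checks using plain bitmasks; the mask names and values below are invented for illustration only.

#include <stdio.h>
#include <stdbool.h>

/* Toy node masks: bit n represents node n.  These stand in for
 * nodemask_t and node_states[]. */
#define N_MEMORY_MASK       0x0fu	/* nodes 0-3 have memory */
#define N_HIGH_MEMORY_MASK  0x07u	/* nodes 0-2 have non-movable memory */

static bool is_valid_nodemask(unsigned int nodemask)
{
	/* the rewritten check: does any populated node remain at all? */
	return (nodemask & N_MEMORY_MASK) != 0;
}

static bool policy_is_movable_only(unsigned int nodemask)
{
	/* apply_policy_zone()'s reasoning: the policy mask intersects
	 * N_MEMORY, so if it misses N_HIGH_MEMORY it can only contain
	 * movable-memory nodes. */
	return (nodemask & N_HIGH_MEMORY_MASK) == 0;
}

int main(void)
{
	unsigned int mixed = 0x05;		/* nodes 0 and 2 */
	unsigned int movable_only = 0x08;	/* node 3 only */

	printf("mixed: valid=%d movable_only=%d\n",
	       is_valid_nodemask(mixed), policy_is_movable_only(mixed));
	printf("node3: valid=%d movable_only=%d\n",
	       is_valid_nodemask(movable_only), policy_is_movable_only(movable_only));
	return 0;
}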
diff --git a/mm/migrate.c b/mm/migrate.c index 3b676b0c5c3e..3bbaf5d230b0 100644 --- a/mm/migrate.c +++ b/mm/migrate.c | |||
@@ -160,8 +160,10 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma, | |||
160 | if (is_write_migration_entry(entry)) | 160 | if (is_write_migration_entry(entry)) |
161 | pte = pte_mkwrite(pte); | 161 | pte = pte_mkwrite(pte); |
162 | #ifdef CONFIG_HUGETLB_PAGE | 162 | #ifdef CONFIG_HUGETLB_PAGE |
163 | if (PageHuge(new)) | 163 | if (PageHuge(new)) { |
164 | pte = pte_mkhuge(pte); | 164 | pte = pte_mkhuge(pte); |
165 | pte = arch_make_huge_pte(pte, vma, new, 0); | ||
166 | } | ||
165 | #endif | 167 | #endif |
166 | flush_cache_page(vma, addr, pte_pfn(pte)); | 168 | flush_cache_page(vma, addr, pte_pfn(pte)); |
167 | set_pte_at(mm, addr, ptep, pte); | 169 | set_pte_at(mm, addr, ptep, pte); |
@@ -462,7 +464,10 @@ void migrate_page_copy(struct page *newpage, struct page *page) | |||
462 | 464 | ||
463 | mlock_migrate_page(newpage, page); | 465 | mlock_migrate_page(newpage, page); |
464 | ksm_migrate_page(newpage, page); | 466 | ksm_migrate_page(newpage, page); |
465 | 467 | /* | |
468 | * Please do not reorder this without considering how mm/ksm.c's | ||
469 | * get_ksm_page() depends upon ksm_migrate_page() and PageSwapCache(). | ||
470 | */ | ||
466 | ClearPageSwapCache(page); | 471 | ClearPageSwapCache(page); |
467 | ClearPagePrivate(page); | 472 | ClearPagePrivate(page); |
468 | set_page_private(page, 0); | 473 | set_page_private(page, 0); |
@@ -696,7 +701,7 @@ static int move_to_new_page(struct page *newpage, struct page *page, | |||
696 | } | 701 | } |
697 | 702 | ||
698 | static int __unmap_and_move(struct page *page, struct page *newpage, | 703 | static int __unmap_and_move(struct page *page, struct page *newpage, |
699 | int force, bool offlining, enum migrate_mode mode) | 704 | int force, enum migrate_mode mode) |
700 | { | 705 | { |
701 | int rc = -EAGAIN; | 706 | int rc = -EAGAIN; |
702 | int remap_swapcache = 1; | 707 | int remap_swapcache = 1; |
@@ -726,20 +731,6 @@ static int __unmap_and_move(struct page *page, struct page *newpage, | |||
726 | lock_page(page); | 731 | lock_page(page); |
727 | } | 732 | } |
728 | 733 | ||
729 | /* | ||
730 | * Only memory hotplug's offline_pages() caller has locked out KSM, | ||
731 | * and can safely migrate a KSM page. The other cases have skipped | ||
732 | * PageKsm along with PageReserved - but it is only now when we have | ||
733 | * the page lock that we can be certain it will not go KSM beneath us | ||
734 | * (KSM will not upgrade a page from PageAnon to PageKsm when it sees | ||
735 | * its pagecount raised, but only here do we take the page lock which | ||
736 | * serializes that). | ||
737 | */ | ||
738 | if (PageKsm(page) && !offlining) { | ||
739 | rc = -EBUSY; | ||
740 | goto unlock; | ||
741 | } | ||
742 | |||
743 | /* charge against new page */ | 734 | /* charge against new page */ |
744 | mem_cgroup_prepare_migration(page, newpage, &mem); | 735 | mem_cgroup_prepare_migration(page, newpage, &mem); |
745 | 736 | ||
@@ -766,7 +757,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage, | |||
766 | * File Caches may use write_page() or lock_page() in migration, then, | 757 | * File Caches may use write_page() or lock_page() in migration, then, |
767 | * just care Anon page here. | 758 | * just care Anon page here. |
768 | */ | 759 | */ |
769 | if (PageAnon(page)) { | 760 | if (PageAnon(page) && !PageKsm(page)) { |
770 | /* | 761 | /* |
771 | * Only page_lock_anon_vma_read() understands the subtleties of | 762 | * Only page_lock_anon_vma_read() understands the subtleties of |
772 | * getting a hold on an anon_vma from outside one of its mms. | 763 | * getting a hold on an anon_vma from outside one of its mms. |
@@ -846,7 +837,6 @@ uncharge: | |||
846 | mem_cgroup_end_migration(mem, page, newpage, | 837 | mem_cgroup_end_migration(mem, page, newpage, |
847 | (rc == MIGRATEPAGE_SUCCESS || | 838 | (rc == MIGRATEPAGE_SUCCESS || |
848 | rc == MIGRATEPAGE_BALLOON_SUCCESS)); | 839 | rc == MIGRATEPAGE_BALLOON_SUCCESS)); |
849 | unlock: | ||
850 | unlock_page(page); | 840 | unlock_page(page); |
851 | out: | 841 | out: |
852 | return rc; | 842 | return rc; |
@@ -857,8 +847,7 @@ out: | |||
857 | * to the newly allocated page in newpage. | 847 | * to the newly allocated page in newpage. |
858 | */ | 848 | */ |
859 | static int unmap_and_move(new_page_t get_new_page, unsigned long private, | 849 | static int unmap_and_move(new_page_t get_new_page, unsigned long private, |
860 | struct page *page, int force, bool offlining, | 850 | struct page *page, int force, enum migrate_mode mode) |
861 | enum migrate_mode mode) | ||
862 | { | 851 | { |
863 | int rc = 0; | 852 | int rc = 0; |
864 | int *result = NULL; | 853 | int *result = NULL; |
@@ -876,7 +865,7 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, | |||
876 | if (unlikely(split_huge_page(page))) | 865 | if (unlikely(split_huge_page(page))) |
877 | goto out; | 866 | goto out; |
878 | 867 | ||
879 | rc = __unmap_and_move(page, newpage, force, offlining, mode); | 868 | rc = __unmap_and_move(page, newpage, force, mode); |
880 | 869 | ||
881 | if (unlikely(rc == MIGRATEPAGE_BALLOON_SUCCESS)) { | 870 | if (unlikely(rc == MIGRATEPAGE_BALLOON_SUCCESS)) { |
882 | /* | 871 | /* |
@@ -936,8 +925,7 @@ out: | |||
936 | */ | 925 | */ |
937 | static int unmap_and_move_huge_page(new_page_t get_new_page, | 926 | static int unmap_and_move_huge_page(new_page_t get_new_page, |
938 | unsigned long private, struct page *hpage, | 927 | unsigned long private, struct page *hpage, |
939 | int force, bool offlining, | 928 | int force, enum migrate_mode mode) |
940 | enum migrate_mode mode) | ||
941 | { | 929 | { |
942 | int rc = 0; | 930 | int rc = 0; |
943 | int *result = NULL; | 931 | int *result = NULL; |
@@ -999,9 +987,8 @@ out: | |||
999 | * | 987 | * |
1000 | * Return: Number of pages not migrated or error code. | 988 | * Return: Number of pages not migrated or error code. |
1001 | */ | 989 | */ |
1002 | int migrate_pages(struct list_head *from, | 990 | int migrate_pages(struct list_head *from, new_page_t get_new_page, |
1003 | new_page_t get_new_page, unsigned long private, bool offlining, | 991 | unsigned long private, enum migrate_mode mode, int reason) |
1004 | enum migrate_mode mode, int reason) | ||
1005 | { | 992 | { |
1006 | int retry = 1; | 993 | int retry = 1; |
1007 | int nr_failed = 0; | 994 | int nr_failed = 0; |
@@ -1022,8 +1009,7 @@ int migrate_pages(struct list_head *from, | |||
1022 | cond_resched(); | 1009 | cond_resched(); |
1023 | 1010 | ||
1024 | rc = unmap_and_move(get_new_page, private, | 1011 | rc = unmap_and_move(get_new_page, private, |
1025 | page, pass > 2, offlining, | 1012 | page, pass > 2, mode); |
1026 | mode); | ||
1027 | 1013 | ||
1028 | switch(rc) { | 1014 | switch(rc) { |
1029 | case -ENOMEM: | 1015 | case -ENOMEM: |
@@ -1056,15 +1042,13 @@ out: | |||
1056 | } | 1042 | } |
1057 | 1043 | ||
1058 | int migrate_huge_page(struct page *hpage, new_page_t get_new_page, | 1044 | int migrate_huge_page(struct page *hpage, new_page_t get_new_page, |
1059 | unsigned long private, bool offlining, | 1045 | unsigned long private, enum migrate_mode mode) |
1060 | enum migrate_mode mode) | ||
1061 | { | 1046 | { |
1062 | int pass, rc; | 1047 | int pass, rc; |
1063 | 1048 | ||
1064 | for (pass = 0; pass < 10; pass++) { | 1049 | for (pass = 0; pass < 10; pass++) { |
1065 | rc = unmap_and_move_huge_page(get_new_page, | 1050 | rc = unmap_and_move_huge_page(get_new_page, private, |
1066 | private, hpage, pass > 2, offlining, | 1051 | hpage, pass > 2, mode); |
1067 | mode); | ||
1068 | switch (rc) { | 1052 | switch (rc) { |
1069 | case -ENOMEM: | 1053 | case -ENOMEM: |
1070 | goto out; | 1054 | goto out; |
@@ -1150,7 +1134,7 @@ static int do_move_page_to_node_array(struct mm_struct *mm, | |||
1150 | goto set_status; | 1134 | goto set_status; |
1151 | 1135 | ||
1152 | /* Use PageReserved to check for zero page */ | 1136 | /* Use PageReserved to check for zero page */ |
1153 | if (PageReserved(page) || PageKsm(page)) | 1137 | if (PageReserved(page)) |
1154 | goto put_and_set; | 1138 | goto put_and_set; |
1155 | 1139 | ||
1156 | pp->page = page; | 1140 | pp->page = page; |
@@ -1187,8 +1171,7 @@ set_status: | |||
1187 | err = 0; | 1171 | err = 0; |
1188 | if (!list_empty(&pagelist)) { | 1172 | if (!list_empty(&pagelist)) { |
1189 | err = migrate_pages(&pagelist, new_page_node, | 1173 | err = migrate_pages(&pagelist, new_page_node, |
1190 | (unsigned long)pm, 0, MIGRATE_SYNC, | 1174 | (unsigned long)pm, MIGRATE_SYNC, MR_SYSCALL); |
1191 | MR_SYSCALL); | ||
1192 | if (err) | 1175 | if (err) |
1193 | putback_lru_pages(&pagelist); | 1176 | putback_lru_pages(&pagelist); |
1194 | } | 1177 | } |
@@ -1312,7 +1295,7 @@ static void do_pages_stat_array(struct mm_struct *mm, unsigned long nr_pages, | |||
1312 | 1295 | ||
1313 | err = -ENOENT; | 1296 | err = -ENOENT; |
1314 | /* Use PageReserved to check for zero page */ | 1297 | /* Use PageReserved to check for zero page */ |
1315 | if (!page || PageReserved(page) || PageKsm(page)) | 1298 | if (!page || PageReserved(page)) |
1316 | goto set_status; | 1299 | goto set_status; |
1317 | 1300 | ||
1318 | err = page_to_nid(page); | 1301 | err = page_to_nid(page); |
@@ -1459,7 +1442,7 @@ int migrate_vmas(struct mm_struct *mm, const nodemask_t *to, | |||
1459 | * pages. Currently it only checks the watermarks which crude | 1442 | * pages. Currently it only checks the watermarks which crude |
1460 | */ | 1443 | */ |
1461 | static bool migrate_balanced_pgdat(struct pglist_data *pgdat, | 1444 | static bool migrate_balanced_pgdat(struct pglist_data *pgdat, |
1462 | int nr_migrate_pages) | 1445 | unsigned long nr_migrate_pages) |
1463 | { | 1446 | { |
1464 | int z; | 1447 | int z; |
1465 | for (z = pgdat->nr_zones - 1; z >= 0; z--) { | 1448 | for (z = pgdat->nr_zones - 1; z >= 0; z--) { |
@@ -1495,7 +1478,7 @@ static struct page *alloc_misplaced_dst_page(struct page *page, | |||
1495 | __GFP_NOWARN) & | 1478 | __GFP_NOWARN) & |
1496 | ~GFP_IOFS, 0); | 1479 | ~GFP_IOFS, 0); |
1497 | if (newpage) | 1480 | if (newpage) |
1498 | page_xchg_last_nid(newpage, page_last_nid(page)); | 1481 | page_nid_xchg_last(newpage, page_nid_last(page)); |
1499 | 1482 | ||
1500 | return newpage; | 1483 | return newpage; |
1501 | } | 1484 | } |
@@ -1555,39 +1538,40 @@ bool numamigrate_update_ratelimit(pg_data_t *pgdat, unsigned long nr_pages) | |||
1555 | 1538 | ||
1556 | int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page) | 1539 | int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page) |
1557 | { | 1540 | { |
1558 | int ret = 0; | 1541 | int page_lru; |
1542 | |||
1543 | VM_BUG_ON(compound_order(page) && !PageTransHuge(page)); | ||
1559 | 1544 | ||
1560 | /* Avoid migrating to a node that is nearly full */ | 1545 | /* Avoid migrating to a node that is nearly full */ |
1561 | if (migrate_balanced_pgdat(pgdat, 1)) { | 1546 | if (!migrate_balanced_pgdat(pgdat, 1UL << compound_order(page))) |
1562 | int page_lru; | 1547 | return 0; |
1563 | 1548 | ||
1564 | if (isolate_lru_page(page)) { | 1549 | if (isolate_lru_page(page)) |
1565 | put_page(page); | 1550 | return 0; |
1566 | return 0; | ||
1567 | } | ||
1568 | 1551 | ||
1569 | /* Page is isolated */ | 1552 | /* |
1570 | ret = 1; | 1553 | * migrate_misplaced_transhuge_page() skips page migration's usual |
1571 | page_lru = page_is_file_cache(page); | 1554 | * check on page_count(), so we must do it here, now that the page |
1572 | if (!PageTransHuge(page)) | 1555 | * has been isolated: a GUP pin, or any other pin, prevents migration. |
1573 | inc_zone_page_state(page, NR_ISOLATED_ANON + page_lru); | 1556 | * The expected page count is 3: 1 for page's mapcount and 1 for the |
1574 | else | 1557 | * caller's pin and 1 for the reference taken by isolate_lru_page(). |
1575 | mod_zone_page_state(page_zone(page), | 1558 | */ |
1576 | NR_ISOLATED_ANON + page_lru, | 1559 | if (PageTransHuge(page) && page_count(page) != 3) { |
1577 | HPAGE_PMD_NR); | 1560 | putback_lru_page(page); |
1561 | return 0; | ||
1578 | } | 1562 | } |
1579 | 1563 | ||
1564 | page_lru = page_is_file_cache(page); | ||
1565 | mod_zone_page_state(page_zone(page), NR_ISOLATED_ANON + page_lru, | ||
1566 | hpage_nr_pages(page)); | ||
1567 | |||
1580 | /* | 1568 | /* |
1581 | * Page is either isolated or there is not enough space on the target | 1569 | * Isolating the page has taken another reference, so the |
1582 | * node. If isolated, then it has taken a reference count and the | 1570 | * caller's reference can be safely dropped without the page |
1583 | * callers reference can be safely dropped without the page | 1571 | * disappearing underneath us during migration. |
1584 | * disappearing underneath us during migration. Otherwise the page is | ||
1585 | * not to be migrated but the callers reference should still be | ||
1586 | * dropped so it does not leak. | ||
1587 | */ | 1572 | */ |
1588 | put_page(page); | 1573 | put_page(page); |
1589 | 1574 | return 1; | |
1590 | return ret; | ||
1591 | } | 1575 | } |
1592 | 1576 | ||
1593 | /* | 1577 | /* |
@@ -1598,7 +1582,7 @@ int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page) | |||
1598 | int migrate_misplaced_page(struct page *page, int node) | 1582 | int migrate_misplaced_page(struct page *page, int node) |
1599 | { | 1583 | { |
1600 | pg_data_t *pgdat = NODE_DATA(node); | 1584 | pg_data_t *pgdat = NODE_DATA(node); |
1601 | int isolated = 0; | 1585 | int isolated; |
1602 | int nr_remaining; | 1586 | int nr_remaining; |
1603 | LIST_HEAD(migratepages); | 1587 | LIST_HEAD(migratepages); |
1604 | 1588 | ||
@@ -1606,42 +1590,43 @@ int migrate_misplaced_page(struct page *page, int node) | |||
1606 | * Don't migrate pages that are mapped in multiple processes. | 1590 | * Don't migrate pages that are mapped in multiple processes. |
1607 | * TODO: Handle false sharing detection instead of this hammer | 1591 | * TODO: Handle false sharing detection instead of this hammer |
1608 | */ | 1592 | */ |
1609 | if (page_mapcount(page) != 1) { | 1593 | if (page_mapcount(page) != 1) |
1610 | put_page(page); | ||
1611 | goto out; | 1594 | goto out; |
1612 | } | ||
1613 | 1595 | ||
1614 | /* | 1596 | /* |
1615 | * Rate-limit the amount of data that is being migrated to a node. | 1597 | * Rate-limit the amount of data that is being migrated to a node. |
1616 | * Optimal placement is no good if the memory bus is saturated and | 1598 | * Optimal placement is no good if the memory bus is saturated and |
1617 | * all the time is being spent migrating! | 1599 | * all the time is being spent migrating! |
1618 | */ | 1600 | */ |
1619 | if (numamigrate_update_ratelimit(pgdat, 1)) { | 1601 | if (numamigrate_update_ratelimit(pgdat, 1)) |
1620 | put_page(page); | ||
1621 | goto out; | 1602 | goto out; |
1622 | } | ||
1623 | 1603 | ||
1624 | isolated = numamigrate_isolate_page(pgdat, page); | 1604 | isolated = numamigrate_isolate_page(pgdat, page); |
1625 | if (!isolated) | 1605 | if (!isolated) |
1626 | goto out; | 1606 | goto out; |
1627 | 1607 | ||
1628 | list_add(&page->lru, &migratepages); | 1608 | list_add(&page->lru, &migratepages); |
1629 | nr_remaining = migrate_pages(&migratepages, | 1609 | nr_remaining = migrate_pages(&migratepages, alloc_misplaced_dst_page, |
1630 | alloc_misplaced_dst_page, | 1610 | node, MIGRATE_ASYNC, MR_NUMA_MISPLACED); |
1631 | node, false, MIGRATE_ASYNC, | ||
1632 | MR_NUMA_MISPLACED); | ||
1633 | if (nr_remaining) { | 1611 | if (nr_remaining) { |
1634 | putback_lru_pages(&migratepages); | 1612 | putback_lru_pages(&migratepages); |
1635 | isolated = 0; | 1613 | isolated = 0; |
1636 | } else | 1614 | } else |
1637 | count_vm_numa_event(NUMA_PAGE_MIGRATE); | 1615 | count_vm_numa_event(NUMA_PAGE_MIGRATE); |
1638 | BUG_ON(!list_empty(&migratepages)); | 1616 | BUG_ON(!list_empty(&migratepages)); |
1639 | out: | ||
1640 | return isolated; | 1617 | return isolated; |
1618 | |||
1619 | out: | ||
1620 | put_page(page); | ||
1621 | return 0; | ||
1641 | } | 1622 | } |
1642 | #endif /* CONFIG_NUMA_BALANCING */ | 1623 | #endif /* CONFIG_NUMA_BALANCING */ |
1643 | 1624 | ||
1644 | #if defined(CONFIG_NUMA_BALANCING) && defined(CONFIG_TRANSPARENT_HUGEPAGE) | 1625 | #if defined(CONFIG_NUMA_BALANCING) && defined(CONFIG_TRANSPARENT_HUGEPAGE) |
1626 | /* | ||
1627 | * Migrates a THP to a given target node. page must be locked and is unlocked | ||
1628 | * before returning. | ||
1629 | */ | ||
1645 | int migrate_misplaced_transhuge_page(struct mm_struct *mm, | 1630 | int migrate_misplaced_transhuge_page(struct mm_struct *mm, |
1646 | struct vm_area_struct *vma, | 1631 | struct vm_area_struct *vma, |
1647 | pmd_t *pmd, pmd_t entry, | 1632 | pmd_t *pmd, pmd_t entry, |
@@ -1672,17 +1657,15 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm, | |||
1672 | 1657 | ||
1673 | new_page = alloc_pages_node(node, | 1658 | new_page = alloc_pages_node(node, |
1674 | (GFP_TRANSHUGE | GFP_THISNODE) & ~__GFP_WAIT, HPAGE_PMD_ORDER); | 1659 | (GFP_TRANSHUGE | GFP_THISNODE) & ~__GFP_WAIT, HPAGE_PMD_ORDER); |
1675 | if (!new_page) { | 1660 | if (!new_page) |
1676 | count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR); | 1661 | goto out_fail; |
1677 | goto out_dropref; | 1662 | |
1678 | } | 1663 | page_nid_xchg_last(new_page, page_nid_last(page)); |
1679 | page_xchg_last_nid(new_page, page_last_nid(page)); | ||
1680 | 1664 | ||
1681 | isolated = numamigrate_isolate_page(pgdat, page); | 1665 | isolated = numamigrate_isolate_page(pgdat, page); |
1682 | if (!isolated) { | 1666 | if (!isolated) { |
1683 | count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR); | ||
1684 | put_page(new_page); | 1667 | put_page(new_page); |
1685 | goto out_keep_locked; | 1668 | goto out_fail; |
1686 | } | 1669 | } |
1687 | 1670 | ||
1688 | /* Prepare a page as a migration target */ | 1671 | /* Prepare a page as a migration target */ |
@@ -1714,6 +1697,7 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm, | |||
1714 | putback_lru_page(page); | 1697 | putback_lru_page(page); |
1715 | 1698 | ||
1716 | count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR); | 1699 | count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR); |
1700 | isolated = 0; | ||
1717 | goto out; | 1701 | goto out; |
1718 | } | 1702 | } |
1719 | 1703 | ||
@@ -1758,9 +1742,11 @@ out: | |||
1758 | -HPAGE_PMD_NR); | 1742 | -HPAGE_PMD_NR); |
1759 | return isolated; | 1743 | return isolated; |
1760 | 1744 | ||
1745 | out_fail: | ||
1746 | count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR); | ||
1761 | out_dropref: | 1747 | out_dropref: |
1748 | unlock_page(page); | ||
1762 | put_page(page); | 1749 | put_page(page); |
1763 | out_keep_locked: | ||
1764 | return 0; | 1750 | return 0; |
1765 | } | 1751 | } |
1766 | #endif /* CONFIG_NUMA_BALANCING */ | 1752 | #endif /* CONFIG_NUMA_BALANCING */ |
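With the offlining argument gone, every migrate_pages() caller in this diff converges on the same shape. A sketch of the updated call pattern, taken from the call sites above; pagelist, new_page_node and private_arg are placeholders, and error handling is trimmed.

	if (!list_empty(&pagelist)) {
		/* private_arg is whatever context the new-page allocator
		 * needs; the reason code feeds the migration tracepoints */
		int nr_failed = migrate_pages(&pagelist, new_page_node,
					      (unsigned long)private_arg,
					      MIGRATE_SYNC, MR_SYSCALL);
		if (nr_failed)
			putback_lru_pages(&pagelist);
	}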
diff --git a/mm/mincore.c b/mm/mincore.c index 936b4cee8cb1..da2be56a7b8f 100644 --- a/mm/mincore.c +++ b/mm/mincore.c | |||
@@ -75,7 +75,7 @@ static unsigned char mincore_page(struct address_space *mapping, pgoff_t pgoff) | |||
75 | /* shmem/tmpfs may return swap: account for swapcache page too. */ | 75 | /* shmem/tmpfs may return swap: account for swapcache page too. */ |
76 | if (radix_tree_exceptional_entry(page)) { | 76 | if (radix_tree_exceptional_entry(page)) { |
77 | swp_entry_t swap = radix_to_swp_entry(page); | 77 | swp_entry_t swap = radix_to_swp_entry(page); |
78 | page = find_get_page(&swapper_space, swap.val); | 78 | page = find_get_page(swap_address_space(swap), swap.val); |
79 | } | 79 | } |
80 | #endif | 80 | #endif |
81 | if (page) { | 81 | if (page) { |
@@ -135,7 +135,8 @@ static void mincore_pte_range(struct vm_area_struct *vma, pmd_t *pmd, | |||
135 | } else { | 135 | } else { |
136 | #ifdef CONFIG_SWAP | 136 | #ifdef CONFIG_SWAP |
137 | pgoff = entry.val; | 137 | pgoff = entry.val; |
138 | *vec = mincore_page(&swapper_space, pgoff); | 138 | *vec = mincore_page(swap_address_space(entry), |
139 | pgoff); | ||
139 | #else | 140 | #else |
140 | WARN_ON(1); | 141 | WARN_ON(1); |
141 | *vec = 1; | 142 | *vec = 1; |
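The mincore changes stop hard-coding the single swapper_space and instead ask swap_address_space() for the address_space backing a given swap entry. The series presumably keys this off the swap type, along the lines of the sketch below; the array name and macro shape are an assumption, not copied from the patch.

	/* assumed shape of the helper used above: one address_space (and
	 * thus one radix tree and one lock) per swap type, i.e. per swap
	 * device/file, instead of a single global swapper_space */
	struct address_space swapper_spaces[MAX_SWAPFILES];
	#define swap_address_space(entry) \
		(&swapper_spaces[swp_type(entry)])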
diff --git a/mm/mlock.c b/mm/mlock.c index f0b9ce572fc7..1c5e33fce639 100644 --- a/mm/mlock.c +++ b/mm/mlock.c | |||
@@ -102,13 +102,16 @@ void mlock_vma_page(struct page *page) | |||
102 | * can't isolate the page, we leave it for putback_lru_page() and vmscan | 102 | * can't isolate the page, we leave it for putback_lru_page() and vmscan |
103 | * [page_referenced()/try_to_unmap()] to deal with. | 103 | * [page_referenced()/try_to_unmap()] to deal with. |
104 | */ | 104 | */ |
105 | void munlock_vma_page(struct page *page) | 105 | unsigned int munlock_vma_page(struct page *page) |
106 | { | 106 | { |
107 | unsigned int page_mask = 0; | ||
108 | |||
107 | BUG_ON(!PageLocked(page)); | 109 | BUG_ON(!PageLocked(page)); |
108 | 110 | ||
109 | if (TestClearPageMlocked(page)) { | 111 | if (TestClearPageMlocked(page)) { |
110 | mod_zone_page_state(page_zone(page), NR_MLOCK, | 112 | unsigned int nr_pages = hpage_nr_pages(page); |
111 | -hpage_nr_pages(page)); | 113 | mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages); |
114 | page_mask = nr_pages - 1; | ||
112 | if (!isolate_lru_page(page)) { | 115 | if (!isolate_lru_page(page)) { |
113 | int ret = SWAP_AGAIN; | 116 | int ret = SWAP_AGAIN; |
114 | 117 | ||
@@ -141,6 +144,8 @@ void munlock_vma_page(struct page *page) | |||
141 | count_vm_event(UNEVICTABLE_PGMUNLOCKED); | 144 | count_vm_event(UNEVICTABLE_PGMUNLOCKED); |
142 | } | 145 | } |
143 | } | 146 | } |
147 | |||
148 | return page_mask; | ||
144 | } | 149 | } |
145 | 150 | ||
146 | /** | 151 | /** |
@@ -155,13 +160,11 @@ void munlock_vma_page(struct page *page) | |||
155 | * | 160 | * |
156 | * vma->vm_mm->mmap_sem must be held for at least read. | 161 | * vma->vm_mm->mmap_sem must be held for at least read. |
157 | */ | 162 | */ |
158 | static long __mlock_vma_pages_range(struct vm_area_struct *vma, | 163 | long __mlock_vma_pages_range(struct vm_area_struct *vma, |
159 | unsigned long start, unsigned long end, | 164 | unsigned long start, unsigned long end, int *nonblocking) |
160 | int *nonblocking) | ||
161 | { | 165 | { |
162 | struct mm_struct *mm = vma->vm_mm; | 166 | struct mm_struct *mm = vma->vm_mm; |
163 | unsigned long addr = start; | 167 | unsigned long nr_pages = (end - start) / PAGE_SIZE; |
164 | int nr_pages = (end - start) / PAGE_SIZE; | ||
165 | int gup_flags; | 168 | int gup_flags; |
166 | 169 | ||
167 | VM_BUG_ON(start & ~PAGE_MASK); | 170 | VM_BUG_ON(start & ~PAGE_MASK); |
@@ -186,7 +189,11 @@ static long __mlock_vma_pages_range(struct vm_area_struct *vma, | |||
186 | if (vma->vm_flags & (VM_READ | VM_WRITE | VM_EXEC)) | 189 | if (vma->vm_flags & (VM_READ | VM_WRITE | VM_EXEC)) |
187 | gup_flags |= FOLL_FORCE; | 190 | gup_flags |= FOLL_FORCE; |
188 | 191 | ||
189 | return __get_user_pages(current, mm, addr, nr_pages, gup_flags, | 192 | /* |
193 | * We made sure addr is within a VMA, so the following will | ||
194 | * not result in a stack expansion that recurses back here. | ||
195 | */ | ||
196 | return __get_user_pages(current, mm, start, nr_pages, gup_flags, | ||
190 | NULL, NULL, nonblocking); | 197 | NULL, NULL, nonblocking); |
191 | } | 198 | } |
192 | 199 | ||
@@ -202,56 +209,6 @@ static int __mlock_posix_error_return(long retval) | |||
202 | return retval; | 209 | return retval; |
203 | } | 210 | } |
204 | 211 | ||
205 | /** | ||
206 | * mlock_vma_pages_range() - mlock pages in specified vma range. | ||
207 | * @vma - the vma containing the specfied address range | ||
208 | * @start - starting address in @vma to mlock | ||
209 | * @end - end address [+1] in @vma to mlock | ||
210 | * | ||
211 | * For mmap()/mremap()/expansion of mlocked vma. | ||
212 | * | ||
213 | * return 0 on success for "normal" vmas. | ||
214 | * | ||
215 | * return number of pages [> 0] to be removed from locked_vm on success | ||
216 | * of "special" vmas. | ||
217 | */ | ||
218 | long mlock_vma_pages_range(struct vm_area_struct *vma, | ||
219 | unsigned long start, unsigned long end) | ||
220 | { | ||
221 | int nr_pages = (end - start) / PAGE_SIZE; | ||
222 | BUG_ON(!(vma->vm_flags & VM_LOCKED)); | ||
223 | |||
224 | /* | ||
225 | * filter unlockable vmas | ||
226 | */ | ||
227 | if (vma->vm_flags & (VM_IO | VM_PFNMAP)) | ||
228 | goto no_mlock; | ||
229 | |||
230 | if (!((vma->vm_flags & VM_DONTEXPAND) || | ||
231 | is_vm_hugetlb_page(vma) || | ||
232 | vma == get_gate_vma(current->mm))) { | ||
233 | |||
234 | __mlock_vma_pages_range(vma, start, end, NULL); | ||
235 | |||
236 | /* Hide errors from mmap() and other callers */ | ||
237 | return 0; | ||
238 | } | ||
239 | |||
240 | /* | ||
241 | * User mapped kernel pages or huge pages: | ||
242 | * make these pages present to populate the ptes, but | ||
243 | * fall thru' to reset VM_LOCKED--no need to unlock, and | ||
244 | * return nr_pages so these don't get counted against task's | ||
245 | * locked limit. huge pages are already counted against | ||
246 | * locked vm limit. | ||
247 | */ | ||
248 | make_pages_present(start, end); | ||
249 | |||
250 | no_mlock: | ||
251 | vma->vm_flags &= ~VM_LOCKED; /* and don't come back! */ | ||
252 | return nr_pages; /* error or pages NOT mlocked */ | ||
253 | } | ||
254 | |||
255 | /* | 212 | /* |
256 | * munlock_vma_pages_range() - munlock all pages in the vma range.' | 213 | * munlock_vma_pages_range() - munlock all pages in the vma range.' |
257 | * @vma - vma containing range to be munlock()ed. | 214 | * @vma - vma containing range to be munlock()ed. |
@@ -273,13 +230,12 @@ no_mlock: | |||
273 | void munlock_vma_pages_range(struct vm_area_struct *vma, | 230 | void munlock_vma_pages_range(struct vm_area_struct *vma, |
274 | unsigned long start, unsigned long end) | 231 | unsigned long start, unsigned long end) |
275 | { | 232 | { |
276 | unsigned long addr; | ||
277 | |||
278 | lru_add_drain(); | ||
279 | vma->vm_flags &= ~VM_LOCKED; | 233 | vma->vm_flags &= ~VM_LOCKED; |
280 | 234 | ||
281 | for (addr = start; addr < end; addr += PAGE_SIZE) { | 235 | while (start < end) { |
282 | struct page *page; | 236 | struct page *page; |
237 | unsigned int page_mask, page_increm; | ||
238 | |||
283 | /* | 239 | /* |
284 | * Although FOLL_DUMP is intended for get_dump_page(), | 240 | * Although FOLL_DUMP is intended for get_dump_page(), |
285 | * it just so happens that its special treatment of the | 241 | * it just so happens that its special treatment of the |
@@ -287,13 +243,22 @@ void munlock_vma_pages_range(struct vm_area_struct *vma, | |||
287 | * suits munlock very well (and if somehow an abnormal page | 243 | * suits munlock very well (and if somehow an abnormal page |
288 | * has sneaked into the range, we won't oops here: great). | 244 | * has sneaked into the range, we won't oops here: great). |
289 | */ | 245 | */ |
290 | page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP); | 246 | page = follow_page_mask(vma, start, FOLL_GET | FOLL_DUMP, |
247 | &page_mask); | ||
291 | if (page && !IS_ERR(page)) { | 248 | if (page && !IS_ERR(page)) { |
292 | lock_page(page); | 249 | lock_page(page); |
293 | munlock_vma_page(page); | 250 | lru_add_drain(); |
251 | /* | ||
252 | * Any THP page found by follow_page_mask() may have | ||
253 | * gotten split before reaching munlock_vma_page(), | ||
254 | * so we need to recompute the page_mask here. | ||
255 | */ | ||
256 | page_mask = munlock_vma_page(page); | ||
294 | unlock_page(page); | 257 | unlock_page(page); |
295 | put_page(page); | 258 | put_page(page); |
296 | } | 259 | } |
260 | page_increm = 1 + (~(start >> PAGE_SHIFT) & page_mask); | ||
261 | start += page_increm * PAGE_SIZE; | ||
297 | cond_resched(); | 262 | cond_resched(); |
298 | } | 263 | } |
299 | } | 264 | } |
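The rewritten munlock loop above advances by whole pages of whatever size follow_page_mask() reported, using page_increm = 1 + (~(start >> PAGE_SHIFT) & page_mask) to land exactly on the first byte past the current (possibly huge) page. A small self-contained example of that arithmetic, assuming 4K base pages and a 512-page THP (both assumptions, chosen for illustration):

#include <stdio.h>

#define PAGE_SHIFT 12

/* page_mask is (pages_in_page - 1): 0 for a normal page, 511 for a 2M THP
 * built from 4K base pages.  The expression rounds 'start' up to the next
 * page boundary of that size, even when start points into the middle of
 * the huge page. */
static unsigned long page_increm(unsigned long start, unsigned int page_mask)
{
	return 1 + (~(start >> PAGE_SHIFT) & page_mask);
}

int main(void)
{
	unsigned long start = 0x200000 + 0x7000;	/* 7 pages into a 2M THP */

	printf("small page: advance %lu page(s)\n", page_increm(start, 0));
	printf("THP:        advance %lu page(s)\n", page_increm(start, 511));
	/* 505 pages from here reaches 0x400000, the next 2M boundary */
	return 0;
}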
@@ -303,7 +268,7 @@ void munlock_vma_pages_range(struct vm_area_struct *vma, | |||
303 | * | 268 | * |
304 | * Filters out "special" vmas -- VM_LOCKED never gets set for these, and | 269 | * Filters out "special" vmas -- VM_LOCKED never gets set for these, and |
305 | * munlock is a no-op. However, for some special vmas, we go ahead and | 270 | * munlock is a no-op. However, for some special vmas, we go ahead and |
306 | * populate the ptes via make_pages_present(). | 271 | * populate the ptes. |
307 | * | 272 | * |
308 | * For vmas that pass the filters, merge/split as appropriate. | 273 | * For vmas that pass the filters, merge/split as appropriate. |
309 | */ | 274 | */ |
@@ -391,9 +356,9 @@ static int do_mlock(unsigned long start, size_t len, int on) | |||
391 | 356 | ||
392 | /* Here we know that vma->vm_start <= nstart < vma->vm_end. */ | 357 | /* Here we know that vma->vm_start <= nstart < vma->vm_end. */ |
393 | 358 | ||
394 | newflags = vma->vm_flags | VM_LOCKED; | 359 | newflags = vma->vm_flags & ~VM_LOCKED; |
395 | if (!on) | 360 | if (on) |
396 | newflags &= ~VM_LOCKED; | 361 | newflags |= VM_LOCKED | VM_POPULATE; |
397 | 362 | ||
398 | tmp = vma->vm_end; | 363 | tmp = vma->vm_end; |
399 | if (tmp > end) | 364 | if (tmp > end) |
@@ -416,13 +381,20 @@ static int do_mlock(unsigned long start, size_t len, int on) | |||
416 | return error; | 381 | return error; |
417 | } | 382 | } |
418 | 383 | ||
419 | static int do_mlock_pages(unsigned long start, size_t len, int ignore_errors) | 384 | /* |
385 | * __mm_populate - populate and/or mlock pages within a range of address space. | ||
386 | * | ||
387 | * This is used to implement mlock() and the MAP_POPULATE / MAP_LOCKED mmap | ||
388 | * flags. VMAs must be already marked with the desired vm_flags, and | ||
389 | * mmap_sem must not be held. | ||
390 | */ | ||
391 | int __mm_populate(unsigned long start, unsigned long len, int ignore_errors) | ||
420 | { | 392 | { |
421 | struct mm_struct *mm = current->mm; | 393 | struct mm_struct *mm = current->mm; |
422 | unsigned long end, nstart, nend; | 394 | unsigned long end, nstart, nend; |
423 | struct vm_area_struct *vma = NULL; | 395 | struct vm_area_struct *vma = NULL; |
424 | int locked = 0; | 396 | int locked = 0; |
425 | int ret = 0; | 397 | long ret = 0; |
426 | 398 | ||
427 | VM_BUG_ON(start & ~PAGE_MASK); | 399 | VM_BUG_ON(start & ~PAGE_MASK); |
428 | VM_BUG_ON(len != PAGE_ALIGN(len)); | 400 | VM_BUG_ON(len != PAGE_ALIGN(len)); |
@@ -446,7 +418,8 @@ static int do_mlock_pages(unsigned long start, size_t len, int ignore_errors) | |||
446 | * range with the first VMA. Also, skip undesirable VMA types. | 418 | * range with the first VMA. Also, skip undesirable VMA types. |
447 | */ | 419 | */ |
448 | nend = min(end, vma->vm_end); | 420 | nend = min(end, vma->vm_end); |
449 | if (vma->vm_flags & (VM_IO | VM_PFNMAP)) | 421 | if ((vma->vm_flags & (VM_IO | VM_PFNMAP | VM_POPULATE)) != |
422 | VM_POPULATE) | ||
450 | continue; | 423 | continue; |
451 | if (nstart < vma->vm_start) | 424 | if (nstart < vma->vm_start) |
452 | nstart = vma->vm_start; | 425 | nstart = vma->vm_start; |
@@ -498,7 +471,7 @@ SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len) | |||
498 | error = do_mlock(start, len, 1); | 471 | error = do_mlock(start, len, 1); |
499 | up_write(&current->mm->mmap_sem); | 472 | up_write(&current->mm->mmap_sem); |
500 | if (!error) | 473 | if (!error) |
501 | error = do_mlock_pages(start, len, 0); | 474 | error = __mm_populate(start, len, 0); |
502 | return error; | 475 | return error; |
503 | } | 476 | } |
504 | 477 | ||
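A small userspace illustration of the split introduced above: do_mlock() only marks the VMAs while holding mmap_sem, and __mm_populate() faults the pages in after the semaphore has been dropped. The program only exercises the syscall surface; the comment paraphrases the kernel-side behaviour rather than quoting it.

#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

int main(void)
{
	size_t len = 4 * 4096;
	char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (p == MAP_FAILED)
		return 1;

	/* do_mlock() sets VM_LOCKED | VM_POPULATE on the VMAs under
	 * mmap_sem; the pages themselves are faulted in afterwards by
	 * __mm_populate(), which runs with mmap_sem released so the
	 * faults can retake it. */
	if (mlock(p, len))
		perror("mlock");

	memset(p, 0, len);	/* already resident, no major faults expected */
	munlock(p, len);
	munmap(p, len);
	return 0;
}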
@@ -517,20 +490,20 @@ SYSCALL_DEFINE2(munlock, unsigned long, start, size_t, len) | |||
517 | static int do_mlockall(int flags) | 490 | static int do_mlockall(int flags) |
518 | { | 491 | { |
519 | struct vm_area_struct * vma, * prev = NULL; | 492 | struct vm_area_struct * vma, * prev = NULL; |
520 | unsigned int def_flags = 0; | ||
521 | 493 | ||
522 | if (flags & MCL_FUTURE) | 494 | if (flags & MCL_FUTURE) |
523 | def_flags = VM_LOCKED; | 495 | current->mm->def_flags |= VM_LOCKED | VM_POPULATE; |
524 | current->mm->def_flags = def_flags; | 496 | else |
497 | current->mm->def_flags &= ~(VM_LOCKED | VM_POPULATE); | ||
525 | if (flags == MCL_FUTURE) | 498 | if (flags == MCL_FUTURE) |
526 | goto out; | 499 | goto out; |
527 | 500 | ||
528 | for (vma = current->mm->mmap; vma ; vma = prev->vm_next) { | 501 | for (vma = current->mm->mmap; vma ; vma = prev->vm_next) { |
529 | vm_flags_t newflags; | 502 | vm_flags_t newflags; |
530 | 503 | ||
531 | newflags = vma->vm_flags | VM_LOCKED; | 504 | newflags = vma->vm_flags & ~VM_LOCKED; |
532 | if (!(flags & MCL_CURRENT)) | 505 | if (flags & MCL_CURRENT) |
533 | newflags &= ~VM_LOCKED; | 506 | newflags |= VM_LOCKED | VM_POPULATE; |
534 | 507 | ||
535 | /* Ignore errors */ | 508 | /* Ignore errors */ |
536 | mlock_fixup(vma, &prev, vma->vm_start, vma->vm_end, newflags); | 509 | mlock_fixup(vma, &prev, vma->vm_start, vma->vm_end, newflags); |
@@ -564,10 +537,8 @@ SYSCALL_DEFINE1(mlockall, int, flags) | |||
564 | capable(CAP_IPC_LOCK)) | 537 | capable(CAP_IPC_LOCK)) |
565 | ret = do_mlockall(flags); | 538 | ret = do_mlockall(flags); |
566 | up_write(&current->mm->mmap_sem); | 539 | up_write(&current->mm->mmap_sem); |
567 | if (!ret && (flags & MCL_CURRENT)) { | 540 | if (!ret && (flags & MCL_CURRENT)) |
568 | /* Ignore errors */ | 541 | mm_populate(0, TASK_SIZE); |
569 | do_mlock_pages(0, TASK_SIZE, 1); | ||
570 | } | ||
571 | out: | 542 | out: |
572 | return ret; | 543 | return ret; |
573 | } | 544 | } |
diff --git a/mm/mm_init.c b/mm/mm_init.c index 1ffd97ae26d7..c280a02ea11e 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c | |||
@@ -69,34 +69,41 @@ void __init mminit_verify_pageflags_layout(void) | |||
69 | unsigned long or_mask, add_mask; | 69 | unsigned long or_mask, add_mask; |
70 | 70 | ||
71 | shift = 8 * sizeof(unsigned long); | 71 | shift = 8 * sizeof(unsigned long); |
72 | width = shift - SECTIONS_WIDTH - NODES_WIDTH - ZONES_WIDTH; | 72 | width = shift - SECTIONS_WIDTH - NODES_WIDTH - ZONES_WIDTH - LAST_NID_SHIFT; |
73 | mminit_dprintk(MMINIT_TRACE, "pageflags_layout_widths", | 73 | mminit_dprintk(MMINIT_TRACE, "pageflags_layout_widths", |
74 | "Section %d Node %d Zone %d Flags %d\n", | 74 | "Section %d Node %d Zone %d Lastnid %d Flags %d\n", |
75 | SECTIONS_WIDTH, | 75 | SECTIONS_WIDTH, |
76 | NODES_WIDTH, | 76 | NODES_WIDTH, |
77 | ZONES_WIDTH, | 77 | ZONES_WIDTH, |
78 | LAST_NID_WIDTH, | ||
78 | NR_PAGEFLAGS); | 79 | NR_PAGEFLAGS); |
79 | mminit_dprintk(MMINIT_TRACE, "pageflags_layout_shifts", | 80 | mminit_dprintk(MMINIT_TRACE, "pageflags_layout_shifts", |
80 | "Section %d Node %d Zone %d\n", | 81 | "Section %d Node %d Zone %d Lastnid %d\n", |
81 | SECTIONS_SHIFT, | 82 | SECTIONS_SHIFT, |
82 | NODES_SHIFT, | 83 | NODES_SHIFT, |
83 | ZONES_SHIFT); | 84 | ZONES_SHIFT, |
84 | mminit_dprintk(MMINIT_TRACE, "pageflags_layout_offsets", | 85 | LAST_NID_SHIFT); |
85 | "Section %lu Node %lu Zone %lu\n", | 86 | mminit_dprintk(MMINIT_TRACE, "pageflags_layout_pgshifts", |
87 | "Section %lu Node %lu Zone %lu Lastnid %lu\n", | ||
86 | (unsigned long)SECTIONS_PGSHIFT, | 88 | (unsigned long)SECTIONS_PGSHIFT, |
87 | (unsigned long)NODES_PGSHIFT, | 89 | (unsigned long)NODES_PGSHIFT, |
88 | (unsigned long)ZONES_PGSHIFT); | 90 | (unsigned long)ZONES_PGSHIFT, |
89 | mminit_dprintk(MMINIT_TRACE, "pageflags_layout_zoneid", | 91 | (unsigned long)LAST_NID_PGSHIFT); |
90 | "Zone ID: %lu -> %lu\n", | 92 | mminit_dprintk(MMINIT_TRACE, "pageflags_layout_nodezoneid", |
91 | (unsigned long)ZONEID_PGOFF, | 93 | "Node/Zone ID: %lu -> %lu\n", |
92 | (unsigned long)(ZONEID_PGOFF + ZONEID_SHIFT)); | 94 | (unsigned long)(ZONEID_PGOFF + ZONEID_SHIFT), |
95 | (unsigned long)ZONEID_PGOFF); | ||
93 | mminit_dprintk(MMINIT_TRACE, "pageflags_layout_usage", | 96 | mminit_dprintk(MMINIT_TRACE, "pageflags_layout_usage", |
94 | "location: %d -> %d unused %d -> %d flags %d -> %d\n", | 97 | "location: %d -> %d layout %d -> %d unused %d -> %d page-flags\n", |
95 | shift, width, width, NR_PAGEFLAGS, NR_PAGEFLAGS, 0); | 98 | shift, width, width, NR_PAGEFLAGS, NR_PAGEFLAGS, 0); |
96 | #ifdef NODE_NOT_IN_PAGE_FLAGS | 99 | #ifdef NODE_NOT_IN_PAGE_FLAGS |
97 | mminit_dprintk(MMINIT_TRACE, "pageflags_layout_nodeflags", | 100 | mminit_dprintk(MMINIT_TRACE, "pageflags_layout_nodeflags", |
98 | "Node not in page flags"); | 101 | "Node not in page flags"); |
99 | #endif | 102 | #endif |
103 | #ifdef LAST_NID_NOT_IN_PAGE_FLAGS | ||
104 | mminit_dprintk(MMINIT_TRACE, "pageflags_layout_nodeflags", | ||
105 | "Last nid not in page flags"); | ||
106 | #endif | ||
100 | 107 | ||
101 | if (SECTIONS_WIDTH) { | 108 | if (SECTIONS_WIDTH) { |
102 | shift -= SECTIONS_WIDTH; | 109 | shift -= SECTIONS_WIDTH; |
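The extra Lastnid column in these printks reflects one more field packed into the upper part of page->flags next to section, node and zone. The standalone sketch below shows the general pack/extract arithmetic with made-up widths; the real widths, and the fact that the kernel packs these fields from the top of the word above the ordinary page flags, depend on the configuration this function is verifying.

#include <stdio.h>

/* Illustrative widths only -- the real SECTIONS/NODES/ZONES/LAST_NID widths
 * are configuration dependent. */
#define ZONES_WIDTH    2
#define NODES_WIDTH    6
#define LAST_NID_WIDTH 6

#define ZONES_SHIFT    0	/* relative offsets within the packed field */
#define NODES_SHIFT    (ZONES_SHIFT + ZONES_WIDTH)
#define LAST_NID_SHIFT (NODES_SHIFT + NODES_WIDTH)

#define FIELD_MASK(w)  ((1UL << (w)) - 1)

static unsigned long pack(unsigned long zone, unsigned long nid,
			  unsigned long last_nid)
{
	return (zone << ZONES_SHIFT) | (nid << NODES_SHIFT) |
	       (last_nid << LAST_NID_SHIFT);
}

int main(void)
{
	unsigned long flags = pack(1, 3, 5);

	printf("zone=%lu node=%lu last_nid=%lu\n",
	       (flags >> ZONES_SHIFT) & FIELD_MASK(ZONES_WIDTH),
	       (flags >> NODES_SHIFT) & FIELD_MASK(NODES_WIDTH),
	       (flags >> LAST_NID_SHIFT) & FIELD_MASK(LAST_NID_WIDTH));
	return 0;
}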
@@ -32,6 +32,7 @@ | |||
32 | #include <linux/khugepaged.h> | 32 | #include <linux/khugepaged.h> |
33 | #include <linux/uprobes.h> | 33 | #include <linux/uprobes.h> |
34 | #include <linux/rbtree_augmented.h> | 34 | #include <linux/rbtree_augmented.h> |
35 | #include <linux/sched/sysctl.h> | ||
35 | 36 | ||
36 | #include <asm/uaccess.h> | 37 | #include <asm/uaccess.h> |
37 | #include <asm/cacheflush.h> | 38 | #include <asm/cacheflush.h> |
@@ -143,7 +144,7 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) | |||
143 | */ | 144 | */ |
144 | free -= global_page_state(NR_SHMEM); | 145 | free -= global_page_state(NR_SHMEM); |
145 | 146 | ||
146 | free += nr_swap_pages; | 147 | free += get_nr_swap_pages(); |
147 | 148 | ||
148 | /* | 149 | /* |
149 | * Any slabs which are created with the | 150 | * Any slabs which are created with the |
@@ -202,7 +203,7 @@ static void __remove_shared_vm_struct(struct vm_area_struct *vma, | |||
202 | struct file *file, struct address_space *mapping) | 203 | struct file *file, struct address_space *mapping) |
203 | { | 204 | { |
204 | if (vma->vm_flags & VM_DENYWRITE) | 205 | if (vma->vm_flags & VM_DENYWRITE) |
205 | atomic_inc(&file->f_path.dentry->d_inode->i_writecount); | 206 | atomic_inc(&file_inode(file)->i_writecount); |
206 | if (vma->vm_flags & VM_SHARED) | 207 | if (vma->vm_flags & VM_SHARED) |
207 | mapping->i_mmap_writable--; | 208 | mapping->i_mmap_writable--; |
208 | 209 | ||
@@ -255,6 +256,7 @@ SYSCALL_DEFINE1(brk, unsigned long, brk) | |||
255 | unsigned long newbrk, oldbrk; | 256 | unsigned long newbrk, oldbrk; |
256 | struct mm_struct *mm = current->mm; | 257 | struct mm_struct *mm = current->mm; |
257 | unsigned long min_brk; | 258 | unsigned long min_brk; |
259 | bool populate; | ||
258 | 260 | ||
259 | down_write(&mm->mmap_sem); | 261 | down_write(&mm->mmap_sem); |
260 | 262 | ||
@@ -304,8 +306,15 @@ SYSCALL_DEFINE1(brk, unsigned long, brk) | |||
304 | /* Ok, looks good - let it rip. */ | 306 | /* Ok, looks good - let it rip. */ |
305 | if (do_brk(oldbrk, newbrk-oldbrk) != oldbrk) | 307 | if (do_brk(oldbrk, newbrk-oldbrk) != oldbrk) |
306 | goto out; | 308 | goto out; |
309 | |||
307 | set_brk: | 310 | set_brk: |
308 | mm->brk = brk; | 311 | mm->brk = brk; |
312 | populate = newbrk > oldbrk && (mm->def_flags & VM_LOCKED) != 0; | ||
313 | up_write(&mm->mmap_sem); | ||
314 | if (populate) | ||
315 | mm_populate(oldbrk, newbrk - oldbrk); | ||
316 | return brk; | ||
317 | |||
309 | out: | 318 | out: |
310 | retval = mm->brk; | 319 | retval = mm->brk; |
311 | up_write(&mm->mmap_sem); | 320 | up_write(&mm->mmap_sem); |
@@ -567,7 +576,7 @@ static void __vma_link_file(struct vm_area_struct *vma) | |||
567 | struct address_space *mapping = file->f_mapping; | 576 | struct address_space *mapping = file->f_mapping; |
568 | 577 | ||
569 | if (vma->vm_flags & VM_DENYWRITE) | 578 | if (vma->vm_flags & VM_DENYWRITE) |
570 | atomic_dec(&file->f_path.dentry->d_inode->i_writecount); | 579 | atomic_dec(&file_inode(file)->i_writecount); |
571 | if (vma->vm_flags & VM_SHARED) | 580 | if (vma->vm_flags & VM_SHARED) |
572 | mapping->i_mmap_writable++; | 581 | mapping->i_mmap_writable++; |
573 | 582 | ||
@@ -800,7 +809,7 @@ again: remove_next = 1 + (end > next->vm_end); | |||
800 | anon_vma_interval_tree_post_update_vma(vma); | 809 | anon_vma_interval_tree_post_update_vma(vma); |
801 | if (adjust_next) | 810 | if (adjust_next) |
802 | anon_vma_interval_tree_post_update_vma(next); | 811 | anon_vma_interval_tree_post_update_vma(next); |
803 | anon_vma_unlock(anon_vma); | 812 | anon_vma_unlock_write(anon_vma); |
804 | } | 813 | } |
805 | if (mapping) | 814 | if (mapping) |
806 | mutex_unlock(&mapping->i_mmap_mutex); | 815 | mutex_unlock(&mapping->i_mmap_mutex); |
@@ -1153,12 +1162,15 @@ static inline unsigned long round_hint_to_min(unsigned long hint) | |||
1153 | 1162 | ||
1154 | unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, | 1163 | unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, |
1155 | unsigned long len, unsigned long prot, | 1164 | unsigned long len, unsigned long prot, |
1156 | unsigned long flags, unsigned long pgoff) | 1165 | unsigned long flags, unsigned long pgoff, |
1166 | unsigned long *populate) | ||
1157 | { | 1167 | { |
1158 | struct mm_struct * mm = current->mm; | 1168 | struct mm_struct * mm = current->mm; |
1159 | struct inode *inode; | 1169 | struct inode *inode; |
1160 | vm_flags_t vm_flags; | 1170 | vm_flags_t vm_flags; |
1161 | 1171 | ||
1172 | *populate = 0; | ||
1173 | |||
1162 | /* | 1174 | /* |
1163 | * Does the application expect PROT_READ to imply PROT_EXEC? | 1175 | * Does the application expect PROT_READ to imply PROT_EXEC? |
1164 | * | 1176 | * |
@@ -1217,7 +1229,7 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, | |||
1217 | return -EAGAIN; | 1229 | return -EAGAIN; |
1218 | } | 1230 | } |
1219 | 1231 | ||
1220 | inode = file ? file->f_path.dentry->d_inode : NULL; | 1232 | inode = file ? file_inode(file) : NULL; |
1221 | 1233 | ||
1222 | if (file) { | 1234 | if (file) { |
1223 | switch (flags & MAP_TYPE) { | 1235 | switch (flags & MAP_TYPE) { |
@@ -1279,7 +1291,24 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, | |||
1279 | } | 1291 | } |
1280 | } | 1292 | } |
1281 | 1293 | ||
1282 | return mmap_region(file, addr, len, flags, vm_flags, pgoff); | 1294 | /* |
1295 | * Set 'VM_NORESERVE' if we should not account for the | ||
1296 | * memory use of this mapping. | ||
1297 | */ | ||
1298 | if (flags & MAP_NORESERVE) { | ||
1299 | /* We honor MAP_NORESERVE if allowed to overcommit */ | ||
1300 | if (sysctl_overcommit_memory != OVERCOMMIT_NEVER) | ||
1301 | vm_flags |= VM_NORESERVE; | ||
1302 | |||
1303 | /* hugetlb applies strict overcommit unless MAP_NORESERVE */ | ||
1304 | if (file && is_file_hugepages(file)) | ||
1305 | vm_flags |= VM_NORESERVE; | ||
1306 | } | ||
1307 | |||
1308 | addr = mmap_region(file, addr, len, vm_flags, pgoff); | ||
1309 | if (!IS_ERR_VALUE(addr) && (vm_flags & VM_POPULATE)) | ||
1310 | *populate = len; | ||
1311 | return addr; | ||
1283 | } | 1312 | } |
1284 | 1313 | ||
1285 | SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len, | 1314 | SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len, |
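do_mmap_pgoff() now reports through the new *populate argument how many bytes the caller should prefault once mmap_sem has been released (non-zero only when VM_POPULATE ended up set). Below is a sketch of how a caller could consume that, assuming a wrapper along the lines of vm_mmap_pgoff(), which this hunk does not show; only the shape of the call sequence is the point.

#include <linux/mm.h>
#include <linux/sched.h>
#include <linux/err.h>

/* Sketch only: mirror of the intended caller pattern, not code from this patch. */
static unsigned long mmap_and_populate(struct file *file, unsigned long addr,
				       unsigned long len, unsigned long prot,
				       unsigned long flags, unsigned long pgoff)
{
	unsigned long populate = 0;
	unsigned long ret;

	down_write(&current->mm->mmap_sem);
	ret = do_mmap_pgoff(file, addr, len, prot, flags, pgoff, &populate);
	up_write(&current->mm->mmap_sem);

	/* Prefault only after mmap_sem is released, so the faults taken by
	 * __mm_populate() can acquire it themselves. */
	if (populate && !IS_ERR_VALUE(ret))
		mm_populate(ret, populate);
	return ret;
}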
@@ -1394,8 +1423,7 @@ static inline int accountable_mapping(struct file *file, vm_flags_t vm_flags) | |||
1394 | } | 1423 | } |
1395 | 1424 | ||
1396 | unsigned long mmap_region(struct file *file, unsigned long addr, | 1425 | unsigned long mmap_region(struct file *file, unsigned long addr, |
1397 | unsigned long len, unsigned long flags, | 1426 | unsigned long len, vm_flags_t vm_flags, unsigned long pgoff) |
1398 | vm_flags_t vm_flags, unsigned long pgoff) | ||
1399 | { | 1427 | { |
1400 | struct mm_struct *mm = current->mm; | 1428 | struct mm_struct *mm = current->mm; |
1401 | struct vm_area_struct *vma, *prev; | 1429 | struct vm_area_struct *vma, *prev; |
@@ -1403,7 +1431,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr, | |||
1403 | int error; | 1431 | int error; |
1404 | struct rb_node **rb_link, *rb_parent; | 1432 | struct rb_node **rb_link, *rb_parent; |
1405 | unsigned long charged = 0; | 1433 | unsigned long charged = 0; |
1406 | struct inode *inode = file ? file->f_path.dentry->d_inode : NULL; | 1434 | struct inode *inode = file ? file_inode(file) : NULL; |
1407 | 1435 | ||
1408 | /* Clear old maps */ | 1436 | /* Clear old maps */ |
1409 | error = -ENOMEM; | 1437 | error = -ENOMEM; |
@@ -1419,20 +1447,6 @@ munmap_back: | |||
1419 | return -ENOMEM; | 1447 | return -ENOMEM; |
1420 | 1448 | ||
1421 | /* | 1449 | /* |
1422 | * Set 'VM_NORESERVE' if we should not account for the | ||
1423 | * memory use of this mapping. | ||
1424 | */ | ||
1425 | if ((flags & MAP_NORESERVE)) { | ||
1426 | /* We honor MAP_NORESERVE if allowed to overcommit */ | ||
1427 | if (sysctl_overcommit_memory != OVERCOMMIT_NEVER) | ||
1428 | vm_flags |= VM_NORESERVE; | ||
1429 | |||
1430 | /* hugetlb applies strict overcommit unless MAP_NORESERVE */ | ||
1431 | if (file && is_file_hugepages(file)) | ||
1432 | vm_flags |= VM_NORESERVE; | ||
1433 | } | ||
1434 | |||
1435 | /* | ||
1436 | * Private writable mapping: check memory availability | 1450 | * Private writable mapping: check memory availability |
1437 | */ | 1451 | */ |
1438 | if (accountable_mapping(file, vm_flags)) { | 1452 | if (accountable_mapping(file, vm_flags)) { |
@@ -1530,10 +1544,12 @@ out: | |||
1530 | 1544 | ||
1531 | vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT); | 1545 | vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT); |
1532 | if (vm_flags & VM_LOCKED) { | 1546 | if (vm_flags & VM_LOCKED) { |
1533 | if (!mlock_vma_pages_range(vma, addr, addr + len)) | 1547 | if (!((vm_flags & VM_SPECIAL) || is_vm_hugetlb_page(vma) || |
1548 | vma == get_gate_vma(current->mm))) | ||
1534 | mm->locked_vm += (len >> PAGE_SHIFT); | 1549 | mm->locked_vm += (len >> PAGE_SHIFT); |
1535 | } else if ((flags & MAP_POPULATE) && !(flags & MAP_NONBLOCK)) | 1550 | else |
1536 | make_pages_present(addr, addr + len); | 1551 | vma->vm_flags &= ~VM_LOCKED; |
1552 | } | ||
1537 | 1553 | ||
1538 | if (file) | 1554 | if (file) |
1539 | uprobe_mmap(vma); | 1555 | uprobe_mmap(vma); |
@@ -2169,9 +2185,28 @@ int expand_downwards(struct vm_area_struct *vma, | |||
2169 | return error; | 2185 | return error; |
2170 | } | 2186 | } |
2171 | 2187 | ||
2188 | /* | ||
2189 | * Note how expand_stack() refuses to expand the stack all the way to | ||
2190 | * abut the next virtual mapping, *unless* that mapping itself is also | ||
2191 | * a stack mapping. We want to leave room for a guard page, after all | ||
2192 | * (the guard page itself is not added here, that is done by the | ||
2193 | * actual page faulting logic) | ||
2194 | * | ||
2195 | * This matches the behavior of the guard page logic (see mm/memory.c: | ||
2196 | * check_stack_guard_page()), which only allows the guard page to be | ||
2197 | * removed under these circumstances. | ||
2198 | */ | ||
2172 | #ifdef CONFIG_STACK_GROWSUP | 2199 | #ifdef CONFIG_STACK_GROWSUP |
2173 | int expand_stack(struct vm_area_struct *vma, unsigned long address) | 2200 | int expand_stack(struct vm_area_struct *vma, unsigned long address) |
2174 | { | 2201 | { |
2202 | struct vm_area_struct *next; | ||
2203 | |||
2204 | address &= PAGE_MASK; | ||
2205 | next = vma->vm_next; | ||
2206 | if (next && next->vm_start == address + PAGE_SIZE) { | ||
2207 | if (!(next->vm_flags & VM_GROWSUP)) | ||
2208 | return -ENOMEM; | ||
2209 | } | ||
2175 | return expand_upwards(vma, address); | 2210 | return expand_upwards(vma, address); |
2176 | } | 2211 | } |
2177 | 2212 | ||
@@ -2186,14 +2221,21 @@ find_extend_vma(struct mm_struct *mm, unsigned long addr) | |||
2186 | return vma; | 2221 | return vma; |
2187 | if (!prev || expand_stack(prev, addr)) | 2222 | if (!prev || expand_stack(prev, addr)) |
2188 | return NULL; | 2223 | return NULL; |
2189 | if (prev->vm_flags & VM_LOCKED) { | 2224 | if (prev->vm_flags & VM_LOCKED) |
2190 | mlock_vma_pages_range(prev, addr, prev->vm_end); | 2225 | __mlock_vma_pages_range(prev, addr, prev->vm_end, NULL); |
2191 | } | ||
2192 | return prev; | 2226 | return prev; |
2193 | } | 2227 | } |
2194 | #else | 2228 | #else |
2195 | int expand_stack(struct vm_area_struct *vma, unsigned long address) | 2229 | int expand_stack(struct vm_area_struct *vma, unsigned long address) |
2196 | { | 2230 | { |
2231 | struct vm_area_struct *prev; | ||
2232 | |||
2233 | address &= PAGE_MASK; | ||
2234 | prev = vma->vm_prev; | ||
2235 | if (prev && prev->vm_end == address) { | ||
2236 | if (!(prev->vm_flags & VM_GROWSDOWN)) | ||
2237 | return -ENOMEM; | ||
2238 | } | ||
2197 | return expand_downwards(vma, address); | 2239 | return expand_downwards(vma, address); |
2198 | } | 2240 | } |
2199 | 2241 | ||
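Both expand_stack() variants now refuse to grow the stack flush against a neighbouring mapping unless that mapping is itself a stack, leaving room for the guard page described in the new comment. A standalone demo of the grows-down address check follows; the 4 KiB page size is an assumption for illustration.

#include <stdio.h>
#include <stdbool.h>

#define PAGE_SIZE 4096UL		/* assumed for illustration */
#define PAGE_MASK (~(PAGE_SIZE - 1))

/* Grows-down case: refuse if the faulting address, rounded down to a page
 * boundary, would make the stack abut the end of the previous mapping and
 * that mapping is not itself a grows-down stack. */
static bool may_expand_down(unsigned long address, unsigned long prev_vm_end,
			    bool prev_is_growsdown)
{
	address &= PAGE_MASK;
	if (prev_vm_end == address && !prev_is_growsdown)
		return false;
	return true;
}

int main(void)
{
	unsigned long prev_end = 0x7f0000200000UL;

	printf("abuts non-stack vma: %s\n",
	       may_expand_down(prev_end + 123, prev_end, false) ? "ok" : "refused");
	printf("one page of slack:   %s\n",
	       may_expand_down(prev_end + PAGE_SIZE + 123, prev_end, false) ? "ok" : "refused");
	return 0;
}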
@@ -2214,9 +2256,8 @@ find_extend_vma(struct mm_struct * mm, unsigned long addr) | |||
2214 | start = vma->vm_start; | 2256 | start = vma->vm_start; |
2215 | if (expand_stack(vma, addr)) | 2257 | if (expand_stack(vma, addr)) |
2216 | return NULL; | 2258 | return NULL; |
2217 | if (vma->vm_flags & VM_LOCKED) { | 2259 | if (vma->vm_flags & VM_LOCKED) |
2218 | mlock_vma_pages_range(vma, addr, start); | 2260 | __mlock_vma_pages_range(vma, addr, start, NULL); |
2219 | } | ||
2220 | return vma; | 2261 | return vma; |
2221 | } | 2262 | } |
2222 | #endif | 2263 | #endif |
@@ -2589,10 +2630,8 @@ static unsigned long do_brk(unsigned long addr, unsigned long len) | |||
2589 | out: | 2630 | out: |
2590 | perf_event_mmap(vma); | 2631 | perf_event_mmap(vma); |
2591 | mm->total_vm += len >> PAGE_SHIFT; | 2632 | mm->total_vm += len >> PAGE_SHIFT; |
2592 | if (flags & VM_LOCKED) { | 2633 | if (flags & VM_LOCKED) |
2593 | if (!mlock_vma_pages_range(vma, addr, addr + len)) | 2634 | mm->locked_vm += (len >> PAGE_SHIFT); |
2594 | mm->locked_vm += (len >> PAGE_SHIFT); | ||
2595 | } | ||
2596 | return addr; | 2635 | return addr; |
2597 | } | 2636 | } |
2598 | 2637 | ||
@@ -2600,10 +2639,14 @@ unsigned long vm_brk(unsigned long addr, unsigned long len) | |||
2600 | { | 2639 | { |
2601 | struct mm_struct *mm = current->mm; | 2640 | struct mm_struct *mm = current->mm; |
2602 | unsigned long ret; | 2641 | unsigned long ret; |
2642 | bool populate; | ||
2603 | 2643 | ||
2604 | down_write(&mm->mmap_sem); | 2644 | down_write(&mm->mmap_sem); |
2605 | ret = do_brk(addr, len); | 2645 | ret = do_brk(addr, len); |
2646 | populate = ((mm->def_flags & VM_LOCKED) != 0); | ||
2606 | up_write(&mm->mmap_sem); | 2647 | up_write(&mm->mmap_sem); |
2648 | if (populate) | ||
2649 | mm_populate(addr, len); | ||
2607 | return ret; | 2650 | return ret; |
2608 | } | 2651 | } |
2609 | EXPORT_SYMBOL(vm_brk); | 2652 | EXPORT_SYMBOL(vm_brk); |
@@ -2886,7 +2929,7 @@ static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma) | |||
2886 | * The LSB of head.next can't change from under us | 2929 | * The LSB of head.next can't change from under us |
2887 | * because we hold the mm_all_locks_mutex. | 2930 | * because we hold the mm_all_locks_mutex. |
2888 | */ | 2931 | */ |
2889 | down_write(&anon_vma->root->rwsem); | 2932 | down_write_nest_lock(&anon_vma->root->rwsem, &mm->mmap_sem); |
2890 | /* | 2933 | /* |
2891 | * We can safely modify head.next after taking the | 2934 | * We can safely modify head.next after taking the |
2892 | * anon_vma->root->rwsem. If some other vma in this mm shares | 2935 | * anon_vma->root->rwsem. If some other vma in this mm shares |
@@ -2943,7 +2986,7 @@ static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping) | |||
2943 | * vma in this mm is backed by the same anon_vma or address_space. | 2986 | * vma in this mm is backed by the same anon_vma or address_space. |
2944 | * | 2987 | * |
2945 | * We can take all the locks in random order because the VM code | 2988 | * We can take all the locks in random order because the VM code |
2946 | * taking i_mmap_mutex or anon_vma->mutex outside the mmap_sem never | 2989 | * taking i_mmap_mutex or anon_vma->rwsem outside the mmap_sem never |
2947 | * takes more than one of them in a row. Secondly we're protected | 2990 | * takes more than one of them in a row. Secondly we're protected |
2948 | * against a concurrent mm_take_all_locks() by the mm_all_locks_mutex. | 2991 | * against a concurrent mm_take_all_locks() by the mm_all_locks_mutex. |
2949 | * | 2992 | * |
@@ -3001,7 +3044,7 @@ static void vm_unlock_anon_vma(struct anon_vma *anon_vma) | |||
3001 | if (!__test_and_clear_bit(0, (unsigned long *) | 3044 | if (!__test_and_clear_bit(0, (unsigned long *) |
3002 | &anon_vma->root->rb_root.rb_node)) | 3045 | &anon_vma->root->rb_root.rb_node)) |
3003 | BUG(); | 3046 | BUG(); |
3004 | anon_vma_unlock(anon_vma); | 3047 | anon_vma_unlock_write(anon_vma); |
3005 | } | 3048 | } |
3006 | } | 3049 | } |
3007 | 3050 | ||
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c index 8a5ac8c686b0..be04122fb277 100644 --- a/mm/mmu_notifier.c +++ b/mm/mmu_notifier.c | |||
@@ -37,49 +37,51 @@ static struct srcu_struct srcu; | |||
37 | void __mmu_notifier_release(struct mm_struct *mm) | 37 | void __mmu_notifier_release(struct mm_struct *mm) |
38 | { | 38 | { |
39 | struct mmu_notifier *mn; | 39 | struct mmu_notifier *mn; |
40 | struct hlist_node *n; | ||
41 | int id; | 40 | int id; |
42 | 41 | ||
43 | /* | 42 | /* |
44 | * SRCU here will block mmu_notifier_unregister until | 43 | * srcu_read_lock() here will block synchronize_srcu() in |
45 | * ->release returns. | 44 | * mmu_notifier_unregister() until all registered |
45 | * ->release() callouts this function makes have | ||
46 | * returned. | ||
46 | */ | 47 | */ |
47 | id = srcu_read_lock(&srcu); | 48 | id = srcu_read_lock(&srcu); |
48 | hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) | ||
49 | /* | ||
50 | * if ->release runs before mmu_notifier_unregister it | ||
51 | * must be handled as it's the only way for the driver | ||
52 | * to flush all existing sptes and stop the driver | ||
53 | * from establishing any more sptes before all the | ||
54 | * pages in the mm are freed. | ||
55 | */ | ||
56 | if (mn->ops->release) | ||
57 | mn->ops->release(mn, mm); | ||
58 | srcu_read_unlock(&srcu, id); | ||
59 | |||
60 | spin_lock(&mm->mmu_notifier_mm->lock); | 49 | spin_lock(&mm->mmu_notifier_mm->lock); |
61 | while (unlikely(!hlist_empty(&mm->mmu_notifier_mm->list))) { | 50 | while (unlikely(!hlist_empty(&mm->mmu_notifier_mm->list))) { |
62 | mn = hlist_entry(mm->mmu_notifier_mm->list.first, | 51 | mn = hlist_entry(mm->mmu_notifier_mm->list.first, |
63 | struct mmu_notifier, | 52 | struct mmu_notifier, |
64 | hlist); | 53 | hlist); |
54 | |||
65 | /* | 55 | /* |
66 | * We arrived before mmu_notifier_unregister so | 56 | * Unlink. This will prevent mmu_notifier_unregister() |
67 | * mmu_notifier_unregister will do nothing other than | 57 | * from also making the ->release() callout. |
68 | * to wait ->release to finish and | ||
69 | * mmu_notifier_unregister to return. | ||
70 | */ | 58 | */ |
71 | hlist_del_init_rcu(&mn->hlist); | 59 | hlist_del_init_rcu(&mn->hlist); |
60 | spin_unlock(&mm->mmu_notifier_mm->lock); | ||
61 | |||
62 | /* | ||
63 | * Clear sptes. (see 'release' description in mmu_notifier.h) | ||
64 | */ | ||
65 | if (mn->ops->release) | ||
66 | mn->ops->release(mn, mm); | ||
67 | |||
68 | spin_lock(&mm->mmu_notifier_mm->lock); | ||
72 | } | 69 | } |
73 | spin_unlock(&mm->mmu_notifier_mm->lock); | 70 | spin_unlock(&mm->mmu_notifier_mm->lock); |
74 | 71 | ||
75 | /* | 72 | /* |
76 | * synchronize_srcu here prevents mmu_notifier_release to | 73 | * All callouts to ->release() which we have done are complete. |
77 | * return to exit_mmap (which would proceed freeing all pages | 74 | * Allow synchronize_srcu() in mmu_notifier_unregister() to complete |
78 | * in the mm) until the ->release method returns, if it was | 75 | */ |
79 | * invoked by mmu_notifier_unregister. | 76 | srcu_read_unlock(&srcu, id); |
80 | * | 77 | |
81 | * The mmu_notifier_mm can't go away from under us because one | 78 | /* |
82 | * mm_count is hold by exit_mmap. | 79 | * mmu_notifier_unregister() may have unlinked a notifier and may |
80 | * still be calling out to it. Additionally, other notifiers | ||
81 | * may have been active via vmtruncate() et. al. Block here | ||
82 | * to ensure that all notifier callouts for this mm have been | ||
83 | * completed and the sptes are really cleaned up before returning | ||
84 | * to exit_mmap(). | ||
83 | */ | 85 | */ |
84 | synchronize_srcu(&srcu); | 86 | synchronize_srcu(&srcu); |
85 | } | 87 | } |
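The reworked release path leans on the basic SRCU guarantee spelled out in the comments above: synchronize_srcu() cannot return while an earlier srcu_read_lock() section is still running. A minimal kernel-style sketch of that pattern with generic names; init_srcu_struct() on the demo struct is assumed to have been called during setup.

#include <linux/srcu.h>

static struct srcu_struct demo_srcu;

/* Reader side: everything between lock and unlock is guaranteed to have
 * finished before a later synchronize_srcu() returns. */
static void demo_reader(void)
{
	int idx = srcu_read_lock(&demo_srcu);
	/* ... make notifier ->release()-style callouts here ... */
	srcu_read_unlock(&demo_srcu, idx);
}

/* Writer/teardown side: after this returns, no demo_reader() section that
 * started before the call can still be executing. */
static void demo_wait_for_readers(void)
{
	synchronize_srcu(&demo_srcu);
}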
@@ -93,11 +95,10 @@ int __mmu_notifier_clear_flush_young(struct mm_struct *mm, | |||
93 | unsigned long address) | 95 | unsigned long address) |
94 | { | 96 | { |
95 | struct mmu_notifier *mn; | 97 | struct mmu_notifier *mn; |
96 | struct hlist_node *n; | ||
97 | int young = 0, id; | 98 | int young = 0, id; |
98 | 99 | ||
99 | id = srcu_read_lock(&srcu); | 100 | id = srcu_read_lock(&srcu); |
100 | hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { | 101 | hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) { |
101 | if (mn->ops->clear_flush_young) | 102 | if (mn->ops->clear_flush_young) |
102 | young |= mn->ops->clear_flush_young(mn, mm, address); | 103 | young |= mn->ops->clear_flush_young(mn, mm, address); |
103 | } | 104 | } |
@@ -110,11 +111,10 @@ int __mmu_notifier_test_young(struct mm_struct *mm, | |||
110 | unsigned long address) | 111 | unsigned long address) |
111 | { | 112 | { |
112 | struct mmu_notifier *mn; | 113 | struct mmu_notifier *mn; |
113 | struct hlist_node *n; | ||
114 | int young = 0, id; | 114 | int young = 0, id; |
115 | 115 | ||
116 | id = srcu_read_lock(&srcu); | 116 | id = srcu_read_lock(&srcu); |
117 | hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { | 117 | hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) { |
118 | if (mn->ops->test_young) { | 118 | if (mn->ops->test_young) { |
119 | young = mn->ops->test_young(mn, mm, address); | 119 | young = mn->ops->test_young(mn, mm, address); |
120 | if (young) | 120 | if (young) |
@@ -130,11 +130,10 @@ void __mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address, | |||
130 | pte_t pte) | 130 | pte_t pte) |
131 | { | 131 | { |
132 | struct mmu_notifier *mn; | 132 | struct mmu_notifier *mn; |
133 | struct hlist_node *n; | ||
134 | int id; | 133 | int id; |
135 | 134 | ||
136 | id = srcu_read_lock(&srcu); | 135 | id = srcu_read_lock(&srcu); |
137 | hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { | 136 | hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) { |
138 | if (mn->ops->change_pte) | 137 | if (mn->ops->change_pte) |
139 | mn->ops->change_pte(mn, mm, address, pte); | 138 | mn->ops->change_pte(mn, mm, address, pte); |
140 | } | 139 | } |
@@ -145,11 +144,10 @@ void __mmu_notifier_invalidate_page(struct mm_struct *mm, | |||
145 | unsigned long address) | 144 | unsigned long address) |
146 | { | 145 | { |
147 | struct mmu_notifier *mn; | 146 | struct mmu_notifier *mn; |
148 | struct hlist_node *n; | ||
149 | int id; | 147 | int id; |
150 | 148 | ||
151 | id = srcu_read_lock(&srcu); | 149 | id = srcu_read_lock(&srcu); |
152 | hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { | 150 | hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) { |
153 | if (mn->ops->invalidate_page) | 151 | if (mn->ops->invalidate_page) |
154 | mn->ops->invalidate_page(mn, mm, address); | 152 | mn->ops->invalidate_page(mn, mm, address); |
155 | } | 153 | } |
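The dropped `struct hlist_node *n` locals follow from hlist_for_each_entry_rcu() losing its separate cursor argument in this release; the iterator is now just (pos, head, member). A short kernel-style sketch of the updated form, with a made-up item type:

#include <linux/rculist.h>

struct demo_item {
	struct hlist_node node;
	int value;
};

/* Walk an RCU-protected hlist with the 3-argument iterator; callers must be
 * inside rcu_read_lock() or an equivalent read section (here, SRCU). */
static int demo_sum(struct hlist_head *head)
{
	struct demo_item *item;	/* no separate hlist_node cursor anymore */
	int sum = 0;

	hlist_for_each_entry_rcu(item, head, node)
		sum += item->value;
	return sum;
}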
@@ -160,31 +158,31 @@ void __mmu_notifier_invalidate_range_start(struct mm_struct *mm, | |||
160 | unsigned long start, unsigned long end) | 158 | unsigned long start, unsigned long end) |
161 | { | 159 | { |
162 | struct mmu_notifier *mn; | 160 | struct mmu_notifier *mn; |
163 | struct hlist_node *n; | ||
164 | int id; | 161 | int id; |
165 | 162 | ||
166 | id = srcu_read_lock(&srcu); | 163 | id = srcu_read_lock(&srcu); |
167 | hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { | 164 | hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) { |
168 | if (mn->ops->invalidate_range_start) | 165 | if (mn->ops->invalidate_range_start) |
169 | mn->ops->invalidate_range_start(mn, mm, start, end); | 166 | mn->ops->invalidate_range_start(mn, mm, start, end); |
170 | } | 167 | } |
171 | srcu_read_unlock(&srcu, id); | 168 | srcu_read_unlock(&srcu, id); |
172 | } | 169 | } |
170 | EXPORT_SYMBOL_GPL(__mmu_notifier_invalidate_range_start); | ||
173 | 171 | ||
174 | void __mmu_notifier_invalidate_range_end(struct mm_struct *mm, | 172 | void __mmu_notifier_invalidate_range_end(struct mm_struct *mm, |
175 | unsigned long start, unsigned long end) | 173 | unsigned long start, unsigned long end) |
176 | { | 174 | { |
177 | struct mmu_notifier *mn; | 175 | struct mmu_notifier *mn; |
178 | struct hlist_node *n; | ||
179 | int id; | 176 | int id; |
180 | 177 | ||
181 | id = srcu_read_lock(&srcu); | 178 | id = srcu_read_lock(&srcu); |
182 | hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { | 179 | hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) { |
183 | if (mn->ops->invalidate_range_end) | 180 | if (mn->ops->invalidate_range_end) |
184 | mn->ops->invalidate_range_end(mn, mm, start, end); | 181 | mn->ops->invalidate_range_end(mn, mm, start, end); |
185 | } | 182 | } |
186 | srcu_read_unlock(&srcu, id); | 183 | srcu_read_unlock(&srcu, id); |
187 | } | 184 | } |
185 | EXPORT_SYMBOL_GPL(__mmu_notifier_invalidate_range_end); | ||
188 | 186 | ||
189 | static int do_mmu_notifier_register(struct mmu_notifier *mn, | 187 | static int do_mmu_notifier_register(struct mmu_notifier *mn, |
190 | struct mm_struct *mm, | 188 | struct mm_struct *mm, |
@@ -294,31 +292,31 @@ void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm) | |||
294 | { | 292 | { |
295 | BUG_ON(atomic_read(&mm->mm_count) <= 0); | 293 | BUG_ON(atomic_read(&mm->mm_count) <= 0); |
296 | 294 | ||
295 | spin_lock(&mm->mmu_notifier_mm->lock); | ||
297 | if (!hlist_unhashed(&mn->hlist)) { | 296 | if (!hlist_unhashed(&mn->hlist)) { |
298 | /* | ||
299 | * SRCU here will force exit_mmap to wait ->release to finish | ||
300 | * before freeing the pages. | ||
301 | */ | ||
302 | int id; | 297 | int id; |
303 | 298 | ||
304 | id = srcu_read_lock(&srcu); | ||
305 | /* | 299 | /* |
306 | * exit_mmap will block in mmu_notifier_release to | 300 | * Ensure we synchronize up with __mmu_notifier_release(). |
307 | * guarantee ->release is called before freeing the | ||
308 | * pages. | ||
309 | */ | 301 | */ |
302 | id = srcu_read_lock(&srcu); | ||
303 | |||
304 | hlist_del_rcu(&mn->hlist); | ||
305 | spin_unlock(&mm->mmu_notifier_mm->lock); | ||
306 | |||
310 | if (mn->ops->release) | 307 | if (mn->ops->release) |
311 | mn->ops->release(mn, mm); | 308 | mn->ops->release(mn, mm); |
312 | srcu_read_unlock(&srcu, id); | ||
313 | 309 | ||
314 | spin_lock(&mm->mmu_notifier_mm->lock); | 310 | /* |
315 | hlist_del_rcu(&mn->hlist); | 311 | * Allow __mmu_notifier_release() to complete. |
312 | */ | ||
313 | srcu_read_unlock(&srcu, id); | ||
314 | } else | ||
316 | spin_unlock(&mm->mmu_notifier_mm->lock); | 315 | spin_unlock(&mm->mmu_notifier_mm->lock); |
317 | } | ||
318 | 316 | ||
319 | /* | 317 | /* |
320 | * Wait any running method to finish, of course including | 318 | * Wait for any running method to finish, including ->release() if it |
321 | * ->release if it was run by mmu_notifier_relase instead of us. | 319 | * was run by __mmu_notifier_release() instead of us. |
322 | */ | 320 | */ |
323 | synchronize_srcu(&srcu); | 321 | synchronize_srcu(&srcu); |
324 | 322 | ||
diff --git a/mm/mmzone.c b/mm/mmzone.c index 4596d81b89b1..2ac0afbd68f3 100644 --- a/mm/mmzone.c +++ b/mm/mmzone.c | |||
@@ -1,7 +1,7 @@ | |||
1 | /* | 1 | /* |
2 | * linux/mm/mmzone.c | 2 | * linux/mm/mmzone.c |
3 | * | 3 | * |
4 | * management codes for pgdats and zones. | 4 | * management codes for pgdats, zones and page flags |
5 | */ | 5 | */ |
6 | 6 | ||
7 | 7 | ||
@@ -96,3 +96,21 @@ void lruvec_init(struct lruvec *lruvec) | |||
96 | for_each_lru(lru) | 96 | for_each_lru(lru) |
97 | INIT_LIST_HEAD(&lruvec->lists[lru]); | 97 | INIT_LIST_HEAD(&lruvec->lists[lru]); |
98 | } | 98 | } |
99 | |||
100 | #if defined(CONFIG_NUMA_BALANCING) && !defined(LAST_NID_NOT_IN_PAGE_FLAGS) | ||
101 | int page_nid_xchg_last(struct page *page, int nid) | ||
102 | { | ||
103 | unsigned long old_flags, flags; | ||
104 | int last_nid; | ||
105 | |||
106 | do { | ||
107 | old_flags = flags = page->flags; | ||
108 | last_nid = page_nid_last(page); | ||
109 | |||
110 | flags &= ~(LAST_NID_MASK << LAST_NID_PGSHIFT); | ||
111 | flags |= (nid & LAST_NID_MASK) << LAST_NID_PGSHIFT; | ||
112 | } while (unlikely(cmpxchg(&page->flags, old_flags, flags) != old_flags)); | ||
113 | |||
114 | return last_nid; | ||
115 | } | ||
116 | #endif | ||
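page_nid_xchg_last() is the standard lock-free read-modify-write: snapshot the word, splice in the new nid, and retry if cmpxchg() sees that page->flags changed underneath. A standalone C11 rendition of the same retry shape; the field position and width are placeholders, not the kernel's layout.

#include <stdatomic.h>
#include <stdio.h>

#define LAST_NID_SHIFT 8		/* placeholder layout */
#define LAST_NID_MASK  0xffUL

/* Atomically replace the nid field inside 'flags' and return the old nid,
 * retrying while concurrent updates race with us. */
static unsigned long nid_xchg_last(_Atomic unsigned long *flags, unsigned long nid)
{
	unsigned long old, new;

	do {
		old = atomic_load(flags);
		new = old & ~(LAST_NID_MASK << LAST_NID_SHIFT);
		new |= (nid & LAST_NID_MASK) << LAST_NID_SHIFT;
	} while (!atomic_compare_exchange_weak(flags, &old, new));

	return (old >> LAST_NID_SHIFT) & LAST_NID_MASK;
}

int main(void)
{
	_Atomic unsigned long flags = 3UL << LAST_NID_SHIFT;

	printf("previous nid: %lu\n", nid_xchg_last(&flags, 7));
	printf("current  nid: %lu\n",
	       (atomic_load(&flags) >> LAST_NID_SHIFT) & LAST_NID_MASK);
	return 0;
}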
diff --git a/mm/mremap.c b/mm/mremap.c index e1031e1f6a61..463a25705ac6 100644 --- a/mm/mremap.c +++ b/mm/mremap.c | |||
@@ -19,6 +19,7 @@ | |||
19 | #include <linux/security.h> | 19 | #include <linux/security.h> |
20 | #include <linux/syscalls.h> | 20 | #include <linux/syscalls.h> |
21 | #include <linux/mmu_notifier.h> | 21 | #include <linux/mmu_notifier.h> |
22 | #include <linux/sched/sysctl.h> | ||
22 | 23 | ||
23 | #include <asm/uaccess.h> | 24 | #include <asm/uaccess.h> |
24 | #include <asm/cacheflush.h> | 25 | #include <asm/cacheflush.h> |
@@ -134,7 +135,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, | |||
134 | pte_unmap(new_pte - 1); | 135 | pte_unmap(new_pte - 1); |
135 | pte_unmap_unlock(old_pte - 1, old_ptl); | 136 | pte_unmap_unlock(old_pte - 1, old_ptl); |
136 | if (anon_vma) | 137 | if (anon_vma) |
137 | anon_vma_unlock(anon_vma); | 138 | anon_vma_unlock_write(anon_vma); |
138 | if (mapping) | 139 | if (mapping) |
139 | mutex_unlock(&mapping->i_mmap_mutex); | 140 | mutex_unlock(&mapping->i_mmap_mutex); |
140 | } | 141 | } |
@@ -208,7 +209,7 @@ unsigned long move_page_tables(struct vm_area_struct *vma, | |||
208 | 209 | ||
209 | static unsigned long move_vma(struct vm_area_struct *vma, | 210 | static unsigned long move_vma(struct vm_area_struct *vma, |
210 | unsigned long old_addr, unsigned long old_len, | 211 | unsigned long old_addr, unsigned long old_len, |
211 | unsigned long new_len, unsigned long new_addr) | 212 | unsigned long new_len, unsigned long new_addr, bool *locked) |
212 | { | 213 | { |
213 | struct mm_struct *mm = vma->vm_mm; | 214 | struct mm_struct *mm = vma->vm_mm; |
214 | struct vm_area_struct *new_vma; | 215 | struct vm_area_struct *new_vma; |
@@ -299,9 +300,7 @@ static unsigned long move_vma(struct vm_area_struct *vma, | |||
299 | 300 | ||
300 | if (vm_flags & VM_LOCKED) { | 301 | if (vm_flags & VM_LOCKED) { |
301 | mm->locked_vm += new_len >> PAGE_SHIFT; | 302 | mm->locked_vm += new_len >> PAGE_SHIFT; |
302 | if (new_len > old_len) | 303 | *locked = true; |
303 | mlock_vma_pages_range(new_vma, new_addr + old_len, | ||
304 | new_addr + new_len); | ||
305 | } | 304 | } |
306 | 305 | ||
307 | return new_addr; | 306 | return new_addr; |
@@ -366,9 +365,8 @@ Eagain: | |||
366 | return ERR_PTR(-EAGAIN); | 365 | return ERR_PTR(-EAGAIN); |
367 | } | 366 | } |
368 | 367 | ||
369 | static unsigned long mremap_to(unsigned long addr, | 368 | static unsigned long mremap_to(unsigned long addr, unsigned long old_len, |
370 | unsigned long old_len, unsigned long new_addr, | 369 | unsigned long new_addr, unsigned long new_len, bool *locked) |
371 | unsigned long new_len) | ||
372 | { | 370 | { |
373 | struct mm_struct *mm = current->mm; | 371 | struct mm_struct *mm = current->mm; |
374 | struct vm_area_struct *vma; | 372 | struct vm_area_struct *vma; |
@@ -418,7 +416,7 @@ static unsigned long mremap_to(unsigned long addr, | |||
418 | if (ret & ~PAGE_MASK) | 416 | if (ret & ~PAGE_MASK) |
419 | goto out1; | 417 | goto out1; |
420 | 418 | ||
421 | ret = move_vma(vma, addr, old_len, new_len, new_addr); | 419 | ret = move_vma(vma, addr, old_len, new_len, new_addr, locked); |
422 | if (!(ret & ~PAGE_MASK)) | 420 | if (!(ret & ~PAGE_MASK)) |
423 | goto out; | 421 | goto out; |
424 | out1: | 422 | out1: |
@@ -456,6 +454,7 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, | |||
456 | struct vm_area_struct *vma; | 454 | struct vm_area_struct *vma; |
457 | unsigned long ret = -EINVAL; | 455 | unsigned long ret = -EINVAL; |
458 | unsigned long charged = 0; | 456 | unsigned long charged = 0; |
457 | bool locked = false; | ||
459 | 458 | ||
460 | down_write(¤t->mm->mmap_sem); | 459 | down_write(¤t->mm->mmap_sem); |
461 | 460 | ||
@@ -478,7 +477,8 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, | |||
478 | 477 | ||
479 | if (flags & MREMAP_FIXED) { | 478 | if (flags & MREMAP_FIXED) { |
480 | if (flags & MREMAP_MAYMOVE) | 479 | if (flags & MREMAP_MAYMOVE) |
481 | ret = mremap_to(addr, old_len, new_addr, new_len); | 480 | ret = mremap_to(addr, old_len, new_addr, new_len, |
481 | &locked); | ||
482 | goto out; | 482 | goto out; |
483 | } | 483 | } |
484 | 484 | ||
@@ -520,8 +520,8 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, | |||
520 | vm_stat_account(mm, vma->vm_flags, vma->vm_file, pages); | 520 | vm_stat_account(mm, vma->vm_flags, vma->vm_file, pages); |
521 | if (vma->vm_flags & VM_LOCKED) { | 521 | if (vma->vm_flags & VM_LOCKED) { |
522 | mm->locked_vm += pages; | 522 | mm->locked_vm += pages; |
523 | mlock_vma_pages_range(vma, addr + old_len, | 523 | locked = true; |
524 | addr + new_len); | 524 | new_addr = addr; |
525 | } | 525 | } |
526 | ret = addr; | 526 | ret = addr; |
527 | goto out; | 527 | goto out; |
@@ -547,11 +547,13 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, | |||
547 | goto out; | 547 | goto out; |
548 | } | 548 | } |
549 | 549 | ||
550 | ret = move_vma(vma, addr, old_len, new_len, new_addr); | 550 | ret = move_vma(vma, addr, old_len, new_len, new_addr, &locked); |
551 | } | 551 | } |
552 | out: | 552 | out: |
553 | if (ret & ~PAGE_MASK) | 553 | if (ret & ~PAGE_MASK) |
554 | vm_unacct_memory(charged); | 554 | vm_unacct_memory(charged); |
555 | up_write(&current->mm->mmap_sem); | 555 | up_write(&current->mm->mmap_sem); |
556 | if (locked && new_len > old_len) | ||
557 | mm_populate(new_addr + old_len, new_len - old_len); | ||
556 | return ret; | 558 | return ret; |
557 | } | 559 | } |
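The effect of routing mremap() growth of a locked mapping through mm_populate() is visible from userspace: the grown tail is faulted in only after mmap_sem has been released. A small usage example follows; the comment paraphrases the kernel-side sequencing shown in the hunks above.

#define _GNU_SOURCE
#include <stdio.h>
#include <sys/mman.h>

int main(void)
{
	size_t old_len = 16 * 4096, new_len = 64 * 4096;
	char *p = mmap(NULL, old_len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS | MAP_LOCKED, -1, 0);
	if (p == MAP_FAILED)
		return 1;

	/* With this patch the kernel notes "locked = true" while holding
	 * mmap_sem and only calls mm_populate() on the grown tail after
	 * the semaphore has been released. */
	char *q = mremap(p, old_len, new_len, MREMAP_MAYMOVE);
	if (q == MAP_FAILED) {
		perror("mremap");
		return 1;
	}

	q[new_len - 1] = 1;	/* new tail is mlocked and already resident */
	munmap(q, new_len);
	return 0;
}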
diff --git a/mm/nobootmem.c b/mm/nobootmem.c index b8294fc03df8..5e07d36e381e 100644 --- a/mm/nobootmem.c +++ b/mm/nobootmem.c | |||
@@ -154,21 +154,6 @@ static void reset_node_lowmem_managed_pages(pg_data_t *pgdat) | |||
154 | } | 154 | } |
155 | 155 | ||
156 | /** | 156 | /** |
157 | * free_all_bootmem_node - release a node's free pages to the buddy allocator | ||
158 | * @pgdat: node to be released | ||
159 | * | ||
160 | * Returns the number of pages actually released. | ||
161 | */ | ||
162 | unsigned long __init free_all_bootmem_node(pg_data_t *pgdat) | ||
163 | { | ||
164 | register_page_bootmem_info_node(pgdat); | ||
165 | reset_node_lowmem_managed_pages(pgdat); | ||
166 | |||
167 | /* free_low_memory_core_early(MAX_NUMNODES) will be called later */ | ||
168 | return 0; | ||
169 | } | ||
170 | |||
171 | /** | ||
172 | * free_all_bootmem - release free pages to the buddy allocator | 157 | * free_all_bootmem - release free pages to the buddy allocator |
173 | * | 158 | * |
174 | * Returns the number of pages actually released. | 159 | * Returns the number of pages actually released. |
@@ -406,6 +391,14 @@ void * __init __alloc_bootmem_low(unsigned long size, unsigned long align, | |||
406 | return ___alloc_bootmem(size, align, goal, ARCH_LOW_ADDRESS_LIMIT); | 391 | return ___alloc_bootmem(size, align, goal, ARCH_LOW_ADDRESS_LIMIT); |
407 | } | 392 | } |
408 | 393 | ||
394 | void * __init __alloc_bootmem_low_nopanic(unsigned long size, | ||
395 | unsigned long align, | ||
396 | unsigned long goal) | ||
397 | { | ||
398 | return ___alloc_bootmem_nopanic(size, align, goal, | ||
399 | ARCH_LOW_ADDRESS_LIMIT); | ||
400 | } | ||
401 | |||
409 | /** | 402 | /** |
410 | * __alloc_bootmem_low_node - allocate low boot memory from a specific node | 403 | * __alloc_bootmem_low_node - allocate low boot memory from a specific node |
411 | * @pgdat: node to allocate from | 404 | * @pgdat: node to allocate from |
diff --git a/mm/nommu.c b/mm/nommu.c index 79c3cac87afa..e19328087534 100644 --- a/mm/nommu.c +++ b/mm/nommu.c | |||
@@ -29,6 +29,7 @@ | |||
29 | #include <linux/security.h> | 29 | #include <linux/security.h> |
30 | #include <linux/syscalls.h> | 30 | #include <linux/syscalls.h> |
31 | #include <linux/audit.h> | 31 | #include <linux/audit.h> |
32 | #include <linux/sched/sysctl.h> | ||
32 | 33 | ||
33 | #include <asm/uaccess.h> | 34 | #include <asm/uaccess.h> |
34 | #include <asm/tlb.h> | 35 | #include <asm/tlb.h> |
@@ -139,10 +140,10 @@ unsigned int kobjsize(const void *objp) | |||
139 | return PAGE_SIZE << compound_order(page); | 140 | return PAGE_SIZE << compound_order(page); |
140 | } | 141 | } |
141 | 142 | ||
142 | int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | 143 | long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, |
143 | unsigned long start, int nr_pages, unsigned int foll_flags, | 144 | unsigned long start, unsigned long nr_pages, |
144 | struct page **pages, struct vm_area_struct **vmas, | 145 | unsigned int foll_flags, struct page **pages, |
145 | int *retry) | 146 | struct vm_area_struct **vmas, int *nonblocking) |
146 | { | 147 | { |
147 | struct vm_area_struct *vma; | 148 | struct vm_area_struct *vma; |
148 | unsigned long vm_flags; | 149 | unsigned long vm_flags; |
@@ -189,9 +190,10 @@ finish_or_fault: | |||
189 | * slab page or a secondary page from a compound page | 190 | * slab page or a secondary page from a compound page |
190 | * - don't permit access to VMAs that don't support it, such as I/O mappings | 191 | * - don't permit access to VMAs that don't support it, such as I/O mappings |
191 | */ | 192 | */ |
192 | int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | 193 | long get_user_pages(struct task_struct *tsk, struct mm_struct *mm, |
193 | unsigned long start, int nr_pages, int write, int force, | 194 | unsigned long start, unsigned long nr_pages, |
194 | struct page **pages, struct vm_area_struct **vmas) | 195 | int write, int force, struct page **pages, |
196 | struct vm_area_struct **vmas) | ||
195 | { | 197 | { |
196 | int flags = 0; | 198 | int flags = 0; |
197 | 199 | ||
@@ -941,7 +943,7 @@ static int validate_mmap_request(struct file *file, | |||
941 | */ | 943 | */ |
942 | mapping = file->f_mapping; | 944 | mapping = file->f_mapping; |
943 | if (!mapping) | 945 | if (!mapping) |
944 | mapping = file->f_path.dentry->d_inode->i_mapping; | 946 | mapping = file_inode(file)->i_mapping; |
945 | 947 | ||
946 | capabilities = 0; | 948 | capabilities = 0; |
947 | if (mapping && mapping->backing_dev_info) | 949 | if (mapping && mapping->backing_dev_info) |
@@ -950,7 +952,7 @@ static int validate_mmap_request(struct file *file, | |||
950 | if (!capabilities) { | 952 | if (!capabilities) { |
951 | /* no explicit capabilities set, so assume some | 953 | /* no explicit capabilities set, so assume some |
952 | * defaults */ | 954 | * defaults */ |
953 | switch (file->f_path.dentry->d_inode->i_mode & S_IFMT) { | 955 | switch (file_inode(file)->i_mode & S_IFMT) { |
954 | case S_IFREG: | 956 | case S_IFREG: |
955 | case S_IFBLK: | 957 | case S_IFBLK: |
956 | capabilities = BDI_CAP_MAP_COPY; | 958 | capabilities = BDI_CAP_MAP_COPY; |
@@ -985,11 +987,11 @@ static int validate_mmap_request(struct file *file, | |||
985 | !(file->f_mode & FMODE_WRITE)) | 987 | !(file->f_mode & FMODE_WRITE)) |
986 | return -EACCES; | 988 | return -EACCES; |
987 | 989 | ||
988 | if (IS_APPEND(file->f_path.dentry->d_inode) && | 990 | if (IS_APPEND(file_inode(file)) && |
989 | (file->f_mode & FMODE_WRITE)) | 991 | (file->f_mode & FMODE_WRITE)) |
990 | return -EACCES; | 992 | return -EACCES; |
991 | 993 | ||
992 | if (locks_verify_locked(file->f_path.dentry->d_inode)) | 994 | if (locks_verify_locked(file_inode(file))) |
993 | return -EAGAIN; | 995 | return -EAGAIN; |
994 | 996 | ||
995 | if (!(capabilities & BDI_CAP_MAP_DIRECT)) | 997 | if (!(capabilities & BDI_CAP_MAP_DIRECT)) |
@@ -1249,7 +1251,8 @@ unsigned long do_mmap_pgoff(struct file *file, | |||
1249 | unsigned long len, | 1251 | unsigned long len, |
1250 | unsigned long prot, | 1252 | unsigned long prot, |
1251 | unsigned long flags, | 1253 | unsigned long flags, |
1252 | unsigned long pgoff) | 1254 | unsigned long pgoff, |
1255 | unsigned long *populate) | ||
1253 | { | 1256 | { |
1254 | struct vm_area_struct *vma; | 1257 | struct vm_area_struct *vma; |
1255 | struct vm_region *region; | 1258 | struct vm_region *region; |
@@ -1259,6 +1262,8 @@ unsigned long do_mmap_pgoff(struct file *file, | |||
1259 | 1262 | ||
1260 | kenter(",%lx,%lx,%lx,%lx,%lx", addr, len, prot, flags, pgoff); | 1263 | kenter(",%lx,%lx,%lx,%lx,%lx", addr, len, prot, flags, pgoff); |
1261 | 1264 | ||
1265 | *populate = 0; | ||
1266 | |||
1262 | /* decide whether we should attempt the mapping, and if so what sort of | 1267 | /* decide whether we should attempt the mapping, and if so what sort of |
1263 | * mapping */ | 1268 | * mapping */ |
1264 | ret = validate_mmap_request(file, addr, len, prot, flags, pgoff, | 1269 | ret = validate_mmap_request(file, addr, len, prot, flags, pgoff, |
@@ -1322,8 +1327,8 @@ unsigned long do_mmap_pgoff(struct file *file, | |||
1322 | continue; | 1327 | continue; |
1323 | 1328 | ||
1324 | /* search for overlapping mappings on the same file */ | 1329 | /* search for overlapping mappings on the same file */ |
1325 | if (pregion->vm_file->f_path.dentry->d_inode != | 1330 | if (file_inode(pregion->vm_file) != |
1326 | file->f_path.dentry->d_inode) | 1331 | file_inode(file)) |
1327 | continue; | 1332 | continue; |
1328 | 1333 | ||
1329 | if (pregion->vm_pgoff >= pgend) | 1334 | if (pregion->vm_pgoff >= pgend) |
@@ -1814,9 +1819,11 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, | |||
1814 | return ret; | 1819 | return ret; |
1815 | } | 1820 | } |
1816 | 1821 | ||
1817 | struct page *follow_page(struct vm_area_struct *vma, unsigned long address, | 1822 | struct page *follow_page_mask(struct vm_area_struct *vma, |
1818 | unsigned int foll_flags) | 1823 | unsigned long address, unsigned int flags, |
1824 | unsigned int *page_mask) | ||
1819 | { | 1825 | { |
1826 | *page_mask = 0; | ||
1820 | return NULL; | 1827 | return NULL; |
1821 | } | 1828 | } |
1822 | 1829 | ||
@@ -1903,7 +1910,7 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) | |||
1903 | */ | 1910 | */ |
1904 | free -= global_page_state(NR_SHMEM); | 1911 | free -= global_page_state(NR_SHMEM); |
1905 | 1912 | ||
1906 | free += nr_swap_pages; | 1913 | free += get_nr_swap_pages(); |
1907 | 1914 | ||
1908 | /* | 1915 | /* |
1909 | * Any slabs which are created with the | 1916 | * Any slabs which are created with the |
diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 0399f146ae49..79e451a78c9e 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c | |||
@@ -386,8 +386,10 @@ static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order, | |||
386 | cpuset_print_task_mems_allowed(current); | 386 | cpuset_print_task_mems_allowed(current); |
387 | task_unlock(current); | 387 | task_unlock(current); |
388 | dump_stack(); | 388 | dump_stack(); |
389 | mem_cgroup_print_oom_info(memcg, p); | 389 | if (memcg) |
390 | show_mem(SHOW_MEM_FILTER_NODES); | 390 | mem_cgroup_print_oom_info(memcg, p); |
391 | else | ||
392 | show_mem(SHOW_MEM_FILTER_NODES); | ||
391 | if (sysctl_oom_dump_tasks) | 393 | if (sysctl_oom_dump_tasks) |
392 | dump_tasks(memcg, nodemask); | 394 | dump_tasks(memcg, nodemask); |
393 | } | 395 | } |
diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 3734cefd4de4..742c40583159 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c | |||
@@ -35,6 +35,7 @@ | |||
35 | #include <linux/buffer_head.h> /* __set_page_dirty_buffers */ | 35 | #include <linux/buffer_head.h> /* __set_page_dirty_buffers */ |
36 | #include <linux/pagevec.h> | 36 | #include <linux/pagevec.h> |
37 | #include <linux/timer.h> | 37 | #include <linux/timer.h> |
38 | #include <linux/sched/rt.h> | ||
38 | #include <trace/events/writeback.h> | 39 | #include <trace/events/writeback.h> |
39 | 40 | ||
40 | /* | 41 | /* |
@@ -240,6 +241,9 @@ static unsigned long global_dirtyable_memory(void) | |||
240 | if (!vm_highmem_is_dirtyable) | 241 | if (!vm_highmem_is_dirtyable) |
241 | x -= highmem_dirtyable_memory(x); | 242 | x -= highmem_dirtyable_memory(x); |
242 | 243 | ||
244 | /* Subtract min_free_kbytes */ | ||
245 | x -= min_t(unsigned long, x, min_free_kbytes >> (PAGE_SHIFT - 10)); | ||
246 | |||
243 | return x + 1; /* Ensure that we never return 0 */ | 247 | return x + 1; /* Ensure that we never return 0 */ |
244 | } | 248 | } |
245 | 249 | ||
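The new subtraction keeps the min_free_kbytes reserve out of the dirtyable total; min_free_kbytes >> (PAGE_SHIFT - 10) is simply the kilobytes-to-pages conversion. A tiny standalone check of that arithmetic, assuming 4 KiB pages:

#include <stdio.h>

#define PAGE_SHIFT 12	/* assumed 4 KiB pages */

int main(void)
{
	unsigned long min_free_kbytes = 65536;	/* example value, 64 MiB */

	/* kbytes -> pages: divide by (PAGE_SIZE / 1024) = 2^(PAGE_SHIFT - 10) */
	unsigned long pages = min_free_kbytes >> (PAGE_SHIFT - 10);

	printf("%lu kB reserved == %lu pages\n", min_free_kbytes, pages);
	return 0;
}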
@@ -2291,3 +2295,27 @@ int mapping_tagged(struct address_space *mapping, int tag) | |||
2291 | return radix_tree_tagged(&mapping->page_tree, tag); | 2295 | return radix_tree_tagged(&mapping->page_tree, tag); |
2292 | } | 2296 | } |
2293 | EXPORT_SYMBOL(mapping_tagged); | 2297 | EXPORT_SYMBOL(mapping_tagged); |
2298 | |||
2299 | /** | ||
2300 | * wait_for_stable_page() - wait for writeback to finish, if necessary. | ||
2301 | * @page: The page to wait on. | ||
2302 | * | ||
2303 | * This function determines if the given page is related to a backing device | ||
2304 | * that requires page contents to be held stable during writeback. If so, then | ||
2305 | * it will wait for any pending writeback to complete. | ||
2306 | */ | ||
2307 | void wait_for_stable_page(struct page *page) | ||
2308 | { | ||
2309 | struct address_space *mapping = page_mapping(page); | ||
2310 | struct backing_dev_info *bdi = mapping->backing_dev_info; | ||
2311 | |||
2312 | if (!bdi_cap_stable_pages_required(bdi)) | ||
2313 | return; | ||
2314 | #ifdef CONFIG_NEED_BOUNCE_POOL | ||
2315 | if (mapping->host->i_sb->s_flags & MS_SNAP_STABLE) | ||
2316 | return; | ||
2317 | #endif /* CONFIG_NEED_BOUNCE_POOL */ | ||
2318 | |||
2319 | wait_on_page_writeback(page); | ||
2320 | } | ||
2321 | EXPORT_SYMBOL_GPL(wait_for_stable_page); | ||
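wait_for_stable_page() is aimed at the page-dirtying paths of filesystems whose backing device needs page contents held stable during writeback. A hedged sketch of the intended call-site shape, using a generic ->page_mkwrite() style handler rather than any real filesystem:

#include <linux/mm.h>
#include <linux/pagemap.h>

/* Sketch only: a handler that must not let userspace scribble on a page
 * while it may still be in flight to a device that checksums or mirrors
 * the data.  wait_for_stable_page() is a no-op unless the backing device
 * actually requires stable pages. */
static int demo_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	struct page *page = vmf->page;

	lock_page(page);
	wait_for_stable_page(page);
	set_page_dirty(page);
	unlock_page(page);
	return VM_FAULT_LOCKED;
}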
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index bc6cc0e913bd..0dade3f18f7d 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c | |||
@@ -58,6 +58,7 @@ | |||
58 | #include <linux/prefetch.h> | 58 | #include <linux/prefetch.h> |
59 | #include <linux/migrate.h> | 59 | #include <linux/migrate.h> |
60 | #include <linux/page-debug-flags.h> | 60 | #include <linux/page-debug-flags.h> |
61 | #include <linux/sched/rt.h> | ||
61 | 62 | ||
62 | #include <asm/tlbflush.h> | 63 | #include <asm/tlbflush.h> |
63 | #include <asm/div64.h> | 64 | #include <asm/div64.h> |
@@ -201,11 +202,18 @@ static unsigned long __meminitdata nr_all_pages; | |||
201 | static unsigned long __meminitdata dma_reserve; | 202 | static unsigned long __meminitdata dma_reserve; |
202 | 203 | ||
203 | #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP | 204 | #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP |
205 | /* Movable memory ranges, will also be used by memblock subsystem. */ | ||
206 | struct movablemem_map movablemem_map = { | ||
207 | .acpi = false, | ||
208 | .nr_map = 0, | ||
209 | }; | ||
210 | |||
204 | static unsigned long __meminitdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES]; | 211 | static unsigned long __meminitdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES]; |
205 | static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES]; | 212 | static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES]; |
206 | static unsigned long __initdata required_kernelcore; | 213 | static unsigned long __initdata required_kernelcore; |
207 | static unsigned long __initdata required_movablecore; | 214 | static unsigned long __initdata required_movablecore; |
208 | static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES]; | 215 | static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES]; |
216 | static unsigned long __meminitdata zone_movable_limit[MAX_NUMNODES]; | ||
209 | 217 | ||
210 | /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */ | 218 | /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */ |
211 | int movable_zone; | 219 | int movable_zone; |
@@ -239,15 +247,20 @@ static int page_outside_zone_boundaries(struct zone *zone, struct page *page) | |||
239 | int ret = 0; | 247 | int ret = 0; |
240 | unsigned seq; | 248 | unsigned seq; |
241 | unsigned long pfn = page_to_pfn(page); | 249 | unsigned long pfn = page_to_pfn(page); |
250 | unsigned long sp, start_pfn; | ||
242 | 251 | ||
243 | do { | 252 | do { |
244 | seq = zone_span_seqbegin(zone); | 253 | seq = zone_span_seqbegin(zone); |
245 | if (pfn >= zone->zone_start_pfn + zone->spanned_pages) | 254 | start_pfn = zone->zone_start_pfn; |
246 | ret = 1; | 255 | sp = zone->spanned_pages; |
247 | else if (pfn < zone->zone_start_pfn) | 256 | if (!zone_spans_pfn(zone, pfn)) |
248 | ret = 1; | 257 | ret = 1; |
249 | } while (zone_span_seqretry(zone, seq)); | 258 | } while (zone_span_seqretry(zone, seq)); |
250 | 259 | ||
260 | if (ret) | ||
261 | pr_err("page %lu outside zone [ %lu - %lu ]\n", | ||
262 | pfn, start_pfn, start_pfn + sp); | ||
263 | |||
251 | return ret; | 264 | return ret; |
252 | } | 265 | } |
253 | 266 | ||
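The check keeps the zone-span seqcount retry loop but now snapshots start_pfn and spanned_pages so the pr_err() afterwards prints a consistent pair. The reader-side retry pattern itself, as a kernel-style sketch on an ordinary seqlock_t (the zone code uses its own zone_span_seqbegin()/zone_span_seqretry() wrappers):

#include <linux/seqlock.h>

/* Generic reader-side retry loop, the same shape as the zone-span check:
 * take a consistent snapshot of two fields that a writer may update
 * together, retrying if a write raced with the read. */
static void read_span(seqlock_t *lock, const unsigned long *start,
		      const unsigned long *pages,
		      unsigned long *start_snap, unsigned long *pages_snap)
{
	unsigned int seq;

	do {
		seq = read_seqbegin(lock);
		*start_snap = *start;
		*pages_snap = *pages;
	} while (read_seqretry(lock, seq));
}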
@@ -287,7 +300,7 @@ static void bad_page(struct page *page) | |||
287 | 300 | ||
288 | /* Don't complain about poisoned pages */ | 301 | /* Don't complain about poisoned pages */ |
289 | if (PageHWPoison(page)) { | 302 | if (PageHWPoison(page)) { |
290 | reset_page_mapcount(page); /* remove PageBuddy */ | 303 | page_mapcount_reset(page); /* remove PageBuddy */ |
291 | return; | 304 | return; |
292 | } | 305 | } |
293 | 306 | ||
@@ -319,8 +332,8 @@ static void bad_page(struct page *page) | |||
319 | dump_stack(); | 332 | dump_stack(); |
320 | out: | 333 | out: |
321 | /* Leave bad fields for debug, except PageBuddy could make trouble */ | 334 | /* Leave bad fields for debug, except PageBuddy could make trouble */ |
322 | reset_page_mapcount(page); /* remove PageBuddy */ | 335 | page_mapcount_reset(page); /* remove PageBuddy */ |
323 | add_taint(TAINT_BAD_PAGE); | 336 | add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); |
324 | } | 337 | } |
325 | 338 | ||
326 | /* | 339 | /* |
@@ -532,6 +545,8 @@ static inline void __free_one_page(struct page *page, | |||
532 | unsigned long uninitialized_var(buddy_idx); | 545 | unsigned long uninitialized_var(buddy_idx); |
533 | struct page *buddy; | 546 | struct page *buddy; |
534 | 547 | ||
548 | VM_BUG_ON(!zone_is_initialized(zone)); | ||
549 | |||
535 | if (unlikely(PageCompound(page))) | 550 | if (unlikely(PageCompound(page))) |
536 | if (unlikely(destroy_compound_page(page, order))) | 551 | if (unlikely(destroy_compound_page(page, order))) |
537 | return; | 552 | return; |
@@ -605,7 +620,7 @@ static inline int free_pages_check(struct page *page) | |||
605 | bad_page(page); | 620 | bad_page(page); |
606 | return 1; | 621 | return 1; |
607 | } | 622 | } |
608 | reset_page_last_nid(page); | 623 | page_nid_reset_last(page); |
609 | if (page->flags & PAGE_FLAGS_CHECK_AT_PREP) | 624 | if (page->flags & PAGE_FLAGS_CHECK_AT_PREP) |
610 | page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; | 625 | page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; |
611 | return 0; | 626 | return 0; |
@@ -665,7 +680,7 @@ static void free_pcppages_bulk(struct zone *zone, int count, | |||
665 | /* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */ | 680 | /* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */ |
666 | __free_one_page(page, zone, 0, mt); | 681 | __free_one_page(page, zone, 0, mt); |
667 | trace_mm_page_pcpu_drain(page, 0, mt); | 682 | trace_mm_page_pcpu_drain(page, 0, mt); |
668 | if (likely(get_pageblock_migratetype(page) != MIGRATE_ISOLATE)) { | 683 | if (likely(!is_migrate_isolate_page(page))) { |
669 | __mod_zone_page_state(zone, NR_FREE_PAGES, 1); | 684 | __mod_zone_page_state(zone, NR_FREE_PAGES, 1); |
670 | if (is_migrate_cma(mt)) | 685 | if (is_migrate_cma(mt)) |
671 | __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, 1); | 686 | __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, 1); |
@@ -683,7 +698,7 @@ static void free_one_page(struct zone *zone, struct page *page, int order, | |||
683 | zone->pages_scanned = 0; | 698 | zone->pages_scanned = 0; |
684 | 699 | ||
685 | __free_one_page(page, zone, order, migratetype); | 700 | __free_one_page(page, zone, order, migratetype); |
686 | if (unlikely(migratetype != MIGRATE_ISOLATE)) | 701 | if (unlikely(!is_migrate_isolate(migratetype))) |
687 | __mod_zone_freepage_state(zone, 1 << order, migratetype); | 702 | __mod_zone_freepage_state(zone, 1 << order, migratetype); |
688 | spin_unlock(&zone->lock); | 703 | spin_unlock(&zone->lock); |
689 | } | 704 | } |
@@ -773,6 +788,10 @@ void __init init_cma_reserved_pageblock(struct page *page) | |||
773 | set_pageblock_migratetype(page, MIGRATE_CMA); | 788 | set_pageblock_migratetype(page, MIGRATE_CMA); |
774 | __free_pages(page, pageblock_order); | 789 | __free_pages(page, pageblock_order); |
775 | totalram_pages += pageblock_nr_pages; | 790 | totalram_pages += pageblock_nr_pages; |
791 | #ifdef CONFIG_HIGHMEM | ||
792 | if (PageHighMem(page)) | ||
793 | totalhigh_pages += pageblock_nr_pages; | ||
794 | #endif | ||
776 | } | 795 | } |
777 | #endif | 796 | #endif |
778 | 797 | ||
@@ -911,7 +930,9 @@ static int fallbacks[MIGRATE_TYPES][4] = { | |||
911 | [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE }, | 930 | [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE }, |
912 | #endif | 931 | #endif |
913 | [MIGRATE_RESERVE] = { MIGRATE_RESERVE }, /* Never used */ | 932 | [MIGRATE_RESERVE] = { MIGRATE_RESERVE }, /* Never used */ |
933 | #ifdef CONFIG_MEMORY_ISOLATION | ||
914 | [MIGRATE_ISOLATE] = { MIGRATE_RESERVE }, /* Never used */ | 934 | [MIGRATE_ISOLATE] = { MIGRATE_RESERVE }, /* Never used */ |
935 | #endif | ||
915 | }; | 936 | }; |
916 | 937 | ||
917 | /* | 938 | /* |
@@ -976,9 +997,9 @@ int move_freepages_block(struct zone *zone, struct page *page, | |||
976 | end_pfn = start_pfn + pageblock_nr_pages - 1; | 997 | end_pfn = start_pfn + pageblock_nr_pages - 1; |
977 | 998 | ||
978 | /* Do not cross zone boundaries */ | 999 | /* Do not cross zone boundaries */ |
979 | if (start_pfn < zone->zone_start_pfn) | 1000 | if (!zone_spans_pfn(zone, start_pfn)) |
980 | start_page = page; | 1001 | start_page = page; |
981 | if (end_pfn >= zone->zone_start_pfn + zone->spanned_pages) | 1002 | if (!zone_spans_pfn(zone, end_pfn)) |
982 | return 0; | 1003 | return 0; |
983 | 1004 | ||
984 | return move_freepages(zone, start_page, end_page, migratetype); | 1005 | return move_freepages(zone, start_page, end_page, migratetype); |
@@ -1137,7 +1158,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, | |||
1137 | list_add_tail(&page->lru, list); | 1158 | list_add_tail(&page->lru, list); |
1138 | if (IS_ENABLED(CONFIG_CMA)) { | 1159 | if (IS_ENABLED(CONFIG_CMA)) { |
1139 | mt = get_pageblock_migratetype(page); | 1160 | mt = get_pageblock_migratetype(page); |
1140 | if (!is_migrate_cma(mt) && mt != MIGRATE_ISOLATE) | 1161 | if (!is_migrate_cma(mt) && !is_migrate_isolate(mt)) |
1141 | mt = migratetype; | 1162 | mt = migratetype; |
1142 | } | 1163 | } |
1143 | set_freepage_migratetype(page, mt); | 1164 | set_freepage_migratetype(page, mt); |
@@ -1272,7 +1293,7 @@ void mark_free_pages(struct zone *zone) | |||
1272 | 1293 | ||
1273 | spin_lock_irqsave(&zone->lock, flags); | 1294 | spin_lock_irqsave(&zone->lock, flags); |
1274 | 1295 | ||
1275 | max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; | 1296 | max_zone_pfn = zone_end_pfn(zone); |
1276 | for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) | 1297 | for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) |
1277 | if (pfn_valid(pfn)) { | 1298 | if (pfn_valid(pfn)) { |
1278 | struct page *page = pfn_to_page(pfn); | 1299 | struct page *page = pfn_to_page(pfn); |
@@ -1321,7 +1342,7 @@ void free_hot_cold_page(struct page *page, int cold) | |||
1321 | * excessively into the page allocator | 1342 | * excessively into the page allocator |
1322 | */ | 1343 | */ |
1323 | if (migratetype >= MIGRATE_PCPTYPES) { | 1344 | if (migratetype >= MIGRATE_PCPTYPES) { |
1324 | if (unlikely(migratetype == MIGRATE_ISOLATE)) { | 1345 | if (unlikely(is_migrate_isolate(migratetype))) { |
1325 | free_one_page(zone, page, 0, migratetype); | 1346 | free_one_page(zone, page, 0, migratetype); |
1326 | goto out; | 1347 | goto out; |
1327 | } | 1348 | } |
@@ -1384,14 +1405,8 @@ void split_page(struct page *page, unsigned int order) | |||
1384 | set_page_refcounted(page + i); | 1405 | set_page_refcounted(page + i); |
1385 | } | 1406 | } |
1386 | 1407 | ||
1387 | /* | 1408 | static int __isolate_free_page(struct page *page, unsigned int order) |
1388 | * Similar to the split_page family of functions except that the page | ||
1389 | * required at the given order and being isolated now to prevent races | ||
1390 | * with parallel allocators | ||
1391 | */ | ||
1392 | int capture_free_page(struct page *page, int alloc_order, int migratetype) | ||
1393 | { | 1409 | { |
1394 | unsigned int order; | ||
1395 | unsigned long watermark; | 1410 | unsigned long watermark; |
1396 | struct zone *zone; | 1411 | struct zone *zone; |
1397 | int mt; | 1412 | int mt; |
@@ -1399,16 +1414,15 @@ int capture_free_page(struct page *page, int alloc_order, int migratetype) | |||
1399 | BUG_ON(!PageBuddy(page)); | 1414 | BUG_ON(!PageBuddy(page)); |
1400 | 1415 | ||
1401 | zone = page_zone(page); | 1416 | zone = page_zone(page); |
1402 | order = page_order(page); | ||
1403 | mt = get_pageblock_migratetype(page); | 1417 | mt = get_pageblock_migratetype(page); |
1404 | 1418 | ||
1405 | if (mt != MIGRATE_ISOLATE) { | 1419 | if (!is_migrate_isolate(mt)) { |
1406 | /* Obey watermarks as if the page was being allocated */ | 1420 | /* Obey watermarks as if the page was being allocated */ |
1407 | watermark = low_wmark_pages(zone) + (1 << order); | 1421 | watermark = low_wmark_pages(zone) + (1 << order); |
1408 | if (!zone_watermark_ok(zone, 0, watermark, 0, 0)) | 1422 | if (!zone_watermark_ok(zone, 0, watermark, 0, 0)) |
1409 | return 0; | 1423 | return 0; |
1410 | 1424 | ||
1411 | __mod_zone_freepage_state(zone, -(1UL << alloc_order), mt); | 1425 | __mod_zone_freepage_state(zone, -(1UL << order), mt); |
1412 | } | 1426 | } |
1413 | 1427 | ||
1414 | /* Remove page from free list */ | 1428 | /* Remove page from free list */ |
@@ -1416,22 +1430,18 @@ int capture_free_page(struct page *page, int alloc_order, int migratetype) | |||
1416 | zone->free_area[order].nr_free--; | 1430 | zone->free_area[order].nr_free--; |
1417 | rmv_page_order(page); | 1431 | rmv_page_order(page); |
1418 | 1432 | ||
1419 | if (alloc_order != order) | 1433 | /* Set the pageblock if the isolated page is at least a pageblock */ |
1420 | expand(zone, page, alloc_order, order, | ||
1421 | &zone->free_area[order], migratetype); | ||
1422 | |||
1423 | /* Set the pageblock if the captured page is at least a pageblock */ | ||
1424 | if (order >= pageblock_order - 1) { | 1434 | if (order >= pageblock_order - 1) { |
1425 | struct page *endpage = page + (1 << order) - 1; | 1435 | struct page *endpage = page + (1 << order) - 1; |
1426 | for (; page < endpage; page += pageblock_nr_pages) { | 1436 | for (; page < endpage; page += pageblock_nr_pages) { |
1427 | int mt = get_pageblock_migratetype(page); | 1437 | int mt = get_pageblock_migratetype(page); |
1428 | if (mt != MIGRATE_ISOLATE && !is_migrate_cma(mt)) | 1438 | if (!is_migrate_isolate(mt) && !is_migrate_cma(mt)) |
1429 | set_pageblock_migratetype(page, | 1439 | set_pageblock_migratetype(page, |
1430 | MIGRATE_MOVABLE); | 1440 | MIGRATE_MOVABLE); |
1431 | } | 1441 | } |
1432 | } | 1442 | } |
1433 | 1443 | ||
1434 | return 1UL << alloc_order; | 1444 | return 1UL << order; |
1435 | } | 1445 | } |
1436 | 1446 | ||
1437 | /* | 1447 | /* |
@@ -1449,10 +1459,9 @@ int split_free_page(struct page *page) | |||
1449 | unsigned int order; | 1459 | unsigned int order; |
1450 | int nr_pages; | 1460 | int nr_pages; |
1451 | 1461 | ||
1452 | BUG_ON(!PageBuddy(page)); | ||
1453 | order = page_order(page); | 1462 | order = page_order(page); |
1454 | 1463 | ||
1455 | nr_pages = capture_free_page(page, order, 0); | 1464 | nr_pages = __isolate_free_page(page, order); |
1456 | if (!nr_pages) | 1465 | if (!nr_pages) |
1457 | return 0; | 1466 | return 0; |
1458 | 1467 | ||
@@ -2136,8 +2145,6 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, | |||
2136 | bool *contended_compaction, bool *deferred_compaction, | 2145 | bool *contended_compaction, bool *deferred_compaction, |
2137 | unsigned long *did_some_progress) | 2146 | unsigned long *did_some_progress) |
2138 | { | 2147 | { |
2139 | struct page *page = NULL; | ||
2140 | |||
2141 | if (!order) | 2148 | if (!order) |
2142 | return NULL; | 2149 | return NULL; |
2143 | 2150 | ||
@@ -2149,16 +2156,12 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, | |||
2149 | current->flags |= PF_MEMALLOC; | 2156 | current->flags |= PF_MEMALLOC; |
2150 | *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask, | 2157 | *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask, |
2151 | nodemask, sync_migration, | 2158 | nodemask, sync_migration, |
2152 | contended_compaction, &page); | 2159 | contended_compaction); |
2153 | current->flags &= ~PF_MEMALLOC; | 2160 | current->flags &= ~PF_MEMALLOC; |
2154 | 2161 | ||
2155 | /* If compaction captured a page, prep and use it */ | ||
2156 | if (page) { | ||
2157 | prep_new_page(page, order, gfp_mask); | ||
2158 | goto got_page; | ||
2159 | } | ||
2160 | |||
2161 | if (*did_some_progress != COMPACT_SKIPPED) { | 2162 | if (*did_some_progress != COMPACT_SKIPPED) { |
2163 | struct page *page; | ||
2164 | |||
2162 | /* Page migration frees to the PCP lists but we want merging */ | 2165 | /* Page migration frees to the PCP lists but we want merging */ |
2163 | drain_pages(get_cpu()); | 2166 | drain_pages(get_cpu()); |
2164 | put_cpu(); | 2167 | put_cpu(); |
@@ -2168,7 +2171,6 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, | |||
2168 | alloc_flags & ~ALLOC_NO_WATERMARKS, | 2171 | alloc_flags & ~ALLOC_NO_WATERMARKS, |
2169 | preferred_zone, migratetype); | 2172 | preferred_zone, migratetype); |
2170 | if (page) { | 2173 | if (page) { |
2171 | got_page: | ||
2172 | preferred_zone->compact_blockskip_flush = false; | 2174 | preferred_zone->compact_blockskip_flush = false; |
2173 | preferred_zone->compact_considered = 0; | 2175 | preferred_zone->compact_considered = 0; |
2174 | preferred_zone->compact_defer_shift = 0; | 2176 | preferred_zone->compact_defer_shift = 0; |
@@ -2629,10 +2631,17 @@ retry_cpuset: | |||
2629 | page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, | 2631 | page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, |
2630 | zonelist, high_zoneidx, alloc_flags, | 2632 | zonelist, high_zoneidx, alloc_flags, |
2631 | preferred_zone, migratetype); | 2633 | preferred_zone, migratetype); |
2632 | if (unlikely(!page)) | 2634 | if (unlikely(!page)) { |
2635 | /* | ||
2636 | * Runtime PM, block IO and its error handling path | ||
2637 | * can deadlock because I/O on the device might not | ||
2638 | * complete. | ||
2639 | */ | ||
2640 | gfp_mask = memalloc_noio_flags(gfp_mask); | ||
2633 | page = __alloc_pages_slowpath(gfp_mask, order, | 2641 | page = __alloc_pages_slowpath(gfp_mask, order, |
2634 | zonelist, high_zoneidx, nodemask, | 2642 | zonelist, high_zoneidx, nodemask, |
2635 | preferred_zone, migratetype); | 2643 | preferred_zone, migratetype); |
2644 | } | ||
2636 | 2645 | ||
2637 | trace_mm_page_alloc(page, order, gfp_mask, migratetype); | 2646 | trace_mm_page_alloc(page, order, gfp_mask, migratetype); |
2638 | 2647 | ||
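The memalloc_noio_flags() call added above addresses the deadlock described in the comment: an allocation made from a runtime-PM or block-I/O error path must not be allowed to start new I/O on the device being recovered. A minimal standalone sketch of the idea (flag values and names here are illustrative assumptions, not the kernel's definitions):

#include <stdio.h>

/* Illustrative flag values; the kernel defines the real ones elsewhere. */
#define GFP_IO           0x40u
#define GFP_FS           0x80u
#define PF_MEMALLOC_NOIO 0x080000u

/* Strip the I/O-capable GFP bits when the task has marked itself "noio",
 * so the allocator cannot recurse into I/O that would deadlock. */
static unsigned int memalloc_noio_flags_sketch(unsigned int gfp_mask,
					       unsigned int task_flags)
{
	if (task_flags & PF_MEMALLOC_NOIO)
		gfp_mask &= ~(GFP_IO | GFP_FS);
	return gfp_mask;
}

int main(void)
{
	unsigned int mask = GFP_IO | GFP_FS | 0x10u;	/* some allocation mask */

	printf("%#x -> %#x\n", mask,
	       memalloc_noio_flags_sketch(mask, PF_MEMALLOC_NOIO));
	return 0;
}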
@@ -2804,18 +2813,27 @@ void free_pages_exact(void *virt, size_t size) | |||
2804 | } | 2813 | } |
2805 | EXPORT_SYMBOL(free_pages_exact); | 2814 | EXPORT_SYMBOL(free_pages_exact); |
2806 | 2815 | ||
2807 | static unsigned int nr_free_zone_pages(int offset) | 2816 | /** |
2817 | * nr_free_zone_pages - count number of pages beyond high watermark | ||
2818 | * @offset: The zone index of the highest zone | ||
2819 | * | ||
2820 | * nr_free_zone_pages() counts the number of pages which are beyond the | ||
2821 | * high watermark within all zones at or below a given zone index. For each | ||
2822 | * zone, the number of pages is calculated as: | ||
2823 | * present_pages - high_pages | ||
2824 | */ | ||
2825 | static unsigned long nr_free_zone_pages(int offset) | ||
2808 | { | 2826 | { |
2809 | struct zoneref *z; | 2827 | struct zoneref *z; |
2810 | struct zone *zone; | 2828 | struct zone *zone; |
2811 | 2829 | ||
2812 | /* Just pick one node, since fallback list is circular */ | 2830 | /* Just pick one node, since fallback list is circular */ |
2813 | unsigned int sum = 0; | 2831 | unsigned long sum = 0; |
2814 | 2832 | ||
2815 | struct zonelist *zonelist = node_zonelist(numa_node_id(), GFP_KERNEL); | 2833 | struct zonelist *zonelist = node_zonelist(numa_node_id(), GFP_KERNEL); |
2816 | 2834 | ||
2817 | for_each_zone_zonelist(zone, z, zonelist, offset) { | 2835 | for_each_zone_zonelist(zone, z, zonelist, offset) { |
2818 | unsigned long size = zone->present_pages; | 2836 | unsigned long size = zone->managed_pages; |
2819 | unsigned long high = high_wmark_pages(zone); | 2837 | unsigned long high = high_wmark_pages(zone); |
2820 | if (size > high) | 2838 | if (size > high) |
2821 | sum += size - high; | 2839 | sum += size - high; |
@@ -2824,19 +2842,25 @@ static unsigned int nr_free_zone_pages(int offset) | |||
2824 | return sum; | 2842 | return sum; |
2825 | } | 2843 | } |
2826 | 2844 | ||
2827 | /* | 2845 | /** |
2828 | * Amount of free RAM allocatable within ZONE_DMA and ZONE_NORMAL | 2846 | * nr_free_buffer_pages - count number of pages beyond high watermark |
2847 | * | ||
2848 | * nr_free_buffer_pages() counts the number of pages which are beyond the high | ||
2849 | * watermark within ZONE_DMA and ZONE_NORMAL. | ||
2829 | */ | 2850 | */ |
2830 | unsigned int nr_free_buffer_pages(void) | 2851 | unsigned long nr_free_buffer_pages(void) |
2831 | { | 2852 | { |
2832 | return nr_free_zone_pages(gfp_zone(GFP_USER)); | 2853 | return nr_free_zone_pages(gfp_zone(GFP_USER)); |
2833 | } | 2854 | } |
2834 | EXPORT_SYMBOL_GPL(nr_free_buffer_pages); | 2855 | EXPORT_SYMBOL_GPL(nr_free_buffer_pages); |
2835 | 2856 | ||
2836 | /* | 2857 | /** |
2837 | * Amount of free RAM allocatable within all zones | 2858 | * nr_free_pagecache_pages - count number of pages beyond high watermark |
2859 | * | ||
2860 | * nr_free_pagecache_pages() counts the number of pages which are beyond the | ||
2861 | * high watermark within all zones. | ||
2838 | */ | 2862 | */ |
2839 | unsigned int nr_free_pagecache_pages(void) | 2863 | unsigned long nr_free_pagecache_pages(void) |
2840 | { | 2864 | { |
2841 | return nr_free_zone_pages(gfp_zone(GFP_HIGHUSER_MOVABLE)); | 2865 | return nr_free_zone_pages(gfp_zone(GFP_HIGHUSER_MOVABLE)); |
2842 | } | 2866 | } |
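The kernel-doc added above spells out the calculation: for every zone at or below the given index, count the pages above the high watermark (clamped at zero) and sum them, now in an unsigned long so large machines cannot overflow the total. A rough standalone sketch with made-up per-zone figures:

#include <stdio.h>

/* Hypothetical per-zone numbers, in pages; the real values live in struct zone. */
struct zone_info {
	const char *name;
	unsigned long managed_pages;	/* pages handed to the buddy allocator */
	unsigned long high_wmark;	/* high watermark in pages */
};

/* Sum the pages beyond the high watermark, zone by zone. */
static unsigned long pages_beyond_high(const struct zone_info *zones, int nr)
{
	unsigned long sum = 0;
	int i;

	for (i = 0; i < nr; i++)
		if (zones[i].managed_pages > zones[i].high_wmark)
			sum += zones[i].managed_pages - zones[i].high_wmark;
	return sum;
}

int main(void)
{
	struct zone_info zones[] = {
		{ "DMA",    4000,    128 },
		{ "Normal", 200000, 2048 },
	};

	printf("pages beyond high watermark: %lu\n", pages_beyond_high(zones, 2));
	return 0;
}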
@@ -2868,7 +2892,7 @@ void si_meminfo_node(struct sysinfo *val, int nid) | |||
2868 | val->totalram = pgdat->node_present_pages; | 2892 | val->totalram = pgdat->node_present_pages; |
2869 | val->freeram = node_page_state(nid, NR_FREE_PAGES); | 2893 | val->freeram = node_page_state(nid, NR_FREE_PAGES); |
2870 | #ifdef CONFIG_HIGHMEM | 2894 | #ifdef CONFIG_HIGHMEM |
2871 | val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].present_pages; | 2895 | val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].managed_pages; |
2872 | val->freehigh = zone_page_state(&pgdat->node_zones[ZONE_HIGHMEM], | 2896 | val->freehigh = zone_page_state(&pgdat->node_zones[ZONE_HIGHMEM], |
2873 | NR_FREE_PAGES); | 2897 | NR_FREE_PAGES); |
2874 | #else | 2898 | #else |
@@ -2911,7 +2935,9 @@ static void show_migration_types(unsigned char type) | |||
2911 | #ifdef CONFIG_CMA | 2935 | #ifdef CONFIG_CMA |
2912 | [MIGRATE_CMA] = 'C', | 2936 | [MIGRATE_CMA] = 'C', |
2913 | #endif | 2937 | #endif |
2938 | #ifdef CONFIG_MEMORY_ISOLATION | ||
2914 | [MIGRATE_ISOLATE] = 'I', | 2939 | [MIGRATE_ISOLATE] = 'I', |
2940 | #endif | ||
2915 | }; | 2941 | }; |
2916 | char tmp[MIGRATE_TYPES + 1]; | 2942 | char tmp[MIGRATE_TYPES + 1]; |
2917 | char *p = tmp; | 2943 | char *p = tmp; |
@@ -3250,7 +3276,7 @@ static int find_next_best_node(int node, nodemask_t *used_node_mask) | |||
3250 | { | 3276 | { |
3251 | int n, val; | 3277 | int n, val; |
3252 | int min_val = INT_MAX; | 3278 | int min_val = INT_MAX; |
3253 | int best_node = -1; | 3279 | int best_node = NUMA_NO_NODE; |
3254 | const struct cpumask *tmp = cpumask_of_node(0); | 3280 | const struct cpumask *tmp = cpumask_of_node(0); |
3255 | 3281 | ||
3256 | /* Use the local node if we haven't already */ | 3282 | /* Use the local node if we haven't already */ |
@@ -3794,7 +3820,7 @@ static void setup_zone_migrate_reserve(struct zone *zone) | |||
3794 | * the block. | 3820 | * the block. |
3795 | */ | 3821 | */ |
3796 | start_pfn = zone->zone_start_pfn; | 3822 | start_pfn = zone->zone_start_pfn; |
3797 | end_pfn = start_pfn + zone->spanned_pages; | 3823 | end_pfn = zone_end_pfn(zone); |
3798 | start_pfn = roundup(start_pfn, pageblock_nr_pages); | 3824 | start_pfn = roundup(start_pfn, pageblock_nr_pages); |
3799 | reserve = roundup(min_wmark_pages(zone), pageblock_nr_pages) >> | 3825 | reserve = roundup(min_wmark_pages(zone), pageblock_nr_pages) >> |
3800 | pageblock_order; | 3826 | pageblock_order; |
@@ -3890,8 +3916,8 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, | |||
3890 | set_page_links(page, zone, nid, pfn); | 3916 | set_page_links(page, zone, nid, pfn); |
3891 | mminit_verify_page_links(page, zone, nid, pfn); | 3917 | mminit_verify_page_links(page, zone, nid, pfn); |
3892 | init_page_count(page); | 3918 | init_page_count(page); |
3893 | reset_page_mapcount(page); | 3919 | page_mapcount_reset(page); |
3894 | reset_page_last_nid(page); | 3920 | page_nid_reset_last(page); |
3895 | SetPageReserved(page); | 3921 | SetPageReserved(page); |
3896 | /* | 3922 | /* |
3897 | * Mark the block movable so that blocks are reserved for | 3923 | * Mark the block movable so that blocks are reserved for |
@@ -3908,7 +3934,7 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, | |||
3908 | * pfn out of zone. | 3934 | * pfn out of zone. |
3909 | */ | 3935 | */ |
3910 | if ((z->zone_start_pfn <= pfn) | 3936 | if ((z->zone_start_pfn <= pfn) |
3911 | && (pfn < z->zone_start_pfn + z->spanned_pages) | 3937 | && (pfn < zone_end_pfn(z)) |
3912 | && !(pfn & (pageblock_nr_pages - 1))) | 3938 | && !(pfn & (pageblock_nr_pages - 1))) |
3913 | set_pageblock_migratetype(page, MIGRATE_MOVABLE); | 3939 | set_pageblock_migratetype(page, MIGRATE_MOVABLE); |
3914 | 3940 | ||
@@ -3946,7 +3972,7 @@ static int __meminit zone_batchsize(struct zone *zone) | |||
3946 | * | 3972 | * |
3947 | * OK, so we don't know how big the cache is. So guess. | 3973 | * OK, so we don't know how big the cache is. So guess. |
3948 | */ | 3974 | */ |
3949 | batch = zone->present_pages / 1024; | 3975 | batch = zone->managed_pages / 1024; |
3950 | if (batch * PAGE_SIZE > 512 * 1024) | 3976 | if (batch * PAGE_SIZE > 512 * 1024) |
3951 | batch = (512 * 1024) / PAGE_SIZE; | 3977 | batch = (512 * 1024) / PAGE_SIZE; |
3952 | batch /= 4; /* We effectively *= 4 below */ | 3978 | batch /= 4; /* We effectively *= 4 below */ |
@@ -4030,7 +4056,7 @@ static void __meminit setup_zone_pageset(struct zone *zone) | |||
4030 | 4056 | ||
4031 | if (percpu_pagelist_fraction) | 4057 | if (percpu_pagelist_fraction) |
4032 | setup_pagelist_highmark(pcp, | 4058 | setup_pagelist_highmark(pcp, |
4033 | (zone->present_pages / | 4059 | (zone->managed_pages / |
4034 | percpu_pagelist_fraction)); | 4060 | percpu_pagelist_fraction)); |
4035 | } | 4061 | } |
4036 | } | 4062 | } |
@@ -4386,6 +4412,77 @@ static unsigned long __meminit zone_absent_pages_in_node(int nid, | |||
4386 | return __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn); | 4412 | return __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn); |
4387 | } | 4413 | } |
4388 | 4414 | ||
4415 | /** | ||
4416 | * sanitize_zone_movable_limit - Sanitize the zone_movable_limit array. | ||
4417 | * | ||
4418 | * zone_movable_limit is initialized as 0. This function will try to get | ||
4419 | * the first ZONE_MOVABLE pfn of each node from movablemem_map, and | ||
4420 | * assigne them to zone_movable_limit. | ||
4421 | * assign them to zone_movable_limit. | ||
4422 | * | ||
4423 | * Note: Each range is represented as [start_pfn, end_pfn) | ||
4424 | */ | ||
4425 | static void __meminit sanitize_zone_movable_limit(void) | ||
4426 | { | ||
4427 | int map_pos = 0, i, nid; | ||
4428 | unsigned long start_pfn, end_pfn; | ||
4429 | |||
4430 | if (!movablemem_map.nr_map) | ||
4431 | return; | ||
4432 | |||
4433 | /* Iterate all ranges from minimum to maximum */ | ||
4434 | for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) { | ||
4435 | /* | ||
4436 | * If we have found the lowest pfn of ZONE_MOVABLE for the node | ||
4437 | * specified by the user, just go on to check the next range. | ||
4438 | */ | ||
4439 | if (zone_movable_limit[nid]) | ||
4440 | continue; | ||
4441 | |||
4442 | #ifdef CONFIG_ZONE_DMA | ||
4443 | /* Skip DMA memory. */ | ||
4444 | if (start_pfn < arch_zone_highest_possible_pfn[ZONE_DMA]) | ||
4445 | start_pfn = arch_zone_highest_possible_pfn[ZONE_DMA]; | ||
4446 | #endif | ||
4447 | |||
4448 | #ifdef CONFIG_ZONE_DMA32 | ||
4449 | /* Skip DMA32 memory. */ | ||
4450 | if (start_pfn < arch_zone_highest_possible_pfn[ZONE_DMA32]) | ||
4451 | start_pfn = arch_zone_highest_possible_pfn[ZONE_DMA32]; | ||
4452 | #endif | ||
4453 | |||
4454 | #ifdef CONFIG_HIGHMEM | ||
4455 | /* Skip lowmem if ZONE_MOVABLE is highmem. */ | ||
4456 | if (zone_movable_is_highmem() && | ||
4457 | start_pfn < arch_zone_lowest_possible_pfn[ZONE_HIGHMEM]) | ||
4458 | start_pfn = arch_zone_lowest_possible_pfn[ZONE_HIGHMEM]; | ||
4459 | #endif | ||
4460 | |||
4461 | if (start_pfn >= end_pfn) | ||
4462 | continue; | ||
4463 | |||
4464 | while (map_pos < movablemem_map.nr_map) { | ||
4465 | if (end_pfn <= movablemem_map.map[map_pos].start_pfn) | ||
4466 | break; | ||
4467 | |||
4468 | if (start_pfn >= movablemem_map.map[map_pos].end_pfn) { | ||
4469 | map_pos++; | ||
4470 | continue; | ||
4471 | } | ||
4472 | |||
4473 | /* | ||
4474 | * The start_pfn of ZONE_MOVABLE is either the minimum | ||
4475 | * pfn specified by movablemem_map, or 0, which means | ||
4476 | * the node has no ZONE_MOVABLE. | ||
4477 | */ | ||
4478 | zone_movable_limit[nid] = max(start_pfn, | ||
4479 | movablemem_map.map[map_pos].start_pfn); | ||
4480 | |||
4481 | break; | ||
4482 | } | ||
4483 | } | ||
4484 | } | ||
4485 | |||
4389 | #else /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ | 4486 | #else /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ |
4390 | static inline unsigned long __meminit zone_spanned_pages_in_node(int nid, | 4487 | static inline unsigned long __meminit zone_spanned_pages_in_node(int nid, |
4391 | unsigned long zone_type, | 4488 | unsigned long zone_type, |
@@ -4403,7 +4500,6 @@ static inline unsigned long __meminit zone_absent_pages_in_node(int nid, | |||
4403 | 4500 | ||
4404 | return zholes_size[zone_type]; | 4501 | return zholes_size[zone_type]; |
4405 | } | 4502 | } |
4406 | |||
4407 | #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ | 4503 | #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ |
4408 | 4504 | ||
4409 | static void __meminit calculate_node_totalpages(struct pglist_data *pgdat, | 4505 | static void __meminit calculate_node_totalpages(struct pglist_data *pgdat, |
@@ -4435,10 +4531,11 @@ static void __meminit calculate_node_totalpages(struct pglist_data *pgdat, | |||
4435 | * round what is now in bits to nearest long in bits, then return it in | 4531 | * round what is now in bits to nearest long in bits, then return it in |
4436 | * bytes. | 4532 | * bytes. |
4437 | */ | 4533 | */ |
4438 | static unsigned long __init usemap_size(unsigned long zonesize) | 4534 | static unsigned long __init usemap_size(unsigned long zone_start_pfn, unsigned long zonesize) |
4439 | { | 4535 | { |
4440 | unsigned long usemapsize; | 4536 | unsigned long usemapsize; |
4441 | 4537 | ||
4538 | zonesize += zone_start_pfn & (pageblock_nr_pages-1); | ||
4442 | usemapsize = roundup(zonesize, pageblock_nr_pages); | 4539 | usemapsize = roundup(zonesize, pageblock_nr_pages); |
4443 | usemapsize = usemapsize >> pageblock_order; | 4540 | usemapsize = usemapsize >> pageblock_order; |
4444 | usemapsize *= NR_PAGEBLOCK_BITS; | 4541 | usemapsize *= NR_PAGEBLOCK_BITS; |
@@ -4448,17 +4545,19 @@ static unsigned long __init usemap_size(unsigned long zonesize) | |||
4448 | } | 4545 | } |
4449 | 4546 | ||
4450 | static void __init setup_usemap(struct pglist_data *pgdat, | 4547 | static void __init setup_usemap(struct pglist_data *pgdat, |
4451 | struct zone *zone, unsigned long zonesize) | 4548 | struct zone *zone, |
4549 | unsigned long zone_start_pfn, | ||
4550 | unsigned long zonesize) | ||
4452 | { | 4551 | { |
4453 | unsigned long usemapsize = usemap_size(zonesize); | 4552 | unsigned long usemapsize = usemap_size(zone_start_pfn, zonesize); |
4454 | zone->pageblock_flags = NULL; | 4553 | zone->pageblock_flags = NULL; |
4455 | if (usemapsize) | 4554 | if (usemapsize) |
4456 | zone->pageblock_flags = alloc_bootmem_node_nopanic(pgdat, | 4555 | zone->pageblock_flags = alloc_bootmem_node_nopanic(pgdat, |
4457 | usemapsize); | 4556 | usemapsize); |
4458 | } | 4557 | } |
4459 | #else | 4558 | #else |
4460 | static inline void setup_usemap(struct pglist_data *pgdat, | 4559 | static inline void setup_usemap(struct pglist_data *pgdat, struct zone *zone, |
4461 | struct zone *zone, unsigned long zonesize) {} | 4560 | unsigned long zone_start_pfn, unsigned long zonesize) {} |
4462 | #endif /* CONFIG_SPARSEMEM */ | 4561 | #endif /* CONFIG_SPARSEMEM */ |
4463 | 4562 | ||
4464 | #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE | 4563 | #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE |
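Passing zone_start_pfn into usemap_size() matters when a zone does not start on a pageblock boundary: the leading partial pageblock needs usemap bits too, so its offset is folded into the size before rounding. A standalone sketch of the arithmetic (the pageblock order and bits-per-pageblock below are assumptions for illustration, not any particular configuration):

#include <stdio.h>

#define PAGEBLOCK_ORDER     9			/* e.g. 2MB pageblocks with 4K pages */
#define PAGEBLOCK_NR_PAGES  (1UL << PAGEBLOCK_ORDER)
#define NR_PAGEBLOCK_BITS   4			/* bits of state kept per pageblock */
#define BITS_PER_LONG       (8 * sizeof(long))

static unsigned long roundup_ul(unsigned long x, unsigned long to)
{
	return ((x + to - 1) / to) * to;
}

/* Usemap size in bytes, including the partial pageblock in front of an
 * unaligned zone_start_pfn -- the case the change above accounts for. */
static unsigned long usemap_size_sketch(unsigned long zone_start_pfn,
					unsigned long zonesize)
{
	unsigned long usemapsize;

	zonesize += zone_start_pfn & (PAGEBLOCK_NR_PAGES - 1);
	usemapsize = roundup_ul(zonesize, PAGEBLOCK_NR_PAGES);
	usemapsize >>= PAGEBLOCK_ORDER;
	usemapsize *= NR_PAGEBLOCK_BITS;
	usemapsize = roundup_ul(usemapsize, BITS_PER_LONG);

	return usemapsize / 8;
}

int main(void)
{
	/* A zone starting 100 pages into a pageblock: the leading partial
	 * pageblock now gets its own bits instead of being left out. */
	printf("%lu bytes\n", usemap_size_sketch(100, 1UL << 20));
	return 0;
}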
@@ -4584,7 +4683,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, | |||
4584 | nr_all_pages += freesize; | 4683 | nr_all_pages += freesize; |
4585 | 4684 | ||
4586 | zone->spanned_pages = size; | 4685 | zone->spanned_pages = size; |
4587 | zone->present_pages = freesize; | 4686 | zone->present_pages = realsize; |
4588 | /* | 4687 | /* |
4589 | * Set an approximate value for lowmem here, it will be adjusted | 4688 | * Set an approximate value for lowmem here, it will be adjusted |
4590 | * when the bootmem allocator frees pages into the buddy system. | 4689 | * when the bootmem allocator frees pages into the buddy system. |
@@ -4609,7 +4708,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, | |||
4609 | continue; | 4708 | continue; |
4610 | 4709 | ||
4611 | set_pageblock_order(); | 4710 | set_pageblock_order(); |
4612 | setup_usemap(pgdat, zone, size); | 4711 | setup_usemap(pgdat, zone, zone_start_pfn, size); |
4613 | ret = init_currently_empty_zone(zone, zone_start_pfn, | 4712 | ret = init_currently_empty_zone(zone, zone_start_pfn, |
4614 | size, MEMMAP_EARLY); | 4713 | size, MEMMAP_EARLY); |
4615 | BUG_ON(ret); | 4714 | BUG_ON(ret); |
@@ -4636,7 +4735,7 @@ static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat) | |||
4636 | * for the buddy allocator to function correctly. | 4735 | * for the buddy allocator to function correctly. |
4637 | */ | 4736 | */ |
4638 | start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1); | 4737 | start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1); |
4639 | end = pgdat->node_start_pfn + pgdat->node_spanned_pages; | 4738 | end = pgdat_end_pfn(pgdat); |
4640 | end = ALIGN(end, MAX_ORDER_NR_PAGES); | 4739 | end = ALIGN(end, MAX_ORDER_NR_PAGES); |
4641 | size = (end - start) * sizeof(struct page); | 4740 | size = (end - start) * sizeof(struct page); |
4642 | map = alloc_remap(pgdat->node_id, size); | 4741 | map = alloc_remap(pgdat->node_id, size); |
@@ -4842,12 +4941,19 @@ static void __init find_zone_movable_pfns_for_nodes(void) | |||
4842 | required_kernelcore = max(required_kernelcore, corepages); | 4941 | required_kernelcore = max(required_kernelcore, corepages); |
4843 | } | 4942 | } |
4844 | 4943 | ||
4845 | /* If kernelcore was not specified, there is no ZONE_MOVABLE */ | 4944 | /* |
4846 | if (!required_kernelcore) | 4945 | * If neither kernelcore/movablecore nor movablemem_map is specified, |
4946 | * there is no ZONE_MOVABLE. But if movablemem_map is specified, the | ||
4947 | * start pfn of ZONE_MOVABLE has been stored in zone_movable_limit[]. | ||
4948 | */ | ||
4949 | if (!required_kernelcore) { | ||
4950 | if (movablemem_map.nr_map) | ||
4951 | memcpy(zone_movable_pfn, zone_movable_limit, | ||
4952 | sizeof(zone_movable_pfn)); | ||
4847 | goto out; | 4953 | goto out; |
4954 | } | ||
4848 | 4955 | ||
4849 | /* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */ | 4956 | /* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */ |
4850 | find_usable_zone_for_movable(); | ||
4851 | usable_startpfn = arch_zone_lowest_possible_pfn[movable_zone]; | 4957 | usable_startpfn = arch_zone_lowest_possible_pfn[movable_zone]; |
4852 | 4958 | ||
4853 | restart: | 4959 | restart: |
@@ -4875,10 +4981,24 @@ restart: | |||
4875 | for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) { | 4981 | for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) { |
4876 | unsigned long size_pages; | 4982 | unsigned long size_pages; |
4877 | 4983 | ||
4984 | /* | ||
4985 | * Find more memory for kernelcore in | ||
4986 | * [zone_movable_pfn[nid], zone_movable_limit[nid]). | ||
4987 | */ | ||
4878 | start_pfn = max(start_pfn, zone_movable_pfn[nid]); | 4988 | start_pfn = max(start_pfn, zone_movable_pfn[nid]); |
4879 | if (start_pfn >= end_pfn) | 4989 | if (start_pfn >= end_pfn) |
4880 | continue; | 4990 | continue; |
4881 | 4991 | ||
4992 | if (zone_movable_limit[nid]) { | ||
4993 | end_pfn = min(end_pfn, zone_movable_limit[nid]); | ||
4994 | /* No range left for kernelcore in this node */ | ||
4995 | if (start_pfn >= end_pfn) { | ||
4996 | zone_movable_pfn[nid] = | ||
4997 | zone_movable_limit[nid]; | ||
4998 | break; | ||
4999 | } | ||
5000 | } | ||
5001 | |||
4882 | /* Account for what is only usable for kernelcore */ | 5002 | /* Account for what is only usable for kernelcore */ |
4883 | if (start_pfn < usable_startpfn) { | 5003 | if (start_pfn < usable_startpfn) { |
4884 | unsigned long kernel_pages; | 5004 | unsigned long kernel_pages; |
@@ -4938,12 +5058,12 @@ restart: | |||
4938 | if (usable_nodes && required_kernelcore > usable_nodes) | 5058 | if (usable_nodes && required_kernelcore > usable_nodes) |
4939 | goto restart; | 5059 | goto restart; |
4940 | 5060 | ||
5061 | out: | ||
4941 | /* Align start of ZONE_MOVABLE on all nids to MAX_ORDER_NR_PAGES */ | 5062 | /* Align start of ZONE_MOVABLE on all nids to MAX_ORDER_NR_PAGES */ |
4942 | for (nid = 0; nid < MAX_NUMNODES; nid++) | 5063 | for (nid = 0; nid < MAX_NUMNODES; nid++) |
4943 | zone_movable_pfn[nid] = | 5064 | zone_movable_pfn[nid] = |
4944 | roundup(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES); | 5065 | roundup(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES); |
4945 | 5066 | ||
4946 | out: | ||
4947 | /* restore the node_state */ | 5067 | /* restore the node_state */ |
4948 | node_states[N_MEMORY] = saved_node_state; | 5068 | node_states[N_MEMORY] = saved_node_state; |
4949 | } | 5069 | } |
@@ -5006,6 +5126,8 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn) | |||
5006 | 5126 | ||
5007 | /* Find the PFNs that ZONE_MOVABLE begins at in each node */ | 5127 | /* Find the PFNs that ZONE_MOVABLE begins at in each node */ |
5008 | memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn)); | 5128 | memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn)); |
5129 | find_usable_zone_for_movable(); | ||
5130 | sanitize_zone_movable_limit(); | ||
5009 | find_zone_movable_pfns_for_nodes(); | 5131 | find_zone_movable_pfns_for_nodes(); |
5010 | 5132 | ||
5011 | /* Print out the zone ranges */ | 5133 | /* Print out the zone ranges */ |
@@ -5089,6 +5211,181 @@ static int __init cmdline_parse_movablecore(char *p) | |||
5089 | early_param("kernelcore", cmdline_parse_kernelcore); | 5211 | early_param("kernelcore", cmdline_parse_kernelcore); |
5090 | early_param("movablecore", cmdline_parse_movablecore); | 5212 | early_param("movablecore", cmdline_parse_movablecore); |
5091 | 5213 | ||
5214 | /** | ||
5215 | * movablemem_map_overlap() - Check if a range overlaps movablemem_map.map[]. | ||
5216 | * @start_pfn: start pfn of the range to be checked | ||
5217 | * @end_pfn: end pfn of the range to be checked (exclusive) | ||
5218 | * | ||
5219 | * This function checks if a given memory range [start_pfn, end_pfn) overlaps | ||
5220 | * the movablemem_map.map[] array. | ||
5221 | * | ||
5222 | * Return: index of the first overlapping element in movablemem_map.map[], | ||
5223 | * or -1 if there is no overlap. | ||
5224 | */ | ||
5225 | int __init movablemem_map_overlap(unsigned long start_pfn, | ||
5226 | unsigned long end_pfn) | ||
5227 | { | ||
5228 | int overlap; | ||
5229 | |||
5230 | if (!movablemem_map.nr_map) | ||
5231 | return -1; | ||
5232 | |||
5233 | for (overlap = 0; overlap < movablemem_map.nr_map; overlap++) | ||
5234 | if (start_pfn < movablemem_map.map[overlap].end_pfn) | ||
5235 | break; | ||
5236 | |||
5237 | if (overlap == movablemem_map.nr_map || | ||
5238 | end_pfn <= movablemem_map.map[overlap].start_pfn) | ||
5239 | return -1; | ||
5240 | |||
5241 | return overlap; | ||
5242 | } | ||
5243 | |||
5244 | /** | ||
5245 | * insert_movablemem_map - Insert a memory range into movablemem_map.map. | ||
5246 | * @start_pfn: start pfn of the range | ||
5247 | * @end_pfn: end pfn of the range | ||
5248 | * | ||
5249 | * This function will also merge overlapping ranges, and keep the array | ||
5250 | * sorted by start_pfn in monotonically increasing order. | ||
5251 | */ | ||
5252 | void __init insert_movablemem_map(unsigned long start_pfn, | ||
5253 | unsigned long end_pfn) | ||
5254 | { | ||
5255 | int pos, overlap; | ||
5256 | |||
5257 | /* | ||
5258 | * pos will be at the 1st overlapped range, or the position | ||
5259 | * where the element should be inserted. | ||
5260 | */ | ||
5261 | for (pos = 0; pos < movablemem_map.nr_map; pos++) | ||
5262 | if (start_pfn <= movablemem_map.map[pos].end_pfn) | ||
5263 | break; | ||
5264 | |||
5265 | /* If there is no overlapped range, just insert the element. */ | ||
5266 | if (pos == movablemem_map.nr_map || | ||
5267 | end_pfn < movablemem_map.map[pos].start_pfn) { | ||
5268 | /* | ||
5269 | * If pos is not the end of array, we need to move all | ||
5270 | * the rest elements backward. | ||
5271 | */ | ||
5272 | if (pos < movablemem_map.nr_map) | ||
5273 | memmove(&movablemem_map.map[pos+1], | ||
5274 | &movablemem_map.map[pos], | ||
5275 | sizeof(struct movablemem_entry) * | ||
5276 | (movablemem_map.nr_map - pos)); | ||
5277 | movablemem_map.map[pos].start_pfn = start_pfn; | ||
5278 | movablemem_map.map[pos].end_pfn = end_pfn; | ||
5279 | movablemem_map.nr_map++; | ||
5280 | return; | ||
5281 | } | ||
5282 | |||
5283 | /* overlap will be at the last overlapped range */ | ||
5284 | for (overlap = pos + 1; overlap < movablemem_map.nr_map; overlap++) | ||
5285 | if (end_pfn < movablemem_map.map[overlap].start_pfn) | ||
5286 | break; | ||
5287 | |||
5288 | /* | ||
5289 | * If there are more ranges overlapped, we need to merge them, | ||
5290 | * and move the rest elements forward. | ||
5291 | */ | ||
5292 | overlap--; | ||
5293 | movablemem_map.map[pos].start_pfn = min(start_pfn, | ||
5294 | movablemem_map.map[pos].start_pfn); | ||
5295 | movablemem_map.map[pos].end_pfn = max(end_pfn, | ||
5296 | movablemem_map.map[overlap].end_pfn); | ||
5297 | |||
5298 | if (pos != overlap && overlap + 1 != movablemem_map.nr_map) | ||
5299 | memmove(&movablemem_map.map[pos+1], | ||
5300 | &movablemem_map.map[overlap+1], | ||
5301 | sizeof(struct movablemem_entry) * | ||
5302 | (movablemem_map.nr_map - overlap - 1)); | ||
5303 | |||
5304 | movablemem_map.nr_map -= overlap - pos; | ||
5305 | } | ||
5306 | |||
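insert_movablemem_map() keeps movablemem_map.map[] sorted by start_pfn and folds any touching or overlapping ranges into one entry. The same insert-and-merge idea, reproduced as a small standalone program over [start_pfn, end_pfn) intervals (array size and names here are illustrative, not the kernel's):

#include <stdio.h>
#include <string.h>

#define MAX_RANGES 32

struct pfn_range {
	unsigned long start_pfn;
	unsigned long end_pfn;	/* exclusive */
};

static struct pfn_range map[MAX_RANGES];
static int nr_map;

/* Insert [start, end), keeping the array sorted and merging overlaps. */
static void insert_range(unsigned long start, unsigned long end)
{
	int pos, last;

	/* pos: first range that ends at or after start */
	for (pos = 0; pos < nr_map; pos++)
		if (start <= map[pos].end_pfn)
			break;

	/* No overlap: shift the tail back and insert a new entry. */
	if (pos == nr_map || end < map[pos].start_pfn) {
		if (pos < nr_map)
			memmove(&map[pos + 1], &map[pos],
				sizeof(map[0]) * (nr_map - pos));
		map[pos].start_pfn = start;
		map[pos].end_pfn = end;
		nr_map++;
		return;
	}

	/* last: last range that still overlaps [start, end) */
	for (last = pos + 1; last < nr_map; last++)
		if (end < map[last].start_pfn)
			break;
	last--;

	/* Merge entries pos..last into one and close the gap. */
	if (map[pos].start_pfn < start)
		start = map[pos].start_pfn;
	if (map[last].end_pfn > end)
		end = map[last].end_pfn;
	map[pos].start_pfn = start;
	map[pos].end_pfn = end;

	if (last + 1 < nr_map)
		memmove(&map[pos + 1], &map[last + 1],
			sizeof(map[0]) * (nr_map - last - 1));
	nr_map -= last - pos;
}

int main(void)
{
	int i;

	insert_range(100, 200);
	insert_range(400, 500);
	insert_range(150, 450);	/* bridges and merges the two ranges above */

	for (i = 0; i < nr_map; i++)
		printf("[%lu, %lu)\n", map[i].start_pfn, map[i].end_pfn);
	return 0;	/* prints a single merged range: [100, 500) */
}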
5307 | /** | ||
5308 | * movablemem_map_add_region - Add a memory range into movablemem_map. | ||
5309 | * @start: physical start address of range | ||
5310 | * @size: size of the range in bytes | ||
5311 | * | ||
5312 | * This function transforms the physical address into a pfn, and then adds the | ||
5313 | * range into movablemem_map by calling insert_movablemem_map(). | ||
5314 | */ | ||
5315 | static void __init movablemem_map_add_region(u64 start, u64 size) | ||
5316 | { | ||
5317 | unsigned long start_pfn, end_pfn; | ||
5318 | |||
5319 | /* In case size == 0 or start + size overflows */ | ||
5320 | if (start + size <= start) | ||
5321 | return; | ||
5322 | |||
5323 | if (movablemem_map.nr_map >= ARRAY_SIZE(movablemem_map.map)) { | ||
5324 | pr_err("movablemem_map: too many entries;" | ||
5325 | " ignoring [mem %#010llx-%#010llx]\n", | ||
5326 | (unsigned long long) start, | ||
5327 | (unsigned long long) (start + size - 1)); | ||
5328 | return; | ||
5329 | } | ||
5330 | |||
5331 | start_pfn = PFN_DOWN(start); | ||
5332 | end_pfn = PFN_UP(start + size); | ||
5333 | insert_movablemem_map(start_pfn, end_pfn); | ||
5334 | } | ||
5335 | |||
5336 | /* | ||
5337 | * cmdline_parse_movablemem_map - Parse boot option movablemem_map. | ||
5338 | * @p: The boot option of the following format: | ||
5339 | * movablemem_map=nn[KMG]@ss[KMG] | ||
5340 | * | ||
5341 | * This option sets the memory range [ss, ss+nn) to be used as movable memory. | ||
5342 | * | ||
5343 | * Return: 0 on success or -EINVAL on failure. | ||
5344 | */ | ||
5345 | static int __init cmdline_parse_movablemem_map(char *p) | ||
5346 | { | ||
5347 | char *oldp; | ||
5348 | u64 start_at, mem_size; | ||
5349 | |||
5350 | if (!p) | ||
5351 | goto err; | ||
5352 | |||
5353 | if (!strcmp(p, "acpi")) | ||
5354 | movablemem_map.acpi = true; | ||
5355 | |||
5356 | /* | ||
5357 | * If the user decides to use info from the BIOS, all the other user-specified | ||
5358 | * ranges will be ignored. | ||
5359 | */ | ||
5360 | if (movablemem_map.acpi) { | ||
5361 | if (movablemem_map.nr_map) { | ||
5362 | memset(movablemem_map.map, 0, | ||
5363 | sizeof(struct movablemem_entry) | ||
5364 | * movablemem_map.nr_map); | ||
5365 | movablemem_map.nr_map = 0; | ||
5366 | } | ||
5367 | return 0; | ||
5368 | } | ||
5369 | |||
5370 | oldp = p; | ||
5371 | mem_size = memparse(p, &p); | ||
5372 | if (p == oldp) | ||
5373 | goto err; | ||
5374 | |||
5375 | if (*p == '@') { | ||
5376 | oldp = ++p; | ||
5377 | start_at = memparse(p, &p); | ||
5378 | if (p == oldp || *p != '\0') | ||
5379 | goto err; | ||
5380 | |||
5381 | movablemem_map_add_region(start_at, mem_size); | ||
5382 | return 0; | ||
5383 | } | ||
5384 | err: | ||
5385 | return -EINVAL; | ||
5386 | } | ||
5387 | early_param("movablemem_map", cmdline_parse_movablemem_map); | ||
5388 | |||
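For reference, the option documented above is parsed as movablemem_map=nn[KMG]@ss[KMG], where nn is the size of the movable range and ss its physical start address (movablemem_map=acpi instead defers to firmware information). A hedged userspace sketch of the nn@ss parsing, with a simplified stand-in for the kernel's memparse():

#include <stdio.h>
#include <stdlib.h>

/* Simplified stand-in for memparse(): a number with an optional K/M/G suffix,
 * returned in bytes. */
static unsigned long long parse_size(const char *s, char **end)
{
	unsigned long long v = strtoull(s, end, 0);

	switch (**end) {
	case 'G': case 'g': v <<= 30; (*end)++; break;
	case 'M': case 'm': v <<= 20; (*end)++; break;
	case 'K': case 'k': v <<= 10; (*end)++; break;
	}
	return v;
}

/* Parse "nn[KMG]@ss[KMG]" into a (start, size) pair, mirroring the flow of
 * cmdline_parse_movablemem_map() above. Returns 0 on success, -1 on error. */
static int parse_movablemem_map(const char *arg,
				unsigned long long *start,
				unsigned long long *size)
{
	char *p;

	*size = parse_size(arg, &p);
	if (p == arg || *p != '@')
		return -1;

	arg = ++p;
	*start = parse_size(arg, &p);
	if (p == arg || *p != '\0')
		return -1;
	return 0;
}

int main(void)
{
	unsigned long long start, size;

	/* e.g. booting with movablemem_map=4G@8G */
	if (!parse_movablemem_map("4G@8G", &start, &size))
		printf("movable range: start=%#llx size=%#llx\n", start, size);
	return 0;
}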
5092 | #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ | 5389 | #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ |
5093 | 5390 | ||
5094 | /** | 5391 | /** |
@@ -5171,8 +5468,8 @@ static void calculate_totalreserve_pages(void) | |||
5171 | /* we treat the high watermark as reserved pages. */ | 5468 | /* we treat the high watermark as reserved pages. */ |
5172 | max += high_wmark_pages(zone); | 5469 | max += high_wmark_pages(zone); |
5173 | 5470 | ||
5174 | if (max > zone->present_pages) | 5471 | if (max > zone->managed_pages) |
5175 | max = zone->present_pages; | 5472 | max = zone->managed_pages; |
5176 | reserve_pages += max; | 5473 | reserve_pages += max; |
5177 | /* | 5474 | /* |
5178 | * Lowmem reserves are not available to | 5475 | * Lowmem reserves are not available to |
@@ -5204,7 +5501,7 @@ static void setup_per_zone_lowmem_reserve(void) | |||
5204 | for_each_online_pgdat(pgdat) { | 5501 | for_each_online_pgdat(pgdat) { |
5205 | for (j = 0; j < MAX_NR_ZONES; j++) { | 5502 | for (j = 0; j < MAX_NR_ZONES; j++) { |
5206 | struct zone *zone = pgdat->node_zones + j; | 5503 | struct zone *zone = pgdat->node_zones + j; |
5207 | unsigned long present_pages = zone->present_pages; | 5504 | unsigned long managed_pages = zone->managed_pages; |
5208 | 5505 | ||
5209 | zone->lowmem_reserve[j] = 0; | 5506 | zone->lowmem_reserve[j] = 0; |
5210 | 5507 | ||
@@ -5218,9 +5515,9 @@ static void setup_per_zone_lowmem_reserve(void) | |||
5218 | sysctl_lowmem_reserve_ratio[idx] = 1; | 5515 | sysctl_lowmem_reserve_ratio[idx] = 1; |
5219 | 5516 | ||
5220 | lower_zone = pgdat->node_zones + idx; | 5517 | lower_zone = pgdat->node_zones + idx; |
5221 | lower_zone->lowmem_reserve[j] = present_pages / | 5518 | lower_zone->lowmem_reserve[j] = managed_pages / |
5222 | sysctl_lowmem_reserve_ratio[idx]; | 5519 | sysctl_lowmem_reserve_ratio[idx]; |
5223 | present_pages += lower_zone->present_pages; | 5520 | managed_pages += lower_zone->managed_pages; |
5224 | } | 5521 | } |
5225 | } | 5522 | } |
5226 | } | 5523 | } |
@@ -5239,14 +5536,14 @@ static void __setup_per_zone_wmarks(void) | |||
5239 | /* Calculate total number of !ZONE_HIGHMEM pages */ | 5536 | /* Calculate total number of !ZONE_HIGHMEM pages */ |
5240 | for_each_zone(zone) { | 5537 | for_each_zone(zone) { |
5241 | if (!is_highmem(zone)) | 5538 | if (!is_highmem(zone)) |
5242 | lowmem_pages += zone->present_pages; | 5539 | lowmem_pages += zone->managed_pages; |
5243 | } | 5540 | } |
5244 | 5541 | ||
5245 | for_each_zone(zone) { | 5542 | for_each_zone(zone) { |
5246 | u64 tmp; | 5543 | u64 tmp; |
5247 | 5544 | ||
5248 | spin_lock_irqsave(&zone->lock, flags); | 5545 | spin_lock_irqsave(&zone->lock, flags); |
5249 | tmp = (u64)pages_min * zone->present_pages; | 5546 | tmp = (u64)pages_min * zone->managed_pages; |
5250 | do_div(tmp, lowmem_pages); | 5547 | do_div(tmp, lowmem_pages); |
5251 | if (is_highmem(zone)) { | 5548 | if (is_highmem(zone)) { |
5252 | /* | 5549 | /* |
@@ -5258,13 +5555,10 @@ static void __setup_per_zone_wmarks(void) | |||
5258 | * deltas controls asynch page reclaim, and so should | 5555 | * deltas controls asynch page reclaim, and so should |
5259 | * not be capped for highmem. | 5556 | * not be capped for highmem. |
5260 | */ | 5557 | */ |
5261 | int min_pages; | 5558 | unsigned long min_pages; |
5262 | 5559 | ||
5263 | min_pages = zone->present_pages / 1024; | 5560 | min_pages = zone->managed_pages / 1024; |
5264 | if (min_pages < SWAP_CLUSTER_MAX) | 5561 | min_pages = clamp(min_pages, SWAP_CLUSTER_MAX, 128UL); |
5265 | min_pages = SWAP_CLUSTER_MAX; | ||
5266 | if (min_pages > 128) | ||
5267 | min_pages = 128; | ||
5268 | zone->watermark[WMARK_MIN] = min_pages; | 5562 | zone->watermark[WMARK_MIN] = min_pages; |
5269 | } else { | 5563 | } else { |
5270 | /* | 5564 | /* |
@@ -5325,7 +5619,7 @@ static void __meminit calculate_zone_inactive_ratio(struct zone *zone) | |||
5325 | unsigned int gb, ratio; | 5619 | unsigned int gb, ratio; |
5326 | 5620 | ||
5327 | /* Zone size in gigabytes */ | 5621 | /* Zone size in gigabytes */ |
5328 | gb = zone->present_pages >> (30 - PAGE_SHIFT); | 5622 | gb = zone->managed_pages >> (30 - PAGE_SHIFT); |
5329 | if (gb) | 5623 | if (gb) |
5330 | ratio = int_sqrt(10 * gb); | 5624 | ratio = int_sqrt(10 * gb); |
5331 | else | 5625 | else |
@@ -5411,7 +5705,7 @@ int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write, | |||
5411 | return rc; | 5705 | return rc; |
5412 | 5706 | ||
5413 | for_each_zone(zone) | 5707 | for_each_zone(zone) |
5414 | zone->min_unmapped_pages = (zone->present_pages * | 5708 | zone->min_unmapped_pages = (zone->managed_pages * |
5415 | sysctl_min_unmapped_ratio) / 100; | 5709 | sysctl_min_unmapped_ratio) / 100; |
5416 | return 0; | 5710 | return 0; |
5417 | } | 5711 | } |
@@ -5427,7 +5721,7 @@ int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write, | |||
5427 | return rc; | 5721 | return rc; |
5428 | 5722 | ||
5429 | for_each_zone(zone) | 5723 | for_each_zone(zone) |
5430 | zone->min_slab_pages = (zone->present_pages * | 5724 | zone->min_slab_pages = (zone->managed_pages * |
5431 | sysctl_min_slab_ratio) / 100; | 5725 | sysctl_min_slab_ratio) / 100; |
5432 | return 0; | 5726 | return 0; |
5433 | } | 5727 | } |
@@ -5469,7 +5763,7 @@ int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write, | |||
5469 | for_each_populated_zone(zone) { | 5763 | for_each_populated_zone(zone) { |
5470 | for_each_possible_cpu(cpu) { | 5764 | for_each_possible_cpu(cpu) { |
5471 | unsigned long high; | 5765 | unsigned long high; |
5472 | high = zone->present_pages / percpu_pagelist_fraction; | 5766 | high = zone->managed_pages / percpu_pagelist_fraction; |
5473 | setup_pagelist_highmark( | 5767 | setup_pagelist_highmark( |
5474 | per_cpu_ptr(zone->pageset, cpu), high); | 5768 | per_cpu_ptr(zone->pageset, cpu), high); |
5475 | } | 5769 | } |
@@ -5604,7 +5898,7 @@ static inline int pfn_to_bitidx(struct zone *zone, unsigned long pfn) | |||
5604 | pfn &= (PAGES_PER_SECTION-1); | 5898 | pfn &= (PAGES_PER_SECTION-1); |
5605 | return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS; | 5899 | return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS; |
5606 | #else | 5900 | #else |
5607 | pfn = pfn - zone->zone_start_pfn; | 5901 | pfn = pfn - round_down(zone->zone_start_pfn, pageblock_nr_pages); |
5608 | return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS; | 5902 | return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS; |
5609 | #endif /* CONFIG_SPARSEMEM */ | 5903 | #endif /* CONFIG_SPARSEMEM */ |
5610 | } | 5904 | } |
@@ -5656,8 +5950,7 @@ void set_pageblock_flags_group(struct page *page, unsigned long flags, | |||
5656 | pfn = page_to_pfn(page); | 5950 | pfn = page_to_pfn(page); |
5657 | bitmap = get_pageblock_bitmap(zone, pfn); | 5951 | bitmap = get_pageblock_bitmap(zone, pfn); |
5658 | bitidx = pfn_to_bitidx(zone, pfn); | 5952 | bitidx = pfn_to_bitidx(zone, pfn); |
5659 | VM_BUG_ON(pfn < zone->zone_start_pfn); | 5953 | VM_BUG_ON(!zone_spans_pfn(zone, pfn)); |
5660 | VM_BUG_ON(pfn >= zone->zone_start_pfn + zone->spanned_pages); | ||
5661 | 5954 | ||
5662 | for (; start_bitidx <= end_bitidx; start_bitidx++, value <<= 1) | 5955 | for (; start_bitidx <= end_bitidx; start_bitidx++, value <<= 1) |
5663 | if (flags & value) | 5956 | if (flags & value) |
@@ -5755,8 +6048,7 @@ bool is_pageblock_removable_nolock(struct page *page) | |||
5755 | 6048 | ||
5756 | zone = page_zone(page); | 6049 | zone = page_zone(page); |
5757 | pfn = page_to_pfn(page); | 6050 | pfn = page_to_pfn(page); |
5758 | if (zone->zone_start_pfn > pfn || | 6051 | if (!zone_spans_pfn(zone, pfn)) |
5759 | zone->zone_start_pfn + zone->spanned_pages <= pfn) | ||
5760 | return false; | 6052 | return false; |
5761 | 6053 | ||
5762 | return !has_unmovable_pages(zone, page, 0, true); | 6054 | return !has_unmovable_pages(zone, page, 0, true); |
@@ -5812,14 +6104,14 @@ static int __alloc_contig_migrate_range(struct compact_control *cc, | |||
5812 | &cc->migratepages); | 6104 | &cc->migratepages); |
5813 | cc->nr_migratepages -= nr_reclaimed; | 6105 | cc->nr_migratepages -= nr_reclaimed; |
5814 | 6106 | ||
5815 | ret = migrate_pages(&cc->migratepages, | 6107 | ret = migrate_pages(&cc->migratepages, alloc_migrate_target, |
5816 | alloc_migrate_target, | 6108 | 0, MIGRATE_SYNC, MR_CMA); |
5817 | 0, false, MIGRATE_SYNC, | ||
5818 | MR_CMA); | ||
5819 | } | 6109 | } |
5820 | 6110 | if (ret < 0) { | |
5821 | putback_movable_pages(&cc->migratepages); | 6111 | putback_movable_pages(&cc->migratepages); |
5822 | return ret > 0 ? 0 : ret; | 6112 | return ret; |
6113 | } | ||
6114 | return 0; | ||
5823 | } | 6115 | } |
5824 | 6116 | ||
5825 | /** | 6117 | /** |
diff --git a/mm/rmap.c b/mm/rmap.c --- a/mm/rmap.c +++ b/mm/rmap.c | |||
@@ -105,7 +105,7 @@ static inline void anon_vma_free(struct anon_vma *anon_vma) | |||
105 | */ | 105 | */ |
106 | if (rwsem_is_locked(&anon_vma->root->rwsem)) { | 106 | if (rwsem_is_locked(&anon_vma->root->rwsem)) { |
107 | anon_vma_lock_write(anon_vma); | 107 | anon_vma_lock_write(anon_vma); |
108 | anon_vma_unlock(anon_vma); | 108 | anon_vma_unlock_write(anon_vma); |
109 | } | 109 | } |
110 | 110 | ||
111 | kmem_cache_free(anon_vma_cachep, anon_vma); | 111 | kmem_cache_free(anon_vma_cachep, anon_vma); |
@@ -191,7 +191,7 @@ int anon_vma_prepare(struct vm_area_struct *vma) | |||
191 | avc = NULL; | 191 | avc = NULL; |
192 | } | 192 | } |
193 | spin_unlock(&mm->page_table_lock); | 193 | spin_unlock(&mm->page_table_lock); |
194 | anon_vma_unlock(anon_vma); | 194 | anon_vma_unlock_write(anon_vma); |
195 | 195 | ||
196 | if (unlikely(allocated)) | 196 | if (unlikely(allocated)) |
197 | put_anon_vma(allocated); | 197 | put_anon_vma(allocated); |
@@ -308,7 +308,7 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma) | |||
308 | vma->anon_vma = anon_vma; | 308 | vma->anon_vma = anon_vma; |
309 | anon_vma_lock_write(anon_vma); | 309 | anon_vma_lock_write(anon_vma); |
310 | anon_vma_chain_link(vma, avc, anon_vma); | 310 | anon_vma_chain_link(vma, avc, anon_vma); |
311 | anon_vma_unlock(anon_vma); | 311 | anon_vma_unlock_write(anon_vma); |
312 | 312 | ||
313 | return 0; | 313 | return 0; |
314 | 314 | ||
@@ -1126,7 +1126,6 @@ void page_add_file_rmap(struct page *page) | |||
1126 | */ | 1126 | */ |
1127 | void page_remove_rmap(struct page *page) | 1127 | void page_remove_rmap(struct page *page) |
1128 | { | 1128 | { |
1129 | struct address_space *mapping = page_mapping(page); | ||
1130 | bool anon = PageAnon(page); | 1129 | bool anon = PageAnon(page); |
1131 | bool locked; | 1130 | bool locked; |
1132 | unsigned long flags; | 1131 | unsigned long flags; |
@@ -1144,29 +1143,6 @@ void page_remove_rmap(struct page *page) | |||
1144 | goto out; | 1143 | goto out; |
1145 | 1144 | ||
1146 | /* | 1145 | /* |
1147 | * Now that the last pte has gone, s390 must transfer dirty | ||
1148 | * flag from storage key to struct page. We can usually skip | ||
1149 | * this if the page is anon, so about to be freed; but perhaps | ||
1150 | * not if it's in swapcache - there might be another pte slot | ||
1151 | * containing the swap entry, but page not yet written to swap. | ||
1152 | * | ||
1153 | * And we can skip it on file pages, so long as the filesystem | ||
1154 | * participates in dirty tracking (note that this is not only an | ||
1155 | * optimization but also solves problems caused by dirty flag in | ||
1156 | * storage key getting set by a write from inside kernel); but need to | ||
1157 | * catch shm and tmpfs and ramfs pages which have been modified since | ||
1158 | * creation by read fault. | ||
1159 | * | ||
1160 | * Note that mapping must be decided above, before decrementing | ||
1161 | * mapcount (which luckily provides a barrier): once page is unmapped, | ||
1162 | * it could be truncated and page->mapping reset to NULL at any moment. | ||
1163 | * Note also that we are relying on page_mapping(page) to set mapping | ||
1164 | * to &swapper_space when PageSwapCache(page). | ||
1165 | */ | ||
1166 | if (mapping && !mapping_cap_account_dirty(mapping) && | ||
1167 | page_test_and_clear_dirty(page_to_pfn(page), 1)) | ||
1168 | set_page_dirty(page); | ||
1169 | /* | ||
1170 | * Hugepages are not counted in NR_ANON_PAGES nor NR_FILE_MAPPED | 1146 | * Hugepages are not counted in NR_ANON_PAGES nor NR_FILE_MAPPED |
1171 | * and not charged by memcg for now. | 1147 | * and not charged by memcg for now. |
1172 | */ | 1148 | */ |
diff --git a/mm/shmem.c b/mm/shmem.c index 5dd56f6efdbd..ed2befb4952e 100644 --- a/mm/shmem.c +++ b/mm/shmem.c | |||
@@ -335,19 +335,19 @@ static unsigned shmem_find_get_pages_and_swap(struct address_space *mapping, | |||
335 | pgoff_t start, unsigned int nr_pages, | 335 | pgoff_t start, unsigned int nr_pages, |
336 | struct page **pages, pgoff_t *indices) | 336 | struct page **pages, pgoff_t *indices) |
337 | { | 337 | { |
338 | unsigned int i; | 338 | void **slot; |
339 | unsigned int ret; | 339 | unsigned int ret = 0; |
340 | unsigned int nr_found; | 340 | struct radix_tree_iter iter; |
341 | |||
342 | if (!nr_pages) | ||
343 | return 0; | ||
341 | 344 | ||
342 | rcu_read_lock(); | 345 | rcu_read_lock(); |
343 | restart: | 346 | restart: |
344 | nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree, | 347 | radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) { |
345 | (void ***)pages, indices, start, nr_pages); | ||
346 | ret = 0; | ||
347 | for (i = 0; i < nr_found; i++) { | ||
348 | struct page *page; | 348 | struct page *page; |
349 | repeat: | 349 | repeat: |
350 | page = radix_tree_deref_slot((void **)pages[i]); | 350 | page = radix_tree_deref_slot(slot); |
351 | if (unlikely(!page)) | 351 | if (unlikely(!page)) |
352 | continue; | 352 | continue; |
353 | if (radix_tree_exception(page)) { | 353 | if (radix_tree_exception(page)) { |
@@ -364,17 +364,16 @@ repeat: | |||
364 | goto repeat; | 364 | goto repeat; |
365 | 365 | ||
366 | /* Has the page moved? */ | 366 | /* Has the page moved? */ |
367 | if (unlikely(page != *((void **)pages[i]))) { | 367 | if (unlikely(page != *slot)) { |
368 | page_cache_release(page); | 368 | page_cache_release(page); |
369 | goto repeat; | 369 | goto repeat; |
370 | } | 370 | } |
371 | export: | 371 | export: |
372 | indices[ret] = indices[i]; | 372 | indices[ret] = iter.index; |
373 | pages[ret] = page; | 373 | pages[ret] = page; |
374 | ret++; | 374 | if (++ret == nr_pages) |
375 | break; | ||
375 | } | 376 | } |
376 | if (unlikely(!ret && nr_found)) | ||
377 | goto restart; | ||
378 | rcu_read_unlock(); | 377 | rcu_read_unlock(); |
379 | return ret; | 378 | return ret; |
380 | } | 379 | } |
@@ -1295,7 +1294,7 @@ unlock: | |||
1295 | 1294 | ||
1296 | static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf) | 1295 | static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf) |
1297 | { | 1296 | { |
1298 | struct inode *inode = vma->vm_file->f_path.dentry->d_inode; | 1297 | struct inode *inode = file_inode(vma->vm_file); |
1299 | int error; | 1298 | int error; |
1300 | int ret = VM_FAULT_LOCKED; | 1299 | int ret = VM_FAULT_LOCKED; |
1301 | 1300 | ||
@@ -1313,14 +1312,14 @@ static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf) | |||
1313 | #ifdef CONFIG_NUMA | 1312 | #ifdef CONFIG_NUMA |
1314 | static int shmem_set_policy(struct vm_area_struct *vma, struct mempolicy *mpol) | 1313 | static int shmem_set_policy(struct vm_area_struct *vma, struct mempolicy *mpol) |
1315 | { | 1314 | { |
1316 | struct inode *inode = vma->vm_file->f_path.dentry->d_inode; | 1315 | struct inode *inode = file_inode(vma->vm_file); |
1317 | return mpol_set_shared_policy(&SHMEM_I(inode)->policy, vma, mpol); | 1316 | return mpol_set_shared_policy(&SHMEM_I(inode)->policy, vma, mpol); |
1318 | } | 1317 | } |
1319 | 1318 | ||
1320 | static struct mempolicy *shmem_get_policy(struct vm_area_struct *vma, | 1319 | static struct mempolicy *shmem_get_policy(struct vm_area_struct *vma, |
1321 | unsigned long addr) | 1320 | unsigned long addr) |
1322 | { | 1321 | { |
1323 | struct inode *inode = vma->vm_file->f_path.dentry->d_inode; | 1322 | struct inode *inode = file_inode(vma->vm_file); |
1324 | pgoff_t index; | 1323 | pgoff_t index; |
1325 | 1324 | ||
1326 | index = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; | 1325 | index = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; |
@@ -1330,7 +1329,7 @@ static struct mempolicy *shmem_get_policy(struct vm_area_struct *vma, | |||
1330 | 1329 | ||
1331 | int shmem_lock(struct file *file, int lock, struct user_struct *user) | 1330 | int shmem_lock(struct file *file, int lock, struct user_struct *user) |
1332 | { | 1331 | { |
1333 | struct inode *inode = file->f_path.dentry->d_inode; | 1332 | struct inode *inode = file_inode(file); |
1334 | struct shmem_inode_info *info = SHMEM_I(inode); | 1333 | struct shmem_inode_info *info = SHMEM_I(inode); |
1335 | int retval = -ENOMEM; | 1334 | int retval = -ENOMEM; |
1336 | 1335 | ||
@@ -1465,7 +1464,7 @@ shmem_write_end(struct file *file, struct address_space *mapping, | |||
1465 | 1464 | ||
1466 | static void do_shmem_file_read(struct file *filp, loff_t *ppos, read_descriptor_t *desc, read_actor_t actor) | 1465 | static void do_shmem_file_read(struct file *filp, loff_t *ppos, read_descriptor_t *desc, read_actor_t actor) |
1467 | { | 1466 | { |
1468 | struct inode *inode = filp->f_path.dentry->d_inode; | 1467 | struct inode *inode = file_inode(filp); |
1469 | struct address_space *mapping = inode->i_mapping; | 1468 | struct address_space *mapping = inode->i_mapping; |
1470 | pgoff_t index; | 1469 | pgoff_t index; |
1471 | unsigned long offset; | 1470 | unsigned long offset; |
@@ -1808,7 +1807,7 @@ static loff_t shmem_file_llseek(struct file *file, loff_t offset, int whence) | |||
1808 | static long shmem_fallocate(struct file *file, int mode, loff_t offset, | 1807 | static long shmem_fallocate(struct file *file, int mode, loff_t offset, |
1809 | loff_t len) | 1808 | loff_t len) |
1810 | { | 1809 | { |
1811 | struct inode *inode = file->f_path.dentry->d_inode; | 1810 | struct inode *inode = file_inode(file); |
1812 | struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); | 1811 | struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); |
1813 | struct shmem_falloc shmem_falloc; | 1812 | struct shmem_falloc shmem_falloc; |
1814 | pgoff_t start, index, end; | 1813 | pgoff_t start, index, end; |
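The one-line conversions above (shmem_fault, shmem_set_policy, shmem_get_policy, shmem_lock, do_shmem_file_read, shmem_fallocate) replace the open-coded vma->vm_file->f_path.dentry->d_inode and file->f_path.dentry->d_inode chains with the new file_inode() helper. For reference, the helper added to include/linux/fs.h in this merge window simply returns the inode pointer cached in struct file; it is not part of the hunks shown here and is reproduced from memory, so treat the exact form as illustrative:

    static inline struct inode *file_inode(const struct file *f)
    {
        return f->f_inode;
    }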
@@ -2351,7 +2350,7 @@ static int shmem_encode_fh(struct inode *inode, __u32 *fh, int *len, | |||
2351 | { | 2350 | { |
2352 | if (*len < 3) { | 2351 | if (*len < 3) { |
2353 | *len = 3; | 2352 | *len = 3; |
2354 | return 255; | 2353 | return FILEID_INVALID; |
2355 | } | 2354 | } |
2356 | 2355 | ||
2357 | if (inode_unhashed(inode)) { | 2356 | if (inode_unhashed(inode)) { |
@@ -2386,6 +2385,7 @@ static int shmem_parse_options(char *options, struct shmem_sb_info *sbinfo, | |||
2386 | bool remount) | 2385 | bool remount) |
2387 | { | 2386 | { |
2388 | char *this_char, *value, *rest; | 2387 | char *this_char, *value, *rest; |
2388 | struct mempolicy *mpol = NULL; | ||
2389 | uid_t uid; | 2389 | uid_t uid; |
2390 | gid_t gid; | 2390 | gid_t gid; |
2391 | 2391 | ||
@@ -2414,7 +2414,7 @@ static int shmem_parse_options(char *options, struct shmem_sb_info *sbinfo, | |||
2414 | printk(KERN_ERR | 2414 | printk(KERN_ERR |
2415 | "tmpfs: No value for mount option '%s'\n", | 2415 | "tmpfs: No value for mount option '%s'\n", |
2416 | this_char); | 2416 | this_char); |
2417 | return 1; | 2417 | goto error; |
2418 | } | 2418 | } |
2419 | 2419 | ||
2420 | if (!strcmp(this_char,"size")) { | 2420 | if (!strcmp(this_char,"size")) { |
@@ -2463,19 +2463,24 @@ static int shmem_parse_options(char *options, struct shmem_sb_info *sbinfo, | |||
2463 | if (!gid_valid(sbinfo->gid)) | 2463 | if (!gid_valid(sbinfo->gid)) |
2464 | goto bad_val; | 2464 | goto bad_val; |
2465 | } else if (!strcmp(this_char,"mpol")) { | 2465 | } else if (!strcmp(this_char,"mpol")) { |
2466 | if (mpol_parse_str(value, &sbinfo->mpol)) | 2466 | mpol_put(mpol); |
2467 | mpol = NULL; | ||
2468 | if (mpol_parse_str(value, &mpol)) | ||
2467 | goto bad_val; | 2469 | goto bad_val; |
2468 | } else { | 2470 | } else { |
2469 | printk(KERN_ERR "tmpfs: Bad mount option %s\n", | 2471 | printk(KERN_ERR "tmpfs: Bad mount option %s\n", |
2470 | this_char); | 2472 | this_char); |
2471 | return 1; | 2473 | goto error; |
2472 | } | 2474 | } |
2473 | } | 2475 | } |
2476 | sbinfo->mpol = mpol; | ||
2474 | return 0; | 2477 | return 0; |
2475 | 2478 | ||
2476 | bad_val: | 2479 | bad_val: |
2477 | printk(KERN_ERR "tmpfs: Bad value '%s' for mount option '%s'\n", | 2480 | printk(KERN_ERR "tmpfs: Bad value '%s' for mount option '%s'\n", |
2478 | value, this_char); | 2481 | value, this_char); |
2482 | error: | ||
2483 | mpol_put(mpol); | ||
2479 | return 1; | 2484 | return 1; |
2480 | 2485 | ||
2481 | } | 2486 | } |
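shmem_parse_options() now parses the mempolicy into a local mpol, drops any earlier value when the option is repeated, publishes it to sbinfo->mpol only after every option has parsed, and releases it on the shared error path, so a later bad option no longer leaks the policy. A runnable userspace analogue of that parse-into-local / commit-on-success pattern (a stripped-down parser accepting only "mpol=..."; all names here are illustrative, not kernel code):

    #include <stdlib.h>
    #include <string.h>

    struct policy { char name[32]; };

    static int parse_options(const char *opts, struct policy **out)
    {
        struct policy *pol = NULL;
        char *dup = strdup(opts), *tok, *save;

        if (!dup)
            return -1;
        for (tok = strtok_r(dup, ",", &save); tok; tok = strtok_r(NULL, ",", &save)) {
            if (!strncmp(tok, "mpol=", 5)) {
                free(pol);                      /* a repeated mpol= wins: drop the earlier one */
                pol = calloc(1, sizeof(*pol));
                if (!pol)
                    goto error;
                strncpy(pol->name, tok + 5, sizeof(pol->name) - 1);
            } else {
                goto error;                     /* unknown option: nothing was published */
            }
        }
        free(dup);
        *out = pol;                             /* transfer ownership only on full success */
        return 0;
    error:
        free(pol);                              /* no leak on any failing path */
        free(dup);
        return -1;
    }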
@@ -2487,6 +2492,7 @@ static int shmem_remount_fs(struct super_block *sb, int *flags, char *data) | |||
2487 | unsigned long inodes; | 2492 | unsigned long inodes; |
2488 | int error = -EINVAL; | 2493 | int error = -EINVAL; |
2489 | 2494 | ||
2495 | config.mpol = NULL; | ||
2490 | if (shmem_parse_options(data, &config, true)) | 2496 | if (shmem_parse_options(data, &config, true)) |
2491 | return error; | 2497 | return error; |
2492 | 2498 | ||
@@ -2511,8 +2517,13 @@ static int shmem_remount_fs(struct super_block *sb, int *flags, char *data) | |||
2511 | sbinfo->max_inodes = config.max_inodes; | 2517 | sbinfo->max_inodes = config.max_inodes; |
2512 | sbinfo->free_inodes = config.max_inodes - inodes; | 2518 | sbinfo->free_inodes = config.max_inodes - inodes; |
2513 | 2519 | ||
2514 | mpol_put(sbinfo->mpol); | 2520 | /* |
2515 | sbinfo->mpol = config.mpol; /* transfers initial ref */ | 2521 | * Preserve previous mempolicy unless mpol remount option was specified. |
2522 | */ | ||
2523 | if (config.mpol) { | ||
2524 | mpol_put(sbinfo->mpol); | ||
2525 | sbinfo->mpol = config.mpol; /* transfers initial ref */ | ||
2526 | } | ||
2516 | out: | 2527 | out: |
2517 | spin_unlock(&sbinfo->stat_lock); | 2528 | spin_unlock(&sbinfo->stat_lock); |
2518 | return error; | 2529 | return error; |
@@ -2545,6 +2556,7 @@ static void shmem_put_super(struct super_block *sb) | |||
2545 | struct shmem_sb_info *sbinfo = SHMEM_SB(sb); | 2556 | struct shmem_sb_info *sbinfo = SHMEM_SB(sb); |
2546 | 2557 | ||
2547 | percpu_counter_destroy(&sbinfo->used_blocks); | 2558 | percpu_counter_destroy(&sbinfo->used_blocks); |
2559 | mpol_put(sbinfo->mpol); | ||
2548 | kfree(sbinfo); | 2560 | kfree(sbinfo); |
2549 | sb->s_fs_info = NULL; | 2561 | sb->s_fs_info = NULL; |
2550 | } | 2562 | } |
@@ -2766,6 +2778,7 @@ static struct file_system_type shmem_fs_type = { | |||
2766 | .name = "tmpfs", | 2778 | .name = "tmpfs", |
2767 | .mount = shmem_mount, | 2779 | .mount = shmem_mount, |
2768 | .kill_sb = kill_litter_super, | 2780 | .kill_sb = kill_litter_super, |
2781 | .fs_flags = FS_USERNS_MOUNT, | ||
2769 | }; | 2782 | }; |
2770 | 2783 | ||
2771 | int __init shmem_init(void) | 2784 | int __init shmem_init(void) |
@@ -2823,6 +2836,7 @@ static struct file_system_type shmem_fs_type = { | |||
2823 | .name = "tmpfs", | 2836 | .name = "tmpfs", |
2824 | .mount = ramfs_mount, | 2837 | .mount = ramfs_mount, |
2825 | .kill_sb = kill_litter_super, | 2838 | .kill_sb = kill_litter_super, |
2839 | .fs_flags = FS_USERNS_MOUNT, | ||
2826 | }; | 2840 | }; |
2827 | 2841 | ||
2828 | int __init shmem_init(void) | 2842 | int __init shmem_init(void) |
@@ -2865,6 +2879,16 @@ EXPORT_SYMBOL_GPL(shmem_truncate_range); | |||
2865 | 2879 | ||
2866 | /* common code */ | 2880 | /* common code */ |
2867 | 2881 | ||
2882 | static char *shmem_dname(struct dentry *dentry, char *buffer, int buflen) | ||
2883 | { | ||
2884 | return dynamic_dname(dentry, buffer, buflen, "/%s (deleted)", | ||
2885 | dentry->d_name.name); | ||
2886 | } | ||
2887 | |||
2888 | static struct dentry_operations anon_ops = { | ||
2889 | .d_dname = shmem_dname | ||
2890 | }; | ||
2891 | |||
2868 | /** | 2892 | /** |
2869 | * shmem_file_setup - get an unlinked file living in tmpfs | 2893 | * shmem_file_setup - get an unlinked file living in tmpfs |
2870 | * @name: name for dentry (to be seen in /proc/<pid>/maps | 2894 | * @name: name for dentry (to be seen in /proc/<pid>/maps |
@@ -2873,15 +2897,14 @@ EXPORT_SYMBOL_GPL(shmem_truncate_range); | |||
2873 | */ | 2897 | */ |
2874 | struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags) | 2898 | struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags) |
2875 | { | 2899 | { |
2876 | int error; | 2900 | struct file *res; |
2877 | struct file *file; | ||
2878 | struct inode *inode; | 2901 | struct inode *inode; |
2879 | struct path path; | 2902 | struct path path; |
2880 | struct dentry *root; | 2903 | struct super_block *sb; |
2881 | struct qstr this; | 2904 | struct qstr this; |
2882 | 2905 | ||
2883 | if (IS_ERR(shm_mnt)) | 2906 | if (IS_ERR(shm_mnt)) |
2884 | return (void *)shm_mnt; | 2907 | return ERR_CAST(shm_mnt); |
2885 | 2908 | ||
2886 | if (size < 0 || size > MAX_LFS_FILESIZE) | 2909 | if (size < 0 || size > MAX_LFS_FILESIZE) |
2887 | return ERR_PTR(-EINVAL); | 2910 | return ERR_PTR(-EINVAL); |
@@ -2889,18 +2912,19 @@ struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags | |||
2889 | if (shmem_acct_size(flags, size)) | 2912 | if (shmem_acct_size(flags, size)) |
2890 | return ERR_PTR(-ENOMEM); | 2913 | return ERR_PTR(-ENOMEM); |
2891 | 2914 | ||
2892 | error = -ENOMEM; | 2915 | res = ERR_PTR(-ENOMEM); |
2893 | this.name = name; | 2916 | this.name = name; |
2894 | this.len = strlen(name); | 2917 | this.len = strlen(name); |
2895 | this.hash = 0; /* will go */ | 2918 | this.hash = 0; /* will go */ |
2896 | root = shm_mnt->mnt_root; | 2919 | sb = shm_mnt->mnt_sb; |
2897 | path.dentry = d_alloc(root, &this); | 2920 | path.dentry = d_alloc_pseudo(sb, &this); |
2898 | if (!path.dentry) | 2921 | if (!path.dentry) |
2899 | goto put_memory; | 2922 | goto put_memory; |
2923 | d_set_d_op(path.dentry, &anon_ops); | ||
2900 | path.mnt = mntget(shm_mnt); | 2924 | path.mnt = mntget(shm_mnt); |
2901 | 2925 | ||
2902 | error = -ENOSPC; | 2926 | res = ERR_PTR(-ENOSPC); |
2903 | inode = shmem_get_inode(root->d_sb, NULL, S_IFREG | S_IRWXUGO, 0, flags); | 2927 | inode = shmem_get_inode(sb, NULL, S_IFREG | S_IRWXUGO, 0, flags); |
2904 | if (!inode) | 2928 | if (!inode) |
2905 | goto put_dentry; | 2929 | goto put_dentry; |
2906 | 2930 | ||
@@ -2909,23 +2933,23 @@ struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags | |||
2909 | clear_nlink(inode); /* It is unlinked */ | 2933 | clear_nlink(inode); /* It is unlinked */ |
2910 | #ifndef CONFIG_MMU | 2934 | #ifndef CONFIG_MMU |
2911 | error = ramfs_nommu_expand_for_mapping(inode, size); | 2935 | error = ramfs_nommu_expand_for_mapping(inode, size); |
2936 | res = ERR_PTR(error); | ||
2912 | if (error) | 2937 | if (error) |
2913 | goto put_dentry; | 2938 | goto put_dentry; |
2914 | #endif | 2939 | #endif |
2915 | 2940 | ||
2916 | error = -ENFILE; | 2941 | res = alloc_file(&path, FMODE_WRITE | FMODE_READ, |
2917 | file = alloc_file(&path, FMODE_WRITE | FMODE_READ, | ||
2918 | &shmem_file_operations); | 2942 | &shmem_file_operations); |
2919 | if (!file) | 2943 | if (IS_ERR(res)) |
2920 | goto put_dentry; | 2944 | goto put_dentry; |
2921 | 2945 | ||
2922 | return file; | 2946 | return res; |
2923 | 2947 | ||
2924 | put_dentry: | 2948 | put_dentry: |
2925 | path_put(&path); | 2949 | path_put(&path); |
2926 | put_memory: | 2950 | put_memory: |
2927 | shmem_unacct_size(flags, size); | 2951 | shmem_unacct_size(flags, size); |
2928 | return ERR_PTR(error); | 2952 | return res; |
2929 | } | 2953 | } |
2930 | EXPORT_SYMBOL_GPL(shmem_file_setup); | 2954 | EXPORT_SYMBOL_GPL(shmem_file_setup); |
2931 | 2955 | ||
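shmem_file_setup() now reports failure purely through ERR_PTR-encoded pointers: a bad shm_mnt is forwarded with ERR_CAST(), internal failures are encoded into res with ERR_PTR(-ENOMEM)/ERR_PTR(-ENOSPC), and alloc_file() errors are already IS_ERR values, so the old int error variable disappears. A condensed sketch of that linux/err.h convention (the helper name and the failing check are made up for illustration):

    static struct file *setup_or_err(struct vfsmount *mnt, struct path *path)
    {
        struct file *file;

        if (IS_ERR(mnt))
            return ERR_CAST(mnt);               /* keep mnt's errno, change the pointer type */

        if (accounting_fails())                 /* hypothetical failure point */
            return ERR_PTR(-ENOMEM);            /* encode the errno in the pointer itself */

        file = alloc_file(path, FMODE_WRITE | FMODE_READ, &shmem_file_operations);
        return file;                            /* alloc_file() already returns an ERR_PTR on failure */
    }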
diff --git a/mm/slab.c b/mm/slab.c --- a/mm/slab.c +++ b/mm/slab.c | |||
@@ -812,7 +812,7 @@ static void __slab_error(const char *function, struct kmem_cache *cachep, | |||
812 | printk(KERN_ERR "slab error in %s(): cache `%s': %s\n", | 812 | printk(KERN_ERR "slab error in %s(): cache `%s': %s\n", |
813 | function, cachep->name, msg); | 813 | function, cachep->name, msg); |
814 | dump_stack(); | 814 | dump_stack(); |
815 | add_taint(TAINT_BAD_PAGE); | 815 | add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); |
816 | } | 816 | } |
817 | #endif | 817 | #endif |
818 | 818 | ||
diff --git a/mm/slob.c b/mm/slob.c --- a/mm/slob.c +++ b/mm/slob.c | |||
@@ -360,7 +360,7 @@ static void slob_free(void *block, int size) | |||
360 | clear_slob_page_free(sp); | 360 | clear_slob_page_free(sp); |
361 | spin_unlock_irqrestore(&slob_lock, flags); | 361 | spin_unlock_irqrestore(&slob_lock, flags); |
362 | __ClearPageSlab(sp); | 362 | __ClearPageSlab(sp); |
363 | reset_page_mapcount(sp); | 363 | page_mapcount_reset(sp); |
364 | slob_free_pages(b, 0); | 364 | slob_free_pages(b, 0); |
365 | return; | 365 | return; |
366 | } | 366 | } |
diff --git a/mm/slub.c b/mm/slub.c --- a/mm/slub.c +++ b/mm/slub.c | |||
@@ -562,7 +562,7 @@ static void slab_bug(struct kmem_cache *s, char *fmt, ...) | |||
562 | printk(KERN_ERR "----------------------------------------" | 562 | printk(KERN_ERR "----------------------------------------" |
563 | "-------------------------------------\n\n"); | 563 | "-------------------------------------\n\n"); |
564 | 564 | ||
565 | add_taint(TAINT_BAD_PAGE); | 565 | add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); |
566 | } | 566 | } |
567 | 567 | ||
568 | static void slab_fix(struct kmem_cache *s, char *fmt, ...) | 568 | static void slab_fix(struct kmem_cache *s, char *fmt, ...) |
@@ -1408,7 +1408,7 @@ static void __free_slab(struct kmem_cache *s, struct page *page) | |||
1408 | __ClearPageSlab(page); | 1408 | __ClearPageSlab(page); |
1409 | 1409 | ||
1410 | memcg_release_pages(s, order); | 1410 | memcg_release_pages(s, order); |
1411 | reset_page_mapcount(page); | 1411 | page_mapcount_reset(page); |
1412 | if (current->reclaim_state) | 1412 | if (current->reclaim_state) |
1413 | current->reclaim_state->reclaimed_slab += pages; | 1413 | current->reclaim_state->reclaimed_slab += pages; |
1414 | __free_memcg_kmem_pages(page, order); | 1414 | __free_memcg_kmem_pages(page, order); |
diff --git a/mm/sparse.c b/mm/sparse.c index 6b5fb762e2ca..7ca6dc847947 100644 --- a/mm/sparse.c +++ b/mm/sparse.c | |||
@@ -615,10 +615,11 @@ static inline struct page *kmalloc_section_memmap(unsigned long pnum, int nid, | |||
615 | } | 615 | } |
616 | static void __kfree_section_memmap(struct page *memmap, unsigned long nr_pages) | 616 | static void __kfree_section_memmap(struct page *memmap, unsigned long nr_pages) |
617 | { | 617 | { |
618 | return; /* XXX: Not implemented yet */ | 618 | vmemmap_free(memmap, nr_pages); |
619 | } | 619 | } |
620 | static void free_map_bootmem(struct page *memmap, unsigned long nr_pages) | 620 | static void free_map_bootmem(struct page *memmap, unsigned long nr_pages) |
621 | { | 621 | { |
622 | vmemmap_free(memmap, nr_pages); | ||
622 | } | 623 | } |
623 | #else | 624 | #else |
624 | static struct page *__kmalloc_section_memmap(unsigned long nr_pages) | 625 | static struct page *__kmalloc_section_memmap(unsigned long nr_pages) |
@@ -697,7 +698,7 @@ static void free_section_usemap(struct page *memmap, unsigned long *usemap) | |||
697 | /* | 698 | /* |
698 | * Check to see if allocation came from hot-plug-add | 699 | * Check to see if allocation came from hot-plug-add |
699 | */ | 700 | */ |
700 | if (PageSlab(usemap_page)) { | 701 | if (PageSlab(usemap_page) || PageCompound(usemap_page)) { |
701 | kfree(usemap); | 702 | kfree(usemap); |
702 | if (memmap) | 703 | if (memmap) |
703 | __kfree_section_memmap(memmap, PAGES_PER_SECTION); | 704 | __kfree_section_memmap(memmap, PAGES_PER_SECTION); |
@@ -782,7 +783,7 @@ static void clear_hwpoisoned_pages(struct page *memmap, int nr_pages) | |||
782 | 783 | ||
783 | for (i = 0; i < PAGES_PER_SECTION; i++) { | 784 | for (i = 0; i < PAGES_PER_SECTION; i++) { |
784 | if (PageHWPoison(&memmap[i])) { | 785 | if (PageHWPoison(&memmap[i])) { |
785 | atomic_long_sub(1, &mce_bad_pages); | 786 | atomic_long_sub(1, &num_poisoned_pages); |
786 | ClearPageHWPoison(&memmap[i]); | 787 | ClearPageHWPoison(&memmap[i]); |
787 | } | 788 | } |
788 | } | 789 | } |
@@ -796,8 +797,10 @@ static inline void clear_hwpoisoned_pages(struct page *memmap, int nr_pages) | |||
796 | void sparse_remove_one_section(struct zone *zone, struct mem_section *ms) | 797 | void sparse_remove_one_section(struct zone *zone, struct mem_section *ms) |
797 | { | 798 | { |
798 | struct page *memmap = NULL; | 799 | struct page *memmap = NULL; |
799 | unsigned long *usemap = NULL; | 800 | unsigned long *usemap = NULL, flags; |
801 | struct pglist_data *pgdat = zone->zone_pgdat; | ||
800 | 802 | ||
803 | pgdat_resize_lock(pgdat, &flags); | ||
801 | if (ms->section_mem_map) { | 804 | if (ms->section_mem_map) { |
802 | usemap = ms->pageblock_flags; | 805 | usemap = ms->pageblock_flags; |
803 | memmap = sparse_decode_mem_map(ms->section_mem_map, | 806 | memmap = sparse_decode_mem_map(ms->section_mem_map, |
@@ -805,6 +808,7 @@ void sparse_remove_one_section(struct zone *zone, struct mem_section *ms) | |||
805 | ms->section_mem_map = 0; | 808 | ms->section_mem_map = 0; |
806 | ms->pageblock_flags = NULL; | 809 | ms->pageblock_flags = NULL; |
807 | } | 810 | } |
811 | pgdat_resize_unlock(pgdat, &flags); | ||
808 | 812 | ||
809 | clear_hwpoisoned_pages(memmap, PAGES_PER_SECTION); | 813 | clear_hwpoisoned_pages(memmap, PAGES_PER_SECTION); |
810 | free_section_usemap(memmap, usemap); | 814 | free_section_usemap(memmap, usemap); |
diff --git a/mm/swap.c b/mm/swap.c --- a/mm/swap.c +++ b/mm/swap.c | |||
@@ -855,9 +855,14 @@ EXPORT_SYMBOL(pagevec_lookup_tag); | |||
855 | void __init swap_setup(void) | 855 | void __init swap_setup(void) |
856 | { | 856 | { |
857 | unsigned long megs = totalram_pages >> (20 - PAGE_SHIFT); | 857 | unsigned long megs = totalram_pages >> (20 - PAGE_SHIFT); |
858 | |||
859 | #ifdef CONFIG_SWAP | 858 | #ifdef CONFIG_SWAP |
860 | bdi_init(swapper_space.backing_dev_info); | 859 | int i; |
860 | |||
861 | bdi_init(swapper_spaces[0].backing_dev_info); | ||
862 | for (i = 0; i < MAX_SWAPFILES; i++) { | ||
863 | spin_lock_init(&swapper_spaces[i].tree_lock); | ||
864 | INIT_LIST_HEAD(&swapper_spaces[i].i_mmap_nonlinear); | ||
865 | } | ||
861 | #endif | 866 | #endif |
862 | 867 | ||
863 | /* Use a smaller cluster for small-memory machines */ | 868 | /* Use a smaller cluster for small-memory machines */ |
diff --git a/mm/swap_state.c b/mm/swap_state.c index 0cb36fb1f61c..7efcf1525921 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c | |||
@@ -36,12 +36,12 @@ static struct backing_dev_info swap_backing_dev_info = { | |||
36 | .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_SWAP_BACKED, | 36 | .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_SWAP_BACKED, |
37 | }; | 37 | }; |
38 | 38 | ||
39 | struct address_space swapper_space = { | 39 | struct address_space swapper_spaces[MAX_SWAPFILES] = { |
40 | .page_tree = RADIX_TREE_INIT(GFP_ATOMIC|__GFP_NOWARN), | 40 | [0 ... MAX_SWAPFILES - 1] = { |
41 | .tree_lock = __SPIN_LOCK_UNLOCKED(swapper_space.tree_lock), | 41 | .page_tree = RADIX_TREE_INIT(GFP_ATOMIC|__GFP_NOWARN), |
42 | .a_ops = &swap_aops, | 42 | .a_ops = &swap_aops, |
43 | .i_mmap_nonlinear = LIST_HEAD_INIT(swapper_space.i_mmap_nonlinear), | 43 | .backing_dev_info = &swap_backing_dev_info, |
44 | .backing_dev_info = &swap_backing_dev_info, | 44 | } |
45 | }; | 45 | }; |
46 | 46 | ||
47 | #define INC_CACHE_INFO(x) do { swap_cache_info.x++; } while (0) | 47 | #define INC_CACHE_INFO(x) do { swap_cache_info.x++; } while (0) |
@@ -53,13 +53,24 @@ static struct { | |||
53 | unsigned long find_total; | 53 | unsigned long find_total; |
54 | } swap_cache_info; | 54 | } swap_cache_info; |
55 | 55 | ||
56 | unsigned long total_swapcache_pages(void) | ||
57 | { | ||
58 | int i; | ||
59 | unsigned long ret = 0; | ||
60 | |||
61 | for (i = 0; i < MAX_SWAPFILES; i++) | ||
62 | ret += swapper_spaces[i].nrpages; | ||
63 | return ret; | ||
64 | } | ||
65 | |||
56 | void show_swap_cache_info(void) | 66 | void show_swap_cache_info(void) |
57 | { | 67 | { |
58 | printk("%lu pages in swap cache\n", total_swapcache_pages); | 68 | printk("%lu pages in swap cache\n", total_swapcache_pages()); |
59 | printk("Swap cache stats: add %lu, delete %lu, find %lu/%lu\n", | 69 | printk("Swap cache stats: add %lu, delete %lu, find %lu/%lu\n", |
60 | swap_cache_info.add_total, swap_cache_info.del_total, | 70 | swap_cache_info.add_total, swap_cache_info.del_total, |
61 | swap_cache_info.find_success, swap_cache_info.find_total); | 71 | swap_cache_info.find_success, swap_cache_info.find_total); |
62 | printk("Free swap = %ldkB\n", nr_swap_pages << (PAGE_SHIFT - 10)); | 72 | printk("Free swap = %ldkB\n", |
73 | get_nr_swap_pages() << (PAGE_SHIFT - 10)); | ||
63 | printk("Total swap = %lukB\n", total_swap_pages << (PAGE_SHIFT - 10)); | 74 | printk("Total swap = %lukB\n", total_swap_pages << (PAGE_SHIFT - 10)); |
64 | } | 75 | } |
65 | 76 | ||
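With swapper_space replaced by the per-swapfile swapper_spaces[] array above, there is no single mapping or page counter any more: total_swapcache_pages() sums nrpages over all entries, and every swap-cache lookup has to pick the address space that belongs to a particular swap entry. In this series that selection is a one-line helper keyed on the entry's type; the include/linux/swap.h side is not part of the hunks shown here, so the following is reproduced from memory and should be treated as illustrative:

    /* include/linux/swap.h */
    #define swap_address_space(entry) (&swapper_spaces[swp_type(entry)])

    /* typical use, as in the lookups converted below */
    page = find_get_page(swap_address_space(entry), entry.val);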
@@ -70,6 +81,7 @@ void show_swap_cache_info(void) | |||
70 | static int __add_to_swap_cache(struct page *page, swp_entry_t entry) | 81 | static int __add_to_swap_cache(struct page *page, swp_entry_t entry) |
71 | { | 82 | { |
72 | int error; | 83 | int error; |
84 | struct address_space *address_space; | ||
73 | 85 | ||
74 | VM_BUG_ON(!PageLocked(page)); | 86 | VM_BUG_ON(!PageLocked(page)); |
75 | VM_BUG_ON(PageSwapCache(page)); | 87 | VM_BUG_ON(PageSwapCache(page)); |
@@ -79,14 +91,16 @@ static int __add_to_swap_cache(struct page *page, swp_entry_t entry) | |||
79 | SetPageSwapCache(page); | 91 | SetPageSwapCache(page); |
80 | set_page_private(page, entry.val); | 92 | set_page_private(page, entry.val); |
81 | 93 | ||
82 | spin_lock_irq(&swapper_space.tree_lock); | 94 | address_space = swap_address_space(entry); |
83 | error = radix_tree_insert(&swapper_space.page_tree, entry.val, page); | 95 | spin_lock_irq(&address_space->tree_lock); |
96 | error = radix_tree_insert(&address_space->page_tree, | ||
97 | entry.val, page); | ||
84 | if (likely(!error)) { | 98 | if (likely(!error)) { |
85 | total_swapcache_pages++; | 99 | address_space->nrpages++; |
86 | __inc_zone_page_state(page, NR_FILE_PAGES); | 100 | __inc_zone_page_state(page, NR_FILE_PAGES); |
87 | INC_CACHE_INFO(add_total); | 101 | INC_CACHE_INFO(add_total); |
88 | } | 102 | } |
89 | spin_unlock_irq(&swapper_space.tree_lock); | 103 | spin_unlock_irq(&address_space->tree_lock); |
90 | 104 | ||
91 | if (unlikely(error)) { | 105 | if (unlikely(error)) { |
92 | /* | 106 | /* |
@@ -122,14 +136,19 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask) | |||
122 | */ | 136 | */ |
123 | void __delete_from_swap_cache(struct page *page) | 137 | void __delete_from_swap_cache(struct page *page) |
124 | { | 138 | { |
139 | swp_entry_t entry; | ||
140 | struct address_space *address_space; | ||
141 | |||
125 | VM_BUG_ON(!PageLocked(page)); | 142 | VM_BUG_ON(!PageLocked(page)); |
126 | VM_BUG_ON(!PageSwapCache(page)); | 143 | VM_BUG_ON(!PageSwapCache(page)); |
127 | VM_BUG_ON(PageWriteback(page)); | 144 | VM_BUG_ON(PageWriteback(page)); |
128 | 145 | ||
129 | radix_tree_delete(&swapper_space.page_tree, page_private(page)); | 146 | entry.val = page_private(page); |
147 | address_space = swap_address_space(entry); | ||
148 | radix_tree_delete(&address_space->page_tree, page_private(page)); | ||
130 | set_page_private(page, 0); | 149 | set_page_private(page, 0); |
131 | ClearPageSwapCache(page); | 150 | ClearPageSwapCache(page); |
132 | total_swapcache_pages--; | 151 | address_space->nrpages--; |
133 | __dec_zone_page_state(page, NR_FILE_PAGES); | 152 | __dec_zone_page_state(page, NR_FILE_PAGES); |
134 | INC_CACHE_INFO(del_total); | 153 | INC_CACHE_INFO(del_total); |
135 | } | 154 | } |
@@ -195,12 +214,14 @@ int add_to_swap(struct page *page) | |||
195 | void delete_from_swap_cache(struct page *page) | 214 | void delete_from_swap_cache(struct page *page) |
196 | { | 215 | { |
197 | swp_entry_t entry; | 216 | swp_entry_t entry; |
217 | struct address_space *address_space; | ||
198 | 218 | ||
199 | entry.val = page_private(page); | 219 | entry.val = page_private(page); |
200 | 220 | ||
201 | spin_lock_irq(&swapper_space.tree_lock); | 221 | address_space = swap_address_space(entry); |
222 | spin_lock_irq(&address_space->tree_lock); | ||
202 | __delete_from_swap_cache(page); | 223 | __delete_from_swap_cache(page); |
203 | spin_unlock_irq(&swapper_space.tree_lock); | 224 | spin_unlock_irq(&address_space->tree_lock); |
204 | 225 | ||
205 | swapcache_free(entry, page); | 226 | swapcache_free(entry, page); |
206 | page_cache_release(page); | 227 | page_cache_release(page); |
@@ -263,7 +284,7 @@ struct page * lookup_swap_cache(swp_entry_t entry) | |||
263 | { | 284 | { |
264 | struct page *page; | 285 | struct page *page; |
265 | 286 | ||
266 | page = find_get_page(&swapper_space, entry.val); | 287 | page = find_get_page(swap_address_space(entry), entry.val); |
267 | 288 | ||
268 | if (page) | 289 | if (page) |
269 | INC_CACHE_INFO(find_success); | 290 | INC_CACHE_INFO(find_success); |
@@ -290,7 +311,8 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, | |||
290 | * called after lookup_swap_cache() failed, re-calling | 311 | * called after lookup_swap_cache() failed, re-calling |
291 | * that would confuse statistics. | 312 | * that would confuse statistics. |
292 | */ | 313 | */ |
293 | found_page = find_get_page(&swapper_space, entry.val); | 314 | found_page = find_get_page(swap_address_space(entry), |
315 | entry.val); | ||
294 | if (found_page) | 316 | if (found_page) |
295 | break; | 317 | break; |
296 | 318 | ||
diff --git a/mm/swapfile.c b/mm/swapfile.c index e97a0e5aea91..a1f7772a01fc 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c | |||
@@ -47,9 +47,11 @@ static sector_t map_swap_entry(swp_entry_t, struct block_device**); | |||
47 | 47 | ||
48 | DEFINE_SPINLOCK(swap_lock); | 48 | DEFINE_SPINLOCK(swap_lock); |
49 | static unsigned int nr_swapfiles; | 49 | static unsigned int nr_swapfiles; |
50 | long nr_swap_pages; | 50 | atomic_long_t nr_swap_pages; |
51 | /* protected with swap_lock. reading in vm_swap_full() doesn't need lock */ | ||
51 | long total_swap_pages; | 52 | long total_swap_pages; |
52 | static int least_priority; | 53 | static int least_priority; |
54 | static atomic_t highest_priority_index = ATOMIC_INIT(-1); | ||
53 | 55 | ||
54 | static const char Bad_file[] = "Bad swap file entry "; | 56 | static const char Bad_file[] = "Bad swap file entry "; |
55 | static const char Unused_file[] = "Unused swap file entry "; | 57 | static const char Unused_file[] = "Unused swap file entry "; |
@@ -79,7 +81,7 @@ __try_to_reclaim_swap(struct swap_info_struct *si, unsigned long offset) | |||
79 | struct page *page; | 81 | struct page *page; |
80 | int ret = 0; | 82 | int ret = 0; |
81 | 83 | ||
82 | page = find_get_page(&swapper_space, entry.val); | 84 | page = find_get_page(swap_address_space(entry), entry.val); |
83 | if (!page) | 85 | if (!page) |
84 | return 0; | 86 | return 0; |
85 | /* | 87 | /* |
@@ -223,7 +225,7 @@ static unsigned long scan_swap_map(struct swap_info_struct *si, | |||
223 | si->lowest_alloc = si->max; | 225 | si->lowest_alloc = si->max; |
224 | si->highest_alloc = 0; | 226 | si->highest_alloc = 0; |
225 | } | 227 | } |
226 | spin_unlock(&swap_lock); | 228 | spin_unlock(&si->lock); |
227 | 229 | ||
228 | /* | 230 | /* |
229 | * If seek is expensive, start searching for new cluster from | 231 | * If seek is expensive, start searching for new cluster from |
@@ -242,7 +244,7 @@ static unsigned long scan_swap_map(struct swap_info_struct *si, | |||
242 | if (si->swap_map[offset]) | 244 | if (si->swap_map[offset]) |
243 | last_in_cluster = offset + SWAPFILE_CLUSTER; | 245 | last_in_cluster = offset + SWAPFILE_CLUSTER; |
244 | else if (offset == last_in_cluster) { | 246 | else if (offset == last_in_cluster) { |
245 | spin_lock(&swap_lock); | 247 | spin_lock(&si->lock); |
246 | offset -= SWAPFILE_CLUSTER - 1; | 248 | offset -= SWAPFILE_CLUSTER - 1; |
247 | si->cluster_next = offset; | 249 | si->cluster_next = offset; |
248 | si->cluster_nr = SWAPFILE_CLUSTER - 1; | 250 | si->cluster_nr = SWAPFILE_CLUSTER - 1; |
@@ -263,7 +265,7 @@ static unsigned long scan_swap_map(struct swap_info_struct *si, | |||
263 | if (si->swap_map[offset]) | 265 | if (si->swap_map[offset]) |
264 | last_in_cluster = offset + SWAPFILE_CLUSTER; | 266 | last_in_cluster = offset + SWAPFILE_CLUSTER; |
265 | else if (offset == last_in_cluster) { | 267 | else if (offset == last_in_cluster) { |
266 | spin_lock(&swap_lock); | 268 | spin_lock(&si->lock); |
267 | offset -= SWAPFILE_CLUSTER - 1; | 269 | offset -= SWAPFILE_CLUSTER - 1; |
268 | si->cluster_next = offset; | 270 | si->cluster_next = offset; |
269 | si->cluster_nr = SWAPFILE_CLUSTER - 1; | 271 | si->cluster_nr = SWAPFILE_CLUSTER - 1; |
@@ -277,7 +279,7 @@ static unsigned long scan_swap_map(struct swap_info_struct *si, | |||
277 | } | 279 | } |
278 | 280 | ||
279 | offset = scan_base; | 281 | offset = scan_base; |
280 | spin_lock(&swap_lock); | 282 | spin_lock(&si->lock); |
281 | si->cluster_nr = SWAPFILE_CLUSTER - 1; | 283 | si->cluster_nr = SWAPFILE_CLUSTER - 1; |
282 | si->lowest_alloc = 0; | 284 | si->lowest_alloc = 0; |
283 | } | 285 | } |
@@ -293,9 +295,9 @@ checks: | |||
293 | /* reuse swap entry of cache-only swap if not busy. */ | 295 | /* reuse swap entry of cache-only swap if not busy. */ |
294 | if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) { | 296 | if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) { |
295 | int swap_was_freed; | 297 | int swap_was_freed; |
296 | spin_unlock(&swap_lock); | 298 | spin_unlock(&si->lock); |
297 | swap_was_freed = __try_to_reclaim_swap(si, offset); | 299 | swap_was_freed = __try_to_reclaim_swap(si, offset); |
298 | spin_lock(&swap_lock); | 300 | spin_lock(&si->lock); |
299 | /* entry was freed successfully, try to use this again */ | 301 | /* entry was freed successfully, try to use this again */ |
300 | if (swap_was_freed) | 302 | if (swap_was_freed) |
301 | goto checks; | 303 | goto checks; |
@@ -335,13 +337,13 @@ checks: | |||
335 | si->lowest_alloc <= last_in_cluster) | 337 | si->lowest_alloc <= last_in_cluster) |
336 | last_in_cluster = si->lowest_alloc - 1; | 338 | last_in_cluster = si->lowest_alloc - 1; |
337 | si->flags |= SWP_DISCARDING; | 339 | si->flags |= SWP_DISCARDING; |
338 | spin_unlock(&swap_lock); | 340 | spin_unlock(&si->lock); |
339 | 341 | ||
340 | if (offset < last_in_cluster) | 342 | if (offset < last_in_cluster) |
341 | discard_swap_cluster(si, offset, | 343 | discard_swap_cluster(si, offset, |
342 | last_in_cluster - offset + 1); | 344 | last_in_cluster - offset + 1); |
343 | 345 | ||
344 | spin_lock(&swap_lock); | 346 | spin_lock(&si->lock); |
345 | si->lowest_alloc = 0; | 347 | si->lowest_alloc = 0; |
346 | si->flags &= ~SWP_DISCARDING; | 348 | si->flags &= ~SWP_DISCARDING; |
347 | 349 | ||
@@ -355,10 +357,10 @@ checks: | |||
355 | * could defer that delay until swap_writepage, | 357 | * could defer that delay until swap_writepage, |
356 | * but it's easier to keep this self-contained. | 358 | * but it's easier to keep this self-contained. |
357 | */ | 359 | */ |
358 | spin_unlock(&swap_lock); | 360 | spin_unlock(&si->lock); |
359 | wait_on_bit(&si->flags, ilog2(SWP_DISCARDING), | 361 | wait_on_bit(&si->flags, ilog2(SWP_DISCARDING), |
360 | wait_for_discard, TASK_UNINTERRUPTIBLE); | 362 | wait_for_discard, TASK_UNINTERRUPTIBLE); |
361 | spin_lock(&swap_lock); | 363 | spin_lock(&si->lock); |
362 | } else { | 364 | } else { |
363 | /* | 365 | /* |
364 | * Note pages allocated by racing tasks while | 366 | * Note pages allocated by racing tasks while |
@@ -374,14 +376,14 @@ checks: | |||
374 | return offset; | 376 | return offset; |
375 | 377 | ||
376 | scan: | 378 | scan: |
377 | spin_unlock(&swap_lock); | 379 | spin_unlock(&si->lock); |
378 | while (++offset <= si->highest_bit) { | 380 | while (++offset <= si->highest_bit) { |
379 | if (!si->swap_map[offset]) { | 381 | if (!si->swap_map[offset]) { |
380 | spin_lock(&swap_lock); | 382 | spin_lock(&si->lock); |
381 | goto checks; | 383 | goto checks; |
382 | } | 384 | } |
383 | if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) { | 385 | if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) { |
384 | spin_lock(&swap_lock); | 386 | spin_lock(&si->lock); |
385 | goto checks; | 387 | goto checks; |
386 | } | 388 | } |
387 | if (unlikely(--latency_ration < 0)) { | 389 | if (unlikely(--latency_ration < 0)) { |
@@ -392,11 +394,11 @@ scan: | |||
392 | offset = si->lowest_bit; | 394 | offset = si->lowest_bit; |
393 | while (++offset < scan_base) { | 395 | while (++offset < scan_base) { |
394 | if (!si->swap_map[offset]) { | 396 | if (!si->swap_map[offset]) { |
395 | spin_lock(&swap_lock); | 397 | spin_lock(&si->lock); |
396 | goto checks; | 398 | goto checks; |
397 | } | 399 | } |
398 | if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) { | 400 | if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) { |
399 | spin_lock(&swap_lock); | 401 | spin_lock(&si->lock); |
400 | goto checks; | 402 | goto checks; |
401 | } | 403 | } |
402 | if (unlikely(--latency_ration < 0)) { | 404 | if (unlikely(--latency_ration < 0)) { |
@@ -404,7 +406,7 @@ scan: | |||
404 | latency_ration = LATENCY_LIMIT; | 406 | latency_ration = LATENCY_LIMIT; |
405 | } | 407 | } |
406 | } | 408 | } |
407 | spin_lock(&swap_lock); | 409 | spin_lock(&si->lock); |
408 | 410 | ||
409 | no_page: | 411 | no_page: |
410 | si->flags -= SWP_SCANNING; | 412 | si->flags -= SWP_SCANNING; |
@@ -417,13 +419,34 @@ swp_entry_t get_swap_page(void) | |||
417 | pgoff_t offset; | 419 | pgoff_t offset; |
418 | int type, next; | 420 | int type, next; |
419 | int wrapped = 0; | 421 | int wrapped = 0; |
422 | int hp_index; | ||
420 | 423 | ||
421 | spin_lock(&swap_lock); | 424 | spin_lock(&swap_lock); |
422 | if (nr_swap_pages <= 0) | 425 | if (atomic_long_read(&nr_swap_pages) <= 0) |
423 | goto noswap; | 426 | goto noswap; |
424 | nr_swap_pages--; | 427 | atomic_long_dec(&nr_swap_pages); |
425 | 428 | ||
426 | for (type = swap_list.next; type >= 0 && wrapped < 2; type = next) { | 429 | for (type = swap_list.next; type >= 0 && wrapped < 2; type = next) { |
430 | hp_index = atomic_xchg(&highest_priority_index, -1); | ||
431 | /* | ||
432 | * highest_priority_index records current highest priority swap | ||
433 | * type which just frees swap entries. If its priority is | ||
434 | * higher than that of swap_list.next swap type, we use it. It | ||
435 | * isn't protected by swap_lock, so it can be an invalid value | ||
436 | * if the corresponding swap type is swapoff. We double check | ||
437 | * the flags here. It's even possible the swap type is swapoff | ||
438 | * and swapon again and its priority is changed. In such rare | ||
439 | * case, low priority swap type might be used, but eventually | ||
440 | * high priority swap will be used after several rounds of | ||
441 | * swap. | ||
442 | */ | ||
443 | if (hp_index != -1 && hp_index != type && | ||
444 | swap_info[type]->prio < swap_info[hp_index]->prio && | ||
445 | (swap_info[hp_index]->flags & SWP_WRITEOK)) { | ||
446 | type = hp_index; | ||
447 | swap_list.next = type; | ||
448 | } | ||
449 | |||
427 | si = swap_info[type]; | 450 | si = swap_info[type]; |
428 | next = si->next; | 451 | next = si->next; |
429 | if (next < 0 || | 452 | if (next < 0 || |
@@ -432,22 +455,29 @@ swp_entry_t get_swap_page(void) | |||
432 | wrapped++; | 455 | wrapped++; |
433 | } | 456 | } |
434 | 457 | ||
435 | if (!si->highest_bit) | 458 | spin_lock(&si->lock); |
459 | if (!si->highest_bit) { | ||
460 | spin_unlock(&si->lock); | ||
436 | continue; | 461 | continue; |
437 | if (!(si->flags & SWP_WRITEOK)) | 462 | } |
463 | if (!(si->flags & SWP_WRITEOK)) { | ||
464 | spin_unlock(&si->lock); | ||
438 | continue; | 465 | continue; |
466 | } | ||
439 | 467 | ||
440 | swap_list.next = next; | 468 | swap_list.next = next; |
469 | |||
470 | spin_unlock(&swap_lock); | ||
441 | /* This is called for allocating swap entry for cache */ | 471 | /* This is called for allocating swap entry for cache */ |
442 | offset = scan_swap_map(si, SWAP_HAS_CACHE); | 472 | offset = scan_swap_map(si, SWAP_HAS_CACHE); |
443 | if (offset) { | 473 | spin_unlock(&si->lock); |
444 | spin_unlock(&swap_lock); | 474 | if (offset) |
445 | return swp_entry(type, offset); | 475 | return swp_entry(type, offset); |
446 | } | 476 | spin_lock(&swap_lock); |
447 | next = swap_list.next; | 477 | next = swap_list.next; |
448 | } | 478 | } |
449 | 479 | ||
450 | nr_swap_pages++; | 480 | atomic_long_inc(&nr_swap_pages); |
451 | noswap: | 481 | noswap: |
452 | spin_unlock(&swap_lock); | 482 | spin_unlock(&swap_lock); |
453 | return (swp_entry_t) {0}; | 483 | return (swp_entry_t) {0}; |
@@ -459,19 +489,19 @@ swp_entry_t get_swap_page_of_type(int type) | |||
459 | struct swap_info_struct *si; | 489 | struct swap_info_struct *si; |
460 | pgoff_t offset; | 490 | pgoff_t offset; |
461 | 491 | ||
462 | spin_lock(&swap_lock); | ||
463 | si = swap_info[type]; | 492 | si = swap_info[type]; |
493 | spin_lock(&si->lock); | ||
464 | if (si && (si->flags & SWP_WRITEOK)) { | 494 | if (si && (si->flags & SWP_WRITEOK)) { |
465 | nr_swap_pages--; | 495 | atomic_long_dec(&nr_swap_pages); |
466 | /* This is called for allocating swap entry, not cache */ | 496 | /* This is called for allocating swap entry, not cache */ |
467 | offset = scan_swap_map(si, 1); | 497 | offset = scan_swap_map(si, 1); |
468 | if (offset) { | 498 | if (offset) { |
469 | spin_unlock(&swap_lock); | 499 | spin_unlock(&si->lock); |
470 | return swp_entry(type, offset); | 500 | return swp_entry(type, offset); |
471 | } | 501 | } |
472 | nr_swap_pages++; | 502 | atomic_long_inc(&nr_swap_pages); |
473 | } | 503 | } |
474 | spin_unlock(&swap_lock); | 504 | spin_unlock(&si->lock); |
475 | return (swp_entry_t) {0}; | 505 | return (swp_entry_t) {0}; |
476 | } | 506 | } |
477 | 507 | ||
@@ -493,7 +523,7 @@ static struct swap_info_struct *swap_info_get(swp_entry_t entry) | |||
493 | goto bad_offset; | 523 | goto bad_offset; |
494 | if (!p->swap_map[offset]) | 524 | if (!p->swap_map[offset]) |
495 | goto bad_free; | 525 | goto bad_free; |
496 | spin_lock(&swap_lock); | 526 | spin_lock(&p->lock); |
497 | return p; | 527 | return p; |
498 | 528 | ||
499 | bad_free: | 529 | bad_free: |
@@ -511,6 +541,27 @@ out: | |||
511 | return NULL; | 541 | return NULL; |
512 | } | 542 | } |
513 | 543 | ||
544 | /* | ||
545 | * This swap type frees swap entry, check if it is the highest priority swap | ||
546 | * type which just frees swap entry. get_swap_page() uses | ||
547 | * highest_priority_index to search highest priority swap type. The | ||
548 | * swap_info_struct.lock can't protect us if there are multiple swap types | ||
549 | * active, so we use atomic_cmpxchg. | ||
550 | */ | ||
551 | static void set_highest_priority_index(int type) | ||
552 | { | ||
553 | int old_hp_index, new_hp_index; | ||
554 | |||
555 | do { | ||
556 | old_hp_index = atomic_read(&highest_priority_index); | ||
557 | if (old_hp_index != -1 && | ||
558 | swap_info[old_hp_index]->prio >= swap_info[type]->prio) | ||
559 | break; | ||
560 | new_hp_index = type; | ||
561 | } while (atomic_cmpxchg(&highest_priority_index, | ||
562 | old_hp_index, new_hp_index) != old_hp_index); | ||
563 | } | ||
564 | |||
514 | static unsigned char swap_entry_free(struct swap_info_struct *p, | 565 | static unsigned char swap_entry_free(struct swap_info_struct *p, |
515 | swp_entry_t entry, unsigned char usage) | 566 | swp_entry_t entry, unsigned char usage) |
516 | { | 567 | { |
@@ -553,10 +604,8 @@ static unsigned char swap_entry_free(struct swap_info_struct *p, | |||
553 | p->lowest_bit = offset; | 604 | p->lowest_bit = offset; |
554 | if (offset > p->highest_bit) | 605 | if (offset > p->highest_bit) |
555 | p->highest_bit = offset; | 606 | p->highest_bit = offset; |
556 | if (swap_list.next >= 0 && | 607 | set_highest_priority_index(p->type); |
557 | p->prio > swap_info[swap_list.next]->prio) | 608 | atomic_long_inc(&nr_swap_pages); |
558 | swap_list.next = p->type; | ||
559 | nr_swap_pages++; | ||
560 | p->inuse_pages--; | 609 | p->inuse_pages--; |
561 | frontswap_invalidate_page(p->type, offset); | 610 | frontswap_invalidate_page(p->type, offset); |
562 | if (p->flags & SWP_BLKDEV) { | 611 | if (p->flags & SWP_BLKDEV) { |
@@ -581,7 +630,7 @@ void swap_free(swp_entry_t entry) | |||
581 | p = swap_info_get(entry); | 630 | p = swap_info_get(entry); |
582 | if (p) { | 631 | if (p) { |
583 | swap_entry_free(p, entry, 1); | 632 | swap_entry_free(p, entry, 1); |
584 | spin_unlock(&swap_lock); | 633 | spin_unlock(&p->lock); |
585 | } | 634 | } |
586 | } | 635 | } |
587 | 636 | ||
@@ -598,7 +647,7 @@ void swapcache_free(swp_entry_t entry, struct page *page) | |||
598 | count = swap_entry_free(p, entry, SWAP_HAS_CACHE); | 647 | count = swap_entry_free(p, entry, SWAP_HAS_CACHE); |
599 | if (page) | 648 | if (page) |
600 | mem_cgroup_uncharge_swapcache(page, entry, count != 0); | 649 | mem_cgroup_uncharge_swapcache(page, entry, count != 0); |
601 | spin_unlock(&swap_lock); | 650 | spin_unlock(&p->lock); |
602 | } | 651 | } |
603 | } | 652 | } |
604 | 653 | ||
@@ -617,7 +666,7 @@ int page_swapcount(struct page *page) | |||
617 | p = swap_info_get(entry); | 666 | p = swap_info_get(entry); |
618 | if (p) { | 667 | if (p) { |
619 | count = swap_count(p->swap_map[swp_offset(entry)]); | 668 | count = swap_count(p->swap_map[swp_offset(entry)]); |
620 | spin_unlock(&swap_lock); | 669 | spin_unlock(&p->lock); |
621 | } | 670 | } |
622 | return count; | 671 | return count; |
623 | } | 672 | } |
@@ -699,13 +748,14 @@ int free_swap_and_cache(swp_entry_t entry) | |||
699 | p = swap_info_get(entry); | 748 | p = swap_info_get(entry); |
700 | if (p) { | 749 | if (p) { |
701 | if (swap_entry_free(p, entry, 1) == SWAP_HAS_CACHE) { | 750 | if (swap_entry_free(p, entry, 1) == SWAP_HAS_CACHE) { |
702 | page = find_get_page(&swapper_space, entry.val); | 751 | page = find_get_page(swap_address_space(entry), |
752 | entry.val); | ||
703 | if (page && !trylock_page(page)) { | 753 | if (page && !trylock_page(page)) { |
704 | page_cache_release(page); | 754 | page_cache_release(page); |
705 | page = NULL; | 755 | page = NULL; |
706 | } | 756 | } |
707 | } | 757 | } |
708 | spin_unlock(&swap_lock); | 758 | spin_unlock(&p->lock); |
709 | } | 759 | } |
710 | if (page) { | 760 | if (page) { |
711 | /* | 761 | /* |
@@ -803,11 +853,13 @@ unsigned int count_swap_pages(int type, int free) | |||
803 | if ((unsigned int)type < nr_swapfiles) { | 853 | if ((unsigned int)type < nr_swapfiles) { |
804 | struct swap_info_struct *sis = swap_info[type]; | 854 | struct swap_info_struct *sis = swap_info[type]; |
805 | 855 | ||
856 | spin_lock(&sis->lock); | ||
806 | if (sis->flags & SWP_WRITEOK) { | 857 | if (sis->flags & SWP_WRITEOK) { |
807 | n = sis->pages; | 858 | n = sis->pages; |
808 | if (free) | 859 | if (free) |
809 | n -= sis->inuse_pages; | 860 | n -= sis->inuse_pages; |
810 | } | 861 | } |
862 | spin_unlock(&sis->lock); | ||
811 | } | 863 | } |
812 | spin_unlock(&swap_lock); | 864 | spin_unlock(&swap_lock); |
813 | return n; | 865 | return n; |
@@ -822,11 +874,17 @@ unsigned int count_swap_pages(int type, int free) | |||
822 | static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, | 874 | static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, |
823 | unsigned long addr, swp_entry_t entry, struct page *page) | 875 | unsigned long addr, swp_entry_t entry, struct page *page) |
824 | { | 876 | { |
877 | struct page *swapcache; | ||
825 | struct mem_cgroup *memcg; | 878 | struct mem_cgroup *memcg; |
826 | spinlock_t *ptl; | 879 | spinlock_t *ptl; |
827 | pte_t *pte; | 880 | pte_t *pte; |
828 | int ret = 1; | 881 | int ret = 1; |
829 | 882 | ||
883 | swapcache = page; | ||
884 | page = ksm_might_need_to_copy(page, vma, addr); | ||
885 | if (unlikely(!page)) | ||
886 | return -ENOMEM; | ||
887 | |||
830 | if (mem_cgroup_try_charge_swapin(vma->vm_mm, page, | 888 | if (mem_cgroup_try_charge_swapin(vma->vm_mm, page, |
831 | GFP_KERNEL, &memcg)) { | 889 | GFP_KERNEL, &memcg)) { |
832 | ret = -ENOMEM; | 890 | ret = -ENOMEM; |
@@ -845,7 +903,10 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, | |||
845 | get_page(page); | 903 | get_page(page); |
846 | set_pte_at(vma->vm_mm, addr, pte, | 904 | set_pte_at(vma->vm_mm, addr, pte, |
847 | pte_mkold(mk_pte(page, vma->vm_page_prot))); | 905 | pte_mkold(mk_pte(page, vma->vm_page_prot))); |
848 | page_add_anon_rmap(page, vma, addr); | 906 | if (page == swapcache) |
907 | page_add_anon_rmap(page, vma, addr); | ||
908 | else /* ksm created a completely new copy */ | ||
909 | page_add_new_anon_rmap(page, vma, addr); | ||
849 | mem_cgroup_commit_charge_swapin(page, memcg); | 910 | mem_cgroup_commit_charge_swapin(page, memcg); |
850 | swap_free(entry); | 911 | swap_free(entry); |
851 | /* | 912 | /* |
@@ -856,6 +917,10 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, | |||
856 | out: | 917 | out: |
857 | pte_unmap_unlock(pte, ptl); | 918 | pte_unmap_unlock(pte, ptl); |
858 | out_nolock: | 919 | out_nolock: |
920 | if (page != swapcache) { | ||
921 | unlock_page(page); | ||
922 | put_page(page); | ||
923 | } | ||
859 | return ret; | 924 | return ret; |
860 | } | 925 | } |
861 | 926 | ||
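unuse_pte() now routes the page through ksm_might_need_to_copy() before charging and mapping it. The helper has a three-way contract that the hunk relies on: it returns the original swap-cache page when no copy is needed, a freshly allocated private copy when the KSM checks demand one (which is then mapped with page_add_new_anon_rmap() instead of page_add_anon_rmap()), or NULL when the copy cannot be allocated. The caller keeps the original in swapcache and, on every exit path, unlocks and releases the copy if one was handed back. The shape of that contract, condensed from the hunk above:

    swapcache = page;
    page = ksm_might_need_to_copy(page, vma, addr);
    if (unlikely(!page))
        return -ENOMEM;                 /* could not allocate the private copy */
    /* ... charge, map, swap_free ... */
    if (page != swapcache) {            /* KSM handed back a new page */
        unlock_page(page);
        put_page(page);
    }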
@@ -1456,7 +1521,7 @@ static void _enable_swap_info(struct swap_info_struct *p, int prio, | |||
1456 | p->swap_map = swap_map; | 1521 | p->swap_map = swap_map; |
1457 | frontswap_map_set(p, frontswap_map); | 1522 | frontswap_map_set(p, frontswap_map); |
1458 | p->flags |= SWP_WRITEOK; | 1523 | p->flags |= SWP_WRITEOK; |
1459 | nr_swap_pages += p->pages; | 1524 | atomic_long_add(p->pages, &nr_swap_pages); |
1460 | total_swap_pages += p->pages; | 1525 | total_swap_pages += p->pages; |
1461 | 1526 | ||
1462 | /* insert swap space into swap_list: */ | 1527 | /* insert swap space into swap_list: */ |
@@ -1478,15 +1543,19 @@ static void enable_swap_info(struct swap_info_struct *p, int prio, | |||
1478 | unsigned long *frontswap_map) | 1543 | unsigned long *frontswap_map) |
1479 | { | 1544 | { |
1480 | spin_lock(&swap_lock); | 1545 | spin_lock(&swap_lock); |
1546 | spin_lock(&p->lock); | ||
1481 | _enable_swap_info(p, prio, swap_map, frontswap_map); | 1547 | _enable_swap_info(p, prio, swap_map, frontswap_map); |
1482 | frontswap_init(p->type); | 1548 | frontswap_init(p->type); |
1549 | spin_unlock(&p->lock); | ||
1483 | spin_unlock(&swap_lock); | 1550 | spin_unlock(&swap_lock); |
1484 | } | 1551 | } |
1485 | 1552 | ||
1486 | static void reinsert_swap_info(struct swap_info_struct *p) | 1553 | static void reinsert_swap_info(struct swap_info_struct *p) |
1487 | { | 1554 | { |
1488 | spin_lock(&swap_lock); | 1555 | spin_lock(&swap_lock); |
1556 | spin_lock(&p->lock); | ||
1489 | _enable_swap_info(p, p->prio, p->swap_map, frontswap_map_get(p)); | 1557 | _enable_swap_info(p, p->prio, p->swap_map, frontswap_map_get(p)); |
1558 | spin_unlock(&p->lock); | ||
1490 | spin_unlock(&swap_lock); | 1559 | spin_unlock(&swap_lock); |
1491 | } | 1560 | } |
1492 | 1561 | ||
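The swapfile.c conversion splits the old global swap_lock in two: swap_lock still guards the swap list, priorities and nr_swapfiles, while the new per-device si->lock guards that device's swap_map, bit ranges and page counts. Where both are needed, as in _enable_swap_info() and swapoff above, the nesting order is always swap_lock outer, si->lock inner; hot paths that touch a single device (swap_free(), swap_entry_free(), __swap_duplicate()) now take only si->lock. A minimal sketch of the ordering rule (illustrative):

    spin_lock(&swap_lock);              /* global: swap list and priorities */
    spin_lock(&si->lock);               /* per-device: swap_map, highest_bit, pages */
    /* ... update both the list and this device ... */
    spin_unlock(&si->lock);
    spin_unlock(&swap_lock);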
@@ -1546,14 +1615,16 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) | |||
1546 | /* just pick something that's safe... */ | 1615 | /* just pick something that's safe... */ |
1547 | swap_list.next = swap_list.head; | 1616 | swap_list.next = swap_list.head; |
1548 | } | 1617 | } |
1618 | spin_lock(&p->lock); | ||
1549 | if (p->prio < 0) { | 1619 | if (p->prio < 0) { |
1550 | for (i = p->next; i >= 0; i = swap_info[i]->next) | 1620 | for (i = p->next; i >= 0; i = swap_info[i]->next) |
1551 | swap_info[i]->prio = p->prio--; | 1621 | swap_info[i]->prio = p->prio--; |
1552 | least_priority++; | 1622 | least_priority++; |
1553 | } | 1623 | } |
1554 | nr_swap_pages -= p->pages; | 1624 | atomic_long_sub(p->pages, &nr_swap_pages); |
1555 | total_swap_pages -= p->pages; | 1625 | total_swap_pages -= p->pages; |
1556 | p->flags &= ~SWP_WRITEOK; | 1626 | p->flags &= ~SWP_WRITEOK; |
1627 | spin_unlock(&p->lock); | ||
1557 | spin_unlock(&swap_lock); | 1628 | spin_unlock(&swap_lock); |
1558 | 1629 | ||
1559 | set_current_oom_origin(); | 1630 | set_current_oom_origin(); |
@@ -1572,14 +1643,17 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) | |||
1572 | 1643 | ||
1573 | mutex_lock(&swapon_mutex); | 1644 | mutex_lock(&swapon_mutex); |
1574 | spin_lock(&swap_lock); | 1645 | spin_lock(&swap_lock); |
1646 | spin_lock(&p->lock); | ||
1575 | drain_mmlist(); | 1647 | drain_mmlist(); |
1576 | 1648 | ||
1577 | /* wait for anyone still in scan_swap_map */ | 1649 | /* wait for anyone still in scan_swap_map */ |
1578 | p->highest_bit = 0; /* cuts scans short */ | 1650 | p->highest_bit = 0; /* cuts scans short */ |
1579 | while (p->flags >= SWP_SCANNING) { | 1651 | while (p->flags >= SWP_SCANNING) { |
1652 | spin_unlock(&p->lock); | ||
1580 | spin_unlock(&swap_lock); | 1653 | spin_unlock(&swap_lock); |
1581 | schedule_timeout_uninterruptible(1); | 1654 | schedule_timeout_uninterruptible(1); |
1582 | spin_lock(&swap_lock); | 1655 | spin_lock(&swap_lock); |
1656 | spin_lock(&p->lock); | ||
1583 | } | 1657 | } |
1584 | 1658 | ||
1585 | swap_file = p->swap_file; | 1659 | swap_file = p->swap_file; |
@@ -1589,6 +1663,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) | |||
1589 | p->swap_map = NULL; | 1663 | p->swap_map = NULL; |
1590 | p->flags = 0; | 1664 | p->flags = 0; |
1591 | frontswap_invalidate_area(type); | 1665 | frontswap_invalidate_area(type); |
1666 | spin_unlock(&p->lock); | ||
1592 | spin_unlock(&swap_lock); | 1667 | spin_unlock(&swap_lock); |
1593 | mutex_unlock(&swapon_mutex); | 1668 | mutex_unlock(&swapon_mutex); |
1594 | vfree(swap_map); | 1669 | vfree(swap_map); |
@@ -1699,7 +1774,7 @@ static int swap_show(struct seq_file *swap, void *v) | |||
1699 | len = seq_path(swap, &file->f_path, " \t\n\\"); | 1774 | len = seq_path(swap, &file->f_path, " \t\n\\"); |
1700 | seq_printf(swap, "%*s%s\t%u\t%u\t%d\n", | 1775 | seq_printf(swap, "%*s%s\t%u\t%u\t%d\n", |
1701 | len < 40 ? 40 - len : 1, " ", | 1776 | len < 40 ? 40 - len : 1, " ", |
1702 | S_ISBLK(file->f_path.dentry->d_inode->i_mode) ? | 1777 | S_ISBLK(file_inode(file)->i_mode) ? |
1703 | "partition" : "file\t", | 1778 | "partition" : "file\t", |
1704 | si->pages << (PAGE_SHIFT - 10), | 1779 | si->pages << (PAGE_SHIFT - 10), |
1705 | si->inuse_pages << (PAGE_SHIFT - 10), | 1780 | si->inuse_pages << (PAGE_SHIFT - 10), |
@@ -1794,6 +1869,7 @@ static struct swap_info_struct *alloc_swap_info(void) | |||
1794 | p->flags = SWP_USED; | 1869 | p->flags = SWP_USED; |
1795 | p->next = -1; | 1870 | p->next = -1; |
1796 | spin_unlock(&swap_lock); | 1871 | spin_unlock(&swap_lock); |
1872 | spin_lock_init(&p->lock); | ||
1797 | 1873 | ||
1798 | return p; | 1874 | return p; |
1799 | } | 1875 | } |
@@ -2116,7 +2192,7 @@ void si_swapinfo(struct sysinfo *val) | |||
2116 | if ((si->flags & SWP_USED) && !(si->flags & SWP_WRITEOK)) | 2192 | if ((si->flags & SWP_USED) && !(si->flags & SWP_WRITEOK)) |
2117 | nr_to_be_unused += si->inuse_pages; | 2193 | nr_to_be_unused += si->inuse_pages; |
2118 | } | 2194 | } |
2119 | val->freeswap = nr_swap_pages + nr_to_be_unused; | 2195 | val->freeswap = atomic_long_read(&nr_swap_pages) + nr_to_be_unused; |
2120 | val->totalswap = total_swap_pages + nr_to_be_unused; | 2196 | val->totalswap = total_swap_pages + nr_to_be_unused; |
2121 | spin_unlock(&swap_lock); | 2197 | spin_unlock(&swap_lock); |
2122 | } | 2198 | } |
@@ -2149,7 +2225,7 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage) | |||
2149 | p = swap_info[type]; | 2225 | p = swap_info[type]; |
2150 | offset = swp_offset(entry); | 2226 | offset = swp_offset(entry); |
2151 | 2227 | ||
2152 | spin_lock(&swap_lock); | 2228 | spin_lock(&p->lock); |
2153 | if (unlikely(offset >= p->max)) | 2229 | if (unlikely(offset >= p->max)) |
2154 | goto unlock_out; | 2230 | goto unlock_out; |
2155 | 2231 | ||
@@ -2184,7 +2260,7 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage) | |||
2184 | p->swap_map[offset] = count | has_cache; | 2260 | p->swap_map[offset] = count | has_cache; |
2185 | 2261 | ||
2186 | unlock_out: | 2262 | unlock_out: |
2187 | spin_unlock(&swap_lock); | 2263 | spin_unlock(&p->lock); |
2188 | out: | 2264 | out: |
2189 | return err; | 2265 | return err; |
2190 | 2266 | ||
@@ -2309,7 +2385,7 @@ int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask) | |||
2309 | } | 2385 | } |
2310 | 2386 | ||
2311 | if (!page) { | 2387 | if (!page) { |
2312 | spin_unlock(&swap_lock); | 2388 | spin_unlock(&si->lock); |
2313 | return -ENOMEM; | 2389 | return -ENOMEM; |
2314 | } | 2390 | } |
2315 | 2391 | ||
@@ -2357,7 +2433,7 @@ int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask) | |||
2357 | list_add_tail(&page->lru, &head->lru); | 2433 | list_add_tail(&page->lru, &head->lru); |
2358 | page = NULL; /* now it's attached, don't free it */ | 2434 | page = NULL; /* now it's attached, don't free it */ |
2359 | out: | 2435 | out: |
2360 | spin_unlock(&swap_lock); | 2436 | spin_unlock(&si->lock); |
2361 | outer: | 2437 | outer: |
2362 | if (page) | 2438 | if (page) |
2363 | __free_page(page); | 2439 | __free_page(page); |
diff --git a/mm/util.c b/mm/util.c --- a/mm/util.c +++ b/mm/util.c | |||
@@ -5,6 +5,8 @@ | |||
5 | #include <linux/err.h> | 5 | #include <linux/err.h> |
6 | #include <linux/sched.h> | 6 | #include <linux/sched.h> |
7 | #include <linux/security.h> | 7 | #include <linux/security.h> |
8 | #include <linux/swap.h> | ||
9 | #include <linux/swapops.h> | ||
8 | #include <asm/uaccess.h> | 10 | #include <asm/uaccess.h> |
9 | 11 | ||
10 | #include "internal.h" | 12 | #include "internal.h" |
@@ -355,12 +357,16 @@ unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr, | |||
355 | { | 357 | { |
356 | unsigned long ret; | 358 | unsigned long ret; |
357 | struct mm_struct *mm = current->mm; | 359 | struct mm_struct *mm = current->mm; |
360 | unsigned long populate; | ||
358 | 361 | ||
359 | ret = security_mmap_file(file, prot, flag); | 362 | ret = security_mmap_file(file, prot, flag); |
360 | if (!ret) { | 363 | if (!ret) { |
361 | down_write(&mm->mmap_sem); | 364 | down_write(&mm->mmap_sem); |
362 | ret = do_mmap_pgoff(file, addr, len, prot, flag, pgoff); | 365 | ret = do_mmap_pgoff(file, addr, len, prot, flag, pgoff, |
366 | &populate); | ||
363 | up_write(&mm->mmap_sem); | 367 | up_write(&mm->mmap_sem); |
368 | if (populate) | ||
369 | mm_populate(ret, populate); | ||
364 | } | 370 | } |
365 | return ret; | 371 | return ret; |
366 | } | 372 | } |
@@ -378,6 +384,24 @@ unsigned long vm_mmap(struct file *file, unsigned long addr, | |||
378 | } | 384 | } |
379 | EXPORT_SYMBOL(vm_mmap); | 385 | EXPORT_SYMBOL(vm_mmap); |
380 | 386 | ||
387 | struct address_space *page_mapping(struct page *page) | ||
388 | { | ||
389 | struct address_space *mapping = page->mapping; | ||
390 | |||
391 | VM_BUG_ON(PageSlab(page)); | ||
392 | #ifdef CONFIG_SWAP | ||
393 | if (unlikely(PageSwapCache(page))) { | ||
394 | swp_entry_t entry; | ||
395 | |||
396 | entry.val = page_private(page); | ||
397 | mapping = swap_address_space(entry); | ||
398 | } else | ||
399 | #endif | ||
400 | if ((unsigned long)mapping & PAGE_MAPPING_ANON) | ||
401 | mapping = NULL; | ||
402 | return mapping; | ||
403 | } | ||
404 | |||
381 | /* Tracepoints definitions. */ | 405 | /* Tracepoints definitions. */ |
382 | EXPORT_TRACEPOINT_SYMBOL(kmalloc); | 406 | EXPORT_TRACEPOINT_SYMBOL(kmalloc); |
383 | EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc); | 407 | EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc); |
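page_mapping() moves out of line into mm/util.c above because swap-cache pages no longer share one swapper_space: the mapping has to be derived from the page's private swap entry via swap_address_space(). Callers keep the old idiom; only the helper's answer changes for swap-cache pages. A brief usage sketch (illustrative):

    struct address_space *mapping = page_mapping(page);

    if (mapping && PageSwapCache(page)) {
        /* mapping is now the per-swapfile space for this page's entry,
         * i.e. &swapper_spaces[swp_type(entry)], not a single global one */
        spin_lock_irq(&mapping->tree_lock);
        /* ... operate on this swap file's radix tree ... */
        spin_unlock_irq(&mapping->tree_lock);
    }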
diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 5123a169ab7b..0f751f2068c3 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c | |||
@@ -1376,8 +1376,8 @@ static struct vm_struct *__get_vm_area_node(unsigned long size, | |||
1376 | struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags, | 1376 | struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags, |
1377 | unsigned long start, unsigned long end) | 1377 | unsigned long start, unsigned long end) |
1378 | { | 1378 | { |
1379 | return __get_vm_area_node(size, 1, flags, start, end, -1, GFP_KERNEL, | 1379 | return __get_vm_area_node(size, 1, flags, start, end, NUMA_NO_NODE, |
1380 | __builtin_return_address(0)); | 1380 | GFP_KERNEL, __builtin_return_address(0)); |
1381 | } | 1381 | } |
1382 | EXPORT_SYMBOL_GPL(__get_vm_area); | 1382 | EXPORT_SYMBOL_GPL(__get_vm_area); |
1383 | 1383 | ||
@@ -1385,8 +1385,8 @@ struct vm_struct *__get_vm_area_caller(unsigned long size, unsigned long flags, | |||
1385 | unsigned long start, unsigned long end, | 1385 | unsigned long start, unsigned long end, |
1386 | const void *caller) | 1386 | const void *caller) |
1387 | { | 1387 | { |
1388 | return __get_vm_area_node(size, 1, flags, start, end, -1, GFP_KERNEL, | 1388 | return __get_vm_area_node(size, 1, flags, start, end, NUMA_NO_NODE, |
1389 | caller); | 1389 | GFP_KERNEL, caller); |
1390 | } | 1390 | } |
1391 | 1391 | ||
1392 | /** | 1392 | /** |
@@ -1401,14 +1401,15 @@ struct vm_struct *__get_vm_area_caller(unsigned long size, unsigned long flags, | |||
1401 | struct vm_struct *get_vm_area(unsigned long size, unsigned long flags) | 1401 | struct vm_struct *get_vm_area(unsigned long size, unsigned long flags) |
1402 | { | 1402 | { |
1403 | return __get_vm_area_node(size, 1, flags, VMALLOC_START, VMALLOC_END, | 1403 | return __get_vm_area_node(size, 1, flags, VMALLOC_START, VMALLOC_END, |
1404 | -1, GFP_KERNEL, __builtin_return_address(0)); | 1404 | NUMA_NO_NODE, GFP_KERNEL, |
1405 | __builtin_return_address(0)); | ||
1405 | } | 1406 | } |
1406 | 1407 | ||
1407 | struct vm_struct *get_vm_area_caller(unsigned long size, unsigned long flags, | 1408 | struct vm_struct *get_vm_area_caller(unsigned long size, unsigned long flags, |
1408 | const void *caller) | 1409 | const void *caller) |
1409 | { | 1410 | { |
1410 | return __get_vm_area_node(size, 1, flags, VMALLOC_START, VMALLOC_END, | 1411 | return __get_vm_area_node(size, 1, flags, VMALLOC_START, VMALLOC_END, |
1411 | -1, GFP_KERNEL, caller); | 1412 | NUMA_NO_NODE, GFP_KERNEL, caller); |
1412 | } | 1413 | } |
1413 | 1414 | ||
1414 | /** | 1415 | /** |
@@ -1650,7 +1651,7 @@ fail: | |||
1650 | * @end: vm area range end | 1651 | * @end: vm area range end |
1651 | * @gfp_mask: flags for the page level allocator | 1652 | * @gfp_mask: flags for the page level allocator |
1652 | * @prot: protection mask for the allocated pages | 1653 | * @prot: protection mask for the allocated pages |
1653 | * @node: node to use for allocation or -1 | 1654 | * @node: node to use for allocation or NUMA_NO_NODE |
1654 | * @caller: caller's return address | 1655 | * @caller: caller's return address |
1655 | * | 1656 | * |
1656 | * Allocate enough pages to cover @size from the page level | 1657 | * Allocate enough pages to cover @size from the page level |
@@ -1706,7 +1707,7 @@ fail: | |||
1706 | * @align: desired alignment | 1707 | * @align: desired alignment |
1707 | * @gfp_mask: flags for the page level allocator | 1708 | * @gfp_mask: flags for the page level allocator |
1708 | * @prot: protection mask for the allocated pages | 1709 | * @prot: protection mask for the allocated pages |
1709 | * @node: node to use for allocation or -1 | 1710 | * @node: node to use for allocation or NUMA_NO_NODE |
1710 | * @caller: caller's return address | 1711 | * @caller: caller's return address |
1711 | * | 1712 | * |
1712 | * Allocate enough pages to cover @size from the page level | 1713 | * Allocate enough pages to cover @size from the page level |
@@ -1723,7 +1724,7 @@ static void *__vmalloc_node(unsigned long size, unsigned long align, | |||
1723 | 1724 | ||
1724 | void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot) | 1725 | void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot) |
1725 | { | 1726 | { |
1726 | return __vmalloc_node(size, 1, gfp_mask, prot, -1, | 1727 | return __vmalloc_node(size, 1, gfp_mask, prot, NUMA_NO_NODE, |
1727 | __builtin_return_address(0)); | 1728 | __builtin_return_address(0)); |
1728 | } | 1729 | } |
1729 | EXPORT_SYMBOL(__vmalloc); | 1730 | EXPORT_SYMBOL(__vmalloc); |
@@ -1746,7 +1747,8 @@ static inline void *__vmalloc_node_flags(unsigned long size, | |||
1746 | */ | 1747 | */ |
1747 | void *vmalloc(unsigned long size) | 1748 | void *vmalloc(unsigned long size) |
1748 | { | 1749 | { |
1749 | return __vmalloc_node_flags(size, -1, GFP_KERNEL | __GFP_HIGHMEM); | 1750 | return __vmalloc_node_flags(size, NUMA_NO_NODE, |
1751 | GFP_KERNEL | __GFP_HIGHMEM); | ||
1750 | } | 1752 | } |
1751 | EXPORT_SYMBOL(vmalloc); | 1753 | EXPORT_SYMBOL(vmalloc); |
1752 | 1754 | ||
@@ -1762,7 +1764,7 @@ EXPORT_SYMBOL(vmalloc); | |||
1762 | */ | 1764 | */ |
1763 | void *vzalloc(unsigned long size) | 1765 | void *vzalloc(unsigned long size) |
1764 | { | 1766 | { |
1765 | return __vmalloc_node_flags(size, -1, | 1767 | return __vmalloc_node_flags(size, NUMA_NO_NODE, |
1766 | GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO); | 1768 | GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO); |
1767 | } | 1769 | } |
1768 | EXPORT_SYMBOL(vzalloc); | 1770 | EXPORT_SYMBOL(vzalloc); |
@@ -1781,7 +1783,8 @@ void *vmalloc_user(unsigned long size) | |||
1781 | 1783 | ||
1782 | ret = __vmalloc_node(size, SHMLBA, | 1784 | ret = __vmalloc_node(size, SHMLBA, |
1783 | GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO, | 1785 | GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO, |
1784 | PAGE_KERNEL, -1, __builtin_return_address(0)); | 1786 | PAGE_KERNEL, NUMA_NO_NODE, |
1787 | __builtin_return_address(0)); | ||
1785 | if (ret) { | 1788 | if (ret) { |
1786 | area = find_vm_area(ret); | 1789 | area = find_vm_area(ret); |
1787 | area->flags |= VM_USERMAP; | 1790 | area->flags |= VM_USERMAP; |
@@ -1846,7 +1849,7 @@ EXPORT_SYMBOL(vzalloc_node); | |||
1846 | void *vmalloc_exec(unsigned long size) | 1849 | void *vmalloc_exec(unsigned long size) |
1847 | { | 1850 | { |
1848 | return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL_EXEC, | 1851 | return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL_EXEC, |
1849 | -1, __builtin_return_address(0)); | 1852 | NUMA_NO_NODE, __builtin_return_address(0)); |
1850 | } | 1853 | } |
1851 | 1854 | ||
1852 | #if defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA32) | 1855 | #if defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA32) |
@@ -1867,7 +1870,7 @@ void *vmalloc_exec(unsigned long size) | |||
1867 | void *vmalloc_32(unsigned long size) | 1870 | void *vmalloc_32(unsigned long size) |
1868 | { | 1871 | { |
1869 | return __vmalloc_node(size, 1, GFP_VMALLOC32, PAGE_KERNEL, | 1872 | return __vmalloc_node(size, 1, GFP_VMALLOC32, PAGE_KERNEL, |
1870 | -1, __builtin_return_address(0)); | 1873 | NUMA_NO_NODE, __builtin_return_address(0)); |
1871 | } | 1874 | } |
1872 | EXPORT_SYMBOL(vmalloc_32); | 1875 | EXPORT_SYMBOL(vmalloc_32); |
1873 | 1876 | ||
@@ -1884,7 +1887,7 @@ void *vmalloc_32_user(unsigned long size) | |||
1884 | void *ret; | 1887 | void *ret; |
1885 | 1888 | ||
1886 | ret = __vmalloc_node(size, 1, GFP_VMALLOC32 | __GFP_ZERO, PAGE_KERNEL, | 1889 | ret = __vmalloc_node(size, 1, GFP_VMALLOC32 | __GFP_ZERO, PAGE_KERNEL, |
1887 | -1, __builtin_return_address(0)); | 1890 | NUMA_NO_NODE, __builtin_return_address(0)); |
1888 | if (ret) { | 1891 | if (ret) { |
1889 | area = find_vm_area(ret); | 1892 | area = find_vm_area(ret); |
1890 | area->flags |= VM_USERMAP; | 1893 | area->flags |= VM_USERMAP; |
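All of the vmalloc.c hunks above are one mechanical cleanup: the literal -1 meaning "no preferred NUMA node" becomes the named constant NUMA_NO_NODE. A hedged sketch of the idea with an invented allocator (only the sentinel value matches the kernel):

#include <stdlib.h>

#define NUMA_NO_NODE (-1)

static void *alloc_on_node(size_t size, int node)
{
	if (node == NUMA_NO_NODE)       /* no preference: any node will do */
		return malloc(size);
	/* a real implementation would bind the allocation to 'node' here */
	return malloc(size);
}

void *alloc_anywhere(size_t size)
{
	return alloc_on_node(size, NUMA_NO_NODE);
}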
diff --git a/mm/vmscan.c b/mm/vmscan.c index 196709f5ee58..88c5fed8b9a4 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c | |||
@@ -128,7 +128,7 @@ struct scan_control { | |||
128 | * From 0 .. 100. Higher means more swappy. | 128 | * From 0 .. 100. Higher means more swappy. |
129 | */ | 129 | */ |
130 | int vm_swappiness = 60; | 130 | int vm_swappiness = 60; |
131 | long vm_total_pages; /* The total number of pages which the VM controls */ | 131 | unsigned long vm_total_pages; /* The total number of pages which the VM controls */ |
132 | 132 | ||
133 | static LIST_HEAD(shrinker_list); | 133 | static LIST_HEAD(shrinker_list); |
134 | static DECLARE_RWSEM(shrinker_rwsem); | 134 | static DECLARE_RWSEM(shrinker_rwsem); |
@@ -1579,16 +1579,6 @@ static inline int inactive_anon_is_low(struct lruvec *lruvec) | |||
1579 | } | 1579 | } |
1580 | #endif | 1580 | #endif |
1581 | 1581 | ||
1582 | static int inactive_file_is_low_global(struct zone *zone) | ||
1583 | { | ||
1584 | unsigned long active, inactive; | ||
1585 | |||
1586 | active = zone_page_state(zone, NR_ACTIVE_FILE); | ||
1587 | inactive = zone_page_state(zone, NR_INACTIVE_FILE); | ||
1588 | |||
1589 | return (active > inactive); | ||
1590 | } | ||
1591 | |||
1592 | /** | 1582 | /** |
1593 | * inactive_file_is_low - check if file pages need to be deactivated | 1583 | * inactive_file_is_low - check if file pages need to be deactivated |
1594 | * @lruvec: LRU vector to check | 1584 | * @lruvec: LRU vector to check |
@@ -1605,10 +1595,13 @@ static int inactive_file_is_low_global(struct zone *zone) | |||
1605 | */ | 1595 | */ |
1606 | static int inactive_file_is_low(struct lruvec *lruvec) | 1596 | static int inactive_file_is_low(struct lruvec *lruvec) |
1607 | { | 1597 | { |
1608 | if (!mem_cgroup_disabled()) | 1598 | unsigned long inactive; |
1609 | return mem_cgroup_inactive_file_is_low(lruvec); | 1599 | unsigned long active; |
1600 | |||
1601 | inactive = get_lru_size(lruvec, LRU_INACTIVE_FILE); | ||
1602 | active = get_lru_size(lruvec, LRU_ACTIVE_FILE); | ||
1610 | 1603 | ||
1611 | return inactive_file_is_low_global(lruvec_zone(lruvec)); | 1604 | return active > inactive; |
1612 | } | 1605 | } |
1613 | 1606 | ||
1614 | static int inactive_list_is_low(struct lruvec *lruvec, enum lru_list lru) | 1607 | static int inactive_list_is_low(struct lruvec *lruvec, enum lru_list lru) |
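With the memcg and global variants folded together, inactive_file_is_low() is now a plain size comparison between the two file LRUs of the lruvec. Reduced to its essentials (stand-in function, not kernel code):

/* Hedged sketch: deactivating file pages is only worthwhile while the
 * active file list is still larger than the inactive one. */
static int inactive_file_is_low_sketch(unsigned long active_file,
				       unsigned long inactive_file)
{
	return active_file > inactive_file;
}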
@@ -1638,6 +1631,13 @@ static int vmscan_swappiness(struct scan_control *sc) | |||
1638 | return mem_cgroup_swappiness(sc->target_mem_cgroup); | 1631 | return mem_cgroup_swappiness(sc->target_mem_cgroup); |
1639 | } | 1632 | } |
1640 | 1633 | ||
1634 | enum scan_balance { | ||
1635 | SCAN_EQUAL, | ||
1636 | SCAN_FRACT, | ||
1637 | SCAN_ANON, | ||
1638 | SCAN_FILE, | ||
1639 | }; | ||
1640 | |||
1641 | /* | 1641 | /* |
1642 | * Determine how aggressively the anon and file LRU lists should be | 1642 | * Determine how aggressively the anon and file LRU lists should be |
1643 | * scanned. The relative value of each set of LRU lists is determined | 1643 | * scanned. The relative value of each set of LRU lists is determined |
@@ -1650,15 +1650,16 @@ static int vmscan_swappiness(struct scan_control *sc) | |||
1650 | static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, | 1650 | static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, |
1651 | unsigned long *nr) | 1651 | unsigned long *nr) |
1652 | { | 1652 | { |
1653 | unsigned long anon, file, free; | 1653 | struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat; |
1654 | u64 fraction[2]; | ||
1655 | u64 denominator = 0; /* gcc */ | ||
1656 | struct zone *zone = lruvec_zone(lruvec); | ||
1654 | unsigned long anon_prio, file_prio; | 1657 | unsigned long anon_prio, file_prio; |
1658 | enum scan_balance scan_balance; | ||
1659 | unsigned long anon, file, free; | ||
1660 | bool force_scan = false; | ||
1655 | unsigned long ap, fp; | 1661 | unsigned long ap, fp; |
1656 | struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat; | ||
1657 | u64 fraction[2], denominator; | ||
1658 | enum lru_list lru; | 1662 | enum lru_list lru; |
1659 | int noswap = 0; | ||
1660 | bool force_scan = false; | ||
1661 | struct zone *zone = lruvec_zone(lruvec); | ||
1662 | 1663 | ||
1663 | /* | 1664 | /* |
1664 | * If the zone or memcg is small, nr[l] can be 0. This | 1665 | * If the zone or memcg is small, nr[l] can be 0. This |
@@ -1676,11 +1677,30 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, | |||
1676 | force_scan = true; | 1677 | force_scan = true; |
1677 | 1678 | ||
1678 | /* If we have no swap space, do not bother scanning anon pages. */ | 1679 | /* If we have no swap space, do not bother scanning anon pages. */ |
1679 | if (!sc->may_swap || (nr_swap_pages <= 0)) { | 1680 | if (!sc->may_swap || (get_nr_swap_pages() <= 0)) { |
1680 | noswap = 1; | 1681 | scan_balance = SCAN_FILE; |
1681 | fraction[0] = 0; | 1682 | goto out; |
1682 | fraction[1] = 1; | 1683 | } |
1683 | denominator = 1; | 1684 | |
1685 | /* | ||
1686 | * Global reclaim will swap to prevent OOM even with no | ||
1687 | * swappiness, but memcg users want to use this knob to | ||
1688 | * disable swapping for individual groups completely when | ||
1689 | * using the memory controller's swap limit feature would be | ||
1690 | * too expensive. | ||
1691 | */ | ||
1692 | if (!global_reclaim(sc) && !vmscan_swappiness(sc)) { | ||
1693 | scan_balance = SCAN_FILE; | ||
1694 | goto out; | ||
1695 | } | ||
1696 | |||
1697 | /* | ||
1698 | * Do not apply any pressure balancing cleverness when the | ||
1699 | * system is close to OOM, scan both anon and file equally | ||
1700 | * (unless the swappiness setting disagrees with swapping). | ||
1701 | */ | ||
1702 | if (!sc->priority && vmscan_swappiness(sc)) { | ||
1703 | scan_balance = SCAN_EQUAL; | ||
1684 | goto out; | 1704 | goto out; |
1685 | } | 1705 | } |
1686 | 1706 | ||
@@ -1689,30 +1709,32 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, | |||
1689 | file = get_lru_size(lruvec, LRU_ACTIVE_FILE) + | 1709 | file = get_lru_size(lruvec, LRU_ACTIVE_FILE) + |
1690 | get_lru_size(lruvec, LRU_INACTIVE_FILE); | 1710 | get_lru_size(lruvec, LRU_INACTIVE_FILE); |
1691 | 1711 | ||
1712 | /* | ||
1713 | * If it's foreseeable that reclaiming the file cache won't be | ||
1714 | * enough to get the zone back into a desirable shape, we have | ||
1715 | * to swap. Better start now and leave the - probably heavily | ||
1716 | * thrashing - remaining file pages alone. | ||
1717 | */ | ||
1692 | if (global_reclaim(sc)) { | 1718 | if (global_reclaim(sc)) { |
1693 | free = zone_page_state(zone, NR_FREE_PAGES); | 1719 | free = zone_page_state(zone, NR_FREE_PAGES); |
1694 | if (unlikely(file + free <= high_wmark_pages(zone))) { | 1720 | if (unlikely(file + free <= high_wmark_pages(zone))) { |
1695 | /* | 1721 | scan_balance = SCAN_ANON; |
1696 | * If we have very few page cache pages, force-scan | ||
1697 | * anon pages. | ||
1698 | */ | ||
1699 | fraction[0] = 1; | ||
1700 | fraction[1] = 0; | ||
1701 | denominator = 1; | ||
1702 | goto out; | ||
1703 | } else if (!inactive_file_is_low_global(zone)) { | ||
1704 | /* | ||
1705 | * There is enough inactive page cache, do not | ||
1706 | * reclaim anything from the working set right now. | ||
1707 | */ | ||
1708 | fraction[0] = 0; | ||
1709 | fraction[1] = 1; | ||
1710 | denominator = 1; | ||
1711 | goto out; | 1722 | goto out; |
1712 | } | 1723 | } |
1713 | } | 1724 | } |
1714 | 1725 | ||
1715 | /* | 1726 | /* |
1727 | * There is enough inactive page cache, do not reclaim | ||
1728 | * anything from the anonymous working set right now. | ||
1729 | */ | ||
1730 | if (!inactive_file_is_low(lruvec)) { | ||
1731 | scan_balance = SCAN_FILE; | ||
1732 | goto out; | ||
1733 | } | ||
1734 | |||
1735 | scan_balance = SCAN_FRACT; | ||
1736 | |||
1737 | /* | ||
1716 | * With swappiness at 100, anonymous and file have the same priority. | 1738 | * With swappiness at 100, anonymous and file have the same priority. |
1717 | * This scanning priority is essentially the inverse of IO cost. | 1739 | * This scanning priority is essentially the inverse of IO cost. |
1718 | */ | 1740 | */ |
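Taken together, the get_scan_count() hunks replace ad-hoc fraction juggling with an explicit policy: SCAN_FILE when there is no usable swap or a memcg has swappiness 0, SCAN_EQUAL near OOM, SCAN_ANON when file plus free pages cannot reach the high watermark, SCAN_FILE again while the inactive file list is still large, and SCAN_FRACT otherwise. A hedged sketch of that decision order; the enum names follow the patch, the flattened parameters are invented:

enum scan_balance { SCAN_EQUAL, SCAN_FRACT, SCAN_ANON, SCAN_FILE };

static enum scan_balance pick_balance(int may_swap, long nr_swap_pages,
				      int global_reclaim, int swappiness,
				      int priority, unsigned long file,
				      unsigned long free_pages,
				      unsigned long high_wmark,
				      int inactive_file_low)
{
	if (!may_swap || nr_swap_pages <= 0)
		return SCAN_FILE;               /* nothing to swap to */
	if (!global_reclaim && !swappiness)
		return SCAN_FILE;               /* memcg opted out of swap */
	if (!priority && swappiness)
		return SCAN_EQUAL;              /* close to OOM: scan both */
	if (global_reclaim && file + free_pages <= high_wmark)
		return SCAN_ANON;               /* file cache alone can't help */
	if (!inactive_file_low)
		return SCAN_FILE;               /* plenty of inactive cache left */
	return SCAN_FRACT;                      /* balance by swappiness */
}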
@@ -1759,19 +1781,92 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, | |||
1759 | out: | 1781 | out: |
1760 | for_each_evictable_lru(lru) { | 1782 | for_each_evictable_lru(lru) { |
1761 | int file = is_file_lru(lru); | 1783 | int file = is_file_lru(lru); |
1784 | unsigned long size; | ||
1762 | unsigned long scan; | 1785 | unsigned long scan; |
1763 | 1786 | ||
1764 | scan = get_lru_size(lruvec, lru); | 1787 | size = get_lru_size(lruvec, lru); |
1765 | if (sc->priority || noswap || !vmscan_swappiness(sc)) { | 1788 | scan = size >> sc->priority; |
1766 | scan >>= sc->priority; | 1789 | |
1767 | if (!scan && force_scan) | 1790 | if (!scan && force_scan) |
1768 | scan = SWAP_CLUSTER_MAX; | 1791 | scan = min(size, SWAP_CLUSTER_MAX); |
1792 | |||
1793 | switch (scan_balance) { | ||
1794 | case SCAN_EQUAL: | ||
1795 | /* Scan lists relative to size */ | ||
1796 | break; | ||
1797 | case SCAN_FRACT: | ||
1798 | /* | ||
1799 | * Scan types proportional to swappiness and | ||
1800 | * their relative recent reclaim efficiency. | ||
1801 | */ | ||
1769 | scan = div64_u64(scan * fraction[file], denominator); | 1802 | scan = div64_u64(scan * fraction[file], denominator); |
1803 | break; | ||
1804 | case SCAN_FILE: | ||
1805 | case SCAN_ANON: | ||
1806 | /* Scan one type exclusively */ | ||
1807 | if ((scan_balance == SCAN_FILE) != file) | ||
1808 | scan = 0; | ||
1809 | break; | ||
1810 | default: | ||
1811 | /* Look ma, no brain */ | ||
1812 | BUG(); | ||
1770 | } | 1813 | } |
1771 | nr[lru] = scan; | 1814 | nr[lru] = scan; |
1772 | } | 1815 | } |
1773 | } | 1816 | } |
1774 | 1817 | ||
1818 | /* | ||
1819 | * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. | ||
1820 | */ | ||
1821 | static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) | ||
1822 | { | ||
1823 | unsigned long nr[NR_LRU_LISTS]; | ||
1824 | unsigned long nr_to_scan; | ||
1825 | enum lru_list lru; | ||
1826 | unsigned long nr_reclaimed = 0; | ||
1827 | unsigned long nr_to_reclaim = sc->nr_to_reclaim; | ||
1828 | struct blk_plug plug; | ||
1829 | |||
1830 | get_scan_count(lruvec, sc, nr); | ||
1831 | |||
1832 | blk_start_plug(&plug); | ||
1833 | while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || | ||
1834 | nr[LRU_INACTIVE_FILE]) { | ||
1835 | for_each_evictable_lru(lru) { | ||
1836 | if (nr[lru]) { | ||
1837 | nr_to_scan = min(nr[lru], SWAP_CLUSTER_MAX); | ||
1838 | nr[lru] -= nr_to_scan; | ||
1839 | |||
1840 | nr_reclaimed += shrink_list(lru, nr_to_scan, | ||
1841 | lruvec, sc); | ||
1842 | } | ||
1843 | } | ||
1844 | /* | ||
1845 | * On large memory systems, scan >> priority can become | ||
1846 | * really large. This is fine for the starting priority; | ||
1847 | * we want to put equal scanning pressure on each zone. | ||
1848 | * However, if the VM has a harder time of freeing pages, | ||
1849 | * with multiple processes reclaiming pages, the total | ||
1850 | * freeing target can get unreasonably large. | ||
1851 | */ | ||
1852 | if (nr_reclaimed >= nr_to_reclaim && | ||
1853 | sc->priority < DEF_PRIORITY) | ||
1854 | break; | ||
1855 | } | ||
1856 | blk_finish_plug(&plug); | ||
1857 | sc->nr_reclaimed += nr_reclaimed; | ||
1858 | |||
1859 | /* | ||
1860 | * Even if we did not try to evict anon pages at all, we want to | ||
1861 | * rebalance the anon lru active/inactive ratio. | ||
1862 | */ | ||
1863 | if (inactive_anon_is_low(lruvec)) | ||
1864 | shrink_active_list(SWAP_CLUSTER_MAX, lruvec, | ||
1865 | sc, LRU_ACTIVE_ANON); | ||
1866 | |||
1867 | throttle_vm_writeout(sc->gfp_mask); | ||
1868 | } | ||
1869 | |||
1775 | /* Use reclaim/compaction for costly allocs or under memory pressure */ | 1870 | /* Use reclaim/compaction for costly allocs or under memory pressure */ |
1776 | static bool in_reclaim_compaction(struct scan_control *sc) | 1871 | static bool in_reclaim_compaction(struct scan_control *sc) |
1777 | { | 1872 | { |
@@ -1790,7 +1885,7 @@ static bool in_reclaim_compaction(struct scan_control *sc) | |||
1790 | * calls try_to_compact_zone() that it will have enough free pages to succeed. | 1885 | * calls try_to_compact_zone() that it will have enough free pages to succeed. |
1791 | * It will give up earlier than that if there is difficulty reclaiming pages. | 1886 | * It will give up earlier than that if there is difficulty reclaiming pages. |
1792 | */ | 1887 | */ |
1793 | static inline bool should_continue_reclaim(struct lruvec *lruvec, | 1888 | static inline bool should_continue_reclaim(struct zone *zone, |
1794 | unsigned long nr_reclaimed, | 1889 | unsigned long nr_reclaimed, |
1795 | unsigned long nr_scanned, | 1890 | unsigned long nr_scanned, |
1796 | struct scan_control *sc) | 1891 | struct scan_control *sc) |
@@ -1830,15 +1925,15 @@ static inline bool should_continue_reclaim(struct lruvec *lruvec, | |||
1830 | * inactive lists are large enough, continue reclaiming | 1925 | * inactive lists are large enough, continue reclaiming |
1831 | */ | 1926 | */ |
1832 | pages_for_compaction = (2UL << sc->order); | 1927 | pages_for_compaction = (2UL << sc->order); |
1833 | inactive_lru_pages = get_lru_size(lruvec, LRU_INACTIVE_FILE); | 1928 | inactive_lru_pages = zone_page_state(zone, NR_INACTIVE_FILE); |
1834 | if (nr_swap_pages > 0) | 1929 | if (get_nr_swap_pages() > 0) |
1835 | inactive_lru_pages += get_lru_size(lruvec, LRU_INACTIVE_ANON); | 1930 | inactive_lru_pages += zone_page_state(zone, NR_INACTIVE_ANON); |
1836 | if (sc->nr_reclaimed < pages_for_compaction && | 1931 | if (sc->nr_reclaimed < pages_for_compaction && |
1837 | inactive_lru_pages > pages_for_compaction) | 1932 | inactive_lru_pages > pages_for_compaction) |
1838 | return true; | 1933 | return true; |
1839 | 1934 | ||
1840 | /* If compaction would go ahead or the allocation would succeed, stop */ | 1935 | /* If compaction would go ahead or the allocation would succeed, stop */ |
1841 | switch (compaction_suitable(lruvec_zone(lruvec), sc->order)) { | 1936 | switch (compaction_suitable(zone, sc->order)) { |
1842 | case COMPACT_PARTIAL: | 1937 | case COMPACT_PARTIAL: |
1843 | case COMPACT_CONTINUE: | 1938 | case COMPACT_CONTINUE: |
1844 | return false; | 1939 | return false; |
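should_continue_reclaim() now works on the zone directly: keep going while fewer than 2 << order pages have been reclaimed and the inactive lists (file, plus anon only when swap is available) still hold more than that, and stop once compaction could already proceed. The core threshold check, sketched with the compaction result reduced to a boolean:

/* Hedged sketch of the continue/stop test; names mirror the patch. */
static int keep_reclaiming(int order, unsigned long nr_reclaimed,
			   unsigned long inactive_file,
			   unsigned long inactive_anon,
			   long nr_swap_pages, int compaction_ready)
{
	unsigned long pages_for_compaction = 2UL << order;
	unsigned long inactive = inactive_file;

	if (nr_swap_pages > 0)
		inactive += inactive_anon;      /* anon only counts if swappable */

	if (nr_reclaimed < pages_for_compaction &&
	    inactive > pages_for_compaction)
		return 1;                       /* not enough progress yet */

	return !compaction_ready;               /* stop once compaction can run */
}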
@@ -1847,98 +1942,48 @@ static inline bool should_continue_reclaim(struct lruvec *lruvec, | |||
1847 | } | 1942 | } |
1848 | } | 1943 | } |
1849 | 1944 | ||
1850 | /* | 1945 | static void shrink_zone(struct zone *zone, struct scan_control *sc) |
1851 | * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. | ||
1852 | */ | ||
1853 | static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) | ||
1854 | { | 1946 | { |
1855 | unsigned long nr[NR_LRU_LISTS]; | ||
1856 | unsigned long nr_to_scan; | ||
1857 | enum lru_list lru; | ||
1858 | unsigned long nr_reclaimed, nr_scanned; | 1947 | unsigned long nr_reclaimed, nr_scanned; |
1859 | unsigned long nr_to_reclaim = sc->nr_to_reclaim; | ||
1860 | struct blk_plug plug; | ||
1861 | |||
1862 | restart: | ||
1863 | nr_reclaimed = 0; | ||
1864 | nr_scanned = sc->nr_scanned; | ||
1865 | get_scan_count(lruvec, sc, nr); | ||
1866 | |||
1867 | blk_start_plug(&plug); | ||
1868 | while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || | ||
1869 | nr[LRU_INACTIVE_FILE]) { | ||
1870 | for_each_evictable_lru(lru) { | ||
1871 | if (nr[lru]) { | ||
1872 | nr_to_scan = min_t(unsigned long, | ||
1873 | nr[lru], SWAP_CLUSTER_MAX); | ||
1874 | nr[lru] -= nr_to_scan; | ||
1875 | |||
1876 | nr_reclaimed += shrink_list(lru, nr_to_scan, | ||
1877 | lruvec, sc); | ||
1878 | } | ||
1879 | } | ||
1880 | /* | ||
1881 | * On large memory systems, scan >> priority can become | ||
1882 | * really large. This is fine for the starting priority; | ||
1883 | * we want to put equal scanning pressure on each zone. | ||
1884 | * However, if the VM has a harder time of freeing pages, | ||
1885 | * with multiple processes reclaiming pages, the total | ||
1886 | * freeing target can get unreasonably large. | ||
1887 | */ | ||
1888 | if (nr_reclaimed >= nr_to_reclaim && | ||
1889 | sc->priority < DEF_PRIORITY) | ||
1890 | break; | ||
1891 | } | ||
1892 | blk_finish_plug(&plug); | ||
1893 | sc->nr_reclaimed += nr_reclaimed; | ||
1894 | 1948 | ||
1895 | /* | 1949 | do { |
1896 | * Even if we did not try to evict anon pages at all, we want to | 1950 | struct mem_cgroup *root = sc->target_mem_cgroup; |
1897 | * rebalance the anon lru active/inactive ratio. | 1951 | struct mem_cgroup_reclaim_cookie reclaim = { |
1898 | */ | 1952 | .zone = zone, |
1899 | if (inactive_anon_is_low(lruvec)) | 1953 | .priority = sc->priority, |
1900 | shrink_active_list(SWAP_CLUSTER_MAX, lruvec, | 1954 | }; |
1901 | sc, LRU_ACTIVE_ANON); | 1955 | struct mem_cgroup *memcg; |
1902 | |||
1903 | /* reclaim/compaction might need reclaim to continue */ | ||
1904 | if (should_continue_reclaim(lruvec, nr_reclaimed, | ||
1905 | sc->nr_scanned - nr_scanned, sc)) | ||
1906 | goto restart; | ||
1907 | 1956 | ||
1908 | throttle_vm_writeout(sc->gfp_mask); | 1957 | nr_reclaimed = sc->nr_reclaimed; |
1909 | } | 1958 | nr_scanned = sc->nr_scanned; |
1910 | 1959 | ||
1911 | static void shrink_zone(struct zone *zone, struct scan_control *sc) | 1960 | memcg = mem_cgroup_iter(root, NULL, &reclaim); |
1912 | { | 1961 | do { |
1913 | struct mem_cgroup *root = sc->target_mem_cgroup; | 1962 | struct lruvec *lruvec; |
1914 | struct mem_cgroup_reclaim_cookie reclaim = { | ||
1915 | .zone = zone, | ||
1916 | .priority = sc->priority, | ||
1917 | }; | ||
1918 | struct mem_cgroup *memcg; | ||
1919 | 1963 | ||
1920 | memcg = mem_cgroup_iter(root, NULL, &reclaim); | 1964 | lruvec = mem_cgroup_zone_lruvec(zone, memcg); |
1921 | do { | ||
1922 | struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg); | ||
1923 | 1965 | ||
1924 | shrink_lruvec(lruvec, sc); | 1966 | shrink_lruvec(lruvec, sc); |
1925 | 1967 | ||
1926 | /* | 1968 | /* |
1927 | * Limit reclaim has historically picked one memcg and | 1969 | * Direct reclaim and kswapd have to scan all memory |
1928 | * scanned it with decreasing priority levels until | 1970 | * cgroups to fulfill the overall scan target for the |
1929 | * nr_to_reclaim had been reclaimed. This priority | 1971 | * zone. |
1930 | * cycle is thus over after a single memcg. | 1972 | * |
1931 | * | 1973 | * Limit reclaim, on the other hand, only cares about |
1932 | * Direct reclaim and kswapd, on the other hand, have | 1974 | * nr_to_reclaim pages to be reclaimed and it will |
1933 | * to scan all memory cgroups to fulfill the overall | 1975 | * retry with decreasing priority if one round over the |
1934 | * scan target for the zone. | 1976 | * whole hierarchy is not sufficient. |
1935 | */ | 1977 | */ |
1936 | if (!global_reclaim(sc)) { | 1978 | if (!global_reclaim(sc) && |
1937 | mem_cgroup_iter_break(root, memcg); | 1979 | sc->nr_reclaimed >= sc->nr_to_reclaim) { |
1938 | break; | 1980 | mem_cgroup_iter_break(root, memcg); |
1939 | } | 1981 | break; |
1940 | memcg = mem_cgroup_iter(root, memcg, &reclaim); | 1982 | } |
1941 | } while (memcg); | 1983 | memcg = mem_cgroup_iter(root, memcg, &reclaim); |
1984 | } while (memcg); | ||
1985 | } while (should_continue_reclaim(zone, sc->nr_reclaimed - nr_reclaimed, | ||
1986 | sc->nr_scanned - nr_scanned, sc)); | ||
1942 | } | 1987 | } |
1943 | 1988 | ||
1944 | /* Returns true if compaction should go ahead for a high-order request */ | 1989 | /* Returns true if compaction should go ahead for a high-order request */ |
@@ -1958,7 +2003,7 @@ static inline bool compaction_ready(struct zone *zone, struct scan_control *sc) | |||
1958 | * a reasonable chance of completing and allocating the page | 2003 | * a reasonable chance of completing and allocating the page |
1959 | */ | 2004 | */ |
1960 | balance_gap = min(low_wmark_pages(zone), | 2005 | balance_gap = min(low_wmark_pages(zone), |
1961 | (zone->present_pages + KSWAPD_ZONE_BALANCE_GAP_RATIO-1) / | 2006 | (zone->managed_pages + KSWAPD_ZONE_BALANCE_GAP_RATIO-1) / |
1962 | KSWAPD_ZONE_BALANCE_GAP_RATIO); | 2007 | KSWAPD_ZONE_BALANCE_GAP_RATIO); |
1963 | watermark = high_wmark_pages(zone) + balance_gap + (2UL << sc->order); | 2008 | watermark = high_wmark_pages(zone) + balance_gap + (2UL << sc->order); |
1964 | watermark_ok = zone_watermark_ok_safe(zone, 0, watermark, 0, 0); | 2009 | watermark_ok = zone_watermark_ok_safe(zone, 0, watermark, 0, 0); |
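compaction_ready() (and the same expression later in balance_pgdat()) now sizes balance_gap from managed_pages rather than present_pages: the smaller of the low watermark and managed_pages/KSWAPD_ZONE_BALANCE_GAP_RATIO, rounded up. A worked example of that arithmetic, assuming the ratio is 100 as in this kernel and using invented zone numbers:

#include <stdio.h>

#define KSWAPD_ZONE_BALANCE_GAP_RATIO 100

int main(void)
{
	unsigned long managed_pages = 262144;   /* ~1GB of 4K pages (invented) */
	unsigned long low_wmark = 1500;         /* invented low watermark */

	unsigned long gap_from_size =
		(managed_pages + KSWAPD_ZONE_BALANCE_GAP_RATIO - 1) /
		KSWAPD_ZONE_BALANCE_GAP_RATIO;  /* 2622: 1% of the zone, rounded up */
	unsigned long balance_gap =
		low_wmark < gap_from_size ? low_wmark : gap_from_size;

	printf("balance_gap = %lu pages\n", balance_gap);       /* 1500 */
	return 0;
}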
@@ -2150,6 +2195,13 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, | |||
2150 | goto out; | 2195 | goto out; |
2151 | 2196 | ||
2152 | /* | 2197 | /* |
2198 | * If we're getting trouble reclaiming, start doing | ||
2199 | * writepage even in laptop mode. | ||
2200 | */ | ||
2201 | if (sc->priority < DEF_PRIORITY - 2) | ||
2202 | sc->may_writepage = 1; | ||
2203 | |||
2204 | /* | ||
2153 | * Try to write back as many pages as we just scanned. This | 2205 | * Try to write back as many pages as we just scanned. This |
2154 | * tends to cause slow streaming writers to write data to the | 2206 | * tends to cause slow streaming writers to write data to the |
2155 | * disk smoothly, at the dirtying rate, which is nice. But | 2207 | * disk smoothly, at the dirtying rate, which is nice. But |
@@ -2300,7 +2352,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order, | |||
2300 | { | 2352 | { |
2301 | unsigned long nr_reclaimed; | 2353 | unsigned long nr_reclaimed; |
2302 | struct scan_control sc = { | 2354 | struct scan_control sc = { |
2303 | .gfp_mask = gfp_mask, | 2355 | .gfp_mask = (gfp_mask = memalloc_noio_flags(gfp_mask)), |
2304 | .may_writepage = !laptop_mode, | 2356 | .may_writepage = !laptop_mode, |
2305 | .nr_to_reclaim = SWAP_CLUSTER_MAX, | 2357 | .nr_to_reclaim = SWAP_CLUSTER_MAX, |
2306 | .may_unmap = 1, | 2358 | .may_unmap = 1, |
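The ".gfp_mask = (gfp_mask = memalloc_noio_flags(gfp_mask))" idiom both initializes the scan_control field and updates the local variable, so later users of gfp_mask see the same I/O-restricted flags. A hedged sketch of what such a masking helper does; the flag values below are invented, not the kernel's:

/* Hedged sketch: callers in a "no I/O during allocation" section get the
 * bits that would allow filesystem/block I/O stripped from their mask. */
#define SKETCH_GFP_IO 0x40u
#define SKETCH_GFP_FS 0x80u

static unsigned int sketch_noio_flags(unsigned int gfp_mask, int task_is_noio)
{
	if (task_is_noio)
		gfp_mask &= ~(SKETCH_GFP_IO | SKETCH_GFP_FS);
	return gfp_mask;
}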
@@ -2473,7 +2525,7 @@ static bool zone_balanced(struct zone *zone, int order, | |||
2473 | */ | 2525 | */ |
2474 | static bool pgdat_balanced(pg_data_t *pgdat, int order, int classzone_idx) | 2526 | static bool pgdat_balanced(pg_data_t *pgdat, int order, int classzone_idx) |
2475 | { | 2527 | { |
2476 | unsigned long present_pages = 0; | 2528 | unsigned long managed_pages = 0; |
2477 | unsigned long balanced_pages = 0; | 2529 | unsigned long balanced_pages = 0; |
2478 | int i; | 2530 | int i; |
2479 | 2531 | ||
@@ -2484,7 +2536,7 @@ static bool pgdat_balanced(pg_data_t *pgdat, int order, int classzone_idx) | |||
2484 | if (!populated_zone(zone)) | 2536 | if (!populated_zone(zone)) |
2485 | continue; | 2537 | continue; |
2486 | 2538 | ||
2487 | present_pages += zone->present_pages; | 2539 | managed_pages += zone->managed_pages; |
2488 | 2540 | ||
2489 | /* | 2541 | /* |
2490 | * A special case here: | 2542 | * A special case here: |
@@ -2494,18 +2546,18 @@ static bool pgdat_balanced(pg_data_t *pgdat, int order, int classzone_idx) | |||
2494 | * they must be considered balanced here as well! | 2546 | * they must be considered balanced here as well! |
2495 | */ | 2547 | */ |
2496 | if (zone->all_unreclaimable) { | 2548 | if (zone->all_unreclaimable) { |
2497 | balanced_pages += zone->present_pages; | 2549 | balanced_pages += zone->managed_pages; |
2498 | continue; | 2550 | continue; |
2499 | } | 2551 | } |
2500 | 2552 | ||
2501 | if (zone_balanced(zone, order, 0, i)) | 2553 | if (zone_balanced(zone, order, 0, i)) |
2502 | balanced_pages += zone->present_pages; | 2554 | balanced_pages += zone->managed_pages; |
2503 | else if (!order) | 2555 | else if (!order) |
2504 | return false; | 2556 | return false; |
2505 | } | 2557 | } |
2506 | 2558 | ||
2507 | if (order) | 2559 | if (order) |
2508 | return balanced_pages >= (present_pages >> 2); | 2560 | return balanced_pages >= (managed_pages >> 2); |
2509 | else | 2561 | else |
2510 | return true; | 2562 | return true; |
2511 | } | 2563 | } |
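pgdat_balanced() keeps its quarter rule but switches the accounting to managed_pages: for order-0 every populated zone must be balanced, while for higher orders it is enough that a quarter of the node's managed pages sit in balanced (or unreclaimable) zones. Sketched with flattened inputs; all names below are invented:

struct zone_sketch {
	unsigned long managed_pages;
	int populated;
	int all_unreclaimable;
	int balanced;           /* result of the watermark check */
};

static int pgdat_balanced_sketch(const struct zone_sketch *zones, int nr,
				 int order)
{
	unsigned long managed = 0, balanced = 0;
	int i;

	for (i = 0; i < nr; i++) {
		if (!zones[i].populated)
			continue;
		managed += zones[i].managed_pages;
		if (zones[i].all_unreclaimable || zones[i].balanced)
			balanced += zones[i].managed_pages;
		else if (!order)
			return 0;       /* order-0: one bad zone fails the node */
	}
	return order ? balanced >= (managed >> 2) : 1;
}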
@@ -2564,7 +2616,7 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining, | |||
2564 | static unsigned long balance_pgdat(pg_data_t *pgdat, int order, | 2616 | static unsigned long balance_pgdat(pg_data_t *pgdat, int order, |
2565 | int *classzone_idx) | 2617 | int *classzone_idx) |
2566 | { | 2618 | { |
2567 | struct zone *unbalanced_zone; | 2619 | bool pgdat_is_balanced = false; |
2568 | int i; | 2620 | int i; |
2569 | int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */ | 2621 | int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */ |
2570 | unsigned long total_scanned; | 2622 | unsigned long total_scanned; |
@@ -2595,9 +2647,6 @@ loop_again: | |||
2595 | 2647 | ||
2596 | do { | 2648 | do { |
2597 | unsigned long lru_pages = 0; | 2649 | unsigned long lru_pages = 0; |
2598 | int has_under_min_watermark_zone = 0; | ||
2599 | |||
2600 | unbalanced_zone = NULL; | ||
2601 | 2650 | ||
2602 | /* | 2651 | /* |
2603 | * Scan in the highmem->dma direction for the highest | 2652 | * Scan in the highmem->dma direction for the highest |
@@ -2638,8 +2687,11 @@ loop_again: | |||
2638 | zone_clear_flag(zone, ZONE_CONGESTED); | 2687 | zone_clear_flag(zone, ZONE_CONGESTED); |
2639 | } | 2688 | } |
2640 | } | 2689 | } |
2641 | if (i < 0) | 2690 | |
2691 | if (i < 0) { | ||
2692 | pgdat_is_balanced = true; | ||
2642 | goto out; | 2693 | goto out; |
2694 | } | ||
2643 | 2695 | ||
2644 | for (i = 0; i <= end_zone; i++) { | 2696 | for (i = 0; i <= end_zone; i++) { |
2645 | struct zone *zone = pgdat->node_zones + i; | 2697 | struct zone *zone = pgdat->node_zones + i; |
@@ -2689,7 +2741,7 @@ loop_again: | |||
2689 | * of the zone, whichever is smaller. | 2741 | * of the zone, whichever is smaller. |
2690 | */ | 2742 | */ |
2691 | balance_gap = min(low_wmark_pages(zone), | 2743 | balance_gap = min(low_wmark_pages(zone), |
2692 | (zone->present_pages + | 2744 | (zone->managed_pages + |
2693 | KSWAPD_ZONE_BALANCE_GAP_RATIO-1) / | 2745 | KSWAPD_ZONE_BALANCE_GAP_RATIO-1) / |
2694 | KSWAPD_ZONE_BALANCE_GAP_RATIO); | 2746 | KSWAPD_ZONE_BALANCE_GAP_RATIO); |
2695 | /* | 2747 | /* |
@@ -2720,12 +2772,10 @@ loop_again: | |||
2720 | } | 2772 | } |
2721 | 2773 | ||
2722 | /* | 2774 | /* |
2723 | * If we've done a decent amount of scanning and | 2775 | * If we're getting trouble reclaiming, start doing |
2724 | * the reclaim ratio is low, start doing writepage | 2776 | * writepage even in laptop mode. |
2725 | * even in laptop mode | ||
2726 | */ | 2777 | */ |
2727 | if (total_scanned > SWAP_CLUSTER_MAX * 2 && | 2778 | if (sc.priority < DEF_PRIORITY - 2) |
2728 | total_scanned > sc.nr_reclaimed + sc.nr_reclaimed / 2) | ||
2729 | sc.may_writepage = 1; | 2779 | sc.may_writepage = 1; |
2730 | 2780 | ||
2731 | if (zone->all_unreclaimable) { | 2781 | if (zone->all_unreclaimable) { |
@@ -2734,17 +2784,7 @@ loop_again: | |||
2734 | continue; | 2784 | continue; |
2735 | } | 2785 | } |
2736 | 2786 | ||
2737 | if (!zone_balanced(zone, testorder, 0, end_zone)) { | 2787 | if (zone_balanced(zone, testorder, 0, end_zone)) |
2738 | unbalanced_zone = zone; | ||
2739 | /* | ||
2740 | * We are still under min water mark. This | ||
2741 | * means that we have a GFP_ATOMIC allocation | ||
2742 | * failure risk. Hurry up! | ||
2743 | */ | ||
2744 | if (!zone_watermark_ok_safe(zone, order, | ||
2745 | min_wmark_pages(zone), end_zone, 0)) | ||
2746 | has_under_min_watermark_zone = 1; | ||
2747 | } else { | ||
2748 | /* | 2788 | /* |
2749 | * If a zone reaches its high watermark, | 2789 | * If a zone reaches its high watermark, |
2750 | * consider it to be no longer congested. It's | 2790 | * consider it to be no longer congested. It's |
@@ -2753,8 +2793,6 @@ loop_again: | |||
2753 | * speculatively avoid congestion waits | 2793 | * speculatively avoid congestion waits |
2754 | */ | 2794 | */ |
2755 | zone_clear_flag(zone, ZONE_CONGESTED); | 2795 | zone_clear_flag(zone, ZONE_CONGESTED); |
2756 | } | ||
2757 | |||
2758 | } | 2796 | } |
2759 | 2797 | ||
2760 | /* | 2798 | /* |
@@ -2766,17 +2804,9 @@ loop_again: | |||
2766 | pfmemalloc_watermark_ok(pgdat)) | 2804 | pfmemalloc_watermark_ok(pgdat)) |
2767 | wake_up(&pgdat->pfmemalloc_wait); | 2805 | wake_up(&pgdat->pfmemalloc_wait); |
2768 | 2806 | ||
2769 | if (pgdat_balanced(pgdat, order, *classzone_idx)) | 2807 | if (pgdat_balanced(pgdat, order, *classzone_idx)) { |
2808 | pgdat_is_balanced = true; | ||
2770 | break; /* kswapd: all done */ | 2809 | break; /* kswapd: all done */ |
2771 | /* | ||
2772 | * OK, kswapd is getting into trouble. Take a nap, then take | ||
2773 | * another pass across the zones. | ||
2774 | */ | ||
2775 | if (total_scanned && (sc.priority < DEF_PRIORITY - 2)) { | ||
2776 | if (has_under_min_watermark_zone) | ||
2777 | count_vm_event(KSWAPD_SKIP_CONGESTION_WAIT); | ||
2778 | else if (unbalanced_zone) | ||
2779 | wait_iff_congested(unbalanced_zone, BLK_RW_ASYNC, HZ/10); | ||
2780 | } | 2810 | } |
2781 | 2811 | ||
2782 | /* | 2812 | /* |
@@ -2788,9 +2818,9 @@ loop_again: | |||
2788 | if (sc.nr_reclaimed >= SWAP_CLUSTER_MAX) | 2818 | if (sc.nr_reclaimed >= SWAP_CLUSTER_MAX) |
2789 | break; | 2819 | break; |
2790 | } while (--sc.priority >= 0); | 2820 | } while (--sc.priority >= 0); |
2791 | out: | ||
2792 | 2821 | ||
2793 | if (!pgdat_balanced(pgdat, order, *classzone_idx)) { | 2822 | out: |
2823 | if (!pgdat_is_balanced) { | ||
2794 | cond_resched(); | 2824 | cond_resched(); |
2795 | 2825 | ||
2796 | try_to_freeze(); | 2826 | try_to_freeze(); |
@@ -3053,7 +3083,7 @@ unsigned long global_reclaimable_pages(void) | |||
3053 | nr = global_page_state(NR_ACTIVE_FILE) + | 3083 | nr = global_page_state(NR_ACTIVE_FILE) + |
3054 | global_page_state(NR_INACTIVE_FILE); | 3084 | global_page_state(NR_INACTIVE_FILE); |
3055 | 3085 | ||
3056 | if (nr_swap_pages > 0) | 3086 | if (get_nr_swap_pages() > 0) |
3057 | nr += global_page_state(NR_ACTIVE_ANON) + | 3087 | nr += global_page_state(NR_ACTIVE_ANON) + |
3058 | global_page_state(NR_INACTIVE_ANON); | 3088 | global_page_state(NR_INACTIVE_ANON); |
3059 | 3089 | ||
@@ -3067,7 +3097,7 @@ unsigned long zone_reclaimable_pages(struct zone *zone) | |||
3067 | nr = zone_page_state(zone, NR_ACTIVE_FILE) + | 3097 | nr = zone_page_state(zone, NR_ACTIVE_FILE) + |
3068 | zone_page_state(zone, NR_INACTIVE_FILE); | 3098 | zone_page_state(zone, NR_INACTIVE_FILE); |
3069 | 3099 | ||
3070 | if (nr_swap_pages > 0) | 3100 | if (get_nr_swap_pages() > 0) |
3071 | nr += zone_page_state(zone, NR_ACTIVE_ANON) + | 3101 | nr += zone_page_state(zone, NR_ACTIVE_ANON) + |
3072 | zone_page_state(zone, NR_INACTIVE_ANON); | 3102 | zone_page_state(zone, NR_INACTIVE_ANON); |
3073 | 3103 | ||
@@ -3280,9 +3310,8 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) | |||
3280 | .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE), | 3310 | .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE), |
3281 | .may_unmap = !!(zone_reclaim_mode & RECLAIM_SWAP), | 3311 | .may_unmap = !!(zone_reclaim_mode & RECLAIM_SWAP), |
3282 | .may_swap = 1, | 3312 | .may_swap = 1, |
3283 | .nr_to_reclaim = max_t(unsigned long, nr_pages, | 3313 | .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX), |
3284 | SWAP_CLUSTER_MAX), | 3314 | .gfp_mask = (gfp_mask = memalloc_noio_flags(gfp_mask)), |
3285 | .gfp_mask = gfp_mask, | ||
3286 | .order = order, | 3315 | .order = order, |
3287 | .priority = ZONE_RECLAIM_PRIORITY, | 3316 | .priority = ZONE_RECLAIM_PRIORITY, |
3288 | }; | 3317 | }; |
diff --git a/mm/vmstat.c b/mm/vmstat.c index 9800306c8195..e1d8ed172c42 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c | |||
@@ -142,7 +142,7 @@ int calculate_normal_threshold(struct zone *zone) | |||
142 | * 125 1024 10 16-32 GB 9 | 142 | * 125 1024 10 16-32 GB 9 |
143 | */ | 143 | */ |
144 | 144 | ||
145 | mem = zone->present_pages >> (27 - PAGE_SHIFT); | 145 | mem = zone->managed_pages >> (27 - PAGE_SHIFT); |
146 | 146 | ||
147 | threshold = 2 * fls(num_online_cpus()) * (1 + fls(mem)); | 147 | threshold = 2 * fls(num_online_cpus()) * (1 + fls(mem)); |
148 | 148 | ||
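calculate_normal_threshold() now derives zone size from managed_pages; the threshold itself remains 2 * fls(num_online_cpus()) * (1 + fls(mem)), where mem is the zone size in 128MB units (and the kernel clamps large results, as the table row starting with 125 suggests). A worked example in plain C with a portable fls() stand-in and invented inputs:

#include <stdio.h>

static int fls_sketch(unsigned long x)      /* highest set bit, 1-based */
{
	int r = 0;
	while (x) {
		r++;
		x >>= 1;
	}
	return r;
}

int main(void)
{
	unsigned long managed_pages = 4UL << 20;        /* 16GB of 4K pages */
	int page_shift = 12;                            /* 4K pages assumed */
	int online_cpus = 8;

	unsigned long mem = managed_pages >> (27 - page_shift); /* 128MB units */
	int threshold = 2 * fls_sketch(online_cpus) * (1 + fls_sketch(mem));

	printf("mem=%lu threshold=%d\n", mem, threshold);       /* mem=128, 72 */
	return 0;
}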
@@ -628,7 +628,9 @@ static char * const migratetype_names[MIGRATE_TYPES] = { | |||
628 | #ifdef CONFIG_CMA | 628 | #ifdef CONFIG_CMA |
629 | "CMA", | 629 | "CMA", |
630 | #endif | 630 | #endif |
631 | #ifdef CONFIG_MEMORY_ISOLATION | ||
631 | "Isolate", | 632 | "Isolate", |
633 | #endif | ||
632 | }; | 634 | }; |
633 | 635 | ||
634 | static void *frag_start(struct seq_file *m, loff_t *pos) | 636 | static void *frag_start(struct seq_file *m, loff_t *pos) |
@@ -768,7 +770,6 @@ const char * const vmstat_text[] = { | |||
768 | "kswapd_inodesteal", | 770 | "kswapd_inodesteal", |
769 | "kswapd_low_wmark_hit_quickly", | 771 | "kswapd_low_wmark_hit_quickly", |
770 | "kswapd_high_wmark_hit_quickly", | 772 | "kswapd_high_wmark_hit_quickly", |
771 | "kswapd_skip_congestion_wait", | ||
772 | "pageoutrun", | 773 | "pageoutrun", |
773 | "allocstall", | 774 | "allocstall", |
774 | 775 | ||
@@ -890,7 +891,7 @@ static void pagetypeinfo_showblockcount_print(struct seq_file *m, | |||
890 | int mtype; | 891 | int mtype; |
891 | unsigned long pfn; | 892 | unsigned long pfn; |
892 | unsigned long start_pfn = zone->zone_start_pfn; | 893 | unsigned long start_pfn = zone->zone_start_pfn; |
893 | unsigned long end_pfn = start_pfn + zone->spanned_pages; | 894 | unsigned long end_pfn = zone_end_pfn(zone); |
894 | unsigned long count[MIGRATE_TYPES] = { 0, }; | 895 | unsigned long count[MIGRATE_TYPES] = { 0, }; |
895 | 896 | ||
896 | for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) { | 897 | for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) { |
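The last hunk swaps the open-coded "start_pfn + spanned_pages" for the zone_end_pfn() helper, keeping the exclusive end-PFN computation in one place. Expressed as a trivial sketch with an invented stand-in type:

struct zone_span_sketch {
	unsigned long zone_start_pfn;
	unsigned long spanned_pages;
};

static unsigned long zone_end_pfn_sketch(const struct zone_span_sketch *z)
{
	return z->zone_start_pfn + z->spanned_pages;    /* exclusive end PFN */
}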