Diffstat (limited to 'mm')
46 files changed, 1598 insertions, 1173 deletions
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 7ba8feae11b..dd8e2aafb07 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -318,7 +318,7 @@ static void wakeup_timer_fn(unsigned long data) | |||
318 | if (bdi->wb.task) { | 318 | if (bdi->wb.task) { |
319 | trace_writeback_wake_thread(bdi); | 319 | trace_writeback_wake_thread(bdi); |
320 | wake_up_process(bdi->wb.task); | 320 | wake_up_process(bdi->wb.task); |
321 | } else { | 321 | } else if (bdi->dev) { |
322 | /* | 322 | /* |
323 | * When bdi tasks are inactive for long time, they are killed. | 323 | * When bdi tasks are inactive for long time, they are killed. |
324 | * In this case we have to wake-up the forker thread which | 324 | * In this case we have to wake-up the forker thread which |
@@ -584,6 +584,8 @@ EXPORT_SYMBOL(bdi_register_dev); | |||
584 | */ | 584 | */ |
585 | static void bdi_wb_shutdown(struct backing_dev_info *bdi) | 585 | static void bdi_wb_shutdown(struct backing_dev_info *bdi) |
586 | { | 586 | { |
587 | struct task_struct *task; | ||
588 | |||
587 | if (!bdi_cap_writeback_dirty(bdi)) | 589 | if (!bdi_cap_writeback_dirty(bdi)) |
588 | return; | 590 | return; |
589 | 591 | ||
@@ -602,8 +604,13 @@ static void bdi_wb_shutdown(struct backing_dev_info *bdi) | |||
602 | * Finally, kill the kernel thread. We don't need to be RCU | 604 | * Finally, kill the kernel thread. We don't need to be RCU |
603 | * safe anymore, since the bdi is gone from visibility. | 605 | * safe anymore, since the bdi is gone from visibility. |
604 | */ | 606 | */ |
605 | if (bdi->wb.task) | 607 | spin_lock_bh(&bdi->wb_lock); |
606 | kthread_stop(bdi->wb.task); | 608 | task = bdi->wb.task; |
609 | bdi->wb.task = NULL; | ||
610 | spin_unlock_bh(&bdi->wb_lock); | ||
611 | |||
612 | if (task) | ||
613 | kthread_stop(task); | ||
607 | } | 614 | } |
608 | 615 | ||
609 | /* | 616 | /* |
@@ -623,7 +630,9 @@ static void bdi_prune_sb(struct backing_dev_info *bdi) | |||
623 | 630 | ||
624 | void bdi_unregister(struct backing_dev_info *bdi) | 631 | void bdi_unregister(struct backing_dev_info *bdi) |
625 | { | 632 | { |
626 | if (bdi->dev) { | 633 | struct device *dev = bdi->dev; |
634 | |||
635 | if (dev) { | ||
627 | bdi_set_min_ratio(bdi, 0); | 636 | bdi_set_min_ratio(bdi, 0); |
628 | trace_writeback_bdi_unregister(bdi); | 637 | trace_writeback_bdi_unregister(bdi); |
629 | bdi_prune_sb(bdi); | 638 | bdi_prune_sb(bdi); |
@@ -632,8 +641,12 @@ void bdi_unregister(struct backing_dev_info *bdi) | |||
632 | if (!bdi_cap_flush_forker(bdi)) | 641 | if (!bdi_cap_flush_forker(bdi)) |
633 | bdi_wb_shutdown(bdi); | 642 | bdi_wb_shutdown(bdi); |
634 | bdi_debug_unregister(bdi); | 643 | bdi_debug_unregister(bdi); |
635 | device_unregister(bdi->dev); | 644 | |
645 | spin_lock_bh(&bdi->wb_lock); | ||
636 | bdi->dev = NULL; | 646 | bdi->dev = NULL; |
647 | spin_unlock_bh(&bdi->wb_lock); | ||
648 | |||
649 | device_unregister(dev); | ||
637 | } | 650 | } |
638 | } | 651 | } |
639 | EXPORT_SYMBOL(bdi_unregister); | 652 | EXPORT_SYMBOL(bdi_unregister); |
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 668e94df8cf..0131170c9d5 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -766,14 +766,13 @@ void * __init alloc_bootmem_section(unsigned long size, | |||
766 | unsigned long section_nr) | 766 | unsigned long section_nr) |
767 | { | 767 | { |
768 | bootmem_data_t *bdata; | 768 | bootmem_data_t *bdata; |
769 | unsigned long pfn, goal, limit; | 769 | unsigned long pfn, goal; |
770 | 770 | ||
771 | pfn = section_nr_to_pfn(section_nr); | 771 | pfn = section_nr_to_pfn(section_nr); |
772 | goal = pfn << PAGE_SHIFT; | 772 | goal = pfn << PAGE_SHIFT; |
773 | limit = section_nr_to_pfn(section_nr + 1) << PAGE_SHIFT; | ||
774 | bdata = &bootmem_node_data[early_pfn_to_nid(pfn)]; | 773 | bdata = &bootmem_node_data[early_pfn_to_nid(pfn)]; |
775 | 774 | ||
776 | return alloc_bootmem_core(bdata, size, SMP_CACHE_BYTES, goal, limit); | 775 | return alloc_bootmem_core(bdata, size, SMP_CACHE_BYTES, goal, 0); |
777 | } | 776 | } |
778 | #endif | 777 | #endif |
779 | 778 | ||
diff --git a/mm/bounce.c b/mm/bounce.c
index 4e9ae722af8..d1be02ca188 100644
--- a/mm/bounce.c
+++ b/mm/bounce.c
@@ -50,9 +50,9 @@ static void bounce_copy_vec(struct bio_vec *to, unsigned char *vfrom) | |||
50 | unsigned char *vto; | 50 | unsigned char *vto; |
51 | 51 | ||
52 | local_irq_save(flags); | 52 | local_irq_save(flags); |
53 | vto = kmap_atomic(to->bv_page, KM_BOUNCE_READ); | 53 | vto = kmap_atomic(to->bv_page); |
54 | memcpy(vto + to->bv_offset, vfrom, to->bv_len); | 54 | memcpy(vto + to->bv_offset, vfrom, to->bv_len); |
55 | kunmap_atomic(vto, KM_BOUNCE_READ); | 55 | kunmap_atomic(vto); |
56 | local_irq_restore(flags); | 56 | local_irq_restore(flags); |
57 | } | 57 | } |
58 | 58 | ||
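
The hunk above is part of the tree-wide removal of the KM_* kmap slot arguments. A minimal sketch of the resulting API, assuming only the one-argument kmap_atomic()/kunmap_atomic() forms used here; the helper name is hypothetical:

#include <linux/highmem.h>
#include <linux/string.h>

/* Copy a buffer into a (possibly highmem) page with the slot-less API. */
static void copy_into_page(struct page *page, const void *src, size_t len)
{
        void *vaddr = kmap_atomic(page);        /* no KM_BOUNCE_READ/KM_USER0 slot */

        memcpy(vaddr, src, len);
        kunmap_atomic(vaddr);                   /* takes the mapped address */
}
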
diff --git a/mm/cleancache.c b/mm/cleancache.c
index bcaae4c2a77..5646c740f61 100644
--- a/mm/cleancache.c
+++ b/mm/cleancache.c
@@ -15,29 +15,34 @@ | |||
15 | #include <linux/fs.h> | 15 | #include <linux/fs.h> |
16 | #include <linux/exportfs.h> | 16 | #include <linux/exportfs.h> |
17 | #include <linux/mm.h> | 17 | #include <linux/mm.h> |
18 | #include <linux/debugfs.h> | ||
18 | #include <linux/cleancache.h> | 19 | #include <linux/cleancache.h> |
19 | 20 | ||
20 | /* | 21 | /* |
21 | * This global enablement flag may be read thousands of times per second | 22 | * This global enablement flag may be read thousands of times per second |
22 | * by cleancache_get/put/flush even on systems where cleancache_ops | 23 | * by cleancache_get/put/invalidate even on systems where cleancache_ops |
23 | * is not claimed (e.g. cleancache is config'ed on but remains | 24 | * is not claimed (e.g. cleancache is config'ed on but remains |
24 | * disabled), so is preferred to the slower alternative: a function | 25 | * disabled), so is preferred to the slower alternative: a function |
25 | * call that checks a non-global. | 26 | * call that checks a non-global. |
26 | */ | 27 | */ |
27 | int cleancache_enabled; | 28 | int cleancache_enabled __read_mostly; |
28 | EXPORT_SYMBOL(cleancache_enabled); | 29 | EXPORT_SYMBOL(cleancache_enabled); |
29 | 30 | ||
30 | /* | 31 | /* |
31 | * cleancache_ops is set by cleancache_ops_register to contain the pointers | 32 | * cleancache_ops is set by cleancache_ops_register to contain the pointers |
32 | * to the cleancache "backend" implementation functions. | 33 | * to the cleancache "backend" implementation functions. |
33 | */ | 34 | */ |
34 | static struct cleancache_ops cleancache_ops; | 35 | static struct cleancache_ops cleancache_ops __read_mostly; |
35 | 36 | ||
36 | /* useful stats available in /sys/kernel/mm/cleancache */ | 37 | /* |
37 | static unsigned long cleancache_succ_gets; | 38 | * Counters available via /sys/kernel/debug/frontswap (if debugfs is |
38 | static unsigned long cleancache_failed_gets; | 39 | * properly configured. These are for information only so are not protected |
39 | static unsigned long cleancache_puts; | 40 | * against increment races. |
40 | static unsigned long cleancache_flushes; | 41 | */ |
42 | static u64 cleancache_succ_gets; | ||
43 | static u64 cleancache_failed_gets; | ||
44 | static u64 cleancache_puts; | ||
45 | static u64 cleancache_invalidates; | ||
41 | 46 | ||
42 | /* | 47 | /* |
43 | * register operations for cleancache, returning previous thus allowing | 48 | * register operations for cleancache, returning previous thus allowing |
@@ -148,10 +153,11 @@ void __cleancache_put_page(struct page *page) | |||
148 | EXPORT_SYMBOL(__cleancache_put_page); | 153 | EXPORT_SYMBOL(__cleancache_put_page); |
149 | 154 | ||
150 | /* | 155 | /* |
151 | * Flush any data from cleancache associated with the poolid and the | 156 | * Invalidate any data from cleancache associated with the poolid and the |
152 | * page's inode and page index so that a subsequent "get" will fail. | 157 | * page's inode and page index so that a subsequent "get" will fail. |
153 | */ | 158 | */ |
154 | void __cleancache_flush_page(struct address_space *mapping, struct page *page) | 159 | void __cleancache_invalidate_page(struct address_space *mapping, |
160 | struct page *page) | ||
155 | { | 161 | { |
156 | /* careful... page->mapping is NULL sometimes when this is called */ | 162 | /* careful... page->mapping is NULL sometimes when this is called */ |
157 | int pool_id = mapping->host->i_sb->cleancache_poolid; | 163 | int pool_id = mapping->host->i_sb->cleancache_poolid; |
@@ -160,85 +166,57 @@ void __cleancache_flush_page(struct address_space *mapping, struct page *page) | |||
160 | if (pool_id >= 0) { | 166 | if (pool_id >= 0) { |
161 | VM_BUG_ON(!PageLocked(page)); | 167 | VM_BUG_ON(!PageLocked(page)); |
162 | if (cleancache_get_key(mapping->host, &key) >= 0) { | 168 | if (cleancache_get_key(mapping->host, &key) >= 0) { |
163 | (*cleancache_ops.flush_page)(pool_id, key, page->index); | 169 | (*cleancache_ops.invalidate_page)(pool_id, |
164 | cleancache_flushes++; | 170 | key, page->index); |
171 | cleancache_invalidates++; | ||
165 | } | 172 | } |
166 | } | 173 | } |
167 | } | 174 | } |
168 | EXPORT_SYMBOL(__cleancache_flush_page); | 175 | EXPORT_SYMBOL(__cleancache_invalidate_page); |
169 | 176 | ||
170 | /* | 177 | /* |
171 | * Flush all data from cleancache associated with the poolid and the | 178 | * Invalidate all data from cleancache associated with the poolid and the |
172 | * mappings's inode so that all subsequent gets to this poolid/inode | 179 | * mappings's inode so that all subsequent gets to this poolid/inode |
173 | * will fail. | 180 | * will fail. |
174 | */ | 181 | */ |
175 | void __cleancache_flush_inode(struct address_space *mapping) | 182 | void __cleancache_invalidate_inode(struct address_space *mapping) |
176 | { | 183 | { |
177 | int pool_id = mapping->host->i_sb->cleancache_poolid; | 184 | int pool_id = mapping->host->i_sb->cleancache_poolid; |
178 | struct cleancache_filekey key = { .u.key = { 0 } }; | 185 | struct cleancache_filekey key = { .u.key = { 0 } }; |
179 | 186 | ||
180 | if (pool_id >= 0 && cleancache_get_key(mapping->host, &key) >= 0) | 187 | if (pool_id >= 0 && cleancache_get_key(mapping->host, &key) >= 0) |
181 | (*cleancache_ops.flush_inode)(pool_id, key); | 188 | (*cleancache_ops.invalidate_inode)(pool_id, key); |
182 | } | 189 | } |
183 | EXPORT_SYMBOL(__cleancache_flush_inode); | 190 | EXPORT_SYMBOL(__cleancache_invalidate_inode); |
184 | 191 | ||
185 | /* | 192 | /* |
186 | * Called by any cleancache-enabled filesystem at time of unmount; | 193 | * Called by any cleancache-enabled filesystem at time of unmount; |
187 | * note that pool_id is surrendered and may be reutrned by a subsequent | 194 | * note that pool_id is surrendered and may be reutrned by a subsequent |
188 | * cleancache_init_fs or cleancache_init_shared_fs | 195 | * cleancache_init_fs or cleancache_init_shared_fs |
189 | */ | 196 | */ |
190 | void __cleancache_flush_fs(struct super_block *sb) | 197 | void __cleancache_invalidate_fs(struct super_block *sb) |
191 | { | 198 | { |
192 | if (sb->cleancache_poolid >= 0) { | 199 | if (sb->cleancache_poolid >= 0) { |
193 | int old_poolid = sb->cleancache_poolid; | 200 | int old_poolid = sb->cleancache_poolid; |
194 | sb->cleancache_poolid = -1; | 201 | sb->cleancache_poolid = -1; |
195 | (*cleancache_ops.flush_fs)(old_poolid); | 202 | (*cleancache_ops.invalidate_fs)(old_poolid); |
196 | } | 203 | } |
197 | } | 204 | } |
198 | EXPORT_SYMBOL(__cleancache_flush_fs); | 205 | EXPORT_SYMBOL(__cleancache_invalidate_fs); |
199 | |||
200 | #ifdef CONFIG_SYSFS | ||
201 | |||
202 | /* see Documentation/ABI/xxx/sysfs-kernel-mm-cleancache */ | ||
203 | |||
204 | #define CLEANCACHE_SYSFS_RO(_name) \ | ||
205 | static ssize_t cleancache_##_name##_show(struct kobject *kobj, \ | ||
206 | struct kobj_attribute *attr, char *buf) \ | ||
207 | { \ | ||
208 | return sprintf(buf, "%lu\n", cleancache_##_name); \ | ||
209 | } \ | ||
210 | static struct kobj_attribute cleancache_##_name##_attr = { \ | ||
211 | .attr = { .name = __stringify(_name), .mode = 0444 }, \ | ||
212 | .show = cleancache_##_name##_show, \ | ||
213 | } | ||
214 | |||
215 | CLEANCACHE_SYSFS_RO(succ_gets); | ||
216 | CLEANCACHE_SYSFS_RO(failed_gets); | ||
217 | CLEANCACHE_SYSFS_RO(puts); | ||
218 | CLEANCACHE_SYSFS_RO(flushes); | ||
219 | |||
220 | static struct attribute *cleancache_attrs[] = { | ||
221 | &cleancache_succ_gets_attr.attr, | ||
222 | &cleancache_failed_gets_attr.attr, | ||
223 | &cleancache_puts_attr.attr, | ||
224 | &cleancache_flushes_attr.attr, | ||
225 | NULL, | ||
226 | }; | ||
227 | |||
228 | static struct attribute_group cleancache_attr_group = { | ||
229 | .attrs = cleancache_attrs, | ||
230 | .name = "cleancache", | ||
231 | }; | ||
232 | |||
233 | #endif /* CONFIG_SYSFS */ | ||
234 | 206 | ||
235 | static int __init init_cleancache(void) | 207 | static int __init init_cleancache(void) |
236 | { | 208 | { |
237 | #ifdef CONFIG_SYSFS | 209 | #ifdef CONFIG_DEBUG_FS |
238 | int err; | 210 | struct dentry *root = debugfs_create_dir("cleancache", NULL); |
239 | 211 | if (root == NULL) | |
240 | err = sysfs_create_group(mm_kobj, &cleancache_attr_group); | 212 | return -ENXIO; |
241 | #endif /* CONFIG_SYSFS */ | 213 | debugfs_create_u64("succ_gets", S_IRUGO, root, &cleancache_succ_gets); |
214 | debugfs_create_u64("failed_gets", S_IRUGO, | ||
215 | root, &cleancache_failed_gets); | ||
216 | debugfs_create_u64("puts", S_IRUGO, root, &cleancache_puts); | ||
217 | debugfs_create_u64("invalidates", S_IRUGO, | ||
218 | root, &cleancache_invalidates); | ||
219 | #endif | ||
242 | return 0; | 220 | return 0; |
243 | } | 221 | } |
244 | module_init(init_cleancache) | 222 | module_init(init_cleancache) |
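
With flush_* renamed to invalidate_* and the statistics moved to debugfs, a backend now fills in the invalidate hooks under their new names. A minimal sketch of such a backend with hypothetical dummy_* handlers; the get/put/init hook signatures and the cleancache_register_ops() call are assumptions about the backend API of this era, not something this hunk shows:

#include <linux/cleancache.h>

static int dummy_init_fs(size_t pagesize) { return 0; }        /* pool id 0 */
static int dummy_get_page(int pool, struct cleancache_filekey key,
                          pgoff_t index, struct page *page) { return -1; }
static void dummy_put_page(int pool, struct cleancache_filekey key,
                           pgoff_t index, struct page *page) { }
static void dummy_invalidate_page(int pool, struct cleancache_filekey key,
                                  pgoff_t index) { }
static void dummy_invalidate_inode(int pool, struct cleancache_filekey key) { }
static void dummy_invalidate_fs(int pool) { }

static struct cleancache_ops dummy_ops = {
        .init_fs          = dummy_init_fs,
        .get_page         = dummy_get_page,
        .put_page         = dummy_put_page,
        .invalidate_page  = dummy_invalidate_page,
        .invalidate_inode = dummy_invalidate_inode,
        .invalidate_fs    = dummy_invalidate_fs,
};

static int __init dummy_backend_init(void)
{
        /* a real backend would save and chain to the previous ops */
        cleancache_register_ops(&dummy_ops);
        return 0;
}
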
diff --git a/mm/compaction.c b/mm/compaction.c
index 71a58f67f48..74a8c825ff2 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -35,7 +35,7 @@ struct compact_control { | |||
35 | unsigned long migrate_pfn; /* isolate_migratepages search base */ | 35 | unsigned long migrate_pfn; /* isolate_migratepages search base */ |
36 | bool sync; /* Synchronous migration */ | 36 | bool sync; /* Synchronous migration */ |
37 | 37 | ||
38 | unsigned int order; /* order a direct compactor needs */ | 38 | int order; /* order a direct compactor needs */ |
39 | int migratetype; /* MOVABLE, RECLAIMABLE etc */ | 39 | int migratetype; /* MOVABLE, RECLAIMABLE etc */ |
40 | struct zone *zone; | 40 | struct zone *zone; |
41 | }; | 41 | }; |
@@ -313,12 +313,34 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, | |||
313 | } else if (!locked) | 313 | } else if (!locked) |
314 | spin_lock_irq(&zone->lru_lock); | 314 | spin_lock_irq(&zone->lru_lock); |
315 | 315 | ||
316 | /* | ||
317 | * migrate_pfn does not necessarily start aligned to a | ||
318 | * pageblock. Ensure that pfn_valid is called when moving | ||
319 | * into a new MAX_ORDER_NR_PAGES range in case of large | ||
320 | * memory holes within the zone | ||
321 | */ | ||
322 | if ((low_pfn & (MAX_ORDER_NR_PAGES - 1)) == 0) { | ||
323 | if (!pfn_valid(low_pfn)) { | ||
324 | low_pfn += MAX_ORDER_NR_PAGES - 1; | ||
325 | continue; | ||
326 | } | ||
327 | } | ||
328 | |||
316 | if (!pfn_valid_within(low_pfn)) | 329 | if (!pfn_valid_within(low_pfn)) |
317 | continue; | 330 | continue; |
318 | nr_scanned++; | 331 | nr_scanned++; |
319 | 332 | ||
320 | /* Get the page and skip if free */ | 333 | /* |
334 | * Get the page and ensure the page is within the same zone. | ||
335 | * See the comment in isolate_freepages about overlapping | ||
336 | * nodes. It is deliberate that the new zone lock is not taken | ||
337 | * as memory compaction should not move pages between nodes. | ||
338 | */ | ||
321 | page = pfn_to_page(low_pfn); | 339 | page = pfn_to_page(low_pfn); |
340 | if (page_zone(page) != zone) | ||
341 | continue; | ||
342 | |||
343 | /* Skip if free */ | ||
322 | if (PageBuddy(page)) | 344 | if (PageBuddy(page)) |
323 | continue; | 345 | continue; |
324 | 346 | ||
@@ -653,49 +675,71 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist, | |||
653 | 675 | ||
654 | 676 | ||
655 | /* Compact all zones within a node */ | 677 | /* Compact all zones within a node */ |
656 | static int compact_node(int nid) | 678 | static int __compact_pgdat(pg_data_t *pgdat, struct compact_control *cc) |
657 | { | 679 | { |
658 | int zoneid; | 680 | int zoneid; |
659 | pg_data_t *pgdat; | ||
660 | struct zone *zone; | 681 | struct zone *zone; |
661 | 682 | ||
662 | if (nid < 0 || nid >= nr_node_ids || !node_online(nid)) | ||
663 | return -EINVAL; | ||
664 | pgdat = NODE_DATA(nid); | ||
665 | |||
666 | /* Flush pending updates to the LRU lists */ | ||
667 | lru_add_drain_all(); | ||
668 | |||
669 | for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) { | 683 | for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) { |
670 | struct compact_control cc = { | ||
671 | .nr_freepages = 0, | ||
672 | .nr_migratepages = 0, | ||
673 | .order = -1, | ||
674 | .sync = true, | ||
675 | }; | ||
676 | 684 | ||
677 | zone = &pgdat->node_zones[zoneid]; | 685 | zone = &pgdat->node_zones[zoneid]; |
678 | if (!populated_zone(zone)) | 686 | if (!populated_zone(zone)) |
679 | continue; | 687 | continue; |
680 | 688 | ||
681 | cc.zone = zone; | 689 | cc->nr_freepages = 0; |
682 | INIT_LIST_HEAD(&cc.freepages); | 690 | cc->nr_migratepages = 0; |
683 | INIT_LIST_HEAD(&cc.migratepages); | 691 | cc->zone = zone; |
684 | 692 | INIT_LIST_HEAD(&cc->freepages); | |
685 | compact_zone(zone, &cc); | 693 | INIT_LIST_HEAD(&cc->migratepages); |
694 | |||
695 | if (cc->order == -1 || !compaction_deferred(zone, cc->order)) | ||
696 | compact_zone(zone, cc); | ||
697 | |||
698 | if (cc->order > 0) { | ||
699 | int ok = zone_watermark_ok(zone, cc->order, | ||
700 | low_wmark_pages(zone), 0, 0); | ||
701 | if (ok && cc->order > zone->compact_order_failed) | ||
702 | zone->compact_order_failed = cc->order + 1; | ||
703 | /* Currently async compaction is never deferred. */ | ||
704 | else if (!ok && cc->sync) | ||
705 | defer_compaction(zone, cc->order); | ||
706 | } | ||
686 | 707 | ||
687 | VM_BUG_ON(!list_empty(&cc.freepages)); | 708 | VM_BUG_ON(!list_empty(&cc->freepages)); |
688 | VM_BUG_ON(!list_empty(&cc.migratepages)); | 709 | VM_BUG_ON(!list_empty(&cc->migratepages)); |
689 | } | 710 | } |
690 | 711 | ||
691 | return 0; | 712 | return 0; |
692 | } | 713 | } |
693 | 714 | ||
715 | int compact_pgdat(pg_data_t *pgdat, int order) | ||
716 | { | ||
717 | struct compact_control cc = { | ||
718 | .order = order, | ||
719 | .sync = false, | ||
720 | }; | ||
721 | |||
722 | return __compact_pgdat(pgdat, &cc); | ||
723 | } | ||
724 | |||
725 | static int compact_node(int nid) | ||
726 | { | ||
727 | struct compact_control cc = { | ||
728 | .order = -1, | ||
729 | .sync = true, | ||
730 | }; | ||
731 | |||
732 | return __compact_pgdat(NODE_DATA(nid), &cc); | ||
733 | } | ||
734 | |||
694 | /* Compact all nodes in the system */ | 735 | /* Compact all nodes in the system */ |
695 | static int compact_nodes(void) | 736 | static int compact_nodes(void) |
696 | { | 737 | { |
697 | int nid; | 738 | int nid; |
698 | 739 | ||
740 | /* Flush pending updates to the LRU lists */ | ||
741 | lru_add_drain_all(); | ||
742 | |||
699 | for_each_online_node(nid) | 743 | for_each_online_node(nid) |
700 | compact_node(nid); | 744 | compact_node(nid); |
701 | 745 | ||
@@ -728,7 +772,14 @@ ssize_t sysfs_compact_node(struct device *dev, | |||
728 | struct device_attribute *attr, | 772 | struct device_attribute *attr, |
729 | const char *buf, size_t count) | 773 | const char *buf, size_t count) |
730 | { | 774 | { |
731 | compact_node(dev->id); | 775 | int nid = dev->id; |
776 | |||
777 | if (nid >= 0 && nid < nr_node_ids && node_online(nid)) { | ||
778 | /* Flush pending updates to the LRU lists */ | ||
779 | lru_add_drain_all(); | ||
780 | |||
781 | compact_node(nid); | ||
782 | } | ||
732 | 783 | ||
733 | return count; | 784 | return count; |
734 | } | 785 | } |
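
compact_node() keeps its sysfs/proc role, while the new compact_pgdat() gives reclaim a way to compact one node asynchronously for a specific order. A minimal sketch of a caller, assuming a kswapd-style context; the surrounding function is illustrative, not part of this diff:

#include <linux/compaction.h>
#include <linux/mmzone.h>

/* After reclaim has freed enough pages at 'order', compact the node. */
static void balance_node_tail(pg_data_t *pgdat, int order)
{
        if (order > 0)
                compact_pgdat(pgdat, order);    /* async; zones deferred at this order are skipped */
}
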
diff --git a/mm/filemap.c b/mm/filemap.c
index 97f49ed35bd..c3811bc6b9e 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -101,9 +101,8 @@ | |||
101 | * ->inode->i_lock (zap_pte_range->set_page_dirty) | 101 | * ->inode->i_lock (zap_pte_range->set_page_dirty) |
102 | * ->private_lock (zap_pte_range->__set_page_dirty_buffers) | 102 | * ->private_lock (zap_pte_range->__set_page_dirty_buffers) |
103 | * | 103 | * |
104 | * (code doesn't rely on that order, so you could switch it around) | 104 | * ->i_mmap_mutex |
105 | * ->tasklist_lock (memory_failure, collect_procs_ao) | 105 | * ->tasklist_lock (memory_failure, collect_procs_ao) |
106 | * ->i_mmap_mutex | ||
107 | */ | 106 | */ |
108 | 107 | ||
109 | /* | 108 | /* |
@@ -123,7 +122,7 @@ void __delete_from_page_cache(struct page *page) | |||
123 | if (PageUptodate(page) && PageMappedToDisk(page)) | 122 | if (PageUptodate(page) && PageMappedToDisk(page)) |
124 | cleancache_put_page(page); | 123 | cleancache_put_page(page); |
125 | else | 124 | else |
126 | cleancache_flush_page(mapping, page); | 125 | cleancache_invalidate_page(mapping, page); |
127 | 126 | ||
128 | radix_tree_delete(&mapping->page_tree, page->index); | 127 | radix_tree_delete(&mapping->page_tree, page->index); |
129 | page->mapping = NULL; | 128 | page->mapping = NULL; |
@@ -500,10 +499,13 @@ struct page *__page_cache_alloc(gfp_t gfp) | |||
500 | struct page *page; | 499 | struct page *page; |
501 | 500 | ||
502 | if (cpuset_do_page_mem_spread()) { | 501 | if (cpuset_do_page_mem_spread()) { |
503 | get_mems_allowed(); | 502 | unsigned int cpuset_mems_cookie; |
504 | n = cpuset_mem_spread_node(); | 503 | do { |
505 | page = alloc_pages_exact_node(n, gfp, 0); | 504 | cpuset_mems_cookie = get_mems_allowed(); |
506 | put_mems_allowed(); | 505 | n = cpuset_mem_spread_node(); |
506 | page = alloc_pages_exact_node(n, gfp, 0); | ||
507 | } while (!put_mems_allowed(cpuset_mems_cookie) && !page); | ||
508 | |||
507 | return page; | 509 | return page; |
508 | } | 510 | } |
509 | return alloc_pages(gfp, 0); | 511 | return alloc_pages(gfp, 0); |
@@ -1318,10 +1320,10 @@ int file_read_actor(read_descriptor_t *desc, struct page *page, | |||
1318 | * taking the kmap. | 1320 | * taking the kmap. |
1319 | */ | 1321 | */ |
1320 | if (!fault_in_pages_writeable(desc->arg.buf, size)) { | 1322 | if (!fault_in_pages_writeable(desc->arg.buf, size)) { |
1321 | kaddr = kmap_atomic(page, KM_USER0); | 1323 | kaddr = kmap_atomic(page); |
1322 | left = __copy_to_user_inatomic(desc->arg.buf, | 1324 | left = __copy_to_user_inatomic(desc->arg.buf, |
1323 | kaddr + offset, size); | 1325 | kaddr + offset, size); |
1324 | kunmap_atomic(kaddr, KM_USER0); | 1326 | kunmap_atomic(kaddr); |
1325 | if (left == 0) | 1327 | if (left == 0) |
1326 | goto success; | 1328 | goto success; |
1327 | } | 1329 | } |
@@ -1400,15 +1402,12 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov, | |||
1400 | unsigned long seg = 0; | 1402 | unsigned long seg = 0; |
1401 | size_t count; | 1403 | size_t count; |
1402 | loff_t *ppos = &iocb->ki_pos; | 1404 | loff_t *ppos = &iocb->ki_pos; |
1403 | struct blk_plug plug; | ||
1404 | 1405 | ||
1405 | count = 0; | 1406 | count = 0; |
1406 | retval = generic_segment_checks(iov, &nr_segs, &count, VERIFY_WRITE); | 1407 | retval = generic_segment_checks(iov, &nr_segs, &count, VERIFY_WRITE); |
1407 | if (retval) | 1408 | if (retval) |
1408 | return retval; | 1409 | return retval; |
1409 | 1410 | ||
1410 | blk_start_plug(&plug); | ||
1411 | |||
1412 | /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */ | 1411 | /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */ |
1413 | if (filp->f_flags & O_DIRECT) { | 1412 | if (filp->f_flags & O_DIRECT) { |
1414 | loff_t size; | 1413 | loff_t size; |
@@ -1424,8 +1423,12 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov, | |||
1424 | retval = filemap_write_and_wait_range(mapping, pos, | 1423 | retval = filemap_write_and_wait_range(mapping, pos, |
1425 | pos + iov_length(iov, nr_segs) - 1); | 1424 | pos + iov_length(iov, nr_segs) - 1); |
1426 | if (!retval) { | 1425 | if (!retval) { |
1426 | struct blk_plug plug; | ||
1427 | |||
1428 | blk_start_plug(&plug); | ||
1427 | retval = mapping->a_ops->direct_IO(READ, iocb, | 1429 | retval = mapping->a_ops->direct_IO(READ, iocb, |
1428 | iov, pos, nr_segs); | 1430 | iov, pos, nr_segs); |
1431 | blk_finish_plug(&plug); | ||
1429 | } | 1432 | } |
1430 | if (retval > 0) { | 1433 | if (retval > 0) { |
1431 | *ppos = pos + retval; | 1434 | *ppos = pos + retval; |
@@ -1481,7 +1484,6 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov, | |||
1481 | break; | 1484 | break; |
1482 | } | 1485 | } |
1483 | out: | 1486 | out: |
1484 | blk_finish_plug(&plug); | ||
1485 | return retval; | 1487 | return retval; |
1486 | } | 1488 | } |
1487 | EXPORT_SYMBOL(generic_file_aio_read); | 1489 | EXPORT_SYMBOL(generic_file_aio_read); |
@@ -2045,7 +2047,7 @@ size_t iov_iter_copy_from_user_atomic(struct page *page, | |||
2045 | size_t copied; | 2047 | size_t copied; |
2046 | 2048 | ||
2047 | BUG_ON(!in_atomic()); | 2049 | BUG_ON(!in_atomic()); |
2048 | kaddr = kmap_atomic(page, KM_USER0); | 2050 | kaddr = kmap_atomic(page); |
2049 | if (likely(i->nr_segs == 1)) { | 2051 | if (likely(i->nr_segs == 1)) { |
2050 | int left; | 2052 | int left; |
2051 | char __user *buf = i->iov->iov_base + i->iov_offset; | 2053 | char __user *buf = i->iov->iov_base + i->iov_offset; |
@@ -2055,7 +2057,7 @@ size_t iov_iter_copy_from_user_atomic(struct page *page, | |||
2055 | copied = __iovec_copy_from_user_inatomic(kaddr + offset, | 2057 | copied = __iovec_copy_from_user_inatomic(kaddr + offset, |
2056 | i->iov, i->iov_offset, bytes); | 2058 | i->iov, i->iov_offset, bytes); |
2057 | } | 2059 | } |
2058 | kunmap_atomic(kaddr, KM_USER0); | 2060 | kunmap_atomic(kaddr); |
2059 | 2061 | ||
2060 | return copied; | 2062 | return copied; |
2061 | } | 2063 | } |
@@ -2341,7 +2343,9 @@ struct page *grab_cache_page_write_begin(struct address_space *mapping, | |||
2341 | struct page *page; | 2343 | struct page *page; |
2342 | gfp_t gfp_notmask = 0; | 2344 | gfp_t gfp_notmask = 0; |
2343 | 2345 | ||
2344 | gfp_mask = mapping_gfp_mask(mapping) | __GFP_WRITE; | 2346 | gfp_mask = mapping_gfp_mask(mapping); |
2347 | if (mapping_cap_account_dirty(mapping)) | ||
2348 | gfp_mask |= __GFP_WRITE; | ||
2345 | if (flags & AOP_FLAG_NOFS) | 2349 | if (flags & AOP_FLAG_NOFS) |
2346 | gfp_notmask = __GFP_FS; | 2350 | gfp_notmask = __GFP_FS; |
2347 | repeat: | 2351 | repeat: |
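
The get_mems_allowed()/put_mems_allowed() pair now works as a seqcount-style cookie: the allocation is retried if the cpuset's allowed nodes changed underneath it and the attempt came back empty. The same idiom appears again in the hugetlb.c hunk further down. A condensed sketch of the pattern, with a hypothetical helper name:

#include <linux/cpuset.h>
#include <linux/gfp.h>

static struct page *alloc_spread_page(gfp_t gfp)
{
        struct page *page;
        unsigned int cookie;
        int nid;

        do {
                cookie = get_mems_allowed();    /* open read-side section */
                nid = cpuset_mem_spread_node();
                page = alloc_pages_exact_node(nid, gfp, 0);
                /* put_mems_allowed() returns true if mems_allowed was stable */
        } while (!put_mems_allowed(cookie) && !page);

        return page;
}
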
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
index f91b2f68734..a4eb3113222 100644
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
@@ -263,7 +263,12 @@ found: | |||
263 | xip_pfn); | 263 | xip_pfn); |
264 | if (err == -ENOMEM) | 264 | if (err == -ENOMEM) |
265 | return VM_FAULT_OOM; | 265 | return VM_FAULT_OOM; |
266 | BUG_ON(err); | 266 | /* |
267 | * err == -EBUSY is fine, we've raced against another thread | ||
268 | * that faulted-in the same page | ||
269 | */ | ||
270 | if (err != -EBUSY) | ||
271 | BUG_ON(err); | ||
267 | return VM_FAULT_NOPAGE; | 272 | return VM_FAULT_NOPAGE; |
268 | } else { | 273 | } else { |
269 | int err, ret = VM_FAULT_OOM; | 274 | int err, ret = VM_FAULT_OOM; |
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index b3ffc21ce80..f0e5306eeb5 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -671,6 +671,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, | |||
671 | set_pmd_at(mm, haddr, pmd, entry); | 671 | set_pmd_at(mm, haddr, pmd, entry); |
672 | prepare_pmd_huge_pte(pgtable, mm); | 672 | prepare_pmd_huge_pte(pgtable, mm); |
673 | add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR); | 673 | add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR); |
674 | mm->nr_ptes++; | ||
674 | spin_unlock(&mm->page_table_lock); | 675 | spin_unlock(&mm->page_table_lock); |
675 | } | 676 | } |
676 | 677 | ||
@@ -789,6 +790,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, | |||
789 | pmd = pmd_mkold(pmd_wrprotect(pmd)); | 790 | pmd = pmd_mkold(pmd_wrprotect(pmd)); |
790 | set_pmd_at(dst_mm, addr, dst_pmd, pmd); | 791 | set_pmd_at(dst_mm, addr, dst_pmd, pmd); |
791 | prepare_pmd_huge_pte(pgtable, dst_mm); | 792 | prepare_pmd_huge_pte(pgtable, dst_mm); |
793 | dst_mm->nr_ptes++; | ||
792 | 794 | ||
793 | ret = 0; | 795 | ret = 0; |
794 | out_unlock: | 796 | out_unlock: |
@@ -887,7 +889,6 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, | |||
887 | } | 889 | } |
888 | kfree(pages); | 890 | kfree(pages); |
889 | 891 | ||
890 | mm->nr_ptes++; | ||
891 | smp_wmb(); /* make pte visible before pmd */ | 892 | smp_wmb(); /* make pte visible before pmd */ |
892 | pmd_populate(mm, pmd, pgtable); | 893 | pmd_populate(mm, pmd, pgtable); |
893 | page_remove_rmap(page); | 894 | page_remove_rmap(page); |
@@ -1030,31 +1031,23 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, | |||
1030 | { | 1031 | { |
1031 | int ret = 0; | 1032 | int ret = 0; |
1032 | 1033 | ||
1033 | spin_lock(&tlb->mm->page_table_lock); | 1034 | if (__pmd_trans_huge_lock(pmd, vma) == 1) { |
1034 | if (likely(pmd_trans_huge(*pmd))) { | 1035 | struct page *page; |
1035 | if (unlikely(pmd_trans_splitting(*pmd))) { | 1036 | pgtable_t pgtable; |
1036 | spin_unlock(&tlb->mm->page_table_lock); | 1037 | pgtable = get_pmd_huge_pte(tlb->mm); |
1037 | wait_split_huge_page(vma->anon_vma, | 1038 | page = pmd_page(*pmd); |
1038 | pmd); | 1039 | pmd_clear(pmd); |
1039 | } else { | 1040 | tlb_remove_pmd_tlb_entry(tlb, pmd, addr); |
1040 | struct page *page; | 1041 | page_remove_rmap(page); |
1041 | pgtable_t pgtable; | 1042 | VM_BUG_ON(page_mapcount(page) < 0); |
1042 | pgtable = get_pmd_huge_pte(tlb->mm); | 1043 | add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR); |
1043 | page = pmd_page(*pmd); | 1044 | VM_BUG_ON(!PageHead(page)); |
1044 | pmd_clear(pmd); | 1045 | tlb->mm->nr_ptes--; |
1045 | tlb_remove_pmd_tlb_entry(tlb, pmd, addr); | ||
1046 | page_remove_rmap(page); | ||
1047 | VM_BUG_ON(page_mapcount(page) < 0); | ||
1048 | add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR); | ||
1049 | VM_BUG_ON(!PageHead(page)); | ||
1050 | spin_unlock(&tlb->mm->page_table_lock); | ||
1051 | tlb_remove_page(tlb, page); | ||
1052 | pte_free(tlb->mm, pgtable); | ||
1053 | ret = 1; | ||
1054 | } | ||
1055 | } else | ||
1056 | spin_unlock(&tlb->mm->page_table_lock); | 1046 | spin_unlock(&tlb->mm->page_table_lock); |
1057 | 1047 | tlb_remove_page(tlb, page); | |
1048 | pte_free(tlb->mm, pgtable); | ||
1049 | ret = 1; | ||
1050 | } | ||
1058 | return ret; | 1051 | return ret; |
1059 | } | 1052 | } |
1060 | 1053 | ||
@@ -1064,21 +1057,15 @@ int mincore_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, | |||
1064 | { | 1057 | { |
1065 | int ret = 0; | 1058 | int ret = 0; |
1066 | 1059 | ||
1067 | spin_lock(&vma->vm_mm->page_table_lock); | 1060 | if (__pmd_trans_huge_lock(pmd, vma) == 1) { |
1068 | if (likely(pmd_trans_huge(*pmd))) { | 1061 | /* |
1069 | ret = !pmd_trans_splitting(*pmd); | 1062 | * All logical pages in the range are present |
1070 | spin_unlock(&vma->vm_mm->page_table_lock); | 1063 | * if backed by a huge page. |
1071 | if (unlikely(!ret)) | 1064 | */ |
1072 | wait_split_huge_page(vma->anon_vma, pmd); | ||
1073 | else { | ||
1074 | /* | ||
1075 | * All logical pages in the range are present | ||
1076 | * if backed by a huge page. | ||
1077 | */ | ||
1078 | memset(vec, 1, (end - addr) >> PAGE_SHIFT); | ||
1079 | } | ||
1080 | } else | ||
1081 | spin_unlock(&vma->vm_mm->page_table_lock); | 1065 | spin_unlock(&vma->vm_mm->page_table_lock); |
1066 | memset(vec, 1, (end - addr) >> PAGE_SHIFT); | ||
1067 | ret = 1; | ||
1068 | } | ||
1082 | 1069 | ||
1083 | return ret; | 1070 | return ret; |
1084 | } | 1071 | } |
@@ -1108,20 +1095,11 @@ int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma, | |||
1108 | goto out; | 1095 | goto out; |
1109 | } | 1096 | } |
1110 | 1097 | ||
1111 | spin_lock(&mm->page_table_lock); | 1098 | ret = __pmd_trans_huge_lock(old_pmd, vma); |
1112 | if (likely(pmd_trans_huge(*old_pmd))) { | 1099 | if (ret == 1) { |
1113 | if (pmd_trans_splitting(*old_pmd)) { | 1100 | pmd = pmdp_get_and_clear(mm, old_addr, old_pmd); |
1114 | spin_unlock(&mm->page_table_lock); | 1101 | VM_BUG_ON(!pmd_none(*new_pmd)); |
1115 | wait_split_huge_page(vma->anon_vma, old_pmd); | 1102 | set_pmd_at(mm, new_addr, new_pmd, pmd); |
1116 | ret = -1; | ||
1117 | } else { | ||
1118 | pmd = pmdp_get_and_clear(mm, old_addr, old_pmd); | ||
1119 | VM_BUG_ON(!pmd_none(*new_pmd)); | ||
1120 | set_pmd_at(mm, new_addr, new_pmd, pmd); | ||
1121 | spin_unlock(&mm->page_table_lock); | ||
1122 | ret = 1; | ||
1123 | } | ||
1124 | } else { | ||
1125 | spin_unlock(&mm->page_table_lock); | 1103 | spin_unlock(&mm->page_table_lock); |
1126 | } | 1104 | } |
1127 | out: | 1105 | out: |
@@ -1134,24 +1112,41 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, | |||
1134 | struct mm_struct *mm = vma->vm_mm; | 1112 | struct mm_struct *mm = vma->vm_mm; |
1135 | int ret = 0; | 1113 | int ret = 0; |
1136 | 1114 | ||
1137 | spin_lock(&mm->page_table_lock); | 1115 | if (__pmd_trans_huge_lock(pmd, vma) == 1) { |
1116 | pmd_t entry; | ||
1117 | entry = pmdp_get_and_clear(mm, addr, pmd); | ||
1118 | entry = pmd_modify(entry, newprot); | ||
1119 | set_pmd_at(mm, addr, pmd, entry); | ||
1120 | spin_unlock(&vma->vm_mm->page_table_lock); | ||
1121 | ret = 1; | ||
1122 | } | ||
1123 | |||
1124 | return ret; | ||
1125 | } | ||
1126 | |||
1127 | /* | ||
1128 | * Returns 1 if a given pmd maps a stable (not under splitting) thp. | ||
1129 | * Returns -1 if it maps a thp under splitting. Returns 0 otherwise. | ||
1130 | * | ||
1131 | * Note that if it returns 1, this routine returns without unlocking page | ||
1132 | * table locks. So callers must unlock them. | ||
1133 | */ | ||
1134 | int __pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma) | ||
1135 | { | ||
1136 | spin_lock(&vma->vm_mm->page_table_lock); | ||
1138 | if (likely(pmd_trans_huge(*pmd))) { | 1137 | if (likely(pmd_trans_huge(*pmd))) { |
1139 | if (unlikely(pmd_trans_splitting(*pmd))) { | 1138 | if (unlikely(pmd_trans_splitting(*pmd))) { |
1140 | spin_unlock(&mm->page_table_lock); | 1139 | spin_unlock(&vma->vm_mm->page_table_lock); |
1141 | wait_split_huge_page(vma->anon_vma, pmd); | 1140 | wait_split_huge_page(vma->anon_vma, pmd); |
1141 | return -1; | ||
1142 | } else { | 1142 | } else { |
1143 | pmd_t entry; | 1143 | /* Thp mapped by 'pmd' is stable, so we can |
1144 | 1144 | * handle it as it is. */ | |
1145 | entry = pmdp_get_and_clear(mm, addr, pmd); | 1145 | return 1; |
1146 | entry = pmd_modify(entry, newprot); | ||
1147 | set_pmd_at(mm, addr, pmd, entry); | ||
1148 | spin_unlock(&vma->vm_mm->page_table_lock); | ||
1149 | ret = 1; | ||
1150 | } | 1146 | } |
1151 | } else | 1147 | } |
1152 | spin_unlock(&vma->vm_mm->page_table_lock); | 1148 | spin_unlock(&vma->vm_mm->page_table_lock); |
1153 | 1149 | return 0; | |
1154 | return ret; | ||
1155 | } | 1150 | } |
1156 | 1151 | ||
1157 | pmd_t *page_check_address_pmd(struct page *page, | 1152 | pmd_t *page_check_address_pmd(struct page *page, |
@@ -1375,7 +1370,6 @@ static int __split_huge_page_map(struct page *page, | |||
1375 | pte_unmap(pte); | 1370 | pte_unmap(pte); |
1376 | } | 1371 | } |
1377 | 1372 | ||
1378 | mm->nr_ptes++; | ||
1379 | smp_wmb(); /* make pte visible before pmd */ | 1373 | smp_wmb(); /* make pte visible before pmd */ |
1380 | /* | 1374 | /* |
1381 | * Up to this point the pmd is present and huge and | 1375 | * Up to this point the pmd is present and huge and |
@@ -1988,7 +1982,6 @@ static void collapse_huge_page(struct mm_struct *mm, | |||
1988 | set_pmd_at(mm, address, pmd, _pmd); | 1982 | set_pmd_at(mm, address, pmd, _pmd); |
1989 | update_mmu_cache(vma, address, _pmd); | 1983 | update_mmu_cache(vma, address, _pmd); |
1990 | prepare_pmd_huge_pte(pgtable, mm); | 1984 | prepare_pmd_huge_pte(pgtable, mm); |
1991 | mm->nr_ptes--; | ||
1992 | spin_unlock(&mm->page_table_lock); | 1985 | spin_unlock(&mm->page_table_lock); |
1993 | 1986 | ||
1994 | #ifndef CONFIG_NUMA | 1987 | #ifndef CONFIG_NUMA |
@@ -2083,7 +2076,7 @@ static void collect_mm_slot(struct mm_slot *mm_slot) | |||
2083 | { | 2076 | { |
2084 | struct mm_struct *mm = mm_slot->mm; | 2077 | struct mm_struct *mm = mm_slot->mm; |
2085 | 2078 | ||
2086 | VM_BUG_ON(!spin_is_locked(&khugepaged_mm_lock)); | 2079 | VM_BUG_ON(NR_CPUS != 1 && !spin_is_locked(&khugepaged_mm_lock)); |
2087 | 2080 | ||
2088 | if (khugepaged_test_exit(mm)) { | 2081 | if (khugepaged_test_exit(mm)) { |
2089 | /* free mm_slot */ | 2082 | /* free mm_slot */ |
@@ -2113,7 +2106,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, | |||
2113 | int progress = 0; | 2106 | int progress = 0; |
2114 | 2107 | ||
2115 | VM_BUG_ON(!pages); | 2108 | VM_BUG_ON(!pages); |
2116 | VM_BUG_ON(!spin_is_locked(&khugepaged_mm_lock)); | 2109 | VM_BUG_ON(NR_CPUS != 1 && !spin_is_locked(&khugepaged_mm_lock)); |
2117 | 2110 | ||
2118 | if (khugepaged_scan.mm_slot) | 2111 | if (khugepaged_scan.mm_slot) |
2119 | mm_slot = khugepaged_scan.mm_slot; | 2112 | mm_slot = khugepaged_scan.mm_slot; |
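
The repeated lock/check/wait-for-split sequence is now centralised in __pmd_trans_huge_lock(). Its contract is asymmetric: on a return of 1 the page_table_lock is still held and the caller must drop it; on 0 or -1 the lock has already been released. A minimal sketch of a caller following that contract (the function name is hypothetical):

#include <linux/mm.h>
#include <linux/huge_mm.h>
#include <linux/spinlock.h>

static int with_stable_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd)
{
        if (__pmd_trans_huge_lock(pmd, vma) == 1) {
                /* thp is stable here; operate on *pmd ... */
                spin_unlock(&vma->vm_mm->page_table_lock);
                return 1;               /* handled as a huge pmd */
        }
        return 0;       /* not huge, or it was split while we waited */
}
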
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 5f34bd8dda3..b8ce6f45095 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -53,6 +53,84 @@ static unsigned long __initdata default_hstate_size; | |||
53 | */ | 53 | */ |
54 | static DEFINE_SPINLOCK(hugetlb_lock); | 54 | static DEFINE_SPINLOCK(hugetlb_lock); |
55 | 55 | ||
56 | static inline void unlock_or_release_subpool(struct hugepage_subpool *spool) | ||
57 | { | ||
58 | bool free = (spool->count == 0) && (spool->used_hpages == 0); | ||
59 | |||
60 | spin_unlock(&spool->lock); | ||
61 | |||
62 | /* If no pages are used, and no other handles to the subpool | ||
63 | * remain, free the subpool the subpool remain */ | ||
64 | if (free) | ||
65 | kfree(spool); | ||
66 | } | ||
67 | |||
68 | struct hugepage_subpool *hugepage_new_subpool(long nr_blocks) | ||
69 | { | ||
70 | struct hugepage_subpool *spool; | ||
71 | |||
72 | spool = kmalloc(sizeof(*spool), GFP_KERNEL); | ||
73 | if (!spool) | ||
74 | return NULL; | ||
75 | |||
76 | spin_lock_init(&spool->lock); | ||
77 | spool->count = 1; | ||
78 | spool->max_hpages = nr_blocks; | ||
79 | spool->used_hpages = 0; | ||
80 | |||
81 | return spool; | ||
82 | } | ||
83 | |||
84 | void hugepage_put_subpool(struct hugepage_subpool *spool) | ||
85 | { | ||
86 | spin_lock(&spool->lock); | ||
87 | BUG_ON(!spool->count); | ||
88 | spool->count--; | ||
89 | unlock_or_release_subpool(spool); | ||
90 | } | ||
91 | |||
92 | static int hugepage_subpool_get_pages(struct hugepage_subpool *spool, | ||
93 | long delta) | ||
94 | { | ||
95 | int ret = 0; | ||
96 | |||
97 | if (!spool) | ||
98 | return 0; | ||
99 | |||
100 | spin_lock(&spool->lock); | ||
101 | if ((spool->used_hpages + delta) <= spool->max_hpages) { | ||
102 | spool->used_hpages += delta; | ||
103 | } else { | ||
104 | ret = -ENOMEM; | ||
105 | } | ||
106 | spin_unlock(&spool->lock); | ||
107 | |||
108 | return ret; | ||
109 | } | ||
110 | |||
111 | static void hugepage_subpool_put_pages(struct hugepage_subpool *spool, | ||
112 | long delta) | ||
113 | { | ||
114 | if (!spool) | ||
115 | return; | ||
116 | |||
117 | spin_lock(&spool->lock); | ||
118 | spool->used_hpages -= delta; | ||
119 | /* If hugetlbfs_put_super couldn't free spool due to | ||
120 | * an outstanding quota reference, free it now. */ | ||
121 | unlock_or_release_subpool(spool); | ||
122 | } | ||
123 | |||
124 | static inline struct hugepage_subpool *subpool_inode(struct inode *inode) | ||
125 | { | ||
126 | return HUGETLBFS_SB(inode->i_sb)->spool; | ||
127 | } | ||
128 | |||
129 | static inline struct hugepage_subpool *subpool_vma(struct vm_area_struct *vma) | ||
130 | { | ||
131 | return subpool_inode(vma->vm_file->f_dentry->d_inode); | ||
132 | } | ||
133 | |||
56 | /* | 134 | /* |
57 | * Region tracking -- allows tracking of reservations and instantiated pages | 135 | * Region tracking -- allows tracking of reservations and instantiated pages |
58 | * across the pages in a mapping. | 136 | * across the pages in a mapping. |
@@ -454,14 +532,16 @@ static struct page *dequeue_huge_page_vma(struct hstate *h, | |||
454 | struct vm_area_struct *vma, | 532 | struct vm_area_struct *vma, |
455 | unsigned long address, int avoid_reserve) | 533 | unsigned long address, int avoid_reserve) |
456 | { | 534 | { |
457 | struct page *page = NULL; | 535 | struct page *page; |
458 | struct mempolicy *mpol; | 536 | struct mempolicy *mpol; |
459 | nodemask_t *nodemask; | 537 | nodemask_t *nodemask; |
460 | struct zonelist *zonelist; | 538 | struct zonelist *zonelist; |
461 | struct zone *zone; | 539 | struct zone *zone; |
462 | struct zoneref *z; | 540 | struct zoneref *z; |
541 | unsigned int cpuset_mems_cookie; | ||
463 | 542 | ||
464 | get_mems_allowed(); | 543 | retry_cpuset: |
544 | cpuset_mems_cookie = get_mems_allowed(); | ||
465 | zonelist = huge_zonelist(vma, address, | 545 | zonelist = huge_zonelist(vma, address, |
466 | htlb_alloc_mask, &mpol, &nodemask); | 546 | htlb_alloc_mask, &mpol, &nodemask); |
467 | /* | 547 | /* |
@@ -488,10 +568,15 @@ static struct page *dequeue_huge_page_vma(struct hstate *h, | |||
488 | } | 568 | } |
489 | } | 569 | } |
490 | } | 570 | } |
491 | err: | 571 | |
492 | mpol_cond_put(mpol); | 572 | mpol_cond_put(mpol); |
493 | put_mems_allowed(); | 573 | if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page)) |
574 | goto retry_cpuset; | ||
494 | return page; | 575 | return page; |
576 | |||
577 | err: | ||
578 | mpol_cond_put(mpol); | ||
579 | return NULL; | ||
495 | } | 580 | } |
496 | 581 | ||
497 | static void update_and_free_page(struct hstate *h, struct page *page) | 582 | static void update_and_free_page(struct hstate *h, struct page *page) |
@@ -533,9 +618,9 @@ static void free_huge_page(struct page *page) | |||
533 | */ | 618 | */ |
534 | struct hstate *h = page_hstate(page); | 619 | struct hstate *h = page_hstate(page); |
535 | int nid = page_to_nid(page); | 620 | int nid = page_to_nid(page); |
536 | struct address_space *mapping; | 621 | struct hugepage_subpool *spool = |
622 | (struct hugepage_subpool *)page_private(page); | ||
537 | 623 | ||
538 | mapping = (struct address_space *) page_private(page); | ||
539 | set_page_private(page, 0); | 624 | set_page_private(page, 0); |
540 | page->mapping = NULL; | 625 | page->mapping = NULL; |
541 | BUG_ON(page_count(page)); | 626 | BUG_ON(page_count(page)); |
@@ -551,8 +636,7 @@ static void free_huge_page(struct page *page) | |||
551 | enqueue_huge_page(h, page); | 636 | enqueue_huge_page(h, page); |
552 | } | 637 | } |
553 | spin_unlock(&hugetlb_lock); | 638 | spin_unlock(&hugetlb_lock); |
554 | if (mapping) | 639 | hugepage_subpool_put_pages(spool, 1); |
555 | hugetlb_put_quota(mapping, 1); | ||
556 | } | 640 | } |
557 | 641 | ||
558 | static void prep_new_huge_page(struct hstate *h, struct page *page, int nid) | 642 | static void prep_new_huge_page(struct hstate *h, struct page *page, int nid) |
@@ -852,6 +936,7 @@ static int gather_surplus_pages(struct hstate *h, int delta) | |||
852 | struct page *page, *tmp; | 936 | struct page *page, *tmp; |
853 | int ret, i; | 937 | int ret, i; |
854 | int needed, allocated; | 938 | int needed, allocated; |
939 | bool alloc_ok = true; | ||
855 | 940 | ||
856 | needed = (h->resv_huge_pages + delta) - h->free_huge_pages; | 941 | needed = (h->resv_huge_pages + delta) - h->free_huge_pages; |
857 | if (needed <= 0) { | 942 | if (needed <= 0) { |
@@ -867,17 +952,13 @@ retry: | |||
867 | spin_unlock(&hugetlb_lock); | 952 | spin_unlock(&hugetlb_lock); |
868 | for (i = 0; i < needed; i++) { | 953 | for (i = 0; i < needed; i++) { |
869 | page = alloc_buddy_huge_page(h, NUMA_NO_NODE); | 954 | page = alloc_buddy_huge_page(h, NUMA_NO_NODE); |
870 | if (!page) | 955 | if (!page) { |
871 | /* | 956 | alloc_ok = false; |
872 | * We were not able to allocate enough pages to | 957 | break; |
873 | * satisfy the entire reservation so we free what | 958 | } |
874 | * we've allocated so far. | ||
875 | */ | ||
876 | goto free; | ||
877 | |||
878 | list_add(&page->lru, &surplus_list); | 959 | list_add(&page->lru, &surplus_list); |
879 | } | 960 | } |
880 | allocated += needed; | 961 | allocated += i; |
881 | 962 | ||
882 | /* | 963 | /* |
883 | * After retaking hugetlb_lock, we need to recalculate 'needed' | 964 | * After retaking hugetlb_lock, we need to recalculate 'needed' |
@@ -886,9 +967,16 @@ retry: | |||
886 | spin_lock(&hugetlb_lock); | 967 | spin_lock(&hugetlb_lock); |
887 | needed = (h->resv_huge_pages + delta) - | 968 | needed = (h->resv_huge_pages + delta) - |
888 | (h->free_huge_pages + allocated); | 969 | (h->free_huge_pages + allocated); |
889 | if (needed > 0) | 970 | if (needed > 0) { |
890 | goto retry; | 971 | if (alloc_ok) |
891 | 972 | goto retry; | |
973 | /* | ||
974 | * We were not able to allocate enough pages to | ||
975 | * satisfy the entire reservation so we free what | ||
976 | * we've allocated so far. | ||
977 | */ | ||
978 | goto free; | ||
979 | } | ||
892 | /* | 980 | /* |
893 | * The surplus_list now contains _at_least_ the number of extra pages | 981 | * The surplus_list now contains _at_least_ the number of extra pages |
894 | * needed to accommodate the reservation. Add the appropriate number | 982 | * needed to accommodate the reservation. Add the appropriate number |
@@ -914,10 +1002,10 @@ retry: | |||
914 | VM_BUG_ON(page_count(page)); | 1002 | VM_BUG_ON(page_count(page)); |
915 | enqueue_huge_page(h, page); | 1003 | enqueue_huge_page(h, page); |
916 | } | 1004 | } |
1005 | free: | ||
917 | spin_unlock(&hugetlb_lock); | 1006 | spin_unlock(&hugetlb_lock); |
918 | 1007 | ||
919 | /* Free unnecessary surplus pages to the buddy allocator */ | 1008 | /* Free unnecessary surplus pages to the buddy allocator */ |
920 | free: | ||
921 | if (!list_empty(&surplus_list)) { | 1009 | if (!list_empty(&surplus_list)) { |
922 | list_for_each_entry_safe(page, tmp, &surplus_list, lru) { | 1010 | list_for_each_entry_safe(page, tmp, &surplus_list, lru) { |
923 | list_del(&page->lru); | 1011 | list_del(&page->lru); |
@@ -966,11 +1054,12 @@ static void return_unused_surplus_pages(struct hstate *h, | |||
966 | /* | 1054 | /* |
967 | * Determine if the huge page at addr within the vma has an associated | 1055 | * Determine if the huge page at addr within the vma has an associated |
968 | * reservation. Where it does not we will need to logically increase | 1056 | * reservation. Where it does not we will need to logically increase |
969 | * reservation and actually increase quota before an allocation can occur. | 1057 | * reservation and actually increase subpool usage before an allocation |
970 | * Where any new reservation would be required the reservation change is | 1058 | * can occur. Where any new reservation would be required the |
971 | * prepared, but not committed. Once the page has been quota'd allocated | 1059 | * reservation change is prepared, but not committed. Once the page |
972 | * an instantiated the change should be committed via vma_commit_reservation. | 1060 | * has been allocated from the subpool and instantiated the change should |
973 | * No action is required on failure. | 1061 | * be committed via vma_commit_reservation. No action is required on |
1062 | * failure. | ||
974 | */ | 1063 | */ |
975 | static long vma_needs_reservation(struct hstate *h, | 1064 | static long vma_needs_reservation(struct hstate *h, |
976 | struct vm_area_struct *vma, unsigned long addr) | 1065 | struct vm_area_struct *vma, unsigned long addr) |
@@ -1019,24 +1108,24 @@ static void vma_commit_reservation(struct hstate *h, | |||
1019 | static struct page *alloc_huge_page(struct vm_area_struct *vma, | 1108 | static struct page *alloc_huge_page(struct vm_area_struct *vma, |
1020 | unsigned long addr, int avoid_reserve) | 1109 | unsigned long addr, int avoid_reserve) |
1021 | { | 1110 | { |
1111 | struct hugepage_subpool *spool = subpool_vma(vma); | ||
1022 | struct hstate *h = hstate_vma(vma); | 1112 | struct hstate *h = hstate_vma(vma); |
1023 | struct page *page; | 1113 | struct page *page; |
1024 | struct address_space *mapping = vma->vm_file->f_mapping; | ||
1025 | struct inode *inode = mapping->host; | ||
1026 | long chg; | 1114 | long chg; |
1027 | 1115 | ||
1028 | /* | 1116 | /* |
1029 | * Processes that did not create the mapping will have no reserves and | 1117 | * Processes that did not create the mapping will have no |
1030 | * will not have accounted against quota. Check that the quota can be | 1118 | * reserves and will not have accounted against subpool |
1031 | * made before satisfying the allocation | 1119 | * limit. Check that the subpool limit can be made before |
1032 | * MAP_NORESERVE mappings may also need pages and quota allocated | 1120 | * satisfying the allocation MAP_NORESERVE mappings may also |
1033 | * if no reserve mapping overlaps. | 1121 | * need pages and subpool limit allocated allocated if no reserve |
1122 | * mapping overlaps. | ||
1034 | */ | 1123 | */ |
1035 | chg = vma_needs_reservation(h, vma, addr); | 1124 | chg = vma_needs_reservation(h, vma, addr); |
1036 | if (chg < 0) | 1125 | if (chg < 0) |
1037 | return ERR_PTR(-VM_FAULT_OOM); | 1126 | return ERR_PTR(-VM_FAULT_OOM); |
1038 | if (chg) | 1127 | if (chg) |
1039 | if (hugetlb_get_quota(inode->i_mapping, chg)) | 1128 | if (hugepage_subpool_get_pages(spool, chg)) |
1040 | return ERR_PTR(-VM_FAULT_SIGBUS); | 1129 | return ERR_PTR(-VM_FAULT_SIGBUS); |
1041 | 1130 | ||
1042 | spin_lock(&hugetlb_lock); | 1131 | spin_lock(&hugetlb_lock); |
@@ -1046,12 +1135,12 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma, | |||
1046 | if (!page) { | 1135 | if (!page) { |
1047 | page = alloc_buddy_huge_page(h, NUMA_NO_NODE); | 1136 | page = alloc_buddy_huge_page(h, NUMA_NO_NODE); |
1048 | if (!page) { | 1137 | if (!page) { |
1049 | hugetlb_put_quota(inode->i_mapping, chg); | 1138 | hugepage_subpool_put_pages(spool, chg); |
1050 | return ERR_PTR(-VM_FAULT_SIGBUS); | 1139 | return ERR_PTR(-VM_FAULT_SIGBUS); |
1051 | } | 1140 | } |
1052 | } | 1141 | } |
1053 | 1142 | ||
1054 | set_page_private(page, (unsigned long) mapping); | 1143 | set_page_private(page, (unsigned long)spool); |
1055 | 1144 | ||
1056 | vma_commit_reservation(h, vma, addr); | 1145 | vma_commit_reservation(h, vma, addr); |
1057 | 1146 | ||
@@ -2072,6 +2161,7 @@ static void hugetlb_vm_op_close(struct vm_area_struct *vma) | |||
2072 | { | 2161 | { |
2073 | struct hstate *h = hstate_vma(vma); | 2162 | struct hstate *h = hstate_vma(vma); |
2074 | struct resv_map *reservations = vma_resv_map(vma); | 2163 | struct resv_map *reservations = vma_resv_map(vma); |
2164 | struct hugepage_subpool *spool = subpool_vma(vma); | ||
2075 | unsigned long reserve; | 2165 | unsigned long reserve; |
2076 | unsigned long start; | 2166 | unsigned long start; |
2077 | unsigned long end; | 2167 | unsigned long end; |
@@ -2087,7 +2177,7 @@ static void hugetlb_vm_op_close(struct vm_area_struct *vma) | |||
2087 | 2177 | ||
2088 | if (reserve) { | 2178 | if (reserve) { |
2089 | hugetlb_acct_memory(h, -reserve); | 2179 | hugetlb_acct_memory(h, -reserve); |
2090 | hugetlb_put_quota(vma->vm_file->f_mapping, reserve); | 2180 | hugepage_subpool_put_pages(spool, reserve); |
2091 | } | 2181 | } |
2092 | } | 2182 | } |
2093 | } | 2183 | } |
@@ -2241,16 +2331,23 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, | |||
2241 | if (huge_pmd_unshare(mm, &address, ptep)) | 2331 | if (huge_pmd_unshare(mm, &address, ptep)) |
2242 | continue; | 2332 | continue; |
2243 | 2333 | ||
2334 | pte = huge_ptep_get(ptep); | ||
2335 | if (huge_pte_none(pte)) | ||
2336 | continue; | ||
2337 | |||
2338 | /* | ||
2339 | * HWPoisoned hugepage is already unmapped and dropped reference | ||
2340 | */ | ||
2341 | if (unlikely(is_hugetlb_entry_hwpoisoned(pte))) | ||
2342 | continue; | ||
2343 | |||
2344 | page = pte_page(pte); | ||
2244 | /* | 2345 | /* |
2245 | * If a reference page is supplied, it is because a specific | 2346 | * If a reference page is supplied, it is because a specific |
2246 | * page is being unmapped, not a range. Ensure the page we | 2347 | * page is being unmapped, not a range. Ensure the page we |
2247 | * are about to unmap is the actual page of interest. | 2348 | * are about to unmap is the actual page of interest. |
2248 | */ | 2349 | */ |
2249 | if (ref_page) { | 2350 | if (ref_page) { |
2250 | pte = huge_ptep_get(ptep); | ||
2251 | if (huge_pte_none(pte)) | ||
2252 | continue; | ||
2253 | page = pte_page(pte); | ||
2254 | if (page != ref_page) | 2351 | if (page != ref_page) |
2255 | continue; | 2352 | continue; |
2256 | 2353 | ||
@@ -2263,22 +2360,16 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, | |||
2263 | } | 2360 | } |
2264 | 2361 | ||
2265 | pte = huge_ptep_get_and_clear(mm, address, ptep); | 2362 | pte = huge_ptep_get_and_clear(mm, address, ptep); |
2266 | if (huge_pte_none(pte)) | ||
2267 | continue; | ||
2268 | |||
2269 | /* | ||
2270 | * HWPoisoned hugepage is already unmapped and dropped reference | ||
2271 | */ | ||
2272 | if (unlikely(is_hugetlb_entry_hwpoisoned(pte))) | ||
2273 | continue; | ||
2274 | |||
2275 | page = pte_page(pte); | ||
2276 | if (pte_dirty(pte)) | 2363 | if (pte_dirty(pte)) |
2277 | set_page_dirty(page); | 2364 | set_page_dirty(page); |
2278 | list_add(&page->lru, &page_list); | 2365 | list_add(&page->lru, &page_list); |
2366 | |||
2367 | /* Bail out after unmapping reference page if supplied */ | ||
2368 | if (ref_page) | ||
2369 | break; | ||
2279 | } | 2370 | } |
2280 | spin_unlock(&mm->page_table_lock); | ||
2281 | flush_tlb_range(vma, start, end); | 2371 | flush_tlb_range(vma, start, end); |
2372 | spin_unlock(&mm->page_table_lock); | ||
2282 | mmu_notifier_invalidate_range_end(mm, start, end); | 2373 | mmu_notifier_invalidate_range_end(mm, start, end); |
2283 | list_for_each_entry_safe(page, tmp, &page_list, lru) { | 2374 | list_for_each_entry_safe(page, tmp, &page_list, lru) { |
2284 | page_remove_rmap(page); | 2375 | page_remove_rmap(page); |
@@ -2316,7 +2407,7 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2316 | */ | 2407 | */ |
2317 | address = address & huge_page_mask(h); | 2408 | address = address & huge_page_mask(h); |
2318 | pgoff = vma_hugecache_offset(h, vma, address); | 2409 | pgoff = vma_hugecache_offset(h, vma, address); |
2319 | mapping = (struct address_space *)page_private(page); | 2410 | mapping = vma->vm_file->f_dentry->d_inode->i_mapping; |
2320 | 2411 | ||
2321 | /* | 2412 | /* |
2322 | * Take the mapping lock for the duration of the table walk. As | 2413 | * Take the mapping lock for the duration of the table walk. As |
@@ -2869,11 +2960,12 @@ int hugetlb_reserve_pages(struct inode *inode, | |||
2869 | { | 2960 | { |
2870 | long ret, chg; | 2961 | long ret, chg; |
2871 | struct hstate *h = hstate_inode(inode); | 2962 | struct hstate *h = hstate_inode(inode); |
2963 | struct hugepage_subpool *spool = subpool_inode(inode); | ||
2872 | 2964 | ||
2873 | /* | 2965 | /* |
2874 | * Only apply hugepage reservation if asked. At fault time, an | 2966 | * Only apply hugepage reservation if asked. At fault time, an |
2875 | * attempt will be made for VM_NORESERVE to allocate a page | 2967 | * attempt will be made for VM_NORESERVE to allocate a page |
2876 | * and filesystem quota without using reserves | 2968 | * without using reserves |
2877 | */ | 2969 | */ |
2878 | if (vm_flags & VM_NORESERVE) | 2970 | if (vm_flags & VM_NORESERVE) |
2879 | return 0; | 2971 | return 0; |
@@ -2900,17 +2992,17 @@ int hugetlb_reserve_pages(struct inode *inode, | |||
2900 | if (chg < 0) | 2992 | if (chg < 0) |
2901 | return chg; | 2993 | return chg; |
2902 | 2994 | ||
2903 | /* There must be enough filesystem quota for the mapping */ | 2995 | /* There must be enough pages in the subpool for the mapping */ |
2904 | if (hugetlb_get_quota(inode->i_mapping, chg)) | 2996 | if (hugepage_subpool_get_pages(spool, chg)) |
2905 | return -ENOSPC; | 2997 | return -ENOSPC; |
2906 | 2998 | ||
2907 | /* | 2999 | /* |
2908 | * Check enough hugepages are available for the reservation. | 3000 | * Check enough hugepages are available for the reservation. |
2909 | * Hand back the quota if there are not | 3001 | * Hand the pages back to the subpool if there are not |
2910 | */ | 3002 | */ |
2911 | ret = hugetlb_acct_memory(h, chg); | 3003 | ret = hugetlb_acct_memory(h, chg); |
2912 | if (ret < 0) { | 3004 | if (ret < 0) { |
2913 | hugetlb_put_quota(inode->i_mapping, chg); | 3005 | hugepage_subpool_put_pages(spool, chg); |
2914 | return ret; | 3006 | return ret; |
2915 | } | 3007 | } |
2916 | 3008 | ||
@@ -2934,12 +3026,13 @@ void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed) | |||
2934 | { | 3026 | { |
2935 | struct hstate *h = hstate_inode(inode); | 3027 | struct hstate *h = hstate_inode(inode); |
2936 | long chg = region_truncate(&inode->i_mapping->private_list, offset); | 3028 | long chg = region_truncate(&inode->i_mapping->private_list, offset); |
3029 | struct hugepage_subpool *spool = subpool_inode(inode); | ||
2937 | 3030 | ||
2938 | spin_lock(&inode->i_lock); | 3031 | spin_lock(&inode->i_lock); |
2939 | inode->i_blocks -= (blocks_per_huge_page(h) * freed); | 3032 | inode->i_blocks -= (blocks_per_huge_page(h) * freed); |
2940 | spin_unlock(&inode->i_lock); | 3033 | spin_unlock(&inode->i_lock); |
2941 | 3034 | ||
2942 | hugetlb_put_quota(inode->i_mapping, (chg - freed)); | 3035 | hugepage_subpool_put_pages(spool, (chg - freed)); |
2943 | hugetlb_acct_memory(h, -(chg - freed)); | 3036 | hugetlb_acct_memory(h, -(chg - freed)); |
2944 | } | 3037 | } |
2945 | 3038 | ||
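
The hugetlb hunks above replace the old filesystem-quota calls with a per-inode hugepage subpool (hugepage_subpool_get_pages()/hugepage_subpool_put_pages(), with subpool_inode() resolving the pool for an inode). The subpool itself is defined elsewhere in this series; the sketch below only illustrates the reserve/release contract those calls assume, using made-up demo_* names rather than the real mm/hugetlb.c code.

	/* Hedged sketch of the subpool accounting contract assumed above.
	 * All names and fields here are illustrative, not from this series. */
	#include <linux/spinlock.h>
	#include <linux/errno.h>

	struct demo_subpool {
		spinlock_t lock;
		long max_pages;		/* capacity of the pool */
		long used_pages;	/* pages currently reserved */
	};

	/* Nonzero return means the reservation does not fit; the caller in
	 * hugetlb_reserve_pages() maps that to -ENOSPC. */
	static int demo_subpool_get_pages(struct demo_subpool *sp, long delta)
	{
		int ret = 0;

		spin_lock(&sp->lock);
		if (sp->used_pages + delta <= sp->max_pages)
			sp->used_pages += delta;	/* reservation fits */
		else
			ret = -ENOMEM;
		spin_unlock(&sp->lock);
		return ret;
	}

	static void demo_subpool_put_pages(struct demo_subpool *sp, long delta)
	{
		spin_lock(&sp->lock);
		sp->used_pages -= delta;	/* hand the reservation back */
		spin_unlock(&sp->lock);
	}
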
diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c index c7fc7fd00e3..cc448bb983b 100644 --- a/mm/hwpoison-inject.c +++ b/mm/hwpoison-inject.c | |||
@@ -45,7 +45,7 @@ static int hwpoison_inject(void *data, u64 val) | |||
45 | * do a racy check with elevated page count, to make sure PG_hwpoison | 45 | * do a racy check with elevated page count, to make sure PG_hwpoison |
46 | * will only be set for the targeted owner (or on a free page). | 46 | * will only be set for the targeted owner (or on a free page). |
47 | * We temporarily take page lock for try_get_mem_cgroup_from_page(). | 47 | * We temporarily take page lock for try_get_mem_cgroup_from_page(). |
48 | * __memory_failure() will redo the check reliably inside page lock. | 48 | * memory_failure() will redo the check reliably inside page lock. |
49 | */ | 49 | */ |
50 | lock_page(hpage); | 50 | lock_page(hpage); |
51 | err = hwpoison_filter(hpage); | 51 | err = hwpoison_filter(hpage); |
@@ -55,7 +55,7 @@ static int hwpoison_inject(void *data, u64 val) | |||
55 | 55 | ||
56 | inject: | 56 | inject: |
57 | printk(KERN_INFO "Injecting memory failure at pfn %lx\n", pfn); | 57 | printk(KERN_INFO "Injecting memory failure at pfn %lx\n", pfn); |
58 | return __memory_failure(pfn, 18, MF_COUNT_INCREASED); | 58 | return memory_failure(pfn, 18, MF_COUNT_INCREASED); |
59 | } | 59 | } |
60 | 60 | ||
61 | static int hwpoison_unpoison(void *data, u64 val) | 61 | static int hwpoison_unpoison(void *data, u64 val) |

diff --git a/mm/kmemleak.c b/mm/kmemleak.c index c833addd94d..45eb6217bf3 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c | |||
@@ -1036,7 +1036,7 @@ void __ref kmemleak_scan_area(const void *ptr, size_t size, gfp_t gfp) | |||
1036 | { | 1036 | { |
1037 | pr_debug("%s(0x%p)\n", __func__, ptr); | 1037 | pr_debug("%s(0x%p)\n", __func__, ptr); |
1038 | 1038 | ||
1039 | if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) | 1039 | if (atomic_read(&kmemleak_enabled) && ptr && size && !IS_ERR(ptr)) |
1040 | add_scan_area((unsigned long)ptr, size, gfp); | 1040 | add_scan_area((unsigned long)ptr, size, gfp); |
1041 | else if (atomic_read(&kmemleak_early_log)) | 1041 | else if (atomic_read(&kmemleak_early_log)) |
1042 | log_early(KMEMLEAK_SCAN_AREA, ptr, size, 0); | 1042 | log_early(KMEMLEAK_SCAN_AREA, ptr, size, 0); |
@@ -1757,6 +1757,7 @@ void __init kmemleak_init(void) | |||
1757 | 1757 | ||
1758 | #ifdef CONFIG_DEBUG_KMEMLEAK_DEFAULT_OFF | 1758 | #ifdef CONFIG_DEBUG_KMEMLEAK_DEFAULT_OFF |
1759 | if (!kmemleak_skip_disable) { | 1759 | if (!kmemleak_skip_disable) { |
1760 | atomic_set(&kmemleak_early_log, 0); | ||
1760 | kmemleak_disable(); | 1761 | kmemleak_disable(); |
1761 | return; | 1762 | return; |
1762 | } | 1763 | } |
diff --git a/mm/ksm.c b/mm/ksm.c --- a/mm/ksm.c +++ b/mm/ksm.c | |||
@@ -28,7 +28,6 @@ | |||
28 | #include <linux/kthread.h> | 28 | #include <linux/kthread.h> |
29 | #include <linux/wait.h> | 29 | #include <linux/wait.h> |
30 | #include <linux/slab.h> | 30 | #include <linux/slab.h> |
31 | #include <linux/memcontrol.h> | ||
32 | #include <linux/rbtree.h> | 31 | #include <linux/rbtree.h> |
33 | #include <linux/memory.h> | 32 | #include <linux/memory.h> |
34 | #include <linux/mmu_notifier.h> | 33 | #include <linux/mmu_notifier.h> |
@@ -375,6 +374,20 @@ static int break_ksm(struct vm_area_struct *vma, unsigned long addr) | |||
375 | return (ret & VM_FAULT_OOM) ? -ENOMEM : 0; | 374 | return (ret & VM_FAULT_OOM) ? -ENOMEM : 0; |
376 | } | 375 | } |
377 | 376 | ||
377 | static struct vm_area_struct *find_mergeable_vma(struct mm_struct *mm, | ||
378 | unsigned long addr) | ||
379 | { | ||
380 | struct vm_area_struct *vma; | ||
381 | if (ksm_test_exit(mm)) | ||
382 | return NULL; | ||
383 | vma = find_vma(mm, addr); | ||
384 | if (!vma || vma->vm_start > addr) | ||
385 | return NULL; | ||
386 | if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma) | ||
387 | return NULL; | ||
388 | return vma; | ||
389 | } | ||
390 | |||
378 | static void break_cow(struct rmap_item *rmap_item) | 391 | static void break_cow(struct rmap_item *rmap_item) |
379 | { | 392 | { |
380 | struct mm_struct *mm = rmap_item->mm; | 393 | struct mm_struct *mm = rmap_item->mm; |
@@ -388,15 +401,9 @@ static void break_cow(struct rmap_item *rmap_item) | |||
388 | put_anon_vma(rmap_item->anon_vma); | 401 | put_anon_vma(rmap_item->anon_vma); |
389 | 402 | ||
390 | down_read(&mm->mmap_sem); | 403 | down_read(&mm->mmap_sem); |
391 | if (ksm_test_exit(mm)) | 404 | vma = find_mergeable_vma(mm, addr); |
392 | goto out; | 405 | if (vma) |
393 | vma = find_vma(mm, addr); | 406 | break_ksm(vma, addr); |
394 | if (!vma || vma->vm_start > addr) | ||
395 | goto out; | ||
396 | if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma) | ||
397 | goto out; | ||
398 | break_ksm(vma, addr); | ||
399 | out: | ||
400 | up_read(&mm->mmap_sem); | 407 | up_read(&mm->mmap_sem); |
401 | } | 408 | } |
402 | 409 | ||
@@ -422,12 +429,8 @@ static struct page *get_mergeable_page(struct rmap_item *rmap_item) | |||
422 | struct page *page; | 429 | struct page *page; |
423 | 430 | ||
424 | down_read(&mm->mmap_sem); | 431 | down_read(&mm->mmap_sem); |
425 | if (ksm_test_exit(mm)) | 432 | vma = find_mergeable_vma(mm, addr); |
426 | goto out; | 433 | if (!vma) |
427 | vma = find_vma(mm, addr); | ||
428 | if (!vma || vma->vm_start > addr) | ||
429 | goto out; | ||
430 | if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma) | ||
431 | goto out; | 434 | goto out; |
432 | 435 | ||
433 | page = follow_page(vma, addr, FOLL_GET); | 436 | page = follow_page(vma, addr, FOLL_GET); |
@@ -673,9 +676,9 @@ error: | |||
673 | static u32 calc_checksum(struct page *page) | 676 | static u32 calc_checksum(struct page *page) |
674 | { | 677 | { |
675 | u32 checksum; | 678 | u32 checksum; |
676 | void *addr = kmap_atomic(page, KM_USER0); | 679 | void *addr = kmap_atomic(page); |
677 | checksum = jhash2(addr, PAGE_SIZE / 4, 17); | 680 | checksum = jhash2(addr, PAGE_SIZE / 4, 17); |
678 | kunmap_atomic(addr, KM_USER0); | 681 | kunmap_atomic(addr); |
679 | return checksum; | 682 | return checksum; |
680 | } | 683 | } |
681 | 684 | ||
@@ -684,11 +687,11 @@ static int memcmp_pages(struct page *page1, struct page *page2) | |||
684 | char *addr1, *addr2; | 687 | char *addr1, *addr2; |
685 | int ret; | 688 | int ret; |
686 | 689 | ||
687 | addr1 = kmap_atomic(page1, KM_USER0); | 690 | addr1 = kmap_atomic(page1); |
688 | addr2 = kmap_atomic(page2, KM_USER1); | 691 | addr2 = kmap_atomic(page2); |
689 | ret = memcmp(addr1, addr2, PAGE_SIZE); | 692 | ret = memcmp(addr1, addr2, PAGE_SIZE); |
690 | kunmap_atomic(addr2, KM_USER1); | 693 | kunmap_atomic(addr2); |
691 | kunmap_atomic(addr1, KM_USER0); | 694 | kunmap_atomic(addr1); |
692 | return ret; | 695 | return ret; |
693 | } | 696 | } |
694 | 697 | ||
@@ -1572,16 +1575,6 @@ struct page *ksm_does_need_to_copy(struct page *page, | |||
1572 | 1575 | ||
1573 | new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); | 1576 | new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); |
1574 | if (new_page) { | 1577 | if (new_page) { |
1575 | /* | ||
1576 | * The memcg-specific accounting when moving | ||
1577 | * pages around the LRU lists relies on the | ||
1578 | * page's owner (memcg) to be valid. Usually, | ||
1579 | * pages are assigned to a new owner before | ||
1580 | * being put on the LRU list, but since this | ||
1581 | * is not the case here, the stale owner from | ||
1582 | * a previous allocation cycle must be reset. | ||
1583 | */ | ||
1584 | mem_cgroup_reset_owner(new_page); | ||
1585 | copy_user_highpage(new_page, page, address, vma); | 1578 | copy_user_highpage(new_page, page, address, vma); |
1586 | 1579 | ||
1587 | SetPageDirty(new_page); | 1580 | SetPageDirty(new_page); |
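
The calc_checksum()/memcmp_pages() hunks above drop the KM_USER0/KM_USER1 slot arguments: the reworked kmap_atomic()/kunmap_atomic() keep a small per-CPU stack of mappings instead of fixed slots, so nested mappings only have to be released in reverse order. A minimal sketch of that calling convention follows; demo_compare_pages() is illustrative, not a function added by this series.

	/* Hedged sketch: compare two highmem pages with the slot-less
	 * kmap_atomic() API, unmapping in reverse order of mapping. */
	#include <linux/highmem.h>
	#include <linux/string.h>

	static int demo_compare_pages(struct page *p1, struct page *p2)
	{
		char *a1, *a2;
		int ret;

		a1 = kmap_atomic(p1);	/* no KM_USER0 slot any more */
		a2 = kmap_atomic(p2);
		ret = memcmp(a1, a2, PAGE_SIZE);
		kunmap_atomic(a2);	/* release in reverse order */
		kunmap_atomic(a1);
		return ret;
	}
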
diff --git a/mm/madvise.c b/mm/madvise.c index 74bf193eff0..1ccbba5b667 100644 --- a/mm/madvise.c +++ b/mm/madvise.c | |||
@@ -65,6 +65,12 @@ static long madvise_behavior(struct vm_area_struct * vma, | |||
65 | } | 65 | } |
66 | new_flags &= ~VM_DONTCOPY; | 66 | new_flags &= ~VM_DONTCOPY; |
67 | break; | 67 | break; |
68 | case MADV_DONTDUMP: | ||
69 | new_flags |= VM_NODUMP; | ||
70 | break; | ||
71 | case MADV_DODUMP: | ||
72 | new_flags &= ~VM_NODUMP; | ||
73 | break; | ||
68 | case MADV_MERGEABLE: | 74 | case MADV_MERGEABLE: |
69 | case MADV_UNMERGEABLE: | 75 | case MADV_UNMERGEABLE: |
70 | error = ksm_madvise(vma, start, end, behavior, &new_flags); | 76 | error = ksm_madvise(vma, start, end, behavior, &new_flags); |
@@ -251,7 +257,7 @@ static int madvise_hwpoison(int bhv, unsigned long start, unsigned long end) | |||
251 | printk(KERN_INFO "Injecting memory failure for page %lx at %lx\n", | 257 | printk(KERN_INFO "Injecting memory failure for page %lx at %lx\n", |
252 | page_to_pfn(p), start); | 258 | page_to_pfn(p), start); |
253 | /* Ignore return value for now */ | 259 | /* Ignore return value for now */ |
254 | __memory_failure(page_to_pfn(p), 0, MF_COUNT_INCREASED); | 260 | memory_failure(page_to_pfn(p), 0, MF_COUNT_INCREASED); |
255 | } | 261 | } |
256 | return ret; | 262 | return ret; |
257 | } | 263 | } |
@@ -293,6 +299,8 @@ madvise_behavior_valid(int behavior) | |||
293 | case MADV_HUGEPAGE: | 299 | case MADV_HUGEPAGE: |
294 | case MADV_NOHUGEPAGE: | 300 | case MADV_NOHUGEPAGE: |
295 | #endif | 301 | #endif |
302 | case MADV_DONTDUMP: | ||
303 | case MADV_DODUMP: | ||
296 | return 1; | 304 | return 1; |
297 | 305 | ||
298 | default: | 306 | default: |
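
The madvise hunks above wire up MADV_DONTDUMP and MADV_DODUMP, which set and clear VM_NODUMP on the covered VMAs so the range is skipped when a core dump is written. Below is a hedged userspace sketch of how the new advice values might be used; the numeric fallback values are assumptions matching the uapi addition, not something taken from this diff.

	/* Hedged sketch: exclude a buffer of sensitive data from core dumps,
	 * then make it dumpable again before releasing it. */
	#define _GNU_SOURCE
	#include <sys/mman.h>
	#include <stddef.h>
	#include <stdio.h>

	#ifndef MADV_DONTDUMP
	#define MADV_DONTDUMP	16	/* assumed values from the uapi change */
	#define MADV_DODUMP	17
	#endif

	int main(void)
	{
		size_t len = 1 << 20;
		void *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
				 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

		if (buf == MAP_FAILED)
			return 1;
		if (madvise(buf, len, MADV_DONTDUMP))	/* drop from core dumps */
			perror("madvise(MADV_DONTDUMP)");
		/* ... use buf for data that must not land in a core file ... */
		if (madvise(buf, len, MADV_DODUMP))	/* make it dumpable again */
			perror("madvise(MADV_DODUMP)");
		munmap(buf, len);
		return 0;
	}
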
diff --git a/mm/memblock.c b/mm/memblock.c index 77b5f227e1d..99f28559950 100644 --- a/mm/memblock.c +++ b/mm/memblock.c | |||
@@ -99,9 +99,6 @@ phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t start, | |||
99 | phys_addr_t this_start, this_end, cand; | 99 | phys_addr_t this_start, this_end, cand; |
100 | u64 i; | 100 | u64 i; |
101 | 101 | ||
102 | /* align @size to avoid excessive fragmentation on reserved array */ | ||
103 | size = round_up(size, align); | ||
104 | |||
105 | /* pump up @end */ | 102 | /* pump up @end */ |
106 | if (end == MEMBLOCK_ALLOC_ACCESSIBLE) | 103 | if (end == MEMBLOCK_ALLOC_ACCESSIBLE) |
107 | end = memblock.current_limit; | 104 | end = memblock.current_limit; |
@@ -731,6 +728,9 @@ static phys_addr_t __init memblock_alloc_base_nid(phys_addr_t size, | |||
731 | { | 728 | { |
732 | phys_addr_t found; | 729 | phys_addr_t found; |
733 | 730 | ||
731 | /* align @size to avoid excessive fragmentation on reserved array */ | ||
732 | size = round_up(size, align); | ||
733 | |||
734 | found = memblock_find_in_range_node(0, max_addr, size, align, nid); | 734 | found = memblock_find_in_range_node(0, max_addr, size, align, nid); |
735 | if (found && !memblock_reserve(found, size)) | 735 | if (found && !memblock_reserve(found, size)) |
736 | return found; | 736 | return found; |
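
The memblock hunks above move the size round-up out of memblock_find_in_range_node() and into memblock_alloc_base_nid(), so only real allocations pad the requested size to the alignment, while plain range searches see the caller's exact size. For reference, a generic illustration of what that rounding does; the kernel's round_up() macro additionally assumes a power-of-two alignment and is implemented with a bit mask.

	/* Hedged illustration only: generic size round-up, standalone C. */
	#include <stdio.h>

	#define demo_round_up(x, y)	((((x) + (y) - 1) / (y)) * (y))

	int main(void)
	{
		unsigned long size = 5, align = 8;

		/* the allocation path still reserves the padded size */
		printf("requested %lu, reserved %lu\n",
		       size, demo_round_up(size, align));
		return 0;
	}
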
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 556859fec4e..b2ee6df0e9b 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c | |||
@@ -89,7 +89,6 @@ enum mem_cgroup_stat_index { | |||
89 | MEM_CGROUP_STAT_FILE_MAPPED, /* # of pages charged as file rss */ | 89 | MEM_CGROUP_STAT_FILE_MAPPED, /* # of pages charged as file rss */ |
90 | MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */ | 90 | MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */ |
91 | MEM_CGROUP_STAT_DATA, /* end of data requires synchronization */ | 91 | MEM_CGROUP_STAT_DATA, /* end of data requires synchronization */ |
92 | MEM_CGROUP_ON_MOVE, /* someone is moving account between groups */ | ||
93 | MEM_CGROUP_STAT_NSTATS, | 92 | MEM_CGROUP_STAT_NSTATS, |
94 | }; | 93 | }; |
95 | 94 | ||
@@ -135,7 +134,7 @@ struct mem_cgroup_reclaim_iter { | |||
135 | */ | 134 | */ |
136 | struct mem_cgroup_per_zone { | 135 | struct mem_cgroup_per_zone { |
137 | struct lruvec lruvec; | 136 | struct lruvec lruvec; |
138 | unsigned long count[NR_LRU_LISTS]; | 137 | unsigned long lru_size[NR_LRU_LISTS]; |
139 | 138 | ||
140 | struct mem_cgroup_reclaim_iter reclaim_iter[DEF_PRIORITY + 1]; | 139 | struct mem_cgroup_reclaim_iter reclaim_iter[DEF_PRIORITY + 1]; |
141 | 140 | ||
@@ -144,11 +143,9 @@ struct mem_cgroup_per_zone { | |||
144 | unsigned long long usage_in_excess;/* Set to the value by which */ | 143 | unsigned long long usage_in_excess;/* Set to the value by which */ |
145 | /* the soft limit is exceeded*/ | 144 | /* the soft limit is exceeded*/ |
146 | bool on_tree; | 145 | bool on_tree; |
147 | struct mem_cgroup *mem; /* Back pointer, we cannot */ | 146 | struct mem_cgroup *memcg; /* Back pointer, we cannot */ |
148 | /* use container_of */ | 147 | /* use container_of */ |
149 | }; | 148 | }; |
150 | /* Macro for accessing counter */ | ||
151 | #define MEM_CGROUP_ZSTAT(mz, idx) ((mz)->count[(idx)]) | ||
152 | 149 | ||
153 | struct mem_cgroup_per_node { | 150 | struct mem_cgroup_per_node { |
154 | struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES]; | 151 | struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES]; |
@@ -230,10 +227,30 @@ struct mem_cgroup { | |||
230 | * the counter to account for memory usage | 227 | * the counter to account for memory usage |
231 | */ | 228 | */ |
232 | struct res_counter res; | 229 | struct res_counter res; |
233 | /* | 230 | |
234 | * the counter to account for mem+swap usage. | 231 | union { |
235 | */ | 232 | /* |
236 | struct res_counter memsw; | 233 | * the counter to account for mem+swap usage. |
234 | */ | ||
235 | struct res_counter memsw; | ||
236 | |||
237 | /* | ||
238 | * rcu_freeing is used only when freeing struct mem_cgroup, | ||
239 | * so put it into a union to avoid wasting more memory. | ||
240 | * It must be disjoint from the css field. It could be | ||
241 | * in a union with the res field, but res plays a much | ||
242 | * larger part in mem_cgroup life than memsw, and might | ||
243 | * be of interest, even at time of free, when debugging. | ||
244 | * So share rcu_head with the less interesting memsw. | ||
245 | */ | ||
246 | struct rcu_head rcu_freeing; | ||
247 | /* | ||
248 | * But when using vfree(), that cannot be done at | ||
249 | * interrupt time, so we must then queue the work. | ||
250 | */ | ||
251 | struct work_struct work_freeing; | ||
252 | }; | ||
253 | |||
237 | /* | 254 | /* |
238 | * Per cgroup active and inactive list, similar to the | 255 | * Per cgroup active and inactive list, similar to the |
239 | * per zone LRU lists. | 256 | * per zone LRU lists. |
@@ -280,6 +297,12 @@ struct mem_cgroup { | |||
280 | */ | 297 | */ |
281 | unsigned long move_charge_at_immigrate; | 298 | unsigned long move_charge_at_immigrate; |
282 | /* | 299 | /* |
300 | * set > 0 if pages under this cgroup are moving to other cgroup. | ||
301 | */ | ||
302 | atomic_t moving_account; | ||
303 | /* taken only while moving_account > 0 */ | ||
304 | spinlock_t move_lock; | ||
305 | /* | ||
283 | * percpu counter. | 306 | * percpu counter. |
284 | */ | 307 | */ |
285 | struct mem_cgroup_stat_cpu *stat; | 308 | struct mem_cgroup_stat_cpu *stat; |
@@ -592,9 +615,9 @@ retry: | |||
592 | * we will add it back at the end of reclaim to its correct | 615 | * we will add it back at the end of reclaim to its correct |
593 | * position in the tree. | 616 | * position in the tree. |
594 | */ | 617 | */ |
595 | __mem_cgroup_remove_exceeded(mz->mem, mz, mctz); | 618 | __mem_cgroup_remove_exceeded(mz->memcg, mz, mctz); |
596 | if (!res_counter_soft_limit_excess(&mz->mem->res) || | 619 | if (!res_counter_soft_limit_excess(&mz->memcg->res) || |
597 | !css_tryget(&mz->mem->css)) | 620 | !css_tryget(&mz->memcg->css)) |
598 | goto retry; | 621 | goto retry; |
599 | done: | 622 | done: |
600 | return mz; | 623 | return mz; |
@@ -672,15 +695,19 @@ static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg, | |||
672 | } | 695 | } |
673 | 696 | ||
674 | static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg, | 697 | static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg, |
675 | bool file, int nr_pages) | 698 | bool anon, int nr_pages) |
676 | { | 699 | { |
677 | preempt_disable(); | 700 | preempt_disable(); |
678 | 701 | ||
679 | if (file) | 702 | /* |
680 | __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_CACHE], | 703 | * Here, RSS means 'mapped anon' and anon's SwapCache. Shmem/tmpfs is |
704 | * counted as CACHE even if it's on ANON LRU. | ||
705 | */ | ||
706 | if (anon) | ||
707 | __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS], | ||
681 | nr_pages); | 708 | nr_pages); |
682 | else | 709 | else |
683 | __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS], | 710 | __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_CACHE], |
684 | nr_pages); | 711 | nr_pages); |
685 | 712 | ||
686 | /* pagein of a big page is an event. So, ignore page size */ | 713 | /* pagein of a big page is an event. So, ignore page size */ |
@@ -701,14 +728,14 @@ mem_cgroup_zone_nr_lru_pages(struct mem_cgroup *memcg, int nid, int zid, | |||
701 | unsigned int lru_mask) | 728 | unsigned int lru_mask) |
702 | { | 729 | { |
703 | struct mem_cgroup_per_zone *mz; | 730 | struct mem_cgroup_per_zone *mz; |
704 | enum lru_list l; | 731 | enum lru_list lru; |
705 | unsigned long ret = 0; | 732 | unsigned long ret = 0; |
706 | 733 | ||
707 | mz = mem_cgroup_zoneinfo(memcg, nid, zid); | 734 | mz = mem_cgroup_zoneinfo(memcg, nid, zid); |
708 | 735 | ||
709 | for_each_lru(l) { | 736 | for_each_lru(lru) { |
710 | if (BIT(l) & lru_mask) | 737 | if (BIT(lru) & lru_mask) |
711 | ret += MEM_CGROUP_ZSTAT(mz, l); | 738 | ret += mz->lru_size[lru]; |
712 | } | 739 | } |
713 | return ret; | 740 | return ret; |
714 | } | 741 | } |
@@ -776,7 +803,8 @@ static void memcg_check_events(struct mem_cgroup *memcg, struct page *page) | |||
776 | /* threshold event is triggered in finer grain than soft limit */ | 803 | /* threshold event is triggered in finer grain than soft limit */ |
777 | if (unlikely(mem_cgroup_event_ratelimit(memcg, | 804 | if (unlikely(mem_cgroup_event_ratelimit(memcg, |
778 | MEM_CGROUP_TARGET_THRESH))) { | 805 | MEM_CGROUP_TARGET_THRESH))) { |
779 | bool do_softlimit, do_numainfo; | 806 | bool do_softlimit; |
807 | bool do_numainfo __maybe_unused; | ||
780 | 808 | ||
781 | do_softlimit = mem_cgroup_event_ratelimit(memcg, | 809 | do_softlimit = mem_cgroup_event_ratelimit(memcg, |
782 | MEM_CGROUP_TARGET_SOFTLIMIT); | 810 | MEM_CGROUP_TARGET_SOFTLIMIT); |
@@ -1041,9 +1069,22 @@ struct lruvec *mem_cgroup_lru_add_list(struct zone *zone, struct page *page, | |||
1041 | 1069 | ||
1042 | pc = lookup_page_cgroup(page); | 1070 | pc = lookup_page_cgroup(page); |
1043 | memcg = pc->mem_cgroup; | 1071 | memcg = pc->mem_cgroup; |
1072 | |||
1073 | /* | ||
1074 | * Surreptitiously switch any uncharged page to root: | ||
1075 | * an uncharged page off lru does nothing to secure | ||
1076 | * its former mem_cgroup from sudden removal. | ||
1077 | * | ||
1078 | * Our caller holds lru_lock, and PageCgroupUsed is updated | ||
1079 | * under page_cgroup lock: between them, they make all uses | ||
1080 | * of pc->mem_cgroup safe. | ||
1081 | */ | ||
1082 | if (!PageCgroupUsed(pc) && memcg != root_mem_cgroup) | ||
1083 | pc->mem_cgroup = memcg = root_mem_cgroup; | ||
1084 | |||
1044 | mz = page_cgroup_zoneinfo(memcg, page); | 1085 | mz = page_cgroup_zoneinfo(memcg, page); |
1045 | /* compound_order() is stabilized through lru_lock */ | 1086 | /* compound_order() is stabilized through lru_lock */ |
1046 | MEM_CGROUP_ZSTAT(mz, lru) += 1 << compound_order(page); | 1087 | mz->lru_size[lru] += 1 << compound_order(page); |
1047 | return &mz->lruvec; | 1088 | return &mz->lruvec; |
1048 | } | 1089 | } |
1049 | 1090 | ||
@@ -1071,8 +1112,8 @@ void mem_cgroup_lru_del_list(struct page *page, enum lru_list lru) | |||
1071 | VM_BUG_ON(!memcg); | 1112 | VM_BUG_ON(!memcg); |
1072 | mz = page_cgroup_zoneinfo(memcg, page); | 1113 | mz = page_cgroup_zoneinfo(memcg, page); |
1073 | /* huge page split is done under lru_lock. so, we have no races. */ | 1114 | /* huge page split is done under lru_lock. so, we have no races. */ |
1074 | VM_BUG_ON(MEM_CGROUP_ZSTAT(mz, lru) < (1 << compound_order(page))); | 1115 | VM_BUG_ON(mz->lru_size[lru] < (1 << compound_order(page))); |
1075 | MEM_CGROUP_ZSTAT(mz, lru) -= 1 << compound_order(page); | 1116 | mz->lru_size[lru] -= 1 << compound_order(page); |
1076 | } | 1117 | } |
1077 | 1118 | ||
1078 | void mem_cgroup_lru_del(struct page *page) | 1119 | void mem_cgroup_lru_del(struct page *page) |
@@ -1251,40 +1292,48 @@ int mem_cgroup_swappiness(struct mem_cgroup *memcg) | |||
1251 | return memcg->swappiness; | 1292 | return memcg->swappiness; |
1252 | } | 1293 | } |
1253 | 1294 | ||
1254 | static void mem_cgroup_start_move(struct mem_cgroup *memcg) | 1295 | /* |
1255 | { | 1296 | * memcg->moving_account is used for checking possibility that some thread is |
1256 | int cpu; | 1297 | * calling move_account(). When a thread on CPU-A starts moving pages under |
1298 | * a memcg, other threads should check memcg->moving_account under | ||
1299 | * rcu_read_lock(), like this: | ||
1300 | * | ||
1301 | * CPU-A CPU-B | ||
1302 | * rcu_read_lock() | ||
1303 | * memcg->moving_account+1 if (memcg->moving_account) | ||
1304 | * take heavy locks. | ||
1305 | * synchronize_rcu() update something. | ||
1306 | * rcu_read_unlock() | ||
1307 | * start move here. | ||
1308 | */ | ||
1257 | 1309 | ||
1258 | get_online_cpus(); | 1310 | /* for quick checking without looking up memcg */ |
1259 | spin_lock(&memcg->pcp_counter_lock); | 1311 | atomic_t memcg_moving __read_mostly; |
1260 | for_each_online_cpu(cpu) | ||
1261 | per_cpu(memcg->stat->count[MEM_CGROUP_ON_MOVE], cpu) += 1; | ||
1262 | memcg->nocpu_base.count[MEM_CGROUP_ON_MOVE] += 1; | ||
1263 | spin_unlock(&memcg->pcp_counter_lock); | ||
1264 | put_online_cpus(); | ||
1265 | 1312 | ||
1313 | static void mem_cgroup_start_move(struct mem_cgroup *memcg) | ||
1314 | { | ||
1315 | atomic_inc(&memcg_moving); | ||
1316 | atomic_inc(&memcg->moving_account); | ||
1266 | synchronize_rcu(); | 1317 | synchronize_rcu(); |
1267 | } | 1318 | } |
1268 | 1319 | ||
1269 | static void mem_cgroup_end_move(struct mem_cgroup *memcg) | 1320 | static void mem_cgroup_end_move(struct mem_cgroup *memcg) |
1270 | { | 1321 | { |
1271 | int cpu; | 1322 | /* |
1272 | 1323 | * Now, mem_cgroup_clear_mc() may call this function with NULL. | |
1273 | if (!memcg) | 1324 | * We check NULL in callee rather than caller. |
1274 | return; | 1325 | */ |
1275 | get_online_cpus(); | 1326 | if (memcg) { |
1276 | spin_lock(&memcg->pcp_counter_lock); | 1327 | atomic_dec(&memcg_moving); |
1277 | for_each_online_cpu(cpu) | 1328 | atomic_dec(&memcg->moving_account); |
1278 | per_cpu(memcg->stat->count[MEM_CGROUP_ON_MOVE], cpu) -= 1; | 1329 | } |
1279 | memcg->nocpu_base.count[MEM_CGROUP_ON_MOVE] -= 1; | ||
1280 | spin_unlock(&memcg->pcp_counter_lock); | ||
1281 | put_online_cpus(); | ||
1282 | } | 1330 | } |
1331 | |||
1283 | /* | 1332 | /* |
1284 | * 2 routines for checking "mem" is under move_account() or not. | 1333 | * 2 routines for checking "mem" is under move_account() or not. |
1285 | * | 1334 | * |
1286 | * mem_cgroup_stealed() - checking a cgroup is mc.from or not. This is used | 1335 | * mem_cgroup_stolen() - checking whether a cgroup is mc.from or not. This |
1287 | * for avoiding race in accounting. If true, | 1336 | * is used for avoiding races in accounting. If true, |
1288 | * pc->mem_cgroup may be overwritten. | 1337 | * pc->mem_cgroup may be overwritten. |
1289 | * | 1338 | * |
1290 | * mem_cgroup_under_move() - checking a cgroup is mc.from or mc.to or | 1339 | * mem_cgroup_under_move() - checking a cgroup is mc.from or mc.to or |
@@ -1292,10 +1341,10 @@ static void mem_cgroup_end_move(struct mem_cgroup *memcg) | |||
1292 | * waiting at high memory pressure caused by "move". | 1341 | * waiting at high memory pressure caused by "move". |
1293 | */ | 1342 | */ |
1294 | 1343 | ||
1295 | static bool mem_cgroup_stealed(struct mem_cgroup *memcg) | 1344 | static bool mem_cgroup_stolen(struct mem_cgroup *memcg) |
1296 | { | 1345 | { |
1297 | VM_BUG_ON(!rcu_read_lock_held()); | 1346 | VM_BUG_ON(!rcu_read_lock_held()); |
1298 | return this_cpu_read(memcg->stat->count[MEM_CGROUP_ON_MOVE]) > 0; | 1347 | return atomic_read(&memcg->moving_account) > 0; |
1299 | } | 1348 | } |
1300 | 1349 | ||
1301 | static bool mem_cgroup_under_move(struct mem_cgroup *memcg) | 1350 | static bool mem_cgroup_under_move(struct mem_cgroup *memcg) |
@@ -1336,6 +1385,24 @@ static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg) | |||
1336 | return false; | 1385 | return false; |
1337 | } | 1386 | } |
1338 | 1387 | ||
1388 | /* | ||
1389 | * Take this lock when | ||
1390 | * - a code tries to modify page's memcg while it's USED. | ||
1391 | * - a code tries to modify page state accounting in a memcg. | ||
1392 | * see mem_cgroup_stolen(), too. | ||
1393 | */ | ||
1394 | static void move_lock_mem_cgroup(struct mem_cgroup *memcg, | ||
1395 | unsigned long *flags) | ||
1396 | { | ||
1397 | spin_lock_irqsave(&memcg->move_lock, *flags); | ||
1398 | } | ||
1399 | |||
1400 | static void move_unlock_mem_cgroup(struct mem_cgroup *memcg, | ||
1401 | unsigned long *flags) | ||
1402 | { | ||
1403 | spin_unlock_irqrestore(&memcg->move_lock, *flags); | ||
1404 | } | ||
1405 | |||
1339 | /** | 1406 | /** |
1340 | * mem_cgroup_print_oom_info: Called from OOM with tasklist_lock held in read mode. | 1407 | * mem_cgroup_print_oom_info: Called from OOM with tasklist_lock held in read mode. |
1341 | * @memcg: The memory cgroup that went over limit | 1408 | * @memcg: The memory cgroup that went over limit |
@@ -1359,7 +1426,6 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) | |||
1359 | if (!memcg || !p) | 1426 | if (!memcg || !p) |
1360 | return; | 1427 | return; |
1361 | 1428 | ||
1362 | |||
1363 | rcu_read_lock(); | 1429 | rcu_read_lock(); |
1364 | 1430 | ||
1365 | mem_cgrp = memcg->css.cgroup; | 1431 | mem_cgrp = memcg->css.cgroup; |
@@ -1738,22 +1804,22 @@ static DEFINE_SPINLOCK(memcg_oom_lock); | |||
1738 | static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq); | 1804 | static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq); |
1739 | 1805 | ||
1740 | struct oom_wait_info { | 1806 | struct oom_wait_info { |
1741 | struct mem_cgroup *mem; | 1807 | struct mem_cgroup *memcg; |
1742 | wait_queue_t wait; | 1808 | wait_queue_t wait; |
1743 | }; | 1809 | }; |
1744 | 1810 | ||
1745 | static int memcg_oom_wake_function(wait_queue_t *wait, | 1811 | static int memcg_oom_wake_function(wait_queue_t *wait, |
1746 | unsigned mode, int sync, void *arg) | 1812 | unsigned mode, int sync, void *arg) |
1747 | { | 1813 | { |
1748 | struct mem_cgroup *wake_memcg = (struct mem_cgroup *)arg, | 1814 | struct mem_cgroup *wake_memcg = (struct mem_cgroup *)arg; |
1749 | *oom_wait_memcg; | 1815 | struct mem_cgroup *oom_wait_memcg; |
1750 | struct oom_wait_info *oom_wait_info; | 1816 | struct oom_wait_info *oom_wait_info; |
1751 | 1817 | ||
1752 | oom_wait_info = container_of(wait, struct oom_wait_info, wait); | 1818 | oom_wait_info = container_of(wait, struct oom_wait_info, wait); |
1753 | oom_wait_memcg = oom_wait_info->mem; | 1819 | oom_wait_memcg = oom_wait_info->memcg; |
1754 | 1820 | ||
1755 | /* | 1821 | /* |
1756 | * Both of oom_wait_info->mem and wake_mem are stable under us. | 1822 | * Both of oom_wait_info->memcg and wake_memcg are stable under us. |
1757 | * Then we can use css_is_ancestor without taking care of RCU. | 1823 | * Then we can use css_is_ancestor without taking care of RCU. |
1758 | */ | 1824 | */ |
1759 | if (!mem_cgroup_same_or_subtree(oom_wait_memcg, wake_memcg) | 1825 | if (!mem_cgroup_same_or_subtree(oom_wait_memcg, wake_memcg) |
@@ -1777,12 +1843,12 @@ static void memcg_oom_recover(struct mem_cgroup *memcg) | |||
1777 | /* | 1843 | /* |
1778 | * try to call OOM killer. returns false if we should exit memory-reclaim loop. | 1844 | * try to call OOM killer. returns false if we should exit memory-reclaim loop. |
1779 | */ | 1845 | */ |
1780 | bool mem_cgroup_handle_oom(struct mem_cgroup *memcg, gfp_t mask) | 1846 | bool mem_cgroup_handle_oom(struct mem_cgroup *memcg, gfp_t mask, int order) |
1781 | { | 1847 | { |
1782 | struct oom_wait_info owait; | 1848 | struct oom_wait_info owait; |
1783 | bool locked, need_to_kill; | 1849 | bool locked, need_to_kill; |
1784 | 1850 | ||
1785 | owait.mem = memcg; | 1851 | owait.memcg = memcg; |
1786 | owait.wait.flags = 0; | 1852 | owait.wait.flags = 0; |
1787 | owait.wait.func = memcg_oom_wake_function; | 1853 | owait.wait.func = memcg_oom_wake_function; |
1788 | owait.wait.private = current; | 1854 | owait.wait.private = current; |
@@ -1807,7 +1873,7 @@ bool mem_cgroup_handle_oom(struct mem_cgroup *memcg, gfp_t mask) | |||
1807 | 1873 | ||
1808 | if (need_to_kill) { | 1874 | if (need_to_kill) { |
1809 | finish_wait(&memcg_oom_waitq, &owait.wait); | 1875 | finish_wait(&memcg_oom_waitq, &owait.wait); |
1810 | mem_cgroup_out_of_memory(memcg, mask); | 1876 | mem_cgroup_out_of_memory(memcg, mask, order); |
1811 | } else { | 1877 | } else { |
1812 | schedule(); | 1878 | schedule(); |
1813 | finish_wait(&memcg_oom_waitq, &owait.wait); | 1879 | finish_wait(&memcg_oom_waitq, &owait.wait); |
@@ -1847,41 +1913,66 @@ bool mem_cgroup_handle_oom(struct mem_cgroup *memcg, gfp_t mask) | |||
1847 | * by flags. | 1913 | * by flags. |
1848 | * | 1914 | * |
1849 | * Considering "move", this is an only case we see a race. To make the race | 1915 | * Considering "move", this is an only case we see a race. To make the race |
1850 | * small, we check MEM_CGROUP_ON_MOVE percpu value and detect there are | 1916 | * small, we check memcg->moving_account and detect the possibility of a race.
1851 | * possibility of race condition. If there is, we take a lock. | 1917 | * If there is, we take a lock. |
1852 | */ | 1918 | */ |
1853 | 1919 | ||
1920 | void __mem_cgroup_begin_update_page_stat(struct page *page, | ||
1921 | bool *locked, unsigned long *flags) | ||
1922 | { | ||
1923 | struct mem_cgroup *memcg; | ||
1924 | struct page_cgroup *pc; | ||
1925 | |||
1926 | pc = lookup_page_cgroup(page); | ||
1927 | again: | ||
1928 | memcg = pc->mem_cgroup; | ||
1929 | if (unlikely(!memcg || !PageCgroupUsed(pc))) | ||
1930 | return; | ||
1931 | /* | ||
1932 | * If this memory cgroup is not under account moving, we don't | ||
1934 | * need to take move_lock_mem_cgroup(). Because we already hold | ||
1934 | * rcu_read_lock(), any calls to move_account will be delayed until | ||
1935 | * rcu_read_unlock() if mem_cgroup_stolen() == true. | ||
1936 | */ | ||
1937 | if (!mem_cgroup_stolen(memcg)) | ||
1938 | return; | ||
1939 | |||
1940 | move_lock_mem_cgroup(memcg, flags); | ||
1941 | if (memcg != pc->mem_cgroup || !PageCgroupUsed(pc)) { | ||
1942 | move_unlock_mem_cgroup(memcg, flags); | ||
1943 | goto again; | ||
1944 | } | ||
1945 | *locked = true; | ||
1946 | } | ||
1947 | |||
1948 | void __mem_cgroup_end_update_page_stat(struct page *page, unsigned long *flags) | ||
1949 | { | ||
1950 | struct page_cgroup *pc = lookup_page_cgroup(page); | ||
1951 | |||
1952 | /* | ||
1953 | * It's guaranteed that pc->mem_cgroup never changes while | ||
1954 | * lock is held because a routine that modifies pc->mem_cgroup | ||
1955 | * should take move_lock_mem_cgroup(). | ||
1956 | */ | ||
1957 | move_unlock_mem_cgroup(pc->mem_cgroup, flags); | ||
1958 | } | ||
1959 | |||
1854 | void mem_cgroup_update_page_stat(struct page *page, | 1960 | void mem_cgroup_update_page_stat(struct page *page, |
1855 | enum mem_cgroup_page_stat_item idx, int val) | 1961 | enum mem_cgroup_page_stat_item idx, int val) |
1856 | { | 1962 | { |
1857 | struct mem_cgroup *memcg; | 1963 | struct mem_cgroup *memcg; |
1858 | struct page_cgroup *pc = lookup_page_cgroup(page); | 1964 | struct page_cgroup *pc = lookup_page_cgroup(page); |
1859 | bool need_unlock = false; | ||
1860 | unsigned long uninitialized_var(flags); | 1965 | unsigned long uninitialized_var(flags); |
1861 | 1966 | ||
1862 | if (mem_cgroup_disabled()) | 1967 | if (mem_cgroup_disabled()) |
1863 | return; | 1968 | return; |
1864 | 1969 | ||
1865 | rcu_read_lock(); | ||
1866 | memcg = pc->mem_cgroup; | 1970 | memcg = pc->mem_cgroup; |
1867 | if (unlikely(!memcg || !PageCgroupUsed(pc))) | 1971 | if (unlikely(!memcg || !PageCgroupUsed(pc))) |
1868 | goto out; | 1972 | return; |
1869 | /* pc->mem_cgroup is unstable ? */ | ||
1870 | if (unlikely(mem_cgroup_stealed(memcg)) || PageTransHuge(page)) { | ||
1871 | /* take a lock against to access pc->mem_cgroup */ | ||
1872 | move_lock_page_cgroup(pc, &flags); | ||
1873 | need_unlock = true; | ||
1874 | memcg = pc->mem_cgroup; | ||
1875 | if (!memcg || !PageCgroupUsed(pc)) | ||
1876 | goto out; | ||
1877 | } | ||
1878 | 1973 | ||
1879 | switch (idx) { | 1974 | switch (idx) { |
1880 | case MEMCG_NR_FILE_MAPPED: | 1975 | case MEMCG_NR_FILE_MAPPED: |
1881 | if (val > 0) | ||
1882 | SetPageCgroupFileMapped(pc); | ||
1883 | else if (!page_mapped(page)) | ||
1884 | ClearPageCgroupFileMapped(pc); | ||
1885 | idx = MEM_CGROUP_STAT_FILE_MAPPED; | 1976 | idx = MEM_CGROUP_STAT_FILE_MAPPED; |
1886 | break; | 1977 | break; |
1887 | default: | 1978 | default: |
@@ -1889,14 +1980,7 @@ void mem_cgroup_update_page_stat(struct page *page, | |||
1889 | } | 1980 | } |
1890 | 1981 | ||
1891 | this_cpu_add(memcg->stat->count[idx], val); | 1982 | this_cpu_add(memcg->stat->count[idx], val); |
1892 | |||
1893 | out: | ||
1894 | if (unlikely(need_unlock)) | ||
1895 | move_unlock_page_cgroup(pc, &flags); | ||
1896 | rcu_read_unlock(); | ||
1897 | return; | ||
1898 | } | 1983 | } |
1899 | EXPORT_SYMBOL(mem_cgroup_update_page_stat); | ||
1900 | 1984 | ||
1901 | /* | 1985 | /* |
1902 | * size of first charge trial. "32" comes from vmscan.c's magic value. | 1986 | * size of first charge trial. "32" comes from vmscan.c's magic value. |
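
With the per-memcg move_lock in place, mem_cgroup_update_page_stat() above no longer takes rcu_read_lock() or any page_cgroup lock itself; callers are expected to bracket the update with the __mem_cgroup_begin_update_page_stat()/__mem_cgroup_end_update_page_stat() helpers, which only take the spinlock while an account move might be racing. A hedged sketch of the intended caller pattern follows; the non-underscore wrapper names are assumed to exist in the header, they are not shown in this diff.

	/* Hedged sketch: bracket a FILE_MAPPED update with the new helpers.
	 * mem_cgroup_begin/end_update_page_stat() are assumed wrappers around
	 * rcu_read_lock()/unlock() plus the __mem_cgroup_* calls above. */
	#include <linux/memcontrol.h>
	#include <linux/mm.h>

	static void demo_account_file_mapped(struct page *page)
	{
		bool locked;
		unsigned long flags;

		mem_cgroup_begin_update_page_stat(page, &locked, &flags);
		/* e.g. from page_add_file_rmap(): one more file-mapped page */
		mem_cgroup_update_page_stat(page, MEMCG_NR_FILE_MAPPED, 1);
		mem_cgroup_end_update_page_stat(page, &locked, &flags);
	}
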
@@ -2067,17 +2151,6 @@ static void mem_cgroup_drain_pcp_counter(struct mem_cgroup *memcg, int cpu) | |||
2067 | per_cpu(memcg->stat->events[i], cpu) = 0; | 2151 | per_cpu(memcg->stat->events[i], cpu) = 0; |
2068 | memcg->nocpu_base.events[i] += x; | 2152 | memcg->nocpu_base.events[i] += x; |
2069 | } | 2153 | } |
2070 | /* need to clear ON_MOVE value, works as a kind of lock. */ | ||
2071 | per_cpu(memcg->stat->count[MEM_CGROUP_ON_MOVE], cpu) = 0; | ||
2072 | spin_unlock(&memcg->pcp_counter_lock); | ||
2073 | } | ||
2074 | |||
2075 | static void synchronize_mem_cgroup_on_move(struct mem_cgroup *memcg, int cpu) | ||
2076 | { | ||
2077 | int idx = MEM_CGROUP_ON_MOVE; | ||
2078 | |||
2079 | spin_lock(&memcg->pcp_counter_lock); | ||
2080 | per_cpu(memcg->stat->count[idx], cpu) = memcg->nocpu_base.count[idx]; | ||
2081 | spin_unlock(&memcg->pcp_counter_lock); | 2154 | spin_unlock(&memcg->pcp_counter_lock); |
2082 | } | 2155 | } |
2083 | 2156 | ||
@@ -2089,11 +2162,8 @@ static int __cpuinit memcg_cpu_hotplug_callback(struct notifier_block *nb, | |||
2089 | struct memcg_stock_pcp *stock; | 2162 | struct memcg_stock_pcp *stock; |
2090 | struct mem_cgroup *iter; | 2163 | struct mem_cgroup *iter; |
2091 | 2164 | ||
2092 | if ((action == CPU_ONLINE)) { | 2165 | if (action == CPU_ONLINE) |
2093 | for_each_mem_cgroup(iter) | ||
2094 | synchronize_mem_cgroup_on_move(iter, cpu); | ||
2095 | return NOTIFY_OK; | 2166 | return NOTIFY_OK; |
2096 | } | ||
2097 | 2167 | ||
2098 | if ((action != CPU_DEAD) || action != CPU_DEAD_FROZEN) | 2168 | if ((action != CPU_DEAD) || action != CPU_DEAD_FROZEN) |
2099 | return NOTIFY_OK; | 2169 | return NOTIFY_OK; |
@@ -2178,7 +2248,7 @@ static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, | |||
2178 | if (!oom_check) | 2248 | if (!oom_check) |
2179 | return CHARGE_NOMEM; | 2249 | return CHARGE_NOMEM; |
2180 | /* check OOM */ | 2250 | /* check OOM */ |
2181 | if (!mem_cgroup_handle_oom(mem_over_limit, gfp_mask)) | 2251 | if (!mem_cgroup_handle_oom(mem_over_limit, gfp_mask, get_order(csize))) |
2182 | return CHARGE_OOM_DIE; | 2252 | return CHARGE_OOM_DIE; |
2183 | 2253 | ||
2184 | return CHARGE_RETRY; | 2254 | return CHARGE_RETRY; |
@@ -2407,8 +2477,13 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg, | |||
2407 | struct page *page, | 2477 | struct page *page, |
2408 | unsigned int nr_pages, | 2478 | unsigned int nr_pages, |
2409 | struct page_cgroup *pc, | 2479 | struct page_cgroup *pc, |
2410 | enum charge_type ctype) | 2480 | enum charge_type ctype, |
2481 | bool lrucare) | ||
2411 | { | 2482 | { |
2483 | struct zone *uninitialized_var(zone); | ||
2484 | bool was_on_lru = false; | ||
2485 | bool anon; | ||
2486 | |||
2412 | lock_page_cgroup(pc); | 2487 | lock_page_cgroup(pc); |
2413 | if (unlikely(PageCgroupUsed(pc))) { | 2488 | if (unlikely(PageCgroupUsed(pc))) { |
2414 | unlock_page_cgroup(pc); | 2489 | unlock_page_cgroup(pc); |
@@ -2419,6 +2494,21 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg, | |||
2419 | * we don't need page_cgroup_lock about tail pages, because they are not | 2494 | * we don't need page_cgroup_lock about tail pages, because they are not |
2420 | * accessed by any other context at this point. | 2495 | * accessed by any other context at this point. |
2421 | */ | 2496 | */ |
2497 | |||
2498 | /* | ||
2499 | * In some cases, SwapCache and FUSE(splice_buf->radixtree), the page | ||
2500 | * may already be on some other mem_cgroup's LRU. Take care of it. | ||
2501 | */ | ||
2502 | if (lrucare) { | ||
2503 | zone = page_zone(page); | ||
2504 | spin_lock_irq(&zone->lru_lock); | ||
2505 | if (PageLRU(page)) { | ||
2506 | ClearPageLRU(page); | ||
2507 | del_page_from_lru_list(zone, page, page_lru(page)); | ||
2508 | was_on_lru = true; | ||
2509 | } | ||
2510 | } | ||
2511 | |||
2422 | pc->mem_cgroup = memcg; | 2512 | pc->mem_cgroup = memcg; |
2423 | /* | 2513 | /* |
2424 | * We access a page_cgroup asynchronously without lock_page_cgroup(). | 2514 | * We access a page_cgroup asynchronously without lock_page_cgroup(). |
@@ -2428,23 +2518,25 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg, | |||
2428 | * See mem_cgroup_add_lru_list(), etc. | 2518 | * See mem_cgroup_add_lru_list(), etc. |
2429 | */ | 2519 | */ |
2430 | smp_wmb(); | 2520 | smp_wmb(); |
2431 | switch (ctype) { | 2521 | SetPageCgroupUsed(pc); |
2432 | case MEM_CGROUP_CHARGE_TYPE_CACHE: | 2522 | |
2433 | case MEM_CGROUP_CHARGE_TYPE_SHMEM: | 2523 | if (lrucare) { |
2434 | SetPageCgroupCache(pc); | 2524 | if (was_on_lru) { |
2435 | SetPageCgroupUsed(pc); | 2525 | VM_BUG_ON(PageLRU(page)); |
2436 | break; | 2526 | SetPageLRU(page); |
2437 | case MEM_CGROUP_CHARGE_TYPE_MAPPED: | 2527 | add_page_to_lru_list(zone, page, page_lru(page)); |
2438 | ClearPageCgroupCache(pc); | 2528 | } |
2439 | SetPageCgroupUsed(pc); | 2529 | spin_unlock_irq(&zone->lru_lock); |
2440 | break; | ||
2441 | default: | ||
2442 | break; | ||
2443 | } | 2530 | } |
2444 | 2531 | ||
2445 | mem_cgroup_charge_statistics(memcg, PageCgroupCache(pc), nr_pages); | 2532 | if (ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED) |
2533 | anon = true; | ||
2534 | else | ||
2535 | anon = false; | ||
2536 | |||
2537 | mem_cgroup_charge_statistics(memcg, anon, nr_pages); | ||
2446 | unlock_page_cgroup(pc); | 2538 | unlock_page_cgroup(pc); |
2447 | WARN_ON_ONCE(PageLRU(page)); | 2539 | |
2448 | /* | 2540 | /* |
2449 | * "charge_statistics" updated event counter. Then, check it. | 2541 | * "charge_statistics" updated event counter. Then, check it. |
2450 | * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree. | 2542 | * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree. |
@@ -2455,8 +2547,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg, | |||
2455 | 2547 | ||
2456 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | 2548 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE |
2457 | 2549 | ||
2458 | #define PCGF_NOCOPY_AT_SPLIT ((1 << PCG_LOCK) | (1 << PCG_MOVE_LOCK) |\ | 2550 | #define PCGF_NOCOPY_AT_SPLIT ((1 << PCG_LOCK) | (1 << PCG_MIGRATION)) |
2459 | (1 << PCG_MIGRATION)) | ||
2460 | /* | 2551 | /* |
2461 | * Because tail pages are not marked as "used", set it. We're under | 2552 | * Because tail pages are not marked as "used", set it. We're under |
2462 | * zone->lru_lock, 'splitting on pmd' and compound_lock. | 2553 | * zone->lru_lock, 'splitting on pmd' and compound_lock. |
@@ -2507,6 +2598,7 @@ static int mem_cgroup_move_account(struct page *page, | |||
2507 | { | 2598 | { |
2508 | unsigned long flags; | 2599 | unsigned long flags; |
2509 | int ret; | 2600 | int ret; |
2601 | bool anon = PageAnon(page); | ||
2510 | 2602 | ||
2511 | VM_BUG_ON(from == to); | 2603 | VM_BUG_ON(from == to); |
2512 | VM_BUG_ON(PageLRU(page)); | 2604 | VM_BUG_ON(PageLRU(page)); |
@@ -2526,23 +2618,23 @@ static int mem_cgroup_move_account(struct page *page, | |||
2526 | if (!PageCgroupUsed(pc) || pc->mem_cgroup != from) | 2618 | if (!PageCgroupUsed(pc) || pc->mem_cgroup != from) |
2527 | goto unlock; | 2619 | goto unlock; |
2528 | 2620 | ||
2529 | move_lock_page_cgroup(pc, &flags); | 2621 | move_lock_mem_cgroup(from, &flags); |
2530 | 2622 | ||
2531 | if (PageCgroupFileMapped(pc)) { | 2623 | if (!anon && page_mapped(page)) { |
2532 | /* Update mapped_file data for mem_cgroup */ | 2624 | /* Update mapped_file data for mem_cgroup */ |
2533 | preempt_disable(); | 2625 | preempt_disable(); |
2534 | __this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]); | 2626 | __this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]); |
2535 | __this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]); | 2627 | __this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]); |
2536 | preempt_enable(); | 2628 | preempt_enable(); |
2537 | } | 2629 | } |
2538 | mem_cgroup_charge_statistics(from, PageCgroupCache(pc), -nr_pages); | 2630 | mem_cgroup_charge_statistics(from, anon, -nr_pages); |
2539 | if (uncharge) | 2631 | if (uncharge) |
2540 | /* This is not "cancel", but cancel_charge does all we need. */ | 2632 | /* This is not "cancel", but cancel_charge does all we need. */ |
2541 | __mem_cgroup_cancel_charge(from, nr_pages); | 2633 | __mem_cgroup_cancel_charge(from, nr_pages); |
2542 | 2634 | ||
2543 | /* caller should have done css_get */ | 2635 | /* caller should have done css_get */ |
2544 | pc->mem_cgroup = to; | 2636 | pc->mem_cgroup = to; |
2545 | mem_cgroup_charge_statistics(to, PageCgroupCache(pc), nr_pages); | 2637 | mem_cgroup_charge_statistics(to, anon, nr_pages); |
2546 | /* | 2638 | /* |
2547 | * We charge against "to" which may not have any tasks. Then, "to" | 2639 | * We charge against "to" which may not have any tasks. Then, "to" |
2548 | * can be under rmdir(). But in current implementation, caller of | 2640 | * can be under rmdir(). But in current implementation, caller of |
@@ -2550,7 +2642,7 @@ static int mem_cgroup_move_account(struct page *page, | |||
2550 | * guaranteed that "to" is never removed. So, we don't check rmdir | 2642 | * guaranteed that "to" is never removed. So, we don't check rmdir |
2551 | * status here. | 2643 | * status here. |
2552 | */ | 2644 | */ |
2553 | move_unlock_page_cgroup(pc, &flags); | 2645 | move_unlock_mem_cgroup(from, &flags); |
2554 | ret = 0; | 2646 | ret = 0; |
2555 | unlock: | 2647 | unlock: |
2556 | unlock_page_cgroup(pc); | 2648 | unlock_page_cgroup(pc); |
@@ -2642,7 +2734,7 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, | |||
2642 | ret = __mem_cgroup_try_charge(mm, gfp_mask, nr_pages, &memcg, oom); | 2734 | ret = __mem_cgroup_try_charge(mm, gfp_mask, nr_pages, &memcg, oom); |
2643 | if (ret == -ENOMEM) | 2735 | if (ret == -ENOMEM) |
2644 | return ret; | 2736 | return ret; |
2645 | __mem_cgroup_commit_charge(memcg, page, nr_pages, pc, ctype); | 2737 | __mem_cgroup_commit_charge(memcg, page, nr_pages, pc, ctype, false); |
2646 | return 0; | 2738 | return 0; |
2647 | } | 2739 | } |
2648 | 2740 | ||
@@ -2662,35 +2754,6 @@ static void | |||
2662 | __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr, | 2754 | __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr, |
2663 | enum charge_type ctype); | 2755 | enum charge_type ctype); |
2664 | 2756 | ||
2665 | static void | ||
2666 | __mem_cgroup_commit_charge_lrucare(struct page *page, struct mem_cgroup *memcg, | ||
2667 | enum charge_type ctype) | ||
2668 | { | ||
2669 | struct page_cgroup *pc = lookup_page_cgroup(page); | ||
2670 | struct zone *zone = page_zone(page); | ||
2671 | unsigned long flags; | ||
2672 | bool removed = false; | ||
2673 | |||
2674 | /* | ||
2675 | * In some case, SwapCache, FUSE(splice_buf->radixtree), the page | ||
2676 | * is already on LRU. It means the page may on some other page_cgroup's | ||
2677 | * LRU. Take care of it. | ||
2678 | */ | ||
2679 | spin_lock_irqsave(&zone->lru_lock, flags); | ||
2680 | if (PageLRU(page)) { | ||
2681 | del_page_from_lru_list(zone, page, page_lru(page)); | ||
2682 | ClearPageLRU(page); | ||
2683 | removed = true; | ||
2684 | } | ||
2685 | __mem_cgroup_commit_charge(memcg, page, 1, pc, ctype); | ||
2686 | if (removed) { | ||
2687 | add_page_to_lru_list(zone, page, page_lru(page)); | ||
2688 | SetPageLRU(page); | ||
2689 | } | ||
2690 | spin_unlock_irqrestore(&zone->lru_lock, flags); | ||
2691 | return; | ||
2692 | } | ||
2693 | |||
2694 | int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, | 2757 | int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, |
2695 | gfp_t gfp_mask) | 2758 | gfp_t gfp_mask) |
2696 | { | 2759 | { |
@@ -2768,13 +2831,16 @@ static void | |||
2768 | __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *memcg, | 2831 | __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *memcg, |
2769 | enum charge_type ctype) | 2832 | enum charge_type ctype) |
2770 | { | 2833 | { |
2834 | struct page_cgroup *pc; | ||
2835 | |||
2771 | if (mem_cgroup_disabled()) | 2836 | if (mem_cgroup_disabled()) |
2772 | return; | 2837 | return; |
2773 | if (!memcg) | 2838 | if (!memcg) |
2774 | return; | 2839 | return; |
2775 | cgroup_exclude_rmdir(&memcg->css); | 2840 | cgroup_exclude_rmdir(&memcg->css); |
2776 | 2841 | ||
2777 | __mem_cgroup_commit_charge_lrucare(page, memcg, ctype); | 2842 | pc = lookup_page_cgroup(page); |
2843 | __mem_cgroup_commit_charge(memcg, page, 1, pc, ctype, true); | ||
2778 | /* | 2844 | /* |
2779 | * Now swap is on-memory. This means this page may be | 2845 | * Now swap is on-memory. This means this page may be |
2780 | * counted both as mem and swap....double count. | 2846 | * counted both as mem and swap....double count. |
@@ -2878,7 +2944,6 @@ direct_uncharge: | |||
2878 | res_counter_uncharge(&memcg->memsw, nr_pages * PAGE_SIZE); | 2944 | res_counter_uncharge(&memcg->memsw, nr_pages * PAGE_SIZE); |
2879 | if (unlikely(batch->memcg != memcg)) | 2945 | if (unlikely(batch->memcg != memcg)) |
2880 | memcg_oom_recover(memcg); | 2946 | memcg_oom_recover(memcg); |
2881 | return; | ||
2882 | } | 2947 | } |
2883 | 2948 | ||
2884 | /* | 2949 | /* |
@@ -2890,6 +2955,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) | |||
2890 | struct mem_cgroup *memcg = NULL; | 2955 | struct mem_cgroup *memcg = NULL; |
2891 | unsigned int nr_pages = 1; | 2956 | unsigned int nr_pages = 1; |
2892 | struct page_cgroup *pc; | 2957 | struct page_cgroup *pc; |
2958 | bool anon; | ||
2893 | 2959 | ||
2894 | if (mem_cgroup_disabled()) | 2960 | if (mem_cgroup_disabled()) |
2895 | return NULL; | 2961 | return NULL; |
@@ -2915,8 +2981,17 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) | |||
2915 | if (!PageCgroupUsed(pc)) | 2981 | if (!PageCgroupUsed(pc)) |
2916 | goto unlock_out; | 2982 | goto unlock_out; |
2917 | 2983 | ||
2984 | anon = PageAnon(page); | ||
2985 | |||
2918 | switch (ctype) { | 2986 | switch (ctype) { |
2919 | case MEM_CGROUP_CHARGE_TYPE_MAPPED: | 2987 | case MEM_CGROUP_CHARGE_TYPE_MAPPED: |
2988 | /* | ||
2989 | * Generally PageAnon tells if it's the anon statistics to be | ||
2990 | * updated; but sometimes e.g. mem_cgroup_uncharge_page() is | ||
2991 | * used before page reached the stage of being marked PageAnon. | ||
2992 | */ | ||
2993 | anon = true; | ||
2994 | /* fallthrough */ | ||
2920 | case MEM_CGROUP_CHARGE_TYPE_DROP: | 2995 | case MEM_CGROUP_CHARGE_TYPE_DROP: |
2921 | /* See mem_cgroup_prepare_migration() */ | 2996 | /* See mem_cgroup_prepare_migration() */ |
2922 | if (page_mapped(page) || PageCgroupMigration(pc)) | 2997 | if (page_mapped(page) || PageCgroupMigration(pc)) |
@@ -2933,7 +3008,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) | |||
2933 | break; | 3008 | break; |
2934 | } | 3009 | } |
2935 | 3010 | ||
2936 | mem_cgroup_charge_statistics(memcg, PageCgroupCache(pc), -nr_pages); | 3011 | mem_cgroup_charge_statistics(memcg, anon, -nr_pages); |
2937 | 3012 | ||
2938 | ClearPageCgroupUsed(pc); | 3013 | ClearPageCgroupUsed(pc); |
2939 | /* | 3014 | /* |
@@ -3026,23 +3101,6 @@ void mem_cgroup_uncharge_end(void) | |||
3026 | batch->memcg = NULL; | 3101 | batch->memcg = NULL; |
3027 | } | 3102 | } |
3028 | 3103 | ||
3029 | /* | ||
3030 | * A function for resetting pc->mem_cgroup for newly allocated pages. | ||
3031 | * This function should be called if the newpage will be added to LRU | ||
3032 | * before start accounting. | ||
3033 | */ | ||
3034 | void mem_cgroup_reset_owner(struct page *newpage) | ||
3035 | { | ||
3036 | struct page_cgroup *pc; | ||
3037 | |||
3038 | if (mem_cgroup_disabled()) | ||
3039 | return; | ||
3040 | |||
3041 | pc = lookup_page_cgroup(newpage); | ||
3042 | VM_BUG_ON(PageCgroupUsed(pc)); | ||
3043 | pc->mem_cgroup = root_mem_cgroup; | ||
3044 | } | ||
3045 | |||
3046 | #ifdef CONFIG_SWAP | 3104 | #ifdef CONFIG_SWAP |
3047 | /* | 3105 | /* |
3048 | * called after __delete_from_swap_cache() and drop "page" account. | 3106 | * called after __delete_from_swap_cache() and drop "page" account. |
@@ -3247,7 +3305,7 @@ int mem_cgroup_prepare_migration(struct page *page, | |||
3247 | ctype = MEM_CGROUP_CHARGE_TYPE_CACHE; | 3305 | ctype = MEM_CGROUP_CHARGE_TYPE_CACHE; |
3248 | else | 3306 | else |
3249 | ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM; | 3307 | ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM; |
3250 | __mem_cgroup_commit_charge(memcg, newpage, 1, pc, ctype); | 3308 | __mem_cgroup_commit_charge(memcg, newpage, 1, pc, ctype, false); |
3251 | return ret; | 3309 | return ret; |
3252 | } | 3310 | } |
3253 | 3311 | ||
@@ -3257,6 +3315,7 @@ void mem_cgroup_end_migration(struct mem_cgroup *memcg, | |||
3257 | { | 3315 | { |
3258 | struct page *used, *unused; | 3316 | struct page *used, *unused; |
3259 | struct page_cgroup *pc; | 3317 | struct page_cgroup *pc; |
3318 | bool anon; | ||
3260 | 3319 | ||
3261 | if (!memcg) | 3320 | if (!memcg) |
3262 | return; | 3321 | return; |
@@ -3278,8 +3337,10 @@ void mem_cgroup_end_migration(struct mem_cgroup *memcg, | |||
3278 | lock_page_cgroup(pc); | 3337 | lock_page_cgroup(pc); |
3279 | ClearPageCgroupMigration(pc); | 3338 | ClearPageCgroupMigration(pc); |
3280 | unlock_page_cgroup(pc); | 3339 | unlock_page_cgroup(pc); |
3281 | 3340 | anon = PageAnon(used); | |
3282 | __mem_cgroup_uncharge_common(unused, MEM_CGROUP_CHARGE_TYPE_FORCE); | 3341 | __mem_cgroup_uncharge_common(unused, |
3342 | anon ? MEM_CGROUP_CHARGE_TYPE_MAPPED | ||
3343 | : MEM_CGROUP_CHARGE_TYPE_CACHE); | ||
3283 | 3344 | ||
3284 | /* | 3345 | /* |
3285 | * If a page is a file cache, radix-tree replacement is very atomic | 3346 | * If a page is a file cache, radix-tree replacement is very atomic |
@@ -3289,7 +3350,7 @@ void mem_cgroup_end_migration(struct mem_cgroup *memcg, | |||
3289 | * and USED bit check in mem_cgroup_uncharge_page() will do enough | 3350 | * and USED bit check in mem_cgroup_uncharge_page() will do enough |
3290 | * check. (see prepare_charge() also) | 3351 | * check. (see prepare_charge() also) |
3291 | */ | 3352 | */ |
3292 | if (PageAnon(used)) | 3353 | if (anon) |
3293 | mem_cgroup_uncharge_page(used); | 3354 | mem_cgroup_uncharge_page(used); |
3294 | /* | 3355 | /* |
3295 | * At migration, we may charge account against cgroup which has no | 3356 | * At migration, we may charge account against cgroup which has no |
@@ -3319,7 +3380,7 @@ void mem_cgroup_replace_page_cache(struct page *oldpage, | |||
3319 | /* fix accounting on old pages */ | 3380 | /* fix accounting on old pages */ |
3320 | lock_page_cgroup(pc); | 3381 | lock_page_cgroup(pc); |
3321 | memcg = pc->mem_cgroup; | 3382 | memcg = pc->mem_cgroup; |
3322 | mem_cgroup_charge_statistics(memcg, PageCgroupCache(pc), -1); | 3383 | mem_cgroup_charge_statistics(memcg, false, -1); |
3323 | ClearPageCgroupUsed(pc); | 3384 | ClearPageCgroupUsed(pc); |
3324 | unlock_page_cgroup(pc); | 3385 | unlock_page_cgroup(pc); |
3325 | 3386 | ||
@@ -3331,7 +3392,7 @@ void mem_cgroup_replace_page_cache(struct page *oldpage, | |||
3331 | * the newpage may be on LRU(or pagevec for LRU) already. We lock | 3392 | * the newpage may be on LRU(or pagevec for LRU) already. We lock |
3332 | * LRU while we overwrite pc->mem_cgroup. | 3393 | * LRU while we overwrite pc->mem_cgroup. |
3333 | */ | 3394 | */ |
3334 | __mem_cgroup_commit_charge_lrucare(newpage, memcg, type); | 3395 | __mem_cgroup_commit_charge(memcg, newpage, 1, pc, type, true); |
3335 | } | 3396 | } |
3336 | 3397 | ||
3337 | #ifdef CONFIG_DEBUG_VM | 3398 | #ifdef CONFIG_DEBUG_VM |
@@ -3530,7 +3591,7 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, | |||
3530 | break; | 3591 | break; |
3531 | 3592 | ||
3532 | nr_scanned = 0; | 3593 | nr_scanned = 0; |
3533 | reclaimed = mem_cgroup_soft_reclaim(mz->mem, zone, | 3594 | reclaimed = mem_cgroup_soft_reclaim(mz->memcg, zone, |
3534 | gfp_mask, &nr_scanned); | 3595 | gfp_mask, &nr_scanned); |
3535 | nr_reclaimed += reclaimed; | 3596 | nr_reclaimed += reclaimed; |
3536 | *total_scanned += nr_scanned; | 3597 | *total_scanned += nr_scanned; |
@@ -3557,13 +3618,13 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, | |||
3557 | next_mz = | 3618 | next_mz = |
3558 | __mem_cgroup_largest_soft_limit_node(mctz); | 3619 | __mem_cgroup_largest_soft_limit_node(mctz); |
3559 | if (next_mz == mz) | 3620 | if (next_mz == mz) |
3560 | css_put(&next_mz->mem->css); | 3621 | css_put(&next_mz->memcg->css); |
3561 | else /* next_mz == NULL or other memcg */ | 3622 | else /* next_mz == NULL or other memcg */ |
3562 | break; | 3623 | break; |
3563 | } while (1); | 3624 | } while (1); |
3564 | } | 3625 | } |
3565 | __mem_cgroup_remove_exceeded(mz->mem, mz, mctz); | 3626 | __mem_cgroup_remove_exceeded(mz->memcg, mz, mctz); |
3566 | excess = res_counter_soft_limit_excess(&mz->mem->res); | 3627 | excess = res_counter_soft_limit_excess(&mz->memcg->res); |
3567 | /* | 3628 | /* |
3568 | * One school of thought says that we should not add | 3629 | * One school of thought says that we should not add |
3569 | * back the node to the tree if reclaim returns 0. | 3630 | * back the node to the tree if reclaim returns 0. |
@@ -3573,9 +3634,9 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, | |||
3573 | * term TODO. | 3634 | * term TODO. |
3574 | */ | 3635 | */ |
3575 | /* If excess == 0, no tree ops */ | 3636 | /* If excess == 0, no tree ops */ |
3576 | __mem_cgroup_insert_exceeded(mz->mem, mz, mctz, excess); | 3637 | __mem_cgroup_insert_exceeded(mz->memcg, mz, mctz, excess); |
3577 | spin_unlock(&mctz->lock); | 3638 | spin_unlock(&mctz->lock); |
3578 | css_put(&mz->mem->css); | 3639 | css_put(&mz->memcg->css); |
3579 | loop++; | 3640 | loop++; |
3580 | /* | 3641 | /* |
3581 | * Could not reclaim anything and there are no more | 3642 | * Could not reclaim anything and there are no more |
@@ -3588,7 +3649,7 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, | |||
3588 | break; | 3649 | break; |
3589 | } while (!nr_reclaimed); | 3650 | } while (!nr_reclaimed); |
3590 | if (next_mz) | 3651 | if (next_mz) |
3591 | css_put(&next_mz->mem->css); | 3652 | css_put(&next_mz->memcg->css); |
3592 | return nr_reclaimed; | 3653 | return nr_reclaimed; |
3593 | } | 3654 | } |
3594 | 3655 | ||
@@ -3610,7 +3671,7 @@ static int mem_cgroup_force_empty_list(struct mem_cgroup *memcg, | |||
3610 | mz = mem_cgroup_zoneinfo(memcg, node, zid); | 3671 | mz = mem_cgroup_zoneinfo(memcg, node, zid); |
3611 | list = &mz->lruvec.lists[lru]; | 3672 | list = &mz->lruvec.lists[lru]; |
3612 | 3673 | ||
3613 | loop = MEM_CGROUP_ZSTAT(mz, lru); | 3674 | loop = mz->lru_size[lru]; |
3614 | /* give some margin against EBUSY etc...*/ | 3675 | /* give some margin against EBUSY etc...*/ |
3615 | loop += 256; | 3676 | loop += 256; |
3616 | busy = NULL; | 3677 | busy = NULL; |
@@ -3684,10 +3745,10 @@ move_account: | |||
3684 | mem_cgroup_start_move(memcg); | 3745 | mem_cgroup_start_move(memcg); |
3685 | for_each_node_state(node, N_HIGH_MEMORY) { | 3746 | for_each_node_state(node, N_HIGH_MEMORY) { |
3686 | for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) { | 3747 | for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) { |
3687 | enum lru_list l; | 3748 | enum lru_list lru; |
3688 | for_each_lru(l) { | 3749 | for_each_lru(lru) { |
3689 | ret = mem_cgroup_force_empty_list(memcg, | 3750 | ret = mem_cgroup_force_empty_list(memcg, |
3690 | node, zid, l); | 3751 | node, zid, lru); |
3691 | if (ret) | 3752 | if (ret) |
3692 | break; | 3753 | break; |
3693 | } | 3754 | } |
@@ -3841,7 +3902,6 @@ static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft) | |||
3841 | break; | 3902 | break; |
3842 | default: | 3903 | default: |
3843 | BUG(); | 3904 | BUG(); |
3844 | break; | ||
3845 | } | 3905 | } |
3846 | return val; | 3906 | return val; |
3847 | } | 3907 | } |
@@ -3920,7 +3980,6 @@ static void memcg_get_hierarchical_limit(struct mem_cgroup *memcg, | |||
3920 | out: | 3980 | out: |
3921 | *mem_limit = min_limit; | 3981 | *mem_limit = min_limit; |
3922 | *memsw_limit = min_memsw_limit; | 3982 | *memsw_limit = min_memsw_limit; |
3923 | return; | ||
3924 | } | 3983 | } |
3925 | 3984 | ||
3926 | static int mem_cgroup_reset(struct cgroup *cont, unsigned int event) | 3985 | static int mem_cgroup_reset(struct cgroup *cont, unsigned int event) |
@@ -4079,38 +4138,38 @@ static int mem_control_numa_stat_show(struct seq_file *m, void *arg) | |||
4079 | unsigned long total_nr, file_nr, anon_nr, unevictable_nr; | 4138 | unsigned long total_nr, file_nr, anon_nr, unevictable_nr; |
4080 | unsigned long node_nr; | 4139 | unsigned long node_nr; |
4081 | struct cgroup *cont = m->private; | 4140 | struct cgroup *cont = m->private; |
4082 | struct mem_cgroup *mem_cont = mem_cgroup_from_cont(cont); | 4141 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); |
4083 | 4142 | ||
4084 | total_nr = mem_cgroup_nr_lru_pages(mem_cont, LRU_ALL); | 4143 | total_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL); |
4085 | seq_printf(m, "total=%lu", total_nr); | 4144 | seq_printf(m, "total=%lu", total_nr); |
4086 | for_each_node_state(nid, N_HIGH_MEMORY) { | 4145 | for_each_node_state(nid, N_HIGH_MEMORY) { |
4087 | node_nr = mem_cgroup_node_nr_lru_pages(mem_cont, nid, LRU_ALL); | 4146 | node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL); |
4088 | seq_printf(m, " N%d=%lu", nid, node_nr); | 4147 | seq_printf(m, " N%d=%lu", nid, node_nr); |
4089 | } | 4148 | } |
4090 | seq_putc(m, '\n'); | 4149 | seq_putc(m, '\n'); |
4091 | 4150 | ||
4092 | file_nr = mem_cgroup_nr_lru_pages(mem_cont, LRU_ALL_FILE); | 4151 | file_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL_FILE); |
4093 | seq_printf(m, "file=%lu", file_nr); | 4152 | seq_printf(m, "file=%lu", file_nr); |
4094 | for_each_node_state(nid, N_HIGH_MEMORY) { | 4153 | for_each_node_state(nid, N_HIGH_MEMORY) { |
4095 | node_nr = mem_cgroup_node_nr_lru_pages(mem_cont, nid, | 4154 | node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid, |
4096 | LRU_ALL_FILE); | 4155 | LRU_ALL_FILE); |
4097 | seq_printf(m, " N%d=%lu", nid, node_nr); | 4156 | seq_printf(m, " N%d=%lu", nid, node_nr); |
4098 | } | 4157 | } |
4099 | seq_putc(m, '\n'); | 4158 | seq_putc(m, '\n'); |
4100 | 4159 | ||
4101 | anon_nr = mem_cgroup_nr_lru_pages(mem_cont, LRU_ALL_ANON); | 4160 | anon_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL_ANON); |
4102 | seq_printf(m, "anon=%lu", anon_nr); | 4161 | seq_printf(m, "anon=%lu", anon_nr); |
4103 | for_each_node_state(nid, N_HIGH_MEMORY) { | 4162 | for_each_node_state(nid, N_HIGH_MEMORY) { |
4104 | node_nr = mem_cgroup_node_nr_lru_pages(mem_cont, nid, | 4163 | node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid, |
4105 | LRU_ALL_ANON); | 4164 | LRU_ALL_ANON); |
4106 | seq_printf(m, " N%d=%lu", nid, node_nr); | 4165 | seq_printf(m, " N%d=%lu", nid, node_nr); |
4107 | } | 4166 | } |
4108 | seq_putc(m, '\n'); | 4167 | seq_putc(m, '\n'); |
4109 | 4168 | ||
4110 | unevictable_nr = mem_cgroup_nr_lru_pages(mem_cont, BIT(LRU_UNEVICTABLE)); | 4169 | unevictable_nr = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_UNEVICTABLE)); |
4111 | seq_printf(m, "unevictable=%lu", unevictable_nr); | 4170 | seq_printf(m, "unevictable=%lu", unevictable_nr); |
4112 | for_each_node_state(nid, N_HIGH_MEMORY) { | 4171 | for_each_node_state(nid, N_HIGH_MEMORY) { |
4113 | node_nr = mem_cgroup_node_nr_lru_pages(mem_cont, nid, | 4172 | node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid, |
4114 | BIT(LRU_UNEVICTABLE)); | 4173 | BIT(LRU_UNEVICTABLE)); |
4115 | seq_printf(m, " N%d=%lu", nid, node_nr); | 4174 | seq_printf(m, " N%d=%lu", nid, node_nr); |
4116 | } | 4175 | } |
@@ -4122,12 +4181,12 @@ static int mem_control_numa_stat_show(struct seq_file *m, void *arg) | |||
4122 | static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft, | 4181 | static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft, |
4123 | struct cgroup_map_cb *cb) | 4182 | struct cgroup_map_cb *cb) |
4124 | { | 4183 | { |
4125 | struct mem_cgroup *mem_cont = mem_cgroup_from_cont(cont); | 4184 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); |
4126 | struct mcs_total_stat mystat; | 4185 | struct mcs_total_stat mystat; |
4127 | int i; | 4186 | int i; |
4128 | 4187 | ||
4129 | memset(&mystat, 0, sizeof(mystat)); | 4188 | memset(&mystat, 0, sizeof(mystat)); |
4130 | mem_cgroup_get_local_stat(mem_cont, &mystat); | 4189 | mem_cgroup_get_local_stat(memcg, &mystat); |
4131 | 4190 | ||
4132 | 4191 | ||
4133 | for (i = 0; i < NR_MCS_STAT; i++) { | 4192 | for (i = 0; i < NR_MCS_STAT; i++) { |
@@ -4139,14 +4198,14 @@ static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft, | |||
4139 | /* Hierarchical information */ | 4198 | /* Hierarchical information */ |
4140 | { | 4199 | { |
4141 | unsigned long long limit, memsw_limit; | 4200 | unsigned long long limit, memsw_limit; |
4142 | memcg_get_hierarchical_limit(mem_cont, &limit, &memsw_limit); | 4201 | memcg_get_hierarchical_limit(memcg, &limit, &memsw_limit); |
4143 | cb->fill(cb, "hierarchical_memory_limit", limit); | 4202 | cb->fill(cb, "hierarchical_memory_limit", limit); |
4144 | if (do_swap_account) | 4203 | if (do_swap_account) |
4145 | cb->fill(cb, "hierarchical_memsw_limit", memsw_limit); | 4204 | cb->fill(cb, "hierarchical_memsw_limit", memsw_limit); |
4146 | } | 4205 | } |
4147 | 4206 | ||
4148 | memset(&mystat, 0, sizeof(mystat)); | 4207 | memset(&mystat, 0, sizeof(mystat)); |
4149 | mem_cgroup_get_total_stat(mem_cont, &mystat); | 4208 | mem_cgroup_get_total_stat(memcg, &mystat); |
4150 | for (i = 0; i < NR_MCS_STAT; i++) { | 4209 | for (i = 0; i < NR_MCS_STAT; i++) { |
4151 | if (i == MCS_SWAP && !do_swap_account) | 4210 | if (i == MCS_SWAP && !do_swap_account) |
4152 | continue; | 4211 | continue; |
@@ -4162,7 +4221,7 @@ static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft, | |||
4162 | 4221 | ||
4163 | for_each_online_node(nid) | 4222 | for_each_online_node(nid) |
4164 | for (zid = 0; zid < MAX_NR_ZONES; zid++) { | 4223 | for (zid = 0; zid < MAX_NR_ZONES; zid++) { |
4165 | mz = mem_cgroup_zoneinfo(mem_cont, nid, zid); | 4224 | mz = mem_cgroup_zoneinfo(memcg, nid, zid); |
4166 | 4225 | ||
4167 | recent_rotated[0] += | 4226 | recent_rotated[0] += |
4168 | mz->reclaim_stat.recent_rotated[0]; | 4227 | mz->reclaim_stat.recent_rotated[0]; |
@@ -4407,11 +4466,8 @@ static void mem_cgroup_usage_unregister_event(struct cgroup *cgrp, | |||
4407 | else | 4466 | else |
4408 | BUG(); | 4467 | BUG(); |
4409 | 4468 | ||
4410 | /* | 4469 | if (!thresholds->primary) |
4411 | * Something went wrong if we trying to unregister a threshold | 4470 | goto unlock; |
4412 | * if we don't have thresholds | ||
4413 | */ | ||
4414 | BUG_ON(!thresholds); | ||
4415 | 4471 | ||
4416 | usage = mem_cgroup_usage(memcg, type == _MEMSWAP); | 4472 | usage = mem_cgroup_usage(memcg, type == _MEMSWAP); |
4417 | 4473 | ||
@@ -4461,7 +4517,7 @@ swap_buffers: | |||
4461 | 4517 | ||
4462 | /* To be sure that nobody uses thresholds */ | 4518 | /* To be sure that nobody uses thresholds */ |
4463 | synchronize_rcu(); | 4519 | synchronize_rcu(); |
4464 | 4520 | unlock: | |
4465 | mutex_unlock(&memcg->thresholds_lock); | 4521 | mutex_unlock(&memcg->thresholds_lock); |
4466 | } | 4522 | } |
4467 | 4523 | ||
@@ -4580,10 +4636,9 @@ static int register_kmem_files(struct cgroup *cont, struct cgroup_subsys *ss) | |||
4580 | return mem_cgroup_sockets_init(cont, ss); | 4636 | return mem_cgroup_sockets_init(cont, ss); |
4581 | }; | 4637 | }; |
4582 | 4638 | ||
4583 | static void kmem_cgroup_destroy(struct cgroup_subsys *ss, | 4639 | static void kmem_cgroup_destroy(struct cgroup *cont) |
4584 | struct cgroup *cont) | ||
4585 | { | 4640 | { |
4586 | mem_cgroup_sockets_destroy(cont, ss); | 4641 | mem_cgroup_sockets_destroy(cont); |
4587 | } | 4642 | } |
4588 | #else | 4643 | #else |
4589 | static int register_kmem_files(struct cgroup *cont, struct cgroup_subsys *ss) | 4644 | static int register_kmem_files(struct cgroup *cont, struct cgroup_subsys *ss) |
@@ -4591,8 +4646,7 @@ static int register_kmem_files(struct cgroup *cont, struct cgroup_subsys *ss) | |||
4591 | return 0; | 4646 | return 0; |
4592 | } | 4647 | } |
4593 | 4648 | ||
4594 | static void kmem_cgroup_destroy(struct cgroup_subsys *ss, | 4649 | static void kmem_cgroup_destroy(struct cgroup *cont) |
4595 | struct cgroup *cont) | ||
4596 | { | 4650 | { |
4597 | } | 4651 | } |
4598 | #endif | 4652 | #endif |
@@ -4716,7 +4770,7 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node) | |||
4716 | { | 4770 | { |
4717 | struct mem_cgroup_per_node *pn; | 4771 | struct mem_cgroup_per_node *pn; |
4718 | struct mem_cgroup_per_zone *mz; | 4772 | struct mem_cgroup_per_zone *mz; |
4719 | enum lru_list l; | 4773 | enum lru_list lru; |
4720 | int zone, tmp = node; | 4774 | int zone, tmp = node; |
4721 | /* | 4775 | /* |
4722 | * This routine is called against possible nodes. | 4776 | * This routine is called against possible nodes. |
@@ -4734,11 +4788,11 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node) | |||
4734 | 4788 | ||
4735 | for (zone = 0; zone < MAX_NR_ZONES; zone++) { | 4789 | for (zone = 0; zone < MAX_NR_ZONES; zone++) { |
4736 | mz = &pn->zoneinfo[zone]; | 4790 | mz = &pn->zoneinfo[zone]; |
4737 | for_each_lru(l) | 4791 | for_each_lru(lru) |
4738 | INIT_LIST_HEAD(&mz->lruvec.lists[l]); | 4792 | INIT_LIST_HEAD(&mz->lruvec.lists[lru]); |
4739 | mz->usage_in_excess = 0; | 4793 | mz->usage_in_excess = 0; |
4740 | mz->on_tree = false; | 4794 | mz->on_tree = false; |
4741 | mz->mem = memcg; | 4795 | mz->memcg = memcg; |
4742 | } | 4796 | } |
4743 | memcg->info.nodeinfo[node] = pn; | 4797 | memcg->info.nodeinfo[node] = pn; |
4744 | return 0; | 4798 | return 0; |
@@ -4751,33 +4805,54 @@ static void free_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node) | |||
4751 | 4805 | ||
4752 | static struct mem_cgroup *mem_cgroup_alloc(void) | 4806 | static struct mem_cgroup *mem_cgroup_alloc(void) |
4753 | { | 4807 | { |
4754 | struct mem_cgroup *mem; | 4808 | struct mem_cgroup *memcg; |
4755 | int size = sizeof(struct mem_cgroup); | 4809 | int size = sizeof(struct mem_cgroup); |
4756 | 4810 | ||
4757 | /* Can be very big if MAX_NUMNODES is very big */ | 4811 | /* Can be very big if MAX_NUMNODES is very big */ |
4758 | if (size < PAGE_SIZE) | 4812 | if (size < PAGE_SIZE) |
4759 | mem = kzalloc(size, GFP_KERNEL); | 4813 | memcg = kzalloc(size, GFP_KERNEL); |
4760 | else | 4814 | else |
4761 | mem = vzalloc(size); | 4815 | memcg = vzalloc(size); |
4762 | 4816 | ||
4763 | if (!mem) | 4817 | if (!memcg) |
4764 | return NULL; | 4818 | return NULL; |
4765 | 4819 | ||
4766 | mem->stat = alloc_percpu(struct mem_cgroup_stat_cpu); | 4820 | memcg->stat = alloc_percpu(struct mem_cgroup_stat_cpu); |
4767 | if (!mem->stat) | 4821 | if (!memcg->stat) |
4768 | goto out_free; | 4822 | goto out_free; |
4769 | spin_lock_init(&mem->pcp_counter_lock); | 4823 | spin_lock_init(&memcg->pcp_counter_lock); |
4770 | return mem; | 4824 | return memcg; |
4771 | 4825 | ||
4772 | out_free: | 4826 | out_free: |
4773 | if (size < PAGE_SIZE) | 4827 | if (size < PAGE_SIZE) |
4774 | kfree(mem); | 4828 | kfree(memcg); |
4775 | else | 4829 | else |
4776 | vfree(mem); | 4830 | vfree(memcg); |
4777 | return NULL; | 4831 | return NULL; |
4778 | } | 4832 | } |
4779 | 4833 | ||
4780 | /* | 4834 | /* |
4835 | * Helpers for freeing a vzalloc()ed mem_cgroup by RCU, | ||
4836 | * but in process context. The work_freeing structure is overlaid | ||
4837 | * on the rcu_freeing structure, which itself is overlaid on memsw. | ||
4838 | */ | ||
4839 | static void vfree_work(struct work_struct *work) | ||
4840 | { | ||
4841 | struct mem_cgroup *memcg; | ||
4842 | |||
4843 | memcg = container_of(work, struct mem_cgroup, work_freeing); | ||
4844 | vfree(memcg); | ||
4845 | } | ||
4846 | static void vfree_rcu(struct rcu_head *rcu_head) | ||
4847 | { | ||
4848 | struct mem_cgroup *memcg; | ||
4849 | |||
4850 | memcg = container_of(rcu_head, struct mem_cgroup, rcu_freeing); | ||
4851 | INIT_WORK(&memcg->work_freeing, vfree_work); | ||
4852 | schedule_work(&memcg->work_freeing); | ||
4853 | } | ||
4854 | |||
4855 | /* | ||
4781 | * At destroying mem_cgroup, references from swap_cgroup can remain. | 4856 | * At destroying mem_cgroup, references from swap_cgroup can remain. |
4782 | * (scanning all at force_empty is too costly...) | 4857 | * (scanning all at force_empty is too costly...) |
4783 | * | 4858 | * |
@@ -4800,9 +4875,9 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg) | |||
4800 | 4875 | ||
4801 | free_percpu(memcg->stat); | 4876 | free_percpu(memcg->stat); |
4802 | if (sizeof(struct mem_cgroup) < PAGE_SIZE) | 4877 | if (sizeof(struct mem_cgroup) < PAGE_SIZE) |
4803 | kfree(memcg); | 4878 | kfree_rcu(memcg, rcu_freeing); |
4804 | else | 4879 | else |
4805 | vfree(memcg); | 4880 | call_rcu(&memcg->rcu_freeing, vfree_rcu); |
4806 | } | 4881 | } |
4807 | 4882 | ||
4808 | static void mem_cgroup_get(struct mem_cgroup *memcg) | 4883 | static void mem_cgroup_get(struct mem_cgroup *memcg) |
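Note on the freeing change above: vfree() must not run from an RCU callback (softirq) context, so a vmalloc()ed mem_cgroup is freed by having the RCU callback merely schedule a work item, while a kmalloc()ed one can use kfree_rcu() directly. A minimal sketch of that pattern, assuming a hypothetical structure where the rcu_head and work_struct simply share storage in a union (the patch instead overlays them on existing fields):

#include <linux/rcupdate.h>
#include <linux/workqueue.h>
#include <linux/vmalloc.h>
#include <linux/slab.h>

/* Hypothetical object; "rcu" and "work" mirror rcu_freeing/work_freeing above. */
struct blob {
	union {
		struct rcu_head rcu;
		struct work_struct work;
	};
	char payload[1 << 20];		/* big enough that vzalloc() was used */
};

static void blob_vfree_work(struct work_struct *work)
{
	/* Process context: vfree() is allowed here. */
	vfree(container_of(work, struct blob, work));
}

static void blob_vfree_rcu(struct rcu_head *rcu)
{
	/* Softirq context: defer the actual vfree() to a workqueue. */
	struct blob *b = container_of(rcu, struct blob, rcu);

	INIT_WORK(&b->work, blob_vfree_work);
	schedule_work(&b->work);
}

static void blob_free(struct blob *b, bool vmalloced)
{
	if (vmalloced)
		call_rcu(&b->rcu, blob_vfree_rcu);
	else
		kfree_rcu(b, rcu);	/* kfree() is safe from softirq */
}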
@@ -4884,7 +4959,7 @@ err_cleanup: | |||
4884 | } | 4959 | } |
4885 | 4960 | ||
4886 | static struct cgroup_subsys_state * __ref | 4961 | static struct cgroup_subsys_state * __ref |
4887 | mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) | 4962 | mem_cgroup_create(struct cgroup *cont) |
4888 | { | 4963 | { |
4889 | struct mem_cgroup *memcg, *parent; | 4964 | struct mem_cgroup *memcg, *parent; |
4890 | long error = -ENOMEM; | 4965 | long error = -ENOMEM; |
@@ -4940,26 +5015,25 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) | |||
4940 | atomic_set(&memcg->refcnt, 1); | 5015 | atomic_set(&memcg->refcnt, 1); |
4941 | memcg->move_charge_at_immigrate = 0; | 5016 | memcg->move_charge_at_immigrate = 0; |
4942 | mutex_init(&memcg->thresholds_lock); | 5017 | mutex_init(&memcg->thresholds_lock); |
5018 | spin_lock_init(&memcg->move_lock); | ||
4943 | return &memcg->css; | 5019 | return &memcg->css; |
4944 | free_out: | 5020 | free_out: |
4945 | __mem_cgroup_free(memcg); | 5021 | __mem_cgroup_free(memcg); |
4946 | return ERR_PTR(error); | 5022 | return ERR_PTR(error); |
4947 | } | 5023 | } |
4948 | 5024 | ||
4949 | static int mem_cgroup_pre_destroy(struct cgroup_subsys *ss, | 5025 | static int mem_cgroup_pre_destroy(struct cgroup *cont) |
4950 | struct cgroup *cont) | ||
4951 | { | 5026 | { |
4952 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); | 5027 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); |
4953 | 5028 | ||
4954 | return mem_cgroup_force_empty(memcg, false); | 5029 | return mem_cgroup_force_empty(memcg, false); |
4955 | } | 5030 | } |
4956 | 5031 | ||
4957 | static void mem_cgroup_destroy(struct cgroup_subsys *ss, | 5032 | static void mem_cgroup_destroy(struct cgroup *cont) |
4958 | struct cgroup *cont) | ||
4959 | { | 5033 | { |
4960 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); | 5034 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); |
4961 | 5035 | ||
4962 | kmem_cgroup_destroy(ss, cont); | 5036 | kmem_cgroup_destroy(cont); |
4963 | 5037 | ||
4964 | mem_cgroup_put(memcg); | 5038 | mem_cgroup_put(memcg); |
4965 | } | 5039 | } |
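The signature churn in kmem_cgroup_destroy(), mem_cgroup_create(), mem_cgroup_pre_destroy() and mem_cgroup_destroy() tracks a cgroup-core API change: the callbacks no longer receive a struct cgroup_subsys pointer and identify everything from the cgroup alone. A minimal sketch of how such callbacks would be wired up, assuming the 3.3+-era cgroup_subsys layout (this declaration is not part of the hunks shown; fields such as .populate and .use_id are omitted):

#include <linux/cgroup.h>

struct cgroup_subsys mem_cgroup_subsys = {
	.name		= "memory",
	.subsys_id	= mem_cgroup_subsys_id,
	.create		= mem_cgroup_create,		/* (struct cgroup *cont) */
	.pre_destroy	= mem_cgroup_pre_destroy,
	.destroy	= mem_cgroup_destroy,
	.can_attach	= mem_cgroup_can_attach,	/* (struct cgroup *, struct cgroup_taskset *) */
	.cancel_attach	= mem_cgroup_cancel_attach,
	.attach		= mem_cgroup_move_task,
};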
@@ -5036,7 +5110,7 @@ one_by_one: | |||
5036 | } | 5110 | } |
5037 | 5111 | ||
5038 | /** | 5112 | /** |
5039 | * is_target_pte_for_mc - check a pte whether it is valid for move charge | 5113 | * get_mctgt_type - get target type of moving charge |
5040 | * @vma: the vma the pte to be checked belongs | 5114 | * @vma: the vma the pte to be checked belongs |
5041 | * @addr: the address corresponding to the pte to be checked | 5115 | * @addr: the address corresponding to the pte to be checked |
5042 | * @ptent: the pte to be checked | 5116 | * @ptent: the pte to be checked |
@@ -5059,7 +5133,7 @@ union mc_target { | |||
5059 | }; | 5133 | }; |
5060 | 5134 | ||
5061 | enum mc_target_type { | 5135 | enum mc_target_type { |
5062 | MC_TARGET_NONE, /* not used */ | 5136 | MC_TARGET_NONE = 0, |
5063 | MC_TARGET_PAGE, | 5137 | MC_TARGET_PAGE, |
5064 | MC_TARGET_SWAP, | 5138 | MC_TARGET_SWAP, |
5065 | }; | 5139 | }; |
@@ -5140,12 +5214,12 @@ static struct page *mc_handle_file_pte(struct vm_area_struct *vma, | |||
5140 | return page; | 5214 | return page; |
5141 | } | 5215 | } |
5142 | 5216 | ||
5143 | static int is_target_pte_for_mc(struct vm_area_struct *vma, | 5217 | static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma, |
5144 | unsigned long addr, pte_t ptent, union mc_target *target) | 5218 | unsigned long addr, pte_t ptent, union mc_target *target) |
5145 | { | 5219 | { |
5146 | struct page *page = NULL; | 5220 | struct page *page = NULL; |
5147 | struct page_cgroup *pc; | 5221 | struct page_cgroup *pc; |
5148 | int ret = 0; | 5222 | enum mc_target_type ret = MC_TARGET_NONE; |
5149 | swp_entry_t ent = { .val = 0 }; | 5223 | swp_entry_t ent = { .val = 0 }; |
5150 | 5224 | ||
5151 | if (pte_present(ptent)) | 5225 | if (pte_present(ptent)) |
@@ -5156,7 +5230,7 @@ static int is_target_pte_for_mc(struct vm_area_struct *vma, | |||
5156 | page = mc_handle_file_pte(vma, addr, ptent, &ent); | 5230 | page = mc_handle_file_pte(vma, addr, ptent, &ent); |
5157 | 5231 | ||
5158 | if (!page && !ent.val) | 5232 | if (!page && !ent.val) |
5159 | return 0; | 5233 | return ret; |
5160 | if (page) { | 5234 | if (page) { |
5161 | pc = lookup_page_cgroup(page); | 5235 | pc = lookup_page_cgroup(page); |
5162 | /* | 5236 | /* |
@@ -5182,6 +5256,41 @@ static int is_target_pte_for_mc(struct vm_area_struct *vma, | |||
5182 | return ret; | 5256 | return ret; |
5183 | } | 5257 | } |
5184 | 5258 | ||
5259 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | ||
5260 | /* | ||
5261 | * We don't consider swapping or file mapped pages because THP does not | ||
5262 | * support them for now. | ||
5263 | * Caller should make sure that pmd_trans_huge(pmd) is true. | ||
5264 | */ | ||
5265 | static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma, | ||
5266 | unsigned long addr, pmd_t pmd, union mc_target *target) | ||
5267 | { | ||
5268 | struct page *page = NULL; | ||
5269 | struct page_cgroup *pc; | ||
5270 | enum mc_target_type ret = MC_TARGET_NONE; | ||
5271 | |||
5272 | page = pmd_page(pmd); | ||
5273 | VM_BUG_ON(!page || !PageHead(page)); | ||
5274 | if (!move_anon()) | ||
5275 | return ret; | ||
5276 | pc = lookup_page_cgroup(page); | ||
5277 | if (PageCgroupUsed(pc) && pc->mem_cgroup == mc.from) { | ||
5278 | ret = MC_TARGET_PAGE; | ||
5279 | if (target) { | ||
5280 | get_page(page); | ||
5281 | target->page = page; | ||
5282 | } | ||
5283 | } | ||
5284 | return ret; | ||
5285 | } | ||
5286 | #else | ||
5287 | static inline enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma, | ||
5288 | unsigned long addr, pmd_t pmd, union mc_target *target) | ||
5289 | { | ||
5290 | return MC_TARGET_NONE; | ||
5291 | } | ||
5292 | #endif | ||
5293 | |||
5185 | static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd, | 5294 | static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd, |
5186 | unsigned long addr, unsigned long end, | 5295 | unsigned long addr, unsigned long end, |
5187 | struct mm_walk *walk) | 5296 | struct mm_walk *walk) |
@@ -5190,11 +5299,16 @@ static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd, | |||
5190 | pte_t *pte; | 5299 | pte_t *pte; |
5191 | spinlock_t *ptl; | 5300 | spinlock_t *ptl; |
5192 | 5301 | ||
5193 | split_huge_page_pmd(walk->mm, pmd); | 5302 | if (pmd_trans_huge_lock(pmd, vma) == 1) { |
5303 | if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE) | ||
5304 | mc.precharge += HPAGE_PMD_NR; | ||
5305 | spin_unlock(&vma->vm_mm->page_table_lock); | ||
5306 | return 0; | ||
5307 | } | ||
5194 | 5308 | ||
5195 | pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); | 5309 | pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); |
5196 | for (; addr != end; pte++, addr += PAGE_SIZE) | 5310 | for (; addr != end; pte++, addr += PAGE_SIZE) |
5197 | if (is_target_pte_for_mc(vma, addr, *pte, NULL)) | 5311 | if (get_mctgt_type(vma, addr, *pte, NULL)) |
5198 | mc.precharge++; /* increment precharge temporarily */ | 5312 | mc.precharge++; /* increment precharge temporarily */ |
5199 | pte_unmap_unlock(pte - 1, ptl); | 5313 | pte_unmap_unlock(pte - 1, ptl); |
5200 | cond_resched(); | 5314 | cond_resched(); |
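What makes the HPAGE_PMD_NR shortcut above safe is pmd_trans_huge_lock(), which is provided elsewhere (mm/huge_memory.c) and not shown in this diff. A rough sketch of its contract, as an assumption about the helper of that era rather than its real code: it returns 1 only with the page table lock held on a stable, non-splitting huge pmd, so the caller may account the whole huge page without walking ptes.

#include <linux/mm.h>
#include <linux/huge_mm.h>

/*
 * Sketch only. Return 1 with mm->page_table_lock held if *pmd is a stable
 * huge pmd, 0 (unlocked) for a regular pmd, -1 after waiting for a split.
 */
static int pmd_trans_huge_lock_sketch(pmd_t *pmd, struct vm_area_struct *vma)
{
	struct mm_struct *mm = vma->vm_mm;

	spin_lock(&mm->page_table_lock);
	if (likely(pmd_trans_huge(*pmd))) {
		if (unlikely(pmd_trans_splitting(*pmd))) {
			spin_unlock(&mm->page_table_lock);
			wait_split_huge_page(vma->anon_vma, pmd);
			return -1;
		}
		return 1;	/* caller must spin_unlock() when done */
	}
	spin_unlock(&mm->page_table_lock);
	return 0;		/* fall back to the pte-by-pte walk */
}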
@@ -5296,9 +5410,8 @@ static void mem_cgroup_clear_mc(void) | |||
5296 | mem_cgroup_end_move(from); | 5410 | mem_cgroup_end_move(from); |
5297 | } | 5411 | } |
5298 | 5412 | ||
5299 | static int mem_cgroup_can_attach(struct cgroup_subsys *ss, | 5413 | static int mem_cgroup_can_attach(struct cgroup *cgroup, |
5300 | struct cgroup *cgroup, | 5414 | struct cgroup_taskset *tset) |
5301 | struct cgroup_taskset *tset) | ||
5302 | { | 5415 | { |
5303 | struct task_struct *p = cgroup_taskset_first(tset); | 5416 | struct task_struct *p = cgroup_taskset_first(tset); |
5304 | int ret = 0; | 5417 | int ret = 0; |
@@ -5336,9 +5449,8 @@ static int mem_cgroup_can_attach(struct cgroup_subsys *ss, | |||
5336 | return ret; | 5449 | return ret; |
5337 | } | 5450 | } |
5338 | 5451 | ||
5339 | static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss, | 5452 | static void mem_cgroup_cancel_attach(struct cgroup *cgroup, |
5340 | struct cgroup *cgroup, | 5453 | struct cgroup_taskset *tset) |
5341 | struct cgroup_taskset *tset) | ||
5342 | { | 5454 | { |
5343 | mem_cgroup_clear_mc(); | 5455 | mem_cgroup_clear_mc(); |
5344 | } | 5456 | } |
@@ -5351,23 +5463,55 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd, | |||
5351 | struct vm_area_struct *vma = walk->private; | 5463 | struct vm_area_struct *vma = walk->private; |
5352 | pte_t *pte; | 5464 | pte_t *pte; |
5353 | spinlock_t *ptl; | 5465 | spinlock_t *ptl; |
5466 | enum mc_target_type target_type; | ||
5467 | union mc_target target; | ||
5468 | struct page *page; | ||
5469 | struct page_cgroup *pc; | ||
5470 | |||
5471 | /* | ||
5472 | * We don't take compound_lock() here but no race with splitting thp | ||
5473 | * happens because: | ||
5474 | * - if pmd_trans_huge_lock() returns 1, the relevant thp is not | ||
5475 | * under splitting, which means there's no concurrent thp split, | ||
5476 | * - if another thread runs into split_huge_page() just after we | ||
5477 | * entered this if-block, the thread must wait for page table lock | ||
5478 | * to be unlocked in __split_huge_page_splitting(), where the main | ||
5479 | * part of thp split is not executed yet. | ||
5480 | */ | ||
5481 | if (pmd_trans_huge_lock(pmd, vma) == 1) { | ||
5482 | if (!mc.precharge) { | ||
5483 | spin_unlock(&vma->vm_mm->page_table_lock); | ||
5484 | return 0; | ||
5485 | } | ||
5486 | target_type = get_mctgt_type_thp(vma, addr, *pmd, &target); | ||
5487 | if (target_type == MC_TARGET_PAGE) { | ||
5488 | page = target.page; | ||
5489 | if (!isolate_lru_page(page)) { | ||
5490 | pc = lookup_page_cgroup(page); | ||
5491 | if (!mem_cgroup_move_account(page, HPAGE_PMD_NR, | ||
5492 | pc, mc.from, mc.to, | ||
5493 | false)) { | ||
5494 | mc.precharge -= HPAGE_PMD_NR; | ||
5495 | mc.moved_charge += HPAGE_PMD_NR; | ||
5496 | } | ||
5497 | putback_lru_page(page); | ||
5498 | } | ||
5499 | put_page(page); | ||
5500 | } | ||
5501 | spin_unlock(&vma->vm_mm->page_table_lock); | ||
5502 | return 0; | ||
5503 | } | ||
5354 | 5504 | ||
5355 | split_huge_page_pmd(walk->mm, pmd); | ||
5356 | retry: | 5505 | retry: |
5357 | pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); | 5506 | pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); |
5358 | for (; addr != end; addr += PAGE_SIZE) { | 5507 | for (; addr != end; addr += PAGE_SIZE) { |
5359 | pte_t ptent = *(pte++); | 5508 | pte_t ptent = *(pte++); |
5360 | union mc_target target; | ||
5361 | int type; | ||
5362 | struct page *page; | ||
5363 | struct page_cgroup *pc; | ||
5364 | swp_entry_t ent; | 5509 | swp_entry_t ent; |
5365 | 5510 | ||
5366 | if (!mc.precharge) | 5511 | if (!mc.precharge) |
5367 | break; | 5512 | break; |
5368 | 5513 | ||
5369 | type = is_target_pte_for_mc(vma, addr, ptent, &target); | 5514 | switch (get_mctgt_type(vma, addr, ptent, &target)) { |
5370 | switch (type) { | ||
5371 | case MC_TARGET_PAGE: | 5515 | case MC_TARGET_PAGE: |
5372 | page = target.page; | 5516 | page = target.page; |
5373 | if (isolate_lru_page(page)) | 5517 | if (isolate_lru_page(page)) |
@@ -5380,7 +5524,7 @@ retry: | |||
5380 | mc.moved_charge++; | 5524 | mc.moved_charge++; |
5381 | } | 5525 | } |
5382 | putback_lru_page(page); | 5526 | putback_lru_page(page); |
5383 | put: /* is_target_pte_for_mc() gets the page */ | 5527 | put: /* get_mctgt_type() gets the page */ |
5384 | put_page(page); | 5528 | put_page(page); |
5385 | break; | 5529 | break; |
5386 | case MC_TARGET_SWAP: | 5530 | case MC_TARGET_SWAP: |
@@ -5453,9 +5597,8 @@ retry: | |||
5453 | up_read(&mm->mmap_sem); | 5597 | up_read(&mm->mmap_sem); |
5454 | } | 5598 | } |
5455 | 5599 | ||
5456 | static void mem_cgroup_move_task(struct cgroup_subsys *ss, | 5600 | static void mem_cgroup_move_task(struct cgroup *cont, |
5457 | struct cgroup *cont, | 5601 | struct cgroup_taskset *tset) |
5458 | struct cgroup_taskset *tset) | ||
5459 | { | 5602 | { |
5460 | struct task_struct *p = cgroup_taskset_first(tset); | 5603 | struct task_struct *p = cgroup_taskset_first(tset); |
5461 | struct mm_struct *mm = get_task_mm(p); | 5604 | struct mm_struct *mm = get_task_mm(p); |
@@ -5470,20 +5613,17 @@ static void mem_cgroup_move_task(struct cgroup_subsys *ss, | |||
5470 | mem_cgroup_clear_mc(); | 5613 | mem_cgroup_clear_mc(); |
5471 | } | 5614 | } |
5472 | #else /* !CONFIG_MMU */ | 5615 | #else /* !CONFIG_MMU */ |
5473 | static int mem_cgroup_can_attach(struct cgroup_subsys *ss, | 5616 | static int mem_cgroup_can_attach(struct cgroup *cgroup, |
5474 | struct cgroup *cgroup, | 5617 | struct cgroup_taskset *tset) |
5475 | struct cgroup_taskset *tset) | ||
5476 | { | 5618 | { |
5477 | return 0; | 5619 | return 0; |
5478 | } | 5620 | } |
5479 | static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss, | 5621 | static void mem_cgroup_cancel_attach(struct cgroup *cgroup, |
5480 | struct cgroup *cgroup, | 5622 | struct cgroup_taskset *tset) |
5481 | struct cgroup_taskset *tset) | ||
5482 | { | 5623 | { |
5483 | } | 5624 | } |
5484 | static void mem_cgroup_move_task(struct cgroup_subsys *ss, | 5625 | static void mem_cgroup_move_task(struct cgroup *cont, |
5485 | struct cgroup *cont, | 5626 | struct cgroup_taskset *tset) |
5486 | struct cgroup_taskset *tset) | ||
5487 | { | 5627 | { |
5488 | } | 5628 | } |
5489 | #endif | 5629 | #endif |
diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 56080ea3614..97cc2733551 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c | |||
@@ -187,33 +187,40 @@ int hwpoison_filter(struct page *p) | |||
187 | EXPORT_SYMBOL_GPL(hwpoison_filter); | 187 | EXPORT_SYMBOL_GPL(hwpoison_filter); |
188 | 188 | ||
189 | /* | 189 | /* |
190 | * Send all the processes who have the page mapped an ``action optional'' | 190 | * Send all the processes who have the page mapped a signal. |
191 | * signal. | 191 | * ``action optional'' if they are not immediately affected by the error |
192 | * ``action required'' if error happened in current execution context | ||
192 | */ | 193 | */ |
193 | static int kill_proc_ao(struct task_struct *t, unsigned long addr, int trapno, | 194 | static int kill_proc(struct task_struct *t, unsigned long addr, int trapno, |
194 | unsigned long pfn, struct page *page) | 195 | unsigned long pfn, struct page *page, int flags) |
195 | { | 196 | { |
196 | struct siginfo si; | 197 | struct siginfo si; |
197 | int ret; | 198 | int ret; |
198 | 199 | ||
199 | printk(KERN_ERR | 200 | printk(KERN_ERR |
200 | "MCE %#lx: Killing %s:%d early due to hardware memory corruption\n", | 201 | "MCE %#lx: Killing %s:%d due to hardware memory corruption\n", |
201 | pfn, t->comm, t->pid); | 202 | pfn, t->comm, t->pid); |
202 | si.si_signo = SIGBUS; | 203 | si.si_signo = SIGBUS; |
203 | si.si_errno = 0; | 204 | si.si_errno = 0; |
204 | si.si_code = BUS_MCEERR_AO; | ||
205 | si.si_addr = (void *)addr; | 205 | si.si_addr = (void *)addr; |
206 | #ifdef __ARCH_SI_TRAPNO | 206 | #ifdef __ARCH_SI_TRAPNO |
207 | si.si_trapno = trapno; | 207 | si.si_trapno = trapno; |
208 | #endif | 208 | #endif |
209 | si.si_addr_lsb = compound_trans_order(compound_head(page)) + PAGE_SHIFT; | 209 | si.si_addr_lsb = compound_trans_order(compound_head(page)) + PAGE_SHIFT; |
210 | /* | 210 | |
211 | * Don't use force here, it's convenient if the signal | 211 | if ((flags & MF_ACTION_REQUIRED) && t == current) { |
212 | * can be temporarily blocked. | 212 | si.si_code = BUS_MCEERR_AR; |
213 | * This could cause a loop when the user sets SIGBUS | 213 | ret = force_sig_info(SIGBUS, &si, t); |
214 | * to SIG_IGN, but hopefully no one will do that? | 214 | } else { |
215 | */ | 215 | /* |
216 | ret = send_sig_info(SIGBUS, &si, t); /* synchronous? */ | 216 | * Don't use force here, it's convenient if the signal |
217 | * can be temporarily blocked. | ||
218 | * This could cause a loop when the user sets SIGBUS | ||
219 | * to SIG_IGN, but hopefully no one will do that? | ||
220 | */ | ||
221 | si.si_code = BUS_MCEERR_AO; | ||
222 | ret = send_sig_info(SIGBUS, &si, t); /* synchronous? */ | ||
223 | } | ||
217 | if (ret < 0) | 224 | if (ret < 0) |
218 | printk(KERN_INFO "MCE: Error sending signal to %s:%d: %d\n", | 225 | printk(KERN_INFO "MCE: Error sending signal to %s:%d: %d\n", |
219 | t->comm, t->pid, ret); | 226 | t->comm, t->pid, ret); |
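From userspace, the BUS_MCEERR_AR/BUS_MCEERR_AO split introduced above is visible in siginfo: an "action required" SIGBUS is forced synchronously at the faulting access, while an "action optional" one may be delivered (or stay blocked) later, with si_addr and si_addr_lsb describing the poisoned region. A hedged illustration of a handler that tells the two apart (demo only, not derived from this patch):

#define _GNU_SOURCE
#include <signal.h>
#include <string.h>
#include <unistd.h>

static void sigbus_handler(int sig, siginfo_t *si, void *ctx)
{
	const char *msg = (si->si_code == BUS_MCEERR_AR) ?
		"SIGBUS: poisoned data consumed (action required)\n" :
		"SIGBUS: poison reported early (action optional)\n";

	write(STDERR_FILENO, msg, strlen(msg));
	if (si->si_code == BUS_MCEERR_AR)
		_exit(1);	/* cannot safely return to the faulting load */
}

int main(void)
{
	struct sigaction sa = { .sa_sigaction = sigbus_handler,
				.sa_flags = SA_SIGINFO };

	sigaction(SIGBUS, &sa, NULL);
	/* ... run workload; the kernel fills si_addr/si_addr_lsb on poison ... */
	return 0;
}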
@@ -338,8 +345,9 @@ static void add_to_kill(struct task_struct *tsk, struct page *p, | |||
338 | * Also when FAIL is set do a force kill because something went | 345 | * Also when FAIL is set do a force kill because something went |
339 | * wrong earlier. | 346 | * wrong earlier. |
340 | */ | 347 | */ |
341 | static void kill_procs_ao(struct list_head *to_kill, int doit, int trapno, | 348 | static void kill_procs(struct list_head *to_kill, int doit, int trapno, |
342 | int fail, struct page *page, unsigned long pfn) | 349 | int fail, struct page *page, unsigned long pfn, |
350 | int flags) | ||
343 | { | 351 | { |
344 | struct to_kill *tk, *next; | 352 | struct to_kill *tk, *next; |
345 | 353 | ||
@@ -363,8 +371,8 @@ static void kill_procs_ao(struct list_head *to_kill, int doit, int trapno, | |||
363 | * check for that, but we need to tell the | 371 | * check for that, but we need to tell the |
364 | * process anyways. | 372 | * process anyways. |
365 | */ | 373 | */ |
366 | else if (kill_proc_ao(tk->tsk, tk->addr, trapno, | 374 | else if (kill_proc(tk->tsk, tk->addr, trapno, |
367 | pfn, page) < 0) | 375 | pfn, page, flags) < 0) |
368 | printk(KERN_ERR | 376 | printk(KERN_ERR |
369 | "MCE %#lx: Cannot send advisory machine check signal to %s:%d\n", | 377 | "MCE %#lx: Cannot send advisory machine check signal to %s:%d\n", |
370 | pfn, tk->tsk->comm, tk->tsk->pid); | 378 | pfn, tk->tsk->comm, tk->tsk->pid); |
@@ -844,7 +852,7 @@ static int page_action(struct page_state *ps, struct page *p, | |||
844 | * the pages and send SIGBUS to the processes if the data was dirty. | 852 | * the pages and send SIGBUS to the processes if the data was dirty. |
845 | */ | 853 | */ |
846 | static int hwpoison_user_mappings(struct page *p, unsigned long pfn, | 854 | static int hwpoison_user_mappings(struct page *p, unsigned long pfn, |
847 | int trapno) | 855 | int trapno, int flags) |
848 | { | 856 | { |
849 | enum ttu_flags ttu = TTU_UNMAP | TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS; | 857 | enum ttu_flags ttu = TTU_UNMAP | TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS; |
850 | struct address_space *mapping; | 858 | struct address_space *mapping; |
@@ -962,8 +970,8 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, | |||
962 | * use a more force-full uncatchable kill to prevent | 970 | * use a more force-full uncatchable kill to prevent |
963 | * any accesses to the poisoned memory. | 971 | * any accesses to the poisoned memory. |
964 | */ | 972 | */ |
965 | kill_procs_ao(&tokill, !!PageDirty(ppage), trapno, | 973 | kill_procs(&tokill, !!PageDirty(ppage), trapno, |
966 | ret != SWAP_SUCCESS, p, pfn); | 974 | ret != SWAP_SUCCESS, p, pfn, flags); |
967 | 975 | ||
968 | return ret; | 976 | return ret; |
969 | } | 977 | } |
@@ -984,7 +992,25 @@ static void clear_page_hwpoison_huge_page(struct page *hpage) | |||
984 | ClearPageHWPoison(hpage + i); | 992 | ClearPageHWPoison(hpage + i); |
985 | } | 993 | } |
986 | 994 | ||
987 | int __memory_failure(unsigned long pfn, int trapno, int flags) | 995 | /** |
996 | * memory_failure - Handle memory failure of a page. | ||
997 | * @pfn: Page Number of the corrupted page | ||
998 | * @trapno: Trap number reported in the signal to user space. | ||
999 | * @flags: fine tune action taken | ||
1000 | * | ||
1001 | * This function is called by the low level machine check code | ||
1002 | * of an architecture when it detects hardware memory corruption | ||
1003 | * of a page. It tries its best to recover, which includes | ||
1004 | * dropping pages, killing processes etc. | ||
1005 | * | ||
1006 | * The function is primarily of use for corruptions that | ||
1007 | * happen outside the current execution context (e.g. when | ||
1008 | * detected by a background scrubber) | ||
1009 | * | ||
1010 | * Must run in process context (e.g. a work queue) with interrupts | ||
1011 | * enabled and no spinlocks hold. | ||
1012 | */ | ||
1013 | int memory_failure(unsigned long pfn, int trapno, int flags) | ||
988 | { | 1014 | { |
989 | struct page_state *ps; | 1015 | struct page_state *ps; |
990 | struct page *p; | 1016 | struct page *p; |
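With __memory_failure() renamed to memory_failure() and given a flags argument, the kerneldoc above describes the only calling convention left: process context, interrupts on, no spinlocks. A small hedged sketch of a hypothetical background-scrubber caller (names are illustrative, not from this patch); interrupt-context reporters would instead go through the deferred-work path shown further down in this file:

#include <linux/kernel.h>
#include <linux/mm.h>

/* Hypothetical scrubber hook: "pfn" came from hardware error records. */
static void scrubber_report_bad_page(unsigned long pfn)
{
	/*
	 * Found by scanning, not by a consuming load, so flags are 0
	 * ("action optional"); MF_ACTION_REQUIRED is reserved for errors
	 * hit in the current execution context.
	 */
	int ret = memory_failure(pfn, 0, 0);

	if (ret)
		pr_err("scrubber: recovery of pfn %#lx failed (%d)\n", pfn, ret);
}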
@@ -1063,7 +1089,7 @@ int __memory_failure(unsigned long pfn, int trapno, int flags) | |||
1063 | * The check (unnecessarily) ignores LRU pages being isolated and | 1089 | * The check (unnecessarily) ignores LRU pages being isolated and |
1064 | * walked by the page reclaim code, however that's not a big loss. | 1090 | * walked by the page reclaim code, however that's not a big loss. |
1065 | */ | 1091 | */ |
1066 | if (!PageHuge(p) && !PageTransCompound(p)) { | 1092 | if (!PageHuge(p) && !PageTransTail(p)) { |
1067 | if (!PageLRU(p)) | 1093 | if (!PageLRU(p)) |
1068 | shake_page(p, 0); | 1094 | shake_page(p, 0); |
1069 | if (!PageLRU(p)) { | 1095 | if (!PageLRU(p)) { |
@@ -1130,7 +1156,7 @@ int __memory_failure(unsigned long pfn, int trapno, int flags) | |||
1130 | * Now take care of user space mappings. | 1156 | * Now take care of user space mappings. |
1131 | * Abort on fail: __delete_from_page_cache() assumes unmapped page. | 1157 | * Abort on fail: __delete_from_page_cache() assumes unmapped page. |
1132 | */ | 1158 | */ |
1133 | if (hwpoison_user_mappings(p, pfn, trapno) != SWAP_SUCCESS) { | 1159 | if (hwpoison_user_mappings(p, pfn, trapno, flags) != SWAP_SUCCESS) { |
1134 | printk(KERN_ERR "MCE %#lx: cannot unmap page, give up\n", pfn); | 1160 | printk(KERN_ERR "MCE %#lx: cannot unmap page, give up\n", pfn); |
1135 | res = -EBUSY; | 1161 | res = -EBUSY; |
1136 | goto out; | 1162 | goto out; |
@@ -1156,29 +1182,7 @@ out: | |||
1156 | unlock_page(hpage); | 1182 | unlock_page(hpage); |
1157 | return res; | 1183 | return res; |
1158 | } | 1184 | } |
1159 | EXPORT_SYMBOL_GPL(__memory_failure); | 1185 | EXPORT_SYMBOL_GPL(memory_failure); |
1160 | |||
1161 | /** | ||
1162 | * memory_failure - Handle memory failure of a page. | ||
1163 | * @pfn: Page Number of the corrupted page | ||
1164 | * @trapno: Trap number reported in the signal to user space. | ||
1165 | * | ||
1166 | * This function is called by the low level machine check code | ||
1167 | * of an architecture when it detects hardware memory corruption | ||
1168 | * of a page. It tries its best to recover, which includes | ||
1169 | * dropping pages, killing processes etc. | ||
1170 | * | ||
1171 | * The function is primarily of use for corruptions that | ||
1172 | * happen outside the current execution context (e.g. when | ||
1173 | * detected by a background scrubber) | ||
1174 | * | ||
1175 | * Must run in process context (e.g. a work queue) with interrupts | ||
1176 | * enabled and no spinlocks hold. | ||
1177 | */ | ||
1178 | void memory_failure(unsigned long pfn, int trapno) | ||
1179 | { | ||
1180 | __memory_failure(pfn, trapno, 0); | ||
1181 | } | ||
1182 | 1186 | ||
1183 | #define MEMORY_FAILURE_FIFO_ORDER 4 | 1187 | #define MEMORY_FAILURE_FIFO_ORDER 4 |
1184 | #define MEMORY_FAILURE_FIFO_SIZE (1 << MEMORY_FAILURE_FIFO_ORDER) | 1188 | #define MEMORY_FAILURE_FIFO_SIZE (1 << MEMORY_FAILURE_FIFO_ORDER) |
@@ -1251,7 +1255,7 @@ static void memory_failure_work_func(struct work_struct *work) | |||
1251 | spin_unlock_irqrestore(&mf_cpu->lock, proc_flags); | 1255 | spin_unlock_irqrestore(&mf_cpu->lock, proc_flags); |
1252 | if (!gotten) | 1256 | if (!gotten) |
1253 | break; | 1257 | break; |
1254 | __memory_failure(entry.pfn, entry.trapno, entry.flags); | 1258 | memory_failure(entry.pfn, entry.trapno, entry.flags); |
1255 | } | 1259 | } |
1256 | } | 1260 | } |
1257 | 1261 | ||
diff --git a/mm/memory.c b/mm/memory.c index fa2f04e0337..6105f475fa8 100644 --- a/mm/memory.c +++ b/mm/memory.c | |||
@@ -125,17 +125,17 @@ core_initcall(init_zero_pfn); | |||
125 | 125 | ||
126 | #if defined(SPLIT_RSS_COUNTING) | 126 | #if defined(SPLIT_RSS_COUNTING) |
127 | 127 | ||
128 | static void __sync_task_rss_stat(struct task_struct *task, struct mm_struct *mm) | 128 | void sync_mm_rss(struct mm_struct *mm) |
129 | { | 129 | { |
130 | int i; | 130 | int i; |
131 | 131 | ||
132 | for (i = 0; i < NR_MM_COUNTERS; i++) { | 132 | for (i = 0; i < NR_MM_COUNTERS; i++) { |
133 | if (task->rss_stat.count[i]) { | 133 | if (current->rss_stat.count[i]) { |
134 | add_mm_counter(mm, i, task->rss_stat.count[i]); | 134 | add_mm_counter(mm, i, current->rss_stat.count[i]); |
135 | task->rss_stat.count[i] = 0; | 135 | current->rss_stat.count[i] = 0; |
136 | } | 136 | } |
137 | } | 137 | } |
138 | task->rss_stat.events = 0; | 138 | current->rss_stat.events = 0; |
139 | } | 139 | } |
140 | 140 | ||
141 | static void add_mm_counter_fast(struct mm_struct *mm, int member, int val) | 141 | static void add_mm_counter_fast(struct mm_struct *mm, int member, int val) |
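The sync_mm_rss() rework above drops the task argument because, under SPLIT_RSS_COUNTING, the batched deltas always live in current->rss_stat; the function now only folds the calling task's counts into the mm it is handed. A brief sketch of the call-site shape this implies (hypothetical caller, not part of this patch):

#include <linux/mm.h>
#include <linux/sched.h>

static void flush_rss_before_teardown(struct mm_struct *mm)
{
	/* Only current's own cached deltas can be flushed. */
	if (current->mm == mm)
		sync_mm_rss(mm);
}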
@@ -157,30 +157,7 @@ static void check_sync_rss_stat(struct task_struct *task) | |||
157 | if (unlikely(task != current)) | 157 | if (unlikely(task != current)) |
158 | return; | 158 | return; |
159 | if (unlikely(task->rss_stat.events++ > TASK_RSS_EVENTS_THRESH)) | 159 | if (unlikely(task->rss_stat.events++ > TASK_RSS_EVENTS_THRESH)) |
160 | __sync_task_rss_stat(task, task->mm); | 160 | sync_mm_rss(task->mm); |
161 | } | ||
162 | |||
163 | unsigned long get_mm_counter(struct mm_struct *mm, int member) | ||
164 | { | ||
165 | long val = 0; | ||
166 | |||
167 | /* | ||
168 | * Don't use task->mm here...for avoiding to use task_get_mm().. | ||
169 | * The caller must guarantee task->mm is not invalid. | ||
170 | */ | ||
171 | val = atomic_long_read(&mm->rss_stat.count[member]); | ||
172 | /* | ||
173 | * counter is updated in asynchronous manner and may go to minus. | ||
174 | * But it's never be expected number for users. | ||
175 | */ | ||
176 | if (val < 0) | ||
177 | return 0; | ||
178 | return (unsigned long)val; | ||
179 | } | ||
180 | |||
181 | void sync_mm_rss(struct task_struct *task, struct mm_struct *mm) | ||
182 | { | ||
183 | __sync_task_rss_stat(task, mm); | ||
184 | } | 161 | } |
185 | #else /* SPLIT_RSS_COUNTING */ | 162 | #else /* SPLIT_RSS_COUNTING */ |
186 | 163 | ||
@@ -661,7 +638,7 @@ static inline void add_mm_rss_vec(struct mm_struct *mm, int *rss) | |||
661 | int i; | 638 | int i; |
662 | 639 | ||
663 | if (current->mm == mm) | 640 | if (current->mm == mm) |
664 | sync_mm_rss(current, mm); | 641 | sync_mm_rss(mm); |
665 | for (i = 0; i < NR_MM_COUNTERS; i++) | 642 | for (i = 0; i < NR_MM_COUNTERS; i++) |
666 | if (rss[i]) | 643 | if (rss[i]) |
667 | add_mm_counter(mm, i, rss[i]); | 644 | add_mm_counter(mm, i, rss[i]); |
@@ -1247,16 +1224,24 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb, | |||
1247 | do { | 1224 | do { |
1248 | next = pmd_addr_end(addr, end); | 1225 | next = pmd_addr_end(addr, end); |
1249 | if (pmd_trans_huge(*pmd)) { | 1226 | if (pmd_trans_huge(*pmd)) { |
1250 | if (next-addr != HPAGE_PMD_SIZE) { | 1227 | if (next - addr != HPAGE_PMD_SIZE) { |
1251 | VM_BUG_ON(!rwsem_is_locked(&tlb->mm->mmap_sem)); | 1228 | VM_BUG_ON(!rwsem_is_locked(&tlb->mm->mmap_sem)); |
1252 | split_huge_page_pmd(vma->vm_mm, pmd); | 1229 | split_huge_page_pmd(vma->vm_mm, pmd); |
1253 | } else if (zap_huge_pmd(tlb, vma, pmd, addr)) | 1230 | } else if (zap_huge_pmd(tlb, vma, pmd, addr)) |
1254 | continue; | 1231 | goto next; |
1255 | /* fall through */ | 1232 | /* fall through */ |
1256 | } | 1233 | } |
1257 | if (pmd_none_or_clear_bad(pmd)) | 1234 | /* |
1258 | continue; | 1235 | * Here there can be other concurrent MADV_DONTNEED or |
1236 | * trans huge page faults running, and if the pmd is | ||
1237 | * none or trans huge it can change under us. This is | ||
1238 | * because MADV_DONTNEED holds the mmap_sem in read | ||
1239 | * mode. | ||
1240 | */ | ||
1241 | if (pmd_none_or_trans_huge_or_clear_bad(pmd)) | ||
1242 | goto next; | ||
1259 | next = zap_pte_range(tlb, vma, pmd, addr, next, details); | 1243 | next = zap_pte_range(tlb, vma, pmd, addr, next, details); |
1244 | next: | ||
1260 | cond_resched(); | 1245 | cond_resched(); |
1261 | } while (pmd++, addr = next, addr != end); | 1246 | } while (pmd++, addr = next, addr != end); |
1262 | 1247 | ||
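The helper used above, pmd_none_or_trans_huge_or_clear_bad(), is added by the same series in asm-generic/pgtable.h and is not shown here. The point of it is to read the pmd exactly once, since a concurrent huge-page fault (possible because MADV_DONTNEED only holds mmap_sem for read) can flip it from none to trans-huge underneath the walker. A rough sketch of its shape, stated as an assumption:

/* Sketch -- the real helper guards against *pmd changing under us. */
static inline int pmd_none_or_trans_huge_or_clear_bad(pmd_t *pmd)
{
	pmd_t pmdval = *pmd;

	barrier();	/* force a single read of *pmd */
	if (pmd_none(pmdval) || pmd_trans_huge(pmdval))
		return 1;	/* nothing for a pte-level walker to do */
	if (unlikely(pmd_bad(pmdval))) {
		pmd_clear_bad(pmd);
		return 1;
	}
	return 0;
}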
@@ -1282,10 +1267,10 @@ static inline unsigned long zap_pud_range(struct mmu_gather *tlb, | |||
1282 | return addr; | 1267 | return addr; |
1283 | } | 1268 | } |
1284 | 1269 | ||
1285 | static unsigned long unmap_page_range(struct mmu_gather *tlb, | 1270 | static void unmap_page_range(struct mmu_gather *tlb, |
1286 | struct vm_area_struct *vma, | 1271 | struct vm_area_struct *vma, |
1287 | unsigned long addr, unsigned long end, | 1272 | unsigned long addr, unsigned long end, |
1288 | struct zap_details *details) | 1273 | struct zap_details *details) |
1289 | { | 1274 | { |
1290 | pgd_t *pgd; | 1275 | pgd_t *pgd; |
1291 | unsigned long next; | 1276 | unsigned long next; |
@@ -1305,8 +1290,47 @@ static unsigned long unmap_page_range(struct mmu_gather *tlb, | |||
1305 | } while (pgd++, addr = next, addr != end); | 1290 | } while (pgd++, addr = next, addr != end); |
1306 | tlb_end_vma(tlb, vma); | 1291 | tlb_end_vma(tlb, vma); |
1307 | mem_cgroup_uncharge_end(); | 1292 | mem_cgroup_uncharge_end(); |
1293 | } | ||
1308 | 1294 | ||
1309 | return addr; | 1295 | |
1296 | static void unmap_single_vma(struct mmu_gather *tlb, | ||
1297 | struct vm_area_struct *vma, unsigned long start_addr, | ||
1298 | unsigned long end_addr, unsigned long *nr_accounted, | ||
1299 | struct zap_details *details) | ||
1300 | { | ||
1301 | unsigned long start = max(vma->vm_start, start_addr); | ||
1302 | unsigned long end; | ||
1303 | |||
1304 | if (start >= vma->vm_end) | ||
1305 | return; | ||
1306 | end = min(vma->vm_end, end_addr); | ||
1307 | if (end <= vma->vm_start) | ||
1308 | return; | ||
1309 | |||
1310 | if (vma->vm_flags & VM_ACCOUNT) | ||
1311 | *nr_accounted += (end - start) >> PAGE_SHIFT; | ||
1312 | |||
1313 | if (unlikely(is_pfn_mapping(vma))) | ||
1314 | untrack_pfn_vma(vma, 0, 0); | ||
1315 | |||
1316 | if (start != end) { | ||
1317 | if (unlikely(is_vm_hugetlb_page(vma))) { | ||
1318 | /* | ||
1319 | * It is undesirable to test vma->vm_file as it | ||
1320 | * should be non-null for valid hugetlb area. | ||
1321 | * However, vm_file will be NULL in the error | ||
1322 | * cleanup path of do_mmap_pgoff. When | ||
1323 | * hugetlbfs ->mmap method fails, | ||
1324 | * do_mmap_pgoff() nullifies vma->vm_file | ||
1325 | * before calling this function to clean up. | ||
1326 | * Since no pte has actually been setup, it is | ||
1327 | * safe to do nothing in this case. | ||
1328 | */ | ||
1329 | if (vma->vm_file) | ||
1330 | unmap_hugepage_range(vma, start, end, NULL); | ||
1331 | } else | ||
1332 | unmap_page_range(tlb, vma, start, end, details); | ||
1333 | } | ||
1310 | } | 1334 | } |
1311 | 1335 | ||
1312 | /** | 1336 | /** |
@@ -1318,8 +1342,6 @@ static unsigned long unmap_page_range(struct mmu_gather *tlb, | |||
1318 | * @nr_accounted: Place number of unmapped pages in vm-accountable vma's here | 1342 | * @nr_accounted: Place number of unmapped pages in vm-accountable vma's here |
1319 | * @details: details of nonlinear truncation or shared cache invalidation | 1343 | * @details: details of nonlinear truncation or shared cache invalidation |
1320 | * | 1344 | * |
1321 | * Returns the end address of the unmapping (restart addr if interrupted). | ||
1322 | * | ||
1323 | * Unmap all pages in the vma list. | 1345 | * Unmap all pages in the vma list. |
1324 | * | 1346 | * |
1325 | * Only addresses between `start' and `end' will be unmapped. | 1347 | * Only addresses between `start' and `end' will be unmapped. |
@@ -1331,55 +1353,18 @@ static unsigned long unmap_page_range(struct mmu_gather *tlb, | |||
1331 | * ensure that any thus-far unmapped pages are flushed before unmap_vmas() | 1353 | * ensure that any thus-far unmapped pages are flushed before unmap_vmas() |
1332 | * drops the lock and schedules. | 1354 | * drops the lock and schedules. |
1333 | */ | 1355 | */ |
1334 | unsigned long unmap_vmas(struct mmu_gather *tlb, | 1356 | void unmap_vmas(struct mmu_gather *tlb, |
1335 | struct vm_area_struct *vma, unsigned long start_addr, | 1357 | struct vm_area_struct *vma, unsigned long start_addr, |
1336 | unsigned long end_addr, unsigned long *nr_accounted, | 1358 | unsigned long end_addr, unsigned long *nr_accounted, |
1337 | struct zap_details *details) | 1359 | struct zap_details *details) |
1338 | { | 1360 | { |
1339 | unsigned long start = start_addr; | ||
1340 | struct mm_struct *mm = vma->vm_mm; | 1361 | struct mm_struct *mm = vma->vm_mm; |
1341 | 1362 | ||
1342 | mmu_notifier_invalidate_range_start(mm, start_addr, end_addr); | 1363 | mmu_notifier_invalidate_range_start(mm, start_addr, end_addr); |
1343 | for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next) { | 1364 | for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next) |
1344 | unsigned long end; | 1365 | unmap_single_vma(tlb, vma, start_addr, end_addr, nr_accounted, |
1345 | 1366 | details); | |
1346 | start = max(vma->vm_start, start_addr); | ||
1347 | if (start >= vma->vm_end) | ||
1348 | continue; | ||
1349 | end = min(vma->vm_end, end_addr); | ||
1350 | if (end <= vma->vm_start) | ||
1351 | continue; | ||
1352 | |||
1353 | if (vma->vm_flags & VM_ACCOUNT) | ||
1354 | *nr_accounted += (end - start) >> PAGE_SHIFT; | ||
1355 | |||
1356 | if (unlikely(is_pfn_mapping(vma))) | ||
1357 | untrack_pfn_vma(vma, 0, 0); | ||
1358 | |||
1359 | while (start != end) { | ||
1360 | if (unlikely(is_vm_hugetlb_page(vma))) { | ||
1361 | /* | ||
1362 | * It is undesirable to test vma->vm_file as it | ||
1363 | * should be non-null for valid hugetlb area. | ||
1364 | * However, vm_file will be NULL in the error | ||
1365 | * cleanup path of do_mmap_pgoff. When | ||
1366 | * hugetlbfs ->mmap method fails, | ||
1367 | * do_mmap_pgoff() nullifies vma->vm_file | ||
1368 | * before calling this function to clean up. | ||
1369 | * Since no pte has actually been setup, it is | ||
1370 | * safe to do nothing in this case. | ||
1371 | */ | ||
1372 | if (vma->vm_file) | ||
1373 | unmap_hugepage_range(vma, start, end, NULL); | ||
1374 | |||
1375 | start = end; | ||
1376 | } else | ||
1377 | start = unmap_page_range(tlb, vma, start, end, details); | ||
1378 | } | ||
1379 | } | ||
1380 | |||
1381 | mmu_notifier_invalidate_range_end(mm, start_addr, end_addr); | 1367 | mmu_notifier_invalidate_range_end(mm, start_addr, end_addr); |
1382 | return start; /* which is now the end (or restart) address */ | ||
1383 | } | 1368 | } |
1384 | 1369 | ||
1385 | /** | 1370 | /** |
@@ -1388,8 +1373,10 @@ unsigned long unmap_vmas(struct mmu_gather *tlb, | |||
1388 | * @address: starting address of pages to zap | 1373 | * @address: starting address of pages to zap |
1389 | * @size: number of bytes to zap | 1374 | * @size: number of bytes to zap |
1390 | * @details: details of nonlinear truncation or shared cache invalidation | 1375 | * @details: details of nonlinear truncation or shared cache invalidation |
1376 | * | ||
1377 | * Caller must protect the VMA list | ||
1391 | */ | 1378 | */ |
1392 | unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address, | 1379 | void zap_page_range(struct vm_area_struct *vma, unsigned long address, |
1393 | unsigned long size, struct zap_details *details) | 1380 | unsigned long size, struct zap_details *details) |
1394 | { | 1381 | { |
1395 | struct mm_struct *mm = vma->vm_mm; | 1382 | struct mm_struct *mm = vma->vm_mm; |
@@ -1400,9 +1387,34 @@ unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address, | |||
1400 | lru_add_drain(); | 1387 | lru_add_drain(); |
1401 | tlb_gather_mmu(&tlb, mm, 0); | 1388 | tlb_gather_mmu(&tlb, mm, 0); |
1402 | update_hiwater_rss(mm); | 1389 | update_hiwater_rss(mm); |
1403 | end = unmap_vmas(&tlb, vma, address, end, &nr_accounted, details); | 1390 | unmap_vmas(&tlb, vma, address, end, &nr_accounted, details); |
1391 | tlb_finish_mmu(&tlb, address, end); | ||
1392 | } | ||
1393 | |||
1394 | /** | ||
1395 | * zap_page_range_single - remove user pages in a given range | ||
1396 | * @vma: vm_area_struct holding the applicable pages | ||
1397 | * @address: starting address of pages to zap | ||
1398 | * @size: number of bytes to zap | ||
1399 | * @details: details of nonlinear truncation or shared cache invalidation | ||
1400 | * | ||
1401 | * The range must fit into one VMA. | ||
1402 | */ | ||
1403 | static void zap_page_range_single(struct vm_area_struct *vma, unsigned long address, | ||
1404 | unsigned long size, struct zap_details *details) | ||
1405 | { | ||
1406 | struct mm_struct *mm = vma->vm_mm; | ||
1407 | struct mmu_gather tlb; | ||
1408 | unsigned long end = address + size; | ||
1409 | unsigned long nr_accounted = 0; | ||
1410 | |||
1411 | lru_add_drain(); | ||
1412 | tlb_gather_mmu(&tlb, mm, 0); | ||
1413 | update_hiwater_rss(mm); | ||
1414 | mmu_notifier_invalidate_range_start(mm, address, end); | ||
1415 | unmap_single_vma(&tlb, vma, address, end, &nr_accounted, details); | ||
1416 | mmu_notifier_invalidate_range_end(mm, address, end); | ||
1404 | tlb_finish_mmu(&tlb, address, end); | 1417 | tlb_finish_mmu(&tlb, address, end); |
1405 | return end; | ||
1406 | } | 1418 | } |
1407 | 1419 | ||
1408 | /** | 1420 | /** |
@@ -1423,7 +1435,7 @@ int zap_vma_ptes(struct vm_area_struct *vma, unsigned long address, | |||
1423 | if (address < vma->vm_start || address + size > vma->vm_end || | 1435 | if (address < vma->vm_start || address + size > vma->vm_end || |
1424 | !(vma->vm_flags & VM_PFNMAP)) | 1436 | !(vma->vm_flags & VM_PFNMAP)) |
1425 | return -1; | 1437 | return -1; |
1426 | zap_page_range(vma, address, size, NULL); | 1438 | zap_page_range_single(vma, address, size, NULL); |
1427 | return 0; | 1439 | return 0; |
1428 | } | 1440 | } |
1429 | EXPORT_SYMBOL_GPL(zap_vma_ptes); | 1441 | EXPORT_SYMBOL_GPL(zap_vma_ptes); |
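zap_vma_ptes() is the exported consumer of the new single-VMA path: callers always pass a range inside one VM_PFNMAP mapping, which is exactly the contract zap_page_range_single() expects. A hedged sketch of a hypothetical driver using it to revoke a mapping it previously set up with remap_pfn_range():

#include <linux/mm.h>

/* Hypothetical driver helper: tear down the pfn mapping created at mmap time. */
static int mydrv_revoke_mapping(struct vm_area_struct *vma)
{
	/* The range must lie entirely within this one VM_PFNMAP vma. */
	return zap_vma_ptes(vma, vma->vm_start, vma->vm_end - vma->vm_start);
}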
@@ -2447,7 +2459,7 @@ static inline void cow_user_page(struct page *dst, struct page *src, unsigned lo | |||
2447 | * fails, we just zero-fill it. Live with it. | 2459 | * fails, we just zero-fill it. Live with it. |
2448 | */ | 2460 | */ |
2449 | if (unlikely(!src)) { | 2461 | if (unlikely(!src)) { |
2450 | void *kaddr = kmap_atomic(dst, KM_USER0); | 2462 | void *kaddr = kmap_atomic(dst); |
2451 | void __user *uaddr = (void __user *)(va & PAGE_MASK); | 2463 | void __user *uaddr = (void __user *)(va & PAGE_MASK); |
2452 | 2464 | ||
2453 | /* | 2465 | /* |
@@ -2458,7 +2470,7 @@ static inline void cow_user_page(struct page *dst, struct page *src, unsigned lo | |||
2458 | */ | 2470 | */ |
2459 | if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE)) | 2471 | if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE)) |
2460 | clear_page(kaddr); | 2472 | clear_page(kaddr); |
2461 | kunmap_atomic(kaddr, KM_USER0); | 2473 | kunmap_atomic(kaddr); |
2462 | flush_dcache_page(dst); | 2474 | flush_dcache_page(dst); |
2463 | } else | 2475 | } else |
2464 | copy_user_highpage(dst, src, va, vma); | 2476 | copy_user_highpage(dst, src, va, vma); |
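The dropped KM_USER0 argument reflects the stack-based kmap_atomic() API: slots are managed per CPU automatically, and kunmap_atomic() takes the mapped address rather than a slot index, with mappings released in reverse (LIFO) order. A minimal illustration of the new-style usage (generic example, not taken from this patch):

#include <linux/highmem.h>

static void zero_two_pages(struct page *a, struct page *b)
{
	void *ka = kmap_atomic(a);
	void *kb = kmap_atomic(b);

	clear_page(kb);
	clear_page(ka);

	kunmap_atomic(kb);	/* most recently mapped first */
	kunmap_atomic(ka);
}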
@@ -2770,7 +2782,7 @@ static void unmap_mapping_range_vma(struct vm_area_struct *vma, | |||
2770 | unsigned long start_addr, unsigned long end_addr, | 2782 | unsigned long start_addr, unsigned long end_addr, |
2771 | struct zap_details *details) | 2783 | struct zap_details *details) |
2772 | { | 2784 | { |
2773 | zap_page_range(vma, start_addr, end_addr - start_addr, details); | 2785 | zap_page_range_single(vma, start_addr, end_addr - start_addr, details); |
2774 | } | 2786 | } |
2775 | 2787 | ||
2776 | static inline void unmap_mapping_range_tree(struct prio_tree_root *root, | 2788 | static inline void unmap_mapping_range_tree(struct prio_tree_root *root, |
@@ -3611,13 +3623,7 @@ static int __init gate_vma_init(void) | |||
3611 | gate_vma.vm_end = FIXADDR_USER_END; | 3623 | gate_vma.vm_end = FIXADDR_USER_END; |
3612 | gate_vma.vm_flags = VM_READ | VM_MAYREAD | VM_EXEC | VM_MAYEXEC; | 3624 | gate_vma.vm_flags = VM_READ | VM_MAYREAD | VM_EXEC | VM_MAYEXEC; |
3613 | gate_vma.vm_page_prot = __P101; | 3625 | gate_vma.vm_page_prot = __P101; |
3614 | /* | 3626 | |
3615 | * Make sure the vDSO gets into every core dump. | ||
3616 | * Dumping its contents makes post-mortem fully interpretable later | ||
3617 | * without matching up the same kernel and hardware config to see | ||
3618 | * what PC values meant. | ||
3619 | */ | ||
3620 | gate_vma.vm_flags |= VM_ALWAYSDUMP; | ||
3621 | return 0; | 3627 | return 0; |
3622 | } | 3628 | } |
3623 | __initcall(gate_vma_init); | 3629 | __initcall(gate_vma_init); |
diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 06b145fb64a..cfb6c867875 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c | |||
@@ -512,7 +512,7 @@ static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud, | |||
512 | do { | 512 | do { |
513 | next = pmd_addr_end(addr, end); | 513 | next = pmd_addr_end(addr, end); |
514 | split_huge_page_pmd(vma->vm_mm, pmd); | 514 | split_huge_page_pmd(vma->vm_mm, pmd); |
515 | if (pmd_none_or_clear_bad(pmd)) | 515 | if (pmd_none_or_trans_huge_or_clear_bad(pmd)) |
516 | continue; | 516 | continue; |
517 | if (check_pte_range(vma, pmd, addr, next, nodes, | 517 | if (check_pte_range(vma, pmd, addr, next, nodes, |
518 | flags, private)) | 518 | flags, private)) |
@@ -640,10 +640,11 @@ static int mbind_range(struct mm_struct *mm, unsigned long start, | |||
640 | unsigned long vmstart; | 640 | unsigned long vmstart; |
641 | unsigned long vmend; | 641 | unsigned long vmend; |
642 | 642 | ||
643 | vma = find_vma_prev(mm, start, &prev); | 643 | vma = find_vma(mm, start); |
644 | if (!vma || vma->vm_start > start) | 644 | if (!vma || vma->vm_start > start) |
645 | return -EFAULT; | 645 | return -EFAULT; |
646 | 646 | ||
647 | prev = vma->vm_prev; | ||
647 | if (start > vma->vm_start) | 648 | if (start > vma->vm_start) |
648 | prev = vma; | 649 | prev = vma; |
649 | 650 | ||
@@ -1322,12 +1323,9 @@ SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode, | |||
1322 | err = -ESRCH; | 1323 | err = -ESRCH; |
1323 | goto out; | 1324 | goto out; |
1324 | } | 1325 | } |
1325 | mm = get_task_mm(task); | 1326 | get_task_struct(task); |
1326 | rcu_read_unlock(); | ||
1327 | 1327 | ||
1328 | err = -EINVAL; | 1328 | err = -EINVAL; |
1329 | if (!mm) | ||
1330 | goto out; | ||
1331 | 1329 | ||
1332 | /* | 1330 | /* |
1333 | * Check if this process has the right to modify the specified | 1331 | * Check if this process has the right to modify the specified |
@@ -1335,14 +1333,13 @@ SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode, | |||
1335 | * capabilities, superuser privileges or the same | 1333 | * capabilities, superuser privileges or the same |
1336 | * userid as the target process. | 1334 | * userid as the target process. |
1337 | */ | 1335 | */ |
1338 | rcu_read_lock(); | ||
1339 | tcred = __task_cred(task); | 1336 | tcred = __task_cred(task); |
1340 | if (cred->euid != tcred->suid && cred->euid != tcred->uid && | 1337 | if (cred->euid != tcred->suid && cred->euid != tcred->uid && |
1341 | cred->uid != tcred->suid && cred->uid != tcred->uid && | 1338 | cred->uid != tcred->suid && cred->uid != tcred->uid && |
1342 | !capable(CAP_SYS_NICE)) { | 1339 | !capable(CAP_SYS_NICE)) { |
1343 | rcu_read_unlock(); | 1340 | rcu_read_unlock(); |
1344 | err = -EPERM; | 1341 | err = -EPERM; |
1345 | goto out; | 1342 | goto out_put; |
1346 | } | 1343 | } |
1347 | rcu_read_unlock(); | 1344 | rcu_read_unlock(); |
1348 | 1345 | ||
@@ -1350,26 +1347,36 @@ SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode, | |||
1350 | /* Is the user allowed to access the target nodes? */ | 1347 | /* Is the user allowed to access the target nodes? */ |
1351 | if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) { | 1348 | if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) { |
1352 | err = -EPERM; | 1349 | err = -EPERM; |
1353 | goto out; | 1350 | goto out_put; |
1354 | } | 1351 | } |
1355 | 1352 | ||
1356 | if (!nodes_subset(*new, node_states[N_HIGH_MEMORY])) { | 1353 | if (!nodes_subset(*new, node_states[N_HIGH_MEMORY])) { |
1357 | err = -EINVAL; | 1354 | err = -EINVAL; |
1358 | goto out; | 1355 | goto out_put; |
1359 | } | 1356 | } |
1360 | 1357 | ||
1361 | err = security_task_movememory(task); | 1358 | err = security_task_movememory(task); |
1362 | if (err) | 1359 | if (err) |
1363 | goto out; | 1360 | goto out_put; |
1364 | 1361 | ||
1365 | err = do_migrate_pages(mm, old, new, | 1362 | mm = get_task_mm(task); |
1366 | capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE); | 1363 | put_task_struct(task); |
1367 | out: | ||
1368 | if (mm) | 1364 | if (mm) |
1369 | mmput(mm); | 1365 | err = do_migrate_pages(mm, old, new, |
1366 | capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE); | ||
1367 | else | ||
1368 | err = -EINVAL; | ||
1369 | |||
1370 | mmput(mm); | ||
1371 | out: | ||
1370 | NODEMASK_SCRATCH_FREE(scratch); | 1372 | NODEMASK_SCRATCH_FREE(scratch); |
1371 | 1373 | ||
1372 | return err; | 1374 | return err; |
1375 | |||
1376 | out_put: | ||
1377 | put_task_struct(task); | ||
1378 | goto out; | ||
1379 | |||
1373 | } | 1380 | } |
1374 | 1381 | ||
1375 | 1382 | ||
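Both the migrate_pages() and move_pages() entry points are converted to the same reference pattern: pin the target task with get_task_struct() while still under RCU, finish the credential and security checks, and only then trade the task reference for an mm reference. A hedged, condensed sketch of that shape (may_modify() stands in for the inline uid/capability test and is not a real kernel function):

	static long act_on_task_mm(pid_t pid)
	{
		struct task_struct *task;
		struct mm_struct *mm;
		long err;

		rcu_read_lock();
		task = find_task_by_vpid(pid);
		if (!task) {
			rcu_read_unlock();
			return -ESRCH;
		}
		get_task_struct(task);			/* task can no longer be freed under us */

		if (!may_modify(__task_cred(task))) {	/* hypothetical permission check */
			rcu_read_unlock();
			err = -EPERM;
			goto out_put;
		}
		rcu_read_unlock();

		mm = get_task_mm(task);			/* NULL if the target already exited */
		put_task_struct(task);
		if (!mm)
			return -EINVAL;

		/* ... operate on mm ... */
		mmput(mm);
		return 0;

	out_put:
		put_task_struct(task);
		return err;
	}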
@@ -1843,18 +1850,24 @@ struct page * | |||
1843 | alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, | 1850 | alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, |
1844 | unsigned long addr, int node) | 1851 | unsigned long addr, int node) |
1845 | { | 1852 | { |
1846 | struct mempolicy *pol = get_vma_policy(current, vma, addr); | 1853 | struct mempolicy *pol; |
1847 | struct zonelist *zl; | 1854 | struct zonelist *zl; |
1848 | struct page *page; | 1855 | struct page *page; |
1856 | unsigned int cpuset_mems_cookie; | ||
1857 | |||
1858 | retry_cpuset: | ||
1859 | pol = get_vma_policy(current, vma, addr); | ||
1860 | cpuset_mems_cookie = get_mems_allowed(); | ||
1849 | 1861 | ||
1850 | get_mems_allowed(); | ||
1851 | if (unlikely(pol->mode == MPOL_INTERLEAVE)) { | 1862 | if (unlikely(pol->mode == MPOL_INTERLEAVE)) { |
1852 | unsigned nid; | 1863 | unsigned nid; |
1853 | 1864 | ||
1854 | nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order); | 1865 | nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order); |
1855 | mpol_cond_put(pol); | 1866 | mpol_cond_put(pol); |
1856 | page = alloc_page_interleave(gfp, order, nid); | 1867 | page = alloc_page_interleave(gfp, order, nid); |
1857 | put_mems_allowed(); | 1868 | if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page)) |
1869 | goto retry_cpuset; | ||
1870 | |||
1858 | return page; | 1871 | return page; |
1859 | } | 1872 | } |
1860 | zl = policy_zonelist(gfp, pol, node); | 1873 | zl = policy_zonelist(gfp, pol, node); |
@@ -1865,7 +1878,8 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, | |||
1865 | struct page *page = __alloc_pages_nodemask(gfp, order, | 1878 | struct page *page = __alloc_pages_nodemask(gfp, order, |
1866 | zl, policy_nodemask(gfp, pol)); | 1879 | zl, policy_nodemask(gfp, pol)); |
1867 | __mpol_put(pol); | 1880 | __mpol_put(pol); |
1868 | put_mems_allowed(); | 1881 | if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page)) |
1882 | goto retry_cpuset; | ||
1869 | return page; | 1883 | return page; |
1870 | } | 1884 | } |
1871 | /* | 1885 | /* |
@@ -1873,7 +1887,8 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, | |||
1873 | */ | 1887 | */ |
1874 | page = __alloc_pages_nodemask(gfp, order, zl, | 1888 | page = __alloc_pages_nodemask(gfp, order, zl, |
1875 | policy_nodemask(gfp, pol)); | 1889 | policy_nodemask(gfp, pol)); |
1876 | put_mems_allowed(); | 1890 | if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page)) |
1891 | goto retry_cpuset; | ||
1877 | return page; | 1892 | return page; |
1878 | } | 1893 | } |
1879 | 1894 | ||
@@ -1900,11 +1915,14 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order) | |||
1900 | { | 1915 | { |
1901 | struct mempolicy *pol = current->mempolicy; | 1916 | struct mempolicy *pol = current->mempolicy; |
1902 | struct page *page; | 1917 | struct page *page; |
1918 | unsigned int cpuset_mems_cookie; | ||
1903 | 1919 | ||
1904 | if (!pol || in_interrupt() || (gfp & __GFP_THISNODE)) | 1920 | if (!pol || in_interrupt() || (gfp & __GFP_THISNODE)) |
1905 | pol = &default_policy; | 1921 | pol = &default_policy; |
1906 | 1922 | ||
1907 | get_mems_allowed(); | 1923 | retry_cpuset: |
1924 | cpuset_mems_cookie = get_mems_allowed(); | ||
1925 | |||
1908 | /* | 1926 | /* |
1909 | * No reference counting needed for current->mempolicy | 1927 | * No reference counting needed for current->mempolicy |
1910 | * nor system default_policy | 1928 | * nor system default_policy |
@@ -1915,7 +1933,10 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order) | |||
1915 | page = __alloc_pages_nodemask(gfp, order, | 1933 | page = __alloc_pages_nodemask(gfp, order, |
1916 | policy_zonelist(gfp, pol, numa_node_id()), | 1934 | policy_zonelist(gfp, pol, numa_node_id()), |
1917 | policy_nodemask(gfp, pol)); | 1935 | policy_nodemask(gfp, pol)); |
1918 | put_mems_allowed(); | 1936 | |
1937 | if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page)) | ||
1938 | goto retry_cpuset; | ||
1939 | |||
1919 | return page; | 1940 | return page; |
1920 | } | 1941 | } |
1921 | EXPORT_SYMBOL(alloc_pages_current); | 1942 | EXPORT_SYMBOL(alloc_pages_current); |
diff --git a/mm/migrate.c b/mm/migrate.c index 9871a56d82c..51c08a0c6f6 100644 --- a/mm/migrate.c +++ b/mm/migrate.c | |||
@@ -445,7 +445,6 @@ void migrate_page_copy(struct page *newpage, struct page *page) | |||
445 | ClearPageSwapCache(page); | 445 | ClearPageSwapCache(page); |
446 | ClearPagePrivate(page); | 446 | ClearPagePrivate(page); |
447 | set_page_private(page, 0); | 447 | set_page_private(page, 0); |
448 | page->mapping = NULL; | ||
449 | 448 | ||
450 | /* | 449 | /* |
451 | * If any waiters have accumulated on the new page then | 450 | * If any waiters have accumulated on the new page then |
@@ -667,6 +666,7 @@ static int move_to_new_page(struct page *newpage, struct page *page, | |||
667 | } else { | 666 | } else { |
668 | if (remap_swapcache) | 667 | if (remap_swapcache) |
669 | remove_migration_ptes(page, newpage); | 668 | remove_migration_ptes(page, newpage); |
669 | page->mapping = NULL; | ||
670 | } | 670 | } |
671 | 671 | ||
672 | unlock_page(newpage); | 672 | unlock_page(newpage); |
@@ -839,8 +839,6 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, | |||
839 | if (!newpage) | 839 | if (!newpage) |
840 | return -ENOMEM; | 840 | return -ENOMEM; |
841 | 841 | ||
842 | mem_cgroup_reset_owner(newpage); | ||
843 | |||
844 | if (page_count(page) == 1) { | 842 | if (page_count(page) == 1) { |
845 | /* page was freed from under us. So we are done. */ | 843 | /* page was freed from under us. So we are done. */ |
846 | goto out; | 844 | goto out; |
@@ -1176,20 +1174,17 @@ set_status: | |||
1176 | * Migrate an array of page address onto an array of nodes and fill | 1174 | * Migrate an array of page address onto an array of nodes and fill |
1177 | * the corresponding array of status. | 1175 | * the corresponding array of status. |
1178 | */ | 1176 | */ |
1179 | static int do_pages_move(struct mm_struct *mm, struct task_struct *task, | 1177 | static int do_pages_move(struct mm_struct *mm, nodemask_t task_nodes, |
1180 | unsigned long nr_pages, | 1178 | unsigned long nr_pages, |
1181 | const void __user * __user *pages, | 1179 | const void __user * __user *pages, |
1182 | const int __user *nodes, | 1180 | const int __user *nodes, |
1183 | int __user *status, int flags) | 1181 | int __user *status, int flags) |
1184 | { | 1182 | { |
1185 | struct page_to_node *pm; | 1183 | struct page_to_node *pm; |
1186 | nodemask_t task_nodes; | ||
1187 | unsigned long chunk_nr_pages; | 1184 | unsigned long chunk_nr_pages; |
1188 | unsigned long chunk_start; | 1185 | unsigned long chunk_start; |
1189 | int err; | 1186 | int err; |
1190 | 1187 | ||
1191 | task_nodes = cpuset_mems_allowed(task); | ||
1192 | |||
1193 | err = -ENOMEM; | 1188 | err = -ENOMEM; |
1194 | pm = (struct page_to_node *)__get_free_page(GFP_KERNEL); | 1189 | pm = (struct page_to_node *)__get_free_page(GFP_KERNEL); |
1195 | if (!pm) | 1190 | if (!pm) |
@@ -1351,6 +1346,7 @@ SYSCALL_DEFINE6(move_pages, pid_t, pid, unsigned long, nr_pages, | |||
1351 | struct task_struct *task; | 1346 | struct task_struct *task; |
1352 | struct mm_struct *mm; | 1347 | struct mm_struct *mm; |
1353 | int err; | 1348 | int err; |
1349 | nodemask_t task_nodes; | ||
1354 | 1350 | ||
1355 | /* Check flags */ | 1351 | /* Check flags */ |
1356 | if (flags & ~(MPOL_MF_MOVE|MPOL_MF_MOVE_ALL)) | 1352 | if (flags & ~(MPOL_MF_MOVE|MPOL_MF_MOVE_ALL)) |
@@ -1366,11 +1362,7 @@ SYSCALL_DEFINE6(move_pages, pid_t, pid, unsigned long, nr_pages, | |||
1366 | rcu_read_unlock(); | 1362 | rcu_read_unlock(); |
1367 | return -ESRCH; | 1363 | return -ESRCH; |
1368 | } | 1364 | } |
1369 | mm = get_task_mm(task); | 1365 | get_task_struct(task); |
1370 | rcu_read_unlock(); | ||
1371 | |||
1372 | if (!mm) | ||
1373 | return -EINVAL; | ||
1374 | 1366 | ||
1375 | /* | 1367 | /* |
1376 | * Check if this process has the right to modify the specified | 1368 | * Check if this process has the right to modify the specified |
@@ -1378,7 +1370,6 @@ SYSCALL_DEFINE6(move_pages, pid_t, pid, unsigned long, nr_pages, | |||
1378 | * capabilities, superuser privileges or the same | 1370 | * capabilities, superuser privileges or the same |
1379 | * userid as the target process. | 1371 | * userid as the target process. |
1380 | */ | 1372 | */ |
1381 | rcu_read_lock(); | ||
1382 | tcred = __task_cred(task); | 1373 | tcred = __task_cred(task); |
1383 | if (cred->euid != tcred->suid && cred->euid != tcred->uid && | 1374 | if (cred->euid != tcred->suid && cred->euid != tcred->uid && |
1384 | cred->uid != tcred->suid && cred->uid != tcred->uid && | 1375 | cred->uid != tcred->suid && cred->uid != tcred->uid && |
@@ -1393,16 +1384,25 @@ SYSCALL_DEFINE6(move_pages, pid_t, pid, unsigned long, nr_pages, | |||
1393 | if (err) | 1384 | if (err) |
1394 | goto out; | 1385 | goto out; |
1395 | 1386 | ||
1396 | if (nodes) { | 1387 | task_nodes = cpuset_mems_allowed(task); |
1397 | err = do_pages_move(mm, task, nr_pages, pages, nodes, status, | 1388 | mm = get_task_mm(task); |
1398 | flags); | 1389 | put_task_struct(task); |
1399 | } else { | 1390 | |
1400 | err = do_pages_stat(mm, nr_pages, pages, status); | 1391 | if (mm) { |
1401 | } | 1392 | if (nodes) |
1393 | err = do_pages_move(mm, task_nodes, nr_pages, pages, | ||
1394 | nodes, status, flags); | ||
1395 | else | ||
1396 | err = do_pages_stat(mm, nr_pages, pages, status); | ||
1397 | } else | ||
1398 | err = -EINVAL; | ||
1402 | 1399 | ||
1403 | out: | ||
1404 | mmput(mm); | 1400 | mmput(mm); |
1405 | return err; | 1401 | return err; |
1402 | |||
1403 | out: | ||
1404 | put_task_struct(task); | ||
1405 | return err; | ||
1406 | } | 1406 | } |
1407 | 1407 | ||
1408 | /* | 1408 | /* |
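From userspace, the entry point reworked above is reached through move_pages(2). A small, hedged illustration using the libnuma declaration in <numaif.h> (build with -lnuma; assumes a NUMA-enabled kernel, and error handling is trimmed):

	#include <numaif.h>
	#include <stdio.h>
	#include <stdlib.h>
	#include <string.h>
	#include <unistd.h>

	int main(void)
	{
		long psz = sysconf(_SC_PAGESIZE);
		void *buf;
		void *pages[1];
		int nodes[1] = { 0 };			/* ask for node 0 */
		int status[1];

		if (posix_memalign(&buf, psz, psz))
			return 1;
		memset(buf, 0, psz);			/* fault the page in first */
		pages[0] = buf;

		if (move_pages(0 /* self */, 1, pages, nodes, status, MPOL_MF_MOVE))
			perror("move_pages");
		else
			printf("page is now on node %d\n", status[0]);
		free(buf);
		return 0;
	}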
diff --git a/mm/mincore.c b/mm/mincore.c index 636a86876ff..936b4cee8cb 100644 --- a/mm/mincore.c +++ b/mm/mincore.c | |||
@@ -164,7 +164,7 @@ static void mincore_pmd_range(struct vm_area_struct *vma, pud_t *pud, | |||
164 | } | 164 | } |
165 | /* fall through */ | 165 | /* fall through */ |
166 | } | 166 | } |
167 | if (pmd_none_or_clear_bad(pmd)) | 167 | if (pmd_none_or_trans_huge_or_clear_bad(pmd)) |
168 | mincore_unmapped_range(vma, addr, next, vec); | 168 | mincore_unmapped_range(vma, addr, next, vec); |
169 | else | 169 | else |
170 | mincore_pte_range(vma, pmd, addr, next, vec); | 170 | mincore_pte_range(vma, pmd, addr, next, vec); |
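This is the same conversion applied to check_pmd_range() and walk_pmd_range(): with only mmap_sem held for read, a pmd can concurrently become (or stop being) a transparent huge pmd, and the plain pmd_none_or_clear_bad() could mistake such an entry for a corrupt one. A hedged sketch of the walker idiom these call sites converge on (walk_ptes() is a hypothetical pte-level handler):

	pmd = pmd_offset(pud, addr);
	do {
		next = pmd_addr_end(addr, end);
		split_huge_page_pmd(mm, pmd);
		if (pmd_none_or_trans_huge_or_clear_bad(pmd))
			continue;			/* nothing stable to walk here */
		walk_ptes(pmd, addr, next);
	} while (pmd++, addr = next, addr != end);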
diff --git a/mm/mlock.c b/mm/mlock.c index 4f4f53bdc65..ef726e8aa8e 100644 --- a/mm/mlock.c +++ b/mm/mlock.c | |||
@@ -385,10 +385,11 @@ static int do_mlock(unsigned long start, size_t len, int on) | |||
385 | return -EINVAL; | 385 | return -EINVAL; |
386 | if (end == start) | 386 | if (end == start) |
387 | return 0; | 387 | return 0; |
388 | vma = find_vma_prev(current->mm, start, &prev); | 388 | vma = find_vma(current->mm, start); |
389 | if (!vma || vma->vm_start > start) | 389 | if (!vma || vma->vm_start > start) |
390 | return -ENOMEM; | 390 | return -ENOMEM; |
391 | 391 | ||
392 | prev = vma->vm_prev; | ||
392 | if (start > vma->vm_start) | 393 | if (start > vma->vm_start) |
393 | prev = vma; | 394 | prev = vma; |
394 | 395 | ||
@@ -451,9 +451,8 @@ static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma, | |||
451 | } | 451 | } |
452 | 452 | ||
453 | /* | 453 | /* |
454 | * Helper for vma_adjust in the split_vma insert case: | 454 | * Helper for vma_adjust() in the split_vma insert case: insert a vma into the |
455 | * insert vm structure into list and rbtree and anon_vma, | 455 | * mm's list and rbtree. It has already been inserted into the prio_tree. |
456 | * but it has already been inserted into prio_tree earlier. | ||
457 | */ | 456 | */ |
458 | static void __insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma) | 457 | static void __insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma) |
459 | { | 458 | { |
@@ -936,6 +935,19 @@ void vm_stat_account(struct mm_struct *mm, unsigned long flags, | |||
936 | #endif /* CONFIG_PROC_FS */ | 935 | #endif /* CONFIG_PROC_FS */ |
937 | 936 | ||
938 | /* | 937 | /* |
938 | * If a hint addr is less than mmap_min_addr change hint to be as | ||
939 | * low as possible but still greater than mmap_min_addr | ||
940 | */ | ||
941 | static inline unsigned long round_hint_to_min(unsigned long hint) | ||
942 | { | ||
943 | hint &= PAGE_MASK; | ||
944 | if (((void *)hint != NULL) && | ||
945 | (hint < mmap_min_addr)) | ||
946 | return PAGE_ALIGN(mmap_min_addr); | ||
947 | return hint; | ||
948 | } | ||
949 | |||
950 | /* | ||
939 | * The caller must hold down_write(¤t->mm->mmap_sem). | 951 | * The caller must hold down_write(¤t->mm->mmap_sem). |
940 | */ | 952 | */ |
941 | 953 | ||
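round_hint_to_min() only adjusts the placement hint; it never fails the mapping. A hedged userspace illustration of the visible behaviour (the address actually returned depends on vm.mmap_min_addr and the layout, so treat the output as indicative only):

	#include <stdio.h>
	#include <sys/mman.h>

	int main(void)
	{
		void *p = mmap((void *)0x1000, 4096, PROT_READ | PROT_WRITE,
			       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
		if (p == MAP_FAILED) {
			perror("mmap");
			return 1;
		}
		printf("requested 0x1000, got %p\n", p);	/* always at or above mmap_min_addr */
		munmap(p, 4096);
		return 0;
	}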
@@ -1099,9 +1111,9 @@ SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len, | |||
1099 | * A dummy user value is used because we are not locking | 1111 | * A dummy user value is used because we are not locking |
1100 | * memory so no accounting is necessary | 1112 | * memory so no accounting is necessary |
1101 | */ | 1113 | */ |
1102 | len = ALIGN(len, huge_page_size(&default_hstate)); | 1114 | file = hugetlb_file_setup(HUGETLB_ANON_FILE, addr, len, |
1103 | file = hugetlb_file_setup(HUGETLB_ANON_FILE, len, VM_NORESERVE, | 1115 | VM_NORESERVE, &user, |
1104 | &user, HUGETLB_ANONHUGE_INODE); | 1116 | HUGETLB_ANONHUGE_INODE); |
1105 | if (IS_ERR(file)) | 1117 | if (IS_ERR(file)) |
1106 | return PTR_ERR(file); | 1118 | return PTR_ERR(file); |
1107 | } | 1119 | } |
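With hugetlb_file_setup() now taking the address and length directly, the anonymous-hugetlb case is driven entirely from the MAP_HUGETLB branch of mmap_pgoff(). A hedged userspace sketch of that path (assumes a 2 MiB default huge page size, huge pages pre-reserved via vm.nr_hugepages, and a libc that exposes MAP_HUGETLB):

	#define _GNU_SOURCE
	#include <stdio.h>
	#include <sys/mman.h>

	int main(void)
	{
		size_t len = 2UL * 1024 * 1024;		/* one 2 MiB huge page */
		void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
			       MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);

		if (p == MAP_FAILED) {
			perror("mmap(MAP_HUGETLB)");	/* e.g. no huge pages reserved */
			return 1;
		}
		((char *)p)[0] = 1;			/* touch it to fault in the huge page */
		munmap(p, len);
		return 0;
	}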
@@ -1235,7 +1247,7 @@ munmap_back: | |||
1235 | */ | 1247 | */ |
1236 | if (accountable_mapping(file, vm_flags)) { | 1248 | if (accountable_mapping(file, vm_flags)) { |
1237 | charged = len >> PAGE_SHIFT; | 1249 | charged = len >> PAGE_SHIFT; |
1238 | if (security_vm_enough_memory(charged)) | 1250 | if (security_vm_enough_memory_mm(mm, charged)) |
1239 | return -ENOMEM; | 1251 | return -ENOMEM; |
1240 | vm_flags |= VM_ACCOUNT; | 1252 | vm_flags |= VM_ACCOUNT; |
1241 | } | 1253 | } |
@@ -1266,8 +1278,9 @@ munmap_back: | |||
1266 | vma->vm_pgoff = pgoff; | 1278 | vma->vm_pgoff = pgoff; |
1267 | INIT_LIST_HEAD(&vma->anon_vma_chain); | 1279 | INIT_LIST_HEAD(&vma->anon_vma_chain); |
1268 | 1280 | ||
1281 | error = -EINVAL; /* when rejecting VM_GROWSDOWN|VM_GROWSUP */ | ||
1282 | |||
1269 | if (file) { | 1283 | if (file) { |
1270 | error = -EINVAL; | ||
1271 | if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP)) | 1284 | if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP)) |
1272 | goto free_vma; | 1285 | goto free_vma; |
1273 | if (vm_flags & VM_DENYWRITE) { | 1286 | if (vm_flags & VM_DENYWRITE) { |
@@ -1293,6 +1306,8 @@ munmap_back: | |||
1293 | pgoff = vma->vm_pgoff; | 1306 | pgoff = vma->vm_pgoff; |
1294 | vm_flags = vma->vm_flags; | 1307 | vm_flags = vma->vm_flags; |
1295 | } else if (vm_flags & VM_SHARED) { | 1308 | } else if (vm_flags & VM_SHARED) { |
1309 | if (unlikely(vm_flags & (VM_GROWSDOWN|VM_GROWSUP))) | ||
1310 | goto free_vma; | ||
1296 | error = shmem_zero_setup(vma); | 1311 | error = shmem_zero_setup(vma); |
1297 | if (error) | 1312 | if (error) |
1298 | goto free_vma; | 1313 | goto free_vma; |
@@ -1423,10 +1438,8 @@ void arch_unmap_area(struct mm_struct *mm, unsigned long addr) | |||
1423 | /* | 1438 | /* |
1424 | * Is this a new hole at the lowest possible address? | 1439 | * Is this a new hole at the lowest possible address? |
1425 | */ | 1440 | */ |
1426 | if (addr >= TASK_UNMAPPED_BASE && addr < mm->free_area_cache) { | 1441 | if (addr >= TASK_UNMAPPED_BASE && addr < mm->free_area_cache) |
1427 | mm->free_area_cache = addr; | 1442 | mm->free_area_cache = addr; |
1428 | mm->cached_hole_size = ~0UL; | ||
1429 | } | ||
1430 | } | 1443 | } |
1431 | 1444 | ||
1432 | /* | 1445 | /* |
@@ -1441,7 +1454,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, | |||
1441 | { | 1454 | { |
1442 | struct vm_area_struct *vma; | 1455 | struct vm_area_struct *vma; |
1443 | struct mm_struct *mm = current->mm; | 1456 | struct mm_struct *mm = current->mm; |
1444 | unsigned long addr = addr0; | 1457 | unsigned long addr = addr0, start_addr; |
1445 | 1458 | ||
1446 | /* requested length too big for entire address space */ | 1459 | /* requested length too big for entire address space */ |
1447 | if (len > TASK_SIZE) | 1460 | if (len > TASK_SIZE) |
@@ -1465,22 +1478,14 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, | |||
1465 | mm->free_area_cache = mm->mmap_base; | 1478 | mm->free_area_cache = mm->mmap_base; |
1466 | } | 1479 | } |
1467 | 1480 | ||
1481 | try_again: | ||
1468 | /* either no address requested or can't fit in requested address hole */ | 1482 | /* either no address requested or can't fit in requested address hole */ |
1469 | addr = mm->free_area_cache; | 1483 | start_addr = addr = mm->free_area_cache; |
1470 | |||
1471 | /* make sure it can fit in the remaining address space */ | ||
1472 | if (addr > len) { | ||
1473 | vma = find_vma(mm, addr-len); | ||
1474 | if (!vma || addr <= vma->vm_start) | ||
1475 | /* remember the address as a hint for next time */ | ||
1476 | return (mm->free_area_cache = addr-len); | ||
1477 | } | ||
1478 | 1484 | ||
1479 | if (mm->mmap_base < len) | 1485 | if (addr < len) |
1480 | goto bottomup; | 1486 | goto fail; |
1481 | |||
1482 | addr = mm->mmap_base-len; | ||
1483 | 1487 | ||
1488 | addr -= len; | ||
1484 | do { | 1489 | do { |
1485 | /* | 1490 | /* |
1486 | * Lookup failure means no vma is above this address, | 1491 | * Lookup failure means no vma is above this address, |
@@ -1500,7 +1505,21 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, | |||
1500 | addr = vma->vm_start-len; | 1505 | addr = vma->vm_start-len; |
1501 | } while (len < vma->vm_start); | 1506 | } while (len < vma->vm_start); |
1502 | 1507 | ||
1503 | bottomup: | 1508 | fail: |
1509 | /* | ||
1510 | * if hint left us with no space for the requested | ||
1511 | * mapping then try again: | ||
1512 | * | ||
1513 | * Note: unlike the bottom-up case, which does a full | ||
1514 | * linear search, we use find_vma() here, so some holes | ||
1515 | * may be skipped. | ||
1516 | */ | ||
1517 | if (start_addr != mm->mmap_base) { | ||
1518 | mm->free_area_cache = mm->mmap_base; | ||
1519 | mm->cached_hole_size = 0; | ||
1520 | goto try_again; | ||
1521 | } | ||
1522 | |||
1504 | /* | 1523 | /* |
1505 | * A failed mmap() very likely causes application failure, | 1524 | * A failed mmap() very likely causes application failure, |
1506 | * so fall back to the bottom-up function here. This scenario | 1525 | * so fall back to the bottom-up function here. This scenario |
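Condensing the control flow added above into one place (a sketch, not the literal function): the top-down scan starts from the cached free_area_cache hint and, if the hint left no room, resets the cache to mmap_base and rescans once before giving up to the bottom-up fallback.

	try_again:
		start_addr = addr = mm->free_area_cache;
		if (addr < len)
			goto fail;
		addr -= len;
		/* ... walk VMAs downward looking for a hole of at least len ... */

	fail:
		if (start_addr != mm->mmap_base) {
			mm->free_area_cache = mm->mmap_base;	/* drop the stale hint */
			mm->cached_hole_size = 0;
			goto try_again;				/* one full-range retry */
		}
		/* give up on top-down; use the bottom-up fallback */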
@@ -1605,7 +1624,6 @@ EXPORT_SYMBOL(find_vma); | |||
1605 | 1624 | ||
1606 | /* | 1625 | /* |
1607 | * Same as find_vma, but also return a pointer to the previous VMA in *pprev. | 1626 | * Same as find_vma, but also return a pointer to the previous VMA in *pprev. |
1608 | * Note: pprev is set to NULL when return value is NULL. | ||
1609 | */ | 1627 | */ |
1610 | struct vm_area_struct * | 1628 | struct vm_area_struct * |
1611 | find_vma_prev(struct mm_struct *mm, unsigned long addr, | 1629 | find_vma_prev(struct mm_struct *mm, unsigned long addr, |
@@ -1614,7 +1632,16 @@ find_vma_prev(struct mm_struct *mm, unsigned long addr, | |||
1614 | struct vm_area_struct *vma; | 1632 | struct vm_area_struct *vma; |
1615 | 1633 | ||
1616 | vma = find_vma(mm, addr); | 1634 | vma = find_vma(mm, addr); |
1617 | *pprev = vma ? vma->vm_prev : NULL; | 1635 | if (vma) { |
1636 | *pprev = vma->vm_prev; | ||
1637 | } else { | ||
1638 | struct rb_node *rb_node = mm->mm_rb.rb_node; | ||
1639 | *pprev = NULL; | ||
1640 | while (rb_node) { | ||
1641 | *pprev = rb_entry(rb_node, struct vm_area_struct, vm_rb); | ||
1642 | rb_node = rb_node->rb_right; | ||
1643 | } | ||
1644 | } | ||
1618 | return vma; | 1645 | return vma; |
1619 | } | 1646 | } |
1620 | 1647 | ||
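With this change find_vma_prev() no longer promises *pprev == NULL when it returns NULL; when addr lies above every mapping, *pprev is instead the highest VMA, found by walking down the right spine of the rbtree. A hedged sketch of how a caller can rely on that (use_top_gap() is hypothetical):

	struct vm_area_struct *vma, *prev;

	vma = find_vma_prev(mm, addr, &prev);
	if (!vma && prev)
		use_top_gap(prev->vm_end, TASK_SIZE);	/* addr is beyond the last VMA */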
@@ -2169,7 +2196,7 @@ unsigned long do_brk(unsigned long addr, unsigned long len) | |||
2169 | if (mm->map_count > sysctl_max_map_count) | 2196 | if (mm->map_count > sysctl_max_map_count) |
2170 | return -ENOMEM; | 2197 | return -ENOMEM; |
2171 | 2198 | ||
2172 | if (security_vm_enough_memory(len >> PAGE_SHIFT)) | 2199 | if (security_vm_enough_memory_mm(mm, len >> PAGE_SHIFT)) |
2173 | return -ENOMEM; | 2200 | return -ENOMEM; |
2174 | 2201 | ||
2175 | /* Can we just expand an old private anonymous mapping? */ | 2202 | /* Can we just expand an old private anonymous mapping? */ |
@@ -2213,7 +2240,6 @@ void exit_mmap(struct mm_struct *mm) | |||
2213 | struct mmu_gather tlb; | 2240 | struct mmu_gather tlb; |
2214 | struct vm_area_struct *vma; | 2241 | struct vm_area_struct *vma; |
2215 | unsigned long nr_accounted = 0; | 2242 | unsigned long nr_accounted = 0; |
2216 | unsigned long end; | ||
2217 | 2243 | ||
2218 | /* mm's last user has gone, and its about to be pulled down */ | 2244 | /* mm's last user has gone, and its about to be pulled down */ |
2219 | mmu_notifier_release(mm); | 2245 | mmu_notifier_release(mm); |
@@ -2238,11 +2264,11 @@ void exit_mmap(struct mm_struct *mm) | |||
2238 | tlb_gather_mmu(&tlb, mm, 1); | 2264 | tlb_gather_mmu(&tlb, mm, 1); |
2239 | /* update_hiwater_rss(mm) here? but nobody should be looking */ | 2265 | /* update_hiwater_rss(mm) here? but nobody should be looking */ |
2240 | /* Use -1 here to ensure all VMAs in the mm are unmapped */ | 2266 | /* Use -1 here to ensure all VMAs in the mm are unmapped */ |
2241 | end = unmap_vmas(&tlb, vma, 0, -1, &nr_accounted, NULL); | 2267 | unmap_vmas(&tlb, vma, 0, -1, &nr_accounted, NULL); |
2242 | vm_unacct_memory(nr_accounted); | 2268 | vm_unacct_memory(nr_accounted); |
2243 | 2269 | ||
2244 | free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, 0); | 2270 | free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, 0); |
2245 | tlb_finish_mmu(&tlb, 0, end); | 2271 | tlb_finish_mmu(&tlb, 0, -1); |
2246 | 2272 | ||
2247 | /* | 2273 | /* |
2248 | * Walk the list again, actually closing and freeing it, | 2274 | * Walk the list again, actually closing and freeing it, |
diff --git a/mm/mmu_context.c b/mm/mmu_context.c index cf332bc0080..3dcfaf4ed35 100644 --- a/mm/mmu_context.c +++ b/mm/mmu_context.c | |||
@@ -53,7 +53,7 @@ void unuse_mm(struct mm_struct *mm) | |||
53 | struct task_struct *tsk = current; | 53 | struct task_struct *tsk = current; |
54 | 54 | ||
55 | task_lock(tsk); | 55 | task_lock(tsk); |
56 | sync_mm_rss(tsk, mm); | 56 | sync_mm_rss(mm); |
57 | tsk->mm = NULL; | 57 | tsk->mm = NULL; |
58 | /* active_mm is still 'mm' */ | 58 | /* active_mm is still 'mm' */ |
59 | enter_lazy_tlb(mm, tsk); | 59 | enter_lazy_tlb(mm, tsk); |
diff --git a/mm/mprotect.c b/mm/mprotect.c index 5a688a2756b..a40992610ab 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c | |||
@@ -60,7 +60,7 @@ static void change_pte_range(struct mm_struct *mm, pmd_t *pmd, | |||
60 | ptent = pte_mkwrite(ptent); | 60 | ptent = pte_mkwrite(ptent); |
61 | 61 | ||
62 | ptep_modify_prot_commit(mm, addr, pte, ptent); | 62 | ptep_modify_prot_commit(mm, addr, pte, ptent); |
63 | } else if (PAGE_MIGRATION && !pte_file(oldpte)) { | 63 | } else if (IS_ENABLED(CONFIG_MIGRATION) && !pte_file(oldpte)) { |
64 | swp_entry_t entry = pte_to_swp_entry(oldpte); | 64 | swp_entry_t entry = pte_to_swp_entry(oldpte); |
65 | 65 | ||
66 | if (is_write_migration_entry(entry)) { | 66 | if (is_write_migration_entry(entry)) { |
@@ -168,7 +168,7 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev, | |||
168 | if (!(oldflags & (VM_ACCOUNT|VM_WRITE|VM_HUGETLB| | 168 | if (!(oldflags & (VM_ACCOUNT|VM_WRITE|VM_HUGETLB| |
169 | VM_SHARED|VM_NORESERVE))) { | 169 | VM_SHARED|VM_NORESERVE))) { |
170 | charged = nrpages; | 170 | charged = nrpages; |
171 | if (security_vm_enough_memory(charged)) | 171 | if (security_vm_enough_memory_mm(mm, charged)) |
172 | return -ENOMEM; | 172 | return -ENOMEM; |
173 | newflags |= VM_ACCOUNT; | 173 | newflags |= VM_ACCOUNT; |
174 | } | 174 | } |
@@ -262,10 +262,11 @@ SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len, | |||
262 | 262 | ||
263 | down_write(¤t->mm->mmap_sem); | 263 | down_write(¤t->mm->mmap_sem); |
264 | 264 | ||
265 | vma = find_vma_prev(current->mm, start, &prev); | 265 | vma = find_vma(current->mm, start); |
266 | error = -ENOMEM; | 266 | error = -ENOMEM; |
267 | if (!vma) | 267 | if (!vma) |
268 | goto out; | 268 | goto out; |
269 | prev = vma->vm_prev; | ||
269 | if (unlikely(grows & PROT_GROWSDOWN)) { | 270 | if (unlikely(grows & PROT_GROWSDOWN)) { |
270 | if (vma->vm_start >= end) | 271 | if (vma->vm_start >= end) |
271 | goto out; | 272 | goto out; |
diff --git a/mm/mremap.c b/mm/mremap.c index 87bb8393e7d..db8d983b5a7 100644 --- a/mm/mremap.c +++ b/mm/mremap.c | |||
@@ -329,7 +329,7 @@ static struct vm_area_struct *vma_to_resize(unsigned long addr, | |||
329 | 329 | ||
330 | if (vma->vm_flags & VM_ACCOUNT) { | 330 | if (vma->vm_flags & VM_ACCOUNT) { |
331 | unsigned long charged = (new_len - old_len) >> PAGE_SHIFT; | 331 | unsigned long charged = (new_len - old_len) >> PAGE_SHIFT; |
332 | if (security_vm_enough_memory(charged)) | 332 | if (security_vm_enough_memory_mm(mm, charged)) |
333 | goto Efault; | 333 | goto Efault; |
334 | *p = charged; | 334 | *p = charged; |
335 | } | 335 | } |
diff --git a/mm/nommu.c b/mm/nommu.c index b982290fd96..f59e170fceb 100644 --- a/mm/nommu.c +++ b/mm/nommu.c | |||
@@ -696,9 +696,11 @@ static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma) | |||
696 | if (vma->vm_file) { | 696 | if (vma->vm_file) { |
697 | mapping = vma->vm_file->f_mapping; | 697 | mapping = vma->vm_file->f_mapping; |
698 | 698 | ||
699 | mutex_lock(&mapping->i_mmap_mutex); | ||
699 | flush_dcache_mmap_lock(mapping); | 700 | flush_dcache_mmap_lock(mapping); |
700 | vma_prio_tree_insert(vma, &mapping->i_mmap); | 701 | vma_prio_tree_insert(vma, &mapping->i_mmap); |
701 | flush_dcache_mmap_unlock(mapping); | 702 | flush_dcache_mmap_unlock(mapping); |
703 | mutex_unlock(&mapping->i_mmap_mutex); | ||
702 | } | 704 | } |
703 | 705 | ||
704 | /* add the VMA to the tree */ | 706 | /* add the VMA to the tree */ |
@@ -760,9 +762,11 @@ static void delete_vma_from_mm(struct vm_area_struct *vma) | |||
760 | if (vma->vm_file) { | 762 | if (vma->vm_file) { |
761 | mapping = vma->vm_file->f_mapping; | 763 | mapping = vma->vm_file->f_mapping; |
762 | 764 | ||
765 | mutex_lock(&mapping->i_mmap_mutex); | ||
763 | flush_dcache_mmap_lock(mapping); | 766 | flush_dcache_mmap_lock(mapping); |
764 | vma_prio_tree_remove(vma, &mapping->i_mmap); | 767 | vma_prio_tree_remove(vma, &mapping->i_mmap); |
765 | flush_dcache_mmap_unlock(mapping); | 768 | flush_dcache_mmap_unlock(mapping); |
769 | mutex_unlock(&mapping->i_mmap_mutex); | ||
766 | } | 770 | } |
767 | 771 | ||
768 | /* remove from the MM's tree and list */ | 772 | /* remove from the MM's tree and list */ |
@@ -775,8 +779,6 @@ static void delete_vma_from_mm(struct vm_area_struct *vma) | |||
775 | 779 | ||
776 | if (vma->vm_next) | 780 | if (vma->vm_next) |
777 | vma->vm_next->vm_prev = vma->vm_prev; | 781 | vma->vm_next->vm_prev = vma->vm_prev; |
778 | |||
779 | vma->vm_mm = NULL; | ||
780 | } | 782 | } |
781 | 783 | ||
782 | /* | 784 | /* |
@@ -2052,6 +2054,7 @@ int nommu_shrink_inode_mappings(struct inode *inode, size_t size, | |||
2052 | high = (size + PAGE_SIZE - 1) >> PAGE_SHIFT; | 2054 | high = (size + PAGE_SIZE - 1) >> PAGE_SHIFT; |
2053 | 2055 | ||
2054 | down_write(&nommu_region_sem); | 2056 | down_write(&nommu_region_sem); |
2057 | mutex_lock(&inode->i_mapping->i_mmap_mutex); | ||
2055 | 2058 | ||
2056 | /* search for VMAs that fall within the dead zone */ | 2059 | /* search for VMAs that fall within the dead zone */ |
2057 | vma_prio_tree_foreach(vma, &iter, &inode->i_mapping->i_mmap, | 2060 | vma_prio_tree_foreach(vma, &iter, &inode->i_mapping->i_mmap, |
@@ -2059,6 +2062,7 @@ int nommu_shrink_inode_mappings(struct inode *inode, size_t size, | |||
2059 | /* found one - only interested if it's shared out of the page | 2062 | /* found one - only interested if it's shared out of the page |
2060 | * cache */ | 2063 | * cache */ |
2061 | if (vma->vm_flags & VM_SHARED) { | 2064 | if (vma->vm_flags & VM_SHARED) { |
2065 | mutex_unlock(&inode->i_mapping->i_mmap_mutex); | ||
2062 | up_write(&nommu_region_sem); | 2066 | up_write(&nommu_region_sem); |
2063 | return -ETXTBSY; /* not quite true, but near enough */ | 2067 | return -ETXTBSY; /* not quite true, but near enough */ |
2064 | } | 2068 | } |
@@ -2086,6 +2090,7 @@ int nommu_shrink_inode_mappings(struct inode *inode, size_t size, | |||
2086 | } | 2090 | } |
2087 | } | 2091 | } |
2088 | 2092 | ||
2093 | mutex_unlock(&inode->i_mapping->i_mmap_mutex); | ||
2089 | up_write(&nommu_region_sem); | 2094 | up_write(&nommu_region_sem); |
2090 | return 0; | 2095 | return 0; |
2091 | } | 2096 | } |
diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 2958fd8e7c9..46bf2ed5594 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c | |||
@@ -34,6 +34,7 @@ | |||
34 | #include <linux/ptrace.h> | 34 | #include <linux/ptrace.h> |
35 | #include <linux/freezer.h> | 35 | #include <linux/freezer.h> |
36 | #include <linux/ftrace.h> | 36 | #include <linux/ftrace.h> |
37 | #include <linux/ratelimit.h> | ||
37 | 38 | ||
38 | #define CREATE_TRACE_POINTS | 39 | #define CREATE_TRACE_POINTS |
39 | #include <trace/events/oom.h> | 40 | #include <trace/events/oom.h> |
@@ -309,7 +310,7 @@ static enum oom_constraint constrained_alloc(struct zonelist *zonelist, | |||
309 | */ | 310 | */ |
310 | static struct task_struct *select_bad_process(unsigned int *ppoints, | 311 | static struct task_struct *select_bad_process(unsigned int *ppoints, |
311 | unsigned long totalpages, struct mem_cgroup *memcg, | 312 | unsigned long totalpages, struct mem_cgroup *memcg, |
312 | const nodemask_t *nodemask) | 313 | const nodemask_t *nodemask, bool force_kill) |
313 | { | 314 | { |
314 | struct task_struct *g, *p; | 315 | struct task_struct *g, *p; |
315 | struct task_struct *chosen = NULL; | 316 | struct task_struct *chosen = NULL; |
@@ -335,7 +336,8 @@ static struct task_struct *select_bad_process(unsigned int *ppoints, | |||
335 | if (test_tsk_thread_flag(p, TIF_MEMDIE)) { | 336 | if (test_tsk_thread_flag(p, TIF_MEMDIE)) { |
336 | if (unlikely(frozen(p))) | 337 | if (unlikely(frozen(p))) |
337 | __thaw_task(p); | 338 | __thaw_task(p); |
338 | return ERR_PTR(-1UL); | 339 | if (!force_kill) |
340 | return ERR_PTR(-1UL); | ||
339 | } | 341 | } |
340 | if (!p->mm) | 342 | if (!p->mm) |
341 | continue; | 343 | continue; |
@@ -353,7 +355,7 @@ static struct task_struct *select_bad_process(unsigned int *ppoints, | |||
353 | if (p == current) { | 355 | if (p == current) { |
354 | chosen = p; | 356 | chosen = p; |
355 | *ppoints = 1000; | 357 | *ppoints = 1000; |
356 | } else { | 358 | } else if (!force_kill) { |
357 | /* | 359 | /* |
358 | * If this task is not being ptraced on exit, | 360 | * If this task is not being ptraced on exit, |
359 | * then wait for it to finish before killing | 361 | * then wait for it to finish before killing |
@@ -434,66 +436,18 @@ static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order, | |||
434 | } | 436 | } |
435 | 437 | ||
436 | #define K(x) ((x) << (PAGE_SHIFT-10)) | 438 | #define K(x) ((x) << (PAGE_SHIFT-10)) |
437 | static int oom_kill_task(struct task_struct *p) | 439 | static void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, |
438 | { | 440 | unsigned int points, unsigned long totalpages, |
439 | struct task_struct *q; | 441 | struct mem_cgroup *memcg, nodemask_t *nodemask, |
440 | struct mm_struct *mm; | 442 | const char *message) |
441 | |||
442 | p = find_lock_task_mm(p); | ||
443 | if (!p) | ||
444 | return 1; | ||
445 | |||
446 | /* mm cannot be safely dereferenced after task_unlock(p) */ | ||
447 | mm = p->mm; | ||
448 | |||
449 | pr_err("Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB\n", | ||
450 | task_pid_nr(p), p->comm, K(p->mm->total_vm), | ||
451 | K(get_mm_counter(p->mm, MM_ANONPAGES)), | ||
452 | K(get_mm_counter(p->mm, MM_FILEPAGES))); | ||
453 | task_unlock(p); | ||
454 | |||
455 | /* | ||
456 | * Kill all user processes sharing p->mm in other thread groups, if any. | ||
457 | * They don't get access to memory reserves or a higher scheduler | ||
458 | * priority, though, to avoid depletion of all memory or task | ||
459 | * starvation. This prevents mm->mmap_sem livelock when an oom killed | ||
460 | * task cannot exit because it requires the semaphore and its contended | ||
461 | * by another thread trying to allocate memory itself. That thread will | ||
462 | * now get access to memory reserves since it has a pending fatal | ||
463 | * signal. | ||
464 | */ | ||
465 | for_each_process(q) | ||
466 | if (q->mm == mm && !same_thread_group(q, p) && | ||
467 | !(q->flags & PF_KTHREAD)) { | ||
468 | if (q->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) | ||
469 | continue; | ||
470 | |||
471 | task_lock(q); /* Protect ->comm from prctl() */ | ||
472 | pr_err("Kill process %d (%s) sharing same memory\n", | ||
473 | task_pid_nr(q), q->comm); | ||
474 | task_unlock(q); | ||
475 | force_sig(SIGKILL, q); | ||
476 | } | ||
477 | |||
478 | set_tsk_thread_flag(p, TIF_MEMDIE); | ||
479 | force_sig(SIGKILL, p); | ||
480 | |||
481 | return 0; | ||
482 | } | ||
483 | #undef K | ||
484 | |||
485 | static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, | ||
486 | unsigned int points, unsigned long totalpages, | ||
487 | struct mem_cgroup *memcg, nodemask_t *nodemask, | ||
488 | const char *message) | ||
489 | { | 443 | { |
490 | struct task_struct *victim = p; | 444 | struct task_struct *victim = p; |
491 | struct task_struct *child; | 445 | struct task_struct *child; |
492 | struct task_struct *t = p; | 446 | struct task_struct *t = p; |
447 | struct mm_struct *mm; | ||
493 | unsigned int victim_points = 0; | 448 | unsigned int victim_points = 0; |
494 | 449 | static DEFINE_RATELIMIT_STATE(oom_rs, DEFAULT_RATELIMIT_INTERVAL, | |
495 | if (printk_ratelimit()) | 450 | DEFAULT_RATELIMIT_BURST); |
496 | dump_header(p, gfp_mask, order, memcg, nodemask); | ||
497 | 451 | ||
498 | /* | 452 | /* |
499 | * If the task is already exiting, don't alarm the sysadmin or kill | 453 | * If the task is already exiting, don't alarm the sysadmin or kill |
@@ -501,9 +455,12 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, | |||
501 | */ | 455 | */ |
502 | if (p->flags & PF_EXITING) { | 456 | if (p->flags & PF_EXITING) { |
503 | set_tsk_thread_flag(p, TIF_MEMDIE); | 457 | set_tsk_thread_flag(p, TIF_MEMDIE); |
504 | return 0; | 458 | return; |
505 | } | 459 | } |
506 | 460 | ||
461 | if (__ratelimit(&oom_rs)) | ||
462 | dump_header(p, gfp_mask, order, memcg, nodemask); | ||
463 | |||
507 | task_lock(p); | 464 | task_lock(p); |
508 | pr_err("%s: Kill process %d (%s) score %d or sacrifice child\n", | 465 | pr_err("%s: Kill process %d (%s) score %d or sacrifice child\n", |
509 | message, task_pid_nr(p), p->comm, points); | 466 | message, task_pid_nr(p), p->comm, points); |
@@ -533,8 +490,44 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, | |||
533 | } | 490 | } |
534 | } while_each_thread(p, t); | 491 | } while_each_thread(p, t); |
535 | 492 | ||
536 | return oom_kill_task(victim); | 493 | victim = find_lock_task_mm(victim); |
494 | if (!victim) | ||
495 | return; | ||
496 | |||
497 | /* mm cannot safely be dereferenced after task_unlock(victim) */ | ||
498 | mm = victim->mm; | ||
499 | pr_err("Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB\n", | ||
500 | task_pid_nr(victim), victim->comm, K(victim->mm->total_vm), | ||
501 | K(get_mm_counter(victim->mm, MM_ANONPAGES)), | ||
502 | K(get_mm_counter(victim->mm, MM_FILEPAGES))); | ||
503 | task_unlock(victim); | ||
504 | |||
505 | /* | ||
506 | * Kill all user processes sharing victim->mm in other thread groups, if | ||
507 | * any. They don't get access to memory reserves, though, to avoid | ||
508 | * depletion of all memory. This prevents mm->mmap_sem livelock when an | ||
509 | * oom killed thread cannot exit because it requires the semaphore and | ||
510 | * it's contended by another thread trying to allocate memory itself. | ||
511 | * That thread will now get access to memory reserves since it has a | ||
512 | * pending fatal signal. | ||
513 | */ | ||
514 | for_each_process(p) | ||
515 | if (p->mm == mm && !same_thread_group(p, victim) && | ||
516 | !(p->flags & PF_KTHREAD)) { | ||
517 | if (p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) | ||
518 | continue; | ||
519 | |||
520 | task_lock(p); /* Protect ->comm from prctl() */ | ||
521 | pr_err("Kill process %d (%s) sharing same memory\n", | ||
522 | task_pid_nr(p), p->comm); | ||
523 | task_unlock(p); | ||
524 | do_send_sig_info(SIGKILL, SEND_SIG_FORCED, p, true); | ||
525 | } | ||
526 | |||
527 | set_tsk_thread_flag(victim, TIF_MEMDIE); | ||
528 | do_send_sig_info(SIGKILL, SEND_SIG_FORCED, victim, true); | ||
537 | } | 529 | } |
530 | #undef K | ||
538 | 531 | ||
539 | /* | 532 | /* |
540 | * Determines whether the kernel must panic because of the panic_on_oom sysctl. | 533 | * Determines whether the kernel must panic because of the panic_on_oom sysctl. |
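The global printk_ratelimit() is replaced above by a ratelimit state private to the OOM killer, so unrelated printk traffic can no longer suppress the header dump (or be suppressed by it). The same pattern in isolation, as a hedged sketch:

	#include <linux/ratelimit.h>

	static DEFINE_RATELIMIT_STATE(my_rs, DEFAULT_RATELIMIT_INTERVAL,
				      DEFAULT_RATELIMIT_BURST);

	if (__ratelimit(&my_rs))		/* true while within the burst/interval budget */
		pr_warn("something noisy happened\n");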
@@ -561,7 +554,8 @@ static void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask, | |||
561 | } | 554 | } |
562 | 555 | ||
563 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR | 556 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR |
564 | void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask) | 557 | void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, |
558 | int order) | ||
565 | { | 559 | { |
566 | unsigned long limit; | 560 | unsigned long limit; |
567 | unsigned int points = 0; | 561 | unsigned int points = 0; |
@@ -577,18 +571,13 @@ void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask) | |||
577 | return; | 571 | return; |
578 | } | 572 | } |
579 | 573 | ||
580 | check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, 0, NULL); | 574 | check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL); |
581 | limit = mem_cgroup_get_limit(memcg) >> PAGE_SHIFT; | 575 | limit = mem_cgroup_get_limit(memcg) >> PAGE_SHIFT; |
582 | read_lock(&tasklist_lock); | 576 | read_lock(&tasklist_lock); |
583 | retry: | 577 | p = select_bad_process(&points, limit, memcg, NULL, false); |
584 | p = select_bad_process(&points, limit, memcg, NULL); | 578 | if (p && PTR_ERR(p) != -1UL) |
585 | if (!p || PTR_ERR(p) == -1UL) | 579 | oom_kill_process(p, gfp_mask, order, points, limit, memcg, NULL, |
586 | goto out; | 580 | "Memory cgroup out of memory"); |
587 | |||
588 | if (oom_kill_process(p, gfp_mask, 0, points, limit, memcg, NULL, | ||
589 | "Memory cgroup out of memory")) | ||
590 | goto retry; | ||
591 | out: | ||
592 | read_unlock(&tasklist_lock); | 581 | read_unlock(&tasklist_lock); |
593 | } | 582 | } |
594 | #endif | 583 | #endif |
@@ -700,6 +689,7 @@ static void clear_system_oom(void) | |||
700 | * @gfp_mask: memory allocation flags | 689 | * @gfp_mask: memory allocation flags |
701 | * @order: amount of memory being requested as a power of 2 | 690 | * @order: amount of memory being requested as a power of 2 |
702 | * @nodemask: nodemask passed to page allocator | 691 | * @nodemask: nodemask passed to page allocator |
692 | * @force_kill: true if a task must be killed, even if others are exiting | ||
703 | * | 693 | * |
704 | * If we run out of memory, we have the choice between either | 694 | * If we run out of memory, we have the choice between either |
705 | * killing a random task (bad), letting the system crash (worse) | 695 | * killing a random task (bad), letting the system crash (worse) |
@@ -707,7 +697,7 @@ static void clear_system_oom(void) | |||
707 | * don't have to be perfect here, we just have to be good. | 697 | * don't have to be perfect here, we just have to be good. |
708 | */ | 698 | */ |
709 | void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, | 699 | void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, |
710 | int order, nodemask_t *nodemask) | 700 | int order, nodemask_t *nodemask, bool force_kill) |
711 | { | 701 | { |
712 | const nodemask_t *mpol_mask; | 702 | const nodemask_t *mpol_mask; |
713 | struct task_struct *p; | 703 | struct task_struct *p; |
@@ -745,33 +735,25 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, | |||
745 | if (sysctl_oom_kill_allocating_task && | 735 | if (sysctl_oom_kill_allocating_task && |
746 | !oom_unkillable_task(current, NULL, nodemask) && | 736 | !oom_unkillable_task(current, NULL, nodemask) && |
747 | current->mm) { | 737 | current->mm) { |
748 | /* | 738 | oom_kill_process(current, gfp_mask, order, 0, totalpages, NULL, |
749 | * oom_kill_process() needs tasklist_lock held. If it returns | 739 | nodemask, |
750 | * non-zero, current could not be killed so we must fallback to | 740 | "Out of memory (oom_kill_allocating_task)"); |
751 | * the tasklist scan. | ||
752 | */ | ||
753 | if (!oom_kill_process(current, gfp_mask, order, 0, totalpages, | ||
754 | NULL, nodemask, | ||
755 | "Out of memory (oom_kill_allocating_task)")) | ||
756 | goto out; | ||
757 | } | ||
758 | |||
759 | retry: | ||
760 | p = select_bad_process(&points, totalpages, NULL, mpol_mask); | ||
761 | if (PTR_ERR(p) == -1UL) | ||
762 | goto out; | 741 | goto out; |
742 | } | ||
763 | 743 | ||
744 | p = select_bad_process(&points, totalpages, NULL, mpol_mask, | ||
745 | force_kill); | ||
764 | /* Found nothing?!?! Either we hang forever, or we panic. */ | 746 | /* Found nothing?!?! Either we hang forever, or we panic. */ |
765 | if (!p) { | 747 | if (!p) { |
766 | dump_header(NULL, gfp_mask, order, NULL, mpol_mask); | 748 | dump_header(NULL, gfp_mask, order, NULL, mpol_mask); |
767 | read_unlock(&tasklist_lock); | 749 | read_unlock(&tasklist_lock); |
768 | panic("Out of memory and no killable processes...\n"); | 750 | panic("Out of memory and no killable processes...\n"); |
769 | } | 751 | } |
770 | 752 | if (PTR_ERR(p) != -1UL) { | |
771 | if (oom_kill_process(p, gfp_mask, order, points, totalpages, NULL, | 753 | oom_kill_process(p, gfp_mask, order, points, totalpages, NULL, |
772 | nodemask, "Out of memory")) | 754 | nodemask, "Out of memory"); |
773 | goto retry; | 755 | killed = 1; |
774 | killed = 1; | 756 | } |
775 | out: | 757 | out: |
776 | read_unlock(&tasklist_lock); | 758 | read_unlock(&tasklist_lock); |
777 | 759 | ||
@@ -792,7 +774,7 @@ out: | |||
792 | void pagefault_out_of_memory(void) | 774 | void pagefault_out_of_memory(void) |
793 | { | 775 | { |
794 | if (try_set_system_oom()) { | 776 | if (try_set_system_oom()) { |
795 | out_of_memory(NULL, 0, 0, NULL); | 777 | out_of_memory(NULL, 0, 0, NULL, false); |
796 | clear_system_oom(); | 778 | clear_system_oom(); |
797 | } | 779 | } |
798 | if (!test_thread_flag(TIF_MEMDIE)) | 780 | if (!test_thread_flag(TIF_MEMDIE)) |
diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 5e39858880f..26adea8ca2e 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c | |||
@@ -1474,6 +1474,7 @@ void throttle_vm_writeout(gfp_t gfp_mask) | |||
1474 | 1474 | ||
1475 | for ( ; ; ) { | 1475 | for ( ; ; ) { |
1476 | global_dirty_limits(&background_thresh, &dirty_thresh); | 1476 | global_dirty_limits(&background_thresh, &dirty_thresh); |
1477 | dirty_thresh = hard_dirty_limit(dirty_thresh); | ||
1477 | 1478 | ||
1478 | /* | 1479 | /* |
1479 | * Boost the allowable dirty threshold a bit for page | 1480 | * Boost the allowable dirty threshold a bit for page |
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index d2186ecb36f..caea788628e 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c | |||
@@ -1968,7 +1968,7 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, | |||
1968 | goto out; | 1968 | goto out; |
1969 | } | 1969 | } |
1970 | /* Exhausted what can be done so it's blamo time */ | 1970 | /* Exhausted what can be done so it's blamo time */ |
1971 | out_of_memory(zonelist, gfp_mask, order, nodemask); | 1971 | out_of_memory(zonelist, gfp_mask, order, nodemask, false); |
1972 | 1972 | ||
1973 | out: | 1973 | out: |
1974 | clear_zonelist_oom(zonelist, gfp_mask); | 1974 | clear_zonelist_oom(zonelist, gfp_mask); |
@@ -1990,7 +1990,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, | |||
1990 | if (!order) | 1990 | if (!order) |
1991 | return NULL; | 1991 | return NULL; |
1992 | 1992 | ||
1993 | if (compaction_deferred(preferred_zone)) { | 1993 | if (compaction_deferred(preferred_zone, order)) { |
1994 | *deferred_compaction = true; | 1994 | *deferred_compaction = true; |
1995 | return NULL; | 1995 | return NULL; |
1996 | } | 1996 | } |
@@ -2012,6 +2012,8 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, | |||
2012 | if (page) { | 2012 | if (page) { |
2013 | preferred_zone->compact_considered = 0; | 2013 | preferred_zone->compact_considered = 0; |
2014 | preferred_zone->compact_defer_shift = 0; | 2014 | preferred_zone->compact_defer_shift = 0; |
2015 | if (order >= preferred_zone->compact_order_failed) | ||
2016 | preferred_zone->compact_order_failed = order + 1; | ||
2015 | count_vm_event(COMPACTSUCCESS); | 2017 | count_vm_event(COMPACTSUCCESS); |
2016 | return page; | 2018 | return page; |
2017 | } | 2019 | } |
@@ -2028,7 +2030,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, | |||
2028 | * defer if the failure was a sync compaction failure. | 2030 | * defer if the failure was a sync compaction failure. |
2029 | */ | 2031 | */ |
2030 | if (sync_migration) | 2032 | if (sync_migration) |
2031 | defer_compaction(preferred_zone); | 2033 | defer_compaction(preferred_zone, order); |
2032 | 2034 | ||
2033 | cond_resched(); | 2035 | cond_resched(); |
2034 | } | 2036 | } |
@@ -2378,8 +2380,9 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, | |||
2378 | { | 2380 | { |
2379 | enum zone_type high_zoneidx = gfp_zone(gfp_mask); | 2381 | enum zone_type high_zoneidx = gfp_zone(gfp_mask); |
2380 | struct zone *preferred_zone; | 2382 | struct zone *preferred_zone; |
2381 | struct page *page; | 2383 | struct page *page = NULL; |
2382 | int migratetype = allocflags_to_migratetype(gfp_mask); | 2384 | int migratetype = allocflags_to_migratetype(gfp_mask); |
2385 | unsigned int cpuset_mems_cookie; | ||
2383 | 2386 | ||
2384 | gfp_mask &= gfp_allowed_mask; | 2387 | gfp_mask &= gfp_allowed_mask; |
2385 | 2388 | ||
@@ -2398,15 +2401,15 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, | |||
2398 | if (unlikely(!zonelist->_zonerefs->zone)) | 2401 | if (unlikely(!zonelist->_zonerefs->zone)) |
2399 | return NULL; | 2402 | return NULL; |
2400 | 2403 | ||
2401 | get_mems_allowed(); | 2404 | retry_cpuset: |
2405 | cpuset_mems_cookie = get_mems_allowed(); | ||
2406 | |||
2402 | /* The preferred zone is used for statistics later */ | 2407 | /* The preferred zone is used for statistics later */ |
2403 | first_zones_zonelist(zonelist, high_zoneidx, | 2408 | first_zones_zonelist(zonelist, high_zoneidx, |
2404 | nodemask ? : &cpuset_current_mems_allowed, | 2409 | nodemask ? : &cpuset_current_mems_allowed, |
2405 | &preferred_zone); | 2410 | &preferred_zone); |
2406 | if (!preferred_zone) { | 2411 | if (!preferred_zone) |
2407 | put_mems_allowed(); | 2412 | goto out; |
2408 | return NULL; | ||
2409 | } | ||
2410 | 2413 | ||
2411 | /* First allocation attempt */ | 2414 | /* First allocation attempt */ |
2412 | page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, | 2415 | page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, |
@@ -2416,9 +2419,19 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, | |||
2416 | page = __alloc_pages_slowpath(gfp_mask, order, | 2419 | page = __alloc_pages_slowpath(gfp_mask, order, |
2417 | zonelist, high_zoneidx, nodemask, | 2420 | zonelist, high_zoneidx, nodemask, |
2418 | preferred_zone, migratetype); | 2421 | preferred_zone, migratetype); |
2419 | put_mems_allowed(); | ||
2420 | 2422 | ||
2421 | trace_mm_page_alloc(page, order, gfp_mask, migratetype); | 2423 | trace_mm_page_alloc(page, order, gfp_mask, migratetype); |
2424 | |||
2425 | out: | ||
2426 | /* | ||
2427 | * When updating a task's mems_allowed, it is possible to race with | ||
2428 | * parallel threads in such a way that an allocation can fail while | ||
2429 | * the mask is being updated. If a page allocation is about to fail, | ||
2430 | * check if the cpuset changed during allocation and if so, retry. | ||
2431 | */ | ||
2432 | if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page)) | ||
2433 | goto retry_cpuset; | ||
2434 | |||
2422 | return page; | 2435 | return page; |
2423 | } | 2436 | } |
2424 | EXPORT_SYMBOL(__alloc_pages_nodemask); | 2437 | EXPORT_SYMBOL(__alloc_pages_nodemask); |
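The comment above documents the new protocol: get_mems_allowed() returns a sequence cookie and put_mems_allowed() reports whether current->mems_allowed changed underneath the allocation, turning a spurious failure during a cpuset update into a retry. Reduced to its skeleton (try_allocate() is a hypothetical allocation attempt):

	unsigned int cookie;
	struct page *page;

	retry:
		cookie = get_mems_allowed();
		page = try_allocate(gfp_mask, order);
		if (unlikely(!put_mems_allowed(cookie) && !page))
			goto retry;		/* mask changed while we were allocating */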
@@ -2632,13 +2645,15 @@ void si_meminfo_node(struct sysinfo *val, int nid) | |||
2632 | bool skip_free_areas_node(unsigned int flags, int nid) | 2645 | bool skip_free_areas_node(unsigned int flags, int nid) |
2633 | { | 2646 | { |
2634 | bool ret = false; | 2647 | bool ret = false; |
2648 | unsigned int cpuset_mems_cookie; | ||
2635 | 2649 | ||
2636 | if (!(flags & SHOW_MEM_FILTER_NODES)) | 2650 | if (!(flags & SHOW_MEM_FILTER_NODES)) |
2637 | goto out; | 2651 | goto out; |
2638 | 2652 | ||
2639 | get_mems_allowed(); | 2653 | do { |
2640 | ret = !node_isset(nid, cpuset_current_mems_allowed); | 2654 | cpuset_mems_cookie = get_mems_allowed(); |
2641 | put_mems_allowed(); | 2655 | ret = !node_isset(nid, cpuset_current_mems_allowed); |
2656 | } while (!put_mems_allowed(cpuset_mems_cookie)); | ||
2642 | out: | 2657 | out: |
2643 | return ret; | 2658 | return ret; |
2644 | } | 2659 | } |
@@ -3925,18 +3940,6 @@ void __init free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn) | |||
3925 | } | 3940 | } |
3926 | } | 3941 | } |
3927 | 3942 | ||
3928 | int __init add_from_early_node_map(struct range *range, int az, | ||
3929 | int nr_range, int nid) | ||
3930 | { | ||
3931 | unsigned long start_pfn, end_pfn; | ||
3932 | int i; | ||
3933 | |||
3934 | /* need to go over early_node_map to find out good range for node */ | ||
3935 | for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) | ||
3936 | nr_range = add_range(range, az, nr_range, start_pfn, end_pfn); | ||
3937 | return nr_range; | ||
3938 | } | ||
3939 | |||
3940 | /** | 3943 | /** |
3941 | * sparse_memory_present_with_active_regions - Call memory_present for each active range | 3944 | * sparse_memory_present_with_active_regions - Call memory_present for each active range |
3942 | * @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used. | 3945 | * @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used. |
@@ -4521,7 +4524,7 @@ static unsigned long __init early_calculate_totalpages(void) | |||
4521 | * memory. When they don't, some nodes will have more kernelcore than | 4524 | * memory. When they don't, some nodes will have more kernelcore than |
4522 | * others | 4525 | * others |
4523 | */ | 4526 | */ |
4524 | static void __init find_zone_movable_pfns_for_nodes(unsigned long *movable_pfn) | 4527 | static void __init find_zone_movable_pfns_for_nodes(void) |
4525 | { | 4528 | { |
4526 | int i, nid; | 4529 | int i, nid; |
4527 | unsigned long usable_startpfn; | 4530 | unsigned long usable_startpfn; |
@@ -4713,7 +4716,7 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn) | |||
4713 | 4716 | ||
4714 | /* Find the PFNs that ZONE_MOVABLE begins at in each node */ | 4717 | /* Find the PFNs that ZONE_MOVABLE begins at in each node */ |
4715 | memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn)); | 4718 | memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn)); |
4716 | find_zone_movable_pfns_for_nodes(zone_movable_pfn); | 4719 | find_zone_movable_pfns_for_nodes(); |
4717 | 4720 | ||
4718 | /* Print out the zone ranges */ | 4721 | /* Print out the zone ranges */ |
4719 | printk("Zone PFN ranges:\n"); | 4722 | printk("Zone PFN ranges:\n"); |
@@ -4823,6 +4826,7 @@ static int page_alloc_cpu_notify(struct notifier_block *self, | |||
4823 | int cpu = (unsigned long)hcpu; | 4826 | int cpu = (unsigned long)hcpu; |
4824 | 4827 | ||
4825 | if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) { | 4828 | if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) { |
4829 | lru_add_drain_cpu(cpu); | ||
4826 | drain_pages(cpu); | 4830 | drain_pages(cpu); |
4827 | 4831 | ||
4828 | /* | 4832 | /* |
@@ -5236,6 +5240,7 @@ void *__init alloc_large_system_hash(const char *tablename, | |||
5236 | max = ((unsigned long long)nr_all_pages << PAGE_SHIFT) >> 4; | 5240 | max = ((unsigned long long)nr_all_pages << PAGE_SHIFT) >> 4; |
5237 | do_div(max, bucketsize); | 5241 | do_div(max, bucketsize); |
5238 | } | 5242 | } |
5243 | max = min(max, 0x80000000ULL); | ||
5239 | 5244 | ||
5240 | if (numentries > max) | 5245 | if (numentries > max) |
5241 | numentries = max; | 5246 | numentries = max; |
diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c index de1616aa9b1..1ccbd714059 100644 --- a/mm/page_cgroup.c +++ b/mm/page_cgroup.c | |||
@@ -379,13 +379,15 @@ static struct swap_cgroup *lookup_swap_cgroup(swp_entry_t ent, | |||
379 | pgoff_t offset = swp_offset(ent); | 379 | pgoff_t offset = swp_offset(ent); |
380 | struct swap_cgroup_ctrl *ctrl; | 380 | struct swap_cgroup_ctrl *ctrl; |
381 | struct page *mappage; | 381 | struct page *mappage; |
382 | struct swap_cgroup *sc; | ||
382 | 383 | ||
383 | ctrl = &swap_cgroup_ctrl[swp_type(ent)]; | 384 | ctrl = &swap_cgroup_ctrl[swp_type(ent)]; |
384 | if (ctrlp) | 385 | if (ctrlp) |
385 | *ctrlp = ctrl; | 386 | *ctrlp = ctrl; |
386 | 387 | ||
387 | mappage = ctrl->map[offset / SC_PER_PAGE]; | 388 | mappage = ctrl->map[offset / SC_PER_PAGE]; |
388 | return page_address(mappage) + offset % SC_PER_PAGE; | 389 | sc = page_address(mappage); |
390 | return sc + offset % SC_PER_PAGE; | ||
389 | } | 391 | } |
390 | 392 | ||
391 | /** | 393 | /** |
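Aside on the lookup_swap_cgroup() change above: the return value now comes from element-sized pointer arithmetic on a struct swap_cgroup *, whereas adding the offset directly to page_address() (a void *) advances by bytes under GCC's void-pointer-arithmetic extension. A minimal user-space sketch of that difference, using a stand-in struct rather than the real swap_cgroup layout:

#include <stdio.h>

struct rec { long a, b; };      /* stand-in for struct swap_cgroup */

int main(void)
{
        struct rec table[4] = { { 0, 0 }, { 1, 1 }, { 2, 2 }, { 3, 3 } };
        void *base = table;             /* what page_address() hands back */
        unsigned long idx = 2;

        /* Element-sized arithmetic: assign to the real type first. */
        struct rec *sc = base;
        struct rec *good = sc + idx;    /* advances idx * sizeof(struct rec) bytes */

        /* Byte-sized arithmetic, as raw void * addition would do, lands mid-record. */
        struct rec *bad = (struct rec *)((char *)base + idx);

        printf("good->a = %ld, bad is only %td bytes past base\n",
               good->a, (char *)bad - (char *)base);
        return 0;
}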
diff --git a/mm/pagewalk.c b/mm/pagewalk.c index 2f5cf10ff66..aa9701e1271 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c | |||
@@ -59,7 +59,7 @@ again: | |||
59 | continue; | 59 | continue; |
60 | 60 | ||
61 | split_huge_page_pmd(walk->mm, pmd); | 61 | split_huge_page_pmd(walk->mm, pmd); |
62 | if (pmd_none_or_clear_bad(pmd)) | 62 | if (pmd_none_or_trans_huge_or_clear_bad(pmd)) |
63 | goto again; | 63 | goto again; |
64 | err = walk_pte_range(pmd, addr, next, walk); | 64 | err = walk_pte_range(pmd, addr, next, walk); |
65 | if (err) | 65 | if (err) |
diff --git a/mm/percpu-vm.c b/mm/percpu-vm.c index 12a48a88c0d..405d331804c 100644 --- a/mm/percpu-vm.c +++ b/mm/percpu-vm.c | |||
@@ -184,8 +184,7 @@ static void pcpu_unmap_pages(struct pcpu_chunk *chunk, | |||
184 | page_end - page_start); | 184 | page_end - page_start); |
185 | } | 185 | } |
186 | 186 | ||
187 | for (i = page_start; i < page_end; i++) | 187 | bitmap_clear(populated, page_start, page_end - page_start); |
188 | __clear_bit(i, populated); | ||
189 | } | 188 | } |
190 | 189 | ||
191 | /** | 190 | /** |
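The pcpu_unmap_pages() hunk above swaps a bit-at-a-time __clear_bit() loop for a single bitmap_clear() call over the same range. As a rough user-space sketch (the naive bit-at-a-time form, not the kernel's word-at-a-time implementation), clearing a contiguous run in an unsigned long array looks like this:

#include <stdio.h>
#include <limits.h>

#define BITS_PER_LONG (sizeof(unsigned long) * CHAR_BIT)

/* Clear 'len' bits starting at bit 'start' (naive, bit-at-a-time). */
static void bitmap_clear_range(unsigned long *map, unsigned int start,
                               unsigned int len)
{
        for (unsigned int bit = start; bit < start + len; bit++)
                map[bit / BITS_PER_LONG] &= ~(1UL << (bit % BITS_PER_LONG));
}

int main(void)
{
        unsigned long populated[4] = { ~0UL, ~0UL, ~0UL, ~0UL };

        bitmap_clear_range(populated, 4, 70);   /* bits 4..73, crossing a word */
        for (int i = 0; i < 4; i++)
                printf("word %d: %lx\n", i, populated[i]);
        return 0;
}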
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c index eb663fb533e..5a74fea182f 100644 --- a/mm/pgtable-generic.c +++ b/mm/pgtable-generic.c | |||
@@ -70,10 +70,11 @@ int pmdp_clear_flush_young(struct vm_area_struct *vma, | |||
70 | unsigned long address, pmd_t *pmdp) | 70 | unsigned long address, pmd_t *pmdp) |
71 | { | 71 | { |
72 | int young; | 72 | int young; |
73 | #ifndef CONFIG_TRANSPARENT_HUGEPAGE | 73 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE |
74 | VM_BUG_ON(address & ~HPAGE_PMD_MASK); | ||
75 | #else | ||
74 | BUG(); | 76 | BUG(); |
75 | #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ | 77 | #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ |
76 | VM_BUG_ON(address & ~HPAGE_PMD_MASK); | ||
77 | young = pmdp_test_and_clear_young(vma, address, pmdp); | 78 | young = pmdp_test_and_clear_young(vma, address, pmdp); |
78 | if (young) | 79 | if (young) |
79 | flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); | 80 | flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); |
diff --git a/mm/process_vm_access.c b/mm/process_vm_access.c index e920aa3ce10..c20ff48994c 100644 --- a/mm/process_vm_access.c +++ b/mm/process_vm_access.c | |||
@@ -298,23 +298,18 @@ static ssize_t process_vm_rw_core(pid_t pid, const struct iovec *lvec, | |||
298 | goto free_proc_pages; | 298 | goto free_proc_pages; |
299 | } | 299 | } |
300 | 300 | ||
301 | task_lock(task); | 301 | mm = mm_access(task, PTRACE_MODE_ATTACH); |
302 | if (__ptrace_may_access(task, PTRACE_MODE_ATTACH)) { | 302 | if (!mm || IS_ERR(mm)) { |
303 | task_unlock(task); | 303 | rc = IS_ERR(mm) ? PTR_ERR(mm) : -ESRCH; |
304 | rc = -EPERM; | 304 | /* |
305 | goto put_task_struct; | 305 | * Explicitly map EACCES to EPERM as EPERM is a more a |
306 | } | 306 | * appropriate error code for process_vw_readv/writev |
307 | mm = task->mm; | 307 | */ |
308 | 308 | if (rc == -EACCES) | |
309 | if (!mm || (task->flags & PF_KTHREAD)) { | 309 | rc = -EPERM; |
310 | task_unlock(task); | ||
311 | rc = -EINVAL; | ||
312 | goto put_task_struct; | 310 | goto put_task_struct; |
313 | } | 311 | } |
314 | 312 | ||
315 | atomic_inc(&mm->mm_users); | ||
316 | task_unlock(task); | ||
317 | |||
318 | for (i = 0; i < riovcnt && iov_l_curr_idx < liovcnt; i++) { | 313 | for (i = 0; i < riovcnt && iov_l_curr_idx < liovcnt; i++) { |
319 | rc = process_vm_rw_single_vec( | 314 | rc = process_vm_rw_single_vec( |
320 | (unsigned long)rvec[i].iov_base, rvec[i].iov_len, | 315 | (unsigned long)rvec[i].iov_base, rvec[i].iov_len, |
@@ -120,6 +120,21 @@ static void anon_vma_chain_free(struct anon_vma_chain *anon_vma_chain) | |||
120 | kmem_cache_free(anon_vma_chain_cachep, anon_vma_chain); | 120 | kmem_cache_free(anon_vma_chain_cachep, anon_vma_chain); |
121 | } | 121 | } |
122 | 122 | ||
123 | static void anon_vma_chain_link(struct vm_area_struct *vma, | ||
124 | struct anon_vma_chain *avc, | ||
125 | struct anon_vma *anon_vma) | ||
126 | { | ||
127 | avc->vma = vma; | ||
128 | avc->anon_vma = anon_vma; | ||
129 | list_add(&avc->same_vma, &vma->anon_vma_chain); | ||
130 | |||
131 | /* | ||
132 | * It's critical to add new vmas to the tail of the anon_vma, | ||
133 | * see comment in huge_memory.c:__split_huge_page(). | ||
134 | */ | ||
135 | list_add_tail(&avc->same_anon_vma, &anon_vma->head); | ||
136 | } | ||
137 | |||
123 | /** | 138 | /** |
124 | * anon_vma_prepare - attach an anon_vma to a memory region | 139 | * anon_vma_prepare - attach an anon_vma to a memory region |
125 | * @vma: the memory region in question | 140 | * @vma: the memory region in question |
@@ -175,10 +190,7 @@ int anon_vma_prepare(struct vm_area_struct *vma) | |||
175 | spin_lock(&mm->page_table_lock); | 190 | spin_lock(&mm->page_table_lock); |
176 | if (likely(!vma->anon_vma)) { | 191 | if (likely(!vma->anon_vma)) { |
177 | vma->anon_vma = anon_vma; | 192 | vma->anon_vma = anon_vma; |
178 | avc->anon_vma = anon_vma; | 193 | anon_vma_chain_link(vma, avc, anon_vma); |
179 | avc->vma = vma; | ||
180 | list_add(&avc->same_vma, &vma->anon_vma_chain); | ||
181 | list_add_tail(&avc->same_anon_vma, &anon_vma->head); | ||
182 | allocated = NULL; | 194 | allocated = NULL; |
183 | avc = NULL; | 195 | avc = NULL; |
184 | } | 196 | } |
@@ -224,21 +236,6 @@ static inline void unlock_anon_vma_root(struct anon_vma *root) | |||
224 | mutex_unlock(&root->mutex); | 236 | mutex_unlock(&root->mutex); |
225 | } | 237 | } |
226 | 238 | ||
227 | static void anon_vma_chain_link(struct vm_area_struct *vma, | ||
228 | struct anon_vma_chain *avc, | ||
229 | struct anon_vma *anon_vma) | ||
230 | { | ||
231 | avc->vma = vma; | ||
232 | avc->anon_vma = anon_vma; | ||
233 | list_add(&avc->same_vma, &vma->anon_vma_chain); | ||
234 | |||
235 | /* | ||
236 | * It's critical to add new vmas to the tail of the anon_vma, | ||
237 | * see comment in huge_memory.c:__split_huge_page(). | ||
238 | */ | ||
239 | list_add_tail(&avc->same_anon_vma, &anon_vma->head); | ||
240 | } | ||
241 | |||
242 | /* | 239 | /* |
243 | * Attach the anon_vmas from src to dst. | 240 | * Attach the anon_vmas from src to dst. |
244 | * Returns 0 on success, -ENOMEM on failure. | 241 | * Returns 0 on success, -ENOMEM on failure. |
@@ -1151,10 +1148,15 @@ void page_add_new_anon_rmap(struct page *page, | |||
1151 | */ | 1148 | */ |
1152 | void page_add_file_rmap(struct page *page) | 1149 | void page_add_file_rmap(struct page *page) |
1153 | { | 1150 | { |
1151 | bool locked; | ||
1152 | unsigned long flags; | ||
1153 | |||
1154 | mem_cgroup_begin_update_page_stat(page, &locked, &flags); | ||
1154 | if (atomic_inc_and_test(&page->_mapcount)) { | 1155 | if (atomic_inc_and_test(&page->_mapcount)) { |
1155 | __inc_zone_page_state(page, NR_FILE_MAPPED); | 1156 | __inc_zone_page_state(page, NR_FILE_MAPPED); |
1156 | mem_cgroup_inc_page_stat(page, MEMCG_NR_FILE_MAPPED); | 1157 | mem_cgroup_inc_page_stat(page, MEMCG_NR_FILE_MAPPED); |
1157 | } | 1158 | } |
1159 | mem_cgroup_end_update_page_stat(page, &locked, &flags); | ||
1158 | } | 1160 | } |
1159 | 1161 | ||
1160 | /** | 1162 | /** |
@@ -1165,9 +1167,21 @@ void page_add_file_rmap(struct page *page) | |||
1165 | */ | 1167 | */ |
1166 | void page_remove_rmap(struct page *page) | 1168 | void page_remove_rmap(struct page *page) |
1167 | { | 1169 | { |
1170 | bool anon = PageAnon(page); | ||
1171 | bool locked; | ||
1172 | unsigned long flags; | ||
1173 | |||
1174 | /* | ||
1175 | * The anon case has no mem_cgroup page_stat to update; but it may call | ||
1176 | * mem_cgroup_uncharge_page() below, where the lock ordering can deadlock | ||
1177 | * if we hold the lock across a page_stat move: so avoid it on anon. | ||
1178 | */ | ||
1179 | if (!anon) | ||
1180 | mem_cgroup_begin_update_page_stat(page, &locked, &flags); | ||
1181 | |||
1168 | /* page still mapped by someone else? */ | 1182 | /* page still mapped by someone else? */ |
1169 | if (!atomic_add_negative(-1, &page->_mapcount)) | 1183 | if (!atomic_add_negative(-1, &page->_mapcount)) |
1170 | return; | 1184 | goto out; |
1171 | 1185 | ||
1172 | /* | 1186 | /* |
1173 | * Now that the last pte has gone, s390 must transfer dirty | 1187 | * Now that the last pte has gone, s390 must transfer dirty |
@@ -1176,7 +1190,7 @@ void page_remove_rmap(struct page *page) | |||
1176 | * not if it's in swapcache - there might be another pte slot | 1190 | * not if it's in swapcache - there might be another pte slot |
1177 | * containing the swap entry, but page not yet written to swap. | 1191 | * containing the swap entry, but page not yet written to swap. |
1178 | */ | 1192 | */ |
1179 | if ((!PageAnon(page) || PageSwapCache(page)) && | 1193 | if ((!anon || PageSwapCache(page)) && |
1180 | page_test_and_clear_dirty(page_to_pfn(page), 1)) | 1194 | page_test_and_clear_dirty(page_to_pfn(page), 1)) |
1181 | set_page_dirty(page); | 1195 | set_page_dirty(page); |
1182 | /* | 1196 | /* |
@@ -1184,8 +1198,8 @@ void page_remove_rmap(struct page *page) | |||
1184 | * and not charged by memcg for now. | 1198 | * and not charged by memcg for now. |
1185 | */ | 1199 | */ |
1186 | if (unlikely(PageHuge(page))) | 1200 | if (unlikely(PageHuge(page))) |
1187 | return; | 1201 | goto out; |
1188 | if (PageAnon(page)) { | 1202 | if (anon) { |
1189 | mem_cgroup_uncharge_page(page); | 1203 | mem_cgroup_uncharge_page(page); |
1190 | if (!PageTransHuge(page)) | 1204 | if (!PageTransHuge(page)) |
1191 | __dec_zone_page_state(page, NR_ANON_PAGES); | 1205 | __dec_zone_page_state(page, NR_ANON_PAGES); |
@@ -1205,6 +1219,9 @@ void page_remove_rmap(struct page *page) | |||
1205 | * Leaving it set also helps swapoff to reinstate ptes | 1219 | * Leaving it set also helps swapoff to reinstate ptes |
1206 | * faster for those pages still in swapcache. | 1220 | * faster for those pages still in swapcache. |
1207 | */ | 1221 | */ |
1222 | out: | ||
1223 | if (!anon) | ||
1224 | mem_cgroup_end_update_page_stat(page, &locked, &flags); | ||
1208 | } | 1225 | } |
1209 | 1226 | ||
1210 | /* | 1227 | /* |
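The page_remove_rmap() hunks above take the memcg page_stat lock only for non-anon pages and route every early return through a single out: label so the unlock always matches the lock. A user-space sketch of that conditional-lock / single-exit shape, with a pthread mutex standing in for the memcg lock and made-up helper names:

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t stat_lock = PTHREAD_MUTEX_INITIALIZER;
static long file_mapped = 1;

/*
 * Conditionally taken lock, single exit point: every return path funnels
 * through 'out' so the unlock happens exactly when the lock was taken.
 */
static void remove_mapping(bool anon, bool still_mapped)
{
        bool locked = false;

        if (!anon) {                    /* only file pages need the stat lock */
                pthread_mutex_lock(&stat_lock);
                locked = true;
        }

        if (still_mapped)               /* early-out: page still mapped elsewhere */
                goto out;

        if (!anon)
                file_mapped--;          /* stat update protected by the lock */
out:
        if (locked)
                pthread_mutex_unlock(&stat_lock);
}

int main(void)
{
        remove_mapping(false, false);   /* file page, last mapping gone */
        remove_mapping(true, true);     /* anon page, early-out path */
        printf("file_mapped=%ld\n", file_mapped);
        return 0;
}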
@@ -1282,7 +1299,7 @@ int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, | |||
1282 | } | 1299 | } |
1283 | dec_mm_counter(mm, MM_ANONPAGES); | 1300 | dec_mm_counter(mm, MM_ANONPAGES); |
1284 | inc_mm_counter(mm, MM_SWAPENTS); | 1301 | inc_mm_counter(mm, MM_SWAPENTS); |
1285 | } else if (PAGE_MIGRATION) { | 1302 | } else if (IS_ENABLED(CONFIG_MIGRATION)) { |
1286 | /* | 1303 | /* |
1287 | * Store the pfn of the page in a special migration | 1304 | * Store the pfn of the page in a special migration |
1288 | * pte. do_swap_page() will wait until the migration | 1305 | * pte. do_swap_page() will wait until the migration |
@@ -1293,7 +1310,8 @@ int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, | |||
1293 | } | 1310 | } |
1294 | set_pte_at(mm, address, pte, swp_entry_to_pte(entry)); | 1311 | set_pte_at(mm, address, pte, swp_entry_to_pte(entry)); |
1295 | BUG_ON(pte_file(*pte)); | 1312 | BUG_ON(pte_file(*pte)); |
1296 | } else if (PAGE_MIGRATION && (TTU_ACTION(flags) == TTU_MIGRATION)) { | 1313 | } else if (IS_ENABLED(CONFIG_MIGRATION) && |
1314 | (TTU_ACTION(flags) == TTU_MIGRATION)) { | ||
1297 | /* Establish migration entry for a file page */ | 1315 | /* Establish migration entry for a file page */ |
1298 | swp_entry_t entry; | 1316 | swp_entry_t entry; |
1299 | entry = make_migration_entry(page, pte_write(pteval)); | 1317 | entry = make_migration_entry(page, pte_write(pteval)); |
@@ -1499,7 +1517,7 @@ static int try_to_unmap_anon(struct page *page, enum ttu_flags flags) | |||
1499 | * locking requirements of exec(), migration skips | 1517 | * locking requirements of exec(), migration skips |
1500 | * temporary VMAs until after exec() completes. | 1518 | * temporary VMAs until after exec() completes. |
1501 | */ | 1519 | */ |
1502 | if (PAGE_MIGRATION && (flags & TTU_MIGRATION) && | 1520 | if (IS_ENABLED(CONFIG_MIGRATION) && (flags & TTU_MIGRATION) && |
1503 | is_vma_temporary_stack(vma)) | 1521 | is_vma_temporary_stack(vma)) |
1504 | continue; | 1522 | continue; |
1505 | 1523 | ||
diff --git a/mm/shmem.c b/mm/shmem.c index 269d049294a..f99ff3e50bd 100644 --- a/mm/shmem.c +++ b/mm/shmem.c | |||
@@ -127,7 +127,7 @@ static inline struct shmem_sb_info *SHMEM_SB(struct super_block *sb) | |||
127 | static inline int shmem_acct_size(unsigned long flags, loff_t size) | 127 | static inline int shmem_acct_size(unsigned long flags, loff_t size) |
128 | { | 128 | { |
129 | return (flags & VM_NORESERVE) ? | 129 | return (flags & VM_NORESERVE) ? |
130 | 0 : security_vm_enough_memory_kern(VM_ACCT(size)); | 130 | 0 : security_vm_enough_memory_mm(current->mm, VM_ACCT(size)); |
131 | } | 131 | } |
132 | 132 | ||
133 | static inline void shmem_unacct_size(unsigned long flags, loff_t size) | 133 | static inline void shmem_unacct_size(unsigned long flags, loff_t size) |
@@ -145,7 +145,7 @@ static inline void shmem_unacct_size(unsigned long flags, loff_t size) | |||
145 | static inline int shmem_acct_block(unsigned long flags) | 145 | static inline int shmem_acct_block(unsigned long flags) |
146 | { | 146 | { |
147 | return (flags & VM_NORESERVE) ? | 147 | return (flags & VM_NORESERVE) ? |
148 | security_vm_enough_memory_kern(VM_ACCT(PAGE_CACHE_SIZE)) : 0; | 148 | security_vm_enough_memory_mm(current->mm, VM_ACCT(PAGE_CACHE_SIZE)) : 0; |
149 | } | 149 | } |
150 | 150 | ||
151 | static inline void shmem_unacct_blocks(unsigned long flags, long pages) | 151 | static inline void shmem_unacct_blocks(unsigned long flags, long pages) |
@@ -1178,6 +1178,12 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode | |||
1178 | static const struct inode_operations shmem_symlink_inode_operations; | 1178 | static const struct inode_operations shmem_symlink_inode_operations; |
1179 | static const struct inode_operations shmem_short_symlink_operations; | 1179 | static const struct inode_operations shmem_short_symlink_operations; |
1180 | 1180 | ||
1181 | #ifdef CONFIG_TMPFS_XATTR | ||
1182 | static int shmem_initxattrs(struct inode *, const struct xattr *, void *); | ||
1183 | #else | ||
1184 | #define shmem_initxattrs NULL | ||
1185 | #endif | ||
1186 | |||
1181 | static int | 1187 | static int |
1182 | shmem_write_begin(struct file *file, struct address_space *mapping, | 1188 | shmem_write_begin(struct file *file, struct address_space *mapping, |
1183 | loff_t pos, unsigned len, unsigned flags, | 1189 | loff_t pos, unsigned len, unsigned flags, |
@@ -1490,7 +1496,7 @@ shmem_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev) | |||
1490 | if (inode) { | 1496 | if (inode) { |
1491 | error = security_inode_init_security(inode, dir, | 1497 | error = security_inode_init_security(inode, dir, |
1492 | &dentry->d_name, | 1498 | &dentry->d_name, |
1493 | NULL, NULL); | 1499 | shmem_initxattrs, NULL); |
1494 | if (error) { | 1500 | if (error) { |
1495 | if (error != -EOPNOTSUPP) { | 1501 | if (error != -EOPNOTSUPP) { |
1496 | iput(inode); | 1502 | iput(inode); |
@@ -1630,7 +1636,7 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s | |||
1630 | return -ENOSPC; | 1636 | return -ENOSPC; |
1631 | 1637 | ||
1632 | error = security_inode_init_security(inode, dir, &dentry->d_name, | 1638 | error = security_inode_init_security(inode, dir, &dentry->d_name, |
1633 | NULL, NULL); | 1639 | shmem_initxattrs, NULL); |
1634 | if (error) { | 1640 | if (error) { |
1635 | if (error != -EOPNOTSUPP) { | 1641 | if (error != -EOPNOTSUPP) { |
1636 | iput(inode); | 1642 | iput(inode); |
@@ -1656,9 +1662,9 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s | |||
1656 | } | 1662 | } |
1657 | inode->i_mapping->a_ops = &shmem_aops; | 1663 | inode->i_mapping->a_ops = &shmem_aops; |
1658 | inode->i_op = &shmem_symlink_inode_operations; | 1664 | inode->i_op = &shmem_symlink_inode_operations; |
1659 | kaddr = kmap_atomic(page, KM_USER0); | 1665 | kaddr = kmap_atomic(page); |
1660 | memcpy(kaddr, symname, len); | 1666 | memcpy(kaddr, symname, len); |
1661 | kunmap_atomic(kaddr, KM_USER0); | 1667 | kunmap_atomic(kaddr); |
1662 | set_page_dirty(page); | 1668 | set_page_dirty(page); |
1663 | unlock_page(page); | 1669 | unlock_page(page); |
1664 | page_cache_release(page); | 1670 | page_cache_release(page); |
@@ -1704,6 +1710,66 @@ static void shmem_put_link(struct dentry *dentry, struct nameidata *nd, void *co | |||
1704 | * filesystem level, though. | 1710 | * filesystem level, though. |
1705 | */ | 1711 | */ |
1706 | 1712 | ||
1713 | /* | ||
1714 | * Allocate new xattr and copy in the value; but leave the name to callers. | ||
1715 | */ | ||
1716 | static struct shmem_xattr *shmem_xattr_alloc(const void *value, size_t size) | ||
1717 | { | ||
1718 | struct shmem_xattr *new_xattr; | ||
1719 | size_t len; | ||
1720 | |||
1721 | /* wrap around? */ | ||
1722 | len = sizeof(*new_xattr) + size; | ||
1723 | if (len <= sizeof(*new_xattr)) | ||
1724 | return NULL; | ||
1725 | |||
1726 | new_xattr = kmalloc(len, GFP_KERNEL); | ||
1727 | if (!new_xattr) | ||
1728 | return NULL; | ||
1729 | |||
1730 | new_xattr->size = size; | ||
1731 | memcpy(new_xattr->value, value, size); | ||
1732 | return new_xattr; | ||
1733 | } | ||
1734 | |||
1735 | /* | ||
1736 | * Callback for security_inode_init_security() for acquiring xattrs. | ||
1737 | */ | ||
1738 | static int shmem_initxattrs(struct inode *inode, | ||
1739 | const struct xattr *xattr_array, | ||
1740 | void *fs_info) | ||
1741 | { | ||
1742 | struct shmem_inode_info *info = SHMEM_I(inode); | ||
1743 | const struct xattr *xattr; | ||
1744 | struct shmem_xattr *new_xattr; | ||
1745 | size_t len; | ||
1746 | |||
1747 | for (xattr = xattr_array; xattr->name != NULL; xattr++) { | ||
1748 | new_xattr = shmem_xattr_alloc(xattr->value, xattr->value_len); | ||
1749 | if (!new_xattr) | ||
1750 | return -ENOMEM; | ||
1751 | |||
1752 | len = strlen(xattr->name) + 1; | ||
1753 | new_xattr->name = kmalloc(XATTR_SECURITY_PREFIX_LEN + len, | ||
1754 | GFP_KERNEL); | ||
1755 | if (!new_xattr->name) { | ||
1756 | kfree(new_xattr); | ||
1757 | return -ENOMEM; | ||
1758 | } | ||
1759 | |||
1760 | memcpy(new_xattr->name, XATTR_SECURITY_PREFIX, | ||
1761 | XATTR_SECURITY_PREFIX_LEN); | ||
1762 | memcpy(new_xattr->name + XATTR_SECURITY_PREFIX_LEN, | ||
1763 | xattr->name, len); | ||
1764 | |||
1765 | spin_lock(&info->lock); | ||
1766 | list_add(&new_xattr->list, &info->xattr_list); | ||
1767 | spin_unlock(&info->lock); | ||
1768 | } | ||
1769 | |||
1770 | return 0; | ||
1771 | } | ||
1772 | |||
1707 | static int shmem_xattr_get(struct dentry *dentry, const char *name, | 1773 | static int shmem_xattr_get(struct dentry *dentry, const char *name, |
1708 | void *buffer, size_t size) | 1774 | void *buffer, size_t size) |
1709 | { | 1775 | { |
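shmem_xattr_alloc() above sizes the allocation as header plus payload and rejects any size that would wrap the addition. A stand-alone sketch of the same wrap check, using an illustrative struct rather than the real shmem_xattr:

#include <stdlib.h>
#include <string.h>
#include <stdio.h>

struct xattr_blob {
        size_t size;
        char value[];           /* flexible array member holds the payload */
};

/* Allocate header + payload, rejecting a size that would wrap size_t. */
static struct xattr_blob *xattr_blob_alloc(const void *value, size_t size)
{
        size_t len = sizeof(struct xattr_blob) + size;

        if (len <= sizeof(struct xattr_blob))   /* wrapped (also rejects size == 0) */
                return NULL;

        struct xattr_blob *b = malloc(len);
        if (!b)
                return NULL;
        b->size = size;
        memcpy(b->value, value, size);
        return b;
}

int main(void)
{
        struct xattr_blob *ok = xattr_blob_alloc("hello", 5);
        struct xattr_blob *bad = xattr_blob_alloc("x", (size_t)-1);     /* forces wrap */

        printf("ok=%p bad=%p\n", (void *)ok, (void *)bad);
        free(ok);
        return 0;
}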
@@ -1731,24 +1797,17 @@ static int shmem_xattr_get(struct dentry *dentry, const char *name, | |||
1731 | return ret; | 1797 | return ret; |
1732 | } | 1798 | } |
1733 | 1799 | ||
1734 | static int shmem_xattr_set(struct dentry *dentry, const char *name, | 1800 | static int shmem_xattr_set(struct inode *inode, const char *name, |
1735 | const void *value, size_t size, int flags) | 1801 | const void *value, size_t size, int flags) |
1736 | { | 1802 | { |
1737 | struct inode *inode = dentry->d_inode; | ||
1738 | struct shmem_inode_info *info = SHMEM_I(inode); | 1803 | struct shmem_inode_info *info = SHMEM_I(inode); |
1739 | struct shmem_xattr *xattr; | 1804 | struct shmem_xattr *xattr; |
1740 | struct shmem_xattr *new_xattr = NULL; | 1805 | struct shmem_xattr *new_xattr = NULL; |
1741 | size_t len; | ||
1742 | int err = 0; | 1806 | int err = 0; |
1743 | 1807 | ||
1744 | /* value == NULL means remove */ | 1808 | /* value == NULL means remove */ |
1745 | if (value) { | 1809 | if (value) { |
1746 | /* wrap around? */ | 1810 | new_xattr = shmem_xattr_alloc(value, size); |
1747 | len = sizeof(*new_xattr) + size; | ||
1748 | if (len <= sizeof(*new_xattr)) | ||
1749 | return -ENOMEM; | ||
1750 | |||
1751 | new_xattr = kmalloc(len, GFP_KERNEL); | ||
1752 | if (!new_xattr) | 1811 | if (!new_xattr) |
1753 | return -ENOMEM; | 1812 | return -ENOMEM; |
1754 | 1813 | ||
@@ -1757,9 +1816,6 @@ static int shmem_xattr_set(struct dentry *dentry, const char *name, | |||
1757 | kfree(new_xattr); | 1816 | kfree(new_xattr); |
1758 | return -ENOMEM; | 1817 | return -ENOMEM; |
1759 | } | 1818 | } |
1760 | |||
1761 | new_xattr->size = size; | ||
1762 | memcpy(new_xattr->value, value, size); | ||
1763 | } | 1819 | } |
1764 | 1820 | ||
1765 | spin_lock(&info->lock); | 1821 | spin_lock(&info->lock); |
@@ -1858,7 +1914,7 @@ static int shmem_setxattr(struct dentry *dentry, const char *name, | |||
1858 | if (size == 0) | 1914 | if (size == 0) |
1859 | value = ""; /* empty EA, do not remove */ | 1915 | value = ""; /* empty EA, do not remove */ |
1860 | 1916 | ||
1861 | return shmem_xattr_set(dentry, name, value, size, flags); | 1917 | return shmem_xattr_set(dentry->d_inode, name, value, size, flags); |
1862 | 1918 | ||
1863 | } | 1919 | } |
1864 | 1920 | ||
@@ -1878,7 +1934,7 @@ static int shmem_removexattr(struct dentry *dentry, const char *name) | |||
1878 | if (err) | 1934 | if (err) |
1879 | return err; | 1935 | return err; |
1880 | 1936 | ||
1881 | return shmem_xattr_set(dentry, name, NULL, 0, XATTR_REPLACE); | 1937 | return shmem_xattr_set(dentry->d_inode, name, NULL, 0, XATTR_REPLACE); |
1882 | } | 1938 | } |
1883 | 1939 | ||
1884 | static bool xattr_is_trusted(const char *name) | 1940 | static bool xattr_is_trusted(const char *name) |
@@ -2175,7 +2231,6 @@ static void shmem_put_super(struct super_block *sb) | |||
2175 | int shmem_fill_super(struct super_block *sb, void *data, int silent) | 2231 | int shmem_fill_super(struct super_block *sb, void *data, int silent) |
2176 | { | 2232 | { |
2177 | struct inode *inode; | 2233 | struct inode *inode; |
2178 | struct dentry *root; | ||
2179 | struct shmem_sb_info *sbinfo; | 2234 | struct shmem_sb_info *sbinfo; |
2180 | int err = -ENOMEM; | 2235 | int err = -ENOMEM; |
2181 | 2236 | ||
@@ -2232,14 +2287,11 @@ int shmem_fill_super(struct super_block *sb, void *data, int silent) | |||
2232 | goto failed; | 2287 | goto failed; |
2233 | inode->i_uid = sbinfo->uid; | 2288 | inode->i_uid = sbinfo->uid; |
2234 | inode->i_gid = sbinfo->gid; | 2289 | inode->i_gid = sbinfo->gid; |
2235 | root = d_alloc_root(inode); | 2290 | sb->s_root = d_make_root(inode); |
2236 | if (!root) | 2291 | if (!sb->s_root) |
2237 | goto failed_iput; | 2292 | goto failed; |
2238 | sb->s_root = root; | ||
2239 | return 0; | 2293 | return 0; |
2240 | 2294 | ||
2241 | failed_iput: | ||
2242 | iput(inode); | ||
2243 | failed: | 2295 | failed: |
2244 | shmem_put_super(sb); | 2296 | shmem_put_super(sb); |
2245 | return err; | 2297 | return err; |
@@ -3284,12 +3284,10 @@ static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags) | |||
3284 | if (in_interrupt() || (flags & __GFP_THISNODE)) | 3284 | if (in_interrupt() || (flags & __GFP_THISNODE)) |
3285 | return NULL; | 3285 | return NULL; |
3286 | nid_alloc = nid_here = numa_mem_id(); | 3286 | nid_alloc = nid_here = numa_mem_id(); |
3287 | get_mems_allowed(); | ||
3288 | if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD)) | 3287 | if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD)) |
3289 | nid_alloc = cpuset_slab_spread_node(); | 3288 | nid_alloc = cpuset_slab_spread_node(); |
3290 | else if (current->mempolicy) | 3289 | else if (current->mempolicy) |
3291 | nid_alloc = slab_node(current->mempolicy); | 3290 | nid_alloc = slab_node(current->mempolicy); |
3292 | put_mems_allowed(); | ||
3293 | if (nid_alloc != nid_here) | 3291 | if (nid_alloc != nid_here) |
3294 | return ____cache_alloc_node(cachep, flags, nid_alloc); | 3292 | return ____cache_alloc_node(cachep, flags, nid_alloc); |
3295 | return NULL; | 3293 | return NULL; |
@@ -3312,14 +3310,17 @@ static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags) | |||
3312 | enum zone_type high_zoneidx = gfp_zone(flags); | 3310 | enum zone_type high_zoneidx = gfp_zone(flags); |
3313 | void *obj = NULL; | 3311 | void *obj = NULL; |
3314 | int nid; | 3312 | int nid; |
3313 | unsigned int cpuset_mems_cookie; | ||
3315 | 3314 | ||
3316 | if (flags & __GFP_THISNODE) | 3315 | if (flags & __GFP_THISNODE) |
3317 | return NULL; | 3316 | return NULL; |
3318 | 3317 | ||
3319 | get_mems_allowed(); | ||
3320 | zonelist = node_zonelist(slab_node(current->mempolicy), flags); | ||
3321 | local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK); | 3318 | local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK); |
3322 | 3319 | ||
3320 | retry_cpuset: | ||
3321 | cpuset_mems_cookie = get_mems_allowed(); | ||
3322 | zonelist = node_zonelist(slab_node(current->mempolicy), flags); | ||
3323 | |||
3323 | retry: | 3324 | retry: |
3324 | /* | 3325 | /* |
3325 | * Look through allowed nodes for objects available | 3326 | * Look through allowed nodes for objects available |
@@ -3372,7 +3373,9 @@ retry: | |||
3372 | } | 3373 | } |
3373 | } | 3374 | } |
3374 | } | 3375 | } |
3375 | put_mems_allowed(); | 3376 | |
3377 | if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !obj)) | ||
3378 | goto retry_cpuset; | ||
3376 | return obj; | 3379 | return obj; |
3377 | } | 3380 | } |
3378 | 3381 | ||
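The fallback_alloc() hunks above wrap the nodelist walk in a get_mems_allowed()/put_mems_allowed() cookie and retry only when the allocation failed and the cpuset's mems_allowed changed underneath. A toy, single-threaded sketch of that cookie-retry shape using a seqcount-style counter; the function names here are made up, not the kernel's cpuset API:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

/* Even = stable, odd = writer in progress: a toy seqcount. */
static atomic_uint mems_seq;
static int allowed_node = 0;            /* the "mems_allowed" being protected */

static unsigned int read_begin(void)
{
        unsigned int seq;

        do {
                seq = atomic_load(&mems_seq);
        } while (seq & 1);              /* wait out a concurrent update */
        return seq;
}

static bool read_retry(unsigned int seq)
{
        return atomic_load(&mems_seq) != seq;   /* changed underneath us? */
}

/* Returns a "node id" or -1, mimicking the retry_cpuset: loop shape. */
static int pick_node_with_retry(void)
{
        int obj;
        unsigned int cookie;

retry_cpuset:
        cookie = read_begin();          /* like get_mems_allowed() */
        obj = (allowed_node >= 0) ? allowed_node : -1;

        /* Failed *and* the set moved while we looked? Then try again. */
        if (read_retry(cookie) && obj < 0)      /* like !put_mems_allowed(cookie) */
                goto retry_cpuset;
        return obj;
}

int main(void)
{
        printf("node=%d\n", pick_node_with_retry());
        return 0;
}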
@@ -1581,6 +1581,7 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags, | |||
1581 | struct zone *zone; | 1581 | struct zone *zone; |
1582 | enum zone_type high_zoneidx = gfp_zone(flags); | 1582 | enum zone_type high_zoneidx = gfp_zone(flags); |
1583 | void *object; | 1583 | void *object; |
1584 | unsigned int cpuset_mems_cookie; | ||
1584 | 1585 | ||
1585 | /* | 1586 | /* |
1586 | * The defrag ratio allows a configuration of the tradeoffs between | 1587 | * The defrag ratio allows a configuration of the tradeoffs between |
@@ -1604,23 +1605,32 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags, | |||
1604 | get_cycles() % 1024 > s->remote_node_defrag_ratio) | 1605 | get_cycles() % 1024 > s->remote_node_defrag_ratio) |
1605 | return NULL; | 1606 | return NULL; |
1606 | 1607 | ||
1607 | get_mems_allowed(); | 1608 | do { |
1608 | zonelist = node_zonelist(slab_node(current->mempolicy), flags); | 1609 | cpuset_mems_cookie = get_mems_allowed(); |
1609 | for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { | 1610 | zonelist = node_zonelist(slab_node(current->mempolicy), flags); |
1610 | struct kmem_cache_node *n; | 1611 | for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { |
1611 | 1612 | struct kmem_cache_node *n; | |
1612 | n = get_node(s, zone_to_nid(zone)); | 1613 | |
1613 | 1614 | n = get_node(s, zone_to_nid(zone)); | |
1614 | if (n && cpuset_zone_allowed_hardwall(zone, flags) && | 1615 | |
1615 | n->nr_partial > s->min_partial) { | 1616 | if (n && cpuset_zone_allowed_hardwall(zone, flags) && |
1616 | object = get_partial_node(s, n, c); | 1617 | n->nr_partial > s->min_partial) { |
1617 | if (object) { | 1618 | object = get_partial_node(s, n, c); |
1618 | put_mems_allowed(); | 1619 | if (object) { |
1619 | return object; | 1620 | /* |
1621 | * Return the object even if | ||
1622 | * put_mems_allowed indicated that | ||
1623 | * the cpuset mems_allowed was | ||
1624 | * updated in parallel. It's a | ||
1625 | * harmless race between the alloc | ||
1626 | * and the cpuset update. | ||
1627 | */ | ||
1628 | put_mems_allowed(cpuset_mems_cookie); | ||
1629 | return object; | ||
1630 | } | ||
1620 | } | 1631 | } |
1621 | } | 1632 | } |
1622 | } | 1633 | } while (!put_mems_allowed(cpuset_mems_cookie)); |
1623 | put_mems_allowed(); | ||
1624 | #endif | 1634 | #endif |
1625 | return NULL; | 1635 | return NULL; |
1626 | } | 1636 | } |
diff --git a/mm/sparse.c b/mm/sparse.c index 61d7cde2311..a8bc7d364de 100644 --- a/mm/sparse.c +++ b/mm/sparse.c | |||
@@ -353,29 +353,21 @@ static void __init sparse_early_usemaps_alloc_node(unsigned long**usemap_map, | |||
353 | 353 | ||
354 | usemap = sparse_early_usemaps_alloc_pgdat_section(NODE_DATA(nodeid), | 354 | usemap = sparse_early_usemaps_alloc_pgdat_section(NODE_DATA(nodeid), |
355 | usemap_count); | 355 | usemap_count); |
356 | if (usemap) { | 356 | if (!usemap) { |
357 | for (pnum = pnum_begin; pnum < pnum_end; pnum++) { | 357 | usemap = alloc_bootmem_node(NODE_DATA(nodeid), size * usemap_count); |
358 | if (!present_section_nr(pnum)) | 358 | if (!usemap) { |
359 | continue; | 359 | printk(KERN_WARNING "%s: allocation failed\n", __func__); |
360 | usemap_map[pnum] = usemap; | 360 | return; |
361 | usemap += size; | ||
362 | } | 361 | } |
363 | return; | ||
364 | } | 362 | } |
365 | 363 | ||
366 | usemap = alloc_bootmem_node(NODE_DATA(nodeid), size * usemap_count); | 364 | for (pnum = pnum_begin; pnum < pnum_end; pnum++) { |
367 | if (usemap) { | 365 | if (!present_section_nr(pnum)) |
368 | for (pnum = pnum_begin; pnum < pnum_end; pnum++) { | 366 | continue; |
369 | if (!present_section_nr(pnum)) | 367 | usemap_map[pnum] = usemap; |
370 | continue; | 368 | usemap += size; |
371 | usemap_map[pnum] = usemap; | 369 | check_usemap_section_nr(nodeid, usemap_map[pnum]); |
372 | usemap += size; | ||
373 | check_usemap_section_nr(nodeid, usemap_map[pnum]); | ||
374 | } | ||
375 | return; | ||
376 | } | 370 | } |
377 | |||
378 | printk(KERN_WARNING "%s: allocation failed\n", __func__); | ||
379 | } | 371 | } |
380 | 372 | ||
381 | #ifndef CONFIG_SPARSEMEM_VMEMMAP | 373 | #ifndef CONFIG_SPARSEMEM_VMEMMAP |
@@ -496,7 +496,7 @@ static void lru_deactivate_fn(struct page *page, void *arg) | |||
496 | * Either "cpu" is the current CPU, and preemption has already been | 496 | * Either "cpu" is the current CPU, and preemption has already been |
497 | * disabled; or "cpu" is being hot-unplugged, and is already dead. | 497 | * disabled; or "cpu" is being hot-unplugged, and is already dead. |
498 | */ | 498 | */ |
499 | static void drain_cpu_pagevecs(int cpu) | 499 | void lru_add_drain_cpu(int cpu) |
500 | { | 500 | { |
501 | struct pagevec *pvecs = per_cpu(lru_add_pvecs, cpu); | 501 | struct pagevec *pvecs = per_cpu(lru_add_pvecs, cpu); |
502 | struct pagevec *pvec; | 502 | struct pagevec *pvec; |
@@ -553,7 +553,7 @@ void deactivate_page(struct page *page) | |||
553 | 553 | ||
554 | void lru_add_drain(void) | 554 | void lru_add_drain(void) |
555 | { | 555 | { |
556 | drain_cpu_pagevecs(get_cpu()); | 556 | lru_add_drain_cpu(get_cpu()); |
557 | put_cpu(); | 557 | put_cpu(); |
558 | } | 558 | } |
559 | 559 | ||
@@ -652,14 +652,14 @@ EXPORT_SYMBOL(__pagevec_release); | |||
652 | void lru_add_page_tail(struct zone* zone, | 652 | void lru_add_page_tail(struct zone* zone, |
653 | struct page *page, struct page *page_tail) | 653 | struct page *page, struct page *page_tail) |
654 | { | 654 | { |
655 | int active; | 655 | int uninitialized_var(active); |
656 | enum lru_list lru; | 656 | enum lru_list lru; |
657 | const int file = 0; | 657 | const int file = 0; |
658 | 658 | ||
659 | VM_BUG_ON(!PageHead(page)); | 659 | VM_BUG_ON(!PageHead(page)); |
660 | VM_BUG_ON(PageCompound(page_tail)); | 660 | VM_BUG_ON(PageCompound(page_tail)); |
661 | VM_BUG_ON(PageLRU(page_tail)); | 661 | VM_BUG_ON(PageLRU(page_tail)); |
662 | VM_BUG_ON(!spin_is_locked(&zone->lru_lock)); | 662 | VM_BUG_ON(NR_CPUS != 1 && !spin_is_locked(&zone->lru_lock)); |
663 | 663 | ||
664 | SetPageLRU(page_tail); | 664 | SetPageLRU(page_tail); |
665 | 665 | ||
@@ -672,7 +672,6 @@ void lru_add_page_tail(struct zone* zone, | |||
672 | active = 0; | 672 | active = 0; |
673 | lru = LRU_INACTIVE_ANON; | 673 | lru = LRU_INACTIVE_ANON; |
674 | } | 674 | } |
675 | update_page_reclaim_stat(zone, page_tail, file, active); | ||
676 | } else { | 675 | } else { |
677 | SetPageUnevictable(page_tail); | 676 | SetPageUnevictable(page_tail); |
678 | lru = LRU_UNEVICTABLE; | 677 | lru = LRU_UNEVICTABLE; |
@@ -693,6 +692,9 @@ void lru_add_page_tail(struct zone* zone, | |||
693 | list_head = page_tail->lru.prev; | 692 | list_head = page_tail->lru.prev; |
694 | list_move_tail(&page_tail->lru, list_head); | 693 | list_move_tail(&page_tail->lru, list_head); |
695 | } | 694 | } |
695 | |||
696 | if (!PageUnevictable(page)) | ||
697 | update_page_reclaim_stat(zone, page_tail, file, active); | ||
696 | } | 698 | } |
697 | #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ | 699 | #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ |
698 | 700 | ||
@@ -710,8 +712,8 @@ static void __pagevec_lru_add_fn(struct page *page, void *arg) | |||
710 | SetPageLRU(page); | 712 | SetPageLRU(page); |
711 | if (active) | 713 | if (active) |
712 | SetPageActive(page); | 714 | SetPageActive(page); |
713 | update_page_reclaim_stat(zone, page, file, active); | ||
714 | add_page_to_lru_list(zone, page, lru); | 715 | add_page_to_lru_list(zone, page, lru); |
716 | update_page_reclaim_stat(zone, page, file, active); | ||
715 | } | 717 | } |
716 | 718 | ||
717 | /* | 719 | /* |
diff --git a/mm/swap_state.c b/mm/swap_state.c index 470038a9187..9d3dd3763cf 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c | |||
@@ -300,16 +300,6 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, | |||
300 | new_page = alloc_page_vma(gfp_mask, vma, addr); | 300 | new_page = alloc_page_vma(gfp_mask, vma, addr); |
301 | if (!new_page) | 301 | if (!new_page) |
302 | break; /* Out of memory */ | 302 | break; /* Out of memory */ |
303 | /* | ||
304 | * The memcg-specific accounting when moving | ||
305 | * pages around the LRU lists relies on the | ||
306 | * page's owner (memcg) to be valid. Usually, | ||
307 | * pages are assigned to a new owner before | ||
308 | * being put on the LRU list, but since this | ||
309 | * is not the case here, the stale owner from | ||
310 | * a previous allocation cycle must be reset. | ||
311 | */ | ||
312 | mem_cgroup_reset_owner(new_page); | ||
313 | } | 303 | } |
314 | 304 | ||
315 | /* | 305 | /* |
@@ -382,25 +372,23 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, | |||
382 | struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask, | 372 | struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask, |
383 | struct vm_area_struct *vma, unsigned long addr) | 373 | struct vm_area_struct *vma, unsigned long addr) |
384 | { | 374 | { |
385 | int nr_pages; | ||
386 | struct page *page; | 375 | struct page *page; |
387 | unsigned long offset; | 376 | unsigned long offset = swp_offset(entry); |
388 | unsigned long end_offset; | 377 | unsigned long start_offset, end_offset; |
378 | unsigned long mask = (1UL << page_cluster) - 1; | ||
389 | 379 | ||
390 | /* | 380 | /* Read a page_cluster sized and aligned cluster around offset. */ |
391 | * Get starting offset for readaround, and number of pages to read. | 381 | start_offset = offset & ~mask; |
392 | * Adjust starting address by readbehind (for NUMA interleave case)? | 382 | end_offset = offset | mask; |
393 | * No, it's very unlikely that swap layout would follow vma layout, | 383 | if (!start_offset) /* First page is swap header. */ |
394 | * more likely that neighbouring swap pages came from the same node: | 384 | start_offset++; |
395 | * so use the same "addr" to choose the same node for each swap read. | 385 | |
396 | */ | 386 | for (offset = start_offset; offset <= end_offset ; offset++) { |
397 | nr_pages = valid_swaphandles(entry, &offset); | ||
398 | for (end_offset = offset + nr_pages; offset < end_offset; offset++) { | ||
399 | /* Ok, do the async read-ahead now */ | 387 | /* Ok, do the async read-ahead now */ |
400 | page = read_swap_cache_async(swp_entry(swp_type(entry), offset), | 388 | page = read_swap_cache_async(swp_entry(swp_type(entry), offset), |
401 | gfp_mask, vma, addr); | 389 | gfp_mask, vma, addr); |
402 | if (!page) | 390 | if (!page) |
403 | break; | 391 | continue; |
404 | page_cache_release(page); | 392 | page_cache_release(page); |
405 | } | 393 | } |
406 | lru_add_drain(); /* Push any new pages onto the LRU now */ | 394 | lru_add_drain(); /* Push any new pages onto the LRU now */ |
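The swapin_readahead() rewrite above derives the readahead window purely from bit masks: round the faulting offset down to its page_cluster-sized cluster, read to the end of that cluster, and skip slot 0 (the swap header). The arithmetic in isolation:

#include <stdio.h>

/*
 * Compute the page_cluster-aligned readahead window around 'offset':
 * start is offset rounded down to the cluster, end is the last slot in
 * that cluster, and slot 0 is skipped because it holds the swap header.
 */
static void cluster_window(unsigned long offset, unsigned int page_cluster,
                           unsigned long *start, unsigned long *end)
{
        unsigned long mask = (1UL << page_cluster) - 1;

        *start = offset & ~mask;        /* round down to cluster boundary */
        *end   = offset | mask;         /* last offset of the same cluster */
        if (!*start)                    /* first slot is the swap header */
                (*start)++;
}

int main(void)
{
        unsigned long start, end;

        cluster_window(37, 3, &start, &end);    /* 8-slot cluster around 37 */
        printf("37 -> [%lu, %lu]\n", start, end);       /* expect [32, 39] */

        cluster_window(5, 3, &start, &end);     /* cluster containing the header */
        printf(" 5 -> [%lu, %lu]\n", start, end);       /* expect [1, 7] */
        return 0;
}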
diff --git a/mm/swapfile.c b/mm/swapfile.c index d999f090dfd..dae42f380d6 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c | |||
@@ -932,9 +932,7 @@ static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud, | |||
932 | pmd = pmd_offset(pud, addr); | 932 | pmd = pmd_offset(pud, addr); |
933 | do { | 933 | do { |
934 | next = pmd_addr_end(addr, end); | 934 | next = pmd_addr_end(addr, end); |
935 | if (unlikely(pmd_trans_huge(*pmd))) | 935 | if (pmd_none_or_trans_huge_or_clear_bad(pmd)) |
936 | continue; | ||
937 | if (pmd_none_or_clear_bad(pmd)) | ||
938 | continue; | 936 | continue; |
939 | ret = unuse_pte_range(vma, pmd, addr, next, entry, page); | 937 | ret = unuse_pte_range(vma, pmd, addr, next, entry, page); |
940 | if (ret) | 938 | if (ret) |
@@ -1563,6 +1561,8 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) | |||
1563 | if (!capable(CAP_SYS_ADMIN)) | 1561 | if (!capable(CAP_SYS_ADMIN)) |
1564 | return -EPERM; | 1562 | return -EPERM; |
1565 | 1563 | ||
1564 | BUG_ON(!current->mm); | ||
1565 | |||
1566 | pathname = getname(specialfile); | 1566 | pathname = getname(specialfile); |
1567 | err = PTR_ERR(pathname); | 1567 | err = PTR_ERR(pathname); |
1568 | if (IS_ERR(pathname)) | 1568 | if (IS_ERR(pathname)) |
@@ -1590,7 +1590,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) | |||
1590 | spin_unlock(&swap_lock); | 1590 | spin_unlock(&swap_lock); |
1591 | goto out_dput; | 1591 | goto out_dput; |
1592 | } | 1592 | } |
1593 | if (!security_vm_enough_memory(p->pages)) | 1593 | if (!security_vm_enough_memory_mm(current->mm, p->pages)) |
1594 | vm_unacct_memory(p->pages); | 1594 | vm_unacct_memory(p->pages); |
1595 | else { | 1595 | else { |
1596 | err = -ENOMEM; | 1596 | err = -ENOMEM; |
@@ -2105,7 +2105,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) | |||
2105 | p->flags |= SWP_SOLIDSTATE; | 2105 | p->flags |= SWP_SOLIDSTATE; |
2106 | p->cluster_next = 1 + (random32() % p->highest_bit); | 2106 | p->cluster_next = 1 + (random32() % p->highest_bit); |
2107 | } | 2107 | } |
2108 | if (discard_swap(p) == 0 && (swap_flags & SWAP_FLAG_DISCARD)) | 2108 | if ((swap_flags & SWAP_FLAG_DISCARD) && discard_swap(p) == 0) |
2109 | p->flags |= SWP_DISCARDABLE; | 2109 | p->flags |= SWP_DISCARDABLE; |
2110 | } | 2110 | } |
2111 | 2111 | ||
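The swapon hunk above reorders the test so the cheap flag check comes first and short-circuit evaluation skips discard_swap() entirely when SWAP_FLAG_DISCARD was not requested. A small stand-alone illustration of that operand ordering, with made-up names:

#include <stdio.h>

static int probes;

/* Stand-in for an expensive capability probe like discard_swap(). */
static int probe_discard(void)
{
        probes++;
        return 0;               /* 0 == supported, mirroring the diff's test */
}

#define FLAG_DISCARD 0x1

static int discardable(unsigned int flags)
{
        /* Cheap flag test first: the probe only runs when it can matter. */
        return (flags & FLAG_DISCARD) && probe_discard() == 0;
}

int main(void)
{
        printf("no flag: %d (probes=%d)\n", discardable(0), probes);
        printf("flagged: %d (probes=%d)\n", discardable(FLAG_DISCARD), probes);
        return 0;
}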
@@ -2290,58 +2290,6 @@ int swapcache_prepare(swp_entry_t entry) | |||
2290 | } | 2290 | } |
2291 | 2291 | ||
2292 | /* | 2292 | /* |
2293 | * swap_lock prevents swap_map being freed. Don't grab an extra | ||
2294 | * reference on the swaphandle, it doesn't matter if it becomes unused. | ||
2295 | */ | ||
2296 | int valid_swaphandles(swp_entry_t entry, unsigned long *offset) | ||
2297 | { | ||
2298 | struct swap_info_struct *si; | ||
2299 | int our_page_cluster = page_cluster; | ||
2300 | pgoff_t target, toff; | ||
2301 | pgoff_t base, end; | ||
2302 | int nr_pages = 0; | ||
2303 | |||
2304 | if (!our_page_cluster) /* no readahead */ | ||
2305 | return 0; | ||
2306 | |||
2307 | si = swap_info[swp_type(entry)]; | ||
2308 | target = swp_offset(entry); | ||
2309 | base = (target >> our_page_cluster) << our_page_cluster; | ||
2310 | end = base + (1 << our_page_cluster); | ||
2311 | if (!base) /* first page is swap header */ | ||
2312 | base++; | ||
2313 | |||
2314 | spin_lock(&swap_lock); | ||
2315 | if (end > si->max) /* don't go beyond end of map */ | ||
2316 | end = si->max; | ||
2317 | |||
2318 | /* Count contiguous allocated slots above our target */ | ||
2319 | for (toff = target; ++toff < end; nr_pages++) { | ||
2320 | /* Don't read in free or bad pages */ | ||
2321 | if (!si->swap_map[toff]) | ||
2322 | break; | ||
2323 | if (swap_count(si->swap_map[toff]) == SWAP_MAP_BAD) | ||
2324 | break; | ||
2325 | } | ||
2326 | /* Count contiguous allocated slots below our target */ | ||
2327 | for (toff = target; --toff >= base; nr_pages++) { | ||
2328 | /* Don't read in free or bad pages */ | ||
2329 | if (!si->swap_map[toff]) | ||
2330 | break; | ||
2331 | if (swap_count(si->swap_map[toff]) == SWAP_MAP_BAD) | ||
2332 | break; | ||
2333 | } | ||
2334 | spin_unlock(&swap_lock); | ||
2335 | |||
2336 | /* | ||
2337 | * Indicate starting offset, and return number of pages to get: | ||
2338 | * if only 1, say 0, since there's then no readahead to be done. | ||
2339 | */ | ||
2340 | *offset = ++toff; | ||
2341 | return nr_pages? ++nr_pages: 0; | ||
2342 | } | ||
2343 | |||
2344 | /* | ||
2345 | * add_swap_count_continuation - called when a swap count is duplicated | 2293 | * add_swap_count_continuation - called when a swap count is duplicated |
2346 | * beyond SWAP_MAP_MAX, it allocates a new page and links that to the entry's | 2294 | * beyond SWAP_MAP_MAX, it allocates a new page and links that to the entry's |
2347 | * page of the original vmalloc'ed swap_map, to hold the continuation count | 2295 | * page of the original vmalloc'ed swap_map, to hold the continuation count |
@@ -2427,9 +2375,9 @@ int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask) | |||
2427 | if (!(count & COUNT_CONTINUED)) | 2375 | if (!(count & COUNT_CONTINUED)) |
2428 | goto out; | 2376 | goto out; |
2429 | 2377 | ||
2430 | map = kmap_atomic(list_page, KM_USER0) + offset; | 2378 | map = kmap_atomic(list_page) + offset; |
2431 | count = *map; | 2379 | count = *map; |
2432 | kunmap_atomic(map, KM_USER0); | 2380 | kunmap_atomic(map); |
2433 | 2381 | ||
2434 | /* | 2382 | /* |
2435 | * If this continuation count now has some space in it, | 2383 | * If this continuation count now has some space in it, |
@@ -2472,7 +2420,7 @@ static bool swap_count_continued(struct swap_info_struct *si, | |||
2472 | 2420 | ||
2473 | offset &= ~PAGE_MASK; | 2421 | offset &= ~PAGE_MASK; |
2474 | page = list_entry(head->lru.next, struct page, lru); | 2422 | page = list_entry(head->lru.next, struct page, lru); |
2475 | map = kmap_atomic(page, KM_USER0) + offset; | 2423 | map = kmap_atomic(page) + offset; |
2476 | 2424 | ||
2477 | if (count == SWAP_MAP_MAX) /* initial increment from swap_map */ | 2425 | if (count == SWAP_MAP_MAX) /* initial increment from swap_map */ |
2478 | goto init_map; /* jump over SWAP_CONT_MAX checks */ | 2426 | goto init_map; /* jump over SWAP_CONT_MAX checks */ |
@@ -2482,26 +2430,26 @@ static bool swap_count_continued(struct swap_info_struct *si, | |||
2482 | * Think of how you add 1 to 999 | 2430 | * Think of how you add 1 to 999 |
2483 | */ | 2431 | */ |
2484 | while (*map == (SWAP_CONT_MAX | COUNT_CONTINUED)) { | 2432 | while (*map == (SWAP_CONT_MAX | COUNT_CONTINUED)) { |
2485 | kunmap_atomic(map, KM_USER0); | 2433 | kunmap_atomic(map); |
2486 | page = list_entry(page->lru.next, struct page, lru); | 2434 | page = list_entry(page->lru.next, struct page, lru); |
2487 | BUG_ON(page == head); | 2435 | BUG_ON(page == head); |
2488 | map = kmap_atomic(page, KM_USER0) + offset; | 2436 | map = kmap_atomic(page) + offset; |
2489 | } | 2437 | } |
2490 | if (*map == SWAP_CONT_MAX) { | 2438 | if (*map == SWAP_CONT_MAX) { |
2491 | kunmap_atomic(map, KM_USER0); | 2439 | kunmap_atomic(map); |
2492 | page = list_entry(page->lru.next, struct page, lru); | 2440 | page = list_entry(page->lru.next, struct page, lru); |
2493 | if (page == head) | 2441 | if (page == head) |
2494 | return false; /* add count continuation */ | 2442 | return false; /* add count continuation */ |
2495 | map = kmap_atomic(page, KM_USER0) + offset; | 2443 | map = kmap_atomic(page) + offset; |
2496 | init_map: *map = 0; /* we didn't zero the page */ | 2444 | init_map: *map = 0; /* we didn't zero the page */ |
2497 | } | 2445 | } |
2498 | *map += 1; | 2446 | *map += 1; |
2499 | kunmap_atomic(map, KM_USER0); | 2447 | kunmap_atomic(map); |
2500 | page = list_entry(page->lru.prev, struct page, lru); | 2448 | page = list_entry(page->lru.prev, struct page, lru); |
2501 | while (page != head) { | 2449 | while (page != head) { |
2502 | map = kmap_atomic(page, KM_USER0) + offset; | 2450 | map = kmap_atomic(page) + offset; |
2503 | *map = COUNT_CONTINUED; | 2451 | *map = COUNT_CONTINUED; |
2504 | kunmap_atomic(map, KM_USER0); | 2452 | kunmap_atomic(map); |
2505 | page = list_entry(page->lru.prev, struct page, lru); | 2453 | page = list_entry(page->lru.prev, struct page, lru); |
2506 | } | 2454 | } |
2507 | return true; /* incremented */ | 2455 | return true; /* incremented */ |
@@ -2512,22 +2460,22 @@ init_map: *map = 0; /* we didn't zero the page */ | |||
2512 | */ | 2460 | */ |
2513 | BUG_ON(count != COUNT_CONTINUED); | 2461 | BUG_ON(count != COUNT_CONTINUED); |
2514 | while (*map == COUNT_CONTINUED) { | 2462 | while (*map == COUNT_CONTINUED) { |
2515 | kunmap_atomic(map, KM_USER0); | 2463 | kunmap_atomic(map); |
2516 | page = list_entry(page->lru.next, struct page, lru); | 2464 | page = list_entry(page->lru.next, struct page, lru); |
2517 | BUG_ON(page == head); | 2465 | BUG_ON(page == head); |
2518 | map = kmap_atomic(page, KM_USER0) + offset; | 2466 | map = kmap_atomic(page) + offset; |
2519 | } | 2467 | } |
2520 | BUG_ON(*map == 0); | 2468 | BUG_ON(*map == 0); |
2521 | *map -= 1; | 2469 | *map -= 1; |
2522 | if (*map == 0) | 2470 | if (*map == 0) |
2523 | count = 0; | 2471 | count = 0; |
2524 | kunmap_atomic(map, KM_USER0); | 2472 | kunmap_atomic(map); |
2525 | page = list_entry(page->lru.prev, struct page, lru); | 2473 | page = list_entry(page->lru.prev, struct page, lru); |
2526 | while (page != head) { | 2474 | while (page != head) { |
2527 | map = kmap_atomic(page, KM_USER0) + offset; | 2475 | map = kmap_atomic(page) + offset; |
2528 | *map = SWAP_CONT_MAX | count; | 2476 | *map = SWAP_CONT_MAX | count; |
2529 | count = COUNT_CONTINUED; | 2477 | count = COUNT_CONTINUED; |
2530 | kunmap_atomic(map, KM_USER0); | 2478 | kunmap_atomic(map); |
2531 | page = list_entry(page->lru.prev, struct page, lru); | 2479 | page = list_entry(page->lru.prev, struct page, lru); |
2532 | } | 2480 | } |
2533 | return count == COUNT_CONTINUED; | 2481 | return count == COUNT_CONTINUED; |
diff --git a/mm/truncate.c b/mm/truncate.c index 632b15e29f7..18aded3a89f 100644 --- a/mm/truncate.c +++ b/mm/truncate.c | |||
@@ -52,7 +52,7 @@ void do_invalidatepage(struct page *page, unsigned long offset) | |||
52 | static inline void truncate_partial_page(struct page *page, unsigned partial) | 52 | static inline void truncate_partial_page(struct page *page, unsigned partial) |
53 | { | 53 | { |
54 | zero_user_segment(page, partial, PAGE_CACHE_SIZE); | 54 | zero_user_segment(page, partial, PAGE_CACHE_SIZE); |
55 | cleancache_flush_page(page->mapping, page); | 55 | cleancache_invalidate_page(page->mapping, page); |
56 | if (page_has_private(page)) | 56 | if (page_has_private(page)) |
57 | do_invalidatepage(page, partial); | 57 | do_invalidatepage(page, partial); |
58 | } | 58 | } |
@@ -184,7 +184,7 @@ int invalidate_inode_page(struct page *page) | |||
184 | } | 184 | } |
185 | 185 | ||
186 | /** | 186 | /** |
187 | * truncate_inode_pages - truncate range of pages specified by start & end byte offsets | 187 | * truncate_inode_pages_range - truncate range of pages specified by start & end byte offsets |
188 | * @mapping: mapping to truncate | 188 | * @mapping: mapping to truncate |
189 | * @lstart: offset from which to truncate | 189 | * @lstart: offset from which to truncate |
190 | * @lend: offset to which to truncate | 190 | * @lend: offset to which to truncate |
@@ -213,7 +213,7 @@ void truncate_inode_pages_range(struct address_space *mapping, | |||
213 | pgoff_t end; | 213 | pgoff_t end; |
214 | int i; | 214 | int i; |
215 | 215 | ||
216 | cleancache_flush_inode(mapping); | 216 | cleancache_invalidate_inode(mapping); |
217 | if (mapping->nrpages == 0) | 217 | if (mapping->nrpages == 0) |
218 | return; | 218 | return; |
219 | 219 | ||
@@ -292,7 +292,7 @@ void truncate_inode_pages_range(struct address_space *mapping, | |||
292 | mem_cgroup_uncharge_end(); | 292 | mem_cgroup_uncharge_end(); |
293 | index++; | 293 | index++; |
294 | } | 294 | } |
295 | cleancache_flush_inode(mapping); | 295 | cleancache_invalidate_inode(mapping); |
296 | } | 296 | } |
297 | EXPORT_SYMBOL(truncate_inode_pages_range); | 297 | EXPORT_SYMBOL(truncate_inode_pages_range); |
298 | 298 | ||
@@ -444,7 +444,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping, | |||
444 | int ret2 = 0; | 444 | int ret2 = 0; |
445 | int did_range_unmap = 0; | 445 | int did_range_unmap = 0; |
446 | 446 | ||
447 | cleancache_flush_inode(mapping); | 447 | cleancache_invalidate_inode(mapping); |
448 | pagevec_init(&pvec, 0); | 448 | pagevec_init(&pvec, 0); |
449 | index = start; | 449 | index = start; |
450 | while (index <= end && pagevec_lookup(&pvec, mapping, index, | 450 | while (index <= end && pagevec_lookup(&pvec, mapping, index, |
@@ -500,7 +500,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping, | |||
500 | cond_resched(); | 500 | cond_resched(); |
501 | index++; | 501 | index++; |
502 | } | 502 | } |
503 | cleancache_flush_inode(mapping); | 503 | cleancache_invalidate_inode(mapping); |
504 | return ret; | 504 | return ret; |
505 | } | 505 | } |
506 | EXPORT_SYMBOL_GPL(invalidate_inode_pages2_range); | 506 | EXPORT_SYMBOL_GPL(invalidate_inode_pages2_range); |
@@ -239,6 +239,47 @@ void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma, | |||
239 | next->vm_prev = vma; | 239 | next->vm_prev = vma; |
240 | } | 240 | } |
241 | 241 | ||
242 | /* Check if the vma is being used as a stack by this task */ | ||
243 | static int vm_is_stack_for_task(struct task_struct *t, | ||
244 | struct vm_area_struct *vma) | ||
245 | { | ||
246 | return (vma->vm_start <= KSTK_ESP(t) && vma->vm_end >= KSTK_ESP(t)); | ||
247 | } | ||
248 | |||
249 | /* | ||
250 | * Check if the vma is being used as a stack. | ||
251 | * If in_group is non-zero, check the entire thread group; otherwise | ||
252 | * just check the current task. Returns the pid of the task that the | ||
253 | * vma is a stack for, or 0 if it is not. | ||
254 | */ | ||
255 | pid_t vm_is_stack(struct task_struct *task, | ||
256 | struct vm_area_struct *vma, int in_group) | ||
257 | { | ||
258 | pid_t ret = 0; | ||
259 | |||
260 | if (vm_is_stack_for_task(task, vma)) | ||
261 | return task->pid; | ||
262 | |||
263 | if (in_group) { | ||
264 | struct task_struct *t; | ||
265 | rcu_read_lock(); | ||
266 | if (!pid_alive(task)) | ||
267 | goto done; | ||
268 | |||
269 | t = task; | ||
270 | do { | ||
271 | if (vm_is_stack_for_task(t, vma)) { | ||
272 | ret = t->pid; | ||
273 | goto done; | ||
274 | } | ||
275 | } while_each_thread(task, t); | ||
276 | done: | ||
277 | rcu_read_unlock(); | ||
278 | } | ||
279 | |||
280 | return ret; | ||
281 | } | ||
282 | |||
242 | #if defined(CONFIG_MMU) && !defined(HAVE_ARCH_PICK_MMAP_LAYOUT) | 283 | #if defined(CONFIG_MMU) && !defined(HAVE_ARCH_PICK_MMAP_LAYOUT) |
243 | void arch_pick_mmap_layout(struct mm_struct *mm) | 284 | void arch_pick_mmap_layout(struct mm_struct *mm) |
244 | { | 285 | { |
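vm_is_stack() above treats a vma as a task's stack when that task's saved stack pointer falls inside [vm_start, vm_end], optionally walking the whole thread group. A user-space sketch of the same range test over a list of candidate tasks (pids and addresses are invented, and there is no RCU here):

#include <stdio.h>

struct region { unsigned long start, end; };
struct task   { int pid; unsigned long sp; };

/* A region "is a stack" for a task when that task's SP lies inside it. */
static int region_is_stack_for(const struct task *t, const struct region *r)
{
        return r->start <= t->sp && t->sp <= r->end;
}

/* Return the pid whose stack pointer lands in the region, or 0 for none. */
static int region_stack_owner(const struct region *r,
                              const struct task *group, int nr)
{
        for (int i = 0; i < nr; i++)
                if (region_is_stack_for(&group[i], r))
                        return group[i].pid;
        return 0;
}

int main(void)
{
        struct region vma = { 0x7000, 0x8000 };
        struct task group[] = {
                { 101, 0x4200 },        /* SP outside the region */
                { 102, 0x7abc },        /* SP inside: this thread owns it */
        };

        printf("owner pid=%d\n", region_stack_owner(&vma, group, 2));
        return 0;
}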
diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 86ce9a526c1..94dff883b44 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c | |||
@@ -1906,9 +1906,9 @@ static int aligned_vread(char *buf, char *addr, unsigned long count) | |||
1906 | * we can expect USER0 is not used (see vread/vwrite's | 1906 | * we can expect USER0 is not used (see vread/vwrite's |
1907 | * function description) | 1907 | * function description) |
1908 | */ | 1908 | */ |
1909 | void *map = kmap_atomic(p, KM_USER0); | 1909 | void *map = kmap_atomic(p); |
1910 | memcpy(buf, map + offset, length); | 1910 | memcpy(buf, map + offset, length); |
1911 | kunmap_atomic(map, KM_USER0); | 1911 | kunmap_atomic(map); |
1912 | } else | 1912 | } else |
1913 | memset(buf, 0, length); | 1913 | memset(buf, 0, length); |
1914 | 1914 | ||
@@ -1945,9 +1945,9 @@ static int aligned_vwrite(char *buf, char *addr, unsigned long count) | |||
1945 | * we can expect USER0 is not used (see vread/vwrite's | 1945 | * we can expect USER0 is not used (see vread/vwrite's |
1946 | * function description) | 1946 | * function description) |
1947 | */ | 1947 | */ |
1948 | void *map = kmap_atomic(p, KM_USER0); | 1948 | void *map = kmap_atomic(p); |
1949 | memcpy(map + offset, buf, length); | 1949 | memcpy(map + offset, buf, length); |
1950 | kunmap_atomic(map, KM_USER0); | 1950 | kunmap_atomic(map); |
1951 | } | 1951 | } |
1952 | addr += length; | 1952 | addr += length; |
1953 | buf += length; | 1953 | buf += length; |
diff --git a/mm/vmscan.c b/mm/vmscan.c index c52b2355265..33c332bbab7 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c | |||
@@ -1138,7 +1138,7 @@ int __isolate_lru_page(struct page *page, isolate_mode_t mode, int file) | |||
1138 | * @mz: The mem_cgroup_zone to pull pages from. | 1138 | * @mz: The mem_cgroup_zone to pull pages from. |
1139 | * @dst: The temp list to put pages on to. | 1139 | * @dst: The temp list to put pages on to. |
1140 | * @nr_scanned: The number of pages that were scanned. | 1140 | * @nr_scanned: The number of pages that were scanned. |
1141 | * @order: The caller's attempted allocation order | 1141 | * @sc: The scan_control struct for this reclaim session |
1142 | * @mode: One of the LRU isolation modes | 1142 | * @mode: One of the LRU isolation modes |
1143 | * @active: True [1] if isolating active pages | 1143 | * @active: True [1] if isolating active pages |
1144 | * @file: True [1] if isolating file [!anon] pages | 1144 | * @file: True [1] if isolating file [!anon] pages |
@@ -1147,8 +1147,8 @@ int __isolate_lru_page(struct page *page, isolate_mode_t mode, int file) | |||
1147 | */ | 1147 | */ |
1148 | static unsigned long isolate_lru_pages(unsigned long nr_to_scan, | 1148 | static unsigned long isolate_lru_pages(unsigned long nr_to_scan, |
1149 | struct mem_cgroup_zone *mz, struct list_head *dst, | 1149 | struct mem_cgroup_zone *mz, struct list_head *dst, |
1150 | unsigned long *nr_scanned, int order, isolate_mode_t mode, | 1150 | unsigned long *nr_scanned, struct scan_control *sc, |
1151 | int active, int file) | 1151 | isolate_mode_t mode, int active, int file) |
1152 | { | 1152 | { |
1153 | struct lruvec *lruvec; | 1153 | struct lruvec *lruvec; |
1154 | struct list_head *src; | 1154 | struct list_head *src; |
@@ -1194,7 +1194,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, | |||
1194 | BUG(); | 1194 | BUG(); |
1195 | } | 1195 | } |
1196 | 1196 | ||
1197 | if (!order) | 1197 | if (!sc->order || !(sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM)) |
1198 | continue; | 1198 | continue; |
1199 | 1199 | ||
1200 | /* | 1200 | /* |
@@ -1208,8 +1208,8 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, | |||
1208 | */ | 1208 | */ |
1209 | zone_id = page_zone_id(page); | 1209 | zone_id = page_zone_id(page); |
1210 | page_pfn = page_to_pfn(page); | 1210 | page_pfn = page_to_pfn(page); |
1211 | pfn = page_pfn & ~((1 << order) - 1); | 1211 | pfn = page_pfn & ~((1 << sc->order) - 1); |
1212 | end_pfn = pfn + (1 << order); | 1212 | end_pfn = pfn + (1 << sc->order); |
1213 | for (; pfn < end_pfn; pfn++) { | 1213 | for (; pfn < end_pfn; pfn++) { |
1214 | struct page *cursor_page; | 1214 | struct page *cursor_page; |
1215 | 1215 | ||
@@ -1275,7 +1275,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, | |||
1275 | 1275 | ||
1276 | *nr_scanned = scan; | 1276 | *nr_scanned = scan; |
1277 | 1277 | ||
1278 | trace_mm_vmscan_lru_isolate(order, | 1278 | trace_mm_vmscan_lru_isolate(sc->order, |
1279 | nr_to_scan, scan, | 1279 | nr_to_scan, scan, |
1280 | nr_taken, | 1280 | nr_taken, |
1281 | nr_lumpy_taken, nr_lumpy_dirty, nr_lumpy_failed, | 1281 | nr_lumpy_taken, nr_lumpy_dirty, nr_lumpy_failed, |
@@ -1413,7 +1413,6 @@ update_isolated_counts(struct mem_cgroup_zone *mz, | |||
1413 | unsigned long *nr_anon, | 1413 | unsigned long *nr_anon, |
1414 | unsigned long *nr_file) | 1414 | unsigned long *nr_file) |
1415 | { | 1415 | { |
1416 | struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(mz); | ||
1417 | struct zone *zone = mz->zone; | 1416 | struct zone *zone = mz->zone; |
1418 | unsigned int count[NR_LRU_LISTS] = { 0, }; | 1417 | unsigned int count[NR_LRU_LISTS] = { 0, }; |
1419 | unsigned long nr_active = 0; | 1418 | unsigned long nr_active = 0; |
@@ -1434,6 +1433,7 @@ update_isolated_counts(struct mem_cgroup_zone *mz, | |||
1434 | count[lru] += numpages; | 1433 | count[lru] += numpages; |
1435 | } | 1434 | } |
1436 | 1435 | ||
1436 | preempt_disable(); | ||
1437 | __count_vm_events(PGDEACTIVATE, nr_active); | 1437 | __count_vm_events(PGDEACTIVATE, nr_active); |
1438 | 1438 | ||
1439 | __mod_zone_page_state(zone, NR_ACTIVE_FILE, | 1439 | __mod_zone_page_state(zone, NR_ACTIVE_FILE, |
@@ -1448,8 +1448,9 @@ update_isolated_counts(struct mem_cgroup_zone *mz, | |||
1448 | *nr_anon = count[LRU_ACTIVE_ANON] + count[LRU_INACTIVE_ANON]; | 1448 | *nr_anon = count[LRU_ACTIVE_ANON] + count[LRU_INACTIVE_ANON]; |
1449 | *nr_file = count[LRU_ACTIVE_FILE] + count[LRU_INACTIVE_FILE]; | 1449 | *nr_file = count[LRU_ACTIVE_FILE] + count[LRU_INACTIVE_FILE]; |
1450 | 1450 | ||
1451 | reclaim_stat->recent_scanned[0] += *nr_anon; | 1451 | __mod_zone_page_state(zone, NR_ISOLATED_ANON, *nr_anon); |
1452 | reclaim_stat->recent_scanned[1] += *nr_file; | 1452 | __mod_zone_page_state(zone, NR_ISOLATED_FILE, *nr_file); |
1453 | preempt_enable(); | ||
1453 | } | 1454 | } |
1454 | 1455 | ||
1455 | /* | 1456 | /* |
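
Editor's note: the two hunks above bracket the batch of double-underscore vmstat updates in update_isolated_counts() with preempt_disable()/preempt_enable(), since those helpers touch per-CPU counters non-atomically and (per the later hunks) the function is now called after zone->lru_lock has been dropped. Below is a loose, runnable userspace analogy only: a mutex stands in for disabling preemption, and the point it illustrates is that the whole batch of plain read-modify-write updates sits inside one critical section rather than each counter being protected separately.

#include <pthread.h>
#include <stdio.h>

/* Hypothetical stand-ins for the per-zone counters updated in the hunk. */
static long nr_isolated_anon, nr_isolated_file;
static pthread_mutex_t stats_lock = PTHREAD_MUTEX_INITIALIZER;

static void account_isolated(long nr_anon, long nr_file)
{
	/* One critical section around the whole batch of non-atomic updates. */
	pthread_mutex_lock(&stats_lock);
	nr_isolated_anon += nr_anon;
	nr_isolated_file += nr_file;
	pthread_mutex_unlock(&stats_lock);
}

int main(void)
{
	account_isolated(32, 96);
	printf("anon=%ld file=%ld\n", nr_isolated_anon, nr_isolated_file);
	return 0;
}
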
@@ -1509,8 +1510,9 @@ shrink_inactive_list(unsigned long nr_to_scan, struct mem_cgroup_zone *mz, | |||
1509 | unsigned long nr_file; | 1510 | unsigned long nr_file; |
1510 | unsigned long nr_dirty = 0; | 1511 | unsigned long nr_dirty = 0; |
1511 | unsigned long nr_writeback = 0; | 1512 | unsigned long nr_writeback = 0; |
1512 | isolate_mode_t reclaim_mode = ISOLATE_INACTIVE; | 1513 | isolate_mode_t isolate_mode = ISOLATE_INACTIVE; |
1513 | struct zone *zone = mz->zone; | 1514 | struct zone *zone = mz->zone; |
1515 | struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(mz); | ||
1514 | 1516 | ||
1515 | while (unlikely(too_many_isolated(zone, file, sc))) { | 1517 | while (unlikely(too_many_isolated(zone, file, sc))) { |
1516 | congestion_wait(BLK_RW_ASYNC, HZ/10); | 1518 | congestion_wait(BLK_RW_ASYNC, HZ/10); |
@@ -1522,20 +1524,19 @@ shrink_inactive_list(unsigned long nr_to_scan, struct mem_cgroup_zone *mz, | |||
1522 | 1524 | ||
1523 | set_reclaim_mode(priority, sc, false); | 1525 | set_reclaim_mode(priority, sc, false); |
1524 | if (sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM) | 1526 | if (sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM) |
1525 | reclaim_mode |= ISOLATE_ACTIVE; | 1527 | isolate_mode |= ISOLATE_ACTIVE; |
1526 | 1528 | ||
1527 | lru_add_drain(); | 1529 | lru_add_drain(); |
1528 | 1530 | ||
1529 | if (!sc->may_unmap) | 1531 | if (!sc->may_unmap) |
1530 | reclaim_mode |= ISOLATE_UNMAPPED; | 1532 | isolate_mode |= ISOLATE_UNMAPPED; |
1531 | if (!sc->may_writepage) | 1533 | if (!sc->may_writepage) |
1532 | reclaim_mode |= ISOLATE_CLEAN; | 1534 | isolate_mode |= ISOLATE_CLEAN; |
1533 | 1535 | ||
1534 | spin_lock_irq(&zone->lru_lock); | 1536 | spin_lock_irq(&zone->lru_lock); |
1535 | 1537 | ||
1536 | nr_taken = isolate_lru_pages(nr_to_scan, mz, &page_list, | 1538 | nr_taken = isolate_lru_pages(nr_to_scan, mz, &page_list, &nr_scanned, |
1537 | &nr_scanned, sc->order, | 1539 | sc, isolate_mode, 0, file); |
1538 | reclaim_mode, 0, file); | ||
1539 | if (global_reclaim(sc)) { | 1540 | if (global_reclaim(sc)) { |
1540 | zone->pages_scanned += nr_scanned; | 1541 | zone->pages_scanned += nr_scanned; |
1541 | if (current_is_kswapd()) | 1542 | if (current_is_kswapd()) |
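
Editor's note: the hunk above renames reclaim_mode to isolate_mode and builds it from the scan_control booleans before handing it to isolate_lru_pages(). A small runnable sketch of that flag construction; the flag values below are hypothetical stand-ins, not the kernel's real isolate_mode_t bits:

#include <stdbool.h>
#include <stdio.h>

/* Hypothetical stand-ins for the kernel's isolate_mode_t flag bits. */
#define ISOLATE_INACTIVE  (1u << 0)
#define ISOLATE_ACTIVE    (1u << 1)
#define ISOLATE_UNMAPPED  (1u << 2)
#define ISOLATE_CLEAN     (1u << 3)

static unsigned int build_isolate_mode(bool lumpy, bool may_unmap, bool may_writepage)
{
	unsigned int mode = ISOLATE_INACTIVE;

	if (lumpy)		/* lumpy reclaim also takes active pages */
		mode |= ISOLATE_ACTIVE;
	if (!may_unmap)		/* only isolate pages that are not mapped */
		mode |= ISOLATE_UNMAPPED;
	if (!may_writepage)	/* only isolate clean pages */
		mode |= ISOLATE_CLEAN;
	return mode;
}

int main(void)
{
	printf("mode=%#x\n", build_isolate_mode(true, false, true));
	return 0;
}
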
@@ -1545,19 +1546,13 @@ shrink_inactive_list(unsigned long nr_to_scan, struct mem_cgroup_zone *mz, | |||
1545 | __count_zone_vm_events(PGSCAN_DIRECT, zone, | 1546 | __count_zone_vm_events(PGSCAN_DIRECT, zone, |
1546 | nr_scanned); | 1547 | nr_scanned); |
1547 | } | 1548 | } |
1549 | spin_unlock_irq(&zone->lru_lock); | ||
1548 | 1550 | ||
1549 | if (nr_taken == 0) { | 1551 | if (nr_taken == 0) |
1550 | spin_unlock_irq(&zone->lru_lock); | ||
1551 | return 0; | 1552 | return 0; |
1552 | } | ||
1553 | 1553 | ||
1554 | update_isolated_counts(mz, &page_list, &nr_anon, &nr_file); | 1554 | update_isolated_counts(mz, &page_list, &nr_anon, &nr_file); |
1555 | 1555 | ||
1556 | __mod_zone_page_state(zone, NR_ISOLATED_ANON, nr_anon); | ||
1557 | __mod_zone_page_state(zone, NR_ISOLATED_FILE, nr_file); | ||
1558 | |||
1559 | spin_unlock_irq(&zone->lru_lock); | ||
1560 | |||
1561 | nr_reclaimed = shrink_page_list(&page_list, mz, sc, priority, | 1556 | nr_reclaimed = shrink_page_list(&page_list, mz, sc, priority, |
1562 | &nr_dirty, &nr_writeback); | 1557 | &nr_dirty, &nr_writeback); |
1563 | 1558 | ||
@@ -1570,6 +1565,9 @@ shrink_inactive_list(unsigned long nr_to_scan, struct mem_cgroup_zone *mz, | |||
1570 | 1565 | ||
1571 | spin_lock_irq(&zone->lru_lock); | 1566 | spin_lock_irq(&zone->lru_lock); |
1572 | 1567 | ||
1568 | reclaim_stat->recent_scanned[0] += nr_anon; | ||
1569 | reclaim_stat->recent_scanned[1] += nr_file; | ||
1570 | |||
1573 | if (current_is_kswapd()) | 1571 | if (current_is_kswapd()) |
1574 | __count_vm_events(KSWAPD_STEAL, nr_reclaimed); | 1572 | __count_vm_events(KSWAPD_STEAL, nr_reclaimed); |
1575 | __count_zone_vm_events(PGSTEAL, zone, nr_reclaimed); | 1573 | __count_zone_vm_events(PGSTEAL, zone, nr_reclaimed); |
@@ -1643,18 +1641,6 @@ static void move_active_pages_to_lru(struct zone *zone, | |||
1643 | unsigned long pgmoved = 0; | 1641 | unsigned long pgmoved = 0; |
1644 | struct page *page; | 1642 | struct page *page; |
1645 | 1643 | ||
1646 | if (buffer_heads_over_limit) { | ||
1647 | spin_unlock_irq(&zone->lru_lock); | ||
1648 | list_for_each_entry(page, list, lru) { | ||
1649 | if (page_has_private(page) && trylock_page(page)) { | ||
1650 | if (page_has_private(page)) | ||
1651 | try_to_release_page(page, 0); | ||
1652 | unlock_page(page); | ||
1653 | } | ||
1654 | } | ||
1655 | spin_lock_irq(&zone->lru_lock); | ||
1656 | } | ||
1657 | |||
1658 | while (!list_empty(list)) { | 1644 | while (!list_empty(list)) { |
1659 | struct lruvec *lruvec; | 1645 | struct lruvec *lruvec; |
1660 | 1646 | ||
@@ -1699,21 +1685,22 @@ static void shrink_active_list(unsigned long nr_to_scan, | |||
1699 | struct page *page; | 1685 | struct page *page; |
1700 | struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(mz); | 1686 | struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(mz); |
1701 | unsigned long nr_rotated = 0; | 1687 | unsigned long nr_rotated = 0; |
1702 | isolate_mode_t reclaim_mode = ISOLATE_ACTIVE; | 1688 | isolate_mode_t isolate_mode = ISOLATE_ACTIVE; |
1703 | struct zone *zone = mz->zone; | 1689 | struct zone *zone = mz->zone; |
1704 | 1690 | ||
1705 | lru_add_drain(); | 1691 | lru_add_drain(); |
1706 | 1692 | ||
1693 | reset_reclaim_mode(sc); | ||
1694 | |||
1707 | if (!sc->may_unmap) | 1695 | if (!sc->may_unmap) |
1708 | reclaim_mode |= ISOLATE_UNMAPPED; | 1696 | isolate_mode |= ISOLATE_UNMAPPED; |
1709 | if (!sc->may_writepage) | 1697 | if (!sc->may_writepage) |
1710 | reclaim_mode |= ISOLATE_CLEAN; | 1698 | isolate_mode |= ISOLATE_CLEAN; |
1711 | 1699 | ||
1712 | spin_lock_irq(&zone->lru_lock); | 1700 | spin_lock_irq(&zone->lru_lock); |
1713 | 1701 | ||
1714 | nr_taken = isolate_lru_pages(nr_to_scan, mz, &l_hold, | 1702 | nr_taken = isolate_lru_pages(nr_to_scan, mz, &l_hold, &nr_scanned, sc, |
1715 | &nr_scanned, sc->order, | 1703 | isolate_mode, 1, file); |
1716 | reclaim_mode, 1, file); | ||
1717 | if (global_reclaim(sc)) | 1704 | if (global_reclaim(sc)) |
1718 | zone->pages_scanned += nr_scanned; | 1705 | zone->pages_scanned += nr_scanned; |
1719 | 1706 | ||
@@ -1737,6 +1724,14 @@ static void shrink_active_list(unsigned long nr_to_scan, | |||
1737 | continue; | 1724 | continue; |
1738 | } | 1725 | } |
1739 | 1726 | ||
1727 | if (unlikely(buffer_heads_over_limit)) { | ||
1728 | if (page_has_private(page) && trylock_page(page)) { | ||
1729 | if (page_has_private(page)) | ||
1730 | try_to_release_page(page, 0); | ||
1731 | unlock_page(page); | ||
1732 | } | ||
1733 | } | ||
1734 | |||
1740 | if (page_referenced(page, 0, mz->mem_cgroup, &vm_flags)) { | 1735 | if (page_referenced(page, 0, mz->mem_cgroup, &vm_flags)) { |
1741 | nr_rotated += hpage_nr_pages(page); | 1736 | nr_rotated += hpage_nr_pages(page); |
1742 | /* | 1737 | /* |
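
Editor's note: the hunk above moves the buffer_heads_over_limit handling into the shrink_active_list() page walk: trylock the page, re-check that it still has private data under the lock, release the buffers, unlock, and never block while holding the walk. A loose userspace analogy of that trylock-and-recheck pattern, using a pthread mutex as the hypothetical per-object lock:

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

/* Hypothetical object standing in for a page with attached buffers. */
struct object {
	pthread_mutex_t lock;
	bool has_private;
};

static void try_release(struct object *obj)
{
	if (!obj->has_private)
		return;
	/* Opportunistic: never block while walking a long list. */
	if (pthread_mutex_trylock(&obj->lock) == 0) {
		/* Re-check under the lock; someone else may have released it. */
		if (obj->has_private)
			obj->has_private = false;	/* "release" the buffers */
		pthread_mutex_unlock(&obj->lock);
	}
}

int main(void)
{
	struct object obj = { PTHREAD_MUTEX_INITIALIZER, true };

	try_release(&obj);
	printf("has_private=%d\n", obj.has_private);
	return 0;
}
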
@@ -2112,7 +2107,12 @@ restart: | |||
2112 | * with multiple processes reclaiming pages, the total | 2107 | * with multiple processes reclaiming pages, the total |
2113 | * freeing target can get unreasonably large. | 2108 | * freeing target can get unreasonably large. |
2114 | */ | 2109 | */ |
2115 | if (nr_reclaimed >= nr_to_reclaim && priority < DEF_PRIORITY) | 2110 | if (nr_reclaimed >= nr_to_reclaim) |
2111 | nr_to_reclaim = 0; | ||
2112 | else | ||
2113 | nr_to_reclaim -= nr_reclaimed; | ||
2114 | |||
2115 | if (!nr_to_reclaim && priority < DEF_PRIORITY) | ||
2116 | break; | 2116 | break; |
2117 | } | 2117 | } |
2118 | blk_finish_plug(&plug); | 2118 | blk_finish_plug(&plug); |
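
Editor's note: the hunk above turns the early-exit check into a saturating consume of the remaining reclaim target: the pages reclaimed on each pass are subtracted from nr_to_reclaim, clamped at zero so the unsigned counter cannot wrap. A minimal runnable sketch of that accounting:

#include <stdio.h>

/* Saturating decrement of the remaining reclaim target, as in the hunk. */
static unsigned long consume_target(unsigned long nr_to_reclaim,
				    unsigned long nr_reclaimed)
{
	if (nr_reclaimed >= nr_to_reclaim)
		return 0;		/* target met; never wrap below zero */
	return nr_to_reclaim - nr_reclaimed;
}

int main(void)
{
	printf("%lu\n", consume_target(32, 10));	/* 22 pages still to go */
	printf("%lu\n", consume_target(32, 50));	/* 0, not a huge wrapped value */
	return 0;
}
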
@@ -2195,7 +2195,7 @@ static inline bool compaction_ready(struct zone *zone, struct scan_control *sc) | |||
2195 | * If compaction is deferred, reclaim up to a point where | 2195 | * If compaction is deferred, reclaim up to a point where |
2196 | * compaction will have a chance of success when re-enabled | 2196 | * compaction will have a chance of success when re-enabled |
2197 | */ | 2197 | */ |
2198 | if (compaction_deferred(zone)) | 2198 | if (compaction_deferred(zone, sc->order)) |
2199 | return watermark_ok; | 2199 | return watermark_ok; |
2200 | 2200 | ||
2201 | /* If compaction is not ready to start, keep reclaiming */ | 2201 | /* If compaction is not ready to start, keep reclaiming */ |
@@ -2235,6 +2235,14 @@ static bool shrink_zones(int priority, struct zonelist *zonelist, | |||
2235 | unsigned long nr_soft_scanned; | 2235 | unsigned long nr_soft_scanned; |
2236 | bool aborted_reclaim = false; | 2236 | bool aborted_reclaim = false; |
2237 | 2237 | ||
2238 | /* | ||
2239 | * If the number of buffer_heads in the machine exceeds the maximum | ||
2240 | * allowed level, force direct reclaim to scan the highmem zone as | ||
2241 | * highmem pages could be pinning lowmem pages storing buffer_heads | ||
2242 | */ | ||
2243 | if (buffer_heads_over_limit) | ||
2244 | sc->gfp_mask |= __GFP_HIGHMEM; | ||
2245 | |||
2238 | for_each_zone_zonelist_nodemask(zone, z, zonelist, | 2246 | for_each_zone_zonelist_nodemask(zone, z, zonelist, |
2239 | gfp_zone(sc->gfp_mask), sc->nodemask) { | 2247 | gfp_zone(sc->gfp_mask), sc->nodemask) { |
2240 | if (!populated_zone(zone)) | 2248 | if (!populated_zone(zone)) |
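
Editor's note: the hunk above ORs __GFP_HIGHMEM into the scan_control's gfp mask when buffer_heads_over_limit is set, so the zonelist walk below it also visits highmem zones whose pages may be pinning lowmem buffer_heads. A toy sketch of how widening the mask changes which zone a gfp_zone()-style helper selects; the flag bit, enum and helper are hypothetical, not the kernel's real gfp machinery:

#include <stdio.h>

#define GFP_HIGHMEM_BIT  (1u << 1)	/* hypothetical stand-in for __GFP_HIGHMEM */

enum zone_idx { ZONE_NORMAL, ZONE_HIGHMEM };

/* Toy gfp_zone(): the highest zone the mask permits the scan to reach. */
static enum zone_idx highest_zone(unsigned int gfp_mask)
{
	return (gfp_mask & GFP_HIGHMEM_BIT) ? ZONE_HIGHMEM : ZONE_NORMAL;
}

int main(void)
{
	unsigned int gfp_mask = 0;		/* a lowmem-only allocation */
	int buffer_heads_over_limit = 1;

	if (buffer_heads_over_limit)
		gfp_mask |= GFP_HIGHMEM_BIT;	/* widen the reclaim scan */

	printf("highest zone scanned: %d\n", highest_zone(gfp_mask));
	return 0;
}
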
@@ -2255,8 +2263,8 @@ static bool shrink_zones(int priority, struct zonelist *zonelist, | |||
2255 | * Even though compaction is invoked for any | 2263 | * Even though compaction is invoked for any |
2256 | * non-zero order, only frequent costly order | 2264 | * non-zero order, only frequent costly order |
2257 | * reclamation is disruptive enough to become a | 2265 | * reclamation is disruptive enough to become a |
2258 | * noticable problem, like transparent huge page | 2266 | * noticeable problem, like transparent huge |
2259 | * allocations. | 2267 | * page allocations. |
2260 | */ | 2268 | */ |
2261 | if (compaction_ready(zone, sc)) { | 2269 | if (compaction_ready(zone, sc)) { |
2262 | aborted_reclaim = true; | 2270 | aborted_reclaim = true; |
@@ -2337,7 +2345,6 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, | |||
2337 | unsigned long writeback_threshold; | 2345 | unsigned long writeback_threshold; |
2338 | bool aborted_reclaim; | 2346 | bool aborted_reclaim; |
2339 | 2347 | ||
2340 | get_mems_allowed(); | ||
2341 | delayacct_freepages_start(); | 2348 | delayacct_freepages_start(); |
2342 | 2349 | ||
2343 | if (global_reclaim(sc)) | 2350 | if (global_reclaim(sc)) |
@@ -2401,7 +2408,6 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, | |||
2401 | 2408 | ||
2402 | out: | 2409 | out: |
2403 | delayacct_freepages_end(); | 2410 | delayacct_freepages_end(); |
2404 | put_mems_allowed(); | ||
2405 | 2411 | ||
2406 | if (sc->nr_reclaimed) | 2412 | if (sc->nr_reclaimed) |
2407 | return sc->nr_reclaimed; | 2413 | return sc->nr_reclaimed; |
@@ -2724,6 +2730,17 @@ loop_again: | |||
2724 | */ | 2730 | */ |
2725 | age_active_anon(zone, &sc, priority); | 2731 | age_active_anon(zone, &sc, priority); |
2726 | 2732 | ||
2733 | /* | ||
2734 | * If the number of buffer_heads in the machine | ||
2735 | * exceeds the maximum allowed level and this node | ||
2736 | * has a highmem zone, force kswapd to reclaim from | ||
2737 | * it to relieve lowmem pressure. | ||
2738 | */ | ||
2739 | if (buffer_heads_over_limit && is_highmem_idx(i)) { | ||
2740 | end_zone = i; | ||
2741 | break; | ||
2742 | } | ||
2743 | |||
2727 | if (!zone_watermark_ok_safe(zone, order, | 2744 | if (!zone_watermark_ok_safe(zone, order, |
2728 | high_wmark_pages(zone), 0, 0)) { | 2745 | high_wmark_pages(zone), 0, 0)) { |
2729 | end_zone = i; | 2746 | end_zone = i; |
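
Editor's note: the hunk above makes kswapd's first zone scan treat a highmem zone as the end_zone whenever buffer_heads_over_limit is set, even if its watermarks look fine, so that highmem pages pinning lowmem buffer_heads get reclaimed. A toy version of that selection, assuming (as balance_pgdat's first pass does) that zones are walked from the highest index down; the zone layout and per-zone results are made up:

#include <stdbool.h>
#include <stdio.h>

#define MAX_NR_ZONES 4

/* Hypothetical per-zone state for the sketch. */
static bool zone_is_highmem[MAX_NR_ZONES]   = { false, false, false, true };
static bool zone_watermark_ok[MAX_NR_ZONES] = { true,  true,  false, true };

/* Returns the highest zone index that needs reclaim, or -1 if all are balanced. */
static int pick_end_zone(bool buffer_heads_over_limit)
{
	for (int i = MAX_NR_ZONES - 1; i >= 0; i--) {
		/* Under buffer_heads pressure a highmem zone is always scanned. */
		if (buffer_heads_over_limit && zone_is_highmem[i])
			return i;
		if (!zone_watermark_ok[i])
			return i;
	}
	return -1;
}

int main(void)
{
	printf("end_zone=%d (normal), end_zone=%d (bh pressure)\n",
	       pick_end_zone(false), pick_end_zone(true));
	return 0;
}
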
@@ -2753,7 +2770,7 @@ loop_again: | |||
2753 | */ | 2770 | */ |
2754 | for (i = 0; i <= end_zone; i++) { | 2771 | for (i = 0; i <= end_zone; i++) { |
2755 | struct zone *zone = pgdat->node_zones + i; | 2772 | struct zone *zone = pgdat->node_zones + i; |
2756 | int nr_slab; | 2773 | int nr_slab, testorder; |
2757 | unsigned long balance_gap; | 2774 | unsigned long balance_gap; |
2758 | 2775 | ||
2759 | if (!populated_zone(zone)) | 2776 | if (!populated_zone(zone)) |
@@ -2786,7 +2803,21 @@ loop_again: | |||
2786 | (zone->present_pages + | 2803 | (zone->present_pages + |
2787 | KSWAPD_ZONE_BALANCE_GAP_RATIO-1) / | 2804 | KSWAPD_ZONE_BALANCE_GAP_RATIO-1) / |
2788 | KSWAPD_ZONE_BALANCE_GAP_RATIO); | 2805 | KSWAPD_ZONE_BALANCE_GAP_RATIO); |
2789 | if (!zone_watermark_ok_safe(zone, order, | 2806 | /* |
2807 | * Kswapd reclaims only single pages with compaction | ||
2808 | * enabled. Trying too hard to reclaim until contiguous | ||
2809 | * free pages have become available can hurt performance | ||
2810 | * by evicting too much useful data from memory. | ||
2811 | * Do not reclaim more than needed for compaction. | ||
2812 | */ | ||
2813 | testorder = order; | ||
2814 | if (COMPACTION_BUILD && order && | ||
2815 | compaction_suitable(zone, order) != | ||
2816 | COMPACT_SKIPPED) | ||
2817 | testorder = 0; | ||
2818 | |||
2819 | if ((buffer_heads_over_limit && is_highmem_idx(i)) || | ||
2820 | !zone_watermark_ok_safe(zone, testorder, | ||
2790 | high_wmark_pages(zone) + balance_gap, | 2821 | high_wmark_pages(zone) + balance_gap, |
2791 | end_zone, 0)) { | 2822 | end_zone, 0)) { |
2792 | shrink_zone(priority, zone, &sc); | 2823 | shrink_zone(priority, zone, &sc); |
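
Editor's note: the hunk above stops kswapd from reclaiming all the way up to an order-`order` watermark when compaction is built in and compaction_suitable() reports the zone already has enough order-0 memory for compaction to run; in that case the watermark is checked at order 0 (testorder) and compaction is left to assemble the contiguous block. (It also forces reclaim of a highmem zone under buffer_heads pressure regardless of the watermark.) A sketch of the testorder decision; the two-value enum and the compaction_suitable() stub below are simplified stand-ins, only the decision itself mirrors the hunk:

#include <stdbool.h>
#include <stdio.h>

enum compact_result { COMPACT_SKIPPED, COMPACT_CONTINUE };

/*
 * Stub: the real compaction_suitable() checks whether the zone has enough
 * free order-0 memory for compaction to be worth running at this order.
 * The heuristic here is invented purely for the example.
 */
static enum compact_result compaction_suitable(int zone, int order)
{
	(void)zone;
	return order <= 4 ? COMPACT_CONTINUE : COMPACT_SKIPPED;
}

/* Pick the order used for kswapd's watermark check, as in the hunk. */
static int watermark_test_order(bool compaction_build, int zone, int order)
{
	int testorder = order;

	if (compaction_build && order &&
	    compaction_suitable(zone, order) != COMPACT_SKIPPED)
		testorder = 0;	/* reclaim only order-0; compaction does the rest */
	return testorder;
}

int main(void)
{
	printf("order 3 -> test at order %d\n", watermark_test_order(true, 0, 3));
	printf("order 9 -> test at order %d\n", watermark_test_order(true, 0, 9));
	return 0;
}
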
@@ -2815,7 +2846,7 @@ loop_again: | |||
2815 | continue; | 2846 | continue; |
2816 | } | 2847 | } |
2817 | 2848 | ||
2818 | if (!zone_watermark_ok_safe(zone, order, | 2849 | if (!zone_watermark_ok_safe(zone, testorder, |
2819 | high_wmark_pages(zone), end_zone, 0)) { | 2850 | high_wmark_pages(zone), end_zone, 0)) { |
2820 | all_zones_ok = 0; | 2851 | all_zones_ok = 0; |
2821 | /* | 2852 | /* |
@@ -2903,6 +2934,8 @@ out: | |||
2903 | * and it is potentially going to sleep here. | 2934 | * and it is potentially going to sleep here. |
2904 | */ | 2935 | */ |
2905 | if (order) { | 2936 | if (order) { |
2937 | int zones_need_compaction = 1; | ||
2938 | |||
2906 | for (i = 0; i <= end_zone; i++) { | 2939 | for (i = 0; i <= end_zone; i++) { |
2907 | struct zone *zone = pgdat->node_zones + i; | 2940 | struct zone *zone = pgdat->node_zones + i; |
2908 | 2941 | ||
@@ -2912,6 +2945,11 @@ out: | |||
2912 | if (zone->all_unreclaimable && priority != DEF_PRIORITY) | 2945 | if (zone->all_unreclaimable && priority != DEF_PRIORITY) |
2913 | continue; | 2946 | continue; |
2914 | 2947 | ||
2948 | /* Would compaction fail due to lack of free memory? */ | ||
2949 | if (COMPACTION_BUILD && | ||
2950 | compaction_suitable(zone, order) == COMPACT_SKIPPED) | ||
2951 | goto loop_again; | ||
2952 | |||
2915 | /* Confirm the zone is balanced for order-0 */ | 2953 | /* Confirm the zone is balanced for order-0 */ |
2916 | if (!zone_watermark_ok(zone, 0, | 2954 | if (!zone_watermark_ok(zone, 0, |
2917 | high_wmark_pages(zone), 0, 0)) { | 2955 | high_wmark_pages(zone), 0, 0)) { |
@@ -2919,11 +2957,17 @@ out: | |||
2919 | goto loop_again; | 2957 | goto loop_again; |
2920 | } | 2958 | } |
2921 | 2959 | ||
2960 | /* Check if the memory needs to be defragmented. */ | ||
2961 | if (zone_watermark_ok(zone, order, | ||
2962 | low_wmark_pages(zone), *classzone_idx, 0)) | ||
2963 | zones_need_compaction = 0; | ||
2964 | |||
2922 | /* If balanced, clear the congested flag */ | 2965 | /* If balanced, clear the congested flag */ |
2923 | zone_clear_flag(zone, ZONE_CONGESTED); | 2966 | zone_clear_flag(zone, ZONE_CONGESTED); |
2924 | if (i <= *classzone_idx) | ||
2925 | balanced += zone->present_pages; | ||
2926 | } | 2967 | } |
2968 | |||
2969 | if (zones_need_compaction) | ||
2970 | compact_pgdat(pgdat, order); | ||
2927 | } | 2971 | } |
2928 | 2972 | ||
2929 | /* | 2973 | /* |
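
Editor's note: the final hunk replaces the per-zone "balanced" page accounting with a compaction trigger. If any scanned zone already meets its low watermark at the requested order, the node does not need defragmenting; only when no zone does is compact_pgdat() invoked (and, per the earlier hunk, a zone where compaction_suitable() returns COMPACT_SKIPPED sends kswapd back to reclaiming first). A minimal sketch of that decision, with the per-zone watermark results made up:

#include <stdbool.h>
#include <stdio.h>

#define NR_ZONES 3

/* Hypothetical per-zone result of zone_watermark_ok() at the requested order. */
static bool order_watermark_ok[NR_ZONES] = { false, false, false };

/* Decide, as in the hunk, whether kswapd should kick node-wide compaction. */
static bool node_needs_compaction(void)
{
	bool zones_need_compaction = true;

	for (int i = 0; i < NR_ZONES; i++) {
		/* One zone that can already satisfy the order is enough. */
		if (order_watermark_ok[i])
			zones_need_compaction = false;
	}
	return zones_need_compaction;
}

int main(void)
{
	printf("compact? %d\n", node_needs_compaction());	/* 1: no zone is ready */
	order_watermark_ok[1] = true;
	printf("compact? %d\n", node_needs_compaction());	/* 0: zone 1 is ready */
	return 0;
}
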