diff options
Diffstat (limited to 'mm')
51 files changed, 3496 insertions, 1666 deletions
diff --git a/mm/Kconfig b/mm/Kconfig index 723bbe04a0b0..2d9f1504d75e 100644 --- a/mm/Kconfig +++ b/mm/Kconfig | |||
@@ -552,3 +552,28 @@ config MEM_SOFT_DIRTY | |||
552 | it can be cleared by hands. | 552 | it can be cleared by hands. |
553 | 553 | ||
554 | See Documentation/vm/soft-dirty.txt for more details. | 554 | See Documentation/vm/soft-dirty.txt for more details. |
555 | |||
556 | config ZSMALLOC | ||
557 | bool "Memory allocator for compressed pages" | ||
558 | depends on MMU | ||
559 | default n | ||
560 | help | ||
561 | zsmalloc is a slab-based memory allocator designed to store | ||
562 | compressed RAM pages. zsmalloc uses virtual memory mapping | ||
563 | in order to reduce fragmentation. However, this results in a | ||
564 | non-standard allocator interface where a handle, not a pointer, is | ||
565 | returned by an alloc(). This handle must be mapped in order to | ||
566 | access the allocated space. | ||
567 | |||
568 | config PGTABLE_MAPPING | ||
569 | bool "Use page table mapping to access object in zsmalloc" | ||
570 | depends on ZSMALLOC | ||
571 | help | ||
572 | By default, zsmalloc uses a copy-based object mapping method to | ||
573 | access allocations that span two pages. However, if a particular | ||
574 | architecture (ex, ARM) performs VM mapping faster than copying, | ||
575 | then you should select this. This causes zsmalloc to use page table | ||
576 | mapping rather than copying for object mapping. | ||
577 | |||
578 | You can check speed with zsmalloc benchmark[1]. | ||
579 | [1] https://github.com/spartacus06/zsmalloc | ||
diff --git a/mm/Makefile b/mm/Makefile index 305d10acd081..310c90a09264 100644 --- a/mm/Makefile +++ b/mm/Makefile | |||
@@ -60,3 +60,4 @@ obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o | |||
60 | obj-$(CONFIG_CLEANCACHE) += cleancache.o | 60 | obj-$(CONFIG_CLEANCACHE) += cleancache.o |
61 | obj-$(CONFIG_MEMORY_ISOLATION) += page_isolation.o | 61 | obj-$(CONFIG_MEMORY_ISOLATION) += page_isolation.o |
62 | obj-$(CONFIG_ZBUD) += zbud.o | 62 | obj-$(CONFIG_ZBUD) += zbud.o |
63 | obj-$(CONFIG_ZSMALLOC) += zsmalloc.o | ||
diff --git a/mm/balloon_compaction.c b/mm/balloon_compaction.c index 07dbc8ec46cf..6e45a5074bf0 100644 --- a/mm/balloon_compaction.c +++ b/mm/balloon_compaction.c | |||
@@ -267,7 +267,7 @@ void balloon_page_putback(struct page *page) | |||
267 | put_page(page); | 267 | put_page(page); |
268 | } else { | 268 | } else { |
269 | WARN_ON(1); | 269 | WARN_ON(1); |
270 | dump_page(page); | 270 | dump_page(page, "not movable balloon page"); |
271 | } | 271 | } |
272 | unlock_page(page); | 272 | unlock_page(page); |
273 | } | 273 | } |
@@ -287,7 +287,7 @@ int balloon_page_migrate(struct page *newpage, | |||
287 | BUG_ON(!trylock_page(newpage)); | 287 | BUG_ON(!trylock_page(newpage)); |
288 | 288 | ||
289 | if (WARN_ON(!__is_movable_balloon_page(page))) { | 289 | if (WARN_ON(!__is_movable_balloon_page(page))) { |
290 | dump_page(page); | 290 | dump_page(page, "not movable balloon page"); |
291 | unlock_page(newpage); | 291 | unlock_page(newpage); |
292 | return rc; | 292 | return rc; |
293 | } | 293 | } |
diff --git a/mm/bounce.c b/mm/bounce.c index 5a7d58fb883b..523918b8c6dc 100644 --- a/mm/bounce.c +++ b/mm/bounce.c | |||
@@ -98,27 +98,24 @@ int init_emergency_isa_pool(void) | |||
98 | static void copy_to_high_bio_irq(struct bio *to, struct bio *from) | 98 | static void copy_to_high_bio_irq(struct bio *to, struct bio *from) |
99 | { | 99 | { |
100 | unsigned char *vfrom; | 100 | unsigned char *vfrom; |
101 | struct bio_vec *tovec, *fromvec; | 101 | struct bio_vec tovec, *fromvec = from->bi_io_vec; |
102 | int i; | 102 | struct bvec_iter iter; |
103 | 103 | ||
104 | bio_for_each_segment(tovec, to, i) { | 104 | bio_for_each_segment(tovec, to, iter) { |
105 | fromvec = from->bi_io_vec + i; | 105 | if (tovec.bv_page != fromvec->bv_page) { |
106 | 106 | /* | |
107 | /* | 107 | * fromvec->bv_offset and fromvec->bv_len might have |
108 | * not bounced | 108 | * been modified by the block layer, so use the original |
109 | */ | 109 | * copy, bounce_copy_vec already uses tovec->bv_len |
110 | if (tovec->bv_page == fromvec->bv_page) | 110 | */ |
111 | continue; | 111 | vfrom = page_address(fromvec->bv_page) + |
112 | 112 | tovec.bv_offset; | |
113 | /* | 113 | |
114 | * fromvec->bv_offset and fromvec->bv_len might have been | 114 | bounce_copy_vec(&tovec, vfrom); |
115 | * modified by the block layer, so use the original copy, | 115 | flush_dcache_page(tovec.bv_page); |
116 | * bounce_copy_vec already uses tovec->bv_len | 116 | } |
117 | */ | ||
118 | vfrom = page_address(fromvec->bv_page) + tovec->bv_offset; | ||
119 | 117 | ||
120 | bounce_copy_vec(tovec, vfrom); | 118 | fromvec++; |
121 | flush_dcache_page(tovec->bv_page); | ||
122 | } | 119 | } |
123 | } | 120 | } |
124 | 121 | ||
@@ -201,13 +198,14 @@ static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig, | |||
201 | { | 198 | { |
202 | struct bio *bio; | 199 | struct bio *bio; |
203 | int rw = bio_data_dir(*bio_orig); | 200 | int rw = bio_data_dir(*bio_orig); |
204 | struct bio_vec *to, *from; | 201 | struct bio_vec *to, from; |
202 | struct bvec_iter iter; | ||
205 | unsigned i; | 203 | unsigned i; |
206 | 204 | ||
207 | if (force) | 205 | if (force) |
208 | goto bounce; | 206 | goto bounce; |
209 | bio_for_each_segment(from, *bio_orig, i) | 207 | bio_for_each_segment(from, *bio_orig, iter) |
210 | if (page_to_pfn(from->bv_page) > queue_bounce_pfn(q)) | 208 | if (page_to_pfn(from.bv_page) > queue_bounce_pfn(q)) |
211 | goto bounce; | 209 | goto bounce; |
212 | 210 | ||
213 | return; | 211 | return; |
diff --git a/mm/cleancache.c b/mm/cleancache.c index 5875f48ce279..d0eac4350403 100644 --- a/mm/cleancache.c +++ b/mm/cleancache.c | |||
@@ -237,7 +237,7 @@ int __cleancache_get_page(struct page *page) | |||
237 | goto out; | 237 | goto out; |
238 | } | 238 | } |
239 | 239 | ||
240 | VM_BUG_ON(!PageLocked(page)); | 240 | VM_BUG_ON_PAGE(!PageLocked(page), page); |
241 | fake_pool_id = page->mapping->host->i_sb->cleancache_poolid; | 241 | fake_pool_id = page->mapping->host->i_sb->cleancache_poolid; |
242 | if (fake_pool_id < 0) | 242 | if (fake_pool_id < 0) |
243 | goto out; | 243 | goto out; |
@@ -279,7 +279,7 @@ void __cleancache_put_page(struct page *page) | |||
279 | return; | 279 | return; |
280 | } | 280 | } |
281 | 281 | ||
282 | VM_BUG_ON(!PageLocked(page)); | 282 | VM_BUG_ON_PAGE(!PageLocked(page), page); |
283 | fake_pool_id = page->mapping->host->i_sb->cleancache_poolid; | 283 | fake_pool_id = page->mapping->host->i_sb->cleancache_poolid; |
284 | if (fake_pool_id < 0) | 284 | if (fake_pool_id < 0) |
285 | return; | 285 | return; |
@@ -318,7 +318,7 @@ void __cleancache_invalidate_page(struct address_space *mapping, | |||
318 | if (pool_id < 0) | 318 | if (pool_id < 0) |
319 | return; | 319 | return; |
320 | 320 | ||
321 | VM_BUG_ON(!PageLocked(page)); | 321 | VM_BUG_ON_PAGE(!PageLocked(page), page); |
322 | if (cleancache_get_key(mapping->host, &key) >= 0) { | 322 | if (cleancache_get_key(mapping->host, &key) >= 0) { |
323 | cleancache_ops->invalidate_page(pool_id, | 323 | cleancache_ops->invalidate_page(pool_id, |
324 | key, page->index); | 324 | key, page->index); |
diff --git a/mm/compaction.c b/mm/compaction.c index f58bcd016f43..b48c5259ea33 100644 --- a/mm/compaction.c +++ b/mm/compaction.c | |||
@@ -459,6 +459,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, | |||
459 | unsigned long flags; | 459 | unsigned long flags; |
460 | bool locked = false; | 460 | bool locked = false; |
461 | struct page *page = NULL, *valid_page = NULL; | 461 | struct page *page = NULL, *valid_page = NULL; |
462 | bool skipped_async_unsuitable = false; | ||
462 | 463 | ||
463 | /* | 464 | /* |
464 | * Ensure that there are not too many pages isolated from the LRU | 465 | * Ensure that there are not too many pages isolated from the LRU |
@@ -522,7 +523,10 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, | |||
522 | if (!isolation_suitable(cc, page)) | 523 | if (!isolation_suitable(cc, page)) |
523 | goto next_pageblock; | 524 | goto next_pageblock; |
524 | 525 | ||
525 | /* Skip if free */ | 526 | /* |
527 | * Skip if free. page_order cannot be used without zone->lock | ||
528 | * as nothing prevents parallel allocations or buddy merging. | ||
529 | */ | ||
526 | if (PageBuddy(page)) | 530 | if (PageBuddy(page)) |
527 | continue; | 531 | continue; |
528 | 532 | ||
@@ -534,6 +538,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, | |||
534 | if (!cc->sync && last_pageblock_nr != pageblock_nr && | 538 | if (!cc->sync && last_pageblock_nr != pageblock_nr && |
535 | !migrate_async_suitable(get_pageblock_migratetype(page))) { | 539 | !migrate_async_suitable(get_pageblock_migratetype(page))) { |
536 | cc->finished_update_migrate = true; | 540 | cc->finished_update_migrate = true; |
541 | skipped_async_unsuitable = true; | ||
537 | goto next_pageblock; | 542 | goto next_pageblock; |
538 | } | 543 | } |
539 | 544 | ||
@@ -599,7 +604,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, | |||
599 | if (__isolate_lru_page(page, mode) != 0) | 604 | if (__isolate_lru_page(page, mode) != 0) |
600 | continue; | 605 | continue; |
601 | 606 | ||
602 | VM_BUG_ON(PageTransCompound(page)); | 607 | VM_BUG_ON_PAGE(PageTransCompound(page), page); |
603 | 608 | ||
604 | /* Successfully isolated */ | 609 | /* Successfully isolated */ |
605 | cc->finished_update_migrate = true; | 610 | cc->finished_update_migrate = true; |
@@ -627,8 +632,13 @@ next_pageblock: | |||
627 | if (locked) | 632 | if (locked) |
628 | spin_unlock_irqrestore(&zone->lru_lock, flags); | 633 | spin_unlock_irqrestore(&zone->lru_lock, flags); |
629 | 634 | ||
630 | /* Update the pageblock-skip if the whole pageblock was scanned */ | 635 | /* |
631 | if (low_pfn == end_pfn) | 636 | * Update the pageblock-skip information and cached scanner pfn, |
637 | * if the whole pageblock was scanned without isolating any page. | ||
638 | * This is not done when pageblock was skipped due to being unsuitable | ||
639 | * for async compaction, so that eventual sync compaction can try. | ||
640 | */ | ||
641 | if (low_pfn == end_pfn && !skipped_async_unsuitable) | ||
632 | update_pageblock_skip(cc, valid_page, nr_isolated, true); | 642 | update_pageblock_skip(cc, valid_page, nr_isolated, true); |
633 | 643 | ||
634 | trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated); | 644 | trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated); |
@@ -660,7 +670,7 @@ static void isolate_freepages(struct zone *zone, | |||
660 | * is the end of the pageblock the migration scanner is using. | 670 | * is the end of the pageblock the migration scanner is using. |
661 | */ | 671 | */ |
662 | pfn = cc->free_pfn; | 672 | pfn = cc->free_pfn; |
663 | low_pfn = cc->migrate_pfn + pageblock_nr_pages; | 673 | low_pfn = ALIGN(cc->migrate_pfn + 1, pageblock_nr_pages); |
664 | 674 | ||
665 | /* | 675 | /* |
666 | * Take care that if the migration scanner is at the end of the zone | 676 | * Take care that if the migration scanner is at the end of the zone |
@@ -676,7 +686,7 @@ static void isolate_freepages(struct zone *zone, | |||
676 | * pages on cc->migratepages. We stop searching if the migrate | 686 | * pages on cc->migratepages. We stop searching if the migrate |
677 | * and free page scanners meet or enough free pages are isolated. | 687 | * and free page scanners meet or enough free pages are isolated. |
678 | */ | 688 | */ |
679 | for (; pfn > low_pfn && cc->nr_migratepages > nr_freepages; | 689 | for (; pfn >= low_pfn && cc->nr_migratepages > nr_freepages; |
680 | pfn -= pageblock_nr_pages) { | 690 | pfn -= pageblock_nr_pages) { |
681 | unsigned long isolated; | 691 | unsigned long isolated; |
682 | 692 | ||
@@ -738,7 +748,14 @@ static void isolate_freepages(struct zone *zone, | |||
738 | /* split_free_page does not map the pages */ | 748 | /* split_free_page does not map the pages */ |
739 | map_pages(freelist); | 749 | map_pages(freelist); |
740 | 750 | ||
741 | cc->free_pfn = high_pfn; | 751 | /* |
752 | * If we crossed the migrate scanner, we want to keep it that way | ||
753 | * so that compact_finished() may detect this | ||
754 | */ | ||
755 | if (pfn < low_pfn) | ||
756 | cc->free_pfn = max(pfn, zone->zone_start_pfn); | ||
757 | else | ||
758 | cc->free_pfn = high_pfn; | ||
742 | cc->nr_freepages = nr_freepages; | 759 | cc->nr_freepages = nr_freepages; |
743 | } | 760 | } |
744 | 761 | ||
@@ -837,6 +854,10 @@ static int compact_finished(struct zone *zone, | |||
837 | 854 | ||
838 | /* Compaction run completes if the migrate and free scanner meet */ | 855 | /* Compaction run completes if the migrate and free scanner meet */ |
839 | if (cc->free_pfn <= cc->migrate_pfn) { | 856 | if (cc->free_pfn <= cc->migrate_pfn) { |
857 | /* Let the next compaction start anew. */ | ||
858 | zone->compact_cached_migrate_pfn = zone->zone_start_pfn; | ||
859 | zone->compact_cached_free_pfn = zone_end_pfn(zone); | ||
860 | |||
840 | /* | 861 | /* |
841 | * Mark that the PG_migrate_skip information should be cleared | 862 | * Mark that the PG_migrate_skip information should be cleared |
842 | * by kswapd when it goes to sleep. kswapd does not set the | 863 | * by kswapd when it goes to sleep. kswapd does not set the |
@@ -947,6 +968,14 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) | |||
947 | } | 968 | } |
948 | 969 | ||
949 | /* | 970 | /* |
971 | * Clear pageblock skip if there were failures recently and compaction | ||
972 | * is about to be retried after being deferred. kswapd does not do | ||
973 | * this reset as it'll reset the cached information when going to sleep. | ||
974 | */ | ||
975 | if (compaction_restarting(zone, cc->order) && !current_is_kswapd()) | ||
976 | __reset_isolation_suitable(zone); | ||
977 | |||
978 | /* | ||
950 | * Setup to move all movable pages to the end of the zone. Used cached | 979 | * Setup to move all movable pages to the end of the zone. Used cached |
951 | * information on where the scanners should start but check that it | 980 | * information on where the scanners should start but check that it |
952 | * is initialised by ensuring the values are within zone boundaries. | 981 | * is initialised by ensuring the values are within zone boundaries. |
@@ -962,13 +991,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) | |||
962 | zone->compact_cached_migrate_pfn = cc->migrate_pfn; | 991 | zone->compact_cached_migrate_pfn = cc->migrate_pfn; |
963 | } | 992 | } |
964 | 993 | ||
965 | /* | 994 | trace_mm_compaction_begin(start_pfn, cc->migrate_pfn, cc->free_pfn, end_pfn); |
966 | * Clear pageblock skip if there were failures recently and compaction | ||
967 | * is about to be retried after being deferred. kswapd does not do | ||
968 | * this reset as it'll reset the cached information when going to sleep. | ||
969 | */ | ||
970 | if (compaction_restarting(zone, cc->order) && !current_is_kswapd()) | ||
971 | __reset_isolation_suitable(zone); | ||
972 | 995 | ||
973 | migrate_prep_local(); | 996 | migrate_prep_local(); |
974 | 997 | ||
@@ -1003,7 +1026,11 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) | |||
1003 | if (err) { | 1026 | if (err) { |
1004 | putback_movable_pages(&cc->migratepages); | 1027 | putback_movable_pages(&cc->migratepages); |
1005 | cc->nr_migratepages = 0; | 1028 | cc->nr_migratepages = 0; |
1006 | if (err == -ENOMEM) { | 1029 | /* |
1030 | * migrate_pages() may return -ENOMEM when scanners meet | ||
1031 | * and we want compact_finished() to detect it | ||
1032 | */ | ||
1033 | if (err == -ENOMEM && cc->free_pfn > cc->migrate_pfn) { | ||
1007 | ret = COMPACT_PARTIAL; | 1034 | ret = COMPACT_PARTIAL; |
1008 | goto out; | 1035 | goto out; |
1009 | } | 1036 | } |
@@ -1015,6 +1042,8 @@ out: | |||
1015 | cc->nr_freepages -= release_freepages(&cc->freepages); | 1042 | cc->nr_freepages -= release_freepages(&cc->freepages); |
1016 | VM_BUG_ON(cc->nr_freepages != 0); | 1043 | VM_BUG_ON(cc->nr_freepages != 0); |
1017 | 1044 | ||
1045 | trace_mm_compaction_end(ret); | ||
1046 | |||
1018 | return ret; | 1047 | return ret; |
1019 | } | 1048 | } |
1020 | 1049 | ||
@@ -1120,12 +1149,11 @@ static void __compact_pgdat(pg_data_t *pgdat, struct compact_control *cc) | |||
1120 | compact_zone(zone, cc); | 1149 | compact_zone(zone, cc); |
1121 | 1150 | ||
1122 | if (cc->order > 0) { | 1151 | if (cc->order > 0) { |
1123 | int ok = zone_watermark_ok(zone, cc->order, | 1152 | if (zone_watermark_ok(zone, cc->order, |
1124 | low_wmark_pages(zone), 0, 0); | 1153 | low_wmark_pages(zone), 0, 0)) |
1125 | if (ok && cc->order >= zone->compact_order_failed) | 1154 | compaction_defer_reset(zone, cc->order, false); |
1126 | zone->compact_order_failed = cc->order + 1; | ||
1127 | /* Currently async compaction is never deferred. */ | 1155 | /* Currently async compaction is never deferred. */ |
1128 | else if (!ok && cc->sync) | 1156 | else if (cc->sync) |
1129 | defer_compaction(zone, cc->order); | 1157 | defer_compaction(zone, cc->order); |
1130 | } | 1158 | } |
1131 | 1159 | ||
diff --git a/mm/filemap.c b/mm/filemap.c index b7749a92021c..d56d3c145b9f 100644 --- a/mm/filemap.c +++ b/mm/filemap.c | |||
@@ -409,9 +409,9 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask) | |||
409 | { | 409 | { |
410 | int error; | 410 | int error; |
411 | 411 | ||
412 | VM_BUG_ON(!PageLocked(old)); | 412 | VM_BUG_ON_PAGE(!PageLocked(old), old); |
413 | VM_BUG_ON(!PageLocked(new)); | 413 | VM_BUG_ON_PAGE(!PageLocked(new), new); |
414 | VM_BUG_ON(new->mapping); | 414 | VM_BUG_ON_PAGE(new->mapping, new); |
415 | 415 | ||
416 | error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM); | 416 | error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM); |
417 | if (!error) { | 417 | if (!error) { |
@@ -461,8 +461,8 @@ int add_to_page_cache_locked(struct page *page, struct address_space *mapping, | |||
461 | { | 461 | { |
462 | int error; | 462 | int error; |
463 | 463 | ||
464 | VM_BUG_ON(!PageLocked(page)); | 464 | VM_BUG_ON_PAGE(!PageLocked(page), page); |
465 | VM_BUG_ON(PageSwapBacked(page)); | 465 | VM_BUG_ON_PAGE(PageSwapBacked(page), page); |
466 | 466 | ||
467 | error = mem_cgroup_cache_charge(page, current->mm, | 467 | error = mem_cgroup_cache_charge(page, current->mm, |
468 | gfp_mask & GFP_RECLAIM_MASK); | 468 | gfp_mask & GFP_RECLAIM_MASK); |
@@ -607,7 +607,7 @@ EXPORT_SYMBOL_GPL(add_page_wait_queue); | |||
607 | */ | 607 | */ |
608 | void unlock_page(struct page *page) | 608 | void unlock_page(struct page *page) |
609 | { | 609 | { |
610 | VM_BUG_ON(!PageLocked(page)); | 610 | VM_BUG_ON_PAGE(!PageLocked(page), page); |
611 | clear_bit_unlock(PG_locked, &page->flags); | 611 | clear_bit_unlock(PG_locked, &page->flags); |
612 | smp_mb__after_clear_bit(); | 612 | smp_mb__after_clear_bit(); |
613 | wake_up_page(page, PG_locked); | 613 | wake_up_page(page, PG_locked); |
@@ -760,7 +760,7 @@ repeat: | |||
760 | page_cache_release(page); | 760 | page_cache_release(page); |
761 | goto repeat; | 761 | goto repeat; |
762 | } | 762 | } |
763 | VM_BUG_ON(page->index != offset); | 763 | VM_BUG_ON_PAGE(page->index != offset, page); |
764 | } | 764 | } |
765 | return page; | 765 | return page; |
766 | } | 766 | } |
@@ -1428,30 +1428,28 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov, | |||
1428 | if (!count) | 1428 | if (!count) |
1429 | goto out; /* skip atime */ | 1429 | goto out; /* skip atime */ |
1430 | size = i_size_read(inode); | 1430 | size = i_size_read(inode); |
1431 | if (pos < size) { | 1431 | retval = filemap_write_and_wait_range(mapping, pos, |
1432 | retval = filemap_write_and_wait_range(mapping, pos, | ||
1433 | pos + iov_length(iov, nr_segs) - 1); | 1432 | pos + iov_length(iov, nr_segs) - 1); |
1434 | if (!retval) { | 1433 | if (!retval) { |
1435 | retval = mapping->a_ops->direct_IO(READ, iocb, | 1434 | retval = mapping->a_ops->direct_IO(READ, iocb, |
1436 | iov, pos, nr_segs); | 1435 | iov, pos, nr_segs); |
1437 | } | 1436 | } |
1438 | if (retval > 0) { | 1437 | if (retval > 0) { |
1439 | *ppos = pos + retval; | 1438 | *ppos = pos + retval; |
1440 | count -= retval; | 1439 | count -= retval; |
1441 | } | 1440 | } |
1442 | 1441 | ||
1443 | /* | 1442 | /* |
1444 | * Btrfs can have a short DIO read if we encounter | 1443 | * Btrfs can have a short DIO read if we encounter |
1445 | * compressed extents, so if there was an error, or if | 1444 | * compressed extents, so if there was an error, or if |
1446 | * we've already read everything we wanted to, or if | 1445 | * we've already read everything we wanted to, or if |
1447 | * there was a short read because we hit EOF, go ahead | 1446 | * there was a short read because we hit EOF, go ahead |
1448 | * and return. Otherwise fallthrough to buffered io for | 1447 | * and return. Otherwise fallthrough to buffered io for |
1449 | * the rest of the read. | 1448 | * the rest of the read. |
1450 | */ | 1449 | */ |
1451 | if (retval < 0 || !count || *ppos >= size) { | 1450 | if (retval < 0 || !count || *ppos >= size) { |
1452 | file_accessed(filp); | 1451 | file_accessed(filp); |
1453 | goto out; | 1452 | goto out; |
1454 | } | ||
1455 | } | 1453 | } |
1456 | } | 1454 | } |
1457 | 1455 | ||
@@ -1656,7 +1654,7 @@ retry_find: | |||
1656 | put_page(page); | 1654 | put_page(page); |
1657 | goto retry_find; | 1655 | goto retry_find; |
1658 | } | 1656 | } |
1659 | VM_BUG_ON(page->index != offset); | 1657 | VM_BUG_ON_PAGE(page->index != offset, page); |
1660 | 1658 | ||
1661 | /* | 1659 | /* |
1662 | * We have a locked page in the page cache, now we need to check | 1660 | * We have a locked page in the page cache, now we need to check |
diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 95d1acb0f3d2..82166bf974e1 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c | |||
@@ -130,8 +130,14 @@ static int set_recommended_min_free_kbytes(void) | |||
130 | (unsigned long) nr_free_buffer_pages() / 20); | 130 | (unsigned long) nr_free_buffer_pages() / 20); |
131 | recommended_min <<= (PAGE_SHIFT-10); | 131 | recommended_min <<= (PAGE_SHIFT-10); |
132 | 132 | ||
133 | if (recommended_min > min_free_kbytes) | 133 | if (recommended_min > min_free_kbytes) { |
134 | if (user_min_free_kbytes >= 0) | ||
135 | pr_info("raising min_free_kbytes from %d to %lu " | ||
136 | "to help transparent hugepage allocations\n", | ||
137 | min_free_kbytes, recommended_min); | ||
138 | |||
134 | min_free_kbytes = recommended_min; | 139 | min_free_kbytes = recommended_min; |
140 | } | ||
135 | setup_per_zone_wmarks(); | 141 | setup_per_zone_wmarks(); |
136 | return 0; | 142 | return 0; |
137 | } | 143 | } |
@@ -655,7 +661,7 @@ out: | |||
655 | hugepage_exit_sysfs(hugepage_kobj); | 661 | hugepage_exit_sysfs(hugepage_kobj); |
656 | return err; | 662 | return err; |
657 | } | 663 | } |
658 | module_init(hugepage_init) | 664 | subsys_initcall(hugepage_init); |
659 | 665 | ||
660 | static int __init setup_transparent_hugepage(char *str) | 666 | static int __init setup_transparent_hugepage(char *str) |
661 | { | 667 | { |
@@ -712,7 +718,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, | |||
712 | pgtable_t pgtable; | 718 | pgtable_t pgtable; |
713 | spinlock_t *ptl; | 719 | spinlock_t *ptl; |
714 | 720 | ||
715 | VM_BUG_ON(!PageCompound(page)); | 721 | VM_BUG_ON_PAGE(!PageCompound(page), page); |
716 | pgtable = pte_alloc_one(mm, haddr); | 722 | pgtable = pte_alloc_one(mm, haddr); |
717 | if (unlikely(!pgtable)) | 723 | if (unlikely(!pgtable)) |
718 | return VM_FAULT_OOM; | 724 | return VM_FAULT_OOM; |
@@ -893,7 +899,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, | |||
893 | goto out; | 899 | goto out; |
894 | } | 900 | } |
895 | src_page = pmd_page(pmd); | 901 | src_page = pmd_page(pmd); |
896 | VM_BUG_ON(!PageHead(src_page)); | 902 | VM_BUG_ON_PAGE(!PageHead(src_page), src_page); |
897 | get_page(src_page); | 903 | get_page(src_page); |
898 | page_dup_rmap(src_page); | 904 | page_dup_rmap(src_page); |
899 | add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR); | 905 | add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR); |
@@ -1067,7 +1073,7 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, | |||
1067 | ptl = pmd_lock(mm, pmd); | 1073 | ptl = pmd_lock(mm, pmd); |
1068 | if (unlikely(!pmd_same(*pmd, orig_pmd))) | 1074 | if (unlikely(!pmd_same(*pmd, orig_pmd))) |
1069 | goto out_free_pages; | 1075 | goto out_free_pages; |
1070 | VM_BUG_ON(!PageHead(page)); | 1076 | VM_BUG_ON_PAGE(!PageHead(page), page); |
1071 | 1077 | ||
1072 | pmdp_clear_flush(vma, haddr, pmd); | 1078 | pmdp_clear_flush(vma, haddr, pmd); |
1073 | /* leave pmd empty until pte is filled */ | 1079 | /* leave pmd empty until pte is filled */ |
@@ -1133,7 +1139,7 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
1133 | goto out_unlock; | 1139 | goto out_unlock; |
1134 | 1140 | ||
1135 | page = pmd_page(orig_pmd); | 1141 | page = pmd_page(orig_pmd); |
1136 | VM_BUG_ON(!PageCompound(page) || !PageHead(page)); | 1142 | VM_BUG_ON_PAGE(!PageCompound(page) || !PageHead(page), page); |
1137 | if (page_mapcount(page) == 1) { | 1143 | if (page_mapcount(page) == 1) { |
1138 | pmd_t entry; | 1144 | pmd_t entry; |
1139 | entry = pmd_mkyoung(orig_pmd); | 1145 | entry = pmd_mkyoung(orig_pmd); |
@@ -1211,7 +1217,7 @@ alloc: | |||
1211 | add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR); | 1217 | add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR); |
1212 | put_huge_zero_page(); | 1218 | put_huge_zero_page(); |
1213 | } else { | 1219 | } else { |
1214 | VM_BUG_ON(!PageHead(page)); | 1220 | VM_BUG_ON_PAGE(!PageHead(page), page); |
1215 | page_remove_rmap(page); | 1221 | page_remove_rmap(page); |
1216 | put_page(page); | 1222 | put_page(page); |
1217 | } | 1223 | } |
@@ -1249,7 +1255,7 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, | |||
1249 | goto out; | 1255 | goto out; |
1250 | 1256 | ||
1251 | page = pmd_page(*pmd); | 1257 | page = pmd_page(*pmd); |
1252 | VM_BUG_ON(!PageHead(page)); | 1258 | VM_BUG_ON_PAGE(!PageHead(page), page); |
1253 | if (flags & FOLL_TOUCH) { | 1259 | if (flags & FOLL_TOUCH) { |
1254 | pmd_t _pmd; | 1260 | pmd_t _pmd; |
1255 | /* | 1261 | /* |
@@ -1274,7 +1280,7 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, | |||
1274 | } | 1280 | } |
1275 | } | 1281 | } |
1276 | page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT; | 1282 | page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT; |
1277 | VM_BUG_ON(!PageCompound(page)); | 1283 | VM_BUG_ON_PAGE(!PageCompound(page), page); |
1278 | if (flags & FOLL_GET) | 1284 | if (flags & FOLL_GET) |
1279 | get_page_foll(page); | 1285 | get_page_foll(page); |
1280 | 1286 | ||
@@ -1432,9 +1438,9 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, | |||
1432 | } else { | 1438 | } else { |
1433 | page = pmd_page(orig_pmd); | 1439 | page = pmd_page(orig_pmd); |
1434 | page_remove_rmap(page); | 1440 | page_remove_rmap(page); |
1435 | VM_BUG_ON(page_mapcount(page) < 0); | 1441 | VM_BUG_ON_PAGE(page_mapcount(page) < 0, page); |
1436 | add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR); | 1442 | add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR); |
1437 | VM_BUG_ON(!PageHead(page)); | 1443 | VM_BUG_ON_PAGE(!PageHead(page), page); |
1438 | atomic_long_dec(&tlb->mm->nr_ptes); | 1444 | atomic_long_dec(&tlb->mm->nr_ptes); |
1439 | spin_unlock(ptl); | 1445 | spin_unlock(ptl); |
1440 | tlb_remove_page(tlb, page); | 1446 | tlb_remove_page(tlb, page); |
@@ -1502,19 +1508,15 @@ int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma, | |||
1502 | spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING); | 1508 | spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING); |
1503 | pmd = pmdp_get_and_clear(mm, old_addr, old_pmd); | 1509 | pmd = pmdp_get_and_clear(mm, old_addr, old_pmd); |
1504 | VM_BUG_ON(!pmd_none(*new_pmd)); | 1510 | VM_BUG_ON(!pmd_none(*new_pmd)); |
1505 | set_pmd_at(mm, new_addr, new_pmd, pmd_mksoft_dirty(pmd)); | ||
1506 | if (new_ptl != old_ptl) { | ||
1507 | pgtable_t pgtable; | ||
1508 | 1511 | ||
1509 | /* | 1512 | if (pmd_move_must_withdraw(new_ptl, old_ptl)) { |
1510 | * Move preallocated PTE page table if new_pmd is on | 1513 | pgtable_t pgtable; |
1511 | * different PMD page table. | ||
1512 | */ | ||
1513 | pgtable = pgtable_trans_huge_withdraw(mm, old_pmd); | 1514 | pgtable = pgtable_trans_huge_withdraw(mm, old_pmd); |
1514 | pgtable_trans_huge_deposit(mm, new_pmd, pgtable); | 1515 | pgtable_trans_huge_deposit(mm, new_pmd, pgtable); |
1515 | |||
1516 | spin_unlock(new_ptl); | ||
1517 | } | 1516 | } |
1517 | set_pmd_at(mm, new_addr, new_pmd, pmd_mksoft_dirty(pmd)); | ||
1518 | if (new_ptl != old_ptl) | ||
1519 | spin_unlock(new_ptl); | ||
1518 | spin_unlock(old_ptl); | 1520 | spin_unlock(old_ptl); |
1519 | } | 1521 | } |
1520 | out: | 1522 | out: |
@@ -2176,9 +2178,9 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, | |||
2176 | if (unlikely(!page)) | 2178 | if (unlikely(!page)) |
2177 | goto out; | 2179 | goto out; |
2178 | 2180 | ||
2179 | VM_BUG_ON(PageCompound(page)); | 2181 | VM_BUG_ON_PAGE(PageCompound(page), page); |
2180 | BUG_ON(!PageAnon(page)); | 2182 | VM_BUG_ON_PAGE(!PageAnon(page), page); |
2181 | VM_BUG_ON(!PageSwapBacked(page)); | 2183 | VM_BUG_ON_PAGE(!PageSwapBacked(page), page); |
2182 | 2184 | ||
2183 | /* cannot use mapcount: can't collapse if there's a gup pin */ | 2185 | /* cannot use mapcount: can't collapse if there's a gup pin */ |
2184 | if (page_count(page) != 1) | 2186 | if (page_count(page) != 1) |
@@ -2201,8 +2203,8 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, | |||
2201 | } | 2203 | } |
2202 | /* 0 stands for page_is_file_cache(page) == false */ | 2204 | /* 0 stands for page_is_file_cache(page) == false */ |
2203 | inc_zone_page_state(page, NR_ISOLATED_ANON + 0); | 2205 | inc_zone_page_state(page, NR_ISOLATED_ANON + 0); |
2204 | VM_BUG_ON(!PageLocked(page)); | 2206 | VM_BUG_ON_PAGE(!PageLocked(page), page); |
2205 | VM_BUG_ON(PageLRU(page)); | 2207 | VM_BUG_ON_PAGE(PageLRU(page), page); |
2206 | 2208 | ||
2207 | /* If there is no mapped pte young don't collapse the page */ | 2209 | /* If there is no mapped pte young don't collapse the page */ |
2208 | if (pte_young(pteval) || PageReferenced(page) || | 2210 | if (pte_young(pteval) || PageReferenced(page) || |
@@ -2232,7 +2234,7 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page, | |||
2232 | } else { | 2234 | } else { |
2233 | src_page = pte_page(pteval); | 2235 | src_page = pte_page(pteval); |
2234 | copy_user_highpage(page, src_page, address, vma); | 2236 | copy_user_highpage(page, src_page, address, vma); |
2235 | VM_BUG_ON(page_mapcount(src_page) != 1); | 2237 | VM_BUG_ON_PAGE(page_mapcount(src_page) != 1, src_page); |
2236 | release_pte_page(src_page); | 2238 | release_pte_page(src_page); |
2237 | /* | 2239 | /* |
2238 | * ptl mostly unnecessary, but preempt has to | 2240 | * ptl mostly unnecessary, but preempt has to |
@@ -2311,7 +2313,7 @@ static struct page | |||
2311 | struct vm_area_struct *vma, unsigned long address, | 2313 | struct vm_area_struct *vma, unsigned long address, |
2312 | int node) | 2314 | int node) |
2313 | { | 2315 | { |
2314 | VM_BUG_ON(*hpage); | 2316 | VM_BUG_ON_PAGE(*hpage, *hpage); |
2315 | /* | 2317 | /* |
2316 | * Allocate the page while the vma is still valid and under | 2318 | * Allocate the page while the vma is still valid and under |
2317 | * the mmap_sem read mode so there is no memory allocation | 2319 | * the mmap_sem read mode so there is no memory allocation |
@@ -2580,7 +2582,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, | |||
2580 | */ | 2582 | */ |
2581 | node = page_to_nid(page); | 2583 | node = page_to_nid(page); |
2582 | khugepaged_node_load[node]++; | 2584 | khugepaged_node_load[node]++; |
2583 | VM_BUG_ON(PageCompound(page)); | 2585 | VM_BUG_ON_PAGE(PageCompound(page), page); |
2584 | if (!PageLRU(page) || PageLocked(page) || !PageAnon(page)) | 2586 | if (!PageLRU(page) || PageLocked(page) || !PageAnon(page)) |
2585 | goto out_unmap; | 2587 | goto out_unmap; |
2586 | /* cannot use mapcount: can't collapse if there's a gup pin */ | 2588 | /* cannot use mapcount: can't collapse if there's a gup pin */ |
@@ -2876,7 +2878,7 @@ again: | |||
2876 | return; | 2878 | return; |
2877 | } | 2879 | } |
2878 | page = pmd_page(*pmd); | 2880 | page = pmd_page(*pmd); |
2879 | VM_BUG_ON(!page_count(page)); | 2881 | VM_BUG_ON_PAGE(!page_count(page), page); |
2880 | get_page(page); | 2882 | get_page(page); |
2881 | spin_unlock(ptl); | 2883 | spin_unlock(ptl); |
2882 | mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); | 2884 | mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); |
diff --git a/mm/hugetlb.c b/mm/hugetlb.c index dee6cf4e6d34..c01cb9fedb18 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c | |||
@@ -584,7 +584,7 @@ static void update_and_free_page(struct hstate *h, struct page *page) | |||
584 | 1 << PG_active | 1 << PG_reserved | | 584 | 1 << PG_active | 1 << PG_reserved | |
585 | 1 << PG_private | 1 << PG_writeback); | 585 | 1 << PG_private | 1 << PG_writeback); |
586 | } | 586 | } |
587 | VM_BUG_ON(hugetlb_cgroup_from_page(page)); | 587 | VM_BUG_ON_PAGE(hugetlb_cgroup_from_page(page), page); |
588 | set_compound_page_dtor(page, NULL); | 588 | set_compound_page_dtor(page, NULL); |
589 | set_page_refcounted(page); | 589 | set_page_refcounted(page); |
590 | arch_release_hugepage(page); | 590 | arch_release_hugepage(page); |
@@ -690,15 +690,11 @@ static void prep_compound_gigantic_page(struct page *page, unsigned long order) | |||
690 | */ | 690 | */ |
691 | int PageHuge(struct page *page) | 691 | int PageHuge(struct page *page) |
692 | { | 692 | { |
693 | compound_page_dtor *dtor; | ||
694 | |||
695 | if (!PageCompound(page)) | 693 | if (!PageCompound(page)) |
696 | return 0; | 694 | return 0; |
697 | 695 | ||
698 | page = compound_head(page); | 696 | page = compound_head(page); |
699 | dtor = get_compound_page_dtor(page); | 697 | return get_compound_page_dtor(page) == free_huge_page; |
700 | |||
701 | return dtor == free_huge_page; | ||
702 | } | 698 | } |
703 | EXPORT_SYMBOL_GPL(PageHuge); | 699 | EXPORT_SYMBOL_GPL(PageHuge); |
704 | 700 | ||
@@ -708,16 +704,11 @@ EXPORT_SYMBOL_GPL(PageHuge); | |||
708 | */ | 704 | */ |
709 | int PageHeadHuge(struct page *page_head) | 705 | int PageHeadHuge(struct page *page_head) |
710 | { | 706 | { |
711 | compound_page_dtor *dtor; | ||
712 | |||
713 | if (!PageHead(page_head)) | 707 | if (!PageHead(page_head)) |
714 | return 0; | 708 | return 0; |
715 | 709 | ||
716 | dtor = get_compound_page_dtor(page_head); | 710 | return get_compound_page_dtor(page_head) == free_huge_page; |
717 | |||
718 | return dtor == free_huge_page; | ||
719 | } | 711 | } |
720 | EXPORT_SYMBOL_GPL(PageHeadHuge); | ||
721 | 712 | ||
722 | pgoff_t __basepage_index(struct page *page) | 713 | pgoff_t __basepage_index(struct page *page) |
723 | { | 714 | { |
@@ -1098,7 +1089,7 @@ retry: | |||
1098 | * no users -- drop the buddy allocator's reference. | 1089 | * no users -- drop the buddy allocator's reference. |
1099 | */ | 1090 | */ |
1100 | put_page_testzero(page); | 1091 | put_page_testzero(page); |
1101 | VM_BUG_ON(page_count(page)); | 1092 | VM_BUG_ON_PAGE(page_count(page), page); |
1102 | enqueue_huge_page(h, page); | 1093 | enqueue_huge_page(h, page); |
1103 | } | 1094 | } |
1104 | free: | 1095 | free: |
@@ -1280,9 +1271,9 @@ int __weak alloc_bootmem_huge_page(struct hstate *h) | |||
1280 | for_each_node_mask_to_alloc(h, nr_nodes, node, &node_states[N_MEMORY]) { | 1271 | for_each_node_mask_to_alloc(h, nr_nodes, node, &node_states[N_MEMORY]) { |
1281 | void *addr; | 1272 | void *addr; |
1282 | 1273 | ||
1283 | addr = __alloc_bootmem_node_nopanic(NODE_DATA(node), | 1274 | addr = memblock_virt_alloc_try_nid_nopanic( |
1284 | huge_page_size(h), huge_page_size(h), 0); | 1275 | huge_page_size(h), huge_page_size(h), |
1285 | 1276 | 0, BOOTMEM_ALLOC_ACCESSIBLE, node); | |
1286 | if (addr) { | 1277 | if (addr) { |
1287 | /* | 1278 | /* |
1288 | * Use the beginning of the huge page to store the | 1279 | * Use the beginning of the huge page to store the |
@@ -1322,8 +1313,8 @@ static void __init gather_bootmem_prealloc(void) | |||
1322 | 1313 | ||
1323 | #ifdef CONFIG_HIGHMEM | 1314 | #ifdef CONFIG_HIGHMEM |
1324 | page = pfn_to_page(m->phys >> PAGE_SHIFT); | 1315 | page = pfn_to_page(m->phys >> PAGE_SHIFT); |
1325 | free_bootmem_late((unsigned long)m, | 1316 | memblock_free_late(__pa(m), |
1326 | sizeof(struct huge_bootmem_page)); | 1317 | sizeof(struct huge_bootmem_page)); |
1327 | #else | 1318 | #else |
1328 | page = virt_to_page(m); | 1319 | page = virt_to_page(m); |
1329 | #endif | 1320 | #endif |
@@ -2355,17 +2346,27 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, | |||
2355 | int cow; | 2346 | int cow; |
2356 | struct hstate *h = hstate_vma(vma); | 2347 | struct hstate *h = hstate_vma(vma); |
2357 | unsigned long sz = huge_page_size(h); | 2348 | unsigned long sz = huge_page_size(h); |
2349 | unsigned long mmun_start; /* For mmu_notifiers */ | ||
2350 | unsigned long mmun_end; /* For mmu_notifiers */ | ||
2351 | int ret = 0; | ||
2358 | 2352 | ||
2359 | cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; | 2353 | cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; |
2360 | 2354 | ||
2355 | mmun_start = vma->vm_start; | ||
2356 | mmun_end = vma->vm_end; | ||
2357 | if (cow) | ||
2358 | mmu_notifier_invalidate_range_start(src, mmun_start, mmun_end); | ||
2359 | |||
2361 | for (addr = vma->vm_start; addr < vma->vm_end; addr += sz) { | 2360 | for (addr = vma->vm_start; addr < vma->vm_end; addr += sz) { |
2362 | spinlock_t *src_ptl, *dst_ptl; | 2361 | spinlock_t *src_ptl, *dst_ptl; |
2363 | src_pte = huge_pte_offset(src, addr); | 2362 | src_pte = huge_pte_offset(src, addr); |
2364 | if (!src_pte) | 2363 | if (!src_pte) |
2365 | continue; | 2364 | continue; |
2366 | dst_pte = huge_pte_alloc(dst, addr, sz); | 2365 | dst_pte = huge_pte_alloc(dst, addr, sz); |
2367 | if (!dst_pte) | 2366 | if (!dst_pte) { |
2368 | goto nomem; | 2367 | ret = -ENOMEM; |
2368 | break; | ||
2369 | } | ||
2369 | 2370 | ||
2370 | /* If the pagetables are shared don't copy or take references */ | 2371 | /* If the pagetables are shared don't copy or take references */ |
2371 | if (dst_pte == src_pte) | 2372 | if (dst_pte == src_pte) |
@@ -2386,10 +2387,11 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, | |||
2386 | spin_unlock(src_ptl); | 2387 | spin_unlock(src_ptl); |
2387 | spin_unlock(dst_ptl); | 2388 | spin_unlock(dst_ptl); |
2388 | } | 2389 | } |
2389 | return 0; | ||
2390 | 2390 | ||
2391 | nomem: | 2391 | if (cow) |
2392 | return -ENOMEM; | 2392 | mmu_notifier_invalidate_range_end(src, mmun_start, mmun_end); |
2393 | |||
2394 | return ret; | ||
2393 | } | 2395 | } |
2394 | 2396 | ||
2395 | static int is_hugetlb_entry_migration(pte_t pte) | 2397 | static int is_hugetlb_entry_migration(pte_t pte) |
@@ -3079,7 +3081,7 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
3079 | same_page: | 3081 | same_page: |
3080 | if (pages) { | 3082 | if (pages) { |
3081 | pages[i] = mem_map_offset(page, pfn_offset); | 3083 | pages[i] = mem_map_offset(page, pfn_offset); |
3082 | get_page(pages[i]); | 3084 | get_page_foll(pages[i]); |
3083 | } | 3085 | } |
3084 | 3086 | ||
3085 | if (vmas) | 3087 | if (vmas) |
@@ -3501,7 +3503,7 @@ int dequeue_hwpoisoned_huge_page(struct page *hpage) | |||
3501 | 3503 | ||
3502 | bool isolate_huge_page(struct page *page, struct list_head *list) | 3504 | bool isolate_huge_page(struct page *page, struct list_head *list) |
3503 | { | 3505 | { |
3504 | VM_BUG_ON(!PageHead(page)); | 3506 | VM_BUG_ON_PAGE(!PageHead(page), page); |
3505 | if (!get_page_unless_zero(page)) | 3507 | if (!get_page_unless_zero(page)) |
3506 | return false; | 3508 | return false; |
3507 | spin_lock(&hugetlb_lock); | 3509 | spin_lock(&hugetlb_lock); |
@@ -3512,7 +3514,7 @@ bool isolate_huge_page(struct page *page, struct list_head *list) | |||
3512 | 3514 | ||
3513 | void putback_active_hugepage(struct page *page) | 3515 | void putback_active_hugepage(struct page *page) |
3514 | { | 3516 | { |
3515 | VM_BUG_ON(!PageHead(page)); | 3517 | VM_BUG_ON_PAGE(!PageHead(page), page); |
3516 | spin_lock(&hugetlb_lock); | 3518 | spin_lock(&hugetlb_lock); |
3517 | list_move_tail(&page->lru, &(page_hstate(page))->hugepage_activelist); | 3519 | list_move_tail(&page->lru, &(page_hstate(page))->hugepage_activelist); |
3518 | spin_unlock(&hugetlb_lock); | 3520 | spin_unlock(&hugetlb_lock); |
@@ -3521,7 +3523,7 @@ void putback_active_hugepage(struct page *page) | |||
3521 | 3523 | ||
3522 | bool is_hugepage_active(struct page *page) | 3524 | bool is_hugepage_active(struct page *page) |
3523 | { | 3525 | { |
3524 | VM_BUG_ON(!PageHuge(page)); | 3526 | VM_BUG_ON_PAGE(!PageHuge(page), page); |
3525 | /* | 3527 | /* |
3526 | * This function can be called for a tail page because the caller, | 3528 | * This function can be called for a tail page because the caller, |
3527 | * scan_movable_pages, scans through a given pfn-range which typically | 3529 | * scan_movable_pages, scans through a given pfn-range which typically |
diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c index bda8e44f6fde..cb00829bb466 100644 --- a/mm/hugetlb_cgroup.c +++ b/mm/hugetlb_cgroup.c | |||
@@ -242,22 +242,16 @@ void hugetlb_cgroup_uncharge_cgroup(int idx, unsigned long nr_pages, | |||
242 | return; | 242 | return; |
243 | } | 243 | } |
244 | 244 | ||
245 | static ssize_t hugetlb_cgroup_read(struct cgroup_subsys_state *css, | 245 | static u64 hugetlb_cgroup_read_u64(struct cgroup_subsys_state *css, |
246 | struct cftype *cft, struct file *file, | 246 | struct cftype *cft) |
247 | char __user *buf, size_t nbytes, | ||
248 | loff_t *ppos) | ||
249 | { | 247 | { |
250 | u64 val; | 248 | int idx, name; |
251 | char str[64]; | ||
252 | int idx, name, len; | ||
253 | struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(css); | 249 | struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(css); |
254 | 250 | ||
255 | idx = MEMFILE_IDX(cft->private); | 251 | idx = MEMFILE_IDX(cft->private); |
256 | name = MEMFILE_ATTR(cft->private); | 252 | name = MEMFILE_ATTR(cft->private); |
257 | 253 | ||
258 | val = res_counter_read_u64(&h_cg->hugepage[idx], name); | 254 | return res_counter_read_u64(&h_cg->hugepage[idx], name); |
259 | len = scnprintf(str, sizeof(str), "%llu\n", (unsigned long long)val); | ||
260 | return simple_read_from_buffer(buf, nbytes, ppos, str, len); | ||
261 | } | 255 | } |
262 | 256 | ||
263 | static int hugetlb_cgroup_write(struct cgroup_subsys_state *css, | 257 | static int hugetlb_cgroup_write(struct cgroup_subsys_state *css, |
@@ -337,28 +331,28 @@ static void __init __hugetlb_cgroup_file_init(int idx) | |||
337 | cft = &h->cgroup_files[0]; | 331 | cft = &h->cgroup_files[0]; |
338 | snprintf(cft->name, MAX_CFTYPE_NAME, "%s.limit_in_bytes", buf); | 332 | snprintf(cft->name, MAX_CFTYPE_NAME, "%s.limit_in_bytes", buf); |
339 | cft->private = MEMFILE_PRIVATE(idx, RES_LIMIT); | 333 | cft->private = MEMFILE_PRIVATE(idx, RES_LIMIT); |
340 | cft->read = hugetlb_cgroup_read; | 334 | cft->read_u64 = hugetlb_cgroup_read_u64; |
341 | cft->write_string = hugetlb_cgroup_write; | 335 | cft->write_string = hugetlb_cgroup_write; |
342 | 336 | ||
343 | /* Add the usage file */ | 337 | /* Add the usage file */ |
344 | cft = &h->cgroup_files[1]; | 338 | cft = &h->cgroup_files[1]; |
345 | snprintf(cft->name, MAX_CFTYPE_NAME, "%s.usage_in_bytes", buf); | 339 | snprintf(cft->name, MAX_CFTYPE_NAME, "%s.usage_in_bytes", buf); |
346 | cft->private = MEMFILE_PRIVATE(idx, RES_USAGE); | 340 | cft->private = MEMFILE_PRIVATE(idx, RES_USAGE); |
347 | cft->read = hugetlb_cgroup_read; | 341 | cft->read_u64 = hugetlb_cgroup_read_u64; |
348 | 342 | ||
349 | /* Add the MAX usage file */ | 343 | /* Add the MAX usage file */ |
350 | cft = &h->cgroup_files[2]; | 344 | cft = &h->cgroup_files[2]; |
351 | snprintf(cft->name, MAX_CFTYPE_NAME, "%s.max_usage_in_bytes", buf); | 345 | snprintf(cft->name, MAX_CFTYPE_NAME, "%s.max_usage_in_bytes", buf); |
352 | cft->private = MEMFILE_PRIVATE(idx, RES_MAX_USAGE); | 346 | cft->private = MEMFILE_PRIVATE(idx, RES_MAX_USAGE); |
353 | cft->trigger = hugetlb_cgroup_reset; | 347 | cft->trigger = hugetlb_cgroup_reset; |
354 | cft->read = hugetlb_cgroup_read; | 348 | cft->read_u64 = hugetlb_cgroup_read_u64; |
355 | 349 | ||
356 | /* Add the failcntfile */ | 350 | /* Add the failcntfile */ |
357 | cft = &h->cgroup_files[3]; | 351 | cft = &h->cgroup_files[3]; |
358 | snprintf(cft->name, MAX_CFTYPE_NAME, "%s.failcnt", buf); | 352 | snprintf(cft->name, MAX_CFTYPE_NAME, "%s.failcnt", buf); |
359 | cft->private = MEMFILE_PRIVATE(idx, RES_FAILCNT); | 353 | cft->private = MEMFILE_PRIVATE(idx, RES_FAILCNT); |
360 | cft->trigger = hugetlb_cgroup_reset; | 354 | cft->trigger = hugetlb_cgroup_reset; |
361 | cft->read = hugetlb_cgroup_read; | 355 | cft->read_u64 = hugetlb_cgroup_read_u64; |
362 | 356 | ||
363 | /* NULL terminate the last cft */ | 357 | /* NULL terminate the last cft */ |
364 | cft = &h->cgroup_files[4]; | 358 | cft = &h->cgroup_files[4]; |
@@ -396,7 +390,7 @@ void hugetlb_cgroup_migrate(struct page *oldhpage, struct page *newhpage) | |||
396 | if (hugetlb_cgroup_disabled()) | 390 | if (hugetlb_cgroup_disabled()) |
397 | return; | 391 | return; |
398 | 392 | ||
399 | VM_BUG_ON(!PageHuge(oldhpage)); | 393 | VM_BUG_ON_PAGE(!PageHuge(oldhpage), oldhpage); |
400 | spin_lock(&hugetlb_lock); | 394 | spin_lock(&hugetlb_lock); |
401 | h_cg = hugetlb_cgroup_from_page(oldhpage); | 395 | h_cg = hugetlb_cgroup_from_page(oldhpage); |
402 | set_hugetlb_cgroup(oldhpage, NULL); | 396 | set_hugetlb_cgroup(oldhpage, NULL); |
diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c index 4c84678371eb..95487c71cad5 100644 --- a/mm/hwpoison-inject.c +++ b/mm/hwpoison-inject.c | |||
@@ -55,7 +55,7 @@ static int hwpoison_inject(void *data, u64 val) | |||
55 | return 0; | 55 | return 0; |
56 | 56 | ||
57 | inject: | 57 | inject: |
58 | printk(KERN_INFO "Injecting memory failure at pfn %lx\n", pfn); | 58 | pr_info("Injecting memory failure at pfn %#lx\n", pfn); |
59 | return memory_failure(pfn, 18, MF_COUNT_INCREASED); | 59 | return memory_failure(pfn, 18, MF_COUNT_INCREASED); |
60 | } | 60 | } |
61 | 61 | ||
diff --git a/mm/internal.h b/mm/internal.h index 684f7aa9692a..29e1e761f9eb 100644 --- a/mm/internal.h +++ b/mm/internal.h | |||
@@ -27,8 +27,8 @@ static inline void set_page_count(struct page *page, int v) | |||
27 | */ | 27 | */ |
28 | static inline void set_page_refcounted(struct page *page) | 28 | static inline void set_page_refcounted(struct page *page) |
29 | { | 29 | { |
30 | VM_BUG_ON(PageTail(page)); | 30 | VM_BUG_ON_PAGE(PageTail(page), page); |
31 | VM_BUG_ON(atomic_read(&page->_count)); | 31 | VM_BUG_ON_PAGE(atomic_read(&page->_count), page); |
32 | set_page_count(page, 1); | 32 | set_page_count(page, 1); |
33 | } | 33 | } |
34 | 34 | ||
@@ -46,12 +46,10 @@ static inline void __get_page_tail_foll(struct page *page, | |||
46 | * speculative page access (like in | 46 | * speculative page access (like in |
47 | * page_cache_get_speculative()) on tail pages. | 47 | * page_cache_get_speculative()) on tail pages. |
48 | */ | 48 | */ |
49 | VM_BUG_ON(atomic_read(&page->first_page->_count) <= 0); | 49 | VM_BUG_ON_PAGE(atomic_read(&page->first_page->_count) <= 0, page); |
50 | VM_BUG_ON(atomic_read(&page->_count) != 0); | ||
51 | VM_BUG_ON(page_mapcount(page) < 0); | ||
52 | if (get_page_head) | 50 | if (get_page_head) |
53 | atomic_inc(&page->first_page->_count); | 51 | atomic_inc(&page->first_page->_count); |
54 | atomic_inc(&page->_mapcount); | 52 | get_huge_page_tail(page); |
55 | } | 53 | } |
56 | 54 | ||
57 | /* | 55 | /* |
@@ -73,7 +71,7 @@ static inline void get_page_foll(struct page *page) | |||
73 | * Getting a normal page or the head of a compound page | 71 | * Getting a normal page or the head of a compound page |
74 | * requires to already have an elevated page->_count. | 72 | * requires to already have an elevated page->_count. |
75 | */ | 73 | */ |
76 | VM_BUG_ON(atomic_read(&page->_count) <= 0); | 74 | VM_BUG_ON_PAGE(atomic_read(&page->_count) <= 0, page); |
77 | atomic_inc(&page->_count); | 75 | atomic_inc(&page->_count); |
78 | } | 76 | } |
79 | } | 77 | } |
@@ -85,7 +83,6 @@ extern unsigned long highest_memmap_pfn; | |||
85 | */ | 83 | */ |
86 | extern int isolate_lru_page(struct page *page); | 84 | extern int isolate_lru_page(struct page *page); |
87 | extern void putback_lru_page(struct page *page); | 85 | extern void putback_lru_page(struct page *page); |
88 | extern unsigned long zone_reclaimable_pages(struct zone *zone); | ||
89 | extern bool zone_reclaimable(struct zone *zone); | 86 | extern bool zone_reclaimable(struct zone *zone); |
90 | 87 | ||
91 | /* | 88 | /* |
@@ -101,6 +98,7 @@ extern void prep_compound_page(struct page *page, unsigned long order); | |||
101 | #ifdef CONFIG_MEMORY_FAILURE | 98 | #ifdef CONFIG_MEMORY_FAILURE |
102 | extern bool is_free_buddy_page(struct page *page); | 99 | extern bool is_free_buddy_page(struct page *page); |
103 | #endif | 100 | #endif |
101 | extern int user_min_free_kbytes; | ||
104 | 102 | ||
105 | #if defined CONFIG_COMPACTION || defined CONFIG_CMA | 103 | #if defined CONFIG_COMPACTION || defined CONFIG_CMA |
106 | 104 | ||
@@ -144,9 +142,11 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, | |||
144 | #endif | 142 | #endif |
145 | 143 | ||
146 | /* | 144 | /* |
147 | * function for dealing with page's order in buddy system. | 145 | * This function returns the order of a free page in the buddy system. In |
148 | * zone->lock is already acquired when we use these. | 146 | * general, page_zone(page)->lock must be held by the caller to prevent the |
149 | * So, we don't need atomic page->flags operations here. | 147 | * page from being allocated in parallel and returning garbage as the order. |
148 | * If a caller does not hold page_zone(page)->lock, it must guarantee that the | ||
149 | * page cannot be allocated or merged in parallel. | ||
150 | */ | 150 | */ |
151 | static inline unsigned long page_order(struct page *page) | 151 | static inline unsigned long page_order(struct page *page) |
152 | { | 152 | { |
@@ -175,7 +175,7 @@ static inline void munlock_vma_pages_all(struct vm_area_struct *vma) | |||
175 | static inline int mlocked_vma_newpage(struct vm_area_struct *vma, | 175 | static inline int mlocked_vma_newpage(struct vm_area_struct *vma, |
176 | struct page *page) | 176 | struct page *page) |
177 | { | 177 | { |
178 | VM_BUG_ON(PageLRU(page)); | 178 | VM_BUG_ON_PAGE(PageLRU(page), page); |
179 | 179 | ||
180 | if (likely((vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) != VM_LOCKED)) | 180 | if (likely((vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) != VM_LOCKED)) |
181 | return 0; | 181 | return 0; |
@@ -1891,21 +1891,24 @@ struct page *ksm_might_need_to_copy(struct page *page, | |||
1891 | return new_page; | 1891 | return new_page; |
1892 | } | 1892 | } |
1893 | 1893 | ||
1894 | int page_referenced_ksm(struct page *page, struct mem_cgroup *memcg, | 1894 | int rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc) |
1895 | unsigned long *vm_flags) | ||
1896 | { | 1895 | { |
1897 | struct stable_node *stable_node; | 1896 | struct stable_node *stable_node; |
1898 | struct rmap_item *rmap_item; | 1897 | struct rmap_item *rmap_item; |
1899 | unsigned int mapcount = page_mapcount(page); | 1898 | int ret = SWAP_AGAIN; |
1900 | int referenced = 0; | ||
1901 | int search_new_forks = 0; | 1899 | int search_new_forks = 0; |
1902 | 1900 | ||
1903 | VM_BUG_ON(!PageKsm(page)); | 1901 | VM_BUG_ON_PAGE(!PageKsm(page), page); |
1904 | VM_BUG_ON(!PageLocked(page)); | 1902 | |
1903 | /* | ||
1904 | * Rely on the page lock to protect against concurrent modifications | ||
1905 | * to that page's node of the stable tree. | ||
1906 | */ | ||
1907 | VM_BUG_ON_PAGE(!PageLocked(page), page); | ||
1905 | 1908 | ||
1906 | stable_node = page_stable_node(page); | 1909 | stable_node = page_stable_node(page); |
1907 | if (!stable_node) | 1910 | if (!stable_node) |
1908 | return 0; | 1911 | return ret; |
1909 | again: | 1912 | again: |
1910 | hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) { | 1913 | hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) { |
1911 | struct anon_vma *anon_vma = rmap_item->anon_vma; | 1914 | struct anon_vma *anon_vma = rmap_item->anon_vma; |
@@ -1928,113 +1931,16 @@ again: | |||
1928 | if ((rmap_item->mm == vma->vm_mm) == search_new_forks) | 1931 | if ((rmap_item->mm == vma->vm_mm) == search_new_forks) |
1929 | continue; | 1932 | continue; |
1930 | 1933 | ||
1931 | if (memcg && !mm_match_cgroup(vma->vm_mm, memcg)) | 1934 | if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg)) |
1932 | continue; | ||
1933 | |||
1934 | referenced += page_referenced_one(page, vma, | ||
1935 | rmap_item->address, &mapcount, vm_flags); | ||
1936 | if (!search_new_forks || !mapcount) | ||
1937 | break; | ||
1938 | } | ||
1939 | anon_vma_unlock_read(anon_vma); | ||
1940 | if (!mapcount) | ||
1941 | goto out; | ||
1942 | } | ||
1943 | if (!search_new_forks++) | ||
1944 | goto again; | ||
1945 | out: | ||
1946 | return referenced; | ||
1947 | } | ||
1948 | |||
1949 | int try_to_unmap_ksm(struct page *page, enum ttu_flags flags) | ||
1950 | { | ||
1951 | struct stable_node *stable_node; | ||
1952 | struct rmap_item *rmap_item; | ||
1953 | int ret = SWAP_AGAIN; | ||
1954 | int search_new_forks = 0; | ||
1955 | |||
1956 | VM_BUG_ON(!PageKsm(page)); | ||
1957 | VM_BUG_ON(!PageLocked(page)); | ||
1958 | |||
1959 | stable_node = page_stable_node(page); | ||
1960 | if (!stable_node) | ||
1961 | return SWAP_FAIL; | ||
1962 | again: | ||
1963 | hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) { | ||
1964 | struct anon_vma *anon_vma = rmap_item->anon_vma; | ||
1965 | struct anon_vma_chain *vmac; | ||
1966 | struct vm_area_struct *vma; | ||
1967 | |||
1968 | anon_vma_lock_read(anon_vma); | ||
1969 | anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root, | ||
1970 | 0, ULONG_MAX) { | ||
1971 | vma = vmac->vma; | ||
1972 | if (rmap_item->address < vma->vm_start || | ||
1973 | rmap_item->address >= vma->vm_end) | ||
1974 | continue; | ||
1975 | /* | ||
1976 | * Initially we examine only the vma which covers this | ||
1977 | * rmap_item; but later, if there is still work to do, | ||
1978 | * we examine covering vmas in other mms: in case they | ||
1979 | * were forked from the original since ksmd passed. | ||
1980 | */ | ||
1981 | if ((rmap_item->mm == vma->vm_mm) == search_new_forks) | ||
1982 | continue; | 1935 | continue; |
1983 | 1936 | ||
1984 | ret = try_to_unmap_one(page, vma, | 1937 | ret = rwc->rmap_one(page, vma, |
1985 | rmap_item->address, flags); | 1938 | rmap_item->address, rwc->arg); |
1986 | if (ret != SWAP_AGAIN || !page_mapped(page)) { | 1939 | if (ret != SWAP_AGAIN) { |
1987 | anon_vma_unlock_read(anon_vma); | 1940 | anon_vma_unlock_read(anon_vma); |
1988 | goto out; | 1941 | goto out; |
1989 | } | 1942 | } |
1990 | } | 1943 | if (rwc->done && rwc->done(page)) { |
1991 | anon_vma_unlock_read(anon_vma); | ||
1992 | } | ||
1993 | if (!search_new_forks++) | ||
1994 | goto again; | ||
1995 | out: | ||
1996 | return ret; | ||
1997 | } | ||
1998 | |||
1999 | #ifdef CONFIG_MIGRATION | ||
2000 | int rmap_walk_ksm(struct page *page, int (*rmap_one)(struct page *, | ||
2001 | struct vm_area_struct *, unsigned long, void *), void *arg) | ||
2002 | { | ||
2003 | struct stable_node *stable_node; | ||
2004 | struct rmap_item *rmap_item; | ||
2005 | int ret = SWAP_AGAIN; | ||
2006 | int search_new_forks = 0; | ||
2007 | |||
2008 | VM_BUG_ON(!PageKsm(page)); | ||
2009 | VM_BUG_ON(!PageLocked(page)); | ||
2010 | |||
2011 | stable_node = page_stable_node(page); | ||
2012 | if (!stable_node) | ||
2013 | return ret; | ||
2014 | again: | ||
2015 | hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) { | ||
2016 | struct anon_vma *anon_vma = rmap_item->anon_vma; | ||
2017 | struct anon_vma_chain *vmac; | ||
2018 | struct vm_area_struct *vma; | ||
2019 | |||
2020 | anon_vma_lock_read(anon_vma); | ||
2021 | anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root, | ||
2022 | 0, ULONG_MAX) { | ||
2023 | vma = vmac->vma; | ||
2024 | if (rmap_item->address < vma->vm_start || | ||
2025 | rmap_item->address >= vma->vm_end) | ||
2026 | continue; | ||
2027 | /* | ||
2028 | * Initially we examine only the vma which covers this | ||
2029 | * rmap_item; but later, if there is still work to do, | ||
2030 | * we examine covering vmas in other mms: in case they | ||
2031 | * were forked from the original since ksmd passed. | ||
2032 | */ | ||
2033 | if ((rmap_item->mm == vma->vm_mm) == search_new_forks) | ||
2034 | continue; | ||
2035 | |||
2036 | ret = rmap_one(page, vma, rmap_item->address, arg); | ||
2037 | if (ret != SWAP_AGAIN) { | ||
2038 | anon_vma_unlock_read(anon_vma); | 1944 | anon_vma_unlock_read(anon_vma); |
2039 | goto out; | 1945 | goto out; |
2040 | } | 1946 | } |
@@ -2047,17 +1953,18 @@ out: | |||
2047 | return ret; | 1953 | return ret; |
2048 | } | 1954 | } |
2049 | 1955 | ||
1956 | #ifdef CONFIG_MIGRATION | ||
2050 | void ksm_migrate_page(struct page *newpage, struct page *oldpage) | 1957 | void ksm_migrate_page(struct page *newpage, struct page *oldpage) |
2051 | { | 1958 | { |
2052 | struct stable_node *stable_node; | 1959 | struct stable_node *stable_node; |
2053 | 1960 | ||
2054 | VM_BUG_ON(!PageLocked(oldpage)); | 1961 | VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage); |
2055 | VM_BUG_ON(!PageLocked(newpage)); | 1962 | VM_BUG_ON_PAGE(!PageLocked(newpage), newpage); |
2056 | VM_BUG_ON(newpage->mapping != oldpage->mapping); | 1963 | VM_BUG_ON_PAGE(newpage->mapping != oldpage->mapping, newpage); |
2057 | 1964 | ||
2058 | stable_node = page_stable_node(newpage); | 1965 | stable_node = page_stable_node(newpage); |
2059 | if (stable_node) { | 1966 | if (stable_node) { |
2060 | VM_BUG_ON(stable_node->kpfn != page_to_pfn(oldpage)); | 1967 | VM_BUG_ON_PAGE(stable_node->kpfn != page_to_pfn(oldpage), oldpage); |
2061 | stable_node->kpfn = page_to_pfn(newpage); | 1968 | stable_node->kpfn = page_to_pfn(newpage); |
2062 | /* | 1969 | /* |
2063 | * newpage->mapping was set in advance; now we need smp_wmb() | 1970 | * newpage->mapping was set in advance; now we need smp_wmb() |
@@ -2438,4 +2345,4 @@ out_free: | |||
2438 | out: | 2345 | out: |
2439 | return err; | 2346 | return err; |
2440 | } | 2347 | } |
2441 | module_init(ksm_init) | 2348 | subsys_initcall(ksm_init); |
diff --git a/mm/memblock.c b/mm/memblock.c index 53e477bb5558..39a31e7f0045 100644 --- a/mm/memblock.c +++ b/mm/memblock.c | |||
@@ -21,6 +21,9 @@ | |||
21 | #include <linux/memblock.h> | 21 | #include <linux/memblock.h> |
22 | 22 | ||
23 | #include <asm-generic/sections.h> | 23 | #include <asm-generic/sections.h> |
24 | #include <linux/io.h> | ||
25 | |||
26 | #include "internal.h" | ||
24 | 27 | ||
25 | static struct memblock_region memblock_memory_init_regions[INIT_MEMBLOCK_REGIONS] __initdata_memblock; | 28 | static struct memblock_region memblock_memory_init_regions[INIT_MEMBLOCK_REGIONS] __initdata_memblock; |
26 | static struct memblock_region memblock_reserved_init_regions[INIT_MEMBLOCK_REGIONS] __initdata_memblock; | 29 | static struct memblock_region memblock_reserved_init_regions[INIT_MEMBLOCK_REGIONS] __initdata_memblock; |
@@ -39,6 +42,9 @@ struct memblock memblock __initdata_memblock = { | |||
39 | }; | 42 | }; |
40 | 43 | ||
41 | int memblock_debug __initdata_memblock; | 44 | int memblock_debug __initdata_memblock; |
45 | #ifdef CONFIG_MOVABLE_NODE | ||
46 | bool movable_node_enabled __initdata_memblock = false; | ||
47 | #endif | ||
42 | static int memblock_can_resize __initdata_memblock; | 48 | static int memblock_can_resize __initdata_memblock; |
43 | static int memblock_memory_in_slab __initdata_memblock = 0; | 49 | static int memblock_memory_in_slab __initdata_memblock = 0; |
44 | static int memblock_reserved_in_slab __initdata_memblock = 0; | 50 | static int memblock_reserved_in_slab __initdata_memblock = 0; |
@@ -91,7 +97,7 @@ static long __init_memblock memblock_overlaps_region(struct memblock_type *type, | |||
91 | * @end: end of candidate range, can be %MEMBLOCK_ALLOC_{ANYWHERE|ACCESSIBLE} | 97 | * @end: end of candidate range, can be %MEMBLOCK_ALLOC_{ANYWHERE|ACCESSIBLE} |
92 | * @size: size of free area to find | 98 | * @size: size of free area to find |
93 | * @align: alignment of free area to find | 99 | * @align: alignment of free area to find |
94 | * @nid: nid of the free area to find, %MAX_NUMNODES for any node | 100 | * @nid: nid of the free area to find, %NUMA_NO_NODE for any node |
95 | * | 101 | * |
96 | * Utility called from memblock_find_in_range_node(), find free area bottom-up. | 102 | * Utility called from memblock_find_in_range_node(), find free area bottom-up. |
97 | * | 103 | * |
@@ -123,7 +129,7 @@ __memblock_find_range_bottom_up(phys_addr_t start, phys_addr_t end, | |||
123 | * @end: end of candidate range, can be %MEMBLOCK_ALLOC_{ANYWHERE|ACCESSIBLE} | 129 | * @end: end of candidate range, can be %MEMBLOCK_ALLOC_{ANYWHERE|ACCESSIBLE} |
124 | * @size: size of free area to find | 130 | * @size: size of free area to find |
125 | * @align: alignment of free area to find | 131 | * @align: alignment of free area to find |
126 | * @nid: nid of the free area to find, %MAX_NUMNODES for any node | 132 | * @nid: nid of the free area to find, %NUMA_NO_NODE for any node |
127 | * | 133 | * |
128 | * Utility called from memblock_find_in_range_node(), find free area top-down. | 134 | * Utility called from memblock_find_in_range_node(), find free area top-down. |
129 | * | 135 | * |
@@ -154,11 +160,11 @@ __memblock_find_range_top_down(phys_addr_t start, phys_addr_t end, | |||
154 | 160 | ||
155 | /** | 161 | /** |
156 | * memblock_find_in_range_node - find free area in given range and node | 162 | * memblock_find_in_range_node - find free area in given range and node |
157 | * @start: start of candidate range | ||
158 | * @end: end of candidate range, can be %MEMBLOCK_ALLOC_{ANYWHERE|ACCESSIBLE} | ||
159 | * @size: size of free area to find | 163 | * @size: size of free area to find |
160 | * @align: alignment of free area to find | 164 | * @align: alignment of free area to find |
161 | * @nid: nid of the free area to find, %MAX_NUMNODES for any node | 165 | * @start: start of candidate range |
166 | * @end: end of candidate range, can be %MEMBLOCK_ALLOC_{ANYWHERE|ACCESSIBLE} | ||
167 | * @nid: nid of the free area to find, %NUMA_NO_NODE for any node | ||
162 | * | 168 | * |
163 | * Find @size free area aligned to @align in the specified range and node. | 169 | * Find @size free area aligned to @align in the specified range and node. |
164 | * | 170 | * |
@@ -173,9 +179,9 @@ __memblock_find_range_top_down(phys_addr_t start, phys_addr_t end, | |||
173 | * RETURNS: | 179 | * RETURNS: |
174 | * Found address on success, 0 on failure. | 180 | * Found address on success, 0 on failure. |
175 | */ | 181 | */ |
176 | phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t start, | 182 | phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t size, |
177 | phys_addr_t end, phys_addr_t size, | 183 | phys_addr_t align, phys_addr_t start, |
178 | phys_addr_t align, int nid) | 184 | phys_addr_t end, int nid) |
179 | { | 185 | { |
180 | int ret; | 186 | int ret; |
181 | phys_addr_t kernel_end; | 187 | phys_addr_t kernel_end; |
@@ -238,8 +244,8 @@ phys_addr_t __init_memblock memblock_find_in_range(phys_addr_t start, | |||
238 | phys_addr_t end, phys_addr_t size, | 244 | phys_addr_t end, phys_addr_t size, |
239 | phys_addr_t align) | 245 | phys_addr_t align) |
240 | { | 246 | { |
241 | return memblock_find_in_range_node(start, end, size, align, | 247 | return memblock_find_in_range_node(size, align, start, end, |
242 | MAX_NUMNODES); | 248 | NUMA_NO_NODE); |
243 | } | 249 | } |
244 | 250 | ||
245 | static void __init_memblock memblock_remove_region(struct memblock_type *type, unsigned long r) | 251 | static void __init_memblock memblock_remove_region(struct memblock_type *type, unsigned long r) |
@@ -255,10 +261,13 @@ static void __init_memblock memblock_remove_region(struct memblock_type *type, u | |||
255 | type->cnt = 1; | 261 | type->cnt = 1; |
256 | type->regions[0].base = 0; | 262 | type->regions[0].base = 0; |
257 | type->regions[0].size = 0; | 263 | type->regions[0].size = 0; |
264 | type->regions[0].flags = 0; | ||
258 | memblock_set_region_node(&type->regions[0], MAX_NUMNODES); | 265 | memblock_set_region_node(&type->regions[0], MAX_NUMNODES); |
259 | } | 266 | } |
260 | } | 267 | } |
261 | 268 | ||
269 | #ifdef CONFIG_ARCH_DISCARD_MEMBLOCK | ||
270 | |||
262 | phys_addr_t __init_memblock get_allocated_memblock_reserved_regions_info( | 271 | phys_addr_t __init_memblock get_allocated_memblock_reserved_regions_info( |
263 | phys_addr_t *addr) | 272 | phys_addr_t *addr) |
264 | { | 273 | { |
@@ -271,6 +280,20 @@ phys_addr_t __init_memblock get_allocated_memblock_reserved_regions_info( | |||
271 | memblock.reserved.max); | 280 | memblock.reserved.max); |
272 | } | 281 | } |
273 | 282 | ||
283 | phys_addr_t __init_memblock get_allocated_memblock_memory_regions_info( | ||
284 | phys_addr_t *addr) | ||
285 | { | ||
286 | if (memblock.memory.regions == memblock_memory_init_regions) | ||
287 | return 0; | ||
288 | |||
289 | *addr = __pa(memblock.memory.regions); | ||
290 | |||
291 | return PAGE_ALIGN(sizeof(struct memblock_region) * | ||
292 | memblock.memory.max); | ||
293 | } | ||
294 | |||
295 | #endif | ||
296 | |||
274 | /** | 297 | /** |
275 | * memblock_double_array - double the size of the memblock regions array | 298 | * memblock_double_array - double the size of the memblock regions array |
276 | * @type: memblock type of the regions array being doubled | 299 | * @type: memblock type of the regions array being doubled |
@@ -405,7 +428,8 @@ static void __init_memblock memblock_merge_regions(struct memblock_type *type) | |||
405 | 428 | ||
406 | if (this->base + this->size != next->base || | 429 | if (this->base + this->size != next->base || |
407 | memblock_get_region_node(this) != | 430 | memblock_get_region_node(this) != |
408 | memblock_get_region_node(next)) { | 431 | memblock_get_region_node(next) || |
432 | this->flags != next->flags) { | ||
409 | BUG_ON(this->base + this->size > next->base); | 433 | BUG_ON(this->base + this->size > next->base); |
410 | i++; | 434 | i++; |
411 | continue; | 435 | continue; |
@@ -425,13 +449,15 @@ static void __init_memblock memblock_merge_regions(struct memblock_type *type) | |||
425 | * @base: base address of the new region | 449 | * @base: base address of the new region |
426 | * @size: size of the new region | 450 | * @size: size of the new region |
427 | * @nid: node id of the new region | 451 | * @nid: node id of the new region |
452 | * @flags: flags of the new region | ||
428 | * | 453 | * |
429 | * Insert new memblock region [@base,@base+@size) into @type at @idx. | 454 | * Insert new memblock region [@base,@base+@size) into @type at @idx. |
430 | * @type must already have extra room to accomodate the new region. | 455 | * @type must already have extra room to accomodate the new region. |
431 | */ | 456 | */ |
432 | static void __init_memblock memblock_insert_region(struct memblock_type *type, | 457 | static void __init_memblock memblock_insert_region(struct memblock_type *type, |
433 | int idx, phys_addr_t base, | 458 | int idx, phys_addr_t base, |
434 | phys_addr_t size, int nid) | 459 | phys_addr_t size, |
460 | int nid, unsigned long flags) | ||
435 | { | 461 | { |
436 | struct memblock_region *rgn = &type->regions[idx]; | 462 | struct memblock_region *rgn = &type->regions[idx]; |
437 | 463 | ||
@@ -439,6 +465,7 @@ static void __init_memblock memblock_insert_region(struct memblock_type *type, | |||
439 | memmove(rgn + 1, rgn, (type->cnt - idx) * sizeof(*rgn)); | 465 | memmove(rgn + 1, rgn, (type->cnt - idx) * sizeof(*rgn)); |
440 | rgn->base = base; | 466 | rgn->base = base; |
441 | rgn->size = size; | 467 | rgn->size = size; |
468 | rgn->flags = flags; | ||
442 | memblock_set_region_node(rgn, nid); | 469 | memblock_set_region_node(rgn, nid); |
443 | type->cnt++; | 470 | type->cnt++; |
444 | type->total_size += size; | 471 | type->total_size += size; |
@@ -450,6 +477,7 @@ static void __init_memblock memblock_insert_region(struct memblock_type *type, | |||
450 | * @base: base address of the new region | 477 | * @base: base address of the new region |
451 | * @size: size of the new region | 478 | * @size: size of the new region |
452 | * @nid: nid of the new region | 479 | * @nid: nid of the new region |
480 | * @flags: flags of the new region | ||
453 | * | 481 | * |
454 | * Add new memblock region [@base,@base+@size) into @type. The new region | 482 | * Add new memblock region [@base,@base+@size) into @type. The new region |
455 | * is allowed to overlap with existing ones - overlaps don't affect already | 483 | * is allowed to overlap with existing ones - overlaps don't affect already |
@@ -460,7 +488,8 @@ static void __init_memblock memblock_insert_region(struct memblock_type *type, | |||
460 | * 0 on success, -errno on failure. | 488 | * 0 on success, -errno on failure. |
461 | */ | 489 | */ |
462 | static int __init_memblock memblock_add_region(struct memblock_type *type, | 490 | static int __init_memblock memblock_add_region(struct memblock_type *type, |
463 | phys_addr_t base, phys_addr_t size, int nid) | 491 | phys_addr_t base, phys_addr_t size, |
492 | int nid, unsigned long flags) | ||
464 | { | 493 | { |
465 | bool insert = false; | 494 | bool insert = false; |
466 | phys_addr_t obase = base; | 495 | phys_addr_t obase = base; |
@@ -475,6 +504,7 @@ static int __init_memblock memblock_add_region(struct memblock_type *type, | |||
475 | WARN_ON(type->cnt != 1 || type->total_size); | 504 | WARN_ON(type->cnt != 1 || type->total_size); |
476 | type->regions[0].base = base; | 505 | type->regions[0].base = base; |
477 | type->regions[0].size = size; | 506 | type->regions[0].size = size; |
507 | type->regions[0].flags = flags; | ||
478 | memblock_set_region_node(&type->regions[0], nid); | 508 | memblock_set_region_node(&type->regions[0], nid); |
479 | type->total_size = size; | 509 | type->total_size = size; |
480 | return 0; | 510 | return 0; |
@@ -505,7 +535,8 @@ repeat: | |||
505 | nr_new++; | 535 | nr_new++; |
506 | if (insert) | 536 | if (insert) |
507 | memblock_insert_region(type, i++, base, | 537 | memblock_insert_region(type, i++, base, |
508 | rbase - base, nid); | 538 | rbase - base, nid, |
539 | flags); | ||
509 | } | 540 | } |
510 | /* area below @rend is dealt with, forget about it */ | 541 | /* area below @rend is dealt with, forget about it */ |
511 | base = min(rend, end); | 542 | base = min(rend, end); |
@@ -515,7 +546,8 @@ repeat: | |||
515 | if (base < end) { | 546 | if (base < end) { |
516 | nr_new++; | 547 | nr_new++; |
517 | if (insert) | 548 | if (insert) |
518 | memblock_insert_region(type, i, base, end - base, nid); | 549 | memblock_insert_region(type, i, base, end - base, |
550 | nid, flags); | ||
519 | } | 551 | } |
520 | 552 | ||
521 | /* | 553 | /* |
@@ -537,12 +569,13 @@ repeat: | |||
537 | int __init_memblock memblock_add_node(phys_addr_t base, phys_addr_t size, | 569 | int __init_memblock memblock_add_node(phys_addr_t base, phys_addr_t size, |
538 | int nid) | 570 | int nid) |
539 | { | 571 | { |
540 | return memblock_add_region(&memblock.memory, base, size, nid); | 572 | return memblock_add_region(&memblock.memory, base, size, nid, 0); |
541 | } | 573 | } |
542 | 574 | ||
543 | int __init_memblock memblock_add(phys_addr_t base, phys_addr_t size) | 575 | int __init_memblock memblock_add(phys_addr_t base, phys_addr_t size) |
544 | { | 576 | { |
545 | return memblock_add_region(&memblock.memory, base, size, MAX_NUMNODES); | 577 | return memblock_add_region(&memblock.memory, base, size, |
578 | MAX_NUMNODES, 0); | ||
546 | } | 579 | } |
547 | 580 | ||
548 | /** | 581 | /** |
@@ -597,7 +630,8 @@ static int __init_memblock memblock_isolate_range(struct memblock_type *type, | |||
597 | rgn->size -= base - rbase; | 630 | rgn->size -= base - rbase; |
598 | type->total_size -= base - rbase; | 631 | type->total_size -= base - rbase; |
599 | memblock_insert_region(type, i, rbase, base - rbase, | 632 | memblock_insert_region(type, i, rbase, base - rbase, |
600 | memblock_get_region_node(rgn)); | 633 | memblock_get_region_node(rgn), |
634 | rgn->flags); | ||
601 | } else if (rend > end) { | 635 | } else if (rend > end) { |
602 | /* | 636 | /* |
603 | * @rgn intersects from above. Split and redo the | 637 | * @rgn intersects from above. Split and redo the |
@@ -607,7 +641,8 @@ static int __init_memblock memblock_isolate_range(struct memblock_type *type, | |||
607 | rgn->size -= end - rbase; | 641 | rgn->size -= end - rbase; |
608 | type->total_size -= end - rbase; | 642 | type->total_size -= end - rbase; |
609 | memblock_insert_region(type, i--, rbase, end - rbase, | 643 | memblock_insert_region(type, i--, rbase, end - rbase, |
610 | memblock_get_region_node(rgn)); | 644 | memblock_get_region_node(rgn), |
645 | rgn->flags); | ||
611 | } else { | 646 | } else { |
612 | /* @rgn is fully contained, record it */ | 647 | /* @rgn is fully contained, record it */ |
613 | if (!*end_rgn) | 648 | if (!*end_rgn) |
@@ -643,28 +678,89 @@ int __init_memblock memblock_free(phys_addr_t base, phys_addr_t size) | |||
643 | { | 678 | { |
644 | memblock_dbg(" memblock_free: [%#016llx-%#016llx] %pF\n", | 679 | memblock_dbg(" memblock_free: [%#016llx-%#016llx] %pF\n", |
645 | (unsigned long long)base, | 680 | (unsigned long long)base, |
646 | (unsigned long long)base + size, | 681 | (unsigned long long)base + size - 1, |
647 | (void *)_RET_IP_); | 682 | (void *)_RET_IP_); |
648 | 683 | ||
649 | return __memblock_remove(&memblock.reserved, base, size); | 684 | return __memblock_remove(&memblock.reserved, base, size); |
650 | } | 685 | } |
651 | 686 | ||
652 | int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size) | 687 | static int __init_memblock memblock_reserve_region(phys_addr_t base, |
688 | phys_addr_t size, | ||
689 | int nid, | ||
690 | unsigned long flags) | ||
653 | { | 691 | { |
654 | struct memblock_type *_rgn = &memblock.reserved; | 692 | struct memblock_type *_rgn = &memblock.reserved; |
655 | 693 | ||
656 | memblock_dbg("memblock_reserve: [%#016llx-%#016llx] %pF\n", | 694 | memblock_dbg("memblock_reserve: [%#016llx-%#016llx] flags %#02lx %pF\n", |
657 | (unsigned long long)base, | 695 | (unsigned long long)base, |
658 | (unsigned long long)base + size, | 696 | (unsigned long long)base + size - 1, |
659 | (void *)_RET_IP_); | 697 | flags, (void *)_RET_IP_); |
698 | |||
699 | return memblock_add_region(_rgn, base, size, nid, flags); | ||
700 | } | ||
701 | |||
702 | int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size) | ||
703 | { | ||
704 | return memblock_reserve_region(base, size, MAX_NUMNODES, 0); | ||
705 | } | ||
706 | |||
707 | /** | ||
708 | * memblock_mark_hotplug - Mark hotpluggable memory with flag MEMBLOCK_HOTPLUG. | ||
709 | * @base: the base phys addr of the region | ||
710 | * @size: the size of the region | ||
711 | * | ||
712 | * This function isolates region [@base, @base + @size), and mark it with flag | ||
713 | * MEMBLOCK_HOTPLUG. | ||
714 | * | ||
715 | * Return 0 on succees, -errno on failure. | ||
716 | */ | ||
717 | int __init_memblock memblock_mark_hotplug(phys_addr_t base, phys_addr_t size) | ||
718 | { | ||
719 | struct memblock_type *type = &memblock.memory; | ||
720 | int i, ret, start_rgn, end_rgn; | ||
721 | |||
722 | ret = memblock_isolate_range(type, base, size, &start_rgn, &end_rgn); | ||
723 | if (ret) | ||
724 | return ret; | ||
725 | |||
726 | for (i = start_rgn; i < end_rgn; i++) | ||
727 | memblock_set_region_flags(&type->regions[i], MEMBLOCK_HOTPLUG); | ||
660 | 728 | ||
661 | return memblock_add_region(_rgn, base, size, MAX_NUMNODES); | 729 | memblock_merge_regions(type); |
730 | return 0; | ||
731 | } | ||
732 | |||
733 | /** | ||
734 | * memblock_clear_hotplug - Clear flag MEMBLOCK_HOTPLUG for a specified region. | ||
735 | * @base: the base phys addr of the region | ||
736 | * @size: the size of the region | ||
737 | * | ||
738 | * This function isolates region [@base, @base + @size), and clear flag | ||
739 | * MEMBLOCK_HOTPLUG for the isolated regions. | ||
740 | * | ||
741 | * Return 0 on succees, -errno on failure. | ||
742 | */ | ||
743 | int __init_memblock memblock_clear_hotplug(phys_addr_t base, phys_addr_t size) | ||
744 | { | ||
745 | struct memblock_type *type = &memblock.memory; | ||
746 | int i, ret, start_rgn, end_rgn; | ||
747 | |||
748 | ret = memblock_isolate_range(type, base, size, &start_rgn, &end_rgn); | ||
749 | if (ret) | ||
750 | return ret; | ||
751 | |||
752 | for (i = start_rgn; i < end_rgn; i++) | ||
753 | memblock_clear_region_flags(&type->regions[i], | ||
754 | MEMBLOCK_HOTPLUG); | ||
755 | |||
756 | memblock_merge_regions(type); | ||
757 | return 0; | ||
662 | } | 758 | } |
663 | 759 | ||
664 | /** | 760 | /** |
665 | * __next_free_mem_range - next function for for_each_free_mem_range() | 761 | * __next_free_mem_range - next function for for_each_free_mem_range() |
666 | * @idx: pointer to u64 loop variable | 762 | * @idx: pointer to u64 loop variable |
667 | * @nid: node selector, %MAX_NUMNODES for all nodes | 763 | * @nid: node selector, %NUMA_NO_NODE for all nodes |
668 | * @out_start: ptr to phys_addr_t for start address of the range, can be %NULL | 764 | * @out_start: ptr to phys_addr_t for start address of the range, can be %NULL |
669 | * @out_end: ptr to phys_addr_t for end address of the range, can be %NULL | 765 | * @out_end: ptr to phys_addr_t for end address of the range, can be %NULL |
670 | * @out_nid: ptr to int for nid of the range, can be %NULL | 766 | * @out_nid: ptr to int for nid of the range, can be %NULL |
@@ -693,13 +789,16 @@ void __init_memblock __next_free_mem_range(u64 *idx, int nid, | |||
693 | int mi = *idx & 0xffffffff; | 789 | int mi = *idx & 0xffffffff; |
694 | int ri = *idx >> 32; | 790 | int ri = *idx >> 32; |
695 | 791 | ||
792 | if (WARN_ONCE(nid == MAX_NUMNODES, "Usage of MAX_NUMNODES is deprecated. Use NUMA_NO_NODE instead\n")) | ||
793 | nid = NUMA_NO_NODE; | ||
794 | |||
696 | for ( ; mi < mem->cnt; mi++) { | 795 | for ( ; mi < mem->cnt; mi++) { |
697 | struct memblock_region *m = &mem->regions[mi]; | 796 | struct memblock_region *m = &mem->regions[mi]; |
698 | phys_addr_t m_start = m->base; | 797 | phys_addr_t m_start = m->base; |
699 | phys_addr_t m_end = m->base + m->size; | 798 | phys_addr_t m_end = m->base + m->size; |
700 | 799 | ||
701 | /* only memory regions are associated with nodes, check it */ | 800 | /* only memory regions are associated with nodes, check it */ |
702 | if (nid != MAX_NUMNODES && nid != memblock_get_region_node(m)) | 801 | if (nid != NUMA_NO_NODE && nid != memblock_get_region_node(m)) |
703 | continue; | 802 | continue; |
704 | 803 | ||
705 | /* scan areas before each reservation for intersection */ | 804 | /* scan areas before each reservation for intersection */ |
@@ -740,12 +839,17 @@ void __init_memblock __next_free_mem_range(u64 *idx, int nid, | |||
740 | /** | 839 | /** |
741 | * __next_free_mem_range_rev - next function for for_each_free_mem_range_reverse() | 840 | * __next_free_mem_range_rev - next function for for_each_free_mem_range_reverse() |
742 | * @idx: pointer to u64 loop variable | 841 | * @idx: pointer to u64 loop variable |
743 | * @nid: nid: node selector, %MAX_NUMNODES for all nodes | 842 | * @nid: nid: node selector, %NUMA_NO_NODE for all nodes |
744 | * @out_start: ptr to phys_addr_t for start address of the range, can be %NULL | 843 | * @out_start: ptr to phys_addr_t for start address of the range, can be %NULL |
745 | * @out_end: ptr to phys_addr_t for end address of the range, can be %NULL | 844 | * @out_end: ptr to phys_addr_t for end address of the range, can be %NULL |
746 | * @out_nid: ptr to int for nid of the range, can be %NULL | 845 | * @out_nid: ptr to int for nid of the range, can be %NULL |
747 | * | 846 | * |
748 | * Reverse of __next_free_mem_range(). | 847 | * Reverse of __next_free_mem_range(). |
848 | * | ||
849 | * Linux kernel cannot migrate pages used by itself. Memory hotplug users won't | ||
850 | * be able to hot-remove hotpluggable memory used by the kernel. So this | ||
851 | * function skip hotpluggable regions if needed when allocating memory for the | ||
852 | * kernel. | ||
749 | */ | 853 | */ |
750 | void __init_memblock __next_free_mem_range_rev(u64 *idx, int nid, | 854 | void __init_memblock __next_free_mem_range_rev(u64 *idx, int nid, |
751 | phys_addr_t *out_start, | 855 | phys_addr_t *out_start, |
@@ -756,6 +860,9 @@ void __init_memblock __next_free_mem_range_rev(u64 *idx, int nid, | |||
756 | int mi = *idx & 0xffffffff; | 860 | int mi = *idx & 0xffffffff; |
757 | int ri = *idx >> 32; | 861 | int ri = *idx >> 32; |
758 | 862 | ||
863 | if (WARN_ONCE(nid == MAX_NUMNODES, "Usage of MAX_NUMNODES is deprecated. Use NUMA_NO_NODE instead\n")) | ||
864 | nid = NUMA_NO_NODE; | ||
865 | |||
759 | if (*idx == (u64)ULLONG_MAX) { | 866 | if (*idx == (u64)ULLONG_MAX) { |
760 | mi = mem->cnt - 1; | 867 | mi = mem->cnt - 1; |
761 | ri = rsv->cnt; | 868 | ri = rsv->cnt; |
@@ -767,7 +874,11 @@ void __init_memblock __next_free_mem_range_rev(u64 *idx, int nid, | |||
767 | phys_addr_t m_end = m->base + m->size; | 874 | phys_addr_t m_end = m->base + m->size; |
768 | 875 | ||
769 | /* only memory regions are associated with nodes, check it */ | 876 | /* only memory regions are associated with nodes, check it */ |
770 | if (nid != MAX_NUMNODES && nid != memblock_get_region_node(m)) | 877 | if (nid != NUMA_NO_NODE && nid != memblock_get_region_node(m)) |
878 | continue; | ||
879 | |||
880 | /* skip hotpluggable memory regions if needed */ | ||
881 | if (movable_node_is_enabled() && memblock_is_hotpluggable(m)) | ||
771 | continue; | 882 | continue; |
772 | 883 | ||
773 | /* scan areas before each reservation for intersection */ | 884 | /* scan areas before each reservation for intersection */ |
@@ -837,18 +948,18 @@ void __init_memblock __next_mem_pfn_range(int *idx, int nid, | |||
837 | * memblock_set_node - set node ID on memblock regions | 948 | * memblock_set_node - set node ID on memblock regions |
838 | * @base: base of area to set node ID for | 949 | * @base: base of area to set node ID for |
839 | * @size: size of area to set node ID for | 950 | * @size: size of area to set node ID for |
951 | * @type: memblock type to set node ID for | ||
840 | * @nid: node ID to set | 952 | * @nid: node ID to set |
841 | * | 953 | * |
842 | * Set the nid of memblock memory regions in [@base,@base+@size) to @nid. | 954 | * Set the nid of memblock @type regions in [@base,@base+@size) to @nid. |
843 | * Regions which cross the area boundaries are split as necessary. | 955 | * Regions which cross the area boundaries are split as necessary. |
844 | * | 956 | * |
845 | * RETURNS: | 957 | * RETURNS: |
846 | * 0 on success, -errno on failure. | 958 | * 0 on success, -errno on failure. |
847 | */ | 959 | */ |
848 | int __init_memblock memblock_set_node(phys_addr_t base, phys_addr_t size, | 960 | int __init_memblock memblock_set_node(phys_addr_t base, phys_addr_t size, |
849 | int nid) | 961 | struct memblock_type *type, int nid) |
850 | { | 962 | { |
851 | struct memblock_type *type = &memblock.memory; | ||
852 | int start_rgn, end_rgn; | 963 | int start_rgn, end_rgn; |
853 | int i, ret; | 964 | int i, ret; |
854 | 965 | ||
@@ -870,13 +981,10 @@ static phys_addr_t __init memblock_alloc_base_nid(phys_addr_t size, | |||
870 | { | 981 | { |
871 | phys_addr_t found; | 982 | phys_addr_t found; |
872 | 983 | ||
873 | if (WARN_ON(!align)) | 984 | if (!align) |
874 | align = __alignof__(long long); | 985 | align = SMP_CACHE_BYTES; |
875 | 986 | ||
876 | /* align @size to avoid excessive fragmentation on reserved array */ | 987 | found = memblock_find_in_range_node(size, align, 0, max_addr, nid); |
877 | size = round_up(size, align); | ||
878 | |||
879 | found = memblock_find_in_range_node(0, max_addr, size, align, nid); | ||
880 | if (found && !memblock_reserve(found, size)) | 988 | if (found && !memblock_reserve(found, size)) |
881 | return found; | 989 | return found; |
882 | 990 | ||
@@ -890,7 +998,7 @@ phys_addr_t __init memblock_alloc_nid(phys_addr_t size, phys_addr_t align, int n | |||
890 | 998 | ||
891 | phys_addr_t __init __memblock_alloc_base(phys_addr_t size, phys_addr_t align, phys_addr_t max_addr) | 999 | phys_addr_t __init __memblock_alloc_base(phys_addr_t size, phys_addr_t align, phys_addr_t max_addr) |
892 | { | 1000 | { |
893 | return memblock_alloc_base_nid(size, align, max_addr, MAX_NUMNODES); | 1001 | return memblock_alloc_base_nid(size, align, max_addr, NUMA_NO_NODE); |
894 | } | 1002 | } |
895 | 1003 | ||
896 | phys_addr_t __init memblock_alloc_base(phys_addr_t size, phys_addr_t align, phys_addr_t max_addr) | 1004 | phys_addr_t __init memblock_alloc_base(phys_addr_t size, phys_addr_t align, phys_addr_t max_addr) |
@@ -920,6 +1028,207 @@ phys_addr_t __init memblock_alloc_try_nid(phys_addr_t size, phys_addr_t align, i | |||
920 | return memblock_alloc_base(size, align, MEMBLOCK_ALLOC_ACCESSIBLE); | 1028 | return memblock_alloc_base(size, align, MEMBLOCK_ALLOC_ACCESSIBLE); |
921 | } | 1029 | } |
922 | 1030 | ||
1031 | /** | ||
1032 | * memblock_virt_alloc_internal - allocate boot memory block | ||
1033 | * @size: size of memory block to be allocated in bytes | ||
1034 | * @align: alignment of the region and block's size | ||
1035 | * @min_addr: the lower bound of the memory region to allocate (phys address) | ||
1036 | * @max_addr: the upper bound of the memory region to allocate (phys address) | ||
1037 | * @nid: nid of the free area to find, %NUMA_NO_NODE for any node | ||
1038 | * | ||
1039 | * The @min_addr limit is dropped if it can not be satisfied and the allocation | ||
1040 | * will fall back to memory below @min_addr. Also, allocation may fall back | ||
1041 | * to any node in the system if the specified node can not | ||
1042 | * hold the requested memory. | ||
1043 | * | ||
1044 | * The allocation is performed from memory region limited by | ||
1045 | * memblock.current_limit if @max_addr == %BOOTMEM_ALLOC_ACCESSIBLE. | ||
1046 | * | ||
1047 | * The memory block is aligned on SMP_CACHE_BYTES if @align == 0. | ||
1048 | * | ||
1049 | * The phys address of allocated boot memory block is converted to virtual and | ||
1050 | * allocated memory is reset to 0. | ||
1051 | * | ||
1052 | * In addition, function sets the min_count to 0 using kmemleak_alloc for | ||
1053 | * allocated boot memory block, so that it is never reported as leaks. | ||
1054 | * | ||
1055 | * RETURNS: | ||
1056 | * Virtual address of allocated memory block on success, NULL on failure. | ||
1057 | */ | ||
1058 | static void * __init memblock_virt_alloc_internal( | ||
1059 | phys_addr_t size, phys_addr_t align, | ||
1060 | phys_addr_t min_addr, phys_addr_t max_addr, | ||
1061 | int nid) | ||
1062 | { | ||
1063 | phys_addr_t alloc; | ||
1064 | void *ptr; | ||
1065 | |||
1066 | if (WARN_ONCE(nid == MAX_NUMNODES, "Usage of MAX_NUMNODES is deprecated. Use NUMA_NO_NODE instead\n")) | ||
1067 | nid = NUMA_NO_NODE; | ||
1068 | |||
1069 | /* | ||
1070 | * Detect any accidental use of these APIs after slab is ready, as at | ||
1071 | * this moment memblock may be deinitialized already and its | ||
1072 | * internal data may be destroyed (after execution of free_all_bootmem) | ||
1073 | */ | ||
1074 | if (WARN_ON_ONCE(slab_is_available())) | ||
1075 | return kzalloc_node(size, GFP_NOWAIT, nid); | ||
1076 | |||
1077 | if (!align) | ||
1078 | align = SMP_CACHE_BYTES; | ||
1079 | |||
1080 | if (max_addr > memblock.current_limit) | ||
1081 | max_addr = memblock.current_limit; | ||
1082 | |||
1083 | again: | ||
1084 | alloc = memblock_find_in_range_node(size, align, min_addr, max_addr, | ||
1085 | nid); | ||
1086 | if (alloc) | ||
1087 | goto done; | ||
1088 | |||
1089 | if (nid != NUMA_NO_NODE) { | ||
1090 | alloc = memblock_find_in_range_node(size, align, min_addr, | ||
1091 | max_addr, NUMA_NO_NODE); | ||
1092 | if (alloc) | ||
1093 | goto done; | ||
1094 | } | ||
1095 | |||
1096 | if (min_addr) { | ||
1097 | min_addr = 0; | ||
1098 | goto again; | ||
1099 | } else { | ||
1100 | goto error; | ||
1101 | } | ||
1102 | |||
1103 | done: | ||
1104 | memblock_reserve(alloc, size); | ||
1105 | ptr = phys_to_virt(alloc); | ||
1106 | memset(ptr, 0, size); | ||
1107 | |||
1108 | /* | ||
1109 | * The min_count is set to 0 so that bootmem allocated blocks | ||
1110 | * are never reported as leaks. This is because many of these blocks | ||
1111 | * are only referred via the physical address which is not | ||
1112 | * looked up by kmemleak. | ||
1113 | */ | ||
1114 | kmemleak_alloc(ptr, size, 0, 0); | ||
1115 | |||
1116 | return ptr; | ||
1117 | |||
1118 | error: | ||
1119 | return NULL; | ||
1120 | } | ||
1121 | |||
1122 | /** | ||
1123 | * memblock_virt_alloc_try_nid_nopanic - allocate boot memory block | ||
1124 | * @size: size of memory block to be allocated in bytes | ||
1125 | * @align: alignment of the region and block's size | ||
1126 | * @min_addr: the lower bound of the memory region from where the allocation | ||
1127 | * is preferred (phys address) | ||
1128 | * @max_addr: the upper bound of the memory region from where the allocation | ||
1129 | * is preferred (phys address), or %BOOTMEM_ALLOC_ACCESSIBLE to | ||
1130 | * allocate only from memory limited by memblock.current_limit value | ||
1131 | * @nid: nid of the free area to find, %NUMA_NO_NODE for any node | ||
1132 | * | ||
1133 | * Public version of _memblock_virt_alloc_try_nid_nopanic() which provides | ||
1134 | * additional debug information (including caller info), if enabled. | ||
1135 | * | ||
1136 | * RETURNS: | ||
1137 | * Virtual address of allocated memory block on success, NULL on failure. | ||
1138 | */ | ||
1139 | void * __init memblock_virt_alloc_try_nid_nopanic( | ||
1140 | phys_addr_t size, phys_addr_t align, | ||
1141 | phys_addr_t min_addr, phys_addr_t max_addr, | ||
1142 | int nid) | ||
1143 | { | ||
1144 | memblock_dbg("%s: %llu bytes align=0x%llx nid=%d from=0x%llx max_addr=0x%llx %pF\n", | ||
1145 | __func__, (u64)size, (u64)align, nid, (u64)min_addr, | ||
1146 | (u64)max_addr, (void *)_RET_IP_); | ||
1147 | return memblock_virt_alloc_internal(size, align, min_addr, | ||
1148 | max_addr, nid); | ||
1149 | } | ||
1150 | |||
1151 | /** | ||
1152 | * memblock_virt_alloc_try_nid - allocate boot memory block with panicking | ||
1153 | * @size: size of memory block to be allocated in bytes | ||
1154 | * @align: alignment of the region and block's size | ||
1155 | * @min_addr: the lower bound of the memory region from where the allocation | ||
1156 | * is preferred (phys address) | ||
1157 | * @max_addr: the upper bound of the memory region from where the allocation | ||
1158 | * is preferred (phys address), or %BOOTMEM_ALLOC_ACCESSIBLE to | ||
1159 | * allocate only from memory limited by memblock.current_limit value | ||
1160 | * @nid: nid of the free area to find, %NUMA_NO_NODE for any node | ||
1161 | * | ||
1162 | * Public panicking version of _memblock_virt_alloc_try_nid_nopanic() | ||
1163 | * which provides debug information (including caller info), if enabled, | ||
1164 | * and panics if the request can not be satisfied. | ||
1165 | * | ||
1166 | * RETURNS: | ||
1167 | * Virtual address of allocated memory block on success, NULL on failure. | ||
1168 | */ | ||
1169 | void * __init memblock_virt_alloc_try_nid( | ||
1170 | phys_addr_t size, phys_addr_t align, | ||
1171 | phys_addr_t min_addr, phys_addr_t max_addr, | ||
1172 | int nid) | ||
1173 | { | ||
1174 | void *ptr; | ||
1175 | |||
1176 | memblock_dbg("%s: %llu bytes align=0x%llx nid=%d from=0x%llx max_addr=0x%llx %pF\n", | ||
1177 | __func__, (u64)size, (u64)align, nid, (u64)min_addr, | ||
1178 | (u64)max_addr, (void *)_RET_IP_); | ||
1179 | ptr = memblock_virt_alloc_internal(size, align, | ||
1180 | min_addr, max_addr, nid); | ||
1181 | if (ptr) | ||
1182 | return ptr; | ||
1183 | |||
1184 | panic("%s: Failed to allocate %llu bytes align=0x%llx nid=%d from=0x%llx max_addr=0x%llx\n", | ||
1185 | __func__, (u64)size, (u64)align, nid, (u64)min_addr, | ||
1186 | (u64)max_addr); | ||
1187 | return NULL; | ||
1188 | } | ||
1189 | |||
1190 | /** | ||
1191 | * __memblock_free_early - free boot memory block | ||
1192 | * @base: phys starting address of the boot memory block | ||
1193 | * @size: size of the boot memory block in bytes | ||
1194 | * | ||
1195 | * Free boot memory block previously allocated by memblock_virt_alloc_xx() API. | ||
1196 | * The freeing memory will not be released to the buddy allocator. | ||
1197 | */ | ||
1198 | void __init __memblock_free_early(phys_addr_t base, phys_addr_t size) | ||
1199 | { | ||
1200 | memblock_dbg("%s: [%#016llx-%#016llx] %pF\n", | ||
1201 | __func__, (u64)base, (u64)base + size - 1, | ||
1202 | (void *)_RET_IP_); | ||
1203 | kmemleak_free_part(__va(base), size); | ||
1204 | __memblock_remove(&memblock.reserved, base, size); | ||
1205 | } | ||
1206 | |||
1207 | /* | ||
1208 | * __memblock_free_late - free bootmem block pages directly to buddy allocator | ||
1209 | * @addr: phys starting address of the boot memory block | ||
1210 | * @size: size of the boot memory block in bytes | ||
1211 | * | ||
1212 | * This is only useful when the bootmem allocator has already been torn | ||
1213 | * down, but we are still initializing the system. Pages are released directly | ||
1214 | * to the buddy allocator, no bootmem metadata is updated because it is gone. | ||
1215 | */ | ||
1216 | void __init __memblock_free_late(phys_addr_t base, phys_addr_t size) | ||
1217 | { | ||
1218 | u64 cursor, end; | ||
1219 | |||
1220 | memblock_dbg("%s: [%#016llx-%#016llx] %pF\n", | ||
1221 | __func__, (u64)base, (u64)base + size - 1, | ||
1222 | (void *)_RET_IP_); | ||
1223 | kmemleak_free_part(__va(base), size); | ||
1224 | cursor = PFN_UP(base); | ||
1225 | end = PFN_DOWN(base + size); | ||
1226 | |||
1227 | for (; cursor < end; cursor++) { | ||
1228 | __free_pages_bootmem(pfn_to_page(cursor), 0); | ||
1229 | totalram_pages++; | ||
1230 | } | ||
1231 | } | ||
923 | 1232 | ||
924 | /* | 1233 | /* |
925 | * Remaining API functions | 1234 | * Remaining API functions |
@@ -1101,6 +1410,7 @@ void __init_memblock memblock_set_current_limit(phys_addr_t limit) | |||
1101 | static void __init_memblock memblock_dump(struct memblock_type *type, char *name) | 1410 | static void __init_memblock memblock_dump(struct memblock_type *type, char *name) |
1102 | { | 1411 | { |
1103 | unsigned long long base, size; | 1412 | unsigned long long base, size; |
1413 | unsigned long flags; | ||
1104 | int i; | 1414 | int i; |
1105 | 1415 | ||
1106 | pr_info(" %s.cnt = 0x%lx\n", name, type->cnt); | 1416 | pr_info(" %s.cnt = 0x%lx\n", name, type->cnt); |
@@ -1111,13 +1421,14 @@ static void __init_memblock memblock_dump(struct memblock_type *type, char *name | |||
1111 | 1421 | ||
1112 | base = rgn->base; | 1422 | base = rgn->base; |
1113 | size = rgn->size; | 1423 | size = rgn->size; |
1424 | flags = rgn->flags; | ||
1114 | #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP | 1425 | #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP |
1115 | if (memblock_get_region_node(rgn) != MAX_NUMNODES) | 1426 | if (memblock_get_region_node(rgn) != MAX_NUMNODES) |
1116 | snprintf(nid_buf, sizeof(nid_buf), " on node %d", | 1427 | snprintf(nid_buf, sizeof(nid_buf), " on node %d", |
1117 | memblock_get_region_node(rgn)); | 1428 | memblock_get_region_node(rgn)); |
1118 | #endif | 1429 | #endif |
1119 | pr_info(" %s[%#x]\t[%#016llx-%#016llx], %#llx bytes%s\n", | 1430 | pr_info(" %s[%#x]\t[%#016llx-%#016llx], %#llx bytes%s flags: %#lx\n", |
1120 | name, i, base, base + size - 1, size, nid_buf); | 1431 | name, i, base, base + size - 1, size, nid_buf, flags); |
1121 | } | 1432 | } |
1122 | } | 1433 | } |
1123 | 1434 | ||
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 7f1a356153c0..53385cd4e6f0 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c | |||
@@ -45,16 +45,17 @@ | |||
45 | #include <linux/swapops.h> | 45 | #include <linux/swapops.h> |
46 | #include <linux/spinlock.h> | 46 | #include <linux/spinlock.h> |
47 | #include <linux/eventfd.h> | 47 | #include <linux/eventfd.h> |
48 | #include <linux/poll.h> | ||
48 | #include <linux/sort.h> | 49 | #include <linux/sort.h> |
49 | #include <linux/fs.h> | 50 | #include <linux/fs.h> |
50 | #include <linux/seq_file.h> | 51 | #include <linux/seq_file.h> |
51 | #include <linux/vmalloc.h> | ||
52 | #include <linux/vmpressure.h> | 52 | #include <linux/vmpressure.h> |
53 | #include <linux/mm_inline.h> | 53 | #include <linux/mm_inline.h> |
54 | #include <linux/page_cgroup.h> | 54 | #include <linux/page_cgroup.h> |
55 | #include <linux/cpu.h> | 55 | #include <linux/cpu.h> |
56 | #include <linux/oom.h> | 56 | #include <linux/oom.h> |
57 | #include <linux/lockdep.h> | 57 | #include <linux/lockdep.h> |
58 | #include <linux/file.h> | ||
58 | #include "internal.h" | 59 | #include "internal.h" |
59 | #include <net/sock.h> | 60 | #include <net/sock.h> |
60 | #include <net/ip.h> | 61 | #include <net/ip.h> |
@@ -148,7 +149,7 @@ struct mem_cgroup_reclaim_iter { | |||
148 | * matches memcg->dead_count of the hierarchy root group. | 149 | * matches memcg->dead_count of the hierarchy root group. |
149 | */ | 150 | */ |
150 | struct mem_cgroup *last_visited; | 151 | struct mem_cgroup *last_visited; |
151 | unsigned long last_dead_count; | 152 | int last_dead_count; |
152 | 153 | ||
153 | /* scan generation, increased every round-trip */ | 154 | /* scan generation, increased every round-trip */ |
154 | unsigned int generation; | 155 | unsigned int generation; |
@@ -227,6 +228,46 @@ struct mem_cgroup_eventfd_list { | |||
227 | struct eventfd_ctx *eventfd; | 228 | struct eventfd_ctx *eventfd; |
228 | }; | 229 | }; |
229 | 230 | ||
231 | /* | ||
232 | * cgroup_event represents events which userspace want to receive. | ||
233 | */ | ||
234 | struct mem_cgroup_event { | ||
235 | /* | ||
236 | * memcg which the event belongs to. | ||
237 | */ | ||
238 | struct mem_cgroup *memcg; | ||
239 | /* | ||
240 | * eventfd to signal userspace about the event. | ||
241 | */ | ||
242 | struct eventfd_ctx *eventfd; | ||
243 | /* | ||
244 | * Each of these stored in a list by the cgroup. | ||
245 | */ | ||
246 | struct list_head list; | ||
247 | /* | ||
248 | * register_event() callback will be used to add new userspace | ||
249 | * waiter for changes related to this event. Use eventfd_signal() | ||
250 | * on eventfd to send notification to userspace. | ||
251 | */ | ||
252 | int (*register_event)(struct mem_cgroup *memcg, | ||
253 | struct eventfd_ctx *eventfd, const char *args); | ||
254 | /* | ||
255 | * unregister_event() callback will be called when userspace closes | ||
256 | * the eventfd or on cgroup removing. This callback must be set, | ||
257 | * if you want provide notification functionality. | ||
258 | */ | ||
259 | void (*unregister_event)(struct mem_cgroup *memcg, | ||
260 | struct eventfd_ctx *eventfd); | ||
261 | /* | ||
262 | * All fields below needed to unregister event when | ||
263 | * userspace closes eventfd. | ||
264 | */ | ||
265 | poll_table pt; | ||
266 | wait_queue_head_t *wqh; | ||
267 | wait_queue_t wait; | ||
268 | struct work_struct remove; | ||
269 | }; | ||
270 | |||
230 | static void mem_cgroup_threshold(struct mem_cgroup *memcg); | 271 | static void mem_cgroup_threshold(struct mem_cgroup *memcg); |
231 | static void mem_cgroup_oom_notify(struct mem_cgroup *memcg); | 272 | static void mem_cgroup_oom_notify(struct mem_cgroup *memcg); |
232 | 273 | ||
@@ -331,27 +372,20 @@ struct mem_cgroup { | |||
331 | atomic_t numainfo_updating; | 372 | atomic_t numainfo_updating; |
332 | #endif | 373 | #endif |
333 | 374 | ||
375 | /* List of events which userspace want to receive */ | ||
376 | struct list_head event_list; | ||
377 | spinlock_t event_list_lock; | ||
378 | |||
334 | struct mem_cgroup_per_node *nodeinfo[0]; | 379 | struct mem_cgroup_per_node *nodeinfo[0]; |
335 | /* WARNING: nodeinfo must be the last member here */ | 380 | /* WARNING: nodeinfo must be the last member here */ |
336 | }; | 381 | }; |
337 | 382 | ||
338 | static size_t memcg_size(void) | ||
339 | { | ||
340 | return sizeof(struct mem_cgroup) + | ||
341 | nr_node_ids * sizeof(struct mem_cgroup_per_node *); | ||
342 | } | ||
343 | |||
344 | /* internal only representation about the status of kmem accounting. */ | 383 | /* internal only representation about the status of kmem accounting. */ |
345 | enum { | 384 | enum { |
346 | KMEM_ACCOUNTED_ACTIVE = 0, /* accounted by this cgroup itself */ | 385 | KMEM_ACCOUNTED_ACTIVE, /* accounted by this cgroup itself */ |
347 | KMEM_ACCOUNTED_ACTIVATED, /* static key enabled. */ | ||
348 | KMEM_ACCOUNTED_DEAD, /* dead memcg with pending kmem charges */ | 386 | KMEM_ACCOUNTED_DEAD, /* dead memcg with pending kmem charges */ |
349 | }; | 387 | }; |
350 | 388 | ||
351 | /* We account when limit is on, but only after call sites are patched */ | ||
352 | #define KMEM_ACCOUNTED_MASK \ | ||
353 | ((1 << KMEM_ACCOUNTED_ACTIVE) | (1 << KMEM_ACCOUNTED_ACTIVATED)) | ||
354 | |||
355 | #ifdef CONFIG_MEMCG_KMEM | 389 | #ifdef CONFIG_MEMCG_KMEM |
356 | static inline void memcg_kmem_set_active(struct mem_cgroup *memcg) | 390 | static inline void memcg_kmem_set_active(struct mem_cgroup *memcg) |
357 | { | 391 | { |
@@ -363,16 +397,6 @@ static bool memcg_kmem_is_active(struct mem_cgroup *memcg) | |||
363 | return test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags); | 397 | return test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags); |
364 | } | 398 | } |
365 | 399 | ||
366 | static void memcg_kmem_set_activated(struct mem_cgroup *memcg) | ||
367 | { | ||
368 | set_bit(KMEM_ACCOUNTED_ACTIVATED, &memcg->kmem_account_flags); | ||
369 | } | ||
370 | |||
371 | static void memcg_kmem_clear_activated(struct mem_cgroup *memcg) | ||
372 | { | ||
373 | clear_bit(KMEM_ACCOUNTED_ACTIVATED, &memcg->kmem_account_flags); | ||
374 | } | ||
375 | |||
376 | static void memcg_kmem_mark_dead(struct mem_cgroup *memcg) | 400 | static void memcg_kmem_mark_dead(struct mem_cgroup *memcg) |
377 | { | 401 | { |
378 | /* | 402 | /* |
@@ -490,11 +514,6 @@ struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr) | |||
490 | return &container_of(vmpr, struct mem_cgroup, vmpressure)->css; | 514 | return &container_of(vmpr, struct mem_cgroup, vmpressure)->css; |
491 | } | 515 | } |
492 | 516 | ||
493 | struct vmpressure *css_to_vmpressure(struct cgroup_subsys_state *css) | ||
494 | { | ||
495 | return &mem_cgroup_from_css(css)->vmpressure; | ||
496 | } | ||
497 | |||
498 | static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg) | 517 | static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg) |
499 | { | 518 | { |
500 | return (memcg == root_mem_cgroup); | 519 | return (memcg == root_mem_cgroup); |
@@ -1098,16 +1117,22 @@ skip_node: | |||
1098 | * skipped and we should continue the tree walk. | 1117 | * skipped and we should continue the tree walk. |
1099 | * last_visited css is safe to use because it is | 1118 | * last_visited css is safe to use because it is |
1100 | * protected by css_get and the tree walk is rcu safe. | 1119 | * protected by css_get and the tree walk is rcu safe. |
1120 | * | ||
1121 | * We do not take a reference on the root of the tree walk | ||
1122 | * because we might race with the root removal when it would | ||
1123 | * be the only node in the iterated hierarchy and mem_cgroup_iter | ||
1124 | * would end up in an endless loop because it expects that at | ||
1125 | * least one valid node will be returned. Root cannot disappear | ||
1126 | * because caller of the iterator should hold it already so | ||
1127 | * skipping css reference should be safe. | ||
1101 | */ | 1128 | */ |
1102 | if (next_css) { | 1129 | if (next_css) { |
1103 | struct mem_cgroup *mem = mem_cgroup_from_css(next_css); | 1130 | if ((next_css->flags & CSS_ONLINE) && |
1131 | (next_css == &root->css || css_tryget(next_css))) | ||
1132 | return mem_cgroup_from_css(next_css); | ||
1104 | 1133 | ||
1105 | if (css_tryget(&mem->css)) | 1134 | prev_css = next_css; |
1106 | return mem; | 1135 | goto skip_node; |
1107 | else { | ||
1108 | prev_css = next_css; | ||
1109 | goto skip_node; | ||
1110 | } | ||
1111 | } | 1136 | } |
1112 | 1137 | ||
1113 | return NULL; | 1138 | return NULL; |
@@ -1141,7 +1166,15 @@ mem_cgroup_iter_load(struct mem_cgroup_reclaim_iter *iter, | |||
1141 | if (iter->last_dead_count == *sequence) { | 1166 | if (iter->last_dead_count == *sequence) { |
1142 | smp_rmb(); | 1167 | smp_rmb(); |
1143 | position = iter->last_visited; | 1168 | position = iter->last_visited; |
1144 | if (position && !css_tryget(&position->css)) | 1169 | |
1170 | /* | ||
1171 | * We cannot take a reference to root because we might race | ||
1172 | * with root removal and returning NULL would end up in | ||
1173 | * an endless loop on the iterator user level when root | ||
1174 | * would be returned all the time. | ||
1175 | */ | ||
1176 | if (position && position != root && | ||
1177 | !css_tryget(&position->css)) | ||
1145 | position = NULL; | 1178 | position = NULL; |
1146 | } | 1179 | } |
1147 | return position; | 1180 | return position; |
@@ -1150,9 +1183,11 @@ mem_cgroup_iter_load(struct mem_cgroup_reclaim_iter *iter, | |||
1150 | static void mem_cgroup_iter_update(struct mem_cgroup_reclaim_iter *iter, | 1183 | static void mem_cgroup_iter_update(struct mem_cgroup_reclaim_iter *iter, |
1151 | struct mem_cgroup *last_visited, | 1184 | struct mem_cgroup *last_visited, |
1152 | struct mem_cgroup *new_position, | 1185 | struct mem_cgroup *new_position, |
1186 | struct mem_cgroup *root, | ||
1153 | int sequence) | 1187 | int sequence) |
1154 | { | 1188 | { |
1155 | if (last_visited) | 1189 | /* root reference counting symmetric to mem_cgroup_iter_load */ |
1190 | if (last_visited && last_visited != root) | ||
1156 | css_put(&last_visited->css); | 1191 | css_put(&last_visited->css); |
1157 | /* | 1192 | /* |
1158 | * We store the sequence count from the time @last_visited was | 1193 | * We store the sequence count from the time @last_visited was |
@@ -1227,7 +1262,8 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, | |||
1227 | memcg = __mem_cgroup_iter_next(root, last_visited); | 1262 | memcg = __mem_cgroup_iter_next(root, last_visited); |
1228 | 1263 | ||
1229 | if (reclaim) { | 1264 | if (reclaim) { |
1230 | mem_cgroup_iter_update(iter, last_visited, memcg, seq); | 1265 | mem_cgroup_iter_update(iter, last_visited, memcg, root, |
1266 | seq); | ||
1231 | 1267 | ||
1232 | if (!memcg) | 1268 | if (!memcg) |
1233 | iter->generation++; | 1269 | iter->generation++; |
@@ -1647,13 +1683,13 @@ static void move_unlock_mem_cgroup(struct mem_cgroup *memcg, | |||
1647 | */ | 1683 | */ |
1648 | void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) | 1684 | void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) |
1649 | { | 1685 | { |
1650 | struct cgroup *task_cgrp; | ||
1651 | struct cgroup *mem_cgrp; | ||
1652 | /* | 1686 | /* |
1653 | * Need a buffer in BSS, can't rely on allocations. The code relies | 1687 | * protects memcg_name and makes sure that parallel ooms do not |
1654 | * on the assumption that OOM is serialized for memory controller. | 1688 | * interleave |
1655 | * If this assumption is broken, revisit this code. | ||
1656 | */ | 1689 | */ |
1690 | static DEFINE_SPINLOCK(oom_info_lock); | ||
1691 | struct cgroup *task_cgrp; | ||
1692 | struct cgroup *mem_cgrp; | ||
1657 | static char memcg_name[PATH_MAX]; | 1693 | static char memcg_name[PATH_MAX]; |
1658 | int ret; | 1694 | int ret; |
1659 | struct mem_cgroup *iter; | 1695 | struct mem_cgroup *iter; |
@@ -1662,6 +1698,7 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) | |||
1662 | if (!p) | 1698 | if (!p) |
1663 | return; | 1699 | return; |
1664 | 1700 | ||
1701 | spin_lock(&oom_info_lock); | ||
1665 | rcu_read_lock(); | 1702 | rcu_read_lock(); |
1666 | 1703 | ||
1667 | mem_cgrp = memcg->css.cgroup; | 1704 | mem_cgrp = memcg->css.cgroup; |
@@ -1730,6 +1767,7 @@ done: | |||
1730 | 1767 | ||
1731 | pr_cont("\n"); | 1768 | pr_cont("\n"); |
1732 | } | 1769 | } |
1770 | spin_unlock(&oom_info_lock); | ||
1733 | } | 1771 | } |
1734 | 1772 | ||
1735 | /* | 1773 | /* |
@@ -1822,13 +1860,18 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, | |||
1822 | break; | 1860 | break; |
1823 | }; | 1861 | }; |
1824 | points = oom_badness(task, memcg, NULL, totalpages); | 1862 | points = oom_badness(task, memcg, NULL, totalpages); |
1825 | if (points > chosen_points) { | 1863 | if (!points || points < chosen_points) |
1826 | if (chosen) | 1864 | continue; |
1827 | put_task_struct(chosen); | 1865 | /* Prefer thread group leaders for display purposes */ |
1828 | chosen = task; | 1866 | if (points == chosen_points && |
1829 | chosen_points = points; | 1867 | thread_group_leader(chosen)) |
1830 | get_task_struct(chosen); | 1868 | continue; |
1831 | } | 1869 | |
1870 | if (chosen) | ||
1871 | put_task_struct(chosen); | ||
1872 | chosen = task; | ||
1873 | chosen_points = points; | ||
1874 | get_task_struct(chosen); | ||
1832 | } | 1875 | } |
1833 | css_task_iter_end(&it); | 1876 | css_task_iter_end(&it); |
1834 | } | 1877 | } |
@@ -2861,7 +2904,7 @@ struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page) | |||
2861 | unsigned short id; | 2904 | unsigned short id; |
2862 | swp_entry_t ent; | 2905 | swp_entry_t ent; |
2863 | 2906 | ||
2864 | VM_BUG_ON(!PageLocked(page)); | 2907 | VM_BUG_ON_PAGE(!PageLocked(page), page); |
2865 | 2908 | ||
2866 | pc = lookup_page_cgroup(page); | 2909 | pc = lookup_page_cgroup(page); |
2867 | lock_page_cgroup(pc); | 2910 | lock_page_cgroup(pc); |
@@ -2895,7 +2938,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg, | |||
2895 | bool anon; | 2938 | bool anon; |
2896 | 2939 | ||
2897 | lock_page_cgroup(pc); | 2940 | lock_page_cgroup(pc); |
2898 | VM_BUG_ON(PageCgroupUsed(pc)); | 2941 | VM_BUG_ON_PAGE(PageCgroupUsed(pc), page); |
2899 | /* | 2942 | /* |
2900 | * we don't need page_cgroup_lock about tail pages, becase they are not | 2943 | * we don't need page_cgroup_lock about tail pages, becase they are not |
2901 | * accessed by any other context at this point. | 2944 | * accessed by any other context at this point. |
@@ -2930,7 +2973,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg, | |||
2930 | if (lrucare) { | 2973 | if (lrucare) { |
2931 | if (was_on_lru) { | 2974 | if (was_on_lru) { |
2932 | lruvec = mem_cgroup_zone_lruvec(zone, pc->mem_cgroup); | 2975 | lruvec = mem_cgroup_zone_lruvec(zone, pc->mem_cgroup); |
2933 | VM_BUG_ON(PageLRU(page)); | 2976 | VM_BUG_ON_PAGE(PageLRU(page), page); |
2934 | SetPageLRU(page); | 2977 | SetPageLRU(page); |
2935 | add_page_to_lru_list(page, lruvec, page_lru(page)); | 2978 | add_page_to_lru_list(page, lruvec, page_lru(page)); |
2936 | } | 2979 | } |
@@ -2956,10 +2999,12 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg, | |||
2956 | static DEFINE_MUTEX(set_limit_mutex); | 2999 | static DEFINE_MUTEX(set_limit_mutex); |
2957 | 3000 | ||
2958 | #ifdef CONFIG_MEMCG_KMEM | 3001 | #ifdef CONFIG_MEMCG_KMEM |
3002 | static DEFINE_MUTEX(activate_kmem_mutex); | ||
3003 | |||
2959 | static inline bool memcg_can_account_kmem(struct mem_cgroup *memcg) | 3004 | static inline bool memcg_can_account_kmem(struct mem_cgroup *memcg) |
2960 | { | 3005 | { |
2961 | return !mem_cgroup_disabled() && !mem_cgroup_is_root(memcg) && | 3006 | return !mem_cgroup_disabled() && !mem_cgroup_is_root(memcg) && |
2962 | (memcg->kmem_account_flags & KMEM_ACCOUNTED_MASK); | 3007 | memcg_kmem_is_active(memcg); |
2963 | } | 3008 | } |
2964 | 3009 | ||
2965 | /* | 3010 | /* |
@@ -2976,10 +3021,9 @@ static struct kmem_cache *memcg_params_to_cache(struct memcg_cache_params *p) | |||
2976 | } | 3021 | } |
2977 | 3022 | ||
2978 | #ifdef CONFIG_SLABINFO | 3023 | #ifdef CONFIG_SLABINFO |
2979 | static int mem_cgroup_slabinfo_read(struct cgroup_subsys_state *css, | 3024 | static int mem_cgroup_slabinfo_read(struct seq_file *m, void *v) |
2980 | struct cftype *cft, struct seq_file *m) | ||
2981 | { | 3025 | { |
2982 | struct mem_cgroup *memcg = mem_cgroup_from_css(css); | 3026 | struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); |
2983 | struct memcg_cache_params *params; | 3027 | struct memcg_cache_params *params; |
2984 | 3028 | ||
2985 | if (!memcg_can_account_kmem(memcg)) | 3029 | if (!memcg_can_account_kmem(memcg)) |
@@ -3059,16 +3103,6 @@ static void memcg_uncharge_kmem(struct mem_cgroup *memcg, u64 size) | |||
3059 | css_put(&memcg->css); | 3103 | css_put(&memcg->css); |
3060 | } | 3104 | } |
3061 | 3105 | ||
3062 | void memcg_cache_list_add(struct mem_cgroup *memcg, struct kmem_cache *cachep) | ||
3063 | { | ||
3064 | if (!memcg) | ||
3065 | return; | ||
3066 | |||
3067 | mutex_lock(&memcg->slab_caches_mutex); | ||
3068 | list_add(&cachep->memcg_params->list, &memcg->memcg_slab_caches); | ||
3069 | mutex_unlock(&memcg->slab_caches_mutex); | ||
3070 | } | ||
3071 | |||
3072 | /* | 3106 | /* |
3073 | * helper for acessing a memcg's index. It will be used as an index in the | 3107 | * helper for acessing a memcg's index. It will be used as an index in the |
3074 | * child cache array in kmem_cache, and also to derive its name. This function | 3108 | * child cache array in kmem_cache, and also to derive its name. This function |
@@ -3079,43 +3113,6 @@ int memcg_cache_id(struct mem_cgroup *memcg) | |||
3079 | return memcg ? memcg->kmemcg_id : -1; | 3113 | return memcg ? memcg->kmemcg_id : -1; |
3080 | } | 3114 | } |
3081 | 3115 | ||
3082 | /* | ||
3083 | * This ends up being protected by the set_limit mutex, during normal | ||
3084 | * operation, because that is its main call site. | ||
3085 | * | ||
3086 | * But when we create a new cache, we can call this as well if its parent | ||
3087 | * is kmem-limited. That will have to hold set_limit_mutex as well. | ||
3088 | */ | ||
3089 | int memcg_update_cache_sizes(struct mem_cgroup *memcg) | ||
3090 | { | ||
3091 | int num, ret; | ||
3092 | |||
3093 | num = ida_simple_get(&kmem_limited_groups, | ||
3094 | 0, MEMCG_CACHES_MAX_SIZE, GFP_KERNEL); | ||
3095 | if (num < 0) | ||
3096 | return num; | ||
3097 | /* | ||
3098 | * After this point, kmem_accounted (that we test atomically in | ||
3099 | * the beginning of this conditional), is no longer 0. This | ||
3100 | * guarantees only one process will set the following boolean | ||
3101 | * to true. We don't need test_and_set because we're protected | ||
3102 | * by the set_limit_mutex anyway. | ||
3103 | */ | ||
3104 | memcg_kmem_set_activated(memcg); | ||
3105 | |||
3106 | ret = memcg_update_all_caches(num+1); | ||
3107 | if (ret) { | ||
3108 | ida_simple_remove(&kmem_limited_groups, num); | ||
3109 | memcg_kmem_clear_activated(memcg); | ||
3110 | return ret; | ||
3111 | } | ||
3112 | |||
3113 | memcg->kmemcg_id = num; | ||
3114 | INIT_LIST_HEAD(&memcg->memcg_slab_caches); | ||
3115 | mutex_init(&memcg->slab_caches_mutex); | ||
3116 | return 0; | ||
3117 | } | ||
3118 | |||
3119 | static size_t memcg_caches_array_size(int num_groups) | 3116 | static size_t memcg_caches_array_size(int num_groups) |
3120 | { | 3117 | { |
3121 | ssize_t size; | 3118 | ssize_t size; |
@@ -3152,18 +3149,17 @@ int memcg_update_cache_size(struct kmem_cache *s, int num_groups) | |||
3152 | 3149 | ||
3153 | if (num_groups > memcg_limited_groups_array_size) { | 3150 | if (num_groups > memcg_limited_groups_array_size) { |
3154 | int i; | 3151 | int i; |
3152 | struct memcg_cache_params *new_params; | ||
3155 | ssize_t size = memcg_caches_array_size(num_groups); | 3153 | ssize_t size = memcg_caches_array_size(num_groups); |
3156 | 3154 | ||
3157 | size *= sizeof(void *); | 3155 | size *= sizeof(void *); |
3158 | size += offsetof(struct memcg_cache_params, memcg_caches); | 3156 | size += offsetof(struct memcg_cache_params, memcg_caches); |
3159 | 3157 | ||
3160 | s->memcg_params = kzalloc(size, GFP_KERNEL); | 3158 | new_params = kzalloc(size, GFP_KERNEL); |
3161 | if (!s->memcg_params) { | 3159 | if (!new_params) |
3162 | s->memcg_params = cur_params; | ||
3163 | return -ENOMEM; | 3160 | return -ENOMEM; |
3164 | } | ||
3165 | 3161 | ||
3166 | s->memcg_params->is_root_cache = true; | 3162 | new_params->is_root_cache = true; |
3167 | 3163 | ||
3168 | /* | 3164 | /* |
3169 | * There is the chance it will be bigger than | 3165 | * There is the chance it will be bigger than |
@@ -3177,7 +3173,7 @@ int memcg_update_cache_size(struct kmem_cache *s, int num_groups) | |||
3177 | for (i = 0; i < memcg_limited_groups_array_size; i++) { | 3173 | for (i = 0; i < memcg_limited_groups_array_size; i++) { |
3178 | if (!cur_params->memcg_caches[i]) | 3174 | if (!cur_params->memcg_caches[i]) |
3179 | continue; | 3175 | continue; |
3180 | s->memcg_params->memcg_caches[i] = | 3176 | new_params->memcg_caches[i] = |
3181 | cur_params->memcg_caches[i]; | 3177 | cur_params->memcg_caches[i]; |
3182 | } | 3178 | } |
3183 | 3179 | ||
@@ -3190,13 +3186,15 @@ int memcg_update_cache_size(struct kmem_cache *s, int num_groups) | |||
3190 | * bigger than the others. And all updates will reset this | 3186 | * bigger than the others. And all updates will reset this |
3191 | * anyway. | 3187 | * anyway. |
3192 | */ | 3188 | */ |
3193 | kfree(cur_params); | 3189 | rcu_assign_pointer(s->memcg_params, new_params); |
3190 | if (cur_params) | ||
3191 | kfree_rcu(cur_params, rcu_head); | ||
3194 | } | 3192 | } |
3195 | return 0; | 3193 | return 0; |
3196 | } | 3194 | } |
3197 | 3195 | ||
3198 | int memcg_register_cache(struct mem_cgroup *memcg, struct kmem_cache *s, | 3196 | int memcg_alloc_cache_params(struct mem_cgroup *memcg, struct kmem_cache *s, |
3199 | struct kmem_cache *root_cache) | 3197 | struct kmem_cache *root_cache) |
3200 | { | 3198 | { |
3201 | size_t size; | 3199 | size_t size; |
3202 | 3200 | ||
@@ -3224,35 +3222,85 @@ int memcg_register_cache(struct mem_cgroup *memcg, struct kmem_cache *s, | |||
3224 | return 0; | 3222 | return 0; |
3225 | } | 3223 | } |
3226 | 3224 | ||
3227 | void memcg_release_cache(struct kmem_cache *s) | 3225 | void memcg_free_cache_params(struct kmem_cache *s) |
3226 | { | ||
3227 | kfree(s->memcg_params); | ||
3228 | } | ||
3229 | |||
3230 | void memcg_register_cache(struct kmem_cache *s) | ||
3228 | { | 3231 | { |
3229 | struct kmem_cache *root; | 3232 | struct kmem_cache *root; |
3230 | struct mem_cgroup *memcg; | 3233 | struct mem_cgroup *memcg; |
3231 | int id; | 3234 | int id; |
3232 | 3235 | ||
3233 | /* | 3236 | if (is_root_cache(s)) |
3234 | * This happens, for instance, when a root cache goes away before we | ||
3235 | * add any memcg. | ||
3236 | */ | ||
3237 | if (!s->memcg_params) | ||
3238 | return; | 3237 | return; |
3239 | 3238 | ||
3240 | if (s->memcg_params->is_root_cache) | 3239 | /* |
3241 | goto out; | 3240 | * Holding the slab_mutex assures nobody will touch the memcg_caches |
3241 | * array while we are modifying it. | ||
3242 | */ | ||
3243 | lockdep_assert_held(&slab_mutex); | ||
3242 | 3244 | ||
3245 | root = s->memcg_params->root_cache; | ||
3243 | memcg = s->memcg_params->memcg; | 3246 | memcg = s->memcg_params->memcg; |
3244 | id = memcg_cache_id(memcg); | 3247 | id = memcg_cache_id(memcg); |
3248 | |||
3249 | css_get(&memcg->css); | ||
3250 | |||
3251 | |||
3252 | /* | ||
3253 | * Since readers won't lock (see cache_from_memcg_idx()), we need a | ||
3254 | * barrier here to ensure nobody will see the kmem_cache partially | ||
3255 | * initialized. | ||
3256 | */ | ||
3257 | smp_wmb(); | ||
3258 | |||
3259 | /* | ||
3260 | * Initialize the pointer to this cache in its parent's memcg_params | ||
3261 | * before adding it to the memcg_slab_caches list, otherwise we can | ||
3262 | * fail to convert memcg_params_to_cache() while traversing the list. | ||
3263 | */ | ||
3264 | VM_BUG_ON(root->memcg_params->memcg_caches[id]); | ||
3265 | root->memcg_params->memcg_caches[id] = s; | ||
3266 | |||
3267 | mutex_lock(&memcg->slab_caches_mutex); | ||
3268 | list_add(&s->memcg_params->list, &memcg->memcg_slab_caches); | ||
3269 | mutex_unlock(&memcg->slab_caches_mutex); | ||
3270 | } | ||
3271 | |||
3272 | void memcg_unregister_cache(struct kmem_cache *s) | ||
3273 | { | ||
3274 | struct kmem_cache *root; | ||
3275 | struct mem_cgroup *memcg; | ||
3276 | int id; | ||
3277 | |||
3278 | if (is_root_cache(s)) | ||
3279 | return; | ||
3280 | |||
3281 | /* | ||
3282 | * Holding the slab_mutex assures nobody will touch the memcg_caches | ||
3283 | * array while we are modifying it. | ||
3284 | */ | ||
3285 | lockdep_assert_held(&slab_mutex); | ||
3245 | 3286 | ||
3246 | root = s->memcg_params->root_cache; | 3287 | root = s->memcg_params->root_cache; |
3247 | root->memcg_params->memcg_caches[id] = NULL; | 3288 | memcg = s->memcg_params->memcg; |
3289 | id = memcg_cache_id(memcg); | ||
3248 | 3290 | ||
3249 | mutex_lock(&memcg->slab_caches_mutex); | 3291 | mutex_lock(&memcg->slab_caches_mutex); |
3250 | list_del(&s->memcg_params->list); | 3292 | list_del(&s->memcg_params->list); |
3251 | mutex_unlock(&memcg->slab_caches_mutex); | 3293 | mutex_unlock(&memcg->slab_caches_mutex); |
3252 | 3294 | ||
3295 | /* | ||
3296 | * Clear the pointer to this cache in its parent's memcg_params only | ||
3297 | * after removing it from the memcg_slab_caches list, otherwise we can | ||
3298 | * fail to convert memcg_params_to_cache() while traversing the list. | ||
3299 | */ | ||
3300 | VM_BUG_ON(!root->memcg_params->memcg_caches[id]); | ||
3301 | root->memcg_params->memcg_caches[id] = NULL; | ||
3302 | |||
3253 | css_put(&memcg->css); | 3303 | css_put(&memcg->css); |
3254 | out: | ||
3255 | kfree(s->memcg_params); | ||
3256 | } | 3304 | } |
3257 | 3305 | ||
3258 | /* | 3306 | /* |
@@ -3311,11 +3359,9 @@ static void kmem_cache_destroy_work_func(struct work_struct *w) | |||
3311 | * So if we aren't down to zero, we'll just schedule a worker and try | 3359 | * So if we aren't down to zero, we'll just schedule a worker and try |
3312 | * again | 3360 | * again |
3313 | */ | 3361 | */ |
3314 | if (atomic_read(&cachep->memcg_params->nr_pages) != 0) { | 3362 | if (atomic_read(&cachep->memcg_params->nr_pages) != 0) |
3315 | kmem_cache_shrink(cachep); | 3363 | kmem_cache_shrink(cachep); |
3316 | if (atomic_read(&cachep->memcg_params->nr_pages) == 0) | 3364 | else |
3317 | return; | ||
3318 | } else | ||
3319 | kmem_cache_destroy(cachep); | 3365 | kmem_cache_destroy(cachep); |
3320 | } | 3366 | } |
3321 | 3367 | ||
@@ -3351,27 +3397,16 @@ void mem_cgroup_destroy_cache(struct kmem_cache *cachep) | |||
3351 | schedule_work(&cachep->memcg_params->destroy); | 3397 | schedule_work(&cachep->memcg_params->destroy); |
3352 | } | 3398 | } |
3353 | 3399 | ||
3354 | /* | 3400 | static struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg, |
3355 | * This lock protects updaters, not readers. We want readers to be as fast as | 3401 | struct kmem_cache *s) |
3356 | * they can, and they will either see NULL or a valid cache value. Our model | ||
3357 | * allow them to see NULL, in which case the root memcg will be selected. | ||
3358 | * | ||
3359 | * We need this lock because multiple allocations to the same cache from a non | ||
3360 | * will span more than one worker. Only one of them can create the cache. | ||
3361 | */ | ||
3362 | static DEFINE_MUTEX(memcg_cache_mutex); | ||
3363 | |||
3364 | /* | ||
3365 | * Called with memcg_cache_mutex held | ||
3366 | */ | ||
3367 | static struct kmem_cache *kmem_cache_dup(struct mem_cgroup *memcg, | ||
3368 | struct kmem_cache *s) | ||
3369 | { | 3402 | { |
3370 | struct kmem_cache *new; | 3403 | struct kmem_cache *new = NULL; |
3371 | static char *tmp_name = NULL; | 3404 | static char *tmp_name = NULL; |
3405 | static DEFINE_MUTEX(mutex); /* protects tmp_name */ | ||
3372 | 3406 | ||
3373 | lockdep_assert_held(&memcg_cache_mutex); | 3407 | BUG_ON(!memcg_can_account_kmem(memcg)); |
3374 | 3408 | ||
3409 | mutex_lock(&mutex); | ||
3375 | /* | 3410 | /* |
3376 | * kmem_cache_create_memcg duplicates the given name and | 3411 | * kmem_cache_create_memcg duplicates the given name and |
3377 | * cgroup_name for this name requires RCU context. | 3412 | * cgroup_name for this name requires RCU context. |
@@ -3381,7 +3416,7 @@ static struct kmem_cache *kmem_cache_dup(struct mem_cgroup *memcg, | |||
3381 | if (!tmp_name) { | 3416 | if (!tmp_name) { |
3382 | tmp_name = kmalloc(PATH_MAX, GFP_KERNEL); | 3417 | tmp_name = kmalloc(PATH_MAX, GFP_KERNEL); |
3383 | if (!tmp_name) | 3418 | if (!tmp_name) |
3384 | return NULL; | 3419 | goto out; |
3385 | } | 3420 | } |
3386 | 3421 | ||
3387 | rcu_read_lock(); | 3422 | rcu_read_lock(); |
@@ -3391,48 +3426,13 @@ static struct kmem_cache *kmem_cache_dup(struct mem_cgroup *memcg, | |||
3391 | 3426 | ||
3392 | new = kmem_cache_create_memcg(memcg, tmp_name, s->object_size, s->align, | 3427 | new = kmem_cache_create_memcg(memcg, tmp_name, s->object_size, s->align, |
3393 | (s->flags & ~SLAB_PANIC), s->ctor, s); | 3428 | (s->flags & ~SLAB_PANIC), s->ctor, s); |
3394 | |||
3395 | if (new) | 3429 | if (new) |
3396 | new->allocflags |= __GFP_KMEMCG; | 3430 | new->allocflags |= __GFP_KMEMCG; |
3397 | 3431 | else | |
3398 | return new; | 3432 | new = s; |
3399 | } | ||
3400 | |||
3401 | static struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg, | ||
3402 | struct kmem_cache *cachep) | ||
3403 | { | ||
3404 | struct kmem_cache *new_cachep; | ||
3405 | int idx; | ||
3406 | |||
3407 | BUG_ON(!memcg_can_account_kmem(memcg)); | ||
3408 | |||
3409 | idx = memcg_cache_id(memcg); | ||
3410 | |||
3411 | mutex_lock(&memcg_cache_mutex); | ||
3412 | new_cachep = cache_from_memcg_idx(cachep, idx); | ||
3413 | if (new_cachep) { | ||
3414 | css_put(&memcg->css); | ||
3415 | goto out; | ||
3416 | } | ||
3417 | |||
3418 | new_cachep = kmem_cache_dup(memcg, cachep); | ||
3419 | if (new_cachep == NULL) { | ||
3420 | new_cachep = cachep; | ||
3421 | css_put(&memcg->css); | ||
3422 | goto out; | ||
3423 | } | ||
3424 | |||
3425 | atomic_set(&new_cachep->memcg_params->nr_pages , 0); | ||
3426 | |||
3427 | cachep->memcg_params->memcg_caches[idx] = new_cachep; | ||
3428 | /* | ||
3429 | * the readers won't lock, make sure everybody sees the updated value, | ||
3430 | * so they won't put stuff in the queue again for no reason | ||
3431 | */ | ||
3432 | wmb(); | ||
3433 | out: | 3433 | out: |
3434 | mutex_unlock(&memcg_cache_mutex); | 3434 | mutex_unlock(&mutex); |
3435 | return new_cachep; | 3435 | return new; |
3436 | } | 3436 | } |
3437 | 3437 | ||
3438 | void kmem_cache_destroy_memcg_children(struct kmem_cache *s) | 3438 | void kmem_cache_destroy_memcg_children(struct kmem_cache *s) |
@@ -3452,9 +3452,10 @@ void kmem_cache_destroy_memcg_children(struct kmem_cache *s) | |||
3452 | * | 3452 | * |
3453 | * Still, we don't want anyone else freeing memcg_caches under our | 3453 | * Still, we don't want anyone else freeing memcg_caches under our |
3454 | * noses, which can happen if a new memcg comes to life. As usual, | 3454 | * noses, which can happen if a new memcg comes to life. As usual, |
3455 | * we'll take the set_limit_mutex to protect ourselves against this. | 3455 | * we'll take the activate_kmem_mutex to protect ourselves against |
3456 | * this. | ||
3456 | */ | 3457 | */ |
3457 | mutex_lock(&set_limit_mutex); | 3458 | mutex_lock(&activate_kmem_mutex); |
3458 | for_each_memcg_cache_index(i) { | 3459 | for_each_memcg_cache_index(i) { |
3459 | c = cache_from_memcg_idx(s, i); | 3460 | c = cache_from_memcg_idx(s, i); |
3460 | if (!c) | 3461 | if (!c) |
@@ -3477,7 +3478,7 @@ void kmem_cache_destroy_memcg_children(struct kmem_cache *s) | |||
3477 | cancel_work_sync(&c->memcg_params->destroy); | 3478 | cancel_work_sync(&c->memcg_params->destroy); |
3478 | kmem_cache_destroy(c); | 3479 | kmem_cache_destroy(c); |
3479 | } | 3480 | } |
3480 | mutex_unlock(&set_limit_mutex); | 3481 | mutex_unlock(&activate_kmem_mutex); |
3481 | } | 3482 | } |
3482 | 3483 | ||
3483 | struct create_work { | 3484 | struct create_work { |
@@ -3509,6 +3510,7 @@ static void memcg_create_cache_work_func(struct work_struct *w) | |||
3509 | 3510 | ||
3510 | cw = container_of(w, struct create_work, work); | 3511 | cw = container_of(w, struct create_work, work); |
3511 | memcg_create_kmem_cache(cw->memcg, cw->cachep); | 3512 | memcg_create_kmem_cache(cw->memcg, cw->cachep); |
3513 | css_put(&cw->memcg->css); | ||
3512 | kfree(cw); | 3514 | kfree(cw); |
3513 | } | 3515 | } |
3514 | 3516 | ||
@@ -3568,7 +3570,7 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep, | |||
3568 | gfp_t gfp) | 3570 | gfp_t gfp) |
3569 | { | 3571 | { |
3570 | struct mem_cgroup *memcg; | 3572 | struct mem_cgroup *memcg; |
3571 | int idx; | 3573 | struct kmem_cache *memcg_cachep; |
3572 | 3574 | ||
3573 | VM_BUG_ON(!cachep->memcg_params); | 3575 | VM_BUG_ON(!cachep->memcg_params); |
3574 | VM_BUG_ON(!cachep->memcg_params->is_root_cache); | 3576 | VM_BUG_ON(!cachep->memcg_params->is_root_cache); |
@@ -3582,15 +3584,9 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep, | |||
3582 | if (!memcg_can_account_kmem(memcg)) | 3584 | if (!memcg_can_account_kmem(memcg)) |
3583 | goto out; | 3585 | goto out; |
3584 | 3586 | ||
3585 | idx = memcg_cache_id(memcg); | 3587 | memcg_cachep = cache_from_memcg_idx(cachep, memcg_cache_id(memcg)); |
3586 | 3588 | if (likely(memcg_cachep)) { | |
3587 | /* | 3589 | cachep = memcg_cachep; |
3588 | * barrier to mare sure we're always seeing the up to date value. The | ||
3589 | * code updating memcg_caches will issue a write barrier to match this. | ||
3590 | */ | ||
3591 | read_barrier_depends(); | ||
3592 | if (likely(cache_from_memcg_idx(cachep, idx))) { | ||
3593 | cachep = cache_from_memcg_idx(cachep, idx); | ||
3594 | goto out; | 3590 | goto out; |
3595 | } | 3591 | } |
3596 | 3592 | ||
@@ -3744,7 +3740,7 @@ void __memcg_kmem_uncharge_pages(struct page *page, int order) | |||
3744 | if (!memcg) | 3740 | if (!memcg) |
3745 | return; | 3741 | return; |
3746 | 3742 | ||
3747 | VM_BUG_ON(mem_cgroup_is_root(memcg)); | 3743 | VM_BUG_ON_PAGE(mem_cgroup_is_root(memcg), page); |
3748 | memcg_uncharge_kmem(memcg, PAGE_SIZE << order); | 3744 | memcg_uncharge_kmem(memcg, PAGE_SIZE << order); |
3749 | } | 3745 | } |
3750 | #else | 3746 | #else |
@@ -3823,7 +3819,7 @@ static int mem_cgroup_move_account(struct page *page, | |||
3823 | bool anon = PageAnon(page); | 3819 | bool anon = PageAnon(page); |
3824 | 3820 | ||
3825 | VM_BUG_ON(from == to); | 3821 | VM_BUG_ON(from == to); |
3826 | VM_BUG_ON(PageLRU(page)); | 3822 | VM_BUG_ON_PAGE(PageLRU(page), page); |
3827 | /* | 3823 | /* |
3828 | * The page is isolated from LRU. So, collapse function | 3824 | * The page is isolated from LRU. So, collapse function |
3829 | * will not handle this page. But page splitting can happen. | 3825 | * will not handle this page. But page splitting can happen. |
@@ -3916,7 +3912,7 @@ static int mem_cgroup_move_parent(struct page *page, | |||
3916 | parent = root_mem_cgroup; | 3912 | parent = root_mem_cgroup; |
3917 | 3913 | ||
3918 | if (nr_pages > 1) { | 3914 | if (nr_pages > 1) { |
3919 | VM_BUG_ON(!PageTransHuge(page)); | 3915 | VM_BUG_ON_PAGE(!PageTransHuge(page), page); |
3920 | flags = compound_lock_irqsave(page); | 3916 | flags = compound_lock_irqsave(page); |
3921 | } | 3917 | } |
3922 | 3918 | ||
@@ -3950,7 +3946,7 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, | |||
3950 | 3946 | ||
3951 | if (PageTransHuge(page)) { | 3947 | if (PageTransHuge(page)) { |
3952 | nr_pages <<= compound_order(page); | 3948 | nr_pages <<= compound_order(page); |
3953 | VM_BUG_ON(!PageTransHuge(page)); | 3949 | VM_BUG_ON_PAGE(!PageTransHuge(page), page); |
3954 | /* | 3950 | /* |
3955 | * Never OOM-kill a process for a huge page. The | 3951 | * Never OOM-kill a process for a huge page. The |
3956 | * fault handler will fall back to regular pages. | 3952 | * fault handler will fall back to regular pages. |
@@ -3970,8 +3966,8 @@ int mem_cgroup_newpage_charge(struct page *page, | |||
3970 | { | 3966 | { |
3971 | if (mem_cgroup_disabled()) | 3967 | if (mem_cgroup_disabled()) |
3972 | return 0; | 3968 | return 0; |
3973 | VM_BUG_ON(page_mapped(page)); | 3969 | VM_BUG_ON_PAGE(page_mapped(page), page); |
3974 | VM_BUG_ON(page->mapping && !PageAnon(page)); | 3970 | VM_BUG_ON_PAGE(page->mapping && !PageAnon(page), page); |
3975 | VM_BUG_ON(!mm); | 3971 | VM_BUG_ON(!mm); |
3976 | return mem_cgroup_charge_common(page, mm, gfp_mask, | 3972 | return mem_cgroup_charge_common(page, mm, gfp_mask, |
3977 | MEM_CGROUP_CHARGE_TYPE_ANON); | 3973 | MEM_CGROUP_CHARGE_TYPE_ANON); |
@@ -4175,7 +4171,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype, | |||
4175 | 4171 | ||
4176 | if (PageTransHuge(page)) { | 4172 | if (PageTransHuge(page)) { |
4177 | nr_pages <<= compound_order(page); | 4173 | nr_pages <<= compound_order(page); |
4178 | VM_BUG_ON(!PageTransHuge(page)); | 4174 | VM_BUG_ON_PAGE(!PageTransHuge(page), page); |
4179 | } | 4175 | } |
4180 | /* | 4176 | /* |
4181 | * Check if our page_cgroup is valid | 4177 | * Check if our page_cgroup is valid |
@@ -4267,7 +4263,7 @@ void mem_cgroup_uncharge_page(struct page *page) | |||
4267 | /* early check. */ | 4263 | /* early check. */ |
4268 | if (page_mapped(page)) | 4264 | if (page_mapped(page)) |
4269 | return; | 4265 | return; |
4270 | VM_BUG_ON(page->mapping && !PageAnon(page)); | 4266 | VM_BUG_ON_PAGE(page->mapping && !PageAnon(page), page); |
4271 | /* | 4267 | /* |
4272 | * If the page is in swap cache, uncharge should be deferred | 4268 | * If the page is in swap cache, uncharge should be deferred |
4273 | * to the swap path, which also properly accounts swap usage | 4269 | * to the swap path, which also properly accounts swap usage |
@@ -4287,8 +4283,8 @@ void mem_cgroup_uncharge_page(struct page *page) | |||
4287 | 4283 | ||
4288 | void mem_cgroup_uncharge_cache_page(struct page *page) | 4284 | void mem_cgroup_uncharge_cache_page(struct page *page) |
4289 | { | 4285 | { |
4290 | VM_BUG_ON(page_mapped(page)); | 4286 | VM_BUG_ON_PAGE(page_mapped(page), page); |
4291 | VM_BUG_ON(page->mapping); | 4287 | VM_BUG_ON_PAGE(page->mapping, page); |
4292 | __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE, false); | 4288 | __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE, false); |
4293 | } | 4289 | } |
4294 | 4290 | ||
@@ -5112,14 +5108,12 @@ static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap) | |||
5112 | return val << PAGE_SHIFT; | 5108 | return val << PAGE_SHIFT; |
5113 | } | 5109 | } |
5114 | 5110 | ||
5115 | static ssize_t mem_cgroup_read(struct cgroup_subsys_state *css, | 5111 | static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css, |
5116 | struct cftype *cft, struct file *file, | 5112 | struct cftype *cft) |
5117 | char __user *buf, size_t nbytes, loff_t *ppos) | ||
5118 | { | 5113 | { |
5119 | struct mem_cgroup *memcg = mem_cgroup_from_css(css); | 5114 | struct mem_cgroup *memcg = mem_cgroup_from_css(css); |
5120 | char str[64]; | ||
5121 | u64 val; | 5115 | u64 val; |
5122 | int name, len; | 5116 | int name; |
5123 | enum res_type type; | 5117 | enum res_type type; |
5124 | 5118 | ||
5125 | type = MEMFILE_TYPE(cft->private); | 5119 | type = MEMFILE_TYPE(cft->private); |
@@ -5145,15 +5139,26 @@ static ssize_t mem_cgroup_read(struct cgroup_subsys_state *css, | |||
5145 | BUG(); | 5139 | BUG(); |
5146 | } | 5140 | } |
5147 | 5141 | ||
5148 | len = scnprintf(str, sizeof(str), "%llu\n", (unsigned long long)val); | 5142 | return val; |
5149 | return simple_read_from_buffer(buf, nbytes, ppos, str, len); | ||
5150 | } | 5143 | } |
5151 | 5144 | ||
5152 | static int memcg_update_kmem_limit(struct cgroup_subsys_state *css, u64 val) | ||
5153 | { | ||
5154 | int ret = -EINVAL; | ||
5155 | #ifdef CONFIG_MEMCG_KMEM | 5145 | #ifdef CONFIG_MEMCG_KMEM |
5156 | struct mem_cgroup *memcg = mem_cgroup_from_css(css); | 5146 | /* should be called with activate_kmem_mutex held */ |
5147 | static int __memcg_activate_kmem(struct mem_cgroup *memcg, | ||
5148 | unsigned long long limit) | ||
5149 | { | ||
5150 | int err = 0; | ||
5151 | int memcg_id; | ||
5152 | |||
5153 | if (memcg_kmem_is_active(memcg)) | ||
5154 | return 0; | ||
5155 | |||
5156 | /* | ||
5157 | * We are going to allocate memory for data shared by all memory | ||
5158 | * cgroups so let's stop accounting here. | ||
5159 | */ | ||
5160 | memcg_stop_kmem_account(); | ||
5161 | |||
5157 | /* | 5162 | /* |
5158 | * For simplicity, we won't allow this to be disabled. It also can't | 5163 | * For simplicity, we won't allow this to be disabled. It also can't |
5159 | * be changed if the cgroup has children already, or if tasks had | 5164 | * be changed if the cgroup has children already, or if tasks had |
@@ -5167,72 +5172,101 @@ static int memcg_update_kmem_limit(struct cgroup_subsys_state *css, u64 val) | |||
5167 | * of course permitted. | 5172 | * of course permitted. |
5168 | */ | 5173 | */ |
5169 | mutex_lock(&memcg_create_mutex); | 5174 | mutex_lock(&memcg_create_mutex); |
5170 | mutex_lock(&set_limit_mutex); | 5175 | if (cgroup_task_count(memcg->css.cgroup) || memcg_has_children(memcg)) |
5171 | if (!memcg->kmem_account_flags && val != RES_COUNTER_MAX) { | 5176 | err = -EBUSY; |
5172 | if (cgroup_task_count(css->cgroup) || memcg_has_children(memcg)) { | 5177 | mutex_unlock(&memcg_create_mutex); |
5173 | ret = -EBUSY; | 5178 | if (err) |
5174 | goto out; | 5179 | goto out; |
5175 | } | ||
5176 | ret = res_counter_set_limit(&memcg->kmem, val); | ||
5177 | VM_BUG_ON(ret); | ||
5178 | 5180 | ||
5179 | ret = memcg_update_cache_sizes(memcg); | 5181 | memcg_id = ida_simple_get(&kmem_limited_groups, |
5180 | if (ret) { | 5182 | 0, MEMCG_CACHES_MAX_SIZE, GFP_KERNEL); |
5181 | res_counter_set_limit(&memcg->kmem, RES_COUNTER_MAX); | 5183 | if (memcg_id < 0) { |
5182 | goto out; | 5184 | err = memcg_id; |
5183 | } | 5185 | goto out; |
5184 | static_key_slow_inc(&memcg_kmem_enabled_key); | 5186 | } |
5185 | /* | 5187 | |
5186 | * setting the active bit after the inc will guarantee no one | 5188 | /* |
5187 | * starts accounting before all call sites are patched | 5189 | * Make sure we have enough space for this cgroup in each root cache's |
5188 | */ | 5190 | * memcg_params. |
5189 | memcg_kmem_set_active(memcg); | 5191 | */ |
5190 | } else | 5192 | err = memcg_update_all_caches(memcg_id + 1); |
5191 | ret = res_counter_set_limit(&memcg->kmem, val); | 5193 | if (err) |
5194 | goto out_rmid; | ||
5195 | |||
5196 | memcg->kmemcg_id = memcg_id; | ||
5197 | INIT_LIST_HEAD(&memcg->memcg_slab_caches); | ||
5198 | mutex_init(&memcg->slab_caches_mutex); | ||
5199 | |||
5200 | /* | ||
5201 | * We couldn't have accounted to this cgroup, because it hasn't got the | ||
5202 | * active bit set yet, so this should succeed. | ||
5203 | */ | ||
5204 | err = res_counter_set_limit(&memcg->kmem, limit); | ||
5205 | VM_BUG_ON(err); | ||
5206 | |||
5207 | static_key_slow_inc(&memcg_kmem_enabled_key); | ||
5208 | /* | ||
5209 | * Setting the active bit after enabling static branching will | ||
5210 | * guarantee no one starts accounting before all call sites are | ||
5211 | * patched. | ||
5212 | */ | ||
5213 | memcg_kmem_set_active(memcg); | ||
5192 | out: | 5214 | out: |
5193 | mutex_unlock(&set_limit_mutex); | 5215 | memcg_resume_kmem_account(); |
5194 | mutex_unlock(&memcg_create_mutex); | 5216 | return err; |
5195 | #endif | 5217 | |
5218 | out_rmid: | ||
5219 | ida_simple_remove(&kmem_limited_groups, memcg_id); | ||
5220 | goto out; | ||
5221 | } | ||
5222 | |||
5223 | static int memcg_activate_kmem(struct mem_cgroup *memcg, | ||
5224 | unsigned long long limit) | ||
5225 | { | ||
5226 | int ret; | ||
5227 | |||
5228 | mutex_lock(&activate_kmem_mutex); | ||
5229 | ret = __memcg_activate_kmem(memcg, limit); | ||
5230 | mutex_unlock(&activate_kmem_mutex); | ||
5231 | return ret; | ||
5232 | } | ||
5233 | |||
5234 | static int memcg_update_kmem_limit(struct mem_cgroup *memcg, | ||
5235 | unsigned long long val) | ||
5236 | { | ||
5237 | int ret; | ||
5238 | |||
5239 | if (!memcg_kmem_is_active(memcg)) | ||
5240 | ret = memcg_activate_kmem(memcg, val); | ||
5241 | else | ||
5242 | ret = res_counter_set_limit(&memcg->kmem, val); | ||
5196 | return ret; | 5243 | return ret; |
5197 | } | 5244 | } |
5198 | 5245 | ||
5199 | #ifdef CONFIG_MEMCG_KMEM | ||
5200 | static int memcg_propagate_kmem(struct mem_cgroup *memcg) | 5246 | static int memcg_propagate_kmem(struct mem_cgroup *memcg) |
5201 | { | 5247 | { |
5202 | int ret = 0; | 5248 | int ret = 0; |
5203 | struct mem_cgroup *parent = parent_mem_cgroup(memcg); | 5249 | struct mem_cgroup *parent = parent_mem_cgroup(memcg); |
5204 | if (!parent) | ||
5205 | goto out; | ||
5206 | 5250 | ||
5207 | memcg->kmem_account_flags = parent->kmem_account_flags; | 5251 | if (!parent) |
5208 | /* | 5252 | return 0; |
5209 | * When that happen, we need to disable the static branch only on those | ||
5210 | * memcgs that enabled it. To achieve this, we would be forced to | ||
5211 | * complicate the code by keeping track of which memcgs were the ones | ||
5212 | * that actually enabled limits, and which ones got it from its | ||
5213 | * parents. | ||
5214 | * | ||
5215 | * It is a lot simpler just to do static_key_slow_inc() on every child | ||
5216 | * that is accounted. | ||
5217 | */ | ||
5218 | if (!memcg_kmem_is_active(memcg)) | ||
5219 | goto out; | ||
5220 | 5253 | ||
5254 | mutex_lock(&activate_kmem_mutex); | ||
5221 | /* | 5255 | /* |
5222 | * __mem_cgroup_free() will issue static_key_slow_dec() because this | 5256 | * If the parent cgroup is not kmem-active now, it cannot be activated |
5223 | * memcg is active already. If the later initialization fails then the | 5257 | * after this point, because it has at least one child already. |
5224 | * cgroup core triggers the cleanup so we do not have to do it here. | ||
5225 | */ | 5258 | */ |
5226 | static_key_slow_inc(&memcg_kmem_enabled_key); | 5259 | if (memcg_kmem_is_active(parent)) |
5227 | 5260 | ret = __memcg_activate_kmem(memcg, RES_COUNTER_MAX); | |
5228 | mutex_lock(&set_limit_mutex); | 5261 | mutex_unlock(&activate_kmem_mutex); |
5229 | memcg_stop_kmem_account(); | ||
5230 | ret = memcg_update_cache_sizes(memcg); | ||
5231 | memcg_resume_kmem_account(); | ||
5232 | mutex_unlock(&set_limit_mutex); | ||
5233 | out: | ||
5234 | return ret; | 5262 | return ret; |
5235 | } | 5263 | } |
5264 | #else | ||
5265 | static int memcg_update_kmem_limit(struct mem_cgroup *memcg, | ||
5266 | unsigned long long val) | ||
5267 | { | ||
5268 | return -EINVAL; | ||
5269 | } | ||
5236 | #endif /* CONFIG_MEMCG_KMEM */ | 5270 | #endif /* CONFIG_MEMCG_KMEM */ |
5237 | 5271 | ||
5238 | /* | 5272 | /* |
@@ -5266,7 +5300,7 @@ static int mem_cgroup_write(struct cgroup_subsys_state *css, struct cftype *cft, | |||
5266 | else if (type == _MEMSWAP) | 5300 | else if (type == _MEMSWAP) |
5267 | ret = mem_cgroup_resize_memsw_limit(memcg, val); | 5301 | ret = mem_cgroup_resize_memsw_limit(memcg, val); |
5268 | else if (type == _KMEM) | 5302 | else if (type == _KMEM) |
5269 | ret = memcg_update_kmem_limit(css, val); | 5303 | ret = memcg_update_kmem_limit(memcg, val); |
5270 | else | 5304 | else |
5271 | return -EINVAL; | 5305 | return -EINVAL; |
5272 | break; | 5306 | break; |
@@ -5383,8 +5417,7 @@ static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css, | |||
5383 | #endif | 5417 | #endif |
5384 | 5418 | ||
5385 | #ifdef CONFIG_NUMA | 5419 | #ifdef CONFIG_NUMA |
5386 | static int memcg_numa_stat_show(struct cgroup_subsys_state *css, | 5420 | static int memcg_numa_stat_show(struct seq_file *m, void *v) |
5387 | struct cftype *cft, struct seq_file *m) | ||
5388 | { | 5421 | { |
5389 | struct numa_stat { | 5422 | struct numa_stat { |
5390 | const char *name; | 5423 | const char *name; |
@@ -5400,7 +5433,7 @@ static int memcg_numa_stat_show(struct cgroup_subsys_state *css, | |||
5400 | const struct numa_stat *stat; | 5433 | const struct numa_stat *stat; |
5401 | int nid; | 5434 | int nid; |
5402 | unsigned long nr; | 5435 | unsigned long nr; |
5403 | struct mem_cgroup *memcg = mem_cgroup_from_css(css); | 5436 | struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); |
5404 | 5437 | ||
5405 | for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) { | 5438 | for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) { |
5406 | nr = mem_cgroup_nr_lru_pages(memcg, stat->lru_mask); | 5439 | nr = mem_cgroup_nr_lru_pages(memcg, stat->lru_mask); |
@@ -5439,10 +5472,9 @@ static inline void mem_cgroup_lru_names_not_uptodate(void) | |||
5439 | BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS); | 5472 | BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS); |
5440 | } | 5473 | } |
5441 | 5474 | ||
5442 | static int memcg_stat_show(struct cgroup_subsys_state *css, struct cftype *cft, | 5475 | static int memcg_stat_show(struct seq_file *m, void *v) |
5443 | struct seq_file *m) | ||
5444 | { | 5476 | { |
5445 | struct mem_cgroup *memcg = mem_cgroup_from_css(css); | 5477 | struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); |
5446 | struct mem_cgroup *mi; | 5478 | struct mem_cgroup *mi; |
5447 | unsigned int i; | 5479 | unsigned int i; |
5448 | 5480 | ||
@@ -5651,13 +5683,11 @@ static void mem_cgroup_oom_notify(struct mem_cgroup *memcg) | |||
5651 | mem_cgroup_oom_notify_cb(iter); | 5683 | mem_cgroup_oom_notify_cb(iter); |
5652 | } | 5684 | } |
5653 | 5685 | ||
5654 | static int mem_cgroup_usage_register_event(struct cgroup_subsys_state *css, | 5686 | static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg, |
5655 | struct cftype *cft, struct eventfd_ctx *eventfd, const char *args) | 5687 | struct eventfd_ctx *eventfd, const char *args, enum res_type type) |
5656 | { | 5688 | { |
5657 | struct mem_cgroup *memcg = mem_cgroup_from_css(css); | ||
5658 | struct mem_cgroup_thresholds *thresholds; | 5689 | struct mem_cgroup_thresholds *thresholds; |
5659 | struct mem_cgroup_threshold_ary *new; | 5690 | struct mem_cgroup_threshold_ary *new; |
5660 | enum res_type type = MEMFILE_TYPE(cft->private); | ||
5661 | u64 threshold, usage; | 5691 | u64 threshold, usage; |
5662 | int i, size, ret; | 5692 | int i, size, ret; |
5663 | 5693 | ||
@@ -5734,13 +5764,23 @@ unlock: | |||
5734 | return ret; | 5764 | return ret; |
5735 | } | 5765 | } |
5736 | 5766 | ||
5737 | static void mem_cgroup_usage_unregister_event(struct cgroup_subsys_state *css, | 5767 | static int mem_cgroup_usage_register_event(struct mem_cgroup *memcg, |
5738 | struct cftype *cft, struct eventfd_ctx *eventfd) | 5768 | struct eventfd_ctx *eventfd, const char *args) |
5769 | { | ||
5770 | return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEM); | ||
5771 | } | ||
5772 | |||
5773 | static int memsw_cgroup_usage_register_event(struct mem_cgroup *memcg, | ||
5774 | struct eventfd_ctx *eventfd, const char *args) | ||
5775 | { | ||
5776 | return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEMSWAP); | ||
5777 | } | ||
5778 | |||
5779 | static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg, | ||
5780 | struct eventfd_ctx *eventfd, enum res_type type) | ||
5739 | { | 5781 | { |
5740 | struct mem_cgroup *memcg = mem_cgroup_from_css(css); | ||
5741 | struct mem_cgroup_thresholds *thresholds; | 5782 | struct mem_cgroup_thresholds *thresholds; |
5742 | struct mem_cgroup_threshold_ary *new; | 5783 | struct mem_cgroup_threshold_ary *new; |
5743 | enum res_type type = MEMFILE_TYPE(cft->private); | ||
5744 | u64 usage; | 5784 | u64 usage; |
5745 | int i, j, size; | 5785 | int i, j, size; |
5746 | 5786 | ||
@@ -5813,14 +5853,23 @@ unlock: | |||
5813 | mutex_unlock(&memcg->thresholds_lock); | 5853 | mutex_unlock(&memcg->thresholds_lock); |
5814 | } | 5854 | } |
5815 | 5855 | ||
5816 | static int mem_cgroup_oom_register_event(struct cgroup_subsys_state *css, | 5856 | static void mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg, |
5817 | struct cftype *cft, struct eventfd_ctx *eventfd, const char *args) | 5857 | struct eventfd_ctx *eventfd) |
5858 | { | ||
5859 | return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEM); | ||
5860 | } | ||
5861 | |||
5862 | static void memsw_cgroup_usage_unregister_event(struct mem_cgroup *memcg, | ||
5863 | struct eventfd_ctx *eventfd) | ||
5864 | { | ||
5865 | return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEMSWAP); | ||
5866 | } | ||
5867 | |||
5868 | static int mem_cgroup_oom_register_event(struct mem_cgroup *memcg, | ||
5869 | struct eventfd_ctx *eventfd, const char *args) | ||
5818 | { | 5870 | { |
5819 | struct mem_cgroup *memcg = mem_cgroup_from_css(css); | ||
5820 | struct mem_cgroup_eventfd_list *event; | 5871 | struct mem_cgroup_eventfd_list *event; |
5821 | enum res_type type = MEMFILE_TYPE(cft->private); | ||
5822 | 5872 | ||
5823 | BUG_ON(type != _OOM_TYPE); | ||
5824 | event = kmalloc(sizeof(*event), GFP_KERNEL); | 5873 | event = kmalloc(sizeof(*event), GFP_KERNEL); |
5825 | if (!event) | 5874 | if (!event) |
5826 | return -ENOMEM; | 5875 | return -ENOMEM; |
@@ -5838,14 +5887,10 @@ static int mem_cgroup_oom_register_event(struct cgroup_subsys_state *css, | |||
5838 | return 0; | 5887 | return 0; |
5839 | } | 5888 | } |
5840 | 5889 | ||
5841 | static void mem_cgroup_oom_unregister_event(struct cgroup_subsys_state *css, | 5890 | static void mem_cgroup_oom_unregister_event(struct mem_cgroup *memcg, |
5842 | struct cftype *cft, struct eventfd_ctx *eventfd) | 5891 | struct eventfd_ctx *eventfd) |
5843 | { | 5892 | { |
5844 | struct mem_cgroup *memcg = mem_cgroup_from_css(css); | ||
5845 | struct mem_cgroup_eventfd_list *ev, *tmp; | 5893 | struct mem_cgroup_eventfd_list *ev, *tmp; |
5846 | enum res_type type = MEMFILE_TYPE(cft->private); | ||
5847 | |||
5848 | BUG_ON(type != _OOM_TYPE); | ||
5849 | 5894 | ||
5850 | spin_lock(&memcg_oom_lock); | 5895 | spin_lock(&memcg_oom_lock); |
5851 | 5896 | ||
@@ -5859,17 +5904,12 @@ static void mem_cgroup_oom_unregister_event(struct cgroup_subsys_state *css, | |||
5859 | spin_unlock(&memcg_oom_lock); | 5904 | spin_unlock(&memcg_oom_lock); |
5860 | } | 5905 | } |
5861 | 5906 | ||
5862 | static int mem_cgroup_oom_control_read(struct cgroup_subsys_state *css, | 5907 | static int mem_cgroup_oom_control_read(struct seq_file *sf, void *v) |
5863 | struct cftype *cft, struct cgroup_map_cb *cb) | ||
5864 | { | 5908 | { |
5865 | struct mem_cgroup *memcg = mem_cgroup_from_css(css); | 5909 | struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(sf)); |
5866 | 5910 | ||
5867 | cb->fill(cb, "oom_kill_disable", memcg->oom_kill_disable); | 5911 | seq_printf(sf, "oom_kill_disable %d\n", memcg->oom_kill_disable); |
5868 | 5912 | seq_printf(sf, "under_oom %d\n", (bool)atomic_read(&memcg->under_oom)); | |
5869 | if (atomic_read(&memcg->under_oom)) | ||
5870 | cb->fill(cb, "under_oom", 1); | ||
5871 | else | ||
5872 | cb->fill(cb, "under_oom", 0); | ||
5873 | return 0; | 5913 | return 0; |
5874 | } | 5914 | } |
5875 | 5915 | ||
@@ -5962,41 +6002,261 @@ static void kmem_cgroup_css_offline(struct mem_cgroup *memcg) | |||
5962 | } | 6002 | } |
5963 | #endif | 6003 | #endif |
5964 | 6004 | ||
6005 | /* | ||
6006 | * DO NOT USE IN NEW FILES. | ||
6007 | * | ||
6008 | * "cgroup.event_control" implementation. | ||
6009 | * | ||
6010 | * This is way over-engineered. It tries to support fully configurable | ||
6011 | * events for each user. Such level of flexibility is completely | ||
6012 | * unnecessary especially in the light of the planned unified hierarchy. | ||
6013 | * | ||
6014 | * Please deprecate this and replace with something simpler if at all | ||
6015 | * possible. | ||
6016 | */ | ||
6017 | |||
6018 | /* | ||
6019 | * Unregister event and free resources. | ||
6020 | * | ||
6021 | * Gets called from workqueue. | ||
6022 | */ | ||
6023 | static void memcg_event_remove(struct work_struct *work) | ||
6024 | { | ||
6025 | struct mem_cgroup_event *event = | ||
6026 | container_of(work, struct mem_cgroup_event, remove); | ||
6027 | struct mem_cgroup *memcg = event->memcg; | ||
6028 | |||
6029 | remove_wait_queue(event->wqh, &event->wait); | ||
6030 | |||
6031 | event->unregister_event(memcg, event->eventfd); | ||
6032 | |||
6033 | /* Notify userspace the event is going away. */ | ||
6034 | eventfd_signal(event->eventfd, 1); | ||
6035 | |||
6036 | eventfd_ctx_put(event->eventfd); | ||
6037 | kfree(event); | ||
6038 | css_put(&memcg->css); | ||
6039 | } | ||
6040 | |||
6041 | /* | ||
6042 | * Gets called on POLLHUP on eventfd when user closes it. | ||
6043 | * | ||
6044 | * Called with wqh->lock held and interrupts disabled. | ||
6045 | */ | ||
6046 | static int memcg_event_wake(wait_queue_t *wait, unsigned mode, | ||
6047 | int sync, void *key) | ||
6048 | { | ||
6049 | struct mem_cgroup_event *event = | ||
6050 | container_of(wait, struct mem_cgroup_event, wait); | ||
6051 | struct mem_cgroup *memcg = event->memcg; | ||
6052 | unsigned long flags = (unsigned long)key; | ||
6053 | |||
6054 | if (flags & POLLHUP) { | ||
6055 | /* | ||
6056 | * If the event has been detached at cgroup removal, we | ||
6057 | * can simply return knowing the other side will cleanup | ||
6058 | * for us. | ||
6059 | * | ||
6060 | * We can't race against event freeing since the other | ||
6061 | * side will require wqh->lock via remove_wait_queue(), | ||
6062 | * which we hold. | ||
6063 | */ | ||
6064 | spin_lock(&memcg->event_list_lock); | ||
6065 | if (!list_empty(&event->list)) { | ||
6066 | list_del_init(&event->list); | ||
6067 | /* | ||
6068 | * We are in atomic context, but cgroup_event_remove() | ||
6069 | * may sleep, so we have to call it in workqueue. | ||
6070 | */ | ||
6071 | schedule_work(&event->remove); | ||
6072 | } | ||
6073 | spin_unlock(&memcg->event_list_lock); | ||
6074 | } | ||
6075 | |||
6076 | return 0; | ||
6077 | } | ||
6078 | |||
6079 | static void memcg_event_ptable_queue_proc(struct file *file, | ||
6080 | wait_queue_head_t *wqh, poll_table *pt) | ||
6081 | { | ||
6082 | struct mem_cgroup_event *event = | ||
6083 | container_of(pt, struct mem_cgroup_event, pt); | ||
6084 | |||
6085 | event->wqh = wqh; | ||
6086 | add_wait_queue(wqh, &event->wait); | ||
6087 | } | ||
6088 | |||
6089 | /* | ||
6090 | * DO NOT USE IN NEW FILES. | ||
6091 | * | ||
6092 | * Parse input and register new cgroup event handler. | ||
6093 | * | ||
6094 | * Input must be in format '<event_fd> <control_fd> <args>'. | ||
6095 | * Interpretation of args is defined by control file implementation. | ||
6096 | */ | ||
6097 | static int memcg_write_event_control(struct cgroup_subsys_state *css, | ||
6098 | struct cftype *cft, const char *buffer) | ||
6099 | { | ||
6100 | struct mem_cgroup *memcg = mem_cgroup_from_css(css); | ||
6101 | struct mem_cgroup_event *event; | ||
6102 | struct cgroup_subsys_state *cfile_css; | ||
6103 | unsigned int efd, cfd; | ||
6104 | struct fd efile; | ||
6105 | struct fd cfile; | ||
6106 | const char *name; | ||
6107 | char *endp; | ||
6108 | int ret; | ||
6109 | |||
6110 | efd = simple_strtoul(buffer, &endp, 10); | ||
6111 | if (*endp != ' ') | ||
6112 | return -EINVAL; | ||
6113 | buffer = endp + 1; | ||
6114 | |||
6115 | cfd = simple_strtoul(buffer, &endp, 10); | ||
6116 | if ((*endp != ' ') && (*endp != '\0')) | ||
6117 | return -EINVAL; | ||
6118 | buffer = endp + 1; | ||
6119 | |||
6120 | event = kzalloc(sizeof(*event), GFP_KERNEL); | ||
6121 | if (!event) | ||
6122 | return -ENOMEM; | ||
6123 | |||
6124 | event->memcg = memcg; | ||
6125 | INIT_LIST_HEAD(&event->list); | ||
6126 | init_poll_funcptr(&event->pt, memcg_event_ptable_queue_proc); | ||
6127 | init_waitqueue_func_entry(&event->wait, memcg_event_wake); | ||
6128 | INIT_WORK(&event->remove, memcg_event_remove); | ||
6129 | |||
6130 | efile = fdget(efd); | ||
6131 | if (!efile.file) { | ||
6132 | ret = -EBADF; | ||
6133 | goto out_kfree; | ||
6134 | } | ||
6135 | |||
6136 | event->eventfd = eventfd_ctx_fileget(efile.file); | ||
6137 | if (IS_ERR(event->eventfd)) { | ||
6138 | ret = PTR_ERR(event->eventfd); | ||
6139 | goto out_put_efile; | ||
6140 | } | ||
6141 | |||
6142 | cfile = fdget(cfd); | ||
6143 | if (!cfile.file) { | ||
6144 | ret = -EBADF; | ||
6145 | goto out_put_eventfd; | ||
6146 | } | ||
6147 | |||
6148 | /* the process need read permission on control file */ | ||
6149 | /* AV: shouldn't we check that it's been opened for read instead? */ | ||
6150 | ret = inode_permission(file_inode(cfile.file), MAY_READ); | ||
6151 | if (ret < 0) | ||
6152 | goto out_put_cfile; | ||
6153 | |||
6154 | /* | ||
6155 | * Determine the event callbacks and set them in @event. This used | ||
6156 | * to be done via struct cftype but cgroup core no longer knows | ||
6157 | * about these events. The following is crude but the whole thing | ||
6158 | * is for compatibility anyway. | ||
6159 | * | ||
6160 | * DO NOT ADD NEW FILES. | ||
6161 | */ | ||
6162 | name = cfile.file->f_dentry->d_name.name; | ||
6163 | |||
6164 | if (!strcmp(name, "memory.usage_in_bytes")) { | ||
6165 | event->register_event = mem_cgroup_usage_register_event; | ||
6166 | event->unregister_event = mem_cgroup_usage_unregister_event; | ||
6167 | } else if (!strcmp(name, "memory.oom_control")) { | ||
6168 | event->register_event = mem_cgroup_oom_register_event; | ||
6169 | event->unregister_event = mem_cgroup_oom_unregister_event; | ||
6170 | } else if (!strcmp(name, "memory.pressure_level")) { | ||
6171 | event->register_event = vmpressure_register_event; | ||
6172 | event->unregister_event = vmpressure_unregister_event; | ||
6173 | } else if (!strcmp(name, "memory.memsw.usage_in_bytes")) { | ||
6174 | event->register_event = memsw_cgroup_usage_register_event; | ||
6175 | event->unregister_event = memsw_cgroup_usage_unregister_event; | ||
6176 | } else { | ||
6177 | ret = -EINVAL; | ||
6178 | goto out_put_cfile; | ||
6179 | } | ||
6180 | |||
6181 | /* | ||
6182 | * Verify @cfile should belong to @css. Also, remaining events are | ||
6183 | * automatically removed on cgroup destruction but the removal is | ||
6184 | * asynchronous, so take an extra ref on @css. | ||
6185 | */ | ||
6186 | rcu_read_lock(); | ||
6187 | |||
6188 | ret = -EINVAL; | ||
6189 | cfile_css = css_from_dir(cfile.file->f_dentry->d_parent, | ||
6190 | &mem_cgroup_subsys); | ||
6191 | if (cfile_css == css && css_tryget(css)) | ||
6192 | ret = 0; | ||
6193 | |||
6194 | rcu_read_unlock(); | ||
6195 | if (ret) | ||
6196 | goto out_put_cfile; | ||
6197 | |||
6198 | ret = event->register_event(memcg, event->eventfd, buffer); | ||
6199 | if (ret) | ||
6200 | goto out_put_css; | ||
6201 | |||
6202 | efile.file->f_op->poll(efile.file, &event->pt); | ||
6203 | |||
6204 | spin_lock(&memcg->event_list_lock); | ||
6205 | list_add(&event->list, &memcg->event_list); | ||
6206 | spin_unlock(&memcg->event_list_lock); | ||
6207 | |||
6208 | fdput(cfile); | ||
6209 | fdput(efile); | ||
6210 | |||
6211 | return 0; | ||
6212 | |||
6213 | out_put_css: | ||
6214 | css_put(css); | ||
6215 | out_put_cfile: | ||
6216 | fdput(cfile); | ||
6217 | out_put_eventfd: | ||
6218 | eventfd_ctx_put(event->eventfd); | ||
6219 | out_put_efile: | ||
6220 | fdput(efile); | ||
6221 | out_kfree: | ||
6222 | kfree(event); | ||
6223 | |||
6224 | return ret; | ||
6225 | } | ||
6226 | |||
5965 | static struct cftype mem_cgroup_files[] = { | 6227 | static struct cftype mem_cgroup_files[] = { |
5966 | { | 6228 | { |
5967 | .name = "usage_in_bytes", | 6229 | .name = "usage_in_bytes", |
5968 | .private = MEMFILE_PRIVATE(_MEM, RES_USAGE), | 6230 | .private = MEMFILE_PRIVATE(_MEM, RES_USAGE), |
5969 | .read = mem_cgroup_read, | 6231 | .read_u64 = mem_cgroup_read_u64, |
5970 | .register_event = mem_cgroup_usage_register_event, | ||
5971 | .unregister_event = mem_cgroup_usage_unregister_event, | ||
5972 | }, | 6232 | }, |
5973 | { | 6233 | { |
5974 | .name = "max_usage_in_bytes", | 6234 | .name = "max_usage_in_bytes", |
5975 | .private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE), | 6235 | .private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE), |
5976 | .trigger = mem_cgroup_reset, | 6236 | .trigger = mem_cgroup_reset, |
5977 | .read = mem_cgroup_read, | 6237 | .read_u64 = mem_cgroup_read_u64, |
5978 | }, | 6238 | }, |
5979 | { | 6239 | { |
5980 | .name = "limit_in_bytes", | 6240 | .name = "limit_in_bytes", |
5981 | .private = MEMFILE_PRIVATE(_MEM, RES_LIMIT), | 6241 | .private = MEMFILE_PRIVATE(_MEM, RES_LIMIT), |
5982 | .write_string = mem_cgroup_write, | 6242 | .write_string = mem_cgroup_write, |
5983 | .read = mem_cgroup_read, | 6243 | .read_u64 = mem_cgroup_read_u64, |
5984 | }, | 6244 | }, |
5985 | { | 6245 | { |
5986 | .name = "soft_limit_in_bytes", | 6246 | .name = "soft_limit_in_bytes", |
5987 | .private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT), | 6247 | .private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT), |
5988 | .write_string = mem_cgroup_write, | 6248 | .write_string = mem_cgroup_write, |
5989 | .read = mem_cgroup_read, | 6249 | .read_u64 = mem_cgroup_read_u64, |
5990 | }, | 6250 | }, |
5991 | { | 6251 | { |
5992 | .name = "failcnt", | 6252 | .name = "failcnt", |
5993 | .private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT), | 6253 | .private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT), |
5994 | .trigger = mem_cgroup_reset, | 6254 | .trigger = mem_cgroup_reset, |
5995 | .read = mem_cgroup_read, | 6255 | .read_u64 = mem_cgroup_read_u64, |
5996 | }, | 6256 | }, |
5997 | { | 6257 | { |
5998 | .name = "stat", | 6258 | .name = "stat", |
5999 | .read_seq_string = memcg_stat_show, | 6259 | .seq_show = memcg_stat_show, |
6000 | }, | 6260 | }, |
6001 | { | 6261 | { |
6002 | .name = "force_empty", | 6262 | .name = "force_empty", |
@@ -6009,6 +6269,12 @@ static struct cftype mem_cgroup_files[] = { | |||
6009 | .read_u64 = mem_cgroup_hierarchy_read, | 6269 | .read_u64 = mem_cgroup_hierarchy_read, |
6010 | }, | 6270 | }, |
6011 | { | 6271 | { |
6272 | .name = "cgroup.event_control", /* XXX: for compat */ | ||
6273 | .write_string = memcg_write_event_control, | ||
6274 | .flags = CFTYPE_NO_PREFIX, | ||
6275 | .mode = S_IWUGO, | ||
6276 | }, | ||
6277 | { | ||
6012 | .name = "swappiness", | 6278 | .name = "swappiness", |
6013 | .read_u64 = mem_cgroup_swappiness_read, | 6279 | .read_u64 = mem_cgroup_swappiness_read, |
6014 | .write_u64 = mem_cgroup_swappiness_write, | 6280 | .write_u64 = mem_cgroup_swappiness_write, |
@@ -6020,21 +6286,17 @@ static struct cftype mem_cgroup_files[] = { | |||
6020 | }, | 6286 | }, |
6021 | { | 6287 | { |
6022 | .name = "oom_control", | 6288 | .name = "oom_control", |
6023 | .read_map = mem_cgroup_oom_control_read, | 6289 | .seq_show = mem_cgroup_oom_control_read, |
6024 | .write_u64 = mem_cgroup_oom_control_write, | 6290 | .write_u64 = mem_cgroup_oom_control_write, |
6025 | .register_event = mem_cgroup_oom_register_event, | ||
6026 | .unregister_event = mem_cgroup_oom_unregister_event, | ||
6027 | .private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL), | 6291 | .private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL), |
6028 | }, | 6292 | }, |
6029 | { | 6293 | { |
6030 | .name = "pressure_level", | 6294 | .name = "pressure_level", |
6031 | .register_event = vmpressure_register_event, | ||
6032 | .unregister_event = vmpressure_unregister_event, | ||
6033 | }, | 6295 | }, |
6034 | #ifdef CONFIG_NUMA | 6296 | #ifdef CONFIG_NUMA |
6035 | { | 6297 | { |
6036 | .name = "numa_stat", | 6298 | .name = "numa_stat", |
6037 | .read_seq_string = memcg_numa_stat_show, | 6299 | .seq_show = memcg_numa_stat_show, |
6038 | }, | 6300 | }, |
6039 | #endif | 6301 | #endif |
6040 | #ifdef CONFIG_MEMCG_KMEM | 6302 | #ifdef CONFIG_MEMCG_KMEM |
@@ -6042,29 +6304,29 @@ static struct cftype mem_cgroup_files[] = { | |||
6042 | .name = "kmem.limit_in_bytes", | 6304 | .name = "kmem.limit_in_bytes", |
6043 | .private = MEMFILE_PRIVATE(_KMEM, RES_LIMIT), | 6305 | .private = MEMFILE_PRIVATE(_KMEM, RES_LIMIT), |
6044 | .write_string = mem_cgroup_write, | 6306 | .write_string = mem_cgroup_write, |
6045 | .read = mem_cgroup_read, | 6307 | .read_u64 = mem_cgroup_read_u64, |
6046 | }, | 6308 | }, |
6047 | { | 6309 | { |
6048 | .name = "kmem.usage_in_bytes", | 6310 | .name = "kmem.usage_in_bytes", |
6049 | .private = MEMFILE_PRIVATE(_KMEM, RES_USAGE), | 6311 | .private = MEMFILE_PRIVATE(_KMEM, RES_USAGE), |
6050 | .read = mem_cgroup_read, | 6312 | .read_u64 = mem_cgroup_read_u64, |
6051 | }, | 6313 | }, |
6052 | { | 6314 | { |
6053 | .name = "kmem.failcnt", | 6315 | .name = "kmem.failcnt", |
6054 | .private = MEMFILE_PRIVATE(_KMEM, RES_FAILCNT), | 6316 | .private = MEMFILE_PRIVATE(_KMEM, RES_FAILCNT), |
6055 | .trigger = mem_cgroup_reset, | 6317 | .trigger = mem_cgroup_reset, |
6056 | .read = mem_cgroup_read, | 6318 | .read_u64 = mem_cgroup_read_u64, |
6057 | }, | 6319 | }, |
6058 | { | 6320 | { |
6059 | .name = "kmem.max_usage_in_bytes", | 6321 | .name = "kmem.max_usage_in_bytes", |
6060 | .private = MEMFILE_PRIVATE(_KMEM, RES_MAX_USAGE), | 6322 | .private = MEMFILE_PRIVATE(_KMEM, RES_MAX_USAGE), |
6061 | .trigger = mem_cgroup_reset, | 6323 | .trigger = mem_cgroup_reset, |
6062 | .read = mem_cgroup_read, | 6324 | .read_u64 = mem_cgroup_read_u64, |
6063 | }, | 6325 | }, |
6064 | #ifdef CONFIG_SLABINFO | 6326 | #ifdef CONFIG_SLABINFO |
6065 | { | 6327 | { |
6066 | .name = "kmem.slabinfo", | 6328 | .name = "kmem.slabinfo", |
6067 | .read_seq_string = mem_cgroup_slabinfo_read, | 6329 | .seq_show = mem_cgroup_slabinfo_read, |
6068 | }, | 6330 | }, |
6069 | #endif | 6331 | #endif |
6070 | #endif | 6332 | #endif |
@@ -6076,27 +6338,25 @@ static struct cftype memsw_cgroup_files[] = { | |||
6076 | { | 6338 | { |
6077 | .name = "memsw.usage_in_bytes", | 6339 | .name = "memsw.usage_in_bytes", |
6078 | .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE), | 6340 | .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE), |
6079 | .read = mem_cgroup_read, | 6341 | .read_u64 = mem_cgroup_read_u64, |
6080 | .register_event = mem_cgroup_usage_register_event, | ||
6081 | .unregister_event = mem_cgroup_usage_unregister_event, | ||
6082 | }, | 6342 | }, |
6083 | { | 6343 | { |
6084 | .name = "memsw.max_usage_in_bytes", | 6344 | .name = "memsw.max_usage_in_bytes", |
6085 | .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE), | 6345 | .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE), |
6086 | .trigger = mem_cgroup_reset, | 6346 | .trigger = mem_cgroup_reset, |
6087 | .read = mem_cgroup_read, | 6347 | .read_u64 = mem_cgroup_read_u64, |
6088 | }, | 6348 | }, |
6089 | { | 6349 | { |
6090 | .name = "memsw.limit_in_bytes", | 6350 | .name = "memsw.limit_in_bytes", |
6091 | .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT), | 6351 | .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT), |
6092 | .write_string = mem_cgroup_write, | 6352 | .write_string = mem_cgroup_write, |
6093 | .read = mem_cgroup_read, | 6353 | .read_u64 = mem_cgroup_read_u64, |
6094 | }, | 6354 | }, |
6095 | { | 6355 | { |
6096 | .name = "memsw.failcnt", | 6356 | .name = "memsw.failcnt", |
6097 | .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT), | 6357 | .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT), |
6098 | .trigger = mem_cgroup_reset, | 6358 | .trigger = mem_cgroup_reset, |
6099 | .read = mem_cgroup_read, | 6359 | .read_u64 = mem_cgroup_read_u64, |
6100 | }, | 6360 | }, |
6101 | { }, /* terminate */ | 6361 | { }, /* terminate */ |
6102 | }; | 6362 | }; |
@@ -6139,14 +6399,12 @@ static void free_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node) | |||
6139 | static struct mem_cgroup *mem_cgroup_alloc(void) | 6399 | static struct mem_cgroup *mem_cgroup_alloc(void) |
6140 | { | 6400 | { |
6141 | struct mem_cgroup *memcg; | 6401 | struct mem_cgroup *memcg; |
6142 | size_t size = memcg_size(); | 6402 | size_t size; |
6143 | 6403 | ||
6144 | /* Can be very big if nr_node_ids is very big */ | 6404 | size = sizeof(struct mem_cgroup); |
6145 | if (size < PAGE_SIZE) | 6405 | size += nr_node_ids * sizeof(struct mem_cgroup_per_node *); |
6146 | memcg = kzalloc(size, GFP_KERNEL); | ||
6147 | else | ||
6148 | memcg = vzalloc(size); | ||
6149 | 6406 | ||
6407 | memcg = kzalloc(size, GFP_KERNEL); | ||
6150 | if (!memcg) | 6408 | if (!memcg) |
6151 | return NULL; | 6409 | return NULL; |
6152 | 6410 | ||
@@ -6157,10 +6415,7 @@ static struct mem_cgroup *mem_cgroup_alloc(void) | |||
6157 | return memcg; | 6415 | return memcg; |
6158 | 6416 | ||
6159 | out_free: | 6417 | out_free: |
6160 | if (size < PAGE_SIZE) | 6418 | kfree(memcg); |
6161 | kfree(memcg); | ||
6162 | else | ||
6163 | vfree(memcg); | ||
6164 | return NULL; | 6419 | return NULL; |
6165 | } | 6420 | } |
6166 | 6421 | ||
@@ -6178,7 +6433,6 @@ out_free: | |||
6178 | static void __mem_cgroup_free(struct mem_cgroup *memcg) | 6433 | static void __mem_cgroup_free(struct mem_cgroup *memcg) |
6179 | { | 6434 | { |
6180 | int node; | 6435 | int node; |
6181 | size_t size = memcg_size(); | ||
6182 | 6436 | ||
6183 | mem_cgroup_remove_from_trees(memcg); | 6437 | mem_cgroup_remove_from_trees(memcg); |
6184 | 6438 | ||
@@ -6199,10 +6453,7 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg) | |||
6199 | * the cgroup_lock. | 6453 | * the cgroup_lock. |
6200 | */ | 6454 | */ |
6201 | disarm_static_keys(memcg); | 6455 | disarm_static_keys(memcg); |
6202 | if (size < PAGE_SIZE) | 6456 | kfree(memcg); |
6203 | kfree(memcg); | ||
6204 | else | ||
6205 | vfree(memcg); | ||
6206 | } | 6457 | } |
6207 | 6458 | ||
6208 | /* | 6459 | /* |
@@ -6268,6 +6519,8 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) | |||
6268 | mutex_init(&memcg->thresholds_lock); | 6519 | mutex_init(&memcg->thresholds_lock); |
6269 | spin_lock_init(&memcg->move_lock); | 6520 | spin_lock_init(&memcg->move_lock); |
6270 | vmpressure_init(&memcg->vmpressure); | 6521 | vmpressure_init(&memcg->vmpressure); |
6522 | INIT_LIST_HEAD(&memcg->event_list); | ||
6523 | spin_lock_init(&memcg->event_list_lock); | ||
6271 | 6524 | ||
6272 | return &memcg->css; | 6525 | return &memcg->css; |
6273 | 6526 | ||
@@ -6281,7 +6534,6 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css) | |||
6281 | { | 6534 | { |
6282 | struct mem_cgroup *memcg = mem_cgroup_from_css(css); | 6535 | struct mem_cgroup *memcg = mem_cgroup_from_css(css); |
6283 | struct mem_cgroup *parent = mem_cgroup_from_css(css_parent(css)); | 6536 | struct mem_cgroup *parent = mem_cgroup_from_css(css_parent(css)); |
6284 | int error = 0; | ||
6285 | 6537 | ||
6286 | if (css->cgroup->id > MEM_CGROUP_ID_MAX) | 6538 | if (css->cgroup->id > MEM_CGROUP_ID_MAX) |
6287 | return -ENOSPC; | 6539 | return -ENOSPC; |
@@ -6316,10 +6568,9 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css) | |||
6316 | if (parent != root_mem_cgroup) | 6568 | if (parent != root_mem_cgroup) |
6317 | mem_cgroup_subsys.broken_hierarchy = true; | 6569 | mem_cgroup_subsys.broken_hierarchy = true; |
6318 | } | 6570 | } |
6319 | |||
6320 | error = memcg_init_kmem(memcg, &mem_cgroup_subsys); | ||
6321 | mutex_unlock(&memcg_create_mutex); | 6571 | mutex_unlock(&memcg_create_mutex); |
6322 | return error; | 6572 | |
6573 | return memcg_init_kmem(memcg, &mem_cgroup_subsys); | ||
6323 | } | 6574 | } |
6324 | 6575 | ||
6325 | /* | 6576 | /* |
@@ -6343,6 +6594,19 @@ static void mem_cgroup_invalidate_reclaim_iterators(struct mem_cgroup *memcg) | |||
6343 | static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) | 6594 | static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) |
6344 | { | 6595 | { |
6345 | struct mem_cgroup *memcg = mem_cgroup_from_css(css); | 6596 | struct mem_cgroup *memcg = mem_cgroup_from_css(css); |
6597 | struct mem_cgroup_event *event, *tmp; | ||
6598 | |||
6599 | /* | ||
6600 | * Unregister events and notify userspace. | ||
6601 | * Notify userspace about cgroup removing only after rmdir of cgroup | ||
6602 | * directory to avoid race between userspace and kernelspace. | ||
6603 | */ | ||
6604 | spin_lock(&memcg->event_list_lock); | ||
6605 | list_for_each_entry_safe(event, tmp, &memcg->event_list, list) { | ||
6606 | list_del_init(&event->list); | ||
6607 | schedule_work(&event->remove); | ||
6608 | } | ||
6609 | spin_unlock(&memcg->event_list_lock); | ||
6346 | 6610 | ||
6347 | kmem_cgroup_css_offline(memcg); | 6611 | kmem_cgroup_css_offline(memcg); |
6348 | 6612 | ||
@@ -6615,7 +6879,7 @@ static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma, | |||
6615 | enum mc_target_type ret = MC_TARGET_NONE; | 6879 | enum mc_target_type ret = MC_TARGET_NONE; |
6616 | 6880 | ||
6617 | page = pmd_page(pmd); | 6881 | page = pmd_page(pmd); |
6618 | VM_BUG_ON(!page || !PageHead(page)); | 6882 | VM_BUG_ON_PAGE(!page || !PageHead(page), page); |
6619 | if (!move_anon()) | 6883 | if (!move_anon()) |
6620 | return ret; | 6884 | return ret; |
6621 | pc = lookup_page_cgroup(page); | 6885 | pc = lookup_page_cgroup(page); |
diff --git a/mm/memory-failure.c b/mm/memory-failure.c index fabe55046c1d..4f08a2d61487 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c | |||
@@ -611,7 +611,7 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn) | |||
611 | } | 611 | } |
612 | 612 | ||
613 | /* | 613 | /* |
614 | * Dirty cache page page | 614 | * Dirty pagecache page |
615 | * Issues: when the error hit a hole page the error is not properly | 615 | * Issues: when the error hit a hole page the error is not properly |
616 | * propagated. | 616 | * propagated. |
617 | */ | 617 | */ |
@@ -856,14 +856,14 @@ static int page_action(struct page_state *ps, struct page *p, | |||
856 | * the pages and send SIGBUS to the processes if the data was dirty. | 856 | * the pages and send SIGBUS to the processes if the data was dirty. |
857 | */ | 857 | */ |
858 | static int hwpoison_user_mappings(struct page *p, unsigned long pfn, | 858 | static int hwpoison_user_mappings(struct page *p, unsigned long pfn, |
859 | int trapno, int flags) | 859 | int trapno, int flags, struct page **hpagep) |
860 | { | 860 | { |
861 | enum ttu_flags ttu = TTU_UNMAP | TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS; | 861 | enum ttu_flags ttu = TTU_UNMAP | TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS; |
862 | struct address_space *mapping; | 862 | struct address_space *mapping; |
863 | LIST_HEAD(tokill); | 863 | LIST_HEAD(tokill); |
864 | int ret; | 864 | int ret; |
865 | int kill = 1, forcekill; | 865 | int kill = 1, forcekill; |
866 | struct page *hpage = compound_head(p); | 866 | struct page *hpage = *hpagep; |
867 | struct page *ppage; | 867 | struct page *ppage; |
868 | 868 | ||
869 | if (PageReserved(p) || PageSlab(p)) | 869 | if (PageReserved(p) || PageSlab(p)) |
@@ -942,11 +942,14 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, | |||
942 | * We pinned the head page for hwpoison handling, | 942 | * We pinned the head page for hwpoison handling, |
943 | * now we split the thp and we are interested in | 943 | * now we split the thp and we are interested in |
944 | * the hwpoisoned raw page, so move the refcount | 944 | * the hwpoisoned raw page, so move the refcount |
945 | * to it. | 945 | * to it. Similarly, page lock is shifted. |
946 | */ | 946 | */ |
947 | if (hpage != p) { | 947 | if (hpage != p) { |
948 | put_page(hpage); | 948 | put_page(hpage); |
949 | get_page(p); | 949 | get_page(p); |
950 | lock_page(p); | ||
951 | unlock_page(hpage); | ||
952 | *hpagep = p; | ||
950 | } | 953 | } |
951 | /* THP is split, so ppage should be the real poisoned page. */ | 954 | /* THP is split, so ppage should be the real poisoned page. */ |
952 | ppage = p; | 955 | ppage = p; |
@@ -964,17 +967,11 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, | |||
964 | if (kill) | 967 | if (kill) |
965 | collect_procs(ppage, &tokill); | 968 | collect_procs(ppage, &tokill); |
966 | 969 | ||
967 | if (hpage != ppage) | ||
968 | lock_page(ppage); | ||
969 | |||
970 | ret = try_to_unmap(ppage, ttu); | 970 | ret = try_to_unmap(ppage, ttu); |
971 | if (ret != SWAP_SUCCESS) | 971 | if (ret != SWAP_SUCCESS) |
972 | printk(KERN_ERR "MCE %#lx: failed to unmap page (mapcount=%d)\n", | 972 | printk(KERN_ERR "MCE %#lx: failed to unmap page (mapcount=%d)\n", |
973 | pfn, page_mapcount(ppage)); | 973 | pfn, page_mapcount(ppage)); |
974 | 974 | ||
975 | if (hpage != ppage) | ||
976 | unlock_page(ppage); | ||
977 | |||
978 | /* | 975 | /* |
979 | * Now that the dirty bit has been propagated to the | 976 | * Now that the dirty bit has been propagated to the |
980 | * struct page and all unmaps done we can decide if | 977 | * struct page and all unmaps done we can decide if |
@@ -1193,8 +1190,12 @@ int memory_failure(unsigned long pfn, int trapno, int flags) | |||
1193 | /* | 1190 | /* |
1194 | * Now take care of user space mappings. | 1191 | * Now take care of user space mappings. |
1195 | * Abort on fail: __delete_from_page_cache() assumes unmapped page. | 1192 | * Abort on fail: __delete_from_page_cache() assumes unmapped page. |
1193 | * | ||
1194 | * When the raw error page is thp tail page, hpage points to the raw | ||
1195 | * page after thp split. | ||
1196 | */ | 1196 | */ |
1197 | if (hwpoison_user_mappings(p, pfn, trapno, flags) != SWAP_SUCCESS) { | 1197 | if (hwpoison_user_mappings(p, pfn, trapno, flags, &hpage) |
1198 | != SWAP_SUCCESS) { | ||
1198 | printk(KERN_ERR "MCE %#lx: cannot unmap page, give up\n", pfn); | 1199 | printk(KERN_ERR "MCE %#lx: cannot unmap page, give up\n", pfn); |
1199 | res = -EBUSY; | 1200 | res = -EBUSY; |
1200 | goto out; | 1201 | goto out; |
@@ -1585,7 +1586,13 @@ static int __soft_offline_page(struct page *page, int flags) | |||
1585 | ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, | 1586 | ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, |
1586 | MIGRATE_SYNC, MR_MEMORY_FAILURE); | 1587 | MIGRATE_SYNC, MR_MEMORY_FAILURE); |
1587 | if (ret) { | 1588 | if (ret) { |
1588 | putback_lru_pages(&pagelist); | 1589 | if (!list_empty(&pagelist)) { |
1590 | list_del(&page->lru); | ||
1591 | dec_zone_page_state(page, NR_ISOLATED_ANON + | ||
1592 | page_is_file_cache(page)); | ||
1593 | putback_lru_page(page); | ||
1594 | } | ||
1595 | |||
1589 | pr_info("soft offline: %#lx: migration failed %d, type %lx\n", | 1596 | pr_info("soft offline: %#lx: migration failed %d, type %lx\n", |
1590 | pfn, ret, page->flags); | 1597 | pfn, ret, page->flags); |
1591 | if (ret > 0) | 1598 | if (ret > 0) |
diff --git a/mm/memory.c b/mm/memory.c index 6768ce9e57d2..be6a0c0d4ae0 100644 --- a/mm/memory.c +++ b/mm/memory.c | |||
@@ -59,6 +59,7 @@ | |||
59 | #include <linux/gfp.h> | 59 | #include <linux/gfp.h> |
60 | #include <linux/migrate.h> | 60 | #include <linux/migrate.h> |
61 | #include <linux/string.h> | 61 | #include <linux/string.h> |
62 | #include <linux/dma-debug.h> | ||
62 | 63 | ||
63 | #include <asm/io.h> | 64 | #include <asm/io.h> |
64 | #include <asm/pgalloc.h> | 65 | #include <asm/pgalloc.h> |
@@ -288,7 +289,7 @@ int __tlb_remove_page(struct mmu_gather *tlb, struct page *page) | |||
288 | return 0; | 289 | return 0; |
289 | batch = tlb->active; | 290 | batch = tlb->active; |
290 | } | 291 | } |
291 | VM_BUG_ON(batch->nr > batch->max); | 292 | VM_BUG_ON_PAGE(batch->nr > batch->max, page); |
292 | 293 | ||
293 | return batch->max - batch->nr; | 294 | return batch->max - batch->nr; |
294 | } | 295 | } |
@@ -670,7 +671,7 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr, | |||
670 | current->comm, | 671 | current->comm, |
671 | (long long)pte_val(pte), (long long)pmd_val(*pmd)); | 672 | (long long)pte_val(pte), (long long)pmd_val(*pmd)); |
672 | if (page) | 673 | if (page) |
673 | dump_page(page); | 674 | dump_page(page, "bad pte"); |
674 | printk(KERN_ALERT | 675 | printk(KERN_ALERT |
675 | "addr:%p vm_flags:%08lx anon_vma:%p mapping:%p index:%lx\n", | 676 | "addr:%p vm_flags:%08lx anon_vma:%p mapping:%p index:%lx\n", |
676 | (void *)addr, vma->vm_flags, vma->anon_vma, mapping, index); | 677 | (void *)addr, vma->vm_flags, vma->anon_vma, mapping, index); |
@@ -2559,6 +2560,8 @@ static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd, | |||
2559 | 2560 | ||
2560 | static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va, struct vm_area_struct *vma) | 2561 | static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va, struct vm_area_struct *vma) |
2561 | { | 2562 | { |
2563 | debug_dma_assert_idle(src); | ||
2564 | |||
2562 | /* | 2565 | /* |
2563 | * If the source page was a PFN mapping, we don't have | 2566 | * If the source page was a PFN mapping, we don't have |
2564 | * a "struct page" for it. We do a best-effort copy by | 2567 | * a "struct page" for it. We do a best-effort copy by |
@@ -2699,7 +2702,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2699 | goto unwritable_page; | 2702 | goto unwritable_page; |
2700 | } | 2703 | } |
2701 | } else | 2704 | } else |
2702 | VM_BUG_ON(!PageLocked(old_page)); | 2705 | VM_BUG_ON_PAGE(!PageLocked(old_page), old_page); |
2703 | 2706 | ||
2704 | /* | 2707 | /* |
2705 | * Since we dropped the lock we need to revalidate | 2708 | * Since we dropped the lock we need to revalidate |
@@ -3355,7 +3358,7 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
3355 | if (unlikely(!(ret & VM_FAULT_LOCKED))) | 3358 | if (unlikely(!(ret & VM_FAULT_LOCKED))) |
3356 | lock_page(vmf.page); | 3359 | lock_page(vmf.page); |
3357 | else | 3360 | else |
3358 | VM_BUG_ON(!PageLocked(vmf.page)); | 3361 | VM_BUG_ON_PAGE(!PageLocked(vmf.page), vmf.page); |
3359 | 3362 | ||
3360 | /* | 3363 | /* |
3361 | * Should we do an early C-O-W break? | 3364 | * Should we do an early C-O-W break? |
@@ -3392,7 +3395,7 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
3392 | goto unwritable_page; | 3395 | goto unwritable_page; |
3393 | } | 3396 | } |
3394 | } else | 3397 | } else |
3395 | VM_BUG_ON(!PageLocked(page)); | 3398 | VM_BUG_ON_PAGE(!PageLocked(page), page); |
3396 | page_mkwrite = 1; | 3399 | page_mkwrite = 1; |
3397 | } | 3400 | } |
3398 | } | 3401 | } |
@@ -4272,11 +4275,20 @@ void copy_user_huge_page(struct page *dst, struct page *src, | |||
4272 | #endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */ | 4275 | #endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */ |
4273 | 4276 | ||
4274 | #if USE_SPLIT_PTE_PTLOCKS && ALLOC_SPLIT_PTLOCKS | 4277 | #if USE_SPLIT_PTE_PTLOCKS && ALLOC_SPLIT_PTLOCKS |
4278 | |||
4279 | static struct kmem_cache *page_ptl_cachep; | ||
4280 | |||
4281 | void __init ptlock_cache_init(void) | ||
4282 | { | ||
4283 | page_ptl_cachep = kmem_cache_create("page->ptl", sizeof(spinlock_t), 0, | ||
4284 | SLAB_PANIC, NULL); | ||
4285 | } | ||
4286 | |||
4275 | bool ptlock_alloc(struct page *page) | 4287 | bool ptlock_alloc(struct page *page) |
4276 | { | 4288 | { |
4277 | spinlock_t *ptl; | 4289 | spinlock_t *ptl; |
4278 | 4290 | ||
4279 | ptl = kmalloc(sizeof(spinlock_t), GFP_KERNEL); | 4291 | ptl = kmem_cache_alloc(page_ptl_cachep, GFP_KERNEL); |
4280 | if (!ptl) | 4292 | if (!ptl) |
4281 | return false; | 4293 | return false; |
4282 | page->ptl = ptl; | 4294 | page->ptl = ptl; |
@@ -4285,6 +4297,6 @@ bool ptlock_alloc(struct page *page) | |||
4285 | 4297 | ||
4286 | void ptlock_free(struct page *page) | 4298 | void ptlock_free(struct page *page) |
4287 | { | 4299 | { |
4288 | kfree(page->ptl); | 4300 | kmem_cache_free(page_ptl_cachep, page->ptl); |
4289 | } | 4301 | } |
4290 | #endif | 4302 | #endif |
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 489f235502db..a650db29606f 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c | |||
@@ -9,7 +9,6 @@ | |||
9 | #include <linux/swap.h> | 9 | #include <linux/swap.h> |
10 | #include <linux/interrupt.h> | 10 | #include <linux/interrupt.h> |
11 | #include <linux/pagemap.h> | 11 | #include <linux/pagemap.h> |
12 | #include <linux/bootmem.h> | ||
13 | #include <linux/compiler.h> | 12 | #include <linux/compiler.h> |
14 | #include <linux/export.h> | 13 | #include <linux/export.h> |
15 | #include <linux/pagevec.h> | 14 | #include <linux/pagevec.h> |
@@ -269,7 +268,7 @@ static void fix_zone_id(struct zone *zone, unsigned long start_pfn, | |||
269 | } | 268 | } |
270 | 269 | ||
271 | /* Can fail with -ENOMEM from allocating a wait table with vmalloc() or | 270 | /* Can fail with -ENOMEM from allocating a wait table with vmalloc() or |
272 | * alloc_bootmem_node_nopanic() */ | 271 | * alloc_bootmem_node_nopanic()/memblock_virt_alloc_node_nopanic() */ |
273 | static int __ref ensure_zone_is_initialized(struct zone *zone, | 272 | static int __ref ensure_zone_is_initialized(struct zone *zone, |
274 | unsigned long start_pfn, unsigned long num_pages) | 273 | unsigned long start_pfn, unsigned long num_pages) |
275 | { | 274 | { |
@@ -1108,17 +1107,18 @@ int __ref add_memory(int nid, u64 start, u64 size) | |||
1108 | if (ret) | 1107 | if (ret) |
1109 | return ret; | 1108 | return ret; |
1110 | 1109 | ||
1111 | lock_memory_hotplug(); | ||
1112 | |||
1113 | res = register_memory_resource(start, size); | 1110 | res = register_memory_resource(start, size); |
1114 | ret = -EEXIST; | 1111 | ret = -EEXIST; |
1115 | if (!res) | 1112 | if (!res) |
1116 | goto out; | 1113 | return ret; |
1117 | 1114 | ||
1118 | { /* Stupid hack to suppress address-never-null warning */ | 1115 | { /* Stupid hack to suppress address-never-null warning */ |
1119 | void *p = NODE_DATA(nid); | 1116 | void *p = NODE_DATA(nid); |
1120 | new_pgdat = !p; | 1117 | new_pgdat = !p; |
1121 | } | 1118 | } |
1119 | |||
1120 | lock_memory_hotplug(); | ||
1121 | |||
1122 | new_node = !node_online(nid); | 1122 | new_node = !node_online(nid); |
1123 | if (new_node) { | 1123 | if (new_node) { |
1124 | pgdat = hotadd_new_pgdat(nid, start); | 1124 | pgdat = hotadd_new_pgdat(nid, start); |
@@ -1310,7 +1310,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) | |||
1310 | #ifdef CONFIG_DEBUG_VM | 1310 | #ifdef CONFIG_DEBUG_VM |
1311 | printk(KERN_ALERT "removing pfn %lx from LRU failed\n", | 1311 | printk(KERN_ALERT "removing pfn %lx from LRU failed\n", |
1312 | pfn); | 1312 | pfn); |
1313 | dump_page(page); | 1313 | dump_page(page, "failed to remove from LRU"); |
1314 | #endif | 1314 | #endif |
1315 | put_page(page); | 1315 | put_page(page); |
1316 | /* Because we don't have big zone->lock. we should | 1316 | /* Because we don't have big zone->lock. we should |
@@ -1446,6 +1446,7 @@ static int __init cmdline_parse_movable_node(char *p) | |||
1446 | * the kernel away from hotpluggable memory. | 1446 | * the kernel away from hotpluggable memory. |
1447 | */ | 1447 | */ |
1448 | memblock_set_bottom_up(true); | 1448 | memblock_set_bottom_up(true); |
1449 | movable_node_enabled = true; | ||
1449 | #else | 1450 | #else |
1450 | pr_warn("movable_node option not supported\n"); | 1451 | pr_warn("movable_node option not supported\n"); |
1451 | #endif | 1452 | #endif |
diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 0cd2c4d4e270..ae3c8f3595d4 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c | |||
@@ -613,7 +613,7 @@ static inline int queue_pages_pgd_range(struct vm_area_struct *vma, | |||
613 | return 0; | 613 | return 0; |
614 | } | 614 | } |
615 | 615 | ||
616 | #ifdef CONFIG_ARCH_USES_NUMA_PROT_NONE | 616 | #ifdef CONFIG_NUMA_BALANCING |
617 | /* | 617 | /* |
618 | * This is used to mark a range of virtual addresses to be inaccessible. | 618 | * This is used to mark a range of virtual addresses to be inaccessible. |
619 | * These are later cleared by a NUMA hinting fault. Depending on these | 619 | * These are later cleared by a NUMA hinting fault. Depending on these |
@@ -627,7 +627,6 @@ unsigned long change_prot_numa(struct vm_area_struct *vma, | |||
627 | unsigned long addr, unsigned long end) | 627 | unsigned long addr, unsigned long end) |
628 | { | 628 | { |
629 | int nr_updated; | 629 | int nr_updated; |
630 | BUILD_BUG_ON(_PAGE_NUMA != _PAGE_PROTNONE); | ||
631 | 630 | ||
632 | nr_updated = change_protection(vma, addr, end, vma->vm_page_prot, 0, 1); | 631 | nr_updated = change_protection(vma, addr, end, vma->vm_page_prot, 0, 1); |
633 | if (nr_updated) | 632 | if (nr_updated) |
@@ -641,7 +640,7 @@ static unsigned long change_prot_numa(struct vm_area_struct *vma, | |||
641 | { | 640 | { |
642 | return 0; | 641 | return 0; |
643 | } | 642 | } |
644 | #endif /* CONFIG_ARCH_USES_NUMA_PROT_NONE */ | 643 | #endif /* CONFIG_NUMA_BALANCING */ |
645 | 644 | ||
646 | /* | 645 | /* |
647 | * Walk through page tables and collect pages to be migrated. | 646 | * Walk through page tables and collect pages to be migrated. |
@@ -1199,10 +1198,8 @@ static struct page *new_vma_page(struct page *page, unsigned long private, int * | |||
1199 | } | 1198 | } |
1200 | 1199 | ||
1201 | if (PageHuge(page)) { | 1200 | if (PageHuge(page)) { |
1202 | if (vma) | 1201 | BUG_ON(!vma); |
1203 | return alloc_huge_page_noerr(vma, address, 1); | 1202 | return alloc_huge_page_noerr(vma, address, 1); |
1204 | else | ||
1205 | return NULL; | ||
1206 | } | 1203 | } |
1207 | /* | 1204 | /* |
1208 | * if !vma, alloc_page_vma() will use task or system default policy | 1205 | * if !vma, alloc_page_vma() will use task or system default policy |
@@ -2657,7 +2654,7 @@ void mpol_free_shared_policy(struct shared_policy *p) | |||
2657 | } | 2654 | } |
2658 | 2655 | ||
2659 | #ifdef CONFIG_NUMA_BALANCING | 2656 | #ifdef CONFIG_NUMA_BALANCING |
2660 | static bool __initdata numabalancing_override; | 2657 | static int __initdata numabalancing_override; |
2661 | 2658 | ||
2662 | static void __init check_numabalancing_enable(void) | 2659 | static void __init check_numabalancing_enable(void) |
2663 | { | 2660 | { |
@@ -2666,9 +2663,15 @@ static void __init check_numabalancing_enable(void) | |||
2666 | if (IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED)) | 2663 | if (IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED)) |
2667 | numabalancing_default = true; | 2664 | numabalancing_default = true; |
2668 | 2665 | ||
2666 | /* Parsed by setup_numabalancing. override == 1 enables, -1 disables */ | ||
2667 | if (numabalancing_override) | ||
2668 | set_numabalancing_state(numabalancing_override == 1); | ||
2669 | |||
2669 | if (nr_node_ids > 1 && !numabalancing_override) { | 2670 | if (nr_node_ids > 1 && !numabalancing_override) { |
2670 | printk(KERN_INFO "Enabling automatic NUMA balancing. " | 2671 | pr_info("%s automatic NUMA balancing. " |
2671 | "Configure with numa_balancing= or sysctl"); | 2672 | "Configure with numa_balancing= or the " |
2673 | "kernel.numa_balancing sysctl", | ||
2674 | numabalancing_default ? "Enabling" : "Disabling"); | ||
2672 | set_numabalancing_state(numabalancing_default); | 2675 | set_numabalancing_state(numabalancing_default); |
2673 | } | 2676 | } |
2674 | } | 2677 | } |
@@ -2678,18 +2681,17 @@ static int __init setup_numabalancing(char *str) | |||
2678 | int ret = 0; | 2681 | int ret = 0; |
2679 | if (!str) | 2682 | if (!str) |
2680 | goto out; | 2683 | goto out; |
2681 | numabalancing_override = true; | ||
2682 | 2684 | ||
2683 | if (!strcmp(str, "enable")) { | 2685 | if (!strcmp(str, "enable")) { |
2684 | set_numabalancing_state(true); | 2686 | numabalancing_override = 1; |
2685 | ret = 1; | 2687 | ret = 1; |
2686 | } else if (!strcmp(str, "disable")) { | 2688 | } else if (!strcmp(str, "disable")) { |
2687 | set_numabalancing_state(false); | 2689 | numabalancing_override = -1; |
2688 | ret = 1; | 2690 | ret = 1; |
2689 | } | 2691 | } |
2690 | out: | 2692 | out: |
2691 | if (!ret) | 2693 | if (!ret) |
2692 | printk(KERN_WARNING "Unable to parse numa_balancing=\n"); | 2694 | pr_warn("Unable to parse numa_balancing=\n"); |
2693 | 2695 | ||
2694 | return ret; | 2696 | return ret; |
2695 | } | 2697 | } |
@@ -2928,7 +2930,7 @@ void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol) | |||
2928 | unsigned short mode = MPOL_DEFAULT; | 2930 | unsigned short mode = MPOL_DEFAULT; |
2929 | unsigned short flags = 0; | 2931 | unsigned short flags = 0; |
2930 | 2932 | ||
2931 | if (pol && pol != &default_policy) { | 2933 | if (pol && pol != &default_policy && !(pol->flags & MPOL_F_MORON)) { |
2932 | mode = pol->mode; | 2934 | mode = pol->mode; |
2933 | flags = pol->flags; | 2935 | flags = pol->flags; |
2934 | } | 2936 | } |
diff --git a/mm/migrate.c b/mm/migrate.c index 9194375b2307..482a33d89134 100644 --- a/mm/migrate.c +++ b/mm/migrate.c | |||
@@ -72,28 +72,12 @@ int migrate_prep_local(void) | |||
72 | } | 72 | } |
73 | 73 | ||
74 | /* | 74 | /* |
75 | * Add isolated pages on the list back to the LRU under page lock | ||
76 | * to avoid leaking evictable pages back onto unevictable list. | ||
77 | */ | ||
78 | void putback_lru_pages(struct list_head *l) | ||
79 | { | ||
80 | struct page *page; | ||
81 | struct page *page2; | ||
82 | |||
83 | list_for_each_entry_safe(page, page2, l, lru) { | ||
84 | list_del(&page->lru); | ||
85 | dec_zone_page_state(page, NR_ISOLATED_ANON + | ||
86 | page_is_file_cache(page)); | ||
87 | putback_lru_page(page); | ||
88 | } | ||
89 | } | ||
90 | |||
91 | /* | ||
92 | * Put previously isolated pages back onto the appropriate lists | 75 | * Put previously isolated pages back onto the appropriate lists |
93 | * from where they were once taken off for compaction/migration. | 76 | * from where they were once taken off for compaction/migration. |
94 | * | 77 | * |
95 | * This function shall be used instead of putback_lru_pages(), | 78 | * This function shall be used whenever the isolated pageset has been |
96 | * whenever the isolated pageset has been built by isolate_migratepages_range() | 79 | * built from lru, balloon, hugetlbfs page. See isolate_migratepages_range() |
80 | * and isolate_huge_page(). | ||
97 | */ | 81 | */ |
98 | void putback_movable_pages(struct list_head *l) | 82 | void putback_movable_pages(struct list_head *l) |
99 | { | 83 | { |
@@ -199,7 +183,12 @@ out: | |||
199 | */ | 183 | */ |
200 | static void remove_migration_ptes(struct page *old, struct page *new) | 184 | static void remove_migration_ptes(struct page *old, struct page *new) |
201 | { | 185 | { |
202 | rmap_walk(new, remove_migration_pte, old); | 186 | struct rmap_walk_control rwc = { |
187 | .rmap_one = remove_migration_pte, | ||
188 | .arg = old, | ||
189 | }; | ||
190 | |||
191 | rmap_walk(new, &rwc); | ||
203 | } | 192 | } |
204 | 193 | ||
205 | /* | 194 | /* |
@@ -510,7 +499,7 @@ void migrate_page_copy(struct page *newpage, struct page *page) | |||
510 | if (PageUptodate(page)) | 499 | if (PageUptodate(page)) |
511 | SetPageUptodate(newpage); | 500 | SetPageUptodate(newpage); |
512 | if (TestClearPageActive(page)) { | 501 | if (TestClearPageActive(page)) { |
513 | VM_BUG_ON(PageUnevictable(page)); | 502 | VM_BUG_ON_PAGE(PageUnevictable(page), page); |
514 | SetPageActive(newpage); | 503 | SetPageActive(newpage); |
515 | } else if (TestClearPageUnevictable(page)) | 504 | } else if (TestClearPageUnevictable(page)) |
516 | SetPageUnevictable(newpage); | 505 | SetPageUnevictable(newpage); |
@@ -563,14 +552,6 @@ void migrate_page_copy(struct page *newpage, struct page *page) | |||
563 | * Migration functions | 552 | * Migration functions |
564 | ***********************************************************/ | 553 | ***********************************************************/ |
565 | 554 | ||
566 | /* Always fail migration. Used for mappings that are not movable */ | ||
567 | int fail_migrate_page(struct address_space *mapping, | ||
568 | struct page *newpage, struct page *page) | ||
569 | { | ||
570 | return -EIO; | ||
571 | } | ||
572 | EXPORT_SYMBOL(fail_migrate_page); | ||
573 | |||
574 | /* | 555 | /* |
575 | * Common logic to directly migrate a single page suitable for | 556 | * Common logic to directly migrate a single page suitable for |
576 | * pages that do not use PagePrivate/PagePrivate2. | 557 | * pages that do not use PagePrivate/PagePrivate2. |
@@ -890,7 +871,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage, | |||
890 | * free the metadata, so the page can be freed. | 871 | * free the metadata, so the page can be freed. |
891 | */ | 872 | */ |
892 | if (!page->mapping) { | 873 | if (!page->mapping) { |
893 | VM_BUG_ON(PageAnon(page)); | 874 | VM_BUG_ON_PAGE(PageAnon(page), page); |
894 | if (page_has_private(page)) { | 875 | if (page_has_private(page)) { |
895 | try_to_free_buffers(page); | 876 | try_to_free_buffers(page); |
896 | goto uncharge; | 877 | goto uncharge; |
@@ -1008,7 +989,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, | |||
1008 | { | 989 | { |
1009 | int rc = 0; | 990 | int rc = 0; |
1010 | int *result = NULL; | 991 | int *result = NULL; |
1011 | struct page *new_hpage = get_new_page(hpage, private, &result); | 992 | struct page *new_hpage; |
1012 | struct anon_vma *anon_vma = NULL; | 993 | struct anon_vma *anon_vma = NULL; |
1013 | 994 | ||
1014 | /* | 995 | /* |
@@ -1018,9 +999,12 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, | |||
1018 | * tables or check whether the hugepage is pmd-based or not before | 999 | * tables or check whether the hugepage is pmd-based or not before |
1019 | * kicking migration. | 1000 | * kicking migration. |
1020 | */ | 1001 | */ |
1021 | if (!hugepage_migration_support(page_hstate(hpage))) | 1002 | if (!hugepage_migration_support(page_hstate(hpage))) { |
1003 | putback_active_hugepage(hpage); | ||
1022 | return -ENOSYS; | 1004 | return -ENOSYS; |
1005 | } | ||
1023 | 1006 | ||
1007 | new_hpage = get_new_page(hpage, private, &result); | ||
1024 | if (!new_hpage) | 1008 | if (!new_hpage) |
1025 | return -ENOMEM; | 1009 | return -ENOMEM; |
1026 | 1010 | ||
@@ -1120,7 +1104,12 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page, | |||
1120 | nr_succeeded++; | 1104 | nr_succeeded++; |
1121 | break; | 1105 | break; |
1122 | default: | 1106 | default: |
1123 | /* Permanent failure */ | 1107 | /* |
1108 | * Permanent failure (-EBUSY, -ENOSYS, etc.): | ||
1109 | * unlike -EAGAIN case, the failed page is | ||
1110 | * removed from migration page list and not | ||
1111 | * retried in the next outer loop. | ||
1112 | */ | ||
1124 | nr_failed++; | 1113 | nr_failed++; |
1125 | break; | 1114 | break; |
1126 | } | 1115 | } |
@@ -1559,8 +1548,6 @@ static struct page *alloc_misplaced_dst_page(struct page *page, | |||
1559 | __GFP_NOMEMALLOC | __GFP_NORETRY | | 1548 | __GFP_NOMEMALLOC | __GFP_NORETRY | |
1560 | __GFP_NOWARN) & | 1549 | __GFP_NOWARN) & |
1561 | ~GFP_IOFS, 0); | 1550 | ~GFP_IOFS, 0); |
1562 | if (newpage) | ||
1563 | page_cpupid_xchg_last(newpage, page_cpupid_last(page)); | ||
1564 | 1551 | ||
1565 | return newpage; | 1552 | return newpage; |
1566 | } | 1553 | } |
@@ -1594,35 +1581,42 @@ bool migrate_ratelimited(int node) | |||
1594 | } | 1581 | } |
1595 | 1582 | ||
1596 | /* Returns true if the node is migrate rate-limited after the update */ | 1583 | /* Returns true if the node is migrate rate-limited after the update */ |
1597 | bool numamigrate_update_ratelimit(pg_data_t *pgdat, unsigned long nr_pages) | 1584 | static bool numamigrate_update_ratelimit(pg_data_t *pgdat, |
1585 | unsigned long nr_pages) | ||
1598 | { | 1586 | { |
1599 | bool rate_limited = false; | ||
1600 | |||
1601 | /* | 1587 | /* |
1602 | * Rate-limit the amount of data that is being migrated to a node. | 1588 | * Rate-limit the amount of data that is being migrated to a node. |
1603 | * Optimal placement is no good if the memory bus is saturated and | 1589 | * Optimal placement is no good if the memory bus is saturated and |
1604 | * all the time is being spent migrating! | 1590 | * all the time is being spent migrating! |
1605 | */ | 1591 | */ |
1606 | spin_lock(&pgdat->numabalancing_migrate_lock); | ||
1607 | if (time_after(jiffies, pgdat->numabalancing_migrate_next_window)) { | 1592 | if (time_after(jiffies, pgdat->numabalancing_migrate_next_window)) { |
1593 | spin_lock(&pgdat->numabalancing_migrate_lock); | ||
1608 | pgdat->numabalancing_migrate_nr_pages = 0; | 1594 | pgdat->numabalancing_migrate_nr_pages = 0; |
1609 | pgdat->numabalancing_migrate_next_window = jiffies + | 1595 | pgdat->numabalancing_migrate_next_window = jiffies + |
1610 | msecs_to_jiffies(migrate_interval_millisecs); | 1596 | msecs_to_jiffies(migrate_interval_millisecs); |
1597 | spin_unlock(&pgdat->numabalancing_migrate_lock); | ||
1611 | } | 1598 | } |
1612 | if (pgdat->numabalancing_migrate_nr_pages > ratelimit_pages) | 1599 | if (pgdat->numabalancing_migrate_nr_pages > ratelimit_pages) { |
1613 | rate_limited = true; | 1600 | trace_mm_numa_migrate_ratelimit(current, pgdat->node_id, |
1614 | else | 1601 | nr_pages); |
1615 | pgdat->numabalancing_migrate_nr_pages += nr_pages; | 1602 | return true; |
1616 | spin_unlock(&pgdat->numabalancing_migrate_lock); | 1603 | } |
1617 | 1604 | ||
1618 | return rate_limited; | 1605 | /* |
1606 | * This is an unlocked non-atomic update so errors are possible. | ||
1607 | * The consequences are failing to migrate when we potentiall should | ||
1608 | * have which is not severe enough to warrant locking. If it is ever | ||
1609 | * a problem, it can be converted to a per-cpu counter. | ||
1610 | */ | ||
1611 | pgdat->numabalancing_migrate_nr_pages += nr_pages; | ||
1612 | return false; | ||
1619 | } | 1613 | } |
1620 | 1614 | ||
1621 | int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page) | 1615 | static int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page) |
1622 | { | 1616 | { |
1623 | int page_lru; | 1617 | int page_lru; |
1624 | 1618 | ||
1625 | VM_BUG_ON(compound_order(page) && !PageTransHuge(page)); | 1619 | VM_BUG_ON_PAGE(compound_order(page) && !PageTransHuge(page), page); |
1626 | 1620 | ||
1627 | /* Avoid migrating to a node that is nearly full */ | 1621 | /* Avoid migrating to a node that is nearly full */ |
1628 | if (!migrate_balanced_pgdat(pgdat, 1UL << compound_order(page))) | 1622 | if (!migrate_balanced_pgdat(pgdat, 1UL << compound_order(page))) |
@@ -1705,7 +1699,12 @@ int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma, | |||
1705 | nr_remaining = migrate_pages(&migratepages, alloc_misplaced_dst_page, | 1699 | nr_remaining = migrate_pages(&migratepages, alloc_misplaced_dst_page, |
1706 | node, MIGRATE_ASYNC, MR_NUMA_MISPLACED); | 1700 | node, MIGRATE_ASYNC, MR_NUMA_MISPLACED); |
1707 | if (nr_remaining) { | 1701 | if (nr_remaining) { |
1708 | putback_lru_pages(&migratepages); | 1702 | if (!list_empty(&migratepages)) { |
1703 | list_del(&page->lru); | ||
1704 | dec_zone_page_state(page, NR_ISOLATED_ANON + | ||
1705 | page_is_file_cache(page)); | ||
1706 | putback_lru_page(page); | ||
1707 | } | ||
1709 | isolated = 0; | 1708 | isolated = 0; |
1710 | } else | 1709 | } else |
1711 | count_vm_numa_event(NUMA_PAGE_MIGRATE); | 1710 | count_vm_numa_event(NUMA_PAGE_MIGRATE); |
@@ -1752,8 +1751,6 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm, | |||
1752 | if (!new_page) | 1751 | if (!new_page) |
1753 | goto out_fail; | 1752 | goto out_fail; |
1754 | 1753 | ||
1755 | page_cpupid_xchg_last(new_page, page_cpupid_last(page)); | ||
1756 | |||
1757 | isolated = numamigrate_isolate_page(pgdat, page); | 1754 | isolated = numamigrate_isolate_page(pgdat, page); |
1758 | if (!isolated) { | 1755 | if (!isolated) { |
1759 | put_page(new_page); | 1756 | put_page(new_page); |
diff --git a/mm/mincore.c b/mm/mincore.c index da2be56a7b8f..101623378fbf 100644 --- a/mm/mincore.c +++ b/mm/mincore.c | |||
@@ -225,13 +225,6 @@ static long do_mincore(unsigned long addr, unsigned long pages, unsigned char *v | |||
225 | 225 | ||
226 | end = min(vma->vm_end, addr + (pages << PAGE_SHIFT)); | 226 | end = min(vma->vm_end, addr + (pages << PAGE_SHIFT)); |
227 | 227 | ||
228 | if (is_vm_hugetlb_page(vma)) { | ||
229 | mincore_hugetlb_page_range(vma, addr, end, vec); | ||
230 | return (end - addr) >> PAGE_SHIFT; | ||
231 | } | ||
232 | |||
233 | end = pmd_addr_end(addr, end); | ||
234 | |||
235 | if (is_vm_hugetlb_page(vma)) | 228 | if (is_vm_hugetlb_page(vma)) |
236 | mincore_hugetlb_page_range(vma, addr, end, vec); | 229 | mincore_hugetlb_page_range(vma, addr, end, vec); |
237 | else | 230 | else |
diff --git a/mm/mlock.c b/mm/mlock.c index 192e6eebe4f2..4e1a68162285 100644 --- a/mm/mlock.c +++ b/mm/mlock.c | |||
@@ -91,6 +91,26 @@ void mlock_vma_page(struct page *page) | |||
91 | } | 91 | } |
92 | 92 | ||
93 | /* | 93 | /* |
94 | * Isolate a page from LRU with optional get_page() pin. | ||
95 | * Assumes lru_lock already held and page already pinned. | ||
96 | */ | ||
97 | static bool __munlock_isolate_lru_page(struct page *page, bool getpage) | ||
98 | { | ||
99 | if (PageLRU(page)) { | ||
100 | struct lruvec *lruvec; | ||
101 | |||
102 | lruvec = mem_cgroup_page_lruvec(page, page_zone(page)); | ||
103 | if (getpage) | ||
104 | get_page(page); | ||
105 | ClearPageLRU(page); | ||
106 | del_page_from_lru_list(page, lruvec, page_lru(page)); | ||
107 | return true; | ||
108 | } | ||
109 | |||
110 | return false; | ||
111 | } | ||
112 | |||
113 | /* | ||
94 | * Finish munlock after successful page isolation | 114 | * Finish munlock after successful page isolation |
95 | * | 115 | * |
96 | * Page must be locked. This is a wrapper for try_to_munlock() | 116 | * Page must be locked. This is a wrapper for try_to_munlock() |
@@ -126,9 +146,9 @@ static void __munlock_isolated_page(struct page *page) | |||
126 | static void __munlock_isolation_failed(struct page *page) | 146 | static void __munlock_isolation_failed(struct page *page) |
127 | { | 147 | { |
128 | if (PageUnevictable(page)) | 148 | if (PageUnevictable(page)) |
129 | count_vm_event(UNEVICTABLE_PGSTRANDED); | 149 | __count_vm_event(UNEVICTABLE_PGSTRANDED); |
130 | else | 150 | else |
131 | count_vm_event(UNEVICTABLE_PGMUNLOCKED); | 151 | __count_vm_event(UNEVICTABLE_PGMUNLOCKED); |
132 | } | 152 | } |
133 | 153 | ||
134 | /** | 154 | /** |
@@ -152,28 +172,34 @@ static void __munlock_isolation_failed(struct page *page) | |||
152 | unsigned int munlock_vma_page(struct page *page) | 172 | unsigned int munlock_vma_page(struct page *page) |
153 | { | 173 | { |
154 | unsigned int nr_pages; | 174 | unsigned int nr_pages; |
175 | struct zone *zone = page_zone(page); | ||
155 | 176 | ||
156 | BUG_ON(!PageLocked(page)); | 177 | BUG_ON(!PageLocked(page)); |
157 | 178 | ||
158 | if (TestClearPageMlocked(page)) { | ||
159 | nr_pages = hpage_nr_pages(page); | ||
160 | mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages); | ||
161 | if (!isolate_lru_page(page)) | ||
162 | __munlock_isolated_page(page); | ||
163 | else | ||
164 | __munlock_isolation_failed(page); | ||
165 | } else { | ||
166 | nr_pages = hpage_nr_pages(page); | ||
167 | } | ||
168 | |||
169 | /* | 179 | /* |
170 | * Regardless of the original PageMlocked flag, we determine nr_pages | 180 | * Serialize with any parallel __split_huge_page_refcount() which |
171 | * after touching the flag. This leaves a possible race with a THP page | 181 | * might otherwise copy PageMlocked to part of the tail pages before |
172 | * split, such that a whole THP page was munlocked, but nr_pages == 1. | 182 | * we clear it in the head page. It also stabilizes hpage_nr_pages(). |
173 | * Returning a smaller mask due to that is OK, the worst that can | ||
174 | * happen is subsequent useless scanning of the former tail pages. | ||
175 | * The NR_MLOCK accounting can however become broken. | ||
176 | */ | 183 | */ |
184 | spin_lock_irq(&zone->lru_lock); | ||
185 | |||
186 | nr_pages = hpage_nr_pages(page); | ||
187 | if (!TestClearPageMlocked(page)) | ||
188 | goto unlock_out; | ||
189 | |||
190 | __mod_zone_page_state(zone, NR_MLOCK, -nr_pages); | ||
191 | |||
192 | if (__munlock_isolate_lru_page(page, true)) { | ||
193 | spin_unlock_irq(&zone->lru_lock); | ||
194 | __munlock_isolated_page(page); | ||
195 | goto out; | ||
196 | } | ||
197 | __munlock_isolation_failed(page); | ||
198 | |||
199 | unlock_out: | ||
200 | spin_unlock_irq(&zone->lru_lock); | ||
201 | |||
202 | out: | ||
177 | return nr_pages - 1; | 203 | return nr_pages - 1; |
178 | } | 204 | } |
179 | 205 | ||
@@ -253,8 +279,8 @@ static int __mlock_posix_error_return(long retval) | |||
253 | static bool __putback_lru_fast_prepare(struct page *page, struct pagevec *pvec, | 279 | static bool __putback_lru_fast_prepare(struct page *page, struct pagevec *pvec, |
254 | int *pgrescued) | 280 | int *pgrescued) |
255 | { | 281 | { |
256 | VM_BUG_ON(PageLRU(page)); | 282 | VM_BUG_ON_PAGE(PageLRU(page), page); |
257 | VM_BUG_ON(!PageLocked(page)); | 283 | VM_BUG_ON_PAGE(!PageLocked(page), page); |
258 | 284 | ||
259 | if (page_mapcount(page) <= 1 && page_evictable(page)) { | 285 | if (page_mapcount(page) <= 1 && page_evictable(page)) { |
260 | pagevec_add(pvec, page); | 286 | pagevec_add(pvec, page); |
@@ -310,34 +336,24 @@ static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone) | |||
310 | struct page *page = pvec->pages[i]; | 336 | struct page *page = pvec->pages[i]; |
311 | 337 | ||
312 | if (TestClearPageMlocked(page)) { | 338 | if (TestClearPageMlocked(page)) { |
313 | struct lruvec *lruvec; | ||
314 | int lru; | ||
315 | |||
316 | if (PageLRU(page)) { | ||
317 | lruvec = mem_cgroup_page_lruvec(page, zone); | ||
318 | lru = page_lru(page); | ||
319 | /* | ||
320 | * We already have pin from follow_page_mask() | ||
321 | * so we can spare the get_page() here. | ||
322 | */ | ||
323 | ClearPageLRU(page); | ||
324 | del_page_from_lru_list(page, lruvec, lru); | ||
325 | } else { | ||
326 | __munlock_isolation_failed(page); | ||
327 | goto skip_munlock; | ||
328 | } | ||
329 | |||
330 | } else { | ||
331 | skip_munlock: | ||
332 | /* | 339 | /* |
333 | * We won't be munlocking this page in the next phase | 340 | * We already have pin from follow_page_mask() |
334 | * but we still need to release the follow_page_mask() | 341 | * so we can spare the get_page() here. |
335 | * pin. We cannot do it under lru_lock however. If it's | ||
336 | * the last pin, __page_cache_release would deadlock. | ||
337 | */ | 342 | */ |
338 | pagevec_add(&pvec_putback, pvec->pages[i]); | 343 | if (__munlock_isolate_lru_page(page, false)) |
339 | pvec->pages[i] = NULL; | 344 | continue; |
345 | else | ||
346 | __munlock_isolation_failed(page); | ||
340 | } | 347 | } |
348 | |||
349 | /* | ||
350 | * We won't be munlocking this page in the next phase | ||
351 | * but we still need to release the follow_page_mask() | ||
352 | * pin. We cannot do it under lru_lock however. If it's | ||
353 | * the last pin, __page_cache_release() would deadlock. | ||
354 | */ | ||
355 | pagevec_add(&pvec_putback, pvec->pages[i]); | ||
356 | pvec->pages[i] = NULL; | ||
341 | } | 357 | } |
342 | delta_munlocked = -nr + pagevec_count(&pvec_putback); | 358 | delta_munlocked = -nr + pagevec_count(&pvec_putback); |
343 | __mod_zone_page_state(zone, NR_MLOCK, delta_munlocked); | 359 | __mod_zone_page_state(zone, NR_MLOCK, delta_munlocked); |
@@ -709,19 +725,21 @@ SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len) | |||
709 | 725 | ||
710 | lru_add_drain_all(); /* flush pagevec */ | 726 | lru_add_drain_all(); /* flush pagevec */ |
711 | 727 | ||
712 | down_write(¤t->mm->mmap_sem); | ||
713 | len = PAGE_ALIGN(len + (start & ~PAGE_MASK)); | 728 | len = PAGE_ALIGN(len + (start & ~PAGE_MASK)); |
714 | start &= PAGE_MASK; | 729 | start &= PAGE_MASK; |
715 | 730 | ||
716 | locked = len >> PAGE_SHIFT; | ||
717 | locked += current->mm->locked_vm; | ||
718 | |||
719 | lock_limit = rlimit(RLIMIT_MEMLOCK); | 731 | lock_limit = rlimit(RLIMIT_MEMLOCK); |
720 | lock_limit >>= PAGE_SHIFT; | 732 | lock_limit >>= PAGE_SHIFT; |
733 | locked = len >> PAGE_SHIFT; | ||
734 | |||
735 | down_write(¤t->mm->mmap_sem); | ||
736 | |||
737 | locked += current->mm->locked_vm; | ||
721 | 738 | ||
722 | /* check against resource limits */ | 739 | /* check against resource limits */ |
723 | if ((locked <= lock_limit) || capable(CAP_IPC_LOCK)) | 740 | if ((locked <= lock_limit) || capable(CAP_IPC_LOCK)) |
724 | error = do_mlock(start, len, 1); | 741 | error = do_mlock(start, len, 1); |
742 | |||
725 | up_write(¤t->mm->mmap_sem); | 743 | up_write(¤t->mm->mmap_sem); |
726 | if (!error) | 744 | if (!error) |
727 | error = __mm_populate(start, len, 0); | 745 | error = __mm_populate(start, len, 0); |
@@ -732,11 +750,13 @@ SYSCALL_DEFINE2(munlock, unsigned long, start, size_t, len) | |||
732 | { | 750 | { |
733 | int ret; | 751 | int ret; |
734 | 752 | ||
735 | down_write(¤t->mm->mmap_sem); | ||
736 | len = PAGE_ALIGN(len + (start & ~PAGE_MASK)); | 753 | len = PAGE_ALIGN(len + (start & ~PAGE_MASK)); |
737 | start &= PAGE_MASK; | 754 | start &= PAGE_MASK; |
755 | |||
756 | down_write(¤t->mm->mmap_sem); | ||
738 | ret = do_mlock(start, len, 0); | 757 | ret = do_mlock(start, len, 0); |
739 | up_write(¤t->mm->mmap_sem); | 758 | up_write(¤t->mm->mmap_sem); |
759 | |||
740 | return ret; | 760 | return ret; |
741 | } | 761 | } |
742 | 762 | ||
@@ -781,12 +801,12 @@ SYSCALL_DEFINE1(mlockall, int, flags) | |||
781 | if (flags & MCL_CURRENT) | 801 | if (flags & MCL_CURRENT) |
782 | lru_add_drain_all(); /* flush pagevec */ | 802 | lru_add_drain_all(); /* flush pagevec */ |
783 | 803 | ||
784 | down_write(¤t->mm->mmap_sem); | ||
785 | |||
786 | lock_limit = rlimit(RLIMIT_MEMLOCK); | 804 | lock_limit = rlimit(RLIMIT_MEMLOCK); |
787 | lock_limit >>= PAGE_SHIFT; | 805 | lock_limit >>= PAGE_SHIFT; |
788 | 806 | ||
789 | ret = -ENOMEM; | 807 | ret = -ENOMEM; |
808 | down_write(¤t->mm->mmap_sem); | ||
809 | |||
790 | if (!(flags & MCL_CURRENT) || (current->mm->total_vm <= lock_limit) || | 810 | if (!(flags & MCL_CURRENT) || (current->mm->total_vm <= lock_limit) || |
791 | capable(CAP_IPC_LOCK)) | 811 | capable(CAP_IPC_LOCK)) |
792 | ret = do_mlockall(flags); | 812 | ret = do_mlockall(flags); |
diff --git a/mm/mm_init.c b/mm/mm_init.c index 68562e92d50c..4074caf9936b 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c | |||
@@ -202,5 +202,4 @@ static int __init mm_sysfs_init(void) | |||
202 | 202 | ||
203 | return 0; | 203 | return 0; |
204 | } | 204 | } |
205 | 205 | postcore_initcall(mm_sysfs_init); | |
206 | __initcall(mm_sysfs_init); | ||
@@ -86,6 +86,7 @@ EXPORT_SYMBOL(vm_get_page_prot); | |||
86 | 86 | ||
87 | int sysctl_overcommit_memory __read_mostly = OVERCOMMIT_GUESS; /* heuristic overcommit */ | 87 | int sysctl_overcommit_memory __read_mostly = OVERCOMMIT_GUESS; /* heuristic overcommit */ |
88 | int sysctl_overcommit_ratio __read_mostly = 50; /* default is 50% */ | 88 | int sysctl_overcommit_ratio __read_mostly = 50; /* default is 50% */ |
89 | unsigned long sysctl_overcommit_kbytes __read_mostly; | ||
89 | int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT; | 90 | int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT; |
90 | unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */ | 91 | unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */ |
91 | unsigned long sysctl_admin_reserve_kbytes __read_mostly = 1UL << 13; /* 8MB */ | 92 | unsigned long sysctl_admin_reserve_kbytes __read_mostly = 1UL << 13; /* 8MB */ |
@@ -893,7 +894,15 @@ again: remove_next = 1 + (end > next->vm_end); | |||
893 | static inline int is_mergeable_vma(struct vm_area_struct *vma, | 894 | static inline int is_mergeable_vma(struct vm_area_struct *vma, |
894 | struct file *file, unsigned long vm_flags) | 895 | struct file *file, unsigned long vm_flags) |
895 | { | 896 | { |
896 | if (vma->vm_flags ^ vm_flags) | 897 | /* |
898 | * VM_SOFTDIRTY should not prevent from VMA merging, if we | ||
899 | * match the flags but dirty bit -- the caller should mark | ||
900 | * merged VMA as dirty. If dirty bit won't be excluded from | ||
901 | * comparison, we increase pressue on the memory system forcing | ||
902 | * the kernel to generate new VMAs when old one could be | ||
903 | * extended instead. | ||
904 | */ | ||
905 | if ((vma->vm_flags ^ vm_flags) & ~VM_SOFTDIRTY) | ||
897 | return 0; | 906 | return 0; |
898 | if (vma->vm_file != file) | 907 | if (vma->vm_file != file) |
899 | return 0; | 908 | return 0; |
@@ -1082,7 +1091,7 @@ static int anon_vma_compatible(struct vm_area_struct *a, struct vm_area_struct * | |||
1082 | return a->vm_end == b->vm_start && | 1091 | return a->vm_end == b->vm_start && |
1083 | mpol_equal(vma_policy(a), vma_policy(b)) && | 1092 | mpol_equal(vma_policy(a), vma_policy(b)) && |
1084 | a->vm_file == b->vm_file && | 1093 | a->vm_file == b->vm_file && |
1085 | !((a->vm_flags ^ b->vm_flags) & ~(VM_READ|VM_WRITE|VM_EXEC)) && | 1094 | !((a->vm_flags ^ b->vm_flags) & ~(VM_READ|VM_WRITE|VM_EXEC|VM_SOFTDIRTY)) && |
1086 | b->vm_pgoff == a->vm_pgoff + ((b->vm_start - a->vm_start) >> PAGE_SHIFT); | 1095 | b->vm_pgoff == a->vm_pgoff + ((b->vm_start - a->vm_start) >> PAGE_SHIFT); |
1087 | } | 1096 | } |
1088 | 1097 | ||
@@ -1190,6 +1199,24 @@ static inline unsigned long round_hint_to_min(unsigned long hint) | |||
1190 | return hint; | 1199 | return hint; |
1191 | } | 1200 | } |
1192 | 1201 | ||
1202 | static inline int mlock_future_check(struct mm_struct *mm, | ||
1203 | unsigned long flags, | ||
1204 | unsigned long len) | ||
1205 | { | ||
1206 | unsigned long locked, lock_limit; | ||
1207 | |||
1208 | /* mlock MCL_FUTURE? */ | ||
1209 | if (flags & VM_LOCKED) { | ||
1210 | locked = len >> PAGE_SHIFT; | ||
1211 | locked += mm->locked_vm; | ||
1212 | lock_limit = rlimit(RLIMIT_MEMLOCK); | ||
1213 | lock_limit >>= PAGE_SHIFT; | ||
1214 | if (locked > lock_limit && !capable(CAP_IPC_LOCK)) | ||
1215 | return -EAGAIN; | ||
1216 | } | ||
1217 | return 0; | ||
1218 | } | ||
1219 | |||
1193 | /* | 1220 | /* |
1194 | * The caller must hold down_write(¤t->mm->mmap_sem). | 1221 | * The caller must hold down_write(¤t->mm->mmap_sem). |
1195 | */ | 1222 | */ |
@@ -1251,16 +1278,8 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, | |||
1251 | if (!can_do_mlock()) | 1278 | if (!can_do_mlock()) |
1252 | return -EPERM; | 1279 | return -EPERM; |
1253 | 1280 | ||
1254 | /* mlock MCL_FUTURE? */ | 1281 | if (mlock_future_check(mm, vm_flags, len)) |
1255 | if (vm_flags & VM_LOCKED) { | 1282 | return -EAGAIN; |
1256 | unsigned long locked, lock_limit; | ||
1257 | locked = len >> PAGE_SHIFT; | ||
1258 | locked += mm->locked_vm; | ||
1259 | lock_limit = rlimit(RLIMIT_MEMLOCK); | ||
1260 | lock_limit >>= PAGE_SHIFT; | ||
1261 | if (locked > lock_limit && !capable(CAP_IPC_LOCK)) | ||
1262 | return -EAGAIN; | ||
1263 | } | ||
1264 | 1283 | ||
1265 | if (file) { | 1284 | if (file) { |
1266 | struct inode *inode = file_inode(file); | 1285 | struct inode *inode = file_inode(file); |
@@ -2591,18 +2610,9 @@ static unsigned long do_brk(unsigned long addr, unsigned long len) | |||
2591 | if (error & ~PAGE_MASK) | 2610 | if (error & ~PAGE_MASK) |
2592 | return error; | 2611 | return error; |
2593 | 2612 | ||
2594 | /* | 2613 | error = mlock_future_check(mm, mm->def_flags, len); |
2595 | * mlock MCL_FUTURE? | 2614 | if (error) |
2596 | */ | 2615 | return error; |
2597 | if (mm->def_flags & VM_LOCKED) { | ||
2598 | unsigned long locked, lock_limit; | ||
2599 | locked = len >> PAGE_SHIFT; | ||
2600 | locked += mm->locked_vm; | ||
2601 | lock_limit = rlimit(RLIMIT_MEMLOCK); | ||
2602 | lock_limit >>= PAGE_SHIFT; | ||
2603 | if (locked > lock_limit && !capable(CAP_IPC_LOCK)) | ||
2604 | return -EAGAIN; | ||
2605 | } | ||
2606 | 2616 | ||
2607 | /* | 2617 | /* |
2608 | * mm->mmap_sem is required to protect against another thread | 2618 | * mm->mmap_sem is required to protect against another thread |
@@ -3140,7 +3150,7 @@ static int init_user_reserve(void) | |||
3140 | sysctl_user_reserve_kbytes = min(free_kbytes / 32, 1UL << 17); | 3150 | sysctl_user_reserve_kbytes = min(free_kbytes / 32, 1UL << 17); |
3141 | return 0; | 3151 | return 0; |
3142 | } | 3152 | } |
3143 | module_init(init_user_reserve) | 3153 | subsys_initcall(init_user_reserve); |
3144 | 3154 | ||
3145 | /* | 3155 | /* |
3146 | * Initialise sysctl_admin_reserve_kbytes. | 3156 | * Initialise sysctl_admin_reserve_kbytes. |
@@ -3161,7 +3171,7 @@ static int init_admin_reserve(void) | |||
3161 | sysctl_admin_reserve_kbytes = min(free_kbytes / 32, 1UL << 13); | 3171 | sysctl_admin_reserve_kbytes = min(free_kbytes / 32, 1UL << 13); |
3162 | return 0; | 3172 | return 0; |
3163 | } | 3173 | } |
3164 | module_init(init_admin_reserve) | 3174 | subsys_initcall(init_admin_reserve); |
3165 | 3175 | ||
3166 | /* | 3176 | /* |
3167 | * Reinititalise user and admin reserves if memory is added or removed. | 3177 | * Reinititalise user and admin reserves if memory is added or removed. |
@@ -3231,4 +3241,4 @@ static int __meminit init_reserve_notifier(void) | |||
3231 | 3241 | ||
3232 | return 0; | 3242 | return 0; |
3233 | } | 3243 | } |
3234 | module_init(init_reserve_notifier) | 3244 | subsys_initcall(init_reserve_notifier); |
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c index 93e6089cb456..41cefdf0aadd 100644 --- a/mm/mmu_notifier.c +++ b/mm/mmu_notifier.c | |||
@@ -329,5 +329,4 @@ static int __init mmu_notifier_init(void) | |||
329 | { | 329 | { |
330 | return init_srcu_struct(&srcu); | 330 | return init_srcu_struct(&srcu); |
331 | } | 331 | } |
332 | 332 | subsys_initcall(mmu_notifier_init); | |
333 | module_init(mmu_notifier_init); | ||
diff --git a/mm/mprotect.c b/mm/mprotect.c index bb53a6591aea..7332c1785744 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c | |||
@@ -23,6 +23,7 @@ | |||
23 | #include <linux/mmu_notifier.h> | 23 | #include <linux/mmu_notifier.h> |
24 | #include <linux/migrate.h> | 24 | #include <linux/migrate.h> |
25 | #include <linux/perf_event.h> | 25 | #include <linux/perf_event.h> |
26 | #include <linux/ksm.h> | ||
26 | #include <asm/uaccess.h> | 27 | #include <asm/uaccess.h> |
27 | #include <asm/pgtable.h> | 28 | #include <asm/pgtable.h> |
28 | #include <asm/cacheflush.h> | 29 | #include <asm/cacheflush.h> |
@@ -63,7 +64,7 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, | |||
63 | 64 | ||
64 | ptent = *pte; | 65 | ptent = *pte; |
65 | page = vm_normal_page(vma, addr, oldpte); | 66 | page = vm_normal_page(vma, addr, oldpte); |
66 | if (page) { | 67 | if (page && !PageKsm(page)) { |
67 | if (!pte_numa(oldpte)) { | 68 | if (!pte_numa(oldpte)) { |
68 | ptent = pte_mknuma(ptent); | 69 | ptent = pte_mknuma(ptent); |
69 | set_pte_at(mm, addr, pte, ptent); | 70 | set_pte_at(mm, addr, pte, ptent); |
diff --git a/mm/nobootmem.c b/mm/nobootmem.c index 2c254d374655..f73f2987a852 100644 --- a/mm/nobootmem.c +++ b/mm/nobootmem.c | |||
@@ -41,11 +41,13 @@ static void * __init __alloc_memory_core_early(int nid, u64 size, u64 align, | |||
41 | if (limit > memblock.current_limit) | 41 | if (limit > memblock.current_limit) |
42 | limit = memblock.current_limit; | 42 | limit = memblock.current_limit; |
43 | 43 | ||
44 | addr = memblock_find_in_range_node(goal, limit, size, align, nid); | 44 | addr = memblock_find_in_range_node(size, align, goal, limit, nid); |
45 | if (!addr) | 45 | if (!addr) |
46 | return NULL; | 46 | return NULL; |
47 | 47 | ||
48 | memblock_reserve(addr, size); | 48 | if (memblock_reserve(addr, size)) |
49 | return NULL; | ||
50 | |||
49 | ptr = phys_to_virt(addr); | 51 | ptr = phys_to_virt(addr); |
50 | memset(ptr, 0, size); | 52 | memset(ptr, 0, size); |
51 | /* | 53 | /* |
@@ -114,16 +116,27 @@ static unsigned long __init __free_memory_core(phys_addr_t start, | |||
114 | static unsigned long __init free_low_memory_core_early(void) | 116 | static unsigned long __init free_low_memory_core_early(void) |
115 | { | 117 | { |
116 | unsigned long count = 0; | 118 | unsigned long count = 0; |
117 | phys_addr_t start, end, size; | 119 | phys_addr_t start, end; |
118 | u64 i; | 120 | u64 i; |
119 | 121 | ||
120 | for_each_free_mem_range(i, MAX_NUMNODES, &start, &end, NULL) | 122 | for_each_free_mem_range(i, NUMA_NO_NODE, &start, &end, NULL) |
121 | count += __free_memory_core(start, end); | 123 | count += __free_memory_core(start, end); |
122 | 124 | ||
123 | /* free range that is used for reserved array if we allocate it */ | 125 | #ifdef CONFIG_ARCH_DISCARD_MEMBLOCK |
124 | size = get_allocated_memblock_reserved_regions_info(&start); | 126 | { |
125 | if (size) | 127 | phys_addr_t size; |
126 | count += __free_memory_core(start, start + size); | 128 | |
129 | /* Free memblock.reserved array if it was allocated */ | ||
130 | size = get_allocated_memblock_reserved_regions_info(&start); | ||
131 | if (size) | ||
132 | count += __free_memory_core(start, start + size); | ||
133 | |||
134 | /* Free memblock.memory array if it was allocated */ | ||
135 | size = get_allocated_memblock_memory_regions_info(&start); | ||
136 | if (size) | ||
137 | count += __free_memory_core(start, start + size); | ||
138 | } | ||
139 | #endif | ||
127 | 140 | ||
128 | return count; | 141 | return count; |
129 | } | 142 | } |
@@ -161,7 +174,7 @@ unsigned long __init free_all_bootmem(void) | |||
161 | reset_all_zones_managed_pages(); | 174 | reset_all_zones_managed_pages(); |
162 | 175 | ||
163 | /* | 176 | /* |
164 | * We need to use MAX_NUMNODES instead of NODE_DATA(0)->node_id | 177 | * We need to use NUMA_NO_NODE instead of NODE_DATA(0)->node_id |
165 | * because in some case like Node0 doesn't have RAM installed | 178 | * because in some case like Node0 doesn't have RAM installed |
166 | * low ram will be on Node1 | 179 | * low ram will be on Node1 |
167 | */ | 180 | */ |
@@ -215,7 +228,7 @@ static void * __init ___alloc_bootmem_nopanic(unsigned long size, | |||
215 | 228 | ||
216 | restart: | 229 | restart: |
217 | 230 | ||
218 | ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align, goal, limit); | 231 | ptr = __alloc_memory_core_early(NUMA_NO_NODE, size, align, goal, limit); |
219 | 232 | ||
220 | if (ptr) | 233 | if (ptr) |
221 | return ptr; | 234 | return ptr; |
@@ -299,7 +312,7 @@ again: | |||
299 | if (ptr) | 312 | if (ptr) |
300 | return ptr; | 313 | return ptr; |
301 | 314 | ||
302 | ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align, | 315 | ptr = __alloc_memory_core_early(NUMA_NO_NODE, size, align, |
303 | goal, limit); | 316 | goal, limit); |
304 | if (ptr) | 317 | if (ptr) |
305 | return ptr; | 318 | return ptr; |
diff --git a/mm/nommu.c b/mm/nommu.c index fec093adad9c..8740213b1647 100644 --- a/mm/nommu.c +++ b/mm/nommu.c | |||
@@ -60,6 +60,7 @@ unsigned long highest_memmap_pfn; | |||
60 | struct percpu_counter vm_committed_as; | 60 | struct percpu_counter vm_committed_as; |
61 | int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */ | 61 | int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */ |
62 | int sysctl_overcommit_ratio = 50; /* default is 50% */ | 62 | int sysctl_overcommit_ratio = 50; /* default is 50% */ |
63 | unsigned long sysctl_overcommit_kbytes __read_mostly; | ||
63 | int sysctl_max_map_count = DEFAULT_MAX_MAP_COUNT; | 64 | int sysctl_max_map_count = DEFAULT_MAX_MAP_COUNT; |
64 | int sysctl_nr_trim_pages = CONFIG_NOMMU_INITIAL_TRIM_EXCESS; | 65 | int sysctl_nr_trim_pages = CONFIG_NOMMU_INITIAL_TRIM_EXCESS; |
65 | unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */ | 66 | unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */ |
diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 1e4a600a6163..3291e82d4352 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c | |||
@@ -47,19 +47,21 @@ static DEFINE_SPINLOCK(zone_scan_lock); | |||
47 | #ifdef CONFIG_NUMA | 47 | #ifdef CONFIG_NUMA |
48 | /** | 48 | /** |
49 | * has_intersects_mems_allowed() - check task eligiblity for kill | 49 | * has_intersects_mems_allowed() - check task eligiblity for kill |
50 | * @tsk: task struct of which task to consider | 50 | * @start: task struct of which task to consider |
51 | * @mask: nodemask passed to page allocator for mempolicy ooms | 51 | * @mask: nodemask passed to page allocator for mempolicy ooms |
52 | * | 52 | * |
53 | * Task eligibility is determined by whether or not a candidate task, @tsk, | 53 | * Task eligibility is determined by whether or not a candidate task, @tsk, |
54 | * shares the same mempolicy nodes as current if it is bound by such a policy | 54 | * shares the same mempolicy nodes as current if it is bound by such a policy |
55 | * and whether or not it has the same set of allowed cpuset nodes. | 55 | * and whether or not it has the same set of allowed cpuset nodes. |
56 | */ | 56 | */ |
57 | static bool has_intersects_mems_allowed(struct task_struct *tsk, | 57 | static bool has_intersects_mems_allowed(struct task_struct *start, |
58 | const nodemask_t *mask) | 58 | const nodemask_t *mask) |
59 | { | 59 | { |
60 | struct task_struct *start = tsk; | 60 | struct task_struct *tsk; |
61 | bool ret = false; | ||
61 | 62 | ||
62 | do { | 63 | rcu_read_lock(); |
64 | for_each_thread(start, tsk) { | ||
63 | if (mask) { | 65 | if (mask) { |
64 | /* | 66 | /* |
65 | * If this is a mempolicy constrained oom, tsk's | 67 | * If this is a mempolicy constrained oom, tsk's |
@@ -67,19 +69,20 @@ static bool has_intersects_mems_allowed(struct task_struct *tsk, | |||
67 | * mempolicy intersects current, otherwise it may be | 69 | * mempolicy intersects current, otherwise it may be |
68 | * needlessly killed. | 70 | * needlessly killed. |
69 | */ | 71 | */ |
70 | if (mempolicy_nodemask_intersects(tsk, mask)) | 72 | ret = mempolicy_nodemask_intersects(tsk, mask); |
71 | return true; | ||
72 | } else { | 73 | } else { |
73 | /* | 74 | /* |
74 | * This is not a mempolicy constrained oom, so only | 75 | * This is not a mempolicy constrained oom, so only |
75 | * check the mems of tsk's cpuset. | 76 | * check the mems of tsk's cpuset. |
76 | */ | 77 | */ |
77 | if (cpuset_mems_allowed_intersects(current, tsk)) | 78 | ret = cpuset_mems_allowed_intersects(current, tsk); |
78 | return true; | ||
79 | } | 79 | } |
80 | } while_each_thread(start, tsk); | 80 | if (ret) |
81 | break; | ||
82 | } | ||
83 | rcu_read_unlock(); | ||
81 | 84 | ||
82 | return false; | 85 | return ret; |
83 | } | 86 | } |
84 | #else | 87 | #else |
85 | static bool has_intersects_mems_allowed(struct task_struct *tsk, | 88 | static bool has_intersects_mems_allowed(struct task_struct *tsk, |
@@ -97,16 +100,21 @@ static bool has_intersects_mems_allowed(struct task_struct *tsk, | |||
97 | */ | 100 | */ |
98 | struct task_struct *find_lock_task_mm(struct task_struct *p) | 101 | struct task_struct *find_lock_task_mm(struct task_struct *p) |
99 | { | 102 | { |
100 | struct task_struct *t = p; | 103 | struct task_struct *t; |
101 | 104 | ||
102 | do { | 105 | rcu_read_lock(); |
106 | |||
107 | for_each_thread(p, t) { | ||
103 | task_lock(t); | 108 | task_lock(t); |
104 | if (likely(t->mm)) | 109 | if (likely(t->mm)) |
105 | return t; | 110 | goto found; |
106 | task_unlock(t); | 111 | task_unlock(t); |
107 | } while_each_thread(p, t); | 112 | } |
113 | t = NULL; | ||
114 | found: | ||
115 | rcu_read_unlock(); | ||
108 | 116 | ||
109 | return NULL; | 117 | return t; |
110 | } | 118 | } |
111 | 119 | ||
112 | /* return true if the task is not adequate as candidate victim task. */ | 120 | /* return true if the task is not adequate as candidate victim task. */ |
@@ -170,7 +178,7 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg, | |||
170 | * implementation used by LSMs. | 178 | * implementation used by LSMs. |
171 | */ | 179 | */ |
172 | if (has_capability_noaudit(p, CAP_SYS_ADMIN)) | 180 | if (has_capability_noaudit(p, CAP_SYS_ADMIN)) |
173 | adj -= 30; | 181 | points -= (points * 3) / 100; |
174 | 182 | ||
175 | /* Normalize to oom_score_adj units */ | 183 | /* Normalize to oom_score_adj units */ |
176 | adj *= totalpages / 1000; | 184 | adj *= totalpages / 1000; |
@@ -301,7 +309,7 @@ static struct task_struct *select_bad_process(unsigned int *ppoints, | |||
301 | unsigned long chosen_points = 0; | 309 | unsigned long chosen_points = 0; |
302 | 310 | ||
303 | rcu_read_lock(); | 311 | rcu_read_lock(); |
304 | do_each_thread(g, p) { | 312 | for_each_process_thread(g, p) { |
305 | unsigned int points; | 313 | unsigned int points; |
306 | 314 | ||
307 | switch (oom_scan_process_thread(p, totalpages, nodemask, | 315 | switch (oom_scan_process_thread(p, totalpages, nodemask, |
@@ -319,11 +327,15 @@ static struct task_struct *select_bad_process(unsigned int *ppoints, | |||
319 | break; | 327 | break; |
320 | }; | 328 | }; |
321 | points = oom_badness(p, NULL, nodemask, totalpages); | 329 | points = oom_badness(p, NULL, nodemask, totalpages); |
322 | if (points > chosen_points) { | 330 | if (!points || points < chosen_points) |
323 | chosen = p; | 331 | continue; |
324 | chosen_points = points; | 332 | /* Prefer thread group leaders for display purposes */ |
325 | } | 333 | if (points == chosen_points && thread_group_leader(chosen)) |
326 | } while_each_thread(g, p); | 334 | continue; |
335 | |||
336 | chosen = p; | ||
337 | chosen_points = points; | ||
338 | } | ||
327 | if (chosen) | 339 | if (chosen) |
328 | get_task_struct(chosen); | 340 | get_task_struct(chosen); |
329 | rcu_read_unlock(); | 341 | rcu_read_unlock(); |
@@ -406,7 +418,7 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, | |||
406 | { | 418 | { |
407 | struct task_struct *victim = p; | 419 | struct task_struct *victim = p; |
408 | struct task_struct *child; | 420 | struct task_struct *child; |
409 | struct task_struct *t = p; | 421 | struct task_struct *t; |
410 | struct mm_struct *mm; | 422 | struct mm_struct *mm; |
411 | unsigned int victim_points = 0; | 423 | unsigned int victim_points = 0; |
412 | static DEFINE_RATELIMIT_STATE(oom_rs, DEFAULT_RATELIMIT_INTERVAL, | 424 | static DEFINE_RATELIMIT_STATE(oom_rs, DEFAULT_RATELIMIT_INTERVAL, |
@@ -437,7 +449,7 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, | |||
437 | * still freeing memory. | 449 | * still freeing memory. |
438 | */ | 450 | */ |
439 | read_lock(&tasklist_lock); | 451 | read_lock(&tasklist_lock); |
440 | do { | 452 | for_each_thread(p, t) { |
441 | list_for_each_entry(child, &t->children, sibling) { | 453 | list_for_each_entry(child, &t->children, sibling) { |
442 | unsigned int child_points; | 454 | unsigned int child_points; |
443 | 455 | ||
@@ -455,13 +467,11 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, | |||
455 | get_task_struct(victim); | 467 | get_task_struct(victim); |
456 | } | 468 | } |
457 | } | 469 | } |
458 | } while_each_thread(p, t); | 470 | } |
459 | read_unlock(&tasklist_lock); | 471 | read_unlock(&tasklist_lock); |
460 | 472 | ||
461 | rcu_read_lock(); | ||
462 | p = find_lock_task_mm(victim); | 473 | p = find_lock_task_mm(victim); |
463 | if (!p) { | 474 | if (!p) { |
464 | rcu_read_unlock(); | ||
465 | put_task_struct(victim); | 475 | put_task_struct(victim); |
466 | return; | 476 | return; |
467 | } else if (victim != p) { | 477 | } else if (victim != p) { |
@@ -487,6 +497,7 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, | |||
487 | * That thread will now get access to memory reserves since it has a | 497 | * That thread will now get access to memory reserves since it has a |
488 | * pending fatal signal. | 498 | * pending fatal signal. |
489 | */ | 499 | */ |
500 | rcu_read_lock(); | ||
490 | for_each_process(p) | 501 | for_each_process(p) |
491 | if (p->mm == mm && !same_thread_group(p, victim) && | 502 | if (p->mm == mm && !same_thread_group(p, victim) && |
492 | !(p->flags & PF_KTHREAD)) { | 503 | !(p->flags & PF_KTHREAD)) { |
diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 63807583d8e8..2d30e2cfe804 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c | |||
@@ -191,6 +191,26 @@ static unsigned long writeout_period_time = 0; | |||
191 | * global dirtyable memory first. | 191 | * global dirtyable memory first. |
192 | */ | 192 | */ |
193 | 193 | ||
194 | /** | ||
195 | * zone_dirtyable_memory - number of dirtyable pages in a zone | ||
196 | * @zone: the zone | ||
197 | * | ||
198 | * Returns the zone's number of pages potentially available for dirty | ||
199 | * page cache. This is the base value for the per-zone dirty limits. | ||
200 | */ | ||
201 | static unsigned long zone_dirtyable_memory(struct zone *zone) | ||
202 | { | ||
203 | unsigned long nr_pages; | ||
204 | |||
205 | nr_pages = zone_page_state(zone, NR_FREE_PAGES); | ||
206 | nr_pages -= min(nr_pages, zone->dirty_balance_reserve); | ||
207 | |||
208 | nr_pages += zone_page_state(zone, NR_INACTIVE_FILE); | ||
209 | nr_pages += zone_page_state(zone, NR_ACTIVE_FILE); | ||
210 | |||
211 | return nr_pages; | ||
212 | } | ||
213 | |||
194 | static unsigned long highmem_dirtyable_memory(unsigned long total) | 214 | static unsigned long highmem_dirtyable_memory(unsigned long total) |
195 | { | 215 | { |
196 | #ifdef CONFIG_HIGHMEM | 216 | #ifdef CONFIG_HIGHMEM |
@@ -198,11 +218,9 @@ static unsigned long highmem_dirtyable_memory(unsigned long total) | |||
198 | unsigned long x = 0; | 218 | unsigned long x = 0; |
199 | 219 | ||
200 | for_each_node_state(node, N_HIGH_MEMORY) { | 220 | for_each_node_state(node, N_HIGH_MEMORY) { |
201 | struct zone *z = | 221 | struct zone *z = &NODE_DATA(node)->node_zones[ZONE_HIGHMEM]; |
202 | &NODE_DATA(node)->node_zones[ZONE_HIGHMEM]; | ||
203 | 222 | ||
204 | x += zone_page_state(z, NR_FREE_PAGES) + | 223 | x += zone_dirtyable_memory(z); |
205 | zone_reclaimable_pages(z) - z->dirty_balance_reserve; | ||
206 | } | 224 | } |
207 | /* | 225 | /* |
208 | * Unreclaimable memory (kernel memory or anonymous memory | 226 | * Unreclaimable memory (kernel memory or anonymous memory |
@@ -238,9 +256,12 @@ static unsigned long global_dirtyable_memory(void) | |||
238 | { | 256 | { |
239 | unsigned long x; | 257 | unsigned long x; |
240 | 258 | ||
241 | x = global_page_state(NR_FREE_PAGES) + global_reclaimable_pages(); | 259 | x = global_page_state(NR_FREE_PAGES); |
242 | x -= min(x, dirty_balance_reserve); | 260 | x -= min(x, dirty_balance_reserve); |
243 | 261 | ||
262 | x += global_page_state(NR_INACTIVE_FILE); | ||
263 | x += global_page_state(NR_ACTIVE_FILE); | ||
264 | |||
244 | if (!vm_highmem_is_dirtyable) | 265 | if (!vm_highmem_is_dirtyable) |
245 | x -= highmem_dirtyable_memory(x); | 266 | x -= highmem_dirtyable_memory(x); |
246 | 267 | ||
@@ -289,32 +310,6 @@ void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty) | |||
289 | } | 310 | } |
290 | 311 | ||
291 | /** | 312 | /** |
292 | * zone_dirtyable_memory - number of dirtyable pages in a zone | ||
293 | * @zone: the zone | ||
294 | * | ||
295 | * Returns the zone's number of pages potentially available for dirty | ||
296 | * page cache. This is the base value for the per-zone dirty limits. | ||
297 | */ | ||
298 | static unsigned long zone_dirtyable_memory(struct zone *zone) | ||
299 | { | ||
300 | /* | ||
301 | * The effective global number of dirtyable pages may exclude | ||
302 | * highmem as a big-picture measure to keep the ratio between | ||
303 | * dirty memory and lowmem reasonable. | ||
304 | * | ||
305 | * But this function is purely about the individual zone and a | ||
306 | * highmem zone can hold its share of dirty pages, so we don't | ||
307 | * care about vm_highmem_is_dirtyable here. | ||
308 | */ | ||
309 | unsigned long nr_pages = zone_page_state(zone, NR_FREE_PAGES) + | ||
310 | zone_reclaimable_pages(zone); | ||
311 | |||
312 | /* don't allow this to underflow */ | ||
313 | nr_pages -= min(nr_pages, zone->dirty_balance_reserve); | ||
314 | return nr_pages; | ||
315 | } | ||
316 | |||
317 | /** | ||
318 | * zone_dirty_limit - maximum number of dirty pages allowed in a zone | 313 | * zone_dirty_limit - maximum number of dirty pages allowed in a zone |
319 | * @zone: the zone | 314 | * @zone: the zone |
320 | * | 315 | * |
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 5248fe070aa4..e3758a09a009 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c | |||
@@ -205,7 +205,7 @@ static char * const zone_names[MAX_NR_ZONES] = { | |||
205 | }; | 205 | }; |
206 | 206 | ||
207 | int min_free_kbytes = 1024; | 207 | int min_free_kbytes = 1024; |
208 | int user_min_free_kbytes; | 208 | int user_min_free_kbytes = -1; |
209 | 209 | ||
210 | static unsigned long __meminitdata nr_kernel_pages; | 210 | static unsigned long __meminitdata nr_kernel_pages; |
211 | static unsigned long __meminitdata nr_all_pages; | 211 | static unsigned long __meminitdata nr_all_pages; |
@@ -295,7 +295,7 @@ static inline int bad_range(struct zone *zone, struct page *page) | |||
295 | } | 295 | } |
296 | #endif | 296 | #endif |
297 | 297 | ||
298 | static void bad_page(struct page *page) | 298 | static void bad_page(struct page *page, char *reason, unsigned long bad_flags) |
299 | { | 299 | { |
300 | static unsigned long resume; | 300 | static unsigned long resume; |
301 | static unsigned long nr_shown; | 301 | static unsigned long nr_shown; |
@@ -329,7 +329,7 @@ static void bad_page(struct page *page) | |||
329 | 329 | ||
330 | printk(KERN_ALERT "BUG: Bad page state in process %s pfn:%05lx\n", | 330 | printk(KERN_ALERT "BUG: Bad page state in process %s pfn:%05lx\n", |
331 | current->comm, page_to_pfn(page)); | 331 | current->comm, page_to_pfn(page)); |
332 | dump_page(page); | 332 | dump_page_badflags(page, reason, bad_flags); |
333 | 333 | ||
334 | print_modules(); | 334 | print_modules(); |
335 | dump_stack(); | 335 | dump_stack(); |
@@ -383,7 +383,7 @@ static int destroy_compound_page(struct page *page, unsigned long order) | |||
383 | int bad = 0; | 383 | int bad = 0; |
384 | 384 | ||
385 | if (unlikely(compound_order(page) != order)) { | 385 | if (unlikely(compound_order(page) != order)) { |
386 | bad_page(page); | 386 | bad_page(page, "wrong compound order", 0); |
387 | bad++; | 387 | bad++; |
388 | } | 388 | } |
389 | 389 | ||
@@ -392,8 +392,11 @@ static int destroy_compound_page(struct page *page, unsigned long order) | |||
392 | for (i = 1; i < nr_pages; i++) { | 392 | for (i = 1; i < nr_pages; i++) { |
393 | struct page *p = page + i; | 393 | struct page *p = page + i; |
394 | 394 | ||
395 | if (unlikely(!PageTail(p) || (p->first_page != page))) { | 395 | if (unlikely(!PageTail(p))) { |
396 | bad_page(page); | 396 | bad_page(page, "PageTail not set", 0); |
397 | bad++; | ||
398 | } else if (unlikely(p->first_page != page)) { | ||
399 | bad_page(page, "first_page not consistent", 0); | ||
397 | bad++; | 400 | bad++; |
398 | } | 401 | } |
399 | __ClearPageTail(p); | 402 | __ClearPageTail(p); |
@@ -506,12 +509,12 @@ static inline int page_is_buddy(struct page *page, struct page *buddy, | |||
506 | return 0; | 509 | return 0; |
507 | 510 | ||
508 | if (page_is_guard(buddy) && page_order(buddy) == order) { | 511 | if (page_is_guard(buddy) && page_order(buddy) == order) { |
509 | VM_BUG_ON(page_count(buddy) != 0); | 512 | VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy); |
510 | return 1; | 513 | return 1; |
511 | } | 514 | } |
512 | 515 | ||
513 | if (PageBuddy(buddy) && page_order(buddy) == order) { | 516 | if (PageBuddy(buddy) && page_order(buddy) == order) { |
514 | VM_BUG_ON(page_count(buddy) != 0); | 517 | VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy); |
515 | return 1; | 518 | return 1; |
516 | } | 519 | } |
517 | return 0; | 520 | return 0; |
@@ -561,8 +564,8 @@ static inline void __free_one_page(struct page *page, | |||
561 | 564 | ||
562 | page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1); | 565 | page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1); |
563 | 566 | ||
564 | VM_BUG_ON(page_idx & ((1 << order) - 1)); | 567 | VM_BUG_ON_PAGE(page_idx & ((1 << order) - 1), page); |
565 | VM_BUG_ON(bad_range(zone, page)); | 568 | VM_BUG_ON_PAGE(bad_range(zone, page), page); |
566 | 569 | ||
567 | while (order < MAX_ORDER-1) { | 570 | while (order < MAX_ORDER-1) { |
568 | buddy_idx = __find_buddy_index(page_idx, order); | 571 | buddy_idx = __find_buddy_index(page_idx, order); |
@@ -618,12 +621,23 @@ out: | |||
618 | 621 | ||
619 | static inline int free_pages_check(struct page *page) | 622 | static inline int free_pages_check(struct page *page) |
620 | { | 623 | { |
621 | if (unlikely(page_mapcount(page) | | 624 | char *bad_reason = NULL; |
622 | (page->mapping != NULL) | | 625 | unsigned long bad_flags = 0; |
623 | (atomic_read(&page->_count) != 0) | | 626 | |
624 | (page->flags & PAGE_FLAGS_CHECK_AT_FREE) | | 627 | if (unlikely(page_mapcount(page))) |
625 | (mem_cgroup_bad_page_check(page)))) { | 628 | bad_reason = "nonzero mapcount"; |
626 | bad_page(page); | 629 | if (unlikely(page->mapping != NULL)) |
630 | bad_reason = "non-NULL mapping"; | ||
631 | if (unlikely(atomic_read(&page->_count) != 0)) | ||
632 | bad_reason = "nonzero _count"; | ||
633 | if (unlikely(page->flags & PAGE_FLAGS_CHECK_AT_FREE)) { | ||
634 | bad_reason = "PAGE_FLAGS_CHECK_AT_FREE flag(s) set"; | ||
635 | bad_flags = PAGE_FLAGS_CHECK_AT_FREE; | ||
636 | } | ||
637 | if (unlikely(mem_cgroup_bad_page_check(page))) | ||
638 | bad_reason = "cgroup check failed"; | ||
639 | if (unlikely(bad_reason)) { | ||
640 | bad_page(page, bad_reason, bad_flags); | ||
627 | return 1; | 641 | return 1; |
628 | } | 642 | } |
629 | page_cpupid_reset_last(page); | 643 | page_cpupid_reset_last(page); |
@@ -813,7 +827,7 @@ static inline void expand(struct zone *zone, struct page *page, | |||
813 | area--; | 827 | area--; |
814 | high--; | 828 | high--; |
815 | size >>= 1; | 829 | size >>= 1; |
816 | VM_BUG_ON(bad_range(zone, &page[size])); | 830 | VM_BUG_ON_PAGE(bad_range(zone, &page[size]), &page[size]); |
817 | 831 | ||
818 | #ifdef CONFIG_DEBUG_PAGEALLOC | 832 | #ifdef CONFIG_DEBUG_PAGEALLOC |
819 | if (high < debug_guardpage_minorder()) { | 833 | if (high < debug_guardpage_minorder()) { |
@@ -843,12 +857,23 @@ static inline void expand(struct zone *zone, struct page *page, | |||
843 | */ | 857 | */ |
844 | static inline int check_new_page(struct page *page) | 858 | static inline int check_new_page(struct page *page) |
845 | { | 859 | { |
846 | if (unlikely(page_mapcount(page) | | 860 | char *bad_reason = NULL; |
847 | (page->mapping != NULL) | | 861 | unsigned long bad_flags = 0; |
848 | (atomic_read(&page->_count) != 0) | | 862 | |
849 | (page->flags & PAGE_FLAGS_CHECK_AT_PREP) | | 863 | if (unlikely(page_mapcount(page))) |
850 | (mem_cgroup_bad_page_check(page)))) { | 864 | bad_reason = "nonzero mapcount"; |
851 | bad_page(page); | 865 | if (unlikely(page->mapping != NULL)) |
866 | bad_reason = "non-NULL mapping"; | ||
867 | if (unlikely(atomic_read(&page->_count) != 0)) | ||
868 | bad_reason = "nonzero _count"; | ||
869 | if (unlikely(page->flags & PAGE_FLAGS_CHECK_AT_PREP)) { | ||
870 | bad_reason = "PAGE_FLAGS_CHECK_AT_PREP flag set"; | ||
871 | bad_flags = PAGE_FLAGS_CHECK_AT_PREP; | ||
872 | } | ||
873 | if (unlikely(mem_cgroup_bad_page_check(page))) | ||
874 | bad_reason = "cgroup check failed"; | ||
875 | if (unlikely(bad_reason)) { | ||
876 | bad_page(page, bad_reason, bad_flags); | ||
852 | return 1; | 877 | return 1; |
853 | } | 878 | } |
854 | return 0; | 879 | return 0; |
@@ -955,7 +980,7 @@ int move_freepages(struct zone *zone, | |||
955 | 980 | ||
956 | for (page = start_page; page <= end_page;) { | 981 | for (page = start_page; page <= end_page;) { |
957 | /* Make sure we are not inadvertently changing nodes */ | 982 | /* Make sure we are not inadvertently changing nodes */ |
958 | VM_BUG_ON(page_to_nid(page) != zone_to_nid(zone)); | 983 | VM_BUG_ON_PAGE(page_to_nid(page) != zone_to_nid(zone), page); |
959 | 984 | ||
960 | if (!pfn_valid_within(page_to_pfn(page))) { | 985 | if (!pfn_valid_within(page_to_pfn(page))) { |
961 | page++; | 986 | page++; |
@@ -1404,8 +1429,8 @@ void split_page(struct page *page, unsigned int order) | |||
1404 | { | 1429 | { |
1405 | int i; | 1430 | int i; |
1406 | 1431 | ||
1407 | VM_BUG_ON(PageCompound(page)); | 1432 | VM_BUG_ON_PAGE(PageCompound(page), page); |
1408 | VM_BUG_ON(!page_count(page)); | 1433 | VM_BUG_ON_PAGE(!page_count(page), page); |
1409 | 1434 | ||
1410 | #ifdef CONFIG_KMEMCHECK | 1435 | #ifdef CONFIG_KMEMCHECK |
1411 | /* | 1436 | /* |
@@ -1552,7 +1577,7 @@ again: | |||
1552 | zone_statistics(preferred_zone, zone, gfp_flags); | 1577 | zone_statistics(preferred_zone, zone, gfp_flags); |
1553 | local_irq_restore(flags); | 1578 | local_irq_restore(flags); |
1554 | 1579 | ||
1555 | VM_BUG_ON(bad_range(zone, page)); | 1580 | VM_BUG_ON_PAGE(bad_range(zone, page), page); |
1556 | if (prep_new_page(page, order, gfp_flags)) | 1581 | if (prep_new_page(page, order, gfp_flags)) |
1557 | goto again; | 1582 | goto again; |
1558 | return page; | 1583 | return page; |
@@ -2072,13 +2097,6 @@ void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...) | |||
2072 | return; | 2097 | return; |
2073 | 2098 | ||
2074 | /* | 2099 | /* |
2075 | * Walking all memory to count page types is very expensive and should | ||
2076 | * be inhibited in non-blockable contexts. | ||
2077 | */ | ||
2078 | if (!(gfp_mask & __GFP_WAIT)) | ||
2079 | filter |= SHOW_MEM_FILTER_PAGE_COUNT; | ||
2080 | |||
2081 | /* | ||
2082 | * This documents exceptions given to allocations in certain | 2100 | * This documents exceptions given to allocations in certain |
2083 | * contexts that are allowed to allocate outside current's set | 2101 | * contexts that are allowed to allocate outside current's set |
2084 | * of allowed nodes. | 2102 | * of allowed nodes. |
@@ -2242,10 +2260,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, | |||
2242 | preferred_zone, migratetype); | 2260 | preferred_zone, migratetype); |
2243 | if (page) { | 2261 | if (page) { |
2244 | preferred_zone->compact_blockskip_flush = false; | 2262 | preferred_zone->compact_blockskip_flush = false; |
2245 | preferred_zone->compact_considered = 0; | 2263 | compaction_defer_reset(preferred_zone, order, true); |
2246 | preferred_zone->compact_defer_shift = 0; | ||
2247 | if (order >= preferred_zone->compact_order_failed) | ||
2248 | preferred_zone->compact_order_failed = order + 1; | ||
2249 | count_vm_event(COMPACTSUCCESS); | 2264 | count_vm_event(COMPACTSUCCESS); |
2250 | return page; | 2265 | return page; |
2251 | } | 2266 | } |
@@ -2535,8 +2550,15 @@ rebalance: | |||
2535 | } | 2550 | } |
2536 | 2551 | ||
2537 | /* Atomic allocations - we can't balance anything */ | 2552 | /* Atomic allocations - we can't balance anything */ |
2538 | if (!wait) | 2553 | if (!wait) { |
2554 | /* | ||
2555 | * All existing users of the deprecated __GFP_NOFAIL are | ||
2556 | * blockable, so warn of any new users that actually allow this | ||
2557 | * type of allocation to fail. | ||
2558 | */ | ||
2559 | WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL); | ||
2539 | goto nopage; | 2560 | goto nopage; |
2561 | } | ||
2540 | 2562 | ||
2541 | /* Avoid recursion of direct reclaim */ | 2563 | /* Avoid recursion of direct reclaim */ |
2542 | if (current->flags & PF_MEMALLOC) | 2564 | if (current->flags & PF_MEMALLOC) |
@@ -3901,6 +3923,7 @@ static void setup_zone_migrate_reserve(struct zone *zone) | |||
3901 | struct page *page; | 3923 | struct page *page; |
3902 | unsigned long block_migratetype; | 3924 | unsigned long block_migratetype; |
3903 | int reserve; | 3925 | int reserve; |
3926 | int old_reserve; | ||
3904 | 3927 | ||
3905 | /* | 3928 | /* |
3906 | * Get the start pfn, end pfn and the number of blocks to reserve | 3929 | * Get the start pfn, end pfn and the number of blocks to reserve |
@@ -3922,6 +3945,12 @@ static void setup_zone_migrate_reserve(struct zone *zone) | |||
3922 | * future allocation of hugepages at runtime. | 3945 | * future allocation of hugepages at runtime. |
3923 | */ | 3946 | */ |
3924 | reserve = min(2, reserve); | 3947 | reserve = min(2, reserve); |
3948 | old_reserve = zone->nr_migrate_reserve_block; | ||
3949 | |||
3950 | /* When memory hot-add, we almost always need to do nothing */ | ||
3951 | if (reserve == old_reserve) | ||
3952 | return; | ||
3953 | zone->nr_migrate_reserve_block = reserve; | ||
3925 | 3954 | ||
3926 | for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) { | 3955 | for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) { |
3927 | if (!pfn_valid(pfn)) | 3956 | if (!pfn_valid(pfn)) |
@@ -3959,6 +3988,12 @@ static void setup_zone_migrate_reserve(struct zone *zone) | |||
3959 | reserve--; | 3988 | reserve--; |
3960 | continue; | 3989 | continue; |
3961 | } | 3990 | } |
3991 | } else if (!old_reserve) { | ||
3992 | /* | ||
3993 | * At boot time we don't need to scan the whole zone | ||
3994 | * for turning off MIGRATE_RESERVE. | ||
3995 | */ | ||
3996 | break; | ||
3962 | } | 3997 | } |
3963 | 3998 | ||
3964 | /* | 3999 | /* |
@@ -4209,7 +4244,6 @@ static noinline __init_refok | |||
4209 | int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages) | 4244 | int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages) |
4210 | { | 4245 | { |
4211 | int i; | 4246 | int i; |
4212 | struct pglist_data *pgdat = zone->zone_pgdat; | ||
4213 | size_t alloc_size; | 4247 | size_t alloc_size; |
4214 | 4248 | ||
4215 | /* | 4249 | /* |
@@ -4225,7 +4259,8 @@ int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages) | |||
4225 | 4259 | ||
4226 | if (!slab_is_available()) { | 4260 | if (!slab_is_available()) { |
4227 | zone->wait_table = (wait_queue_head_t *) | 4261 | zone->wait_table = (wait_queue_head_t *) |
4228 | alloc_bootmem_node_nopanic(pgdat, alloc_size); | 4262 | memblock_virt_alloc_node_nopanic( |
4263 | alloc_size, zone->zone_pgdat->node_id); | ||
4229 | } else { | 4264 | } else { |
4230 | /* | 4265 | /* |
4231 | * This case means that a zone whose size was 0 gets new memory | 4266 | * This case means that a zone whose size was 0 gets new memory |
@@ -4345,13 +4380,14 @@ bool __meminit early_pfn_in_nid(unsigned long pfn, int node) | |||
4345 | #endif | 4380 | #endif |
4346 | 4381 | ||
4347 | /** | 4382 | /** |
4348 | * free_bootmem_with_active_regions - Call free_bootmem_node for each active range | 4383 | * free_bootmem_with_active_regions - Call memblock_free_early_nid for each active range |
4349 | * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed. | 4384 | * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed. |
4350 | * @max_low_pfn: The highest PFN that will be passed to free_bootmem_node | 4385 | * @max_low_pfn: The highest PFN that will be passed to memblock_free_early_nid |
4351 | * | 4386 | * |
4352 | * If an architecture guarantees that all ranges registered with | 4387 | * If an architecture guarantees that all ranges registered with |
4353 | * add_active_ranges() contain no holes and may be freed, this | 4388 | * add_active_ranges() contain no holes and may be freed, this |
4354 | * this function may be used instead of calling free_bootmem() manually. | 4389 | * this function may be used instead of calling memblock_free_early_nid() |
4390 | * manually. | ||
4355 | */ | 4391 | */ |
4356 | void __init free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn) | 4392 | void __init free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn) |
4357 | { | 4393 | { |
@@ -4363,9 +4399,9 @@ void __init free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn) | |||
4363 | end_pfn = min(end_pfn, max_low_pfn); | 4399 | end_pfn = min(end_pfn, max_low_pfn); |
4364 | 4400 | ||
4365 | if (start_pfn < end_pfn) | 4401 | if (start_pfn < end_pfn) |
4366 | free_bootmem_node(NODE_DATA(this_nid), | 4402 | memblock_free_early_nid(PFN_PHYS(start_pfn), |
4367 | PFN_PHYS(start_pfn), | 4403 | (end_pfn - start_pfn) << PAGE_SHIFT, |
4368 | (end_pfn - start_pfn) << PAGE_SHIFT); | 4404 | this_nid); |
4369 | } | 4405 | } |
4370 | } | 4406 | } |
4371 | 4407 | ||
@@ -4636,8 +4672,9 @@ static void __init setup_usemap(struct pglist_data *pgdat, | |||
4636 | unsigned long usemapsize = usemap_size(zone_start_pfn, zonesize); | 4672 | unsigned long usemapsize = usemap_size(zone_start_pfn, zonesize); |
4637 | zone->pageblock_flags = NULL; | 4673 | zone->pageblock_flags = NULL; |
4638 | if (usemapsize) | 4674 | if (usemapsize) |
4639 | zone->pageblock_flags = alloc_bootmem_node_nopanic(pgdat, | 4675 | zone->pageblock_flags = |
4640 | usemapsize); | 4676 | memblock_virt_alloc_node_nopanic(usemapsize, |
4677 | pgdat->node_id); | ||
4641 | } | 4678 | } |
4642 | #else | 4679 | #else |
4643 | static inline void setup_usemap(struct pglist_data *pgdat, struct zone *zone, | 4680 | static inline void setup_usemap(struct pglist_data *pgdat, struct zone *zone, |
@@ -4831,7 +4868,8 @@ static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat) | |||
4831 | size = (end - start) * sizeof(struct page); | 4868 | size = (end - start) * sizeof(struct page); |
4832 | map = alloc_remap(pgdat->node_id, size); | 4869 | map = alloc_remap(pgdat->node_id, size); |
4833 | if (!map) | 4870 | if (!map) |
4834 | map = alloc_bootmem_node_nopanic(pgdat, size); | 4871 | map = memblock_virt_alloc_node_nopanic(size, |
4872 | pgdat->node_id); | ||
4835 | pgdat->node_mem_map = map + (pgdat->node_start_pfn - start); | 4873 | pgdat->node_mem_map = map + (pgdat->node_start_pfn - start); |
4836 | } | 4874 | } |
4837 | #ifndef CONFIG_NEED_MULTIPLE_NODES | 4875 | #ifndef CONFIG_NEED_MULTIPLE_NODES |
@@ -5012,9 +5050,33 @@ static void __init find_zone_movable_pfns_for_nodes(void) | |||
5012 | nodemask_t saved_node_state = node_states[N_MEMORY]; | 5050 | nodemask_t saved_node_state = node_states[N_MEMORY]; |
5013 | unsigned long totalpages = early_calculate_totalpages(); | 5051 | unsigned long totalpages = early_calculate_totalpages(); |
5014 | int usable_nodes = nodes_weight(node_states[N_MEMORY]); | 5052 | int usable_nodes = nodes_weight(node_states[N_MEMORY]); |
5053 | struct memblock_type *type = &memblock.memory; | ||
5054 | |||
5055 | /* Need to find movable_zone earlier when movable_node is specified. */ | ||
5056 | find_usable_zone_for_movable(); | ||
5057 | |||
5058 | /* | ||
5059 | * If movable_node is specified, ignore kernelcore and movablecore | ||
5060 | * options. | ||
5061 | */ | ||
5062 | if (movable_node_is_enabled()) { | ||
5063 | for (i = 0; i < type->cnt; i++) { | ||
5064 | if (!memblock_is_hotpluggable(&type->regions[i])) | ||
5065 | continue; | ||
5066 | |||
5067 | nid = type->regions[i].nid; | ||
5068 | |||
5069 | usable_startpfn = PFN_DOWN(type->regions[i].base); | ||
5070 | zone_movable_pfn[nid] = zone_movable_pfn[nid] ? | ||
5071 | min(usable_startpfn, zone_movable_pfn[nid]) : | ||
5072 | usable_startpfn; | ||
5073 | } | ||
5074 | |||
5075 | goto out2; | ||
5076 | } | ||
5015 | 5077 | ||
5016 | /* | 5078 | /* |
5017 | * If movablecore was specified, calculate what size of | 5079 | * If movablecore=nn[KMG] was specified, calculate what size of |
5018 | * kernelcore that corresponds so that memory usable for | 5080 | * kernelcore that corresponds so that memory usable for |
5019 | * any allocation type is evenly spread. If both kernelcore | 5081 | * any allocation type is evenly spread. If both kernelcore |
5020 | * and movablecore are specified, then the value of kernelcore | 5082 | * and movablecore are specified, then the value of kernelcore |
@@ -5040,7 +5102,6 @@ static void __init find_zone_movable_pfns_for_nodes(void) | |||
5040 | goto out; | 5102 | goto out; |
5041 | 5103 | ||
5042 | /* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */ | 5104 | /* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */ |
5043 | find_usable_zone_for_movable(); | ||
5044 | usable_startpfn = arch_zone_lowest_possible_pfn[movable_zone]; | 5105 | usable_startpfn = arch_zone_lowest_possible_pfn[movable_zone]; |
5045 | 5106 | ||
5046 | restart: | 5107 | restart: |
@@ -5131,6 +5192,7 @@ restart: | |||
5131 | if (usable_nodes && required_kernelcore > usable_nodes) | 5192 | if (usable_nodes && required_kernelcore > usable_nodes) |
5132 | goto restart; | 5193 | goto restart; |
5133 | 5194 | ||
5195 | out2: | ||
5134 | /* Align start of ZONE_MOVABLE on all nids to MAX_ORDER_NR_PAGES */ | 5196 | /* Align start of ZONE_MOVABLE on all nids to MAX_ORDER_NR_PAGES */ |
5135 | for (nid = 0; nid < MAX_NUMNODES; nid++) | 5197 | for (nid = 0; nid < MAX_NUMNODES; nid++) |
5136 | zone_movable_pfn[nid] = | 5198 | zone_movable_pfn[nid] = |
@@ -5692,7 +5754,12 @@ module_init(init_per_zone_wmark_min) | |||
5692 | int min_free_kbytes_sysctl_handler(ctl_table *table, int write, | 5754 | int min_free_kbytes_sysctl_handler(ctl_table *table, int write, |
5693 | void __user *buffer, size_t *length, loff_t *ppos) | 5755 | void __user *buffer, size_t *length, loff_t *ppos) |
5694 | { | 5756 | { |
5695 | proc_dointvec(table, write, buffer, length, ppos); | 5757 | int rc; |
5758 | |||
5759 | rc = proc_dointvec_minmax(table, write, buffer, length, ppos); | ||
5760 | if (rc) | ||
5761 | return rc; | ||
5762 | |||
5696 | if (write) { | 5763 | if (write) { |
5697 | user_min_free_kbytes = min_free_kbytes; | 5764 | user_min_free_kbytes = min_free_kbytes; |
5698 | setup_per_zone_wmarks(); | 5765 | setup_per_zone_wmarks(); |
@@ -5857,7 +5924,7 @@ void *__init alloc_large_system_hash(const char *tablename, | |||
5857 | do { | 5924 | do { |
5858 | size = bucketsize << log2qty; | 5925 | size = bucketsize << log2qty; |
5859 | if (flags & HASH_EARLY) | 5926 | if (flags & HASH_EARLY) |
5860 | table = alloc_bootmem_nopanic(size); | 5927 | table = memblock_virt_alloc_nopanic(size, 0); |
5861 | else if (hashdist) | 5928 | else if (hashdist) |
5862 | table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL); | 5929 | table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL); |
5863 | else { | 5930 | else { |
@@ -5959,7 +6026,7 @@ void set_pageblock_flags_group(struct page *page, unsigned long flags, | |||
5959 | pfn = page_to_pfn(page); | 6026 | pfn = page_to_pfn(page); |
5960 | bitmap = get_pageblock_bitmap(zone, pfn); | 6027 | bitmap = get_pageblock_bitmap(zone, pfn); |
5961 | bitidx = pfn_to_bitidx(zone, pfn); | 6028 | bitidx = pfn_to_bitidx(zone, pfn); |
5962 | VM_BUG_ON(!zone_spans_pfn(zone, pfn)); | 6029 | VM_BUG_ON_PAGE(!zone_spans_pfn(zone, pfn), page); |
5963 | 6030 | ||
5964 | for (; start_bitidx <= end_bitidx; start_bitidx++, value <<= 1) | 6031 | for (; start_bitidx <= end_bitidx; start_bitidx++, value <<= 1) |
5965 | if (flags & value) | 6032 | if (flags & value) |
@@ -6457,12 +6524,24 @@ static void dump_page_flags(unsigned long flags) | |||
6457 | printk(")\n"); | 6524 | printk(")\n"); |
6458 | } | 6525 | } |
6459 | 6526 | ||
6460 | void dump_page(struct page *page) | 6527 | void dump_page_badflags(struct page *page, char *reason, unsigned long badflags) |
6461 | { | 6528 | { |
6462 | printk(KERN_ALERT | 6529 | printk(KERN_ALERT |
6463 | "page:%p count:%d mapcount:%d mapping:%p index:%#lx\n", | 6530 | "page:%p count:%d mapcount:%d mapping:%p index:%#lx\n", |
6464 | page, atomic_read(&page->_count), page_mapcount(page), | 6531 | page, atomic_read(&page->_count), page_mapcount(page), |
6465 | page->mapping, page->index); | 6532 | page->mapping, page->index); |
6466 | dump_page_flags(page->flags); | 6533 | dump_page_flags(page->flags); |
6534 | if (reason) | ||
6535 | pr_alert("page dumped because: %s\n", reason); | ||
6536 | if (page->flags & badflags) { | ||
6537 | pr_alert("bad because of flags:\n"); | ||
6538 | dump_page_flags(page->flags & badflags); | ||
6539 | } | ||
6467 | mem_cgroup_print_bad_page(page); | 6540 | mem_cgroup_print_bad_page(page); |
6468 | } | 6541 | } |
6542 | |||
6543 | void dump_page(struct page *page, char *reason) | ||
6544 | { | ||
6545 | dump_page_badflags(page, reason, 0); | ||
6546 | } | ||
6547 | EXPORT_SYMBOL_GPL(dump_page); | ||
diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c index 6d757e3a872a..cfd162882c00 100644 --- a/mm/page_cgroup.c +++ b/mm/page_cgroup.c | |||
@@ -54,8 +54,9 @@ static int __init alloc_node_page_cgroup(int nid) | |||
54 | 54 | ||
55 | table_size = sizeof(struct page_cgroup) * nr_pages; | 55 | table_size = sizeof(struct page_cgroup) * nr_pages; |
56 | 56 | ||
57 | base = __alloc_bootmem_node_nopanic(NODE_DATA(nid), | 57 | base = memblock_virt_alloc_try_nid_nopanic( |
58 | table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS)); | 58 | table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS), |
59 | BOOTMEM_ALLOC_ACCESSIBLE, nid); | ||
59 | if (!base) | 60 | if (!base) |
60 | return -ENOMEM; | 61 | return -ENOMEM; |
61 | NODE_DATA(nid)->node_page_cgroup = base; | 62 | NODE_DATA(nid)->node_page_cgroup = base; |
@@ -451,7 +452,7 @@ unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id) | |||
451 | * lookup_swap_cgroup_id - lookup mem_cgroup id tied to swap entry | 452 | * lookup_swap_cgroup_id - lookup mem_cgroup id tied to swap entry |
452 | * @ent: swap entry to be looked up. | 453 | * @ent: swap entry to be looked up. |
453 | * | 454 | * |
454 | * Returns CSS ID of mem_cgroup at success. 0 at failure. (0 is invalid ID) | 455 | * Returns ID of mem_cgroup at success. 0 at failure. (0 is invalid ID) |
455 | */ | 456 | */ |
456 | unsigned short lookup_swap_cgroup_id(swp_entry_t ent) | 457 | unsigned short lookup_swap_cgroup_id(swp_entry_t ent) |
457 | { | 458 | { |
diff --git a/mm/page_io.c b/mm/page_io.c index 8c79a4764be0..7c59ef681381 100644 --- a/mm/page_io.c +++ b/mm/page_io.c | |||
@@ -31,13 +31,13 @@ static struct bio *get_swap_bio(gfp_t gfp_flags, | |||
31 | 31 | ||
32 | bio = bio_alloc(gfp_flags, 1); | 32 | bio = bio_alloc(gfp_flags, 1); |
33 | if (bio) { | 33 | if (bio) { |
34 | bio->bi_sector = map_swap_page(page, &bio->bi_bdev); | 34 | bio->bi_iter.bi_sector = map_swap_page(page, &bio->bi_bdev); |
35 | bio->bi_sector <<= PAGE_SHIFT - 9; | 35 | bio->bi_iter.bi_sector <<= PAGE_SHIFT - 9; |
36 | bio->bi_io_vec[0].bv_page = page; | 36 | bio->bi_io_vec[0].bv_page = page; |
37 | bio->bi_io_vec[0].bv_len = PAGE_SIZE; | 37 | bio->bi_io_vec[0].bv_len = PAGE_SIZE; |
38 | bio->bi_io_vec[0].bv_offset = 0; | 38 | bio->bi_io_vec[0].bv_offset = 0; |
39 | bio->bi_vcnt = 1; | 39 | bio->bi_vcnt = 1; |
40 | bio->bi_size = PAGE_SIZE; | 40 | bio->bi_iter.bi_size = PAGE_SIZE; |
41 | bio->bi_end_io = end_io; | 41 | bio->bi_end_io = end_io; |
42 | } | 42 | } |
43 | return bio; | 43 | return bio; |
@@ -62,7 +62,7 @@ void end_swap_bio_write(struct bio *bio, int err) | |||
62 | printk(KERN_ALERT "Write-error on swap-device (%u:%u:%Lu)\n", | 62 | printk(KERN_ALERT "Write-error on swap-device (%u:%u:%Lu)\n", |
63 | imajor(bio->bi_bdev->bd_inode), | 63 | imajor(bio->bi_bdev->bd_inode), |
64 | iminor(bio->bi_bdev->bd_inode), | 64 | iminor(bio->bi_bdev->bd_inode), |
65 | (unsigned long long)bio->bi_sector); | 65 | (unsigned long long)bio->bi_iter.bi_sector); |
66 | ClearPageReclaim(page); | 66 | ClearPageReclaim(page); |
67 | } | 67 | } |
68 | end_page_writeback(page); | 68 | end_page_writeback(page); |
@@ -80,7 +80,7 @@ void end_swap_bio_read(struct bio *bio, int err) | |||
80 | printk(KERN_ALERT "Read-error on swap-device (%u:%u:%Lu)\n", | 80 | printk(KERN_ALERT "Read-error on swap-device (%u:%u:%Lu)\n", |
81 | imajor(bio->bi_bdev->bd_inode), | 81 | imajor(bio->bi_bdev->bd_inode), |
82 | iminor(bio->bi_bdev->bd_inode), | 82 | iminor(bio->bi_bdev->bd_inode), |
83 | (unsigned long long)bio->bi_sector); | 83 | (unsigned long long)bio->bi_iter.bi_sector); |
84 | goto out; | 84 | goto out; |
85 | } | 85 | } |
86 | 86 | ||
@@ -320,8 +320,8 @@ int swap_readpage(struct page *page) | |||
320 | int ret = 0; | 320 | int ret = 0; |
321 | struct swap_info_struct *sis = page_swap_info(page); | 321 | struct swap_info_struct *sis = page_swap_info(page); |
322 | 322 | ||
323 | VM_BUG_ON(!PageLocked(page)); | 323 | VM_BUG_ON_PAGE(!PageLocked(page), page); |
324 | VM_BUG_ON(PageUptodate(page)); | 324 | VM_BUG_ON_PAGE(PageUptodate(page), page); |
325 | if (frontswap_load(page) == 0) { | 325 | if (frontswap_load(page) == 0) { |
326 | SetPageUptodate(page); | 326 | SetPageUptodate(page); |
327 | unlock_page(page); | 327 | unlock_page(page); |
diff --git a/mm/percpu.c b/mm/percpu.c index 0d10defe951e..036cfe07050f 100644 --- a/mm/percpu.c +++ b/mm/percpu.c | |||
@@ -1063,7 +1063,7 @@ struct pcpu_alloc_info * __init pcpu_alloc_alloc_info(int nr_groups, | |||
1063 | __alignof__(ai->groups[0].cpu_map[0])); | 1063 | __alignof__(ai->groups[0].cpu_map[0])); |
1064 | ai_size = base_size + nr_units * sizeof(ai->groups[0].cpu_map[0]); | 1064 | ai_size = base_size + nr_units * sizeof(ai->groups[0].cpu_map[0]); |
1065 | 1065 | ||
1066 | ptr = alloc_bootmem_nopanic(PFN_ALIGN(ai_size)); | 1066 | ptr = memblock_virt_alloc_nopanic(PFN_ALIGN(ai_size), 0); |
1067 | if (!ptr) | 1067 | if (!ptr) |
1068 | return NULL; | 1068 | return NULL; |
1069 | ai = ptr; | 1069 | ai = ptr; |
@@ -1088,7 +1088,7 @@ struct pcpu_alloc_info * __init pcpu_alloc_alloc_info(int nr_groups, | |||
1088 | */ | 1088 | */ |
1089 | void __init pcpu_free_alloc_info(struct pcpu_alloc_info *ai) | 1089 | void __init pcpu_free_alloc_info(struct pcpu_alloc_info *ai) |
1090 | { | 1090 | { |
1091 | free_bootmem(__pa(ai), ai->__ai_size); | 1091 | memblock_free_early(__pa(ai), ai->__ai_size); |
1092 | } | 1092 | } |
1093 | 1093 | ||
1094 | /** | 1094 | /** |
@@ -1246,10 +1246,12 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, | |||
1246 | PCPU_SETUP_BUG_ON(pcpu_verify_alloc_info(ai) < 0); | 1246 | PCPU_SETUP_BUG_ON(pcpu_verify_alloc_info(ai) < 0); |
1247 | 1247 | ||
1248 | /* process group information and build config tables accordingly */ | 1248 | /* process group information and build config tables accordingly */ |
1249 | group_offsets = alloc_bootmem(ai->nr_groups * sizeof(group_offsets[0])); | 1249 | group_offsets = memblock_virt_alloc(ai->nr_groups * |
1250 | group_sizes = alloc_bootmem(ai->nr_groups * sizeof(group_sizes[0])); | 1250 | sizeof(group_offsets[0]), 0); |
1251 | unit_map = alloc_bootmem(nr_cpu_ids * sizeof(unit_map[0])); | 1251 | group_sizes = memblock_virt_alloc(ai->nr_groups * |
1252 | unit_off = alloc_bootmem(nr_cpu_ids * sizeof(unit_off[0])); | 1252 | sizeof(group_sizes[0]), 0); |
1253 | unit_map = memblock_virt_alloc(nr_cpu_ids * sizeof(unit_map[0]), 0); | ||
1254 | unit_off = memblock_virt_alloc(nr_cpu_ids * sizeof(unit_off[0]), 0); | ||
1253 | 1255 | ||
1254 | for (cpu = 0; cpu < nr_cpu_ids; cpu++) | 1256 | for (cpu = 0; cpu < nr_cpu_ids; cpu++) |
1255 | unit_map[cpu] = UINT_MAX; | 1257 | unit_map[cpu] = UINT_MAX; |
@@ -1311,7 +1313,8 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, | |||
1311 | * empty chunks. | 1313 | * empty chunks. |
1312 | */ | 1314 | */ |
1313 | pcpu_nr_slots = __pcpu_size_to_slot(pcpu_unit_size) + 2; | 1315 | pcpu_nr_slots = __pcpu_size_to_slot(pcpu_unit_size) + 2; |
1314 | pcpu_slot = alloc_bootmem(pcpu_nr_slots * sizeof(pcpu_slot[0])); | 1316 | pcpu_slot = memblock_virt_alloc( |
1317 | pcpu_nr_slots * sizeof(pcpu_slot[0]), 0); | ||
1315 | for (i = 0; i < pcpu_nr_slots; i++) | 1318 | for (i = 0; i < pcpu_nr_slots; i++) |
1316 | INIT_LIST_HEAD(&pcpu_slot[i]); | 1319 | INIT_LIST_HEAD(&pcpu_slot[i]); |
1317 | 1320 | ||
@@ -1322,7 +1325,7 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, | |||
1322 | * covers static area + reserved area (mostly used for module | 1325 | * covers static area + reserved area (mostly used for module |
1323 | * static percpu allocation). | 1326 | * static percpu allocation). |
1324 | */ | 1327 | */ |
1325 | schunk = alloc_bootmem(pcpu_chunk_struct_size); | 1328 | schunk = memblock_virt_alloc(pcpu_chunk_struct_size, 0); |
1326 | INIT_LIST_HEAD(&schunk->list); | 1329 | INIT_LIST_HEAD(&schunk->list); |
1327 | schunk->base_addr = base_addr; | 1330 | schunk->base_addr = base_addr; |
1328 | schunk->map = smap; | 1331 | schunk->map = smap; |
@@ -1346,7 +1349,7 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, | |||
1346 | 1349 | ||
1347 | /* init dynamic chunk if necessary */ | 1350 | /* init dynamic chunk if necessary */ |
1348 | if (dyn_size) { | 1351 | if (dyn_size) { |
1349 | dchunk = alloc_bootmem(pcpu_chunk_struct_size); | 1352 | dchunk = memblock_virt_alloc(pcpu_chunk_struct_size, 0); |
1350 | INIT_LIST_HEAD(&dchunk->list); | 1353 | INIT_LIST_HEAD(&dchunk->list); |
1351 | dchunk->base_addr = base_addr; | 1354 | dchunk->base_addr = base_addr; |
1352 | dchunk->map = dmap; | 1355 | dchunk->map = dmap; |
@@ -1626,7 +1629,7 @@ int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size, | |||
1626 | size_sum = ai->static_size + ai->reserved_size + ai->dyn_size; | 1629 | size_sum = ai->static_size + ai->reserved_size + ai->dyn_size; |
1627 | areas_size = PFN_ALIGN(ai->nr_groups * sizeof(void *)); | 1630 | areas_size = PFN_ALIGN(ai->nr_groups * sizeof(void *)); |
1628 | 1631 | ||
1629 | areas = alloc_bootmem_nopanic(areas_size); | 1632 | areas = memblock_virt_alloc_nopanic(areas_size, 0); |
1630 | if (!areas) { | 1633 | if (!areas) { |
1631 | rc = -ENOMEM; | 1634 | rc = -ENOMEM; |
1632 | goto out_free; | 1635 | goto out_free; |
@@ -1686,10 +1689,10 @@ int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size, | |||
1686 | max_distance += ai->unit_size; | 1689 | max_distance += ai->unit_size; |
1687 | 1690 | ||
1688 | /* warn if maximum distance is further than 75% of vmalloc space */ | 1691 | /* warn if maximum distance is further than 75% of vmalloc space */ |
1689 | if (max_distance > (VMALLOC_END - VMALLOC_START) * 3 / 4) { | 1692 | if (max_distance > VMALLOC_TOTAL * 3 / 4) { |
1690 | pr_warning("PERCPU: max_distance=0x%zx too large for vmalloc " | 1693 | pr_warning("PERCPU: max_distance=0x%zx too large for vmalloc " |
1691 | "space 0x%lx\n", max_distance, | 1694 | "space 0x%lx\n", max_distance, |
1692 | (unsigned long)(VMALLOC_END - VMALLOC_START)); | 1695 | VMALLOC_TOTAL); |
1693 | #ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK | 1696 | #ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK |
1694 | /* and fail if we have fallback */ | 1697 | /* and fail if we have fallback */ |
1695 | rc = -EINVAL; | 1698 | rc = -EINVAL; |
@@ -1712,7 +1715,7 @@ out_free_areas: | |||
1712 | out_free: | 1715 | out_free: |
1713 | pcpu_free_alloc_info(ai); | 1716 | pcpu_free_alloc_info(ai); |
1714 | if (areas) | 1717 | if (areas) |
1715 | free_bootmem(__pa(areas), areas_size); | 1718 | memblock_free_early(__pa(areas), areas_size); |
1716 | return rc; | 1719 | return rc; |
1717 | } | 1720 | } |
1718 | #endif /* BUILD_EMBED_FIRST_CHUNK */ | 1721 | #endif /* BUILD_EMBED_FIRST_CHUNK */ |
@@ -1760,7 +1763,7 @@ int __init pcpu_page_first_chunk(size_t reserved_size, | |||
1760 | /* unaligned allocations can't be freed, round up to page size */ | 1763 | /* unaligned allocations can't be freed, round up to page size */ |
1761 | pages_size = PFN_ALIGN(unit_pages * num_possible_cpus() * | 1764 | pages_size = PFN_ALIGN(unit_pages * num_possible_cpus() * |
1762 | sizeof(pages[0])); | 1765 | sizeof(pages[0])); |
1763 | pages = alloc_bootmem(pages_size); | 1766 | pages = memblock_virt_alloc(pages_size, 0); |
1764 | 1767 | ||
1765 | /* allocate pages */ | 1768 | /* allocate pages */ |
1766 | j = 0; | 1769 | j = 0; |
@@ -1823,7 +1826,7 @@ enomem: | |||
1823 | free_fn(page_address(pages[j]), PAGE_SIZE); | 1826 | free_fn(page_address(pages[j]), PAGE_SIZE); |
1824 | rc = -ENOMEM; | 1827 | rc = -ENOMEM; |
1825 | out_free_ar: | 1828 | out_free_ar: |
1826 | free_bootmem(__pa(pages), pages_size); | 1829 | memblock_free_early(__pa(pages), pages_size); |
1827 | pcpu_free_alloc_info(ai); | 1830 | pcpu_free_alloc_info(ai); |
1828 | return rc; | 1831 | return rc; |
1829 | } | 1832 | } |
@@ -1848,12 +1851,13 @@ EXPORT_SYMBOL(__per_cpu_offset); | |||
1848 | static void * __init pcpu_dfl_fc_alloc(unsigned int cpu, size_t size, | 1851 | static void * __init pcpu_dfl_fc_alloc(unsigned int cpu, size_t size, |
1849 | size_t align) | 1852 | size_t align) |
1850 | { | 1853 | { |
1851 | return __alloc_bootmem_nopanic(size, align, __pa(MAX_DMA_ADDRESS)); | 1854 | return memblock_virt_alloc_from_nopanic( |
1855 | size, align, __pa(MAX_DMA_ADDRESS)); | ||
1852 | } | 1856 | } |
1853 | 1857 | ||
1854 | static void __init pcpu_dfl_fc_free(void *ptr, size_t size) | 1858 | static void __init pcpu_dfl_fc_free(void *ptr, size_t size) |
1855 | { | 1859 | { |
1856 | free_bootmem(__pa(ptr), size); | 1860 | memblock_free_early(__pa(ptr), size); |
1857 | } | 1861 | } |
1858 | 1862 | ||
1859 | void __init setup_per_cpu_areas(void) | 1863 | void __init setup_per_cpu_areas(void) |
@@ -1896,7 +1900,9 @@ void __init setup_per_cpu_areas(void) | |||
1896 | void *fc; | 1900 | void *fc; |
1897 | 1901 | ||
1898 | ai = pcpu_alloc_alloc_info(1, 1); | 1902 | ai = pcpu_alloc_alloc_info(1, 1); |
1899 | fc = __alloc_bootmem(unit_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS)); | 1903 | fc = memblock_virt_alloc_from_nopanic(unit_size, |
1904 | PAGE_SIZE, | ||
1905 | __pa(MAX_DMA_ADDRESS)); | ||
1900 | if (!ai || !fc) | 1906 | if (!ai || !fc) |
1901 | panic("Failed to allocate memory for percpu areas."); | 1907 | panic("Failed to allocate memory for percpu areas."); |
1902 | /* kmemleak tracks the percpu allocations separately */ | 1908 | /* kmemleak tracks the percpu allocations separately */ |
diff --git a/mm/readahead.c b/mm/readahead.c index 7cdbb44aa90b..0de2360d65f3 100644 --- a/mm/readahead.c +++ b/mm/readahead.c | |||
@@ -211,8 +211,6 @@ out: | |||
211 | int force_page_cache_readahead(struct address_space *mapping, struct file *filp, | 211 | int force_page_cache_readahead(struct address_space *mapping, struct file *filp, |
212 | pgoff_t offset, unsigned long nr_to_read) | 212 | pgoff_t offset, unsigned long nr_to_read) |
213 | { | 213 | { |
214 | int ret = 0; | ||
215 | |||
216 | if (unlikely(!mapping->a_ops->readpage && !mapping->a_ops->readpages)) | 214 | if (unlikely(!mapping->a_ops->readpage && !mapping->a_ops->readpages)) |
217 | return -EINVAL; | 215 | return -EINVAL; |
218 | 216 | ||
@@ -226,15 +224,13 @@ int force_page_cache_readahead(struct address_space *mapping, struct file *filp, | |||
226 | this_chunk = nr_to_read; | 224 | this_chunk = nr_to_read; |
227 | err = __do_page_cache_readahead(mapping, filp, | 225 | err = __do_page_cache_readahead(mapping, filp, |
228 | offset, this_chunk, 0); | 226 | offset, this_chunk, 0); |
229 | if (err < 0) { | 227 | if (err < 0) |
230 | ret = err; | 228 | return err; |
231 | break; | 229 | |
232 | } | ||
233 | ret += err; | ||
234 | offset += this_chunk; | 230 | offset += this_chunk; |
235 | nr_to_read -= this_chunk; | 231 | nr_to_read -= this_chunk; |
236 | } | 232 | } |
237 | return ret; | 233 | return 0; |
238 | } | 234 | } |
239 | 235 | ||
240 | /* | 236 | /* |
@@ -576,8 +572,7 @@ do_readahead(struct address_space *mapping, struct file *filp, | |||
576 | if (!mapping || !mapping->a_ops) | 572 | if (!mapping || !mapping->a_ops) |
577 | return -EINVAL; | 573 | return -EINVAL; |
578 | 574 | ||
579 | force_page_cache_readahead(mapping, filp, index, nr); | 575 | return force_page_cache_readahead(mapping, filp, index, nr); |
580 | return 0; | ||
581 | } | 576 | } |
582 | 577 | ||
583 | SYSCALL_DEFINE3(readahead, int, fd, loff_t, offset, size_t, count) | 578 | SYSCALL_DEFINE3(readahead, int, fd, loff_t, offset, size_t, count) |
@@ -660,17 +660,22 @@ int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma) | |||
660 | return 1; | 660 | return 1; |
661 | } | 661 | } |
662 | 662 | ||
663 | struct page_referenced_arg { | ||
664 | int mapcount; | ||
665 | int referenced; | ||
666 | unsigned long vm_flags; | ||
667 | struct mem_cgroup *memcg; | ||
668 | }; | ||
663 | /* | 669 | /* |
664 | * Subfunctions of page_referenced: page_referenced_one called | 670 | * arg: page_referenced_arg will be passed |
665 | * repeatedly from either page_referenced_anon or page_referenced_file. | ||
666 | */ | 671 | */ |
667 | int page_referenced_one(struct page *page, struct vm_area_struct *vma, | 672 | int page_referenced_one(struct page *page, struct vm_area_struct *vma, |
668 | unsigned long address, unsigned int *mapcount, | 673 | unsigned long address, void *arg) |
669 | unsigned long *vm_flags) | ||
670 | { | 674 | { |
671 | struct mm_struct *mm = vma->vm_mm; | 675 | struct mm_struct *mm = vma->vm_mm; |
672 | spinlock_t *ptl; | 676 | spinlock_t *ptl; |
673 | int referenced = 0; | 677 | int referenced = 0; |
678 | struct page_referenced_arg *pra = arg; | ||
674 | 679 | ||
675 | if (unlikely(PageTransHuge(page))) { | 680 | if (unlikely(PageTransHuge(page))) { |
676 | pmd_t *pmd; | 681 | pmd_t *pmd; |
@@ -682,13 +687,12 @@ int page_referenced_one(struct page *page, struct vm_area_struct *vma, | |||
682 | pmd = page_check_address_pmd(page, mm, address, | 687 | pmd = page_check_address_pmd(page, mm, address, |
683 | PAGE_CHECK_ADDRESS_PMD_FLAG, &ptl); | 688 | PAGE_CHECK_ADDRESS_PMD_FLAG, &ptl); |
684 | if (!pmd) | 689 | if (!pmd) |
685 | goto out; | 690 | return SWAP_AGAIN; |
686 | 691 | ||
687 | if (vma->vm_flags & VM_LOCKED) { | 692 | if (vma->vm_flags & VM_LOCKED) { |
688 | spin_unlock(ptl); | 693 | spin_unlock(ptl); |
689 | *mapcount = 0; /* break early from loop */ | 694 | pra->vm_flags |= VM_LOCKED; |
690 | *vm_flags |= VM_LOCKED; | 695 | return SWAP_FAIL; /* To break the loop */ |
691 | goto out; | ||
692 | } | 696 | } |
693 | 697 | ||
694 | /* go ahead even if the pmd is pmd_trans_splitting() */ | 698 | /* go ahead even if the pmd is pmd_trans_splitting() */ |
@@ -704,13 +708,12 @@ int page_referenced_one(struct page *page, struct vm_area_struct *vma, | |||
704 | */ | 708 | */ |
705 | pte = page_check_address(page, mm, address, &ptl, 0); | 709 | pte = page_check_address(page, mm, address, &ptl, 0); |
706 | if (!pte) | 710 | if (!pte) |
707 | goto out; | 711 | return SWAP_AGAIN; |
708 | 712 | ||
709 | if (vma->vm_flags & VM_LOCKED) { | 713 | if (vma->vm_flags & VM_LOCKED) { |
710 | pte_unmap_unlock(pte, ptl); | 714 | pte_unmap_unlock(pte, ptl); |
711 | *mapcount = 0; /* break early from loop */ | 715 | pra->vm_flags |= VM_LOCKED; |
712 | *vm_flags |= VM_LOCKED; | 716 | return SWAP_FAIL; /* To break the loop */ |
713 | goto out; | ||
714 | } | 717 | } |
715 | 718 | ||
716 | if (ptep_clear_flush_young_notify(vma, address, pte)) { | 719 | if (ptep_clear_flush_young_notify(vma, address, pte)) { |
@@ -727,113 +730,27 @@ int page_referenced_one(struct page *page, struct vm_area_struct *vma, | |||
727 | pte_unmap_unlock(pte, ptl); | 730 | pte_unmap_unlock(pte, ptl); |
728 | } | 731 | } |
729 | 732 | ||
730 | (*mapcount)--; | 733 | if (referenced) { |
731 | 734 | pra->referenced++; | |
732 | if (referenced) | 735 | pra->vm_flags |= vma->vm_flags; |
733 | *vm_flags |= vma->vm_flags; | ||
734 | out: | ||
735 | return referenced; | ||
736 | } | ||
737 | |||
738 | static int page_referenced_anon(struct page *page, | ||
739 | struct mem_cgroup *memcg, | ||
740 | unsigned long *vm_flags) | ||
741 | { | ||
742 | unsigned int mapcount; | ||
743 | struct anon_vma *anon_vma; | ||
744 | pgoff_t pgoff; | ||
745 | struct anon_vma_chain *avc; | ||
746 | int referenced = 0; | ||
747 | |||
748 | anon_vma = page_lock_anon_vma_read(page); | ||
749 | if (!anon_vma) | ||
750 | return referenced; | ||
751 | |||
752 | mapcount = page_mapcount(page); | ||
753 | pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); | ||
754 | anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) { | ||
755 | struct vm_area_struct *vma = avc->vma; | ||
756 | unsigned long address = vma_address(page, vma); | ||
757 | /* | ||
758 | * If we are reclaiming on behalf of a cgroup, skip | ||
759 | * counting on behalf of references from different | ||
760 | * cgroups | ||
761 | */ | ||
762 | if (memcg && !mm_match_cgroup(vma->vm_mm, memcg)) | ||
763 | continue; | ||
764 | referenced += page_referenced_one(page, vma, address, | ||
765 | &mapcount, vm_flags); | ||
766 | if (!mapcount) | ||
767 | break; | ||
768 | } | 736 | } |
769 | 737 | ||
770 | page_unlock_anon_vma_read(anon_vma); | 738 | pra->mapcount--; |
771 | return referenced; | 739 | if (!pra->mapcount) |
740 | return SWAP_SUCCESS; /* To break the loop */ | ||
741 | |||
742 | return SWAP_AGAIN; | ||
772 | } | 743 | } |
773 | 744 | ||
774 | /** | 745 | static bool invalid_page_referenced_vma(struct vm_area_struct *vma, void *arg) |
775 | * page_referenced_file - referenced check for object-based rmap | ||
776 | * @page: the page we're checking references on. | ||
777 | * @memcg: target memory control group | ||
778 | * @vm_flags: collect encountered vma->vm_flags who actually referenced the page | ||
779 | * | ||
780 | * For an object-based mapped page, find all the places it is mapped and | ||
781 | * check/clear the referenced flag. This is done by following the page->mapping | ||
782 | * pointer, then walking the chain of vmas it holds. It returns the number | ||
783 | * of references it found. | ||
784 | * | ||
785 | * This function is only called from page_referenced for object-based pages. | ||
786 | */ | ||
787 | static int page_referenced_file(struct page *page, | ||
788 | struct mem_cgroup *memcg, | ||
789 | unsigned long *vm_flags) | ||
790 | { | 746 | { |
791 | unsigned int mapcount; | 747 | struct page_referenced_arg *pra = arg; |
792 | struct address_space *mapping = page->mapping; | 748 | struct mem_cgroup *memcg = pra->memcg; |
793 | pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); | ||
794 | struct vm_area_struct *vma; | ||
795 | int referenced = 0; | ||
796 | 749 | ||
797 | /* | 750 | if (!mm_match_cgroup(vma->vm_mm, memcg)) |
798 | * The caller's checks on page->mapping and !PageAnon have made | 751 | return true; |
799 | * sure that this is a file page: the check for page->mapping | ||
800 | * excludes the case just before it gets set on an anon page. | ||
801 | */ | ||
802 | BUG_ON(PageAnon(page)); | ||
803 | |||
804 | /* | ||
805 | * The page lock not only makes sure that page->mapping cannot | ||
806 | * suddenly be NULLified by truncation, it makes sure that the | ||
807 | * structure at mapping cannot be freed and reused yet, | ||
808 | * so we can safely take mapping->i_mmap_mutex. | ||
809 | */ | ||
810 | BUG_ON(!PageLocked(page)); | ||
811 | |||
812 | mutex_lock(&mapping->i_mmap_mutex); | ||
813 | |||
814 | /* | ||
815 | * i_mmap_mutex does not stabilize mapcount at all, but mapcount | ||
816 | * is more likely to be accurate if we note it after spinning. | ||
817 | */ | ||
818 | mapcount = page_mapcount(page); | ||
819 | |||
820 | vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { | ||
821 | unsigned long address = vma_address(page, vma); | ||
822 | /* | ||
823 | * If we are reclaiming on behalf of a cgroup, skip | ||
824 | * counting on behalf of references from different | ||
825 | * cgroups | ||
826 | */ | ||
827 | if (memcg && !mm_match_cgroup(vma->vm_mm, memcg)) | ||
828 | continue; | ||
829 | referenced += page_referenced_one(page, vma, address, | ||
830 | &mapcount, vm_flags); | ||
831 | if (!mapcount) | ||
832 | break; | ||
833 | } | ||
834 | 752 | ||
835 | mutex_unlock(&mapping->i_mmap_mutex); | 753 | return false; |
836 | return referenced; | ||
837 | } | 754 | } |
838 | 755 | ||
839 | /** | 756 | /** |
@@ -851,41 +768,57 @@ int page_referenced(struct page *page, | |||
851 | struct mem_cgroup *memcg, | 768 | struct mem_cgroup *memcg, |
852 | unsigned long *vm_flags) | 769 | unsigned long *vm_flags) |
853 | { | 770 | { |
854 | int referenced = 0; | 771 | int ret; |
855 | int we_locked = 0; | 772 | int we_locked = 0; |
773 | struct page_referenced_arg pra = { | ||
774 | .mapcount = page_mapcount(page), | ||
775 | .memcg = memcg, | ||
776 | }; | ||
777 | struct rmap_walk_control rwc = { | ||
778 | .rmap_one = page_referenced_one, | ||
779 | .arg = (void *)&pra, | ||
780 | .anon_lock = page_lock_anon_vma_read, | ||
781 | }; | ||
856 | 782 | ||
857 | *vm_flags = 0; | 783 | *vm_flags = 0; |
858 | if (page_mapped(page) && page_rmapping(page)) { | 784 | if (!page_mapped(page)) |
859 | if (!is_locked && (!PageAnon(page) || PageKsm(page))) { | 785 | return 0; |
860 | we_locked = trylock_page(page); | 786 | |
861 | if (!we_locked) { | 787 | if (!page_rmapping(page)) |
862 | referenced++; | 788 | return 0; |
863 | goto out; | 789 | |
864 | } | 790 | if (!is_locked && (!PageAnon(page) || PageKsm(page))) { |
865 | } | 791 | we_locked = trylock_page(page); |
866 | if (unlikely(PageKsm(page))) | 792 | if (!we_locked) |
867 | referenced += page_referenced_ksm(page, memcg, | 793 | return 1; |
868 | vm_flags); | ||
869 | else if (PageAnon(page)) | ||
870 | referenced += page_referenced_anon(page, memcg, | ||
871 | vm_flags); | ||
872 | else if (page->mapping) | ||
873 | referenced += page_referenced_file(page, memcg, | ||
874 | vm_flags); | ||
875 | if (we_locked) | ||
876 | unlock_page(page); | ||
877 | } | 794 | } |
878 | out: | 795 | |
879 | return referenced; | 796 | /* |
797 | * If we are reclaiming on behalf of a cgroup, skip | ||
798 | * counting on behalf of references from different | ||
799 | * cgroups | ||
800 | */ | ||
801 | if (memcg) { | ||
802 | rwc.invalid_vma = invalid_page_referenced_vma; | ||
803 | } | ||
804 | |||
805 | ret = rmap_walk(page, &rwc); | ||
806 | *vm_flags = pra.vm_flags; | ||
807 | |||
808 | if (we_locked) | ||
809 | unlock_page(page); | ||
810 | |||
811 | return pra.referenced; | ||
880 | } | 812 | } |
881 | 813 | ||
882 | static int page_mkclean_one(struct page *page, struct vm_area_struct *vma, | 814 | static int page_mkclean_one(struct page *page, struct vm_area_struct *vma, |
883 | unsigned long address) | 815 | unsigned long address, void *arg) |
884 | { | 816 | { |
885 | struct mm_struct *mm = vma->vm_mm; | 817 | struct mm_struct *mm = vma->vm_mm; |
886 | pte_t *pte; | 818 | pte_t *pte; |
887 | spinlock_t *ptl; | 819 | spinlock_t *ptl; |
888 | int ret = 0; | 820 | int ret = 0; |
821 | int *cleaned = arg; | ||
889 | 822 | ||
890 | pte = page_check_address(page, mm, address, &ptl, 1); | 823 | pte = page_check_address(page, mm, address, &ptl, 1); |
891 | if (!pte) | 824 | if (!pte) |
@@ -904,44 +837,44 @@ static int page_mkclean_one(struct page *page, struct vm_area_struct *vma, | |||
904 | 837 | ||
905 | pte_unmap_unlock(pte, ptl); | 838 | pte_unmap_unlock(pte, ptl); |
906 | 839 | ||
907 | if (ret) | 840 | if (ret) { |
908 | mmu_notifier_invalidate_page(mm, address); | 841 | mmu_notifier_invalidate_page(mm, address); |
842 | (*cleaned)++; | ||
843 | } | ||
909 | out: | 844 | out: |
910 | return ret; | 845 | return SWAP_AGAIN; |
911 | } | 846 | } |
912 | 847 | ||
913 | static int page_mkclean_file(struct address_space *mapping, struct page *page) | 848 | static bool invalid_mkclean_vma(struct vm_area_struct *vma, void *arg) |
914 | { | 849 | { |
915 | pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); | 850 | if (vma->vm_flags & VM_SHARED) |
916 | struct vm_area_struct *vma; | 851 | return false; |
917 | int ret = 0; | ||
918 | |||
919 | BUG_ON(PageAnon(page)); | ||
920 | 852 | ||
921 | mutex_lock(&mapping->i_mmap_mutex); | 853 | return true; |
922 | vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { | ||
923 | if (vma->vm_flags & VM_SHARED) { | ||
924 | unsigned long address = vma_address(page, vma); | ||
925 | ret += page_mkclean_one(page, vma, address); | ||
926 | } | ||
927 | } | ||
928 | mutex_unlock(&mapping->i_mmap_mutex); | ||
929 | return ret; | ||
930 | } | 854 | } |
931 | 855 | ||
932 | int page_mkclean(struct page *page) | 856 | int page_mkclean(struct page *page) |
933 | { | 857 | { |
934 | int ret = 0; | 858 | int cleaned = 0; |
859 | struct address_space *mapping; | ||
860 | struct rmap_walk_control rwc = { | ||
861 | .arg = (void *)&cleaned, | ||
862 | .rmap_one = page_mkclean_one, | ||
863 | .invalid_vma = invalid_mkclean_vma, | ||
864 | }; | ||
935 | 865 | ||
936 | BUG_ON(!PageLocked(page)); | 866 | BUG_ON(!PageLocked(page)); |
937 | 867 | ||
938 | if (page_mapped(page)) { | 868 | if (!page_mapped(page)) |
939 | struct address_space *mapping = page_mapping(page); | 869 | return 0; |
940 | if (mapping) | ||
941 | ret = page_mkclean_file(mapping, page); | ||
942 | } | ||
943 | 870 | ||
944 | return ret; | 871 | mapping = page_mapping(page); |
872 | if (!mapping) | ||
873 | return 0; | ||
874 | |||
875 | rmap_walk(page, &rwc); | ||
876 | |||
877 | return cleaned; | ||
945 | } | 878 | } |
946 | EXPORT_SYMBOL_GPL(page_mkclean); | 879 | EXPORT_SYMBOL_GPL(page_mkclean); |
947 | 880 | ||
@@ -961,9 +894,9 @@ void page_move_anon_rmap(struct page *page, | |||
961 | { | 894 | { |
962 | struct anon_vma *anon_vma = vma->anon_vma; | 895 | struct anon_vma *anon_vma = vma->anon_vma; |
963 | 896 | ||
964 | VM_BUG_ON(!PageLocked(page)); | 897 | VM_BUG_ON_PAGE(!PageLocked(page), page); |
965 | VM_BUG_ON(!anon_vma); | 898 | VM_BUG_ON(!anon_vma); |
966 | VM_BUG_ON(page->index != linear_page_index(vma, address)); | 899 | VM_BUG_ON_PAGE(page->index != linear_page_index(vma, address), page); |
967 | 900 | ||
968 | anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON; | 901 | anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON; |
969 | page->mapping = (struct address_space *) anon_vma; | 902 | page->mapping = (struct address_space *) anon_vma; |
@@ -1062,7 +995,7 @@ void do_page_add_anon_rmap(struct page *page, | |||
1062 | if (unlikely(PageKsm(page))) | 995 | if (unlikely(PageKsm(page))) |
1063 | return; | 996 | return; |
1064 | 997 | ||
1065 | VM_BUG_ON(!PageLocked(page)); | 998 | VM_BUG_ON_PAGE(!PageLocked(page), page); |
1066 | /* address might be in next vma when migration races vma_adjust */ | 999 | /* address might be in next vma when migration races vma_adjust */ |
1067 | if (first) | 1000 | if (first) |
1068 | __page_set_anon_rmap(page, vma, address, exclusive); | 1001 | __page_set_anon_rmap(page, vma, address, exclusive); |
@@ -1177,17 +1110,17 @@ out: | |||
1177 | } | 1110 | } |
1178 | 1111 | ||
1179 | /* | 1112 | /* |
1180 | * Subfunctions of try_to_unmap: try_to_unmap_one called | 1113 | * @arg: enum ttu_flags will be passed to this argument |
1181 | * repeatedly from try_to_unmap_ksm, try_to_unmap_anon or try_to_unmap_file. | ||
1182 | */ | 1114 | */ |
1183 | int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, | 1115 | int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, |
1184 | unsigned long address, enum ttu_flags flags) | 1116 | unsigned long address, void *arg) |
1185 | { | 1117 | { |
1186 | struct mm_struct *mm = vma->vm_mm; | 1118 | struct mm_struct *mm = vma->vm_mm; |
1187 | pte_t *pte; | 1119 | pte_t *pte; |
1188 | pte_t pteval; | 1120 | pte_t pteval; |
1189 | spinlock_t *ptl; | 1121 | spinlock_t *ptl; |
1190 | int ret = SWAP_AGAIN; | 1122 | int ret = SWAP_AGAIN; |
1123 | enum ttu_flags flags = (enum ttu_flags)arg; | ||
1191 | 1124 | ||
1192 | pte = page_check_address(page, mm, address, &ptl, 0); | 1125 | pte = page_check_address(page, mm, address, &ptl, 0); |
1193 | if (!pte) | 1126 | if (!pte) |
@@ -1426,124 +1359,18 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount, | |||
1426 | return ret; | 1359 | return ret; |
1427 | } | 1360 | } |
1428 | 1361 | ||
1429 | bool is_vma_temporary_stack(struct vm_area_struct *vma) | 1362 | static int try_to_unmap_nonlinear(struct page *page, |
1430 | { | 1363 | struct address_space *mapping, struct vm_area_struct *vma) |
1431 | int maybe_stack = vma->vm_flags & (VM_GROWSDOWN | VM_GROWSUP); | ||
1432 | |||
1433 | if (!maybe_stack) | ||
1434 | return false; | ||
1435 | |||
1436 | if ((vma->vm_flags & VM_STACK_INCOMPLETE_SETUP) == | ||
1437 | VM_STACK_INCOMPLETE_SETUP) | ||
1438 | return true; | ||
1439 | |||
1440 | return false; | ||
1441 | } | ||
1442 | |||
1443 | /** | ||
1444 | * try_to_unmap_anon - unmap or unlock anonymous page using the object-based | ||
1445 | * rmap method | ||
1446 | * @page: the page to unmap/unlock | ||
1447 | * @flags: action and flags | ||
1448 | * | ||
1449 | * Find all the mappings of a page using the mapping pointer and the vma chains | ||
1450 | * contained in the anon_vma struct it points to. | ||
1451 | * | ||
1452 | * This function is only called from try_to_unmap/try_to_munlock for | ||
1453 | * anonymous pages. | ||
1454 | * When called from try_to_munlock(), the mmap_sem of the mm containing the vma | ||
1455 | * where the page was found will be held for write. So, we won't recheck | ||
1456 | * vm_flags for that VMA. That should be OK, because that vma shouldn't be | ||
1457 | * 'LOCKED. | ||
1458 | */ | ||
1459 | static int try_to_unmap_anon(struct page *page, enum ttu_flags flags) | ||
1460 | { | 1364 | { |
1461 | struct anon_vma *anon_vma; | ||
1462 | pgoff_t pgoff; | ||
1463 | struct anon_vma_chain *avc; | ||
1464 | int ret = SWAP_AGAIN; | ||
1465 | |||
1466 | anon_vma = page_lock_anon_vma_read(page); | ||
1467 | if (!anon_vma) | ||
1468 | return ret; | ||
1469 | |||
1470 | pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); | ||
1471 | anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) { | ||
1472 | struct vm_area_struct *vma = avc->vma; | ||
1473 | unsigned long address; | ||
1474 | |||
1475 | /* | ||
1476 | * During exec, a temporary VMA is setup and later moved. | ||
1477 | * The VMA is moved under the anon_vma lock but not the | ||
1478 | * page tables leading to a race where migration cannot | ||
1479 | * find the migration ptes. Rather than increasing the | ||
1480 | * locking requirements of exec(), migration skips | ||
1481 | * temporary VMAs until after exec() completes. | ||
1482 | */ | ||
1483 | if (IS_ENABLED(CONFIG_MIGRATION) && (flags & TTU_MIGRATION) && | ||
1484 | is_vma_temporary_stack(vma)) | ||
1485 | continue; | ||
1486 | |||
1487 | address = vma_address(page, vma); | ||
1488 | ret = try_to_unmap_one(page, vma, address, flags); | ||
1489 | if (ret != SWAP_AGAIN || !page_mapped(page)) | ||
1490 | break; | ||
1491 | } | ||
1492 | |||
1493 | page_unlock_anon_vma_read(anon_vma); | ||
1494 | return ret; | ||
1495 | } | ||
1496 | |||
1497 | /** | ||
1498 | * try_to_unmap_file - unmap/unlock file page using the object-based rmap method | ||
1499 | * @page: the page to unmap/unlock | ||
1500 | * @flags: action and flags | ||
1501 | * | ||
1502 | * Find all the mappings of a page using the mapping pointer and the vma chains | ||
1503 | * contained in the address_space struct it points to. | ||
1504 | * | ||
1505 | * This function is only called from try_to_unmap/try_to_munlock for | ||
1506 | * object-based pages. | ||
1507 | * When called from try_to_munlock(), the mmap_sem of the mm containing the vma | ||
1508 | * where the page was found will be held for write. So, we won't recheck | ||
1509 | * vm_flags for that VMA. That should be OK, because that vma shouldn't be | ||
1510 | * 'LOCKED. | ||
1511 | */ | ||
1512 | static int try_to_unmap_file(struct page *page, enum ttu_flags flags) | ||
1513 | { | ||
1514 | struct address_space *mapping = page->mapping; | ||
1515 | pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); | ||
1516 | struct vm_area_struct *vma; | ||
1517 | int ret = SWAP_AGAIN; | 1365 | int ret = SWAP_AGAIN; |
1518 | unsigned long cursor; | 1366 | unsigned long cursor; |
1519 | unsigned long max_nl_cursor = 0; | 1367 | unsigned long max_nl_cursor = 0; |
1520 | unsigned long max_nl_size = 0; | 1368 | unsigned long max_nl_size = 0; |
1521 | unsigned int mapcount; | 1369 | unsigned int mapcount; |
1522 | 1370 | ||
1523 | if (PageHuge(page)) | 1371 | list_for_each_entry(vma, |
1524 | pgoff = page->index << compound_order(page); | 1372 | &mapping->i_mmap_nonlinear, shared.nonlinear) { |
1525 | 1373 | ||
1526 | mutex_lock(&mapping->i_mmap_mutex); | ||
1527 | vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { | ||
1528 | unsigned long address = vma_address(page, vma); | ||
1529 | ret = try_to_unmap_one(page, vma, address, flags); | ||
1530 | if (ret != SWAP_AGAIN || !page_mapped(page)) | ||
1531 | goto out; | ||
1532 | } | ||
1533 | |||
1534 | if (list_empty(&mapping->i_mmap_nonlinear)) | ||
1535 | goto out; | ||
1536 | |||
1537 | /* | ||
1538 | * We don't bother to try to find the munlocked page in nonlinears. | ||
1539 | * It's costly. Instead, later, page reclaim logic may call | ||
1540 | * try_to_unmap(TTU_MUNLOCK) and recover PG_mlocked lazily. | ||
1541 | */ | ||
1542 | if (TTU_ACTION(flags) == TTU_MUNLOCK) | ||
1543 | goto out; | ||
1544 | |||
1545 | list_for_each_entry(vma, &mapping->i_mmap_nonlinear, | ||
1546 | shared.nonlinear) { | ||
1547 | cursor = (unsigned long) vma->vm_private_data; | 1374 | cursor = (unsigned long) vma->vm_private_data; |
1548 | if (cursor > max_nl_cursor) | 1375 | if (cursor > max_nl_cursor) |
1549 | max_nl_cursor = cursor; | 1376 | max_nl_cursor = cursor; |
@@ -1553,8 +1380,7 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags) | |||
1553 | } | 1380 | } |
1554 | 1381 | ||
1555 | if (max_nl_size == 0) { /* all nonlinears locked or reserved ? */ | 1382 | if (max_nl_size == 0) { /* all nonlinears locked or reserved ? */ |
1556 | ret = SWAP_FAIL; | 1383 | return SWAP_FAIL; |
1557 | goto out; | ||
1558 | } | 1384 | } |
1559 | 1385 | ||
1560 | /* | 1386 | /* |
@@ -1566,7 +1392,8 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags) | |||
1566 | */ | 1392 | */ |
1567 | mapcount = page_mapcount(page); | 1393 | mapcount = page_mapcount(page); |
1568 | if (!mapcount) | 1394 | if (!mapcount) |
1569 | goto out; | 1395 | return ret; |
1396 | |||
1570 | cond_resched(); | 1397 | cond_resched(); |
1571 | 1398 | ||
1572 | max_nl_size = (max_nl_size + CLUSTER_SIZE - 1) & CLUSTER_MASK; | 1399 | max_nl_size = (max_nl_size + CLUSTER_SIZE - 1) & CLUSTER_MASK; |
@@ -1574,10 +1401,11 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags) | |||
1574 | max_nl_cursor = CLUSTER_SIZE; | 1401 | max_nl_cursor = CLUSTER_SIZE; |
1575 | 1402 | ||
1576 | do { | 1403 | do { |
1577 | list_for_each_entry(vma, &mapping->i_mmap_nonlinear, | 1404 | list_for_each_entry(vma, |
1578 | shared.nonlinear) { | 1405 | &mapping->i_mmap_nonlinear, shared.nonlinear) { |
1406 | |||
1579 | cursor = (unsigned long) vma->vm_private_data; | 1407 | cursor = (unsigned long) vma->vm_private_data; |
1580 | while ( cursor < max_nl_cursor && | 1408 | while (cursor < max_nl_cursor && |
1581 | cursor < vma->vm_end - vma->vm_start) { | 1409 | cursor < vma->vm_end - vma->vm_start) { |
1582 | if (try_to_unmap_cluster(cursor, &mapcount, | 1410 | if (try_to_unmap_cluster(cursor, &mapcount, |
1583 | vma, page) == SWAP_MLOCK) | 1411 | vma, page) == SWAP_MLOCK) |
@@ -1585,7 +1413,7 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags) | |||
1585 | cursor += CLUSTER_SIZE; | 1413 | cursor += CLUSTER_SIZE; |
1586 | vma->vm_private_data = (void *) cursor; | 1414 | vma->vm_private_data = (void *) cursor; |
1587 | if ((int)mapcount <= 0) | 1415 | if ((int)mapcount <= 0) |
1588 | goto out; | 1416 | return ret; |
1589 | } | 1417 | } |
1590 | vma->vm_private_data = (void *) max_nl_cursor; | 1418 | vma->vm_private_data = (void *) max_nl_cursor; |
1591 | } | 1419 | } |
@@ -1600,11 +1428,34 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags) | |||
1600 | */ | 1428 | */ |
1601 | list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.nonlinear) | 1429 | list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.nonlinear) |
1602 | vma->vm_private_data = NULL; | 1430 | vma->vm_private_data = NULL; |
1603 | out: | 1431 | |
1604 | mutex_unlock(&mapping->i_mmap_mutex); | ||
1605 | return ret; | 1432 | return ret; |
1606 | } | 1433 | } |
1607 | 1434 | ||
1435 | bool is_vma_temporary_stack(struct vm_area_struct *vma) | ||
1436 | { | ||
1437 | int maybe_stack = vma->vm_flags & (VM_GROWSDOWN | VM_GROWSUP); | ||
1438 | |||
1439 | if (!maybe_stack) | ||
1440 | return false; | ||
1441 | |||
1442 | if ((vma->vm_flags & VM_STACK_INCOMPLETE_SETUP) == | ||
1443 | VM_STACK_INCOMPLETE_SETUP) | ||
1444 | return true; | ||
1445 | |||
1446 | return false; | ||
1447 | } | ||
1448 | |||
1449 | static bool invalid_migration_vma(struct vm_area_struct *vma, void *arg) | ||
1450 | { | ||
1451 | return is_vma_temporary_stack(vma); | ||
1452 | } | ||
1453 | |||
1454 | static int page_not_mapped(struct page *page) | ||
1455 | { | ||
1456 | return !page_mapped(page); | ||
1457 | }; | ||
1458 | |||
1608 | /** | 1459 | /** |
1609 | * try_to_unmap - try to remove all page table mappings to a page | 1460 | * try_to_unmap - try to remove all page table mappings to a page |
1610 | * @page: the page to get unmapped | 1461 | * @page: the page to get unmapped |
@@ -1622,16 +1473,29 @@ out: | |||
1622 | int try_to_unmap(struct page *page, enum ttu_flags flags) | 1473 | int try_to_unmap(struct page *page, enum ttu_flags flags) |
1623 | { | 1474 | { |
1624 | int ret; | 1475 | int ret; |
1476 | struct rmap_walk_control rwc = { | ||
1477 | .rmap_one = try_to_unmap_one, | ||
1478 | .arg = (void *)flags, | ||
1479 | .done = page_not_mapped, | ||
1480 | .file_nonlinear = try_to_unmap_nonlinear, | ||
1481 | .anon_lock = page_lock_anon_vma_read, | ||
1482 | }; | ||
1625 | 1483 | ||
1626 | BUG_ON(!PageLocked(page)); | 1484 | VM_BUG_ON_PAGE(!PageHuge(page) && PageTransHuge(page), page); |
1627 | VM_BUG_ON(!PageHuge(page) && PageTransHuge(page)); | 1485 | |
1486 | /* | ||
1487 | * During exec, a temporary VMA is setup and later moved. | ||
1488 | * The VMA is moved under the anon_vma lock but not the | ||
1489 | * page tables leading to a race where migration cannot | ||
1490 | * find the migration ptes. Rather than increasing the | ||
1491 | * locking requirements of exec(), migration skips | ||
1492 | * temporary VMAs until after exec() completes. | ||
1493 | */ | ||
1494 | if (flags & TTU_MIGRATION && !PageKsm(page) && PageAnon(page)) | ||
1495 | rwc.invalid_vma = invalid_migration_vma; | ||
1496 | |||
1497 | ret = rmap_walk(page, &rwc); | ||
1628 | 1498 | ||
1629 | if (unlikely(PageKsm(page))) | ||
1630 | ret = try_to_unmap_ksm(page, flags); | ||
1631 | else if (PageAnon(page)) | ||
1632 | ret = try_to_unmap_anon(page, flags); | ||
1633 | else | ||
1634 | ret = try_to_unmap_file(page, flags); | ||
1635 | if (ret != SWAP_MLOCK && !page_mapped(page)) | 1499 | if (ret != SWAP_MLOCK && !page_mapped(page)) |
1636 | ret = SWAP_SUCCESS; | 1500 | ret = SWAP_SUCCESS; |
1637 | return ret; | 1501 | return ret; |
@@ -1654,14 +1518,25 @@ int try_to_unmap(struct page *page, enum ttu_flags flags) | |||
1654 | */ | 1518 | */ |
1655 | int try_to_munlock(struct page *page) | 1519 | int try_to_munlock(struct page *page) |
1656 | { | 1520 | { |
1657 | VM_BUG_ON(!PageLocked(page) || PageLRU(page)); | 1521 | int ret; |
1522 | struct rmap_walk_control rwc = { | ||
1523 | .rmap_one = try_to_unmap_one, | ||
1524 | .arg = (void *)TTU_MUNLOCK, | ||
1525 | .done = page_not_mapped, | ||
1526 | /* | ||
1527 | * We don't bother to try to find the munlocked page in | ||
1528 | * nonlinears. It's costly. Instead, later, page reclaim logic | ||
1529 | * may call try_to_unmap() and recover PG_mlocked lazily. | ||
1530 | */ | ||
1531 | .file_nonlinear = NULL, | ||
1532 | .anon_lock = page_lock_anon_vma_read, | ||
1658 | 1533 | ||
1659 | if (unlikely(PageKsm(page))) | 1534 | }; |
1660 | return try_to_unmap_ksm(page, TTU_MUNLOCK); | 1535 | |
1661 | else if (PageAnon(page)) | 1536 | VM_BUG_ON_PAGE(!PageLocked(page) || PageLRU(page), page); |
1662 | return try_to_unmap_anon(page, TTU_MUNLOCK); | 1537 | |
1663 | else | 1538 | ret = rmap_walk(page, &rwc); |
1664 | return try_to_unmap_file(page, TTU_MUNLOCK); | 1539 | return ret; |
1665 | } | 1540 | } |
1666 | 1541 | ||
1667 | void __put_anon_vma(struct anon_vma *anon_vma) | 1542 | void __put_anon_vma(struct anon_vma *anon_vma) |
@@ -1674,18 +1549,13 @@ void __put_anon_vma(struct anon_vma *anon_vma) | |||
1674 | anon_vma_free(anon_vma); | 1549 | anon_vma_free(anon_vma); |
1675 | } | 1550 | } |
1676 | 1551 | ||
1677 | #ifdef CONFIG_MIGRATION | 1552 | static struct anon_vma *rmap_walk_anon_lock(struct page *page, |
1678 | /* | 1553 | struct rmap_walk_control *rwc) |
1679 | * rmap_walk() and its helpers rmap_walk_anon() and rmap_walk_file(): | ||
1680 | * Called by migrate.c to remove migration ptes, but might be used more later. | ||
1681 | */ | ||
1682 | static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *, | ||
1683 | struct vm_area_struct *, unsigned long, void *), void *arg) | ||
1684 | { | 1554 | { |
1685 | struct anon_vma *anon_vma; | 1555 | struct anon_vma *anon_vma; |
1686 | pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); | 1556 | |
1687 | struct anon_vma_chain *avc; | 1557 | if (rwc->anon_lock) |
1688 | int ret = SWAP_AGAIN; | 1558 | return rwc->anon_lock(page); |
1689 | 1559 | ||
1690 | /* | 1560 | /* |
1691 | * Note: remove_migration_ptes() cannot use page_lock_anon_vma_read() | 1561 | * Note: remove_migration_ptes() cannot use page_lock_anon_vma_read() |
@@ -1695,58 +1565,120 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *, | |||
1695 | */ | 1565 | */ |
1696 | anon_vma = page_anon_vma(page); | 1566 | anon_vma = page_anon_vma(page); |
1697 | if (!anon_vma) | 1567 | if (!anon_vma) |
1698 | return ret; | 1568 | return NULL; |
1569 | |||
1699 | anon_vma_lock_read(anon_vma); | 1570 | anon_vma_lock_read(anon_vma); |
1571 | return anon_vma; | ||
1572 | } | ||
1573 | |||
1574 | /* | ||
1575 | * rmap_walk_anon - do something to anonymous page using the object-based | ||
1576 | * rmap method | ||
1577 | * @page: the page to be handled | ||
1578 | * @rwc: control variable according to each walk type | ||
1579 | * | ||
1580 | * Find all the mappings of a page using the mapping pointer and the vma chains | ||
1581 | * contained in the anon_vma struct it points to. | ||
1582 | * | ||
1583 | * When called from try_to_munlock(), the mmap_sem of the mm containing the vma | ||
1584 | * where the page was found will be held for write. So, we won't recheck | ||
1585 | * vm_flags for that VMA. That should be OK, because that vma shouldn't be | ||
1586 | * LOCKED. | ||
1587 | */ | ||
1588 | static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc) | ||
1589 | { | ||
1590 | struct anon_vma *anon_vma; | ||
1591 | pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); | ||
1592 | struct anon_vma_chain *avc; | ||
1593 | int ret = SWAP_AGAIN; | ||
1594 | |||
1595 | anon_vma = rmap_walk_anon_lock(page, rwc); | ||
1596 | if (!anon_vma) | ||
1597 | return ret; | ||
1598 | |||
1700 | anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) { | 1599 | anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) { |
1701 | struct vm_area_struct *vma = avc->vma; | 1600 | struct vm_area_struct *vma = avc->vma; |
1702 | unsigned long address = vma_address(page, vma); | 1601 | unsigned long address = vma_address(page, vma); |
1703 | ret = rmap_one(page, vma, address, arg); | 1602 | |
1603 | if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg)) | ||
1604 | continue; | ||
1605 | |||
1606 | ret = rwc->rmap_one(page, vma, address, rwc->arg); | ||
1704 | if (ret != SWAP_AGAIN) | 1607 | if (ret != SWAP_AGAIN) |
1705 | break; | 1608 | break; |
1609 | if (rwc->done && rwc->done(page)) | ||
1610 | break; | ||
1706 | } | 1611 | } |
1707 | anon_vma_unlock_read(anon_vma); | 1612 | anon_vma_unlock_read(anon_vma); |
1708 | return ret; | 1613 | return ret; |
1709 | } | 1614 | } |
1710 | 1615 | ||
1711 | static int rmap_walk_file(struct page *page, int (*rmap_one)(struct page *, | 1616 | /* |
1712 | struct vm_area_struct *, unsigned long, void *), void *arg) | 1617 | * rmap_walk_file - do something to file page using the object-based rmap method |
1618 | * @page: the page to be handled | ||
1619 | * @rwc: control variable according to each walk type | ||
1620 | * | ||
1621 | * Find all the mappings of a page using the mapping pointer and the vma chains | ||
1622 | * contained in the address_space struct it points to. | ||
1623 | * | ||
1624 | * When called from try_to_munlock(), the mmap_sem of the mm containing the vma | ||
1625 | * where the page was found will be held for write. So, we won't recheck | ||
1626 | * vm_flags for that VMA. That should be OK, because that vma shouldn't be | ||
1627 | * LOCKED. | ||
1628 | */ | ||
1629 | static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc) | ||
1713 | { | 1630 | { |
1714 | struct address_space *mapping = page->mapping; | 1631 | struct address_space *mapping = page->mapping; |
1715 | pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); | 1632 | pgoff_t pgoff = page->index << compound_order(page); |
1716 | struct vm_area_struct *vma; | 1633 | struct vm_area_struct *vma; |
1717 | int ret = SWAP_AGAIN; | 1634 | int ret = SWAP_AGAIN; |
1718 | 1635 | ||
1636 | /* | ||
1637 | * The page lock not only makes sure that page->mapping cannot | ||
1638 | * suddenly be NULLified by truncation, it makes sure that the | ||
1639 | * structure at mapping cannot be freed and reused yet, | ||
1640 | * so we can safely take mapping->i_mmap_mutex. | ||
1641 | */ | ||
1642 | VM_BUG_ON(!PageLocked(page)); | ||
1643 | |||
1719 | if (!mapping) | 1644 | if (!mapping) |
1720 | return ret; | 1645 | return ret; |
1721 | mutex_lock(&mapping->i_mmap_mutex); | 1646 | mutex_lock(&mapping->i_mmap_mutex); |
1722 | vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { | 1647 | vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { |
1723 | unsigned long address = vma_address(page, vma); | 1648 | unsigned long address = vma_address(page, vma); |
1724 | ret = rmap_one(page, vma, address, arg); | 1649 | |
1650 | if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg)) | ||
1651 | continue; | ||
1652 | |||
1653 | ret = rwc->rmap_one(page, vma, address, rwc->arg); | ||
1725 | if (ret != SWAP_AGAIN) | 1654 | if (ret != SWAP_AGAIN) |
1726 | break; | 1655 | goto done; |
1656 | if (rwc->done && rwc->done(page)) | ||
1657 | goto done; | ||
1727 | } | 1658 | } |
1728 | /* | 1659 | |
1729 | * No nonlinear handling: being always shared, nonlinear vmas | 1660 | if (!rwc->file_nonlinear) |
1730 | * never contain migration ptes. Decide what to do about this | 1661 | goto done; |
1731 | * limitation to linear when we need rmap_walk() on nonlinear. | 1662 | |
1732 | */ | 1663 | if (list_empty(&mapping->i_mmap_nonlinear)) |
1664 | goto done; | ||
1665 | |||
1666 | ret = rwc->file_nonlinear(page, mapping, vma); | ||
1667 | |||
1668 | done: | ||
1733 | mutex_unlock(&mapping->i_mmap_mutex); | 1669 | mutex_unlock(&mapping->i_mmap_mutex); |
1734 | return ret; | 1670 | return ret; |
1735 | } | 1671 | } |
1736 | 1672 | ||
1737 | int rmap_walk(struct page *page, int (*rmap_one)(struct page *, | 1673 | int rmap_walk(struct page *page, struct rmap_walk_control *rwc) |
1738 | struct vm_area_struct *, unsigned long, void *), void *arg) | ||
1739 | { | 1674 | { |
1740 | VM_BUG_ON(!PageLocked(page)); | ||
1741 | |||
1742 | if (unlikely(PageKsm(page))) | 1675 | if (unlikely(PageKsm(page))) |
1743 | return rmap_walk_ksm(page, rmap_one, arg); | 1676 | return rmap_walk_ksm(page, rwc); |
1744 | else if (PageAnon(page)) | 1677 | else if (PageAnon(page)) |
1745 | return rmap_walk_anon(page, rmap_one, arg); | 1678 | return rmap_walk_anon(page, rwc); |
1746 | else | 1679 | else |
1747 | return rmap_walk_file(page, rmap_one, arg); | 1680 | return rmap_walk_file(page, rwc); |
1748 | } | 1681 | } |
1749 | #endif /* CONFIG_MIGRATION */ | ||
1750 | 1682 | ||
1751 | #ifdef CONFIG_HUGETLB_PAGE | 1683 | #ifdef CONFIG_HUGETLB_PAGE |
1752 | /* | 1684 | /* |
diff --git a/mm/shmem.c b/mm/shmem.c index 902a14842b74..1f18c9d0d93e 100644 --- a/mm/shmem.c +++ b/mm/shmem.c | |||
@@ -45,7 +45,7 @@ static struct vfsmount *shm_mnt; | |||
45 | #include <linux/xattr.h> | 45 | #include <linux/xattr.h> |
46 | #include <linux/exportfs.h> | 46 | #include <linux/exportfs.h> |
47 | #include <linux/posix_acl.h> | 47 | #include <linux/posix_acl.h> |
48 | #include <linux/generic_acl.h> | 48 | #include <linux/posix_acl_xattr.h> |
49 | #include <linux/mman.h> | 49 | #include <linux/mman.h> |
50 | #include <linux/string.h> | 50 | #include <linux/string.h> |
51 | #include <linux/slab.h> | 51 | #include <linux/slab.h> |
@@ -285,8 +285,8 @@ static int shmem_add_to_page_cache(struct page *page, | |||
285 | { | 285 | { |
286 | int error; | 286 | int error; |
287 | 287 | ||
288 | VM_BUG_ON(!PageLocked(page)); | 288 | VM_BUG_ON_PAGE(!PageLocked(page), page); |
289 | VM_BUG_ON(!PageSwapBacked(page)); | 289 | VM_BUG_ON_PAGE(!PageSwapBacked(page), page); |
290 | 290 | ||
291 | page_cache_get(page); | 291 | page_cache_get(page); |
292 | page->mapping = mapping; | 292 | page->mapping = mapping; |
@@ -491,7 +491,7 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, | |||
491 | continue; | 491 | continue; |
492 | if (!unfalloc || !PageUptodate(page)) { | 492 | if (!unfalloc || !PageUptodate(page)) { |
493 | if (page->mapping == mapping) { | 493 | if (page->mapping == mapping) { |
494 | VM_BUG_ON(PageWriteback(page)); | 494 | VM_BUG_ON_PAGE(PageWriteback(page), page); |
495 | truncate_inode_page(mapping, page); | 495 | truncate_inode_page(mapping, page); |
496 | } | 496 | } |
497 | } | 497 | } |
@@ -568,7 +568,7 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, | |||
568 | lock_page(page); | 568 | lock_page(page); |
569 | if (!unfalloc || !PageUptodate(page)) { | 569 | if (!unfalloc || !PageUptodate(page)) { |
570 | if (page->mapping == mapping) { | 570 | if (page->mapping == mapping) { |
571 | VM_BUG_ON(PageWriteback(page)); | 571 | VM_BUG_ON_PAGE(PageWriteback(page), page); |
572 | truncate_inode_page(mapping, page); | 572 | truncate_inode_page(mapping, page); |
573 | } | 573 | } |
574 | } | 574 | } |
@@ -620,10 +620,8 @@ static int shmem_setattr(struct dentry *dentry, struct iattr *attr) | |||
620 | } | 620 | } |
621 | 621 | ||
622 | setattr_copy(inode, attr); | 622 | setattr_copy(inode, attr); |
623 | #ifdef CONFIG_TMPFS_POSIX_ACL | ||
624 | if (attr->ia_valid & ATTR_MODE) | 623 | if (attr->ia_valid & ATTR_MODE) |
625 | error = generic_acl_chmod(inode); | 624 | error = posix_acl_chmod(inode, inode->i_mode); |
626 | #endif | ||
627 | return error; | 625 | return error; |
628 | } | 626 | } |
629 | 627 | ||
@@ -1937,22 +1935,14 @@ shmem_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev) | |||
1937 | 1935 | ||
1938 | inode = shmem_get_inode(dir->i_sb, dir, mode, dev, VM_NORESERVE); | 1936 | inode = shmem_get_inode(dir->i_sb, dir, mode, dev, VM_NORESERVE); |
1939 | if (inode) { | 1937 | if (inode) { |
1940 | #ifdef CONFIG_TMPFS_POSIX_ACL | 1938 | error = simple_acl_create(dir, inode); |
1941 | error = generic_acl_init(inode, dir); | 1939 | if (error) |
1942 | if (error) { | 1940 | goto out_iput; |
1943 | iput(inode); | ||
1944 | return error; | ||
1945 | } | ||
1946 | #endif | ||
1947 | error = security_inode_init_security(inode, dir, | 1941 | error = security_inode_init_security(inode, dir, |
1948 | &dentry->d_name, | 1942 | &dentry->d_name, |
1949 | shmem_initxattrs, NULL); | 1943 | shmem_initxattrs, NULL); |
1950 | if (error) { | 1944 | if (error && error != -EOPNOTSUPP) |
1951 | if (error != -EOPNOTSUPP) { | 1945 | goto out_iput; |
1952 | iput(inode); | ||
1953 | return error; | ||
1954 | } | ||
1955 | } | ||
1956 | 1946 | ||
1957 | error = 0; | 1947 | error = 0; |
1958 | dir->i_size += BOGO_DIRENT_SIZE; | 1948 | dir->i_size += BOGO_DIRENT_SIZE; |
@@ -1961,6 +1951,9 @@ shmem_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev) | |||
1961 | dget(dentry); /* Extra count - pin the dentry in core */ | 1951 | dget(dentry); /* Extra count - pin the dentry in core */ |
1962 | } | 1952 | } |
1963 | return error; | 1953 | return error; |
1954 | out_iput: | ||
1955 | iput(inode); | ||
1956 | return error; | ||
1964 | } | 1957 | } |
1965 | 1958 | ||
1966 | static int | 1959 | static int |
@@ -1974,24 +1967,17 @@ shmem_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) | |||
1974 | error = security_inode_init_security(inode, dir, | 1967 | error = security_inode_init_security(inode, dir, |
1975 | NULL, | 1968 | NULL, |
1976 | shmem_initxattrs, NULL); | 1969 | shmem_initxattrs, NULL); |
1977 | if (error) { | 1970 | if (error && error != -EOPNOTSUPP) |
1978 | if (error != -EOPNOTSUPP) { | 1971 | goto out_iput; |
1979 | iput(inode); | 1972 | error = simple_acl_create(dir, inode); |
1980 | return error; | 1973 | if (error) |
1981 | } | 1974 | goto out_iput; |
1982 | } | ||
1983 | #ifdef CONFIG_TMPFS_POSIX_ACL | ||
1984 | error = generic_acl_init(inode, dir); | ||
1985 | if (error) { | ||
1986 | iput(inode); | ||
1987 | return error; | ||
1988 | } | ||
1989 | #else | ||
1990 | error = 0; | ||
1991 | #endif | ||
1992 | d_tmpfile(dentry, inode); | 1975 | d_tmpfile(dentry, inode); |
1993 | } | 1976 | } |
1994 | return error; | 1977 | return error; |
1978 | out_iput: | ||
1979 | iput(inode); | ||
1980 | return error; | ||
1995 | } | 1981 | } |
1996 | 1982 | ||
1997 | static int shmem_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) | 1983 | static int shmem_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) |
@@ -2223,8 +2209,8 @@ static int shmem_initxattrs(struct inode *inode, | |||
2223 | 2209 | ||
2224 | static const struct xattr_handler *shmem_xattr_handlers[] = { | 2210 | static const struct xattr_handler *shmem_xattr_handlers[] = { |
2225 | #ifdef CONFIG_TMPFS_POSIX_ACL | 2211 | #ifdef CONFIG_TMPFS_POSIX_ACL |
2226 | &generic_acl_access_handler, | 2212 | &posix_acl_access_xattr_handler, |
2227 | &generic_acl_default_handler, | 2213 | &posix_acl_default_xattr_handler, |
2228 | #endif | 2214 | #endif |
2229 | NULL | 2215 | NULL |
2230 | }; | 2216 | }; |
@@ -2740,6 +2726,7 @@ static const struct inode_operations shmem_inode_operations = { | |||
2740 | .getxattr = shmem_getxattr, | 2726 | .getxattr = shmem_getxattr, |
2741 | .listxattr = shmem_listxattr, | 2727 | .listxattr = shmem_listxattr, |
2742 | .removexattr = shmem_removexattr, | 2728 | .removexattr = shmem_removexattr, |
2729 | .set_acl = simple_set_acl, | ||
2743 | #endif | 2730 | #endif |
2744 | }; | 2731 | }; |
2745 | 2732 | ||
@@ -2764,6 +2751,7 @@ static const struct inode_operations shmem_dir_inode_operations = { | |||
2764 | #endif | 2751 | #endif |
2765 | #ifdef CONFIG_TMPFS_POSIX_ACL | 2752 | #ifdef CONFIG_TMPFS_POSIX_ACL |
2766 | .setattr = shmem_setattr, | 2753 | .setattr = shmem_setattr, |
2754 | .set_acl = simple_set_acl, | ||
2767 | #endif | 2755 | #endif |
2768 | }; | 2756 | }; |
2769 | 2757 | ||
@@ -2776,6 +2764,7 @@ static const struct inode_operations shmem_special_inode_operations = { | |||
2776 | #endif | 2764 | #endif |
2777 | #ifdef CONFIG_TMPFS_POSIX_ACL | 2765 | #ifdef CONFIG_TMPFS_POSIX_ACL |
2778 | .setattr = shmem_setattr, | 2766 | .setattr = shmem_setattr, |
2767 | .set_acl = simple_set_acl, | ||
2779 | #endif | 2768 | #endif |
2780 | }; | 2769 | }; |
2781 | 2770 | ||
@@ -1946,7 +1946,7 @@ static void slab_destroy_debugcheck(struct kmem_cache *cachep, | |||
1946 | /** | 1946 | /** |
1947 | * slab_destroy - destroy and release all objects in a slab | 1947 | * slab_destroy - destroy and release all objects in a slab |
1948 | * @cachep: cache pointer being destroyed | 1948 | * @cachep: cache pointer being destroyed |
1949 | * @slabp: slab pointer being destroyed | 1949 | * @page: page pointer being destroyed |
1950 | * | 1950 | * |
1951 | * Destroy all the objs in a slab, and release the mem back to the system. | 1951 | * Destroy all the objs in a slab, and release the mem back to the system. |
1952 | * Before calling the slab must have been unlinked from the cache. The | 1952 | * Before calling the slab must have been unlinked from the cache. The |
@@ -160,12 +160,36 @@ static inline const char *cache_name(struct kmem_cache *s) | |||
160 | return s->name; | 160 | return s->name; |
161 | } | 161 | } |
162 | 162 | ||
163 | /* | ||
164 | * Note, we protect with RCU only the memcg_caches array, not per-memcg caches. | ||
165 | * That said the caller must assure the memcg's cache won't go away. Since once | ||
166 | * created a memcg's cache is destroyed only along with the root cache, it is | ||
167 | * true if we are going to allocate from the cache or hold a reference to the | ||
168 | * root cache by other means. Otherwise, we should hold either the slab_mutex | ||
169 | * or the memcg's slab_caches_mutex while calling this function and accessing | ||
170 | * the returned value. | ||
171 | */ | ||
163 | static inline struct kmem_cache * | 172 | static inline struct kmem_cache * |
164 | cache_from_memcg_idx(struct kmem_cache *s, int idx) | 173 | cache_from_memcg_idx(struct kmem_cache *s, int idx) |
165 | { | 174 | { |
175 | struct kmem_cache *cachep; | ||
176 | struct memcg_cache_params *params; | ||
177 | |||
166 | if (!s->memcg_params) | 178 | if (!s->memcg_params) |
167 | return NULL; | 179 | return NULL; |
168 | return s->memcg_params->memcg_caches[idx]; | 180 | |
181 | rcu_read_lock(); | ||
182 | params = rcu_dereference(s->memcg_params); | ||
183 | cachep = params->memcg_caches[idx]; | ||
184 | rcu_read_unlock(); | ||
185 | |||
186 | /* | ||
187 | * Make sure we will access the up-to-date value. The code updating | ||
188 | * memcg_caches issues a write barrier to match this (see | ||
189 | * memcg_register_cache()). | ||
190 | */ | ||
191 | smp_read_barrier_depends(); | ||
192 | return cachep; | ||
169 | } | 193 | } |
170 | 194 | ||
171 | static inline struct kmem_cache *memcg_root_cache(struct kmem_cache *s) | 195 | static inline struct kmem_cache *memcg_root_cache(struct kmem_cache *s) |
diff --git a/mm/slab_common.c b/mm/slab_common.c index 0b7bb399b0e4..1ec3c619ba04 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c | |||
@@ -171,13 +171,26 @@ kmem_cache_create_memcg(struct mem_cgroup *memcg, const char *name, size_t size, | |||
171 | struct kmem_cache *parent_cache) | 171 | struct kmem_cache *parent_cache) |
172 | { | 172 | { |
173 | struct kmem_cache *s = NULL; | 173 | struct kmem_cache *s = NULL; |
174 | int err = 0; | 174 | int err; |
175 | 175 | ||
176 | get_online_cpus(); | 176 | get_online_cpus(); |
177 | mutex_lock(&slab_mutex); | 177 | mutex_lock(&slab_mutex); |
178 | 178 | ||
179 | if (!kmem_cache_sanity_check(memcg, name, size) == 0) | 179 | err = kmem_cache_sanity_check(memcg, name, size); |
180 | goto out_locked; | 180 | if (err) |
181 | goto out_unlock; | ||
182 | |||
183 | if (memcg) { | ||
184 | /* | ||
185 | * Since per-memcg caches are created asynchronously on first | ||
186 | * allocation (see memcg_kmem_get_cache()), several threads can | ||
187 | * try to create the same cache, but only one of them may | ||
188 | * succeed. Therefore if we get here and see the cache has | ||
189 | * already been created, we silently return NULL. | ||
190 | */ | ||
191 | if (cache_from_memcg_idx(parent_cache, memcg_cache_id(memcg))) | ||
192 | goto out_unlock; | ||
193 | } | ||
181 | 194 | ||
182 | /* | 195 | /* |
183 | * Some allocators will constraint the set of valid flags to a subset | 196 | * Some allocators will constraint the set of valid flags to a subset |
@@ -189,44 +202,47 @@ kmem_cache_create_memcg(struct mem_cgroup *memcg, const char *name, size_t size, | |||
189 | 202 | ||
190 | s = __kmem_cache_alias(memcg, name, size, align, flags, ctor); | 203 | s = __kmem_cache_alias(memcg, name, size, align, flags, ctor); |
191 | if (s) | 204 | if (s) |
192 | goto out_locked; | 205 | goto out_unlock; |
193 | 206 | ||
207 | err = -ENOMEM; | ||
194 | s = kmem_cache_zalloc(kmem_cache, GFP_KERNEL); | 208 | s = kmem_cache_zalloc(kmem_cache, GFP_KERNEL); |
195 | if (s) { | 209 | if (!s) |
196 | s->object_size = s->size = size; | 210 | goto out_unlock; |
197 | s->align = calculate_alignment(flags, align, size); | ||
198 | s->ctor = ctor; | ||
199 | 211 | ||
200 | if (memcg_register_cache(memcg, s, parent_cache)) { | 212 | s->object_size = s->size = size; |
201 | kmem_cache_free(kmem_cache, s); | 213 | s->align = calculate_alignment(flags, align, size); |
202 | err = -ENOMEM; | 214 | s->ctor = ctor; |
203 | goto out_locked; | ||
204 | } | ||
205 | 215 | ||
206 | s->name = kstrdup(name, GFP_KERNEL); | 216 | s->name = kstrdup(name, GFP_KERNEL); |
207 | if (!s->name) { | 217 | if (!s->name) |
208 | kmem_cache_free(kmem_cache, s); | 218 | goto out_free_cache; |
209 | err = -ENOMEM; | ||
210 | goto out_locked; | ||
211 | } | ||
212 | 219 | ||
213 | err = __kmem_cache_create(s, flags); | 220 | err = memcg_alloc_cache_params(memcg, s, parent_cache); |
214 | if (!err) { | 221 | if (err) |
215 | s->refcount = 1; | 222 | goto out_free_cache; |
216 | list_add(&s->list, &slab_caches); | 223 | |
217 | memcg_cache_list_add(memcg, s); | 224 | err = __kmem_cache_create(s, flags); |
218 | } else { | 225 | if (err) |
219 | kfree(s->name); | 226 | goto out_free_cache; |
220 | kmem_cache_free(kmem_cache, s); | 227 | |
221 | } | 228 | s->refcount = 1; |
222 | } else | 229 | list_add(&s->list, &slab_caches); |
223 | err = -ENOMEM; | 230 | memcg_register_cache(s); |
224 | 231 | ||
225 | out_locked: | 232 | out_unlock: |
226 | mutex_unlock(&slab_mutex); | 233 | mutex_unlock(&slab_mutex); |
227 | put_online_cpus(); | 234 | put_online_cpus(); |
228 | 235 | ||
229 | if (err) { | 236 | if (err) { |
237 | /* | ||
238 | * There is no point in flooding logs with warnings or | ||
239 | * especially crashing the system if we fail to create a cache | ||
240 | * for a memcg. In this case we will be accounting the memcg | ||
241 | * allocation to the root cgroup until we succeed to create its | ||
242 | * own cache, but it isn't that critical. | ||
243 | */ | ||
244 | if (!memcg) | ||
245 | return NULL; | ||
230 | 246 | ||
231 | if (flags & SLAB_PANIC) | 247 | if (flags & SLAB_PANIC) |
232 | panic("kmem_cache_create: Failed to create slab '%s'. Error %d\n", | 248 | panic("kmem_cache_create: Failed to create slab '%s'. Error %d\n", |
@@ -236,11 +252,15 @@ out_locked: | |||
236 | name, err); | 252 | name, err); |
237 | dump_stack(); | 253 | dump_stack(); |
238 | } | 254 | } |
239 | |||
240 | return NULL; | 255 | return NULL; |
241 | } | 256 | } |
242 | |||
243 | return s; | 257 | return s; |
258 | |||
259 | out_free_cache: | ||
260 | memcg_free_cache_params(s); | ||
261 | kfree(s->name); | ||
262 | kmem_cache_free(kmem_cache, s); | ||
263 | goto out_unlock; | ||
244 | } | 264 | } |
245 | 265 | ||
246 | struct kmem_cache * | 266 | struct kmem_cache * |
@@ -263,11 +283,12 @@ void kmem_cache_destroy(struct kmem_cache *s) | |||
263 | list_del(&s->list); | 283 | list_del(&s->list); |
264 | 284 | ||
265 | if (!__kmem_cache_shutdown(s)) { | 285 | if (!__kmem_cache_shutdown(s)) { |
286 | memcg_unregister_cache(s); | ||
266 | mutex_unlock(&slab_mutex); | 287 | mutex_unlock(&slab_mutex); |
267 | if (s->flags & SLAB_DESTROY_BY_RCU) | 288 | if (s->flags & SLAB_DESTROY_BY_RCU) |
268 | rcu_barrier(); | 289 | rcu_barrier(); |
269 | 290 | ||
270 | memcg_release_cache(s); | 291 | memcg_free_cache_params(s); |
271 | kfree(s->name); | 292 | kfree(s->name); |
272 | kmem_cache_free(kmem_cache, s); | 293 | kmem_cache_free(kmem_cache, s); |
273 | } else { | 294 | } else { |
@@ -355,6 +355,21 @@ static __always_inline void slab_unlock(struct page *page) | |||
355 | __bit_spin_unlock(PG_locked, &page->flags); | 355 | __bit_spin_unlock(PG_locked, &page->flags); |
356 | } | 356 | } |
357 | 357 | ||
358 | static inline void set_page_slub_counters(struct page *page, unsigned long counters_new) | ||
359 | { | ||
360 | struct page tmp; | ||
361 | tmp.counters = counters_new; | ||
362 | /* | ||
363 | * page->counters can cover frozen/inuse/objects as well | ||
364 | * as page->_count. If we assign to ->counters directly | ||
365 | * we run the risk of losing updates to page->_count, so | ||
366 | * be careful and only assign to the fields we need. | ||
367 | */ | ||
368 | page->frozen = tmp.frozen; | ||
369 | page->inuse = tmp.inuse; | ||
370 | page->objects = tmp.objects; | ||
371 | } | ||
372 | |||
358 | /* Interrupts must be disabled (for the fallback code to work right) */ | 373 | /* Interrupts must be disabled (for the fallback code to work right) */ |
359 | static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct page *page, | 374 | static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct page *page, |
360 | void *freelist_old, unsigned long counters_old, | 375 | void *freelist_old, unsigned long counters_old, |
@@ -376,7 +391,7 @@ static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct page *page | |||
376 | if (page->freelist == freelist_old && | 391 | if (page->freelist == freelist_old && |
377 | page->counters == counters_old) { | 392 | page->counters == counters_old) { |
378 | page->freelist = freelist_new; | 393 | page->freelist = freelist_new; |
379 | page->counters = counters_new; | 394 | set_page_slub_counters(page, counters_new); |
380 | slab_unlock(page); | 395 | slab_unlock(page); |
381 | return 1; | 396 | return 1; |
382 | } | 397 | } |
@@ -415,7 +430,7 @@ static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page, | |||
415 | if (page->freelist == freelist_old && | 430 | if (page->freelist == freelist_old && |
416 | page->counters == counters_old) { | 431 | page->counters == counters_old) { |
417 | page->freelist = freelist_new; | 432 | page->freelist = freelist_new; |
418 | page->counters = counters_new; | 433 | set_page_slub_counters(page, counters_new); |
419 | slab_unlock(page); | 434 | slab_unlock(page); |
420 | local_irq_restore(flags); | 435 | local_irq_restore(flags); |
421 | return 1; | 436 | return 1; |
@@ -985,23 +1000,22 @@ static inline void slab_free_hook(struct kmem_cache *s, void *x) | |||
985 | 1000 | ||
986 | /* | 1001 | /* |
987 | * Tracking of fully allocated slabs for debugging purposes. | 1002 | * Tracking of fully allocated slabs for debugging purposes. |
988 | * | ||
989 | * list_lock must be held. | ||
990 | */ | 1003 | */ |
991 | static void add_full(struct kmem_cache *s, | 1004 | static void add_full(struct kmem_cache *s, |
992 | struct kmem_cache_node *n, struct page *page) | 1005 | struct kmem_cache_node *n, struct page *page) |
993 | { | 1006 | { |
1007 | lockdep_assert_held(&n->list_lock); | ||
1008 | |||
994 | if (!(s->flags & SLAB_STORE_USER)) | 1009 | if (!(s->flags & SLAB_STORE_USER)) |
995 | return; | 1010 | return; |
996 | 1011 | ||
997 | list_add(&page->lru, &n->full); | 1012 | list_add(&page->lru, &n->full); |
998 | } | 1013 | } |
999 | 1014 | ||
1000 | /* | 1015 | static void remove_full(struct kmem_cache *s, struct kmem_cache_node *n, struct page *page) |
1001 | * list_lock must be held. | ||
1002 | */ | ||
1003 | static void remove_full(struct kmem_cache *s, struct page *page) | ||
1004 | { | 1016 | { |
1017 | lockdep_assert_held(&n->list_lock); | ||
1018 | |||
1005 | if (!(s->flags & SLAB_STORE_USER)) | 1019 | if (!(s->flags & SLAB_STORE_USER)) |
1006 | return; | 1020 | return; |
1007 | 1021 | ||
@@ -1250,7 +1264,8 @@ static inline int check_object(struct kmem_cache *s, struct page *page, | |||
1250 | void *object, u8 val) { return 1; } | 1264 | void *object, u8 val) { return 1; } |
1251 | static inline void add_full(struct kmem_cache *s, struct kmem_cache_node *n, | 1265 | static inline void add_full(struct kmem_cache *s, struct kmem_cache_node *n, |
1252 | struct page *page) {} | 1266 | struct page *page) {} |
1253 | static inline void remove_full(struct kmem_cache *s, struct page *page) {} | 1267 | static inline void remove_full(struct kmem_cache *s, struct kmem_cache_node *n, |
1268 | struct page *page) {} | ||
1254 | static inline unsigned long kmem_cache_flags(unsigned long object_size, | 1269 | static inline unsigned long kmem_cache_flags(unsigned long object_size, |
1255 | unsigned long flags, const char *name, | 1270 | unsigned long flags, const char *name, |
1256 | void (*ctor)(void *)) | 1271 | void (*ctor)(void *)) |
@@ -1504,12 +1519,12 @@ static void discard_slab(struct kmem_cache *s, struct page *page) | |||
1504 | 1519 | ||
1505 | /* | 1520 | /* |
1506 | * Management of partially allocated slabs. | 1521 | * Management of partially allocated slabs. |
1507 | * | ||
1508 | * list_lock must be held. | ||
1509 | */ | 1522 | */ |
1510 | static inline void add_partial(struct kmem_cache_node *n, | 1523 | static inline void add_partial(struct kmem_cache_node *n, |
1511 | struct page *page, int tail) | 1524 | struct page *page, int tail) |
1512 | { | 1525 | { |
1526 | lockdep_assert_held(&n->list_lock); | ||
1527 | |||
1513 | n->nr_partial++; | 1528 | n->nr_partial++; |
1514 | if (tail == DEACTIVATE_TO_TAIL) | 1529 | if (tail == DEACTIVATE_TO_TAIL) |
1515 | list_add_tail(&page->lru, &n->partial); | 1530 | list_add_tail(&page->lru, &n->partial); |
@@ -1517,12 +1532,11 @@ static inline void add_partial(struct kmem_cache_node *n, | |||
1517 | list_add(&page->lru, &n->partial); | 1532 | list_add(&page->lru, &n->partial); |
1518 | } | 1533 | } |
1519 | 1534 | ||
1520 | /* | ||
1521 | * list_lock must be held. | ||
1522 | */ | ||
1523 | static inline void remove_partial(struct kmem_cache_node *n, | 1535 | static inline void remove_partial(struct kmem_cache_node *n, |
1524 | struct page *page) | 1536 | struct page *page) |
1525 | { | 1537 | { |
1538 | lockdep_assert_held(&n->list_lock); | ||
1539 | |||
1526 | list_del(&page->lru); | 1540 | list_del(&page->lru); |
1527 | n->nr_partial--; | 1541 | n->nr_partial--; |
1528 | } | 1542 | } |
@@ -1532,8 +1546,6 @@ static inline void remove_partial(struct kmem_cache_node *n, | |||
1532 | * return the pointer to the freelist. | 1546 | * return the pointer to the freelist. |
1533 | * | 1547 | * |
1534 | * Returns a list of objects or NULL if it fails. | 1548 | * Returns a list of objects or NULL if it fails. |
1535 | * | ||
1536 | * Must hold list_lock since we modify the partial list. | ||
1537 | */ | 1549 | */ |
1538 | static inline void *acquire_slab(struct kmem_cache *s, | 1550 | static inline void *acquire_slab(struct kmem_cache *s, |
1539 | struct kmem_cache_node *n, struct page *page, | 1551 | struct kmem_cache_node *n, struct page *page, |
@@ -1543,6 +1555,8 @@ static inline void *acquire_slab(struct kmem_cache *s, | |||
1543 | unsigned long counters; | 1555 | unsigned long counters; |
1544 | struct page new; | 1556 | struct page new; |
1545 | 1557 | ||
1558 | lockdep_assert_held(&n->list_lock); | ||
1559 | |||
1546 | /* | 1560 | /* |
1547 | * Zap the freelist and set the frozen bit. | 1561 | * Zap the freelist and set the frozen bit. |
1548 | * The old freelist is the list of objects for the | 1562 | * The old freelist is the list of objects for the |
@@ -1887,7 +1901,7 @@ redo: | |||
1887 | 1901 | ||
1888 | else if (l == M_FULL) | 1902 | else if (l == M_FULL) |
1889 | 1903 | ||
1890 | remove_full(s, page); | 1904 | remove_full(s, n, page); |
1891 | 1905 | ||
1892 | if (m == M_PARTIAL) { | 1906 | if (m == M_PARTIAL) { |
1893 | 1907 | ||
@@ -2541,7 +2555,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page, | |||
2541 | new.inuse--; | 2555 | new.inuse--; |
2542 | if ((!new.inuse || !prior) && !was_frozen) { | 2556 | if ((!new.inuse || !prior) && !was_frozen) { |
2543 | 2557 | ||
2544 | if (kmem_cache_has_cpu_partial(s) && !prior) | 2558 | if (kmem_cache_has_cpu_partial(s) && !prior) { |
2545 | 2559 | ||
2546 | /* | 2560 | /* |
2547 | * Slab was on no list before and will be | 2561 | * Slab was on no list before and will be |
@@ -2551,7 +2565,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page, | |||
2551 | */ | 2565 | */ |
2552 | new.frozen = 1; | 2566 | new.frozen = 1; |
2553 | 2567 | ||
2554 | else { /* Needs to be taken off a list */ | 2568 | } else { /* Needs to be taken off a list */ |
2555 | 2569 | ||
2556 | n = get_node(s, page_to_nid(page)); | 2570 | n = get_node(s, page_to_nid(page)); |
2557 | /* | 2571 | /* |
@@ -2600,7 +2614,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page, | |||
2600 | */ | 2614 | */ |
2601 | if (!kmem_cache_has_cpu_partial(s) && unlikely(!prior)) { | 2615 | if (!kmem_cache_has_cpu_partial(s) && unlikely(!prior)) { |
2602 | if (kmem_cache_debug(s)) | 2616 | if (kmem_cache_debug(s)) |
2603 | remove_full(s, page); | 2617 | remove_full(s, n, page); |
2604 | add_partial(n, page, DEACTIVATE_TO_TAIL); | 2618 | add_partial(n, page, DEACTIVATE_TO_TAIL); |
2605 | stat(s, FREE_ADD_PARTIAL); | 2619 | stat(s, FREE_ADD_PARTIAL); |
2606 | } | 2620 | } |
@@ -2614,9 +2628,10 @@ slab_empty: | |||
2614 | */ | 2628 | */ |
2615 | remove_partial(n, page); | 2629 | remove_partial(n, page); |
2616 | stat(s, FREE_REMOVE_PARTIAL); | 2630 | stat(s, FREE_REMOVE_PARTIAL); |
2617 | } else | 2631 | } else { |
2618 | /* Slab must be on the full list */ | 2632 | /* Slab must be on the full list */ |
2619 | remove_full(s, page); | 2633 | remove_full(s, n, page); |
2634 | } | ||
2620 | 2635 | ||
2621 | spin_unlock_irqrestore(&n->list_lock, flags); | 2636 | spin_unlock_irqrestore(&n->list_lock, flags); |
2622 | stat(s, FREE_SLAB); | 2637 | stat(s, FREE_SLAB); |
@@ -2890,7 +2905,13 @@ static void early_kmem_cache_node_alloc(int node) | |||
2890 | init_kmem_cache_node(n); | 2905 | init_kmem_cache_node(n); |
2891 | inc_slabs_node(kmem_cache_node, node, page->objects); | 2906 | inc_slabs_node(kmem_cache_node, node, page->objects); |
2892 | 2907 | ||
2908 | /* | ||
2909 | * the lock is for lockdep's sake, not for any actual | ||
2910 | * race protection | ||
2911 | */ | ||
2912 | spin_lock(&n->list_lock); | ||
2893 | add_partial(n, page, DEACTIVATE_TO_HEAD); | 2913 | add_partial(n, page, DEACTIVATE_TO_HEAD); |
2914 | spin_unlock(&n->list_lock); | ||
2894 | } | 2915 | } |
2895 | 2916 | ||
2896 | static void free_kmem_cache_nodes(struct kmem_cache *s) | 2917 | static void free_kmem_cache_nodes(struct kmem_cache *s) |
@@ -4299,7 +4320,13 @@ static ssize_t show_slab_objects(struct kmem_cache *s, | |||
4299 | 4320 | ||
4300 | page = ACCESS_ONCE(c->partial); | 4321 | page = ACCESS_ONCE(c->partial); |
4301 | if (page) { | 4322 | if (page) { |
4302 | x = page->pobjects; | 4323 | node = page_to_nid(page); |
4324 | if (flags & SO_TOTAL) | ||
4325 | WARN_ON_ONCE(1); | ||
4326 | else if (flags & SO_OBJECTS) | ||
4327 | WARN_ON_ONCE(1); | ||
4328 | else | ||
4329 | x = page->pages; | ||
4303 | total += x; | 4330 | total += x; |
4304 | nodes[node] += x; | 4331 | nodes[node] += x; |
4305 | } | 4332 | } |
@@ -5163,7 +5190,7 @@ static int sysfs_slab_add(struct kmem_cache *s) | |||
5163 | } | 5190 | } |
5164 | 5191 | ||
5165 | s->kobj.kset = slab_kset; | 5192 | s->kobj.kset = slab_kset; |
5166 | err = kobject_init_and_add(&s->kobj, &slab_ktype, NULL, name); | 5193 | err = kobject_init_and_add(&s->kobj, &slab_ktype, NULL, "%s", name); |
5167 | if (err) { | 5194 | if (err) { |
5168 | kobject_put(&s->kobj); | 5195 | kobject_put(&s->kobj); |
5169 | return err; | 5196 | return err; |
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c index 27eeab3be757..4cba9c2783a1 100644 --- a/mm/sparse-vmemmap.c +++ b/mm/sparse-vmemmap.c | |||
@@ -40,7 +40,8 @@ static void * __init_refok __earlyonly_bootmem_alloc(int node, | |||
40 | unsigned long align, | 40 | unsigned long align, |
41 | unsigned long goal) | 41 | unsigned long goal) |
42 | { | 42 | { |
43 | return __alloc_bootmem_node_high(NODE_DATA(node), size, align, goal); | 43 | return memblock_virt_alloc_try_nid(size, align, goal, |
44 | BOOTMEM_ALLOC_ACCESSIBLE, node); | ||
44 | } | 45 | } |
45 | 46 | ||
46 | static void *vmemmap_buf; | 47 | static void *vmemmap_buf; |
@@ -226,7 +227,8 @@ void __init sparse_mem_maps_populate_node(struct page **map_map, | |||
226 | 227 | ||
227 | if (vmemmap_buf_start) { | 228 | if (vmemmap_buf_start) { |
228 | /* need to free left buf */ | 229 | /* need to free left buf */ |
229 | free_bootmem(__pa(vmemmap_buf), vmemmap_buf_end - vmemmap_buf); | 230 | memblock_free_early(__pa(vmemmap_buf), |
231 | vmemmap_buf_end - vmemmap_buf); | ||
230 | vmemmap_buf = NULL; | 232 | vmemmap_buf = NULL; |
231 | vmemmap_buf_end = NULL; | 233 | vmemmap_buf_end = NULL; |
232 | } | 234 | } |
diff --git a/mm/sparse.c b/mm/sparse.c index 8cc7be0e9590..63c3ea5c119c 100644 --- a/mm/sparse.c +++ b/mm/sparse.c | |||
@@ -69,7 +69,7 @@ static struct mem_section noinline __init_refok *sparse_index_alloc(int nid) | |||
69 | else | 69 | else |
70 | section = kzalloc(array_size, GFP_KERNEL); | 70 | section = kzalloc(array_size, GFP_KERNEL); |
71 | } else { | 71 | } else { |
72 | section = alloc_bootmem_node(NODE_DATA(nid), array_size); | 72 | section = memblock_virt_alloc_node(array_size, nid); |
73 | } | 73 | } |
74 | 74 | ||
75 | return section; | 75 | return section; |
@@ -279,8 +279,9 @@ sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat, | |||
279 | limit = goal + (1UL << PA_SECTION_SHIFT); | 279 | limit = goal + (1UL << PA_SECTION_SHIFT); |
280 | nid = early_pfn_to_nid(goal >> PAGE_SHIFT); | 280 | nid = early_pfn_to_nid(goal >> PAGE_SHIFT); |
281 | again: | 281 | again: |
282 | p = ___alloc_bootmem_node_nopanic(NODE_DATA(nid), size, | 282 | p = memblock_virt_alloc_try_nid_nopanic(size, |
283 | SMP_CACHE_BYTES, goal, limit); | 283 | SMP_CACHE_BYTES, goal, limit, |
284 | nid); | ||
284 | if (!p && limit) { | 285 | if (!p && limit) { |
285 | limit = 0; | 286 | limit = 0; |
286 | goto again; | 287 | goto again; |
@@ -331,7 +332,7 @@ static unsigned long * __init | |||
331 | sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat, | 332 | sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat, |
332 | unsigned long size) | 333 | unsigned long size) |
333 | { | 334 | { |
334 | return alloc_bootmem_node_nopanic(pgdat, size); | 335 | return memblock_virt_alloc_node_nopanic(size, pgdat->node_id); |
335 | } | 336 | } |
336 | 337 | ||
337 | static void __init check_usemap_section_nr(int nid, unsigned long *usemap) | 338 | static void __init check_usemap_section_nr(int nid, unsigned long *usemap) |
@@ -376,8 +377,9 @@ struct page __init *sparse_mem_map_populate(unsigned long pnum, int nid) | |||
376 | return map; | 377 | return map; |
377 | 378 | ||
378 | size = PAGE_ALIGN(sizeof(struct page) * PAGES_PER_SECTION); | 379 | size = PAGE_ALIGN(sizeof(struct page) * PAGES_PER_SECTION); |
379 | map = __alloc_bootmem_node_high(NODE_DATA(nid), size, | 380 | map = memblock_virt_alloc_try_nid(size, |
380 | PAGE_SIZE, __pa(MAX_DMA_ADDRESS)); | 381 | PAGE_SIZE, __pa(MAX_DMA_ADDRESS), |
382 | BOOTMEM_ALLOC_ACCESSIBLE, nid); | ||
381 | return map; | 383 | return map; |
382 | } | 384 | } |
383 | void __init sparse_mem_maps_populate_node(struct page **map_map, | 385 | void __init sparse_mem_maps_populate_node(struct page **map_map, |
@@ -401,8 +403,9 @@ void __init sparse_mem_maps_populate_node(struct page **map_map, | |||
401 | } | 403 | } |
402 | 404 | ||
403 | size = PAGE_ALIGN(size); | 405 | size = PAGE_ALIGN(size); |
404 | map = __alloc_bootmem_node_high(NODE_DATA(nodeid), size * map_count, | 406 | map = memblock_virt_alloc_try_nid(size * map_count, |
405 | PAGE_SIZE, __pa(MAX_DMA_ADDRESS)); | 407 | PAGE_SIZE, __pa(MAX_DMA_ADDRESS), |
408 | BOOTMEM_ALLOC_ACCESSIBLE, nodeid); | ||
406 | if (map) { | 409 | if (map) { |
407 | for (pnum = pnum_begin; pnum < pnum_end; pnum++) { | 410 | for (pnum = pnum_begin; pnum < pnum_end; pnum++) { |
408 | if (!present_section_nr(pnum)) | 411 | if (!present_section_nr(pnum)) |
@@ -545,7 +548,7 @@ void __init sparse_init(void) | |||
545 | * sparse_early_mem_map_alloc, so allocate usemap_map at first. | 548 | * sparse_early_mem_map_alloc, so allocate usemap_map at first. |
546 | */ | 549 | */ |
547 | size = sizeof(unsigned long *) * NR_MEM_SECTIONS; | 550 | size = sizeof(unsigned long *) * NR_MEM_SECTIONS; |
548 | usemap_map = alloc_bootmem(size); | 551 | usemap_map = memblock_virt_alloc(size, 0); |
549 | if (!usemap_map) | 552 | if (!usemap_map) |
550 | panic("can not allocate usemap_map\n"); | 553 | panic("can not allocate usemap_map\n"); |
551 | alloc_usemap_and_memmap(sparse_early_usemaps_alloc_node, | 554 | alloc_usemap_and_memmap(sparse_early_usemaps_alloc_node, |
@@ -553,7 +556,7 @@ void __init sparse_init(void) | |||
553 | 556 | ||
554 | #ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER | 557 | #ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER |
555 | size2 = sizeof(struct page *) * NR_MEM_SECTIONS; | 558 | size2 = sizeof(struct page *) * NR_MEM_SECTIONS; |
556 | map_map = alloc_bootmem(size2); | 559 | map_map = memblock_virt_alloc(size2, 0); |
557 | if (!map_map) | 560 | if (!map_map) |
558 | panic("can not allocate map_map\n"); | 561 | panic("can not allocate map_map\n"); |
559 | alloc_usemap_and_memmap(sparse_early_mem_maps_alloc_node, | 562 | alloc_usemap_and_memmap(sparse_early_mem_maps_alloc_node, |
@@ -583,9 +586,9 @@ void __init sparse_init(void) | |||
583 | vmemmap_populate_print_last(); | 586 | vmemmap_populate_print_last(); |
584 | 587 | ||
585 | #ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER | 588 | #ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER |
586 | free_bootmem(__pa(map_map), size2); | 589 | memblock_free_early(__pa(map_map), size2); |
587 | #endif | 590 | #endif |
588 | free_bootmem(__pa(usemap_map), size); | 591 | memblock_free_early(__pa(usemap_map), size); |
589 | } | 592 | } |
590 | 593 | ||
591 | #ifdef CONFIG_MEMORY_HOTPLUG | 594 | #ifdef CONFIG_MEMORY_HOTPLUG |
@@ -31,7 +31,6 @@ | |||
31 | #include <linux/memcontrol.h> | 31 | #include <linux/memcontrol.h> |
32 | #include <linux/gfp.h> | 32 | #include <linux/gfp.h> |
33 | #include <linux/uio.h> | 33 | #include <linux/uio.h> |
34 | #include <linux/hugetlb.h> | ||
35 | 34 | ||
36 | #include "internal.h" | 35 | #include "internal.h" |
37 | 36 | ||
@@ -58,7 +57,7 @@ static void __page_cache_release(struct page *page) | |||
58 | 57 | ||
59 | spin_lock_irqsave(&zone->lru_lock, flags); | 58 | spin_lock_irqsave(&zone->lru_lock, flags); |
60 | lruvec = mem_cgroup_page_lruvec(page, zone); | 59 | lruvec = mem_cgroup_page_lruvec(page, zone); |
61 | VM_BUG_ON(!PageLRU(page)); | 60 | VM_BUG_ON_PAGE(!PageLRU(page), page); |
62 | __ClearPageLRU(page); | 61 | __ClearPageLRU(page); |
63 | del_page_from_lru_list(page, lruvec, page_off_lru(page)); | 62 | del_page_from_lru_list(page, lruvec, page_off_lru(page)); |
64 | spin_unlock_irqrestore(&zone->lru_lock, flags); | 63 | spin_unlock_irqrestore(&zone->lru_lock, flags); |
@@ -82,118 +81,150 @@ static void __put_compound_page(struct page *page) | |||
82 | 81 | ||
83 | static void put_compound_page(struct page *page) | 82 | static void put_compound_page(struct page *page) |
84 | { | 83 | { |
85 | if (unlikely(PageTail(page))) { | 84 | struct page *page_head; |
86 | /* __split_huge_page_refcount can run under us */ | ||
87 | struct page *page_head = compound_trans_head(page); | ||
88 | |||
89 | if (likely(page != page_head && | ||
90 | get_page_unless_zero(page_head))) { | ||
91 | unsigned long flags; | ||
92 | 85 | ||
86 | if (likely(!PageTail(page))) { | ||
87 | if (put_page_testzero(page)) { | ||
93 | /* | 88 | /* |
94 | * THP can not break up slab pages so avoid taking | 89 | * By the time all refcounts have been released |
95 | * compound_lock(). Slab performs non-atomic bit ops | 90 | * split_huge_page cannot run anymore from under us. |
96 | * on page->flags for better performance. In particular | ||
97 | * slab_unlock() in slub used to be a hot path. It is | ||
98 | * still hot on arches that do not support | ||
99 | * this_cpu_cmpxchg_double(). | ||
100 | */ | 91 | */ |
101 | if (PageSlab(page_head) || PageHeadHuge(page_head)) { | 92 | if (PageHead(page)) |
102 | if (likely(PageTail(page))) { | 93 | __put_compound_page(page); |
103 | /* | 94 | else |
104 | * __split_huge_page_refcount | 95 | __put_single_page(page); |
105 | * cannot race here. | 96 | } |
106 | */ | 97 | return; |
107 | VM_BUG_ON(!PageHead(page_head)); | 98 | } |
108 | atomic_dec(&page->_mapcount); | 99 | |
109 | if (put_page_testzero(page_head)) | 100 | /* __split_huge_page_refcount can run under us */ |
110 | VM_BUG_ON(1); | 101 | page_head = compound_trans_head(page); |
111 | if (put_page_testzero(page_head)) | 102 | |
112 | __put_compound_page(page_head); | 103 | /* |
113 | return; | 104 | * THP can not break up slab pages so avoid taking |
114 | } else | 105 | * compound_lock() and skip the tail page refcounting (in |
115 | /* | 106 | * _mapcount) too. Slab performs non-atomic bit ops on |
116 | * __split_huge_page_refcount | 107 | * page->flags for better performance. In particular |
117 | * run before us, "page" was a | 108 | * slab_unlock() in slub used to be a hot path. It is still |
118 | * THP tail. The split | 109 | * hot on arches that do not support |
119 | * page_head has been freed | 110 | * this_cpu_cmpxchg_double(). |
120 | * and reallocated as slab or | 111 | * |
121 | * hugetlbfs page of smaller | 112 | * If "page" is part of a slab or hugetlbfs page it cannot be |
122 | * order (only possible if | 113 | * splitted and the head page cannot change from under us. And |
123 | * reallocated as slab on | 114 | * if "page" is part of a THP page under splitting, if the |
124 | * x86). | 115 | * head page pointed by the THP tail isn't a THP head anymore, |
125 | */ | 116 | * we'll find PageTail clear after smp_rmb() and we'll treat |
126 | goto skip_lock; | 117 | * it as a single page. |
127 | } | 118 | */ |
119 | if (!__compound_tail_refcounted(page_head)) { | ||
120 | /* | ||
121 | * If "page" is a THP tail, we must read the tail page | ||
122 | * flags after the head page flags. The | ||
123 | * split_huge_page side enforces write memory barriers | ||
124 | * between clearing PageTail and before the head page | ||
125 | * can be freed and reallocated. | ||
126 | */ | ||
127 | smp_rmb(); | ||
128 | if (likely(PageTail(page))) { | ||
128 | /* | 129 | /* |
129 | * page_head wasn't a dangling pointer but it | 130 | * __split_huge_page_refcount cannot race |
130 | * may not be a head page anymore by the time | 131 | * here. |
131 | * we obtain the lock. That is ok as long as it | ||
132 | * can't be freed from under us. | ||
133 | */ | 132 | */ |
134 | flags = compound_lock_irqsave(page_head); | 133 | VM_BUG_ON_PAGE(!PageHead(page_head), page_head); |
135 | if (unlikely(!PageTail(page))) { | 134 | VM_BUG_ON_PAGE(page_mapcount(page) != 0, page); |
136 | /* __split_huge_page_refcount run before us */ | 135 | if (put_page_testzero(page_head)) { |
137 | compound_unlock_irqrestore(page_head, flags); | 136 | /* |
138 | skip_lock: | 137 | * If this is the tail of a slab |
139 | if (put_page_testzero(page_head)) { | 138 | * compound page, the tail pin must |
140 | /* | 139 | * not be the last reference held on |
141 | * The head page may have been | 140 | * the page, because the PG_slab |
142 | * freed and reallocated as a | 141 | * cannot be cleared before all tail |
143 | * compound page of smaller | 142 | * pins (which skips the _mapcount |
144 | * order and then freed again. | 143 | * tail refcounting) have been |
145 | * All we know is that it | 144 | * released. For hugetlbfs the tail |
146 | * cannot have become: a THP | 145 | * pin may be the last reference on |
147 | * page, a compound page of | 146 | * the page instead, because |
148 | * higher order, a tail page. | 147 | * PageHeadHuge will not go away until |
149 | * That is because we still | 148 | * the compound page enters the buddy |
150 | * hold the refcount of the | 149 | * allocator. |
151 | * split THP tail and | 150 | */ |
152 | * page_head was the THP head | 151 | VM_BUG_ON_PAGE(PageSlab(page_head), page_head); |
153 | * before the split. | 152 | __put_compound_page(page_head); |
154 | */ | ||
155 | if (PageHead(page_head)) | ||
156 | __put_compound_page(page_head); | ||
157 | else | ||
158 | __put_single_page(page_head); | ||
159 | } | ||
160 | out_put_single: | ||
161 | if (put_page_testzero(page)) | ||
162 | __put_single_page(page); | ||
163 | return; | ||
164 | } | 153 | } |
165 | VM_BUG_ON(page_head != page->first_page); | 154 | return; |
155 | } else | ||
166 | /* | 156 | /* |
167 | * We can release the refcount taken by | 157 | * __split_huge_page_refcount run before us, |
168 | * get_page_unless_zero() now that | 158 | * "page" was a THP tail. The split page_head |
169 | * __split_huge_page_refcount() is blocked on | 159 | * has been freed and reallocated as slab or |
170 | * the compound_lock. | 160 | * hugetlbfs page of smaller order (only |
161 | * possible if reallocated as slab on x86). | ||
171 | */ | 162 | */ |
172 | if (put_page_testzero(page_head)) | 163 | goto out_put_single; |
173 | VM_BUG_ON(1); | 164 | } |
174 | /* __split_huge_page_refcount will wait now */ | ||
175 | VM_BUG_ON(page_mapcount(page) <= 0); | ||
176 | atomic_dec(&page->_mapcount); | ||
177 | VM_BUG_ON(atomic_read(&page_head->_count) <= 0); | ||
178 | VM_BUG_ON(atomic_read(&page->_count) != 0); | ||
179 | compound_unlock_irqrestore(page_head, flags); | ||
180 | 165 | ||
166 | if (likely(page != page_head && get_page_unless_zero(page_head))) { | ||
167 | unsigned long flags; | ||
168 | |||
169 | /* | ||
170 | * page_head wasn't a dangling pointer but it may not | ||
171 | * be a head page anymore by the time we obtain the | ||
172 | * lock. That is ok as long as it can't be freed from | ||
173 | * under us. | ||
174 | */ | ||
175 | flags = compound_lock_irqsave(page_head); | ||
176 | if (unlikely(!PageTail(page))) { | ||
177 | /* __split_huge_page_refcount run before us */ | ||
178 | compound_unlock_irqrestore(page_head, flags); | ||
181 | if (put_page_testzero(page_head)) { | 179 | if (put_page_testzero(page_head)) { |
180 | /* | ||
181 | * The head page may have been freed | ||
182 | * and reallocated as a compound page | ||
183 | * of smaller order and then freed | ||
184 | * again. All we know is that it | ||
185 | * cannot have become: a THP page, a | ||
186 | * compound page of higher order, a | ||
187 | * tail page. That is because we | ||
188 | * still hold the refcount of the | ||
189 | * split THP tail and page_head was | ||
190 | * the THP head before the split. | ||
191 | */ | ||
182 | if (PageHead(page_head)) | 192 | if (PageHead(page_head)) |
183 | __put_compound_page(page_head); | 193 | __put_compound_page(page_head); |
184 | else | 194 | else |
185 | __put_single_page(page_head); | 195 | __put_single_page(page_head); |
186 | } | 196 | } |
187 | } else { | 197 | out_put_single: |
188 | /* page_head is a dangling pointer */ | 198 | if (put_page_testzero(page)) |
189 | VM_BUG_ON(PageTail(page)); | 199 | __put_single_page(page); |
190 | goto out_put_single; | 200 | return; |
191 | } | 201 | } |
192 | } else if (put_page_testzero(page)) { | 202 | VM_BUG_ON_PAGE(page_head != page->first_page, page); |
193 | if (PageHead(page)) | 203 | /* |
194 | __put_compound_page(page); | 204 | * We can release the refcount taken by |
195 | else | 205 | * get_page_unless_zero() now that |
196 | __put_single_page(page); | 206 | * __split_huge_page_refcount() is blocked on the |
207 | * compound_lock. | ||
208 | */ | ||
209 | if (put_page_testzero(page_head)) | ||
210 | VM_BUG_ON_PAGE(1, page_head); | ||
211 | /* __split_huge_page_refcount will wait now */ | ||
212 | VM_BUG_ON_PAGE(page_mapcount(page) <= 0, page); | ||
213 | atomic_dec(&page->_mapcount); | ||
214 | VM_BUG_ON_PAGE(atomic_read(&page_head->_count) <= 0, page_head); | ||
215 | VM_BUG_ON_PAGE(atomic_read(&page->_count) != 0, page); | ||
216 | compound_unlock_irqrestore(page_head, flags); | ||
217 | |||
218 | if (put_page_testzero(page_head)) { | ||
219 | if (PageHead(page_head)) | ||
220 | __put_compound_page(page_head); | ||
221 | else | ||
222 | __put_single_page(page_head); | ||
223 | } | ||
224 | } else { | ||
225 | /* page_head is a dangling pointer */ | ||
226 | VM_BUG_ON_PAGE(PageTail(page), page); | ||
227 | goto out_put_single; | ||
197 | } | 228 | } |
198 | } | 229 | } |
199 | 230 | ||
@@ -221,36 +252,37 @@ bool __get_page_tail(struct page *page) | |||
221 | * split_huge_page(). | 252 | * split_huge_page(). |
222 | */ | 253 | */ |
223 | unsigned long flags; | 254 | unsigned long flags; |
224 | bool got = false; | 255 | bool got; |
225 | struct page *page_head = compound_trans_head(page); | 256 | struct page *page_head = compound_trans_head(page); |
226 | 257 | ||
227 | if (likely(page != page_head && get_page_unless_zero(page_head))) { | 258 | /* Ref to put_compound_page() comment. */ |
228 | /* Ref to put_compound_page() comment. */ | 259 | if (!__compound_tail_refcounted(page_head)) { |
229 | if (PageSlab(page_head) || PageHeadHuge(page_head)) { | 260 | smp_rmb(); |
230 | if (likely(PageTail(page))) { | 261 | if (likely(PageTail(page))) { |
231 | /* | 262 | /* |
232 | * This is a hugetlbfs page or a slab | 263 | * This is a hugetlbfs page or a slab |
233 | * page. __split_huge_page_refcount | 264 | * page. __split_huge_page_refcount |
234 | * cannot race here. | 265 | * cannot race here. |
235 | */ | 266 | */ |
236 | VM_BUG_ON(!PageHead(page_head)); | 267 | VM_BUG_ON_PAGE(!PageHead(page_head), page_head); |
237 | __get_page_tail_foll(page, false); | 268 | __get_page_tail_foll(page, true); |
238 | return true; | 269 | return true; |
239 | } else { | 270 | } else { |
240 | /* | 271 | /* |
241 | * __split_huge_page_refcount run | 272 | * __split_huge_page_refcount run |
242 | * before us, "page" was a THP | 273 | * before us, "page" was a THP |
243 | * tail. The split page_head has been | 274 | * tail. The split page_head has been |
244 | * freed and reallocated as slab or | 275 | * freed and reallocated as slab or |
245 | * hugetlbfs page of smaller order | 276 | * hugetlbfs page of smaller order |
246 | * (only possible if reallocated as | 277 | * (only possible if reallocated as |
247 | * slab on x86). | 278 | * slab on x86). |
248 | */ | 279 | */ |
249 | put_page(page_head); | 280 | return false; |
250 | return false; | ||
251 | } | ||
252 | } | 281 | } |
282 | } | ||
253 | 283 | ||
284 | got = false; | ||
285 | if (likely(page != page_head && get_page_unless_zero(page_head))) { | ||
254 | /* | 286 | /* |
255 | * page_head wasn't a dangling pointer but it | 287 | * page_head wasn't a dangling pointer but it |
256 | * may not be a head page anymore by the time | 288 | * may not be a head page anymore by the time |
@@ -572,8 +604,8 @@ EXPORT_SYMBOL(__lru_cache_add); | |||
572 | */ | 604 | */ |
573 | void lru_cache_add(struct page *page) | 605 | void lru_cache_add(struct page *page) |
574 | { | 606 | { |
575 | VM_BUG_ON(PageActive(page) && PageUnevictable(page)); | 607 | VM_BUG_ON_PAGE(PageActive(page) && PageUnevictable(page), page); |
576 | VM_BUG_ON(PageLRU(page)); | 608 | VM_BUG_ON_PAGE(PageLRU(page), page); |
577 | __lru_cache_add(page); | 609 | __lru_cache_add(page); |
578 | } | 610 | } |
579 | 611 | ||
@@ -814,7 +846,7 @@ void release_pages(struct page **pages, int nr, int cold) | |||
814 | } | 846 | } |
815 | 847 | ||
816 | lruvec = mem_cgroup_page_lruvec(page, zone); | 848 | lruvec = mem_cgroup_page_lruvec(page, zone); |
817 | VM_BUG_ON(!PageLRU(page)); | 849 | VM_BUG_ON_PAGE(!PageLRU(page), page); |
818 | __ClearPageLRU(page); | 850 | __ClearPageLRU(page); |
819 | del_page_from_lru_list(page, lruvec, page_off_lru(page)); | 851 | del_page_from_lru_list(page, lruvec, page_off_lru(page)); |
820 | } | 852 | } |
@@ -856,9 +888,9 @@ void lru_add_page_tail(struct page *page, struct page *page_tail, | |||
856 | { | 888 | { |
857 | const int file = 0; | 889 | const int file = 0; |
858 | 890 | ||
859 | VM_BUG_ON(!PageHead(page)); | 891 | VM_BUG_ON_PAGE(!PageHead(page), page); |
860 | VM_BUG_ON(PageCompound(page_tail)); | 892 | VM_BUG_ON_PAGE(PageCompound(page_tail), page); |
861 | VM_BUG_ON(PageLRU(page_tail)); | 893 | VM_BUG_ON_PAGE(PageLRU(page_tail), page); |
862 | VM_BUG_ON(NR_CPUS != 1 && | 894 | VM_BUG_ON(NR_CPUS != 1 && |
863 | !spin_is_locked(&lruvec_zone(lruvec)->lru_lock)); | 895 | !spin_is_locked(&lruvec_zone(lruvec)->lru_lock)); |
864 | 896 | ||
@@ -897,7 +929,7 @@ static void __pagevec_lru_add_fn(struct page *page, struct lruvec *lruvec, | |||
897 | int active = PageActive(page); | 929 | int active = PageActive(page); |
898 | enum lru_list lru = page_lru(page); | 930 | enum lru_list lru = page_lru(page); |
899 | 931 | ||
900 | VM_BUG_ON(PageLRU(page)); | 932 | VM_BUG_ON_PAGE(PageLRU(page), page); |
901 | 933 | ||
902 | SetPageLRU(page); | 934 | SetPageLRU(page); |
903 | add_page_to_lru_list(page, lruvec, lru); | 935 | add_page_to_lru_list(page, lruvec, lru); |
diff --git a/mm/swap_state.c b/mm/swap_state.c index e6f15f8ca2af..98e85e9c2b2d 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c | |||
@@ -83,9 +83,9 @@ int __add_to_swap_cache(struct page *page, swp_entry_t entry) | |||
83 | int error; | 83 | int error; |
84 | struct address_space *address_space; | 84 | struct address_space *address_space; |
85 | 85 | ||
86 | VM_BUG_ON(!PageLocked(page)); | 86 | VM_BUG_ON_PAGE(!PageLocked(page), page); |
87 | VM_BUG_ON(PageSwapCache(page)); | 87 | VM_BUG_ON_PAGE(PageSwapCache(page), page); |
88 | VM_BUG_ON(!PageSwapBacked(page)); | 88 | VM_BUG_ON_PAGE(!PageSwapBacked(page), page); |
89 | 89 | ||
90 | page_cache_get(page); | 90 | page_cache_get(page); |
91 | SetPageSwapCache(page); | 91 | SetPageSwapCache(page); |
@@ -139,9 +139,9 @@ void __delete_from_swap_cache(struct page *page) | |||
139 | swp_entry_t entry; | 139 | swp_entry_t entry; |
140 | struct address_space *address_space; | 140 | struct address_space *address_space; |
141 | 141 | ||
142 | VM_BUG_ON(!PageLocked(page)); | 142 | VM_BUG_ON_PAGE(!PageLocked(page), page); |
143 | VM_BUG_ON(!PageSwapCache(page)); | 143 | VM_BUG_ON_PAGE(!PageSwapCache(page), page); |
144 | VM_BUG_ON(PageWriteback(page)); | 144 | VM_BUG_ON_PAGE(PageWriteback(page), page); |
145 | 145 | ||
146 | entry.val = page_private(page); | 146 | entry.val = page_private(page); |
147 | address_space = swap_address_space(entry); | 147 | address_space = swap_address_space(entry); |
@@ -165,8 +165,8 @@ int add_to_swap(struct page *page, struct list_head *list) | |||
165 | swp_entry_t entry; | 165 | swp_entry_t entry; |
166 | int err; | 166 | int err; |
167 | 167 | ||
168 | VM_BUG_ON(!PageLocked(page)); | 168 | VM_BUG_ON_PAGE(!PageLocked(page), page); |
169 | VM_BUG_ON(!PageUptodate(page)); | 169 | VM_BUG_ON_PAGE(!PageUptodate(page), page); |
170 | 170 | ||
171 | entry = get_swap_page(); | 171 | entry = get_swap_page(); |
172 | if (!entry.val) | 172 | if (!entry.val) |
diff --git a/mm/swapfile.c b/mm/swapfile.c index 612a7c9795f6..c6c13b050a58 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c | |||
@@ -616,7 +616,7 @@ scan: | |||
616 | } | 616 | } |
617 | } | 617 | } |
618 | offset = si->lowest_bit; | 618 | offset = si->lowest_bit; |
619 | while (++offset < scan_base) { | 619 | while (offset < scan_base) { |
620 | if (!si->swap_map[offset]) { | 620 | if (!si->swap_map[offset]) { |
621 | spin_lock(&si->lock); | 621 | spin_lock(&si->lock); |
622 | goto checks; | 622 | goto checks; |
@@ -629,6 +629,7 @@ scan: | |||
629 | cond_resched(); | 629 | cond_resched(); |
630 | latency_ration = LATENCY_LIMIT; | 630 | latency_ration = LATENCY_LIMIT; |
631 | } | 631 | } |
632 | offset++; | ||
632 | } | 633 | } |
633 | spin_lock(&si->lock); | 634 | spin_lock(&si->lock); |
634 | 635 | ||
@@ -906,7 +907,7 @@ int reuse_swap_page(struct page *page) | |||
906 | { | 907 | { |
907 | int count; | 908 | int count; |
908 | 909 | ||
909 | VM_BUG_ON(!PageLocked(page)); | 910 | VM_BUG_ON_PAGE(!PageLocked(page), page); |
910 | if (unlikely(PageKsm(page))) | 911 | if (unlikely(PageKsm(page))) |
911 | return 0; | 912 | return 0; |
912 | count = page_mapcount(page); | 913 | count = page_mapcount(page); |
@@ -926,7 +927,7 @@ int reuse_swap_page(struct page *page) | |||
926 | */ | 927 | */ |
927 | int try_to_free_swap(struct page *page) | 928 | int try_to_free_swap(struct page *page) |
928 | { | 929 | { |
929 | VM_BUG_ON(!PageLocked(page)); | 930 | VM_BUG_ON_PAGE(!PageLocked(page), page); |
930 | 931 | ||
931 | if (!PageSwapCache(page)) | 932 | if (!PageSwapCache(page)) |
932 | return 0; | 933 | return 0; |
@@ -2714,7 +2715,7 @@ struct swap_info_struct *page_swap_info(struct page *page) | |||
2714 | */ | 2715 | */ |
2715 | struct address_space *__page_file_mapping(struct page *page) | 2716 | struct address_space *__page_file_mapping(struct page *page) |
2716 | { | 2717 | { |
2717 | VM_BUG_ON(!PageSwapCache(page)); | 2718 | VM_BUG_ON_PAGE(!PageSwapCache(page), page); |
2718 | return page_swap_info(page)->swap_file->f_mapping; | 2719 | return page_swap_info(page)->swap_file->f_mapping; |
2719 | } | 2720 | } |
2720 | EXPORT_SYMBOL_GPL(__page_file_mapping); | 2721 | EXPORT_SYMBOL_GPL(__page_file_mapping); |
@@ -2722,7 +2723,7 @@ EXPORT_SYMBOL_GPL(__page_file_mapping); | |||
2722 | pgoff_t __page_file_index(struct page *page) | 2723 | pgoff_t __page_file_index(struct page *page) |
2723 | { | 2724 | { |
2724 | swp_entry_t swap = { .val = page_private(page) }; | 2725 | swp_entry_t swap = { .val = page_private(page) }; |
2725 | VM_BUG_ON(!PageSwapCache(page)); | 2726 | VM_BUG_ON_PAGE(!PageSwapCache(page), page); |
2726 | return swp_offset(swap); | 2727 | return swp_offset(swap); |
2727 | } | 2728 | } |
2728 | EXPORT_SYMBOL_GPL(__page_file_index); | 2729 | EXPORT_SYMBOL_GPL(__page_file_index); |
@@ -404,13 +404,45 @@ struct address_space *page_mapping(struct page *page) | |||
404 | return mapping; | 404 | return mapping; |
405 | } | 405 | } |
406 | 406 | ||
407 | int overcommit_ratio_handler(struct ctl_table *table, int write, | ||
408 | void __user *buffer, size_t *lenp, | ||
409 | loff_t *ppos) | ||
410 | { | ||
411 | int ret; | ||
412 | |||
413 | ret = proc_dointvec(table, write, buffer, lenp, ppos); | ||
414 | if (ret == 0 && write) | ||
415 | sysctl_overcommit_kbytes = 0; | ||
416 | return ret; | ||
417 | } | ||
418 | |||
419 | int overcommit_kbytes_handler(struct ctl_table *table, int write, | ||
420 | void __user *buffer, size_t *lenp, | ||
421 | loff_t *ppos) | ||
422 | { | ||
423 | int ret; | ||
424 | |||
425 | ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos); | ||
426 | if (ret == 0 && write) | ||
427 | sysctl_overcommit_ratio = 0; | ||
428 | return ret; | ||
429 | } | ||
430 | |||
407 | /* | 431 | /* |
408 | * Committed memory limit enforced when OVERCOMMIT_NEVER policy is used | 432 | * Committed memory limit enforced when OVERCOMMIT_NEVER policy is used |
409 | */ | 433 | */ |
410 | unsigned long vm_commit_limit(void) | 434 | unsigned long vm_commit_limit(void) |
411 | { | 435 | { |
412 | return ((totalram_pages - hugetlb_total_pages()) | 436 | unsigned long allowed; |
413 | * sysctl_overcommit_ratio / 100) + total_swap_pages; | 437 | |
438 | if (sysctl_overcommit_kbytes) | ||
439 | allowed = sysctl_overcommit_kbytes >> (PAGE_SHIFT - 10); | ||
440 | else | ||
441 | allowed = ((totalram_pages - hugetlb_total_pages()) | ||
442 | * sysctl_overcommit_ratio / 100); | ||
443 | allowed += total_swap_pages; | ||
444 | |||
445 | return allowed; | ||
414 | } | 446 | } |
415 | 447 | ||
416 | 448 | ||
diff --git a/mm/vmpressure.c b/mm/vmpressure.c index e0f62837c3f4..196970a4541f 100644 --- a/mm/vmpressure.c +++ b/mm/vmpressure.c | |||
@@ -278,8 +278,7 @@ void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, int prio) | |||
278 | 278 | ||
279 | /** | 279 | /** |
280 | * vmpressure_register_event() - Bind vmpressure notifications to an eventfd | 280 | * vmpressure_register_event() - Bind vmpressure notifications to an eventfd |
281 | * @css: css that is interested in vmpressure notifications | 281 | * @memcg: memcg that is interested in vmpressure notifications |
282 | * @cft: cgroup control files handle | ||
283 | * @eventfd: eventfd context to link notifications with | 282 | * @eventfd: eventfd context to link notifications with |
284 | * @args: event arguments (used to set up a pressure level threshold) | 283 | * @args: event arguments (used to set up a pressure level threshold) |
285 | * | 284 | * |
@@ -289,15 +288,12 @@ void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, int prio) | |||
289 | * threshold (one of vmpressure_str_levels, i.e. "low", "medium", or | 288 | * threshold (one of vmpressure_str_levels, i.e. "low", "medium", or |
290 | * "critical"). | 289 | * "critical"). |
291 | * | 290 | * |
292 | * This function should not be used directly, just pass it to (struct | 291 | * To be used as memcg event method. |
293 | * cftype).register_event, and then cgroup core will handle everything by | ||
294 | * itself. | ||
295 | */ | 292 | */ |
296 | int vmpressure_register_event(struct cgroup_subsys_state *css, | 293 | int vmpressure_register_event(struct mem_cgroup *memcg, |
297 | struct cftype *cft, struct eventfd_ctx *eventfd, | 294 | struct eventfd_ctx *eventfd, const char *args) |
298 | const char *args) | ||
299 | { | 295 | { |
300 | struct vmpressure *vmpr = css_to_vmpressure(css); | 296 | struct vmpressure *vmpr = memcg_to_vmpressure(memcg); |
301 | struct vmpressure_event *ev; | 297 | struct vmpressure_event *ev; |
302 | int level; | 298 | int level; |
303 | 299 | ||
@@ -325,23 +321,19 @@ int vmpressure_register_event(struct cgroup_subsys_state *css, | |||
325 | 321 | ||
326 | /** | 322 | /** |
327 | * vmpressure_unregister_event() - Unbind eventfd from vmpressure | 323 | * vmpressure_unregister_event() - Unbind eventfd from vmpressure |
328 | * @css: css handle | 324 | * @memcg: memcg handle |
329 | * @cft: cgroup control files handle | ||
330 | * @eventfd: eventfd context that was used to link vmpressure with the @cg | 325 | * @eventfd: eventfd context that was used to link vmpressure with the @cg |
331 | * | 326 | * |
332 | * This function does internal manipulations to detach the @eventfd from | 327 | * This function does internal manipulations to detach the @eventfd from |
333 | * the vmpressure notifications, and then frees internal resources | 328 | * the vmpressure notifications, and then frees internal resources |
334 | * associated with the @eventfd (but the @eventfd itself is not freed). | 329 | * associated with the @eventfd (but the @eventfd itself is not freed). |
335 | * | 330 | * |
336 | * This function should not be used directly, just pass it to (struct | 331 | * To be used as memcg event method. |
337 | * cftype).unregister_event, and then cgroup core will handle everything | ||
338 | * by itself. | ||
339 | */ | 332 | */ |
340 | void vmpressure_unregister_event(struct cgroup_subsys_state *css, | 333 | void vmpressure_unregister_event(struct mem_cgroup *memcg, |
341 | struct cftype *cft, | ||
342 | struct eventfd_ctx *eventfd) | 334 | struct eventfd_ctx *eventfd) |
343 | { | 335 | { |
344 | struct vmpressure *vmpr = css_to_vmpressure(css); | 336 | struct vmpressure *vmpr = memcg_to_vmpressure(memcg); |
345 | struct vmpressure_event *ev; | 337 | struct vmpressure_event *ev; |
346 | 338 | ||
347 | mutex_lock(&vmpr->events_lock); | 339 | mutex_lock(&vmpr->events_lock); |
diff --git a/mm/vmscan.c b/mm/vmscan.c index eea668d9cff6..a9c74b409681 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c | |||
@@ -147,7 +147,7 @@ static bool global_reclaim(struct scan_control *sc) | |||
147 | } | 147 | } |
148 | #endif | 148 | #endif |
149 | 149 | ||
150 | unsigned long zone_reclaimable_pages(struct zone *zone) | 150 | static unsigned long zone_reclaimable_pages(struct zone *zone) |
151 | { | 151 | { |
152 | int nr; | 152 | int nr; |
153 | 153 | ||
@@ -281,17 +281,34 @@ shrink_slab_node(struct shrink_control *shrinkctl, struct shrinker *shrinker, | |||
281 | nr_pages_scanned, lru_pages, | 281 | nr_pages_scanned, lru_pages, |
282 | max_pass, delta, total_scan); | 282 | max_pass, delta, total_scan); |
283 | 283 | ||
284 | while (total_scan >= batch_size) { | 284 | /* |
285 | * Normally, we should not scan less than batch_size objects in one | ||
286 | * pass to avoid too frequent shrinker calls, but if the slab has less | ||
287 | * than batch_size objects in total and we are really tight on memory, | ||
288 | * we will try to reclaim all available objects, otherwise we can end | ||
289 | * up failing allocations although there are plenty of reclaimable | ||
290 | * objects spread over several slabs with usage less than the | ||
291 | * batch_size. | ||
292 | * | ||
293 | * We detect the "tight on memory" situations by looking at the total | ||
294 | * number of objects we want to scan (total_scan). If it is greater | ||
295 | * than the total number of objects on slab (max_pass), we must be | ||
296 | * scanning at high prio and therefore should try to reclaim as much as | ||
297 | * possible. | ||
298 | */ | ||
299 | while (total_scan >= batch_size || | ||
300 | total_scan >= max_pass) { | ||
285 | unsigned long ret; | 301 | unsigned long ret; |
302 | unsigned long nr_to_scan = min(batch_size, total_scan); | ||
286 | 303 | ||
287 | shrinkctl->nr_to_scan = batch_size; | 304 | shrinkctl->nr_to_scan = nr_to_scan; |
288 | ret = shrinker->scan_objects(shrinker, shrinkctl); | 305 | ret = shrinker->scan_objects(shrinker, shrinkctl); |
289 | if (ret == SHRINK_STOP) | 306 | if (ret == SHRINK_STOP) |
290 | break; | 307 | break; |
291 | freed += ret; | 308 | freed += ret; |
292 | 309 | ||
293 | count_vm_events(SLABS_SCANNED, batch_size); | 310 | count_vm_events(SLABS_SCANNED, nr_to_scan); |
294 | total_scan -= batch_size; | 311 | total_scan -= nr_to_scan; |
295 | 312 | ||
296 | cond_resched(); | 313 | cond_resched(); |
297 | } | 314 | } |
@@ -352,16 +369,17 @@ unsigned long shrink_slab(struct shrink_control *shrinkctl, | |||
352 | } | 369 | } |
353 | 370 | ||
354 | list_for_each_entry(shrinker, &shrinker_list, list) { | 371 | list_for_each_entry(shrinker, &shrinker_list, list) { |
355 | for_each_node_mask(shrinkctl->nid, shrinkctl->nodes_to_scan) { | 372 | if (!(shrinker->flags & SHRINKER_NUMA_AWARE)) { |
356 | if (!node_online(shrinkctl->nid)) | 373 | shrinkctl->nid = 0; |
357 | continue; | ||
358 | |||
359 | if (!(shrinker->flags & SHRINKER_NUMA_AWARE) && | ||
360 | (shrinkctl->nid != 0)) | ||
361 | break; | ||
362 | |||
363 | freed += shrink_slab_node(shrinkctl, shrinker, | 374 | freed += shrink_slab_node(shrinkctl, shrinker, |
364 | nr_pages_scanned, lru_pages); | 375 | nr_pages_scanned, lru_pages); |
376 | continue; | ||
377 | } | ||
378 | |||
379 | for_each_node_mask(shrinkctl->nid, shrinkctl->nodes_to_scan) { | ||
380 | if (node_online(shrinkctl->nid)) | ||
381 | freed += shrink_slab_node(shrinkctl, shrinker, | ||
382 | nr_pages_scanned, lru_pages); | ||
365 | 383 | ||
366 | } | 384 | } |
367 | } | 385 | } |
@@ -603,7 +621,7 @@ void putback_lru_page(struct page *page) | |||
603 | bool is_unevictable; | 621 | bool is_unevictable; |
604 | int was_unevictable = PageUnevictable(page); | 622 | int was_unevictable = PageUnevictable(page); |
605 | 623 | ||
606 | VM_BUG_ON(PageLRU(page)); | 624 | VM_BUG_ON_PAGE(PageLRU(page), page); |
607 | 625 | ||
608 | redo: | 626 | redo: |
609 | ClearPageUnevictable(page); | 627 | ClearPageUnevictable(page); |
@@ -794,8 +812,8 @@ static unsigned long shrink_page_list(struct list_head *page_list, | |||
794 | if (!trylock_page(page)) | 812 | if (!trylock_page(page)) |
795 | goto keep; | 813 | goto keep; |
796 | 814 | ||
797 | VM_BUG_ON(PageActive(page)); | 815 | VM_BUG_ON_PAGE(PageActive(page), page); |
798 | VM_BUG_ON(page_zone(page) != zone); | 816 | VM_BUG_ON_PAGE(page_zone(page) != zone, page); |
799 | 817 | ||
800 | sc->nr_scanned++; | 818 | sc->nr_scanned++; |
801 | 819 | ||
@@ -1079,14 +1097,14 @@ activate_locked: | |||
1079 | /* Not a candidate for swapping, so reclaim swap space. */ | 1097 | /* Not a candidate for swapping, so reclaim swap space. */ |
1080 | if (PageSwapCache(page) && vm_swap_full()) | 1098 | if (PageSwapCache(page) && vm_swap_full()) |
1081 | try_to_free_swap(page); | 1099 | try_to_free_swap(page); |
1082 | VM_BUG_ON(PageActive(page)); | 1100 | VM_BUG_ON_PAGE(PageActive(page), page); |
1083 | SetPageActive(page); | 1101 | SetPageActive(page); |
1084 | pgactivate++; | 1102 | pgactivate++; |
1085 | keep_locked: | 1103 | keep_locked: |
1086 | unlock_page(page); | 1104 | unlock_page(page); |
1087 | keep: | 1105 | keep: |
1088 | list_add(&page->lru, &ret_pages); | 1106 | list_add(&page->lru, &ret_pages); |
1089 | VM_BUG_ON(PageLRU(page) || PageUnevictable(page)); | 1107 | VM_BUG_ON_PAGE(PageLRU(page) || PageUnevictable(page), page); |
1090 | } | 1108 | } |
1091 | 1109 | ||
1092 | free_hot_cold_page_list(&free_pages, 1); | 1110 | free_hot_cold_page_list(&free_pages, 1); |
@@ -1240,7 +1258,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, | |||
1240 | page = lru_to_page(src); | 1258 | page = lru_to_page(src); |
1241 | prefetchw_prev_lru_page(page, src, flags); | 1259 | prefetchw_prev_lru_page(page, src, flags); |
1242 | 1260 | ||
1243 | VM_BUG_ON(!PageLRU(page)); | 1261 | VM_BUG_ON_PAGE(!PageLRU(page), page); |
1244 | 1262 | ||
1245 | switch (__isolate_lru_page(page, mode)) { | 1263 | switch (__isolate_lru_page(page, mode)) { |
1246 | case 0: | 1264 | case 0: |
@@ -1295,7 +1313,7 @@ int isolate_lru_page(struct page *page) | |||
1295 | { | 1313 | { |
1296 | int ret = -EBUSY; | 1314 | int ret = -EBUSY; |
1297 | 1315 | ||
1298 | VM_BUG_ON(!page_count(page)); | 1316 | VM_BUG_ON_PAGE(!page_count(page), page); |
1299 | 1317 | ||
1300 | if (PageLRU(page)) { | 1318 | if (PageLRU(page)) { |
1301 | struct zone *zone = page_zone(page); | 1319 | struct zone *zone = page_zone(page); |
@@ -1366,7 +1384,7 @@ putback_inactive_pages(struct lruvec *lruvec, struct list_head *page_list) | |||
1366 | struct page *page = lru_to_page(page_list); | 1384 | struct page *page = lru_to_page(page_list); |
1367 | int lru; | 1385 | int lru; |
1368 | 1386 | ||
1369 | VM_BUG_ON(PageLRU(page)); | 1387 | VM_BUG_ON_PAGE(PageLRU(page), page); |
1370 | list_del(&page->lru); | 1388 | list_del(&page->lru); |
1371 | if (unlikely(!page_evictable(page))) { | 1389 | if (unlikely(!page_evictable(page))) { |
1372 | spin_unlock_irq(&zone->lru_lock); | 1390 | spin_unlock_irq(&zone->lru_lock); |
@@ -1586,7 +1604,7 @@ static void move_active_pages_to_lru(struct lruvec *lruvec, | |||
1586 | page = lru_to_page(list); | 1604 | page = lru_to_page(list); |
1587 | lruvec = mem_cgroup_page_lruvec(page, zone); | 1605 | lruvec = mem_cgroup_page_lruvec(page, zone); |
1588 | 1606 | ||
1589 | VM_BUG_ON(PageLRU(page)); | 1607 | VM_BUG_ON_PAGE(PageLRU(page), page); |
1590 | SetPageLRU(page); | 1608 | SetPageLRU(page); |
1591 | 1609 | ||
1592 | nr_pages = hpage_nr_pages(page); | 1610 | nr_pages = hpage_nr_pages(page); |
@@ -3297,27 +3315,6 @@ void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx) | |||
3297 | wake_up_interruptible(&pgdat->kswapd_wait); | 3315 | wake_up_interruptible(&pgdat->kswapd_wait); |
3298 | } | 3316 | } |
3299 | 3317 | ||
3300 | /* | ||
3301 | * The reclaimable count would be mostly accurate. | ||
3302 | * The less reclaimable pages may be | ||
3303 | * - mlocked pages, which will be moved to unevictable list when encountered | ||
3304 | * - mapped pages, which may require several travels to be reclaimed | ||
3305 | * - dirty pages, which is not "instantly" reclaimable | ||
3306 | */ | ||
3307 | unsigned long global_reclaimable_pages(void) | ||
3308 | { | ||
3309 | int nr; | ||
3310 | |||
3311 | nr = global_page_state(NR_ACTIVE_FILE) + | ||
3312 | global_page_state(NR_INACTIVE_FILE); | ||
3313 | |||
3314 | if (get_nr_swap_pages() > 0) | ||
3315 | nr += global_page_state(NR_ACTIVE_ANON) + | ||
3316 | global_page_state(NR_INACTIVE_ANON); | ||
3317 | |||
3318 | return nr; | ||
3319 | } | ||
3320 | |||
3321 | #ifdef CONFIG_HIBERNATION | 3318 | #ifdef CONFIG_HIBERNATION |
3322 | /* | 3319 | /* |
3323 | * Try to free `nr_to_reclaim' of memory, system-wide, and return the number of | 3320 | * Try to free `nr_to_reclaim' of memory, system-wide, and return the number of |
@@ -3701,7 +3698,7 @@ void check_move_unevictable_pages(struct page **pages, int nr_pages) | |||
3701 | if (page_evictable(page)) { | 3698 | if (page_evictable(page)) { |
3702 | enum lru_list lru = page_lru_base_type(page); | 3699 | enum lru_list lru = page_lru_base_type(page); |
3703 | 3700 | ||
3704 | VM_BUG_ON(PageActive(page)); | 3701 | VM_BUG_ON_PAGE(PageActive(page), page); |
3705 | ClearPageUnevictable(page); | 3702 | ClearPageUnevictable(page); |
3706 | del_page_from_lru_list(page, lruvec, LRU_UNEVICTABLE); | 3703 | del_page_from_lru_list(page, lruvec, LRU_UNEVICTABLE); |
3707 | add_page_to_lru_list(page, lruvec, lru); | 3704 | add_page_to_lru_list(page, lruvec, lru); |
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c new file mode 100644 index 000000000000..c03ca5e9fe15 --- /dev/null +++ b/mm/zsmalloc.c | |||
@@ -0,0 +1,1106 @@ | |||
1 | /* | ||
2 | * zsmalloc memory allocator | ||
3 | * | ||
4 | * Copyright (C) 2011 Nitin Gupta | ||
5 | * Copyright (C) 2012, 2013 Minchan Kim | ||
6 | * | ||
7 | * This code is released using a dual license strategy: BSD/GPL | ||
8 | * You can choose the license that better fits your requirements. | ||
9 | * | ||
10 | * Released under the terms of 3-clause BSD License | ||
11 | * Released under the terms of GNU General Public License Version 2.0 | ||
12 | */ | ||
13 | |||
14 | /* | ||
15 | * This allocator is designed for use with zram. Thus, the allocator is | ||
16 | * supposed to work well under low memory conditions. In particular, it | ||
17 | * never attempts higher order page allocation which is very likely to | ||
18 | * fail under memory pressure. On the other hand, if we just use single | ||
19 | * (0-order) pages, it would suffer from very high fragmentation -- | ||
20 | * any object of size PAGE_SIZE/2 or larger would occupy an entire page. | ||
21 | * This was one of the major issues with its predecessor (xvmalloc). | ||
22 | * | ||
23 | * To overcome these issues, zsmalloc allocates a bunch of 0-order pages | ||
24 | * and links them together using various 'struct page' fields. These linked | ||
25 | * pages act as a single higher-order page i.e. an object can span 0-order | ||
26 | * page boundaries. The code refers to these linked pages as a single entity | ||
27 | * called zspage. | ||
28 | * | ||
29 | * For simplicity, zsmalloc can only allocate objects of size up to PAGE_SIZE | ||
30 | * since this satisfies the requirements of all its current users (in the | ||
31 | * worst case, page is incompressible and is thus stored "as-is" i.e. in | ||
32 | * uncompressed form). For allocation requests larger than this size, failure | ||
33 | * is returned (see zs_malloc). | ||
34 | * | ||
35 | * Additionally, zs_malloc() does not return a dereferenceable pointer. | ||
36 | * Instead, it returns an opaque handle (unsigned long) which encodes actual | ||
37 | * location of the allocated object. The reason for this indirection is that | ||
38 | * zsmalloc does not keep zspages permanently mapped since that would cause | ||
39 | * issues on 32-bit systems where the VA region for kernel space mappings | ||
40 | * is very small. So, before using the allocated memory, the object has to | ||
41 | * be mapped using zs_map_object() to get a usable pointer and subsequently | ||
42 | * unmapped using zs_unmap_object(). | ||
43 | * | ||
44 | * Following is how we use various fields and flags of underlying | ||
45 | * struct page(s) to form a zspage. | ||
46 | * | ||
47 | * Usage of struct page fields: | ||
48 | * page->first_page: points to the first component (0-order) page | ||
49 | * page->index (union with page->freelist): offset of the first object | ||
50 | * starting in this page. For the first page, this is | ||
51 | * always 0, so we use this field (aka freelist) to point | ||
52 | * to the first free object in zspage. | ||
53 | * page->lru: links together all component pages (except the first page) | ||
54 | * of a zspage | ||
55 | * | ||
56 | * For _first_ page only: | ||
57 | * | ||
58 | * page->private (union with page->first_page): refers to the | ||
59 | * component page after the first page | ||
60 | * page->freelist: points to the first free object in zspage. | ||
61 | * Free objects are linked together using in-place | ||
62 | * metadata. | ||
63 | * page->objects: maximum number of objects we can store in this | ||
64 | * zspage (class->zspage_order * PAGE_SIZE / class->size) | ||
65 | * page->lru: links together first pages of various zspages. | ||
66 | * Basically forming list of zspages in a fullness group. | ||
67 | * page->mapping: class index and fullness group of the zspage | ||
68 | * | ||
69 | * Usage of struct page flags: | ||
70 | * PG_private: identifies the first component page | ||
71 | * PG_private2: identifies the last component page | ||
72 | * | ||
73 | */ | ||
74 | |||
75 | #ifdef CONFIG_ZSMALLOC_DEBUG | ||
76 | #define DEBUG | ||
77 | #endif | ||
78 | |||
79 | #include <linux/module.h> | ||
80 | #include <linux/kernel.h> | ||
81 | #include <linux/bitops.h> | ||
82 | #include <linux/errno.h> | ||
83 | #include <linux/highmem.h> | ||
84 | #include <linux/string.h> | ||
85 | #include <linux/slab.h> | ||
86 | #include <asm/tlbflush.h> | ||
87 | #include <asm/pgtable.h> | ||
88 | #include <linux/cpumask.h> | ||
89 | #include <linux/cpu.h> | ||
90 | #include <linux/vmalloc.h> | ||
91 | #include <linux/hardirq.h> | ||
92 | #include <linux/spinlock.h> | ||
93 | #include <linux/types.h> | ||
94 | #include <linux/zsmalloc.h> | ||
95 | |||
96 | /* | ||
97 | * This must be power of 2 and greater than or equal to sizeof(link_free). | ||
98 | * These two conditions ensure that any 'struct link_free' itself doesn't | ||
99 | * span more than 1 page which avoids complex case of mapping 2 pages simply | ||
100 | * to restore link_free pointer values. | ||
101 | */ | ||
102 | #define ZS_ALIGN 8 | ||
103 | |||
104 | /* | ||
105 | * A single 'zspage' is composed of up to 2^N discontiguous 0-order (single) | ||
106 | * pages. ZS_MAX_ZSPAGE_ORDER defines upper limit on N. | ||
107 | */ | ||
108 | #define ZS_MAX_ZSPAGE_ORDER 2 | ||
109 | #define ZS_MAX_PAGES_PER_ZSPAGE (_AC(1, UL) << ZS_MAX_ZSPAGE_ORDER) | ||
110 | |||
111 | /* | ||
112 | * Object location (<PFN>, <obj_idx>) is encoded as | ||
113 | * a single (unsigned long) handle value. | ||
114 | * | ||
115 | * Note that object index <obj_idx> is relative to system | ||
116 | * page <PFN> it is stored in, so for each sub-page belonging | ||
117 | * to a zspage, obj_idx starts with 0. | ||
118 | * | ||
119 | * This is made more complicated by various memory models and PAE. | ||
120 | */ | ||
121 | |||
122 | #ifndef MAX_PHYSMEM_BITS | ||
123 | #ifdef CONFIG_HIGHMEM64G | ||
124 | #define MAX_PHYSMEM_BITS 36 | ||
125 | #else /* !CONFIG_HIGHMEM64G */ | ||
126 | /* | ||
127 | * If this definition of MAX_PHYSMEM_BITS is used, OBJ_INDEX_BITS will just | ||
128 | * be PAGE_SHIFT | ||
129 | */ | ||
130 | #define MAX_PHYSMEM_BITS BITS_PER_LONG | ||
131 | #endif | ||
132 | #endif | ||
133 | #define _PFN_BITS (MAX_PHYSMEM_BITS - PAGE_SHIFT) | ||
134 | #define OBJ_INDEX_BITS (BITS_PER_LONG - _PFN_BITS) | ||
135 | #define OBJ_INDEX_MASK ((_AC(1, UL) << OBJ_INDEX_BITS) - 1) | ||
136 | |||
137 | #define MAX(a, b) ((a) >= (b) ? (a) : (b)) | ||
138 | /* ZS_MIN_ALLOC_SIZE must be multiple of ZS_ALIGN */ | ||
139 | #define ZS_MIN_ALLOC_SIZE \ | ||
140 | MAX(32, (ZS_MAX_PAGES_PER_ZSPAGE << PAGE_SHIFT >> OBJ_INDEX_BITS)) | ||
141 | #define ZS_MAX_ALLOC_SIZE PAGE_SIZE | ||
142 | |||
143 | /* | ||
144 | * On systems with 4K page size, this gives 254 size classes! There is a | ||
145 | * trade-off here: | ||
146 | * - Large number of size classes is potentially wasteful as free pages are | ||
147 | * spread across these classes | ||
148 | * - Small number of size classes causes large internal fragmentation | ||
149 | * - Probably its better to use specific size classes (empirically | ||
150 | * determined). NOTE: all those class sizes must be set as multiple of | ||
151 | * ZS_ALIGN to make sure link_free itself never has to span 2 pages. | ||
152 | * | ||
153 | * ZS_MIN_ALLOC_SIZE and ZS_SIZE_CLASS_DELTA must be multiple of ZS_ALIGN | ||
154 | * (reason above) | ||
155 | */ | ||
156 | #define ZS_SIZE_CLASS_DELTA (PAGE_SIZE >> 8) | ||
157 | #define ZS_SIZE_CLASSES ((ZS_MAX_ALLOC_SIZE - ZS_MIN_ALLOC_SIZE) / \ | ||
158 | ZS_SIZE_CLASS_DELTA + 1) | ||
159 | |||
160 | /* | ||
161 | * We do not maintain any list for completely empty or full pages | ||
162 | */ | ||
163 | enum fullness_group { | ||
164 | ZS_ALMOST_FULL, | ||
165 | ZS_ALMOST_EMPTY, | ||
166 | _ZS_NR_FULLNESS_GROUPS, | ||
167 | |||
168 | ZS_EMPTY, | ||
169 | ZS_FULL | ||
170 | }; | ||
171 | |||
172 | /* | ||
173 | * We assign a page to ZS_ALMOST_EMPTY fullness group when: | ||
174 | * n <= N / f, where | ||
175 | * n = number of allocated objects | ||
176 | * N = total number of objects zspage can store | ||
177 | * f = 1/fullness_threshold_frac | ||
178 | * | ||
179 | * Similarly, we assign zspage to: | ||
180 | * ZS_ALMOST_FULL when n > N / f | ||
181 | * ZS_EMPTY when n == 0 | ||
182 | * ZS_FULL when n == N | ||
183 | * | ||
184 | * (see: fix_fullness_group()) | ||
185 | */ | ||
186 | static const int fullness_threshold_frac = 4; | ||
187 | |||
188 | struct size_class { | ||
189 | /* | ||
190 | * Size of objects stored in this class. Must be multiple | ||
191 | * of ZS_ALIGN. | ||
192 | */ | ||
193 | int size; | ||
194 | unsigned int index; | ||
195 | |||
196 | /* Number of PAGE_SIZE sized pages to combine to form a 'zspage' */ | ||
197 | int pages_per_zspage; | ||
198 | |||
199 | spinlock_t lock; | ||
200 | |||
201 | /* stats */ | ||
202 | u64 pages_allocated; | ||
203 | |||
204 | struct page *fullness_list[_ZS_NR_FULLNESS_GROUPS]; | ||
205 | }; | ||
206 | |||
207 | /* | ||
208 | * Placed within free objects to form a singly linked list. | ||
209 | * For every zspage, first_page->freelist gives head of this list. | ||
210 | * | ||
211 | * This must be power of 2 and less than or equal to ZS_ALIGN | ||
212 | */ | ||
213 | struct link_free { | ||
214 | /* Handle of next free chunk (encodes <PFN, obj_idx>) */ | ||
215 | void *next; | ||
216 | }; | ||
217 | |||
218 | struct zs_pool { | ||
219 | struct size_class size_class[ZS_SIZE_CLASSES]; | ||
220 | |||
221 | gfp_t flags; /* allocation flags used when growing pool */ | ||
222 | }; | ||
223 | |||
224 | /* | ||
225 | * A zspage's class index and fullness group | ||
226 | * are encoded in its (first)page->mapping | ||
227 | */ | ||
228 | #define CLASS_IDX_BITS 28 | ||
229 | #define FULLNESS_BITS 4 | ||
230 | #define CLASS_IDX_MASK ((1 << CLASS_IDX_BITS) - 1) | ||
231 | #define FULLNESS_MASK ((1 << FULLNESS_BITS) - 1) | ||
232 | |||
233 | struct mapping_area { | ||
234 | #ifdef CONFIG_PGTABLE_MAPPING | ||
235 | struct vm_struct *vm; /* vm area for mapping object that span pages */ | ||
236 | #else | ||
237 | char *vm_buf; /* copy buffer for objects that span pages */ | ||
238 | #endif | ||
239 | char *vm_addr; /* address of kmap_atomic()'ed pages */ | ||
240 | enum zs_mapmode vm_mm; /* mapping mode */ | ||
241 | }; | ||
242 | |||
243 | |||
244 | /* per-cpu VM mapping areas for zspage accesses that cross page boundaries */ | ||
245 | static DEFINE_PER_CPU(struct mapping_area, zs_map_area); | ||
246 | |||
247 | static int is_first_page(struct page *page) | ||
248 | { | ||
249 | return PagePrivate(page); | ||
250 | } | ||
251 | |||
252 | static int is_last_page(struct page *page) | ||
253 | { | ||
254 | return PagePrivate2(page); | ||
255 | } | ||
256 | |||
257 | static void get_zspage_mapping(struct page *page, unsigned int *class_idx, | ||
258 | enum fullness_group *fullness) | ||
259 | { | ||
260 | unsigned long m; | ||
261 | BUG_ON(!is_first_page(page)); | ||
262 | |||
263 | m = (unsigned long)page->mapping; | ||
264 | *fullness = m & FULLNESS_MASK; | ||
265 | *class_idx = (m >> FULLNESS_BITS) & CLASS_IDX_MASK; | ||
266 | } | ||
267 | |||
268 | static void set_zspage_mapping(struct page *page, unsigned int class_idx, | ||
269 | enum fullness_group fullness) | ||
270 | { | ||
271 | unsigned long m; | ||
272 | BUG_ON(!is_first_page(page)); | ||
273 | |||
274 | m = ((class_idx & CLASS_IDX_MASK) << FULLNESS_BITS) | | ||
275 | (fullness & FULLNESS_MASK); | ||
276 | page->mapping = (struct address_space *)m; | ||
277 | } | ||
278 | |||
279 | /* | ||
280 | * zsmalloc divides the pool into various size classes where each | ||
281 | * class maintains a list of zspages where each zspage is divided | ||
282 | * into equal sized chunks. Each allocation falls into one of these | ||
283 | * classes depending on its size. This function returns index of the | ||
284 | * size class which has chunk size big enough to hold the given size. | ||
285 | */ | ||
286 | static int get_size_class_index(int size) | ||
287 | { | ||
288 | int idx = 0; | ||
289 | |||
290 | if (likely(size > ZS_MIN_ALLOC_SIZE)) | ||
291 | idx = DIV_ROUND_UP(size - ZS_MIN_ALLOC_SIZE, | ||
292 | ZS_SIZE_CLASS_DELTA); | ||
293 | |||
294 | return idx; | ||
295 | } | ||
296 | |||
297 | /* | ||
298 | * For each size class, zspages are divided into different groups | ||
299 | * depending on how "full" they are. This was done so that we could | ||
300 | * easily find empty or nearly empty zspages when we try to shrink | ||
301 | * the pool (not yet implemented). This function returns fullness | ||
302 | * status of the given page. | ||
303 | */ | ||
304 | static enum fullness_group get_fullness_group(struct page *page) | ||
305 | { | ||
306 | int inuse, max_objects; | ||
307 | enum fullness_group fg; | ||
308 | BUG_ON(!is_first_page(page)); | ||
309 | |||
310 | inuse = page->inuse; | ||
311 | max_objects = page->objects; | ||
312 | |||
313 | if (inuse == 0) | ||
314 | fg = ZS_EMPTY; | ||
315 | else if (inuse == max_objects) | ||
316 | fg = ZS_FULL; | ||
317 | else if (inuse <= max_objects / fullness_threshold_frac) | ||
318 | fg = ZS_ALMOST_EMPTY; | ||
319 | else | ||
320 | fg = ZS_ALMOST_FULL; | ||
321 | |||
322 | return fg; | ||
323 | } | ||
324 | |||
325 | /* | ||
326 | * Each size class maintains various freelists and zspages are assigned | ||
327 | * to one of these freelists based on the number of live objects they | ||
328 | * have. This functions inserts the given zspage into the freelist | ||
329 | * identified by <class, fullness_group>. | ||
330 | */ | ||
331 | static void insert_zspage(struct page *page, struct size_class *class, | ||
332 | enum fullness_group fullness) | ||
333 | { | ||
334 | struct page **head; | ||
335 | |||
336 | BUG_ON(!is_first_page(page)); | ||
337 | |||
338 | if (fullness >= _ZS_NR_FULLNESS_GROUPS) | ||
339 | return; | ||
340 | |||
341 | head = &class->fullness_list[fullness]; | ||
342 | if (*head) | ||
343 | list_add_tail(&page->lru, &(*head)->lru); | ||
344 | |||
345 | *head = page; | ||
346 | } | ||
347 | |||
348 | /* | ||
349 | * This function removes the given zspage from the freelist identified | ||
350 | * by <class, fullness_group>. | ||
351 | */ | ||
352 | static void remove_zspage(struct page *page, struct size_class *class, | ||
353 | enum fullness_group fullness) | ||
354 | { | ||
355 | struct page **head; | ||
356 | |||
357 | BUG_ON(!is_first_page(page)); | ||
358 | |||
359 | if (fullness >= _ZS_NR_FULLNESS_GROUPS) | ||
360 | return; | ||
361 | |||
362 | head = &class->fullness_list[fullness]; | ||
363 | BUG_ON(!*head); | ||
364 | if (list_empty(&(*head)->lru)) | ||
365 | *head = NULL; | ||
366 | else if (*head == page) | ||
367 | *head = (struct page *)list_entry((*head)->lru.next, | ||
368 | struct page, lru); | ||
369 | |||
370 | list_del_init(&page->lru); | ||
371 | } | ||
372 | |||
373 | /* | ||
374 | * Each size class maintains zspages in different fullness groups depending | ||
375 | * on the number of live objects they contain. When allocating or freeing | ||
376 | * objects, the fullness status of the page can change, say, from ALMOST_FULL | ||
377 | * to ALMOST_EMPTY when freeing an object. This function checks if such | ||
378 | * a status change has occurred for the given page and accordingly moves the | ||
379 | * page from the freelist of the old fullness group to that of the new | ||
380 | * fullness group. | ||
381 | */ | ||
382 | static enum fullness_group fix_fullness_group(struct zs_pool *pool, | ||
383 | struct page *page) | ||
384 | { | ||
385 | int class_idx; | ||
386 | struct size_class *class; | ||
387 | enum fullness_group currfg, newfg; | ||
388 | |||
389 | BUG_ON(!is_first_page(page)); | ||
390 | |||
391 | get_zspage_mapping(page, &class_idx, &currfg); | ||
392 | newfg = get_fullness_group(page); | ||
393 | if (newfg == currfg) | ||
394 | goto out; | ||
395 | |||
396 | class = &pool->size_class[class_idx]; | ||
397 | remove_zspage(page, class, currfg); | ||
398 | insert_zspage(page, class, newfg); | ||
399 | set_zspage_mapping(page, class_idx, newfg); | ||
400 | |||
401 | out: | ||
402 | return newfg; | ||
403 | } | ||
404 | |||
405 | /* | ||
406 | * We have to decide on how many pages to link together | ||
407 | * to form a zspage for each size class. This is important | ||
408 | * to reduce wastage due to unusable space left at end of | ||
409 | * each zspage which is given as: | ||
410 | * wastage = Zp % class_size | ||
411 | * where Zp = zspage size = k * PAGE_SIZE where k = 1, 2, ... | ||
412 | * | ||
413 | * For example, for size class of 3/8 * PAGE_SIZE, we should | ||
414 | * link together 3 PAGE_SIZE sized pages to form a zspage | ||
415 | * since then we can perfectly fit in 8 such objects. | ||
416 | */ | ||
417 | static int get_pages_per_zspage(int class_size) | ||
418 | { | ||
419 | int i, max_usedpc = 0; | ||
420 | /* zspage order which gives maximum used size per KB */ | ||
421 | int max_usedpc_order = 1; | ||
422 | |||
423 | for (i = 1; i <= ZS_MAX_PAGES_PER_ZSPAGE; i++) { | ||
424 | int zspage_size; | ||
425 | int waste, usedpc; | ||
426 | |||
427 | zspage_size = i * PAGE_SIZE; | ||
428 | waste = zspage_size % class_size; | ||
429 | usedpc = (zspage_size - waste) * 100 / zspage_size; | ||
430 | |||
431 | if (usedpc > max_usedpc) { | ||
432 | max_usedpc = usedpc; | ||
433 | max_usedpc_order = i; | ||
434 | } | ||
435 | } | ||
436 | |||
437 | return max_usedpc_order; | ||
438 | } | ||
439 | |||
440 | /* | ||
441 | * A single 'zspage' is composed of many system pages which are | ||
442 | * linked together using fields in struct page. This function finds | ||
443 | * the first/head page, given any component page of a zspage. | ||
444 | */ | ||
445 | static struct page *get_first_page(struct page *page) | ||
446 | { | ||
447 | if (is_first_page(page)) | ||
448 | return page; | ||
449 | else | ||
450 | return page->first_page; | ||
451 | } | ||
452 | |||
453 | static struct page *get_next_page(struct page *page) | ||
454 | { | ||
455 | struct page *next; | ||
456 | |||
457 | if (is_last_page(page)) | ||
458 | next = NULL; | ||
459 | else if (is_first_page(page)) | ||
460 | next = (struct page *)page_private(page); | ||
461 | else | ||
462 | next = list_entry(page->lru.next, struct page, lru); | ||
463 | |||
464 | return next; | ||
465 | } | ||
466 | |||
467 | /* | ||
468 | * Encode <page, obj_idx> as a single handle value. | ||
469 | * On hardware platforms with physical memory starting at 0x0 the pfn | ||
470 | * could be 0 so we ensure that the handle will never be 0 by adjusting the | ||
471 | * encoded obj_idx value before encoding. | ||
472 | */ | ||
473 | static void *obj_location_to_handle(struct page *page, unsigned long obj_idx) | ||
474 | { | ||
475 | unsigned long handle; | ||
476 | |||
477 | if (!page) { | ||
478 | BUG_ON(obj_idx); | ||
479 | return NULL; | ||
480 | } | ||
481 | |||
482 | handle = page_to_pfn(page) << OBJ_INDEX_BITS; | ||
483 | handle |= ((obj_idx + 1) & OBJ_INDEX_MASK); | ||
484 | |||
485 | return (void *)handle; | ||
486 | } | ||
487 | |||
488 | /* | ||
489 | * Decode <page, obj_idx> pair from the given object handle. We adjust the | ||
490 | * decoded obj_idx back to its original value since it was adjusted in | ||
491 | * obj_location_to_handle(). | ||
492 | */ | ||
493 | static void obj_handle_to_location(unsigned long handle, struct page **page, | ||
494 | unsigned long *obj_idx) | ||
495 | { | ||
496 | *page = pfn_to_page(handle >> OBJ_INDEX_BITS); | ||
497 | *obj_idx = (handle & OBJ_INDEX_MASK) - 1; | ||
498 | } | ||
499 | |||
500 | static unsigned long obj_idx_to_offset(struct page *page, | ||
501 | unsigned long obj_idx, int class_size) | ||
502 | { | ||
503 | unsigned long off = 0; | ||
504 | |||
505 | if (!is_first_page(page)) | ||
506 | off = page->index; | ||
507 | |||
508 | return off + obj_idx * class_size; | ||
509 | } | ||
510 | |||
511 | static void reset_page(struct page *page) | ||
512 | { | ||
513 | clear_bit(PG_private, &page->flags); | ||
514 | clear_bit(PG_private_2, &page->flags); | ||
515 | set_page_private(page, 0); | ||
516 | page->mapping = NULL; | ||
517 | page->freelist = NULL; | ||
518 | page_mapcount_reset(page); | ||
519 | } | ||
520 | |||
521 | static void free_zspage(struct page *first_page) | ||
522 | { | ||
523 | struct page *nextp, *tmp, *head_extra; | ||
524 | |||
525 | BUG_ON(!is_first_page(first_page)); | ||
526 | BUG_ON(first_page->inuse); | ||
527 | |||
528 | head_extra = (struct page *)page_private(first_page); | ||
529 | |||
530 | reset_page(first_page); | ||
531 | __free_page(first_page); | ||
532 | |||
533 | /* zspage with only 1 system page */ | ||
534 | if (!head_extra) | ||
535 | return; | ||
536 | |||
537 | list_for_each_entry_safe(nextp, tmp, &head_extra->lru, lru) { | ||
538 | list_del(&nextp->lru); | ||
539 | reset_page(nextp); | ||
540 | __free_page(nextp); | ||
541 | } | ||
542 | reset_page(head_extra); | ||
543 | __free_page(head_extra); | ||
544 | } | ||
545 | |||
546 | /* Initialize a newly allocated zspage */ | ||
547 | static void init_zspage(struct page *first_page, struct size_class *class) | ||
548 | { | ||
549 | unsigned long off = 0; | ||
550 | struct page *page = first_page; | ||
551 | |||
552 | BUG_ON(!is_first_page(first_page)); | ||
553 | while (page) { | ||
554 | struct page *next_page; | ||
555 | struct link_free *link; | ||
556 | unsigned int i, objs_on_page; | ||
557 | |||
558 | /* | ||
559 | * page->index stores offset of first object starting | ||
560 | * in the page. For the first page, this is always 0, | ||
561 | * so we use first_page->index (aka ->freelist) to store | ||
562 | * head of corresponding zspage's freelist. | ||
563 | */ | ||
564 | if (page != first_page) | ||
565 | page->index = off; | ||
566 | |||
567 | link = (struct link_free *)kmap_atomic(page) + | ||
568 | off / sizeof(*link); | ||
569 | objs_on_page = (PAGE_SIZE - off) / class->size; | ||
570 | |||
571 | for (i = 1; i <= objs_on_page; i++) { | ||
572 | off += class->size; | ||
573 | if (off < PAGE_SIZE) { | ||
574 | link->next = obj_location_to_handle(page, i); | ||
575 | link += class->size / sizeof(*link); | ||
576 | } | ||
577 | } | ||
578 | |||
579 | /* | ||
580 | * We now come to the last (full or partial) object on this | ||
581 | * page, which must point to the first object on the next | ||
582 | * page (if present) | ||
583 | */ | ||
584 | next_page = get_next_page(page); | ||
585 | link->next = obj_location_to_handle(next_page, 0); | ||
586 | kunmap_atomic(link); | ||
587 | page = next_page; | ||
588 | off = (off + class->size) % PAGE_SIZE; | ||
589 | } | ||
590 | } | ||
591 | |||
592 | /* | ||
593 | * Allocate a zspage for the given size class | ||
594 | */ | ||
595 | static struct page *alloc_zspage(struct size_class *class, gfp_t flags) | ||
596 | { | ||
597 | int i, error; | ||
598 | struct page *first_page = NULL, *uninitialized_var(prev_page); | ||
599 | |||
600 | /* | ||
601 | * Allocate individual pages and link them together as: | ||
602 | * 1. first page->private = first sub-page | ||
603 | * 2. all sub-pages are linked together using page->lru | ||
604 | * 3. each sub-page is linked to the first page using page->first_page | ||
605 | * | ||
606 | * For each size class, First/Head pages are linked together using | ||
607 | * page->lru. Also, we set PG_private to identify the first page | ||
608 | * (i.e. no other sub-page has this flag set) and PG_private_2 to | ||
609 | * identify the last page. | ||
610 | */ | ||
611 | error = -ENOMEM; | ||
612 | for (i = 0; i < class->pages_per_zspage; i++) { | ||
613 | struct page *page; | ||
614 | |||
615 | page = alloc_page(flags); | ||
616 | if (!page) | ||
617 | goto cleanup; | ||
618 | |||
619 | INIT_LIST_HEAD(&page->lru); | ||
620 | if (i == 0) { /* first page */ | ||
621 | SetPagePrivate(page); | ||
622 | set_page_private(page, 0); | ||
623 | first_page = page; | ||
624 | first_page->inuse = 0; | ||
625 | } | ||
626 | if (i == 1) | ||
627 | set_page_private(first_page, (unsigned long)page); | ||
628 | if (i >= 1) | ||
629 | page->first_page = first_page; | ||
630 | if (i >= 2) | ||
631 | list_add(&page->lru, &prev_page->lru); | ||
632 | if (i == class->pages_per_zspage - 1) /* last page */ | ||
633 | SetPagePrivate2(page); | ||
634 | prev_page = page; | ||
635 | } | ||
636 | |||
637 | init_zspage(first_page, class); | ||
638 | |||
639 | first_page->freelist = obj_location_to_handle(first_page, 0); | ||
640 | /* Maximum number of objects we can store in this zspage */ | ||
641 | first_page->objects = class->pages_per_zspage * PAGE_SIZE / class->size; | ||
642 | |||
643 | error = 0; /* Success */ | ||
644 | |||
645 | cleanup: | ||
646 | if (unlikely(error) && first_page) { | ||
647 | free_zspage(first_page); | ||
648 | first_page = NULL; | ||
649 | } | ||
650 | |||
651 | return first_page; | ||
652 | } | ||
653 | |||
654 | static struct page *find_get_zspage(struct size_class *class) | ||
655 | { | ||
656 | int i; | ||
657 | struct page *page; | ||
658 | |||
659 | for (i = 0; i < _ZS_NR_FULLNESS_GROUPS; i++) { | ||
660 | page = class->fullness_list[i]; | ||
661 | if (page) | ||
662 | break; | ||
663 | } | ||
664 | |||
665 | return page; | ||
666 | } | ||
667 | |||
668 | #ifdef CONFIG_PGTABLE_MAPPING | ||
669 | static inline int __zs_cpu_up(struct mapping_area *area) | ||
670 | { | ||
671 | /* | ||
672 | * Make sure we don't leak memory if a cpu UP notification | ||
673 | * and zs_init() race and both call zs_cpu_up() on the same cpu | ||
674 | */ | ||
675 | if (area->vm) | ||
676 | return 0; | ||
677 | area->vm = alloc_vm_area(PAGE_SIZE * 2, NULL); | ||
678 | if (!area->vm) | ||
679 | return -ENOMEM; | ||
680 | return 0; | ||
681 | } | ||
682 | |||
683 | static inline void __zs_cpu_down(struct mapping_area *area) | ||
684 | { | ||
685 | if (area->vm) | ||
686 | free_vm_area(area->vm); | ||
687 | area->vm = NULL; | ||
688 | } | ||
689 | |||
690 | static inline void *__zs_map_object(struct mapping_area *area, | ||
691 | struct page *pages[2], int off, int size) | ||
692 | { | ||
693 | BUG_ON(map_vm_area(area->vm, PAGE_KERNEL, &pages)); | ||
694 | area->vm_addr = area->vm->addr; | ||
695 | return area->vm_addr + off; | ||
696 | } | ||
697 | |||
698 | static inline void __zs_unmap_object(struct mapping_area *area, | ||
699 | struct page *pages[2], int off, int size) | ||
700 | { | ||
701 | unsigned long addr = (unsigned long)area->vm_addr; | ||
702 | |||
703 | unmap_kernel_range(addr, PAGE_SIZE * 2); | ||
704 | } | ||
705 | |||
706 | #else /* CONFIG_PGTABLE_MAPPING */ | ||
707 | |||
708 | static inline int __zs_cpu_up(struct mapping_area *area) | ||
709 | { | ||
710 | /* | ||
711 | * Make sure we don't leak memory if a cpu UP notification | ||
712 | * and zs_init() race and both call zs_cpu_up() on the same cpu | ||
713 | */ | ||
714 | if (area->vm_buf) | ||
715 | return 0; | ||
716 | area->vm_buf = (char *)__get_free_page(GFP_KERNEL); | ||
717 | if (!area->vm_buf) | ||
718 | return -ENOMEM; | ||
719 | return 0; | ||
720 | } | ||
721 | |||
722 | static inline void __zs_cpu_down(struct mapping_area *area) | ||
723 | { | ||
724 | if (area->vm_buf) | ||
725 | free_page((unsigned long)area->vm_buf); | ||
726 | area->vm_buf = NULL; | ||
727 | } | ||
728 | |||
729 | static void *__zs_map_object(struct mapping_area *area, | ||
730 | struct page *pages[2], int off, int size) | ||
731 | { | ||
732 | int sizes[2]; | ||
733 | void *addr; | ||
734 | char *buf = area->vm_buf; | ||
735 | |||
736 | /* disable page faults to match kmap_atomic() return conditions */ | ||
737 | pagefault_disable(); | ||
738 | |||
739 | /* no read fastpath */ | ||
740 | if (area->vm_mm == ZS_MM_WO) | ||
741 | goto out; | ||
742 | |||
743 | sizes[0] = PAGE_SIZE - off; | ||
744 | sizes[1] = size - sizes[0]; | ||
745 | |||
746 | /* copy object to per-cpu buffer */ | ||
747 | addr = kmap_atomic(pages[0]); | ||
748 | memcpy(buf, addr + off, sizes[0]); | ||
749 | kunmap_atomic(addr); | ||
750 | addr = kmap_atomic(pages[1]); | ||
751 | memcpy(buf + sizes[0], addr, sizes[1]); | ||
752 | kunmap_atomic(addr); | ||
753 | out: | ||
754 | return area->vm_buf; | ||
755 | } | ||
756 | |||
757 | static void __zs_unmap_object(struct mapping_area *area, | ||
758 | struct page *pages[2], int off, int size) | ||
759 | { | ||
760 | int sizes[2]; | ||
761 | void *addr; | ||
762 | char *buf = area->vm_buf; | ||
763 | |||
764 | /* no write fastpath */ | ||
765 | if (area->vm_mm == ZS_MM_RO) | ||
766 | goto out; | ||
767 | |||
768 | sizes[0] = PAGE_SIZE - off; | ||
769 | sizes[1] = size - sizes[0]; | ||
770 | |||
771 | /* copy per-cpu buffer to object */ | ||
772 | addr = kmap_atomic(pages[0]); | ||
773 | memcpy(addr + off, buf, sizes[0]); | ||
774 | kunmap_atomic(addr); | ||
775 | addr = kmap_atomic(pages[1]); | ||
776 | memcpy(addr, buf + sizes[0], sizes[1]); | ||
777 | kunmap_atomic(addr); | ||
778 | |||
779 | out: | ||
780 | /* enable page faults to match kunmap_atomic() return conditions */ | ||
781 | pagefault_enable(); | ||
782 | } | ||
783 | |||
784 | #endif /* CONFIG_PGTABLE_MAPPING */ | ||
785 | |||
786 | static int zs_cpu_notifier(struct notifier_block *nb, unsigned long action, | ||
787 | void *pcpu) | ||
788 | { | ||
789 | int ret, cpu = (long)pcpu; | ||
790 | struct mapping_area *area; | ||
791 | |||
792 | switch (action) { | ||
793 | case CPU_UP_PREPARE: | ||
794 | area = &per_cpu(zs_map_area, cpu); | ||
795 | ret = __zs_cpu_up(area); | ||
796 | if (ret) | ||
797 | return notifier_from_errno(ret); | ||
798 | break; | ||
799 | case CPU_DEAD: | ||
800 | case CPU_UP_CANCELED: | ||
801 | area = &per_cpu(zs_map_area, cpu); | ||
802 | __zs_cpu_down(area); | ||
803 | break; | ||
804 | } | ||
805 | |||
806 | return NOTIFY_OK; | ||
807 | } | ||
808 | |||
809 | static struct notifier_block zs_cpu_nb = { | ||
810 | .notifier_call = zs_cpu_notifier | ||
811 | }; | ||
812 | |||
813 | static void zs_exit(void) | ||
814 | { | ||
815 | int cpu; | ||
816 | |||
817 | for_each_online_cpu(cpu) | ||
818 | zs_cpu_notifier(NULL, CPU_DEAD, (void *)(long)cpu); | ||
819 | unregister_cpu_notifier(&zs_cpu_nb); | ||
820 | } | ||
821 | |||
822 | static int zs_init(void) | ||
823 | { | ||
824 | int cpu, ret; | ||
825 | |||
826 | register_cpu_notifier(&zs_cpu_nb); | ||
827 | for_each_online_cpu(cpu) { | ||
828 | ret = zs_cpu_notifier(NULL, CPU_UP_PREPARE, (void *)(long)cpu); | ||
829 | if (notifier_to_errno(ret)) | ||
830 | goto fail; | ||
831 | } | ||
832 | return 0; | ||
833 | fail: | ||
834 | zs_exit(); | ||
835 | return notifier_to_errno(ret); | ||
836 | } | ||
837 | |||
838 | /** | ||
839 | * zs_create_pool - Creates an allocation pool to work from. | ||
840 | * @flags: allocation flags used to allocate pool metadata | ||
841 | * | ||
842 | * This function must be called before anything when using | ||
843 | * the zsmalloc allocator. | ||
844 | * | ||
845 | * On success, a pointer to the newly created pool is returned, | ||
846 | * otherwise NULL. | ||
847 | */ | ||
848 | struct zs_pool *zs_create_pool(gfp_t flags) | ||
849 | { | ||
850 | int i, ovhd_size; | ||
851 | struct zs_pool *pool; | ||
852 | |||
853 | ovhd_size = roundup(sizeof(*pool), PAGE_SIZE); | ||
854 | pool = kzalloc(ovhd_size, GFP_KERNEL); | ||
855 | if (!pool) | ||
856 | return NULL; | ||
857 | |||
858 | for (i = 0; i < ZS_SIZE_CLASSES; i++) { | ||
859 | int size; | ||
860 | struct size_class *class; | ||
861 | |||
862 | size = ZS_MIN_ALLOC_SIZE + i * ZS_SIZE_CLASS_DELTA; | ||
863 | if (size > ZS_MAX_ALLOC_SIZE) | ||
864 | size = ZS_MAX_ALLOC_SIZE; | ||
865 | |||
866 | class = &pool->size_class[i]; | ||
867 | class->size = size; | ||
868 | class->index = i; | ||
869 | spin_lock_init(&class->lock); | ||
870 | class->pages_per_zspage = get_pages_per_zspage(size); | ||
871 | |||
872 | } | ||
873 | |||
874 | pool->flags = flags; | ||
875 | |||
876 | return pool; | ||
877 | } | ||
878 | EXPORT_SYMBOL_GPL(zs_create_pool); | ||
879 | |||
880 | void zs_destroy_pool(struct zs_pool *pool) | ||
881 | { | ||
882 | int i; | ||
883 | |||
884 | for (i = 0; i < ZS_SIZE_CLASSES; i++) { | ||
885 | int fg; | ||
886 | struct size_class *class = &pool->size_class[i]; | ||
887 | |||
888 | for (fg = 0; fg < _ZS_NR_FULLNESS_GROUPS; fg++) { | ||
889 | if (class->fullness_list[fg]) { | ||
890 | pr_info("Freeing non-empty class with size %db, fullness group %d\n", | ||
891 | class->size, fg); | ||
892 | } | ||
893 | } | ||
894 | } | ||
895 | kfree(pool); | ||
896 | } | ||
897 | EXPORT_SYMBOL_GPL(zs_destroy_pool); | ||
898 | |||
/**
 * zs_malloc - Allocate block of given size from pool.
 * @pool: pool to allocate from
 * @size: size of block to allocate
 *
 * On success, handle to the allocated object is returned,
 * otherwise 0.
 * Allocation requests with size > ZS_MAX_ALLOC_SIZE will fail.
 */
unsigned long zs_malloc(struct zs_pool *pool, size_t size)
{
	unsigned long obj;
	struct link_free *link;
	int class_idx;
	struct size_class *class;

	struct page *first_page, *m_page;
	unsigned long m_objidx, m_offset;

	/* Reject zero-sized and oversized requests up front. */
	if (unlikely(!size || size > ZS_MAX_ALLOC_SIZE))
		return 0;

	class_idx = get_size_class_index(size);
	class = &pool->size_class[class_idx];
	BUG_ON(class_idx != class->index);

	spin_lock(&class->lock);
	first_page = find_get_zspage(class);

	if (!first_page) {
		/*
		 * No zspage with a free object: allocate a fresh zspage
		 * outside the class lock (NOTE(review): presumably because
		 * alloc_zspage() may sleep — confirm), then retake the lock
		 * before publishing it and bumping the page accounting.
		 */
		spin_unlock(&class->lock);
		first_page = alloc_zspage(class, pool->flags);
		if (unlikely(!first_page))
			return 0;

		set_zspage_mapping(first_page, class->index, ZS_EMPTY);
		spin_lock(&class->lock);
		class->pages_allocated += class->pages_per_zspage;
	}

	/* Pop the freelist head; the handle decodes to <page, obj index>. */
	obj = (unsigned long)first_page->freelist;
	obj_handle_to_location(obj, &m_page, &m_objidx);
	m_offset = obj_idx_to_offset(m_page, m_objidx, class->size);

	link = (struct link_free *)kmap_atomic(m_page) +
				m_offset / sizeof(*link);
	first_page->freelist = link->next;
	/* Poison the stale freelist link now that the object is live. */
	memset(link, POISON_INUSE, sizeof(*link));
	kunmap_atomic(link);

	first_page->inuse++;
	/* Now move the zspage to another fullness group, if required */
	fix_fullness_group(pool, first_page);
	spin_unlock(&class->lock);

	return obj;
}
EXPORT_SYMBOL_GPL(zs_malloc);
957 | |||
/*
 * zs_free - free an object previously allocated with zs_malloc().
 * @pool: pool the object was allocated from
 * @obj: handle returned by zs_malloc(); 0 is silently ignored
 *
 * Pushes the object back onto its zspage's freelist and, if that made
 * the zspage completely empty, releases the zspage's pages.
 */
void zs_free(struct zs_pool *pool, unsigned long obj)
{
	struct link_free *link;
	struct page *first_page, *f_page;
	unsigned long f_objidx, f_offset;

	int class_idx;
	struct size_class *class;
	enum fullness_group fullness;

	/* Freeing the 0 handle is a no-op (mirrors zs_malloc's 0 failure). */
	if (unlikely(!obj))
		return;

	/* Decode the handle into its page and object index. */
	obj_handle_to_location(obj, &f_page, &f_objidx);
	first_page = get_first_page(f_page);

	get_zspage_mapping(first_page, &class_idx, &fullness);
	class = &pool->size_class[class_idx];
	f_offset = obj_idx_to_offset(f_page, f_objidx, class->size);

	spin_lock(&class->lock);

	/* Insert this object in containing zspage's freelist */
	link = (struct link_free *)((unsigned char *)kmap_atomic(f_page)
							+ f_offset);
	link->next = first_page->freelist;
	kunmap_atomic(link);
	first_page->freelist = (void *)obj;

	first_page->inuse--;
	fullness = fix_fullness_group(pool, first_page);

	if (fullness == ZS_EMPTY)
		class->pages_allocated -= class->pages_per_zspage;

	spin_unlock(&class->lock);

	/* Release the now-empty zspage outside the class lock. */
	if (fullness == ZS_EMPTY)
		free_zspage(first_page);
}
EXPORT_SYMBOL_GPL(zs_free);
999 | |||
/**
 * zs_map_object - get address of allocated object from handle.
 * @pool: pool from which the object was allocated
 * @handle: handle returned from zs_malloc
 * @mm: mapping mode (stored in the per-cpu area; the copy-based unmap
 *	path skips write-back for ZS_MM_RO mappings)
 *
 * Before using an object allocated from zs_malloc, it must be mapped using
 * this function. When done with the object, it must be unmapped using
 * zs_unmap_object.
 *
 * Only one object can be mapped per cpu at a time. There is no protection
 * against nested mappings.
 *
 * This function returns with preemption and page faults disabled.
 */
void *zs_map_object(struct zs_pool *pool, unsigned long handle,
			enum zs_mapmode mm)
{
	struct page *page;
	unsigned long obj_idx, off;

	unsigned int class_idx;
	enum fullness_group fg;
	struct size_class *class;
	struct mapping_area *area;
	struct page *pages[2];

	BUG_ON(!handle);

	/*
	 * Because we use per-cpu mapping areas shared among the
	 * pools/users, we can't allow mapping in interrupt context
	 * because it can corrupt another users mappings.
	 */
	BUG_ON(in_interrupt());

	/* Locate the object: its page, index in the zspage, byte offset. */
	obj_handle_to_location(handle, &page, &obj_idx);
	get_zspage_mapping(get_first_page(page), &class_idx, &fg);
	class = &pool->size_class[class_idx];
	off = obj_idx_to_offset(page, obj_idx, class->size);

	/* get_cpu_var() disables preemption until zs_unmap_object(). */
	area = &get_cpu_var(zs_map_area);
	area->vm_mm = mm;
	if (off + class->size <= PAGE_SIZE) {
		/* this object is contained entirely within a page */
		area->vm_addr = kmap_atomic(page);
		return area->vm_addr + off;
	}

	/* this object spans two pages */
	pages[0] = page;
	pages[1] = get_next_page(page);
	BUG_ON(!pages[1]);

	return __zs_map_object(area, pages, off, class->size);
}
EXPORT_SYMBOL_GPL(zs_map_object);
1056 | |||
/*
 * zs_unmap_object - undo a mapping created by zs_map_object().
 * @pool: pool from which the object was allocated
 * @handle: handle that was passed to zs_map_object()
 *
 * Tears down the single- or two-page mapping and re-enables preemption
 * via put_cpu_var(), matching what zs_map_object() disabled.
 */
void zs_unmap_object(struct zs_pool *pool, unsigned long handle)
{
	struct page *page;
	unsigned long obj_idx, off;

	unsigned int class_idx;
	enum fullness_group fg;
	struct size_class *class;
	struct mapping_area *area;

	BUG_ON(!handle);

	/* Recompute the object's location exactly as zs_map_object() did. */
	obj_handle_to_location(handle, &page, &obj_idx);
	get_zspage_mapping(get_first_page(page), &class_idx, &fg);
	class = &pool->size_class[class_idx];
	off = obj_idx_to_offset(page, obj_idx, class->size);

	area = &__get_cpu_var(zs_map_area);
	if (off + class->size <= PAGE_SIZE)
		/* object fit in one page: plain kunmap_atomic suffices */
		kunmap_atomic(area->vm_addr);
	else {
		struct page *pages[2];

		pages[0] = page;
		pages[1] = get_next_page(page);
		BUG_ON(!pages[1]);

		/* write back (if writable) and release the two-page mapping */
		__zs_unmap_object(area, pages, off, class->size);
	}
	put_cpu_var(zs_map_area);
}
EXPORT_SYMBOL_GPL(zs_unmap_object);
1089 | |||
1090 | u64 zs_get_total_size_bytes(struct zs_pool *pool) | ||
1091 | { | ||
1092 | int i; | ||
1093 | u64 npages = 0; | ||
1094 | |||
1095 | for (i = 0; i < ZS_SIZE_CLASSES; i++) | ||
1096 | npages += pool->size_class[i].pages_allocated; | ||
1097 | |||
1098 | return npages << PAGE_SHIFT; | ||
1099 | } | ||
1100 | EXPORT_SYMBOL_GPL(zs_get_total_size_bytes); | ||
1101 | |||
/* Module entry/exit: per-cpu mapping areas are created in zs_init()
 * and released in zs_exit(). */
module_init(zs_init);
module_exit(zs_exit);

MODULE_LICENSE("Dual BSD/GPL");
MODULE_AUTHOR("Nitin Gupta <ngupta@vflare.org>");
diff --git a/mm/zswap.c b/mm/zswap.c index 5a63f78a5601..e55bab9dc41f 100644 --- a/mm/zswap.c +++ b/mm/zswap.c | |||
@@ -77,12 +77,12 @@ static u64 zswap_duplicate_entry; | |||
77 | **********************************/ | 77 | **********************************/ |
78 | /* Enable/disable zswap (disabled by default, fixed at boot for now) */ | 78 | /* Enable/disable zswap (disabled by default, fixed at boot for now) */ |
79 | static bool zswap_enabled __read_mostly; | 79 | static bool zswap_enabled __read_mostly; |
80 | module_param_named(enabled, zswap_enabled, bool, 0); | 80 | module_param_named(enabled, zswap_enabled, bool, 0444); |
81 | 81 | ||
82 | /* Compressor to be used by zswap (fixed at boot for now) */ | 82 | /* Compressor to be used by zswap (fixed at boot for now) */ |
83 | #define ZSWAP_COMPRESSOR_DEFAULT "lzo" | 83 | #define ZSWAP_COMPRESSOR_DEFAULT "lzo" |
84 | static char *zswap_compressor = ZSWAP_COMPRESSOR_DEFAULT; | 84 | static char *zswap_compressor = ZSWAP_COMPRESSOR_DEFAULT; |
85 | module_param_named(compressor, zswap_compressor, charp, 0); | 85 | module_param_named(compressor, zswap_compressor, charp, 0444); |
86 | 86 | ||
87 | /* The maximum percentage of memory that the compressed pool can occupy */ | 87 | /* The maximum percentage of memory that the compressed pool can occupy */ |
88 | static unsigned int zswap_max_pool_percent = 20; | 88 | static unsigned int zswap_max_pool_percent = 20; |