Diffstat (limited to 'mm')
53 files changed, 3726 insertions, 1749 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 723bbe04a0b0..2888024e0b0a 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -552,3 +552,28 @@ config MEM_SOFT_DIRTY
 	  it can be cleared by hands.
 
 	  See Documentation/vm/soft-dirty.txt for more details.
+
+config ZSMALLOC
+	bool "Memory allocator for compressed pages"
+	depends on MMU
+	default n
+	help
+	  zsmalloc is a slab-based memory allocator designed to store
+	  compressed RAM pages.  zsmalloc uses virtual memory mapping
+	  in order to reduce fragmentation.  However, this results in a
+	  non-standard allocator interface where a handle, not a pointer, is
+	  returned by an alloc().  This handle must be mapped in order to
+	  access the allocated space.
+
+config PGTABLE_MAPPING
+	bool "Use page table mapping to access object in zsmalloc"
+	depends on ZSMALLOC
+	help
+	  By default, zsmalloc uses a copy-based object mapping method to
+	  access allocations that span two pages. However, if a particular
+	  architecture (ex, ARM) performs VM mapping faster than copying,
+	  then you should select this. This causes zsmalloc to use page table
+	  mapping rather than copying for object mapping.
+
+	  You can check speed with zsmalloc benchmark:
+	  https://github.com/spartacus06/zsmapbench
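The help text above describes zsmalloc's handle-based interface: an allocation returns an opaque handle rather than a pointer, and the object has to be mapped before it can be touched and unmapped again afterwards. A minimal usage sketch, assuming the zsmalloc API of this kernel generation (zs_create_pool() still takes gfp flags here); it is illustrative only and not part of the patch:

#include <linux/zsmalloc.h>
#include <linux/gfp.h>
#include <linux/string.h>

static int zsmalloc_usage_sketch(void)
{
	struct zs_pool *pool;
	unsigned long handle;
	void *obj;

	pool = zs_create_pool(GFP_KERNEL);
	if (!pool)
		return -ENOMEM;

	/* An alloc() returns a handle, not a pointer. */
	handle = zs_malloc(pool, 128);
	if (!handle) {
		zs_destroy_pool(pool);
		return -ENOMEM;
	}

	/* The handle must be mapped before the memory can be accessed... */
	obj = zs_map_object(pool, handle, ZS_MM_WO);
	memset(obj, 0, 128);
	/* ...and unmapped again as soon as possible. */
	zs_unmap_object(pool, handle);

	zs_free(pool, handle);
	zs_destroy_pool(pool);
	return 0;
}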
diff --git a/mm/Makefile b/mm/Makefile
index 305d10acd081..310c90a09264 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -60,3 +60,4 @@ obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o
 obj-$(CONFIG_CLEANCACHE) += cleancache.o
 obj-$(CONFIG_MEMORY_ISOLATION) += page_isolation.o
 obj-$(CONFIG_ZBUD)	+= zbud.o
+obj-$(CONFIG_ZSMALLOC)	+= zsmalloc.o
diff --git a/mm/balloon_compaction.c b/mm/balloon_compaction.c
index 07dbc8ec46cf..6e45a5074bf0 100644
--- a/mm/balloon_compaction.c
+++ b/mm/balloon_compaction.c
@@ -267,7 +267,7 @@ void balloon_page_putback(struct page *page)
 		put_page(page);
 	} else {
 		WARN_ON(1);
-		dump_page(page);
+		dump_page(page, "not movable balloon page");
 	}
 	unlock_page(page);
 }
@@ -287,7 +287,7 @@ int balloon_page_migrate(struct page *newpage,
 	BUG_ON(!trylock_page(newpage));
 
 	if (WARN_ON(!__is_movable_balloon_page(page))) {
-		dump_page(page);
+		dump_page(page, "not movable balloon page");
 		unlock_page(newpage);
 		return rc;
 	}
diff --git a/mm/bounce.c b/mm/bounce.c
index 5a7d58fb883b..523918b8c6dc 100644
--- a/mm/bounce.c
+++ b/mm/bounce.c
@@ -98,27 +98,24 @@ int init_emergency_isa_pool(void)
 static void copy_to_high_bio_irq(struct bio *to, struct bio *from)
 {
 	unsigned char *vfrom;
-	struct bio_vec *tovec, *fromvec;
-	int i;
+	struct bio_vec tovec, *fromvec = from->bi_io_vec;
+	struct bvec_iter iter;
 
-	bio_for_each_segment(tovec, to, i) {
-		fromvec = from->bi_io_vec + i;
-
-		/*
-		 * not bounced
-		 */
-		if (tovec->bv_page == fromvec->bv_page)
-			continue;
-
-		/*
-		 * fromvec->bv_offset and fromvec->bv_len might have been
-		 * modified by the block layer, so use the original copy,
-		 * bounce_copy_vec already uses tovec->bv_len
-		 */
-		vfrom = page_address(fromvec->bv_page) + tovec->bv_offset;
+	bio_for_each_segment(tovec, to, iter) {
+		if (tovec.bv_page != fromvec->bv_page) {
+			/*
+			 * fromvec->bv_offset and fromvec->bv_len might have
+			 * been modified by the block layer, so use the original
+			 * copy, bounce_copy_vec already uses tovec->bv_len
+			 */
+			vfrom = page_address(fromvec->bv_page) +
+				tovec.bv_offset;
+
+			bounce_copy_vec(&tovec, vfrom);
+			flush_dcache_page(tovec.bv_page);
+		}
 
-		bounce_copy_vec(tovec, vfrom);
-		flush_dcache_page(tovec->bv_page);
+		fromvec++;
 	}
 }
 
@@ -201,13 +198,14 @@ static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig,
 {
 	struct bio *bio;
 	int rw = bio_data_dir(*bio_orig);
-	struct bio_vec *to, *from;
+	struct bio_vec *to, from;
+	struct bvec_iter iter;
 	unsigned i;
 
 	if (force)
 		goto bounce;
-	bio_for_each_segment(from, *bio_orig, i)
-		if (page_to_pfn(from->bv_page) > queue_bounce_pfn(q))
+	bio_for_each_segment(from, *bio_orig, iter)
+		if (page_to_pfn(from.bv_page) > queue_bounce_pfn(q))
 			goto bounce;
 
 	return;
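The two bounce.c hunks are part of the immutable-biovec conversion: bio_for_each_segment() now yields a struct bio_vec by value and keeps its position in a caller-provided struct bvec_iter, instead of handing out a bio_vec pointer indexed by an integer. A small sketch of the new iteration idiom, with an illustrative helper name; only the loop shape is taken from the hunks above:

#include <linux/bio.h>

/* Sum the payload bytes of a bio, segment by segment.  Illustrative only;
 * mirrors the converted loops in copy_to_high_bio_irq()/__blk_queue_bounce(). */
static unsigned int bio_payload_bytes(struct bio *bio)
{
	struct bio_vec bvec;	/* filled in by value on every iteration */
	struct bvec_iter iter;	/* iteration state lives outside the bio */
	unsigned int bytes = 0;

	bio_for_each_segment(bvec, bio, iter)
		bytes += bvec.bv_len;

	return bytes;
}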
diff --git a/mm/cleancache.c b/mm/cleancache.c
index 5875f48ce279..d0eac4350403 100644
--- a/mm/cleancache.c
+++ b/mm/cleancache.c
@@ -237,7 +237,7 @@ int __cleancache_get_page(struct page *page)
 		goto out;
 	}
 
-	VM_BUG_ON(!PageLocked(page));
+	VM_BUG_ON_PAGE(!PageLocked(page), page);
 	fake_pool_id = page->mapping->host->i_sb->cleancache_poolid;
 	if (fake_pool_id < 0)
 		goto out;
@@ -279,7 +279,7 @@ void __cleancache_put_page(struct page *page)
 		return;
 	}
 
-	VM_BUG_ON(!PageLocked(page));
+	VM_BUG_ON_PAGE(!PageLocked(page), page);
 	fake_pool_id = page->mapping->host->i_sb->cleancache_poolid;
 	if (fake_pool_id < 0)
 		return;
@@ -318,7 +318,7 @@ void __cleancache_invalidate_page(struct address_space *mapping,
 	if (pool_id < 0)
 		return;
 
-	VM_BUG_ON(!PageLocked(page));
+	VM_BUG_ON_PAGE(!PageLocked(page), page);
 	if (cleancache_get_key(mapping->host, &key) >= 0) {
 		cleancache_ops->invalidate_page(pool_id,
 					key, page->index);
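This file, and most of the files below, convert VM_BUG_ON(cond) calls that test a page into VM_BUG_ON_PAGE(cond, page), which additionally dumps the page's state (flags, mapcount, mapping, index) before panicking. Conceptually the macro behaves like the sketch below; this is an approximation, not the exact include/linux/mmdebug.h definition, and it is only active under CONFIG_DEBUG_VM:

/* Approximate shape of VM_BUG_ON_PAGE(); the dump_page() reason string
 * used here is an assumption, the real macro may format it differently. */
#define VM_BUG_ON_PAGE(cond, page)					\
	do {								\
		if (unlikely(cond)) {					\
			dump_page(page, "VM_BUG_ON_PAGE(" #cond ")");	\
			BUG();						\
		}							\
	} while (0)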
diff --git a/mm/compaction.c b/mm/compaction.c
index f58bcd016f43..918577595ea8 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -251,7 +251,6 @@ static unsigned long isolate_freepages_block(struct compact_control *cc,
 {
 	int nr_scanned = 0, total_isolated = 0;
 	struct page *cursor, *valid_page = NULL;
-	unsigned long nr_strict_required = end_pfn - blockpfn;
 	unsigned long flags;
 	bool locked = false;
 
@@ -264,11 +263,12 @@ static unsigned long isolate_freepages_block(struct compact_control *cc,
 
 		nr_scanned++;
 		if (!pfn_valid_within(blockpfn))
-			continue;
+			goto isolate_fail;
+
 		if (!valid_page)
 			valid_page = page;
 		if (!PageBuddy(page))
-			continue;
+			goto isolate_fail;
 
 		/*
 		 * The zone lock must be held to isolate freepages.
@@ -289,12 +289,10 @@ static unsigned long isolate_freepages_block(struct compact_control *cc,
 
 		/* Recheck this is a buddy page under lock */
 		if (!PageBuddy(page))
-			continue;
+			goto isolate_fail;
 
 		/* Found a free page, break it into order-0 pages */
 		isolated = split_free_page(page);
-		if (!isolated && strict)
-			break;
 		total_isolated += isolated;
 		for (i = 0; i < isolated; i++) {
 			list_add(&page->lru, freelist);
@@ -305,7 +303,15 @@ static unsigned long isolate_freepages_block(struct compact_control *cc,
 		if (isolated) {
 			blockpfn += isolated - 1;
 			cursor += isolated - 1;
+			continue;
 		}
+
+isolate_fail:
+		if (strict)
+			break;
+		else
+			continue;
+
 	}
 
 	trace_mm_compaction_isolate_freepages(nr_scanned, total_isolated);
@@ -315,7 +321,7 @@ static unsigned long isolate_freepages_block(struct compact_control *cc,
 	 * pages requested were isolated. If there were any failures, 0 is
 	 * returned and CMA will fail.
	 */
-	if (strict && nr_strict_required > total_isolated)
+	if (strict && blockpfn < end_pfn)
 		total_isolated = 0;
 
 	if (locked)
@@ -459,6 +465,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
 	unsigned long flags;
 	bool locked = false;
 	struct page *page = NULL, *valid_page = NULL;
+	bool skipped_async_unsuitable = false;
 
 	/*
 	 * Ensure that there are not too many pages isolated from the LRU
@@ -522,7 +529,10 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
 		if (!isolation_suitable(cc, page))
 			goto next_pageblock;
 
-		/* Skip if free */
+		/*
+		 * Skip if free. page_order cannot be used without zone->lock
+		 * as nothing prevents parallel allocations or buddy merging.
+		 */
 		if (PageBuddy(page))
 			continue;
 
@@ -534,6 +544,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
 		if (!cc->sync && last_pageblock_nr != pageblock_nr &&
 		    !migrate_async_suitable(get_pageblock_migratetype(page))) {
 			cc->finished_update_migrate = true;
+			skipped_async_unsuitable = true;
 			goto next_pageblock;
 		}
 
@@ -599,7 +610,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
 		if (__isolate_lru_page(page, mode) != 0)
 			continue;
 
-		VM_BUG_ON(PageTransCompound(page));
+		VM_BUG_ON_PAGE(PageTransCompound(page), page);
 
 		/* Successfully isolated */
 		cc->finished_update_migrate = true;
@@ -627,8 +638,13 @@ next_pageblock:
 	if (locked)
 		spin_unlock_irqrestore(&zone->lru_lock, flags);
 
-	/* Update the pageblock-skip if the whole pageblock was scanned */
-	if (low_pfn == end_pfn)
+	/*
+	 * Update the pageblock-skip information and cached scanner pfn,
+	 * if the whole pageblock was scanned without isolating any page.
+	 * This is not done when pageblock was skipped due to being unsuitable
+	 * for async compaction, so that eventual sync compaction can try.
+	 */
+	if (low_pfn == end_pfn && !skipped_async_unsuitable)
 		update_pageblock_skip(cc, valid_page, nr_isolated, true);
 
 	trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated);
@@ -660,7 +676,7 @@ static void isolate_freepages(struct zone *zone,
 	 * is the end of the pageblock the migration scanner is using.
 	 */
 	pfn = cc->free_pfn;
-	low_pfn = cc->migrate_pfn + pageblock_nr_pages;
+	low_pfn = ALIGN(cc->migrate_pfn + 1, pageblock_nr_pages);
 
 	/*
 	 * Take care that if the migration scanner is at the end of the zone
@@ -676,7 +692,7 @@ static void isolate_freepages(struct zone *zone,
 	 * pages on cc->migratepages. We stop searching if the migrate
 	 * and free page scanners meet or enough free pages are isolated.
 	 */
-	for (; pfn > low_pfn && cc->nr_migratepages > nr_freepages;
+	for (; pfn >= low_pfn && cc->nr_migratepages > nr_freepages;
 					pfn -= pageblock_nr_pages) {
 		unsigned long isolated;
 
@@ -738,7 +754,14 @@ static void isolate_freepages(struct zone *zone,
 	/* split_free_page does not map the pages */
 	map_pages(freelist);
 
-	cc->free_pfn = high_pfn;
+	/*
+	 * If we crossed the migrate scanner, we want to keep it that way
+	 * so that compact_finished() may detect this
+	 */
+	if (pfn < low_pfn)
+		cc->free_pfn = max(pfn, zone->zone_start_pfn);
+	else
+		cc->free_pfn = high_pfn;
 	cc->nr_freepages = nr_freepages;
 }
 
@@ -837,6 +860,10 @@ static int compact_finished(struct zone *zone,
 
 	/* Compaction run completes if the migrate and free scanner meet */
 	if (cc->free_pfn <= cc->migrate_pfn) {
+		/* Let the next compaction start anew. */
+		zone->compact_cached_migrate_pfn = zone->zone_start_pfn;
+		zone->compact_cached_free_pfn = zone_end_pfn(zone);
+
 		/*
 		 * Mark that the PG_migrate_skip information should be cleared
 		 * by kswapd when it goes to sleep. kswapd does not set the
@@ -947,6 +974,14 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
 	}
 
 	/*
+	 * Clear pageblock skip if there were failures recently and compaction
+	 * is about to be retried after being deferred. kswapd does not do
+	 * this reset as it'll reset the cached information when going to sleep.
+	 */
+	if (compaction_restarting(zone, cc->order) && !current_is_kswapd())
+		__reset_isolation_suitable(zone);
+
+	/*
 	 * Setup to move all movable pages to the end of the zone. Used cached
 	 * information on where the scanners should start but check that it
 	 * is initialised by ensuring the values are within zone boundaries.
@@ -962,13 +997,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
 		zone->compact_cached_migrate_pfn = cc->migrate_pfn;
 	}
 
-	/*
-	 * Clear pageblock skip if there were failures recently and compaction
-	 * is about to be retried after being deferred. kswapd does not do
-	 * this reset as it'll reset the cached information when going to sleep.
-	 */
-	if (compaction_restarting(zone, cc->order) && !current_is_kswapd())
-		__reset_isolation_suitable(zone);
+	trace_mm_compaction_begin(start_pfn, cc->migrate_pfn, cc->free_pfn, end_pfn);
 
 	migrate_prep_local();
 
@@ -1003,7 +1032,11 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
 		if (err) {
 			putback_movable_pages(&cc->migratepages);
 			cc->nr_migratepages = 0;
-			if (err == -ENOMEM) {
+			/*
+			 * migrate_pages() may return -ENOMEM when scanners meet
+			 * and we want compact_finished() to detect it
+			 */
+			if (err == -ENOMEM && cc->free_pfn > cc->migrate_pfn) {
 				ret = COMPACT_PARTIAL;
 				goto out;
 			}
@@ -1015,6 +1048,8 @@ out:
 	cc->nr_freepages -= release_freepages(&cc->freepages);
 	VM_BUG_ON(cc->nr_freepages != 0);
 
+	trace_mm_compaction_end(ret);
+
 	return ret;
 }
 
@@ -1120,12 +1155,11 @@ static void __compact_pgdat(pg_data_t *pgdat, struct compact_control *cc)
 		compact_zone(zone, cc);
 
 		if (cc->order > 0) {
-			int ok = zone_watermark_ok(zone, cc->order,
-						low_wmark_pages(zone), 0, 0);
-			if (ok && cc->order >= zone->compact_order_failed)
-				zone->compact_order_failed = cc->order + 1;
+			if (zone_watermark_ok(zone, cc->order,
+						low_wmark_pages(zone), 0, 0))
+				compaction_defer_reset(zone, cc->order, false);
 			/* Currently async compaction is never deferred. */
-			else if (!ok && cc->sync)
+			else if (cc->sync)
 				defer_compaction(zone, cc->order);
 		}
 
diff --git a/mm/filemap.c b/mm/filemap.c
index b7749a92021c..7a13f6ac5421 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -409,9 +409,9 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask)
 {
 	int error;
 
-	VM_BUG_ON(!PageLocked(old));
-	VM_BUG_ON(!PageLocked(new));
-	VM_BUG_ON(new->mapping);
+	VM_BUG_ON_PAGE(!PageLocked(old), old);
+	VM_BUG_ON_PAGE(!PageLocked(new), new);
+	VM_BUG_ON_PAGE(new->mapping, new);
 
 	error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM);
 	if (!error) {
@@ -461,8 +461,8 @@ int add_to_page_cache_locked(struct page *page, struct address_space *mapping,
 {
 	int error;
 
-	VM_BUG_ON(!PageLocked(page));
-	VM_BUG_ON(PageSwapBacked(page));
+	VM_BUG_ON_PAGE(!PageLocked(page), page);
+	VM_BUG_ON_PAGE(PageSwapBacked(page), page);
 
 	error = mem_cgroup_cache_charge(page, current->mm,
 					gfp_mask & GFP_RECLAIM_MASK);
@@ -607,7 +607,7 @@ EXPORT_SYMBOL_GPL(add_page_wait_queue);
  */
 void unlock_page(struct page *page)
 {
-	VM_BUG_ON(!PageLocked(page));
+	VM_BUG_ON_PAGE(!PageLocked(page), page);
 	clear_bit_unlock(PG_locked, &page->flags);
 	smp_mb__after_clear_bit();
 	wake_up_page(page, PG_locked);
@@ -760,7 +760,7 @@ repeat:
 			page_cache_release(page);
 			goto repeat;
 		}
-		VM_BUG_ON(page->index != offset);
+		VM_BUG_ON_PAGE(page->index != offset, page);
 	}
 	return page;
 }
@@ -1428,30 +1428,28 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
 		if (!count)
 			goto out; /* skip atime */
 		size = i_size_read(inode);
-		if (pos < size) {
-			retval = filemap_write_and_wait_range(mapping, pos,
+		retval = filemap_write_and_wait_range(mapping, pos,
 					pos + iov_length(iov, nr_segs) - 1);
-			if (!retval) {
-				retval = mapping->a_ops->direct_IO(READ, iocb,
-							iov, pos, nr_segs);
-			}
-			if (retval > 0) {
-				*ppos = pos + retval;
-				count -= retval;
-			}
+		if (!retval) {
+			retval = mapping->a_ops->direct_IO(READ, iocb,
+						iov, pos, nr_segs);
+		}
+		if (retval > 0) {
+			*ppos = pos + retval;
+			count -= retval;
+		}
 
-			/*
-			 * Btrfs can have a short DIO read if we encounter
-			 * compressed extents, so if there was an error, or if
-			 * we've already read everything we wanted to, or if
-			 * there was a short read because we hit EOF, go ahead
-			 * and return.  Otherwise fallthrough to buffered io for
-			 * the rest of the read.
-			 */
-			if (retval < 0 || !count || *ppos >= size) {
-				file_accessed(filp);
-				goto out;
-			}
+		/*
+		 * Btrfs can have a short DIO read if we encounter
+		 * compressed extents, so if there was an error, or if
+		 * we've already read everything we wanted to, or if
+		 * there was a short read because we hit EOF, go ahead
+		 * and return.  Otherwise fallthrough to buffered io for
+		 * the rest of the read.
+		 */
+		if (retval < 0 || !count || *ppos >= size) {
+			file_accessed(filp);
+			goto out;
 		}
 	}
 
@@ -1656,7 +1654,7 @@ retry_find:
 		put_page(page);
 		goto retry_find;
 	}
-	VM_BUG_ON(page->index != offset);
+	VM_BUG_ON_PAGE(page->index != offset, page);
 
 	/*
 	 * We have a locked page in the page cache, now we need to check
@@ -2555,8 +2553,8 @@ ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
 	if (ret > 0) {
 		ssize_t err;
 
-		err = generic_write_sync(file, pos, ret);
-		if (err < 0 && ret > 0)
+		err = generic_write_sync(file, iocb->ki_pos - ret, ret);
+		if (err < 0)
 			ret = err;
 	}
 	return ret;
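The last hunk changes which byte range generic_file_aio_write() syncs: once __generic_file_aio_write() has returned, iocb->ki_pos already points past the data just written, so the ret bytes that need syncing end at ki_pos rather than starting at the caller-supplied pos, which is stale for O_APPEND writes. A toy calculation with made-up numbers, just to make the offset arithmetic concrete:

#include <stdio.h>

/* Userspace toy, not kernel code: illustrates the range handed to
 * generic_write_sync(file, iocb->ki_pos - ret, ret) after the fix. */
int main(void)
{
	long long ki_pos = 8192;	/* file position after the write completed */
	long long ret = 4096;		/* bytes actually written */
	long long start = ki_pos - ret;		/* 4096: where the data landed */
	long long end = start + ret - 1;	/* 8191: last byte that needs syncing */

	printf("sync range: [%lld, %lld]\n", start, end);
	return 0;
}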
diff --git a/mm/fremap.c b/mm/fremap.c
index bbc4d660221a..34feba60a17e 100644
--- a/mm/fremap.c
+++ b/mm/fremap.c
@@ -23,28 +23,44 @@
 
 #include "internal.h"
 
+static int mm_counter(struct page *page)
+{
+	return PageAnon(page) ? MM_ANONPAGES : MM_FILEPAGES;
+}
+
 static void zap_pte(struct mm_struct *mm, struct vm_area_struct *vma,
 			unsigned long addr, pte_t *ptep)
 {
 	pte_t pte = *ptep;
+	struct page *page;
+	swp_entry_t entry;
 
 	if (pte_present(pte)) {
-		struct page *page;
-
 		flush_cache_page(vma, addr, pte_pfn(pte));
 		pte = ptep_clear_flush(vma, addr, ptep);
 		page = vm_normal_page(vma, addr, pte);
 		if (page) {
 			if (pte_dirty(pte))
 				set_page_dirty(page);
+			update_hiwater_rss(mm);
+			dec_mm_counter(mm, mm_counter(page));
 			page_remove_rmap(page);
 			page_cache_release(page);
+		}
+	} else {	/* zap_pte() is not called when pte_none() */
+		if (!pte_file(pte)) {
 			update_hiwater_rss(mm);
-			dec_mm_counter(mm, MM_FILEPAGES);
+			entry = pte_to_swp_entry(pte);
+			if (non_swap_entry(entry)) {
+				if (is_migration_entry(entry)) {
+					page = migration_entry_to_page(entry);
+					dec_mm_counter(mm, mm_counter(page));
+				}
+			} else {
+				free_swap_and_cache(entry);
+				dec_mm_counter(mm, MM_SWAPENTS);
+			}
 		}
-	} else {
-		if (!pte_file(pte))
-			free_swap_and_cache(pte_to_swp_entry(pte));
 		pte_clear_not_present_full(mm, addr, ptep, 0);
 	}
 }
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 95d1acb0f3d2..1546655a2d78 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -130,8 +130,14 @@ static int set_recommended_min_free_kbytes(void)
 			      (unsigned long) nr_free_buffer_pages() / 20);
 	recommended_min <<= (PAGE_SHIFT-10);
 
-	if (recommended_min > min_free_kbytes)
+	if (recommended_min > min_free_kbytes) {
+		if (user_min_free_kbytes >= 0)
+			pr_info("raising min_free_kbytes from %d to %lu "
+				"to help transparent hugepage allocations\n",
+				min_free_kbytes, recommended_min);
+
 		min_free_kbytes = recommended_min;
+	}
 	setup_per_zone_wmarks();
 	return 0;
 }
@@ -655,7 +661,7 @@ out:
 	hugepage_exit_sysfs(hugepage_kobj);
 	return err;
 }
-module_init(hugepage_init)
+subsys_initcall(hugepage_init);
 
 static int __init setup_transparent_hugepage(char *str)
 {
@@ -712,7 +718,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
 	pgtable_t pgtable;
 	spinlock_t *ptl;
 
-	VM_BUG_ON(!PageCompound(page));
+	VM_BUG_ON_PAGE(!PageCompound(page), page);
 	pgtable = pte_alloc_one(mm, haddr);
 	if (unlikely(!pgtable))
 		return VM_FAULT_OOM;
@@ -893,7 +899,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 		goto out;
 	}
 	src_page = pmd_page(pmd);
-	VM_BUG_ON(!PageHead(src_page));
+	VM_BUG_ON_PAGE(!PageHead(src_page), src_page);
 	get_page(src_page);
 	page_dup_rmap(src_page);
 	add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
@@ -1067,7 +1073,7 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
 	ptl = pmd_lock(mm, pmd);
 	if (unlikely(!pmd_same(*pmd, orig_pmd)))
 		goto out_free_pages;
-	VM_BUG_ON(!PageHead(page));
+	VM_BUG_ON_PAGE(!PageHead(page), page);
 
 	pmdp_clear_flush(vma, haddr, pmd);
 	/* leave pmd empty until pte is filled */
@@ -1133,7 +1139,7 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		goto out_unlock;
 
 	page = pmd_page(orig_pmd);
-	VM_BUG_ON(!PageCompound(page) || !PageHead(page));
+	VM_BUG_ON_PAGE(!PageCompound(page) || !PageHead(page), page);
 	if (page_mapcount(page) == 1) {
 		pmd_t entry;
 		entry = pmd_mkyoung(orig_pmd);
@@ -1160,8 +1166,10 @@ alloc:
 	} else {
 		ret = do_huge_pmd_wp_page_fallback(mm, vma, address,
 				pmd, orig_pmd, page, haddr);
-		if (ret & VM_FAULT_OOM)
+		if (ret & VM_FAULT_OOM) {
 			split_huge_page(page);
+			ret |= VM_FAULT_FALLBACK;
+		}
 		put_page(page);
 	}
 	count_vm_event(THP_FAULT_FALLBACK);
@@ -1173,9 +1181,10 @@ alloc:
 		if (page) {
 			split_huge_page(page);
 			put_page(page);
-		}
+		} else
+			split_huge_page_pmd(vma, address, pmd);
+		ret |= VM_FAULT_FALLBACK;
 		count_vm_event(THP_FAULT_FALLBACK);
-		ret |= VM_FAULT_OOM;
 		goto out;
 	}
 
@@ -1211,7 +1220,7 @@ alloc:
 		add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR);
 		put_huge_zero_page();
 	} else {
-		VM_BUG_ON(!PageHead(page));
+		VM_BUG_ON_PAGE(!PageHead(page), page);
 		page_remove_rmap(page);
 		put_page(page);
 	}
@@ -1249,7 +1258,7 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
 		goto out;
 
 	page = pmd_page(*pmd);
-	VM_BUG_ON(!PageHead(page));
+	VM_BUG_ON_PAGE(!PageHead(page), page);
 	if (flags & FOLL_TOUCH) {
 		pmd_t _pmd;
 		/*
@@ -1274,7 +1283,7 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
 		}
 	}
 	page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT;
-	VM_BUG_ON(!PageCompound(page));
+	VM_BUG_ON_PAGE(!PageCompound(page), page);
 	if (flags & FOLL_GET)
 		get_page_foll(page);
 
@@ -1432,9 +1441,9 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
 	} else {
 		page = pmd_page(orig_pmd);
 		page_remove_rmap(page);
-		VM_BUG_ON(page_mapcount(page) < 0);
+		VM_BUG_ON_PAGE(page_mapcount(page) < 0, page);
 		add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
-		VM_BUG_ON(!PageHead(page));
+		VM_BUG_ON_PAGE(!PageHead(page), page);
 		atomic_long_dec(&tlb->mm->nr_ptes);
 		spin_unlock(ptl);
 		tlb_remove_page(tlb, page);
@@ -1502,19 +1511,15 @@ int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma,
 			spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
 		pmd = pmdp_get_and_clear(mm, old_addr, old_pmd);
 		VM_BUG_ON(!pmd_none(*new_pmd));
-		set_pmd_at(mm, new_addr, new_pmd, pmd_mksoft_dirty(pmd));
-		if (new_ptl != old_ptl) {
-			pgtable_t pgtable;
 
-			/*
-			 * Move preallocated PTE page table if new_pmd is on
-			 * different PMD page table.
-			 */
+		if (pmd_move_must_withdraw(new_ptl, old_ptl)) {
+			pgtable_t pgtable;
 			pgtable = pgtable_trans_huge_withdraw(mm, old_pmd);
 			pgtable_trans_huge_deposit(mm, new_pmd, pgtable);
-
-			spin_unlock(new_ptl);
 		}
+		set_pmd_at(mm, new_addr, new_pmd, pmd_mksoft_dirty(pmd));
+		if (new_ptl != old_ptl)
+			spin_unlock(new_ptl);
 		spin_unlock(old_ptl);
 	}
 out:
@@ -1543,6 +1548,7 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
 			entry = pmd_mknonnuma(entry);
 			entry = pmd_modify(entry, newprot);
 			ret = HPAGE_PMD_NR;
+			set_pmd_at(mm, addr, pmd, entry);
 			BUG_ON(pmd_write(entry));
 		} else {
 			struct page *page = pmd_page(*pmd);
@@ -1555,16 +1561,10 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
 			 */
 			if (!is_huge_zero_page(page) &&
 			    !pmd_numa(*pmd)) {
-				entry = *pmd;
-				entry = pmd_mknuma(entry);
+				pmdp_set_numa(mm, addr, pmd);
 				ret = HPAGE_PMD_NR;
 			}
 		}
-
-		/* Set PMD if cleared earlier */
-		if (ret == HPAGE_PMD_NR)
-			set_pmd_at(mm, addr, pmd, entry);
-
 		spin_unlock(ptl);
 	}
 
@@ -1961,7 +1961,7 @@ out:
 	return ret;
 }
 
-#define VM_NO_THP (VM_SPECIAL|VM_MIXEDMAP|VM_HUGETLB|VM_SHARED|VM_MAYSHARE)
+#define VM_NO_THP (VM_SPECIAL | VM_HUGETLB | VM_SHARED | VM_MAYSHARE)
 
 int hugepage_madvise(struct vm_area_struct *vma,
 		     unsigned long *vm_flags, int advice)
@@ -2176,9 +2176,9 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
 		if (unlikely(!page))
 			goto out;
 
-		VM_BUG_ON(PageCompound(page));
-		BUG_ON(!PageAnon(page));
-		VM_BUG_ON(!PageSwapBacked(page));
+		VM_BUG_ON_PAGE(PageCompound(page), page);
+		VM_BUG_ON_PAGE(!PageAnon(page), page);
+		VM_BUG_ON_PAGE(!PageSwapBacked(page), page);
 
 		/* cannot use mapcount: can't collapse if there's a gup pin */
 		if (page_count(page) != 1)
@@ -2201,8 +2201,8 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
 		}
 		/* 0 stands for page_is_file_cache(page) == false */
 		inc_zone_page_state(page, NR_ISOLATED_ANON + 0);
-		VM_BUG_ON(!PageLocked(page));
-		VM_BUG_ON(PageLRU(page));
+		VM_BUG_ON_PAGE(!PageLocked(page), page);
+		VM_BUG_ON_PAGE(PageLRU(page), page);
 
 		/* If there is no mapped pte young don't collapse the page */
 		if (pte_young(pteval) || PageReferenced(page) ||
@@ -2232,7 +2232,7 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page,
 		} else {
 			src_page = pte_page(pteval);
 			copy_user_highpage(page, src_page, address, vma);
-			VM_BUG_ON(page_mapcount(src_page) != 1);
+			VM_BUG_ON_PAGE(page_mapcount(src_page) != 1, src_page);
 			release_pte_page(src_page);
 			/*
 			 * ptl mostly unnecessary, but preempt has to
@@ -2311,7 +2311,7 @@ static struct page
 		       struct vm_area_struct *vma, unsigned long address,
 		       int node)
 {
-	VM_BUG_ON(*hpage);
+	VM_BUG_ON_PAGE(*hpage, *hpage);
 	/*
 	 * Allocate the page while the vma is still valid and under
 	 * the mmap_sem read mode so there is no memory allocation
@@ -2580,7 +2580,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
 		 */
 		node = page_to_nid(page);
 		khugepaged_node_load[node]++;
-		VM_BUG_ON(PageCompound(page));
+		VM_BUG_ON_PAGE(PageCompound(page), page);
 		if (!PageLRU(page) || PageLocked(page) || !PageAnon(page))
 			goto out_unmap;
 		/* cannot use mapcount: can't collapse if there's a gup pin */
@@ -2876,7 +2876,7 @@ again:
 		return;
 	}
 	page = pmd_page(*pmd);
-	VM_BUG_ON(!page_count(page));
+	VM_BUG_ON_PAGE(!page_count(page), page);
 	get_page(page);
 	spin_unlock(ptl);
 	mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index dee6cf4e6d34..c01cb9fedb18 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -584,7 +584,7 @@ static void update_and_free_page(struct hstate *h, struct page *page)
 				1 << PG_active | 1 << PG_reserved |
 				1 << PG_private | 1 << PG_writeback);
 	}
-	VM_BUG_ON(hugetlb_cgroup_from_page(page));
+	VM_BUG_ON_PAGE(hugetlb_cgroup_from_page(page), page);
 	set_compound_page_dtor(page, NULL);
 	set_page_refcounted(page);
 	arch_release_hugepage(page);
@@ -690,15 +690,11 @@ static void prep_compound_gigantic_page(struct page *page, unsigned long order)
  */
 int PageHuge(struct page *page)
 {
-	compound_page_dtor *dtor;
-
 	if (!PageCompound(page))
 		return 0;
 
 	page = compound_head(page);
-	dtor = get_compound_page_dtor(page);
-
-	return dtor == free_huge_page;
+	return get_compound_page_dtor(page) == free_huge_page;
 }
 EXPORT_SYMBOL_GPL(PageHuge);
 
@@ -708,16 +704,11 @@ EXPORT_SYMBOL_GPL(PageHuge);
  */
 int PageHeadHuge(struct page *page_head)
 {
-	compound_page_dtor *dtor;
-
 	if (!PageHead(page_head))
 		return 0;
 
-	dtor = get_compound_page_dtor(page_head);
-
-	return dtor == free_huge_page;
+	return get_compound_page_dtor(page_head) == free_huge_page;
 }
-EXPORT_SYMBOL_GPL(PageHeadHuge);
 
 pgoff_t __basepage_index(struct page *page)
 {
@@ -1098,7 +1089,7 @@ retry:
 		 * no users -- drop the buddy allocator's reference.
 		 */
 		put_page_testzero(page);
-		VM_BUG_ON(page_count(page));
+		VM_BUG_ON_PAGE(page_count(page), page);
 		enqueue_huge_page(h, page);
 	}
 free:
@@ -1280,9 +1271,9 @@ int __weak alloc_bootmem_huge_page(struct hstate *h)
 	for_each_node_mask_to_alloc(h, nr_nodes, node, &node_states[N_MEMORY]) {
 		void *addr;
 
-		addr = __alloc_bootmem_node_nopanic(NODE_DATA(node),
-				huge_page_size(h), huge_page_size(h), 0);
-
+		addr = memblock_virt_alloc_try_nid_nopanic(
+				huge_page_size(h), huge_page_size(h),
+				0, BOOTMEM_ALLOC_ACCESSIBLE, node);
 		if (addr) {
 			/*
 			 * Use the beginning of the huge page to store the
@@ -1322,8 +1313,8 @@ static void __init gather_bootmem_prealloc(void)
 
 #ifdef CONFIG_HIGHMEM
 		page = pfn_to_page(m->phys >> PAGE_SHIFT);
-		free_bootmem_late((unsigned long)m,
-				  sizeof(struct huge_bootmem_page));
+		memblock_free_late(__pa(m),
+				   sizeof(struct huge_bootmem_page));
 #else
 		page = virt_to_page(m);
 #endif
@@ -2355,17 +2346,27 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
 	int cow;
 	struct hstate *h = hstate_vma(vma);
 	unsigned long sz = huge_page_size(h);
+	unsigned long mmun_start;	/* For mmu_notifiers */
+	unsigned long mmun_end;		/* For mmu_notifiers */
+	int ret = 0;
 
 	cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
 
+	mmun_start = vma->vm_start;
+	mmun_end = vma->vm_end;
+	if (cow)
+		mmu_notifier_invalidate_range_start(src, mmun_start, mmun_end);
+
 	for (addr = vma->vm_start; addr < vma->vm_end; addr += sz) {
 		spinlock_t *src_ptl, *dst_ptl;
 		src_pte = huge_pte_offset(src, addr);
 		if (!src_pte)
 			continue;
 		dst_pte = huge_pte_alloc(dst, addr, sz);
-		if (!dst_pte)
-			goto nomem;
+		if (!dst_pte) {
+			ret = -ENOMEM;
+			break;
+		}
 
 		/* If the pagetables are shared don't copy or take references */
 		if (dst_pte == src_pte)
@@ -2386,10 +2387,11 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
 		spin_unlock(src_ptl);
 		spin_unlock(dst_ptl);
 	}
-	return 0;
 
-nomem:
-	return -ENOMEM;
+	if (cow)
+		mmu_notifier_invalidate_range_end(src, mmun_start, mmun_end);
+
+	return ret;
 }
 
 static int is_hugetlb_entry_migration(pte_t pte)
@@ -3079,7 +3081,7 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
 same_page:
 		if (pages) {
 			pages[i] = mem_map_offset(page, pfn_offset);
-			get_page(pages[i]);
+			get_page_foll(pages[i]);
 		}
 
 		if (vmas)
@@ -3501,7 +3503,7 @@ int dequeue_hwpoisoned_huge_page(struct page *hpage)
 
 bool isolate_huge_page(struct page *page, struct list_head *list)
 {
-	VM_BUG_ON(!PageHead(page));
+	VM_BUG_ON_PAGE(!PageHead(page), page);
 	if (!get_page_unless_zero(page))
 		return false;
 	spin_lock(&hugetlb_lock);
@@ -3512,7 +3514,7 @@ bool isolate_huge_page(struct page *page, struct list_head *list)
 
 void putback_active_hugepage(struct page *page)
 {
-	VM_BUG_ON(!PageHead(page));
+	VM_BUG_ON_PAGE(!PageHead(page), page);
 	spin_lock(&hugetlb_lock);
 	list_move_tail(&page->lru, &(page_hstate(page))->hugepage_activelist);
 	spin_unlock(&hugetlb_lock);
@@ -3521,7 +3523,7 @@ void putback_active_hugepage(struct page *page)
 
 bool is_hugepage_active(struct page *page)
 {
-	VM_BUG_ON(!PageHuge(page));
+	VM_BUG_ON_PAGE(!PageHuge(page), page);
 	/*
 	 * This function can be called for a tail page because the caller,
 	 * scan_movable_pages, scans through a given pfn-range which typically
diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c
index bda8e44f6fde..cb00829bb466 100644
--- a/mm/hugetlb_cgroup.c
+++ b/mm/hugetlb_cgroup.c
@@ -242,22 +242,16 @@ void hugetlb_cgroup_uncharge_cgroup(int idx, unsigned long nr_pages,
 	return;
 }
 
-static ssize_t hugetlb_cgroup_read(struct cgroup_subsys_state *css,
-				   struct cftype *cft, struct file *file,
-				   char __user *buf, size_t nbytes,
-				   loff_t *ppos)
+static u64 hugetlb_cgroup_read_u64(struct cgroup_subsys_state *css,
+				   struct cftype *cft)
 {
-	u64 val;
-	char str[64];
-	int idx, name, len;
+	int idx, name;
 	struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(css);
 
 	idx = MEMFILE_IDX(cft->private);
 	name = MEMFILE_ATTR(cft->private);
 
-	val = res_counter_read_u64(&h_cg->hugepage[idx], name);
-	len = scnprintf(str, sizeof(str), "%llu\n", (unsigned long long)val);
-	return simple_read_from_buffer(buf, nbytes, ppos, str, len);
+	return res_counter_read_u64(&h_cg->hugepage[idx], name);
 }
 
 static int hugetlb_cgroup_write(struct cgroup_subsys_state *css,
@@ -337,28 +331,28 @@ static void __init __hugetlb_cgroup_file_init(int idx)
 	cft = &h->cgroup_files[0];
 	snprintf(cft->name, MAX_CFTYPE_NAME, "%s.limit_in_bytes", buf);
 	cft->private = MEMFILE_PRIVATE(idx, RES_LIMIT);
-	cft->read = hugetlb_cgroup_read;
+	cft->read_u64 = hugetlb_cgroup_read_u64;
 	cft->write_string = hugetlb_cgroup_write;
 
 	/* Add the usage file */
 	cft = &h->cgroup_files[1];
 	snprintf(cft->name, MAX_CFTYPE_NAME, "%s.usage_in_bytes", buf);
 	cft->private = MEMFILE_PRIVATE(idx, RES_USAGE);
-	cft->read = hugetlb_cgroup_read;
+	cft->read_u64 = hugetlb_cgroup_read_u64;
 
 	/* Add the MAX usage file */
 	cft = &h->cgroup_files[2];
 	snprintf(cft->name, MAX_CFTYPE_NAME, "%s.max_usage_in_bytes", buf);
 	cft->private = MEMFILE_PRIVATE(idx, RES_MAX_USAGE);
 	cft->trigger = hugetlb_cgroup_reset;
-	cft->read = hugetlb_cgroup_read;
+	cft->read_u64 = hugetlb_cgroup_read_u64;
 
 	/* Add the failcntfile */
 	cft = &h->cgroup_files[3];
 	snprintf(cft->name, MAX_CFTYPE_NAME, "%s.failcnt", buf);
 	cft->private = MEMFILE_PRIVATE(idx, RES_FAILCNT);
 	cft->trigger = hugetlb_cgroup_reset;
-	cft->read = hugetlb_cgroup_read;
+	cft->read_u64 = hugetlb_cgroup_read_u64;
 
 	/* NULL terminate the last cft */
 	cft = &h->cgroup_files[4];
@@ -396,7 +390,7 @@ void hugetlb_cgroup_migrate(struct page *oldhpage, struct page *newhpage)
 	if (hugetlb_cgroup_disabled())
 		return;
 
-	VM_BUG_ON(!PageHuge(oldhpage));
+	VM_BUG_ON_PAGE(!PageHuge(oldhpage), oldhpage);
 	spin_lock(&hugetlb_lock);
 	h_cg = hugetlb_cgroup_from_page(oldhpage);
 	set_hugetlb_cgroup(oldhpage, NULL);
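The hugetlb_cgroup change swaps the hand-rolled .read file method for the generic .read_u64 callback, so the cgroup core formats the number and the controller only returns it. A sketch of how such a cftype entry can be wired up, reusing names visible in this hunk; the file name and array here are hypothetical:

/* Hypothetical cftype table entry.  .read_u64 has the signature introduced
 * above: u64 (*read_u64)(struct cgroup_subsys_state *css, struct cftype *cft). */
static struct cftype example_hugetlb_files[] = {
	{
		.name		= "example.usage_in_bytes",	/* made-up name */
		.private	= MEMFILE_PRIVATE(0, RES_USAGE),
		.read_u64	= hugetlb_cgroup_read_u64,
	},
	{ }	/* NULL-terminate, as __hugetlb_cgroup_file_init() does */
};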
diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c
index 4c84678371eb..95487c71cad5 100644
--- a/mm/hwpoison-inject.c
+++ b/mm/hwpoison-inject.c
@@ -55,7 +55,7 @@ static int hwpoison_inject(void *data, u64 val)
 	return 0;
 
 inject:
-	printk(KERN_INFO "Injecting memory failure at pfn %lx\n", pfn);
+	pr_info("Injecting memory failure at pfn %#lx\n", pfn);
 	return memory_failure(pfn, 18, MF_COUNT_INCREASED);
 }
 
diff --git a/mm/internal.h b/mm/internal.h index 684f7aa9692a..29e1e761f9eb 100644 --- a/mm/internal.h +++ b/mm/internal.h | |||
| @@ -27,8 +27,8 @@ static inline void set_page_count(struct page *page, int v) | |||
| 27 | */ | 27 | */ |
| 28 | static inline void set_page_refcounted(struct page *page) | 28 | static inline void set_page_refcounted(struct page *page) |
| 29 | { | 29 | { |
| 30 | VM_BUG_ON(PageTail(page)); | 30 | VM_BUG_ON_PAGE(PageTail(page), page); |
| 31 | VM_BUG_ON(atomic_read(&page->_count)); | 31 | VM_BUG_ON_PAGE(atomic_read(&page->_count), page); |
| 32 | set_page_count(page, 1); | 32 | set_page_count(page, 1); |
| 33 | } | 33 | } |
| 34 | 34 | ||
| @@ -46,12 +46,10 @@ static inline void __get_page_tail_foll(struct page *page, | |||
| 46 | * speculative page access (like in | 46 | * speculative page access (like in |
| 47 | * page_cache_get_speculative()) on tail pages. | 47 | * page_cache_get_speculative()) on tail pages. |
| 48 | */ | 48 | */ |
| 49 | VM_BUG_ON(atomic_read(&page->first_page->_count) <= 0); | 49 | VM_BUG_ON_PAGE(atomic_read(&page->first_page->_count) <= 0, page); |
| 50 | VM_BUG_ON(atomic_read(&page->_count) != 0); | ||
| 51 | VM_BUG_ON(page_mapcount(page) < 0); | ||
| 52 | if (get_page_head) | 50 | if (get_page_head) |
| 53 | atomic_inc(&page->first_page->_count); | 51 | atomic_inc(&page->first_page->_count); |
| 54 | atomic_inc(&page->_mapcount); | 52 | get_huge_page_tail(page); |
| 55 | } | 53 | } |
| 56 | 54 | ||
| 57 | /* | 55 | /* |
| @@ -73,7 +71,7 @@ static inline void get_page_foll(struct page *page) | |||
| 73 | * Getting a normal page or the head of a compound page | 71 | * Getting a normal page or the head of a compound page |
| 74 | * requires to already have an elevated page->_count. | 72 | * requires to already have an elevated page->_count. |
| 75 | */ | 73 | */ |
| 76 | VM_BUG_ON(atomic_read(&page->_count) <= 0); | 74 | VM_BUG_ON_PAGE(atomic_read(&page->_count) <= 0, page); |
| 77 | atomic_inc(&page->_count); | 75 | atomic_inc(&page->_count); |
| 78 | } | 76 | } |
| 79 | } | 77 | } |
| @@ -85,7 +83,6 @@ extern unsigned long highest_memmap_pfn; | |||
| 85 | */ | 83 | */ |
| 86 | extern int isolate_lru_page(struct page *page); | 84 | extern int isolate_lru_page(struct page *page); |
| 87 | extern void putback_lru_page(struct page *page); | 85 | extern void putback_lru_page(struct page *page); |
| 88 | extern unsigned long zone_reclaimable_pages(struct zone *zone); | ||
| 89 | extern bool zone_reclaimable(struct zone *zone); | 86 | extern bool zone_reclaimable(struct zone *zone); |
| 90 | 87 | ||
| 91 | /* | 88 | /* |
| @@ -101,6 +98,7 @@ extern void prep_compound_page(struct page *page, unsigned long order); | |||
| 101 | #ifdef CONFIG_MEMORY_FAILURE | 98 | #ifdef CONFIG_MEMORY_FAILURE |
| 102 | extern bool is_free_buddy_page(struct page *page); | 99 | extern bool is_free_buddy_page(struct page *page); |
| 103 | #endif | 100 | #endif |
| 101 | extern int user_min_free_kbytes; | ||
| 104 | 102 | ||
| 105 | #if defined CONFIG_COMPACTION || defined CONFIG_CMA | 103 | #if defined CONFIG_COMPACTION || defined CONFIG_CMA |
| 106 | 104 | ||
| @@ -144,9 +142,11 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, | |||
| 144 | #endif | 142 | #endif |
| 145 | 143 | ||
| 146 | /* | 144 | /* |
| 147 | * function for dealing with page's order in buddy system. | 145 | * This function returns the order of a free page in the buddy system. In |
| 148 | * zone->lock is already acquired when we use these. | 146 | * general, page_zone(page)->lock must be held by the caller to prevent the |
| 149 | * So, we don't need atomic page->flags operations here. | 147 | * page from being allocated in parallel and returning garbage as the order. |
| 148 | * If a caller does not hold page_zone(page)->lock, it must guarantee that the | ||
| 149 | * page cannot be allocated or merged in parallel. | ||
| 150 | */ | 150 | */ |
| 151 | static inline unsigned long page_order(struct page *page) | 151 | static inline unsigned long page_order(struct page *page) |
| 152 | { | 152 | { |
| @@ -175,7 +175,7 @@ static inline void munlock_vma_pages_all(struct vm_area_struct *vma) | |||
| 175 | static inline int mlocked_vma_newpage(struct vm_area_struct *vma, | 175 | static inline int mlocked_vma_newpage(struct vm_area_struct *vma, |
| 176 | struct page *page) | 176 | struct page *page) |
| 177 | { | 177 | { |
| 178 | VM_BUG_ON(PageLRU(page)); | 178 | VM_BUG_ON_PAGE(PageLRU(page), page); |
| 179 | 179 | ||
| 180 | if (likely((vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) != VM_LOCKED)) | 180 | if (likely((vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) != VM_LOCKED)) |
| 181 | return 0; | 181 | return 0; |
| @@ -444,7 +444,7 @@ static void break_cow(struct rmap_item *rmap_item) | |||
| 444 | static struct page *page_trans_compound_anon(struct page *page) | 444 | static struct page *page_trans_compound_anon(struct page *page) |
| 445 | { | 445 | { |
| 446 | if (PageTransCompound(page)) { | 446 | if (PageTransCompound(page)) { |
| 447 | struct page *head = compound_trans_head(page); | 447 | struct page *head = compound_head(page); |
| 448 | /* | 448 | /* |
| 449 | * head may actually be split and freed from under | 449 | * head may actually be split and freed from under |
| 450 | * us but it's ok here. | 450 | * us but it's ok here. |
| @@ -1891,21 +1891,24 @@ struct page *ksm_might_need_to_copy(struct page *page, | |||
| 1891 | return new_page; | 1891 | return new_page; |
| 1892 | } | 1892 | } |
| 1893 | 1893 | ||
| 1894 | int page_referenced_ksm(struct page *page, struct mem_cgroup *memcg, | 1894 | int rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc) |
| 1895 | unsigned long *vm_flags) | ||
| 1896 | { | 1895 | { |
| 1897 | struct stable_node *stable_node; | 1896 | struct stable_node *stable_node; |
| 1898 | struct rmap_item *rmap_item; | 1897 | struct rmap_item *rmap_item; |
| 1899 | unsigned int mapcount = page_mapcount(page); | 1898 | int ret = SWAP_AGAIN; |
| 1900 | int referenced = 0; | ||
| 1901 | int search_new_forks = 0; | 1899 | int search_new_forks = 0; |
| 1902 | 1900 | ||
| 1903 | VM_BUG_ON(!PageKsm(page)); | 1901 | VM_BUG_ON_PAGE(!PageKsm(page), page); |
| 1904 | VM_BUG_ON(!PageLocked(page)); | 1902 | |
| 1903 | /* | ||
| 1904 | * Rely on the page lock to protect against concurrent modifications | ||
| 1905 | * to that page's node of the stable tree. | ||
| 1906 | */ | ||
| 1907 | VM_BUG_ON_PAGE(!PageLocked(page), page); | ||
| 1905 | 1908 | ||
| 1906 | stable_node = page_stable_node(page); | 1909 | stable_node = page_stable_node(page); |
| 1907 | if (!stable_node) | 1910 | if (!stable_node) |
| 1908 | return 0; | 1911 | return ret; |
| 1909 | again: | 1912 | again: |
| 1910 | hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) { | 1913 | hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) { |
| 1911 | struct anon_vma *anon_vma = rmap_item->anon_vma; | 1914 | struct anon_vma *anon_vma = rmap_item->anon_vma; |
| @@ -1928,113 +1931,16 @@ again: | |||
| 1928 | if ((rmap_item->mm == vma->vm_mm) == search_new_forks) | 1931 | if ((rmap_item->mm == vma->vm_mm) == search_new_forks) |
| 1929 | continue; | 1932 | continue; |
| 1930 | 1933 | ||
| 1931 | if (memcg && !mm_match_cgroup(vma->vm_mm, memcg)) | 1934 | if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg)) |
| 1932 | continue; | ||
| 1933 | |||
| 1934 | referenced += page_referenced_one(page, vma, | ||
| 1935 | rmap_item->address, &mapcount, vm_flags); | ||
| 1936 | if (!search_new_forks || !mapcount) | ||
| 1937 | break; | ||
| 1938 | } | ||
| 1939 | anon_vma_unlock_read(anon_vma); | ||
| 1940 | if (!mapcount) | ||
| 1941 | goto out; | ||
| 1942 | } | ||
| 1943 | if (!search_new_forks++) | ||
| 1944 | goto again; | ||
| 1945 | out: | ||
| 1946 | return referenced; | ||
| 1947 | } | ||
| 1948 | |||
| 1949 | int try_to_unmap_ksm(struct page *page, enum ttu_flags flags) | ||
| 1950 | { | ||
| 1951 | struct stable_node *stable_node; | ||
| 1952 | struct rmap_item *rmap_item; | ||
| 1953 | int ret = SWAP_AGAIN; | ||
| 1954 | int search_new_forks = 0; | ||
| 1955 | |||
| 1956 | VM_BUG_ON(!PageKsm(page)); | ||
| 1957 | VM_BUG_ON(!PageLocked(page)); | ||
| 1958 | |||
| 1959 | stable_node = page_stable_node(page); | ||
| 1960 | if (!stable_node) | ||
| 1961 | return SWAP_FAIL; | ||
| 1962 | again: | ||
| 1963 | hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) { | ||
| 1964 | struct anon_vma *anon_vma = rmap_item->anon_vma; | ||
| 1965 | struct anon_vma_chain *vmac; | ||
| 1966 | struct vm_area_struct *vma; | ||
| 1967 | |||
| 1968 | anon_vma_lock_read(anon_vma); | ||
| 1969 | anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root, | ||
| 1970 | 0, ULONG_MAX) { | ||
| 1971 | vma = vmac->vma; | ||
| 1972 | if (rmap_item->address < vma->vm_start || | ||
| 1973 | rmap_item->address >= vma->vm_end) | ||
| 1974 | continue; | ||
| 1975 | /* | ||
| 1976 | * Initially we examine only the vma which covers this | ||
| 1977 | * rmap_item; but later, if there is still work to do, | ||
| 1978 | * we examine covering vmas in other mms: in case they | ||
| 1979 | * were forked from the original since ksmd passed. | ||
| 1980 | */ | ||
| 1981 | if ((rmap_item->mm == vma->vm_mm) == search_new_forks) | ||
| 1982 | continue; | 1935 | continue; |
| 1983 | 1936 | ||
| 1984 | ret = try_to_unmap_one(page, vma, | 1937 | ret = rwc->rmap_one(page, vma, |
| 1985 | rmap_item->address, flags); | 1938 | rmap_item->address, rwc->arg); |
| 1986 | if (ret != SWAP_AGAIN || !page_mapped(page)) { | 1939 | if (ret != SWAP_AGAIN) { |
| 1987 | anon_vma_unlock_read(anon_vma); | 1940 | anon_vma_unlock_read(anon_vma); |
| 1988 | goto out; | 1941 | goto out; |
| 1989 | } | 1942 | } |
| 1990 | } | 1943 | if (rwc->done && rwc->done(page)) { |
| 1991 | anon_vma_unlock_read(anon_vma); | ||
| 1992 | } | ||
| 1993 | if (!search_new_forks++) | ||
| 1994 | goto again; | ||
| 1995 | out: | ||
| 1996 | return ret; | ||
| 1997 | } | ||
| 1998 | |||
| 1999 | #ifdef CONFIG_MIGRATION | ||
| 2000 | int rmap_walk_ksm(struct page *page, int (*rmap_one)(struct page *, | ||
| 2001 | struct vm_area_struct *, unsigned long, void *), void *arg) | ||
| 2002 | { | ||
| 2003 | struct stable_node *stable_node; | ||
| 2004 | struct rmap_item *rmap_item; | ||
| 2005 | int ret = SWAP_AGAIN; | ||
| 2006 | int search_new_forks = 0; | ||
| 2007 | |||
| 2008 | VM_BUG_ON(!PageKsm(page)); | ||
| 2009 | VM_BUG_ON(!PageLocked(page)); | ||
| 2010 | |||
| 2011 | stable_node = page_stable_node(page); | ||
| 2012 | if (!stable_node) | ||
| 2013 | return ret; | ||
| 2014 | again: | ||
| 2015 | hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) { | ||
| 2016 | struct anon_vma *anon_vma = rmap_item->anon_vma; | ||
| 2017 | struct anon_vma_chain *vmac; | ||
| 2018 | struct vm_area_struct *vma; | ||
| 2019 | |||
| 2020 | anon_vma_lock_read(anon_vma); | ||
| 2021 | anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root, | ||
| 2022 | 0, ULONG_MAX) { | ||
| 2023 | vma = vmac->vma; | ||
| 2024 | if (rmap_item->address < vma->vm_start || | ||
| 2025 | rmap_item->address >= vma->vm_end) | ||
| 2026 | continue; | ||
| 2027 | /* | ||
| 2028 | * Initially we examine only the vma which covers this | ||
| 2029 | * rmap_item; but later, if there is still work to do, | ||
| 2030 | * we examine covering vmas in other mms: in case they | ||
| 2031 | * were forked from the original since ksmd passed. | ||
| 2032 | */ | ||
| 2033 | if ((rmap_item->mm == vma->vm_mm) == search_new_forks) | ||
| 2034 | continue; | ||
| 2035 | |||
| 2036 | ret = rmap_one(page, vma, rmap_item->address, arg); | ||
| 2037 | if (ret != SWAP_AGAIN) { | ||
| 2038 | anon_vma_unlock_read(anon_vma); | 1944 | anon_vma_unlock_read(anon_vma); |
| 2039 | goto out; | 1945 | goto out; |
| 2040 | } | 1946 | } |
| @@ -2047,17 +1953,18 @@ out: | |||
| 2047 | return ret; | 1953 | return ret; |
| 2048 | } | 1954 | } |
| 2049 | 1955 | ||
| 1956 | #ifdef CONFIG_MIGRATION | ||
| 2050 | void ksm_migrate_page(struct page *newpage, struct page *oldpage) | 1957 | void ksm_migrate_page(struct page *newpage, struct page *oldpage) |
| 2051 | { | 1958 | { |
| 2052 | struct stable_node *stable_node; | 1959 | struct stable_node *stable_node; |
| 2053 | 1960 | ||
| 2054 | VM_BUG_ON(!PageLocked(oldpage)); | 1961 | VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage); |
| 2055 | VM_BUG_ON(!PageLocked(newpage)); | 1962 | VM_BUG_ON_PAGE(!PageLocked(newpage), newpage); |
| 2056 | VM_BUG_ON(newpage->mapping != oldpage->mapping); | 1963 | VM_BUG_ON_PAGE(newpage->mapping != oldpage->mapping, newpage); |
| 2057 | 1964 | ||
| 2058 | stable_node = page_stable_node(newpage); | 1965 | stable_node = page_stable_node(newpage); |
| 2059 | if (stable_node) { | 1966 | if (stable_node) { |
| 2060 | VM_BUG_ON(stable_node->kpfn != page_to_pfn(oldpage)); | 1967 | VM_BUG_ON_PAGE(stable_node->kpfn != page_to_pfn(oldpage), oldpage); |
| 2061 | stable_node->kpfn = page_to_pfn(newpage); | 1968 | stable_node->kpfn = page_to_pfn(newpage); |
| 2062 | /* | 1969 | /* |
| 2063 | * newpage->mapping was set in advance; now we need smp_wmb() | 1970 | * newpage->mapping was set in advance; now we need smp_wmb() |
| @@ -2438,4 +2345,4 @@ out_free: | |||
| 2438 | out: | 2345 | out: |
| 2439 | return err; | 2346 | return err; |
| 2440 | } | 2347 | } |
| 2441 | module_init(ksm_init) | 2348 | subsys_initcall(ksm_init); |
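The ksm.c hunk above folds the separate page_referenced_ksm() and try_to_unmap_ksm() walkers into a single rmap_walk_ksm() driven by a struct rmap_walk_control. Based only on the call sites visible in this hunk (rwc->rmap_one, rwc->invalid_vma, rwc->done and rwc->arg), a caller would look roughly like the following hedged sketch; the callback names and the counting logic are hypothetical:

/* Hypothetical walker: count mappings of a locked KSM page (illustration only). */
static int count_one(struct page *page, struct vm_area_struct *vma,
		     unsigned long addr, void *arg)
{
	int *count = arg;

	(*count)++;
	return SWAP_AGAIN;		/* keep walking */
}

static int count_ksm_mappings(struct page *page)
{
	int count = 0;
	struct rmap_walk_control rwc = {
		.rmap_one = count_one,	/* called once per mapping VMA */
		.arg = &count,
		/* .invalid_vma / .done are optional, per the NULL checks above */
	};

	rmap_walk_ksm(page, &rwc);	/* page must be locked, see VM_BUG_ON_PAGE */
	return count;
}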
diff --git a/mm/memblock.c b/mm/memblock.c index 53e477bb5558..39a31e7f0045 100644 --- a/mm/memblock.c +++ b/mm/memblock.c | |||
| @@ -21,6 +21,9 @@ | |||
| 21 | #include <linux/memblock.h> | 21 | #include <linux/memblock.h> |
| 22 | 22 | ||
| 23 | #include <asm-generic/sections.h> | 23 | #include <asm-generic/sections.h> |
| 24 | #include <linux/io.h> | ||
| 25 | |||
| 26 | #include "internal.h" | ||
| 24 | 27 | ||
| 25 | static struct memblock_region memblock_memory_init_regions[INIT_MEMBLOCK_REGIONS] __initdata_memblock; | 28 | static struct memblock_region memblock_memory_init_regions[INIT_MEMBLOCK_REGIONS] __initdata_memblock; |
| 26 | static struct memblock_region memblock_reserved_init_regions[INIT_MEMBLOCK_REGIONS] __initdata_memblock; | 29 | static struct memblock_region memblock_reserved_init_regions[INIT_MEMBLOCK_REGIONS] __initdata_memblock; |
| @@ -39,6 +42,9 @@ struct memblock memblock __initdata_memblock = { | |||
| 39 | }; | 42 | }; |
| 40 | 43 | ||
| 41 | int memblock_debug __initdata_memblock; | 44 | int memblock_debug __initdata_memblock; |
| 45 | #ifdef CONFIG_MOVABLE_NODE | ||
| 46 | bool movable_node_enabled __initdata_memblock = false; | ||
| 47 | #endif | ||
| 42 | static int memblock_can_resize __initdata_memblock; | 48 | static int memblock_can_resize __initdata_memblock; |
| 43 | static int memblock_memory_in_slab __initdata_memblock = 0; | 49 | static int memblock_memory_in_slab __initdata_memblock = 0; |
| 44 | static int memblock_reserved_in_slab __initdata_memblock = 0; | 50 | static int memblock_reserved_in_slab __initdata_memblock = 0; |
| @@ -91,7 +97,7 @@ static long __init_memblock memblock_overlaps_region(struct memblock_type *type, | |||
| 91 | * @end: end of candidate range, can be %MEMBLOCK_ALLOC_{ANYWHERE|ACCESSIBLE} | 97 | * @end: end of candidate range, can be %MEMBLOCK_ALLOC_{ANYWHERE|ACCESSIBLE} |
| 92 | * @size: size of free area to find | 98 | * @size: size of free area to find |
| 93 | * @align: alignment of free area to find | 99 | * @align: alignment of free area to find |
| 94 | * @nid: nid of the free area to find, %MAX_NUMNODES for any node | 100 | * @nid: nid of the free area to find, %NUMA_NO_NODE for any node |
| 95 | * | 101 | * |
| 96 | * Utility called from memblock_find_in_range_node(), find free area bottom-up. | 102 | * Utility called from memblock_find_in_range_node(), find free area bottom-up. |
| 97 | * | 103 | * |
| @@ -123,7 +129,7 @@ __memblock_find_range_bottom_up(phys_addr_t start, phys_addr_t end, | |||
| 123 | * @end: end of candidate range, can be %MEMBLOCK_ALLOC_{ANYWHERE|ACCESSIBLE} | 129 | * @end: end of candidate range, can be %MEMBLOCK_ALLOC_{ANYWHERE|ACCESSIBLE} |
| 124 | * @size: size of free area to find | 130 | * @size: size of free area to find |
| 125 | * @align: alignment of free area to find | 131 | * @align: alignment of free area to find |
| 126 | * @nid: nid of the free area to find, %MAX_NUMNODES for any node | 132 | * @nid: nid of the free area to find, %NUMA_NO_NODE for any node |
| 127 | * | 133 | * |
| 128 | * Utility called from memblock_find_in_range_node(), find free area top-down. | 134 | * Utility called from memblock_find_in_range_node(), find free area top-down. |
| 129 | * | 135 | * |
| @@ -154,11 +160,11 @@ __memblock_find_range_top_down(phys_addr_t start, phys_addr_t end, | |||
| 154 | 160 | ||
| 155 | /** | 161 | /** |
| 156 | * memblock_find_in_range_node - find free area in given range and node | 162 | * memblock_find_in_range_node - find free area in given range and node |
| 157 | * @start: start of candidate range | ||
| 158 | * @end: end of candidate range, can be %MEMBLOCK_ALLOC_{ANYWHERE|ACCESSIBLE} | ||
| 159 | * @size: size of free area to find | 163 | * @size: size of free area to find |
| 160 | * @align: alignment of free area to find | 164 | * @align: alignment of free area to find |
| 161 | * @nid: nid of the free area to find, %MAX_NUMNODES for any node | 165 | * @start: start of candidate range |
| 166 | * @end: end of candidate range, can be %MEMBLOCK_ALLOC_{ANYWHERE|ACCESSIBLE} | ||
| 167 | * @nid: nid of the free area to find, %NUMA_NO_NODE for any node | ||
| 162 | * | 168 | * |
| 163 | * Find @size free area aligned to @align in the specified range and node. | 169 | * Find @size free area aligned to @align in the specified range and node. |
| 164 | * | 170 | * |
| @@ -173,9 +179,9 @@ __memblock_find_range_top_down(phys_addr_t start, phys_addr_t end, | |||
| 173 | * RETURNS: | 179 | * RETURNS: |
| 174 | * Found address on success, 0 on failure. | 180 | * Found address on success, 0 on failure. |
| 175 | */ | 181 | */ |
| 176 | phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t start, | 182 | phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t size, |
| 177 | phys_addr_t end, phys_addr_t size, | 183 | phys_addr_t align, phys_addr_t start, |
| 178 | phys_addr_t align, int nid) | 184 | phys_addr_t end, int nid) |
| 179 | { | 185 | { |
| 180 | int ret; | 186 | int ret; |
| 181 | phys_addr_t kernel_end; | 187 | phys_addr_t kernel_end; |
| @@ -238,8 +244,8 @@ phys_addr_t __init_memblock memblock_find_in_range(phys_addr_t start, | |||
| 238 | phys_addr_t end, phys_addr_t size, | 244 | phys_addr_t end, phys_addr_t size, |
| 239 | phys_addr_t align) | 245 | phys_addr_t align) |
| 240 | { | 246 | { |
| 241 | return memblock_find_in_range_node(start, end, size, align, | 247 | return memblock_find_in_range_node(size, align, start, end, |
| 242 | MAX_NUMNODES); | 248 | NUMA_NO_NODE); |
| 243 | } | 249 | } |
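To make the reordered memblock_find_in_range_node() signature above concrete: the size/align pair now comes first, followed by the [start, end) candidate range and the node, with NUMA_NO_NODE replacing MAX_NUMNODES for "any node". A hedged example of the new calling convention (the 1 MiB size is illustrative only):

/* Find and reserve 1 MiB anywhere accessible, on any node (illustration). */
phys_addr_t addr;

addr = memblock_find_in_range_node(SZ_1M, SMP_CACHE_BYTES,
				   0, MEMBLOCK_ALLOC_ACCESSIBLE,
				   NUMA_NO_NODE);
if (addr)
	memblock_reserve(addr, SZ_1M);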
| 244 | 250 | ||
| 245 | static void __init_memblock memblock_remove_region(struct memblock_type *type, unsigned long r) | 251 | static void __init_memblock memblock_remove_region(struct memblock_type *type, unsigned long r) |
| @@ -255,10 +261,13 @@ static void __init_memblock memblock_remove_region(struct memblock_type *type, u | |||
| 255 | type->cnt = 1; | 261 | type->cnt = 1; |
| 256 | type->regions[0].base = 0; | 262 | type->regions[0].base = 0; |
| 257 | type->regions[0].size = 0; | 263 | type->regions[0].size = 0; |
| 264 | type->regions[0].flags = 0; | ||
| 258 | memblock_set_region_node(&type->regions[0], MAX_NUMNODES); | 265 | memblock_set_region_node(&type->regions[0], MAX_NUMNODES); |
| 259 | } | 266 | } |
| 260 | } | 267 | } |
| 261 | 268 | ||
| 269 | #ifdef CONFIG_ARCH_DISCARD_MEMBLOCK | ||
| 270 | |||
| 262 | phys_addr_t __init_memblock get_allocated_memblock_reserved_regions_info( | 271 | phys_addr_t __init_memblock get_allocated_memblock_reserved_regions_info( |
| 263 | phys_addr_t *addr) | 272 | phys_addr_t *addr) |
| 264 | { | 273 | { |
| @@ -271,6 +280,20 @@ phys_addr_t __init_memblock get_allocated_memblock_reserved_regions_info( | |||
| 271 | memblock.reserved.max); | 280 | memblock.reserved.max); |
| 272 | } | 281 | } |
| 273 | 282 | ||
| 283 | phys_addr_t __init_memblock get_allocated_memblock_memory_regions_info( | ||
| 284 | phys_addr_t *addr) | ||
| 285 | { | ||
| 286 | if (memblock.memory.regions == memblock_memory_init_regions) | ||
| 287 | return 0; | ||
| 288 | |||
| 289 | *addr = __pa(memblock.memory.regions); | ||
| 290 | |||
| 291 | return PAGE_ALIGN(sizeof(struct memblock_region) * | ||
| 292 | memblock.memory.max); | ||
| 293 | } | ||
| 294 | |||
| 295 | #endif | ||
| 296 | |||
| 274 | /** | 297 | /** |
| 275 | * memblock_double_array - double the size of the memblock regions array | 298 | * memblock_double_array - double the size of the memblock regions array |
| 276 | * @type: memblock type of the regions array being doubled | 299 | * @type: memblock type of the regions array being doubled |
| @@ -405,7 +428,8 @@ static void __init_memblock memblock_merge_regions(struct memblock_type *type) | |||
| 405 | 428 | ||
| 406 | if (this->base + this->size != next->base || | 429 | if (this->base + this->size != next->base || |
| 407 | memblock_get_region_node(this) != | 430 | memblock_get_region_node(this) != |
| 408 | memblock_get_region_node(next)) { | 431 | memblock_get_region_node(next) || |
| 432 | this->flags != next->flags) { | ||
| 409 | BUG_ON(this->base + this->size > next->base); | 433 | BUG_ON(this->base + this->size > next->base); |
| 410 | i++; | 434 | i++; |
| 411 | continue; | 435 | continue; |
| @@ -425,13 +449,15 @@ static void __init_memblock memblock_merge_regions(struct memblock_type *type) | |||
| 425 | * @base: base address of the new region | 449 | * @base: base address of the new region |
| 426 | * @size: size of the new region | 450 | * @size: size of the new region |
| 427 | * @nid: node id of the new region | 451 | * @nid: node id of the new region |
| 452 | * @flags: flags of the new region | ||
| 428 | * | 453 | * |
| 429 | * Insert new memblock region [@base,@base+@size) into @type at @idx. | 454 | * Insert new memblock region [@base,@base+@size) into @type at @idx. |
| 430 | * @type must already have extra room to accommodate the new region. | 455 | * @type must already have extra room to accommodate the new region. |
| 431 | */ | 456 | */ |
| 432 | static void __init_memblock memblock_insert_region(struct memblock_type *type, | 457 | static void __init_memblock memblock_insert_region(struct memblock_type *type, |
| 433 | int idx, phys_addr_t base, | 458 | int idx, phys_addr_t base, |
| 434 | phys_addr_t size, int nid) | 459 | phys_addr_t size, |
| 460 | int nid, unsigned long flags) | ||
| 435 | { | 461 | { |
| 436 | struct memblock_region *rgn = &type->regions[idx]; | 462 | struct memblock_region *rgn = &type->regions[idx]; |
| 437 | 463 | ||
| @@ -439,6 +465,7 @@ static void __init_memblock memblock_insert_region(struct memblock_type *type, | |||
| 439 | memmove(rgn + 1, rgn, (type->cnt - idx) * sizeof(*rgn)); | 465 | memmove(rgn + 1, rgn, (type->cnt - idx) * sizeof(*rgn)); |
| 440 | rgn->base = base; | 466 | rgn->base = base; |
| 441 | rgn->size = size; | 467 | rgn->size = size; |
| 468 | rgn->flags = flags; | ||
| 442 | memblock_set_region_node(rgn, nid); | 469 | memblock_set_region_node(rgn, nid); |
| 443 | type->cnt++; | 470 | type->cnt++; |
| 444 | type->total_size += size; | 471 | type->total_size += size; |
| @@ -450,6 +477,7 @@ static void __init_memblock memblock_insert_region(struct memblock_type *type, | |||
| 450 | * @base: base address of the new region | 477 | * @base: base address of the new region |
| 451 | * @size: size of the new region | 478 | * @size: size of the new region |
| 452 | * @nid: nid of the new region | 479 | * @nid: nid of the new region |
| 480 | * @flags: flags of the new region | ||
| 453 | * | 481 | * |
| 454 | * Add new memblock region [@base,@base+@size) into @type. The new region | 482 | * Add new memblock region [@base,@base+@size) into @type. The new region |
| 455 | * is allowed to overlap with existing ones - overlaps don't affect already | 483 | * is allowed to overlap with existing ones - overlaps don't affect already |
| @@ -460,7 +488,8 @@ static void __init_memblock memblock_insert_region(struct memblock_type *type, | |||
| 460 | * 0 on success, -errno on failure. | 488 | * 0 on success, -errno on failure. |
| 461 | */ | 489 | */ |
| 462 | static int __init_memblock memblock_add_region(struct memblock_type *type, | 490 | static int __init_memblock memblock_add_region(struct memblock_type *type, |
| 463 | phys_addr_t base, phys_addr_t size, int nid) | 491 | phys_addr_t base, phys_addr_t size, |
| 492 | int nid, unsigned long flags) | ||
| 464 | { | 493 | { |
| 465 | bool insert = false; | 494 | bool insert = false; |
| 466 | phys_addr_t obase = base; | 495 | phys_addr_t obase = base; |
| @@ -475,6 +504,7 @@ static int __init_memblock memblock_add_region(struct memblock_type *type, | |||
| 475 | WARN_ON(type->cnt != 1 || type->total_size); | 504 | WARN_ON(type->cnt != 1 || type->total_size); |
| 476 | type->regions[0].base = base; | 505 | type->regions[0].base = base; |
| 477 | type->regions[0].size = size; | 506 | type->regions[0].size = size; |
| 507 | type->regions[0].flags = flags; | ||
| 478 | memblock_set_region_node(&type->regions[0], nid); | 508 | memblock_set_region_node(&type->regions[0], nid); |
| 479 | type->total_size = size; | 509 | type->total_size = size; |
| 480 | return 0; | 510 | return 0; |
| @@ -505,7 +535,8 @@ repeat: | |||
| 505 | nr_new++; | 535 | nr_new++; |
| 506 | if (insert) | 536 | if (insert) |
| 507 | memblock_insert_region(type, i++, base, | 537 | memblock_insert_region(type, i++, base, |
| 508 | rbase - base, nid); | 538 | rbase - base, nid, |
| 539 | flags); | ||
| 509 | } | 540 | } |
| 510 | /* area below @rend is dealt with, forget about it */ | 541 | /* area below @rend is dealt with, forget about it */ |
| 511 | base = min(rend, end); | 542 | base = min(rend, end); |
| @@ -515,7 +546,8 @@ repeat: | |||
| 515 | if (base < end) { | 546 | if (base < end) { |
| 516 | nr_new++; | 547 | nr_new++; |
| 517 | if (insert) | 548 | if (insert) |
| 518 | memblock_insert_region(type, i, base, end - base, nid); | 549 | memblock_insert_region(type, i, base, end - base, |
| 550 | nid, flags); | ||
| 519 | } | 551 | } |
| 520 | 552 | ||
| 521 | /* | 553 | /* |
| @@ -537,12 +569,13 @@ repeat: | |||
| 537 | int __init_memblock memblock_add_node(phys_addr_t base, phys_addr_t size, | 569 | int __init_memblock memblock_add_node(phys_addr_t base, phys_addr_t size, |
| 538 | int nid) | 570 | int nid) |
| 539 | { | 571 | { |
| 540 | return memblock_add_region(&memblock.memory, base, size, nid); | 572 | return memblock_add_region(&memblock.memory, base, size, nid, 0); |
| 541 | } | 573 | } |
| 542 | 574 | ||
| 543 | int __init_memblock memblock_add(phys_addr_t base, phys_addr_t size) | 575 | int __init_memblock memblock_add(phys_addr_t base, phys_addr_t size) |
| 544 | { | 576 | { |
| 545 | return memblock_add_region(&memblock.memory, base, size, MAX_NUMNODES); | 577 | return memblock_add_region(&memblock.memory, base, size, |
| 578 | MAX_NUMNODES, 0); | ||
| 546 | } | 579 | } |
| 547 | 580 | ||
| 548 | /** | 581 | /** |
| @@ -597,7 +630,8 @@ static int __init_memblock memblock_isolate_range(struct memblock_type *type, | |||
| 597 | rgn->size -= base - rbase; | 630 | rgn->size -= base - rbase; |
| 598 | type->total_size -= base - rbase; | 631 | type->total_size -= base - rbase; |
| 599 | memblock_insert_region(type, i, rbase, base - rbase, | 632 | memblock_insert_region(type, i, rbase, base - rbase, |
| 600 | memblock_get_region_node(rgn)); | 633 | memblock_get_region_node(rgn), |
| 634 | rgn->flags); | ||
| 601 | } else if (rend > end) { | 635 | } else if (rend > end) { |
| 602 | /* | 636 | /* |
| 603 | * @rgn intersects from above. Split and redo the | 637 | * @rgn intersects from above. Split and redo the |
| @@ -607,7 +641,8 @@ static int __init_memblock memblock_isolate_range(struct memblock_type *type, | |||
| 607 | rgn->size -= end - rbase; | 641 | rgn->size -= end - rbase; |
| 608 | type->total_size -= end - rbase; | 642 | type->total_size -= end - rbase; |
| 609 | memblock_insert_region(type, i--, rbase, end - rbase, | 643 | memblock_insert_region(type, i--, rbase, end - rbase, |
| 610 | memblock_get_region_node(rgn)); | 644 | memblock_get_region_node(rgn), |
| 645 | rgn->flags); | ||
| 611 | } else { | 646 | } else { |
| 612 | /* @rgn is fully contained, record it */ | 647 | /* @rgn is fully contained, record it */ |
| 613 | if (!*end_rgn) | 648 | if (!*end_rgn) |
| @@ -643,28 +678,89 @@ int __init_memblock memblock_free(phys_addr_t base, phys_addr_t size) | |||
| 643 | { | 678 | { |
| 644 | memblock_dbg(" memblock_free: [%#016llx-%#016llx] %pF\n", | 679 | memblock_dbg(" memblock_free: [%#016llx-%#016llx] %pF\n", |
| 645 | (unsigned long long)base, | 680 | (unsigned long long)base, |
| 646 | (unsigned long long)base + size, | 681 | (unsigned long long)base + size - 1, |
| 647 | (void *)_RET_IP_); | 682 | (void *)_RET_IP_); |
| 648 | 683 | ||
| 649 | return __memblock_remove(&memblock.reserved, base, size); | 684 | return __memblock_remove(&memblock.reserved, base, size); |
| 650 | } | 685 | } |
| 651 | 686 | ||
| 652 | int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size) | 687 | static int __init_memblock memblock_reserve_region(phys_addr_t base, |
| 688 | phys_addr_t size, | ||
| 689 | int nid, | ||
| 690 | unsigned long flags) | ||
| 653 | { | 691 | { |
| 654 | struct memblock_type *_rgn = &memblock.reserved; | 692 | struct memblock_type *_rgn = &memblock.reserved; |
| 655 | 693 | ||
| 656 | memblock_dbg("memblock_reserve: [%#016llx-%#016llx] %pF\n", | 694 | memblock_dbg("memblock_reserve: [%#016llx-%#016llx] flags %#02lx %pF\n", |
| 657 | (unsigned long long)base, | 695 | (unsigned long long)base, |
| 658 | (unsigned long long)base + size, | 696 | (unsigned long long)base + size - 1, |
| 659 | (void *)_RET_IP_); | 697 | flags, (void *)_RET_IP_); |
| 698 | |||
| 699 | return memblock_add_region(_rgn, base, size, nid, flags); | ||
| 700 | } | ||
| 701 | |||
| 702 | int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size) | ||
| 703 | { | ||
| 704 | return memblock_reserve_region(base, size, MAX_NUMNODES, 0); | ||
| 705 | } | ||
| 706 | |||
| 707 | /** | ||
| 708 | * memblock_mark_hotplug - Mark hotpluggable memory with flag MEMBLOCK_HOTPLUG. | ||
| 709 | * @base: the base phys addr of the region | ||
| 710 | * @size: the size of the region | ||
| 711 | * | ||
| 712 | * This function isolates region [@base, @base + @size), and marks it with flag | ||
| 713 | * MEMBLOCK_HOTPLUG. | ||
| 714 | * | ||
| 715 | * Return 0 on success, -errno on failure. | ||
| 716 | */ | ||
| 717 | int __init_memblock memblock_mark_hotplug(phys_addr_t base, phys_addr_t size) | ||
| 718 | { | ||
| 719 | struct memblock_type *type = &memblock.memory; | ||
| 720 | int i, ret, start_rgn, end_rgn; | ||
| 721 | |||
| 722 | ret = memblock_isolate_range(type, base, size, &start_rgn, &end_rgn); | ||
| 723 | if (ret) | ||
| 724 | return ret; | ||
| 725 | |||
| 726 | for (i = start_rgn; i < end_rgn; i++) | ||
| 727 | memblock_set_region_flags(&type->regions[i], MEMBLOCK_HOTPLUG); | ||
| 660 | 728 | ||
| 661 | return memblock_add_region(_rgn, base, size, MAX_NUMNODES); | 729 | memblock_merge_regions(type); |
| 730 | return 0; | ||
| 731 | } | ||
| 732 | |||
| 733 | /** | ||
| 734 | * memblock_clear_hotplug - Clear flag MEMBLOCK_HOTPLUG for a specified region. | ||
| 735 | * @base: the base phys addr of the region | ||
| 736 | * @size: the size of the region | ||
| 737 | * | ||
| 738 | * This function isolates region [@base, @base + @size), and clears flag | ||
| 739 | * MEMBLOCK_HOTPLUG for the isolated regions. | ||
| 740 | * | ||
| 741 | * Return 0 on success, -errno on failure. | ||
| 742 | */ | ||
| 743 | int __init_memblock memblock_clear_hotplug(phys_addr_t base, phys_addr_t size) | ||
| 744 | { | ||
| 745 | struct memblock_type *type = &memblock.memory; | ||
| 746 | int i, ret, start_rgn, end_rgn; | ||
| 747 | |||
| 748 | ret = memblock_isolate_range(type, base, size, &start_rgn, &end_rgn); | ||
| 749 | if (ret) | ||
| 750 | return ret; | ||
| 751 | |||
| 752 | for (i = start_rgn; i < end_rgn; i++) | ||
| 753 | memblock_clear_region_flags(&type->regions[i], | ||
| 754 | MEMBLOCK_HOTPLUG); | ||
| 755 | |||
| 756 | memblock_merge_regions(type); | ||
| 757 | return 0; | ||
| 662 | } | 758 | } |
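The two helpers above give early boot code (for example, firmware-table parsing when movable_node is in use) a way to tag ranges, which the free-range iterators later in this patch skip via memblock_is_hotpluggable() when movable_node_is_enabled(). A minimal, hypothetical sketch of how such a caller might use them; the function and its policy are illustrative, not the real SRAT code:

/* Hypothetical early-boot hook (illustration only). */
static int __init note_hotpluggable_range(phys_addr_t base, phys_addr_t size,
					  bool hotpluggable)
{
	int ret;

	ret = memblock_add(base, size);		/* make the range known */
	if (ret)
		return ret;

	/* Tag it so allocations for the kernel can steer clear of it... */
	if (hotpluggable)
		return memblock_mark_hotplug(base, size);

	/* ...or drop the tag again if the range turns out not to be. */
	return memblock_clear_hotplug(base, size);
}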
| 663 | 759 | ||
| 664 | /** | 760 | /** |
| 665 | * __next_free_mem_range - next function for for_each_free_mem_range() | 761 | * __next_free_mem_range - next function for for_each_free_mem_range() |
| 666 | * @idx: pointer to u64 loop variable | 762 | * @idx: pointer to u64 loop variable |
| 667 | * @nid: node selector, %MAX_NUMNODES for all nodes | 763 | * @nid: node selector, %NUMA_NO_NODE for all nodes |
| 668 | * @out_start: ptr to phys_addr_t for start address of the range, can be %NULL | 764 | * @out_start: ptr to phys_addr_t for start address of the range, can be %NULL |
| 669 | * @out_end: ptr to phys_addr_t for end address of the range, can be %NULL | 765 | * @out_end: ptr to phys_addr_t for end address of the range, can be %NULL |
| 670 | * @out_nid: ptr to int for nid of the range, can be %NULL | 766 | * @out_nid: ptr to int for nid of the range, can be %NULL |
| @@ -693,13 +789,16 @@ void __init_memblock __next_free_mem_range(u64 *idx, int nid, | |||
| 693 | int mi = *idx & 0xffffffff; | 789 | int mi = *idx & 0xffffffff; |
| 694 | int ri = *idx >> 32; | 790 | int ri = *idx >> 32; |
| 695 | 791 | ||
| 792 | if (WARN_ONCE(nid == MAX_NUMNODES, "Usage of MAX_NUMNODES is deprecated. Use NUMA_NO_NODE instead\n")) | ||
| 793 | nid = NUMA_NO_NODE; | ||
| 794 | |||
| 696 | for ( ; mi < mem->cnt; mi++) { | 795 | for ( ; mi < mem->cnt; mi++) { |
| 697 | struct memblock_region *m = &mem->regions[mi]; | 796 | struct memblock_region *m = &mem->regions[mi]; |
| 698 | phys_addr_t m_start = m->base; | 797 | phys_addr_t m_start = m->base; |
| 699 | phys_addr_t m_end = m->base + m->size; | 798 | phys_addr_t m_end = m->base + m->size; |
| 700 | 799 | ||
| 701 | /* only memory regions are associated with nodes, check it */ | 800 | /* only memory regions are associated with nodes, check it */ |
| 702 | if (nid != MAX_NUMNODES && nid != memblock_get_region_node(m)) | 801 | if (nid != NUMA_NO_NODE && nid != memblock_get_region_node(m)) |
| 703 | continue; | 802 | continue; |
| 704 | 803 | ||
| 705 | /* scan areas before each reservation for intersection */ | 804 | /* scan areas before each reservation for intersection */ |
| @@ -740,12 +839,17 @@ void __init_memblock __next_free_mem_range(u64 *idx, int nid, | |||
| 740 | /** | 839 | /** |
| 741 | * __next_free_mem_range_rev - next function for for_each_free_mem_range_reverse() | 840 | * __next_free_mem_range_rev - next function for for_each_free_mem_range_reverse() |
| 742 | * @idx: pointer to u64 loop variable | 841 | * @idx: pointer to u64 loop variable |
| 743 | * @nid: node selector, %MAX_NUMNODES for all nodes | 842 | * @nid: node selector, %NUMA_NO_NODE for all nodes |
| 744 | * @out_start: ptr to phys_addr_t for start address of the range, can be %NULL | 843 | * @out_start: ptr to phys_addr_t for start address of the range, can be %NULL |
| 745 | * @out_end: ptr to phys_addr_t for end address of the range, can be %NULL | 844 | * @out_end: ptr to phys_addr_t for end address of the range, can be %NULL |
| 746 | * @out_nid: ptr to int for nid of the range, can be %NULL | 845 | * @out_nid: ptr to int for nid of the range, can be %NULL |
| 747 | * | 846 | * |
| 748 | * Reverse of __next_free_mem_range(). | 847 | * Reverse of __next_free_mem_range(). |
| 848 | * | ||
| 849 | * Linux kernel cannot migrate pages used by itself. Memory hotplug users won't | ||
| 850 | * be able to hot-remove hotpluggable memory used by the kernel. So this | ||
| 851 | * function skips hotpluggable regions if needed when allocating memory for the | ||
| 852 | * kernel. | ||
| 749 | */ | 853 | */ |
| 750 | void __init_memblock __next_free_mem_range_rev(u64 *idx, int nid, | 854 | void __init_memblock __next_free_mem_range_rev(u64 *idx, int nid, |
| 751 | phys_addr_t *out_start, | 855 | phys_addr_t *out_start, |
| @@ -756,6 +860,9 @@ void __init_memblock __next_free_mem_range_rev(u64 *idx, int nid, | |||
| 756 | int mi = *idx & 0xffffffff; | 860 | int mi = *idx & 0xffffffff; |
| 757 | int ri = *idx >> 32; | 861 | int ri = *idx >> 32; |
| 758 | 862 | ||
| 863 | if (WARN_ONCE(nid == MAX_NUMNODES, "Usage of MAX_NUMNODES is deprecated. Use NUMA_NO_NODE instead\n")) | ||
| 864 | nid = NUMA_NO_NODE; | ||
| 865 | |||
| 759 | if (*idx == (u64)ULLONG_MAX) { | 866 | if (*idx == (u64)ULLONG_MAX) { |
| 760 | mi = mem->cnt - 1; | 867 | mi = mem->cnt - 1; |
| 761 | ri = rsv->cnt; | 868 | ri = rsv->cnt; |
| @@ -767,7 +874,11 @@ void __init_memblock __next_free_mem_range_rev(u64 *idx, int nid, | |||
| 767 | phys_addr_t m_end = m->base + m->size; | 874 | phys_addr_t m_end = m->base + m->size; |
| 768 | 875 | ||
| 769 | /* only memory regions are associated with nodes, check it */ | 876 | /* only memory regions are associated with nodes, check it */ |
| 770 | if (nid != MAX_NUMNODES && nid != memblock_get_region_node(m)) | 877 | if (nid != NUMA_NO_NODE && nid != memblock_get_region_node(m)) |
| 878 | continue; | ||
| 879 | |||
| 880 | /* skip hotpluggable memory regions if needed */ | ||
| 881 | if (movable_node_is_enabled() && memblock_is_hotpluggable(m)) | ||
| 771 | continue; | 882 | continue; |
| 772 | 883 | ||
| 773 | /* scan areas before each reservation for intersection */ | 884 | /* scan areas before each reservation for intersection */ |
| @@ -837,18 +948,18 @@ void __init_memblock __next_mem_pfn_range(int *idx, int nid, | |||
| 837 | * memblock_set_node - set node ID on memblock regions | 948 | * memblock_set_node - set node ID on memblock regions |
| 838 | * @base: base of area to set node ID for | 949 | * @base: base of area to set node ID for |
| 839 | * @size: size of area to set node ID for | 950 | * @size: size of area to set node ID for |
| 951 | * @type: memblock type to set node ID for | ||
| 840 | * @nid: node ID to set | 952 | * @nid: node ID to set |
| 841 | * | 953 | * |
| 842 | * Set the nid of memblock memory regions in [@base,@base+@size) to @nid. | 954 | * Set the nid of memblock @type regions in [@base,@base+@size) to @nid. |
| 843 | * Regions which cross the area boundaries are split as necessary. | 955 | * Regions which cross the area boundaries are split as necessary. |
| 844 | * | 956 | * |
| 845 | * RETURNS: | 957 | * RETURNS: |
| 846 | * 0 on success, -errno on failure. | 958 | * 0 on success, -errno on failure. |
| 847 | */ | 959 | */ |
| 848 | int __init_memblock memblock_set_node(phys_addr_t base, phys_addr_t size, | 960 | int __init_memblock memblock_set_node(phys_addr_t base, phys_addr_t size, |
| 849 | int nid) | 961 | struct memblock_type *type, int nid) |
| 850 | { | 962 | { |
| 851 | struct memblock_type *type = &memblock.memory; | ||
| 852 | int start_rgn, end_rgn; | 963 | int start_rgn, end_rgn; |
| 853 | int i, ret; | 964 | int i, ret; |
| 854 | 965 | ||
| @@ -870,13 +981,10 @@ static phys_addr_t __init memblock_alloc_base_nid(phys_addr_t size, | |||
| 870 | { | 981 | { |
| 871 | phys_addr_t found; | 982 | phys_addr_t found; |
| 872 | 983 | ||
| 873 | if (WARN_ON(!align)) | 984 | if (!align) |
| 874 | align = __alignof__(long long); | 985 | align = SMP_CACHE_BYTES; |
| 875 | 986 | ||
| 876 | /* align @size to avoid excessive fragmentation on reserved array */ | 987 | found = memblock_find_in_range_node(size, align, 0, max_addr, nid); |
| 877 | size = round_up(size, align); | ||
| 878 | |||
| 879 | found = memblock_find_in_range_node(0, max_addr, size, align, nid); | ||
| 880 | if (found && !memblock_reserve(found, size)) | 988 | if (found && !memblock_reserve(found, size)) |
| 881 | return found; | 989 | return found; |
| 882 | 990 | ||
| @@ -890,7 +998,7 @@ phys_addr_t __init memblock_alloc_nid(phys_addr_t size, phys_addr_t align, int n | |||
| 890 | 998 | ||
| 891 | phys_addr_t __init __memblock_alloc_base(phys_addr_t size, phys_addr_t align, phys_addr_t max_addr) | 999 | phys_addr_t __init __memblock_alloc_base(phys_addr_t size, phys_addr_t align, phys_addr_t max_addr) |
| 892 | { | 1000 | { |
| 893 | return memblock_alloc_base_nid(size, align, max_addr, MAX_NUMNODES); | 1001 | return memblock_alloc_base_nid(size, align, max_addr, NUMA_NO_NODE); |
| 894 | } | 1002 | } |
| 895 | 1003 | ||
| 896 | phys_addr_t __init memblock_alloc_base(phys_addr_t size, phys_addr_t align, phys_addr_t max_addr) | 1004 | phys_addr_t __init memblock_alloc_base(phys_addr_t size, phys_addr_t align, phys_addr_t max_addr) |
| @@ -920,6 +1028,207 @@ phys_addr_t __init memblock_alloc_try_nid(phys_addr_t size, phys_addr_t align, i | |||
| 920 | return memblock_alloc_base(size, align, MEMBLOCK_ALLOC_ACCESSIBLE); | 1028 | return memblock_alloc_base(size, align, MEMBLOCK_ALLOC_ACCESSIBLE); |
| 921 | } | 1029 | } |
| 922 | 1030 | ||
| 1031 | /** | ||
| 1032 | * memblock_virt_alloc_internal - allocate boot memory block | ||
| 1033 | * @size: size of memory block to be allocated in bytes | ||
| 1034 | * @align: alignment of the region and block's size | ||
| 1035 | * @min_addr: the lower bound of the memory region to allocate (phys address) | ||
| 1036 | * @max_addr: the upper bound of the memory region to allocate (phys address) | ||
| 1037 | * @nid: nid of the free area to find, %NUMA_NO_NODE for any node | ||
| 1038 | * | ||
| 1039 | * The @min_addr limit is dropped if it can not be satisfied and the allocation | ||
| 1040 | * will fall back to memory below @min_addr. Also, allocation may fall back | ||
| 1041 | * to any node in the system if the specified node can not | ||
| 1042 | * hold the requested memory. | ||
| 1043 | * | ||
| 1044 | * The allocation is performed from memory region limited by | ||
| 1045 | * memblock.current_limit if @max_addr == %BOOTMEM_ALLOC_ACCESSIBLE. | ||
| 1046 | * | ||
| 1047 | * The memory block is aligned on SMP_CACHE_BYTES if @align == 0. | ||
| 1048 | * | ||
| 1049 | * The phys address of allocated boot memory block is converted to virtual and | ||
| 1050 | * allocated memory is reset to 0. | ||
| 1051 | * | ||
| 1052 | * In addition, this function sets the min_count to 0 using kmemleak_alloc for | ||
| 1053 | * the allocated boot memory block, so that it is never reported as a leak. | ||
| 1054 | * | ||
| 1055 | * RETURNS: | ||
| 1056 | * Virtual address of allocated memory block on success, NULL on failure. | ||
| 1057 | */ | ||
| 1058 | static void * __init memblock_virt_alloc_internal( | ||
| 1059 | phys_addr_t size, phys_addr_t align, | ||
| 1060 | phys_addr_t min_addr, phys_addr_t max_addr, | ||
| 1061 | int nid) | ||
| 1062 | { | ||
| 1063 | phys_addr_t alloc; | ||
| 1064 | void *ptr; | ||
| 1065 | |||
| 1066 | if (WARN_ONCE(nid == MAX_NUMNODES, "Usage of MAX_NUMNODES is deprecated. Use NUMA_NO_NODE instead\n")) | ||
| 1067 | nid = NUMA_NO_NODE; | ||
| 1068 | |||
| 1069 | /* | ||
| 1070 | * Detect any accidental use of these APIs after slab is ready, as at | ||
| 1071 | * this moment memblock may be deinitialized already and its | ||
| 1072 | * internal data may be destroyed (after execution of free_all_bootmem) | ||
| 1073 | */ | ||
| 1074 | if (WARN_ON_ONCE(slab_is_available())) | ||
| 1075 | return kzalloc_node(size, GFP_NOWAIT, nid); | ||
| 1076 | |||
| 1077 | if (!align) | ||
| 1078 | align = SMP_CACHE_BYTES; | ||
| 1079 | |||
| 1080 | if (max_addr > memblock.current_limit) | ||
| 1081 | max_addr = memblock.current_limit; | ||
| 1082 | |||
| 1083 | again: | ||
| 1084 | alloc = memblock_find_in_range_node(size, align, min_addr, max_addr, | ||
| 1085 | nid); | ||
| 1086 | if (alloc) | ||
| 1087 | goto done; | ||
| 1088 | |||
| 1089 | if (nid != NUMA_NO_NODE) { | ||
| 1090 | alloc = memblock_find_in_range_node(size, align, min_addr, | ||
| 1091 | max_addr, NUMA_NO_NODE); | ||
| 1092 | if (alloc) | ||
| 1093 | goto done; | ||
| 1094 | } | ||
| 1095 | |||
| 1096 | if (min_addr) { | ||
| 1097 | min_addr = 0; | ||
| 1098 | goto again; | ||
| 1099 | } else { | ||
| 1100 | goto error; | ||
| 1101 | } | ||
| 1102 | |||
| 1103 | done: | ||
| 1104 | memblock_reserve(alloc, size); | ||
| 1105 | ptr = phys_to_virt(alloc); | ||
| 1106 | memset(ptr, 0, size); | ||
| 1107 | |||
| 1108 | /* | ||
| 1109 | * The min_count is set to 0 so that bootmem allocated blocks | ||
| 1110 | * are never reported as leaks. This is because many of these blocks | ||
| 1111 | * are only referred via the physical address which is not | ||
| 1112 | * looked up by kmemleak. | ||
| 1113 | */ | ||
| 1114 | kmemleak_alloc(ptr, size, 0, 0); | ||
| 1115 | |||
| 1116 | return ptr; | ||
| 1117 | |||
| 1118 | error: | ||
| 1119 | return NULL; | ||
| 1120 | } | ||
| 1121 | |||
| 1122 | /** | ||
| 1123 | * memblock_virt_alloc_try_nid_nopanic - allocate boot memory block | ||
| 1124 | * @size: size of memory block to be allocated in bytes | ||
| 1125 | * @align: alignment of the region and block's size | ||
| 1126 | * @min_addr: the lower bound of the memory region from where the allocation | ||
| 1127 | * is preferred (phys address) | ||
| 1128 | * @max_addr: the upper bound of the memory region from where the allocation | ||
| 1129 | * is preferred (phys address), or %BOOTMEM_ALLOC_ACCESSIBLE to | ||
| 1130 | * allocate only from memory limited by memblock.current_limit value | ||
| 1131 | * @nid: nid of the free area to find, %NUMA_NO_NODE for any node | ||
| 1132 | * | ||
| 1133 | * Public version of memblock_virt_alloc_internal() which provides | ||
| 1134 | * additional debug information (including caller info), if enabled. | ||
| 1135 | * | ||
| 1136 | * RETURNS: | ||
| 1137 | * Virtual address of allocated memory block on success, NULL on failure. | ||
| 1138 | */ | ||
| 1139 | void * __init memblock_virt_alloc_try_nid_nopanic( | ||
| 1140 | phys_addr_t size, phys_addr_t align, | ||
| 1141 | phys_addr_t min_addr, phys_addr_t max_addr, | ||
| 1142 | int nid) | ||
| 1143 | { | ||
| 1144 | memblock_dbg("%s: %llu bytes align=0x%llx nid=%d from=0x%llx max_addr=0x%llx %pF\n", | ||
| 1145 | __func__, (u64)size, (u64)align, nid, (u64)min_addr, | ||
| 1146 | (u64)max_addr, (void *)_RET_IP_); | ||
| 1147 | return memblock_virt_alloc_internal(size, align, min_addr, | ||
| 1148 | max_addr, nid); | ||
| 1149 | } | ||
| 1150 | |||
| 1151 | /** | ||
| 1152 | * memblock_virt_alloc_try_nid - allocate boot memory block with panicking | ||
| 1153 | * @size: size of memory block to be allocated in bytes | ||
| 1154 | * @align: alignment of the region and block's size | ||
| 1155 | * @min_addr: the lower bound of the memory region from where the allocation | ||
| 1156 | * is preferred (phys address) | ||
| 1157 | * @max_addr: the upper bound of the memory region from where the allocation | ||
| 1158 | * is preferred (phys address), or %BOOTMEM_ALLOC_ACCESSIBLE to | ||
| 1159 | * allocate only from memory limited by memblock.current_limit value | ||
| 1160 | * @nid: nid of the free area to find, %NUMA_NO_NODE for any node | ||
| 1161 | * | ||
| 1162 | * Public panicking version of memblock_virt_alloc_try_nid_nopanic() | ||
| 1163 | * which provides debug information (including caller info), if enabled, | ||
| 1164 | * and panics if the request can not be satisfied. | ||
| 1165 | * | ||
| 1166 | * RETURNS: | ||
| 1167 | * Virtual address of allocated memory block on success, NULL on failure. | ||
| 1168 | */ | ||
| 1169 | void * __init memblock_virt_alloc_try_nid( | ||
| 1170 | phys_addr_t size, phys_addr_t align, | ||
| 1171 | phys_addr_t min_addr, phys_addr_t max_addr, | ||
| 1172 | int nid) | ||
| 1173 | { | ||
| 1174 | void *ptr; | ||
| 1175 | |||
| 1176 | memblock_dbg("%s: %llu bytes align=0x%llx nid=%d from=0x%llx max_addr=0x%llx %pF\n", | ||
| 1177 | __func__, (u64)size, (u64)align, nid, (u64)min_addr, | ||
| 1178 | (u64)max_addr, (void *)_RET_IP_); | ||
| 1179 | ptr = memblock_virt_alloc_internal(size, align, | ||
| 1180 | min_addr, max_addr, nid); | ||
| 1181 | if (ptr) | ||
| 1182 | return ptr; | ||
| 1183 | |||
| 1184 | panic("%s: Failed to allocate %llu bytes align=0x%llx nid=%d from=0x%llx max_addr=0x%llx\n", | ||
| 1185 | __func__, (u64)size, (u64)align, nid, (u64)min_addr, | ||
| 1186 | (u64)max_addr); | ||
| 1187 | return NULL; | ||
| 1188 | } | ||
| 1189 | |||
| 1190 | /** | ||
| 1191 | * __memblock_free_early - free boot memory block | ||
| 1192 | * @base: phys starting address of the boot memory block | ||
| 1193 | * @size: size of the boot memory block in bytes | ||
| 1194 | * | ||
| 1195 | * Free boot memory block previously allocated by memblock_virt_alloc_xx() API. | ||
| 1196 | * The freed memory will not be released to the buddy allocator. | ||
| 1197 | */ | ||
| 1198 | void __init __memblock_free_early(phys_addr_t base, phys_addr_t size) | ||
| 1199 | { | ||
| 1200 | memblock_dbg("%s: [%#016llx-%#016llx] %pF\n", | ||
| 1201 | __func__, (u64)base, (u64)base + size - 1, | ||
| 1202 | (void *)_RET_IP_); | ||
| 1203 | kmemleak_free_part(__va(base), size); | ||
| 1204 | __memblock_remove(&memblock.reserved, base, size); | ||
| 1205 | } | ||
| 1206 | |||
| 1207 | /* | ||
| 1208 | * __memblock_free_late - free bootmem block pages directly to buddy allocator | ||
| 1209 | * @addr: phys starting address of the boot memory block | ||
| 1210 | * @size: size of the boot memory block in bytes | ||
| 1211 | * | ||
| 1212 | * This is only useful when the bootmem allocator has already been torn | ||
| 1213 | * down, but we are still initializing the system. Pages are released directly | ||
| 1214 | * to the buddy allocator, no bootmem metadata is updated because it is gone. | ||
| 1215 | */ | ||
| 1216 | void __init __memblock_free_late(phys_addr_t base, phys_addr_t size) | ||
| 1217 | { | ||
| 1218 | u64 cursor, end; | ||
| 1219 | |||
| 1220 | memblock_dbg("%s: [%#016llx-%#016llx] %pF\n", | ||
| 1221 | __func__, (u64)base, (u64)base + size - 1, | ||
| 1222 | (void *)_RET_IP_); | ||
| 1223 | kmemleak_free_part(__va(base), size); | ||
| 1224 | cursor = PFN_UP(base); | ||
| 1225 | end = PFN_DOWN(base + size); | ||
| 1226 | |||
| 1227 | for (; cursor < end; cursor++) { | ||
| 1228 | __free_pages_bootmem(pfn_to_page(cursor), 0); | ||
| 1229 | totalram_pages++; | ||
| 1230 | } | ||
| 1231 | } | ||
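The memblock_virt_alloc_* family added above is the boot-time allocation path that returns zeroed virtual addresses, falls back across nodes and below @min_addr as described in its kernel-doc, and pairs with __memblock_free_early()/__memblock_free_late() for freeing. A hedged usage sketch under those documented semantics; the wrapper name and its purpose are hypothetical:

/* Illustration only: allocate a zeroed per-node table during early boot. */
static void * __init alloc_example_table(int nid, size_t bytes)
{
	/*
	 * align == 0 falls back to SMP_CACHE_BYTES; min_addr == 0 sets no
	 * lower bound; BOOTMEM_ALLOC_ACCESSIBLE caps the search at
	 * memblock.current_limit.  The _nopanic variant returns NULL on
	 * failure instead of panicking.
	 */
	return memblock_virt_alloc_try_nid_nopanic(bytes, 0, 0,
						   BOOTMEM_ALLOC_ACCESSIBLE,
						   nid);
}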
| 923 | 1232 | ||
| 924 | /* | 1233 | /* |
| 925 | * Remaining API functions | 1234 | * Remaining API functions |
| @@ -1101,6 +1410,7 @@ void __init_memblock memblock_set_current_limit(phys_addr_t limit) | |||
| 1101 | static void __init_memblock memblock_dump(struct memblock_type *type, char *name) | 1410 | static void __init_memblock memblock_dump(struct memblock_type *type, char *name) |
| 1102 | { | 1411 | { |
| 1103 | unsigned long long base, size; | 1412 | unsigned long long base, size; |
| 1413 | unsigned long flags; | ||
| 1104 | int i; | 1414 | int i; |
| 1105 | 1415 | ||
| 1106 | pr_info(" %s.cnt = 0x%lx\n", name, type->cnt); | 1416 | pr_info(" %s.cnt = 0x%lx\n", name, type->cnt); |
| @@ -1111,13 +1421,14 @@ static void __init_memblock memblock_dump(struct memblock_type *type, char *name | |||
| 1111 | 1421 | ||
| 1112 | base = rgn->base; | 1422 | base = rgn->base; |
| 1113 | size = rgn->size; | 1423 | size = rgn->size; |
| 1424 | flags = rgn->flags; | ||
| 1114 | #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP | 1425 | #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP |
| 1115 | if (memblock_get_region_node(rgn) != MAX_NUMNODES) | 1426 | if (memblock_get_region_node(rgn) != MAX_NUMNODES) |
| 1116 | snprintf(nid_buf, sizeof(nid_buf), " on node %d", | 1427 | snprintf(nid_buf, sizeof(nid_buf), " on node %d", |
| 1117 | memblock_get_region_node(rgn)); | 1428 | memblock_get_region_node(rgn)); |
| 1118 | #endif | 1429 | #endif |
| 1119 | pr_info(" %s[%#x]\t[%#016llx-%#016llx], %#llx bytes%s\n", | 1430 | pr_info(" %s[%#x]\t[%#016llx-%#016llx], %#llx bytes%s flags: %#lx\n", |
| 1120 | name, i, base, base + size - 1, size, nid_buf); | 1431 | name, i, base, base + size - 1, size, nid_buf, flags); |
| 1121 | } | 1432 | } |
| 1122 | } | 1433 | } |
| 1123 | 1434 | ||
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 7f1a356153c0..5b6b0039f725 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c | |||
| @@ -45,16 +45,17 @@ | |||
| 45 | #include <linux/swapops.h> | 45 | #include <linux/swapops.h> |
| 46 | #include <linux/spinlock.h> | 46 | #include <linux/spinlock.h> |
| 47 | #include <linux/eventfd.h> | 47 | #include <linux/eventfd.h> |
| 48 | #include <linux/poll.h> | ||
| 48 | #include <linux/sort.h> | 49 | #include <linux/sort.h> |
| 49 | #include <linux/fs.h> | 50 | #include <linux/fs.h> |
| 50 | #include <linux/seq_file.h> | 51 | #include <linux/seq_file.h> |
| 51 | #include <linux/vmalloc.h> | ||
| 52 | #include <linux/vmpressure.h> | 52 | #include <linux/vmpressure.h> |
| 53 | #include <linux/mm_inline.h> | 53 | #include <linux/mm_inline.h> |
| 54 | #include <linux/page_cgroup.h> | 54 | #include <linux/page_cgroup.h> |
| 55 | #include <linux/cpu.h> | 55 | #include <linux/cpu.h> |
| 56 | #include <linux/oom.h> | 56 | #include <linux/oom.h> |
| 57 | #include <linux/lockdep.h> | 57 | #include <linux/lockdep.h> |
| 58 | #include <linux/file.h> | ||
| 58 | #include "internal.h" | 59 | #include "internal.h" |
| 59 | #include <net/sock.h> | 60 | #include <net/sock.h> |
| 60 | #include <net/ip.h> | 61 | #include <net/ip.h> |
| @@ -148,7 +149,7 @@ struct mem_cgroup_reclaim_iter { | |||
| 148 | * matches memcg->dead_count of the hierarchy root group. | 149 | * matches memcg->dead_count of the hierarchy root group. |
| 149 | */ | 150 | */ |
| 150 | struct mem_cgroup *last_visited; | 151 | struct mem_cgroup *last_visited; |
| 151 | unsigned long last_dead_count; | 152 | int last_dead_count; |
| 152 | 153 | ||
| 153 | /* scan generation, increased every round-trip */ | 154 | /* scan generation, increased every round-trip */ |
| 154 | unsigned int generation; | 155 | unsigned int generation; |
| @@ -227,6 +228,46 @@ struct mem_cgroup_eventfd_list { | |||
| 227 | struct eventfd_ctx *eventfd; | 228 | struct eventfd_ctx *eventfd; |
| 228 | }; | 229 | }; |
| 229 | 230 | ||
| 231 | /* | ||
| 232 | * cgroup_event represents events which userspace wants to receive. | ||
| 233 | */ | ||
| 234 | struct mem_cgroup_event { | ||
| 235 | /* | ||
| 236 | * memcg which the event belongs to. | ||
| 237 | */ | ||
| 238 | struct mem_cgroup *memcg; | ||
| 239 | /* | ||
| 240 | * eventfd to signal userspace about the event. | ||
| 241 | */ | ||
| 242 | struct eventfd_ctx *eventfd; | ||
| 243 | /* | ||
| 245 | * Each of these is stored in a list by the cgroup. | ||
| 245 | */ | ||
| 246 | struct list_head list; | ||
| 247 | /* | ||
| 248 | * register_event() callback will be used to add a new userspace | ||
| 249 | * waiter for changes related to this event. Use eventfd_signal() | ||
| 250 | * on eventfd to send notification to userspace. | ||
| 251 | */ | ||
| 252 | int (*register_event)(struct mem_cgroup *memcg, | ||
| 253 | struct eventfd_ctx *eventfd, const char *args); | ||
| 254 | /* | ||
| 255 | * unregister_event() callback will be called when userspace closes | ||
| 256 | * the eventfd or on cgroup removal. This callback must be set | ||
| 257 | * if you want to provide notification functionality. | ||
| 258 | */ | ||
| 259 | void (*unregister_event)(struct mem_cgroup *memcg, | ||
| 260 | struct eventfd_ctx *eventfd); | ||
| 261 | /* | ||
| 262 | * All fields below needed to unregister event when | ||
| 263 | * userspace closes eventfd. | ||
| 264 | */ | ||
| 265 | poll_table pt; | ||
| 266 | wait_queue_head_t *wqh; | ||
| 267 | wait_queue_t wait; | ||
| 268 | struct work_struct remove; | ||
| 269 | }; | ||
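struct mem_cgroup_event above is the kernel-side bookkeeping for the memcg eventfd interface; the poll_table/wait_queue members let the cgroup tear the event down when userspace closes the eventfd. The userspace side of registration is unchanged by this patch and, per Documentation/cgroups/memory.txt, looks roughly like the sketch below; the cgroup path and the 4 MiB threshold are illustrative, and error handling is omitted:

/* Illustration: arm a memory.usage_in_bytes threshold event. */
#include <sys/eventfd.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <unistd.h>

int watch_usage_threshold(const char *memcg)	/* e.g. "/sys/fs/cgroup/memory/foo" */
{
	char buf[64], path[256];
	uint64_t ticks;
	int efd, tfd, cfd;

	efd = eventfd(0, 0);
	snprintf(path, sizeof(path), "%s/memory.usage_in_bytes", memcg);
	tfd = open(path, O_RDONLY);
	snprintf(path, sizeof(path), "%s/cgroup.event_control", memcg);
	cfd = open(path, O_WRONLY);

	/* "<event_fd> <target_fd> <threshold>" arms the event */
	snprintf(buf, sizeof(buf), "%d %d %llu", efd, tfd, 4ULL << 20);
	write(cfd, buf, strlen(buf));

	read(efd, &ticks, sizeof(ticks));	/* blocks until the threshold is crossed */
	return 0;
}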
| 270 | |||
| 230 | static void mem_cgroup_threshold(struct mem_cgroup *memcg); | 271 | static void mem_cgroup_threshold(struct mem_cgroup *memcg); |
| 231 | static void mem_cgroup_oom_notify(struct mem_cgroup *memcg); | 272 | static void mem_cgroup_oom_notify(struct mem_cgroup *memcg); |
| 232 | 273 | ||
| @@ -331,27 +372,20 @@ struct mem_cgroup { | |||
| 331 | atomic_t numainfo_updating; | 372 | atomic_t numainfo_updating; |
| 332 | #endif | 373 | #endif |
| 333 | 374 | ||
| 375 | /* List of events which userspace wants to receive */ | ||
| 376 | struct list_head event_list; | ||
| 377 | spinlock_t event_list_lock; | ||
| 378 | |||
| 334 | struct mem_cgroup_per_node *nodeinfo[0]; | 379 | struct mem_cgroup_per_node *nodeinfo[0]; |
| 335 | /* WARNING: nodeinfo must be the last member here */ | 380 | /* WARNING: nodeinfo must be the last member here */ |
| 336 | }; | 381 | }; |
| 337 | 382 | ||
| 338 | static size_t memcg_size(void) | ||
| 339 | { | ||
| 340 | return sizeof(struct mem_cgroup) + | ||
| 341 | nr_node_ids * sizeof(struct mem_cgroup_per_node *); | ||
| 342 | } | ||
| 343 | |||
| 344 | /* internal only representation about the status of kmem accounting. */ | 383 | /* internal only representation about the status of kmem accounting. */ |
| 345 | enum { | 384 | enum { |
| 346 | KMEM_ACCOUNTED_ACTIVE = 0, /* accounted by this cgroup itself */ | 385 | KMEM_ACCOUNTED_ACTIVE, /* accounted by this cgroup itself */ |
| 347 | KMEM_ACCOUNTED_ACTIVATED, /* static key enabled. */ | ||
| 348 | KMEM_ACCOUNTED_DEAD, /* dead memcg with pending kmem charges */ | 386 | KMEM_ACCOUNTED_DEAD, /* dead memcg with pending kmem charges */ |
| 349 | }; | 387 | }; |
| 350 | 388 | ||
| 351 | /* We account when limit is on, but only after call sites are patched */ | ||
| 352 | #define KMEM_ACCOUNTED_MASK \ | ||
| 353 | ((1 << KMEM_ACCOUNTED_ACTIVE) | (1 << KMEM_ACCOUNTED_ACTIVATED)) | ||
| 354 | |||
| 355 | #ifdef CONFIG_MEMCG_KMEM | 389 | #ifdef CONFIG_MEMCG_KMEM |
| 356 | static inline void memcg_kmem_set_active(struct mem_cgroup *memcg) | 390 | static inline void memcg_kmem_set_active(struct mem_cgroup *memcg) |
| 357 | { | 391 | { |
| @@ -363,16 +397,6 @@ static bool memcg_kmem_is_active(struct mem_cgroup *memcg) | |||
| 363 | return test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags); | 397 | return test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags); |
| 364 | } | 398 | } |
| 365 | 399 | ||
| 366 | static void memcg_kmem_set_activated(struct mem_cgroup *memcg) | ||
| 367 | { | ||
| 368 | set_bit(KMEM_ACCOUNTED_ACTIVATED, &memcg->kmem_account_flags); | ||
| 369 | } | ||
| 370 | |||
| 371 | static void memcg_kmem_clear_activated(struct mem_cgroup *memcg) | ||
| 372 | { | ||
| 373 | clear_bit(KMEM_ACCOUNTED_ACTIVATED, &memcg->kmem_account_flags); | ||
| 374 | } | ||
| 375 | |||
| 376 | static void memcg_kmem_mark_dead(struct mem_cgroup *memcg) | 400 | static void memcg_kmem_mark_dead(struct mem_cgroup *memcg) |
| 377 | { | 401 | { |
| 378 | /* | 402 | /* |
| @@ -490,11 +514,6 @@ struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr) | |||
| 490 | return &container_of(vmpr, struct mem_cgroup, vmpressure)->css; | 514 | return &container_of(vmpr, struct mem_cgroup, vmpressure)->css; |
| 491 | } | 515 | } |
| 492 | 516 | ||
| 493 | struct vmpressure *css_to_vmpressure(struct cgroup_subsys_state *css) | ||
| 494 | { | ||
| 495 | return &mem_cgroup_from_css(css)->vmpressure; | ||
| 496 | } | ||
| 497 | |||
| 498 | static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg) | 517 | static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg) |
| 499 | { | 518 | { |
| 500 | return (memcg == root_mem_cgroup); | 519 | return (memcg == root_mem_cgroup); |
| @@ -1098,16 +1117,22 @@ skip_node: | |||
| 1098 | * skipped and we should continue the tree walk. | 1117 | * skipped and we should continue the tree walk. |
| 1099 | * last_visited css is safe to use because it is | 1118 | * last_visited css is safe to use because it is |
| 1100 | * protected by css_get and the tree walk is rcu safe. | 1119 | * protected by css_get and the tree walk is rcu safe. |
| 1120 | * | ||
| 1121 | * We do not take a reference on the root of the tree walk | ||
| 1122 | * because we might race with the root removal when it would | ||
| 1123 | * be the only node in the iterated hierarchy and mem_cgroup_iter | ||
| 1124 | * would end up in an endless loop because it expects that at | ||
| 1125 | * least one valid node will be returned. Root cannot disappear | ||
| 1126 | * because caller of the iterator should hold it already so | ||
| 1127 | * skipping css reference should be safe. | ||
| 1101 | */ | 1128 | */ |
| 1102 | if (next_css) { | 1129 | if (next_css) { |
| 1103 | struct mem_cgroup *mem = mem_cgroup_from_css(next_css); | 1130 | if ((next_css == &root->css) || |
| 1131 | ((next_css->flags & CSS_ONLINE) && css_tryget(next_css))) | ||
| 1132 | return mem_cgroup_from_css(next_css); | ||
| 1104 | 1133 | ||
| 1105 | if (css_tryget(&mem->css)) | 1134 | prev_css = next_css; |
| 1106 | return mem; | 1135 | goto skip_node; |
| 1107 | else { | ||
| 1108 | prev_css = next_css; | ||
| 1109 | goto skip_node; | ||
| 1110 | } | ||
| 1111 | } | 1136 | } |
| 1112 | 1137 | ||
| 1113 | return NULL; | 1138 | return NULL; |
| @@ -1141,7 +1166,15 @@ mem_cgroup_iter_load(struct mem_cgroup_reclaim_iter *iter, | |||
| 1141 | if (iter->last_dead_count == *sequence) { | 1166 | if (iter->last_dead_count == *sequence) { |
| 1142 | smp_rmb(); | 1167 | smp_rmb(); |
| 1143 | position = iter->last_visited; | 1168 | position = iter->last_visited; |
| 1144 | if (position && !css_tryget(&position->css)) | 1169 | |
| 1170 | /* | ||
| 1171 | * We cannot take a reference to root because we might race | ||
| 1172 | * with root removal and returning NULL would end up in | ||
| 1173 | * an endless loop on the iterator user level when root | ||
| 1174 | * would be returned all the time. | ||
| 1175 | */ | ||
| 1176 | if (position && position != root && | ||
| 1177 | !css_tryget(&position->css)) | ||
| 1145 | position = NULL; | 1178 | position = NULL; |
| 1146 | } | 1179 | } |
| 1147 | return position; | 1180 | return position; |
| @@ -1150,9 +1183,11 @@ mem_cgroup_iter_load(struct mem_cgroup_reclaim_iter *iter, | |||
| 1150 | static void mem_cgroup_iter_update(struct mem_cgroup_reclaim_iter *iter, | 1183 | static void mem_cgroup_iter_update(struct mem_cgroup_reclaim_iter *iter, |
| 1151 | struct mem_cgroup *last_visited, | 1184 | struct mem_cgroup *last_visited, |
| 1152 | struct mem_cgroup *new_position, | 1185 | struct mem_cgroup *new_position, |
| 1186 | struct mem_cgroup *root, | ||
| 1153 | int sequence) | 1187 | int sequence) |
| 1154 | { | 1188 | { |
| 1155 | if (last_visited) | 1189 | /* root reference counting symmetric to mem_cgroup_iter_load */ |
| 1190 | if (last_visited && last_visited != root) | ||
| 1156 | css_put(&last_visited->css); | 1191 | css_put(&last_visited->css); |
| 1157 | /* | 1192 | /* |
| 1158 | * We store the sequence count from the time @last_visited was | 1193 | * We store the sequence count from the time @last_visited was |
| @@ -1227,7 +1262,8 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, | |||
| 1227 | memcg = __mem_cgroup_iter_next(root, last_visited); | 1262 | memcg = __mem_cgroup_iter_next(root, last_visited); |
| 1228 | 1263 | ||
| 1229 | if (reclaim) { | 1264 | if (reclaim) { |
| 1230 | mem_cgroup_iter_update(iter, last_visited, memcg, seq); | 1265 | mem_cgroup_iter_update(iter, last_visited, memcg, root, |
| 1266 | seq); | ||
| 1231 | 1267 | ||
| 1232 | if (!memcg) | 1268 | if (!memcg) |
| 1233 | iter->generation++; | 1269 | iter->generation++; |
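The iterator hunks above all follow one asymmetric rule: the root of the walk is pinned by the caller, so the iterator never takes or drops a css reference on it, while any other position must be pinned with css_tryget() before it may be cached or returned, and a failed tryget means the group is going away and must be skipped. A hypothetical pair of helpers condensing that rule (the names are illustrative, not part of the patch):

/* Pin @pos for use as a cached iterator position; @root is pinned by the caller. */
static struct mem_cgroup *memcg_iter_pin(struct mem_cgroup *pos,
					 struct mem_cgroup *root)
{
	if (!pos || pos == root)
		return pos;			/* no extra reference for the root */
	return css_tryget(&pos->css) ? pos : NULL;	/* NULL: group is being removed */
}

/* Drop the reference taken by memcg_iter_pin(), again skipping the root. */
static void memcg_iter_unpin(struct mem_cgroup *pos, struct mem_cgroup *root)
{
	if (pos && pos != root)
		css_put(&pos->css);
}

Returning NULL from the tryget path is what lets mem_cgroup_iter_load() restart the walk instead of looping forever on a dying last_visited group.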
| @@ -1647,13 +1683,13 @@ static void move_unlock_mem_cgroup(struct mem_cgroup *memcg, | |||
| 1647 | */ | 1683 | */ |
| 1648 | void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) | 1684 | void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) |
| 1649 | { | 1685 | { |
| 1650 | struct cgroup *task_cgrp; | ||
| 1651 | struct cgroup *mem_cgrp; | ||
| 1652 | /* | 1686 | /* |
| 1653 | * Need a buffer in BSS, can't rely on allocations. The code relies | 1687 | * protects memcg_name and makes sure that parallel ooms do not |
| 1654 | * on the assumption that OOM is serialized for memory controller. | 1688 | * interleave |
| 1655 | * If this assumption is broken, revisit this code. | ||
| 1656 | */ | 1689 | */ |
| 1690 | static DEFINE_MUTEX(oom_info_lock); | ||
| 1691 | struct cgroup *task_cgrp; | ||
| 1692 | struct cgroup *mem_cgrp; | ||
| 1657 | static char memcg_name[PATH_MAX]; | 1693 | static char memcg_name[PATH_MAX]; |
| 1658 | int ret; | 1694 | int ret; |
| 1659 | struct mem_cgroup *iter; | 1695 | struct mem_cgroup *iter; |
| @@ -1662,6 +1698,7 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) | |||
| 1662 | if (!p) | 1698 | if (!p) |
| 1663 | return; | 1699 | return; |
| 1664 | 1700 | ||
| 1701 | mutex_lock(&oom_info_lock); | ||
| 1665 | rcu_read_lock(); | 1702 | rcu_read_lock(); |
| 1666 | 1703 | ||
| 1667 | mem_cgrp = memcg->css.cgroup; | 1704 | mem_cgrp = memcg->css.cgroup; |
| @@ -1730,6 +1767,7 @@ done: | |||
| 1730 | 1767 | ||
| 1731 | pr_cont("\n"); | 1768 | pr_cont("\n"); |
| 1732 | } | 1769 | } |
| 1770 | mutex_unlock(&oom_info_lock); | ||
| 1733 | } | 1771 | } |
| 1734 | 1772 | ||
| 1735 | /* | 1773 | /* |
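The hunk above replaces the old "OOM is serialized" assumption with an explicit lock: memcg_name is a single buffer in BSS, so two concurrent reporters would interleave their output unless they exclude each other. Reduced to its essentials, the pattern looks like the sketch below (illustrative only, not the full function):

static void print_memcg_path(struct cgroup *cgrp)
{
	static DEFINE_MUTEX(info_lock);	/* serializes users of the shared buffer */
	static char name[PATH_MAX];	/* one static scratch buffer, not per caller */
	int ret;

	mutex_lock(&info_lock);
	ret = cgroup_path(cgrp, name, PATH_MAX);
	if (ret < 0)
		pr_cont("(path too long)");
	else
		pr_cont("%s", name);
	mutex_unlock(&info_lock);
}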
| @@ -1822,13 +1860,18 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, | |||
| 1822 | break; | 1860 | break; |
| 1823 | }; | 1861 | }; |
| 1824 | points = oom_badness(task, memcg, NULL, totalpages); | 1862 | points = oom_badness(task, memcg, NULL, totalpages); |
| 1825 | if (points > chosen_points) { | 1863 | if (!points || points < chosen_points) |
| 1826 | if (chosen) | 1864 | continue; |
| 1827 | put_task_struct(chosen); | 1865 | /* Prefer thread group leaders for display purposes */ |
| 1828 | chosen = task; | 1866 | if (points == chosen_points && |
| 1829 | chosen_points = points; | 1867 | thread_group_leader(chosen)) |
| 1830 | get_task_struct(chosen); | 1868 | continue; |
| 1831 | } | 1869 | |
| 1870 | if (chosen) | ||
| 1871 | put_task_struct(chosen); | ||
| 1872 | chosen = task; | ||
| 1873 | chosen_points = points; | ||
| 1874 | get_task_struct(chosen); | ||
| 1832 | } | 1875 | } |
| 1833 | css_task_iter_end(&it); | 1876 | css_task_iter_end(&it); |
| 1834 | } | 1877 | } |
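The rewritten selection loop above keeps the task with the highest badness score and, when scores tie, prefers to keep an already chosen thread group leader as the reported victim. The acceptance test reads more easily as a predicate (a hypothetical helper equivalent to the inline checks):

/* true if a task scoring @points should replace the currently chosen victim */
static bool should_replace_victim(unsigned long points,
				  unsigned long chosen_points,
				  struct task_struct *chosen)
{
	if (!points || points < chosen_points)
		return false;			/* unkillable or strictly worse */
	/* equal score: keep a chosen thread group leader for display purposes */
	if (points == chosen_points && chosen && thread_group_leader(chosen))
		return false;
	return true;
}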
| @@ -2861,7 +2904,7 @@ struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page) | |||
| 2861 | unsigned short id; | 2904 | unsigned short id; |
| 2862 | swp_entry_t ent; | 2905 | swp_entry_t ent; |
| 2863 | 2906 | ||
| 2864 | VM_BUG_ON(!PageLocked(page)); | 2907 | VM_BUG_ON_PAGE(!PageLocked(page), page); |
| 2865 | 2908 | ||
| 2866 | pc = lookup_page_cgroup(page); | 2909 | pc = lookup_page_cgroup(page); |
| 2867 | lock_page_cgroup(pc); | 2910 | lock_page_cgroup(pc); |
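This hunk and the similar ones below convert page assertions from VM_BUG_ON() to VM_BUG_ON_PAGE(), whose second argument lets the failure path dump the offending struct page before calling BUG(). The macro itself is not part of this file; roughly, it is expected to look like the sketch below (the real definition lives in include/linux/mmdebug.h):

#ifdef CONFIG_DEBUG_VM
#define VM_BUG_ON_PAGE(cond, page)					\
	do {								\
		if (unlikely(cond)) {					\
			dump_page(page, "VM_BUG_ON_PAGE(" __stringify(cond) ")"); \
			BUG();						\
		}							\
	} while (0)
#else
#define VM_BUG_ON_PAGE(cond, page) BUILD_BUG_ON_INVALID(cond)
#endif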
| @@ -2895,7 +2938,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg, | |||
| 2895 | bool anon; | 2938 | bool anon; |
| 2896 | 2939 | ||
| 2897 | lock_page_cgroup(pc); | 2940 | lock_page_cgroup(pc); |
| 2898 | VM_BUG_ON(PageCgroupUsed(pc)); | 2941 | VM_BUG_ON_PAGE(PageCgroupUsed(pc), page); |
| 2899 | /* | 2942 | /* |
| 2900 | * we don't need page_cgroup_lock about tail pages, becase they are not | 2943 | * we don't need page_cgroup_lock about tail pages, becase they are not |
| 2901 | * accessed by any other context at this point. | 2944 | * accessed by any other context at this point. |
| @@ -2930,7 +2973,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg, | |||
| 2930 | if (lrucare) { | 2973 | if (lrucare) { |
| 2931 | if (was_on_lru) { | 2974 | if (was_on_lru) { |
| 2932 | lruvec = mem_cgroup_zone_lruvec(zone, pc->mem_cgroup); | 2975 | lruvec = mem_cgroup_zone_lruvec(zone, pc->mem_cgroup); |
| 2933 | VM_BUG_ON(PageLRU(page)); | 2976 | VM_BUG_ON_PAGE(PageLRU(page), page); |
| 2934 | SetPageLRU(page); | 2977 | SetPageLRU(page); |
| 2935 | add_page_to_lru_list(page, lruvec, page_lru(page)); | 2978 | add_page_to_lru_list(page, lruvec, page_lru(page)); |
| 2936 | } | 2979 | } |
| @@ -2956,10 +2999,12 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg, | |||
| 2956 | static DEFINE_MUTEX(set_limit_mutex); | 2999 | static DEFINE_MUTEX(set_limit_mutex); |
| 2957 | 3000 | ||
| 2958 | #ifdef CONFIG_MEMCG_KMEM | 3001 | #ifdef CONFIG_MEMCG_KMEM |
| 3002 | static DEFINE_MUTEX(activate_kmem_mutex); | ||
| 3003 | |||
| 2959 | static inline bool memcg_can_account_kmem(struct mem_cgroup *memcg) | 3004 | static inline bool memcg_can_account_kmem(struct mem_cgroup *memcg) |
| 2960 | { | 3005 | { |
| 2961 | return !mem_cgroup_disabled() && !mem_cgroup_is_root(memcg) && | 3006 | return !mem_cgroup_disabled() && !mem_cgroup_is_root(memcg) && |
| 2962 | (memcg->kmem_account_flags & KMEM_ACCOUNTED_MASK); | 3007 | memcg_kmem_is_active(memcg); |
| 2963 | } | 3008 | } |
| 2964 | 3009 | ||
| 2965 | /* | 3010 | /* |
| @@ -2976,10 +3021,9 @@ static struct kmem_cache *memcg_params_to_cache(struct memcg_cache_params *p) | |||
| 2976 | } | 3021 | } |
| 2977 | 3022 | ||
| 2978 | #ifdef CONFIG_SLABINFO | 3023 | #ifdef CONFIG_SLABINFO |
| 2979 | static int mem_cgroup_slabinfo_read(struct cgroup_subsys_state *css, | 3024 | static int mem_cgroup_slabinfo_read(struct seq_file *m, void *v) |
| 2980 | struct cftype *cft, struct seq_file *m) | ||
| 2981 | { | 3025 | { |
| 2982 | struct mem_cgroup *memcg = mem_cgroup_from_css(css); | 3026 | struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); |
| 2983 | struct memcg_cache_params *params; | 3027 | struct memcg_cache_params *params; |
| 2984 | 3028 | ||
| 2985 | if (!memcg_can_account_kmem(memcg)) | 3029 | if (!memcg_can_account_kmem(memcg)) |
| @@ -3059,16 +3103,6 @@ static void memcg_uncharge_kmem(struct mem_cgroup *memcg, u64 size) | |||
| 3059 | css_put(&memcg->css); | 3103 | css_put(&memcg->css); |
| 3060 | } | 3104 | } |
| 3061 | 3105 | ||
| 3062 | void memcg_cache_list_add(struct mem_cgroup *memcg, struct kmem_cache *cachep) | ||
| 3063 | { | ||
| 3064 | if (!memcg) | ||
| 3065 | return; | ||
| 3066 | |||
| 3067 | mutex_lock(&memcg->slab_caches_mutex); | ||
| 3068 | list_add(&cachep->memcg_params->list, &memcg->memcg_slab_caches); | ||
| 3069 | mutex_unlock(&memcg->slab_caches_mutex); | ||
| 3070 | } | ||
| 3071 | |||
| 3072 | /* | 3106 | /* |
| 3073 | * helper for acessing a memcg's index. It will be used as an index in the | 3107 | * helper for acessing a memcg's index. It will be used as an index in the |
| 3074 | * child cache array in kmem_cache, and also to derive its name. This function | 3108 | * child cache array in kmem_cache, and also to derive its name. This function |
| @@ -3079,43 +3113,6 @@ int memcg_cache_id(struct mem_cgroup *memcg) | |||
| 3079 | return memcg ? memcg->kmemcg_id : -1; | 3113 | return memcg ? memcg->kmemcg_id : -1; |
| 3080 | } | 3114 | } |
| 3081 | 3115 | ||
| 3082 | /* | ||
| 3083 | * This ends up being protected by the set_limit mutex, during normal | ||
| 3084 | * operation, because that is its main call site. | ||
| 3085 | * | ||
| 3086 | * But when we create a new cache, we can call this as well if its parent | ||
| 3087 | * is kmem-limited. That will have to hold set_limit_mutex as well. | ||
| 3088 | */ | ||
| 3089 | int memcg_update_cache_sizes(struct mem_cgroup *memcg) | ||
| 3090 | { | ||
| 3091 | int num, ret; | ||
| 3092 | |||
| 3093 | num = ida_simple_get(&kmem_limited_groups, | ||
| 3094 | 0, MEMCG_CACHES_MAX_SIZE, GFP_KERNEL); | ||
| 3095 | if (num < 0) | ||
| 3096 | return num; | ||
| 3097 | /* | ||
| 3098 | * After this point, kmem_accounted (that we test atomically in | ||
| 3099 | * the beginning of this conditional), is no longer 0. This | ||
| 3100 | * guarantees only one process will set the following boolean | ||
| 3101 | * to true. We don't need test_and_set because we're protected | ||
| 3102 | * by the set_limit_mutex anyway. | ||
| 3103 | */ | ||
| 3104 | memcg_kmem_set_activated(memcg); | ||
| 3105 | |||
| 3106 | ret = memcg_update_all_caches(num+1); | ||
| 3107 | if (ret) { | ||
| 3108 | ida_simple_remove(&kmem_limited_groups, num); | ||
| 3109 | memcg_kmem_clear_activated(memcg); | ||
| 3110 | return ret; | ||
| 3111 | } | ||
| 3112 | |||
| 3113 | memcg->kmemcg_id = num; | ||
| 3114 | INIT_LIST_HEAD(&memcg->memcg_slab_caches); | ||
| 3115 | mutex_init(&memcg->slab_caches_mutex); | ||
| 3116 | return 0; | ||
| 3117 | } | ||
| 3118 | |||
| 3119 | static size_t memcg_caches_array_size(int num_groups) | 3116 | static size_t memcg_caches_array_size(int num_groups) |
| 3120 | { | 3117 | { |
| 3121 | ssize_t size; | 3118 | ssize_t size; |
| @@ -3152,18 +3149,17 @@ int memcg_update_cache_size(struct kmem_cache *s, int num_groups) | |||
| 3152 | 3149 | ||
| 3153 | if (num_groups > memcg_limited_groups_array_size) { | 3150 | if (num_groups > memcg_limited_groups_array_size) { |
| 3154 | int i; | 3151 | int i; |
| 3152 | struct memcg_cache_params *new_params; | ||
| 3155 | ssize_t size = memcg_caches_array_size(num_groups); | 3153 | ssize_t size = memcg_caches_array_size(num_groups); |
| 3156 | 3154 | ||
| 3157 | size *= sizeof(void *); | 3155 | size *= sizeof(void *); |
| 3158 | size += offsetof(struct memcg_cache_params, memcg_caches); | 3156 | size += offsetof(struct memcg_cache_params, memcg_caches); |
| 3159 | 3157 | ||
| 3160 | s->memcg_params = kzalloc(size, GFP_KERNEL); | 3158 | new_params = kzalloc(size, GFP_KERNEL); |
| 3161 | if (!s->memcg_params) { | 3159 | if (!new_params) |
| 3162 | s->memcg_params = cur_params; | ||
| 3163 | return -ENOMEM; | 3160 | return -ENOMEM; |
| 3164 | } | ||
| 3165 | 3161 | ||
| 3166 | s->memcg_params->is_root_cache = true; | 3162 | new_params->is_root_cache = true; |
| 3167 | 3163 | ||
| 3168 | /* | 3164 | /* |
| 3169 | * There is the chance it will be bigger than | 3165 | * There is the chance it will be bigger than |
| @@ -3177,7 +3173,7 @@ int memcg_update_cache_size(struct kmem_cache *s, int num_groups) | |||
| 3177 | for (i = 0; i < memcg_limited_groups_array_size; i++) { | 3173 | for (i = 0; i < memcg_limited_groups_array_size; i++) { |
| 3178 | if (!cur_params->memcg_caches[i]) | 3174 | if (!cur_params->memcg_caches[i]) |
| 3179 | continue; | 3175 | continue; |
| 3180 | s->memcg_params->memcg_caches[i] = | 3176 | new_params->memcg_caches[i] = |
| 3181 | cur_params->memcg_caches[i]; | 3177 | cur_params->memcg_caches[i]; |
| 3182 | } | 3178 | } |
| 3183 | 3179 | ||
| @@ -3190,13 +3186,15 @@ int memcg_update_cache_size(struct kmem_cache *s, int num_groups) | |||
| 3190 | * bigger than the others. And all updates will reset this | 3186 | * bigger than the others. And all updates will reset this |
| 3191 | * anyway. | 3187 | * anyway. |
| 3192 | */ | 3188 | */ |
| 3193 | kfree(cur_params); | 3189 | rcu_assign_pointer(s->memcg_params, new_params); |
| 3190 | if (cur_params) | ||
| 3191 | kfree_rcu(cur_params, rcu_head); | ||
| 3194 | } | 3192 | } |
| 3195 | return 0; | 3193 | return 0; |
| 3196 | } | 3194 | } |
| 3197 | 3195 | ||
| 3198 | int memcg_register_cache(struct mem_cgroup *memcg, struct kmem_cache *s, | 3196 | int memcg_alloc_cache_params(struct mem_cgroup *memcg, struct kmem_cache *s, |
| 3199 | struct kmem_cache *root_cache) | 3197 | struct kmem_cache *root_cache) |
| 3200 | { | 3198 | { |
| 3201 | size_t size; | 3199 | size_t size; |
| 3202 | 3200 | ||
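memcg_update_cache_size() above stops freeing the old memcg_params copy immediately and instead publishes the enlarged copy with rcu_assign_pointer() and retires the old one with kfree_rcu(), so lockless readers can never dereference freed memory. The same pattern in isolation, using made-up types rather than the kernel structures:

struct idx_array {
	struct rcu_head rcu_head;
	int nr;
	void *slot[];			/* flexible array of published pointers */
};

static struct idx_array __rcu *table;

/* Writer side; callers are assumed to serialize on an outer mutex. */
static int grow_table(int new_nr)
{
	struct idx_array *old, *new;

	old = rcu_dereference_protected(table, 1);	/* sketch: outer lock held */
	new = kzalloc(sizeof(*new) + new_nr * sizeof(void *), GFP_KERNEL);
	if (!new)
		return -ENOMEM;
	new->nr = new_nr;
	if (old)
		memcpy(new->slot, old->slot, old->nr * sizeof(void *));

	rcu_assign_pointer(table, new);		/* readers switch to the bigger copy */
	if (old)
		kfree_rcu(old, rcu_head);	/* old copy is freed after a grace period */
	return 0;
}

/* Reader side: no locks, only an RCU read-side critical section. */
static void *table_lookup(int idx)
{
	struct idx_array *a;
	void *p = NULL;

	rcu_read_lock();
	a = rcu_dereference(table);
	if (a && idx < a->nr)
		p = a->slot[idx];
	rcu_read_unlock();
	return p;
}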
| @@ -3224,35 +3222,85 @@ int memcg_register_cache(struct mem_cgroup *memcg, struct kmem_cache *s, | |||
| 3224 | return 0; | 3222 | return 0; |
| 3225 | } | 3223 | } |
| 3226 | 3224 | ||
| 3227 | void memcg_release_cache(struct kmem_cache *s) | 3225 | void memcg_free_cache_params(struct kmem_cache *s) |
| 3226 | { | ||
| 3227 | kfree(s->memcg_params); | ||
| 3228 | } | ||
| 3229 | |||
| 3230 | void memcg_register_cache(struct kmem_cache *s) | ||
| 3228 | { | 3231 | { |
| 3229 | struct kmem_cache *root; | 3232 | struct kmem_cache *root; |
| 3230 | struct mem_cgroup *memcg; | 3233 | struct mem_cgroup *memcg; |
| 3231 | int id; | 3234 | int id; |
| 3232 | 3235 | ||
| 3233 | /* | 3236 | if (is_root_cache(s)) |
| 3234 | * This happens, for instance, when a root cache goes away before we | ||
| 3235 | * add any memcg. | ||
| 3236 | */ | ||
| 3237 | if (!s->memcg_params) | ||
| 3238 | return; | 3237 | return; |
| 3239 | 3238 | ||
| 3240 | if (s->memcg_params->is_root_cache) | 3239 | /* |
| 3241 | goto out; | 3240 | * Holding the slab_mutex assures nobody will touch the memcg_caches |
| 3241 | * array while we are modifying it. | ||
| 3242 | */ | ||
| 3243 | lockdep_assert_held(&slab_mutex); | ||
| 3242 | 3244 | ||
| 3245 | root = s->memcg_params->root_cache; | ||
| 3243 | memcg = s->memcg_params->memcg; | 3246 | memcg = s->memcg_params->memcg; |
| 3244 | id = memcg_cache_id(memcg); | 3247 | id = memcg_cache_id(memcg); |
| 3248 | |||
| 3249 | css_get(&memcg->css); | ||
| 3250 | |||
| 3251 | |||
| 3252 | /* | ||
| 3253 | * Since readers won't lock (see cache_from_memcg_idx()), we need a | ||
| 3254 | * barrier here to ensure nobody will see the kmem_cache partially | ||
| 3255 | * initialized. | ||
| 3256 | */ | ||
| 3257 | smp_wmb(); | ||
| 3258 | |||
| 3259 | /* | ||
| 3260 | * Initialize the pointer to this cache in its parent's memcg_params | ||
| 3261 | * before adding it to the memcg_slab_caches list, otherwise we can | ||
| 3262 | * fail to convert memcg_params_to_cache() while traversing the list. | ||
| 3263 | */ | ||
| 3264 | VM_BUG_ON(root->memcg_params->memcg_caches[id]); | ||
| 3265 | root->memcg_params->memcg_caches[id] = s; | ||
| 3266 | |||
| 3267 | mutex_lock(&memcg->slab_caches_mutex); | ||
| 3268 | list_add(&s->memcg_params->list, &memcg->memcg_slab_caches); | ||
| 3269 | mutex_unlock(&memcg->slab_caches_mutex); | ||
| 3270 | } | ||
| 3271 | |||
| 3272 | void memcg_unregister_cache(struct kmem_cache *s) | ||
| 3273 | { | ||
| 3274 | struct kmem_cache *root; | ||
| 3275 | struct mem_cgroup *memcg; | ||
| 3276 | int id; | ||
| 3277 | |||
| 3278 | if (is_root_cache(s)) | ||
| 3279 | return; | ||
| 3280 | |||
| 3281 | /* | ||
| 3282 | * Holding the slab_mutex assures nobody will touch the memcg_caches | ||
| 3283 | * array while we are modifying it. | ||
| 3284 | */ | ||
| 3285 | lockdep_assert_held(&slab_mutex); | ||
| 3245 | 3286 | ||
| 3246 | root = s->memcg_params->root_cache; | 3287 | root = s->memcg_params->root_cache; |
| 3247 | root->memcg_params->memcg_caches[id] = NULL; | 3288 | memcg = s->memcg_params->memcg; |
| 3289 | id = memcg_cache_id(memcg); | ||
| 3248 | 3290 | ||
| 3249 | mutex_lock(&memcg->slab_caches_mutex); | 3291 | mutex_lock(&memcg->slab_caches_mutex); |
| 3250 | list_del(&s->memcg_params->list); | 3292 | list_del(&s->memcg_params->list); |
| 3251 | mutex_unlock(&memcg->slab_caches_mutex); | 3293 | mutex_unlock(&memcg->slab_caches_mutex); |
| 3252 | 3294 | ||
| 3295 | /* | ||
| 3296 | * Clear the pointer to this cache in its parent's memcg_params only | ||
| 3297 | * after removing it from the memcg_slab_caches list, otherwise we can | ||
| 3298 | * fail to convert memcg_params_to_cache() while traversing the list. | ||
| 3299 | */ | ||
| 3300 | VM_BUG_ON(!root->memcg_params->memcg_caches[id]); | ||
| 3301 | root->memcg_params->memcg_caches[id] = NULL; | ||
| 3302 | |||
| 3253 | css_put(&memcg->css); | 3303 | css_put(&memcg->css); |
| 3254 | out: | ||
| 3255 | kfree(s->memcg_params); | ||
| 3256 | } | 3304 | } |
| 3257 | 3305 | ||
| 3258 | /* | 3306 | /* |
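memcg_register_cache() and memcg_unregister_cache() above are deliberate about ordering under slab_mutex: the child cache is made visible in the parent's memcg_caches[] slot (after an smp_wmb() so its initialization is complete) before it is linked onto the per-memcg list, and it is unlinked from the list before the slot is cleared, so list walkers can always translate an entry back to a live cache. A stripped-down illustration of that invariant with generic types (not the memcg structures):

struct my_obj {
	struct list_head node;
	/* ... payload fully initialized before publishing ... */
};

static void publish_and_link(struct my_obj *obj, struct my_obj **slot,
			     struct list_head *all, struct mutex *list_lock)
{
	smp_wmb();			/* order payload init before the pointer store */
	WARN_ON(*slot);
	*slot = obj;			/* lockless readers may find it from here on */

	mutex_lock(list_lock);
	list_add(&obj->node, all);	/* list walkers can now map back to *slot */
	mutex_unlock(list_lock);
}

static void unlink_and_unpublish(struct my_obj *obj, struct my_obj **slot,
				 struct mutex *list_lock)
{
	mutex_lock(list_lock);
	list_del(&obj->node);		/* stop list walkers first */
	mutex_unlock(list_lock);

	WARN_ON(*slot != obj);
	*slot = NULL;			/* only then retract the published pointer */
}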
| @@ -3311,11 +3359,9 @@ static void kmem_cache_destroy_work_func(struct work_struct *w) | |||
| 3311 | * So if we aren't down to zero, we'll just schedule a worker and try | 3359 | * So if we aren't down to zero, we'll just schedule a worker and try |
| 3312 | * again | 3360 | * again |
| 3313 | */ | 3361 | */ |
| 3314 | if (atomic_read(&cachep->memcg_params->nr_pages) != 0) { | 3362 | if (atomic_read(&cachep->memcg_params->nr_pages) != 0) |
| 3315 | kmem_cache_shrink(cachep); | 3363 | kmem_cache_shrink(cachep); |
| 3316 | if (atomic_read(&cachep->memcg_params->nr_pages) == 0) | 3364 | else |
| 3317 | return; | ||
| 3318 | } else | ||
| 3319 | kmem_cache_destroy(cachep); | 3365 | kmem_cache_destroy(cachep); |
| 3320 | } | 3366 | } |
| 3321 | 3367 | ||
| @@ -3351,27 +3397,16 @@ void mem_cgroup_destroy_cache(struct kmem_cache *cachep) | |||
| 3351 | schedule_work(&cachep->memcg_params->destroy); | 3397 | schedule_work(&cachep->memcg_params->destroy); |
| 3352 | } | 3398 | } |
| 3353 | 3399 | ||
| 3354 | /* | 3400 | static struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg, |
| 3355 | * This lock protects updaters, not readers. We want readers to be as fast as | 3401 | struct kmem_cache *s) |
| 3356 | * they can, and they will either see NULL or a valid cache value. Our model | ||
| 3357 | * allow them to see NULL, in which case the root memcg will be selected. | ||
| 3358 | * | ||
| 3359 | * We need this lock because multiple allocations to the same cache from a non | ||
| 3360 | * will span more than one worker. Only one of them can create the cache. | ||
| 3361 | */ | ||
| 3362 | static DEFINE_MUTEX(memcg_cache_mutex); | ||
| 3363 | |||
| 3364 | /* | ||
| 3365 | * Called with memcg_cache_mutex held | ||
| 3366 | */ | ||
| 3367 | static struct kmem_cache *kmem_cache_dup(struct mem_cgroup *memcg, | ||
| 3368 | struct kmem_cache *s) | ||
| 3369 | { | 3402 | { |
| 3370 | struct kmem_cache *new; | 3403 | struct kmem_cache *new = NULL; |
| 3371 | static char *tmp_name = NULL; | 3404 | static char *tmp_name = NULL; |
| 3405 | static DEFINE_MUTEX(mutex); /* protects tmp_name */ | ||
| 3372 | 3406 | ||
| 3373 | lockdep_assert_held(&memcg_cache_mutex); | 3407 | BUG_ON(!memcg_can_account_kmem(memcg)); |
| 3374 | 3408 | ||
| 3409 | mutex_lock(&mutex); | ||
| 3375 | /* | 3410 | /* |
| 3376 | * kmem_cache_create_memcg duplicates the given name and | 3411 | * kmem_cache_create_memcg duplicates the given name and |
| 3377 | * cgroup_name for this name requires RCU context. | 3412 | * cgroup_name for this name requires RCU context. |
| @@ -3381,7 +3416,7 @@ static struct kmem_cache *kmem_cache_dup(struct mem_cgroup *memcg, | |||
| 3381 | if (!tmp_name) { | 3416 | if (!tmp_name) { |
| 3382 | tmp_name = kmalloc(PATH_MAX, GFP_KERNEL); | 3417 | tmp_name = kmalloc(PATH_MAX, GFP_KERNEL); |
| 3383 | if (!tmp_name) | 3418 | if (!tmp_name) |
| 3384 | return NULL; | 3419 | goto out; |
| 3385 | } | 3420 | } |
| 3386 | 3421 | ||
| 3387 | rcu_read_lock(); | 3422 | rcu_read_lock(); |
| @@ -3391,48 +3426,13 @@ static struct kmem_cache *kmem_cache_dup(struct mem_cgroup *memcg, | |||
| 3391 | 3426 | ||
| 3392 | new = kmem_cache_create_memcg(memcg, tmp_name, s->object_size, s->align, | 3427 | new = kmem_cache_create_memcg(memcg, tmp_name, s->object_size, s->align, |
| 3393 | (s->flags & ~SLAB_PANIC), s->ctor, s); | 3428 | (s->flags & ~SLAB_PANIC), s->ctor, s); |
| 3394 | |||
| 3395 | if (new) | 3429 | if (new) |
| 3396 | new->allocflags |= __GFP_KMEMCG; | 3430 | new->allocflags |= __GFP_KMEMCG; |
| 3397 | 3431 | else | |
| 3398 | return new; | 3432 | new = s; |
| 3399 | } | ||
| 3400 | |||
| 3401 | static struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg, | ||
| 3402 | struct kmem_cache *cachep) | ||
| 3403 | { | ||
| 3404 | struct kmem_cache *new_cachep; | ||
| 3405 | int idx; | ||
| 3406 | |||
| 3407 | BUG_ON(!memcg_can_account_kmem(memcg)); | ||
| 3408 | |||
| 3409 | idx = memcg_cache_id(memcg); | ||
| 3410 | |||
| 3411 | mutex_lock(&memcg_cache_mutex); | ||
| 3412 | new_cachep = cache_from_memcg_idx(cachep, idx); | ||
| 3413 | if (new_cachep) { | ||
| 3414 | css_put(&memcg->css); | ||
| 3415 | goto out; | ||
| 3416 | } | ||
| 3417 | |||
| 3418 | new_cachep = kmem_cache_dup(memcg, cachep); | ||
| 3419 | if (new_cachep == NULL) { | ||
| 3420 | new_cachep = cachep; | ||
| 3421 | css_put(&memcg->css); | ||
| 3422 | goto out; | ||
| 3423 | } | ||
| 3424 | |||
| 3425 | atomic_set(&new_cachep->memcg_params->nr_pages , 0); | ||
| 3426 | |||
| 3427 | cachep->memcg_params->memcg_caches[idx] = new_cachep; | ||
| 3428 | /* | ||
| 3429 | * the readers won't lock, make sure everybody sees the updated value, | ||
| 3430 | * so they won't put stuff in the queue again for no reason | ||
| 3431 | */ | ||
| 3432 | wmb(); | ||
| 3433 | out: | 3433 | out: |
| 3434 | mutex_unlock(&memcg_cache_mutex); | 3434 | mutex_unlock(&mutex); |
| 3435 | return new_cachep; | 3435 | return new; |
| 3436 | } | 3436 | } |
| 3437 | 3437 | ||
| 3438 | void kmem_cache_destroy_memcg_children(struct kmem_cache *s) | 3438 | void kmem_cache_destroy_memcg_children(struct kmem_cache *s) |
| @@ -3452,9 +3452,10 @@ void kmem_cache_destroy_memcg_children(struct kmem_cache *s) | |||
| 3452 | * | 3452 | * |
| 3453 | * Still, we don't want anyone else freeing memcg_caches under our | 3453 | * Still, we don't want anyone else freeing memcg_caches under our |
| 3454 | * noses, which can happen if a new memcg comes to life. As usual, | 3454 | * noses, which can happen if a new memcg comes to life. As usual, |
| 3455 | * we'll take the set_limit_mutex to protect ourselves against this. | 3455 | * we'll take the activate_kmem_mutex to protect ourselves against |
| 3456 | * this. | ||
| 3456 | */ | 3457 | */ |
| 3457 | mutex_lock(&set_limit_mutex); | 3458 | mutex_lock(&activate_kmem_mutex); |
| 3458 | for_each_memcg_cache_index(i) { | 3459 | for_each_memcg_cache_index(i) { |
| 3459 | c = cache_from_memcg_idx(s, i); | 3460 | c = cache_from_memcg_idx(s, i); |
| 3460 | if (!c) | 3461 | if (!c) |
| @@ -3477,7 +3478,7 @@ void kmem_cache_destroy_memcg_children(struct kmem_cache *s) | |||
| 3477 | cancel_work_sync(&c->memcg_params->destroy); | 3478 | cancel_work_sync(&c->memcg_params->destroy); |
| 3478 | kmem_cache_destroy(c); | 3479 | kmem_cache_destroy(c); |
| 3479 | } | 3480 | } |
| 3480 | mutex_unlock(&set_limit_mutex); | 3481 | mutex_unlock(&activate_kmem_mutex); |
| 3481 | } | 3482 | } |
| 3482 | 3483 | ||
| 3483 | struct create_work { | 3484 | struct create_work { |
| @@ -3509,6 +3510,7 @@ static void memcg_create_cache_work_func(struct work_struct *w) | |||
| 3509 | 3510 | ||
| 3510 | cw = container_of(w, struct create_work, work); | 3511 | cw = container_of(w, struct create_work, work); |
| 3511 | memcg_create_kmem_cache(cw->memcg, cw->cachep); | 3512 | memcg_create_kmem_cache(cw->memcg, cw->cachep); |
| 3513 | css_put(&cw->memcg->css); | ||
| 3512 | kfree(cw); | 3514 | kfree(cw); |
| 3513 | } | 3515 | } |
| 3514 | 3516 | ||
| @@ -3568,7 +3570,7 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep, | |||
| 3568 | gfp_t gfp) | 3570 | gfp_t gfp) |
| 3569 | { | 3571 | { |
| 3570 | struct mem_cgroup *memcg; | 3572 | struct mem_cgroup *memcg; |
| 3571 | int idx; | 3573 | struct kmem_cache *memcg_cachep; |
| 3572 | 3574 | ||
| 3573 | VM_BUG_ON(!cachep->memcg_params); | 3575 | VM_BUG_ON(!cachep->memcg_params); |
| 3574 | VM_BUG_ON(!cachep->memcg_params->is_root_cache); | 3576 | VM_BUG_ON(!cachep->memcg_params->is_root_cache); |
| @@ -3582,15 +3584,9 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep, | |||
| 3582 | if (!memcg_can_account_kmem(memcg)) | 3584 | if (!memcg_can_account_kmem(memcg)) |
| 3583 | goto out; | 3585 | goto out; |
| 3584 | 3586 | ||
| 3585 | idx = memcg_cache_id(memcg); | 3587 | memcg_cachep = cache_from_memcg_idx(cachep, memcg_cache_id(memcg)); |
| 3586 | 3588 | if (likely(memcg_cachep)) { | |
| 3587 | /* | 3589 | cachep = memcg_cachep; |
| 3588 | * barrier to mare sure we're always seeing the up to date value. The | ||
| 3589 | * code updating memcg_caches will issue a write barrier to match this. | ||
| 3590 | */ | ||
| 3591 | read_barrier_depends(); | ||
| 3592 | if (likely(cache_from_memcg_idx(cachep, idx))) { | ||
| 3593 | cachep = cache_from_memcg_idx(cachep, idx); | ||
| 3594 | goto out; | 3590 | goto out; |
| 3595 | } | 3591 | } |
| 3596 | 3592 | ||
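With the barrier moved out of __memcg_kmem_get_cache(), the read side is expected to live entirely inside cache_from_memcg_idx(): return either NULL or a fully initialized child cache, pairing with the smp_wmb() issued when the cache was registered. A hedged sketch of such a lookup (the in-tree helper is in mm/slab.h; this is not its literal body):

static struct kmem_cache *child_cache_lookup(struct kmem_cache *root, int idx)
{
	struct memcg_cache_params *params;
	struct kmem_cache *cachep;

	if (idx < 0 || !root->memcg_params)
		return NULL;			/* no memcg caches for this root yet */

	rcu_read_lock();
	params = rcu_dereference(root->memcg_params);	/* array resized under RCU */
	cachep = params->memcg_caches[idx];
	rcu_read_unlock();

	/* pairs with the smp_wmb() in memcg_register_cache() */
	smp_read_barrier_depends();
	return cachep;
}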
| @@ -3744,7 +3740,7 @@ void __memcg_kmem_uncharge_pages(struct page *page, int order) | |||
| 3744 | if (!memcg) | 3740 | if (!memcg) |
| 3745 | return; | 3741 | return; |
| 3746 | 3742 | ||
| 3747 | VM_BUG_ON(mem_cgroup_is_root(memcg)); | 3743 | VM_BUG_ON_PAGE(mem_cgroup_is_root(memcg), page); |
| 3748 | memcg_uncharge_kmem(memcg, PAGE_SIZE << order); | 3744 | memcg_uncharge_kmem(memcg, PAGE_SIZE << order); |
| 3749 | } | 3745 | } |
| 3750 | #else | 3746 | #else |
| @@ -3823,7 +3819,7 @@ static int mem_cgroup_move_account(struct page *page, | |||
| 3823 | bool anon = PageAnon(page); | 3819 | bool anon = PageAnon(page); |
| 3824 | 3820 | ||
| 3825 | VM_BUG_ON(from == to); | 3821 | VM_BUG_ON(from == to); |
| 3826 | VM_BUG_ON(PageLRU(page)); | 3822 | VM_BUG_ON_PAGE(PageLRU(page), page); |
| 3827 | /* | 3823 | /* |
| 3828 | * The page is isolated from LRU. So, collapse function | 3824 | * The page is isolated from LRU. So, collapse function |
| 3829 | * will not handle this page. But page splitting can happen. | 3825 | * will not handle this page. But page splitting can happen. |
| @@ -3916,7 +3912,7 @@ static int mem_cgroup_move_parent(struct page *page, | |||
| 3916 | parent = root_mem_cgroup; | 3912 | parent = root_mem_cgroup; |
| 3917 | 3913 | ||
| 3918 | if (nr_pages > 1) { | 3914 | if (nr_pages > 1) { |
| 3919 | VM_BUG_ON(!PageTransHuge(page)); | 3915 | VM_BUG_ON_PAGE(!PageTransHuge(page), page); |
| 3920 | flags = compound_lock_irqsave(page); | 3916 | flags = compound_lock_irqsave(page); |
| 3921 | } | 3917 | } |
| 3922 | 3918 | ||
| @@ -3950,7 +3946,7 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, | |||
| 3950 | 3946 | ||
| 3951 | if (PageTransHuge(page)) { | 3947 | if (PageTransHuge(page)) { |
| 3952 | nr_pages <<= compound_order(page); | 3948 | nr_pages <<= compound_order(page); |
| 3953 | VM_BUG_ON(!PageTransHuge(page)); | 3949 | VM_BUG_ON_PAGE(!PageTransHuge(page), page); |
| 3954 | /* | 3950 | /* |
| 3955 | * Never OOM-kill a process for a huge page. The | 3951 | * Never OOM-kill a process for a huge page. The |
| 3956 | * fault handler will fall back to regular pages. | 3952 | * fault handler will fall back to regular pages. |
| @@ -3970,8 +3966,8 @@ int mem_cgroup_newpage_charge(struct page *page, | |||
| 3970 | { | 3966 | { |
| 3971 | if (mem_cgroup_disabled()) | 3967 | if (mem_cgroup_disabled()) |
| 3972 | return 0; | 3968 | return 0; |
| 3973 | VM_BUG_ON(page_mapped(page)); | 3969 | VM_BUG_ON_PAGE(page_mapped(page), page); |
| 3974 | VM_BUG_ON(page->mapping && !PageAnon(page)); | 3970 | VM_BUG_ON_PAGE(page->mapping && !PageAnon(page), page); |
| 3975 | VM_BUG_ON(!mm); | 3971 | VM_BUG_ON(!mm); |
| 3976 | return mem_cgroup_charge_common(page, mm, gfp_mask, | 3972 | return mem_cgroup_charge_common(page, mm, gfp_mask, |
| 3977 | MEM_CGROUP_CHARGE_TYPE_ANON); | 3973 | MEM_CGROUP_CHARGE_TYPE_ANON); |
| @@ -4175,7 +4171,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype, | |||
| 4175 | 4171 | ||
| 4176 | if (PageTransHuge(page)) { | 4172 | if (PageTransHuge(page)) { |
| 4177 | nr_pages <<= compound_order(page); | 4173 | nr_pages <<= compound_order(page); |
| 4178 | VM_BUG_ON(!PageTransHuge(page)); | 4174 | VM_BUG_ON_PAGE(!PageTransHuge(page), page); |
| 4179 | } | 4175 | } |
| 4180 | /* | 4176 | /* |
| 4181 | * Check if our page_cgroup is valid | 4177 | * Check if our page_cgroup is valid |
| @@ -4267,7 +4263,7 @@ void mem_cgroup_uncharge_page(struct page *page) | |||
| 4267 | /* early check. */ | 4263 | /* early check. */ |
| 4268 | if (page_mapped(page)) | 4264 | if (page_mapped(page)) |
| 4269 | return; | 4265 | return; |
| 4270 | VM_BUG_ON(page->mapping && !PageAnon(page)); | 4266 | VM_BUG_ON_PAGE(page->mapping && !PageAnon(page), page); |
| 4271 | /* | 4267 | /* |
| 4272 | * If the page is in swap cache, uncharge should be deferred | 4268 | * If the page is in swap cache, uncharge should be deferred |
| 4273 | * to the swap path, which also properly accounts swap usage | 4269 | * to the swap path, which also properly accounts swap usage |
| @@ -4287,8 +4283,8 @@ void mem_cgroup_uncharge_page(struct page *page) | |||
| 4287 | 4283 | ||
| 4288 | void mem_cgroup_uncharge_cache_page(struct page *page) | 4284 | void mem_cgroup_uncharge_cache_page(struct page *page) |
| 4289 | { | 4285 | { |
| 4290 | VM_BUG_ON(page_mapped(page)); | 4286 | VM_BUG_ON_PAGE(page_mapped(page), page); |
| 4291 | VM_BUG_ON(page->mapping); | 4287 | VM_BUG_ON_PAGE(page->mapping, page); |
| 4292 | __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE, false); | 4288 | __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE, false); |
| 4293 | } | 4289 | } |
| 4294 | 4290 | ||
| @@ -5112,14 +5108,12 @@ static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap) | |||
| 5112 | return val << PAGE_SHIFT; | 5108 | return val << PAGE_SHIFT; |
| 5113 | } | 5109 | } |
| 5114 | 5110 | ||
| 5115 | static ssize_t mem_cgroup_read(struct cgroup_subsys_state *css, | 5111 | static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css, |
| 5116 | struct cftype *cft, struct file *file, | 5112 | struct cftype *cft) |
| 5117 | char __user *buf, size_t nbytes, loff_t *ppos) | ||
| 5118 | { | 5113 | { |
| 5119 | struct mem_cgroup *memcg = mem_cgroup_from_css(css); | 5114 | struct mem_cgroup *memcg = mem_cgroup_from_css(css); |
| 5120 | char str[64]; | ||
| 5121 | u64 val; | 5115 | u64 val; |
| 5122 | int name, len; | 5116 | int name; |
| 5123 | enum res_type type; | 5117 | enum res_type type; |
| 5124 | 5118 | ||
| 5125 | type = MEMFILE_TYPE(cft->private); | 5119 | type = MEMFILE_TYPE(cft->private); |
| @@ -5145,15 +5139,26 @@ static ssize_t mem_cgroup_read(struct cgroup_subsys_state *css, | |||
| 5145 | BUG(); | 5139 | BUG(); |
| 5146 | } | 5140 | } |
| 5147 | 5141 | ||
| 5148 | len = scnprintf(str, sizeof(str), "%llu\n", (unsigned long long)val); | 5142 | return val; |
| 5149 | return simple_read_from_buffer(buf, nbytes, ppos, str, len); | ||
| 5150 | } | 5143 | } |
| 5151 | 5144 | ||
| 5152 | static int memcg_update_kmem_limit(struct cgroup_subsys_state *css, u64 val) | ||
| 5153 | { | ||
| 5154 | int ret = -EINVAL; | ||
| 5155 | #ifdef CONFIG_MEMCG_KMEM | 5145 | #ifdef CONFIG_MEMCG_KMEM |
| 5156 | struct mem_cgroup *memcg = mem_cgroup_from_css(css); | 5146 | /* should be called with activate_kmem_mutex held */ |
| 5147 | static int __memcg_activate_kmem(struct mem_cgroup *memcg, | ||
| 5148 | unsigned long long limit) | ||
| 5149 | { | ||
| 5150 | int err = 0; | ||
| 5151 | int memcg_id; | ||
| 5152 | |||
| 5153 | if (memcg_kmem_is_active(memcg)) | ||
| 5154 | return 0; | ||
| 5155 | |||
| 5156 | /* | ||
| 5157 | * We are going to allocate memory for data shared by all memory | ||
| 5158 | * cgroups so let's stop accounting here. | ||
| 5159 | */ | ||
| 5160 | memcg_stop_kmem_account(); | ||
| 5161 | |||
| 5157 | /* | 5162 | /* |
| 5158 | * For simplicity, we won't allow this to be disabled. It also can't | 5163 | * For simplicity, we won't allow this to be disabled. It also can't |
| 5159 | * be changed if the cgroup has children already, or if tasks had | 5164 | * be changed if the cgroup has children already, or if tasks had |
| @@ -5167,72 +5172,101 @@ static int memcg_update_kmem_limit(struct cgroup_subsys_state *css, u64 val) | |||
| 5167 | * of course permitted. | 5172 | * of course permitted. |
| 5168 | */ | 5173 | */ |
| 5169 | mutex_lock(&memcg_create_mutex); | 5174 | mutex_lock(&memcg_create_mutex); |
| 5170 | mutex_lock(&set_limit_mutex); | 5175 | if (cgroup_task_count(memcg->css.cgroup) || memcg_has_children(memcg)) |
| 5171 | if (!memcg->kmem_account_flags && val != RES_COUNTER_MAX) { | 5176 | err = -EBUSY; |
| 5172 | if (cgroup_task_count(css->cgroup) || memcg_has_children(memcg)) { | 5177 | mutex_unlock(&memcg_create_mutex); |
| 5173 | ret = -EBUSY; | 5178 | if (err) |
| 5174 | goto out; | 5179 | goto out; |
| 5175 | } | ||
| 5176 | ret = res_counter_set_limit(&memcg->kmem, val); | ||
| 5177 | VM_BUG_ON(ret); | ||
| 5178 | 5180 | ||
| 5179 | ret = memcg_update_cache_sizes(memcg); | 5181 | memcg_id = ida_simple_get(&kmem_limited_groups, |
| 5180 | if (ret) { | 5182 | 0, MEMCG_CACHES_MAX_SIZE, GFP_KERNEL); |
| 5181 | res_counter_set_limit(&memcg->kmem, RES_COUNTER_MAX); | 5183 | if (memcg_id < 0) { |
| 5182 | goto out; | 5184 | err = memcg_id; |
| 5183 | } | 5185 | goto out; |
| 5184 | static_key_slow_inc(&memcg_kmem_enabled_key); | 5186 | } |
| 5185 | /* | 5187 | |
| 5186 | * setting the active bit after the inc will guarantee no one | 5188 | /* |
| 5187 | * starts accounting before all call sites are patched | 5189 | * Make sure we have enough space for this cgroup in each root cache's |
| 5188 | */ | 5190 | * memcg_params. |
| 5189 | memcg_kmem_set_active(memcg); | 5191 | */ |
| 5190 | } else | 5192 | err = memcg_update_all_caches(memcg_id + 1); |
| 5191 | ret = res_counter_set_limit(&memcg->kmem, val); | 5193 | if (err) |
| 5194 | goto out_rmid; | ||
| 5195 | |||
| 5196 | memcg->kmemcg_id = memcg_id; | ||
| 5197 | INIT_LIST_HEAD(&memcg->memcg_slab_caches); | ||
| 5198 | mutex_init(&memcg->slab_caches_mutex); | ||
| 5199 | |||
| 5200 | /* | ||
| 5201 | * We couldn't have accounted to this cgroup, because it hasn't got the | ||
| 5202 | * active bit set yet, so this should succeed. | ||
| 5203 | */ | ||
| 5204 | err = res_counter_set_limit(&memcg->kmem, limit); | ||
| 5205 | VM_BUG_ON(err); | ||
| 5206 | |||
| 5207 | static_key_slow_inc(&memcg_kmem_enabled_key); | ||
| 5208 | /* | ||
| 5209 | * Setting the active bit after enabling static branching will | ||
| 5210 | * guarantee no one starts accounting before all call sites are | ||
| 5211 | * patched. | ||
| 5212 | */ | ||
| 5213 | memcg_kmem_set_active(memcg); | ||
| 5192 | out: | 5214 | out: |
| 5193 | mutex_unlock(&set_limit_mutex); | 5215 | memcg_resume_kmem_account(); |
| 5194 | mutex_unlock(&memcg_create_mutex); | 5216 | return err; |
| 5195 | #endif | 5217 | |
| 5218 | out_rmid: | ||
| 5219 | ida_simple_remove(&kmem_limited_groups, memcg_id); | ||
| 5220 | goto out; | ||
| 5221 | } | ||
| 5222 | |||
| 5223 | static int memcg_activate_kmem(struct mem_cgroup *memcg, | ||
| 5224 | unsigned long long limit) | ||
| 5225 | { | ||
| 5226 | int ret; | ||
| 5227 | |||
| 5228 | mutex_lock(&activate_kmem_mutex); | ||
| 5229 | ret = __memcg_activate_kmem(memcg, limit); | ||
| 5230 | mutex_unlock(&activate_kmem_mutex); | ||
| 5231 | return ret; | ||
| 5232 | } | ||
| 5233 | |||
| 5234 | static int memcg_update_kmem_limit(struct mem_cgroup *memcg, | ||
| 5235 | unsigned long long val) | ||
| 5236 | { | ||
| 5237 | int ret; | ||
| 5238 | |||
| 5239 | if (!memcg_kmem_is_active(memcg)) | ||
| 5240 | ret = memcg_activate_kmem(memcg, val); | ||
| 5241 | else | ||
| 5242 | ret = res_counter_set_limit(&memcg->kmem, val); | ||
| 5196 | return ret; | 5243 | return ret; |
| 5197 | } | 5244 | } |
| 5198 | 5245 | ||
| 5199 | #ifdef CONFIG_MEMCG_KMEM | ||
| 5200 | static int memcg_propagate_kmem(struct mem_cgroup *memcg) | 5246 | static int memcg_propagate_kmem(struct mem_cgroup *memcg) |
| 5201 | { | 5247 | { |
| 5202 | int ret = 0; | 5248 | int ret = 0; |
| 5203 | struct mem_cgroup *parent = parent_mem_cgroup(memcg); | 5249 | struct mem_cgroup *parent = parent_mem_cgroup(memcg); |
| 5204 | if (!parent) | ||
| 5205 | goto out; | ||
| 5206 | 5250 | ||
| 5207 | memcg->kmem_account_flags = parent->kmem_account_flags; | 5251 | if (!parent) |
| 5208 | /* | 5252 | return 0; |
| 5209 | * When that happen, we need to disable the static branch only on those | ||
| 5210 | * memcgs that enabled it. To achieve this, we would be forced to | ||
| 5211 | * complicate the code by keeping track of which memcgs were the ones | ||
| 5212 | * that actually enabled limits, and which ones got it from its | ||
| 5213 | * parents. | ||
| 5214 | * | ||
| 5215 | * It is a lot simpler just to do static_key_slow_inc() on every child | ||
| 5216 | * that is accounted. | ||
| 5217 | */ | ||
| 5218 | if (!memcg_kmem_is_active(memcg)) | ||
| 5219 | goto out; | ||
| 5220 | 5253 | ||
| 5254 | mutex_lock(&activate_kmem_mutex); | ||
| 5221 | /* | 5255 | /* |
| 5222 | * __mem_cgroup_free() will issue static_key_slow_dec() because this | 5256 | * If the parent cgroup is not kmem-active now, it cannot be activated |
| 5223 | * memcg is active already. If the later initialization fails then the | 5257 | * after this point, because it has at least one child already. |
| 5224 | * cgroup core triggers the cleanup so we do not have to do it here. | ||
| 5225 | */ | 5258 | */ |
| 5226 | static_key_slow_inc(&memcg_kmem_enabled_key); | 5259 | if (memcg_kmem_is_active(parent)) |
| 5227 | 5260 | ret = __memcg_activate_kmem(memcg, RES_COUNTER_MAX); | |
| 5228 | mutex_lock(&set_limit_mutex); | 5261 | mutex_unlock(&activate_kmem_mutex); |
| 5229 | memcg_stop_kmem_account(); | ||
| 5230 | ret = memcg_update_cache_sizes(memcg); | ||
| 5231 | memcg_resume_kmem_account(); | ||
| 5232 | mutex_unlock(&set_limit_mutex); | ||
| 5233 | out: | ||
| 5234 | return ret; | 5262 | return ret; |
| 5235 | } | 5263 | } |
| 5264 | #else | ||
| 5265 | static int memcg_update_kmem_limit(struct mem_cgroup *memcg, | ||
| 5266 | unsigned long long val) | ||
| 5267 | { | ||
| 5268 | return -EINVAL; | ||
| 5269 | } | ||
| 5236 | #endif /* CONFIG_MEMCG_KMEM */ | 5270 | #endif /* CONFIG_MEMCG_KMEM */ |
| 5237 | 5271 | ||
| 5238 | /* | 5272 | /* |
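__memcg_activate_kmem() above chains several failure-prone steps: reserve an id from the kmem_limited_groups IDA, grow every root cache's memcg_caches[] array to fit it, set the kmem limit, enable the static key, and only then mark the group active, releasing the id if the array resize fails. Ignoring the memcg_create_mutex child/task check and the stop/resume of kmem accounting, the control flow reduces to the usual reserve-then-undo ladder (a schematic, not the function itself):

static int activate_kmem_sketch(struct mem_cgroup *memcg, unsigned long long limit)
{
	int err, id;

	if (memcg_kmem_is_active(memcg))
		return 0;				/* already active: nothing to do */

	id = ida_simple_get(&kmem_limited_groups, 0,
			    MEMCG_CACHES_MAX_SIZE, GFP_KERNEL);
	if (id < 0)
		return id;				/* nothing to undo yet */

	err = memcg_update_all_caches(id + 1);		/* make room in every root cache */
	if (err)
		goto out_release_id;

	memcg->kmemcg_id = id;
	err = res_counter_set_limit(&memcg->kmem, limit); /* cannot fail: not active yet */
	VM_BUG_ON(err);

	static_key_slow_inc(&memcg_kmem_enabled_key);
	memcg_kmem_set_active(memcg);			/* last, after the key is patched */
	return 0;

out_release_id:
	ida_simple_remove(&kmem_limited_groups, id);
	return err;
}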
| @@ -5266,7 +5300,7 @@ static int mem_cgroup_write(struct cgroup_subsys_state *css, struct cftype *cft, | |||
| 5266 | else if (type == _MEMSWAP) | 5300 | else if (type == _MEMSWAP) |
| 5267 | ret = mem_cgroup_resize_memsw_limit(memcg, val); | 5301 | ret = mem_cgroup_resize_memsw_limit(memcg, val); |
| 5268 | else if (type == _KMEM) | 5302 | else if (type == _KMEM) |
| 5269 | ret = memcg_update_kmem_limit(css, val); | 5303 | ret = memcg_update_kmem_limit(memcg, val); |
| 5270 | else | 5304 | else |
| 5271 | return -EINVAL; | 5305 | return -EINVAL; |
| 5272 | break; | 5306 | break; |
| @@ -5383,8 +5417,7 @@ static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css, | |||
| 5383 | #endif | 5417 | #endif |
| 5384 | 5418 | ||
| 5385 | #ifdef CONFIG_NUMA | 5419 | #ifdef CONFIG_NUMA |
| 5386 | static int memcg_numa_stat_show(struct cgroup_subsys_state *css, | 5420 | static int memcg_numa_stat_show(struct seq_file *m, void *v) |
| 5387 | struct cftype *cft, struct seq_file *m) | ||
| 5388 | { | 5421 | { |
| 5389 | struct numa_stat { | 5422 | struct numa_stat { |
| 5390 | const char *name; | 5423 | const char *name; |
| @@ -5400,7 +5433,7 @@ static int memcg_numa_stat_show(struct cgroup_subsys_state *css, | |||
| 5400 | const struct numa_stat *stat; | 5433 | const struct numa_stat *stat; |
| 5401 | int nid; | 5434 | int nid; |
| 5402 | unsigned long nr; | 5435 | unsigned long nr; |
| 5403 | struct mem_cgroup *memcg = mem_cgroup_from_css(css); | 5436 | struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); |
| 5404 | 5437 | ||
| 5405 | for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) { | 5438 | for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) { |
| 5406 | nr = mem_cgroup_nr_lru_pages(memcg, stat->lru_mask); | 5439 | nr = mem_cgroup_nr_lru_pages(memcg, stat->lru_mask); |
| @@ -5439,10 +5472,9 @@ static inline void mem_cgroup_lru_names_not_uptodate(void) | |||
| 5439 | BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS); | 5472 | BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS); |
| 5440 | } | 5473 | } |
| 5441 | 5474 | ||
| 5442 | static int memcg_stat_show(struct cgroup_subsys_state *css, struct cftype *cft, | 5475 | static int memcg_stat_show(struct seq_file *m, void *v) |
| 5443 | struct seq_file *m) | ||
| 5444 | { | 5476 | { |
| 5445 | struct mem_cgroup *memcg = mem_cgroup_from_css(css); | 5477 | struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); |
| 5446 | struct mem_cgroup *mi; | 5478 | struct mem_cgroup *mi; |
| 5447 | unsigned int i; | 5479 | unsigned int i; |
| 5448 | 5480 | ||
| @@ -5651,13 +5683,11 @@ static void mem_cgroup_oom_notify(struct mem_cgroup *memcg) | |||
| 5651 | mem_cgroup_oom_notify_cb(iter); | 5683 | mem_cgroup_oom_notify_cb(iter); |
| 5652 | } | 5684 | } |
| 5653 | 5685 | ||
| 5654 | static int mem_cgroup_usage_register_event(struct cgroup_subsys_state *css, | 5686 | static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg, |
| 5655 | struct cftype *cft, struct eventfd_ctx *eventfd, const char *args) | 5687 | struct eventfd_ctx *eventfd, const char *args, enum res_type type) |
| 5656 | { | 5688 | { |
| 5657 | struct mem_cgroup *memcg = mem_cgroup_from_css(css); | ||
| 5658 | struct mem_cgroup_thresholds *thresholds; | 5689 | struct mem_cgroup_thresholds *thresholds; |
| 5659 | struct mem_cgroup_threshold_ary *new; | 5690 | struct mem_cgroup_threshold_ary *new; |
| 5660 | enum res_type type = MEMFILE_TYPE(cft->private); | ||
| 5661 | u64 threshold, usage; | 5691 | u64 threshold, usage; |
| 5662 | int i, size, ret; | 5692 | int i, size, ret; |
| 5663 | 5693 | ||
| @@ -5734,13 +5764,23 @@ unlock: | |||
| 5734 | return ret; | 5764 | return ret; |
| 5735 | } | 5765 | } |
| 5736 | 5766 | ||
| 5737 | static void mem_cgroup_usage_unregister_event(struct cgroup_subsys_state *css, | 5767 | static int mem_cgroup_usage_register_event(struct mem_cgroup *memcg, |
| 5738 | struct cftype *cft, struct eventfd_ctx *eventfd) | 5768 | struct eventfd_ctx *eventfd, const char *args) |
| 5769 | { | ||
| 5770 | return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEM); | ||
| 5771 | } | ||
| 5772 | |||
| 5773 | static int memsw_cgroup_usage_register_event(struct mem_cgroup *memcg, | ||
| 5774 | struct eventfd_ctx *eventfd, const char *args) | ||
| 5775 | { | ||
| 5776 | return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEMSWAP); | ||
| 5777 | } | ||
| 5778 | |||
| 5779 | static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg, | ||
| 5780 | struct eventfd_ctx *eventfd, enum res_type type) | ||
| 5739 | { | 5781 | { |
| 5740 | struct mem_cgroup *memcg = mem_cgroup_from_css(css); | ||
| 5741 | struct mem_cgroup_thresholds *thresholds; | 5782 | struct mem_cgroup_thresholds *thresholds; |
| 5742 | struct mem_cgroup_threshold_ary *new; | 5783 | struct mem_cgroup_threshold_ary *new; |
| 5743 | enum res_type type = MEMFILE_TYPE(cft->private); | ||
| 5744 | u64 usage; | 5784 | u64 usage; |
| 5745 | int i, j, size; | 5785 | int i, j, size; |
| 5746 | 5786 | ||
| @@ -5813,14 +5853,23 @@ unlock: | |||
| 5813 | mutex_unlock(&memcg->thresholds_lock); | 5853 | mutex_unlock(&memcg->thresholds_lock); |
| 5814 | } | 5854 | } |
| 5815 | 5855 | ||
| 5816 | static int mem_cgroup_oom_register_event(struct cgroup_subsys_state *css, | 5856 | static void mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg, |
| 5817 | struct cftype *cft, struct eventfd_ctx *eventfd, const char *args) | 5857 | struct eventfd_ctx *eventfd) |
| 5858 | { | ||
| 5859 | return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEM); | ||
| 5860 | } | ||
| 5861 | |||
| 5862 | static void memsw_cgroup_usage_unregister_event(struct mem_cgroup *memcg, | ||
| 5863 | struct eventfd_ctx *eventfd) | ||
| 5864 | { | ||
| 5865 | return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEMSWAP); | ||
| 5866 | } | ||
| 5867 | |||
| 5868 | static int mem_cgroup_oom_register_event(struct mem_cgroup *memcg, | ||
| 5869 | struct eventfd_ctx *eventfd, const char *args) | ||
| 5818 | { | 5870 | { |
| 5819 | struct mem_cgroup *memcg = mem_cgroup_from_css(css); | ||
| 5820 | struct mem_cgroup_eventfd_list *event; | 5871 | struct mem_cgroup_eventfd_list *event; |
| 5821 | enum res_type type = MEMFILE_TYPE(cft->private); | ||
| 5822 | 5872 | ||
| 5823 | BUG_ON(type != _OOM_TYPE); | ||
| 5824 | event = kmalloc(sizeof(*event), GFP_KERNEL); | 5873 | event = kmalloc(sizeof(*event), GFP_KERNEL); |
| 5825 | if (!event) | 5874 | if (!event) |
| 5826 | return -ENOMEM; | 5875 | return -ENOMEM; |
| @@ -5838,14 +5887,10 @@ static int mem_cgroup_oom_register_event(struct cgroup_subsys_state *css, | |||
| 5838 | return 0; | 5887 | return 0; |
| 5839 | } | 5888 | } |
| 5840 | 5889 | ||
| 5841 | static void mem_cgroup_oom_unregister_event(struct cgroup_subsys_state *css, | 5890 | static void mem_cgroup_oom_unregister_event(struct mem_cgroup *memcg, |
| 5842 | struct cftype *cft, struct eventfd_ctx *eventfd) | 5891 | struct eventfd_ctx *eventfd) |
| 5843 | { | 5892 | { |
| 5844 | struct mem_cgroup *memcg = mem_cgroup_from_css(css); | ||
| 5845 | struct mem_cgroup_eventfd_list *ev, *tmp; | 5893 | struct mem_cgroup_eventfd_list *ev, *tmp; |
| 5846 | enum res_type type = MEMFILE_TYPE(cft->private); | ||
| 5847 | |||
| 5848 | BUG_ON(type != _OOM_TYPE); | ||
| 5849 | 5894 | ||
| 5850 | spin_lock(&memcg_oom_lock); | 5895 | spin_lock(&memcg_oom_lock); |
| 5851 | 5896 | ||
| @@ -5859,17 +5904,12 @@ static void mem_cgroup_oom_unregister_event(struct cgroup_subsys_state *css, | |||
| 5859 | spin_unlock(&memcg_oom_lock); | 5904 | spin_unlock(&memcg_oom_lock); |
| 5860 | } | 5905 | } |
| 5861 | 5906 | ||
| 5862 | static int mem_cgroup_oom_control_read(struct cgroup_subsys_state *css, | 5907 | static int mem_cgroup_oom_control_read(struct seq_file *sf, void *v) |
| 5863 | struct cftype *cft, struct cgroup_map_cb *cb) | ||
| 5864 | { | 5908 | { |
| 5865 | struct mem_cgroup *memcg = mem_cgroup_from_css(css); | 5909 | struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(sf)); |
| 5866 | 5910 | ||
| 5867 | cb->fill(cb, "oom_kill_disable", memcg->oom_kill_disable); | 5911 | seq_printf(sf, "oom_kill_disable %d\n", memcg->oom_kill_disable); |
| 5868 | 5912 | seq_printf(sf, "under_oom %d\n", (bool)atomic_read(&memcg->under_oom)); | |
| 5869 | if (atomic_read(&memcg->under_oom)) | ||
| 5870 | cb->fill(cb, "under_oom", 1); | ||
| 5871 | else | ||
| 5872 | cb->fill(cb, "under_oom", 0); | ||
| 5873 | return 0; | 5913 | return 0; |
| 5874 | } | 5914 | } |
| 5875 | 5915 | ||
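Several hunks in this file (mem_cgroup_read_u64, memcg_stat_show, memcg_numa_stat_show, and mem_cgroup_oom_control_read above) move memcg control files from the old cgroup read callbacks onto the generic read_u64 and seq_file interfaces. The resulting cftype shape is roughly the following; the names and file entries are illustrative, not taken from the patch:

static u64 example_read_u64(struct cgroup_subsys_state *css, struct cftype *cft)
{
	/* single scalar: the cgroup core formats the number itself */
	return mem_cgroup_usage(mem_cgroup_from_css(css), false);
}

static int example_seq_show(struct seq_file *m, void *v)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));

	/* multi-line output: print key/value pairs straight into the seq_file */
	seq_printf(m, "oom_kill_disable %d\n", memcg->oom_kill_disable);
	return 0;
}

static struct cftype example_files[] = {
	{
		.name = "usage_in_bytes",	/* illustrative */
		.read_u64 = example_read_u64,
	},
	{
		.name = "oom_control",		/* illustrative */
		.seq_show = example_seq_show,
	},
	{ }	/* terminator */
};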
| @@ -5962,41 +6002,261 @@ static void kmem_cgroup_css_offline(struct mem_cgroup *memcg) | |||
| 5962 | } | 6002 | } |
| 5963 | #endif | 6003 | #endif |
| 5964 | 6004 | ||
| 6005 | /* | ||
| 6006 | * DO NOT USE IN NEW FILES. | ||
| 6007 | * | ||
| 6008 | * "cgroup.event_control" implementation. | ||
| 6009 | * | ||
| 6010 | * This is way over-engineered. It tries to support fully configurable | ||
| 6011 | * events for each user. Such level of flexibility is completely | ||
| 6012 | * unnecessary especially in the light of the planned unified hierarchy. | ||
| 6013 | * | ||
| 6014 | * Please deprecate this and replace with something simpler if at all | ||
| 6015 | * possible. | ||
| 6016 | */ | ||
| 6017 | |||
| 6018 | /* | ||
| 6019 | * Unregister event and free resources. | ||
| 6020 | * | ||
| 6021 | * Gets called from workqueue. | ||
| 6022 | */ | ||
| 6023 | static void memcg_event_remove(struct work_struct *work) | ||
| 6024 | { | ||
| 6025 | struct mem_cgroup_event *event = | ||
| 6026 | container_of(work, struct mem_cgroup_event, remove); | ||
| 6027 | struct mem_cgroup *memcg = event->memcg; | ||
| 6028 | |||
| 6029 | remove_wait_queue(event->wqh, &event->wait); | ||
| 6030 | |||
| 6031 | event->unregister_event(memcg, event->eventfd); | ||
| 6032 | |||
| 6033 | /* Notify userspace the event is going away. */ | ||
| 6034 | eventfd_signal(event->eventfd, 1); | ||
| 6035 | |||
| 6036 | eventfd_ctx_put(event->eventfd); | ||
| 6037 | kfree(event); | ||
| 6038 | css_put(&memcg->css); | ||
| 6039 | } | ||
| 6040 | |||
| 6041 | /* | ||
| 6042 | * Gets called on POLLHUP on eventfd when user closes it. | ||
| 6043 | * | ||
| 6044 | * Called with wqh->lock held and interrupts disabled. | ||
| 6045 | */ | ||
| 6046 | static int memcg_event_wake(wait_queue_t *wait, unsigned mode, | ||
| 6047 | int sync, void *key) | ||
| 6048 | { | ||
| 6049 | struct mem_cgroup_event *event = | ||
| 6050 | container_of(wait, struct mem_cgroup_event, wait); | ||
| 6051 | struct mem_cgroup *memcg = event->memcg; | ||
| 6052 | unsigned long flags = (unsigned long)key; | ||
| 6053 | |||
| 6054 | if (flags & POLLHUP) { | ||
| 6055 | /* | ||
| 6056 | * If the event has been detached at cgroup removal, we | ||
| 6057 | * can simply return knowing the other side will cleanup | ||
| 6058 | * for us. | ||
| 6059 | * | ||
| 6060 | * We can't race against event freeing since the other | ||
| 6061 | * side will require wqh->lock via remove_wait_queue(), | ||
| 6062 | * which we hold. | ||
| 6063 | */ | ||
| 6064 | spin_lock(&memcg->event_list_lock); | ||
| 6065 | if (!list_empty(&event->list)) { | ||
| 6066 | list_del_init(&event->list); | ||
| 6067 | /* | ||
| 6068 | * We are in atomic context, but memcg_event_remove() | ||
| 6069 | * may sleep, so we have to call it in workqueue. | ||
| 6070 | */ | ||
| 6071 | schedule_work(&event->remove); | ||
| 6072 | } | ||
| 6073 | spin_unlock(&memcg->event_list_lock); | ||
| 6074 | } | ||
| 6075 | |||
| 6076 | return 0; | ||
| 6077 | } | ||
| 6078 | |||
| 6079 | static void memcg_event_ptable_queue_proc(struct file *file, | ||
| 6080 | wait_queue_head_t *wqh, poll_table *pt) | ||
| 6081 | { | ||
| 6082 | struct mem_cgroup_event *event = | ||
| 6083 | container_of(pt, struct mem_cgroup_event, pt); | ||
| 6084 | |||
| 6085 | event->wqh = wqh; | ||
| 6086 | add_wait_queue(wqh, &event->wait); | ||
| 6087 | } | ||
| 6088 | |||
| 6089 | /* | ||
| 6090 | * DO NOT USE IN NEW FILES. | ||
| 6091 | * | ||
| 6092 | * Parse input and register new cgroup event handler. | ||
| 6093 | * | ||
| 6094 | * Input must be in the format '<event_fd> <control_fd> <args>'. | ||
| 6095 | * Interpretation of args is defined by the control file implementation. | ||
| 6096 | */ | ||
| 6097 | static int memcg_write_event_control(struct cgroup_subsys_state *css, | ||
| 6098 | struct cftype *cft, const char *buffer) | ||
| 6099 | { | ||
| 6100 | struct mem_cgroup *memcg = mem_cgroup_from_css(css); | ||
| 6101 | struct mem_cgroup_event *event; | ||
| 6102 | struct cgroup_subsys_state *cfile_css; | ||
| 6103 | unsigned int efd, cfd; | ||
| 6104 | struct fd efile; | ||
| 6105 | struct fd cfile; | ||
| 6106 | const char *name; | ||
| 6107 | char *endp; | ||
| 6108 | int ret; | ||
| 6109 | |||
| 6110 | efd = simple_strtoul(buffer, &endp, 10); | ||
| 6111 | if (*endp != ' ') | ||
| 6112 | return -EINVAL; | ||
| 6113 | buffer = endp + 1; | ||
| 6114 | |||
| 6115 | cfd = simple_strtoul(buffer, &endp, 10); | ||
| 6116 | if ((*endp != ' ') && (*endp != '\0')) | ||
| 6117 | return -EINVAL; | ||
| 6118 | buffer = endp + 1; | ||
| 6119 | |||
| 6120 | event = kzalloc(sizeof(*event), GFP_KERNEL); | ||
| 6121 | if (!event) | ||
| 6122 | return -ENOMEM; | ||
| 6123 | |||
| 6124 | event->memcg = memcg; | ||
| 6125 | INIT_LIST_HEAD(&event->list); | ||
| 6126 | init_poll_funcptr(&event->pt, memcg_event_ptable_queue_proc); | ||
| 6127 | init_waitqueue_func_entry(&event->wait, memcg_event_wake); | ||
| 6128 | INIT_WORK(&event->remove, memcg_event_remove); | ||
| 6129 | |||
| 6130 | efile = fdget(efd); | ||
| 6131 | if (!efile.file) { | ||
| 6132 | ret = -EBADF; | ||
| 6133 | goto out_kfree; | ||
| 6134 | } | ||
| 6135 | |||
| 6136 | event->eventfd = eventfd_ctx_fileget(efile.file); | ||
| 6137 | if (IS_ERR(event->eventfd)) { | ||
| 6138 | ret = PTR_ERR(event->eventfd); | ||
| 6139 | goto out_put_efile; | ||
| 6140 | } | ||
| 6141 | |||
| 6142 | cfile = fdget(cfd); | ||
| 6143 | if (!cfile.file) { | ||
| 6144 | ret = -EBADF; | ||
| 6145 | goto out_put_eventfd; | ||
| 6146 | } | ||
| 6147 | |||
| 6148 | /* the process needs read permission on the control file */ | ||
| 6149 | /* AV: shouldn't we check that it's been opened for read instead? */ | ||
| 6150 | ret = inode_permission(file_inode(cfile.file), MAY_READ); | ||
| 6151 | if (ret < 0) | ||
| 6152 | goto out_put_cfile; | ||
| 6153 | |||
| 6154 | /* | ||
| 6155 | * Determine the event callbacks and set them in @event. This used | ||
| 6156 | * to be done via struct cftype but cgroup core no longer knows | ||
| 6157 | * about these events. The following is crude but the whole thing | ||
| 6158 | * is for compatibility anyway. | ||
| 6159 | * | ||
| 6160 | * DO NOT ADD NEW FILES. | ||
| 6161 | */ | ||
| 6162 | name = cfile.file->f_dentry->d_name.name; | ||
| 6163 | |||
| 6164 | if (!strcmp(name, "memory.usage_in_bytes")) { | ||
| 6165 | event->register_event = mem_cgroup_usage_register_event; | ||
| 6166 | event->unregister_event = mem_cgroup_usage_unregister_event; | ||
| 6167 | } else if (!strcmp(name, "memory.oom_control")) { | ||
| 6168 | event->register_event = mem_cgroup_oom_register_event; | ||
| 6169 | event->unregister_event = mem_cgroup_oom_unregister_event; | ||
| 6170 | } else if (!strcmp(name, "memory.pressure_level")) { | ||
| 6171 | event->register_event = vmpressure_register_event; | ||
| 6172 | event->unregister_event = vmpressure_unregister_event; | ||
| 6173 | } else if (!strcmp(name, "memory.memsw.usage_in_bytes")) { | ||
| 6174 | event->register_event = memsw_cgroup_usage_register_event; | ||
| 6175 | event->unregister_event = memsw_cgroup_usage_unregister_event; | ||
| 6176 | } else { | ||
| 6177 | ret = -EINVAL; | ||
| 6178 | goto out_put_cfile; | ||
| 6179 | } | ||
| 6180 | |||
| 6181 | /* | ||
| 6182 | * Verify that @cfile belongs to @css. Also, remaining events are | ||
| 6183 | * automatically removed on cgroup destruction but the removal is | ||
| 6184 | * asynchronous, so take an extra ref on @css. | ||
| 6185 | */ | ||
| 6186 | rcu_read_lock(); | ||
| 6187 | |||
| 6188 | ret = -EINVAL; | ||
| 6189 | cfile_css = css_from_dir(cfile.file->f_dentry->d_parent, | ||
| 6190 | &mem_cgroup_subsys); | ||
| 6191 | if (cfile_css == css && css_tryget(css)) | ||
| 6192 | ret = 0; | ||
| 6193 | |||
| 6194 | rcu_read_unlock(); | ||
| 6195 | if (ret) | ||
| 6196 | goto out_put_cfile; | ||
| 6197 | |||
| 6198 | ret = event->register_event(memcg, event->eventfd, buffer); | ||
| 6199 | if (ret) | ||
| 6200 | goto out_put_css; | ||
| 6201 | |||
| 6202 | efile.file->f_op->poll(efile.file, &event->pt); | ||
| 6203 | |||
| 6204 | spin_lock(&memcg->event_list_lock); | ||
| 6205 | list_add(&event->list, &memcg->event_list); | ||
| 6206 | spin_unlock(&memcg->event_list_lock); | ||
| 6207 | |||
| 6208 | fdput(cfile); | ||
| 6209 | fdput(efile); | ||
| 6210 | |||
| 6211 | return 0; | ||
| 6212 | |||
| 6213 | out_put_css: | ||
| 6214 | css_put(css); | ||
| 6215 | out_put_cfile: | ||
| 6216 | fdput(cfile); | ||
| 6217 | out_put_eventfd: | ||
| 6218 | eventfd_ctx_put(event->eventfd); | ||
| 6219 | out_put_efile: | ||
| 6220 | fdput(efile); | ||
| 6221 | out_kfree: | ||
| 6222 | kfree(event); | ||
| 6223 | |||
| 6224 | return ret; | ||
| 6225 | } | ||
| 6226 | |||
| 5965 | static struct cftype mem_cgroup_files[] = { | 6227 | static struct cftype mem_cgroup_files[] = { |
| 5966 | { | 6228 | { |
| 5967 | .name = "usage_in_bytes", | 6229 | .name = "usage_in_bytes", |
| 5968 | .private = MEMFILE_PRIVATE(_MEM, RES_USAGE), | 6230 | .private = MEMFILE_PRIVATE(_MEM, RES_USAGE), |
| 5969 | .read = mem_cgroup_read, | 6231 | .read_u64 = mem_cgroup_read_u64, |
| 5970 | .register_event = mem_cgroup_usage_register_event, | ||
| 5971 | .unregister_event = mem_cgroup_usage_unregister_event, | ||
| 5972 | }, | 6232 | }, |
| 5973 | { | 6233 | { |
| 5974 | .name = "max_usage_in_bytes", | 6234 | .name = "max_usage_in_bytes", |
| 5975 | .private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE), | 6235 | .private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE), |
| 5976 | .trigger = mem_cgroup_reset, | 6236 | .trigger = mem_cgroup_reset, |
| 5977 | .read = mem_cgroup_read, | 6237 | .read_u64 = mem_cgroup_read_u64, |
| 5978 | }, | 6238 | }, |
| 5979 | { | 6239 | { |
| 5980 | .name = "limit_in_bytes", | 6240 | .name = "limit_in_bytes", |
| 5981 | .private = MEMFILE_PRIVATE(_MEM, RES_LIMIT), | 6241 | .private = MEMFILE_PRIVATE(_MEM, RES_LIMIT), |
| 5982 | .write_string = mem_cgroup_write, | 6242 | .write_string = mem_cgroup_write, |
| 5983 | .read = mem_cgroup_read, | 6243 | .read_u64 = mem_cgroup_read_u64, |
| 5984 | }, | 6244 | }, |
| 5985 | { | 6245 | { |
| 5986 | .name = "soft_limit_in_bytes", | 6246 | .name = "soft_limit_in_bytes", |
| 5987 | .private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT), | 6247 | .private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT), |
| 5988 | .write_string = mem_cgroup_write, | 6248 | .write_string = mem_cgroup_write, |
| 5989 | .read = mem_cgroup_read, | 6249 | .read_u64 = mem_cgroup_read_u64, |
| 5990 | }, | 6250 | }, |
| 5991 | { | 6251 | { |
| 5992 | .name = "failcnt", | 6252 | .name = "failcnt", |
| 5993 | .private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT), | 6253 | .private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT), |
| 5994 | .trigger = mem_cgroup_reset, | 6254 | .trigger = mem_cgroup_reset, |
| 5995 | .read = mem_cgroup_read, | 6255 | .read_u64 = mem_cgroup_read_u64, |
| 5996 | }, | 6256 | }, |
| 5997 | { | 6257 | { |
| 5998 | .name = "stat", | 6258 | .name = "stat", |
| 5999 | .read_seq_string = memcg_stat_show, | 6259 | .seq_show = memcg_stat_show, |
| 6000 | }, | 6260 | }, |
| 6001 | { | 6261 | { |
| 6002 | .name = "force_empty", | 6262 | .name = "force_empty", |
| @@ -6009,6 +6269,12 @@ static struct cftype mem_cgroup_files[] = { | |||
| 6009 | .read_u64 = mem_cgroup_hierarchy_read, | 6269 | .read_u64 = mem_cgroup_hierarchy_read, |
| 6010 | }, | 6270 | }, |
| 6011 | { | 6271 | { |
| 6272 | .name = "cgroup.event_control", /* XXX: for compat */ | ||
| 6273 | .write_string = memcg_write_event_control, | ||
| 6274 | .flags = CFTYPE_NO_PREFIX, | ||
| 6275 | .mode = S_IWUGO, | ||
| 6276 | }, | ||
| 6277 | { | ||
| 6012 | .name = "swappiness", | 6278 | .name = "swappiness", |
| 6013 | .read_u64 = mem_cgroup_swappiness_read, | 6279 | .read_u64 = mem_cgroup_swappiness_read, |
| 6014 | .write_u64 = mem_cgroup_swappiness_write, | 6280 | .write_u64 = mem_cgroup_swappiness_write, |
| @@ -6020,21 +6286,17 @@ static struct cftype mem_cgroup_files[] = { | |||
| 6020 | }, | 6286 | }, |
| 6021 | { | 6287 | { |
| 6022 | .name = "oom_control", | 6288 | .name = "oom_control", |
| 6023 | .read_map = mem_cgroup_oom_control_read, | 6289 | .seq_show = mem_cgroup_oom_control_read, |
| 6024 | .write_u64 = mem_cgroup_oom_control_write, | 6290 | .write_u64 = mem_cgroup_oom_control_write, |
| 6025 | .register_event = mem_cgroup_oom_register_event, | ||
| 6026 | .unregister_event = mem_cgroup_oom_unregister_event, | ||
| 6027 | .private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL), | 6291 | .private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL), |
| 6028 | }, | 6292 | }, |
| 6029 | { | 6293 | { |
| 6030 | .name = "pressure_level", | 6294 | .name = "pressure_level", |
| 6031 | .register_event = vmpressure_register_event, | ||
| 6032 | .unregister_event = vmpressure_unregister_event, | ||
| 6033 | }, | 6295 | }, |
| 6034 | #ifdef CONFIG_NUMA | 6296 | #ifdef CONFIG_NUMA |
| 6035 | { | 6297 | { |
| 6036 | .name = "numa_stat", | 6298 | .name = "numa_stat", |
| 6037 | .read_seq_string = memcg_numa_stat_show, | 6299 | .seq_show = memcg_numa_stat_show, |
| 6038 | }, | 6300 | }, |
| 6039 | #endif | 6301 | #endif |
| 6040 | #ifdef CONFIG_MEMCG_KMEM | 6302 | #ifdef CONFIG_MEMCG_KMEM |
| @@ -6042,29 +6304,29 @@ static struct cftype mem_cgroup_files[] = { | |||
| 6042 | .name = "kmem.limit_in_bytes", | 6304 | .name = "kmem.limit_in_bytes", |
| 6043 | .private = MEMFILE_PRIVATE(_KMEM, RES_LIMIT), | 6305 | .private = MEMFILE_PRIVATE(_KMEM, RES_LIMIT), |
| 6044 | .write_string = mem_cgroup_write, | 6306 | .write_string = mem_cgroup_write, |
| 6045 | .read = mem_cgroup_read, | 6307 | .read_u64 = mem_cgroup_read_u64, |
| 6046 | }, | 6308 | }, |
| 6047 | { | 6309 | { |
| 6048 | .name = "kmem.usage_in_bytes", | 6310 | .name = "kmem.usage_in_bytes", |
| 6049 | .private = MEMFILE_PRIVATE(_KMEM, RES_USAGE), | 6311 | .private = MEMFILE_PRIVATE(_KMEM, RES_USAGE), |
| 6050 | .read = mem_cgroup_read, | 6312 | .read_u64 = mem_cgroup_read_u64, |
| 6051 | }, | 6313 | }, |
| 6052 | { | 6314 | { |
| 6053 | .name = "kmem.failcnt", | 6315 | .name = "kmem.failcnt", |
| 6054 | .private = MEMFILE_PRIVATE(_KMEM, RES_FAILCNT), | 6316 | .private = MEMFILE_PRIVATE(_KMEM, RES_FAILCNT), |
| 6055 | .trigger = mem_cgroup_reset, | 6317 | .trigger = mem_cgroup_reset, |
| 6056 | .read = mem_cgroup_read, | 6318 | .read_u64 = mem_cgroup_read_u64, |
| 6057 | }, | 6319 | }, |
| 6058 | { | 6320 | { |
| 6059 | .name = "kmem.max_usage_in_bytes", | 6321 | .name = "kmem.max_usage_in_bytes", |
| 6060 | .private = MEMFILE_PRIVATE(_KMEM, RES_MAX_USAGE), | 6322 | .private = MEMFILE_PRIVATE(_KMEM, RES_MAX_USAGE), |
| 6061 | .trigger = mem_cgroup_reset, | 6323 | .trigger = mem_cgroup_reset, |
| 6062 | .read = mem_cgroup_read, | 6324 | .read_u64 = mem_cgroup_read_u64, |
| 6063 | }, | 6325 | }, |
| 6064 | #ifdef CONFIG_SLABINFO | 6326 | #ifdef CONFIG_SLABINFO |
| 6065 | { | 6327 | { |
| 6066 | .name = "kmem.slabinfo", | 6328 | .name = "kmem.slabinfo", |
| 6067 | .read_seq_string = mem_cgroup_slabinfo_read, | 6329 | .seq_show = mem_cgroup_slabinfo_read, |
| 6068 | }, | 6330 | }, |
| 6069 | #endif | 6331 | #endif |
| 6070 | #endif | 6332 | #endif |
| @@ -6076,27 +6338,25 @@ static struct cftype memsw_cgroup_files[] = { | |||
| 6076 | { | 6338 | { |
| 6077 | .name = "memsw.usage_in_bytes", | 6339 | .name = "memsw.usage_in_bytes", |
| 6078 | .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE), | 6340 | .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE), |
| 6079 | .read = mem_cgroup_read, | 6341 | .read_u64 = mem_cgroup_read_u64, |
| 6080 | .register_event = mem_cgroup_usage_register_event, | ||
| 6081 | .unregister_event = mem_cgroup_usage_unregister_event, | ||
| 6082 | }, | 6342 | }, |
| 6083 | { | 6343 | { |
| 6084 | .name = "memsw.max_usage_in_bytes", | 6344 | .name = "memsw.max_usage_in_bytes", |
| 6085 | .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE), | 6345 | .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE), |
| 6086 | .trigger = mem_cgroup_reset, | 6346 | .trigger = mem_cgroup_reset, |
| 6087 | .read = mem_cgroup_read, | 6347 | .read_u64 = mem_cgroup_read_u64, |
| 6088 | }, | 6348 | }, |
| 6089 | { | 6349 | { |
| 6090 | .name = "memsw.limit_in_bytes", | 6350 | .name = "memsw.limit_in_bytes", |
| 6091 | .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT), | 6351 | .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT), |
| 6092 | .write_string = mem_cgroup_write, | 6352 | .write_string = mem_cgroup_write, |
| 6093 | .read = mem_cgroup_read, | 6353 | .read_u64 = mem_cgroup_read_u64, |
| 6094 | }, | 6354 | }, |
| 6095 | { | 6355 | { |
| 6096 | .name = "memsw.failcnt", | 6356 | .name = "memsw.failcnt", |
| 6097 | .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT), | 6357 | .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT), |
| 6098 | .trigger = mem_cgroup_reset, | 6358 | .trigger = mem_cgroup_reset, |
| 6099 | .read = mem_cgroup_read, | 6359 | .read_u64 = mem_cgroup_read_u64, |
| 6100 | }, | 6360 | }, |
| 6101 | { }, /* terminate */ | 6361 | { }, /* terminate */ |
| 6102 | }; | 6362 | }; |
| @@ -6139,14 +6399,12 @@ static void free_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node) | |||
| 6139 | static struct mem_cgroup *mem_cgroup_alloc(void) | 6399 | static struct mem_cgroup *mem_cgroup_alloc(void) |
| 6140 | { | 6400 | { |
| 6141 | struct mem_cgroup *memcg; | 6401 | struct mem_cgroup *memcg; |
| 6142 | size_t size = memcg_size(); | 6402 | size_t size; |
| 6143 | 6403 | ||
| 6144 | /* Can be very big if nr_node_ids is very big */ | 6404 | size = sizeof(struct mem_cgroup); |
| 6145 | if (size < PAGE_SIZE) | 6405 | size += nr_node_ids * sizeof(struct mem_cgroup_per_node *); |
| 6146 | memcg = kzalloc(size, GFP_KERNEL); | ||
| 6147 | else | ||
| 6148 | memcg = vzalloc(size); | ||
| 6149 | 6406 | ||
| 6407 | memcg = kzalloc(size, GFP_KERNEL); | ||
| 6150 | if (!memcg) | 6408 | if (!memcg) |
| 6151 | return NULL; | 6409 | return NULL; |
| 6152 | 6410 | ||
| @@ -6157,10 +6415,7 @@ static struct mem_cgroup *mem_cgroup_alloc(void) | |||
| 6157 | return memcg; | 6415 | return memcg; |
| 6158 | 6416 | ||
| 6159 | out_free: | 6417 | out_free: |
| 6160 | if (size < PAGE_SIZE) | 6418 | kfree(memcg); |
| 6161 | kfree(memcg); | ||
| 6162 | else | ||
| 6163 | vfree(memcg); | ||
| 6164 | return NULL; | 6419 | return NULL; |
| 6165 | } | 6420 | } |
| 6166 | 6421 | ||
| @@ -6178,7 +6433,6 @@ out_free: | |||
| 6178 | static void __mem_cgroup_free(struct mem_cgroup *memcg) | 6433 | static void __mem_cgroup_free(struct mem_cgroup *memcg) |
| 6179 | { | 6434 | { |
| 6180 | int node; | 6435 | int node; |
| 6181 | size_t size = memcg_size(); | ||
| 6182 | 6436 | ||
| 6183 | mem_cgroup_remove_from_trees(memcg); | 6437 | mem_cgroup_remove_from_trees(memcg); |
| 6184 | 6438 | ||
| @@ -6199,10 +6453,7 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg) | |||
| 6199 | * the cgroup_lock. | 6453 | * the cgroup_lock. |
| 6200 | */ | 6454 | */ |
| 6201 | disarm_static_keys(memcg); | 6455 | disarm_static_keys(memcg); |
| 6202 | if (size < PAGE_SIZE) | 6456 | kfree(memcg); |
| 6203 | kfree(memcg); | ||
| 6204 | else | ||
| 6205 | vfree(memcg); | ||
| 6206 | } | 6457 | } |
| 6207 | 6458 | ||
| 6208 | /* | 6459 | /* |
| @@ -6268,6 +6519,8 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) | |||
| 6268 | mutex_init(&memcg->thresholds_lock); | 6519 | mutex_init(&memcg->thresholds_lock); |
| 6269 | spin_lock_init(&memcg->move_lock); | 6520 | spin_lock_init(&memcg->move_lock); |
| 6270 | vmpressure_init(&memcg->vmpressure); | 6521 | vmpressure_init(&memcg->vmpressure); |
| 6522 | INIT_LIST_HEAD(&memcg->event_list); | ||
| 6523 | spin_lock_init(&memcg->event_list_lock); | ||
| 6271 | 6524 | ||
| 6272 | return &memcg->css; | 6525 | return &memcg->css; |
| 6273 | 6526 | ||
| @@ -6281,7 +6534,6 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css) | |||
| 6281 | { | 6534 | { |
| 6282 | struct mem_cgroup *memcg = mem_cgroup_from_css(css); | 6535 | struct mem_cgroup *memcg = mem_cgroup_from_css(css); |
| 6283 | struct mem_cgroup *parent = mem_cgroup_from_css(css_parent(css)); | 6536 | struct mem_cgroup *parent = mem_cgroup_from_css(css_parent(css)); |
| 6284 | int error = 0; | ||
| 6285 | 6537 | ||
| 6286 | if (css->cgroup->id > MEM_CGROUP_ID_MAX) | 6538 | if (css->cgroup->id > MEM_CGROUP_ID_MAX) |
| 6287 | return -ENOSPC; | 6539 | return -ENOSPC; |
| @@ -6316,10 +6568,9 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css) | |||
| 6316 | if (parent != root_mem_cgroup) | 6568 | if (parent != root_mem_cgroup) |
| 6317 | mem_cgroup_subsys.broken_hierarchy = true; | 6569 | mem_cgroup_subsys.broken_hierarchy = true; |
| 6318 | } | 6570 | } |
| 6319 | |||
| 6320 | error = memcg_init_kmem(memcg, &mem_cgroup_subsys); | ||
| 6321 | mutex_unlock(&memcg_create_mutex); | 6571 | mutex_unlock(&memcg_create_mutex); |
| 6322 | return error; | 6572 | |
| 6573 | return memcg_init_kmem(memcg, &mem_cgroup_subsys); | ||
| 6323 | } | 6574 | } |
| 6324 | 6575 | ||
| 6325 | /* | 6576 | /* |
| @@ -6343,11 +6594,32 @@ static void mem_cgroup_invalidate_reclaim_iterators(struct mem_cgroup *memcg) | |||
| 6343 | static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) | 6594 | static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) |
| 6344 | { | 6595 | { |
| 6345 | struct mem_cgroup *memcg = mem_cgroup_from_css(css); | 6596 | struct mem_cgroup *memcg = mem_cgroup_from_css(css); |
| 6597 | struct mem_cgroup_event *event, *tmp; | ||
| 6598 | struct cgroup_subsys_state *iter; | ||
| 6599 | |||
| 6600 | /* | ||
| 6601 | * Unregister events and notify userspace. | ||
| 6602 | * Notify userspace about cgroup removal only after rmdir of the cgroup | ||
| 6603 | * directory to avoid a race between userspace and kernelspace. | ||
| 6604 | */ | ||
| 6605 | spin_lock(&memcg->event_list_lock); | ||
| 6606 | list_for_each_entry_safe(event, tmp, &memcg->event_list, list) { | ||
| 6607 | list_del_init(&event->list); | ||
| 6608 | schedule_work(&event->remove); | ||
| 6609 | } | ||
| 6610 | spin_unlock(&memcg->event_list_lock); | ||
| 6346 | 6611 | ||
| 6347 | kmem_cgroup_css_offline(memcg); | 6612 | kmem_cgroup_css_offline(memcg); |
| 6348 | 6613 | ||
| 6349 | mem_cgroup_invalidate_reclaim_iterators(memcg); | 6614 | mem_cgroup_invalidate_reclaim_iterators(memcg); |
| 6350 | mem_cgroup_reparent_charges(memcg); | 6615 | |
| 6616 | /* | ||
| 6617 | * This requires that offlining is serialized. Right now that is | ||
| 6618 | * guaranteed because css_killed_work_fn() holds the cgroup_mutex. | ||
| 6619 | */ | ||
| 6620 | css_for_each_descendant_post(iter, css) | ||
| 6621 | mem_cgroup_reparent_charges(mem_cgroup_from_css(iter)); | ||
| 6622 | |||
| 6351 | mem_cgroup_destroy_all_caches(memcg); | 6623 | mem_cgroup_destroy_all_caches(memcg); |
| 6352 | vmpressure_cleanup(&memcg->vmpressure); | 6624 | vmpressure_cleanup(&memcg->vmpressure); |
| 6353 | } | 6625 | } |
| @@ -6615,7 +6887,7 @@ static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma, | |||
| 6615 | enum mc_target_type ret = MC_TARGET_NONE; | 6887 | enum mc_target_type ret = MC_TARGET_NONE; |
| 6616 | 6888 | ||
| 6617 | page = pmd_page(pmd); | 6889 | page = pmd_page(pmd); |
| 6618 | VM_BUG_ON(!page || !PageHead(page)); | 6890 | VM_BUG_ON_PAGE(!page || !PageHead(page), page); |
| 6619 | if (!move_anon()) | 6891 | if (!move_anon()) |
| 6620 | return ret; | 6892 | return ret; |
| 6621 | pc = lookup_page_cgroup(page); | 6893 | pc = lookup_page_cgroup(page); |
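
For context on the "<event_fd> <control_fd> <args>" format parsed by memcg_write_event_control() above, here is a minimal userspace sketch of registering such an event. It is a hedged illustration only: the cgroup path and the 64M threshold are assumptions, and error handling is omitted.

/* Sketch: arm a memory.usage_in_bytes threshold via cgroup.event_control. */
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/eventfd.h>
#include <unistd.h>

int main(void)
{
	const char *cg = "/sys/fs/cgroup/memory/mygroup"; /* assumed mount point */
	char path[256], cmd[64];
	uint64_t count;
	int efd, cfd, ecfd;

	efd = eventfd(0, 0);                    /* fd the kernel will signal */

	snprintf(path, sizeof(path), "%s/memory.usage_in_bytes", cg);
	cfd = open(path, O_RDONLY);             /* control file; MAY_READ is checked */

	snprintf(path, sizeof(path), "%s/cgroup.event_control", cg);
	ecfd = open(path, O_WRONLY);

	/* "<event_fd> <control_fd> <args>": args is a usage threshold in bytes. */
	snprintf(cmd, sizeof(cmd), "%d %d %llu", efd, cfd, 64ULL << 20);
	write(ecfd, cmd, strlen(cmd));

	read(efd, &count, sizeof(count));       /* blocks until the threshold fires */
	printf("event signalled %llu time(s)\n", (unsigned long long)count);
	return 0;
}

This write is exactly what memcg_write_event_control() consumes: the two descriptors are resolved with fdget(), the callbacks are chosen by the control file's name, and the remainder of the buffer is handed to register_event().
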
diff --git a/mm/memory-failure.c b/mm/memory-failure.c index fabe55046c1d..90002ea43638 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c | |||
| @@ -611,7 +611,7 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn) | |||
| 611 | } | 611 | } |
| 612 | 612 | ||
| 613 | /* | 613 | /* |
| 614 | * Dirty cache page page | 614 | * Dirty pagecache page |
| 615 | * Issues: when the error hit a hole page the error is not properly | 615 | * Issues: when the error hit a hole page the error is not properly |
| 616 | * propagated. | 616 | * propagated. |
| 617 | */ | 617 | */ |
| @@ -856,14 +856,14 @@ static int page_action(struct page_state *ps, struct page *p, | |||
| 856 | * the pages and send SIGBUS to the processes if the data was dirty. | 856 | * the pages and send SIGBUS to the processes if the data was dirty. |
| 857 | */ | 857 | */ |
| 858 | static int hwpoison_user_mappings(struct page *p, unsigned long pfn, | 858 | static int hwpoison_user_mappings(struct page *p, unsigned long pfn, |
| 859 | int trapno, int flags) | 859 | int trapno, int flags, struct page **hpagep) |
| 860 | { | 860 | { |
| 861 | enum ttu_flags ttu = TTU_UNMAP | TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS; | 861 | enum ttu_flags ttu = TTU_UNMAP | TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS; |
| 862 | struct address_space *mapping; | 862 | struct address_space *mapping; |
| 863 | LIST_HEAD(tokill); | 863 | LIST_HEAD(tokill); |
| 864 | int ret; | 864 | int ret; |
| 865 | int kill = 1, forcekill; | 865 | int kill = 1, forcekill; |
| 866 | struct page *hpage = compound_head(p); | 866 | struct page *hpage = *hpagep; |
| 867 | struct page *ppage; | 867 | struct page *ppage; |
| 868 | 868 | ||
| 869 | if (PageReserved(p) || PageSlab(p)) | 869 | if (PageReserved(p) || PageSlab(p)) |
| @@ -942,11 +942,16 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, | |||
| 942 | * We pinned the head page for hwpoison handling, | 942 | * We pinned the head page for hwpoison handling, |
| 943 | * now we split the thp and we are interested in | 943 | * now we split the thp and we are interested in |
| 944 | * the hwpoisoned raw page, so move the refcount | 944 | * the hwpoisoned raw page, so move the refcount |
| 945 | * to it. | 945 | * to it. Similarly, page lock is shifted. |
| 946 | */ | 946 | */ |
| 947 | if (hpage != p) { | 947 | if (hpage != p) { |
| 948 | put_page(hpage); | 948 | if (!(flags & MF_COUNT_INCREASED)) { |
| 949 | get_page(p); | 949 | put_page(hpage); |
| 950 | get_page(p); | ||
| 951 | } | ||
| 952 | lock_page(p); | ||
| 953 | unlock_page(hpage); | ||
| 954 | *hpagep = p; | ||
| 950 | } | 955 | } |
| 951 | /* THP is split, so ppage should be the real poisoned page. */ | 956 | /* THP is split, so ppage should be the real poisoned page. */ |
| 952 | ppage = p; | 957 | ppage = p; |
| @@ -964,17 +969,11 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, | |||
| 964 | if (kill) | 969 | if (kill) |
| 965 | collect_procs(ppage, &tokill); | 970 | collect_procs(ppage, &tokill); |
| 966 | 971 | ||
| 967 | if (hpage != ppage) | ||
| 968 | lock_page(ppage); | ||
| 969 | |||
| 970 | ret = try_to_unmap(ppage, ttu); | 972 | ret = try_to_unmap(ppage, ttu); |
| 971 | if (ret != SWAP_SUCCESS) | 973 | if (ret != SWAP_SUCCESS) |
| 972 | printk(KERN_ERR "MCE %#lx: failed to unmap page (mapcount=%d)\n", | 974 | printk(KERN_ERR "MCE %#lx: failed to unmap page (mapcount=%d)\n", |
| 973 | pfn, page_mapcount(ppage)); | 975 | pfn, page_mapcount(ppage)); |
| 974 | 976 | ||
| 975 | if (hpage != ppage) | ||
| 976 | unlock_page(ppage); | ||
| 977 | |||
| 978 | /* | 977 | /* |
| 979 | * Now that the dirty bit has been propagated to the | 978 | * Now that the dirty bit has been propagated to the |
| 980 | * struct page and all unmaps done we can decide if | 979 | * struct page and all unmaps done we can decide if |
| @@ -1193,8 +1192,12 @@ int memory_failure(unsigned long pfn, int trapno, int flags) | |||
| 1193 | /* | 1192 | /* |
| 1194 | * Now take care of user space mappings. | 1193 | * Now take care of user space mappings. |
| 1195 | * Abort on fail: __delete_from_page_cache() assumes unmapped page. | 1194 | * Abort on fail: __delete_from_page_cache() assumes unmapped page. |
| 1195 | * | ||
| 1196 | * When the raw error page is a thp tail page, hpage points to the raw | ||
| 1197 | * page after the thp split. | ||
| 1196 | */ | 1198 | */ |
| 1197 | if (hwpoison_user_mappings(p, pfn, trapno, flags) != SWAP_SUCCESS) { | 1199 | if (hwpoison_user_mappings(p, pfn, trapno, flags, &hpage) |
| 1200 | != SWAP_SUCCESS) { | ||
| 1198 | printk(KERN_ERR "MCE %#lx: cannot unmap page, give up\n", pfn); | 1201 | printk(KERN_ERR "MCE %#lx: cannot unmap page, give up\n", pfn); |
| 1199 | res = -EBUSY; | 1202 | res = -EBUSY; |
| 1200 | goto out; | 1203 | goto out; |
| @@ -1585,7 +1588,13 @@ static int __soft_offline_page(struct page *page, int flags) | |||
| 1585 | ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, | 1588 | ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, |
| 1586 | MIGRATE_SYNC, MR_MEMORY_FAILURE); | 1589 | MIGRATE_SYNC, MR_MEMORY_FAILURE); |
| 1587 | if (ret) { | 1590 | if (ret) { |
| 1588 | putback_lru_pages(&pagelist); | 1591 | if (!list_empty(&pagelist)) { |
| 1592 | list_del(&page->lru); | ||
| 1593 | dec_zone_page_state(page, NR_ISOLATED_ANON + | ||
| 1594 | page_is_file_cache(page)); | ||
| 1595 | putback_lru_page(page); | ||
| 1596 | } | ||
| 1597 | |||
| 1589 | pr_info("soft offline: %#lx: migration failed %d, type %lx\n", | 1598 | pr_info("soft offline: %#lx: migration failed %d, type %lx\n", |
| 1590 | pfn, ret, page->flags); | 1599 | pfn, ret, page->flags); |
| 1591 | if (ret > 0) | 1600 | if (ret > 0) |
| @@ -1642,7 +1651,7 @@ int soft_offline_page(struct page *page, int flags) | |||
| 1642 | { | 1651 | { |
| 1643 | int ret; | 1652 | int ret; |
| 1644 | unsigned long pfn = page_to_pfn(page); | 1653 | unsigned long pfn = page_to_pfn(page); |
| 1645 | struct page *hpage = compound_trans_head(page); | 1654 | struct page *hpage = compound_head(page); |
| 1646 | 1655 | ||
| 1647 | if (PageHWPoison(page)) { | 1656 | if (PageHWPoison(page)) { |
| 1648 | pr_info("soft offline: %#lx page already poisoned\n", pfn); | 1657 | pr_info("soft offline: %#lx page already poisoned\n", pfn); |
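
hwpoison_user_mappings() now takes struct page **hpagep and, after splitting a thp, retargets it to the raw poisoned page while shifting both the pin and the page lock, so callers must keep using the possibly-updated pointer. The wrapper below is only a hedged sketch of that calling convention; its name and the surrounding error handling are assumptions, not code from this patch.

/* Illustrative only: shows the by-reference hpage contract. */
static int kill_user_mappings_sketch(struct page *p, unsigned long pfn,
				     int trapno, int flags)
{
	struct page *hpage = compound_head(p);
	int res = 0;

	lock_page(hpage);
	/*
	 * If p was a thp tail page, the helper splits the thp, moves the
	 * refcount and the page lock to the raw page and updates hpage,
	 * so hpage may now equal p rather than the old head page.
	 */
	if (hwpoison_user_mappings(p, pfn, trapno, flags, &hpage) != SWAP_SUCCESS)
		res = -EBUSY;
	unlock_page(hpage);	/* unlock whichever page is locked now */
	return res;
}
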
diff --git a/mm/memory.c b/mm/memory.c index 6768ce9e57d2..22dfa617bddb 100644 --- a/mm/memory.c +++ b/mm/memory.c | |||
| @@ -59,6 +59,7 @@ | |||
| 59 | #include <linux/gfp.h> | 59 | #include <linux/gfp.h> |
| 60 | #include <linux/migrate.h> | 60 | #include <linux/migrate.h> |
| 61 | #include <linux/string.h> | 61 | #include <linux/string.h> |
| 62 | #include <linux/dma-debug.h> | ||
| 62 | 63 | ||
| 63 | #include <asm/io.h> | 64 | #include <asm/io.h> |
| 64 | #include <asm/pgalloc.h> | 65 | #include <asm/pgalloc.h> |
| @@ -288,7 +289,7 @@ int __tlb_remove_page(struct mmu_gather *tlb, struct page *page) | |||
| 288 | return 0; | 289 | return 0; |
| 289 | batch = tlb->active; | 290 | batch = tlb->active; |
| 290 | } | 291 | } |
| 291 | VM_BUG_ON(batch->nr > batch->max); | 292 | VM_BUG_ON_PAGE(batch->nr > batch->max, page); |
| 292 | 293 | ||
| 293 | return batch->max - batch->nr; | 294 | return batch->max - batch->nr; |
| 294 | } | 295 | } |
| @@ -670,7 +671,7 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr, | |||
| 670 | current->comm, | 671 | current->comm, |
| 671 | (long long)pte_val(pte), (long long)pmd_val(*pmd)); | 672 | (long long)pte_val(pte), (long long)pmd_val(*pmd)); |
| 672 | if (page) | 673 | if (page) |
| 673 | dump_page(page); | 674 | dump_page(page, "bad pte"); |
| 674 | printk(KERN_ALERT | 675 | printk(KERN_ALERT |
| 675 | "addr:%p vm_flags:%08lx anon_vma:%p mapping:%p index:%lx\n", | 676 | "addr:%p vm_flags:%08lx anon_vma:%p mapping:%p index:%lx\n", |
| 676 | (void *)addr, vma->vm_flags, vma->anon_vma, mapping, index); | 677 | (void *)addr, vma->vm_flags, vma->anon_vma, mapping, index); |
| @@ -2559,6 +2560,8 @@ static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd, | |||
| 2559 | 2560 | ||
| 2560 | static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va, struct vm_area_struct *vma) | 2561 | static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va, struct vm_area_struct *vma) |
| 2561 | { | 2562 | { |
| 2563 | debug_dma_assert_idle(src); | ||
| 2564 | |||
| 2562 | /* | 2565 | /* |
| 2563 | * If the source page was a PFN mapping, we don't have | 2566 | * If the source page was a PFN mapping, we don't have |
| 2564 | * a "struct page" for it. We do a best-effort copy by | 2567 | * a "struct page" for it. We do a best-effort copy by |
| @@ -2699,7 +2702,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 2699 | goto unwritable_page; | 2702 | goto unwritable_page; |
| 2700 | } | 2703 | } |
| 2701 | } else | 2704 | } else |
| 2702 | VM_BUG_ON(!PageLocked(old_page)); | 2705 | VM_BUG_ON_PAGE(!PageLocked(old_page), old_page); |
| 2703 | 2706 | ||
| 2704 | /* | 2707 | /* |
| 2705 | * Since we dropped the lock we need to revalidate | 2708 | * Since we dropped the lock we need to revalidate |
| @@ -3345,6 +3348,7 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 3345 | if (ret & VM_FAULT_LOCKED) | 3348 | if (ret & VM_FAULT_LOCKED) |
| 3346 | unlock_page(vmf.page); | 3349 | unlock_page(vmf.page); |
| 3347 | ret = VM_FAULT_HWPOISON; | 3350 | ret = VM_FAULT_HWPOISON; |
| 3351 | page_cache_release(vmf.page); | ||
| 3348 | goto uncharge_out; | 3352 | goto uncharge_out; |
| 3349 | } | 3353 | } |
| 3350 | 3354 | ||
| @@ -3355,7 +3359,7 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 3355 | if (unlikely(!(ret & VM_FAULT_LOCKED))) | 3359 | if (unlikely(!(ret & VM_FAULT_LOCKED))) |
| 3356 | lock_page(vmf.page); | 3360 | lock_page(vmf.page); |
| 3357 | else | 3361 | else |
| 3358 | VM_BUG_ON(!PageLocked(vmf.page)); | 3362 | VM_BUG_ON_PAGE(!PageLocked(vmf.page), vmf.page); |
| 3359 | 3363 | ||
| 3360 | /* | 3364 | /* |
| 3361 | * Should we do an early C-O-W break? | 3365 | * Should we do an early C-O-W break? |
| @@ -3392,7 +3396,7 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 3392 | goto unwritable_page; | 3396 | goto unwritable_page; |
| 3393 | } | 3397 | } |
| 3394 | } else | 3398 | } else |
| 3395 | VM_BUG_ON(!PageLocked(page)); | 3399 | VM_BUG_ON_PAGE(!PageLocked(page), page); |
| 3396 | page_mkwrite = 1; | 3400 | page_mkwrite = 1; |
| 3397 | } | 3401 | } |
| 3398 | } | 3402 | } |
| @@ -3700,7 +3704,6 @@ static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 3700 | if (unlikely(is_vm_hugetlb_page(vma))) | 3704 | if (unlikely(is_vm_hugetlb_page(vma))) |
| 3701 | return hugetlb_fault(mm, vma, address, flags); | 3705 | return hugetlb_fault(mm, vma, address, flags); |
| 3702 | 3706 | ||
| 3703 | retry: | ||
| 3704 | pgd = pgd_offset(mm, address); | 3707 | pgd = pgd_offset(mm, address); |
| 3705 | pud = pud_alloc(mm, pgd, address); | 3708 | pud = pud_alloc(mm, pgd, address); |
| 3706 | if (!pud) | 3709 | if (!pud) |
| @@ -3738,20 +3741,13 @@ retry: | |||
| 3738 | if (dirty && !pmd_write(orig_pmd)) { | 3741 | if (dirty && !pmd_write(orig_pmd)) { |
| 3739 | ret = do_huge_pmd_wp_page(mm, vma, address, pmd, | 3742 | ret = do_huge_pmd_wp_page(mm, vma, address, pmd, |
| 3740 | orig_pmd); | 3743 | orig_pmd); |
| 3741 | /* | 3744 | if (!(ret & VM_FAULT_FALLBACK)) |
| 3742 | * If COW results in an oom, the huge pmd will | 3745 | return ret; |
| 3743 | * have been split, so retry the fault on the | ||
| 3744 | * pte for a smaller charge. | ||
| 3745 | */ | ||
| 3746 | if (unlikely(ret & VM_FAULT_OOM)) | ||
| 3747 | goto retry; | ||
| 3748 | return ret; | ||
| 3749 | } else { | 3746 | } else { |
| 3750 | huge_pmd_set_accessed(mm, vma, address, pmd, | 3747 | huge_pmd_set_accessed(mm, vma, address, pmd, |
| 3751 | orig_pmd, dirty); | 3748 | orig_pmd, dirty); |
| 3749 | return 0; | ||
| 3752 | } | 3750 | } |
| 3753 | |||
| 3754 | return 0; | ||
| 3755 | } | 3751 | } |
| 3756 | } | 3752 | } |
| 3757 | 3753 | ||
| @@ -4272,11 +4268,20 @@ void copy_user_huge_page(struct page *dst, struct page *src, | |||
| 4272 | #endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */ | 4268 | #endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */ |
| 4273 | 4269 | ||
| 4274 | #if USE_SPLIT_PTE_PTLOCKS && ALLOC_SPLIT_PTLOCKS | 4270 | #if USE_SPLIT_PTE_PTLOCKS && ALLOC_SPLIT_PTLOCKS |
| 4271 | |||
| 4272 | static struct kmem_cache *page_ptl_cachep; | ||
| 4273 | |||
| 4274 | void __init ptlock_cache_init(void) | ||
| 4275 | { | ||
| 4276 | page_ptl_cachep = kmem_cache_create("page->ptl", sizeof(spinlock_t), 0, | ||
| 4277 | SLAB_PANIC, NULL); | ||
| 4278 | } | ||
| 4279 | |||
| 4275 | bool ptlock_alloc(struct page *page) | 4280 | bool ptlock_alloc(struct page *page) |
| 4276 | { | 4281 | { |
| 4277 | spinlock_t *ptl; | 4282 | spinlock_t *ptl; |
| 4278 | 4283 | ||
| 4279 | ptl = kmalloc(sizeof(spinlock_t), GFP_KERNEL); | 4284 | ptl = kmem_cache_alloc(page_ptl_cachep, GFP_KERNEL); |
| 4280 | if (!ptl) | 4285 | if (!ptl) |
| 4281 | return false; | 4286 | return false; |
| 4282 | page->ptl = ptl; | 4287 | page->ptl = ptl; |
| @@ -4285,6 +4290,6 @@ bool ptlock_alloc(struct page *page) | |||
| 4285 | 4290 | ||
| 4286 | void ptlock_free(struct page *page) | 4291 | void ptlock_free(struct page *page) |
| 4287 | { | 4292 | { |
| 4288 | kfree(page->ptl); | 4293 | kmem_cache_free(page_ptl_cachep, page->ptl); |
| 4289 | } | 4294 | } |
| 4290 | #endif | 4295 | #endif |
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 489f235502db..a650db29606f 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c | |||
| @@ -9,7 +9,6 @@ | |||
| 9 | #include <linux/swap.h> | 9 | #include <linux/swap.h> |
| 10 | #include <linux/interrupt.h> | 10 | #include <linux/interrupt.h> |
| 11 | #include <linux/pagemap.h> | 11 | #include <linux/pagemap.h> |
| 12 | #include <linux/bootmem.h> | ||
| 13 | #include <linux/compiler.h> | 12 | #include <linux/compiler.h> |
| 14 | #include <linux/export.h> | 13 | #include <linux/export.h> |
| 15 | #include <linux/pagevec.h> | 14 | #include <linux/pagevec.h> |
| @@ -269,7 +268,7 @@ static void fix_zone_id(struct zone *zone, unsigned long start_pfn, | |||
| 269 | } | 268 | } |
| 270 | 269 | ||
| 271 | /* Can fail with -ENOMEM from allocating a wait table with vmalloc() or | 270 | /* Can fail with -ENOMEM from allocating a wait table with vmalloc() or |
| 272 | * alloc_bootmem_node_nopanic() */ | 271 | * alloc_bootmem_node_nopanic()/memblock_virt_alloc_node_nopanic() */ |
| 273 | static int __ref ensure_zone_is_initialized(struct zone *zone, | 272 | static int __ref ensure_zone_is_initialized(struct zone *zone, |
| 274 | unsigned long start_pfn, unsigned long num_pages) | 273 | unsigned long start_pfn, unsigned long num_pages) |
| 275 | { | 274 | { |
| @@ -1108,17 +1107,18 @@ int __ref add_memory(int nid, u64 start, u64 size) | |||
| 1108 | if (ret) | 1107 | if (ret) |
| 1109 | return ret; | 1108 | return ret; |
| 1110 | 1109 | ||
| 1111 | lock_memory_hotplug(); | ||
| 1112 | |||
| 1113 | res = register_memory_resource(start, size); | 1110 | res = register_memory_resource(start, size); |
| 1114 | ret = -EEXIST; | 1111 | ret = -EEXIST; |
| 1115 | if (!res) | 1112 | if (!res) |
| 1116 | goto out; | 1113 | return ret; |
| 1117 | 1114 | ||
| 1118 | { /* Stupid hack to suppress address-never-null warning */ | 1115 | { /* Stupid hack to suppress address-never-null warning */ |
| 1119 | void *p = NODE_DATA(nid); | 1116 | void *p = NODE_DATA(nid); |
| 1120 | new_pgdat = !p; | 1117 | new_pgdat = !p; |
| 1121 | } | 1118 | } |
| 1119 | |||
| 1120 | lock_memory_hotplug(); | ||
| 1121 | |||
| 1122 | new_node = !node_online(nid); | 1122 | new_node = !node_online(nid); |
| 1123 | if (new_node) { | 1123 | if (new_node) { |
| 1124 | pgdat = hotadd_new_pgdat(nid, start); | 1124 | pgdat = hotadd_new_pgdat(nid, start); |
| @@ -1310,7 +1310,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) | |||
| 1310 | #ifdef CONFIG_DEBUG_VM | 1310 | #ifdef CONFIG_DEBUG_VM |
| 1311 | printk(KERN_ALERT "removing pfn %lx from LRU failed\n", | 1311 | printk(KERN_ALERT "removing pfn %lx from LRU failed\n", |
| 1312 | pfn); | 1312 | pfn); |
| 1313 | dump_page(page); | 1313 | dump_page(page, "failed to remove from LRU"); |
| 1314 | #endif | 1314 | #endif |
| 1315 | put_page(page); | 1315 | put_page(page); |
| 1316 | /* Because we don't have big zone->lock. we should | 1316 | /* Because we don't have big zone->lock. we should |
| @@ -1446,6 +1446,7 @@ static int __init cmdline_parse_movable_node(char *p) | |||
| 1446 | * the kernel away from hotpluggable memory. | 1446 | * the kernel away from hotpluggable memory. |
| 1447 | */ | 1447 | */ |
| 1448 | memblock_set_bottom_up(true); | 1448 | memblock_set_bottom_up(true); |
| 1449 | movable_node_enabled = true; | ||
| 1449 | #else | 1450 | #else |
| 1450 | pr_warn("movable_node option not supported\n"); | 1451 | pr_warn("movable_node option not supported\n"); |
| 1451 | #endif | 1452 | #endif |
diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 0cd2c4d4e270..ae3c8f3595d4 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c | |||
| @@ -613,7 +613,7 @@ static inline int queue_pages_pgd_range(struct vm_area_struct *vma, | |||
| 613 | return 0; | 613 | return 0; |
| 614 | } | 614 | } |
| 615 | 615 | ||
| 616 | #ifdef CONFIG_ARCH_USES_NUMA_PROT_NONE | 616 | #ifdef CONFIG_NUMA_BALANCING |
| 617 | /* | 617 | /* |
| 618 | * This is used to mark a range of virtual addresses to be inaccessible. | 618 | * This is used to mark a range of virtual addresses to be inaccessible. |
| 619 | * These are later cleared by a NUMA hinting fault. Depending on these | 619 | * These are later cleared by a NUMA hinting fault. Depending on these |
| @@ -627,7 +627,6 @@ unsigned long change_prot_numa(struct vm_area_struct *vma, | |||
| 627 | unsigned long addr, unsigned long end) | 627 | unsigned long addr, unsigned long end) |
| 628 | { | 628 | { |
| 629 | int nr_updated; | 629 | int nr_updated; |
| 630 | BUILD_BUG_ON(_PAGE_NUMA != _PAGE_PROTNONE); | ||
| 631 | 630 | ||
| 632 | nr_updated = change_protection(vma, addr, end, vma->vm_page_prot, 0, 1); | 631 | nr_updated = change_protection(vma, addr, end, vma->vm_page_prot, 0, 1); |
| 633 | if (nr_updated) | 632 | if (nr_updated) |
| @@ -641,7 +640,7 @@ static unsigned long change_prot_numa(struct vm_area_struct *vma, | |||
| 641 | { | 640 | { |
| 642 | return 0; | 641 | return 0; |
| 643 | } | 642 | } |
| 644 | #endif /* CONFIG_ARCH_USES_NUMA_PROT_NONE */ | 643 | #endif /* CONFIG_NUMA_BALANCING */ |
| 645 | 644 | ||
| 646 | /* | 645 | /* |
| 647 | * Walk through page tables and collect pages to be migrated. | 646 | * Walk through page tables and collect pages to be migrated. |
| @@ -1199,10 +1198,8 @@ static struct page *new_vma_page(struct page *page, unsigned long private, int * | |||
| 1199 | } | 1198 | } |
| 1200 | 1199 | ||
| 1201 | if (PageHuge(page)) { | 1200 | if (PageHuge(page)) { |
| 1202 | if (vma) | 1201 | BUG_ON(!vma); |
| 1203 | return alloc_huge_page_noerr(vma, address, 1); | 1202 | return alloc_huge_page_noerr(vma, address, 1); |
| 1204 | else | ||
| 1205 | return NULL; | ||
| 1206 | } | 1203 | } |
| 1207 | /* | 1204 | /* |
| 1208 | * if !vma, alloc_page_vma() will use task or system default policy | 1205 | * if !vma, alloc_page_vma() will use task or system default policy |
| @@ -2657,7 +2654,7 @@ void mpol_free_shared_policy(struct shared_policy *p) | |||
| 2657 | } | 2654 | } |
| 2658 | 2655 | ||
| 2659 | #ifdef CONFIG_NUMA_BALANCING | 2656 | #ifdef CONFIG_NUMA_BALANCING |
| 2660 | static bool __initdata numabalancing_override; | 2657 | static int __initdata numabalancing_override; |
| 2661 | 2658 | ||
| 2662 | static void __init check_numabalancing_enable(void) | 2659 | static void __init check_numabalancing_enable(void) |
| 2663 | { | 2660 | { |
| @@ -2666,9 +2663,15 @@ static void __init check_numabalancing_enable(void) | |||
| 2666 | if (IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED)) | 2663 | if (IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED)) |
| 2667 | numabalancing_default = true; | 2664 | numabalancing_default = true; |
| 2668 | 2665 | ||
| 2666 | /* Parsed by setup_numabalancing. override == 1 enables, -1 disables */ | ||
| 2667 | if (numabalancing_override) | ||
| 2668 | set_numabalancing_state(numabalancing_override == 1); | ||
| 2669 | |||
| 2669 | if (nr_node_ids > 1 && !numabalancing_override) { | 2670 | if (nr_node_ids > 1 && !numabalancing_override) { |
| 2670 | printk(KERN_INFO "Enabling automatic NUMA balancing. " | 2671 | pr_info("%s automatic NUMA balancing. " |
| 2671 | "Configure with numa_balancing= or sysctl"); | 2672 | "Configure with numa_balancing= or the " |
| 2673 | "kernel.numa_balancing sysctl", | ||
| 2674 | numabalancing_default ? "Enabling" : "Disabling"); | ||
| 2672 | set_numabalancing_state(numabalancing_default); | 2675 | set_numabalancing_state(numabalancing_default); |
| 2673 | } | 2676 | } |
| 2674 | } | 2677 | } |
| @@ -2678,18 +2681,17 @@ static int __init setup_numabalancing(char *str) | |||
| 2678 | int ret = 0; | 2681 | int ret = 0; |
| 2679 | if (!str) | 2682 | if (!str) |
| 2680 | goto out; | 2683 | goto out; |
| 2681 | numabalancing_override = true; | ||
| 2682 | 2684 | ||
| 2683 | if (!strcmp(str, "enable")) { | 2685 | if (!strcmp(str, "enable")) { |
| 2684 | set_numabalancing_state(true); | 2686 | numabalancing_override = 1; |
| 2685 | ret = 1; | 2687 | ret = 1; |
| 2686 | } else if (!strcmp(str, "disable")) { | 2688 | } else if (!strcmp(str, "disable")) { |
| 2687 | set_numabalancing_state(false); | 2689 | numabalancing_override = -1; |
| 2688 | ret = 1; | 2690 | ret = 1; |
| 2689 | } | 2691 | } |
| 2690 | out: | 2692 | out: |
| 2691 | if (!ret) | 2693 | if (!ret) |
| 2692 | printk(KERN_WARNING "Unable to parse numa_balancing=\n"); | 2694 | pr_warn("Unable to parse numa_balancing=\n"); |
| 2693 | 2695 | ||
| 2694 | return ret; | 2696 | return ret; |
| 2695 | } | 2697 | } |
| @@ -2928,7 +2930,7 @@ void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol) | |||
| 2928 | unsigned short mode = MPOL_DEFAULT; | 2930 | unsigned short mode = MPOL_DEFAULT; |
| 2929 | unsigned short flags = 0; | 2931 | unsigned short flags = 0; |
| 2930 | 2932 | ||
| 2931 | if (pol && pol != &default_policy) { | 2933 | if (pol && pol != &default_policy && !(pol->flags & MPOL_F_MORON)) { |
| 2932 | mode = pol->mode; | 2934 | mode = pol->mode; |
| 2933 | flags = pol->flags; | 2935 | flags = pol->flags; |
| 2934 | } | 2936 | } |
diff --git a/mm/migrate.c b/mm/migrate.c index 9194375b2307..bed48809e5d0 100644 --- a/mm/migrate.c +++ b/mm/migrate.c | |||
| @@ -72,28 +72,12 @@ int migrate_prep_local(void) | |||
| 72 | } | 72 | } |
| 73 | 73 | ||
| 74 | /* | 74 | /* |
| 75 | * Add isolated pages on the list back to the LRU under page lock | ||
| 76 | * to avoid leaking evictable pages back onto unevictable list. | ||
| 77 | */ | ||
| 78 | void putback_lru_pages(struct list_head *l) | ||
| 79 | { | ||
| 80 | struct page *page; | ||
| 81 | struct page *page2; | ||
| 82 | |||
| 83 | list_for_each_entry_safe(page, page2, l, lru) { | ||
| 84 | list_del(&page->lru); | ||
| 85 | dec_zone_page_state(page, NR_ISOLATED_ANON + | ||
| 86 | page_is_file_cache(page)); | ||
| 87 | putback_lru_page(page); | ||
| 88 | } | ||
| 89 | } | ||
| 90 | |||
| 91 | /* | ||
| 92 | * Put previously isolated pages back onto the appropriate lists | 75 | * Put previously isolated pages back onto the appropriate lists |
| 93 | * from where they were once taken off for compaction/migration. | 76 | * from where they were once taken off for compaction/migration. |
| 94 | * | 77 | * |
| 95 | * This function shall be used instead of putback_lru_pages(), | 78 | * This function shall be used whenever the isolated pageset has been |
| 96 | * whenever the isolated pageset has been built by isolate_migratepages_range() | 79 | * built from lru, balloon, or hugetlbfs pages. See isolate_migratepages_range() |
| 80 | * and isolate_huge_page(). | ||
| 97 | */ | 81 | */ |
| 98 | void putback_movable_pages(struct list_head *l) | 82 | void putback_movable_pages(struct list_head *l) |
| 99 | { | 83 | { |
| @@ -194,12 +178,49 @@ out: | |||
| 194 | } | 178 | } |
| 195 | 179 | ||
| 196 | /* | 180 | /* |
| 181 | * Congratulations to trinity for discovering this bug. | ||
| 182 | * mm/fremap.c's remap_file_pages() accepts any range within a single vma to | ||
| 183 | * convert that vma to VM_NONLINEAR; and generic_file_remap_pages() will then | ||
| 184 | * replace the specified range by file ptes throughout (maybe populated after). | ||
| 185 | * If page migration finds a page within that range, while it's still located | ||
| 186 | * by vma_interval_tree rather than lost to i_mmap_nonlinear list, no problem: | ||
| 187 | * zap_pte() clears the temporary migration entry before mmap_sem is dropped. | ||
| 188 | * But if the migrating page is in a part of the vma outside the range to be | ||
| 189 | * remapped, then it will not be cleared, and remove_migration_ptes() needs to | ||
| 190 | * deal with it. Fortunately, this part of the vma is of course still linear, | ||
| 191 | * so we just need to use linear location on the nonlinear list. | ||
| 192 | */ | ||
| 193 | static int remove_linear_migration_ptes_from_nonlinear(struct page *page, | ||
| 194 | struct address_space *mapping, void *arg) | ||
| 195 | { | ||
| 196 | struct vm_area_struct *vma; | ||
| 197 | /* hugetlbfs does not support remap_pages, so no huge pgoff worries */ | ||
| 198 | pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); | ||
| 199 | unsigned long addr; | ||
| 200 | |||
| 201 | list_for_each_entry(vma, | ||
| 202 | &mapping->i_mmap_nonlinear, shared.nonlinear) { | ||
| 203 | |||
| 204 | addr = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); | ||
| 205 | if (addr >= vma->vm_start && addr < vma->vm_end) | ||
| 206 | remove_migration_pte(page, vma, addr, arg); | ||
| 207 | } | ||
| 208 | return SWAP_AGAIN; | ||
| 209 | } | ||
| 210 | |||
| 211 | /* | ||
| 197 | * Get rid of all migration entries and replace them by | 212 | * Get rid of all migration entries and replace them by |
| 198 | * references to the indicated page. | 213 | * references to the indicated page. |
| 199 | */ | 214 | */ |
| 200 | static void remove_migration_ptes(struct page *old, struct page *new) | 215 | static void remove_migration_ptes(struct page *old, struct page *new) |
| 201 | { | 216 | { |
| 202 | rmap_walk(new, remove_migration_pte, old); | 217 | struct rmap_walk_control rwc = { |
| 218 | .rmap_one = remove_migration_pte, | ||
| 219 | .arg = old, | ||
| 220 | .file_nonlinear = remove_linear_migration_ptes_from_nonlinear, | ||
| 221 | }; | ||
| 222 | |||
| 223 | rmap_walk(new, &rwc); | ||
| 203 | } | 224 | } |
| 204 | 225 | ||
| 205 | /* | 226 | /* |
| @@ -510,7 +531,7 @@ void migrate_page_copy(struct page *newpage, struct page *page) | |||
| 510 | if (PageUptodate(page)) | 531 | if (PageUptodate(page)) |
| 511 | SetPageUptodate(newpage); | 532 | SetPageUptodate(newpage); |
| 512 | if (TestClearPageActive(page)) { | 533 | if (TestClearPageActive(page)) { |
| 513 | VM_BUG_ON(PageUnevictable(page)); | 534 | VM_BUG_ON_PAGE(PageUnevictable(page), page); |
| 514 | SetPageActive(newpage); | 535 | SetPageActive(newpage); |
| 515 | } else if (TestClearPageUnevictable(page)) | 536 | } else if (TestClearPageUnevictable(page)) |
| 516 | SetPageUnevictable(newpage); | 537 | SetPageUnevictable(newpage); |
| @@ -563,14 +584,6 @@ void migrate_page_copy(struct page *newpage, struct page *page) | |||
| 563 | * Migration functions | 584 | * Migration functions |
| 564 | ***********************************************************/ | 585 | ***********************************************************/ |
| 565 | 586 | ||
| 566 | /* Always fail migration. Used for mappings that are not movable */ | ||
| 567 | int fail_migrate_page(struct address_space *mapping, | ||
| 568 | struct page *newpage, struct page *page) | ||
| 569 | { | ||
| 570 | return -EIO; | ||
| 571 | } | ||
| 572 | EXPORT_SYMBOL(fail_migrate_page); | ||
| 573 | |||
| 574 | /* | 587 | /* |
| 575 | * Common logic to directly migrate a single page suitable for | 588 | * Common logic to directly migrate a single page suitable for |
| 576 | * pages that do not use PagePrivate/PagePrivate2. | 589 | * pages that do not use PagePrivate/PagePrivate2. |
| @@ -890,7 +903,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage, | |||
| 890 | * free the metadata, so the page can be freed. | 903 | * free the metadata, so the page can be freed. |
| 891 | */ | 904 | */ |
| 892 | if (!page->mapping) { | 905 | if (!page->mapping) { |
| 893 | VM_BUG_ON(PageAnon(page)); | 906 | VM_BUG_ON_PAGE(PageAnon(page), page); |
| 894 | if (page_has_private(page)) { | 907 | if (page_has_private(page)) { |
| 895 | try_to_free_buffers(page); | 908 | try_to_free_buffers(page); |
| 896 | goto uncharge; | 909 | goto uncharge; |
| @@ -1008,7 +1021,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, | |||
| 1008 | { | 1021 | { |
| 1009 | int rc = 0; | 1022 | int rc = 0; |
| 1010 | int *result = NULL; | 1023 | int *result = NULL; |
| 1011 | struct page *new_hpage = get_new_page(hpage, private, &result); | 1024 | struct page *new_hpage; |
| 1012 | struct anon_vma *anon_vma = NULL; | 1025 | struct anon_vma *anon_vma = NULL; |
| 1013 | 1026 | ||
| 1014 | /* | 1027 | /* |
| @@ -1018,9 +1031,12 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, | |||
| 1018 | * tables or check whether the hugepage is pmd-based or not before | 1031 | * tables or check whether the hugepage is pmd-based or not before |
| 1019 | * kicking migration. | 1032 | * kicking migration. |
| 1020 | */ | 1033 | */ |
| 1021 | if (!hugepage_migration_support(page_hstate(hpage))) | 1034 | if (!hugepage_migration_support(page_hstate(hpage))) { |
| 1035 | putback_active_hugepage(hpage); | ||
| 1022 | return -ENOSYS; | 1036 | return -ENOSYS; |
| 1037 | } | ||
| 1023 | 1038 | ||
| 1039 | new_hpage = get_new_page(hpage, private, &result); | ||
| 1024 | if (!new_hpage) | 1040 | if (!new_hpage) |
| 1025 | return -ENOMEM; | 1041 | return -ENOMEM; |
| 1026 | 1042 | ||
| @@ -1120,7 +1136,12 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page, | |||
| 1120 | nr_succeeded++; | 1136 | nr_succeeded++; |
| 1121 | break; | 1137 | break; |
| 1122 | default: | 1138 | default: |
| 1123 | /* Permanent failure */ | 1139 | /* |
| 1140 | * Permanent failure (-EBUSY, -ENOSYS, etc.): | ||
| 1141 | * unlike the -EAGAIN case, the failed page is | ||
| 1142 | * removed from the migration page list and not | ||
| 1143 | * retried in the next outer loop. | ||
| 1144 | */ | ||
| 1124 | nr_failed++; | 1145 | nr_failed++; |
| 1125 | break; | 1146 | break; |
| 1126 | } | 1147 | } |
| @@ -1169,7 +1190,7 @@ static struct page *new_page_node(struct page *p, unsigned long private, | |||
| 1169 | pm->node); | 1190 | pm->node); |
| 1170 | else | 1191 | else |
| 1171 | return alloc_pages_exact_node(pm->node, | 1192 | return alloc_pages_exact_node(pm->node, |
| 1172 | GFP_HIGHUSER_MOVABLE | GFP_THISNODE, 0); | 1193 | GFP_HIGHUSER_MOVABLE | __GFP_THISNODE, 0); |
| 1173 | } | 1194 | } |
| 1174 | 1195 | ||
| 1175 | /* | 1196 | /* |
| @@ -1555,12 +1576,10 @@ static struct page *alloc_misplaced_dst_page(struct page *page, | |||
| 1555 | struct page *newpage; | 1576 | struct page *newpage; |
| 1556 | 1577 | ||
| 1557 | newpage = alloc_pages_exact_node(nid, | 1578 | newpage = alloc_pages_exact_node(nid, |
| 1558 | (GFP_HIGHUSER_MOVABLE | GFP_THISNODE | | 1579 | (GFP_HIGHUSER_MOVABLE | |
| 1559 | __GFP_NOMEMALLOC | __GFP_NORETRY | | 1580 | __GFP_THISNODE | __GFP_NOMEMALLOC | |
| 1560 | __GFP_NOWARN) & | 1581 | __GFP_NORETRY | __GFP_NOWARN) & |
| 1561 | ~GFP_IOFS, 0); | 1582 | ~GFP_IOFS, 0); |
| 1562 | if (newpage) | ||
| 1563 | page_cpupid_xchg_last(newpage, page_cpupid_last(page)); | ||
| 1564 | 1583 | ||
| 1565 | return newpage; | 1584 | return newpage; |
| 1566 | } | 1585 | } |
| @@ -1594,35 +1613,42 @@ bool migrate_ratelimited(int node) | |||
| 1594 | } | 1613 | } |
| 1595 | 1614 | ||
| 1596 | /* Returns true if the node is migrate rate-limited after the update */ | 1615 | /* Returns true if the node is migrate rate-limited after the update */ |
| 1597 | bool numamigrate_update_ratelimit(pg_data_t *pgdat, unsigned long nr_pages) | 1616 | static bool numamigrate_update_ratelimit(pg_data_t *pgdat, |
| 1617 | unsigned long nr_pages) | ||
| 1598 | { | 1618 | { |
| 1599 | bool rate_limited = false; | ||
| 1600 | |||
| 1601 | /* | 1619 | /* |
| 1602 | * Rate-limit the amount of data that is being migrated to a node. | 1620 | * Rate-limit the amount of data that is being migrated to a node. |
| 1603 | * Optimal placement is no good if the memory bus is saturated and | 1621 | * Optimal placement is no good if the memory bus is saturated and |
| 1604 | * all the time is being spent migrating! | 1622 | * all the time is being spent migrating! |
| 1605 | */ | 1623 | */ |
| 1606 | spin_lock(&pgdat->numabalancing_migrate_lock); | ||
| 1607 | if (time_after(jiffies, pgdat->numabalancing_migrate_next_window)) { | 1624 | if (time_after(jiffies, pgdat->numabalancing_migrate_next_window)) { |
| 1625 | spin_lock(&pgdat->numabalancing_migrate_lock); | ||
| 1608 | pgdat->numabalancing_migrate_nr_pages = 0; | 1626 | pgdat->numabalancing_migrate_nr_pages = 0; |
| 1609 | pgdat->numabalancing_migrate_next_window = jiffies + | 1627 | pgdat->numabalancing_migrate_next_window = jiffies + |
| 1610 | msecs_to_jiffies(migrate_interval_millisecs); | 1628 | msecs_to_jiffies(migrate_interval_millisecs); |
| 1629 | spin_unlock(&pgdat->numabalancing_migrate_lock); | ||
| 1611 | } | 1630 | } |
| 1612 | if (pgdat->numabalancing_migrate_nr_pages > ratelimit_pages) | 1631 | if (pgdat->numabalancing_migrate_nr_pages > ratelimit_pages) { |
| 1613 | rate_limited = true; | 1632 | trace_mm_numa_migrate_ratelimit(current, pgdat->node_id, |
| 1614 | else | 1633 | nr_pages); |
| 1615 | pgdat->numabalancing_migrate_nr_pages += nr_pages; | 1634 | return true; |
| 1616 | spin_unlock(&pgdat->numabalancing_migrate_lock); | 1635 | } |
| 1617 | 1636 | ||
| 1618 | return rate_limited; | 1637 | /* |
| 1638 | * This is an unlocked non-atomic update so errors are possible. | ||
| 1639 | * The consequences are failing to migrate when we potentially should | ||
| 1640 | * have, which is not severe enough to warrant locking. If it is ever | ||
| 1641 | * a problem, it can be converted to a per-cpu counter. | ||
| 1642 | */ | ||
| 1643 | pgdat->numabalancing_migrate_nr_pages += nr_pages; | ||
| 1644 | return false; | ||
| 1619 | } | 1645 | } |
| 1620 | 1646 | ||
| 1621 | int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page) | 1647 | static int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page) |
| 1622 | { | 1648 | { |
| 1623 | int page_lru; | 1649 | int page_lru; |
| 1624 | 1650 | ||
| 1625 | VM_BUG_ON(compound_order(page) && !PageTransHuge(page)); | 1651 | VM_BUG_ON_PAGE(compound_order(page) && !PageTransHuge(page), page); |
| 1626 | 1652 | ||
| 1627 | /* Avoid migrating to a node that is nearly full */ | 1653 | /* Avoid migrating to a node that is nearly full */ |
| 1628 | if (!migrate_balanced_pgdat(pgdat, 1UL << compound_order(page))) | 1654 | if (!migrate_balanced_pgdat(pgdat, 1UL << compound_order(page))) |
| @@ -1705,7 +1731,12 @@ int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma, | |||
| 1705 | nr_remaining = migrate_pages(&migratepages, alloc_misplaced_dst_page, | 1731 | nr_remaining = migrate_pages(&migratepages, alloc_misplaced_dst_page, |
| 1706 | node, MIGRATE_ASYNC, MR_NUMA_MISPLACED); | 1732 | node, MIGRATE_ASYNC, MR_NUMA_MISPLACED); |
| 1707 | if (nr_remaining) { | 1733 | if (nr_remaining) { |
| 1708 | putback_lru_pages(&migratepages); | 1734 | if (!list_empty(&migratepages)) { |
| 1735 | list_del(&page->lru); | ||
| 1736 | dec_zone_page_state(page, NR_ISOLATED_ANON + | ||
| 1737 | page_is_file_cache(page)); | ||
| 1738 | putback_lru_page(page); | ||
| 1739 | } | ||
| 1709 | isolated = 0; | 1740 | isolated = 0; |
| 1710 | } else | 1741 | } else |
| 1711 | count_vm_numa_event(NUMA_PAGE_MIGRATE); | 1742 | count_vm_numa_event(NUMA_PAGE_MIGRATE); |
| @@ -1748,12 +1779,11 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm, | |||
| 1748 | goto out_dropref; | 1779 | goto out_dropref; |
| 1749 | 1780 | ||
| 1750 | new_page = alloc_pages_node(node, | 1781 | new_page = alloc_pages_node(node, |
| 1751 | (GFP_TRANSHUGE | GFP_THISNODE) & ~__GFP_WAIT, HPAGE_PMD_ORDER); | 1782 | (GFP_TRANSHUGE | __GFP_THISNODE) & ~__GFP_WAIT, |
| 1783 | HPAGE_PMD_ORDER); | ||
| 1752 | if (!new_page) | 1784 | if (!new_page) |
| 1753 | goto out_fail; | 1785 | goto out_fail; |
| 1754 | 1786 | ||
| 1755 | page_cpupid_xchg_last(new_page, page_cpupid_last(page)); | ||
| 1756 | |||
| 1757 | isolated = numamigrate_isolate_page(pgdat, page); | 1787 | isolated = numamigrate_isolate_page(pgdat, page); |
| 1758 | if (!isolated) { | 1788 | if (!isolated) { |
| 1759 | put_page(new_page); | 1789 | put_page(new_page); |
diff --git a/mm/mincore.c b/mm/mincore.c index da2be56a7b8f..101623378fbf 100644 --- a/mm/mincore.c +++ b/mm/mincore.c | |||
| @@ -225,13 +225,6 @@ static long do_mincore(unsigned long addr, unsigned long pages, unsigned char *v | |||
| 225 | 225 | ||
| 226 | end = min(vma->vm_end, addr + (pages << PAGE_SHIFT)); | 226 | end = min(vma->vm_end, addr + (pages << PAGE_SHIFT)); |
| 227 | 227 | ||
| 228 | if (is_vm_hugetlb_page(vma)) { | ||
| 229 | mincore_hugetlb_page_range(vma, addr, end, vec); | ||
| 230 | return (end - addr) >> PAGE_SHIFT; | ||
| 231 | } | ||
| 232 | |||
| 233 | end = pmd_addr_end(addr, end); | ||
| 234 | |||
| 235 | if (is_vm_hugetlb_page(vma)) | 228 | if (is_vm_hugetlb_page(vma)) |
| 236 | mincore_hugetlb_page_range(vma, addr, end, vec); | 229 | mincore_hugetlb_page_range(vma, addr, end, vec); |
| 237 | else | 230 | else |
diff --git a/mm/mlock.c b/mm/mlock.c index 192e6eebe4f2..4e1a68162285 100644 --- a/mm/mlock.c +++ b/mm/mlock.c | |||
| @@ -91,6 +91,26 @@ void mlock_vma_page(struct page *page) | |||
| 91 | } | 91 | } |
| 92 | 92 | ||
| 93 | /* | 93 | /* |
| 94 | * Isolate a page from LRU with optional get_page() pin. | ||
| 95 | * Assumes lru_lock already held and page already pinned. | ||
| 96 | */ | ||
| 97 | static bool __munlock_isolate_lru_page(struct page *page, bool getpage) | ||
| 98 | { | ||
| 99 | if (PageLRU(page)) { | ||
| 100 | struct lruvec *lruvec; | ||
| 101 | |||
| 102 | lruvec = mem_cgroup_page_lruvec(page, page_zone(page)); | ||
| 103 | if (getpage) | ||
| 104 | get_page(page); | ||
| 105 | ClearPageLRU(page); | ||
| 106 | del_page_from_lru_list(page, lruvec, page_lru(page)); | ||
| 107 | return true; | ||
| 108 | } | ||
| 109 | |||
| 110 | return false; | ||
| 111 | } | ||
| 112 | |||
| 113 | /* | ||
| 94 | * Finish munlock after successful page isolation | 114 | * Finish munlock after successful page isolation |
| 95 | * | 115 | * |
| 96 | * Page must be locked. This is a wrapper for try_to_munlock() | 116 | * Page must be locked. This is a wrapper for try_to_munlock() |
| @@ -126,9 +146,9 @@ static void __munlock_isolated_page(struct page *page) | |||
| 126 | static void __munlock_isolation_failed(struct page *page) | 146 | static void __munlock_isolation_failed(struct page *page) |
| 127 | { | 147 | { |
| 128 | if (PageUnevictable(page)) | 148 | if (PageUnevictable(page)) |
| 129 | count_vm_event(UNEVICTABLE_PGSTRANDED); | 149 | __count_vm_event(UNEVICTABLE_PGSTRANDED); |
| 130 | else | 150 | else |
| 131 | count_vm_event(UNEVICTABLE_PGMUNLOCKED); | 151 | __count_vm_event(UNEVICTABLE_PGMUNLOCKED); |
| 132 | } | 152 | } |
| 133 | 153 | ||
| 134 | /** | 154 | /** |
| @@ -152,28 +172,34 @@ static void __munlock_isolation_failed(struct page *page) | |||
| 152 | unsigned int munlock_vma_page(struct page *page) | 172 | unsigned int munlock_vma_page(struct page *page) |
| 153 | { | 173 | { |
| 154 | unsigned int nr_pages; | 174 | unsigned int nr_pages; |
| 175 | struct zone *zone = page_zone(page); | ||
| 155 | 176 | ||
| 156 | BUG_ON(!PageLocked(page)); | 177 | BUG_ON(!PageLocked(page)); |
| 157 | 178 | ||
| 158 | if (TestClearPageMlocked(page)) { | ||
| 159 | nr_pages = hpage_nr_pages(page); | ||
| 160 | mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages); | ||
| 161 | if (!isolate_lru_page(page)) | ||
| 162 | __munlock_isolated_page(page); | ||
| 163 | else | ||
| 164 | __munlock_isolation_failed(page); | ||
| 165 | } else { | ||
| 166 | nr_pages = hpage_nr_pages(page); | ||
| 167 | } | ||
| 168 | |||
| 169 | /* | 179 | /* |
| 170 | * Regardless of the original PageMlocked flag, we determine nr_pages | 180 | * Serialize with any parallel __split_huge_page_refcount() which |
| 171 | * after touching the flag. This leaves a possible race with a THP page | 181 | * might otherwise copy PageMlocked to part of the tail pages before |
| 172 | * split, such that a whole THP page was munlocked, but nr_pages == 1. | 182 | * we clear it in the head page. It also stabilizes hpage_nr_pages(). |
| 173 | * Returning a smaller mask due to that is OK, the worst that can | ||
| 174 | * happen is subsequent useless scanning of the former tail pages. | ||
| 175 | * The NR_MLOCK accounting can however become broken. | ||
| 176 | */ | 183 | */ |
| 184 | spin_lock_irq(&zone->lru_lock); | ||
| 185 | |||
| 186 | nr_pages = hpage_nr_pages(page); | ||
| 187 | if (!TestClearPageMlocked(page)) | ||
| 188 | goto unlock_out; | ||
| 189 | |||
| 190 | __mod_zone_page_state(zone, NR_MLOCK, -nr_pages); | ||
| 191 | |||
| 192 | if (__munlock_isolate_lru_page(page, true)) { | ||
| 193 | spin_unlock_irq(&zone->lru_lock); | ||
| 194 | __munlock_isolated_page(page); | ||
| 195 | goto out; | ||
| 196 | } | ||
| 197 | __munlock_isolation_failed(page); | ||
| 198 | |||
| 199 | unlock_out: | ||
| 200 | spin_unlock_irq(&zone->lru_lock); | ||
| 201 | |||
| 202 | out: | ||
| 177 | return nr_pages - 1; | 203 | return nr_pages - 1; |
| 178 | } | 204 | } |
| 179 | 205 | ||
| @@ -253,8 +279,8 @@ static int __mlock_posix_error_return(long retval) | |||
| 253 | static bool __putback_lru_fast_prepare(struct page *page, struct pagevec *pvec, | 279 | static bool __putback_lru_fast_prepare(struct page *page, struct pagevec *pvec, |
| 254 | int *pgrescued) | 280 | int *pgrescued) |
| 255 | { | 281 | { |
| 256 | VM_BUG_ON(PageLRU(page)); | 282 | VM_BUG_ON_PAGE(PageLRU(page), page); |
| 257 | VM_BUG_ON(!PageLocked(page)); | 283 | VM_BUG_ON_PAGE(!PageLocked(page), page); |
| 258 | 284 | ||
| 259 | if (page_mapcount(page) <= 1 && page_evictable(page)) { | 285 | if (page_mapcount(page) <= 1 && page_evictable(page)) { |
| 260 | pagevec_add(pvec, page); | 286 | pagevec_add(pvec, page); |
| @@ -310,34 +336,24 @@ static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone) | |||
| 310 | struct page *page = pvec->pages[i]; | 336 | struct page *page = pvec->pages[i]; |
| 311 | 337 | ||
| 312 | if (TestClearPageMlocked(page)) { | 338 | if (TestClearPageMlocked(page)) { |
| 313 | struct lruvec *lruvec; | ||
| 314 | int lru; | ||
| 315 | |||
| 316 | if (PageLRU(page)) { | ||
| 317 | lruvec = mem_cgroup_page_lruvec(page, zone); | ||
| 318 | lru = page_lru(page); | ||
| 319 | /* | ||
| 320 | * We already have pin from follow_page_mask() | ||
| 321 | * so we can spare the get_page() here. | ||
| 322 | */ | ||
| 323 | ClearPageLRU(page); | ||
| 324 | del_page_from_lru_list(page, lruvec, lru); | ||
| 325 | } else { | ||
| 326 | __munlock_isolation_failed(page); | ||
| 327 | goto skip_munlock; | ||
| 328 | } | ||
| 329 | |||
| 330 | } else { | ||
| 331 | skip_munlock: | ||
| 332 | /* | 339 | /* |
| 333 | * We won't be munlocking this page in the next phase | 340 | * We already have pin from follow_page_mask() |
| 334 | * but we still need to release the follow_page_mask() | 341 | * so we can spare the get_page() here. |
| 335 | * pin. We cannot do it under lru_lock however. If it's | ||
| 336 | * the last pin, __page_cache_release would deadlock. | ||
| 337 | */ | 342 | */ |
| 338 | pagevec_add(&pvec_putback, pvec->pages[i]); | 343 | if (__munlock_isolate_lru_page(page, false)) |
| 339 | pvec->pages[i] = NULL; | 344 | continue; |
| 345 | else | ||
| 346 | __munlock_isolation_failed(page); | ||
| 340 | } | 347 | } |
| 348 | |||
| 349 | /* | ||
| 350 | * We won't be munlocking this page in the next phase | ||
| 351 | * but we still need to release the follow_page_mask() | ||
| 352 | * pin. We cannot do it under lru_lock however. If it's | ||
| 353 | * the last pin, __page_cache_release() would deadlock. | ||
| 354 | */ | ||
| 355 | pagevec_add(&pvec_putback, pvec->pages[i]); | ||
| 356 | pvec->pages[i] = NULL; | ||
| 341 | } | 357 | } |
| 342 | delta_munlocked = -nr + pagevec_count(&pvec_putback); | 358 | delta_munlocked = -nr + pagevec_count(&pvec_putback); |
| 343 | __mod_zone_page_state(zone, NR_MLOCK, delta_munlocked); | 359 | __mod_zone_page_state(zone, NR_MLOCK, delta_munlocked); |
| @@ -709,19 +725,21 @@ SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len) | |||
| 709 | 725 | ||
| 710 | lru_add_drain_all(); /* flush pagevec */ | 726 | lru_add_drain_all(); /* flush pagevec */ |
| 711 | 727 | ||
| 712 | down_write(¤t->mm->mmap_sem); | ||
| 713 | len = PAGE_ALIGN(len + (start & ~PAGE_MASK)); | 728 | len = PAGE_ALIGN(len + (start & ~PAGE_MASK)); |
| 714 | start &= PAGE_MASK; | 729 | start &= PAGE_MASK; |
| 715 | 730 | ||
| 716 | locked = len >> PAGE_SHIFT; | ||
| 717 | locked += current->mm->locked_vm; | ||
| 718 | |||
| 719 | lock_limit = rlimit(RLIMIT_MEMLOCK); | 731 | lock_limit = rlimit(RLIMIT_MEMLOCK); |
| 720 | lock_limit >>= PAGE_SHIFT; | 732 | lock_limit >>= PAGE_SHIFT; |
| 733 | locked = len >> PAGE_SHIFT; | ||
| 734 | |||
| 735 | down_write(¤t->mm->mmap_sem); | ||
| 736 | |||
| 737 | locked += current->mm->locked_vm; | ||
| 721 | 738 | ||
| 722 | /* check against resource limits */ | 739 | /* check against resource limits */ |
| 723 | if ((locked <= lock_limit) || capable(CAP_IPC_LOCK)) | 740 | if ((locked <= lock_limit) || capable(CAP_IPC_LOCK)) |
| 724 | error = do_mlock(start, len, 1); | 741 | error = do_mlock(start, len, 1); |
| 742 | |||
| 725 | up_write(¤t->mm->mmap_sem); | 743 | up_write(¤t->mm->mmap_sem); |
| 726 | if (!error) | 744 | if (!error) |
| 727 | error = __mm_populate(start, len, 0); | 745 | error = __mm_populate(start, len, 0); |
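Reviewer note: the mlock()/munlock() hunks above move the length alignment and rlimit reads in front of down_write(&current->mm->mmap_sem), so the semaphore only covers the shared state that actually needs it. The same narrowing of a critical section, sketched with a plain pthread mutex and invented names:

    #include <pthread.h>
    #include <stdio.h>

    #define PAGE_SHIFT 12
    #define PAGE_SIZE  (1UL << PAGE_SHIFT)
    #define PAGE_MASK  (~(PAGE_SIZE - 1))

    static pthread_mutex_t mmap_lock = PTHREAD_MUTEX_INITIALIZER;
    static unsigned long locked_vm;                 /* pages already locked */
    static unsigned long lock_limit_pages = 16384;  /* made-up rlimit */

    static int toy_mlock(unsigned long start, unsigned long len)
    {
        unsigned long locked;
        int error = -1;

        /* Pure arithmetic: no reason to hold the lock for this part. */
        len = (len + (start & ~PAGE_MASK) + PAGE_SIZE - 1) & PAGE_MASK;
        start &= PAGE_MASK;
        locked = len >> PAGE_SHIFT;

        pthread_mutex_lock(&mmap_lock);
        locked += locked_vm;            /* needs the lock: shared state */
        if (locked <= lock_limit_pages) {
            locked_vm = locked;
            error = 0;
        }
        pthread_mutex_unlock(&mmap_lock);

        return error;
    }

    int main(void)
    {
        printf("toy_mlock: %d\n", toy_mlock(0x1234, 8192));
        return 0;
    }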
| @@ -732,11 +750,13 @@ SYSCALL_DEFINE2(munlock, unsigned long, start, size_t, len) | |||
| 732 | { | 750 | { |
| 733 | int ret; | 751 | int ret; |
| 734 | 752 | ||
| 735 | down_write(¤t->mm->mmap_sem); | ||
| 736 | len = PAGE_ALIGN(len + (start & ~PAGE_MASK)); | 753 | len = PAGE_ALIGN(len + (start & ~PAGE_MASK)); |
| 737 | start &= PAGE_MASK; | 754 | start &= PAGE_MASK; |
| 755 | |||
| 756 | down_write(¤t->mm->mmap_sem); | ||
| 738 | ret = do_mlock(start, len, 0); | 757 | ret = do_mlock(start, len, 0); |
| 739 | up_write(¤t->mm->mmap_sem); | 758 | up_write(¤t->mm->mmap_sem); |
| 759 | |||
| 740 | return ret; | 760 | return ret; |
| 741 | } | 761 | } |
| 742 | 762 | ||
| @@ -781,12 +801,12 @@ SYSCALL_DEFINE1(mlockall, int, flags) | |||
| 781 | if (flags & MCL_CURRENT) | 801 | if (flags & MCL_CURRENT) |
| 782 | lru_add_drain_all(); /* flush pagevec */ | 802 | lru_add_drain_all(); /* flush pagevec */ |
| 783 | 803 | ||
| 784 | down_write(¤t->mm->mmap_sem); | ||
| 785 | |||
| 786 | lock_limit = rlimit(RLIMIT_MEMLOCK); | 804 | lock_limit = rlimit(RLIMIT_MEMLOCK); |
| 787 | lock_limit >>= PAGE_SHIFT; | 805 | lock_limit >>= PAGE_SHIFT; |
| 788 | 806 | ||
| 789 | ret = -ENOMEM; | 807 | ret = -ENOMEM; |
| 808 | down_write(¤t->mm->mmap_sem); | ||
| 809 | |||
| 790 | if (!(flags & MCL_CURRENT) || (current->mm->total_vm <= lock_limit) || | 810 | if (!(flags & MCL_CURRENT) || (current->mm->total_vm <= lock_limit) || |
| 791 | capable(CAP_IPC_LOCK)) | 811 | capable(CAP_IPC_LOCK)) |
| 792 | ret = do_mlockall(flags); | 812 | ret = do_mlockall(flags); |
diff --git a/mm/mm_init.c b/mm/mm_init.c index 68562e92d50c..4074caf9936b 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c | |||
| @@ -202,5 +202,4 @@ static int __init mm_sysfs_init(void) | |||
| 202 | 202 | ||
| 203 | return 0; | 203 | return 0; |
| 204 | } | 204 | } |
| 205 | | 205 | postcore_initcall(mm_sysfs_init); |
| 206 | __initcall(mm_sysfs_init); | ||
diff --git a/mm/mmap.c b/mm/mmap.c --- a/mm/mmap.c +++ b/mm/mmap.c | |||
| @@ -86,6 +86,7 @@ EXPORT_SYMBOL(vm_get_page_prot); | |||
| 86 | 86 | ||
| 87 | int sysctl_overcommit_memory __read_mostly = OVERCOMMIT_GUESS; /* heuristic overcommit */ | 87 | int sysctl_overcommit_memory __read_mostly = OVERCOMMIT_GUESS; /* heuristic overcommit */ |
| 88 | int sysctl_overcommit_ratio __read_mostly = 50; /* default is 50% */ | 88 | int sysctl_overcommit_ratio __read_mostly = 50; /* default is 50% */ |
| 89 | unsigned long sysctl_overcommit_kbytes __read_mostly; | ||
| 89 | int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT; | 90 | int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT; |
| 90 | unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */ | 91 | unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */ |
| 91 | unsigned long sysctl_admin_reserve_kbytes __read_mostly = 1UL << 13; /* 8MB */ | 92 | unsigned long sysctl_admin_reserve_kbytes __read_mostly = 1UL << 13; /* 8MB */ |
| @@ -893,7 +894,15 @@ again: remove_next = 1 + (end > next->vm_end); | |||
| 893 | static inline int is_mergeable_vma(struct vm_area_struct *vma, | 894 | static inline int is_mergeable_vma(struct vm_area_struct *vma, |
| 894 | struct file *file, unsigned long vm_flags) | 895 | struct file *file, unsigned long vm_flags) |
| 895 | { | 896 | { |
| 896 | if (vma->vm_flags ^ vm_flags) | 897 | /* |
| 898 | * VM_SOFTDIRTY should not prevent VMA merging: if the flags | ||
| 899 | * match except for the dirty bit, the caller should mark the | ||
| 900 | * merged VMA as dirty. If the dirty bit were not excluded from | ||
| 901 | * the comparison, we would increase pressure on the memory | ||
| 902 | * system, forcing the kernel to generate new VMAs when the old | ||
| 903 | * one could be extended instead. | ||
| 904 | */ | ||
| 905 | if ((vma->vm_flags ^ vm_flags) & ~VM_SOFTDIRTY) | ||
| 897 | return 0; | 906 | return 0; |
| 898 | if (vma->vm_file != file) | 907 | if (vma->vm_file != file) |
| 899 | return 0; | 908 | return 0; |
| @@ -1082,7 +1091,7 @@ static int anon_vma_compatible(struct vm_area_struct *a, struct vm_area_struct * | |||
| 1082 | return a->vm_end == b->vm_start && | 1091 | return a->vm_end == b->vm_start && |
| 1083 | mpol_equal(vma_policy(a), vma_policy(b)) && | 1092 | mpol_equal(vma_policy(a), vma_policy(b)) && |
| 1084 | a->vm_file == b->vm_file && | 1093 | a->vm_file == b->vm_file && |
| 1085 | !((a->vm_flags ^ b->vm_flags) & ~(VM_READ|VM_WRITE|VM_EXEC)) && | 1094 | !((a->vm_flags ^ b->vm_flags) & ~(VM_READ|VM_WRITE|VM_EXEC|VM_SOFTDIRTY)) && |
| 1086 | b->vm_pgoff == a->vm_pgoff + ((b->vm_start - a->vm_start) >> PAGE_SHIFT); | 1095 | b->vm_pgoff == a->vm_pgoff + ((b->vm_start - a->vm_start) >> PAGE_SHIFT); |
| 1087 | } | 1096 | } |
| 1088 | 1097 | ||
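Reviewer note: both is_mergeable_vma() and anon_vma_compatible() above rely on the same idiom: XOR the two flag words to get the bits that differ, then mask out bits that should not block a merge (here VM_SOFTDIRTY). A small standalone illustration with made-up flag values:

    #include <stdio.h>
    #include <stdbool.h>

    #define FLAG_READ      0x1UL
    #define FLAG_WRITE     0x2UL
    #define FLAG_SOFTDIRTY 0x8UL   /* the difference we want to ignore */

    static bool flags_mergeable(unsigned long a, unsigned long b)
    {
        /* a ^ b has a bit set for every flag the two sides disagree on. */
        return ((a ^ b) & ~FLAG_SOFTDIRTY) == 0;
    }

    int main(void)
    {
        printf("%d\n", flags_mergeable(FLAG_READ | FLAG_WRITE,
                                       FLAG_READ | FLAG_WRITE | FLAG_SOFTDIRTY)); /* 1 */
        printf("%d\n", flags_mergeable(FLAG_READ, FLAG_READ | FLAG_WRITE));       /* 0 */
        return 0;
    }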
| @@ -1190,6 +1199,24 @@ static inline unsigned long round_hint_to_min(unsigned long hint) | |||
| 1190 | return hint; | 1199 | return hint; |
| 1191 | } | 1200 | } |
| 1192 | 1201 | ||
| 1202 | static inline int mlock_future_check(struct mm_struct *mm, | ||
| 1203 | unsigned long flags, | ||
| 1204 | unsigned long len) | ||
| 1205 | { | ||
| 1206 | unsigned long locked, lock_limit; | ||
| 1207 | |||
| 1208 | /* mlock MCL_FUTURE? */ | ||
| 1209 | if (flags & VM_LOCKED) { | ||
| 1210 | locked = len >> PAGE_SHIFT; | ||
| 1211 | locked += mm->locked_vm; | ||
| 1212 | lock_limit = rlimit(RLIMIT_MEMLOCK); | ||
| 1213 | lock_limit >>= PAGE_SHIFT; | ||
| 1214 | if (locked > lock_limit && !capable(CAP_IPC_LOCK)) | ||
| 1215 | return -EAGAIN; | ||
| 1216 | } | ||
| 1217 | return 0; | ||
| 1218 | } | ||
| 1219 | |||
| 1193 | /* | 1220 | /* |
| 1194 | * The caller must hold down_write(¤t->mm->mmap_sem). | 1221 | * The caller must hold down_write(¤t->mm->mmap_sem). |
| 1195 | */ | 1222 | */ |
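Reviewer note: mlock_future_check() above folds the previously duplicated RLIMIT_MEMLOCK test into one place: convert the request to pages, add what is already locked, and compare against the rlimit in pages unless the caller is privileged. The arithmetic, restated as a small standalone check (names and limits are invented for the sketch):

    #include <stdbool.h>
    #include <stdio.h>

    #define PAGE_SHIFT 12
    #define ERR_AGAIN  (-11)   /* stand-in for -EAGAIN */

    static int check_future_mlock(unsigned long already_locked_pages,
                                  unsigned long len_bytes,
                                  unsigned long rlimit_bytes,
                                  bool privileged)
    {
        unsigned long locked = (len_bytes >> PAGE_SHIFT) + already_locked_pages;
        unsigned long limit  = rlimit_bytes >> PAGE_SHIFT;

        if (locked > limit && !privileged)
            return ERR_AGAIN;
        return 0;
    }

    int main(void)
    {
        /* 64 KiB request on top of 10 locked pages, 64 KiB rlimit: rejected */
        printf("%d\n", check_future_mlock(10, 64 << 10, 64 << 10, false));
        /* same request with CAP_IPC_LOCK-like privilege: allowed */
        printf("%d\n", check_future_mlock(10, 64 << 10, 64 << 10, true));
        return 0;
    }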
| @@ -1251,16 +1278,8 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, | |||
| 1251 | if (!can_do_mlock()) | 1278 | if (!can_do_mlock()) |
| 1252 | return -EPERM; | 1279 | return -EPERM; |
| 1253 | 1280 | ||
| 1254 | /* mlock MCL_FUTURE? */ | 1281 | if (mlock_future_check(mm, vm_flags, len)) |
| 1255 | if (vm_flags & VM_LOCKED) { | 1282 | return -EAGAIN; |
| 1256 | unsigned long locked, lock_limit; | ||
| 1257 | locked = len >> PAGE_SHIFT; | ||
| 1258 | locked += mm->locked_vm; | ||
| 1259 | lock_limit = rlimit(RLIMIT_MEMLOCK); | ||
| 1260 | lock_limit >>= PAGE_SHIFT; | ||
| 1261 | if (locked > lock_limit && !capable(CAP_IPC_LOCK)) | ||
| 1262 | return -EAGAIN; | ||
| 1263 | } | ||
| 1264 | 1283 | ||
| 1265 | if (file) { | 1284 | if (file) { |
| 1266 | struct inode *inode = file_inode(file); | 1285 | struct inode *inode = file_inode(file); |
| @@ -2591,18 +2610,9 @@ static unsigned long do_brk(unsigned long addr, unsigned long len) | |||
| 2591 | if (error & ~PAGE_MASK) | 2610 | if (error & ~PAGE_MASK) |
| 2592 | return error; | 2611 | return error; |
| 2593 | 2612 | ||
| 2594 | /* | 2613 | error = mlock_future_check(mm, mm->def_flags, len); |
| 2595 | * mlock MCL_FUTURE? | 2614 | if (error) |
| 2596 | */ | 2615 | return error; |
| 2597 | if (mm->def_flags & VM_LOCKED) { | ||
| 2598 | unsigned long locked, lock_limit; | ||
| 2599 | locked = len >> PAGE_SHIFT; | ||
| 2600 | locked += mm->locked_vm; | ||
| 2601 | lock_limit = rlimit(RLIMIT_MEMLOCK); | ||
| 2602 | lock_limit >>= PAGE_SHIFT; | ||
| 2603 | if (locked > lock_limit && !capable(CAP_IPC_LOCK)) | ||
| 2604 | return -EAGAIN; | ||
| 2605 | } | ||
| 2606 | 2616 | ||
| 2607 | /* | 2617 | /* |
| 2608 | * mm->mmap_sem is required to protect against another thread | 2618 | * mm->mmap_sem is required to protect against another thread |
| @@ -3140,7 +3150,7 @@ static int init_user_reserve(void) | |||
| 3140 | sysctl_user_reserve_kbytes = min(free_kbytes / 32, 1UL << 17); | 3150 | sysctl_user_reserve_kbytes = min(free_kbytes / 32, 1UL << 17); |
| 3141 | return 0; | 3151 | return 0; |
| 3142 | } | 3152 | } |
| 3143 | module_init(init_user_reserve) | 3153 | subsys_initcall(init_user_reserve); |
| 3144 | 3154 | ||
| 3145 | /* | 3155 | /* |
| 3146 | * Initialise sysctl_admin_reserve_kbytes. | 3156 | * Initialise sysctl_admin_reserve_kbytes. |
| @@ -3161,7 +3171,7 @@ static int init_admin_reserve(void) | |||
| 3161 | sysctl_admin_reserve_kbytes = min(free_kbytes / 32, 1UL << 13); | 3171 | sysctl_admin_reserve_kbytes = min(free_kbytes / 32, 1UL << 13); |
| 3162 | return 0; | 3172 | return 0; |
| 3163 | } | 3173 | } |
| 3164 | module_init(init_admin_reserve) | 3174 | subsys_initcall(init_admin_reserve); |
| 3165 | 3175 | ||
| 3166 | /* | 3176 | /* |
| 3167 | * Reinitialise user and admin reserves if memory is added or removed. | 3177 | * Reinitialise user and admin reserves if memory is added or removed. |
| @@ -3231,4 +3241,4 @@ static int __meminit init_reserve_notifier(void) | |||
| 3231 | 3241 | ||
| 3232 | return 0; | 3242 | return 0; |
| 3233 | } | 3243 | } |
| 3234 | module_init(init_reserve_notifier) | 3244 | subsys_initcall(init_reserve_notifier); |
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c index 93e6089cb456..41cefdf0aadd 100644 --- a/mm/mmu_notifier.c +++ b/mm/mmu_notifier.c | |||
| @@ -329,5 +329,4 @@ static int __init mmu_notifier_init(void) | |||
| 329 | { | 329 | { |
| 330 | return init_srcu_struct(&srcu); | 330 | return init_srcu_struct(&srcu); |
| 331 | } | 331 | } |
| 332 | 332 | subsys_initcall(mmu_notifier_init); | |
| 333 | module_init(mmu_notifier_init); | ||
diff --git a/mm/mprotect.c b/mm/mprotect.c index bb53a6591aea..769a67a15803 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c | |||
| @@ -23,6 +23,7 @@ | |||
| 23 | #include <linux/mmu_notifier.h> | 23 | #include <linux/mmu_notifier.h> |
| 24 | #include <linux/migrate.h> | 24 | #include <linux/migrate.h> |
| 25 | #include <linux/perf_event.h> | 25 | #include <linux/perf_event.h> |
| 26 | #include <linux/ksm.h> | ||
| 26 | #include <asm/uaccess.h> | 27 | #include <asm/uaccess.h> |
| 27 | #include <asm/pgtable.h> | 28 | #include <asm/pgtable.h> |
| 28 | #include <asm/cacheflush.h> | 29 | #include <asm/cacheflush.h> |
| @@ -57,36 +58,27 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, | |||
| 57 | if (pte_numa(ptent)) | 58 | if (pte_numa(ptent)) |
| 58 | ptent = pte_mknonnuma(ptent); | 59 | ptent = pte_mknonnuma(ptent); |
| 59 | ptent = pte_modify(ptent, newprot); | 60 | ptent = pte_modify(ptent, newprot); |
| 61 | /* | ||
| 62 | * Avoid taking write faults for pages we | ||
| 63 | * know to be dirty. | ||
| 64 | */ | ||
| 65 | if (dirty_accountable && pte_dirty(ptent)) | ||
| 66 | ptent = pte_mkwrite(ptent); | ||
| 67 | ptep_modify_prot_commit(mm, addr, pte, ptent); | ||
| 60 | updated = true; | 68 | updated = true; |
| 61 | } else { | 69 | } else { |
| 62 | struct page *page; | 70 | struct page *page; |
| 63 | 71 | ||
| 64 | ptent = *pte; | ||
| 65 | page = vm_normal_page(vma, addr, oldpte); | 72 | page = vm_normal_page(vma, addr, oldpte); |
| 66 | if (page) { | 73 | if (page && !PageKsm(page)) { |
| 67 | if (!pte_numa(oldpte)) { | 74 | if (!pte_numa(oldpte)) { |
| 68 | ptent = pte_mknuma(ptent); | 75 | ptep_set_numa(mm, addr, pte); |
| 69 | set_pte_at(mm, addr, pte, ptent); | ||
| 70 | updated = true; | 76 | updated = true; |
| 71 | } | 77 | } |
| 72 | } | 78 | } |
| 73 | } | 79 | } |
| 74 | |||
| 75 | /* | ||
| 76 | * Avoid taking write faults for pages we know to be | ||
| 77 | * dirty. | ||
| 78 | */ | ||
| 79 | if (dirty_accountable && pte_dirty(ptent)) { | ||
| 80 | ptent = pte_mkwrite(ptent); | ||
| 81 | updated = true; | ||
| 82 | } | ||
| 83 | |||
| 84 | if (updated) | 80 | if (updated) |
| 85 | pages++; | 81 | pages++; |
| 86 | |||
| 87 | /* Only !prot_numa always clears the pte */ | ||
| 88 | if (!prot_numa) | ||
| 89 | ptep_modify_prot_commit(mm, addr, pte, ptent); | ||
| 90 | } else if (IS_ENABLED(CONFIG_MIGRATION) && !pte_file(oldpte)) { | 82 | } else if (IS_ENABLED(CONFIG_MIGRATION) && !pte_file(oldpte)) { |
| 91 | swp_entry_t entry = pte_to_swp_entry(oldpte); | 83 | swp_entry_t entry = pte_to_swp_entry(oldpte); |
| 92 | 84 | ||
diff --git a/mm/nobootmem.c b/mm/nobootmem.c index 2c254d374655..f73f2987a852 100644 --- a/mm/nobootmem.c +++ b/mm/nobootmem.c | |||
| @@ -41,11 +41,13 @@ static void * __init __alloc_memory_core_early(int nid, u64 size, u64 align, | |||
| 41 | if (limit > memblock.current_limit) | 41 | if (limit > memblock.current_limit) |
| 42 | limit = memblock.current_limit; | 42 | limit = memblock.current_limit; |
| 43 | 43 | ||
| 44 | addr = memblock_find_in_range_node(goal, limit, size, align, nid); | 44 | addr = memblock_find_in_range_node(size, align, goal, limit, nid); |
| 45 | if (!addr) | 45 | if (!addr) |
| 46 | return NULL; | 46 | return NULL; |
| 47 | 47 | ||
| 48 | memblock_reserve(addr, size); | 48 | if (memblock_reserve(addr, size)) |
| 49 | return NULL; | ||
| 50 | |||
| 49 | ptr = phys_to_virt(addr); | 51 | ptr = phys_to_virt(addr); |
| 50 | memset(ptr, 0, size); | 52 | memset(ptr, 0, size); |
| 51 | /* | 53 | /* |
| @@ -114,16 +116,27 @@ static unsigned long __init __free_memory_core(phys_addr_t start, | |||
| 114 | static unsigned long __init free_low_memory_core_early(void) | 116 | static unsigned long __init free_low_memory_core_early(void) |
| 115 | { | 117 | { |
| 116 | unsigned long count = 0; | 118 | unsigned long count = 0; |
| 117 | phys_addr_t start, end, size; | 119 | phys_addr_t start, end; |
| 118 | u64 i; | 120 | u64 i; |
| 119 | 121 | ||
| 120 | for_each_free_mem_range(i, MAX_NUMNODES, &start, &end, NULL) | 122 | for_each_free_mem_range(i, NUMA_NO_NODE, &start, &end, NULL) |
| 121 | count += __free_memory_core(start, end); | 123 | count += __free_memory_core(start, end); |
| 122 | 124 | ||
| 123 | /* free range that is used for reserved array if we allocate it */ | 125 | #ifdef CONFIG_ARCH_DISCARD_MEMBLOCK |
| 124 | size = get_allocated_memblock_reserved_regions_info(&start); | 126 | { |
| 125 | if (size) | 127 | phys_addr_t size; |
| 126 | count += __free_memory_core(start, start + size); | 128 | |
| 129 | /* Free memblock.reserved array if it was allocated */ | ||
| 130 | size = get_allocated_memblock_reserved_regions_info(&start); | ||
| 131 | if (size) | ||
| 132 | count += __free_memory_core(start, start + size); | ||
| 133 | |||
| 134 | /* Free memblock.memory array if it was allocated */ | ||
| 135 | size = get_allocated_memblock_memory_regions_info(&start); | ||
| 136 | if (size) | ||
| 137 | count += __free_memory_core(start, start + size); | ||
| 138 | } | ||
| 139 | #endif | ||
| 127 | 140 | ||
| 128 | return count; | 141 | return count; |
| 129 | } | 142 | } |
| @@ -161,7 +174,7 @@ unsigned long __init free_all_bootmem(void) | |||
| 161 | reset_all_zones_managed_pages(); | 174 | reset_all_zones_managed_pages(); |
| 162 | 175 | ||
| 163 | /* | 176 | /* |
| 164 | * We need to use MAX_NUMNODES instead of NODE_DATA(0)->node_id | 177 | * We need to use NUMA_NO_NODE instead of NODE_DATA(0)->node_id |
| 165 | * because in some cases, like when Node0 doesn't have RAM installed, | 178 | * because in some cases, like when Node0 doesn't have RAM installed, |
| 166 | * low RAM will be on Node1 | 179 | * low RAM will be on Node1 |
| 167 | */ | 180 | */ |
| @@ -215,7 +228,7 @@ static void * __init ___alloc_bootmem_nopanic(unsigned long size, | |||
| 215 | 228 | ||
| 216 | restart: | 229 | restart: |
| 217 | 230 | ||
| 218 | ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align, goal, limit); | 231 | ptr = __alloc_memory_core_early(NUMA_NO_NODE, size, align, goal, limit); |
| 219 | 232 | ||
| 220 | if (ptr) | 233 | if (ptr) |
| 221 | return ptr; | 234 | return ptr; |
| @@ -299,7 +312,7 @@ again: | |||
| 299 | if (ptr) | 312 | if (ptr) |
| 300 | return ptr; | 313 | return ptr; |
| 301 | 314 | ||
| 302 | ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align, | 315 | ptr = __alloc_memory_core_early(NUMA_NO_NODE, size, align, |
| 303 | goal, limit); | 316 | goal, limit); |
| 304 | if (ptr) | 317 | if (ptr) |
| 305 | return ptr; | 318 | return ptr; |
diff --git a/mm/nommu.c b/mm/nommu.c index fec093adad9c..8740213b1647 100644 --- a/mm/nommu.c +++ b/mm/nommu.c | |||
| @@ -60,6 +60,7 @@ unsigned long highest_memmap_pfn; | |||
| 60 | struct percpu_counter vm_committed_as; | 60 | struct percpu_counter vm_committed_as; |
| 61 | int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */ | 61 | int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */ |
| 62 | int sysctl_overcommit_ratio = 50; /* default is 50% */ | 62 | int sysctl_overcommit_ratio = 50; /* default is 50% */ |
| 63 | unsigned long sysctl_overcommit_kbytes __read_mostly; | ||
| 63 | int sysctl_max_map_count = DEFAULT_MAX_MAP_COUNT; | 64 | int sysctl_max_map_count = DEFAULT_MAX_MAP_COUNT; |
| 64 | int sysctl_nr_trim_pages = CONFIG_NOMMU_INITIAL_TRIM_EXCESS; | 65 | int sysctl_nr_trim_pages = CONFIG_NOMMU_INITIAL_TRIM_EXCESS; |
| 65 | unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */ | 66 | unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */ |
diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 1e4a600a6163..3291e82d4352 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c | |||
| @@ -47,19 +47,21 @@ static DEFINE_SPINLOCK(zone_scan_lock); | |||
| 47 | #ifdef CONFIG_NUMA | 47 | #ifdef CONFIG_NUMA |
| 48 | /** | 48 | /** |
| 49 | * has_intersects_mems_allowed() - check task eligibility for kill | 50 | * has_intersects_mems_allowed() - check task eligibility for kill |
| 50 | * @tsk: task struct of which task to consider | 50 | * @start: task struct of which task to consider |
| 51 | * @mask: nodemask passed to page allocator for mempolicy ooms | 51 | * @mask: nodemask passed to page allocator for mempolicy ooms |
| 52 | * | 52 | * |
| 53 | * Task eligibility is determined by whether or not a candidate task, @tsk, | 53 | * Task eligibility is determined by whether or not a candidate task, @tsk, |
| 54 | * shares the same mempolicy nodes as current if it is bound by such a policy | 54 | * shares the same mempolicy nodes as current if it is bound by such a policy |
| 55 | * and whether or not it has the same set of allowed cpuset nodes. | 55 | * and whether or not it has the same set of allowed cpuset nodes. |
| 56 | */ | 56 | */ |
| 57 | static bool has_intersects_mems_allowed(struct task_struct *tsk, | 57 | static bool has_intersects_mems_allowed(struct task_struct *start, |
| 58 | const nodemask_t *mask) | 58 | const nodemask_t *mask) |
| 59 | { | 59 | { |
| 60 | struct task_struct *start = tsk; | 60 | struct task_struct *tsk; |
| 61 | bool ret = false; | ||
| 61 | 62 | ||
| 62 | do { | 63 | rcu_read_lock(); |
| 64 | for_each_thread(start, tsk) { | ||
| 63 | if (mask) { | 65 | if (mask) { |
| 64 | /* | 66 | /* |
| 65 | * If this is a mempolicy constrained oom, tsk's | 67 | * If this is a mempolicy constrained oom, tsk's |
| @@ -67,19 +69,20 @@ static bool has_intersects_mems_allowed(struct task_struct *tsk, | |||
| 67 | * mempolicy intersects current, otherwise it may be | 69 | * mempolicy intersects current, otherwise it may be |
| 68 | * needlessly killed. | 70 | * needlessly killed. |
| 69 | */ | 71 | */ |
| 70 | if (mempolicy_nodemask_intersects(tsk, mask)) | 72 | ret = mempolicy_nodemask_intersects(tsk, mask); |
| 71 | return true; | ||
| 72 | } else { | 73 | } else { |
| 73 | /* | 74 | /* |
| 74 | * This is not a mempolicy constrained oom, so only | 75 | * This is not a mempolicy constrained oom, so only |
| 75 | * check the mems of tsk's cpuset. | 76 | * check the mems of tsk's cpuset. |
| 76 | */ | 77 | */ |
| 77 | if (cpuset_mems_allowed_intersects(current, tsk)) | 78 | ret = cpuset_mems_allowed_intersects(current, tsk); |
| 78 | return true; | ||
| 79 | } | 79 | } |
| 80 | } while_each_thread(start, tsk); | 80 | if (ret) |
| 81 | break; | ||
| 82 | } | ||
| 83 | rcu_read_unlock(); | ||
| 81 | 84 | ||
| 82 | return false; | 85 | return ret; |
| 83 | } | 86 | } |
| 84 | #else | 87 | #else |
| 85 | static bool has_intersects_mems_allowed(struct task_struct *tsk, | 88 | static bool has_intersects_mems_allowed(struct task_struct *tsk, |
| @@ -97,16 +100,21 @@ static bool has_intersects_mems_allowed(struct task_struct *tsk, | |||
| 97 | */ | 100 | */ |
| 98 | struct task_struct *find_lock_task_mm(struct task_struct *p) | 101 | struct task_struct *find_lock_task_mm(struct task_struct *p) |
| 99 | { | 102 | { |
| 100 | struct task_struct *t = p; | 103 | struct task_struct *t; |
| 101 | 104 | ||
| 102 | do { | 105 | rcu_read_lock(); |
| 106 | |||
| 107 | for_each_thread(p, t) { | ||
| 103 | task_lock(t); | 108 | task_lock(t); |
| 104 | if (likely(t->mm)) | 109 | if (likely(t->mm)) |
| 105 | return t; | 110 | goto found; |
| 106 | task_unlock(t); | 111 | task_unlock(t); |
| 107 | } while_each_thread(p, t); | 112 | } |
| 113 | t = NULL; | ||
| 114 | found: | ||
| 115 | rcu_read_unlock(); | ||
| 108 | 116 | ||
| 109 | return NULL; | 117 | return t; |
| 110 | } | 118 | } |
| 111 | 119 | ||
| 112 | /* return true if the task is not adequate as candidate victim task. */ | 120 | /* return true if the task is not adequate as candidate victim task. */ |
| @@ -170,7 +178,7 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg, | |||
| 170 | * implementation used by LSMs. | 178 | * implementation used by LSMs. |
| 171 | */ | 179 | */ |
| 172 | if (has_capability_noaudit(p, CAP_SYS_ADMIN)) | 180 | if (has_capability_noaudit(p, CAP_SYS_ADMIN)) |
| 173 | adj -= 30; | 181 | points -= (points * 3) / 100; |
| 174 | 182 | ||
| 175 | /* Normalize to oom_score_adj units */ | 183 | /* Normalize to oom_score_adj units */ |
| 176 | adj *= totalpages / 1000; | 184 | adj *= totalpages / 1000; |
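Reviewer note: the root-bonus change above replaces the fixed `adj -= 30` with a discount proportional to the task's badness: `points -= (points * 3) / 100` knocks 3% off whatever the score is, so small scores are no longer driven toward zero by a flat constant. The arithmetic with a few hypothetical values:

    #include <stdio.h>

    int main(void)
    {
        unsigned long points[] = { 50, 1000, 200000 };

        for (int i = 0; i < 3; i++) {
            unsigned long p = points[i];
            /* 3% proportional discount for privileged (CAP_SYS_ADMIN) tasks */
            unsigned long discounted = p - (p * 3) / 100;
            printf("points %lu -> %lu\n", p, discounted);
        }
        return 0;
    }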
| @@ -301,7 +309,7 @@ static struct task_struct *select_bad_process(unsigned int *ppoints, | |||
| 301 | unsigned long chosen_points = 0; | 309 | unsigned long chosen_points = 0; |
| 302 | 310 | ||
| 303 | rcu_read_lock(); | 311 | rcu_read_lock(); |
| 304 | do_each_thread(g, p) { | 312 | for_each_process_thread(g, p) { |
| 305 | unsigned int points; | 313 | unsigned int points; |
| 306 | 314 | ||
| 307 | switch (oom_scan_process_thread(p, totalpages, nodemask, | 315 | switch (oom_scan_process_thread(p, totalpages, nodemask, |
| @@ -319,11 +327,15 @@ static struct task_struct *select_bad_process(unsigned int *ppoints, | |||
| 319 | break; | 327 | break; |
| 320 | }; | 328 | }; |
| 321 | points = oom_badness(p, NULL, nodemask, totalpages); | 329 | points = oom_badness(p, NULL, nodemask, totalpages); |
| 322 | if (points > chosen_points) { | 330 | if (!points || points < chosen_points) |
| 323 | chosen = p; | 331 | continue; |
| 324 | chosen_points = points; | 332 | /* Prefer thread group leaders for display purposes */ |
| 325 | } | 333 | if (points == chosen_points && thread_group_leader(chosen)) |
| 326 | } while_each_thread(g, p); | 334 | continue; |
| 335 | |||
| 336 | chosen = p; | ||
| 337 | chosen_points = points; | ||
| 338 | } | ||
| 327 | if (chosen) | 339 | if (chosen) |
| 328 | get_task_struct(chosen); | 340 | get_task_struct(chosen); |
| 329 | rcu_read_unlock(); | 341 | rcu_read_unlock(); |
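Reviewer note: the rewritten select_bad_process() scan keeps the current maximum and, on a tie, prefers to keep an already-chosen thread group leader so the eventual report names the group leader rather than an arbitrary thread. The selection shape, reduced to plain C over an array (struct and field names are invented):

    #include <stdbool.h>
    #include <stdio.h>

    struct victim {
        const char *name;
        unsigned long points;
        bool group_leader;
    };

    static const struct victim *pick_victim(const struct victim *v, int n)
    {
        const struct victim *chosen = NULL;
        unsigned long chosen_points = 0;

        for (int i = 0; i < n; i++) {
            unsigned long points = v[i].points;

            if (!points || points < chosen_points)
                continue;
            /* On a tie, keep the group leader we already have. */
            if (points == chosen_points && chosen && chosen->group_leader)
                continue;
            chosen = &v[i];
            chosen_points = points;
        }
        return chosen;
    }

    int main(void)
    {
        struct victim tasks[] = {
            { "leader",  500, true  },
            { "thread1", 500, false },
            { "thread2", 300, false },
        };
        printf("chosen: %s\n", pick_victim(tasks, 3)->name); /* "leader" */
        return 0;
    }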
| @@ -406,7 +418,7 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, | |||
| 406 | { | 418 | { |
| 407 | struct task_struct *victim = p; | 419 | struct task_struct *victim = p; |
| 408 | struct task_struct *child; | 420 | struct task_struct *child; |
| 409 | struct task_struct *t = p; | 421 | struct task_struct *t; |
| 410 | struct mm_struct *mm; | 422 | struct mm_struct *mm; |
| 411 | unsigned int victim_points = 0; | 423 | unsigned int victim_points = 0; |
| 412 | static DEFINE_RATELIMIT_STATE(oom_rs, DEFAULT_RATELIMIT_INTERVAL, | 424 | static DEFINE_RATELIMIT_STATE(oom_rs, DEFAULT_RATELIMIT_INTERVAL, |
| @@ -437,7 +449,7 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, | |||
| 437 | * still freeing memory. | 449 | * still freeing memory. |
| 438 | */ | 450 | */ |
| 439 | read_lock(&tasklist_lock); | 451 | read_lock(&tasklist_lock); |
| 440 | do { | 452 | for_each_thread(p, t) { |
| 441 | list_for_each_entry(child, &t->children, sibling) { | 453 | list_for_each_entry(child, &t->children, sibling) { |
| 442 | unsigned int child_points; | 454 | unsigned int child_points; |
| 443 | 455 | ||
| @@ -455,13 +467,11 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, | |||
| 455 | get_task_struct(victim); | 467 | get_task_struct(victim); |
| 456 | } | 468 | } |
| 457 | } | 469 | } |
| 458 | } while_each_thread(p, t); | 470 | } |
| 459 | read_unlock(&tasklist_lock); | 471 | read_unlock(&tasklist_lock); |
| 460 | 472 | ||
| 461 | rcu_read_lock(); | ||
| 462 | p = find_lock_task_mm(victim); | 473 | p = find_lock_task_mm(victim); |
| 463 | if (!p) { | 474 | if (!p) { |
| 464 | rcu_read_unlock(); | ||
| 465 | put_task_struct(victim); | 475 | put_task_struct(victim); |
| 466 | return; | 476 | return; |
| 467 | } else if (victim != p) { | 477 | } else if (victim != p) { |
| @@ -487,6 +497,7 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, | |||
| 487 | * That thread will now get access to memory reserves since it has a | 497 | * That thread will now get access to memory reserves since it has a |
| 488 | * pending fatal signal. | 498 | * pending fatal signal. |
| 489 | */ | 499 | */ |
| 500 | rcu_read_lock(); | ||
| 490 | for_each_process(p) | 501 | for_each_process(p) |
| 491 | if (p->mm == mm && !same_thread_group(p, victim) && | 502 | if (p->mm == mm && !same_thread_group(p, victim) && |
| 492 | !(p->flags & PF_KTHREAD)) { | 503 | !(p->flags & PF_KTHREAD)) { |
diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 63807583d8e8..7106cb1aca8e 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c | |||
| @@ -191,6 +191,26 @@ static unsigned long writeout_period_time = 0; | |||
| 191 | * global dirtyable memory first. | 191 | * global dirtyable memory first. |
| 192 | */ | 192 | */ |
| 193 | 193 | ||
| 194 | /** | ||
| 195 | * zone_dirtyable_memory - number of dirtyable pages in a zone | ||
| 196 | * @zone: the zone | ||
| 197 | * | ||
| 198 | * Returns the zone's number of pages potentially available for dirty | ||
| 199 | * page cache. This is the base value for the per-zone dirty limits. | ||
| 200 | */ | ||
| 201 | static unsigned long zone_dirtyable_memory(struct zone *zone) | ||
| 202 | { | ||
| 203 | unsigned long nr_pages; | ||
| 204 | |||
| 205 | nr_pages = zone_page_state(zone, NR_FREE_PAGES); | ||
| 206 | nr_pages -= min(nr_pages, zone->dirty_balance_reserve); | ||
| 207 | |||
| 208 | nr_pages += zone_page_state(zone, NR_INACTIVE_FILE); | ||
| 209 | nr_pages += zone_page_state(zone, NR_ACTIVE_FILE); | ||
| 210 | |||
| 211 | return nr_pages; | ||
| 212 | } | ||
| 213 | |||
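Reviewer note: the relocated zone_dirtyable_memory() above starts from the zone's free pages, subtracts the dirty-balance reserve with an underflow guard (the min() keeps the unsigned intermediate from wrapping), and then adds the file LRU pages. The same arithmetic as a free-standing helper with hypothetical inputs:

    #include <stdio.h>

    static unsigned long min_ul(unsigned long a, unsigned long b)
    {
        return a < b ? a : b;
    }

    static unsigned long dirtyable_memory(unsigned long free_pages,
                                          unsigned long dirty_reserve,
                                          unsigned long inactive_file,
                                          unsigned long active_file)
    {
        unsigned long nr_pages = free_pages;

        /* Never let the reserve push an unsigned value below zero. */
        nr_pages -= min_ul(nr_pages, dirty_reserve);

        /* Clean file pages can be reclaimed, so they count as dirtyable. */
        nr_pages += inactive_file + active_file;

        return nr_pages;
    }

    int main(void)
    {
        /* reserve larger than free memory: clamps to 0 instead of wrapping */
        printf("%lu\n", dirtyable_memory(100, 500, 2000, 3000)); /* 5000 */
        return 0;
    }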
| 194 | static unsigned long highmem_dirtyable_memory(unsigned long total) | 214 | static unsigned long highmem_dirtyable_memory(unsigned long total) |
| 195 | { | 215 | { |
| 196 | #ifdef CONFIG_HIGHMEM | 216 | #ifdef CONFIG_HIGHMEM |
| @@ -198,11 +218,9 @@ static unsigned long highmem_dirtyable_memory(unsigned long total) | |||
| 198 | unsigned long x = 0; | 218 | unsigned long x = 0; |
| 199 | 219 | ||
| 200 | for_each_node_state(node, N_HIGH_MEMORY) { | 220 | for_each_node_state(node, N_HIGH_MEMORY) { |
| 201 | struct zone *z = | 221 | struct zone *z = &NODE_DATA(node)->node_zones[ZONE_HIGHMEM]; |
| 202 | &NODE_DATA(node)->node_zones[ZONE_HIGHMEM]; | ||
| 203 | 222 | ||
| 204 | x += zone_page_state(z, NR_FREE_PAGES) + | 223 | x += zone_dirtyable_memory(z); |
| 205 | zone_reclaimable_pages(z) - z->dirty_balance_reserve; | ||
| 206 | } | 224 | } |
| 207 | /* | 225 | /* |
| 208 | * Unreclaimable memory (kernel memory or anonymous memory | 226 | * Unreclaimable memory (kernel memory or anonymous memory |
| @@ -238,9 +256,12 @@ static unsigned long global_dirtyable_memory(void) | |||
| 238 | { | 256 | { |
| 239 | unsigned long x; | 257 | unsigned long x; |
| 240 | 258 | ||
| 241 | x = global_page_state(NR_FREE_PAGES) + global_reclaimable_pages(); | 259 | x = global_page_state(NR_FREE_PAGES); |
| 242 | x -= min(x, dirty_balance_reserve); | 260 | x -= min(x, dirty_balance_reserve); |
| 243 | 261 | ||
| 262 | x += global_page_state(NR_INACTIVE_FILE); | ||
| 263 | x += global_page_state(NR_ACTIVE_FILE); | ||
| 264 | |||
| 244 | if (!vm_highmem_is_dirtyable) | 265 | if (!vm_highmem_is_dirtyable) |
| 245 | x -= highmem_dirtyable_memory(x); | 266 | x -= highmem_dirtyable_memory(x); |
| 246 | 267 | ||
| @@ -289,32 +310,6 @@ void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty) | |||
| 289 | } | 310 | } |
| 290 | 311 | ||
| 291 | /** | 312 | /** |
| 292 | * zone_dirtyable_memory - number of dirtyable pages in a zone | ||
| 293 | * @zone: the zone | ||
| 294 | * | ||
| 295 | * Returns the zone's number of pages potentially available for dirty | ||
| 296 | * page cache. This is the base value for the per-zone dirty limits. | ||
| 297 | */ | ||
| 298 | static unsigned long zone_dirtyable_memory(struct zone *zone) | ||
| 299 | { | ||
| 300 | /* | ||
| 301 | * The effective global number of dirtyable pages may exclude | ||
| 302 | * highmem as a big-picture measure to keep the ratio between | ||
| 303 | * dirty memory and lowmem reasonable. | ||
| 304 | * | ||
| 305 | * But this function is purely about the individual zone and a | ||
| 306 | * highmem zone can hold its share of dirty pages, so we don't | ||
| 307 | * care about vm_highmem_is_dirtyable here. | ||
| 308 | */ | ||
| 309 | unsigned long nr_pages = zone_page_state(zone, NR_FREE_PAGES) + | ||
| 310 | zone_reclaimable_pages(zone); | ||
| 311 | |||
| 312 | /* don't allow this to underflow */ | ||
| 313 | nr_pages -= min(nr_pages, zone->dirty_balance_reserve); | ||
| 314 | return nr_pages; | ||
| 315 | } | ||
| 316 | |||
| 317 | /** | ||
| 318 | * zone_dirty_limit - maximum number of dirty pages allowed in a zone | 313 | * zone_dirty_limit - maximum number of dirty pages allowed in a zone |
| 319 | * @zone: the zone | 314 | * @zone: the zone |
| 320 | * | 315 | * |
| @@ -2178,11 +2173,12 @@ int __set_page_dirty_nobuffers(struct page *page) | |||
| 2178 | if (!TestSetPageDirty(page)) { | 2173 | if (!TestSetPageDirty(page)) { |
| 2179 | struct address_space *mapping = page_mapping(page); | 2174 | struct address_space *mapping = page_mapping(page); |
| 2180 | struct address_space *mapping2; | 2175 | struct address_space *mapping2; |
| 2176 | unsigned long flags; | ||
| 2181 | 2177 | ||
| 2182 | if (!mapping) | 2178 | if (!mapping) |
| 2183 | return 1; | 2179 | return 1; |
| 2184 | 2180 | ||
| 2185 | spin_lock_irq(&mapping->tree_lock); | 2181 | spin_lock_irqsave(&mapping->tree_lock, flags); |
| 2186 | mapping2 = page_mapping(page); | 2182 | mapping2 = page_mapping(page); |
| 2187 | if (mapping2) { /* Race with truncate? */ | 2183 | if (mapping2) { /* Race with truncate? */ |
| 2188 | BUG_ON(mapping2 != mapping); | 2184 | BUG_ON(mapping2 != mapping); |
| @@ -2191,7 +2187,7 @@ int __set_page_dirty_nobuffers(struct page *page) | |||
| 2191 | radix_tree_tag_set(&mapping->page_tree, | 2187 | radix_tree_tag_set(&mapping->page_tree, |
| 2192 | page_index(page), PAGECACHE_TAG_DIRTY); | 2188 | page_index(page), PAGECACHE_TAG_DIRTY); |
| 2193 | } | 2189 | } |
| 2194 | spin_unlock_irq(&mapping->tree_lock); | 2190 | spin_unlock_irqrestore(&mapping->tree_lock, flags); |
| 2195 | if (mapping->host) { | 2191 | if (mapping->host) { |
| 2196 | /* !PageAnon && !swapper_space */ | 2192 | /* !PageAnon && !swapper_space */ |
| 2197 | __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); | 2193 | __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); |
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 5248fe070aa4..3bac76ae4b30 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c | |||
| @@ -205,7 +205,7 @@ static char * const zone_names[MAX_NR_ZONES] = { | |||
| 205 | }; | 205 | }; |
| 206 | 206 | ||
| 207 | int min_free_kbytes = 1024; | 207 | int min_free_kbytes = 1024; |
| 208 | int user_min_free_kbytes; | 208 | int user_min_free_kbytes = -1; |
| 209 | 209 | ||
| 210 | static unsigned long __meminitdata nr_kernel_pages; | 210 | static unsigned long __meminitdata nr_kernel_pages; |
| 211 | static unsigned long __meminitdata nr_all_pages; | 211 | static unsigned long __meminitdata nr_all_pages; |
| @@ -295,7 +295,7 @@ static inline int bad_range(struct zone *zone, struct page *page) | |||
| 295 | } | 295 | } |
| 296 | #endif | 296 | #endif |
| 297 | 297 | ||
| 298 | static void bad_page(struct page *page) | 298 | static void bad_page(struct page *page, char *reason, unsigned long bad_flags) |
| 299 | { | 299 | { |
| 300 | static unsigned long resume; | 300 | static unsigned long resume; |
| 301 | static unsigned long nr_shown; | 301 | static unsigned long nr_shown; |
| @@ -329,7 +329,7 @@ static void bad_page(struct page *page) | |||
| 329 | 329 | ||
| 330 | printk(KERN_ALERT "BUG: Bad page state in process %s pfn:%05lx\n", | 330 | printk(KERN_ALERT "BUG: Bad page state in process %s pfn:%05lx\n", |
| 331 | current->comm, page_to_pfn(page)); | 331 | current->comm, page_to_pfn(page)); |
| 332 | dump_page(page); | 332 | dump_page_badflags(page, reason, bad_flags); |
| 333 | 333 | ||
| 334 | print_modules(); | 334 | print_modules(); |
| 335 | dump_stack(); | 335 | dump_stack(); |
| @@ -369,9 +369,11 @@ void prep_compound_page(struct page *page, unsigned long order) | |||
| 369 | __SetPageHead(page); | 369 | __SetPageHead(page); |
| 370 | for (i = 1; i < nr_pages; i++) { | 370 | for (i = 1; i < nr_pages; i++) { |
| 371 | struct page *p = page + i; | 371 | struct page *p = page + i; |
| 372 | __SetPageTail(p); | ||
| 373 | set_page_count(p, 0); | 372 | set_page_count(p, 0); |
| 374 | p->first_page = page; | 373 | p->first_page = page; |
| 374 | /* Make sure p->first_page is always valid for PageTail() */ | ||
| 375 | smp_wmb(); | ||
| 376 | __SetPageTail(p); | ||
| 375 | } | 377 | } |
| 376 | } | 378 | } |
| 377 | 379 | ||
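Reviewer note: the reordered prep_compound_page() fills in p->first_page before publishing PageTail, with smp_wmb() in between, so a racing PageTail() reader never sees the flag set while first_page is still stale. The same publish-after-write pattern expressed with C11 release/acquire operations; this is a userspace analogue with invented names, not the kernel primitives:

    #include <stdatomic.h>
    #include <stddef.h>
    #include <stdio.h>

    struct tail_page {
        struct tail_page *first_page;  /* payload written before publish */
        atomic_bool tail;              /* the "PageTail" flag */
    };

    static void publish_tail(struct tail_page *p, struct tail_page *head)
    {
        p->first_page = head;
        /* release store plays the role of smp_wmb() + __SetPageTail() */
        atomic_store_explicit(&p->tail, true, memory_order_release);
    }

    static struct tail_page *read_head(struct tail_page *p)
    {
        /* acquire pairs with the release: if tail is seen, first_page is valid */
        if (atomic_load_explicit(&p->tail, memory_order_acquire))
            return p->first_page;
        return NULL;
    }

    int main(void)
    {
        struct tail_page head = { NULL, false };
        struct tail_page tail = { NULL, false };

        publish_tail(&tail, &head);
        printf("%p\n", (void *)read_head(&tail));
        return 0;
    }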
| @@ -383,7 +385,7 @@ static int destroy_compound_page(struct page *page, unsigned long order) | |||
| 383 | int bad = 0; | 385 | int bad = 0; |
| 384 | 386 | ||
| 385 | if (unlikely(compound_order(page) != order)) { | 387 | if (unlikely(compound_order(page) != order)) { |
| 386 | bad_page(page); | 388 | bad_page(page, "wrong compound order", 0); |
| 387 | bad++; | 389 | bad++; |
| 388 | } | 390 | } |
| 389 | 391 | ||
| @@ -392,8 +394,11 @@ static int destroy_compound_page(struct page *page, unsigned long order) | |||
| 392 | for (i = 1; i < nr_pages; i++) { | 394 | for (i = 1; i < nr_pages; i++) { |
| 393 | struct page *p = page + i; | 395 | struct page *p = page + i; |
| 394 | 396 | ||
| 395 | if (unlikely(!PageTail(p) || (p->first_page != page))) { | 397 | if (unlikely(!PageTail(p))) { |
| 396 | bad_page(page); | 398 | bad_page(page, "PageTail not set", 0); |
| 399 | bad++; | ||
| 400 | } else if (unlikely(p->first_page != page)) { | ||
| 401 | bad_page(page, "first_page not consistent", 0); | ||
| 397 | bad++; | 402 | bad++; |
| 398 | } | 403 | } |
| 399 | __ClearPageTail(p); | 404 | __ClearPageTail(p); |
| @@ -506,12 +511,12 @@ static inline int page_is_buddy(struct page *page, struct page *buddy, | |||
| 506 | return 0; | 511 | return 0; |
| 507 | 512 | ||
| 508 | if (page_is_guard(buddy) && page_order(buddy) == order) { | 513 | if (page_is_guard(buddy) && page_order(buddy) == order) { |
| 509 | VM_BUG_ON(page_count(buddy) != 0); | 514 | VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy); |
| 510 | return 1; | 515 | return 1; |
| 511 | } | 516 | } |
| 512 | 517 | ||
| 513 | if (PageBuddy(buddy) && page_order(buddy) == order) { | 518 | if (PageBuddy(buddy) && page_order(buddy) == order) { |
| 514 | VM_BUG_ON(page_count(buddy) != 0); | 519 | VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy); |
| 515 | return 1; | 520 | return 1; |
| 516 | } | 521 | } |
| 517 | return 0; | 522 | return 0; |
| @@ -561,8 +566,8 @@ static inline void __free_one_page(struct page *page, | |||
| 561 | 566 | ||
| 562 | page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1); | 567 | page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1); |
| 563 | 568 | ||
| 564 | VM_BUG_ON(page_idx & ((1 << order) - 1)); | 569 | VM_BUG_ON_PAGE(page_idx & ((1 << order) - 1), page); |
| 565 | VM_BUG_ON(bad_range(zone, page)); | 570 | VM_BUG_ON_PAGE(bad_range(zone, page), page); |
| 566 | 571 | ||
| 567 | while (order < MAX_ORDER-1) { | 572 | while (order < MAX_ORDER-1) { |
| 568 | buddy_idx = __find_buddy_index(page_idx, order); | 573 | buddy_idx = __find_buddy_index(page_idx, order); |
| @@ -618,12 +623,23 @@ out: | |||
| 618 | 623 | ||
| 619 | static inline int free_pages_check(struct page *page) | 624 | static inline int free_pages_check(struct page *page) |
| 620 | { | 625 | { |
| 621 | if (unlikely(page_mapcount(page) | | 626 | char *bad_reason = NULL; |
| 622 | (page->mapping != NULL) | | 627 | unsigned long bad_flags = 0; |
| 623 | (atomic_read(&page->_count) != 0) | | 628 | |
| 624 | (page->flags & PAGE_FLAGS_CHECK_AT_FREE) | | 629 | if (unlikely(page_mapcount(page))) |
| 625 | (mem_cgroup_bad_page_check(page)))) { | 630 | bad_reason = "nonzero mapcount"; |
| 626 | bad_page(page); | 631 | if (unlikely(page->mapping != NULL)) |
| 632 | bad_reason = "non-NULL mapping"; | ||
| 633 | if (unlikely(atomic_read(&page->_count) != 0)) | ||
| 634 | bad_reason = "nonzero _count"; | ||
| 635 | if (unlikely(page->flags & PAGE_FLAGS_CHECK_AT_FREE)) { | ||
| 636 | bad_reason = "PAGE_FLAGS_CHECK_AT_FREE flag(s) set"; | ||
| 637 | bad_flags = PAGE_FLAGS_CHECK_AT_FREE; | ||
| 638 | } | ||
| 639 | if (unlikely(mem_cgroup_bad_page_check(page))) | ||
| 640 | bad_reason = "cgroup check failed"; | ||
| 641 | if (unlikely(bad_reason)) { | ||
| 642 | bad_page(page, bad_reason, bad_flags); | ||
| 627 | return 1; | 643 | return 1; |
| 628 | } | 644 | } |
| 629 | page_cpupid_reset_last(page); | 645 | page_cpupid_reset_last(page); |
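Reviewer note: instead of OR-ing every condition into one opaque test, the new free_pages_check() records which check failed and reports it once through bad_page(). The shape of that diagnostic pattern, on a made-up structure with hypothetical field names:

    #include <stdio.h>

    struct fake_page {
        int mapcount;
        void *mapping;
        int refcount;
        unsigned long flags;
    };

    #define FLAGS_CHECK_AT_FREE 0x7UL   /* invented flag mask */

    static int check_page_at_free(const struct fake_page *page)
    {
        const char *bad_reason = NULL;
        unsigned long bad_flags = 0;

        if (page->mapcount)
            bad_reason = "nonzero mapcount";
        if (page->mapping != NULL)
            bad_reason = "non-NULL mapping";
        if (page->refcount != 0)
            bad_reason = "nonzero _count";
        if (page->flags & FLAGS_CHECK_AT_FREE) {
            bad_reason = "unexpected flag(s) set";
            bad_flags = FLAGS_CHECK_AT_FREE;
        }

        if (bad_reason) {
            /* one report, carrying the most specific reason found */
            fprintf(stderr, "bad page: %s (flags %#lx)\n", bad_reason, bad_flags);
            return 1;
        }
        return 0;
    }

    int main(void)
    {
        struct fake_page ok  = { 0, NULL, 0, 0 };
        struct fake_page bad = { 1, NULL, 0, 0x4 };

        return check_page_at_free(&ok) + check_page_at_free(&bad);
    }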
| @@ -813,7 +829,7 @@ static inline void expand(struct zone *zone, struct page *page, | |||
| 813 | area--; | 829 | area--; |
| 814 | high--; | 830 | high--; |
| 815 | size >>= 1; | 831 | size >>= 1; |
| 816 | VM_BUG_ON(bad_range(zone, &page[size])); | 832 | VM_BUG_ON_PAGE(bad_range(zone, &page[size]), &page[size]); |
| 817 | 833 | ||
| 818 | #ifdef CONFIG_DEBUG_PAGEALLOC | 834 | #ifdef CONFIG_DEBUG_PAGEALLOC |
| 819 | if (high < debug_guardpage_minorder()) { | 835 | if (high < debug_guardpage_minorder()) { |
| @@ -843,12 +859,23 @@ static inline void expand(struct zone *zone, struct page *page, | |||
| 843 | */ | 859 | */ |
| 844 | static inline int check_new_page(struct page *page) | 860 | static inline int check_new_page(struct page *page) |
| 845 | { | 861 | { |
| 846 | if (unlikely(page_mapcount(page) | | 862 | char *bad_reason = NULL; |
| 847 | (page->mapping != NULL) | | 863 | unsigned long bad_flags = 0; |
| 848 | (atomic_read(&page->_count) != 0) | | 864 | |
| 849 | (page->flags & PAGE_FLAGS_CHECK_AT_PREP) | | 865 | if (unlikely(page_mapcount(page))) |
| 850 | (mem_cgroup_bad_page_check(page)))) { | 866 | bad_reason = "nonzero mapcount"; |
| 851 | bad_page(page); | 867 | if (unlikely(page->mapping != NULL)) |
| 868 | bad_reason = "non-NULL mapping"; | ||
| 869 | if (unlikely(atomic_read(&page->_count) != 0)) | ||
| 870 | bad_reason = "nonzero _count"; | ||
| 871 | if (unlikely(page->flags & PAGE_FLAGS_CHECK_AT_PREP)) { | ||
| 872 | bad_reason = "PAGE_FLAGS_CHECK_AT_PREP flag set"; | ||
| 873 | bad_flags = PAGE_FLAGS_CHECK_AT_PREP; | ||
| 874 | } | ||
| 875 | if (unlikely(mem_cgroup_bad_page_check(page))) | ||
| 876 | bad_reason = "cgroup check failed"; | ||
| 877 | if (unlikely(bad_reason)) { | ||
| 878 | bad_page(page, bad_reason, bad_flags); | ||
| 852 | return 1; | 879 | return 1; |
| 853 | } | 880 | } |
| 854 | return 0; | 881 | return 0; |
| @@ -955,7 +982,7 @@ int move_freepages(struct zone *zone, | |||
| 955 | 982 | ||
| 956 | for (page = start_page; page <= end_page;) { | 983 | for (page = start_page; page <= end_page;) { |
| 957 | /* Make sure we are not inadvertently changing nodes */ | 984 | /* Make sure we are not inadvertently changing nodes */ |
| 958 | VM_BUG_ON(page_to_nid(page) != zone_to_nid(zone)); | 985 | VM_BUG_ON_PAGE(page_to_nid(page) != zone_to_nid(zone), page); |
| 959 | 986 | ||
| 960 | if (!pfn_valid_within(page_to_pfn(page))) { | 987 | if (!pfn_valid_within(page_to_pfn(page))) { |
| 961 | page++; | 988 | page++; |
| @@ -1211,6 +1238,15 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp) | |||
| 1211 | } | 1238 | } |
| 1212 | local_irq_restore(flags); | 1239 | local_irq_restore(flags); |
| 1213 | } | 1240 | } |
| 1241 | static bool gfp_thisnode_allocation(gfp_t gfp_mask) | ||
| 1242 | { | ||
| 1243 | return (gfp_mask & GFP_THISNODE) == GFP_THISNODE; | ||
| 1244 | } | ||
| 1245 | #else | ||
| 1246 | static bool gfp_thisnode_allocation(gfp_t gfp_mask) | ||
| 1247 | { | ||
| 1248 | return false; | ||
| 1249 | } | ||
| 1214 | #endif | 1250 | #endif |
| 1215 | 1251 | ||
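Reviewer note: gfp_thisnode_allocation() tests `(gfp_mask & GFP_THISNODE) == GFP_THISNODE` rather than a plain `&`, because GFP_THISNODE is itself a combination of bits and the predicate must only fire when all of them are present. Illustrated with invented flag values:

    #include <stdbool.h>
    #include <stdio.h>

    #define FLAG_A 0x1UL
    #define FLAG_B 0x2UL
    #define COMPOSITE (FLAG_A | FLAG_B)   /* plays the role of GFP_THISNODE */

    static bool all_composite_bits_set(unsigned long mask)
    {
        /* true only if every bit of COMPOSITE is present in mask */
        return (mask & COMPOSITE) == COMPOSITE;
    }

    int main(void)
    {
        printf("%d\n", all_composite_bits_set(FLAG_A));           /* 0 */
        printf("%d\n", all_composite_bits_set(FLAG_A | FLAG_B));  /* 1 */
        printf("%d\n", (FLAG_A & COMPOSITE) != 0);                /* 1: why a plain & is wrong */
        return 0;
    }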
| 1216 | /* | 1252 | /* |
| @@ -1404,8 +1440,8 @@ void split_page(struct page *page, unsigned int order) | |||
| 1404 | { | 1440 | { |
| 1405 | int i; | 1441 | int i; |
| 1406 | 1442 | ||
| 1407 | VM_BUG_ON(PageCompound(page)); | 1443 | VM_BUG_ON_PAGE(PageCompound(page), page); |
| 1408 | VM_BUG_ON(!page_count(page)); | 1444 | VM_BUG_ON_PAGE(!page_count(page), page); |
| 1409 | 1445 | ||
| 1410 | #ifdef CONFIG_KMEMCHECK | 1446 | #ifdef CONFIG_KMEMCHECK |
| 1411 | /* | 1447 | /* |
| @@ -1547,12 +1583,18 @@ again: | |||
| 1547 | get_pageblock_migratetype(page)); | 1583 | get_pageblock_migratetype(page)); |
| 1548 | } | 1584 | } |
| 1549 | 1585 | ||
| 1550 | __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order)); | 1586 | /* |
| 1587 | * NOTE: GFP_THISNODE allocations do not partake in the kswapd | ||
| 1588 | * aging protocol, so they can't be fair. | ||
| 1589 | */ | ||
| 1590 | if (!gfp_thisnode_allocation(gfp_flags)) | ||
| 1591 | __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order)); | ||
| 1592 | |||
| 1551 | __count_zone_vm_events(PGALLOC, zone, 1 << order); | 1593 | __count_zone_vm_events(PGALLOC, zone, 1 << order); |
| 1552 | zone_statistics(preferred_zone, zone, gfp_flags); | 1594 | zone_statistics(preferred_zone, zone, gfp_flags); |
| 1553 | local_irq_restore(flags); | 1595 | local_irq_restore(flags); |
| 1554 | 1596 | ||
| 1555 | VM_BUG_ON(bad_range(zone, page)); | 1597 | VM_BUG_ON_PAGE(bad_range(zone, page), page); |
| 1556 | if (prep_new_page(page, order, gfp_flags)) | 1598 | if (prep_new_page(page, order, gfp_flags)) |
| 1557 | goto again; | 1599 | goto again; |
| 1558 | return page; | 1600 | return page; |
| @@ -1919,8 +1961,12 @@ zonelist_scan: | |||
| 1919 | * ultimately fall back to remote zones that do not | 1961 | * ultimately fall back to remote zones that do not |
| 1920 | * partake in the fairness round-robin cycle of this | 1962 | * partake in the fairness round-robin cycle of this |
| 1921 | * zonelist. | 1963 | * zonelist. |
| 1964 | * | ||
| 1965 | * NOTE: GFP_THISNODE allocations do not partake in | ||
| 1966 | * the kswapd aging protocol, so they can't be fair. | ||
| 1922 | */ | 1967 | */ |
| 1923 | if (alloc_flags & ALLOC_WMARK_LOW) { | 1968 | if ((alloc_flags & ALLOC_WMARK_LOW) && |
| 1969 | !gfp_thisnode_allocation(gfp_mask)) { | ||
| 1924 | if (zone_page_state(zone, NR_ALLOC_BATCH) <= 0) | 1970 | if (zone_page_state(zone, NR_ALLOC_BATCH) <= 0) |
| 1925 | continue; | 1971 | continue; |
| 1926 | if (!zone_local(preferred_zone, zone)) | 1972 | if (!zone_local(preferred_zone, zone)) |
| @@ -2072,13 +2118,6 @@ void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...) | |||
| 2072 | return; | 2118 | return; |
| 2073 | 2119 | ||
| 2074 | /* | 2120 | /* |
| 2075 | * Walking all memory to count page types is very expensive and should | ||
| 2076 | * be inhibited in non-blockable contexts. | ||
| 2077 | */ | ||
| 2078 | if (!(gfp_mask & __GFP_WAIT)) | ||
| 2079 | filter |= SHOW_MEM_FILTER_PAGE_COUNT; | ||
| 2080 | |||
| 2081 | /* | ||
| 2082 | * This documents exceptions given to allocations in certain | 2121 | * This documents exceptions given to allocations in certain |
| 2083 | * contexts that are allowed to allocate outside current's set | 2122 | * contexts that are allowed to allocate outside current's set |
| 2084 | * of allowed nodes. | 2123 | * of allowed nodes. |
| @@ -2242,10 +2281,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, | |||
| 2242 | preferred_zone, migratetype); | 2281 | preferred_zone, migratetype); |
| 2243 | if (page) { | 2282 | if (page) { |
| 2244 | preferred_zone->compact_blockskip_flush = false; | 2283 | preferred_zone->compact_blockskip_flush = false; |
| 2245 | preferred_zone->compact_considered = 0; | 2284 | compaction_defer_reset(preferred_zone, order, true); |
| 2246 | preferred_zone->compact_defer_shift = 0; | ||
| 2247 | if (order >= preferred_zone->compact_order_failed) | ||
| 2248 | preferred_zone->compact_order_failed = order + 1; | ||
| 2249 | count_vm_event(COMPACTSUCCESS); | 2285 | count_vm_event(COMPACTSUCCESS); |
| 2250 | return page; | 2286 | return page; |
| 2251 | } | 2287 | } |
| @@ -2486,8 +2522,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, | |||
| 2486 | * allowed per node queues are empty and that nodes are | 2522 | * allowed per node queues are empty and that nodes are |
| 2487 | * over allocated. | 2523 | * over allocated. |
| 2488 | */ | 2524 | */ |
| 2489 | if (IS_ENABLED(CONFIG_NUMA) && | 2525 | if (gfp_thisnode_allocation(gfp_mask)) |
| 2490 | (gfp_mask & GFP_THISNODE) == GFP_THISNODE) | ||
| 2491 | goto nopage; | 2526 | goto nopage; |
| 2492 | 2527 | ||
| 2493 | restart: | 2528 | restart: |
| @@ -2535,8 +2570,15 @@ rebalance: | |||
| 2535 | } | 2570 | } |
| 2536 | 2571 | ||
| 2537 | /* Atomic allocations - we can't balance anything */ | 2572 | /* Atomic allocations - we can't balance anything */ |
| 2538 | if (!wait) | 2573 | if (!wait) { |
| 2574 | /* | ||
| 2575 | * All existing users of the deprecated __GFP_NOFAIL are | ||
| 2576 | * blockable, so warn of any new users that actually allow this | ||
| 2577 | * type of allocation to fail. | ||
| 2578 | */ | ||
| 2579 | WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL); | ||
| 2539 | goto nopage; | 2580 | goto nopage; |
| 2581 | } | ||
| 2540 | 2582 | ||
| 2541 | /* Avoid recursion of direct reclaim */ | 2583 | /* Avoid recursion of direct reclaim */ |
| 2542 | if (current->flags & PF_MEMALLOC) | 2584 | if (current->flags & PF_MEMALLOC) |
| @@ -3901,6 +3943,7 @@ static void setup_zone_migrate_reserve(struct zone *zone) | |||
| 3901 | struct page *page; | 3943 | struct page *page; |
| 3902 | unsigned long block_migratetype; | 3944 | unsigned long block_migratetype; |
| 3903 | int reserve; | 3945 | int reserve; |
| 3946 | int old_reserve; | ||
| 3904 | 3947 | ||
| 3905 | /* | 3948 | /* |
| 3906 | * Get the start pfn, end pfn and the number of blocks to reserve | 3949 | * Get the start pfn, end pfn and the number of blocks to reserve |
| @@ -3922,6 +3965,12 @@ static void setup_zone_migrate_reserve(struct zone *zone) | |||
| 3922 | * future allocation of hugepages at runtime. | 3965 | * future allocation of hugepages at runtime. |
| 3923 | */ | 3966 | */ |
| 3924 | reserve = min(2, reserve); | 3967 | reserve = min(2, reserve); |
| 3968 | old_reserve = zone->nr_migrate_reserve_block; | ||
| 3969 | |||
| 3970 | /* On memory hot-add, we almost always need to do nothing */ | ||
| 3971 | if (reserve == old_reserve) | ||
| 3972 | return; | ||
| 3973 | zone->nr_migrate_reserve_block = reserve; | ||
| 3925 | 3974 | ||
| 3926 | for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) { | 3975 | for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) { |
| 3927 | if (!pfn_valid(pfn)) | 3976 | if (!pfn_valid(pfn)) |
| @@ -3959,6 +4008,12 @@ static void setup_zone_migrate_reserve(struct zone *zone) | |||
| 3959 | reserve--; | 4008 | reserve--; |
| 3960 | continue; | 4009 | continue; |
| 3961 | } | 4010 | } |
| 4011 | } else if (!old_reserve) { | ||
| 4012 | /* | ||
| 4013 | * At boot time we don't need to scan the whole zone | ||
| 4014 | * for turning off MIGRATE_RESERVE. | ||
| 4015 | */ | ||
| 4016 | break; | ||
| 3962 | } | 4017 | } |
| 3963 | 4018 | ||
| 3964 | /* | 4019 | /* |
| @@ -4209,7 +4264,6 @@ static noinline __init_refok | |||
| 4209 | int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages) | 4264 | int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages) |
| 4210 | { | 4265 | { |
| 4211 | int i; | 4266 | int i; |
| 4212 | struct pglist_data *pgdat = zone->zone_pgdat; | ||
| 4213 | size_t alloc_size; | 4267 | size_t alloc_size; |
| 4214 | 4268 | ||
| 4215 | /* | 4269 | /* |
| @@ -4225,7 +4279,8 @@ int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages) | |||
| 4225 | 4279 | ||
| 4226 | if (!slab_is_available()) { | 4280 | if (!slab_is_available()) { |
| 4227 | zone->wait_table = (wait_queue_head_t *) | 4281 | zone->wait_table = (wait_queue_head_t *) |
| 4228 | alloc_bootmem_node_nopanic(pgdat, alloc_size); | 4282 | memblock_virt_alloc_node_nopanic( |
| 4283 | alloc_size, zone->zone_pgdat->node_id); | ||
| 4229 | } else { | 4284 | } else { |
| 4230 | /* | 4285 | /* |
| 4231 | * This case means that a zone whose size was 0 gets new memory | 4286 | * This case means that a zone whose size was 0 gets new memory |
| @@ -4345,13 +4400,14 @@ bool __meminit early_pfn_in_nid(unsigned long pfn, int node) | |||
| 4345 | #endif | 4400 | #endif |
| 4346 | 4401 | ||
| 4347 | /** | 4402 | /** |
| 4348 | * free_bootmem_with_active_regions - Call free_bootmem_node for each active range | 4403 | * free_bootmem_with_active_regions - Call memblock_free_early_nid for each active range |
| 4349 | * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed. | 4404 | * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed. |
| 4350 | * @max_low_pfn: The highest PFN that will be passed to free_bootmem_node | 4405 | * @max_low_pfn: The highest PFN that will be passed to memblock_free_early_nid |
| 4351 | * | 4406 | * |
| 4352 | * If an architecture guarantees that all ranges registered with | 4407 | * If an architecture guarantees that all ranges registered with |
| 4353 | * add_active_ranges() contain no holes and may be freed, | 4408 | * add_active_ranges() contain no holes and may be freed, |
| 4354 | * this function may be used instead of calling free_bootmem() manually. | 4409 | * this function may be used instead of calling memblock_free_early_nid() |
| 4410 | * manually. | ||
| 4355 | */ | 4411 | */ |
| 4356 | void __init free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn) | 4412 | void __init free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn) |
| 4357 | { | 4413 | { |
| @@ -4363,9 +4419,9 @@ void __init free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn) | |||
| 4363 | end_pfn = min(end_pfn, max_low_pfn); | 4419 | end_pfn = min(end_pfn, max_low_pfn); |
| 4364 | 4420 | ||
| 4365 | if (start_pfn < end_pfn) | 4421 | if (start_pfn < end_pfn) |
| 4366 | free_bootmem_node(NODE_DATA(this_nid), | 4422 | memblock_free_early_nid(PFN_PHYS(start_pfn), |
| 4367 | PFN_PHYS(start_pfn), | 4423 | (end_pfn - start_pfn) << PAGE_SHIFT, |
| 4368 | (end_pfn - start_pfn) << PAGE_SHIFT); | 4424 | this_nid); |
| 4369 | } | 4425 | } |
| 4370 | } | 4426 | } |
| 4371 | 4427 | ||
| @@ -4636,8 +4692,9 @@ static void __init setup_usemap(struct pglist_data *pgdat, | |||
| 4636 | unsigned long usemapsize = usemap_size(zone_start_pfn, zonesize); | 4692 | unsigned long usemapsize = usemap_size(zone_start_pfn, zonesize); |
| 4637 | zone->pageblock_flags = NULL; | 4693 | zone->pageblock_flags = NULL; |
| 4638 | if (usemapsize) | 4694 | if (usemapsize) |
| 4639 | zone->pageblock_flags = alloc_bootmem_node_nopanic(pgdat, | 4695 | zone->pageblock_flags = |
| 4640 | usemapsize); | 4696 | memblock_virt_alloc_node_nopanic(usemapsize, |
| 4697 | pgdat->node_id); | ||
| 4641 | } | 4698 | } |
| 4642 | #else | 4699 | #else |
| 4643 | static inline void setup_usemap(struct pglist_data *pgdat, struct zone *zone, | 4700 | static inline void setup_usemap(struct pglist_data *pgdat, struct zone *zone, |
| @@ -4831,7 +4888,8 @@ static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat) | |||
| 4831 | size = (end - start) * sizeof(struct page); | 4888 | size = (end - start) * sizeof(struct page); |
| 4832 | map = alloc_remap(pgdat->node_id, size); | 4889 | map = alloc_remap(pgdat->node_id, size); |
| 4833 | if (!map) | 4890 | if (!map) |
| 4834 | map = alloc_bootmem_node_nopanic(pgdat, size); | 4891 | map = memblock_virt_alloc_node_nopanic(size, |
| 4892 | pgdat->node_id); | ||
| 4835 | pgdat->node_mem_map = map + (pgdat->node_start_pfn - start); | 4893 | pgdat->node_mem_map = map + (pgdat->node_start_pfn - start); |
| 4836 | } | 4894 | } |
| 4837 | #ifndef CONFIG_NEED_MULTIPLE_NODES | 4895 | #ifndef CONFIG_NEED_MULTIPLE_NODES |
| @@ -5012,9 +5070,33 @@ static void __init find_zone_movable_pfns_for_nodes(void) | |||
| 5012 | nodemask_t saved_node_state = node_states[N_MEMORY]; | 5070 | nodemask_t saved_node_state = node_states[N_MEMORY]; |
| 5013 | unsigned long totalpages = early_calculate_totalpages(); | 5071 | unsigned long totalpages = early_calculate_totalpages(); |
| 5014 | int usable_nodes = nodes_weight(node_states[N_MEMORY]); | 5072 | int usable_nodes = nodes_weight(node_states[N_MEMORY]); |
| 5073 | struct memblock_type *type = &memblock.memory; | ||
| 5074 | |||
| 5075 | /* Need to find movable_zone earlier when movable_node is specified. */ | ||
| 5076 | find_usable_zone_for_movable(); | ||
| 5015 | 5077 | ||
| 5016 | /* | 5078 | /* |
| 5017 | * If movablecore was specified, calculate what size of | 5079 | * If movable_node is specified, ignore kernelcore and movablecore |
| 5080 | * options. | ||
| 5081 | */ | ||
| 5082 | if (movable_node_is_enabled()) { | ||
| 5083 | for (i = 0; i < type->cnt; i++) { | ||
| 5084 | if (!memblock_is_hotpluggable(&type->regions[i])) | ||
| 5085 | continue; | ||
| 5086 | |||
| 5087 | nid = type->regions[i].nid; | ||
| 5088 | |||
| 5089 | usable_startpfn = PFN_DOWN(type->regions[i].base); | ||
| 5090 | zone_movable_pfn[nid] = zone_movable_pfn[nid] ? | ||
| 5091 | min(usable_startpfn, zone_movable_pfn[nid]) : | ||
| 5092 | usable_startpfn; | ||
| 5093 | } | ||
| 5094 | |||
| 5095 | goto out2; | ||
| 5096 | } | ||
| 5097 | |||
| 5098 | /* | ||
| 5099 | * If movablecore=nn[KMG] was specified, calculate what size of | ||
| 5018 | * kernelcore that corresponds so that memory usable for | 5100 | * kernelcore that corresponds so that memory usable for |
| 5019 | * any allocation type is evenly spread. If both kernelcore | 5101 | * any allocation type is evenly spread. If both kernelcore |
| 5020 | * and movablecore are specified, then the value of kernelcore | 5102 | * and movablecore are specified, then the value of kernelcore |
| @@ -5040,7 +5122,6 @@ static void __init find_zone_movable_pfns_for_nodes(void) | |||
| 5040 | goto out; | 5122 | goto out; |
| 5041 | 5123 | ||
| 5042 | /* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */ | 5124 | /* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */ |
| 5043 | find_usable_zone_for_movable(); | ||
| 5044 | usable_startpfn = arch_zone_lowest_possible_pfn[movable_zone]; | 5125 | usable_startpfn = arch_zone_lowest_possible_pfn[movable_zone]; |
| 5045 | 5126 | ||
| 5046 | restart: | 5127 | restart: |
| @@ -5131,6 +5212,7 @@ restart: | |||
| 5131 | if (usable_nodes && required_kernelcore > usable_nodes) | 5212 | if (usable_nodes && required_kernelcore > usable_nodes) |
| 5132 | goto restart; | 5213 | goto restart; |
| 5133 | 5214 | ||
| 5215 | out2: | ||
| 5134 | /* Align start of ZONE_MOVABLE on all nids to MAX_ORDER_NR_PAGES */ | 5216 | /* Align start of ZONE_MOVABLE on all nids to MAX_ORDER_NR_PAGES */ |
| 5135 | for (nid = 0; nid < MAX_NUMNODES; nid++) | 5217 | for (nid = 0; nid < MAX_NUMNODES; nid++) |
| 5136 | zone_movable_pfn[nid] = | 5218 | zone_movable_pfn[nid] = |
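
The movable_node branch added in this hunk amounts to a single scan of memblock: each region flagged as hotpluggable contributes its start pfn as a candidate ZONE_MOVABLE lower bound for its node, and the lowest candidate per node wins. A minimal sketch of that loop, assuming the memblock helpers named above and omitting the kernelcore/movablecore fallback path:

#include <linux/memblock.h>
#include <linux/pfn.h>

/* Sketch only: mirrors the movable_node branch shown in the hunk above. */
static void __init movable_pfns_from_hotplug_regions(unsigned long *zone_movable_pfn)
{
	struct memblock_type *type = &memblock.memory;
	unsigned long start_pfn;
	int i, nid;

	for (i = 0; i < type->cnt; i++) {
		/* only hotpluggable regions are allowed to become ZONE_MOVABLE */
		if (!memblock_is_hotpluggable(&type->regions[i]))
			continue;

		nid = type->regions[i].nid;
		start_pfn = PFN_DOWN(type->regions[i].base);

		/* keep the lowest hotpluggable pfn seen so far for this node */
		if (!zone_movable_pfn[nid] || start_pfn < zone_movable_pfn[nid])
			zone_movable_pfn[nid] = start_pfn;
	}
}
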
| @@ -5692,7 +5774,12 @@ module_init(init_per_zone_wmark_min) | |||
| 5692 | int min_free_kbytes_sysctl_handler(ctl_table *table, int write, | 5774 | int min_free_kbytes_sysctl_handler(ctl_table *table, int write, |
| 5693 | void __user *buffer, size_t *length, loff_t *ppos) | 5775 | void __user *buffer, size_t *length, loff_t *ppos) |
| 5694 | { | 5776 | { |
| 5695 | proc_dointvec(table, write, buffer, length, ppos); | 5777 | int rc; |
| 5778 | |||
| 5779 | rc = proc_dointvec_minmax(table, write, buffer, length, ppos); | ||
| 5780 | if (rc) | ||
| 5781 | return rc; | ||
| 5782 | |||
| 5696 | if (write) { | 5783 | if (write) { |
| 5697 | user_min_free_kbytes = min_free_kbytes; | 5784 | user_min_free_kbytes = min_free_kbytes; |
| 5698 | setup_per_zone_wmarks(); | 5785 | setup_per_zone_wmarks(); |
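
The handler change above follows the usual sysctl pattern: let proc_dointvec_minmax() parse and range-check the input, return its error instead of ignoring it, and only act on a successful write. A minimal sketch of that pattern; the handler name and the recompute_state() helper are illustrative, not part of the patch:

#include <linux/sysctl.h>

static void recompute_state(void)
{
	/* stand-in for whatever derived state the real handler refreshes */
}

static int example_minmax_handler(ctl_table *table, int write,
				  void __user *buffer, size_t *length, loff_t *ppos)
{
	int rc;

	/* parse and range-check the value; nothing has been applied yet */
	rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
	if (rc)
		return rc;		/* e.g. -EINVAL for an out-of-range value */

	if (write)
		recompute_state();	/* follow-up work only on a successful write */

	return 0;
}
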
| @@ -5857,7 +5944,7 @@ void *__init alloc_large_system_hash(const char *tablename, | |||
| 5857 | do { | 5944 | do { |
| 5858 | size = bucketsize << log2qty; | 5945 | size = bucketsize << log2qty; |
| 5859 | if (flags & HASH_EARLY) | 5946 | if (flags & HASH_EARLY) |
| 5860 | table = alloc_bootmem_nopanic(size); | 5947 | table = memblock_virt_alloc_nopanic(size, 0); |
| 5861 | else if (hashdist) | 5948 | else if (hashdist) |
| 5862 | table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL); | 5949 | table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL); |
| 5863 | else { | 5950 | else { |
| @@ -5959,7 +6046,7 @@ void set_pageblock_flags_group(struct page *page, unsigned long flags, | |||
| 5959 | pfn = page_to_pfn(page); | 6046 | pfn = page_to_pfn(page); |
| 5960 | bitmap = get_pageblock_bitmap(zone, pfn); | 6047 | bitmap = get_pageblock_bitmap(zone, pfn); |
| 5961 | bitidx = pfn_to_bitidx(zone, pfn); | 6048 | bitidx = pfn_to_bitidx(zone, pfn); |
| 5962 | VM_BUG_ON(!zone_spans_pfn(zone, pfn)); | 6049 | VM_BUG_ON_PAGE(!zone_spans_pfn(zone, pfn), page); |
| 5963 | 6050 | ||
| 5964 | for (; start_bitidx <= end_bitidx; start_bitidx++, value <<= 1) | 6051 | for (; start_bitidx <= end_bitidx; start_bitidx++, value <<= 1) |
| 5965 | if (flags & value) | 6052 | if (flags & value) |
| @@ -6457,12 +6544,24 @@ static void dump_page_flags(unsigned long flags) | |||
| 6457 | printk(")\n"); | 6544 | printk(")\n"); |
| 6458 | } | 6545 | } |
| 6459 | 6546 | ||
| 6460 | void dump_page(struct page *page) | 6547 | void dump_page_badflags(struct page *page, char *reason, unsigned long badflags) |
| 6461 | { | 6548 | { |
| 6462 | printk(KERN_ALERT | 6549 | printk(KERN_ALERT |
| 6463 | "page:%p count:%d mapcount:%d mapping:%p index:%#lx\n", | 6550 | "page:%p count:%d mapcount:%d mapping:%p index:%#lx\n", |
| 6464 | page, atomic_read(&page->_count), page_mapcount(page), | 6551 | page, atomic_read(&page->_count), page_mapcount(page), |
| 6465 | page->mapping, page->index); | 6552 | page->mapping, page->index); |
| 6466 | dump_page_flags(page->flags); | 6553 | dump_page_flags(page->flags); |
| 6554 | if (reason) | ||
| 6555 | pr_alert("page dumped because: %s\n", reason); | ||
| 6556 | if (page->flags & badflags) { | ||
| 6557 | pr_alert("bad because of flags:\n"); | ||
| 6558 | dump_page_flags(page->flags & badflags); | ||
| 6559 | } | ||
| 6467 | mem_cgroup_print_bad_page(page); | 6560 | mem_cgroup_print_bad_page(page); |
| 6468 | } | 6561 | } |
| 6562 | |||
| 6563 | void dump_page(struct page *page, char *reason) | ||
| 6564 | { | ||
| 6565 | dump_page_badflags(page, reason, 0); | ||
| 6566 | } | ||
| 6567 | EXPORT_SYMBOL_GPL(dump_page); | ||
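
With this hunk dump_page() takes a human-readable reason and becomes a thin wrapper around dump_page_badflags(), which can additionally single out the offending flag bits. A short usage sketch against the new interface; the caller and the reason strings are made up for illustration:

#include <linux/mm.h>
#include <linux/page-flags.h>

/* Illustrative caller of the reworked dump helpers. */
static void complain_about_page(struct page *page)
{
	/* plain dump plus a reason line, no flag filtering */
	if (unlikely(PageLRU(page)))
		dump_page(page, "unexpected LRU page");

	/* or name the flags considered bad so they are printed separately */
	dump_page_badflags(page, "flags check failed", 1UL << PG_locked);
}
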
diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c index 6d757e3a872a..cfd162882c00 100644 --- a/mm/page_cgroup.c +++ b/mm/page_cgroup.c | |||
| @@ -54,8 +54,9 @@ static int __init alloc_node_page_cgroup(int nid) | |||
| 54 | 54 | ||
| 55 | table_size = sizeof(struct page_cgroup) * nr_pages; | 55 | table_size = sizeof(struct page_cgroup) * nr_pages; |
| 56 | 56 | ||
| 57 | base = __alloc_bootmem_node_nopanic(NODE_DATA(nid), | 57 | base = memblock_virt_alloc_try_nid_nopanic( |
| 58 | table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS)); | 58 | table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS), |
| 59 | BOOTMEM_ALLOC_ACCESSIBLE, nid); | ||
| 59 | if (!base) | 60 | if (!base) |
| 60 | return -ENOMEM; | 61 | return -ENOMEM; |
| 61 | NODE_DATA(nid)->node_page_cgroup = base; | 62 | NODE_DATA(nid)->node_page_cgroup = base; |
| @@ -451,7 +452,7 @@ unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id) | |||
| 451 | * lookup_swap_cgroup_id - lookup mem_cgroup id tied to swap entry | 452 | * lookup_swap_cgroup_id - lookup mem_cgroup id tied to swap entry |
| 452 | * @ent: swap entry to be looked up. | 453 | * @ent: swap entry to be looked up. |
| 453 | * | 454 | * |
| 454 | * Returns CSS ID of mem_cgroup at success. 0 at failure. (0 is invalid ID) | 455 | * Returns ID of mem_cgroup at success. 0 at failure. (0 is invalid ID) |
| 455 | */ | 456 | */ |
| 456 | unsigned short lookup_swap_cgroup_id(swp_entry_t ent) | 457 | unsigned short lookup_swap_cgroup_id(swp_entry_t ent) |
| 457 | { | 458 | { |
diff --git a/mm/page_io.c b/mm/page_io.c index 8c79a4764be0..7c59ef681381 100644 --- a/mm/page_io.c +++ b/mm/page_io.c | |||
| @@ -31,13 +31,13 @@ static struct bio *get_swap_bio(gfp_t gfp_flags, | |||
| 31 | 31 | ||
| 32 | bio = bio_alloc(gfp_flags, 1); | 32 | bio = bio_alloc(gfp_flags, 1); |
| 33 | if (bio) { | 33 | if (bio) { |
| 34 | bio->bi_sector = map_swap_page(page, &bio->bi_bdev); | 34 | bio->bi_iter.bi_sector = map_swap_page(page, &bio->bi_bdev); |
| 35 | bio->bi_sector <<= PAGE_SHIFT - 9; | 35 | bio->bi_iter.bi_sector <<= PAGE_SHIFT - 9; |
| 36 | bio->bi_io_vec[0].bv_page = page; | 36 | bio->bi_io_vec[0].bv_page = page; |
| 37 | bio->bi_io_vec[0].bv_len = PAGE_SIZE; | 37 | bio->bi_io_vec[0].bv_len = PAGE_SIZE; |
| 38 | bio->bi_io_vec[0].bv_offset = 0; | 38 | bio->bi_io_vec[0].bv_offset = 0; |
| 39 | bio->bi_vcnt = 1; | 39 | bio->bi_vcnt = 1; |
| 40 | bio->bi_size = PAGE_SIZE; | 40 | bio->bi_iter.bi_size = PAGE_SIZE; |
| 41 | bio->bi_end_io = end_io; | 41 | bio->bi_end_io = end_io; |
| 42 | } | 42 | } |
| 43 | return bio; | 43 | return bio; |
| @@ -62,7 +62,7 @@ void end_swap_bio_write(struct bio *bio, int err) | |||
| 62 | printk(KERN_ALERT "Write-error on swap-device (%u:%u:%Lu)\n", | 62 | printk(KERN_ALERT "Write-error on swap-device (%u:%u:%Lu)\n", |
| 63 | imajor(bio->bi_bdev->bd_inode), | 63 | imajor(bio->bi_bdev->bd_inode), |
| 64 | iminor(bio->bi_bdev->bd_inode), | 64 | iminor(bio->bi_bdev->bd_inode), |
| 65 | (unsigned long long)bio->bi_sector); | 65 | (unsigned long long)bio->bi_iter.bi_sector); |
| 66 | ClearPageReclaim(page); | 66 | ClearPageReclaim(page); |
| 67 | } | 67 | } |
| 68 | end_page_writeback(page); | 68 | end_page_writeback(page); |
| @@ -80,7 +80,7 @@ void end_swap_bio_read(struct bio *bio, int err) | |||
| 80 | printk(KERN_ALERT "Read-error on swap-device (%u:%u:%Lu)\n", | 80 | printk(KERN_ALERT "Read-error on swap-device (%u:%u:%Lu)\n", |
| 81 | imajor(bio->bi_bdev->bd_inode), | 81 | imajor(bio->bi_bdev->bd_inode), |
| 82 | iminor(bio->bi_bdev->bd_inode), | 82 | iminor(bio->bi_bdev->bd_inode), |
| 83 | (unsigned long long)bio->bi_sector); | 83 | (unsigned long long)bio->bi_iter.bi_sector); |
| 84 | goto out; | 84 | goto out; |
| 85 | } | 85 | } |
| 86 | 86 | ||
| @@ -320,8 +320,8 @@ int swap_readpage(struct page *page) | |||
| 320 | int ret = 0; | 320 | int ret = 0; |
| 321 | struct swap_info_struct *sis = page_swap_info(page); | 321 | struct swap_info_struct *sis = page_swap_info(page); |
| 322 | 322 | ||
| 323 | VM_BUG_ON(!PageLocked(page)); | 323 | VM_BUG_ON_PAGE(!PageLocked(page), page); |
| 324 | VM_BUG_ON(PageUptodate(page)); | 324 | VM_BUG_ON_PAGE(PageUptodate(page), page); |
| 325 | if (frontswap_load(page) == 0) { | 325 | if (frontswap_load(page) == 0) { |
| 326 | SetPageUptodate(page); | 326 | SetPageUptodate(page); |
| 327 | unlock_page(page); | 327 | unlock_page(page); |
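
The page_io.c hunks are the mechanical side of the immutable-biovec conversion: the position and size fields that used to sit directly on struct bio (bi_sector, bi_size) now live in the embedded bi_iter. A condensed sketch of building a one-page swap bio with the new field names, mirroring get_swap_bio() above:

#include <linux/bio.h>

/* Sketch: same initialisation as the hunk above, using the bi_iter fields. */
static void init_one_page_bio(struct bio *bio, struct page *page, sector_t sector)
{
	bio->bi_iter.bi_sector = sector;	/* was bio->bi_sector */
	bio->bi_io_vec[0].bv_page = page;
	bio->bi_io_vec[0].bv_len = PAGE_SIZE;
	bio->bi_io_vec[0].bv_offset = 0;
	bio->bi_vcnt = 1;
	bio->bi_iter.bi_size = PAGE_SIZE;	/* was bio->bi_size */
}
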
diff --git a/mm/percpu.c b/mm/percpu.c index 0d10defe951e..036cfe07050f 100644 --- a/mm/percpu.c +++ b/mm/percpu.c | |||
| @@ -1063,7 +1063,7 @@ struct pcpu_alloc_info * __init pcpu_alloc_alloc_info(int nr_groups, | |||
| 1063 | __alignof__(ai->groups[0].cpu_map[0])); | 1063 | __alignof__(ai->groups[0].cpu_map[0])); |
| 1064 | ai_size = base_size + nr_units * sizeof(ai->groups[0].cpu_map[0]); | 1064 | ai_size = base_size + nr_units * sizeof(ai->groups[0].cpu_map[0]); |
| 1065 | 1065 | ||
| 1066 | ptr = alloc_bootmem_nopanic(PFN_ALIGN(ai_size)); | 1066 | ptr = memblock_virt_alloc_nopanic(PFN_ALIGN(ai_size), 0); |
| 1067 | if (!ptr) | 1067 | if (!ptr) |
| 1068 | return NULL; | 1068 | return NULL; |
| 1069 | ai = ptr; | 1069 | ai = ptr; |
| @@ -1088,7 +1088,7 @@ struct pcpu_alloc_info * __init pcpu_alloc_alloc_info(int nr_groups, | |||
| 1088 | */ | 1088 | */ |
| 1089 | void __init pcpu_free_alloc_info(struct pcpu_alloc_info *ai) | 1089 | void __init pcpu_free_alloc_info(struct pcpu_alloc_info *ai) |
| 1090 | { | 1090 | { |
| 1091 | free_bootmem(__pa(ai), ai->__ai_size); | 1091 | memblock_free_early(__pa(ai), ai->__ai_size); |
| 1092 | } | 1092 | } |
| 1093 | 1093 | ||
| 1094 | /** | 1094 | /** |
| @@ -1246,10 +1246,12 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, | |||
| 1246 | PCPU_SETUP_BUG_ON(pcpu_verify_alloc_info(ai) < 0); | 1246 | PCPU_SETUP_BUG_ON(pcpu_verify_alloc_info(ai) < 0); |
| 1247 | 1247 | ||
| 1248 | /* process group information and build config tables accordingly */ | 1248 | /* process group information and build config tables accordingly */ |
| 1249 | group_offsets = alloc_bootmem(ai->nr_groups * sizeof(group_offsets[0])); | 1249 | group_offsets = memblock_virt_alloc(ai->nr_groups * |
| 1250 | group_sizes = alloc_bootmem(ai->nr_groups * sizeof(group_sizes[0])); | 1250 | sizeof(group_offsets[0]), 0); |
| 1251 | unit_map = alloc_bootmem(nr_cpu_ids * sizeof(unit_map[0])); | 1251 | group_sizes = memblock_virt_alloc(ai->nr_groups * |
| 1252 | unit_off = alloc_bootmem(nr_cpu_ids * sizeof(unit_off[0])); | 1252 | sizeof(group_sizes[0]), 0); |
| 1253 | unit_map = memblock_virt_alloc(nr_cpu_ids * sizeof(unit_map[0]), 0); | ||
| 1254 | unit_off = memblock_virt_alloc(nr_cpu_ids * sizeof(unit_off[0]), 0); | ||
| 1253 | 1255 | ||
| 1254 | for (cpu = 0; cpu < nr_cpu_ids; cpu++) | 1256 | for (cpu = 0; cpu < nr_cpu_ids; cpu++) |
| 1255 | unit_map[cpu] = UINT_MAX; | 1257 | unit_map[cpu] = UINT_MAX; |
| @@ -1311,7 +1313,8 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, | |||
| 1311 | * empty chunks. | 1313 | * empty chunks. |
| 1312 | */ | 1314 | */ |
| 1313 | pcpu_nr_slots = __pcpu_size_to_slot(pcpu_unit_size) + 2; | 1315 | pcpu_nr_slots = __pcpu_size_to_slot(pcpu_unit_size) + 2; |
| 1314 | pcpu_slot = alloc_bootmem(pcpu_nr_slots * sizeof(pcpu_slot[0])); | 1316 | pcpu_slot = memblock_virt_alloc( |
| 1317 | pcpu_nr_slots * sizeof(pcpu_slot[0]), 0); | ||
| 1315 | for (i = 0; i < pcpu_nr_slots; i++) | 1318 | for (i = 0; i < pcpu_nr_slots; i++) |
| 1316 | INIT_LIST_HEAD(&pcpu_slot[i]); | 1319 | INIT_LIST_HEAD(&pcpu_slot[i]); |
| 1317 | 1320 | ||
| @@ -1322,7 +1325,7 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, | |||
| 1322 | * covers static area + reserved area (mostly used for module | 1325 | * covers static area + reserved area (mostly used for module |
| 1323 | * static percpu allocation). | 1326 | * static percpu allocation). |
| 1324 | */ | 1327 | */ |
| 1325 | schunk = alloc_bootmem(pcpu_chunk_struct_size); | 1328 | schunk = memblock_virt_alloc(pcpu_chunk_struct_size, 0); |
| 1326 | INIT_LIST_HEAD(&schunk->list); | 1329 | INIT_LIST_HEAD(&schunk->list); |
| 1327 | schunk->base_addr = base_addr; | 1330 | schunk->base_addr = base_addr; |
| 1328 | schunk->map = smap; | 1331 | schunk->map = smap; |
| @@ -1346,7 +1349,7 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, | |||
| 1346 | 1349 | ||
| 1347 | /* init dynamic chunk if necessary */ | 1350 | /* init dynamic chunk if necessary */ |
| 1348 | if (dyn_size) { | 1351 | if (dyn_size) { |
| 1349 | dchunk = alloc_bootmem(pcpu_chunk_struct_size); | 1352 | dchunk = memblock_virt_alloc(pcpu_chunk_struct_size, 0); |
| 1350 | INIT_LIST_HEAD(&dchunk->list); | 1353 | INIT_LIST_HEAD(&dchunk->list); |
| 1351 | dchunk->base_addr = base_addr; | 1354 | dchunk->base_addr = base_addr; |
| 1352 | dchunk->map = dmap; | 1355 | dchunk->map = dmap; |
| @@ -1626,7 +1629,7 @@ int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size, | |||
| 1626 | size_sum = ai->static_size + ai->reserved_size + ai->dyn_size; | 1629 | size_sum = ai->static_size + ai->reserved_size + ai->dyn_size; |
| 1627 | areas_size = PFN_ALIGN(ai->nr_groups * sizeof(void *)); | 1630 | areas_size = PFN_ALIGN(ai->nr_groups * sizeof(void *)); |
| 1628 | 1631 | ||
| 1629 | areas = alloc_bootmem_nopanic(areas_size); | 1632 | areas = memblock_virt_alloc_nopanic(areas_size, 0); |
| 1630 | if (!areas) { | 1633 | if (!areas) { |
| 1631 | rc = -ENOMEM; | 1634 | rc = -ENOMEM; |
| 1632 | goto out_free; | 1635 | goto out_free; |
| @@ -1686,10 +1689,10 @@ int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size, | |||
| 1686 | max_distance += ai->unit_size; | 1689 | max_distance += ai->unit_size; |
| 1687 | 1690 | ||
| 1688 | /* warn if maximum distance is further than 75% of vmalloc space */ | 1691 | /* warn if maximum distance is further than 75% of vmalloc space */ |
| 1689 | if (max_distance > (VMALLOC_END - VMALLOC_START) * 3 / 4) { | 1692 | if (max_distance > VMALLOC_TOTAL * 3 / 4) { |
| 1690 | pr_warning("PERCPU: max_distance=0x%zx too large for vmalloc " | 1693 | pr_warning("PERCPU: max_distance=0x%zx too large for vmalloc " |
| 1691 | "space 0x%lx\n", max_distance, | 1694 | "space 0x%lx\n", max_distance, |
| 1692 | (unsigned long)(VMALLOC_END - VMALLOC_START)); | 1695 | VMALLOC_TOTAL); |
| 1693 | #ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK | 1696 | #ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK |
| 1694 | /* and fail if we have fallback */ | 1697 | /* and fail if we have fallback */ |
| 1695 | rc = -EINVAL; | 1698 | rc = -EINVAL; |
| @@ -1712,7 +1715,7 @@ out_free_areas: | |||
| 1712 | out_free: | 1715 | out_free: |
| 1713 | pcpu_free_alloc_info(ai); | 1716 | pcpu_free_alloc_info(ai); |
| 1714 | if (areas) | 1717 | if (areas) |
| 1715 | free_bootmem(__pa(areas), areas_size); | 1718 | memblock_free_early(__pa(areas), areas_size); |
| 1716 | return rc; | 1719 | return rc; |
| 1717 | } | 1720 | } |
| 1718 | #endif /* BUILD_EMBED_FIRST_CHUNK */ | 1721 | #endif /* BUILD_EMBED_FIRST_CHUNK */ |
| @@ -1760,7 +1763,7 @@ int __init pcpu_page_first_chunk(size_t reserved_size, | |||
| 1760 | /* unaligned allocations can't be freed, round up to page size */ | 1763 | /* unaligned allocations can't be freed, round up to page size */ |
| 1761 | pages_size = PFN_ALIGN(unit_pages * num_possible_cpus() * | 1764 | pages_size = PFN_ALIGN(unit_pages * num_possible_cpus() * |
| 1762 | sizeof(pages[0])); | 1765 | sizeof(pages[0])); |
| 1763 | pages = alloc_bootmem(pages_size); | 1766 | pages = memblock_virt_alloc(pages_size, 0); |
| 1764 | 1767 | ||
| 1765 | /* allocate pages */ | 1768 | /* allocate pages */ |
| 1766 | j = 0; | 1769 | j = 0; |
| @@ -1823,7 +1826,7 @@ enomem: | |||
| 1823 | free_fn(page_address(pages[j]), PAGE_SIZE); | 1826 | free_fn(page_address(pages[j]), PAGE_SIZE); |
| 1824 | rc = -ENOMEM; | 1827 | rc = -ENOMEM; |
| 1825 | out_free_ar: | 1828 | out_free_ar: |
| 1826 | free_bootmem(__pa(pages), pages_size); | 1829 | memblock_free_early(__pa(pages), pages_size); |
| 1827 | pcpu_free_alloc_info(ai); | 1830 | pcpu_free_alloc_info(ai); |
| 1828 | return rc; | 1831 | return rc; |
| 1829 | } | 1832 | } |
| @@ -1848,12 +1851,13 @@ EXPORT_SYMBOL(__per_cpu_offset); | |||
| 1848 | static void * __init pcpu_dfl_fc_alloc(unsigned int cpu, size_t size, | 1851 | static void * __init pcpu_dfl_fc_alloc(unsigned int cpu, size_t size, |
| 1849 | size_t align) | 1852 | size_t align) |
| 1850 | { | 1853 | { |
| 1851 | return __alloc_bootmem_nopanic(size, align, __pa(MAX_DMA_ADDRESS)); | 1854 | return memblock_virt_alloc_from_nopanic( |
| 1855 | size, align, __pa(MAX_DMA_ADDRESS)); | ||
| 1852 | } | 1856 | } |
| 1853 | 1857 | ||
| 1854 | static void __init pcpu_dfl_fc_free(void *ptr, size_t size) | 1858 | static void __init pcpu_dfl_fc_free(void *ptr, size_t size) |
| 1855 | { | 1859 | { |
| 1856 | free_bootmem(__pa(ptr), size); | 1860 | memblock_free_early(__pa(ptr), size); |
| 1857 | } | 1861 | } |
| 1858 | 1862 | ||
| 1859 | void __init setup_per_cpu_areas(void) | 1863 | void __init setup_per_cpu_areas(void) |
| @@ -1896,7 +1900,9 @@ void __init setup_per_cpu_areas(void) | |||
| 1896 | void *fc; | 1900 | void *fc; |
| 1897 | 1901 | ||
| 1898 | ai = pcpu_alloc_alloc_info(1, 1); | 1902 | ai = pcpu_alloc_alloc_info(1, 1); |
| 1899 | fc = __alloc_bootmem(unit_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS)); | 1903 | fc = memblock_virt_alloc_from_nopanic(unit_size, |
| 1904 | PAGE_SIZE, | ||
| 1905 | __pa(MAX_DMA_ADDRESS)); | ||
| 1900 | if (!ai || !fc) | 1906 | if (!ai || !fc) |
| 1901 | panic("Failed to allocate memory for percpu areas."); | 1907 | panic("Failed to allocate memory for percpu areas."); |
| 1902 | /* kmemleak tracks the percpu allocations separately */ | 1908 | /* kmemleak tracks the percpu allocations separately */ |
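
The percpu.c hunks, like the page_cgroup.c and page_alloc.c ones earlier in this diff, replace the legacy bootmem allocators with their memblock equivalents. As used in this patch, the mapping is roughly: alloc_bootmem(size) becomes memblock_virt_alloc(size, 0), alloc_bootmem_nopanic(size) becomes memblock_virt_alloc_nopanic(size, 0), alloc_bootmem_node_nopanic(pgdat, size) becomes memblock_virt_alloc_node_nopanic(size, nid), __alloc_bootmem_nopanic(size, align, goal) becomes memblock_virt_alloc_from_nopanic(size, align, goal), and free_bootmem()/free_bootmem_node() become memblock_free_early()/memblock_free_early_nid(). A small sketch of the node-aware variant, assuming it is called from early boot code and that the helpers are declared in bootmem.h in this series:

#include <linux/bootmem.h>	/* assumed home of the memblock_virt_alloc_* helpers */

/* Sketch: node-aware, non-panicking boot-time allocation, as in setup_usemap(). */
static void * __init grab_early_buffer(size_t size, int nid)
{
	return memblock_virt_alloc_node_nopanic(size, nid);
}
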
diff --git a/mm/readahead.c b/mm/readahead.c index 7cdbb44aa90b..0de2360d65f3 100644 --- a/mm/readahead.c +++ b/mm/readahead.c | |||
| @@ -211,8 +211,6 @@ out: | |||
| 211 | int force_page_cache_readahead(struct address_space *mapping, struct file *filp, | 211 | int force_page_cache_readahead(struct address_space *mapping, struct file *filp, |
| 212 | pgoff_t offset, unsigned long nr_to_read) | 212 | pgoff_t offset, unsigned long nr_to_read) |
| 213 | { | 213 | { |
| 214 | int ret = 0; | ||
| 215 | |||
| 216 | if (unlikely(!mapping->a_ops->readpage && !mapping->a_ops->readpages)) | 214 | if (unlikely(!mapping->a_ops->readpage && !mapping->a_ops->readpages)) |
| 217 | return -EINVAL; | 215 | return -EINVAL; |
| 218 | 216 | ||
| @@ -226,15 +224,13 @@ int force_page_cache_readahead(struct address_space *mapping, struct file *filp, | |||
| 226 | this_chunk = nr_to_read; | 224 | this_chunk = nr_to_read; |
| 227 | err = __do_page_cache_readahead(mapping, filp, | 225 | err = __do_page_cache_readahead(mapping, filp, |
| 228 | offset, this_chunk, 0); | 226 | offset, this_chunk, 0); |
| 229 | if (err < 0) { | 227 | if (err < 0) |
| 230 | ret = err; | 228 | return err; |
| 231 | break; | 229 | |
| 232 | } | ||
| 233 | ret += err; | ||
| 234 | offset += this_chunk; | 230 | offset += this_chunk; |
| 235 | nr_to_read -= this_chunk; | 231 | nr_to_read -= this_chunk; |
| 236 | } | 232 | } |
| 237 | return ret; | 233 | return 0; |
| 238 | } | 234 | } |
| 239 | 235 | ||
| 240 | /* | 236 | /* |
| @@ -576,8 +572,7 @@ do_readahead(struct address_space *mapping, struct file *filp, | |||
| 576 | if (!mapping || !mapping->a_ops) | 572 | if (!mapping || !mapping->a_ops) |
| 577 | return -EINVAL; | 573 | return -EINVAL; |
| 578 | 574 | ||
| 579 | force_page_cache_readahead(mapping, filp, index, nr); | 575 | return force_page_cache_readahead(mapping, filp, index, nr); |
| 580 | return 0; | ||
| 581 | } | 576 | } |
| 582 | 577 | ||
| 583 | SYSCALL_DEFINE3(readahead, int, fd, loff_t, offset, size_t, count) | 578 | SYSCALL_DEFINE3(readahead, int, fd, loff_t, offset, size_t, count) |
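
After the readahead.c change, force_page_cache_readahead() no longer accumulates a page count: it returns 0 on success or the first error from __do_page_cache_readahead(), and do_readahead() simply propagates that value to the readahead(2) syscall. A minimal sketch of a caller under the new convention; the wrapper name is illustrative:

#include <linux/mm.h>
#include <linux/fs.h>
#include <linux/printk.h>

/* Illustrative: the return value is now only 0 or a negative errno. */
static int kick_readahead(struct address_space *mapping, struct file *filp,
			  pgoff_t index, unsigned long nr)
{
	int err = force_page_cache_readahead(mapping, filp, index, nr);

	if (err)
		pr_debug("readahead failed: %d\n", err);
	return err;
}
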
diff --git a/mm/rmap.c b/mm/rmap.c --- a/mm/rmap.c +++ b/mm/rmap.c | |||
| @@ -660,17 +660,22 @@ int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma) | |||
| 660 | return 1; | 660 | return 1; |
| 661 | } | 661 | } |
| 662 | 662 | ||
| 663 | struct page_referenced_arg { | ||
| 664 | int mapcount; | ||
| 665 | int referenced; | ||
| 666 | unsigned long vm_flags; | ||
| 667 | struct mem_cgroup *memcg; | ||
| 668 | }; | ||
| 663 | /* | 669 | /* |
| 664 | * Subfunctions of page_referenced: page_referenced_one called | 670 | * arg: page_referenced_arg will be passed |
| 665 | * repeatedly from either page_referenced_anon or page_referenced_file. | ||
| 666 | */ | 671 | */ |
| 667 | int page_referenced_one(struct page *page, struct vm_area_struct *vma, | 672 | int page_referenced_one(struct page *page, struct vm_area_struct *vma, |
| 668 | unsigned long address, unsigned int *mapcount, | 673 | unsigned long address, void *arg) |
| 669 | unsigned long *vm_flags) | ||
| 670 | { | 674 | { |
| 671 | struct mm_struct *mm = vma->vm_mm; | 675 | struct mm_struct *mm = vma->vm_mm; |
| 672 | spinlock_t *ptl; | 676 | spinlock_t *ptl; |
| 673 | int referenced = 0; | 677 | int referenced = 0; |
| 678 | struct page_referenced_arg *pra = arg; | ||
| 674 | 679 | ||
| 675 | if (unlikely(PageTransHuge(page))) { | 680 | if (unlikely(PageTransHuge(page))) { |
| 676 | pmd_t *pmd; | 681 | pmd_t *pmd; |
| @@ -682,13 +687,12 @@ int page_referenced_one(struct page *page, struct vm_area_struct *vma, | |||
| 682 | pmd = page_check_address_pmd(page, mm, address, | 687 | pmd = page_check_address_pmd(page, mm, address, |
| 683 | PAGE_CHECK_ADDRESS_PMD_FLAG, &ptl); | 688 | PAGE_CHECK_ADDRESS_PMD_FLAG, &ptl); |
| 684 | if (!pmd) | 689 | if (!pmd) |
| 685 | goto out; | 690 | return SWAP_AGAIN; |
| 686 | 691 | ||
| 687 | if (vma->vm_flags & VM_LOCKED) { | 692 | if (vma->vm_flags & VM_LOCKED) { |
| 688 | spin_unlock(ptl); | 693 | spin_unlock(ptl); |
| 689 | *mapcount = 0; /* break early from loop */ | 694 | pra->vm_flags |= VM_LOCKED; |
| 690 | *vm_flags |= VM_LOCKED; | 695 | return SWAP_FAIL; /* To break the loop */ |
| 691 | goto out; | ||
| 692 | } | 696 | } |
| 693 | 697 | ||
| 694 | /* go ahead even if the pmd is pmd_trans_splitting() */ | 698 | /* go ahead even if the pmd is pmd_trans_splitting() */ |
| @@ -704,13 +708,12 @@ int page_referenced_one(struct page *page, struct vm_area_struct *vma, | |||
| 704 | */ | 708 | */ |
| 705 | pte = page_check_address(page, mm, address, &ptl, 0); | 709 | pte = page_check_address(page, mm, address, &ptl, 0); |
| 706 | if (!pte) | 710 | if (!pte) |
| 707 | goto out; | 711 | return SWAP_AGAIN; |
| 708 | 712 | ||
| 709 | if (vma->vm_flags & VM_LOCKED) { | 713 | if (vma->vm_flags & VM_LOCKED) { |
| 710 | pte_unmap_unlock(pte, ptl); | 714 | pte_unmap_unlock(pte, ptl); |
| 711 | *mapcount = 0; /* break early from loop */ | 715 | pra->vm_flags |= VM_LOCKED; |
| 712 | *vm_flags |= VM_LOCKED; | 716 | return SWAP_FAIL; /* To break the loop */ |
| 713 | goto out; | ||
| 714 | } | 717 | } |
| 715 | 718 | ||
| 716 | if (ptep_clear_flush_young_notify(vma, address, pte)) { | 719 | if (ptep_clear_flush_young_notify(vma, address, pte)) { |
| @@ -727,113 +730,27 @@ int page_referenced_one(struct page *page, struct vm_area_struct *vma, | |||
| 727 | pte_unmap_unlock(pte, ptl); | 730 | pte_unmap_unlock(pte, ptl); |
| 728 | } | 731 | } |
| 729 | 732 | ||
| 730 | (*mapcount)--; | 733 | if (referenced) { |
| 731 | 734 | pra->referenced++; | |
| 732 | if (referenced) | 735 | pra->vm_flags |= vma->vm_flags; |
| 733 | *vm_flags |= vma->vm_flags; | ||
| 734 | out: | ||
| 735 | return referenced; | ||
| 736 | } | ||
| 737 | |||
| 738 | static int page_referenced_anon(struct page *page, | ||
| 739 | struct mem_cgroup *memcg, | ||
| 740 | unsigned long *vm_flags) | ||
| 741 | { | ||
| 742 | unsigned int mapcount; | ||
| 743 | struct anon_vma *anon_vma; | ||
| 744 | pgoff_t pgoff; | ||
| 745 | struct anon_vma_chain *avc; | ||
| 746 | int referenced = 0; | ||
| 747 | |||
| 748 | anon_vma = page_lock_anon_vma_read(page); | ||
| 749 | if (!anon_vma) | ||
| 750 | return referenced; | ||
| 751 | |||
| 752 | mapcount = page_mapcount(page); | ||
| 753 | pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); | ||
| 754 | anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) { | ||
| 755 | struct vm_area_struct *vma = avc->vma; | ||
| 756 | unsigned long address = vma_address(page, vma); | ||
| 757 | /* | ||
| 758 | * If we are reclaiming on behalf of a cgroup, skip | ||
| 759 | * counting on behalf of references from different | ||
| 760 | * cgroups | ||
| 761 | */ | ||
| 762 | if (memcg && !mm_match_cgroup(vma->vm_mm, memcg)) | ||
| 763 | continue; | ||
| 764 | referenced += page_referenced_one(page, vma, address, | ||
| 765 | &mapcount, vm_flags); | ||
| 766 | if (!mapcount) | ||
| 767 | break; | ||
| 768 | } | 736 | } |
| 769 | 737 | ||
| 770 | page_unlock_anon_vma_read(anon_vma); | 738 | pra->mapcount--; |
| 771 | return referenced; | 739 | if (!pra->mapcount) |
| 740 | return SWAP_SUCCESS; /* To break the loop */ | ||
| 741 | |||
| 742 | return SWAP_AGAIN; | ||
| 772 | } | 743 | } |
| 773 | 744 | ||
| 774 | /** | 745 | static bool invalid_page_referenced_vma(struct vm_area_struct *vma, void *arg) |
| 775 | * page_referenced_file - referenced check for object-based rmap | ||
| 776 | * @page: the page we're checking references on. | ||
| 777 | * @memcg: target memory control group | ||
| 778 | * @vm_flags: collect encountered vma->vm_flags who actually referenced the page | ||
| 779 | * | ||
| 780 | * For an object-based mapped page, find all the places it is mapped and | ||
| 781 | * check/clear the referenced flag. This is done by following the page->mapping | ||
| 782 | * pointer, then walking the chain of vmas it holds. It returns the number | ||
| 783 | * of references it found. | ||
| 784 | * | ||
| 785 | * This function is only called from page_referenced for object-based pages. | ||
| 786 | */ | ||
| 787 | static int page_referenced_file(struct page *page, | ||
| 788 | struct mem_cgroup *memcg, | ||
| 789 | unsigned long *vm_flags) | ||
| 790 | { | 746 | { |
| 791 | unsigned int mapcount; | 747 | struct page_referenced_arg *pra = arg; |
| 792 | struct address_space *mapping = page->mapping; | 748 | struct mem_cgroup *memcg = pra->memcg; |
| 793 | pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); | ||
| 794 | struct vm_area_struct *vma; | ||
| 795 | int referenced = 0; | ||
| 796 | |||
| 797 | /* | ||
| 798 | * The caller's checks on page->mapping and !PageAnon have made | ||
| 799 | * sure that this is a file page: the check for page->mapping | ||
| 800 | * excludes the case just before it gets set on an anon page. | ||
| 801 | */ | ||
| 802 | BUG_ON(PageAnon(page)); | ||
| 803 | |||
| 804 | /* | ||
| 805 | * The page lock not only makes sure that page->mapping cannot | ||
| 806 | * suddenly be NULLified by truncation, it makes sure that the | ||
| 807 | * structure at mapping cannot be freed and reused yet, | ||
| 808 | * so we can safely take mapping->i_mmap_mutex. | ||
| 809 | */ | ||
| 810 | BUG_ON(!PageLocked(page)); | ||
| 811 | |||
| 812 | mutex_lock(&mapping->i_mmap_mutex); | ||
| 813 | 749 | ||
| 814 | /* | 750 | if (!mm_match_cgroup(vma->vm_mm, memcg)) |
| 815 | * i_mmap_mutex does not stabilize mapcount at all, but mapcount | 751 | return true; |
| 816 | * is more likely to be accurate if we note it after spinning. | ||
| 817 | */ | ||
| 818 | mapcount = page_mapcount(page); | ||
| 819 | |||
| 820 | vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { | ||
| 821 | unsigned long address = vma_address(page, vma); | ||
| 822 | /* | ||
| 823 | * If we are reclaiming on behalf of a cgroup, skip | ||
| 824 | * counting on behalf of references from different | ||
| 825 | * cgroups | ||
| 826 | */ | ||
| 827 | if (memcg && !mm_match_cgroup(vma->vm_mm, memcg)) | ||
| 828 | continue; | ||
| 829 | referenced += page_referenced_one(page, vma, address, | ||
| 830 | &mapcount, vm_flags); | ||
| 831 | if (!mapcount) | ||
| 832 | break; | ||
| 833 | } | ||
| 834 | 752 | ||
| 835 | mutex_unlock(&mapping->i_mmap_mutex); | 753 | return false; |
| 836 | return referenced; | ||
| 837 | } | 754 | } |
| 838 | 755 | ||
| 839 | /** | 756 | /** |
| @@ -851,41 +768,57 @@ int page_referenced(struct page *page, | |||
| 851 | struct mem_cgroup *memcg, | 768 | struct mem_cgroup *memcg, |
| 852 | unsigned long *vm_flags) | 769 | unsigned long *vm_flags) |
| 853 | { | 770 | { |
| 854 | int referenced = 0; | 771 | int ret; |
| 855 | int we_locked = 0; | 772 | int we_locked = 0; |
| 773 | struct page_referenced_arg pra = { | ||
| 774 | .mapcount = page_mapcount(page), | ||
| 775 | .memcg = memcg, | ||
| 776 | }; | ||
| 777 | struct rmap_walk_control rwc = { | ||
| 778 | .rmap_one = page_referenced_one, | ||
| 779 | .arg = (void *)&pra, | ||
| 780 | .anon_lock = page_lock_anon_vma_read, | ||
| 781 | }; | ||
| 856 | 782 | ||
| 857 | *vm_flags = 0; | 783 | *vm_flags = 0; |
| 858 | if (page_mapped(page) && page_rmapping(page)) { | 784 | if (!page_mapped(page)) |
| 859 | if (!is_locked && (!PageAnon(page) || PageKsm(page))) { | 785 | return 0; |
| 860 | we_locked = trylock_page(page); | 786 | |
| 861 | if (!we_locked) { | 787 | if (!page_rmapping(page)) |
| 862 | referenced++; | 788 | return 0; |
| 863 | goto out; | 789 | |
| 864 | } | 790 | if (!is_locked && (!PageAnon(page) || PageKsm(page))) { |
| 865 | } | 791 | we_locked = trylock_page(page); |
| 866 | if (unlikely(PageKsm(page))) | 792 | if (!we_locked) |
| 867 | referenced += page_referenced_ksm(page, memcg, | 793 | return 1; |
| 868 | vm_flags); | ||
| 869 | else if (PageAnon(page)) | ||
| 870 | referenced += page_referenced_anon(page, memcg, | ||
| 871 | vm_flags); | ||
| 872 | else if (page->mapping) | ||
| 873 | referenced += page_referenced_file(page, memcg, | ||
| 874 | vm_flags); | ||
| 875 | if (we_locked) | ||
| 876 | unlock_page(page); | ||
| 877 | } | 794 | } |
| 878 | out: | 795 | |
| 879 | return referenced; | 796 | /* |
| 797 | * If we are reclaiming on behalf of a cgroup, skip | ||
| 798 | * counting on behalf of references from different | ||
| 799 | * cgroups | ||
| 800 | */ | ||
| 801 | if (memcg) { | ||
| 802 | rwc.invalid_vma = invalid_page_referenced_vma; | ||
| 803 | } | ||
| 804 | |||
| 805 | ret = rmap_walk(page, &rwc); | ||
| 806 | *vm_flags = pra.vm_flags; | ||
| 807 | |||
| 808 | if (we_locked) | ||
| 809 | unlock_page(page); | ||
| 810 | |||
| 811 | return pra.referenced; | ||
| 880 | } | 812 | } |
| 881 | 813 | ||
| 882 | static int page_mkclean_one(struct page *page, struct vm_area_struct *vma, | 814 | static int page_mkclean_one(struct page *page, struct vm_area_struct *vma, |
| 883 | unsigned long address) | 815 | unsigned long address, void *arg) |
| 884 | { | 816 | { |
| 885 | struct mm_struct *mm = vma->vm_mm; | 817 | struct mm_struct *mm = vma->vm_mm; |
| 886 | pte_t *pte; | 818 | pte_t *pte; |
| 887 | spinlock_t *ptl; | 819 | spinlock_t *ptl; |
| 888 | int ret = 0; | 820 | int ret = 0; |
| 821 | int *cleaned = arg; | ||
| 889 | 822 | ||
| 890 | pte = page_check_address(page, mm, address, &ptl, 1); | 823 | pte = page_check_address(page, mm, address, &ptl, 1); |
| 891 | if (!pte) | 824 | if (!pte) |
| @@ -904,44 +837,44 @@ static int page_mkclean_one(struct page *page, struct vm_area_struct *vma, | |||
| 904 | 837 | ||
| 905 | pte_unmap_unlock(pte, ptl); | 838 | pte_unmap_unlock(pte, ptl); |
| 906 | 839 | ||
| 907 | if (ret) | 840 | if (ret) { |
| 908 | mmu_notifier_invalidate_page(mm, address); | 841 | mmu_notifier_invalidate_page(mm, address); |
| 842 | (*cleaned)++; | ||
| 843 | } | ||
| 909 | out: | 844 | out: |
| 910 | return ret; | 845 | return SWAP_AGAIN; |
| 911 | } | 846 | } |
| 912 | 847 | ||
| 913 | static int page_mkclean_file(struct address_space *mapping, struct page *page) | 848 | static bool invalid_mkclean_vma(struct vm_area_struct *vma, void *arg) |
| 914 | { | 849 | { |
| 915 | pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); | 850 | if (vma->vm_flags & VM_SHARED) |
| 916 | struct vm_area_struct *vma; | 851 | return false; |
| 917 | int ret = 0; | ||
| 918 | |||
| 919 | BUG_ON(PageAnon(page)); | ||
| 920 | 852 | ||
| 921 | mutex_lock(&mapping->i_mmap_mutex); | 853 | return true; |
| 922 | vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { | ||
| 923 | if (vma->vm_flags & VM_SHARED) { | ||
| 924 | unsigned long address = vma_address(page, vma); | ||
| 925 | ret += page_mkclean_one(page, vma, address); | ||
| 926 | } | ||
| 927 | } | ||
| 928 | mutex_unlock(&mapping->i_mmap_mutex); | ||
| 929 | return ret; | ||
| 930 | } | 854 | } |
| 931 | 855 | ||
| 932 | int page_mkclean(struct page *page) | 856 | int page_mkclean(struct page *page) |
| 933 | { | 857 | { |
| 934 | int ret = 0; | 858 | int cleaned = 0; |
| 859 | struct address_space *mapping; | ||
| 860 | struct rmap_walk_control rwc = { | ||
| 861 | .arg = (void *)&cleaned, | ||
| 862 | .rmap_one = page_mkclean_one, | ||
| 863 | .invalid_vma = invalid_mkclean_vma, | ||
| 864 | }; | ||
| 935 | 865 | ||
| 936 | BUG_ON(!PageLocked(page)); | 866 | BUG_ON(!PageLocked(page)); |
| 937 | 867 | ||
| 938 | if (page_mapped(page)) { | 868 | if (!page_mapped(page)) |
| 939 | struct address_space *mapping = page_mapping(page); | 869 | return 0; |
| 940 | if (mapping) | ||
| 941 | ret = page_mkclean_file(mapping, page); | ||
| 942 | } | ||
| 943 | 870 | ||
| 944 | return ret; | 871 | mapping = page_mapping(page); |
| 872 | if (!mapping) | ||
| 873 | return 0; | ||
| 874 | |||
| 875 | rmap_walk(page, &rwc); | ||
| 876 | |||
| 877 | return cleaned; | ||
| 945 | } | 878 | } |
| 946 | EXPORT_SYMBOL_GPL(page_mkclean); | 879 | EXPORT_SYMBOL_GPL(page_mkclean); |
| 947 | 880 | ||
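
The rmap.c rework in the hunks above replaces the per-type walkers (page_referenced_anon/file, page_mkclean_file, try_to_unmap_anon/file) with one generic rmap_walk() driven by struct rmap_walk_control: rmap_one runs for every mapping of the page, arg carries caller state, done can end the walk early, invalid_vma filters VMAs, and anon_lock/file_nonlinear override the locking and nonlinear handling. A hedged sketch of a caller in the new style; the mapping-counting callback is invented for illustration and is not part of this patch:

#include <linux/mm.h>
#include <linux/rmap.h>

struct count_arg {
	int mappings;
};

/* invoked once per (vma, address) at which the page is mapped */
static int count_one(struct page *page, struct vm_area_struct *vma,
		     unsigned long address, void *arg)
{
	struct count_arg *ca = arg;

	ca->mappings++;
	return SWAP_AGAIN;	/* anything other than SWAP_AGAIN stops the walk early */
}

static int count_mappings(struct page *page)
{
	struct count_arg ca = { .mappings = 0 };
	struct rmap_walk_control rwc = {
		.rmap_one = count_one,
		.arg = &ca,
		.anon_lock = page_lock_anon_vma_read,	/* same lock hook page_referenced() uses */
	};

	rmap_walk(page, &rwc);
	return ca.mappings;
}

try_to_unmap() and try_to_munlock() use the same structure and pass their TTU flags through the void *arg slot with a cast, as the later hunks show.
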
| @@ -961,9 +894,9 @@ void page_move_anon_rmap(struct page *page, | |||
| 961 | { | 894 | { |
| 962 | struct anon_vma *anon_vma = vma->anon_vma; | 895 | struct anon_vma *anon_vma = vma->anon_vma; |
| 963 | 896 | ||
| 964 | VM_BUG_ON(!PageLocked(page)); | 897 | VM_BUG_ON_PAGE(!PageLocked(page), page); |
| 965 | VM_BUG_ON(!anon_vma); | 898 | VM_BUG_ON(!anon_vma); |
| 966 | VM_BUG_ON(page->index != linear_page_index(vma, address)); | 899 | VM_BUG_ON_PAGE(page->index != linear_page_index(vma, address), page); |
| 967 | 900 | ||
| 968 | anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON; | 901 | anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON; |
| 969 | page->mapping = (struct address_space *) anon_vma; | 902 | page->mapping = (struct address_space *) anon_vma; |
| @@ -1062,7 +995,7 @@ void do_page_add_anon_rmap(struct page *page, | |||
| 1062 | if (unlikely(PageKsm(page))) | 995 | if (unlikely(PageKsm(page))) |
| 1063 | return; | 996 | return; |
| 1064 | 997 | ||
| 1065 | VM_BUG_ON(!PageLocked(page)); | 998 | VM_BUG_ON_PAGE(!PageLocked(page), page); |
| 1066 | /* address might be in next vma when migration races vma_adjust */ | 999 | /* address might be in next vma when migration races vma_adjust */ |
| 1067 | if (first) | 1000 | if (first) |
| 1068 | __page_set_anon_rmap(page, vma, address, exclusive); | 1001 | __page_set_anon_rmap(page, vma, address, exclusive); |
| @@ -1177,17 +1110,17 @@ out: | |||
| 1177 | } | 1110 | } |
| 1178 | 1111 | ||
| 1179 | /* | 1112 | /* |
| 1180 | * Subfunctions of try_to_unmap: try_to_unmap_one called | 1113 | * @arg: enum ttu_flags will be passed to this argument |
| 1181 | * repeatedly from try_to_unmap_ksm, try_to_unmap_anon or try_to_unmap_file. | ||
| 1182 | */ | 1114 | */ |
| 1183 | int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, | 1115 | int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, |
| 1184 | unsigned long address, enum ttu_flags flags) | 1116 | unsigned long address, void *arg) |
| 1185 | { | 1117 | { |
| 1186 | struct mm_struct *mm = vma->vm_mm; | 1118 | struct mm_struct *mm = vma->vm_mm; |
| 1187 | pte_t *pte; | 1119 | pte_t *pte; |
| 1188 | pte_t pteval; | 1120 | pte_t pteval; |
| 1189 | spinlock_t *ptl; | 1121 | spinlock_t *ptl; |
| 1190 | int ret = SWAP_AGAIN; | 1122 | int ret = SWAP_AGAIN; |
| 1123 | enum ttu_flags flags = (enum ttu_flags)arg; | ||
| 1191 | 1124 | ||
| 1192 | pte = page_check_address(page, mm, address, &ptl, 0); | 1125 | pte = page_check_address(page, mm, address, &ptl, 0); |
| 1193 | if (!pte) | 1126 | if (!pte) |
| @@ -1426,93 +1359,9 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount, | |||
| 1426 | return ret; | 1359 | return ret; |
| 1427 | } | 1360 | } |
| 1428 | 1361 | ||
| 1429 | bool is_vma_temporary_stack(struct vm_area_struct *vma) | 1362 | static int try_to_unmap_nonlinear(struct page *page, |
| 1430 | { | 1363 | struct address_space *mapping, void *arg) |
| 1431 | int maybe_stack = vma->vm_flags & (VM_GROWSDOWN | VM_GROWSUP); | ||
| 1432 | |||
| 1433 | if (!maybe_stack) | ||
| 1434 | return false; | ||
| 1435 | |||
| 1436 | if ((vma->vm_flags & VM_STACK_INCOMPLETE_SETUP) == | ||
| 1437 | VM_STACK_INCOMPLETE_SETUP) | ||
| 1438 | return true; | ||
| 1439 | |||
| 1440 | return false; | ||
| 1441 | } | ||
| 1442 | |||
| 1443 | /** | ||
| 1444 | * try_to_unmap_anon - unmap or unlock anonymous page using the object-based | ||
| 1445 | * rmap method | ||
| 1446 | * @page: the page to unmap/unlock | ||
| 1447 | * @flags: action and flags | ||
| 1448 | * | ||
| 1449 | * Find all the mappings of a page using the mapping pointer and the vma chains | ||
| 1450 | * contained in the anon_vma struct it points to. | ||
| 1451 | * | ||
| 1452 | * This function is only called from try_to_unmap/try_to_munlock for | ||
| 1453 | * anonymous pages. | ||
| 1454 | * When called from try_to_munlock(), the mmap_sem of the mm containing the vma | ||
| 1455 | * where the page was found will be held for write. So, we won't recheck | ||
| 1456 | * vm_flags for that VMA. That should be OK, because that vma shouldn't be | ||
| 1457 | * 'LOCKED. | ||
| 1458 | */ | ||
| 1459 | static int try_to_unmap_anon(struct page *page, enum ttu_flags flags) | ||
| 1460 | { | ||
| 1461 | struct anon_vma *anon_vma; | ||
| 1462 | pgoff_t pgoff; | ||
| 1463 | struct anon_vma_chain *avc; | ||
| 1464 | int ret = SWAP_AGAIN; | ||
| 1465 | |||
| 1466 | anon_vma = page_lock_anon_vma_read(page); | ||
| 1467 | if (!anon_vma) | ||
| 1468 | return ret; | ||
| 1469 | |||
| 1470 | pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); | ||
| 1471 | anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) { | ||
| 1472 | struct vm_area_struct *vma = avc->vma; | ||
| 1473 | unsigned long address; | ||
| 1474 | |||
| 1475 | /* | ||
| 1476 | * During exec, a temporary VMA is setup and later moved. | ||
| 1477 | * The VMA is moved under the anon_vma lock but not the | ||
| 1478 | * page tables leading to a race where migration cannot | ||
| 1479 | * find the migration ptes. Rather than increasing the | ||
| 1480 | * locking requirements of exec(), migration skips | ||
| 1481 | * temporary VMAs until after exec() completes. | ||
| 1482 | */ | ||
| 1483 | if (IS_ENABLED(CONFIG_MIGRATION) && (flags & TTU_MIGRATION) && | ||
| 1484 | is_vma_temporary_stack(vma)) | ||
| 1485 | continue; | ||
| 1486 | |||
| 1487 | address = vma_address(page, vma); | ||
| 1488 | ret = try_to_unmap_one(page, vma, address, flags); | ||
| 1489 | if (ret != SWAP_AGAIN || !page_mapped(page)) | ||
| 1490 | break; | ||
| 1491 | } | ||
| 1492 | |||
| 1493 | page_unlock_anon_vma_read(anon_vma); | ||
| 1494 | return ret; | ||
| 1495 | } | ||
| 1496 | |||
| 1497 | /** | ||
| 1498 | * try_to_unmap_file - unmap/unlock file page using the object-based rmap method | ||
| 1499 | * @page: the page to unmap/unlock | ||
| 1500 | * @flags: action and flags | ||
| 1501 | * | ||
| 1502 | * Find all the mappings of a page using the mapping pointer and the vma chains | ||
| 1503 | * contained in the address_space struct it points to. | ||
| 1504 | * | ||
| 1505 | * This function is only called from try_to_unmap/try_to_munlock for | ||
| 1506 | * object-based pages. | ||
| 1507 | * When called from try_to_munlock(), the mmap_sem of the mm containing the vma | ||
| 1508 | * where the page was found will be held for write. So, we won't recheck | ||
| 1509 | * vm_flags for that VMA. That should be OK, because that vma shouldn't be | ||
| 1510 | * 'LOCKED. | ||
| 1511 | */ | ||
| 1512 | static int try_to_unmap_file(struct page *page, enum ttu_flags flags) | ||
| 1513 | { | 1364 | { |
| 1514 | struct address_space *mapping = page->mapping; | ||
| 1515 | pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); | ||
| 1516 | struct vm_area_struct *vma; | 1365 | struct vm_area_struct *vma; |
| 1517 | int ret = SWAP_AGAIN; | 1366 | int ret = SWAP_AGAIN; |
| 1518 | unsigned long cursor; | 1367 | unsigned long cursor; |
| @@ -1520,30 +1369,9 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags) | |||
| 1520 | unsigned long max_nl_size = 0; | 1369 | unsigned long max_nl_size = 0; |
| 1521 | unsigned int mapcount; | 1370 | unsigned int mapcount; |
| 1522 | 1371 | ||
| 1523 | if (PageHuge(page)) | 1372 | list_for_each_entry(vma, |
| 1524 | pgoff = page->index << compound_order(page); | 1373 | &mapping->i_mmap_nonlinear, shared.nonlinear) { |
| 1525 | 1374 | ||
| 1526 | mutex_lock(&mapping->i_mmap_mutex); | ||
| 1527 | vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { | ||
| 1528 | unsigned long address = vma_address(page, vma); | ||
| 1529 | ret = try_to_unmap_one(page, vma, address, flags); | ||
| 1530 | if (ret != SWAP_AGAIN || !page_mapped(page)) | ||
| 1531 | goto out; | ||
| 1532 | } | ||
| 1533 | |||
| 1534 | if (list_empty(&mapping->i_mmap_nonlinear)) | ||
| 1535 | goto out; | ||
| 1536 | |||
| 1537 | /* | ||
| 1538 | * We don't bother to try to find the munlocked page in nonlinears. | ||
| 1539 | * It's costly. Instead, later, page reclaim logic may call | ||
| 1540 | * try_to_unmap(TTU_MUNLOCK) and recover PG_mlocked lazily. | ||
| 1541 | */ | ||
| 1542 | if (TTU_ACTION(flags) == TTU_MUNLOCK) | ||
| 1543 | goto out; | ||
| 1544 | |||
| 1545 | list_for_each_entry(vma, &mapping->i_mmap_nonlinear, | ||
| 1546 | shared.nonlinear) { | ||
| 1547 | cursor = (unsigned long) vma->vm_private_data; | 1375 | cursor = (unsigned long) vma->vm_private_data; |
| 1548 | if (cursor > max_nl_cursor) | 1376 | if (cursor > max_nl_cursor) |
| 1549 | max_nl_cursor = cursor; | 1377 | max_nl_cursor = cursor; |
| @@ -1553,8 +1381,7 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags) | |||
| 1553 | } | 1381 | } |
| 1554 | 1382 | ||
| 1555 | if (max_nl_size == 0) { /* all nonlinears locked or reserved ? */ | 1383 | if (max_nl_size == 0) { /* all nonlinears locked or reserved ? */ |
| 1556 | ret = SWAP_FAIL; | 1384 | return SWAP_FAIL; |
| 1557 | goto out; | ||
| 1558 | } | 1385 | } |
| 1559 | 1386 | ||
| 1560 | /* | 1387 | /* |
| @@ -1566,7 +1393,8 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags) | |||
| 1566 | */ | 1393 | */ |
| 1567 | mapcount = page_mapcount(page); | 1394 | mapcount = page_mapcount(page); |
| 1568 | if (!mapcount) | 1395 | if (!mapcount) |
| 1569 | goto out; | 1396 | return ret; |
| 1397 | |||
| 1570 | cond_resched(); | 1398 | cond_resched(); |
| 1571 | 1399 | ||
| 1572 | max_nl_size = (max_nl_size + CLUSTER_SIZE - 1) & CLUSTER_MASK; | 1400 | max_nl_size = (max_nl_size + CLUSTER_SIZE - 1) & CLUSTER_MASK; |
| @@ -1574,10 +1402,11 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags) | |||
| 1574 | max_nl_cursor = CLUSTER_SIZE; | 1402 | max_nl_cursor = CLUSTER_SIZE; |
| 1575 | 1403 | ||
| 1576 | do { | 1404 | do { |
| 1577 | list_for_each_entry(vma, &mapping->i_mmap_nonlinear, | 1405 | list_for_each_entry(vma, |
| 1578 | shared.nonlinear) { | 1406 | &mapping->i_mmap_nonlinear, shared.nonlinear) { |
| 1407 | |||
| 1579 | cursor = (unsigned long) vma->vm_private_data; | 1408 | cursor = (unsigned long) vma->vm_private_data; |
| 1580 | while ( cursor < max_nl_cursor && | 1409 | while (cursor < max_nl_cursor && |
| 1581 | cursor < vma->vm_end - vma->vm_start) { | 1410 | cursor < vma->vm_end - vma->vm_start) { |
| 1582 | if (try_to_unmap_cluster(cursor, &mapcount, | 1411 | if (try_to_unmap_cluster(cursor, &mapcount, |
| 1583 | vma, page) == SWAP_MLOCK) | 1412 | vma, page) == SWAP_MLOCK) |
| @@ -1585,7 +1414,7 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags) | |||
| 1585 | cursor += CLUSTER_SIZE; | 1414 | cursor += CLUSTER_SIZE; |
| 1586 | vma->vm_private_data = (void *) cursor; | 1415 | vma->vm_private_data = (void *) cursor; |
| 1587 | if ((int)mapcount <= 0) | 1416 | if ((int)mapcount <= 0) |
| 1588 | goto out; | 1417 | return ret; |
| 1589 | } | 1418 | } |
| 1590 | vma->vm_private_data = (void *) max_nl_cursor; | 1419 | vma->vm_private_data = (void *) max_nl_cursor; |
| 1591 | } | 1420 | } |
| @@ -1600,11 +1429,34 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags) | |||
| 1600 | */ | 1429 | */ |
| 1601 | list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.nonlinear) | 1430 | list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.nonlinear) |
| 1602 | vma->vm_private_data = NULL; | 1431 | vma->vm_private_data = NULL; |
| 1603 | out: | 1432 | |
| 1604 | mutex_unlock(&mapping->i_mmap_mutex); | ||
| 1605 | return ret; | 1433 | return ret; |
| 1606 | } | 1434 | } |
| 1607 | 1435 | ||
| 1436 | bool is_vma_temporary_stack(struct vm_area_struct *vma) | ||
| 1437 | { | ||
| 1438 | int maybe_stack = vma->vm_flags & (VM_GROWSDOWN | VM_GROWSUP); | ||
| 1439 | |||
| 1440 | if (!maybe_stack) | ||
| 1441 | return false; | ||
| 1442 | |||
| 1443 | if ((vma->vm_flags & VM_STACK_INCOMPLETE_SETUP) == | ||
| 1444 | VM_STACK_INCOMPLETE_SETUP) | ||
| 1445 | return true; | ||
| 1446 | |||
| 1447 | return false; | ||
| 1448 | } | ||
| 1449 | |||
| 1450 | static bool invalid_migration_vma(struct vm_area_struct *vma, void *arg) | ||
| 1451 | { | ||
| 1452 | return is_vma_temporary_stack(vma); | ||
| 1453 | } | ||
| 1454 | |||
| 1455 | static int page_not_mapped(struct page *page) | ||
| 1456 | { | ||
| 1457 | return !page_mapped(page); | ||
| 1458 | }; | ||
| 1459 | |||
| 1608 | /** | 1460 | /** |
| 1609 | * try_to_unmap - try to remove all page table mappings to a page | 1461 | * try_to_unmap - try to remove all page table mappings to a page |
| 1610 | * @page: the page to get unmapped | 1462 | * @page: the page to get unmapped |
| @@ -1622,16 +1474,29 @@ out: | |||
| 1622 | int try_to_unmap(struct page *page, enum ttu_flags flags) | 1474 | int try_to_unmap(struct page *page, enum ttu_flags flags) |
| 1623 | { | 1475 | { |
| 1624 | int ret; | 1476 | int ret; |
| 1477 | struct rmap_walk_control rwc = { | ||
| 1478 | .rmap_one = try_to_unmap_one, | ||
| 1479 | .arg = (void *)flags, | ||
| 1480 | .done = page_not_mapped, | ||
| 1481 | .file_nonlinear = try_to_unmap_nonlinear, | ||
| 1482 | .anon_lock = page_lock_anon_vma_read, | ||
| 1483 | }; | ||
| 1625 | 1484 | ||
| 1626 | BUG_ON(!PageLocked(page)); | 1485 | VM_BUG_ON_PAGE(!PageHuge(page) && PageTransHuge(page), page); |
| 1627 | VM_BUG_ON(!PageHuge(page) && PageTransHuge(page)); | 1486 | |
| 1487 | /* | ||
| 1488 | * During exec, a temporary VMA is setup and later moved. | ||
| 1489 | * The VMA is moved under the anon_vma lock but not the | ||
| 1490 | * page tables leading to a race where migration cannot | ||
| 1491 | * find the migration ptes. Rather than increasing the | ||
| 1492 | * locking requirements of exec(), migration skips | ||
| 1493 | * temporary VMAs until after exec() completes. | ||
| 1494 | */ | ||
| 1495 | if (flags & TTU_MIGRATION && !PageKsm(page) && PageAnon(page)) | ||
| 1496 | rwc.invalid_vma = invalid_migration_vma; | ||
| 1497 | |||
| 1498 | ret = rmap_walk(page, &rwc); | ||
| 1628 | 1499 | ||
| 1629 | if (unlikely(PageKsm(page))) | ||
| 1630 | ret = try_to_unmap_ksm(page, flags); | ||
| 1631 | else if (PageAnon(page)) | ||
| 1632 | ret = try_to_unmap_anon(page, flags); | ||
| 1633 | else | ||
| 1634 | ret = try_to_unmap_file(page, flags); | ||
| 1635 | if (ret != SWAP_MLOCK && !page_mapped(page)) | 1500 | if (ret != SWAP_MLOCK && !page_mapped(page)) |
| 1636 | ret = SWAP_SUCCESS; | 1501 | ret = SWAP_SUCCESS; |
| 1637 | return ret; | 1502 | return ret; |
| @@ -1654,14 +1519,25 @@ int try_to_unmap(struct page *page, enum ttu_flags flags) | |||
| 1654 | */ | 1519 | */ |
| 1655 | int try_to_munlock(struct page *page) | 1520 | int try_to_munlock(struct page *page) |
| 1656 | { | 1521 | { |
| 1657 | VM_BUG_ON(!PageLocked(page) || PageLRU(page)); | 1522 | int ret; |
| 1523 | struct rmap_walk_control rwc = { | ||
| 1524 | .rmap_one = try_to_unmap_one, | ||
| 1525 | .arg = (void *)TTU_MUNLOCK, | ||
| 1526 | .done = page_not_mapped, | ||
| 1527 | /* | ||
| 1528 | * We don't bother to try to find the munlocked page in | ||
| 1529 | * nonlinears. It's costly. Instead, later, page reclaim logic | ||
| 1530 | * may call try_to_unmap() and recover PG_mlocked lazily. | ||
| 1531 | */ | ||
| 1532 | .file_nonlinear = NULL, | ||
| 1533 | .anon_lock = page_lock_anon_vma_read, | ||
| 1658 | 1534 | ||
| 1659 | if (unlikely(PageKsm(page))) | 1535 | }; |
| 1660 | return try_to_unmap_ksm(page, TTU_MUNLOCK); | 1536 | |
| 1661 | else if (PageAnon(page)) | 1537 | VM_BUG_ON_PAGE(!PageLocked(page) || PageLRU(page), page); |
| 1662 | return try_to_unmap_anon(page, TTU_MUNLOCK); | 1538 | |
| 1663 | else | 1539 | ret = rmap_walk(page, &rwc); |
| 1664 | return try_to_unmap_file(page, TTU_MUNLOCK); | 1540 | return ret; |
| 1665 | } | 1541 | } |
| 1666 | 1542 | ||
| 1667 | void __put_anon_vma(struct anon_vma *anon_vma) | 1543 | void __put_anon_vma(struct anon_vma *anon_vma) |
| @@ -1674,18 +1550,13 @@ void __put_anon_vma(struct anon_vma *anon_vma) | |||
| 1674 | anon_vma_free(anon_vma); | 1550 | anon_vma_free(anon_vma); |
| 1675 | } | 1551 | } |
| 1676 | 1552 | ||
| 1677 | #ifdef CONFIG_MIGRATION | 1553 | static struct anon_vma *rmap_walk_anon_lock(struct page *page, |
| 1678 | /* | 1554 | struct rmap_walk_control *rwc) |
| 1679 | * rmap_walk() and its helpers rmap_walk_anon() and rmap_walk_file(): | ||
| 1680 | * Called by migrate.c to remove migration ptes, but might be used more later. | ||
| 1681 | */ | ||
| 1682 | static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *, | ||
| 1683 | struct vm_area_struct *, unsigned long, void *), void *arg) | ||
| 1684 | { | 1555 | { |
| 1685 | struct anon_vma *anon_vma; | 1556 | struct anon_vma *anon_vma; |
| 1686 | pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); | 1557 | |
| 1687 | struct anon_vma_chain *avc; | 1558 | if (rwc->anon_lock) |
| 1688 | int ret = SWAP_AGAIN; | 1559 | return rwc->anon_lock(page); |
| 1689 | 1560 | ||
| 1690 | /* | 1561 | /* |
| 1691 | * Note: remove_migration_ptes() cannot use page_lock_anon_vma_read() | 1562 | * Note: remove_migration_ptes() cannot use page_lock_anon_vma_read() |
| @@ -1695,58 +1566,120 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *, | |||
| 1695 | */ | 1566 | */ |
| 1696 | anon_vma = page_anon_vma(page); | 1567 | anon_vma = page_anon_vma(page); |
| 1697 | if (!anon_vma) | 1568 | if (!anon_vma) |
| 1698 | return ret; | 1569 | return NULL; |
| 1570 | |||
| 1699 | anon_vma_lock_read(anon_vma); | 1571 | anon_vma_lock_read(anon_vma); |
| 1572 | return anon_vma; | ||
| 1573 | } | ||
| 1574 | |||
| 1575 | /* | ||
| 1576 | * rmap_walk_anon - do something to anonymous page using the object-based | ||
| 1577 | * rmap method | ||
| 1578 | * @page: the page to be handled | ||
| 1579 | * @rwc: control variable according to each walk type | ||
| 1580 | * | ||
| 1581 | * Find all the mappings of a page using the mapping pointer and the vma chains | ||
| 1582 | * contained in the anon_vma struct it points to. | ||
| 1583 | * | ||
| 1584 | * When called from try_to_munlock(), the mmap_sem of the mm containing the vma | ||
| 1585 | * where the page was found will be held for write. So, we won't recheck | ||
| 1586 | * vm_flags for that VMA. That should be OK, because that vma shouldn't be | ||
| 1587 | * LOCKED. | ||
| 1588 | */ | ||
| 1589 | static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc) | ||
| 1590 | { | ||
| 1591 | struct anon_vma *anon_vma; | ||
| 1592 | pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); | ||
| 1593 | struct anon_vma_chain *avc; | ||
| 1594 | int ret = SWAP_AGAIN; | ||
| 1595 | |||
| 1596 | anon_vma = rmap_walk_anon_lock(page, rwc); | ||
| 1597 | if (!anon_vma) | ||
| 1598 | return ret; | ||
| 1599 | |||
| 1700 | anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) { | 1600 | anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) { |
| 1701 | struct vm_area_struct *vma = avc->vma; | 1601 | struct vm_area_struct *vma = avc->vma; |
| 1702 | unsigned long address = vma_address(page, vma); | 1602 | unsigned long address = vma_address(page, vma); |
| 1703 | ret = rmap_one(page, vma, address, arg); | 1603 | |
| 1604 | if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg)) | ||
| 1605 | continue; | ||
| 1606 | |||
| 1607 | ret = rwc->rmap_one(page, vma, address, rwc->arg); | ||
| 1704 | if (ret != SWAP_AGAIN) | 1608 | if (ret != SWAP_AGAIN) |
| 1705 | break; | 1609 | break; |
| 1610 | if (rwc->done && rwc->done(page)) | ||
| 1611 | break; | ||
| 1706 | } | 1612 | } |
| 1707 | anon_vma_unlock_read(anon_vma); | 1613 | anon_vma_unlock_read(anon_vma); |
| 1708 | return ret; | 1614 | return ret; |
| 1709 | } | 1615 | } |
| 1710 | 1616 | ||
| 1711 | static int rmap_walk_file(struct page *page, int (*rmap_one)(struct page *, | 1617 | /* |
| 1712 | struct vm_area_struct *, unsigned long, void *), void *arg) | 1618 | * rmap_walk_file - do something to file page using the object-based rmap method |
| 1619 | * @page: the page to be handled | ||
| 1620 | * @rwc: control variable according to each walk type | ||
| 1621 | * | ||
| 1622 | * Find all the mappings of a page using the mapping pointer and the vma chains | ||
| 1623 | * contained in the address_space struct it points to. | ||
| 1624 | * | ||
| 1625 | * When called from try_to_munlock(), the mmap_sem of the mm containing the vma | ||
| 1626 | * where the page was found will be held for write. So, we won't recheck | ||
| 1627 | * vm_flags for that VMA. That should be OK, because that vma shouldn't be | ||
| 1628 | * LOCKED. | ||
| 1629 | */ | ||
| 1630 | static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc) | ||
| 1713 | { | 1631 | { |
| 1714 | struct address_space *mapping = page->mapping; | 1632 | struct address_space *mapping = page->mapping; |
| 1715 | pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); | 1633 | pgoff_t pgoff = page->index << compound_order(page); |
| 1716 | struct vm_area_struct *vma; | 1634 | struct vm_area_struct *vma; |
| 1717 | int ret = SWAP_AGAIN; | 1635 | int ret = SWAP_AGAIN; |
| 1718 | 1636 | ||
| 1637 | /* | ||
| 1638 | * The page lock not only makes sure that page->mapping cannot | ||
| 1639 | * suddenly be NULLified by truncation, it makes sure that the | ||
| 1640 | * structure at mapping cannot be freed and reused yet, | ||
| 1641 | * so we can safely take mapping->i_mmap_mutex. | ||
| 1642 | */ | ||
| 1643 | VM_BUG_ON(!PageLocked(page)); | ||
| 1644 | |||
| 1719 | if (!mapping) | 1645 | if (!mapping) |
| 1720 | return ret; | 1646 | return ret; |
| 1721 | mutex_lock(&mapping->i_mmap_mutex); | 1647 | mutex_lock(&mapping->i_mmap_mutex); |
| 1722 | vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { | 1648 | vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { |
| 1723 | unsigned long address = vma_address(page, vma); | 1649 | unsigned long address = vma_address(page, vma); |
| 1724 | ret = rmap_one(page, vma, address, arg); | 1650 | |
| 1651 | if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg)) | ||
| 1652 | continue; | ||
| 1653 | |||
| 1654 | ret = rwc->rmap_one(page, vma, address, rwc->arg); | ||
| 1725 | if (ret != SWAP_AGAIN) | 1655 | if (ret != SWAP_AGAIN) |
| 1726 | break; | 1656 | goto done; |
| 1657 | if (rwc->done && rwc->done(page)) | ||
| 1658 | goto done; | ||
| 1727 | } | 1659 | } |
| 1728 | /* | 1660 | |
| 1729 | * No nonlinear handling: being always shared, nonlinear vmas | 1661 | if (!rwc->file_nonlinear) |
| 1730 | * never contain migration ptes. Decide what to do about this | 1662 | goto done; |
| 1731 | * limitation to linear when we need rmap_walk() on nonlinear. | 1663 | |
| 1732 | */ | 1664 | if (list_empty(&mapping->i_mmap_nonlinear)) |
| 1665 | goto done; | ||
| 1666 | |||
| 1667 | ret = rwc->file_nonlinear(page, mapping, rwc->arg); | ||
| 1668 | |||
| 1669 | done: | ||
| 1733 | mutex_unlock(&mapping->i_mmap_mutex); | 1670 | mutex_unlock(&mapping->i_mmap_mutex); |
| 1734 | return ret; | 1671 | return ret; |
| 1735 | } | 1672 | } |
| 1736 | 1673 | ||
| 1737 | int rmap_walk(struct page *page, int (*rmap_one)(struct page *, | 1674 | int rmap_walk(struct page *page, struct rmap_walk_control *rwc) |
| 1738 | struct vm_area_struct *, unsigned long, void *), void *arg) | ||
| 1739 | { | 1675 | { |
| 1740 | VM_BUG_ON(!PageLocked(page)); | ||
| 1741 | |||
| 1742 | if (unlikely(PageKsm(page))) | 1676 | if (unlikely(PageKsm(page))) |
| 1743 | return rmap_walk_ksm(page, rmap_one, arg); | 1677 | return rmap_walk_ksm(page, rwc); |
| 1744 | else if (PageAnon(page)) | 1678 | else if (PageAnon(page)) |
| 1745 | return rmap_walk_anon(page, rmap_one, arg); | 1679 | return rmap_walk_anon(page, rwc); |
| 1746 | else | 1680 | else |
| 1747 | return rmap_walk_file(page, rmap_one, arg); | 1681 | return rmap_walk_file(page, rwc); |
| 1748 | } | 1682 | } |
| 1749 | #endif /* CONFIG_MIGRATION */ | ||
| 1750 | 1683 | ||
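The rmap.c hunks above replace the old function-pointer interface of rmap_walk() with a struct rmap_walk_control: try_to_unmap() and try_to_munlock() now fill in callbacks (.rmap_one, .done, .anon_lock, .file_nonlinear, .invalid_vma) instead of dispatching to per-type helpers. A minimal sketch of how a caller drives the new interface, using only the fields visible in the hunks; the walker itself (count_page_mappings) is hypothetical:

/* Hypothetical walker: count the VMAs that still map a locked page. */
static int count_one(struct page *page, struct vm_area_struct *vma,
		     unsigned long address, void *arg)
{
	int *nr = arg;

	(*nr)++;
	return SWAP_AGAIN;	/* SWAP_AGAIN keeps the walk going */
}

static int count_page_mappings(struct page *page)
{
	int nr = 0;
	struct rmap_walk_control rwc = {
		.rmap_one = count_one,
		.arg = &nr,
		/* .done, .invalid_vma, .file_nonlinear, .anon_lock are optional */
	};

	/* rmap_walk_file() asserts PageLocked(), so the page must be locked. */
	rmap_walk(page, &rwc);
	return nr;
}

A .done callback lets a walker bail out early (as page_not_mapped() does for the unmap paths), and .invalid_vma filters VMAs before .rmap_one is invoked, which is how the exec-time temporary-stack skip is wired up above.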
| 1751 | #ifdef CONFIG_HUGETLB_PAGE | 1684 | #ifdef CONFIG_HUGETLB_PAGE |
| 1752 | /* | 1685 | /* |
diff --git a/mm/shmem.c b/mm/shmem.c index 902a14842b74..1f18c9d0d93e 100644 --- a/mm/shmem.c +++ b/mm/shmem.c | |||
| @@ -45,7 +45,7 @@ static struct vfsmount *shm_mnt; | |||
| 45 | #include <linux/xattr.h> | 45 | #include <linux/xattr.h> |
| 46 | #include <linux/exportfs.h> | 46 | #include <linux/exportfs.h> |
| 47 | #include <linux/posix_acl.h> | 47 | #include <linux/posix_acl.h> |
| 48 | #include <linux/generic_acl.h> | 48 | #include <linux/posix_acl_xattr.h> |
| 49 | #include <linux/mman.h> | 49 | #include <linux/mman.h> |
| 50 | #include <linux/string.h> | 50 | #include <linux/string.h> |
| 51 | #include <linux/slab.h> | 51 | #include <linux/slab.h> |
| @@ -285,8 +285,8 @@ static int shmem_add_to_page_cache(struct page *page, | |||
| 285 | { | 285 | { |
| 286 | int error; | 286 | int error; |
| 287 | 287 | ||
| 288 | VM_BUG_ON(!PageLocked(page)); | 288 | VM_BUG_ON_PAGE(!PageLocked(page), page); |
| 289 | VM_BUG_ON(!PageSwapBacked(page)); | 289 | VM_BUG_ON_PAGE(!PageSwapBacked(page), page); |
| 290 | 290 | ||
| 291 | page_cache_get(page); | 291 | page_cache_get(page); |
| 292 | page->mapping = mapping; | 292 | page->mapping = mapping; |
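The VM_BUG_ON() to VM_BUG_ON_PAGE() conversions in this and the following hunks attach the page under test to the assertion. A hedged sketch of what such a macro amounts to; the real definition lives in the mm debug headers, and the exact reason string passed to dump_page() is an assumption here:

#define VM_BUG_ON_PAGE(cond, page)					\
	do {								\
		if (unlikely(cond)) {					\
			dump_page(page, "VM_BUG_ON_PAGE(" #cond ")");	\
			BUG();						\
		}							\
	} while (0)

The practical gain is that a tripped assertion dumps the page's flags, mapping and reference counts rather than only a file and line number.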
| @@ -491,7 +491,7 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, | |||
| 491 | continue; | 491 | continue; |
| 492 | if (!unfalloc || !PageUptodate(page)) { | 492 | if (!unfalloc || !PageUptodate(page)) { |
| 493 | if (page->mapping == mapping) { | 493 | if (page->mapping == mapping) { |
| 494 | VM_BUG_ON(PageWriteback(page)); | 494 | VM_BUG_ON_PAGE(PageWriteback(page), page); |
| 495 | truncate_inode_page(mapping, page); | 495 | truncate_inode_page(mapping, page); |
| 496 | } | 496 | } |
| 497 | } | 497 | } |
| @@ -568,7 +568,7 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, | |||
| 568 | lock_page(page); | 568 | lock_page(page); |
| 569 | if (!unfalloc || !PageUptodate(page)) { | 569 | if (!unfalloc || !PageUptodate(page)) { |
| 570 | if (page->mapping == mapping) { | 570 | if (page->mapping == mapping) { |
| 571 | VM_BUG_ON(PageWriteback(page)); | 571 | VM_BUG_ON_PAGE(PageWriteback(page), page); |
| 572 | truncate_inode_page(mapping, page); | 572 | truncate_inode_page(mapping, page); |
| 573 | } | 573 | } |
| 574 | } | 574 | } |
| @@ -620,10 +620,8 @@ static int shmem_setattr(struct dentry *dentry, struct iattr *attr) | |||
| 620 | } | 620 | } |
| 621 | 621 | ||
| 622 | setattr_copy(inode, attr); | 622 | setattr_copy(inode, attr); |
| 623 | #ifdef CONFIG_TMPFS_POSIX_ACL | ||
| 624 | if (attr->ia_valid & ATTR_MODE) | 623 | if (attr->ia_valid & ATTR_MODE) |
| 625 | error = generic_acl_chmod(inode); | 624 | error = posix_acl_chmod(inode, inode->i_mode); |
| 626 | #endif | ||
| 627 | return error; | 625 | return error; |
| 628 | } | 626 | } |
| 629 | 627 | ||
| @@ -1937,22 +1935,14 @@ shmem_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev) | |||
| 1937 | 1935 | ||
| 1938 | inode = shmem_get_inode(dir->i_sb, dir, mode, dev, VM_NORESERVE); | 1936 | inode = shmem_get_inode(dir->i_sb, dir, mode, dev, VM_NORESERVE); |
| 1939 | if (inode) { | 1937 | if (inode) { |
| 1940 | #ifdef CONFIG_TMPFS_POSIX_ACL | 1938 | error = simple_acl_create(dir, inode); |
| 1941 | error = generic_acl_init(inode, dir); | 1939 | if (error) |
| 1942 | if (error) { | 1940 | goto out_iput; |
| 1943 | iput(inode); | ||
| 1944 | return error; | ||
| 1945 | } | ||
| 1946 | #endif | ||
| 1947 | error = security_inode_init_security(inode, dir, | 1941 | error = security_inode_init_security(inode, dir, |
| 1948 | &dentry->d_name, | 1942 | &dentry->d_name, |
| 1949 | shmem_initxattrs, NULL); | 1943 | shmem_initxattrs, NULL); |
| 1950 | if (error) { | 1944 | if (error && error != -EOPNOTSUPP) |
| 1951 | if (error != -EOPNOTSUPP) { | 1945 | goto out_iput; |
| 1952 | iput(inode); | ||
| 1953 | return error; | ||
| 1954 | } | ||
| 1955 | } | ||
| 1956 | 1946 | ||
| 1957 | error = 0; | 1947 | error = 0; |
| 1958 | dir->i_size += BOGO_DIRENT_SIZE; | 1948 | dir->i_size += BOGO_DIRENT_SIZE; |
| @@ -1961,6 +1951,9 @@ shmem_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev) | |||
| 1961 | dget(dentry); /* Extra count - pin the dentry in core */ | 1951 | dget(dentry); /* Extra count - pin the dentry in core */ |
| 1962 | } | 1952 | } |
| 1963 | return error; | 1953 | return error; |
| 1954 | out_iput: | ||
| 1955 | iput(inode); | ||
| 1956 | return error; | ||
| 1964 | } | 1957 | } |
| 1965 | 1958 | ||
| 1966 | static int | 1959 | static int |
| @@ -1974,24 +1967,17 @@ shmem_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) | |||
| 1974 | error = security_inode_init_security(inode, dir, | 1967 | error = security_inode_init_security(inode, dir, |
| 1975 | NULL, | 1968 | NULL, |
| 1976 | shmem_initxattrs, NULL); | 1969 | shmem_initxattrs, NULL); |
| 1977 | if (error) { | 1970 | if (error && error != -EOPNOTSUPP) |
| 1978 | if (error != -EOPNOTSUPP) { | 1971 | goto out_iput; |
| 1979 | iput(inode); | 1972 | error = simple_acl_create(dir, inode); |
| 1980 | return error; | 1973 | if (error) |
| 1981 | } | 1974 | goto out_iput; |
| 1982 | } | ||
| 1983 | #ifdef CONFIG_TMPFS_POSIX_ACL | ||
| 1984 | error = generic_acl_init(inode, dir); | ||
| 1985 | if (error) { | ||
| 1986 | iput(inode); | ||
| 1987 | return error; | ||
| 1988 | } | ||
| 1989 | #else | ||
| 1990 | error = 0; | ||
| 1991 | #endif | ||
| 1992 | d_tmpfile(dentry, inode); | 1975 | d_tmpfile(dentry, inode); |
| 1993 | } | 1976 | } |
| 1994 | return error; | 1977 | return error; |
| 1978 | out_iput: | ||
| 1979 | iput(inode); | ||
| 1980 | return error; | ||
| 1995 | } | 1981 | } |
| 1996 | 1982 | ||
| 1997 | static int shmem_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) | 1983 | static int shmem_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) |
| @@ -2223,8 +2209,8 @@ static int shmem_initxattrs(struct inode *inode, | |||
| 2223 | 2209 | ||
| 2224 | static const struct xattr_handler *shmem_xattr_handlers[] = { | 2210 | static const struct xattr_handler *shmem_xattr_handlers[] = { |
| 2225 | #ifdef CONFIG_TMPFS_POSIX_ACL | 2211 | #ifdef CONFIG_TMPFS_POSIX_ACL |
| 2226 | &generic_acl_access_handler, | 2212 | &posix_acl_access_xattr_handler, |
| 2227 | &generic_acl_default_handler, | 2213 | &posix_acl_default_xattr_handler, |
| 2228 | #endif | 2214 | #endif |
| 2229 | NULL | 2215 | NULL |
| 2230 | }; | 2216 | }; |
| @@ -2740,6 +2726,7 @@ static const struct inode_operations shmem_inode_operations = { | |||
| 2740 | .getxattr = shmem_getxattr, | 2726 | .getxattr = shmem_getxattr, |
| 2741 | .listxattr = shmem_listxattr, | 2727 | .listxattr = shmem_listxattr, |
| 2742 | .removexattr = shmem_removexattr, | 2728 | .removexattr = shmem_removexattr, |
| 2729 | .set_acl = simple_set_acl, | ||
| 2743 | #endif | 2730 | #endif |
| 2744 | }; | 2731 | }; |
| 2745 | 2732 | ||
| @@ -2764,6 +2751,7 @@ static const struct inode_operations shmem_dir_inode_operations = { | |||
| 2764 | #endif | 2751 | #endif |
| 2765 | #ifdef CONFIG_TMPFS_POSIX_ACL | 2752 | #ifdef CONFIG_TMPFS_POSIX_ACL |
| 2766 | .setattr = shmem_setattr, | 2753 | .setattr = shmem_setattr, |
| 2754 | .set_acl = simple_set_acl, | ||
| 2767 | #endif | 2755 | #endif |
| 2768 | }; | 2756 | }; |
| 2769 | 2757 | ||
| @@ -2776,6 +2764,7 @@ static const struct inode_operations shmem_special_inode_operations = { | |||
| 2776 | #endif | 2764 | #endif |
| 2777 | #ifdef CONFIG_TMPFS_POSIX_ACL | 2765 | #ifdef CONFIG_TMPFS_POSIX_ACL |
| 2778 | .setattr = shmem_setattr, | 2766 | .setattr = shmem_setattr, |
| 2767 | .set_acl = simple_set_acl, | ||
| 2779 | #endif | 2768 | #endif |
| 2780 | }; | 2769 | }; |
| 2781 | 2770 | ||
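The shmem hunks above retire the generic_acl helpers in favour of the common POSIX ACL code: .set_acl = simple_set_acl on the inode operations, simple_acl_create() when a new inode inherits default ACLs, and posix_acl_chmod() when a chmod changes the mode. A condensed sketch of that wiring for a hypothetical in-memory filesystem (example_* names are illustrative, error handling is trimmed, and the use of inode_change_ok()/mark_inode_dirty() is an assumption about the surrounding boilerplate):

static int example_setattr(struct dentry *dentry, struct iattr *attr)
{
	struct inode *inode = dentry->d_inode;
	int error;

	error = inode_change_ok(inode, attr);
	if (error)
		return error;

	setattr_copy(inode, attr);
	if (attr->ia_valid & ATTR_MODE)
		/* keep the cached ACL consistent with the new mode bits */
		error = posix_acl_chmod(inode, inode->i_mode);
	mark_inode_dirty(inode);
	return error;
}

static const struct inode_operations example_iops = {
	.setattr	= example_setattr,
	.set_acl	= simple_set_acl,	/* generic ACL-to-xattr helper */
};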
diff --git a/mm/slab.c b/mm/slab.c --- a/mm/slab.c +++ b/mm/slab.c | |||
| @@ -1946,7 +1946,7 @@ static void slab_destroy_debugcheck(struct kmem_cache *cachep, | |||
| 1946 | /** | 1946 | /** |
| 1947 | * slab_destroy - destroy and release all objects in a slab | 1947 | * slab_destroy - destroy and release all objects in a slab |
| 1948 | * @cachep: cache pointer being destroyed | 1948 | * @cachep: cache pointer being destroyed |
| 1949 | * @slabp: slab pointer being destroyed | 1949 | * @page: page pointer being destroyed |
| 1950 | * | 1950 | * |
| 1951 | * Destroy all the objs in a slab, and release the mem back to the system. | 1951 | * Destroy all the objs in a slab, and release the mem back to the system. |
| 1952 | * Before calling the slab must have been unlinked from the cache. The | 1952 | * Before calling the slab must have been unlinked from the cache. The |
diff --git a/mm/slab.h b/mm/slab.h --- a/mm/slab.h +++ b/mm/slab.h | |||
| @@ -160,12 +160,36 @@ static inline const char *cache_name(struct kmem_cache *s) | |||
| 160 | return s->name; | 160 | return s->name; |
| 161 | } | 161 | } |
| 162 | 162 | ||
| 163 | /* | ||
| 164 | * Note, we protect with RCU only the memcg_caches array, not per-memcg caches. | ||
| 165 | * That said the caller must assure the memcg's cache won't go away. Since once | ||
| 166 | * created a memcg's cache is destroyed only along with the root cache, it is | ||
| 167 | * true if we are going to allocate from the cache or hold a reference to the | ||
| 168 | * root cache by other means. Otherwise, we should hold either the slab_mutex | ||
| 169 | * or the memcg's slab_caches_mutex while calling this function and accessing | ||
| 170 | * the returned value. | ||
| 171 | */ | ||
| 163 | static inline struct kmem_cache * | 172 | static inline struct kmem_cache * |
| 164 | cache_from_memcg_idx(struct kmem_cache *s, int idx) | 173 | cache_from_memcg_idx(struct kmem_cache *s, int idx) |
| 165 | { | 174 | { |
| 175 | struct kmem_cache *cachep; | ||
| 176 | struct memcg_cache_params *params; | ||
| 177 | |||
| 166 | if (!s->memcg_params) | 178 | if (!s->memcg_params) |
| 167 | return NULL; | 179 | return NULL; |
| 168 | return s->memcg_params->memcg_caches[idx]; | 180 | |
| 181 | rcu_read_lock(); | ||
| 182 | params = rcu_dereference(s->memcg_params); | ||
| 183 | cachep = params->memcg_caches[idx]; | ||
| 184 | rcu_read_unlock(); | ||
| 185 | |||
| 186 | /* | ||
| 187 | * Make sure we will access the up-to-date value. The code updating | ||
| 188 | * memcg_caches issues a write barrier to match this (see | ||
| 189 | * memcg_register_cache()). | ||
| 190 | */ | ||
| 191 | smp_read_barrier_depends(); | ||
| 192 | return cachep; | ||
| 169 | } | 193 | } |
| 170 | 194 | ||
| 171 | static inline struct kmem_cache *memcg_root_cache(struct kmem_cache *s) | 195 | static inline struct kmem_cache *memcg_root_cache(struct kmem_cache *s) |
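The rcu_dereference()/smp_read_barrier_depends() pair added above only works against a matching publication on the update side, as the new comment says. A hedged sketch of that counterpart; the real code is in memcg_register_cache(), which is not part of this hunk, so the function below is purely illustrative:

static void example_publish_memcg_cache(struct memcg_cache_params *root_params,
					int idx, struct kmem_cache *new_cachep)
{
	/*
	 * new_cachep must be fully initialised before it becomes visible;
	 * the write barrier pairs with smp_read_barrier_depends() in
	 * cache_from_memcg_idx() above.
	 */
	smp_wmb();
	root_params->memcg_caches[idx] = new_cachep;
}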
diff --git a/mm/slab_common.c b/mm/slab_common.c index 0b7bb399b0e4..1ec3c619ba04 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c | |||
| @@ -171,13 +171,26 @@ kmem_cache_create_memcg(struct mem_cgroup *memcg, const char *name, size_t size, | |||
| 171 | struct kmem_cache *parent_cache) | 171 | struct kmem_cache *parent_cache) |
| 172 | { | 172 | { |
| 173 | struct kmem_cache *s = NULL; | 173 | struct kmem_cache *s = NULL; |
| 174 | int err = 0; | 174 | int err; |
| 175 | 175 | ||
| 176 | get_online_cpus(); | 176 | get_online_cpus(); |
| 177 | mutex_lock(&slab_mutex); | 177 | mutex_lock(&slab_mutex); |
| 178 | 178 | ||
| 179 | if (!kmem_cache_sanity_check(memcg, name, size) == 0) | 179 | err = kmem_cache_sanity_check(memcg, name, size); |
| 180 | goto out_locked; | 180 | if (err) |
| 181 | goto out_unlock; | ||
| 182 | |||
| 183 | if (memcg) { | ||
| 184 | /* | ||
| 185 | * Since per-memcg caches are created asynchronously on first | ||
| 186 | * allocation (see memcg_kmem_get_cache()), several threads can | ||
| 187 | * try to create the same cache, but only one of them may | ||
| 188 | * succeed. Therefore if we get here and see the cache has | ||
| 189 | * already been created, we silently return NULL. | ||
| 190 | */ | ||
| 191 | if (cache_from_memcg_idx(parent_cache, memcg_cache_id(memcg))) | ||
| 192 | goto out_unlock; | ||
| 193 | } | ||
| 181 | 194 | ||
| 182 | /* | 195 | /* |
| 183 | * Some allocators will constraint the set of valid flags to a subset | 196 | * Some allocators will constraint the set of valid flags to a subset |
| @@ -189,44 +202,47 @@ kmem_cache_create_memcg(struct mem_cgroup *memcg, const char *name, size_t size, | |||
| 189 | 202 | ||
| 190 | s = __kmem_cache_alias(memcg, name, size, align, flags, ctor); | 203 | s = __kmem_cache_alias(memcg, name, size, align, flags, ctor); |
| 191 | if (s) | 204 | if (s) |
| 192 | goto out_locked; | 205 | goto out_unlock; |
| 193 | 206 | ||
| 207 | err = -ENOMEM; | ||
| 194 | s = kmem_cache_zalloc(kmem_cache, GFP_KERNEL); | 208 | s = kmem_cache_zalloc(kmem_cache, GFP_KERNEL); |
| 195 | if (s) { | 209 | if (!s) |
| 196 | s->object_size = s->size = size; | 210 | goto out_unlock; |
| 197 | s->align = calculate_alignment(flags, align, size); | ||
| 198 | s->ctor = ctor; | ||
| 199 | 211 | ||
| 200 | if (memcg_register_cache(memcg, s, parent_cache)) { | 212 | s->object_size = s->size = size; |
| 201 | kmem_cache_free(kmem_cache, s); | 213 | s->align = calculate_alignment(flags, align, size); |
| 202 | err = -ENOMEM; | 214 | s->ctor = ctor; |
| 203 | goto out_locked; | ||
| 204 | } | ||
| 205 | 215 | ||
| 206 | s->name = kstrdup(name, GFP_KERNEL); | 216 | s->name = kstrdup(name, GFP_KERNEL); |
| 207 | if (!s->name) { | 217 | if (!s->name) |
| 208 | kmem_cache_free(kmem_cache, s); | 218 | goto out_free_cache; |
| 209 | err = -ENOMEM; | ||
| 210 | goto out_locked; | ||
| 211 | } | ||
| 212 | 219 | ||
| 213 | err = __kmem_cache_create(s, flags); | 220 | err = memcg_alloc_cache_params(memcg, s, parent_cache); |
| 214 | if (!err) { | 221 | if (err) |
| 215 | s->refcount = 1; | 222 | goto out_free_cache; |
| 216 | list_add(&s->list, &slab_caches); | 223 | |
| 217 | memcg_cache_list_add(memcg, s); | 224 | err = __kmem_cache_create(s, flags); |
| 218 | } else { | 225 | if (err) |
| 219 | kfree(s->name); | 226 | goto out_free_cache; |
| 220 | kmem_cache_free(kmem_cache, s); | 227 | |
| 221 | } | 228 | s->refcount = 1; |
| 222 | } else | 229 | list_add(&s->list, &slab_caches); |
| 223 | err = -ENOMEM; | 230 | memcg_register_cache(s); |
| 224 | 231 | ||
| 225 | out_locked: | 232 | out_unlock: |
| 226 | mutex_unlock(&slab_mutex); | 233 | mutex_unlock(&slab_mutex); |
| 227 | put_online_cpus(); | 234 | put_online_cpus(); |
| 228 | 235 | ||
| 229 | if (err) { | 236 | if (err) { |
| 237 | /* | ||
| 238 | * There is no point in flooding logs with warnings or | ||
| 239 | * especially crashing the system if we fail to create a cache | ||
| 240 | * for a memcg. In this case we will be accounting the memcg | ||
| 241 | * allocation to the root cgroup until we succeed to create its | ||
| 242 | * own cache, but it isn't that critical. | ||
| 243 | */ | ||
| 244 | if (!memcg) | ||
| 245 | return NULL; | ||
| 230 | 246 | ||
| 231 | if (flags & SLAB_PANIC) | 247 | if (flags & SLAB_PANIC) |
| 232 | panic("kmem_cache_create: Failed to create slab '%s'. Error %d\n", | 248 | panic("kmem_cache_create: Failed to create slab '%s'. Error %d\n", |
| @@ -236,11 +252,15 @@ out_locked: | |||
| 236 | name, err); | 252 | name, err); |
| 237 | dump_stack(); | 253 | dump_stack(); |
| 238 | } | 254 | } |
| 239 | |||
| 240 | return NULL; | 255 | return NULL; |
| 241 | } | 256 | } |
| 242 | |||
| 243 | return s; | 257 | return s; |
| 258 | |||
| 259 | out_free_cache: | ||
| 260 | memcg_free_cache_params(s); | ||
| 261 | kfree(s->name); | ||
| 262 | kmem_cache_free(kmem_cache, s); | ||
| 263 | goto out_unlock; | ||
| 244 | } | 264 | } |
| 245 | 265 | ||
| 246 | struct kmem_cache * | 266 | struct kmem_cache * |
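The kmem_cache_create_memcg() rework above flattens nested error handling into the kernel's single-exit goto style: the success path falls through to out_unlock, and every failure after the allocation funnels through out_free_cache, which unwinds and jumps back to the common unlock. A stripped-down sketch of the same shape (all example_* names are hypothetical):

struct example_obj {
	char *name;
};

static DEFINE_MUTEX(example_lock);

static struct example_obj *example_create(const char *name)
{
	struct example_obj *obj;
	int err;

	mutex_lock(&example_lock);

	err = -ENOMEM;
	obj = kzalloc(sizeof(*obj), GFP_KERNEL);
	if (!obj)
		goto out_unlock;

	obj->name = kstrdup(name, GFP_KERNEL);
	if (!obj->name)
		goto out_free;		/* err is still -ENOMEM */

	err = 0;
out_unlock:
	mutex_unlock(&example_lock);
	return err ? NULL : obj;

out_free:
	kfree(obj);
	goto out_unlock;
}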
| @@ -263,11 +283,12 @@ void kmem_cache_destroy(struct kmem_cache *s) | |||
| 263 | list_del(&s->list); | 283 | list_del(&s->list); |
| 264 | 284 | ||
| 265 | if (!__kmem_cache_shutdown(s)) { | 285 | if (!__kmem_cache_shutdown(s)) { |
| 286 | memcg_unregister_cache(s); | ||
| 266 | mutex_unlock(&slab_mutex); | 287 | mutex_unlock(&slab_mutex); |
| 267 | if (s->flags & SLAB_DESTROY_BY_RCU) | 288 | if (s->flags & SLAB_DESTROY_BY_RCU) |
| 268 | rcu_barrier(); | 289 | rcu_barrier(); |
| 269 | 290 | ||
| 270 | memcg_release_cache(s); | 291 | memcg_free_cache_params(s); |
| 271 | kfree(s->name); | 292 | kfree(s->name); |
| 272 | kmem_cache_free(kmem_cache, s); | 293 | kmem_cache_free(kmem_cache, s); |
| 273 | } else { | 294 | } else { |
diff --git a/mm/slub.c b/mm/slub.c --- a/mm/slub.c +++ b/mm/slub.c | |||
| @@ -355,6 +355,21 @@ static __always_inline void slab_unlock(struct page *page) | |||
| 355 | __bit_spin_unlock(PG_locked, &page->flags); | 355 | __bit_spin_unlock(PG_locked, &page->flags); |
| 356 | } | 356 | } |
| 357 | 357 | ||
| 358 | static inline void set_page_slub_counters(struct page *page, unsigned long counters_new) | ||
| 359 | { | ||
| 360 | struct page tmp; | ||
| 361 | tmp.counters = counters_new; | ||
| 362 | /* | ||
| 363 | * page->counters can cover frozen/inuse/objects as well | ||
| 364 | * as page->_count. If we assign to ->counters directly | ||
| 365 | * we run the risk of losing updates to page->_count, so | ||
| 366 | * be careful and only assign to the fields we need. | ||
| 367 | */ | ||
| 368 | page->frozen = tmp.frozen; | ||
| 369 | page->inuse = tmp.inuse; | ||
| 370 | page->objects = tmp.objects; | ||
| 371 | } | ||
| 372 | |||
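set_page_slub_counters() exists because page->counters is one arm of a union in struct page: on configurations where the cmpxchg_double path is available, the same machine word also covers the SLUB inuse/objects/frozen bitfield and the adjacent _count. A simplified sketch of that overlay, not a literal copy of mm_types.h (the 64-bit/cmpxchg_double layout is assumed):

struct example_page_word {
	union {
		unsigned long counters;		/* written wholesale by the cmpxchg paths */
		struct {
			struct {		/* SLUB bookkeeping */
				unsigned inuse:16;
				unsigned objects:15;
				unsigned frozen:1;
			};
			atomic_t _count;	/* updated concurrently by get_page()/put_page() */
		};
	};
};

Assigning to counters directly in the lock-based fallback would therefore also rewrite _count from a possibly stale snapshot, which is exactly what the helper avoids by copying only the three SLUB fields.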
| 358 | /* Interrupts must be disabled (for the fallback code to work right) */ | 373 | /* Interrupts must be disabled (for the fallback code to work right) */ |
| 359 | static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct page *page, | 374 | static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct page *page, |
| 360 | void *freelist_old, unsigned long counters_old, | 375 | void *freelist_old, unsigned long counters_old, |
| @@ -376,7 +391,7 @@ static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct page *page | |||
| 376 | if (page->freelist == freelist_old && | 391 | if (page->freelist == freelist_old && |
| 377 | page->counters == counters_old) { | 392 | page->counters == counters_old) { |
| 378 | page->freelist = freelist_new; | 393 | page->freelist = freelist_new; |
| 379 | page->counters = counters_new; | 394 | set_page_slub_counters(page, counters_new); |
| 380 | slab_unlock(page); | 395 | slab_unlock(page); |
| 381 | return 1; | 396 | return 1; |
| 382 | } | 397 | } |
| @@ -415,7 +430,7 @@ static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page, | |||
| 415 | if (page->freelist == freelist_old && | 430 | if (page->freelist == freelist_old && |
| 416 | page->counters == counters_old) { | 431 | page->counters == counters_old) { |
| 417 | page->freelist = freelist_new; | 432 | page->freelist = freelist_new; |
| 418 | page->counters = counters_new; | 433 | set_page_slub_counters(page, counters_new); |
| 419 | slab_unlock(page); | 434 | slab_unlock(page); |
| 420 | local_irq_restore(flags); | 435 | local_irq_restore(flags); |
| 421 | return 1; | 436 | return 1; |
| @@ -985,8 +1000,6 @@ static inline void slab_free_hook(struct kmem_cache *s, void *x) | |||
| 985 | 1000 | ||
| 986 | /* | 1001 | /* |
| 987 | * Tracking of fully allocated slabs for debugging purposes. | 1002 | * Tracking of fully allocated slabs for debugging purposes. |
| 988 | * | ||
| 989 | * list_lock must be held. | ||
| 990 | */ | 1003 | */ |
| 991 | static void add_full(struct kmem_cache *s, | 1004 | static void add_full(struct kmem_cache *s, |
| 992 | struct kmem_cache_node *n, struct page *page) | 1005 | struct kmem_cache_node *n, struct page *page) |
| @@ -994,17 +1007,16 @@ static void add_full(struct kmem_cache *s, | |||
| 994 | if (!(s->flags & SLAB_STORE_USER)) | 1007 | if (!(s->flags & SLAB_STORE_USER)) |
| 995 | return; | 1008 | return; |
| 996 | 1009 | ||
| 1010 | lockdep_assert_held(&n->list_lock); | ||
| 997 | list_add(&page->lru, &n->full); | 1011 | list_add(&page->lru, &n->full); |
| 998 | } | 1012 | } |
| 999 | 1013 | ||
| 1000 | /* | 1014 | static void remove_full(struct kmem_cache *s, struct kmem_cache_node *n, struct page *page) |
| 1001 | * list_lock must be held. | ||
| 1002 | */ | ||
| 1003 | static void remove_full(struct kmem_cache *s, struct page *page) | ||
| 1004 | { | 1015 | { |
| 1005 | if (!(s->flags & SLAB_STORE_USER)) | 1016 | if (!(s->flags & SLAB_STORE_USER)) |
| 1006 | return; | 1017 | return; |
| 1007 | 1018 | ||
| 1019 | lockdep_assert_held(&n->list_lock); | ||
| 1008 | list_del(&page->lru); | 1020 | list_del(&page->lru); |
| 1009 | } | 1021 | } |
| 1010 | 1022 | ||
| @@ -1250,7 +1262,8 @@ static inline int check_object(struct kmem_cache *s, struct page *page, | |||
| 1250 | void *object, u8 val) { return 1; } | 1262 | void *object, u8 val) { return 1; } |
| 1251 | static inline void add_full(struct kmem_cache *s, struct kmem_cache_node *n, | 1263 | static inline void add_full(struct kmem_cache *s, struct kmem_cache_node *n, |
| 1252 | struct page *page) {} | 1264 | struct page *page) {} |
| 1253 | static inline void remove_full(struct kmem_cache *s, struct page *page) {} | 1265 | static inline void remove_full(struct kmem_cache *s, struct kmem_cache_node *n, |
| 1266 | struct page *page) {} | ||
| 1254 | static inline unsigned long kmem_cache_flags(unsigned long object_size, | 1267 | static inline unsigned long kmem_cache_flags(unsigned long object_size, |
| 1255 | unsigned long flags, const char *name, | 1268 | unsigned long flags, const char *name, |
| 1256 | void (*ctor)(void *)) | 1269 | void (*ctor)(void *)) |
| @@ -1504,11 +1517,9 @@ static void discard_slab(struct kmem_cache *s, struct page *page) | |||
| 1504 | 1517 | ||
| 1505 | /* | 1518 | /* |
| 1506 | * Management of partially allocated slabs. | 1519 | * Management of partially allocated slabs. |
| 1507 | * | ||
| 1508 | * list_lock must be held. | ||
| 1509 | */ | 1520 | */ |
| 1510 | static inline void add_partial(struct kmem_cache_node *n, | 1521 | static inline void |
| 1511 | struct page *page, int tail) | 1522 | __add_partial(struct kmem_cache_node *n, struct page *page, int tail) |
| 1512 | { | 1523 | { |
| 1513 | n->nr_partial++; | 1524 | n->nr_partial++; |
| 1514 | if (tail == DEACTIVATE_TO_TAIL) | 1525 | if (tail == DEACTIVATE_TO_TAIL) |
| @@ -1517,23 +1528,32 @@ static inline void add_partial(struct kmem_cache_node *n, | |||
| 1517 | list_add(&page->lru, &n->partial); | 1528 | list_add(&page->lru, &n->partial); |
| 1518 | } | 1529 | } |
| 1519 | 1530 | ||
| 1520 | /* | 1531 | static inline void add_partial(struct kmem_cache_node *n, |
| 1521 | * list_lock must be held. | 1532 | struct page *page, int tail) |
| 1522 | */ | 1533 | { |
| 1523 | static inline void remove_partial(struct kmem_cache_node *n, | 1534 | lockdep_assert_held(&n->list_lock); |
| 1524 | struct page *page) | 1535 | __add_partial(n, page, tail); |
| 1536 | } | ||
| 1537 | |||
| 1538 | static inline void | ||
| 1539 | __remove_partial(struct kmem_cache_node *n, struct page *page) | ||
| 1525 | { | 1540 | { |
| 1526 | list_del(&page->lru); | 1541 | list_del(&page->lru); |
| 1527 | n->nr_partial--; | 1542 | n->nr_partial--; |
| 1528 | } | 1543 | } |
| 1529 | 1544 | ||
| 1545 | static inline void remove_partial(struct kmem_cache_node *n, | ||
| 1546 | struct page *page) | ||
| 1547 | { | ||
| 1548 | lockdep_assert_held(&n->list_lock); | ||
| 1549 | __remove_partial(n, page); | ||
| 1550 | } | ||
| 1551 | |||
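The add_full/remove_full/add_partial/remove_partial changes follow one pattern: the "list_lock must be held" comments become executable lockdep_assert_held() checks, and a double-underscore variant is split out for the few callers that legitimately run unlocked (early node setup, teardown of a dying cache). A generic sketch of the pattern with hypothetical example_ names:

struct example_node {
	spinlock_t		list_lock;
	unsigned long		nr_partial;
	struct list_head	partial;
};

/* Unlocked variant: only for callers with provably no concurrent access. */
static inline void __example_add_partial(struct example_node *n, struct page *page)
{
	n->nr_partial++;
	list_add(&page->lru, &n->partial);
}

static inline void example_add_partial(struct example_node *n, struct page *page)
{
	lockdep_assert_held(&n->list_lock);	/* no-op unless lockdep is enabled */
	__example_add_partial(n, page);
}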
| 1530 | /* | 1552 | /* |
| 1531 | * Remove slab from the partial list, freeze it and | 1553 | * Remove slab from the partial list, freeze it and |
| 1532 | * return the pointer to the freelist. | 1554 | * return the pointer to the freelist. |
| 1533 | * | 1555 | * |
| 1534 | * Returns a list of objects or NULL if it fails. | 1556 | * Returns a list of objects or NULL if it fails. |
| 1535 | * | ||
| 1536 | * Must hold list_lock since we modify the partial list. | ||
| 1537 | */ | 1557 | */ |
| 1538 | static inline void *acquire_slab(struct kmem_cache *s, | 1558 | static inline void *acquire_slab(struct kmem_cache *s, |
| 1539 | struct kmem_cache_node *n, struct page *page, | 1559 | struct kmem_cache_node *n, struct page *page, |
| @@ -1543,6 +1563,8 @@ static inline void *acquire_slab(struct kmem_cache *s, | |||
| 1543 | unsigned long counters; | 1563 | unsigned long counters; |
| 1544 | struct page new; | 1564 | struct page new; |
| 1545 | 1565 | ||
| 1566 | lockdep_assert_held(&n->list_lock); | ||
| 1567 | |||
| 1546 | /* | 1568 | /* |
| 1547 | * Zap the freelist and set the frozen bit. | 1569 | * Zap the freelist and set the frozen bit. |
| 1548 | * The old freelist is the list of objects for the | 1570 | * The old freelist is the list of objects for the |
| @@ -1887,7 +1909,7 @@ redo: | |||
| 1887 | 1909 | ||
| 1888 | else if (l == M_FULL) | 1910 | else if (l == M_FULL) |
| 1889 | 1911 | ||
| 1890 | remove_full(s, page); | 1912 | remove_full(s, n, page); |
| 1891 | 1913 | ||
| 1892 | if (m == M_PARTIAL) { | 1914 | if (m == M_PARTIAL) { |
| 1893 | 1915 | ||
| @@ -2541,7 +2563,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page, | |||
| 2541 | new.inuse--; | 2563 | new.inuse--; |
| 2542 | if ((!new.inuse || !prior) && !was_frozen) { | 2564 | if ((!new.inuse || !prior) && !was_frozen) { |
| 2543 | 2565 | ||
| 2544 | if (kmem_cache_has_cpu_partial(s) && !prior) | 2566 | if (kmem_cache_has_cpu_partial(s) && !prior) { |
| 2545 | 2567 | ||
| 2546 | /* | 2568 | /* |
| 2547 | * Slab was on no list before and will be | 2569 | * Slab was on no list before and will be |
| @@ -2551,7 +2573,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page, | |||
| 2551 | */ | 2573 | */ |
| 2552 | new.frozen = 1; | 2574 | new.frozen = 1; |
| 2553 | 2575 | ||
| 2554 | else { /* Needs to be taken off a list */ | 2576 | } else { /* Needs to be taken off a list */ |
| 2555 | 2577 | ||
| 2556 | n = get_node(s, page_to_nid(page)); | 2578 | n = get_node(s, page_to_nid(page)); |
| 2557 | /* | 2579 | /* |
| @@ -2600,7 +2622,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page, | |||
| 2600 | */ | 2622 | */ |
| 2601 | if (!kmem_cache_has_cpu_partial(s) && unlikely(!prior)) { | 2623 | if (!kmem_cache_has_cpu_partial(s) && unlikely(!prior)) { |
| 2602 | if (kmem_cache_debug(s)) | 2624 | if (kmem_cache_debug(s)) |
| 2603 | remove_full(s, page); | 2625 | remove_full(s, n, page); |
| 2604 | add_partial(n, page, DEACTIVATE_TO_TAIL); | 2626 | add_partial(n, page, DEACTIVATE_TO_TAIL); |
| 2605 | stat(s, FREE_ADD_PARTIAL); | 2627 | stat(s, FREE_ADD_PARTIAL); |
| 2606 | } | 2628 | } |
| @@ -2614,9 +2636,10 @@ slab_empty: | |||
| 2614 | */ | 2636 | */ |
| 2615 | remove_partial(n, page); | 2637 | remove_partial(n, page); |
| 2616 | stat(s, FREE_REMOVE_PARTIAL); | 2638 | stat(s, FREE_REMOVE_PARTIAL); |
| 2617 | } else | 2639 | } else { |
| 2618 | /* Slab must be on the full list */ | 2640 | /* Slab must be on the full list */ |
| 2619 | remove_full(s, page); | 2641 | remove_full(s, n, page); |
| 2642 | } | ||
| 2620 | 2643 | ||
| 2621 | spin_unlock_irqrestore(&n->list_lock, flags); | 2644 | spin_unlock_irqrestore(&n->list_lock, flags); |
| 2622 | stat(s, FREE_SLAB); | 2645 | stat(s, FREE_SLAB); |
| @@ -2890,7 +2913,11 @@ static void early_kmem_cache_node_alloc(int node) | |||
| 2890 | init_kmem_cache_node(n); | 2913 | init_kmem_cache_node(n); |
| 2891 | inc_slabs_node(kmem_cache_node, node, page->objects); | 2914 | inc_slabs_node(kmem_cache_node, node, page->objects); |
| 2892 | 2915 | ||
| 2893 | add_partial(n, page, DEACTIVATE_TO_HEAD); | 2916 | /* |
| 2917 | * No locks need to be taken here as it has just been | ||
| 2918 | * initialized and there is no concurrent access. | ||
| 2919 | */ | ||
| 2920 | __add_partial(n, page, DEACTIVATE_TO_HEAD); | ||
| 2894 | } | 2921 | } |
| 2895 | 2922 | ||
| 2896 | static void free_kmem_cache_nodes(struct kmem_cache *s) | 2923 | static void free_kmem_cache_nodes(struct kmem_cache *s) |
| @@ -3176,7 +3203,7 @@ static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n) | |||
| 3176 | 3203 | ||
| 3177 | list_for_each_entry_safe(page, h, &n->partial, lru) { | 3204 | list_for_each_entry_safe(page, h, &n->partial, lru) { |
| 3178 | if (!page->inuse) { | 3205 | if (!page->inuse) { |
| 3179 | remove_partial(n, page); | 3206 | __remove_partial(n, page); |
| 3180 | discard_slab(s, page); | 3207 | discard_slab(s, page); |
| 3181 | } else { | 3208 | } else { |
| 3182 | list_slab_objects(s, page, | 3209 | list_slab_objects(s, page, |
| @@ -4299,7 +4326,13 @@ static ssize_t show_slab_objects(struct kmem_cache *s, | |||
| 4299 | 4326 | ||
| 4300 | page = ACCESS_ONCE(c->partial); | 4327 | page = ACCESS_ONCE(c->partial); |
| 4301 | if (page) { | 4328 | if (page) { |
| 4302 | x = page->pobjects; | 4329 | node = page_to_nid(page); |
| 4330 | if (flags & SO_TOTAL) | ||
| 4331 | WARN_ON_ONCE(1); | ||
| 4332 | else if (flags & SO_OBJECTS) | ||
| 4333 | WARN_ON_ONCE(1); | ||
| 4334 | else | ||
| 4335 | x = page->pages; | ||
| 4303 | total += x; | 4336 | total += x; |
| 4304 | nodes[node] += x; | 4337 | nodes[node] += x; |
| 4305 | } | 4338 | } |
| @@ -5163,7 +5196,7 @@ static int sysfs_slab_add(struct kmem_cache *s) | |||
| 5163 | } | 5196 | } |
| 5164 | 5197 | ||
| 5165 | s->kobj.kset = slab_kset; | 5198 | s->kobj.kset = slab_kset; |
| 5166 | err = kobject_init_and_add(&s->kobj, &slab_ktype, NULL, name); | 5199 | err = kobject_init_and_add(&s->kobj, &slab_ktype, NULL, "%s", name); |
| 5167 | if (err) { | 5200 | if (err) { |
| 5168 | kobject_put(&s->kobj); | 5201 | kobject_put(&s->kobj); |
| 5169 | return err; | 5202 | return err; |
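The sysfs_slab_add() hunk above is a format-string hardening fix rather than a behavioural change: kobject_init_and_add() takes a printf-style format, so a cache name must be passed as an argument, never as the format itself. Side by side, with the surrounding code elided:

err = kobject_init_and_add(&s->kobj, &slab_ktype, NULL, name);		/* name parsed as a format */
err = kobject_init_and_add(&s->kobj, &slab_ktype, NULL, "%s", name);	/* name used verbatim */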
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c index 27eeab3be757..4cba9c2783a1 100644 --- a/mm/sparse-vmemmap.c +++ b/mm/sparse-vmemmap.c | |||
| @@ -40,7 +40,8 @@ static void * __init_refok __earlyonly_bootmem_alloc(int node, | |||
| 40 | unsigned long align, | 40 | unsigned long align, |
| 41 | unsigned long goal) | 41 | unsigned long goal) |
| 42 | { | 42 | { |
| 43 | return __alloc_bootmem_node_high(NODE_DATA(node), size, align, goal); | 43 | return memblock_virt_alloc_try_nid(size, align, goal, |
| 44 | BOOTMEM_ALLOC_ACCESSIBLE, node); | ||
| 44 | } | 45 | } |
| 45 | 46 | ||
| 46 | static void *vmemmap_buf; | 47 | static void *vmemmap_buf; |
| @@ -226,7 +227,8 @@ void __init sparse_mem_maps_populate_node(struct page **map_map, | |||
| 226 | 227 | ||
| 227 | if (vmemmap_buf_start) { | 228 | if (vmemmap_buf_start) { |
| 228 | /* need to free left buf */ | 229 | /* need to free left buf */ |
| 229 | free_bootmem(__pa(vmemmap_buf), vmemmap_buf_end - vmemmap_buf); | 230 | memblock_free_early(__pa(vmemmap_buf), |
| 231 | vmemmap_buf_end - vmemmap_buf); | ||
| 230 | vmemmap_buf = NULL; | 232 | vmemmap_buf = NULL; |
| 231 | vmemmap_buf_end = NULL; | 233 | vmemmap_buf_end = NULL; |
| 232 | } | 234 | } |
diff --git a/mm/sparse.c b/mm/sparse.c index 8cc7be0e9590..63c3ea5c119c 100644 --- a/mm/sparse.c +++ b/mm/sparse.c | |||
| @@ -69,7 +69,7 @@ static struct mem_section noinline __init_refok *sparse_index_alloc(int nid) | |||
| 69 | else | 69 | else |
| 70 | section = kzalloc(array_size, GFP_KERNEL); | 70 | section = kzalloc(array_size, GFP_KERNEL); |
| 71 | } else { | 71 | } else { |
| 72 | section = alloc_bootmem_node(NODE_DATA(nid), array_size); | 72 | section = memblock_virt_alloc_node(array_size, nid); |
| 73 | } | 73 | } |
| 74 | 74 | ||
| 75 | return section; | 75 | return section; |
| @@ -279,8 +279,9 @@ sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat, | |||
| 279 | limit = goal + (1UL << PA_SECTION_SHIFT); | 279 | limit = goal + (1UL << PA_SECTION_SHIFT); |
| 280 | nid = early_pfn_to_nid(goal >> PAGE_SHIFT); | 280 | nid = early_pfn_to_nid(goal >> PAGE_SHIFT); |
| 281 | again: | 281 | again: |
| 282 | p = ___alloc_bootmem_node_nopanic(NODE_DATA(nid), size, | 282 | p = memblock_virt_alloc_try_nid_nopanic(size, |
| 283 | SMP_CACHE_BYTES, goal, limit); | 283 | SMP_CACHE_BYTES, goal, limit, |
| 284 | nid); | ||
| 284 | if (!p && limit) { | 285 | if (!p && limit) { |
| 285 | limit = 0; | 286 | limit = 0; |
| 286 | goto again; | 287 | goto again; |
| @@ -331,7 +332,7 @@ static unsigned long * __init | |||
| 331 | sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat, | 332 | sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat, |
| 332 | unsigned long size) | 333 | unsigned long size) |
| 333 | { | 334 | { |
| 334 | return alloc_bootmem_node_nopanic(pgdat, size); | 335 | return memblock_virt_alloc_node_nopanic(size, pgdat->node_id); |
| 335 | } | 336 | } |
| 336 | 337 | ||
| 337 | static void __init check_usemap_section_nr(int nid, unsigned long *usemap) | 338 | static void __init check_usemap_section_nr(int nid, unsigned long *usemap) |
| @@ -376,8 +377,9 @@ struct page __init *sparse_mem_map_populate(unsigned long pnum, int nid) | |||
| 376 | return map; | 377 | return map; |
| 377 | 378 | ||
| 378 | size = PAGE_ALIGN(sizeof(struct page) * PAGES_PER_SECTION); | 379 | size = PAGE_ALIGN(sizeof(struct page) * PAGES_PER_SECTION); |
| 379 | map = __alloc_bootmem_node_high(NODE_DATA(nid), size, | 380 | map = memblock_virt_alloc_try_nid(size, |
| 380 | PAGE_SIZE, __pa(MAX_DMA_ADDRESS)); | 381 | PAGE_SIZE, __pa(MAX_DMA_ADDRESS), |
| 382 | BOOTMEM_ALLOC_ACCESSIBLE, nid); | ||
| 381 | return map; | 383 | return map; |
| 382 | } | 384 | } |
| 383 | void __init sparse_mem_maps_populate_node(struct page **map_map, | 385 | void __init sparse_mem_maps_populate_node(struct page **map_map, |
| @@ -401,8 +403,9 @@ void __init sparse_mem_maps_populate_node(struct page **map_map, | |||
| 401 | } | 403 | } |
| 402 | 404 | ||
| 403 | size = PAGE_ALIGN(size); | 405 | size = PAGE_ALIGN(size); |
| 404 | map = __alloc_bootmem_node_high(NODE_DATA(nodeid), size * map_count, | 406 | map = memblock_virt_alloc_try_nid(size * map_count, |
| 405 | PAGE_SIZE, __pa(MAX_DMA_ADDRESS)); | 407 | PAGE_SIZE, __pa(MAX_DMA_ADDRESS), |
| 408 | BOOTMEM_ALLOC_ACCESSIBLE, nodeid); | ||
| 406 | if (map) { | 409 | if (map) { |
| 407 | for (pnum = pnum_begin; pnum < pnum_end; pnum++) { | 410 | for (pnum = pnum_begin; pnum < pnum_end; pnum++) { |
| 408 | if (!present_section_nr(pnum)) | 411 | if (!present_section_nr(pnum)) |
| @@ -545,7 +548,7 @@ void __init sparse_init(void) | |||
| 545 | * sparse_early_mem_map_alloc, so allocate usemap_map at first. | 548 | * sparse_early_mem_map_alloc, so allocate usemap_map at first. |
| 546 | */ | 549 | */ |
| 547 | size = sizeof(unsigned long *) * NR_MEM_SECTIONS; | 550 | size = sizeof(unsigned long *) * NR_MEM_SECTIONS; |
| 548 | usemap_map = alloc_bootmem(size); | 551 | usemap_map = memblock_virt_alloc(size, 0); |
| 549 | if (!usemap_map) | 552 | if (!usemap_map) |
| 550 | panic("can not allocate usemap_map\n"); | 553 | panic("can not allocate usemap_map\n"); |
| 551 | alloc_usemap_and_memmap(sparse_early_usemaps_alloc_node, | 554 | alloc_usemap_and_memmap(sparse_early_usemaps_alloc_node, |
| @@ -553,7 +556,7 @@ void __init sparse_init(void) | |||
| 553 | 556 | ||
| 554 | #ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER | 557 | #ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER |
| 555 | size2 = sizeof(struct page *) * NR_MEM_SECTIONS; | 558 | size2 = sizeof(struct page *) * NR_MEM_SECTIONS; |
| 556 | map_map = alloc_bootmem(size2); | 559 | map_map = memblock_virt_alloc(size2, 0); |
| 557 | if (!map_map) | 560 | if (!map_map) |
| 558 | panic("can not allocate map_map\n"); | 561 | panic("can not allocate map_map\n"); |
| 559 | alloc_usemap_and_memmap(sparse_early_mem_maps_alloc_node, | 562 | alloc_usemap_and_memmap(sparse_early_mem_maps_alloc_node, |
| @@ -583,9 +586,9 @@ void __init sparse_init(void) | |||
| 583 | vmemmap_populate_print_last(); | 586 | vmemmap_populate_print_last(); |
| 584 | 587 | ||
| 585 | #ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER | 588 | #ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER |
| 586 | free_bootmem(__pa(map_map), size2); | 589 | memblock_free_early(__pa(map_map), size2); |
| 587 | #endif | 590 | #endif |
| 588 | free_bootmem(__pa(usemap_map), size); | 591 | memblock_free_early(__pa(usemap_map), size); |
| 589 | } | 592 | } |
| 590 | 593 | ||
| 591 | #ifdef CONFIG_MEMORY_HOTPLUG | 594 | #ifdef CONFIG_MEMORY_HOTPLUG |
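The two sparse hunks above are part of the bootmem-to-memblock conversion: panicking node-local allocations go through memblock_virt_alloc_node()/memblock_virt_alloc_try_nid(), the _nopanic variants replace their bootmem counterparts, and early frees use memblock_free_early() with a physical address. A small sketch of the resulting shape, assuming only the wrapper names shown in the hunks (the surrounding function is hypothetical):

static void __init example_early_buffer(int nid, unsigned long size)
{
	void *p;

	/* node-preferred allocation; panics on failure like alloc_bootmem_node() did */
	p = memblock_virt_alloc_node(size, nid);

	/* ... consume p during early boot ... */

	/* early allocations are handed back by physical address */
	memblock_free_early(__pa(p), size);
}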
diff --git a/mm/swap.c b/mm/swap.c --- a/mm/swap.c +++ b/mm/swap.c | |||
| @@ -31,7 +31,6 @@ | |||
| 31 | #include <linux/memcontrol.h> | 31 | #include <linux/memcontrol.h> |
| 32 | #include <linux/gfp.h> | 32 | #include <linux/gfp.h> |
| 33 | #include <linux/uio.h> | 33 | #include <linux/uio.h> |
| 34 | #include <linux/hugetlb.h> | ||
| 35 | 34 | ||
| 36 | #include "internal.h" | 35 | #include "internal.h" |
| 37 | 36 | ||
| @@ -58,7 +57,7 @@ static void __page_cache_release(struct page *page) | |||
| 58 | 57 | ||
| 59 | spin_lock_irqsave(&zone->lru_lock, flags); | 58 | spin_lock_irqsave(&zone->lru_lock, flags); |
| 60 | lruvec = mem_cgroup_page_lruvec(page, zone); | 59 | lruvec = mem_cgroup_page_lruvec(page, zone); |
| 61 | VM_BUG_ON(!PageLRU(page)); | 60 | VM_BUG_ON_PAGE(!PageLRU(page), page); |
| 62 | __ClearPageLRU(page); | 61 | __ClearPageLRU(page); |
| 63 | del_page_from_lru_list(page, lruvec, page_off_lru(page)); | 62 | del_page_from_lru_list(page, lruvec, page_off_lru(page)); |
| 64 | spin_unlock_irqrestore(&zone->lru_lock, flags); | 63 | spin_unlock_irqrestore(&zone->lru_lock, flags); |
| @@ -82,118 +81,150 @@ static void __put_compound_page(struct page *page) | |||
| 82 | 81 | ||
| 83 | static void put_compound_page(struct page *page) | 82 | static void put_compound_page(struct page *page) |
| 84 | { | 83 | { |
| 85 | if (unlikely(PageTail(page))) { | 84 | struct page *page_head; |
| 86 | /* __split_huge_page_refcount can run under us */ | ||
| 87 | struct page *page_head = compound_trans_head(page); | ||
| 88 | |||
| 89 | if (likely(page != page_head && | ||
| 90 | get_page_unless_zero(page_head))) { | ||
| 91 | unsigned long flags; | ||
| 92 | 85 | ||
| 86 | if (likely(!PageTail(page))) { | ||
| 87 | if (put_page_testzero(page)) { | ||
| 93 | /* | 88 | /* |
| 94 | * THP can not break up slab pages so avoid taking | 89 | * By the time all refcounts have been released |
| 95 | * compound_lock(). Slab performs non-atomic bit ops | 90 | * split_huge_page cannot run anymore from under us. |
| 96 | * on page->flags for better performance. In particular | ||
| 97 | * slab_unlock() in slub used to be a hot path. It is | ||
| 98 | * still hot on arches that do not support | ||
| 99 | * this_cpu_cmpxchg_double(). | ||
| 100 | */ | 91 | */ |
| 101 | if (PageSlab(page_head) || PageHeadHuge(page_head)) { | 92 | if (PageHead(page)) |
| 102 | if (likely(PageTail(page))) { | 93 | __put_compound_page(page); |
| 103 | /* | 94 | else |
| 104 | * __split_huge_page_refcount | 95 | __put_single_page(page); |
| 105 | * cannot race here. | 96 | } |
| 106 | */ | 97 | return; |
| 107 | VM_BUG_ON(!PageHead(page_head)); | 98 | } |
| 108 | atomic_dec(&page->_mapcount); | 99 | |
| 109 | if (put_page_testzero(page_head)) | 100 | /* __split_huge_page_refcount can run under us */ |
| 110 | VM_BUG_ON(1); | 101 | page_head = compound_head(page); |
| 111 | if (put_page_testzero(page_head)) | 102 | |
| 112 | __put_compound_page(page_head); | 103 | /* |
| 113 | return; | 104 | * THP can not break up slab pages so avoid taking |
| 114 | } else | 105 | * compound_lock() and skip the tail page refcounting (in |
| 115 | /* | 106 | * _mapcount) too. Slab performs non-atomic bit ops on |
| 116 | * __split_huge_page_refcount | 107 | * page->flags for better performance. In particular |
| 117 | * run before us, "page" was a | 108 | * slab_unlock() in slub used to be a hot path. It is still |
| 118 | * THP tail. The split | 109 | * hot on arches that do not support |
| 119 | * page_head has been freed | 110 | * this_cpu_cmpxchg_double(). |
| 120 | * and reallocated as slab or | 111 | * |
| 121 | * hugetlbfs page of smaller | 112 | * If "page" is part of a slab or hugetlbfs page it cannot be |
| 122 | * order (only possible if | 113 | * splitted and the head page cannot change from under us. And |
| 123 | * reallocated as slab on | 114 | * if "page" is part of a THP page under splitting, if the |
| 124 | * x86). | 115 | * head page pointed by the THP tail isn't a THP head anymore, |
| 125 | */ | 116 | * we'll find PageTail clear after smp_rmb() and we'll treat |
| 126 | goto skip_lock; | 117 | * it as a single page. |
| 127 | } | 118 | */ |
| 119 | if (!__compound_tail_refcounted(page_head)) { | ||
| 120 | /* | ||
| 121 | * If "page" is a THP tail, we must read the tail page | ||
| 122 | * flags after the head page flags. The | ||
| 123 | * split_huge_page side enforces write memory barriers | ||
| 124 | * between clearing PageTail and before the head page | ||
| 125 | * can be freed and reallocated. | ||
| 126 | */ | ||
| 127 | smp_rmb(); | ||
| 128 | if (likely(PageTail(page))) { | ||
| 128 | /* | 129 | /* |
| 129 | * page_head wasn't a dangling pointer but it | 130 | * __split_huge_page_refcount cannot race |
| 130 | * may not be a head page anymore by the time | 131 | * here. |
| 131 | * we obtain the lock. That is ok as long as it | ||
| 132 | * can't be freed from under us. | ||
| 133 | */ | 132 | */ |
| 134 | flags = compound_lock_irqsave(page_head); | 133 | VM_BUG_ON_PAGE(!PageHead(page_head), page_head); |
| 135 | if (unlikely(!PageTail(page))) { | 134 | VM_BUG_ON_PAGE(page_mapcount(page) != 0, page); |
| 136 | /* __split_huge_page_refcount run before us */ | 135 | if (put_page_testzero(page_head)) { |
| 137 | compound_unlock_irqrestore(page_head, flags); | 136 | /* |
| 138 | skip_lock: | 137 | * If this is the tail of a slab |
| 139 | if (put_page_testzero(page_head)) { | 138 | * compound page, the tail pin must |
| 140 | /* | 139 | * not be the last reference held on |
| 141 | * The head page may have been | 140 | * the page, because the PG_slab |
| 142 | * freed and reallocated as a | 141 | * cannot be cleared before all tail |
| 143 | * compound page of smaller | 142 | * pins (which skips the _mapcount |
| 144 | * order and then freed again. | 143 | * tail refcounting) have been |
| 145 | * All we know is that it | 144 | * released. For hugetlbfs the tail |
| 146 | * cannot have become: a THP | 145 | * pin may be the last reference on |
| 147 | * page, a compound page of | 146 | * the page instead, because |
| 148 | * higher order, a tail page. | 147 | * PageHeadHuge will not go away until |
| 149 | * That is because we still | 148 | * the compound page enters the buddy |
| 150 | * hold the refcount of the | 149 | * allocator. |
| 151 | * split THP tail and | 150 | */ |
| 152 | * page_head was the THP head | 151 | VM_BUG_ON_PAGE(PageSlab(page_head), page_head); |
| 153 | * before the split. | 152 | __put_compound_page(page_head); |
| 154 | */ | ||
| 155 | if (PageHead(page_head)) | ||
| 156 | __put_compound_page(page_head); | ||
| 157 | else | ||
| 158 | __put_single_page(page_head); | ||
| 159 | } | ||
| 160 | out_put_single: | ||
| 161 | if (put_page_testzero(page)) | ||
| 162 | __put_single_page(page); | ||
| 163 | return; | ||
| 164 | } | 153 | } |
| 165 | VM_BUG_ON(page_head != page->first_page); | 154 | return; |
| 155 | } else | ||
| 166 | /* | 156 | /* |
| 167 | * We can release the refcount taken by | 157 | * __split_huge_page_refcount run before us, |
| 168 | * get_page_unless_zero() now that | 158 | * "page" was a THP tail. The split page_head |
| 169 | * __split_huge_page_refcount() is blocked on | 159 | * has been freed and reallocated as slab or |
| 170 | * the compound_lock. | 160 | * hugetlbfs page of smaller order (only |
| 161 | * possible if reallocated as slab on x86). | ||
| 171 | */ | 162 | */ |
| 172 | if (put_page_testzero(page_head)) | 163 | goto out_put_single; |
| 173 | VM_BUG_ON(1); | 164 | } |
| 174 | /* __split_huge_page_refcount will wait now */ | ||
| 175 | VM_BUG_ON(page_mapcount(page) <= 0); | ||
| 176 | atomic_dec(&page->_mapcount); | ||
| 177 | VM_BUG_ON(atomic_read(&page_head->_count) <= 0); | ||
| 178 | VM_BUG_ON(atomic_read(&page->_count) != 0); | ||
| 179 | compound_unlock_irqrestore(page_head, flags); | ||
| 180 | 165 | ||
| 166 | if (likely(page != page_head && get_page_unless_zero(page_head))) { | ||
| 167 | unsigned long flags; | ||
| 168 | |||
| 169 | /* | ||
| 170 | * page_head wasn't a dangling pointer but it may not | ||
| 171 | * be a head page anymore by the time we obtain the | ||
| 172 | * lock. That is ok as long as it can't be freed from | ||
| 173 | * under us. | ||
| 174 | */ | ||
| 175 | flags = compound_lock_irqsave(page_head); | ||
| 176 | if (unlikely(!PageTail(page))) { | ||
| 177 | /* __split_huge_page_refcount run before us */ | ||
| 178 | compound_unlock_irqrestore(page_head, flags); | ||
| 181 | if (put_page_testzero(page_head)) { | 179 | if (put_page_testzero(page_head)) { |
| 180 | /* | ||
| 181 | * The head page may have been freed | ||
| 182 | * and reallocated as a compound page | ||
| 183 | * of smaller order and then freed | ||
| 184 | * again. All we know is that it | ||
| 185 | * cannot have become: a THP page, a | ||
| 186 | * compound page of higher order, a | ||
| 187 | * tail page. That is because we | ||
| 188 | * still hold the refcount of the | ||
| 189 | * split THP tail and page_head was | ||
| 190 | * the THP head before the split. | ||
| 191 | */ | ||
| 182 | if (PageHead(page_head)) | 192 | if (PageHead(page_head)) |
| 183 | __put_compound_page(page_head); | 193 | __put_compound_page(page_head); |
| 184 | else | 194 | else |
| 185 | __put_single_page(page_head); | 195 | __put_single_page(page_head); |
| 186 | } | 196 | } |
| 187 | } else { | 197 | out_put_single: |
| 188 | /* page_head is a dangling pointer */ | 198 | if (put_page_testzero(page)) |
| 189 | VM_BUG_ON(PageTail(page)); | 199 | __put_single_page(page); |
| 190 | goto out_put_single; | 200 | return; |
| 191 | } | 201 | } |
| 192 | } else if (put_page_testzero(page)) { | 202 | VM_BUG_ON_PAGE(page_head != page->first_page, page); |
| 193 | if (PageHead(page)) | 203 | /* |
| 194 | __put_compound_page(page); | 204 | * We can release the refcount taken by |
| 195 | else | 205 | * get_page_unless_zero() now that |
| 196 | __put_single_page(page); | 206 | * __split_huge_page_refcount() is blocked on the |
| 207 | * compound_lock. | ||
| 208 | */ | ||
| 209 | if (put_page_testzero(page_head)) | ||
| 210 | VM_BUG_ON_PAGE(1, page_head); | ||
| 211 | /* __split_huge_page_refcount will wait now */ | ||
| 212 | VM_BUG_ON_PAGE(page_mapcount(page) <= 0, page); | ||
| 213 | atomic_dec(&page->_mapcount); | ||
| 214 | VM_BUG_ON_PAGE(atomic_read(&page_head->_count) <= 0, page_head); | ||
| 215 | VM_BUG_ON_PAGE(atomic_read(&page->_count) != 0, page); | ||
| 216 | compound_unlock_irqrestore(page_head, flags); | ||
| 217 | |||
| 218 | if (put_page_testzero(page_head)) { | ||
| 219 | if (PageHead(page_head)) | ||
| 220 | __put_compound_page(page_head); | ||
| 221 | else | ||
| 222 | __put_single_page(page_head); | ||
| 223 | } | ||
| 224 | } else { | ||
| 225 | /* page_head is a dangling pointer */ | ||
| 226 | VM_BUG_ON_PAGE(PageTail(page), page); | ||
| 227 | goto out_put_single; | ||
| 197 | } | 228 | } |
| 198 | } | 229 | } |
| 199 | 230 | ||
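The put_compound_page() rewrite above hinges on __compound_tail_refcounted(): the old code open-coded the same test as PageSlab(page_head) || PageHeadHuge(page_head), so the helper presumably reduces to the inverse of that check. A hedged sketch (the actual definition lives in the mm headers, not in this hunk):

static inline bool example_tail_refcounted(struct page *head)
{
	/* slab and hugetlbfs compound pages do not refcount their tail pages */
	return !PageSlab(head) && !PageHeadHuge(head);
}

When the helper says tails are not refcounted, the new code can skip both compound_lock() and the _mapcount-based tail pin entirely, which is the fast path the function takes before falling back to the compound_lock()-protected slow path.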
| @@ -221,36 +252,37 @@ bool __get_page_tail(struct page *page) | |||
| 221 | * split_huge_page(). | 252 | * split_huge_page(). |
| 222 | */ | 253 | */ |
| 223 | unsigned long flags; | 254 | unsigned long flags; |
| 224 | bool got = false; | 255 | bool got; |
| 225 | struct page *page_head = compound_trans_head(page); | 256 | struct page *page_head = compound_head(page); |
| 226 | 257 | ||
| 227 | if (likely(page != page_head && get_page_unless_zero(page_head))) { | 258 | /* Ref to put_compound_page() comment. */ |
| 228 | /* Ref to put_compound_page() comment. */ | 259 | if (!__compound_tail_refcounted(page_head)) { |
| 229 | if (PageSlab(page_head) || PageHeadHuge(page_head)) { | 260 | smp_rmb(); |
| 230 | if (likely(PageTail(page))) { | 261 | if (likely(PageTail(page))) { |
| 231 | /* | 262 | /* |
| 232 | * This is a hugetlbfs page or a slab | 263 | * This is a hugetlbfs page or a slab |
| 233 | * page. __split_huge_page_refcount | 264 | * page. __split_huge_page_refcount |
| 234 | * cannot race here. | 265 | * cannot race here. |
| 235 | */ | 266 | */ |
| 236 | VM_BUG_ON(!PageHead(page_head)); | 267 | VM_BUG_ON_PAGE(!PageHead(page_head), page_head); |
| 237 | __get_page_tail_foll(page, false); | 268 | __get_page_tail_foll(page, true); |
| 238 | return true; | 269 | return true; |
| 239 | } else { | 270 | } else { |
| 240 | /* | 271 | /* |
| 241 | * __split_huge_page_refcount run | 272 | * __split_huge_page_refcount run |
| 242 | * before us, "page" was a THP | 273 | * before us, "page" was a THP |
| 243 | * tail. The split page_head has been | 274 | * tail. The split page_head has been |
| 244 | * freed and reallocated as slab or | 275 | * freed and reallocated as slab or |
| 245 | * hugetlbfs page of smaller order | 276 | * hugetlbfs page of smaller order |
| 246 | * (only possible if reallocated as | 277 | * (only possible if reallocated as |
| 247 | * slab on x86). | 278 | * slab on x86). |
| 248 | */ | 279 | */ |
| 249 | put_page(page_head); | 280 | return false; |
| 250 | return false; | ||
| 251 | } | ||
| 252 | } | 281 | } |
| 282 | } | ||
| 253 | 283 | ||
| 284 | got = false; | ||
| 285 | if (likely(page != page_head && get_page_unless_zero(page_head))) { | ||
| 254 | /* | 286 | /* |
| 255 | * page_head wasn't a dangling pointer but it | 287 | * page_head wasn't a dangling pointer but it |
| 256 | * may not be a head page anymore by the time | 288 | * may not be a head page anymore by the time |
| @@ -572,8 +604,8 @@ EXPORT_SYMBOL(__lru_cache_add); | |||
| 572 | */ | 604 | */ |
| 573 | void lru_cache_add(struct page *page) | 605 | void lru_cache_add(struct page *page) |
| 574 | { | 606 | { |
| 575 | VM_BUG_ON(PageActive(page) && PageUnevictable(page)); | 607 | VM_BUG_ON_PAGE(PageActive(page) && PageUnevictable(page), page); |
| 576 | VM_BUG_ON(PageLRU(page)); | 608 | VM_BUG_ON_PAGE(PageLRU(page), page); |
| 577 | __lru_cache_add(page); | 609 | __lru_cache_add(page); |
| 578 | } | 610 | } |
| 579 | 611 | ||
| @@ -814,7 +846,7 @@ void release_pages(struct page **pages, int nr, int cold) | |||
| 814 | } | 846 | } |
| 815 | 847 | ||
| 816 | lruvec = mem_cgroup_page_lruvec(page, zone); | 848 | lruvec = mem_cgroup_page_lruvec(page, zone); |
| 817 | VM_BUG_ON(!PageLRU(page)); | 849 | VM_BUG_ON_PAGE(!PageLRU(page), page); |
| 818 | __ClearPageLRU(page); | 850 | __ClearPageLRU(page); |
| 819 | del_page_from_lru_list(page, lruvec, page_off_lru(page)); | 851 | del_page_from_lru_list(page, lruvec, page_off_lru(page)); |
| 820 | } | 852 | } |
| @@ -856,9 +888,9 @@ void lru_add_page_tail(struct page *page, struct page *page_tail, | |||
| 856 | { | 888 | { |
| 857 | const int file = 0; | 889 | const int file = 0; |
| 858 | 890 | ||
| 859 | VM_BUG_ON(!PageHead(page)); | 891 | VM_BUG_ON_PAGE(!PageHead(page), page); |
| 860 | VM_BUG_ON(PageCompound(page_tail)); | 892 | VM_BUG_ON_PAGE(PageCompound(page_tail), page); |
| 861 | VM_BUG_ON(PageLRU(page_tail)); | 893 | VM_BUG_ON_PAGE(PageLRU(page_tail), page); |
| 862 | VM_BUG_ON(NR_CPUS != 1 && | 894 | VM_BUG_ON(NR_CPUS != 1 && |
| 863 | !spin_is_locked(&lruvec_zone(lruvec)->lru_lock)); | 895 | !spin_is_locked(&lruvec_zone(lruvec)->lru_lock)); |
| 864 | 896 | ||
| @@ -897,7 +929,7 @@ static void __pagevec_lru_add_fn(struct page *page, struct lruvec *lruvec, | |||
| 897 | int active = PageActive(page); | 929 | int active = PageActive(page); |
| 898 | enum lru_list lru = page_lru(page); | 930 | enum lru_list lru = page_lru(page); |
| 899 | 931 | ||
| 900 | VM_BUG_ON(PageLRU(page)); | 932 | VM_BUG_ON_PAGE(PageLRU(page), page); |
| 901 | 933 | ||
| 902 | SetPageLRU(page); | 934 | SetPageLRU(page); |
| 903 | add_page_to_lru_list(page, lruvec, lru); | 935 | add_page_to_lru_list(page, lruvec, lru); |
diff --git a/mm/swap_state.c b/mm/swap_state.c index e6f15f8ca2af..e76ace30d436 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c | |||
| @@ -63,6 +63,8 @@ unsigned long total_swapcache_pages(void) | |||
| 63 | return ret; | 63 | return ret; |
| 64 | } | 64 | } |
| 65 | 65 | ||
| 66 | static atomic_t swapin_readahead_hits = ATOMIC_INIT(4); | ||
| 67 | |||
| 66 | void show_swap_cache_info(void) | 68 | void show_swap_cache_info(void) |
| 67 | { | 69 | { |
| 68 | printk("%lu pages in swap cache\n", total_swapcache_pages()); | 70 | printk("%lu pages in swap cache\n", total_swapcache_pages()); |
| @@ -83,9 +85,9 @@ int __add_to_swap_cache(struct page *page, swp_entry_t entry) | |||
| 83 | int error; | 85 | int error; |
| 84 | struct address_space *address_space; | 86 | struct address_space *address_space; |
| 85 | 87 | ||
| 86 | VM_BUG_ON(!PageLocked(page)); | 88 | VM_BUG_ON_PAGE(!PageLocked(page), page); |
| 87 | VM_BUG_ON(PageSwapCache(page)); | 89 | VM_BUG_ON_PAGE(PageSwapCache(page), page); |
| 88 | VM_BUG_ON(!PageSwapBacked(page)); | 90 | VM_BUG_ON_PAGE(!PageSwapBacked(page), page); |
| 89 | 91 | ||
| 90 | page_cache_get(page); | 92 | page_cache_get(page); |
| 91 | SetPageSwapCache(page); | 93 | SetPageSwapCache(page); |
| @@ -139,9 +141,9 @@ void __delete_from_swap_cache(struct page *page) | |||
| 139 | swp_entry_t entry; | 141 | swp_entry_t entry; |
| 140 | struct address_space *address_space; | 142 | struct address_space *address_space; |
| 141 | 143 | ||
| 142 | VM_BUG_ON(!PageLocked(page)); | 144 | VM_BUG_ON_PAGE(!PageLocked(page), page); |
| 143 | VM_BUG_ON(!PageSwapCache(page)); | 145 | VM_BUG_ON_PAGE(!PageSwapCache(page), page); |
| 144 | VM_BUG_ON(PageWriteback(page)); | 146 | VM_BUG_ON_PAGE(PageWriteback(page), page); |
| 145 | 147 | ||
| 146 | entry.val = page_private(page); | 148 | entry.val = page_private(page); |
| 147 | address_space = swap_address_space(entry); | 149 | address_space = swap_address_space(entry); |
| @@ -165,8 +167,8 @@ int add_to_swap(struct page *page, struct list_head *list) | |||
| 165 | swp_entry_t entry; | 167 | swp_entry_t entry; |
| 166 | int err; | 168 | int err; |
| 167 | 169 | ||
| 168 | VM_BUG_ON(!PageLocked(page)); | 170 | VM_BUG_ON_PAGE(!PageLocked(page), page); |
| 169 | VM_BUG_ON(!PageUptodate(page)); | 171 | VM_BUG_ON_PAGE(!PageUptodate(page), page); |
| 170 | 172 | ||
| 171 | entry = get_swap_page(); | 173 | entry = get_swap_page(); |
| 172 | if (!entry.val) | 174 | if (!entry.val) |
| @@ -286,8 +288,11 @@ struct page * lookup_swap_cache(swp_entry_t entry) | |||
| 286 | 288 | ||
| 287 | page = find_get_page(swap_address_space(entry), entry.val); | 289 | page = find_get_page(swap_address_space(entry), entry.val); |
| 288 | 290 | ||
| 289 | if (page) | 291 | if (page) { |
| 290 | INC_CACHE_INFO(find_success); | 292 | INC_CACHE_INFO(find_success); |
| 293 | if (TestClearPageReadahead(page)) | ||
| 294 | atomic_inc(&swapin_readahead_hits); | ||
| 295 | } | ||
| 291 | 296 | ||
| 292 | INC_CACHE_INFO(find_total); | 297 | INC_CACHE_INFO(find_total); |
| 293 | return page; | 298 | return page; |
| @@ -389,6 +394,50 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, | |||
| 389 | return found_page; | 394 | return found_page; |
| 390 | } | 395 | } |
| 391 | 396 | ||
| 397 | static unsigned long swapin_nr_pages(unsigned long offset) | ||
| 398 | { | ||
| 399 | static unsigned long prev_offset; | ||
| 400 | unsigned int pages, max_pages, last_ra; | ||
| 401 | static atomic_t last_readahead_pages; | ||
| 402 | |||
| 403 | max_pages = 1 << ACCESS_ONCE(page_cluster); | ||
| 404 | if (max_pages <= 1) | ||
| 405 | return 1; | ||
| 406 | |||
| 407 | /* | ||
| 408 | * This heuristic has been found to work well on both sequential and | ||
| 409 | * random loads, swapping to hard disk or to SSD: please don't ask | ||
| 410 | * what the "+ 2" means, it just happens to work well, that's all. | ||
| 411 | */ | ||
| 412 | pages = atomic_xchg(&swapin_readahead_hits, 0) + 2; | ||
| 413 | if (pages == 2) { | ||
| 414 | /* | ||
| 415 | * We can have no readahead hits to judge by: but must not get | ||
| 416 | * stuck here forever, so check for an adjacent offset instead | ||
| 417 | * (and don't even bother to check whether swap type is same). | ||
| 418 | */ | ||
| 419 | if (offset != prev_offset + 1 && offset != prev_offset - 1) | ||
| 420 | pages = 1; | ||
| 421 | prev_offset = offset; | ||
| 422 | } else { | ||
| 423 | unsigned int roundup = 4; | ||
| 424 | while (roundup < pages) | ||
| 425 | roundup <<= 1; | ||
| 426 | pages = roundup; | ||
| 427 | } | ||
| 428 | |||
| 429 | if (pages > max_pages) | ||
| 430 | pages = max_pages; | ||
| 431 | |||
| 432 | /* Don't shrink readahead too fast */ | ||
| 433 | last_ra = atomic_read(&last_readahead_pages) / 2; | ||
| 434 | if (pages < last_ra) | ||
| 435 | pages = last_ra; | ||
| 436 | atomic_set(&last_readahead_pages, pages); | ||
| 437 | |||
| 438 | return pages; | ||
| 439 | } | ||
| 440 | |||
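For illustration, the window-sizing heuristic above can be modelled in user space as follows. This is a simplified single-threaded sketch (no atomics, and the prev_offset adjacency check is omitted); swapin_window(), hits, and last_ra are hypothetical names for the example, not kernel APIs.

#include <stdio.h>

/* Simplified model of swapin_nr_pages(): round the hit count up to a power of
 * two (at least 4), cap it at 2^page_cluster, and never shrink the window to
 * less than half of the previous one. */
static unsigned int swapin_window(unsigned int page_cluster, unsigned int hits,
                                  unsigned int *last_ra)
{
        unsigned int pages, max_pages = 1u << page_cluster;

        if (max_pages <= 1)
                return 1;

        pages = hits + 2;                       /* the "+ 2" heuristic from above */
        if (pages == 2) {
                pages = 1;                      /* no hits (adjacency check omitted) */
        } else {
                unsigned int roundup = 4;
                while (roundup < pages)         /* round up to a power of two, >= 4 */
                        roundup <<= 1;
                pages = roundup;
        }

        if (pages > max_pages)
                pages = max_pages;
        if (pages < *last_ra / 2)               /* don't shrink readahead too fast */
                pages = *last_ra / 2;
        *last_ra = pages;
        return pages;
}

int main(void)
{
        unsigned int i, last_ra = 0, hits[] = { 0, 1, 3, 9, 0 };

        for (i = 0; i < 5; i++)                 /* page_cluster = 3, i.e. at most 8 pages */
                printf("hits=%u -> window=%u\n", hits[i],
                       swapin_window(3, hits[i], &last_ra));
        return 0;
}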
| 392 | /** | 441 | /** |
| 393 | * swapin_readahead - swap in pages in hope we need them soon | 442 | * swapin_readahead - swap in pages in hope we need them soon |
| 394 | * @entry: swap entry of this memory | 443 | * @entry: swap entry of this memory |
| @@ -412,11 +461,16 @@ struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask, | |||
| 412 | struct vm_area_struct *vma, unsigned long addr) | 461 | struct vm_area_struct *vma, unsigned long addr) |
| 413 | { | 462 | { |
| 414 | struct page *page; | 463 | struct page *page; |
| 415 | unsigned long offset = swp_offset(entry); | 464 | unsigned long entry_offset = swp_offset(entry); |
| 465 | unsigned long offset = entry_offset; | ||
| 416 | unsigned long start_offset, end_offset; | 466 | unsigned long start_offset, end_offset; |
| 417 | unsigned long mask = (1UL << page_cluster) - 1; | 467 | unsigned long mask; |
| 418 | struct blk_plug plug; | 468 | struct blk_plug plug; |
| 419 | 469 | ||
| 470 | mask = swapin_nr_pages(offset) - 1; | ||
| 471 | if (!mask) | ||
| 472 | goto skip; | ||
| 473 | |||
| 420 | /* Read a page_cluster sized and aligned cluster around offset. */ | 474 | /* Read a page_cluster sized and aligned cluster around offset. */ |
| 421 | start_offset = offset & ~mask; | 475 | start_offset = offset & ~mask; |
| 422 | end_offset = offset | mask; | 476 | end_offset = offset | mask; |
| @@ -430,10 +484,13 @@ struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask, | |||
| 430 | gfp_mask, vma, addr); | 484 | gfp_mask, vma, addr); |
| 431 | if (!page) | 485 | if (!page) |
| 432 | continue; | 486 | continue; |
| 487 | if (offset != entry_offset) | ||
| 488 | SetPageReadahead(page); | ||
| 433 | page_cache_release(page); | 489 | page_cache_release(page); |
| 434 | } | 490 | } |
| 435 | blk_finish_plug(&plug); | 491 | blk_finish_plug(&plug); |
| 436 | 492 | ||
| 437 | lru_add_drain(); /* Push any new pages onto the LRU now */ | 493 | lru_add_drain(); /* Push any new pages onto the LRU now */ |
| 494 | skip: | ||
| 438 | return read_swap_cache_async(entry, gfp_mask, vma, addr); | 495 | return read_swap_cache_async(entry, gfp_mask, vma, addr); |
| 439 | } | 496 | } |
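As a worked example of the cluster alignment above: with a window of 8 pages (mask = 7) and a fault at swap offset 21, start_offset = 21 & ~7 = 16 and end_offset = 21 | 7 = 23, so offsets 16..23 are read and every page except the faulting one is tagged for readahead accounting. A minimal user-space check of that arithmetic:

#include <stdio.h>

int main(void)
{
        unsigned long offset = 21, mask = 8 - 1;   /* window of 8 pages */
        unsigned long start = offset & ~mask;      /* 16: aligned cluster start */
        unsigned long end = offset | mask;         /* 23: last offset in cluster */

        printf("read offsets %lu..%lu around fault at %lu\n", start, end, offset);
        return 0;
}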
diff --git a/mm/swapfile.c b/mm/swapfile.c index 612a7c9795f6..4a7f7e6992b6 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c | |||
| @@ -616,7 +616,7 @@ scan: | |||
| 616 | } | 616 | } |
| 617 | } | 617 | } |
| 618 | offset = si->lowest_bit; | 618 | offset = si->lowest_bit; |
| 619 | while (++offset < scan_base) { | 619 | while (offset < scan_base) { |
| 620 | if (!si->swap_map[offset]) { | 620 | if (!si->swap_map[offset]) { |
| 621 | spin_lock(&si->lock); | 621 | spin_lock(&si->lock); |
| 622 | goto checks; | 622 | goto checks; |
| @@ -629,6 +629,7 @@ scan: | |||
| 629 | cond_resched(); | 629 | cond_resched(); |
| 630 | latency_ration = LATENCY_LIMIT; | 630 | latency_ration = LATENCY_LIMIT; |
| 631 | } | 631 | } |
| 632 | offset++; | ||
| 632 | } | 633 | } |
| 633 | spin_lock(&si->lock); | 634 | spin_lock(&si->lock); |
| 634 | 635 | ||
| @@ -906,7 +907,7 @@ int reuse_swap_page(struct page *page) | |||
| 906 | { | 907 | { |
| 907 | int count; | 908 | int count; |
| 908 | 909 | ||
| 909 | VM_BUG_ON(!PageLocked(page)); | 910 | VM_BUG_ON_PAGE(!PageLocked(page), page); |
| 910 | if (unlikely(PageKsm(page))) | 911 | if (unlikely(PageKsm(page))) |
| 911 | return 0; | 912 | return 0; |
| 912 | count = page_mapcount(page); | 913 | count = page_mapcount(page); |
| @@ -926,7 +927,7 @@ int reuse_swap_page(struct page *page) | |||
| 926 | */ | 927 | */ |
| 927 | int try_to_free_swap(struct page *page) | 928 | int try_to_free_swap(struct page *page) |
| 928 | { | 929 | { |
| 929 | VM_BUG_ON(!PageLocked(page)); | 930 | VM_BUG_ON_PAGE(!PageLocked(page), page); |
| 930 | 931 | ||
| 931 | if (!PageSwapCache(page)) | 932 | if (!PageSwapCache(page)) |
| 932 | return 0; | 933 | return 0; |
| @@ -1922,7 +1923,6 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) | |||
| 1922 | p->swap_map = NULL; | 1923 | p->swap_map = NULL; |
| 1923 | cluster_info = p->cluster_info; | 1924 | cluster_info = p->cluster_info; |
| 1924 | p->cluster_info = NULL; | 1925 | p->cluster_info = NULL; |
| 1925 | p->flags = 0; | ||
| 1926 | frontswap_map = frontswap_map_get(p); | 1926 | frontswap_map = frontswap_map_get(p); |
| 1927 | spin_unlock(&p->lock); | 1927 | spin_unlock(&p->lock); |
| 1928 | spin_unlock(&swap_lock); | 1928 | spin_unlock(&swap_lock); |
| @@ -1948,6 +1948,16 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) | |||
| 1948 | mutex_unlock(&inode->i_mutex); | 1948 | mutex_unlock(&inode->i_mutex); |
| 1949 | } | 1949 | } |
| 1950 | filp_close(swap_file, NULL); | 1950 | filp_close(swap_file, NULL); |
| 1951 | |||
| 1952 | /* | ||
| 1953 | * Clear the SWP_USED flag after all resources are freed so that swapon | ||
| 1954 | * can reuse this swap_info in alloc_swap_info() safely. It is ok to | ||
| 1955 | * not hold p->lock after we cleared its SWP_WRITEOK. | ||
| 1956 | */ | ||
| 1957 | spin_lock(&swap_lock); | ||
| 1958 | p->flags = 0; | ||
| 1959 | spin_unlock(&swap_lock); | ||
| 1960 | |||
| 1951 | err = 0; | 1961 | err = 0; |
| 1952 | atomic_inc(&proc_poll_event); | 1962 | atomic_inc(&proc_poll_event); |
| 1953 | wake_up_interruptible(&proc_poll_wait); | 1963 | wake_up_interruptible(&proc_poll_wait); |
| @@ -2714,7 +2724,7 @@ struct swap_info_struct *page_swap_info(struct page *page) | |||
| 2714 | */ | 2724 | */ |
| 2715 | struct address_space *__page_file_mapping(struct page *page) | 2725 | struct address_space *__page_file_mapping(struct page *page) |
| 2716 | { | 2726 | { |
| 2717 | VM_BUG_ON(!PageSwapCache(page)); | 2727 | VM_BUG_ON_PAGE(!PageSwapCache(page), page); |
| 2718 | return page_swap_info(page)->swap_file->f_mapping; | 2728 | return page_swap_info(page)->swap_file->f_mapping; |
| 2719 | } | 2729 | } |
| 2720 | EXPORT_SYMBOL_GPL(__page_file_mapping); | 2730 | EXPORT_SYMBOL_GPL(__page_file_mapping); |
| @@ -2722,7 +2732,7 @@ EXPORT_SYMBOL_GPL(__page_file_mapping); | |||
| 2722 | pgoff_t __page_file_index(struct page *page) | 2732 | pgoff_t __page_file_index(struct page *page) |
| 2723 | { | 2733 | { |
| 2724 | swp_entry_t swap = { .val = page_private(page) }; | 2734 | swp_entry_t swap = { .val = page_private(page) }; |
| 2725 | VM_BUG_ON(!PageSwapCache(page)); | 2735 | VM_BUG_ON_PAGE(!PageSwapCache(page), page); |
| 2726 | return swp_offset(swap); | 2736 | return swp_offset(swap); |
| 2727 | } | 2737 | } |
| 2728 | EXPORT_SYMBOL_GPL(__page_file_index); | 2738 | EXPORT_SYMBOL_GPL(__page_file_index); |
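diff --git a/mm/util.c b/mm/util.c --- a/mm/util.c +++ b/mm/util.c | |||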
| @@ -390,7 +390,10 @@ struct address_space *page_mapping(struct page *page) | |||
| 390 | { | 390 | { |
| 391 | struct address_space *mapping = page->mapping; | 391 | struct address_space *mapping = page->mapping; |
| 392 | 392 | ||
| 393 | VM_BUG_ON(PageSlab(page)); | 393 | /* This happens if someone calls flush_dcache_page on slab page */ |
| 394 | if (unlikely(PageSlab(page))) | ||
| 395 | return NULL; | ||
| 396 | |||
| 394 | if (unlikely(PageSwapCache(page))) { | 397 | if (unlikely(PageSwapCache(page))) { |
| 395 | swp_entry_t entry; | 398 | swp_entry_t entry; |
| 396 | 399 | ||
| @@ -401,13 +404,45 @@ struct address_space *page_mapping(struct page *page) | |||
| 401 | return mapping; | 404 | return mapping; |
| 402 | } | 405 | } |
| 403 | 406 | ||
| 407 | int overcommit_ratio_handler(struct ctl_table *table, int write, | ||
| 408 | void __user *buffer, size_t *lenp, | ||
| 409 | loff_t *ppos) | ||
| 410 | { | ||
| 411 | int ret; | ||
| 412 | |||
| 413 | ret = proc_dointvec(table, write, buffer, lenp, ppos); | ||
| 414 | if (ret == 0 && write) | ||
| 415 | sysctl_overcommit_kbytes = 0; | ||
| 416 | return ret; | ||
| 417 | } | ||
| 418 | |||
| 419 | int overcommit_kbytes_handler(struct ctl_table *table, int write, | ||
| 420 | void __user *buffer, size_t *lenp, | ||
| 421 | loff_t *ppos) | ||
| 422 | { | ||
| 423 | int ret; | ||
| 424 | |||
| 425 | ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos); | ||
| 426 | if (ret == 0 && write) | ||
| 427 | sysctl_overcommit_ratio = 0; | ||
| 428 | return ret; | ||
| 429 | } | ||
| 430 | |||
| 404 | /* | 431 | /* |
| 405 | * Committed memory limit enforced when OVERCOMMIT_NEVER policy is used | 432 | * Committed memory limit enforced when OVERCOMMIT_NEVER policy is used |
| 406 | */ | 433 | */ |
| 407 | unsigned long vm_commit_limit(void) | 434 | unsigned long vm_commit_limit(void) |
| 408 | { | 435 | { |
| 409 | return ((totalram_pages - hugetlb_total_pages()) | 436 | unsigned long allowed; |
| 410 | * sysctl_overcommit_ratio / 100) + total_swap_pages; | 437 | |
| 438 | if (sysctl_overcommit_kbytes) | ||
| 439 | allowed = sysctl_overcommit_kbytes >> (PAGE_SHIFT - 10); | ||
| 440 | else | ||
| 441 | allowed = ((totalram_pages - hugetlb_total_pages()) | ||
| 442 | * sysctl_overcommit_ratio / 100); | ||
| 443 | allowed += total_swap_pages; | ||
| 444 | |||
| 445 | return allowed; | ||
| 411 | } | 446 | } |
| 412 | 447 | ||
| 413 | 448 | ||
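For illustration, the kbytes-versus-ratio behaviour of vm_commit_limit() can be modelled in user space as below; commit_limit() and the page counts are made up for the example, and PAGE_SHIFT is assumed to be 12 (4 KiB pages).

#include <stdio.h>

#define PAGE_SHIFT 12   /* assume 4 KiB pages */

/* Model of vm_commit_limit(): overcommit_kbytes, when non-zero, overrides the ratio. */
static unsigned long commit_limit(unsigned long total_pages,
                                  unsigned long hugetlb_pages,
                                  unsigned long swap_pages,
                                  unsigned long overcommit_kbytes,
                                  unsigned long overcommit_ratio)
{
        unsigned long allowed;

        if (overcommit_kbytes)
                allowed = overcommit_kbytes >> (PAGE_SHIFT - 10);
        else
                allowed = (total_pages - hugetlb_pages) * overcommit_ratio / 100;

        return allowed + swap_pages;
}

int main(void)
{
        /* 1 GiB of RAM (262144 pages), no hugetlb, 512 MiB of swap. */
        printf("ratio=50%%   -> %lu pages\n", commit_limit(262144, 0, 131072, 0, 50));
        printf("kbytes=1GiB -> %lu pages\n", commit_limit(262144, 0, 131072, 1048576, 50));
        return 0;
}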
diff --git a/mm/vmpressure.c b/mm/vmpressure.c index e0f62837c3f4..d4042e75f7c7 100644 --- a/mm/vmpressure.c +++ b/mm/vmpressure.c | |||
| @@ -19,6 +19,7 @@ | |||
| 19 | #include <linux/mm.h> | 19 | #include <linux/mm.h> |
| 20 | #include <linux/vmstat.h> | 20 | #include <linux/vmstat.h> |
| 21 | #include <linux/eventfd.h> | 21 | #include <linux/eventfd.h> |
| 22 | #include <linux/slab.h> | ||
| 22 | #include <linux/swap.h> | 23 | #include <linux/swap.h> |
| 23 | #include <linux/printk.h> | 24 | #include <linux/printk.h> |
| 24 | #include <linux/vmpressure.h> | 25 | #include <linux/vmpressure.h> |
| @@ -278,8 +279,7 @@ void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, int prio) | |||
| 278 | 279 | ||
| 279 | /** | 280 | /** |
| 280 | * vmpressure_register_event() - Bind vmpressure notifications to an eventfd | 281 | * vmpressure_register_event() - Bind vmpressure notifications to an eventfd |
| 281 | * @css: css that is interested in vmpressure notifications | 282 | * @memcg: memcg that is interested in vmpressure notifications |
| 282 | * @cft: cgroup control files handle | ||
| 283 | * @eventfd: eventfd context to link notifications with | 283 | * @eventfd: eventfd context to link notifications with |
| 284 | * @args: event arguments (used to set up a pressure level threshold) | 284 | * @args: event arguments (used to set up a pressure level threshold) |
| 285 | * | 285 | * |
| @@ -289,15 +289,12 @@ void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, int prio) | |||
| 289 | * threshold (one of vmpressure_str_levels, i.e. "low", "medium", or | 289 | * threshold (one of vmpressure_str_levels, i.e. "low", "medium", or |
| 290 | * "critical"). | 290 | * "critical"). |
| 291 | * | 291 | * |
| 292 | * This function should not be used directly, just pass it to (struct | 292 | * To be used as memcg event method. |
| 293 | * cftype).register_event, and then cgroup core will handle everything by | ||
| 294 | * itself. | ||
| 295 | */ | 293 | */ |
| 296 | int vmpressure_register_event(struct cgroup_subsys_state *css, | 294 | int vmpressure_register_event(struct mem_cgroup *memcg, |
| 297 | struct cftype *cft, struct eventfd_ctx *eventfd, | 295 | struct eventfd_ctx *eventfd, const char *args) |
| 298 | const char *args) | ||
| 299 | { | 296 | { |
| 300 | struct vmpressure *vmpr = css_to_vmpressure(css); | 297 | struct vmpressure *vmpr = memcg_to_vmpressure(memcg); |
| 301 | struct vmpressure_event *ev; | 298 | struct vmpressure_event *ev; |
| 302 | int level; | 299 | int level; |
| 303 | 300 | ||
| @@ -325,23 +322,19 @@ int vmpressure_register_event(struct cgroup_subsys_state *css, | |||
| 325 | 322 | ||
| 326 | /** | 323 | /** |
| 327 | * vmpressure_unregister_event() - Unbind eventfd from vmpressure | 324 | * vmpressure_unregister_event() - Unbind eventfd from vmpressure |
| 328 | * @css: css handle | 325 | * @memcg: memcg handle |
| 329 | * @cft: cgroup control files handle | ||
| 330 | * @eventfd: eventfd context that was used to link vmpressure with the @cg | 326 | * @eventfd: eventfd context that was used to link vmpressure with the @cg |
| 331 | * | 327 | * |
| 332 | * This function does internal manipulations to detach the @eventfd from | 328 | * This function does internal manipulations to detach the @eventfd from |
| 333 | * the vmpressure notifications, and then frees internal resources | 329 | * the vmpressure notifications, and then frees internal resources |
| 334 | * associated with the @eventfd (but the @eventfd itself is not freed). | 330 | * associated with the @eventfd (but the @eventfd itself is not freed). |
| 335 | * | 331 | * |
| 336 | * This function should not be used directly, just pass it to (struct | 332 | * To be used as memcg event method. |
| 337 | * cftype).unregister_event, and then cgroup core will handle everything | ||
| 338 | * by itself. | ||
| 339 | */ | 333 | */ |
| 340 | void vmpressure_unregister_event(struct cgroup_subsys_state *css, | 334 | void vmpressure_unregister_event(struct mem_cgroup *memcg, |
| 341 | struct cftype *cft, | ||
| 342 | struct eventfd_ctx *eventfd) | 335 | struct eventfd_ctx *eventfd) |
| 343 | { | 336 | { |
| 344 | struct vmpressure *vmpr = css_to_vmpressure(css); | 337 | struct vmpressure *vmpr = memcg_to_vmpressure(memcg); |
| 345 | struct vmpressure_event *ev; | 338 | struct vmpressure_event *ev; |
| 346 | 339 | ||
| 347 | mutex_lock(&vmpr->events_lock); | 340 | mutex_lock(&vmpr->events_lock); |
diff --git a/mm/vmscan.c b/mm/vmscan.c index eea668d9cff6..a9c74b409681 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c | |||
| @@ -147,7 +147,7 @@ static bool global_reclaim(struct scan_control *sc) | |||
| 147 | } | 147 | } |
| 148 | #endif | 148 | #endif |
| 149 | 149 | ||
| 150 | unsigned long zone_reclaimable_pages(struct zone *zone) | 150 | static unsigned long zone_reclaimable_pages(struct zone *zone) |
| 151 | { | 151 | { |
| 152 | int nr; | 152 | int nr; |
| 153 | 153 | ||
| @@ -281,17 +281,34 @@ shrink_slab_node(struct shrink_control *shrinkctl, struct shrinker *shrinker, | |||
| 281 | nr_pages_scanned, lru_pages, | 281 | nr_pages_scanned, lru_pages, |
| 282 | max_pass, delta, total_scan); | 282 | max_pass, delta, total_scan); |
| 283 | 283 | ||
| 284 | while (total_scan >= batch_size) { | 284 | /* |
| 285 | * Normally, we should not scan less than batch_size objects in one | ||
| 286 | * pass to avoid too frequent shrinker calls, but if the slab has less | ||
| 287 | * than batch_size objects in total and we are really tight on memory, | ||
| 288 | * we will try to reclaim all available objects, otherwise we can end | ||
| 289 | * up failing allocations although there are plenty of reclaimable | ||
| 290 | * objects spread over several slabs with usage less than the | ||
| 291 | * batch_size. | ||
| 292 | * | ||
| 293 | * We detect the "tight on memory" situations by looking at the total | ||
| 294 | * number of objects we want to scan (total_scan). If it is greater | ||
| 295 | * than the total number of objects on slab (max_pass), we must be | ||
| 296 | * scanning at high prio and therefore should try to reclaim as much as | ||
| 297 | * possible. | ||
| 298 | */ | ||
| 299 | while (total_scan >= batch_size || | ||
| 300 | total_scan >= max_pass) { | ||
| 285 | unsigned long ret; | 301 | unsigned long ret; |
| 302 | unsigned long nr_to_scan = min(batch_size, total_scan); | ||
| 286 | 303 | ||
| 287 | shrinkctl->nr_to_scan = batch_size; | 304 | shrinkctl->nr_to_scan = nr_to_scan; |
| 288 | ret = shrinker->scan_objects(shrinker, shrinkctl); | 305 | ret = shrinker->scan_objects(shrinker, shrinkctl); |
| 289 | if (ret == SHRINK_STOP) | 306 | if (ret == SHRINK_STOP) |
| 290 | break; | 307 | break; |
| 291 | freed += ret; | 308 | freed += ret; |
| 292 | 309 | ||
| 293 | count_vm_events(SLABS_SCANNED, batch_size); | 310 | count_vm_events(SLABS_SCANNED, nr_to_scan); |
| 294 | total_scan -= batch_size; | 311 | total_scan -= nr_to_scan; |
| 295 | 312 | ||
| 296 | cond_resched(); | 313 | cond_resched(); |
| 297 | } | 314 | } |
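For illustration, the revised loop condition can be modelled as below; scan_cache() is a hypothetical user-space stand-in for shrink_slab_node(), with a fake "free half of what we scan" callback in place of shrinker->scan_objects().

#include <stdio.h>

/* Model of the revised scan loop: small caches (max_pass < batch_size) are
 * still scanned when total_scan asks for more than they contain. */
static unsigned long scan_cache(unsigned long total_scan, unsigned long max_pass,
                                unsigned long batch_size)
{
        unsigned long freed = 0;

        while (total_scan >= batch_size || total_scan >= max_pass) {
                unsigned long nr_to_scan =
                        total_scan < batch_size ? total_scan : batch_size;

                freed += nr_to_scan / 2;        /* pretend half of them are freed */
                total_scan -= nr_to_scan;
                if (!nr_to_scan)
                        break;
        }
        return freed;
}

int main(void)
{
        /* The old "while (total_scan >= batch_size)" would skip this cache entirely. */
        printf("small cache (64 objects, batch 128): freed %lu\n", scan_cache(100, 64, 128));
        printf("large cache (4096 objects, batch 128): freed %lu\n", scan_cache(1000, 4096, 128));
        return 0;
}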
| @@ -352,16 +369,17 @@ unsigned long shrink_slab(struct shrink_control *shrinkctl, | |||
| 352 | } | 369 | } |
| 353 | 370 | ||
| 354 | list_for_each_entry(shrinker, &shrinker_list, list) { | 371 | list_for_each_entry(shrinker, &shrinker_list, list) { |
| 355 | for_each_node_mask(shrinkctl->nid, shrinkctl->nodes_to_scan) { | 372 | if (!(shrinker->flags & SHRINKER_NUMA_AWARE)) { |
| 356 | if (!node_online(shrinkctl->nid)) | 373 | shrinkctl->nid = 0; |
| 357 | continue; | ||
| 358 | |||
| 359 | if (!(shrinker->flags & SHRINKER_NUMA_AWARE) && | ||
| 360 | (shrinkctl->nid != 0)) | ||
| 361 | break; | ||
| 362 | |||
| 363 | freed += shrink_slab_node(shrinkctl, shrinker, | 374 | freed += shrink_slab_node(shrinkctl, shrinker, |
| 364 | nr_pages_scanned, lru_pages); | 375 | nr_pages_scanned, lru_pages); |
| 376 | continue; | ||
| 377 | } | ||
| 378 | |||
| 379 | for_each_node_mask(shrinkctl->nid, shrinkctl->nodes_to_scan) { | ||
| 380 | if (node_online(shrinkctl->nid)) | ||
| 381 | freed += shrink_slab_node(shrinkctl, shrinker, | ||
| 382 | nr_pages_scanned, lru_pages); | ||
| 365 | 383 | ||
| 366 | } | 384 | } |
| 367 | } | 385 | } |
| @@ -603,7 +621,7 @@ void putback_lru_page(struct page *page) | |||
| 603 | bool is_unevictable; | 621 | bool is_unevictable; |
| 604 | int was_unevictable = PageUnevictable(page); | 622 | int was_unevictable = PageUnevictable(page); |
| 605 | 623 | ||
| 606 | VM_BUG_ON(PageLRU(page)); | 624 | VM_BUG_ON_PAGE(PageLRU(page), page); |
| 607 | 625 | ||
| 608 | redo: | 626 | redo: |
| 609 | ClearPageUnevictable(page); | 627 | ClearPageUnevictable(page); |
| @@ -794,8 +812,8 @@ static unsigned long shrink_page_list(struct list_head *page_list, | |||
| 794 | if (!trylock_page(page)) | 812 | if (!trylock_page(page)) |
| 795 | goto keep; | 813 | goto keep; |
| 796 | 814 | ||
| 797 | VM_BUG_ON(PageActive(page)); | 815 | VM_BUG_ON_PAGE(PageActive(page), page); |
| 798 | VM_BUG_ON(page_zone(page) != zone); | 816 | VM_BUG_ON_PAGE(page_zone(page) != zone, page); |
| 799 | 817 | ||
| 800 | sc->nr_scanned++; | 818 | sc->nr_scanned++; |
| 801 | 819 | ||
| @@ -1079,14 +1097,14 @@ activate_locked: | |||
| 1079 | /* Not a candidate for swapping, so reclaim swap space. */ | 1097 | /* Not a candidate for swapping, so reclaim swap space. */ |
| 1080 | if (PageSwapCache(page) && vm_swap_full()) | 1098 | if (PageSwapCache(page) && vm_swap_full()) |
| 1081 | try_to_free_swap(page); | 1099 | try_to_free_swap(page); |
| 1082 | VM_BUG_ON(PageActive(page)); | 1100 | VM_BUG_ON_PAGE(PageActive(page), page); |
| 1083 | SetPageActive(page); | 1101 | SetPageActive(page); |
| 1084 | pgactivate++; | 1102 | pgactivate++; |
| 1085 | keep_locked: | 1103 | keep_locked: |
| 1086 | unlock_page(page); | 1104 | unlock_page(page); |
| 1087 | keep: | 1105 | keep: |
| 1088 | list_add(&page->lru, &ret_pages); | 1106 | list_add(&page->lru, &ret_pages); |
| 1089 | VM_BUG_ON(PageLRU(page) || PageUnevictable(page)); | 1107 | VM_BUG_ON_PAGE(PageLRU(page) || PageUnevictable(page), page); |
| 1090 | } | 1108 | } |
| 1091 | 1109 | ||
| 1092 | free_hot_cold_page_list(&free_pages, 1); | 1110 | free_hot_cold_page_list(&free_pages, 1); |
| @@ -1240,7 +1258,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, | |||
| 1240 | page = lru_to_page(src); | 1258 | page = lru_to_page(src); |
| 1241 | prefetchw_prev_lru_page(page, src, flags); | 1259 | prefetchw_prev_lru_page(page, src, flags); |
| 1242 | 1260 | ||
| 1243 | VM_BUG_ON(!PageLRU(page)); | 1261 | VM_BUG_ON_PAGE(!PageLRU(page), page); |
| 1244 | 1262 | ||
| 1245 | switch (__isolate_lru_page(page, mode)) { | 1263 | switch (__isolate_lru_page(page, mode)) { |
| 1246 | case 0: | 1264 | case 0: |
| @@ -1295,7 +1313,7 @@ int isolate_lru_page(struct page *page) | |||
| 1295 | { | 1313 | { |
| 1296 | int ret = -EBUSY; | 1314 | int ret = -EBUSY; |
| 1297 | 1315 | ||
| 1298 | VM_BUG_ON(!page_count(page)); | 1316 | VM_BUG_ON_PAGE(!page_count(page), page); |
| 1299 | 1317 | ||
| 1300 | if (PageLRU(page)) { | 1318 | if (PageLRU(page)) { |
| 1301 | struct zone *zone = page_zone(page); | 1319 | struct zone *zone = page_zone(page); |
| @@ -1366,7 +1384,7 @@ putback_inactive_pages(struct lruvec *lruvec, struct list_head *page_list) | |||
| 1366 | struct page *page = lru_to_page(page_list); | 1384 | struct page *page = lru_to_page(page_list); |
| 1367 | int lru; | 1385 | int lru; |
| 1368 | 1386 | ||
| 1369 | VM_BUG_ON(PageLRU(page)); | 1387 | VM_BUG_ON_PAGE(PageLRU(page), page); |
| 1370 | list_del(&page->lru); | 1388 | list_del(&page->lru); |
| 1371 | if (unlikely(!page_evictable(page))) { | 1389 | if (unlikely(!page_evictable(page))) { |
| 1372 | spin_unlock_irq(&zone->lru_lock); | 1390 | spin_unlock_irq(&zone->lru_lock); |
| @@ -1586,7 +1604,7 @@ static void move_active_pages_to_lru(struct lruvec *lruvec, | |||
| 1586 | page = lru_to_page(list); | 1604 | page = lru_to_page(list); |
| 1587 | lruvec = mem_cgroup_page_lruvec(page, zone); | 1605 | lruvec = mem_cgroup_page_lruvec(page, zone); |
| 1588 | 1606 | ||
| 1589 | VM_BUG_ON(PageLRU(page)); | 1607 | VM_BUG_ON_PAGE(PageLRU(page), page); |
| 1590 | SetPageLRU(page); | 1608 | SetPageLRU(page); |
| 1591 | 1609 | ||
| 1592 | nr_pages = hpage_nr_pages(page); | 1610 | nr_pages = hpage_nr_pages(page); |
| @@ -3297,27 +3315,6 @@ void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx) | |||
| 3297 | wake_up_interruptible(&pgdat->kswapd_wait); | 3315 | wake_up_interruptible(&pgdat->kswapd_wait); |
| 3298 | } | 3316 | } |
| 3299 | 3317 | ||
| 3300 | /* | ||
| 3301 | * The reclaimable count would be mostly accurate. | ||
| 3302 | * The less reclaimable pages may be | ||
| 3303 | * - mlocked pages, which will be moved to unevictable list when encountered | ||
| 3304 | * - mapped pages, which may require several travels to be reclaimed | ||
| 3305 | * - dirty pages, which is not "instantly" reclaimable | ||
| 3306 | */ | ||
| 3307 | unsigned long global_reclaimable_pages(void) | ||
| 3308 | { | ||
| 3309 | int nr; | ||
| 3310 | |||
| 3311 | nr = global_page_state(NR_ACTIVE_FILE) + | ||
| 3312 | global_page_state(NR_INACTIVE_FILE); | ||
| 3313 | |||
| 3314 | if (get_nr_swap_pages() > 0) | ||
| 3315 | nr += global_page_state(NR_ACTIVE_ANON) + | ||
| 3316 | global_page_state(NR_INACTIVE_ANON); | ||
| 3317 | |||
| 3318 | return nr; | ||
| 3319 | } | ||
| 3320 | |||
| 3321 | #ifdef CONFIG_HIBERNATION | 3318 | #ifdef CONFIG_HIBERNATION |
| 3322 | /* | 3319 | /* |
| 3323 | * Try to free `nr_to_reclaim' of memory, system-wide, and return the number of | 3320 | * Try to free `nr_to_reclaim' of memory, system-wide, and return the number of |
| @@ -3701,7 +3698,7 @@ void check_move_unevictable_pages(struct page **pages, int nr_pages) | |||
| 3701 | if (page_evictable(page)) { | 3698 | if (page_evictable(page)) { |
| 3702 | enum lru_list lru = page_lru_base_type(page); | 3699 | enum lru_list lru = page_lru_base_type(page); |
| 3703 | 3700 | ||
| 3704 | VM_BUG_ON(PageActive(page)); | 3701 | VM_BUG_ON_PAGE(PageActive(page), page); |
| 3705 | ClearPageUnevictable(page); | 3702 | ClearPageUnevictable(page); |
| 3706 | del_page_from_lru_list(page, lruvec, LRU_UNEVICTABLE); | 3703 | del_page_from_lru_list(page, lruvec, LRU_UNEVICTABLE); |
| 3707 | add_page_to_lru_list(page, lruvec, lru); | 3704 | add_page_to_lru_list(page, lruvec, lru); |
diff --git a/mm/vmstat.c b/mm/vmstat.c index 72496140ac08..def5dd2fbe61 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c | |||
| @@ -851,12 +851,14 @@ const char * const vmstat_text[] = { | |||
| 851 | "thp_zero_page_alloc", | 851 | "thp_zero_page_alloc", |
| 852 | "thp_zero_page_alloc_failed", | 852 | "thp_zero_page_alloc_failed", |
| 853 | #endif | 853 | #endif |
| 854 | #ifdef CONFIG_DEBUG_TLBFLUSH | ||
| 854 | #ifdef CONFIG_SMP | 855 | #ifdef CONFIG_SMP |
| 855 | "nr_tlb_remote_flush", | 856 | "nr_tlb_remote_flush", |
| 856 | "nr_tlb_remote_flush_received", | 857 | "nr_tlb_remote_flush_received", |
| 857 | #endif | 858 | #endif /* CONFIG_SMP */ |
| 858 | "nr_tlb_local_flush_all", | 859 | "nr_tlb_local_flush_all", |
| 859 | "nr_tlb_local_flush_one", | 860 | "nr_tlb_local_flush_one", |
| 861 | #endif /* CONFIG_DEBUG_TLBFLUSH */ | ||
| 860 | 862 | ||
| 861 | #endif /* CONFIG_VM_EVENTS_COUNTERS */ | 863 | #endif /* CONFIG_VM_EVENTS_COUNTERS */ |
| 862 | }; | 864 | }; |
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c new file mode 100644 index 000000000000..c03ca5e9fe15 --- /dev/null +++ b/mm/zsmalloc.c | |||
| @@ -0,0 +1,1106 @@ | |||
| 1 | /* | ||
| 2 | * zsmalloc memory allocator | ||
| 3 | * | ||
| 4 | * Copyright (C) 2011 Nitin Gupta | ||
| 5 | * Copyright (C) 2012, 2013 Minchan Kim | ||
| 6 | * | ||
| 7 | * This code is released using a dual license strategy: BSD/GPL | ||
| 8 | * You can choose the license that better fits your requirements. | ||
| 9 | * | ||
| 10 | * Released under the terms of 3-clause BSD License | ||
| 11 | * Released under the terms of GNU General Public License Version 2.0 | ||
| 12 | */ | ||
| 13 | |||
| 14 | /* | ||
| 15 | * This allocator is designed for use with zram. Thus, the allocator is | ||
| 16 | * supposed to work well under low memory conditions. In particular, it | ||
| 17 | * never attempts higher order page allocation which is very likely to | ||
| 18 | * fail under memory pressure. On the other hand, if we just use single | ||
| 19 | * (0-order) pages, it would suffer from very high fragmentation -- | ||
| 20 | * any object of size PAGE_SIZE/2 or larger would occupy an entire page. | ||
| 21 | * This was one of the major issues with its predecessor (xvmalloc). | ||
| 22 | * | ||
| 23 | * To overcome these issues, zsmalloc allocates a bunch of 0-order pages | ||
| 24 | * and links them together using various 'struct page' fields. These linked | ||
| 25 | * pages act as a single higher-order page i.e. an object can span 0-order | ||
| 26 | * page boundaries. The code refers to these linked pages as a single entity | ||
| 27 | * called zspage. | ||
| 28 | * | ||
| 29 | * For simplicity, zsmalloc can only allocate objects of size up to PAGE_SIZE | ||
| 30 | * since this satisfies the requirements of all its current users (in the | ||
| 31 | * worst case, page is incompressible and is thus stored "as-is" i.e. in | ||
| 32 | * uncompressed form). For allocation requests larger than this size, failure | ||
| 33 | * is returned (see zs_malloc). | ||
| 34 | * | ||
| 35 | * Additionally, zs_malloc() does not return a dereferenceable pointer. | ||
| 36 | * Instead, it returns an opaque handle (unsigned long) which encodes actual | ||
| 37 | * location of the allocated object. The reason for this indirection is that | ||
| 38 | * zsmalloc does not keep zspages permanently mapped since that would cause | ||
| 39 | * issues on 32-bit systems where the VA region for kernel space mappings | ||
| 40 | * is very small. So, before using the allocated memory, the object has to | ||
| 41 | * be mapped using zs_map_object() to get a usable pointer and subsequently | ||
| 42 | * unmapped using zs_unmap_object(). | ||
| 43 | * | ||
| 44 | * Following is how we use various fields and flags of underlying | ||
| 45 | * struct page(s) to form a zspage. | ||
| 46 | * | ||
| 47 | * Usage of struct page fields: | ||
| 48 | * page->first_page: points to the first component (0-order) page | ||
| 49 | * page->index (union with page->freelist): offset of the first object | ||
| 50 | * starting in this page. For the first page, this is | ||
| 51 | * always 0, so we use this field (aka freelist) to point | ||
| 52 | * to the first free object in zspage. | ||
| 53 | * page->lru: links together all component pages (except the first page) | ||
| 54 | * of a zspage | ||
| 55 | * | ||
| 56 | * For _first_ page only: | ||
| 57 | * | ||
| 58 | * page->private (union with page->first_page): refers to the | ||
| 59 | * component page after the first page | ||
| 60 | * page->freelist: points to the first free object in zspage. | ||
| 61 | * Free objects are linked together using in-place | ||
| 62 | * metadata. | ||
| 63 | * page->objects: maximum number of objects we can store in this | ||
| 64 | * zspage (class->zspage_order * PAGE_SIZE / class->size) | ||
| 65 | * page->lru: links together first pages of various zspages. | ||
| 66 | * Basically forming list of zspages in a fullness group. | ||
| 67 | * page->mapping: class index and fullness group of the zspage | ||
| 68 | * | ||
| 69 | * Usage of struct page flags: | ||
| 70 | * PG_private: identifies the first component page | ||
| 71 | * PG_private2: identifies the last component page | ||
| 72 | * | ||
| 73 | */ | ||
| 74 | |||
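A typical use of the handle-based interface described above looks roughly like the sketch below. The prototypes (zs_create_pool(), zs_malloc(), zs_map_object(), zs_unmap_object(), zs_free(), zs_destroy_pool()) are assumed from how zram drives this allocator and should be checked against include/linux/zsmalloc.h; this is an illustrative kernel-context sketch, not code from this patch.

#include <linux/errno.h>
#include <linux/gfp.h>
#include <linux/string.h>
#include <linux/zsmalloc.h>

/* Sketch: store one buffer in a zs_pool and tear everything down again.
 * Error handling is minimal and the prototypes are assumed, not quoted. */
static int zs_roundtrip_example(const void *src, size_t len)
{
        struct zs_pool *pool;
        unsigned long handle;
        void *dst;

        pool = zs_create_pool(GFP_KERNEL);      /* flags used when growing the pool */
        if (!pool)
                return -ENOMEM;

        handle = zs_malloc(pool, len);          /* opaque handle, not a pointer */
        if (!handle) {
                zs_destroy_pool(pool);
                return -ENOMEM;
        }

        dst = zs_map_object(pool, handle, ZS_MM_WO);    /* must map before access */
        memcpy(dst, src, len);
        zs_unmap_object(pool, handle);

        zs_free(pool, handle);
        zs_destroy_pool(pool);
        return 0;
}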
| 75 | #ifdef CONFIG_ZSMALLOC_DEBUG | ||
| 76 | #define DEBUG | ||
| 77 | #endif | ||
| 78 | |||
| 79 | #include <linux/module.h> | ||
| 80 | #include <linux/kernel.h> | ||
| 81 | #include <linux/bitops.h> | ||
| 82 | #include <linux/errno.h> | ||
| 83 | #include <linux/highmem.h> | ||
| 84 | #include <linux/string.h> | ||
| 85 | #include <linux/slab.h> | ||
| 86 | #include <asm/tlbflush.h> | ||
| 87 | #include <asm/pgtable.h> | ||
| 88 | #include <linux/cpumask.h> | ||
| 89 | #include <linux/cpu.h> | ||
| 90 | #include <linux/vmalloc.h> | ||
| 91 | #include <linux/hardirq.h> | ||
| 92 | #include <linux/spinlock.h> | ||
| 93 | #include <linux/types.h> | ||
| 94 | #include <linux/zsmalloc.h> | ||
| 95 | |||
| 96 | /* | ||
| 97 | * This must be a power of 2 and greater than or equal to sizeof(link_free). | ||
| 98 | * These two conditions ensure that any 'struct link_free' itself doesn't | ||
| 99 | * span more than 1 page which avoids complex case of mapping 2 pages simply | ||
| 100 | * to restore link_free pointer values. | ||
| 101 | */ | ||
| 102 | #define ZS_ALIGN 8 | ||
| 103 | |||
| 104 | /* | ||
| 105 | * A single 'zspage' is composed of up to 2^N discontiguous 0-order (single) | ||
| 106 | * pages. ZS_MAX_ZSPAGE_ORDER defines upper limit on N. | ||
| 107 | */ | ||
| 108 | #define ZS_MAX_ZSPAGE_ORDER 2 | ||
| 109 | #define ZS_MAX_PAGES_PER_ZSPAGE (_AC(1, UL) << ZS_MAX_ZSPAGE_ORDER) | ||
| 110 | |||
| 111 | /* | ||
| 112 | * Object location (<PFN>, <obj_idx>) is encoded as | ||
| 113 | * a single (unsigned long) handle value. | ||
| 114 | * | ||
| 115 | * Note that object index <obj_idx> is relative to system | ||
| 116 | * page <PFN> it is stored in, so for each sub-page belonging | ||
| 117 | * to a zspage, obj_idx starts with 0. | ||
| 118 | * | ||
| 119 | * This is made more complicated by various memory models and PAE. | ||
| 120 | */ | ||
| 121 | |||
| 122 | #ifndef MAX_PHYSMEM_BITS | ||
| 123 | #ifdef CONFIG_HIGHMEM64G | ||
| 124 | #define MAX_PHYSMEM_BITS 36 | ||
| 125 | #else /* !CONFIG_HIGHMEM64G */ | ||
| 126 | /* | ||
| 127 | * If this definition of MAX_PHYSMEM_BITS is used, OBJ_INDEX_BITS will just | ||
| 128 | * be PAGE_SHIFT | ||
| 129 | */ | ||
| 130 | #define MAX_PHYSMEM_BITS BITS_PER_LONG | ||
| 131 | #endif | ||
| 132 | #endif | ||
| 133 | #define _PFN_BITS (MAX_PHYSMEM_BITS - PAGE_SHIFT) | ||
| 134 | #define OBJ_INDEX_BITS (BITS_PER_LONG - _PFN_BITS) | ||
| 135 | #define OBJ_INDEX_MASK ((_AC(1, UL) << OBJ_INDEX_BITS) - 1) | ||
| 136 | |||
| 137 | #define MAX(a, b) ((a) >= (b) ? (a) : (b)) | ||
| 138 | /* ZS_MIN_ALLOC_SIZE must be multiple of ZS_ALIGN */ | ||
| 139 | #define ZS_MIN_ALLOC_SIZE \ | ||
| 140 | MAX(32, (ZS_MAX_PAGES_PER_ZSPAGE << PAGE_SHIFT >> OBJ_INDEX_BITS)) | ||
| 141 | #define ZS_MAX_ALLOC_SIZE PAGE_SIZE | ||
| 142 | |||
| 143 | /* | ||
| 144 | * On systems with 4K page size, this gives 254 size classes! There is a | ||
| 145 | * trade-off here: | ||
| 146 | * - Large number of size classes is potentially wasteful as free pages are | ||
| 147 | * spread across these classes | ||
| 148 | * - Small number of size classes causes large internal fragmentation | ||
| 149 | * - Probably it's better to use specific size classes (empirically | ||
| 150 | * determined). NOTE: all those class sizes must be set as multiple of | ||
| 151 | * ZS_ALIGN to make sure link_free itself never has to span 2 pages. | ||
| 152 | * | ||
| 153 | * ZS_MIN_ALLOC_SIZE and ZS_SIZE_CLASS_DELTA must be multiple of ZS_ALIGN | ||
| 154 | * (reason above) | ||
| 155 | */ | ||
| 156 | #define ZS_SIZE_CLASS_DELTA (PAGE_SIZE >> 8) | ||
| 157 | #define ZS_SIZE_CLASSES ((ZS_MAX_ALLOC_SIZE - ZS_MIN_ALLOC_SIZE) / \ | ||
| 158 | ZS_SIZE_CLASS_DELTA + 1) | ||
| 159 | |||
| 160 | /* | ||
| 161 | * We do not maintain any list for completely empty or full pages | ||
| 162 | */ | ||
| 163 | enum fullness_group { | ||
| 164 | ZS_ALMOST_FULL, | ||
| 165 | ZS_ALMOST_EMPTY, | ||
| 166 | _ZS_NR_FULLNESS_GROUPS, | ||
| 167 | |||
| 168 | ZS_EMPTY, | ||
| 169 | ZS_FULL | ||
| 170 | }; | ||
| 171 | |||
| 172 | /* | ||
| 173 | * We assign a page to ZS_ALMOST_EMPTY fullness group when: | ||
| 174 | * n <= N / f, where | ||
| 175 | * n = number of allocated objects | ||
| 176 | * N = total number of objects zspage can store | ||
| 177 | * f = 1/fullness_threshold_frac | ||
| 178 | * | ||
| 179 | * Similarly, we assign zspage to: | ||
| 180 | * ZS_ALMOST_FULL when n > N / f | ||
| 181 | * ZS_EMPTY when n == 0 | ||
| 182 | * ZS_FULL when n == N | ||
| 183 | * | ||
| 184 | * (see: fix_fullness_group()) | ||
| 185 | */ | ||
| 186 | static const int fullness_threshold_frac = 4; | ||
| 187 | |||
| 188 | struct size_class { | ||
| 189 | /* | ||
| 190 | * Size of objects stored in this class. Must be multiple | ||
| 191 | * of ZS_ALIGN. | ||
| 192 | */ | ||
| 193 | int size; | ||
| 194 | unsigned int index; | ||
| 195 | |||
| 196 | /* Number of PAGE_SIZE sized pages to combine to form a 'zspage' */ | ||
| 197 | int pages_per_zspage; | ||
| 198 | |||
| 199 | spinlock_t lock; | ||
| 200 | |||
| 201 | /* stats */ | ||
| 202 | u64 pages_allocated; | ||
| 203 | |||
| 204 | struct page *fullness_list[_ZS_NR_FULLNESS_GROUPS]; | ||
| 205 | }; | ||
| 206 | |||
| 207 | /* | ||
| 208 | * Placed within free objects to form a singly linked list. | ||
| 209 | * For every zspage, first_page->freelist gives head of this list. | ||
| 210 | * | ||
| 211 | * This must be a power of 2 and less than or equal to ZS_ALIGN | ||
| 212 | */ | ||
| 213 | struct link_free { | ||
| 214 | /* Handle of next free chunk (encodes <PFN, obj_idx>) */ | ||
| 215 | void *next; | ||
| 216 | }; | ||
| 217 | |||
| 218 | struct zs_pool { | ||
| 219 | struct size_class size_class[ZS_SIZE_CLASSES]; | ||
| 220 | |||
| 221 | gfp_t flags; /* allocation flags used when growing pool */ | ||
| 222 | }; | ||
| 223 | |||
| 224 | /* | ||
| 225 | * A zspage's class index and fullness group | ||
| 226 | * are encoded in its (first)page->mapping | ||
| 227 | */ | ||
| 228 | #define CLASS_IDX_BITS 28 | ||
| 229 | #define FULLNESS_BITS 4 | ||
| 230 | #define CLASS_IDX_MASK ((1 << CLASS_IDX_BITS) - 1) | ||
| 231 | #define FULLNESS_MASK ((1 << FULLNESS_BITS) - 1) | ||
| 232 | |||
| 233 | struct mapping_area { | ||
| 234 | #ifdef CONFIG_PGTABLE_MAPPING | ||
| 235 | struct vm_struct *vm; /* vm area for mapping object that span pages */ | ||
| 236 | #else | ||
| 237 | char *vm_buf; /* copy buffer for objects that span pages */ | ||
| 238 | #endif | ||
| 239 | char *vm_addr; /* address of kmap_atomic()'ed pages */ | ||
| 240 | enum zs_mapmode vm_mm; /* mapping mode */ | ||
| 241 | }; | ||
| 242 | |||
| 243 | |||
| 244 | /* per-cpu VM mapping areas for zspage accesses that cross page boundaries */ | ||
| 245 | static DEFINE_PER_CPU(struct mapping_area, zs_map_area); | ||
| 246 | |||
| 247 | static int is_first_page(struct page *page) | ||
| 248 | { | ||
| 249 | return PagePrivate(page); | ||
| 250 | } | ||
| 251 | |||
| 252 | static int is_last_page(struct page *page) | ||
| 253 | { | ||
| 254 | return PagePrivate2(page); | ||
| 255 | } | ||
| 256 | |||
| 257 | static void get_zspage_mapping(struct page *page, unsigned int *class_idx, | ||
| 258 | enum fullness_group *fullness) | ||
| 259 | { | ||
| 260 | unsigned long m; | ||
| 261 | BUG_ON(!is_first_page(page)); | ||
| 262 | |||
| 263 | m = (unsigned long)page->mapping; | ||
| 264 | *fullness = m & FULLNESS_MASK; | ||
| 265 | *class_idx = (m >> FULLNESS_BITS) & CLASS_IDX_MASK; | ||
| 266 | } | ||
| 267 | |||
| 268 | static void set_zspage_mapping(struct page *page, unsigned int class_idx, | ||
| 269 | enum fullness_group fullness) | ||
| 270 | { | ||
| 271 | unsigned long m; | ||
| 272 | BUG_ON(!is_first_page(page)); | ||
| 273 | |||
| 274 | m = ((class_idx & CLASS_IDX_MASK) << FULLNESS_BITS) | | ||
| 275 | (fullness & FULLNESS_MASK); | ||
| 276 | page->mapping = (struct address_space *)m; | ||
| 277 | } | ||
| 278 | |||
| 279 | /* | ||
| 280 | * zsmalloc divides the pool into various size classes where each | ||
| 281 | * class maintains a list of zspages where each zspage is divided | ||
| 282 | * into equal sized chunks. Each allocation falls into one of these | ||
| 283 | * classes depending on its size. This function returns index of the | ||
| 284 | * size class which has chunk size big enough to hold the given size. | ||
| 285 | */ | ||
| 286 | static int get_size_class_index(int size) | ||
| 287 | { | ||
| 288 | int idx = 0; | ||
| 289 | |||
| 290 | if (likely(size > ZS_MIN_ALLOC_SIZE)) | ||
| 291 | idx = DIV_ROUND_UP(size - ZS_MIN_ALLOC_SIZE, | ||
| 292 | ZS_SIZE_CLASS_DELTA); | ||
| 293 | |||
| 294 | return idx; | ||
| 295 | } | ||
| 296 | |||
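For example, with 4 KiB pages (so ZS_SIZE_CLASS_DELTA = 16) and assuming ZS_MIN_ALLOC_SIZE works out to 32, a request for 100 bytes gives idx = DIV_ROUND_UP(100 - 32, 16) = 5, i.e. the class whose chunk size is 32 + 5 * 16 = 112 bytes.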
| 297 | /* | ||
| 298 | * For each size class, zspages are divided into different groups | ||
| 299 | * depending on how "full" they are. This was done so that we could | ||
| 300 | * easily find empty or nearly empty zspages when we try to shrink | ||
| 301 | * the pool (not yet implemented). This function returns fullness | ||
| 302 | * status of the given page. | ||
| 303 | */ | ||
| 304 | static enum fullness_group get_fullness_group(struct page *page) | ||
| 305 | { | ||
| 306 | int inuse, max_objects; | ||
| 307 | enum fullness_group fg; | ||
| 308 | BUG_ON(!is_first_page(page)); | ||
| 309 | |||
| 310 | inuse = page->inuse; | ||
| 311 | max_objects = page->objects; | ||
| 312 | |||
| 313 | if (inuse == 0) | ||
| 314 | fg = ZS_EMPTY; | ||
| 315 | else if (inuse == max_objects) | ||
| 316 | fg = ZS_FULL; | ||
| 317 | else if (inuse <= max_objects / fullness_threshold_frac) | ||
| 318 | fg = ZS_ALMOST_EMPTY; | ||
| 319 | else | ||
| 320 | fg = ZS_ALMOST_FULL; | ||
| 321 | |||
| 322 | return fg; | ||
| 323 | } | ||
| 324 | |||
| 325 | /* | ||
| 326 | * Each size class maintains various freelists and zspages are assigned | ||
| 327 | * to one of these freelists based on the number of live objects they | ||
| 328 | * have. This function inserts the given zspage into the freelist | ||
| 329 | * identified by <class, fullness_group>. | ||
| 330 | */ | ||
| 331 | static void insert_zspage(struct page *page, struct size_class *class, | ||
| 332 | enum fullness_group fullness) | ||
| 333 | { | ||
| 334 | struct page **head; | ||
| 335 | |||
| 336 | BUG_ON(!is_first_page(page)); | ||
| 337 | |||
| 338 | if (fullness >= _ZS_NR_FULLNESS_GROUPS) | ||
| 339 | return; | ||
| 340 | |||
| 341 | head = &class->fullness_list[fullness]; | ||
| 342 | if (*head) | ||
| 343 | list_add_tail(&page->lru, &(*head)->lru); | ||
| 344 | |||
| 345 | *head = page; | ||
| 346 | } | ||
| 347 | |||
| 348 | /* | ||
| 349 | * This function removes the given zspage from the freelist identified | ||
| 350 | * by <class, fullness_group>. | ||
| 351 | */ | ||
| 352 | static void remove_zspage(struct page *page, struct size_class *class, | ||
| 353 | enum fullness_group fullness) | ||
| 354 | { | ||
| 355 | struct page **head; | ||
| 356 | |||
| 357 | BUG_ON(!is_first_page(page)); | ||
| 358 | |||
| 359 | if (fullness >= _ZS_NR_FULLNESS_GROUPS) | ||
| 360 | return; | ||
| 361 | |||
| 362 | head = &class->fullness_list[fullness]; | ||
| 363 | BUG_ON(!*head); | ||
| 364 | if (list_empty(&(*head)->lru)) | ||
| 365 | *head = NULL; | ||
| 366 | else if (*head == page) | ||
| 367 | *head = (struct page *)list_entry((*head)->lru.next, | ||
| 368 | struct page, lru); | ||
| 369 | |||
| 370 | list_del_init(&page->lru); | ||
| 371 | } | ||
| 372 | |||
| 373 | /* | ||
| 374 | * Each size class maintains zspages in different fullness groups depending | ||
| 375 | * on the number of live objects they contain. When allocating or freeing | ||
| 376 | * objects, the fullness status of the page can change, say, from ALMOST_FULL | ||
| 377 | * to ALMOST_EMPTY when freeing an object. This function checks if such | ||
| 378 | * a status change has occurred for the given page and accordingly moves the | ||
| 379 | * page from the freelist of the old fullness group to that of the new | ||
| 380 | * fullness group. | ||
| 381 | */ | ||
| 382 | static enum fullness_group fix_fullness_group(struct zs_pool *pool, | ||
| 383 | struct page *page) | ||
| 384 | { | ||
| 385 | int class_idx; | ||
| 386 | struct size_class *class; | ||
| 387 | enum fullness_group currfg, newfg; | ||
| 388 | |||
| 389 | BUG_ON(!is_first_page(page)); | ||
| 390 | |||
| 391 | get_zspage_mapping(page, &class_idx, &currfg); | ||
| 392 | newfg = get_fullness_group(page); | ||
| 393 | if (newfg == currfg) | ||
| 394 | goto out; | ||
| 395 | |||
| 396 | class = &pool->size_class[class_idx]; | ||
| 397 | remove_zspage(page, class, currfg); | ||
| 398 | insert_zspage(page, class, newfg); | ||
| 399 | set_zspage_mapping(page, class_idx, newfg); | ||
| 400 | |||
| 401 | out: | ||
| 402 | return newfg; | ||
| 403 | } | ||
| 404 | |||
| 405 | /* | ||
| 406 | * We have to decide on how many pages to link together | ||
| 407 | * to form a zspage for each size class. This is important | ||
| 408 | * to reduce wastage due to unusable space left at end of | ||
| 409 | * each zspage which is given as: | ||
| 410 | * wastage = Zp % class_size | ||
| 411 | * where Zp = zspage size = k * PAGE_SIZE where k = 1, 2, ... | ||
| 412 | * | ||
| 413 | * For example, for size class of 3/8 * PAGE_SIZE, we should | ||
| 414 | * link together 3 PAGE_SIZE sized pages to form a zspage | ||
| 415 | * since then we can perfectly fit in 8 such objects. | ||
| 416 | */ | ||
| 417 | static int get_pages_per_zspage(int class_size) | ||
| 418 | { | ||
| 419 | int i, max_usedpc = 0; | ||
| 420 | /* zspage order which gives maximum used size per KB */ | ||
| 421 | int max_usedpc_order = 1; | ||
| 422 | |||
| 423 | for (i = 1; i <= ZS_MAX_PAGES_PER_ZSPAGE; i++) { | ||
| 424 | int zspage_size; | ||
| 425 | int waste, usedpc; | ||
| 426 | |||
| 427 | zspage_size = i * PAGE_SIZE; | ||
| 428 | waste = zspage_size % class_size; | ||
| 429 | usedpc = (zspage_size - waste) * 100 / zspage_size; | ||
| 430 | |||
| 431 | if (usedpc > max_usedpc) { | ||
| 432 | max_usedpc = usedpc; | ||
| 433 | max_usedpc_order = i; | ||
| 434 | } | ||
| 435 | } | ||
| 436 | |||
| 437 | return max_usedpc_order; | ||
| 438 | } | ||
| 439 | |||
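As a worked example of the loop above, take class_size = 1536 (3/8 of a 4 KiB page): i = 1 wastes 1024 bytes (75% used), i = 2 wastes 512 (93%), i = 3 wastes 0 (100%), and i = 4 wastes 1024 again (93%), so get_pages_per_zspage() returns 3, matching the comment before the function.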
| 440 | /* | ||
| 441 | * A single 'zspage' is composed of many system pages which are | ||
| 442 | * linked together using fields in struct page. This function finds | ||
| 443 | * the first/head page, given any component page of a zspage. | ||
| 444 | */ | ||
| 445 | static struct page *get_first_page(struct page *page) | ||
| 446 | { | ||
| 447 | if (is_first_page(page)) | ||
| 448 | return page; | ||
| 449 | else | ||
| 450 | return page->first_page; | ||
| 451 | } | ||
| 452 | |||
| 453 | static struct page *get_next_page(struct page *page) | ||
| 454 | { | ||
| 455 | struct page *next; | ||
| 456 | |||
| 457 | if (is_last_page(page)) | ||
| 458 | next = NULL; | ||
| 459 | else if (is_first_page(page)) | ||
| 460 | next = (struct page *)page_private(page); | ||
| 461 | else | ||
| 462 | next = list_entry(page->lru.next, struct page, lru); | ||
| 463 | |||
| 464 | return next; | ||
| 465 | } | ||
| 466 | |||
| 467 | /* | ||
| 468 | * Encode <page, obj_idx> as a single handle value. | ||
| 469 | * On hardware platforms with physical memory starting at 0x0 the pfn | ||
| 470 | * could be 0 so we ensure that the handle will never be 0 by adjusting the | ||
| 471 | * encoded obj_idx value before encoding. | ||
| 472 | */ | ||
| 473 | static void *obj_location_to_handle(struct page *page, unsigned long obj_idx) | ||
| 474 | { | ||
| 475 | unsigned long handle; | ||
| 476 | |||
| 477 | if (!page) { | ||
| 478 | BUG_ON(obj_idx); | ||
| 479 | return NULL; | ||
| 480 | } | ||
| 481 | |||
| 482 | handle = page_to_pfn(page) << OBJ_INDEX_BITS; | ||
| 483 | handle |= ((obj_idx + 1) & OBJ_INDEX_MASK); | ||
| 484 | |||
| 485 | return (void *)handle; | ||
| 486 | } | ||
| 487 | |||
| 488 | /* | ||
| 489 | * Decode <page, obj_idx> pair from the given object handle. We adjust the | ||
| 490 | * decoded obj_idx back to its original value since it was adjusted in | ||
| 491 | * obj_location_to_handle(). | ||
| 492 | */ | ||
| 493 | static void obj_handle_to_location(unsigned long handle, struct page **page, | ||
| 494 | unsigned long *obj_idx) | ||
| 495 | { | ||
| 496 | *page = pfn_to_page(handle >> OBJ_INDEX_BITS); | ||
| 497 | *obj_idx = (handle & OBJ_INDEX_MASK) - 1; | ||
| 498 | } | ||
| 499 | |||
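For illustration, the encode/decode pair above round-trips as in the user-space sketch below; OBJ_INDEX_BITS is fixed at 12 purely for the example (the real value depends on MAX_PHYSMEM_BITS and PAGE_SHIFT).

#include <assert.h>
#include <stdio.h>

#define OBJ_INDEX_BITS 12       /* illustrative value only */
#define OBJ_INDEX_MASK ((1UL << OBJ_INDEX_BITS) - 1)

static unsigned long encode(unsigned long pfn, unsigned long obj_idx)
{
        return (pfn << OBJ_INDEX_BITS) | ((obj_idx + 1) & OBJ_INDEX_MASK);
}

static void decode(unsigned long handle, unsigned long *pfn, unsigned long *obj_idx)
{
        *pfn = handle >> OBJ_INDEX_BITS;
        *obj_idx = (handle & OBJ_INDEX_MASK) - 1;
}

int main(void)
{
        unsigned long pfn, idx, handle = encode(0, 0);

        decode(handle, &pfn, &idx);
        assert(handle != 0);            /* the "+ 1" keeps the handle non-zero */
        assert(pfn == 0 && idx == 0);   /* ...while the round trip stays lossless */
        printf("handle=%#lx pfn=%lu idx=%lu\n", handle, pfn, idx);
        return 0;
}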
| 500 | static unsigned long obj_idx_to_offset(struct page *page, | ||
| 501 | unsigned long obj_idx, int class_size) | ||
| 502 | { | ||
| 503 | unsigned long off = 0; | ||
| 504 | |||
| 505 | if (!is_first_page(page)) | ||
| 506 | off = page->index; | ||
| 507 | |||
| 508 | return off + obj_idx * class_size; | ||
| 509 | } | ||
| 510 | |||
| 511 | static void reset_page(struct page *page) | ||
| 512 | { | ||
| 513 | clear_bit(PG_private, &page->flags); | ||
| 514 | clear_bit(PG_private_2, &page->flags); | ||
| 515 | set_page_private(page, 0); | ||
| 516 | page->mapping = NULL; | ||
| 517 | page->freelist = NULL; | ||
| 518 | page_mapcount_reset(page); | ||
| 519 | } | ||
| 520 | |||
| 521 | static void free_zspage(struct page *first_page) | ||
| 522 | { | ||
| 523 | struct page *nextp, *tmp, *head_extra; | ||
| 524 | |||
| 525 | BUG_ON(!is_first_page(first_page)); | ||
| 526 | BUG_ON(first_page->inuse); | ||
| 527 | |||
| 528 | head_extra = (struct page *)page_private(first_page); | ||
| 529 | |||
| 530 | reset_page(first_page); | ||
| 531 | __free_page(first_page); | ||
| 532 | |||
| 533 | /* zspage with only 1 system page */ | ||
| 534 | if (!head_extra) | ||
| 535 | return; | ||
| 536 | |||
| 537 | list_for_each_entry_safe(nextp, tmp, &head_extra->lru, lru) { | ||
| 538 | list_del(&nextp->lru); | ||
| 539 | reset_page(nextp); | ||
| 540 | __free_page(nextp); | ||
| 541 | } | ||
| 542 | reset_page(head_extra); | ||
| 543 | __free_page(head_extra); | ||
| 544 | } | ||
| 545 | |||
| 546 | /* Initialize a newly allocated zspage */ | ||
| 547 | static void init_zspage(struct page *first_page, struct size_class *class) | ||
| 548 | { | ||
| 549 | unsigned long off = 0; | ||
| 550 | struct page *page = first_page; | ||
| 551 | |||
| 552 | BUG_ON(!is_first_page(first_page)); | ||
| 553 | while (page) { | ||
| 554 | struct page *next_page; | ||
| 555 | struct link_free *link; | ||
| 556 | unsigned int i, objs_on_page; | ||
| 557 | |||
| 558 | /* | ||
| 559 | * page->index stores offset of first object starting | ||
| 560 | * in the page. For the first page, this is always 0, | ||
| 561 | * so we use first_page->index (aka ->freelist) to store | ||
| 562 | * head of corresponding zspage's freelist. | ||
| 563 | */ | ||
| 564 | if (page != first_page) | ||
| 565 | page->index = off; | ||
| 566 | |||
| 567 | link = (struct link_free *)kmap_atomic(page) + | ||
| 568 | off / sizeof(*link); | ||
| 569 | objs_on_page = (PAGE_SIZE - off) / class->size; | ||
| 570 | |||
| 571 | for (i = 1; i <= objs_on_page; i++) { | ||
| 572 | off += class->size; | ||
| 573 | if (off < PAGE_SIZE) { | ||
| 574 | link->next = obj_location_to_handle(page, i); | ||
| 575 | link += class->size / sizeof(*link); | ||
| 576 | } | ||
| 577 | } | ||
| 578 | |||
| 579 | /* | ||
| 580 | * We now come to the last (full or partial) object on this | ||
| 581 | * page, which must point to the first object on the next | ||
| 582 | * page (if present) | ||
| 583 | */ | ||
| 584 | next_page = get_next_page(page); | ||
| 585 | link->next = obj_location_to_handle(next_page, 0); | ||
| 586 | kunmap_atomic(link); | ||
| 587 | page = next_page; | ||
| 588 | off = (off + class->size) % PAGE_SIZE; | ||
| 589 | } | ||
| 590 | } | ||
| 591 | |||
| 592 | /* | ||
| 593 | * Allocate a zspage for the given size class | ||
| 594 | */ | ||
| 595 | static struct page *alloc_zspage(struct size_class *class, gfp_t flags) | ||
| 596 | { | ||
| 597 | int i, error; | ||
| 598 | struct page *first_page = NULL, *uninitialized_var(prev_page); | ||
| 599 | |||
| 600 | /* | ||
| 601 | * Allocate individual pages and link them together as: | ||
| 602 | * 1. first page->private = first sub-page | ||
| 603 | * 2. all sub-pages are linked together using page->lru | ||
| 604 | * 3. each sub-page is linked to the first page using page->first_page | ||
| 605 | * | ||
| 606 | * For each size class, First/Head pages are linked together using | ||
| 607 | * page->lru. Also, we set PG_private to identify the first page | ||
| 608 | * (i.e. no other sub-page has this flag set) and PG_private_2 to | ||
| 609 | * identify the last page. | ||
| 610 | */ | ||
| 611 | error = -ENOMEM; | ||
| 612 | for (i = 0; i < class->pages_per_zspage; i++) { | ||
| 613 | struct page *page; | ||
| 614 | |||
| 615 | page = alloc_page(flags); | ||
| 616 | if (!page) | ||
| 617 | goto cleanup; | ||
| 618 | |||
| 619 | INIT_LIST_HEAD(&page->lru); | ||
| 620 | if (i == 0) { /* first page */ | ||
| 621 | SetPagePrivate(page); | ||
| 622 | set_page_private(page, 0); | ||
| 623 | first_page = page; | ||
| 624 | first_page->inuse = 0; | ||
| 625 | } | ||
| 626 | if (i == 1) | ||
| 627 | set_page_private(first_page, (unsigned long)page); | ||
| 628 | if (i >= 1) | ||
| 629 | page->first_page = first_page; | ||
| 630 | if (i >= 2) | ||
| 631 | list_add(&page->lru, &prev_page->lru); | ||
| 632 | if (i == class->pages_per_zspage - 1) /* last page */ | ||
| 633 | SetPagePrivate2(page); | ||
| 634 | prev_page = page; | ||
| 635 | } | ||
| 636 | |||
| 637 | init_zspage(first_page, class); | ||
| 638 | |||
| 639 | first_page->freelist = obj_location_to_handle(first_page, 0); | ||
| 640 | /* Maximum number of objects we can store in this zspage */ | ||
| 641 | first_page->objects = class->pages_per_zspage * PAGE_SIZE / class->size; | ||
| 642 | |||
| 643 | error = 0; /* Success */ | ||
| 644 | |||
| 645 | cleanup: | ||
| 646 | if (unlikely(error) && first_page) { | ||
| 647 | free_zspage(first_page); | ||
| 648 | first_page = NULL; | ||
| 649 | } | ||
| 650 | |||
| 651 | return first_page; | ||
| 652 | } | ||
| 653 | |||
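The linking rules spelled out in the comment above are what the file's earlier is_first_page()/is_last_page()/get_next_page() helpers rely on. Purely as an illustration of that convention (not a replacement for those helpers, which are defined outside this hunk), a walk over a zspage's page chain could look like this:

/*
 * Illustration only: mirrors the linking convention described above.
 * PG_private marks the head, page_private(head) points at the first
 * sub-page, later sub-pages hang off one another's ->lru, and
 * PG_private_2 marks the tail.
 */
#include <linux/list.h>
#include <linux/mm.h>

static struct page *walk_next_page(struct page *page)
{
	if (PagePrivate2(page))		/* tail of the zspage: nothing follows */
		return NULL;
	if (PagePrivate(page))		/* head: page_private() holds sub-page 1 */
		return (struct page *)page_private(page);
	/* remaining sub-pages are chained through one another's ->lru */
	return list_entry(page->lru.next, struct page, lru);
}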
| 654 | static struct page *find_get_zspage(struct size_class *class) | ||
| 655 | { | ||
| 656 | int i; | ||
| 657 | struct page *page; | ||
| 658 | |||
| 659 | for (i = 0; i < _ZS_NR_FULLNESS_GROUPS; i++) { | ||
| 660 | page = class->fullness_list[i]; | ||
| 661 | if (page) | ||
| 662 | break; | ||
| 663 | } | ||
| 664 | |||
| 665 | return page; | ||
| 666 | } | ||
| 667 | |||
| 668 | #ifdef CONFIG_PGTABLE_MAPPING | ||
| 669 | static inline int __zs_cpu_up(struct mapping_area *area) | ||
| 670 | { | ||
| 671 | /* | ||
| 672 | * Make sure we don't leak memory if a cpu UP notification | ||
| 673 | * and zs_init() race and both call zs_cpu_up() on the same cpu | ||
| 674 | */ | ||
| 675 | if (area->vm) | ||
| 676 | return 0; | ||
| 677 | area->vm = alloc_vm_area(PAGE_SIZE * 2, NULL); | ||
| 678 | if (!area->vm) | ||
| 679 | return -ENOMEM; | ||
| 680 | return 0; | ||
| 681 | } | ||
| 682 | |||
| 683 | static inline void __zs_cpu_down(struct mapping_area *area) | ||
| 684 | { | ||
| 685 | if (area->vm) | ||
| 686 | free_vm_area(area->vm); | ||
| 687 | area->vm = NULL; | ||
| 688 | } | ||
| 689 | |||
| 690 | static inline void *__zs_map_object(struct mapping_area *area, | ||
| 691 | struct page *pages[2], int off, int size) | ||
| 692 | { | ||
| 693 | BUG_ON(map_vm_area(area->vm, PAGE_KERNEL, &pages)); | ||
| 694 | area->vm_addr = area->vm->addr; | ||
| 695 | return area->vm_addr + off; | ||
| 696 | } | ||
| 697 | |||
| 698 | static inline void __zs_unmap_object(struct mapping_area *area, | ||
| 699 | struct page *pages[2], int off, int size) | ||
| 700 | { | ||
| 701 | unsigned long addr = (unsigned long)area->vm_addr; | ||
| 702 | |||
| 703 | unmap_kernel_range(addr, PAGE_SIZE * 2); | ||
| 704 | } | ||
| 705 | |||
| 706 | #else /* CONFIG_PGTABLE_MAPPING */ | ||
| 707 | |||
| 708 | static inline int __zs_cpu_up(struct mapping_area *area) | ||
| 709 | { | ||
| 710 | /* | ||
| 711 | * Make sure we don't leak memory if a cpu UP notification | ||
| 712 | * and zs_init() race and both call zs_cpu_up() on the same cpu | ||
| 713 | */ | ||
| 714 | if (area->vm_buf) | ||
| 715 | return 0; | ||
| 716 | area->vm_buf = (char *)__get_free_page(GFP_KERNEL); | ||
| 717 | if (!area->vm_buf) | ||
| 718 | return -ENOMEM; | ||
| 719 | return 0; | ||
| 720 | } | ||
| 721 | |||
| 722 | static inline void __zs_cpu_down(struct mapping_area *area) | ||
| 723 | { | ||
| 724 | if (area->vm_buf) | ||
| 725 | free_page((unsigned long)area->vm_buf); | ||
| 726 | area->vm_buf = NULL; | ||
| 727 | } | ||
| 728 | |||
| 729 | static void *__zs_map_object(struct mapping_area *area, | ||
| 730 | struct page *pages[2], int off, int size) | ||
| 731 | { | ||
| 732 | int sizes[2]; | ||
| 733 | void *addr; | ||
| 734 | char *buf = area->vm_buf; | ||
| 735 | |||
| 736 | /* disable page faults to match kmap_atomic() return conditions */ | ||
| 737 | pagefault_disable(); | ||
| 738 | |||
| 739 | /* no read fastpath */ | ||
| 740 | if (area->vm_mm == ZS_MM_WO) | ||
| 741 | goto out; | ||
| 742 | |||
| 743 | sizes[0] = PAGE_SIZE - off; | ||
| 744 | sizes[1] = size - sizes[0]; | ||
| 745 | |||
| 746 | /* copy object to per-cpu buffer */ | ||
| 747 | addr = kmap_atomic(pages[0]); | ||
| 748 | memcpy(buf, addr + off, sizes[0]); | ||
| 749 | kunmap_atomic(addr); | ||
| 750 | addr = kmap_atomic(pages[1]); | ||
| 751 | memcpy(buf + sizes[0], addr, sizes[1]); | ||
| 752 | kunmap_atomic(addr); | ||
| 753 | out: | ||
| 754 | return area->vm_buf; | ||
| 755 | } | ||
| 756 | |||
| 757 | static void __zs_unmap_object(struct mapping_area *area, | ||
| 758 | struct page *pages[2], int off, int size) | ||
| 759 | { | ||
| 760 | int sizes[2]; | ||
| 761 | void *addr; | ||
| 762 | char *buf = area->vm_buf; | ||
| 763 | |||
| 764 | /* no write fastpath */ | ||
| 765 | if (area->vm_mm == ZS_MM_RO) | ||
| 766 | goto out; | ||
| 767 | |||
| 768 | sizes[0] = PAGE_SIZE - off; | ||
| 769 | sizes[1] = size - sizes[0]; | ||
| 770 | |||
| 771 | /* copy per-cpu buffer to object */ | ||
| 772 | addr = kmap_atomic(pages[0]); | ||
| 773 | memcpy(addr + off, buf, sizes[0]); | ||
| 774 | kunmap_atomic(addr); | ||
| 775 | addr = kmap_atomic(pages[1]); | ||
| 776 | memcpy(addr, buf + sizes[0], sizes[1]); | ||
| 777 | kunmap_atomic(addr); | ||
| 778 | |||
| 779 | out: | ||
| 780 | /* enable page faults to match kunmap_atomic() return conditions */ | ||
| 781 | pagefault_enable(); | ||
| 782 | } | ||
| 783 | |||
| 784 | #endif /* CONFIG_PGTABLE_MAPPING */ | ||
| 785 | |||
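For concreteness, the split used by both copy helpers above: with 4 KiB pages, a 400-byte object starting at offset 3900 (numbers chosen only for illustration) gives sizes[0] = 4096 - 3900 = 196 bytes taken from the tail of the first page and sizes[1] = 400 - 196 = 204 bytes from the head of the second.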
| 786 | static int zs_cpu_notifier(struct notifier_block *nb, unsigned long action, | ||
| 787 | void *pcpu) | ||
| 788 | { | ||
| 789 | int ret, cpu = (long)pcpu; | ||
| 790 | struct mapping_area *area; | ||
| 791 | |||
| 792 | switch (action) { | ||
| 793 | case CPU_UP_PREPARE: | ||
| 794 | area = &per_cpu(zs_map_area, cpu); | ||
| 795 | ret = __zs_cpu_up(area); | ||
| 796 | if (ret) | ||
| 797 | return notifier_from_errno(ret); | ||
| 798 | break; | ||
| 799 | case CPU_DEAD: | ||
| 800 | case CPU_UP_CANCELED: | ||
| 801 | area = &per_cpu(zs_map_area, cpu); | ||
| 802 | __zs_cpu_down(area); | ||
| 803 | break; | ||
| 804 | } | ||
| 805 | |||
| 806 | return NOTIFY_OK; | ||
| 807 | } | ||
| 808 | |||
| 809 | static struct notifier_block zs_cpu_nb = { | ||
| 810 | .notifier_call = zs_cpu_notifier | ||
| 811 | }; | ||
| 812 | |||
| 813 | static void zs_exit(void) | ||
| 814 | { | ||
| 815 | int cpu; | ||
| 816 | |||
| 817 | for_each_online_cpu(cpu) | ||
| 818 | zs_cpu_notifier(NULL, CPU_DEAD, (void *)(long)cpu); | ||
| 819 | unregister_cpu_notifier(&zs_cpu_nb); | ||
| 820 | } | ||
| 821 | |||
| 822 | static int zs_init(void) | ||
| 823 | { | ||
| 824 | int cpu, ret; | ||
| 825 | |||
| 826 | register_cpu_notifier(&zs_cpu_nb); | ||
| 827 | for_each_online_cpu(cpu) { | ||
| 828 | ret = zs_cpu_notifier(NULL, CPU_UP_PREPARE, (void *)(long)cpu); | ||
| 829 | if (notifier_to_errno(ret)) | ||
| 830 | goto fail; | ||
| 831 | } | ||
| 832 | return 0; | ||
| 833 | fail: | ||
| 834 | zs_exit(); | ||
| 835 | return notifier_to_errno(ret); | ||
| 836 | } | ||
| 837 | |||
| 838 | /** | ||
| 839 | * zs_create_pool - Creates an allocation pool to work from. | ||
| 840 | * @flags: GFP flags used when allocating zspages for this pool | ||
| 841 | * | ||
| 842 | * This function must be called before any other zsmalloc | ||
| 843 | * API is used. | ||
| 844 | * | ||
| 845 | * On success, a pointer to the newly created pool is returned, | ||
| 846 | * otherwise NULL. | ||
| 847 | */ | ||
| 848 | struct zs_pool *zs_create_pool(gfp_t flags) | ||
| 849 | { | ||
| 850 | int i, ovhd_size; | ||
| 851 | struct zs_pool *pool; | ||
| 852 | |||
| 853 | ovhd_size = roundup(sizeof(*pool), PAGE_SIZE); | ||
| 854 | pool = kzalloc(ovhd_size, GFP_KERNEL); | ||
| 855 | if (!pool) | ||
| 856 | return NULL; | ||
| 857 | |||
| 858 | for (i = 0; i < ZS_SIZE_CLASSES; i++) { | ||
| 859 | int size; | ||
| 860 | struct size_class *class; | ||
| 861 | |||
| 862 | size = ZS_MIN_ALLOC_SIZE + i * ZS_SIZE_CLASS_DELTA; | ||
| 863 | if (size > ZS_MAX_ALLOC_SIZE) | ||
| 864 | size = ZS_MAX_ALLOC_SIZE; | ||
| 865 | |||
| 866 | class = &pool->size_class[i]; | ||
| 867 | class->size = size; | ||
| 868 | class->index = i; | ||
| 869 | spin_lock_init(&class->lock); | ||
| 870 | class->pages_per_zspage = get_pages_per_zspage(size); | ||
| 871 | |||
| 872 | } | ||
| 873 | |||
| 874 | pool->flags = flags; | ||
| 875 | |||
| 876 | return pool; | ||
| 877 | } | ||
| 878 | EXPORT_SYMBOL_GPL(zs_create_pool); | ||
| 879 | |||
| 880 | void zs_destroy_pool(struct zs_pool *pool) | ||
| 881 | { | ||
| 882 | int i; | ||
| 883 | |||
| 884 | for (i = 0; i < ZS_SIZE_CLASSES; i++) { | ||
| 885 | int fg; | ||
| 886 | struct size_class *class = &pool->size_class[i]; | ||
| 887 | |||
| 888 | for (fg = 0; fg < _ZS_NR_FULLNESS_GROUPS; fg++) { | ||
| 889 | if (class->fullness_list[fg]) { | ||
| 890 | pr_info("Freeing non-empty class with size %db, fullness group %d\n", | ||
| 891 | class->size, fg); | ||
| 892 | } | ||
| 893 | } | ||
| 894 | } | ||
| 895 | kfree(pool); | ||
| 896 | } | ||
| 897 | EXPORT_SYMBOL_GPL(zs_destroy_pool); | ||
| 898 | |||
| 899 | /** | ||
| 900 | * zs_malloc - Allocate block of given size from pool. | ||
| 901 | * @pool: pool to allocate from | ||
| 902 | * @size: size of block to allocate | ||
| 903 | * | ||
| 904 | * On success, a handle to the allocated object is returned, | ||
| 905 | * otherwise 0. | ||
| 906 | * Allocation requests with size > ZS_MAX_ALLOC_SIZE will fail. | ||
| 907 | */ | ||
| 908 | unsigned long zs_malloc(struct zs_pool *pool, size_t size) | ||
| 909 | { | ||
| 910 | unsigned long obj; | ||
| 911 | struct link_free *link; | ||
| 912 | int class_idx; | ||
| 913 | struct size_class *class; | ||
| 914 | |||
| 915 | struct page *first_page, *m_page; | ||
| 916 | unsigned long m_objidx, m_offset; | ||
| 917 | |||
| 918 | if (unlikely(!size || size > ZS_MAX_ALLOC_SIZE)) | ||
| 919 | return 0; | ||
| 920 | |||
| 921 | class_idx = get_size_class_index(size); | ||
| 922 | class = &pool->size_class[class_idx]; | ||
| 923 | BUG_ON(class_idx != class->index); | ||
| 924 | |||
| 925 | spin_lock(&class->lock); | ||
| 926 | first_page = find_get_zspage(class); | ||
| 927 | |||
| 928 | if (!first_page) { | ||
| 929 | spin_unlock(&class->lock); | ||
| 930 | first_page = alloc_zspage(class, pool->flags); | ||
| 931 | if (unlikely(!first_page)) | ||
| 932 | return 0; | ||
| 933 | |||
| 934 | set_zspage_mapping(first_page, class->index, ZS_EMPTY); | ||
| 935 | spin_lock(&class->lock); | ||
| 936 | class->pages_allocated += class->pages_per_zspage; | ||
| 937 | } | ||
| 938 | |||
| 939 | obj = (unsigned long)first_page->freelist; | ||
| 940 | obj_handle_to_location(obj, &m_page, &m_objidx); | ||
| 941 | m_offset = obj_idx_to_offset(m_page, m_objidx, class->size); | ||
| 942 | |||
| 943 | link = (struct link_free *)kmap_atomic(m_page) + | ||
| 944 | m_offset / sizeof(*link); | ||
| 945 | first_page->freelist = link->next; | ||
| 946 | memset(link, POISON_INUSE, sizeof(*link)); | ||
| 947 | kunmap_atomic(link); | ||
| 948 | |||
| 949 | first_page->inuse++; | ||
| 950 | /* Now move the zspage to another fullness group, if required */ | ||
| 951 | fix_fullness_group(pool, first_page); | ||
| 952 | spin_unlock(&class->lock); | ||
| 953 | |||
| 954 | return obj; | ||
| 955 | } | ||
| 956 | EXPORT_SYMBOL_GPL(zs_malloc); | ||
| 957 | |||
| 958 | void zs_free(struct zs_pool *pool, unsigned long obj) | ||
| 959 | { | ||
| 960 | struct link_free *link; | ||
| 961 | struct page *first_page, *f_page; | ||
| 962 | unsigned long f_objidx, f_offset; | ||
| 963 | |||
| 964 | int class_idx; | ||
| 965 | struct size_class *class; | ||
| 966 | enum fullness_group fullness; | ||
| 967 | |||
| 968 | if (unlikely(!obj)) | ||
| 969 | return; | ||
| 970 | |||
| 971 | obj_handle_to_location(obj, &f_page, &f_objidx); | ||
| 972 | first_page = get_first_page(f_page); | ||
| 973 | |||
| 974 | get_zspage_mapping(first_page, &class_idx, &fullness); | ||
| 975 | class = &pool->size_class[class_idx]; | ||
| 976 | f_offset = obj_idx_to_offset(f_page, f_objidx, class->size); | ||
| 977 | |||
| 978 | spin_lock(&class->lock); | ||
| 979 | |||
| 980 | /* Insert this object in containing zspage's freelist */ | ||
| 981 | link = (struct link_free *)((unsigned char *)kmap_atomic(f_page) | ||
| 982 | + f_offset); | ||
| 983 | link->next = first_page->freelist; | ||
| 984 | kunmap_atomic(link); | ||
| 985 | first_page->freelist = (void *)obj; | ||
| 986 | |||
| 987 | first_page->inuse--; | ||
| 988 | fullness = fix_fullness_group(pool, first_page); | ||
| 989 | |||
| 990 | if (fullness == ZS_EMPTY) | ||
| 991 | class->pages_allocated -= class->pages_per_zspage; | ||
| 992 | |||
| 993 | spin_unlock(&class->lock); | ||
| 994 | |||
| 995 | if (fullness == ZS_EMPTY) | ||
| 996 | free_zspage(first_page); | ||
| 997 | } | ||
| 998 | EXPORT_SYMBOL_GPL(zs_free); | ||
| 999 | |||
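Taken together with zs_create_pool() and zs_destroy_pool() above, the allocation half of the API can be exercised as in the following hedged sketch (store_compressed() and its error handling are illustrative only, not part of this patch):

#include <linux/errno.h>
#include <linux/gfp.h>
#include <linux/types.h>
#include <linux/zsmalloc.h>

/* Hedged sketch of the allocation side of the API; names are illustrative. */
static int store_compressed(size_t clen)
{
	struct zs_pool *pool;
	unsigned long handle;

	pool = zs_create_pool(GFP_KERNEL);
	if (!pool)
		return -ENOMEM;

	handle = zs_malloc(pool, clen);		/* returns 0 on failure */
	if (!handle) {
		zs_destroy_pool(pool);
		return -ENOMEM;
	}

	/* ... fill the object via zs_map_object()/zs_unmap_object() ... */

	zs_free(pool, handle);
	zs_destroy_pool(pool);
	return 0;
}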
| 1000 | /** | ||
| 1001 | * zs_map_object - get address of allocated object from handle. | ||
| 1002 | * @pool: pool from which the object was allocated | ||
| 1003 | * @handle: handle returned from zs_malloc | ||
| 1004 | * | ||
| 1005 | * Before using an object allocated from zs_malloc, it must be mapped using | ||
| 1006 | * this function. When done with the object, it must be unmapped using | ||
| 1007 | * zs_unmap_object. | ||
| 1008 | * | ||
| 1009 | * Only one object can be mapped per cpu at a time. There is no protection | ||
| 1010 | * against nested mappings. | ||
| 1011 | * | ||
| 1012 | * This function returns with preemption and page faults disabled. | ||
| 1013 | */ | ||
| 1014 | void *zs_map_object(struct zs_pool *pool, unsigned long handle, | ||
| 1015 | enum zs_mapmode mm) | ||
| 1016 | { | ||
| 1017 | struct page *page; | ||
| 1018 | unsigned long obj_idx, off; | ||
| 1019 | |||
| 1020 | unsigned int class_idx; | ||
| 1021 | enum fullness_group fg; | ||
| 1022 | struct size_class *class; | ||
| 1023 | struct mapping_area *area; | ||
| 1024 | struct page *pages[2]; | ||
| 1025 | |||
| 1026 | BUG_ON(!handle); | ||
| 1027 | |||
| 1028 | /* | ||
| 1029 | * Because we use per-cpu mapping areas shared among the | ||
| 1030 | * pools/users, we can't allow mapping in interrupt context | ||
| 1031 | * because it can corrupt another user's mappings. | ||
| 1032 | */ | ||
| 1033 | BUG_ON(in_interrupt()); | ||
| 1034 | |||
| 1035 | obj_handle_to_location(handle, &page, &obj_idx); | ||
| 1036 | get_zspage_mapping(get_first_page(page), &class_idx, &fg); | ||
| 1037 | class = &pool->size_class[class_idx]; | ||
| 1038 | off = obj_idx_to_offset(page, obj_idx, class->size); | ||
| 1039 | |||
| 1040 | area = &get_cpu_var(zs_map_area); | ||
| 1041 | area->vm_mm = mm; | ||
| 1042 | if (off + class->size <= PAGE_SIZE) { | ||
| 1043 | /* this object is contained entirely within a page */ | ||
| 1044 | area->vm_addr = kmap_atomic(page); | ||
| 1045 | return area->vm_addr + off; | ||
| 1046 | } | ||
| 1047 | |||
| 1048 | /* this object spans two pages */ | ||
| 1049 | pages[0] = page; | ||
| 1050 | pages[1] = get_next_page(page); | ||
| 1051 | BUG_ON(!pages[1]); | ||
| 1052 | |||
| 1053 | return __zs_map_object(area, pages, off, class->size); | ||
| 1054 | } | ||
| 1055 | EXPORT_SYMBOL_GPL(zs_map_object); | ||
| 1056 | |||
| 1057 | void zs_unmap_object(struct zs_pool *pool, unsigned long handle) | ||
| 1058 | { | ||
| 1059 | struct page *page; | ||
| 1060 | unsigned long obj_idx, off; | ||
| 1061 | |||
| 1062 | unsigned int class_idx; | ||
| 1063 | enum fullness_group fg; | ||
| 1064 | struct size_class *class; | ||
| 1065 | struct mapping_area *area; | ||
| 1066 | |||
| 1067 | BUG_ON(!handle); | ||
| 1068 | |||
| 1069 | obj_handle_to_location(handle, &page, &obj_idx); | ||
| 1070 | get_zspage_mapping(get_first_page(page), &class_idx, &fg); | ||
| 1071 | class = &pool->size_class[class_idx]; | ||
| 1072 | off = obj_idx_to_offset(page, obj_idx, class->size); | ||
| 1073 | |||
| 1074 | area = &__get_cpu_var(zs_map_area); | ||
| 1075 | if (off + class->size <= PAGE_SIZE) | ||
| 1076 | kunmap_atomic(area->vm_addr); | ||
| 1077 | else { | ||
| 1078 | struct page *pages[2]; | ||
| 1079 | |||
| 1080 | pages[0] = page; | ||
| 1081 | pages[1] = get_next_page(page); | ||
| 1082 | BUG_ON(!pages[1]); | ||
| 1083 | |||
| 1084 | __zs_unmap_object(area, pages, off, class->size); | ||
| 1085 | } | ||
| 1086 | put_cpu_var(zs_map_area); | ||
| 1087 | } | ||
| 1088 | EXPORT_SYMBOL_GPL(zs_unmap_object); | ||
| 1089 | |||
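Because zs_map_object() returns with preemption and page faults disabled, callers copy in or out and unmap promptly, without sleeping in between. A hedged sketch of that pattern (copy_roundtrip() and its parameters are assumed for illustration; the handle comes from zs_malloc() as above):

#include <linux/string.h>
#include <linux/zsmalloc.h>

/* Hedged sketch: write 'len' bytes into an object, then read them back. */
static void copy_roundtrip(struct zs_pool *pool, unsigned long handle,
			   const void *src, void *dst, size_t len)
{
	void *obj;

	obj = zs_map_object(pool, handle, ZS_MM_WO);	/* write-only mapping */
	memcpy(obj, src, len);				/* no sleeping allowed here */
	zs_unmap_object(pool, handle);

	obj = zs_map_object(pool, handle, ZS_MM_RO);	/* read-only mapping */
	memcpy(dst, obj, len);
	zs_unmap_object(pool, handle);
}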
| 1090 | u64 zs_get_total_size_bytes(struct zs_pool *pool) | ||
| 1091 | { | ||
| 1092 | int i; | ||
| 1093 | u64 npages = 0; | ||
| 1094 | |||
| 1095 | for (i = 0; i < ZS_SIZE_CLASSES; i++) | ||
| 1096 | npages += pool->size_class[i].pages_allocated; | ||
| 1097 | |||
| 1098 | return npages << PAGE_SHIFT; | ||
| 1099 | } | ||
| 1100 | EXPORT_SYMBOL_GPL(zs_get_total_size_bytes); | ||
| 1101 | |||
| 1102 | module_init(zs_init); | ||
| 1103 | module_exit(zs_exit); | ||
| 1104 | |||
| 1105 | MODULE_LICENSE("Dual BSD/GPL"); | ||
| 1106 | MODULE_AUTHOR("Nitin Gupta <ngupta@vflare.org>"); | ||
diff --git a/mm/zswap.c b/mm/zswap.c index 5a63f78a5601..e55bab9dc41f 100644 --- a/mm/zswap.c +++ b/mm/zswap.c | |||
| @@ -77,12 +77,12 @@ static u64 zswap_duplicate_entry; | |||
| 77 | **********************************/ | 77 | **********************************/ |
| 78 | /* Enable/disable zswap (disabled by default, fixed at boot for now) */ | 78 | /* Enable/disable zswap (disabled by default, fixed at boot for now) */ |
| 79 | static bool zswap_enabled __read_mostly; | 79 | static bool zswap_enabled __read_mostly; |
| 80 | module_param_named(enabled, zswap_enabled, bool, 0); | 80 | module_param_named(enabled, zswap_enabled, bool, 0444); |
| 81 | 81 | ||
| 82 | /* Compressor to be used by zswap (fixed at boot for now) */ | 82 | /* Compressor to be used by zswap (fixed at boot for now) */ |
| 83 | #define ZSWAP_COMPRESSOR_DEFAULT "lzo" | 83 | #define ZSWAP_COMPRESSOR_DEFAULT "lzo" |
| 84 | static char *zswap_compressor = ZSWAP_COMPRESSOR_DEFAULT; | 84 | static char *zswap_compressor = ZSWAP_COMPRESSOR_DEFAULT; |
| 85 | module_param_named(compressor, zswap_compressor, charp, 0); | 85 | module_param_named(compressor, zswap_compressor, charp, 0444); |
| 86 | 86 | ||
| 87 | /* The maximum percentage of memory that the compressed pool can occupy */ | 87 | /* The maximum percentage of memory that the compressed pool can occupy */ |
| 88 | static unsigned int zswap_max_pool_percent = 20; | 88 | static unsigned int zswap_max_pool_percent = 20; |
