author     H. Peter Anvin <hpa@linux.intel.com>  2014-02-07 14:27:30 -0500
committer  H. Peter Anvin <hpa@linux.intel.com>  2014-02-07 14:27:30 -0500
commit     a3b072cd180c12e8fe0ece9487b9065808327640 (patch)
tree       62b982041be84748852d77cdf6ca5639ef40858f /mm
parent     75a1ba5b2c529db60ca49626bcaf0bddf4548438 (diff)
parent     081cd62a010f97b5bc1d2b0cd123c5abc692b68a (diff)

Merge tag 'efi-urgent' into x86/urgent

* Avoid WARN_ON() when mapping BGRT on Baytrail (EFI 32-bit)

Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig | 25
-rw-r--r--  mm/Makefile | 1
-rw-r--r--  mm/balloon_compaction.c | 4
-rw-r--r--  mm/bounce.c | 44
-rw-r--r--  mm/cleancache.c | 6
-rw-r--r--  mm/compaction.c | 68
-rw-r--r--  mm/filemap.c | 58
-rw-r--r--  mm/huge_memory.c | 60
-rw-r--r--  mm/hugetlb.c | 56
-rw-r--r--  mm/hugetlb_cgroup.c | 24
-rw-r--r--  mm/hwpoison-inject.c | 2
-rw-r--r--  mm/internal.h | 24
-rw-r--r--  mm/ksm.c | 135
-rw-r--r--  mm/memblock.c | 393
-rw-r--r--  mm/memcontrol.c | 968
-rw-r--r--  mm/memory-failure.c | 31
-rw-r--r--  mm/memory.c | 26
-rw-r--r--  mm/memory_hotplug.c | 13
-rw-r--r--  mm/mempolicy.c | 32
-rw-r--r--  mm/migrate.c | 99
-rw-r--r--  mm/mincore.c | 7
-rw-r--r--  mm/mlock.c | 126
-rw-r--r--  mm/mm_init.c | 3
-rw-r--r--  mm/mmap.c | 64
-rw-r--r--  mm/mmu_notifier.c | 3
-rw-r--r--  mm/mprotect.c | 3
-rw-r--r--  mm/nobootmem.c | 35
-rw-r--r--  mm/nommu.c | 1
-rw-r--r--  mm/oom_kill.c | 65
-rw-r--r--  mm/page-writeback.c | 57
-rw-r--r--  mm/page_alloc.c | 191
-rw-r--r--  mm/page_cgroup.c | 7
-rw-r--r--  mm/page_io.c | 14
-rw-r--r--  mm/percpu.c | 42
-rw-r--r--  mm/readahead.c | 15
-rw-r--r--  mm/rmap.c | 590
-rw-r--r--  mm/shmem.c | 65
-rw-r--r--  mm/slab.c | 2
-rw-r--r--  mm/slab.h | 26
-rw-r--r--  mm/slab_common.c | 89
-rw-r--r--  mm/slub.c | 75
-rw-r--r--  mm/sparse-vmemmap.c | 6
-rw-r--r--  mm/sparse.c | 27
-rw-r--r--  mm/swap.c | 294
-rw-r--r--  mm/swap_state.c | 16
-rw-r--r--  mm/swapfile.c | 11
-rw-r--r--  mm/util.c | 36
-rw-r--r--  mm/vmpressure.c | 26
-rw-r--r--  mm/vmscan.c | 87
-rw-r--r--  mm/zsmalloc.c | 1106
-rw-r--r--  mm/zswap.c | 4
51 files changed, 3496 insertions, 1666 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 723bbe04a0b0..2d9f1504d75e 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -552,3 +552,28 @@ config MEM_SOFT_DIRTY
552 it can be cleared by hands. 552 it can be cleared by hands.
553 553
554 See Documentation/vm/soft-dirty.txt for more details. 554 See Documentation/vm/soft-dirty.txt for more details.
555
556config ZSMALLOC
557 bool "Memory allocator for compressed pages"
558 depends on MMU
559 default n
560 help
561 zsmalloc is a slab-based memory allocator designed to store
562 compressed RAM pages. zsmalloc uses virtual memory mapping
563 in order to reduce fragmentation. However, this results in a
564 non-standard allocator interface where a handle, not a pointer, is
565 returned by an alloc(). This handle must be mapped in order to
566 access the allocated space.
567
568config PGTABLE_MAPPING
569 bool "Use page table mapping to access object in zsmalloc"
570 depends on ZSMALLOC
571 help
572 By default, zsmalloc uses a copy-based object mapping method to
573 access allocations that span two pages. However, if a particular
574 architecture (ex, ARM) performs VM mapping faster than copying,
575 then you should select this. This causes zsmalloc to use page table
576 mapping rather than copying for object mapping.
577
578 You can check speed with zsmalloc benchmark[1].
579 [1] https://github.com/spartacus06/zsmalloc
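
The ZSMALLOC help text above describes a handle-based interface: an allocation returns an opaque handle rather than a pointer, and the handle must be mapped before the memory can be touched. A minimal caller sketch, assuming the zsmalloc API of this kernel generation (zs_create_pool(), zs_malloc(), zs_map_object()/zs_unmap_object(), zs_free()); it is illustrative only and not part of this patch:

#include <linux/errno.h>
#include <linux/gfp.h>
#include <linux/string.h>
#include <linux/zsmalloc.h>

/* Hypothetical caller showing the handle-based allocation flow. */
static int zsmalloc_demo(void)
{
	struct zs_pool *pool;
	unsigned long handle;
	void *obj;

	pool = zs_create_pool(GFP_KERNEL);
	if (!pool)
		return -ENOMEM;

	/* zs_malloc() hands back an opaque handle, not a kernel pointer. */
	handle = zs_malloc(pool, 128);
	if (!handle) {
		zs_destroy_pool(pool);
		return -ENOMEM;
	}

	/* The handle has to be mapped before the memory can be accessed... */
	obj = zs_map_object(pool, handle, ZS_MM_RW);
	memset(obj, 0, 128);
	/* ...and unmapped again promptly; the mapping is short-lived. */
	zs_unmap_object(pool, handle);

	zs_free(pool, handle);
	zs_destroy_pool(pool);
	return 0;
}
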
diff --git a/mm/Makefile b/mm/Makefile
index 305d10acd081..310c90a09264 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -60,3 +60,4 @@ obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o
60obj-$(CONFIG_CLEANCACHE) += cleancache.o 60obj-$(CONFIG_CLEANCACHE) += cleancache.o
61obj-$(CONFIG_MEMORY_ISOLATION) += page_isolation.o 61obj-$(CONFIG_MEMORY_ISOLATION) += page_isolation.o
62obj-$(CONFIG_ZBUD) += zbud.o 62obj-$(CONFIG_ZBUD) += zbud.o
63obj-$(CONFIG_ZSMALLOC) += zsmalloc.o
diff --git a/mm/balloon_compaction.c b/mm/balloon_compaction.c
index 07dbc8ec46cf..6e45a5074bf0 100644
--- a/mm/balloon_compaction.c
+++ b/mm/balloon_compaction.c
@@ -267,7 +267,7 @@ void balloon_page_putback(struct page *page)
267 put_page(page); 267 put_page(page);
268 } else { 268 } else {
269 WARN_ON(1); 269 WARN_ON(1);
270 dump_page(page); 270 dump_page(page, "not movable balloon page");
271 } 271 }
272 unlock_page(page); 272 unlock_page(page);
273} 273}
@@ -287,7 +287,7 @@ int balloon_page_migrate(struct page *newpage,
287 BUG_ON(!trylock_page(newpage)); 287 BUG_ON(!trylock_page(newpage));
288 288
289 if (WARN_ON(!__is_movable_balloon_page(page))) { 289 if (WARN_ON(!__is_movable_balloon_page(page))) {
290 dump_page(page); 290 dump_page(page, "not movable balloon page");
291 unlock_page(newpage); 291 unlock_page(newpage);
292 return rc; 292 return rc;
293 } 293 }
diff --git a/mm/bounce.c b/mm/bounce.c
index 5a7d58fb883b..523918b8c6dc 100644
--- a/mm/bounce.c
+++ b/mm/bounce.c
@@ -98,27 +98,24 @@ int init_emergency_isa_pool(void)
98static void copy_to_high_bio_irq(struct bio *to, struct bio *from) 98static void copy_to_high_bio_irq(struct bio *to, struct bio *from)
99{ 99{
100 unsigned char *vfrom; 100 unsigned char *vfrom;
101 struct bio_vec *tovec, *fromvec; 101 struct bio_vec tovec, *fromvec = from->bi_io_vec;
102 int i; 102 struct bvec_iter iter;
103 103
104 bio_for_each_segment(tovec, to, i) { 104 bio_for_each_segment(tovec, to, iter) {
105 fromvec = from->bi_io_vec + i; 105 if (tovec.bv_page != fromvec->bv_page) {
106 106 /*
107 /* 107 * fromvec->bv_offset and fromvec->bv_len might have
108 * not bounced 108 * been modified by the block layer, so use the original
109 */ 109 * copy, bounce_copy_vec already uses tovec->bv_len
110 if (tovec->bv_page == fromvec->bv_page) 110 */
111 continue; 111 vfrom = page_address(fromvec->bv_page) +
112 112 tovec.bv_offset;
113 /* 113
114 * fromvec->bv_offset and fromvec->bv_len might have been 114 bounce_copy_vec(&tovec, vfrom);
115 * modified by the block layer, so use the original copy, 115 flush_dcache_page(tovec.bv_page);
116 * bounce_copy_vec already uses tovec->bv_len 116 }
117 */
118 vfrom = page_address(fromvec->bv_page) + tovec->bv_offset;
119 117
120 bounce_copy_vec(tovec, vfrom); 118 fromvec++;
121 flush_dcache_page(tovec->bv_page);
122 } 119 }
123} 120}
124 121
@@ -201,13 +198,14 @@ static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig,
201{ 198{
202 struct bio *bio; 199 struct bio *bio;
203 int rw = bio_data_dir(*bio_orig); 200 int rw = bio_data_dir(*bio_orig);
204 struct bio_vec *to, *from; 201 struct bio_vec *to, from;
202 struct bvec_iter iter;
205 unsigned i; 203 unsigned i;
206 204
207 if (force) 205 if (force)
208 goto bounce; 206 goto bounce;
209 bio_for_each_segment(from, *bio_orig, i) 207 bio_for_each_segment(from, *bio_orig, iter)
210 if (page_to_pfn(from->bv_page) > queue_bounce_pfn(q)) 208 if (page_to_pfn(from.bv_page) > queue_bounce_pfn(q))
211 goto bounce; 209 goto bounce;
212 210
213 return; 211 return;
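
The mm/bounce.c hunk above is part of the immutable-biovec conversion: bio_for_each_segment() now walks a struct bvec_iter and yields each struct bio_vec by value instead of a pointer plus integer index. A standalone sketch of the new iteration idiom (not taken from this patch):

#include <linux/bio.h>

/* Count the bytes carried by a bio using the bvec_iter-based walk. */
static unsigned int bio_payload_bytes(struct bio *bio)
{
	struct bio_vec bvec;	/* filled in by value on each step */
	struct bvec_iter iter;	/* replaces the old integer index */
	unsigned int bytes = 0;

	bio_for_each_segment(bvec, bio, iter)
		bytes += bvec.bv_len;

	return bytes;
}
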
diff --git a/mm/cleancache.c b/mm/cleancache.c
index 5875f48ce279..d0eac4350403 100644
--- a/mm/cleancache.c
+++ b/mm/cleancache.c
@@ -237,7 +237,7 @@ int __cleancache_get_page(struct page *page)
237 goto out; 237 goto out;
238 } 238 }
239 239
240 VM_BUG_ON(!PageLocked(page)); 240 VM_BUG_ON_PAGE(!PageLocked(page), page);
241 fake_pool_id = page->mapping->host->i_sb->cleancache_poolid; 241 fake_pool_id = page->mapping->host->i_sb->cleancache_poolid;
242 if (fake_pool_id < 0) 242 if (fake_pool_id < 0)
243 goto out; 243 goto out;
@@ -279,7 +279,7 @@ void __cleancache_put_page(struct page *page)
279 return; 279 return;
280 } 280 }
281 281
282 VM_BUG_ON(!PageLocked(page)); 282 VM_BUG_ON_PAGE(!PageLocked(page), page);
283 fake_pool_id = page->mapping->host->i_sb->cleancache_poolid; 283 fake_pool_id = page->mapping->host->i_sb->cleancache_poolid;
284 if (fake_pool_id < 0) 284 if (fake_pool_id < 0)
285 return; 285 return;
@@ -318,7 +318,7 @@ void __cleancache_invalidate_page(struct address_space *mapping,
318 if (pool_id < 0) 318 if (pool_id < 0)
319 return; 319 return;
320 320
321 VM_BUG_ON(!PageLocked(page)); 321 VM_BUG_ON_PAGE(!PageLocked(page), page);
322 if (cleancache_get_key(mapping->host, &key) >= 0) { 322 if (cleancache_get_key(mapping->host, &key) >= 0) {
323 cleancache_ops->invalidate_page(pool_id, 323 cleancache_ops->invalidate_page(pool_id,
324 key, page->index); 324 key, page->index);
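
The VM_BUG_ON(...) to VM_BUG_ON_PAGE(..., page) conversions in this and the following hunks attach the affected page to the assertion so a failure also dumps the page's state via dump_page(). Roughly, the macro behaves like the sketch below; this is illustrative, not the literal <linux/mmdebug.h> definition:

/* Illustrative only -- not the exact upstream macro text. */
#define VM_BUG_ON_PAGE(cond, page)					\
	do {								\
		if (unlikely(cond)) {					\
			dump_page(page, "VM_BUG_ON_PAGE");		\
			BUG();						\
		}							\
	} while (0)
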
diff --git a/mm/compaction.c b/mm/compaction.c
index f58bcd016f43..b48c5259ea33 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -459,6 +459,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
459 unsigned long flags; 459 unsigned long flags;
460 bool locked = false; 460 bool locked = false;
461 struct page *page = NULL, *valid_page = NULL; 461 struct page *page = NULL, *valid_page = NULL;
462 bool skipped_async_unsuitable = false;
462 463
463 /* 464 /*
464 * Ensure that there are not too many pages isolated from the LRU 465 * Ensure that there are not too many pages isolated from the LRU
@@ -522,7 +523,10 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
522 if (!isolation_suitable(cc, page)) 523 if (!isolation_suitable(cc, page))
523 goto next_pageblock; 524 goto next_pageblock;
524 525
525 /* Skip if free */ 526 /*
527 * Skip if free. page_order cannot be used without zone->lock
528 * as nothing prevents parallel allocations or buddy merging.
529 */
526 if (PageBuddy(page)) 530 if (PageBuddy(page))
527 continue; 531 continue;
528 532
@@ -534,6 +538,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
534 if (!cc->sync && last_pageblock_nr != pageblock_nr && 538 if (!cc->sync && last_pageblock_nr != pageblock_nr &&
535 !migrate_async_suitable(get_pageblock_migratetype(page))) { 539 !migrate_async_suitable(get_pageblock_migratetype(page))) {
536 cc->finished_update_migrate = true; 540 cc->finished_update_migrate = true;
541 skipped_async_unsuitable = true;
537 goto next_pageblock; 542 goto next_pageblock;
538 } 543 }
539 544
@@ -599,7 +604,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
599 if (__isolate_lru_page(page, mode) != 0) 604 if (__isolate_lru_page(page, mode) != 0)
600 continue; 605 continue;
601 606
602 VM_BUG_ON(PageTransCompound(page)); 607 VM_BUG_ON_PAGE(PageTransCompound(page), page);
603 608
604 /* Successfully isolated */ 609 /* Successfully isolated */
605 cc->finished_update_migrate = true; 610 cc->finished_update_migrate = true;
@@ -627,8 +632,13 @@ next_pageblock:
627 if (locked) 632 if (locked)
628 spin_unlock_irqrestore(&zone->lru_lock, flags); 633 spin_unlock_irqrestore(&zone->lru_lock, flags);
629 634
630 /* Update the pageblock-skip if the whole pageblock was scanned */ 635 /*
631 if (low_pfn == end_pfn) 636 * Update the pageblock-skip information and cached scanner pfn,
637 * if the whole pageblock was scanned without isolating any page.
638 * This is not done when pageblock was skipped due to being unsuitable
639 * for async compaction, so that eventual sync compaction can try.
640 */
641 if (low_pfn == end_pfn && !skipped_async_unsuitable)
632 update_pageblock_skip(cc, valid_page, nr_isolated, true); 642 update_pageblock_skip(cc, valid_page, nr_isolated, true);
633 643
634 trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated); 644 trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated);
@@ -660,7 +670,7 @@ static void isolate_freepages(struct zone *zone,
660 * is the end of the pageblock the migration scanner is using. 670 * is the end of the pageblock the migration scanner is using.
661 */ 671 */
662 pfn = cc->free_pfn; 672 pfn = cc->free_pfn;
663 low_pfn = cc->migrate_pfn + pageblock_nr_pages; 673 low_pfn = ALIGN(cc->migrate_pfn + 1, pageblock_nr_pages);
664 674
665 /* 675 /*
666 * Take care that if the migration scanner is at the end of the zone 676 * Take care that if the migration scanner is at the end of the zone
@@ -676,7 +686,7 @@ static void isolate_freepages(struct zone *zone,
676 * pages on cc->migratepages. We stop searching if the migrate 686 * pages on cc->migratepages. We stop searching if the migrate
677 * and free page scanners meet or enough free pages are isolated. 687 * and free page scanners meet or enough free pages are isolated.
678 */ 688 */
679 for (; pfn > low_pfn && cc->nr_migratepages > nr_freepages; 689 for (; pfn >= low_pfn && cc->nr_migratepages > nr_freepages;
680 pfn -= pageblock_nr_pages) { 690 pfn -= pageblock_nr_pages) {
681 unsigned long isolated; 691 unsigned long isolated;
682 692
@@ -738,7 +748,14 @@ static void isolate_freepages(struct zone *zone,
738 /* split_free_page does not map the pages */ 748 /* split_free_page does not map the pages */
739 map_pages(freelist); 749 map_pages(freelist);
740 750
741 cc->free_pfn = high_pfn; 751 /*
752 * If we crossed the migrate scanner, we want to keep it that way
753 * so that compact_finished() may detect this
754 */
755 if (pfn < low_pfn)
756 cc->free_pfn = max(pfn, zone->zone_start_pfn);
757 else
758 cc->free_pfn = high_pfn;
742 cc->nr_freepages = nr_freepages; 759 cc->nr_freepages = nr_freepages;
743} 760}
744 761
@@ -837,6 +854,10 @@ static int compact_finished(struct zone *zone,
837 854
838 /* Compaction run completes if the migrate and free scanner meet */ 855 /* Compaction run completes if the migrate and free scanner meet */
839 if (cc->free_pfn <= cc->migrate_pfn) { 856 if (cc->free_pfn <= cc->migrate_pfn) {
857 /* Let the next compaction start anew. */
858 zone->compact_cached_migrate_pfn = zone->zone_start_pfn;
859 zone->compact_cached_free_pfn = zone_end_pfn(zone);
860
840 /* 861 /*
841 * Mark that the PG_migrate_skip information should be cleared 862 * Mark that the PG_migrate_skip information should be cleared
842 * by kswapd when it goes to sleep. kswapd does not set the 863 * by kswapd when it goes to sleep. kswapd does not set the
@@ -947,6 +968,14 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
947 } 968 }
948 969
949 /* 970 /*
971 * Clear pageblock skip if there were failures recently and compaction
972 * is about to be retried after being deferred. kswapd does not do
973 * this reset as it'll reset the cached information when going to sleep.
974 */
975 if (compaction_restarting(zone, cc->order) && !current_is_kswapd())
976 __reset_isolation_suitable(zone);
977
978 /*
950 * Setup to move all movable pages to the end of the zone. Used cached 979 * Setup to move all movable pages to the end of the zone. Used cached
951 * information on where the scanners should start but check that it 980 * information on where the scanners should start but check that it
952 * is initialised by ensuring the values are within zone boundaries. 981 * is initialised by ensuring the values are within zone boundaries.
@@ -962,13 +991,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
962 zone->compact_cached_migrate_pfn = cc->migrate_pfn; 991 zone->compact_cached_migrate_pfn = cc->migrate_pfn;
963 } 992 }
964 993
965 /* 994 trace_mm_compaction_begin(start_pfn, cc->migrate_pfn, cc->free_pfn, end_pfn);
966 * Clear pageblock skip if there were failures recently and compaction
967 * is about to be retried after being deferred. kswapd does not do
968 * this reset as it'll reset the cached information when going to sleep.
969 */
970 if (compaction_restarting(zone, cc->order) && !current_is_kswapd())
971 __reset_isolation_suitable(zone);
972 995
973 migrate_prep_local(); 996 migrate_prep_local();
974 997
@@ -1003,7 +1026,11 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
1003 if (err) { 1026 if (err) {
1004 putback_movable_pages(&cc->migratepages); 1027 putback_movable_pages(&cc->migratepages);
1005 cc->nr_migratepages = 0; 1028 cc->nr_migratepages = 0;
1006 if (err == -ENOMEM) { 1029 /*
1030 * migrate_pages() may return -ENOMEM when scanners meet
1031 * and we want compact_finished() to detect it
1032 */
1033 if (err == -ENOMEM && cc->free_pfn > cc->migrate_pfn) {
1007 ret = COMPACT_PARTIAL; 1034 ret = COMPACT_PARTIAL;
1008 goto out; 1035 goto out;
1009 } 1036 }
@@ -1015,6 +1042,8 @@ out:
1015 cc->nr_freepages -= release_freepages(&cc->freepages); 1042 cc->nr_freepages -= release_freepages(&cc->freepages);
1016 VM_BUG_ON(cc->nr_freepages != 0); 1043 VM_BUG_ON(cc->nr_freepages != 0);
1017 1044
1045 trace_mm_compaction_end(ret);
1046
1018 return ret; 1047 return ret;
1019} 1048}
1020 1049
@@ -1120,12 +1149,11 @@ static void __compact_pgdat(pg_data_t *pgdat, struct compact_control *cc)
1120 compact_zone(zone, cc); 1149 compact_zone(zone, cc);
1121 1150
1122 if (cc->order > 0) { 1151 if (cc->order > 0) {
1123 int ok = zone_watermark_ok(zone, cc->order, 1152 if (zone_watermark_ok(zone, cc->order,
1124 low_wmark_pages(zone), 0, 0); 1153 low_wmark_pages(zone), 0, 0))
1125 if (ok && cc->order >= zone->compact_order_failed) 1154 compaction_defer_reset(zone, cc->order, false);
1126 zone->compact_order_failed = cc->order + 1;
1127 /* Currently async compaction is never deferred. */ 1155 /* Currently async compaction is never deferred. */
1128 else if (!ok && cc->sync) 1156 else if (cc->sync)
1129 defer_compaction(zone, cc->order); 1157 defer_compaction(zone, cc->order);
1130 } 1158 }
1131 1159
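
In __compact_pgdat() above, the open-coded update of compact_order_failed is replaced by compaction_defer_reset(), a helper introduced in the same series. A sketch of what that helper does (paraphrased for context, not copied verbatim from mm/compaction.c):

/* Sketch: reset compaction deferral state after success at this order. */
void compaction_defer_reset(struct zone *zone, int order, bool alloc_success)
{
	if (alloc_success) {
		zone->compact_considered = 0;
		zone->compact_defer_shift = 0;
	}
	if (order >= zone->compact_order_failed)
		zone->compact_order_failed = order + 1;
}
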
diff --git a/mm/filemap.c b/mm/filemap.c
index b7749a92021c..d56d3c145b9f 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -409,9 +409,9 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask)
409{ 409{
410 int error; 410 int error;
411 411
412 VM_BUG_ON(!PageLocked(old)); 412 VM_BUG_ON_PAGE(!PageLocked(old), old);
413 VM_BUG_ON(!PageLocked(new)); 413 VM_BUG_ON_PAGE(!PageLocked(new), new);
414 VM_BUG_ON(new->mapping); 414 VM_BUG_ON_PAGE(new->mapping, new);
415 415
416 error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM); 416 error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM);
417 if (!error) { 417 if (!error) {
@@ -461,8 +461,8 @@ int add_to_page_cache_locked(struct page *page, struct address_space *mapping,
461{ 461{
462 int error; 462 int error;
463 463
464 VM_BUG_ON(!PageLocked(page)); 464 VM_BUG_ON_PAGE(!PageLocked(page), page);
465 VM_BUG_ON(PageSwapBacked(page)); 465 VM_BUG_ON_PAGE(PageSwapBacked(page), page);
466 466
467 error = mem_cgroup_cache_charge(page, current->mm, 467 error = mem_cgroup_cache_charge(page, current->mm,
468 gfp_mask & GFP_RECLAIM_MASK); 468 gfp_mask & GFP_RECLAIM_MASK);
@@ -607,7 +607,7 @@ EXPORT_SYMBOL_GPL(add_page_wait_queue);
607 */ 607 */
608void unlock_page(struct page *page) 608void unlock_page(struct page *page)
609{ 609{
610 VM_BUG_ON(!PageLocked(page)); 610 VM_BUG_ON_PAGE(!PageLocked(page), page);
611 clear_bit_unlock(PG_locked, &page->flags); 611 clear_bit_unlock(PG_locked, &page->flags);
612 smp_mb__after_clear_bit(); 612 smp_mb__after_clear_bit();
613 wake_up_page(page, PG_locked); 613 wake_up_page(page, PG_locked);
@@ -760,7 +760,7 @@ repeat:
760 page_cache_release(page); 760 page_cache_release(page);
761 goto repeat; 761 goto repeat;
762 } 762 }
763 VM_BUG_ON(page->index != offset); 763 VM_BUG_ON_PAGE(page->index != offset, page);
764 } 764 }
765 return page; 765 return page;
766} 766}
@@ -1428,30 +1428,28 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1428 if (!count) 1428 if (!count)
1429 goto out; /* skip atime */ 1429 goto out; /* skip atime */
1430 size = i_size_read(inode); 1430 size = i_size_read(inode);
1431 if (pos < size) { 1431 retval = filemap_write_and_wait_range(mapping, pos,
1432 retval = filemap_write_and_wait_range(mapping, pos,
1433 pos + iov_length(iov, nr_segs) - 1); 1432 pos + iov_length(iov, nr_segs) - 1);
1434 if (!retval) { 1433 if (!retval) {
1435 retval = mapping->a_ops->direct_IO(READ, iocb, 1434 retval = mapping->a_ops->direct_IO(READ, iocb,
1436 iov, pos, nr_segs); 1435 iov, pos, nr_segs);
1437 } 1436 }
1438 if (retval > 0) { 1437 if (retval > 0) {
1439 *ppos = pos + retval; 1438 *ppos = pos + retval;
1440 count -= retval; 1439 count -= retval;
1441 } 1440 }
1442 1441
1443 /* 1442 /*
1444 * Btrfs can have a short DIO read if we encounter 1443 * Btrfs can have a short DIO read if we encounter
1445 * compressed extents, so if there was an error, or if 1444 * compressed extents, so if there was an error, or if
1446 * we've already read everything we wanted to, or if 1445 * we've already read everything we wanted to, or if
1447 * there was a short read because we hit EOF, go ahead 1446 * there was a short read because we hit EOF, go ahead
1448 * and return. Otherwise fallthrough to buffered io for 1447 * and return. Otherwise fallthrough to buffered io for
1449 * the rest of the read. 1448 * the rest of the read.
1450 */ 1449 */
1451 if (retval < 0 || !count || *ppos >= size) { 1450 if (retval < 0 || !count || *ppos >= size) {
1452 file_accessed(filp); 1451 file_accessed(filp);
1453 goto out; 1452 goto out;
1454 }
1455 } 1453 }
1456 } 1454 }
1457 1455
@@ -1656,7 +1654,7 @@ retry_find:
1656 put_page(page); 1654 put_page(page);
1657 goto retry_find; 1655 goto retry_find;
1658 } 1656 }
1659 VM_BUG_ON(page->index != offset); 1657 VM_BUG_ON_PAGE(page->index != offset, page);
1660 1658
1661 /* 1659 /*
1662 * We have a locked page in the page cache, now we need to check 1660 * We have a locked page in the page cache, now we need to check
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 95d1acb0f3d2..82166bf974e1 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -130,8 +130,14 @@ static int set_recommended_min_free_kbytes(void)
130 (unsigned long) nr_free_buffer_pages() / 20); 130 (unsigned long) nr_free_buffer_pages() / 20);
131 recommended_min <<= (PAGE_SHIFT-10); 131 recommended_min <<= (PAGE_SHIFT-10);
132 132
133 if (recommended_min > min_free_kbytes) 133 if (recommended_min > min_free_kbytes) {
134 if (user_min_free_kbytes >= 0)
135 pr_info("raising min_free_kbytes from %d to %lu "
136 "to help transparent hugepage allocations\n",
137 min_free_kbytes, recommended_min);
138
134 min_free_kbytes = recommended_min; 139 min_free_kbytes = recommended_min;
140 }
135 setup_per_zone_wmarks(); 141 setup_per_zone_wmarks();
136 return 0; 142 return 0;
137} 143}
@@ -655,7 +661,7 @@ out:
655 hugepage_exit_sysfs(hugepage_kobj); 661 hugepage_exit_sysfs(hugepage_kobj);
656 return err; 662 return err;
657} 663}
658module_init(hugepage_init) 664subsys_initcall(hugepage_init);
659 665
660static int __init setup_transparent_hugepage(char *str) 666static int __init setup_transparent_hugepage(char *str)
661{ 667{
@@ -712,7 +718,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
712 pgtable_t pgtable; 718 pgtable_t pgtable;
713 spinlock_t *ptl; 719 spinlock_t *ptl;
714 720
715 VM_BUG_ON(!PageCompound(page)); 721 VM_BUG_ON_PAGE(!PageCompound(page), page);
716 pgtable = pte_alloc_one(mm, haddr); 722 pgtable = pte_alloc_one(mm, haddr);
717 if (unlikely(!pgtable)) 723 if (unlikely(!pgtable))
718 return VM_FAULT_OOM; 724 return VM_FAULT_OOM;
@@ -893,7 +899,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
893 goto out; 899 goto out;
894 } 900 }
895 src_page = pmd_page(pmd); 901 src_page = pmd_page(pmd);
896 VM_BUG_ON(!PageHead(src_page)); 902 VM_BUG_ON_PAGE(!PageHead(src_page), src_page);
897 get_page(src_page); 903 get_page(src_page);
898 page_dup_rmap(src_page); 904 page_dup_rmap(src_page);
899 add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR); 905 add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
@@ -1067,7 +1073,7 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
1067 ptl = pmd_lock(mm, pmd); 1073 ptl = pmd_lock(mm, pmd);
1068 if (unlikely(!pmd_same(*pmd, orig_pmd))) 1074 if (unlikely(!pmd_same(*pmd, orig_pmd)))
1069 goto out_free_pages; 1075 goto out_free_pages;
1070 VM_BUG_ON(!PageHead(page)); 1076 VM_BUG_ON_PAGE(!PageHead(page), page);
1071 1077
1072 pmdp_clear_flush(vma, haddr, pmd); 1078 pmdp_clear_flush(vma, haddr, pmd);
1073 /* leave pmd empty until pte is filled */ 1079 /* leave pmd empty until pte is filled */
@@ -1133,7 +1139,7 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
1133 goto out_unlock; 1139 goto out_unlock;
1134 1140
1135 page = pmd_page(orig_pmd); 1141 page = pmd_page(orig_pmd);
1136 VM_BUG_ON(!PageCompound(page) || !PageHead(page)); 1142 VM_BUG_ON_PAGE(!PageCompound(page) || !PageHead(page), page);
1137 if (page_mapcount(page) == 1) { 1143 if (page_mapcount(page) == 1) {
1138 pmd_t entry; 1144 pmd_t entry;
1139 entry = pmd_mkyoung(orig_pmd); 1145 entry = pmd_mkyoung(orig_pmd);
@@ -1211,7 +1217,7 @@ alloc:
1211 add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR); 1217 add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR);
1212 put_huge_zero_page(); 1218 put_huge_zero_page();
1213 } else { 1219 } else {
1214 VM_BUG_ON(!PageHead(page)); 1220 VM_BUG_ON_PAGE(!PageHead(page), page);
1215 page_remove_rmap(page); 1221 page_remove_rmap(page);
1216 put_page(page); 1222 put_page(page);
1217 } 1223 }
@@ -1249,7 +1255,7 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
1249 goto out; 1255 goto out;
1250 1256
1251 page = pmd_page(*pmd); 1257 page = pmd_page(*pmd);
1252 VM_BUG_ON(!PageHead(page)); 1258 VM_BUG_ON_PAGE(!PageHead(page), page);
1253 if (flags & FOLL_TOUCH) { 1259 if (flags & FOLL_TOUCH) {
1254 pmd_t _pmd; 1260 pmd_t _pmd;
1255 /* 1261 /*
@@ -1274,7 +1280,7 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
1274 } 1280 }
1275 } 1281 }
1276 page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT; 1282 page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT;
1277 VM_BUG_ON(!PageCompound(page)); 1283 VM_BUG_ON_PAGE(!PageCompound(page), page);
1278 if (flags & FOLL_GET) 1284 if (flags & FOLL_GET)
1279 get_page_foll(page); 1285 get_page_foll(page);
1280 1286
@@ -1432,9 +1438,9 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
1432 } else { 1438 } else {
1433 page = pmd_page(orig_pmd); 1439 page = pmd_page(orig_pmd);
1434 page_remove_rmap(page); 1440 page_remove_rmap(page);
1435 VM_BUG_ON(page_mapcount(page) < 0); 1441 VM_BUG_ON_PAGE(page_mapcount(page) < 0, page);
1436 add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR); 1442 add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
1437 VM_BUG_ON(!PageHead(page)); 1443 VM_BUG_ON_PAGE(!PageHead(page), page);
1438 atomic_long_dec(&tlb->mm->nr_ptes); 1444 atomic_long_dec(&tlb->mm->nr_ptes);
1439 spin_unlock(ptl); 1445 spin_unlock(ptl);
1440 tlb_remove_page(tlb, page); 1446 tlb_remove_page(tlb, page);
@@ -1502,19 +1508,15 @@ int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma,
1502 spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING); 1508 spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
1503 pmd = pmdp_get_and_clear(mm, old_addr, old_pmd); 1509 pmd = pmdp_get_and_clear(mm, old_addr, old_pmd);
1504 VM_BUG_ON(!pmd_none(*new_pmd)); 1510 VM_BUG_ON(!pmd_none(*new_pmd));
1505 set_pmd_at(mm, new_addr, new_pmd, pmd_mksoft_dirty(pmd));
1506 if (new_ptl != old_ptl) {
1507 pgtable_t pgtable;
1508 1511
1509 /* 1512 if (pmd_move_must_withdraw(new_ptl, old_ptl)) {
1510 * Move preallocated PTE page table if new_pmd is on 1513 pgtable_t pgtable;
1511 * different PMD page table.
1512 */
1513 pgtable = pgtable_trans_huge_withdraw(mm, old_pmd); 1514 pgtable = pgtable_trans_huge_withdraw(mm, old_pmd);
1514 pgtable_trans_huge_deposit(mm, new_pmd, pgtable); 1515 pgtable_trans_huge_deposit(mm, new_pmd, pgtable);
1515
1516 spin_unlock(new_ptl);
1517 } 1516 }
1517 set_pmd_at(mm, new_addr, new_pmd, pmd_mksoft_dirty(pmd));
1518 if (new_ptl != old_ptl)
1519 spin_unlock(new_ptl);
1518 spin_unlock(old_ptl); 1520 spin_unlock(old_ptl);
1519 } 1521 }
1520out: 1522out:
@@ -2176,9 +2178,9 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
2176 if (unlikely(!page)) 2178 if (unlikely(!page))
2177 goto out; 2179 goto out;
2178 2180
2179 VM_BUG_ON(PageCompound(page)); 2181 VM_BUG_ON_PAGE(PageCompound(page), page);
2180 BUG_ON(!PageAnon(page)); 2182 VM_BUG_ON_PAGE(!PageAnon(page), page);
2181 VM_BUG_ON(!PageSwapBacked(page)); 2183 VM_BUG_ON_PAGE(!PageSwapBacked(page), page);
2182 2184
2183 /* cannot use mapcount: can't collapse if there's a gup pin */ 2185 /* cannot use mapcount: can't collapse if there's a gup pin */
2184 if (page_count(page) != 1) 2186 if (page_count(page) != 1)
@@ -2201,8 +2203,8 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
2201 } 2203 }
2202 /* 0 stands for page_is_file_cache(page) == false */ 2204 /* 0 stands for page_is_file_cache(page) == false */
2203 inc_zone_page_state(page, NR_ISOLATED_ANON + 0); 2205 inc_zone_page_state(page, NR_ISOLATED_ANON + 0);
2204 VM_BUG_ON(!PageLocked(page)); 2206 VM_BUG_ON_PAGE(!PageLocked(page), page);
2205 VM_BUG_ON(PageLRU(page)); 2207 VM_BUG_ON_PAGE(PageLRU(page), page);
2206 2208
2207 /* If there is no mapped pte young don't collapse the page */ 2209 /* If there is no mapped pte young don't collapse the page */
2208 if (pte_young(pteval) || PageReferenced(page) || 2210 if (pte_young(pteval) || PageReferenced(page) ||
@@ -2232,7 +2234,7 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page,
2232 } else { 2234 } else {
2233 src_page = pte_page(pteval); 2235 src_page = pte_page(pteval);
2234 copy_user_highpage(page, src_page, address, vma); 2236 copy_user_highpage(page, src_page, address, vma);
2235 VM_BUG_ON(page_mapcount(src_page) != 1); 2237 VM_BUG_ON_PAGE(page_mapcount(src_page) != 1, src_page);
2236 release_pte_page(src_page); 2238 release_pte_page(src_page);
2237 /* 2239 /*
2238 * ptl mostly unnecessary, but preempt has to 2240 * ptl mostly unnecessary, but preempt has to
@@ -2311,7 +2313,7 @@ static struct page
2311 struct vm_area_struct *vma, unsigned long address, 2313 struct vm_area_struct *vma, unsigned long address,
2312 int node) 2314 int node)
2313{ 2315{
2314 VM_BUG_ON(*hpage); 2316 VM_BUG_ON_PAGE(*hpage, *hpage);
2315 /* 2317 /*
2316 * Allocate the page while the vma is still valid and under 2318 * Allocate the page while the vma is still valid and under
2317 * the mmap_sem read mode so there is no memory allocation 2319 * the mmap_sem read mode so there is no memory allocation
@@ -2580,7 +2582,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
2580 */ 2582 */
2581 node = page_to_nid(page); 2583 node = page_to_nid(page);
2582 khugepaged_node_load[node]++; 2584 khugepaged_node_load[node]++;
2583 VM_BUG_ON(PageCompound(page)); 2585 VM_BUG_ON_PAGE(PageCompound(page), page);
2584 if (!PageLRU(page) || PageLocked(page) || !PageAnon(page)) 2586 if (!PageLRU(page) || PageLocked(page) || !PageAnon(page))
2585 goto out_unmap; 2587 goto out_unmap;
2586 /* cannot use mapcount: can't collapse if there's a gup pin */ 2588 /* cannot use mapcount: can't collapse if there's a gup pin */
@@ -2876,7 +2878,7 @@ again:
2876 return; 2878 return;
2877 } 2879 }
2878 page = pmd_page(*pmd); 2880 page = pmd_page(*pmd);
2879 VM_BUG_ON(!page_count(page)); 2881 VM_BUG_ON_PAGE(!page_count(page), page);
2880 get_page(page); 2882 get_page(page);
2881 spin_unlock(ptl); 2883 spin_unlock(ptl);
2882 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); 2884 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index dee6cf4e6d34..c01cb9fedb18 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -584,7 +584,7 @@ static void update_and_free_page(struct hstate *h, struct page *page)
584 1 << PG_active | 1 << PG_reserved | 584 1 << PG_active | 1 << PG_reserved |
585 1 << PG_private | 1 << PG_writeback); 585 1 << PG_private | 1 << PG_writeback);
586 } 586 }
587 VM_BUG_ON(hugetlb_cgroup_from_page(page)); 587 VM_BUG_ON_PAGE(hugetlb_cgroup_from_page(page), page);
588 set_compound_page_dtor(page, NULL); 588 set_compound_page_dtor(page, NULL);
589 set_page_refcounted(page); 589 set_page_refcounted(page);
590 arch_release_hugepage(page); 590 arch_release_hugepage(page);
@@ -690,15 +690,11 @@ static void prep_compound_gigantic_page(struct page *page, unsigned long order)
690 */ 690 */
691int PageHuge(struct page *page) 691int PageHuge(struct page *page)
692{ 692{
693 compound_page_dtor *dtor;
694
695 if (!PageCompound(page)) 693 if (!PageCompound(page))
696 return 0; 694 return 0;
697 695
698 page = compound_head(page); 696 page = compound_head(page);
699 dtor = get_compound_page_dtor(page); 697 return get_compound_page_dtor(page) == free_huge_page;
700
701 return dtor == free_huge_page;
702} 698}
703EXPORT_SYMBOL_GPL(PageHuge); 699EXPORT_SYMBOL_GPL(PageHuge);
704 700
@@ -708,16 +704,11 @@ EXPORT_SYMBOL_GPL(PageHuge);
708 */ 704 */
709int PageHeadHuge(struct page *page_head) 705int PageHeadHuge(struct page *page_head)
710{ 706{
711 compound_page_dtor *dtor;
712
713 if (!PageHead(page_head)) 707 if (!PageHead(page_head))
714 return 0; 708 return 0;
715 709
716 dtor = get_compound_page_dtor(page_head); 710 return get_compound_page_dtor(page_head) == free_huge_page;
717
718 return dtor == free_huge_page;
719} 711}
720EXPORT_SYMBOL_GPL(PageHeadHuge);
721 712
722pgoff_t __basepage_index(struct page *page) 713pgoff_t __basepage_index(struct page *page)
723{ 714{
@@ -1098,7 +1089,7 @@ retry:
1098 * no users -- drop the buddy allocator's reference. 1089 * no users -- drop the buddy allocator's reference.
1099 */ 1090 */
1100 put_page_testzero(page); 1091 put_page_testzero(page);
1101 VM_BUG_ON(page_count(page)); 1092 VM_BUG_ON_PAGE(page_count(page), page);
1102 enqueue_huge_page(h, page); 1093 enqueue_huge_page(h, page);
1103 } 1094 }
1104free: 1095free:
@@ -1280,9 +1271,9 @@ int __weak alloc_bootmem_huge_page(struct hstate *h)
1280 for_each_node_mask_to_alloc(h, nr_nodes, node, &node_states[N_MEMORY]) { 1271 for_each_node_mask_to_alloc(h, nr_nodes, node, &node_states[N_MEMORY]) {
1281 void *addr; 1272 void *addr;
1282 1273
1283 addr = __alloc_bootmem_node_nopanic(NODE_DATA(node), 1274 addr = memblock_virt_alloc_try_nid_nopanic(
1284 huge_page_size(h), huge_page_size(h), 0); 1275 huge_page_size(h), huge_page_size(h),
1285 1276 0, BOOTMEM_ALLOC_ACCESSIBLE, node);
1286 if (addr) { 1277 if (addr) {
1287 /* 1278 /*
1288 * Use the beginning of the huge page to store the 1279 * Use the beginning of the huge page to store the
@@ -1322,8 +1313,8 @@ static void __init gather_bootmem_prealloc(void)
1322 1313
1323#ifdef CONFIG_HIGHMEM 1314#ifdef CONFIG_HIGHMEM
1324 page = pfn_to_page(m->phys >> PAGE_SHIFT); 1315 page = pfn_to_page(m->phys >> PAGE_SHIFT);
1325 free_bootmem_late((unsigned long)m, 1316 memblock_free_late(__pa(m),
1326 sizeof(struct huge_bootmem_page)); 1317 sizeof(struct huge_bootmem_page));
1327#else 1318#else
1328 page = virt_to_page(m); 1319 page = virt_to_page(m);
1329#endif 1320#endif
@@ -2355,17 +2346,27 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
2355 int cow; 2346 int cow;
2356 struct hstate *h = hstate_vma(vma); 2347 struct hstate *h = hstate_vma(vma);
2357 unsigned long sz = huge_page_size(h); 2348 unsigned long sz = huge_page_size(h);
2349 unsigned long mmun_start; /* For mmu_notifiers */
2350 unsigned long mmun_end; /* For mmu_notifiers */
2351 int ret = 0;
2358 2352
2359 cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; 2353 cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
2360 2354
2355 mmun_start = vma->vm_start;
2356 mmun_end = vma->vm_end;
2357 if (cow)
2358 mmu_notifier_invalidate_range_start(src, mmun_start, mmun_end);
2359
2361 for (addr = vma->vm_start; addr < vma->vm_end; addr += sz) { 2360 for (addr = vma->vm_start; addr < vma->vm_end; addr += sz) {
2362 spinlock_t *src_ptl, *dst_ptl; 2361 spinlock_t *src_ptl, *dst_ptl;
2363 src_pte = huge_pte_offset(src, addr); 2362 src_pte = huge_pte_offset(src, addr);
2364 if (!src_pte) 2363 if (!src_pte)
2365 continue; 2364 continue;
2366 dst_pte = huge_pte_alloc(dst, addr, sz); 2365 dst_pte = huge_pte_alloc(dst, addr, sz);
2367 if (!dst_pte) 2366 if (!dst_pte) {
2368 goto nomem; 2367 ret = -ENOMEM;
2368 break;
2369 }
2369 2370
2370 /* If the pagetables are shared don't copy or take references */ 2371 /* If the pagetables are shared don't copy or take references */
2371 if (dst_pte == src_pte) 2372 if (dst_pte == src_pte)
@@ -2386,10 +2387,11 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
2386 spin_unlock(src_ptl); 2387 spin_unlock(src_ptl);
2387 spin_unlock(dst_ptl); 2388 spin_unlock(dst_ptl);
2388 } 2389 }
2389 return 0;
2390 2390
2391nomem: 2391 if (cow)
2392 return -ENOMEM; 2392 mmu_notifier_invalidate_range_end(src, mmun_start, mmun_end);
2393
2394 return ret;
2393} 2395}
2394 2396
2395static int is_hugetlb_entry_migration(pte_t pte) 2397static int is_hugetlb_entry_migration(pte_t pte)
@@ -3079,7 +3081,7 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
3079same_page: 3081same_page:
3080 if (pages) { 3082 if (pages) {
3081 pages[i] = mem_map_offset(page, pfn_offset); 3083 pages[i] = mem_map_offset(page, pfn_offset);
3082 get_page(pages[i]); 3084 get_page_foll(pages[i]);
3083 } 3085 }
3084 3086
3085 if (vmas) 3087 if (vmas)
@@ -3501,7 +3503,7 @@ int dequeue_hwpoisoned_huge_page(struct page *hpage)
3501 3503
3502bool isolate_huge_page(struct page *page, struct list_head *list) 3504bool isolate_huge_page(struct page *page, struct list_head *list)
3503{ 3505{
3504 VM_BUG_ON(!PageHead(page)); 3506 VM_BUG_ON_PAGE(!PageHead(page), page);
3505 if (!get_page_unless_zero(page)) 3507 if (!get_page_unless_zero(page))
3506 return false; 3508 return false;
3507 spin_lock(&hugetlb_lock); 3509 spin_lock(&hugetlb_lock);
@@ -3512,7 +3514,7 @@ bool isolate_huge_page(struct page *page, struct list_head *list)
3512 3514
3513void putback_active_hugepage(struct page *page) 3515void putback_active_hugepage(struct page *page)
3514{ 3516{
3515 VM_BUG_ON(!PageHead(page)); 3517 VM_BUG_ON_PAGE(!PageHead(page), page);
3516 spin_lock(&hugetlb_lock); 3518 spin_lock(&hugetlb_lock);
3517 list_move_tail(&page->lru, &(page_hstate(page))->hugepage_activelist); 3519 list_move_tail(&page->lru, &(page_hstate(page))->hugepage_activelist);
3518 spin_unlock(&hugetlb_lock); 3520 spin_unlock(&hugetlb_lock);
@@ -3521,7 +3523,7 @@ void putback_active_hugepage(struct page *page)
3521 3523
3522bool is_hugepage_active(struct page *page) 3524bool is_hugepage_active(struct page *page)
3523{ 3525{
3524 VM_BUG_ON(!PageHuge(page)); 3526 VM_BUG_ON_PAGE(!PageHuge(page), page);
3525 /* 3527 /*
3526 * This function can be called for a tail page because the caller, 3528 * This function can be called for a tail page because the caller,
3527 * scan_movable_pages, scans through a given pfn-range which typically 3529 * scan_movable_pages, scans through a given pfn-range which typically
diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c
index bda8e44f6fde..cb00829bb466 100644
--- a/mm/hugetlb_cgroup.c
+++ b/mm/hugetlb_cgroup.c
@@ -242,22 +242,16 @@ void hugetlb_cgroup_uncharge_cgroup(int idx, unsigned long nr_pages,
242 return; 242 return;
243} 243}
244 244
245static ssize_t hugetlb_cgroup_read(struct cgroup_subsys_state *css, 245static u64 hugetlb_cgroup_read_u64(struct cgroup_subsys_state *css,
246 struct cftype *cft, struct file *file, 246 struct cftype *cft)
247 char __user *buf, size_t nbytes,
248 loff_t *ppos)
249{ 247{
250 u64 val; 248 int idx, name;
251 char str[64];
252 int idx, name, len;
253 struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(css); 249 struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(css);
254 250
255 idx = MEMFILE_IDX(cft->private); 251 idx = MEMFILE_IDX(cft->private);
256 name = MEMFILE_ATTR(cft->private); 252 name = MEMFILE_ATTR(cft->private);
257 253
258 val = res_counter_read_u64(&h_cg->hugepage[idx], name); 254 return res_counter_read_u64(&h_cg->hugepage[idx], name);
259 len = scnprintf(str, sizeof(str), "%llu\n", (unsigned long long)val);
260 return simple_read_from_buffer(buf, nbytes, ppos, str, len);
261} 255}
262 256
263static int hugetlb_cgroup_write(struct cgroup_subsys_state *css, 257static int hugetlb_cgroup_write(struct cgroup_subsys_state *css,
@@ -337,28 +331,28 @@ static void __init __hugetlb_cgroup_file_init(int idx)
337 cft = &h->cgroup_files[0]; 331 cft = &h->cgroup_files[0];
338 snprintf(cft->name, MAX_CFTYPE_NAME, "%s.limit_in_bytes", buf); 332 snprintf(cft->name, MAX_CFTYPE_NAME, "%s.limit_in_bytes", buf);
339 cft->private = MEMFILE_PRIVATE(idx, RES_LIMIT); 333 cft->private = MEMFILE_PRIVATE(idx, RES_LIMIT);
340 cft->read = hugetlb_cgroup_read; 334 cft->read_u64 = hugetlb_cgroup_read_u64;
341 cft->write_string = hugetlb_cgroup_write; 335 cft->write_string = hugetlb_cgroup_write;
342 336
343 /* Add the usage file */ 337 /* Add the usage file */
344 cft = &h->cgroup_files[1]; 338 cft = &h->cgroup_files[1];
345 snprintf(cft->name, MAX_CFTYPE_NAME, "%s.usage_in_bytes", buf); 339 snprintf(cft->name, MAX_CFTYPE_NAME, "%s.usage_in_bytes", buf);
346 cft->private = MEMFILE_PRIVATE(idx, RES_USAGE); 340 cft->private = MEMFILE_PRIVATE(idx, RES_USAGE);
347 cft->read = hugetlb_cgroup_read; 341 cft->read_u64 = hugetlb_cgroup_read_u64;
348 342
349 /* Add the MAX usage file */ 343 /* Add the MAX usage file */
350 cft = &h->cgroup_files[2]; 344 cft = &h->cgroup_files[2];
351 snprintf(cft->name, MAX_CFTYPE_NAME, "%s.max_usage_in_bytes", buf); 345 snprintf(cft->name, MAX_CFTYPE_NAME, "%s.max_usage_in_bytes", buf);
352 cft->private = MEMFILE_PRIVATE(idx, RES_MAX_USAGE); 346 cft->private = MEMFILE_PRIVATE(idx, RES_MAX_USAGE);
353 cft->trigger = hugetlb_cgroup_reset; 347 cft->trigger = hugetlb_cgroup_reset;
354 cft->read = hugetlb_cgroup_read; 348 cft->read_u64 = hugetlb_cgroup_read_u64;
355 349
356 /* Add the failcntfile */ 350 /* Add the failcntfile */
357 cft = &h->cgroup_files[3]; 351 cft = &h->cgroup_files[3];
358 snprintf(cft->name, MAX_CFTYPE_NAME, "%s.failcnt", buf); 352 snprintf(cft->name, MAX_CFTYPE_NAME, "%s.failcnt", buf);
359 cft->private = MEMFILE_PRIVATE(idx, RES_FAILCNT); 353 cft->private = MEMFILE_PRIVATE(idx, RES_FAILCNT);
360 cft->trigger = hugetlb_cgroup_reset; 354 cft->trigger = hugetlb_cgroup_reset;
361 cft->read = hugetlb_cgroup_read; 355 cft->read_u64 = hugetlb_cgroup_read_u64;
362 356
363 /* NULL terminate the last cft */ 357 /* NULL terminate the last cft */
364 cft = &h->cgroup_files[4]; 358 cft = &h->cgroup_files[4];
@@ -396,7 +390,7 @@ void hugetlb_cgroup_migrate(struct page *oldhpage, struct page *newhpage)
396 if (hugetlb_cgroup_disabled()) 390 if (hugetlb_cgroup_disabled())
397 return; 391 return;
398 392
399 VM_BUG_ON(!PageHuge(oldhpage)); 393 VM_BUG_ON_PAGE(!PageHuge(oldhpage), oldhpage);
400 spin_lock(&hugetlb_lock); 394 spin_lock(&hugetlb_lock);
401 h_cg = hugetlb_cgroup_from_page(oldhpage); 395 h_cg = hugetlb_cgroup_from_page(oldhpage);
402 set_hugetlb_cgroup(oldhpage, NULL); 396 set_hugetlb_cgroup(oldhpage, NULL);
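
The hugetlb_cgroup change above drops the hand-rolled read file operation in favour of the cgroup core's read_u64 callback, which takes care of formatting the value. For context, a cftype using .read_u64 is typically declared like the sketch below (illustrative names; the real code fills h->cgroup_files[] at runtime as shown in the hunk):

/* Illustrative static table; hugetlb_cgroup builds its entries at runtime. */
static struct cftype demo_hugetlb_files[] = {
	{
		.name = "usage_in_bytes",
		.private = MEMFILE_PRIVATE(0, RES_USAGE),
		.read_u64 = hugetlb_cgroup_read_u64,
	},
	{ }	/* terminate the array */
};
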
diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c
index 4c84678371eb..95487c71cad5 100644
--- a/mm/hwpoison-inject.c
+++ b/mm/hwpoison-inject.c
@@ -55,7 +55,7 @@ static int hwpoison_inject(void *data, u64 val)
55 return 0; 55 return 0;
56 56
57inject: 57inject:
58 printk(KERN_INFO "Injecting memory failure at pfn %lx\n", pfn); 58 pr_info("Injecting memory failure at pfn %#lx\n", pfn);
59 return memory_failure(pfn, 18, MF_COUNT_INCREASED); 59 return memory_failure(pfn, 18, MF_COUNT_INCREASED);
60} 60}
61 61
diff --git a/mm/internal.h b/mm/internal.h
index 684f7aa9692a..29e1e761f9eb 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -27,8 +27,8 @@ static inline void set_page_count(struct page *page, int v)
27 */ 27 */
28static inline void set_page_refcounted(struct page *page) 28static inline void set_page_refcounted(struct page *page)
29{ 29{
30 VM_BUG_ON(PageTail(page)); 30 VM_BUG_ON_PAGE(PageTail(page), page);
31 VM_BUG_ON(atomic_read(&page->_count)); 31 VM_BUG_ON_PAGE(atomic_read(&page->_count), page);
32 set_page_count(page, 1); 32 set_page_count(page, 1);
33} 33}
34 34
@@ -46,12 +46,10 @@ static inline void __get_page_tail_foll(struct page *page,
46 * speculative page access (like in 46 * speculative page access (like in
47 * page_cache_get_speculative()) on tail pages. 47 * page_cache_get_speculative()) on tail pages.
48 */ 48 */
49 VM_BUG_ON(atomic_read(&page->first_page->_count) <= 0); 49 VM_BUG_ON_PAGE(atomic_read(&page->first_page->_count) <= 0, page);
50 VM_BUG_ON(atomic_read(&page->_count) != 0);
51 VM_BUG_ON(page_mapcount(page) < 0);
52 if (get_page_head) 50 if (get_page_head)
53 atomic_inc(&page->first_page->_count); 51 atomic_inc(&page->first_page->_count);
54 atomic_inc(&page->_mapcount); 52 get_huge_page_tail(page);
55} 53}
56 54
57/* 55/*
@@ -73,7 +71,7 @@ static inline void get_page_foll(struct page *page)
73 * Getting a normal page or the head of a compound page 71 * Getting a normal page or the head of a compound page
74 * requires to already have an elevated page->_count. 72 * requires to already have an elevated page->_count.
75 */ 73 */
76 VM_BUG_ON(atomic_read(&page->_count) <= 0); 74 VM_BUG_ON_PAGE(atomic_read(&page->_count) <= 0, page);
77 atomic_inc(&page->_count); 75 atomic_inc(&page->_count);
78 } 76 }
79} 77}
@@ -85,7 +83,6 @@ extern unsigned long highest_memmap_pfn;
85 */ 83 */
86extern int isolate_lru_page(struct page *page); 84extern int isolate_lru_page(struct page *page);
87extern void putback_lru_page(struct page *page); 85extern void putback_lru_page(struct page *page);
88extern unsigned long zone_reclaimable_pages(struct zone *zone);
89extern bool zone_reclaimable(struct zone *zone); 86extern bool zone_reclaimable(struct zone *zone);
90 87
91/* 88/*
@@ -101,6 +98,7 @@ extern void prep_compound_page(struct page *page, unsigned long order);
101#ifdef CONFIG_MEMORY_FAILURE 98#ifdef CONFIG_MEMORY_FAILURE
102extern bool is_free_buddy_page(struct page *page); 99extern bool is_free_buddy_page(struct page *page);
103#endif 100#endif
101extern int user_min_free_kbytes;
104 102
105#if defined CONFIG_COMPACTION || defined CONFIG_CMA 103#if defined CONFIG_COMPACTION || defined CONFIG_CMA
106 104
@@ -144,9 +142,11 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
144#endif 142#endif
145 143
146/* 144/*
147 * function for dealing with page's order in buddy system. 145 * This function returns the order of a free page in the buddy system. In
148 * zone->lock is already acquired when we use these. 146 * general, page_zone(page)->lock must be held by the caller to prevent the
149 * So, we don't need atomic page->flags operations here. 147 * page from being allocated in parallel and returning garbage as the order.
148 * If a caller does not hold page_zone(page)->lock, it must guarantee that the
149 * page cannot be allocated or merged in parallel.
150 */ 150 */
151static inline unsigned long page_order(struct page *page) 151static inline unsigned long page_order(struct page *page)
152{ 152{
@@ -175,7 +175,7 @@ static inline void munlock_vma_pages_all(struct vm_area_struct *vma)
175static inline int mlocked_vma_newpage(struct vm_area_struct *vma, 175static inline int mlocked_vma_newpage(struct vm_area_struct *vma,
176 struct page *page) 176 struct page *page)
177{ 177{
178 VM_BUG_ON(PageLRU(page)); 178 VM_BUG_ON_PAGE(PageLRU(page), page);
179 179
180 if (likely((vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) != VM_LOCKED)) 180 if (likely((vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) != VM_LOCKED))
181 return 0; 181 return 0;
diff --git a/mm/ksm.c b/mm/ksm.c
index 175fff79dc95..aa4c7c7250c1 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -1891,21 +1891,24 @@ struct page *ksm_might_need_to_copy(struct page *page,
1891 return new_page; 1891 return new_page;
1892} 1892}
1893 1893
1894int page_referenced_ksm(struct page *page, struct mem_cgroup *memcg, 1894int rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc)
1895 unsigned long *vm_flags)
1896{ 1895{
1897 struct stable_node *stable_node; 1896 struct stable_node *stable_node;
1898 struct rmap_item *rmap_item; 1897 struct rmap_item *rmap_item;
1899 unsigned int mapcount = page_mapcount(page); 1898 int ret = SWAP_AGAIN;
1900 int referenced = 0;
1901 int search_new_forks = 0; 1899 int search_new_forks = 0;
1902 1900
1903 VM_BUG_ON(!PageKsm(page)); 1901 VM_BUG_ON_PAGE(!PageKsm(page), page);
1904 VM_BUG_ON(!PageLocked(page)); 1902
1903 /*
1904 * Rely on the page lock to protect against concurrent modifications
1905 * to that page's node of the stable tree.
1906 */
1907 VM_BUG_ON_PAGE(!PageLocked(page), page);
1905 1908
1906 stable_node = page_stable_node(page); 1909 stable_node = page_stable_node(page);
1907 if (!stable_node) 1910 if (!stable_node)
1908 return 0; 1911 return ret;
1909again: 1912again:
1910 hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) { 1913 hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) {
1911 struct anon_vma *anon_vma = rmap_item->anon_vma; 1914 struct anon_vma *anon_vma = rmap_item->anon_vma;
@@ -1928,113 +1931,16 @@ again:
1928 if ((rmap_item->mm == vma->vm_mm) == search_new_forks) 1931 if ((rmap_item->mm == vma->vm_mm) == search_new_forks)
1929 continue; 1932 continue;
1930 1933
1931 if (memcg && !mm_match_cgroup(vma->vm_mm, memcg)) 1934 if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg))
1932 continue;
1933
1934 referenced += page_referenced_one(page, vma,
1935 rmap_item->address, &mapcount, vm_flags);
1936 if (!search_new_forks || !mapcount)
1937 break;
1938 }
1939 anon_vma_unlock_read(anon_vma);
1940 if (!mapcount)
1941 goto out;
1942 }
1943 if (!search_new_forks++)
1944 goto again;
1945out:
1946 return referenced;
1947}
1948
1949int try_to_unmap_ksm(struct page *page, enum ttu_flags flags)
1950{
1951 struct stable_node *stable_node;
1952 struct rmap_item *rmap_item;
1953 int ret = SWAP_AGAIN;
1954 int search_new_forks = 0;
1955
1956 VM_BUG_ON(!PageKsm(page));
1957 VM_BUG_ON(!PageLocked(page));
1958
1959 stable_node = page_stable_node(page);
1960 if (!stable_node)
1961 return SWAP_FAIL;
1962again:
1963 hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) {
1964 struct anon_vma *anon_vma = rmap_item->anon_vma;
1965 struct anon_vma_chain *vmac;
1966 struct vm_area_struct *vma;
1967
1968 anon_vma_lock_read(anon_vma);
1969 anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root,
1970 0, ULONG_MAX) {
1971 vma = vmac->vma;
1972 if (rmap_item->address < vma->vm_start ||
1973 rmap_item->address >= vma->vm_end)
1974 continue;
1975 /*
1976 * Initially we examine only the vma which covers this
1977 * rmap_item; but later, if there is still work to do,
1978 * we examine covering vmas in other mms: in case they
1979 * were forked from the original since ksmd passed.
1980 */
1981 if ((rmap_item->mm == vma->vm_mm) == search_new_forks)
1982 continue; 1935 continue;
1983 1936
1984 ret = try_to_unmap_one(page, vma, 1937 ret = rwc->rmap_one(page, vma,
1985 rmap_item->address, flags); 1938 rmap_item->address, rwc->arg);
1986 if (ret != SWAP_AGAIN || !page_mapped(page)) { 1939 if (ret != SWAP_AGAIN) {
1987 anon_vma_unlock_read(anon_vma); 1940 anon_vma_unlock_read(anon_vma);
1988 goto out; 1941 goto out;
1989 } 1942 }
1990 } 1943 if (rwc->done && rwc->done(page)) {
1991 anon_vma_unlock_read(anon_vma);
1992 }
1993 if (!search_new_forks++)
1994 goto again;
1995out:
1996 return ret;
1997}
1998
1999#ifdef CONFIG_MIGRATION
2000int rmap_walk_ksm(struct page *page, int (*rmap_one)(struct page *,
2001 struct vm_area_struct *, unsigned long, void *), void *arg)
2002{
2003 struct stable_node *stable_node;
2004 struct rmap_item *rmap_item;
2005 int ret = SWAP_AGAIN;
2006 int search_new_forks = 0;
2007
2008 VM_BUG_ON(!PageKsm(page));
2009 VM_BUG_ON(!PageLocked(page));
2010
2011 stable_node = page_stable_node(page);
2012 if (!stable_node)
2013 return ret;
2014again:
2015 hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) {
2016 struct anon_vma *anon_vma = rmap_item->anon_vma;
2017 struct anon_vma_chain *vmac;
2018 struct vm_area_struct *vma;
2019
2020 anon_vma_lock_read(anon_vma);
2021 anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root,
2022 0, ULONG_MAX) {
2023 vma = vmac->vma;
2024 if (rmap_item->address < vma->vm_start ||
2025 rmap_item->address >= vma->vm_end)
2026 continue;
2027 /*
2028 * Initially we examine only the vma which covers this
2029 * rmap_item; but later, if there is still work to do,
2030 * we examine covering vmas in other mms: in case they
2031 * were forked from the original since ksmd passed.
2032 */
2033 if ((rmap_item->mm == vma->vm_mm) == search_new_forks)
2034 continue;
2035
2036 ret = rmap_one(page, vma, rmap_item->address, arg);
2037 if (ret != SWAP_AGAIN) {
2038 anon_vma_unlock_read(anon_vma); 1944 anon_vma_unlock_read(anon_vma);
2039 goto out; 1945 goto out;
2040 } 1946 }
@@ -2047,17 +1953,18 @@ out:
2047 return ret; 1953 return ret;
2048} 1954}
2049 1955
1956#ifdef CONFIG_MIGRATION
2050void ksm_migrate_page(struct page *newpage, struct page *oldpage) 1957void ksm_migrate_page(struct page *newpage, struct page *oldpage)
2051{ 1958{
2052 struct stable_node *stable_node; 1959 struct stable_node *stable_node;
2053 1960
2054 VM_BUG_ON(!PageLocked(oldpage)); 1961 VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage);
2055 VM_BUG_ON(!PageLocked(newpage)); 1962 VM_BUG_ON_PAGE(!PageLocked(newpage), newpage);
2056 VM_BUG_ON(newpage->mapping != oldpage->mapping); 1963 VM_BUG_ON_PAGE(newpage->mapping != oldpage->mapping, newpage);
2057 1964
2058 stable_node = page_stable_node(newpage); 1965 stable_node = page_stable_node(newpage);
2059 if (stable_node) { 1966 if (stable_node) {
2060 VM_BUG_ON(stable_node->kpfn != page_to_pfn(oldpage)); 1967 VM_BUG_ON_PAGE(stable_node->kpfn != page_to_pfn(oldpage), oldpage);
2061 stable_node->kpfn = page_to_pfn(newpage); 1968 stable_node->kpfn = page_to_pfn(newpage);
2062 /* 1969 /*
2063 * newpage->mapping was set in advance; now we need smp_wmb() 1970 * newpage->mapping was set in advance; now we need smp_wmb()
@@ -2438,4 +2345,4 @@ out_free:
2438out: 2345out:
2439 return err; 2346 return err;
2440} 2347}
2441module_init(ksm_init) 2348subsys_initcall(ksm_init);
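
The ksm.c rewrite above collapses page_referenced_ksm() and try_to_unmap_ksm() into a single rmap_walk_ksm() driven by caller-supplied callbacks. The control structure implied by the callbacks the hunk uses (rmap_one, done, invalid_vma, arg) looks roughly like the sketch below; the authoritative definition lives in include/linux/rmap.h and carries additional fields:

/* Sketch of the rmap walk control interface used by rmap_walk_ksm(). */
struct rmap_walk_control {
	void *arg;				/* passed through to rmap_one */
	/* visit one mapping; return SWAP_AGAIN to keep walking */
	int (*rmap_one)(struct page *page, struct vm_area_struct *vma,
			unsigned long addr, void *arg);
	/* optional: stop early once the walk's goal is met */
	int (*done)(struct page *page);
	/* optional: skip VMAs the caller is not interested in */
	bool (*invalid_vma)(struct vm_area_struct *vma, void *arg);
};
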
diff --git a/mm/memblock.c b/mm/memblock.c
index 53e477bb5558..39a31e7f0045 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -21,6 +21,9 @@
21#include <linux/memblock.h> 21#include <linux/memblock.h>
22 22
23#include <asm-generic/sections.h> 23#include <asm-generic/sections.h>
24#include <linux/io.h>
25
26#include "internal.h"
24 27
25static struct memblock_region memblock_memory_init_regions[INIT_MEMBLOCK_REGIONS] __initdata_memblock; 28static struct memblock_region memblock_memory_init_regions[INIT_MEMBLOCK_REGIONS] __initdata_memblock;
26static struct memblock_region memblock_reserved_init_regions[INIT_MEMBLOCK_REGIONS] __initdata_memblock; 29static struct memblock_region memblock_reserved_init_regions[INIT_MEMBLOCK_REGIONS] __initdata_memblock;
@@ -39,6 +42,9 @@ struct memblock memblock __initdata_memblock = {
39}; 42};
40 43
41int memblock_debug __initdata_memblock; 44int memblock_debug __initdata_memblock;
45#ifdef CONFIG_MOVABLE_NODE
46bool movable_node_enabled __initdata_memblock = false;
47#endif
42static int memblock_can_resize __initdata_memblock; 48static int memblock_can_resize __initdata_memblock;
43static int memblock_memory_in_slab __initdata_memblock = 0; 49static int memblock_memory_in_slab __initdata_memblock = 0;
44static int memblock_reserved_in_slab __initdata_memblock = 0; 50static int memblock_reserved_in_slab __initdata_memblock = 0;
@@ -91,7 +97,7 @@ static long __init_memblock memblock_overlaps_region(struct memblock_type *type,
91 * @end: end of candidate range, can be %MEMBLOCK_ALLOC_{ANYWHERE|ACCESSIBLE} 97 * @end: end of candidate range, can be %MEMBLOCK_ALLOC_{ANYWHERE|ACCESSIBLE}
92 * @size: size of free area to find 98 * @size: size of free area to find
93 * @align: alignment of free area to find 99 * @align: alignment of free area to find
94 * @nid: nid of the free area to find, %MAX_NUMNODES for any node 100 * @nid: nid of the free area to find, %NUMA_NO_NODE for any node
95 * 101 *
96 * Utility called from memblock_find_in_range_node(), find free area bottom-up. 102 * Utility called from memblock_find_in_range_node(), find free area bottom-up.
97 * 103 *
@@ -123,7 +129,7 @@ __memblock_find_range_bottom_up(phys_addr_t start, phys_addr_t end,
123 * @end: end of candidate range, can be %MEMBLOCK_ALLOC_{ANYWHERE|ACCESSIBLE} 129 * @end: end of candidate range, can be %MEMBLOCK_ALLOC_{ANYWHERE|ACCESSIBLE}
124 * @size: size of free area to find 130 * @size: size of free area to find
125 * @align: alignment of free area to find 131 * @align: alignment of free area to find
126 * @nid: nid of the free area to find, %MAX_NUMNODES for any node 132 * @nid: nid of the free area to find, %NUMA_NO_NODE for any node
127 * 133 *
128 * Utility called from memblock_find_in_range_node(), find free area top-down. 134 * Utility called from memblock_find_in_range_node(), find free area top-down.
129 * 135 *
@@ -154,11 +160,11 @@ __memblock_find_range_top_down(phys_addr_t start, phys_addr_t end,
154 160
155/** 161/**
156 * memblock_find_in_range_node - find free area in given range and node 162 * memblock_find_in_range_node - find free area in given range and node
157 * @start: start of candidate range
158 * @end: end of candidate range, can be %MEMBLOCK_ALLOC_{ANYWHERE|ACCESSIBLE}
159 * @size: size of free area to find 163 * @size: size of free area to find
160 * @align: alignment of free area to find 164 * @align: alignment of free area to find
161 * @nid: nid of the free area to find, %MAX_NUMNODES for any node 165 * @start: start of candidate range
166 * @end: end of candidate range, can be %MEMBLOCK_ALLOC_{ANYWHERE|ACCESSIBLE}
167 * @nid: nid of the free area to find, %NUMA_NO_NODE for any node
162 * 168 *
163 * Find @size free area aligned to @align in the specified range and node. 169 * Find @size free area aligned to @align in the specified range and node.
164 * 170 *
@@ -173,9 +179,9 @@ __memblock_find_range_top_down(phys_addr_t start, phys_addr_t end,
173 * RETURNS: 179 * RETURNS:
174 * Found address on success, 0 on failure. 180 * Found address on success, 0 on failure.
175 */ 181 */
176phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t start, 182phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t size,
177 phys_addr_t end, phys_addr_t size, 183 phys_addr_t align, phys_addr_t start,
178 phys_addr_t align, int nid) 184 phys_addr_t end, int nid)
179{ 185{
180 int ret; 186 int ret;
181 phys_addr_t kernel_end; 187 phys_addr_t kernel_end;
@@ -238,8 +244,8 @@ phys_addr_t __init_memblock memblock_find_in_range(phys_addr_t start,
238 phys_addr_t end, phys_addr_t size, 244 phys_addr_t end, phys_addr_t size,
239 phys_addr_t align) 245 phys_addr_t align)
240{ 246{
241 return memblock_find_in_range_node(start, end, size, align, 247 return memblock_find_in_range_node(size, align, start, end,
242 MAX_NUMNODES); 248 NUMA_NO_NODE);
243} 249}
244 250
245static void __init_memblock memblock_remove_region(struct memblock_type *type, unsigned long r) 251static void __init_memblock memblock_remove_region(struct memblock_type *type, unsigned long r)
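[Editor's note] With the reordered prototype, a call now reads as "find @size bytes at @align somewhere in [@start, @end) on @nid". A minimal sketch of a caller under the new ordering, assuming an early-boot context; the sizes and the 2 GiB cap are invented for illustration, and the result still has to be handed to memblock_reserve() before it is safe to use:

#include <linux/memblock.h>
#include <linux/numa.h>
#include <linux/sizes.h>

/* Sketch: find and claim 1 MiB, 2 MiB-aligned, below an arbitrary 2 GiB cap. */
static phys_addr_t __init claim_low_chunk(void)
{
	/* New argument order: size, align, start, end, nid. */
	phys_addr_t base = memblock_find_in_range_node(SZ_1M, SZ_2M, 0, SZ_2G,
						       NUMA_NO_NODE);

	if (base)
		memblock_reserve(base, SZ_1M);	/* the find itself reserves nothing */
	return base;
}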
@@ -255,10 +261,13 @@ static void __init_memblock memblock_remove_region(struct memblock_type *type, u
255 type->cnt = 1; 261 type->cnt = 1;
256 type->regions[0].base = 0; 262 type->regions[0].base = 0;
257 type->regions[0].size = 0; 263 type->regions[0].size = 0;
264 type->regions[0].flags = 0;
258 memblock_set_region_node(&type->regions[0], MAX_NUMNODES); 265 memblock_set_region_node(&type->regions[0], MAX_NUMNODES);
259 } 266 }
260} 267}
261 268
269#ifdef CONFIG_ARCH_DISCARD_MEMBLOCK
270
262phys_addr_t __init_memblock get_allocated_memblock_reserved_regions_info( 271phys_addr_t __init_memblock get_allocated_memblock_reserved_regions_info(
263 phys_addr_t *addr) 272 phys_addr_t *addr)
264{ 273{
@@ -271,6 +280,20 @@ phys_addr_t __init_memblock get_allocated_memblock_reserved_regions_info(
271 memblock.reserved.max); 280 memblock.reserved.max);
272} 281}
273 282
283phys_addr_t __init_memblock get_allocated_memblock_memory_regions_info(
284 phys_addr_t *addr)
285{
286 if (memblock.memory.regions == memblock_memory_init_regions)
287 return 0;
288
289 *addr = __pa(memblock.memory.regions);
290
291 return PAGE_ALIGN(sizeof(struct memblock_region) *
292 memblock.memory.max);
293}
294
295#endif
296
274/** 297/**
275 * memblock_double_array - double the size of the memblock regions array 298 * memblock_double_array - double the size of the memblock regions array
276 * @type: memblock type of the regions array being doubled 299 * @type: memblock type of the regions array being doubled
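[Editor's note] The new helper above mirrors the reserved-array variant next to it: both return 0 while the static __initdata arrays are still in use, and otherwise report where a dynamically grown array lives so late boot code can give those pages back. A purely illustrative teardown sketch, assuming CONFIG_ARCH_DISCARD_MEMBLOCK=y and using only functions visible in this patch; the real caller sits in the bootmem/nobootmem teardown path and differs in detail:

#include <linux/bootmem.h>
#include <linux/memblock.h>

static void __init discard_grown_memblock_arrays(void)
{
	phys_addr_t addr, size;

	size = get_allocated_memblock_reserved_regions_info(&addr);
	if (size)	/* non-zero only if memblock_double_array() ever ran */
		__memblock_free_late(addr, size);

	size = get_allocated_memblock_memory_regions_info(&addr);
	if (size)
		__memblock_free_late(addr, size);
}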
@@ -405,7 +428,8 @@ static void __init_memblock memblock_merge_regions(struct memblock_type *type)
405 428
406 if (this->base + this->size != next->base || 429 if (this->base + this->size != next->base ||
407 memblock_get_region_node(this) != 430 memblock_get_region_node(this) !=
408 memblock_get_region_node(next)) { 431 memblock_get_region_node(next) ||
432 this->flags != next->flags) {
409 BUG_ON(this->base + this->size > next->base); 433 BUG_ON(this->base + this->size > next->base);
410 i++; 434 i++;
411 continue; 435 continue;
@@ -425,13 +449,15 @@ static void __init_memblock memblock_merge_regions(struct memblock_type *type)
425 * @base: base address of the new region 449 * @base: base address of the new region
426 * @size: size of the new region 450 * @size: size of the new region
427 * @nid: node id of the new region 451 * @nid: node id of the new region
452 * @flags: flags of the new region
428 * 453 *
429 * Insert new memblock region [@base,@base+@size) into @type at @idx. 454 * Insert new memblock region [@base,@base+@size) into @type at @idx.
430 * @type must already have extra room to accommodate the new region. 455 * @type must already have extra room to accommodate the new region.
431 */ 456 */
432static void __init_memblock memblock_insert_region(struct memblock_type *type, 457static void __init_memblock memblock_insert_region(struct memblock_type *type,
433 int idx, phys_addr_t base, 458 int idx, phys_addr_t base,
434 phys_addr_t size, int nid) 459 phys_addr_t size,
460 int nid, unsigned long flags)
435{ 461{
436 struct memblock_region *rgn = &type->regions[idx]; 462 struct memblock_region *rgn = &type->regions[idx];
437 463
@@ -439,6 +465,7 @@ static void __init_memblock memblock_insert_region(struct memblock_type *type,
439 memmove(rgn + 1, rgn, (type->cnt - idx) * sizeof(*rgn)); 465 memmove(rgn + 1, rgn, (type->cnt - idx) * sizeof(*rgn));
440 rgn->base = base; 466 rgn->base = base;
441 rgn->size = size; 467 rgn->size = size;
468 rgn->flags = flags;
442 memblock_set_region_node(rgn, nid); 469 memblock_set_region_node(rgn, nid);
443 type->cnt++; 470 type->cnt++;
444 type->total_size += size; 471 type->total_size += size;
@@ -450,6 +477,7 @@ static void __init_memblock memblock_insert_region(struct memblock_type *type,
450 * @base: base address of the new region 477 * @base: base address of the new region
451 * @size: size of the new region 478 * @size: size of the new region
452 * @nid: nid of the new region 479 * @nid: nid of the new region
480 * @flags: flags of the new region
453 * 481 *
454 * Add new memblock region [@base,@base+@size) into @type. The new region 482 * Add new memblock region [@base,@base+@size) into @type. The new region
455 * is allowed to overlap with existing ones - overlaps don't affect already 483 * is allowed to overlap with existing ones - overlaps don't affect already
@@ -460,7 +488,8 @@ static void __init_memblock memblock_insert_region(struct memblock_type *type,
460 * 0 on success, -errno on failure. 488 * 0 on success, -errno on failure.
461 */ 489 */
462static int __init_memblock memblock_add_region(struct memblock_type *type, 490static int __init_memblock memblock_add_region(struct memblock_type *type,
463 phys_addr_t base, phys_addr_t size, int nid) 491 phys_addr_t base, phys_addr_t size,
492 int nid, unsigned long flags)
464{ 493{
465 bool insert = false; 494 bool insert = false;
466 phys_addr_t obase = base; 495 phys_addr_t obase = base;
@@ -475,6 +504,7 @@ static int __init_memblock memblock_add_region(struct memblock_type *type,
475 WARN_ON(type->cnt != 1 || type->total_size); 504 WARN_ON(type->cnt != 1 || type->total_size);
476 type->regions[0].base = base; 505 type->regions[0].base = base;
477 type->regions[0].size = size; 506 type->regions[0].size = size;
507 type->regions[0].flags = flags;
478 memblock_set_region_node(&type->regions[0], nid); 508 memblock_set_region_node(&type->regions[0], nid);
479 type->total_size = size; 509 type->total_size = size;
480 return 0; 510 return 0;
@@ -505,7 +535,8 @@ repeat:
505 nr_new++; 535 nr_new++;
506 if (insert) 536 if (insert)
507 memblock_insert_region(type, i++, base, 537 memblock_insert_region(type, i++, base,
508 rbase - base, nid); 538 rbase - base, nid,
539 flags);
509 } 540 }
510 /* area below @rend is dealt with, forget about it */ 541 /* area below @rend is dealt with, forget about it */
511 base = min(rend, end); 542 base = min(rend, end);
@@ -515,7 +546,8 @@ repeat:
515 if (base < end) { 546 if (base < end) {
516 nr_new++; 547 nr_new++;
517 if (insert) 548 if (insert)
518 memblock_insert_region(type, i, base, end - base, nid); 549 memblock_insert_region(type, i, base, end - base,
550 nid, flags);
519 } 551 }
520 552
521 /* 553 /*
@@ -537,12 +569,13 @@ repeat:
537int __init_memblock memblock_add_node(phys_addr_t base, phys_addr_t size, 569int __init_memblock memblock_add_node(phys_addr_t base, phys_addr_t size,
538 int nid) 570 int nid)
539{ 571{
540 return memblock_add_region(&memblock.memory, base, size, nid); 572 return memblock_add_region(&memblock.memory, base, size, nid, 0);
541} 573}
542 574
543int __init_memblock memblock_add(phys_addr_t base, phys_addr_t size) 575int __init_memblock memblock_add(phys_addr_t base, phys_addr_t size)
544{ 576{
545 return memblock_add_region(&memblock.memory, base, size, MAX_NUMNODES); 577 return memblock_add_region(&memblock.memory, base, size,
578 MAX_NUMNODES, 0);
546} 579}
547 580
548/** 581/**
@@ -597,7 +630,8 @@ static int __init_memblock memblock_isolate_range(struct memblock_type *type,
597 rgn->size -= base - rbase; 630 rgn->size -= base - rbase;
598 type->total_size -= base - rbase; 631 type->total_size -= base - rbase;
599 memblock_insert_region(type, i, rbase, base - rbase, 632 memblock_insert_region(type, i, rbase, base - rbase,
600 memblock_get_region_node(rgn)); 633 memblock_get_region_node(rgn),
634 rgn->flags);
601 } else if (rend > end) { 635 } else if (rend > end) {
602 /* 636 /*
603 * @rgn intersects from above. Split and redo the 637 * @rgn intersects from above. Split and redo the
@@ -607,7 +641,8 @@ static int __init_memblock memblock_isolate_range(struct memblock_type *type,
607 rgn->size -= end - rbase; 641 rgn->size -= end - rbase;
608 type->total_size -= end - rbase; 642 type->total_size -= end - rbase;
609 memblock_insert_region(type, i--, rbase, end - rbase, 643 memblock_insert_region(type, i--, rbase, end - rbase,
610 memblock_get_region_node(rgn)); 644 memblock_get_region_node(rgn),
645 rgn->flags);
611 } else { 646 } else {
612 /* @rgn is fully contained, record it */ 647 /* @rgn is fully contained, record it */
613 if (!*end_rgn) 648 if (!*end_rgn)
@@ -643,28 +678,89 @@ int __init_memblock memblock_free(phys_addr_t base, phys_addr_t size)
643{ 678{
644 memblock_dbg(" memblock_free: [%#016llx-%#016llx] %pF\n", 679 memblock_dbg(" memblock_free: [%#016llx-%#016llx] %pF\n",
645 (unsigned long long)base, 680 (unsigned long long)base,
646 (unsigned long long)base + size, 681 (unsigned long long)base + size - 1,
647 (void *)_RET_IP_); 682 (void *)_RET_IP_);
648 683
649 return __memblock_remove(&memblock.reserved, base, size); 684 return __memblock_remove(&memblock.reserved, base, size);
650} 685}
651 686
652int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size) 687static int __init_memblock memblock_reserve_region(phys_addr_t base,
688 phys_addr_t size,
689 int nid,
690 unsigned long flags)
653{ 691{
654 struct memblock_type *_rgn = &memblock.reserved; 692 struct memblock_type *_rgn = &memblock.reserved;
655 693
656 memblock_dbg("memblock_reserve: [%#016llx-%#016llx] %pF\n", 694 memblock_dbg("memblock_reserve: [%#016llx-%#016llx] flags %#02lx %pF\n",
657 (unsigned long long)base, 695 (unsigned long long)base,
658 (unsigned long long)base + size, 696 (unsigned long long)base + size - 1,
659 (void *)_RET_IP_); 697 flags, (void *)_RET_IP_);
698
699 return memblock_add_region(_rgn, base, size, nid, flags);
700}
701
702int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size)
703{
704 return memblock_reserve_region(base, size, MAX_NUMNODES, 0);
705}
706
707/**
708 * memblock_mark_hotplug - Mark hotpluggable memory with flag MEMBLOCK_HOTPLUG.
709 * @base: the base phys addr of the region
710 * @size: the size of the region
711 *
712 * This function isolates region [@base, @base + @size), and marks it with flag
713 * MEMBLOCK_HOTPLUG.
714 *
715 * Return 0 on success, -errno on failure.
716 */
717int __init_memblock memblock_mark_hotplug(phys_addr_t base, phys_addr_t size)
718{
719 struct memblock_type *type = &memblock.memory;
720 int i, ret, start_rgn, end_rgn;
721
722 ret = memblock_isolate_range(type, base, size, &start_rgn, &end_rgn);
723 if (ret)
724 return ret;
725
726 for (i = start_rgn; i < end_rgn; i++)
727 memblock_set_region_flags(&type->regions[i], MEMBLOCK_HOTPLUG);
660 728
661 return memblock_add_region(_rgn, base, size, MAX_NUMNODES); 729 memblock_merge_regions(type);
730 return 0;
731}
732
733/**
734 * memblock_clear_hotplug - Clear flag MEMBLOCK_HOTPLUG for a specified region.
735 * @base: the base phys addr of the region
736 * @size: the size of the region
737 *
738 * This function isolates region [@base, @base + @size), and clears flag
739 * MEMBLOCK_HOTPLUG for the isolated regions.
740 *
741 * Return 0 on success, -errno on failure.
742 */
743int __init_memblock memblock_clear_hotplug(phys_addr_t base, phys_addr_t size)
744{
745 struct memblock_type *type = &memblock.memory;
746 int i, ret, start_rgn, end_rgn;
747
748 ret = memblock_isolate_range(type, base, size, &start_rgn, &end_rgn);
749 if (ret)
750 return ret;
751
752 for (i = start_rgn; i < end_rgn; i++)
753 memblock_clear_region_flags(&type->regions[i],
754 MEMBLOCK_HOTPLUG);
755
756 memblock_merge_regions(type);
757 return 0;
662} 758}
663 759
664/** 760/**
665 * __next_free_mem_range - next function for for_each_free_mem_range() 761 * __next_free_mem_range - next function for for_each_free_mem_range()
666 * @idx: pointer to u64 loop variable 762 * @idx: pointer to u64 loop variable
667 * @nid: node selector, %MAX_NUMNODES for all nodes 763 * @nid: node selector, %NUMA_NO_NODE for all nodes
668 * @out_start: ptr to phys_addr_t for start address of the range, can be %NULL 764 * @out_start: ptr to phys_addr_t for start address of the range, can be %NULL
669 * @out_end: ptr to phys_addr_t for end address of the range, can be %NULL 765 * @out_end: ptr to phys_addr_t for end address of the range, can be %NULL
670 * @out_nid: ptr to int for nid of the range, can be %NULL 766 * @out_nid: ptr to int for nid of the range, can be %NULL
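[Editor's note] The expected users of the two markers added above are early NUMA table parsers: firmware says which physical ranges are hotpluggable, marking them records that in memblock, and (with the iterator change a few hunks below) movable_node then keeps kernel allocations out of those ranges. A hedged sketch of such a caller; the function and its arguments are invented, only memblock_add_node() and memblock_mark_hotplug() are real:

#include <linux/memblock.h>

/*
 * Hypothetical hook called once per firmware-described memory affinity
 * entry, e.g. while walking an SRAT-like table during early boot.
 */
static int __init register_fw_range(phys_addr_t base, phys_addr_t size,
				    int nid, bool hotpluggable)
{
	int ret = memblock_add_node(base, size, nid);

	if (!ret && hotpluggable)
		ret = memblock_mark_hotplug(base, size);
	return ret;
}

memblock_clear_hotplug() is the symmetric escape hatch for when the kernel later decides it must use such a range after all.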
@@ -693,13 +789,16 @@ void __init_memblock __next_free_mem_range(u64 *idx, int nid,
693 int mi = *idx & 0xffffffff; 789 int mi = *idx & 0xffffffff;
694 int ri = *idx >> 32; 790 int ri = *idx >> 32;
695 791
792 if (WARN_ONCE(nid == MAX_NUMNODES, "Usage of MAX_NUMNODES is deprecated. Use NUMA_NO_NODE instead\n"))
793 nid = NUMA_NO_NODE;
794
696 for ( ; mi < mem->cnt; mi++) { 795 for ( ; mi < mem->cnt; mi++) {
697 struct memblock_region *m = &mem->regions[mi]; 796 struct memblock_region *m = &mem->regions[mi];
698 phys_addr_t m_start = m->base; 797 phys_addr_t m_start = m->base;
699 phys_addr_t m_end = m->base + m->size; 798 phys_addr_t m_end = m->base + m->size;
700 799
701 /* only memory regions are associated with nodes, check it */ 800 /* only memory regions are associated with nodes, check it */
702 if (nid != MAX_NUMNODES && nid != memblock_get_region_node(m)) 801 if (nid != NUMA_NO_NODE && nid != memblock_get_region_node(m))
703 continue; 802 continue;
704 803
705 /* scan areas before each reservation for intersection */ 804 /* scan areas before each reservation for intersection */
@@ -740,12 +839,17 @@ void __init_memblock __next_free_mem_range(u64 *idx, int nid,
740/** 839/**
741 * __next_free_mem_range_rev - next function for for_each_free_mem_range_reverse() 840 * __next_free_mem_range_rev - next function for for_each_free_mem_range_reverse()
742 * @idx: pointer to u64 loop variable 841 * @idx: pointer to u64 loop variable
743 * @nid: nid: node selector, %MAX_NUMNODES for all nodes 842 * @nid: nid: node selector, %NUMA_NO_NODE for all nodes
744 * @out_start: ptr to phys_addr_t for start address of the range, can be %NULL 843 * @out_start: ptr to phys_addr_t for start address of the range, can be %NULL
745 * @out_end: ptr to phys_addr_t for end address of the range, can be %NULL 844 * @out_end: ptr to phys_addr_t for end address of the range, can be %NULL
746 * @out_nid: ptr to int for nid of the range, can be %NULL 845 * @out_nid: ptr to int for nid of the range, can be %NULL
747 * 846 *
748 * Reverse of __next_free_mem_range(). 847 * Reverse of __next_free_mem_range().
848 *
849 * Linux kernel cannot migrate pages used by itself. Memory hotplug users won't
850 * be able to hot-remove hotpluggable memory used by the kernel. So this
851 * function skip hotpluggable regions if needed when allocating memory for the
852 * kernel.
749 */ 853 */
750void __init_memblock __next_free_mem_range_rev(u64 *idx, int nid, 854void __init_memblock __next_free_mem_range_rev(u64 *idx, int nid,
751 phys_addr_t *out_start, 855 phys_addr_t *out_start,
@@ -756,6 +860,9 @@ void __init_memblock __next_free_mem_range_rev(u64 *idx, int nid,
756 int mi = *idx & 0xffffffff; 860 int mi = *idx & 0xffffffff;
757 int ri = *idx >> 32; 861 int ri = *idx >> 32;
758 862
863 if (WARN_ONCE(nid == MAX_NUMNODES, "Usage of MAX_NUMNODES is deprecated. Use NUMA_NO_NODE instead\n"))
864 nid = NUMA_NO_NODE;
865
759 if (*idx == (u64)ULLONG_MAX) { 866 if (*idx == (u64)ULLONG_MAX) {
760 mi = mem->cnt - 1; 867 mi = mem->cnt - 1;
761 ri = rsv->cnt; 868 ri = rsv->cnt;
@@ -767,7 +874,11 @@ void __init_memblock __next_free_mem_range_rev(u64 *idx, int nid,
767 phys_addr_t m_end = m->base + m->size; 874 phys_addr_t m_end = m->base + m->size;
768 875
769 /* only memory regions are associated with nodes, check it */ 876 /* only memory regions are associated with nodes, check it */
770 if (nid != MAX_NUMNODES && nid != memblock_get_region_node(m)) 877 if (nid != NUMA_NO_NODE && nid != memblock_get_region_node(m))
878 continue;
879
880 /* skip hotpluggable memory regions if needed */
881 if (movable_node_is_enabled() && memblock_is_hotpluggable(m))
771 continue; 882 continue;
772 883
773 /* scan areas before each reservation for intersection */ 884 /* scan areas before each reservation for intersection */
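[Editor's note] The check lands in the top-down iterator because that is what kernel-side allocations use; anything driving for_each_free_mem_range_reverse() now silently skips hotpluggable ranges once movable_node is enabled. A small sketch of such a loop, assuming the usual iterator macro from <linux/memblock.h>; the printout is only for illustration:

#include <linux/memblock.h>
#include <linux/numa.h>
#include <linux/printk.h>

static void __init dump_free_ranges_top_down(void)
{
	phys_addr_t start, end;
	int nid;
	u64 i;

	/* With movable_node enabled, hotpluggable regions never show up here. */
	for_each_free_mem_range_reverse(i, NUMA_NO_NODE, &start, &end, &nid)
		pr_info("free: [%pa-%pa] node %d\n", &start, &end, nid);
}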
@@ -837,18 +948,18 @@ void __init_memblock __next_mem_pfn_range(int *idx, int nid,
837 * memblock_set_node - set node ID on memblock regions 948 * memblock_set_node - set node ID on memblock regions
838 * @base: base of area to set node ID for 949 * @base: base of area to set node ID for
839 * @size: size of area to set node ID for 950 * @size: size of area to set node ID for
951 * @type: memblock type to set node ID for
840 * @nid: node ID to set 952 * @nid: node ID to set
841 * 953 *
842 * Set the nid of memblock memory regions in [@base,@base+@size) to @nid. 954 * Set the nid of memblock @type regions in [@base,@base+@size) to @nid.
843 * Regions which cross the area boundaries are split as necessary. 955 * Regions which cross the area boundaries are split as necessary.
844 * 956 *
845 * RETURNS: 957 * RETURNS:
846 * 0 on success, -errno on failure. 958 * 0 on success, -errno on failure.
847 */ 959 */
848int __init_memblock memblock_set_node(phys_addr_t base, phys_addr_t size, 960int __init_memblock memblock_set_node(phys_addr_t base, phys_addr_t size,
849 int nid) 961 struct memblock_type *type, int nid)
850{ 962{
851 struct memblock_type *type = &memblock.memory;
852 int start_rgn, end_rgn; 963 int start_rgn, end_rgn;
853 int i, ret; 964 int i, ret;
854 965
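[Editor's note] Callers now say which region array they are annotating instead of the function assuming memblock.memory; for almost everyone that just means threading &memblock.memory through. A minimal sketch of the updated call, assuming CONFIG_HAVE_MEMBLOCK_NODE_MAP and a range supplied by the caller:

#include <linux/memblock.h>

/* Sketch: record that [base, base + size) belongs to NUMA node @nid. */
static int __init set_range_node(phys_addr_t base, phys_addr_t size, int nid)
{
	/* The target memblock type is now an explicit argument. */
	return memblock_set_node(base, size, &memblock.memory, nid);
}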
@@ -870,13 +981,10 @@ static phys_addr_t __init memblock_alloc_base_nid(phys_addr_t size,
870{ 981{
871 phys_addr_t found; 982 phys_addr_t found;
872 983
873 if (WARN_ON(!align)) 984 if (!align)
874 align = __alignof__(long long); 985 align = SMP_CACHE_BYTES;
875 986
876 /* align @size to avoid excessive fragmentation on reserved array */ 987 found = memblock_find_in_range_node(size, align, 0, max_addr, nid);
877 size = round_up(size, align);
878
879 found = memblock_find_in_range_node(0, max_addr, size, align, nid);
880 if (found && !memblock_reserve(found, size)) 988 if (found && !memblock_reserve(found, size))
881 return found; 989 return found;
882 990
@@ -890,7 +998,7 @@ phys_addr_t __init memblock_alloc_nid(phys_addr_t size, phys_addr_t align, int n
890 998
891phys_addr_t __init __memblock_alloc_base(phys_addr_t size, phys_addr_t align, phys_addr_t max_addr) 999phys_addr_t __init __memblock_alloc_base(phys_addr_t size, phys_addr_t align, phys_addr_t max_addr)
892{ 1000{
893 return memblock_alloc_base_nid(size, align, max_addr, MAX_NUMNODES); 1001 return memblock_alloc_base_nid(size, align, max_addr, NUMA_NO_NODE);
894} 1002}
895 1003
896phys_addr_t __init memblock_alloc_base(phys_addr_t size, phys_addr_t align, phys_addr_t max_addr) 1004phys_addr_t __init memblock_alloc_base(phys_addr_t size, phys_addr_t align, phys_addr_t max_addr)
@@ -920,6 +1028,207 @@ phys_addr_t __init memblock_alloc_try_nid(phys_addr_t size, phys_addr_t align, i
920 return memblock_alloc_base(size, align, MEMBLOCK_ALLOC_ACCESSIBLE); 1028 return memblock_alloc_base(size, align, MEMBLOCK_ALLOC_ACCESSIBLE);
921} 1029}
922 1030
1031/**
1032 * memblock_virt_alloc_internal - allocate boot memory block
1033 * @size: size of memory block to be allocated in bytes
1034 * @align: alignment of the region and block's size
1035 * @min_addr: the lower bound of the memory region to allocate (phys address)
1036 * @max_addr: the upper bound of the memory region to allocate (phys address)
1037 * @nid: nid of the free area to find, %NUMA_NO_NODE for any node
1038 *
1039 * The @min_addr limit is dropped if it can not be satisfied and the allocation
1040 * will fall back to memory below @min_addr. Also, allocation may fall back
1041 * to any node in the system if the specified node can not
1042 * hold the requested memory.
1043 *
1044 * The allocation is performed from memory region limited by
1045 * memblock.current_limit if @max_addr == %BOOTMEM_ALLOC_ACCESSIBLE.
1046 *
1047 * The memory block is aligned on SMP_CACHE_BYTES if @align == 0.
1048 *
1049 * The phys address of allocated boot memory block is converted to virtual and
1050 * allocated memory is reset to 0.
1051 *
1052 * In addition, the function sets the min_count to 0 using kmemleak_alloc for
1053 * the allocated boot memory block, so that it is never reported as a leak.
1054 *
1055 * RETURNS:
1056 * Virtual address of allocated memory block on success, NULL on failure.
1057 */
1058static void * __init memblock_virt_alloc_internal(
1059 phys_addr_t size, phys_addr_t align,
1060 phys_addr_t min_addr, phys_addr_t max_addr,
1061 int nid)
1062{
1063 phys_addr_t alloc;
1064 void *ptr;
1065
1066 if (WARN_ONCE(nid == MAX_NUMNODES, "Usage of MAX_NUMNODES is deprecated. Use NUMA_NO_NODE instead\n"))
1067 nid = NUMA_NO_NODE;
1068
1069 /*
1070 * Detect any accidental use of these APIs after slab is ready, as at
1071 * this moment memblock may be deinitialized already and its
1072 * internal data may be destroyed (after execution of free_all_bootmem)
1073 */
1074 if (WARN_ON_ONCE(slab_is_available()))
1075 return kzalloc_node(size, GFP_NOWAIT, nid);
1076
1077 if (!align)
1078 align = SMP_CACHE_BYTES;
1079
1080 if (max_addr > memblock.current_limit)
1081 max_addr = memblock.current_limit;
1082
1083again:
1084 alloc = memblock_find_in_range_node(size, align, min_addr, max_addr,
1085 nid);
1086 if (alloc)
1087 goto done;
1088
1089 if (nid != NUMA_NO_NODE) {
1090 alloc = memblock_find_in_range_node(size, align, min_addr,
1091 max_addr, NUMA_NO_NODE);
1092 if (alloc)
1093 goto done;
1094 }
1095
1096 if (min_addr) {
1097 min_addr = 0;
1098 goto again;
1099 } else {
1100 goto error;
1101 }
1102
1103done:
1104 memblock_reserve(alloc, size);
1105 ptr = phys_to_virt(alloc);
1106 memset(ptr, 0, size);
1107
1108 /*
1109 * The min_count is set to 0 so that bootmem allocated blocks
1110 * are never reported as leaks. This is because many of these blocks
1111 * are only referred via the physical address which is not
1112 * looked up by kmemleak.
1113 */
1114 kmemleak_alloc(ptr, size, 0, 0);
1115
1116 return ptr;
1117
1118error:
1119 return NULL;
1120}
1121
1122/**
1123 * memblock_virt_alloc_try_nid_nopanic - allocate boot memory block
1124 * @size: size of memory block to be allocated in bytes
1125 * @align: alignment of the region and block's size
1126 * @min_addr: the lower bound of the memory region from where the allocation
1127 * is preferred (phys address)
1128 * @max_addr: the upper bound of the memory region from where the allocation
1129 * is preferred (phys address), or %BOOTMEM_ALLOC_ACCESSIBLE to
1130 * allocate only from memory limited by memblock.current_limit value
1131 * @nid: nid of the free area to find, %NUMA_NO_NODE for any node
1132 *
1133 * Public version of _memblock_virt_alloc_try_nid_nopanic() which provides
1134 * additional debug information (including caller info), if enabled.
1135 *
1136 * RETURNS:
1137 * Virtual address of allocated memory block on success, NULL on failure.
1138 */
1139void * __init memblock_virt_alloc_try_nid_nopanic(
1140 phys_addr_t size, phys_addr_t align,
1141 phys_addr_t min_addr, phys_addr_t max_addr,
1142 int nid)
1143{
1144 memblock_dbg("%s: %llu bytes align=0x%llx nid=%d from=0x%llx max_addr=0x%llx %pF\n",
1145 __func__, (u64)size, (u64)align, nid, (u64)min_addr,
1146 (u64)max_addr, (void *)_RET_IP_);
1147 return memblock_virt_alloc_internal(size, align, min_addr,
1148 max_addr, nid);
1149}
1150
1151/**
1152 * memblock_virt_alloc_try_nid - allocate boot memory block with panicking
1153 * @size: size of memory block to be allocated in bytes
1154 * @align: alignment of the region and block's size
1155 * @min_addr: the lower bound of the memory region from where the allocation
1156 * is preferred (phys address)
1157 * @max_addr: the upper bound of the memory region from where the allocation
1158 * is preferred (phys address), or %BOOTMEM_ALLOC_ACCESSIBLE to
1159 * allocate only from memory limited by memblock.current_limit value
1160 * @nid: nid of the free area to find, %NUMA_NO_NODE for any node
1161 *
1162 * Public panicking version of _memblock_virt_alloc_try_nid_nopanic()
1163 * which provides debug information (including caller info), if enabled,
1164 * and panics if the request can not be satisfied.
1165 *
1166 * RETURNS:
1167 * Virtual address of allocated memory block on success, NULL on failure.
1168 */
1169void * __init memblock_virt_alloc_try_nid(
1170 phys_addr_t size, phys_addr_t align,
1171 phys_addr_t min_addr, phys_addr_t max_addr,
1172 int nid)
1173{
1174 void *ptr;
1175
1176 memblock_dbg("%s: %llu bytes align=0x%llx nid=%d from=0x%llx max_addr=0x%llx %pF\n",
1177 __func__, (u64)size, (u64)align, nid, (u64)min_addr,
1178 (u64)max_addr, (void *)_RET_IP_);
1179 ptr = memblock_virt_alloc_internal(size, align,
1180 min_addr, max_addr, nid);
1181 if (ptr)
1182 return ptr;
1183
1184 panic("%s: Failed to allocate %llu bytes align=0x%llx nid=%d from=0x%llx max_addr=0x%llx\n",
1185 __func__, (u64)size, (u64)align, nid, (u64)min_addr,
1186 (u64)max_addr);
1187 return NULL;
1188}
1189
1190/**
1191 * __memblock_free_early - free boot memory block
1192 * @base: phys starting address of the boot memory block
1193 * @size: size of the boot memory block in bytes
1194 *
1195 * Free a boot memory block previously allocated by the memblock_virt_alloc_xx() API.
1196 * The freed memory will not be released to the buddy allocator.
1197 */
1198void __init __memblock_free_early(phys_addr_t base, phys_addr_t size)
1199{
1200 memblock_dbg("%s: [%#016llx-%#016llx] %pF\n",
1201 __func__, (u64)base, (u64)base + size - 1,
1202 (void *)_RET_IP_);
1203 kmemleak_free_part(__va(base), size);
1204 __memblock_remove(&memblock.reserved, base, size);
1205}
1206
1207/*
1208 * __memblock_free_late - free bootmem block pages directly to buddy allocator
1209 * @addr: phys starting address of the boot memory block
1210 * @size: size of the boot memory block in bytes
1211 *
1212 * This is only useful when the bootmem allocator has already been torn
1213 * down, but we are still initializing the system. Pages are released directly
1214 * to the buddy allocator, no bootmem metadata is updated because it is gone.
1215 */
1216void __init __memblock_free_late(phys_addr_t base, phys_addr_t size)
1217{
1218 u64 cursor, end;
1219
1220 memblock_dbg("%s: [%#016llx-%#016llx] %pF\n",
1221 __func__, (u64)base, (u64)base + size - 1,
1222 (void *)_RET_IP_);
1223 kmemleak_free_part(__va(base), size);
1224 cursor = PFN_UP(base);
1225 end = PFN_DOWN(base + size);
1226
1227 for (; cursor < end; cursor++) {
1228 __free_pages_bootmem(pfn_to_page(cursor), 0);
1229 totalram_pages++;
1230 }
1231}
923 1232
924/* 1233/*
925 * Remaining API functions 1234 * Remaining API functions
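[Editor's note] Taken together, the block added above gives boot code a bootmem-style front end on top of memblock: zeroed memory, virtual return values, kmemleak registration, and a matching pair of free helpers for before and after the page allocator is live. A hedged usage sketch; the size, node and the choice of the panicking variant are illustrative, and BOOTMEM_ALLOC_ACCESSIBLE is assumed to be the usual "up to current_limit" sentinel from <linux/bootmem.h>:

#include <linux/bootmem.h>
#include <linux/memblock.h>

static void * __init alloc_early_table(unsigned long bytes, int nid)
{
	/*
	 * align == 0 falls back to SMP_CACHE_BYTES; use the _nopanic
	 * variant instead if the caller can cope with a NULL return.
	 */
	return memblock_virt_alloc_try_nid(bytes, 0, 0,
					   BOOTMEM_ALLOC_ACCESSIBLE, nid);
}

static void __init free_early_table(void *table, unsigned long bytes)
{
	/* Still before free_all_bootmem(): hand the block back to memblock. */
	__memblock_free_early(__pa(table), bytes);
}

__memblock_free_late() is the counterpart for the window after the bootmem teardown, when pages have to go straight to the buddy allocator.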
@@ -1101,6 +1410,7 @@ void __init_memblock memblock_set_current_limit(phys_addr_t limit)
1101static void __init_memblock memblock_dump(struct memblock_type *type, char *name) 1410static void __init_memblock memblock_dump(struct memblock_type *type, char *name)
1102{ 1411{
1103 unsigned long long base, size; 1412 unsigned long long base, size;
1413 unsigned long flags;
1104 int i; 1414 int i;
1105 1415
1106 pr_info(" %s.cnt = 0x%lx\n", name, type->cnt); 1416 pr_info(" %s.cnt = 0x%lx\n", name, type->cnt);
@@ -1111,13 +1421,14 @@ static void __init_memblock memblock_dump(struct memblock_type *type, char *name
1111 1421
1112 base = rgn->base; 1422 base = rgn->base;
1113 size = rgn->size; 1423 size = rgn->size;
1424 flags = rgn->flags;
1114#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP 1425#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
1115 if (memblock_get_region_node(rgn) != MAX_NUMNODES) 1426 if (memblock_get_region_node(rgn) != MAX_NUMNODES)
1116 snprintf(nid_buf, sizeof(nid_buf), " on node %d", 1427 snprintf(nid_buf, sizeof(nid_buf), " on node %d",
1117 memblock_get_region_node(rgn)); 1428 memblock_get_region_node(rgn));
1118#endif 1429#endif
1119 pr_info(" %s[%#x]\t[%#016llx-%#016llx], %#llx bytes%s\n", 1430 pr_info(" %s[%#x]\t[%#016llx-%#016llx], %#llx bytes%s flags: %#lx\n",
1120 name, i, base, base + size - 1, size, nid_buf); 1431 name, i, base, base + size - 1, size, nid_buf, flags);
1121 } 1432 }
1122} 1433}
1123 1434
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 7f1a356153c0..53385cd4e6f0 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -45,16 +45,17 @@
45#include <linux/swapops.h> 45#include <linux/swapops.h>
46#include <linux/spinlock.h> 46#include <linux/spinlock.h>
47#include <linux/eventfd.h> 47#include <linux/eventfd.h>
48#include <linux/poll.h>
48#include <linux/sort.h> 49#include <linux/sort.h>
49#include <linux/fs.h> 50#include <linux/fs.h>
50#include <linux/seq_file.h> 51#include <linux/seq_file.h>
51#include <linux/vmalloc.h>
52#include <linux/vmpressure.h> 52#include <linux/vmpressure.h>
53#include <linux/mm_inline.h> 53#include <linux/mm_inline.h>
54#include <linux/page_cgroup.h> 54#include <linux/page_cgroup.h>
55#include <linux/cpu.h> 55#include <linux/cpu.h>
56#include <linux/oom.h> 56#include <linux/oom.h>
57#include <linux/lockdep.h> 57#include <linux/lockdep.h>
58#include <linux/file.h>
58#include "internal.h" 59#include "internal.h"
59#include <net/sock.h> 60#include <net/sock.h>
60#include <net/ip.h> 61#include <net/ip.h>
@@ -148,7 +149,7 @@ struct mem_cgroup_reclaim_iter {
148 * matches memcg->dead_count of the hierarchy root group. 149 * matches memcg->dead_count of the hierarchy root group.
149 */ 150 */
150 struct mem_cgroup *last_visited; 151 struct mem_cgroup *last_visited;
151 unsigned long last_dead_count; 152 int last_dead_count;
152 153
153 /* scan generation, increased every round-trip */ 154 /* scan generation, increased every round-trip */
154 unsigned int generation; 155 unsigned int generation;
@@ -227,6 +228,46 @@ struct mem_cgroup_eventfd_list {
227 struct eventfd_ctx *eventfd; 228 struct eventfd_ctx *eventfd;
228}; 229};
229 230
231/*
232 * cgroup_event represents events which userspace want to receive.
233 */
234struct mem_cgroup_event {
235 /*
236 * memcg which the event belongs to.
237 */
238 struct mem_cgroup *memcg;
239 /*
240 * eventfd to signal userspace about the event.
241 */
242 struct eventfd_ctx *eventfd;
243 /*
244 * Each of these stored in a list by the cgroup.
245 */
246 struct list_head list;
247 /*
248 * register_event() callback will be used to add new userspace
249 * waiter for changes related to this event. Use eventfd_signal()
250 * on eventfd to send notification to userspace.
251 */
252 int (*register_event)(struct mem_cgroup *memcg,
253 struct eventfd_ctx *eventfd, const char *args);
254 /*
255 * unregister_event() callback will be called when userspace closes
256 * the eventfd or when the cgroup is removed. This callback must be set
257 * if you want to provide notification functionality.
258 */
259 void (*unregister_event)(struct mem_cgroup *memcg,
260 struct eventfd_ctx *eventfd);
261 /*
262 * All fields below needed to unregister event when
263 * userspace closes eventfd.
264 */
265 poll_table pt;
266 wait_queue_head_t *wqh;
267 wait_queue_t wait;
268 struct work_struct remove;
269};
270
230static void mem_cgroup_threshold(struct mem_cgroup *memcg); 271static void mem_cgroup_threshold(struct mem_cgroup *memcg);
231static void mem_cgroup_oom_notify(struct mem_cgroup *memcg); 272static void mem_cgroup_oom_notify(struct mem_cgroup *memcg);
232 273
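[Editor's note] This struct is the kernel half of the memcg (cgroup v1) notification interface: userspace creates an eventfd, writes "<event fd> <fd of a control file> [args]" to cgroup.event_control, and the matching register_event()/unregister_event() callbacks hook the eventfd into the threshold or OOM notifier lists; the poll/wait members let the event tear itself down when the eventfd is closed. A hedged userspace sketch of arming a usage threshold; the mount point, group name and 64 MiB value are assumptions:

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/eventfd.h>
#include <unistd.h>

int main(void)
{
	const char *grp = "/sys/fs/cgroup/memory/demo";	/* assumed v1 mount */
	char path[256], cmd[64];
	uint64_t ticks;
	int efd, ufd, cfd;

	efd = eventfd(0, 0);
	snprintf(path, sizeof(path), "%s/memory.usage_in_bytes", grp);
	ufd = open(path, O_RDONLY);
	snprintf(path, sizeof(path), "%s/cgroup.event_control", grp);
	cfd = open(path, O_WRONLY);
	if (efd < 0 || ufd < 0 || cfd < 0)
		return 1;

	/* "<event_fd> <target_fd> <threshold>" arms a usage threshold. */
	snprintf(cmd, sizeof(cmd), "%d %d %llu", efd, ufd, 64ULL << 20);
	if (write(cfd, cmd, strlen(cmd)) < 0)
		return 1;

	read(efd, &ticks, sizeof(ticks));	/* blocks until usage crosses 64 MiB */
	return 0;
}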
@@ -331,27 +372,20 @@ struct mem_cgroup {
331 atomic_t numainfo_updating; 372 atomic_t numainfo_updating;
332#endif 373#endif
333 374
375 /* List of events which userspace want to receive */
376 struct list_head event_list;
377 spinlock_t event_list_lock;
378
334 struct mem_cgroup_per_node *nodeinfo[0]; 379 struct mem_cgroup_per_node *nodeinfo[0];
335 /* WARNING: nodeinfo must be the last member here */ 380 /* WARNING: nodeinfo must be the last member here */
336}; 381};
337 382
338static size_t memcg_size(void)
339{
340 return sizeof(struct mem_cgroup) +
341 nr_node_ids * sizeof(struct mem_cgroup_per_node *);
342}
343
344/* internal only representation about the status of kmem accounting. */ 383/* internal only representation about the status of kmem accounting. */
345enum { 384enum {
346 KMEM_ACCOUNTED_ACTIVE = 0, /* accounted by this cgroup itself */ 385 KMEM_ACCOUNTED_ACTIVE, /* accounted by this cgroup itself */
347 KMEM_ACCOUNTED_ACTIVATED, /* static key enabled. */
348 KMEM_ACCOUNTED_DEAD, /* dead memcg with pending kmem charges */ 386 KMEM_ACCOUNTED_DEAD, /* dead memcg with pending kmem charges */
349}; 387};
350 388
351/* We account when limit is on, but only after call sites are patched */
352#define KMEM_ACCOUNTED_MASK \
353 ((1 << KMEM_ACCOUNTED_ACTIVE) | (1 << KMEM_ACCOUNTED_ACTIVATED))
354
355#ifdef CONFIG_MEMCG_KMEM 389#ifdef CONFIG_MEMCG_KMEM
356static inline void memcg_kmem_set_active(struct mem_cgroup *memcg) 390static inline void memcg_kmem_set_active(struct mem_cgroup *memcg)
357{ 391{
@@ -363,16 +397,6 @@ static bool memcg_kmem_is_active(struct mem_cgroup *memcg)
363 return test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags); 397 return test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags);
364} 398}
365 399
366static void memcg_kmem_set_activated(struct mem_cgroup *memcg)
367{
368 set_bit(KMEM_ACCOUNTED_ACTIVATED, &memcg->kmem_account_flags);
369}
370
371static void memcg_kmem_clear_activated(struct mem_cgroup *memcg)
372{
373 clear_bit(KMEM_ACCOUNTED_ACTIVATED, &memcg->kmem_account_flags);
374}
375
376static void memcg_kmem_mark_dead(struct mem_cgroup *memcg) 400static void memcg_kmem_mark_dead(struct mem_cgroup *memcg)
377{ 401{
378 /* 402 /*
@@ -490,11 +514,6 @@ struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr)
490 return &container_of(vmpr, struct mem_cgroup, vmpressure)->css; 514 return &container_of(vmpr, struct mem_cgroup, vmpressure)->css;
491} 515}
492 516
493struct vmpressure *css_to_vmpressure(struct cgroup_subsys_state *css)
494{
495 return &mem_cgroup_from_css(css)->vmpressure;
496}
497
498static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg) 517static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg)
499{ 518{
500 return (memcg == root_mem_cgroup); 519 return (memcg == root_mem_cgroup);
@@ -1098,16 +1117,22 @@ skip_node:
1098 * skipped and we should continue the tree walk. 1117 * skipped and we should continue the tree walk.
1099 * last_visited css is safe to use because it is 1118 * last_visited css is safe to use because it is
1100 * protected by css_get and the tree walk is rcu safe. 1119 * protected by css_get and the tree walk is rcu safe.
1120 *
1121 * We do not take a reference on the root of the tree walk
1122 * because we might race with the root removal when it would
1123 * be the only node in the iterated hierarchy and mem_cgroup_iter
1124 * would end up in an endless loop because it expects that at
1125 * least one valid node will be returned. Root cannot disappear
1126 * because caller of the iterator should hold it already so
1127 * skipping css reference should be safe.
1101 */ 1128 */
1102 if (next_css) { 1129 if (next_css) {
1103 struct mem_cgroup *mem = mem_cgroup_from_css(next_css); 1130 if ((next_css->flags & CSS_ONLINE) &&
1131 (next_css == &root->css || css_tryget(next_css)))
1132 return mem_cgroup_from_css(next_css);
1104 1133
1105 if (css_tryget(&mem->css)) 1134 prev_css = next_css;
1106 return mem; 1135 goto skip_node;
1107 else {
1108 prev_css = next_css;
1109 goto skip_node;
1110 }
1111 } 1136 }
1112 1137
1113 return NULL; 1138 return NULL;
@@ -1141,7 +1166,15 @@ mem_cgroup_iter_load(struct mem_cgroup_reclaim_iter *iter,
1141 if (iter->last_dead_count == *sequence) { 1166 if (iter->last_dead_count == *sequence) {
1142 smp_rmb(); 1167 smp_rmb();
1143 position = iter->last_visited; 1168 position = iter->last_visited;
1144 if (position && !css_tryget(&position->css)) 1169
1170 /*
1171 * We cannot take a reference to root because we might race
1172 * with root removal and returning NULL would end up in
1173 * an endless loop on the iterator user level when root
1174 * would be returned all the time.
1175 */
1176 if (position && position != root &&
1177 !css_tryget(&position->css))
1145 position = NULL; 1178 position = NULL;
1146 } 1179 }
1147 return position; 1180 return position;
@@ -1150,9 +1183,11 @@ mem_cgroup_iter_load(struct mem_cgroup_reclaim_iter *iter,
1150static void mem_cgroup_iter_update(struct mem_cgroup_reclaim_iter *iter, 1183static void mem_cgroup_iter_update(struct mem_cgroup_reclaim_iter *iter,
1151 struct mem_cgroup *last_visited, 1184 struct mem_cgroup *last_visited,
1152 struct mem_cgroup *new_position, 1185 struct mem_cgroup *new_position,
1186 struct mem_cgroup *root,
1153 int sequence) 1187 int sequence)
1154{ 1188{
1155 if (last_visited) 1189 /* root reference counting symmetric to mem_cgroup_iter_load */
1190 if (last_visited && last_visited != root)
1156 css_put(&last_visited->css); 1191 css_put(&last_visited->css);
1157 /* 1192 /*
1158 * We store the sequence count from the time @last_visited was 1193 * We store the sequence count from the time @last_visited was
@@ -1227,7 +1262,8 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
1227 memcg = __mem_cgroup_iter_next(root, last_visited); 1262 memcg = __mem_cgroup_iter_next(root, last_visited);
1228 1263
1229 if (reclaim) { 1264 if (reclaim) {
1230 mem_cgroup_iter_update(iter, last_visited, memcg, seq); 1265 mem_cgroup_iter_update(iter, last_visited, memcg, root,
1266 seq);
1231 1267
1232 if (!memcg) 1268 if (!memcg)
1233 iter->generation++; 1269 iter->generation++;
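[Editor's note] For reference, the contract these reference-counting changes preserve: a caller does a pre-order walk by feeding the previous result back in, and must either run the loop to completion or call mem_cgroup_iter_break() so the css reference taken by the iterator gets dropped. A sketch of the non-reclaim form; the predicate is a placeholder:

#include <linux/memcontrol.h>

static bool wants_to_stop(struct mem_cgroup *memcg)
{
	return false;	/* placeholder predicate for the sketch */
}

/* Visit @root and every descendant memcg in its hierarchy. */
static void walk_memcg_hierarchy(struct mem_cgroup *root)
{
	struct mem_cgroup *iter;

	for (iter = mem_cgroup_iter(root, NULL, NULL);
	     iter;
	     iter = mem_cgroup_iter(root, iter, NULL)) {
		if (wants_to_stop(iter)) {
			/* Breaking out early must drop the iterator's reference. */
			mem_cgroup_iter_break(root, iter);
			break;
		}
	}
}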
@@ -1647,13 +1683,13 @@ static void move_unlock_mem_cgroup(struct mem_cgroup *memcg,
1647 */ 1683 */
1648void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) 1684void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
1649{ 1685{
1650 struct cgroup *task_cgrp;
1651 struct cgroup *mem_cgrp;
1652 /* 1686 /*
1653 * Need a buffer in BSS, can't rely on allocations. The code relies 1687 * protects memcg_name and makes sure that parallel ooms do not
1654 * on the assumption that OOM is serialized for memory controller. 1688 * interleave
1655 * If this assumption is broken, revisit this code.
1656 */ 1689 */
1690 static DEFINE_SPINLOCK(oom_info_lock);
1691 struct cgroup *task_cgrp;
1692 struct cgroup *mem_cgrp;
1657 static char memcg_name[PATH_MAX]; 1693 static char memcg_name[PATH_MAX];
1658 int ret; 1694 int ret;
1659 struct mem_cgroup *iter; 1695 struct mem_cgroup *iter;
@@ -1662,6 +1698,7 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
1662 if (!p) 1698 if (!p)
1663 return; 1699 return;
1664 1700
1701 spin_lock(&oom_info_lock);
1665 rcu_read_lock(); 1702 rcu_read_lock();
1666 1703
1667 mem_cgrp = memcg->css.cgroup; 1704 mem_cgrp = memcg->css.cgroup;
@@ -1730,6 +1767,7 @@ done:
1730 1767
1731 pr_cont("\n"); 1768 pr_cont("\n");
1732 } 1769 }
1770 spin_unlock(&oom_info_lock);
1733} 1771}
1734 1772
1735/* 1773/*
@@ -1822,13 +1860,18 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
1822 break; 1860 break;
1823 }; 1861 };
1824 points = oom_badness(task, memcg, NULL, totalpages); 1862 points = oom_badness(task, memcg, NULL, totalpages);
1825 if (points > chosen_points) { 1863 if (!points || points < chosen_points)
1826 if (chosen) 1864 continue;
1827 put_task_struct(chosen); 1865 /* Prefer thread group leaders for display purposes */
1828 chosen = task; 1866 if (points == chosen_points &&
1829 chosen_points = points; 1867 thread_group_leader(chosen))
1830 get_task_struct(chosen); 1868 continue;
1831 } 1869
1870 if (chosen)
1871 put_task_struct(chosen);
1872 chosen = task;
1873 chosen_points = points;
1874 get_task_struct(chosen);
1832 } 1875 }
1833 css_task_iter_end(&it); 1876 css_task_iter_end(&it);
1834 } 1877 }
@@ -2861,7 +2904,7 @@ struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
2861 unsigned short id; 2904 unsigned short id;
2862 swp_entry_t ent; 2905 swp_entry_t ent;
2863 2906
2864 VM_BUG_ON(!PageLocked(page)); 2907 VM_BUG_ON_PAGE(!PageLocked(page), page);
2865 2908
2866 pc = lookup_page_cgroup(page); 2909 pc = lookup_page_cgroup(page);
2867 lock_page_cgroup(pc); 2910 lock_page_cgroup(pc);
@@ -2895,7 +2938,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
2895 bool anon; 2938 bool anon;
2896 2939
2897 lock_page_cgroup(pc); 2940 lock_page_cgroup(pc);
2898 VM_BUG_ON(PageCgroupUsed(pc)); 2941 VM_BUG_ON_PAGE(PageCgroupUsed(pc), page);
2899 /* 2942 /*
2900 * we don't need page_cgroup_lock about tail pages, because they are not 2943 * we don't need page_cgroup_lock about tail pages, because they are not
2901 * accessed by any other context at this point. 2944 * accessed by any other context at this point.
@@ -2930,7 +2973,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
2930 if (lrucare) { 2973 if (lrucare) {
2931 if (was_on_lru) { 2974 if (was_on_lru) {
2932 lruvec = mem_cgroup_zone_lruvec(zone, pc->mem_cgroup); 2975 lruvec = mem_cgroup_zone_lruvec(zone, pc->mem_cgroup);
2933 VM_BUG_ON(PageLRU(page)); 2976 VM_BUG_ON_PAGE(PageLRU(page), page);
2934 SetPageLRU(page); 2977 SetPageLRU(page);
2935 add_page_to_lru_list(page, lruvec, page_lru(page)); 2978 add_page_to_lru_list(page, lruvec, page_lru(page));
2936 } 2979 }
@@ -2956,10 +2999,12 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
2956static DEFINE_MUTEX(set_limit_mutex); 2999static DEFINE_MUTEX(set_limit_mutex);
2957 3000
2958#ifdef CONFIG_MEMCG_KMEM 3001#ifdef CONFIG_MEMCG_KMEM
3002static DEFINE_MUTEX(activate_kmem_mutex);
3003
2959static inline bool memcg_can_account_kmem(struct mem_cgroup *memcg) 3004static inline bool memcg_can_account_kmem(struct mem_cgroup *memcg)
2960{ 3005{
2961 return !mem_cgroup_disabled() && !mem_cgroup_is_root(memcg) && 3006 return !mem_cgroup_disabled() && !mem_cgroup_is_root(memcg) &&
2962 (memcg->kmem_account_flags & KMEM_ACCOUNTED_MASK); 3007 memcg_kmem_is_active(memcg);
2963} 3008}
2964 3009
2965/* 3010/*
@@ -2976,10 +3021,9 @@ static struct kmem_cache *memcg_params_to_cache(struct memcg_cache_params *p)
2976} 3021}
2977 3022
2978#ifdef CONFIG_SLABINFO 3023#ifdef CONFIG_SLABINFO
2979static int mem_cgroup_slabinfo_read(struct cgroup_subsys_state *css, 3024static int mem_cgroup_slabinfo_read(struct seq_file *m, void *v)
2980 struct cftype *cft, struct seq_file *m)
2981{ 3025{
2982 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 3026 struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
2983 struct memcg_cache_params *params; 3027 struct memcg_cache_params *params;
2984 3028
2985 if (!memcg_can_account_kmem(memcg)) 3029 if (!memcg_can_account_kmem(memcg))
@@ -3059,16 +3103,6 @@ static void memcg_uncharge_kmem(struct mem_cgroup *memcg, u64 size)
3059 css_put(&memcg->css); 3103 css_put(&memcg->css);
3060} 3104}
3061 3105
3062void memcg_cache_list_add(struct mem_cgroup *memcg, struct kmem_cache *cachep)
3063{
3064 if (!memcg)
3065 return;
3066
3067 mutex_lock(&memcg->slab_caches_mutex);
3068 list_add(&cachep->memcg_params->list, &memcg->memcg_slab_caches);
3069 mutex_unlock(&memcg->slab_caches_mutex);
3070}
3071
3072/* 3106/*
3073 * helper for accessing a memcg's index. It will be used as an index in the 3107 * helper for accessing a memcg's index. It will be used as an index in the
3074 * child cache array in kmem_cache, and also to derive its name. This function 3108 * child cache array in kmem_cache, and also to derive its name. This function
@@ -3079,43 +3113,6 @@ int memcg_cache_id(struct mem_cgroup *memcg)
3079 return memcg ? memcg->kmemcg_id : -1; 3113 return memcg ? memcg->kmemcg_id : -1;
3080} 3114}
3081 3115
3082/*
3083 * This ends up being protected by the set_limit mutex, during normal
3084 * operation, because that is its main call site.
3085 *
3086 * But when we create a new cache, we can call this as well if its parent
3087 * is kmem-limited. That will have to hold set_limit_mutex as well.
3088 */
3089int memcg_update_cache_sizes(struct mem_cgroup *memcg)
3090{
3091 int num, ret;
3092
3093 num = ida_simple_get(&kmem_limited_groups,
3094 0, MEMCG_CACHES_MAX_SIZE, GFP_KERNEL);
3095 if (num < 0)
3096 return num;
3097 /*
3098 * After this point, kmem_accounted (that we test atomically in
3099 * the beginning of this conditional), is no longer 0. This
3100 * guarantees only one process will set the following boolean
3101 * to true. We don't need test_and_set because we're protected
3102 * by the set_limit_mutex anyway.
3103 */
3104 memcg_kmem_set_activated(memcg);
3105
3106 ret = memcg_update_all_caches(num+1);
3107 if (ret) {
3108 ida_simple_remove(&kmem_limited_groups, num);
3109 memcg_kmem_clear_activated(memcg);
3110 return ret;
3111 }
3112
3113 memcg->kmemcg_id = num;
3114 INIT_LIST_HEAD(&memcg->memcg_slab_caches);
3115 mutex_init(&memcg->slab_caches_mutex);
3116 return 0;
3117}
3118
3119static size_t memcg_caches_array_size(int num_groups) 3116static size_t memcg_caches_array_size(int num_groups)
3120{ 3117{
3121 ssize_t size; 3118 ssize_t size;
@@ -3152,18 +3149,17 @@ int memcg_update_cache_size(struct kmem_cache *s, int num_groups)
3152 3149
3153 if (num_groups > memcg_limited_groups_array_size) { 3150 if (num_groups > memcg_limited_groups_array_size) {
3154 int i; 3151 int i;
3152 struct memcg_cache_params *new_params;
3155 ssize_t size = memcg_caches_array_size(num_groups); 3153 ssize_t size = memcg_caches_array_size(num_groups);
3156 3154
3157 size *= sizeof(void *); 3155 size *= sizeof(void *);
3158 size += offsetof(struct memcg_cache_params, memcg_caches); 3156 size += offsetof(struct memcg_cache_params, memcg_caches);
3159 3157
3160 s->memcg_params = kzalloc(size, GFP_KERNEL); 3158 new_params = kzalloc(size, GFP_KERNEL);
3161 if (!s->memcg_params) { 3159 if (!new_params)
3162 s->memcg_params = cur_params;
3163 return -ENOMEM; 3160 return -ENOMEM;
3164 }
3165 3161
3166 s->memcg_params->is_root_cache = true; 3162 new_params->is_root_cache = true;
3167 3163
3168 /* 3164 /*
3169 * There is the chance it will be bigger than 3165 * There is the chance it will be bigger than
@@ -3177,7 +3173,7 @@ int memcg_update_cache_size(struct kmem_cache *s, int num_groups)
3177 for (i = 0; i < memcg_limited_groups_array_size; i++) { 3173 for (i = 0; i < memcg_limited_groups_array_size; i++) {
3178 if (!cur_params->memcg_caches[i]) 3174 if (!cur_params->memcg_caches[i])
3179 continue; 3175 continue;
3180 s->memcg_params->memcg_caches[i] = 3176 new_params->memcg_caches[i] =
3181 cur_params->memcg_caches[i]; 3177 cur_params->memcg_caches[i];
3182 } 3178 }
3183 3179
@@ -3190,13 +3186,15 @@ int memcg_update_cache_size(struct kmem_cache *s, int num_groups)
3190 * bigger than the others. And all updates will reset this 3186 * bigger than the others. And all updates will reset this
3191 * anyway. 3187 * anyway.
3192 */ 3188 */
3193 kfree(cur_params); 3189 rcu_assign_pointer(s->memcg_params, new_params);
3190 if (cur_params)
3191 kfree_rcu(cur_params, rcu_head);
3194 } 3192 }
3195 return 0; 3193 return 0;
3196} 3194}
3197 3195
3198int memcg_register_cache(struct mem_cgroup *memcg, struct kmem_cache *s, 3196int memcg_alloc_cache_params(struct mem_cgroup *memcg, struct kmem_cache *s,
3199 struct kmem_cache *root_cache) 3197 struct kmem_cache *root_cache)
3200{ 3198{
3201 size_t size; 3199 size_t size;
3202 3200
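[Editor's note] The replacement above is the classic RCU publish pattern: build the larger array off to the side, publish it with rcu_assign_pointer(), and let kfree_rcu() delay the free past a grace period so lockless readers such as cache_from_memcg_idx() can never dereference freed memory. A generic, self-contained sketch of the same pattern with an invented structure; writers are assumed to serialize on an external lock, as slab_mutex does here:

#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/string.h>

struct table {
	int nr;
	struct rcu_head rcu_head;	/* required by kfree_rcu() */
	void *slots[];
};

static struct table __rcu *active;

/* Writer side: caller holds the (external) update lock. */
static int grow_table(int new_nr)
{
	struct table *old = rcu_dereference_protected(active, true);
	struct table *new;

	new = kzalloc(sizeof(*new) + new_nr * sizeof(void *), GFP_KERNEL);
	if (!new)
		return -ENOMEM;
	new->nr = new_nr;
	if (old)
		memcpy(new->slots, old->slots, old->nr * sizeof(void *));

	rcu_assign_pointer(active, new);	/* publish only after initialization */
	if (old)
		kfree_rcu(old, rcu_head);	/* freed after a grace period */
	return 0;
}

/* Reader side: no lock, just an RCU read-side critical section. */
static void *table_lookup(int idx)
{
	struct table *t;
	void *p = NULL;

	rcu_read_lock();
	t = rcu_dereference(active);
	if (t && idx < t->nr)
		p = t->slots[idx];
	rcu_read_unlock();
	return p;
}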
@@ -3224,35 +3222,85 @@ int memcg_register_cache(struct mem_cgroup *memcg, struct kmem_cache *s,
3224 return 0; 3222 return 0;
3225} 3223}
3226 3224
3227void memcg_release_cache(struct kmem_cache *s) 3225void memcg_free_cache_params(struct kmem_cache *s)
3226{
3227 kfree(s->memcg_params);
3228}
3229
3230void memcg_register_cache(struct kmem_cache *s)
3228{ 3231{
3229 struct kmem_cache *root; 3232 struct kmem_cache *root;
3230 struct mem_cgroup *memcg; 3233 struct mem_cgroup *memcg;
3231 int id; 3234 int id;
3232 3235
3233 /* 3236 if (is_root_cache(s))
3234 * This happens, for instance, when a root cache goes away before we
3235 * add any memcg.
3236 */
3237 if (!s->memcg_params)
3238 return; 3237 return;
3239 3238
3240 if (s->memcg_params->is_root_cache) 3239 /*
3241 goto out; 3240 * Holding the slab_mutex assures nobody will touch the memcg_caches
3241 * array while we are modifying it.
3242 */
3243 lockdep_assert_held(&slab_mutex);
3242 3244
3245 root = s->memcg_params->root_cache;
3243 memcg = s->memcg_params->memcg; 3246 memcg = s->memcg_params->memcg;
3244 id = memcg_cache_id(memcg); 3247 id = memcg_cache_id(memcg);
3248
3249 css_get(&memcg->css);
3250
3251
3252 /*
3253 * Since readers won't lock (see cache_from_memcg_idx()), we need a
3254 * barrier here to ensure nobody will see the kmem_cache partially
3255 * initialized.
3256 */
3257 smp_wmb();
3258
3259 /*
3260 * Initialize the pointer to this cache in its parent's memcg_params
3261 * before adding it to the memcg_slab_caches list, otherwise we can
3262 * fail to convert memcg_params_to_cache() while traversing the list.
3263 */
3264 VM_BUG_ON(root->memcg_params->memcg_caches[id]);
3265 root->memcg_params->memcg_caches[id] = s;
3266
3267 mutex_lock(&memcg->slab_caches_mutex);
3268 list_add(&s->memcg_params->list, &memcg->memcg_slab_caches);
3269 mutex_unlock(&memcg->slab_caches_mutex);
3270}
3271
3272void memcg_unregister_cache(struct kmem_cache *s)
3273{
3274 struct kmem_cache *root;
3275 struct mem_cgroup *memcg;
3276 int id;
3277
3278 if (is_root_cache(s))
3279 return;
3280
3281 /*
3282 * Holding the slab_mutex assures nobody will touch the memcg_caches
3283 * array while we are modifying it.
3284 */
3285 lockdep_assert_held(&slab_mutex);
3245 3286
3246 root = s->memcg_params->root_cache; 3287 root = s->memcg_params->root_cache;
3247 root->memcg_params->memcg_caches[id] = NULL; 3288 memcg = s->memcg_params->memcg;
3289 id = memcg_cache_id(memcg);
3248 3290
3249 mutex_lock(&memcg->slab_caches_mutex); 3291 mutex_lock(&memcg->slab_caches_mutex);
3250 list_del(&s->memcg_params->list); 3292 list_del(&s->memcg_params->list);
3251 mutex_unlock(&memcg->slab_caches_mutex); 3293 mutex_unlock(&memcg->slab_caches_mutex);
3252 3294
3295 /*
3296 * Clear the pointer to this cache in its parent's memcg_params only
3297 * after removing it from the memcg_slab_caches list, otherwise we can
3298 * fail to convert memcg_params_to_cache() while traversing the list.
3299 */
3300 VM_BUG_ON(!root->memcg_params->memcg_caches[id]);
3301 root->memcg_params->memcg_caches[id] = NULL;
3302
3253 css_put(&memcg->css); 3303 css_put(&memcg->css);
3254out:
3255 kfree(s->memcg_params);
3256} 3304}
3257 3305
3258/* 3306/*
@@ -3311,11 +3359,9 @@ static void kmem_cache_destroy_work_func(struct work_struct *w)
3311 * So if we aren't down to zero, we'll just schedule a worker and try 3359 * So if we aren't down to zero, we'll just schedule a worker and try
3312 * again 3360 * again
3313 */ 3361 */
3314 if (atomic_read(&cachep->memcg_params->nr_pages) != 0) { 3362 if (atomic_read(&cachep->memcg_params->nr_pages) != 0)
3315 kmem_cache_shrink(cachep); 3363 kmem_cache_shrink(cachep);
3316 if (atomic_read(&cachep->memcg_params->nr_pages) == 0) 3364 else
3317 return;
3318 } else
3319 kmem_cache_destroy(cachep); 3365 kmem_cache_destroy(cachep);
3320} 3366}
3321 3367
@@ -3351,27 +3397,16 @@ void mem_cgroup_destroy_cache(struct kmem_cache *cachep)
3351 schedule_work(&cachep->memcg_params->destroy); 3397 schedule_work(&cachep->memcg_params->destroy);
3352} 3398}
3353 3399
3354/* 3400static struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg,
3355 * This lock protects updaters, not readers. We want readers to be as fast as 3401 struct kmem_cache *s)
3356 * they can, and they will either see NULL or a valid cache value. Our model
3357 * allow them to see NULL, in which case the root memcg will be selected.
3358 *
3359 * We need this lock because multiple allocations to the same cache from a non
3360 * will span more than one worker. Only one of them can create the cache.
3361 */
3362static DEFINE_MUTEX(memcg_cache_mutex);
3363
3364/*
3365 * Called with memcg_cache_mutex held
3366 */
3367static struct kmem_cache *kmem_cache_dup(struct mem_cgroup *memcg,
3368 struct kmem_cache *s)
3369{ 3402{
3370 struct kmem_cache *new; 3403 struct kmem_cache *new = NULL;
3371 static char *tmp_name = NULL; 3404 static char *tmp_name = NULL;
3405 static DEFINE_MUTEX(mutex); /* protects tmp_name */
3372 3406
3373 lockdep_assert_held(&memcg_cache_mutex); 3407 BUG_ON(!memcg_can_account_kmem(memcg));
3374 3408
3409 mutex_lock(&mutex);
3375 /* 3410 /*
3376 * kmem_cache_create_memcg duplicates the given name and 3411 * kmem_cache_create_memcg duplicates the given name and
3377 * cgroup_name for this name requires RCU context. 3412 * cgroup_name for this name requires RCU context.
@@ -3381,7 +3416,7 @@ static struct kmem_cache *kmem_cache_dup(struct mem_cgroup *memcg,
3381 if (!tmp_name) { 3416 if (!tmp_name) {
3382 tmp_name = kmalloc(PATH_MAX, GFP_KERNEL); 3417 tmp_name = kmalloc(PATH_MAX, GFP_KERNEL);
3383 if (!tmp_name) 3418 if (!tmp_name)
3384 return NULL; 3419 goto out;
3385 } 3420 }
3386 3421
3387 rcu_read_lock(); 3422 rcu_read_lock();
@@ -3391,48 +3426,13 @@ static struct kmem_cache *kmem_cache_dup(struct mem_cgroup *memcg,
3391 3426
3392 new = kmem_cache_create_memcg(memcg, tmp_name, s->object_size, s->align, 3427 new = kmem_cache_create_memcg(memcg, tmp_name, s->object_size, s->align,
3393 (s->flags & ~SLAB_PANIC), s->ctor, s); 3428 (s->flags & ~SLAB_PANIC), s->ctor, s);
3394
3395 if (new) 3429 if (new)
3396 new->allocflags |= __GFP_KMEMCG; 3430 new->allocflags |= __GFP_KMEMCG;
3397 3431 else
3398 return new; 3432 new = s;
3399}
3400
3401static struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg,
3402 struct kmem_cache *cachep)
3403{
3404 struct kmem_cache *new_cachep;
3405 int idx;
3406
3407 BUG_ON(!memcg_can_account_kmem(memcg));
3408
3409 idx = memcg_cache_id(memcg);
3410
3411 mutex_lock(&memcg_cache_mutex);
3412 new_cachep = cache_from_memcg_idx(cachep, idx);
3413 if (new_cachep) {
3414 css_put(&memcg->css);
3415 goto out;
3416 }
3417
3418 new_cachep = kmem_cache_dup(memcg, cachep);
3419 if (new_cachep == NULL) {
3420 new_cachep = cachep;
3421 css_put(&memcg->css);
3422 goto out;
3423 }
3424
3425 atomic_set(&new_cachep->memcg_params->nr_pages , 0);
3426
3427 cachep->memcg_params->memcg_caches[idx] = new_cachep;
3428 /*
3429 * the readers won't lock, make sure everybody sees the updated value,
3430 * so they won't put stuff in the queue again for no reason
3431 */
3432 wmb();
3433out: 3433out:
3434 mutex_unlock(&memcg_cache_mutex); 3434 mutex_unlock(&mutex);
3435 return new_cachep; 3435 return new;
3436} 3436}
3437 3437
3438void kmem_cache_destroy_memcg_children(struct kmem_cache *s) 3438void kmem_cache_destroy_memcg_children(struct kmem_cache *s)
@@ -3452,9 +3452,10 @@ void kmem_cache_destroy_memcg_children(struct kmem_cache *s)
3452 * 3452 *
3453 * Still, we don't want anyone else freeing memcg_caches under our 3453 * Still, we don't want anyone else freeing memcg_caches under our
3454 * noses, which can happen if a new memcg comes to life. As usual, 3454 * noses, which can happen if a new memcg comes to life. As usual,
3455 * we'll take the set_limit_mutex to protect ourselves against this. 3455 * we'll take the activate_kmem_mutex to protect ourselves against
3456 * this.
3456 */ 3457 */
3457 mutex_lock(&set_limit_mutex); 3458 mutex_lock(&activate_kmem_mutex);
3458 for_each_memcg_cache_index(i) { 3459 for_each_memcg_cache_index(i) {
3459 c = cache_from_memcg_idx(s, i); 3460 c = cache_from_memcg_idx(s, i);
3460 if (!c) 3461 if (!c)
@@ -3477,7 +3478,7 @@ void kmem_cache_destroy_memcg_children(struct kmem_cache *s)
3477 cancel_work_sync(&c->memcg_params->destroy); 3478 cancel_work_sync(&c->memcg_params->destroy);
3478 kmem_cache_destroy(c); 3479 kmem_cache_destroy(c);
3479 } 3480 }
3480 mutex_unlock(&set_limit_mutex); 3481 mutex_unlock(&activate_kmem_mutex);
3481} 3482}
3482 3483
3483struct create_work { 3484struct create_work {
@@ -3509,6 +3510,7 @@ static void memcg_create_cache_work_func(struct work_struct *w)
3509 3510
3510 cw = container_of(w, struct create_work, work); 3511 cw = container_of(w, struct create_work, work);
3511 memcg_create_kmem_cache(cw->memcg, cw->cachep); 3512 memcg_create_kmem_cache(cw->memcg, cw->cachep);
3513 css_put(&cw->memcg->css);
3512 kfree(cw); 3514 kfree(cw);
3513} 3515}
3514 3516
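The css_put() added to the work function changes where the memcg reference pinned for asynchronous cache creation gets released: the worker now drops it itself once creation has been attempted. The enqueue side is not part of this hunk, so the pairing below is only a sketch with assumed names; the point is the get/put balance.

struct create_work_sketch {
        struct mem_cgroup *memcg;
        struct kmem_cache *cachep;
        struct work_struct work;
};

static void create_work_fn_sketch(struct work_struct *w)
{
        struct create_work_sketch *cw =
                container_of(w, struct create_work_sketch, work);

        /* ... create the per-memcg copy of cw->cachep ... */
        css_put(&cw->memcg->css);       /* balances the pin taken at enqueue */
        kfree(cw);
}

static void enqueue_sketch(struct mem_cgroup *memcg, struct kmem_cache *cachep)
{
        struct create_work_sketch *cw;

        if (!css_tryget(&memcg->css))   /* pin the memcg for the worker */
                return;
        cw = kmalloc(sizeof(*cw), GFP_NOWAIT);
        if (!cw) {
                css_put(&memcg->css);
                return;
        }
        cw->memcg = memcg;
        cw->cachep = cachep;
        INIT_WORK(&cw->work, create_work_fn_sketch);
        schedule_work(&cw->work);
}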
@@ -3568,7 +3570,7 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep,
3568 gfp_t gfp) 3570 gfp_t gfp)
3569{ 3571{
3570 struct mem_cgroup *memcg; 3572 struct mem_cgroup *memcg;
3571 int idx; 3573 struct kmem_cache *memcg_cachep;
3572 3574
3573 VM_BUG_ON(!cachep->memcg_params); 3575 VM_BUG_ON(!cachep->memcg_params);
3574 VM_BUG_ON(!cachep->memcg_params->is_root_cache); 3576 VM_BUG_ON(!cachep->memcg_params->is_root_cache);
@@ -3582,15 +3584,9 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep,
3582 if (!memcg_can_account_kmem(memcg)) 3584 if (!memcg_can_account_kmem(memcg))
3583 goto out; 3585 goto out;
3584 3586
3585 idx = memcg_cache_id(memcg); 3587 memcg_cachep = cache_from_memcg_idx(cachep, memcg_cache_id(memcg));
3586 3588 if (likely(memcg_cachep)) {
3587 /* 3589 cachep = memcg_cachep;
 3588 * barrier to make sure we're always seeing the up to date value. The

3589 * code updating memcg_caches will issue a write barrier to match this.
3590 */
3591 read_barrier_depends();
3592 if (likely(cache_from_memcg_idx(cachep, idx))) {
3593 cachep = cache_from_memcg_idx(cachep, idx);
3594 goto out; 3590 goto out;
3595 } 3591 }
3596 3592
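__memcg_kmem_get_cache() now reads the per-memcg cache pointer exactly once and drops the explicit read_barrier_depends(); presumably the ordering it relied on lives inside cache_from_memcg_idx() after this series, but that helper's body is not part of this hunk, so treat that as an assumption. The fast path reduces to roughly:

static struct kmem_cache *
kmem_get_cache_fastpath_sketch(struct kmem_cache *cachep,
                               struct mem_cgroup *memcg)
{
        struct kmem_cache *memcg_cachep;

        memcg_cachep = cache_from_memcg_idx(cachep, memcg_cache_id(memcg));
        if (likely(memcg_cachep))
                return memcg_cachep;    /* per-memcg copy already exists */

        /* otherwise the caller falls back to the root cache and kicks
         * off asynchronous creation of the per-memcg copy */
        return cachep;
}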
@@ -3744,7 +3740,7 @@ void __memcg_kmem_uncharge_pages(struct page *page, int order)
3744 if (!memcg) 3740 if (!memcg)
3745 return; 3741 return;
3746 3742
3747 VM_BUG_ON(mem_cgroup_is_root(memcg)); 3743 VM_BUG_ON_PAGE(mem_cgroup_is_root(memcg), page);
3748 memcg_uncharge_kmem(memcg, PAGE_SIZE << order); 3744 memcg_uncharge_kmem(memcg, PAGE_SIZE << order);
3749} 3745}
3750#else 3746#else
@@ -3823,7 +3819,7 @@ static int mem_cgroup_move_account(struct page *page,
3823 bool anon = PageAnon(page); 3819 bool anon = PageAnon(page);
3824 3820
3825 VM_BUG_ON(from == to); 3821 VM_BUG_ON(from == to);
3826 VM_BUG_ON(PageLRU(page)); 3822 VM_BUG_ON_PAGE(PageLRU(page), page);
3827 /* 3823 /*
3828 * The page is isolated from LRU. So, collapse function 3824 * The page is isolated from LRU. So, collapse function
3829 * will not handle this page. But page splitting can happen. 3825 * will not handle this page. But page splitting can happen.
@@ -3916,7 +3912,7 @@ static int mem_cgroup_move_parent(struct page *page,
3916 parent = root_mem_cgroup; 3912 parent = root_mem_cgroup;
3917 3913
3918 if (nr_pages > 1) { 3914 if (nr_pages > 1) {
3919 VM_BUG_ON(!PageTransHuge(page)); 3915 VM_BUG_ON_PAGE(!PageTransHuge(page), page);
3920 flags = compound_lock_irqsave(page); 3916 flags = compound_lock_irqsave(page);
3921 } 3917 }
3922 3918
@@ -3950,7 +3946,7 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
3950 3946
3951 if (PageTransHuge(page)) { 3947 if (PageTransHuge(page)) {
3952 nr_pages <<= compound_order(page); 3948 nr_pages <<= compound_order(page);
3953 VM_BUG_ON(!PageTransHuge(page)); 3949 VM_BUG_ON_PAGE(!PageTransHuge(page), page);
3954 /* 3950 /*
3955 * Never OOM-kill a process for a huge page. The 3951 * Never OOM-kill a process for a huge page. The
3956 * fault handler will fall back to regular pages. 3952 * fault handler will fall back to regular pages.
@@ -3970,8 +3966,8 @@ int mem_cgroup_newpage_charge(struct page *page,
3970{ 3966{
3971 if (mem_cgroup_disabled()) 3967 if (mem_cgroup_disabled())
3972 return 0; 3968 return 0;
3973 VM_BUG_ON(page_mapped(page)); 3969 VM_BUG_ON_PAGE(page_mapped(page), page);
3974 VM_BUG_ON(page->mapping && !PageAnon(page)); 3970 VM_BUG_ON_PAGE(page->mapping && !PageAnon(page), page);
3975 VM_BUG_ON(!mm); 3971 VM_BUG_ON(!mm);
3976 return mem_cgroup_charge_common(page, mm, gfp_mask, 3972 return mem_cgroup_charge_common(page, mm, gfp_mask,
3977 MEM_CGROUP_CHARGE_TYPE_ANON); 3973 MEM_CGROUP_CHARGE_TYPE_ANON);
@@ -4175,7 +4171,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype,
4175 4171
4176 if (PageTransHuge(page)) { 4172 if (PageTransHuge(page)) {
4177 nr_pages <<= compound_order(page); 4173 nr_pages <<= compound_order(page);
4178 VM_BUG_ON(!PageTransHuge(page)); 4174 VM_BUG_ON_PAGE(!PageTransHuge(page), page);
4179 } 4175 }
4180 /* 4176 /*
4181 * Check if our page_cgroup is valid 4177 * Check if our page_cgroup is valid
@@ -4267,7 +4263,7 @@ void mem_cgroup_uncharge_page(struct page *page)
4267 /* early check. */ 4263 /* early check. */
4268 if (page_mapped(page)) 4264 if (page_mapped(page))
4269 return; 4265 return;
4270 VM_BUG_ON(page->mapping && !PageAnon(page)); 4266 VM_BUG_ON_PAGE(page->mapping && !PageAnon(page), page);
4271 /* 4267 /*
4272 * If the page is in swap cache, uncharge should be deferred 4268 * If the page is in swap cache, uncharge should be deferred
4273 * to the swap path, which also properly accounts swap usage 4269 * to the swap path, which also properly accounts swap usage
@@ -4287,8 +4283,8 @@ void mem_cgroup_uncharge_page(struct page *page)
4287 4283
4288void mem_cgroup_uncharge_cache_page(struct page *page) 4284void mem_cgroup_uncharge_cache_page(struct page *page)
4289{ 4285{
4290 VM_BUG_ON(page_mapped(page)); 4286 VM_BUG_ON_PAGE(page_mapped(page), page);
4291 VM_BUG_ON(page->mapping); 4287 VM_BUG_ON_PAGE(page->mapping, page);
4292 __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE, false); 4288 __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE, false);
4293} 4289}
4294 4290
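Many hunks in this diff replace VM_BUG_ON(cond) with VM_BUG_ON_PAGE(cond, page). Roughly, the new macro dumps the offending page before BUG() fires, which is also why dump_page() grows a reason string elsewhere in this merge. The definition sketched below approximates include/linux/mmdebug.h from this series; the exact message string may differ.

#ifdef CONFIG_DEBUG_VM
#define VM_BUG_ON_PAGE(cond, page)                                      \
        do {                                                            \
                if (unlikely(cond)) {                                   \
                        dump_page(page, "VM_BUG_ON_PAGE(" #cond ")");   \
                        BUG();                                          \
                }                                                       \
        } while (0)
#else
#define VM_BUG_ON_PAGE(cond, page) BUILD_BUG_ON_INVALID(cond)
#endif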
@@ -5112,14 +5108,12 @@ static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
5112 return val << PAGE_SHIFT; 5108 return val << PAGE_SHIFT;
5113} 5109}
5114 5110
5115static ssize_t mem_cgroup_read(struct cgroup_subsys_state *css, 5111static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css,
5116 struct cftype *cft, struct file *file, 5112 struct cftype *cft)
5117 char __user *buf, size_t nbytes, loff_t *ppos)
5118{ 5113{
5119 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 5114 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5120 char str[64];
5121 u64 val; 5115 u64 val;
5122 int name, len; 5116 int name;
5123 enum res_type type; 5117 enum res_type type;
5124 5118
5125 type = MEMFILE_TYPE(cft->private); 5119 type = MEMFILE_TYPE(cft->private);
@@ -5145,15 +5139,26 @@ static ssize_t mem_cgroup_read(struct cgroup_subsys_state *css,
5145 BUG(); 5139 BUG();
5146 } 5140 }
5147 5141
5148 len = scnprintf(str, sizeof(str), "%llu\n", (unsigned long long)val); 5142 return val;
5149 return simple_read_from_buffer(buf, nbytes, ppos, str, len);
5150} 5143}
5151 5144
5152static int memcg_update_kmem_limit(struct cgroup_subsys_state *css, u64 val)
5153{
5154 int ret = -EINVAL;
5155#ifdef CONFIG_MEMCG_KMEM 5145#ifdef CONFIG_MEMCG_KMEM
5156 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 5146/* should be called with activate_kmem_mutex held */
5147static int __memcg_activate_kmem(struct mem_cgroup *memcg,
5148 unsigned long long limit)
5149{
5150 int err = 0;
5151 int memcg_id;
5152
5153 if (memcg_kmem_is_active(memcg))
5154 return 0;
5155
5156 /*
5157 * We are going to allocate memory for data shared by all memory
5158 * cgroups so let's stop accounting here.
5159 */
5160 memcg_stop_kmem_account();
5161
5157 /* 5162 /*
5158 * For simplicity, we won't allow this to be disabled. It also can't 5163 * For simplicity, we won't allow this to be disabled. It also can't
5159 * be changed if the cgroup has children already, or if tasks had 5164 * be changed if the cgroup has children already, or if tasks had
@@ -5167,72 +5172,101 @@ static int memcg_update_kmem_limit(struct cgroup_subsys_state *css, u64 val)
5167 * of course permitted. 5172 * of course permitted.
5168 */ 5173 */
5169 mutex_lock(&memcg_create_mutex); 5174 mutex_lock(&memcg_create_mutex);
5170 mutex_lock(&set_limit_mutex); 5175 if (cgroup_task_count(memcg->css.cgroup) || memcg_has_children(memcg))
5171 if (!memcg->kmem_account_flags && val != RES_COUNTER_MAX) { 5176 err = -EBUSY;
5172 if (cgroup_task_count(css->cgroup) || memcg_has_children(memcg)) { 5177 mutex_unlock(&memcg_create_mutex);
5173 ret = -EBUSY; 5178 if (err)
5174 goto out; 5179 goto out;
5175 }
5176 ret = res_counter_set_limit(&memcg->kmem, val);
5177 VM_BUG_ON(ret);
5178 5180
5179 ret = memcg_update_cache_sizes(memcg); 5181 memcg_id = ida_simple_get(&kmem_limited_groups,
5180 if (ret) { 5182 0, MEMCG_CACHES_MAX_SIZE, GFP_KERNEL);
5181 res_counter_set_limit(&memcg->kmem, RES_COUNTER_MAX); 5183 if (memcg_id < 0) {
5182 goto out; 5184 err = memcg_id;
5183 } 5185 goto out;
5184 static_key_slow_inc(&memcg_kmem_enabled_key); 5186 }
5185 /* 5187
5186 * setting the active bit after the inc will guarantee no one 5188 /*
5187 * starts accounting before all call sites are patched 5189 * Make sure we have enough space for this cgroup in each root cache's
5188 */ 5190 * memcg_params.
5189 memcg_kmem_set_active(memcg); 5191 */
5190 } else 5192 err = memcg_update_all_caches(memcg_id + 1);
5191 ret = res_counter_set_limit(&memcg->kmem, val); 5193 if (err)
5194 goto out_rmid;
5195
5196 memcg->kmemcg_id = memcg_id;
5197 INIT_LIST_HEAD(&memcg->memcg_slab_caches);
5198 mutex_init(&memcg->slab_caches_mutex);
5199
5200 /*
5201 * We couldn't have accounted to this cgroup, because it hasn't got the
5202 * active bit set yet, so this should succeed.
5203 */
5204 err = res_counter_set_limit(&memcg->kmem, limit);
5205 VM_BUG_ON(err);
5206
5207 static_key_slow_inc(&memcg_kmem_enabled_key);
5208 /*
5209 * Setting the active bit after enabling static branching will
5210 * guarantee no one starts accounting before all call sites are
5211 * patched.
5212 */
5213 memcg_kmem_set_active(memcg);
5192out: 5214out:
5193 mutex_unlock(&set_limit_mutex); 5215 memcg_resume_kmem_account();
5194 mutex_unlock(&memcg_create_mutex); 5216 return err;
5195#endif 5217
5218out_rmid:
5219 ida_simple_remove(&kmem_limited_groups, memcg_id);
5220 goto out;
5221}
5222
5223static int memcg_activate_kmem(struct mem_cgroup *memcg,
5224 unsigned long long limit)
5225{
5226 int ret;
5227
5228 mutex_lock(&activate_kmem_mutex);
5229 ret = __memcg_activate_kmem(memcg, limit);
5230 mutex_unlock(&activate_kmem_mutex);
5231 return ret;
5232}
5233
5234static int memcg_update_kmem_limit(struct mem_cgroup *memcg,
5235 unsigned long long val)
5236{
5237 int ret;
5238
5239 if (!memcg_kmem_is_active(memcg))
5240 ret = memcg_activate_kmem(memcg, val);
5241 else
5242 ret = res_counter_set_limit(&memcg->kmem, val);
5196 return ret; 5243 return ret;
5197} 5244}
5198 5245
5199#ifdef CONFIG_MEMCG_KMEM
5200static int memcg_propagate_kmem(struct mem_cgroup *memcg) 5246static int memcg_propagate_kmem(struct mem_cgroup *memcg)
5201{ 5247{
5202 int ret = 0; 5248 int ret = 0;
5203 struct mem_cgroup *parent = parent_mem_cgroup(memcg); 5249 struct mem_cgroup *parent = parent_mem_cgroup(memcg);
5204 if (!parent)
5205 goto out;
5206 5250
5207 memcg->kmem_account_flags = parent->kmem_account_flags; 5251 if (!parent)
5208 /* 5252 return 0;
5209 * When that happen, we need to disable the static branch only on those
5210 * memcgs that enabled it. To achieve this, we would be forced to
5211 * complicate the code by keeping track of which memcgs were the ones
5212 * that actually enabled limits, and which ones got it from its
5213 * parents.
5214 *
5215 * It is a lot simpler just to do static_key_slow_inc() on every child
5216 * that is accounted.
5217 */
5218 if (!memcg_kmem_is_active(memcg))
5219 goto out;
5220 5253
5254 mutex_lock(&activate_kmem_mutex);
5221 /* 5255 /*
5222 * __mem_cgroup_free() will issue static_key_slow_dec() because this 5256 * If the parent cgroup is not kmem-active now, it cannot be activated
5223 * memcg is active already. If the later initialization fails then the 5257 * after this point, because it has at least one child already.
5224 * cgroup core triggers the cleanup so we do not have to do it here.
5225 */ 5258 */
5226 static_key_slow_inc(&memcg_kmem_enabled_key); 5259 if (memcg_kmem_is_active(parent))
5227 5260 ret = __memcg_activate_kmem(memcg, RES_COUNTER_MAX);
5228 mutex_lock(&set_limit_mutex); 5261 mutex_unlock(&activate_kmem_mutex);
5229 memcg_stop_kmem_account();
5230 ret = memcg_update_cache_sizes(memcg);
5231 memcg_resume_kmem_account();
5232 mutex_unlock(&set_limit_mutex);
5233out:
5234 return ret; 5262 return ret;
5235} 5263}
5264#else
5265static int memcg_update_kmem_limit(struct mem_cgroup *memcg,
5266 unsigned long long val)
5267{
5268 return -EINVAL;
5269}
5236#endif /* CONFIG_MEMCG_KMEM */ 5270#endif /* CONFIG_MEMCG_KMEM */
5237 5271
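The kmem activation rework above is mostly about ordering. Condensed, with the no-children/no-tasks check and some error handling trimmed, the sequence the new __memcg_activate_kmem() enforces looks like the sketch below; the comments spell out why each step precedes the next.

static int activate_kmem_ordering_sketch(struct mem_cgroup *memcg,
                                         unsigned long long limit)
{
        int id, err;

        if (memcg_kmem_is_active(memcg))
                return 0;

        memcg_stop_kmem_account();      /* don't account our own allocations */

        id = ida_simple_get(&kmem_limited_groups, 0,
                            MEMCG_CACHES_MAX_SIZE, GFP_KERNEL);
        if (id < 0) {
                err = id;
                goto out;
        }

        /* grow every root cache's memcg_params before the id is published */
        err = memcg_update_all_caches(id + 1);
        if (err) {
                ida_simple_remove(&kmem_limited_groups, id);
                goto out;
        }

        memcg->kmemcg_id = id;

        /* nothing can have been charged yet, so this cannot fail */
        err = res_counter_set_limit(&memcg->kmem, limit);

        /* enable the static branch before the active bit so no task can
         * start charging before every call site has been patched */
        static_key_slow_inc(&memcg_kmem_enabled_key);
        memcg_kmem_set_active(memcg);
out:
        memcg_resume_kmem_account();
        return err;
}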
5238/* 5272/*
@@ -5266,7 +5300,7 @@ static int mem_cgroup_write(struct cgroup_subsys_state *css, struct cftype *cft,
5266 else if (type == _MEMSWAP) 5300 else if (type == _MEMSWAP)
5267 ret = mem_cgroup_resize_memsw_limit(memcg, val); 5301 ret = mem_cgroup_resize_memsw_limit(memcg, val);
5268 else if (type == _KMEM) 5302 else if (type == _KMEM)
5269 ret = memcg_update_kmem_limit(css, val); 5303 ret = memcg_update_kmem_limit(memcg, val);
5270 else 5304 else
5271 return -EINVAL; 5305 return -EINVAL;
5272 break; 5306 break;
@@ -5383,8 +5417,7 @@ static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css,
5383#endif 5417#endif
5384 5418
5385#ifdef CONFIG_NUMA 5419#ifdef CONFIG_NUMA
5386static int memcg_numa_stat_show(struct cgroup_subsys_state *css, 5420static int memcg_numa_stat_show(struct seq_file *m, void *v)
5387 struct cftype *cft, struct seq_file *m)
5388{ 5421{
5389 struct numa_stat { 5422 struct numa_stat {
5390 const char *name; 5423 const char *name;
@@ -5400,7 +5433,7 @@ static int memcg_numa_stat_show(struct cgroup_subsys_state *css,
5400 const struct numa_stat *stat; 5433 const struct numa_stat *stat;
5401 int nid; 5434 int nid;
5402 unsigned long nr; 5435 unsigned long nr;
5403 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 5436 struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
5404 5437
5405 for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) { 5438 for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) {
5406 nr = mem_cgroup_nr_lru_pages(memcg, stat->lru_mask); 5439 nr = mem_cgroup_nr_lru_pages(memcg, stat->lru_mask);
@@ -5439,10 +5472,9 @@ static inline void mem_cgroup_lru_names_not_uptodate(void)
5439 BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS); 5472 BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS);
5440} 5473}
5441 5474
5442static int memcg_stat_show(struct cgroup_subsys_state *css, struct cftype *cft, 5475static int memcg_stat_show(struct seq_file *m, void *v)
5443 struct seq_file *m)
5444{ 5476{
5445 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 5477 struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
5446 struct mem_cgroup *mi; 5478 struct mem_cgroup *mi;
5447 unsigned int i; 5479 unsigned int i;
5448 5480
@@ -5651,13 +5683,11 @@ static void mem_cgroup_oom_notify(struct mem_cgroup *memcg)
5651 mem_cgroup_oom_notify_cb(iter); 5683 mem_cgroup_oom_notify_cb(iter);
5652} 5684}
5653 5685
5654static int mem_cgroup_usage_register_event(struct cgroup_subsys_state *css, 5686static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
5655 struct cftype *cft, struct eventfd_ctx *eventfd, const char *args) 5687 struct eventfd_ctx *eventfd, const char *args, enum res_type type)
5656{ 5688{
5657 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5658 struct mem_cgroup_thresholds *thresholds; 5689 struct mem_cgroup_thresholds *thresholds;
5659 struct mem_cgroup_threshold_ary *new; 5690 struct mem_cgroup_threshold_ary *new;
5660 enum res_type type = MEMFILE_TYPE(cft->private);
5661 u64 threshold, usage; 5691 u64 threshold, usage;
5662 int i, size, ret; 5692 int i, size, ret;
5663 5693
@@ -5734,13 +5764,23 @@ unlock:
5734 return ret; 5764 return ret;
5735} 5765}
5736 5766
5737static void mem_cgroup_usage_unregister_event(struct cgroup_subsys_state *css, 5767static int mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
5738 struct cftype *cft, struct eventfd_ctx *eventfd) 5768 struct eventfd_ctx *eventfd, const char *args)
5769{
5770 return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEM);
5771}
5772
5773static int memsw_cgroup_usage_register_event(struct mem_cgroup *memcg,
5774 struct eventfd_ctx *eventfd, const char *args)
5775{
5776 return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEMSWAP);
5777}
5778
5779static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
5780 struct eventfd_ctx *eventfd, enum res_type type)
5739{ 5781{
5740 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5741 struct mem_cgroup_thresholds *thresholds; 5782 struct mem_cgroup_thresholds *thresholds;
5742 struct mem_cgroup_threshold_ary *new; 5783 struct mem_cgroup_threshold_ary *new;
5743 enum res_type type = MEMFILE_TYPE(cft->private);
5744 u64 usage; 5784 u64 usage;
5745 int i, j, size; 5785 int i, j, size;
5746 5786
@@ -5813,14 +5853,23 @@ unlock:
5813 mutex_unlock(&memcg->thresholds_lock); 5853 mutex_unlock(&memcg->thresholds_lock);
5814} 5854}
5815 5855
5816static int mem_cgroup_oom_register_event(struct cgroup_subsys_state *css, 5856static void mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
5817 struct cftype *cft, struct eventfd_ctx *eventfd, const char *args) 5857 struct eventfd_ctx *eventfd)
5858{
5859 return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEM);
5860}
5861
5862static void memsw_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
5863 struct eventfd_ctx *eventfd)
5864{
5865 return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEMSWAP);
5866}
5867
5868static int mem_cgroup_oom_register_event(struct mem_cgroup *memcg,
5869 struct eventfd_ctx *eventfd, const char *args)
5818{ 5870{
5819 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5820 struct mem_cgroup_eventfd_list *event; 5871 struct mem_cgroup_eventfd_list *event;
5821 enum res_type type = MEMFILE_TYPE(cft->private);
5822 5872
5823 BUG_ON(type != _OOM_TYPE);
5824 event = kmalloc(sizeof(*event), GFP_KERNEL); 5873 event = kmalloc(sizeof(*event), GFP_KERNEL);
5825 if (!event) 5874 if (!event)
5826 return -ENOMEM; 5875 return -ENOMEM;
@@ -5838,14 +5887,10 @@ static int mem_cgroup_oom_register_event(struct cgroup_subsys_state *css,
5838 return 0; 5887 return 0;
5839} 5888}
5840 5889
5841static void mem_cgroup_oom_unregister_event(struct cgroup_subsys_state *css, 5890static void mem_cgroup_oom_unregister_event(struct mem_cgroup *memcg,
5842 struct cftype *cft, struct eventfd_ctx *eventfd) 5891 struct eventfd_ctx *eventfd)
5843{ 5892{
5844 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5845 struct mem_cgroup_eventfd_list *ev, *tmp; 5893 struct mem_cgroup_eventfd_list *ev, *tmp;
5846 enum res_type type = MEMFILE_TYPE(cft->private);
5847
5848 BUG_ON(type != _OOM_TYPE);
5849 5894
5850 spin_lock(&memcg_oom_lock); 5895 spin_lock(&memcg_oom_lock);
5851 5896
@@ -5859,17 +5904,12 @@ static void mem_cgroup_oom_unregister_event(struct cgroup_subsys_state *css,
5859 spin_unlock(&memcg_oom_lock); 5904 spin_unlock(&memcg_oom_lock);
5860} 5905}
5861 5906
5862static int mem_cgroup_oom_control_read(struct cgroup_subsys_state *css, 5907static int mem_cgroup_oom_control_read(struct seq_file *sf, void *v)
5863 struct cftype *cft, struct cgroup_map_cb *cb)
5864{ 5908{
5865 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 5909 struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(sf));
5866 5910
5867 cb->fill(cb, "oom_kill_disable", memcg->oom_kill_disable); 5911 seq_printf(sf, "oom_kill_disable %d\n", memcg->oom_kill_disable);
5868 5912 seq_printf(sf, "under_oom %d\n", (bool)atomic_read(&memcg->under_oom));
5869 if (atomic_read(&memcg->under_oom))
5870 cb->fill(cb, "under_oom", 1);
5871 else
5872 cb->fill(cb, "under_oom", 0);
5873 return 0; 5913 return 0;
5874} 5914}
5875 5915
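The cgroup read_map/cgroup_map_cb interface is dropped: oom_control becomes a plain seq_file show routine that resolves its memcg from the seq_file via seq_css(), and the cftype entry (further down in this diff) wires it up through .seq_show instead of .read_map. A compact restatement of that pattern, mirroring the new code rather than adding to it:

static int oom_control_show_sketch(struct seq_file *sf, void *v)
{
        struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(sf));

        seq_printf(sf, "oom_kill_disable %d\n", memcg->oom_kill_disable);
        seq_printf(sf, "under_oom %d\n", (bool)atomic_read(&memcg->under_oom));
        return 0;
}

static struct cftype oom_control_cft_sketch = {
        .name           = "oom_control",
        .seq_show       = oom_control_show_sketch,
        .write_u64      = mem_cgroup_oom_control_write,
        .private        = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL),
};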
@@ -5962,41 +6002,261 @@ static void kmem_cgroup_css_offline(struct mem_cgroup *memcg)
5962} 6002}
5963#endif 6003#endif
5964 6004
6005/*
6006 * DO NOT USE IN NEW FILES.
6007 *
6008 * "cgroup.event_control" implementation.
6009 *
6010 * This is way over-engineered. It tries to support fully configurable
6011 * events for each user. Such level of flexibility is completely
6012 * unnecessary especially in the light of the planned unified hierarchy.
6013 *
6014 * Please deprecate this and replace with something simpler if at all
6015 * possible.
6016 */
6017
6018/*
6019 * Unregister event and free resources.
6020 *
6021 * Gets called from workqueue.
6022 */
6023static void memcg_event_remove(struct work_struct *work)
6024{
6025 struct mem_cgroup_event *event =
6026 container_of(work, struct mem_cgroup_event, remove);
6027 struct mem_cgroup *memcg = event->memcg;
6028
6029 remove_wait_queue(event->wqh, &event->wait);
6030
6031 event->unregister_event(memcg, event->eventfd);
6032
6033 /* Notify userspace the event is going away. */
6034 eventfd_signal(event->eventfd, 1);
6035
6036 eventfd_ctx_put(event->eventfd);
6037 kfree(event);
6038 css_put(&memcg->css);
6039}
6040
6041/*
6042 * Gets called on POLLHUP on eventfd when user closes it.
6043 *
6044 * Called with wqh->lock held and interrupts disabled.
6045 */
6046static int memcg_event_wake(wait_queue_t *wait, unsigned mode,
6047 int sync, void *key)
6048{
6049 struct mem_cgroup_event *event =
6050 container_of(wait, struct mem_cgroup_event, wait);
6051 struct mem_cgroup *memcg = event->memcg;
6052 unsigned long flags = (unsigned long)key;
6053
6054 if (flags & POLLHUP) {
6055 /*
6056 * If the event has been detached at cgroup removal, we
6057 * can simply return knowing the other side will cleanup
6058 * for us.
6059 *
6060 * We can't race against event freeing since the other
6061 * side will require wqh->lock via remove_wait_queue(),
6062 * which we hold.
6063 */
6064 spin_lock(&memcg->event_list_lock);
6065 if (!list_empty(&event->list)) {
6066 list_del_init(&event->list);
6067 /*
6068 * We are in atomic context, but cgroup_event_remove()
6069 * may sleep, so we have to call it in workqueue.
6070 */
6071 schedule_work(&event->remove);
6072 }
6073 spin_unlock(&memcg->event_list_lock);
6074 }
6075
6076 return 0;
6077}
6078
6079static void memcg_event_ptable_queue_proc(struct file *file,
6080 wait_queue_head_t *wqh, poll_table *pt)
6081{
6082 struct mem_cgroup_event *event =
6083 container_of(pt, struct mem_cgroup_event, pt);
6084
6085 event->wqh = wqh;
6086 add_wait_queue(wqh, &event->wait);
6087}
6088
6089/*
6090 * DO NOT USE IN NEW FILES.
6091 *
6092 * Parse input and register new cgroup event handler.
6093 *
6094 * Input must be in format '<event_fd> <control_fd> <args>'.
6095 * Interpretation of args is defined by control file implementation.
6096 */
6097static int memcg_write_event_control(struct cgroup_subsys_state *css,
6098 struct cftype *cft, const char *buffer)
6099{
6100 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
6101 struct mem_cgroup_event *event;
6102 struct cgroup_subsys_state *cfile_css;
6103 unsigned int efd, cfd;
6104 struct fd efile;
6105 struct fd cfile;
6106 const char *name;
6107 char *endp;
6108 int ret;
6109
6110 efd = simple_strtoul(buffer, &endp, 10);
6111 if (*endp != ' ')
6112 return -EINVAL;
6113 buffer = endp + 1;
6114
6115 cfd = simple_strtoul(buffer, &endp, 10);
6116 if ((*endp != ' ') && (*endp != '\0'))
6117 return -EINVAL;
6118 buffer = endp + 1;
6119
6120 event = kzalloc(sizeof(*event), GFP_KERNEL);
6121 if (!event)
6122 return -ENOMEM;
6123
6124 event->memcg = memcg;
6125 INIT_LIST_HEAD(&event->list);
6126 init_poll_funcptr(&event->pt, memcg_event_ptable_queue_proc);
6127 init_waitqueue_func_entry(&event->wait, memcg_event_wake);
6128 INIT_WORK(&event->remove, memcg_event_remove);
6129
6130 efile = fdget(efd);
6131 if (!efile.file) {
6132 ret = -EBADF;
6133 goto out_kfree;
6134 }
6135
6136 event->eventfd = eventfd_ctx_fileget(efile.file);
6137 if (IS_ERR(event->eventfd)) {
6138 ret = PTR_ERR(event->eventfd);
6139 goto out_put_efile;
6140 }
6141
6142 cfile = fdget(cfd);
6143 if (!cfile.file) {
6144 ret = -EBADF;
6145 goto out_put_eventfd;
6146 }
6147
 6148 /* the process needs read permission on the control file */
6149 /* AV: shouldn't we check that it's been opened for read instead? */
6150 ret = inode_permission(file_inode(cfile.file), MAY_READ);
6151 if (ret < 0)
6152 goto out_put_cfile;
6153
6154 /*
6155 * Determine the event callbacks and set them in @event. This used
6156 * to be done via struct cftype but cgroup core no longer knows
6157 * about these events. The following is crude but the whole thing
6158 * is for compatibility anyway.
6159 *
6160 * DO NOT ADD NEW FILES.
6161 */
6162 name = cfile.file->f_dentry->d_name.name;
6163
6164 if (!strcmp(name, "memory.usage_in_bytes")) {
6165 event->register_event = mem_cgroup_usage_register_event;
6166 event->unregister_event = mem_cgroup_usage_unregister_event;
6167 } else if (!strcmp(name, "memory.oom_control")) {
6168 event->register_event = mem_cgroup_oom_register_event;
6169 event->unregister_event = mem_cgroup_oom_unregister_event;
6170 } else if (!strcmp(name, "memory.pressure_level")) {
6171 event->register_event = vmpressure_register_event;
6172 event->unregister_event = vmpressure_unregister_event;
6173 } else if (!strcmp(name, "memory.memsw.usage_in_bytes")) {
6174 event->register_event = memsw_cgroup_usage_register_event;
6175 event->unregister_event = memsw_cgroup_usage_unregister_event;
6176 } else {
6177 ret = -EINVAL;
6178 goto out_put_cfile;
6179 }
6180
6181 /*
6182 * Verify @cfile should belong to @css. Also, remaining events are
6183 * automatically removed on cgroup destruction but the removal is
6184 * asynchronous, so take an extra ref on @css.
6185 */
6186 rcu_read_lock();
6187
6188 ret = -EINVAL;
6189 cfile_css = css_from_dir(cfile.file->f_dentry->d_parent,
6190 &mem_cgroup_subsys);
6191 if (cfile_css == css && css_tryget(css))
6192 ret = 0;
6193
6194 rcu_read_unlock();
6195 if (ret)
6196 goto out_put_cfile;
6197
6198 ret = event->register_event(memcg, event->eventfd, buffer);
6199 if (ret)
6200 goto out_put_css;
6201
6202 efile.file->f_op->poll(efile.file, &event->pt);
6203
6204 spin_lock(&memcg->event_list_lock);
6205 list_add(&event->list, &memcg->event_list);
6206 spin_unlock(&memcg->event_list_lock);
6207
6208 fdput(cfile);
6209 fdput(efile);
6210
6211 return 0;
6212
6213out_put_css:
6214 css_put(css);
6215out_put_cfile:
6216 fdput(cfile);
6217out_put_eventfd:
6218 eventfd_ctx_put(event->eventfd);
6219out_put_efile:
6220 fdput(efile);
6221out_kfree:
6222 kfree(event);
6223
6224 return ret;
6225}
6226
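memcg_write_event_control() parses "<event_fd> <control_fd> <args>" written to cgroup.event_control. A hypothetical userspace snippet that arms a usage threshold the way this parser expects is shown below; the cgroup path and the 1G threshold are made-up values, and error handling is minimal.

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/eventfd.h>

int main(void)
{
        int efd = eventfd(0, 0);
        int cfd = open("/sys/fs/cgroup/memory/mygrp/memory.usage_in_bytes",
                       O_RDONLY);
        int ecfd = open("/sys/fs/cgroup/memory/mygrp/cgroup.event_control",
                        O_WRONLY);
        char buf[64];
        uint64_t cnt;

        if (efd < 0 || cfd < 0 || ecfd < 0)
                return 1;

        /* "<event_fd> <control_fd> <args>": arm a 1G usage threshold */
        snprintf(buf, sizeof(buf), "%d %d %llu", efd, cfd, 1024ULL << 20);
        if (write(ecfd, buf, strlen(buf)) < 0)
                return 1;

        read(efd, &cnt, sizeof(cnt));   /* blocks until the threshold fires */
        printf("threshold crossed %llu time(s)\n", (unsigned long long)cnt);
        return 0;
}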
5965static struct cftype mem_cgroup_files[] = { 6227static struct cftype mem_cgroup_files[] = {
5966 { 6228 {
5967 .name = "usage_in_bytes", 6229 .name = "usage_in_bytes",
5968 .private = MEMFILE_PRIVATE(_MEM, RES_USAGE), 6230 .private = MEMFILE_PRIVATE(_MEM, RES_USAGE),
5969 .read = mem_cgroup_read, 6231 .read_u64 = mem_cgroup_read_u64,
5970 .register_event = mem_cgroup_usage_register_event,
5971 .unregister_event = mem_cgroup_usage_unregister_event,
5972 }, 6232 },
5973 { 6233 {
5974 .name = "max_usage_in_bytes", 6234 .name = "max_usage_in_bytes",
5975 .private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE), 6235 .private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE),
5976 .trigger = mem_cgroup_reset, 6236 .trigger = mem_cgroup_reset,
5977 .read = mem_cgroup_read, 6237 .read_u64 = mem_cgroup_read_u64,
5978 }, 6238 },
5979 { 6239 {
5980 .name = "limit_in_bytes", 6240 .name = "limit_in_bytes",
5981 .private = MEMFILE_PRIVATE(_MEM, RES_LIMIT), 6241 .private = MEMFILE_PRIVATE(_MEM, RES_LIMIT),
5982 .write_string = mem_cgroup_write, 6242 .write_string = mem_cgroup_write,
5983 .read = mem_cgroup_read, 6243 .read_u64 = mem_cgroup_read_u64,
5984 }, 6244 },
5985 { 6245 {
5986 .name = "soft_limit_in_bytes", 6246 .name = "soft_limit_in_bytes",
5987 .private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT), 6247 .private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT),
5988 .write_string = mem_cgroup_write, 6248 .write_string = mem_cgroup_write,
5989 .read = mem_cgroup_read, 6249 .read_u64 = mem_cgroup_read_u64,
5990 }, 6250 },
5991 { 6251 {
5992 .name = "failcnt", 6252 .name = "failcnt",
5993 .private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT), 6253 .private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT),
5994 .trigger = mem_cgroup_reset, 6254 .trigger = mem_cgroup_reset,
5995 .read = mem_cgroup_read, 6255 .read_u64 = mem_cgroup_read_u64,
5996 }, 6256 },
5997 { 6257 {
5998 .name = "stat", 6258 .name = "stat",
5999 .read_seq_string = memcg_stat_show, 6259 .seq_show = memcg_stat_show,
6000 }, 6260 },
6001 { 6261 {
6002 .name = "force_empty", 6262 .name = "force_empty",
@@ -6009,6 +6269,12 @@ static struct cftype mem_cgroup_files[] = {
6009 .read_u64 = mem_cgroup_hierarchy_read, 6269 .read_u64 = mem_cgroup_hierarchy_read,
6010 }, 6270 },
6011 { 6271 {
6272 .name = "cgroup.event_control", /* XXX: for compat */
6273 .write_string = memcg_write_event_control,
6274 .flags = CFTYPE_NO_PREFIX,
6275 .mode = S_IWUGO,
6276 },
6277 {
6012 .name = "swappiness", 6278 .name = "swappiness",
6013 .read_u64 = mem_cgroup_swappiness_read, 6279 .read_u64 = mem_cgroup_swappiness_read,
6014 .write_u64 = mem_cgroup_swappiness_write, 6280 .write_u64 = mem_cgroup_swappiness_write,
@@ -6020,21 +6286,17 @@ static struct cftype mem_cgroup_files[] = {
6020 }, 6286 },
6021 { 6287 {
6022 .name = "oom_control", 6288 .name = "oom_control",
6023 .read_map = mem_cgroup_oom_control_read, 6289 .seq_show = mem_cgroup_oom_control_read,
6024 .write_u64 = mem_cgroup_oom_control_write, 6290 .write_u64 = mem_cgroup_oom_control_write,
6025 .register_event = mem_cgroup_oom_register_event,
6026 .unregister_event = mem_cgroup_oom_unregister_event,
6027 .private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL), 6291 .private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL),
6028 }, 6292 },
6029 { 6293 {
6030 .name = "pressure_level", 6294 .name = "pressure_level",
6031 .register_event = vmpressure_register_event,
6032 .unregister_event = vmpressure_unregister_event,
6033 }, 6295 },
6034#ifdef CONFIG_NUMA 6296#ifdef CONFIG_NUMA
6035 { 6297 {
6036 .name = "numa_stat", 6298 .name = "numa_stat",
6037 .read_seq_string = memcg_numa_stat_show, 6299 .seq_show = memcg_numa_stat_show,
6038 }, 6300 },
6039#endif 6301#endif
6040#ifdef CONFIG_MEMCG_KMEM 6302#ifdef CONFIG_MEMCG_KMEM
@@ -6042,29 +6304,29 @@ static struct cftype mem_cgroup_files[] = {
6042 .name = "kmem.limit_in_bytes", 6304 .name = "kmem.limit_in_bytes",
6043 .private = MEMFILE_PRIVATE(_KMEM, RES_LIMIT), 6305 .private = MEMFILE_PRIVATE(_KMEM, RES_LIMIT),
6044 .write_string = mem_cgroup_write, 6306 .write_string = mem_cgroup_write,
6045 .read = mem_cgroup_read, 6307 .read_u64 = mem_cgroup_read_u64,
6046 }, 6308 },
6047 { 6309 {
6048 .name = "kmem.usage_in_bytes", 6310 .name = "kmem.usage_in_bytes",
6049 .private = MEMFILE_PRIVATE(_KMEM, RES_USAGE), 6311 .private = MEMFILE_PRIVATE(_KMEM, RES_USAGE),
6050 .read = mem_cgroup_read, 6312 .read_u64 = mem_cgroup_read_u64,
6051 }, 6313 },
6052 { 6314 {
6053 .name = "kmem.failcnt", 6315 .name = "kmem.failcnt",
6054 .private = MEMFILE_PRIVATE(_KMEM, RES_FAILCNT), 6316 .private = MEMFILE_PRIVATE(_KMEM, RES_FAILCNT),
6055 .trigger = mem_cgroup_reset, 6317 .trigger = mem_cgroup_reset,
6056 .read = mem_cgroup_read, 6318 .read_u64 = mem_cgroup_read_u64,
6057 }, 6319 },
6058 { 6320 {
6059 .name = "kmem.max_usage_in_bytes", 6321 .name = "kmem.max_usage_in_bytes",
6060 .private = MEMFILE_PRIVATE(_KMEM, RES_MAX_USAGE), 6322 .private = MEMFILE_PRIVATE(_KMEM, RES_MAX_USAGE),
6061 .trigger = mem_cgroup_reset, 6323 .trigger = mem_cgroup_reset,
6062 .read = mem_cgroup_read, 6324 .read_u64 = mem_cgroup_read_u64,
6063 }, 6325 },
6064#ifdef CONFIG_SLABINFO 6326#ifdef CONFIG_SLABINFO
6065 { 6327 {
6066 .name = "kmem.slabinfo", 6328 .name = "kmem.slabinfo",
6067 .read_seq_string = mem_cgroup_slabinfo_read, 6329 .seq_show = mem_cgroup_slabinfo_read,
6068 }, 6330 },
6069#endif 6331#endif
6070#endif 6332#endif
@@ -6076,27 +6338,25 @@ static struct cftype memsw_cgroup_files[] = {
6076 { 6338 {
6077 .name = "memsw.usage_in_bytes", 6339 .name = "memsw.usage_in_bytes",
6078 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE), 6340 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
6079 .read = mem_cgroup_read, 6341 .read_u64 = mem_cgroup_read_u64,
6080 .register_event = mem_cgroup_usage_register_event,
6081 .unregister_event = mem_cgroup_usage_unregister_event,
6082 }, 6342 },
6083 { 6343 {
6084 .name = "memsw.max_usage_in_bytes", 6344 .name = "memsw.max_usage_in_bytes",
6085 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE), 6345 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE),
6086 .trigger = mem_cgroup_reset, 6346 .trigger = mem_cgroup_reset,
6087 .read = mem_cgroup_read, 6347 .read_u64 = mem_cgroup_read_u64,
6088 }, 6348 },
6089 { 6349 {
6090 .name = "memsw.limit_in_bytes", 6350 .name = "memsw.limit_in_bytes",
6091 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT), 6351 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT),
6092 .write_string = mem_cgroup_write, 6352 .write_string = mem_cgroup_write,
6093 .read = mem_cgroup_read, 6353 .read_u64 = mem_cgroup_read_u64,
6094 }, 6354 },
6095 { 6355 {
6096 .name = "memsw.failcnt", 6356 .name = "memsw.failcnt",
6097 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT), 6357 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT),
6098 .trigger = mem_cgroup_reset, 6358 .trigger = mem_cgroup_reset,
6099 .read = mem_cgroup_read, 6359 .read_u64 = mem_cgroup_read_u64,
6100 }, 6360 },
6101 { }, /* terminate */ 6361 { }, /* terminate */
6102}; 6362};
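Every ".read = mem_cgroup_read" entry above collapses into ".read_u64 = mem_cgroup_read_u64" because the cgroup core now does the formatting itself. A sketch of what such a handler reduces to, mirroring mem_cgroup_read_u64() earlier in this diff:

static u64 example_read_u64(struct cgroup_subsys_state *css,
                            struct cftype *cft)
{
        struct mem_cgroup *memcg = mem_cgroup_from_css(css);

        /* RES_USAGE / RES_LIMIT / ... is selected via cft->private */
        return res_counter_read_u64(&memcg->res, MEMFILE_ATTR(cft->private));
}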
@@ -6139,14 +6399,12 @@ static void free_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
6139static struct mem_cgroup *mem_cgroup_alloc(void) 6399static struct mem_cgroup *mem_cgroup_alloc(void)
6140{ 6400{
6141 struct mem_cgroup *memcg; 6401 struct mem_cgroup *memcg;
6142 size_t size = memcg_size(); 6402 size_t size;
6143 6403
6144 /* Can be very big if nr_node_ids is very big */ 6404 size = sizeof(struct mem_cgroup);
6145 if (size < PAGE_SIZE) 6405 size += nr_node_ids * sizeof(struct mem_cgroup_per_node *);
6146 memcg = kzalloc(size, GFP_KERNEL);
6147 else
6148 memcg = vzalloc(size);
6149 6406
6407 memcg = kzalloc(size, GFP_KERNEL);
6150 if (!memcg) 6408 if (!memcg)
6151 return NULL; 6409 return NULL;
6152 6410
@@ -6157,10 +6415,7 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
6157 return memcg; 6415 return memcg;
6158 6416
6159out_free: 6417out_free:
6160 if (size < PAGE_SIZE) 6418 kfree(memcg);
6161 kfree(memcg);
6162 else
6163 vfree(memcg);
6164 return NULL; 6419 return NULL;
6165} 6420}
6166 6421
@@ -6178,7 +6433,6 @@ out_free:
6178static void __mem_cgroup_free(struct mem_cgroup *memcg) 6433static void __mem_cgroup_free(struct mem_cgroup *memcg)
6179{ 6434{
6180 int node; 6435 int node;
6181 size_t size = memcg_size();
6182 6436
6183 mem_cgroup_remove_from_trees(memcg); 6437 mem_cgroup_remove_from_trees(memcg);
6184 6438
@@ -6199,10 +6453,7 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)
6199 * the cgroup_lock. 6453 * the cgroup_lock.
6200 */ 6454 */
6201 disarm_static_keys(memcg); 6455 disarm_static_keys(memcg);
6202 if (size < PAGE_SIZE) 6456 kfree(memcg);
6203 kfree(memcg);
6204 else
6205 vfree(memcg);
6206} 6457}
6207 6458
6208/* 6459/*
@@ -6268,6 +6519,8 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
6268 mutex_init(&memcg->thresholds_lock); 6519 mutex_init(&memcg->thresholds_lock);
6269 spin_lock_init(&memcg->move_lock); 6520 spin_lock_init(&memcg->move_lock);
6270 vmpressure_init(&memcg->vmpressure); 6521 vmpressure_init(&memcg->vmpressure);
6522 INIT_LIST_HEAD(&memcg->event_list);
6523 spin_lock_init(&memcg->event_list_lock);
6271 6524
6272 return &memcg->css; 6525 return &memcg->css;
6273 6526
@@ -6281,7 +6534,6 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css)
6281{ 6534{
6282 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 6535 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
6283 struct mem_cgroup *parent = mem_cgroup_from_css(css_parent(css)); 6536 struct mem_cgroup *parent = mem_cgroup_from_css(css_parent(css));
6284 int error = 0;
6285 6537
6286 if (css->cgroup->id > MEM_CGROUP_ID_MAX) 6538 if (css->cgroup->id > MEM_CGROUP_ID_MAX)
6287 return -ENOSPC; 6539 return -ENOSPC;
@@ -6316,10 +6568,9 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css)
6316 if (parent != root_mem_cgroup) 6568 if (parent != root_mem_cgroup)
6317 mem_cgroup_subsys.broken_hierarchy = true; 6569 mem_cgroup_subsys.broken_hierarchy = true;
6318 } 6570 }
6319
6320 error = memcg_init_kmem(memcg, &mem_cgroup_subsys);
6321 mutex_unlock(&memcg_create_mutex); 6571 mutex_unlock(&memcg_create_mutex);
6322 return error; 6572
6573 return memcg_init_kmem(memcg, &mem_cgroup_subsys);
6323} 6574}
6324 6575
6325/* 6576/*
@@ -6343,6 +6594,19 @@ static void mem_cgroup_invalidate_reclaim_iterators(struct mem_cgroup *memcg)
6343static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) 6594static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
6344{ 6595{
6345 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 6596 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
6597 struct mem_cgroup_event *event, *tmp;
6598
6599 /*
6600 * Unregister events and notify userspace.
6601 * Notify userspace about cgroup removing only after rmdir of cgroup
6602 * directory to avoid race between userspace and kernelspace.
6603 */
6604 spin_lock(&memcg->event_list_lock);
6605 list_for_each_entry_safe(event, tmp, &memcg->event_list, list) {
6606 list_del_init(&event->list);
6607 schedule_work(&event->remove);
6608 }
6609 spin_unlock(&memcg->event_list_lock);
6346 6610
6347 kmem_cgroup_css_offline(memcg); 6611 kmem_cgroup_css_offline(memcg);
6348 6612
@@ -6615,7 +6879,7 @@ static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
6615 enum mc_target_type ret = MC_TARGET_NONE; 6879 enum mc_target_type ret = MC_TARGET_NONE;
6616 6880
6617 page = pmd_page(pmd); 6881 page = pmd_page(pmd);
6618 VM_BUG_ON(!page || !PageHead(page)); 6882 VM_BUG_ON_PAGE(!page || !PageHead(page), page);
6619 if (!move_anon()) 6883 if (!move_anon())
6620 return ret; 6884 return ret;
6621 pc = lookup_page_cgroup(page); 6885 pc = lookup_page_cgroup(page);
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index fabe55046c1d..4f08a2d61487 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -611,7 +611,7 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn)
611} 611}
612 612
613/* 613/*
614 * Dirty cache page page 614 * Dirty pagecache page
615 * Issues: when the error hit a hole page the error is not properly 615 * Issues: when the error hit a hole page the error is not properly
616 * propagated. 616 * propagated.
617 */ 617 */
@@ -856,14 +856,14 @@ static int page_action(struct page_state *ps, struct page *p,
856 * the pages and send SIGBUS to the processes if the data was dirty. 856 * the pages and send SIGBUS to the processes if the data was dirty.
857 */ 857 */
858static int hwpoison_user_mappings(struct page *p, unsigned long pfn, 858static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
859 int trapno, int flags) 859 int trapno, int flags, struct page **hpagep)
860{ 860{
861 enum ttu_flags ttu = TTU_UNMAP | TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS; 861 enum ttu_flags ttu = TTU_UNMAP | TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS;
862 struct address_space *mapping; 862 struct address_space *mapping;
863 LIST_HEAD(tokill); 863 LIST_HEAD(tokill);
864 int ret; 864 int ret;
865 int kill = 1, forcekill; 865 int kill = 1, forcekill;
866 struct page *hpage = compound_head(p); 866 struct page *hpage = *hpagep;
867 struct page *ppage; 867 struct page *ppage;
868 868
869 if (PageReserved(p) || PageSlab(p)) 869 if (PageReserved(p) || PageSlab(p))
@@ -942,11 +942,14 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
942 * We pinned the head page for hwpoison handling, 942 * We pinned the head page for hwpoison handling,
943 * now we split the thp and we are interested in 943 * now we split the thp and we are interested in
944 * the hwpoisoned raw page, so move the refcount 944 * the hwpoisoned raw page, so move the refcount
945 * to it. 945 * to it. Similarly, page lock is shifted.
946 */ 946 */
947 if (hpage != p) { 947 if (hpage != p) {
948 put_page(hpage); 948 put_page(hpage);
949 get_page(p); 949 get_page(p);
950 lock_page(p);
951 unlock_page(hpage);
952 *hpagep = p;
950 } 953 }
951 /* THP is split, so ppage should be the real poisoned page. */ 954 /* THP is split, so ppage should be the real poisoned page. */
952 ppage = p; 955 ppage = p;
@@ -964,17 +967,11 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
964 if (kill) 967 if (kill)
965 collect_procs(ppage, &tokill); 968 collect_procs(ppage, &tokill);
966 969
967 if (hpage != ppage)
968 lock_page(ppage);
969
970 ret = try_to_unmap(ppage, ttu); 970 ret = try_to_unmap(ppage, ttu);
971 if (ret != SWAP_SUCCESS) 971 if (ret != SWAP_SUCCESS)
972 printk(KERN_ERR "MCE %#lx: failed to unmap page (mapcount=%d)\n", 972 printk(KERN_ERR "MCE %#lx: failed to unmap page (mapcount=%d)\n",
973 pfn, page_mapcount(ppage)); 973 pfn, page_mapcount(ppage));
974 974
975 if (hpage != ppage)
976 unlock_page(ppage);
977
978 /* 975 /*
979 * Now that the dirty bit has been propagated to the 976 * Now that the dirty bit has been propagated to the
980 * struct page and all unmaps done we can decide if 977 * struct page and all unmaps done we can decide if
@@ -1193,8 +1190,12 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
1193 /* 1190 /*
1194 * Now take care of user space mappings. 1191 * Now take care of user space mappings.
1195 * Abort on fail: __delete_from_page_cache() assumes unmapped page. 1192 * Abort on fail: __delete_from_page_cache() assumes unmapped page.
1193 *
1194 * When the raw error page is thp tail page, hpage points to the raw
1195 * page after thp split.
1196 */ 1196 */
1197 if (hwpoison_user_mappings(p, pfn, trapno, flags) != SWAP_SUCCESS) { 1197 if (hwpoison_user_mappings(p, pfn, trapno, flags, &hpage)
1198 != SWAP_SUCCESS) {
1198 printk(KERN_ERR "MCE %#lx: cannot unmap page, give up\n", pfn); 1199 printk(KERN_ERR "MCE %#lx: cannot unmap page, give up\n", pfn);
1199 res = -EBUSY; 1200 res = -EBUSY;
1200 goto out; 1201 goto out;
@@ -1585,7 +1586,13 @@ static int __soft_offline_page(struct page *page, int flags)
1585 ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 1586 ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL,
1586 MIGRATE_SYNC, MR_MEMORY_FAILURE); 1587 MIGRATE_SYNC, MR_MEMORY_FAILURE);
1587 if (ret) { 1588 if (ret) {
1588 putback_lru_pages(&pagelist); 1589 if (!list_empty(&pagelist)) {
1590 list_del(&page->lru);
1591 dec_zone_page_state(page, NR_ISOLATED_ANON +
1592 page_is_file_cache(page));
1593 putback_lru_page(page);
1594 }
1595
1589 pr_info("soft offline: %#lx: migration failed %d, type %lx\n", 1596 pr_info("soft offline: %#lx: migration failed %d, type %lx\n",
1590 pfn, ret, page->flags); 1597 pfn, ret, page->flags);
1591 if (ret > 0) 1598 if (ret > 0)
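The subtle part of the memory-failure changes is the handoff added in hwpoison_user_mappings(): after a THP split, both the extra refcount and the page lock migrate from the head page to the raw poisoned page, and *hpagep is updated so that memory_failure() later unlocks and releases the right page. A comment-annotated restatement of that fragment (a sketch, not new behaviour):

static void thp_split_handoff_sketch(struct page *p, struct page *hpage,
                                     struct page **hpagep)
{
        if (hpage != p) {
                put_page(hpage);        /* drop the pin on the old head page */
                get_page(p);            /* pin the raw poisoned page instead */
                lock_page(p);           /* the page lock shifts the same way */
                unlock_page(hpage);
                *hpagep = p;            /* memory_failure() now operates on p */
        }
}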
diff --git a/mm/memory.c b/mm/memory.c
index 6768ce9e57d2..be6a0c0d4ae0 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -59,6 +59,7 @@
59#include <linux/gfp.h> 59#include <linux/gfp.h>
60#include <linux/migrate.h> 60#include <linux/migrate.h>
61#include <linux/string.h> 61#include <linux/string.h>
62#include <linux/dma-debug.h>
62 63
63#include <asm/io.h> 64#include <asm/io.h>
64#include <asm/pgalloc.h> 65#include <asm/pgalloc.h>
@@ -288,7 +289,7 @@ int __tlb_remove_page(struct mmu_gather *tlb, struct page *page)
288 return 0; 289 return 0;
289 batch = tlb->active; 290 batch = tlb->active;
290 } 291 }
291 VM_BUG_ON(batch->nr > batch->max); 292 VM_BUG_ON_PAGE(batch->nr > batch->max, page);
292 293
293 return batch->max - batch->nr; 294 return batch->max - batch->nr;
294} 295}
@@ -670,7 +671,7 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
670 current->comm, 671 current->comm,
671 (long long)pte_val(pte), (long long)pmd_val(*pmd)); 672 (long long)pte_val(pte), (long long)pmd_val(*pmd));
672 if (page) 673 if (page)
673 dump_page(page); 674 dump_page(page, "bad pte");
674 printk(KERN_ALERT 675 printk(KERN_ALERT
675 "addr:%p vm_flags:%08lx anon_vma:%p mapping:%p index:%lx\n", 676 "addr:%p vm_flags:%08lx anon_vma:%p mapping:%p index:%lx\n",
676 (void *)addr, vma->vm_flags, vma->anon_vma, mapping, index); 677 (void *)addr, vma->vm_flags, vma->anon_vma, mapping, index);
@@ -2559,6 +2560,8 @@ static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd,
2559 2560
2560static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va, struct vm_area_struct *vma) 2561static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va, struct vm_area_struct *vma)
2561{ 2562{
2563 debug_dma_assert_idle(src);
2564
2562 /* 2565 /*
2563 * If the source page was a PFN mapping, we don't have 2566 * If the source page was a PFN mapping, we don't have
2564 * a "struct page" for it. We do a best-effort copy by 2567 * a "struct page" for it. We do a best-effort copy by
@@ -2699,7 +2702,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
2699 goto unwritable_page; 2702 goto unwritable_page;
2700 } 2703 }
2701 } else 2704 } else
2702 VM_BUG_ON(!PageLocked(old_page)); 2705 VM_BUG_ON_PAGE(!PageLocked(old_page), old_page);
2703 2706
2704 /* 2707 /*
2705 * Since we dropped the lock we need to revalidate 2708 * Since we dropped the lock we need to revalidate
@@ -3355,7 +3358,7 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3355 if (unlikely(!(ret & VM_FAULT_LOCKED))) 3358 if (unlikely(!(ret & VM_FAULT_LOCKED)))
3356 lock_page(vmf.page); 3359 lock_page(vmf.page);
3357 else 3360 else
3358 VM_BUG_ON(!PageLocked(vmf.page)); 3361 VM_BUG_ON_PAGE(!PageLocked(vmf.page), vmf.page);
3359 3362
3360 /* 3363 /*
3361 * Should we do an early C-O-W break? 3364 * Should we do an early C-O-W break?
@@ -3392,7 +3395,7 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3392 goto unwritable_page; 3395 goto unwritable_page;
3393 } 3396 }
3394 } else 3397 } else
3395 VM_BUG_ON(!PageLocked(page)); 3398 VM_BUG_ON_PAGE(!PageLocked(page), page);
3396 page_mkwrite = 1; 3399 page_mkwrite = 1;
3397 } 3400 }
3398 } 3401 }
@@ -4272,11 +4275,20 @@ void copy_user_huge_page(struct page *dst, struct page *src,
4272#endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */ 4275#endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */
4273 4276
4274#if USE_SPLIT_PTE_PTLOCKS && ALLOC_SPLIT_PTLOCKS 4277#if USE_SPLIT_PTE_PTLOCKS && ALLOC_SPLIT_PTLOCKS
4278
4279static struct kmem_cache *page_ptl_cachep;
4280
4281void __init ptlock_cache_init(void)
4282{
4283 page_ptl_cachep = kmem_cache_create("page->ptl", sizeof(spinlock_t), 0,
4284 SLAB_PANIC, NULL);
4285}
4286
4275bool ptlock_alloc(struct page *page) 4287bool ptlock_alloc(struct page *page)
4276{ 4288{
4277 spinlock_t *ptl; 4289 spinlock_t *ptl;
4278 4290
4279 ptl = kmalloc(sizeof(spinlock_t), GFP_KERNEL); 4291 ptl = kmem_cache_alloc(page_ptl_cachep, GFP_KERNEL);
4280 if (!ptl) 4292 if (!ptl)
4281 return false; 4293 return false;
4282 page->ptl = ptl; 4294 page->ptl = ptl;
@@ -4285,6 +4297,6 @@ bool ptlock_alloc(struct page *page)
4285 4297
4286void ptlock_free(struct page *page) 4298void ptlock_free(struct page *page)
4287{ 4299{
4288 kfree(page->ptl); 4300 kmem_cache_free(page_ptl_cachep, page->ptl);
4289} 4301}
4290#endif 4302#endif
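Split-PTL spinlocks now come from a dedicated slab cache instead of kmalloc(), which keeps them out of the generic kmalloc-N caches and makes their footprint visible as a named cache (that motivation is an inference, not stated in the diff). Where ptlock_cache_init() gets called from lies outside this hunk; the sketch below just restates the create/alloc pattern.

static struct kmem_cache *ptl_cachep_sketch;

void __init ptlock_cache_init_sketch(void)
{
        /* SLAB_PANIC: there is no sensible recovery this early in boot */
        ptl_cachep_sketch = kmem_cache_create("page->ptl", sizeof(spinlock_t),
                                              0, SLAB_PANIC, NULL);
}

bool ptlock_alloc_sketch(struct page *page)
{
        spinlock_t *ptl = kmem_cache_alloc(ptl_cachep_sketch, GFP_KERNEL);

        if (!ptl)
                return false;
        page->ptl = ptl;
        return true;
}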
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 489f235502db..a650db29606f 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -9,7 +9,6 @@
9#include <linux/swap.h> 9#include <linux/swap.h>
10#include <linux/interrupt.h> 10#include <linux/interrupt.h>
11#include <linux/pagemap.h> 11#include <linux/pagemap.h>
12#include <linux/bootmem.h>
13#include <linux/compiler.h> 12#include <linux/compiler.h>
14#include <linux/export.h> 13#include <linux/export.h>
15#include <linux/pagevec.h> 14#include <linux/pagevec.h>
@@ -269,7 +268,7 @@ static void fix_zone_id(struct zone *zone, unsigned long start_pfn,
269} 268}
270 269
271/* Can fail with -ENOMEM from allocating a wait table with vmalloc() or 270/* Can fail with -ENOMEM from allocating a wait table with vmalloc() or
272 * alloc_bootmem_node_nopanic() */ 271 * alloc_bootmem_node_nopanic()/memblock_virt_alloc_node_nopanic() */
273static int __ref ensure_zone_is_initialized(struct zone *zone, 272static int __ref ensure_zone_is_initialized(struct zone *zone,
274 unsigned long start_pfn, unsigned long num_pages) 273 unsigned long start_pfn, unsigned long num_pages)
275{ 274{
@@ -1108,17 +1107,18 @@ int __ref add_memory(int nid, u64 start, u64 size)
1108 if (ret) 1107 if (ret)
1109 return ret; 1108 return ret;
1110 1109
1111 lock_memory_hotplug();
1112
1113 res = register_memory_resource(start, size); 1110 res = register_memory_resource(start, size);
1114 ret = -EEXIST; 1111 ret = -EEXIST;
1115 if (!res) 1112 if (!res)
1116 goto out; 1113 return ret;
1117 1114
1118 { /* Stupid hack to suppress address-never-null warning */ 1115 { /* Stupid hack to suppress address-never-null warning */
1119 void *p = NODE_DATA(nid); 1116 void *p = NODE_DATA(nid);
1120 new_pgdat = !p; 1117 new_pgdat = !p;
1121 } 1118 }
1119
1120 lock_memory_hotplug();
1121
1122 new_node = !node_online(nid); 1122 new_node = !node_online(nid);
1123 if (new_node) { 1123 if (new_node) {
1124 pgdat = hotadd_new_pgdat(nid, start); 1124 pgdat = hotadd_new_pgdat(nid, start);
@@ -1310,7 +1310,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
1310#ifdef CONFIG_DEBUG_VM 1310#ifdef CONFIG_DEBUG_VM
1311 printk(KERN_ALERT "removing pfn %lx from LRU failed\n", 1311 printk(KERN_ALERT "removing pfn %lx from LRU failed\n",
1312 pfn); 1312 pfn);
1313 dump_page(page); 1313 dump_page(page, "failed to remove from LRU");
1314#endif 1314#endif
1315 put_page(page); 1315 put_page(page);
1316 /* Because we don't have big zone->lock. we should 1316 /* Because we don't have big zone->lock. we should
@@ -1446,6 +1446,7 @@ static int __init cmdline_parse_movable_node(char *p)
1446 * the kernel away from hotpluggable memory. 1446 * the kernel away from hotpluggable memory.
1447 */ 1447 */
1448 memblock_set_bottom_up(true); 1448 memblock_set_bottom_up(true);
1449 movable_node_enabled = true;
1449#else 1450#else
1450 pr_warn("movable_node option not supported\n"); 1451 pr_warn("movable_node option not supported\n");
1451#endif 1452#endif
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 0cd2c4d4e270..ae3c8f3595d4 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -613,7 +613,7 @@ static inline int queue_pages_pgd_range(struct vm_area_struct *vma,
613 return 0; 613 return 0;
614} 614}
615 615
616#ifdef CONFIG_ARCH_USES_NUMA_PROT_NONE 616#ifdef CONFIG_NUMA_BALANCING
617/* 617/*
618 * This is used to mark a range of virtual addresses to be inaccessible. 618 * This is used to mark a range of virtual addresses to be inaccessible.
619 * These are later cleared by a NUMA hinting fault. Depending on these 619 * These are later cleared by a NUMA hinting fault. Depending on these
@@ -627,7 +627,6 @@ unsigned long change_prot_numa(struct vm_area_struct *vma,
627 unsigned long addr, unsigned long end) 627 unsigned long addr, unsigned long end)
628{ 628{
629 int nr_updated; 629 int nr_updated;
630 BUILD_BUG_ON(_PAGE_NUMA != _PAGE_PROTNONE);
631 630
632 nr_updated = change_protection(vma, addr, end, vma->vm_page_prot, 0, 1); 631 nr_updated = change_protection(vma, addr, end, vma->vm_page_prot, 0, 1);
633 if (nr_updated) 632 if (nr_updated)
@@ -641,7 +640,7 @@ static unsigned long change_prot_numa(struct vm_area_struct *vma,
641{ 640{
642 return 0; 641 return 0;
643} 642}
644#endif /* CONFIG_ARCH_USES_NUMA_PROT_NONE */ 643#endif /* CONFIG_NUMA_BALANCING */
645 644
646/* 645/*
647 * Walk through page tables and collect pages to be migrated. 646 * Walk through page tables and collect pages to be migrated.
@@ -1199,10 +1198,8 @@ static struct page *new_vma_page(struct page *page, unsigned long private, int *
1199 } 1198 }
1200 1199
1201 if (PageHuge(page)) { 1200 if (PageHuge(page)) {
1202 if (vma) 1201 BUG_ON(!vma);
1203 return alloc_huge_page_noerr(vma, address, 1); 1202 return alloc_huge_page_noerr(vma, address, 1);
1204 else
1205 return NULL;
1206 } 1203 }
1207 /* 1204 /*
1208 * if !vma, alloc_page_vma() will use task or system default policy 1205 * if !vma, alloc_page_vma() will use task or system default policy
@@ -2657,7 +2654,7 @@ void mpol_free_shared_policy(struct shared_policy *p)
2657} 2654}
2658 2655
2659#ifdef CONFIG_NUMA_BALANCING 2656#ifdef CONFIG_NUMA_BALANCING
2660static bool __initdata numabalancing_override; 2657static int __initdata numabalancing_override;
2661 2658
2662static void __init check_numabalancing_enable(void) 2659static void __init check_numabalancing_enable(void)
2663{ 2660{
@@ -2666,9 +2663,15 @@ static void __init check_numabalancing_enable(void)
2666 if (IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED)) 2663 if (IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED))
2667 numabalancing_default = true; 2664 numabalancing_default = true;
2668 2665
2666 /* Parsed by setup_numabalancing. override == 1 enables, -1 disables */
2667 if (numabalancing_override)
2668 set_numabalancing_state(numabalancing_override == 1);
2669
2669 if (nr_node_ids > 1 && !numabalancing_override) { 2670 if (nr_node_ids > 1 && !numabalancing_override) {
2670 printk(KERN_INFO "Enabling automatic NUMA balancing. " 2671 pr_info("%s automatic NUMA balancing. "
2671 "Configure with numa_balancing= or sysctl"); 2672 "Configure with numa_balancing= or the "
2673 "kernel.numa_balancing sysctl",
2674 numabalancing_default ? "Enabling" : "Disabling");
2672 set_numabalancing_state(numabalancing_default); 2675 set_numabalancing_state(numabalancing_default);
2673 } 2676 }
2674} 2677}
@@ -2678,18 +2681,17 @@ static int __init setup_numabalancing(char *str)
2678 int ret = 0; 2681 int ret = 0;
2679 if (!str) 2682 if (!str)
2680 goto out; 2683 goto out;
2681 numabalancing_override = true;
2682 2684
2683 if (!strcmp(str, "enable")) { 2685 if (!strcmp(str, "enable")) {
2684 set_numabalancing_state(true); 2686 numabalancing_override = 1;
2685 ret = 1; 2687 ret = 1;
2686 } else if (!strcmp(str, "disable")) { 2688 } else if (!strcmp(str, "disable")) {
2687 set_numabalancing_state(false); 2689 numabalancing_override = -1;
2688 ret = 1; 2690 ret = 1;
2689 } 2691 }
2690out: 2692out:
2691 if (!ret) 2693 if (!ret)
2692 printk(KERN_WARNING "Unable to parse numa_balancing=\n"); 2694 pr_warn("Unable to parse numa_balancing=\n");
2693 2695
2694 return ret; 2696 return ret;
2695} 2697}
@@ -2928,7 +2930,7 @@ void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
2928 unsigned short mode = MPOL_DEFAULT; 2930 unsigned short mode = MPOL_DEFAULT;
2929 unsigned short flags = 0; 2931 unsigned short flags = 0;
2930 2932
2931 if (pol && pol != &default_policy) { 2933 if (pol && pol != &default_policy && !(pol->flags & MPOL_F_MORON)) {
2932 mode = pol->mode; 2934 mode = pol->mode;
2933 flags = pol->flags; 2935 flags = pol->flags;
2934 } 2936 }
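The mempolicy.c change turns numabalancing_override from a bool into a tri-state so the command-line parser only records the request and check_numabalancing_enable() applies it later, once it can also report whether the Kconfig default enables or disables balancing. A small sketch of the resulting decision, with the values taken from the comment added above (1 = enable, -1 = disable, 0 = unset):

static void __init numabalancing_decide_sketch(void)
{
        bool default_on = IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED);

        if (numabalancing_override)     /* explicit numa_balancing= option */
                set_numabalancing_state(numabalancing_override == 1);
        else if (nr_node_ids > 1)       /* otherwise fall back to the default */
                set_numabalancing_state(default_on);
}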
diff --git a/mm/migrate.c b/mm/migrate.c
index 9194375b2307..482a33d89134 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -72,28 +72,12 @@ int migrate_prep_local(void)
72} 72}
73 73
74/* 74/*
75 * Add isolated pages on the list back to the LRU under page lock
76 * to avoid leaking evictable pages back onto unevictable list.
77 */
78void putback_lru_pages(struct list_head *l)
79{
80 struct page *page;
81 struct page *page2;
82
83 list_for_each_entry_safe(page, page2, l, lru) {
84 list_del(&page->lru);
85 dec_zone_page_state(page, NR_ISOLATED_ANON +
86 page_is_file_cache(page));
87 putback_lru_page(page);
88 }
89}
90
91/*
92 * Put previously isolated pages back onto the appropriate lists 75 * Put previously isolated pages back onto the appropriate lists
93 * from where they were once taken off for compaction/migration. 76 * from where they were once taken off for compaction/migration.
94 * 77 *
95 * This function shall be used instead of putback_lru_pages(), 78 * This function shall be used whenever the isolated pageset has been
96 * whenever the isolated pageset has been built by isolate_migratepages_range() 79 * built from lru, balloon, hugetlbfs page. See isolate_migratepages_range()
80 * and isolate_huge_page().
97 */ 81 */
98void putback_movable_pages(struct list_head *l) 82void putback_movable_pages(struct list_head *l)
99{ 83{
@@ -199,7 +183,12 @@ out:
199 */ 183 */
200static void remove_migration_ptes(struct page *old, struct page *new) 184static void remove_migration_ptes(struct page *old, struct page *new)
201{ 185{
202 rmap_walk(new, remove_migration_pte, old); 186 struct rmap_walk_control rwc = {
187 .rmap_one = remove_migration_pte,
188 .arg = old,
189 };
190
191 rmap_walk(new, &rwc);
203} 192}
204 193
205/* 194/*
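The remove_migration_ptes() change above is part of the rmap_walk() rework: instead of passing a callback and an extra argument separately, callers fill in a struct rmap_walk_control and hand that to the walker. A small self-contained sketch of that control-struct pattern in plain C; the names here are hypothetical, not the kernel's rmap API:

#include <stdio.h>

/* Bundle the per-item callback and its private argument, so the walker's
 * signature stays stable as more fields (filters, locking hooks) are added. */
struct walk_control {
        int (*visit_one)(int item, void *arg);
        void *arg;
};

static int print_with_prefix(int item, void *arg)
{
        printf("%s%d\n", (const char *)arg, item);
        return 0;                       /* non-zero would stop the walk */
}

static void walk(const int *items, int n, const struct walk_control *wc)
{
        for (int i = 0; i < n; i++)
                if (wc->visit_one(items[i], wc->arg))
                        break;
}

int main(void)
{
        int items[] = { 1, 2, 3 };
        struct walk_control wc = {
                .visit_one = print_with_prefix,
                .arg = "item ",
        };

        walk(items, 3, &wc);
        return 0;
}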
@@ -510,7 +499,7 @@ void migrate_page_copy(struct page *newpage, struct page *page)
510 if (PageUptodate(page)) 499 if (PageUptodate(page))
511 SetPageUptodate(newpage); 500 SetPageUptodate(newpage);
512 if (TestClearPageActive(page)) { 501 if (TestClearPageActive(page)) {
513 VM_BUG_ON(PageUnevictable(page)); 502 VM_BUG_ON_PAGE(PageUnevictable(page), page);
514 SetPageActive(newpage); 503 SetPageActive(newpage);
515 } else if (TestClearPageUnevictable(page)) 504 } else if (TestClearPageUnevictable(page))
516 SetPageUnevictable(newpage); 505 SetPageUnevictable(newpage);
@@ -563,14 +552,6 @@ void migrate_page_copy(struct page *newpage, struct page *page)
563 * Migration functions 552 * Migration functions
564 ***********************************************************/ 553 ***********************************************************/
565 554
566/* Always fail migration. Used for mappings that are not movable */
567int fail_migrate_page(struct address_space *mapping,
568 struct page *newpage, struct page *page)
569{
570 return -EIO;
571}
572EXPORT_SYMBOL(fail_migrate_page);
573
574/* 555/*
575 * Common logic to directly migrate a single page suitable for 556 * Common logic to directly migrate a single page suitable for
576 * pages that do not use PagePrivate/PagePrivate2. 557 * pages that do not use PagePrivate/PagePrivate2.
@@ -890,7 +871,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
890 * free the metadata, so the page can be freed. 871 * free the metadata, so the page can be freed.
891 */ 872 */
892 if (!page->mapping) { 873 if (!page->mapping) {
893 VM_BUG_ON(PageAnon(page)); 874 VM_BUG_ON_PAGE(PageAnon(page), page);
894 if (page_has_private(page)) { 875 if (page_has_private(page)) {
895 try_to_free_buffers(page); 876 try_to_free_buffers(page);
896 goto uncharge; 877 goto uncharge;
@@ -1008,7 +989,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
1008{ 989{
1009 int rc = 0; 990 int rc = 0;
1010 int *result = NULL; 991 int *result = NULL;
1011 struct page *new_hpage = get_new_page(hpage, private, &result); 992 struct page *new_hpage;
1012 struct anon_vma *anon_vma = NULL; 993 struct anon_vma *anon_vma = NULL;
1013 994
1014 /* 995 /*
@@ -1018,9 +999,12 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
1018 * tables or check whether the hugepage is pmd-based or not before 999 * tables or check whether the hugepage is pmd-based or not before
1019 * kicking migration. 1000 * kicking migration.
1020 */ 1001 */
1021 if (!hugepage_migration_support(page_hstate(hpage))) 1002 if (!hugepage_migration_support(page_hstate(hpage))) {
1003 putback_active_hugepage(hpage);
1022 return -ENOSYS; 1004 return -ENOSYS;
1005 }
1023 1006
1007 new_hpage = get_new_page(hpage, private, &result);
1024 if (!new_hpage) 1008 if (!new_hpage)
1025 return -ENOMEM; 1009 return -ENOMEM;
1026 1010
@@ -1120,7 +1104,12 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page,
1120 nr_succeeded++; 1104 nr_succeeded++;
1121 break; 1105 break;
1122 default: 1106 default:
1123 /* Permanent failure */ 1107 /*
1108 * Permanent failure (-EBUSY, -ENOSYS, etc.):
1109 * unlike the -EAGAIN case, the failed page is
1110 * removed from the migration page list and not
1111 * retried in the next outer loop.
1112 */
1124 nr_failed++; 1113 nr_failed++;
1125 break; 1114 break;
1126 } 1115 }
@@ -1559,8 +1548,6 @@ static struct page *alloc_misplaced_dst_page(struct page *page,
1559 __GFP_NOMEMALLOC | __GFP_NORETRY | 1548 __GFP_NOMEMALLOC | __GFP_NORETRY |
1560 __GFP_NOWARN) & 1549 __GFP_NOWARN) &
1561 ~GFP_IOFS, 0); 1550 ~GFP_IOFS, 0);
1562 if (newpage)
1563 page_cpupid_xchg_last(newpage, page_cpupid_last(page));
1564 1551
1565 return newpage; 1552 return newpage;
1566} 1553}
@@ -1594,35 +1581,42 @@ bool migrate_ratelimited(int node)
1594} 1581}
1595 1582
1596/* Returns true if the node is migrate rate-limited after the update */ 1583/* Returns true if the node is migrate rate-limited after the update */
1597bool numamigrate_update_ratelimit(pg_data_t *pgdat, unsigned long nr_pages) 1584static bool numamigrate_update_ratelimit(pg_data_t *pgdat,
1585 unsigned long nr_pages)
1598{ 1586{
1599 bool rate_limited = false;
1600
1601 /* 1587 /*
1602 * Rate-limit the amount of data that is being migrated to a node. 1588 * Rate-limit the amount of data that is being migrated to a node.
1603 * Optimal placement is no good if the memory bus is saturated and 1589 * Optimal placement is no good if the memory bus is saturated and
1604 * all the time is being spent migrating! 1590 * all the time is being spent migrating!
1605 */ 1591 */
1606 spin_lock(&pgdat->numabalancing_migrate_lock);
1607 if (time_after(jiffies, pgdat->numabalancing_migrate_next_window)) { 1592 if (time_after(jiffies, pgdat->numabalancing_migrate_next_window)) {
1593 spin_lock(&pgdat->numabalancing_migrate_lock);
1608 pgdat->numabalancing_migrate_nr_pages = 0; 1594 pgdat->numabalancing_migrate_nr_pages = 0;
1609 pgdat->numabalancing_migrate_next_window = jiffies + 1595 pgdat->numabalancing_migrate_next_window = jiffies +
1610 msecs_to_jiffies(migrate_interval_millisecs); 1596 msecs_to_jiffies(migrate_interval_millisecs);
1597 spin_unlock(&pgdat->numabalancing_migrate_lock);
1611 } 1598 }
1612 if (pgdat->numabalancing_migrate_nr_pages > ratelimit_pages) 1599 if (pgdat->numabalancing_migrate_nr_pages > ratelimit_pages) {
1613 rate_limited = true; 1600 trace_mm_numa_migrate_ratelimit(current, pgdat->node_id,
1614 else 1601 nr_pages);
1615 pgdat->numabalancing_migrate_nr_pages += nr_pages; 1602 return true;
1616 spin_unlock(&pgdat->numabalancing_migrate_lock); 1603 }
1617 1604
1618 return rate_limited; 1605 /*
1606 * This is an unlocked non-atomic update so errors are possible.
1607 * The consequences are failing to migrate when we potentially should
1608 * have, which is not severe enough to warrant locking. If it is ever
1609 * a problem, it can be converted to a per-cpu counter.
1610 */
1611 pgdat->numabalancing_migrate_nr_pages += nr_pages;
1612 return false;
1619} 1613}
1620 1614
1621int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page) 1615static int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page)
1622{ 1616{
1623 int page_lru; 1617 int page_lru;
1624 1618
1625 VM_BUG_ON(compound_order(page) && !PageTransHuge(page)); 1619 VM_BUG_ON_PAGE(compound_order(page) && !PageTransHuge(page), page);
1626 1620
1627 /* Avoid migrating to a node that is nearly full */ 1621 /* Avoid migrating to a node that is nearly full */
1628 if (!migrate_balanced_pgdat(pgdat, 1UL << compound_order(page))) 1622 if (!migrate_balanced_pgdat(pgdat, 1UL << compound_order(page)))
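The rewritten numamigrate_update_ratelimit() above only takes the spinlock to reset the window and tolerates an unlocked counter update afterwards. A rough userspace sketch of that windowed rate limit, with a one-second window and a made-up page budget standing in for migrate_interval_millisecs and ratelimit_pages:

#include <stdbool.h>
#include <stdio.h>
#include <time.h>

#define WINDOW_SECS   1
#define BUDGET_PAGES  128

static time_t window_end;
static unsigned long window_pages;

static bool rate_limited(unsigned long nr_pages)
{
        time_t now = time(NULL);

        if (now >= window_end) {        /* window expired: reset the counter */
                window_end = now + WINDOW_SECS;
                window_pages = 0;
        }
        if (window_pages > BUDGET_PAGES)
                return true;            /* over budget: skip this batch */

        window_pages += nr_pages;       /* deliberately unlocked, as in the hunk above */
        return false;
}

int main(void)
{
        for (int i = 0; i < 5; i++)
                printf("batch %d rate limited: %d\n", i, rate_limited(64));
        return 0;
}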
@@ -1705,7 +1699,12 @@ int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma,
1705 nr_remaining = migrate_pages(&migratepages, alloc_misplaced_dst_page, 1699 nr_remaining = migrate_pages(&migratepages, alloc_misplaced_dst_page,
1706 node, MIGRATE_ASYNC, MR_NUMA_MISPLACED); 1700 node, MIGRATE_ASYNC, MR_NUMA_MISPLACED);
1707 if (nr_remaining) { 1701 if (nr_remaining) {
1708 putback_lru_pages(&migratepages); 1702 if (!list_empty(&migratepages)) {
1703 list_del(&page->lru);
1704 dec_zone_page_state(page, NR_ISOLATED_ANON +
1705 page_is_file_cache(page));
1706 putback_lru_page(page);
1707 }
1709 isolated = 0; 1708 isolated = 0;
1710 } else 1709 } else
1711 count_vm_numa_event(NUMA_PAGE_MIGRATE); 1710 count_vm_numa_event(NUMA_PAGE_MIGRATE);
@@ -1752,8 +1751,6 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
1752 if (!new_page) 1751 if (!new_page)
1753 goto out_fail; 1752 goto out_fail;
1754 1753
1755 page_cpupid_xchg_last(new_page, page_cpupid_last(page));
1756
1757 isolated = numamigrate_isolate_page(pgdat, page); 1754 isolated = numamigrate_isolate_page(pgdat, page);
1758 if (!isolated) { 1755 if (!isolated) {
1759 put_page(new_page); 1756 put_page(new_page);
diff --git a/mm/mincore.c b/mm/mincore.c
index da2be56a7b8f..101623378fbf 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -225,13 +225,6 @@ static long do_mincore(unsigned long addr, unsigned long pages, unsigned char *v
225 225
226 end = min(vma->vm_end, addr + (pages << PAGE_SHIFT)); 226 end = min(vma->vm_end, addr + (pages << PAGE_SHIFT));
227 227
228 if (is_vm_hugetlb_page(vma)) {
229 mincore_hugetlb_page_range(vma, addr, end, vec);
230 return (end - addr) >> PAGE_SHIFT;
231 }
232
233 end = pmd_addr_end(addr, end);
234
235 if (is_vm_hugetlb_page(vma)) 228 if (is_vm_hugetlb_page(vma))
236 mincore_hugetlb_page_range(vma, addr, end, vec); 229 mincore_hugetlb_page_range(vma, addr, end, vec);
237 else 230 else
diff --git a/mm/mlock.c b/mm/mlock.c
index 192e6eebe4f2..4e1a68162285 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -91,6 +91,26 @@ void mlock_vma_page(struct page *page)
91} 91}
92 92
93/* 93/*
94 * Isolate a page from LRU with optional get_page() pin.
95 * Assumes lru_lock already held and page already pinned.
96 */
97static bool __munlock_isolate_lru_page(struct page *page, bool getpage)
98{
99 if (PageLRU(page)) {
100 struct lruvec *lruvec;
101
102 lruvec = mem_cgroup_page_lruvec(page, page_zone(page));
103 if (getpage)
104 get_page(page);
105 ClearPageLRU(page);
106 del_page_from_lru_list(page, lruvec, page_lru(page));
107 return true;
108 }
109
110 return false;
111}
112
113/*
94 * Finish munlock after successful page isolation 114 * Finish munlock after successful page isolation
95 * 115 *
96 * Page must be locked. This is a wrapper for try_to_munlock() 116 * Page must be locked. This is a wrapper for try_to_munlock()
@@ -126,9 +146,9 @@ static void __munlock_isolated_page(struct page *page)
126static void __munlock_isolation_failed(struct page *page) 146static void __munlock_isolation_failed(struct page *page)
127{ 147{
128 if (PageUnevictable(page)) 148 if (PageUnevictable(page))
129 count_vm_event(UNEVICTABLE_PGSTRANDED); 149 __count_vm_event(UNEVICTABLE_PGSTRANDED);
130 else 150 else
131 count_vm_event(UNEVICTABLE_PGMUNLOCKED); 151 __count_vm_event(UNEVICTABLE_PGMUNLOCKED);
132} 152}
133 153
134/** 154/**
@@ -152,28 +172,34 @@ static void __munlock_isolation_failed(struct page *page)
152unsigned int munlock_vma_page(struct page *page) 172unsigned int munlock_vma_page(struct page *page)
153{ 173{
154 unsigned int nr_pages; 174 unsigned int nr_pages;
175 struct zone *zone = page_zone(page);
155 176
156 BUG_ON(!PageLocked(page)); 177 BUG_ON(!PageLocked(page));
157 178
158 if (TestClearPageMlocked(page)) {
159 nr_pages = hpage_nr_pages(page);
160 mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages);
161 if (!isolate_lru_page(page))
162 __munlock_isolated_page(page);
163 else
164 __munlock_isolation_failed(page);
165 } else {
166 nr_pages = hpage_nr_pages(page);
167 }
168
169 /* 179 /*
170 * Regardless of the original PageMlocked flag, we determine nr_pages 180 * Serialize with any parallel __split_huge_page_refcount() which
171 * after touching the flag. This leaves a possible race with a THP page 181 * might otherwise copy PageMlocked to part of the tail pages before
172 * split, such that a whole THP page was munlocked, but nr_pages == 1. 182 * we clear it in the head page. It also stabilizes hpage_nr_pages().
173 * Returning a smaller mask due to that is OK, the worst that can
174 * happen is subsequent useless scanning of the former tail pages.
175 * The NR_MLOCK accounting can however become broken.
176 */ 183 */
184 spin_lock_irq(&zone->lru_lock);
185
186 nr_pages = hpage_nr_pages(page);
187 if (!TestClearPageMlocked(page))
188 goto unlock_out;
189
190 __mod_zone_page_state(zone, NR_MLOCK, -nr_pages);
191
192 if (__munlock_isolate_lru_page(page, true)) {
193 spin_unlock_irq(&zone->lru_lock);
194 __munlock_isolated_page(page);
195 goto out;
196 }
197 __munlock_isolation_failed(page);
198
199unlock_out:
200 spin_unlock_irq(&zone->lru_lock);
201
202out:
177 return nr_pages - 1; 203 return nr_pages - 1;
178} 204}
179 205
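In the munlock_vma_page() rewrite above, zone->lru_lock is taken before TestClearPageMlocked() so that clearing the flag and reading hpage_nr_pages() happen under the same lock and cannot race with a THP split. A minimal pthread sketch of that test-and-clear-under-lock ordering; the flag and page count are stand-ins:

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static bool mlocked = true;
static int nr_pages = 512;              /* could change under concurrent updaters */

static int clear_and_count(void)
{
        int ret = 0;

        pthread_mutex_lock(&lock);
        if (mlocked) {
                mlocked = false;
                ret = nr_pages;         /* read while still holding the lock */
        }
        pthread_mutex_unlock(&lock);
        return ret;
}

int main(void)
{
        int first = clear_and_count();
        int second = clear_and_count();

        printf("first: %d, second: %d\n", first, second);
        return 0;
}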
@@ -253,8 +279,8 @@ static int __mlock_posix_error_return(long retval)
253static bool __putback_lru_fast_prepare(struct page *page, struct pagevec *pvec, 279static bool __putback_lru_fast_prepare(struct page *page, struct pagevec *pvec,
254 int *pgrescued) 280 int *pgrescued)
255{ 281{
256 VM_BUG_ON(PageLRU(page)); 282 VM_BUG_ON_PAGE(PageLRU(page), page);
257 VM_BUG_ON(!PageLocked(page)); 283 VM_BUG_ON_PAGE(!PageLocked(page), page);
258 284
259 if (page_mapcount(page) <= 1 && page_evictable(page)) { 285 if (page_mapcount(page) <= 1 && page_evictable(page)) {
260 pagevec_add(pvec, page); 286 pagevec_add(pvec, page);
@@ -310,34 +336,24 @@ static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone)
310 struct page *page = pvec->pages[i]; 336 struct page *page = pvec->pages[i];
311 337
312 if (TestClearPageMlocked(page)) { 338 if (TestClearPageMlocked(page)) {
313 struct lruvec *lruvec;
314 int lru;
315
316 if (PageLRU(page)) {
317 lruvec = mem_cgroup_page_lruvec(page, zone);
318 lru = page_lru(page);
319 /*
320 * We already have pin from follow_page_mask()
321 * so we can spare the get_page() here.
322 */
323 ClearPageLRU(page);
324 del_page_from_lru_list(page, lruvec, lru);
325 } else {
326 __munlock_isolation_failed(page);
327 goto skip_munlock;
328 }
329
330 } else {
331skip_munlock:
332 /* 339 /*
333 * We won't be munlocking this page in the next phase 340 * We already have pin from follow_page_mask()
334 * but we still need to release the follow_page_mask() 341 * so we can spare the get_page() here.
335 * pin. We cannot do it under lru_lock however. If it's
336 * the last pin, __page_cache_release would deadlock.
337 */ 342 */
338 pagevec_add(&pvec_putback, pvec->pages[i]); 343 if (__munlock_isolate_lru_page(page, false))
339 pvec->pages[i] = NULL; 344 continue;
345 else
346 __munlock_isolation_failed(page);
340 } 347 }
348
349 /*
350 * We won't be munlocking this page in the next phase
351 * but we still need to release the follow_page_mask()
352 * pin. We cannot do it under lru_lock however. If it's
353 * the last pin, __page_cache_release() would deadlock.
354 */
355 pagevec_add(&pvec_putback, pvec->pages[i]);
356 pvec->pages[i] = NULL;
341 } 357 }
342 delta_munlocked = -nr + pagevec_count(&pvec_putback); 358 delta_munlocked = -nr + pagevec_count(&pvec_putback);
343 __mod_zone_page_state(zone, NR_MLOCK, delta_munlocked); 359 __mod_zone_page_state(zone, NR_MLOCK, delta_munlocked);
@@ -709,19 +725,21 @@ SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len)
709 725
710 lru_add_drain_all(); /* flush pagevec */ 726 lru_add_drain_all(); /* flush pagevec */
711 727
712 down_write(&current->mm->mmap_sem);
713 len = PAGE_ALIGN(len + (start & ~PAGE_MASK)); 728 len = PAGE_ALIGN(len + (start & ~PAGE_MASK));
714 start &= PAGE_MASK; 729 start &= PAGE_MASK;
715 730
716 locked = len >> PAGE_SHIFT;
717 locked += current->mm->locked_vm;
718
719 lock_limit = rlimit(RLIMIT_MEMLOCK); 731 lock_limit = rlimit(RLIMIT_MEMLOCK);
720 lock_limit >>= PAGE_SHIFT; 732 lock_limit >>= PAGE_SHIFT;
733 locked = len >> PAGE_SHIFT;
734
735 down_write(&current->mm->mmap_sem);
736
737 locked += current->mm->locked_vm;
721 738
722 /* check against resource limits */ 739 /* check against resource limits */
723 if ((locked <= lock_limit) || capable(CAP_IPC_LOCK)) 740 if ((locked <= lock_limit) || capable(CAP_IPC_LOCK))
724 error = do_mlock(start, len, 1); 741 error = do_mlock(start, len, 1);
742
725 up_write(&current->mm->mmap_sem); 743 up_write(&current->mm->mmap_sem);
726 if (!error) 744 if (!error)
727 error = __mm_populate(start, len, 0); 745 error = __mm_populate(start, len, 0);
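The sys_mlock() hunk above computes the page-aligned length and the RLIMIT_MEMLOCK budget before taking mmap_sem, shrinking the critical section. A userspace sketch of the same alignment and limit arithmetic (the real check also adds mm->locked_vm and honours CAP_IPC_LOCK):

#include <stdio.h>
#include <sys/resource.h>
#include <unistd.h>

int main(void)
{
        unsigned long page = (unsigned long)sysconf(_SC_PAGESIZE);
        unsigned long mask = page - 1;
        unsigned long start = 0x1234, len = 3 * page;
        struct rlimit rl;

        /* PAGE_ALIGN(len + (start & ~PAGE_MASK)); start &= PAGE_MASK; */
        len = (len + (start & mask) + mask) & ~mask;
        start &= ~mask;

        if (getrlimit(RLIMIT_MEMLOCK, &rl) != 0)
                return 1;

        unsigned long lock_limit = rl.rlim_cur / page;
        unsigned long locked = len / page;

        printf("request: %lu pages, limit: %lu pages -> %s\n", locked, lock_limit,
               locked <= lock_limit ? "ok" : "would fail with -EAGAIN");
        return 0;
}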
@@ -732,11 +750,13 @@ SYSCALL_DEFINE2(munlock, unsigned long, start, size_t, len)
732{ 750{
733 int ret; 751 int ret;
734 752
735 down_write(&current->mm->mmap_sem);
736 len = PAGE_ALIGN(len + (start & ~PAGE_MASK)); 753 len = PAGE_ALIGN(len + (start & ~PAGE_MASK));
737 start &= PAGE_MASK; 754 start &= PAGE_MASK;
755
756 down_write(&current->mm->mmap_sem);
738 ret = do_mlock(start, len, 0); 757 ret = do_mlock(start, len, 0);
739 up_write(&current->mm->mmap_sem); 758 up_write(&current->mm->mmap_sem);
759
740 return ret; 760 return ret;
741} 761}
742 762
@@ -781,12 +801,12 @@ SYSCALL_DEFINE1(mlockall, int, flags)
781 if (flags & MCL_CURRENT) 801 if (flags & MCL_CURRENT)
782 lru_add_drain_all(); /* flush pagevec */ 802 lru_add_drain_all(); /* flush pagevec */
783 803
784 down_write(&current->mm->mmap_sem);
785
786 lock_limit = rlimit(RLIMIT_MEMLOCK); 804 lock_limit = rlimit(RLIMIT_MEMLOCK);
787 lock_limit >>= PAGE_SHIFT; 805 lock_limit >>= PAGE_SHIFT;
788 806
789 ret = -ENOMEM; 807 ret = -ENOMEM;
808 down_write(&current->mm->mmap_sem);
809
790 if (!(flags & MCL_CURRENT) || (current->mm->total_vm <= lock_limit) || 810 if (!(flags & MCL_CURRENT) || (current->mm->total_vm <= lock_limit) ||
791 capable(CAP_IPC_LOCK)) 811 capable(CAP_IPC_LOCK))
792 ret = do_mlockall(flags); 812 ret = do_mlockall(flags);
diff --git a/mm/mm_init.c b/mm/mm_init.c
index 68562e92d50c..4074caf9936b 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -202,5 +202,4 @@ static int __init mm_sysfs_init(void)
202 202
203 return 0; 203 return 0;
204} 204}
205 205postcore_initcall(mm_sysfs_init);
206__initcall(mm_sysfs_init);
diff --git a/mm/mmap.c b/mm/mmap.c
index 834b2d785f1e..20ff0c33274c 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -86,6 +86,7 @@ EXPORT_SYMBOL(vm_get_page_prot);
86 86
87int sysctl_overcommit_memory __read_mostly = OVERCOMMIT_GUESS; /* heuristic overcommit */ 87int sysctl_overcommit_memory __read_mostly = OVERCOMMIT_GUESS; /* heuristic overcommit */
88int sysctl_overcommit_ratio __read_mostly = 50; /* default is 50% */ 88int sysctl_overcommit_ratio __read_mostly = 50; /* default is 50% */
89unsigned long sysctl_overcommit_kbytes __read_mostly;
89int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT; 90int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT;
90unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */ 91unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */
91unsigned long sysctl_admin_reserve_kbytes __read_mostly = 1UL << 13; /* 8MB */ 92unsigned long sysctl_admin_reserve_kbytes __read_mostly = 1UL << 13; /* 8MB */
@@ -893,7 +894,15 @@ again: remove_next = 1 + (end > next->vm_end);
893static inline int is_mergeable_vma(struct vm_area_struct *vma, 894static inline int is_mergeable_vma(struct vm_area_struct *vma,
894 struct file *file, unsigned long vm_flags) 895 struct file *file, unsigned long vm_flags)
895{ 896{
896 if (vma->vm_flags ^ vm_flags) 897 /*
898 * VM_SOFTDIRTY should not prevent VMA merging if the flags
899 * match except for the dirty bit -- the caller should mark the
900 * merged VMA as dirty. If the dirty bit were not excluded from
901 * the comparison, we would increase pressure on the memory
902 * system by forcing the kernel to generate new VMAs when old
903 * ones could be extended instead.
904 */
905 if ((vma->vm_flags ^ vm_flags) & ~VM_SOFTDIRTY)
897 return 0; 906 return 0;
898 if (vma->vm_file != file) 907 if (vma->vm_file != file)
899 return 0; 908 return 0;
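The is_mergeable_vma() change above compares the two flag words with XOR and masks out VM_SOFTDIRTY, so VMAs that differ only in the soft-dirty bit can still be merged. A tiny sketch of that masked comparison; the bit values below are illustrative constants, not the kernel's definitions:

#include <stdbool.h>
#include <stdio.h>

#define VM_READ       0x00000001UL
#define VM_WRITE      0x00000002UL
#define VM_SOFTDIRTY  0x08000000UL      /* placeholder value for the demo */

static bool flags_mergeable(unsigned long a, unsigned long b)
{
        return ((a ^ b) & ~VM_SOFTDIRTY) == 0;
}

int main(void)
{
        unsigned long a = VM_READ | VM_WRITE | VM_SOFTDIRTY;
        unsigned long b = VM_READ | VM_WRITE;

        printf("%d\n", flags_mergeable(a, b));          /* 1: only soft-dirty differs */
        printf("%d\n", flags_mergeable(a, VM_READ));    /* 0: write permission differs */
        return 0;
}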
@@ -1082,7 +1091,7 @@ static int anon_vma_compatible(struct vm_area_struct *a, struct vm_area_struct *
1082 return a->vm_end == b->vm_start && 1091 return a->vm_end == b->vm_start &&
1083 mpol_equal(vma_policy(a), vma_policy(b)) && 1092 mpol_equal(vma_policy(a), vma_policy(b)) &&
1084 a->vm_file == b->vm_file && 1093 a->vm_file == b->vm_file &&
1085 !((a->vm_flags ^ b->vm_flags) & ~(VM_READ|VM_WRITE|VM_EXEC)) && 1094 !((a->vm_flags ^ b->vm_flags) & ~(VM_READ|VM_WRITE|VM_EXEC|VM_SOFTDIRTY)) &&
1086 b->vm_pgoff == a->vm_pgoff + ((b->vm_start - a->vm_start) >> PAGE_SHIFT); 1095 b->vm_pgoff == a->vm_pgoff + ((b->vm_start - a->vm_start) >> PAGE_SHIFT);
1087} 1096}
1088 1097
@@ -1190,6 +1199,24 @@ static inline unsigned long round_hint_to_min(unsigned long hint)
1190 return hint; 1199 return hint;
1191} 1200}
1192 1201
1202static inline int mlock_future_check(struct mm_struct *mm,
1203 unsigned long flags,
1204 unsigned long len)
1205{
1206 unsigned long locked, lock_limit;
1207
1208 /* mlock MCL_FUTURE? */
1209 if (flags & VM_LOCKED) {
1210 locked = len >> PAGE_SHIFT;
1211 locked += mm->locked_vm;
1212 lock_limit = rlimit(RLIMIT_MEMLOCK);
1213 lock_limit >>= PAGE_SHIFT;
1214 if (locked > lock_limit && !capable(CAP_IPC_LOCK))
1215 return -EAGAIN;
1216 }
1217 return 0;
1218}
1219
1193/* 1220/*
1194 * The caller must hold down_write(&current->mm->mmap_sem). 1221 * The caller must hold down_write(&current->mm->mmap_sem).
1195 */ 1222 */
@@ -1251,16 +1278,8 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
1251 if (!can_do_mlock()) 1278 if (!can_do_mlock())
1252 return -EPERM; 1279 return -EPERM;
1253 1280
1254 /* mlock MCL_FUTURE? */ 1281 if (mlock_future_check(mm, vm_flags, len))
1255 if (vm_flags & VM_LOCKED) { 1282 return -EAGAIN;
1256 unsigned long locked, lock_limit;
1257 locked = len >> PAGE_SHIFT;
1258 locked += mm->locked_vm;
1259 lock_limit = rlimit(RLIMIT_MEMLOCK);
1260 lock_limit >>= PAGE_SHIFT;
1261 if (locked > lock_limit && !capable(CAP_IPC_LOCK))
1262 return -EAGAIN;
1263 }
1264 1283
1265 if (file) { 1284 if (file) {
1266 struct inode *inode = file_inode(file); 1285 struct inode *inode = file_inode(file);
@@ -2591,18 +2610,9 @@ static unsigned long do_brk(unsigned long addr, unsigned long len)
2591 if (error & ~PAGE_MASK) 2610 if (error & ~PAGE_MASK)
2592 return error; 2611 return error;
2593 2612
2594 /* 2613 error = mlock_future_check(mm, mm->def_flags, len);
2595 * mlock MCL_FUTURE? 2614 if (error)
2596 */ 2615 return error;
2597 if (mm->def_flags & VM_LOCKED) {
2598 unsigned long locked, lock_limit;
2599 locked = len >> PAGE_SHIFT;
2600 locked += mm->locked_vm;
2601 lock_limit = rlimit(RLIMIT_MEMLOCK);
2602 lock_limit >>= PAGE_SHIFT;
2603 if (locked > lock_limit && !capable(CAP_IPC_LOCK))
2604 return -EAGAIN;
2605 }
2606 2616
2607 /* 2617 /*
2608 * mm->mmap_sem is required to protect against another thread 2618 * mm->mmap_sem is required to protect against another thread
@@ -3140,7 +3150,7 @@ static int init_user_reserve(void)
3140 sysctl_user_reserve_kbytes = min(free_kbytes / 32, 1UL << 17); 3150 sysctl_user_reserve_kbytes = min(free_kbytes / 32, 1UL << 17);
3141 return 0; 3151 return 0;
3142} 3152}
3143module_init(init_user_reserve) 3153subsys_initcall(init_user_reserve);
3144 3154
3145/* 3155/*
3146 * Initialise sysctl_admin_reserve_kbytes. 3156 * Initialise sysctl_admin_reserve_kbytes.
@@ -3161,7 +3171,7 @@ static int init_admin_reserve(void)
3161 sysctl_admin_reserve_kbytes = min(free_kbytes / 32, 1UL << 13); 3171 sysctl_admin_reserve_kbytes = min(free_kbytes / 32, 1UL << 13);
3162 return 0; 3172 return 0;
3163} 3173}
3164module_init(init_admin_reserve) 3174subsys_initcall(init_admin_reserve);
3165 3175
3166/* 3176/*
3167 * Reinititalise user and admin reserves if memory is added or removed. 3177 * Reinititalise user and admin reserves if memory is added or removed.
@@ -3231,4 +3241,4 @@ static int __meminit init_reserve_notifier(void)
3231 3241
3232 return 0; 3242 return 0;
3233} 3243}
3234module_init(init_reserve_notifier) 3244subsys_initcall(init_reserve_notifier);
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
index 93e6089cb456..41cefdf0aadd 100644
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -329,5 +329,4 @@ static int __init mmu_notifier_init(void)
329{ 329{
330 return init_srcu_struct(&srcu); 330 return init_srcu_struct(&srcu);
331} 331}
332 332subsys_initcall(mmu_notifier_init);
333module_init(mmu_notifier_init);
diff --git a/mm/mprotect.c b/mm/mprotect.c
index bb53a6591aea..7332c1785744 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -23,6 +23,7 @@
23#include <linux/mmu_notifier.h> 23#include <linux/mmu_notifier.h>
24#include <linux/migrate.h> 24#include <linux/migrate.h>
25#include <linux/perf_event.h> 25#include <linux/perf_event.h>
26#include <linux/ksm.h>
26#include <asm/uaccess.h> 27#include <asm/uaccess.h>
27#include <asm/pgtable.h> 28#include <asm/pgtable.h>
28#include <asm/cacheflush.h> 29#include <asm/cacheflush.h>
@@ -63,7 +64,7 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
63 64
64 ptent = *pte; 65 ptent = *pte;
65 page = vm_normal_page(vma, addr, oldpte); 66 page = vm_normal_page(vma, addr, oldpte);
66 if (page) { 67 if (page && !PageKsm(page)) {
67 if (!pte_numa(oldpte)) { 68 if (!pte_numa(oldpte)) {
68 ptent = pte_mknuma(ptent); 69 ptent = pte_mknuma(ptent);
69 set_pte_at(mm, addr, pte, ptent); 70 set_pte_at(mm, addr, pte, ptent);
diff --git a/mm/nobootmem.c b/mm/nobootmem.c
index 2c254d374655..f73f2987a852 100644
--- a/mm/nobootmem.c
+++ b/mm/nobootmem.c
@@ -41,11 +41,13 @@ static void * __init __alloc_memory_core_early(int nid, u64 size, u64 align,
41 if (limit > memblock.current_limit) 41 if (limit > memblock.current_limit)
42 limit = memblock.current_limit; 42 limit = memblock.current_limit;
43 43
44 addr = memblock_find_in_range_node(goal, limit, size, align, nid); 44 addr = memblock_find_in_range_node(size, align, goal, limit, nid);
45 if (!addr) 45 if (!addr)
46 return NULL; 46 return NULL;
47 47
48 memblock_reserve(addr, size); 48 if (memblock_reserve(addr, size))
49 return NULL;
50
49 ptr = phys_to_virt(addr); 51 ptr = phys_to_virt(addr);
50 memset(ptr, 0, size); 52 memset(ptr, 0, size);
51 /* 53 /*
@@ -114,16 +116,27 @@ static unsigned long __init __free_memory_core(phys_addr_t start,
114static unsigned long __init free_low_memory_core_early(void) 116static unsigned long __init free_low_memory_core_early(void)
115{ 117{
116 unsigned long count = 0; 118 unsigned long count = 0;
117 phys_addr_t start, end, size; 119 phys_addr_t start, end;
118 u64 i; 120 u64 i;
119 121
120 for_each_free_mem_range(i, MAX_NUMNODES, &start, &end, NULL) 122 for_each_free_mem_range(i, NUMA_NO_NODE, &start, &end, NULL)
121 count += __free_memory_core(start, end); 123 count += __free_memory_core(start, end);
122 124
123 /* free range that is used for reserved array if we allocate it */ 125#ifdef CONFIG_ARCH_DISCARD_MEMBLOCK
124 size = get_allocated_memblock_reserved_regions_info(&start); 126 {
125 if (size) 127 phys_addr_t size;
126 count += __free_memory_core(start, start + size); 128
129 /* Free memblock.reserved array if it was allocated */
130 size = get_allocated_memblock_reserved_regions_info(&start);
131 if (size)
132 count += __free_memory_core(start, start + size);
133
134 /* Free memblock.memory array if it was allocated */
135 size = get_allocated_memblock_memory_regions_info(&start);
136 if (size)
137 count += __free_memory_core(start, start + size);
138 }
139#endif
127 140
128 return count; 141 return count;
129} 142}
@@ -161,7 +174,7 @@ unsigned long __init free_all_bootmem(void)
161 reset_all_zones_managed_pages(); 174 reset_all_zones_managed_pages();
162 175
163 /* 176 /*
164 * We need to use MAX_NUMNODES instead of NODE_DATA(0)->node_id 177 * We need to use NUMA_NO_NODE instead of NODE_DATA(0)->node_id
165 * because in some case like Node0 doesn't have RAM installed 178 * because in some case like Node0 doesn't have RAM installed
166 * low ram will be on Node1 179 * low ram will be on Node1
167 */ 180 */
@@ -215,7 +228,7 @@ static void * __init ___alloc_bootmem_nopanic(unsigned long size,
215 228
216restart: 229restart:
217 230
218 ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align, goal, limit); 231 ptr = __alloc_memory_core_early(NUMA_NO_NODE, size, align, goal, limit);
219 232
220 if (ptr) 233 if (ptr)
221 return ptr; 234 return ptr;
@@ -299,7 +312,7 @@ again:
299 if (ptr) 312 if (ptr)
300 return ptr; 313 return ptr;
301 314
302 ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align, 315 ptr = __alloc_memory_core_early(NUMA_NO_NODE, size, align,
303 goal, limit); 316 goal, limit);
304 if (ptr) 317 if (ptr)
305 return ptr; 318 return ptr;
diff --git a/mm/nommu.c b/mm/nommu.c
index fec093adad9c..8740213b1647 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -60,6 +60,7 @@ unsigned long highest_memmap_pfn;
60struct percpu_counter vm_committed_as; 60struct percpu_counter vm_committed_as;
61int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */ 61int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */
62int sysctl_overcommit_ratio = 50; /* default is 50% */ 62int sysctl_overcommit_ratio = 50; /* default is 50% */
63unsigned long sysctl_overcommit_kbytes __read_mostly;
63int sysctl_max_map_count = DEFAULT_MAX_MAP_COUNT; 64int sysctl_max_map_count = DEFAULT_MAX_MAP_COUNT;
64int sysctl_nr_trim_pages = CONFIG_NOMMU_INITIAL_TRIM_EXCESS; 65int sysctl_nr_trim_pages = CONFIG_NOMMU_INITIAL_TRIM_EXCESS;
65unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */ 66unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 1e4a600a6163..3291e82d4352 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -47,19 +47,21 @@ static DEFINE_SPINLOCK(zone_scan_lock);
47#ifdef CONFIG_NUMA 47#ifdef CONFIG_NUMA
48/** 48/**
49 * has_intersects_mems_allowed() - check task eligiblity for kill 49 * has_intersects_mems_allowed() - check task eligiblity for kill
50 * @tsk: task struct of which task to consider 50 * @start: task struct of which task to consider
51 * @mask: nodemask passed to page allocator for mempolicy ooms 51 * @mask: nodemask passed to page allocator for mempolicy ooms
52 * 52 *
53 * Task eligibility is determined by whether or not a candidate task, @tsk, 53 * Task eligibility is determined by whether or not a candidate task, @tsk,
54 * shares the same mempolicy nodes as current if it is bound by such a policy 54 * shares the same mempolicy nodes as current if it is bound by such a policy
55 * and whether or not it has the same set of allowed cpuset nodes. 55 * and whether or not it has the same set of allowed cpuset nodes.
56 */ 56 */
57static bool has_intersects_mems_allowed(struct task_struct *tsk, 57static bool has_intersects_mems_allowed(struct task_struct *start,
58 const nodemask_t *mask) 58 const nodemask_t *mask)
59{ 59{
60 struct task_struct *start = tsk; 60 struct task_struct *tsk;
61 bool ret = false;
61 62
62 do { 63 rcu_read_lock();
64 for_each_thread(start, tsk) {
63 if (mask) { 65 if (mask) {
64 /* 66 /*
65 * If this is a mempolicy constrained oom, tsk's 67 * If this is a mempolicy constrained oom, tsk's
@@ -67,19 +69,20 @@ static bool has_intersects_mems_allowed(struct task_struct *tsk,
67 * mempolicy intersects current, otherwise it may be 69 * mempolicy intersects current, otherwise it may be
68 * needlessly killed. 70 * needlessly killed.
69 */ 71 */
70 if (mempolicy_nodemask_intersects(tsk, mask)) 72 ret = mempolicy_nodemask_intersects(tsk, mask);
71 return true;
72 } else { 73 } else {
73 /* 74 /*
74 * This is not a mempolicy constrained oom, so only 75 * This is not a mempolicy constrained oom, so only
75 * check the mems of tsk's cpuset. 76 * check the mems of tsk's cpuset.
76 */ 77 */
77 if (cpuset_mems_allowed_intersects(current, tsk)) 78 ret = cpuset_mems_allowed_intersects(current, tsk);
78 return true;
79 } 79 }
80 } while_each_thread(start, tsk); 80 if (ret)
81 break;
82 }
83 rcu_read_unlock();
81 84
82 return false; 85 return ret;
83} 86}
84#else 87#else
85static bool has_intersects_mems_allowed(struct task_struct *tsk, 88static bool has_intersects_mems_allowed(struct task_struct *tsk,
@@ -97,16 +100,21 @@ static bool has_intersects_mems_allowed(struct task_struct *tsk,
97 */ 100 */
98struct task_struct *find_lock_task_mm(struct task_struct *p) 101struct task_struct *find_lock_task_mm(struct task_struct *p)
99{ 102{
100 struct task_struct *t = p; 103 struct task_struct *t;
101 104
102 do { 105 rcu_read_lock();
106
107 for_each_thread(p, t) {
103 task_lock(t); 108 task_lock(t);
104 if (likely(t->mm)) 109 if (likely(t->mm))
105 return t; 110 goto found;
106 task_unlock(t); 111 task_unlock(t);
107 } while_each_thread(p, t); 112 }
113 t = NULL;
114found:
115 rcu_read_unlock();
108 116
109 return NULL; 117 return t;
110} 118}
111 119
112/* return true if the task is not adequate as candidate victim task. */ 120/* return true if the task is not adequate as candidate victim task. */
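find_lock_task_mm() above now walks the thread group with for_each_thread() under rcu_read_lock() and returns the first thread that still has an mm, with its task lock held. A simplified pthread sketch of that find-and-return-locked pattern; the task array and fields are stand-ins, not kernel structures:

#include <pthread.h>
#include <stddef.h>
#include <stdio.h>

struct task {
        pthread_mutex_t lock;
        void *mm;                       /* NULL means "no address space" */
};

/* Returns the first task with an mm, still holding its lock, or NULL. */
static struct task *find_lock_task_with_mm(struct task *tasks, int n)
{
        for (int i = 0; i < n; i++) {
                pthread_mutex_lock(&tasks[i].lock);
                if (tasks[i].mm)
                        return &tasks[i];
                pthread_mutex_unlock(&tasks[i].lock);
        }
        return NULL;
}

int main(void)
{
        int dummy;
        struct task tasks[3] = {
                { PTHREAD_MUTEX_INITIALIZER, NULL },
                { PTHREAD_MUTEX_INITIALIZER, &dummy },
                { PTHREAD_MUTEX_INITIALIZER, NULL },
        };
        struct task *t = find_lock_task_with_mm(tasks, 3);

        printf("found task %ld\n", t ? (long)(t - tasks) : -1L);
        if (t)
                pthread_mutex_unlock(&t->lock);
        return 0;
}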
@@ -170,7 +178,7 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
170 * implementation used by LSMs. 178 * implementation used by LSMs.
171 */ 179 */
172 if (has_capability_noaudit(p, CAP_SYS_ADMIN)) 180 if (has_capability_noaudit(p, CAP_SYS_ADMIN))
173 adj -= 30; 181 points -= (points * 3) / 100;
174 182
175 /* Normalize to oom_score_adj units */ 183 /* Normalize to oom_score_adj units */
176 adj *= totalpages / 1000; 184 adj *= totalpages / 1000;
@@ -301,7 +309,7 @@ static struct task_struct *select_bad_process(unsigned int *ppoints,
301 unsigned long chosen_points = 0; 309 unsigned long chosen_points = 0;
302 310
303 rcu_read_lock(); 311 rcu_read_lock();
304 do_each_thread(g, p) { 312 for_each_process_thread(g, p) {
305 unsigned int points; 313 unsigned int points;
306 314
307 switch (oom_scan_process_thread(p, totalpages, nodemask, 315 switch (oom_scan_process_thread(p, totalpages, nodemask,
@@ -319,11 +327,15 @@ static struct task_struct *select_bad_process(unsigned int *ppoints,
319 break; 327 break;
320 }; 328 };
321 points = oom_badness(p, NULL, nodemask, totalpages); 329 points = oom_badness(p, NULL, nodemask, totalpages);
322 if (points > chosen_points) { 330 if (!points || points < chosen_points)
323 chosen = p; 331 continue;
324 chosen_points = points; 332 /* Prefer thread group leaders for display purposes */
325 } 333 if (points == chosen_points && thread_group_leader(chosen))
326 } while_each_thread(g, p); 334 continue;
335
336 chosen = p;
337 chosen_points = points;
338 }
327 if (chosen) 339 if (chosen)
328 get_task_struct(chosen); 340 get_task_struct(chosen);
329 rcu_read_unlock(); 341 rcu_read_unlock();
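The select_bad_process() loop above skips zero scores, keeps the highest score seen so far, and on a tie prefers to keep an already-chosen thread group leader so the report names the group. A small sketch of that selection rule with made-up candidates:

#include <stdbool.h>
#include <stdio.h>

struct cand {
        const char *name;
        unsigned int points;
        bool leader;
};

int main(void)
{
        struct cand c[] = {
                { "leader",   40, true  },
                { "worker-1", 40, false },
                { "worker-2", 10, false },
        };
        struct cand *chosen = NULL;
        unsigned int chosen_points = 0;

        for (int i = 0; i < 3; i++) {
                unsigned int points = c[i].points;

                if (!points || points < chosen_points)
                        continue;
                /* On ties, keep the current choice if it is a group leader. */
                if (chosen && points == chosen_points && chosen->leader)
                        continue;
                chosen = &c[i];
                chosen_points = points;
        }
        printf("chosen: %s\n", chosen ? chosen->name : "none");
        return 0;
}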
@@ -406,7 +418,7 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
406{ 418{
407 struct task_struct *victim = p; 419 struct task_struct *victim = p;
408 struct task_struct *child; 420 struct task_struct *child;
409 struct task_struct *t = p; 421 struct task_struct *t;
410 struct mm_struct *mm; 422 struct mm_struct *mm;
411 unsigned int victim_points = 0; 423 unsigned int victim_points = 0;
412 static DEFINE_RATELIMIT_STATE(oom_rs, DEFAULT_RATELIMIT_INTERVAL, 424 static DEFINE_RATELIMIT_STATE(oom_rs, DEFAULT_RATELIMIT_INTERVAL,
@@ -437,7 +449,7 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
437 * still freeing memory. 449 * still freeing memory.
438 */ 450 */
439 read_lock(&tasklist_lock); 451 read_lock(&tasklist_lock);
440 do { 452 for_each_thread(p, t) {
441 list_for_each_entry(child, &t->children, sibling) { 453 list_for_each_entry(child, &t->children, sibling) {
442 unsigned int child_points; 454 unsigned int child_points;
443 455
@@ -455,13 +467,11 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
455 get_task_struct(victim); 467 get_task_struct(victim);
456 } 468 }
457 } 469 }
458 } while_each_thread(p, t); 470 }
459 read_unlock(&tasklist_lock); 471 read_unlock(&tasklist_lock);
460 472
461 rcu_read_lock();
462 p = find_lock_task_mm(victim); 473 p = find_lock_task_mm(victim);
463 if (!p) { 474 if (!p) {
464 rcu_read_unlock();
465 put_task_struct(victim); 475 put_task_struct(victim);
466 return; 476 return;
467 } else if (victim != p) { 477 } else if (victim != p) {
@@ -487,6 +497,7 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
487 * That thread will now get access to memory reserves since it has a 497 * That thread will now get access to memory reserves since it has a
488 * pending fatal signal. 498 * pending fatal signal.
489 */ 499 */
500 rcu_read_lock();
490 for_each_process(p) 501 for_each_process(p)
491 if (p->mm == mm && !same_thread_group(p, victim) && 502 if (p->mm == mm && !same_thread_group(p, victim) &&
492 !(p->flags & PF_KTHREAD)) { 503 !(p->flags & PF_KTHREAD)) {
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 63807583d8e8..2d30e2cfe804 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -191,6 +191,26 @@ static unsigned long writeout_period_time = 0;
191 * global dirtyable memory first. 191 * global dirtyable memory first.
192 */ 192 */
193 193
194/**
195 * zone_dirtyable_memory - number of dirtyable pages in a zone
196 * @zone: the zone
197 *
198 * Returns the zone's number of pages potentially available for dirty
199 * page cache. This is the base value for the per-zone dirty limits.
200 */
201static unsigned long zone_dirtyable_memory(struct zone *zone)
202{
203 unsigned long nr_pages;
204
205 nr_pages = zone_page_state(zone, NR_FREE_PAGES);
206 nr_pages -= min(nr_pages, zone->dirty_balance_reserve);
207
208 nr_pages += zone_page_state(zone, NR_INACTIVE_FILE);
209 nr_pages += zone_page_state(zone, NR_ACTIVE_FILE);
210
211 return nr_pages;
212}
213
194static unsigned long highmem_dirtyable_memory(unsigned long total) 214static unsigned long highmem_dirtyable_memory(unsigned long total)
195{ 215{
196#ifdef CONFIG_HIGHMEM 216#ifdef CONFIG_HIGHMEM
@@ -198,11 +218,9 @@ static unsigned long highmem_dirtyable_memory(unsigned long total)
198 unsigned long x = 0; 218 unsigned long x = 0;
199 219
200 for_each_node_state(node, N_HIGH_MEMORY) { 220 for_each_node_state(node, N_HIGH_MEMORY) {
201 struct zone *z = 221 struct zone *z = &NODE_DATA(node)->node_zones[ZONE_HIGHMEM];
202 &NODE_DATA(node)->node_zones[ZONE_HIGHMEM];
203 222
204 x += zone_page_state(z, NR_FREE_PAGES) + 223 x += zone_dirtyable_memory(z);
205 zone_reclaimable_pages(z) - z->dirty_balance_reserve;
206 } 224 }
207 /* 225 /*
208 * Unreclaimable memory (kernel memory or anonymous memory 226 * Unreclaimable memory (kernel memory or anonymous memory
@@ -238,9 +256,12 @@ static unsigned long global_dirtyable_memory(void)
238{ 256{
239 unsigned long x; 257 unsigned long x;
240 258
241 x = global_page_state(NR_FREE_PAGES) + global_reclaimable_pages(); 259 x = global_page_state(NR_FREE_PAGES);
242 x -= min(x, dirty_balance_reserve); 260 x -= min(x, dirty_balance_reserve);
243 261
262 x += global_page_state(NR_INACTIVE_FILE);
263 x += global_page_state(NR_ACTIVE_FILE);
264
244 if (!vm_highmem_is_dirtyable) 265 if (!vm_highmem_is_dirtyable)
245 x -= highmem_dirtyable_memory(x); 266 x -= highmem_dirtyable_memory(x);
246 267
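global_dirtyable_memory() above now counts free pages minus the dirty balance reserve plus the file LRU pages, instead of the old free-plus-reclaimable estimate. A back-of-the-envelope sketch of that formula with made-up page counts:

#include <stdio.h>

int main(void)
{
        unsigned long nr_free = 100000, reserve = 8000;
        unsigned long inactive_file = 30000, active_file = 20000;
        unsigned long x = nr_free;

        x -= (reserve < x) ? reserve : x;       /* x -= min(x, reserve): no underflow */
        x += inactive_file + active_file;

        printf("dirtyable pages: %lu\n", x);    /* 142000 with the numbers above */
        return 0;
}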
@@ -289,32 +310,6 @@ void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty)
289} 310}
290 311
291/** 312/**
292 * zone_dirtyable_memory - number of dirtyable pages in a zone
293 * @zone: the zone
294 *
295 * Returns the zone's number of pages potentially available for dirty
296 * page cache. This is the base value for the per-zone dirty limits.
297 */
298static unsigned long zone_dirtyable_memory(struct zone *zone)
299{
300 /*
301 * The effective global number of dirtyable pages may exclude
302 * highmem as a big-picture measure to keep the ratio between
303 * dirty memory and lowmem reasonable.
304 *
305 * But this function is purely about the individual zone and a
306 * highmem zone can hold its share of dirty pages, so we don't
307 * care about vm_highmem_is_dirtyable here.
308 */
309 unsigned long nr_pages = zone_page_state(zone, NR_FREE_PAGES) +
310 zone_reclaimable_pages(zone);
311
312 /* don't allow this to underflow */
313 nr_pages -= min(nr_pages, zone->dirty_balance_reserve);
314 return nr_pages;
315}
316
317/**
318 * zone_dirty_limit - maximum number of dirty pages allowed in a zone 313 * zone_dirty_limit - maximum number of dirty pages allowed in a zone
319 * @zone: the zone 314 * @zone: the zone
320 * 315 *
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 5248fe070aa4..e3758a09a009 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -205,7 +205,7 @@ static char * const zone_names[MAX_NR_ZONES] = {
205}; 205};
206 206
207int min_free_kbytes = 1024; 207int min_free_kbytes = 1024;
208int user_min_free_kbytes; 208int user_min_free_kbytes = -1;
209 209
210static unsigned long __meminitdata nr_kernel_pages; 210static unsigned long __meminitdata nr_kernel_pages;
211static unsigned long __meminitdata nr_all_pages; 211static unsigned long __meminitdata nr_all_pages;
@@ -295,7 +295,7 @@ static inline int bad_range(struct zone *zone, struct page *page)
295} 295}
296#endif 296#endif
297 297
298static void bad_page(struct page *page) 298static void bad_page(struct page *page, char *reason, unsigned long bad_flags)
299{ 299{
300 static unsigned long resume; 300 static unsigned long resume;
301 static unsigned long nr_shown; 301 static unsigned long nr_shown;
@@ -329,7 +329,7 @@ static void bad_page(struct page *page)
329 329
330 printk(KERN_ALERT "BUG: Bad page state in process %s pfn:%05lx\n", 330 printk(KERN_ALERT "BUG: Bad page state in process %s pfn:%05lx\n",
331 current->comm, page_to_pfn(page)); 331 current->comm, page_to_pfn(page));
332 dump_page(page); 332 dump_page_badflags(page, reason, bad_flags);
333 333
334 print_modules(); 334 print_modules();
335 dump_stack(); 335 dump_stack();
@@ -383,7 +383,7 @@ static int destroy_compound_page(struct page *page, unsigned long order)
383 int bad = 0; 383 int bad = 0;
384 384
385 if (unlikely(compound_order(page) != order)) { 385 if (unlikely(compound_order(page) != order)) {
386 bad_page(page); 386 bad_page(page, "wrong compound order", 0);
387 bad++; 387 bad++;
388 } 388 }
389 389
@@ -392,8 +392,11 @@ static int destroy_compound_page(struct page *page, unsigned long order)
392 for (i = 1; i < nr_pages; i++) { 392 for (i = 1; i < nr_pages; i++) {
393 struct page *p = page + i; 393 struct page *p = page + i;
394 394
395 if (unlikely(!PageTail(p) || (p->first_page != page))) { 395 if (unlikely(!PageTail(p))) {
396 bad_page(page); 396 bad_page(page, "PageTail not set", 0);
397 bad++;
398 } else if (unlikely(p->first_page != page)) {
399 bad_page(page, "first_page not consistent", 0);
397 bad++; 400 bad++;
398 } 401 }
399 __ClearPageTail(p); 402 __ClearPageTail(p);
@@ -506,12 +509,12 @@ static inline int page_is_buddy(struct page *page, struct page *buddy,
506 return 0; 509 return 0;
507 510
508 if (page_is_guard(buddy) && page_order(buddy) == order) { 511 if (page_is_guard(buddy) && page_order(buddy) == order) {
509 VM_BUG_ON(page_count(buddy) != 0); 512 VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy);
510 return 1; 513 return 1;
511 } 514 }
512 515
513 if (PageBuddy(buddy) && page_order(buddy) == order) { 516 if (PageBuddy(buddy) && page_order(buddy) == order) {
514 VM_BUG_ON(page_count(buddy) != 0); 517 VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy);
515 return 1; 518 return 1;
516 } 519 }
517 return 0; 520 return 0;
@@ -561,8 +564,8 @@ static inline void __free_one_page(struct page *page,
561 564
562 page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1); 565 page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1);
563 566
564 VM_BUG_ON(page_idx & ((1 << order) - 1)); 567 VM_BUG_ON_PAGE(page_idx & ((1 << order) - 1), page);
565 VM_BUG_ON(bad_range(zone, page)); 568 VM_BUG_ON_PAGE(bad_range(zone, page), page);
566 569
567 while (order < MAX_ORDER-1) { 570 while (order < MAX_ORDER-1) {
568 buddy_idx = __find_buddy_index(page_idx, order); 571 buddy_idx = __find_buddy_index(page_idx, order);
@@ -618,12 +621,23 @@ out:
618 621
619static inline int free_pages_check(struct page *page) 622static inline int free_pages_check(struct page *page)
620{ 623{
621 if (unlikely(page_mapcount(page) | 624 char *bad_reason = NULL;
622 (page->mapping != NULL) | 625 unsigned long bad_flags = 0;
623 (atomic_read(&page->_count) != 0) | 626
624 (page->flags & PAGE_FLAGS_CHECK_AT_FREE) | 627 if (unlikely(page_mapcount(page)))
625 (mem_cgroup_bad_page_check(page)))) { 628 bad_reason = "nonzero mapcount";
626 bad_page(page); 629 if (unlikely(page->mapping != NULL))
630 bad_reason = "non-NULL mapping";
631 if (unlikely(atomic_read(&page->_count) != 0))
632 bad_reason = "nonzero _count";
633 if (unlikely(page->flags & PAGE_FLAGS_CHECK_AT_FREE)) {
634 bad_reason = "PAGE_FLAGS_CHECK_AT_FREE flag(s) set";
635 bad_flags = PAGE_FLAGS_CHECK_AT_FREE;
636 }
637 if (unlikely(mem_cgroup_bad_page_check(page)))
638 bad_reason = "cgroup check failed";
639 if (unlikely(bad_reason)) {
640 bad_page(page, bad_reason, bad_flags);
627 return 1; 641 return 1;
628 } 642 }
629 page_cpupid_reset_last(page); 643 page_cpupid_reset_last(page);
@@ -813,7 +827,7 @@ static inline void expand(struct zone *zone, struct page *page,
813 area--; 827 area--;
814 high--; 828 high--;
815 size >>= 1; 829 size >>= 1;
816 VM_BUG_ON(bad_range(zone, &page[size])); 830 VM_BUG_ON_PAGE(bad_range(zone, &page[size]), &page[size]);
817 831
818#ifdef CONFIG_DEBUG_PAGEALLOC 832#ifdef CONFIG_DEBUG_PAGEALLOC
819 if (high < debug_guardpage_minorder()) { 833 if (high < debug_guardpage_minorder()) {
@@ -843,12 +857,23 @@ static inline void expand(struct zone *zone, struct page *page,
843 */ 857 */
844static inline int check_new_page(struct page *page) 858static inline int check_new_page(struct page *page)
845{ 859{
846 if (unlikely(page_mapcount(page) | 860 char *bad_reason = NULL;
847 (page->mapping != NULL) | 861 unsigned long bad_flags = 0;
848 (atomic_read(&page->_count) != 0) | 862
849 (page->flags & PAGE_FLAGS_CHECK_AT_PREP) | 863 if (unlikely(page_mapcount(page)))
850 (mem_cgroup_bad_page_check(page)))) { 864 bad_reason = "nonzero mapcount";
851 bad_page(page); 865 if (unlikely(page->mapping != NULL))
866 bad_reason = "non-NULL mapping";
867 if (unlikely(atomic_read(&page->_count) != 0))
868 bad_reason = "nonzero _count";
869 if (unlikely(page->flags & PAGE_FLAGS_CHECK_AT_PREP)) {
870 bad_reason = "PAGE_FLAGS_CHECK_AT_PREP flag set";
871 bad_flags = PAGE_FLAGS_CHECK_AT_PREP;
872 }
873 if (unlikely(mem_cgroup_bad_page_check(page)))
874 bad_reason = "cgroup check failed";
875 if (unlikely(bad_reason)) {
876 bad_page(page, bad_reason, bad_flags);
852 return 1; 877 return 1;
853 } 878 }
854 return 0; 879 return 0;
@@ -955,7 +980,7 @@ int move_freepages(struct zone *zone,
955 980
956 for (page = start_page; page <= end_page;) { 981 for (page = start_page; page <= end_page;) {
957 /* Make sure we are not inadvertently changing nodes */ 982 /* Make sure we are not inadvertently changing nodes */
958 VM_BUG_ON(page_to_nid(page) != zone_to_nid(zone)); 983 VM_BUG_ON_PAGE(page_to_nid(page) != zone_to_nid(zone), page);
959 984
960 if (!pfn_valid_within(page_to_pfn(page))) { 985 if (!pfn_valid_within(page_to_pfn(page))) {
961 page++; 986 page++;
@@ -1404,8 +1429,8 @@ void split_page(struct page *page, unsigned int order)
1404{ 1429{
1405 int i; 1430 int i;
1406 1431
1407 VM_BUG_ON(PageCompound(page)); 1432 VM_BUG_ON_PAGE(PageCompound(page), page);
1408 VM_BUG_ON(!page_count(page)); 1433 VM_BUG_ON_PAGE(!page_count(page), page);
1409 1434
1410#ifdef CONFIG_KMEMCHECK 1435#ifdef CONFIG_KMEMCHECK
1411 /* 1436 /*
@@ -1552,7 +1577,7 @@ again:
1552 zone_statistics(preferred_zone, zone, gfp_flags); 1577 zone_statistics(preferred_zone, zone, gfp_flags);
1553 local_irq_restore(flags); 1578 local_irq_restore(flags);
1554 1579
1555 VM_BUG_ON(bad_range(zone, page)); 1580 VM_BUG_ON_PAGE(bad_range(zone, page), page);
1556 if (prep_new_page(page, order, gfp_flags)) 1581 if (prep_new_page(page, order, gfp_flags))
1557 goto again; 1582 goto again;
1558 return page; 1583 return page;
@@ -2072,13 +2097,6 @@ void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...)
2072 return; 2097 return;
2073 2098
2074 /* 2099 /*
2075 * Walking all memory to count page types is very expensive and should
2076 * be inhibited in non-blockable contexts.
2077 */
2078 if (!(gfp_mask & __GFP_WAIT))
2079 filter |= SHOW_MEM_FILTER_PAGE_COUNT;
2080
2081 /*
2082 * This documents exceptions given to allocations in certain 2100 * This documents exceptions given to allocations in certain
2083 * contexts that are allowed to allocate outside current's set 2101 * contexts that are allowed to allocate outside current's set
2084 * of allowed nodes. 2102 * of allowed nodes.
@@ -2242,10 +2260,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
2242 preferred_zone, migratetype); 2260 preferred_zone, migratetype);
2243 if (page) { 2261 if (page) {
2244 preferred_zone->compact_blockskip_flush = false; 2262 preferred_zone->compact_blockskip_flush = false;
2245 preferred_zone->compact_considered = 0; 2263 compaction_defer_reset(preferred_zone, order, true);
2246 preferred_zone->compact_defer_shift = 0;
2247 if (order >= preferred_zone->compact_order_failed)
2248 preferred_zone->compact_order_failed = order + 1;
2249 count_vm_event(COMPACTSUCCESS); 2264 count_vm_event(COMPACTSUCCESS);
2250 return page; 2265 return page;
2251 } 2266 }
@@ -2535,8 +2550,15 @@ rebalance:
2535 } 2550 }
2536 2551
2537 /* Atomic allocations - we can't balance anything */ 2552 /* Atomic allocations - we can't balance anything */
2538 if (!wait) 2553 if (!wait) {
2554 /*
2555 * All existing users of the deprecated __GFP_NOFAIL are
2556 * blockable, so warn of any new users that actually allow this
2557 * type of allocation to fail.
2558 */
2559 WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL);
2539 goto nopage; 2560 goto nopage;
2561 }
2540 2562
2541 /* Avoid recursion of direct reclaim */ 2563 /* Avoid recursion of direct reclaim */
2542 if (current->flags & PF_MEMALLOC) 2564 if (current->flags & PF_MEMALLOC)
@@ -3901,6 +3923,7 @@ static void setup_zone_migrate_reserve(struct zone *zone)
3901 struct page *page; 3923 struct page *page;
3902 unsigned long block_migratetype; 3924 unsigned long block_migratetype;
3903 int reserve; 3925 int reserve;
3926 int old_reserve;
3904 3927
3905 /* 3928 /*
3906 * Get the start pfn, end pfn and the number of blocks to reserve 3929 * Get the start pfn, end pfn and the number of blocks to reserve
@@ -3922,6 +3945,12 @@ static void setup_zone_migrate_reserve(struct zone *zone)
3922 * future allocation of hugepages at runtime. 3945 * future allocation of hugepages at runtime.
3923 */ 3946 */
3924 reserve = min(2, reserve); 3947 reserve = min(2, reserve);
3948 old_reserve = zone->nr_migrate_reserve_block;
3949
3950 /* On memory hot-add, we almost always need to do nothing */
3951 if (reserve == old_reserve)
3952 return;
3953 zone->nr_migrate_reserve_block = reserve;
3925 3954
3926 for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) { 3955 for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) {
3927 if (!pfn_valid(pfn)) 3956 if (!pfn_valid(pfn))
@@ -3959,6 +3988,12 @@ static void setup_zone_migrate_reserve(struct zone *zone)
3959 reserve--; 3988 reserve--;
3960 continue; 3989 continue;
3961 } 3990 }
3991 } else if (!old_reserve) {
3992 /*
3993 * At boot time we don't need to scan the whole zone
3994 * for turning off MIGRATE_RESERVE.
3995 */
3996 break;
3962 } 3997 }
3963 3998
3964 /* 3999 /*
@@ -4209,7 +4244,6 @@ static noinline __init_refok
4209int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages) 4244int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
4210{ 4245{
4211 int i; 4246 int i;
4212 struct pglist_data *pgdat = zone->zone_pgdat;
4213 size_t alloc_size; 4247 size_t alloc_size;
4214 4248
4215 /* 4249 /*
@@ -4225,7 +4259,8 @@ int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
4225 4259
4226 if (!slab_is_available()) { 4260 if (!slab_is_available()) {
4227 zone->wait_table = (wait_queue_head_t *) 4261 zone->wait_table = (wait_queue_head_t *)
4228 alloc_bootmem_node_nopanic(pgdat, alloc_size); 4262 memblock_virt_alloc_node_nopanic(
4263 alloc_size, zone->zone_pgdat->node_id);
4229 } else { 4264 } else {
4230 /* 4265 /*
4231 * This case means that a zone whose size was 0 gets new memory 4266 * This case means that a zone whose size was 0 gets new memory
@@ -4345,13 +4380,14 @@ bool __meminit early_pfn_in_nid(unsigned long pfn, int node)
4345#endif 4380#endif
4346 4381
4347/** 4382/**
4348 * free_bootmem_with_active_regions - Call free_bootmem_node for each active range 4383 * free_bootmem_with_active_regions - Call memblock_free_early_nid for each active range
4349 * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed. 4384 * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed.
4350 * @max_low_pfn: The highest PFN that will be passed to free_bootmem_node 4385 * @max_low_pfn: The highest PFN that will be passed to memblock_free_early_nid
4351 * 4386 *
4352 * If an architecture guarantees that all ranges registered with 4387 * If an architecture guarantees that all ranges registered with
4353 * add_active_ranges() contain no holes and may be freed, this 4388 * add_active_ranges() contain no holes and may be freed, this
4354 * this function may be used instead of calling free_bootmem() manually. 4389 * this function may be used instead of calling memblock_free_early_nid()
4390 * manually.
4355 */ 4391 */
4356void __init free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn) 4392void __init free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn)
4357{ 4393{
@@ -4363,9 +4399,9 @@ void __init free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn)
4363 end_pfn = min(end_pfn, max_low_pfn); 4399 end_pfn = min(end_pfn, max_low_pfn);
4364 4400
4365 if (start_pfn < end_pfn) 4401 if (start_pfn < end_pfn)
4366 free_bootmem_node(NODE_DATA(this_nid), 4402 memblock_free_early_nid(PFN_PHYS(start_pfn),
4367 PFN_PHYS(start_pfn), 4403 (end_pfn - start_pfn) << PAGE_SHIFT,
4368 (end_pfn - start_pfn) << PAGE_SHIFT); 4404 this_nid);
4369 } 4405 }
4370} 4406}
4371 4407
@@ -4636,8 +4672,9 @@ static void __init setup_usemap(struct pglist_data *pgdat,
4636 unsigned long usemapsize = usemap_size(zone_start_pfn, zonesize); 4672 unsigned long usemapsize = usemap_size(zone_start_pfn, zonesize);
4637 zone->pageblock_flags = NULL; 4673 zone->pageblock_flags = NULL;
4638 if (usemapsize) 4674 if (usemapsize)
4639 zone->pageblock_flags = alloc_bootmem_node_nopanic(pgdat, 4675 zone->pageblock_flags =
4640 usemapsize); 4676 memblock_virt_alloc_node_nopanic(usemapsize,
4677 pgdat->node_id);
4641} 4678}
4642#else 4679#else
4643static inline void setup_usemap(struct pglist_data *pgdat, struct zone *zone, 4680static inline void setup_usemap(struct pglist_data *pgdat, struct zone *zone,
@@ -4831,7 +4868,8 @@ static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat)
4831 size = (end - start) * sizeof(struct page); 4868 size = (end - start) * sizeof(struct page);
4832 map = alloc_remap(pgdat->node_id, size); 4869 map = alloc_remap(pgdat->node_id, size);
4833 if (!map) 4870 if (!map)
4834 map = alloc_bootmem_node_nopanic(pgdat, size); 4871 map = memblock_virt_alloc_node_nopanic(size,
4872 pgdat->node_id);
4835 pgdat->node_mem_map = map + (pgdat->node_start_pfn - start); 4873 pgdat->node_mem_map = map + (pgdat->node_start_pfn - start);
4836 } 4874 }
4837#ifndef CONFIG_NEED_MULTIPLE_NODES 4875#ifndef CONFIG_NEED_MULTIPLE_NODES
@@ -5012,9 +5050,33 @@ static void __init find_zone_movable_pfns_for_nodes(void)
5012 nodemask_t saved_node_state = node_states[N_MEMORY]; 5050 nodemask_t saved_node_state = node_states[N_MEMORY];
5013 unsigned long totalpages = early_calculate_totalpages(); 5051 unsigned long totalpages = early_calculate_totalpages();
5014 int usable_nodes = nodes_weight(node_states[N_MEMORY]); 5052 int usable_nodes = nodes_weight(node_states[N_MEMORY]);
5053 struct memblock_type *type = &memblock.memory;
5054
5055 /* Need to find movable_zone earlier when movable_node is specified. */
5056 find_usable_zone_for_movable();
5057
5058 /*
5059 * If movable_node is specified, ignore kernelcore and movablecore
5060 * options.
5061 */
5062 if (movable_node_is_enabled()) {
5063 for (i = 0; i < type->cnt; i++) {
5064 if (!memblock_is_hotpluggable(&type->regions[i]))
5065 continue;
5066
5067 nid = type->regions[i].nid;
5068
5069 usable_startpfn = PFN_DOWN(type->regions[i].base);
5070 zone_movable_pfn[nid] = zone_movable_pfn[nid] ?
5071 min(usable_startpfn, zone_movable_pfn[nid]) :
5072 usable_startpfn;
5073 }
5074
5075 goto out2;
5076 }
5015 5077
5016 /* 5078 /*
5017 * If movablecore was specified, calculate what size of 5079 * If movablecore=nn[KMG] was specified, calculate what size of
5018 * kernelcore that corresponds so that memory usable for 5080 * kernelcore that corresponds so that memory usable for
5019 * any allocation type is evenly spread. If both kernelcore 5081 * any allocation type is evenly spread. If both kernelcore
5020 * and movablecore are specified, then the value of kernelcore 5082 * and movablecore are specified, then the value of kernelcore
@@ -5040,7 +5102,6 @@ static void __init find_zone_movable_pfns_for_nodes(void)
5040 goto out; 5102 goto out;
5041 5103
5042 /* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */ 5104 /* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */
5043 find_usable_zone_for_movable();
5044 usable_startpfn = arch_zone_lowest_possible_pfn[movable_zone]; 5105 usable_startpfn = arch_zone_lowest_possible_pfn[movable_zone];
5045 5106
5046restart: 5107restart:
@@ -5131,6 +5192,7 @@ restart:
5131 if (usable_nodes && required_kernelcore > usable_nodes) 5192 if (usable_nodes && required_kernelcore > usable_nodes)
5132 goto restart; 5193 goto restart;
5133 5194
5195out2:
5134 /* Align start of ZONE_MOVABLE on all nids to MAX_ORDER_NR_PAGES */ 5196 /* Align start of ZONE_MOVABLE on all nids to MAX_ORDER_NR_PAGES */
5135 for (nid = 0; nid < MAX_NUMNODES; nid++) 5197 for (nid = 0; nid < MAX_NUMNODES; nid++)
5136 zone_movable_pfn[nid] = 5198 zone_movable_pfn[nid] =
@@ -5692,7 +5754,12 @@ module_init(init_per_zone_wmark_min)
5692int min_free_kbytes_sysctl_handler(ctl_table *table, int write, 5754int min_free_kbytes_sysctl_handler(ctl_table *table, int write,
5693 void __user *buffer, size_t *length, loff_t *ppos) 5755 void __user *buffer, size_t *length, loff_t *ppos)
5694{ 5756{
5695 proc_dointvec(table, write, buffer, length, ppos); 5757 int rc;
5758
5759 rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
5760 if (rc)
5761 return rc;
5762
5696 if (write) { 5763 if (write) {
5697 user_min_free_kbytes = min_free_kbytes; 5764 user_min_free_kbytes = min_free_kbytes;
5698 setup_per_zone_wmarks(); 5765 setup_per_zone_wmarks();
@@ -5857,7 +5924,7 @@ void *__init alloc_large_system_hash(const char *tablename,
5857 do { 5924 do {
5858 size = bucketsize << log2qty; 5925 size = bucketsize << log2qty;
5859 if (flags & HASH_EARLY) 5926 if (flags & HASH_EARLY)
5860 table = alloc_bootmem_nopanic(size); 5927 table = memblock_virt_alloc_nopanic(size, 0);
5861 else if (hashdist) 5928 else if (hashdist)
5862 table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL); 5929 table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL);
5863 else { 5930 else {
@@ -5959,7 +6026,7 @@ void set_pageblock_flags_group(struct page *page, unsigned long flags,
5959 pfn = page_to_pfn(page); 6026 pfn = page_to_pfn(page);
5960 bitmap = get_pageblock_bitmap(zone, pfn); 6027 bitmap = get_pageblock_bitmap(zone, pfn);
5961 bitidx = pfn_to_bitidx(zone, pfn); 6028 bitidx = pfn_to_bitidx(zone, pfn);
5962 VM_BUG_ON(!zone_spans_pfn(zone, pfn)); 6029 VM_BUG_ON_PAGE(!zone_spans_pfn(zone, pfn), page);
5963 6030
5964 for (; start_bitidx <= end_bitidx; start_bitidx++, value <<= 1) 6031 for (; start_bitidx <= end_bitidx; start_bitidx++, value <<= 1)
5965 if (flags & value) 6032 if (flags & value)
@@ -6457,12 +6524,24 @@ static void dump_page_flags(unsigned long flags)
6457 printk(")\n"); 6524 printk(")\n");
6458} 6525}
6459 6526
6460void dump_page(struct page *page) 6527void dump_page_badflags(struct page *page, char *reason, unsigned long badflags)
6461{ 6528{
6462 printk(KERN_ALERT 6529 printk(KERN_ALERT
6463 "page:%p count:%d mapcount:%d mapping:%p index:%#lx\n", 6530 "page:%p count:%d mapcount:%d mapping:%p index:%#lx\n",
6464 page, atomic_read(&page->_count), page_mapcount(page), 6531 page, atomic_read(&page->_count), page_mapcount(page),
6465 page->mapping, page->index); 6532 page->mapping, page->index);
6466 dump_page_flags(page->flags); 6533 dump_page_flags(page->flags);
6534 if (reason)
6535 pr_alert("page dumped because: %s\n", reason);
6536 if (page->flags & badflags) {
6537 pr_alert("bad because of flags:\n");
6538 dump_page_flags(page->flags & badflags);
6539 }
6467 mem_cgroup_print_bad_page(page); 6540 mem_cgroup_print_bad_page(page);
6468} 6541}
6542
6543void dump_page(struct page *page, char *reason)
6544{
6545 dump_page_badflags(page, reason, 0);
6546}
6547EXPORT_SYMBOL_GPL(dump_page);
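
The dump_page() rework above splits the helper in two: dump_page_badflags() prints the usual page state plus whichever flag bits the caller considers bad, and dump_page() becomes a thin wrapper passing a zero mask. A minimal user-space sketch of that split; the types and flag values are made up for illustration and only stand in for struct page and its flags:

    /*
     * Sketch only: fake_page and the flag values below are invented
     * stand-ins for struct page, not kernel definitions.
     */
    #include <stdio.h>

    struct fake_page {
        unsigned long flags;
        int count;
        int mapcount;
    };

    static void dump_flags(unsigned long flags)
    {
        printf("flags: %#lx\n", flags);
    }

    static void dump_page_badflags(struct fake_page *page, const char *reason,
                                   unsigned long badflags)
    {
        printf("page:%p count:%d mapcount:%d\n",
               (void *)page, page->count, page->mapcount);
        dump_flags(page->flags);
        if (reason)
            printf("page dumped because: %s\n", reason);
        if (page->flags & badflags) {
            printf("bad because of flags:\n");
            dump_flags(page->flags & badflags);
        }
    }

    static void dump_page(struct fake_page *page, const char *reason)
    {
        /* the plain dump is just the badflags variant with no bad bits */
        dump_page_badflags(page, reason, 0);
    }

    int main(void)
    {
        struct fake_page p = { .flags = 0x5, .count = 1, .mapcount = 0 };

        dump_page(&p, "demo");
        dump_page_badflags(&p, "demo with badflags", 0x4);
        return 0;
    }
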
diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c
index 6d757e3a872a..cfd162882c00 100644
--- a/mm/page_cgroup.c
+++ b/mm/page_cgroup.c
@@ -54,8 +54,9 @@ static int __init alloc_node_page_cgroup(int nid)
54 54
55 table_size = sizeof(struct page_cgroup) * nr_pages; 55 table_size = sizeof(struct page_cgroup) * nr_pages;
56 56
57 base = __alloc_bootmem_node_nopanic(NODE_DATA(nid), 57 base = memblock_virt_alloc_try_nid_nopanic(
58 table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS)); 58 table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS),
59 BOOTMEM_ALLOC_ACCESSIBLE, nid);
59 if (!base) 60 if (!base)
60 return -ENOMEM; 61 return -ENOMEM;
61 NODE_DATA(nid)->node_page_cgroup = base; 62 NODE_DATA(nid)->node_page_cgroup = base;
@@ -451,7 +452,7 @@ unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id)
451 * lookup_swap_cgroup_id - lookup mem_cgroup id tied to swap entry 452 * lookup_swap_cgroup_id - lookup mem_cgroup id tied to swap entry
452 * @ent: swap entry to be looked up. 453 * @ent: swap entry to be looked up.
453 * 454 *
454 * Returns CSS ID of mem_cgroup at success. 0 at failure. (0 is invalid ID) 455 * Returns ID of mem_cgroup at success. 0 at failure. (0 is invalid ID)
455 */ 456 */
456unsigned short lookup_swap_cgroup_id(swp_entry_t ent) 457unsigned short lookup_swap_cgroup_id(swp_entry_t ent)
457{ 458{
diff --git a/mm/page_io.c b/mm/page_io.c
index 8c79a4764be0..7c59ef681381 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -31,13 +31,13 @@ static struct bio *get_swap_bio(gfp_t gfp_flags,
31 31
32 bio = bio_alloc(gfp_flags, 1); 32 bio = bio_alloc(gfp_flags, 1);
33 if (bio) { 33 if (bio) {
34 bio->bi_sector = map_swap_page(page, &bio->bi_bdev); 34 bio->bi_iter.bi_sector = map_swap_page(page, &bio->bi_bdev);
35 bio->bi_sector <<= PAGE_SHIFT - 9; 35 bio->bi_iter.bi_sector <<= PAGE_SHIFT - 9;
36 bio->bi_io_vec[0].bv_page = page; 36 bio->bi_io_vec[0].bv_page = page;
37 bio->bi_io_vec[0].bv_len = PAGE_SIZE; 37 bio->bi_io_vec[0].bv_len = PAGE_SIZE;
38 bio->bi_io_vec[0].bv_offset = 0; 38 bio->bi_io_vec[0].bv_offset = 0;
39 bio->bi_vcnt = 1; 39 bio->bi_vcnt = 1;
40 bio->bi_size = PAGE_SIZE; 40 bio->bi_iter.bi_size = PAGE_SIZE;
41 bio->bi_end_io = end_io; 41 bio->bi_end_io = end_io;
42 } 42 }
43 return bio; 43 return bio;
@@ -62,7 +62,7 @@ void end_swap_bio_write(struct bio *bio, int err)
62 printk(KERN_ALERT "Write-error on swap-device (%u:%u:%Lu)\n", 62 printk(KERN_ALERT "Write-error on swap-device (%u:%u:%Lu)\n",
63 imajor(bio->bi_bdev->bd_inode), 63 imajor(bio->bi_bdev->bd_inode),
64 iminor(bio->bi_bdev->bd_inode), 64 iminor(bio->bi_bdev->bd_inode),
65 (unsigned long long)bio->bi_sector); 65 (unsigned long long)bio->bi_iter.bi_sector);
66 ClearPageReclaim(page); 66 ClearPageReclaim(page);
67 } 67 }
68 end_page_writeback(page); 68 end_page_writeback(page);
@@ -80,7 +80,7 @@ void end_swap_bio_read(struct bio *bio, int err)
80 printk(KERN_ALERT "Read-error on swap-device (%u:%u:%Lu)\n", 80 printk(KERN_ALERT "Read-error on swap-device (%u:%u:%Lu)\n",
81 imajor(bio->bi_bdev->bd_inode), 81 imajor(bio->bi_bdev->bd_inode),
82 iminor(bio->bi_bdev->bd_inode), 82 iminor(bio->bi_bdev->bd_inode),
83 (unsigned long long)bio->bi_sector); 83 (unsigned long long)bio->bi_iter.bi_sector);
84 goto out; 84 goto out;
85 } 85 }
86 86
@@ -320,8 +320,8 @@ int swap_readpage(struct page *page)
320 int ret = 0; 320 int ret = 0;
321 struct swap_info_struct *sis = page_swap_info(page); 321 struct swap_info_struct *sis = page_swap_info(page);
322 322
323 VM_BUG_ON(!PageLocked(page)); 323 VM_BUG_ON_PAGE(!PageLocked(page), page);
324 VM_BUG_ON(PageUptodate(page)); 324 VM_BUG_ON_PAGE(PageUptodate(page), page);
325 if (frontswap_load(page) == 0) { 325 if (frontswap_load(page) == 0) {
326 SetPageUptodate(page); 326 SetPageUptodate(page);
327 unlock_page(page); 327 unlock_page(page);
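
The page_io.c hunks track the block layer's move of the per-bio position fields into bio->bi_iter. A cut-down user-space sketch of that layout change and of the page-to-sector shift used in get_swap_bio(); the struct and field names are simplified stand-ins, not the real block-layer definitions:

    /*
     * Sketch only: bvec_iter_sketch/bio_sketch are trimmed stand-ins for
     * the block-layer structures; PAGE_SHIFT is assumed to be 12 here.
     */
    #include <stdio.h>

    #define PAGE_SHIFT 12

    struct bvec_iter_sketch {
        unsigned long long bi_sector;   /* device sector the bio starts at */
        unsigned int bi_size;           /* bytes left to transfer */
    };

    struct bio_sketch {
        struct bvec_iter_sketch bi_iter; /* was: bi_sector/bi_size directly */
    };

    int main(void)
    {
        struct bio_sketch bio;
        unsigned long long swap_page_no = 42; /* page-sized slot on the device */

        /* same conversion as get_swap_bio(): pages -> 512-byte sectors */
        bio.bi_iter.bi_sector = swap_page_no << (PAGE_SHIFT - 9);
        bio.bi_iter.bi_size = 1U << PAGE_SHIFT;

        printf("sector %llu, %u bytes\n",
               bio.bi_iter.bi_sector, bio.bi_iter.bi_size);
        return 0;
    }
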
diff --git a/mm/percpu.c b/mm/percpu.c
index 0d10defe951e..036cfe07050f 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -1063,7 +1063,7 @@ struct pcpu_alloc_info * __init pcpu_alloc_alloc_info(int nr_groups,
1063 __alignof__(ai->groups[0].cpu_map[0])); 1063 __alignof__(ai->groups[0].cpu_map[0]));
1064 ai_size = base_size + nr_units * sizeof(ai->groups[0].cpu_map[0]); 1064 ai_size = base_size + nr_units * sizeof(ai->groups[0].cpu_map[0]);
1065 1065
1066 ptr = alloc_bootmem_nopanic(PFN_ALIGN(ai_size)); 1066 ptr = memblock_virt_alloc_nopanic(PFN_ALIGN(ai_size), 0);
1067 if (!ptr) 1067 if (!ptr)
1068 return NULL; 1068 return NULL;
1069 ai = ptr; 1069 ai = ptr;
@@ -1088,7 +1088,7 @@ struct pcpu_alloc_info * __init pcpu_alloc_alloc_info(int nr_groups,
1088 */ 1088 */
1089void __init pcpu_free_alloc_info(struct pcpu_alloc_info *ai) 1089void __init pcpu_free_alloc_info(struct pcpu_alloc_info *ai)
1090{ 1090{
1091 free_bootmem(__pa(ai), ai->__ai_size); 1091 memblock_free_early(__pa(ai), ai->__ai_size);
1092} 1092}
1093 1093
1094/** 1094/**
@@ -1246,10 +1246,12 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
1246 PCPU_SETUP_BUG_ON(pcpu_verify_alloc_info(ai) < 0); 1246 PCPU_SETUP_BUG_ON(pcpu_verify_alloc_info(ai) < 0);
1247 1247
1248 /* process group information and build config tables accordingly */ 1248 /* process group information and build config tables accordingly */
1249 group_offsets = alloc_bootmem(ai->nr_groups * sizeof(group_offsets[0])); 1249 group_offsets = memblock_virt_alloc(ai->nr_groups *
1250 group_sizes = alloc_bootmem(ai->nr_groups * sizeof(group_sizes[0])); 1250 sizeof(group_offsets[0]), 0);
1251 unit_map = alloc_bootmem(nr_cpu_ids * sizeof(unit_map[0])); 1251 group_sizes = memblock_virt_alloc(ai->nr_groups *
1252 unit_off = alloc_bootmem(nr_cpu_ids * sizeof(unit_off[0])); 1252 sizeof(group_sizes[0]), 0);
1253 unit_map = memblock_virt_alloc(nr_cpu_ids * sizeof(unit_map[0]), 0);
1254 unit_off = memblock_virt_alloc(nr_cpu_ids * sizeof(unit_off[0]), 0);
1253 1255
1254 for (cpu = 0; cpu < nr_cpu_ids; cpu++) 1256 for (cpu = 0; cpu < nr_cpu_ids; cpu++)
1255 unit_map[cpu] = UINT_MAX; 1257 unit_map[cpu] = UINT_MAX;
@@ -1311,7 +1313,8 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
1311 * empty chunks. 1313 * empty chunks.
1312 */ 1314 */
1313 pcpu_nr_slots = __pcpu_size_to_slot(pcpu_unit_size) + 2; 1315 pcpu_nr_slots = __pcpu_size_to_slot(pcpu_unit_size) + 2;
1314 pcpu_slot = alloc_bootmem(pcpu_nr_slots * sizeof(pcpu_slot[0])); 1316 pcpu_slot = memblock_virt_alloc(
1317 pcpu_nr_slots * sizeof(pcpu_slot[0]), 0);
1315 for (i = 0; i < pcpu_nr_slots; i++) 1318 for (i = 0; i < pcpu_nr_slots; i++)
1316 INIT_LIST_HEAD(&pcpu_slot[i]); 1319 INIT_LIST_HEAD(&pcpu_slot[i]);
1317 1320
@@ -1322,7 +1325,7 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
1322 * covers static area + reserved area (mostly used for module 1325 * covers static area + reserved area (mostly used for module
1323 * static percpu allocation). 1326 * static percpu allocation).
1324 */ 1327 */
1325 schunk = alloc_bootmem(pcpu_chunk_struct_size); 1328 schunk = memblock_virt_alloc(pcpu_chunk_struct_size, 0);
1326 INIT_LIST_HEAD(&schunk->list); 1329 INIT_LIST_HEAD(&schunk->list);
1327 schunk->base_addr = base_addr; 1330 schunk->base_addr = base_addr;
1328 schunk->map = smap; 1331 schunk->map = smap;
@@ -1346,7 +1349,7 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
1346 1349
1347 /* init dynamic chunk if necessary */ 1350 /* init dynamic chunk if necessary */
1348 if (dyn_size) { 1351 if (dyn_size) {
1349 dchunk = alloc_bootmem(pcpu_chunk_struct_size); 1352 dchunk = memblock_virt_alloc(pcpu_chunk_struct_size, 0);
1350 INIT_LIST_HEAD(&dchunk->list); 1353 INIT_LIST_HEAD(&dchunk->list);
1351 dchunk->base_addr = base_addr; 1354 dchunk->base_addr = base_addr;
1352 dchunk->map = dmap; 1355 dchunk->map = dmap;
@@ -1626,7 +1629,7 @@ int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size,
1626 size_sum = ai->static_size + ai->reserved_size + ai->dyn_size; 1629 size_sum = ai->static_size + ai->reserved_size + ai->dyn_size;
1627 areas_size = PFN_ALIGN(ai->nr_groups * sizeof(void *)); 1630 areas_size = PFN_ALIGN(ai->nr_groups * sizeof(void *));
1628 1631
1629 areas = alloc_bootmem_nopanic(areas_size); 1632 areas = memblock_virt_alloc_nopanic(areas_size, 0);
1630 if (!areas) { 1633 if (!areas) {
1631 rc = -ENOMEM; 1634 rc = -ENOMEM;
1632 goto out_free; 1635 goto out_free;
@@ -1686,10 +1689,10 @@ int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size,
1686 max_distance += ai->unit_size; 1689 max_distance += ai->unit_size;
1687 1690
1688 /* warn if maximum distance is further than 75% of vmalloc space */ 1691 /* warn if maximum distance is further than 75% of vmalloc space */
1689 if (max_distance > (VMALLOC_END - VMALLOC_START) * 3 / 4) { 1692 if (max_distance > VMALLOC_TOTAL * 3 / 4) {
1690 pr_warning("PERCPU: max_distance=0x%zx too large for vmalloc " 1693 pr_warning("PERCPU: max_distance=0x%zx too large for vmalloc "
1691 "space 0x%lx\n", max_distance, 1694 "space 0x%lx\n", max_distance,
1692 (unsigned long)(VMALLOC_END - VMALLOC_START)); 1695 VMALLOC_TOTAL);
1693#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK 1696#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK
1694 /* and fail if we have fallback */ 1697 /* and fail if we have fallback */
1695 rc = -EINVAL; 1698 rc = -EINVAL;
@@ -1712,7 +1715,7 @@ out_free_areas:
1712out_free: 1715out_free:
1713 pcpu_free_alloc_info(ai); 1716 pcpu_free_alloc_info(ai);
1714 if (areas) 1717 if (areas)
1715 free_bootmem(__pa(areas), areas_size); 1718 memblock_free_early(__pa(areas), areas_size);
1716 return rc; 1719 return rc;
1717} 1720}
1718#endif /* BUILD_EMBED_FIRST_CHUNK */ 1721#endif /* BUILD_EMBED_FIRST_CHUNK */
@@ -1760,7 +1763,7 @@ int __init pcpu_page_first_chunk(size_t reserved_size,
1760 /* unaligned allocations can't be freed, round up to page size */ 1763 /* unaligned allocations can't be freed, round up to page size */
1761 pages_size = PFN_ALIGN(unit_pages * num_possible_cpus() * 1764 pages_size = PFN_ALIGN(unit_pages * num_possible_cpus() *
1762 sizeof(pages[0])); 1765 sizeof(pages[0]));
1763 pages = alloc_bootmem(pages_size); 1766 pages = memblock_virt_alloc(pages_size, 0);
1764 1767
1765 /* allocate pages */ 1768 /* allocate pages */
1766 j = 0; 1769 j = 0;
@@ -1823,7 +1826,7 @@ enomem:
1823 free_fn(page_address(pages[j]), PAGE_SIZE); 1826 free_fn(page_address(pages[j]), PAGE_SIZE);
1824 rc = -ENOMEM; 1827 rc = -ENOMEM;
1825out_free_ar: 1828out_free_ar:
1826 free_bootmem(__pa(pages), pages_size); 1829 memblock_free_early(__pa(pages), pages_size);
1827 pcpu_free_alloc_info(ai); 1830 pcpu_free_alloc_info(ai);
1828 return rc; 1831 return rc;
1829} 1832}
@@ -1848,12 +1851,13 @@ EXPORT_SYMBOL(__per_cpu_offset);
1848static void * __init pcpu_dfl_fc_alloc(unsigned int cpu, size_t size, 1851static void * __init pcpu_dfl_fc_alloc(unsigned int cpu, size_t size,
1849 size_t align) 1852 size_t align)
1850{ 1853{
1851 return __alloc_bootmem_nopanic(size, align, __pa(MAX_DMA_ADDRESS)); 1854 return memblock_virt_alloc_from_nopanic(
1855 size, align, __pa(MAX_DMA_ADDRESS));
1852} 1856}
1853 1857
1854static void __init pcpu_dfl_fc_free(void *ptr, size_t size) 1858static void __init pcpu_dfl_fc_free(void *ptr, size_t size)
1855{ 1859{
1856 free_bootmem(__pa(ptr), size); 1860 memblock_free_early(__pa(ptr), size);
1857} 1861}
1858 1862
1859void __init setup_per_cpu_areas(void) 1863void __init setup_per_cpu_areas(void)
@@ -1896,7 +1900,9 @@ void __init setup_per_cpu_areas(void)
1896 void *fc; 1900 void *fc;
1897 1901
1898 ai = pcpu_alloc_alloc_info(1, 1); 1902 ai = pcpu_alloc_alloc_info(1, 1);
1899 fc = __alloc_bootmem(unit_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS)); 1903 fc = memblock_virt_alloc_from_nopanic(unit_size,
1904 PAGE_SIZE,
1905 __pa(MAX_DMA_ADDRESS));
1900 if (!ai || !fc) 1906 if (!ai || !fc)
1901 panic("Failed to allocate memory for percpu areas."); 1907 panic("Failed to allocate memory for percpu areas.");
1902 /* kmemleak tracks the percpu allocations separately */ 1908 /* kmemleak tracks the percpu allocations separately */
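
The percpu changes above are a mechanical switch from the bootmem wrappers to their memblock counterparts; as the diff shows, the _nopanic variants may return NULL and are checked, while the plain variants are treated as infallible. A rough user-space model of that convention (the allocator internals here are obviously stand-ins, not memblock):

    /*
     * Sketch only: early_alloc*() are invented helpers modelling the
     * "checked _nopanic vs. infallible plain" split seen in the hunks.
     */
    #include <stdio.h>
    #include <stdlib.h>

    static void *early_alloc_nopanic(size_t size)
    {
        return calloc(1, size);     /* may fail: caller must check */
    }

    static void *early_alloc(size_t size)
    {
        void *p = early_alloc_nopanic(size);

        if (!p) {
            fprintf(stderr, "early allocation of %zu bytes failed\n", size);
            exit(1);                /* the kernel variant panics instead */
        }
        return p;
    }

    int main(void)
    {
        void *ai = early_alloc_nopanic(128); /* optional: check for NULL */
        void *slots = early_alloc(4096);     /* assumed to never fail */

        free(ai);
        free(slots);
        return 0;
    }
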
diff --git a/mm/readahead.c b/mm/readahead.c
index 7cdbb44aa90b..0de2360d65f3 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -211,8 +211,6 @@ out:
211int force_page_cache_readahead(struct address_space *mapping, struct file *filp, 211int force_page_cache_readahead(struct address_space *mapping, struct file *filp,
212 pgoff_t offset, unsigned long nr_to_read) 212 pgoff_t offset, unsigned long nr_to_read)
213{ 213{
214 int ret = 0;
215
216 if (unlikely(!mapping->a_ops->readpage && !mapping->a_ops->readpages)) 214 if (unlikely(!mapping->a_ops->readpage && !mapping->a_ops->readpages))
217 return -EINVAL; 215 return -EINVAL;
218 216
@@ -226,15 +224,13 @@ int force_page_cache_readahead(struct address_space *mapping, struct file *filp,
226 this_chunk = nr_to_read; 224 this_chunk = nr_to_read;
227 err = __do_page_cache_readahead(mapping, filp, 225 err = __do_page_cache_readahead(mapping, filp,
228 offset, this_chunk, 0); 226 offset, this_chunk, 0);
229 if (err < 0) { 227 if (err < 0)
230 ret = err; 228 return err;
231 break; 229
232 }
233 ret += err;
234 offset += this_chunk; 230 offset += this_chunk;
235 nr_to_read -= this_chunk; 231 nr_to_read -= this_chunk;
236 } 232 }
237 return ret; 233 return 0;
238} 234}
239 235
240/* 236/*
@@ -576,8 +572,7 @@ do_readahead(struct address_space *mapping, struct file *filp,
576 if (!mapping || !mapping->a_ops) 572 if (!mapping || !mapping->a_ops)
577 return -EINVAL; 573 return -EINVAL;
578 574
579 force_page_cache_readahead(mapping, filp, index, nr); 575 return force_page_cache_readahead(mapping, filp, index, nr);
580 return 0;
581} 576}
582 577
583SYSCALL_DEFINE3(readahead, int, fd, loff_t, offset, size_t, count) 578SYSCALL_DEFINE3(readahead, int, fd, loff_t, offset, size_t, count)
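
After this change force_page_cache_readahead() reports the first chunk that fails instead of summing per-chunk results, and do_readahead() propagates that value to the readahead() syscall. The chunked loop reduces to the shape below; read_chunk() is a hypothetical stand-in for __do_page_cache_readahead(), and the chunk size is only an assumption for the example:

    /*
     * Sketch only: MAX_CHUNK and read_chunk() are illustrative, not the
     * kernel's actual chunking constant or readahead implementation.
     */
    #include <stdio.h>

    #define MAX_CHUNK 512           /* pages per submitted chunk */

    static int read_chunk(unsigned long offset, unsigned long nr)
    {
        printf("readahead %lu pages at %lu\n", nr, offset);
        return 0;                   /* pretend the I/O always succeeds */
    }

    static int force_readahead(unsigned long offset, unsigned long nr_to_read)
    {
        while (nr_to_read) {
            unsigned long this_chunk = MAX_CHUNK;
            int err;

            if (this_chunk > nr_to_read)
                this_chunk = nr_to_read;

            err = read_chunk(offset, this_chunk);
            if (err < 0)
                return err;         /* first failure wins */

            offset += this_chunk;
            nr_to_read -= this_chunk;
        }
        return 0;
    }

    int main(void)
    {
        return force_readahead(0, 1200) ? 1 : 0;
    }
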
diff --git a/mm/rmap.c b/mm/rmap.c
index 068522d8502a..d9d42316a99a 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -660,17 +660,22 @@ int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma)
660 return 1; 660 return 1;
661} 661}
662 662
663struct page_referenced_arg {
664 int mapcount;
665 int referenced;
666 unsigned long vm_flags;
667 struct mem_cgroup *memcg;
668};
663/* 669/*
664 * Subfunctions of page_referenced: page_referenced_one called 670 * arg: page_referenced_arg will be passed
665 * repeatedly from either page_referenced_anon or page_referenced_file.
666 */ 671 */
667int page_referenced_one(struct page *page, struct vm_area_struct *vma, 672int page_referenced_one(struct page *page, struct vm_area_struct *vma,
668 unsigned long address, unsigned int *mapcount, 673 unsigned long address, void *arg)
669 unsigned long *vm_flags)
670{ 674{
671 struct mm_struct *mm = vma->vm_mm; 675 struct mm_struct *mm = vma->vm_mm;
672 spinlock_t *ptl; 676 spinlock_t *ptl;
673 int referenced = 0; 677 int referenced = 0;
678 struct page_referenced_arg *pra = arg;
674 679
675 if (unlikely(PageTransHuge(page))) { 680 if (unlikely(PageTransHuge(page))) {
676 pmd_t *pmd; 681 pmd_t *pmd;
@@ -682,13 +687,12 @@ int page_referenced_one(struct page *page, struct vm_area_struct *vma,
682 pmd = page_check_address_pmd(page, mm, address, 687 pmd = page_check_address_pmd(page, mm, address,
683 PAGE_CHECK_ADDRESS_PMD_FLAG, &ptl); 688 PAGE_CHECK_ADDRESS_PMD_FLAG, &ptl);
684 if (!pmd) 689 if (!pmd)
685 goto out; 690 return SWAP_AGAIN;
686 691
687 if (vma->vm_flags & VM_LOCKED) { 692 if (vma->vm_flags & VM_LOCKED) {
688 spin_unlock(ptl); 693 spin_unlock(ptl);
689 *mapcount = 0; /* break early from loop */ 694 pra->vm_flags |= VM_LOCKED;
690 *vm_flags |= VM_LOCKED; 695 return SWAP_FAIL; /* To break the loop */
691 goto out;
692 } 696 }
693 697
694 /* go ahead even if the pmd is pmd_trans_splitting() */ 698 /* go ahead even if the pmd is pmd_trans_splitting() */
@@ -704,13 +708,12 @@ int page_referenced_one(struct page *page, struct vm_area_struct *vma,
704 */ 708 */
705 pte = page_check_address(page, mm, address, &ptl, 0); 709 pte = page_check_address(page, mm, address, &ptl, 0);
706 if (!pte) 710 if (!pte)
707 goto out; 711 return SWAP_AGAIN;
708 712
709 if (vma->vm_flags & VM_LOCKED) { 713 if (vma->vm_flags & VM_LOCKED) {
710 pte_unmap_unlock(pte, ptl); 714 pte_unmap_unlock(pte, ptl);
711 *mapcount = 0; /* break early from loop */ 715 pra->vm_flags |= VM_LOCKED;
712 *vm_flags |= VM_LOCKED; 716 return SWAP_FAIL; /* To break the loop */
713 goto out;
714 } 717 }
715 718
716 if (ptep_clear_flush_young_notify(vma, address, pte)) { 719 if (ptep_clear_flush_young_notify(vma, address, pte)) {
@@ -727,113 +730,27 @@ int page_referenced_one(struct page *page, struct vm_area_struct *vma,
727 pte_unmap_unlock(pte, ptl); 730 pte_unmap_unlock(pte, ptl);
728 } 731 }
729 732
730 (*mapcount)--; 733 if (referenced) {
731 734 pra->referenced++;
732 if (referenced) 735 pra->vm_flags |= vma->vm_flags;
733 *vm_flags |= vma->vm_flags;
734out:
735 return referenced;
736}
737
738static int page_referenced_anon(struct page *page,
739 struct mem_cgroup *memcg,
740 unsigned long *vm_flags)
741{
742 unsigned int mapcount;
743 struct anon_vma *anon_vma;
744 pgoff_t pgoff;
745 struct anon_vma_chain *avc;
746 int referenced = 0;
747
748 anon_vma = page_lock_anon_vma_read(page);
749 if (!anon_vma)
750 return referenced;
751
752 mapcount = page_mapcount(page);
753 pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
754 anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) {
755 struct vm_area_struct *vma = avc->vma;
756 unsigned long address = vma_address(page, vma);
757 /*
758 * If we are reclaiming on behalf of a cgroup, skip
759 * counting on behalf of references from different
760 * cgroups
761 */
762 if (memcg && !mm_match_cgroup(vma->vm_mm, memcg))
763 continue;
764 referenced += page_referenced_one(page, vma, address,
765 &mapcount, vm_flags);
766 if (!mapcount)
767 break;
768 } 736 }
769 737
770 page_unlock_anon_vma_read(anon_vma); 738 pra->mapcount--;
771 return referenced; 739 if (!pra->mapcount)
740 return SWAP_SUCCESS; /* To break the loop */
741
742 return SWAP_AGAIN;
772} 743}
773 744
774/** 745static bool invalid_page_referenced_vma(struct vm_area_struct *vma, void *arg)
775 * page_referenced_file - referenced check for object-based rmap
776 * @page: the page we're checking references on.
777 * @memcg: target memory control group
778 * @vm_flags: collect encountered vma->vm_flags who actually referenced the page
779 *
780 * For an object-based mapped page, find all the places it is mapped and
781 * check/clear the referenced flag. This is done by following the page->mapping
782 * pointer, then walking the chain of vmas it holds. It returns the number
783 * of references it found.
784 *
785 * This function is only called from page_referenced for object-based pages.
786 */
787static int page_referenced_file(struct page *page,
788 struct mem_cgroup *memcg,
789 unsigned long *vm_flags)
790{ 746{
791 unsigned int mapcount; 747 struct page_referenced_arg *pra = arg;
792 struct address_space *mapping = page->mapping; 748 struct mem_cgroup *memcg = pra->memcg;
793 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
794 struct vm_area_struct *vma;
795 int referenced = 0;
796 749
797 /* 750 if (!mm_match_cgroup(vma->vm_mm, memcg))
798 * The caller's checks on page->mapping and !PageAnon have made 751 return true;
799 * sure that this is a file page: the check for page->mapping
800 * excludes the case just before it gets set on an anon page.
801 */
802 BUG_ON(PageAnon(page));
803
804 /*
805 * The page lock not only makes sure that page->mapping cannot
806 * suddenly be NULLified by truncation, it makes sure that the
807 * structure at mapping cannot be freed and reused yet,
808 * so we can safely take mapping->i_mmap_mutex.
809 */
810 BUG_ON(!PageLocked(page));
811
812 mutex_lock(&mapping->i_mmap_mutex);
813
814 /*
815 * i_mmap_mutex does not stabilize mapcount at all, but mapcount
816 * is more likely to be accurate if we note it after spinning.
817 */
818 mapcount = page_mapcount(page);
819
820 vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
821 unsigned long address = vma_address(page, vma);
822 /*
823 * If we are reclaiming on behalf of a cgroup, skip
824 * counting on behalf of references from different
825 * cgroups
826 */
827 if (memcg && !mm_match_cgroup(vma->vm_mm, memcg))
828 continue;
829 referenced += page_referenced_one(page, vma, address,
830 &mapcount, vm_flags);
831 if (!mapcount)
832 break;
833 }
834 752
835 mutex_unlock(&mapping->i_mmap_mutex); 753 return false;
836 return referenced;
837} 754}
838 755
839/** 756/**
@@ -851,41 +768,57 @@ int page_referenced(struct page *page,
851 struct mem_cgroup *memcg, 768 struct mem_cgroup *memcg,
852 unsigned long *vm_flags) 769 unsigned long *vm_flags)
853{ 770{
854 int referenced = 0; 771 int ret;
855 int we_locked = 0; 772 int we_locked = 0;
773 struct page_referenced_arg pra = {
774 .mapcount = page_mapcount(page),
775 .memcg = memcg,
776 };
777 struct rmap_walk_control rwc = {
778 .rmap_one = page_referenced_one,
779 .arg = (void *)&pra,
780 .anon_lock = page_lock_anon_vma_read,
781 };
856 782
857 *vm_flags = 0; 783 *vm_flags = 0;
858 if (page_mapped(page) && page_rmapping(page)) { 784 if (!page_mapped(page))
859 if (!is_locked && (!PageAnon(page) || PageKsm(page))) { 785 return 0;
860 we_locked = trylock_page(page); 786
861 if (!we_locked) { 787 if (!page_rmapping(page))
862 referenced++; 788 return 0;
863 goto out; 789
864 } 790 if (!is_locked && (!PageAnon(page) || PageKsm(page))) {
865 } 791 we_locked = trylock_page(page);
866 if (unlikely(PageKsm(page))) 792 if (!we_locked)
867 referenced += page_referenced_ksm(page, memcg, 793 return 1;
868 vm_flags);
869 else if (PageAnon(page))
870 referenced += page_referenced_anon(page, memcg,
871 vm_flags);
872 else if (page->mapping)
873 referenced += page_referenced_file(page, memcg,
874 vm_flags);
875 if (we_locked)
876 unlock_page(page);
877 } 794 }
878out: 795
879 return referenced; 796 /*
797 * If we are reclaiming on behalf of a cgroup, skip
798 * counting on behalf of references from different
799 * cgroups
800 */
801 if (memcg) {
802 rwc.invalid_vma = invalid_page_referenced_vma;
803 }
804
805 ret = rmap_walk(page, &rwc);
806 *vm_flags = pra.vm_flags;
807
808 if (we_locked)
809 unlock_page(page);
810
811 return pra.referenced;
880} 812}
881 813
882static int page_mkclean_one(struct page *page, struct vm_area_struct *vma, 814static int page_mkclean_one(struct page *page, struct vm_area_struct *vma,
883 unsigned long address) 815 unsigned long address, void *arg)
884{ 816{
885 struct mm_struct *mm = vma->vm_mm; 817 struct mm_struct *mm = vma->vm_mm;
886 pte_t *pte; 818 pte_t *pte;
887 spinlock_t *ptl; 819 spinlock_t *ptl;
888 int ret = 0; 820 int ret = 0;
821 int *cleaned = arg;
889 822
890 pte = page_check_address(page, mm, address, &ptl, 1); 823 pte = page_check_address(page, mm, address, &ptl, 1);
891 if (!pte) 824 if (!pte)
@@ -904,44 +837,44 @@ static int page_mkclean_one(struct page *page, struct vm_area_struct *vma,
904 837
905 pte_unmap_unlock(pte, ptl); 838 pte_unmap_unlock(pte, ptl);
906 839
907 if (ret) 840 if (ret) {
908 mmu_notifier_invalidate_page(mm, address); 841 mmu_notifier_invalidate_page(mm, address);
842 (*cleaned)++;
843 }
909out: 844out:
910 return ret; 845 return SWAP_AGAIN;
911} 846}
912 847
913static int page_mkclean_file(struct address_space *mapping, struct page *page) 848static bool invalid_mkclean_vma(struct vm_area_struct *vma, void *arg)
914{ 849{
915 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); 850 if (vma->vm_flags & VM_SHARED)
916 struct vm_area_struct *vma; 851 return false;
917 int ret = 0;
918
919 BUG_ON(PageAnon(page));
920 852
921 mutex_lock(&mapping->i_mmap_mutex); 853 return true;
922 vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
923 if (vma->vm_flags & VM_SHARED) {
924 unsigned long address = vma_address(page, vma);
925 ret += page_mkclean_one(page, vma, address);
926 }
927 }
928 mutex_unlock(&mapping->i_mmap_mutex);
929 return ret;
930} 854}
931 855
932int page_mkclean(struct page *page) 856int page_mkclean(struct page *page)
933{ 857{
934 int ret = 0; 858 int cleaned = 0;
859 struct address_space *mapping;
860 struct rmap_walk_control rwc = {
861 .arg = (void *)&cleaned,
862 .rmap_one = page_mkclean_one,
863 .invalid_vma = invalid_mkclean_vma,
864 };
935 865
936 BUG_ON(!PageLocked(page)); 866 BUG_ON(!PageLocked(page));
937 867
938 if (page_mapped(page)) { 868 if (!page_mapped(page))
939 struct address_space *mapping = page_mapping(page); 869 return 0;
940 if (mapping)
941 ret = page_mkclean_file(mapping, page);
942 }
943 870
944 return ret; 871 mapping = page_mapping(page);
872 if (!mapping)
873 return 0;
874
875 rmap_walk(page, &rwc);
876
877 return cleaned;
945} 878}
946EXPORT_SYMBOL_GPL(page_mkclean); 879EXPORT_SYMBOL_GPL(page_mkclean);
947 880
@@ -961,9 +894,9 @@ void page_move_anon_rmap(struct page *page,
961{ 894{
962 struct anon_vma *anon_vma = vma->anon_vma; 895 struct anon_vma *anon_vma = vma->anon_vma;
963 896
964 VM_BUG_ON(!PageLocked(page)); 897 VM_BUG_ON_PAGE(!PageLocked(page), page);
965 VM_BUG_ON(!anon_vma); 898 VM_BUG_ON(!anon_vma);
966 VM_BUG_ON(page->index != linear_page_index(vma, address)); 899 VM_BUG_ON_PAGE(page->index != linear_page_index(vma, address), page);
967 900
968 anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON; 901 anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
969 page->mapping = (struct address_space *) anon_vma; 902 page->mapping = (struct address_space *) anon_vma;
@@ -1062,7 +995,7 @@ void do_page_add_anon_rmap(struct page *page,
1062 if (unlikely(PageKsm(page))) 995 if (unlikely(PageKsm(page)))
1063 return; 996 return;
1064 997
1065 VM_BUG_ON(!PageLocked(page)); 998 VM_BUG_ON_PAGE(!PageLocked(page), page);
1066 /* address might be in next vma when migration races vma_adjust */ 999 /* address might be in next vma when migration races vma_adjust */
1067 if (first) 1000 if (first)
1068 __page_set_anon_rmap(page, vma, address, exclusive); 1001 __page_set_anon_rmap(page, vma, address, exclusive);
@@ -1177,17 +1110,17 @@ out:
1177} 1110}
1178 1111
1179/* 1112/*
1180 * Subfunctions of try_to_unmap: try_to_unmap_one called 1113 * @arg: enum ttu_flags will be passed to this argument
1181 * repeatedly from try_to_unmap_ksm, try_to_unmap_anon or try_to_unmap_file.
1182 */ 1114 */
1183int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, 1115int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
1184 unsigned long address, enum ttu_flags flags) 1116 unsigned long address, void *arg)
1185{ 1117{
1186 struct mm_struct *mm = vma->vm_mm; 1118 struct mm_struct *mm = vma->vm_mm;
1187 pte_t *pte; 1119 pte_t *pte;
1188 pte_t pteval; 1120 pte_t pteval;
1189 spinlock_t *ptl; 1121 spinlock_t *ptl;
1190 int ret = SWAP_AGAIN; 1122 int ret = SWAP_AGAIN;
1123 enum ttu_flags flags = (enum ttu_flags)arg;
1191 1124
1192 pte = page_check_address(page, mm, address, &ptl, 0); 1125 pte = page_check_address(page, mm, address, &ptl, 0);
1193 if (!pte) 1126 if (!pte)
@@ -1426,124 +1359,18 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount,
1426 return ret; 1359 return ret;
1427} 1360}
1428 1361
1429bool is_vma_temporary_stack(struct vm_area_struct *vma) 1362static int try_to_unmap_nonlinear(struct page *page,
1430{ 1363 struct address_space *mapping, struct vm_area_struct *vma)
1431 int maybe_stack = vma->vm_flags & (VM_GROWSDOWN | VM_GROWSUP);
1432
1433 if (!maybe_stack)
1434 return false;
1435
1436 if ((vma->vm_flags & VM_STACK_INCOMPLETE_SETUP) ==
1437 VM_STACK_INCOMPLETE_SETUP)
1438 return true;
1439
1440 return false;
1441}
1442
1443/**
1444 * try_to_unmap_anon - unmap or unlock anonymous page using the object-based
1445 * rmap method
1446 * @page: the page to unmap/unlock
1447 * @flags: action and flags
1448 *
1449 * Find all the mappings of a page using the mapping pointer and the vma chains
1450 * contained in the anon_vma struct it points to.
1451 *
1452 * This function is only called from try_to_unmap/try_to_munlock for
1453 * anonymous pages.
1454 * When called from try_to_munlock(), the mmap_sem of the mm containing the vma
1455 * where the page was found will be held for write. So, we won't recheck
1456 * vm_flags for that VMA. That should be OK, because that vma shouldn't be
1457 * 'LOCKED.
1458 */
1459static int try_to_unmap_anon(struct page *page, enum ttu_flags flags)
1460{ 1364{
1461 struct anon_vma *anon_vma;
1462 pgoff_t pgoff;
1463 struct anon_vma_chain *avc;
1464 int ret = SWAP_AGAIN;
1465
1466 anon_vma = page_lock_anon_vma_read(page);
1467 if (!anon_vma)
1468 return ret;
1469
1470 pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
1471 anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) {
1472 struct vm_area_struct *vma = avc->vma;
1473 unsigned long address;
1474
1475 /*
1476 * During exec, a temporary VMA is setup and later moved.
1477 * The VMA is moved under the anon_vma lock but not the
1478 * page tables leading to a race where migration cannot
1479 * find the migration ptes. Rather than increasing the
1480 * locking requirements of exec(), migration skips
1481 * temporary VMAs until after exec() completes.
1482 */
1483 if (IS_ENABLED(CONFIG_MIGRATION) && (flags & TTU_MIGRATION) &&
1484 is_vma_temporary_stack(vma))
1485 continue;
1486
1487 address = vma_address(page, vma);
1488 ret = try_to_unmap_one(page, vma, address, flags);
1489 if (ret != SWAP_AGAIN || !page_mapped(page))
1490 break;
1491 }
1492
1493 page_unlock_anon_vma_read(anon_vma);
1494 return ret;
1495}
1496
1497/**
1498 * try_to_unmap_file - unmap/unlock file page using the object-based rmap method
1499 * @page: the page to unmap/unlock
1500 * @flags: action and flags
1501 *
1502 * Find all the mappings of a page using the mapping pointer and the vma chains
1503 * contained in the address_space struct it points to.
1504 *
1505 * This function is only called from try_to_unmap/try_to_munlock for
1506 * object-based pages.
1507 * When called from try_to_munlock(), the mmap_sem of the mm containing the vma
1508 * where the page was found will be held for write. So, we won't recheck
1509 * vm_flags for that VMA. That should be OK, because that vma shouldn't be
1510 * 'LOCKED.
1511 */
1512static int try_to_unmap_file(struct page *page, enum ttu_flags flags)
1513{
1514 struct address_space *mapping = page->mapping;
1515 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
1516 struct vm_area_struct *vma;
1517 int ret = SWAP_AGAIN; 1365 int ret = SWAP_AGAIN;
1518 unsigned long cursor; 1366 unsigned long cursor;
1519 unsigned long max_nl_cursor = 0; 1367 unsigned long max_nl_cursor = 0;
1520 unsigned long max_nl_size = 0; 1368 unsigned long max_nl_size = 0;
1521 unsigned int mapcount; 1369 unsigned int mapcount;
1522 1370
1523 if (PageHuge(page)) 1371 list_for_each_entry(vma,
1524 pgoff = page->index << compound_order(page); 1372 &mapping->i_mmap_nonlinear, shared.nonlinear) {
1525 1373
1526 mutex_lock(&mapping->i_mmap_mutex);
1527 vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
1528 unsigned long address = vma_address(page, vma);
1529 ret = try_to_unmap_one(page, vma, address, flags);
1530 if (ret != SWAP_AGAIN || !page_mapped(page))
1531 goto out;
1532 }
1533
1534 if (list_empty(&mapping->i_mmap_nonlinear))
1535 goto out;
1536
1537 /*
1538 * We don't bother to try to find the munlocked page in nonlinears.
1539 * It's costly. Instead, later, page reclaim logic may call
1540 * try_to_unmap(TTU_MUNLOCK) and recover PG_mlocked lazily.
1541 */
1542 if (TTU_ACTION(flags) == TTU_MUNLOCK)
1543 goto out;
1544
1545 list_for_each_entry(vma, &mapping->i_mmap_nonlinear,
1546 shared.nonlinear) {
1547 cursor = (unsigned long) vma->vm_private_data; 1374 cursor = (unsigned long) vma->vm_private_data;
1548 if (cursor > max_nl_cursor) 1375 if (cursor > max_nl_cursor)
1549 max_nl_cursor = cursor; 1376 max_nl_cursor = cursor;
@@ -1553,8 +1380,7 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags)
1553 } 1380 }
1554 1381
1555 if (max_nl_size == 0) { /* all nonlinears locked or reserved ? */ 1382 if (max_nl_size == 0) { /* all nonlinears locked or reserved ? */
1556 ret = SWAP_FAIL; 1383 return SWAP_FAIL;
1557 goto out;
1558 } 1384 }
1559 1385
1560 /* 1386 /*
@@ -1566,7 +1392,8 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags)
1566 */ 1392 */
1567 mapcount = page_mapcount(page); 1393 mapcount = page_mapcount(page);
1568 if (!mapcount) 1394 if (!mapcount)
1569 goto out; 1395 return ret;
1396
1570 cond_resched(); 1397 cond_resched();
1571 1398
1572 max_nl_size = (max_nl_size + CLUSTER_SIZE - 1) & CLUSTER_MASK; 1399 max_nl_size = (max_nl_size + CLUSTER_SIZE - 1) & CLUSTER_MASK;
@@ -1574,10 +1401,11 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags)
1574 max_nl_cursor = CLUSTER_SIZE; 1401 max_nl_cursor = CLUSTER_SIZE;
1575 1402
1576 do { 1403 do {
1577 list_for_each_entry(vma, &mapping->i_mmap_nonlinear, 1404 list_for_each_entry(vma,
1578 shared.nonlinear) { 1405 &mapping->i_mmap_nonlinear, shared.nonlinear) {
1406
1579 cursor = (unsigned long) vma->vm_private_data; 1407 cursor = (unsigned long) vma->vm_private_data;
1580 while ( cursor < max_nl_cursor && 1408 while (cursor < max_nl_cursor &&
1581 cursor < vma->vm_end - vma->vm_start) { 1409 cursor < vma->vm_end - vma->vm_start) {
1582 if (try_to_unmap_cluster(cursor, &mapcount, 1410 if (try_to_unmap_cluster(cursor, &mapcount,
1583 vma, page) == SWAP_MLOCK) 1411 vma, page) == SWAP_MLOCK)
@@ -1585,7 +1413,7 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags)
1585 cursor += CLUSTER_SIZE; 1413 cursor += CLUSTER_SIZE;
1586 vma->vm_private_data = (void *) cursor; 1414 vma->vm_private_data = (void *) cursor;
1587 if ((int)mapcount <= 0) 1415 if ((int)mapcount <= 0)
1588 goto out; 1416 return ret;
1589 } 1417 }
1590 vma->vm_private_data = (void *) max_nl_cursor; 1418 vma->vm_private_data = (void *) max_nl_cursor;
1591 } 1419 }
@@ -1600,11 +1428,34 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags)
1600 */ 1428 */
1601 list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.nonlinear) 1429 list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.nonlinear)
1602 vma->vm_private_data = NULL; 1430 vma->vm_private_data = NULL;
1603out: 1431
1604 mutex_unlock(&mapping->i_mmap_mutex);
1605 return ret; 1432 return ret;
1606} 1433}
1607 1434
1435bool is_vma_temporary_stack(struct vm_area_struct *vma)
1436{
1437 int maybe_stack = vma->vm_flags & (VM_GROWSDOWN | VM_GROWSUP);
1438
1439 if (!maybe_stack)
1440 return false;
1441
1442 if ((vma->vm_flags & VM_STACK_INCOMPLETE_SETUP) ==
1443 VM_STACK_INCOMPLETE_SETUP)
1444 return true;
1445
1446 return false;
1447}
1448
1449static bool invalid_migration_vma(struct vm_area_struct *vma, void *arg)
1450{
1451 return is_vma_temporary_stack(vma);
1452}
1453
1454static int page_not_mapped(struct page *page)
1455{
1456 return !page_mapped(page);
1457};
1458
1608/** 1459/**
1609 * try_to_unmap - try to remove all page table mappings to a page 1460 * try_to_unmap - try to remove all page table mappings to a page
1610 * @page: the page to get unmapped 1461 * @page: the page to get unmapped
@@ -1622,16 +1473,29 @@ out:
1622int try_to_unmap(struct page *page, enum ttu_flags flags) 1473int try_to_unmap(struct page *page, enum ttu_flags flags)
1623{ 1474{
1624 int ret; 1475 int ret;
1476 struct rmap_walk_control rwc = {
1477 .rmap_one = try_to_unmap_one,
1478 .arg = (void *)flags,
1479 .done = page_not_mapped,
1480 .file_nonlinear = try_to_unmap_nonlinear,
1481 .anon_lock = page_lock_anon_vma_read,
1482 };
1625 1483
1626 BUG_ON(!PageLocked(page)); 1484 VM_BUG_ON_PAGE(!PageHuge(page) && PageTransHuge(page), page);
1627 VM_BUG_ON(!PageHuge(page) && PageTransHuge(page)); 1485
1486 /*
1487 * During exec, a temporary VMA is setup and later moved.
1488 * The VMA is moved under the anon_vma lock but not the
1489 * page tables leading to a race where migration cannot
1490 * find the migration ptes. Rather than increasing the
1491 * locking requirements of exec(), migration skips
1492 * temporary VMAs until after exec() completes.
1493 */
1494 if (flags & TTU_MIGRATION && !PageKsm(page) && PageAnon(page))
1495 rwc.invalid_vma = invalid_migration_vma;
1496
1497 ret = rmap_walk(page, &rwc);
1628 1498
1629 if (unlikely(PageKsm(page)))
1630 ret = try_to_unmap_ksm(page, flags);
1631 else if (PageAnon(page))
1632 ret = try_to_unmap_anon(page, flags);
1633 else
1634 ret = try_to_unmap_file(page, flags);
1635 if (ret != SWAP_MLOCK && !page_mapped(page)) 1499 if (ret != SWAP_MLOCK && !page_mapped(page))
1636 ret = SWAP_SUCCESS; 1500 ret = SWAP_SUCCESS;
1637 return ret; 1501 return ret;
@@ -1654,14 +1518,25 @@ int try_to_unmap(struct page *page, enum ttu_flags flags)
1654 */ 1518 */
1655int try_to_munlock(struct page *page) 1519int try_to_munlock(struct page *page)
1656{ 1520{
1657 VM_BUG_ON(!PageLocked(page) || PageLRU(page)); 1521 int ret;
1522 struct rmap_walk_control rwc = {
1523 .rmap_one = try_to_unmap_one,
1524 .arg = (void *)TTU_MUNLOCK,
1525 .done = page_not_mapped,
1526 /*
1527 * We don't bother to try to find the munlocked page in
1528 * nonlinears. It's costly. Instead, later, page reclaim logic
1529 * may call try_to_unmap() and recover PG_mlocked lazily.
1530 */
1531 .file_nonlinear = NULL,
1532 .anon_lock = page_lock_anon_vma_read,
1658 1533
1659 if (unlikely(PageKsm(page))) 1534 };
1660 return try_to_unmap_ksm(page, TTU_MUNLOCK); 1535
1661 else if (PageAnon(page)) 1536 VM_BUG_ON_PAGE(!PageLocked(page) || PageLRU(page), page);
1662 return try_to_unmap_anon(page, TTU_MUNLOCK); 1537
1663 else 1538 ret = rmap_walk(page, &rwc);
1664 return try_to_unmap_file(page, TTU_MUNLOCK); 1539 return ret;
1665} 1540}
1666 1541
1667void __put_anon_vma(struct anon_vma *anon_vma) 1542void __put_anon_vma(struct anon_vma *anon_vma)
@@ -1674,18 +1549,13 @@ void __put_anon_vma(struct anon_vma *anon_vma)
1674 anon_vma_free(anon_vma); 1549 anon_vma_free(anon_vma);
1675} 1550}
1676 1551
1677#ifdef CONFIG_MIGRATION 1552static struct anon_vma *rmap_walk_anon_lock(struct page *page,
1678/* 1553 struct rmap_walk_control *rwc)
1679 * rmap_walk() and its helpers rmap_walk_anon() and rmap_walk_file():
1680 * Called by migrate.c to remove migration ptes, but might be used more later.
1681 */
1682static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *,
1683 struct vm_area_struct *, unsigned long, void *), void *arg)
1684{ 1554{
1685 struct anon_vma *anon_vma; 1555 struct anon_vma *anon_vma;
1686 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); 1556
1687 struct anon_vma_chain *avc; 1557 if (rwc->anon_lock)
1688 int ret = SWAP_AGAIN; 1558 return rwc->anon_lock(page);
1689 1559
1690 /* 1560 /*
1691 * Note: remove_migration_ptes() cannot use page_lock_anon_vma_read() 1561 * Note: remove_migration_ptes() cannot use page_lock_anon_vma_read()
@@ -1695,58 +1565,120 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *,
1695 */ 1565 */
1696 anon_vma = page_anon_vma(page); 1566 anon_vma = page_anon_vma(page);
1697 if (!anon_vma) 1567 if (!anon_vma)
1698 return ret; 1568 return NULL;
1569
1699 anon_vma_lock_read(anon_vma); 1570 anon_vma_lock_read(anon_vma);
1571 return anon_vma;
1572}
1573
1574/*
1575 * rmap_walk_anon - do something to anonymous page using the object-based
1576 * rmap method
1577 * @page: the page to be handled
1578 * @rwc: control variable according to each walk type
1579 *
1580 * Find all the mappings of a page using the mapping pointer and the vma chains
1581 * contained in the anon_vma struct it points to.
1582 *
1583 * When called from try_to_munlock(), the mmap_sem of the mm containing the vma
1584 * where the page was found will be held for write. So, we won't recheck
1585 * vm_flags for that VMA. That should be OK, because that vma shouldn't be
1586 * LOCKED.
1587 */
1588static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc)
1589{
1590 struct anon_vma *anon_vma;
1591 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
1592 struct anon_vma_chain *avc;
1593 int ret = SWAP_AGAIN;
1594
1595 anon_vma = rmap_walk_anon_lock(page, rwc);
1596 if (!anon_vma)
1597 return ret;
1598
1700 anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) { 1599 anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) {
1701 struct vm_area_struct *vma = avc->vma; 1600 struct vm_area_struct *vma = avc->vma;
1702 unsigned long address = vma_address(page, vma); 1601 unsigned long address = vma_address(page, vma);
1703 ret = rmap_one(page, vma, address, arg); 1602
1603 if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg))
1604 continue;
1605
1606 ret = rwc->rmap_one(page, vma, address, rwc->arg);
1704 if (ret != SWAP_AGAIN) 1607 if (ret != SWAP_AGAIN)
1705 break; 1608 break;
1609 if (rwc->done && rwc->done(page))
1610 break;
1706 } 1611 }
1707 anon_vma_unlock_read(anon_vma); 1612 anon_vma_unlock_read(anon_vma);
1708 return ret; 1613 return ret;
1709} 1614}
1710 1615
1711static int rmap_walk_file(struct page *page, int (*rmap_one)(struct page *, 1616/*
1712 struct vm_area_struct *, unsigned long, void *), void *arg) 1617 * rmap_walk_file - do something to file page using the object-based rmap method
1618 * @page: the page to be handled
1619 * @rwc: control variable according to each walk type
1620 *
1621 * Find all the mappings of a page using the mapping pointer and the vma chains
1622 * contained in the address_space struct it points to.
1623 *
1624 * When called from try_to_munlock(), the mmap_sem of the mm containing the vma
1625 * where the page was found will be held for write. So, we won't recheck
1626 * vm_flags for that VMA. That should be OK, because that vma shouldn't be
1627 * LOCKED.
1628 */
1629static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc)
1713{ 1630{
1714 struct address_space *mapping = page->mapping; 1631 struct address_space *mapping = page->mapping;
1715 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); 1632 pgoff_t pgoff = page->index << compound_order(page);
1716 struct vm_area_struct *vma; 1633 struct vm_area_struct *vma;
1717 int ret = SWAP_AGAIN; 1634 int ret = SWAP_AGAIN;
1718 1635
1636 /*
1637 * The page lock not only makes sure that page->mapping cannot
1638 * suddenly be NULLified by truncation, it makes sure that the
1639 * structure at mapping cannot be freed and reused yet,
1640 * so we can safely take mapping->i_mmap_mutex.
1641 */
1642 VM_BUG_ON(!PageLocked(page));
1643
1719 if (!mapping) 1644 if (!mapping)
1720 return ret; 1645 return ret;
1721 mutex_lock(&mapping->i_mmap_mutex); 1646 mutex_lock(&mapping->i_mmap_mutex);
1722 vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { 1647 vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
1723 unsigned long address = vma_address(page, vma); 1648 unsigned long address = vma_address(page, vma);
1724 ret = rmap_one(page, vma, address, arg); 1649
1650 if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg))
1651 continue;
1652
1653 ret = rwc->rmap_one(page, vma, address, rwc->arg);
1725 if (ret != SWAP_AGAIN) 1654 if (ret != SWAP_AGAIN)
1726 break; 1655 goto done;
1656 if (rwc->done && rwc->done(page))
1657 goto done;
1727 } 1658 }
1728 /* 1659
1729 * No nonlinear handling: being always shared, nonlinear vmas 1660 if (!rwc->file_nonlinear)
1730 * never contain migration ptes. Decide what to do about this 1661 goto done;
1731 * limitation to linear when we need rmap_walk() on nonlinear. 1662
1732 */ 1663 if (list_empty(&mapping->i_mmap_nonlinear))
1664 goto done;
1665
1666 ret = rwc->file_nonlinear(page, mapping, vma);
1667
1668done:
1733 mutex_unlock(&mapping->i_mmap_mutex); 1669 mutex_unlock(&mapping->i_mmap_mutex);
1734 return ret; 1670 return ret;
1735} 1671}
1736 1672
1737int rmap_walk(struct page *page, int (*rmap_one)(struct page *, 1673int rmap_walk(struct page *page, struct rmap_walk_control *rwc)
1738 struct vm_area_struct *, unsigned long, void *), void *arg)
1739{ 1674{
1740 VM_BUG_ON(!PageLocked(page));
1741
1742 if (unlikely(PageKsm(page))) 1675 if (unlikely(PageKsm(page)))
1743 return rmap_walk_ksm(page, rmap_one, arg); 1676 return rmap_walk_ksm(page, rwc);
1744 else if (PageAnon(page)) 1677 else if (PageAnon(page))
1745 return rmap_walk_anon(page, rmap_one, arg); 1678 return rmap_walk_anon(page, rwc);
1746 else 1679 else
1747 return rmap_walk_file(page, rmap_one, arg); 1680 return rmap_walk_file(page, rwc);
1748} 1681}
1749#endif /* CONFIG_MIGRATION */
1750 1682
1751#ifdef CONFIG_HUGETLB_PAGE 1683#ifdef CONFIG_HUGETLB_PAGE
1752/* 1684/*
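
The rmap.c rework replaces the page_referenced_anon/file and try_to_unmap_anon/file walker pairs with a single rmap_walk() driven by a caller-supplied struct rmap_walk_control: rmap_one does the per-vma work, invalid_vma skips uninteresting vmas, done stops the walk early, and arg carries per-caller state such as page_referenced_arg. A user-space sketch of that callback-table pattern follows; the vma list, the SWAP_* values and every name are simplified stand-ins, not the kernel types:

    /*
     * Sketch only: a flat array of fake vmas and trimmed callbacks model
     * the rmap_walk_control pattern; nothing here is real kernel code.
     */
    #include <stdio.h>
    #include <stdbool.h>

    enum { SWAP_AGAIN, SWAP_SUCCESS, SWAP_FAIL };

    struct vma_sketch {
        int id;
        bool locked;                            /* stands in for VM_LOCKED */
    };

    struct rmap_walk_control_sketch {
        int (*rmap_one)(struct vma_sketch *vma, void *arg);     /* per-vma work */
        bool (*invalid_vma)(struct vma_sketch *vma, void *arg); /* skip this vma? */
        int (*done)(void *arg);                                 /* stop early? */
        void *arg;                                              /* caller state */
    };

    static int rmap_walk(struct vma_sketch *vmas, int nr,
                         struct rmap_walk_control_sketch *rwc)
    {
        int i, ret = SWAP_AGAIN;

        for (i = 0; i < nr; i++) {
            if (rwc->invalid_vma && rwc->invalid_vma(&vmas[i], rwc->arg))
                continue;
            ret = rwc->rmap_one(&vmas[i], rwc->arg);
            if (ret != SWAP_AGAIN)
                break;
            if (rwc->done && rwc->done(rwc->arg))
                break;
        }
        return ret;
    }

    /* one concrete walk: count referenced vmas, like page_referenced() */
    struct referenced_arg { int mapcount; int referenced; };

    static int referenced_one(struct vma_sketch *vma, void *arg)
    {
        struct referenced_arg *pra = arg;

        if (vma->locked)
            return SWAP_FAIL;                   /* break the loop, as in the patch */
        pra->referenced++;
        if (--pra->mapcount == 0)
            return SWAP_SUCCESS;                /* nothing left to visit */
        return SWAP_AGAIN;
    }

    int main(void)
    {
        struct vma_sketch vmas[] = { {1, false}, {2, false}, {3, false} };
        struct referenced_arg pra = { .mapcount = 3, .referenced = 0 };
        struct rmap_walk_control_sketch rwc = {
            .rmap_one = referenced_one,
            .arg = &pra,
        };

        rmap_walk(vmas, 3, &rwc);
        printf("referenced: %d\n", pra.referenced);
        return 0;
    }

The payoff visible in the diff is that page_referenced(), page_mkclean(), try_to_unmap() and try_to_munlock() shrink to filling in a control structure and calling the one walker.
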
diff --git a/mm/shmem.c b/mm/shmem.c
index 902a14842b74..1f18c9d0d93e 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -45,7 +45,7 @@ static struct vfsmount *shm_mnt;
45#include <linux/xattr.h> 45#include <linux/xattr.h>
46#include <linux/exportfs.h> 46#include <linux/exportfs.h>
47#include <linux/posix_acl.h> 47#include <linux/posix_acl.h>
48#include <linux/generic_acl.h> 48#include <linux/posix_acl_xattr.h>
49#include <linux/mman.h> 49#include <linux/mman.h>
50#include <linux/string.h> 50#include <linux/string.h>
51#include <linux/slab.h> 51#include <linux/slab.h>
@@ -285,8 +285,8 @@ static int shmem_add_to_page_cache(struct page *page,
285{ 285{
286 int error; 286 int error;
287 287
288 VM_BUG_ON(!PageLocked(page)); 288 VM_BUG_ON_PAGE(!PageLocked(page), page);
289 VM_BUG_ON(!PageSwapBacked(page)); 289 VM_BUG_ON_PAGE(!PageSwapBacked(page), page);
290 290
291 page_cache_get(page); 291 page_cache_get(page);
292 page->mapping = mapping; 292 page->mapping = mapping;
@@ -491,7 +491,7 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
491 continue; 491 continue;
492 if (!unfalloc || !PageUptodate(page)) { 492 if (!unfalloc || !PageUptodate(page)) {
493 if (page->mapping == mapping) { 493 if (page->mapping == mapping) {
494 VM_BUG_ON(PageWriteback(page)); 494 VM_BUG_ON_PAGE(PageWriteback(page), page);
495 truncate_inode_page(mapping, page); 495 truncate_inode_page(mapping, page);
496 } 496 }
497 } 497 }
@@ -568,7 +568,7 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
568 lock_page(page); 568 lock_page(page);
569 if (!unfalloc || !PageUptodate(page)) { 569 if (!unfalloc || !PageUptodate(page)) {
570 if (page->mapping == mapping) { 570 if (page->mapping == mapping) {
571 VM_BUG_ON(PageWriteback(page)); 571 VM_BUG_ON_PAGE(PageWriteback(page), page);
572 truncate_inode_page(mapping, page); 572 truncate_inode_page(mapping, page);
573 } 573 }
574 } 574 }
@@ -620,10 +620,8 @@ static int shmem_setattr(struct dentry *dentry, struct iattr *attr)
620 } 620 }
621 621
622 setattr_copy(inode, attr); 622 setattr_copy(inode, attr);
623#ifdef CONFIG_TMPFS_POSIX_ACL
624 if (attr->ia_valid & ATTR_MODE) 623 if (attr->ia_valid & ATTR_MODE)
625 error = generic_acl_chmod(inode); 624 error = posix_acl_chmod(inode, inode->i_mode);
626#endif
627 return error; 625 return error;
628} 626}
629 627
@@ -1937,22 +1935,14 @@ shmem_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev)
1937 1935
1938 inode = shmem_get_inode(dir->i_sb, dir, mode, dev, VM_NORESERVE); 1936 inode = shmem_get_inode(dir->i_sb, dir, mode, dev, VM_NORESERVE);
1939 if (inode) { 1937 if (inode) {
1940#ifdef CONFIG_TMPFS_POSIX_ACL 1938 error = simple_acl_create(dir, inode);
1941 error = generic_acl_init(inode, dir); 1939 if (error)
1942 if (error) { 1940 goto out_iput;
1943 iput(inode);
1944 return error;
1945 }
1946#endif
1947 error = security_inode_init_security(inode, dir, 1941 error = security_inode_init_security(inode, dir,
1948 &dentry->d_name, 1942 &dentry->d_name,
1949 shmem_initxattrs, NULL); 1943 shmem_initxattrs, NULL);
1950 if (error) { 1944 if (error && error != -EOPNOTSUPP)
1951 if (error != -EOPNOTSUPP) { 1945 goto out_iput;
1952 iput(inode);
1953 return error;
1954 }
1955 }
1956 1946
1957 error = 0; 1947 error = 0;
1958 dir->i_size += BOGO_DIRENT_SIZE; 1948 dir->i_size += BOGO_DIRENT_SIZE;
@@ -1961,6 +1951,9 @@ shmem_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev)
1961 dget(dentry); /* Extra count - pin the dentry in core */ 1951 dget(dentry); /* Extra count - pin the dentry in core */
1962 } 1952 }
1963 return error; 1953 return error;
1954out_iput:
1955 iput(inode);
1956 return error;
1964} 1957}
1965 1958
1966static int 1959static int
@@ -1974,24 +1967,17 @@ shmem_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
1974 error = security_inode_init_security(inode, dir, 1967 error = security_inode_init_security(inode, dir,
1975 NULL, 1968 NULL,
1976 shmem_initxattrs, NULL); 1969 shmem_initxattrs, NULL);
1977 if (error) { 1970 if (error && error != -EOPNOTSUPP)
1978 if (error != -EOPNOTSUPP) { 1971 goto out_iput;
1979 iput(inode); 1972 error = simple_acl_create(dir, inode);
1980 return error; 1973 if (error)
1981 } 1974 goto out_iput;
1982 }
1983#ifdef CONFIG_TMPFS_POSIX_ACL
1984 error = generic_acl_init(inode, dir);
1985 if (error) {
1986 iput(inode);
1987 return error;
1988 }
1989#else
1990 error = 0;
1991#endif
1992 d_tmpfile(dentry, inode); 1975 d_tmpfile(dentry, inode);
1993 } 1976 }
1994 return error; 1977 return error;
1978out_iput:
1979 iput(inode);
1980 return error;
1995} 1981}
1996 1982
1997static int shmem_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) 1983static int shmem_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
@@ -2223,8 +2209,8 @@ static int shmem_initxattrs(struct inode *inode,
2223 2209
2224static const struct xattr_handler *shmem_xattr_handlers[] = { 2210static const struct xattr_handler *shmem_xattr_handlers[] = {
2225#ifdef CONFIG_TMPFS_POSIX_ACL 2211#ifdef CONFIG_TMPFS_POSIX_ACL
2226 &generic_acl_access_handler, 2212 &posix_acl_access_xattr_handler,
2227 &generic_acl_default_handler, 2213 &posix_acl_default_xattr_handler,
2228#endif 2214#endif
2229 NULL 2215 NULL
2230}; 2216};
@@ -2740,6 +2726,7 @@ static const struct inode_operations shmem_inode_operations = {
2740 .getxattr = shmem_getxattr, 2726 .getxattr = shmem_getxattr,
2741 .listxattr = shmem_listxattr, 2727 .listxattr = shmem_listxattr,
2742 .removexattr = shmem_removexattr, 2728 .removexattr = shmem_removexattr,
2729 .set_acl = simple_set_acl,
2743#endif 2730#endif
2744}; 2731};
2745 2732
@@ -2764,6 +2751,7 @@ static const struct inode_operations shmem_dir_inode_operations = {
2764#endif 2751#endif
2765#ifdef CONFIG_TMPFS_POSIX_ACL 2752#ifdef CONFIG_TMPFS_POSIX_ACL
2766 .setattr = shmem_setattr, 2753 .setattr = shmem_setattr,
2754 .set_acl = simple_set_acl,
2767#endif 2755#endif
2768}; 2756};
2769 2757
@@ -2776,6 +2764,7 @@ static const struct inode_operations shmem_special_inode_operations = {
2776#endif 2764#endif
2777#ifdef CONFIG_TMPFS_POSIX_ACL 2765#ifdef CONFIG_TMPFS_POSIX_ACL
2778 .setattr = shmem_setattr, 2766 .setattr = shmem_setattr,
2767 .set_acl = simple_set_acl,
2779#endif 2768#endif
2780}; 2769};
2781 2770
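
Besides switching tmpfs from the old generic_acl helpers to the common posix_acl code, the shmem_mknod()/shmem_tmpfile() hunks fold their repeated "iput(inode); return error;" branches into a single out_iput label. A small stand-alone sketch of that single-exit error path; every name below is hypothetical:

    /*
     * Sketch only: get_inode()/iput() and the init stubs are invented to
     * show the goto-based cleanup shape, not VFS or shmem internals.
     */
    #include <stdlib.h>

    struct inode_sketch { int users; };

    static struct inode_sketch *get_inode(void)
    {
        struct inode_sketch *inode = malloc(sizeof(*inode));

        if (inode)
            inode->users = 1;
        return inode;
    }

    static void iput(struct inode_sketch *inode)
    {
        free(inode);                /* drop the reference taken above */
    }

    static int acl_create(struct inode_sketch *inode)    { (void)inode; return 0; }
    static int security_init(struct inode_sketch *inode) { (void)inode; return 0; }

    static int mknod_sketch(void)
    {
        struct inode_sketch *inode = get_inode();
        int error;

        if (!inode)
            return -1;

        error = acl_create(inode);
        if (error)
            goto out_iput;

        error = security_init(inode);
        if (error)
            goto out_iput;

        return 0;                   /* success: caller keeps the reference */

    out_iput:
        iput(inode);                /* one place to undo the get */
        return error;
    }

    int main(void)
    {
        return mknod_sketch() ? 1 : 0;
    }
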
diff --git a/mm/slab.c b/mm/slab.c
index eb043bf05f4c..b264214c77ea 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -1946,7 +1946,7 @@ static void slab_destroy_debugcheck(struct kmem_cache *cachep,
1946/** 1946/**
1947 * slab_destroy - destroy and release all objects in a slab 1947 * slab_destroy - destroy and release all objects in a slab
1948 * @cachep: cache pointer being destroyed 1948 * @cachep: cache pointer being destroyed
1949 * @slabp: slab pointer being destroyed 1949 * @page: page pointer being destroyed
1950 * 1950 *
1951 * Destroy all the objs in a slab, and release the mem back to the system. 1951 * Destroy all the objs in a slab, and release the mem back to the system.
1952 * Before calling the slab must have been unlinked from the cache. The 1952 * Before calling the slab must have been unlinked from the cache. The
diff --git a/mm/slab.h b/mm/slab.h
index 0859c4241ba1..8184a7cde272 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -160,12 +160,36 @@ static inline const char *cache_name(struct kmem_cache *s)
160 return s->name; 160 return s->name;
161} 161}
162 162
163/*
164 * Note, we protect with RCU only the memcg_caches array, not per-memcg caches.
165 * That said the caller must assure the memcg's cache won't go away. Since once
166 * created a memcg's cache is destroyed only along with the root cache, it is
167 * true if we are going to allocate from the cache or hold a reference to the
168 * root cache by other means. Otherwise, we should hold either the slab_mutex
169 * or the memcg's slab_caches_mutex while calling this function and accessing
170 * the returned value.
171 */
163static inline struct kmem_cache * 172static inline struct kmem_cache *
164cache_from_memcg_idx(struct kmem_cache *s, int idx) 173cache_from_memcg_idx(struct kmem_cache *s, int idx)
165{ 174{
175 struct kmem_cache *cachep;
176 struct memcg_cache_params *params;
177
166 if (!s->memcg_params) 178 if (!s->memcg_params)
167 return NULL; 179 return NULL;
168 return s->memcg_params->memcg_caches[idx]; 180
181 rcu_read_lock();
182 params = rcu_dereference(s->memcg_params);
183 cachep = params->memcg_caches[idx];
184 rcu_read_unlock();
185
186 /*
187 * Make sure we will access the up-to-date value. The code updating
188 * memcg_caches issues a write barrier to match this (see
189 * memcg_register_cache()).
190 */
191 smp_read_barrier_depends();
192 return cachep;
169} 193}
170 194
171static inline struct kmem_cache *memcg_root_cache(struct kmem_cache *s) 195static inline struct kmem_cache *memcg_root_cache(struct kmem_cache *s)
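The new cache_from_memcg_idx() pairs an RCU-protected pointer load with smp_read_barrier_depends() so that the memcg_caches[] contents a reader sees are at least as new as the pointer it dereferences. As a rough standalone illustration of that publish/then-read ordering (plain C11, not kernel code; the struct and function names below are invented for the sketch, and acquire ordering is used as a stronger stand-in for the dependency barrier):

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct cache_stub {
        const char *name;
};

struct params_stub {
        struct cache_stub *caches[4];   /* analogue of memcg_caches[] */
};

static _Atomic(struct params_stub *) published_params;

/* writer: fill in the array first, then publish the pointer with release
 * semantics so readers that observe the pointer also observe the contents */
static void publish(struct cache_stub *c, int idx)
{
        struct params_stub *p = calloc(1, sizeof(*p));

        if (!p)
                return;
        p->caches[idx] = c;
        atomic_store_explicit(&published_params, p, memory_order_release);
}

/* reader: rough analogue of the reworked cache_from_memcg_idx() */
static struct cache_stub *cache_from_idx(int idx)
{
        struct params_stub *p;

        p = atomic_load_explicit(&published_params, memory_order_acquire);
        if (!p)
                return NULL;
        return p->caches[idx];
}

int main(void)
{
        struct cache_stub c = { .name = "memcg-cache" };

        publish(&c, 2);
        printf("%s\n", cache_from_idx(2) ? cache_from_idx(2)->name : "(none)");
        return 0;
}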
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 0b7bb399b0e4..1ec3c619ba04 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -171,13 +171,26 @@ kmem_cache_create_memcg(struct mem_cgroup *memcg, const char *name, size_t size,
171 struct kmem_cache *parent_cache) 171 struct kmem_cache *parent_cache)
172{ 172{
173 struct kmem_cache *s = NULL; 173 struct kmem_cache *s = NULL;
174 int err = 0; 174 int err;
175 175
176 get_online_cpus(); 176 get_online_cpus();
177 mutex_lock(&slab_mutex); 177 mutex_lock(&slab_mutex);
178 178
179 if (!kmem_cache_sanity_check(memcg, name, size) == 0) 179 err = kmem_cache_sanity_check(memcg, name, size);
180 goto out_locked; 180 if (err)
181 goto out_unlock;
182
183 if (memcg) {
184 /*
185 * Since per-memcg caches are created asynchronously on first
186 * allocation (see memcg_kmem_get_cache()), several threads can
187 * try to create the same cache, but only one of them may
188 * succeed. Therefore if we get here and see the cache has
189 * already been created, we silently return NULL.
190 */
191 if (cache_from_memcg_idx(parent_cache, memcg_cache_id(memcg)))
192 goto out_unlock;
193 }
181 194
182 /* 195 /*
183 * Some allocators will constraint the set of valid flags to a subset 196 * Some allocators will constraint the set of valid flags to a subset
@@ -189,44 +202,47 @@ kmem_cache_create_memcg(struct mem_cgroup *memcg, const char *name, size_t size,
189 202
190 s = __kmem_cache_alias(memcg, name, size, align, flags, ctor); 203 s = __kmem_cache_alias(memcg, name, size, align, flags, ctor);
191 if (s) 204 if (s)
192 goto out_locked; 205 goto out_unlock;
193 206
207 err = -ENOMEM;
194 s = kmem_cache_zalloc(kmem_cache, GFP_KERNEL); 208 s = kmem_cache_zalloc(kmem_cache, GFP_KERNEL);
195 if (s) { 209 if (!s)
196 s->object_size = s->size = size; 210 goto out_unlock;
197 s->align = calculate_alignment(flags, align, size);
198 s->ctor = ctor;
199 211
200 if (memcg_register_cache(memcg, s, parent_cache)) { 212 s->object_size = s->size = size;
201 kmem_cache_free(kmem_cache, s); 213 s->align = calculate_alignment(flags, align, size);
202 err = -ENOMEM; 214 s->ctor = ctor;
203 goto out_locked;
204 }
205 215
206 s->name = kstrdup(name, GFP_KERNEL); 216 s->name = kstrdup(name, GFP_KERNEL);
207 if (!s->name) { 217 if (!s->name)
208 kmem_cache_free(kmem_cache, s); 218 goto out_free_cache;
209 err = -ENOMEM;
210 goto out_locked;
211 }
212 219
213 err = __kmem_cache_create(s, flags); 220 err = memcg_alloc_cache_params(memcg, s, parent_cache);
214 if (!err) { 221 if (err)
215 s->refcount = 1; 222 goto out_free_cache;
216 list_add(&s->list, &slab_caches); 223
217 memcg_cache_list_add(memcg, s); 224 err = __kmem_cache_create(s, flags);
218 } else { 225 if (err)
219 kfree(s->name); 226 goto out_free_cache;
220 kmem_cache_free(kmem_cache, s); 227
221 } 228 s->refcount = 1;
222 } else 229 list_add(&s->list, &slab_caches);
223 err = -ENOMEM; 230 memcg_register_cache(s);
224 231
225out_locked: 232out_unlock:
226 mutex_unlock(&slab_mutex); 233 mutex_unlock(&slab_mutex);
227 put_online_cpus(); 234 put_online_cpus();
228 235
229 if (err) { 236 if (err) {
237 /*
238 * There is no point in flooding logs with warnings or
239 * especially crashing the system if we fail to create a cache
240 * for a memcg. In this case we will be accounting the memcg
241 * allocation to the root cgroup until we succeed to create its
242 * own cache, but it isn't that critical.
243 */
244 if (!memcg)
245 return NULL;
230 246
231 if (flags & SLAB_PANIC) 247 if (flags & SLAB_PANIC)
232 panic("kmem_cache_create: Failed to create slab '%s'. Error %d\n", 248 panic("kmem_cache_create: Failed to create slab '%s'. Error %d\n",
@@ -236,11 +252,15 @@ out_locked:
236 name, err); 252 name, err);
237 dump_stack(); 253 dump_stack();
238 } 254 }
239
240 return NULL; 255 return NULL;
241 } 256 }
242
243 return s; 257 return s;
258
259out_free_cache:
260 memcg_free_cache_params(s);
261 kfree(s->name);
262 kmem_cache_free(kmem_cache, s);
263 goto out_unlock;
244} 264}
245 265
246struct kmem_cache * 266struct kmem_cache *
@@ -263,11 +283,12 @@ void kmem_cache_destroy(struct kmem_cache *s)
263 list_del(&s->list); 283 list_del(&s->list);
264 284
265 if (!__kmem_cache_shutdown(s)) { 285 if (!__kmem_cache_shutdown(s)) {
286 memcg_unregister_cache(s);
266 mutex_unlock(&slab_mutex); 287 mutex_unlock(&slab_mutex);
267 if (s->flags & SLAB_DESTROY_BY_RCU) 288 if (s->flags & SLAB_DESTROY_BY_RCU)
268 rcu_barrier(); 289 rcu_barrier();
269 290
270 memcg_release_cache(s); 291 memcg_free_cache_params(s);
271 kfree(s->name); 292 kfree(s->name);
272 kmem_cache_free(kmem_cache, s); 293 kmem_cache_free(kmem_cache, s);
273 } else { 294 } else {
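The kmem_cache_create_memcg() rework above replaces per-branch cleanup with forward gotos to a single unwind path. A minimal userspace sketch of the same error-unwinding shape (hypothetical names; malloc/strdup stand in for the kernel allocations, and the locking is elided):

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct cache {
        char *name;
        void *params;
};

/* every failure jumps forward, and cleanup runs in reverse allocation order */
static struct cache *create_cache(const char *name, int *errp)
{
        struct cache *s;
        int err;

        err = -ENOMEM;
        s = calloc(1, sizeof(*s));
        if (!s)
                goto out;

        s->name = strdup(name);
        if (!s->name)
                goto out_free_cache;

        s->params = calloc(1, 64);      /* stands in for memcg_alloc_cache_params() */
        if (!s->params)
                goto out_free_name;

        *errp = 0;
        return s;

out_free_name:
        free(s->name);
out_free_cache:
        free(s);
out:
        *errp = err;
        return NULL;
}

int main(void)
{
        int err;
        struct cache *c = create_cache("demo", &err);

        if (!c) {
                fprintf(stderr, "create failed: %d\n", err);
                return 1;
        }
        printf("created %s\n", c->name);
        free(c->params);
        free(c->name);
        free(c);
        return 0;
}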
diff --git a/mm/slub.c b/mm/slub.c
index 545a170ebf9f..7e3e0458bce4 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -355,6 +355,21 @@ static __always_inline void slab_unlock(struct page *page)
355 __bit_spin_unlock(PG_locked, &page->flags); 355 __bit_spin_unlock(PG_locked, &page->flags);
356} 356}
357 357
358static inline void set_page_slub_counters(struct page *page, unsigned long counters_new)
359{
360 struct page tmp;
361 tmp.counters = counters_new;
362 /*
363 * page->counters can cover frozen/inuse/objects as well
364 * as page->_count. If we assign to ->counters directly
365 * we run the risk of losing updates to page->_count, so
366 * be careful and only assign to the fields we need.
367 */
368 page->frozen = tmp.frozen;
369 page->inuse = tmp.inuse;
370 page->objects = tmp.objects;
371}
372
358/* Interrupts must be disabled (for the fallback code to work right) */ 373/* Interrupts must be disabled (for the fallback code to work right) */
359static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct page *page, 374static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct page *page,
360 void *freelist_old, unsigned long counters_old, 375 void *freelist_old, unsigned long counters_old,
@@ -376,7 +391,7 @@ static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct page *page
376 if (page->freelist == freelist_old && 391 if (page->freelist == freelist_old &&
377 page->counters == counters_old) { 392 page->counters == counters_old) {
378 page->freelist = freelist_new; 393 page->freelist = freelist_new;
379 page->counters = counters_new; 394 set_page_slub_counters(page, counters_new);
380 slab_unlock(page); 395 slab_unlock(page);
381 return 1; 396 return 1;
382 } 397 }
@@ -415,7 +430,7 @@ static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page,
415 if (page->freelist == freelist_old && 430 if (page->freelist == freelist_old &&
416 page->counters == counters_old) { 431 page->counters == counters_old) {
417 page->freelist = freelist_new; 432 page->freelist = freelist_new;
418 page->counters = counters_new; 433 set_page_slub_counters(page, counters_new);
419 slab_unlock(page); 434 slab_unlock(page);
420 local_irq_restore(flags); 435 local_irq_restore(flags);
421 return 1; 436 return 1;
@@ -985,23 +1000,22 @@ static inline void slab_free_hook(struct kmem_cache *s, void *x)
985 1000
986/* 1001/*
987 * Tracking of fully allocated slabs for debugging purposes. 1002 * Tracking of fully allocated slabs for debugging purposes.
988 *
989 * list_lock must be held.
990 */ 1003 */
991static void add_full(struct kmem_cache *s, 1004static void add_full(struct kmem_cache *s,
992 struct kmem_cache_node *n, struct page *page) 1005 struct kmem_cache_node *n, struct page *page)
993{ 1006{
1007 lockdep_assert_held(&n->list_lock);
1008
994 if (!(s->flags & SLAB_STORE_USER)) 1009 if (!(s->flags & SLAB_STORE_USER))
995 return; 1010 return;
996 1011
997 list_add(&page->lru, &n->full); 1012 list_add(&page->lru, &n->full);
998} 1013}
999 1014
1000/* 1015static void remove_full(struct kmem_cache *s, struct kmem_cache_node *n, struct page *page)
1001 * list_lock must be held.
1002 */
1003static void remove_full(struct kmem_cache *s, struct page *page)
1004{ 1016{
1017 lockdep_assert_held(&n->list_lock);
1018
1005 if (!(s->flags & SLAB_STORE_USER)) 1019 if (!(s->flags & SLAB_STORE_USER))
1006 return; 1020 return;
1007 1021
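Replacing the "list_lock must be held" comments with lockdep_assert_held() turns the locking rule into a runtime check. A toy userspace equivalent (compile with -pthread; the owner-tracking lock and the add_partial() stand-in below are illustrative only):

#include <assert.h>
#include <pthread.h>
#include <stdio.h>

/* toy lock that remembers its owner so helpers can assert they are called
 * with the lock held, in the spirit of lockdep_assert_held() */
struct checked_lock {
        pthread_mutex_t mutex;
        pthread_t owner;
        int held;
};

static void checked_lock_acquire(struct checked_lock *l)
{
        pthread_mutex_lock(&l->mutex);
        l->owner = pthread_self();
        l->held = 1;
}

static void checked_lock_release(struct checked_lock *l)
{
        l->held = 0;
        pthread_mutex_unlock(&l->mutex);
}

static void assert_held(struct checked_lock *l)
{
        assert(l->held && pthread_equal(l->owner, pthread_self()));
}

/* analogue of add_partial(): the check documents and enforces the rule */
static void add_partial(struct checked_lock *list_lock, int *nr_partial)
{
        assert_held(list_lock);
        (*nr_partial)++;
}

int main(void)
{
        struct checked_lock l;
        int nr_partial = 0;

        pthread_mutex_init(&l.mutex, NULL);
        l.held = 0;

        checked_lock_acquire(&l);
        add_partial(&l, &nr_partial);
        checked_lock_release(&l);
        printf("nr_partial=%d\n", nr_partial);
        return 0;
}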
@@ -1250,7 +1264,8 @@ static inline int check_object(struct kmem_cache *s, struct page *page,
1250 void *object, u8 val) { return 1; } 1264 void *object, u8 val) { return 1; }
1251static inline void add_full(struct kmem_cache *s, struct kmem_cache_node *n, 1265static inline void add_full(struct kmem_cache *s, struct kmem_cache_node *n,
1252 struct page *page) {} 1266 struct page *page) {}
1253static inline void remove_full(struct kmem_cache *s, struct page *page) {} 1267static inline void remove_full(struct kmem_cache *s, struct kmem_cache_node *n,
1268 struct page *page) {}
1254static inline unsigned long kmem_cache_flags(unsigned long object_size, 1269static inline unsigned long kmem_cache_flags(unsigned long object_size,
1255 unsigned long flags, const char *name, 1270 unsigned long flags, const char *name,
1256 void (*ctor)(void *)) 1271 void (*ctor)(void *))
@@ -1504,12 +1519,12 @@ static void discard_slab(struct kmem_cache *s, struct page *page)
1504 1519
1505/* 1520/*
1506 * Management of partially allocated slabs. 1521 * Management of partially allocated slabs.
1507 *
1508 * list_lock must be held.
1509 */ 1522 */
1510static inline void add_partial(struct kmem_cache_node *n, 1523static inline void add_partial(struct kmem_cache_node *n,
1511 struct page *page, int tail) 1524 struct page *page, int tail)
1512{ 1525{
1526 lockdep_assert_held(&n->list_lock);
1527
1513 n->nr_partial++; 1528 n->nr_partial++;
1514 if (tail == DEACTIVATE_TO_TAIL) 1529 if (tail == DEACTIVATE_TO_TAIL)
1515 list_add_tail(&page->lru, &n->partial); 1530 list_add_tail(&page->lru, &n->partial);
@@ -1517,12 +1532,11 @@ static inline void add_partial(struct kmem_cache_node *n,
1517 list_add(&page->lru, &n->partial); 1532 list_add(&page->lru, &n->partial);
1518} 1533}
1519 1534
1520/*
1521 * list_lock must be held.
1522 */
1523static inline void remove_partial(struct kmem_cache_node *n, 1535static inline void remove_partial(struct kmem_cache_node *n,
1524 struct page *page) 1536 struct page *page)
1525{ 1537{
1538 lockdep_assert_held(&n->list_lock);
1539
1526 list_del(&page->lru); 1540 list_del(&page->lru);
1527 n->nr_partial--; 1541 n->nr_partial--;
1528} 1542}
@@ -1532,8 +1546,6 @@ static inline void remove_partial(struct kmem_cache_node *n,
1532 * return the pointer to the freelist. 1546 * return the pointer to the freelist.
1533 * 1547 *
1534 * Returns a list of objects or NULL if it fails. 1548 * Returns a list of objects or NULL if it fails.
1535 *
1536 * Must hold list_lock since we modify the partial list.
1537 */ 1549 */
1538static inline void *acquire_slab(struct kmem_cache *s, 1550static inline void *acquire_slab(struct kmem_cache *s,
1539 struct kmem_cache_node *n, struct page *page, 1551 struct kmem_cache_node *n, struct page *page,
@@ -1543,6 +1555,8 @@ static inline void *acquire_slab(struct kmem_cache *s,
1543 unsigned long counters; 1555 unsigned long counters;
1544 struct page new; 1556 struct page new;
1545 1557
1558 lockdep_assert_held(&n->list_lock);
1559
1546 /* 1560 /*
1547 * Zap the freelist and set the frozen bit. 1561 * Zap the freelist and set the frozen bit.
1548 * The old freelist is the list of objects for the 1562 * The old freelist is the list of objects for the
@@ -1887,7 +1901,7 @@ redo:
1887 1901
1888 else if (l == M_FULL) 1902 else if (l == M_FULL)
1889 1903
1890 remove_full(s, page); 1904 remove_full(s, n, page);
1891 1905
1892 if (m == M_PARTIAL) { 1906 if (m == M_PARTIAL) {
1893 1907
@@ -2541,7 +2555,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
2541 new.inuse--; 2555 new.inuse--;
2542 if ((!new.inuse || !prior) && !was_frozen) { 2556 if ((!new.inuse || !prior) && !was_frozen) {
2543 2557
2544 if (kmem_cache_has_cpu_partial(s) && !prior) 2558 if (kmem_cache_has_cpu_partial(s) && !prior) {
2545 2559
2546 /* 2560 /*
2547 * Slab was on no list before and will be 2561 * Slab was on no list before and will be
@@ -2551,7 +2565,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
2551 */ 2565 */
2552 new.frozen = 1; 2566 new.frozen = 1;
2553 2567
2554 else { /* Needs to be taken off a list */ 2568 } else { /* Needs to be taken off a list */
2555 2569
2556 n = get_node(s, page_to_nid(page)); 2570 n = get_node(s, page_to_nid(page));
2557 /* 2571 /*
@@ -2600,7 +2614,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
2600 */ 2614 */
2601 if (!kmem_cache_has_cpu_partial(s) && unlikely(!prior)) { 2615 if (!kmem_cache_has_cpu_partial(s) && unlikely(!prior)) {
2602 if (kmem_cache_debug(s)) 2616 if (kmem_cache_debug(s))
2603 remove_full(s, page); 2617 remove_full(s, n, page);
2604 add_partial(n, page, DEACTIVATE_TO_TAIL); 2618 add_partial(n, page, DEACTIVATE_TO_TAIL);
2605 stat(s, FREE_ADD_PARTIAL); 2619 stat(s, FREE_ADD_PARTIAL);
2606 } 2620 }
@@ -2614,9 +2628,10 @@ slab_empty:
2614 */ 2628 */
2615 remove_partial(n, page); 2629 remove_partial(n, page);
2616 stat(s, FREE_REMOVE_PARTIAL); 2630 stat(s, FREE_REMOVE_PARTIAL);
2617 } else 2631 } else {
2618 /* Slab must be on the full list */ 2632 /* Slab must be on the full list */
2619 remove_full(s, page); 2633 remove_full(s, n, page);
2634 }
2620 2635
2621 spin_unlock_irqrestore(&n->list_lock, flags); 2636 spin_unlock_irqrestore(&n->list_lock, flags);
2622 stat(s, FREE_SLAB); 2637 stat(s, FREE_SLAB);
@@ -2890,7 +2905,13 @@ static void early_kmem_cache_node_alloc(int node)
2890 init_kmem_cache_node(n); 2905 init_kmem_cache_node(n);
2891 inc_slabs_node(kmem_cache_node, node, page->objects); 2906 inc_slabs_node(kmem_cache_node, node, page->objects);
2892 2907
2908 /*
2909 * the lock is for lockdep's sake, not for any actual
2910 * race protection
2911 */
2912 spin_lock(&n->list_lock);
2893 add_partial(n, page, DEACTIVATE_TO_HEAD); 2913 add_partial(n, page, DEACTIVATE_TO_HEAD);
2914 spin_unlock(&n->list_lock);
2894} 2915}
2895 2916
2896static void free_kmem_cache_nodes(struct kmem_cache *s) 2917static void free_kmem_cache_nodes(struct kmem_cache *s)
@@ -4299,7 +4320,13 @@ static ssize_t show_slab_objects(struct kmem_cache *s,
4299 4320
4300 page = ACCESS_ONCE(c->partial); 4321 page = ACCESS_ONCE(c->partial);
4301 if (page) { 4322 if (page) {
4302 x = page->pobjects; 4323 node = page_to_nid(page);
4324 if (flags & SO_TOTAL)
4325 WARN_ON_ONCE(1);
4326 else if (flags & SO_OBJECTS)
4327 WARN_ON_ONCE(1);
4328 else
4329 x = page->pages;
4303 total += x; 4330 total += x;
4304 nodes[node] += x; 4331 nodes[node] += x;
4305 } 4332 }
@@ -5163,7 +5190,7 @@ static int sysfs_slab_add(struct kmem_cache *s)
5163 } 5190 }
5164 5191
5165 s->kobj.kset = slab_kset; 5192 s->kobj.kset = slab_kset;
5166 err = kobject_init_and_add(&s->kobj, &slab_ktype, NULL, name); 5193 err = kobject_init_and_add(&s->kobj, &slab_ktype, NULL, "%s", name);
5167 if (err) { 5194 if (err) {
5168 kobject_put(&s->kobj); 5195 kobject_put(&s->kobj);
5169 return err; 5196 return err;
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
index 27eeab3be757..4cba9c2783a1 100644
--- a/mm/sparse-vmemmap.c
+++ b/mm/sparse-vmemmap.c
@@ -40,7 +40,8 @@ static void * __init_refok __earlyonly_bootmem_alloc(int node,
40 unsigned long align, 40 unsigned long align,
41 unsigned long goal) 41 unsigned long goal)
42{ 42{
43 return __alloc_bootmem_node_high(NODE_DATA(node), size, align, goal); 43 return memblock_virt_alloc_try_nid(size, align, goal,
44 BOOTMEM_ALLOC_ACCESSIBLE, node);
44} 45}
45 46
46static void *vmemmap_buf; 47static void *vmemmap_buf;
@@ -226,7 +227,8 @@ void __init sparse_mem_maps_populate_node(struct page **map_map,
226 227
227 if (vmemmap_buf_start) { 228 if (vmemmap_buf_start) {
228 /* need to free left buf */ 229 /* need to free left buf */
229 free_bootmem(__pa(vmemmap_buf), vmemmap_buf_end - vmemmap_buf); 230 memblock_free_early(__pa(vmemmap_buf),
231 vmemmap_buf_end - vmemmap_buf);
230 vmemmap_buf = NULL; 232 vmemmap_buf = NULL;
231 vmemmap_buf_end = NULL; 233 vmemmap_buf_end = NULL;
232 } 234 }
diff --git a/mm/sparse.c b/mm/sparse.c
index 8cc7be0e9590..63c3ea5c119c 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -69,7 +69,7 @@ static struct mem_section noinline __init_refok *sparse_index_alloc(int nid)
69 else 69 else
70 section = kzalloc(array_size, GFP_KERNEL); 70 section = kzalloc(array_size, GFP_KERNEL);
71 } else { 71 } else {
72 section = alloc_bootmem_node(NODE_DATA(nid), array_size); 72 section = memblock_virt_alloc_node(array_size, nid);
73 } 73 }
74 74
75 return section; 75 return section;
@@ -279,8 +279,9 @@ sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat,
279 limit = goal + (1UL << PA_SECTION_SHIFT); 279 limit = goal + (1UL << PA_SECTION_SHIFT);
280 nid = early_pfn_to_nid(goal >> PAGE_SHIFT); 280 nid = early_pfn_to_nid(goal >> PAGE_SHIFT);
281again: 281again:
282 p = ___alloc_bootmem_node_nopanic(NODE_DATA(nid), size, 282 p = memblock_virt_alloc_try_nid_nopanic(size,
283 SMP_CACHE_BYTES, goal, limit); 283 SMP_CACHE_BYTES, goal, limit,
284 nid);
284 if (!p && limit) { 285 if (!p && limit) {
285 limit = 0; 286 limit = 0;
286 goto again; 287 goto again;
@@ -331,7 +332,7 @@ static unsigned long * __init
331sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat, 332sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat,
332 unsigned long size) 333 unsigned long size)
333{ 334{
334 return alloc_bootmem_node_nopanic(pgdat, size); 335 return memblock_virt_alloc_node_nopanic(size, pgdat->node_id);
335} 336}
336 337
337static void __init check_usemap_section_nr(int nid, unsigned long *usemap) 338static void __init check_usemap_section_nr(int nid, unsigned long *usemap)
@@ -376,8 +377,9 @@ struct page __init *sparse_mem_map_populate(unsigned long pnum, int nid)
376 return map; 377 return map;
377 378
378 size = PAGE_ALIGN(sizeof(struct page) * PAGES_PER_SECTION); 379 size = PAGE_ALIGN(sizeof(struct page) * PAGES_PER_SECTION);
379 map = __alloc_bootmem_node_high(NODE_DATA(nid), size, 380 map = memblock_virt_alloc_try_nid(size,
380 PAGE_SIZE, __pa(MAX_DMA_ADDRESS)); 381 PAGE_SIZE, __pa(MAX_DMA_ADDRESS),
382 BOOTMEM_ALLOC_ACCESSIBLE, nid);
381 return map; 383 return map;
382} 384}
383void __init sparse_mem_maps_populate_node(struct page **map_map, 385void __init sparse_mem_maps_populate_node(struct page **map_map,
@@ -401,8 +403,9 @@ void __init sparse_mem_maps_populate_node(struct page **map_map,
401 } 403 }
402 404
403 size = PAGE_ALIGN(size); 405 size = PAGE_ALIGN(size);
404 map = __alloc_bootmem_node_high(NODE_DATA(nodeid), size * map_count, 406 map = memblock_virt_alloc_try_nid(size * map_count,
405 PAGE_SIZE, __pa(MAX_DMA_ADDRESS)); 407 PAGE_SIZE, __pa(MAX_DMA_ADDRESS),
408 BOOTMEM_ALLOC_ACCESSIBLE, nodeid);
406 if (map) { 409 if (map) {
407 for (pnum = pnum_begin; pnum < pnum_end; pnum++) { 410 for (pnum = pnum_begin; pnum < pnum_end; pnum++) {
408 if (!present_section_nr(pnum)) 411 if (!present_section_nr(pnum))
@@ -545,7 +548,7 @@ void __init sparse_init(void)
545 * sparse_early_mem_map_alloc, so allocate usemap_map at first. 548 * sparse_early_mem_map_alloc, so allocate usemap_map at first.
546 */ 549 */
547 size = sizeof(unsigned long *) * NR_MEM_SECTIONS; 550 size = sizeof(unsigned long *) * NR_MEM_SECTIONS;
548 usemap_map = alloc_bootmem(size); 551 usemap_map = memblock_virt_alloc(size, 0);
549 if (!usemap_map) 552 if (!usemap_map)
550 panic("can not allocate usemap_map\n"); 553 panic("can not allocate usemap_map\n");
551 alloc_usemap_and_memmap(sparse_early_usemaps_alloc_node, 554 alloc_usemap_and_memmap(sparse_early_usemaps_alloc_node,
@@ -553,7 +556,7 @@ void __init sparse_init(void)
553 556
554#ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER 557#ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER
555 size2 = sizeof(struct page *) * NR_MEM_SECTIONS; 558 size2 = sizeof(struct page *) * NR_MEM_SECTIONS;
556 map_map = alloc_bootmem(size2); 559 map_map = memblock_virt_alloc(size2, 0);
557 if (!map_map) 560 if (!map_map)
558 panic("can not allocate map_map\n"); 561 panic("can not allocate map_map\n");
559 alloc_usemap_and_memmap(sparse_early_mem_maps_alloc_node, 562 alloc_usemap_and_memmap(sparse_early_mem_maps_alloc_node,
@@ -583,9 +586,9 @@ void __init sparse_init(void)
583 vmemmap_populate_print_last(); 586 vmemmap_populate_print_last();
584 587
585#ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER 588#ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER
586 free_bootmem(__pa(map_map), size2); 589 memblock_free_early(__pa(map_map), size2);
587#endif 590#endif
588 free_bootmem(__pa(usemap_map), size); 591 memblock_free_early(__pa(usemap_map), size);
589} 592}
590 593
591#ifdef CONFIG_MEMORY_HOTPLUG 594#ifdef CONFIG_MEMORY_HOTPLUG
diff --git a/mm/swap.c b/mm/swap.c
index 84b26aaabd03..b31ba67d440a 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -31,7 +31,6 @@
31#include <linux/memcontrol.h> 31#include <linux/memcontrol.h>
32#include <linux/gfp.h> 32#include <linux/gfp.h>
33#include <linux/uio.h> 33#include <linux/uio.h>
34#include <linux/hugetlb.h>
35 34
36#include "internal.h" 35#include "internal.h"
37 36
@@ -58,7 +57,7 @@ static void __page_cache_release(struct page *page)
58 57
59 spin_lock_irqsave(&zone->lru_lock, flags); 58 spin_lock_irqsave(&zone->lru_lock, flags);
60 lruvec = mem_cgroup_page_lruvec(page, zone); 59 lruvec = mem_cgroup_page_lruvec(page, zone);
61 VM_BUG_ON(!PageLRU(page)); 60 VM_BUG_ON_PAGE(!PageLRU(page), page);
62 __ClearPageLRU(page); 61 __ClearPageLRU(page);
63 del_page_from_lru_list(page, lruvec, page_off_lru(page)); 62 del_page_from_lru_list(page, lruvec, page_off_lru(page));
64 spin_unlock_irqrestore(&zone->lru_lock, flags); 63 spin_unlock_irqrestore(&zone->lru_lock, flags);
@@ -82,118 +81,150 @@ static void __put_compound_page(struct page *page)
82 81
83static void put_compound_page(struct page *page) 82static void put_compound_page(struct page *page)
84{ 83{
85 if (unlikely(PageTail(page))) { 84 struct page *page_head;
86 /* __split_huge_page_refcount can run under us */
87 struct page *page_head = compound_trans_head(page);
88
89 if (likely(page != page_head &&
90 get_page_unless_zero(page_head))) {
91 unsigned long flags;
92 85
86 if (likely(!PageTail(page))) {
87 if (put_page_testzero(page)) {
93 /* 88 /*
94 * THP can not break up slab pages so avoid taking 89 * By the time all refcounts have been released
95 * compound_lock(). Slab performs non-atomic bit ops 90 * split_huge_page cannot run anymore from under us.
96 * on page->flags for better performance. In particular
97 * slab_unlock() in slub used to be a hot path. It is
98 * still hot on arches that do not support
99 * this_cpu_cmpxchg_double().
100 */ 91 */
101 if (PageSlab(page_head) || PageHeadHuge(page_head)) { 92 if (PageHead(page))
102 if (likely(PageTail(page))) { 93 __put_compound_page(page);
103 /* 94 else
104 * __split_huge_page_refcount 95 __put_single_page(page);
105 * cannot race here. 96 }
106 */ 97 return;
107 VM_BUG_ON(!PageHead(page_head)); 98 }
108 atomic_dec(&page->_mapcount); 99
109 if (put_page_testzero(page_head)) 100 /* __split_huge_page_refcount can run under us */
110 VM_BUG_ON(1); 101 page_head = compound_trans_head(page);
111 if (put_page_testzero(page_head)) 102
112 __put_compound_page(page_head); 103 /*
113 return; 104 * THP can not break up slab pages so avoid taking
114 } else 105 * compound_lock() and skip the tail page refcounting (in
115 /* 106 * _mapcount) too. Slab performs non-atomic bit ops on
116 * __split_huge_page_refcount 107 * page->flags for better performance. In particular
117 * run before us, "page" was a 108 * slab_unlock() in slub used to be a hot path. It is still
118 * THP tail. The split 109 * hot on arches that do not support
119 * page_head has been freed 110 * this_cpu_cmpxchg_double().
120 * and reallocated as slab or 111 *
121 * hugetlbfs page of smaller 112 * If "page" is part of a slab or hugetlbfs page it cannot be
122 * order (only possible if 113 * splitted and the head page cannot change from under us. And
123 * reallocated as slab on 114 * if "page" is part of a THP page under splitting, if the
124 * x86). 115 * head page pointed by the THP tail isn't a THP head anymore,
125 */ 116 * we'll find PageTail clear after smp_rmb() and we'll treat
126 goto skip_lock; 117 * it as a single page.
127 } 118 */
119 if (!__compound_tail_refcounted(page_head)) {
120 /*
121 * If "page" is a THP tail, we must read the tail page
122 * flags after the head page flags. The
123 * split_huge_page side enforces write memory barriers
124 * between clearing PageTail and before the head page
125 * can be freed and reallocated.
126 */
127 smp_rmb();
128 if (likely(PageTail(page))) {
128 /* 129 /*
129 * page_head wasn't a dangling pointer but it 130 * __split_huge_page_refcount cannot race
130 * may not be a head page anymore by the time 131 * here.
131 * we obtain the lock. That is ok as long as it
132 * can't be freed from under us.
133 */ 132 */
134 flags = compound_lock_irqsave(page_head); 133 VM_BUG_ON_PAGE(!PageHead(page_head), page_head);
135 if (unlikely(!PageTail(page))) { 134 VM_BUG_ON_PAGE(page_mapcount(page) != 0, page);
136 /* __split_huge_page_refcount run before us */ 135 if (put_page_testzero(page_head)) {
137 compound_unlock_irqrestore(page_head, flags); 136 /*
138skip_lock: 137 * If this is the tail of a slab
139 if (put_page_testzero(page_head)) { 138 * compound page, the tail pin must
140 /* 139 * not be the last reference held on
141 * The head page may have been 140 * the page, because the PG_slab
142 * freed and reallocated as a 141 * cannot be cleared before all tail
143 * compound page of smaller 142 * pins (which skips the _mapcount
144 * order and then freed again. 143 * tail refcounting) have been
145 * All we know is that it 144 * released. For hugetlbfs the tail
146 * cannot have become: a THP 145 * pin may be the last reference on
147 * page, a compound page of 146 * the page instead, because
148 * higher order, a tail page. 147 * PageHeadHuge will not go away until
149 * That is because we still 148 * the compound page enters the buddy
150 * hold the refcount of the 149 * allocator.
151 * split THP tail and 150 */
152 * page_head was the THP head 151 VM_BUG_ON_PAGE(PageSlab(page_head), page_head);
153 * before the split. 152 __put_compound_page(page_head);
154 */
155 if (PageHead(page_head))
156 __put_compound_page(page_head);
157 else
158 __put_single_page(page_head);
159 }
160out_put_single:
161 if (put_page_testzero(page))
162 __put_single_page(page);
163 return;
164 } 153 }
165 VM_BUG_ON(page_head != page->first_page); 154 return;
155 } else
166 /* 156 /*
167 * We can release the refcount taken by 157 * __split_huge_page_refcount run before us,
168 * get_page_unless_zero() now that 158 * "page" was a THP tail. The split page_head
169 * __split_huge_page_refcount() is blocked on 159 * has been freed and reallocated as slab or
170 * the compound_lock. 160 * hugetlbfs page of smaller order (only
161 * possible if reallocated as slab on x86).
171 */ 162 */
172 if (put_page_testzero(page_head)) 163 goto out_put_single;
173 VM_BUG_ON(1); 164 }
174 /* __split_huge_page_refcount will wait now */
175 VM_BUG_ON(page_mapcount(page) <= 0);
176 atomic_dec(&page->_mapcount);
177 VM_BUG_ON(atomic_read(&page_head->_count) <= 0);
178 VM_BUG_ON(atomic_read(&page->_count) != 0);
179 compound_unlock_irqrestore(page_head, flags);
180 165
166 if (likely(page != page_head && get_page_unless_zero(page_head))) {
167 unsigned long flags;
168
169 /*
170 * page_head wasn't a dangling pointer but it may not
171 * be a head page anymore by the time we obtain the
172 * lock. That is ok as long as it can't be freed from
173 * under us.
174 */
175 flags = compound_lock_irqsave(page_head);
176 if (unlikely(!PageTail(page))) {
177 /* __split_huge_page_refcount run before us */
178 compound_unlock_irqrestore(page_head, flags);
181 if (put_page_testzero(page_head)) { 179 if (put_page_testzero(page_head)) {
180 /*
181 * The head page may have been freed
182 * and reallocated as a compound page
183 * of smaller order and then freed
184 * again. All we know is that it
185 * cannot have become: a THP page, a
186 * compound page of higher order, a
187 * tail page. That is because we
188 * still hold the refcount of the
189 * split THP tail and page_head was
190 * the THP head before the split.
191 */
182 if (PageHead(page_head)) 192 if (PageHead(page_head))
183 __put_compound_page(page_head); 193 __put_compound_page(page_head);
184 else 194 else
185 __put_single_page(page_head); 195 __put_single_page(page_head);
186 } 196 }
187 } else { 197out_put_single:
188 /* page_head is a dangling pointer */ 198 if (put_page_testzero(page))
189 VM_BUG_ON(PageTail(page)); 199 __put_single_page(page);
190 goto out_put_single; 200 return;
191 } 201 }
192 } else if (put_page_testzero(page)) { 202 VM_BUG_ON_PAGE(page_head != page->first_page, page);
193 if (PageHead(page)) 203 /*
194 __put_compound_page(page); 204 * We can release the refcount taken by
195 else 205 * get_page_unless_zero() now that
196 __put_single_page(page); 206 * __split_huge_page_refcount() is blocked on the
207 * compound_lock.
208 */
209 if (put_page_testzero(page_head))
210 VM_BUG_ON_PAGE(1, page_head);
211 /* __split_huge_page_refcount will wait now */
212 VM_BUG_ON_PAGE(page_mapcount(page) <= 0, page);
213 atomic_dec(&page->_mapcount);
214 VM_BUG_ON_PAGE(atomic_read(&page_head->_count) <= 0, page_head);
215 VM_BUG_ON_PAGE(atomic_read(&page->_count) != 0, page);
216 compound_unlock_irqrestore(page_head, flags);
217
218 if (put_page_testzero(page_head)) {
219 if (PageHead(page_head))
220 __put_compound_page(page_head);
221 else
222 __put_single_page(page_head);
223 }
224 } else {
225 /* page_head is a dangling pointer */
226 VM_BUG_ON_PAGE(PageTail(page), page);
227 goto out_put_single;
197 } 228 }
198} 229}
199 230
@@ -221,36 +252,37 @@ bool __get_page_tail(struct page *page)
221 * split_huge_page(). 252 * split_huge_page().
222 */ 253 */
223 unsigned long flags; 254 unsigned long flags;
224 bool got = false; 255 bool got;
225 struct page *page_head = compound_trans_head(page); 256 struct page *page_head = compound_trans_head(page);
226 257
227 if (likely(page != page_head && get_page_unless_zero(page_head))) { 258 /* Ref to put_compound_page() comment. */
228 /* Ref to put_compound_page() comment. */ 259 if (!__compound_tail_refcounted(page_head)) {
229 if (PageSlab(page_head) || PageHeadHuge(page_head)) { 260 smp_rmb();
230 if (likely(PageTail(page))) { 261 if (likely(PageTail(page))) {
231 /* 262 /*
232 * This is a hugetlbfs page or a slab 263 * This is a hugetlbfs page or a slab
233 * page. __split_huge_page_refcount 264 * page. __split_huge_page_refcount
234 * cannot race here. 265 * cannot race here.
235 */ 266 */
236 VM_BUG_ON(!PageHead(page_head)); 267 VM_BUG_ON_PAGE(!PageHead(page_head), page_head);
237 __get_page_tail_foll(page, false); 268 __get_page_tail_foll(page, true);
238 return true; 269 return true;
239 } else { 270 } else {
240 /* 271 /*
241 * __split_huge_page_refcount run 272 * __split_huge_page_refcount run
242 * before us, "page" was a THP 273 * before us, "page" was a THP
243 * tail. The split page_head has been 274 * tail. The split page_head has been
244 * freed and reallocated as slab or 275 * freed and reallocated as slab or
245 * hugetlbfs page of smaller order 276 * hugetlbfs page of smaller order
246 * (only possible if reallocated as 277 * (only possible if reallocated as
247 * slab on x86). 278 * slab on x86).
248 */ 279 */
249 put_page(page_head); 280 return false;
250 return false;
251 }
252 } 281 }
282 }
253 283
284 got = false;
285 if (likely(page != page_head && get_page_unless_zero(page_head))) {
254 /* 286 /*
255 * page_head wasn't a dangling pointer but it 287 * page_head wasn't a dangling pointer but it
256 * may not be a head page anymore by the time 288 * may not be a head page anymore by the time
@@ -572,8 +604,8 @@ EXPORT_SYMBOL(__lru_cache_add);
572 */ 604 */
573void lru_cache_add(struct page *page) 605void lru_cache_add(struct page *page)
574{ 606{
575 VM_BUG_ON(PageActive(page) && PageUnevictable(page)); 607 VM_BUG_ON_PAGE(PageActive(page) && PageUnevictable(page), page);
576 VM_BUG_ON(PageLRU(page)); 608 VM_BUG_ON_PAGE(PageLRU(page), page);
577 __lru_cache_add(page); 609 __lru_cache_add(page);
578} 610}
579 611
@@ -814,7 +846,7 @@ void release_pages(struct page **pages, int nr, int cold)
814 } 846 }
815 847
816 lruvec = mem_cgroup_page_lruvec(page, zone); 848 lruvec = mem_cgroup_page_lruvec(page, zone);
817 VM_BUG_ON(!PageLRU(page)); 849 VM_BUG_ON_PAGE(!PageLRU(page), page);
818 __ClearPageLRU(page); 850 __ClearPageLRU(page);
819 del_page_from_lru_list(page, lruvec, page_off_lru(page)); 851 del_page_from_lru_list(page, lruvec, page_off_lru(page));
820 } 852 }
@@ -856,9 +888,9 @@ void lru_add_page_tail(struct page *page, struct page *page_tail,
856{ 888{
857 const int file = 0; 889 const int file = 0;
858 890
859 VM_BUG_ON(!PageHead(page)); 891 VM_BUG_ON_PAGE(!PageHead(page), page);
860 VM_BUG_ON(PageCompound(page_tail)); 892 VM_BUG_ON_PAGE(PageCompound(page_tail), page);
861 VM_BUG_ON(PageLRU(page_tail)); 893 VM_BUG_ON_PAGE(PageLRU(page_tail), page);
862 VM_BUG_ON(NR_CPUS != 1 && 894 VM_BUG_ON(NR_CPUS != 1 &&
863 !spin_is_locked(&lruvec_zone(lruvec)->lru_lock)); 895 !spin_is_locked(&lruvec_zone(lruvec)->lru_lock));
864 896
@@ -897,7 +929,7 @@ static void __pagevec_lru_add_fn(struct page *page, struct lruvec *lruvec,
897 int active = PageActive(page); 929 int active = PageActive(page);
898 enum lru_list lru = page_lru(page); 930 enum lru_list lru = page_lru(page);
899 931
900 VM_BUG_ON(PageLRU(page)); 932 VM_BUG_ON_PAGE(PageLRU(page), page);
901 933
902 SetPageLRU(page); 934 SetPageLRU(page);
903 add_page_to_lru_list(page, lruvec, lru); 935 add_page_to_lru_list(page, lruvec, lru);
diff --git a/mm/swap_state.c b/mm/swap_state.c
index e6f15f8ca2af..98e85e9c2b2d 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -83,9 +83,9 @@ int __add_to_swap_cache(struct page *page, swp_entry_t entry)
83 int error; 83 int error;
84 struct address_space *address_space; 84 struct address_space *address_space;
85 85
86 VM_BUG_ON(!PageLocked(page)); 86 VM_BUG_ON_PAGE(!PageLocked(page), page);
87 VM_BUG_ON(PageSwapCache(page)); 87 VM_BUG_ON_PAGE(PageSwapCache(page), page);
88 VM_BUG_ON(!PageSwapBacked(page)); 88 VM_BUG_ON_PAGE(!PageSwapBacked(page), page);
89 89
90 page_cache_get(page); 90 page_cache_get(page);
91 SetPageSwapCache(page); 91 SetPageSwapCache(page);
@@ -139,9 +139,9 @@ void __delete_from_swap_cache(struct page *page)
139 swp_entry_t entry; 139 swp_entry_t entry;
140 struct address_space *address_space; 140 struct address_space *address_space;
141 141
142 VM_BUG_ON(!PageLocked(page)); 142 VM_BUG_ON_PAGE(!PageLocked(page), page);
143 VM_BUG_ON(!PageSwapCache(page)); 143 VM_BUG_ON_PAGE(!PageSwapCache(page), page);
144 VM_BUG_ON(PageWriteback(page)); 144 VM_BUG_ON_PAGE(PageWriteback(page), page);
145 145
146 entry.val = page_private(page); 146 entry.val = page_private(page);
147 address_space = swap_address_space(entry); 147 address_space = swap_address_space(entry);
@@ -165,8 +165,8 @@ int add_to_swap(struct page *page, struct list_head *list)
165 swp_entry_t entry; 165 swp_entry_t entry;
166 int err; 166 int err;
167 167
168 VM_BUG_ON(!PageLocked(page)); 168 VM_BUG_ON_PAGE(!PageLocked(page), page);
169 VM_BUG_ON(!PageUptodate(page)); 169 VM_BUG_ON_PAGE(!PageUptodate(page), page);
170 170
171 entry = get_swap_page(); 171 entry = get_swap_page();
172 if (!entry.val) 172 if (!entry.val)
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 612a7c9795f6..c6c13b050a58 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -616,7 +616,7 @@ scan:
616 } 616 }
617 } 617 }
618 offset = si->lowest_bit; 618 offset = si->lowest_bit;
619 while (++offset < scan_base) { 619 while (offset < scan_base) {
620 if (!si->swap_map[offset]) { 620 if (!si->swap_map[offset]) {
621 spin_lock(&si->lock); 621 spin_lock(&si->lock);
622 goto checks; 622 goto checks;
@@ -629,6 +629,7 @@ scan:
629 cond_resched(); 629 cond_resched();
630 latency_ration = LATENCY_LIMIT; 630 latency_ration = LATENCY_LIMIT;
631 } 631 }
632 offset++;
632 } 633 }
633 spin_lock(&si->lock); 634 spin_lock(&si->lock);
634 635
@@ -906,7 +907,7 @@ int reuse_swap_page(struct page *page)
906{ 907{
907 int count; 908 int count;
908 909
909 VM_BUG_ON(!PageLocked(page)); 910 VM_BUG_ON_PAGE(!PageLocked(page), page);
910 if (unlikely(PageKsm(page))) 911 if (unlikely(PageKsm(page)))
911 return 0; 912 return 0;
912 count = page_mapcount(page); 913 count = page_mapcount(page);
@@ -926,7 +927,7 @@ int reuse_swap_page(struct page *page)
926 */ 927 */
927int try_to_free_swap(struct page *page) 928int try_to_free_swap(struct page *page)
928{ 929{
929 VM_BUG_ON(!PageLocked(page)); 930 VM_BUG_ON_PAGE(!PageLocked(page), page);
930 931
931 if (!PageSwapCache(page)) 932 if (!PageSwapCache(page))
932 return 0; 933 return 0;
@@ -2714,7 +2715,7 @@ struct swap_info_struct *page_swap_info(struct page *page)
2714 */ 2715 */
2715struct address_space *__page_file_mapping(struct page *page) 2716struct address_space *__page_file_mapping(struct page *page)
2716{ 2717{
2717 VM_BUG_ON(!PageSwapCache(page)); 2718 VM_BUG_ON_PAGE(!PageSwapCache(page), page);
2718 return page_swap_info(page)->swap_file->f_mapping; 2719 return page_swap_info(page)->swap_file->f_mapping;
2719} 2720}
2720EXPORT_SYMBOL_GPL(__page_file_mapping); 2721EXPORT_SYMBOL_GPL(__page_file_mapping);
@@ -2722,7 +2723,7 @@ EXPORT_SYMBOL_GPL(__page_file_mapping);
2722pgoff_t __page_file_index(struct page *page) 2723pgoff_t __page_file_index(struct page *page)
2723{ 2724{
2724 swp_entry_t swap = { .val = page_private(page) }; 2725 swp_entry_t swap = { .val = page_private(page) };
2725 VM_BUG_ON(!PageSwapCache(page)); 2726 VM_BUG_ON_PAGE(!PageSwapCache(page), page);
2726 return swp_offset(swap); 2727 return swp_offset(swap);
2727} 2728}
2728EXPORT_SYMBOL_GPL(__page_file_index); 2729EXPORT_SYMBOL_GPL(__page_file_index);
diff --git a/mm/util.c b/mm/util.c
index 808f375648e7..a24aa22f2473 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -404,13 +404,45 @@ struct address_space *page_mapping(struct page *page)
404 return mapping; 404 return mapping;
405} 405}
406 406
407int overcommit_ratio_handler(struct ctl_table *table, int write,
408 void __user *buffer, size_t *lenp,
409 loff_t *ppos)
410{
411 int ret;
412
413 ret = proc_dointvec(table, write, buffer, lenp, ppos);
414 if (ret == 0 && write)
415 sysctl_overcommit_kbytes = 0;
416 return ret;
417}
418
419int overcommit_kbytes_handler(struct ctl_table *table, int write,
420 void __user *buffer, size_t *lenp,
421 loff_t *ppos)
422{
423 int ret;
424
425 ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
426 if (ret == 0 && write)
427 sysctl_overcommit_ratio = 0;
428 return ret;
429}
430
407/* 431/*
408 * Committed memory limit enforced when OVERCOMMIT_NEVER policy is used 432 * Committed memory limit enforced when OVERCOMMIT_NEVER policy is used
409 */ 433 */
410unsigned long vm_commit_limit(void) 434unsigned long vm_commit_limit(void)
411{ 435{
412 return ((totalram_pages - hugetlb_total_pages()) 436 unsigned long allowed;
413 * sysctl_overcommit_ratio / 100) + total_swap_pages; 437
438 if (sysctl_overcommit_kbytes)
439 allowed = sysctl_overcommit_kbytes >> (PAGE_SHIFT - 10);
440 else
441 allowed = ((totalram_pages - hugetlb_total_pages())
442 * sysctl_overcommit_ratio / 100);
443 allowed += total_swap_pages;
444
445 return allowed;
414} 446}
415 447
416 448
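With the new overcommit_kbytes knob, vm_commit_limit() prefers the absolute limit when it is non-zero and otherwise falls back to the percentage ratio; the two sysctl handlers above keep the knobs mutually exclusive by zeroing the other one on write. A standalone sketch of the resulting arithmetic (4K pages and the sample sizes below are assumptions for illustration):

#include <stdio.h>

#define PAGE_SHIFT 12   /* assume 4K pages */

static unsigned long commit_limit(unsigned long total_ram_pages,
                                  unsigned long hugetlb_pages,
                                  unsigned long total_swap_pages,
                                  unsigned long overcommit_kbytes,
                                  int overcommit_ratio)
{
        unsigned long allowed;

        if (overcommit_kbytes)
                allowed = overcommit_kbytes >> (PAGE_SHIFT - 10);  /* kB -> pages */
        else
                allowed = (total_ram_pages - hugetlb_pages)
                          * overcommit_ratio / 100;
        allowed += total_swap_pages;

        return allowed;
}

int main(void)
{
        /* 4 GiB of RAM, no hugetlb, 1 GiB of swap, all in 4K pages */
        unsigned long ram = 1048576, swap = 262144;

        printf("ratio=50%%              -> %lu pages\n",
               commit_limit(ram, 0, swap, 0, 50));
        printf("kbytes=3145728 (3 GiB) -> %lu pages\n",
               commit_limit(ram, 0, swap, 3145728, 50));
        return 0;
}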
diff --git a/mm/vmpressure.c b/mm/vmpressure.c
index e0f62837c3f4..196970a4541f 100644
--- a/mm/vmpressure.c
+++ b/mm/vmpressure.c
@@ -278,8 +278,7 @@ void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, int prio)
278 278
279/** 279/**
280 * vmpressure_register_event() - Bind vmpressure notifications to an eventfd 280 * vmpressure_register_event() - Bind vmpressure notifications to an eventfd
281 * @css: css that is interested in vmpressure notifications 281 * @memcg: memcg that is interested in vmpressure notifications
282 * @cft: cgroup control files handle
283 * @eventfd: eventfd context to link notifications with 282 * @eventfd: eventfd context to link notifications with
284 * @args: event arguments (used to set up a pressure level threshold) 283 * @args: event arguments (used to set up a pressure level threshold)
285 * 284 *
@@ -289,15 +288,12 @@ void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, int prio)
289 * threshold (one of vmpressure_str_levels, i.e. "low", "medium", or 288 * threshold (one of vmpressure_str_levels, i.e. "low", "medium", or
290 * "critical"). 289 * "critical").
291 * 290 *
292 * This function should not be used directly, just pass it to (struct 291 * To be used as memcg event method.
293 * cftype).register_event, and then cgroup core will handle everything by
294 * itself.
295 */ 292 */
296int vmpressure_register_event(struct cgroup_subsys_state *css, 293int vmpressure_register_event(struct mem_cgroup *memcg,
297 struct cftype *cft, struct eventfd_ctx *eventfd, 294 struct eventfd_ctx *eventfd, const char *args)
298 const char *args)
299{ 295{
300 struct vmpressure *vmpr = css_to_vmpressure(css); 296 struct vmpressure *vmpr = memcg_to_vmpressure(memcg);
301 struct vmpressure_event *ev; 297 struct vmpressure_event *ev;
302 int level; 298 int level;
303 299
@@ -325,23 +321,19 @@ int vmpressure_register_event(struct cgroup_subsys_state *css,
325 321
326/** 322/**
327 * vmpressure_unregister_event() - Unbind eventfd from vmpressure 323 * vmpressure_unregister_event() - Unbind eventfd from vmpressure
328 * @css: css handle 324 * @memcg: memcg handle
329 * @cft: cgroup control files handle
330 * @eventfd: eventfd context that was used to link vmpressure with the @cg 325 * @eventfd: eventfd context that was used to link vmpressure with the @cg
331 * 326 *
332 * This function does internal manipulations to detach the @eventfd from 327 * This function does internal manipulations to detach the @eventfd from
333 * the vmpressure notifications, and then frees internal resources 328 * the vmpressure notifications, and then frees internal resources
334 * associated with the @eventfd (but the @eventfd itself is not freed). 329 * associated with the @eventfd (but the @eventfd itself is not freed).
335 * 330 *
336 * This function should not be used directly, just pass it to (struct 331 * To be used as memcg event method.
337 * cftype).unregister_event, and then cgroup core will handle everything
338 * by itself.
339 */ 332 */
340void vmpressure_unregister_event(struct cgroup_subsys_state *css, 333void vmpressure_unregister_event(struct mem_cgroup *memcg,
341 struct cftype *cft,
342 struct eventfd_ctx *eventfd) 334 struct eventfd_ctx *eventfd)
343{ 335{
344 struct vmpressure *vmpr = css_to_vmpressure(css); 336 struct vmpressure *vmpr = memcg_to_vmpressure(memcg);
345 struct vmpressure_event *ev; 337 struct vmpressure_event *ev;
346 338
347 mutex_lock(&vmpr->events_lock); 339 mutex_lock(&vmpr->events_lock);
diff --git a/mm/vmscan.c b/mm/vmscan.c
index eea668d9cff6..a9c74b409681 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -147,7 +147,7 @@ static bool global_reclaim(struct scan_control *sc)
147} 147}
148#endif 148#endif
149 149
150unsigned long zone_reclaimable_pages(struct zone *zone) 150static unsigned long zone_reclaimable_pages(struct zone *zone)
151{ 151{
152 int nr; 152 int nr;
153 153
@@ -281,17 +281,34 @@ shrink_slab_node(struct shrink_control *shrinkctl, struct shrinker *shrinker,
281 nr_pages_scanned, lru_pages, 281 nr_pages_scanned, lru_pages,
282 max_pass, delta, total_scan); 282 max_pass, delta, total_scan);
283 283
284 while (total_scan >= batch_size) { 284 /*
285 * Normally, we should not scan less than batch_size objects in one
286 * pass to avoid too frequent shrinker calls, but if the slab has less
287 * than batch_size objects in total and we are really tight on memory,
288 * we will try to reclaim all available objects, otherwise we can end
289 * up failing allocations although there are plenty of reclaimable
290 * objects spread over several slabs with usage less than the
291 * batch_size.
292 *
293 * We detect the "tight on memory" situations by looking at the total
294 * number of objects we want to scan (total_scan). If it is greater
295 * than the total number of objects on slab (max_pass), we must be
296 * scanning at high prio and therefore should try to reclaim as much as
297 * possible.
298 */
299 while (total_scan >= batch_size ||
300 total_scan >= max_pass) {
285 unsigned long ret; 301 unsigned long ret;
302 unsigned long nr_to_scan = min(batch_size, total_scan);
286 303
287 shrinkctl->nr_to_scan = batch_size; 304 shrinkctl->nr_to_scan = nr_to_scan;
288 ret = shrinker->scan_objects(shrinker, shrinkctl); 305 ret = shrinker->scan_objects(shrinker, shrinkctl);
289 if (ret == SHRINK_STOP) 306 if (ret == SHRINK_STOP)
290 break; 307 break;
291 freed += ret; 308 freed += ret;
292 309
293 count_vm_events(SLABS_SCANNED, batch_size); 310 count_vm_events(SLABS_SCANNED, nr_to_scan);
294 total_scan -= batch_size; 311 total_scan -= nr_to_scan;
295 312
296 cond_resched(); 313 cond_resched();
297 } 314 }
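The reworked loop still scans in batch_size chunks but also makes a final sub-batch pass when total_scan covers the whole (small) slab, so tiny caches are no longer skipped when memory is tight. A self-contained sketch of just that loop condition (the trivial scan_objects() stub is invented for the demo):

#include <stdio.h>

#define MIN(a, b) ((a) < (b) ? (a) : (b))

/* pretend shrinker: frees whatever it is asked to scan */
static unsigned long scan_objects(unsigned long nr_to_scan)
{
        return nr_to_scan;
}

static unsigned long shrink_node(unsigned long total_scan,
                                 unsigned long max_pass,
                                 unsigned long batch_size)
{
        unsigned long freed = 0;

        if (!max_pass)                  /* nothing to reclaim on this node */
                return 0;

        while (total_scan >= batch_size || total_scan >= max_pass) {
                unsigned long nr_to_scan = MIN(batch_size, total_scan);

                freed += scan_objects(nr_to_scan);
                total_scan -= nr_to_scan;
        }
        return freed;
}

int main(void)
{
        /* a cache with only 60 objects and a batch size of 128: the old
         * "while (total_scan >= batch_size)" never called the shrinker,
         * the new condition still reclaims all 60 */
        printf("freed %lu objects\n", shrink_node(60, 60, 128));
        return 0;
}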
@@ -352,16 +369,17 @@ unsigned long shrink_slab(struct shrink_control *shrinkctl,
352 } 369 }
353 370
354 list_for_each_entry(shrinker, &shrinker_list, list) { 371 list_for_each_entry(shrinker, &shrinker_list, list) {
355 for_each_node_mask(shrinkctl->nid, shrinkctl->nodes_to_scan) { 372 if (!(shrinker->flags & SHRINKER_NUMA_AWARE)) {
356 if (!node_online(shrinkctl->nid)) 373 shrinkctl->nid = 0;
357 continue;
358
359 if (!(shrinker->flags & SHRINKER_NUMA_AWARE) &&
360 (shrinkctl->nid != 0))
361 break;
362
363 freed += shrink_slab_node(shrinkctl, shrinker, 374 freed += shrink_slab_node(shrinkctl, shrinker,
364 nr_pages_scanned, lru_pages); 375 nr_pages_scanned, lru_pages);
376 continue;
377 }
378
379 for_each_node_mask(shrinkctl->nid, shrinkctl->nodes_to_scan) {
380 if (node_online(shrinkctl->nid))
381 freed += shrink_slab_node(shrinkctl, shrinker,
382 nr_pages_scanned, lru_pages);
365 383
366 } 384 }
367 } 385 }
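After this change, shrinkers without SHRINKER_NUMA_AWARE are invoked exactly once with nid 0, while NUMA-aware ones are invoked once per online node. A simplified standalone sketch of that dispatch (the shrinker_stub type, node array and names are made up):

#include <stdio.h>

#define SHRINKER_NUMA_AWARE 0x1
#define MAX_NODES 4

struct shrinker_stub {
        const char *name;
        unsigned int flags;
};

static unsigned long shrink_one(struct shrinker_stub *s, int nid)
{
        printf("  %s: scanning node %d\n", s->name, nid);
        return 1;
}

static unsigned long shrink_all(struct shrinker_stub *shrinkers, int n,
                                const int *node_online)
{
        unsigned long freed = 0;
        int i, nid;

        for (i = 0; i < n; i++) {
                struct shrinker_stub *s = &shrinkers[i];

                if (!(s->flags & SHRINKER_NUMA_AWARE)) {
                        freed += shrink_one(s, 0);
                        continue;
                }
                for (nid = 0; nid < MAX_NODES; nid++)
                        if (node_online[nid])
                                freed += shrink_one(s, nid);
        }
        return freed;
}

int main(void)
{
        struct shrinker_stub shrinkers[] = {
                { "numa-aware", SHRINKER_NUMA_AWARE },
                { "simple", 0 },
        };
        int online[MAX_NODES] = { 1, 1, 0, 1 };

        printf("freed %lu batches\n", shrink_all(shrinkers, 2, online));
        return 0;
}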
@@ -603,7 +621,7 @@ void putback_lru_page(struct page *page)
603 bool is_unevictable; 621 bool is_unevictable;
604 int was_unevictable = PageUnevictable(page); 622 int was_unevictable = PageUnevictable(page);
605 623
606 VM_BUG_ON(PageLRU(page)); 624 VM_BUG_ON_PAGE(PageLRU(page), page);
607 625
608redo: 626redo:
609 ClearPageUnevictable(page); 627 ClearPageUnevictable(page);
@@ -794,8 +812,8 @@ static unsigned long shrink_page_list(struct list_head *page_list,
794 if (!trylock_page(page)) 812 if (!trylock_page(page))
795 goto keep; 813 goto keep;
796 814
797 VM_BUG_ON(PageActive(page)); 815 VM_BUG_ON_PAGE(PageActive(page), page);
798 VM_BUG_ON(page_zone(page) != zone); 816 VM_BUG_ON_PAGE(page_zone(page) != zone, page);
799 817
800 sc->nr_scanned++; 818 sc->nr_scanned++;
801 819
@@ -1079,14 +1097,14 @@ activate_locked:
1079 /* Not a candidate for swapping, so reclaim swap space. */ 1097 /* Not a candidate for swapping, so reclaim swap space. */
1080 if (PageSwapCache(page) && vm_swap_full()) 1098 if (PageSwapCache(page) && vm_swap_full())
1081 try_to_free_swap(page); 1099 try_to_free_swap(page);
1082 VM_BUG_ON(PageActive(page)); 1100 VM_BUG_ON_PAGE(PageActive(page), page);
1083 SetPageActive(page); 1101 SetPageActive(page);
1084 pgactivate++; 1102 pgactivate++;
1085keep_locked: 1103keep_locked:
1086 unlock_page(page); 1104 unlock_page(page);
1087keep: 1105keep:
1088 list_add(&page->lru, &ret_pages); 1106 list_add(&page->lru, &ret_pages);
1089 VM_BUG_ON(PageLRU(page) || PageUnevictable(page)); 1107 VM_BUG_ON_PAGE(PageLRU(page) || PageUnevictable(page), page);
1090 } 1108 }
1091 1109
1092 free_hot_cold_page_list(&free_pages, 1); 1110 free_hot_cold_page_list(&free_pages, 1);
@@ -1240,7 +1258,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
1240 page = lru_to_page(src); 1258 page = lru_to_page(src);
1241 prefetchw_prev_lru_page(page, src, flags); 1259 prefetchw_prev_lru_page(page, src, flags);
1242 1260
1243 VM_BUG_ON(!PageLRU(page)); 1261 VM_BUG_ON_PAGE(!PageLRU(page), page);
1244 1262
1245 switch (__isolate_lru_page(page, mode)) { 1263 switch (__isolate_lru_page(page, mode)) {
1246 case 0: 1264 case 0:
@@ -1295,7 +1313,7 @@ int isolate_lru_page(struct page *page)
1295{ 1313{
1296 int ret = -EBUSY; 1314 int ret = -EBUSY;
1297 1315
1298 VM_BUG_ON(!page_count(page)); 1316 VM_BUG_ON_PAGE(!page_count(page), page);
1299 1317
1300 if (PageLRU(page)) { 1318 if (PageLRU(page)) {
1301 struct zone *zone = page_zone(page); 1319 struct zone *zone = page_zone(page);
@@ -1366,7 +1384,7 @@ putback_inactive_pages(struct lruvec *lruvec, struct list_head *page_list)
1366 struct page *page = lru_to_page(page_list); 1384 struct page *page = lru_to_page(page_list);
1367 int lru; 1385 int lru;
1368 1386
1369 VM_BUG_ON(PageLRU(page)); 1387 VM_BUG_ON_PAGE(PageLRU(page), page);
1370 list_del(&page->lru); 1388 list_del(&page->lru);
1371 if (unlikely(!page_evictable(page))) { 1389 if (unlikely(!page_evictable(page))) {
1372 spin_unlock_irq(&zone->lru_lock); 1390 spin_unlock_irq(&zone->lru_lock);
@@ -1586,7 +1604,7 @@ static void move_active_pages_to_lru(struct lruvec *lruvec,
1586 page = lru_to_page(list); 1604 page = lru_to_page(list);
1587 lruvec = mem_cgroup_page_lruvec(page, zone); 1605 lruvec = mem_cgroup_page_lruvec(page, zone);
1588 1606
1589 VM_BUG_ON(PageLRU(page)); 1607 VM_BUG_ON_PAGE(PageLRU(page), page);
1590 SetPageLRU(page); 1608 SetPageLRU(page);
1591 1609
1592 nr_pages = hpage_nr_pages(page); 1610 nr_pages = hpage_nr_pages(page);
@@ -3297,27 +3315,6 @@ void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx)
3297 wake_up_interruptible(&pgdat->kswapd_wait); 3315 wake_up_interruptible(&pgdat->kswapd_wait);
3298} 3316}
3299 3317
3300/*
3301 * The reclaimable count would be mostly accurate.
3302 * The less reclaimable pages may be
3303 * - mlocked pages, which will be moved to unevictable list when encountered
3304 * - mapped pages, which may require several travels to be reclaimed
3305 * - dirty pages, which is not "instantly" reclaimable
3306 */
3307unsigned long global_reclaimable_pages(void)
3308{
3309 int nr;
3310
3311 nr = global_page_state(NR_ACTIVE_FILE) +
3312 global_page_state(NR_INACTIVE_FILE);
3313
3314 if (get_nr_swap_pages() > 0)
3315 nr += global_page_state(NR_ACTIVE_ANON) +
3316 global_page_state(NR_INACTIVE_ANON);
3317
3318 return nr;
3319}
3320
3321#ifdef CONFIG_HIBERNATION 3318#ifdef CONFIG_HIBERNATION
3322/* 3319/*
3323 * Try to free `nr_to_reclaim' of memory, system-wide, and return the number of 3320 * Try to free `nr_to_reclaim' of memory, system-wide, and return the number of
@@ -3701,7 +3698,7 @@ void check_move_unevictable_pages(struct page **pages, int nr_pages)
3701 if (page_evictable(page)) { 3698 if (page_evictable(page)) {
3702 enum lru_list lru = page_lru_base_type(page); 3699 enum lru_list lru = page_lru_base_type(page);
3703 3700
3704 VM_BUG_ON(PageActive(page)); 3701 VM_BUG_ON_PAGE(PageActive(page), page);
3705 ClearPageUnevictable(page); 3702 ClearPageUnevictable(page);
3706 del_page_from_lru_list(page, lruvec, LRU_UNEVICTABLE); 3703 del_page_from_lru_list(page, lruvec, LRU_UNEVICTABLE);
3707 add_page_to_lru_list(page, lruvec, lru); 3704 add_page_to_lru_list(page, lruvec, lru);
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
new file mode 100644
index 000000000000..c03ca5e9fe15
--- /dev/null
+++ b/mm/zsmalloc.c
@@ -0,0 +1,1106 @@
1/*
2 * zsmalloc memory allocator
3 *
4 * Copyright (C) 2011 Nitin Gupta
5 * Copyright (C) 2012, 2013 Minchan Kim
6 *
7 * This code is released using a dual license strategy: BSD/GPL
8 * You can choose the license that better fits your requirements.
9 *
10 * Released under the terms of 3-clause BSD License
11 * Released under the terms of GNU General Public License Version 2.0
12 */
13
14/*
15 * This allocator is designed for use with zram. Thus, the allocator is
16 * supposed to work well under low memory conditions. In particular, it
17 * never attempts higher order page allocation which is very likely to
18 * fail under memory pressure. On the other hand, if we just use single
19 * (0-order) pages, it would suffer from very high fragmentation --
20 * any object of size PAGE_SIZE/2 or larger would occupy an entire page.
21 * This was one of the major issues with its predecessor (xvmalloc).
22 *
23 * To overcome these issues, zsmalloc allocates a bunch of 0-order pages
24 * and links them together using various 'struct page' fields. These linked
25 * pages act as a single higher-order page i.e. an object can span 0-order
26 * page boundaries. The code refers to these linked pages as a single entity
27 * called zspage.
28 *
29 * For simplicity, zsmalloc can only allocate objects of size up to PAGE_SIZE
30 * since this satisfies the requirements of all its current users (in the
31 * worst case, page is incompressible and is thus stored "as-is" i.e. in
32 * uncompressed form). For allocation requests larger than this size, failure
33 * is returned (see zs_malloc).
34 *
35 * Additionally, zs_malloc() does not return a dereferenceable pointer.
36 * Instead, it returns an opaque handle (unsigned long) which encodes actual
37 * location of the allocated object. The reason for this indirection is that
38 * zsmalloc does not keep zspages permanently mapped since that would cause
39 * issues on 32-bit systems where the VA region for kernel space mappings
 40 * is very small. So, before using the allocated memory, the object has to
41 * be mapped using zs_map_object() to get a usable pointer and subsequently
42 * unmapped using zs_unmap_object().
43 *
44 * Following is how we use various fields and flags of underlying
45 * struct page(s) to form a zspage.
46 *
47 * Usage of struct page fields:
48 * page->first_page: points to the first component (0-order) page
49 * page->index (union with page->freelist): offset of the first object
50 * starting in this page. For the first page, this is
51 * always 0, so we use this field (aka freelist) to point
52 * to the first free object in zspage.
53 * page->lru: links together all component pages (except the first page)
54 * of a zspage
55 *
56 * For _first_ page only:
57 *
58 * page->private (union with page->first_page): refers to the
59 * component page after the first page
60 * page->freelist: points to the first free object in zspage.
61 * Free objects are linked together using in-place
62 * metadata.
63 * page->objects: maximum number of objects we can store in this
64 * zspage (class->zspage_order * PAGE_SIZE / class->size)
65 * page->lru: links together first pages of various zspages.
66 * Basically forming list of zspages in a fullness group.
67 * page->mapping: class index and fullness group of the zspage
68 *
69 * Usage of struct page flags:
70 * PG_private: identifies the first component page
71 * PG_private2: identifies the last component page
72 *
73 */
74
75#ifdef CONFIG_ZSMALLOC_DEBUG
76#define DEBUG
77#endif
78
79#include <linux/module.h>
80#include <linux/kernel.h>
81#include <linux/bitops.h>
82#include <linux/errno.h>
83#include <linux/highmem.h>
84#include <linux/string.h>
85#include <linux/slab.h>
86#include <asm/tlbflush.h>
87#include <asm/pgtable.h>
88#include <linux/cpumask.h>
89#include <linux/cpu.h>
90#include <linux/vmalloc.h>
91#include <linux/hardirq.h>
92#include <linux/spinlock.h>
93#include <linux/types.h>
94#include <linux/zsmalloc.h>
95
96/*
 97 * This must be a power of 2 and greater than or equal to sizeof(link_free).
98 * These two conditions ensure that any 'struct link_free' itself doesn't
99 * span more than 1 page which avoids complex case of mapping 2 pages simply
100 * to restore link_free pointer values.
101 */
102#define ZS_ALIGN 8
103
104/*
105 * A single 'zspage' is composed of up to 2^N discontiguous 0-order (single)
106 * pages. ZS_MAX_ZSPAGE_ORDER defines upper limit on N.
107 */
108#define ZS_MAX_ZSPAGE_ORDER 2
109#define ZS_MAX_PAGES_PER_ZSPAGE (_AC(1, UL) << ZS_MAX_ZSPAGE_ORDER)
110
111/*
112 * Object location (<PFN>, <obj_idx>) is encoded as
 113 * a single (unsigned long) handle value.
114 *
115 * Note that object index <obj_idx> is relative to system
116 * page <PFN> it is stored in, so for each sub-page belonging
117 * to a zspage, obj_idx starts with 0.
118 *
119 * This is made more complicated by various memory models and PAE.
120 */
121
122#ifndef MAX_PHYSMEM_BITS
123#ifdef CONFIG_HIGHMEM64G
124#define MAX_PHYSMEM_BITS 36
125#else /* !CONFIG_HIGHMEM64G */
126/*
127 * If this definition of MAX_PHYSMEM_BITS is used, OBJ_INDEX_BITS will just
128 * be PAGE_SHIFT
129 */
130#define MAX_PHYSMEM_BITS BITS_PER_LONG
131#endif
132#endif
133#define _PFN_BITS (MAX_PHYSMEM_BITS - PAGE_SHIFT)
134#define OBJ_INDEX_BITS (BITS_PER_LONG - _PFN_BITS)
135#define OBJ_INDEX_MASK ((_AC(1, UL) << OBJ_INDEX_BITS) - 1)
136
137#define MAX(a, b) ((a) >= (b) ? (a) : (b))
138 /* ZS_MIN_ALLOC_SIZE must be a multiple of ZS_ALIGN */
139#define ZS_MIN_ALLOC_SIZE \
140 MAX(32, (ZS_MAX_PAGES_PER_ZSPAGE << PAGE_SHIFT >> OBJ_INDEX_BITS))
141#define ZS_MAX_ALLOC_SIZE PAGE_SIZE
142
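/*
 * Worked example (illustrative): with the BITS_PER_LONG fallback above on
 * a 64-bit system with 4K pages (PAGE_SHIFT == 12), _PFN_BITS == 52 and
 * OBJ_INDEX_BITS == 12, so a handle carries the PFN in its upper 52 bits
 * and the (adjusted) object index in its lower 12 bits, and
 * ZS_MIN_ALLOC_SIZE == MAX(32, (4 << 12) >> 12) == 32 bytes.
 */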
143/*
144 * On systems with 4K page size, this gives 254 size classes! There is a
145 * trade-off here:
146 * - A large number of size classes is potentially wasteful as free pages are
147 * spread across these classes
148 * - A small number of size classes causes large internal fragmentation
149 * - Probably it's better to use specific size classes (empirically
150 * determined). NOTE: all those class sizes must be set as a multiple of
151 * ZS_ALIGN to make sure link_free itself never has to span 2 pages.
152 *
153 * ZS_MIN_ALLOC_SIZE and ZS_SIZE_CLASS_DELTA must be multiples of ZS_ALIGN
154 * (reason above)
155 */
156#define ZS_SIZE_CLASS_DELTA (PAGE_SIZE >> 8)
157#define ZS_SIZE_CLASSES ((ZS_MAX_ALLOC_SIZE - ZS_MIN_ALLOC_SIZE) / \
158 ZS_SIZE_CLASS_DELTA + 1)
159
160/*
161 * We do not maintain any list for completely empty or full pages
162 */
163enum fullness_group {
164 ZS_ALMOST_FULL,
165 ZS_ALMOST_EMPTY,
166 _ZS_NR_FULLNESS_GROUPS,
167
168 ZS_EMPTY,
169 ZS_FULL
170};
171
172/*
173 * We assign a page to ZS_ALMOST_EMPTY fullness group when:
174 * n <= N / f, where
175 * n = number of allocated objects
176 * N = total number of objects zspage can store
177 * f = 1/fullness_threshold_frac
178 *
179 * Similarly, we assign zspage to:
180 * ZS_ALMOST_FULL when n > N / f
181 * ZS_EMPTY when n == 0
182 * ZS_FULL when n == N
183 *
184 * (see: fix_fullness_group())
185 */
186static const int fullness_threshold_frac = 4;
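/*
 * Worked example (illustrative): for a zspage that can hold N == 128
 * objects with fullness_threshold_frac == 4, get_fullness_group() below
 * reports ZS_EMPTY at n == 0, ZS_ALMOST_EMPTY for 1 <= n <= 32,
 * ZS_ALMOST_FULL for 33 <= n <= 127 and ZS_FULL at n == 128.
 */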
187
188struct size_class {
189 /*
190 * Size of objects stored in this class. Must be a multiple
191 * of ZS_ALIGN.
192 */
193 int size;
194 unsigned int index;
195
196 /* Number of PAGE_SIZE sized pages to combine to form a 'zspage' */
197 int pages_per_zspage;
198
199 spinlock_t lock;
200
201 /* stats */
202 u64 pages_allocated;
203
204 struct page *fullness_list[_ZS_NR_FULLNESS_GROUPS];
205};
206
207/*
208 * Placed within free objects to form a singly linked list.
209 * For every zspage, first_page->freelist gives head of this list.
210 *
211 * This must be a power of 2 and less than or equal to ZS_ALIGN
212 */
213struct link_free {
214 /* Handle of next free chunk (encodes <PFN, obj_idx>) */
215 void *next;
216};
217
218struct zs_pool {
219 struct size_class size_class[ZS_SIZE_CLASSES];
220
221 gfp_t flags; /* allocation flags used when growing pool */
222};
223
224/*
225 * A zspage's class index and fullness group
226 * are encoded in its (first)page->mapping
227 */
228#define CLASS_IDX_BITS 28
229#define FULLNESS_BITS 4
230#define CLASS_IDX_MASK ((1 << CLASS_IDX_BITS) - 1)
231#define FULLNESS_MASK ((1 << FULLNESS_BITS) - 1)
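/*
 * Worked example (illustrative): for class_idx == 5 and fullness ==
 * ZS_ALMOST_EMPTY (1), set_zspage_mapping() below stores
 * (5 << FULLNESS_BITS) | 1 == 0x51 in first_page->mapping;
 * get_zspage_mapping() recovers both values by shifting and masking.
 */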
232
233struct mapping_area {
234#ifdef CONFIG_PGTABLE_MAPPING
235 struct vm_struct *vm; /* vm area for mapping objects that span pages */
236#else
237 char *vm_buf; /* copy buffer for objects that span pages */
238#endif
239 char *vm_addr; /* address of kmap_atomic()'ed pages */
240 enum zs_mapmode vm_mm; /* mapping mode */
241};
242
243
244/* per-cpu VM mapping areas for zspage accesses that cross page boundaries */
245static DEFINE_PER_CPU(struct mapping_area, zs_map_area);
246
247static int is_first_page(struct page *page)
248{
249 return PagePrivate(page);
250}
251
252static int is_last_page(struct page *page)
253{
254 return PagePrivate2(page);
255}
256
257static void get_zspage_mapping(struct page *page, unsigned int *class_idx,
258 enum fullness_group *fullness)
259{
260 unsigned long m;
261 BUG_ON(!is_first_page(page));
262
263 m = (unsigned long)page->mapping;
264 *fullness = m & FULLNESS_MASK;
265 *class_idx = (m >> FULLNESS_BITS) & CLASS_IDX_MASK;
266}
267
268static void set_zspage_mapping(struct page *page, unsigned int class_idx,
269 enum fullness_group fullness)
270{
271 unsigned long m;
272 BUG_ON(!is_first_page(page));
273
274 m = ((class_idx & CLASS_IDX_MASK) << FULLNESS_BITS) |
275 (fullness & FULLNESS_MASK);
276 page->mapping = (struct address_space *)m;
277}
278
279/*
280 * zsmalloc divides the pool into various size classes, where each
281 * class maintains a list of zspages and each zspage is divided
282 * into equal-sized chunks. Each allocation falls into one of these
283 * classes depending on its size. This function returns the index of the
284 * size class whose chunk size is big enough to hold the given size.
285 */
286static int get_size_class_index(int size)
287{
288 int idx = 0;
289
290 if (likely(size > ZS_MIN_ALLOC_SIZE))
291 idx = DIV_ROUND_UP(size - ZS_MIN_ALLOC_SIZE,
292 ZS_SIZE_CLASS_DELTA);
293
294 return idx;
295}
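/*
 * Worked example (illustrative, assuming ZS_MIN_ALLOC_SIZE == 32 and
 * ZS_SIZE_CLASS_DELTA == 16, i.e. 4K pages): a request of 100 bytes gives
 * idx = DIV_ROUND_UP(100 - 32, 16) = 5, i.e. the class with chunk size
 * 32 + 5 * 16 = 112, the smallest class able to hold 100 bytes.
 */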
296
297/*
298 * For each size class, zspages are divided into different groups
299 * depending on how "full" they are. This was done so that we could
300 * easily find empty or nearly empty zspages when we try to shrink
301 * the pool (not yet implemented). This function returns the fullness
302 * status of the given page.
303 */
304static enum fullness_group get_fullness_group(struct page *page)
305{
306 int inuse, max_objects;
307 enum fullness_group fg;
308 BUG_ON(!is_first_page(page));
309
310 inuse = page->inuse;
311 max_objects = page->objects;
312
313 if (inuse == 0)
314 fg = ZS_EMPTY;
315 else if (inuse == max_objects)
316 fg = ZS_FULL;
317 else if (inuse <= max_objects / fullness_threshold_frac)
318 fg = ZS_ALMOST_EMPTY;
319 else
320 fg = ZS_ALMOST_FULL;
321
322 return fg;
323}
324
325/*
326 * Each size class maintains various freelists and zspages are assigned
327 * to one of these freelists based on the number of live objects they
328 * have. This function inserts the given zspage into the freelist
329 * identified by <class, fullness_group>.
330 */
331static void insert_zspage(struct page *page, struct size_class *class,
332 enum fullness_group fullness)
333{
334 struct page **head;
335
336 BUG_ON(!is_first_page(page));
337
338 if (fullness >= _ZS_NR_FULLNESS_GROUPS)
339 return;
340
341 head = &class->fullness_list[fullness];
342 if (*head)
343 list_add_tail(&page->lru, &(*head)->lru);
344
345 *head = page;
346}
347
348/*
349 * This function removes the given zspage from the freelist identified
350 * by <class, fullness_group>.
351 */
352static void remove_zspage(struct page *page, struct size_class *class,
353 enum fullness_group fullness)
354{
355 struct page **head;
356
357 BUG_ON(!is_first_page(page));
358
359 if (fullness >= _ZS_NR_FULLNESS_GROUPS)
360 return;
361
362 head = &class->fullness_list[fullness];
363 BUG_ON(!*head);
364 if (list_empty(&(*head)->lru))
365 *head = NULL;
366 else if (*head == page)
367 *head = (struct page *)list_entry((*head)->lru.next,
368 struct page, lru);
369
370 list_del_init(&page->lru);
371}
372
373/*
374 * Each size class maintains zspages in different fullness groups depending
375 * on the number of live objects they contain. When allocating or freeing
376 * objects, the fullness status of the page can change, say, from ALMOST_FULL
377 * to ALMOST_EMPTY when freeing an object. This function checks if such
378 * a status change has occurred for the given page and accordingly moves the
379 * page from the freelist of the old fullness group to that of the new
380 * fullness group.
381 */
382static enum fullness_group fix_fullness_group(struct zs_pool *pool,
383 struct page *page)
384{
385 int class_idx;
386 struct size_class *class;
387 enum fullness_group currfg, newfg;
388
389 BUG_ON(!is_first_page(page));
390
391 get_zspage_mapping(page, &class_idx, &currfg);
392 newfg = get_fullness_group(page);
393 if (newfg == currfg)
394 goto out;
395
396 class = &pool->size_class[class_idx];
397 remove_zspage(page, class, currfg);
398 insert_zspage(page, class, newfg);
399 set_zspage_mapping(page, class_idx, newfg);
400
401out:
402 return newfg;
403}
404
405/*
406 * We have to decide on how many pages to link together
407 * to form a zspage for each size class. This is important
408 * to reduce wastage due to unusable space left at the end of
409 * each zspage, which is given as:
410 * wastage = Zp % size_class
411 * where Zp = zspage size = k * PAGE_SIZE where k = 1, 2, ...
412 *
413 * For example, for a size class of 3/8 * PAGE_SIZE, we should
414 * link together 3 PAGE_SIZE sized pages to form a zspage
415 * since then we can perfectly fit 8 such objects.
416 */
417static int get_pages_per_zspage(int class_size)
418{
419 int i, max_usedpc = 0;
420 /* zspage order which gives maximum used size per KB */
421 int max_usedpc_order = 1;
422
423 for (i = 1; i <= ZS_MAX_PAGES_PER_ZSPAGE; i++) {
424 int zspage_size;
425 int waste, usedpc;
426
427 zspage_size = i * PAGE_SIZE;
428 waste = zspage_size % class_size;
429 usedpc = (zspage_size - waste) * 100 / zspage_size;
430
431 if (usedpc > max_usedpc) {
432 max_usedpc = usedpc;
433 max_usedpc_order = i;
434 }
435 }
436
437 return max_usedpc_order;
438}
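/*
 * Worked example (illustrative, assuming 4K pages): for
 * class_size == 3/8 * PAGE_SIZE == 1536, the loop above computes
 *   i == 1: waste == 4096 % 1536 == 1024, usedpc == 75
 *   i == 2: waste ==  512, usedpc == 93
 *   i == 3: waste ==    0, usedpc == 100
 *   i == 4: waste == 1024, usedpc == 93
 * so 3 pages per zspage are chosen, matching the 3/8 * PAGE_SIZE example
 * in the comment above.
 */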
439
440/*
441 * A single 'zspage' is composed of many system pages which are
442 * linked together using fields in struct page. This function finds
443 * the first/head page, given any component page of a zspage.
444 */
445static struct page *get_first_page(struct page *page)
446{
447 if (is_first_page(page))
448 return page;
449 else
450 return page->first_page;
451}
452
453static struct page *get_next_page(struct page *page)
454{
455 struct page *next;
456
457 if (is_last_page(page))
458 next = NULL;
459 else if (is_first_page(page))
460 next = (struct page *)page_private(page);
461 else
462 next = list_entry(page->lru.next, struct page, lru);
463
464 return next;
465}
466
467/*
468 * Encode <page, obj_idx> as a single handle value.
469 * On hardware platforms with physical memory starting at 0x0 the pfn
470 * could be 0, so we ensure that the handle will never be 0 by adjusting the
471 * obj_idx value before encoding.
472 */
473static void *obj_location_to_handle(struct page *page, unsigned long obj_idx)
474{
475 unsigned long handle;
476
477 if (!page) {
478 BUG_ON(obj_idx);
479 return NULL;
480 }
481
482 handle = page_to_pfn(page) << OBJ_INDEX_BITS;
483 handle |= ((obj_idx + 1) & OBJ_INDEX_MASK);
484
485 return (void *)handle;
486}
487
488/*
489 * Decode <page, obj_idx> pair from the given object handle. We adjust the
490 * decoded obj_idx back to its original value since it was adjusted in
491 * obj_location_to_handle().
492 */
493static void obj_handle_to_location(unsigned long handle, struct page **page,
494 unsigned long *obj_idx)
495{
496 *page = pfn_to_page(handle >> OBJ_INDEX_BITS);
497 *obj_idx = (handle & OBJ_INDEX_MASK) - 1;
498}
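/*
 * Illustrative round trip (assuming OBJ_INDEX_BITS == 12): for a page with
 * PFN 0x1000 and obj_idx 3, obj_location_to_handle() produces
 * (0x1000 << 12) | (3 + 1) == 0x1000004, and obj_handle_to_location()
 * recovers PFN 0x1000 and obj_idx 3 by undoing the +1 adjustment. The +1
 * is what keeps the handle non-zero even when the PFN is 0.
 */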
499
500static unsigned long obj_idx_to_offset(struct page *page,
501 unsigned long obj_idx, int class_size)
502{
503 unsigned long off = 0;
504
505 if (!is_first_page(page))
506 off = page->index;
507
508 return off + obj_idx * class_size;
509}
510
511static void reset_page(struct page *page)
512{
513 clear_bit(PG_private, &page->flags);
514 clear_bit(PG_private_2, &page->flags);
515 set_page_private(page, 0);
516 page->mapping = NULL;
517 page->freelist = NULL;
518 page_mapcount_reset(page);
519}
520
521static void free_zspage(struct page *first_page)
522{
523 struct page *nextp, *tmp, *head_extra;
524
525 BUG_ON(!is_first_page(first_page));
526 BUG_ON(first_page->inuse);
527
528 head_extra = (struct page *)page_private(first_page);
529
530 reset_page(first_page);
531 __free_page(first_page);
532
533 /* zspage with only 1 system page */
534 if (!head_extra)
535 return;
536
537 list_for_each_entry_safe(nextp, tmp, &head_extra->lru, lru) {
538 list_del(&nextp->lru);
539 reset_page(nextp);
540 __free_page(nextp);
541 }
542 reset_page(head_extra);
543 __free_page(head_extra);
544}
545
546/* Initialize a newly allocated zspage */
547static void init_zspage(struct page *first_page, struct size_class *class)
548{
549 unsigned long off = 0;
550 struct page *page = first_page;
551
552 BUG_ON(!is_first_page(first_page));
553 while (page) {
554 struct page *next_page;
555 struct link_free *link;
556 unsigned int i, objs_on_page;
557
558 /*
559 * page->index stores offset of first object starting
560 * in the page. For the first page, this is always 0,
561 * so we use first_page->index (aka ->freelist) to store
562 * head of corresponding zspage's freelist.
563 */
564 if (page != first_page)
565 page->index = off;
566
567 link = (struct link_free *)kmap_atomic(page) +
568 off / sizeof(*link);
569 objs_on_page = (PAGE_SIZE - off) / class->size;
570
571 for (i = 1; i <= objs_on_page; i++) {
572 off += class->size;
573 if (off < PAGE_SIZE) {
574 link->next = obj_location_to_handle(page, i);
575 link += class->size / sizeof(*link);
576 }
577 }
578
579 /*
580 * We now come to the last (full or partial) object on this
581 * page, which must point to the first object on the next
582 * page (if present)
583 */
584 next_page = get_next_page(page);
585 link->next = obj_location_to_handle(next_page, 0);
586 kunmap_atomic(link);
587 page = next_page;
588 off = (off + class->size) % PAGE_SIZE;
589 }
590}
591
592/*
593 * Allocate a zspage for the given size class
594 */
595static struct page *alloc_zspage(struct size_class *class, gfp_t flags)
596{
597 int i, error;
598 struct page *first_page = NULL, *uninitialized_var(prev_page);
599
600 /*
601 * Allocate individual pages and link them together as:
602 * 1. first page->private = first sub-page
603 * 2. all sub-pages are linked together using page->lru
604 * 3. each sub-page is linked to the first page using page->first_page
605 *
606 * For each size class, First/Head pages are linked together using
607 * page->lru. Also, we set PG_private to identify the first page
608 * (i.e. no other sub-page has this flag set) and PG_private_2 to
609 * identify the last page.
610 */
611 error = -ENOMEM;
612 for (i = 0; i < class->pages_per_zspage; i++) {
613 struct page *page;
614
615 page = alloc_page(flags);
616 if (!page)
617 goto cleanup;
618
619 INIT_LIST_HEAD(&page->lru);
620 if (i == 0) { /* first page */
621 SetPagePrivate(page);
622 set_page_private(page, 0);
623 first_page = page;
624 first_page->inuse = 0;
625 }
626 if (i == 1)
627 set_page_private(first_page, (unsigned long)page);
628 if (i >= 1)
629 page->first_page = first_page;
630 if (i >= 2)
631 list_add(&page->lru, &prev_page->lru);
632 if (i == class->pages_per_zspage - 1) /* last page */
633 SetPagePrivate2(page);
634 prev_page = page;
635 }
636
637 init_zspage(first_page, class);
638
639 first_page->freelist = obj_location_to_handle(first_page, 0);
640 /* Maximum number of objects we can store in this zspage */
641 first_page->objects = class->pages_per_zspage * PAGE_SIZE / class->size;
642
643 error = 0; /* Success */
644
645cleanup:
646 if (unlikely(error) && first_page) {
647 free_zspage(first_page);
648 first_page = NULL;
649 }
650
651 return first_page;
652}
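/*
 * Illustrative layout (not part of the original file) of a zspage built by
 * alloc_zspage() above with pages_per_zspage == 3:
 *
 *   page0: PG_private set, page_private(page0) == page1,
 *          ->freelist == head of the zspage's free list
 *   page1: ->first_page == page0, ->index == offset of its first object,
 *          chained to page2 through ->lru
 *   page2: PG_private_2 set, ->first_page == page0, ->index as for page1
 */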
653
654static struct page *find_get_zspage(struct size_class *class)
655{
656 int i;
657 struct page *page;
658
659 for (i = 0; i < _ZS_NR_FULLNESS_GROUPS; i++) {
660 page = class->fullness_list[i];
661 if (page)
662 break;
663 }
664
665 return page;
666}
667
668#ifdef CONFIG_PGTABLE_MAPPING
669static inline int __zs_cpu_up(struct mapping_area *area)
670{
671 /*
672 * Make sure we don't leak memory if a cpu UP notification
673 * and zs_init() race and both call zs_cpu_up() on the same cpu
674 */
675 if (area->vm)
676 return 0;
677 area->vm = alloc_vm_area(PAGE_SIZE * 2, NULL);
678 if (!area->vm)
679 return -ENOMEM;
680 return 0;
681}
682
683static inline void __zs_cpu_down(struct mapping_area *area)
684{
685 if (area->vm)
686 free_vm_area(area->vm);
687 area->vm = NULL;
688}
689
690static inline void *__zs_map_object(struct mapping_area *area,
691 struct page *pages[2], int off, int size)
692{
693 BUG_ON(map_vm_area(area->vm, PAGE_KERNEL, &pages));
694 area->vm_addr = area->vm->addr;
695 return area->vm_addr + off;
696}
697
698static inline void __zs_unmap_object(struct mapping_area *area,
699 struct page *pages[2], int off, int size)
700{
701 unsigned long addr = (unsigned long)area->vm_addr;
702
703 unmap_kernel_range(addr, PAGE_SIZE * 2);
704}
705
706#else /* CONFIG_PGTABLE_MAPPING */
707
708static inline int __zs_cpu_up(struct mapping_area *area)
709{
710 /*
711 * Make sure we don't leak memory if a cpu UP notification
712 * and zs_init() race and both call zs_cpu_up() on the same cpu
713 */
714 if (area->vm_buf)
715 return 0;
716 area->vm_buf = (char *)__get_free_page(GFP_KERNEL);
717 if (!area->vm_buf)
718 return -ENOMEM;
719 return 0;
720}
721
722static inline void __zs_cpu_down(struct mapping_area *area)
723{
724 if (area->vm_buf)
725 free_page((unsigned long)area->vm_buf);
726 area->vm_buf = NULL;
727}
728
729static void *__zs_map_object(struct mapping_area *area,
730 struct page *pages[2], int off, int size)
731{
732 int sizes[2];
733 void *addr;
734 char *buf = area->vm_buf;
735
736 /* disable page faults to match kmap_atomic() return conditions */
737 pagefault_disable();
738
739 /* no read fastpath */
740 if (area->vm_mm == ZS_MM_WO)
741 goto out;
742
743 sizes[0] = PAGE_SIZE - off;
744 sizes[1] = size - sizes[0];
745
746 /* copy object to per-cpu buffer */
747 addr = kmap_atomic(pages[0]);
748 memcpy(buf, addr + off, sizes[0]);
749 kunmap_atomic(addr);
750 addr = kmap_atomic(pages[1]);
751 memcpy(buf + sizes[0], addr, sizes[1]);
752 kunmap_atomic(addr);
753out:
754 return area->vm_buf;
755}
756
757static void __zs_unmap_object(struct mapping_area *area,
758 struct page *pages[2], int off, int size)
759{
760 int sizes[2];
761 void *addr;
762 char *buf = area->vm_buf;
763
764 /* no write fastpath */
765 if (area->vm_mm == ZS_MM_RO)
766 goto out;
767
768 sizes[0] = PAGE_SIZE - off;
769 sizes[1] = size - sizes[0];
770
771 /* copy per-cpu buffer to object */
772 addr = kmap_atomic(pages[0]);
773 memcpy(addr + off, buf, sizes[0]);
774 kunmap_atomic(addr);
775 addr = kmap_atomic(pages[1]);
776 memcpy(addr, buf + sizes[0], sizes[1]);
777 kunmap_atomic(addr);
778
779out:
780 /* enable page faults to match kunmap_atomic() return conditions */
781 pagefault_enable();
782}
783
784#endif /* CONFIG_PGTABLE_MAPPING */
785
786static int zs_cpu_notifier(struct notifier_block *nb, unsigned long action,
787 void *pcpu)
788{
789 int ret, cpu = (long)pcpu;
790 struct mapping_area *area;
791
792 switch (action) {
793 case CPU_UP_PREPARE:
794 area = &per_cpu(zs_map_area, cpu);
795 ret = __zs_cpu_up(area);
796 if (ret)
797 return notifier_from_errno(ret);
798 break;
799 case CPU_DEAD:
800 case CPU_UP_CANCELED:
801 area = &per_cpu(zs_map_area, cpu);
802 __zs_cpu_down(area);
803 break;
804 }
805
806 return NOTIFY_OK;
807}
808
809static struct notifier_block zs_cpu_nb = {
810 .notifier_call = zs_cpu_notifier
811};
812
813static void zs_exit(void)
814{
815 int cpu;
816
817 for_each_online_cpu(cpu)
818 zs_cpu_notifier(NULL, CPU_DEAD, (void *)(long)cpu);
819 unregister_cpu_notifier(&zs_cpu_nb);
820}
821
822static int zs_init(void)
823{
824 int cpu, ret;
825
826 register_cpu_notifier(&zs_cpu_nb);
827 for_each_online_cpu(cpu) {
828 ret = zs_cpu_notifier(NULL, CPU_UP_PREPARE, (void *)(long)cpu);
829 if (notifier_to_errno(ret))
830 goto fail;
831 }
832 return 0;
833fail:
834 zs_exit();
835 return notifier_to_errno(ret);
836}
837
838/**
839 * zs_create_pool - Creates an allocation pool to work from.
840 * @flags: allocation flags used to allocate pool metadata
841 *
842 * This function must be called before anything else when using
843 * the zsmalloc allocator.
844 *
845 * On success, a pointer to the newly created pool is returned,
846 * otherwise NULL.
847 */
848struct zs_pool *zs_create_pool(gfp_t flags)
849{
850 int i, ovhd_size;
851 struct zs_pool *pool;
852
853 ovhd_size = roundup(sizeof(*pool), PAGE_SIZE);
854 pool = kzalloc(ovhd_size, GFP_KERNEL);
855 if (!pool)
856 return NULL;
857
858 for (i = 0; i < ZS_SIZE_CLASSES; i++) {
859 int size;
860 struct size_class *class;
861
862 size = ZS_MIN_ALLOC_SIZE + i * ZS_SIZE_CLASS_DELTA;
863 if (size > ZS_MAX_ALLOC_SIZE)
864 size = ZS_MAX_ALLOC_SIZE;
865
866 class = &pool->size_class[i];
867 class->size = size;
868 class->index = i;
869 spin_lock_init(&class->lock);
870 class->pages_per_zspage = get_pages_per_zspage(size);
871
872 }
873
874 pool->flags = flags;
875
876 return pool;
877}
878EXPORT_SYMBOL_GPL(zs_create_pool);
879
880void zs_destroy_pool(struct zs_pool *pool)
881{
882 int i;
883
884 for (i = 0; i < ZS_SIZE_CLASSES; i++) {
885 int fg;
886 struct size_class *class = &pool->size_class[i];
887
888 for (fg = 0; fg < _ZS_NR_FULLNESS_GROUPS; fg++) {
889 if (class->fullness_list[fg]) {
890 pr_info("Freeing non-empty class with size %db, fullness group %d\n",
891 class->size, fg);
892 }
893 }
894 }
895 kfree(pool);
896}
897EXPORT_SYMBOL_GPL(zs_destroy_pool);
898
899/**
900 * zs_malloc - Allocate block of given size from pool.
901 * @pool: pool to allocate from
902 * @size: size of block to allocate
903 *
904 * On success, a handle to the allocated object is returned,
905 * otherwise 0.
906 * Allocation requests with size > ZS_MAX_ALLOC_SIZE will fail.
907 */
908unsigned long zs_malloc(struct zs_pool *pool, size_t size)
909{
910 unsigned long obj;
911 struct link_free *link;
912 int class_idx;
913 struct size_class *class;
914
915 struct page *first_page, *m_page;
916 unsigned long m_objidx, m_offset;
917
918 if (unlikely(!size || size > ZS_MAX_ALLOC_SIZE))
919 return 0;
920
921 class_idx = get_size_class_index(size);
922 class = &pool->size_class[class_idx];
923 BUG_ON(class_idx != class->index);
924
925 spin_lock(&class->lock);
926 first_page = find_get_zspage(class);
927
928 if (!first_page) {
929 spin_unlock(&class->lock);
930 first_page = alloc_zspage(class, pool->flags);
931 if (unlikely(!first_page))
932 return 0;
933
934 set_zspage_mapping(first_page, class->index, ZS_EMPTY);
935 spin_lock(&class->lock);
936 class->pages_allocated += class->pages_per_zspage;
937 }
938
939 obj = (unsigned long)first_page->freelist;
940 obj_handle_to_location(obj, &m_page, &m_objidx);
941 m_offset = obj_idx_to_offset(m_page, m_objidx, class->size);
942
943 link = (struct link_free *)kmap_atomic(m_page) +
944 m_offset / sizeof(*link);
945 first_page->freelist = link->next;
946 memset(link, POISON_INUSE, sizeof(*link));
947 kunmap_atomic(link);
948
949 first_page->inuse++;
950 /* Now move the zspage to another fullness group, if required */
951 fix_fullness_group(pool, first_page);
952 spin_unlock(&class->lock);
953
954 return obj;
955}
956EXPORT_SYMBOL_GPL(zs_malloc);
957
958void zs_free(struct zs_pool *pool, unsigned long obj)
959{
960 struct link_free *link;
961 struct page *first_page, *f_page;
962 unsigned long f_objidx, f_offset;
963
964 int class_idx;
965 struct size_class *class;
966 enum fullness_group fullness;
967
968 if (unlikely(!obj))
969 return;
970
971 obj_handle_to_location(obj, &f_page, &f_objidx);
972 first_page = get_first_page(f_page);
973
974 get_zspage_mapping(first_page, &class_idx, &fullness);
975 class = &pool->size_class[class_idx];
976 f_offset = obj_idx_to_offset(f_page, f_objidx, class->size);
977
978 spin_lock(&class->lock);
979
980 /* Insert this object in containing zspage's freelist */
981 link = (struct link_free *)((unsigned char *)kmap_atomic(f_page)
982 + f_offset);
983 link->next = first_page->freelist;
984 kunmap_atomic(link);
985 first_page->freelist = (void *)obj;
986
987 first_page->inuse--;
988 fullness = fix_fullness_group(pool, first_page);
989
990 if (fullness == ZS_EMPTY)
991 class->pages_allocated -= class->pages_per_zspage;
992
993 spin_unlock(&class->lock);
994
995 if (fullness == ZS_EMPTY)
996 free_zspage(first_page);
997}
998EXPORT_SYMBOL_GPL(zs_free);
999
1000/**
1001 * zs_map_object - get address of allocated object from handle.
1002 * @pool: pool from which the object was allocated
1003 * @handle: handle returned from zs_malloc
1004 *
1005 * Before using an object allocated from zs_malloc, it must be mapped using
1006 * this function. When done with the object, it must be unmapped using
1007 * zs_unmap_object.
1008 *
1009 * Only one object can be mapped per cpu at a time. There is no protection
1010 * against nested mappings.
1011 *
1012 * This function returns with preemption and page faults disabled.
1013 */
1014void *zs_map_object(struct zs_pool *pool, unsigned long handle,
1015 enum zs_mapmode mm)
1016{
1017 struct page *page;
1018 unsigned long obj_idx, off;
1019
1020 unsigned int class_idx;
1021 enum fullness_group fg;
1022 struct size_class *class;
1023 struct mapping_area *area;
1024 struct page *pages[2];
1025
1026 BUG_ON(!handle);
1027
1028 /*
1029 * Because we use per-cpu mapping areas shared among the
1030 * pools/users, we can't allow mapping in interrupt context
1031 * because it can corrupt another user's mappings.
1032 */
1033 BUG_ON(in_interrupt());
1034
1035 obj_handle_to_location(handle, &page, &obj_idx);
1036 get_zspage_mapping(get_first_page(page), &class_idx, &fg);
1037 class = &pool->size_class[class_idx];
1038 off = obj_idx_to_offset(page, obj_idx, class->size);
1039
1040 area = &get_cpu_var(zs_map_area);
1041 area->vm_mm = mm;
1042 if (off + class->size <= PAGE_SIZE) {
1043 /* this object is contained entirely within a page */
1044 area->vm_addr = kmap_atomic(page);
1045 return area->vm_addr + off;
1046 }
1047
1048 /* this object spans two pages */
1049 pages[0] = page;
1050 pages[1] = get_next_page(page);
1051 BUG_ON(!pages[1]);
1052
1053 return __zs_map_object(area, pages, off, class->size);
1054}
1055EXPORT_SYMBOL_GPL(zs_map_object);
1056
1057void zs_unmap_object(struct zs_pool *pool, unsigned long handle)
1058{
1059 struct page *page;
1060 unsigned long obj_idx, off;
1061
1062 unsigned int class_idx;
1063 enum fullness_group fg;
1064 struct size_class *class;
1065 struct mapping_area *area;
1066
1067 BUG_ON(!handle);
1068
1069 obj_handle_to_location(handle, &page, &obj_idx);
1070 get_zspage_mapping(get_first_page(page), &class_idx, &fg);
1071 class = &pool->size_class[class_idx];
1072 off = obj_idx_to_offset(page, obj_idx, class->size);
1073
1074 area = &__get_cpu_var(zs_map_area);
1075 if (off + class->size <= PAGE_SIZE)
1076 kunmap_atomic(area->vm_addr);
1077 else {
1078 struct page *pages[2];
1079
1080 pages[0] = page;
1081 pages[1] = get_next_page(page);
1082 BUG_ON(!pages[1]);
1083
1084 __zs_unmap_object(area, pages, off, class->size);
1085 }
1086 put_cpu_var(zs_map_area);
1087}
1088EXPORT_SYMBOL_GPL(zs_unmap_object);
1089
1090u64 zs_get_total_size_bytes(struct zs_pool *pool)
1091{
1092 int i;
1093 u64 npages = 0;
1094
1095 for (i = 0; i < ZS_SIZE_CLASSES; i++)
1096 npages += pool->size_class[i].pages_allocated;
1097
1098 return npages << PAGE_SHIFT;
1099}
1100EXPORT_SYMBOL_GPL(zs_get_total_size_bytes);
1101
1102module_init(zs_init);
1103module_exit(zs_exit);
1104
1105MODULE_LICENSE("Dual BSD/GPL");
1106MODULE_AUTHOR("Nitin Gupta <ngupta@vflare.org>");
diff --git a/mm/zswap.c b/mm/zswap.c
index 5a63f78a5601..e55bab9dc41f 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -77,12 +77,12 @@ static u64 zswap_duplicate_entry;
77**********************************/ 77**********************************/
78/* Enable/disable zswap (disabled by default, fixed at boot for now) */ 78/* Enable/disable zswap (disabled by default, fixed at boot for now) */
79static bool zswap_enabled __read_mostly; 79static bool zswap_enabled __read_mostly;
80module_param_named(enabled, zswap_enabled, bool, 0); 80module_param_named(enabled, zswap_enabled, bool, 0444);
81 81
82/* Compressor to be used by zswap (fixed at boot for now) */ 82/* Compressor to be used by zswap (fixed at boot for now) */
83#define ZSWAP_COMPRESSOR_DEFAULT "lzo" 83#define ZSWAP_COMPRESSOR_DEFAULT "lzo"
84static char *zswap_compressor = ZSWAP_COMPRESSOR_DEFAULT; 84static char *zswap_compressor = ZSWAP_COMPRESSOR_DEFAULT;
85module_param_named(compressor, zswap_compressor, charp, 0); 85module_param_named(compressor, zswap_compressor, charp, 0444);
86 86
87/* The maximum percentage of memory that the compressed pool can occupy */ 87/* The maximum percentage of memory that the compressed pool can occupy */
88static unsigned int zswap_max_pool_percent = 20; 88static unsigned int zswap_max_pool_percent = 20;