author		Linus Torvalds <torvalds@linux-foundation.org>	2015-09-08 20:52:23 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2015-09-08 20:52:23 -0400
commit		f6f7a6369203fa3e07efb7f35cfd81efe9f25b07 (patch)
tree		97bec9ddd999040822acf314647eaf4208213589 /mm
parent		839fe9156fbe89c3157aa6146d22090f8cffddd8 (diff)
parent		df69f52d990bd85159727bd26e819d3a6e49c666 (diff)
Merge branch 'akpm' (patches from Andrew)
Merge second patch-bomb from Andrew Morton:
 "Almost all of the rest of MM. There was an unusually large amount of
  MM material this time"

* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (141 commits)
  zpool: remove no-op module init/exit
  mm: zbud: constify the zbud_ops
  mm: zpool: constify the zpool_ops
  mm: swap: zswap: maybe_preload & refactoring
  zram: unify error reporting
  zsmalloc: remove null check from destroy_handle_cache()
  zsmalloc: do not take class lock in zs_shrinker_count()
  zsmalloc: use class->pages_per_zspage
  zsmalloc: consider ZS_ALMOST_FULL as migrate source
  zsmalloc: partial page ordering within a fullness_list
  zsmalloc: use shrinker to trigger auto-compaction
  zsmalloc: account the number of compacted pages
  zsmalloc/zram: introduce zs_pool_stats api
  zsmalloc: cosmetic compaction code adjustments
  zsmalloc: introduce zs_can_compact() function
  zsmalloc: always keep per-class stats
  zsmalloc: drop unused variable `nr_to_migrate'
  mm/memblock.c: fix comment in __next_mem_range()
  mm/page_alloc.c: fix type information of memoryless node
  memory-hotplug: fix comments in zone_spanned_pages_in_node() and zone_spanned_pages_in_node()
  ...
Diffstat (limited to 'mm')
-rw-r--r--  mm/bootmem.c  7
-rw-r--r--  mm/compaction.c  175
-rw-r--r--  mm/dmapool.c  12
-rw-r--r--  mm/early_ioremap.c  22
-rw-r--r--  mm/filemap.c  36
-rw-r--r--  mm/huge_memory.c  163
-rw-r--r--  mm/hugetlb.c  432
-rw-r--r--  mm/hwpoison-inject.c  2
-rw-r--r--  mm/internal.h  1
-rw-r--r--  mm/kmemleak.c  3
-rw-r--r--  mm/list_lru.c  4
-rw-r--r--  mm/madvise.c  2
-rw-r--r--  mm/memblock.c  31
-rw-r--r--  mm/memcontrol.c  394
-rw-r--r--  mm/memory-failure.c  103
-rw-r--r--  mm/memory.c  48
-rw-r--r--  mm/mempolicy.c  7
-rw-r--r--  mm/mempool.c  3
-rw-r--r--  mm/memtest.c  27
-rw-r--r--  mm/migrate.c  13
-rw-r--r--  mm/mmap.c  71
-rw-r--r--  mm/oom_kill.c  142
-rw-r--r--  mm/page_alloc.c  80
-rw-r--r--  mm/page_isolation.c  35
-rw-r--r--  mm/shmem.c  16
-rw-r--r--  mm/slab.c  2
-rw-r--r--  mm/slab_common.c  5
-rw-r--r--  mm/slob.c  4
-rw-r--r--  mm/slub.c  2
-rw-r--r--  mm/swap_state.c  37
-rw-r--r--  mm/swapfile.c  42
-rw-r--r--  mm/vmscan.c  14
-rw-r--r--  mm/zbud.c  10
-rw-r--r--  mm/zpool.c  18
-rw-r--r--  mm/zsmalloc.c  235
-rw-r--r--  mm/zswap.c  75
36 files changed, 1243 insertions, 1030 deletions
diff --git a/mm/bootmem.c b/mm/bootmem.c
index a23dd1934654..3b6380784c28 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -236,6 +236,7 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
 	count += pages;
 	while (pages--)
 		__free_pages_bootmem(page++, cur++, 0);
+	bdata->node_bootmem_map = NULL;
 
 	bdebug("nid=%td released=%lx\n", bdata - bootmem_node_data, count);
 
@@ -294,6 +295,9 @@ static void __init __free(bootmem_data_t *bdata,
 		sidx + bdata->node_min_pfn,
 		eidx + bdata->node_min_pfn);
 
+	if (WARN_ON(bdata->node_bootmem_map == NULL))
+		return;
+
 	if (bdata->hint_idx > sidx)
 		bdata->hint_idx = sidx;
 
@@ -314,6 +318,9 @@ static int __init __reserve(bootmem_data_t *bdata, unsigned long sidx,
 		eidx + bdata->node_min_pfn,
 		flags);
 
+	if (WARN_ON(bdata->node_bootmem_map == NULL))
+		return 0;
+
 	for (idx = sidx; idx < eidx; idx++)
 		if (test_and_set_bit(idx, bdata->node_bootmem_map)) {
 			if (exclusive) {
diff --git a/mm/compaction.c b/mm/compaction.c
index 018f08da99a2..c5c627aae996 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -207,6 +207,13 @@ static inline bool isolation_suitable(struct compact_control *cc,
207 return !get_pageblock_skip(page); 207 return !get_pageblock_skip(page);
208} 208}
209 209
210static void reset_cached_positions(struct zone *zone)
211{
212 zone->compact_cached_migrate_pfn[0] = zone->zone_start_pfn;
213 zone->compact_cached_migrate_pfn[1] = zone->zone_start_pfn;
214 zone->compact_cached_free_pfn = zone_end_pfn(zone);
215}
216
210/* 217/*
211 * This function is called to clear all cached information on pageblocks that 218 * This function is called to clear all cached information on pageblocks that
212 * should be skipped for page isolation when the migrate and free page scanner 219 * should be skipped for page isolation when the migrate and free page scanner
@@ -218,9 +225,6 @@ static void __reset_isolation_suitable(struct zone *zone)
218 unsigned long end_pfn = zone_end_pfn(zone); 225 unsigned long end_pfn = zone_end_pfn(zone);
219 unsigned long pfn; 226 unsigned long pfn;
220 227
221 zone->compact_cached_migrate_pfn[0] = start_pfn;
222 zone->compact_cached_migrate_pfn[1] = start_pfn;
223 zone->compact_cached_free_pfn = end_pfn;
224 zone->compact_blockskip_flush = false; 228 zone->compact_blockskip_flush = false;
225 229
226 /* Walk the zone and mark every pageblock as suitable for isolation */ 230 /* Walk the zone and mark every pageblock as suitable for isolation */
@@ -238,6 +242,8 @@ static void __reset_isolation_suitable(struct zone *zone)
238 242
239 clear_pageblock_skip(page); 243 clear_pageblock_skip(page);
240 } 244 }
245
246 reset_cached_positions(zone);
241} 247}
242 248
243void reset_isolation_suitable(pg_data_t *pgdat) 249void reset_isolation_suitable(pg_data_t *pgdat)
@@ -431,6 +437,24 @@ static unsigned long isolate_freepages_block(struct compact_control *cc,
431 437
432 if (!valid_page) 438 if (!valid_page)
433 valid_page = page; 439 valid_page = page;
440
441 /*
442 * For compound pages such as THP and hugetlbfs, we can save
443 * potentially a lot of iterations if we skip them at once.
444 * The check is racy, but we can consider only valid values
445 * and the only danger is skipping too much.
446 */
447 if (PageCompound(page)) {
448 unsigned int comp_order = compound_order(page);
449
450 if (likely(comp_order < MAX_ORDER)) {
451 blockpfn += (1UL << comp_order) - 1;
452 cursor += (1UL << comp_order) - 1;
453 }
454
455 goto isolate_fail;
456 }
457
434 if (!PageBuddy(page)) 458 if (!PageBuddy(page))
435 goto isolate_fail; 459 goto isolate_fail;
436 460
@@ -490,6 +514,13 @@ isolate_fail:
490 514
491 } 515 }
492 516
517 /*
518 * There is a tiny chance that we have read bogus compound_order(),
519 * so be careful to not go outside of the pageblock.
520 */
521 if (unlikely(blockpfn > end_pfn))
522 blockpfn = end_pfn;
523
493 trace_mm_compaction_isolate_freepages(*start_pfn, blockpfn, 524 trace_mm_compaction_isolate_freepages(*start_pfn, blockpfn,
494 nr_scanned, total_isolated); 525 nr_scanned, total_isolated);
495 526
@@ -674,6 +705,8 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
674 705
675 /* Time to isolate some pages for migration */ 706 /* Time to isolate some pages for migration */
676 for (; low_pfn < end_pfn; low_pfn++) { 707 for (; low_pfn < end_pfn; low_pfn++) {
708 bool is_lru;
709
677 /* 710 /*
678 * Periodically drop the lock (if held) regardless of its 711 * Periodically drop the lock (if held) regardless of its
679 * contention, to give chance to IRQs. Abort async compaction 712 * contention, to give chance to IRQs. Abort async compaction
@@ -717,36 +750,35 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
717 * It's possible to migrate LRU pages and balloon pages 750 * It's possible to migrate LRU pages and balloon pages
718 * Skip any other type of page 751 * Skip any other type of page
719 */ 752 */
720 if (!PageLRU(page)) { 753 is_lru = PageLRU(page);
754 if (!is_lru) {
721 if (unlikely(balloon_page_movable(page))) { 755 if (unlikely(balloon_page_movable(page))) {
722 if (balloon_page_isolate(page)) { 756 if (balloon_page_isolate(page)) {
723 /* Successfully isolated */ 757 /* Successfully isolated */
724 goto isolate_success; 758 goto isolate_success;
725 } 759 }
726 } 760 }
727 continue;
728 } 761 }
729 762
730 /* 763 /*
731 * PageLRU is set. lru_lock normally excludes isolation 764 * Regardless of being on LRU, compound pages such as THP and
732 * splitting and collapsing (collapsing has already happened 765 * hugetlbfs are not to be compacted. We can potentially save
733 * if PageLRU is set) but the lock is not necessarily taken 766 * a lot of iterations if we skip them at once. The check is
734 * here and it is wasteful to take it just to check transhuge. 767 * racy, but we can consider only valid values and the only
735 * Check TransHuge without lock and skip the whole pageblock if 768 * danger is skipping too much.
736 * it's either a transhuge or hugetlbfs page, as calling
737 * compound_order() without preventing THP from splitting the
738 * page underneath us may return surprising results.
739 */ 769 */
740 if (PageTransHuge(page)) { 770 if (PageCompound(page)) {
741 if (!locked) 771 unsigned int comp_order = compound_order(page);
742 low_pfn = ALIGN(low_pfn + 1, 772
743 pageblock_nr_pages) - 1; 773 if (likely(comp_order < MAX_ORDER))
744 else 774 low_pfn += (1UL << comp_order) - 1;
745 low_pfn += (1 << compound_order(page)) - 1;
746 775
747 continue; 776 continue;
748 } 777 }
749 778
779 if (!is_lru)
780 continue;
781
750 /* 782 /*
751 * Migration will fail if an anonymous page is pinned in memory, 783 * Migration will fail if an anonymous page is pinned in memory,
752 * so avoid taking lru_lock and isolating it unnecessarily in an 784 * so avoid taking lru_lock and isolating it unnecessarily in an
@@ -763,11 +795,17 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
763 if (!locked) 795 if (!locked)
764 break; 796 break;
765 797
766 /* Recheck PageLRU and PageTransHuge under lock */ 798 /* Recheck PageLRU and PageCompound under lock */
767 if (!PageLRU(page)) 799 if (!PageLRU(page))
768 continue; 800 continue;
769 if (PageTransHuge(page)) { 801
770 low_pfn += (1 << compound_order(page)) - 1; 802 /*
803 * Page become compound since the non-locked check,
804 * and it's on LRU. It can only be a THP so the order
805 * is safe to read and it's 0 for tail pages.
806 */
807 if (unlikely(PageCompound(page))) {
808 low_pfn += (1UL << compound_order(page)) - 1;
771 continue; 809 continue;
772 } 810 }
773 } 811 }
@@ -778,7 +816,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
778 if (__isolate_lru_page(page, isolate_mode) != 0) 816 if (__isolate_lru_page(page, isolate_mode) != 0)
779 continue; 817 continue;
780 818
781 VM_BUG_ON_PAGE(PageTransCompound(page), page); 819 VM_BUG_ON_PAGE(PageCompound(page), page);
782 820
783 /* Successfully isolated */ 821 /* Successfully isolated */
784 del_page_from_lru_list(page, lruvec, page_lru(page)); 822 del_page_from_lru_list(page, lruvec, page_lru(page));
@@ -898,6 +936,16 @@ static bool suitable_migration_target(struct page *page)
898} 936}
899 937
900/* 938/*
939 * Test whether the free scanner has reached the same or lower pageblock than
940 * the migration scanner, and compaction should thus terminate.
941 */
942static inline bool compact_scanners_met(struct compact_control *cc)
943{
944 return (cc->free_pfn >> pageblock_order)
945 <= (cc->migrate_pfn >> pageblock_order);
946}
947
948/*
901 * Based on information in the current compact_control, find blocks 949 * Based on information in the current compact_control, find blocks
902 * suitable for isolating free pages from and then isolate them. 950 * suitable for isolating free pages from and then isolate them.
903 */ 951 */
@@ -933,8 +981,7 @@ static void isolate_freepages(struct compact_control *cc)
933 * pages on cc->migratepages. We stop searching if the migrate 981 * pages on cc->migratepages. We stop searching if the migrate
934 * and free page scanners meet or enough free pages are isolated. 982 * and free page scanners meet or enough free pages are isolated.
935 */ 983 */
936 for (; block_start_pfn >= low_pfn && 984 for (; block_start_pfn >= low_pfn;
937 cc->nr_migratepages > cc->nr_freepages;
938 block_end_pfn = block_start_pfn, 985 block_end_pfn = block_start_pfn,
939 block_start_pfn -= pageblock_nr_pages, 986 block_start_pfn -= pageblock_nr_pages,
940 isolate_start_pfn = block_start_pfn) { 987 isolate_start_pfn = block_start_pfn) {
@@ -966,6 +1013,8 @@ static void isolate_freepages(struct compact_control *cc)
966 block_end_pfn, freelist, false); 1013 block_end_pfn, freelist, false);
967 1014
968 /* 1015 /*
1016 * If we isolated enough freepages, or aborted due to async
1017 * compaction being contended, terminate the loop.
969 * Remember where the free scanner should restart next time, 1018 * Remember where the free scanner should restart next time,
970 * which is where isolate_freepages_block() left off. 1019 * which is where isolate_freepages_block() left off.
971 * But if it scanned the whole pageblock, isolate_start_pfn 1020 * But if it scanned the whole pageblock, isolate_start_pfn
@@ -974,27 +1023,31 @@ static void isolate_freepages(struct compact_control *cc)
974 * In that case we will however want to restart at the start 1023 * In that case we will however want to restart at the start
975 * of the previous pageblock. 1024 * of the previous pageblock.
976 */ 1025 */
977 cc->free_pfn = (isolate_start_pfn < block_end_pfn) ? 1026 if ((cc->nr_freepages >= cc->nr_migratepages)
978 isolate_start_pfn : 1027 || cc->contended) {
979 block_start_pfn - pageblock_nr_pages; 1028 if (isolate_start_pfn >= block_end_pfn)
980 1029 isolate_start_pfn =
981 /* 1030 block_start_pfn - pageblock_nr_pages;
982 * isolate_freepages_block() might have aborted due to async
983 * compaction being contended
984 */
985 if (cc->contended)
986 break; 1031 break;
1032 } else {
1033 /*
1034 * isolate_freepages_block() should not terminate
1035 * prematurely unless contended, or isolated enough
1036 */
1037 VM_BUG_ON(isolate_start_pfn < block_end_pfn);
1038 }
987 } 1039 }
988 1040
989 /* split_free_page does not map the pages */ 1041 /* split_free_page does not map the pages */
990 map_pages(freelist); 1042 map_pages(freelist);
991 1043
992 /* 1044 /*
993 * If we crossed the migrate scanner, we want to keep it that way 1045 * Record where the free scanner will restart next time. Either we
994 * so that compact_finished() may detect this 1046 * broke from the loop and set isolate_start_pfn based on the last
1047 * call to isolate_freepages_block(), or we met the migration scanner
1048 * and the loop terminated due to isolate_start_pfn < low_pfn
995 */ 1049 */
996 if (block_start_pfn < low_pfn) 1050 cc->free_pfn = isolate_start_pfn;
997 cc->free_pfn = cc->migrate_pfn;
998} 1051}
999 1052
1000/* 1053/*
@@ -1062,6 +1115,7 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
1062 struct compact_control *cc) 1115 struct compact_control *cc)
1063{ 1116{
1064 unsigned long low_pfn, end_pfn; 1117 unsigned long low_pfn, end_pfn;
1118 unsigned long isolate_start_pfn;
1065 struct page *page; 1119 struct page *page;
1066 const isolate_mode_t isolate_mode = 1120 const isolate_mode_t isolate_mode =
1067 (sysctl_compact_unevictable_allowed ? ISOLATE_UNEVICTABLE : 0) | 1121 (sysctl_compact_unevictable_allowed ? ISOLATE_UNEVICTABLE : 0) |
@@ -1110,6 +1164,7 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
1110 continue; 1164 continue;
1111 1165
1112 /* Perform the isolation */ 1166 /* Perform the isolation */
1167 isolate_start_pfn = low_pfn;
1113 low_pfn = isolate_migratepages_block(cc, low_pfn, end_pfn, 1168 low_pfn = isolate_migratepages_block(cc, low_pfn, end_pfn,
1114 isolate_mode); 1169 isolate_mode);
1115 1170
@@ -1119,6 +1174,15 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
1119 } 1174 }
1120 1175
1121 /* 1176 /*
1177 * Record where we could have freed pages by migration and not
1178 * yet flushed them to buddy allocator.
1179 * - this is the lowest page that could have been isolated and
1180 * then freed by migration.
1181 */
1182 if (cc->nr_migratepages && !cc->last_migrated_pfn)
1183 cc->last_migrated_pfn = isolate_start_pfn;
1184
1185 /*
1122 * Either we isolated something and proceed with migration. Or 1186 * Either we isolated something and proceed with migration. Or
1123 * we failed and compact_zone should decide if we should 1187 * we failed and compact_zone should decide if we should
1124 * continue or not. 1188 * continue or not.
@@ -1127,12 +1191,8 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
1127 } 1191 }
1128 1192
1129 acct_isolated(zone, cc); 1193 acct_isolated(zone, cc);
1130 /* 1194 /* Record where migration scanner will be restarted. */
1131 * Record where migration scanner will be restarted. If we end up in 1195 cc->migrate_pfn = low_pfn;
1132 * the same pageblock as the free scanner, make the scanners fully
1133 * meet so that compact_finished() terminates compaction.
1134 */
1135 cc->migrate_pfn = (end_pfn <= cc->free_pfn) ? low_pfn : cc->free_pfn;
1136 1196
1137 return cc->nr_migratepages ? ISOLATE_SUCCESS : ISOLATE_NONE; 1197 return cc->nr_migratepages ? ISOLATE_SUCCESS : ISOLATE_NONE;
1138} 1198}
@@ -1147,11 +1207,9 @@ static int __compact_finished(struct zone *zone, struct compact_control *cc,
1147 return COMPACT_PARTIAL; 1207 return COMPACT_PARTIAL;
1148 1208
1149 /* Compaction run completes if the migrate and free scanner meet */ 1209 /* Compaction run completes if the migrate and free scanner meet */
1150 if (cc->free_pfn <= cc->migrate_pfn) { 1210 if (compact_scanners_met(cc)) {
1151 /* Let the next compaction start anew. */ 1211 /* Let the next compaction start anew. */
1152 zone->compact_cached_migrate_pfn[0] = zone->zone_start_pfn; 1212 reset_cached_positions(zone);
1153 zone->compact_cached_migrate_pfn[1] = zone->zone_start_pfn;
1154 zone->compact_cached_free_pfn = zone_end_pfn(zone);
1155 1213
1156 /* 1214 /*
1157 * Mark that the PG_migrate_skip information should be cleared 1215 * Mark that the PG_migrate_skip information should be cleared
@@ -1295,7 +1353,6 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
1295 unsigned long end_pfn = zone_end_pfn(zone); 1353 unsigned long end_pfn = zone_end_pfn(zone);
1296 const int migratetype = gfpflags_to_migratetype(cc->gfp_mask); 1354 const int migratetype = gfpflags_to_migratetype(cc->gfp_mask);
1297 const bool sync = cc->mode != MIGRATE_ASYNC; 1355 const bool sync = cc->mode != MIGRATE_ASYNC;
1298 unsigned long last_migrated_pfn = 0;
1299 1356
1300 ret = compaction_suitable(zone, cc->order, cc->alloc_flags, 1357 ret = compaction_suitable(zone, cc->order, cc->alloc_flags,
1301 cc->classzone_idx); 1358 cc->classzone_idx);
@@ -1333,6 +1390,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
1333 zone->compact_cached_migrate_pfn[0] = cc->migrate_pfn; 1390 zone->compact_cached_migrate_pfn[0] = cc->migrate_pfn;
1334 zone->compact_cached_migrate_pfn[1] = cc->migrate_pfn; 1391 zone->compact_cached_migrate_pfn[1] = cc->migrate_pfn;
1335 } 1392 }
1393 cc->last_migrated_pfn = 0;
1336 1394
1337 trace_mm_compaction_begin(start_pfn, cc->migrate_pfn, 1395 trace_mm_compaction_begin(start_pfn, cc->migrate_pfn,
1338 cc->free_pfn, end_pfn, sync); 1396 cc->free_pfn, end_pfn, sync);
@@ -1342,7 +1400,6 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
1342 while ((ret = compact_finished(zone, cc, migratetype)) == 1400 while ((ret = compact_finished(zone, cc, migratetype)) ==
1343 COMPACT_CONTINUE) { 1401 COMPACT_CONTINUE) {
1344 int err; 1402 int err;
1345 unsigned long isolate_start_pfn = cc->migrate_pfn;
1346 1403
1347 switch (isolate_migratepages(zone, cc)) { 1404 switch (isolate_migratepages(zone, cc)) {
1348 case ISOLATE_ABORT: 1405 case ISOLATE_ABORT:
@@ -1376,22 +1433,12 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
1376 * migrate_pages() may return -ENOMEM when scanners meet 1433 * migrate_pages() may return -ENOMEM when scanners meet
1377 * and we want compact_finished() to detect it 1434 * and we want compact_finished() to detect it
1378 */ 1435 */
1379 if (err == -ENOMEM && cc->free_pfn > cc->migrate_pfn) { 1436 if (err == -ENOMEM && !compact_scanners_met(cc)) {
1380 ret = COMPACT_PARTIAL; 1437 ret = COMPACT_PARTIAL;
1381 goto out; 1438 goto out;
1382 } 1439 }
1383 } 1440 }
1384 1441
1385 /*
1386 * Record where we could have freed pages by migration and not
1387 * yet flushed them to buddy allocator. We use the pfn that
1388 * isolate_migratepages() started from in this loop iteration
1389 * - this is the lowest page that could have been isolated and
1390 * then freed by migration.
1391 */
1392 if (!last_migrated_pfn)
1393 last_migrated_pfn = isolate_start_pfn;
1394
1395check_drain: 1442check_drain:
1396 /* 1443 /*
1397 * Has the migration scanner moved away from the previous 1444 * Has the migration scanner moved away from the previous
@@ -1400,18 +1447,18 @@ check_drain:
1400 * compact_finished() can detect immediately if allocation 1447 * compact_finished() can detect immediately if allocation
1401 * would succeed. 1448 * would succeed.
1402 */ 1449 */
1403 if (cc->order > 0 && last_migrated_pfn) { 1450 if (cc->order > 0 && cc->last_migrated_pfn) {
1404 int cpu; 1451 int cpu;
1405 unsigned long current_block_start = 1452 unsigned long current_block_start =
1406 cc->migrate_pfn & ~((1UL << cc->order) - 1); 1453 cc->migrate_pfn & ~((1UL << cc->order) - 1);
1407 1454
1408 if (last_migrated_pfn < current_block_start) { 1455 if (cc->last_migrated_pfn < current_block_start) {
1409 cpu = get_cpu(); 1456 cpu = get_cpu();
1410 lru_add_drain_cpu(cpu); 1457 lru_add_drain_cpu(cpu);
1411 drain_local_pages(zone); 1458 drain_local_pages(zone);
1412 put_cpu(); 1459 put_cpu();
1413 /* No more flushing until we migrate again */ 1460 /* No more flushing until we migrate again */
1414 last_migrated_pfn = 0; 1461 cc->last_migrated_pfn = 0;
1415 } 1462 }
1416 } 1463 }
1417 1464
diff --git a/mm/dmapool.c b/mm/dmapool.c
index 59d10d16f0a5..71a8998cd03a 100644
--- a/mm/dmapool.c
+++ b/mm/dmapool.c
@@ -271,6 +271,9 @@ void dma_pool_destroy(struct dma_pool *pool)
 {
 	bool empty = false;
 
+	if (unlikely(!pool))
+		return;
+
 	mutex_lock(&pools_reg_lock);
 	mutex_lock(&pools_lock);
 	list_del(&pool->pools);
@@ -334,7 +337,7 @@ void *dma_pool_alloc(struct dma_pool *pool, gfp_t mem_flags,
 	/* pool_alloc_page() might sleep, so temporarily drop &pool->lock */
 	spin_unlock_irqrestore(&pool->lock, flags);
 
-	page = pool_alloc_page(pool, mem_flags);
+	page = pool_alloc_page(pool, mem_flags & (~__GFP_ZERO));
 	if (!page)
 		return NULL;
 
@@ -372,9 +375,14 @@ void *dma_pool_alloc(struct dma_pool *pool, gfp_t mem_flags,
 			break;
 		}
 	}
-	memset(retval, POOL_POISON_ALLOCATED, pool->size);
+	if (!(mem_flags & __GFP_ZERO))
+		memset(retval, POOL_POISON_ALLOCATED, pool->size);
 #endif
 	spin_unlock_irqrestore(&pool->lock, flags);
+
+	if (mem_flags & __GFP_ZERO)
+		memset(retval, 0, pool->size);
+
 	return retval;
 }
 EXPORT_SYMBOL(dma_pool_alloc);
diff --git a/mm/early_ioremap.c b/mm/early_ioremap.c
index 0cfadafb3fb0..23f744d77ce0 100644
--- a/mm/early_ioremap.c
+++ b/mm/early_ioremap.c
@@ -224,6 +224,28 @@ early_memremap_ro(resource_size_t phys_addr, unsigned long size)
 	return (__force void *)__early_ioremap(phys_addr, size, FIXMAP_PAGE_RO);
 }
 #endif
+
+#define MAX_MAP_CHUNK	(NR_FIX_BTMAPS << PAGE_SHIFT)
+
+void __init copy_from_early_mem(void *dest, phys_addr_t src, unsigned long size)
+{
+	unsigned long slop, clen;
+	char *p;
+
+	while (size) {
+		slop = src & ~PAGE_MASK;
+		clen = size;
+		if (clen > MAX_MAP_CHUNK - slop)
+			clen = MAX_MAP_CHUNK - slop;
+		p = early_memremap(src & PAGE_MASK, clen + slop);
+		memcpy(dest, p + slop, clen);
+		early_memunmap(p, clen + slop);
+		dest += clen;
+		src += clen;
+		size -= clen;
+	}
+}
+
 #else /* CONFIG_MMU */
 
 void __init __iomem *
diff --git a/mm/filemap.c b/mm/filemap.c
index 1283fc825458..72940fb38666 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -674,7 +674,7 @@ struct page *__page_cache_alloc(gfp_t gfp)
 		do {
 			cpuset_mems_cookie = read_mems_allowed_begin();
 			n = cpuset_mem_spread_node();
-			page = alloc_pages_exact_node(n, gfp, 0);
+			page = __alloc_pages_node(n, gfp, 0);
 		} while (!page && read_mems_allowed_retry(cpuset_mems_cookie));
 
 		return page;
@@ -2473,21 +2473,6 @@ ssize_t generic_perform_write(struct file *file,
 						iov_iter_count(i));
 
 again:
-		/*
-		 * Bring in the user page that we will copy from _first_.
-		 * Otherwise there's a nasty deadlock on copying from the
-		 * same page as we're writing to, without it being marked
-		 * up-to-date.
-		 *
-		 * Not only is this an optimisation, but it is also required
-		 * to check that the address is actually valid, when atomic
-		 * usercopies are used, below.
-		 */
-		if (unlikely(iov_iter_fault_in_readable(i, bytes))) {
-			status = -EFAULT;
-			break;
-		}
-
 		status = a_ops->write_begin(file, mapping, pos, bytes, flags,
 						&page, &fsdata);
 		if (unlikely(status < 0))
@@ -2495,8 +2480,17 @@ again:
 
 		if (mapping_writably_mapped(mapping))
 			flush_dcache_page(page);
-
+		/*
+		 * 'page' is now locked. If we are trying to copy from a
+		 * mapping of 'page' in userspace, the copy might fault and
+		 * would need PageUptodate() to complete. But, page can not be
+		 * made Uptodate without acquiring the page lock, which we hold.
+		 * Deadlock. Avoid with pagefault_disable(). Fix up below with
+		 * iov_iter_fault_in_readable().
+		 */
+		pagefault_disable();
 		copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes);
+		pagefault_enable();
 		flush_dcache_page(page);
 
 		status = a_ops->write_end(file, mapping, pos, bytes, copied,
@@ -2519,6 +2513,14 @@ again:
 			 */
 			bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset,
 						iov_iter_single_seg_count(i));
+			/*
+			 * This is the fallback to recover if the copy from
+			 * userspace above faults.
+			 */
+			if (unlikely(iov_iter_fault_in_readable(i, bytes))) {
+				status = -EFAULT;
+				break;
+			}
 			goto again;
 		}
 		pos += copied;
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 279a818a39b1..b16279cbd91d 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -16,6 +16,7 @@
16#include <linux/swap.h> 16#include <linux/swap.h>
17#include <linux/shrinker.h> 17#include <linux/shrinker.h>
18#include <linux/mm_inline.h> 18#include <linux/mm_inline.h>
19#include <linux/dax.h>
19#include <linux/kthread.h> 20#include <linux/kthread.h>
20#include <linux/khugepaged.h> 21#include <linux/khugepaged.h>
21#include <linux/freezer.h> 22#include <linux/freezer.h>
@@ -105,7 +106,7 @@ static struct khugepaged_scan khugepaged_scan = {
105}; 106};
106 107
107 108
108static int set_recommended_min_free_kbytes(void) 109static void set_recommended_min_free_kbytes(void)
109{ 110{
110 struct zone *zone; 111 struct zone *zone;
111 int nr_zones = 0; 112 int nr_zones = 0;
@@ -140,7 +141,6 @@ static int set_recommended_min_free_kbytes(void)
140 min_free_kbytes = recommended_min; 141 min_free_kbytes = recommended_min;
141 } 142 }
142 setup_per_zone_wmarks(); 143 setup_per_zone_wmarks();
143 return 0;
144} 144}
145 145
146static int start_stop_khugepaged(void) 146static int start_stop_khugepaged(void)
@@ -172,12 +172,7 @@ fail:
172static atomic_t huge_zero_refcount; 172static atomic_t huge_zero_refcount;
173struct page *huge_zero_page __read_mostly; 173struct page *huge_zero_page __read_mostly;
174 174
175static inline bool is_huge_zero_pmd(pmd_t pmd) 175struct page *get_huge_zero_page(void)
176{
177 return is_huge_zero_page(pmd_page(pmd));
178}
179
180static struct page *get_huge_zero_page(void)
181{ 176{
182 struct page *zero_page; 177 struct page *zero_page;
183retry: 178retry:
@@ -794,16 +789,19 @@ static inline gfp_t alloc_hugepage_gfpmask(int defrag, gfp_t extra_gfp)
794} 789}
795 790
796/* Caller must hold page table lock. */ 791/* Caller must hold page table lock. */
797static void set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm, 792static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm,
798 struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd, 793 struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd,
799 struct page *zero_page) 794 struct page *zero_page)
800{ 795{
801 pmd_t entry; 796 pmd_t entry;
797 if (!pmd_none(*pmd))
798 return false;
802 entry = mk_pmd(zero_page, vma->vm_page_prot); 799 entry = mk_pmd(zero_page, vma->vm_page_prot);
803 entry = pmd_mkhuge(entry); 800 entry = pmd_mkhuge(entry);
804 pgtable_trans_huge_deposit(mm, pmd, pgtable); 801 pgtable_trans_huge_deposit(mm, pmd, pgtable);
805 set_pmd_at(mm, haddr, pmd, entry); 802 set_pmd_at(mm, haddr, pmd, entry);
806 atomic_long_inc(&mm->nr_ptes); 803 atomic_long_inc(&mm->nr_ptes);
804 return true;
807} 805}
808 806
809int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, 807int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
@@ -870,6 +868,49 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
870 flags); 868 flags);
871} 869}
872 870
871static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
872 pmd_t *pmd, unsigned long pfn, pgprot_t prot, bool write)
873{
874 struct mm_struct *mm = vma->vm_mm;
875 pmd_t entry;
876 spinlock_t *ptl;
877
878 ptl = pmd_lock(mm, pmd);
879 if (pmd_none(*pmd)) {
880 entry = pmd_mkhuge(pfn_pmd(pfn, prot));
881 if (write) {
882 entry = pmd_mkyoung(pmd_mkdirty(entry));
883 entry = maybe_pmd_mkwrite(entry, vma);
884 }
885 set_pmd_at(mm, addr, pmd, entry);
886 update_mmu_cache_pmd(vma, addr, pmd);
887 }
888 spin_unlock(ptl);
889}
890
891int vmf_insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
892 pmd_t *pmd, unsigned long pfn, bool write)
893{
894 pgprot_t pgprot = vma->vm_page_prot;
895 /*
896 * If we had pmd_special, we could avoid all these restrictions,
897 * but we need to be consistent with PTEs and architectures that
898 * can't support a 'special' bit.
899 */
900 BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)));
901 BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) ==
902 (VM_PFNMAP|VM_MIXEDMAP));
903 BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
904 BUG_ON((vma->vm_flags & VM_MIXEDMAP) && pfn_valid(pfn));
905
906 if (addr < vma->vm_start || addr >= vma->vm_end)
907 return VM_FAULT_SIGBUS;
908 if (track_pfn_insert(vma, &pgprot, pfn))
909 return VM_FAULT_SIGBUS;
910 insert_pfn_pmd(vma, addr, pmd, pfn, pgprot, write);
911 return VM_FAULT_NOPAGE;
912}
913
873int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, 914int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
874 pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr, 915 pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
875 struct vm_area_struct *vma) 916 struct vm_area_struct *vma)
@@ -1414,41 +1455,41 @@ out:
1414int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, 1455int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
1415 pmd_t *pmd, unsigned long addr) 1456 pmd_t *pmd, unsigned long addr)
1416{ 1457{
1458 pmd_t orig_pmd;
1417 spinlock_t *ptl; 1459 spinlock_t *ptl;
1418 int ret = 0;
1419 1460
1420 if (__pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { 1461 if (__pmd_trans_huge_lock(pmd, vma, &ptl) != 1)
1421 struct page *page; 1462 return 0;
1422 pgtable_t pgtable; 1463 /*
1423 pmd_t orig_pmd; 1464 * For architectures like ppc64 we look at deposited pgtable
1424 /* 1465 * when calling pmdp_huge_get_and_clear. So do the
1425 * For architectures like ppc64 we look at deposited pgtable 1466 * pgtable_trans_huge_withdraw after finishing pmdp related
1426 * when calling pmdp_huge_get_and_clear. So do the 1467 * operations.
1427 * pgtable_trans_huge_withdraw after finishing pmdp related 1468 */
1428 * operations. 1469 orig_pmd = pmdp_huge_get_and_clear_full(tlb->mm, addr, pmd,
1429 */ 1470 tlb->fullmm);
1430 orig_pmd = pmdp_huge_get_and_clear_full(tlb->mm, addr, pmd, 1471 tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
1431 tlb->fullmm); 1472 if (vma_is_dax(vma)) {
1432 tlb_remove_pmd_tlb_entry(tlb, pmd, addr); 1473 spin_unlock(ptl);
1433 pgtable = pgtable_trans_huge_withdraw(tlb->mm, pmd); 1474 if (is_huge_zero_pmd(orig_pmd))
1434 if (is_huge_zero_pmd(orig_pmd)) {
1435 atomic_long_dec(&tlb->mm->nr_ptes);
1436 spin_unlock(ptl);
1437 put_huge_zero_page(); 1475 put_huge_zero_page();
1438 } else { 1476 } else if (is_huge_zero_pmd(orig_pmd)) {
1439 page = pmd_page(orig_pmd); 1477 pte_free(tlb->mm, pgtable_trans_huge_withdraw(tlb->mm, pmd));
1440 page_remove_rmap(page); 1478 atomic_long_dec(&tlb->mm->nr_ptes);
1441 VM_BUG_ON_PAGE(page_mapcount(page) < 0, page); 1479 spin_unlock(ptl);
1442 add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR); 1480 put_huge_zero_page();
1443 VM_BUG_ON_PAGE(!PageHead(page), page); 1481 } else {
1444 atomic_long_dec(&tlb->mm->nr_ptes); 1482 struct page *page = pmd_page(orig_pmd);
1445 spin_unlock(ptl); 1483 page_remove_rmap(page);
1446 tlb_remove_page(tlb, page); 1484 VM_BUG_ON_PAGE(page_mapcount(page) < 0, page);
1447 } 1485 add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
1448 pte_free(tlb->mm, pgtable); 1486 VM_BUG_ON_PAGE(!PageHead(page), page);
1449 ret = 1; 1487 pte_free(tlb->mm, pgtable_trans_huge_withdraw(tlb->mm, pmd));
1488 atomic_long_dec(&tlb->mm->nr_ptes);
1489 spin_unlock(ptl);
1490 tlb_remove_page(tlb, page);
1450 } 1491 }
1451 return ret; 1492 return 1;
1452} 1493}
1453 1494
1454int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma, 1495int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma,
@@ -2285,8 +2326,12 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page,
2285 2326
2286static void khugepaged_alloc_sleep(void) 2327static void khugepaged_alloc_sleep(void)
2287{ 2328{
2288 wait_event_freezable_timeout(khugepaged_wait, false, 2329 DEFINE_WAIT(wait);
2289 msecs_to_jiffies(khugepaged_alloc_sleep_millisecs)); 2330
2331 add_wait_queue(&khugepaged_wait, &wait);
2332 freezable_schedule_timeout_interruptible(
2333 msecs_to_jiffies(khugepaged_alloc_sleep_millisecs));
2334 remove_wait_queue(&khugepaged_wait, &wait);
2290} 2335}
2291 2336
2292static int khugepaged_node_load[MAX_NUMNODES]; 2337static int khugepaged_node_load[MAX_NUMNODES];
@@ -2373,7 +2418,7 @@ khugepaged_alloc_page(struct page **hpage, gfp_t gfp, struct mm_struct *mm,
2373 */ 2418 */
2374 up_read(&mm->mmap_sem); 2419 up_read(&mm->mmap_sem);
2375 2420
2376 *hpage = alloc_pages_exact_node(node, gfp, HPAGE_PMD_ORDER); 2421 *hpage = __alloc_pages_node(node, gfp, HPAGE_PMD_ORDER);
2377 if (unlikely(!*hpage)) { 2422 if (unlikely(!*hpage)) {
2378 count_vm_event(THP_COLLAPSE_ALLOC_FAILED); 2423 count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
2379 *hpage = ERR_PTR(-ENOMEM); 2424 *hpage = ERR_PTR(-ENOMEM);
@@ -2911,7 +2956,7 @@ void __split_huge_page_pmd(struct vm_area_struct *vma, unsigned long address,
2911 pmd_t *pmd) 2956 pmd_t *pmd)
2912{ 2957{
2913 spinlock_t *ptl; 2958 spinlock_t *ptl;
2914 struct page *page; 2959 struct page *page = NULL;
2915 struct mm_struct *mm = vma->vm_mm; 2960 struct mm_struct *mm = vma->vm_mm;
2916 unsigned long haddr = address & HPAGE_PMD_MASK; 2961 unsigned long haddr = address & HPAGE_PMD_MASK;
2917 unsigned long mmun_start; /* For mmu_notifiers */ 2962 unsigned long mmun_start; /* For mmu_notifiers */
@@ -2924,25 +2969,27 @@ void __split_huge_page_pmd(struct vm_area_struct *vma, unsigned long address,
2924again: 2969again:
2925 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); 2970 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
2926 ptl = pmd_lock(mm, pmd); 2971 ptl = pmd_lock(mm, pmd);
2927 if (unlikely(!pmd_trans_huge(*pmd))) { 2972 if (unlikely(!pmd_trans_huge(*pmd)))
2928 spin_unlock(ptl); 2973 goto unlock;
2929 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); 2974 if (vma_is_dax(vma)) {
2930 return; 2975 pmd_t _pmd = pmdp_huge_clear_flush_notify(vma, haddr, pmd);
2931 } 2976 if (is_huge_zero_pmd(_pmd))
2932 if (is_huge_zero_pmd(*pmd)) { 2977 put_huge_zero_page();
2978 } else if (is_huge_zero_pmd(*pmd)) {
2933 __split_huge_zero_page_pmd(vma, haddr, pmd); 2979 __split_huge_zero_page_pmd(vma, haddr, pmd);
2934 spin_unlock(ptl); 2980 } else {
2935 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); 2981 page = pmd_page(*pmd);
2936 return; 2982 VM_BUG_ON_PAGE(!page_count(page), page);
2983 get_page(page);
2937 } 2984 }
2938 page = pmd_page(*pmd); 2985 unlock:
2939 VM_BUG_ON_PAGE(!page_count(page), page);
2940 get_page(page);
2941 spin_unlock(ptl); 2986 spin_unlock(ptl);
2942 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); 2987 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
2943 2988
2944 split_huge_page(page); 2989 if (!page)
2990 return;
2945 2991
2992 split_huge_page(page);
2946 put_page(page); 2993 put_page(page);
2947 2994
2948 /* 2995 /*
@@ -2991,7 +3038,7 @@ static void split_huge_page_address(struct mm_struct *mm,
2991 split_huge_page_pmd_mm(mm, address, pmd); 3038 split_huge_page_pmd_mm(mm, address, pmd);
2992} 3039}
2993 3040
2994void __vma_adjust_trans_huge(struct vm_area_struct *vma, 3041void vma_adjust_trans_huge(struct vm_area_struct *vma,
2995 unsigned long start, 3042 unsigned long start,
2996 unsigned long end, 3043 unsigned long end,
2997 long adjust_next) 3044 long adjust_next)
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 51ae41d0fbc0..999fb0aef8f1 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -64,7 +64,7 @@ DEFINE_SPINLOCK(hugetlb_lock);
64 * prevent spurious OOMs when the hugepage pool is fully utilized. 64 * prevent spurious OOMs when the hugepage pool is fully utilized.
65 */ 65 */
66static int num_fault_mutexes; 66static int num_fault_mutexes;
67static struct mutex *htlb_fault_mutex_table ____cacheline_aligned_in_smp; 67struct mutex *hugetlb_fault_mutex_table ____cacheline_aligned_in_smp;
68 68
69/* Forward declaration */ 69/* Forward declaration */
70static int hugetlb_acct_memory(struct hstate *h, long delta); 70static int hugetlb_acct_memory(struct hstate *h, long delta);
@@ -240,11 +240,14 @@ struct file_region {
240 240
241/* 241/*
242 * Add the huge page range represented by [f, t) to the reserve 242 * Add the huge page range represented by [f, t) to the reserve
243 * map. Existing regions will be expanded to accommodate the 243 * map. In the normal case, existing regions will be expanded
244 * specified range. We know only existing regions need to be 244 * to accommodate the specified range. Sufficient regions should
245 * expanded, because region_add is only called after region_chg 245 * exist for expansion due to the previous call to region_chg
246 * with the same range. If a new file_region structure must 246 * with the same range. However, it is possible that region_del
247 * be allocated, it is done in region_chg. 247 * could have been called after region_chg and modifed the map
248 * in such a way that no region exists to be expanded. In this
249 * case, pull a region descriptor from the cache associated with
250 * the map and use that for the new range.
248 * 251 *
249 * Return the number of new huge pages added to the map. This 252 * Return the number of new huge pages added to the map. This
250 * number is greater than or equal to zero. 253 * number is greater than or equal to zero.
@@ -261,6 +264,28 @@ static long region_add(struct resv_map *resv, long f, long t)
261 if (f <= rg->to) 264 if (f <= rg->to)
262 break; 265 break;
263 266
267 /*
268 * If no region exists which can be expanded to include the
269 * specified range, the list must have been modified by an
270 * interleving call to region_del(). Pull a region descriptor
271 * from the cache and use it for this range.
272 */
273 if (&rg->link == head || t < rg->from) {
274 VM_BUG_ON(resv->region_cache_count <= 0);
275
276 resv->region_cache_count--;
277 nrg = list_first_entry(&resv->region_cache, struct file_region,
278 link);
279 list_del(&nrg->link);
280
281 nrg->from = f;
282 nrg->to = t;
283 list_add(&nrg->link, rg->link.prev);
284
285 add += t - f;
286 goto out_locked;
287 }
288
264 /* Round our left edge to the current segment if it encloses us. */ 289 /* Round our left edge to the current segment if it encloses us. */
265 if (f > rg->from) 290 if (f > rg->from)
266 f = rg->from; 291 f = rg->from;
@@ -294,6 +319,8 @@ static long region_add(struct resv_map *resv, long f, long t)
294 add += t - nrg->to; /* Added to end of region */ 319 add += t - nrg->to; /* Added to end of region */
295 nrg->to = t; 320 nrg->to = t;
296 321
322out_locked:
323 resv->adds_in_progress--;
297 spin_unlock(&resv->lock); 324 spin_unlock(&resv->lock);
298 VM_BUG_ON(add < 0); 325 VM_BUG_ON(add < 0);
299 return add; 326 return add;
@@ -312,11 +339,14 @@ static long region_add(struct resv_map *resv, long f, long t)
312 * so that the subsequent region_add call will have all the 339 * so that the subsequent region_add call will have all the
313 * regions it needs and will not fail. 340 * regions it needs and will not fail.
314 * 341 *
315 * Returns the number of huge pages that need to be added 342 * Upon entry, region_chg will also examine the cache of region descriptors
316 * to the existing reservation map for the range [f, t). 343 * associated with the map. If there are not enough descriptors cached, one
317 * This number is greater or equal to zero. -ENOMEM is 344 * will be allocated for the in progress add operation.
318 * returned if a new file_region structure is needed and can 345 *
319 * not be allocated. 346 * Returns the number of huge pages that need to be added to the existing
347 * reservation map for the range [f, t). This number is greater or equal to
348 * zero. -ENOMEM is returned if a new file_region structure or cache entry
349 * is needed and can not be allocated.
320 */ 350 */
321static long region_chg(struct resv_map *resv, long f, long t) 351static long region_chg(struct resv_map *resv, long f, long t)
322{ 352{
@@ -326,6 +356,31 @@ static long region_chg(struct resv_map *resv, long f, long t)
326 356
327retry: 357retry:
328 spin_lock(&resv->lock); 358 spin_lock(&resv->lock);
359retry_locked:
360 resv->adds_in_progress++;
361
362 /*
363 * Check for sufficient descriptors in the cache to accommodate
364 * the number of in progress add operations.
365 */
366 if (resv->adds_in_progress > resv->region_cache_count) {
367 struct file_region *trg;
368
369 VM_BUG_ON(resv->adds_in_progress - resv->region_cache_count > 1);
370 /* Must drop lock to allocate a new descriptor. */
371 resv->adds_in_progress--;
372 spin_unlock(&resv->lock);
373
374 trg = kmalloc(sizeof(*trg), GFP_KERNEL);
375 if (!trg)
376 return -ENOMEM;
377
378 spin_lock(&resv->lock);
379 list_add(&trg->link, &resv->region_cache);
380 resv->region_cache_count++;
381 goto retry_locked;
382 }
383
329 /* Locate the region we are before or in. */ 384 /* Locate the region we are before or in. */
330 list_for_each_entry(rg, head, link) 385 list_for_each_entry(rg, head, link)
331 if (f <= rg->to) 386 if (f <= rg->to)
@@ -336,6 +391,7 @@ retry:
336 * size such that we can guarantee to record the reservation. */ 391 * size such that we can guarantee to record the reservation. */
337 if (&rg->link == head || t < rg->from) { 392 if (&rg->link == head || t < rg->from) {
338 if (!nrg) { 393 if (!nrg) {
394 resv->adds_in_progress--;
339 spin_unlock(&resv->lock); 395 spin_unlock(&resv->lock);
340 nrg = kmalloc(sizeof(*nrg), GFP_KERNEL); 396 nrg = kmalloc(sizeof(*nrg), GFP_KERNEL);
341 if (!nrg) 397 if (!nrg)
@@ -385,43 +441,131 @@ out_nrg:
385} 441}
386 442
387/* 443/*
388 * Truncate the reserve map at index 'end'. Modify/truncate any 444 * Abort the in progress add operation. The adds_in_progress field
389 * region which contains end. Delete any regions past end. 445 * of the resv_map keeps track of the operations in progress between
390 * Return the number of huge pages removed from the map. 446 * calls to region_chg and region_add. Operations are sometimes
447 * aborted after the call to region_chg. In such cases, region_abort
448 * is called to decrement the adds_in_progress counter.
449 *
450 * NOTE: The range arguments [f, t) are not needed or used in this
451 * routine. They are kept to make reading the calling code easier as
452 * arguments will match the associated region_chg call.
391 */ 453 */
392static long region_truncate(struct resv_map *resv, long end) 454static void region_abort(struct resv_map *resv, long f, long t)
455{
456 spin_lock(&resv->lock);
457 VM_BUG_ON(!resv->region_cache_count);
458 resv->adds_in_progress--;
459 spin_unlock(&resv->lock);
460}
461
462/*
463 * Delete the specified range [f, t) from the reserve map. If the
464 * t parameter is LONG_MAX, this indicates that ALL regions after f
465 * should be deleted. Locate the regions which intersect [f, t)
466 * and either trim, delete or split the existing regions.
467 *
468 * Returns the number of huge pages deleted from the reserve map.
469 * In the normal case, the return value is zero or more. In the
470 * case where a region must be split, a new region descriptor must
471 * be allocated. If the allocation fails, -ENOMEM will be returned.
472 * NOTE: If the parameter t == LONG_MAX, then we will never split
473 * a region and possibly return -ENOMEM. Callers specifying
474 * t == LONG_MAX do not need to check for -ENOMEM error.
475 */
476static long region_del(struct resv_map *resv, long f, long t)
393{ 477{
394 struct list_head *head = &resv->regions; 478 struct list_head *head = &resv->regions;
395 struct file_region *rg, *trg; 479 struct file_region *rg, *trg;
396 long chg = 0; 480 struct file_region *nrg = NULL;
481 long del = 0;
397 482
483retry:
398 spin_lock(&resv->lock); 484 spin_lock(&resv->lock);
399 /* Locate the region we are either in or before. */ 485 list_for_each_entry_safe(rg, trg, head, link) {
400 list_for_each_entry(rg, head, link) 486 if (rg->to <= f)
401 if (end <= rg->to) 487 continue;
488 if (rg->from >= t)
402 break; 489 break;
403 if (&rg->link == head)
404 goto out;
405 490
406 /* If we are in the middle of a region then adjust it. */ 491 if (f > rg->from && t < rg->to) { /* Must split region */
407 if (end > rg->from) { 492 /*
408 chg = rg->to - end; 493 * Check for an entry in the cache before dropping
409 rg->to = end; 494 * lock and attempting allocation.
410 rg = list_entry(rg->link.next, typeof(*rg), link); 495 */
411 } 496 if (!nrg &&
497 resv->region_cache_count > resv->adds_in_progress) {
498 nrg = list_first_entry(&resv->region_cache,
499 struct file_region,
500 link);
501 list_del(&nrg->link);
502 resv->region_cache_count--;
503 }
412 504
413 /* Drop any remaining regions. */ 505 if (!nrg) {
414 list_for_each_entry_safe(rg, trg, rg->link.prev, link) { 506 spin_unlock(&resv->lock);
415 if (&rg->link == head) 507 nrg = kmalloc(sizeof(*nrg), GFP_KERNEL);
508 if (!nrg)
509 return -ENOMEM;
510 goto retry;
511 }
512
513 del += t - f;
514
515 /* New entry for end of split region */
516 nrg->from = t;
517 nrg->to = rg->to;
518 INIT_LIST_HEAD(&nrg->link);
519
520 /* Original entry is trimmed */
521 rg->to = f;
522
523 list_add(&nrg->link, &rg->link);
524 nrg = NULL;
416 break; 525 break;
417 chg += rg->to - rg->from; 526 }
418 list_del(&rg->link); 527
419 kfree(rg); 528 if (f <= rg->from && t >= rg->to) { /* Remove entire region */
529 del += rg->to - rg->from;
530 list_del(&rg->link);
531 kfree(rg);
532 continue;
533 }
534
535 if (f <= rg->from) { /* Trim beginning of region */
536 del += t - rg->from;
537 rg->from = t;
538 } else { /* Trim end of region */
539 del += rg->to - f;
540 rg->to = f;
541 }
420 } 542 }
421 543
422out:
423 spin_unlock(&resv->lock); 544 spin_unlock(&resv->lock);
424 return chg; 545 kfree(nrg);
546 return del;
547}
548
549/*
550 * A rare out of memory error was encountered which prevented removal of
551 * the reserve map region for a page. The huge page itself was free'ed
552 * and removed from the page cache. This routine will adjust the subpool
553 * usage count, and the global reserve count if needed. By incrementing
554 * these counts, the reserve map entry which could not be deleted will
555 * appear as a "reserved" entry instead of simply dangling with incorrect
556 * counts.
557 */
558void hugetlb_fix_reserve_counts(struct inode *inode, bool restore_reserve)
559{
560 struct hugepage_subpool *spool = subpool_inode(inode);
561 long rsv_adjust;
562
563 rsv_adjust = hugepage_subpool_get_pages(spool, 1);
564 if (restore_reserve && rsv_adjust) {
565 struct hstate *h = hstate_inode(inode);
566
567 hugetlb_acct_memory(h, 1);
568 }
425} 569}
426 570
427/* 571/*
@@ -544,22 +688,44 @@ static void set_vma_private_data(struct vm_area_struct *vma,
544struct resv_map *resv_map_alloc(void) 688struct resv_map *resv_map_alloc(void)
545{ 689{
546 struct resv_map *resv_map = kmalloc(sizeof(*resv_map), GFP_KERNEL); 690 struct resv_map *resv_map = kmalloc(sizeof(*resv_map), GFP_KERNEL);
547 if (!resv_map) 691 struct file_region *rg = kmalloc(sizeof(*rg), GFP_KERNEL);
692
693 if (!resv_map || !rg) {
694 kfree(resv_map);
695 kfree(rg);
548 return NULL; 696 return NULL;
697 }
549 698
550 kref_init(&resv_map->refs); 699 kref_init(&resv_map->refs);
551 spin_lock_init(&resv_map->lock); 700 spin_lock_init(&resv_map->lock);
552 INIT_LIST_HEAD(&resv_map->regions); 701 INIT_LIST_HEAD(&resv_map->regions);
553 702
703 resv_map->adds_in_progress = 0;
704
705 INIT_LIST_HEAD(&resv_map->region_cache);
706 list_add(&rg->link, &resv_map->region_cache);
707 resv_map->region_cache_count = 1;
708
554 return resv_map; 709 return resv_map;
555} 710}
556 711
557void resv_map_release(struct kref *ref) 712void resv_map_release(struct kref *ref)
558{ 713{
559 struct resv_map *resv_map = container_of(ref, struct resv_map, refs); 714 struct resv_map *resv_map = container_of(ref, struct resv_map, refs);
715 struct list_head *head = &resv_map->region_cache;
716 struct file_region *rg, *trg;
560 717
561 /* Clear out any active regions before we release the map. */ 718 /* Clear out any active regions before we release the map. */
562 region_truncate(resv_map, 0); 719 region_del(resv_map, 0, LONG_MAX);
720
721 /* ... and any entries left in the cache */
722 list_for_each_entry_safe(rg, trg, head, link) {
723 list_del(&rg->link);
724 kfree(rg);
725 }
726
727 VM_BUG_ON(resv_map->adds_in_progress);
728
563 kfree(resv_map); 729 kfree(resv_map);
564} 730}
565 731
@@ -635,8 +801,19 @@ static bool vma_has_reserves(struct vm_area_struct *vma, long chg)
635 } 801 }
636 802
637 /* Shared mappings always use reserves */ 803 /* Shared mappings always use reserves */
638 if (vma->vm_flags & VM_MAYSHARE) 804 if (vma->vm_flags & VM_MAYSHARE) {
639 return true; 805 /*
806 * We know VM_NORESERVE is not set. Therefore, there SHOULD
807 * be a region map for all pages. The only situation where
808 * there is no region map is if a hole was punched via
809 * fallocate. In this case, there really are no reverves to
810 * use. This situation is indicated if chg != 0.
811 */
812 if (chg)
813 return false;
814 else
815 return true;
816 }
640 817
641 /* 818 /*
642 * Only the process that called mmap() has reserves for 819 * Only the process that called mmap() has reserves for
@@ -1154,7 +1331,7 @@ static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid)
1154{ 1331{
1155 struct page *page; 1332 struct page *page;
1156 1333
1157 page = alloc_pages_exact_node(nid, 1334 page = __alloc_pages_node(nid,
1158 htlb_alloc_mask(h)|__GFP_COMP|__GFP_THISNODE| 1335 htlb_alloc_mask(h)|__GFP_COMP|__GFP_THISNODE|
1159 __GFP_REPEAT|__GFP_NOWARN, 1336 __GFP_REPEAT|__GFP_NOWARN,
1160 huge_page_order(h)); 1337 huge_page_order(h));
@@ -1306,7 +1483,7 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, int nid)
1306 __GFP_REPEAT|__GFP_NOWARN, 1483 __GFP_REPEAT|__GFP_NOWARN,
1307 huge_page_order(h)); 1484 huge_page_order(h));
1308 else 1485 else
1309 page = alloc_pages_exact_node(nid, 1486 page = __alloc_pages_node(nid,
1310 htlb_alloc_mask(h)|__GFP_COMP|__GFP_THISNODE| 1487 htlb_alloc_mask(h)|__GFP_COMP|__GFP_THISNODE|
1311 __GFP_REPEAT|__GFP_NOWARN, huge_page_order(h)); 1488 __GFP_REPEAT|__GFP_NOWARN, huge_page_order(h));
1312 1489
@@ -1473,16 +1650,19 @@ static void return_unused_surplus_pages(struct hstate *h,
1473 } 1650 }
1474} 1651}
1475 1652
1653
1476/* 1654/*
1477 * vma_needs_reservation and vma_commit_reservation are used by the huge 1655 * vma_needs_reservation, vma_commit_reservation and vma_end_reservation
1478 * page allocation routines to manage reservations. 1656 * are used by the huge page allocation routines to manage reservations.
1479 * 1657 *
1480 * vma_needs_reservation is called to determine if the huge page at addr 1658 * vma_needs_reservation is called to determine if the huge page at addr
1481 * within the vma has an associated reservation. If a reservation is 1659 * within the vma has an associated reservation. If a reservation is
1482 * needed, the value 1 is returned. The caller is then responsible for 1660 * needed, the value 1 is returned. The caller is then responsible for
1483 * managing the global reservation and subpool usage counts. After 1661 * managing the global reservation and subpool usage counts. After
1484 * the huge page has been allocated, vma_commit_reservation is called 1662 * the huge page has been allocated, vma_commit_reservation is called
1485 * to add the page to the reservation map. 1663 * to add the page to the reservation map. If the page allocation fails,
1664 * the reservation must be ended instead of committed. vma_end_reservation
1665 * is called in such cases.
1486 * 1666 *
1487 * In the normal case, vma_commit_reservation returns the same value 1667 * In the normal case, vma_commit_reservation returns the same value
1488 * as the preceding vma_needs_reservation call. The only time this 1668 * as the preceding vma_needs_reservation call. The only time this
@@ -1490,9 +1670,14 @@ static void return_unused_surplus_pages(struct hstate *h,
1490 * is the responsibility of the caller to notice the difference and 1670 * is the responsibility of the caller to notice the difference and
1491 * take appropriate action. 1671 * take appropriate action.
1492 */ 1672 */
1673enum vma_resv_mode {
1674 VMA_NEEDS_RESV,
1675 VMA_COMMIT_RESV,
1676 VMA_END_RESV,
1677};
1493static long __vma_reservation_common(struct hstate *h, 1678static long __vma_reservation_common(struct hstate *h,
1494 struct vm_area_struct *vma, unsigned long addr, 1679 struct vm_area_struct *vma, unsigned long addr,
1495 bool commit) 1680 enum vma_resv_mode mode)
1496{ 1681{
1497 struct resv_map *resv; 1682 struct resv_map *resv;
1498 pgoff_t idx; 1683 pgoff_t idx;
@@ -1503,10 +1688,20 @@ static long __vma_reservation_common(struct hstate *h,
1503 return 1; 1688 return 1;
1504 1689
1505 idx = vma_hugecache_offset(h, vma, addr); 1690 idx = vma_hugecache_offset(h, vma, addr);
1506 if (commit) 1691 switch (mode) {
1507 ret = region_add(resv, idx, idx + 1); 1692 case VMA_NEEDS_RESV:
1508 else
1509 ret = region_chg(resv, idx, idx + 1); 1693 ret = region_chg(resv, idx, idx + 1);
1694 break;
1695 case VMA_COMMIT_RESV:
1696 ret = region_add(resv, idx, idx + 1);
1697 break;
1698 case VMA_END_RESV:
1699 region_abort(resv, idx, idx + 1);
1700 ret = 0;
1701 break;
1702 default:
1703 BUG();
1704 }
1510 1705
1511 if (vma->vm_flags & VM_MAYSHARE) 1706 if (vma->vm_flags & VM_MAYSHARE)
1512 return ret; 1707 return ret;
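
The enum above replaces the old boolean `commit` argument with an explicit mode so a caller can also abort a pending region_chg(). As a sketch only (a hypothetical function written as if it lived in mm/hugetlb.c next to these helpers; it is not part of the patch), the three wrappers defined in the next hunk are meant to be paired like this:

/*
 * Sketch only, not part of the patch: the region_chg() done by
 * vma_needs_reservation() is either committed (region_add) once a page
 * is in hand, or backed out (region_abort) via vma_end_reservation()
 * so the reserve map is not left unbalanced.
 */
static struct page *example_alloc_with_resv(struct hstate *h,
                struct vm_area_struct *vma, unsigned long addr)
{
        struct page *page;
        long chg;

        chg = vma_needs_reservation(h, vma, addr);      /* VMA_NEEDS_RESV */
        if (chg < 0)
                return ERR_PTR(-ENOMEM);

        spin_lock(&hugetlb_lock);
        page = dequeue_huge_page_vma(h, vma, addr, 0, chg);
        spin_unlock(&hugetlb_lock);
        if (!page) {
                vma_end_reservation(h, vma, addr);      /* VMA_END_RESV */
                return ERR_PTR(-ENOSPC);
        }

        vma_commit_reservation(h, vma, addr);           /* VMA_COMMIT_RESV */
        return page;
}
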
@@ -1517,47 +1712,79 @@ static long __vma_reservation_common(struct hstate *h,
1517static long vma_needs_reservation(struct hstate *h, 1712static long vma_needs_reservation(struct hstate *h,
1518 struct vm_area_struct *vma, unsigned long addr) 1713 struct vm_area_struct *vma, unsigned long addr)
1519{ 1714{
1520 return __vma_reservation_common(h, vma, addr, false); 1715 return __vma_reservation_common(h, vma, addr, VMA_NEEDS_RESV);
1521} 1716}
1522 1717
1523static long vma_commit_reservation(struct hstate *h, 1718static long vma_commit_reservation(struct hstate *h,
1524 struct vm_area_struct *vma, unsigned long addr) 1719 struct vm_area_struct *vma, unsigned long addr)
1525{ 1720{
1526 return __vma_reservation_common(h, vma, addr, true); 1721 return __vma_reservation_common(h, vma, addr, VMA_COMMIT_RESV);
1722}
1723
1724static void vma_end_reservation(struct hstate *h,
1725 struct vm_area_struct *vma, unsigned long addr)
1726{
1727 (void)__vma_reservation_common(h, vma, addr, VMA_END_RESV);
1527} 1728}
1528 1729
1529static struct page *alloc_huge_page(struct vm_area_struct *vma, 1730struct page *alloc_huge_page(struct vm_area_struct *vma,
1530 unsigned long addr, int avoid_reserve) 1731 unsigned long addr, int avoid_reserve)
1531{ 1732{
1532 struct hugepage_subpool *spool = subpool_vma(vma); 1733 struct hugepage_subpool *spool = subpool_vma(vma);
1533 struct hstate *h = hstate_vma(vma); 1734 struct hstate *h = hstate_vma(vma);
1534 struct page *page; 1735 struct page *page;
1535 long chg, commit; 1736 long map_chg, map_commit;
1737 long gbl_chg;
1536 int ret, idx; 1738 int ret, idx;
1537 struct hugetlb_cgroup *h_cg; 1739 struct hugetlb_cgroup *h_cg;
1538 1740
1539 idx = hstate_index(h); 1741 idx = hstate_index(h);
1540 /* 1742 /*
1541 * Processes that did not create the mapping will have no 1743 * Examine the region/reserve map to determine if the process
1542 * reserves and will not have accounted against subpool 1744 * has a reservation for the page to be allocated. A return
1543 * limit. Check that the subpool limit can be made before 1745 * code of zero indicates a reservation exists (no change).
1544 * satisfying the allocation MAP_NORESERVE mappings may also
1545 * need pages and subpool limit allocated allocated if no reserve
1546 * mapping overlaps.
1547 */ 1746 */
1548 chg = vma_needs_reservation(h, vma, addr); 1747 map_chg = gbl_chg = vma_needs_reservation(h, vma, addr);
1549 if (chg < 0) 1748 if (map_chg < 0)
1550 return ERR_PTR(-ENOMEM); 1749 return ERR_PTR(-ENOMEM);
1551 if (chg || avoid_reserve) 1750
1552 if (hugepage_subpool_get_pages(spool, 1) < 0) 1751 /*
1752 * Processes that did not create the mapping will have no
1753 * reserves as indicated by the region/reserve map. Check
1754 * that the allocation will not exceed the subpool limit.
1755 * Allocations for MAP_NORESERVE mappings also need to be
1756 * checked against any subpool limit.
1757 */
1758 if (map_chg || avoid_reserve) {
1759 gbl_chg = hugepage_subpool_get_pages(spool, 1);
1760 if (gbl_chg < 0) {
1761 vma_end_reservation(h, vma, addr);
1553 return ERR_PTR(-ENOSPC); 1762 return ERR_PTR(-ENOSPC);
1763 }
1764
1765 /*
1766 * Even though there was no reservation in the region/reserve
1767 * map, there could be reservations associated with the
1768 * subpool that can be used. This would be indicated if the
1769 * return value of hugepage_subpool_get_pages() is zero.
1770 * However, if avoid_reserve is specified we still avoid even
1771 * the subpool reservations.
1772 */
1773 if (avoid_reserve)
1774 gbl_chg = 1;
1775 }
1554 1776
1555 ret = hugetlb_cgroup_charge_cgroup(idx, pages_per_huge_page(h), &h_cg); 1777 ret = hugetlb_cgroup_charge_cgroup(idx, pages_per_huge_page(h), &h_cg);
1556 if (ret) 1778 if (ret)
1557 goto out_subpool_put; 1779 goto out_subpool_put;
1558 1780
1559 spin_lock(&hugetlb_lock); 1781 spin_lock(&hugetlb_lock);
1560 page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve, chg); 1782 /*
 1783 * gbl_chg is passed to indicate whether or not a page must be taken
1784 * from the global free pool (global change). gbl_chg == 0 indicates
1785 * a reservation exists for the allocation.
1786 */
1787 page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve, gbl_chg);
1561 if (!page) { 1788 if (!page) {
1562 spin_unlock(&hugetlb_lock); 1789 spin_unlock(&hugetlb_lock);
1563 page = alloc_buddy_huge_page(h, NUMA_NO_NODE); 1790 page = alloc_buddy_huge_page(h, NUMA_NO_NODE);
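
The comments above distinguish two counters with similar names: map_chg says whether the region/reserve map already holds an entry for this offset, while gbl_chg says whether a page must ultimately be charged to the global free pool once the subpool has been consulted. For illustration only, a hypothetical helper (written as if it sat in mm/hugetlb.c; the real logic stays inline in alloc_huge_page()) condenses the decision:

/*
 * Hypothetical helper, illustration only: returns -ENOSPC if the
 * subpool limit would be exceeded, otherwise 0 when an existing
 * reservation covers the page and 1 when it must come from the
 * global free pool.
 */
static long example_global_charge(struct hugepage_subpool *spool,
                                  long map_chg, int avoid_reserve)
{
        long gbl_chg = map_chg;

        if (map_chg || avoid_reserve) {
                gbl_chg = hugepage_subpool_get_pages(spool, 1);
                if (gbl_chg < 0)
                        return -ENOSPC;
                /*
                 * gbl_chg == 0 here means the subpool itself still holds
                 * reserves; with avoid_reserve even those are skipped.
                 */
                if (avoid_reserve)
                        gbl_chg = 1;
        }
        return gbl_chg;
}
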
@@ -1573,8 +1800,8 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
1573 1800
1574 set_page_private(page, (unsigned long)spool); 1801 set_page_private(page, (unsigned long)spool);
1575 1802
1576 commit = vma_commit_reservation(h, vma, addr); 1803 map_commit = vma_commit_reservation(h, vma, addr);
1577 if (unlikely(chg > commit)) { 1804 if (unlikely(map_chg > map_commit)) {
1578 /* 1805 /*
1579 * The page was added to the reservation map between 1806 * The page was added to the reservation map between
1580 * vma_needs_reservation and vma_commit_reservation. 1807 * vma_needs_reservation and vma_commit_reservation.
@@ -1594,8 +1821,9 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
1594out_uncharge_cgroup: 1821out_uncharge_cgroup:
1595 hugetlb_cgroup_uncharge_cgroup(idx, pages_per_huge_page(h), h_cg); 1822 hugetlb_cgroup_uncharge_cgroup(idx, pages_per_huge_page(h), h_cg);
1596out_subpool_put: 1823out_subpool_put:
1597 if (chg || avoid_reserve) 1824 if (map_chg || avoid_reserve)
1598 hugepage_subpool_put_pages(spool, 1); 1825 hugepage_subpool_put_pages(spool, 1);
1826 vma_end_reservation(h, vma, addr);
1599 return ERR_PTR(-ENOSPC); 1827 return ERR_PTR(-ENOSPC);
1600} 1828}
1601 1829
@@ -2311,7 +2539,7 @@ static void __exit hugetlb_exit(void)
2311 } 2539 }
2312 2540
2313 kobject_put(hugepages_kobj); 2541 kobject_put(hugepages_kobj);
2314 kfree(htlb_fault_mutex_table); 2542 kfree(hugetlb_fault_mutex_table);
2315} 2543}
2316module_exit(hugetlb_exit); 2544module_exit(hugetlb_exit);
2317 2545
@@ -2344,12 +2572,12 @@ static int __init hugetlb_init(void)
2344#else 2572#else
2345 num_fault_mutexes = 1; 2573 num_fault_mutexes = 1;
2346#endif 2574#endif
2347 htlb_fault_mutex_table = 2575 hugetlb_fault_mutex_table =
2348 kmalloc(sizeof(struct mutex) * num_fault_mutexes, GFP_KERNEL); 2576 kmalloc(sizeof(struct mutex) * num_fault_mutexes, GFP_KERNEL);
2349 BUG_ON(!htlb_fault_mutex_table); 2577 BUG_ON(!hugetlb_fault_mutex_table);
2350 2578
2351 for (i = 0; i < num_fault_mutexes; i++) 2579 for (i = 0; i < num_fault_mutexes; i++)
2352 mutex_init(&htlb_fault_mutex_table[i]); 2580 mutex_init(&hugetlb_fault_mutex_table[i]);
2353 return 0; 2581 return 0;
2354} 2582}
2355module_init(hugetlb_init); 2583module_init(hugetlb_init);
@@ -3147,6 +3375,23 @@ static bool hugetlbfs_pagecache_present(struct hstate *h,
3147 return page != NULL; 3375 return page != NULL;
3148} 3376}
3149 3377
3378int huge_add_to_page_cache(struct page *page, struct address_space *mapping,
3379 pgoff_t idx)
3380{
3381 struct inode *inode = mapping->host;
3382 struct hstate *h = hstate_inode(inode);
3383 int err = add_to_page_cache(page, mapping, idx, GFP_KERNEL);
3384
3385 if (err)
3386 return err;
3387 ClearPagePrivate(page);
3388
3389 spin_lock(&inode->i_lock);
3390 inode->i_blocks += blocks_per_huge_page(h);
3391 spin_unlock(&inode->i_lock);
3392 return 0;
3393}
3394
3150static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma, 3395static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
3151 struct address_space *mapping, pgoff_t idx, 3396 struct address_space *mapping, pgoff_t idx,
3152 unsigned long address, pte_t *ptep, unsigned int flags) 3397 unsigned long address, pte_t *ptep, unsigned int flags)
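
huge_add_to_page_cache() factors the shared-mapping insertion (page cache add, ClearPagePrivate, i_blocks accounting) out of hugetlb_no_page() and drops the static qualifier, presumably so other hugetlbfs code in this series can populate the page cache directly. A hedged sketch of such a caller (the function name is hypothetical; the helpers used are the ones visible in this patch):

/*
 * Illustration only: allocate a huge page for @idx in a shared mapping
 * and insert it into the page cache with the new helper.
 */
static int example_populate_index(struct vm_area_struct *vma,
                struct address_space *mapping, pgoff_t idx,
                unsigned long addr)
{
        struct page *page;
        int err;

        page = alloc_huge_page(vma, addr, 0);
        if (IS_ERR(page))
                return PTR_ERR(page);

        err = huge_add_to_page_cache(page, mapping, idx);
        if (err) {
                /* -EEXIST means someone else instantiated idx first */
                put_page(page);
                return err;
        }
        return 0;
}
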
@@ -3194,21 +3439,13 @@ retry:
3194 set_page_huge_active(page); 3439 set_page_huge_active(page);
3195 3440
3196 if (vma->vm_flags & VM_MAYSHARE) { 3441 if (vma->vm_flags & VM_MAYSHARE) {
3197 int err; 3442 int err = huge_add_to_page_cache(page, mapping, idx);
3198 struct inode *inode = mapping->host;
3199
3200 err = add_to_page_cache(page, mapping, idx, GFP_KERNEL);
3201 if (err) { 3443 if (err) {
3202 put_page(page); 3444 put_page(page);
3203 if (err == -EEXIST) 3445 if (err == -EEXIST)
3204 goto retry; 3446 goto retry;
3205 goto out; 3447 goto out;
3206 } 3448 }
3207 ClearPagePrivate(page);
3208
3209 spin_lock(&inode->i_lock);
3210 inode->i_blocks += blocks_per_huge_page(h);
3211 spin_unlock(&inode->i_lock);
3212 } else { 3449 } else {
3213 lock_page(page); 3450 lock_page(page);
3214 if (unlikely(anon_vma_prepare(vma))) { 3451 if (unlikely(anon_vma_prepare(vma))) {
@@ -3236,11 +3473,14 @@ retry:
3236 * any allocations necessary to record that reservation occur outside 3473 * any allocations necessary to record that reservation occur outside
3237 * the spinlock. 3474 * the spinlock.
3238 */ 3475 */
3239 if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) 3476 if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) {
3240 if (vma_needs_reservation(h, vma, address) < 0) { 3477 if (vma_needs_reservation(h, vma, address) < 0) {
3241 ret = VM_FAULT_OOM; 3478 ret = VM_FAULT_OOM;
3242 goto backout_unlocked; 3479 goto backout_unlocked;
3243 } 3480 }
3481 /* Just decrements count, does not deallocate */
3482 vma_end_reservation(h, vma, address);
3483 }
3244 3484
3245 ptl = huge_pte_lockptr(h, mm, ptep); 3485 ptl = huge_pte_lockptr(h, mm, ptep);
3246 spin_lock(ptl); 3486 spin_lock(ptl);
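
The needs/end pair added here is deliberate: vma_needs_reservation() forces region_chg() to allocate and cache any region descriptor it might need before the page-table spinlock is taken, and vma_end_reservation() immediately drops the outstanding count again without freeing that cache. A sketch of the pattern in isolation (hypothetical wrapper, not part of the patch):

/*
 * Hypothetical wrapper, illustration only: pre-populate the reserve map
 * cache for @addr so a later region_add() cannot need to allocate while
 * a spinlock is held.
 */
static int example_prepare_reservation(struct hstate *h,
                struct vm_area_struct *vma, unsigned long addr)
{
        if (vma_needs_reservation(h, vma, addr) < 0)
                return -ENOMEM;
        /* Just decrements the count, does not deallocate the cache. */
        vma_end_reservation(h, vma, addr);
        return 0;
}
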
@@ -3280,7 +3520,7 @@ backout_unlocked:
3280} 3520}
3281 3521
3282#ifdef CONFIG_SMP 3522#ifdef CONFIG_SMP
3283static u32 fault_mutex_hash(struct hstate *h, struct mm_struct *mm, 3523u32 hugetlb_fault_mutex_hash(struct hstate *h, struct mm_struct *mm,
3284 struct vm_area_struct *vma, 3524 struct vm_area_struct *vma,
3285 struct address_space *mapping, 3525 struct address_space *mapping,
3286 pgoff_t idx, unsigned long address) 3526 pgoff_t idx, unsigned long address)
@@ -3305,7 +3545,7 @@ static u32 fault_mutex_hash(struct hstate *h, struct mm_struct *mm,
 3305 * For uniprocessor systems we always use a single mutex, so just 3545 * For uniprocessor systems we always use a single mutex, so just
3306 * return 0 and avoid the hashing overhead. 3546 * return 0 and avoid the hashing overhead.
3307 */ 3547 */
3308static u32 fault_mutex_hash(struct hstate *h, struct mm_struct *mm, 3548u32 hugetlb_fault_mutex_hash(struct hstate *h, struct mm_struct *mm,
3309 struct vm_area_struct *vma, 3549 struct vm_area_struct *vma,
3310 struct address_space *mapping, 3550 struct address_space *mapping,
3311 pgoff_t idx, unsigned long address) 3551 pgoff_t idx, unsigned long address)
@@ -3353,8 +3593,8 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3353 * get spurious allocation failures if two CPUs race to instantiate 3593 * get spurious allocation failures if two CPUs race to instantiate
3354 * the same page in the page cache. 3594 * the same page in the page cache.
3355 */ 3595 */
3356 hash = fault_mutex_hash(h, mm, vma, mapping, idx, address); 3596 hash = hugetlb_fault_mutex_hash(h, mm, vma, mapping, idx, address);
3357 mutex_lock(&htlb_fault_mutex_table[hash]); 3597 mutex_lock(&hugetlb_fault_mutex_table[hash]);
3358 3598
3359 entry = huge_ptep_get(ptep); 3599 entry = huge_ptep_get(ptep);
3360 if (huge_pte_none(entry)) { 3600 if (huge_pte_none(entry)) {
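
With the htlb_ prefix replaced by hugetlb_ and the static qualifier dropped from the hash function, the fault serialisation scheme becomes usable outside mm/hugetlb.c. The comment above gives the reason for the lock: two CPUs racing to instantiate the same page in the page cache would otherwise cause spurious allocation failures. A sketch of the locking pattern a caller follows (hypothetical function; the work under the mutex is left as a placeholder):

/*
 * Illustration only: serialise instantiation of (mapping, idx) the same
 * way hugetlb_fault() does above.
 */
static void example_serialised_instantiate(struct hstate *h,
                struct mm_struct *mm, struct vm_area_struct *vma,
                struct address_space *mapping, pgoff_t idx,
                unsigned long address)
{
        u32 hash = hugetlb_fault_mutex_hash(h, mm, vma, mapping, idx, address);

        mutex_lock(&hugetlb_fault_mutex_table[hash]);
        /* ... allocate the page and add it to the page cache for idx ... */
        mutex_unlock(&hugetlb_fault_mutex_table[hash]);
}
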
@@ -3387,6 +3627,8 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3387 ret = VM_FAULT_OOM; 3627 ret = VM_FAULT_OOM;
3388 goto out_mutex; 3628 goto out_mutex;
3389 } 3629 }
3630 /* Just decrements count, does not deallocate */
3631 vma_end_reservation(h, vma, address);
3390 3632
3391 if (!(vma->vm_flags & VM_MAYSHARE)) 3633 if (!(vma->vm_flags & VM_MAYSHARE))
3392 pagecache_page = hugetlbfs_pagecache_page(h, 3634 pagecache_page = hugetlbfs_pagecache_page(h,
@@ -3437,7 +3679,7 @@ out_ptl:
3437 put_page(pagecache_page); 3679 put_page(pagecache_page);
3438 } 3680 }
3439out_mutex: 3681out_mutex:
3440 mutex_unlock(&htlb_fault_mutex_table[hash]); 3682 mutex_unlock(&hugetlb_fault_mutex_table[hash]);
3441 /* 3683 /*
3442 * Generally it's safe to hold refcount during waiting page lock. But 3684 * Generally it's safe to hold refcount during waiting page lock. But
3443 * here we just wait to defer the next page fault to avoid busy loop and 3685 * here we just wait to defer the next page fault to avoid busy loop and
@@ -3726,12 +3968,15 @@ int hugetlb_reserve_pages(struct inode *inode,
3726 } 3968 }
3727 return 0; 3969 return 0;
3728out_err: 3970out_err:
3971 if (!vma || vma->vm_flags & VM_MAYSHARE)
3972 region_abort(resv_map, from, to);
3729 if (vma && is_vma_resv_set(vma, HPAGE_RESV_OWNER)) 3973 if (vma && is_vma_resv_set(vma, HPAGE_RESV_OWNER))
3730 kref_put(&resv_map->refs, resv_map_release); 3974 kref_put(&resv_map->refs, resv_map_release);
3731 return ret; 3975 return ret;
3732} 3976}
3733 3977
3734void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed) 3978long hugetlb_unreserve_pages(struct inode *inode, long start, long end,
3979 long freed)
3735{ 3980{
3736 struct hstate *h = hstate_inode(inode); 3981 struct hstate *h = hstate_inode(inode);
3737 struct resv_map *resv_map = inode_resv_map(inode); 3982 struct resv_map *resv_map = inode_resv_map(inode);
@@ -3739,8 +3984,17 @@ void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
3739 struct hugepage_subpool *spool = subpool_inode(inode); 3984 struct hugepage_subpool *spool = subpool_inode(inode);
3740 long gbl_reserve; 3985 long gbl_reserve;
3741 3986
3742 if (resv_map) 3987 if (resv_map) {
3743 chg = region_truncate(resv_map, offset); 3988 chg = region_del(resv_map, start, end);
3989 /*
3990 * region_del() can fail in the rare case where a region
3991 * must be split and another region descriptor can not be
3992 * allocated. If end == LONG_MAX, it will not fail.
3993 */
3994 if (chg < 0)
3995 return chg;
3996 }
3997
3744 spin_lock(&inode->i_lock); 3998 spin_lock(&inode->i_lock);
3745 inode->i_blocks -= (blocks_per_huge_page(h) * freed); 3999 inode->i_blocks -= (blocks_per_huge_page(h) * freed);
3746 spin_unlock(&inode->i_lock); 4000 spin_unlock(&inode->i_lock);
@@ -3751,6 +4005,8 @@ void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
3751 */ 4005 */
3752 gbl_reserve = hugepage_subpool_put_pages(spool, (chg - freed)); 4006 gbl_reserve = hugepage_subpool_put_pages(spool, (chg - freed));
3753 hugetlb_acct_memory(h, -gbl_reserve); 4007 hugetlb_acct_memory(h, -gbl_reserve);
4008
4009 return 0;
3754} 4010}
3755 4011
3756#ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE 4012#ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE
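
hugetlb_unreserve_pages() switches from a single truncate offset to an explicit [start, end) range and now reports failure, because region_del() (unlike the old region_truncate()) may have to split a region and can run out of descriptors; per the comment above, end == LONG_MAX cannot fail, which preserves the old truncate semantics. Two hedged caller sketches under that assumption (hypothetical function names):

/*
 * Illustration only: truncation removes everything from 'start' onward
 * and is documented above not to fail, so the return value can be
 * ignored; a hole punch covers a finite range and must be prepared for
 * a negative return from the region-split case.
 */
static void example_truncate(struct inode *inode, pgoff_t start, long freed)
{
        (void)hugetlb_unreserve_pages(inode, start, LONG_MAX, freed);
}

static long example_hole_punch(struct inode *inode, pgoff_t start,
                               pgoff_t end, long freed)
{
        return hugetlb_unreserve_pages(inode, start, end, freed);
}
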
diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c
index bf73ac17dad4..aeba0edd6e44 100644
--- a/mm/hwpoison-inject.c
+++ b/mm/hwpoison-inject.c
@@ -58,7 +58,7 @@ inject:
58 pr_info("Injecting memory failure at pfn %#lx\n", pfn); 58 pr_info("Injecting memory failure at pfn %#lx\n", pfn);
59 return memory_failure(pfn, 18, MF_COUNT_INCREASED); 59 return memory_failure(pfn, 18, MF_COUNT_INCREASED);
60put_out: 60put_out:
61 put_page(p); 61 put_hwpoison_page(p);
62 return 0; 62 return 0;
63} 63}
64 64
diff --git a/mm/internal.h b/mm/internal.h
index 1195dd2d6a2b..bc0fa9a69e46 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -182,6 +182,7 @@ struct compact_control {
182 unsigned long nr_migratepages; /* Number of pages to migrate */ 182 unsigned long nr_migratepages; /* Number of pages to migrate */
183 unsigned long free_pfn; /* isolate_freepages search base */ 183 unsigned long free_pfn; /* isolate_freepages search base */
184 unsigned long migrate_pfn; /* isolate_migratepages search base */ 184 unsigned long migrate_pfn; /* isolate_migratepages search base */
185 unsigned long last_migrated_pfn;/* Not yet flushed page being freed */
185 enum migrate_mode mode; /* Async or sync migration mode */ 186 enum migrate_mode mode; /* Async or sync migration mode */
186 bool ignore_skip_hint; /* Scan blocks even if marked skip */ 187 bool ignore_skip_hint; /* Scan blocks even if marked skip */
187 int order; /* order a direct compactor needs */ 188 int order; /* order a direct compactor needs */
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index cf79f110157c..f532f6a37b55 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -838,6 +838,7 @@ static void __init log_early(int op_type, const void *ptr, size_t size,
838 } 838 }
839 839
840 if (crt_early_log >= ARRAY_SIZE(early_log)) { 840 if (crt_early_log >= ARRAY_SIZE(early_log)) {
841 crt_early_log++;
841 kmemleak_disable(); 842 kmemleak_disable();
842 return; 843 return;
843 } 844 }
@@ -1882,7 +1883,7 @@ void __init kmemleak_init(void)
1882 object_cache = KMEM_CACHE(kmemleak_object, SLAB_NOLEAKTRACE); 1883 object_cache = KMEM_CACHE(kmemleak_object, SLAB_NOLEAKTRACE);
1883 scan_area_cache = KMEM_CACHE(kmemleak_scan_area, SLAB_NOLEAKTRACE); 1884 scan_area_cache = KMEM_CACHE(kmemleak_scan_area, SLAB_NOLEAKTRACE);
1884 1885
1885 if (crt_early_log >= ARRAY_SIZE(early_log)) 1886 if (crt_early_log > ARRAY_SIZE(early_log))
1886 pr_warning("Early log buffer exceeded (%d), please increase " 1887 pr_warning("Early log buffer exceeded (%d), please increase "
1887 "DEBUG_KMEMLEAK_EARLY_LOG_SIZE\n", crt_early_log); 1888 "DEBUG_KMEMLEAK_EARLY_LOG_SIZE\n", crt_early_log);
1888 1889
diff --git a/mm/list_lru.c b/mm/list_lru.c
index 909eca2c820e..e1da19fac1b3 100644
--- a/mm/list_lru.c
+++ b/mm/list_lru.c
@@ -99,8 +99,8 @@ bool list_lru_add(struct list_lru *lru, struct list_head *item)
99 struct list_lru_one *l; 99 struct list_lru_one *l;
100 100
101 spin_lock(&nlru->lock); 101 spin_lock(&nlru->lock);
102 l = list_lru_from_kmem(nlru, item);
103 if (list_empty(item)) { 102 if (list_empty(item)) {
103 l = list_lru_from_kmem(nlru, item);
104 list_add_tail(item, &l->list); 104 list_add_tail(item, &l->list);
105 l->nr_items++; 105 l->nr_items++;
106 spin_unlock(&nlru->lock); 106 spin_unlock(&nlru->lock);
@@ -118,8 +118,8 @@ bool list_lru_del(struct list_lru *lru, struct list_head *item)
118 struct list_lru_one *l; 118 struct list_lru_one *l;
119 119
120 spin_lock(&nlru->lock); 120 spin_lock(&nlru->lock);
121 l = list_lru_from_kmem(nlru, item);
122 if (!list_empty(item)) { 121 if (!list_empty(item)) {
122 l = list_lru_from_kmem(nlru, item);
123 list_del_init(item); 123 list_del_init(item);
124 l->nr_items--; 124 l->nr_items--;
125 spin_unlock(&nlru->lock); 125 spin_unlock(&nlru->lock);
diff --git a/mm/madvise.c b/mm/madvise.c
index ce3a4222c7e7..c889fcbb530e 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -301,7 +301,7 @@ static long madvise_remove(struct vm_area_struct *vma,
301 301
302 *prev = NULL; /* tell sys_madvise we drop mmap_sem */ 302 *prev = NULL; /* tell sys_madvise we drop mmap_sem */
303 303
304 if (vma->vm_flags & (VM_LOCKED | VM_HUGETLB)) 304 if (vma->vm_flags & VM_LOCKED)
305 return -EINVAL; 305 return -EINVAL;
306 306
307 f = vma->vm_file; 307 f = vma->vm_file;
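
Dropping VM_HUGETLB from this check means madvise(MADV_REMOVE) on a hugetlbfs mapping is no longer rejected outright with -EINVAL; whether the hole punch then succeeds depends on the filesystem-side fallocate support the rest of this series adds. A user-space sketch, with a hypothetical file on a hugetlbfs mount and a 2 MB huge page size assumed:

/* Illustration only: punch a hole in a shared hugetlbfs mapping.
 * "/dev/hugepages/example" and the 2 MB page size are assumptions for
 * the example; length and offset must be multiples of the huge page
 * size actually in use.
 */
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/mman.h>
#include <unistd.h>

#define LEN (2UL * 1024 * 1024)

int main(void)
{
        int fd = open("/dev/hugepages/example", O_CREAT | O_RDWR, 0600);
        char *p;

        if (fd < 0 || ftruncate(fd, LEN) < 0) {
                perror("open/ftruncate");
                return EXIT_FAILURE;
        }
        p = mmap(NULL, LEN, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
        if (p == MAP_FAILED) {
                perror("mmap");
                return EXIT_FAILURE;
        }
        p[0] = 1;       /* fault the huge page in */

        /* Rejected with EINVAL before this change; attempted afterwards. */
        if (madvise(p, LEN, MADV_REMOVE) != 0)
                perror("madvise(MADV_REMOVE)");

        munmap(p, LEN);
        close(fd);
        return EXIT_SUCCESS;
}
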
diff --git a/mm/memblock.c b/mm/memblock.c
index 95ce68c6da8a..1c7b647e5897 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -91,7 +91,7 @@ static unsigned long __init_memblock memblock_addrs_overlap(phys_addr_t base1, p
91 return ((base1 < (base2 + size2)) && (base2 < (base1 + size1))); 91 return ((base1 < (base2 + size2)) && (base2 < (base1 + size1)));
92} 92}
93 93
94static long __init_memblock memblock_overlaps_region(struct memblock_type *type, 94bool __init_memblock memblock_overlaps_region(struct memblock_type *type,
95 phys_addr_t base, phys_addr_t size) 95 phys_addr_t base, phys_addr_t size)
96{ 96{
97 unsigned long i; 97 unsigned long i;
@@ -103,7 +103,7 @@ static long __init_memblock memblock_overlaps_region(struct memblock_type *type,
103 break; 103 break;
104 } 104 }
105 105
106 return (i < type->cnt) ? i : -1; 106 return i < type->cnt;
107} 107}
108 108
109/* 109/*
@@ -569,6 +569,7 @@ repeat:
569#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP 569#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
570 WARN_ON(nid != memblock_get_region_node(rgn)); 570 WARN_ON(nid != memblock_get_region_node(rgn));
571#endif 571#endif
572 WARN_ON(flags != rgn->flags);
572 nr_new++; 573 nr_new++;
573 if (insert) 574 if (insert)
574 memblock_insert_region(type, i++, base, 575 memblock_insert_region(type, i++, base,
@@ -614,14 +615,14 @@ static int __init_memblock memblock_add_region(phys_addr_t base,
614 int nid, 615 int nid,
615 unsigned long flags) 616 unsigned long flags)
616{ 617{
617 struct memblock_type *_rgn = &memblock.memory; 618 struct memblock_type *type = &memblock.memory;
618 619
619 memblock_dbg("memblock_add: [%#016llx-%#016llx] flags %#02lx %pF\n", 620 memblock_dbg("memblock_add: [%#016llx-%#016llx] flags %#02lx %pF\n",
620 (unsigned long long)base, 621 (unsigned long long)base,
621 (unsigned long long)base + size - 1, 622 (unsigned long long)base + size - 1,
622 flags, (void *)_RET_IP_); 623 flags, (void *)_RET_IP_);
623 624
624 return memblock_add_range(_rgn, base, size, nid, flags); 625 return memblock_add_range(type, base, size, nid, flags);
625} 626}
626 627
627int __init_memblock memblock_add(phys_addr_t base, phys_addr_t size) 628int __init_memblock memblock_add(phys_addr_t base, phys_addr_t size)
@@ -761,7 +762,7 @@ int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size)
761 * 762 *
762 * This function isolates region [@base, @base + @size), and sets/clears flag 763 * This function isolates region [@base, @base + @size), and sets/clears flag
763 * 764 *
764 * Return 0 on succees, -errno on failure. 765 * Return 0 on success, -errno on failure.
765 */ 766 */
766static int __init_memblock memblock_setclr_flag(phys_addr_t base, 767static int __init_memblock memblock_setclr_flag(phys_addr_t base,
767 phys_addr_t size, int set, int flag) 768 phys_addr_t size, int set, int flag)
@@ -788,7 +789,7 @@ static int __init_memblock memblock_setclr_flag(phys_addr_t base,
788 * @base: the base phys addr of the region 789 * @base: the base phys addr of the region
789 * @size: the size of the region 790 * @size: the size of the region
790 * 791 *
791 * Return 0 on succees, -errno on failure. 792 * Return 0 on success, -errno on failure.
792 */ 793 */
793int __init_memblock memblock_mark_hotplug(phys_addr_t base, phys_addr_t size) 794int __init_memblock memblock_mark_hotplug(phys_addr_t base, phys_addr_t size)
794{ 795{
@@ -800,7 +801,7 @@ int __init_memblock memblock_mark_hotplug(phys_addr_t base, phys_addr_t size)
800 * @base: the base phys addr of the region 801 * @base: the base phys addr of the region
801 * @size: the size of the region 802 * @size: the size of the region
802 * 803 *
803 * Return 0 on succees, -errno on failure. 804 * Return 0 on success, -errno on failure.
804 */ 805 */
805int __init_memblock memblock_clear_hotplug(phys_addr_t base, phys_addr_t size) 806int __init_memblock memblock_clear_hotplug(phys_addr_t base, phys_addr_t size)
806{ 807{
@@ -812,7 +813,7 @@ int __init_memblock memblock_clear_hotplug(phys_addr_t base, phys_addr_t size)
812 * @base: the base phys addr of the region 813 * @base: the base phys addr of the region
813 * @size: the size of the region 814 * @size: the size of the region
814 * 815 *
815 * Return 0 on succees, -errno on failure. 816 * Return 0 on success, -errno on failure.
816 */ 817 */
817int __init_memblock memblock_mark_mirror(phys_addr_t base, phys_addr_t size) 818int __init_memblock memblock_mark_mirror(phys_addr_t base, phys_addr_t size)
818{ 819{
@@ -834,10 +835,10 @@ void __init_memblock __next_reserved_mem_region(u64 *idx,
834 phys_addr_t *out_start, 835 phys_addr_t *out_start,
835 phys_addr_t *out_end) 836 phys_addr_t *out_end)
836{ 837{
837 struct memblock_type *rsv = &memblock.reserved; 838 struct memblock_type *type = &memblock.reserved;
838 839
839 if (*idx >= 0 && *idx < rsv->cnt) { 840 if (*idx >= 0 && *idx < type->cnt) {
840 struct memblock_region *r = &rsv->regions[*idx]; 841 struct memblock_region *r = &type->regions[*idx];
841 phys_addr_t base = r->base; 842 phys_addr_t base = r->base;
842 phys_addr_t size = r->size; 843 phys_addr_t size = r->size;
843 844
@@ -975,7 +976,7 @@ void __init_memblock __next_mem_range(u64 *idx, int nid, ulong flags,
975 * in type_b. 976 * in type_b.
976 * 977 *
977 * @idx: pointer to u64 loop variable 978 * @idx: pointer to u64 loop variable
978 * @nid: nid: node selector, %NUMA_NO_NODE for all nodes 979 * @nid: node selector, %NUMA_NO_NODE for all nodes
979 * @flags: pick from blocks based on memory attributes 980 * @flags: pick from blocks based on memory attributes
980 * @type_a: pointer to memblock_type from where the range is taken 981 * @type_a: pointer to memblock_type from where the range is taken
981 * @type_b: pointer to memblock_type which excludes memory from being taken 982 * @type_b: pointer to memblock_type which excludes memory from being taken
@@ -1565,12 +1566,12 @@ int __init_memblock memblock_is_region_memory(phys_addr_t base, phys_addr_t size
1565 * Check if the region [@base, @base+@size) intersects a reserved memory block. 1566 * Check if the region [@base, @base+@size) intersects a reserved memory block.
1566 * 1567 *
1567 * RETURNS: 1568 * RETURNS:
1568 * 0 if false, non-zero if true 1569 * True if they intersect, false if not.
1569 */ 1570 */
1570int __init_memblock memblock_is_region_reserved(phys_addr_t base, phys_addr_t size) 1571bool __init_memblock memblock_is_region_reserved(phys_addr_t base, phys_addr_t size)
1571{ 1572{
1572 memblock_cap_size(base, &size); 1573 memblock_cap_size(base, &size);
1573 return memblock_overlaps_region(&memblock.reserved, base, size) >= 0; 1574 return memblock_overlaps_region(&memblock.reserved, base, size);
1574} 1575}
1575 1576
1576void __init_memblock memblock_trim_memory(phys_addr_t align) 1577void __init_memblock memblock_trim_memory(phys_addr_t align)
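
With memblock_overlaps_region() and memblock_is_region_reserved() converted from index/int returns to bool, callers simply test the returned value rather than comparing against -1 or >= 0. A minimal caller sketch under the new interface:

/* Illustration only: the post-patch boolean interface. */
static void __init example_check_range(phys_addr_t base, phys_addr_t size)
{
        if (memblock_is_region_reserved(base, size))
                pr_warn("range %pa..%pa overlaps reserved memory\n",
                        &base, &size);
}
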
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 1af057575ce9..1742a2db89c7 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -111,56 +111,10 @@ static const char * const mem_cgroup_lru_names[] = {
111 "unevictable", 111 "unevictable",
112}; 112};
113 113
114/*
115 * Per memcg event counter is incremented at every pagein/pageout. With THP,
116 * it will be incremated by the number of pages. This counter is used for
117 * for trigger some periodic events. This is straightforward and better
118 * than using jiffies etc. to handle periodic memcg event.
119 */
120enum mem_cgroup_events_target {
121 MEM_CGROUP_TARGET_THRESH,
122 MEM_CGROUP_TARGET_SOFTLIMIT,
123 MEM_CGROUP_TARGET_NUMAINFO,
124 MEM_CGROUP_NTARGETS,
125};
126#define THRESHOLDS_EVENTS_TARGET 128 114#define THRESHOLDS_EVENTS_TARGET 128
127#define SOFTLIMIT_EVENTS_TARGET 1024 115#define SOFTLIMIT_EVENTS_TARGET 1024
128#define NUMAINFO_EVENTS_TARGET 1024 116#define NUMAINFO_EVENTS_TARGET 1024
129 117
130struct mem_cgroup_stat_cpu {
131 long count[MEM_CGROUP_STAT_NSTATS];
132 unsigned long events[MEMCG_NR_EVENTS];
133 unsigned long nr_page_events;
134 unsigned long targets[MEM_CGROUP_NTARGETS];
135};
136
137struct reclaim_iter {
138 struct mem_cgroup *position;
139 /* scan generation, increased every round-trip */
140 unsigned int generation;
141};
142
143/*
144 * per-zone information in memory controller.
145 */
146struct mem_cgroup_per_zone {
147 struct lruvec lruvec;
148 unsigned long lru_size[NR_LRU_LISTS];
149
150 struct reclaim_iter iter[DEF_PRIORITY + 1];
151
152 struct rb_node tree_node; /* RB tree node */
153 unsigned long usage_in_excess;/* Set to the value by which */
154 /* the soft limit is exceeded*/
155 bool on_tree;
156 struct mem_cgroup *memcg; /* Back pointer, we cannot */
157 /* use container_of */
158};
159
160struct mem_cgroup_per_node {
161 struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES];
162};
163
164/* 118/*
165 * Cgroups above their limits are maintained in a RB-Tree, independent of 119 * Cgroups above their limits are maintained in a RB-Tree, independent of
166 * their hierarchy representation 120 * their hierarchy representation
@@ -181,32 +135,6 @@ struct mem_cgroup_tree {
181 135
182static struct mem_cgroup_tree soft_limit_tree __read_mostly; 136static struct mem_cgroup_tree soft_limit_tree __read_mostly;
183 137
184struct mem_cgroup_threshold {
185 struct eventfd_ctx *eventfd;
186 unsigned long threshold;
187};
188
189/* For threshold */
190struct mem_cgroup_threshold_ary {
191 /* An array index points to threshold just below or equal to usage. */
192 int current_threshold;
193 /* Size of entries[] */
194 unsigned int size;
195 /* Array of thresholds */
196 struct mem_cgroup_threshold entries[0];
197};
198
199struct mem_cgroup_thresholds {
200 /* Primary thresholds array */
201 struct mem_cgroup_threshold_ary *primary;
202 /*
203 * Spare threshold array.
204 * This is needed to make mem_cgroup_unregister_event() "never fail".
205 * It must be able to store at least primary->size - 1 entries.
206 */
207 struct mem_cgroup_threshold_ary *spare;
208};
209
210/* for OOM */ 138/* for OOM */
211struct mem_cgroup_eventfd_list { 139struct mem_cgroup_eventfd_list {
212 struct list_head list; 140 struct list_head list;
@@ -256,113 +184,6 @@ struct mem_cgroup_event {
256static void mem_cgroup_threshold(struct mem_cgroup *memcg); 184static void mem_cgroup_threshold(struct mem_cgroup *memcg);
257static void mem_cgroup_oom_notify(struct mem_cgroup *memcg); 185static void mem_cgroup_oom_notify(struct mem_cgroup *memcg);
258 186
259/*
260 * The memory controller data structure. The memory controller controls both
261 * page cache and RSS per cgroup. We would eventually like to provide
262 * statistics based on the statistics developed by Rik Van Riel for clock-pro,
263 * to help the administrator determine what knobs to tune.
264 */
265struct mem_cgroup {
266 struct cgroup_subsys_state css;
267
268 /* Accounted resources */
269 struct page_counter memory;
270 struct page_counter memsw;
271 struct page_counter kmem;
272
273 /* Normal memory consumption range */
274 unsigned long low;
275 unsigned long high;
276
277 unsigned long soft_limit;
278
279 /* vmpressure notifications */
280 struct vmpressure vmpressure;
281
282 /* css_online() has been completed */
283 int initialized;
284
285 /*
286 * Should the accounting and control be hierarchical, per subtree?
287 */
288 bool use_hierarchy;
289
290 /* protected by memcg_oom_lock */
291 bool oom_lock;
292 int under_oom;
293
294 int swappiness;
295 /* OOM-Killer disable */
296 int oom_kill_disable;
297
298 /* protect arrays of thresholds */
299 struct mutex thresholds_lock;
300
301 /* thresholds for memory usage. RCU-protected */
302 struct mem_cgroup_thresholds thresholds;
303
304 /* thresholds for mem+swap usage. RCU-protected */
305 struct mem_cgroup_thresholds memsw_thresholds;
306
307 /* For oom notifier event fd */
308 struct list_head oom_notify;
309
310 /*
311 * Should we move charges of a task when a task is moved into this
312 * mem_cgroup ? And what type of charges should we move ?
313 */
314 unsigned long move_charge_at_immigrate;
315 /*
316 * set > 0 if pages under this cgroup are moving to other cgroup.
317 */
318 atomic_t moving_account;
319 /* taken only while moving_account > 0 */
320 spinlock_t move_lock;
321 struct task_struct *move_lock_task;
322 unsigned long move_lock_flags;
323 /*
324 * percpu counter.
325 */
326 struct mem_cgroup_stat_cpu __percpu *stat;
327 spinlock_t pcp_counter_lock;
328
329#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET)
330 struct cg_proto tcp_mem;
331#endif
332#if defined(CONFIG_MEMCG_KMEM)
333 /* Index in the kmem_cache->memcg_params.memcg_caches array */
334 int kmemcg_id;
335 bool kmem_acct_activated;
336 bool kmem_acct_active;
337#endif
338
339 int last_scanned_node;
340#if MAX_NUMNODES > 1
341 nodemask_t scan_nodes;
342 atomic_t numainfo_events;
343 atomic_t numainfo_updating;
344#endif
345
346#ifdef CONFIG_CGROUP_WRITEBACK
347 struct list_head cgwb_list;
348 struct wb_domain cgwb_domain;
349#endif
350
351 /* List of events which userspace want to receive */
352 struct list_head event_list;
353 spinlock_t event_list_lock;
354
355 struct mem_cgroup_per_node *nodeinfo[0];
356 /* WARNING: nodeinfo must be the last member here */
357};
358
359#ifdef CONFIG_MEMCG_KMEM
360bool memcg_kmem_is_active(struct mem_cgroup *memcg)
361{
362 return memcg->kmem_acct_active;
363}
364#endif
365
366/* Stuffs for move charges at task migration. */ 187/* Stuffs for move charges at task migration. */
367/* 188/*
368 * Types of charges to be moved. 189 * Types of charges to be moved.
@@ -423,11 +244,6 @@ enum res_type {
423 */ 244 */
424static DEFINE_MUTEX(memcg_create_mutex); 245static DEFINE_MUTEX(memcg_create_mutex);
425 246
426struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *s)
427{
428 return s ? container_of(s, struct mem_cgroup, css) : NULL;
429}
430
431/* Some nice accessors for the vmpressure. */ 247/* Some nice accessors for the vmpressure. */
432struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg) 248struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg)
433{ 249{
@@ -499,8 +315,7 @@ void sock_update_memcg(struct sock *sk)
499 rcu_read_lock(); 315 rcu_read_lock();
500 memcg = mem_cgroup_from_task(current); 316 memcg = mem_cgroup_from_task(current);
501 cg_proto = sk->sk_prot->proto_cgroup(memcg); 317 cg_proto = sk->sk_prot->proto_cgroup(memcg);
502 if (!mem_cgroup_is_root(memcg) && 318 if (cg_proto && test_bit(MEMCG_SOCK_ACTIVE, &cg_proto->flags) &&
503 memcg_proto_active(cg_proto) &&
504 css_tryget_online(&memcg->css)) { 319 css_tryget_online(&memcg->css)) {
505 sk->sk_cgrp = cg_proto; 320 sk->sk_cgrp = cg_proto;
506 } 321 }
@@ -593,11 +408,6 @@ mem_cgroup_zone_zoneinfo(struct mem_cgroup *memcg, struct zone *zone)
593 return &memcg->nodeinfo[nid]->zoneinfo[zid]; 408 return &memcg->nodeinfo[nid]->zoneinfo[zid];
594} 409}
595 410
596struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *memcg)
597{
598 return &memcg->css;
599}
600
601/** 411/**
602 * mem_cgroup_css_from_page - css of the memcg associated with a page 412 * mem_cgroup_css_from_page - css of the memcg associated with a page
603 * @page: page of interest 413 * @page: page of interest
@@ -876,14 +686,6 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
876 __this_cpu_add(memcg->stat->nr_page_events, nr_pages); 686 __this_cpu_add(memcg->stat->nr_page_events, nr_pages);
877} 687}
878 688
879unsigned long mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru)
880{
881 struct mem_cgroup_per_zone *mz;
882
883 mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec);
884 return mz->lru_size[lru];
885}
886
887static unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg, 689static unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
888 int nid, 690 int nid,
889 unsigned int lru_mask) 691 unsigned int lru_mask)
@@ -986,6 +788,7 @@ struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
986 788
987 return mem_cgroup_from_css(task_css(p, memory_cgrp_id)); 789 return mem_cgroup_from_css(task_css(p, memory_cgrp_id));
988} 790}
791EXPORT_SYMBOL(mem_cgroup_from_task);
989 792
990static struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm) 793static struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)
991{ 794{
@@ -1031,7 +834,7 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
1031 struct mem_cgroup *prev, 834 struct mem_cgroup *prev,
1032 struct mem_cgroup_reclaim_cookie *reclaim) 835 struct mem_cgroup_reclaim_cookie *reclaim)
1033{ 836{
1034 struct reclaim_iter *uninitialized_var(iter); 837 struct mem_cgroup_reclaim_iter *uninitialized_var(iter);
1035 struct cgroup_subsys_state *css = NULL; 838 struct cgroup_subsys_state *css = NULL;
1036 struct mem_cgroup *memcg = NULL; 839 struct mem_cgroup *memcg = NULL;
1037 struct mem_cgroup *pos = NULL; 840 struct mem_cgroup *pos = NULL;
@@ -1173,30 +976,6 @@ void mem_cgroup_iter_break(struct mem_cgroup *root,
1173 iter != NULL; \ 976 iter != NULL; \
1174 iter = mem_cgroup_iter(NULL, iter, NULL)) 977 iter = mem_cgroup_iter(NULL, iter, NULL))
1175 978
1176void __mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx)
1177{
1178 struct mem_cgroup *memcg;
1179
1180 rcu_read_lock();
1181 memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
1182 if (unlikely(!memcg))
1183 goto out;
1184
1185 switch (idx) {
1186 case PGFAULT:
1187 this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGFAULT]);
1188 break;
1189 case PGMAJFAULT:
1190 this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGMAJFAULT]);
1191 break;
1192 default:
1193 BUG();
1194 }
1195out:
1196 rcu_read_unlock();
1197}
1198EXPORT_SYMBOL(__mem_cgroup_count_vm_event);
1199
1200/** 979/**
1201 * mem_cgroup_zone_lruvec - get the lru list vector for a zone and memcg 980 * mem_cgroup_zone_lruvec - get the lru list vector for a zone and memcg
1202 * @zone: zone of the wanted lruvec 981 * @zone: zone of the wanted lruvec
@@ -1295,15 +1074,6 @@ void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
1295 VM_BUG_ON((long)(*lru_size) < 0); 1074 VM_BUG_ON((long)(*lru_size) < 0);
1296} 1075}
1297 1076
1298bool mem_cgroup_is_descendant(struct mem_cgroup *memcg, struct mem_cgroup *root)
1299{
1300 if (root == memcg)
1301 return true;
1302 if (!root->use_hierarchy)
1303 return false;
1304 return cgroup_is_descendant(memcg->css.cgroup, root->css.cgroup);
1305}
1306
1307bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg) 1077bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg)
1308{ 1078{
1309 struct mem_cgroup *task_memcg; 1079 struct mem_cgroup *task_memcg;
@@ -1330,39 +1100,6 @@ bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg)
1330 return ret; 1100 return ret;
1331} 1101}
1332 1102
1333int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec)
1334{
1335 unsigned long inactive_ratio;
1336 unsigned long inactive;
1337 unsigned long active;
1338 unsigned long gb;
1339
1340 inactive = mem_cgroup_get_lru_size(lruvec, LRU_INACTIVE_ANON);
1341 active = mem_cgroup_get_lru_size(lruvec, LRU_ACTIVE_ANON);
1342
1343 gb = (inactive + active) >> (30 - PAGE_SHIFT);
1344 if (gb)
1345 inactive_ratio = int_sqrt(10 * gb);
1346 else
1347 inactive_ratio = 1;
1348
1349 return inactive * inactive_ratio < active;
1350}
1351
1352bool mem_cgroup_lruvec_online(struct lruvec *lruvec)
1353{
1354 struct mem_cgroup_per_zone *mz;
1355 struct mem_cgroup *memcg;
1356
1357 if (mem_cgroup_disabled())
1358 return true;
1359
1360 mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec);
1361 memcg = mz->memcg;
1362
1363 return !!(memcg->css.flags & CSS_ONLINE);
1364}
1365
1366#define mem_cgroup_from_counter(counter, member) \ 1103#define mem_cgroup_from_counter(counter, member) \
1367 container_of(counter, struct mem_cgroup, member) 1104 container_of(counter, struct mem_cgroup, member)
1368 1105
@@ -1394,15 +1131,6 @@ static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg)
1394 return margin; 1131 return margin;
1395} 1132}
1396 1133
1397int mem_cgroup_swappiness(struct mem_cgroup *memcg)
1398{
1399 /* root ? */
1400 if (mem_cgroup_disabled() || !memcg->css.parent)
1401 return vm_swappiness;
1402
1403 return memcg->swappiness;
1404}
1405
1406/* 1134/*
1407 * A routine for checking "mem" is under move_account() or not. 1135 * A routine for checking "mem" is under move_account() or not.
1408 * 1136 *
@@ -1545,6 +1273,12 @@ static unsigned long mem_cgroup_get_limit(struct mem_cgroup *memcg)
1545static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, 1273static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
1546 int order) 1274 int order)
1547{ 1275{
1276 struct oom_control oc = {
1277 .zonelist = NULL,
1278 .nodemask = NULL,
1279 .gfp_mask = gfp_mask,
1280 .order = order,
1281 };
1548 struct mem_cgroup *iter; 1282 struct mem_cgroup *iter;
1549 unsigned long chosen_points = 0; 1283 unsigned long chosen_points = 0;
1550 unsigned long totalpages; 1284 unsigned long totalpages;
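
struct oom_control gathers the allocation context (zonelist, nodemask, gfp_mask, order) that the OOM helpers previously took as separate arguments; the memcg path leaves zonelist and nodemask NULL because the constraint is the cgroup rather than a zone. A condensed sketch of the post-patch calling convention, mirroring this hunk and the two below (hypothetical wrapper name; the real logic stays in mem_cgroup_out_of_memory()):

/* Illustration only: how the OOM helpers are invoked once they take a
 * struct oom_control instead of loose gfp/order/nodemask arguments.
 */
static void example_memcg_oom(struct mem_cgroup *memcg, gfp_t gfp_mask,
                              int order, struct task_struct *chosen,
                              unsigned int points, unsigned long totalpages)
{
        struct oom_control oc = {
                .zonelist = NULL,       /* constrained by the memcg, not a zone */
                .nodemask = NULL,
                .gfp_mask = gfp_mask,
                .order = order,
        };

        check_panic_on_oom(&oc, CONSTRAINT_MEMCG, memcg);
        oom_kill_process(&oc, chosen, points, totalpages, memcg,
                         "Memory cgroup out of memory");
}
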
@@ -1563,7 +1297,7 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
1563 goto unlock; 1297 goto unlock;
1564 } 1298 }
1565 1299
1566 check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL, memcg); 1300 check_panic_on_oom(&oc, CONSTRAINT_MEMCG, memcg);
1567 totalpages = mem_cgroup_get_limit(memcg) ? : 1; 1301 totalpages = mem_cgroup_get_limit(memcg) ? : 1;
1568 for_each_mem_cgroup_tree(iter, memcg) { 1302 for_each_mem_cgroup_tree(iter, memcg) {
1569 struct css_task_iter it; 1303 struct css_task_iter it;
@@ -1571,8 +1305,7 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
1571 1305
1572 css_task_iter_start(&iter->css, &it); 1306 css_task_iter_start(&iter->css, &it);
1573 while ((task = css_task_iter_next(&it))) { 1307 while ((task = css_task_iter_next(&it))) {
1574 switch (oom_scan_process_thread(task, totalpages, NULL, 1308 switch (oom_scan_process_thread(&oc, task, totalpages)) {
1575 false)) {
1576 case OOM_SCAN_SELECT: 1309 case OOM_SCAN_SELECT:
1577 if (chosen) 1310 if (chosen)
1578 put_task_struct(chosen); 1311 put_task_struct(chosen);
@@ -1610,8 +1343,8 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
1610 1343
1611 if (chosen) { 1344 if (chosen) {
1612 points = chosen_points * 1000 / totalpages; 1345 points = chosen_points * 1000 / totalpages;
1613 oom_kill_process(chosen, gfp_mask, order, points, totalpages, 1346 oom_kill_process(&oc, chosen, points, totalpages, memcg,
1614 memcg, NULL, "Memory cgroup out of memory"); 1347 "Memory cgroup out of memory");
1615 } 1348 }
1616unlock: 1349unlock:
1617 mutex_unlock(&oom_lock); 1350 mutex_unlock(&oom_lock);
@@ -2062,23 +1795,6 @@ void mem_cgroup_end_page_stat(struct mem_cgroup *memcg)
2062} 1795}
2063EXPORT_SYMBOL(mem_cgroup_end_page_stat); 1796EXPORT_SYMBOL(mem_cgroup_end_page_stat);
2064 1797
2065/**
2066 * mem_cgroup_update_page_stat - update page state statistics
2067 * @memcg: memcg to account against
2068 * @idx: page state item to account
2069 * @val: number of pages (positive or negative)
2070 *
2071 * See mem_cgroup_begin_page_stat() for locking requirements.
2072 */
2073void mem_cgroup_update_page_stat(struct mem_cgroup *memcg,
2074 enum mem_cgroup_stat_index idx, int val)
2075{
2076 VM_BUG_ON(!rcu_read_lock_held());
2077
2078 if (memcg)
2079 this_cpu_add(memcg->stat->count[idx], val);
2080}
2081
2082/* 1798/*
2083 * size of first charge trial. "32" comes from vmscan.c's magic value. 1799 * size of first charge trial. "32" comes from vmscan.c's magic value.
2084 * TODO: maybe necessary to use big numbers in big irons. 1800 * TODO: maybe necessary to use big numbers in big irons.
@@ -2504,16 +2220,6 @@ void memcg_uncharge_kmem(struct mem_cgroup *memcg, unsigned long nr_pages)
2504 css_put_many(&memcg->css, nr_pages); 2220 css_put_many(&memcg->css, nr_pages);
2505} 2221}
2506 2222
2507/*
2508 * helper for acessing a memcg's index. It will be used as an index in the
2509 * child cache array in kmem_cache, and also to derive its name. This function
2510 * will return -1 when this is not a kmem-limited memcg.
2511 */
2512int memcg_cache_id(struct mem_cgroup *memcg)
2513{
2514 return memcg ? memcg->kmemcg_id : -1;
2515}
2516
2517static int memcg_alloc_cache_id(void) 2223static int memcg_alloc_cache_id(void)
2518{ 2224{
2519 int id, size; 2225 int id, size;
@@ -5127,10 +4833,12 @@ static void mem_cgroup_clear_mc(void)
5127static int mem_cgroup_can_attach(struct cgroup_subsys_state *css, 4833static int mem_cgroup_can_attach(struct cgroup_subsys_state *css,
5128 struct cgroup_taskset *tset) 4834 struct cgroup_taskset *tset)
5129{ 4835{
5130 struct task_struct *p = cgroup_taskset_first(tset);
5131 int ret = 0;
5132 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 4836 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
4837 struct mem_cgroup *from;
4838 struct task_struct *p;
4839 struct mm_struct *mm;
5133 unsigned long move_flags; 4840 unsigned long move_flags;
4841 int ret = 0;
5134 4842
5135 /* 4843 /*
5136 * We are now commited to this value whatever it is. Changes in this 4844 * We are now commited to this value whatever it is. Changes in this
@@ -5138,36 +4846,37 @@ static int mem_cgroup_can_attach(struct cgroup_subsys_state *css,
5138 * So we need to save it, and keep it going. 4846 * So we need to save it, and keep it going.
5139 */ 4847 */
5140 move_flags = READ_ONCE(memcg->move_charge_at_immigrate); 4848 move_flags = READ_ONCE(memcg->move_charge_at_immigrate);
5141 if (move_flags) { 4849 if (!move_flags)
5142 struct mm_struct *mm; 4850 return 0;
5143 struct mem_cgroup *from = mem_cgroup_from_task(p);
5144 4851
5145 VM_BUG_ON(from == memcg); 4852 p = cgroup_taskset_first(tset);
4853 from = mem_cgroup_from_task(p);
5146 4854
5147 mm = get_task_mm(p); 4855 VM_BUG_ON(from == memcg);
5148 if (!mm) 4856
5149 return 0; 4857 mm = get_task_mm(p);
5150 /* We move charges only when we move a owner of the mm */ 4858 if (!mm)
5151 if (mm->owner == p) { 4859 return 0;
5152 VM_BUG_ON(mc.from); 4860 /* We move charges only when we move a owner of the mm */
5153 VM_BUG_ON(mc.to); 4861 if (mm->owner == p) {
5154 VM_BUG_ON(mc.precharge); 4862 VM_BUG_ON(mc.from);
5155 VM_BUG_ON(mc.moved_charge); 4863 VM_BUG_ON(mc.to);
5156 VM_BUG_ON(mc.moved_swap); 4864 VM_BUG_ON(mc.precharge);
5157 4865 VM_BUG_ON(mc.moved_charge);
5158 spin_lock(&mc.lock); 4866 VM_BUG_ON(mc.moved_swap);
5159 mc.from = from; 4867
5160 mc.to = memcg; 4868 spin_lock(&mc.lock);
5161 mc.flags = move_flags; 4869 mc.from = from;
5162 spin_unlock(&mc.lock); 4870 mc.to = memcg;
5163 /* We set mc.moving_task later */ 4871 mc.flags = move_flags;
5164 4872 spin_unlock(&mc.lock);
5165 ret = mem_cgroup_precharge_mc(mm); 4873 /* We set mc.moving_task later */
5166 if (ret) 4874
5167 mem_cgroup_clear_mc(); 4875 ret = mem_cgroup_precharge_mc(mm);
5168 } 4876 if (ret)
5169 mmput(mm); 4877 mem_cgroup_clear_mc();
5170 } 4878 }
4879 mmput(mm);
5171 return ret; 4880 return ret;
5172} 4881}
5173 4882
@@ -5521,19 +5230,6 @@ struct cgroup_subsys memory_cgrp_subsys = {
5521}; 5230};
5522 5231
5523/** 5232/**
5524 * mem_cgroup_events - count memory events against a cgroup
5525 * @memcg: the memory cgroup
5526 * @idx: the event index
5527 * @nr: the number of events to account for
5528 */
5529void mem_cgroup_events(struct mem_cgroup *memcg,
5530 enum mem_cgroup_events_index idx,
5531 unsigned int nr)
5532{
5533 this_cpu_add(memcg->stat->events[idx], nr);
5534}
5535
5536/**
5537 * mem_cgroup_low - check if memory consumption is below the normal range 5233 * mem_cgroup_low - check if memory consumption is below the normal range
5538 * @root: the highest ancestor to consider 5234 * @root: the highest ancestor to consider
5539 * @memcg: the memory cgroup to check 5235 * @memcg: the memory cgroup to check
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 1f4446a90cef..eeda6485e76c 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -146,7 +146,7 @@ static int hwpoison_filter_task(struct page *p)
146 if (!mem) 146 if (!mem)
147 return -EINVAL; 147 return -EINVAL;
148 148
149 css = mem_cgroup_css(mem); 149 css = &mem->css;
150 ino = cgroup_ino(css->cgroup); 150 ino = cgroup_ino(css->cgroup);
151 css_put(css); 151 css_put(css);
152 152
@@ -934,6 +934,27 @@ int get_hwpoison_page(struct page *page)
934} 934}
935EXPORT_SYMBOL_GPL(get_hwpoison_page); 935EXPORT_SYMBOL_GPL(get_hwpoison_page);
936 936
937/**
938 * put_hwpoison_page() - Put refcount for memory error handling:
939 * @page: raw error page (hit by memory error)
940 */
941void put_hwpoison_page(struct page *page)
942{
943 struct page *head = compound_head(page);
944
945 if (PageHuge(head)) {
946 put_page(head);
947 return;
948 }
949
950 if (PageTransHuge(head))
951 if (page != head)
952 put_page(head);
953
954 put_page(page);
955}
956EXPORT_SYMBOL_GPL(put_hwpoison_page);
957
937/* 958/*
938 * Do all that is necessary to remove user space mappings. Unmap 959 * Do all that is necessary to remove user space mappings. Unmap
939 * the pages and send SIGBUS to the processes if the data was dirty. 960 * the pages and send SIGBUS to the processes if the data was dirty.
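
put_hwpoison_page() gives every caller one place that knows how the reference obtained by get_hwpoison_page() must be dropped: on the head page for hugetlb, with an extra head-page put for THP tail pages, and as a plain put_page() otherwise. A sketch of the intended pairing (hypothetical caller; get_hwpoison_page() is assumed to return non-zero when it pins the page):

/*
 * Illustration only: pair get_hwpoison_page()/put_hwpoison_page() around
 * whatever inspection or isolation the memory-failure code performs.
 */
static int example_inspect_page(struct page *p)
{
        if (!get_hwpoison_page(p))
                return -EIO;            /* could not pin the page */

        /* ... examine or isolate the page ... */

        put_hwpoison_page(p);           /* drops the reference taken above */
        return 0;
}
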
@@ -1100,7 +1121,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
1100 nr_pages = 1 << compound_order(hpage); 1121 nr_pages = 1 << compound_order(hpage);
1101 else /* normal page or thp */ 1122 else /* normal page or thp */
1102 nr_pages = 1; 1123 nr_pages = 1;
1103 atomic_long_add(nr_pages, &num_poisoned_pages); 1124 num_poisoned_pages_add(nr_pages);
1104 1125
1105 /* 1126 /*
1106 * We need/can do nothing about count=0 pages. 1127 * We need/can do nothing about count=0 pages.
@@ -1128,7 +1149,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
1128 if (PageHWPoison(hpage)) { 1149 if (PageHWPoison(hpage)) {
1129 if ((hwpoison_filter(p) && TestClearPageHWPoison(p)) 1150 if ((hwpoison_filter(p) && TestClearPageHWPoison(p))
1130 || (p != hpage && TestSetPageHWPoison(hpage))) { 1151 || (p != hpage && TestSetPageHWPoison(hpage))) {
1131 atomic_long_sub(nr_pages, &num_poisoned_pages); 1152 num_poisoned_pages_sub(nr_pages);
1132 unlock_page(hpage); 1153 unlock_page(hpage);
1133 return 0; 1154 return 0;
1134 } 1155 }
@@ -1152,10 +1173,8 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
1152 else 1173 else
1153 pr_err("MCE: %#lx: thp split failed\n", pfn); 1174 pr_err("MCE: %#lx: thp split failed\n", pfn);
1154 if (TestClearPageHWPoison(p)) 1175 if (TestClearPageHWPoison(p))
1155 atomic_long_sub(nr_pages, &num_poisoned_pages); 1176 num_poisoned_pages_sub(nr_pages);
1156 put_page(p); 1177 put_hwpoison_page(p);
1157 if (p != hpage)
1158 put_page(hpage);
1159 return -EBUSY; 1178 return -EBUSY;
1160 } 1179 }
1161 VM_BUG_ON_PAGE(!page_count(p), p); 1180 VM_BUG_ON_PAGE(!page_count(p), p);
@@ -1214,16 +1233,16 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
1214 */ 1233 */
1215 if (!PageHWPoison(p)) { 1234 if (!PageHWPoison(p)) {
1216 printk(KERN_ERR "MCE %#lx: just unpoisoned\n", pfn); 1235 printk(KERN_ERR "MCE %#lx: just unpoisoned\n", pfn);
1217 atomic_long_sub(nr_pages, &num_poisoned_pages); 1236 num_poisoned_pages_sub(nr_pages);
1218 unlock_page(hpage); 1237 unlock_page(hpage);
1219 put_page(hpage); 1238 put_hwpoison_page(hpage);
1220 return 0; 1239 return 0;
1221 } 1240 }
1222 if (hwpoison_filter(p)) { 1241 if (hwpoison_filter(p)) {
1223 if (TestClearPageHWPoison(p)) 1242 if (TestClearPageHWPoison(p))
1224 atomic_long_sub(nr_pages, &num_poisoned_pages); 1243 num_poisoned_pages_sub(nr_pages);
1225 unlock_page(hpage); 1244 unlock_page(hpage);
1226 put_page(hpage); 1245 put_hwpoison_page(hpage);
1227 return 0; 1246 return 0;
1228 } 1247 }
1229 1248
@@ -1237,7 +1256,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
1237 if (PageHuge(p) && PageTail(p) && TestSetPageHWPoison(hpage)) { 1256 if (PageHuge(p) && PageTail(p) && TestSetPageHWPoison(hpage)) {
1238 action_result(pfn, MF_MSG_POISONED_HUGE, MF_IGNORED); 1257 action_result(pfn, MF_MSG_POISONED_HUGE, MF_IGNORED);
1239 unlock_page(hpage); 1258 unlock_page(hpage);
1240 put_page(hpage); 1259 put_hwpoison_page(hpage);
1241 return 0; 1260 return 0;
1242 } 1261 }
1243 /* 1262 /*
@@ -1426,6 +1445,22 @@ int unpoison_memory(unsigned long pfn)
1426 return 0; 1445 return 0;
1427 } 1446 }
1428 1447
1448 if (page_count(page) > 1) {
1449 pr_info("MCE: Someone grabs the hwpoison page %#lx\n", pfn);
1450 return 0;
1451 }
1452
1453 if (page_mapped(page)) {
1454 pr_info("MCE: Someone maps the hwpoison page %#lx\n", pfn);
1455 return 0;
1456 }
1457
1458 if (page_mapping(page)) {
1459 pr_info("MCE: the hwpoison page has non-NULL mapping %#lx\n",
1460 pfn);
1461 return 0;
1462 }
1463
1429 /* 1464 /*
1430 * unpoison_memory() can encounter thp only when the thp is being 1465 * unpoison_memory() can encounter thp only when the thp is being
1431 * worked by memory_failure() and the page lock is not held yet. 1466 * worked by memory_failure() and the page lock is not held yet.
@@ -1450,7 +1485,7 @@ int unpoison_memory(unsigned long pfn)
1450 return 0; 1485 return 0;
1451 } 1486 }
1452 if (TestClearPageHWPoison(p)) 1487 if (TestClearPageHWPoison(p))
1453 atomic_long_dec(&num_poisoned_pages); 1488 num_poisoned_pages_dec();
1454 pr_info("MCE: Software-unpoisoned free page %#lx\n", pfn); 1489 pr_info("MCE: Software-unpoisoned free page %#lx\n", pfn);
1455 return 0; 1490 return 0;
1456 } 1491 }
@@ -1464,16 +1499,16 @@ int unpoison_memory(unsigned long pfn)
1464 */ 1499 */
1465 if (TestClearPageHWPoison(page)) { 1500 if (TestClearPageHWPoison(page)) {
1466 pr_info("MCE: Software-unpoisoned page %#lx\n", pfn); 1501 pr_info("MCE: Software-unpoisoned page %#lx\n", pfn);
1467 atomic_long_sub(nr_pages, &num_poisoned_pages); 1502 num_poisoned_pages_sub(nr_pages);
1468 freeit = 1; 1503 freeit = 1;
1469 if (PageHuge(page)) 1504 if (PageHuge(page))
1470 clear_page_hwpoison_huge_page(page); 1505 clear_page_hwpoison_huge_page(page);
1471 } 1506 }
1472 unlock_page(page); 1507 unlock_page(page);
1473 1508
1474 put_page(page); 1509 put_hwpoison_page(page);
1475 if (freeit && !(pfn == my_zero_pfn(0) && page_count(p) == 1)) 1510 if (freeit && !(pfn == my_zero_pfn(0) && page_count(p) == 1))
1476 put_page(page); 1511 put_hwpoison_page(page);
1477 1512
1478 return 0; 1513 return 0;
1479} 1514}
@@ -1486,7 +1521,7 @@ static struct page *new_page(struct page *p, unsigned long private, int **x)
1486 return alloc_huge_page_node(page_hstate(compound_head(p)), 1521 return alloc_huge_page_node(page_hstate(compound_head(p)),
1487 nid); 1522 nid);
1488 else 1523 else
1489 return alloc_pages_exact_node(nid, GFP_HIGHUSER_MOVABLE, 0); 1524 return __alloc_pages_node(nid, GFP_HIGHUSER_MOVABLE, 0);
1490} 1525}
1491 1526
1492/* 1527/*
@@ -1533,7 +1568,7 @@ static int get_any_page(struct page *page, unsigned long pfn, int flags)
1533 /* 1568 /*
1534 * Try to free it. 1569 * Try to free it.
1535 */ 1570 */
1536 put_page(page); 1571 put_hwpoison_page(page);
1537 shake_page(page, 1); 1572 shake_page(page, 1);
1538 1573
1539 /* 1574 /*
@@ -1542,7 +1577,7 @@ static int get_any_page(struct page *page, unsigned long pfn, int flags)
1542 ret = __get_any_page(page, pfn, 0); 1577 ret = __get_any_page(page, pfn, 0);
1543 if (!PageLRU(page)) { 1578 if (!PageLRU(page)) {
1544 /* Drop page reference which is from __get_any_page() */ 1579 /* Drop page reference which is from __get_any_page() */
1545 put_page(page); 1580 put_hwpoison_page(page);
1546 pr_info("soft_offline: %#lx: unknown non LRU page type %lx\n", 1581 pr_info("soft_offline: %#lx: unknown non LRU page type %lx\n",
1547 pfn, page->flags); 1582 pfn, page->flags);
1548 return -EIO; 1583 return -EIO;
@@ -1565,7 +1600,7 @@ static int soft_offline_huge_page(struct page *page, int flags)
1565 lock_page(hpage); 1600 lock_page(hpage);
1566 if (PageHWPoison(hpage)) { 1601 if (PageHWPoison(hpage)) {
1567 unlock_page(hpage); 1602 unlock_page(hpage);
1568 put_page(hpage); 1603 put_hwpoison_page(hpage);
1569 pr_info("soft offline: %#lx hugepage already poisoned\n", pfn); 1604 pr_info("soft offline: %#lx hugepage already poisoned\n", pfn);
1570 return -EBUSY; 1605 return -EBUSY;
1571 } 1606 }
@@ -1576,7 +1611,7 @@ static int soft_offline_huge_page(struct page *page, int flags)
1576 * get_any_page() and isolate_huge_page() takes a refcount each, 1611 * get_any_page() and isolate_huge_page() takes a refcount each,
1577 * so need to drop one here. 1612 * so need to drop one here.
1578 */ 1613 */
1579 put_page(hpage); 1614 put_hwpoison_page(hpage);
1580 if (!ret) { 1615 if (!ret) {
1581 pr_info("soft offline: %#lx hugepage failed to isolate\n", pfn); 1616 pr_info("soft offline: %#lx hugepage failed to isolate\n", pfn);
1582 return -EBUSY; 1617 return -EBUSY;
@@ -1600,11 +1635,10 @@ static int soft_offline_huge_page(struct page *page, int flags)
1600 if (PageHuge(page)) { 1635 if (PageHuge(page)) {
1601 set_page_hwpoison_huge_page(hpage); 1636 set_page_hwpoison_huge_page(hpage);
1602 dequeue_hwpoisoned_huge_page(hpage); 1637 dequeue_hwpoisoned_huge_page(hpage);
1603 atomic_long_add(1 << compound_order(hpage), 1638 num_poisoned_pages_add(1 << compound_order(hpage));
1604 &num_poisoned_pages);
1605 } else { 1639 } else {
1606 SetPageHWPoison(page); 1640 SetPageHWPoison(page);
1607 atomic_long_inc(&num_poisoned_pages); 1641 num_poisoned_pages_inc();
1608 } 1642 }
1609 } 1643 }
1610 return ret; 1644 return ret;
@@ -1625,7 +1659,7 @@ static int __soft_offline_page(struct page *page, int flags)
1625 wait_on_page_writeback(page); 1659 wait_on_page_writeback(page);
1626 if (PageHWPoison(page)) { 1660 if (PageHWPoison(page)) {
1627 unlock_page(page); 1661 unlock_page(page);
1628 put_page(page); 1662 put_hwpoison_page(page);
1629 pr_info("soft offline: %#lx page already poisoned\n", pfn); 1663 pr_info("soft offline: %#lx page already poisoned\n", pfn);
1630 return -EBUSY; 1664 return -EBUSY;
1631 } 1665 }
@@ -1640,10 +1674,10 @@ static int __soft_offline_page(struct page *page, int flags)
1640 * would need to fix isolation locking first. 1674 * would need to fix isolation locking first.
1641 */ 1675 */
1642 if (ret == 1) { 1676 if (ret == 1) {
1643 put_page(page); 1677 put_hwpoison_page(page);
1644 pr_info("soft_offline: %#lx: invalidated\n", pfn); 1678 pr_info("soft_offline: %#lx: invalidated\n", pfn);
1645 SetPageHWPoison(page); 1679 SetPageHWPoison(page);
1646 atomic_long_inc(&num_poisoned_pages); 1680 num_poisoned_pages_inc();
1647 return 0; 1681 return 0;
1648 } 1682 }
1649 1683
@@ -1657,14 +1691,12 @@ static int __soft_offline_page(struct page *page, int flags)
1657 * Drop page reference which is came from get_any_page() 1691 * Drop page reference which is came from get_any_page()
1658 * successful isolate_lru_page() already took another one. 1692 * successful isolate_lru_page() already took another one.
1659 */ 1693 */
1660 put_page(page); 1694 put_hwpoison_page(page);
1661 if (!ret) { 1695 if (!ret) {
1662 LIST_HEAD(pagelist); 1696 LIST_HEAD(pagelist);
1663 inc_zone_page_state(page, NR_ISOLATED_ANON + 1697 inc_zone_page_state(page, NR_ISOLATED_ANON +
1664 page_is_file_cache(page)); 1698 page_is_file_cache(page));
1665 list_add(&page->lru, &pagelist); 1699 list_add(&page->lru, &pagelist);
1666 if (!TestSetPageHWPoison(page))
1667 atomic_long_inc(&num_poisoned_pages);
1668 ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL, 1700 ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL,
1669 MIGRATE_SYNC, MR_MEMORY_FAILURE); 1701 MIGRATE_SYNC, MR_MEMORY_FAILURE);
1670 if (ret) { 1702 if (ret) {
@@ -1679,8 +1711,6 @@ static int __soft_offline_page(struct page *page, int flags)
1679 pfn, ret, page->flags); 1711 pfn, ret, page->flags);
1680 if (ret > 0) 1712 if (ret > 0)
1681 ret = -EIO; 1713 ret = -EIO;
1682 if (TestClearPageHWPoison(page))
1683 atomic_long_dec(&num_poisoned_pages);
1684 } 1714 }
1685 } else { 1715 } else {
1686 pr_info("soft offline: %#lx: isolation failed: %d, page count %d, type %lx\n", 1716 pr_info("soft offline: %#lx: isolation failed: %d, page count %d, type %lx\n",
@@ -1719,12 +1749,16 @@ int soft_offline_page(struct page *page, int flags)
1719 1749
1720 if (PageHWPoison(page)) { 1750 if (PageHWPoison(page)) {
1721 pr_info("soft offline: %#lx page already poisoned\n", pfn); 1751 pr_info("soft offline: %#lx page already poisoned\n", pfn);
1752 if (flags & MF_COUNT_INCREASED)
1753 put_hwpoison_page(page);
1722 return -EBUSY; 1754 return -EBUSY;
1723 } 1755 }
1724 if (!PageHuge(page) && PageTransHuge(hpage)) { 1756 if (!PageHuge(page) && PageTransHuge(hpage)) {
1725 if (PageAnon(hpage) && unlikely(split_huge_page(hpage))) { 1757 if (PageAnon(hpage) && unlikely(split_huge_page(hpage))) {
1726 pr_info("soft offline: %#lx: failed to split THP\n", 1758 pr_info("soft offline: %#lx: failed to split THP\n",
1727 pfn); 1759 pfn);
1760 if (flags & MF_COUNT_INCREASED)
1761 put_hwpoison_page(page);
1728 return -EBUSY; 1762 return -EBUSY;
1729 } 1763 }
1730 } 1764 }
@@ -1742,11 +1776,10 @@ int soft_offline_page(struct page *page, int flags)
1742 if (PageHuge(page)) { 1776 if (PageHuge(page)) {
1743 set_page_hwpoison_huge_page(hpage); 1777 set_page_hwpoison_huge_page(hpage);
1744 if (!dequeue_hwpoisoned_huge_page(hpage)) 1778 if (!dequeue_hwpoisoned_huge_page(hpage))
1745 atomic_long_add(1 << compound_order(hpage), 1779 num_poisoned_pages_add(1 << compound_order(hpage));
1746 &num_poisoned_pages);
1747 } else { 1780 } else {
1748 if (!TestSetPageHWPoison(page)) 1781 if (!TestSetPageHWPoison(page))
1749 atomic_long_inc(&num_poisoned_pages); 1782 num_poisoned_pages_inc();
1750 } 1783 }
1751 } 1784 }
1752 return ret; 1785 return ret;
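For reference, put_hwpoison_page() and the num_poisoned_pages_*() accounting helpers used in the hunks above come from elsewhere in this series. A minimal sketch of the counter wrappers, assuming they do nothing beyond centralising the existing atomic updates:

/* Sketch only; the real definitions live in include/linux/mm.h. */
extern atomic_long_t num_poisoned_pages __read_mostly;

static inline void num_poisoned_pages_inc(void)
{
	atomic_long_inc(&num_poisoned_pages);
}

static inline void num_poisoned_pages_add(long num)
{
	atomic_long_add(num, &num_poisoned_pages);
}

Centralising the updates is what lets the hunks above drop the open-coded atomic_long_inc()/atomic_long_add() calls on num_poisoned_pages.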
diff --git a/mm/memory.c b/mm/memory.c
index bb04d8f2f86c..6cd0b2160401 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2426,8 +2426,6 @@ void unmap_mapping_range(struct address_space *mapping,
2426 if (details.last_index < details.first_index) 2426 if (details.last_index < details.first_index)
2427 details.last_index = ULONG_MAX; 2427 details.last_index = ULONG_MAX;
2428 2428
2429
2430 /* DAX uses i_mmap_lock to serialise file truncate vs page fault */
2431 i_mmap_lock_write(mapping); 2429 i_mmap_lock_write(mapping);
2432 if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap))) 2430 if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap)))
2433 unmap_mapping_range_tree(&mapping->i_mmap, &details); 2431 unmap_mapping_range_tree(&mapping->i_mmap, &details);
@@ -3015,9 +3013,9 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3015 } else { 3013 } else {
3016 /* 3014 /*
3017 * The fault handler has no page to lock, so it holds 3015 * The fault handler has no page to lock, so it holds
3018 * i_mmap_lock for read to protect against truncate. 3016 * i_mmap_lock for write to protect against truncate.
3019 */ 3017 */
3020 i_mmap_unlock_read(vma->vm_file->f_mapping); 3018 i_mmap_unlock_write(vma->vm_file->f_mapping);
3021 } 3019 }
3022 goto uncharge_out; 3020 goto uncharge_out;
3023 } 3021 }
@@ -3031,9 +3029,9 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3031 } else { 3029 } else {
3032 /* 3030 /*
3033 * The fault handler has no page to lock, so it holds 3031 * The fault handler has no page to lock, so it holds
3034 * i_mmap_lock for read to protect against truncate. 3032 * i_mmap_lock for write to protect against truncate.
3035 */ 3033 */
3036 i_mmap_unlock_read(vma->vm_file->f_mapping); 3034 i_mmap_unlock_write(vma->vm_file->f_mapping);
3037 } 3035 }
3038 return ret; 3036 return ret;
3039uncharge_out: 3037uncharge_out:
@@ -3232,6 +3230,27 @@ out:
3232 return 0; 3230 return 0;
3233} 3231}
3234 3232
3233static int create_huge_pmd(struct mm_struct *mm, struct vm_area_struct *vma,
3234 unsigned long address, pmd_t *pmd, unsigned int flags)
3235{
3236 if (!vma->vm_ops)
3237 return do_huge_pmd_anonymous_page(mm, vma, address, pmd, flags);
3238 if (vma->vm_ops->pmd_fault)
3239 return vma->vm_ops->pmd_fault(vma, address, pmd, flags);
3240 return VM_FAULT_FALLBACK;
3241}
3242
3243static int wp_huge_pmd(struct mm_struct *mm, struct vm_area_struct *vma,
3244 unsigned long address, pmd_t *pmd, pmd_t orig_pmd,
3245 unsigned int flags)
3246{
3247 if (!vma->vm_ops)
3248 return do_huge_pmd_wp_page(mm, vma, address, pmd, orig_pmd);
3249 if (vma->vm_ops->pmd_fault)
3250 return vma->vm_ops->pmd_fault(vma, address, pmd, flags);
3251 return VM_FAULT_FALLBACK;
3252}
3253
3235/* 3254/*
3236 * These routines also need to handle stuff like marking pages dirty 3255 * These routines also need to handle stuff like marking pages dirty
3237 * and/or accessed for architectures that don't do it in hardware (most 3256 * and/or accessed for architectures that don't do it in hardware (most
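The new create_huge_pmd()/wp_huge_pmd() helpers route huge-PMD faults either to the anonymous-THP code or to a ->pmd_fault handler supplied by the backing file or driver. A hypothetical handler wired up for that hook could look like the following (illustrative only; example_pmd_fault and example_vm_ops are not part of this patch):

/* Hypothetical hook matching the ->pmd_fault signature dispatched above. */
static int example_pmd_fault(struct vm_area_struct *vma, unsigned long address,
			     pmd_t *pmd, unsigned int flags)
{
	/*
	 * Try to install a PMD-sized mapping here; returning
	 * VM_FAULT_FALLBACK tells the core to retry with PTE-sized faults.
	 */
	return VM_FAULT_FALLBACK;
}

static const struct vm_operations_struct example_vm_ops = {
	/* A real user would also provide .fault for the non-huge path. */
	.pmd_fault	= example_pmd_fault,
};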
@@ -3267,12 +3286,12 @@ static int handle_pte_fault(struct mm_struct *mm,
3267 barrier(); 3286 barrier();
3268 if (!pte_present(entry)) { 3287 if (!pte_present(entry)) {
3269 if (pte_none(entry)) { 3288 if (pte_none(entry)) {
3270 if (vma->vm_ops) 3289 if (vma_is_anonymous(vma))
3290 return do_anonymous_page(mm, vma, address,
3291 pte, pmd, flags);
3292 else
3271 return do_fault(mm, vma, address, pte, pmd, 3293 return do_fault(mm, vma, address, pte, pmd,
3272 flags, entry); 3294 flags, entry);
3273
3274 return do_anonymous_page(mm, vma, address, pte, pmd,
3275 flags);
3276 } 3295 }
3277 return do_swap_page(mm, vma, address, 3296 return do_swap_page(mm, vma, address,
3278 pte, pmd, flags, entry); 3297 pte, pmd, flags, entry);
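vma_is_anonymous() is introduced in include/linux/mm.h earlier in this series; assuming it is the obvious predicate, it reads:

static inline bool vma_is_anonymous(struct vm_area_struct *vma)
{
	return !vma->vm_ops;
}

which is why the !vma->vm_ops test here simply folds into the new helper; the mm/mmap.c hunks below switch their !vma->vm_file checks to the same predicate.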
@@ -3334,10 +3353,7 @@ static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3334 if (!pmd) 3353 if (!pmd)
3335 return VM_FAULT_OOM; 3354 return VM_FAULT_OOM;
3336 if (pmd_none(*pmd) && transparent_hugepage_enabled(vma)) { 3355 if (pmd_none(*pmd) && transparent_hugepage_enabled(vma)) {
3337 int ret = VM_FAULT_FALLBACK; 3356 int ret = create_huge_pmd(mm, vma, address, pmd, flags);
3338 if (!vma->vm_ops)
3339 ret = do_huge_pmd_anonymous_page(mm, vma, address,
3340 pmd, flags);
3341 if (!(ret & VM_FAULT_FALLBACK)) 3357 if (!(ret & VM_FAULT_FALLBACK))
3342 return ret; 3358 return ret;
3343 } else { 3359 } else {
@@ -3361,8 +3377,8 @@ static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3361 orig_pmd, pmd); 3377 orig_pmd, pmd);
3362 3378
3363 if (dirty && !pmd_write(orig_pmd)) { 3379 if (dirty && !pmd_write(orig_pmd)) {
3364 ret = do_huge_pmd_wp_page(mm, vma, address, pmd, 3380 ret = wp_huge_pmd(mm, vma, address, pmd,
3365 orig_pmd); 3381 orig_pmd, flags);
3366 if (!(ret & VM_FAULT_FALLBACK)) 3382 if (!(ret & VM_FAULT_FALLBACK))
3367 return ret; 3383 return ret;
3368 } else { 3384 } else {
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index a7f1e0d1d6b8..87a177917cb2 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -608,9 +608,6 @@ static int queue_pages_test_walk(unsigned long start, unsigned long end,
608 608
609 qp->prev = vma; 609 qp->prev = vma;
610 610
611 if (vma->vm_flags & VM_PFNMAP)
612 return 1;
613
614 if (flags & MPOL_MF_LAZY) { 611 if (flags & MPOL_MF_LAZY) {
615 /* Similar to task_numa_work, skip inaccessible VMAs */ 612 /* Similar to task_numa_work, skip inaccessible VMAs */
616 if (vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)) 613 if (vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))
@@ -945,7 +942,7 @@ static struct page *new_node_page(struct page *page, unsigned long node, int **x
945 return alloc_huge_page_node(page_hstate(compound_head(page)), 942 return alloc_huge_page_node(page_hstate(compound_head(page)),
946 node); 943 node);
947 else 944 else
948 return alloc_pages_exact_node(node, GFP_HIGHUSER_MOVABLE | 945 return __alloc_pages_node(node, GFP_HIGHUSER_MOVABLE |
949 __GFP_THISNODE, 0); 946 __GFP_THISNODE, 0);
950} 947}
951 948
@@ -2001,7 +1998,7 @@ retry_cpuset:
2001 nmask = policy_nodemask(gfp, pol); 1998 nmask = policy_nodemask(gfp, pol);
2002 if (!nmask || node_isset(hpage_node, *nmask)) { 1999 if (!nmask || node_isset(hpage_node, *nmask)) {
2003 mpol_cond_put(pol); 2000 mpol_cond_put(pol);
2004 page = alloc_pages_exact_node(hpage_node, 2001 page = __alloc_pages_node(hpage_node,
2005 gfp | __GFP_THISNODE, order); 2002 gfp | __GFP_THISNODE, order);
2006 goto out; 2003 goto out;
2007 } 2004 }
diff --git a/mm/mempool.c b/mm/mempool.c
index 2cc08de8b1db..4c533bc51d73 100644
--- a/mm/mempool.c
+++ b/mm/mempool.c
@@ -150,6 +150,9 @@ static void *remove_element(mempool_t *pool)
150 */ 150 */
151void mempool_destroy(mempool_t *pool) 151void mempool_destroy(mempool_t *pool)
152{ 152{
153 if (unlikely(!pool))
154 return;
155
153 while (pool->curr_nr) { 156 while (pool->curr_nr) {
154 void *element = remove_element(pool); 157 void *element = remove_element(pool);
155 pool->free(element, pool->pool_data); 158 pool->free(element, pool->pool_data);
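With the NULL check above, mempool_destroy(NULL) becomes a no-op, mirroring kfree(NULL) and letting callers drop their own guards. A hypothetical error path (example_dev and its pool field are illustrative names only):

static void example_teardown(struct example_dev *dev)
{
	/* Safe even if the pool was never allocated. */
	mempool_destroy(dev->pool);
	kfree(dev);
}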
diff --git a/mm/memtest.c b/mm/memtest.c
index 0a1cc133f6d7..8eaa4c3a5f65 100644
--- a/mm/memtest.c
+++ b/mm/memtest.c
@@ -1,11 +1,6 @@
1#include <linux/kernel.h> 1#include <linux/kernel.h>
2#include <linux/errno.h>
3#include <linux/string.h>
4#include <linux/types.h> 2#include <linux/types.h>
5#include <linux/mm.h>
6#include <linux/smp.h>
7#include <linux/init.h> 3#include <linux/init.h>
8#include <linux/pfn.h>
9#include <linux/memblock.h> 4#include <linux/memblock.h>
10 5
11static u64 patterns[] __initdata = { 6static u64 patterns[] __initdata = {
@@ -31,10 +26,8 @@ static u64 patterns[] __initdata = {
31 26
32static void __init reserve_bad_mem(u64 pattern, phys_addr_t start_bad, phys_addr_t end_bad) 27static void __init reserve_bad_mem(u64 pattern, phys_addr_t start_bad, phys_addr_t end_bad)
33{ 28{
34 printk(KERN_INFO " %016llx bad mem addr %010llx - %010llx reserved\n", 29 pr_info(" %016llx bad mem addr %pa - %pa reserved\n",
35 (unsigned long long) pattern, 30 cpu_to_be64(pattern), &start_bad, &end_bad);
36 (unsigned long long) start_bad,
37 (unsigned long long) end_bad);
38 memblock_reserve(start_bad, end_bad - start_bad); 31 memblock_reserve(start_bad, end_bad - start_bad);
39} 32}
40 33
@@ -79,26 +72,26 @@ static void __init do_one_pass(u64 pattern, phys_addr_t start, phys_addr_t end)
79 this_start = clamp(this_start, start, end); 72 this_start = clamp(this_start, start, end);
80 this_end = clamp(this_end, start, end); 73 this_end = clamp(this_end, start, end);
81 if (this_start < this_end) { 74 if (this_start < this_end) {
82 printk(KERN_INFO " %010llx - %010llx pattern %016llx\n", 75 pr_info(" %pa - %pa pattern %016llx\n",
83 (unsigned long long)this_start, 76 &this_start, &this_end, cpu_to_be64(pattern));
84 (unsigned long long)this_end,
85 (unsigned long long)cpu_to_be64(pattern));
86 memtest(pattern, this_start, this_end - this_start); 77 memtest(pattern, this_start, this_end - this_start);
87 } 78 }
88 } 79 }
89} 80}
90 81
91/* default is disabled */ 82/* default is disabled */
92static int memtest_pattern __initdata; 83static unsigned int memtest_pattern __initdata;
93 84
94static int __init parse_memtest(char *arg) 85static int __init parse_memtest(char *arg)
95{ 86{
87 int ret = 0;
88
96 if (arg) 89 if (arg)
97 memtest_pattern = simple_strtoul(arg, NULL, 0); 90 ret = kstrtouint(arg, 0, &memtest_pattern);
98 else 91 else
99 memtest_pattern = ARRAY_SIZE(patterns); 92 memtest_pattern = ARRAY_SIZE(patterns);
100 93
101 return 0; 94 return ret;
102} 95}
103 96
104early_param("memtest", parse_memtest); 97early_param("memtest", parse_memtest);
@@ -111,7 +104,7 @@ void __init early_memtest(phys_addr_t start, phys_addr_t end)
111 if (!memtest_pattern) 104 if (!memtest_pattern)
112 return; 105 return;
113 106
114 printk(KERN_INFO "early_memtest: # of tests: %d\n", memtest_pattern); 107 pr_info("early_memtest: # of tests: %u\n", memtest_pattern);
115 for (i = memtest_pattern-1; i < UINT_MAX; --i) { 108 for (i = memtest_pattern-1; i < UINT_MAX; --i) {
116 idx = i % ARRAY_SIZE(patterns); 109 idx = i % ARRAY_SIZE(patterns);
117 do_one_pass(patterns[idx], start, end); 110 do_one_pass(patterns[idx], start, end);
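Switching parse_memtest() to kstrtouint() also tightens the parsing: unlike simple_strtoul(), it rejects malformed values, and the error is now propagated through early_param handling instead of a truncated number being used silently. Roughly:

unsigned int val;

kstrtouint("4", 0, &val);	/* returns 0, val == 4: run four patterns */
kstrtouint("4k", 0, &val);	/* returns -EINVAL, val is left untouched */

So "memtest=4" on the kernel command line still selects four test patterns, while garbage after the number now fails the parse.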
diff --git a/mm/migrate.c b/mm/migrate.c
index 5c08cab5419e..02ce25df16c2 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -880,8 +880,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
880 /* Establish migration ptes or remove ptes */ 880 /* Establish migration ptes or remove ptes */
881 if (page_mapped(page)) { 881 if (page_mapped(page)) {
882 try_to_unmap(page, 882 try_to_unmap(page,
883 TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS| 883 TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS);
884 TTU_IGNORE_HWPOISON);
885 page_was_mapped = 1; 884 page_was_mapped = 1;
886 } 885 }
887 886
@@ -952,9 +951,11 @@ out:
952 dec_zone_page_state(page, NR_ISOLATED_ANON + 951 dec_zone_page_state(page, NR_ISOLATED_ANON +
953 page_is_file_cache(page)); 952 page_is_file_cache(page));
954 /* Soft-offlined page shouldn't go through lru cache list */ 953 /* Soft-offlined page shouldn't go through lru cache list */
955 if (reason == MR_MEMORY_FAILURE) 954 if (reason == MR_MEMORY_FAILURE) {
956 put_page(page); 955 put_page(page);
957 else 956 if (!test_set_page_hwpoison(page))
957 num_poisoned_pages_inc();
958 } else
958 putback_lru_page(page); 959 putback_lru_page(page);
959 } 960 }
960 961
@@ -1194,7 +1195,7 @@ static struct page *new_page_node(struct page *p, unsigned long private,
1194 return alloc_huge_page_node(page_hstate(compound_head(p)), 1195 return alloc_huge_page_node(page_hstate(compound_head(p)),
1195 pm->node); 1196 pm->node);
1196 else 1197 else
1197 return alloc_pages_exact_node(pm->node, 1198 return __alloc_pages_node(pm->node,
1198 GFP_HIGHUSER_MOVABLE | __GFP_THISNODE, 0); 1199 GFP_HIGHUSER_MOVABLE | __GFP_THISNODE, 0);
1199} 1200}
1200 1201
@@ -1554,7 +1555,7 @@ static struct page *alloc_misplaced_dst_page(struct page *page,
1554 int nid = (int) data; 1555 int nid = (int) data;
1555 struct page *newpage; 1556 struct page *newpage;
1556 1557
1557 newpage = alloc_pages_exact_node(nid, 1558 newpage = __alloc_pages_node(nid,
1558 (GFP_HIGHUSER_MOVABLE | 1559 (GFP_HIGHUSER_MOVABLE |
1559 __GFP_THISNODE | __GFP_NOMEMALLOC | 1560 __GFP_THISNODE | __GFP_NOMEMALLOC |
1560 __GFP_NORETRY | __GFP_NOWARN) & 1561 __GFP_NORETRY | __GFP_NOWARN) &
diff --git a/mm/mmap.c b/mm/mmap.c
index 82db4fc0a9d3..b6be3249f0a9 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -2455,7 +2455,7 @@ static int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
2455 unsigned long addr, int new_below) 2455 unsigned long addr, int new_below)
2456{ 2456{
2457 struct vm_area_struct *new; 2457 struct vm_area_struct *new;
2458 int err = -ENOMEM; 2458 int err;
2459 2459
2460 if (is_vm_hugetlb_page(vma) && (addr & 2460 if (is_vm_hugetlb_page(vma) && (addr &
2461 ~(huge_page_mask(hstate_vma(vma))))) 2461 ~(huge_page_mask(hstate_vma(vma)))))
@@ -2463,7 +2463,7 @@ static int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
2463 2463
2464 new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); 2464 new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
2465 if (!new) 2465 if (!new)
2466 goto out_err; 2466 return -ENOMEM;
2467 2467
2468 /* most fields are the same, copy all, and then fixup */ 2468 /* most fields are the same, copy all, and then fixup */
2469 *new = *vma; 2469 *new = *vma;
@@ -2511,7 +2511,6 @@ static int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
2511 mpol_put(vma_policy(new)); 2511 mpol_put(vma_policy(new));
2512 out_free_vma: 2512 out_free_vma:
2513 kmem_cache_free(vm_area_cachep, new); 2513 kmem_cache_free(vm_area_cachep, new);
2514 out_err:
2515 return err; 2514 return err;
2516} 2515}
2517 2516
@@ -2872,6 +2871,13 @@ int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma)
2872 struct vm_area_struct *prev; 2871 struct vm_area_struct *prev;
2873 struct rb_node **rb_link, *rb_parent; 2872 struct rb_node **rb_link, *rb_parent;
2874 2873
2874 if (find_vma_links(mm, vma->vm_start, vma->vm_end,
2875 &prev, &rb_link, &rb_parent))
2876 return -ENOMEM;
2877 if ((vma->vm_flags & VM_ACCOUNT) &&
2878 security_vm_enough_memory_mm(mm, vma_pages(vma)))
2879 return -ENOMEM;
2880
2875 /* 2881 /*
2876 * The vm_pgoff of a purely anonymous vma should be irrelevant 2882 * The vm_pgoff of a purely anonymous vma should be irrelevant
2877 * until its first write fault, when page's anon_vma and index 2883 * until its first write fault, when page's anon_vma and index
@@ -2884,16 +2890,10 @@ int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma)
2884 * using the existing file pgoff checks and manipulations. 2890 * using the existing file pgoff checks and manipulations.
2885 * Similarly in do_mmap_pgoff and in do_brk. 2891 * Similarly in do_mmap_pgoff and in do_brk.
2886 */ 2892 */
2887 if (!vma->vm_file) { 2893 if (vma_is_anonymous(vma)) {
2888 BUG_ON(vma->anon_vma); 2894 BUG_ON(vma->anon_vma);
2889 vma->vm_pgoff = vma->vm_start >> PAGE_SHIFT; 2895 vma->vm_pgoff = vma->vm_start >> PAGE_SHIFT;
2890 } 2896 }
2891 if (find_vma_links(mm, vma->vm_start, vma->vm_end,
2892 &prev, &rb_link, &rb_parent))
2893 return -ENOMEM;
2894 if ((vma->vm_flags & VM_ACCOUNT) &&
2895 security_vm_enough_memory_mm(mm, vma_pages(vma)))
2896 return -ENOMEM;
2897 2897
2898 vma_link(mm, vma, prev, rb_link, rb_parent); 2898 vma_link(mm, vma, prev, rb_link, rb_parent);
2899 return 0; 2899 return 0;
@@ -2918,7 +2918,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
2918 * If anonymous vma has not yet been faulted, update new pgoff 2918 * If anonymous vma has not yet been faulted, update new pgoff
2919 * to match new location, to increase its chance of merging. 2919 * to match new location, to increase its chance of merging.
2920 */ 2920 */
2921 if (unlikely(!vma->vm_file && !vma->anon_vma)) { 2921 if (unlikely(vma_is_anonymous(vma) && !vma->anon_vma)) {
2922 pgoff = addr >> PAGE_SHIFT; 2922 pgoff = addr >> PAGE_SHIFT;
2923 faulted_in_anon_vma = false; 2923 faulted_in_anon_vma = false;
2924 } 2924 }
@@ -2952,30 +2952,31 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
2952 *need_rmap_locks = (new_vma->vm_pgoff <= vma->vm_pgoff); 2952 *need_rmap_locks = (new_vma->vm_pgoff <= vma->vm_pgoff);
2953 } else { 2953 } else {
2954 new_vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); 2954 new_vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
2955 if (new_vma) { 2955 if (!new_vma)
2956 *new_vma = *vma; 2956 goto out;
2957 new_vma->vm_start = addr; 2957 *new_vma = *vma;
2958 new_vma->vm_end = addr + len; 2958 new_vma->vm_start = addr;
2959 new_vma->vm_pgoff = pgoff; 2959 new_vma->vm_end = addr + len;
2960 if (vma_dup_policy(vma, new_vma)) 2960 new_vma->vm_pgoff = pgoff;
2961 goto out_free_vma; 2961 if (vma_dup_policy(vma, new_vma))
2962 INIT_LIST_HEAD(&new_vma->anon_vma_chain); 2962 goto out_free_vma;
2963 if (anon_vma_clone(new_vma, vma)) 2963 INIT_LIST_HEAD(&new_vma->anon_vma_chain);
2964 goto out_free_mempol; 2964 if (anon_vma_clone(new_vma, vma))
2965 if (new_vma->vm_file) 2965 goto out_free_mempol;
2966 get_file(new_vma->vm_file); 2966 if (new_vma->vm_file)
2967 if (new_vma->vm_ops && new_vma->vm_ops->open) 2967 get_file(new_vma->vm_file);
2968 new_vma->vm_ops->open(new_vma); 2968 if (new_vma->vm_ops && new_vma->vm_ops->open)
2969 vma_link(mm, new_vma, prev, rb_link, rb_parent); 2969 new_vma->vm_ops->open(new_vma);
2970 *need_rmap_locks = false; 2970 vma_link(mm, new_vma, prev, rb_link, rb_parent);
2971 } 2971 *need_rmap_locks = false;
2972 } 2972 }
2973 return new_vma; 2973 return new_vma;
2974 2974
2975 out_free_mempol: 2975out_free_mempol:
2976 mpol_put(vma_policy(new_vma)); 2976 mpol_put(vma_policy(new_vma));
2977 out_free_vma: 2977out_free_vma:
2978 kmem_cache_free(vm_area_cachep, new_vma); 2978 kmem_cache_free(vm_area_cachep, new_vma);
2979out:
2979 return NULL; 2980 return NULL;
2980} 2981}
2981 2982
@@ -3027,21 +3028,13 @@ static int special_mapping_fault(struct vm_area_struct *vma,
3027 pgoff_t pgoff; 3028 pgoff_t pgoff;
3028 struct page **pages; 3029 struct page **pages;
3029 3030
3030 /*
3031 * special mappings have no vm_file, and in that case, the mm
3032 * uses vm_pgoff internally. So we have to subtract it from here.
3033 * We are allowed to do this because we are the mm; do not copy
3034 * this code into drivers!
3035 */
3036 pgoff = vmf->pgoff - vma->vm_pgoff;
3037
3038 if (vma->vm_ops == &legacy_special_mapping_vmops) 3031 if (vma->vm_ops == &legacy_special_mapping_vmops)
3039 pages = vma->vm_private_data; 3032 pages = vma->vm_private_data;
3040 else 3033 else
3041 pages = ((struct vm_special_mapping *)vma->vm_private_data)-> 3034 pages = ((struct vm_special_mapping *)vma->vm_private_data)->
3042 pages; 3035 pages;
3043 3036
3044 for (; pgoff && *pages; ++pages) 3037 for (pgoff = vmf->pgoff; pgoff && *pages; ++pages)
3045 pgoff--; 3038 pgoff--;
3046 3039
3047 if (*pages) { 3040 if (*pages) {
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index dff991e0681e..1ecc0bcaecc5 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -196,27 +196,26 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
196 * Determine the type of allocation constraint. 196 * Determine the type of allocation constraint.
197 */ 197 */
198#ifdef CONFIG_NUMA 198#ifdef CONFIG_NUMA
199static enum oom_constraint constrained_alloc(struct zonelist *zonelist, 199static enum oom_constraint constrained_alloc(struct oom_control *oc,
200 gfp_t gfp_mask, nodemask_t *nodemask, 200 unsigned long *totalpages)
201 unsigned long *totalpages)
202{ 201{
203 struct zone *zone; 202 struct zone *zone;
204 struct zoneref *z; 203 struct zoneref *z;
205 enum zone_type high_zoneidx = gfp_zone(gfp_mask); 204 enum zone_type high_zoneidx = gfp_zone(oc->gfp_mask);
206 bool cpuset_limited = false; 205 bool cpuset_limited = false;
207 int nid; 206 int nid;
208 207
209 /* Default to all available memory */ 208 /* Default to all available memory */
210 *totalpages = totalram_pages + total_swap_pages; 209 *totalpages = totalram_pages + total_swap_pages;
211 210
212 if (!zonelist) 211 if (!oc->zonelist)
213 return CONSTRAINT_NONE; 212 return CONSTRAINT_NONE;
214 /* 213 /*
215 * Reach here only when __GFP_NOFAIL is used. So, we should avoid 214 * Reach here only when __GFP_NOFAIL is used. So, we should avoid
216 * to kill current.We have to random task kill in this case. 215 * to kill current.We have to random task kill in this case.
217 * Hopefully, CONSTRAINT_THISNODE...but no way to handle it, now. 216 * Hopefully, CONSTRAINT_THISNODE...but no way to handle it, now.
218 */ 217 */
219 if (gfp_mask & __GFP_THISNODE) 218 if (oc->gfp_mask & __GFP_THISNODE)
220 return CONSTRAINT_NONE; 219 return CONSTRAINT_NONE;
221 220
222 /* 221 /*
@@ -224,17 +223,18 @@ static enum oom_constraint constrained_alloc(struct zonelist *zonelist,
224 * the page allocator means a mempolicy is in effect. Cpuset policy 223 * the page allocator means a mempolicy is in effect. Cpuset policy
225 * is enforced in get_page_from_freelist(). 224 * is enforced in get_page_from_freelist().
226 */ 225 */
227 if (nodemask && !nodes_subset(node_states[N_MEMORY], *nodemask)) { 226 if (oc->nodemask &&
227 !nodes_subset(node_states[N_MEMORY], *oc->nodemask)) {
228 *totalpages = total_swap_pages; 228 *totalpages = total_swap_pages;
229 for_each_node_mask(nid, *nodemask) 229 for_each_node_mask(nid, *oc->nodemask)
230 *totalpages += node_spanned_pages(nid); 230 *totalpages += node_spanned_pages(nid);
231 return CONSTRAINT_MEMORY_POLICY; 231 return CONSTRAINT_MEMORY_POLICY;
232 } 232 }
233 233
234 /* Check this allocation failure is caused by cpuset's wall function */ 234 /* Check this allocation failure is caused by cpuset's wall function */
235 for_each_zone_zonelist_nodemask(zone, z, zonelist, 235 for_each_zone_zonelist_nodemask(zone, z, oc->zonelist,
236 high_zoneidx, nodemask) 236 high_zoneidx, oc->nodemask)
237 if (!cpuset_zone_allowed(zone, gfp_mask)) 237 if (!cpuset_zone_allowed(zone, oc->gfp_mask))
238 cpuset_limited = true; 238 cpuset_limited = true;
239 239
240 if (cpuset_limited) { 240 if (cpuset_limited) {
@@ -246,20 +246,18 @@ static enum oom_constraint constrained_alloc(struct zonelist *zonelist,
246 return CONSTRAINT_NONE; 246 return CONSTRAINT_NONE;
247} 247}
248#else 248#else
249static enum oom_constraint constrained_alloc(struct zonelist *zonelist, 249static enum oom_constraint constrained_alloc(struct oom_control *oc,
250 gfp_t gfp_mask, nodemask_t *nodemask, 250 unsigned long *totalpages)
251 unsigned long *totalpages)
252{ 251{
253 *totalpages = totalram_pages + total_swap_pages; 252 *totalpages = totalram_pages + total_swap_pages;
254 return CONSTRAINT_NONE; 253 return CONSTRAINT_NONE;
255} 254}
256#endif 255#endif
257 256
258enum oom_scan_t oom_scan_process_thread(struct task_struct *task, 257enum oom_scan_t oom_scan_process_thread(struct oom_control *oc,
259 unsigned long totalpages, const nodemask_t *nodemask, 258 struct task_struct *task, unsigned long totalpages)
260 bool force_kill)
261{ 259{
262 if (oom_unkillable_task(task, NULL, nodemask)) 260 if (oom_unkillable_task(task, NULL, oc->nodemask))
263 return OOM_SCAN_CONTINUE; 261 return OOM_SCAN_CONTINUE;
264 262
265 /* 263 /*
@@ -267,7 +265,7 @@ enum oom_scan_t oom_scan_process_thread(struct task_struct *task,
267 * Don't allow any other task to have access to the reserves. 265 * Don't allow any other task to have access to the reserves.
268 */ 266 */
269 if (test_tsk_thread_flag(task, TIF_MEMDIE)) { 267 if (test_tsk_thread_flag(task, TIF_MEMDIE)) {
270 if (!force_kill) 268 if (oc->order != -1)
271 return OOM_SCAN_ABORT; 269 return OOM_SCAN_ABORT;
272 } 270 }
273 if (!task->mm) 271 if (!task->mm)
@@ -280,7 +278,7 @@ enum oom_scan_t oom_scan_process_thread(struct task_struct *task,
280 if (oom_task_origin(task)) 278 if (oom_task_origin(task))
281 return OOM_SCAN_SELECT; 279 return OOM_SCAN_SELECT;
282 280
283 if (task_will_free_mem(task) && !force_kill) 281 if (task_will_free_mem(task) && oc->order != -1)
284 return OOM_SCAN_ABORT; 282 return OOM_SCAN_ABORT;
285 283
286 return OOM_SCAN_OK; 284 return OOM_SCAN_OK;
@@ -289,12 +287,9 @@ enum oom_scan_t oom_scan_process_thread(struct task_struct *task,
289/* 287/*
290 * Simple selection loop. We chose the process with the highest 288 * Simple selection loop. We chose the process with the highest
291 * number of 'points'. Returns -1 on scan abort. 289 * number of 'points'. Returns -1 on scan abort.
292 *
293 * (not docbooked, we don't want this one cluttering up the manual)
294 */ 290 */
295static struct task_struct *select_bad_process(unsigned int *ppoints, 291static struct task_struct *select_bad_process(struct oom_control *oc,
296 unsigned long totalpages, const nodemask_t *nodemask, 292 unsigned int *ppoints, unsigned long totalpages)
297 bool force_kill)
298{ 293{
299 struct task_struct *g, *p; 294 struct task_struct *g, *p;
300 struct task_struct *chosen = NULL; 295 struct task_struct *chosen = NULL;
@@ -304,8 +299,7 @@ static struct task_struct *select_bad_process(unsigned int *ppoints,
304 for_each_process_thread(g, p) { 299 for_each_process_thread(g, p) {
305 unsigned int points; 300 unsigned int points;
306 301
307 switch (oom_scan_process_thread(p, totalpages, nodemask, 302 switch (oom_scan_process_thread(oc, p, totalpages)) {
308 force_kill)) {
309 case OOM_SCAN_SELECT: 303 case OOM_SCAN_SELECT:
310 chosen = p; 304 chosen = p;
311 chosen_points = ULONG_MAX; 305 chosen_points = ULONG_MAX;
@@ -318,7 +312,7 @@ static struct task_struct *select_bad_process(unsigned int *ppoints,
318 case OOM_SCAN_OK: 312 case OOM_SCAN_OK:
319 break; 313 break;
320 }; 314 };
321 points = oom_badness(p, NULL, nodemask, totalpages); 315 points = oom_badness(p, NULL, oc->nodemask, totalpages);
322 if (!points || points < chosen_points) 316 if (!points || points < chosen_points)
323 continue; 317 continue;
324 /* Prefer thread group leaders for display purposes */ 318 /* Prefer thread group leaders for display purposes */
@@ -380,13 +374,13 @@ static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask)
380 rcu_read_unlock(); 374 rcu_read_unlock();
381} 375}
382 376
383static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order, 377static void dump_header(struct oom_control *oc, struct task_struct *p,
384 struct mem_cgroup *memcg, const nodemask_t *nodemask) 378 struct mem_cgroup *memcg)
385{ 379{
386 task_lock(current); 380 task_lock(current);
387 pr_warning("%s invoked oom-killer: gfp_mask=0x%x, order=%d, " 381 pr_warning("%s invoked oom-killer: gfp_mask=0x%x, order=%d, "
388 "oom_score_adj=%hd\n", 382 "oom_score_adj=%hd\n",
389 current->comm, gfp_mask, order, 383 current->comm, oc->gfp_mask, oc->order,
390 current->signal->oom_score_adj); 384 current->signal->oom_score_adj);
391 cpuset_print_task_mems_allowed(current); 385 cpuset_print_task_mems_allowed(current);
392 task_unlock(current); 386 task_unlock(current);
@@ -396,7 +390,7 @@ static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order,
396 else 390 else
397 show_mem(SHOW_MEM_FILTER_NODES); 391 show_mem(SHOW_MEM_FILTER_NODES);
398 if (sysctl_oom_dump_tasks) 392 if (sysctl_oom_dump_tasks)
399 dump_tasks(memcg, nodemask); 393 dump_tasks(memcg, oc->nodemask);
400} 394}
401 395
402/* 396/*
@@ -487,10 +481,9 @@ void oom_killer_enable(void)
487 * Must be called while holding a reference to p, which will be released upon 481 * Must be called while holding a reference to p, which will be released upon
488 * returning. 482 * returning.
489 */ 483 */
490void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, 484void oom_kill_process(struct oom_control *oc, struct task_struct *p,
491 unsigned int points, unsigned long totalpages, 485 unsigned int points, unsigned long totalpages,
492 struct mem_cgroup *memcg, nodemask_t *nodemask, 486 struct mem_cgroup *memcg, const char *message)
493 const char *message)
494{ 487{
495 struct task_struct *victim = p; 488 struct task_struct *victim = p;
496 struct task_struct *child; 489 struct task_struct *child;
@@ -514,7 +507,7 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
514 task_unlock(p); 507 task_unlock(p);
515 508
516 if (__ratelimit(&oom_rs)) 509 if (__ratelimit(&oom_rs))
517 dump_header(p, gfp_mask, order, memcg, nodemask); 510 dump_header(oc, p, memcg);
518 511
519 task_lock(p); 512 task_lock(p);
520 pr_err("%s: Kill process %d (%s) score %u or sacrifice child\n", 513 pr_err("%s: Kill process %d (%s) score %u or sacrifice child\n",
@@ -537,7 +530,7 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
537 /* 530 /*
538 * oom_badness() returns 0 if the thread is unkillable 531 * oom_badness() returns 0 if the thread is unkillable
539 */ 532 */
540 child_points = oom_badness(child, memcg, nodemask, 533 child_points = oom_badness(child, memcg, oc->nodemask,
541 totalpages); 534 totalpages);
542 if (child_points > victim_points) { 535 if (child_points > victim_points) {
543 put_task_struct(victim); 536 put_task_struct(victim);
@@ -600,8 +593,7 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
600/* 593/*
601 * Determines whether the kernel must panic because of the panic_on_oom sysctl. 594 * Determines whether the kernel must panic because of the panic_on_oom sysctl.
602 */ 595 */
603void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask, 596void check_panic_on_oom(struct oom_control *oc, enum oom_constraint constraint,
604 int order, const nodemask_t *nodemask,
605 struct mem_cgroup *memcg) 597 struct mem_cgroup *memcg)
606{ 598{
607 if (likely(!sysctl_panic_on_oom)) 599 if (likely(!sysctl_panic_on_oom))
@@ -615,7 +607,10 @@ void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask,
615 if (constraint != CONSTRAINT_NONE) 607 if (constraint != CONSTRAINT_NONE)
616 return; 608 return;
617 } 609 }
618 dump_header(NULL, gfp_mask, order, memcg, nodemask); 610 /* Do not panic for oom kills triggered by sysrq */
611 if (oc->order == -1)
612 return;
613 dump_header(oc, NULL, memcg);
619 panic("Out of memory: %s panic_on_oom is enabled\n", 614 panic("Out of memory: %s panic_on_oom is enabled\n",
620 sysctl_panic_on_oom == 2 ? "compulsory" : "system-wide"); 615 sysctl_panic_on_oom == 2 ? "compulsory" : "system-wide");
621} 616}
@@ -635,28 +630,21 @@ int unregister_oom_notifier(struct notifier_block *nb)
635EXPORT_SYMBOL_GPL(unregister_oom_notifier); 630EXPORT_SYMBOL_GPL(unregister_oom_notifier);
636 631
637/** 632/**
638 * __out_of_memory - kill the "best" process when we run out of memory 633 * out_of_memory - kill the "best" process when we run out of memory
639 * @zonelist: zonelist pointer 634 * @oc: pointer to struct oom_control
640 * @gfp_mask: memory allocation flags
641 * @order: amount of memory being requested as a power of 2
642 * @nodemask: nodemask passed to page allocator
643 * @force_kill: true if a task must be killed, even if others are exiting
644 * 635 *
645 * If we run out of memory, we have the choice between either 636 * If we run out of memory, we have the choice between either
646 * killing a random task (bad), letting the system crash (worse) 637 * killing a random task (bad), letting the system crash (worse)
647 * OR try to be smart about which process to kill. Note that we 638 * OR try to be smart about which process to kill. Note that we
648 * don't have to be perfect here, we just have to be good. 639 * don't have to be perfect here, we just have to be good.
649 */ 640 */
650bool out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, 641bool out_of_memory(struct oom_control *oc)
651 int order, nodemask_t *nodemask, bool force_kill)
652{ 642{
653 const nodemask_t *mpol_mask;
654 struct task_struct *p; 643 struct task_struct *p;
655 unsigned long totalpages; 644 unsigned long totalpages;
656 unsigned long freed = 0; 645 unsigned long freed = 0;
657 unsigned int uninitialized_var(points); 646 unsigned int uninitialized_var(points);
658 enum oom_constraint constraint = CONSTRAINT_NONE; 647 enum oom_constraint constraint = CONSTRAINT_NONE;
659 int killed = 0;
660 648
661 if (oom_killer_disabled) 649 if (oom_killer_disabled)
662 return false; 650 return false;
@@ -664,7 +652,7 @@ bool out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
664 blocking_notifier_call_chain(&oom_notify_list, 0, &freed); 652 blocking_notifier_call_chain(&oom_notify_list, 0, &freed);
665 if (freed > 0) 653 if (freed > 0)
666 /* Got some memory back in the last second. */ 654 /* Got some memory back in the last second. */
667 goto out; 655 return true;
668 656
669 /* 657 /*
670 * If current has a pending SIGKILL or is exiting, then automatically 658 * If current has a pending SIGKILL or is exiting, then automatically
@@ -677,47 +665,42 @@ bool out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
677 if (current->mm && 665 if (current->mm &&
678 (fatal_signal_pending(current) || task_will_free_mem(current))) { 666 (fatal_signal_pending(current) || task_will_free_mem(current))) {
679 mark_oom_victim(current); 667 mark_oom_victim(current);
680 goto out; 668 return true;
681 } 669 }
682 670
683 /* 671 /*
684 * Check if there were limitations on the allocation (only relevant for 672 * Check if there were limitations on the allocation (only relevant for
685 * NUMA) that may require different handling. 673 * NUMA) that may require different handling.
686 */ 674 */
687 constraint = constrained_alloc(zonelist, gfp_mask, nodemask, 675 constraint = constrained_alloc(oc, &totalpages);
688 &totalpages); 676 if (constraint != CONSTRAINT_MEMORY_POLICY)
689 mpol_mask = (constraint == CONSTRAINT_MEMORY_POLICY) ? nodemask : NULL; 677 oc->nodemask = NULL;
690 check_panic_on_oom(constraint, gfp_mask, order, mpol_mask, NULL); 678 check_panic_on_oom(oc, constraint, NULL);
691 679
692 if (sysctl_oom_kill_allocating_task && current->mm && 680 if (sysctl_oom_kill_allocating_task && current->mm &&
693 !oom_unkillable_task(current, NULL, nodemask) && 681 !oom_unkillable_task(current, NULL, oc->nodemask) &&
694 current->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) { 682 current->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) {
695 get_task_struct(current); 683 get_task_struct(current);
696 oom_kill_process(current, gfp_mask, order, 0, totalpages, NULL, 684 oom_kill_process(oc, current, 0, totalpages, NULL,
697 nodemask,
698 "Out of memory (oom_kill_allocating_task)"); 685 "Out of memory (oom_kill_allocating_task)");
699 goto out; 686 return true;
700 } 687 }
701 688
702 p = select_bad_process(&points, totalpages, mpol_mask, force_kill); 689 p = select_bad_process(oc, &points, totalpages);
703 /* Found nothing?!?! Either we hang forever, or we panic. */ 690 /* Found nothing?!?! Either we hang forever, or we panic. */
704 if (!p) { 691 if (!p && oc->order != -1) {
705 dump_header(NULL, gfp_mask, order, NULL, mpol_mask); 692 dump_header(oc, NULL, NULL);
706 panic("Out of memory and no killable processes...\n"); 693 panic("Out of memory and no killable processes...\n");
707 } 694 }
708 if (p != (void *)-1UL) { 695 if (p && p != (void *)-1UL) {
709 oom_kill_process(p, gfp_mask, order, points, totalpages, NULL, 696 oom_kill_process(oc, p, points, totalpages, NULL,
710 nodemask, "Out of memory"); 697 "Out of memory");
711 killed = 1; 698 /*
712 } 699 * Give the killed process a good chance to exit before trying
713out: 700 * to allocate memory again.
714 /* 701 */
715 * Give the killed threads a good chance of exiting before trying to
716 * allocate memory again.
717 */
718 if (killed)
719 schedule_timeout_killable(1); 702 schedule_timeout_killable(1);
720 703 }
721 return true; 704 return true;
722} 705}
723 706
@@ -728,13 +711,20 @@ out:
728 */ 711 */
729void pagefault_out_of_memory(void) 712void pagefault_out_of_memory(void)
730{ 713{
714 struct oom_control oc = {
715 .zonelist = NULL,
716 .nodemask = NULL,
717 .gfp_mask = 0,
718 .order = 0,
719 };
720
731 if (mem_cgroup_oom_synchronize(true)) 721 if (mem_cgroup_oom_synchronize(true))
732 return; 722 return;
733 723
734 if (!mutex_trylock(&oom_lock)) 724 if (!mutex_trylock(&oom_lock))
735 return; 725 return;
736 726
737 if (!out_of_memory(NULL, 0, 0, NULL, false)) { 727 if (!out_of_memory(&oc)) {
738 /* 728 /*
739 * There shouldn't be any user tasks runnable while the 729 * There shouldn't be any user tasks runnable while the
740 * OOM killer is disabled, so the current task has to 730 * OOM killer is disabled, so the current task has to
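The refactor above threads a single struct oom_control through the OOM killer instead of passing zonelist, gfp_mask, order, nodemask and force_kill separately. Assuming the declaration added to include/linux/oom.h in this series, it is essentially:

struct oom_control {
	/* Used to determine the degree of memory/node/cpuset constraint. */
	struct zonelist *zonelist;
	nodemask_t	*nodemask;
	gfp_t		 gfp_mask;
	/*
	 * order == -1 marks a forced (sysrq-style) kill, which is what the
	 * "oc->order != -1" tests above use in place of the old force_kill
	 * flag, and why check_panic_on_oom() skips panicking for it.
	 */
	int		 order;
};

pagefault_out_of_memory() above and __alloc_pages_may_oom() in the mm/page_alloc.c hunks below simply fill one of these in at the call site.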
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index b401d40cb4fd..48aaf7b9f253 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -125,6 +125,24 @@ unsigned long dirty_balance_reserve __read_mostly;
125int percpu_pagelist_fraction; 125int percpu_pagelist_fraction;
126gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK; 126gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK;
127 127
128/*
129 * A cached value of the page's pageblock's migratetype, used when the page is
130 * put on a pcplist. Used to avoid the pageblock migratetype lookup when
131 * freeing from pcplists in most cases, at the cost of possibly becoming stale.
132 * Also the migratetype set in the page does not necessarily match the pcplist
133 * index, e.g. page might have MIGRATE_CMA set but be on a pcplist with any
134 * other index - this ensures that it will be put on the correct CMA freelist.
135 */
136static inline int get_pcppage_migratetype(struct page *page)
137{
138 return page->index;
139}
140
141static inline void set_pcppage_migratetype(struct page *page, int migratetype)
142{
143 page->index = migratetype;
144}
145
128#ifdef CONFIG_PM_SLEEP 146#ifdef CONFIG_PM_SLEEP
129/* 147/*
130 * The following functions are used by the suspend/hibernate code to temporarily 148 * The following functions are used by the suspend/hibernate code to temporarily
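The cached value rides in page->index, which is otherwise unused while a page sits on a pcplist. Staleness only matters when a pageblock gets isolated after the value was cached, which is why the free path re-reads the pageblock migratetype in that case; conversely, caching MIGRATE_CMA is what routes such pages back onto the CMA freelist regardless of which pcplist index they sat on. In sketch form, the contract the later hunks rely on is:

/* Allocation side, e.g. __rmqueue_smallest(): remember the source list. */
set_pcppage_migratetype(page, migratetype);

/*
 * Free side, e.g. free_pcppages_bulk(): trust the cache unless a pageblock
 * may have been isolated in the meantime, in which case re-read it.
 */
mt = get_pcppage_migratetype(page);
if (unlikely(has_isolate_pageblock(zone)))
	mt = get_pageblock_migratetype(page);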
@@ -791,7 +809,11 @@ static void free_pcppages_bulk(struct zone *zone, int count,
791 page = list_entry(list->prev, struct page, lru); 809 page = list_entry(list->prev, struct page, lru);
792 /* must delete as __free_one_page list manipulates */ 810 /* must delete as __free_one_page list manipulates */
793 list_del(&page->lru); 811 list_del(&page->lru);
794 mt = get_freepage_migratetype(page); 812
813 mt = get_pcppage_migratetype(page);
814 /* MIGRATE_ISOLATE page should not go to pcplists */
815 VM_BUG_ON_PAGE(is_migrate_isolate(mt), page);
816 /* Pageblock could have been isolated meanwhile */
795 if (unlikely(has_isolate_pageblock(zone))) 817 if (unlikely(has_isolate_pageblock(zone)))
796 mt = get_pageblock_migratetype(page); 818 mt = get_pageblock_migratetype(page);
797 819
@@ -955,7 +977,6 @@ static void __free_pages_ok(struct page *page, unsigned int order)
955 migratetype = get_pfnblock_migratetype(page, pfn); 977 migratetype = get_pfnblock_migratetype(page, pfn);
956 local_irq_save(flags); 978 local_irq_save(flags);
957 __count_vm_events(PGFREE, 1 << order); 979 __count_vm_events(PGFREE, 1 << order);
958 set_freepage_migratetype(page, migratetype);
959 free_one_page(page_zone(page), page, pfn, order, migratetype); 980 free_one_page(page_zone(page), page, pfn, order, migratetype);
960 local_irq_restore(flags); 981 local_irq_restore(flags);
961} 982}
@@ -1383,7 +1404,7 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
1383 rmv_page_order(page); 1404 rmv_page_order(page);
1384 area->nr_free--; 1405 area->nr_free--;
1385 expand(zone, page, order, current_order, area, migratetype); 1406 expand(zone, page, order, current_order, area, migratetype);
1386 set_freepage_migratetype(page, migratetype); 1407 set_pcppage_migratetype(page, migratetype);
1387 return page; 1408 return page;
1388 } 1409 }
1389 1410
@@ -1460,7 +1481,6 @@ int move_freepages(struct zone *zone,
1460 order = page_order(page); 1481 order = page_order(page);
1461 list_move(&page->lru, 1482 list_move(&page->lru,
1462 &zone->free_area[order].free_list[migratetype]); 1483 &zone->free_area[order].free_list[migratetype]);
1463 set_freepage_migratetype(page, migratetype);
1464 page += 1 << order; 1484 page += 1 << order;
1465 pages_moved += 1 << order; 1485 pages_moved += 1 << order;
1466 } 1486 }
@@ -1630,14 +1650,13 @@ __rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype)
1630 expand(zone, page, order, current_order, area, 1650 expand(zone, page, order, current_order, area,
1631 start_migratetype); 1651 start_migratetype);
1632 /* 1652 /*
1633 * The freepage_migratetype may differ from pageblock's 1653 * The pcppage_migratetype may differ from pageblock's
1634 * migratetype depending on the decisions in 1654 * migratetype depending on the decisions in
1635 * try_to_steal_freepages(). This is OK as long as it 1655 * find_suitable_fallback(). This is OK as long as it does not
1636 * does not differ for MIGRATE_CMA pageblocks. For CMA 1656 * differ for MIGRATE_CMA pageblocks. Those can be used as
1637 * we need to make sure unallocated pages flushed from 1657 * fallback only via special __rmqueue_cma_fallback() function
1638 * pcp lists are returned to the correct freelist.
1639 */ 1658 */
1640 set_freepage_migratetype(page, start_migratetype); 1659 set_pcppage_migratetype(page, start_migratetype);
1641 1660
1642 trace_mm_page_alloc_extfrag(page, order, current_order, 1661 trace_mm_page_alloc_extfrag(page, order, current_order,
1643 start_migratetype, fallback_mt); 1662 start_migratetype, fallback_mt);
@@ -1713,7 +1732,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
1713 else 1732 else
1714 list_add_tail(&page->lru, list); 1733 list_add_tail(&page->lru, list);
1715 list = &page->lru; 1734 list = &page->lru;
1716 if (is_migrate_cma(get_freepage_migratetype(page))) 1735 if (is_migrate_cma(get_pcppage_migratetype(page)))
1717 __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, 1736 __mod_zone_page_state(zone, NR_FREE_CMA_PAGES,
1718 -(1 << order)); 1737 -(1 << order));
1719 } 1738 }
@@ -1910,7 +1929,7 @@ void free_hot_cold_page(struct page *page, bool cold)
1910 return; 1929 return;
1911 1930
1912 migratetype = get_pfnblock_migratetype(page, pfn); 1931 migratetype = get_pfnblock_migratetype(page, pfn);
1913 set_freepage_migratetype(page, migratetype); 1932 set_pcppage_migratetype(page, migratetype);
1914 local_irq_save(flags); 1933 local_irq_save(flags);
1915 __count_vm_event(PGFREE); 1934 __count_vm_event(PGFREE);
1916 1935
@@ -2115,7 +2134,7 @@ struct page *buffered_rmqueue(struct zone *preferred_zone,
2115 if (!page) 2134 if (!page)
2116 goto failed; 2135 goto failed;
2117 __mod_zone_freepage_state(zone, -(1 << order), 2136 __mod_zone_freepage_state(zone, -(1 << order),
2118 get_freepage_migratetype(page)); 2137 get_pcppage_migratetype(page));
2119 } 2138 }
2120 2139
2121 __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order)); 2140 __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order));
@@ -2696,6 +2715,12 @@ static inline struct page *
2696__alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, 2715__alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
2697 const struct alloc_context *ac, unsigned long *did_some_progress) 2716 const struct alloc_context *ac, unsigned long *did_some_progress)
2698{ 2717{
2718 struct oom_control oc = {
2719 .zonelist = ac->zonelist,
2720 .nodemask = ac->nodemask,
2721 .gfp_mask = gfp_mask,
2722 .order = order,
2723 };
2699 struct page *page; 2724 struct page *page;
2700 2725
2701 *did_some_progress = 0; 2726 *did_some_progress = 0;
@@ -2747,8 +2772,7 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
2747 goto out; 2772 goto out;
2748 } 2773 }
2749 /* Exhausted what can be done so it's blamo time */ 2774 /* Exhausted what can be done so it's blamo time */
2750 if (out_of_memory(ac->zonelist, gfp_mask, order, ac->nodemask, false) 2775 if (out_of_memory(&oc) || WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL))
2751 || WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL))
2752 *did_some_progress = 1; 2776 *did_some_progress = 1;
2753out: 2777out:
2754 mutex_unlock(&oom_lock); 2778 mutex_unlock(&oom_lock);
@@ -3490,8 +3514,6 @@ EXPORT_SYMBOL(alloc_pages_exact);
3490 * 3514 *
3491 * Like alloc_pages_exact(), but try to allocate on node nid first before falling 3515 * Like alloc_pages_exact(), but try to allocate on node nid first before falling
3492 * back. 3516 * back.
3493 * Note this is not alloc_pages_exact_node() which allocates on a specific node,
3494 * but is not exact.
3495 */ 3517 */
3496void * __meminit alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask) 3518void * __meminit alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask)
3497{ 3519{
@@ -5066,7 +5088,7 @@ static unsigned long __meminit zone_spanned_pages_in_node(int nid,
5066{ 5088{
5067 unsigned long zone_start_pfn, zone_end_pfn; 5089 unsigned long zone_start_pfn, zone_end_pfn;
5068 5090
5069 /* When hotadd a new node, the node should be empty */ 5091 /* When hotadd a new node from cpu_up(), the node should be empty */
5070 if (!node_start_pfn && !node_end_pfn) 5092 if (!node_start_pfn && !node_end_pfn)
5071 return 0; 5093 return 0;
5072 5094
@@ -5133,7 +5155,7 @@ static unsigned long __meminit zone_absent_pages_in_node(int nid,
5133 unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type]; 5155 unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type];
5134 unsigned long zone_start_pfn, zone_end_pfn; 5156 unsigned long zone_start_pfn, zone_end_pfn;
5135 5157
5136 /* When hotadd a new node, the node should be empty */ 5158 /* When hotadd a new node from cpu_up(), the node should be empty */
5137 if (!node_start_pfn && !node_end_pfn) 5159 if (!node_start_pfn && !node_end_pfn)
5138 return 0; 5160 return 0;
5139 5161
@@ -5306,8 +5328,7 @@ static unsigned long __paginginit calc_memmap_size(unsigned long spanned_pages,
5306 * 5328 *
5307 * NOTE: pgdat should get zeroed by caller. 5329 * NOTE: pgdat should get zeroed by caller.
5308 */ 5330 */
5309static void __paginginit free_area_init_core(struct pglist_data *pgdat, 5331static void __paginginit free_area_init_core(struct pglist_data *pgdat)
5310 unsigned long node_start_pfn, unsigned long node_end_pfn)
5311{ 5332{
5312 enum zone_type j; 5333 enum zone_type j;
5313 int nid = pgdat->node_id; 5334 int nid = pgdat->node_id;
@@ -5458,7 +5479,8 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
5458#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP 5479#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
5459 get_pfn_range_for_nid(nid, &start_pfn, &end_pfn); 5480 get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
5460 pr_info("Initmem setup node %d [mem %#018Lx-%#018Lx]\n", nid, 5481 pr_info("Initmem setup node %d [mem %#018Lx-%#018Lx]\n", nid,
5461 (u64)start_pfn << PAGE_SHIFT, ((u64)end_pfn << PAGE_SHIFT) - 1); 5482 (u64)start_pfn << PAGE_SHIFT,
5483 end_pfn ? ((u64)end_pfn << PAGE_SHIFT) - 1 : 0);
5462#endif 5484#endif
5463 calculate_node_totalpages(pgdat, start_pfn, end_pfn, 5485 calculate_node_totalpages(pgdat, start_pfn, end_pfn,
5464 zones_size, zholes_size); 5486 zones_size, zholes_size);
@@ -5470,7 +5492,7 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
5470 (unsigned long)pgdat->node_mem_map); 5492 (unsigned long)pgdat->node_mem_map);
5471#endif 5493#endif
5472 5494
5473 free_area_init_core(pgdat, start_pfn, end_pfn); 5495 free_area_init_core(pgdat);
5474} 5496}
5475 5497
5476#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP 5498#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
@@ -5481,11 +5503,9 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
5481 */ 5503 */
5482void __init setup_nr_node_ids(void) 5504void __init setup_nr_node_ids(void)
5483{ 5505{
5484 unsigned int node; 5506 unsigned int highest;
5485 unsigned int highest = 0;
5486 5507
5487 for_each_node_mask(node, node_possible_map) 5508 highest = find_last_bit(node_possible_map.bits, MAX_NUMNODES);
5488 highest = node;
5489 nr_node_ids = highest + 1; 5509 nr_node_ids = highest + 1;
5490} 5510}
5491#endif 5511#endif
@@ -6006,7 +6026,7 @@ void __init mem_init_print_info(const char *str)
6006 * set_dma_reserve - set the specified number of pages reserved in the first zone 6026 * set_dma_reserve - set the specified number of pages reserved in the first zone
6007 * @new_dma_reserve: The number of pages to mark reserved 6027 * @new_dma_reserve: The number of pages to mark reserved
6008 * 6028 *
6009 * The per-cpu batchsize and zone watermarks are determined by present_pages. 6029 * The per-cpu batchsize and zone watermarks are determined by managed_pages.
6010 * In the DMA zone, a significant percentage may be consumed by kernel image 6030 * In the DMA zone, a significant percentage may be consumed by kernel image
6011 * and other unfreeable allocations which can skew the watermarks badly. This 6031 * and other unfreeable allocations which can skew the watermarks badly. This
6012 * function may optionally be used to account for unfreeable pages in the 6032 * function may optionally be used to account for unfreeable pages in the
@@ -6059,7 +6079,7 @@ void __init page_alloc_init(void)
6059} 6079}
6060 6080
6061/* 6081/*
6062 * calculate_totalreserve_pages - called when sysctl_lower_zone_reserve_ratio 6082 * calculate_totalreserve_pages - called when sysctl_lowmem_reserve_ratio
6063 * or min_free_kbytes changes. 6083 * or min_free_kbytes changes.
6064 */ 6084 */
6065static void calculate_totalreserve_pages(void) 6085static void calculate_totalreserve_pages(void)
@@ -6103,7 +6123,7 @@ static void calculate_totalreserve_pages(void)
6103 6123
6104/* 6124/*
6105 * setup_per_zone_lowmem_reserve - called whenever 6125 * setup_per_zone_lowmem_reserve - called whenever
6106 * sysctl_lower_zone_reserve_ratio changes. Ensures that each zone 6126 * sysctl_lowmem_reserve_ratio changes. Ensures that each zone
6107 * has a correct pages reserved value, so an adequate number of 6127 * has a correct pages reserved value, so an adequate number of
6108 * pages are left in the zone after a successful __alloc_pages(). 6128 * pages are left in the zone after a successful __alloc_pages().
6109 */ 6129 */
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index 303c908790ef..4568fd58f70a 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -9,7 +9,8 @@
9#include <linux/hugetlb.h> 9#include <linux/hugetlb.h>
10#include "internal.h" 10#include "internal.h"
11 11
12int set_migratetype_isolate(struct page *page, bool skip_hwpoisoned_pages) 12static int set_migratetype_isolate(struct page *page,
13 bool skip_hwpoisoned_pages)
13{ 14{
14 struct zone *zone; 15 struct zone *zone;
15 unsigned long flags, pfn; 16 unsigned long flags, pfn;
@@ -72,7 +73,7 @@ out:
72 return ret; 73 return ret;
73} 74}
74 75
75void unset_migratetype_isolate(struct page *page, unsigned migratetype) 76static void unset_migratetype_isolate(struct page *page, unsigned migratetype)
76{ 77{
77 struct zone *zone; 78 struct zone *zone;
78 unsigned long flags, nr_pages; 79 unsigned long flags, nr_pages;
@@ -223,34 +224,16 @@ __test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn,
223 continue; 224 continue;
224 } 225 }
225 page = pfn_to_page(pfn); 226 page = pfn_to_page(pfn);
226 if (PageBuddy(page)) { 227 if (PageBuddy(page))
227 /* 228 /*
228 * If race between isolatation and allocation happens, 229 * If the page is on a free list, it has to be on
229 * some free pages could be in MIGRATE_MOVABLE list 230 * the correct MIGRATE_ISOLATE freelist. There is no
230 * although pageblock's migratation type of the page 231 * simple way to verify that as VM_BUG_ON(), though.
231 * is MIGRATE_ISOLATE. Catch it and move the page into
232 * MIGRATE_ISOLATE list.
233 */ 232 */
234 if (get_freepage_migratetype(page) != MIGRATE_ISOLATE) {
235 struct page *end_page;
236
237 end_page = page + (1 << page_order(page)) - 1;
238 move_freepages(page_zone(page), page, end_page,
239 MIGRATE_ISOLATE);
240 }
241 pfn += 1 << page_order(page); 233 pfn += 1 << page_order(page);
242 } 234 else if (skip_hwpoisoned_pages && PageHWPoison(page))
243 else if (page_count(page) == 0 && 235 /* A HWPoisoned page cannot be also PageBuddy */
244 get_freepage_migratetype(page) == MIGRATE_ISOLATE)
245 pfn += 1;
246 else if (skip_hwpoisoned_pages && PageHWPoison(page)) {
247 /*
248 * The HWPoisoned page may be not in buddy
249 * system, and page_count() is not 0.
250 */
251 pfn++; 236 pfn++;
252 continue;
253 }
254 else 237 else
255 break; 238 break;
256 } 239 }
diff --git a/mm/shmem.c b/mm/shmem.c
index dbe0c1e8349c..48ce82926d93 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -542,6 +542,21 @@ void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
542} 542}
543EXPORT_SYMBOL_GPL(shmem_truncate_range); 543EXPORT_SYMBOL_GPL(shmem_truncate_range);
544 544
545static int shmem_getattr(struct vfsmount *mnt, struct dentry *dentry,
546 struct kstat *stat)
547{
548 struct inode *inode = dentry->d_inode;
549 struct shmem_inode_info *info = SHMEM_I(inode);
550
551 spin_lock(&info->lock);
552 shmem_recalc_inode(inode);
553 spin_unlock(&info->lock);
554
555 generic_fillattr(inode, stat);
556
557 return 0;
558}
559
545static int shmem_setattr(struct dentry *dentry, struct iattr *attr) 560static int shmem_setattr(struct dentry *dentry, struct iattr *attr)
546{ 561{
547 struct inode *inode = d_inode(dentry); 562 struct inode *inode = d_inode(dentry);
@@ -3122,6 +3137,7 @@ static const struct file_operations shmem_file_operations = {
3122}; 3137};
3123 3138
3124static const struct inode_operations shmem_inode_operations = { 3139static const struct inode_operations shmem_inode_operations = {
3140 .getattr = shmem_getattr,
3125 .setattr = shmem_setattr, 3141 .setattr = shmem_setattr,
3126#ifdef CONFIG_TMPFS_XATTR 3142#ifdef CONFIG_TMPFS_XATTR
3127 .setxattr = shmem_setxattr, 3143 .setxattr = shmem_setxattr,
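With ->getattr wired up as above, tmpfs recalculates the inode's accounting under info->lock before generic_fillattr() fills in struct kstat, so the intent appears to be that an ordinary stat() reports up-to-date block usage. A hedged userspace check (the path is whatever tmpfs file you point it at):

#include <stdio.h>
#include <sys/stat.h>

/* Print size and block usage of a (presumably tmpfs) file given on argv[1]. */
int main(int argc, char **argv)
{
	struct stat st;

	if (argc < 2 || stat(argv[1], &st)) {
		perror("stat");
		return 1;
	}
	printf("%s: size=%lld blocks=%lld\n", argv[1],
	       (long long)st.st_size, (long long)st.st_blocks);
	return 0;
}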
diff --git a/mm/slab.c b/mm/slab.c
index 60c936938b84..c77ebe6cc87c 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -1595,7 +1595,7 @@ static struct page *kmem_getpages(struct kmem_cache *cachep, gfp_t flags,
1595 if (memcg_charge_slab(cachep, flags, cachep->gfporder)) 1595 if (memcg_charge_slab(cachep, flags, cachep->gfporder))
1596 return NULL; 1596 return NULL;
1597 1597
1598 page = alloc_pages_exact_node(nodeid, flags | __GFP_NOTRACK, cachep->gfporder); 1598 page = __alloc_pages_node(nodeid, flags | __GFP_NOTRACK, cachep->gfporder);
1599 if (!page) { 1599 if (!page) {
1600 memcg_uncharge_slab(cachep, cachep->gfporder); 1600 memcg_uncharge_slab(cachep, cachep->gfporder);
1601 slab_out_of_memory(cachep, flags, nodeid); 1601 slab_out_of_memory(cachep, flags, nodeid);
diff --git a/mm/slab_common.c b/mm/slab_common.c
index c26829fe4e37..5ce4faeb16fb 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -500,7 +500,7 @@ void memcg_create_kmem_cache(struct mem_cgroup *memcg,
500 struct kmem_cache *root_cache) 500 struct kmem_cache *root_cache)
501{ 501{
502 static char memcg_name_buf[NAME_MAX + 1]; /* protected by slab_mutex */ 502 static char memcg_name_buf[NAME_MAX + 1]; /* protected by slab_mutex */
503 struct cgroup_subsys_state *css = mem_cgroup_css(memcg); 503 struct cgroup_subsys_state *css = &memcg->css;
504 struct memcg_cache_array *arr; 504 struct memcg_cache_array *arr;
505 struct kmem_cache *s = NULL; 505 struct kmem_cache *s = NULL;
506 char *cache_name; 506 char *cache_name;
@@ -640,6 +640,9 @@ void kmem_cache_destroy(struct kmem_cache *s)
640 bool need_rcu_barrier = false; 640 bool need_rcu_barrier = false;
641 bool busy = false; 641 bool busy = false;
642 642
643 if (unlikely(!s))
644 return;
645
643 BUG_ON(!is_root_cache(s)); 646 BUG_ON(!is_root_cache(s));
644 647
645 get_online_cpus(); 648 get_online_cpus();
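The slab_common.c hunk above makes kmem_cache_destroy() return early on a NULL cache, mirroring kfree(NULL), which lets teardown and error-unwind paths destroy unconditionally. A small standalone sketch of that calling pattern (userspace stand-ins, not the kernel API):

#include <stdio.h>
#include <stdlib.h>

struct cache { const char *name; };

/* Tolerating NULL lets callers destroy unconditionally on error paths. */
static void cache_destroy(struct cache *c)
{
	if (!c)
		return;
	printf("destroying %s\n", c->name);
	free(c);
}

int main(void)
{
	struct cache *a = malloc(sizeof(*a)), *b = NULL;

	if (a)
		a->name = "demo";
	/* error-unwind style: no per-pointer NULL checks needed */
	cache_destroy(a);
	cache_destroy(b);
	return 0;
}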
diff --git a/mm/slob.c b/mm/slob.c
index 165bbd3cd606..0d7e5df74d1f 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -45,7 +45,7 @@
45 * NUMA support in SLOB is fairly simplistic, pushing most of the real 45 * NUMA support in SLOB is fairly simplistic, pushing most of the real
46 * logic down to the page allocator, and simply doing the node accounting 46 * logic down to the page allocator, and simply doing the node accounting
47 * on the upper levels. In the event that a node id is explicitly 47 * on the upper levels. In the event that a node id is explicitly
48 * provided, alloc_pages_exact_node() with the specified node id is used 48 * provided, __alloc_pages_node() with the specified node id is used
49 * instead. The common case (or when the node id isn't explicitly provided) 49 * instead. The common case (or when the node id isn't explicitly provided)
50 * will default to the current node, as per numa_node_id(). 50 * will default to the current node, as per numa_node_id().
51 * 51 *
@@ -193,7 +193,7 @@ static void *slob_new_pages(gfp_t gfp, int order, int node)
193 193
194#ifdef CONFIG_NUMA 194#ifdef CONFIG_NUMA
195 if (node != NUMA_NO_NODE) 195 if (node != NUMA_NO_NODE)
196 page = alloc_pages_exact_node(node, gfp, order); 196 page = __alloc_pages_node(node, gfp, order);
197 else 197 else
198#endif 198#endif
199 page = alloc_pages(gfp, order); 199 page = alloc_pages(gfp, order);
diff --git a/mm/slub.c b/mm/slub.c
index 084184e706c6..f614b5dc396b 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1334,7 +1334,7 @@ static inline struct page *alloc_slab_page(struct kmem_cache *s,
1334 if (node == NUMA_NO_NODE) 1334 if (node == NUMA_NO_NODE)
1335 page = alloc_pages(flags, order); 1335 page = alloc_pages(flags, order);
1336 else 1336 else
1337 page = alloc_pages_exact_node(node, flags, order); 1337 page = __alloc_pages_node(node, flags, order);
1338 1338
1339 if (!page) 1339 if (!page)
1340 memcg_uncharge_slab(s, order); 1340 memcg_uncharge_slab(s, order);
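slab, slob and slub all switch from alloc_pages_exact_node() to __alloc_pages_node(); the call shape stays "explicit node if one was requested, otherwise the current node". A minimal userspace model of that selection (the NUMA_NO_NODE sentinel and allocator names here are stand-ins for illustration only):

#include <stdio.h>
#include <stdlib.h>

#define NUMA_NO_NODE	(-1)

/* Stand-ins for the two allocation paths in the slab code above. */
static void *alloc_on_node(int node, size_t size)
{
	printf("allocating %zu bytes on node %d\n", size, node);
	return malloc(size);
}

static void *alloc_local(size_t size)
{
	printf("allocating %zu bytes on the local node\n", size);
	return malloc(size);
}

static void *slab_page_alloc(int node, size_t size)
{
	if (node != NUMA_NO_NODE)
		return alloc_on_node(node, size);
	return alloc_local(size);
}

int main(void)
{
	free(slab_page_alloc(NUMA_NO_NODE, 4096));
	free(slab_page_alloc(1, 4096));
	return 0;
}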
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 8bc8e66138da..d504adb7fa5f 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -288,17 +288,14 @@ struct page * lookup_swap_cache(swp_entry_t entry)
288 return page; 288 return page;
289} 289}
290 290
291/* 291struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
292 * Locate a page of swap in physical memory, reserving swap cache space 292 struct vm_area_struct *vma, unsigned long addr,
293 * and reading the disk if it is not already cached. 293 bool *new_page_allocated)
294 * A failure return means that either the page allocation failed or that
295 * the swap entry is no longer in use.
296 */
297struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
298 struct vm_area_struct *vma, unsigned long addr)
299{ 294{
300 struct page *found_page, *new_page = NULL; 295 struct page *found_page, *new_page = NULL;
296 struct address_space *swapper_space = swap_address_space(entry);
301 int err; 297 int err;
298 *new_page_allocated = false;
302 299
303 do { 300 do {
304 /* 301 /*
@@ -306,8 +303,7 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
306 * called after lookup_swap_cache() failed, re-calling 303 * called after lookup_swap_cache() failed, re-calling
307 * that would confuse statistics. 304 * that would confuse statistics.
308 */ 305 */
309 found_page = find_get_page(swap_address_space(entry), 306 found_page = find_get_page(swapper_space, entry.val);
310 entry.val);
311 if (found_page) 307 if (found_page)
312 break; 308 break;
313 309
@@ -366,7 +362,7 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
366 * Initiate read into locked page and return. 362 * Initiate read into locked page and return.
367 */ 363 */
368 lru_cache_add_anon(new_page); 364 lru_cache_add_anon(new_page);
369 swap_readpage(new_page); 365 *new_page_allocated = true;
370 return new_page; 366 return new_page;
371 } 367 }
372 radix_tree_preload_end(); 368 radix_tree_preload_end();
@@ -384,6 +380,25 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
384 return found_page; 380 return found_page;
385} 381}
386 382
383/*
384 * Locate a page of swap in physical memory, reserving swap cache space
385 * and reading the disk if it is not already cached.
386 * A failure return means that either the page allocation failed or that
387 * the swap entry is no longer in use.
388 */
389struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
390 struct vm_area_struct *vma, unsigned long addr)
391{
392 bool page_was_allocated;
393 struct page *retpage = __read_swap_cache_async(entry, gfp_mask,
394 vma, addr, &page_was_allocated);
395
396 if (page_was_allocated)
397 swap_readpage(retpage);
398
399 return retpage;
400}
401
387static unsigned long swapin_nr_pages(unsigned long offset) 402static unsigned long swapin_nr_pages(unsigned long offset)
388{ 403{
389 static unsigned long prev_offset; 404 static unsigned long prev_offset;
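The swap_state.c refactoring splits read_swap_cache_async() into __read_swap_cache_async(), which only finds or inserts the swapcache page and reports whether it allocated one, and a thin wrapper that issues swap_readpage() only when the out-parameter says a new page went in. The same "lookup with an allocated flag, side effect in the wrapper" split can be modelled in plain C (all names here are illustrative):

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

/* Look up an entry in a toy cache, allocating a slot on miss and telling
 * the caller whether it did so. */
static int *cache_lookup_or_add(int key, bool *newly_allocated)
{
	static int *slots[16];

	*newly_allocated = false;
	if (key < 0 || key >= 16)
		return NULL;
	if (!slots[key]) {
		slots[key] = malloc(sizeof(int));
		if (!slots[key])
			return NULL;
		*slots[key] = 0;
		*newly_allocated = true;
	}
	return slots[key];
}

/* Wrapper keeping the old behaviour: populate the slot only when it was
 * freshly allocated (read_swap_cache_async() does swap_readpage() here). */
static int *cache_read(int key)
{
	bool was_allocated;
	int *slot = cache_lookup_or_add(key, &was_allocated);

	if (slot && was_allocated)
		*slot = key * 100;	/* "read from disk" */
	return slot;
}

int main(void)
{
	int *p = cache_read(3);	/* miss: allocates and fills */
	int *q = cache_read(3);	/* hit: no refill */

	if (p && q)
		printf("%d %d\n", *p, *q);
	return 0;
}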
diff --git a/mm/swapfile.c b/mm/swapfile.c
index aebc2dd6e649..58877312cf6b 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -875,6 +875,48 @@ int page_swapcount(struct page *page)
875} 875}
876 876
877/* 877/*
878 * How many references to @entry are currently swapped out?
879 * This considers COUNT_CONTINUED so it returns the exact answer.
880 */
881int swp_swapcount(swp_entry_t entry)
882{
883 int count, tmp_count, n;
884 struct swap_info_struct *p;
885 struct page *page;
886 pgoff_t offset;
887 unsigned char *map;
888
889 p = swap_info_get(entry);
890 if (!p)
891 return 0;
892
893 count = swap_count(p->swap_map[swp_offset(entry)]);
894 if (!(count & COUNT_CONTINUED))
895 goto out;
896
897 count &= ~COUNT_CONTINUED;
898 n = SWAP_MAP_MAX + 1;
899
900 offset = swp_offset(entry);
901 page = vmalloc_to_page(p->swap_map + offset);
902 offset &= ~PAGE_MASK;
903 VM_BUG_ON(page_private(page) != SWP_CONTINUED);
904
905 do {
906 page = list_entry(page->lru.next, struct page, lru);
907 map = kmap_atomic(page);
908 tmp_count = map[offset];
909 kunmap_atomic(map);
910
911 count += (tmp_count & ~COUNT_CONTINUED) * n;
912 n *= (SWAP_CONT_MAX + 1);
913 } while (tmp_count & COUNT_CONTINUED);
914out:
915 spin_unlock(&p->lock);
916 return count;
917}
918
919/*
878 * We can write to an anon page without COW if there are no other references 920 * We can write to an anon page without COW if there are no other references
879 * to it. And as a side-effect, free up its swap: because the old content 921 * to it. And as a side-effect, free up its swap: because the old content
880 * on disk will never be read, and seeking back there to write new content 922 * on disk will never be read, and seeking back there to write new content
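The new swp_swapcount() above reconstructs the exact reference count by starting from the per-entry map byte and walking the continuation pages: each continuation byte contributes its value times a growing radix, first (SWAP_MAP_MAX + 1) and then successive multiples of (SWAP_CONT_MAX + 1). A standalone sketch of just the arithmetic; the constant values are quoted from the swap headers from memory and should be treated as an assumption:

#include <stdio.h>

#define SWAP_MAP_MAX	0x3e	/* max count held in the base map byte */
#define SWAP_CONT_MAX	0x7f	/* max count held in a continuation byte */
#define COUNT_CONTINUED	0x80	/* "look at the next continuation byte" */

/* Reconstruct an exact count from a base byte plus continuation bytes. */
static int exact_swapcount(unsigned char base, const unsigned char *cont,
			   int ncont)
{
	int count = base & ~COUNT_CONTINUED;
	int n = SWAP_MAP_MAX + 1;
	int i;

	if (!(base & COUNT_CONTINUED))
		return count;

	for (i = 0; i < ncont; i++) {
		count += (cont[i] & ~COUNT_CONTINUED) * n;
		n *= SWAP_CONT_MAX + 1;
		if (!(cont[i] & COUNT_CONTINUED))
			break;
	}
	return count;
}

int main(void)
{
	/* base byte saturated and continued, one continuation byte of 2 */
	unsigned char cont[] = { 2 };

	printf("%d\n", exact_swapcount(SWAP_MAP_MAX | COUNT_CONTINUED,
				       cont, 1));	/* 62 + 2*63 = 188 */
	return 0;
}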
diff --git a/mm/vmscan.c b/mm/vmscan.c
index b1139039122a..2d978b28a410 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -175,7 +175,7 @@ static bool sane_reclaim(struct scan_control *sc)
175 if (!memcg) 175 if (!memcg)
176 return true; 176 return true;
177#ifdef CONFIG_CGROUP_WRITEBACK 177#ifdef CONFIG_CGROUP_WRITEBACK
178 if (cgroup_on_dfl(mem_cgroup_css(memcg)->cgroup)) 178 if (memcg->css.cgroup)
179 return true; 179 return true;
180#endif 180#endif
181 return false; 181 return false;
@@ -985,7 +985,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
985 * __GFP_IO|__GFP_FS for this reason); but more thought 985 * __GFP_IO|__GFP_FS for this reason); but more thought
986 * would probably show more reasons. 986 * would probably show more reasons.
987 * 987 *
988 * 3) Legacy memcg encounters a page that is not already marked 988 * 3) Legacy memcg encounters a page that is already marked
989 * PageReclaim. memcg does not have any dirty pages 989 * PageReclaim. memcg does not have any dirty pages
990 * throttling so we could easily OOM just because too many 990 * throttling so we could easily OOM just because too many
991 * pages are in writeback and there is nothing else to 991 * pages are in writeback and there is nothing else to
@@ -1015,12 +1015,15 @@ static unsigned long shrink_page_list(struct list_head *page_list,
1015 */ 1015 */
1016 SetPageReclaim(page); 1016 SetPageReclaim(page);
1017 nr_writeback++; 1017 nr_writeback++;
1018
1019 goto keep_locked; 1018 goto keep_locked;
1020 1019
1021 /* Case 3 above */ 1020 /* Case 3 above */
1022 } else { 1021 } else {
1022 unlock_page(page);
1023 wait_on_page_writeback(page); 1023 wait_on_page_writeback(page);
1024 /* then go back and try same page again */
1025 list_add_tail(&page->lru, page_list);
1026 continue;
1024 } 1027 }
1025 } 1028 }
1026 1029
@@ -1196,7 +1199,7 @@ cull_mlocked:
1196 if (PageSwapCache(page)) 1199 if (PageSwapCache(page))
1197 try_to_free_swap(page); 1200 try_to_free_swap(page);
1198 unlock_page(page); 1201 unlock_page(page);
1199 putback_lru_page(page); 1202 list_add(&page->lru, &ret_pages);
1200 continue; 1203 continue;
1201 1204
1202activate_locked: 1205activate_locked:
@@ -1359,7 +1362,8 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
1359 unsigned long nr_taken = 0; 1362 unsigned long nr_taken = 0;
1360 unsigned long scan; 1363 unsigned long scan;
1361 1364
1362 for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) { 1365 for (scan = 0; scan < nr_to_scan && nr_taken < nr_to_scan &&
1366 !list_empty(src); scan++) {
1363 struct page *page; 1367 struct page *page;
1364 int nr_pages; 1368 int nr_pages;
1365 1369
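The isolate_lru_pages() hunk above also bounds the loop on nr_taken, which matters because a single compound page can contribute many base pages and overshoot the request. A small model of the bounded scan (plain C, with made-up page sizes):

#include <stdio.h>

/* Each entry is the number of base pages an LRU "page" represents
 * (e.g. 512 for a THP). */
static unsigned long bounded_isolate(const int *nr_pages, int count,
				     unsigned long nr_to_scan)
{
	unsigned long nr_taken = 0;
	unsigned long scan;
	int i = 0;

	for (scan = 0; scan < nr_to_scan && nr_taken < nr_to_scan &&
			i < count; scan++, i++)
		nr_taken += nr_pages[i];

	return nr_taken;
}

int main(void)
{
	int lru[] = { 512, 512, 1, 1, 1 };	/* two THPs first */

	/* asking for 32 pages now stops after the first THP instead of
	 * continuing to pull entries off the list */
	printf("taken: %lu\n", bounded_isolate(lru, 5, 32));
	return 0;
}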
diff --git a/mm/zbud.c b/mm/zbud.c
index f3bf6f7627d8..fa48bcdff9d5 100644
--- a/mm/zbud.c
+++ b/mm/zbud.c
@@ -96,10 +96,10 @@ struct zbud_pool {
96 struct list_head buddied; 96 struct list_head buddied;
97 struct list_head lru; 97 struct list_head lru;
98 u64 pages_nr; 98 u64 pages_nr;
99 struct zbud_ops *ops; 99 const struct zbud_ops *ops;
100#ifdef CONFIG_ZPOOL 100#ifdef CONFIG_ZPOOL
101 struct zpool *zpool; 101 struct zpool *zpool;
102 struct zpool_ops *zpool_ops; 102 const struct zpool_ops *zpool_ops;
103#endif 103#endif
104}; 104};
105 105
@@ -133,12 +133,12 @@ static int zbud_zpool_evict(struct zbud_pool *pool, unsigned long handle)
133 return -ENOENT; 133 return -ENOENT;
134} 134}
135 135
136static struct zbud_ops zbud_zpool_ops = { 136static const struct zbud_ops zbud_zpool_ops = {
137 .evict = zbud_zpool_evict 137 .evict = zbud_zpool_evict
138}; 138};
139 139
140static void *zbud_zpool_create(char *name, gfp_t gfp, 140static void *zbud_zpool_create(char *name, gfp_t gfp,
141 struct zpool_ops *zpool_ops, 141 const struct zpool_ops *zpool_ops,
142 struct zpool *zpool) 142 struct zpool *zpool)
143{ 143{
144 struct zbud_pool *pool; 144 struct zbud_pool *pool;
@@ -302,7 +302,7 @@ static int num_free_chunks(struct zbud_header *zhdr)
302 * Return: pointer to the new zbud pool or NULL if the metadata allocation 302 * Return: pointer to the new zbud pool or NULL if the metadata allocation
303 * failed. 303 * failed.
304 */ 304 */
305struct zbud_pool *zbud_create_pool(gfp_t gfp, struct zbud_ops *ops) 305struct zbud_pool *zbud_create_pool(gfp_t gfp, const struct zbud_ops *ops)
306{ 306{
307 struct zbud_pool *pool; 307 struct zbud_pool *pool;
308 int i; 308 int i;
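The zbud, zpool and zswap ops tables are constified above, so the method tables can no longer be modified at run time and can be placed in read-only data. A tiny illustration of the pattern (names are invented for the example):

#include <stdio.h>

struct ops {
	int (*evict)(int handle);
};

static int demo_evict(int handle)
{
	printf("evict %d\n", handle);
	return 0;
}

/* a const ops table is immutable after build and ends up in .rodata */
static const struct ops demo_ops = {
	.evict = demo_evict,
};

int main(void)
{
	return demo_ops.evict(42);
}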
diff --git a/mm/zpool.c b/mm/zpool.c
index 722a4f60e90b..68d2dd8ed2d8 100644
--- a/mm/zpool.c
+++ b/mm/zpool.c
@@ -22,7 +22,7 @@ struct zpool {
22 22
23 struct zpool_driver *driver; 23 struct zpool_driver *driver;
24 void *pool; 24 void *pool;
25 struct zpool_ops *ops; 25 const struct zpool_ops *ops;
26 26
27 struct list_head list; 27 struct list_head list;
28}; 28};
@@ -115,7 +115,7 @@ static void zpool_put_driver(struct zpool_driver *driver)
115 * Returns: New zpool on success, NULL on failure. 115 * Returns: New zpool on success, NULL on failure.
116 */ 116 */
117struct zpool *zpool_create_pool(char *type, char *name, gfp_t gfp, 117struct zpool *zpool_create_pool(char *type, char *name, gfp_t gfp,
118 struct zpool_ops *ops) 118 const struct zpool_ops *ops)
119{ 119{
120 struct zpool_driver *driver; 120 struct zpool_driver *driver;
121 struct zpool *zpool; 121 struct zpool *zpool;
@@ -320,20 +320,6 @@ u64 zpool_get_total_size(struct zpool *zpool)
320 return zpool->driver->total_size(zpool->pool); 320 return zpool->driver->total_size(zpool->pool);
321} 321}
322 322
323static int __init init_zpool(void)
324{
325 pr_info("loaded\n");
326 return 0;
327}
328
329static void __exit exit_zpool(void)
330{
331 pr_info("unloaded\n");
332}
333
334module_init(init_zpool);
335module_exit(exit_zpool);
336
337MODULE_LICENSE("GPL"); 323MODULE_LICENSE("GPL");
338MODULE_AUTHOR("Dan Streetman <ddstreet@ieee.org>"); 324MODULE_AUTHOR("Dan Streetman <ddstreet@ieee.org>");
339MODULE_DESCRIPTION("Common API for compressed memory storage"); 325MODULE_DESCRIPTION("Common API for compressed memory storage");
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index 0a7f81aa2249..f135b1b6fcdc 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -169,14 +169,12 @@ enum zs_stat_type {
169 NR_ZS_STAT_TYPE, 169 NR_ZS_STAT_TYPE,
170}; 170};
171 171
172#ifdef CONFIG_ZSMALLOC_STAT
173
174static struct dentry *zs_stat_root;
175
176struct zs_size_stat { 172struct zs_size_stat {
177 unsigned long objs[NR_ZS_STAT_TYPE]; 173 unsigned long objs[NR_ZS_STAT_TYPE];
178}; 174};
179 175
176#ifdef CONFIG_ZSMALLOC_STAT
177static struct dentry *zs_stat_root;
180#endif 178#endif
181 179
182/* 180/*
@@ -201,6 +199,8 @@ static int zs_size_classes;
201static const int fullness_threshold_frac = 4; 199static const int fullness_threshold_frac = 4;
202 200
203struct size_class { 201struct size_class {
202 spinlock_t lock;
203 struct page *fullness_list[_ZS_NR_FULLNESS_GROUPS];
204 /* 204 /*
205 * Size of objects stored in this class. Must be multiple 205 * Size of objects stored in this class. Must be multiple
206 * of ZS_ALIGN. 206 * of ZS_ALIGN.
@@ -210,16 +210,10 @@ struct size_class {
210 210
211 /* Number of PAGE_SIZE sized pages to combine to form a 'zspage' */ 211 /* Number of PAGE_SIZE sized pages to combine to form a 'zspage' */
212 int pages_per_zspage; 212 int pages_per_zspage;
213 /* huge object: pages_per_zspage == 1 && maxobj_per_zspage == 1 */
214 bool huge;
215
216#ifdef CONFIG_ZSMALLOC_STAT
217 struct zs_size_stat stats; 213 struct zs_size_stat stats;
218#endif
219
220 spinlock_t lock;
221 214
222 struct page *fullness_list[_ZS_NR_FULLNESS_GROUPS]; 215 /* huge object: pages_per_zspage == 1 && maxobj_per_zspage == 1 */
216 bool huge;
223}; 217};
224 218
225/* 219/*
@@ -251,6 +245,15 @@ struct zs_pool {
251 gfp_t flags; /* allocation flags used when growing pool */ 245 gfp_t flags; /* allocation flags used when growing pool */
252 atomic_long_t pages_allocated; 246 atomic_long_t pages_allocated;
253 247
248 struct zs_pool_stats stats;
249
250 /* Compact classes */
251 struct shrinker shrinker;
252 /*
253 * To signify that register_shrinker() was successful
254 * and unregister_shrinker() will not Oops.
255 */
256 bool shrinker_enabled;
254#ifdef CONFIG_ZSMALLOC_STAT 257#ifdef CONFIG_ZSMALLOC_STAT
255 struct dentry *stat_dentry; 258 struct dentry *stat_dentry;
256#endif 259#endif
@@ -285,8 +288,7 @@ static int create_handle_cache(struct zs_pool *pool)
285 288
286static void destroy_handle_cache(struct zs_pool *pool) 289static void destroy_handle_cache(struct zs_pool *pool)
287{ 290{
288 if (pool->handle_cachep) 291 kmem_cache_destroy(pool->handle_cachep);
289 kmem_cache_destroy(pool->handle_cachep);
290} 292}
291 293
292static unsigned long alloc_handle(struct zs_pool *pool) 294static unsigned long alloc_handle(struct zs_pool *pool)
@@ -309,7 +311,8 @@ static void record_obj(unsigned long handle, unsigned long obj)
309 311
310#ifdef CONFIG_ZPOOL 312#ifdef CONFIG_ZPOOL
311 313
312static void *zs_zpool_create(char *name, gfp_t gfp, struct zpool_ops *zpool_ops, 314static void *zs_zpool_create(char *name, gfp_t gfp,
315 const struct zpool_ops *zpool_ops,
313 struct zpool *zpool) 316 struct zpool *zpool)
314{ 317{
315 return zs_create_pool(name, gfp); 318 return zs_create_pool(name, gfp);
@@ -441,8 +444,6 @@ static int get_size_class_index(int size)
441 return min(zs_size_classes - 1, idx); 444 return min(zs_size_classes - 1, idx);
442} 445}
443 446
444#ifdef CONFIG_ZSMALLOC_STAT
445
446static inline void zs_stat_inc(struct size_class *class, 447static inline void zs_stat_inc(struct size_class *class,
447 enum zs_stat_type type, unsigned long cnt) 448 enum zs_stat_type type, unsigned long cnt)
448{ 449{
@@ -461,6 +462,8 @@ static inline unsigned long zs_stat_get(struct size_class *class,
461 return class->stats.objs[type]; 462 return class->stats.objs[type];
462} 463}
463 464
465#ifdef CONFIG_ZSMALLOC_STAT
466
464static int __init zs_stat_init(void) 467static int __init zs_stat_init(void)
465{ 468{
466 if (!debugfs_initialized()) 469 if (!debugfs_initialized())
@@ -576,23 +579,6 @@ static void zs_pool_stat_destroy(struct zs_pool *pool)
576} 579}
577 580
578#else /* CONFIG_ZSMALLOC_STAT */ 581#else /* CONFIG_ZSMALLOC_STAT */
579
580static inline void zs_stat_inc(struct size_class *class,
581 enum zs_stat_type type, unsigned long cnt)
582{
583}
584
585static inline void zs_stat_dec(struct size_class *class,
586 enum zs_stat_type type, unsigned long cnt)
587{
588}
589
590static inline unsigned long zs_stat_get(struct size_class *class,
591 enum zs_stat_type type)
592{
593 return 0;
594}
595
596static int __init zs_stat_init(void) 582static int __init zs_stat_init(void)
597{ 583{
598 return 0; 584 return 0;
@@ -610,7 +596,6 @@ static inline int zs_pool_stat_create(char *name, struct zs_pool *pool)
610static inline void zs_pool_stat_destroy(struct zs_pool *pool) 596static inline void zs_pool_stat_destroy(struct zs_pool *pool)
611{ 597{
612} 598}
613
614#endif 599#endif
615 600
616 601
@@ -658,13 +643,22 @@ static void insert_zspage(struct page *page, struct size_class *class,
658 if (fullness >= _ZS_NR_FULLNESS_GROUPS) 643 if (fullness >= _ZS_NR_FULLNESS_GROUPS)
659 return; 644 return;
660 645
661 head = &class->fullness_list[fullness];
662 if (*head)
663 list_add_tail(&page->lru, &(*head)->lru);
664
665 *head = page;
666 zs_stat_inc(class, fullness == ZS_ALMOST_EMPTY ? 646 zs_stat_inc(class, fullness == ZS_ALMOST_EMPTY ?
667 CLASS_ALMOST_EMPTY : CLASS_ALMOST_FULL, 1); 647 CLASS_ALMOST_EMPTY : CLASS_ALMOST_FULL, 1);
648
649 head = &class->fullness_list[fullness];
650 if (!*head) {
651 *head = page;
652 return;
653 }
654
655 /*
656 * We want to see more ZS_FULL pages and less almost
657 * empty/full. Put pages with higher ->inuse first.
658 */
659 list_add_tail(&page->lru, &(*head)->lru);
660 if (page->inuse >= (*head)->inuse)
661 *head = page;
668} 662}
669 663
670/* 664/*
@@ -1495,7 +1489,7 @@ void zs_free(struct zs_pool *pool, unsigned long handle)
1495} 1489}
1496EXPORT_SYMBOL_GPL(zs_free); 1490EXPORT_SYMBOL_GPL(zs_free);
1497 1491
1498static void zs_object_copy(unsigned long src, unsigned long dst, 1492static void zs_object_copy(unsigned long dst, unsigned long src,
1499 struct size_class *class) 1493 struct size_class *class)
1500{ 1494{
1501 struct page *s_page, *d_page; 1495 struct page *s_page, *d_page;
@@ -1602,8 +1596,6 @@ struct zs_compact_control {
1602 /* Starting object index within @s_page which used for live object 1596 /* Starting object index within @s_page which used for live object
1603 * in the subpage. */ 1597 * in the subpage. */
1604 int index; 1598 int index;
1605 /* how many of objects are migrated */
1606 int nr_migrated;
1607}; 1599};
1608 1600
1609static int migrate_zspage(struct zs_pool *pool, struct size_class *class, 1601static int migrate_zspage(struct zs_pool *pool, struct size_class *class,
@@ -1614,7 +1606,6 @@ static int migrate_zspage(struct zs_pool *pool, struct size_class *class,
1614 struct page *s_page = cc->s_page; 1606 struct page *s_page = cc->s_page;
1615 struct page *d_page = cc->d_page; 1607 struct page *d_page = cc->d_page;
1616 unsigned long index = cc->index; 1608 unsigned long index = cc->index;
1617 int nr_migrated = 0;
1618 int ret = 0; 1609 int ret = 0;
1619 1610
1620 while (1) { 1611 while (1) {
@@ -1636,23 +1627,21 @@ static int migrate_zspage(struct zs_pool *pool, struct size_class *class,
1636 1627
1637 used_obj = handle_to_obj(handle); 1628 used_obj = handle_to_obj(handle);
1638 free_obj = obj_malloc(d_page, class, handle); 1629 free_obj = obj_malloc(d_page, class, handle);
1639 zs_object_copy(used_obj, free_obj, class); 1630 zs_object_copy(free_obj, used_obj, class);
1640 index++; 1631 index++;
1641 record_obj(handle, free_obj); 1632 record_obj(handle, free_obj);
1642 unpin_tag(handle); 1633 unpin_tag(handle);
1643 obj_free(pool, class, used_obj); 1634 obj_free(pool, class, used_obj);
1644 nr_migrated++;
1645 } 1635 }
1646 1636
1647 /* Remember last position in this iteration */ 1637 /* Remember last position in this iteration */
1648 cc->s_page = s_page; 1638 cc->s_page = s_page;
1649 cc->index = index; 1639 cc->index = index;
1650 cc->nr_migrated = nr_migrated;
1651 1640
1652 return ret; 1641 return ret;
1653} 1642}
1654 1643
1655static struct page *alloc_target_page(struct size_class *class) 1644static struct page *isolate_target_page(struct size_class *class)
1656{ 1645{
1657 int i; 1646 int i;
1658 struct page *page; 1647 struct page *page;
@@ -1668,8 +1657,17 @@ static struct page *alloc_target_page(struct size_class *class)
1668 return page; 1657 return page;
1669} 1658}
1670 1659
1671static void putback_zspage(struct zs_pool *pool, struct size_class *class, 1660/*
1672 struct page *first_page) 1661 * putback_zspage - add @first_page into right class's fullness list
1662 * @pool: target pool
1663 * @class: destination class
1664 * @first_page: target page
1665 *
1666 * Return @first_page's fullness_group
1667 */
1668static enum fullness_group putback_zspage(struct zs_pool *pool,
1669 struct size_class *class,
1670 struct page *first_page)
1673{ 1671{
1674 enum fullness_group fullness; 1672 enum fullness_group fullness;
1675 1673
@@ -1687,50 +1685,72 @@ static void putback_zspage(struct zs_pool *pool, struct size_class *class,
1687 1685
1688 free_zspage(first_page); 1686 free_zspage(first_page);
1689 } 1687 }
1688
1689 return fullness;
1690} 1690}
1691 1691
1692static struct page *isolate_source_page(struct size_class *class) 1692static struct page *isolate_source_page(struct size_class *class)
1693{ 1693{
1694 struct page *page; 1694 int i;
1695 struct page *page = NULL;
1695 1696
1696 page = class->fullness_list[ZS_ALMOST_EMPTY]; 1697 for (i = ZS_ALMOST_EMPTY; i >= ZS_ALMOST_FULL; i--) {
1697 if (page) 1698 page = class->fullness_list[i];
1698 remove_zspage(page, class, ZS_ALMOST_EMPTY); 1699 if (!page)
1700 continue;
1701
1702 remove_zspage(page, class, i);
1703 break;
1704 }
1699 1705
1700 return page; 1706 return page;
1701} 1707}
1702 1708
1703static unsigned long __zs_compact(struct zs_pool *pool, 1709/*
1704 struct size_class *class) 1710 *
1711 * Based on the number of unused allocated objects calculate
1712 * and return the number of pages that we can free.
1713 */
1714static unsigned long zs_can_compact(struct size_class *class)
1715{
1716 unsigned long obj_wasted;
1717
1718 obj_wasted = zs_stat_get(class, OBJ_ALLOCATED) -
1719 zs_stat_get(class, OBJ_USED);
1720
1721 obj_wasted /= get_maxobj_per_zspage(class->size,
1722 class->pages_per_zspage);
1723
1724 return obj_wasted * class->pages_per_zspage;
1725}
1726
1727static void __zs_compact(struct zs_pool *pool, struct size_class *class)
1705{ 1728{
1706 int nr_to_migrate;
1707 struct zs_compact_control cc; 1729 struct zs_compact_control cc;
1708 struct page *src_page; 1730 struct page *src_page;
1709 struct page *dst_page = NULL; 1731 struct page *dst_page = NULL;
1710 unsigned long nr_total_migrated = 0;
1711 1732
1712 spin_lock(&class->lock); 1733 spin_lock(&class->lock);
1713 while ((src_page = isolate_source_page(class))) { 1734 while ((src_page = isolate_source_page(class))) {
1714 1735
1715 BUG_ON(!is_first_page(src_page)); 1736 BUG_ON(!is_first_page(src_page));
1716 1737
1717 /* The goal is to migrate all live objects in source page */ 1738 if (!zs_can_compact(class))
1718 nr_to_migrate = src_page->inuse; 1739 break;
1740
1719 cc.index = 0; 1741 cc.index = 0;
1720 cc.s_page = src_page; 1742 cc.s_page = src_page;
1721 1743
1722 while ((dst_page = alloc_target_page(class))) { 1744 while ((dst_page = isolate_target_page(class))) {
1723 cc.d_page = dst_page; 1745 cc.d_page = dst_page;
1724 /* 1746 /*
1725 * If there is no more space in dst_page, try to 1747 * If there is no more space in dst_page, resched
1726 * allocate another zspage. 1748 * and see if anyone had allocated another zspage.
1727 */ 1749 */
1728 if (!migrate_zspage(pool, class, &cc)) 1750 if (!migrate_zspage(pool, class, &cc))
1729 break; 1751 break;
1730 1752
1731 putback_zspage(pool, class, dst_page); 1753 putback_zspage(pool, class, dst_page);
1732 nr_total_migrated += cc.nr_migrated;
1733 nr_to_migrate -= cc.nr_migrated;
1734 } 1754 }
1735 1755
1736 /* Stop if we couldn't find slot */ 1756 /* Stop if we couldn't find slot */
@@ -1738,9 +1758,9 @@ static unsigned long __zs_compact(struct zs_pool *pool,
1738 break; 1758 break;
1739 1759
1740 putback_zspage(pool, class, dst_page); 1760 putback_zspage(pool, class, dst_page);
1741 putback_zspage(pool, class, src_page); 1761 if (putback_zspage(pool, class, src_page) == ZS_EMPTY)
1762 pool->stats.pages_compacted += class->pages_per_zspage;
1742 spin_unlock(&class->lock); 1763 spin_unlock(&class->lock);
1743 nr_total_migrated += cc.nr_migrated;
1744 cond_resched(); 1764 cond_resched();
1745 spin_lock(&class->lock); 1765 spin_lock(&class->lock);
1746 } 1766 }
@@ -1749,14 +1769,11 @@ static unsigned long __zs_compact(struct zs_pool *pool,
1749 putback_zspage(pool, class, src_page); 1769 putback_zspage(pool, class, src_page);
1750 1770
1751 spin_unlock(&class->lock); 1771 spin_unlock(&class->lock);
1752
1753 return nr_total_migrated;
1754} 1772}
1755 1773
1756unsigned long zs_compact(struct zs_pool *pool) 1774unsigned long zs_compact(struct zs_pool *pool)
1757{ 1775{
1758 int i; 1776 int i;
1759 unsigned long nr_migrated = 0;
1760 struct size_class *class; 1777 struct size_class *class;
1761 1778
1762 for (i = zs_size_classes - 1; i >= 0; i--) { 1779 for (i = zs_size_classes - 1; i >= 0; i--) {
@@ -1765,13 +1782,80 @@ unsigned long zs_compact(struct zs_pool *pool)
1765 continue; 1782 continue;
1766 if (class->index != i) 1783 if (class->index != i)
1767 continue; 1784 continue;
1768 nr_migrated += __zs_compact(pool, class); 1785 __zs_compact(pool, class);
1769 } 1786 }
1770 1787
1771 return nr_migrated; 1788 return pool->stats.pages_compacted;
1772} 1789}
1773EXPORT_SYMBOL_GPL(zs_compact); 1790EXPORT_SYMBOL_GPL(zs_compact);
1774 1791
1792void zs_pool_stats(struct zs_pool *pool, struct zs_pool_stats *stats)
1793{
1794 memcpy(stats, &pool->stats, sizeof(struct zs_pool_stats));
1795}
1796EXPORT_SYMBOL_GPL(zs_pool_stats);
1797
1798static unsigned long zs_shrinker_scan(struct shrinker *shrinker,
1799 struct shrink_control *sc)
1800{
1801 unsigned long pages_freed;
1802 struct zs_pool *pool = container_of(shrinker, struct zs_pool,
1803 shrinker);
1804
1805 pages_freed = pool->stats.pages_compacted;
1806 /*
1807 * Compact classes and calculate compaction delta.
1808 * Can run concurrently with a manually triggered
1809 * (by user) compaction.
1810 */
1811 pages_freed = zs_compact(pool) - pages_freed;
1812
1813 return pages_freed ? pages_freed : SHRINK_STOP;
1814}
1815
1816static unsigned long zs_shrinker_count(struct shrinker *shrinker,
1817 struct shrink_control *sc)
1818{
1819 int i;
1820 struct size_class *class;
1821 unsigned long pages_to_free = 0;
1822 struct zs_pool *pool = container_of(shrinker, struct zs_pool,
1823 shrinker);
1824
1825 if (!pool->shrinker_enabled)
1826 return 0;
1827
1828 for (i = zs_size_classes - 1; i >= 0; i--) {
1829 class = pool->size_class[i];
1830 if (!class)
1831 continue;
1832 if (class->index != i)
1833 continue;
1834
1835 pages_to_free += zs_can_compact(class);
1836 }
1837
1838 return pages_to_free;
1839}
1840
1841static void zs_unregister_shrinker(struct zs_pool *pool)
1842{
1843 if (pool->shrinker_enabled) {
1844 unregister_shrinker(&pool->shrinker);
1845 pool->shrinker_enabled = false;
1846 }
1847}
1848
1849static int zs_register_shrinker(struct zs_pool *pool)
1850{
1851 pool->shrinker.scan_objects = zs_shrinker_scan;
1852 pool->shrinker.count_objects = zs_shrinker_count;
1853 pool->shrinker.batch = 0;
1854 pool->shrinker.seeks = DEFAULT_SEEKS;
1855
1856 return register_shrinker(&pool->shrinker);
1857}
1858
1775/** 1859/**
1776 * zs_create_pool - Creates an allocation pool to work from. 1860 * zs_create_pool - Creates an allocation pool to work from.
1777 * @flags: allocation flags used to allocate pool metadata 1861 * @flags: allocation flags used to allocate pool metadata
@@ -1857,6 +1941,12 @@ struct zs_pool *zs_create_pool(char *name, gfp_t flags)
1857 if (zs_pool_stat_create(name, pool)) 1941 if (zs_pool_stat_create(name, pool))
1858 goto err; 1942 goto err;
1859 1943
1944 /*
1945 * Not critical, we still can use the pool
1946 * and user can trigger compaction manually.
1947 */
1948 if (zs_register_shrinker(pool) == 0)
1949 pool->shrinker_enabled = true;
1860 return pool; 1950 return pool;
1861 1951
1862err: 1952err:
@@ -1869,6 +1959,7 @@ void zs_destroy_pool(struct zs_pool *pool)
1869{ 1959{
1870 int i; 1960 int i;
1871 1961
1962 zs_unregister_shrinker(pool);
1872 zs_pool_stat_destroy(pool); 1963 zs_pool_stat_destroy(pool);
1873 1964
1874 for (i = 0; i < zs_size_classes; i++) { 1965 for (i = 0; i < zs_size_classes; i++) {
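The new zs_can_compact() above estimates reclaimable memory from the allocated/used gap: unused objects divided by objects per zspage gives whole zspages that compaction could empty, and multiplying by pages_per_zspage converts that to pages. The shrinker's count callback sums this over all classes, while its scan callback reports the delta in pages_compacted. A worked model of the estimate (the class geometry here is made up):

#include <stdio.h>

struct toy_class {
	unsigned long obj_allocated;
	unsigned long obj_used;
	int maxobj_per_zspage;
	int pages_per_zspage;
};

/* Same arithmetic as zs_can_compact(): whole zspages worth of unused
 * objects, expressed in pages. */
static unsigned long can_compact(const struct toy_class *c)
{
	unsigned long obj_wasted = c->obj_allocated - c->obj_used;

	obj_wasted /= c->maxobj_per_zspage;
	return obj_wasted * c->pages_per_zspage;
}

int main(void)
{
	/* hypothetical class: 4-page zspages holding 32 objects each */
	struct toy_class c = {
		.obj_allocated	   = 320,	/* 10 zspages worth */
		.obj_used	   = 200,
		.maxobj_per_zspage = 32,
		.pages_per_zspage  = 4,
	};

	/* 120 wasted objects -> 3 whole zspages -> 12 pages */
	printf("compactable pages: %lu\n", can_compact(&c));
	return 0;
}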
diff --git a/mm/zswap.c b/mm/zswap.c
index 2d5727baed59..48a1d081e2a5 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -446,75 +446,14 @@ enum zswap_get_swap_ret {
446static int zswap_get_swap_cache_page(swp_entry_t entry, 446static int zswap_get_swap_cache_page(swp_entry_t entry,
447 struct page **retpage) 447 struct page **retpage)
448{ 448{
449 struct page *found_page, *new_page = NULL; 449 bool page_was_allocated;
450 struct address_space *swapper_space = swap_address_space(entry);
451 int err;
452 450
453 *retpage = NULL; 451 *retpage = __read_swap_cache_async(entry, GFP_KERNEL,
454 do { 452 NULL, 0, &page_was_allocated);
455 /* 453 if (page_was_allocated)
456 * First check the swap cache. Since this is normally 454 return ZSWAP_SWAPCACHE_NEW;
457 * called after lookup_swap_cache() failed, re-calling 455 if (!*retpage)
458 * that would confuse statistics.
459 */
460 found_page = find_get_page(swapper_space, entry.val);
461 if (found_page)
462 break;
463
464 /*
465 * Get a new page to read into from swap.
466 */
467 if (!new_page) {
468 new_page = alloc_page(GFP_KERNEL);
469 if (!new_page)
470 break; /* Out of memory */
471 }
472
473 /*
474 * call radix_tree_preload() while we can wait.
475 */
476 err = radix_tree_preload(GFP_KERNEL);
477 if (err)
478 break;
479
480 /*
481 * Swap entry may have been freed since our caller observed it.
482 */
483 err = swapcache_prepare(entry);
484 if (err == -EEXIST) { /* seems racy */
485 radix_tree_preload_end();
486 continue;
487 }
488 if (err) { /* swp entry is obsolete ? */
489 radix_tree_preload_end();
490 break;
491 }
492
493 /* May fail (-ENOMEM) if radix-tree node allocation failed. */
494 __set_page_locked(new_page);
495 SetPageSwapBacked(new_page);
496 err = __add_to_swap_cache(new_page, entry);
497 if (likely(!err)) {
498 radix_tree_preload_end();
499 lru_cache_add_anon(new_page);
500 *retpage = new_page;
501 return ZSWAP_SWAPCACHE_NEW;
502 }
503 radix_tree_preload_end();
504 ClearPageSwapBacked(new_page);
505 __clear_page_locked(new_page);
506 /*
507 * add_to_swap_cache() doesn't return -EEXIST, so we can safely
508 * clear SWAP_HAS_CACHE flag.
509 */
510 swapcache_free(entry);
511 } while (err != -ENOMEM);
512
513 if (new_page)
514 page_cache_release(new_page);
515 if (!found_page)
516 return ZSWAP_SWAPCACHE_FAIL; 456 return ZSWAP_SWAPCACHE_FAIL;
517 *retpage = found_page;
518 return ZSWAP_SWAPCACHE_EXIST; 457 return ZSWAP_SWAPCACHE_EXIST;
519} 458}
520 459
@@ -816,7 +755,7 @@ static void zswap_frontswap_invalidate_area(unsigned type)
816 zswap_trees[type] = NULL; 755 zswap_trees[type] = NULL;
817} 756}
818 757
819static struct zpool_ops zswap_zpool_ops = { 758static const struct zpool_ops zswap_zpool_ops = {
820 .evict = zswap_writeback_entry 759 .evict = zswap_writeback_entry
821}; 760};
822 761
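After the zswap.c simplification above, zswap_get_swap_cache_page() just maps the __read_swap_cache_async() outcome onto its three return codes: a freshly allocated page means ZSWAP_SWAPCACHE_NEW, an existing page means ZSWAP_SWAPCACHE_EXIST, and no page means ZSWAP_SWAPCACHE_FAIL. A compact standalone model of that mapping (types and the lookup helper are stand-ins):

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

enum cache_ret { CACHE_NEW, CACHE_EXIST, CACHE_FAIL };

/* Stand-in for __read_swap_cache_async(): may return an existing page,
 * a newly allocated one, or NULL. */
static void *lookup_or_alloc(bool simulate_hit, bool *was_allocated)
{
	static int existing;

	*was_allocated = false;
	if (simulate_hit)
		return &existing;
	*was_allocated = true;
	return malloc(sizeof(int));
}

static enum cache_ret get_cache_page(bool simulate_hit, void **retpage)
{
	bool was_allocated;

	*retpage = lookup_or_alloc(simulate_hit, &was_allocated);
	if (was_allocated)
		return CACHE_NEW;
	if (!*retpage)
		return CACHE_FAIL;
	return CACHE_EXIST;
}

int main(void)
{
	void *page;
	enum cache_ret r;

	r = get_cache_page(false, &page);	/* miss: CACHE_NEW */
	printf("%d\n", r);
	if (r == CACHE_NEW)
		free(page);

	r = get_cache_page(true, &page);	/* hit: CACHE_EXIST */
	printf("%d\n", r);
	return 0;
}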