author     Linus Torvalds <torvalds@linux-foundation.org>  2015-09-08 20:52:23 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>  2015-09-08 20:52:23 -0400
commit     f6f7a6369203fa3e07efb7f35cfd81efe9f25b07 (patch)
tree       97bec9ddd999040822acf314647eaf4208213589 /mm
parent     839fe9156fbe89c3157aa6146d22090f8cffddd8 (diff)
parent     df69f52d990bd85159727bd26e819d3a6e49c666 (diff)
Merge branch 'akpm' (patches from Andrew)
Merge second patch-bomb from Andrew Morton:
"Almost all of the rest of MM. There was an unusually large amount of
MM material this time"
* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (141 commits)
zpool: remove no-op module init/exit
mm: zbud: constify the zbud_ops
mm: zpool: constify the zpool_ops
mm: swap: zswap: maybe_preload & refactoring
zram: unify error reporting
zsmalloc: remove null check from destroy_handle_cache()
zsmalloc: do not take class lock in zs_shrinker_count()
zsmalloc: use class->pages_per_zspage
zsmalloc: consider ZS_ALMOST_FULL as migrate source
zsmalloc: partial page ordering within a fullness_list
zsmalloc: use shrinker to trigger auto-compaction
zsmalloc: account the number of compacted pages
zsmalloc/zram: introduce zs_pool_stats api
zsmalloc: cosmetic compaction code adjustments
zsmalloc: introduce zs_can_compact() function
zsmalloc: always keep per-class stats
zsmalloc: drop unused variable `nr_to_migrate'
mm/memblock.c: fix comment in __next_mem_range()
mm/page_alloc.c: fix type information of memoryless node
memory-hotplug: fix comments in zone_spanned_pages_in_node() and zone_spanned_pages_in_node()
...
Diffstat (limited to 'mm')
-rw-r--r--  mm/bootmem.c          |   7
-rw-r--r--  mm/compaction.c       | 175
-rw-r--r--  mm/dmapool.c          |  12
-rw-r--r--  mm/early_ioremap.c    |  22
-rw-r--r--  mm/filemap.c          |  36
-rw-r--r--  mm/huge_memory.c      | 163
-rw-r--r--  mm/hugetlb.c          | 432
-rw-r--r--  mm/hwpoison-inject.c  |   2
-rw-r--r--  mm/internal.h         |   1
-rw-r--r--  mm/kmemleak.c         |   3
-rw-r--r--  mm/list_lru.c         |   4
-rw-r--r--  mm/madvise.c          |   2
-rw-r--r--  mm/memblock.c         |  31
-rw-r--r--  mm/memcontrol.c       | 394
-rw-r--r--  mm/memory-failure.c   | 103
-rw-r--r--  mm/memory.c           |  48
-rw-r--r--  mm/mempolicy.c        |   7
-rw-r--r--  mm/mempool.c          |   3
-rw-r--r--  mm/memtest.c          |  27
-rw-r--r--  mm/migrate.c          |  13
-rw-r--r--  mm/mmap.c             |  71
-rw-r--r--  mm/oom_kill.c         | 142
-rw-r--r--  mm/page_alloc.c       |  80
-rw-r--r--  mm/page_isolation.c   |  35
-rw-r--r--  mm/shmem.c            |  16
-rw-r--r--  mm/slab.c             |   2
-rw-r--r--  mm/slab_common.c      |   5
-rw-r--r--  mm/slob.c             |   4
-rw-r--r--  mm/slub.c             |   2
-rw-r--r--  mm/swap_state.c       |  37
-rw-r--r--  mm/swapfile.c         |  42
-rw-r--r--  mm/vmscan.c           |  14
-rw-r--r--  mm/zbud.c             |  10
-rw-r--r--  mm/zpool.c            |  18
-rw-r--r--  mm/zsmalloc.c         | 235
-rw-r--r--  mm/zswap.c            |  75
36 files changed, 1243 insertions(+), 1030 deletions(-)
diff --git a/mm/bootmem.c b/mm/bootmem.c
index a23dd1934654..3b6380784c28 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -236,6 +236,7 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
 	count += pages;
 	while (pages--)
 		__free_pages_bootmem(page++, cur++, 0);
+	bdata->node_bootmem_map = NULL;
 
 	bdebug("nid=%td released=%lx\n", bdata - bootmem_node_data, count);
 
@@ -294,6 +295,9 @@ static void __init __free(bootmem_data_t *bdata,
 		sidx + bdata->node_min_pfn,
 		eidx + bdata->node_min_pfn);
 
+	if (WARN_ON(bdata->node_bootmem_map == NULL))
+		return;
+
 	if (bdata->hint_idx > sidx)
 		bdata->hint_idx = sidx;
 
@@ -314,6 +318,9 @@ static int __init __reserve(bootmem_data_t *bdata, unsigned long sidx,
 		eidx + bdata->node_min_pfn,
 		flags);
 
+	if (WARN_ON(bdata->node_bootmem_map == NULL))
+		return 0;
+
 	for (idx = sidx; idx < eidx; idx++)
 		if (test_and_set_bit(idx, bdata->node_bootmem_map)) {
 			if (exclusive) {
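The bootmem change above records that the node's bootmem bitmap has been handed back to the page allocator (node_bootmem_map = NULL) and makes later __free()/__reserve() calls warn and bail out instead of touching freed memory. A minimal userspace model of that guard pattern (toy types and names, not kernel code):

    #include <assert.h>
    #include <stdio.h>
    #include <stdlib.h>

    /* Toy stand-in for bootmem_data_t: just the bitmap pointer we care about. */
    struct toy_bdata {
        unsigned long *bootmem_map;
    };

    /* Model of free_all_bootmem_core(): release the bitmap and mark it gone. */
    static void toy_free_all(struct toy_bdata *bdata)
    {
        free(bdata->bootmem_map);
        bdata->bootmem_map = NULL;      /* mirrors bdata->node_bootmem_map = NULL */
    }

    /* Model of __reserve(): refuse to touch the bitmap once it is gone. */
    static int toy_reserve(struct toy_bdata *bdata, unsigned long idx)
    {
        if (bdata->bootmem_map == NULL) {       /* mirrors the WARN_ON() guard */
            fprintf(stderr, "warning: reserve after bootmem map was freed\n");
            return 0;
        }
        bdata->bootmem_map[idx / (8 * sizeof(unsigned long))] |=
            1UL << (idx % (8 * sizeof(unsigned long)));
        return 1;
    }

    int main(void)
    {
        struct toy_bdata bdata = { calloc(4, sizeof(unsigned long)) };

        assert(toy_reserve(&bdata, 10) == 1);   /* works while the map exists */
        toy_free_all(&bdata);
        assert(toy_reserve(&bdata, 10) == 0);   /* safely rejected afterwards */
        return 0;
    }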
diff --git a/mm/compaction.c b/mm/compaction.c
index 018f08da99a2..c5c627aae996 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -207,6 +207,13 @@ static inline bool isolation_suitable(struct compact_control *cc,
 	return !get_pageblock_skip(page);
 }
 
+static void reset_cached_positions(struct zone *zone)
+{
+	zone->compact_cached_migrate_pfn[0] = zone->zone_start_pfn;
+	zone->compact_cached_migrate_pfn[1] = zone->zone_start_pfn;
+	zone->compact_cached_free_pfn = zone_end_pfn(zone);
+}
+
 /*
  * This function is called to clear all cached information on pageblocks that
  * should be skipped for page isolation when the migrate and free page scanner
@@ -218,9 +225,6 @@ static void __reset_isolation_suitable(struct zone *zone)
 	unsigned long end_pfn = zone_end_pfn(zone);
 	unsigned long pfn;
 
-	zone->compact_cached_migrate_pfn[0] = start_pfn;
-	zone->compact_cached_migrate_pfn[1] = start_pfn;
-	zone->compact_cached_free_pfn = end_pfn;
 	zone->compact_blockskip_flush = false;
 
 	/* Walk the zone and mark every pageblock as suitable for isolation */
@@ -238,6 +242,8 @@ static void __reset_isolation_suitable(struct zone *zone)
 
 		clear_pageblock_skip(page);
 	}
+
+	reset_cached_positions(zone);
 }
 
 void reset_isolation_suitable(pg_data_t *pgdat)
@@ -431,6 +437,24 @@ static unsigned long isolate_freepages_block(struct compact_control *cc,
 
 		if (!valid_page)
 			valid_page = page;
+
+		/*
+		 * For compound pages such as THP and hugetlbfs, we can save
+		 * potentially a lot of iterations if we skip them at once.
+		 * The check is racy, but we can consider only valid values
+		 * and the only danger is skipping too much.
+		 */
+		if (PageCompound(page)) {
+			unsigned int comp_order = compound_order(page);
+
+			if (likely(comp_order < MAX_ORDER)) {
+				blockpfn += (1UL << comp_order) - 1;
+				cursor += (1UL << comp_order) - 1;
+			}
+
+			goto isolate_fail;
+		}
+
 		if (!PageBuddy(page))
 			goto isolate_fail;
 
@@ -490,6 +514,13 @@ isolate_fail:
 
 	}
 
+	/*
+	 * There is a tiny chance that we have read bogus compound_order(),
+	 * so be careful to not go outside of the pageblock.
+	 */
+	if (unlikely(blockpfn > end_pfn))
+		blockpfn = end_pfn;
+
 	trace_mm_compaction_isolate_freepages(*start_pfn, blockpfn,
 					nr_scanned, total_isolated);
 
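The free scanner now skips a whole compound page in one step and afterwards clamps blockpfn to the pageblock end in case the racy compound_order() read returned garbage. A standalone sketch of that arithmetic (the values and MAX_ORDER are illustrative, not taken from the patch):

    #include <stdio.h>

    #define MAX_ORDER 11    /* typical kernel default; an assumption for this sketch */

    /* Advance blockpfn past a compound page of the given order, as the free
     * scanner does, then clamp to end_pfn so a bogus order cannot run past
     * the pageblock. Returns the new blockpfn. */
    static unsigned long skip_compound(unsigned long blockpfn, unsigned long end_pfn,
                                       unsigned int comp_order)
    {
        if (comp_order < MAX_ORDER)
            blockpfn += (1UL << comp_order) - 1;    /* the loop's pfn++ adds the last 1 */

        if (blockpfn > end_pfn)                     /* clamp against a racy, bogus order */
            blockpfn = end_pfn;
        return blockpfn;
    }

    int main(void)
    {
        /* A THP (order 9, 512 pages) at pfn 1000 inside a pageblock ending at 1512. */
        printf("next pfn after THP: %lu\n", skip_compound(1000, 1512, 9));
        /* A garbage order read is clamped instead of overrunning the block. */
        printf("clamped pfn:        %lu\n", skip_compound(1000, 1512, 10));
        return 0;
    }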
@@ -674,6 +705,8 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
 
 	/* Time to isolate some pages for migration */
 	for (; low_pfn < end_pfn; low_pfn++) {
+		bool is_lru;
+
 		/*
 		 * Periodically drop the lock (if held) regardless of its
 		 * contention, to give chance to IRQs. Abort async compaction
@@ -717,36 +750,35 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
 		 * It's possible to migrate LRU pages and balloon pages
 		 * Skip any other type of page
 		 */
-		if (!PageLRU(page)) {
+		is_lru = PageLRU(page);
+		if (!is_lru) {
 			if (unlikely(balloon_page_movable(page))) {
 				if (balloon_page_isolate(page)) {
 					/* Successfully isolated */
 					goto isolate_success;
 				}
 			}
-			continue;
 		}
 
 		/*
-		 * PageLRU is set. lru_lock normally excludes isolation
-		 * splitting and collapsing (collapsing has already happened
-		 * if PageLRU is set) but the lock is not necessarily taken
-		 * here and it is wasteful to take it just to check transhuge.
-		 * Check TransHuge without lock and skip the whole pageblock if
-		 * it's either a transhuge or hugetlbfs page, as calling
-		 * compound_order() without preventing THP from splitting the
-		 * page underneath us may return surprising results.
+		 * Regardless of being on LRU, compound pages such as THP and
+		 * hugetlbfs are not to be compacted. We can potentially save
+		 * a lot of iterations if we skip them at once. The check is
+		 * racy, but we can consider only valid values and the only
+		 * danger is skipping too much.
 		 */
-		if (PageTransHuge(page)) {
-			if (!locked)
-				low_pfn = ALIGN(low_pfn + 1,
-						pageblock_nr_pages) - 1;
-			else
-				low_pfn += (1 << compound_order(page)) - 1;
+		if (PageCompound(page)) {
+			unsigned int comp_order = compound_order(page);
+
+			if (likely(comp_order < MAX_ORDER))
+				low_pfn += (1UL << comp_order) - 1;
 
 			continue;
 		}
 
+		if (!is_lru)
+			continue;
+
 		/*
 		 * Migration will fail if an anonymous page is pinned in memory,
 		 * so avoid taking lru_lock and isolating it unnecessarily in an
@@ -763,11 +795,17 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
 			if (!locked)
 				break;
 
-			/* Recheck PageLRU and PageTransHuge under lock */
+			/* Recheck PageLRU and PageCompound under lock */
 			if (!PageLRU(page))
 				continue;
-			if (PageTransHuge(page)) {
-				low_pfn += (1 << compound_order(page)) - 1;
+
+			/*
+			 * Page become compound since the non-locked check,
+			 * and it's on LRU. It can only be a THP so the order
+			 * is safe to read and it's 0 for tail pages.
+			 */
+			if (unlikely(PageCompound(page))) {
+				low_pfn += (1UL << compound_order(page)) - 1;
 				continue;
 			}
 		}
@@ -778,7 +816,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
 		if (__isolate_lru_page(page, isolate_mode) != 0)
 			continue;
 
-		VM_BUG_ON_PAGE(PageTransCompound(page), page);
+		VM_BUG_ON_PAGE(PageCompound(page), page);
 
 		/* Successfully isolated */
 		del_page_from_lru_list(page, lruvec, page_lru(page));
@@ -898,6 +936,16 @@ static bool suitable_migration_target(struct page *page)
 }
 
 /*
+ * Test whether the free scanner has reached the same or lower pageblock than
+ * the migration scanner, and compaction should thus terminate.
+ */
+static inline bool compact_scanners_met(struct compact_control *cc)
+{
+	return (cc->free_pfn >> pageblock_order)
+		<= (cc->migrate_pfn >> pageblock_order);
+}
+
+/*
 * Based on information in the current compact_control, find blocks
 * suitable for isolating free pages from and then isolate them.
 */
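compact_scanners_met() compares pageblock indexes rather than exact pfns, so the scanners count as having met as soon as they are working in the same pageblock. A standalone check of that comparison (the pageblock_order value is an assumption for the sketch):

    #include <stdbool.h>
    #include <stdio.h>

    #define PAGEBLOCK_ORDER 9   /* common value (2MB pageblocks, 4K pages); assumption */

    /* Same comparison as compact_scanners_met(): the scanners have met once the
     * free scanner's pageblock index is at or below the migration scanner's. */
    static bool scanners_met(unsigned long free_pfn, unsigned long migrate_pfn)
    {
        return (free_pfn >> PAGEBLOCK_ORDER) <= (migrate_pfn >> PAGEBLOCK_ORDER);
    }

    int main(void)
    {
        /* Different pageblocks: keep compacting. */
        printf("%d\n", scanners_met(4096, 1024));   /* prints 0 */
        /* Same pageblock (both shift to index 2): compaction terminates. */
        printf("%d\n", scanners_met(1400, 1100));   /* prints 1 */
        return 0;
    }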
@@ -933,8 +981,7 @@ static void isolate_freepages(struct compact_control *cc)
 	 * pages on cc->migratepages. We stop searching if the migrate
 	 * and free page scanners meet or enough free pages are isolated.
 	 */
-	for (; block_start_pfn >= low_pfn &&
-			cc->nr_migratepages > cc->nr_freepages;
+	for (; block_start_pfn >= low_pfn;
 				block_end_pfn = block_start_pfn,
 				block_start_pfn -= pageblock_nr_pages,
 				isolate_start_pfn = block_start_pfn) {
@@ -966,6 +1013,8 @@ static void isolate_freepages(struct compact_control *cc)
 					block_end_pfn, freelist, false);
 
 		/*
+		 * If we isolated enough freepages, or aborted due to async
+		 * compaction being contended, terminate the loop.
 		 * Remember where the free scanner should restart next time,
 		 * which is where isolate_freepages_block() left off.
 		 * But if it scanned the whole pageblock, isolate_start_pfn
@@ -974,27 +1023,31 @@ static void isolate_freepages(struct compact_control *cc)
 		 * In that case we will however want to restart at the start
 		 * of the previous pageblock.
 		 */
-		cc->free_pfn = (isolate_start_pfn < block_end_pfn) ?
-				isolate_start_pfn :
-				block_start_pfn - pageblock_nr_pages;
-
-		/*
-		 * isolate_freepages_block() might have aborted due to async
-		 * compaction being contended
-		 */
-		if (cc->contended)
+		if ((cc->nr_freepages >= cc->nr_migratepages)
+							|| cc->contended) {
+			if (isolate_start_pfn >= block_end_pfn)
+				isolate_start_pfn =
+					block_start_pfn - pageblock_nr_pages;
 			break;
+		} else {
+			/*
+			 * isolate_freepages_block() should not terminate
+			 * prematurely unless contended, or isolated enough
+			 */
+			VM_BUG_ON(isolate_start_pfn < block_end_pfn);
+		}
 	}
 
 	/* split_free_page does not map the pages */
 	map_pages(freelist);
 
 	/*
-	 * If we crossed the migrate scanner, we want to keep it that way
-	 * so that compact_finished() may detect this
+	 * Record where the free scanner will restart next time. Either we
+	 * broke from the loop and set isolate_start_pfn based on the last
+	 * call to isolate_freepages_block(), or we met the migration scanner
+	 * and the loop terminated due to isolate_start_pfn < low_pfn
 	 */
-	if (block_start_pfn < low_pfn)
-		cc->free_pfn = cc->migrate_pfn;
+	cc->free_pfn = isolate_start_pfn;
 }
 
 /*
@@ -1062,6 +1115,7 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
 					struct compact_control *cc)
 {
 	unsigned long low_pfn, end_pfn;
+	unsigned long isolate_start_pfn;
 	struct page *page;
 	const isolate_mode_t isolate_mode =
 		(sysctl_compact_unevictable_allowed ? ISOLATE_UNEVICTABLE : 0) |
@@ -1110,6 +1164,7 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
 			continue;
 
 		/* Perform the isolation */
+		isolate_start_pfn = low_pfn;
 		low_pfn = isolate_migratepages_block(cc, low_pfn, end_pfn,
 							isolate_mode);
 
@@ -1119,6 +1174,15 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
 		}
 
 		/*
+		 * Record where we could have freed pages by migration and not
+		 * yet flushed them to buddy allocator.
+		 * - this is the lowest page that could have been isolated and
+		 *   then freed by migration.
+		 */
+		if (cc->nr_migratepages && !cc->last_migrated_pfn)
+			cc->last_migrated_pfn = isolate_start_pfn;
+
+		/*
 		 * Either we isolated something and proceed with migration. Or
 		 * we failed and compact_zone should decide if we should
 		 * continue or not.
@@ -1127,12 +1191,8 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
 	}
 
 	acct_isolated(zone, cc);
-	/*
-	 * Record where migration scanner will be restarted. If we end up in
-	 * the same pageblock as the free scanner, make the scanners fully
-	 * meet so that compact_finished() terminates compaction.
-	 */
-	cc->migrate_pfn = (end_pfn <= cc->free_pfn) ? low_pfn : cc->free_pfn;
+	/* Record where migration scanner will be restarted. */
+	cc->migrate_pfn = low_pfn;
 
 	return cc->nr_migratepages ? ISOLATE_SUCCESS : ISOLATE_NONE;
 }
@@ -1147,11 +1207,9 @@ static int __compact_finished(struct zone *zone, struct compact_control *cc,
 		return COMPACT_PARTIAL;
 
 	/* Compaction run completes if the migrate and free scanner meet */
-	if (cc->free_pfn <= cc->migrate_pfn) {
+	if (compact_scanners_met(cc)) {
 		/* Let the next compaction start anew. */
-		zone->compact_cached_migrate_pfn[0] = zone->zone_start_pfn;
-		zone->compact_cached_migrate_pfn[1] = zone->zone_start_pfn;
-		zone->compact_cached_free_pfn = zone_end_pfn(zone);
+		reset_cached_positions(zone);
 
 		/*
 		 * Mark that the PG_migrate_skip information should be cleared
@@ -1295,7 +1353,6 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
 	unsigned long end_pfn = zone_end_pfn(zone);
 	const int migratetype = gfpflags_to_migratetype(cc->gfp_mask);
 	const bool sync = cc->mode != MIGRATE_ASYNC;
-	unsigned long last_migrated_pfn = 0;
 
 	ret = compaction_suitable(zone, cc->order, cc->alloc_flags,
 							cc->classzone_idx);
@@ -1333,6 +1390,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
 		zone->compact_cached_migrate_pfn[0] = cc->migrate_pfn;
 		zone->compact_cached_migrate_pfn[1] = cc->migrate_pfn;
 	}
+	cc->last_migrated_pfn = 0;
 
 	trace_mm_compaction_begin(start_pfn, cc->migrate_pfn,
 				cc->free_pfn, end_pfn, sync);
@@ -1342,7 +1400,6 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
 	while ((ret = compact_finished(zone, cc, migratetype)) ==
 						COMPACT_CONTINUE) {
 		int err;
-		unsigned long isolate_start_pfn = cc->migrate_pfn;
 
 		switch (isolate_migratepages(zone, cc)) {
 		case ISOLATE_ABORT:
@@ -1376,22 +1433,12 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
 			 * migrate_pages() may return -ENOMEM when scanners meet
 			 * and we want compact_finished() to detect it
 			 */
-			if (err == -ENOMEM && cc->free_pfn > cc->migrate_pfn) {
+			if (err == -ENOMEM && !compact_scanners_met(cc)) {
 				ret = COMPACT_PARTIAL;
 				goto out;
 			}
 		}
 
-		/*
-		 * Record where we could have freed pages by migration and not
-		 * yet flushed them to buddy allocator. We use the pfn that
-		 * isolate_migratepages() started from in this loop iteration
-		 * - this is the lowest page that could have been isolated and
-		 * then freed by migration.
-		 */
-		if (!last_migrated_pfn)
-			last_migrated_pfn = isolate_start_pfn;
-
 check_drain:
 		/*
 		 * Has the migration scanner moved away from the previous
@@ -1400,18 +1447,18 @@ check_drain:
 		 * compact_finished() can detect immediately if allocation
 		 * would succeed.
 		 */
-		if (cc->order > 0 && last_migrated_pfn) {
+		if (cc->order > 0 && cc->last_migrated_pfn) {
 			int cpu;
 			unsigned long current_block_start =
 				cc->migrate_pfn & ~((1UL << cc->order) - 1);
 
-			if (last_migrated_pfn < current_block_start) {
+			if (cc->last_migrated_pfn < current_block_start) {
 				cpu = get_cpu();
 				lru_add_drain_cpu(cpu);
 				drain_local_pages(zone);
 				put_cpu();
 				/* No more flushing until we migrate again */
-				last_migrated_pfn = 0;
+				cc->last_migrated_pfn = 0;
 			}
 		}
 
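The check_drain logic above flushes per-cpu pages only once the migration scanner has left the order-aligned block that contains cc->last_migrated_pfn. A small sketch of that block-start masking (the order and pfn values are made up):

    #include <stdio.h>

    /* The drain check fires once the migration scanner has moved past the
     * order-aligned block containing the last not-yet-drained migration. */
    static unsigned long block_start(unsigned long pfn, unsigned int order)
    {
        return pfn & ~((1UL << order) - 1);
    }

    int main(void)
    {
        unsigned int order = 4;                 /* illustrative allocation order */
        unsigned long last_migrated_pfn = 100, migrate_pfn = 130;

        if (last_migrated_pfn < block_start(migrate_pfn, order))
            printf("scanner left block starting at %lu: drain pcp lists\n",
                   block_start(migrate_pfn, order));
        return 0;
    }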
diff --git a/mm/dmapool.c b/mm/dmapool.c
index 59d10d16f0a5..71a8998cd03a 100644
--- a/mm/dmapool.c
+++ b/mm/dmapool.c
@@ -271,6 +271,9 @@ void dma_pool_destroy(struct dma_pool *pool)
 {
 	bool empty = false;
 
+	if (unlikely(!pool))
+		return;
+
 	mutex_lock(&pools_reg_lock);
 	mutex_lock(&pools_lock);
 	list_del(&pool->pools);
@@ -334,7 +337,7 @@ void *dma_pool_alloc(struct dma_pool *pool, gfp_t mem_flags,
 	/* pool_alloc_page() might sleep, so temporarily drop &pool->lock */
 	spin_unlock_irqrestore(&pool->lock, flags);
 
-	page = pool_alloc_page(pool, mem_flags);
+	page = pool_alloc_page(pool, mem_flags & (~__GFP_ZERO));
 	if (!page)
 		return NULL;
 
@@ -372,9 +375,14 @@ void *dma_pool_alloc(struct dma_pool *pool, gfp_t mem_flags,
 			break;
 		}
 	}
-	memset(retval, POOL_POISON_ALLOCATED, pool->size);
+	if (!(mem_flags & __GFP_ZERO))
+		memset(retval, POOL_POISON_ALLOCATED, pool->size);
 #endif
 	spin_unlock_irqrestore(&pool->lock, flags);
+
+	if (mem_flags & __GFP_ZERO)
+		memset(retval, 0, pool->size);
+
 	return retval;
 }
 EXPORT_SYMBOL(dma_pool_alloc);
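With this change dma_pool_alloc() honours __GFP_ZERO itself (and keeps the zeroing out of the debug poisoning path), so callers can drop their own memset(). A hedged sketch of a caller relying on that; the helper name is invented for illustration:

    #include <linux/dmapool.h>

    /* Sketch: allocate a zeroed descriptor from an already-created DMA pool.
     * Before this patch the caller had to memset() the result; passing
     * __GFP_ZERO is now sufficient. 'pool' came from dma_pool_create()
     * elsewhere in the (hypothetical) driver. */
    static void *demo_alloc_desc(struct dma_pool *pool, dma_addr_t *dma)
    {
        return dma_pool_alloc(pool, GFP_KERNEL | __GFP_ZERO, dma);
    }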
diff --git a/mm/early_ioremap.c b/mm/early_ioremap.c
index 0cfadafb3fb0..23f744d77ce0 100644
--- a/mm/early_ioremap.c
+++ b/mm/early_ioremap.c
@@ -224,6 +224,28 @@ early_memremap_ro(resource_size_t phys_addr, unsigned long size)
 	return (__force void *)__early_ioremap(phys_addr, size, FIXMAP_PAGE_RO);
 }
 #endif
+
+#define MAX_MAP_CHUNK	(NR_FIX_BTMAPS << PAGE_SHIFT)
+
+void __init copy_from_early_mem(void *dest, phys_addr_t src, unsigned long size)
+{
+	unsigned long slop, clen;
+	char *p;
+
+	while (size) {
+		slop = src & ~PAGE_MASK;
+		clen = size;
+		if (clen > MAX_MAP_CHUNK - slop)
+			clen = MAX_MAP_CHUNK - slop;
+		p = early_memremap(src & PAGE_MASK, clen + slop);
+		memcpy(dest, p + slop, clen);
+		early_memunmap(p, clen + slop);
+		dest += clen;
+		src += clen;
+		size -= clen;
+	}
+}
+
 #else /* CONFIG_MMU */
 
 void __init __iomem *
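copy_from_early_mem() has to copy in bounded chunks because only NR_FIX_BTMAPS fixmap slots can be mapped at once, and the slop term accounts for a source address that is not page aligned. A standalone model of the chunking arithmetic (the page and chunk sizes are illustrative, and no real mapping is performed):

    #include <stdio.h>

    #define PAGE_SIZE     4096UL
    #define PAGE_MASK     (~(PAGE_SIZE - 1))
    #define MAX_MAP_CHUNK (64 * PAGE_SIZE)  /* stand-in for NR_FIX_BTMAPS << PAGE_SHIFT */

    /* Print the chunks copy_from_early_mem() would map for a given source
     * physical address and size. */
    static void show_chunks(unsigned long src, unsigned long size)
    {
        while (size) {
            unsigned long slop = src & ~PAGE_MASK;  /* offset into the first page */
            unsigned long clen = size;

            if (clen > MAX_MAP_CHUNK - slop)
                clen = MAX_MAP_CHUNK - slop;

            printf("map phys %#lx len %lu, copy %lu bytes from offset %lu\n",
                   src & PAGE_MASK, clen + slop, clen, slop);

            src += clen;
            size -= clen;
        }
    }

    int main(void)
    {
        /* An unaligned 300 KiB copy: the first chunk is trimmed by the slop. */
        show_chunks(0x1000800, 300 * 1024);
        return 0;
    }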
diff --git a/mm/filemap.c b/mm/filemap.c
index 1283fc825458..72940fb38666 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -674,7 +674,7 @@ struct page *__page_cache_alloc(gfp_t gfp)
 		do {
 			cpuset_mems_cookie = read_mems_allowed_begin();
 			n = cpuset_mem_spread_node();
-			page = alloc_pages_exact_node(n, gfp, 0);
+			page = __alloc_pages_node(n, gfp, 0);
 		} while (!page && read_mems_allowed_retry(cpuset_mems_cookie));
 
 		return page;
@@ -2473,21 +2473,6 @@ ssize_t generic_perform_write(struct file *file,
 						iov_iter_count(i));
 
 again:
-		/*
-		 * Bring in the user page that we will copy from _first_.
-		 * Otherwise there's a nasty deadlock on copying from the
-		 * same page as we're writing to, without it being marked
-		 * up-to-date.
-		 *
-		 * Not only is this an optimisation, but it is also required
-		 * to check that the address is actually valid, when atomic
-		 * usercopies are used, below.
-		 */
-		if (unlikely(iov_iter_fault_in_readable(i, bytes))) {
-			status = -EFAULT;
-			break;
-		}
-
 		status = a_ops->write_begin(file, mapping, pos, bytes, flags,
 						&page, &fsdata);
 		if (unlikely(status < 0))
@@ -2495,8 +2480,17 @@ again:
 
 		if (mapping_writably_mapped(mapping))
 			flush_dcache_page(page);
-
+		/*
+		 * 'page' is now locked. If we are trying to copy from a
+		 * mapping of 'page' in userspace, the copy might fault and
+		 * would need PageUptodate() to complete. But, page can not be
+		 * made Uptodate without acquiring the page lock, which we hold.
+		 * Deadlock. Avoid with pagefault_disable(). Fix up below with
+		 * iov_iter_fault_in_readable().
+		 */
+		pagefault_disable();
 		copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes);
+		pagefault_enable();
 		flush_dcache_page(page);
 
 		status = a_ops->write_end(file, mapping, pos, bytes, copied,
@@ -2519,6 +2513,14 @@ again:
 			 */
 			bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset,
 						iov_iter_single_seg_count(i));
+			/*
+			 * This is the fallback to recover if the copy from
+			 * userspace above faults.
+			 */
+			if (unlikely(iov_iter_fault_in_readable(i, bytes))) {
+				status = -EFAULT;
+				break;
+			}
 			goto again;
 		}
 		pos += copied;
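The filemap change moves the prefault of the source page from before ->write_begin() to the recovery path, and wraps the atomic usercopy in pagefault_disable()/pagefault_enable() so a fault on the locked destination page cannot deadlock. A simplified userspace model of that try-then-prefault-and-retry shape (the fault is simulated; nothing here is kernel code):

    #include <stdio.h>
    #include <string.h>

    /* Pretend the first attempt hits an unmapped page and copies nothing,
     * like a faulting atomic usercopy; the retry then succeeds. */
    static size_t copy_atomic_sim(char *dst, const char *src, size_t len, int *faulted)
    {
        if (*faulted) {
            *faulted = 0;
            return 0;
        }
        memcpy(dst, src, len);
        return len;
    }

    int main(void)
    {
        char dst[16] = "", src[16] = "hello, world";
        int faulted = 1;
        size_t copied;

        do {
            copied = copy_atomic_sim(dst, src, sizeof(src), &faulted);
            if (copied == 0) {
                /* fallback path: fault the source in, then retry the copy */
                printf("no progress, prefaulting source and retrying\n");
            }
        } while (copied == 0);

        printf("copied: %s\n", dst);
        return 0;
    }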
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 279a818a39b1..b16279cbd91d 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -16,6 +16,7 @@
 #include <linux/swap.h>
 #include <linux/shrinker.h>
 #include <linux/mm_inline.h>
+#include <linux/dax.h>
 #include <linux/kthread.h>
 #include <linux/khugepaged.h>
 #include <linux/freezer.h>
@@ -105,7 +106,7 @@ static struct khugepaged_scan khugepaged_scan = {
 };
 
 
-static int set_recommended_min_free_kbytes(void)
+static void set_recommended_min_free_kbytes(void)
 {
 	struct zone *zone;
 	int nr_zones = 0;
@@ -140,7 +141,6 @@ static int set_recommended_min_free_kbytes(void)
 		min_free_kbytes = recommended_min;
 	}
 	setup_per_zone_wmarks();
-	return 0;
 }
 
 static int start_stop_khugepaged(void)
@@ -172,12 +172,7 @@ fail:
 static atomic_t huge_zero_refcount;
 struct page *huge_zero_page __read_mostly;
 
-static inline bool is_huge_zero_pmd(pmd_t pmd)
-{
-	return is_huge_zero_page(pmd_page(pmd));
-}
-
-static struct page *get_huge_zero_page(void)
+struct page *get_huge_zero_page(void)
 {
 	struct page *zero_page;
 retry:
@@ -794,16 +789,19 @@ static inline gfp_t alloc_hugepage_gfpmask(int defrag, gfp_t extra_gfp)
 }
 
 /* Caller must hold page table lock. */
-static void set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm,
+static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm,
 		struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd,
 		struct page *zero_page)
 {
 	pmd_t entry;
+	if (!pmd_none(*pmd))
+		return false;
 	entry = mk_pmd(zero_page, vma->vm_page_prot);
 	entry = pmd_mkhuge(entry);
 	pgtable_trans_huge_deposit(mm, pmd, pgtable);
 	set_pmd_at(mm, haddr, pmd, entry);
 	atomic_long_inc(&mm->nr_ptes);
+	return true;
 }
 
 int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
@@ -870,6 +868,49 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
 					flags);
 }
 
+static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
+		pmd_t *pmd, unsigned long pfn, pgprot_t prot, bool write)
+{
+	struct mm_struct *mm = vma->vm_mm;
+	pmd_t entry;
+	spinlock_t *ptl;
+
+	ptl = pmd_lock(mm, pmd);
+	if (pmd_none(*pmd)) {
+		entry = pmd_mkhuge(pfn_pmd(pfn, prot));
+		if (write) {
+			entry = pmd_mkyoung(pmd_mkdirty(entry));
+			entry = maybe_pmd_mkwrite(entry, vma);
+		}
+		set_pmd_at(mm, addr, pmd, entry);
+		update_mmu_cache_pmd(vma, addr, pmd);
+	}
+	spin_unlock(ptl);
+}
+
+int vmf_insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
+		pmd_t *pmd, unsigned long pfn, bool write)
+{
+	pgprot_t pgprot = vma->vm_page_prot;
+	/*
+	 * If we had pmd_special, we could avoid all these restrictions,
+	 * but we need to be consistent with PTEs and architectures that
+	 * can't support a 'special' bit.
+	 */
+	BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)));
+	BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) ==
+						(VM_PFNMAP|VM_MIXEDMAP));
+	BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
+	BUG_ON((vma->vm_flags & VM_MIXEDMAP) && pfn_valid(pfn));
+
+	if (addr < vma->vm_start || addr >= vma->vm_end)
+		return VM_FAULT_SIGBUS;
+	if (track_pfn_insert(vma, &pgprot, pfn))
+		return VM_FAULT_SIGBUS;
+	insert_pfn_pmd(vma, addr, pmd, pfn, pgprot, write);
+	return VM_FAULT_NOPAGE;
+}
+
 int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 		  pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
 		  struct vm_area_struct *vma)
@@ -1414,41 +1455,41 @@ out:
 int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
 		 pmd_t *pmd, unsigned long addr)
 {
+	pmd_t orig_pmd;
 	spinlock_t *ptl;
-	int ret = 0;
 
-	if (__pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
-		struct page *page;
-		pgtable_t pgtable;
-		pmd_t orig_pmd;
-		/*
-		 * For architectures like ppc64 we look at deposited pgtable
-		 * when calling pmdp_huge_get_and_clear. So do the
-		 * pgtable_trans_huge_withdraw after finishing pmdp related
-		 * operations.
-		 */
-		orig_pmd = pmdp_huge_get_and_clear_full(tlb->mm, addr, pmd,
-							tlb->fullmm);
-		tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
-		pgtable = pgtable_trans_huge_withdraw(tlb->mm, pmd);
-		if (is_huge_zero_pmd(orig_pmd)) {
-			atomic_long_dec(&tlb->mm->nr_ptes);
-			spin_unlock(ptl);
+	if (__pmd_trans_huge_lock(pmd, vma, &ptl) != 1)
+		return 0;
+	/*
+	 * For architectures like ppc64 we look at deposited pgtable
+	 * when calling pmdp_huge_get_and_clear. So do the
+	 * pgtable_trans_huge_withdraw after finishing pmdp related
+	 * operations.
+	 */
+	orig_pmd = pmdp_huge_get_and_clear_full(tlb->mm, addr, pmd,
+			tlb->fullmm);
+	tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
+	if (vma_is_dax(vma)) {
+		spin_unlock(ptl);
+		if (is_huge_zero_pmd(orig_pmd))
 			put_huge_zero_page();
-		} else {
-			page = pmd_page(orig_pmd);
-			page_remove_rmap(page);
-			VM_BUG_ON_PAGE(page_mapcount(page) < 0, page);
-			add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
-			VM_BUG_ON_PAGE(!PageHead(page), page);
-			atomic_long_dec(&tlb->mm->nr_ptes);
-			spin_unlock(ptl);
-			tlb_remove_page(tlb, page);
-		}
-		pte_free(tlb->mm, pgtable);
-		ret = 1;
+	} else if (is_huge_zero_pmd(orig_pmd)) {
+		pte_free(tlb->mm, pgtable_trans_huge_withdraw(tlb->mm, pmd));
+		atomic_long_dec(&tlb->mm->nr_ptes);
+		spin_unlock(ptl);
+		put_huge_zero_page();
+	} else {
+		struct page *page = pmd_page(orig_pmd);
+		page_remove_rmap(page);
+		VM_BUG_ON_PAGE(page_mapcount(page) < 0, page);
+		add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
+		VM_BUG_ON_PAGE(!PageHead(page), page);
+		pte_free(tlb->mm, pgtable_trans_huge_withdraw(tlb->mm, pmd));
+		atomic_long_dec(&tlb->mm->nr_ptes);
+		spin_unlock(ptl);
+		tlb_remove_page(tlb, page);
 	}
-	return ret;
+	return 1;
 }
 
 int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma,
@@ -2285,8 +2326,12 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page,
 
 static void khugepaged_alloc_sleep(void)
 {
-	wait_event_freezable_timeout(khugepaged_wait, false,
-			msecs_to_jiffies(khugepaged_alloc_sleep_millisecs));
+	DEFINE_WAIT(wait);
+
+	add_wait_queue(&khugepaged_wait, &wait);
+	freezable_schedule_timeout_interruptible(
+		msecs_to_jiffies(khugepaged_alloc_sleep_millisecs));
+	remove_wait_queue(&khugepaged_wait, &wait);
 }
 
 static int khugepaged_node_load[MAX_NUMNODES];
@@ -2373,7 +2418,7 @@ khugepaged_alloc_page(struct page **hpage, gfp_t gfp, struct mm_struct *mm,
 	 */
 	up_read(&mm->mmap_sem);
 
-	*hpage = alloc_pages_exact_node(node, gfp, HPAGE_PMD_ORDER);
+	*hpage = __alloc_pages_node(node, gfp, HPAGE_PMD_ORDER);
 	if (unlikely(!*hpage)) {
 		count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
 		*hpage = ERR_PTR(-ENOMEM);
@@ -2911,7 +2956,7 @@ void __split_huge_page_pmd(struct vm_area_struct *vma, unsigned long address,
 		pmd_t *pmd)
 {
 	spinlock_t *ptl;
-	struct page *page;
+	struct page *page = NULL;
 	struct mm_struct *mm = vma->vm_mm;
 	unsigned long haddr = address & HPAGE_PMD_MASK;
 	unsigned long mmun_start;	/* For mmu_notifiers */
@@ -2924,25 +2969,27 @@ void __split_huge_page_pmd(struct vm_area_struct *vma, unsigned long address,
 again:
 	mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
 	ptl = pmd_lock(mm, pmd);
-	if (unlikely(!pmd_trans_huge(*pmd))) {
-		spin_unlock(ptl);
-		mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
-		return;
-	}
-	if (is_huge_zero_pmd(*pmd)) {
+	if (unlikely(!pmd_trans_huge(*pmd)))
+		goto unlock;
+	if (vma_is_dax(vma)) {
+		pmd_t _pmd = pmdp_huge_clear_flush_notify(vma, haddr, pmd);
+		if (is_huge_zero_pmd(_pmd))
+			put_huge_zero_page();
+	} else if (is_huge_zero_pmd(*pmd)) {
 		__split_huge_zero_page_pmd(vma, haddr, pmd);
-		spin_unlock(ptl);
-		mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
-		return;
+	} else {
+		page = pmd_page(*pmd);
+		VM_BUG_ON_PAGE(!page_count(page), page);
+		get_page(page);
 	}
-	page = pmd_page(*pmd);
-	VM_BUG_ON_PAGE(!page_count(page), page);
-	get_page(page);
+unlock:
 	spin_unlock(ptl);
 	mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
 
-	split_huge_page(page);
+	if (!page)
+		return;
 
+	split_huge_page(page);
 	put_page(page);
 
 	/*
@@ -2991,7 +3038,7 @@ static void split_huge_page_address(struct mm_struct *mm,
 	split_huge_page_pmd_mm(mm, address, pmd);
 }
 
-void __vma_adjust_trans_huge(struct vm_area_struct *vma,
+void vma_adjust_trans_huge(struct vm_area_struct *vma,
 			     unsigned long start,
 			     unsigned long end,
 			     long adjust_next)
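vmf_insert_pfn_pmd() added above is the entry point a DAX-style ->pmd_fault handler can use to map a raw pfn with a 2MB entry. A hedged sketch of such a handler; demo_base_pfn and the offset math are invented, and only the vmf_insert_pfn_pmd() call reflects the API introduced in this patch:

    #include <linux/mm.h>
    #include <linux/huge_mm.h>

    /* Hypothetical ->pmd_fault handler for a driver that maps device memory
     * with huge entries. */
    static int demo_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
                              pmd_t *pmd, unsigned int flags)
    {
        unsigned long demo_base_pfn = 0x100000;     /* assumption: device memory base */
        unsigned long haddr = addr & PMD_MASK;      /* huge mapping must be PMD aligned */
        unsigned long pfn;

        pfn = demo_base_pfn + ((haddr - vma->vm_start) >> PAGE_SHIFT);
        return vmf_insert_pfn_pmd(vma, haddr, pmd, pfn,
                                  flags & FAULT_FLAG_WRITE);
    }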
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 51ae41d0fbc0..999fb0aef8f1 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -64,7 +64,7 @@ DEFINE_SPINLOCK(hugetlb_lock);
  * prevent spurious OOMs when the hugepage pool is fully utilized.
  */
 static int num_fault_mutexes;
-static struct mutex *htlb_fault_mutex_table ____cacheline_aligned_in_smp;
+struct mutex *hugetlb_fault_mutex_table ____cacheline_aligned_in_smp;
 
 /* Forward declaration */
 static int hugetlb_acct_memory(struct hstate *h, long delta);
@@ -240,11 +240,14 @@ struct file_region {
 
 /*
  * Add the huge page range represented by [f, t) to the reserve
- * map.  Existing regions will be expanded to accommodate the
- * specified range.  We know only existing regions need to be
- * expanded, because region_add is only called after region_chg
- * with the same range.  If a new file_region structure must
- * be allocated, it is done in region_chg.
+ * map.  In the normal case, existing regions will be expanded
+ * to accommodate the specified range.  Sufficient regions should
+ * exist for expansion due to the previous call to region_chg
+ * with the same range.  However, it is possible that region_del
+ * could have been called after region_chg and modifed the map
+ * in such a way that no region exists to be expanded.  In this
+ * case, pull a region descriptor from the cache associated with
+ * the map and use that for the new range.
  *
  * Return the number of new huge pages added to the map.  This
  * number is greater than or equal to zero.
@@ -261,6 +264,28 @@ static long region_add(struct resv_map *resv, long f, long t)
 		if (f <= rg->to)
 			break;
 
+	/*
+	 * If no region exists which can be expanded to include the
+	 * specified range, the list must have been modified by an
+	 * interleving call to region_del().  Pull a region descriptor
+	 * from the cache and use it for this range.
+	 */
+	if (&rg->link == head || t < rg->from) {
+		VM_BUG_ON(resv->region_cache_count <= 0);
+
+		resv->region_cache_count--;
+		nrg = list_first_entry(&resv->region_cache, struct file_region,
+					link);
+		list_del(&nrg->link);
+
+		nrg->from = f;
+		nrg->to = t;
+		list_add(&nrg->link, rg->link.prev);
+
+		add += t - f;
+		goto out_locked;
+	}
+
 	/* Round our left edge to the current segment if it encloses us. */
 	if (f > rg->from)
 		f = rg->from;
@@ -294,6 +319,8 @@ static long region_add(struct resv_map *resv, long f, long t)
 		add += t - nrg->to;	/* Added to end of region */
 		nrg->to = t;
 
+out_locked:
+	resv->adds_in_progress--;
 	spin_unlock(&resv->lock);
 	VM_BUG_ON(add < 0);
 	return add;
@@ -312,11 +339,14 @@ static long region_add(struct resv_map *resv, long f, long t)
  * so that the subsequent region_add call will have all the
  * regions it needs and will not fail.
  *
- * Returns the number of huge pages that need to be added
- * to the existing reservation map for the range [f, t).
- * This number is greater or equal to zero.  -ENOMEM is
- * returned if a new file_region structure is needed and can
- * not be allocated.
+ * Upon entry, region_chg will also examine the cache of region descriptors
+ * associated with the map.  If there are not enough descriptors cached, one
+ * will be allocated for the in progress add operation.
+ *
+ * Returns the number of huge pages that need to be added to the existing
+ * reservation map for the range [f, t).  This number is greater or equal to
+ * zero.  -ENOMEM is returned if a new file_region structure or cache entry
+ * is needed and can not be allocated.
  */
 static long region_chg(struct resv_map *resv, long f, long t)
 {
@@ -326,6 +356,31 @@ static long region_chg(struct resv_map *resv, long f, long t)
 
 retry:
 	spin_lock(&resv->lock);
+retry_locked:
+	resv->adds_in_progress++;
+
+	/*
+	 * Check for sufficient descriptors in the cache to accommodate
+	 * the number of in progress add operations.
+	 */
+	if (resv->adds_in_progress > resv->region_cache_count) {
+		struct file_region *trg;
+
+		VM_BUG_ON(resv->adds_in_progress - resv->region_cache_count > 1);
+		/* Must drop lock to allocate a new descriptor. */
+		resv->adds_in_progress--;
+		spin_unlock(&resv->lock);
+
+		trg = kmalloc(sizeof(*trg), GFP_KERNEL);
+		if (!trg)
+			return -ENOMEM;
+
+		spin_lock(&resv->lock);
+		list_add(&trg->link, &resv->region_cache);
+		resv->region_cache_count++;
+		goto retry_locked;
+	}
+
 	/* Locate the region we are before or in. */
 	list_for_each_entry(rg, head, link)
 		if (f <= rg->to)
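region_chg() now also guarantees a cached descriptor for the add it is preparing, and every region_chg() must later be paired with a region_add() or a region_abort() so adds_in_progress returns to zero. A toy model of that bookkeeping (simplified counters, not the kernel structures):

    #include <assert.h>

    struct toy_resv {
        int adds_in_progress;
        int region_cache_count;
    };

    static void toy_chg(struct toy_resv *r)
    {
        r->adds_in_progress++;
        if (r->adds_in_progress > r->region_cache_count)
            r->region_cache_count++;    /* stands in for the kmalloc + list_add */
    }

    static void toy_add(struct toy_resv *r, int needed_cached_region)
    {
        if (needed_cached_region)
            r->region_cache_count--;    /* consumed a cached descriptor */
        r->adds_in_progress--;
    }

    static void toy_abort(struct toy_resv *r)
    {
        r->adds_in_progress--;
    }

    int main(void)
    {
        struct toy_resv r = { 0, 0 };

        toy_chg(&r);
        toy_add(&r, 0);     /* normal case: an existing region was expanded */

        toy_chg(&r);
        toy_abort(&r);      /* e.g. the huge page allocation failed */

        toy_chg(&r);
        toy_add(&r, 1);     /* a racing region_del forced use of the cache */

        assert(r.adds_in_progress == 0);
        assert(r.region_cache_count >= r.adds_in_progress);
        return 0;
    }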
| @@ -336,6 +391,7 @@ retry: | |||
| 336 | * size such that we can guarantee to record the reservation. */ | 391 | * size such that we can guarantee to record the reservation. */ |
| 337 | if (&rg->link == head || t < rg->from) { | 392 | if (&rg->link == head || t < rg->from) { |
| 338 | if (!nrg) { | 393 | if (!nrg) { |
| 394 | resv->adds_in_progress--; | ||
| 339 | spin_unlock(&resv->lock); | 395 | spin_unlock(&resv->lock); |
| 340 | nrg = kmalloc(sizeof(*nrg), GFP_KERNEL); | 396 | nrg = kmalloc(sizeof(*nrg), GFP_KERNEL); |
| 341 | if (!nrg) | 397 | if (!nrg) |
| @@ -385,43 +441,131 @@ out_nrg: | |||
| 385 | } | 441 | } |
| 386 | 442 | ||
| 387 | /* | 443 | /* |
| 388 | * Truncate the reserve map at index 'end'. Modify/truncate any | 444 | * Abort the in progress add operation. The adds_in_progress field |
| 389 | * region which contains end. Delete any regions past end. | 445 | * of the resv_map keeps track of the operations in progress between |
| 390 | * Return the number of huge pages removed from the map. | 446 | * calls to region_chg and region_add. Operations are sometimes |
| 447 | * aborted after the call to region_chg. In such cases, region_abort | ||
| 448 | * is called to decrement the adds_in_progress counter. | ||
| 449 | * | ||
| 450 | * NOTE: The range arguments [f, t) are not needed or used in this | ||
| 451 | * routine. They are kept to make reading the calling code easier as | ||
| 452 | * arguments will match the associated region_chg call. | ||
| 391 | */ | 453 | */ |
| 392 | static long region_truncate(struct resv_map *resv, long end) | 454 | static void region_abort(struct resv_map *resv, long f, long t) |
| 455 | { | ||
| 456 | spin_lock(&resv->lock); | ||
| 457 | VM_BUG_ON(!resv->region_cache_count); | ||
| 458 | resv->adds_in_progress--; | ||
| 459 | spin_unlock(&resv->lock); | ||
| 460 | } | ||
| 461 | |||
| 462 | /* | ||
| 463 | * Delete the specified range [f, t) from the reserve map. If the | ||
| 464 | * t parameter is LONG_MAX, this indicates that ALL regions after f | ||
| 465 | * should be deleted. Locate the regions which intersect [f, t) | ||
| 466 | * and either trim, delete or split the existing regions. | ||
| 467 | * | ||
| 468 | * Returns the number of huge pages deleted from the reserve map. | ||
| 469 | * In the normal case, the return value is zero or more. In the | ||
| 470 | * case where a region must be split, a new region descriptor must | ||
| 471 | * be allocated. If the allocation fails, -ENOMEM will be returned. | ||
| 472 | * NOTE: If the parameter t == LONG_MAX, then we will never split | ||
| 473 | * a region and possibly return -ENOMEM. Callers specifying | ||
| 474 | * t == LONG_MAX do not need to check for -ENOMEM error. | ||
| 475 | */ | ||
| 476 | static long region_del(struct resv_map *resv, long f, long t) | ||
| 393 | { | 477 | { |
| 394 | struct list_head *head = &resv->regions; | 478 | struct list_head *head = &resv->regions; |
| 395 | struct file_region *rg, *trg; | 479 | struct file_region *rg, *trg; |
| 396 | long chg = 0; | 480 | struct file_region *nrg = NULL; |
| 481 | long del = 0; | ||
| 397 | 482 | ||
| 483 | retry: | ||
| 398 | spin_lock(&resv->lock); | 484 | spin_lock(&resv->lock); |
| 399 | /* Locate the region we are either in or before. */ | 485 | list_for_each_entry_safe(rg, trg, head, link) { |
| 400 | list_for_each_entry(rg, head, link) | 486 | if (rg->to <= f) |
| 401 | if (end <= rg->to) | 487 | continue; |
| 488 | if (rg->from >= t) | ||
| 402 | break; | 489 | break; |
| 403 | if (&rg->link == head) | ||
| 404 | goto out; | ||
| 405 | 490 | ||
| 406 | /* If we are in the middle of a region then adjust it. */ | 491 | if (f > rg->from && t < rg->to) { /* Must split region */ |
| 407 | if (end > rg->from) { | 492 | /* |
| 408 | chg = rg->to - end; | 493 | * Check for an entry in the cache before dropping |
| 409 | rg->to = end; | 494 | * lock and attempting allocation. |
| 410 | rg = list_entry(rg->link.next, typeof(*rg), link); | 495 | */ |
| 411 | } | 496 | if (!nrg && |
| 497 | resv->region_cache_count > resv->adds_in_progress) { | ||
| 498 | nrg = list_first_entry(&resv->region_cache, | ||
| 499 | struct file_region, | ||
| 500 | link); | ||
| 501 | list_del(&nrg->link); | ||
| 502 | resv->region_cache_count--; | ||
| 503 | } | ||
| 412 | 504 | ||
| 413 | /* Drop any remaining regions. */ | 505 | if (!nrg) { |
| 414 | list_for_each_entry_safe(rg, trg, rg->link.prev, link) { | 506 | spin_unlock(&resv->lock); |
| 415 | if (&rg->link == head) | 507 | nrg = kmalloc(sizeof(*nrg), GFP_KERNEL); |
| 508 | if (!nrg) | ||
| 509 | return -ENOMEM; | ||
| 510 | goto retry; | ||
| 511 | } | ||
| 512 | |||
| 513 | del += t - f; | ||
| 514 | |||
| 515 | /* New entry for end of split region */ | ||
| 516 | nrg->from = t; | ||
| 517 | nrg->to = rg->to; | ||
| 518 | INIT_LIST_HEAD(&nrg->link); | ||
| 519 | |||
| 520 | /* Original entry is trimmed */ | ||
| 521 | rg->to = f; | ||
| 522 | |||
| 523 | list_add(&nrg->link, &rg->link); | ||
| 524 | nrg = NULL; | ||
| 416 | break; | 525 | break; |
| 417 | chg += rg->to - rg->from; | 526 | } |
| 418 | list_del(&rg->link); | 527 | |
| 419 | kfree(rg); | 528 | if (f <= rg->from && t >= rg->to) { /* Remove entire region */ |
| 529 | del += rg->to - rg->from; | ||
| 530 | list_del(&rg->link); | ||
| 531 | kfree(rg); | ||
| 532 | continue; | ||
| 533 | } | ||
| 534 | |||
| 535 | if (f <= rg->from) { /* Trim beginning of region */ | ||
| 536 | del += t - rg->from; | ||
| 537 | rg->from = t; | ||
| 538 | } else { /* Trim end of region */ | ||
| 539 | del += rg->to - f; | ||
| 540 | rg->to = f; | ||
| 541 | } | ||
| 420 | } | 542 | } |
| 421 | 543 | ||
| 422 | out: | ||
| 423 | spin_unlock(&resv->lock); | 544 | spin_unlock(&resv->lock); |
| 424 | return chg; | 545 | kfree(nrg); |
| 546 | return del; | ||
| 547 | } | ||
| 548 | |||
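region_del() has to handle four interval cases: split an existing region, remove it entirely, trim its front, or trim its tail. The sketch below redoes that arithmetic on a plain singly linked list in userspace C; the names, the missing locking and the missing region cache are deliberate simplifications, so treat it as a reading aid rather than the kernel routine.

	#include <stdio.h>
	#include <stdlib.h>

	/* Toy model of a reserve-map region: [from, to), kept sorted. */
	struct region {
		long from, to;
		struct region *next;
	};

	/* Delete [f, t) from the sorted list; return pages removed, -1 on OOM. */
	static long model_region_del(struct region **head, long f, long t)
	{
		struct region **pp = head, *rg;
		long del = 0;

		while ((rg = *pp) != NULL) {
			if (rg->to <= f) {		/* entirely before the range */
				pp = &rg->next;
				continue;
			}
			if (rg->from >= t)		/* entirely after the range */
				break;

			if (f > rg->from && t < rg->to) {	/* must split region */
				struct region *nrg = malloc(sizeof(*nrg));

				if (!nrg)
					return -1;
				nrg->from = t;			/* new tail piece */
				nrg->to = rg->to;
				nrg->next = rg->next;
				rg->to = f;			/* original is trimmed */
				rg->next = nrg;
				del += t - f;
				break;
			}
			if (f <= rg->from && t >= rg->to) {	/* remove entire region */
				del += rg->to - rg->from;
				*pp = rg->next;
				free(rg);
				continue;
			}
			if (f <= rg->from) {			/* trim beginning */
				del += t - rg->from;
				rg->from = t;
			} else {				/* trim end */
				del += rg->to - f;
				rg->to = f;
			}
			pp = &rg->next;
		}
		return del;
	}

	static struct region *mk(long from, long to, struct region *next)
	{
		struct region *rg = malloc(sizeof(*rg));

		rg->from = from;
		rg->to = to;
		rg->next = next;
		return rg;
	}

	int main(void)
	{
		struct region *head = mk(0, 5, mk(10, 15, mk(20, 30, NULL)));

		/* Punch a hole across parts of [10,15) and [20,30). */
		printf("deleted %ld pages\n", model_region_del(&head, 12, 25)); /* 8 */
		return 0;
	}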
| 549 | /* | ||
| 550 | * A rare out of memory error was encountered which prevented removal of | ||
| 551 | * the reserve map region for a page. The huge page itself was free'ed | ||
| 552 | * and removed from the page cache. This routine will adjust the subpool | ||
| 553 | * usage count, and the global reserve count if needed. By incrementing | ||
| 554 | * these counts, the reserve map entry which could not be deleted will | ||
| 555 | * appear as a "reserved" entry instead of simply dangling with incorrect | ||
| 556 | * counts. | ||
| 557 | */ | ||
| 558 | void hugetlb_fix_reserve_counts(struct inode *inode, bool restore_reserve) | ||
| 559 | { | ||
| 560 | struct hugepage_subpool *spool = subpool_inode(inode); | ||
| 561 | long rsv_adjust; | ||
| 562 | |||
| 563 | rsv_adjust = hugepage_subpool_get_pages(spool, 1); | ||
| 564 | if (restore_reserve && rsv_adjust) { | ||
| 565 | struct hstate *h = hstate_inode(inode); | ||
| 566 | |||
| 567 | hugetlb_acct_memory(h, 1); | ||
| 568 | } | ||
| 425 | } | 569 | } |
| 426 | 570 | ||
| 427 | /* | 571 | /* |
| @@ -544,22 +688,44 @@ static void set_vma_private_data(struct vm_area_struct *vma, | |||
| 544 | struct resv_map *resv_map_alloc(void) | 688 | struct resv_map *resv_map_alloc(void) |
| 545 | { | 689 | { |
| 546 | struct resv_map *resv_map = kmalloc(sizeof(*resv_map), GFP_KERNEL); | 690 | struct resv_map *resv_map = kmalloc(sizeof(*resv_map), GFP_KERNEL); |
| 547 | if (!resv_map) | 691 | struct file_region *rg = kmalloc(sizeof(*rg), GFP_KERNEL); |
| 692 | |||
| 693 | if (!resv_map || !rg) { | ||
| 694 | kfree(resv_map); | ||
| 695 | kfree(rg); | ||
| 548 | return NULL; | 696 | return NULL; |
| 697 | } | ||
| 549 | 698 | ||
| 550 | kref_init(&resv_map->refs); | 699 | kref_init(&resv_map->refs); |
| 551 | spin_lock_init(&resv_map->lock); | 700 | spin_lock_init(&resv_map->lock); |
| 552 | INIT_LIST_HEAD(&resv_map->regions); | 701 | INIT_LIST_HEAD(&resv_map->regions); |
| 553 | 702 | ||
| 703 | resv_map->adds_in_progress = 0; | ||
| 704 | |||
| 705 | INIT_LIST_HEAD(&resv_map->region_cache); | ||
| 706 | list_add(&rg->link, &resv_map->region_cache); | ||
| 707 | resv_map->region_cache_count = 1; | ||
| 708 | |||
| 554 | return resv_map; | 709 | return resv_map; |
| 555 | } | 710 | } |
| 556 | 711 | ||
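resv_map_alloc() now allocates the map and one spare region descriptor together and funnels both failure cases through the same cleanup, relying on kfree(NULL) being a no-op. A userspace rendering of that idiom (free(NULL) is likewise harmless; the struct names here are invented) might look like this:

	#include <stdlib.h>
	#include <string.h>

	struct cache_entry { struct cache_entry *next; };
	struct map { struct cache_entry *cache; int cache_count; };

	/* Allocate the map plus one pre-seeded cache entry in one shot. */
	static struct map *map_alloc(void)
	{
		struct map *m = malloc(sizeof(*m));
		struct cache_entry *e = malloc(sizeof(*e));

		if (!m || !e) {
			free(m);	/* free(NULL) is harmless */
			free(e);
			return NULL;
		}
		memset(m, 0, sizeof(*m));
		e->next = NULL;
		m->cache = e;
		m->cache_count = 1;
		return m;
	}

	int main(void)
	{
		struct map *m = map_alloc();

		if (m) {
			free(m->cache);
			free(m);
		}
		return 0;
	}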
| 557 | void resv_map_release(struct kref *ref) | 712 | void resv_map_release(struct kref *ref) |
| 558 | { | 713 | { |
| 559 | struct resv_map *resv_map = container_of(ref, struct resv_map, refs); | 714 | struct resv_map *resv_map = container_of(ref, struct resv_map, refs); |
| 715 | struct list_head *head = &resv_map->region_cache; | ||
| 716 | struct file_region *rg, *trg; | ||
| 560 | 717 | ||
| 561 | /* Clear out any active regions before we release the map. */ | 718 | /* Clear out any active regions before we release the map. */ |
| 562 | region_truncate(resv_map, 0); | 719 | region_del(resv_map, 0, LONG_MAX); |
| 720 | |||
| 721 | /* ... and any entries left in the cache */ | ||
| 722 | list_for_each_entry_safe(rg, trg, head, link) { | ||
| 723 | list_del(&rg->link); | ||
| 724 | kfree(rg); | ||
| 725 | } | ||
| 726 | |||
| 727 | VM_BUG_ON(resv_map->adds_in_progress); | ||
| 728 | |||
| 563 | kfree(resv_map); | 729 | kfree(resv_map); |
| 564 | } | 730 | } |
| 565 | 731 | ||
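resv_map_release() drains the new region cache with list_for_each_entry_safe(), the pattern for freeing entries while still walking the list. The same move on an ordinary singly linked list, shown only to illustrate why the next pointer must be saved before the free:

	#include <stdlib.h>

	struct node { struct node *next; };

	/* Free every node; read 'next' before freeing, as the _safe iterator does. */
	static void drain(struct node **head)
	{
		struct node *n = *head, *next;

		while (n) {
			next = n->next;	/* must be captured before free(n) */
			free(n);
			n = next;
		}
		*head = NULL;
	}

	int main(void)
	{
		struct node *head = NULL;

		for (int i = 0; i < 3; i++) {
			struct node *n = malloc(sizeof(*n));

			n->next = head;
			head = n;
		}
		drain(&head);
		return 0;
	}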
| @@ -635,8 +801,19 @@ static bool vma_has_reserves(struct vm_area_struct *vma, long chg) | |||
| 635 | } | 801 | } |
| 636 | 802 | ||
| 637 | /* Shared mappings always use reserves */ | 803 | /* Shared mappings always use reserves */ |
| 638 | if (vma->vm_flags & VM_MAYSHARE) | 804 | if (vma->vm_flags & VM_MAYSHARE) { |
| 639 | return true; | 805 | /* |
| 806 | * We know VM_NORESERVE is not set. Therefore, there SHOULD | ||
| 807 | * be a region map for all pages. The only situation where | ||
| 808 | * there is no region map is if a hole was punched via | ||
| 809 | * fallocate. In this case, there really are no reserves to | ||
| 810 | * use. This situation is indicated if chg != 0. | ||
| 811 | */ | ||
| 812 | if (chg) | ||
| 813 | return false; | ||
| 814 | else | ||
| 815 | return true; | ||
| 816 | } | ||
| 640 | 817 | ||
| 641 | /* | 818 | /* |
| 642 | * Only the process that called mmap() has reserves for | 819 | * Only the process that called mmap() has reserves for |
| @@ -1154,7 +1331,7 @@ static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid) | |||
| 1154 | { | 1331 | { |
| 1155 | struct page *page; | 1332 | struct page *page; |
| 1156 | 1333 | ||
| 1157 | page = alloc_pages_exact_node(nid, | 1334 | page = __alloc_pages_node(nid, |
| 1158 | htlb_alloc_mask(h)|__GFP_COMP|__GFP_THISNODE| | 1335 | htlb_alloc_mask(h)|__GFP_COMP|__GFP_THISNODE| |
| 1159 | __GFP_REPEAT|__GFP_NOWARN, | 1336 | __GFP_REPEAT|__GFP_NOWARN, |
| 1160 | huge_page_order(h)); | 1337 | huge_page_order(h)); |
| @@ -1306,7 +1483,7 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, int nid) | |||
| 1306 | __GFP_REPEAT|__GFP_NOWARN, | 1483 | __GFP_REPEAT|__GFP_NOWARN, |
| 1307 | huge_page_order(h)); | 1484 | huge_page_order(h)); |
| 1308 | else | 1485 | else |
| 1309 | page = alloc_pages_exact_node(nid, | 1486 | page = __alloc_pages_node(nid, |
| 1310 | htlb_alloc_mask(h)|__GFP_COMP|__GFP_THISNODE| | 1487 | htlb_alloc_mask(h)|__GFP_COMP|__GFP_THISNODE| |
| 1311 | __GFP_REPEAT|__GFP_NOWARN, huge_page_order(h)); | 1488 | __GFP_REPEAT|__GFP_NOWARN, huge_page_order(h)); |
| 1312 | 1489 | ||
| @@ -1473,16 +1650,19 @@ static void return_unused_surplus_pages(struct hstate *h, | |||
| 1473 | } | 1650 | } |
| 1474 | } | 1651 | } |
| 1475 | 1652 | ||
| 1653 | |||
| 1476 | /* | 1654 | /* |
| 1477 | * vma_needs_reservation and vma_commit_reservation are used by the huge | 1655 | * vma_needs_reservation, vma_commit_reservation and vma_end_reservation |
| 1478 | * page allocation routines to manage reservations. | 1656 | * are used by the huge page allocation routines to manage reservations. |
| 1479 | * | 1657 | * |
| 1480 | * vma_needs_reservation is called to determine if the huge page at addr | 1658 | * vma_needs_reservation is called to determine if the huge page at addr |
| 1481 | * within the vma has an associated reservation. If a reservation is | 1659 | * within the vma has an associated reservation. If a reservation is |
| 1482 | * needed, the value 1 is returned. The caller is then responsible for | 1660 | * needed, the value 1 is returned. The caller is then responsible for |
| 1483 | * managing the global reservation and subpool usage counts. After | 1661 | * managing the global reservation and subpool usage counts. After |
| 1484 | * the huge page has been allocated, vma_commit_reservation is called | 1662 | * the huge page has been allocated, vma_commit_reservation is called |
| 1485 | * to add the page to the reservation map. | 1663 | * to add the page to the reservation map. If the page allocation fails, |
| 1664 | * the reservation must be ended instead of committed. vma_end_reservation | ||
| 1665 | * is called in such cases. | ||
| 1486 | * | 1666 | * |
| 1487 | * In the normal case, vma_commit_reservation returns the same value | 1667 | * In the normal case, vma_commit_reservation returns the same value |
| 1488 | * as the preceding vma_needs_reservation call. The only time this | 1668 | * as the preceding vma_needs_reservation call. The only time this |
| @@ -1490,9 +1670,14 @@ static void return_unused_surplus_pages(struct hstate *h, | |||
| 1490 | * is the responsibility of the caller to notice the difference and | 1670 | * is the responsibility of the caller to notice the difference and |
| 1491 | * take appropriate action. | 1671 | * take appropriate action. |
| 1492 | */ | 1672 | */ |
| 1673 | enum vma_resv_mode { | ||
| 1674 | VMA_NEEDS_RESV, | ||
| 1675 | VMA_COMMIT_RESV, | ||
| 1676 | VMA_END_RESV, | ||
| 1677 | }; | ||
| 1493 | static long __vma_reservation_common(struct hstate *h, | 1678 | static long __vma_reservation_common(struct hstate *h, |
| 1494 | struct vm_area_struct *vma, unsigned long addr, | 1679 | struct vm_area_struct *vma, unsigned long addr, |
| 1495 | bool commit) | 1680 | enum vma_resv_mode mode) |
| 1496 | { | 1681 | { |
| 1497 | struct resv_map *resv; | 1682 | struct resv_map *resv; |
| 1498 | pgoff_t idx; | 1683 | pgoff_t idx; |
| @@ -1503,10 +1688,20 @@ static long __vma_reservation_common(struct hstate *h, | |||
| 1503 | return 1; | 1688 | return 1; |
| 1504 | 1689 | ||
| 1505 | idx = vma_hugecache_offset(h, vma, addr); | 1690 | idx = vma_hugecache_offset(h, vma, addr); |
| 1506 | if (commit) | 1691 | switch (mode) { |
| 1507 | ret = region_add(resv, idx, idx + 1); | 1692 | case VMA_NEEDS_RESV: |
| 1508 | else | ||
| 1509 | ret = region_chg(resv, idx, idx + 1); | 1693 | ret = region_chg(resv, idx, idx + 1); |
| 1694 | break; | ||
| 1695 | case VMA_COMMIT_RESV: | ||
| 1696 | ret = region_add(resv, idx, idx + 1); | ||
| 1697 | break; | ||
| 1698 | case VMA_END_RESV: | ||
| 1699 | region_abort(resv, idx, idx + 1); | ||
| 1700 | ret = 0; | ||
| 1701 | break; | ||
| 1702 | default: | ||
| 1703 | BUG(); | ||
| 1704 | } | ||
| 1510 | 1705 | ||
| 1511 | if (vma->vm_flags & VM_MAYSHARE) | 1706 | if (vma->vm_flags & VM_MAYSHARE) |
| 1512 | return ret; | 1707 | return ret; |
| @@ -1517,47 +1712,79 @@ static long __vma_reservation_common(struct hstate *h, | |||
| 1517 | static long vma_needs_reservation(struct hstate *h, | 1712 | static long vma_needs_reservation(struct hstate *h, |
| 1518 | struct vm_area_struct *vma, unsigned long addr) | 1713 | struct vm_area_struct *vma, unsigned long addr) |
| 1519 | { | 1714 | { |
| 1520 | return __vma_reservation_common(h, vma, addr, false); | 1715 | return __vma_reservation_common(h, vma, addr, VMA_NEEDS_RESV); |
| 1521 | } | 1716 | } |
| 1522 | 1717 | ||
| 1523 | static long vma_commit_reservation(struct hstate *h, | 1718 | static long vma_commit_reservation(struct hstate *h, |
| 1524 | struct vm_area_struct *vma, unsigned long addr) | 1719 | struct vm_area_struct *vma, unsigned long addr) |
| 1525 | { | 1720 | { |
| 1526 | return __vma_reservation_common(h, vma, addr, true); | 1721 | return __vma_reservation_common(h, vma, addr, VMA_COMMIT_RESV); |
| 1722 | } | ||
| 1723 | |||
| 1724 | static void vma_end_reservation(struct hstate *h, | ||
| 1725 | struct vm_area_struct *vma, unsigned long addr) | ||
| 1726 | { | ||
| 1727 | (void)__vma_reservation_common(h, vma, addr, VMA_END_RESV); | ||
| 1527 | } | 1728 | } |
| 1528 | 1729 | ||
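The three wrappers vma_needs_reservation(), vma_commit_reservation() and vma_end_reservation() are thin shims over one mode-dispatched helper. A compact userspace sketch of that shape (hypothetical names and return values, not the kernel logic) follows, together with the intended call order: chg first, then either commit or end.

	#include <stdio.h>

	enum resv_mode { NEEDS_RESV, COMMIT_RESV, END_RESV };

	/* One common helper, three modes -- mirrors the wrapper structure. */
	static long reservation_common(long idx, enum resv_mode mode)
	{
		switch (mode) {
		case NEEDS_RESV:
			printf("chg   [%ld, %ld)\n", idx, idx + 1);
			return 1;		/* pretend a reservation is needed */
		case COMMIT_RESV:
			printf("add   [%ld, %ld)\n", idx, idx + 1);
			return 1;
		case END_RESV:
			printf("abort [%ld, %ld)\n", idx, idx + 1);
			return 0;
		}
		return -1;			/* unreachable with valid modes */
	}

	static long needs_reservation(long idx)  { return reservation_common(idx, NEEDS_RESV); }
	static long commit_reservation(long idx) { return reservation_common(idx, COMMIT_RESV); }
	static void end_reservation(long idx)    { (void)reservation_common(idx, END_RESV); }

	int main(void)
	{
		long idx = 42;
		int page_alloc_failed = 1;	/* pretend the huge page allocation failed */

		if (needs_reservation(idx) < 0)
			return 1;
		if (page_alloc_failed)
			end_reservation(idx);	/* roll back, nothing was committed */
		else
			commit_reservation(idx);
		return 0;
	}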
| 1529 | static struct page *alloc_huge_page(struct vm_area_struct *vma, | 1730 | struct page *alloc_huge_page(struct vm_area_struct *vma, |
| 1530 | unsigned long addr, int avoid_reserve) | 1731 | unsigned long addr, int avoid_reserve) |
| 1531 | { | 1732 | { |
| 1532 | struct hugepage_subpool *spool = subpool_vma(vma); | 1733 | struct hugepage_subpool *spool = subpool_vma(vma); |
| 1533 | struct hstate *h = hstate_vma(vma); | 1734 | struct hstate *h = hstate_vma(vma); |
| 1534 | struct page *page; | 1735 | struct page *page; |
| 1535 | long chg, commit; | 1736 | long map_chg, map_commit; |
| 1737 | long gbl_chg; | ||
| 1536 | int ret, idx; | 1738 | int ret, idx; |
| 1537 | struct hugetlb_cgroup *h_cg; | 1739 | struct hugetlb_cgroup *h_cg; |
| 1538 | 1740 | ||
| 1539 | idx = hstate_index(h); | 1741 | idx = hstate_index(h); |
| 1540 | /* | 1742 | /* |
| 1541 | * Processes that did not create the mapping will have no | 1743 | * Examine the region/reserve map to determine if the process |
| 1542 | * reserves and will not have accounted against subpool | 1744 | * has a reservation for the page to be allocated. A return |
| 1543 | * limit. Check that the subpool limit can be made before | 1745 | * code of zero indicates a reservation exists (no change). |
| 1544 | * satisfying the allocation MAP_NORESERVE mappings may also | ||
| 1545 | * need pages and subpool limit allocated allocated if no reserve | ||
| 1546 | * mapping overlaps. | ||
| 1547 | */ | 1746 | */ |
| 1548 | chg = vma_needs_reservation(h, vma, addr); | 1747 | map_chg = gbl_chg = vma_needs_reservation(h, vma, addr); |
| 1549 | if (chg < 0) | 1748 | if (map_chg < 0) |
| 1550 | return ERR_PTR(-ENOMEM); | 1749 | return ERR_PTR(-ENOMEM); |
| 1551 | if (chg || avoid_reserve) | 1750 | |
| 1552 | if (hugepage_subpool_get_pages(spool, 1) < 0) | 1751 | /* |
| 1752 | * Processes that did not create the mapping will have no | ||
| 1753 | * reserves as indicated by the region/reserve map. Check | ||
| 1754 | * that the allocation will not exceed the subpool limit. | ||
| 1755 | * Allocations for MAP_NORESERVE mappings also need to be | ||
| 1756 | * checked against any subpool limit. | ||
| 1757 | */ | ||
| 1758 | if (map_chg || avoid_reserve) { | ||
| 1759 | gbl_chg = hugepage_subpool_get_pages(spool, 1); | ||
| 1760 | if (gbl_chg < 0) { | ||
| 1761 | vma_end_reservation(h, vma, addr); | ||
| 1553 | return ERR_PTR(-ENOSPC); | 1762 | return ERR_PTR(-ENOSPC); |
| 1763 | } | ||
| 1764 | |||
| 1765 | /* | ||
| 1766 | * Even though there was no reservation in the region/reserve | ||
| 1767 | * map, there could be reservations associated with the | ||
| 1768 | * subpool that can be used. This would be indicated if the | ||
| 1769 | * return value of hugepage_subpool_get_pages() is zero. | ||
| 1770 | * However, if avoid_reserve is specified we still avoid even | ||
| 1771 | * the subpool reservations. | ||
| 1772 | */ | ||
| 1773 | if (avoid_reserve) | ||
| 1774 | gbl_chg = 1; | ||
| 1775 | } | ||
| 1554 | 1776 | ||
| 1555 | ret = hugetlb_cgroup_charge_cgroup(idx, pages_per_huge_page(h), &h_cg); | 1777 | ret = hugetlb_cgroup_charge_cgroup(idx, pages_per_huge_page(h), &h_cg); |
| 1556 | if (ret) | 1778 | if (ret) |
| 1557 | goto out_subpool_put; | 1779 | goto out_subpool_put; |
| 1558 | 1780 | ||
| 1559 | spin_lock(&hugetlb_lock); | 1781 | spin_lock(&hugetlb_lock); |
| 1560 | page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve, chg); | 1782 | /* |
| 1783 | * gbl_chg is passed to indicate whether or not a page must be taken | ||
| 1784 | * from the global free pool (global change). gbl_chg == 0 indicates | ||
| 1785 | * a reservation exists for the allocation. | ||
| 1786 | */ | ||
| 1787 | page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve, gbl_chg); | ||
| 1561 | if (!page) { | 1788 | if (!page) { |
| 1562 | spin_unlock(&hugetlb_lock); | 1789 | spin_unlock(&hugetlb_lock); |
| 1563 | page = alloc_buddy_huge_page(h, NUMA_NO_NODE); | 1790 | page = alloc_buddy_huge_page(h, NUMA_NO_NODE); |
| @@ -1573,8 +1800,8 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma, | |||
| 1573 | 1800 | ||
| 1574 | set_page_private(page, (unsigned long)spool); | 1801 | set_page_private(page, (unsigned long)spool); |
| 1575 | 1802 | ||
| 1576 | commit = vma_commit_reservation(h, vma, addr); | 1803 | map_commit = vma_commit_reservation(h, vma, addr); |
| 1577 | if (unlikely(chg > commit)) { | 1804 | if (unlikely(map_chg > map_commit)) { |
| 1578 | /* | 1805 | /* |
| 1579 | * The page was added to the reservation map between | 1806 | * The page was added to the reservation map between |
| 1580 | * vma_needs_reservation and vma_commit_reservation. | 1807 | * vma_needs_reservation and vma_commit_reservation. |
| @@ -1594,8 +1821,9 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma, | |||
| 1594 | out_uncharge_cgroup: | 1821 | out_uncharge_cgroup: |
| 1595 | hugetlb_cgroup_uncharge_cgroup(idx, pages_per_huge_page(h), h_cg); | 1822 | hugetlb_cgroup_uncharge_cgroup(idx, pages_per_huge_page(h), h_cg); |
| 1596 | out_subpool_put: | 1823 | out_subpool_put: |
| 1597 | if (chg || avoid_reserve) | 1824 | if (map_chg || avoid_reserve) |
| 1598 | hugepage_subpool_put_pages(spool, 1); | 1825 | hugepage_subpool_put_pages(spool, 1); |
| 1826 | vma_end_reservation(h, vma, addr); | ||
| 1599 | return ERR_PTR(-ENOSPC); | 1827 | return ERR_PTR(-ENOSPC); |
| 1600 | } | 1828 | } |
| 1601 | 1829 | ||
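The core of the alloc_huge_page() rework is the split between map_chg (does the reserve map already cover this page?) and gbl_chg (must a page be charged to the global pool?). The toy helper below walks the same decision in isolation; subpool accounting, cgroup charging and error unwinding are left out, and the return value of hugepage_subpool_get_pages() is passed in as a plain parameter for illustration.

	#include <stdio.h>
	#include <stdbool.h>

	/*
	 * Given whether the reserve map already covers the page (map_chg == 0)
	 * and whether the caller asked to avoid reserves, report whether the
	 * allocation must come out of the global free pool.
	 */
	static bool needs_global_page(long map_chg, bool avoid_reserve,
				      long subpool_get_ret)
	{
		long gbl_chg = map_chg;

		if (map_chg || avoid_reserve) {
			gbl_chg = subpool_get_ret;	/* 0: subpool reserves covered it */
			if (avoid_reserve)
				gbl_chg = 1;		/* skip even subpool reserves */
		}
		return gbl_chg != 0;
	}

	int main(void)
	{
		printf("reserved in map:           %d\n", needs_global_page(0, false, 0));
		printf("no map resv, subpool resv: %d\n", needs_global_page(1, false, 0));
		printf("no map resv, no subpool:   %d\n", needs_global_page(1, false, 1));
		printf("avoid_reserve:             %d\n", needs_global_page(0, true, 1));
		return 0;
	}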
| @@ -2311,7 +2539,7 @@ static void __exit hugetlb_exit(void) | |||
| 2311 | } | 2539 | } |
| 2312 | 2540 | ||
| 2313 | kobject_put(hugepages_kobj); | 2541 | kobject_put(hugepages_kobj); |
| 2314 | kfree(htlb_fault_mutex_table); | 2542 | kfree(hugetlb_fault_mutex_table); |
| 2315 | } | 2543 | } |
| 2316 | module_exit(hugetlb_exit); | 2544 | module_exit(hugetlb_exit); |
| 2317 | 2545 | ||
| @@ -2344,12 +2572,12 @@ static int __init hugetlb_init(void) | |||
| 2344 | #else | 2572 | #else |
| 2345 | num_fault_mutexes = 1; | 2573 | num_fault_mutexes = 1; |
| 2346 | #endif | 2574 | #endif |
| 2347 | htlb_fault_mutex_table = | 2575 | hugetlb_fault_mutex_table = |
| 2348 | kmalloc(sizeof(struct mutex) * num_fault_mutexes, GFP_KERNEL); | 2576 | kmalloc(sizeof(struct mutex) * num_fault_mutexes, GFP_KERNEL); |
| 2349 | BUG_ON(!htlb_fault_mutex_table); | 2577 | BUG_ON(!hugetlb_fault_mutex_table); |
| 2350 | 2578 | ||
| 2351 | for (i = 0; i < num_fault_mutexes; i++) | 2579 | for (i = 0; i < num_fault_mutexes; i++) |
| 2352 | mutex_init(&htlb_fault_mutex_table[i]); | 2580 | mutex_init(&hugetlb_fault_mutex_table[i]); |
| 2353 | return 0; | 2581 | return 0; |
| 2354 | } | 2582 | } |
| 2355 | module_init(hugetlb_init); | 2583 | module_init(hugetlb_init); |
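hugetlb_init() sizes a table of fault mutexes and initializes every slot; later, faults that hash to the same slot serialize on the same mutex. Below is a userspace sketch of such a hashed lock table, with an invented hash in place of the kernel's jhash-based hugetlb_fault_mutex_hash() and a fixed table size where the kernel scales with the number of CPUs.

	#include <pthread.h>
	#include <stdlib.h>

	#define NUM_FAULT_MUTEXES 8	/* the kernel scales this with the CPU count */

	static pthread_mutex_t *fault_mutex_table;

	static unsigned int fault_mutex_hash(unsigned long mapping, unsigned long idx)
	{
		/* Invented hash; the real routine hashes mm, mapping, idx and address. */
		return (unsigned int)((mapping ^ idx) % NUM_FAULT_MUTEXES);
	}

	int main(void)
	{
		fault_mutex_table = malloc(sizeof(*fault_mutex_table) * NUM_FAULT_MUTEXES);
		if (!fault_mutex_table)
			return 1;
		for (int i = 0; i < NUM_FAULT_MUTEXES; i++)
			pthread_mutex_init(&fault_mutex_table[i], NULL);

		unsigned int hash = fault_mutex_hash(0xabcd, 42);

		pthread_mutex_lock(&fault_mutex_table[hash]);
		/* ... handle the "fault" for (mapping, idx) ... */
		pthread_mutex_unlock(&fault_mutex_table[hash]);

		free(fault_mutex_table);
		return 0;
	}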
| @@ -3147,6 +3375,23 @@ static bool hugetlbfs_pagecache_present(struct hstate *h, | |||
| 3147 | return page != NULL; | 3375 | return page != NULL; |
| 3148 | } | 3376 | } |
| 3149 | 3377 | ||
| 3378 | int huge_add_to_page_cache(struct page *page, struct address_space *mapping, | ||
| 3379 | pgoff_t idx) | ||
| 3380 | { | ||
| 3381 | struct inode *inode = mapping->host; | ||
| 3382 | struct hstate *h = hstate_inode(inode); | ||
| 3383 | int err = add_to_page_cache(page, mapping, idx, GFP_KERNEL); | ||
| 3384 | |||
| 3385 | if (err) | ||
| 3386 | return err; | ||
| 3387 | ClearPagePrivate(page); | ||
| 3388 | |||
| 3389 | spin_lock(&inode->i_lock); | ||
| 3390 | inode->i_blocks += blocks_per_huge_page(h); | ||
| 3391 | spin_unlock(&inode->i_lock); | ||
| 3392 | return 0; | ||
| 3393 | } | ||
| 3394 | |||
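huge_add_to_page_cache() also charges the page against inode->i_blocks, which counts 512-byte units, so each huge page adds huge_page_size / 512 blocks. A quick arithmetic check, assuming 2 MB huge pages:

	#include <stdio.h>

	int main(void)
	{
		unsigned long huge_page_size = 2UL << 20;	/* assume 2 MB huge pages */
		unsigned long sector_size = 512;		/* i_blocks unit */

		/* blocks_per_huge_page(h) contributes huge page size / 512 */
		printf("i_blocks added per huge page: %lu\n",
		       huge_page_size / sector_size);		/* 4096 */
		return 0;
	}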
| 3150 | static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma, | 3395 | static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma, |
| 3151 | struct address_space *mapping, pgoff_t idx, | 3396 | struct address_space *mapping, pgoff_t idx, |
| 3152 | unsigned long address, pte_t *ptep, unsigned int flags) | 3397 | unsigned long address, pte_t *ptep, unsigned int flags) |
| @@ -3194,21 +3439,13 @@ retry: | |||
| 3194 | set_page_huge_active(page); | 3439 | set_page_huge_active(page); |
| 3195 | 3440 | ||
| 3196 | if (vma->vm_flags & VM_MAYSHARE) { | 3441 | if (vma->vm_flags & VM_MAYSHARE) { |
| 3197 | int err; | 3442 | int err = huge_add_to_page_cache(page, mapping, idx); |
| 3198 | struct inode *inode = mapping->host; | ||
| 3199 | |||
| 3200 | err = add_to_page_cache(page, mapping, idx, GFP_KERNEL); | ||
| 3201 | if (err) { | 3443 | if (err) { |
| 3202 | put_page(page); | 3444 | put_page(page); |
| 3203 | if (err == -EEXIST) | 3445 | if (err == -EEXIST) |
| 3204 | goto retry; | 3446 | goto retry; |
| 3205 | goto out; | 3447 | goto out; |
| 3206 | } | 3448 | } |
| 3207 | ClearPagePrivate(page); | ||
| 3208 | |||
| 3209 | spin_lock(&inode->i_lock); | ||
| 3210 | inode->i_blocks += blocks_per_huge_page(h); | ||
| 3211 | spin_unlock(&inode->i_lock); | ||
| 3212 | } else { | 3449 | } else { |
| 3213 | lock_page(page); | 3450 | lock_page(page); |
| 3214 | if (unlikely(anon_vma_prepare(vma))) { | 3451 | if (unlikely(anon_vma_prepare(vma))) { |
| @@ -3236,11 +3473,14 @@ retry: | |||
| 3236 | * any allocations necessary to record that reservation occur outside | 3473 | * any allocations necessary to record that reservation occur outside |
| 3237 | * the spinlock. | 3474 | * the spinlock. |
| 3238 | */ | 3475 | */ |
| 3239 | if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) | 3476 | if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) { |
| 3240 | if (vma_needs_reservation(h, vma, address) < 0) { | 3477 | if (vma_needs_reservation(h, vma, address) < 0) { |
| 3241 | ret = VM_FAULT_OOM; | 3478 | ret = VM_FAULT_OOM; |
| 3242 | goto backout_unlocked; | 3479 | goto backout_unlocked; |
| 3243 | } | 3480 | } |
| 3481 | /* Just decrements count, does not deallocate */ | ||
| 3482 | vma_end_reservation(h, vma, address); | ||
| 3483 | } | ||
| 3244 | 3484 | ||
| 3245 | ptl = huge_pte_lockptr(h, mm, ptep); | 3485 | ptl = huge_pte_lockptr(h, mm, ptep); |
| 3246 | spin_lock(ptl); | 3486 | spin_lock(ptl); |
| @@ -3280,7 +3520,7 @@ backout_unlocked: | |||
| 3280 | } | 3520 | } |
| 3281 | 3521 | ||
| 3282 | #ifdef CONFIG_SMP | 3522 | #ifdef CONFIG_SMP |
| 3283 | static u32 fault_mutex_hash(struct hstate *h, struct mm_struct *mm, | 3523 | u32 hugetlb_fault_mutex_hash(struct hstate *h, struct mm_struct *mm, |
| 3284 | struct vm_area_struct *vma, | 3524 | struct vm_area_struct *vma, |
| 3285 | struct address_space *mapping, | 3525 | struct address_space *mapping, |
| 3286 | pgoff_t idx, unsigned long address) | 3526 | pgoff_t idx, unsigned long address) |
| @@ -3305,7 +3545,7 @@ static u32 fault_mutex_hash(struct hstate *h, struct mm_struct *mm, | |||
| 3305 | * For uniprocessor systems we always use a single mutex, so just | 3545 | * For uniprocessor systems we always use a single mutex, so just |
| 3306 | * return 0 and avoid the hashing overhead. | 3546 | * return 0 and avoid the hashing overhead. |
| 3307 | */ | 3547 | */ |
| 3308 | static u32 fault_mutex_hash(struct hstate *h, struct mm_struct *mm, | 3548 | u32 hugetlb_fault_mutex_hash(struct hstate *h, struct mm_struct *mm, |
| 3309 | struct vm_area_struct *vma, | 3549 | struct vm_area_struct *vma, |
| 3310 | struct address_space *mapping, | 3550 | struct address_space *mapping, |
| 3311 | pgoff_t idx, unsigned long address) | 3551 | pgoff_t idx, unsigned long address) |
| @@ -3353,8 +3593,8 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 3353 | * get spurious allocation failures if two CPUs race to instantiate | 3593 | * get spurious allocation failures if two CPUs race to instantiate |
| 3354 | * the same page in the page cache. | 3594 | * the same page in the page cache. |
| 3355 | */ | 3595 | */ |
| 3356 | hash = fault_mutex_hash(h, mm, vma, mapping, idx, address); | 3596 | hash = hugetlb_fault_mutex_hash(h, mm, vma, mapping, idx, address); |
| 3357 | mutex_lock(&htlb_fault_mutex_table[hash]); | 3597 | mutex_lock(&hugetlb_fault_mutex_table[hash]); |
| 3358 | 3598 | ||
| 3359 | entry = huge_ptep_get(ptep); | 3599 | entry = huge_ptep_get(ptep); |
| 3360 | if (huge_pte_none(entry)) { | 3600 | if (huge_pte_none(entry)) { |
| @@ -3387,6 +3627,8 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 3387 | ret = VM_FAULT_OOM; | 3627 | ret = VM_FAULT_OOM; |
| 3388 | goto out_mutex; | 3628 | goto out_mutex; |
| 3389 | } | 3629 | } |
| 3630 | /* Just decrements count, does not deallocate */ | ||
| 3631 | vma_end_reservation(h, vma, address); | ||
| 3390 | 3632 | ||
| 3391 | if (!(vma->vm_flags & VM_MAYSHARE)) | 3633 | if (!(vma->vm_flags & VM_MAYSHARE)) |
| 3392 | pagecache_page = hugetlbfs_pagecache_page(h, | 3634 | pagecache_page = hugetlbfs_pagecache_page(h, |
| @@ -3437,7 +3679,7 @@ out_ptl: | |||
| 3437 | put_page(pagecache_page); | 3679 | put_page(pagecache_page); |
| 3438 | } | 3680 | } |
| 3439 | out_mutex: | 3681 | out_mutex: |
| 3440 | mutex_unlock(&htlb_fault_mutex_table[hash]); | 3682 | mutex_unlock(&hugetlb_fault_mutex_table[hash]); |
| 3441 | /* | 3683 | /* |
| 3442 | * Generally it's safe to hold refcount during waiting page lock. But | 3684 | * Generally it's safe to hold refcount during waiting page lock. But |
| 3443 | * here we just wait to defer the next page fault to avoid busy loop and | 3685 | * here we just wait to defer the next page fault to avoid busy loop and |
| @@ -3726,12 +3968,15 @@ int hugetlb_reserve_pages(struct inode *inode, | |||
| 3726 | } | 3968 | } |
| 3727 | return 0; | 3969 | return 0; |
| 3728 | out_err: | 3970 | out_err: |
| 3971 | if (!vma || vma->vm_flags & VM_MAYSHARE) | ||
| 3972 | region_abort(resv_map, from, to); | ||
| 3729 | if (vma && is_vma_resv_set(vma, HPAGE_RESV_OWNER)) | 3973 | if (vma && is_vma_resv_set(vma, HPAGE_RESV_OWNER)) |
| 3730 | kref_put(&resv_map->refs, resv_map_release); | 3974 | kref_put(&resv_map->refs, resv_map_release); |
| 3731 | return ret; | 3975 | return ret; |
| 3732 | } | 3976 | } |
| 3733 | 3977 | ||
| 3734 | void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed) | 3978 | long hugetlb_unreserve_pages(struct inode *inode, long start, long end, |
| 3979 | long freed) | ||
| 3735 | { | 3980 | { |
| 3736 | struct hstate *h = hstate_inode(inode); | 3981 | struct hstate *h = hstate_inode(inode); |
| 3737 | struct resv_map *resv_map = inode_resv_map(inode); | 3982 | struct resv_map *resv_map = inode_resv_map(inode); |
| @@ -3739,8 +3984,17 @@ void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed) | |||
| 3739 | struct hugepage_subpool *spool = subpool_inode(inode); | 3984 | struct hugepage_subpool *spool = subpool_inode(inode); |
| 3740 | long gbl_reserve; | 3985 | long gbl_reserve; |
| 3741 | 3986 | ||
| 3742 | if (resv_map) | 3987 | if (resv_map) { |
| 3743 | chg = region_truncate(resv_map, offset); | 3988 | chg = region_del(resv_map, start, end); |
| 3989 | /* | ||
| 3990 | * region_del() can fail in the rare case where a region | ||
| 3991 | * must be split and another region descriptor can not be | ||
| 3992 | * allocated. If end == LONG_MAX, it will not fail. | ||
| 3993 | */ | ||
| 3994 | if (chg < 0) | ||
| 3995 | return chg; | ||
| 3996 | } | ||
| 3997 | |||
| 3744 | spin_lock(&inode->i_lock); | 3998 | spin_lock(&inode->i_lock); |
| 3745 | inode->i_blocks -= (blocks_per_huge_page(h) * freed); | 3999 | inode->i_blocks -= (blocks_per_huge_page(h) * freed); |
| 3746 | spin_unlock(&inode->i_lock); | 4000 | spin_unlock(&inode->i_lock); |
| @@ -3751,6 +4005,8 @@ void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed) | |||
| 3751 | */ | 4005 | */ |
| 3752 | gbl_reserve = hugepage_subpool_put_pages(spool, (chg - freed)); | 4006 | gbl_reserve = hugepage_subpool_put_pages(spool, (chg - freed)); |
| 3753 | hugetlb_acct_memory(h, -gbl_reserve); | 4007 | hugetlb_acct_memory(h, -gbl_reserve); |
| 4008 | |||
| 4009 | return 0; | ||
| 3754 | } | 4010 | } |
| 3755 | 4011 | ||
| 3756 | #ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE | 4012 | #ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE |
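In hugetlb_unreserve_pages(), chg is what region_del() removed from the reserve map and freed is how many huge pages the caller already gave back, so chg - freed pages are returned to the subpool, which in turn decides how much of that comes off the global reserve. A trivial worked example with invented numbers:

	#include <stdio.h>

	int main(void)
	{
		/* Invented numbers for illustration only. */
		long chg = 10;	/* reserve-map entries dropped by region_del()      */
		long freed = 4;	/* huge pages the truncate/hole punch already freed */

		if (chg < 0) {
			/* region_del() could not split a region: the new return
			 * value lets the caller fix up the counts instead.       */
			return 1;
		}

		/* Pages handed back to the subpool; the subpool then reports how
		 * many of them should also come off the global reserve.          */
		printf("hugepage_subpool_put_pages(spool, %ld)\n", chg - freed);
		return 0;
	}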
diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c index bf73ac17dad4..aeba0edd6e44 100644 --- a/mm/hwpoison-inject.c +++ b/mm/hwpoison-inject.c | |||
| @@ -58,7 +58,7 @@ inject: | |||
| 58 | pr_info("Injecting memory failure at pfn %#lx\n", pfn); | 58 | pr_info("Injecting memory failure at pfn %#lx\n", pfn); |
| 59 | return memory_failure(pfn, 18, MF_COUNT_INCREASED); | 59 | return memory_failure(pfn, 18, MF_COUNT_INCREASED); |
| 60 | put_out: | 60 | put_out: |
| 61 | put_page(p); | 61 | put_hwpoison_page(p); |
| 62 | return 0; | 62 | return 0; |
| 63 | } | 63 | } |
| 64 | 64 | ||
diff --git a/mm/internal.h b/mm/internal.h index 1195dd2d6a2b..bc0fa9a69e46 100644 --- a/mm/internal.h +++ b/mm/internal.h | |||
| @@ -182,6 +182,7 @@ struct compact_control { | |||
| 182 | unsigned long nr_migratepages; /* Number of pages to migrate */ | 182 | unsigned long nr_migratepages; /* Number of pages to migrate */ |
| 183 | unsigned long free_pfn; /* isolate_freepages search base */ | 183 | unsigned long free_pfn; /* isolate_freepages search base */ |
| 184 | unsigned long migrate_pfn; /* isolate_migratepages search base */ | 184 | unsigned long migrate_pfn; /* isolate_migratepages search base */ |
| 185 | unsigned long last_migrated_pfn;/* Not yet flushed page being freed */ | ||
| 185 | enum migrate_mode mode; /* Async or sync migration mode */ | 186 | enum migrate_mode mode; /* Async or sync migration mode */ |
| 186 | bool ignore_skip_hint; /* Scan blocks even if marked skip */ | 187 | bool ignore_skip_hint; /* Scan blocks even if marked skip */ |
| 187 | int order; /* order a direct compactor needs */ | 188 | int order; /* order a direct compactor needs */ |
diff --git a/mm/kmemleak.c b/mm/kmemleak.c index cf79f110157c..f532f6a37b55 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c | |||
| @@ -838,6 +838,7 @@ static void __init log_early(int op_type, const void *ptr, size_t size, | |||
| 838 | } | 838 | } |
| 839 | 839 | ||
| 840 | if (crt_early_log >= ARRAY_SIZE(early_log)) { | 840 | if (crt_early_log >= ARRAY_SIZE(early_log)) { |
| 841 | crt_early_log++; | ||
| 841 | kmemleak_disable(); | 842 | kmemleak_disable(); |
| 842 | return; | 843 | return; |
| 843 | } | 844 | } |
| @@ -1882,7 +1883,7 @@ void __init kmemleak_init(void) | |||
| 1882 | object_cache = KMEM_CACHE(kmemleak_object, SLAB_NOLEAKTRACE); | 1883 | object_cache = KMEM_CACHE(kmemleak_object, SLAB_NOLEAKTRACE); |
| 1883 | scan_area_cache = KMEM_CACHE(kmemleak_scan_area, SLAB_NOLEAKTRACE); | 1884 | scan_area_cache = KMEM_CACHE(kmemleak_scan_area, SLAB_NOLEAKTRACE); |
| 1884 | 1885 | ||
| 1885 | if (crt_early_log >= ARRAY_SIZE(early_log)) | 1886 | if (crt_early_log > ARRAY_SIZE(early_log)) |
| 1886 | pr_warning("Early log buffer exceeded (%d), please increase " | 1887 | pr_warning("Early log buffer exceeded (%d), please increase " |
| 1887 | "DEBUG_KMEMLEAK_EARLY_LOG_SIZE\n", crt_early_log); | 1888 | "DEBUG_KMEMLEAK_EARLY_LOG_SIZE\n", crt_early_log); |
| 1888 | 1889 | ||
diff --git a/mm/list_lru.c b/mm/list_lru.c index 909eca2c820e..e1da19fac1b3 100644 --- a/mm/list_lru.c +++ b/mm/list_lru.c | |||
| @@ -99,8 +99,8 @@ bool list_lru_add(struct list_lru *lru, struct list_head *item) | |||
| 99 | struct list_lru_one *l; | 99 | struct list_lru_one *l; |
| 100 | 100 | ||
| 101 | spin_lock(&nlru->lock); | 101 | spin_lock(&nlru->lock); |
| 102 | l = list_lru_from_kmem(nlru, item); | ||
| 103 | if (list_empty(item)) { | 102 | if (list_empty(item)) { |
| 103 | l = list_lru_from_kmem(nlru, item); | ||
| 104 | list_add_tail(item, &l->list); | 104 | list_add_tail(item, &l->list); |
| 105 | l->nr_items++; | 105 | l->nr_items++; |
| 106 | spin_unlock(&nlru->lock); | 106 | spin_unlock(&nlru->lock); |
| @@ -118,8 +118,8 @@ bool list_lru_del(struct list_lru *lru, struct list_head *item) | |||
| 118 | struct list_lru_one *l; | 118 | struct list_lru_one *l; |
| 119 | 119 | ||
| 120 | spin_lock(&nlru->lock); | 120 | spin_lock(&nlru->lock); |
| 121 | l = list_lru_from_kmem(nlru, item); | ||
| 122 | if (!list_empty(item)) { | 121 | if (!list_empty(item)) { |
| 122 | l = list_lru_from_kmem(nlru, item); | ||
| 123 | list_del_init(item); | 123 | list_del_init(item); |
| 124 | l->nr_items--; | 124 | l->nr_items--; |
| 125 | spin_unlock(&nlru->lock); | 125 | spin_unlock(&nlru->lock); |
diff --git a/mm/madvise.c b/mm/madvise.c index ce3a4222c7e7..c889fcbb530e 100644 --- a/mm/madvise.c +++ b/mm/madvise.c | |||
| @@ -301,7 +301,7 @@ static long madvise_remove(struct vm_area_struct *vma, | |||
| 301 | 301 | ||
| 302 | *prev = NULL; /* tell sys_madvise we drop mmap_sem */ | 302 | *prev = NULL; /* tell sys_madvise we drop mmap_sem */ |
| 303 | 303 | ||
| 304 | if (vma->vm_flags & (VM_LOCKED | VM_HUGETLB)) | 304 | if (vma->vm_flags & VM_LOCKED) |
| 305 | return -EINVAL; | 305 | return -EINVAL; |
| 306 | 306 | ||
| 307 | f = vma->vm_file; | 307 | f = vma->vm_file; |
diff --git a/mm/memblock.c b/mm/memblock.c index 95ce68c6da8a..1c7b647e5897 100644 --- a/mm/memblock.c +++ b/mm/memblock.c | |||
| @@ -91,7 +91,7 @@ static unsigned long __init_memblock memblock_addrs_overlap(phys_addr_t base1, p | |||
| 91 | return ((base1 < (base2 + size2)) && (base2 < (base1 + size1))); | 91 | return ((base1 < (base2 + size2)) && (base2 < (base1 + size1))); |
| 92 | } | 92 | } |
| 93 | 93 | ||
| 94 | static long __init_memblock memblock_overlaps_region(struct memblock_type *type, | 94 | bool __init_memblock memblock_overlaps_region(struct memblock_type *type, |
| 95 | phys_addr_t base, phys_addr_t size) | 95 | phys_addr_t base, phys_addr_t size) |
| 96 | { | 96 | { |
| 97 | unsigned long i; | 97 | unsigned long i; |
| @@ -103,7 +103,7 @@ static long __init_memblock memblock_overlaps_region(struct memblock_type *type, | |||
| 103 | break; | 103 | break; |
| 104 | } | 104 | } |
| 105 | 105 | ||
| 106 | return (i < type->cnt) ? i : -1; | 106 | return i < type->cnt; |
| 107 | } | 107 | } |
| 108 | 108 | ||
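The memblock change turns the old index-or-negative return of memblock_overlaps_region() into a plain bool; the underlying test is the usual half-open interval overlap check already used by memblock_addrs_overlap() above. A standalone version of that check, with hypothetical names:

	#include <stdbool.h>
	#include <stdio.h>

	/* Two half-open ranges [base1, base1+size1) and [base2, base2+size2)
	 * overlap iff each one starts before the other ends.                 */
	static bool ranges_overlap(unsigned long base1, unsigned long size1,
				   unsigned long base2, unsigned long size2)
	{
		return base1 < base2 + size2 && base2 < base1 + size1;
	}

	int main(void)
	{
		printf("%d\n", ranges_overlap(0x1000, 0x1000, 0x1800, 0x100)); /* 1 */
		printf("%d\n", ranges_overlap(0x1000, 0x1000, 0x2000, 0x100)); /* 0 */
		return 0;
	}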
| 109 | /* | 109 | /* |
| @@ -569,6 +569,7 @@ repeat: | |||
| 569 | #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP | 569 | #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP |
| 570 | WARN_ON(nid != memblock_get_region_node(rgn)); | 570 | WARN_ON(nid != memblock_get_region_node(rgn)); |
| 571 | #endif | 571 | #endif |
| 572 | WARN_ON(flags != rgn->flags); | ||
| 572 | nr_new++; | 573 | nr_new++; |
| 573 | if (insert) | 574 | if (insert) |
| 574 | memblock_insert_region(type, i++, base, | 575 | memblock_insert_region(type, i++, base, |
| @@ -614,14 +615,14 @@ static int __init_memblock memblock_add_region(phys_addr_t base, | |||
| 614 | int nid, | 615 | int nid, |
| 615 | unsigned long flags) | 616 | unsigned long flags) |
| 616 | { | 617 | { |
| 617 | struct memblock_type *_rgn = &memblock.memory; | 618 | struct memblock_type *type = &memblock.memory; |
| 618 | 619 | ||
| 619 | memblock_dbg("memblock_add: [%#016llx-%#016llx] flags %#02lx %pF\n", | 620 | memblock_dbg("memblock_add: [%#016llx-%#016llx] flags %#02lx %pF\n", |
| 620 | (unsigned long long)base, | 621 | (unsigned long long)base, |
| 621 | (unsigned long long)base + size - 1, | 622 | (unsigned long long)base + size - 1, |
| 622 | flags, (void *)_RET_IP_); | 623 | flags, (void *)_RET_IP_); |
| 623 | 624 | ||
| 624 | return memblock_add_range(_rgn, base, size, nid, flags); | 625 | return memblock_add_range(type, base, size, nid, flags); |
| 625 | } | 626 | } |
| 626 | 627 | ||
| 627 | int __init_memblock memblock_add(phys_addr_t base, phys_addr_t size) | 628 | int __init_memblock memblock_add(phys_addr_t base, phys_addr_t size) |
| @@ -761,7 +762,7 @@ int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size) | |||
| 761 | * | 762 | * |
| 762 | * This function isolates region [@base, @base + @size), and sets/clears flag | 763 | * This function isolates region [@base, @base + @size), and sets/clears flag |
| 763 | * | 764 | * |
| 764 | * Return 0 on succees, -errno on failure. | 765 | * Return 0 on success, -errno on failure. |
| 765 | */ | 766 | */ |
| 766 | static int __init_memblock memblock_setclr_flag(phys_addr_t base, | 767 | static int __init_memblock memblock_setclr_flag(phys_addr_t base, |
| 767 | phys_addr_t size, int set, int flag) | 768 | phys_addr_t size, int set, int flag) |
| @@ -788,7 +789,7 @@ static int __init_memblock memblock_setclr_flag(phys_addr_t base, | |||
| 788 | * @base: the base phys addr of the region | 789 | * @base: the base phys addr of the region |
| 789 | * @size: the size of the region | 790 | * @size: the size of the region |
| 790 | * | 791 | * |
| 791 | * Return 0 on succees, -errno on failure. | 792 | * Return 0 on success, -errno on failure. |
| 792 | */ | 793 | */ |
| 793 | int __init_memblock memblock_mark_hotplug(phys_addr_t base, phys_addr_t size) | 794 | int __init_memblock memblock_mark_hotplug(phys_addr_t base, phys_addr_t size) |
| 794 | { | 795 | { |
| @@ -800,7 +801,7 @@ int __init_memblock memblock_mark_hotplug(phys_addr_t base, phys_addr_t size) | |||
| 800 | * @base: the base phys addr of the region | 801 | * @base: the base phys addr of the region |
| 801 | * @size: the size of the region | 802 | * @size: the size of the region |
| 802 | * | 803 | * |
| 803 | * Return 0 on succees, -errno on failure. | 804 | * Return 0 on success, -errno on failure. |
| 804 | */ | 805 | */ |
| 805 | int __init_memblock memblock_clear_hotplug(phys_addr_t base, phys_addr_t size) | 806 | int __init_memblock memblock_clear_hotplug(phys_addr_t base, phys_addr_t size) |
| 806 | { | 807 | { |
| @@ -812,7 +813,7 @@ int __init_memblock memblock_clear_hotplug(phys_addr_t base, phys_addr_t size) | |||
| 812 | * @base: the base phys addr of the region | 813 | * @base: the base phys addr of the region |
| 813 | * @size: the size of the region | 814 | * @size: the size of the region |
| 814 | * | 815 | * |
| 815 | * Return 0 on succees, -errno on failure. | 816 | * Return 0 on success, -errno on failure. |
| 816 | */ | 817 | */ |
| 817 | int __init_memblock memblock_mark_mirror(phys_addr_t base, phys_addr_t size) | 818 | int __init_memblock memblock_mark_mirror(phys_addr_t base, phys_addr_t size) |
| 818 | { | 819 | { |
| @@ -834,10 +835,10 @@ void __init_memblock __next_reserved_mem_region(u64 *idx, | |||
| 834 | phys_addr_t *out_start, | 835 | phys_addr_t *out_start, |
| 835 | phys_addr_t *out_end) | 836 | phys_addr_t *out_end) |
| 836 | { | 837 | { |
| 837 | struct memblock_type *rsv = &memblock.reserved; | 838 | struct memblock_type *type = &memblock.reserved; |
| 838 | 839 | ||
| 839 | if (*idx >= 0 && *idx < rsv->cnt) { | 840 | if (*idx >= 0 && *idx < type->cnt) { |
| 840 | struct memblock_region *r = &rsv->regions[*idx]; | 841 | struct memblock_region *r = &type->regions[*idx]; |
| 841 | phys_addr_t base = r->base; | 842 | phys_addr_t base = r->base; |
| 842 | phys_addr_t size = r->size; | 843 | phys_addr_t size = r->size; |
| 843 | 844 | ||
| @@ -975,7 +976,7 @@ void __init_memblock __next_mem_range(u64 *idx, int nid, ulong flags, | |||
| 975 | * in type_b. | 976 | * in type_b. |
| 976 | * | 977 | * |
| 977 | * @idx: pointer to u64 loop variable | 978 | * @idx: pointer to u64 loop variable |
| 978 | * @nid: nid: node selector, %NUMA_NO_NODE for all nodes | 979 | * @nid: node selector, %NUMA_NO_NODE for all nodes |
| 979 | * @flags: pick from blocks based on memory attributes | 980 | * @flags: pick from blocks based on memory attributes |
| 980 | * @type_a: pointer to memblock_type from where the range is taken | 981 | * @type_a: pointer to memblock_type from where the range is taken |
| 981 | * @type_b: pointer to memblock_type which excludes memory from being taken | 982 | * @type_b: pointer to memblock_type which excludes memory from being taken |
| @@ -1565,12 +1566,12 @@ int __init_memblock memblock_is_region_memory(phys_addr_t base, phys_addr_t size | |||
| 1565 | * Check if the region [@base, @base+@size) intersects a reserved memory block. | 1566 | * Check if the region [@base, @base+@size) intersects a reserved memory block. |
| 1566 | * | 1567 | * |
| 1567 | * RETURNS: | 1568 | * RETURNS: |
| 1568 | * 0 if false, non-zero if true | 1569 | * True if they intersect, false if not. |
| 1569 | */ | 1570 | */ |
| 1570 | int __init_memblock memblock_is_region_reserved(phys_addr_t base, phys_addr_t size) | 1571 | bool __init_memblock memblock_is_region_reserved(phys_addr_t base, phys_addr_t size) |
| 1571 | { | 1572 | { |
| 1572 | memblock_cap_size(base, &size); | 1573 | memblock_cap_size(base, &size); |
| 1573 | return memblock_overlaps_region(&memblock.reserved, base, size) >= 0; | 1574 | return memblock_overlaps_region(&memblock.reserved, base, size); |
| 1574 | } | 1575 | } |
| 1575 | 1576 | ||
| 1576 | void __init_memblock memblock_trim_memory(phys_addr_t align) | 1577 | void __init_memblock memblock_trim_memory(phys_addr_t align) |
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 1af057575ce9..1742a2db89c7 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c | |||
| @@ -111,56 +111,10 @@ static const char * const mem_cgroup_lru_names[] = { | |||
| 111 | "unevictable", | 111 | "unevictable", |
| 112 | }; | 112 | }; |
| 113 | 113 | ||
| 114 | /* | ||
| 115 | * Per memcg event counter is incremented at every pagein/pageout. With THP, | ||
| 116 | * it will be incremated by the number of pages. This counter is used for | ||
| 117 | * for trigger some periodic events. This is straightforward and better | ||
| 118 | * than using jiffies etc. to handle periodic memcg event. | ||
| 119 | */ | ||
| 120 | enum mem_cgroup_events_target { | ||
| 121 | MEM_CGROUP_TARGET_THRESH, | ||
| 122 | MEM_CGROUP_TARGET_SOFTLIMIT, | ||
| 123 | MEM_CGROUP_TARGET_NUMAINFO, | ||
| 124 | MEM_CGROUP_NTARGETS, | ||
| 125 | }; | ||
| 126 | #define THRESHOLDS_EVENTS_TARGET 128 | 114 | #define THRESHOLDS_EVENTS_TARGET 128 |
| 127 | #define SOFTLIMIT_EVENTS_TARGET 1024 | 115 | #define SOFTLIMIT_EVENTS_TARGET 1024 |
| 128 | #define NUMAINFO_EVENTS_TARGET 1024 | 116 | #define NUMAINFO_EVENTS_TARGET 1024 |
| 129 | 117 | ||
| 130 | struct mem_cgroup_stat_cpu { | ||
| 131 | long count[MEM_CGROUP_STAT_NSTATS]; | ||
| 132 | unsigned long events[MEMCG_NR_EVENTS]; | ||
| 133 | unsigned long nr_page_events; | ||
| 134 | unsigned long targets[MEM_CGROUP_NTARGETS]; | ||
| 135 | }; | ||
| 136 | |||
| 137 | struct reclaim_iter { | ||
| 138 | struct mem_cgroup *position; | ||
| 139 | /* scan generation, increased every round-trip */ | ||
| 140 | unsigned int generation; | ||
| 141 | }; | ||
| 142 | |||
| 143 | /* | ||
| 144 | * per-zone information in memory controller. | ||
| 145 | */ | ||
| 146 | struct mem_cgroup_per_zone { | ||
| 147 | struct lruvec lruvec; | ||
| 148 | unsigned long lru_size[NR_LRU_LISTS]; | ||
| 149 | |||
| 150 | struct reclaim_iter iter[DEF_PRIORITY + 1]; | ||
| 151 | |||
| 152 | struct rb_node tree_node; /* RB tree node */ | ||
| 153 | unsigned long usage_in_excess;/* Set to the value by which */ | ||
| 154 | /* the soft limit is exceeded*/ | ||
| 155 | bool on_tree; | ||
| 156 | struct mem_cgroup *memcg; /* Back pointer, we cannot */ | ||
| 157 | /* use container_of */ | ||
| 158 | }; | ||
| 159 | |||
| 160 | struct mem_cgroup_per_node { | ||
| 161 | struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES]; | ||
| 162 | }; | ||
| 163 | |||
| 164 | /* | 118 | /* |
| 165 | * Cgroups above their limits are maintained in a RB-Tree, independent of | 119 | * Cgroups above their limits are maintained in a RB-Tree, independent of |
| 166 | * their hierarchy representation | 120 | * their hierarchy representation |
| @@ -181,32 +135,6 @@ struct mem_cgroup_tree { | |||
| 181 | 135 | ||
| 182 | static struct mem_cgroup_tree soft_limit_tree __read_mostly; | 136 | static struct mem_cgroup_tree soft_limit_tree __read_mostly; |
| 183 | 137 | ||
| 184 | struct mem_cgroup_threshold { | ||
| 185 | struct eventfd_ctx *eventfd; | ||
| 186 | unsigned long threshold; | ||
| 187 | }; | ||
| 188 | |||
| 189 | /* For threshold */ | ||
| 190 | struct mem_cgroup_threshold_ary { | ||
| 191 | /* An array index points to threshold just below or equal to usage. */ | ||
| 192 | int current_threshold; | ||
| 193 | /* Size of entries[] */ | ||
| 194 | unsigned int size; | ||
| 195 | /* Array of thresholds */ | ||
| 196 | struct mem_cgroup_threshold entries[0]; | ||
| 197 | }; | ||
| 198 | |||
| 199 | struct mem_cgroup_thresholds { | ||
| 200 | /* Primary thresholds array */ | ||
| 201 | struct mem_cgroup_threshold_ary *primary; | ||
| 202 | /* | ||
| 203 | * Spare threshold array. | ||
| 204 | * This is needed to make mem_cgroup_unregister_event() "never fail". | ||
| 205 | * It must be able to store at least primary->size - 1 entries. | ||
| 206 | */ | ||
| 207 | struct mem_cgroup_threshold_ary *spare; | ||
| 208 | }; | ||
| 209 | |||
| 210 | /* for OOM */ | 138 | /* for OOM */ |
| 211 | struct mem_cgroup_eventfd_list { | 139 | struct mem_cgroup_eventfd_list { |
| 212 | struct list_head list; | 140 | struct list_head list; |
| @@ -256,113 +184,6 @@ struct mem_cgroup_event { | |||
| 256 | static void mem_cgroup_threshold(struct mem_cgroup *memcg); | 184 | static void mem_cgroup_threshold(struct mem_cgroup *memcg); |
| 257 | static void mem_cgroup_oom_notify(struct mem_cgroup *memcg); | 185 | static void mem_cgroup_oom_notify(struct mem_cgroup *memcg); |
| 258 | 186 | ||
| 259 | /* | ||
| 260 | * The memory controller data structure. The memory controller controls both | ||
| 261 | * page cache and RSS per cgroup. We would eventually like to provide | ||
| 262 | * statistics based on the statistics developed by Rik Van Riel for clock-pro, | ||
| 263 | * to help the administrator determine what knobs to tune. | ||
| 264 | */ | ||
| 265 | struct mem_cgroup { | ||
| 266 | struct cgroup_subsys_state css; | ||
| 267 | |||
| 268 | /* Accounted resources */ | ||
| 269 | struct page_counter memory; | ||
| 270 | struct page_counter memsw; | ||
| 271 | struct page_counter kmem; | ||
| 272 | |||
| 273 | /* Normal memory consumption range */ | ||
| 274 | unsigned long low; | ||
| 275 | unsigned long high; | ||
| 276 | |||
| 277 | unsigned long soft_limit; | ||
| 278 | |||
| 279 | /* vmpressure notifications */ | ||
| 280 | struct vmpressure vmpressure; | ||
| 281 | |||
| 282 | /* css_online() has been completed */ | ||
| 283 | int initialized; | ||
| 284 | |||
| 285 | /* | ||
| 286 | * Should the accounting and control be hierarchical, per subtree? | ||
| 287 | */ | ||
| 288 | bool use_hierarchy; | ||
| 289 | |||
| 290 | /* protected by memcg_oom_lock */ | ||
| 291 | bool oom_lock; | ||
| 292 | int under_oom; | ||
| 293 | |||
| 294 | int swappiness; | ||
| 295 | /* OOM-Killer disable */ | ||
| 296 | int oom_kill_disable; | ||
| 297 | |||
| 298 | /* protect arrays of thresholds */ | ||
| 299 | struct mutex thresholds_lock; | ||
| 300 | |||
| 301 | /* thresholds for memory usage. RCU-protected */ | ||
| 302 | struct mem_cgroup_thresholds thresholds; | ||
| 303 | |||
| 304 | /* thresholds for mem+swap usage. RCU-protected */ | ||
| 305 | struct mem_cgroup_thresholds memsw_thresholds; | ||
| 306 | |||
| 307 | /* For oom notifier event fd */ | ||
| 308 | struct list_head oom_notify; | ||
| 309 | |||
| 310 | /* | ||
| 311 | * Should we move charges of a task when a task is moved into this | ||
| 312 | * mem_cgroup ? And what type of charges should we move ? | ||
| 313 | */ | ||
| 314 | unsigned long move_charge_at_immigrate; | ||
| 315 | /* | ||
| 316 | * set > 0 if pages under this cgroup are moving to other cgroup. | ||
| 317 | */ | ||
| 318 | atomic_t moving_account; | ||
| 319 | /* taken only while moving_account > 0 */ | ||
| 320 | spinlock_t move_lock; | ||
| 321 | struct task_struct *move_lock_task; | ||
| 322 | unsigned long move_lock_flags; | ||
| 323 | /* | ||
| 324 | * percpu counter. | ||
| 325 | */ | ||
| 326 | struct mem_cgroup_stat_cpu __percpu *stat; | ||
| 327 | spinlock_t pcp_counter_lock; | ||
| 328 | |||
| 329 | #if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET) | ||
| 330 | struct cg_proto tcp_mem; | ||
| 331 | #endif | ||
| 332 | #if defined(CONFIG_MEMCG_KMEM) | ||
| 333 | /* Index in the kmem_cache->memcg_params.memcg_caches array */ | ||
| 334 | int kmemcg_id; | ||
| 335 | bool kmem_acct_activated; | ||
| 336 | bool kmem_acct_active; | ||
| 337 | #endif | ||
| 338 | |||
| 339 | int last_scanned_node; | ||
| 340 | #if MAX_NUMNODES > 1 | ||
| 341 | nodemask_t scan_nodes; | ||
| 342 | atomic_t numainfo_events; | ||
| 343 | atomic_t numainfo_updating; | ||
| 344 | #endif | ||
| 345 | |||
| 346 | #ifdef CONFIG_CGROUP_WRITEBACK | ||
| 347 | struct list_head cgwb_list; | ||
| 348 | struct wb_domain cgwb_domain; | ||
| 349 | #endif | ||
| 350 | |||
| 351 | /* List of events which userspace want to receive */ | ||
| 352 | struct list_head event_list; | ||
| 353 | spinlock_t event_list_lock; | ||
| 354 | |||
| 355 | struct mem_cgroup_per_node *nodeinfo[0]; | ||
| 356 | /* WARNING: nodeinfo must be the last member here */ | ||
| 357 | }; | ||
| 358 | |||
| 359 | #ifdef CONFIG_MEMCG_KMEM | ||
| 360 | bool memcg_kmem_is_active(struct mem_cgroup *memcg) | ||
| 361 | { | ||
| 362 | return memcg->kmem_acct_active; | ||
| 363 | } | ||
| 364 | #endif | ||
| 365 | |||
| 366 | /* Stuffs for move charges at task migration. */ | 187 | /* Stuffs for move charges at task migration. */ |
| 367 | /* | 188 | /* |
| 368 | * Types of charges to be moved. | 189 | * Types of charges to be moved. |
| @@ -423,11 +244,6 @@ enum res_type { | |||
| 423 | */ | 244 | */ |
| 424 | static DEFINE_MUTEX(memcg_create_mutex); | 245 | static DEFINE_MUTEX(memcg_create_mutex); |
| 425 | 246 | ||
| 426 | struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *s) | ||
| 427 | { | ||
| 428 | return s ? container_of(s, struct mem_cgroup, css) : NULL; | ||
| 429 | } | ||
| 430 | |||
| 431 | /* Some nice accessors for the vmpressure. */ | 247 | /* Some nice accessors for the vmpressure. */ |
| 432 | struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg) | 248 | struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg) |
| 433 | { | 249 | { |
| @@ -499,8 +315,7 @@ void sock_update_memcg(struct sock *sk) | |||
| 499 | rcu_read_lock(); | 315 | rcu_read_lock(); |
| 500 | memcg = mem_cgroup_from_task(current); | 316 | memcg = mem_cgroup_from_task(current); |
| 501 | cg_proto = sk->sk_prot->proto_cgroup(memcg); | 317 | cg_proto = sk->sk_prot->proto_cgroup(memcg); |
| 502 | if (!mem_cgroup_is_root(memcg) && | 318 | if (cg_proto && test_bit(MEMCG_SOCK_ACTIVE, &cg_proto->flags) && |
| 503 | memcg_proto_active(cg_proto) && | ||
| 504 | css_tryget_online(&memcg->css)) { | 319 | css_tryget_online(&memcg->css)) { |
| 505 | sk->sk_cgrp = cg_proto; | 320 | sk->sk_cgrp = cg_proto; |
| 506 | } | 321 | } |
| @@ -593,11 +408,6 @@ mem_cgroup_zone_zoneinfo(struct mem_cgroup *memcg, struct zone *zone) | |||
| 593 | return &memcg->nodeinfo[nid]->zoneinfo[zid]; | 408 | return &memcg->nodeinfo[nid]->zoneinfo[zid]; |
| 594 | } | 409 | } |
| 595 | 410 | ||
| 596 | struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *memcg) | ||
| 597 | { | ||
| 598 | return &memcg->css; | ||
| 599 | } | ||
| 600 | |||
| 601 | /** | 411 | /** |
| 602 | * mem_cgroup_css_from_page - css of the memcg associated with a page | 412 | * mem_cgroup_css_from_page - css of the memcg associated with a page |
| 603 | * @page: page of interest | 413 | * @page: page of interest |
| @@ -876,14 +686,6 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg, | |||
| 876 | __this_cpu_add(memcg->stat->nr_page_events, nr_pages); | 686 | __this_cpu_add(memcg->stat->nr_page_events, nr_pages); |
| 877 | } | 687 | } |
| 878 | 688 | ||
| 879 | unsigned long mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru) | ||
| 880 | { | ||
| 881 | struct mem_cgroup_per_zone *mz; | ||
| 882 | |||
| 883 | mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec); | ||
| 884 | return mz->lru_size[lru]; | ||
| 885 | } | ||
| 886 | |||
| 887 | static unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg, | 689 | static unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg, |
| 888 | int nid, | 690 | int nid, |
| 889 | unsigned int lru_mask) | 691 | unsigned int lru_mask) |
| @@ -986,6 +788,7 @@ struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p) | |||
| 986 | 788 | ||
| 987 | return mem_cgroup_from_css(task_css(p, memory_cgrp_id)); | 789 | return mem_cgroup_from_css(task_css(p, memory_cgrp_id)); |
| 988 | } | 790 | } |
| 791 | EXPORT_SYMBOL(mem_cgroup_from_task); | ||
| 989 | 792 | ||
| 990 | static struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm) | 793 | static struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm) |
| 991 | { | 794 | { |
| @@ -1031,7 +834,7 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, | |||
| 1031 | struct mem_cgroup *prev, | 834 | struct mem_cgroup *prev, |
| 1032 | struct mem_cgroup_reclaim_cookie *reclaim) | 835 | struct mem_cgroup_reclaim_cookie *reclaim) |
| 1033 | { | 836 | { |
| 1034 | struct reclaim_iter *uninitialized_var(iter); | 837 | struct mem_cgroup_reclaim_iter *uninitialized_var(iter); |
| 1035 | struct cgroup_subsys_state *css = NULL; | 838 | struct cgroup_subsys_state *css = NULL; |
| 1036 | struct mem_cgroup *memcg = NULL; | 839 | struct mem_cgroup *memcg = NULL; |
| 1037 | struct mem_cgroup *pos = NULL; | 840 | struct mem_cgroup *pos = NULL; |
| @@ -1173,30 +976,6 @@ void mem_cgroup_iter_break(struct mem_cgroup *root, | |||
| 1173 | iter != NULL; \ | 976 | iter != NULL; \ |
| 1174 | iter = mem_cgroup_iter(NULL, iter, NULL)) | 977 | iter = mem_cgroup_iter(NULL, iter, NULL)) |
| 1175 | 978 | ||
| 1176 | void __mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx) | ||
| 1177 | { | ||
| 1178 | struct mem_cgroup *memcg; | ||
| 1179 | |||
| 1180 | rcu_read_lock(); | ||
| 1181 | memcg = mem_cgroup_from_task(rcu_dereference(mm->owner)); | ||
| 1182 | if (unlikely(!memcg)) | ||
| 1183 | goto out; | ||
| 1184 | |||
| 1185 | switch (idx) { | ||
| 1186 | case PGFAULT: | ||
| 1187 | this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGFAULT]); | ||
| 1188 | break; | ||
| 1189 | case PGMAJFAULT: | ||
| 1190 | this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGMAJFAULT]); | ||
| 1191 | break; | ||
| 1192 | default: | ||
| 1193 | BUG(); | ||
| 1194 | } | ||
| 1195 | out: | ||
| 1196 | rcu_read_unlock(); | ||
| 1197 | } | ||
| 1198 | EXPORT_SYMBOL(__mem_cgroup_count_vm_event); | ||
| 1199 | |||
| 1200 | /** | 979 | /** |
| 1201 | * mem_cgroup_zone_lruvec - get the lru list vector for a zone and memcg | 980 | * mem_cgroup_zone_lruvec - get the lru list vector for a zone and memcg |
| 1202 | * @zone: zone of the wanted lruvec | 981 | * @zone: zone of the wanted lruvec |
| @@ -1295,15 +1074,6 @@ void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru, | |||
| 1295 | VM_BUG_ON((long)(*lru_size) < 0); | 1074 | VM_BUG_ON((long)(*lru_size) < 0); |
| 1296 | } | 1075 | } |
| 1297 | 1076 | ||
| 1298 | bool mem_cgroup_is_descendant(struct mem_cgroup *memcg, struct mem_cgroup *root) | ||
| 1299 | { | ||
| 1300 | if (root == memcg) | ||
| 1301 | return true; | ||
| 1302 | if (!root->use_hierarchy) | ||
| 1303 | return false; | ||
| 1304 | return cgroup_is_descendant(memcg->css.cgroup, root->css.cgroup); | ||
| 1305 | } | ||
| 1306 | |||
| 1307 | bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg) | 1077 | bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg) |
| 1308 | { | 1078 | { |
| 1309 | struct mem_cgroup *task_memcg; | 1079 | struct mem_cgroup *task_memcg; |
| @@ -1330,39 +1100,6 @@ bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg) | |||
| 1330 | return ret; | 1100 | return ret; |
| 1331 | } | 1101 | } |
| 1332 | 1102 | ||
| 1333 | int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec) | ||
| 1334 | { | ||
| 1335 | unsigned long inactive_ratio; | ||
| 1336 | unsigned long inactive; | ||
| 1337 | unsigned long active; | ||
| 1338 | unsigned long gb; | ||
| 1339 | |||
| 1340 | inactive = mem_cgroup_get_lru_size(lruvec, LRU_INACTIVE_ANON); | ||
| 1341 | active = mem_cgroup_get_lru_size(lruvec, LRU_ACTIVE_ANON); | ||
| 1342 | |||
| 1343 | gb = (inactive + active) >> (30 - PAGE_SHIFT); | ||
| 1344 | if (gb) | ||
| 1345 | inactive_ratio = int_sqrt(10 * gb); | ||
| 1346 | else | ||
| 1347 | inactive_ratio = 1; | ||
| 1348 | |||
| 1349 | return inactive * inactive_ratio < active; | ||
| 1350 | } | ||
| 1351 | |||
| 1352 | bool mem_cgroup_lruvec_online(struct lruvec *lruvec) | ||
| 1353 | { | ||
| 1354 | struct mem_cgroup_per_zone *mz; | ||
| 1355 | struct mem_cgroup *memcg; | ||
| 1356 | |||
| 1357 | if (mem_cgroup_disabled()) | ||
| 1358 | return true; | ||
| 1359 | |||
| 1360 | mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec); | ||
| 1361 | memcg = mz->memcg; | ||
| 1362 | |||
| 1363 | return !!(memcg->css.flags & CSS_ONLINE); | ||
| 1364 | } | ||
| 1365 | |||
| 1366 | #define mem_cgroup_from_counter(counter, member) \ | 1103 | #define mem_cgroup_from_counter(counter, member) \ |
| 1367 | container_of(counter, struct mem_cgroup, member) | 1104 | container_of(counter, struct mem_cgroup, member) |
| 1368 | 1105 | ||
| @@ -1394,15 +1131,6 @@ static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg) | |||
| 1394 | return margin; | 1131 | return margin; |
| 1395 | } | 1132 | } |
| 1396 | 1133 | ||
| 1397 | int mem_cgroup_swappiness(struct mem_cgroup *memcg) | ||
| 1398 | { | ||
| 1399 | /* root ? */ | ||
| 1400 | if (mem_cgroup_disabled() || !memcg->css.parent) | ||
| 1401 | return vm_swappiness; | ||
| 1402 | |||
| 1403 | return memcg->swappiness; | ||
| 1404 | } | ||
| 1405 | |||
| 1406 | /* | 1134 | /* |
| 1407 | * A routine for checking "mem" is under move_account() or not. | 1135 | * A routine for checking "mem" is under move_account() or not. |
| 1408 | * | 1136 | * |
| @@ -1545,6 +1273,12 @@ static unsigned long mem_cgroup_get_limit(struct mem_cgroup *memcg) | |||
| 1545 | static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, | 1273 | static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, |
| 1546 | int order) | 1274 | int order) |
| 1547 | { | 1275 | { |
| 1276 | struct oom_control oc = { | ||
| 1277 | .zonelist = NULL, | ||
| 1278 | .nodemask = NULL, | ||
| 1279 | .gfp_mask = gfp_mask, | ||
| 1280 | .order = order, | ||
| 1281 | }; | ||
| 1548 | struct mem_cgroup *iter; | 1282 | struct mem_cgroup *iter; |
| 1549 | unsigned long chosen_points = 0; | 1283 | unsigned long chosen_points = 0; |
| 1550 | unsigned long totalpages; | 1284 | unsigned long totalpages; |
| @@ -1563,7 +1297,7 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, | |||
| 1563 | goto unlock; | 1297 | goto unlock; |
| 1564 | } | 1298 | } |
| 1565 | 1299 | ||
| 1566 | check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL, memcg); | 1300 | check_panic_on_oom(&oc, CONSTRAINT_MEMCG, memcg); |
| 1567 | totalpages = mem_cgroup_get_limit(memcg) ? : 1; | 1301 | totalpages = mem_cgroup_get_limit(memcg) ? : 1; |
| 1568 | for_each_mem_cgroup_tree(iter, memcg) { | 1302 | for_each_mem_cgroup_tree(iter, memcg) { |
| 1569 | struct css_task_iter it; | 1303 | struct css_task_iter it; |
| @@ -1571,8 +1305,7 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, | |||
| 1571 | 1305 | ||
| 1572 | css_task_iter_start(&iter->css, &it); | 1306 | css_task_iter_start(&iter->css, &it); |
| 1573 | while ((task = css_task_iter_next(&it))) { | 1307 | while ((task = css_task_iter_next(&it))) { |
| 1574 | switch (oom_scan_process_thread(task, totalpages, NULL, | 1308 | switch (oom_scan_process_thread(&oc, task, totalpages)) { |
| 1575 | false)) { | ||
| 1576 | case OOM_SCAN_SELECT: | 1309 | case OOM_SCAN_SELECT: |
| 1577 | if (chosen) | 1310 | if (chosen) |
| 1578 | put_task_struct(chosen); | 1311 | put_task_struct(chosen); |
| @@ -1610,8 +1343,8 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, | |||
| 1610 | 1343 | ||
| 1611 | if (chosen) { | 1344 | if (chosen) { |
| 1612 | points = chosen_points * 1000 / totalpages; | 1345 | points = chosen_points * 1000 / totalpages; |
| 1613 | oom_kill_process(chosen, gfp_mask, order, points, totalpages, | 1346 | oom_kill_process(&oc, chosen, points, totalpages, memcg, |
| 1614 | memcg, NULL, "Memory cgroup out of memory"); | 1347 | "Memory cgroup out of memory"); |
| 1615 | } | 1348 | } |
| 1616 | unlock: | 1349 | unlock: |
| 1617 | mutex_unlock(&oom_lock); | 1350 | mutex_unlock(&oom_lock); |
| @@ -2062,23 +1795,6 @@ void mem_cgroup_end_page_stat(struct mem_cgroup *memcg) | |||
| 2062 | } | 1795 | } |
| 2063 | EXPORT_SYMBOL(mem_cgroup_end_page_stat); | 1796 | EXPORT_SYMBOL(mem_cgroup_end_page_stat); |
| 2064 | 1797 | ||
| 2065 | /** | ||
| 2066 | * mem_cgroup_update_page_stat - update page state statistics | ||
| 2067 | * @memcg: memcg to account against | ||
| 2068 | * @idx: page state item to account | ||
| 2069 | * @val: number of pages (positive or negative) | ||
| 2070 | * | ||
| 2071 | * See mem_cgroup_begin_page_stat() for locking requirements. | ||
| 2072 | */ | ||
| 2073 | void mem_cgroup_update_page_stat(struct mem_cgroup *memcg, | ||
| 2074 | enum mem_cgroup_stat_index idx, int val) | ||
| 2075 | { | ||
| 2076 | VM_BUG_ON(!rcu_read_lock_held()); | ||
| 2077 | |||
| 2078 | if (memcg) | ||
| 2079 | this_cpu_add(memcg->stat->count[idx], val); | ||
| 2080 | } | ||
| 2081 | |||
| 2082 | /* | 1798 | /* |
| 2083 | * size of first charge trial. "32" comes from vmscan.c's magic value. | 1799 | * size of first charge trial. "32" comes from vmscan.c's magic value. |
| 2084 | * TODO: maybe necessary to use big numbers in big irons. | 1800 | * TODO: maybe necessary to use big numbers in big irons. |
| @@ -2504,16 +2220,6 @@ void memcg_uncharge_kmem(struct mem_cgroup *memcg, unsigned long nr_pages) | |||
| 2504 | css_put_many(&memcg->css, nr_pages); | 2220 | css_put_many(&memcg->css, nr_pages); |
| 2505 | } | 2221 | } |
| 2506 | 2222 | ||
| 2507 | /* | ||
| 2508 | * helper for accessing a memcg's index. It will be used as an index in the | ||
| 2509 | * child cache array in kmem_cache, and also to derive its name. This function | ||
| 2510 | * will return -1 when this is not a kmem-limited memcg. | ||
| 2511 | */ | ||
| 2512 | int memcg_cache_id(struct mem_cgroup *memcg) | ||
| 2513 | { | ||
| 2514 | return memcg ? memcg->kmemcg_id : -1; | ||
| 2515 | } | ||
| 2516 | |||
| 2517 | static int memcg_alloc_cache_id(void) | 2223 | static int memcg_alloc_cache_id(void) |
| 2518 | { | 2224 | { |
| 2519 | int id, size; | 2225 | int id, size; |
| @@ -5127,10 +4833,12 @@ static void mem_cgroup_clear_mc(void) | |||
| 5127 | static int mem_cgroup_can_attach(struct cgroup_subsys_state *css, | 4833 | static int mem_cgroup_can_attach(struct cgroup_subsys_state *css, |
| 5128 | struct cgroup_taskset *tset) | 4834 | struct cgroup_taskset *tset) |
| 5129 | { | 4835 | { |
| 5130 | struct task_struct *p = cgroup_taskset_first(tset); | ||
| 5131 | int ret = 0; | ||
| 5132 | struct mem_cgroup *memcg = mem_cgroup_from_css(css); | 4836 | struct mem_cgroup *memcg = mem_cgroup_from_css(css); |
| 4837 | struct mem_cgroup *from; | ||
| 4838 | struct task_struct *p; | ||
| 4839 | struct mm_struct *mm; | ||
| 5133 | unsigned long move_flags; | 4840 | unsigned long move_flags; |
| 4841 | int ret = 0; | ||
| 5134 | 4842 | ||
| 5135 | /* | 4843 | /* |
| 5136 | * We are now commited to this value whatever it is. Changes in this | 4844 | * We are now commited to this value whatever it is. Changes in this |
| @@ -5138,36 +4846,37 @@ static int mem_cgroup_can_attach(struct cgroup_subsys_state *css, | |||
| 5138 | * So we need to save it, and keep it going. | 4846 | * So we need to save it, and keep it going. |
| 5139 | */ | 4847 | */ |
| 5140 | move_flags = READ_ONCE(memcg->move_charge_at_immigrate); | 4848 | move_flags = READ_ONCE(memcg->move_charge_at_immigrate); |
| 5141 | if (move_flags) { | 4849 | if (!move_flags) |
| 5142 | struct mm_struct *mm; | 4850 | return 0; |
| 5143 | struct mem_cgroup *from = mem_cgroup_from_task(p); | ||
| 5144 | 4851 | ||
| 5145 | VM_BUG_ON(from == memcg); | 4852 | p = cgroup_taskset_first(tset); |
| 4853 | from = mem_cgroup_from_task(p); | ||
| 5146 | 4854 | ||
| 5147 | mm = get_task_mm(p); | 4855 | VM_BUG_ON(from == memcg); |
| 5148 | if (!mm) | 4856 | |
| 5149 | return 0; | 4857 | mm = get_task_mm(p); |
| 5150 | /* We move charges only when we move a owner of the mm */ | 4858 | if (!mm) |
| 5151 | if (mm->owner == p) { | 4859 | return 0; |
| 5152 | VM_BUG_ON(mc.from); | 4860 | /* We move charges only when we move a owner of the mm */ |
| 5153 | VM_BUG_ON(mc.to); | 4861 | if (mm->owner == p) { |
| 5154 | VM_BUG_ON(mc.precharge); | 4862 | VM_BUG_ON(mc.from); |
| 5155 | VM_BUG_ON(mc.moved_charge); | 4863 | VM_BUG_ON(mc.to); |
| 5156 | VM_BUG_ON(mc.moved_swap); | 4864 | VM_BUG_ON(mc.precharge); |
| 5157 | 4865 | VM_BUG_ON(mc.moved_charge); | |
| 5158 | spin_lock(&mc.lock); | 4866 | VM_BUG_ON(mc.moved_swap); |
| 5159 | mc.from = from; | 4867 | |
| 5160 | mc.to = memcg; | 4868 | spin_lock(&mc.lock); |
| 5161 | mc.flags = move_flags; | 4869 | mc.from = from; |
| 5162 | spin_unlock(&mc.lock); | 4870 | mc.to = memcg; |
| 5163 | /* We set mc.moving_task later */ | 4871 | mc.flags = move_flags; |
| 5164 | 4872 | spin_unlock(&mc.lock); | |
| 5165 | ret = mem_cgroup_precharge_mc(mm); | 4873 | /* We set mc.moving_task later */ |
| 5166 | if (ret) | 4874 | |
| 5167 | mem_cgroup_clear_mc(); | 4875 | ret = mem_cgroup_precharge_mc(mm); |
| 5168 | } | 4876 | if (ret) |
| 5169 | mmput(mm); | 4877 | mem_cgroup_clear_mc(); |
| 5170 | } | 4878 | } |
| 4879 | mmput(mm); | ||
| 5171 | return ret; | 4880 | return ret; |
| 5172 | } | 4881 | } |
| 5173 | 4882 | ||
| @@ -5521,19 +5230,6 @@ struct cgroup_subsys memory_cgrp_subsys = { | |||
| 5521 | }; | 5230 | }; |
| 5522 | 5231 | ||
| 5523 | /** | 5232 | /** |
| 5524 | * mem_cgroup_events - count memory events against a cgroup | ||
| 5525 | * @memcg: the memory cgroup | ||
| 5526 | * @idx: the event index | ||
| 5527 | * @nr: the number of events to account for | ||
| 5528 | */ | ||
| 5529 | void mem_cgroup_events(struct mem_cgroup *memcg, | ||
| 5530 | enum mem_cgroup_events_index idx, | ||
| 5531 | unsigned int nr) | ||
| 5532 | { | ||
| 5533 | this_cpu_add(memcg->stat->events[idx], nr); | ||
| 5534 | } | ||
| 5535 | |||
| 5536 | /** | ||
| 5537 | * mem_cgroup_low - check if memory consumption is below the normal range | 5233 | * mem_cgroup_low - check if memory consumption is below the normal range |
| 5538 | * @root: the highest ancestor to consider | 5234 | * @root: the highest ancestor to consider |
| 5539 | * @memcg: the memory cgroup to check | 5235 | * @memcg: the memory cgroup to check |
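The memcg OOM hunks above are part of a wider cleanup in this series: the zonelist, nodemask, gfp mask and order that used to be passed separately to every OOM helper are now bundled into a single `struct oom_control`. Below is a minimal illustrative sketch (not part of the patch) of the resulting call pattern; it uses only the fields and the `check_panic_on_oom()` signature visible in the hunks above, and the wrapper function itself is hypothetical.

```c
#include <linux/oom.h>		/* struct oom_control, check_panic_on_oom() */
#include <linux/memcontrol.h>

/* Hypothetical caller, mirroring mem_cgroup_out_of_memory() above. */
static void memcg_oom_sketch(struct mem_cgroup *memcg, gfp_t gfp_mask, int order)
{
	struct oom_control oc = {
		.zonelist = NULL,	/* memcg OOM is not constrained by a zonelist */
		.nodemask = NULL,
		.gfp_mask = gfp_mask,
		.order    = order,
	};

	/* Every OOM helper now takes the same descriptor instead of loose arguments. */
	check_panic_on_oom(&oc, CONSTRAINT_MEMCG, memcg);
}
```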
diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 1f4446a90cef..eeda6485e76c 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c | |||
| @@ -146,7 +146,7 @@ static int hwpoison_filter_task(struct page *p) | |||
| 146 | if (!mem) | 146 | if (!mem) |
| 147 | return -EINVAL; | 147 | return -EINVAL; |
| 148 | 148 | ||
| 149 | css = mem_cgroup_css(mem); | 149 | css = &mem->css; |
| 150 | ino = cgroup_ino(css->cgroup); | 150 | ino = cgroup_ino(css->cgroup); |
| 151 | css_put(css); | 151 | css_put(css); |
| 152 | 152 | ||
| @@ -934,6 +934,27 @@ int get_hwpoison_page(struct page *page) | |||
| 934 | } | 934 | } |
| 935 | EXPORT_SYMBOL_GPL(get_hwpoison_page); | 935 | EXPORT_SYMBOL_GPL(get_hwpoison_page); |
| 936 | 936 | ||
| 937 | /** | ||
| 938 | * put_hwpoison_page() - Put refcount for memory error handling: | ||
| 939 | * @page: raw error page (hit by memory error) | ||
| 940 | */ | ||
| 941 | void put_hwpoison_page(struct page *page) | ||
| 942 | { | ||
| 943 | struct page *head = compound_head(page); | ||
| 944 | |||
| 945 | if (PageHuge(head)) { | ||
| 946 | put_page(head); | ||
| 947 | return; | ||
| 948 | } | ||
| 949 | |||
| 950 | if (PageTransHuge(head)) | ||
| 951 | if (page != head) | ||
| 952 | put_page(head); | ||
| 953 | |||
| 954 | put_page(page); | ||
| 955 | } | ||
| 956 | EXPORT_SYMBOL_GPL(put_hwpoison_page); | ||
| 957 | |||
| 937 | /* | 958 | /* |
| 938 | * Do all that is necessary to remove user space mappings. Unmap | 959 | * Do all that is necessary to remove user space mappings. Unmap |
| 939 | * the pages and send SIGBUS to the processes if the data was dirty. | 960 | * the pages and send SIGBUS to the processes if the data was dirty. |
| @@ -1100,7 +1121,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags) | |||
| 1100 | nr_pages = 1 << compound_order(hpage); | 1121 | nr_pages = 1 << compound_order(hpage); |
| 1101 | else /* normal page or thp */ | 1122 | else /* normal page or thp */ |
| 1102 | nr_pages = 1; | 1123 | nr_pages = 1; |
| 1103 | atomic_long_add(nr_pages, &num_poisoned_pages); | 1124 | num_poisoned_pages_add(nr_pages); |
| 1104 | 1125 | ||
| 1105 | /* | 1126 | /* |
| 1106 | * We need/can do nothing about count=0 pages. | 1127 | * We need/can do nothing about count=0 pages. |
| @@ -1128,7 +1149,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags) | |||
| 1128 | if (PageHWPoison(hpage)) { | 1149 | if (PageHWPoison(hpage)) { |
| 1129 | if ((hwpoison_filter(p) && TestClearPageHWPoison(p)) | 1150 | if ((hwpoison_filter(p) && TestClearPageHWPoison(p)) |
| 1130 | || (p != hpage && TestSetPageHWPoison(hpage))) { | 1151 | || (p != hpage && TestSetPageHWPoison(hpage))) { |
| 1131 | atomic_long_sub(nr_pages, &num_poisoned_pages); | 1152 | num_poisoned_pages_sub(nr_pages); |
| 1132 | unlock_page(hpage); | 1153 | unlock_page(hpage); |
| 1133 | return 0; | 1154 | return 0; |
| 1134 | } | 1155 | } |
| @@ -1152,10 +1173,8 @@ int memory_failure(unsigned long pfn, int trapno, int flags) | |||
| 1152 | else | 1173 | else |
| 1153 | pr_err("MCE: %#lx: thp split failed\n", pfn); | 1174 | pr_err("MCE: %#lx: thp split failed\n", pfn); |
| 1154 | if (TestClearPageHWPoison(p)) | 1175 | if (TestClearPageHWPoison(p)) |
| 1155 | atomic_long_sub(nr_pages, &num_poisoned_pages); | 1176 | num_poisoned_pages_sub(nr_pages); |
| 1156 | put_page(p); | 1177 | put_hwpoison_page(p); |
| 1157 | if (p != hpage) | ||
| 1158 | put_page(hpage); | ||
| 1159 | return -EBUSY; | 1178 | return -EBUSY; |
| 1160 | } | 1179 | } |
| 1161 | VM_BUG_ON_PAGE(!page_count(p), p); | 1180 | VM_BUG_ON_PAGE(!page_count(p), p); |
| @@ -1214,16 +1233,16 @@ int memory_failure(unsigned long pfn, int trapno, int flags) | |||
| 1214 | */ | 1233 | */ |
| 1215 | if (!PageHWPoison(p)) { | 1234 | if (!PageHWPoison(p)) { |
| 1216 | printk(KERN_ERR "MCE %#lx: just unpoisoned\n", pfn); | 1235 | printk(KERN_ERR "MCE %#lx: just unpoisoned\n", pfn); |
| 1217 | atomic_long_sub(nr_pages, &num_poisoned_pages); | 1236 | num_poisoned_pages_sub(nr_pages); |
| 1218 | unlock_page(hpage); | 1237 | unlock_page(hpage); |
| 1219 | put_page(hpage); | 1238 | put_hwpoison_page(hpage); |
| 1220 | return 0; | 1239 | return 0; |
| 1221 | } | 1240 | } |
| 1222 | if (hwpoison_filter(p)) { | 1241 | if (hwpoison_filter(p)) { |
| 1223 | if (TestClearPageHWPoison(p)) | 1242 | if (TestClearPageHWPoison(p)) |
| 1224 | atomic_long_sub(nr_pages, &num_poisoned_pages); | 1243 | num_poisoned_pages_sub(nr_pages); |
| 1225 | unlock_page(hpage); | 1244 | unlock_page(hpage); |
| 1226 | put_page(hpage); | 1245 | put_hwpoison_page(hpage); |
| 1227 | return 0; | 1246 | return 0; |
| 1228 | } | 1247 | } |
| 1229 | 1248 | ||
| @@ -1237,7 +1256,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags) | |||
| 1237 | if (PageHuge(p) && PageTail(p) && TestSetPageHWPoison(hpage)) { | 1256 | if (PageHuge(p) && PageTail(p) && TestSetPageHWPoison(hpage)) { |
| 1238 | action_result(pfn, MF_MSG_POISONED_HUGE, MF_IGNORED); | 1257 | action_result(pfn, MF_MSG_POISONED_HUGE, MF_IGNORED); |
| 1239 | unlock_page(hpage); | 1258 | unlock_page(hpage); |
| 1240 | put_page(hpage); | 1259 | put_hwpoison_page(hpage); |
| 1241 | return 0; | 1260 | return 0; |
| 1242 | } | 1261 | } |
| 1243 | /* | 1262 | /* |
| @@ -1426,6 +1445,22 @@ int unpoison_memory(unsigned long pfn) | |||
| 1426 | return 0; | 1445 | return 0; |
| 1427 | } | 1446 | } |
| 1428 | 1447 | ||
| 1448 | if (page_count(page) > 1) { | ||
| 1449 | pr_info("MCE: Someone grabs the hwpoison page %#lx\n", pfn); | ||
| 1450 | return 0; | ||
| 1451 | } | ||
| 1452 | |||
| 1453 | if (page_mapped(page)) { | ||
| 1454 | pr_info("MCE: Someone maps the hwpoison page %#lx\n", pfn); | ||
| 1455 | return 0; | ||
| 1456 | } | ||
| 1457 | |||
| 1458 | if (page_mapping(page)) { | ||
| 1459 | pr_info("MCE: the hwpoison page has non-NULL mapping %#lx\n", | ||
| 1460 | pfn); | ||
| 1461 | return 0; | ||
| 1462 | } | ||
| 1463 | |||
| 1429 | /* | 1464 | /* |
| 1430 | * unpoison_memory() can encounter thp only when the thp is being | 1465 | * unpoison_memory() can encounter thp only when the thp is being |
| 1431 | * worked by memory_failure() and the page lock is not held yet. | 1466 | * worked by memory_failure() and the page lock is not held yet. |
| @@ -1450,7 +1485,7 @@ int unpoison_memory(unsigned long pfn) | |||
| 1450 | return 0; | 1485 | return 0; |
| 1451 | } | 1486 | } |
| 1452 | if (TestClearPageHWPoison(p)) | 1487 | if (TestClearPageHWPoison(p)) |
| 1453 | atomic_long_dec(&num_poisoned_pages); | 1488 | num_poisoned_pages_dec(); |
| 1454 | pr_info("MCE: Software-unpoisoned free page %#lx\n", pfn); | 1489 | pr_info("MCE: Software-unpoisoned free page %#lx\n", pfn); |
| 1455 | return 0; | 1490 | return 0; |
| 1456 | } | 1491 | } |
| @@ -1464,16 +1499,16 @@ int unpoison_memory(unsigned long pfn) | |||
| 1464 | */ | 1499 | */ |
| 1465 | if (TestClearPageHWPoison(page)) { | 1500 | if (TestClearPageHWPoison(page)) { |
| 1466 | pr_info("MCE: Software-unpoisoned page %#lx\n", pfn); | 1501 | pr_info("MCE: Software-unpoisoned page %#lx\n", pfn); |
| 1467 | atomic_long_sub(nr_pages, &num_poisoned_pages); | 1502 | num_poisoned_pages_sub(nr_pages); |
| 1468 | freeit = 1; | 1503 | freeit = 1; |
| 1469 | if (PageHuge(page)) | 1504 | if (PageHuge(page)) |
| 1470 | clear_page_hwpoison_huge_page(page); | 1505 | clear_page_hwpoison_huge_page(page); |
| 1471 | } | 1506 | } |
| 1472 | unlock_page(page); | 1507 | unlock_page(page); |
| 1473 | 1508 | ||
| 1474 | put_page(page); | 1509 | put_hwpoison_page(page); |
| 1475 | if (freeit && !(pfn == my_zero_pfn(0) && page_count(p) == 1)) | 1510 | if (freeit && !(pfn == my_zero_pfn(0) && page_count(p) == 1)) |
| 1476 | put_page(page); | 1511 | put_hwpoison_page(page); |
| 1477 | 1512 | ||
| 1478 | return 0; | 1513 | return 0; |
| 1479 | } | 1514 | } |
| @@ -1486,7 +1521,7 @@ static struct page *new_page(struct page *p, unsigned long private, int **x) | |||
| 1486 | return alloc_huge_page_node(page_hstate(compound_head(p)), | 1521 | return alloc_huge_page_node(page_hstate(compound_head(p)), |
| 1487 | nid); | 1522 | nid); |
| 1488 | else | 1523 | else |
| 1489 | return alloc_pages_exact_node(nid, GFP_HIGHUSER_MOVABLE, 0); | 1524 | return __alloc_pages_node(nid, GFP_HIGHUSER_MOVABLE, 0); |
| 1490 | } | 1525 | } |
| 1491 | 1526 | ||
| 1492 | /* | 1527 | /* |
| @@ -1533,7 +1568,7 @@ static int get_any_page(struct page *page, unsigned long pfn, int flags) | |||
| 1533 | /* | 1568 | /* |
| 1534 | * Try to free it. | 1569 | * Try to free it. |
| 1535 | */ | 1570 | */ |
| 1536 | put_page(page); | 1571 | put_hwpoison_page(page); |
| 1537 | shake_page(page, 1); | 1572 | shake_page(page, 1); |
| 1538 | 1573 | ||
| 1539 | /* | 1574 | /* |
| @@ -1542,7 +1577,7 @@ static int get_any_page(struct page *page, unsigned long pfn, int flags) | |||
| 1542 | ret = __get_any_page(page, pfn, 0); | 1577 | ret = __get_any_page(page, pfn, 0); |
| 1543 | if (!PageLRU(page)) { | 1578 | if (!PageLRU(page)) { |
| 1544 | /* Drop page reference which is from __get_any_page() */ | 1579 | /* Drop page reference which is from __get_any_page() */ |
| 1545 | put_page(page); | 1580 | put_hwpoison_page(page); |
| 1546 | pr_info("soft_offline: %#lx: unknown non LRU page type %lx\n", | 1581 | pr_info("soft_offline: %#lx: unknown non LRU page type %lx\n", |
| 1547 | pfn, page->flags); | 1582 | pfn, page->flags); |
| 1548 | return -EIO; | 1583 | return -EIO; |
| @@ -1565,7 +1600,7 @@ static int soft_offline_huge_page(struct page *page, int flags) | |||
| 1565 | lock_page(hpage); | 1600 | lock_page(hpage); |
| 1566 | if (PageHWPoison(hpage)) { | 1601 | if (PageHWPoison(hpage)) { |
| 1567 | unlock_page(hpage); | 1602 | unlock_page(hpage); |
| 1568 | put_page(hpage); | 1603 | put_hwpoison_page(hpage); |
| 1569 | pr_info("soft offline: %#lx hugepage already poisoned\n", pfn); | 1604 | pr_info("soft offline: %#lx hugepage already poisoned\n", pfn); |
| 1570 | return -EBUSY; | 1605 | return -EBUSY; |
| 1571 | } | 1606 | } |
| @@ -1576,7 +1611,7 @@ static int soft_offline_huge_page(struct page *page, int flags) | |||
| 1576 | * get_any_page() and isolate_huge_page() take a refcount each, | 1611 | * get_any_page() and isolate_huge_page() take a refcount each, |
| 1577 | * so need to drop one here. | 1612 | * so need to drop one here. |
| 1578 | */ | 1613 | */ |
| 1579 | put_page(hpage); | 1614 | put_hwpoison_page(hpage); |
| 1580 | if (!ret) { | 1615 | if (!ret) { |
| 1581 | pr_info("soft offline: %#lx hugepage failed to isolate\n", pfn); | 1616 | pr_info("soft offline: %#lx hugepage failed to isolate\n", pfn); |
| 1582 | return -EBUSY; | 1617 | return -EBUSY; |
| @@ -1600,11 +1635,10 @@ static int soft_offline_huge_page(struct page *page, int flags) | |||
| 1600 | if (PageHuge(page)) { | 1635 | if (PageHuge(page)) { |
| 1601 | set_page_hwpoison_huge_page(hpage); | 1636 | set_page_hwpoison_huge_page(hpage); |
| 1602 | dequeue_hwpoisoned_huge_page(hpage); | 1637 | dequeue_hwpoisoned_huge_page(hpage); |
| 1603 | atomic_long_add(1 << compound_order(hpage), | 1638 | num_poisoned_pages_add(1 << compound_order(hpage)); |
| 1604 | &num_poisoned_pages); | ||
| 1605 | } else { | 1639 | } else { |
| 1606 | SetPageHWPoison(page); | 1640 | SetPageHWPoison(page); |
| 1607 | atomic_long_inc(&num_poisoned_pages); | 1641 | num_poisoned_pages_inc(); |
| 1608 | } | 1642 | } |
| 1609 | } | 1643 | } |
| 1610 | return ret; | 1644 | return ret; |
| @@ -1625,7 +1659,7 @@ static int __soft_offline_page(struct page *page, int flags) | |||
| 1625 | wait_on_page_writeback(page); | 1659 | wait_on_page_writeback(page); |
| 1626 | if (PageHWPoison(page)) { | 1660 | if (PageHWPoison(page)) { |
| 1627 | unlock_page(page); | 1661 | unlock_page(page); |
| 1628 | put_page(page); | 1662 | put_hwpoison_page(page); |
| 1629 | pr_info("soft offline: %#lx page already poisoned\n", pfn); | 1663 | pr_info("soft offline: %#lx page already poisoned\n", pfn); |
| 1630 | return -EBUSY; | 1664 | return -EBUSY; |
| 1631 | } | 1665 | } |
| @@ -1640,10 +1674,10 @@ static int __soft_offline_page(struct page *page, int flags) | |||
| 1640 | * would need to fix isolation locking first. | 1674 | * would need to fix isolation locking first. |
| 1641 | */ | 1675 | */ |
| 1642 | if (ret == 1) { | 1676 | if (ret == 1) { |
| 1643 | put_page(page); | 1677 | put_hwpoison_page(page); |
| 1644 | pr_info("soft_offline: %#lx: invalidated\n", pfn); | 1678 | pr_info("soft_offline: %#lx: invalidated\n", pfn); |
| 1645 | SetPageHWPoison(page); | 1679 | SetPageHWPoison(page); |
| 1646 | atomic_long_inc(&num_poisoned_pages); | 1680 | num_poisoned_pages_inc(); |
| 1647 | return 0; | 1681 | return 0; |
| 1648 | } | 1682 | } |
| 1649 | 1683 | ||
| @@ -1657,14 +1691,12 @@ static int __soft_offline_page(struct page *page, int flags) | |||
| 1657 | * Drop page reference which came from get_any_page() | 1691 | * Drop page reference which came from get_any_page() |
| 1658 | * a successful isolate_lru_page() already took another one. | 1692 | * a successful isolate_lru_page() already took another one. |
| 1659 | */ | 1693 | */ |
| 1660 | put_page(page); | 1694 | put_hwpoison_page(page); |
| 1661 | if (!ret) { | 1695 | if (!ret) { |
| 1662 | LIST_HEAD(pagelist); | 1696 | LIST_HEAD(pagelist); |
| 1663 | inc_zone_page_state(page, NR_ISOLATED_ANON + | 1697 | inc_zone_page_state(page, NR_ISOLATED_ANON + |
| 1664 | page_is_file_cache(page)); | 1698 | page_is_file_cache(page)); |
| 1665 | list_add(&page->lru, &pagelist); | 1699 | list_add(&page->lru, &pagelist); |
| 1666 | if (!TestSetPageHWPoison(page)) | ||
| 1667 | atomic_long_inc(&num_poisoned_pages); | ||
| 1668 | ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL, | 1700 | ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL, |
| 1669 | MIGRATE_SYNC, MR_MEMORY_FAILURE); | 1701 | MIGRATE_SYNC, MR_MEMORY_FAILURE); |
| 1670 | if (ret) { | 1702 | if (ret) { |
| @@ -1679,8 +1711,6 @@ static int __soft_offline_page(struct page *page, int flags) | |||
| 1679 | pfn, ret, page->flags); | 1711 | pfn, ret, page->flags); |
| 1680 | if (ret > 0) | 1712 | if (ret > 0) |
| 1681 | ret = -EIO; | 1713 | ret = -EIO; |
| 1682 | if (TestClearPageHWPoison(page)) | ||
| 1683 | atomic_long_dec(&num_poisoned_pages); | ||
| 1684 | } | 1714 | } |
| 1685 | } else { | 1715 | } else { |
| 1686 | pr_info("soft offline: %#lx: isolation failed: %d, page count %d, type %lx\n", | 1716 | pr_info("soft offline: %#lx: isolation failed: %d, page count %d, type %lx\n", |
| @@ -1719,12 +1749,16 @@ int soft_offline_page(struct page *page, int flags) | |||
| 1719 | 1749 | ||
| 1720 | if (PageHWPoison(page)) { | 1750 | if (PageHWPoison(page)) { |
| 1721 | pr_info("soft offline: %#lx page already poisoned\n", pfn); | 1751 | pr_info("soft offline: %#lx page already poisoned\n", pfn); |
| 1752 | if (flags & MF_COUNT_INCREASED) | ||
| 1753 | put_hwpoison_page(page); | ||
| 1722 | return -EBUSY; | 1754 | return -EBUSY; |
| 1723 | } | 1755 | } |
| 1724 | if (!PageHuge(page) && PageTransHuge(hpage)) { | 1756 | if (!PageHuge(page) && PageTransHuge(hpage)) { |
| 1725 | if (PageAnon(hpage) && unlikely(split_huge_page(hpage))) { | 1757 | if (PageAnon(hpage) && unlikely(split_huge_page(hpage))) { |
| 1726 | pr_info("soft offline: %#lx: failed to split THP\n", | 1758 | pr_info("soft offline: %#lx: failed to split THP\n", |
| 1727 | pfn); | 1759 | pfn); |
| 1760 | if (flags & MF_COUNT_INCREASED) | ||
| 1761 | put_hwpoison_page(page); | ||
| 1728 | return -EBUSY; | 1762 | return -EBUSY; |
| 1729 | } | 1763 | } |
| 1730 | } | 1764 | } |
| @@ -1742,11 +1776,10 @@ int soft_offline_page(struct page *page, int flags) | |||
| 1742 | if (PageHuge(page)) { | 1776 | if (PageHuge(page)) { |
| 1743 | set_page_hwpoison_huge_page(hpage); | 1777 | set_page_hwpoison_huge_page(hpage); |
| 1744 | if (!dequeue_hwpoisoned_huge_page(hpage)) | 1778 | if (!dequeue_hwpoisoned_huge_page(hpage)) |
| 1745 | atomic_long_add(1 << compound_order(hpage), | 1779 | num_poisoned_pages_add(1 << compound_order(hpage)); |
| 1746 | &num_poisoned_pages); | ||
| 1747 | } else { | 1780 | } else { |
| 1748 | if (!TestSetPageHWPoison(page)) | 1781 | if (!TestSetPageHWPoison(page)) |
| 1749 | atomic_long_inc(&num_poisoned_pages); | 1782 | num_poisoned_pages_inc(); |
| 1750 | } | 1783 | } |
| 1751 | } | 1784 | } |
| 1752 | return ret; | 1785 | return ret; |
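For orientation, here is a sketch of how the new `put_hwpoison_page()` pairs with the existing `get_hwpoison_page()` at the call sites converted above. The helper exists purely to hide the THP head/tail refcount detail that used to be open-coded; the call site shown is hypothetical, and the boolean-style return of `get_hwpoison_page()` is an assumption based on how the converted code tests it.

```c
#include <linux/mm.h>

/* Hypothetical call site; real users are the soft-offline/unpoison paths above. */
static void hwpoison_ref_sketch(struct page *p)
{
	if (!get_hwpoison_page(p))	/* take a reference on the error page */
		return;

	/* ... examine or isolate the page here ... */

	put_hwpoison_page(p);		/* also drops the head-page ref for a THP tail */
}
```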
diff --git a/mm/memory.c b/mm/memory.c index bb04d8f2f86c..6cd0b2160401 100644 --- a/mm/memory.c +++ b/mm/memory.c | |||
| @@ -2426,8 +2426,6 @@ void unmap_mapping_range(struct address_space *mapping, | |||
| 2426 | if (details.last_index < details.first_index) | 2426 | if (details.last_index < details.first_index) |
| 2427 | details.last_index = ULONG_MAX; | 2427 | details.last_index = ULONG_MAX; |
| 2428 | 2428 | ||
| 2429 | |||
| 2430 | /* DAX uses i_mmap_lock to serialise file truncate vs page fault */ | ||
| 2431 | i_mmap_lock_write(mapping); | 2429 | i_mmap_lock_write(mapping); |
| 2432 | if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap))) | 2430 | if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap))) |
| 2433 | unmap_mapping_range_tree(&mapping->i_mmap, &details); | 2431 | unmap_mapping_range_tree(&mapping->i_mmap, &details); |
| @@ -3015,9 +3013,9 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 3015 | } else { | 3013 | } else { |
| 3016 | /* | 3014 | /* |
| 3017 | * The fault handler has no page to lock, so it holds | 3015 | * The fault handler has no page to lock, so it holds |
| 3018 | * i_mmap_lock for read to protect against truncate. | 3016 | * i_mmap_lock for write to protect against truncate. |
| 3019 | */ | 3017 | */ |
| 3020 | i_mmap_unlock_read(vma->vm_file->f_mapping); | 3018 | i_mmap_unlock_write(vma->vm_file->f_mapping); |
| 3021 | } | 3019 | } |
| 3022 | goto uncharge_out; | 3020 | goto uncharge_out; |
| 3023 | } | 3021 | } |
| @@ -3031,9 +3029,9 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 3031 | } else { | 3029 | } else { |
| 3032 | /* | 3030 | /* |
| 3033 | * The fault handler has no page to lock, so it holds | 3031 | * The fault handler has no page to lock, so it holds |
| 3034 | * i_mmap_lock for read to protect against truncate. | 3032 | * i_mmap_lock for write to protect against truncate. |
| 3035 | */ | 3033 | */ |
| 3036 | i_mmap_unlock_read(vma->vm_file->f_mapping); | 3034 | i_mmap_unlock_write(vma->vm_file->f_mapping); |
| 3037 | } | 3035 | } |
| 3038 | return ret; | 3036 | return ret; |
| 3039 | uncharge_out: | 3037 | uncharge_out: |
| @@ -3232,6 +3230,27 @@ out: | |||
| 3232 | return 0; | 3230 | return 0; |
| 3233 | } | 3231 | } |
| 3234 | 3232 | ||
| 3233 | static int create_huge_pmd(struct mm_struct *mm, struct vm_area_struct *vma, | ||
| 3234 | unsigned long address, pmd_t *pmd, unsigned int flags) | ||
| 3235 | { | ||
| 3236 | if (!vma->vm_ops) | ||
| 3237 | return do_huge_pmd_anonymous_page(mm, vma, address, pmd, flags); | ||
| 3238 | if (vma->vm_ops->pmd_fault) | ||
| 3239 | return vma->vm_ops->pmd_fault(vma, address, pmd, flags); | ||
| 3240 | return VM_FAULT_FALLBACK; | ||
| 3241 | } | ||
| 3242 | |||
| 3243 | static int wp_huge_pmd(struct mm_struct *mm, struct vm_area_struct *vma, | ||
| 3244 | unsigned long address, pmd_t *pmd, pmd_t orig_pmd, | ||
| 3245 | unsigned int flags) | ||
| 3246 | { | ||
| 3247 | if (!vma->vm_ops) | ||
| 3248 | return do_huge_pmd_wp_page(mm, vma, address, pmd, orig_pmd); | ||
| 3249 | if (vma->vm_ops->pmd_fault) | ||
| 3250 | return vma->vm_ops->pmd_fault(vma, address, pmd, flags); | ||
| 3251 | return VM_FAULT_FALLBACK; | ||
| 3252 | } | ||
| 3253 | |||
| 3235 | /* | 3254 | /* |
| 3236 | * These routines also need to handle stuff like marking pages dirty | 3255 | * These routines also need to handle stuff like marking pages dirty |
| 3237 | * and/or accessed for architectures that don't do it in hardware (most | 3256 | * and/or accessed for architectures that don't do it in hardware (most |
| @@ -3267,12 +3286,12 @@ static int handle_pte_fault(struct mm_struct *mm, | |||
| 3267 | barrier(); | 3286 | barrier(); |
| 3268 | if (!pte_present(entry)) { | 3287 | if (!pte_present(entry)) { |
| 3269 | if (pte_none(entry)) { | 3288 | if (pte_none(entry)) { |
| 3270 | if (vma->vm_ops) | 3289 | if (vma_is_anonymous(vma)) |
| 3290 | return do_anonymous_page(mm, vma, address, | ||
| 3291 | pte, pmd, flags); | ||
| 3292 | else | ||
| 3271 | return do_fault(mm, vma, address, pte, pmd, | 3293 | return do_fault(mm, vma, address, pte, pmd, |
| 3272 | flags, entry); | 3294 | flags, entry); |
| 3273 | |||
| 3274 | return do_anonymous_page(mm, vma, address, pte, pmd, | ||
| 3275 | flags); | ||
| 3276 | } | 3295 | } |
| 3277 | return do_swap_page(mm, vma, address, | 3296 | return do_swap_page(mm, vma, address, |
| 3278 | pte, pmd, flags, entry); | 3297 | pte, pmd, flags, entry); |
| @@ -3334,10 +3353,7 @@ static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 3334 | if (!pmd) | 3353 | if (!pmd) |
| 3335 | return VM_FAULT_OOM; | 3354 | return VM_FAULT_OOM; |
| 3336 | if (pmd_none(*pmd) && transparent_hugepage_enabled(vma)) { | 3355 | if (pmd_none(*pmd) && transparent_hugepage_enabled(vma)) { |
| 3337 | int ret = VM_FAULT_FALLBACK; | 3356 | int ret = create_huge_pmd(mm, vma, address, pmd, flags); |
| 3338 | if (!vma->vm_ops) | ||
| 3339 | ret = do_huge_pmd_anonymous_page(mm, vma, address, | ||
| 3340 | pmd, flags); | ||
| 3341 | if (!(ret & VM_FAULT_FALLBACK)) | 3357 | if (!(ret & VM_FAULT_FALLBACK)) |
| 3342 | return ret; | 3358 | return ret; |
| 3343 | } else { | 3359 | } else { |
| @@ -3361,8 +3377,8 @@ static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 3361 | orig_pmd, pmd); | 3377 | orig_pmd, pmd); |
| 3362 | 3378 | ||
| 3363 | if (dirty && !pmd_write(orig_pmd)) { | 3379 | if (dirty && !pmd_write(orig_pmd)) { |
| 3364 | ret = do_huge_pmd_wp_page(mm, vma, address, pmd, | 3380 | ret = wp_huge_pmd(mm, vma, address, pmd, |
| 3365 | orig_pmd); | 3381 | orig_pmd, flags); |
| 3366 | if (!(ret & VM_FAULT_FALLBACK)) | 3382 | if (!(ret & VM_FAULT_FALLBACK)) |
| 3367 | return ret; | 3383 | return ret; |
| 3368 | } else { | 3384 | } else { |
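The new `create_huge_pmd()`/`wp_huge_pmd()` helpers above route huge faults either to the anonymous THP code (no `vm_ops`) or to a driver-supplied `->pmd_fault()` callback. A hedged sketch of what wiring that callback looks like from a driver's side; `my_pmd_fault` and `my_vm_ops` are hypothetical, and the handler simply declines the huge fault.

```c
#include <linux/mm.h>
#include <linux/huge_mm.h>

/* Hypothetical handler: a driver that cannot serve a huge fault asks for PTE fallback. */
static int my_pmd_fault(struct vm_area_struct *vma, unsigned long address,
			pmd_t *pmd, unsigned int flags)
{
	return VM_FAULT_FALLBACK;	/* fault path then retries with small pages */
}

static const struct vm_operations_struct my_vm_ops = {
	.pmd_fault = my_pmd_fault,	/* consulted by create_huge_pmd()/wp_huge_pmd() above */
};
```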
diff --git a/mm/mempolicy.c b/mm/mempolicy.c index a7f1e0d1d6b8..87a177917cb2 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c | |||
| @@ -608,9 +608,6 @@ static int queue_pages_test_walk(unsigned long start, unsigned long end, | |||
| 608 | 608 | ||
| 609 | qp->prev = vma; | 609 | qp->prev = vma; |
| 610 | 610 | ||
| 611 | if (vma->vm_flags & VM_PFNMAP) | ||
| 612 | return 1; | ||
| 613 | |||
| 614 | if (flags & MPOL_MF_LAZY) { | 611 | if (flags & MPOL_MF_LAZY) { |
| 615 | /* Similar to task_numa_work, skip inaccessible VMAs */ | 612 | /* Similar to task_numa_work, skip inaccessible VMAs */ |
| 616 | if (vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)) | 613 | if (vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)) |
| @@ -945,7 +942,7 @@ static struct page *new_node_page(struct page *page, unsigned long node, int **x | |||
| 945 | return alloc_huge_page_node(page_hstate(compound_head(page)), | 942 | return alloc_huge_page_node(page_hstate(compound_head(page)), |
| 946 | node); | 943 | node); |
| 947 | else | 944 | else |
| 948 | return alloc_pages_exact_node(node, GFP_HIGHUSER_MOVABLE | | 945 | return __alloc_pages_node(node, GFP_HIGHUSER_MOVABLE | |
| 949 | __GFP_THISNODE, 0); | 946 | __GFP_THISNODE, 0); |
| 950 | } | 947 | } |
| 951 | 948 | ||
| @@ -2001,7 +1998,7 @@ retry_cpuset: | |||
| 2001 | nmask = policy_nodemask(gfp, pol); | 1998 | nmask = policy_nodemask(gfp, pol); |
| 2002 | if (!nmask || node_isset(hpage_node, *nmask)) { | 1999 | if (!nmask || node_isset(hpage_node, *nmask)) { |
| 2003 | mpol_cond_put(pol); | 2000 | mpol_cond_put(pol); |
| 2004 | page = alloc_pages_exact_node(hpage_node, | 2001 | page = __alloc_pages_node(hpage_node, |
| 2005 | gfp | __GFP_THISNODE, order); | 2002 | gfp | __GFP_THISNODE, order); |
| 2006 | goto out; | 2003 | goto out; |
| 2007 | } | 2004 | } |
diff --git a/mm/mempool.c b/mm/mempool.c index 2cc08de8b1db..4c533bc51d73 100644 --- a/mm/mempool.c +++ b/mm/mempool.c | |||
| @@ -150,6 +150,9 @@ static void *remove_element(mempool_t *pool) | |||
| 150 | */ | 150 | */ |
| 151 | void mempool_destroy(mempool_t *pool) | 151 | void mempool_destroy(mempool_t *pool) |
| 152 | { | 152 | { |
| 153 | if (unlikely(!pool)) | ||
| 154 | return; | ||
| 155 | |||
| 153 | while (pool->curr_nr) { | 156 | while (pool->curr_nr) { |
| 154 | void *element = remove_element(pool); | 157 | void *element = remove_element(pool); |
| 155 | pool->free(element, pool->pool_data); | 158 | pool->free(element, pool->pool_data); |
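With the NULL check folded into `mempool_destroy()` above, error and teardown paths can call it unconditionally, in the same style as `kfree()`. A sketch under that assumption; the surrounding structure is hypothetical.

```c
#include <linux/mempool.h>

struct demo_ctx {			/* hypothetical owner of a pool */
	mempool_t *pool;
};

static void demo_teardown(struct demo_ctx *ctx)
{
	/* No "if (ctx->pool)" guard is needed any more. */
	mempool_destroy(ctx->pool);
	ctx->pool = NULL;
}
```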
diff --git a/mm/memtest.c b/mm/memtest.c index 0a1cc133f6d7..8eaa4c3a5f65 100644 --- a/mm/memtest.c +++ b/mm/memtest.c | |||
| @@ -1,11 +1,6 @@ | |||
| 1 | #include <linux/kernel.h> | 1 | #include <linux/kernel.h> |
| 2 | #include <linux/errno.h> | ||
| 3 | #include <linux/string.h> | ||
| 4 | #include <linux/types.h> | 2 | #include <linux/types.h> |
| 5 | #include <linux/mm.h> | ||
| 6 | #include <linux/smp.h> | ||
| 7 | #include <linux/init.h> | 3 | #include <linux/init.h> |
| 8 | #include <linux/pfn.h> | ||
| 9 | #include <linux/memblock.h> | 4 | #include <linux/memblock.h> |
| 10 | 5 | ||
| 11 | static u64 patterns[] __initdata = { | 6 | static u64 patterns[] __initdata = { |
| @@ -31,10 +26,8 @@ static u64 patterns[] __initdata = { | |||
| 31 | 26 | ||
| 32 | static void __init reserve_bad_mem(u64 pattern, phys_addr_t start_bad, phys_addr_t end_bad) | 27 | static void __init reserve_bad_mem(u64 pattern, phys_addr_t start_bad, phys_addr_t end_bad) |
| 33 | { | 28 | { |
| 34 | printk(KERN_INFO " %016llx bad mem addr %010llx - %010llx reserved\n", | 29 | pr_info(" %016llx bad mem addr %pa - %pa reserved\n", |
| 35 | (unsigned long long) pattern, | 30 | cpu_to_be64(pattern), &start_bad, &end_bad); |
| 36 | (unsigned long long) start_bad, | ||
| 37 | (unsigned long long) end_bad); | ||
| 38 | memblock_reserve(start_bad, end_bad - start_bad); | 31 | memblock_reserve(start_bad, end_bad - start_bad); |
| 39 | } | 32 | } |
| 40 | 33 | ||
| @@ -79,26 +72,26 @@ static void __init do_one_pass(u64 pattern, phys_addr_t start, phys_addr_t end) | |||
| 79 | this_start = clamp(this_start, start, end); | 72 | this_start = clamp(this_start, start, end); |
| 80 | this_end = clamp(this_end, start, end); | 73 | this_end = clamp(this_end, start, end); |
| 81 | if (this_start < this_end) { | 74 | if (this_start < this_end) { |
| 82 | printk(KERN_INFO " %010llx - %010llx pattern %016llx\n", | 75 | pr_info(" %pa - %pa pattern %016llx\n", |
| 83 | (unsigned long long)this_start, | 76 | &this_start, &this_end, cpu_to_be64(pattern)); |
| 84 | (unsigned long long)this_end, | ||
| 85 | (unsigned long long)cpu_to_be64(pattern)); | ||
| 86 | memtest(pattern, this_start, this_end - this_start); | 77 | memtest(pattern, this_start, this_end - this_start); |
| 87 | } | 78 | } |
| 88 | } | 79 | } |
| 89 | } | 80 | } |
| 90 | 81 | ||
| 91 | /* default is disabled */ | 82 | /* default is disabled */ |
| 92 | static int memtest_pattern __initdata; | 83 | static unsigned int memtest_pattern __initdata; |
| 93 | 84 | ||
| 94 | static int __init parse_memtest(char *arg) | 85 | static int __init parse_memtest(char *arg) |
| 95 | { | 86 | { |
| 87 | int ret = 0; | ||
| 88 | |||
| 96 | if (arg) | 89 | if (arg) |
| 97 | memtest_pattern = simple_strtoul(arg, NULL, 0); | 90 | ret = kstrtouint(arg, 0, &memtest_pattern); |
| 98 | else | 91 | else |
| 99 | memtest_pattern = ARRAY_SIZE(patterns); | 92 | memtest_pattern = ARRAY_SIZE(patterns); |
| 100 | 93 | ||
| 101 | return 0; | 94 | return ret; |
| 102 | } | 95 | } |
| 103 | 96 | ||
| 104 | early_param("memtest", parse_memtest); | 97 | early_param("memtest", parse_memtest); |
| @@ -111,7 +104,7 @@ void __init early_memtest(phys_addr_t start, phys_addr_t end) | |||
| 111 | if (!memtest_pattern) | 104 | if (!memtest_pattern) |
| 112 | return; | 105 | return; |
| 113 | 106 | ||
| 114 | printk(KERN_INFO "early_memtest: # of tests: %d\n", memtest_pattern); | 107 | pr_info("early_memtest: # of tests: %u\n", memtest_pattern); |
| 115 | for (i = memtest_pattern-1; i < UINT_MAX; --i) { | 108 | for (i = memtest_pattern-1; i < UINT_MAX; --i) { |
| 116 | idx = i % ARRAY_SIZE(patterns); | 109 | idx = i % ARRAY_SIZE(patterns); |
| 117 | do_one_pass(patterns[idx], start, end); | 110 | do_one_pass(patterns[idx], start, end); |
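The memtest change above swaps `simple_strtoul()` for `kstrtouint()` and propagates the parse error to the early-param core. The same pattern, reduced to a hypothetical knob (`demo=` is not a real kernel parameter, and the no-argument behaviour here differs from the real `memtest=` handler):

```c
#include <linux/kernel.h>
#include <linux/init.h>

static unsigned int demo_passes __initdata;	/* hypothetical tunable */

static int __init parse_demo(char *arg)
{
	/* kstrtouint() rejects malformed or overflowing values outright. */
	return arg ? kstrtouint(arg, 0, &demo_passes) : -EINVAL;
}
early_param("demo", parse_demo);		/* e.g. boot with demo=4 */
```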
diff --git a/mm/migrate.c b/mm/migrate.c index 5c08cab5419e..02ce25df16c2 100644 --- a/mm/migrate.c +++ b/mm/migrate.c | |||
| @@ -880,8 +880,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage, | |||
| 880 | /* Establish migration ptes or remove ptes */ | 880 | /* Establish migration ptes or remove ptes */ |
| 881 | if (page_mapped(page)) { | 881 | if (page_mapped(page)) { |
| 882 | try_to_unmap(page, | 882 | try_to_unmap(page, |
| 883 | TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS| | 883 | TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS); |
| 884 | TTU_IGNORE_HWPOISON); | ||
| 885 | page_was_mapped = 1; | 884 | page_was_mapped = 1; |
| 886 | } | 885 | } |
| 887 | 886 | ||
| @@ -952,9 +951,11 @@ out: | |||
| 952 | dec_zone_page_state(page, NR_ISOLATED_ANON + | 951 | dec_zone_page_state(page, NR_ISOLATED_ANON + |
| 953 | page_is_file_cache(page)); | 952 | page_is_file_cache(page)); |
| 954 | /* Soft-offlined page shouldn't go through lru cache list */ | 953 | /* Soft-offlined page shouldn't go through lru cache list */ |
| 955 | if (reason == MR_MEMORY_FAILURE) | 954 | if (reason == MR_MEMORY_FAILURE) { |
| 956 | put_page(page); | 955 | put_page(page); |
| 957 | else | 956 | if (!test_set_page_hwpoison(page)) |
| 957 | num_poisoned_pages_inc(); | ||
| 958 | } else | ||
| 958 | putback_lru_page(page); | 959 | putback_lru_page(page); |
| 959 | } | 960 | } |
| 960 | 961 | ||
| @@ -1194,7 +1195,7 @@ static struct page *new_page_node(struct page *p, unsigned long private, | |||
| 1194 | return alloc_huge_page_node(page_hstate(compound_head(p)), | 1195 | return alloc_huge_page_node(page_hstate(compound_head(p)), |
| 1195 | pm->node); | 1196 | pm->node); |
| 1196 | else | 1197 | else |
| 1197 | return alloc_pages_exact_node(pm->node, | 1198 | return __alloc_pages_node(pm->node, |
| 1198 | GFP_HIGHUSER_MOVABLE | __GFP_THISNODE, 0); | 1199 | GFP_HIGHUSER_MOVABLE | __GFP_THISNODE, 0); |
| 1199 | } | 1200 | } |
| 1200 | 1201 | ||
| @@ -1554,7 +1555,7 @@ static struct page *alloc_misplaced_dst_page(struct page *page, | |||
| 1554 | int nid = (int) data; | 1555 | int nid = (int) data; |
| 1555 | struct page *newpage; | 1556 | struct page *newpage; |
| 1556 | 1557 | ||
| 1557 | newpage = alloc_pages_exact_node(nid, | 1558 | newpage = __alloc_pages_node(nid, |
| 1558 | (GFP_HIGHUSER_MOVABLE | | 1559 | (GFP_HIGHUSER_MOVABLE | |
| 1559 | __GFP_THISNODE | __GFP_NOMEMALLOC | | 1560 | __GFP_THISNODE | __GFP_NOMEMALLOC | |
| 1560 | __GFP_NORETRY | __GFP_NOWARN) & | 1561 | __GFP_NORETRY | __GFP_NOWARN) & |
diff --git a/mm/mmap.c b/mm/mmap.c --- a/mm/mmap.c +++ b/mm/mmap.c | |||
| @@ -2455,7 +2455,7 @@ static int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 2455 | unsigned long addr, int new_below) | 2455 | unsigned long addr, int new_below) |
| 2456 | { | 2456 | { |
| 2457 | struct vm_area_struct *new; | 2457 | struct vm_area_struct *new; |
| 2458 | int err = -ENOMEM; | 2458 | int err; |
| 2459 | 2459 | ||
| 2460 | if (is_vm_hugetlb_page(vma) && (addr & | 2460 | if (is_vm_hugetlb_page(vma) && (addr & |
| 2461 | ~(huge_page_mask(hstate_vma(vma))))) | 2461 | ~(huge_page_mask(hstate_vma(vma))))) |
| @@ -2463,7 +2463,7 @@ static int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 2463 | 2463 | ||
| 2464 | new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); | 2464 | new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); |
| 2465 | if (!new) | 2465 | if (!new) |
| 2466 | goto out_err; | 2466 | return -ENOMEM; |
| 2467 | 2467 | ||
| 2468 | /* most fields are the same, copy all, and then fixup */ | 2468 | /* most fields are the same, copy all, and then fixup */ |
| 2469 | *new = *vma; | 2469 | *new = *vma; |
| @@ -2511,7 +2511,6 @@ static int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 2511 | mpol_put(vma_policy(new)); | 2511 | mpol_put(vma_policy(new)); |
| 2512 | out_free_vma: | 2512 | out_free_vma: |
| 2513 | kmem_cache_free(vm_area_cachep, new); | 2513 | kmem_cache_free(vm_area_cachep, new); |
| 2514 | out_err: | ||
| 2515 | return err; | 2514 | return err; |
| 2516 | } | 2515 | } |
| 2517 | 2516 | ||
| @@ -2872,6 +2871,13 @@ int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma) | |||
| 2872 | struct vm_area_struct *prev; | 2871 | struct vm_area_struct *prev; |
| 2873 | struct rb_node **rb_link, *rb_parent; | 2872 | struct rb_node **rb_link, *rb_parent; |
| 2874 | 2873 | ||
| 2874 | if (find_vma_links(mm, vma->vm_start, vma->vm_end, | ||
| 2875 | &prev, &rb_link, &rb_parent)) | ||
| 2876 | return -ENOMEM; | ||
| 2877 | if ((vma->vm_flags & VM_ACCOUNT) && | ||
| 2878 | security_vm_enough_memory_mm(mm, vma_pages(vma))) | ||
| 2879 | return -ENOMEM; | ||
| 2880 | |||
| 2875 | /* | 2881 | /* |
| 2876 | * The vm_pgoff of a purely anonymous vma should be irrelevant | 2882 | * The vm_pgoff of a purely anonymous vma should be irrelevant |
| 2877 | * until its first write fault, when page's anon_vma and index | 2883 | * until its first write fault, when page's anon_vma and index |
| @@ -2884,16 +2890,10 @@ int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma) | |||
| 2884 | * using the existing file pgoff checks and manipulations. | 2890 | * using the existing file pgoff checks and manipulations. |
| 2885 | * Similarly in do_mmap_pgoff and in do_brk. | 2891 | * Similarly in do_mmap_pgoff and in do_brk. |
| 2886 | */ | 2892 | */ |
| 2887 | if (!vma->vm_file) { | 2893 | if (vma_is_anonymous(vma)) { |
| 2888 | BUG_ON(vma->anon_vma); | 2894 | BUG_ON(vma->anon_vma); |
| 2889 | vma->vm_pgoff = vma->vm_start >> PAGE_SHIFT; | 2895 | vma->vm_pgoff = vma->vm_start >> PAGE_SHIFT; |
| 2890 | } | 2896 | } |
| 2891 | if (find_vma_links(mm, vma->vm_start, vma->vm_end, | ||
| 2892 | &prev, &rb_link, &rb_parent)) | ||
| 2893 | return -ENOMEM; | ||
| 2894 | if ((vma->vm_flags & VM_ACCOUNT) && | ||
| 2895 | security_vm_enough_memory_mm(mm, vma_pages(vma))) | ||
| 2896 | return -ENOMEM; | ||
| 2897 | 2897 | ||
| 2898 | vma_link(mm, vma, prev, rb_link, rb_parent); | 2898 | vma_link(mm, vma, prev, rb_link, rb_parent); |
| 2899 | return 0; | 2899 | return 0; |
| @@ -2918,7 +2918,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, | |||
| 2918 | * If anonymous vma has not yet been faulted, update new pgoff | 2918 | * If anonymous vma has not yet been faulted, update new pgoff |
| 2919 | * to match new location, to increase its chance of merging. | 2919 | * to match new location, to increase its chance of merging. |
| 2920 | */ | 2920 | */ |
| 2921 | if (unlikely(!vma->vm_file && !vma->anon_vma)) { | 2921 | if (unlikely(vma_is_anonymous(vma) && !vma->anon_vma)) { |
| 2922 | pgoff = addr >> PAGE_SHIFT; | 2922 | pgoff = addr >> PAGE_SHIFT; |
| 2923 | faulted_in_anon_vma = false; | 2923 | faulted_in_anon_vma = false; |
| 2924 | } | 2924 | } |
| @@ -2952,30 +2952,31 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, | |||
| 2952 | *need_rmap_locks = (new_vma->vm_pgoff <= vma->vm_pgoff); | 2952 | *need_rmap_locks = (new_vma->vm_pgoff <= vma->vm_pgoff); |
| 2953 | } else { | 2953 | } else { |
| 2954 | new_vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); | 2954 | new_vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); |
| 2955 | if (new_vma) { | 2955 | if (!new_vma) |
| 2956 | *new_vma = *vma; | 2956 | goto out; |
| 2957 | new_vma->vm_start = addr; | 2957 | *new_vma = *vma; |
| 2958 | new_vma->vm_end = addr + len; | 2958 | new_vma->vm_start = addr; |
| 2959 | new_vma->vm_pgoff = pgoff; | 2959 | new_vma->vm_end = addr + len; |
| 2960 | if (vma_dup_policy(vma, new_vma)) | 2960 | new_vma->vm_pgoff = pgoff; |
| 2961 | goto out_free_vma; | 2961 | if (vma_dup_policy(vma, new_vma)) |
| 2962 | INIT_LIST_HEAD(&new_vma->anon_vma_chain); | 2962 | goto out_free_vma; |
| 2963 | if (anon_vma_clone(new_vma, vma)) | 2963 | INIT_LIST_HEAD(&new_vma->anon_vma_chain); |
| 2964 | goto out_free_mempol; | 2964 | if (anon_vma_clone(new_vma, vma)) |
| 2965 | if (new_vma->vm_file) | 2965 | goto out_free_mempol; |
| 2966 | get_file(new_vma->vm_file); | 2966 | if (new_vma->vm_file) |
| 2967 | if (new_vma->vm_ops && new_vma->vm_ops->open) | 2967 | get_file(new_vma->vm_file); |
| 2968 | new_vma->vm_ops->open(new_vma); | 2968 | if (new_vma->vm_ops && new_vma->vm_ops->open) |
| 2969 | vma_link(mm, new_vma, prev, rb_link, rb_parent); | 2969 | new_vma->vm_ops->open(new_vma); |
| 2970 | *need_rmap_locks = false; | 2970 | vma_link(mm, new_vma, prev, rb_link, rb_parent); |
| 2971 | } | 2971 | *need_rmap_locks = false; |
| 2972 | } | 2972 | } |
| 2973 | return new_vma; | 2973 | return new_vma; |
| 2974 | 2974 | ||
| 2975 | out_free_mempol: | 2975 | out_free_mempol: |
| 2976 | mpol_put(vma_policy(new_vma)); | 2976 | mpol_put(vma_policy(new_vma)); |
| 2977 | out_free_vma: | 2977 | out_free_vma: |
| 2978 | kmem_cache_free(vm_area_cachep, new_vma); | 2978 | kmem_cache_free(vm_area_cachep, new_vma); |
| 2979 | out: | ||
| 2979 | return NULL; | 2980 | return NULL; |
| 2980 | } | 2981 | } |
| 2981 | 2982 | ||
| @@ -3027,21 +3028,13 @@ static int special_mapping_fault(struct vm_area_struct *vma, | |||
| 3027 | pgoff_t pgoff; | 3028 | pgoff_t pgoff; |
| 3028 | struct page **pages; | 3029 | struct page **pages; |
| 3029 | 3030 | ||
| 3030 | /* | ||
| 3031 | * special mappings have no vm_file, and in that case, the mm | ||
| 3032 | * uses vm_pgoff internally. So we have to subtract it from here. | ||
| 3033 | * We are allowed to do this because we are the mm; do not copy | ||
| 3034 | * this code into drivers! | ||
| 3035 | */ | ||
| 3036 | pgoff = vmf->pgoff - vma->vm_pgoff; | ||
| 3037 | |||
| 3038 | if (vma->vm_ops == &legacy_special_mapping_vmops) | 3031 | if (vma->vm_ops == &legacy_special_mapping_vmops) |
| 3039 | pages = vma->vm_private_data; | 3032 | pages = vma->vm_private_data; |
| 3040 | else | 3033 | else |
| 3041 | pages = ((struct vm_special_mapping *)vma->vm_private_data)-> | 3034 | pages = ((struct vm_special_mapping *)vma->vm_private_data)-> |
| 3042 | pages; | 3035 | pages; |
| 3043 | 3036 | ||
| 3044 | for (; pgoff && *pages; ++pages) | 3037 | for (pgoff = vmf->pgoff; pgoff && *pages; ++pages) |
| 3045 | pgoff--; | 3038 | pgoff--; |
| 3046 | 3039 | ||
| 3047 | if (*pages) { | 3040 | if (*pages) { |
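The mmap.c hunks above replace open-coded `!vma->vm_file` tests with `vma_is_anonymous()`, which (as the memory.c hunks in this merge suggest) is keyed on `vma->vm_ops` rather than on the backing file. A sketch of the check in isolation; the wrapper is hypothetical and mirrors the `copy_vma()` condition above.

```c
#include <linux/mm.h>

/* Hypothetical predicate: a purely anonymous VMA that has not faulted yet. */
static bool needs_anon_pgoff_fixup(struct vm_area_struct *vma)
{
	/* Such VMAs derive vm_pgoff from vm_start, as insert_vm_struct() does above. */
	return vma_is_anonymous(vma) && !vma->anon_vma;
}
```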
diff --git a/mm/oom_kill.c b/mm/oom_kill.c index dff991e0681e..1ecc0bcaecc5 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c | |||
| @@ -196,27 +196,26 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg, | |||
| 196 | * Determine the type of allocation constraint. | 196 | * Determine the type of allocation constraint. |
| 197 | */ | 197 | */ |
| 198 | #ifdef CONFIG_NUMA | 198 | #ifdef CONFIG_NUMA |
| 199 | static enum oom_constraint constrained_alloc(struct zonelist *zonelist, | 199 | static enum oom_constraint constrained_alloc(struct oom_control *oc, |
| 200 | gfp_t gfp_mask, nodemask_t *nodemask, | 200 | unsigned long *totalpages) |
| 201 | unsigned long *totalpages) | ||
| 202 | { | 201 | { |
| 203 | struct zone *zone; | 202 | struct zone *zone; |
| 204 | struct zoneref *z; | 203 | struct zoneref *z; |
| 205 | enum zone_type high_zoneidx = gfp_zone(gfp_mask); | 204 | enum zone_type high_zoneidx = gfp_zone(oc->gfp_mask); |
| 206 | bool cpuset_limited = false; | 205 | bool cpuset_limited = false; |
| 207 | int nid; | 206 | int nid; |
| 208 | 207 | ||
| 209 | /* Default to all available memory */ | 208 | /* Default to all available memory */ |
| 210 | *totalpages = totalram_pages + total_swap_pages; | 209 | *totalpages = totalram_pages + total_swap_pages; |
| 211 | 210 | ||
| 212 | if (!zonelist) | 211 | if (!oc->zonelist) |
| 213 | return CONSTRAINT_NONE; | 212 | return CONSTRAINT_NONE; |
| 214 | /* | 213 | /* |
| 215 | * Reach here only when __GFP_NOFAIL is used. So, we should avoid | 214 | * Reach here only when __GFP_NOFAIL is used. So, we should avoid |
| 216 | * to kill current. We have to do a random task kill in this case. | 215 | * to kill current. We have to do a random task kill in this case. |
| 217 | * Hopefully, CONSTRAINT_THISNODE...but no way to handle it, now. | 216 | * Hopefully, CONSTRAINT_THISNODE...but no way to handle it, now. |
| 218 | */ | 217 | */ |
| 219 | if (gfp_mask & __GFP_THISNODE) | 218 | if (oc->gfp_mask & __GFP_THISNODE) |
| 220 | return CONSTRAINT_NONE; | 219 | return CONSTRAINT_NONE; |
| 221 | 220 | ||
| 222 | /* | 221 | /* |
| @@ -224,17 +223,18 @@ static enum oom_constraint constrained_alloc(struct zonelist *zonelist, | |||
| 224 | * the page allocator means a mempolicy is in effect. Cpuset policy | 223 | * the page allocator means a mempolicy is in effect. Cpuset policy |
| 225 | * is enforced in get_page_from_freelist(). | 224 | * is enforced in get_page_from_freelist(). |
| 226 | */ | 225 | */ |
| 227 | if (nodemask && !nodes_subset(node_states[N_MEMORY], *nodemask)) { | 226 | if (oc->nodemask && |
| 227 | !nodes_subset(node_states[N_MEMORY], *oc->nodemask)) { | ||
| 228 | *totalpages = total_swap_pages; | 228 | *totalpages = total_swap_pages; |
| 229 | for_each_node_mask(nid, *nodemask) | 229 | for_each_node_mask(nid, *oc->nodemask) |
| 230 | *totalpages += node_spanned_pages(nid); | 230 | *totalpages += node_spanned_pages(nid); |
| 231 | return CONSTRAINT_MEMORY_POLICY; | 231 | return CONSTRAINT_MEMORY_POLICY; |
| 232 | } | 232 | } |
| 233 | 233 | ||
| 234 | /* Check this allocation failure is caused by cpuset's wall function */ | 234 | /* Check this allocation failure is caused by cpuset's wall function */ |
| 235 | for_each_zone_zonelist_nodemask(zone, z, zonelist, | 235 | for_each_zone_zonelist_nodemask(zone, z, oc->zonelist, |
| 236 | high_zoneidx, nodemask) | 236 | high_zoneidx, oc->nodemask) |
| 237 | if (!cpuset_zone_allowed(zone, gfp_mask)) | 237 | if (!cpuset_zone_allowed(zone, oc->gfp_mask)) |
| 238 | cpuset_limited = true; | 238 | cpuset_limited = true; |
| 239 | 239 | ||
| 240 | if (cpuset_limited) { | 240 | if (cpuset_limited) { |
| @@ -246,20 +246,18 @@ static enum oom_constraint constrained_alloc(struct zonelist *zonelist, | |||
| 246 | return CONSTRAINT_NONE; | 246 | return CONSTRAINT_NONE; |
| 247 | } | 247 | } |
| 248 | #else | 248 | #else |
| 249 | static enum oom_constraint constrained_alloc(struct zonelist *zonelist, | 249 | static enum oom_constraint constrained_alloc(struct oom_control *oc, |
| 250 | gfp_t gfp_mask, nodemask_t *nodemask, | 250 | unsigned long *totalpages) |
| 251 | unsigned long *totalpages) | ||
| 252 | { | 251 | { |
| 253 | *totalpages = totalram_pages + total_swap_pages; | 252 | *totalpages = totalram_pages + total_swap_pages; |
| 254 | return CONSTRAINT_NONE; | 253 | return CONSTRAINT_NONE; |
| 255 | } | 254 | } |
| 256 | #endif | 255 | #endif |
| 257 | 256 | ||
| 258 | enum oom_scan_t oom_scan_process_thread(struct task_struct *task, | 257 | enum oom_scan_t oom_scan_process_thread(struct oom_control *oc, |
| 259 | unsigned long totalpages, const nodemask_t *nodemask, | 258 | struct task_struct *task, unsigned long totalpages) |
| 260 | bool force_kill) | ||
| 261 | { | 259 | { |
| 262 | if (oom_unkillable_task(task, NULL, nodemask)) | 260 | if (oom_unkillable_task(task, NULL, oc->nodemask)) |
| 263 | return OOM_SCAN_CONTINUE; | 261 | return OOM_SCAN_CONTINUE; |
| 264 | 262 | ||
| 265 | /* | 263 | /* |
| @@ -267,7 +265,7 @@ enum oom_scan_t oom_scan_process_thread(struct task_struct *task, | |||
| 267 | * Don't allow any other task to have access to the reserves. | 265 | * Don't allow any other task to have access to the reserves. |
| 268 | */ | 266 | */ |
| 269 | if (test_tsk_thread_flag(task, TIF_MEMDIE)) { | 267 | if (test_tsk_thread_flag(task, TIF_MEMDIE)) { |
| 270 | if (!force_kill) | 268 | if (oc->order != -1) |
| 271 | return OOM_SCAN_ABORT; | 269 | return OOM_SCAN_ABORT; |
| 272 | } | 270 | } |
| 273 | if (!task->mm) | 271 | if (!task->mm) |
| @@ -280,7 +278,7 @@ enum oom_scan_t oom_scan_process_thread(struct task_struct *task, | |||
| 280 | if (oom_task_origin(task)) | 278 | if (oom_task_origin(task)) |
| 281 | return OOM_SCAN_SELECT; | 279 | return OOM_SCAN_SELECT; |
| 282 | 280 | ||
| 283 | if (task_will_free_mem(task) && !force_kill) | 281 | if (task_will_free_mem(task) && oc->order != -1) |
| 284 | return OOM_SCAN_ABORT; | 282 | return OOM_SCAN_ABORT; |
| 285 | 283 | ||
| 286 | return OOM_SCAN_OK; | 284 | return OOM_SCAN_OK; |
| @@ -289,12 +287,9 @@ enum oom_scan_t oom_scan_process_thread(struct task_struct *task, | |||
| 289 | /* | 287 | /* |
| 290 | * Simple selection loop. We choose the process with the highest | 288 | * Simple selection loop. We choose the process with the highest |
| 291 | * number of 'points'. Returns -1 on scan abort. | 289 | * number of 'points'. Returns -1 on scan abort. |
| 292 | * | ||
| 293 | * (not docbooked, we don't want this one cluttering up the manual) | ||
| 294 | */ | 290 | */ |
| 295 | static struct task_struct *select_bad_process(unsigned int *ppoints, | 291 | static struct task_struct *select_bad_process(struct oom_control *oc, |
| 296 | unsigned long totalpages, const nodemask_t *nodemask, | 292 | unsigned int *ppoints, unsigned long totalpages) |
| 297 | bool force_kill) | ||
| 298 | { | 293 | { |
| 299 | struct task_struct *g, *p; | 294 | struct task_struct *g, *p; |
| 300 | struct task_struct *chosen = NULL; | 295 | struct task_struct *chosen = NULL; |
| @@ -304,8 +299,7 @@ static struct task_struct *select_bad_process(unsigned int *ppoints, | |||
| 304 | for_each_process_thread(g, p) { | 299 | for_each_process_thread(g, p) { |
| 305 | unsigned int points; | 300 | unsigned int points; |
| 306 | 301 | ||
| 307 | switch (oom_scan_process_thread(p, totalpages, nodemask, | 302 | switch (oom_scan_process_thread(oc, p, totalpages)) { |
| 308 | force_kill)) { | ||
| 309 | case OOM_SCAN_SELECT: | 303 | case OOM_SCAN_SELECT: |
| 310 | chosen = p; | 304 | chosen = p; |
| 311 | chosen_points = ULONG_MAX; | 305 | chosen_points = ULONG_MAX; |
| @@ -318,7 +312,7 @@ static struct task_struct *select_bad_process(unsigned int *ppoints, | |||
| 318 | case OOM_SCAN_OK: | 312 | case OOM_SCAN_OK: |
| 319 | break; | 313 | break; |
| 320 | }; | 314 | }; |
| 321 | points = oom_badness(p, NULL, nodemask, totalpages); | 315 | points = oom_badness(p, NULL, oc->nodemask, totalpages); |
| 322 | if (!points || points < chosen_points) | 316 | if (!points || points < chosen_points) |
| 323 | continue; | 317 | continue; |
| 324 | /* Prefer thread group leaders for display purposes */ | 318 | /* Prefer thread group leaders for display purposes */ |
| @@ -380,13 +374,13 @@ static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask) | |||
| 380 | rcu_read_unlock(); | 374 | rcu_read_unlock(); |
| 381 | } | 375 | } |
| 382 | 376 | ||
| 383 | static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order, | 377 | static void dump_header(struct oom_control *oc, struct task_struct *p, |
| 384 | struct mem_cgroup *memcg, const nodemask_t *nodemask) | 378 | struct mem_cgroup *memcg) |
| 385 | { | 379 | { |
| 386 | task_lock(current); | 380 | task_lock(current); |
| 387 | pr_warning("%s invoked oom-killer: gfp_mask=0x%x, order=%d, " | 381 | pr_warning("%s invoked oom-killer: gfp_mask=0x%x, order=%d, " |
| 388 | "oom_score_adj=%hd\n", | 382 | "oom_score_adj=%hd\n", |
| 389 | current->comm, gfp_mask, order, | 383 | current->comm, oc->gfp_mask, oc->order, |
| 390 | current->signal->oom_score_adj); | 384 | current->signal->oom_score_adj); |
| 391 | cpuset_print_task_mems_allowed(current); | 385 | cpuset_print_task_mems_allowed(current); |
| 392 | task_unlock(current); | 386 | task_unlock(current); |
| @@ -396,7 +390,7 @@ static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order, | |||
| 396 | else | 390 | else |
| 397 | show_mem(SHOW_MEM_FILTER_NODES); | 391 | show_mem(SHOW_MEM_FILTER_NODES); |
| 398 | if (sysctl_oom_dump_tasks) | 392 | if (sysctl_oom_dump_tasks) |
| 399 | dump_tasks(memcg, nodemask); | 393 | dump_tasks(memcg, oc->nodemask); |
| 400 | } | 394 | } |
| 401 | 395 | ||
| 402 | /* | 396 | /* |
| @@ -487,10 +481,9 @@ void oom_killer_enable(void) | |||
| 487 | * Must be called while holding a reference to p, which will be released upon | 481 | * Must be called while holding a reference to p, which will be released upon |
| 488 | * returning. | 482 | * returning. |
| 489 | */ | 483 | */ |
| 490 | void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, | 484 | void oom_kill_process(struct oom_control *oc, struct task_struct *p, |
| 491 | unsigned int points, unsigned long totalpages, | 485 | unsigned int points, unsigned long totalpages, |
| 492 | struct mem_cgroup *memcg, nodemask_t *nodemask, | 486 | struct mem_cgroup *memcg, const char *message) |
| 493 | const char *message) | ||
| 494 | { | 487 | { |
| 495 | struct task_struct *victim = p; | 488 | struct task_struct *victim = p; |
| 496 | struct task_struct *child; | 489 | struct task_struct *child; |
| @@ -514,7 +507,7 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, | |||
| 514 | task_unlock(p); | 507 | task_unlock(p); |
| 515 | 508 | ||
| 516 | if (__ratelimit(&oom_rs)) | 509 | if (__ratelimit(&oom_rs)) |
| 517 | dump_header(p, gfp_mask, order, memcg, nodemask); | 510 | dump_header(oc, p, memcg); |
| 518 | 511 | ||
| 519 | task_lock(p); | 512 | task_lock(p); |
| 520 | pr_err("%s: Kill process %d (%s) score %u or sacrifice child\n", | 513 | pr_err("%s: Kill process %d (%s) score %u or sacrifice child\n", |
| @@ -537,7 +530,7 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, | |||
| 537 | /* | 530 | /* |
| 538 | * oom_badness() returns 0 if the thread is unkillable | 531 | * oom_badness() returns 0 if the thread is unkillable |
| 539 | */ | 532 | */ |
| 540 | child_points = oom_badness(child, memcg, nodemask, | 533 | child_points = oom_badness(child, memcg, oc->nodemask, |
| 541 | totalpages); | 534 | totalpages); |
| 542 | if (child_points > victim_points) { | 535 | if (child_points > victim_points) { |
| 543 | put_task_struct(victim); | 536 | put_task_struct(victim); |
| @@ -600,8 +593,7 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, | |||
| 600 | /* | 593 | /* |
| 601 | * Determines whether the kernel must panic because of the panic_on_oom sysctl. | 594 | * Determines whether the kernel must panic because of the panic_on_oom sysctl. |
| 602 | */ | 595 | */ |
| 603 | void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask, | 596 | void check_panic_on_oom(struct oom_control *oc, enum oom_constraint constraint, |
| 604 | int order, const nodemask_t *nodemask, | ||
| 605 | struct mem_cgroup *memcg) | 597 | struct mem_cgroup *memcg) |
| 606 | { | 598 | { |
| 607 | if (likely(!sysctl_panic_on_oom)) | 599 | if (likely(!sysctl_panic_on_oom)) |
| @@ -615,7 +607,10 @@ void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask, | |||
| 615 | if (constraint != CONSTRAINT_NONE) | 607 | if (constraint != CONSTRAINT_NONE) |
| 616 | return; | 608 | return; |
| 617 | } | 609 | } |
| 618 | dump_header(NULL, gfp_mask, order, memcg, nodemask); | 610 | /* Do not panic for oom kills triggered by sysrq */ |
| 611 | if (oc->order == -1) | ||
| 612 | return; | ||
| 613 | dump_header(oc, NULL, memcg); | ||
| 619 | panic("Out of memory: %s panic_on_oom is enabled\n", | 614 | panic("Out of memory: %s panic_on_oom is enabled\n", |
| 620 | sysctl_panic_on_oom == 2 ? "compulsory" : "system-wide"); | 615 | sysctl_panic_on_oom == 2 ? "compulsory" : "system-wide"); |
| 621 | } | 616 | } |
| @@ -635,28 +630,21 @@ int unregister_oom_notifier(struct notifier_block *nb) | |||
| 635 | EXPORT_SYMBOL_GPL(unregister_oom_notifier); | 630 | EXPORT_SYMBOL_GPL(unregister_oom_notifier); |
| 636 | 631 | ||
| 637 | /** | 632 | /** |
| 638 | * __out_of_memory - kill the "best" process when we run out of memory | 633 | * out_of_memory - kill the "best" process when we run out of memory |
| 639 | * @zonelist: zonelist pointer | 634 | * @oc: pointer to struct oom_control |
| 640 | * @gfp_mask: memory allocation flags | ||
| 641 | * @order: amount of memory being requested as a power of 2 | ||
| 642 | * @nodemask: nodemask passed to page allocator | ||
| 643 | * @force_kill: true if a task must be killed, even if others are exiting | ||
| 644 | * | 635 | * |
| 645 | * If we run out of memory, we have the choice between either | 636 | * If we run out of memory, we have the choice between either |
| 646 | * killing a random task (bad), letting the system crash (worse) | 637 | * killing a random task (bad), letting the system crash (worse) |
| 647 | * OR try to be smart about which process to kill. Note that we | 638 | * OR try to be smart about which process to kill. Note that we |
| 648 | * don't have to be perfect here, we just have to be good. | 639 | * don't have to be perfect here, we just have to be good. |
| 649 | */ | 640 | */ |
| 650 | bool out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, | 641 | bool out_of_memory(struct oom_control *oc) |
| 651 | int order, nodemask_t *nodemask, bool force_kill) | ||
| 652 | { | 642 | { |
| 653 | const nodemask_t *mpol_mask; | ||
| 654 | struct task_struct *p; | 643 | struct task_struct *p; |
| 655 | unsigned long totalpages; | 644 | unsigned long totalpages; |
| 656 | unsigned long freed = 0; | 645 | unsigned long freed = 0; |
| 657 | unsigned int uninitialized_var(points); | 646 | unsigned int uninitialized_var(points); |
| 658 | enum oom_constraint constraint = CONSTRAINT_NONE; | 647 | enum oom_constraint constraint = CONSTRAINT_NONE; |
| 659 | int killed = 0; | ||
| 660 | 648 | ||
| 661 | if (oom_killer_disabled) | 649 | if (oom_killer_disabled) |
| 662 | return false; | 650 | return false; |
| @@ -664,7 +652,7 @@ bool out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, | |||
| 664 | blocking_notifier_call_chain(&oom_notify_list, 0, &freed); | 652 | blocking_notifier_call_chain(&oom_notify_list, 0, &freed); |
| 665 | if (freed > 0) | 653 | if (freed > 0) |
| 666 | /* Got some memory back in the last second. */ | 654 | /* Got some memory back in the last second. */ |
| 667 | goto out; | 655 | return true; |
| 668 | 656 | ||
| 669 | /* | 657 | /* |
| 670 | * If current has a pending SIGKILL or is exiting, then automatically | 658 | * If current has a pending SIGKILL or is exiting, then automatically |
| @@ -677,47 +665,42 @@ bool out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, | |||
| 677 | if (current->mm && | 665 | if (current->mm && |
| 678 | (fatal_signal_pending(current) || task_will_free_mem(current))) { | 666 | (fatal_signal_pending(current) || task_will_free_mem(current))) { |
| 679 | mark_oom_victim(current); | 667 | mark_oom_victim(current); |
| 680 | goto out; | 668 | return true; |
| 681 | } | 669 | } |
| 682 | 670 | ||
| 683 | /* | 671 | /* |
| 684 | * Check if there were limitations on the allocation (only relevant for | 672 | * Check if there were limitations on the allocation (only relevant for |
| 685 | * NUMA) that may require different handling. | 673 | * NUMA) that may require different handling. |
| 686 | */ | 674 | */ |
| 687 | constraint = constrained_alloc(zonelist, gfp_mask, nodemask, | 675 | constraint = constrained_alloc(oc, &totalpages); |
| 688 | &totalpages); | 676 | if (constraint != CONSTRAINT_MEMORY_POLICY) |
| 689 | mpol_mask = (constraint == CONSTRAINT_MEMORY_POLICY) ? nodemask : NULL; | 677 | oc->nodemask = NULL; |
| 690 | check_panic_on_oom(constraint, gfp_mask, order, mpol_mask, NULL); | 678 | check_panic_on_oom(oc, constraint, NULL); |
| 691 | 679 | ||
| 692 | if (sysctl_oom_kill_allocating_task && current->mm && | 680 | if (sysctl_oom_kill_allocating_task && current->mm && |
| 693 | !oom_unkillable_task(current, NULL, nodemask) && | 681 | !oom_unkillable_task(current, NULL, oc->nodemask) && |
| 694 | current->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) { | 682 | current->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) { |
| 695 | get_task_struct(current); | 683 | get_task_struct(current); |
| 696 | oom_kill_process(current, gfp_mask, order, 0, totalpages, NULL, | 684 | oom_kill_process(oc, current, 0, totalpages, NULL, |
| 697 | nodemask, | ||
| 698 | "Out of memory (oom_kill_allocating_task)"); | 685 | "Out of memory (oom_kill_allocating_task)"); |
| 699 | goto out; | 686 | return true; |
| 700 | } | 687 | } |
| 701 | 688 | ||
| 702 | p = select_bad_process(&points, totalpages, mpol_mask, force_kill); | 689 | p = select_bad_process(oc, &points, totalpages); |
| 703 | /* Found nothing?!?! Either we hang forever, or we panic. */ | 690 | /* Found nothing?!?! Either we hang forever, or we panic. */ |
| 704 | if (!p) { | 691 | if (!p && oc->order != -1) { |
| 705 | dump_header(NULL, gfp_mask, order, NULL, mpol_mask); | 692 | dump_header(oc, NULL, NULL); |
| 706 | panic("Out of memory and no killable processes...\n"); | 693 | panic("Out of memory and no killable processes...\n"); |
| 707 | } | 694 | } |
| 708 | if (p != (void *)-1UL) { | 695 | if (p && p != (void *)-1UL) { |
| 709 | oom_kill_process(p, gfp_mask, order, points, totalpages, NULL, | 696 | oom_kill_process(oc, p, points, totalpages, NULL, |
| 710 | nodemask, "Out of memory"); | 697 | "Out of memory"); |
| 711 | killed = 1; | 698 | /* |
| 712 | } | 699 | * Give the killed process a good chance to exit before trying |
| 713 | out: | 700 | * to allocate memory again. |
| 714 | /* | 701 | */ |
| 715 | * Give the killed threads a good chance of exiting before trying to | ||
| 716 | * allocate memory again. | ||
| 717 | */ | ||
| 718 | if (killed) | ||
| 719 | schedule_timeout_killable(1); | 702 | schedule_timeout_killable(1); |
| 720 | 703 | } | |
| 721 | return true; | 704 | return true; |
| 722 | } | 705 | } |
| 723 | 706 | ||
| @@ -728,13 +711,20 @@ out: | |||
| 728 | */ | 711 | */ |
| 729 | void pagefault_out_of_memory(void) | 712 | void pagefault_out_of_memory(void) |
| 730 | { | 713 | { |
| 714 | struct oom_control oc = { | ||
| 715 | .zonelist = NULL, | ||
| 716 | .nodemask = NULL, | ||
| 717 | .gfp_mask = 0, | ||
| 718 | .order = 0, | ||
| 719 | }; | ||
| 720 | |||
| 731 | if (mem_cgroup_oom_synchronize(true)) | 721 | if (mem_cgroup_oom_synchronize(true)) |
| 732 | return; | 722 | return; |
| 733 | 723 | ||
| 734 | if (!mutex_trylock(&oom_lock)) | 724 | if (!mutex_trylock(&oom_lock)) |
| 735 | return; | 725 | return; |
| 736 | 726 | ||
| 737 | if (!out_of_memory(NULL, 0, 0, NULL, false)) { | 727 | if (!out_of_memory(&oc)) { |
| 738 | /* | 728 | /* |
| 739 | * There shouldn't be any user tasks runnable while the | 729 | * There shouldn't be any user tasks runnable while the |
| 740 | * OOM killer is disabled, so the current task has to | 730 | * OOM killer is disabled, so the current task has to |
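
The oom_kill.c hunks above fold the old argument list (zonelist, gfp_mask, order, nodemask, force_kill) into a single `struct oom_control`, with `order == -1` taking over the role of the removed `force_kill` flag for sysrq-triggered kills. Below is a minimal userspace sketch of that refactoring pattern; the types here are simplified stand-ins, not the kernel definitions.

```c
#include <stdbool.h>
#include <stdio.h>

typedef unsigned int gfp_t;     /* stand-in for the kernel typedef */
struct zonelist;                /* left opaque in this sketch */
struct nodemask;                /* left opaque in this sketch */

/* One context object replaces the old five-argument call chains. */
struct oom_control {
	struct zonelist *zonelist;
	struct nodemask *nodemask;  /* constrained nodes, or NULL */
	gfp_t gfp_mask;             /* flags of the failing allocation */
	int order;                  /* -1 marks a sysrq-forced OOM kill */
};

static bool out_of_memory_sketch(struct oom_control *oc)
{
	/* oc->order == -1 plays the role of the removed force_kill flag */
	bool forced = (oc->order == -1);

	printf("gfp_mask=0x%x order=%d forced=%d\n",
	       oc->gfp_mask, oc->order, forced);
	return true;
}

int main(void)
{
	struct oom_control oc = {
		.zonelist = NULL,
		.nodemask = NULL,
		.gfp_mask = 0,
		.order = 0,
	};

	return out_of_memory_sketch(&oc) ? 0 : 1;
}
```

Every helper in the call chain then takes the same `oc` pointer, which is why the hunks above can drop parameters from oom_scan_process_thread(), select_bad_process(), dump_header(), oom_kill_process() and check_panic_on_oom() at once.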
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index b401d40cb4fd..48aaf7b9f253 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c | |||
| @@ -125,6 +125,24 @@ unsigned long dirty_balance_reserve __read_mostly; | |||
| 125 | int percpu_pagelist_fraction; | 125 | int percpu_pagelist_fraction; |
| 126 | gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK; | 126 | gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK; |
| 127 | 127 | ||
| 128 | /* | ||
| 129 | * A cached value of the page's pageblock's migratetype, used when the page is | ||
| 130 | * put on a pcplist. Used to avoid the pageblock migratetype lookup when | ||
| 131 | * freeing from pcplists in most cases, at the cost of possibly becoming stale. | ||
| 132 | * Also the migratetype set in the page does not necessarily match the pcplist | ||
| 133 | * index, e.g. page might have MIGRATE_CMA set but be on a pcplist with any | ||
| 134 | * other index - this ensures that it will be put on the correct CMA freelist. | ||
| 135 | */ | ||
| 136 | static inline int get_pcppage_migratetype(struct page *page) | ||
| 137 | { | ||
| 138 | return page->index; | ||
| 139 | } | ||
| 140 | |||
| 141 | static inline void set_pcppage_migratetype(struct page *page, int migratetype) | ||
| 142 | { | ||
| 143 | page->index = migratetype; | ||
| 144 | } | ||
| 145 | |||
| 128 | #ifdef CONFIG_PM_SLEEP | 146 | #ifdef CONFIG_PM_SLEEP |
| 129 | /* | 147 | /* |
| 130 | * The following functions are used by the suspend/hibernate code to temporarily | 148 | * The following functions are used by the suspend/hibernate code to temporarily |
| @@ -791,7 +809,11 @@ static void free_pcppages_bulk(struct zone *zone, int count, | |||
| 791 | page = list_entry(list->prev, struct page, lru); | 809 | page = list_entry(list->prev, struct page, lru); |
| 792 | /* must delete as __free_one_page list manipulates */ | 810 | /* must delete as __free_one_page list manipulates */ |
| 793 | list_del(&page->lru); | 811 | list_del(&page->lru); |
| 794 | mt = get_freepage_migratetype(page); | 812 | |
| 813 | mt = get_pcppage_migratetype(page); | ||
| 814 | /* MIGRATE_ISOLATE page should not go to pcplists */ | ||
| 815 | VM_BUG_ON_PAGE(is_migrate_isolate(mt), page); | ||
| 816 | /* Pageblock could have been isolated meanwhile */ | ||
| 795 | if (unlikely(has_isolate_pageblock(zone))) | 817 | if (unlikely(has_isolate_pageblock(zone))) |
| 796 | mt = get_pageblock_migratetype(page); | 818 | mt = get_pageblock_migratetype(page); |
| 797 | 819 | ||
| @@ -955,7 +977,6 @@ static void __free_pages_ok(struct page *page, unsigned int order) | |||
| 955 | migratetype = get_pfnblock_migratetype(page, pfn); | 977 | migratetype = get_pfnblock_migratetype(page, pfn); |
| 956 | local_irq_save(flags); | 978 | local_irq_save(flags); |
| 957 | __count_vm_events(PGFREE, 1 << order); | 979 | __count_vm_events(PGFREE, 1 << order); |
| 958 | set_freepage_migratetype(page, migratetype); | ||
| 959 | free_one_page(page_zone(page), page, pfn, order, migratetype); | 980 | free_one_page(page_zone(page), page, pfn, order, migratetype); |
| 960 | local_irq_restore(flags); | 981 | local_irq_restore(flags); |
| 961 | } | 982 | } |
| @@ -1383,7 +1404,7 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order, | |||
| 1383 | rmv_page_order(page); | 1404 | rmv_page_order(page); |
| 1384 | area->nr_free--; | 1405 | area->nr_free--; |
| 1385 | expand(zone, page, order, current_order, area, migratetype); | 1406 | expand(zone, page, order, current_order, area, migratetype); |
| 1386 | set_freepage_migratetype(page, migratetype); | 1407 | set_pcppage_migratetype(page, migratetype); |
| 1387 | return page; | 1408 | return page; |
| 1388 | } | 1409 | } |
| 1389 | 1410 | ||
| @@ -1460,7 +1481,6 @@ int move_freepages(struct zone *zone, | |||
| 1460 | order = page_order(page); | 1481 | order = page_order(page); |
| 1461 | list_move(&page->lru, | 1482 | list_move(&page->lru, |
| 1462 | &zone->free_area[order].free_list[migratetype]); | 1483 | &zone->free_area[order].free_list[migratetype]); |
| 1463 | set_freepage_migratetype(page, migratetype); | ||
| 1464 | page += 1 << order; | 1484 | page += 1 << order; |
| 1465 | pages_moved += 1 << order; | 1485 | pages_moved += 1 << order; |
| 1466 | } | 1486 | } |
| @@ -1630,14 +1650,13 @@ __rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype) | |||
| 1630 | expand(zone, page, order, current_order, area, | 1650 | expand(zone, page, order, current_order, area, |
| 1631 | start_migratetype); | 1651 | start_migratetype); |
| 1632 | /* | 1652 | /* |
| 1633 | * The freepage_migratetype may differ from pageblock's | 1653 | * The pcppage_migratetype may differ from pageblock's |
| 1634 | * migratetype depending on the decisions in | 1654 | * migratetype depending on the decisions in |
| 1635 | * try_to_steal_freepages(). This is OK as long as it | 1655 | * find_suitable_fallback(). This is OK as long as it does not |
| 1636 | * does not differ for MIGRATE_CMA pageblocks. For CMA | 1656 | * differ for MIGRATE_CMA pageblocks. Those can be used as |
| 1637 | * we need to make sure unallocated pages flushed from | 1657 | * fallback only via special __rmqueue_cma_fallback() function |
| 1638 | * pcp lists are returned to the correct freelist. | ||
| 1639 | */ | 1658 | */ |
| 1640 | set_freepage_migratetype(page, start_migratetype); | 1659 | set_pcppage_migratetype(page, start_migratetype); |
| 1641 | 1660 | ||
| 1642 | trace_mm_page_alloc_extfrag(page, order, current_order, | 1661 | trace_mm_page_alloc_extfrag(page, order, current_order, |
| 1643 | start_migratetype, fallback_mt); | 1662 | start_migratetype, fallback_mt); |
| @@ -1713,7 +1732,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, | |||
| 1713 | else | 1732 | else |
| 1714 | list_add_tail(&page->lru, list); | 1733 | list_add_tail(&page->lru, list); |
| 1715 | list = &page->lru; | 1734 | list = &page->lru; |
| 1716 | if (is_migrate_cma(get_freepage_migratetype(page))) | 1735 | if (is_migrate_cma(get_pcppage_migratetype(page))) |
| 1717 | __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, | 1736 | __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, |
| 1718 | -(1 << order)); | 1737 | -(1 << order)); |
| 1719 | } | 1738 | } |
| @@ -1910,7 +1929,7 @@ void free_hot_cold_page(struct page *page, bool cold) | |||
| 1910 | return; | 1929 | return; |
| 1911 | 1930 | ||
| 1912 | migratetype = get_pfnblock_migratetype(page, pfn); | 1931 | migratetype = get_pfnblock_migratetype(page, pfn); |
| 1913 | set_freepage_migratetype(page, migratetype); | 1932 | set_pcppage_migratetype(page, migratetype); |
| 1914 | local_irq_save(flags); | 1933 | local_irq_save(flags); |
| 1915 | __count_vm_event(PGFREE); | 1934 | __count_vm_event(PGFREE); |
| 1916 | 1935 | ||
| @@ -2115,7 +2134,7 @@ struct page *buffered_rmqueue(struct zone *preferred_zone, | |||
| 2115 | if (!page) | 2134 | if (!page) |
| 2116 | goto failed; | 2135 | goto failed; |
| 2117 | __mod_zone_freepage_state(zone, -(1 << order), | 2136 | __mod_zone_freepage_state(zone, -(1 << order), |
| 2118 | get_freepage_migratetype(page)); | 2137 | get_pcppage_migratetype(page)); |
| 2119 | } | 2138 | } |
| 2120 | 2139 | ||
| 2121 | __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order)); | 2140 | __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order)); |
| @@ -2696,6 +2715,12 @@ static inline struct page * | |||
| 2696 | __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, | 2715 | __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, |
| 2697 | const struct alloc_context *ac, unsigned long *did_some_progress) | 2716 | const struct alloc_context *ac, unsigned long *did_some_progress) |
| 2698 | { | 2717 | { |
| 2718 | struct oom_control oc = { | ||
| 2719 | .zonelist = ac->zonelist, | ||
| 2720 | .nodemask = ac->nodemask, | ||
| 2721 | .gfp_mask = gfp_mask, | ||
| 2722 | .order = order, | ||
| 2723 | }; | ||
| 2699 | struct page *page; | 2724 | struct page *page; |
| 2700 | 2725 | ||
| 2701 | *did_some_progress = 0; | 2726 | *did_some_progress = 0; |
| @@ -2747,8 +2772,7 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, | |||
| 2747 | goto out; | 2772 | goto out; |
| 2748 | } | 2773 | } |
| 2749 | /* Exhausted what can be done so it's blamo time */ | 2774 | /* Exhausted what can be done so it's blamo time */ |
| 2750 | if (out_of_memory(ac->zonelist, gfp_mask, order, ac->nodemask, false) | 2775 | if (out_of_memory(&oc) || WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL)) |
| 2751 | || WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL)) | ||
| 2752 | *did_some_progress = 1; | 2776 | *did_some_progress = 1; |
| 2753 | out: | 2777 | out: |
| 2754 | mutex_unlock(&oom_lock); | 2778 | mutex_unlock(&oom_lock); |
| @@ -3490,8 +3514,6 @@ EXPORT_SYMBOL(alloc_pages_exact); | |||
| 3490 | * | 3514 | * |
| 3491 | * Like alloc_pages_exact(), but try to allocate on node nid first before falling | 3515 | * Like alloc_pages_exact(), but try to allocate on node nid first before falling |
| 3492 | * back. | 3516 | * back. |
| 3493 | * Note this is not alloc_pages_exact_node() which allocates on a specific node, | ||
| 3494 | * but is not exact. | ||
| 3495 | */ | 3517 | */ |
| 3496 | void * __meminit alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask) | 3518 | void * __meminit alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask) |
| 3497 | { | 3519 | { |
| @@ -5066,7 +5088,7 @@ static unsigned long __meminit zone_spanned_pages_in_node(int nid, | |||
| 5066 | { | 5088 | { |
| 5067 | unsigned long zone_start_pfn, zone_end_pfn; | 5089 | unsigned long zone_start_pfn, zone_end_pfn; |
| 5068 | 5090 | ||
| 5069 | /* When hotadd a new node, the node should be empty */ | 5091 | /* When hotadd a new node from cpu_up(), the node should be empty */ |
| 5070 | if (!node_start_pfn && !node_end_pfn) | 5092 | if (!node_start_pfn && !node_end_pfn) |
| 5071 | return 0; | 5093 | return 0; |
| 5072 | 5094 | ||
| @@ -5133,7 +5155,7 @@ static unsigned long __meminit zone_absent_pages_in_node(int nid, | |||
| 5133 | unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type]; | 5155 | unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type]; |
| 5134 | unsigned long zone_start_pfn, zone_end_pfn; | 5156 | unsigned long zone_start_pfn, zone_end_pfn; |
| 5135 | 5157 | ||
| 5136 | /* When hotadd a new node, the node should be empty */ | 5158 | /* When hotadd a new node from cpu_up(), the node should be empty */ |
| 5137 | if (!node_start_pfn && !node_end_pfn) | 5159 | if (!node_start_pfn && !node_end_pfn) |
| 5138 | return 0; | 5160 | return 0; |
| 5139 | 5161 | ||
| @@ -5306,8 +5328,7 @@ static unsigned long __paginginit calc_memmap_size(unsigned long spanned_pages, | |||
| 5306 | * | 5328 | * |
| 5307 | * NOTE: pgdat should get zeroed by caller. | 5329 | * NOTE: pgdat should get zeroed by caller. |
| 5308 | */ | 5330 | */ |
| 5309 | static void __paginginit free_area_init_core(struct pglist_data *pgdat, | 5331 | static void __paginginit free_area_init_core(struct pglist_data *pgdat) |
| 5310 | unsigned long node_start_pfn, unsigned long node_end_pfn) | ||
| 5311 | { | 5332 | { |
| 5312 | enum zone_type j; | 5333 | enum zone_type j; |
| 5313 | int nid = pgdat->node_id; | 5334 | int nid = pgdat->node_id; |
| @@ -5458,7 +5479,8 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size, | |||
| 5458 | #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP | 5479 | #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP |
| 5459 | get_pfn_range_for_nid(nid, &start_pfn, &end_pfn); | 5480 | get_pfn_range_for_nid(nid, &start_pfn, &end_pfn); |
| 5460 | pr_info("Initmem setup node %d [mem %#018Lx-%#018Lx]\n", nid, | 5481 | pr_info("Initmem setup node %d [mem %#018Lx-%#018Lx]\n", nid, |
| 5461 | (u64)start_pfn << PAGE_SHIFT, ((u64)end_pfn << PAGE_SHIFT) - 1); | 5482 | (u64)start_pfn << PAGE_SHIFT, |
| 5483 | end_pfn ? ((u64)end_pfn << PAGE_SHIFT) - 1 : 0); | ||
| 5462 | #endif | 5484 | #endif |
| 5463 | calculate_node_totalpages(pgdat, start_pfn, end_pfn, | 5485 | calculate_node_totalpages(pgdat, start_pfn, end_pfn, |
| 5464 | zones_size, zholes_size); | 5486 | zones_size, zholes_size); |
| @@ -5470,7 +5492,7 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size, | |||
| 5470 | (unsigned long)pgdat->node_mem_map); | 5492 | (unsigned long)pgdat->node_mem_map); |
| 5471 | #endif | 5493 | #endif |
| 5472 | 5494 | ||
| 5473 | free_area_init_core(pgdat, start_pfn, end_pfn); | 5495 | free_area_init_core(pgdat); |
| 5474 | } | 5496 | } |
| 5475 | 5497 | ||
| 5476 | #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP | 5498 | #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP |
| @@ -5481,11 +5503,9 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size, | |||
| 5481 | */ | 5503 | */ |
| 5482 | void __init setup_nr_node_ids(void) | 5504 | void __init setup_nr_node_ids(void) |
| 5483 | { | 5505 | { |
| 5484 | unsigned int node; | 5506 | unsigned int highest; |
| 5485 | unsigned int highest = 0; | ||
| 5486 | 5507 | ||
| 5487 | for_each_node_mask(node, node_possible_map) | 5508 | highest = find_last_bit(node_possible_map.bits, MAX_NUMNODES); |
| 5488 | highest = node; | ||
| 5489 | nr_node_ids = highest + 1; | 5509 | nr_node_ids = highest + 1; |
| 5490 | } | 5510 | } |
| 5491 | #endif | 5511 | #endif |
| @@ -6006,7 +6026,7 @@ void __init mem_init_print_info(const char *str) | |||
| 6006 | * set_dma_reserve - set the specified number of pages reserved in the first zone | 6026 | * set_dma_reserve - set the specified number of pages reserved in the first zone |
| 6007 | * @new_dma_reserve: The number of pages to mark reserved | 6027 | * @new_dma_reserve: The number of pages to mark reserved |
| 6008 | * | 6028 | * |
| 6009 | * The per-cpu batchsize and zone watermarks are determined by present_pages. | 6029 | * The per-cpu batchsize and zone watermarks are determined by managed_pages. |
| 6010 | * In the DMA zone, a significant percentage may be consumed by kernel image | 6030 | * In the DMA zone, a significant percentage may be consumed by kernel image |
| 6011 | * and other unfreeable allocations which can skew the watermarks badly. This | 6031 | * and other unfreeable allocations which can skew the watermarks badly. This |
| 6012 | * function may optionally be used to account for unfreeable pages in the | 6032 | * function may optionally be used to account for unfreeable pages in the |
| @@ -6059,7 +6079,7 @@ void __init page_alloc_init(void) | |||
| 6059 | } | 6079 | } |
| 6060 | 6080 | ||
| 6061 | /* | 6081 | /* |
| 6062 | * calculate_totalreserve_pages - called when sysctl_lower_zone_reserve_ratio | 6082 | * calculate_totalreserve_pages - called when sysctl_lowmem_reserve_ratio |
| 6063 | * or min_free_kbytes changes. | 6083 | * or min_free_kbytes changes. |
| 6064 | */ | 6084 | */ |
| 6065 | static void calculate_totalreserve_pages(void) | 6085 | static void calculate_totalreserve_pages(void) |
| @@ -6103,7 +6123,7 @@ static void calculate_totalreserve_pages(void) | |||
| 6103 | 6123 | ||
| 6104 | /* | 6124 | /* |
| 6105 | * setup_per_zone_lowmem_reserve - called whenever | 6125 | * setup_per_zone_lowmem_reserve - called whenever |
| 6106 | * sysctl_lower_zone_reserve_ratio changes. Ensures that each zone | 6126 | * sysctl_lowmem_reserve_ratio changes. Ensures that each zone |
| 6107 | * has a correct pages reserved value, so an adequate number of | 6127 | * has a correct pages reserved value, so an adequate number of |
| 6108 | * pages are left in the zone after a successful __alloc_pages(). | 6128 | * pages are left in the zone after a successful __alloc_pages(). |
| 6109 | */ | 6129 | */ |
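
The page_alloc.c hunks cache a page's migratetype in `page->index` while the page sits on a per-cpu list, accepting a possibly stale value in exchange for skipping the pageblock lookup on the free fast path; the stale case is handled by re-reading the pageblock type when the zone has isolated pageblocks. The toy program below mimics that fast-path/slow-path split with a simplified page structure; it illustrates the caching idea only and is not the kernel code.

```c
#include <stdbool.h>
#include <stdio.h>

enum migratetype { MT_UNMOVABLE, MT_MOVABLE, MT_CMA, MT_ISOLATE };

struct toy_page {
	unsigned long index;   /* reused to hold the cached migratetype */
};

static int get_cached_mt(const struct toy_page *page)
{
	return (int)page->index;
}

static void set_cached_mt(struct toy_page *page, int mt)
{
	page->index = (unsigned long)mt;
}

/* Authoritative (slow) lookup, e.g. the pageblock bitmap in the kernel. */
static int lookup_pageblock_mt(const struct toy_page *page)
{
	(void)page;
	return MT_MOVABLE;
}

/* Pick the freelist for a page coming off the per-cpu list. */
static int mt_for_free(struct toy_page *page, bool zone_has_isolated)
{
	int mt = get_cached_mt(page);           /* fast path */

	if (zone_has_isolated)                  /* cache may be stale */
		mt = lookup_pageblock_mt(page);
	return mt;
}

int main(void)
{
	struct toy_page page;

	set_cached_mt(&page, MT_CMA);           /* done at free time */
	printf("freelist index: %d\n", mt_for_free(&page, false));
	printf("with isolation: %d\n", mt_for_free(&page, true));
	return 0;
}
```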
diff --git a/mm/page_isolation.c b/mm/page_isolation.c index 303c908790ef..4568fd58f70a 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c | |||
| @@ -9,7 +9,8 @@ | |||
| 9 | #include <linux/hugetlb.h> | 9 | #include <linux/hugetlb.h> |
| 10 | #include "internal.h" | 10 | #include "internal.h" |
| 11 | 11 | ||
| 12 | int set_migratetype_isolate(struct page *page, bool skip_hwpoisoned_pages) | 12 | static int set_migratetype_isolate(struct page *page, |
| 13 | bool skip_hwpoisoned_pages) | ||
| 13 | { | 14 | { |
| 14 | struct zone *zone; | 15 | struct zone *zone; |
| 15 | unsigned long flags, pfn; | 16 | unsigned long flags, pfn; |
| @@ -72,7 +73,7 @@ out: | |||
| 72 | return ret; | 73 | return ret; |
| 73 | } | 74 | } |
| 74 | 75 | ||
| 75 | void unset_migratetype_isolate(struct page *page, unsigned migratetype) | 76 | static void unset_migratetype_isolate(struct page *page, unsigned migratetype) |
| 76 | { | 77 | { |
| 77 | struct zone *zone; | 78 | struct zone *zone; |
| 78 | unsigned long flags, nr_pages; | 79 | unsigned long flags, nr_pages; |
| @@ -223,34 +224,16 @@ __test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn, | |||
| 223 | continue; | 224 | continue; |
| 224 | } | 225 | } |
| 225 | page = pfn_to_page(pfn); | 226 | page = pfn_to_page(pfn); |
| 226 | if (PageBuddy(page)) { | 227 | if (PageBuddy(page)) |
| 227 | /* | 228 | /* |
| 228 | * If race between isolatation and allocation happens, | 229 | * If the page is on a free list, it has to be on |
| 229 | * some free pages could be in MIGRATE_MOVABLE list | 230 | * the correct MIGRATE_ISOLATE freelist. There is no |
| 230 | * although pageblock's migratation type of the page | 231 | * simple way to verify that as VM_BUG_ON(), though. |
| 231 | * is MIGRATE_ISOLATE. Catch it and move the page into | ||
| 232 | * MIGRATE_ISOLATE list. | ||
| 233 | */ | 232 | */ |
| 234 | if (get_freepage_migratetype(page) != MIGRATE_ISOLATE) { | ||
| 235 | struct page *end_page; | ||
| 236 | |||
| 237 | end_page = page + (1 << page_order(page)) - 1; | ||
| 238 | move_freepages(page_zone(page), page, end_page, | ||
| 239 | MIGRATE_ISOLATE); | ||
| 240 | } | ||
| 241 | pfn += 1 << page_order(page); | 233 | pfn += 1 << page_order(page); |
| 242 | } | 234 | else if (skip_hwpoisoned_pages && PageHWPoison(page)) |
| 243 | else if (page_count(page) == 0 && | 235 | /* A HWPoisoned page cannot be also PageBuddy */ |
| 244 | get_freepage_migratetype(page) == MIGRATE_ISOLATE) | ||
| 245 | pfn += 1; | ||
| 246 | else if (skip_hwpoisoned_pages && PageHWPoison(page)) { | ||
| 247 | /* | ||
| 248 | * The HWPoisoned page may be not in buddy | ||
| 249 | * system, and page_count() is not 0. | ||
| 250 | */ | ||
| 251 | pfn++; | 236 | pfn++; |
| 252 | continue; | ||
| 253 | } | ||
| 254 | else | 237 | else |
| 255 | break; | 238 | break; |
| 256 | } | 239 | } |
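
The rewritten __test_page_isolated_in_pageblock() loop above leaves three outcomes per pfn: a free (buddy) page lets the scan jump a whole 2^order block, a hardware-poisoned page is stepped over one pfn at a time, and anything else means the range is not fully isolated. A small sketch of that walk, with toy predicates standing in for PageBuddy()/PageHWPoison():

```c
#include <stdbool.h>
#include <stdio.h>

#define TOY_RANGE 16

/* Toy per-pfn state standing in for the real page flags. */
struct toy_page {
	bool buddy;          /* free page heading a 2^order buddy block */
	unsigned int order;  /* valid only when buddy is set */
	bool hwpoison;       /* hardware-poisoned page */
};

/* True when every pfn in [0, TOY_RANGE) is either free or poisoned. */
static bool range_is_isolated(const struct toy_page *pages,
			      bool skip_hwpoisoned)
{
	unsigned long pfn = 0;

	while (pfn < TOY_RANGE) {
		const struct toy_page *p = &pages[pfn];

		if (p->buddy)
			pfn += 1UL << p->order;     /* skip the whole block */
		else if (skip_hwpoisoned && p->hwpoison)
			pfn++;                      /* tolerate poisoned pages */
		else
			return false;               /* still in use somewhere */
	}
	return true;
}

int main(void)
{
	struct toy_page pages[TOY_RANGE] = { { false, 0, false } };

	pages[0].buddy = true;
	pages[0].order = 3;                 /* pfns 0..7 form one free block */
	pages[8].hwpoison = true;
	/* pfns 9..15 stay plain in-use pages, so the check fails */

	printf("isolated: %d\n", range_is_isolated(pages, true));
	return 0;
}
```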
diff --git a/mm/shmem.c b/mm/shmem.c index dbe0c1e8349c..48ce82926d93 100644 --- a/mm/shmem.c +++ b/mm/shmem.c | |||
| @@ -542,6 +542,21 @@ void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend) | |||
| 542 | } | 542 | } |
| 543 | EXPORT_SYMBOL_GPL(shmem_truncate_range); | 543 | EXPORT_SYMBOL_GPL(shmem_truncate_range); |
| 544 | 544 | ||
| 545 | static int shmem_getattr(struct vfsmount *mnt, struct dentry *dentry, | ||
| 546 | struct kstat *stat) | ||
| 547 | { | ||
| 548 | struct inode *inode = dentry->d_inode; | ||
| 549 | struct shmem_inode_info *info = SHMEM_I(inode); | ||
| 550 | |||
| 551 | spin_lock(&info->lock); | ||
| 552 | shmem_recalc_inode(inode); | ||
| 553 | spin_unlock(&info->lock); | ||
| 554 | |||
| 555 | generic_fillattr(inode, stat); | ||
| 556 | |||
| 557 | return 0; | ||
| 558 | } | ||
| 559 | |||
| 545 | static int shmem_setattr(struct dentry *dentry, struct iattr *attr) | 560 | static int shmem_setattr(struct dentry *dentry, struct iattr *attr) |
| 546 | { | 561 | { |
| 547 | struct inode *inode = d_inode(dentry); | 562 | struct inode *inode = d_inode(dentry); |
| @@ -3122,6 +3137,7 @@ static const struct file_operations shmem_file_operations = { | |||
| 3122 | }; | 3137 | }; |
| 3123 | 3138 | ||
| 3124 | static const struct inode_operations shmem_inode_operations = { | 3139 | static const struct inode_operations shmem_inode_operations = { |
| 3140 | .getattr = shmem_getattr, | ||
| 3125 | .setattr = shmem_setattr, | 3141 | .setattr = shmem_setattr, |
| 3126 | #ifdef CONFIG_TMPFS_XATTR | 3142 | #ifdef CONFIG_TMPFS_XATTR |
| 3127 | .setxattr = shmem_setxattr, | 3143 | .setxattr = shmem_setxattr, |
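
The new shmem_getattr() reconciles the inode's lazily maintained accounting under the info lock before filling in the stat data, so stat() reports block usage that reflects recently freed pages. The following userspace sketch shows that "reconcile, then report" pattern with a toy inode; names and fields are illustrative only.

```c
#include <pthread.h>
#include <stdio.h>

/* Toy inode whose block count is reconciled lazily. */
struct toy_inode {
	pthread_mutex_t lock;
	long blocks;          /* last reconciled value */
	long pending_freed;   /* adjustments not yet folded in */
};

struct toy_stat {
	long blocks;
};

/* Fold pending adjustments into ->blocks; caller holds ->lock. */
static void toy_recalc_inode(struct toy_inode *inode)
{
	inode->blocks -= inode->pending_freed;
	inode->pending_freed = 0;
}

/* getattr-style helper: reconcile first, then report. */
static int toy_getattr(struct toy_inode *inode, struct toy_stat *stat)
{
	pthread_mutex_lock(&inode->lock);
	toy_recalc_inode(inode);
	stat->blocks = inode->blocks;   /* now reflects the freed pages */
	pthread_mutex_unlock(&inode->lock);
	return 0;
}

int main(void)
{
	struct toy_inode inode = {
		.lock = PTHREAD_MUTEX_INITIALIZER,
		.blocks = 128,
		.pending_freed = 32,
	};
	struct toy_stat st;

	toy_getattr(&inode, &st);
	printf("blocks reported by stat(): %ld\n", st.blocks);  /* 96 */
	return 0;
}
```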
| @@ -1595,7 +1595,7 @@ static struct page *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, | |||
| 1595 | if (memcg_charge_slab(cachep, flags, cachep->gfporder)) | 1595 | if (memcg_charge_slab(cachep, flags, cachep->gfporder)) |
| 1596 | return NULL; | 1596 | return NULL; |
| 1597 | 1597 | ||
| 1598 | page = alloc_pages_exact_node(nodeid, flags | __GFP_NOTRACK, cachep->gfporder); | 1598 | page = __alloc_pages_node(nodeid, flags | __GFP_NOTRACK, cachep->gfporder); |
| 1599 | if (!page) { | 1599 | if (!page) { |
| 1600 | memcg_uncharge_slab(cachep, cachep->gfporder); | 1600 | memcg_uncharge_slab(cachep, cachep->gfporder); |
| 1601 | slab_out_of_memory(cachep, flags, nodeid); | 1601 | slab_out_of_memory(cachep, flags, nodeid); |
diff --git a/mm/slab_common.c b/mm/slab_common.c index c26829fe4e37..5ce4faeb16fb 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c | |||
| @@ -500,7 +500,7 @@ void memcg_create_kmem_cache(struct mem_cgroup *memcg, | |||
| 500 | struct kmem_cache *root_cache) | 500 | struct kmem_cache *root_cache) |
| 501 | { | 501 | { |
| 502 | static char memcg_name_buf[NAME_MAX + 1]; /* protected by slab_mutex */ | 502 | static char memcg_name_buf[NAME_MAX + 1]; /* protected by slab_mutex */ |
| 503 | struct cgroup_subsys_state *css = mem_cgroup_css(memcg); | 503 | struct cgroup_subsys_state *css = &memcg->css; |
| 504 | struct memcg_cache_array *arr; | 504 | struct memcg_cache_array *arr; |
| 505 | struct kmem_cache *s = NULL; | 505 | struct kmem_cache *s = NULL; |
| 506 | char *cache_name; | 506 | char *cache_name; |
| @@ -640,6 +640,9 @@ void kmem_cache_destroy(struct kmem_cache *s) | |||
| 640 | bool need_rcu_barrier = false; | 640 | bool need_rcu_barrier = false; |
| 641 | bool busy = false; | 641 | bool busy = false; |
| 642 | 642 | ||
| 643 | if (unlikely(!s)) | ||
| 644 | return; | ||
| 645 | |||
| 643 | BUG_ON(!is_root_cache(s)); | 646 | BUG_ON(!is_root_cache(s)); |
| 644 | 647 | ||
| 645 | get_online_cpus(); | 648 | get_online_cpus(); |
| @@ -45,7 +45,7 @@ | |||
| 45 | * NUMA support in SLOB is fairly simplistic, pushing most of the real | 45 | * NUMA support in SLOB is fairly simplistic, pushing most of the real |
| 46 | * logic down to the page allocator, and simply doing the node accounting | 46 | * logic down to the page allocator, and simply doing the node accounting |
| 47 | * on the upper levels. In the event that a node id is explicitly | 47 | * on the upper levels. In the event that a node id is explicitly |
| 48 | * provided, alloc_pages_exact_node() with the specified node id is used | 48 | * provided, __alloc_pages_node() with the specified node id is used |
| 49 | * instead. The common case (or when the node id isn't explicitly provided) | 49 | * instead. The common case (or when the node id isn't explicitly provided) |
| 50 | * will default to the current node, as per numa_node_id(). | 50 | * will default to the current node, as per numa_node_id(). |
| 51 | * | 51 | * |
| @@ -193,7 +193,7 @@ static void *slob_new_pages(gfp_t gfp, int order, int node) | |||
| 193 | 193 | ||
| 194 | #ifdef CONFIG_NUMA | 194 | #ifdef CONFIG_NUMA |
| 195 | if (node != NUMA_NO_NODE) | 195 | if (node != NUMA_NO_NODE) |
| 196 | page = alloc_pages_exact_node(node, gfp, order); | 196 | page = __alloc_pages_node(node, gfp, order); |
| 197 | else | 197 | else |
| 198 | #endif | 198 | #endif |
| 199 | page = alloc_pages(gfp, order); | 199 | page = alloc_pages(gfp, order); |
| @@ -1334,7 +1334,7 @@ static inline struct page *alloc_slab_page(struct kmem_cache *s, | |||
| 1334 | if (node == NUMA_NO_NODE) | 1334 | if (node == NUMA_NO_NODE) |
| 1335 | page = alloc_pages(flags, order); | 1335 | page = alloc_pages(flags, order); |
| 1336 | else | 1336 | else |
| 1337 | page = alloc_pages_exact_node(node, flags, order); | 1337 | page = __alloc_pages_node(node, flags, order); |
| 1338 | 1338 | ||
| 1339 | if (!page) | 1339 | if (!page) |
| 1340 | memcg_uncharge_slab(s, order); | 1340 | memcg_uncharge_slab(s, order); |
diff --git a/mm/swap_state.c b/mm/swap_state.c index 8bc8e66138da..d504adb7fa5f 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c | |||
| @@ -288,17 +288,14 @@ struct page * lookup_swap_cache(swp_entry_t entry) | |||
| 288 | return page; | 288 | return page; |
| 289 | } | 289 | } |
| 290 | 290 | ||
| 291 | /* | 291 | struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, |
| 292 | * Locate a page of swap in physical memory, reserving swap cache space | 292 | struct vm_area_struct *vma, unsigned long addr, |
| 293 | * and reading the disk if it is not already cached. | 293 | bool *new_page_allocated) |
| 294 | * A failure return means that either the page allocation failed or that | ||
| 295 | * the swap entry is no longer in use. | ||
| 296 | */ | ||
| 297 | struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, | ||
| 298 | struct vm_area_struct *vma, unsigned long addr) | ||
| 299 | { | 294 | { |
| 300 | struct page *found_page, *new_page = NULL; | 295 | struct page *found_page, *new_page = NULL; |
| 296 | struct address_space *swapper_space = swap_address_space(entry); | ||
| 301 | int err; | 297 | int err; |
| 298 | *new_page_allocated = false; | ||
| 302 | 299 | ||
| 303 | do { | 300 | do { |
| 304 | /* | 301 | /* |
| @@ -306,8 +303,7 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, | |||
| 306 | * called after lookup_swap_cache() failed, re-calling | 303 | * called after lookup_swap_cache() failed, re-calling |
| 307 | * that would confuse statistics. | 304 | * that would confuse statistics. |
| 308 | */ | 305 | */ |
| 309 | found_page = find_get_page(swap_address_space(entry), | 306 | found_page = find_get_page(swapper_space, entry.val); |
| 310 | entry.val); | ||
| 311 | if (found_page) | 307 | if (found_page) |
| 312 | break; | 308 | break; |
| 313 | 309 | ||
| @@ -366,7 +362,7 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, | |||
| 366 | * Initiate read into locked page and return. | 362 | * Initiate read into locked page and return. |
| 367 | */ | 363 | */ |
| 368 | lru_cache_add_anon(new_page); | 364 | lru_cache_add_anon(new_page); |
| 369 | swap_readpage(new_page); | 365 | *new_page_allocated = true; |
| 370 | return new_page; | 366 | return new_page; |
| 371 | } | 367 | } |
| 372 | radix_tree_preload_end(); | 368 | radix_tree_preload_end(); |
| @@ -384,6 +380,25 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, | |||
| 384 | return found_page; | 380 | return found_page; |
| 385 | } | 381 | } |
| 386 | 382 | ||
| 383 | /* | ||
| 384 | * Locate a page of swap in physical memory, reserving swap cache space | ||
| 385 | * and reading the disk if it is not already cached. | ||
| 386 | * A failure return means that either the page allocation failed or that | ||
| 387 | * the swap entry is no longer in use. | ||
| 388 | */ | ||
| 389 | struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, | ||
| 390 | struct vm_area_struct *vma, unsigned long addr) | ||
| 391 | { | ||
| 392 | bool page_was_allocated; | ||
| 393 | struct page *retpage = __read_swap_cache_async(entry, gfp_mask, | ||
| 394 | vma, addr, &page_was_allocated); | ||
| 395 | |||
| 396 | if (page_was_allocated) | ||
| 397 | swap_readpage(retpage); | ||
| 398 | |||
| 399 | return retpage; | ||
| 400 | } | ||
| 401 | |||
| 387 | static unsigned long swapin_nr_pages(unsigned long offset) | 402 | static unsigned long swapin_nr_pages(unsigned long offset) |
| 388 | { | 403 | { |
| 389 | static unsigned long prev_offset; | 404 | static unsigned long prev_offset; |
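
The swap_state.c hunks split read_swap_cache_async() into __read_swap_cache_async(), which only reserves the swap-cache slot and reports through *new_page_allocated whether a fresh page was created, and a thin wrapper that issues swap_readpage() solely for freshly allocated pages. A minimal sketch of that lookup-or-create split, with a toy array standing in for the swap cache:

```c
#include <stdbool.h>
#include <stdio.h>

#define CACHE_SLOTS 8

/* Toy "swap cache": a fixed array keyed by entry number. */
static int cache[CACHE_SLOTS];
static bool cache_valid[CACHE_SLOTS];

/*
 * Look up @entry, creating a slot for it when missing.  The caller
 * learns via *new_allocated whether it must still populate the slot,
 * mirroring the __read_swap_cache_async() contract.
 */
static int *lookup_or_create(unsigned int entry, bool *new_allocated)
{
	*new_allocated = false;
	if (entry >= CACHE_SLOTS)
		return NULL;

	if (!cache_valid[entry]) {
		cache[entry] = 0;          /* reserved but not yet read */
		cache_valid[entry] = true;
		*new_allocated = true;
	}
	return &cache[entry];
}

/* Wrapper in the spirit of read_swap_cache_async(): only freshly
 * created slots trigger the expensive "read from disk" step. */
static int *lookup_and_read(unsigned int entry)
{
	bool was_allocated;
	int *slot = lookup_or_create(entry, &was_allocated);

	if (slot && was_allocated)
		*slot = (int)entry * 100;  /* pretend swap_readpage() */
	return slot;
}

int main(void)
{
	int *a = lookup_and_read(3);       /* allocates and "reads" */
	int *b = lookup_and_read(3);       /* pure cache hit, no I/O */

	printf("%d %d same_slot=%d\n", *a, *b, a == b);
	return 0;
}
```

Exporting the allocation-only half lets a caller preload the swap cache without immediately starting I/O, which is what the zswap changes elsewhere in this series rely on.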
diff --git a/mm/swapfile.c b/mm/swapfile.c index aebc2dd6e649..58877312cf6b 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c | |||
| @@ -875,6 +875,48 @@ int page_swapcount(struct page *page) | |||
| 875 | } | 875 | } |
| 876 | 876 | ||
| 877 | /* | 877 | /* |
| 878 | * How many references to @entry are currently swapped out? | ||
| 879 | * This considers COUNT_CONTINUED so it returns exact answer. | ||
| 880 | */ | ||
| 881 | int swp_swapcount(swp_entry_t entry) | ||
| 882 | { | ||
| 883 | int count, tmp_count, n; | ||
| 884 | struct swap_info_struct *p; | ||
| 885 | struct page *page; | ||
| 886 | pgoff_t offset; | ||
| 887 | unsigned char *map; | ||
| 888 | |||
| 889 | p = swap_info_get(entry); | ||
| 890 | if (!p) | ||
| 891 | return 0; | ||
| 892 | |||
| 893 | count = swap_count(p->swap_map[swp_offset(entry)]); | ||
| 894 | if (!(count & COUNT_CONTINUED)) | ||
| 895 | goto out; | ||
| 896 | |||
| 897 | count &= ~COUNT_CONTINUED; | ||
| 898 | n = SWAP_MAP_MAX + 1; | ||
| 899 | |||
| 900 | offset = swp_offset(entry); | ||
| 901 | page = vmalloc_to_page(p->swap_map + offset); | ||
| 902 | offset &= ~PAGE_MASK; | ||
| 903 | VM_BUG_ON(page_private(page) != SWP_CONTINUED); | ||
| 904 | |||
| 905 | do { | ||
| 906 | page = list_entry(page->lru.next, struct page, lru); | ||
| 907 | map = kmap_atomic(page); | ||
| 908 | tmp_count = map[offset]; | ||
| 909 | kunmap_atomic(map); | ||
| 910 | |||
| 911 | count += (tmp_count & ~COUNT_CONTINUED) * n; | ||
| 912 | n *= (SWAP_CONT_MAX + 1); | ||
| 913 | } while (tmp_count & COUNT_CONTINUED); | ||
| 914 | out: | ||
| 915 | spin_unlock(&p->lock); | ||
| 916 | return count; | ||
| 917 | } | ||
| 918 | |||
| 919 | /* | ||
| 878 | * We can write to an anon page without COW if there are no other references | 920 | * We can write to an anon page without COW if there are no other references |
| 879 | * to it. And as a side-effect, free up its swap: because the old content | 921 | * to it. And as a side-effect, free up its swap: because the old content |
| 880 | * on disk will never be read, and seeking back there to write new content | 922 | * on disk will never be read, and seeking back there to write new content |
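
The new swp_swapcount() reconstructs an exact reference count by adding the primary swap_map byte to the values stored in the chain of continuation pages, scaling each level by the capacity of the level below it. The sketch below reproduces that positional accumulation with illustrative constants; the real encoding and limits live in the kernel's swap_map/COUNT_CONTINUED machinery.

```c
#include <stdio.h>

/* Toy encoding: low bits hold a small counter, one flag bit says
 * "more is stored in the next continuation level". */
#define TOY_CONTINUED   0x80
#define TOY_MAP_MAX     0x3f   /* max count in the primary byte */
#define TOY_CONT_MAX    0x7f   /* max count in a continuation byte */

/*
 * Rebuild the full reference count from a primary byte plus a chain
 * of continuation bytes, the same accumulation swp_swapcount() does.
 */
static unsigned long toy_swapcount(unsigned char primary,
				   const unsigned char *cont)
{
	unsigned long count = primary & ~TOY_CONTINUED;
	unsigned long n = TOY_MAP_MAX + 1;
	unsigned char tmp;

	if (!(primary & TOY_CONTINUED))
		return count;

	do {
		tmp = *cont++;
		count += (unsigned long)(tmp & ~TOY_CONTINUED) * n;
		n *= TOY_CONT_MAX + 1;
	} while (tmp & TOY_CONTINUED);

	return count;
}

int main(void)
{
	/* primary holds 5 plus "continued"; one continuation byte holds 2 */
	unsigned char cont[] = { 2 };

	/* 5 + 2 * (TOY_MAP_MAX + 1) = 5 + 2 * 64 = 133 */
	printf("count = %lu\n", toy_swapcount(5 | TOY_CONTINUED, cont));
	return 0;
}
```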
diff --git a/mm/vmscan.c b/mm/vmscan.c index b1139039122a..2d978b28a410 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c | |||
| @@ -175,7 +175,7 @@ static bool sane_reclaim(struct scan_control *sc) | |||
| 175 | if (!memcg) | 175 | if (!memcg) |
| 176 | return true; | 176 | return true; |
| 177 | #ifdef CONFIG_CGROUP_WRITEBACK | 177 | #ifdef CONFIG_CGROUP_WRITEBACK |
| 178 | if (cgroup_on_dfl(mem_cgroup_css(memcg)->cgroup)) | 178 | if (memcg->css.cgroup) |
| 179 | return true; | 179 | return true; |
| 180 | #endif | 180 | #endif |
| 181 | return false; | 181 | return false; |
| @@ -985,7 +985,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, | |||
| 985 | * __GFP_IO|__GFP_FS for this reason); but more thought | 985 | * __GFP_IO|__GFP_FS for this reason); but more thought |
| 986 | * would probably show more reasons. | 986 | * would probably show more reasons. |
| 987 | * | 987 | * |
| 988 | * 3) Legacy memcg encounters a page that is not already marked | 988 | * 3) Legacy memcg encounters a page that is already marked |
| 989 | * PageReclaim. memcg does not have any dirty pages | 989 | * PageReclaim. memcg does not have any dirty pages |
| 990 | * throttling so we could easily OOM just because too many | 990 | * throttling so we could easily OOM just because too many |
| 991 | * pages are in writeback and there is nothing else to | 991 | * pages are in writeback and there is nothing else to |
| @@ -1015,12 +1015,15 @@ static unsigned long shrink_page_list(struct list_head *page_list, | |||
| 1015 | */ | 1015 | */ |
| 1016 | SetPageReclaim(page); | 1016 | SetPageReclaim(page); |
| 1017 | nr_writeback++; | 1017 | nr_writeback++; |
| 1018 | |||
| 1019 | goto keep_locked; | 1018 | goto keep_locked; |
| 1020 | 1019 | ||
| 1021 | /* Case 3 above */ | 1020 | /* Case 3 above */ |
| 1022 | } else { | 1021 | } else { |
| 1022 | unlock_page(page); | ||
| 1023 | wait_on_page_writeback(page); | 1023 | wait_on_page_writeback(page); |
| 1024 | /* then go back and try same page again */ | ||
| 1025 | list_add_tail(&page->lru, page_list); | ||
| 1026 | continue; | ||
| 1024 | } | 1027 | } |
| 1025 | } | 1028 | } |
| 1026 | 1029 | ||
| @@ -1196,7 +1199,7 @@ cull_mlocked: | |||
| 1196 | if (PageSwapCache(page)) | 1199 | if (PageSwapCache(page)) |
| 1197 | try_to_free_swap(page); | 1200 | try_to_free_swap(page); |
| 1198 | unlock_page(page); | 1201 | unlock_page(page); |
| 1199 | putback_lru_page(page); | 1202 | list_add(&page->lru, &ret_pages); |
| 1200 | continue; | 1203 | continue; |
| 1201 | 1204 | ||
| 1202 | activate_locked: | 1205 | activate_locked: |
| @@ -1359,7 +1362,8 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, | |||
| 1359 | unsigned long nr_taken = 0; | 1362 | unsigned long nr_taken = 0; |
| 1360 | unsigned long scan; | 1363 | unsigned long scan; |
| 1361 | 1364 | ||
| 1362 | for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) { | 1365 | for (scan = 0; scan < nr_to_scan && nr_taken < nr_to_scan && |
| 1366 | !list_empty(src); scan++) { | ||
| 1363 | struct page *page; | 1367 | struct page *page; |
| 1364 | int nr_pages; | 1368 | int nr_pages; |
| 1365 | 1369 | ||
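
Among the vmscan.c hunks, the isolate_lru_pages() loop now stops as soon as it has taken nr_to_scan base pages rather than only after inspecting nr_to_scan list entries, which matters when an entry accounts for more than one base page (e.g. a huge page). The toy comparison below shows the effect of the extra bound; the "every 4th entry is a huge page" rule is purely illustrative.

```c
#include <stdbool.h>
#include <stdio.h>

/* Sketch of the scan loop with and without the nr_taken cap. */
static unsigned long isolate_sketch(unsigned long nr_to_scan,
				    unsigned long src_len, bool cap_taken)
{
	unsigned long scan, nr_taken = 0;

	for (scan = 0; scan < nr_to_scan &&
	     (!cap_taken || nr_taken < nr_to_scan) &&
	     scan < src_len; scan++) {
		/* pretend every 4th entry is a huge page of 4 base pages */
		unsigned long nr_pages = (scan % 4 == 3) ? 4 : 1;

		nr_taken += nr_pages;
	}
	return nr_taken;
}

int main(void)
{
	printf("old bound takes %lu base pages for a request of 8\n",
	       isolate_sketch(8, 32, false));   /* overshoots: 14 */
	printf("new bound takes %lu base pages for a request of 8\n",
	       isolate_sketch(8, 32, true));    /* capped: 8 */
	return 0;
}
```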
| @@ -96,10 +96,10 @@ struct zbud_pool { | |||
| 96 | struct list_head buddied; | 96 | struct list_head buddied; |
| 97 | struct list_head lru; | 97 | struct list_head lru; |
| 98 | u64 pages_nr; | 98 | u64 pages_nr; |
| 99 | struct zbud_ops *ops; | 99 | const struct zbud_ops *ops; |
| 100 | #ifdef CONFIG_ZPOOL | 100 | #ifdef CONFIG_ZPOOL |
| 101 | struct zpool *zpool; | 101 | struct zpool *zpool; |
| 102 | struct zpool_ops *zpool_ops; | 102 | const struct zpool_ops *zpool_ops; |
| 103 | #endif | 103 | #endif |
| 104 | }; | 104 | }; |
| 105 | 105 | ||
| @@ -133,12 +133,12 @@ static int zbud_zpool_evict(struct zbud_pool *pool, unsigned long handle) | |||
| 133 | return -ENOENT; | 133 | return -ENOENT; |
| 134 | } | 134 | } |
| 135 | 135 | ||
| 136 | static struct zbud_ops zbud_zpool_ops = { | 136 | static const struct zbud_ops zbud_zpool_ops = { |
| 137 | .evict = zbud_zpool_evict | 137 | .evict = zbud_zpool_evict |
| 138 | }; | 138 | }; |
| 139 | 139 | ||
| 140 | static void *zbud_zpool_create(char *name, gfp_t gfp, | 140 | static void *zbud_zpool_create(char *name, gfp_t gfp, |
| 141 | struct zpool_ops *zpool_ops, | 141 | const struct zpool_ops *zpool_ops, |
| 142 | struct zpool *zpool) | 142 | struct zpool *zpool) |
| 143 | { | 143 | { |
| 144 | struct zbud_pool *pool; | 144 | struct zbud_pool *pool; |
| @@ -302,7 +302,7 @@ static int num_free_chunks(struct zbud_header *zhdr) | |||
| 302 | * Return: pointer to the new zbud pool or NULL if the metadata allocation | 302 | * Return: pointer to the new zbud pool or NULL if the metadata allocation |
| 303 | * failed. | 303 | * failed. |
| 304 | */ | 304 | */ |
| 305 | struct zbud_pool *zbud_create_pool(gfp_t gfp, struct zbud_ops *ops) | 305 | struct zbud_pool *zbud_create_pool(gfp_t gfp, const struct zbud_ops *ops) |
| 306 | { | 306 | { |
| 307 | struct zbud_pool *pool; | 307 | struct zbud_pool *pool; |
| 308 | int i; | 308 | int i; |
diff --git a/mm/zpool.c b/mm/zpool.c index 722a4f60e90b..68d2dd8ed2d8 100644 --- a/mm/zpool.c +++ b/mm/zpool.c | |||
| @@ -22,7 +22,7 @@ struct zpool { | |||
| 22 | 22 | ||
| 23 | struct zpool_driver *driver; | 23 | struct zpool_driver *driver; |
| 24 | void *pool; | 24 | void *pool; |
| 25 | struct zpool_ops *ops; | 25 | const struct zpool_ops *ops; |
| 26 | 26 | ||
| 27 | struct list_head list; | 27 | struct list_head list; |
| 28 | }; | 28 | }; |
| @@ -115,7 +115,7 @@ static void zpool_put_driver(struct zpool_driver *driver) | |||
| 115 | * Returns: New zpool on success, NULL on failure. | 115 | * Returns: New zpool on success, NULL on failure. |
| 116 | */ | 116 | */ |
| 117 | struct zpool *zpool_create_pool(char *type, char *name, gfp_t gfp, | 117 | struct zpool *zpool_create_pool(char *type, char *name, gfp_t gfp, |
| 118 | struct zpool_ops *ops) | 118 | const struct zpool_ops *ops) |
| 119 | { | 119 | { |
| 120 | struct zpool_driver *driver; | 120 | struct zpool_driver *driver; |
| 121 | struct zpool *zpool; | 121 | struct zpool *zpool; |
| @@ -320,20 +320,6 @@ u64 zpool_get_total_size(struct zpool *zpool) | |||
| 320 | return zpool->driver->total_size(zpool->pool); | 320 | return zpool->driver->total_size(zpool->pool); |
| 321 | } | 321 | } |
| 322 | 322 | ||
| 323 | static int __init init_zpool(void) | ||
| 324 | { | ||
| 325 | pr_info("loaded\n"); | ||
| 326 | return 0; | ||
| 327 | } | ||
| 328 | |||
| 329 | static void __exit exit_zpool(void) | ||
| 330 | { | ||
| 331 | pr_info("unloaded\n"); | ||
| 332 | } | ||
| 333 | |||
| 334 | module_init(init_zpool); | ||
| 335 | module_exit(exit_zpool); | ||
| 336 | |||
| 337 | MODULE_LICENSE("GPL"); | 323 | MODULE_LICENSE("GPL"); |
| 338 | MODULE_AUTHOR("Dan Streetman <ddstreet@ieee.org>"); | 324 | MODULE_AUTHOR("Dan Streetman <ddstreet@ieee.org>"); |
| 339 | MODULE_DESCRIPTION("Common API for compressed memory storage"); | 325 | MODULE_DESCRIPTION("Common API for compressed memory storage"); |
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 0a7f81aa2249..f135b1b6fcdc 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c | |||
| @@ -169,14 +169,12 @@ enum zs_stat_type { | |||
| 169 | NR_ZS_STAT_TYPE, | 169 | NR_ZS_STAT_TYPE, |
| 170 | }; | 170 | }; |
| 171 | 171 | ||
| 172 | #ifdef CONFIG_ZSMALLOC_STAT | ||
| 173 | |||
| 174 | static struct dentry *zs_stat_root; | ||
| 175 | |||
| 176 | struct zs_size_stat { | 172 | struct zs_size_stat { |
| 177 | unsigned long objs[NR_ZS_STAT_TYPE]; | 173 | unsigned long objs[NR_ZS_STAT_TYPE]; |
| 178 | }; | 174 | }; |
| 179 | 175 | ||
| 176 | #ifdef CONFIG_ZSMALLOC_STAT | ||
| 177 | static struct dentry *zs_stat_root; | ||
| 180 | #endif | 178 | #endif |
| 181 | 179 | ||
| 182 | /* | 180 | /* |
| @@ -201,6 +199,8 @@ static int zs_size_classes; | |||
| 201 | static const int fullness_threshold_frac = 4; | 199 | static const int fullness_threshold_frac = 4; |
| 202 | 200 | ||
| 203 | struct size_class { | 201 | struct size_class { |
| 202 | spinlock_t lock; | ||
| 203 | struct page *fullness_list[_ZS_NR_FULLNESS_GROUPS]; | ||
| 204 | /* | 204 | /* |
| 205 | * Size of objects stored in this class. Must be multiple | 205 | * Size of objects stored in this class. Must be multiple |
| 206 | * of ZS_ALIGN. | 206 | * of ZS_ALIGN. |
| @@ -210,16 +210,10 @@ struct size_class { | |||
| 210 | 210 | ||
| 211 | /* Number of PAGE_SIZE sized pages to combine to form a 'zspage' */ | 211 | /* Number of PAGE_SIZE sized pages to combine to form a 'zspage' */ |
| 212 | int pages_per_zspage; | 212 | int pages_per_zspage; |
| 213 | /* huge object: pages_per_zspage == 1 && maxobj_per_zspage == 1 */ | ||
| 214 | bool huge; | ||
| 215 | |||
| 216 | #ifdef CONFIG_ZSMALLOC_STAT | ||
| 217 | struct zs_size_stat stats; | 213 | struct zs_size_stat stats; |
| 218 | #endif | ||
| 219 | |||
| 220 | spinlock_t lock; | ||
| 221 | 214 | ||
| 222 | struct page *fullness_list[_ZS_NR_FULLNESS_GROUPS]; | 215 | /* huge object: pages_per_zspage == 1 && maxobj_per_zspage == 1 */ |
| 216 | bool huge; | ||
| 223 | }; | 217 | }; |
| 224 | 218 | ||
| 225 | /* | 219 | /* |
| @@ -251,6 +245,15 @@ struct zs_pool { | |||
| 251 | gfp_t flags; /* allocation flags used when growing pool */ | 245 | gfp_t flags; /* allocation flags used when growing pool */ |
| 252 | atomic_long_t pages_allocated; | 246 | atomic_long_t pages_allocated; |
| 253 | 247 | ||
| 248 | struct zs_pool_stats stats; | ||
| 249 | |||
| 250 | /* Compact classes */ | ||
| 251 | struct shrinker shrinker; | ||
| 252 | /* | ||
| 253 | * To signify that register_shrinker() was successful | ||
| 254 | * and unregister_shrinker() will not Oops. | ||
| 255 | */ | ||
| 256 | bool shrinker_enabled; | ||
| 254 | #ifdef CONFIG_ZSMALLOC_STAT | 257 | #ifdef CONFIG_ZSMALLOC_STAT |
| 255 | struct dentry *stat_dentry; | 258 | struct dentry *stat_dentry; |
| 256 | #endif | 259 | #endif |
| @@ -285,8 +288,7 @@ static int create_handle_cache(struct zs_pool *pool) | |||
| 285 | 288 | ||
| 286 | static void destroy_handle_cache(struct zs_pool *pool) | 289 | static void destroy_handle_cache(struct zs_pool *pool) |
| 287 | { | 290 | { |
| 288 | if (pool->handle_cachep) | 291 | kmem_cache_destroy(pool->handle_cachep); |
| 289 | kmem_cache_destroy(pool->handle_cachep); | ||
| 290 | } | 292 | } |
| 291 | 293 | ||
| 292 | static unsigned long alloc_handle(struct zs_pool *pool) | 294 | static unsigned long alloc_handle(struct zs_pool *pool) |
| @@ -309,7 +311,8 @@ static void record_obj(unsigned long handle, unsigned long obj) | |||
| 309 | 311 | ||
| 310 | #ifdef CONFIG_ZPOOL | 312 | #ifdef CONFIG_ZPOOL |
| 311 | 313 | ||
| 312 | static void *zs_zpool_create(char *name, gfp_t gfp, struct zpool_ops *zpool_ops, | 314 | static void *zs_zpool_create(char *name, gfp_t gfp, |
| 315 | const struct zpool_ops *zpool_ops, | ||
| 313 | struct zpool *zpool) | 316 | struct zpool *zpool) |
| 314 | { | 317 | { |
| 315 | return zs_create_pool(name, gfp); | 318 | return zs_create_pool(name, gfp); |
| @@ -441,8 +444,6 @@ static int get_size_class_index(int size) | |||
| 441 | return min(zs_size_classes - 1, idx); | 444 | return min(zs_size_classes - 1, idx); |
| 442 | } | 445 | } |
| 443 | 446 | ||
| 444 | #ifdef CONFIG_ZSMALLOC_STAT | ||
| 445 | |||
| 446 | static inline void zs_stat_inc(struct size_class *class, | 447 | static inline void zs_stat_inc(struct size_class *class, |
| 447 | enum zs_stat_type type, unsigned long cnt) | 448 | enum zs_stat_type type, unsigned long cnt) |
| 448 | { | 449 | { |
| @@ -461,6 +462,8 @@ static inline unsigned long zs_stat_get(struct size_class *class, | |||
| 461 | return class->stats.objs[type]; | 462 | return class->stats.objs[type]; |
| 462 | } | 463 | } |
| 463 | 464 | ||
| 465 | #ifdef CONFIG_ZSMALLOC_STAT | ||
| 466 | |||
| 464 | static int __init zs_stat_init(void) | 467 | static int __init zs_stat_init(void) |
| 465 | { | 468 | { |
| 466 | if (!debugfs_initialized()) | 469 | if (!debugfs_initialized()) |
| @@ -576,23 +579,6 @@ static void zs_pool_stat_destroy(struct zs_pool *pool) | |||
| 576 | } | 579 | } |
| 577 | 580 | ||
| 578 | #else /* CONFIG_ZSMALLOC_STAT */ | 581 | #else /* CONFIG_ZSMALLOC_STAT */ |
| 579 | |||
| 580 | static inline void zs_stat_inc(struct size_class *class, | ||
| 581 | enum zs_stat_type type, unsigned long cnt) | ||
| 582 | { | ||
| 583 | } | ||
| 584 | |||
| 585 | static inline void zs_stat_dec(struct size_class *class, | ||
| 586 | enum zs_stat_type type, unsigned long cnt) | ||
| 587 | { | ||
| 588 | } | ||
| 589 | |||
| 590 | static inline unsigned long zs_stat_get(struct size_class *class, | ||
| 591 | enum zs_stat_type type) | ||
| 592 | { | ||
| 593 | return 0; | ||
| 594 | } | ||
| 595 | |||
| 596 | static int __init zs_stat_init(void) | 582 | static int __init zs_stat_init(void) |
| 597 | { | 583 | { |
| 598 | return 0; | 584 | return 0; |
| @@ -610,7 +596,6 @@ static inline int zs_pool_stat_create(char *name, struct zs_pool *pool) | |||
| 610 | static inline void zs_pool_stat_destroy(struct zs_pool *pool) | 596 | static inline void zs_pool_stat_destroy(struct zs_pool *pool) |
| 611 | { | 597 | { |
| 612 | } | 598 | } |
| 613 | |||
| 614 | #endif | 599 | #endif |
| 615 | 600 | ||
| 616 | 601 | ||
| @@ -658,13 +643,22 @@ static void insert_zspage(struct page *page, struct size_class *class, | |||
| 658 | if (fullness >= _ZS_NR_FULLNESS_GROUPS) | 643 | if (fullness >= _ZS_NR_FULLNESS_GROUPS) |
| 659 | return; | 644 | return; |
| 660 | 645 | ||
| 661 | head = &class->fullness_list[fullness]; | ||
| 662 | if (*head) | ||
| 663 | list_add_tail(&page->lru, &(*head)->lru); | ||
| 664 | |||
| 665 | *head = page; | ||
| 666 | zs_stat_inc(class, fullness == ZS_ALMOST_EMPTY ? | 646 | zs_stat_inc(class, fullness == ZS_ALMOST_EMPTY ? |
| 667 | CLASS_ALMOST_EMPTY : CLASS_ALMOST_FULL, 1); | 647 | CLASS_ALMOST_EMPTY : CLASS_ALMOST_FULL, 1); |
| 648 | |||
| 649 | head = &class->fullness_list[fullness]; | ||
| 650 | if (!*head) { | ||
| 651 | *head = page; | ||
| 652 | return; | ||
| 653 | } | ||
| 654 | |||
| 655 | /* | ||
| 656 | * We want to see more ZS_FULL pages and less almost | ||
| 657 | * empty/full. Put pages with higher ->inuse first. | ||
| 658 | */ | ||
| 659 | list_add_tail(&page->lru, &(*head)->lru); | ||
| 660 | if (page->inuse >= (*head)->inuse) | ||
| 661 | *head = page; | ||
| 668 | } | 662 | } |
| 669 | 663 | ||
| 670 | /* | 664 | /* |
| @@ -1495,7 +1489,7 @@ void zs_free(struct zs_pool *pool, unsigned long handle) | |||
| 1495 | } | 1489 | } |
| 1496 | EXPORT_SYMBOL_GPL(zs_free); | 1490 | EXPORT_SYMBOL_GPL(zs_free); |
| 1497 | 1491 | ||
| 1498 | static void zs_object_copy(unsigned long src, unsigned long dst, | 1492 | static void zs_object_copy(unsigned long dst, unsigned long src, |
| 1499 | struct size_class *class) | 1493 | struct size_class *class) |
| 1500 | { | 1494 | { |
| 1501 | struct page *s_page, *d_page; | 1495 | struct page *s_page, *d_page; |
| @@ -1602,8 +1596,6 @@ struct zs_compact_control { | |||
| 1602 | /* Starting object index within @s_page which is used for the live object | 1596 | /* Starting object index within @s_page which is used for the live object |
| 1603 | * in the subpage. */ | 1597 | * in the subpage. */ |
| 1604 | int index; | 1598 | int index; |
| 1605 | /* how many of objects are migrated */ | ||
| 1606 | int nr_migrated; | ||
| 1607 | }; | 1599 | }; |
| 1608 | 1600 | ||
| 1609 | static int migrate_zspage(struct zs_pool *pool, struct size_class *class, | 1601 | static int migrate_zspage(struct zs_pool *pool, struct size_class *class, |
| @@ -1614,7 +1606,6 @@ static int migrate_zspage(struct zs_pool *pool, struct size_class *class, | |||
| 1614 | struct page *s_page = cc->s_page; | 1606 | struct page *s_page = cc->s_page; |
| 1615 | struct page *d_page = cc->d_page; | 1607 | struct page *d_page = cc->d_page; |
| 1616 | unsigned long index = cc->index; | 1608 | unsigned long index = cc->index; |
| 1617 | int nr_migrated = 0; | ||
| 1618 | int ret = 0; | 1609 | int ret = 0; |
| 1619 | 1610 | ||
| 1620 | while (1) { | 1611 | while (1) { |
| @@ -1636,23 +1627,21 @@ static int migrate_zspage(struct zs_pool *pool, struct size_class *class, | |||
| 1636 | 1627 | ||
| 1637 | used_obj = handle_to_obj(handle); | 1628 | used_obj = handle_to_obj(handle); |
| 1638 | free_obj = obj_malloc(d_page, class, handle); | 1629 | free_obj = obj_malloc(d_page, class, handle); |
| 1639 | zs_object_copy(used_obj, free_obj, class); | 1630 | zs_object_copy(free_obj, used_obj, class); |
| 1640 | index++; | 1631 | index++; |
| 1641 | record_obj(handle, free_obj); | 1632 | record_obj(handle, free_obj); |
| 1642 | unpin_tag(handle); | 1633 | unpin_tag(handle); |
| 1643 | obj_free(pool, class, used_obj); | 1634 | obj_free(pool, class, used_obj); |
| 1644 | nr_migrated++; | ||
| 1645 | } | 1635 | } |
| 1646 | 1636 | ||
| 1647 | /* Remember last position in this iteration */ | 1637 | /* Remember last position in this iteration */ |
| 1648 | cc->s_page = s_page; | 1638 | cc->s_page = s_page; |
| 1649 | cc->index = index; | 1639 | cc->index = index; |
| 1650 | cc->nr_migrated = nr_migrated; | ||
| 1651 | 1640 | ||
| 1652 | return ret; | 1641 | return ret; |
| 1653 | } | 1642 | } |
| 1654 | 1643 | ||
| 1655 | static struct page *alloc_target_page(struct size_class *class) | 1644 | static struct page *isolate_target_page(struct size_class *class) |
| 1656 | { | 1645 | { |
| 1657 | int i; | 1646 | int i; |
| 1658 | struct page *page; | 1647 | struct page *page; |
| @@ -1668,8 +1657,17 @@ static struct page *alloc_target_page(struct size_class *class) | |||
| 1668 | return page; | 1657 | return page; |
| 1669 | } | 1658 | } |
| 1670 | 1659 | ||
| 1671 | static void putback_zspage(struct zs_pool *pool, struct size_class *class, | 1660 | /* |
| 1672 | struct page *first_page) | 1661 | * putback_zspage - add @first_page into the right class's fullness list |
| 1662 | * @pool: target pool | ||
| 1663 | * @class: destination class | ||
| 1664 | * @first_page: target page | ||
| 1665 | * | ||
| 1666 | * Return @first_page's fullness_group | ||
| 1667 | */ | ||
| 1668 | static enum fullness_group putback_zspage(struct zs_pool *pool, | ||
| 1669 | struct size_class *class, | ||
| 1670 | struct page *first_page) | ||
| 1673 | { | 1671 | { |
| 1674 | enum fullness_group fullness; | 1672 | enum fullness_group fullness; |
| 1675 | 1673 | ||
| @@ -1687,50 +1685,72 @@ static void putback_zspage(struct zs_pool *pool, struct size_class *class, | |||
| 1687 | 1685 | ||
| 1688 | free_zspage(first_page); | 1686 | free_zspage(first_page); |
| 1689 | } | 1687 | } |
| 1688 | |||
| 1689 | return fullness; | ||
| 1690 | } | 1690 | } |
| 1691 | 1691 | ||
| 1692 | static struct page *isolate_source_page(struct size_class *class) | 1692 | static struct page *isolate_source_page(struct size_class *class) |
| 1693 | { | 1693 | { |
| 1694 | struct page *page; | 1694 | int i; |
| 1695 | struct page *page = NULL; | ||
| 1695 | 1696 | ||
| 1696 | page = class->fullness_list[ZS_ALMOST_EMPTY]; | 1697 | for (i = ZS_ALMOST_EMPTY; i >= ZS_ALMOST_FULL; i--) { |
| 1697 | if (page) | 1698 | page = class->fullness_list[i]; |
| 1698 | remove_zspage(page, class, ZS_ALMOST_EMPTY); | 1699 | if (!page) |
| 1700 | continue; | ||
| 1701 | |||
| 1702 | remove_zspage(page, class, i); | ||
| 1703 | break; | ||
| 1704 | } | ||
| 1699 | 1705 | ||
| 1700 | return page; | 1706 | return page; |
| 1701 | } | 1707 | } |
| 1702 | 1708 | ||
| 1703 | static unsigned long __zs_compact(struct zs_pool *pool, | 1709 | /* |
| 1704 | struct size_class *class) | 1710 | * |
| 1711 | * Based on the number of unused allocated objects calculate | ||
| 1712 | * and return the number of pages that we can free. | ||
| 1713 | */ | ||
| 1714 | static unsigned long zs_can_compact(struct size_class *class) | ||
| 1715 | { | ||
| 1716 | unsigned long obj_wasted; | ||
| 1717 | |||
| 1718 | obj_wasted = zs_stat_get(class, OBJ_ALLOCATED) - | ||
| 1719 | zs_stat_get(class, OBJ_USED); | ||
| 1720 | |||
| 1721 | obj_wasted /= get_maxobj_per_zspage(class->size, | ||
| 1722 | class->pages_per_zspage); | ||
| 1723 | |||
| 1724 | return obj_wasted * class->pages_per_zspage; | ||
| 1725 | } | ||
| 1726 | |||
| 1727 | static void __zs_compact(struct zs_pool *pool, struct size_class *class) | ||
| 1705 | { | 1728 | { |
| 1706 | int nr_to_migrate; | ||
| 1707 | struct zs_compact_control cc; | 1729 | struct zs_compact_control cc; |
| 1708 | struct page *src_page; | 1730 | struct page *src_page; |
| 1709 | struct page *dst_page = NULL; | 1731 | struct page *dst_page = NULL; |
| 1710 | unsigned long nr_total_migrated = 0; | ||
| 1711 | 1732 | ||
| 1712 | spin_lock(&class->lock); | 1733 | spin_lock(&class->lock); |
| 1713 | while ((src_page = isolate_source_page(class))) { | 1734 | while ((src_page = isolate_source_page(class))) { |
| 1714 | 1735 | ||
| 1715 | BUG_ON(!is_first_page(src_page)); | 1736 | BUG_ON(!is_first_page(src_page)); |
| 1716 | 1737 | ||
| 1717 | /* The goal is to migrate all live objects in source page */ | 1738 | if (!zs_can_compact(class)) |
| 1718 | nr_to_migrate = src_page->inuse; | 1739 | break; |
| 1740 | |||
| 1719 | cc.index = 0; | 1741 | cc.index = 0; |
| 1720 | cc.s_page = src_page; | 1742 | cc.s_page = src_page; |
| 1721 | 1743 | ||
| 1722 | while ((dst_page = alloc_target_page(class))) { | 1744 | while ((dst_page = isolate_target_page(class))) { |
| 1723 | cc.d_page = dst_page; | 1745 | cc.d_page = dst_page; |
| 1724 | /* | 1746 | /* |
| 1725 | * If there is no more space in dst_page, try to | 1747 | * If there is no more space in dst_page, resched |
| 1726 | * allocate another zspage. | 1748 | * and see if anyone had allocated another zspage. |
| 1727 | */ | 1749 | */ |
| 1728 | if (!migrate_zspage(pool, class, &cc)) | 1750 | if (!migrate_zspage(pool, class, &cc)) |
| 1729 | break; | 1751 | break; |
| 1730 | 1752 | ||
| 1731 | putback_zspage(pool, class, dst_page); | 1753 | putback_zspage(pool, class, dst_page); |
| 1732 | nr_total_migrated += cc.nr_migrated; | ||
| 1733 | nr_to_migrate -= cc.nr_migrated; | ||
| 1734 | } | 1754 | } |
| 1735 | 1755 | ||
| 1736 | /* Stop if we couldn't find slot */ | 1756 | /* Stop if we couldn't find slot */ |
| @@ -1738,9 +1758,9 @@ static unsigned long __zs_compact(struct zs_pool *pool, | |||
| 1738 | break; | 1758 | break; |
| 1739 | 1759 | ||
| 1740 | putback_zspage(pool, class, dst_page); | 1760 | putback_zspage(pool, class, dst_page); |
| 1741 | putback_zspage(pool, class, src_page); | 1761 | if (putback_zspage(pool, class, src_page) == ZS_EMPTY) |
| 1762 | pool->stats.pages_compacted += class->pages_per_zspage; | ||
| 1742 | spin_unlock(&class->lock); | 1763 | spin_unlock(&class->lock); |
| 1743 | nr_total_migrated += cc.nr_migrated; | ||
| 1744 | cond_resched(); | 1764 | cond_resched(); |
| 1745 | spin_lock(&class->lock); | 1765 | spin_lock(&class->lock); |
| 1746 | } | 1766 | } |
| @@ -1749,14 +1769,11 @@ static unsigned long __zs_compact(struct zs_pool *pool, | |||
| 1749 | putback_zspage(pool, class, src_page); | 1769 | putback_zspage(pool, class, src_page); |
| 1750 | 1770 | ||
| 1751 | spin_unlock(&class->lock); | 1771 | spin_unlock(&class->lock); |
| 1752 | |||
| 1753 | return nr_total_migrated; | ||
| 1754 | } | 1772 | } |
| 1755 | 1773 | ||
| 1756 | unsigned long zs_compact(struct zs_pool *pool) | 1774 | unsigned long zs_compact(struct zs_pool *pool) |
| 1757 | { | 1775 | { |
| 1758 | int i; | 1776 | int i; |
| 1759 | unsigned long nr_migrated = 0; | ||
| 1760 | struct size_class *class; | 1777 | struct size_class *class; |
| 1761 | 1778 | ||
| 1762 | for (i = zs_size_classes - 1; i >= 0; i--) { | 1779 | for (i = zs_size_classes - 1; i >= 0; i--) { |
| @@ -1765,13 +1782,80 @@ unsigned long zs_compact(struct zs_pool *pool) | |||
| 1765 | continue; | 1782 | continue; |
| 1766 | if (class->index != i) | 1783 | if (class->index != i) |
| 1767 | continue; | 1784 | continue; |
| 1768 | nr_migrated += __zs_compact(pool, class); | 1785 | __zs_compact(pool, class); |
| 1769 | } | 1786 | } |
| 1770 | 1787 | ||
| 1771 | return nr_migrated; | 1788 | return pool->stats.pages_compacted; |
| 1772 | } | 1789 | } |
| 1773 | EXPORT_SYMBOL_GPL(zs_compact); | 1790 | EXPORT_SYMBOL_GPL(zs_compact); |
| 1774 | 1791 | ||
| 1792 | void zs_pool_stats(struct zs_pool *pool, struct zs_pool_stats *stats) | ||
| 1793 | { | ||
| 1794 | memcpy(stats, &pool->stats, sizeof(struct zs_pool_stats)); | ||
| 1795 | } | ||
| 1796 | EXPORT_SYMBOL_GPL(zs_pool_stats); | ||
| 1797 | |||
| 1798 | static unsigned long zs_shrinker_scan(struct shrinker *shrinker, | ||
| 1799 | struct shrink_control *sc) | ||
| 1800 | { | ||
| 1801 | unsigned long pages_freed; | ||
| 1802 | struct zs_pool *pool = container_of(shrinker, struct zs_pool, | ||
| 1803 | shrinker); | ||
| 1804 | |||
| 1805 | pages_freed = pool->stats.pages_compacted; | ||
| 1806 | /* | ||
| 1807 | * Compact classes and calculate compaction delta. | ||
| 1808 | * Can run concurrently with a manually triggered | ||
| 1809 | * (by user) compaction. | ||
| 1810 | */ | ||
| 1811 | pages_freed = zs_compact(pool) - pages_freed; | ||
| 1812 | |||
| 1813 | return pages_freed ? pages_freed : SHRINK_STOP; | ||
| 1814 | } | ||
| 1815 | |||
| 1816 | static unsigned long zs_shrinker_count(struct shrinker *shrinker, | ||
| 1817 | struct shrink_control *sc) | ||
| 1818 | { | ||
| 1819 | int i; | ||
| 1820 | struct size_class *class; | ||
| 1821 | unsigned long pages_to_free = 0; | ||
| 1822 | struct zs_pool *pool = container_of(shrinker, struct zs_pool, | ||
| 1823 | shrinker); | ||
| 1824 | |||
| 1825 | if (!pool->shrinker_enabled) | ||
| 1826 | return 0; | ||
| 1827 | |||
| 1828 | for (i = zs_size_classes - 1; i >= 0; i--) { | ||
| 1829 | class = pool->size_class[i]; | ||
| 1830 | if (!class) | ||
| 1831 | continue; | ||
| 1832 | if (class->index != i) | ||
| 1833 | continue; | ||
| 1834 | |||
| 1835 | pages_to_free += zs_can_compact(class); | ||
| 1836 | } | ||
| 1837 | |||
| 1838 | return pages_to_free; | ||
| 1839 | } | ||
| 1840 | |||
| 1841 | static void zs_unregister_shrinker(struct zs_pool *pool) | ||
| 1842 | { | ||
| 1843 | if (pool->shrinker_enabled) { | ||
| 1844 | unregister_shrinker(&pool->shrinker); | ||
| 1845 | pool->shrinker_enabled = false; | ||
| 1846 | } | ||
| 1847 | } | ||
| 1848 | |||
| 1849 | static int zs_register_shrinker(struct zs_pool *pool) | ||
| 1850 | { | ||
| 1851 | pool->shrinker.scan_objects = zs_shrinker_scan; | ||
| 1852 | pool->shrinker.count_objects = zs_shrinker_count; | ||
| 1853 | pool->shrinker.batch = 0; | ||
| 1854 | pool->shrinker.seeks = DEFAULT_SEEKS; | ||
| 1855 | |||
| 1856 | return register_shrinker(&pool->shrinker); | ||
| 1857 | } | ||
| 1858 | |||
| 1775 | /** | 1859 | /** |
| 1776 | * zs_create_pool - Creates an allocation pool to work from. | 1860 | * zs_create_pool - Creates an allocation pool to work from. |
| 1777 | * @flags: allocation flags used to allocate pool metadata | 1861 | * @flags: allocation flags used to allocate pool metadata |
| @@ -1857,6 +1941,12 @@ struct zs_pool *zs_create_pool(char *name, gfp_t flags) | |||
| 1857 | if (zs_pool_stat_create(name, pool)) | 1941 | if (zs_pool_stat_create(name, pool)) |
| 1858 | goto err; | 1942 | goto err; |
| 1859 | 1943 | ||
| 1944 | /* | ||
| 1945 | * Not critical, we can still use the pool | ||
| 1946 | * and the user can trigger compaction manually. | ||
| 1947 | */ | ||
| 1948 | if (zs_register_shrinker(pool) == 0) | ||
| 1949 | pool->shrinker_enabled = true; | ||
| 1860 | return pool; | 1950 | return pool; |
| 1861 | 1951 | ||
| 1862 | err: | 1952 | err: |
| @@ -1869,6 +1959,7 @@ void zs_destroy_pool(struct zs_pool *pool) | |||
| 1869 | { | 1959 | { |
| 1870 | int i; | 1960 | int i; |
| 1871 | 1961 | ||
| 1962 | zs_unregister_shrinker(pool); | ||
| 1872 | zs_pool_stat_destroy(pool); | 1963 | zs_pool_stat_destroy(pool); |
| 1873 | 1964 | ||
| 1874 | for (i = 0; i < zs_size_classes; i++) { | 1965 | for (i = 0; i < zs_size_classes; i++) { |
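The zsmalloc hunks above replace the per-call migration counter with a cumulative `pages_compacted` statistic: `zs_can_compact()` estimates how many pages a class could give back, and the new shrinker reports the delta of `pool->stats.pages_compacted` across a `zs_compact()` run. The reclaim estimate is plain integer arithmetic; the user-space sketch below only restates it, with `struct class_sample`, `can_compact()` and the sample numbers as illustrative stand-ins rather than kernel code.

```c
/*
 * Stand-alone illustration of the zs_can_compact() arithmetic from the
 * hunk above. struct class_sample is a simplified stand-in for the
 * kernel's size_class and carries only the values the calculation uses.
 */
#include <stdio.h>

struct class_sample {
	unsigned long objs_allocated;	/* OBJ_ALLOCATED stat */
	unsigned long objs_used;	/* OBJ_USED stat */
	unsigned long objs_per_zspage;	/* get_maxobj_per_zspage() result */
	unsigned long pages_per_zspage;	/* pages backing one zspage */
};

/*
 * Wasted (allocated but unused) objects, rounded down to whole zspages,
 * times the pages per zspage -> pages that compaction could free.
 */
static unsigned long can_compact(const struct class_sample *c)
{
	unsigned long obj_wasted = c->objs_allocated - c->objs_used;

	obj_wasted /= c->objs_per_zspage;
	return obj_wasted * c->pages_per_zspage;
}

int main(void)
{
	/* illustrative numbers: 512 slots allocated, 200 in use */
	struct class_sample c = { 512, 200, 128, 4 };

	/* (512 - 200) / 128 = 2 zspages -> 2 * 4 = 8 reclaimable pages */
	printf("reclaimable pages: %lu\n", can_compact(&c));
	return 0;
}
```

Because the wasted-object count is rounded down to whole zspages, the figure that `zs_shrinker_count()` sums per class corresponds to zspages a successful compaction could plausibly release, though it remains a best-effort estimate under concurrent allocation.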
diff --git a/mm/zswap.c b/mm/zswap.c index 2d5727baed59..48a1d081e2a5 100644 --- a/mm/zswap.c +++ b/mm/zswap.c | |||
| @@ -446,75 +446,14 @@ enum zswap_get_swap_ret { | |||
| 446 | static int zswap_get_swap_cache_page(swp_entry_t entry, | 446 | static int zswap_get_swap_cache_page(swp_entry_t entry, |
| 447 | struct page **retpage) | 447 | struct page **retpage) |
| 448 | { | 448 | { |
| 449 | struct page *found_page, *new_page = NULL; | 449 | bool page_was_allocated; |
| 450 | struct address_space *swapper_space = swap_address_space(entry); | ||
| 451 | int err; | ||
| 452 | 450 | ||
| 453 | *retpage = NULL; | 451 | *retpage = __read_swap_cache_async(entry, GFP_KERNEL, |
| 454 | do { | 452 | NULL, 0, &page_was_allocated); |
| 455 | /* | 453 | if (page_was_allocated) |
| 456 | * First check the swap cache. Since this is normally | 454 | return ZSWAP_SWAPCACHE_NEW; |
| 457 | * called after lookup_swap_cache() failed, re-calling | 455 | if (!*retpage) |
| 458 | * that would confuse statistics. | ||
| 459 | */ | ||
| 460 | found_page = find_get_page(swapper_space, entry.val); | ||
| 461 | if (found_page) | ||
| 462 | break; | ||
| 463 | |||
| 464 | /* | ||
| 465 | * Get a new page to read into from swap. | ||
| 466 | */ | ||
| 467 | if (!new_page) { | ||
| 468 | new_page = alloc_page(GFP_KERNEL); | ||
| 469 | if (!new_page) | ||
| 470 | break; /* Out of memory */ | ||
| 471 | } | ||
| 472 | |||
| 473 | /* | ||
| 474 | * call radix_tree_preload() while we can wait. | ||
| 475 | */ | ||
| 476 | err = radix_tree_preload(GFP_KERNEL); | ||
| 477 | if (err) | ||
| 478 | break; | ||
| 479 | |||
| 480 | /* | ||
| 481 | * Swap entry may have been freed since our caller observed it. | ||
| 482 | */ | ||
| 483 | err = swapcache_prepare(entry); | ||
| 484 | if (err == -EEXIST) { /* seems racy */ | ||
| 485 | radix_tree_preload_end(); | ||
| 486 | continue; | ||
| 487 | } | ||
| 488 | if (err) { /* swp entry is obsolete ? */ | ||
| 489 | radix_tree_preload_end(); | ||
| 490 | break; | ||
| 491 | } | ||
| 492 | |||
| 493 | /* May fail (-ENOMEM) if radix-tree node allocation failed. */ | ||
| 494 | __set_page_locked(new_page); | ||
| 495 | SetPageSwapBacked(new_page); | ||
| 496 | err = __add_to_swap_cache(new_page, entry); | ||
| 497 | if (likely(!err)) { | ||
| 498 | radix_tree_preload_end(); | ||
| 499 | lru_cache_add_anon(new_page); | ||
| 500 | *retpage = new_page; | ||
| 501 | return ZSWAP_SWAPCACHE_NEW; | ||
| 502 | } | ||
| 503 | radix_tree_preload_end(); | ||
| 504 | ClearPageSwapBacked(new_page); | ||
| 505 | __clear_page_locked(new_page); | ||
| 506 | /* | ||
| 507 | * add_to_swap_cache() doesn't return -EEXIST, so we can safely | ||
| 508 | * clear SWAP_HAS_CACHE flag. | ||
| 509 | */ | ||
| 510 | swapcache_free(entry); | ||
| 511 | } while (err != -ENOMEM); | ||
| 512 | |||
| 513 | if (new_page) | ||
| 514 | page_cache_release(new_page); | ||
| 515 | if (!found_page) | ||
| 516 | return ZSWAP_SWAPCACHE_FAIL; | 456 | return ZSWAP_SWAPCACHE_FAIL; |
| 517 | *retpage = found_page; | ||
| 518 | return ZSWAP_SWAPCACHE_EXIST; | 457 | return ZSWAP_SWAPCACHE_EXIST; |
| 519 | } | 458 | } |
| 520 | 459 | ||
| @@ -816,7 +755,7 @@ static void zswap_frontswap_invalidate_area(unsigned type) | |||
| 816 | zswap_trees[type] = NULL; | 755 | zswap_trees[type] = NULL; |
| 817 | } | 756 | } |
| 818 | 757 | ||
| 819 | static struct zpool_ops zswap_zpool_ops = { | 758 | static const struct zpool_ops zswap_zpool_ops = { |
| 820 | .evict = zswap_writeback_entry | 759 | .evict = zswap_writeback_entry |
| 821 | }; | 760 | }; |
| 822 | 761 | ||
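On the zswap side, the open-coded swap-cache lookup/insert loop is folded into a single `__read_swap_cache_async()` call, and the `page_was_allocated` flag it reports drives the three `ZSWAP_SWAPCACHE_*` outcomes. The sketch below only illustrates that mapping; `classify()` and its enum are hypothetical helpers, not the kernel API.

```c
/*
 * Sketch of how the (page, page_was_allocated) pair from
 * __read_swap_cache_async() maps onto the three outcomes of the
 * simplified zswap_get_swap_cache_page() above. classify() and the
 * enum below are illustrative stand-ins, not kernel definitions.
 */
#include <stdbool.h>
#include <stdio.h>

enum swapcache_outcome { OUTCOME_NEW, OUTCOME_EXIST, OUTCOME_FAIL };

static enum swapcache_outcome classify(const void *page, bool page_was_allocated)
{
	if (page_was_allocated)
		return OUTCOME_NEW;	/* fresh page added; caller decompresses into it */
	if (!page)
		return OUTCOME_FAIL;	/* swap entry vanished or allocation failed */
	return OUTCOME_EXIST;		/* page was already in the swap cache */
}

int main(void)
{
	int dummy;

	printf("%d %d %d\n",
	       classify(&dummy, true),	/* NEW */
	       classify(&dummy, false),	/* EXIST */
	       classify(NULL, false));	/* FAIL */
	return 0;
}
```

The practical gain is that the radix-tree preload, `swapcache_prepare()` retry and LRU insertion steps removed here now presumably live behind `__read_swap_cache_async()` in mm/swap_state.c instead of being duplicated in zswap.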
