author     Linus Torvalds <torvalds@linux-foundation.org>   2014-01-21 22:05:45 -0500
committer  Linus Torvalds <torvalds@linux-foundation.org>   2014-01-21 22:05:45 -0500
commit     df32e43a54d04eda35d2859beaf90e3864d53288 (patch)
tree       7a61cf658b2949bd426285eb9902be7758ced1ba /mm
parent     fbd918a2026d0464ce9c23f57b7de4bcfccdc2e6 (diff)
parent     78d5506e82b21a1a1de68c24182db2c2fe521422 (diff)
Merge branch 'akpm' (incoming from Andrew)
Merge first patch-bomb from Andrew Morton:

 - a couple of misc things
 - inotify/fsnotify work from Jan
 - ocfs2 updates (partial)
 - about half of MM

* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (117 commits)
  mm/migrate: remove unused function, fail_migrate_page()
  mm/migrate: remove putback_lru_pages, fix comment on putback_movable_pages
  mm/migrate: correct failure handling if !hugepage_migration_support()
  mm/migrate: add comment about permanent failure path
  mm, page_alloc: warn for non-blockable __GFP_NOFAIL allocation failure
  mm: compaction: reset scanner positions immediately when they meet
  mm: compaction: do not mark unmovable pageblocks as skipped in async compaction
  mm: compaction: detect when scanners meet in isolate_freepages
  mm: compaction: reset cached scanner pfn's before reading them
  mm: compaction: encapsulate defer reset logic
  mm: compaction: trace compaction begin and end
  memcg, oom: lock mem_cgroup_print_oom_info
  sched: add tracepoints related to NUMA task migration
  mm: numa: do not automatically migrate KSM pages
  mm: numa: trace tasks that fail migration due to rate limiting
  mm: numa: limit scope of lock for NUMA migrate rate limiting
  mm: numa: make NUMA-migrate related functions static
  lib/show_mem.c: show num_poisoned_pages when oom
  mm/hwpoison: add '#' to hwpoison_inject
  mm/memblock: use WARN_ONCE when MAX_NUMNODES passed as input parameter
  ...
Diffstat (limited to 'mm')
-rw-r--r--  mm/compaction.c        61
-rw-r--r--  mm/hugetlb.c           46
-rw-r--r--  mm/hwpoison-inject.c    2
-rw-r--r--  mm/internal.h           4
-rw-r--r--  mm/ksm.c              121
-rw-r--r--  mm/memblock.c         387
-rw-r--r--  mm/memcontrol.c        17
-rw-r--r--  mm/memory-failure.c    10
-rw-r--r--  mm/memory.c            16
-rw-r--r--  mm/memory_hotplug.c     4
-rw-r--r--  mm/migrate.c           89
-rw-r--r--  mm/mlock.c             18
-rw-r--r--  mm/mmap.c              46
-rw-r--r--  mm/mprotect.c           3
-rw-r--r--  mm/nobootmem.c         10
-rw-r--r--  mm/nommu.c              1
-rw-r--r--  mm/oom_kill.c          51
-rw-r--r--  mm/page_alloc.c        89
-rw-r--r--  mm/page_cgroup.c        5
-rw-r--r--  mm/percpu.c            38
-rw-r--r--  mm/rmap.c             580
-rw-r--r--  mm/sparse-vmemmap.c     6
-rw-r--r--  mm/sparse.c            27
-rw-r--r--  mm/swap.c             278
-rw-r--r--  mm/util.c              36
-rw-r--r--  mm/vmalloc.c           20
26 files changed, 1145 insertions(+), 820 deletions(-)
diff --git a/mm/compaction.c b/mm/compaction.c
index f58bcd016f43..3a91a2ea3d34 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -459,6 +459,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
459 unsigned long flags; 459 unsigned long flags;
460 bool locked = false; 460 bool locked = false;
461 struct page *page = NULL, *valid_page = NULL; 461 struct page *page = NULL, *valid_page = NULL;
462 bool skipped_async_unsuitable = false;
462 463
463 /* 464 /*
464 * Ensure that there are not too many pages isolated from the LRU 465 * Ensure that there are not too many pages isolated from the LRU
@@ -534,6 +535,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
534 if (!cc->sync && last_pageblock_nr != pageblock_nr && 535 if (!cc->sync && last_pageblock_nr != pageblock_nr &&
535 !migrate_async_suitable(get_pageblock_migratetype(page))) { 536 !migrate_async_suitable(get_pageblock_migratetype(page))) {
536 cc->finished_update_migrate = true; 537 cc->finished_update_migrate = true;
538 skipped_async_unsuitable = true;
537 goto next_pageblock; 539 goto next_pageblock;
538 } 540 }
539 541
@@ -627,8 +629,13 @@ next_pageblock:
627 if (locked) 629 if (locked)
628 spin_unlock_irqrestore(&zone->lru_lock, flags); 630 spin_unlock_irqrestore(&zone->lru_lock, flags);
629 631
630 /* Update the pageblock-skip if the whole pageblock was scanned */ 632 /*
631 if (low_pfn == end_pfn) 633 * Update the pageblock-skip information and cached scanner pfn,
634 * if the whole pageblock was scanned without isolating any page.
635 * This is not done when pageblock was skipped due to being unsuitable
636 * for async compaction, so that eventual sync compaction can try.
637 */
638 if (low_pfn == end_pfn && !skipped_async_unsuitable)
632 update_pageblock_skip(cc, valid_page, nr_isolated, true); 639 update_pageblock_skip(cc, valid_page, nr_isolated, true);
633 640
634 trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated); 641 trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated);
@@ -660,7 +667,7 @@ static void isolate_freepages(struct zone *zone,
660 * is the end of the pageblock the migration scanner is using. 667 * is the end of the pageblock the migration scanner is using.
661 */ 668 */
662 pfn = cc->free_pfn; 669 pfn = cc->free_pfn;
663 low_pfn = cc->migrate_pfn + pageblock_nr_pages; 670 low_pfn = ALIGN(cc->migrate_pfn + 1, pageblock_nr_pages);
664 671
665 /* 672 /*
666 * Take care that if the migration scanner is at the end of the zone 673 * Take care that if the migration scanner is at the end of the zone
@@ -676,7 +683,7 @@ static void isolate_freepages(struct zone *zone,
676 * pages on cc->migratepages. We stop searching if the migrate 683 * pages on cc->migratepages. We stop searching if the migrate
677 * and free page scanners meet or enough free pages are isolated. 684 * and free page scanners meet or enough free pages are isolated.
678 */ 685 */
679 for (; pfn > low_pfn && cc->nr_migratepages > nr_freepages; 686 for (; pfn >= low_pfn && cc->nr_migratepages > nr_freepages;
680 pfn -= pageblock_nr_pages) { 687 pfn -= pageblock_nr_pages) {
681 unsigned long isolated; 688 unsigned long isolated;
682 689
@@ -738,7 +745,14 @@ static void isolate_freepages(struct zone *zone,
738 /* split_free_page does not map the pages */ 745 /* split_free_page does not map the pages */
739 map_pages(freelist); 746 map_pages(freelist);
740 747
741 cc->free_pfn = high_pfn; 748 /*
749 * If we crossed the migrate scanner, we want to keep it that way
750 * so that compact_finished() may detect this
751 */
752 if (pfn < low_pfn)
753 cc->free_pfn = max(pfn, zone->zone_start_pfn);
754 else
755 cc->free_pfn = high_pfn;
742 cc->nr_freepages = nr_freepages; 756 cc->nr_freepages = nr_freepages;
743} 757}
744 758
@@ -837,6 +851,10 @@ static int compact_finished(struct zone *zone,
837 851
838 /* Compaction run completes if the migrate and free scanner meet */ 852 /* Compaction run completes if the migrate and free scanner meet */
839 if (cc->free_pfn <= cc->migrate_pfn) { 853 if (cc->free_pfn <= cc->migrate_pfn) {
854 /* Let the next compaction start anew. */
855 zone->compact_cached_migrate_pfn = zone->zone_start_pfn;
856 zone->compact_cached_free_pfn = zone_end_pfn(zone);
857
840 /* 858 /*
841 * Mark that the PG_migrate_skip information should be cleared 859 * Mark that the PG_migrate_skip information should be cleared
842 * by kswapd when it goes to sleep. kswapd does not set the 860 * by kswapd when it goes to sleep. kswapd does not set the
@@ -947,6 +965,14 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
947 } 965 }
948 966
949 /* 967 /*
968 * Clear pageblock skip if there were failures recently and compaction
969 * is about to be retried after being deferred. kswapd does not do
970 * this reset as it'll reset the cached information when going to sleep.
971 */
972 if (compaction_restarting(zone, cc->order) && !current_is_kswapd())
973 __reset_isolation_suitable(zone);
974
975 /*
950 * Setup to move all movable pages to the end of the zone. Used cached 976 * Setup to move all movable pages to the end of the zone. Used cached
951 * information on where the scanners should start but check that it 977 * information on where the scanners should start but check that it
952 * is initialised by ensuring the values are within zone boundaries. 978 * is initialised by ensuring the values are within zone boundaries.
@@ -962,13 +988,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
962 zone->compact_cached_migrate_pfn = cc->migrate_pfn; 988 zone->compact_cached_migrate_pfn = cc->migrate_pfn;
963 } 989 }
964 990
965 /* 991 trace_mm_compaction_begin(start_pfn, cc->migrate_pfn, cc->free_pfn, end_pfn);
966 * Clear pageblock skip if there were failures recently and compaction
967 * is about to be retried after being deferred. kswapd does not do
968 * this reset as it'll reset the cached information when going to sleep.
969 */
970 if (compaction_restarting(zone, cc->order) && !current_is_kswapd())
971 __reset_isolation_suitable(zone);
972 992
973 migrate_prep_local(); 993 migrate_prep_local();
974 994
@@ -1003,7 +1023,11 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
1003 if (err) { 1023 if (err) {
1004 putback_movable_pages(&cc->migratepages); 1024 putback_movable_pages(&cc->migratepages);
1005 cc->nr_migratepages = 0; 1025 cc->nr_migratepages = 0;
1006 if (err == -ENOMEM) { 1026 /*
1027 * migrate_pages() may return -ENOMEM when scanners meet
1028 * and we want compact_finished() to detect it
1029 */
1030 if (err == -ENOMEM && cc->free_pfn > cc->migrate_pfn) {
1007 ret = COMPACT_PARTIAL; 1031 ret = COMPACT_PARTIAL;
1008 goto out; 1032 goto out;
1009 } 1033 }
@@ -1015,6 +1039,8 @@ out:
1015 cc->nr_freepages -= release_freepages(&cc->freepages); 1039 cc->nr_freepages -= release_freepages(&cc->freepages);
1016 VM_BUG_ON(cc->nr_freepages != 0); 1040 VM_BUG_ON(cc->nr_freepages != 0);
1017 1041
1042 trace_mm_compaction_end(ret);
1043
1018 return ret; 1044 return ret;
1019} 1045}
1020 1046
@@ -1120,12 +1146,11 @@ static void __compact_pgdat(pg_data_t *pgdat, struct compact_control *cc)
1120 compact_zone(zone, cc); 1146 compact_zone(zone, cc);
1121 1147
1122 if (cc->order > 0) { 1148 if (cc->order > 0) {
1123 int ok = zone_watermark_ok(zone, cc->order, 1149 if (zone_watermark_ok(zone, cc->order,
1124 low_wmark_pages(zone), 0, 0); 1150 low_wmark_pages(zone), 0, 0))
1125 if (ok && cc->order >= zone->compact_order_failed) 1151 compaction_defer_reset(zone, cc->order, false);
1126 zone->compact_order_failed = cc->order + 1;
1127 /* Currently async compaction is never deferred. */ 1152 /* Currently async compaction is never deferred. */
1128 else if (!ok && cc->sync) 1153 else if (cc->sync)
1129 defer_compaction(zone, cc->order); 1154 defer_compaction(zone, cc->order);
1130 } 1155 }
1131 1156
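
The compaction hunks above all serve one idea: the migrate scanner walks forward, the free scanner walks backward, and once they meet the run is finished and the cached scan positions are reset so the next pass starts from the zone boundaries again. A minimal, self-contained sketch of that converge-and-reset behaviour (toy types, not the kernel's compact_control or zone structures):

/* Toy model of the converge-and-reset behaviour; illustrative only. */
#include <stdio.h>

struct toy_zone {
    unsigned long start, end;            /* zone bounds, in pageblocks */
    unsigned long cached_migrate_pfn;    /* migrate scanner resume point */
    unsigned long cached_free_pfn;       /* free scanner resume point */
};

static void toy_compact(struct toy_zone *z)
{
    unsigned long migrate = z->cached_migrate_pfn;
    unsigned long free_pfn = z->cached_free_pfn;

    /* Scanners walk toward each other until they meet or cross. */
    while (migrate < free_pfn) {
        migrate++;
        if (migrate >= free_pfn)
            break;
        free_pfn--;
    }

    /* Scanners met: let the next compaction pass start anew. */
    z->cached_migrate_pfn = z->start;
    z->cached_free_pfn = z->end;
}

int main(void)
{
    struct toy_zone z = { 0, 16, 5, 12 };

    toy_compact(&z);
    printf("cached scan window reset to [%lu, %lu)\n",
           z.cached_migrate_pfn, z.cached_free_pfn);
    return 0;
}
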
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index dee6cf4e6d34..04306b9de90d 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -690,15 +690,11 @@ static void prep_compound_gigantic_page(struct page *page, unsigned long order)
690 */ 690 */
691int PageHuge(struct page *page) 691int PageHuge(struct page *page)
692{ 692{
693 compound_page_dtor *dtor;
694
695 if (!PageCompound(page)) 693 if (!PageCompound(page))
696 return 0; 694 return 0;
697 695
698 page = compound_head(page); 696 page = compound_head(page);
699 dtor = get_compound_page_dtor(page); 697 return get_compound_page_dtor(page) == free_huge_page;
700
701 return dtor == free_huge_page;
702} 698}
703EXPORT_SYMBOL_GPL(PageHuge); 699EXPORT_SYMBOL_GPL(PageHuge);
704 700
@@ -708,16 +704,11 @@ EXPORT_SYMBOL_GPL(PageHuge);
708 */ 704 */
709int PageHeadHuge(struct page *page_head) 705int PageHeadHuge(struct page *page_head)
710{ 706{
711 compound_page_dtor *dtor;
712
713 if (!PageHead(page_head)) 707 if (!PageHead(page_head))
714 return 0; 708 return 0;
715 709
716 dtor = get_compound_page_dtor(page_head); 710 return get_compound_page_dtor(page_head) == free_huge_page;
717
718 return dtor == free_huge_page;
719} 711}
720EXPORT_SYMBOL_GPL(PageHeadHuge);
721 712
722pgoff_t __basepage_index(struct page *page) 713pgoff_t __basepage_index(struct page *page)
723{ 714{
@@ -1280,9 +1271,9 @@ int __weak alloc_bootmem_huge_page(struct hstate *h)
1280 for_each_node_mask_to_alloc(h, nr_nodes, node, &node_states[N_MEMORY]) { 1271 for_each_node_mask_to_alloc(h, nr_nodes, node, &node_states[N_MEMORY]) {
1281 void *addr; 1272 void *addr;
1282 1273
1283 addr = __alloc_bootmem_node_nopanic(NODE_DATA(node), 1274 addr = memblock_virt_alloc_try_nid_nopanic(
1284 huge_page_size(h), huge_page_size(h), 0); 1275 huge_page_size(h), huge_page_size(h),
1285 1276 0, BOOTMEM_ALLOC_ACCESSIBLE, node);
1286 if (addr) { 1277 if (addr) {
1287 /* 1278 /*
1288 * Use the beginning of the huge page to store the 1279 * Use the beginning of the huge page to store the
@@ -1322,8 +1313,8 @@ static void __init gather_bootmem_prealloc(void)
1322 1313
1323#ifdef CONFIG_HIGHMEM 1314#ifdef CONFIG_HIGHMEM
1324 page = pfn_to_page(m->phys >> PAGE_SHIFT); 1315 page = pfn_to_page(m->phys >> PAGE_SHIFT);
1325 free_bootmem_late((unsigned long)m, 1316 memblock_free_late(__pa(m),
1326 sizeof(struct huge_bootmem_page)); 1317 sizeof(struct huge_bootmem_page));
1327#else 1318#else
1328 page = virt_to_page(m); 1319 page = virt_to_page(m);
1329#endif 1320#endif
@@ -2355,17 +2346,27 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
2355 int cow; 2346 int cow;
2356 struct hstate *h = hstate_vma(vma); 2347 struct hstate *h = hstate_vma(vma);
2357 unsigned long sz = huge_page_size(h); 2348 unsigned long sz = huge_page_size(h);
2349 unsigned long mmun_start; /* For mmu_notifiers */
2350 unsigned long mmun_end; /* For mmu_notifiers */
2351 int ret = 0;
2358 2352
2359 cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; 2353 cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
2360 2354
2355 mmun_start = vma->vm_start;
2356 mmun_end = vma->vm_end;
2357 if (cow)
2358 mmu_notifier_invalidate_range_start(src, mmun_start, mmun_end);
2359
2361 for (addr = vma->vm_start; addr < vma->vm_end; addr += sz) { 2360 for (addr = vma->vm_start; addr < vma->vm_end; addr += sz) {
2362 spinlock_t *src_ptl, *dst_ptl; 2361 spinlock_t *src_ptl, *dst_ptl;
2363 src_pte = huge_pte_offset(src, addr); 2362 src_pte = huge_pte_offset(src, addr);
2364 if (!src_pte) 2363 if (!src_pte)
2365 continue; 2364 continue;
2366 dst_pte = huge_pte_alloc(dst, addr, sz); 2365 dst_pte = huge_pte_alloc(dst, addr, sz);
2367 if (!dst_pte) 2366 if (!dst_pte) {
2368 goto nomem; 2367 ret = -ENOMEM;
2368 break;
2369 }
2369 2370
2370 /* If the pagetables are shared don't copy or take references */ 2371 /* If the pagetables are shared don't copy or take references */
2371 if (dst_pte == src_pte) 2372 if (dst_pte == src_pte)
@@ -2386,10 +2387,11 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
2386 spin_unlock(src_ptl); 2387 spin_unlock(src_ptl);
2387 spin_unlock(dst_ptl); 2388 spin_unlock(dst_ptl);
2388 } 2389 }
2389 return 0;
2390 2390
2391nomem: 2391 if (cow)
2392 return -ENOMEM; 2392 mmu_notifier_invalidate_range_end(src, mmun_start, mmun_end);
2393
2394 return ret;
2393} 2395}
2394 2396
2395static int is_hugetlb_entry_migration(pte_t pte) 2397static int is_hugetlb_entry_migration(pte_t pte)
@@ -3079,7 +3081,7 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
3079same_page: 3081same_page:
3080 if (pages) { 3082 if (pages) {
3081 pages[i] = mem_map_offset(page, pfn_offset); 3083 pages[i] = mem_map_offset(page, pfn_offset);
3082 get_page(pages[i]); 3084 get_page_foll(pages[i]);
3083 } 3085 }
3084 3086
3085 if (vmas) 3087 if (vmas)
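
One notable change above is in copy_hugetlb_page_range(): the copy loop is now bracketed by mmu_notifier_invalidate_range_start()/end(), so the -ENOMEM case breaks out of the loop and falls through to the common exit instead of returning early via a nomem label. A small sketch of that single-exit restructuring, with invented range_begin()/range_end()/alloc_slot() helpers standing in for the real calls:

/* Sketch of the single-exit error path; range_begin()/range_end() and
 * alloc_slot() are invented helpers, not kernel APIs. */
#include <errno.h>
#include <stdio.h>

static void range_begin(void) { puts("begin"); }   /* paired notifier start */
static void range_end(void)   { puts("end"); }     /* paired notifier end */

static int alloc_slot(int i)
{
    return (i == 3) ? -ENOMEM : 0;       /* simulate an allocation failure */
}

static int copy_range(int nslots)
{
    int ret = 0;
    int i;

    range_begin();                       /* must always be balanced by range_end() */
    for (i = 0; i < nslots; i++) {
        ret = alloc_slot(i);
        if (ret)
            break;                       /* no early return: fall through to end() */
    }
    range_end();

    return ret;
}

int main(void)
{
    printf("copy_range() = %d\n", copy_range(8));
    return 0;
}
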
diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c
index 4c84678371eb..95487c71cad5 100644
--- a/mm/hwpoison-inject.c
+++ b/mm/hwpoison-inject.c
@@ -55,7 +55,7 @@ static int hwpoison_inject(void *data, u64 val)
55 return 0; 55 return 0;
56 56
57inject: 57inject:
58 printk(KERN_INFO "Injecting memory failure at pfn %lx\n", pfn); 58 pr_info("Injecting memory failure at pfn %#lx\n", pfn);
59 return memory_failure(pfn, 18, MF_COUNT_INCREASED); 59 return memory_failure(pfn, 18, MF_COUNT_INCREASED);
60} 60}
61 61
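
The hwpoison change is purely cosmetic: pr_info() replaces printk(KERN_INFO ...) and "%#lx" prints the pfn with a leading 0x. A tiny standalone check of the alternate-form flag (plain C printf, same format semantics):

/* "%#lx" uses the alternate form, prefixing a non-zero value with "0x". */
#include <stdio.h>

int main(void)
{
    unsigned long pfn = 0x12345;

    printf("plain:     %lx\n", pfn);     /* prints 12345 */
    printf("alt form:  %#lx\n", pfn);    /* prints 0x12345 */
    return 0;
}
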
diff --git a/mm/internal.h b/mm/internal.h
index 684f7aa9692a..a346ba120e42 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -47,11 +47,9 @@ static inline void __get_page_tail_foll(struct page *page,
47 * page_cache_get_speculative()) on tail pages. 47 * page_cache_get_speculative()) on tail pages.
48 */ 48 */
49 VM_BUG_ON(atomic_read(&page->first_page->_count) <= 0); 49 VM_BUG_ON(atomic_read(&page->first_page->_count) <= 0);
50 VM_BUG_ON(atomic_read(&page->_count) != 0);
51 VM_BUG_ON(page_mapcount(page) < 0);
52 if (get_page_head) 50 if (get_page_head)
53 atomic_inc(&page->first_page->_count); 51 atomic_inc(&page->first_page->_count);
54 atomic_inc(&page->_mapcount); 52 get_huge_page_tail(page);
55} 53}
56 54
57/* 55/*
diff --git a/mm/ksm.c b/mm/ksm.c
index 175fff79dc95..3df141e5f3e0 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -1891,21 +1891,24 @@ struct page *ksm_might_need_to_copy(struct page *page,
1891 return new_page; 1891 return new_page;
1892} 1892}
1893 1893
1894int page_referenced_ksm(struct page *page, struct mem_cgroup *memcg, 1894int rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc)
1895 unsigned long *vm_flags)
1896{ 1895{
1897 struct stable_node *stable_node; 1896 struct stable_node *stable_node;
1898 struct rmap_item *rmap_item; 1897 struct rmap_item *rmap_item;
1899 unsigned int mapcount = page_mapcount(page); 1898 int ret = SWAP_AGAIN;
1900 int referenced = 0;
1901 int search_new_forks = 0; 1899 int search_new_forks = 0;
1902 1900
1903 VM_BUG_ON(!PageKsm(page)); 1901 VM_BUG_ON(!PageKsm(page));
1902
1903 /*
1904 * Rely on the page lock to protect against concurrent modifications
1905 * to that page's node of the stable tree.
1906 */
1904 VM_BUG_ON(!PageLocked(page)); 1907 VM_BUG_ON(!PageLocked(page));
1905 1908
1906 stable_node = page_stable_node(page); 1909 stable_node = page_stable_node(page);
1907 if (!stable_node) 1910 if (!stable_node)
1908 return 0; 1911 return ret;
1909again: 1912again:
1910 hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) { 1913 hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) {
1911 struct anon_vma *anon_vma = rmap_item->anon_vma; 1914 struct anon_vma *anon_vma = rmap_item->anon_vma;
@@ -1928,113 +1931,16 @@ again:
1928 if ((rmap_item->mm == vma->vm_mm) == search_new_forks) 1931 if ((rmap_item->mm == vma->vm_mm) == search_new_forks)
1929 continue; 1932 continue;
1930 1933
1931 if (memcg && !mm_match_cgroup(vma->vm_mm, memcg)) 1934 if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg))
1932 continue;
1933
1934 referenced += page_referenced_one(page, vma,
1935 rmap_item->address, &mapcount, vm_flags);
1936 if (!search_new_forks || !mapcount)
1937 break;
1938 }
1939 anon_vma_unlock_read(anon_vma);
1940 if (!mapcount)
1941 goto out;
1942 }
1943 if (!search_new_forks++)
1944 goto again;
1945out:
1946 return referenced;
1947}
1948
1949int try_to_unmap_ksm(struct page *page, enum ttu_flags flags)
1950{
1951 struct stable_node *stable_node;
1952 struct rmap_item *rmap_item;
1953 int ret = SWAP_AGAIN;
1954 int search_new_forks = 0;
1955
1956 VM_BUG_ON(!PageKsm(page));
1957 VM_BUG_ON(!PageLocked(page));
1958
1959 stable_node = page_stable_node(page);
1960 if (!stable_node)
1961 return SWAP_FAIL;
1962again:
1963 hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) {
1964 struct anon_vma *anon_vma = rmap_item->anon_vma;
1965 struct anon_vma_chain *vmac;
1966 struct vm_area_struct *vma;
1967
1968 anon_vma_lock_read(anon_vma);
1969 anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root,
1970 0, ULONG_MAX) {
1971 vma = vmac->vma;
1972 if (rmap_item->address < vma->vm_start ||
1973 rmap_item->address >= vma->vm_end)
1974 continue;
1975 /*
1976 * Initially we examine only the vma which covers this
1977 * rmap_item; but later, if there is still work to do,
1978 * we examine covering vmas in other mms: in case they
1979 * were forked from the original since ksmd passed.
1980 */
1981 if ((rmap_item->mm == vma->vm_mm) == search_new_forks)
1982 continue; 1935 continue;
1983 1936
1984 ret = try_to_unmap_one(page, vma, 1937 ret = rwc->rmap_one(page, vma,
1985 rmap_item->address, flags); 1938 rmap_item->address, rwc->arg);
1986 if (ret != SWAP_AGAIN || !page_mapped(page)) { 1939 if (ret != SWAP_AGAIN) {
1987 anon_vma_unlock_read(anon_vma); 1940 anon_vma_unlock_read(anon_vma);
1988 goto out; 1941 goto out;
1989 } 1942 }
1990 } 1943 if (rwc->done && rwc->done(page)) {
1991 anon_vma_unlock_read(anon_vma);
1992 }
1993 if (!search_new_forks++)
1994 goto again;
1995out:
1996 return ret;
1997}
1998
1999#ifdef CONFIG_MIGRATION
2000int rmap_walk_ksm(struct page *page, int (*rmap_one)(struct page *,
2001 struct vm_area_struct *, unsigned long, void *), void *arg)
2002{
2003 struct stable_node *stable_node;
2004 struct rmap_item *rmap_item;
2005 int ret = SWAP_AGAIN;
2006 int search_new_forks = 0;
2007
2008 VM_BUG_ON(!PageKsm(page));
2009 VM_BUG_ON(!PageLocked(page));
2010
2011 stable_node = page_stable_node(page);
2012 if (!stable_node)
2013 return ret;
2014again:
2015 hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) {
2016 struct anon_vma *anon_vma = rmap_item->anon_vma;
2017 struct anon_vma_chain *vmac;
2018 struct vm_area_struct *vma;
2019
2020 anon_vma_lock_read(anon_vma);
2021 anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root,
2022 0, ULONG_MAX) {
2023 vma = vmac->vma;
2024 if (rmap_item->address < vma->vm_start ||
2025 rmap_item->address >= vma->vm_end)
2026 continue;
2027 /*
2028 * Initially we examine only the vma which covers this
2029 * rmap_item; but later, if there is still work to do,
2030 * we examine covering vmas in other mms: in case they
2031 * were forked from the original since ksmd passed.
2032 */
2033 if ((rmap_item->mm == vma->vm_mm) == search_new_forks)
2034 continue;
2035
2036 ret = rmap_one(page, vma, rmap_item->address, arg);
2037 if (ret != SWAP_AGAIN) {
2038 anon_vma_unlock_read(anon_vma); 1944 anon_vma_unlock_read(anon_vma);
2039 goto out; 1945 goto out;
2040 } 1946 }
@@ -2047,6 +1953,7 @@ out:
2047 return ret; 1953 return ret;
2048} 1954}
2049 1955
1956#ifdef CONFIG_MIGRATION
2050void ksm_migrate_page(struct page *newpage, struct page *oldpage) 1957void ksm_migrate_page(struct page *newpage, struct page *oldpage)
2051{ 1958{
2052 struct stable_node *stable_node; 1959 struct stable_node *stable_node;
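
The ksm.c hunk folds page_referenced_ksm() and try_to_unmap_ksm() into a single rmap_walk_ksm() driven by an rmap_walk_control of callbacks (rmap_one, done, invalid_vma, arg). A simplified sketch of that callback-driven walk, using stand-in types rather than the kernel's rmap structures:

/* Callback-driven walk with stand-in types; not the kernel's rmap code. */
#include <stdbool.h>
#include <stdio.h>

#define WALK_AGAIN 0
#define WALK_STOP  1

struct walk_control {
    int  (*visit_one)(int item, void *arg);  /* per-mapping callback */
    bool (*done)(void *arg);                 /* optional early-exit test */
    void *arg;
};

static int walk(const int *items, int n, struct walk_control *wc)
{
    int ret = WALK_AGAIN;
    int i;

    for (i = 0; i < n; i++) {
        ret = wc->visit_one(items[i], wc->arg);
        if (ret != WALK_AGAIN)
            break;                           /* callback asked to stop */
        if (wc->done && wc->done(wc->arg))
            break;                           /* client has seen enough */
    }
    return ret;
}

/* One client: count visits and stop once three have been seen. */
static int count_ref(int item, void *arg)
{
    (void)item;
    ++*(int *)arg;
    return WALK_AGAIN;
}

static bool enough_refs(void *arg)
{
    return *(int *)arg >= 3;
}

int main(void)
{
    int items[] = { 10, 11, 12, 13, 14 };
    int refs = 0;
    struct walk_control wc = { count_ref, enough_refs, &refs };

    walk(items, 5, &wc);
    printf("visited %d mappings\n", refs);
    return 0;
}
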
diff --git a/mm/memblock.c b/mm/memblock.c
index 53e477bb5558..1c2ef2c7edab 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -21,6 +21,9 @@
21#include <linux/memblock.h> 21#include <linux/memblock.h>
22 22
23#include <asm-generic/sections.h> 23#include <asm-generic/sections.h>
24#include <linux/io.h>
25
26#include "internal.h"
24 27
25static struct memblock_region memblock_memory_init_regions[INIT_MEMBLOCK_REGIONS] __initdata_memblock; 28static struct memblock_region memblock_memory_init_regions[INIT_MEMBLOCK_REGIONS] __initdata_memblock;
26static struct memblock_region memblock_reserved_init_regions[INIT_MEMBLOCK_REGIONS] __initdata_memblock; 29static struct memblock_region memblock_reserved_init_regions[INIT_MEMBLOCK_REGIONS] __initdata_memblock;
@@ -39,6 +42,9 @@ struct memblock memblock __initdata_memblock = {
39}; 42};
40 43
41int memblock_debug __initdata_memblock; 44int memblock_debug __initdata_memblock;
45#ifdef CONFIG_MOVABLE_NODE
46bool movable_node_enabled __initdata_memblock = false;
47#endif
42static int memblock_can_resize __initdata_memblock; 48static int memblock_can_resize __initdata_memblock;
43static int memblock_memory_in_slab __initdata_memblock = 0; 49static int memblock_memory_in_slab __initdata_memblock = 0;
44static int memblock_reserved_in_slab __initdata_memblock = 0; 50static int memblock_reserved_in_slab __initdata_memblock = 0;
@@ -91,7 +97,7 @@ static long __init_memblock memblock_overlaps_region(struct memblock_type *type,
91 * @end: end of candidate range, can be %MEMBLOCK_ALLOC_{ANYWHERE|ACCESSIBLE} 97 * @end: end of candidate range, can be %MEMBLOCK_ALLOC_{ANYWHERE|ACCESSIBLE}
92 * @size: size of free area to find 98 * @size: size of free area to find
93 * @align: alignment of free area to find 99 * @align: alignment of free area to find
94 * @nid: nid of the free area to find, %MAX_NUMNODES for any node 100 * @nid: nid of the free area to find, %NUMA_NO_NODE for any node
95 * 101 *
96 * Utility called from memblock_find_in_range_node(), find free area bottom-up. 102 * Utility called from memblock_find_in_range_node(), find free area bottom-up.
97 * 103 *
@@ -123,7 +129,7 @@ __memblock_find_range_bottom_up(phys_addr_t start, phys_addr_t end,
123 * @end: end of candidate range, can be %MEMBLOCK_ALLOC_{ANYWHERE|ACCESSIBLE} 129 * @end: end of candidate range, can be %MEMBLOCK_ALLOC_{ANYWHERE|ACCESSIBLE}
124 * @size: size of free area to find 130 * @size: size of free area to find
125 * @align: alignment of free area to find 131 * @align: alignment of free area to find
126 * @nid: nid of the free area to find, %MAX_NUMNODES for any node 132 * @nid: nid of the free area to find, %NUMA_NO_NODE for any node
127 * 133 *
128 * Utility called from memblock_find_in_range_node(), find free area top-down. 134 * Utility called from memblock_find_in_range_node(), find free area top-down.
129 * 135 *
@@ -154,11 +160,11 @@ __memblock_find_range_top_down(phys_addr_t start, phys_addr_t end,
154 160
155/** 161/**
156 * memblock_find_in_range_node - find free area in given range and node 162 * memblock_find_in_range_node - find free area in given range and node
157 * @start: start of candidate range
158 * @end: end of candidate range, can be %MEMBLOCK_ALLOC_{ANYWHERE|ACCESSIBLE}
159 * @size: size of free area to find 163 * @size: size of free area to find
160 * @align: alignment of free area to find 164 * @align: alignment of free area to find
161 * @nid: nid of the free area to find, %MAX_NUMNODES for any node 165 * @start: start of candidate range
166 * @end: end of candidate range, can be %MEMBLOCK_ALLOC_{ANYWHERE|ACCESSIBLE}
167 * @nid: nid of the free area to find, %NUMA_NO_NODE for any node
162 * 168 *
163 * Find @size free area aligned to @align in the specified range and node. 169 * Find @size free area aligned to @align in the specified range and node.
164 * 170 *
@@ -173,9 +179,9 @@ __memblock_find_range_top_down(phys_addr_t start, phys_addr_t end,
173 * RETURNS: 179 * RETURNS:
174 * Found address on success, 0 on failure. 180 * Found address on success, 0 on failure.
175 */ 181 */
176phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t start, 182phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t size,
177 phys_addr_t end, phys_addr_t size, 183 phys_addr_t align, phys_addr_t start,
178 phys_addr_t align, int nid) 184 phys_addr_t end, int nid)
179{ 185{
180 int ret; 186 int ret;
181 phys_addr_t kernel_end; 187 phys_addr_t kernel_end;
@@ -238,8 +244,8 @@ phys_addr_t __init_memblock memblock_find_in_range(phys_addr_t start,
238 phys_addr_t end, phys_addr_t size, 244 phys_addr_t end, phys_addr_t size,
239 phys_addr_t align) 245 phys_addr_t align)
240{ 246{
241 return memblock_find_in_range_node(start, end, size, align, 247 return memblock_find_in_range_node(size, align, start, end,
242 MAX_NUMNODES); 248 NUMA_NO_NODE);
243} 249}
244 250
245static void __init_memblock memblock_remove_region(struct memblock_type *type, unsigned long r) 251static void __init_memblock memblock_remove_region(struct memblock_type *type, unsigned long r)
@@ -255,6 +261,7 @@ static void __init_memblock memblock_remove_region(struct memblock_type *type, u
255 type->cnt = 1; 261 type->cnt = 1;
256 type->regions[0].base = 0; 262 type->regions[0].base = 0;
257 type->regions[0].size = 0; 263 type->regions[0].size = 0;
264 type->regions[0].flags = 0;
258 memblock_set_region_node(&type->regions[0], MAX_NUMNODES); 265 memblock_set_region_node(&type->regions[0], MAX_NUMNODES);
259 } 266 }
260} 267}
@@ -265,6 +272,19 @@ phys_addr_t __init_memblock get_allocated_memblock_reserved_regions_info(
265 if (memblock.reserved.regions == memblock_reserved_init_regions) 272 if (memblock.reserved.regions == memblock_reserved_init_regions)
266 return 0; 273 return 0;
267 274
275 /*
276 * Don't allow nobootmem allocator to free reserved memory regions
277 * array if
278 * - CONFIG_DEBUG_FS is enabled;
279 * - CONFIG_ARCH_DISCARD_MEMBLOCK is not enabled;
280 * - reserved memory regions array have been resized during boot.
281 * Otherwise debug_fs entry "sys/kernel/debug/memblock/reserved"
282 * will show garbage instead of state of memory reservations.
283 */
284 if (IS_ENABLED(CONFIG_DEBUG_FS) &&
285 !IS_ENABLED(CONFIG_ARCH_DISCARD_MEMBLOCK))
286 return 0;
287
268 *addr = __pa(memblock.reserved.regions); 288 *addr = __pa(memblock.reserved.regions);
269 289
270 return PAGE_ALIGN(sizeof(struct memblock_region) * 290 return PAGE_ALIGN(sizeof(struct memblock_region) *
@@ -405,7 +425,8 @@ static void __init_memblock memblock_merge_regions(struct memblock_type *type)
405 425
406 if (this->base + this->size != next->base || 426 if (this->base + this->size != next->base ||
407 memblock_get_region_node(this) != 427 memblock_get_region_node(this) !=
408 memblock_get_region_node(next)) { 428 memblock_get_region_node(next) ||
429 this->flags != next->flags) {
409 BUG_ON(this->base + this->size > next->base); 430 BUG_ON(this->base + this->size > next->base);
410 i++; 431 i++;
411 continue; 432 continue;
@@ -425,13 +446,15 @@ static void __init_memblock memblock_merge_regions(struct memblock_type *type)
425 * @base: base address of the new region 446 * @base: base address of the new region
426 * @size: size of the new region 447 * @size: size of the new region
427 * @nid: node id of the new region 448 * @nid: node id of the new region
449 * @flags: flags of the new region
428 * 450 *
429 * Insert new memblock region [@base,@base+@size) into @type at @idx. 451 * Insert new memblock region [@base,@base+@size) into @type at @idx.
430 * @type must already have extra room to accomodate the new region. 452 * @type must already have extra room to accomodate the new region.
431 */ 453 */
432static void __init_memblock memblock_insert_region(struct memblock_type *type, 454static void __init_memblock memblock_insert_region(struct memblock_type *type,
433 int idx, phys_addr_t base, 455 int idx, phys_addr_t base,
434 phys_addr_t size, int nid) 456 phys_addr_t size,
457 int nid, unsigned long flags)
435{ 458{
436 struct memblock_region *rgn = &type->regions[idx]; 459 struct memblock_region *rgn = &type->regions[idx];
437 460
@@ -439,6 +462,7 @@ static void __init_memblock memblock_insert_region(struct memblock_type *type,
439 memmove(rgn + 1, rgn, (type->cnt - idx) * sizeof(*rgn)); 462 memmove(rgn + 1, rgn, (type->cnt - idx) * sizeof(*rgn));
440 rgn->base = base; 463 rgn->base = base;
441 rgn->size = size; 464 rgn->size = size;
465 rgn->flags = flags;
442 memblock_set_region_node(rgn, nid); 466 memblock_set_region_node(rgn, nid);
443 type->cnt++; 467 type->cnt++;
444 type->total_size += size; 468 type->total_size += size;
@@ -450,6 +474,7 @@ static void __init_memblock memblock_insert_region(struct memblock_type *type,
450 * @base: base address of the new region 474 * @base: base address of the new region
451 * @size: size of the new region 475 * @size: size of the new region
452 * @nid: nid of the new region 476 * @nid: nid of the new region
477 * @flags: flags of the new region
453 * 478 *
454 * Add new memblock region [@base,@base+@size) into @type. The new region 479 * Add new memblock region [@base,@base+@size) into @type. The new region
455 * is allowed to overlap with existing ones - overlaps don't affect already 480 * is allowed to overlap with existing ones - overlaps don't affect already
@@ -460,7 +485,8 @@ static void __init_memblock memblock_insert_region(struct memblock_type *type,
460 * 0 on success, -errno on failure. 485 * 0 on success, -errno on failure.
461 */ 486 */
462static int __init_memblock memblock_add_region(struct memblock_type *type, 487static int __init_memblock memblock_add_region(struct memblock_type *type,
463 phys_addr_t base, phys_addr_t size, int nid) 488 phys_addr_t base, phys_addr_t size,
489 int nid, unsigned long flags)
464{ 490{
465 bool insert = false; 491 bool insert = false;
466 phys_addr_t obase = base; 492 phys_addr_t obase = base;
@@ -475,6 +501,7 @@ static int __init_memblock memblock_add_region(struct memblock_type *type,
475 WARN_ON(type->cnt != 1 || type->total_size); 501 WARN_ON(type->cnt != 1 || type->total_size);
476 type->regions[0].base = base; 502 type->regions[0].base = base;
477 type->regions[0].size = size; 503 type->regions[0].size = size;
504 type->regions[0].flags = flags;
478 memblock_set_region_node(&type->regions[0], nid); 505 memblock_set_region_node(&type->regions[0], nid);
479 type->total_size = size; 506 type->total_size = size;
480 return 0; 507 return 0;
@@ -505,7 +532,8 @@ repeat:
505 nr_new++; 532 nr_new++;
506 if (insert) 533 if (insert)
507 memblock_insert_region(type, i++, base, 534 memblock_insert_region(type, i++, base,
508 rbase - base, nid); 535 rbase - base, nid,
536 flags);
509 } 537 }
510 /* area below @rend is dealt with, forget about it */ 538 /* area below @rend is dealt with, forget about it */
511 base = min(rend, end); 539 base = min(rend, end);
@@ -515,7 +543,8 @@ repeat:
515 if (base < end) { 543 if (base < end) {
516 nr_new++; 544 nr_new++;
517 if (insert) 545 if (insert)
518 memblock_insert_region(type, i, base, end - base, nid); 546 memblock_insert_region(type, i, base, end - base,
547 nid, flags);
519 } 548 }
520 549
521 /* 550 /*
@@ -537,12 +566,13 @@ repeat:
537int __init_memblock memblock_add_node(phys_addr_t base, phys_addr_t size, 566int __init_memblock memblock_add_node(phys_addr_t base, phys_addr_t size,
538 int nid) 567 int nid)
539{ 568{
540 return memblock_add_region(&memblock.memory, base, size, nid); 569 return memblock_add_region(&memblock.memory, base, size, nid, 0);
541} 570}
542 571
543int __init_memblock memblock_add(phys_addr_t base, phys_addr_t size) 572int __init_memblock memblock_add(phys_addr_t base, phys_addr_t size)
544{ 573{
545 return memblock_add_region(&memblock.memory, base, size, MAX_NUMNODES); 574 return memblock_add_region(&memblock.memory, base, size,
575 MAX_NUMNODES, 0);
546} 576}
547 577
548/** 578/**
@@ -597,7 +627,8 @@ static int __init_memblock memblock_isolate_range(struct memblock_type *type,
597 rgn->size -= base - rbase; 627 rgn->size -= base - rbase;
598 type->total_size -= base - rbase; 628 type->total_size -= base - rbase;
599 memblock_insert_region(type, i, rbase, base - rbase, 629 memblock_insert_region(type, i, rbase, base - rbase,
600 memblock_get_region_node(rgn)); 630 memblock_get_region_node(rgn),
631 rgn->flags);
601 } else if (rend > end) { 632 } else if (rend > end) {
602 /* 633 /*
603 * @rgn intersects from above. Split and redo the 634 * @rgn intersects from above. Split and redo the
@@ -607,7 +638,8 @@ static int __init_memblock memblock_isolate_range(struct memblock_type *type,
607 rgn->size -= end - rbase; 638 rgn->size -= end - rbase;
608 type->total_size -= end - rbase; 639 type->total_size -= end - rbase;
609 memblock_insert_region(type, i--, rbase, end - rbase, 640 memblock_insert_region(type, i--, rbase, end - rbase,
610 memblock_get_region_node(rgn)); 641 memblock_get_region_node(rgn),
642 rgn->flags);
611 } else { 643 } else {
612 /* @rgn is fully contained, record it */ 644 /* @rgn is fully contained, record it */
613 if (!*end_rgn) 645 if (!*end_rgn)
@@ -643,28 +675,89 @@ int __init_memblock memblock_free(phys_addr_t base, phys_addr_t size)
643{ 675{
644 memblock_dbg(" memblock_free: [%#016llx-%#016llx] %pF\n", 676 memblock_dbg(" memblock_free: [%#016llx-%#016llx] %pF\n",
645 (unsigned long long)base, 677 (unsigned long long)base,
646 (unsigned long long)base + size, 678 (unsigned long long)base + size - 1,
647 (void *)_RET_IP_); 679 (void *)_RET_IP_);
648 680
649 return __memblock_remove(&memblock.reserved, base, size); 681 return __memblock_remove(&memblock.reserved, base, size);
650} 682}
651 683
652int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size) 684static int __init_memblock memblock_reserve_region(phys_addr_t base,
685 phys_addr_t size,
686 int nid,
687 unsigned long flags)
653{ 688{
654 struct memblock_type *_rgn = &memblock.reserved; 689 struct memblock_type *_rgn = &memblock.reserved;
655 690
656 memblock_dbg("memblock_reserve: [%#016llx-%#016llx] %pF\n", 691 memblock_dbg("memblock_reserve: [%#016llx-%#016llx] flags %#02lx %pF\n",
657 (unsigned long long)base, 692 (unsigned long long)base,
658 (unsigned long long)base + size, 693 (unsigned long long)base + size - 1,
659 (void *)_RET_IP_); 694 flags, (void *)_RET_IP_);
660 695
661 return memblock_add_region(_rgn, base, size, MAX_NUMNODES); 696 return memblock_add_region(_rgn, base, size, nid, flags);
697}
698
699int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size)
700{
701 return memblock_reserve_region(base, size, MAX_NUMNODES, 0);
702}
703
704/**
705 * memblock_mark_hotplug - Mark hotpluggable memory with flag MEMBLOCK_HOTPLUG.
706 * @base: the base phys addr of the region
707 * @size: the size of the region
708 *
709 * This function isolates region [@base, @base + @size), and mark it with flag
710 * MEMBLOCK_HOTPLUG.
711 *
712 * Return 0 on succees, -errno on failure.
713 */
714int __init_memblock memblock_mark_hotplug(phys_addr_t base, phys_addr_t size)
715{
716 struct memblock_type *type = &memblock.memory;
717 int i, ret, start_rgn, end_rgn;
718
719 ret = memblock_isolate_range(type, base, size, &start_rgn, &end_rgn);
720 if (ret)
721 return ret;
722
723 for (i = start_rgn; i < end_rgn; i++)
724 memblock_set_region_flags(&type->regions[i], MEMBLOCK_HOTPLUG);
725
726 memblock_merge_regions(type);
727 return 0;
728}
729
730/**
731 * memblock_clear_hotplug - Clear flag MEMBLOCK_HOTPLUG for a specified region.
732 * @base: the base phys addr of the region
733 * @size: the size of the region
734 *
735 * This function isolates region [@base, @base + @size), and clear flag
736 * MEMBLOCK_HOTPLUG for the isolated regions.
737 *
738 * Return 0 on succees, -errno on failure.
739 */
740int __init_memblock memblock_clear_hotplug(phys_addr_t base, phys_addr_t size)
741{
742 struct memblock_type *type = &memblock.memory;
743 int i, ret, start_rgn, end_rgn;
744
745 ret = memblock_isolate_range(type, base, size, &start_rgn, &end_rgn);
746 if (ret)
747 return ret;
748
749 for (i = start_rgn; i < end_rgn; i++)
750 memblock_clear_region_flags(&type->regions[i],
751 MEMBLOCK_HOTPLUG);
752
753 memblock_merge_regions(type);
754 return 0;
662} 755}
663 756
664/** 757/**
665 * __next_free_mem_range - next function for for_each_free_mem_range() 758 * __next_free_mem_range - next function for for_each_free_mem_range()
666 * @idx: pointer to u64 loop variable 759 * @idx: pointer to u64 loop variable
667 * @nid: node selector, %MAX_NUMNODES for all nodes 760 * @nid: node selector, %NUMA_NO_NODE for all nodes
668 * @out_start: ptr to phys_addr_t for start address of the range, can be %NULL 761 * @out_start: ptr to phys_addr_t for start address of the range, can be %NULL
669 * @out_end: ptr to phys_addr_t for end address of the range, can be %NULL 762 * @out_end: ptr to phys_addr_t for end address of the range, can be %NULL
670 * @out_nid: ptr to int for nid of the range, can be %NULL 763 * @out_nid: ptr to int for nid of the range, can be %NULL
@@ -693,13 +786,16 @@ void __init_memblock __next_free_mem_range(u64 *idx, int nid,
693 int mi = *idx & 0xffffffff; 786 int mi = *idx & 0xffffffff;
694 int ri = *idx >> 32; 787 int ri = *idx >> 32;
695 788
789 if (WARN_ONCE(nid == MAX_NUMNODES, "Usage of MAX_NUMNODES is deprecated. Use NUMA_NO_NODE instead\n"))
790 nid = NUMA_NO_NODE;
791
696 for ( ; mi < mem->cnt; mi++) { 792 for ( ; mi < mem->cnt; mi++) {
697 struct memblock_region *m = &mem->regions[mi]; 793 struct memblock_region *m = &mem->regions[mi];
698 phys_addr_t m_start = m->base; 794 phys_addr_t m_start = m->base;
699 phys_addr_t m_end = m->base + m->size; 795 phys_addr_t m_end = m->base + m->size;
700 796
701 /* only memory regions are associated with nodes, check it */ 797 /* only memory regions are associated with nodes, check it */
702 if (nid != MAX_NUMNODES && nid != memblock_get_region_node(m)) 798 if (nid != NUMA_NO_NODE && nid != memblock_get_region_node(m))
703 continue; 799 continue;
704 800
705 /* scan areas before each reservation for intersection */ 801 /* scan areas before each reservation for intersection */
@@ -740,12 +836,17 @@ void __init_memblock __next_free_mem_range(u64 *idx, int nid,
740/** 836/**
741 * __next_free_mem_range_rev - next function for for_each_free_mem_range_reverse() 837 * __next_free_mem_range_rev - next function for for_each_free_mem_range_reverse()
742 * @idx: pointer to u64 loop variable 838 * @idx: pointer to u64 loop variable
743 * @nid: nid: node selector, %MAX_NUMNODES for all nodes 839 * @nid: nid: node selector, %NUMA_NO_NODE for all nodes
744 * @out_start: ptr to phys_addr_t for start address of the range, can be %NULL 840 * @out_start: ptr to phys_addr_t for start address of the range, can be %NULL
745 * @out_end: ptr to phys_addr_t for end address of the range, can be %NULL 841 * @out_end: ptr to phys_addr_t for end address of the range, can be %NULL
746 * @out_nid: ptr to int for nid of the range, can be %NULL 842 * @out_nid: ptr to int for nid of the range, can be %NULL
747 * 843 *
748 * Reverse of __next_free_mem_range(). 844 * Reverse of __next_free_mem_range().
845 *
846 * Linux kernel cannot migrate pages used by itself. Memory hotplug users won't
847 * be able to hot-remove hotpluggable memory used by the kernel. So this
848 * function skip hotpluggable regions if needed when allocating memory for the
849 * kernel.
749 */ 850 */
750void __init_memblock __next_free_mem_range_rev(u64 *idx, int nid, 851void __init_memblock __next_free_mem_range_rev(u64 *idx, int nid,
751 phys_addr_t *out_start, 852 phys_addr_t *out_start,
@@ -756,6 +857,9 @@ void __init_memblock __next_free_mem_range_rev(u64 *idx, int nid,
756 int mi = *idx & 0xffffffff; 857 int mi = *idx & 0xffffffff;
757 int ri = *idx >> 32; 858 int ri = *idx >> 32;
758 859
860 if (WARN_ONCE(nid == MAX_NUMNODES, "Usage of MAX_NUMNODES is deprecated. Use NUMA_NO_NODE instead\n"))
861 nid = NUMA_NO_NODE;
862
759 if (*idx == (u64)ULLONG_MAX) { 863 if (*idx == (u64)ULLONG_MAX) {
760 mi = mem->cnt - 1; 864 mi = mem->cnt - 1;
761 ri = rsv->cnt; 865 ri = rsv->cnt;
@@ -767,7 +871,11 @@ void __init_memblock __next_free_mem_range_rev(u64 *idx, int nid,
767 phys_addr_t m_end = m->base + m->size; 871 phys_addr_t m_end = m->base + m->size;
768 872
769 /* only memory regions are associated with nodes, check it */ 873 /* only memory regions are associated with nodes, check it */
770 if (nid != MAX_NUMNODES && nid != memblock_get_region_node(m)) 874 if (nid != NUMA_NO_NODE && nid != memblock_get_region_node(m))
875 continue;
876
877 /* skip hotpluggable memory regions if needed */
878 if (movable_node_is_enabled() && memblock_is_hotpluggable(m))
771 continue; 879 continue;
772 880
773 /* scan areas before each reservation for intersection */ 881 /* scan areas before each reservation for intersection */
@@ -837,18 +945,18 @@ void __init_memblock __next_mem_pfn_range(int *idx, int nid,
837 * memblock_set_node - set node ID on memblock regions 945 * memblock_set_node - set node ID on memblock regions
838 * @base: base of area to set node ID for 946 * @base: base of area to set node ID for
839 * @size: size of area to set node ID for 947 * @size: size of area to set node ID for
948 * @type: memblock type to set node ID for
840 * @nid: node ID to set 949 * @nid: node ID to set
841 * 950 *
842 * Set the nid of memblock memory regions in [@base,@base+@size) to @nid. 951 * Set the nid of memblock @type regions in [@base,@base+@size) to @nid.
843 * Regions which cross the area boundaries are split as necessary. 952 * Regions which cross the area boundaries are split as necessary.
844 * 953 *
845 * RETURNS: 954 * RETURNS:
846 * 0 on success, -errno on failure. 955 * 0 on success, -errno on failure.
847 */ 956 */
848int __init_memblock memblock_set_node(phys_addr_t base, phys_addr_t size, 957int __init_memblock memblock_set_node(phys_addr_t base, phys_addr_t size,
849 int nid) 958 struct memblock_type *type, int nid)
850{ 959{
851 struct memblock_type *type = &memblock.memory;
852 int start_rgn, end_rgn; 960 int start_rgn, end_rgn;
853 int i, ret; 961 int i, ret;
854 962
@@ -870,13 +978,13 @@ static phys_addr_t __init memblock_alloc_base_nid(phys_addr_t size,
870{ 978{
871 phys_addr_t found; 979 phys_addr_t found;
872 980
873 if (WARN_ON(!align)) 981 if (!align)
874 align = __alignof__(long long); 982 align = SMP_CACHE_BYTES;
875 983
876 /* align @size to avoid excessive fragmentation on reserved array */ 984 /* align @size to avoid excessive fragmentation on reserved array */
877 size = round_up(size, align); 985 size = round_up(size, align);
878 986
879 found = memblock_find_in_range_node(0, max_addr, size, align, nid); 987 found = memblock_find_in_range_node(size, align, 0, max_addr, nid);
880 if (found && !memblock_reserve(found, size)) 988 if (found && !memblock_reserve(found, size))
881 return found; 989 return found;
882 990
@@ -890,7 +998,7 @@ phys_addr_t __init memblock_alloc_nid(phys_addr_t size, phys_addr_t align, int n
890 998
891phys_addr_t __init __memblock_alloc_base(phys_addr_t size, phys_addr_t align, phys_addr_t max_addr) 999phys_addr_t __init __memblock_alloc_base(phys_addr_t size, phys_addr_t align, phys_addr_t max_addr)
892{ 1000{
893 return memblock_alloc_base_nid(size, align, max_addr, MAX_NUMNODES); 1001 return memblock_alloc_base_nid(size, align, max_addr, NUMA_NO_NODE);
894} 1002}
895 1003
896phys_addr_t __init memblock_alloc_base(phys_addr_t size, phys_addr_t align, phys_addr_t max_addr) 1004phys_addr_t __init memblock_alloc_base(phys_addr_t size, phys_addr_t align, phys_addr_t max_addr)
@@ -920,6 +1028,207 @@ phys_addr_t __init memblock_alloc_try_nid(phys_addr_t size, phys_addr_t align, i
920 return memblock_alloc_base(size, align, MEMBLOCK_ALLOC_ACCESSIBLE); 1028 return memblock_alloc_base(size, align, MEMBLOCK_ALLOC_ACCESSIBLE);
921} 1029}
922 1030
1031/**
1032 * memblock_virt_alloc_internal - allocate boot memory block
1033 * @size: size of memory block to be allocated in bytes
1034 * @align: alignment of the region and block's size
1035 * @min_addr: the lower bound of the memory region to allocate (phys address)
1036 * @max_addr: the upper bound of the memory region to allocate (phys address)
1037 * @nid: nid of the free area to find, %NUMA_NO_NODE for any node
1038 *
1039 * The @min_addr limit is dropped if it can not be satisfied and the allocation
1040 * will fall back to memory below @min_addr. Also, allocation may fall back
1041 * to any node in the system if the specified node can not
1042 * hold the requested memory.
1043 *
1044 * The allocation is performed from memory region limited by
1045 * memblock.current_limit if @max_addr == %BOOTMEM_ALLOC_ACCESSIBLE.
1046 *
1047 * The memory block is aligned on SMP_CACHE_BYTES if @align == 0.
1048 *
1049 * The phys address of allocated boot memory block is converted to virtual and
1050 * allocated memory is reset to 0.
1051 *
1052 * In addition, function sets the min_count to 0 using kmemleak_alloc for
1053 * allocated boot memory block, so that it is never reported as leaks.
1054 *
1055 * RETURNS:
1056 * Virtual address of allocated memory block on success, NULL on failure.
1057 */
1058static void * __init memblock_virt_alloc_internal(
1059 phys_addr_t size, phys_addr_t align,
1060 phys_addr_t min_addr, phys_addr_t max_addr,
1061 int nid)
1062{
1063 phys_addr_t alloc;
1064 void *ptr;
1065
1066 if (WARN_ONCE(nid == MAX_NUMNODES, "Usage of MAX_NUMNODES is deprecated. Use NUMA_NO_NODE instead\n"))
1067 nid = NUMA_NO_NODE;
1068
1069 /*
1070 * Detect any accidental use of these APIs after slab is ready, as at
1071 * this moment memblock may be deinitialized already and its
1072 * internal data may be destroyed (after execution of free_all_bootmem)
1073 */
1074 if (WARN_ON_ONCE(slab_is_available()))
1075 return kzalloc_node(size, GFP_NOWAIT, nid);
1076
1077 if (!align)
1078 align = SMP_CACHE_BYTES;
1079
1080 /* align @size to avoid excessive fragmentation on reserved array */
1081 size = round_up(size, align);
1082
1083again:
1084 alloc = memblock_find_in_range_node(size, align, min_addr, max_addr,
1085 nid);
1086 if (alloc)
1087 goto done;
1088
1089 if (nid != NUMA_NO_NODE) {
1090 alloc = memblock_find_in_range_node(size, align, min_addr,
1091 max_addr, NUMA_NO_NODE);
1092 if (alloc)
1093 goto done;
1094 }
1095
1096 if (min_addr) {
1097 min_addr = 0;
1098 goto again;
1099 } else {
1100 goto error;
1101 }
1102
1103done:
1104 memblock_reserve(alloc, size);
1105 ptr = phys_to_virt(alloc);
1106 memset(ptr, 0, size);
1107
1108 /*
1109 * The min_count is set to 0 so that bootmem allocated blocks
1110 * are never reported as leaks. This is because many of these blocks
1111 * are only referred via the physical address which is not
1112 * looked up by kmemleak.
1113 */
1114 kmemleak_alloc(ptr, size, 0, 0);
1115
1116 return ptr;
1117
1118error:
1119 return NULL;
1120}
1121
1122/**
1123 * memblock_virt_alloc_try_nid_nopanic - allocate boot memory block
1124 * @size: size of memory block to be allocated in bytes
1125 * @align: alignment of the region and block's size
1126 * @min_addr: the lower bound of the memory region from where the allocation
1127 * is preferred (phys address)
1128 * @max_addr: the upper bound of the memory region from where the allocation
1129 * is preferred (phys address), or %BOOTMEM_ALLOC_ACCESSIBLE to
1130 * allocate only from memory limited by memblock.current_limit value
1131 * @nid: nid of the free area to find, %NUMA_NO_NODE for any node
1132 *
1133 * Public version of _memblock_virt_alloc_try_nid_nopanic() which provides
1134 * additional debug information (including caller info), if enabled.
1135 *
1136 * RETURNS:
1137 * Virtual address of allocated memory block on success, NULL on failure.
1138 */
1139void * __init memblock_virt_alloc_try_nid_nopanic(
1140 phys_addr_t size, phys_addr_t align,
1141 phys_addr_t min_addr, phys_addr_t max_addr,
1142 int nid)
1143{
1144 memblock_dbg("%s: %llu bytes align=0x%llx nid=%d from=0x%llx max_addr=0x%llx %pF\n",
1145 __func__, (u64)size, (u64)align, nid, (u64)min_addr,
1146 (u64)max_addr, (void *)_RET_IP_);
1147 return memblock_virt_alloc_internal(size, align, min_addr,
1148 max_addr, nid);
1149}
1150
1151/**
1152 * memblock_virt_alloc_try_nid - allocate boot memory block with panicking
1153 * @size: size of memory block to be allocated in bytes
1154 * @align: alignment of the region and block's size
1155 * @min_addr: the lower bound of the memory region from where the allocation
1156 * is preferred (phys address)
1157 * @max_addr: the upper bound of the memory region from where the allocation
1158 * is preferred (phys address), or %BOOTMEM_ALLOC_ACCESSIBLE to
1159 * allocate only from memory limited by memblock.current_limit value
1160 * @nid: nid of the free area to find, %NUMA_NO_NODE for any node
1161 *
1162 * Public panicking version of _memblock_virt_alloc_try_nid_nopanic()
1163 * which provides debug information (including caller info), if enabled,
1164 * and panics if the request can not be satisfied.
1165 *
1166 * RETURNS:
1167 * Virtual address of allocated memory block on success, NULL on failure.
1168 */
1169void * __init memblock_virt_alloc_try_nid(
1170 phys_addr_t size, phys_addr_t align,
1171 phys_addr_t min_addr, phys_addr_t max_addr,
1172 int nid)
1173{
1174 void *ptr;
1175
1176 memblock_dbg("%s: %llu bytes align=0x%llx nid=%d from=0x%llx max_addr=0x%llx %pF\n",
1177 __func__, (u64)size, (u64)align, nid, (u64)min_addr,
1178 (u64)max_addr, (void *)_RET_IP_);
1179 ptr = memblock_virt_alloc_internal(size, align,
1180 min_addr, max_addr, nid);
1181 if (ptr)
1182 return ptr;
1183
1184 panic("%s: Failed to allocate %llu bytes align=0x%llx nid=%d from=0x%llx max_addr=0x%llx\n",
1185 __func__, (u64)size, (u64)align, nid, (u64)min_addr,
1186 (u64)max_addr);
1187 return NULL;
1188}
1189
1190/**
1191 * __memblock_free_early - free boot memory block
1192 * @base: phys starting address of the boot memory block
1193 * @size: size of the boot memory block in bytes
1194 *
1195 * Free boot memory block previously allocated by memblock_virt_alloc_xx() API.
1196 * The freeing memory will not be released to the buddy allocator.
1197 */
1198void __init __memblock_free_early(phys_addr_t base, phys_addr_t size)
1199{
1200 memblock_dbg("%s: [%#016llx-%#016llx] %pF\n",
1201 __func__, (u64)base, (u64)base + size - 1,
1202 (void *)_RET_IP_);
1203 kmemleak_free_part(__va(base), size);
1204 __memblock_remove(&memblock.reserved, base, size);
1205}
1206
1207/*
1208 * __memblock_free_late - free bootmem block pages directly to buddy allocator
1209 * @addr: phys starting address of the boot memory block
1210 * @size: size of the boot memory block in bytes
1211 *
1212 * This is only useful when the bootmem allocator has already been torn
1213 * down, but we are still initializing the system. Pages are released directly
1214 * to the buddy allocator, no bootmem metadata is updated because it is gone.
1215 */
1216void __init __memblock_free_late(phys_addr_t base, phys_addr_t size)
1217{
1218 u64 cursor, end;
1219
1220 memblock_dbg("%s: [%#016llx-%#016llx] %pF\n",
1221 __func__, (u64)base, (u64)base + size - 1,
1222 (void *)_RET_IP_);
1223 kmemleak_free_part(__va(base), size);
1224 cursor = PFN_UP(base);
1225 end = PFN_DOWN(base + size);
1226
1227 for (; cursor < end; cursor++) {
1228 __free_pages_bootmem(pfn_to_page(cursor), 0);
1229 totalram_pages++;
1230 }
1231}
923 1232
924/* 1233/*
925 * Remaining API functions 1234 * Remaining API functions
@@ -1101,6 +1410,7 @@ void __init_memblock memblock_set_current_limit(phys_addr_t limit)
1101static void __init_memblock memblock_dump(struct memblock_type *type, char *name) 1410static void __init_memblock memblock_dump(struct memblock_type *type, char *name)
1102{ 1411{
1103 unsigned long long base, size; 1412 unsigned long long base, size;
1413 unsigned long flags;
1104 int i; 1414 int i;
1105 1415
1106 pr_info(" %s.cnt = 0x%lx\n", name, type->cnt); 1416 pr_info(" %s.cnt = 0x%lx\n", name, type->cnt);
@@ -1111,13 +1421,14 @@ static void __init_memblock memblock_dump(struct memblock_type *type, char *name
1111 1421
1112 base = rgn->base; 1422 base = rgn->base;
1113 size = rgn->size; 1423 size = rgn->size;
1424 flags = rgn->flags;
1114#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP 1425#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
1115 if (memblock_get_region_node(rgn) != MAX_NUMNODES) 1426 if (memblock_get_region_node(rgn) != MAX_NUMNODES)
1116 snprintf(nid_buf, sizeof(nid_buf), " on node %d", 1427 snprintf(nid_buf, sizeof(nid_buf), " on node %d",
1117 memblock_get_region_node(rgn)); 1428 memblock_get_region_node(rgn));
1118#endif 1429#endif
1119 pr_info(" %s[%#x]\t[%#016llx-%#016llx], %#llx bytes%s\n", 1430 pr_info(" %s[%#x]\t[%#016llx-%#016llx], %#llx bytes%s flags: %#lx\n",
1120 name, i, base, base + size - 1, size, nid_buf); 1431 name, i, base, base + size - 1, size, nid_buf, flags);
1121 } 1432 }
1122} 1433}
1123 1434
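
Among the memblock changes, memblock_virt_alloc_internal() tries the requested node first, falls back to any node, and finally retries with the lower address bound dropped before giving up. A compact sketch of that fallback ladder, where find_area() is a hypothetical stand-in for memblock_find_in_range_node():

/* Fallback-ladder sketch; find_area() is a fake allocator, not memblock. */
#include <stdio.h>

#define ANY_NODE (-1)

/* Pretend only node 1 has free memory, and only at/above 0x1000. */
static unsigned long find_area(unsigned long size, unsigned long min_addr, int nid)
{
    (void)size;
    if ((nid == 1 || nid == ANY_NODE) && min_addr <= 0x1000)
        return 0x1000;
    return 0;                            /* 0 means "nothing found" */
}

static unsigned long alloc_try_nid(unsigned long size, unsigned long min_addr, int nid)
{
    unsigned long addr;

again:
    addr = find_area(size, min_addr, nid);           /* 1: requested node */
    if (addr)
        return addr;

    if (nid != ANY_NODE) {
        addr = find_area(size, min_addr, ANY_NODE);  /* 2: any node */
        if (addr)
            return addr;
    }

    if (min_addr) {                      /* 3: drop the lower bound and retry */
        min_addr = 0;
        goto again;
    }
    return 0;
}

int main(void)
{
    printf("got %#lx\n", alloc_try_nid(4096, 0x8000, 0));
    return 0;
}
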
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 7caff36180cd..67dd2a881433 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1688,13 +1688,13 @@ static void move_unlock_mem_cgroup(struct mem_cgroup *memcg,
1688 */ 1688 */
1689void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) 1689void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
1690{ 1690{
1691 struct cgroup *task_cgrp;
1692 struct cgroup *mem_cgrp;
1693 /* 1691 /*
1694 * Need a buffer in BSS, can't rely on allocations. The code relies 1692 * protects memcg_name and makes sure that parallel ooms do not
1695 * on the assumption that OOM is serialized for memory controller. 1693 * interleave
1696 * If this assumption is broken, revisit this code.
1697 */ 1694 */
1695 static DEFINE_SPINLOCK(oom_info_lock);
1696 struct cgroup *task_cgrp;
1697 struct cgroup *mem_cgrp;
1698 static char memcg_name[PATH_MAX]; 1698 static char memcg_name[PATH_MAX];
1699 int ret; 1699 int ret;
1700 struct mem_cgroup *iter; 1700 struct mem_cgroup *iter;
@@ -1703,6 +1703,7 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
1703 if (!p) 1703 if (!p)
1704 return; 1704 return;
1705 1705
1706 spin_lock(&oom_info_lock);
1706 rcu_read_lock(); 1707 rcu_read_lock();
1707 1708
1708 mem_cgrp = memcg->css.cgroup; 1709 mem_cgrp = memcg->css.cgroup;
@@ -1771,6 +1772,7 @@ done:
1771 1772
1772 pr_cont("\n"); 1773 pr_cont("\n");
1773 } 1774 }
1775 spin_unlock(&oom_info_lock);
1774} 1776}
1775 1777
1776/* 1778/*
@@ -3000,7 +3002,8 @@ static DEFINE_MUTEX(set_limit_mutex);
3000static inline bool memcg_can_account_kmem(struct mem_cgroup *memcg) 3002static inline bool memcg_can_account_kmem(struct mem_cgroup *memcg)
3001{ 3003{
3002 return !mem_cgroup_disabled() && !mem_cgroup_is_root(memcg) && 3004 return !mem_cgroup_disabled() && !mem_cgroup_is_root(memcg) &&
3003 (memcg->kmem_account_flags & KMEM_ACCOUNTED_MASK); 3005 (memcg->kmem_account_flags & KMEM_ACCOUNTED_MASK) ==
3006 KMEM_ACCOUNTED_MASK;
3004} 3007}
3005 3008
3006/* 3009/*
@@ -3126,7 +3129,7 @@ int memcg_cache_id(struct mem_cgroup *memcg)
3126 * But when we create a new cache, we can call this as well if its parent 3129 * But when we create a new cache, we can call this as well if its parent
3127 * is kmem-limited. That will have to hold set_limit_mutex as well. 3130 * is kmem-limited. That will have to hold set_limit_mutex as well.
3128 */ 3131 */
3129int memcg_update_cache_sizes(struct mem_cgroup *memcg) 3132static int memcg_update_cache_sizes(struct mem_cgroup *memcg)
3130{ 3133{
3131 int num, ret; 3134 int num, ret;
3132 3135
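
The first memcontrol.c hunk serializes mem_cgroup_print_oom_info() with a function-local static spinlock, because the function writes into a static memcg_name buffer and emits a multi-line report that must not interleave with a parallel OOM. A hedged user-space sketch of the same shape, using a pthread mutex in place of the kernel spinlock (buffer size and names below are invented for the example):

#include <stdio.h>
#include <pthread.h>

#define PATH_MAX_SKETCH 256

/* Print a small multi-line report; the static scratch buffer and the
 * output lines must not interleave, so one lock covers both. */
static void print_oom_info(const char *cgroup_path, long usage_kb)
{
        static pthread_mutex_t oom_info_lock = PTHREAD_MUTEX_INITIALIZER;
        static char name[PATH_MAX_SKETCH];      /* shared scratch buffer */

        pthread_mutex_lock(&oom_info_lock);
        snprintf(name, sizeof(name), "%s", cgroup_path);
        printf("Task in %s killed as a result of limit of %s\n", name, name);
        printf("memory: usage %ldkB\n", usage_kb);
        pthread_mutex_unlock(&oom_info_lock);
}

int main(void)
{
        print_oom_info("/test/group", 1024);
        return 0;
}
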
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index fabe55046c1d..b25ed321e667 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -611,7 +611,7 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn)
611} 611}
612 612
613/* 613/*
614 * Dirty cache page page 614 * Dirty pagecache page
615 * Issues: when the error hit a hole page the error is not properly 615 * Issues: when the error hit a hole page the error is not properly
616 * propagated. 616 * propagated.
617 */ 617 */
@@ -1585,7 +1585,13 @@ static int __soft_offline_page(struct page *page, int flags)
1585 ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 1585 ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL,
1586 MIGRATE_SYNC, MR_MEMORY_FAILURE); 1586 MIGRATE_SYNC, MR_MEMORY_FAILURE);
1587 if (ret) { 1587 if (ret) {
1588 putback_lru_pages(&pagelist); 1588 if (!list_empty(&pagelist)) {
1589 list_del(&page->lru);
1590 dec_zone_page_state(page, NR_ISOLATED_ANON +
1591 page_is_file_cache(page));
1592 putback_lru_page(page);
1593 }
1594
1589 pr_info("soft offline: %#lx: migration failed %d, type %lx\n", 1595 pr_info("soft offline: %#lx: migration failed %d, type %lx\n",
1590 pfn, ret, page->flags); 1596 pfn, ret, page->flags);
1591 if (ret > 0) 1597 if (ret > 0)
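
With putback_lru_pages() gone, the memory-failure hunk above open-codes the cleanup for the single page it isolated: if migration failed and the page is still on the private list, it is unlinked, the per-zone isolated counter is dropped, and the page goes back to the LRU. A toy C sketch of that guard-then-undo pattern (all structures here are stand-ins, not kernel types):

#include <stdio.h>
#include <stdbool.h>

/* Minimal stand-ins for the kernel objects involved (all hypothetical). */
struct page_stub {
        bool on_list;           /* still sitting on the isolation list? */
        bool file_backed;       /* anon vs. file-backed page            */
};

static long nr_isolated[2];     /* [0] = anon, [1] = file, like NR_ISOLATED_* */

static void putback_one(struct page_stub *page)
{
        /* Only undo the isolation if the page was never taken off the
         * list by a successful migration. */
        if (page->on_list) {
                page->on_list = false;                  /* list_del()            */
                nr_isolated[page->file_backed]--;       /* dec_zone_page_state() */
                /* putback_lru_page(page) would go here in the kernel. */
        }
}

int main(void)
{
        struct page_stub p = { .on_list = true, .file_backed = true };

        nr_isolated[1] = 1;
        putback_one(&p);
        printf("isolated file pages: %ld\n", nr_isolated[1]);
        return 0;
}
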
diff --git a/mm/memory.c b/mm/memory.c
index 6768ce9e57d2..86487dfa5e59 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -59,6 +59,7 @@
59#include <linux/gfp.h> 59#include <linux/gfp.h>
60#include <linux/migrate.h> 60#include <linux/migrate.h>
61#include <linux/string.h> 61#include <linux/string.h>
62#include <linux/dma-debug.h>
62 63
63#include <asm/io.h> 64#include <asm/io.h>
64#include <asm/pgalloc.h> 65#include <asm/pgalloc.h>
@@ -2559,6 +2560,8 @@ static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd,
2559 2560
2560static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va, struct vm_area_struct *vma) 2561static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va, struct vm_area_struct *vma)
2561{ 2562{
2563 debug_dma_assert_idle(src);
2564
2562 /* 2565 /*
2563 * If the source page was a PFN mapping, we don't have 2566 * If the source page was a PFN mapping, we don't have
2564 * a "struct page" for it. We do a best-effort copy by 2567 * a "struct page" for it. We do a best-effort copy by
@@ -4272,11 +4275,20 @@ void copy_user_huge_page(struct page *dst, struct page *src,
4272#endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */ 4275#endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */
4273 4276
4274#if USE_SPLIT_PTE_PTLOCKS && ALLOC_SPLIT_PTLOCKS 4277#if USE_SPLIT_PTE_PTLOCKS && ALLOC_SPLIT_PTLOCKS
4278
4279static struct kmem_cache *page_ptl_cachep;
4280
4281void __init ptlock_cache_init(void)
4282{
4283 page_ptl_cachep = kmem_cache_create("page->ptl", sizeof(spinlock_t), 0,
4284 SLAB_PANIC, NULL);
4285}
4286
4275bool ptlock_alloc(struct page *page) 4287bool ptlock_alloc(struct page *page)
4276{ 4288{
4277 spinlock_t *ptl; 4289 spinlock_t *ptl;
4278 4290
4279 ptl = kmalloc(sizeof(spinlock_t), GFP_KERNEL); 4291 ptl = kmem_cache_alloc(page_ptl_cachep, GFP_KERNEL);
4280 if (!ptl) 4292 if (!ptl)
4281 return false; 4293 return false;
4282 page->ptl = ptl; 4294 page->ptl = ptl;
@@ -4285,6 +4297,6 @@ bool ptlock_alloc(struct page *page)
4285 4297
4286void ptlock_free(struct page *page) 4298void ptlock_free(struct page *page)
4287{ 4299{
4288 kfree(page->ptl); 4300 kmem_cache_free(page_ptl_cachep, page->ptl);
4289} 4301}
4290#endif 4302#endif
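
The memory.c hunk replaces kmalloc()/kfree() of the split page-table lock with a dedicated kmem_cache, giving the many small spinlock_t objects their own properly sized and named pool. A rough user-space sketch of the call-site shape only, with a toy cache that simply forwards to malloc()/free() (names such as obj_cache_create are invented; this is not the slab API):

#include <stdio.h>
#include <stdlib.h>
#include <pthread.h>

/* Toy fixed-size object cache with the same call shape as kmem_cache. */
struct obj_cache {
        const char *name;
        size_t size;
};

static struct obj_cache *obj_cache_create(const char *name, size_t size)
{
        struct obj_cache *c = malloc(sizeof(*c));

        if (c) {
                c->name = name;
                c->size = size;
        }
        return c;
}

static void *obj_cache_alloc(struct obj_cache *c) { return malloc(c->size); }
static void obj_cache_free(struct obj_cache *c, void *p) { (void)c; free(p); }

static struct obj_cache *ptl_cachep;    /* analogue of page_ptl_cachep */

int main(void)
{
        /* boot-time init, then the former kmalloc/kfree sites switch over */
        ptl_cachep = obj_cache_create("page->ptl", sizeof(pthread_mutex_t));
        pthread_mutex_t *ptl = obj_cache_alloc(ptl_cachep);

        if (!ptl)
                return 1;
        pthread_mutex_init(ptl, NULL);
        pthread_mutex_destroy(ptl);
        obj_cache_free(ptl_cachep, ptl);
        printf("cache '%s' object size %zu\n", ptl_cachep->name, ptl_cachep->size);
        free(ptl_cachep);
        return 0;
}
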
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 489f235502db..cc2ab37220b7 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -9,7 +9,6 @@
9#include <linux/swap.h> 9#include <linux/swap.h>
10#include <linux/interrupt.h> 10#include <linux/interrupt.h>
11#include <linux/pagemap.h> 11#include <linux/pagemap.h>
12#include <linux/bootmem.h>
13#include <linux/compiler.h> 12#include <linux/compiler.h>
14#include <linux/export.h> 13#include <linux/export.h>
15#include <linux/pagevec.h> 14#include <linux/pagevec.h>
@@ -269,7 +268,7 @@ static void fix_zone_id(struct zone *zone, unsigned long start_pfn,
269} 268}
270 269
271/* Can fail with -ENOMEM from allocating a wait table with vmalloc() or 270/* Can fail with -ENOMEM from allocating a wait table with vmalloc() or
272 * alloc_bootmem_node_nopanic() */ 271 * alloc_bootmem_node_nopanic()/memblock_virt_alloc_node_nopanic() */
273static int __ref ensure_zone_is_initialized(struct zone *zone, 272static int __ref ensure_zone_is_initialized(struct zone *zone,
274 unsigned long start_pfn, unsigned long num_pages) 273 unsigned long start_pfn, unsigned long num_pages)
275{ 274{
@@ -1446,6 +1445,7 @@ static int __init cmdline_parse_movable_node(char *p)
1446 * the kernel away from hotpluggable memory. 1445 * the kernel away from hotpluggable memory.
1447 */ 1446 */
1448 memblock_set_bottom_up(true); 1447 memblock_set_bottom_up(true);
1448 movable_node_enabled = true;
1449#else 1449#else
1450 pr_warn("movable_node option not supported\n"); 1450 pr_warn("movable_node option not supported\n");
1451#endif 1451#endif
diff --git a/mm/migrate.c b/mm/migrate.c
index 9194375b2307..a8025befc323 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -72,28 +72,12 @@ int migrate_prep_local(void)
72} 72}
73 73
74/* 74/*
75 * Add isolated pages on the list back to the LRU under page lock
76 * to avoid leaking evictable pages back onto unevictable list.
77 */
78void putback_lru_pages(struct list_head *l)
79{
80 struct page *page;
81 struct page *page2;
82
83 list_for_each_entry_safe(page, page2, l, lru) {
84 list_del(&page->lru);
85 dec_zone_page_state(page, NR_ISOLATED_ANON +
86 page_is_file_cache(page));
87 putback_lru_page(page);
88 }
89}
90
91/*
92 * Put previously isolated pages back onto the appropriate lists 75 * Put previously isolated pages back onto the appropriate lists
93 * from where they were once taken off for compaction/migration. 76 * from where they were once taken off for compaction/migration.
94 * 77 *
95 * This function shall be used instead of putback_lru_pages(), 78 * This function shall be used whenever the isolated pageset has been
96 * whenever the isolated pageset has been built by isolate_migratepages_range() 79 * built from lru, balloon, hugetlbfs page. See isolate_migratepages_range()
80 * and isolate_huge_page().
97 */ 81 */
98void putback_movable_pages(struct list_head *l) 82void putback_movable_pages(struct list_head *l)
99{ 83{
@@ -199,7 +183,12 @@ out:
199 */ 183 */
200static void remove_migration_ptes(struct page *old, struct page *new) 184static void remove_migration_ptes(struct page *old, struct page *new)
201{ 185{
202 rmap_walk(new, remove_migration_pte, old); 186 struct rmap_walk_control rwc = {
187 .rmap_one = remove_migration_pte,
188 .arg = old,
189 };
190
191 rmap_walk(new, &rwc);
203} 192}
204 193
205/* 194/*
@@ -563,14 +552,6 @@ void migrate_page_copy(struct page *newpage, struct page *page)
563 * Migration functions 552 * Migration functions
564 ***********************************************************/ 553 ***********************************************************/
565 554
566/* Always fail migration. Used for mappings that are not movable */
567int fail_migrate_page(struct address_space *mapping,
568 struct page *newpage, struct page *page)
569{
570 return -EIO;
571}
572EXPORT_SYMBOL(fail_migrate_page);
573
574/* 555/*
575 * Common logic to directly migrate a single page suitable for 556 * Common logic to directly migrate a single page suitable for
576 * pages that do not use PagePrivate/PagePrivate2. 557 * pages that do not use PagePrivate/PagePrivate2.
@@ -1008,7 +989,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
1008{ 989{
1009 int rc = 0; 990 int rc = 0;
1010 int *result = NULL; 991 int *result = NULL;
1011 struct page *new_hpage = get_new_page(hpage, private, &result); 992 struct page *new_hpage;
1012 struct anon_vma *anon_vma = NULL; 993 struct anon_vma *anon_vma = NULL;
1013 994
1014 /* 995 /*
@@ -1018,9 +999,12 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
1018 * tables or check whether the hugepage is pmd-based or not before 999 * tables or check whether the hugepage is pmd-based or not before
1019 * kicking migration. 1000 * kicking migration.
1020 */ 1001 */
1021 if (!hugepage_migration_support(page_hstate(hpage))) 1002 if (!hugepage_migration_support(page_hstate(hpage))) {
1003 putback_active_hugepage(hpage);
1022 return -ENOSYS; 1004 return -ENOSYS;
1005 }
1023 1006
1007 new_hpage = get_new_page(hpage, private, &result);
1024 if (!new_hpage) 1008 if (!new_hpage)
1025 return -ENOMEM; 1009 return -ENOMEM;
1026 1010
@@ -1120,7 +1104,12 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page,
1120 nr_succeeded++; 1104 nr_succeeded++;
1121 break; 1105 break;
1122 default: 1106 default:
1123 /* Permanent failure */ 1107 /*
1108 * Permanent failure (-EBUSY, -ENOSYS, etc.):
1109 * unlike -EAGAIN case, the failed page is
1110 * removed from migration page list and not
1111 * retried in the next outer loop.
1112 */
1124 nr_failed++; 1113 nr_failed++;
1125 break; 1114 break;
1126 } 1115 }
@@ -1594,31 +1583,38 @@ bool migrate_ratelimited(int node)
1594} 1583}
1595 1584
1596/* Returns true if the node is migrate rate-limited after the update */ 1585/* Returns true if the node is migrate rate-limited after the update */
1597bool numamigrate_update_ratelimit(pg_data_t *pgdat, unsigned long nr_pages) 1586static bool numamigrate_update_ratelimit(pg_data_t *pgdat,
1587 unsigned long nr_pages)
1598{ 1588{
1599 bool rate_limited = false;
1600
1601 /* 1589 /*
1602 * Rate-limit the amount of data that is being migrated to a node. 1590 * Rate-limit the amount of data that is being migrated to a node.
1603 * Optimal placement is no good if the memory bus is saturated and 1591 * Optimal placement is no good if the memory bus is saturated and
1604 * all the time is being spent migrating! 1592 * all the time is being spent migrating!
1605 */ 1593 */
1606 spin_lock(&pgdat->numabalancing_migrate_lock);
1607 if (time_after(jiffies, pgdat->numabalancing_migrate_next_window)) { 1594 if (time_after(jiffies, pgdat->numabalancing_migrate_next_window)) {
1595 spin_lock(&pgdat->numabalancing_migrate_lock);
1608 pgdat->numabalancing_migrate_nr_pages = 0; 1596 pgdat->numabalancing_migrate_nr_pages = 0;
1609 pgdat->numabalancing_migrate_next_window = jiffies + 1597 pgdat->numabalancing_migrate_next_window = jiffies +
1610 msecs_to_jiffies(migrate_interval_millisecs); 1598 msecs_to_jiffies(migrate_interval_millisecs);
1599 spin_unlock(&pgdat->numabalancing_migrate_lock);
1611 } 1600 }
1612 if (pgdat->numabalancing_migrate_nr_pages > ratelimit_pages) 1601 if (pgdat->numabalancing_migrate_nr_pages > ratelimit_pages) {
1613 rate_limited = true; 1602 trace_mm_numa_migrate_ratelimit(current, pgdat->node_id,
1614 else 1603 nr_pages);
1615 pgdat->numabalancing_migrate_nr_pages += nr_pages; 1604 return true;
1616 spin_unlock(&pgdat->numabalancing_migrate_lock); 1605 }
1617 1606
1618 return rate_limited; 1607 /*
1608 * This is an unlocked non-atomic update so errors are possible.
 1609 * The consequences are failing to migrate when we potentially should
 1610 * have, which is not severe enough to warrant locking. If it is ever
1611 * a problem, it can be converted to a per-cpu counter.
1612 */
1613 pgdat->numabalancing_migrate_nr_pages += nr_pages;
1614 return false;
1619} 1615}
1620 1616
1621int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page) 1617static int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page)
1622{ 1618{
1623 int page_lru; 1619 int page_lru;
1624 1620
@@ -1705,7 +1701,12 @@ int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma,
1705 nr_remaining = migrate_pages(&migratepages, alloc_misplaced_dst_page, 1701 nr_remaining = migrate_pages(&migratepages, alloc_misplaced_dst_page,
1706 node, MIGRATE_ASYNC, MR_NUMA_MISPLACED); 1702 node, MIGRATE_ASYNC, MR_NUMA_MISPLACED);
1707 if (nr_remaining) { 1703 if (nr_remaining) {
1708 putback_lru_pages(&migratepages); 1704 if (!list_empty(&migratepages)) {
1705 list_del(&page->lru);
1706 dec_zone_page_state(page, NR_ISOLATED_ANON +
1707 page_is_file_cache(page));
1708 putback_lru_page(page);
1709 }
1709 isolated = 0; 1710 isolated = 0;
1710 } else 1711 } else
1711 count_vm_numa_event(NUMA_PAGE_MIGRATE); 1712 count_vm_numa_event(NUMA_PAGE_MIGRATE);
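
The numamigrate_update_ratelimit() hunk narrows the lock to the window-reset path and leaves the page-count update deliberately unlocked, trading a small counting error for less contention on every NUMA-balancing migration. A user-space sketch of that window logic, using wall-clock seconds and a pthread mutex as stand-ins for jiffies and the pgdat spinlock:

#include <stdio.h>
#include <stdbool.h>
#include <time.h>
#include <pthread.h>

#define WINDOW_SECS     1
#define LIMIT_PAGES     128

static pthread_mutex_t window_lock = PTHREAD_MUTEX_INITIALIZER;
static time_t next_window;
static unsigned long window_pages;

/* Returns true if this transfer should be throttled.  The lock is only
 * taken to reset the window; the counter update itself is left racy on
 * purpose, exactly as in the kernel version after this patch. */
static bool update_ratelimit(unsigned long nr_pages)
{
        time_t now = time(NULL);

        if (now >= next_window) {
                pthread_mutex_lock(&window_lock);
                window_pages = 0;
                next_window = now + WINDOW_SECS;
                pthread_mutex_unlock(&window_lock);
        }

        if (window_pages > LIMIT_PAGES)
                return true;            /* over budget: skip this migration */

        window_pages += nr_pages;       /* unlocked, small errors tolerated */
        return false;
}

int main(void)
{
        for (int i = 0; i < 6; i++)
                printf("batch %d throttled: %d\n", i, update_ratelimit(64));
        return 0;
}
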
diff --git a/mm/mlock.c b/mm/mlock.c
index 192e6eebe4f2..10819ed4df3e 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -709,19 +709,21 @@ SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len)
709 709
710 lru_add_drain_all(); /* flush pagevec */ 710 lru_add_drain_all(); /* flush pagevec */
711 711
712 down_write(&current->mm->mmap_sem);
713 len = PAGE_ALIGN(len + (start & ~PAGE_MASK)); 712 len = PAGE_ALIGN(len + (start & ~PAGE_MASK));
714 start &= PAGE_MASK; 713 start &= PAGE_MASK;
715 714
716 locked = len >> PAGE_SHIFT;
717 locked += current->mm->locked_vm;
718
719 lock_limit = rlimit(RLIMIT_MEMLOCK); 715 lock_limit = rlimit(RLIMIT_MEMLOCK);
720 lock_limit >>= PAGE_SHIFT; 716 lock_limit >>= PAGE_SHIFT;
717 locked = len >> PAGE_SHIFT;
718
719 down_write(&current->mm->mmap_sem);
720
721 locked += current->mm->locked_vm;
721 722
722 /* check against resource limits */ 723 /* check against resource limits */
723 if ((locked <= lock_limit) || capable(CAP_IPC_LOCK)) 724 if ((locked <= lock_limit) || capable(CAP_IPC_LOCK))
724 error = do_mlock(start, len, 1); 725 error = do_mlock(start, len, 1);
726
725 up_write(&current->mm->mmap_sem); 727 up_write(&current->mm->mmap_sem);
726 if (!error) 728 if (!error)
727 error = __mm_populate(start, len, 0); 729 error = __mm_populate(start, len, 0);
@@ -732,11 +734,13 @@ SYSCALL_DEFINE2(munlock, unsigned long, start, size_t, len)
732{ 734{
733 int ret; 735 int ret;
734 736
735 down_write(&current->mm->mmap_sem);
736 len = PAGE_ALIGN(len + (start & ~PAGE_MASK)); 737 len = PAGE_ALIGN(len + (start & ~PAGE_MASK));
737 start &= PAGE_MASK; 738 start &= PAGE_MASK;
739
740 down_write(&current->mm->mmap_sem);
738 ret = do_mlock(start, len, 0); 741 ret = do_mlock(start, len, 0);
739 up_write(&current->mm->mmap_sem); 742 up_write(&current->mm->mmap_sem);
743
740 return ret; 744 return ret;
741} 745}
742 746
@@ -781,12 +785,12 @@ SYSCALL_DEFINE1(mlockall, int, flags)
781 if (flags & MCL_CURRENT) 785 if (flags & MCL_CURRENT)
782 lru_add_drain_all(); /* flush pagevec */ 786 lru_add_drain_all(); /* flush pagevec */
783 787
784 down_write(&current->mm->mmap_sem);
785
786 lock_limit = rlimit(RLIMIT_MEMLOCK); 788 lock_limit = rlimit(RLIMIT_MEMLOCK);
787 lock_limit >>= PAGE_SHIFT; 789 lock_limit >>= PAGE_SHIFT;
788 790
789 ret = -ENOMEM; 791 ret = -ENOMEM;
792 down_write(&current->mm->mmap_sem);
793
790 if (!(flags & MCL_CURRENT) || (current->mm->total_vm <= lock_limit) || 794 if (!(flags & MCL_CURRENT) || (current->mm->total_vm <= lock_limit) ||
791 capable(CAP_IPC_LOCK)) 795 capable(CAP_IPC_LOCK))
792 ret = do_mlockall(flags); 796 ret = do_mlockall(flags);
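
All three mlock.c hunks move work out of the mmap_sem write-side critical section: the length alignment and the RLIMIT_MEMLOCK arithmetic need no mm state, so they now run before down_write() and only the part that reads and updates locked_vm stays under the lock. A small user-space sketch of that ordering (the limit value and field names are stand-ins for the rlimit machinery):

#include <stdio.h>
#include <pthread.h>

#define PAGE_SHIFT      12
#define PAGE_SIZE       (1UL << PAGE_SHIFT)
#define PAGE_MASK       (~(PAGE_SIZE - 1))

static pthread_rwlock_t mmap_sem = PTHREAD_RWLOCK_INITIALIZER;
static unsigned long locked_vm;                 /* protected by mmap_sem */
static const unsigned long memlock_limit_pages = 16;

static int do_mlock_sketch(unsigned long start, unsigned long len)
{
        int error = -1;

        /* Everything that does not touch mm state happens before the lock. */
        len = (len + (start & ~PAGE_MASK) + PAGE_SIZE - 1) & PAGE_MASK;
        start &= PAGE_MASK;
        unsigned long want = len >> PAGE_SHIFT;

        pthread_rwlock_wrlock(&mmap_sem);
        if (locked_vm + want <= memlock_limit_pages) {
                locked_vm += want;              /* stand-in for do_mlock() */
                error = 0;
        }
        pthread_rwlock_unlock(&mmap_sem);
        return error;
}

int main(void)
{
        int ret = do_mlock_sketch(0x1234, 8192);

        printf("mlock: %d, locked_vm=%lu pages\n", ret, locked_vm);
        return 0;
}
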
diff --git a/mm/mmap.c b/mm/mmap.c
index 834b2d785f1e..a0e7153a79e6 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -86,6 +86,7 @@ EXPORT_SYMBOL(vm_get_page_prot);
86 86
87int sysctl_overcommit_memory __read_mostly = OVERCOMMIT_GUESS; /* heuristic overcommit */ 87int sysctl_overcommit_memory __read_mostly = OVERCOMMIT_GUESS; /* heuristic overcommit */
88int sysctl_overcommit_ratio __read_mostly = 50; /* default is 50% */ 88int sysctl_overcommit_ratio __read_mostly = 50; /* default is 50% */
89unsigned long sysctl_overcommit_kbytes __read_mostly;
89int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT; 90int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT;
90unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */ 91unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */
91unsigned long sysctl_admin_reserve_kbytes __read_mostly = 1UL << 13; /* 8MB */ 92unsigned long sysctl_admin_reserve_kbytes __read_mostly = 1UL << 13; /* 8MB */
@@ -1190,6 +1191,24 @@ static inline unsigned long round_hint_to_min(unsigned long hint)
1190 return hint; 1191 return hint;
1191} 1192}
1192 1193
1194static inline int mlock_future_check(struct mm_struct *mm,
1195 unsigned long flags,
1196 unsigned long len)
1197{
1198 unsigned long locked, lock_limit;
1199
1200 /* mlock MCL_FUTURE? */
1201 if (flags & VM_LOCKED) {
1202 locked = len >> PAGE_SHIFT;
1203 locked += mm->locked_vm;
1204 lock_limit = rlimit(RLIMIT_MEMLOCK);
1205 lock_limit >>= PAGE_SHIFT;
1206 if (locked > lock_limit && !capable(CAP_IPC_LOCK))
1207 return -EAGAIN;
1208 }
1209 return 0;
1210}
1211
1193/* 1212/*
1194 * The caller must hold down_write(&current->mm->mmap_sem). 1213 * The caller must hold down_write(&current->mm->mmap_sem).
1195 */ 1214 */
@@ -1251,16 +1270,8 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
1251 if (!can_do_mlock()) 1270 if (!can_do_mlock())
1252 return -EPERM; 1271 return -EPERM;
1253 1272
1254 /* mlock MCL_FUTURE? */ 1273 if (mlock_future_check(mm, vm_flags, len))
1255 if (vm_flags & VM_LOCKED) { 1274 return -EAGAIN;
1256 unsigned long locked, lock_limit;
1257 locked = len >> PAGE_SHIFT;
1258 locked += mm->locked_vm;
1259 lock_limit = rlimit(RLIMIT_MEMLOCK);
1260 lock_limit >>= PAGE_SHIFT;
1261 if (locked > lock_limit && !capable(CAP_IPC_LOCK))
1262 return -EAGAIN;
1263 }
1264 1275
1265 if (file) { 1276 if (file) {
1266 struct inode *inode = file_inode(file); 1277 struct inode *inode = file_inode(file);
@@ -2591,18 +2602,9 @@ static unsigned long do_brk(unsigned long addr, unsigned long len)
2591 if (error & ~PAGE_MASK) 2602 if (error & ~PAGE_MASK)
2592 return error; 2603 return error;
2593 2604
2594 /* 2605 error = mlock_future_check(mm, mm->def_flags, len);
2595 * mlock MCL_FUTURE? 2606 if (error)
2596 */ 2607 return error;
2597 if (mm->def_flags & VM_LOCKED) {
2598 unsigned long locked, lock_limit;
2599 locked = len >> PAGE_SHIFT;
2600 locked += mm->locked_vm;
2601 lock_limit = rlimit(RLIMIT_MEMLOCK);
2602 lock_limit >>= PAGE_SHIFT;
2603 if (locked > lock_limit && !capable(CAP_IPC_LOCK))
2604 return -EAGAIN;
2605 }
2606 2608
2607 /* 2609 /*
2608 * mm->mmap_sem is required to protect against another thread 2610 * mm->mmap_sem is required to protect against another thread
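
The mmap.c hunks factor the duplicated VM_LOCKED limit check out of do_mmap_pgoff() and do_brk() into mlock_future_check(): would the already locked pages plus the new request exceed RLIMIT_MEMLOCK, absent CAP_IPC_LOCK? A standalone sketch of that arithmetic (the signature below is the sketch's own, with the rlimit and capability passed in explicitly rather than looked up):

#include <stdio.h>
#include <stdbool.h>

#define PAGE_SHIFT      12
#define EAGAIN_SKETCH   11

/* Stand-in for the fields of mm_struct the real helper reads. */
struct mm_sketch {
        unsigned long locked_vm;        /* pages already mlocked */
};

static int mlock_future_check_sketch(struct mm_sketch *mm, bool vm_locked,
                                     unsigned long len, unsigned long limit_bytes,
                                     bool cap_ipc_lock)
{
        if (!vm_locked)
                return 0;               /* no MCL_FUTURE-style locking requested */

        unsigned long locked = (len >> PAGE_SHIFT) + mm->locked_vm;
        unsigned long lock_limit = limit_bytes >> PAGE_SHIFT;

        if (locked > lock_limit && !cap_ipc_lock)
                return -EAGAIN_SKETCH;
        return 0;
}

int main(void)
{
        struct mm_sketch mm = { .locked_vm = 4 };

        /* 64kB request against a 32kB limit, no capability: refused. */
        printf("%d\n", mlock_future_check_sketch(&mm, true, 64 << 10, 32 << 10, false));
        return 0;
}
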
diff --git a/mm/mprotect.c b/mm/mprotect.c
index bb53a6591aea..7332c1785744 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -23,6 +23,7 @@
23#include <linux/mmu_notifier.h> 23#include <linux/mmu_notifier.h>
24#include <linux/migrate.h> 24#include <linux/migrate.h>
25#include <linux/perf_event.h> 25#include <linux/perf_event.h>
26#include <linux/ksm.h>
26#include <asm/uaccess.h> 27#include <asm/uaccess.h>
27#include <asm/pgtable.h> 28#include <asm/pgtable.h>
28#include <asm/cacheflush.h> 29#include <asm/cacheflush.h>
@@ -63,7 +64,7 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
63 64
64 ptent = *pte; 65 ptent = *pte;
65 page = vm_normal_page(vma, addr, oldpte); 66 page = vm_normal_page(vma, addr, oldpte);
66 if (page) { 67 if (page && !PageKsm(page)) {
67 if (!pte_numa(oldpte)) { 68 if (!pte_numa(oldpte)) {
68 ptent = pte_mknuma(ptent); 69 ptent = pte_mknuma(ptent);
69 set_pte_at(mm, addr, pte, ptent); 70 set_pte_at(mm, addr, pte, ptent);
diff --git a/mm/nobootmem.c b/mm/nobootmem.c
index 2c254d374655..19121ceb8874 100644
--- a/mm/nobootmem.c
+++ b/mm/nobootmem.c
@@ -41,7 +41,7 @@ static void * __init __alloc_memory_core_early(int nid, u64 size, u64 align,
41 if (limit > memblock.current_limit) 41 if (limit > memblock.current_limit)
42 limit = memblock.current_limit; 42 limit = memblock.current_limit;
43 43
44 addr = memblock_find_in_range_node(goal, limit, size, align, nid); 44 addr = memblock_find_in_range_node(size, align, goal, limit, nid);
45 if (!addr) 45 if (!addr)
46 return NULL; 46 return NULL;
47 47
@@ -117,7 +117,7 @@ static unsigned long __init free_low_memory_core_early(void)
117 phys_addr_t start, end, size; 117 phys_addr_t start, end, size;
118 u64 i; 118 u64 i;
119 119
120 for_each_free_mem_range(i, MAX_NUMNODES, &start, &end, NULL) 120 for_each_free_mem_range(i, NUMA_NO_NODE, &start, &end, NULL)
121 count += __free_memory_core(start, end); 121 count += __free_memory_core(start, end);
122 122
123 /* free range that is used for reserved array if we allocate it */ 123 /* free range that is used for reserved array if we allocate it */
@@ -161,7 +161,7 @@ unsigned long __init free_all_bootmem(void)
161 reset_all_zones_managed_pages(); 161 reset_all_zones_managed_pages();
162 162
163 /* 163 /*
164 * We need to use MAX_NUMNODES instead of NODE_DATA(0)->node_id 164 * We need to use NUMA_NO_NODE instead of NODE_DATA(0)->node_id
165 * because in some case like Node0 doesn't have RAM installed 165 * because in some case like Node0 doesn't have RAM installed
166 * low ram will be on Node1 166 * low ram will be on Node1
167 */ 167 */
@@ -215,7 +215,7 @@ static void * __init ___alloc_bootmem_nopanic(unsigned long size,
215 215
216restart: 216restart:
217 217
218 ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align, goal, limit); 218 ptr = __alloc_memory_core_early(NUMA_NO_NODE, size, align, goal, limit);
219 219
220 if (ptr) 220 if (ptr)
221 return ptr; 221 return ptr;
@@ -299,7 +299,7 @@ again:
299 if (ptr) 299 if (ptr)
300 return ptr; 300 return ptr;
301 301
302 ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align, 302 ptr = __alloc_memory_core_early(NUMA_NO_NODE, size, align,
303 goal, limit); 303 goal, limit);
304 if (ptr) 304 if (ptr)
305 return ptr; 305 return ptr;
diff --git a/mm/nommu.c b/mm/nommu.c
index fec093adad9c..8740213b1647 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -60,6 +60,7 @@ unsigned long highest_memmap_pfn;
60struct percpu_counter vm_committed_as; 60struct percpu_counter vm_committed_as;
61int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */ 61int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */
62int sysctl_overcommit_ratio = 50; /* default is 50% */ 62int sysctl_overcommit_ratio = 50; /* default is 50% */
63unsigned long sysctl_overcommit_kbytes __read_mostly;
63int sysctl_max_map_count = DEFAULT_MAX_MAP_COUNT; 64int sysctl_max_map_count = DEFAULT_MAX_MAP_COUNT;
64int sysctl_nr_trim_pages = CONFIG_NOMMU_INITIAL_TRIM_EXCESS; 65int sysctl_nr_trim_pages = CONFIG_NOMMU_INITIAL_TRIM_EXCESS;
65unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */ 66unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 1e4a600a6163..054ff47c4478 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -47,19 +47,21 @@ static DEFINE_SPINLOCK(zone_scan_lock);
47#ifdef CONFIG_NUMA 47#ifdef CONFIG_NUMA
48/** 48/**
 49 * has_intersects_mems_allowed() - check task eligibility for kill 49 * has_intersects_mems_allowed() - check task eligibility for kill
50 * @tsk: task struct of which task to consider 50 * @start: task struct of which task to consider
51 * @mask: nodemask passed to page allocator for mempolicy ooms 51 * @mask: nodemask passed to page allocator for mempolicy ooms
52 * 52 *
53 * Task eligibility is determined by whether or not a candidate task, @tsk, 53 * Task eligibility is determined by whether or not a candidate task, @tsk,
54 * shares the same mempolicy nodes as current if it is bound by such a policy 54 * shares the same mempolicy nodes as current if it is bound by such a policy
55 * and whether or not it has the same set of allowed cpuset nodes. 55 * and whether or not it has the same set of allowed cpuset nodes.
56 */ 56 */
57static bool has_intersects_mems_allowed(struct task_struct *tsk, 57static bool has_intersects_mems_allowed(struct task_struct *start,
58 const nodemask_t *mask) 58 const nodemask_t *mask)
59{ 59{
60 struct task_struct *start = tsk; 60 struct task_struct *tsk;
61 bool ret = false;
61 62
62 do { 63 rcu_read_lock();
64 for_each_thread(start, tsk) {
63 if (mask) { 65 if (mask) {
64 /* 66 /*
65 * If this is a mempolicy constrained oom, tsk's 67 * If this is a mempolicy constrained oom, tsk's
@@ -67,19 +69,20 @@ static bool has_intersects_mems_allowed(struct task_struct *tsk,
67 * mempolicy intersects current, otherwise it may be 69 * mempolicy intersects current, otherwise it may be
68 * needlessly killed. 70 * needlessly killed.
69 */ 71 */
70 if (mempolicy_nodemask_intersects(tsk, mask)) 72 ret = mempolicy_nodemask_intersects(tsk, mask);
71 return true;
72 } else { 73 } else {
73 /* 74 /*
74 * This is not a mempolicy constrained oom, so only 75 * This is not a mempolicy constrained oom, so only
75 * check the mems of tsk's cpuset. 76 * check the mems of tsk's cpuset.
76 */ 77 */
77 if (cpuset_mems_allowed_intersects(current, tsk)) 78 ret = cpuset_mems_allowed_intersects(current, tsk);
78 return true;
79 } 79 }
80 } while_each_thread(start, tsk); 80 if (ret)
81 break;
82 }
83 rcu_read_unlock();
81 84
82 return false; 85 return ret;
83} 86}
84#else 87#else
85static bool has_intersects_mems_allowed(struct task_struct *tsk, 88static bool has_intersects_mems_allowed(struct task_struct *tsk,
@@ -97,16 +100,21 @@ static bool has_intersects_mems_allowed(struct task_struct *tsk,
97 */ 100 */
98struct task_struct *find_lock_task_mm(struct task_struct *p) 101struct task_struct *find_lock_task_mm(struct task_struct *p)
99{ 102{
100 struct task_struct *t = p; 103 struct task_struct *t;
101 104
102 do { 105 rcu_read_lock();
106
107 for_each_thread(p, t) {
103 task_lock(t); 108 task_lock(t);
104 if (likely(t->mm)) 109 if (likely(t->mm))
105 return t; 110 goto found;
106 task_unlock(t); 111 task_unlock(t);
107 } while_each_thread(p, t); 112 }
113 t = NULL;
114found:
115 rcu_read_unlock();
108 116
109 return NULL; 117 return t;
110} 118}
111 119
112/* return true if the task is not adequate as candidate victim task. */ 120/* return true if the task is not adequate as candidate victim task. */
@@ -301,7 +309,7 @@ static struct task_struct *select_bad_process(unsigned int *ppoints,
301 unsigned long chosen_points = 0; 309 unsigned long chosen_points = 0;
302 310
303 rcu_read_lock(); 311 rcu_read_lock();
304 do_each_thread(g, p) { 312 for_each_process_thread(g, p) {
305 unsigned int points; 313 unsigned int points;
306 314
307 switch (oom_scan_process_thread(p, totalpages, nodemask, 315 switch (oom_scan_process_thread(p, totalpages, nodemask,
@@ -323,7 +331,7 @@ static struct task_struct *select_bad_process(unsigned int *ppoints,
323 chosen = p; 331 chosen = p;
324 chosen_points = points; 332 chosen_points = points;
325 } 333 }
326 } while_each_thread(g, p); 334 }
327 if (chosen) 335 if (chosen)
328 get_task_struct(chosen); 336 get_task_struct(chosen);
329 rcu_read_unlock(); 337 rcu_read_unlock();
@@ -406,7 +414,7 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
406{ 414{
407 struct task_struct *victim = p; 415 struct task_struct *victim = p;
408 struct task_struct *child; 416 struct task_struct *child;
409 struct task_struct *t = p; 417 struct task_struct *t;
410 struct mm_struct *mm; 418 struct mm_struct *mm;
411 unsigned int victim_points = 0; 419 unsigned int victim_points = 0;
412 static DEFINE_RATELIMIT_STATE(oom_rs, DEFAULT_RATELIMIT_INTERVAL, 420 static DEFINE_RATELIMIT_STATE(oom_rs, DEFAULT_RATELIMIT_INTERVAL,
@@ -437,7 +445,7 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
437 * still freeing memory. 445 * still freeing memory.
438 */ 446 */
439 read_lock(&tasklist_lock); 447 read_lock(&tasklist_lock);
440 do { 448 for_each_thread(p, t) {
441 list_for_each_entry(child, &t->children, sibling) { 449 list_for_each_entry(child, &t->children, sibling) {
442 unsigned int child_points; 450 unsigned int child_points;
443 451
@@ -455,13 +463,11 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
455 get_task_struct(victim); 463 get_task_struct(victim);
456 } 464 }
457 } 465 }
458 } while_each_thread(p, t); 466 }
459 read_unlock(&tasklist_lock); 467 read_unlock(&tasklist_lock);
460 468
461 rcu_read_lock();
462 p = find_lock_task_mm(victim); 469 p = find_lock_task_mm(victim);
463 if (!p) { 470 if (!p) {
464 rcu_read_unlock();
465 put_task_struct(victim); 471 put_task_struct(victim);
466 return; 472 return;
467 } else if (victim != p) { 473 } else if (victim != p) {
@@ -487,6 +493,7 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
487 * That thread will now get access to memory reserves since it has a 493 * That thread will now get access to memory reserves since it has a
488 * pending fatal signal. 494 * pending fatal signal.
489 */ 495 */
496 rcu_read_lock();
490 for_each_process(p) 497 for_each_process(p)
491 if (p->mm == mm && !same_thread_group(p, victim) && 498 if (p->mm == mm && !same_thread_group(p, victim) &&
492 !(p->flags & PF_KTHREAD)) { 499 !(p->flags & PF_KTHREAD)) {
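
The oom_kill.c hunks replace the old do/while_each_thread loops with for_each_thread() under rcu_read_lock(); find_lock_task_mm() in particular walks the thread group and returns the first thread that still owns an mm with its task lock held. A user-space sketch of that find-and-return-locked pattern, with an array and pthread mutexes standing in for the RCU-protected thread list and task_lock():

#include <stdio.h>
#include <stddef.h>
#include <pthread.h>

/* Toy task: a per-task lock and a possibly-NULL address space pointer. */
struct task_sketch {
        pthread_mutex_t lock;
        void *mm;
};

/* Return the first thread in the group that still has an mm, with its
 * per-task lock held; the caller must unlock it.  NULL if none is left. */
static struct task_sketch *find_lock_task_mm_sketch(struct task_sketch *group,
                                                    size_t nr)
{
        for (size_t i = 0; i < nr; i++) {
                struct task_sketch *t = &group[i];

                pthread_mutex_lock(&t->lock);
                if (t->mm)
                        return t;       /* returned locked, like task_lock() */
                pthread_mutex_unlock(&t->lock);
        }
        return NULL;
}

int main(void)
{
        int fake_mm;
        struct task_sketch group[2] = {
                { PTHREAD_MUTEX_INITIALIZER, NULL },            /* already exited */
                { PTHREAD_MUTEX_INITIALIZER, &fake_mm },
        };
        struct task_sketch *t = find_lock_task_mm_sketch(group, 2);

        printf("found thread with mm: %s\n", t ? "yes" : "no");
        if (t)
                pthread_mutex_unlock(&t->lock);
        return 0;
}

As with the kernel helper, the caller owns the lock of whatever non-NULL thread it gets back.
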
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 5248fe070aa4..533e2147d14f 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2072,13 +2072,6 @@ void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...)
2072 return; 2072 return;
2073 2073
2074 /* 2074 /*
2075 * Walking all memory to count page types is very expensive and should
2076 * be inhibited in non-blockable contexts.
2077 */
2078 if (!(gfp_mask & __GFP_WAIT))
2079 filter |= SHOW_MEM_FILTER_PAGE_COUNT;
2080
2081 /*
2082 * This documents exceptions given to allocations in certain 2075 * This documents exceptions given to allocations in certain
2083 * contexts that are allowed to allocate outside current's set 2076 * contexts that are allowed to allocate outside current's set
2084 * of allowed nodes. 2077 * of allowed nodes.
@@ -2242,10 +2235,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
2242 preferred_zone, migratetype); 2235 preferred_zone, migratetype);
2243 if (page) { 2236 if (page) {
2244 preferred_zone->compact_blockskip_flush = false; 2237 preferred_zone->compact_blockskip_flush = false;
2245 preferred_zone->compact_considered = 0; 2238 compaction_defer_reset(preferred_zone, order, true);
2246 preferred_zone->compact_defer_shift = 0;
2247 if (order >= preferred_zone->compact_order_failed)
2248 preferred_zone->compact_order_failed = order + 1;
2249 count_vm_event(COMPACTSUCCESS); 2239 count_vm_event(COMPACTSUCCESS);
2250 return page; 2240 return page;
2251 } 2241 }
@@ -2535,8 +2525,15 @@ rebalance:
2535 } 2525 }
2536 2526
2537 /* Atomic allocations - we can't balance anything */ 2527 /* Atomic allocations - we can't balance anything */
2538 if (!wait) 2528 if (!wait) {
2529 /*
2530 * All existing users of the deprecated __GFP_NOFAIL are
2531 * blockable, so warn of any new users that actually allow this
2532 * type of allocation to fail.
2533 */
2534 WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL);
2539 goto nopage; 2535 goto nopage;
2536 }
2540 2537
2541 /* Avoid recursion of direct reclaim */ 2538 /* Avoid recursion of direct reclaim */
2542 if (current->flags & PF_MEMALLOC) 2539 if (current->flags & PF_MEMALLOC)
@@ -3901,6 +3898,7 @@ static void setup_zone_migrate_reserve(struct zone *zone)
3901 struct page *page; 3898 struct page *page;
3902 unsigned long block_migratetype; 3899 unsigned long block_migratetype;
3903 int reserve; 3900 int reserve;
3901 int old_reserve;
3904 3902
3905 /* 3903 /*
3906 * Get the start pfn, end pfn and the number of blocks to reserve 3904 * Get the start pfn, end pfn and the number of blocks to reserve
@@ -3922,6 +3920,12 @@ static void setup_zone_migrate_reserve(struct zone *zone)
3922 * future allocation of hugepages at runtime. 3920 * future allocation of hugepages at runtime.
3923 */ 3921 */
3924 reserve = min(2, reserve); 3922 reserve = min(2, reserve);
3923 old_reserve = zone->nr_migrate_reserve_block;
3924
 3925 /* On memory hot-add, we almost always need to do nothing */
3926 if (reserve == old_reserve)
3927 return;
3928 zone->nr_migrate_reserve_block = reserve;
3925 3929
3926 for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) { 3930 for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) {
3927 if (!pfn_valid(pfn)) 3931 if (!pfn_valid(pfn))
@@ -3959,6 +3963,12 @@ static void setup_zone_migrate_reserve(struct zone *zone)
3959 reserve--; 3963 reserve--;
3960 continue; 3964 continue;
3961 } 3965 }
3966 } else if (!old_reserve) {
3967 /*
3968 * At boot time we don't need to scan the whole zone
3969 * for turning off MIGRATE_RESERVE.
3970 */
3971 break;
3962 } 3972 }
3963 3973
3964 /* 3974 /*
@@ -4209,7 +4219,6 @@ static noinline __init_refok
4209int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages) 4219int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
4210{ 4220{
4211 int i; 4221 int i;
4212 struct pglist_data *pgdat = zone->zone_pgdat;
4213 size_t alloc_size; 4222 size_t alloc_size;
4214 4223
4215 /* 4224 /*
@@ -4225,7 +4234,8 @@ int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
4225 4234
4226 if (!slab_is_available()) { 4235 if (!slab_is_available()) {
4227 zone->wait_table = (wait_queue_head_t *) 4236 zone->wait_table = (wait_queue_head_t *)
4228 alloc_bootmem_node_nopanic(pgdat, alloc_size); 4237 memblock_virt_alloc_node_nopanic(
4238 alloc_size, zone->zone_pgdat->node_id);
4229 } else { 4239 } else {
4230 /* 4240 /*
4231 * This case means that a zone whose size was 0 gets new memory 4241 * This case means that a zone whose size was 0 gets new memory
@@ -4345,13 +4355,14 @@ bool __meminit early_pfn_in_nid(unsigned long pfn, int node)
4345#endif 4355#endif
4346 4356
4347/** 4357/**
4348 * free_bootmem_with_active_regions - Call free_bootmem_node for each active range 4358 * free_bootmem_with_active_regions - Call memblock_free_early_nid for each active range
4349 * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed. 4359 * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed.
4350 * @max_low_pfn: The highest PFN that will be passed to free_bootmem_node 4360 * @max_low_pfn: The highest PFN that will be passed to memblock_free_early_nid
4351 * 4361 *
4352 * If an architecture guarantees that all ranges registered with 4362 * If an architecture guarantees that all ranges registered with
4353 * add_active_ranges() contain no holes and may be freed, this 4363 * add_active_ranges() contain no holes and may be freed, this
4354 * this function may be used instead of calling free_bootmem() manually. 4364 * this function may be used instead of calling memblock_free_early_nid()
4365 * manually.
4355 */ 4366 */
4356void __init free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn) 4367void __init free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn)
4357{ 4368{
@@ -4363,9 +4374,9 @@ void __init free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn)
4363 end_pfn = min(end_pfn, max_low_pfn); 4374 end_pfn = min(end_pfn, max_low_pfn);
4364 4375
4365 if (start_pfn < end_pfn) 4376 if (start_pfn < end_pfn)
4366 free_bootmem_node(NODE_DATA(this_nid), 4377 memblock_free_early_nid(PFN_PHYS(start_pfn),
4367 PFN_PHYS(start_pfn), 4378 (end_pfn - start_pfn) << PAGE_SHIFT,
4368 (end_pfn - start_pfn) << PAGE_SHIFT); 4379 this_nid);
4369 } 4380 }
4370} 4381}
4371 4382
@@ -4636,8 +4647,9 @@ static void __init setup_usemap(struct pglist_data *pgdat,
4636 unsigned long usemapsize = usemap_size(zone_start_pfn, zonesize); 4647 unsigned long usemapsize = usemap_size(zone_start_pfn, zonesize);
4637 zone->pageblock_flags = NULL; 4648 zone->pageblock_flags = NULL;
4638 if (usemapsize) 4649 if (usemapsize)
4639 zone->pageblock_flags = alloc_bootmem_node_nopanic(pgdat, 4650 zone->pageblock_flags =
4640 usemapsize); 4651 memblock_virt_alloc_node_nopanic(usemapsize,
4652 pgdat->node_id);
4641} 4653}
4642#else 4654#else
4643static inline void setup_usemap(struct pglist_data *pgdat, struct zone *zone, 4655static inline void setup_usemap(struct pglist_data *pgdat, struct zone *zone,
@@ -4831,7 +4843,8 @@ static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat)
4831 size = (end - start) * sizeof(struct page); 4843 size = (end - start) * sizeof(struct page);
4832 map = alloc_remap(pgdat->node_id, size); 4844 map = alloc_remap(pgdat->node_id, size);
4833 if (!map) 4845 if (!map)
4834 map = alloc_bootmem_node_nopanic(pgdat, size); 4846 map = memblock_virt_alloc_node_nopanic(size,
4847 pgdat->node_id);
4835 pgdat->node_mem_map = map + (pgdat->node_start_pfn - start); 4848 pgdat->node_mem_map = map + (pgdat->node_start_pfn - start);
4836 } 4849 }
4837#ifndef CONFIG_NEED_MULTIPLE_NODES 4850#ifndef CONFIG_NEED_MULTIPLE_NODES
@@ -5012,9 +5025,33 @@ static void __init find_zone_movable_pfns_for_nodes(void)
5012 nodemask_t saved_node_state = node_states[N_MEMORY]; 5025 nodemask_t saved_node_state = node_states[N_MEMORY];
5013 unsigned long totalpages = early_calculate_totalpages(); 5026 unsigned long totalpages = early_calculate_totalpages();
5014 int usable_nodes = nodes_weight(node_states[N_MEMORY]); 5027 int usable_nodes = nodes_weight(node_states[N_MEMORY]);
5028 struct memblock_type *type = &memblock.memory;
5029
5030 /* Need to find movable_zone earlier when movable_node is specified. */
5031 find_usable_zone_for_movable();
5032
5033 /*
5034 * If movable_node is specified, ignore kernelcore and movablecore
5035 * options.
5036 */
5037 if (movable_node_is_enabled()) {
5038 for (i = 0; i < type->cnt; i++) {
5039 if (!memblock_is_hotpluggable(&type->regions[i]))
5040 continue;
5041
5042 nid = type->regions[i].nid;
5043
5044 usable_startpfn = PFN_DOWN(type->regions[i].base);
5045 zone_movable_pfn[nid] = zone_movable_pfn[nid] ?
5046 min(usable_startpfn, zone_movable_pfn[nid]) :
5047 usable_startpfn;
5048 }
5049
5050 goto out2;
5051 }
5015 5052
5016 /* 5053 /*
5017 * If movablecore was specified, calculate what size of 5054 * If movablecore=nn[KMG] was specified, calculate what size of
5018 * kernelcore that corresponds so that memory usable for 5055 * kernelcore that corresponds so that memory usable for
5019 * any allocation type is evenly spread. If both kernelcore 5056 * any allocation type is evenly spread. If both kernelcore
5020 * and movablecore are specified, then the value of kernelcore 5057 * and movablecore are specified, then the value of kernelcore
@@ -5040,7 +5077,6 @@ static void __init find_zone_movable_pfns_for_nodes(void)
5040 goto out; 5077 goto out;
5041 5078
5042 /* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */ 5079 /* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */
5043 find_usable_zone_for_movable();
5044 usable_startpfn = arch_zone_lowest_possible_pfn[movable_zone]; 5080 usable_startpfn = arch_zone_lowest_possible_pfn[movable_zone];
5045 5081
5046restart: 5082restart:
@@ -5131,6 +5167,7 @@ restart:
5131 if (usable_nodes && required_kernelcore > usable_nodes) 5167 if (usable_nodes && required_kernelcore > usable_nodes)
5132 goto restart; 5168 goto restart;
5133 5169
5170out2:
5134 /* Align start of ZONE_MOVABLE on all nids to MAX_ORDER_NR_PAGES */ 5171 /* Align start of ZONE_MOVABLE on all nids to MAX_ORDER_NR_PAGES */
5135 for (nid = 0; nid < MAX_NUMNODES; nid++) 5172 for (nid = 0; nid < MAX_NUMNODES; nid++)
5136 zone_movable_pfn[nid] = 5173 zone_movable_pfn[nid] =
@@ -5857,7 +5894,7 @@ void *__init alloc_large_system_hash(const char *tablename,
5857 do { 5894 do {
5858 size = bucketsize << log2qty; 5895 size = bucketsize << log2qty;
5859 if (flags & HASH_EARLY) 5896 if (flags & HASH_EARLY)
5860 table = alloc_bootmem_nopanic(size); 5897 table = memblock_virt_alloc_nopanic(size, 0);
5861 else if (hashdist) 5898 else if (hashdist)
5862 table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL); 5899 table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL);
5863 else { 5900 else {
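
Among the page_alloc.c changes, the movable_node branch in find_zone_movable_pfns_for_nodes() walks memblock.memory and records, per node, the lowest start pfn of any hotpluggable region as the beginning of ZONE_MOVABLE, so hotpluggable memory never hosts unmovable kernel allocations. A self-contained sketch of that per-node minimum (the flattened region table is hypothetical):

#include <stdio.h>

#define MAX_NODES       4
#define PAGE_SHIFT      12

/* Hypothetical flattened view of memblock.memory for the sketch. */
struct region_sketch {
        unsigned long long base;
        int nid;
        int hotpluggable;
};

int main(void)
{
        struct region_sketch regions[] = {
                { 0x000000000ULL, 0, 0 },       /* boot memory, not hotpluggable   */
                { 0x100000000ULL, 1, 1 },
                { 0x080000000ULL, 1, 1 },       /* lower hotpluggable range, node 1 */
        };
        unsigned long zone_movable_pfn[MAX_NODES] = { 0 };

        /* ZONE_MOVABLE starts at the lowest hotpluggable pfn on each node. */
        for (size_t i = 0; i < sizeof(regions) / sizeof(regions[0]); i++) {
                if (!regions[i].hotpluggable)
                        continue;
                unsigned long start_pfn = regions[i].base >> PAGE_SHIFT;
                int nid = regions[i].nid;

                zone_movable_pfn[nid] = zone_movable_pfn[nid] ?
                        (start_pfn < zone_movable_pfn[nid] ?
                                start_pfn : zone_movable_pfn[nid]) :
                        start_pfn;
        }

        for (int nid = 0; nid < MAX_NODES; nid++)
                printf("node %d movable starts at pfn %#lx\n",
                       nid, zone_movable_pfn[nid]);
        return 0;
}
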
diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c
index 3bd0b8e6ab12..cfd162882c00 100644
--- a/mm/page_cgroup.c
+++ b/mm/page_cgroup.c
@@ -54,8 +54,9 @@ static int __init alloc_node_page_cgroup(int nid)
54 54
55 table_size = sizeof(struct page_cgroup) * nr_pages; 55 table_size = sizeof(struct page_cgroup) * nr_pages;
56 56
57 base = __alloc_bootmem_node_nopanic(NODE_DATA(nid), 57 base = memblock_virt_alloc_try_nid_nopanic(
58 table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS)); 58 table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS),
59 BOOTMEM_ALLOC_ACCESSIBLE, nid);
59 if (!base) 60 if (!base)
60 return -ENOMEM; 61 return -ENOMEM;
61 NODE_DATA(nid)->node_page_cgroup = base; 62 NODE_DATA(nid)->node_page_cgroup = base;
diff --git a/mm/percpu.c b/mm/percpu.c
index afbf352ae580..036cfe07050f 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -1063,7 +1063,7 @@ struct pcpu_alloc_info * __init pcpu_alloc_alloc_info(int nr_groups,
1063 __alignof__(ai->groups[0].cpu_map[0])); 1063 __alignof__(ai->groups[0].cpu_map[0]));
1064 ai_size = base_size + nr_units * sizeof(ai->groups[0].cpu_map[0]); 1064 ai_size = base_size + nr_units * sizeof(ai->groups[0].cpu_map[0]);
1065 1065
1066 ptr = alloc_bootmem_nopanic(PFN_ALIGN(ai_size)); 1066 ptr = memblock_virt_alloc_nopanic(PFN_ALIGN(ai_size), 0);
1067 if (!ptr) 1067 if (!ptr)
1068 return NULL; 1068 return NULL;
1069 ai = ptr; 1069 ai = ptr;
@@ -1088,7 +1088,7 @@ struct pcpu_alloc_info * __init pcpu_alloc_alloc_info(int nr_groups,
1088 */ 1088 */
1089void __init pcpu_free_alloc_info(struct pcpu_alloc_info *ai) 1089void __init pcpu_free_alloc_info(struct pcpu_alloc_info *ai)
1090{ 1090{
1091 free_bootmem(__pa(ai), ai->__ai_size); 1091 memblock_free_early(__pa(ai), ai->__ai_size);
1092} 1092}
1093 1093
1094/** 1094/**
@@ -1246,10 +1246,12 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
1246 PCPU_SETUP_BUG_ON(pcpu_verify_alloc_info(ai) < 0); 1246 PCPU_SETUP_BUG_ON(pcpu_verify_alloc_info(ai) < 0);
1247 1247
1248 /* process group information and build config tables accordingly */ 1248 /* process group information and build config tables accordingly */
1249 group_offsets = alloc_bootmem(ai->nr_groups * sizeof(group_offsets[0])); 1249 group_offsets = memblock_virt_alloc(ai->nr_groups *
1250 group_sizes = alloc_bootmem(ai->nr_groups * sizeof(group_sizes[0])); 1250 sizeof(group_offsets[0]), 0);
1251 unit_map = alloc_bootmem(nr_cpu_ids * sizeof(unit_map[0])); 1251 group_sizes = memblock_virt_alloc(ai->nr_groups *
1252 unit_off = alloc_bootmem(nr_cpu_ids * sizeof(unit_off[0])); 1252 sizeof(group_sizes[0]), 0);
1253 unit_map = memblock_virt_alloc(nr_cpu_ids * sizeof(unit_map[0]), 0);
1254 unit_off = memblock_virt_alloc(nr_cpu_ids * sizeof(unit_off[0]), 0);
1253 1255
1254 for (cpu = 0; cpu < nr_cpu_ids; cpu++) 1256 for (cpu = 0; cpu < nr_cpu_ids; cpu++)
1255 unit_map[cpu] = UINT_MAX; 1257 unit_map[cpu] = UINT_MAX;
@@ -1311,7 +1313,8 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
1311 * empty chunks. 1313 * empty chunks.
1312 */ 1314 */
1313 pcpu_nr_slots = __pcpu_size_to_slot(pcpu_unit_size) + 2; 1315 pcpu_nr_slots = __pcpu_size_to_slot(pcpu_unit_size) + 2;
1314 pcpu_slot = alloc_bootmem(pcpu_nr_slots * sizeof(pcpu_slot[0])); 1316 pcpu_slot = memblock_virt_alloc(
1317 pcpu_nr_slots * sizeof(pcpu_slot[0]), 0);
1315 for (i = 0; i < pcpu_nr_slots; i++) 1318 for (i = 0; i < pcpu_nr_slots; i++)
1316 INIT_LIST_HEAD(&pcpu_slot[i]); 1319 INIT_LIST_HEAD(&pcpu_slot[i]);
1317 1320
@@ -1322,7 +1325,7 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
1322 * covers static area + reserved area (mostly used for module 1325 * covers static area + reserved area (mostly used for module
1323 * static percpu allocation). 1326 * static percpu allocation).
1324 */ 1327 */
1325 schunk = alloc_bootmem(pcpu_chunk_struct_size); 1328 schunk = memblock_virt_alloc(pcpu_chunk_struct_size, 0);
1326 INIT_LIST_HEAD(&schunk->list); 1329 INIT_LIST_HEAD(&schunk->list);
1327 schunk->base_addr = base_addr; 1330 schunk->base_addr = base_addr;
1328 schunk->map = smap; 1331 schunk->map = smap;
@@ -1346,7 +1349,7 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
1346 1349
1347 /* init dynamic chunk if necessary */ 1350 /* init dynamic chunk if necessary */
1348 if (dyn_size) { 1351 if (dyn_size) {
1349 dchunk = alloc_bootmem(pcpu_chunk_struct_size); 1352 dchunk = memblock_virt_alloc(pcpu_chunk_struct_size, 0);
1350 INIT_LIST_HEAD(&dchunk->list); 1353 INIT_LIST_HEAD(&dchunk->list);
1351 dchunk->base_addr = base_addr; 1354 dchunk->base_addr = base_addr;
1352 dchunk->map = dmap; 1355 dchunk->map = dmap;
@@ -1626,7 +1629,7 @@ int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size,
1626 size_sum = ai->static_size + ai->reserved_size + ai->dyn_size; 1629 size_sum = ai->static_size + ai->reserved_size + ai->dyn_size;
1627 areas_size = PFN_ALIGN(ai->nr_groups * sizeof(void *)); 1630 areas_size = PFN_ALIGN(ai->nr_groups * sizeof(void *));
1628 1631
1629 areas = alloc_bootmem_nopanic(areas_size); 1632 areas = memblock_virt_alloc_nopanic(areas_size, 0);
1630 if (!areas) { 1633 if (!areas) {
1631 rc = -ENOMEM; 1634 rc = -ENOMEM;
1632 goto out_free; 1635 goto out_free;
@@ -1712,7 +1715,7 @@ out_free_areas:
1712out_free: 1715out_free:
1713 pcpu_free_alloc_info(ai); 1716 pcpu_free_alloc_info(ai);
1714 if (areas) 1717 if (areas)
1715 free_bootmem(__pa(areas), areas_size); 1718 memblock_free_early(__pa(areas), areas_size);
1716 return rc; 1719 return rc;
1717} 1720}
1718#endif /* BUILD_EMBED_FIRST_CHUNK */ 1721#endif /* BUILD_EMBED_FIRST_CHUNK */
@@ -1760,7 +1763,7 @@ int __init pcpu_page_first_chunk(size_t reserved_size,
1760 /* unaligned allocations can't be freed, round up to page size */ 1763 /* unaligned allocations can't be freed, round up to page size */
1761 pages_size = PFN_ALIGN(unit_pages * num_possible_cpus() * 1764 pages_size = PFN_ALIGN(unit_pages * num_possible_cpus() *
1762 sizeof(pages[0])); 1765 sizeof(pages[0]));
1763 pages = alloc_bootmem(pages_size); 1766 pages = memblock_virt_alloc(pages_size, 0);
1764 1767
1765 /* allocate pages */ 1768 /* allocate pages */
1766 j = 0; 1769 j = 0;
@@ -1823,7 +1826,7 @@ enomem:
1823 free_fn(page_address(pages[j]), PAGE_SIZE); 1826 free_fn(page_address(pages[j]), PAGE_SIZE);
1824 rc = -ENOMEM; 1827 rc = -ENOMEM;
1825out_free_ar: 1828out_free_ar:
1826 free_bootmem(__pa(pages), pages_size); 1829 memblock_free_early(__pa(pages), pages_size);
1827 pcpu_free_alloc_info(ai); 1830 pcpu_free_alloc_info(ai);
1828 return rc; 1831 return rc;
1829} 1832}
@@ -1848,12 +1851,13 @@ EXPORT_SYMBOL(__per_cpu_offset);
1848static void * __init pcpu_dfl_fc_alloc(unsigned int cpu, size_t size, 1851static void * __init pcpu_dfl_fc_alloc(unsigned int cpu, size_t size,
1849 size_t align) 1852 size_t align)
1850{ 1853{
1851 return __alloc_bootmem_nopanic(size, align, __pa(MAX_DMA_ADDRESS)); 1854 return memblock_virt_alloc_from_nopanic(
1855 size, align, __pa(MAX_DMA_ADDRESS));
1852} 1856}
1853 1857
1854static void __init pcpu_dfl_fc_free(void *ptr, size_t size) 1858static void __init pcpu_dfl_fc_free(void *ptr, size_t size)
1855{ 1859{
1856 free_bootmem(__pa(ptr), size); 1860 memblock_free_early(__pa(ptr), size);
1857} 1861}
1858 1862
1859void __init setup_per_cpu_areas(void) 1863void __init setup_per_cpu_areas(void)
@@ -1896,7 +1900,9 @@ void __init setup_per_cpu_areas(void)
1896 void *fc; 1900 void *fc;
1897 1901
1898 ai = pcpu_alloc_alloc_info(1, 1); 1902 ai = pcpu_alloc_alloc_info(1, 1);
1899 fc = __alloc_bootmem(unit_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS)); 1903 fc = memblock_virt_alloc_from_nopanic(unit_size,
1904 PAGE_SIZE,
1905 __pa(MAX_DMA_ADDRESS));
1900 if (!ai || !fc) 1906 if (!ai || !fc)
1901 panic("Failed to allocate memory for percpu areas."); 1907 panic("Failed to allocate memory for percpu areas.");
1902 /* kmemleak tracks the percpu allocations separately */ 1908 /* kmemleak tracks the percpu allocations separately */
diff --git a/mm/rmap.c b/mm/rmap.c
index 068522d8502a..962e2a1e13a0 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -660,17 +660,22 @@ int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma)
660 return 1; 660 return 1;
661} 661}
662 662
663struct page_referenced_arg {
664 int mapcount;
665 int referenced;
666 unsigned long vm_flags;
667 struct mem_cgroup *memcg;
668};
663/* 669/*
664 * Subfunctions of page_referenced: page_referenced_one called 670 * arg: page_referenced_arg will be passed
665 * repeatedly from either page_referenced_anon or page_referenced_file.
666 */ 671 */
667int page_referenced_one(struct page *page, struct vm_area_struct *vma, 672int page_referenced_one(struct page *page, struct vm_area_struct *vma,
668 unsigned long address, unsigned int *mapcount, 673 unsigned long address, void *arg)
669 unsigned long *vm_flags)
670{ 674{
671 struct mm_struct *mm = vma->vm_mm; 675 struct mm_struct *mm = vma->vm_mm;
672 spinlock_t *ptl; 676 spinlock_t *ptl;
673 int referenced = 0; 677 int referenced = 0;
678 struct page_referenced_arg *pra = arg;
674 679
675 if (unlikely(PageTransHuge(page))) { 680 if (unlikely(PageTransHuge(page))) {
676 pmd_t *pmd; 681 pmd_t *pmd;
@@ -682,13 +687,12 @@ int page_referenced_one(struct page *page, struct vm_area_struct *vma,
682 pmd = page_check_address_pmd(page, mm, address, 687 pmd = page_check_address_pmd(page, mm, address,
683 PAGE_CHECK_ADDRESS_PMD_FLAG, &ptl); 688 PAGE_CHECK_ADDRESS_PMD_FLAG, &ptl);
684 if (!pmd) 689 if (!pmd)
685 goto out; 690 return SWAP_AGAIN;
686 691
687 if (vma->vm_flags & VM_LOCKED) { 692 if (vma->vm_flags & VM_LOCKED) {
688 spin_unlock(ptl); 693 spin_unlock(ptl);
689 *mapcount = 0; /* break early from loop */ 694 pra->vm_flags |= VM_LOCKED;
690 *vm_flags |= VM_LOCKED; 695 return SWAP_FAIL; /* To break the loop */
691 goto out;
692 } 696 }
693 697
694 /* go ahead even if the pmd is pmd_trans_splitting() */ 698 /* go ahead even if the pmd is pmd_trans_splitting() */
@@ -704,13 +708,12 @@ int page_referenced_one(struct page *page, struct vm_area_struct *vma,
704 */ 708 */
705 pte = page_check_address(page, mm, address, &ptl, 0); 709 pte = page_check_address(page, mm, address, &ptl, 0);
706 if (!pte) 710 if (!pte)
707 goto out; 711 return SWAP_AGAIN;
708 712
709 if (vma->vm_flags & VM_LOCKED) { 713 if (vma->vm_flags & VM_LOCKED) {
710 pte_unmap_unlock(pte, ptl); 714 pte_unmap_unlock(pte, ptl);
711 *mapcount = 0; /* break early from loop */ 715 pra->vm_flags |= VM_LOCKED;
712 *vm_flags |= VM_LOCKED; 716 return SWAP_FAIL; /* To break the loop */
713 goto out;
714 } 717 }
715 718
716 if (ptep_clear_flush_young_notify(vma, address, pte)) { 719 if (ptep_clear_flush_young_notify(vma, address, pte)) {
@@ -727,113 +730,27 @@ int page_referenced_one(struct page *page, struct vm_area_struct *vma,
727 pte_unmap_unlock(pte, ptl); 730 pte_unmap_unlock(pte, ptl);
728 } 731 }
729 732
730 (*mapcount)--; 733 if (referenced) {
731 734 pra->referenced++;
732 if (referenced) 735 pra->vm_flags |= vma->vm_flags;
733 *vm_flags |= vma->vm_flags;
734out:
735 return referenced;
736}
737
738static int page_referenced_anon(struct page *page,
739 struct mem_cgroup *memcg,
740 unsigned long *vm_flags)
741{
742 unsigned int mapcount;
743 struct anon_vma *anon_vma;
744 pgoff_t pgoff;
745 struct anon_vma_chain *avc;
746 int referenced = 0;
747
748 anon_vma = page_lock_anon_vma_read(page);
749 if (!anon_vma)
750 return referenced;
751
752 mapcount = page_mapcount(page);
753 pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
754 anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) {
755 struct vm_area_struct *vma = avc->vma;
756 unsigned long address = vma_address(page, vma);
757 /*
758 * If we are reclaiming on behalf of a cgroup, skip
759 * counting on behalf of references from different
760 * cgroups
761 */
762 if (memcg && !mm_match_cgroup(vma->vm_mm, memcg))
763 continue;
764 referenced += page_referenced_one(page, vma, address,
765 &mapcount, vm_flags);
766 if (!mapcount)
767 break;
768 } 736 }
769 737
770 page_unlock_anon_vma_read(anon_vma); 738 pra->mapcount--;
771 return referenced; 739 if (!pra->mapcount)
740 return SWAP_SUCCESS; /* To break the loop */
741
742 return SWAP_AGAIN;
772} 743}
773 744
774/** 745static bool invalid_page_referenced_vma(struct vm_area_struct *vma, void *arg)
775 * page_referenced_file - referenced check for object-based rmap
776 * @page: the page we're checking references on.
777 * @memcg: target memory control group
778 * @vm_flags: collect encountered vma->vm_flags who actually referenced the page
779 *
780 * For an object-based mapped page, find all the places it is mapped and
781 * check/clear the referenced flag. This is done by following the page->mapping
782 * pointer, then walking the chain of vmas it holds. It returns the number
783 * of references it found.
784 *
785 * This function is only called from page_referenced for object-based pages.
786 */
787static int page_referenced_file(struct page *page,
788 struct mem_cgroup *memcg,
789 unsigned long *vm_flags)
790{ 746{
791 unsigned int mapcount; 747 struct page_referenced_arg *pra = arg;
792 struct address_space *mapping = page->mapping; 748 struct mem_cgroup *memcg = pra->memcg;
793 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
794 struct vm_area_struct *vma;
795 int referenced = 0;
796
797 /*
798 * The caller's checks on page->mapping and !PageAnon have made
799 * sure that this is a file page: the check for page->mapping
800 * excludes the case just before it gets set on an anon page.
801 */
802 BUG_ON(PageAnon(page));
803
804 /*
805 * The page lock not only makes sure that page->mapping cannot
806 * suddenly be NULLified by truncation, it makes sure that the
807 * structure at mapping cannot be freed and reused yet,
808 * so we can safely take mapping->i_mmap_mutex.
809 */
810 BUG_ON(!PageLocked(page));
811
812 mutex_lock(&mapping->i_mmap_mutex);
813 749
814 /* 750 if (!mm_match_cgroup(vma->vm_mm, memcg))
815 * i_mmap_mutex does not stabilize mapcount at all, but mapcount 751 return true;
816 * is more likely to be accurate if we note it after spinning.
817 */
818 mapcount = page_mapcount(page);
819
820 vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
821 unsigned long address = vma_address(page, vma);
822 /*
823 * If we are reclaiming on behalf of a cgroup, skip
824 * counting on behalf of references from different
825 * cgroups
826 */
827 if (memcg && !mm_match_cgroup(vma->vm_mm, memcg))
828 continue;
829 referenced += page_referenced_one(page, vma, address,
830 &mapcount, vm_flags);
831 if (!mapcount)
832 break;
833 }
834 752
835 mutex_unlock(&mapping->i_mmap_mutex); 753 return false;
836 return referenced;
837} 754}
838 755
839/** 756/**
@@ -851,41 +768,57 @@ int page_referenced(struct page *page,
851 struct mem_cgroup *memcg, 768 struct mem_cgroup *memcg,
852 unsigned long *vm_flags) 769 unsigned long *vm_flags)
853{ 770{
854 int referenced = 0; 771 int ret;
855 int we_locked = 0; 772 int we_locked = 0;
773 struct page_referenced_arg pra = {
774 .mapcount = page_mapcount(page),
775 .memcg = memcg,
776 };
777 struct rmap_walk_control rwc = {
778 .rmap_one = page_referenced_one,
779 .arg = (void *)&pra,
780 .anon_lock = page_lock_anon_vma_read,
781 };
856 782
857 *vm_flags = 0; 783 *vm_flags = 0;
858 if (page_mapped(page) && page_rmapping(page)) { 784 if (!page_mapped(page))
859 if (!is_locked && (!PageAnon(page) || PageKsm(page))) { 785 return 0;
860 we_locked = trylock_page(page); 786
861 if (!we_locked) { 787 if (!page_rmapping(page))
862 referenced++; 788 return 0;
863 goto out; 789
864 } 790 if (!is_locked && (!PageAnon(page) || PageKsm(page))) {
865 } 791 we_locked = trylock_page(page);
866 if (unlikely(PageKsm(page))) 792 if (!we_locked)
867 referenced += page_referenced_ksm(page, memcg, 793 return 1;
868 vm_flags);
869 else if (PageAnon(page))
870 referenced += page_referenced_anon(page, memcg,
871 vm_flags);
872 else if (page->mapping)
873 referenced += page_referenced_file(page, memcg,
874 vm_flags);
875 if (we_locked)
876 unlock_page(page);
877 } 794 }
878out: 795
879 return referenced; 796 /*
797 * If we are reclaiming on behalf of a cgroup, skip
798 * counting on behalf of references from different
799 * cgroups
800 */
801 if (memcg) {
802 rwc.invalid_vma = invalid_page_referenced_vma;
803 }
804
805 ret = rmap_walk(page, &rwc);
806 *vm_flags = pra.vm_flags;
807
808 if (we_locked)
809 unlock_page(page);
810
811 return pra.referenced;
880} 812}
881 813
882static int page_mkclean_one(struct page *page, struct vm_area_struct *vma, 814static int page_mkclean_one(struct page *page, struct vm_area_struct *vma,
883 unsigned long address) 815 unsigned long address, void *arg)
884{ 816{
885 struct mm_struct *mm = vma->vm_mm; 817 struct mm_struct *mm = vma->vm_mm;
886 pte_t *pte; 818 pte_t *pte;
887 spinlock_t *ptl; 819 spinlock_t *ptl;
888 int ret = 0; 820 int ret = 0;
821 int *cleaned = arg;
889 822
890 pte = page_check_address(page, mm, address, &ptl, 1); 823 pte = page_check_address(page, mm, address, &ptl, 1);
891 if (!pte) 824 if (!pte)
@@ -904,44 +837,44 @@ static int page_mkclean_one(struct page *page, struct vm_area_struct *vma,
904 837
905 pte_unmap_unlock(pte, ptl); 838 pte_unmap_unlock(pte, ptl);
906 839
907 if (ret) 840 if (ret) {
908 mmu_notifier_invalidate_page(mm, address); 841 mmu_notifier_invalidate_page(mm, address);
842 (*cleaned)++;
843 }
909out: 844out:
910 return ret; 845 return SWAP_AGAIN;
911} 846}
912 847
913static int page_mkclean_file(struct address_space *mapping, struct page *page) 848static bool invalid_mkclean_vma(struct vm_area_struct *vma, void *arg)
914{ 849{
915 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); 850 if (vma->vm_flags & VM_SHARED)
916 struct vm_area_struct *vma; 851 return 0;
917 int ret = 0;
918
919 BUG_ON(PageAnon(page));
920 852
921 mutex_lock(&mapping->i_mmap_mutex); 853 return 1;
922 vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
923 if (vma->vm_flags & VM_SHARED) {
924 unsigned long address = vma_address(page, vma);
925 ret += page_mkclean_one(page, vma, address);
926 }
927 }
928 mutex_unlock(&mapping->i_mmap_mutex);
929 return ret;
930} 854}
931 855
932int page_mkclean(struct page *page) 856int page_mkclean(struct page *page)
933{ 857{
934 int ret = 0; 858 int cleaned = 0;
859 struct address_space *mapping;
860 struct rmap_walk_control rwc = {
861 .arg = (void *)&cleaned,
862 .rmap_one = page_mkclean_one,
863 .invalid_vma = invalid_mkclean_vma,
864 };
935 865
936 BUG_ON(!PageLocked(page)); 866 BUG_ON(!PageLocked(page));
937 867
938 if (page_mapped(page)) { 868 if (!page_mapped(page))
939 struct address_space *mapping = page_mapping(page); 869 return 0;
940 if (mapping)
941 ret = page_mkclean_file(mapping, page);
942 }
943 870
944 return ret; 871 mapping = page_mapping(page);
872 if (!mapping)
873 return 0;
874
875 rmap_walk(page, &rwc);
876
877 return cleaned;
945} 878}
946EXPORT_SYMBOL_GPL(page_mkclean); 879EXPORT_SYMBOL_GPL(page_mkclean);
947 880
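
The two hunks above convert page_referenced() and page_mkclean() from dedicated per-mapping-type helpers (page_referenced_file(), page_mkclean_file()) into clients of the shared reverse-map walker: each caller fills an rmap_walk_control with an rmap_one callback, an opaque arg, and optional invalid_vma/done hooks, and rmap_walk() supplies the VMA iteration and locking. Below is a minimal userspace sketch of that control-structure pattern, assuming toy vma/page types; the names walk_control, walk() and count_one() are illustrative stand-ins, not the kernel API, and the kernel's rmap_one callback additionally receives the mapped address.

#include <stdbool.h>
#include <stdio.h>

/* Toy stand-ins for struct vm_area_struct and struct page. */
struct vma  { int id; };
struct page { int mapcount; };

#define WALK_AGAIN 1

/* Mirrors the shape of rmap_walk_control: callbacks plus an opaque arg. */
struct walk_control {
    int  (*rmap_one)(struct page *page, struct vma *vma, void *arg);
    bool (*invalid_vma)(struct vma *vma, void *arg); /* skip this vma? */
    int  (*done)(struct page *page);                 /* stop the walk? */
    void *arg;
};

/* The walker owns iteration (and, in the kernel, locking); all policy
 * lives in the callbacks installed by the caller. */
static int walk(struct page *page, struct vma *vmas, int nr,
                struct walk_control *wc)
{
    int ret = WALK_AGAIN;

    for (int i = 0; i < nr; i++) {
        if (wc->invalid_vma && wc->invalid_vma(&vmas[i], wc->arg))
            continue;
        ret = wc->rmap_one(page, &vmas[i], wc->arg);
        if (ret != WALK_AGAIN)
            break;
        if (wc->done && wc->done(page))
            break;
    }
    return ret;
}

/* A page_referenced()-style client: count visited mappings through arg. */
static int count_one(struct page *page, struct vma *vma, void *arg)
{
    (void)page; (void)vma;
    ++*(int *)arg;
    return WALK_AGAIN;
}

int main(void)
{
    struct vma vmas[3] = { {1}, {2}, {3} };
    struct page page = { .mapcount = 3 };
    int referenced = 0;
    struct walk_control wc = { .rmap_one = count_one, .arg = &referenced };

    walk(&page, vmas, 3, &wc);
    printf("referenced in %d vmas\n", referenced);
    return 0;
}

The point of the design is that policy lives entirely in the control structure, so the later conversions in this diff (try_to_unmap(), try_to_munlock()) differ from page_referenced() and page_mkclean() only in which callbacks and arg they install.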
@@ -1177,17 +1110,17 @@ out:
1177} 1110}
1178 1111
1179/* 1112/*
1180 * Subfunctions of try_to_unmap: try_to_unmap_one called 1113 * @arg: enum ttu_flags will be passed to this argument
1181 * repeatedly from try_to_unmap_ksm, try_to_unmap_anon or try_to_unmap_file.
1182 */ 1114 */
1183int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, 1115int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
1184 unsigned long address, enum ttu_flags flags) 1116 unsigned long address, void *arg)
1185{ 1117{
1186 struct mm_struct *mm = vma->vm_mm; 1118 struct mm_struct *mm = vma->vm_mm;
1187 pte_t *pte; 1119 pte_t *pte;
1188 pte_t pteval; 1120 pte_t pteval;
1189 spinlock_t *ptl; 1121 spinlock_t *ptl;
1190 int ret = SWAP_AGAIN; 1122 int ret = SWAP_AGAIN;
1123 enum ttu_flags flags = (enum ttu_flags)arg;
1191 1124
1192 pte = page_check_address(page, mm, address, &ptl, 0); 1125 pte = page_check_address(page, mm, address, &ptl, 0);
1193 if (!pte) 1126 if (!pte)
@@ -1426,124 +1359,18 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount,
1426 return ret; 1359 return ret;
1427} 1360}
1428 1361
1429bool is_vma_temporary_stack(struct vm_area_struct *vma) 1362static int try_to_unmap_nonlinear(struct page *page,
1430{ 1363 struct address_space *mapping, struct vm_area_struct *vma)
1431 int maybe_stack = vma->vm_flags & (VM_GROWSDOWN | VM_GROWSUP);
1432
1433 if (!maybe_stack)
1434 return false;
1435
1436 if ((vma->vm_flags & VM_STACK_INCOMPLETE_SETUP) ==
1437 VM_STACK_INCOMPLETE_SETUP)
1438 return true;
1439
1440 return false;
1441}
1442
1443/**
1444 * try_to_unmap_anon - unmap or unlock anonymous page using the object-based
1445 * rmap method
1446 * @page: the page to unmap/unlock
1447 * @flags: action and flags
1448 *
1449 * Find all the mappings of a page using the mapping pointer and the vma chains
1450 * contained in the anon_vma struct it points to.
1451 *
1452 * This function is only called from try_to_unmap/try_to_munlock for
1453 * anonymous pages.
1454 * When called from try_to_munlock(), the mmap_sem of the mm containing the vma
1455 * where the page was found will be held for write. So, we won't recheck
1456 * vm_flags for that VMA. That should be OK, because that vma shouldn't be
1457 * 'LOCKED.
1458 */
1459static int try_to_unmap_anon(struct page *page, enum ttu_flags flags)
1460{
1461 struct anon_vma *anon_vma;
1462 pgoff_t pgoff;
1463 struct anon_vma_chain *avc;
1464 int ret = SWAP_AGAIN;
1465
1466 anon_vma = page_lock_anon_vma_read(page);
1467 if (!anon_vma)
1468 return ret;
1469
1470 pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
1471 anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) {
1472 struct vm_area_struct *vma = avc->vma;
1473 unsigned long address;
1474
1475 /*
1476 * During exec, a temporary VMA is setup and later moved.
1477 * The VMA is moved under the anon_vma lock but not the
1478 * page tables leading to a race where migration cannot
1479 * find the migration ptes. Rather than increasing the
1480 * locking requirements of exec(), migration skips
1481 * temporary VMAs until after exec() completes.
1482 */
1483 if (IS_ENABLED(CONFIG_MIGRATION) && (flags & TTU_MIGRATION) &&
1484 is_vma_temporary_stack(vma))
1485 continue;
1486
1487 address = vma_address(page, vma);
1488 ret = try_to_unmap_one(page, vma, address, flags);
1489 if (ret != SWAP_AGAIN || !page_mapped(page))
1490 break;
1491 }
1492
1493 page_unlock_anon_vma_read(anon_vma);
1494 return ret;
1495}
1496
1497/**
1498 * try_to_unmap_file - unmap/unlock file page using the object-based rmap method
1499 * @page: the page to unmap/unlock
1500 * @flags: action and flags
1501 *
1502 * Find all the mappings of a page using the mapping pointer and the vma chains
1503 * contained in the address_space struct it points to.
1504 *
1505 * This function is only called from try_to_unmap/try_to_munlock for
1506 * object-based pages.
1507 * When called from try_to_munlock(), the mmap_sem of the mm containing the vma
1508 * where the page was found will be held for write. So, we won't recheck
1509 * vm_flags for that VMA. That should be OK, because that vma shouldn't be
1510 * 'LOCKED.
1511 */
1512static int try_to_unmap_file(struct page *page, enum ttu_flags flags)
1513{ 1364{
1514 struct address_space *mapping = page->mapping;
1515 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
1516 struct vm_area_struct *vma;
1517 int ret = SWAP_AGAIN; 1365 int ret = SWAP_AGAIN;
1518 unsigned long cursor; 1366 unsigned long cursor;
1519 unsigned long max_nl_cursor = 0; 1367 unsigned long max_nl_cursor = 0;
1520 unsigned long max_nl_size = 0; 1368 unsigned long max_nl_size = 0;
1521 unsigned int mapcount; 1369 unsigned int mapcount;
1522 1370
1523 if (PageHuge(page)) 1371 list_for_each_entry(vma,
1524 pgoff = page->index << compound_order(page); 1372 &mapping->i_mmap_nonlinear, shared.nonlinear) {
1525 1373
1526 mutex_lock(&mapping->i_mmap_mutex);
1527 vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
1528 unsigned long address = vma_address(page, vma);
1529 ret = try_to_unmap_one(page, vma, address, flags);
1530 if (ret != SWAP_AGAIN || !page_mapped(page))
1531 goto out;
1532 }
1533
1534 if (list_empty(&mapping->i_mmap_nonlinear))
1535 goto out;
1536
1537 /*
1538 * We don't bother to try to find the munlocked page in nonlinears.
1539 * It's costly. Instead, later, page reclaim logic may call
1540 * try_to_unmap(TTU_MUNLOCK) and recover PG_mlocked lazily.
1541 */
1542 if (TTU_ACTION(flags) == TTU_MUNLOCK)
1543 goto out;
1544
1545 list_for_each_entry(vma, &mapping->i_mmap_nonlinear,
1546 shared.nonlinear) {
1547 cursor = (unsigned long) vma->vm_private_data; 1374 cursor = (unsigned long) vma->vm_private_data;
1548 if (cursor > max_nl_cursor) 1375 if (cursor > max_nl_cursor)
1549 max_nl_cursor = cursor; 1376 max_nl_cursor = cursor;
@@ -1553,8 +1380,7 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags)
1553 } 1380 }
1554 1381
1555 if (max_nl_size == 0) { /* all nonlinears locked or reserved ? */ 1382 if (max_nl_size == 0) { /* all nonlinears locked or reserved ? */
1556 ret = SWAP_FAIL; 1383 return SWAP_FAIL;
1557 goto out;
1558 } 1384 }
1559 1385
1560 /* 1386 /*
@@ -1566,7 +1392,8 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags)
1566 */ 1392 */
1567 mapcount = page_mapcount(page); 1393 mapcount = page_mapcount(page);
1568 if (!mapcount) 1394 if (!mapcount)
1569 goto out; 1395 return ret;
1396
1570 cond_resched(); 1397 cond_resched();
1571 1398
1572 max_nl_size = (max_nl_size + CLUSTER_SIZE - 1) & CLUSTER_MASK; 1399 max_nl_size = (max_nl_size + CLUSTER_SIZE - 1) & CLUSTER_MASK;
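
The line above rounds max_nl_size up to a whole number of clusters with (max_nl_size + CLUSTER_SIZE - 1) & CLUSTER_MASK. The CLUSTER_SIZE/CLUSTER_MASK definitions are not part of this hunk, so assuming the usual arrangement (a power-of-two cluster size with CLUSTER_MASK being the complement of CLUSTER_SIZE - 1), this is the standard align-up idiom; a small standalone check:

#include <assert.h>
#include <stdio.h>

/* Round x up to the next multiple of align; align must be a power of two. */
static unsigned long align_up(unsigned long x, unsigned long align)
{
    return (x + align - 1) & ~(align - 1);
}

int main(void)
{
    /* With a 4096-byte "cluster": everything in 1..4096 rounds to 4096. */
    assert(align_up(1, 4096)    == 4096);
    assert(align_up(4096, 4096) == 4096);
    assert(align_up(4097, 4096) == 8192);

    printf("align_up(5000, 4096) = %lu\n", align_up(5000, 4096));
    return 0;
}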
@@ -1574,10 +1401,11 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags)
1574 max_nl_cursor = CLUSTER_SIZE; 1401 max_nl_cursor = CLUSTER_SIZE;
1575 1402
1576 do { 1403 do {
1577 list_for_each_entry(vma, &mapping->i_mmap_nonlinear, 1404 list_for_each_entry(vma,
1578 shared.nonlinear) { 1405 &mapping->i_mmap_nonlinear, shared.nonlinear) {
1406
1579 cursor = (unsigned long) vma->vm_private_data; 1407 cursor = (unsigned long) vma->vm_private_data;
1580 while ( cursor < max_nl_cursor && 1408 while (cursor < max_nl_cursor &&
1581 cursor < vma->vm_end - vma->vm_start) { 1409 cursor < vma->vm_end - vma->vm_start) {
1582 if (try_to_unmap_cluster(cursor, &mapcount, 1410 if (try_to_unmap_cluster(cursor, &mapcount,
1583 vma, page) == SWAP_MLOCK) 1411 vma, page) == SWAP_MLOCK)
@@ -1585,7 +1413,7 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags)
1585 cursor += CLUSTER_SIZE; 1413 cursor += CLUSTER_SIZE;
1586 vma->vm_private_data = (void *) cursor; 1414 vma->vm_private_data = (void *) cursor;
1587 if ((int)mapcount <= 0) 1415 if ((int)mapcount <= 0)
1588 goto out; 1416 return ret;
1589 } 1417 }
1590 vma->vm_private_data = (void *) max_nl_cursor; 1418 vma->vm_private_data = (void *) max_nl_cursor;
1591 } 1419 }
@@ -1600,11 +1428,34 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags)
1600 */ 1428 */
1601 list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.nonlinear) 1429 list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.nonlinear)
1602 vma->vm_private_data = NULL; 1430 vma->vm_private_data = NULL;
1603out: 1431
1604 mutex_unlock(&mapping->i_mmap_mutex);
1605 return ret; 1432 return ret;
1606} 1433}
1607 1434
1435bool is_vma_temporary_stack(struct vm_area_struct *vma)
1436{
1437 int maybe_stack = vma->vm_flags & (VM_GROWSDOWN | VM_GROWSUP);
1438
1439 if (!maybe_stack)
1440 return false;
1441
1442 if ((vma->vm_flags & VM_STACK_INCOMPLETE_SETUP) ==
1443 VM_STACK_INCOMPLETE_SETUP)
1444 return true;
1445
1446 return false;
1447}
1448
1449static bool invalid_migration_vma(struct vm_area_struct *vma, void *arg)
1450{
1451 return is_vma_temporary_stack(vma);
1452}
1453
1454static int page_not_mapped(struct page *page)
1455{
1456 return !page_mapped(page);
1457};
1458
1608/** 1459/**
1609 * try_to_unmap - try to remove all page table mappings to a page 1460 * try_to_unmap - try to remove all page table mappings to a page
1610 * @page: the page to get unmapped 1461 * @page: the page to get unmapped
@@ -1622,16 +1473,29 @@ out:
1622int try_to_unmap(struct page *page, enum ttu_flags flags) 1473int try_to_unmap(struct page *page, enum ttu_flags flags)
1623{ 1474{
1624 int ret; 1475 int ret;
1476 struct rmap_walk_control rwc = {
1477 .rmap_one = try_to_unmap_one,
1478 .arg = (void *)flags,
1479 .done = page_not_mapped,
1480 .file_nonlinear = try_to_unmap_nonlinear,
1481 .anon_lock = page_lock_anon_vma_read,
1482 };
1625 1483
1626 BUG_ON(!PageLocked(page));
1627 VM_BUG_ON(!PageHuge(page) && PageTransHuge(page)); 1484 VM_BUG_ON(!PageHuge(page) && PageTransHuge(page));
1628 1485
1629 if (unlikely(PageKsm(page))) 1486 /*
1630 ret = try_to_unmap_ksm(page, flags); 1487 * During exec, a temporary VMA is setup and later moved.
1631 else if (PageAnon(page)) 1488 * The VMA is moved under the anon_vma lock but not the
1632 ret = try_to_unmap_anon(page, flags); 1489 * page tables leading to a race where migration cannot
1633 else 1490 * find the migration ptes. Rather than increasing the
1634 ret = try_to_unmap_file(page, flags); 1491 * locking requirements of exec(), migration skips
1492 * temporary VMAs until after exec() completes.
1493 */
1494 if (flags & TTU_MIGRATION && !PageKsm(page) && PageAnon(page))
1495 rwc.invalid_vma = invalid_migration_vma;
1496
1497 ret = rmap_walk(page, &rwc);
1498
1635 if (ret != SWAP_MLOCK && !page_mapped(page)) 1499 if (ret != SWAP_MLOCK && !page_mapped(page))
1636 ret = SWAP_SUCCESS; 1500 ret = SWAP_SUCCESS;
1637 return ret; 1501 return ret;
@@ -1654,14 +1518,25 @@ int try_to_unmap(struct page *page, enum ttu_flags flags)
1654 */ 1518 */
1655int try_to_munlock(struct page *page) 1519int try_to_munlock(struct page *page)
1656{ 1520{
1521 int ret;
1522 struct rmap_walk_control rwc = {
1523 .rmap_one = try_to_unmap_one,
1524 .arg = (void *)TTU_MUNLOCK,
1525 .done = page_not_mapped,
1526 /*
1527 * We don't bother to try to find the munlocked page in
1528 * nonlinears. It's costly. Instead, later, page reclaim logic
1529 * may call try_to_unmap() and recover PG_mlocked lazily.
1530 */
1531 .file_nonlinear = NULL,
1532 .anon_lock = page_lock_anon_vma_read,
1533
1534 };
1535
1657 VM_BUG_ON(!PageLocked(page) || PageLRU(page)); 1536 VM_BUG_ON(!PageLocked(page) || PageLRU(page));
1658 1537
1659 if (unlikely(PageKsm(page))) 1538 ret = rmap_walk(page, &rwc);
1660 return try_to_unmap_ksm(page, TTU_MUNLOCK); 1539 return ret;
1661 else if (PageAnon(page))
1662 return try_to_unmap_anon(page, TTU_MUNLOCK);
1663 else
1664 return try_to_unmap_file(page, TTU_MUNLOCK);
1665} 1540}
1666 1541
1667void __put_anon_vma(struct anon_vma *anon_vma) 1542void __put_anon_vma(struct anon_vma *anon_vma)
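
In the try_to_unmap() and try_to_munlock() conversions above, the TTU flags travel by value through the walk control's void *arg slot: try_to_unmap() installs .arg = (void *)flags and try_to_unmap_one() recovers them with (enum ttu_flags)arg, so no allocation or extra parameter is needed. A userspace illustration of that by-value packing follows; the flag values are illustrative (the real enum lives in the rmap headers), and the round trip goes through uintptr_t here to keep the casts well defined, whereas the kernel casts directly.

#include <stdint.h>
#include <stdio.h>

/* Illustrative flag values, not the kernel's definitions. */
enum ttu_flags { TTU_UNMAP = 1, TTU_MIGRATION = 2, TTU_MUNLOCK = 4 };

/* The callback recovers the flags that were packed into the void *arg slot,
 * as try_to_unmap_one() does with (enum ttu_flags)arg. */
static void unmap_one(void *arg)
{
    enum ttu_flags flags = (enum ttu_flags)(uintptr_t)arg;

    if (flags & TTU_MUNLOCK)
        printf("munlock-only walk: 0x%x\n", (unsigned)flags);
    else
        printf("unmap walk: 0x%x\n", (unsigned)flags);
}

int main(void)
{
    /* No allocation needed: the flag word rides in the pointer itself. */
    unmap_one((void *)(uintptr_t)TTU_MIGRATION);
    unmap_one((void *)(uintptr_t)TTU_MUNLOCK);
    return 0;
}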
@@ -1674,18 +1549,13 @@ void __put_anon_vma(struct anon_vma *anon_vma)
1674 anon_vma_free(anon_vma); 1549 anon_vma_free(anon_vma);
1675} 1550}
1676 1551
1677#ifdef CONFIG_MIGRATION 1552static struct anon_vma *rmap_walk_anon_lock(struct page *page,
1678/* 1553 struct rmap_walk_control *rwc)
1679 * rmap_walk() and its helpers rmap_walk_anon() and rmap_walk_file():
1680 * Called by migrate.c to remove migration ptes, but might be used more later.
1681 */
1682static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *,
1683 struct vm_area_struct *, unsigned long, void *), void *arg)
1684{ 1554{
1685 struct anon_vma *anon_vma; 1555 struct anon_vma *anon_vma;
1686 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); 1556
1687 struct anon_vma_chain *avc; 1557 if (rwc->anon_lock)
1688 int ret = SWAP_AGAIN; 1558 return rwc->anon_lock(page);
1689 1559
1690 /* 1560 /*
1691 * Note: remove_migration_ptes() cannot use page_lock_anon_vma_read() 1561 * Note: remove_migration_ptes() cannot use page_lock_anon_vma_read()
@@ -1695,58 +1565,120 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *,
1695 */ 1565 */
1696 anon_vma = page_anon_vma(page); 1566 anon_vma = page_anon_vma(page);
1697 if (!anon_vma) 1567 if (!anon_vma)
1698 return ret; 1568 return NULL;
1569
1699 anon_vma_lock_read(anon_vma); 1570 anon_vma_lock_read(anon_vma);
1571 return anon_vma;
1572}
1573
1574/*
1575 * rmap_walk_anon - do something to anonymous page using the object-based
1576 * rmap method
1577 * @page: the page to be handled
1578 * @rwc: control variable according to each walk type
1579 *
1580 * Find all the mappings of a page using the mapping pointer and the vma chains
1581 * contained in the anon_vma struct it points to.
1582 *
1583 * When called from try_to_munlock(), the mmap_sem of the mm containing the vma
1584 * where the page was found will be held for write. So, we won't recheck
1585 * vm_flags for that VMA. That should be OK, because that vma shouldn't be
1586 * LOCKED.
1587 */
1588static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc)
1589{
1590 struct anon_vma *anon_vma;
1591 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
1592 struct anon_vma_chain *avc;
1593 int ret = SWAP_AGAIN;
1594
1595 anon_vma = rmap_walk_anon_lock(page, rwc);
1596 if (!anon_vma)
1597 return ret;
1598
1700 anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) { 1599 anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) {
1701 struct vm_area_struct *vma = avc->vma; 1600 struct vm_area_struct *vma = avc->vma;
1702 unsigned long address = vma_address(page, vma); 1601 unsigned long address = vma_address(page, vma);
1703 ret = rmap_one(page, vma, address, arg); 1602
1603 if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg))
1604 continue;
1605
1606 ret = rwc->rmap_one(page, vma, address, rwc->arg);
1704 if (ret != SWAP_AGAIN) 1607 if (ret != SWAP_AGAIN)
1705 break; 1608 break;
1609 if (rwc->done && rwc->done(page))
1610 break;
1706 } 1611 }
1707 anon_vma_unlock_read(anon_vma); 1612 anon_vma_unlock_read(anon_vma);
1708 return ret; 1613 return ret;
1709} 1614}
1710 1615
1711static int rmap_walk_file(struct page *page, int (*rmap_one)(struct page *, 1616/*
1712 struct vm_area_struct *, unsigned long, void *), void *arg) 1617 * rmap_walk_file - do something to file page using the object-based rmap method
1618 * @page: the page to be handled
1619 * @rwc: control variable according to each walk type
1620 *
1621 * Find all the mappings of a page using the mapping pointer and the vma chains
1622 * contained in the address_space struct it points to.
1623 *
1624 * When called from try_to_munlock(), the mmap_sem of the mm containing the vma
1625 * where the page was found will be held for write. So, we won't recheck
1626 * vm_flags for that VMA. That should be OK, because that vma shouldn't be
1627 * LOCKED.
1628 */
1629static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc)
1713{ 1630{
1714 struct address_space *mapping = page->mapping; 1631 struct address_space *mapping = page->mapping;
1715 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); 1632 pgoff_t pgoff = page->index << compound_order(page);
1716 struct vm_area_struct *vma; 1633 struct vm_area_struct *vma;
1717 int ret = SWAP_AGAIN; 1634 int ret = SWAP_AGAIN;
1718 1635
1636 /*
1637 * The page lock not only makes sure that page->mapping cannot
1638 * suddenly be NULLified by truncation, it makes sure that the
1639 * structure at mapping cannot be freed and reused yet,
1640 * so we can safely take mapping->i_mmap_mutex.
1641 */
1642 VM_BUG_ON(!PageLocked(page));
1643
1719 if (!mapping) 1644 if (!mapping)
1720 return ret; 1645 return ret;
1721 mutex_lock(&mapping->i_mmap_mutex); 1646 mutex_lock(&mapping->i_mmap_mutex);
1722 vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { 1647 vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
1723 unsigned long address = vma_address(page, vma); 1648 unsigned long address = vma_address(page, vma);
1724 ret = rmap_one(page, vma, address, arg); 1649
1650 if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg))
1651 continue;
1652
1653 ret = rwc->rmap_one(page, vma, address, rwc->arg);
1725 if (ret != SWAP_AGAIN) 1654 if (ret != SWAP_AGAIN)
1726 break; 1655 goto done;
1656 if (rwc->done && rwc->done(page))
1657 goto done;
1727 } 1658 }
1728 /* 1659
1729 * No nonlinear handling: being always shared, nonlinear vmas 1660 if (!rwc->file_nonlinear)
1730 * never contain migration ptes. Decide what to do about this 1661 goto done;
1731 * limitation to linear when we need rmap_walk() on nonlinear. 1662
1732 */ 1663 if (list_empty(&mapping->i_mmap_nonlinear))
1664 goto done;
1665
1666 ret = rwc->file_nonlinear(page, mapping, vma);
1667
1668done:
1733 mutex_unlock(&mapping->i_mmap_mutex); 1669 mutex_unlock(&mapping->i_mmap_mutex);
1734 return ret; 1670 return ret;
1735} 1671}
1736 1672
1737int rmap_walk(struct page *page, int (*rmap_one)(struct page *, 1673int rmap_walk(struct page *page, struct rmap_walk_control *rwc)
1738 struct vm_area_struct *, unsigned long, void *), void *arg)
1739{ 1674{
1740 VM_BUG_ON(!PageLocked(page));
1741
1742 if (unlikely(PageKsm(page))) 1675 if (unlikely(PageKsm(page)))
1743 return rmap_walk_ksm(page, rmap_one, arg); 1676 return rmap_walk_ksm(page, rwc);
1744 else if (PageAnon(page)) 1677 else if (PageAnon(page))
1745 return rmap_walk_anon(page, rmap_one, arg); 1678 return rmap_walk_anon(page, rwc);
1746 else 1679 else
1747 return rmap_walk_file(page, rmap_one, arg); 1680 return rmap_walk_file(page, rwc);
1748} 1681}
1749#endif /* CONFIG_MIGRATION */
1750 1682
1751#ifdef CONFIG_HUGETLB_PAGE 1683#ifdef CONFIG_HUGETLB_PAGE
1752/* 1684/*
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
index 27eeab3be757..4cba9c2783a1 100644
--- a/mm/sparse-vmemmap.c
+++ b/mm/sparse-vmemmap.c
@@ -40,7 +40,8 @@ static void * __init_refok __earlyonly_bootmem_alloc(int node,
40 unsigned long align, 40 unsigned long align,
41 unsigned long goal) 41 unsigned long goal)
42{ 42{
43 return __alloc_bootmem_node_high(NODE_DATA(node), size, align, goal); 43 return memblock_virt_alloc_try_nid(size, align, goal,
44 BOOTMEM_ALLOC_ACCESSIBLE, node);
44} 45}
45 46
46static void *vmemmap_buf; 47static void *vmemmap_buf;
@@ -226,7 +227,8 @@ void __init sparse_mem_maps_populate_node(struct page **map_map,
226 227
227 if (vmemmap_buf_start) { 228 if (vmemmap_buf_start) {
228 /* need to free left buf */ 229 /* need to free left buf */
229 free_bootmem(__pa(vmemmap_buf), vmemmap_buf_end - vmemmap_buf); 230 memblock_free_early(__pa(vmemmap_buf),
231 vmemmap_buf_end - vmemmap_buf);
230 vmemmap_buf = NULL; 232 vmemmap_buf = NULL;
231 vmemmap_buf_end = NULL; 233 vmemmap_buf_end = NULL;
232 } 234 }
diff --git a/mm/sparse.c b/mm/sparse.c
index 8cc7be0e9590..63c3ea5c119c 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -69,7 +69,7 @@ static struct mem_section noinline __init_refok *sparse_index_alloc(int nid)
69 else 69 else
70 section = kzalloc(array_size, GFP_KERNEL); 70 section = kzalloc(array_size, GFP_KERNEL);
71 } else { 71 } else {
72 section = alloc_bootmem_node(NODE_DATA(nid), array_size); 72 section = memblock_virt_alloc_node(array_size, nid);
73 } 73 }
74 74
75 return section; 75 return section;
@@ -279,8 +279,9 @@ sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat,
279 limit = goal + (1UL << PA_SECTION_SHIFT); 279 limit = goal + (1UL << PA_SECTION_SHIFT);
280 nid = early_pfn_to_nid(goal >> PAGE_SHIFT); 280 nid = early_pfn_to_nid(goal >> PAGE_SHIFT);
281again: 281again:
282 p = ___alloc_bootmem_node_nopanic(NODE_DATA(nid), size, 282 p = memblock_virt_alloc_try_nid_nopanic(size,
283 SMP_CACHE_BYTES, goal, limit); 283 SMP_CACHE_BYTES, goal, limit,
284 nid);
284 if (!p && limit) { 285 if (!p && limit) {
285 limit = 0; 286 limit = 0;
286 goto again; 287 goto again;
@@ -331,7 +332,7 @@ static unsigned long * __init
331sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat, 332sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat,
332 unsigned long size) 333 unsigned long size)
333{ 334{
334 return alloc_bootmem_node_nopanic(pgdat, size); 335 return memblock_virt_alloc_node_nopanic(size, pgdat->node_id);
335} 336}
336 337
337static void __init check_usemap_section_nr(int nid, unsigned long *usemap) 338static void __init check_usemap_section_nr(int nid, unsigned long *usemap)
@@ -376,8 +377,9 @@ struct page __init *sparse_mem_map_populate(unsigned long pnum, int nid)
376 return map; 377 return map;
377 378
378 size = PAGE_ALIGN(sizeof(struct page) * PAGES_PER_SECTION); 379 size = PAGE_ALIGN(sizeof(struct page) * PAGES_PER_SECTION);
379 map = __alloc_bootmem_node_high(NODE_DATA(nid), size, 380 map = memblock_virt_alloc_try_nid(size,
380 PAGE_SIZE, __pa(MAX_DMA_ADDRESS)); 381 PAGE_SIZE, __pa(MAX_DMA_ADDRESS),
382 BOOTMEM_ALLOC_ACCESSIBLE, nid);
381 return map; 383 return map;
382} 384}
383void __init sparse_mem_maps_populate_node(struct page **map_map, 385void __init sparse_mem_maps_populate_node(struct page **map_map,
@@ -401,8 +403,9 @@ void __init sparse_mem_maps_populate_node(struct page **map_map,
401 } 403 }
402 404
403 size = PAGE_ALIGN(size); 405 size = PAGE_ALIGN(size);
404 map = __alloc_bootmem_node_high(NODE_DATA(nodeid), size * map_count, 406 map = memblock_virt_alloc_try_nid(size * map_count,
405 PAGE_SIZE, __pa(MAX_DMA_ADDRESS)); 407 PAGE_SIZE, __pa(MAX_DMA_ADDRESS),
408 BOOTMEM_ALLOC_ACCESSIBLE, nodeid);
406 if (map) { 409 if (map) {
407 for (pnum = pnum_begin; pnum < pnum_end; pnum++) { 410 for (pnum = pnum_begin; pnum < pnum_end; pnum++) {
408 if (!present_section_nr(pnum)) 411 if (!present_section_nr(pnum))
@@ -545,7 +548,7 @@ void __init sparse_init(void)
545 * sparse_early_mem_map_alloc, so allocate usemap_map at first. 548 * sparse_early_mem_map_alloc, so allocate usemap_map at first.
546 */ 549 */
547 size = sizeof(unsigned long *) * NR_MEM_SECTIONS; 550 size = sizeof(unsigned long *) * NR_MEM_SECTIONS;
548 usemap_map = alloc_bootmem(size); 551 usemap_map = memblock_virt_alloc(size, 0);
549 if (!usemap_map) 552 if (!usemap_map)
550 panic("can not allocate usemap_map\n"); 553 panic("can not allocate usemap_map\n");
551 alloc_usemap_and_memmap(sparse_early_usemaps_alloc_node, 554 alloc_usemap_and_memmap(sparse_early_usemaps_alloc_node,
@@ -553,7 +556,7 @@ void __init sparse_init(void)
553 556
554#ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER 557#ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER
555 size2 = sizeof(struct page *) * NR_MEM_SECTIONS; 558 size2 = sizeof(struct page *) * NR_MEM_SECTIONS;
556 map_map = alloc_bootmem(size2); 559 map_map = memblock_virt_alloc(size2, 0);
557 if (!map_map) 560 if (!map_map)
558 panic("can not allocate map_map\n"); 561 panic("can not allocate map_map\n");
559 alloc_usemap_and_memmap(sparse_early_mem_maps_alloc_node, 562 alloc_usemap_and_memmap(sparse_early_mem_maps_alloc_node,
@@ -583,9 +586,9 @@ void __init sparse_init(void)
583 vmemmap_populate_print_last(); 586 vmemmap_populate_print_last();
584 587
585#ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER 588#ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER
586 free_bootmem(__pa(map_map), size2); 589 memblock_free_early(__pa(map_map), size2);
587#endif 590#endif
588 free_bootmem(__pa(usemap_map), size); 591 memblock_free_early(__pa(usemap_map), size);
589} 592}
590 593
591#ifdef CONFIG_MEMORY_HOTPLUG 594#ifdef CONFIG_MEMORY_HOTPLUG
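
The sparse-vmemmap.c and sparse.c hunks above replace the bootmem allocators with their memblock_virt_alloc_* counterparts while keeping the existing fallback logic: sparse_early_usemaps_alloc_pgdat_section() first tries an allocation constrained to a window above goal (limit != 0) and, if that fails, clears the limit and retries via the again: label. A toy userspace model of that retry-with-relaxed-constraint shape, with a fake constrained allocator standing in for memblock_virt_alloc_try_nid_nopanic() and an arbitrary window size:

#include <stdio.h>
#include <stdlib.h>

/* Toy stand-in for the constrained early allocator: pretend the window
 * (limit != 0) is already exhausted, so the first attempt fails and only
 * the unconstrained retry succeeds. */
static void *try_alloc(size_t size, unsigned long goal, unsigned long limit)
{
    (void)goal;
    if (limit != 0)
        return NULL;
    return malloc(size);
}

int main(void)
{
    unsigned long goal  = 0x1000000;
    unsigned long limit = goal + (1UL << 27);  /* arbitrary window size */
    void *p;

again:
    p = try_alloc(128, goal, limit);
    if (!p && limit) {
        /* Same shape as the sparse.c fallback: drop the upper bound, retry. */
        limit = 0;
        goto again;
    }

    printf("allocation %s\n", p ? "succeeded after fallback" : "failed");
    free(p);
    return 0;
}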
diff --git a/mm/swap.c b/mm/swap.c
index 84b26aaabd03..d1100b619e61 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -31,7 +31,6 @@
31#include <linux/memcontrol.h> 31#include <linux/memcontrol.h>
32#include <linux/gfp.h> 32#include <linux/gfp.h>
33#include <linux/uio.h> 33#include <linux/uio.h>
34#include <linux/hugetlb.h>
35 34
36#include "internal.h" 35#include "internal.h"
37 36
@@ -82,118 +81,150 @@ static void __put_compound_page(struct page *page)
82 81
83static void put_compound_page(struct page *page) 82static void put_compound_page(struct page *page)
84{ 83{
85 if (unlikely(PageTail(page))) { 84 struct page *page_head;
86 /* __split_huge_page_refcount can run under us */
87 struct page *page_head = compound_trans_head(page);
88
89 if (likely(page != page_head &&
90 get_page_unless_zero(page_head))) {
91 unsigned long flags;
92 85
86 if (likely(!PageTail(page))) {
87 if (put_page_testzero(page)) {
93 /* 88 /*
94 * THP can not break up slab pages so avoid taking 89 * By the time all refcounts have been released
95 * compound_lock(). Slab performs non-atomic bit ops 90 * split_huge_page cannot run anymore from under us.
96 * on page->flags for better performance. In particular
97 * slab_unlock() in slub used to be a hot path. It is
98 * still hot on arches that do not support
99 * this_cpu_cmpxchg_double().
100 */ 91 */
101 if (PageSlab(page_head) || PageHeadHuge(page_head)) { 92 if (PageHead(page))
102 if (likely(PageTail(page))) { 93 __put_compound_page(page);
103 /* 94 else
104 * __split_huge_page_refcount 95 __put_single_page(page);
105 * cannot race here. 96 }
106 */ 97 return;
107 VM_BUG_ON(!PageHead(page_head)); 98 }
108 atomic_dec(&page->_mapcount); 99
109 if (put_page_testzero(page_head)) 100 /* __split_huge_page_refcount can run under us */
110 VM_BUG_ON(1); 101 page_head = compound_trans_head(page);
111 if (put_page_testzero(page_head)) 102
112 __put_compound_page(page_head); 103 /*
113 return; 104 * THP can not break up slab pages so avoid taking
114 } else 105 * compound_lock() and skip the tail page refcounting (in
115 /* 106 * _mapcount) too. Slab performs non-atomic bit ops on
116 * __split_huge_page_refcount 107 * page->flags for better performance. In particular
117 * run before us, "page" was a 108 * slab_unlock() in slub used to be a hot path. It is still
118 * THP tail. The split 109 * hot on arches that do not support
119 * page_head has been freed 110 * this_cpu_cmpxchg_double().
120 * and reallocated as slab or 111 *
121 * hugetlbfs page of smaller 112 * If "page" is part of a slab or hugetlbfs page it cannot be
122 * order (only possible if 113 * splitted and the head page cannot change from under us. And
123 * reallocated as slab on 114 * if "page" is part of a THP page under splitting, if the
124 * x86). 115 * head page pointed by the THP tail isn't a THP head anymore,
125 */ 116 * we'll find PageTail clear after smp_rmb() and we'll treat
126 goto skip_lock; 117 * it as a single page.
127 } 118 */
119 if (!__compound_tail_refcounted(page_head)) {
120 /*
121 * If "page" is a THP tail, we must read the tail page
122 * flags after the head page flags. The
123 * split_huge_page side enforces write memory barriers
124 * between clearing PageTail and before the head page
125 * can be freed and reallocated.
126 */
127 smp_rmb();
128 if (likely(PageTail(page))) {
128 /* 129 /*
129 * page_head wasn't a dangling pointer but it 130 * __split_huge_page_refcount cannot race
130 * may not be a head page anymore by the time 131 * here.
131 * we obtain the lock. That is ok as long as it
132 * can't be freed from under us.
133 */ 132 */
134 flags = compound_lock_irqsave(page_head); 133 VM_BUG_ON(!PageHead(page_head));
135 if (unlikely(!PageTail(page))) { 134 VM_BUG_ON(page_mapcount(page) != 0);
136 /* __split_huge_page_refcount run before us */ 135 if (put_page_testzero(page_head)) {
137 compound_unlock_irqrestore(page_head, flags); 136 /*
138skip_lock: 137 * If this is the tail of a slab
139 if (put_page_testzero(page_head)) { 138 * compound page, the tail pin must
140 /* 139 * not be the last reference held on
141 * The head page may have been 140 * the page, because the PG_slab
142 * freed and reallocated as a 141 * cannot be cleared before all tail
143 * compound page of smaller 142 * pins (which skips the _mapcount
144 * order and then freed again. 143 * tail refcounting) have been
145 * All we know is that it 144 * released. For hugetlbfs the tail
146 * cannot have become: a THP 145 * pin may be the last reference on
147 * page, a compound page of 146 * the page instead, because
148 * higher order, a tail page. 147 * PageHeadHuge will not go away until
149 * That is because we still 148 * the compound page enters the buddy
150 * hold the refcount of the 149 * allocator.
151 * split THP tail and 150 */
152 * page_head was the THP head 151 VM_BUG_ON(PageSlab(page_head));
153 * before the split. 152 __put_compound_page(page_head);
154 */
155 if (PageHead(page_head))
156 __put_compound_page(page_head);
157 else
158 __put_single_page(page_head);
159 }
160out_put_single:
161 if (put_page_testzero(page))
162 __put_single_page(page);
163 return;
164 } 153 }
165 VM_BUG_ON(page_head != page->first_page); 154 return;
155 } else
166 /* 156 /*
167 * We can release the refcount taken by 157 * __split_huge_page_refcount run before us,
168 * get_page_unless_zero() now that 158 * "page" was a THP tail. The split page_head
169 * __split_huge_page_refcount() is blocked on 159 * has been freed and reallocated as slab or
170 * the compound_lock. 160 * hugetlbfs page of smaller order (only
161 * possible if reallocated as slab on x86).
171 */ 162 */
172 if (put_page_testzero(page_head)) 163 goto out_put_single;
173 VM_BUG_ON(1); 164 }
174 /* __split_huge_page_refcount will wait now */
175 VM_BUG_ON(page_mapcount(page) <= 0);
176 atomic_dec(&page->_mapcount);
177 VM_BUG_ON(atomic_read(&page_head->_count) <= 0);
178 VM_BUG_ON(atomic_read(&page->_count) != 0);
179 compound_unlock_irqrestore(page_head, flags);
180 165
166 if (likely(page != page_head && get_page_unless_zero(page_head))) {
167 unsigned long flags;
168
169 /*
170 * page_head wasn't a dangling pointer but it may not
171 * be a head page anymore by the time we obtain the
172 * lock. That is ok as long as it can't be freed from
173 * under us.
174 */
175 flags = compound_lock_irqsave(page_head);
176 if (unlikely(!PageTail(page))) {
177 /* __split_huge_page_refcount run before us */
178 compound_unlock_irqrestore(page_head, flags);
181 if (put_page_testzero(page_head)) { 179 if (put_page_testzero(page_head)) {
180 /*
181 * The head page may have been freed
182 * and reallocated as a compound page
183 * of smaller order and then freed
184 * again. All we know is that it
185 * cannot have become: a THP page, a
186 * compound page of higher order, a
187 * tail page. That is because we
188 * still hold the refcount of the
189 * split THP tail and page_head was
190 * the THP head before the split.
191 */
182 if (PageHead(page_head)) 192 if (PageHead(page_head))
183 __put_compound_page(page_head); 193 __put_compound_page(page_head);
184 else 194 else
185 __put_single_page(page_head); 195 __put_single_page(page_head);
186 } 196 }
187 } else { 197out_put_single:
188 /* page_head is a dangling pointer */ 198 if (put_page_testzero(page))
189 VM_BUG_ON(PageTail(page)); 199 __put_single_page(page);
190 goto out_put_single; 200 return;
191 } 201 }
192 } else if (put_page_testzero(page)) { 202 VM_BUG_ON(page_head != page->first_page);
193 if (PageHead(page)) 203 /*
194 __put_compound_page(page); 204 * We can release the refcount taken by
195 else 205 * get_page_unless_zero() now that
196 __put_single_page(page); 206 * __split_huge_page_refcount() is blocked on the
207 * compound_lock.
208 */
209 if (put_page_testzero(page_head))
210 VM_BUG_ON(1);
211 /* __split_huge_page_refcount will wait now */
212 VM_BUG_ON(page_mapcount(page) <= 0);
213 atomic_dec(&page->_mapcount);
214 VM_BUG_ON(atomic_read(&page_head->_count) <= 0);
215 VM_BUG_ON(atomic_read(&page->_count) != 0);
216 compound_unlock_irqrestore(page_head, flags);
217
218 if (put_page_testzero(page_head)) {
219 if (PageHead(page_head))
220 __put_compound_page(page_head);
221 else
222 __put_single_page(page_head);
223 }
224 } else {
225 /* page_head is a dangling pointer */
226 VM_BUG_ON(PageTail(page));
227 goto out_put_single;
197 } 228 }
198} 229}
199 230
@@ -221,36 +252,37 @@ bool __get_page_tail(struct page *page)
221 * split_huge_page(). 252 * split_huge_page().
222 */ 253 */
223 unsigned long flags; 254 unsigned long flags;
224 bool got = false; 255 bool got;
225 struct page *page_head = compound_trans_head(page); 256 struct page *page_head = compound_trans_head(page);
226 257
227 if (likely(page != page_head && get_page_unless_zero(page_head))) { 258 /* Ref to put_compound_page() comment. */
228 /* Ref to put_compound_page() comment. */ 259 if (!__compound_tail_refcounted(page_head)) {
229 if (PageSlab(page_head) || PageHeadHuge(page_head)) { 260 smp_rmb();
230 if (likely(PageTail(page))) { 261 if (likely(PageTail(page))) {
231 /* 262 /*
232 * This is a hugetlbfs page or a slab 263 * This is a hugetlbfs page or a slab
233 * page. __split_huge_page_refcount 264 * page. __split_huge_page_refcount
234 * cannot race here. 265 * cannot race here.
235 */ 266 */
236 VM_BUG_ON(!PageHead(page_head)); 267 VM_BUG_ON(!PageHead(page_head));
237 __get_page_tail_foll(page, false); 268 __get_page_tail_foll(page, true);
238 return true; 269 return true;
239 } else { 270 } else {
240 /* 271 /*
241 * __split_huge_page_refcount run 272 * __split_huge_page_refcount run
242 * before us, "page" was a THP 273 * before us, "page" was a THP
243 * tail. The split page_head has been 274 * tail. The split page_head has been
244 * freed and reallocated as slab or 275 * freed and reallocated as slab or
245 * hugetlbfs page of smaller order 276 * hugetlbfs page of smaller order
246 * (only possible if reallocated as 277 * (only possible if reallocated as
247 * slab on x86). 278 * slab on x86).
248 */ 279 */
249 put_page(page_head); 280 return false;
250 return false;
251 }
252 } 281 }
282 }
253 283
284 got = false;
285 if (likely(page != page_head && get_page_unless_zero(page_head))) {
254 /* 286 /*
255 * page_head wasn't a dangling pointer but it 287 * page_head wasn't a dangling pointer but it
256 * may not be a head page anymore by the time 288 * may not be a head page anymore by the time
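
The swap.c rewrite above is largely a restructuring of put_compound_page(): the common case (not a tail page) is handled first and returns early, and the slab/hugetlbfs tail handling is hoisted out of the deeply nested branches so the locking commentary can sit next to the code it describes. The fragment below only illustrates that guard-clause shape with toy types; it is not the kernel's compound-page refcounting.

#include <stdbool.h>
#include <stdio.h>

struct toy_page { bool tail; bool head; int refcount; };

/* Guard-clause shape: the common non-tail case is handled first and
 * returns, so the rare compound-tail handling is not buried several
 * levels of nesting deep. */
static void put_page_sketch(struct toy_page *page)
{
    if (!page->tail) {
        if (--page->refcount == 0)
            printf("free %s page\n", page->head ? "compound" : "single");
        return;
    }

    /* Rare path: tail of a compound page (details deliberately elided). */
    printf("compound-tail path\n");
}

int main(void)
{
    struct toy_page p = { .tail = false, .head = false, .refcount = 1 };
    put_page_sketch(&p);
    return 0;
}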
diff --git a/mm/util.c b/mm/util.c
index 808f375648e7..a24aa22f2473 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -404,13 +404,45 @@ struct address_space *page_mapping(struct page *page)
404 return mapping; 404 return mapping;
405} 405}
406 406
407int overcommit_ratio_handler(struct ctl_table *table, int write,
408 void __user *buffer, size_t *lenp,
409 loff_t *ppos)
410{
411 int ret;
412
413 ret = proc_dointvec(table, write, buffer, lenp, ppos);
414 if (ret == 0 && write)
415 sysctl_overcommit_kbytes = 0;
416 return ret;
417}
418
419int overcommit_kbytes_handler(struct ctl_table *table, int write,
420 void __user *buffer, size_t *lenp,
421 loff_t *ppos)
422{
423 int ret;
424
425 ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
426 if (ret == 0 && write)
427 sysctl_overcommit_ratio = 0;
428 return ret;
429}
430
407/* 431/*
408 * Committed memory limit enforced when OVERCOMMIT_NEVER policy is used 432 * Committed memory limit enforced when OVERCOMMIT_NEVER policy is used
409 */ 433 */
410unsigned long vm_commit_limit(void) 434unsigned long vm_commit_limit(void)
411{ 435{
412 return ((totalram_pages - hugetlb_total_pages()) 436 unsigned long allowed;
413 * sysctl_overcommit_ratio / 100) + total_swap_pages; 437
438 if (sysctl_overcommit_kbytes)
439 allowed = sysctl_overcommit_kbytes >> (PAGE_SHIFT - 10);
440 else
441 allowed = ((totalram_pages - hugetlb_total_pages())
442 * sysctl_overcommit_ratio / 100);
443 allowed += total_swap_pages;
444
445 return allowed;
414} 446}
415 447
416 448
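
The util.c hunk adds sysctl handlers that make the two overcommit knobs mutually exclusive (writing one zeroes the other) and teaches vm_commit_limit() to honor them: if sysctl_overcommit_kbytes is set, the limit is that many kilobytes converted to pages with >> (PAGE_SHIFT - 10); otherwise the old percentage-of-RAM formula applies, and swap pages are added in either case. A standalone arithmetic check of the same formula, assuming 4 KiB pages and made-up machine sizes:

#include <stdio.h>

#define PAGE_SHIFT 12   /* assume 4 KiB pages for the example */

/* Same shape as the new vm_commit_limit(): an absolute limit in kbytes
 * wins over the percentage formula; swap is added either way. */
static unsigned long commit_limit(unsigned long totalram_pages,
                                  unsigned long hugetlb_pages,
                                  unsigned long swap_pages,
                                  unsigned long overcommit_kbytes,
                                  int overcommit_ratio)
{
    unsigned long allowed;

    if (overcommit_kbytes)
        allowed = overcommit_kbytes >> (PAGE_SHIFT - 10);   /* KiB -> pages */
    else
        allowed = (totalram_pages - hugetlb_pages) * overcommit_ratio / 100;

    return allowed + swap_pages;
}

int main(void)
{
    /* Made-up machine: 4 GiB of RAM, no hugetlb pages, 1 GiB of swap. */
    unsigned long ram = 1048576, swap = 262144;

    /* Ratio mode: 50% of RAM plus swap = 524288 + 262144 pages. */
    printf("ratio 50%%   : %lu pages\n", commit_limit(ram, 0, swap, 0, 50));

    /* Kbytes mode: an absolute 1 GiB (1048576 KiB) plus swap. */
    printf("1 GiB kbytes: %lu pages\n", commit_limit(ram, 0, swap, 1048576, 50));
    return 0;
}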
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 0fdf96803c5b..e4f0db2a3eae 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -220,12 +220,12 @@ int is_vmalloc_or_module_addr(const void *x)
220} 220}
221 221
222/* 222/*
223 * Walk a vmap address to the struct page it maps. 223 * Walk a vmap address to the physical pfn it maps to.
224 */ 224 */
225struct page *vmalloc_to_page(const void *vmalloc_addr) 225unsigned long vmalloc_to_pfn(const void *vmalloc_addr)
226{ 226{
227 unsigned long addr = (unsigned long) vmalloc_addr; 227 unsigned long addr = (unsigned long) vmalloc_addr;
228 struct page *page = NULL; 228 unsigned long pfn = 0;
229 pgd_t *pgd = pgd_offset_k(addr); 229 pgd_t *pgd = pgd_offset_k(addr);
230 230
231 /* 231 /*
@@ -244,23 +244,23 @@ struct page *vmalloc_to_page(const void *vmalloc_addr)
244 ptep = pte_offset_map(pmd, addr); 244 ptep = pte_offset_map(pmd, addr);
245 pte = *ptep; 245 pte = *ptep;
246 if (pte_present(pte)) 246 if (pte_present(pte))
247 page = pte_page(pte); 247 pfn = pte_pfn(pte);
248 pte_unmap(ptep); 248 pte_unmap(ptep);
249 } 249 }
250 } 250 }
251 } 251 }
252 return page; 252 return pfn;
253} 253}
254EXPORT_SYMBOL(vmalloc_to_page); 254EXPORT_SYMBOL(vmalloc_to_pfn);
255 255
256/* 256/*
257 * Map a vmalloc()-space virtual address to the physical page frame number. 257 * Map a vmalloc()-space virtual address to the struct page.
258 */ 258 */
259unsigned long vmalloc_to_pfn(const void *vmalloc_addr) 259struct page *vmalloc_to_page(const void *vmalloc_addr)
260{ 260{
261 return page_to_pfn(vmalloc_to_page(vmalloc_addr)); 261 return pfn_to_page(vmalloc_to_pfn(vmalloc_addr));
262} 262}
263EXPORT_SYMBOL(vmalloc_to_pfn); 263EXPORT_SYMBOL(vmalloc_to_page);
264 264
265 265
266/*** Global kva allocator ***/ 266/*** Global kva allocator ***/
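
The final vmalloc.c hunk interchanges the two lookup helpers so that the page-table walk lives in vmalloc_to_pfn(), which returns a frame number directly, and vmalloc_to_page() becomes a thin wrapper that converts that pfn with pfn_to_page(). A toy userspace model of that layering; the identity "page table" and the mem_map-style array are stand-ins, not a real walk.

#include <stdio.h>

#define PAGE_SHIFT 12   /* assume 4 KiB pages for the example */
#define NR_FRAMES  16

struct toy_page { unsigned long pfn; };

static struct toy_page mem_map[NR_FRAMES];  /* stand-in for the kernel's mem_map */

/* The primitive: "walk" the toy identity mapping and return a frame number,
 * as the reworked vmalloc_to_pfn() now does with the real page tables. */
static unsigned long vaddr_to_pfn(unsigned long vaddr)
{
    return (vaddr >> PAGE_SHIFT) % NR_FRAMES;
}

/* The derived helper: convert the pfn to its struct page, mirroring
 * vmalloc_to_page() being reduced to pfn_to_page(vmalloc_to_pfn(addr)). */
static struct toy_page *vaddr_to_page(unsigned long vaddr)
{
    return &mem_map[vaddr_to_pfn(vaddr)];
}

int main(void)
{
    for (unsigned long i = 0; i < NR_FRAMES; i++)
        mem_map[i].pfn = i;

    unsigned long va = 0x5000;
    printf("va 0x%lx -> pfn %lu -> page->pfn %lu\n",
           va, vaddr_to_pfn(va), vaddr_to_page(va)->pfn);
    return 0;
}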